diff options
Diffstat (limited to 'usr/src/uts/common/sys')
119 files changed, 5109 insertions, 400 deletions
diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile index e3516c1c96..cee6dfb94d 100644 --- a/usr/src/uts/common/sys/Makefile +++ b/usr/src/uts/common/sys/Makefile @@ -254,6 +254,7 @@ CHKHDRS= \ flock.h \ flock_impl.h \ fork.h \ + frameio.h \ fss.h \ fsspriocntl.h \ fsid.h \ @@ -279,6 +280,7 @@ CHKHDRS= \ idmap.h \ ieeefp.h \ id_space.h \ + inotify.h \ instance.h \ int_const.h \ int_fmtio.h \ @@ -347,6 +349,7 @@ CHKHDRS= \ lgrp.h \ lgrp_user.h \ libc_kernel.h \ + limits.h \ link.h \ linker_set.h \ list.h \ @@ -432,6 +435,9 @@ CHKHDRS= \ ontrap.h \ open.h \ openpromio.h \ + overlay.h \ + overlay_common.h \ + overlay_target.h \ panic.h \ param.h \ pathconf.h \ @@ -657,6 +663,8 @@ CHKHDRS= \ vmem.h \ vmem_impl.h \ vmsystm.h \ + vnd.h \ + vnd_errno.h \ vnic.h \ vnic_impl.h \ vnode.h \ @@ -668,11 +676,13 @@ CHKHDRS= \ vuid_queue.h \ vuid_state.h \ vuid_store.h \ + vxlan.h \ wait.h \ waitq.h \ watchpoint.h \ winlockio.h \ zcons.h \ + zfd.h \ zone.h \ xti_inet.h \ xti_osi.h \ @@ -838,13 +848,14 @@ FSHDRS= \ autofs.h \ decomp.h \ dv_node.h \ - sdev_impl.h \ fifonode.h \ hsfs_isospec.h \ hsfs_node.h \ hsfs_rrip.h \ hsfs_spec.h \ hsfs_susp.h \ + hyprlofs.h \ + hyprlofs_info.h \ lofs_info.h \ lofs_node.h \ mntdata.h \ @@ -854,6 +865,8 @@ FSHDRS= \ pc_label.h \ pc_node.h \ pxfs_ki.h \ + sdev_impl.h \ + sdev_plugin.h \ snode.h \ swapnode.h \ tmp.h \ diff --git a/usr/src/uts/common/sys/acct.h b/usr/src/uts/common/sys/acct.h index f00884681b..e01ad61025 100644 --- a/usr/src/uts/common/sys/acct.h +++ b/usr/src/uts/common/sys/acct.h @@ -22,6 +22,7 @@ /* * Copyright 2014 Garrett D'Amore <garrett@damore.org> * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017 Joyent, Inc. 
*/ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -88,7 +89,7 @@ extern int acct(const char *); #if defined(_KERNEL) -void acct(char); +void acct(int); int sysacct(char *); struct vnode; diff --git a/usr/src/uts/common/sys/aggr_impl.h b/usr/src/uts/common/sys/aggr_impl.h index 547c9cc241..80733aa31e 100644 --- a/usr/src/uts/common/sys/aggr_impl.h +++ b/usr/src/uts/common/sys/aggr_impl.h @@ -21,6 +21,8 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 OmniTI Computer Consulting, Inc All rights reserved. + * Copyright 2018 Joyent, Inc. */ #ifndef _SYS_AGGR_IMPL_H @@ -54,25 +56,47 @@ extern "C" { */ #define MAC_PSEUDO_RING_INUSE 0x01 +#define MAX_GROUPS_PER_PORT 128 + +/* + * VLAN filters placed on the Rx pseudo group. + */ +typedef struct aggr_vlan { + list_node_t av_link; + uint16_t av_vid; /* VLAN ID */ + uint_t av_refs; /* num aggr clients using this VID */ +} aggr_vlan_t; + typedef struct aggr_unicst_addr_s { uint8_t aua_addr[ETHERADDRL]; struct aggr_unicst_addr_s *aua_next; } aggr_unicst_addr_t; typedef struct aggr_pseudo_rx_ring_s { - mac_ring_handle_t arr_rh; /* filled in by aggr_fill_ring() */ - struct aggr_port_s *arr_port; - mac_ring_handle_t arr_hw_rh; - uint_t arr_flags; - uint64_t arr_gen; + mac_ring_handle_t arr_rh; /* set by aggr_fill_ring() */ + struct aggr_port_s *arr_port; + struct aggr_pseudo_rx_group_s *arr_grp; + mac_ring_handle_t arr_hw_rh; + uint_t arr_flags; + uint64_t arr_gen; } aggr_pseudo_rx_ring_t; +/* + * An aggr pseudo group abstracts the underlying ports' HW groups. For + * example, if each port has 8 groups (mac_group_t), then the aggr + * will create 8 pseudo groups. Each pseudo group represents a + * collection of HW groups: one group from each port. If you have + * three ports then the pseudo group stands in for three HW groups. 
+ */ typedef struct aggr_pseudo_rx_group_s { + uint_t arg_index; struct aggr_grp_s *arg_grp; /* filled in by aggr_fill_group() */ mac_group_handle_t arg_gh; /* filled in by aggr_fill_group() */ aggr_unicst_addr_t *arg_macaddr; aggr_pseudo_rx_ring_t arg_rings[MAX_RINGS_PER_GROUP]; uint_t arg_ring_cnt; + uint_t arg_untagged; /* num clients untagged */ + list_t arg_vlans; /* VLANs on this group */ } aggr_pseudo_rx_group_t; typedef struct aggr_pseudo_tx_ring_s { @@ -106,12 +130,13 @@ typedef struct aggr_port_s { lp_collector_enabled : 1, lp_promisc_on : 1, lp_no_link_update : 1, - lp_rx_grp_added : 1, lp_tx_grp_added : 1, lp_closing : 1, - lp_pad_bits : 24; + lp_pad_bits : 25; mac_handle_t lp_mh; - mac_client_handle_t lp_mch; + + mac_client_handle_t lp_mch; + const mac_info_t *lp_mip; mac_notify_handle_t lp_mnh; uint_t lp_tx_idx; /* idx in group's tx array */ @@ -123,13 +148,19 @@ typedef struct aggr_port_s { aggr_lacp_port_t lp_lacp; /* LACP state */ lacp_stats_t lp_lacp_stats; uint32_t lp_margin; - mac_promisc_handle_t lp_mphp; + mac_unicast_handle_t lp_mah; /* List of non-primary addresses that requires promiscous mode set */ aggr_unicst_addr_t *lp_prom_addr; - /* handle of the underlying HW RX group */ - mac_group_handle_t lp_hwgh; + + /* + * References to the underlying HW Rx groups of this port. + * Used by aggr to program HW classification for the pseudo + * groups. 
+ */ + mac_group_handle_t lp_hwghs[MAX_GROUPS_PER_PORT]; + int lp_tx_ring_cnt; /* handles of the underlying HW TX rings */ mac_ring_handle_t *lp_tx_rings; @@ -176,7 +207,7 @@ typedef struct aggr_grp_s { lg_lso : 1, lg_pad_bits : 8; aggr_port_t *lg_ports; /* list of configured ports */ - aggr_port_t *lg_mac_addr_port; + aggr_port_t *lg_mac_addr_port; /* using address of this port */ mac_handle_t lg_mh; zoneid_t lg_zoneid; uint_t lg_nattached_ports; @@ -186,11 +217,18 @@ typedef struct aggr_grp_s { uint_t lg_tx_ports_size; /* size of lg_tx_ports */ uint32_t lg_tx_policy; /* outbound policy */ uint8_t lg_mac_tx_policy; - uint64_t lg_ifspeed; link_state_t lg_link_state; + + + /* + * The lg_stat_lock must be held when accessing these fields. + */ + kmutex_t lg_stat_lock; + uint64_t lg_ifspeed; link_duplex_t lg_link_duplex; uint64_t lg_stat[MAC_NSTAT]; uint64_t lg_ether_stat[ETHER_NSTAT]; + aggr_lacp_mode_t lg_lacp_mode; /* off, active, or passive */ Agg_t aggr; /* 802.3ad data */ uint32_t lg_hcksum_txflags; @@ -213,7 +251,9 @@ typedef struct aggr_grp_s { kthread_t *lg_lacp_rx_thread; boolean_t lg_lacp_done; - aggr_pseudo_rx_group_t lg_rx_group; + uint_t lg_rx_group_count; + aggr_pseudo_rx_group_t lg_rx_groups[MAX_GROUPS_PER_PORT]; + aggr_pseudo_tx_group_t lg_tx_group; kmutex_t lg_tx_flowctl_lock; @@ -335,8 +375,11 @@ extern void aggr_grp_port_hold(aggr_port_t *); extern void aggr_grp_port_rele(aggr_port_t *); extern void aggr_grp_port_wait(aggr_grp_t *); -extern int aggr_port_addmac(aggr_port_t *, const uint8_t *); -extern void aggr_port_remmac(aggr_port_t *, const uint8_t *); +extern int aggr_port_addmac(aggr_port_t *, uint_t, const uint8_t *); +extern void aggr_port_remmac(aggr_port_t *, uint_t, const uint8_t *); + +extern int aggr_port_addvlan(aggr_port_t *, uint_t, uint16_t); +extern int aggr_port_remvlan(aggr_port_t *, uint_t, uint16_t); extern mblk_t *aggr_ring_tx(void *, mblk_t *); extern mblk_t *aggr_find_tx_ring(void *, mblk_t *, diff --git 
a/usr/src/uts/common/sys/auxv.h b/usr/src/uts/common/sys/auxv.h index 1fb5011970..b3b2898987 100644 --- a/usr/src/uts/common/sys/auxv.h +++ b/usr/src/uts/common/sys/auxv.h @@ -78,6 +78,9 @@ typedef struct { #define AT_FLAGS 8 /* processor flags */ #define AT_ENTRY 9 /* a.out entry point */ +/* First introduced on Linux */ +#define AT_RANDOM 25 /* address of 16 random bytes */ + /* * These relate to the original PPC ABI document; Linux reused * the values for other things (see below), so disambiguation of @@ -90,19 +93,18 @@ typedef struct { * These are the values from LSB 1.3, the first five are also described * in the draft amd64 ABI. * - * At the time of writing, Solaris doesn't place any of these values into - * the aux vector, except AT_CLKTCK which is placed on the aux vector for - * lx branded processes; also, we do similar things via AT_SUN_ values. + * At the time of writing, illumos doesn't place any of these values into the + * aux vector, except where noted. We do similar things via AT_SUN_ values. * * AT_NOTELF 10 program is not ELF? - * AT_UID 11 real user id - * AT_EUID 12 effective user id - * AT_GID 13 real group id - * AT_EGID 14 effective group id + * AT_UID 11 real user id (provided in LX) + * AT_EUID 12 effective user id (provided in LX) + * AT_GID 13 real group id (provided in LX) + * AT_EGID 14 effective group id (provided in LX) * * AT_PLATFORM 15 * AT_HWCAP 16 - * AT_CLKTCK 17 c.f. _SC_CLK_TCK + * AT_CLKTCK 17 c.f. _SC_CLK_TCK (provided in LX) * AT_FPUCW 18 * * AT_DCACHEBSIZE 19 (moved from 10) @@ -110,6 +112,16 @@ typedef struct { * AT_UCACHEBSIZE 21 (moved from 12) * * AT_IGNOREPPC 22 + * + * On Linux: + * AT_* values 18 through 22 are reserved + * AT_SECURE 23 secure mode boolean (provided in LX) + * AT_BASE_PLATFORM 24 string identifying real platform, may + * differ from AT_PLATFORM. 
+ * AT_HWCAP2 26 extension of AT_HWCAP + * AT_EXECFN 31 filename of program + * AT_SYSINFO 32 + * AT_SYSINFO_EHDR 33 The vDSO location */ /* @@ -186,6 +198,8 @@ extern uint_t getisax(uint32_t *, uint_t); #define AT_SUN_BRAND_AUX1 2020 #define AT_SUN_BRAND_AUX2 2021 #define AT_SUN_BRAND_AUX3 2022 +#define AT_SUN_BRAND_AUX4 2025 +#define AT_SUN_BRAND_NROOT 2024 /* * Aux vector for comm page diff --git a/usr/src/uts/common/sys/brand.h b/usr/src/uts/common/sys/brand.h index badc3faff8..df22f492bf 100644 --- a/usr/src/uts/common/sys/brand.h +++ b/usr/src/uts/common/sys/brand.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2019 Joyent, Inc. */ #ifndef _SYS_BRAND_H @@ -102,29 +103,106 @@ struct brand_mach_ops; struct intpdata; struct execa; +/* + * Common structure to define hooks for brand operation. + * + * Required Fields: + * b_init_brand_data - Setup zone brand data during zone_setbrand + * b_free_brand_data - Free zone brand data during zone_destroy + * b_brandsys - Syscall handler for brandsys + * b_setbrand - Initialize process brand data + * b_getattr - Get brand-custom zone attribute + * b_setattr - Set brand-custom zone attribute + * b_copy_procdata - Copy process brand data during fork + * b_proc_exit - Perform process brand exit processing + * b_exec - Reset branded process state on exec + * b_lwp_setrval - Set return code for forked child + * b_initlwp - Initialize lwp brand data (cannot drop p->p_lock) + * b_forklwp - Copy lwp brand data during fork + * b_freelwp - Free lwp brand data + * b_lwpexit - Perform lwp-specific brand exit processing + * b_elfexec - Load and execute ELF binary + * b_sigset_native_to_brand - Convert sigset native->brand + * b_sigset_brand_to_native - Convert sigset brand->native + * b_nsig - Maximum signal number + * b_sendsig - Update process state after sendsig + * + * Optional Fields: + * b_lwpdata_alloc - Speculatively allocate data for use in b_initlwp + * 
b_lwpdata_free - Free data from allocated by b_lwpdata_alloc if errors occur + * during lwp creation before b_initlwp could be called. + * b_initlwp_post - Complete lwp branding (can temporarily drop p->p_lock) + * b_exit_with_sig - Instead of sending SIGCLD, exit with custom behavior + * b_psig_to_proc - Custom additional behavior during psig + * b_wait_filter - Filter processes from being matched by waitid + * b_native_exec - Provide interpreter path prefix for executables + * b_ptrace_exectrap - Custom behavior for legacy ptrace traps + * b_map32limit - Specify alternate limit for MAP_32BIT mappings + * b_stop_notify - Hook process stop events + * b_waitid_helper - Generate synthetic results for waitid + * b_sigcld_repost - Post synthetic SIGCLD signals + * b_issig_stop - Alter/suppress signal delivery during issig + * b_sig_ignorable - Disallow discarding of signals + * b_savecontext - Alter context during savecontext + * b_restorecontext - Alter context during restorecontext + * b_sendsig_stack - Override stack used for signal delivery + * b_setid_clear - Override setid_clear behavior + * b_pagefault - Trap pagefault events + * b_intp_parse_arg - Controls interpreter argument handling (allow 1 or all) + * b_clearbrand - Perform any actions necessary when clearing the brand. 
+ * b_rpc_statd - Upcall to rpc.statd running within the zone + * b_acct_out - Output properly formatted accounting record + */ struct brand_ops { - void (*b_init_brand_data)(zone_t *); + void (*b_init_brand_data)(zone_t *, kmutex_t *); void (*b_free_brand_data)(zone_t *); int (*b_brandsys)(int, int64_t *, uintptr_t, uintptr_t, uintptr_t, - uintptr_t, uintptr_t, uintptr_t); + uintptr_t); void (*b_setbrand)(struct proc *); int (*b_getattr)(zone_t *, int, void *, size_t *); int (*b_setattr)(zone_t *, int, void *, size_t); void (*b_copy_procdata)(struct proc *, struct proc *); - void (*b_proc_exit)(struct proc *, klwp_t *); + void (*b_proc_exit)(struct proc *); void (*b_exec)(); void (*b_lwp_setrval)(klwp_t *, int, int); - int (*b_initlwp)(klwp_t *); + void *(*b_lwpdata_alloc)(struct proc *); + void (*b_lwpdata_free)(void *); + void (*b_initlwp)(klwp_t *, void *); + void (*b_initlwp_post)(klwp_t *); void (*b_forklwp)(klwp_t *, klwp_t *); void (*b_freelwp)(klwp_t *); void (*b_lwpexit)(klwp_t *); - int (*b_elfexec)(struct vnode *vp, struct execa *uap, - struct uarg *args, struct intpdata *idata, int level, - long *execsz, int setid, caddr_t exec_file, - struct cred *cred, int brand_action); + int (*b_elfexec)(struct vnode *, struct execa *, struct uarg *, + struct intpdata *, int, size_t *, int, caddr_t, struct cred *, + int *); void (*b_sigset_native_to_brand)(sigset_t *); void (*b_sigset_brand_to_native)(sigset_t *); + void (*b_sigfd_translate)(k_siginfo_t *); int b_nsig; + void (*b_exit_with_sig)(proc_t *, sigqueue_t *); + boolean_t (*b_wait_filter)(proc_t *, proc_t *); + boolean_t (*b_native_exec)(uint8_t, const char **); + uint32_t (*b_map32limit)(proc_t *); + void (*b_stop_notify)(proc_t *, klwp_t *, ushort_t, ushort_t); + int (*b_waitid_helper)(idtype_t, id_t, k_siginfo_t *, int, + boolean_t *, int *); + int (*b_sigcld_repost)(proc_t *, sigqueue_t *); + int (*b_issig_stop)(proc_t *, klwp_t *); + boolean_t (*b_sig_ignorable)(proc_t *, klwp_t *, int); + void 
(*b_savecontext)(ucontext_t *); +#if defined(_SYSCALL32_IMPL) + void (*b_savecontext32)(ucontext32_t *); +#endif + void (*b_restorecontext)(ucontext_t *); + caddr_t (*b_sendsig_stack)(int); + void (*b_sendsig)(int); + int (*b_setid_clear)(vattr_t *vap, cred_t *cr); + int (*b_pagefault)(proc_t *, klwp_t *, caddr_t, enum fault_type, + enum seg_rw); + boolean_t b_intp_parse_arg; + void (*b_clearbrand)(proc_t *, boolean_t); + void (*b_rpc_statd)(int, void *, void *); + void (*b_acct_out)(struct vnode *, int); }; /* @@ -135,6 +213,7 @@ typedef struct brand { char *b_name; struct brand_ops *b_ops; struct brand_mach_ops *b_machops; + size_t b_data_size; } brand_t; extern brand_t native_brand; @@ -165,7 +244,7 @@ extern brand_t *brand_register_zone(struct brand_attr *); extern brand_t *brand_find_name(char *); extern void brand_unregister_zone(brand_t *); extern int brand_zone_count(brand_t *); -extern void brand_setbrand(proc_t *); +extern int brand_setbrand(proc_t *, boolean_t); extern void brand_clearbrand(proc_t *, boolean_t); /* @@ -178,17 +257,16 @@ extern int brand_solaris_cmd(int, uintptr_t, uintptr_t, uintptr_t, extern void brand_solaris_copy_procdata(proc_t *, proc_t *, struct brand *); extern int brand_solaris_elfexec(vnode_t *, execa_t *, uarg_t *, - intpdata_t *, int, long *, int, caddr_t, cred_t *, int, - struct brand *, char *, char *, char *, char *, char *); + intpdata_t *, int, size_t *, int, caddr_t, cred_t *, int *, + struct brand *, char *, char *, char *); extern void brand_solaris_exec(struct brand *); extern int brand_solaris_fini(char **, struct modlinkage *, struct brand *); extern void brand_solaris_forklwp(klwp_t *, klwp_t *, struct brand *); extern void brand_solaris_freelwp(klwp_t *, struct brand *); -extern int brand_solaris_initlwp(klwp_t *, struct brand *); +extern void brand_solaris_initlwp(klwp_t *, struct brand *); extern void brand_solaris_lwpexit(klwp_t *, struct brand *); -extern void brand_solaris_proc_exit(struct proc *, klwp_t *, - 
struct brand *); +extern void brand_solaris_proc_exit(struct proc *, struct brand *); extern void brand_solaris_setbrand(proc_t *, struct brand *); #if defined(_SYSCALL32) diff --git a/usr/src/uts/common/sys/buf.h b/usr/src/uts/common/sys/buf.h index e20e0e0c35..b6b5c20e44 100644 --- a/usr/src/uts/common/sys/buf.h +++ b/usr/src/uts/common/sys/buf.h @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012 Joyent, Inc. All rights reserved. * * Copyright 2017 RackTop Systems. */ @@ -188,6 +189,7 @@ struct biostats { #define B_STARTED 0x2000000 /* io:::start probe called for buf */ #define B_ABRWRITE 0x4000000 /* Application based recovery active */ #define B_PAGE_NOWAIT 0x8000000 /* Skip the page if it is locked */ +#define B_INVALCURONLY 0x10000000 /* invalidate only for curproc */ /* * There is some confusion over the meaning of B_FREE and B_INVAL and what @@ -200,6 +202,12 @@ struct biostats { * between the sole use of these two flags. In both cases, IO will be done * if the page is not yet committed to storage. * + * The B_INVALCURONLY flag modifies the behavior of the B_INVAL flag and is + * intended to be used in conjunction with B_INVAL. B_INVALCURONLY has no + * meaning on its own. When both B_INVALCURONLY and B_INVAL are set, then + * the mapping for the page is only invalidated for the current process. + * In this case, the page is not destroyed unless this was the final mapping. + * * In order to discard pages without writing them back, (B_INVAL | B_TRUNC) * should be used. * diff --git a/usr/src/uts/common/sys/contract/process.h b/usr/src/uts/common/sys/contract/process.h index 21cf94dcf9..2c70d7c9f1 100644 --- a/usr/src/uts/common/sys/contract/process.h +++ b/usr/src/uts/common/sys/contract/process.h @@ -21,13 +21,12 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. 
*/ #ifndef _SYS_CONTRACT_PROCESS_H #define _SYS_CONTRACT_PROCESS_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/contract.h> #include <sys/time.h> @@ -55,7 +54,8 @@ typedef struct cont_process cont_process_t; #define CT_PR_NOORPHAN 0x2 /* kill when contract is abandoned */ #define CT_PR_PGRPONLY 0x4 /* only kill process group on fatal errors */ #define CT_PR_REGENT 0x8 /* automatically detach inherited contracts */ -#define CT_PR_ALLPARAM 0xf +#define CT_PR_KEEP_EXEC 0x10 /* preserve template across exec */ +#define CT_PR_ALLPARAM 0x1f /* * ctr_ev_* flags diff --git a/usr/src/uts/common/sys/cpucaps.h b/usr/src/uts/common/sys/cpucaps.h index 6063ff4380..6bc042108c 100644 --- a/usr/src/uts/common/sys/cpucaps.h +++ b/usr/src/uts/common/sys/cpucaps.h @@ -22,6 +22,7 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011, 2012, Joyent, Inc. All rights reserved. */ #ifndef _SYS_CPUCAPS_H @@ -84,12 +85,16 @@ extern void cpucaps_zone_remove(zone_t *); */ extern int cpucaps_project_set(kproject_t *, rctl_qty_t); extern int cpucaps_zone_set(zone_t *, rctl_qty_t); +extern int cpucaps_zone_set_base(zone_t *, rctl_qty_t); +extern int cpucaps_zone_set_burst_time(zone_t *, rctl_qty_t); /* * Get current CPU usage for a project/zone. */ extern rctl_qty_t cpucaps_project_get(kproject_t *); extern rctl_qty_t cpucaps_zone_get(zone_t *); +extern rctl_qty_t cpucaps_zone_get_base(zone_t *); +extern rctl_qty_t cpucaps_zone_get_burst_time(zone_t *); /* * Scheduling class hooks into CPU caps framework. diff --git a/usr/src/uts/common/sys/cpucaps_impl.h b/usr/src/uts/common/sys/cpucaps_impl.h index 95afd21827..2cd4ed644d 100644 --- a/usr/src/uts/common/sys/cpucaps_impl.h +++ b/usr/src/uts/common/sys/cpucaps_impl.h @@ -22,6 +22,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011, 2012, Joyent, Inc. All rights reserved. 
*/ #ifndef _SYS_CPUCAPS_IMPL_H @@ -66,8 +67,12 @@ typedef struct cpucap { waitq_t cap_waitq; /* waitq for capped threads */ kstat_t *cap_kstat; /* cpucaps specific kstat */ int64_t cap_gen; /* zone cap specific */ + hrtime_t cap_chk_value; /* effective CPU usage cap */ hrtime_t cap_value; /* scaled CPU usage cap */ hrtime_t cap_usage; /* current CPU usage */ + hrtime_t cap_base; /* base CPU for burst */ + u_longlong_t cap_burst_limit; /* max secs (in tics) for a burst */ + u_longlong_t cap_bursting; /* # of ticks currently bursting */ disp_lock_t cap_usagelock; /* protects cap_usage above */ /* * Per cap statistics. @@ -75,6 +80,7 @@ typedef struct cpucap { hrtime_t cap_maxusage; /* maximum cap usage */ u_longlong_t cap_below; /* # of ticks spend below the cap */ u_longlong_t cap_above; /* # of ticks spend above the cap */ + u_longlong_t cap_above_base; /* # of ticks spent above the base */ } cpucap_t; /* diff --git a/usr/src/uts/common/sys/cpuvar.h b/usr/src/uts/common/sys/cpuvar.h index 6cfb19f56c..21bdfbd160 100644 --- a/usr/src/uts/common/sys/cpuvar.h +++ b/usr/src/uts/common/sys/cpuvar.h @@ -23,7 +23,7 @@ * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright 2014 Igor Kozhukhov <ikozhukhov@gmail.com>. - * Copyright 2018 Joyent, Inc. + * Copyright 2019 Joyent, Inc. * Copyright 2017 RackTop Systems. * Copyright 2019 Joyent, Inc. */ @@ -390,6 +390,8 @@ extern cpu_core_t cpu_core[]; #define CPU_DISP_DONTSTEAL 0x01 /* CPU undergoing context swtch */ #define CPU_DISP_HALTED 0x02 /* CPU halted waiting for interrupt */ +/* Note: inside ifdef: _KERNEL || _KMEMUSER || _BOOT */ + /* * Macros for manipulating sets of CPUs as a bitmap. 
Note that this * bitmap may vary in size depending on the maximum CPU id a specific @@ -512,6 +514,7 @@ extern struct cpu **cpu_seq; /* indexed by sequential CPU id */ extern cpu_t *cpu_list; /* list of CPUs */ extern cpu_t *cpu_active; /* list of active CPUs */ extern cpuset_t cpu_active_set; /* cached set of active CPUs */ +extern cpuset_t cpu_available; /* cached set of available CPUs */ extern int ncpus; /* number of CPUs present */ extern int ncpus_online; /* number of CPUs not quiesced */ extern int ncpus_intr_enabled; /* nr of CPUs taking I/O intrs */ diff --git a/usr/src/uts/common/sys/cred.h b/usr/src/uts/common/sys/cred.h index fb79dfecde..1f938132e0 100644 --- a/usr/src/uts/common/sys/cred.h +++ b/usr/src/uts/common/sys/cred.h @@ -93,6 +93,7 @@ extern gid_t crgetgid(const cred_t *); extern gid_t crgetrgid(const cred_t *); extern gid_t crgetsgid(const cred_t *); extern zoneid_t crgetzoneid(const cred_t *); +extern zoneid_t crgetzonedid(const cred_t *); extern projid_t crgetprojid(const cred_t *); extern cred_t *crgetmapped(const cred_t *); diff --git a/usr/src/uts/common/sys/cyclic.h b/usr/src/uts/common/sys/cyclic.h index 5f28543f9f..270a09449f 100644 --- a/usr/src/uts/common/sys/cyclic.h +++ b/usr/src/uts/common/sys/cyclic.h @@ -23,6 +23,7 @@ * Use is subject to license terms. * * Copyright 2017 RackTop Systems. + * Copyright 2018 Joyent, Inc. */ #ifndef _SYS_CYCLIC_H @@ -81,6 +82,7 @@ extern cyclic_id_t cyclic_add_omni(cyc_omni_handler_t *); extern void cyclic_remove(cyclic_id_t); extern void cyclic_bind(cyclic_id_t, cpu_t *, cpupart_t *); extern int cyclic_reprogram(cyclic_id_t, hrtime_t); +extern void cyclic_move_here(cyclic_id_t); extern hrtime_t cyclic_getres(); extern int cyclic_offline(cpu_t *cpu); diff --git a/usr/src/uts/common/sys/ddi_hp.h b/usr/src/uts/common/sys/ddi_hp.h index eadb88ed49..b88762a9f5 100644 --- a/usr/src/uts/common/sys/ddi_hp.h +++ b/usr/src/uts/common/sys/ddi_hp.h @@ -21,6 +21,8 @@ /* * Copyright 2009 Sun Microsystems, Inc. 
All rights reserved. * Use is subject to license terms. + * + * Copyright 2019 Joyent, Inc. */ #ifndef _SYS_DDI_HP_H @@ -28,6 +30,9 @@ /* * Sun DDI hotplug support definitions + * + * See the big theory statement in uts/common/os/ddi_hp_impl.c for more + * information. */ #ifdef __cplusplus @@ -73,7 +78,8 @@ typedef enum { /* * ddi_hp_cn_info_t * - * Hotplug Connection (CN) information structure + * Hotplug Connection (CN) information structure. + * A Connection is either a Connector or a Port. */ typedef struct ddi_hp_cn_info { char *cn_name; /* Name of the Connection */ diff --git a/usr/src/uts/common/sys/ddi_hp_impl.h b/usr/src/uts/common/sys/ddi_hp_impl.h index fb220119dd..b52df77cac 100644 --- a/usr/src/uts/common/sys/ddi_hp_impl.h +++ b/usr/src/uts/common/sys/ddi_hp_impl.h @@ -21,6 +21,12 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2019 Joyent, Inc. + */ + +/* + * See the big theory statement in uts/common/os/ddi_hp_impl.c for more + * information about the structures and functions defined here. 
*/ #ifndef _SYS_DDI_HP_IMPL_H diff --git a/usr/src/uts/common/sys/dktp/dadk.h b/usr/src/uts/common/sys/dktp/dadk.h index f5c990e7c0..2178ad1f0d 100644 --- a/usr/src/uts/common/sys/dktp/dadk.h +++ b/usr/src/uts/common/sys/dktp/dadk.h @@ -65,6 +65,8 @@ struct dadk { kstat_t *dad_errstats; /* error stats */ kmutex_t dad_cmd_mutex; int dad_cmd_count; + uint32_t dad_err_cnt; /* number of recent errors */ + hrtime_t dad_last_log; /* time of last error log */ }; #define DAD_SECSIZ dad_phyg.g_secsiz diff --git a/usr/src/uts/common/sys/dld.h b/usr/src/uts/common/sys/dld.h index 6449f39a35..5be223ce93 100644 --- a/usr/src/uts/common/sys/dld.h +++ b/usr/src/uts/common/sys/dld.h @@ -192,6 +192,7 @@ typedef struct dld_ioc_rename { datalink_id_t dir_linkid1; datalink_id_t dir_linkid2; char dir_link[MAXLINKNAMELEN]; + boolean_t dir_zoneinit; } dld_ioc_rename_t; /* @@ -204,6 +205,7 @@ typedef struct dld_ioc_rename { typedef struct dld_ioc_zid { zoneid_t diz_zid; datalink_id_t diz_linkid; + boolean_t diz_transient; } dld_ioc_zid_t; /* @@ -356,6 +358,7 @@ typedef struct dld_ioc_led { #define DLD_CAPAB_POLL 0x00000002 #define DLD_CAPAB_PERIM 0x00000003 #define DLD_CAPAB_LSO 0x00000004 +#define DLD_CAPAB_IPCHECK 0x00000005 #define DLD_ENABLE 0x00000001 #define DLD_DISABLE 0x00000002 @@ -382,6 +385,7 @@ typedef struct dld_ioc_led { */ typedef int (*dld_capab_func_t)(void *, uint_t, void *, uint_t); +#define DI_DIRECT_RAW 0x1 /* * Direct Tx/Rx capability. */ @@ -406,8 +410,16 @@ typedef struct dld_capab_direct_s { /* flow control "can I put on a ring" callback */ uintptr_t di_tx_fctl_df; /* canput-like callback */ void *di_tx_fctl_dh; + + /* flags that control our behavior */ + uint_t di_flags; } dld_capab_direct_t; +typedef struct dld_capab_ipcheck_s { + uintptr_t ipc_allowed_df; + void *ipc_allowed_dh; +} dld_capab_ipcheck_t; + /* * Polling/softring capability. 
*/ diff --git a/usr/src/uts/common/sys/dld_impl.h b/usr/src/uts/common/sys/dld_impl.h index 035eea893a..336fa9cb67 100644 --- a/usr/src/uts/common/sys/dld_impl.h +++ b/usr/src/uts/common/sys/dld_impl.h @@ -53,7 +53,8 @@ typedef enum { typedef enum { DLD_UNINITIALIZED, DLD_PASSIVE, - DLD_ACTIVE + DLD_ACTIVE, + DLD_EXCLUSIVE } dld_passivestate_t; /* @@ -256,6 +257,8 @@ extern void dld_str_rx_unitdata(void *, mac_resource_handle_t, extern void dld_str_notify_ind(dld_str_t *); extern mac_tx_cookie_t str_mdata_fastpath_put(dld_str_t *, mblk_t *, uintptr_t, uint16_t); +extern mac_tx_cookie_t str_mdata_raw_fastpath_put(dld_str_t *, mblk_t *, + uintptr_t, uint16_t); extern int dld_flow_ctl_callb(dld_str_t *, uint64_t, int (*func)(), void *); diff --git a/usr/src/uts/common/sys/dld_ioc.h b/usr/src/uts/common/sys/dld_ioc.h index 2f519a8eda..093a4dc0c3 100644 --- a/usr/src/uts/common/sys/dld_ioc.h +++ b/usr/src/uts/common/sys/dld_ioc.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #ifndef _SYS_DLD_IOC_H @@ -59,6 +60,7 @@ extern "C" { #define IPTUN_IOC 0x454A #define BRIDGE_IOC 0xB81D #define IBPART_IOC 0x6171 +#define OVERLAY_IOC 0x2005 /* GLDv3 modules use these macros to generate unique ioctl commands */ #define DLDIOC(cmdid) DLD_IOC_CMD(DLD_IOC, (cmdid)) @@ -68,6 +70,7 @@ extern "C" { #define IPTUNIOC(cmdid) DLD_IOC_CMD(IPTUN_IOC, (cmdid)) #define BRIDGEIOC(cmdid) DLD_IOC_CMD(BRIDGE_IOC, (cmdid)) #define IBPARTIOC(cmdid) DLD_IOC_CMD(IBPART_IOC, (cmdid)) +#define OVERLAYIOC(cmdid) DLD_IOC_CMD(OVERLAY_IOC, (cmdid)) #ifdef _KERNEL diff --git a/usr/src/uts/common/sys/dlpi.h b/usr/src/uts/common/sys/dlpi.h index 5bc2bd41c5..d76daffeb7 100644 --- a/usr/src/uts/common/sys/dlpi.h +++ b/usr/src/uts/common/sys/dlpi.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2018 Joyent, Inc. 
*/ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -107,6 +108,7 @@ typedef struct dl_ipnetinfo { #define DL_PASSIVE_REQ 0x114 /* Allow access to aggregated link */ #define DL_INTR_MODE_REQ 0x115 /* Request Rx processing in INTR mode */ #define DL_NOTIFY_CONF 0x116 /* Notification from upstream */ +#define DL_EXCLUSIVE_REQ 0x117 /* Make bind active */ /* * Primitives used for Connectionless Service @@ -388,6 +390,8 @@ typedef struct dl_ipnetinfo { #define DL_PROMISC_PHYS 0x01 /* promiscuous mode at phys level */ #define DL_PROMISC_SAP 0x02 /* promiscuous mode at sap level */ #define DL_PROMISC_MULTI 0x03 /* promiscuous mode for multicast */ +#define DL_PROMISC_RX_ONLY 0x04 /* above only enabled for rx */ +#define DL_PROMISC_FIXUPS 0x05 /* above will be fixed up */ /* * DLPI notification codes for DL_NOTIFY_REQ primitives. @@ -673,11 +677,11 @@ typedef struct { #define HCKSUM_ENABLE 0x01 /* Set to enable hardware checksum */ /* capability */ #define HCKSUM_INET_PARTIAL 0x02 /* Partial 1's complement checksum */ - /* ability */ + /* ability for TCP/UDP packets. */ #define HCKSUM_INET_FULL_V4 0x04 /* Full 1's complement checksum */ - /* ability for IPv4 packets. */ + /* ability for IPv4 TCP/UDP packets. */ #define HCKSUM_INET_FULL_V6 0x08 /* Full 1's complement checksum */ - /* ability for IPv6 packets. */ + /* ability for IPv6 TCP/UDP packets. 
*/ #define HCKSUM_IPHDRCKSUM 0x10 /* IPv4 Header checksum offload */ /* capability */ #ifdef _KERNEL @@ -1107,6 +1111,13 @@ typedef struct { } dl_intr_mode_req_t; /* + * DL_EXCLUSIVE_REQ, M_PROTO type + */ +typedef struct { + t_uscalar_t dl_primitive; +} dl_exclusive_req_t; + +/* * CONNECTION-ORIENTED SERVICE PRIMITIVES */ @@ -1528,6 +1539,7 @@ union DL_primitives { dl_control_ack_t control_ack; dl_passive_req_t passive_req; dl_intr_mode_req_t intr_mode_req; + dl_exclusive_req_t exclusive_req; }; #define DL_INFO_REQ_SIZE sizeof (dl_info_req_t) @@ -1596,6 +1608,7 @@ union DL_primitives { #define DL_CONTROL_ACK_SIZE sizeof (dl_control_ack_t) #define DL_PASSIVE_REQ_SIZE sizeof (dl_passive_req_t) #define DL_INTR_MODE_REQ_SIZE sizeof (dl_intr_mode_req_t) +#define DL_EXCLUSIVE_REQ_SIZE sizeof (dl_exclusive_req_t) #ifdef _KERNEL /* diff --git a/usr/src/uts/common/sys/dls.h b/usr/src/uts/common/sys/dls.h index 2d5bf3c2ea..81f9e2abac 100644 --- a/usr/src/uts/common/sys/dls.h +++ b/usr/src/uts/common/sys/dls.h @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. 
*/ #ifndef _SYS_DLS_H @@ -85,6 +86,8 @@ typedef struct dls_link_s dls_link_t; #define DLS_PROMISC_SAP 0x00000001 #define DLS_PROMISC_MULTI 0x00000002 #define DLS_PROMISC_PHYS 0x00000004 +#define DLS_PROMISC_RX_ONLY 0x00000008 +#define DLS_PROMISC_FIXUPS 0x00000010 extern int dls_open(dls_link_t *, dls_dl_handle_t, dld_str_t *); extern void dls_close(dld_str_t *); @@ -106,11 +109,13 @@ extern void str_notify(void *, mac_notify_type_t); extern int dls_devnet_open(const char *, dls_dl_handle_t *, dev_t *); +extern int dls_devnet_open_in_zone(const char *, + dls_dl_handle_t *, dev_t *, zoneid_t); extern void dls_devnet_close(dls_dl_handle_t); extern boolean_t dls_devnet_rebuild(); extern int dls_devnet_rename(datalink_id_t, datalink_id_t, - const char *); + const char *, boolean_t); extern int dls_devnet_create(mac_handle_t, datalink_id_t, zoneid_t); extern int dls_devnet_destroy(mac_handle_t, datalink_id_t *, @@ -128,7 +133,7 @@ extern uint16_t dls_devnet_vid(dls_dl_handle_t); extern datalink_id_t dls_devnet_linkid(dls_dl_handle_t); extern int dls_devnet_dev2linkid(dev_t, datalink_id_t *); extern int dls_devnet_phydev(datalink_id_t, dev_t *); -extern int dls_devnet_setzid(dls_dl_handle_t, zoneid_t); +extern int dls_devnet_setzid(dls_dl_handle_t, zoneid_t, boolean_t); extern zoneid_t dls_devnet_getzid(dls_dl_handle_t); extern zoneid_t dls_devnet_getownerzid(dls_dl_handle_t); extern boolean_t dls_devnet_islinkvisible(datalink_id_t, zoneid_t); @@ -142,6 +147,8 @@ extern int dls_mgmt_update(const char *, uint32_t, boolean_t, extern int dls_mgmt_get_linkinfo(datalink_id_t, char *, datalink_class_t *, uint32_t *, uint32_t *); extern int dls_mgmt_get_linkid(const char *, datalink_id_t *); +extern int dls_mgmt_get_linkid_in_zone(const char *, + datalink_id_t *, zoneid_t); extern datalink_id_t dls_mgmt_get_next(datalink_id_t, datalink_class_t, datalink_media_t, uint32_t); extern int dls_devnet_macname2linkid(const char *, diff --git a/usr/src/uts/common/sys/dls_impl.h 
b/usr/src/uts/common/sys/dls_impl.h index cd13a41413..329f8dd08e 100644 --- a/usr/src/uts/common/sys/dls_impl.h +++ b/usr/src/uts/common/sys/dls_impl.h @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. */ #ifndef _SYS_DLS_IMPL_H @@ -62,6 +63,7 @@ struct dls_link_s { /* Protected by */ uint_t dl_zone_ref; link_tagmode_t dl_tagmode; /* atomic */ uint_t dl_nonip_cnt; /* SL */ + uint_t dl_exclusive; /* SL */ }; typedef struct dls_head_s { @@ -97,7 +99,8 @@ extern void dls_create_str_kstats(dld_str_t *); extern int dls_stat_update(kstat_t *, dls_link_t *, int); extern int dls_stat_create(const char *, int, const char *, zoneid_t, int (*)(struct kstat *, int), void *, - kstat_t **); + kstat_t **, zoneid_t); +extern void dls_stat_delete(kstat_t *); extern int dls_devnet_open_by_dev(dev_t, dls_link_t **, dls_dl_handle_t *); @@ -129,6 +132,7 @@ extern void dls_mgmt_init(void); extern void dls_mgmt_fini(void); extern int dls_mgmt_get_phydev(datalink_id_t, dev_t *); +extern int dls_exclusive_set(dld_str_t *, boolean_t); #ifdef __cplusplus } diff --git a/usr/src/uts/common/sys/dls_mgmt.h b/usr/src/uts/common/sys/dls_mgmt.h index e2893a2295..6fec277991 100644 --- a/usr/src/uts/common/sys/dls_mgmt.h +++ b/usr/src/uts/common/sys/dls_mgmt.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc. 
*/ #ifndef _DLS_MGMT_H @@ -46,13 +47,15 @@ typedef enum { DATALINK_CLASS_SIMNET = 0x20, DATALINK_CLASS_BRIDGE = 0x40, DATALINK_CLASS_IPTUN = 0x80, - DATALINK_CLASS_PART = 0x100 + DATALINK_CLASS_PART = 0x100, + DATALINK_CLASS_OVERLAY = 0x200 } datalink_class_t; #define DATALINK_CLASS_ALL (DATALINK_CLASS_PHYS | \ DATALINK_CLASS_VLAN | DATALINK_CLASS_AGGR | DATALINK_CLASS_VNIC | \ DATALINK_CLASS_ETHERSTUB | DATALINK_CLASS_SIMNET | \ - DATALINK_CLASS_BRIDGE | DATALINK_CLASS_IPTUN | DATALINK_CLASS_PART) + DATALINK_CLASS_BRIDGE | DATALINK_CLASS_IPTUN | DATALINK_CLASS_PART | \ + DATALINK_CLASS_OVERLAY) /* * A combination of flags and media. @@ -111,10 +114,14 @@ typedef uint64_t datalink_media_t; #define DLMGMT_CMD_BASE 128 /* - * Indicate the link mapping is active or persistent + * Indicate if the link mapping is active, persistent, or transient. A + * transient link is an active link with a twist -- it is an active + * link which is destroyed along with the zone rather than reassigned + * to the GZ. */ #define DLMGMT_ACTIVE 0x01 #define DLMGMT_PERSIST 0x02 +#define DLMGMT_TRANSIENT 0x04 /* upcall argument */ typedef struct dlmgmt_door_arg { @@ -165,6 +172,7 @@ typedef struct dlmgmt_door_getname { typedef struct dlmgmt_door_getlinkid { int ld_cmd; char ld_link[MAXLINKNAMELEN]; + zoneid_t ld_zoneid; } dlmgmt_door_getlinkid_t; typedef struct dlmgmt_door_getnext_s { diff --git a/usr/src/uts/common/sys/dumpadm.h b/usr/src/uts/common/sys/dumpadm.h index 616828bb2b..8ca10ff3c5 100644 --- a/usr/src/uts/common/sys/dumpadm.h +++ b/usr/src/uts/common/sys/dumpadm.h @@ -21,6 +21,7 @@ /* * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright 2019 Joyent, Inc. 
*/ #ifndef _SYS_DUMPADM_H @@ -44,11 +45,13 @@ extern "C" { #define DIOCSETUUID (DDIOC | 0x17) #define DIOCGETUUID (DDIOC | 0x18) #define DIOCRMDEV (DDIOC | 0x19) +#define DIOCSCRYPTKEY (DDIOC | 0x1a) /* * Kernel-controlled dump state flags for dump_conflags */ #define DUMP_EXCL 0x00000001 /* dedicated dump device (not swap) */ +#define DUMP_ENCRYPT 0x00000002 /* encrypt dump */ #define DUMP_STATE 0x0000ffff /* the set of all kernel flags */ /* diff --git a/usr/src/uts/common/sys/dumphdr.h b/usr/src/uts/common/sys/dumphdr.h index f418913257..57a9a9c2dc 100644 --- a/usr/src/uts/common/sys/dumphdr.h +++ b/usr/src/uts/common/sys/dumphdr.h @@ -21,6 +21,7 @@ /* * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright 2019 Joyent, Inc. */ #ifndef _SYS_DUMPHDR_H @@ -60,6 +61,22 @@ extern "C" { sizeof (summary_dump_t) + 1024), \ DUMP_OFFSET)) /* summary save area */ +#define DUMP_CRYPT_KEYLEN 32 /* byte len for crypto key */ +#define DUMP_CRYPT_NONCELEN 8 /* byte len for nonce */ +#define DUMP_CRYPT_HMACLEN 64 /* byte len for HMAC */ +#define DUMP_CRYPT_BLOCKSHIFT 6 /* 64-byte blocks */ + +#define DUMP_CRYPT_ALGO_NONE 0 /* dump not encrypted */ +#define DUMP_CRYPT_ALGO_CHACHA20 1 /* ChaCha20 */ + +#if DUMP_OFFSET & ((1 << DUMP_CRYPT_BLOCKSHIFT) - 1) +#error DUMP_OFFSET not DUMP_CRYPT_BLOCKSHIFT aligned +#endif + +#if DUMP_LOGSIZE & ((1 << DUMP_CRYPT_BLOCKSHIFT) - 1) +#error DUMP_LOGSIZE not DUMP_CRYPT_BLOCKSHIFT aligned +#endif + typedef struct dumphdr { uint32_t dump_magic; /* magic number */ uint32_t dump_version; /* version number */ @@ -86,12 +103,22 @@ typedef struct dumphdr { } dumphdr_t; /* + * If DF_ENCRYPTED is set, this header will be found after the dumphdr. 
+ */ +typedef struct dump_crypt { + uint8_t dump_crypt_algo; /* encryption algorithm */ + uint8_t dump_crypt_hmac[DUMP_CRYPT_HMACLEN]; /* HMAC for crypto key */ + uint8_t dump_crypt_nonce[DUMP_CRYPT_NONCELEN]; /* encryption nonce */ +} dump_crypt_t; + +/* * Values for dump_flags */ #define DF_VALID 0x00000001 /* Dump is valid (savecore clears) */ #define DF_COMPLETE 0x00000002 /* All pages present as configured */ #define DF_LIVE 0x00000004 /* Dump was taken on a live system */ #define DF_COMPRESSED 0x00000008 /* Dump is compressed */ +#define DF_ENCRYPTED 0x00000010 /* Dump is encrypted */ #define DF_KERNEL 0x00010000 /* Contains kernel pages only */ #define DF_ALL 0x00020000 /* Contains all pages */ #define DF_CURPROC 0x00040000 /* Contains kernel + cur proc pages */ @@ -175,6 +202,8 @@ extern u_offset_t dumpvp_size; extern struct dumphdr *dumphdr; extern int dump_conflags; extern char *dumppath; +extern uint8_t dump_crypt_key[DUMP_CRYPT_KEYLEN]; +extern uint8_t dump_crypt_nonce[DUMP_CRYPT_NONCELEN]; extern int dump_timeout; extern int dump_timeleft; diff --git a/usr/src/uts/common/sys/elf.h b/usr/src/uts/common/sys/elf.h index 4bd884e9c2..1a2ca397ef 100644 --- a/usr/src/uts/common/sys/elf.h +++ b/usr/src/uts/common/sys/elf.h @@ -500,6 +500,11 @@ typedef struct { #define PT_GNU_STACK 0x6474e551 /* Indicates stack executability */ #define PT_GNU_RELRO 0x6474e552 /* Read-only after relocation */ +/* + * Linux specific program headers not even used by Linux (!!) 
+ */ +#define PT_PAX_FLAGS 0x65041580 /* PaX flags (see below) */ + #define PT_LOSUNW 0x6ffffffa #define PT_SUNWBSS 0x6ffffffa /* Sun Specific segment (unused) */ #define PT_SUNWSTACK 0x6ffffffb /* describes the stack segment */ @@ -515,6 +520,45 @@ typedef struct { #define PF_W 0x2 #define PF_X 0x1 +/* + * PaX is a regrettable series of never-integrated Linux patches for a + * facility to provide additional protections on memory pages for purposes of + * increasing security, and for allowing binaries to demand (or refuse) those + * protections via the PT_PAX_FLAGS program header. (Portents of its + * rudderless existence, "PaX" is a term of indefinite origin written by an + * unknown group of people.) This facility is unfortunate in any number of + * ways, and was largely obviated by the broad adoption of non-executable + * stacks at any rate -- but it lives on in binaries that continue to mark + * themselves to explicitly refuse the (never-integrated, now-obviated) + * facility. One might cringe that PaX overloads the meaning of the p_flags + * to specify protections, but that is the least of its transgressions: + * instead of using one p_type constant to explicitly enable a series of + * protections and another to explicitly disable others, it insists on + * conflating both actions into PT_PAX_FLAGS. The resulting doubling of + * constant definitions (two constant definitions for every protection instead + * of merely one) assures that the values can't even fit in the eight + * PF_MASKOS bits putatively defined to provide a modicum of cleanliness for + * such filthy functionality. And were all of this not enough, there is one + * final nomenclature insult to be added to this semantic injury: the + * constants for the p_flags don't even embed "_PAX_" in their name -- despite + * the fact that this is their only purpose! 
We resist the temptation to + * right this final wrong here; we grit our teeth and provide exactly the + * Linux definitions -- or rather, what would have been the Linux definitions + * had this belching jalopy ever been permitted to crash itself into mainline. + */ +#define PF_PAGEEXEC 0x00000010 /* PaX: enable PAGEEXEC */ +#define PF_NOPAGEEXEC 0x00000020 /* PaX: disable PAGEEXEC */ +#define PF_SEGMEXEC 0x00000040 /* PaX: enable SEGMEXEC */ +#define PF_NOSEGMEXEC 0x00000080 /* PaX: disable SEGMEXEC */ +#define PF_MPROTECT 0x00000100 /* PaX: enable MPROTECT */ +#define PF_NOMPROTECT 0x00000200 /* PaX: disable MPROTECT */ +#define PF_RANDEXEC 0x00000400 /* PaX: enable RANDEXEC */ +#define PF_NORANDEXEC 0x00000800 /* PaX: disable RANDEXEC */ +#define PF_EMUTRAMP 0x00001000 /* PaX: enable EMUTRAMP */ +#define PF_NOEMUTRAMP 0x00002000 /* PaX: disable EMUTRAMP */ +#define PF_RANDMMAP 0x00004000 /* PaX: enable RANDMMAP */ +#define PF_NORANDMMAP 0x00008000 /* PaX: disable RANDMMAP */ + #define PF_MASKOS 0x0ff00000 /* OS specific values */ #define PF_MASKPROC 0xf0000000 /* processor specific values */ diff --git a/usr/src/uts/common/sys/eventfd.h b/usr/src/uts/common/sys/eventfd.h index 1b0d961b0b..b64a101348 100644 --- a/usr/src/uts/common/sys/eventfd.h +++ b/usr/src/uts/common/sys/eventfd.h @@ -10,7 +10,7 @@ */ /* - * Copyright (c) 2015 Joyent, Inc. All rights reserved. + * Copyright (c) 2017, Joyent, Inc. */ /* @@ -47,6 +47,13 @@ typedef uint64_t eventfd_t; #define EVENTFDIOC (('e' << 24) | ('f' << 16) | ('d' << 8)) #define EVENTFDIOC_SEMAPHORE (EVENTFDIOC | 1) /* toggle sem state */ +/* + * Kernel-internal method to write to eventfd while bypassing overflow limits, + * therefore avoiding potential to block as well. This is used to fulfill AIO + * behavior in LX related to eventfd notification. 
+ */ +#define EVENTFDIOC_POST (EVENTFDIOC | 2) + #ifndef _KERNEL extern int eventfd(unsigned int, int); @@ -58,6 +65,7 @@ extern int eventfd_write(int, eventfd_t); #define EVENTFDMNRN_EVENTFD 0 #define EVENTFDMNRN_CLONE 1 #define EVENTFD_VALMAX (ULLONG_MAX - 1ULL) +#define EVENTFD_VALOVERFLOW ULLONG_MAX #endif /* _KERNEL */ diff --git a/usr/src/uts/common/sys/exec.h b/usr/src/uts/common/sys/exec.h index 8056f9a8e8..d66a8dc15d 100644 --- a/usr/src/uts/common/sys/exec.h +++ b/usr/src/uts/common/sys/exec.h @@ -26,6 +26,10 @@ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ +/* + * Copyright 2019 Joyent, Inc. + */ + #ifndef _SYS_EXEC_H #define _SYS_EXEC_H @@ -76,10 +80,11 @@ typedef struct uarg { ssize_t na; ssize_t ne; ssize_t nc; - ssize_t arglen; + size_t argstrlen; + size_t envstrlen; char *fname; char *pathname; - ssize_t auxsize; + size_t auxsize; caddr_t stackend; size_t stk_align; size_t stk_size; @@ -102,10 +107,13 @@ typedef struct uarg { vnode_t *ex_vp; char *emulator; char *brandname; + const char *brand_nroot; char *auxp_auxflags; /* addr of auxflags auxv on the user stack */ char *auxp_brand; /* address of first brand auxv on user stack */ cred_t *pfcred; boolean_t scrubenv; + uintptr_t maxstack; + boolean_t stk_prot_override; uintptr_t commpage; } uarg_t; @@ -175,8 +183,8 @@ struct execsw { int exec_maglen; int (*exec_func)(struct vnode *vp, struct execa *uap, struct uarg *args, struct intpdata *idata, int level, - long *execsz, int setid, caddr_t exec_file, - struct cred *cred, int brand_action); + size_t *execsz, int setid, caddr_t exec_file, + struct cred *cred, int *brand_action); int (*exec_core)(struct vnode *vp, struct proc *p, struct cred *cred, rlim64_t rlimit, int sig, core_content_t content); @@ -213,8 +221,8 @@ extern int exece(const char *fname, const char **argp, const char **envp); extern int exec_common(const char *fname, const char **argp, const char **envp, int brand_action); extern int gexec(vnode_t **vp, 
struct execa *uap, struct uarg *args, - struct intpdata *idata, int level, long *execsz, caddr_t exec_file, - struct cred *cred, int brand_action); + struct intpdata *idata, int level, size_t *execsz, caddr_t exec_file, + struct cred *cred, int *brand_action); extern struct execsw *allocate_execsw(char *name, char *magic, size_t magic_size); extern struct execsw *findexecsw(char *magic); @@ -239,26 +247,32 @@ extern void exec_set_sp(size_t); * when compiling the 32-bit compatability elf code in the elfexec module. */ extern int elfexec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int, - long *, int, caddr_t, cred_t *, int); + size_t *, int, caddr_t, cred_t *, int *); extern int mapexec_brand(vnode_t *, uarg_t *, Ehdr *, Addr *, - intptr_t *, caddr_t, int *, caddr_t *, caddr_t *, size_t *, uintptr_t *); + intptr_t *, caddr_t, char **, caddr_t *, caddr_t *, size_t *, + uintptr_t *, uintptr_t *); +extern int elfreadhdr(vnode_t *, cred_t *, Ehdr *, uint_t *, caddr_t *, + size_t *); #endif /* !_ELF32_COMPAT */ #if defined(_LP64) extern int elf32exec(vnode_t *, execa_t *, uarg_t *, intpdata_t *, int, - long *, int, caddr_t, cred_t *, int); + size_t *, int, caddr_t, cred_t *, int *); extern int mapexec32_brand(vnode_t *, uarg_t *, Elf32_Ehdr *, Elf32_Addr *, - intptr_t *, caddr_t, int *, caddr_t *, caddr_t *, size_t *, uintptr_t *); + intptr_t *, caddr_t, char **, caddr_t *, caddr_t *, size_t *, + uintptr_t *, uintptr_t *); +extern int elf32readhdr(vnode_t *, cred_t *, Elf32_Ehdr *, uint_t *, caddr_t *, + size_t *); #endif /* _LP64 */ /* * Utility functions for exec module core routines: */ -extern int core_seg(proc_t *, vnode_t *, offset_t, caddr_t, - size_t, rlim64_t, cred_t *); +extern int core_seg(proc_t *, vnode_t *, u_offset_t, caddr_t, size_t, + rlim64_t, cred_t *); -extern int core_write(vnode_t *, enum uio_seg, offset_t, - const void *, size_t, rlim64_t, cred_t *); +extern int core_write(vnode_t *, enum uio_seg, u_offset_t, const void *, + size_t, rlim64_t, 
cred_t *); /* a.out stuff */ diff --git a/usr/src/uts/common/sys/file.h b/usr/src/uts/common/sys/file.h index ec0741fe08..556a7ab2a1 100644 --- a/usr/src/uts/common/sys/file.h +++ b/usr/src/uts/common/sys/file.h @@ -27,13 +27,13 @@ /* All Rights Reserved */ /* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. */ -/* Copyright 2015 Joyent, Inc. */ +/* Copyright 2017 Joyent, Inc. */ #ifndef _SYS_FILE_H #define _SYS_FILE_H #include <sys/t_lock.h> -#ifdef _KERNEL +#if defined(_KERNEL) || defined(_FAKE_KERNEL) #include <sys/model.h> #include <sys/user.h> #endif @@ -122,11 +122,6 @@ typedef struct fpollinfo { #if defined(_KERNEL) || defined(_FAKE_KERNEL) /* - * This is a flag that is set on f_flag2, but is never user-visible - */ -#define FEPOLLED 0x8000 - -/* * Fake flags for driver ioctl calls to inform them of the originating * process' model. See <sys/model.h> * @@ -200,6 +195,7 @@ struct vattr; struct uf_info; extern file_t *getf(int); +extern file_t *getf_gen(int, uf_entry_gen_t *); extern void releasef(int); extern void areleasef(int, struct uf_info *); #ifndef _BOOT @@ -226,6 +222,7 @@ extern void fcnt_add(struct uf_info *, int); extern void close_exec(struct uf_info *); extern void clear_stale_fd(void); extern void clear_active_fd(int); +extern void set_active_fd(int); extern void free_afd(afd_t *afd); extern int fgetstartvp(int, char *, struct vnode **); extern int fsetattrat(int, char *, int, struct vattr *); diff --git a/usr/src/uts/common/sys/frameio.h b/usr/src/uts/common/sys/frameio.h new file mode 100644 index 0000000000..54e6dbeedf --- /dev/null +++ b/usr/src/uts/common/sys/frameio.h @@ -0,0 +1,107 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_FRAMEIO_H +#define _SYS_FRAMEIO_H + +/* + * Frame I/O definitions + */ + +#include <sys/types.h> + +#ifdef _KERNEL +/* Kernel only headers */ +#include <sys/stream.h> +#endif /* _KERNEL */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * An individual frame vector component. Collections of these are used to make + * ioctls. + */ +typedef struct framevec { + void *fv_buf; /* Buffer with data */ + size_t fv_buflen; /* Size of the buffer */ + size_t fv_actlen; /* Amount of buffer consumed, ignore on error */ +} framevec_t; + +/* + * The base unit used with frameio. + */ +typedef struct frameio { + uint_t fio_version; /* Should always be FRAMEIO_CURRENT_VERSION */ + uint_t fio_nvpf; /* How many vectors make up one frame */ + uint_t fio_nvecs; /* The total number of vectors */ + framevec_t fio_vecs[]; /* C99 flexible array member */ +} frameio_t; + + +#define FRAMEIO_VERSION_ONE 1 +#define FRAMEIO_CURRENT_VERSION FRAMEIO_VERSION_ONE + +#define FRAMEIO_NVECS_MAX 32 + +/* + * Definitions for kernel modules to include as helpers. These are consolidation + * private. + */ +#ifdef _KERNEL + +/* + * 32-bit versions for 64-bit kernels + */ +typedef struct framevec32 { + caddr32_t fv_buf; + size32_t fv_buflen; + size32_t fv_actlen; +} framevec32_t; + +typedef struct frameio32 { + uint_t fio_version; + uint_t fio_vecspframe; + uint_t fio_nvecs; + framevec32_t fio_vecs[]; +} frameio32_t; + +/* + * Describe the different ways that vectors should map to frames. 
+ */ +typedef enum frameio_write_mblk_map { + MAP_BLK_FRAME +} frameio_write_mblk_map_t; + +int frameio_init(void); +void frameio_fini(void); +frameio_t *frameio_alloc(int); +void frameio_free(frameio_t *); +int frameio_hdr_copyin(frameio_t *, int, const void *, uint_t); +int frameio_mblk_chain_read(frameio_t *, mblk_t **, int *, int); +int frameio_mblk_chain_write(frameio_t *, frameio_write_mblk_map_t, mblk_t *, + int *, int); +int frameio_hdr_copyout(frameio_t *, int, void *, uint_t); +size_t frameio_frame_length(frameio_t *, framevec_t *); +void frameio_mark_consumed(frameio_t *, int); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FRAMEIO_H */ diff --git a/usr/src/uts/common/sys/fs/fifonode.h b/usr/src/uts/common/sys/fs/fifonode.h index d8b158ce3c..1ea8563e1c 100644 --- a/usr/src/uts/common/sys/fs/fifonode.h +++ b/usr/src/uts/common/sys/fs/fifonode.h @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -83,6 +84,7 @@ struct fifonode { struct msgb *fn_tail; /* last message to read */ fifolock_t *fn_lock; /* pointer to per fifo lock */ uint_t fn_count; /* Number of bytes on fn_mp */ + uint_t fn_hiwat; /* pipe (fifofast) high water */ kcondvar_t fn_wait_cv; /* fifo conditional variable */ ushort_t fn_wcnt; /* number of writers */ ushort_t fn_rcnt; /* number of readers */ @@ -135,6 +137,8 @@ typedef struct fifodata { #define FIFOPOLLRBAND 0x20000 #define FIFOSTAYFAST 0x40000 /* don't turn into stream mode */ #define FIFOWAITMODE 0x80000 /* waiting for the possibility to change mode */ +/* Data on loan, block reads. Use in conjunction with FIFOSTAYFAST. 
*/ +#define FIFORDBLOCK 0x100000 #define FIFOHIWAT (16 * 1024) #define FIFOLOWAT (0) @@ -147,16 +151,6 @@ typedef struct fifodata { #if defined(_KERNEL) -/* - * Fifohiwat defined as a variable is to allow tuning of the high - * water mark if needed. It is not meant to be released. - */ -#if FIFODEBUG -extern int Fifohiwat; -#else /* FIFODEBUG */ -#define Fifohiwat FIFOHIWAT -#endif /* FIFODEBUG */ - extern struct vnodeops *fifo_vnodeops; extern const struct fs_operation_def fifo_vnodeops_template[]; extern struct kmem_cache *fnode_cache; @@ -181,6 +175,8 @@ extern void fifo_fastoff(fifonode_t *); extern struct streamtab *fifo_getinfo(); extern void fifo_wakereader(fifonode_t *, fifolock_t *); extern void fifo_wakewriter(fifonode_t *, fifolock_t *); +extern boolean_t fifo_stayfast_enter(fifonode_t *); +extern void fifo_stayfast_exit(fifonode_t *); #endif /* _KERNEL */ diff --git a/usr/src/uts/common/sys/fs/hyprlofs.h b/usr/src/uts/common/sys/fs/hyprlofs.h new file mode 100644 index 0000000000..b8c4149df2 --- /dev/null +++ b/usr/src/uts/common/sys/fs/hyprlofs.h @@ -0,0 +1,91 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2012, Joyent, Inc. All rights reserved. 
+ */ + +#ifndef _SYS_FS_HYPRLOFS_H +#define _SYS_FS_HYPRLOFS_H + +#include <sys/param.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * hyprlofs ioctl numbers. + */ +#define HYPRLOFS_IOC ('H' << 8) + +#define HYPRLOFS_ADD_ENTRIES (HYPRLOFS_IOC | 1) +#define HYPRLOFS_RM_ENTRIES (HYPRLOFS_IOC | 2) +#define HYPRLOFS_RM_ALL (HYPRLOFS_IOC | 3) +#define HYPRLOFS_GET_ENTRIES (HYPRLOFS_IOC | 4) + +typedef struct { + char *hle_path; + uint_t hle_plen; + char *hle_name; + uint_t hle_nlen; +} hyprlofs_entry_t; + +typedef struct { + hyprlofs_entry_t *hle_entries; + uint_t hle_len; +} hyprlofs_entries_t; + +typedef struct { + char hce_path[MAXPATHLEN]; + char hce_name[MAXPATHLEN]; +} hyprlofs_curr_entry_t; + +typedef struct { + hyprlofs_curr_entry_t *hce_entries; + uint_t hce_cnt; +} hyprlofs_curr_entries_t; + +#ifdef _KERNEL +typedef struct { + caddr32_t hle_path; + uint_t hle_plen; + caddr32_t hle_name; + uint_t hle_nlen; +} hyprlofs_entry32_t; + +typedef struct { + caddr32_t hle_entries; + uint_t hle_len; +} hyprlofs_entries32_t; + +typedef struct { + caddr32_t hce_entries; + uint_t hce_cnt; +} hyprlofs_curr_entries32_t; + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_HYPRLOFS_H */ diff --git a/usr/src/uts/common/sys/fs/hyprlofs_info.h b/usr/src/uts/common/sys/fs/hyprlofs_info.h new file mode 100644 index 0000000000..38389f77d9 --- /dev/null +++ b/usr/src/uts/common/sys/fs/hyprlofs_info.h @@ -0,0 +1,174 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2012, Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_FS_HYPRLOFS_INFO_H +#define _SYS_FS_HYPRLOFS_INFO_H + +#include <sys/t_lock.h> +#include <vm/seg.h> +#include <vm/seg_vn.h> +#include <sys/vfs_opreg.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * hlnode is the file system dependent node for hyprlofs. + * It is modeled on the tmpfs tmpnode. + * + * hln_rwlock protects access of the directory list at hln_dir + * as well as synchronizing read/writes to directory hlnodes. + * hln_tlock protects updates to hln_mode and hln_nlink. + * hln_tlock doesn't require any hlnode locks. + */ +typedef struct hlnode { + struct hlnode *hln_back; /* linked list of hlnodes */ + struct hlnode *hln_forw; /* linked list of hlnodes */ + union { + struct { + struct hldirent *un_dirlist; /* dirent list */ + uint_t un_dirents; /* number of dirents */ + } un_dirstruct; + vnode_t *un_realvp; /* real vnode */ + } un_hlnode; + vnode_t *hln_vnode; /* vnode for this hlnode */ + int hln_gen; /* pseudo gen num for hlfid */ + int hln_looped; /* flag indicating loopback */ + vattr_t hln_attr; /* attributes */ + krwlock_t hln_rwlock; /* rw - serialize mods and */ + /* directory updates */ + kmutex_t hln_tlock; /* time, flag, and nlink lock */ +} hlnode_t; + +/* + * hyprlofs per-mount data structure. + * All fields are protected by hlm_contents. 
+ */ +typedef struct { + vfs_t *hlm_vfsp; /* filesystem's vfs struct */ + hlnode_t *hlm_rootnode; /* root hlnode */ + char *hlm_mntpath; /* name of hyprlofs mount point */ + dev_t hlm_dev; /* unique dev # of mounted `device' */ + uint_t hlm_gen; /* pseudo generation number for files */ + kmutex_t hlm_contents; /* lock for hlfsmount structure */ +} hlfsmount_t; + +/* + * hyprlofs directories are made up of a linked list of hldirent structures + * hanging off directory hlnodes. File names are not fixed length, + * but are null terminated. + */ +typedef struct hldirent { + hlnode_t *hld_hlnode; /* hlnode for this file */ + struct hldirent *hld_next; /* next directory entry */ + struct hldirent *hld_prev; /* prev directory entry */ + uint_t hld_offset; /* "offset" of dir entry */ + uint_t hld_hash; /* a hash of hld_name */ + struct hldirent *hld_link; /* linked via the hash table */ + hlnode_t *hld_parent; /* parent, dir we are in */ + char *hld_name; /* must be null terminated */ + /* max length is MAXNAMELEN */ +} hldirent_t; + +/* + * hlfid overlays the fid structure (for VFS_VGET) + */ +typedef struct { + uint16_t hlfid_len; + ino32_t hlfid_ino; + int32_t hlfid_gen; +} hlfid_t; + +/* + * File system independent to hyprlofs conversion macros + */ +#define VFSTOHLM(vfsp) ((hlfsmount_t *)(vfsp)->vfs_data) +#define VTOHLM(vp) ((hlfsmount_t *)(vp)->v_vfsp->vfs_data) +#define VTOHLN(vp) ((hlnode_t *)(vp)->v_data) +#define HLNTOV(tp) ((tp)->hln_vnode) +#define REALVP(vp) ((vnode_t *)VTOHLN(vp)->hln_realvp) +#define hlnode_hold(tp) VN_HOLD(HLNTOV(tp)) +#define hlnode_rele(tp) VN_RELE(HLNTOV(tp)) + +#define hln_dir un_hlnode.un_dirstruct.un_dirlist +#define hln_dirents un_hlnode.un_dirstruct.un_dirents +#define hln_realvp un_hlnode.un_realvp + +/* + * Attributes + */ +#define hln_mask hln_attr.va_mask +#define hln_type hln_attr.va_type +#define hln_mode hln_attr.va_mode +#define hln_uid hln_attr.va_uid +#define hln_gid hln_attr.va_gid +#define hln_fsid hln_attr.va_fsid 
+#define hln_nodeid hln_attr.va_nodeid +#define hln_nlink hln_attr.va_nlink +#define hln_size hln_attr.va_size +#define hln_atime hln_attr.va_atime +#define hln_mtime hln_attr.va_mtime +#define hln_ctime hln_attr.va_ctime +#define hln_rdev hln_attr.va_rdev +#define hln_blksize hln_attr.va_blksize +#define hln_nblocks hln_attr.va_nblocks +#define hln_seq hln_attr.va_seq + +/* + * enums + */ +enum de_op { DE_CREATE, DE_MKDIR }; /* direnter ops */ +enum dr_op { DR_REMOVE, DR_RMDIR }; /* dirremove ops */ + +/* + * hyprlofs_minfree is the amount (in pages) of anonymous memory that hyprlofs + * leaves free for the rest of the system. The default value for + * hyprlofs_minfree is btopr(HYPRLOFSMINFREE) but it can be patched to a + * different number of pages. Since hyprlofs doesn't actually use much + * memory, it's unlikely this ever needs to be patched. + */ +#define HYPRLOFSMINFREE 8 * 1024 * 1024 /* 8 Megabytes */ + +extern size_t hyprlofs_minfree; /* Anonymous memory in pages */ + +extern void hyprlofs_node_init(hlfsmount_t *, hlnode_t *, vattr_t *, + cred_t *); +extern int hyprlofs_dirlookup(hlnode_t *, char *, hlnode_t **, cred_t *); +extern int hyprlofs_dirdelete(hlnode_t *, hlnode_t *, char *, enum dr_op, + cred_t *); +extern void hyprlofs_dirinit(hlnode_t *, hlnode_t *); +extern void hyprlofs_dirtrunc(hlnode_t *); +extern int hyprlofs_taccess(void *, int, cred_t *); +extern int hyprlofs_direnter(hlfsmount_t *, hlnode_t *, char *, enum de_op, + vnode_t *, vattr_t *, hlnode_t **, cred_t *); + +extern struct vnodeops *hyprlofs_vnodeops; +extern const struct fs_operation_def hyprlofs_vnodeops_template[]; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_HYPRLOFS_INFO_H */ diff --git a/usr/src/uts/common/sys/fs/sdev_impl.h b/usr/src/uts/common/sys/fs/sdev_impl.h index 9f9ce5c8c1..d1c5f674f1 100644 --- a/usr/src/uts/common/sys/fs/sdev_impl.h +++ b/usr/src/uts/common/sys/fs/sdev_impl.h @@ -37,6 +37,7 @@ extern "C" { #include <sys/vfs_opreg.h> #include <sys/list.h> 
#include <sys/nvpair.h> +#include <sys/fs/sdev_plugin.h> #include <sys/sunddi.h> /* @@ -129,6 +130,21 @@ typedef struct sdev_local_data { struct sdev_dprof sdev_lprof; /* profile for multi-inst */ } sdev_local_data_t; +/* sdev_flags */ +typedef enum sdev_flags { + SDEV_BUILD = 0x0001, /* directory cache out-of-date */ + SDEV_GLOBAL = 0x0002, /* global /dev nodes */ + SDEV_PERSIST = 0x0004, /* backing store persisted node */ + SDEV_NO_NCACHE = 0x0008, /* do not include in neg. cache */ + SDEV_DYNAMIC = 0x0010, /* special-purpose vnode ops */ + /* (ex: pts) */ + SDEV_VTOR = 0x0020, /* validate sdev_nodes during search */ + SDEV_ATTR_INVALID = 0x0040, /* invalid node attributes, */ + /* need update */ + SDEV_SUBDIR = 0x0080, /* match all subdirs under here */ + SDEV_ZONED = 0x0100 /* zoned subdir */ +} sdev_flags_t; + /* * /dev filesystem sdev_node defines */ @@ -151,7 +167,7 @@ typedef struct sdev_node { ino64_t sdev_ino; /* inode */ uint_t sdev_nlink; /* link count */ int sdev_state; /* state of this node */ - int sdev_flags; /* flags bit */ + sdev_flags_t sdev_flags; /* flags bit */ kmutex_t sdev_lookup_lock; /* node creation synch lock */ kcondvar_t sdev_lookup_cv; /* node creation sync cv */ @@ -162,7 +178,7 @@ typedef struct sdev_node { struct sdev_global_data sdev_globaldata; struct sdev_local_data sdev_localdata; } sdev_instance_data; - + list_node_t sdev_plist; /* link on plugin list */ void *sdev_private; } sdev_node_t; @@ -193,29 +209,11 @@ typedef enum { SDEV_READY } sdev_node_state_t; -/* sdev_flags */ -#define SDEV_BUILD 0x0001 /* directory cache out-of-date */ -#define SDEV_GLOBAL 0x0002 /* global /dev nodes */ -#define SDEV_PERSIST 0x0004 /* backing store persisted node */ -#define SDEV_NO_NCACHE 0x0008 /* do not include in neg. 
cache */ -#define SDEV_DYNAMIC 0x0010 /* special-purpose vnode ops */ - /* (ex: pts) */ -#define SDEV_VTOR 0x0020 /* validate sdev_nodes during search */ -#define SDEV_ATTR_INVALID 0x0040 /* invalid node attributes, */ - /* need update */ -#define SDEV_SUBDIR 0x0080 /* match all subdirs under here */ -#define SDEV_ZONED 0x0100 /* zoned subdir */ - /* sdev_lookup_flags */ #define SDEV_LOOKUP 0x0001 /* node creation in progress */ #define SDEV_READDIR 0x0002 /* VDIR readdir in progress */ #define SDEV_LGWAITING 0x0004 /* waiting for devfsadm completion */ -#define SDEV_VTOR_INVALID -1 -#define SDEV_VTOR_SKIP 0 -#define SDEV_VTOR_VALID 1 -#define SDEV_VTOR_STALE 2 - /* convenient macros */ #define SDEV_IS_GLOBAL(dv) \ (dv->sdev_flags & SDEV_GLOBAL) @@ -368,8 +366,13 @@ extern void sdev_devfsadmd_thread(struct sdev_node *, struct sdev_node *, extern int devname_profile_update(char *, size_t); extern struct sdev_data *sdev_find_mntinfo(char *); void sdev_mntinfo_rele(struct sdev_data *); +typedef void (*sdev_mnt_walk_f)(struct sdev_node *, void *); +void sdev_mnt_walk(sdev_mnt_walk_f, void *); extern struct vnodeops *devpts_getvnodeops(void); extern struct vnodeops *devvt_getvnodeops(void); +extern void sdev_plugin_nodeready(struct sdev_node *); +extern int sdev_plugin_init(void); +extern int sdev_plugin_fini(void); /* * boot states - warning, the ordering here is significant @@ -515,6 +518,23 @@ extern void sdev_nc_path_exists(sdev_nc_list_t *, char *); extern void sdev_modctl_dump_files(void); /* + * plugin and legacy vtab stuff + */ +/* directory dependent vop table */ +typedef struct sdev_vop_table { + char *vt_name; /* subdirectory name */ + const fs_operation_def_t *vt_service; /* vnodeops table */ + struct vnodeops **vt_global_vops; /* global container for vop */ + int (*vt_vtor)(struct sdev_node *); /* validate sdev_node */ + int vt_flags; +} sdev_vop_table_t; + +extern struct sdev_vop_table vtab[]; +extern struct vnodeops *sdev_get_vop(struct sdev_node *); 
+extern void sdev_set_no_negcache(struct sdev_node *); +extern void *sdev_get_vtor(struct sdev_node *dv); + +/* * globals */ extern kmutex_t sdev_lock; @@ -527,6 +547,7 @@ extern struct vnodeops *devipnet_vnodeops; extern struct vnodeops *devvt_vnodeops; extern struct sdev_data *sdev_origins; /* mount info for global /dev instance */ extern struct vnodeops *devzvol_vnodeops; +extern int sdev_vnodeops_tbl_size; extern const fs_operation_def_t sdev_vnodeops_tbl[]; extern const fs_operation_def_t devpts_vnodeops_tbl[]; diff --git a/usr/src/uts/common/sys/fs/sdev_plugin.h b/usr/src/uts/common/sys/fs/sdev_plugin.h new file mode 100644 index 0000000000..f4ed813c1e --- /dev/null +++ b/usr/src/uts/common/sys/fs/sdev_plugin.h @@ -0,0 +1,106 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2018, Joyent, Inc. + */ + +#ifndef _SYS_SDEV_PLUGIN_H +#define _SYS_SDEV_PLUGIN_H + +/* + * Kernel sdev plugin interface + */ + +#ifdef _KERNEL + +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/vnode.h> + +#endif /* _KERNEL */ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +typedef uintptr_t sdev_plugin_hdl_t; +typedef uintptr_t sdev_ctx_t; + +/* + * Valid return values for sdev_plugin_validate_t. 
+ */ +typedef enum sdev_plugin_validate { + SDEV_VTOR_INVALID = -1, + SDEV_VTOR_SKIP = 0, + SDEV_VTOR_VALID = 1, + SDEV_VTOR_STALE = 2 +} sdev_plugin_validate_t; + +/* + * Valid flags + */ +typedef enum sdev_plugin_flags { + SDEV_PLUGIN_NO_NCACHE = 0x1, + SDEV_PLUGIN_SUBDIR = 0x2 +} sdev_plugin_flags_t; + +#define SDEV_PLUGIN_FLAGS_MASK 0x3 + +/* + * Functions a module must implement + */ +typedef sdev_plugin_validate_t (*sp_valid_f)(sdev_ctx_t); +typedef int (*sp_filldir_f)(sdev_ctx_t); +typedef void (*sp_inactive_f)(sdev_ctx_t); + +#define SDEV_PLUGIN_VERSION 1 + +typedef struct sdev_plugin_ops { + int spo_version; + sdev_plugin_flags_t spo_flags; + sp_valid_f spo_validate; + sp_filldir_f spo_filldir; + sp_inactive_f spo_inactive; +} sdev_plugin_ops_t; + +extern sdev_plugin_hdl_t sdev_plugin_register(const char *, sdev_plugin_ops_t *, + int *); +extern int sdev_plugin_unregister(sdev_plugin_hdl_t); + +typedef enum sdev_ctx_flags { + SDEV_CTX_GLOBAL = 0x2 /* node belongs to the GZ */ +} sdev_ctx_flags_t; + +/* + * Context helper functions + */ +extern sdev_ctx_flags_t sdev_ctx_flags(sdev_ctx_t); +extern const char *sdev_ctx_name(sdev_ctx_t); +extern const char *sdev_ctx_path(sdev_ctx_t); +extern int sdev_ctx_minor(sdev_ctx_t, minor_t *); +extern enum vtype sdev_ctx_vtype(sdev_ctx_t); + +/* + * Callbacks to manipulate nodes + */ +extern int sdev_plugin_mkdir(sdev_ctx_t, char *); +extern int sdev_plugin_mknod(sdev_ctx_t, char *, mode_t, dev_t); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SDEV_PLUGIN_H */ diff --git a/usr/src/uts/common/sys/fs/tmp.h b/usr/src/uts/common/sys/fs/tmp.h index fb07de6588..f4cee09244 100644 --- a/usr/src/uts/common/sys/fs/tmp.h +++ b/usr/src/uts/common/sys/fs/tmp.h @@ -23,7 +23,7 @@ * All rights reserved. Use is subject to license terms. */ /* - * Copyright 2015 Joyent, Inc. + * Copyright 2016 Joyent, Inc. 
*/ #ifndef _SYS_FS_TMP_H @@ -43,8 +43,10 @@ struct tmount { struct vfs *tm_vfsp; /* filesystem's vfs struct */ struct tmpnode *tm_rootnode; /* root tmpnode */ char *tm_mntpath; /* name of tmpfs mount point */ - ulong_t tm_anonmax; /* file system max anon reservation */ - pgcnt_t tm_anonmem; /* pages of reserved anon memory */ + size_t tm_anonmax; /* file system max anon reservation */ + size_t tm_anonmem; /* bytes of reserved anon memory */ + /* and allocated kmem for the fs */ + size_t tm_allocmem; /* bytes alloced from tmp_kmem_ funcs */ dev_t tm_dev; /* unique dev # of mounted `device' */ uint_t tm_gen; /* pseudo generation number for files */ kmutex_t tm_contents; /* lock for tmount structure */ @@ -58,6 +60,7 @@ struct tmount { #define VTOTM(vp) ((struct tmount *)(vp)->v_vfsp->vfs_data) #define VTOTN(vp) ((struct tmpnode *)(vp)->v_data) #define TNTOV(tp) ((tp)->tn_vnode) +#define TNTOTM(tp) (VTOTM(TNTOV(tp))) #define tmpnode_hold(tp) VN_HOLD(TNTOV(tp)) #define tmpnode_rele(tp) VN_RELE(TNTOV(tp)) @@ -69,41 +72,39 @@ enum dr_op { DR_REMOVE, DR_RMDIR, DR_RENAME }; /* dirremove ops */ /* * tmpfs_minfree is the amount (in pages) of anonymous memory that tmpfs - * leaves free for the rest of the system. E.g. in a system with 32MB of - * configured swap space, if 16MB were reserved (leaving 16MB free), - * tmpfs could allocate up to 16MB - tmpfs_minfree. The default value - * for tmpfs_minfree is btopr(TMPMINFREE) but it can cautiously patched - * to a different number of pages. - * NB: If tmpfs allocates too much swap space, other processes will be - * unable to execute. + * leaves free for the rest of the system. 
In antiquity, this number could be + * relevant on a system-wide basis, as physical DRAM was routinely exhausted; + * however, in more modern times, the relative growth of DRAM with respect to + * application footprint means that this number is only likely to become a + * factor in a virtualized OS environment (e.g., a zone) -- and even then only + * when DRAM and swap have both been capped low to allow for maximum tenancy. + * TMPMINFREE -- the value from which tmpfs_minfree is derived -- should + * therefore be configured to a value that is roughly the smallest practical + * value for memory + swap minus the largest reasonable size for tmpfs in such + * a configuration. As of this writing, the smallest practical memory + swap + * configuration is 128MB, and it seems reasonable to allow tmpfs to consume + * no more than seven-eighths of this, yielding a TMPMINFREE of 16MB. Care + * should be exercised in changing this: tuning this value too high will + * result in spurious ENOSPC errors in tmpfs in small zones (a problem that + * can induce cascading failure surprisingly often); tuning this value too low + * will result in tmpfs consumption alone inducing application-level + * memory allocation failure. */ -#define TMPMINFREE 2 * 1024 * 1024 /* 2 Megabytes */ +#define TMPMINFREE 16 * 1024 * 1024 /* 16 Megabytes */ extern size_t tmpfs_minfree; /* Anonymous memory in pages */ -/* - * tmpfs can allocate only a certain percentage of kernel memory, - * which is used for tmpnodes, directories, file names, etc. - * This is statically set as TMPMAXFRACKMEM of physical memory. - * The actual number of allocatable bytes can be patched in tmpfs_maxkmem. 
- */ -#define TMPMAXFRACKMEM 25 /* 1/25 of physical memory */ - -extern size_t tmp_kmemspace; -extern size_t tmpfs_maxkmem; /* Allocatable kernel memory in bytes */ - extern void tmpnode_init(struct tmount *, struct tmpnode *, struct vattr *, struct cred *); +extern void tmpnode_cleanup(struct tmpnode *tp); extern int tmpnode_trunc(struct tmount *, struct tmpnode *, ulong_t); extern void tmpnode_growmap(struct tmpnode *, ulong_t); extern int tdirlookup(struct tmpnode *, char *, struct tmpnode **, struct cred *); extern int tdirdelete(struct tmpnode *, struct tmpnode *, char *, enum dr_op, struct cred *); -extern void tdirinit(struct tmpnode *, struct tmpnode *); +extern int tdirinit(struct tmpnode *, struct tmpnode *); extern void tdirtrunc(struct tmpnode *); -extern void *tmp_memalloc(size_t, int); -extern void tmp_memfree(void *, size_t); extern int tmp_resv(struct tmount *, struct tmpnode *, size_t, int); extern int tmp_taccess(void *, int, struct cred *); extern int tmp_sticky_remove_access(struct tmpnode *, struct tmpnode *, @@ -114,6 +115,9 @@ extern int tdirenter(struct tmount *, struct tmpnode *, char *, enum de_op, struct tmpnode *, struct tmpnode *, struct vattr *, struct tmpnode **, struct cred *, caller_context_t *); +extern void *tmp_kmem_zalloc(struct tmount *, size_t, int); +extern void tmp_kmem_free(struct tmount *, void *, size_t); + #define TMP_MUSTHAVE 0x01 #ifdef __cplusplus diff --git a/usr/src/uts/common/sys/fss.h b/usr/src/uts/common/sys/fss.h index e73dd5c0e8..87d798d6c1 100644 --- a/usr/src/uts/common/sys/fss.h +++ b/usr/src/uts/common/sys/fss.h @@ -22,7 +22,7 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2012 Joyent, Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. */ #ifndef _SYS_FSS_H @@ -140,7 +140,7 @@ typedef struct fssproc { * than one cpu partition then it will have a few of these structures. 
*/ typedef struct fsszone { - struct zone *fssz_zone; /* ptr to our zone structure */ + struct zone *fssz_zone; /* ptr to our zone structure */ struct fsszone *fssz_next; /* next fsszone_t in fsspset_t */ struct fsszone *fssz_prev; /* prev fsszone_t in fsspset_t */ uint32_t fssz_shares; /* sum of all project shares */ @@ -160,7 +160,7 @@ typedef struct fsszone { /* * fss_flags */ -#define FSSKPRI 0x01 /* the thread is in kernel mode */ +/* Formerly: FSSKPRI 0x01 - the thread is in kernel mode */ #define FSSBACKQ 0x02 /* thread should be placed at the back of */ /* the dispatch queue if preempted */ #define FSSRESTORE 0x04 /* thread was not preempted, due to schedctl */ diff --git a/usr/src/uts/common/sys/fx.h b/usr/src/uts/common/sys/fx.h index 2d4e1aa7fb..4a48af52a1 100644 --- a/usr/src/uts/common/sys/fx.h +++ b/usr/src/uts/common/sys/fx.h @@ -21,13 +21,12 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. */ #ifndef _SYS_FX_H #define _SYS_FX_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/types.h> #include <sys/thread.h> #include <sys/ddi.h> @@ -145,7 +144,14 @@ typedef struct fxkparms { uint_t fx_cflags; } fxkparms_t; +/* + * control flags (kparms->fx_cflags). + */ +#define FX_DOUPRILIM 0x01 /* change user priority limit */ +#define FX_DOUPRI 0x02 /* change user priority */ +#define FX_DOTQ 0x04 /* change FX time quantum */ +#define FXMAXUPRI 60 /* maximum user priority setting */ /* * Interface for partner private code. This is not a public interface. diff --git a/usr/src/uts/common/sys/gsqueue.h b/usr/src/uts/common/sys/gsqueue.h new file mode 100644 index 0000000000..91ab46fc44 --- /dev/null +++ b/usr/src/uts/common/sys/gsqueue.h @@ -0,0 +1,59 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. 
+ * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#ifndef _SYS_GSQUEUE_H +#define _SYS_GSQUEUE_H + +/* + * Standard interfaces to serialization queues for everyone (except IP). + */ + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +typedef struct gsqueue gsqueue_t; +typedef struct gsqueue_set gsqueue_set_t; + +typedef void (*gsqueue_cb_f)(gsqueue_set_t *, gsqueue_t *, void *, boolean_t); +typedef void (*gsqueue_proc_f)(void *, mblk_t *, gsqueue_t *, void *); + +extern gsqueue_set_t *gsqueue_set_create(pri_t); +extern void gsqueue_set_destroy(gsqueue_set_t *); +extern gsqueue_t *gsqueue_set_get(gsqueue_set_t *, uint_t); + +extern uintptr_t gsqueue_set_cb_add(gsqueue_set_t *, gsqueue_cb_f, void *); +extern int gsqueue_set_cb_remove(gsqueue_set_t *, uintptr_t); + +#define GSQUEUE_FILL 0x0001 +#define GSQUEUE_NODRAIN 0x0002 +#define GSQUEUE_PROCESS 0x0004 + +extern void gsqueue_enter_one(gsqueue_t *, mblk_t *, gsqueue_proc_f, void *, + int, uint8_t); + +#define GSQUEUE_DEFAULT_PRIORITY MAXCLSYSPRI + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_GSQUEUE_H */ diff --git a/usr/src/uts/common/sys/hook_impl.h b/usr/src/uts/common/sys/hook_impl.h index d8a15f0fe5..f3337bbacf 100644 --- a/usr/src/uts/common/sys/hook_impl.h +++ b/usr/src/uts/common/sys/hook_impl.h @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018, Joyent, Inc. 
*/ /* @@ -171,7 +172,7 @@ typedef struct hook_family_int { cvwaitlock_t hfi_lock; SLIST_ENTRY(hook_family_int) hfi_entry; hook_event_int_head_t hfi_head; - hook_family_t hfi_family; + hook_family_t hfi_family; kstat_t *hfi_kstat; struct hook_stack *hfi_stack; hook_notify_head_t hfi_nhead; @@ -209,6 +210,7 @@ typedef struct hook_stack_head hook_stack_head_t; #define Hn_ARP "arp" #define Hn_IPV4 "inet" #define Hn_IPV6 "inet6" +#define Hn_VIONA "viona_inet" extern int hook_run(hook_family_int_t *, hook_event_token_t, hook_data_t); extern int hook_register(hook_family_int_t *, char *, hook_t *); diff --git a/usr/src/uts/common/sys/ia.h b/usr/src/uts/common/sys/ia.h index 26c1002134..567c121bb0 100644 --- a/usr/src/uts/common/sys/ia.h +++ b/usr/src/uts/common/sys/ia.h @@ -22,6 +22,7 @@ /* * Copyright (c) 1997-1998 by Sun Microsystems, Inc. * All rights reserved. + * Copyright 2019 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -85,7 +86,7 @@ typedef struct iaproc { /* flags */ -#define IAKPRI 0x01 /* thread at kernel mode priority */ +/* Formerly: IAKPRI 0x01 - thread at kernel mode priority */ #define IABACKQ 0x02 /* thread goes to back of disp q when preempted */ #define IASLEPT 0x04 /* thread had long-term suspend - give new slice */ diff --git a/usr/src/uts/common/sys/id_space.h b/usr/src/uts/common/sys/id_space.h index d56fcceb5a..46d25f207f 100644 --- a/usr/src/uts/common/sys/id_space.h +++ b/usr/src/uts/common/sys/id_space.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All Rights reserved. 
*/ #ifndef _ID_SPACE_H @@ -34,8 +35,6 @@ extern "C" { #include <sys/mutex.h> #include <sys/vmem.h> -#ifdef _KERNEL - typedef vmem_t id_space_t; id_space_t *id_space_create(const char *, id_t, id_t); @@ -48,8 +47,6 @@ id_t id_allocff_nosleep(id_space_t *); id_t id_alloc_specific_nosleep(id_space_t *, id_t); void id_free(id_space_t *, id_t); -#endif /* _KERNEL */ - #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/sys/inotify.h b/usr/src/uts/common/sys/inotify.h new file mode 100644 index 0000000000..8acc1a7280 --- /dev/null +++ b/usr/src/uts/common/sys/inotify.h @@ -0,0 +1,153 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014 Joyent, Inc. All rights reserved. + */ + +/* + * Header file to support the inotify facility. Note that this facility + * is designed to be binary compatible with the Linux inotify facility; values + * for constants here should therefore exactly match those found in Linux, and + * this facility shouldn't be extended independently of Linux. + */ + +#ifndef _SYS_INOTIFY_H +#define _SYS_INOTIFY_H + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Events that can be explicitly requested on any inotify watch. 
+ */ +#define IN_ACCESS 0x00000001 +#define IN_MODIFY 0x00000002 +#define IN_ATTRIB 0x00000004 +#define IN_CLOSE_WRITE 0x00000008 +#define IN_CLOSE_NOWRITE 0x00000010 +#define IN_OPEN 0x00000020 +#define IN_MOVED_FROM 0x00000040 +#define IN_MOVED_TO 0x00000080 +#define IN_CREATE 0x00000100 +#define IN_DELETE 0x00000200 +#define IN_DELETE_SELF 0x00000400 +#define IN_MOVE_SELF 0x00000800 + +/* + * Events that can be sent to an inotify watch -- requested or not. + */ +#define IN_UNMOUNT 0x00002000 +#define IN_Q_OVERFLOW 0x00004000 +#define IN_IGNORED 0x00008000 + +/* + * Flags that can modify an inotify event. + */ +#define IN_ONLYDIR 0x01000000 +#define IN_DONT_FOLLOW 0x02000000 +#define IN_EXCL_UNLINK 0x04000000 +#define IN_MASK_ADD 0x20000000 +#define IN_ISDIR 0x40000000 +#define IN_ONESHOT 0x80000000 + +/* + * Helpful constants. + */ +#define IN_CLOSE (IN_CLOSE_WRITE | IN_CLOSE_NOWRITE) +#define IN_MOVE (IN_MOVED_FROM | IN_MOVED_TO) +#define IN_ALL_EVENTS \ + (IN_ACCESS | IN_MODIFY | IN_ATTRIB | IN_CLOSE_WRITE | \ + IN_CLOSE_NOWRITE | IN_OPEN | IN_MOVED_FROM | IN_MOVED_TO | \ + IN_DELETE | IN_CREATE | IN_DELETE_SELF | IN_MOVE_SELF) + +#define IN_CHILD_EVENTS \ + (IN_ACCESS | IN_MODIFY | IN_ATTRIB | IN_CLOSE_WRITE | \ + IN_CLOSE_NOWRITE | IN_MODIFY | IN_OPEN) + +/* + * To assure binary compatibility with Linux, these values are fixed at their + * Linux equivalents, not their native ones. + */ +#define IN_CLOEXEC 02000000 /* LX_O_CLOEXEC */ +#define IN_NONBLOCK 04000 /* LX_O_NONBLOCK */ + +struct inotify_event { + int32_t wd; /* watch descriptor */ + uint32_t mask; /* mask of events */ + uint32_t cookie; /* event association cookie, if any */ + uint32_t len; /* size of name field */ + char name[]; /* optional NUL-terminated name */ +}; + +/* + * These ioctl values are specific to the native implementation; applications + * shouldn't be using them directly, and they should therefore be safe to + * change without breaking apps. 
+ */ +#define INOTIFYIOC (('i' << 24) | ('n' << 16) | ('y' << 8)) +#define INOTIFYIOC_ADD_WATCH (INOTIFYIOC | 1) /* add watch */ +#define INOTIFYIOC_RM_WATCH (INOTIFYIOC | 2) /* remove watch */ +#define INOTIFYIOC_ADD_CHILD (INOTIFYIOC | 3) /* add child watch */ +#define INOTIFYIOC_ACTIVATE (INOTIFYIOC | 4) /* activate watch */ + +#ifndef _LP64 +#ifndef _LITTLE_ENDIAN +#define INOTIFY_PTR(type, name) uint32_t name##pad; type *name +#else +#define INOTIFY_PTR(type, name) type *name; uint32_t name##pad +#endif +#else +#define INOTIFY_PTR(type, name) type *name +#endif + +typedef struct inotify_addwatch { + int inaw_fd; /* open fd for object */ + uint32_t inaw_mask; /* desired mask */ +} inotify_addwatch_t; + +typedef struct inotify_addchild { + INOTIFY_PTR(char, inac_name); /* pointer to name */ + int inac_fd; /* open fd for parent */ +} inotify_addchild_t; + +#ifndef _KERNEL + +extern int inotify_init(void); +extern int inotify_init1(int); +extern int inotify_add_watch(int, const char *, uint32_t); +extern int inotify_rm_watch(int, int); + +#else + +#define IN_UNMASKABLE \ + (IN_UNMOUNT | IN_Q_OVERFLOW | IN_IGNORED | IN_ISDIR) + +#define IN_MODIFIERS \ + (IN_EXCL_UNLINK | IN_ONESHOT) + +#define IN_FLAGS \ + (IN_ONLYDIR | IN_DONT_FOLLOW | IN_MASK_ADD) + +#define IN_REMOVAL (1ULL << 32) +#define INOTIFYMNRN_INOTIFY 0 +#define INOTIFYMNRN_CLONE 1 + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_INOTIFY_H */ diff --git a/usr/src/uts/common/sys/ipc_impl.h b/usr/src/uts/common/sys/ipc_impl.h index 0569c3e967..d7dc365c09 100644 --- a/usr/src/uts/common/sys/ipc_impl.h +++ b/usr/src/uts/common/sys/ipc_impl.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016, Joyent, Inc. 
*/ #ifndef _IPC_IMPL_H @@ -226,6 +227,7 @@ int ipc_commit_begin(ipc_service_t *, key_t, int, kipc_perm_t *); kmutex_t *ipc_commit_end(ipc_service_t *, kipc_perm_t *); void ipc_cleanup(ipc_service_t *, kipc_perm_t *); +void ipc_rmsvc(ipc_service_t *, kipc_perm_t *); int ipc_rmid(ipc_service_t *, int, cred_t *); int ipc_ids(ipc_service_t *, int *, uint_t, uint_t *); diff --git a/usr/src/uts/common/sys/ipd.h b/usr/src/uts/common/sys/ipd.h index bad74f8b81..f21c3fb5af 100644 --- a/usr/src/uts/common/sys/ipd.h +++ b/usr/src/uts/common/sys/ipd.h @@ -20,7 +20,7 @@ */ /* - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2018, Joyent, Inc. All rights reserved. */ /* @@ -35,7 +35,7 @@ extern "C" { #endif #define IPD_DEV_PATH "/dev/ipd" -#define IPD_MAX_DELAY 10000 /* 10 ms in us */ +#define IPD_MAX_DELAY 1000000 /* 1 second in microseconds */ typedef struct ipd_ioc_perturb { zoneid_t ipip_zoneid; diff --git a/usr/src/uts/common/sys/iso/signal_iso.h b/usr/src/uts/common/sys/iso/signal_iso.h index bf89ef0d33..0a76ee19a7 100644 --- a/usr/src/uts/common/sys/iso/signal_iso.h +++ b/usr/src/uts/common/sys/iso/signal_iso.h @@ -22,6 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015, Joyent, Inc. 
*/ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -95,7 +96,7 @@ extern "C" { /* insert new signals here, and move _SIGRTM* appropriately */ #define _SIGRTMIN 42 /* first (highest-priority) realtime signal */ -#define _SIGRTMAX 73 /* last (lowest-priority) realtime signal */ +#define _SIGRTMAX 74 /* last (lowest-priority) realtime signal */ extern long _sysconf(int); /* System Private interface to sysconf() */ #define SIGRTMIN ((int)_sysconf(_SC_SIGRT_MIN)) /* first realtime signal */ #define SIGRTMAX ((int)_sysconf(_SC_SIGRT_MAX)) /* last realtime signal */ diff --git a/usr/src/uts/common/sys/klwp.h b/usr/src/uts/common/sys/klwp.h index 41b70f6a6e..0ea1a396b9 100644 --- a/usr/src/uts/common/sys/klwp.h +++ b/usr/src/uts/common/sys/klwp.h @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. */ #ifndef _SYS_KLWP_H @@ -191,7 +191,14 @@ typedef struct _klwp { struct ct_template *lwp_ct_active[CTT_MAXTYPE]; /* active templates */ struct contract *lwp_ct_latest[CTT_MAXTYPE]; /* last created contract */ - void *lwp_brand; /* per-lwp brand data */ + /* + * Branding: + * lwp_brand - per-lwp brand data + * lwp_brand_syscall - brand syscall interposer + */ + void *lwp_brand; + int (*lwp_brand_syscall)(void); + struct psinfo *lwp_spymaster; /* if an agent LWP, our spymaster */ } klwp_t; diff --git a/usr/src/uts/common/sys/kobj.h b/usr/src/uts/common/sys/kobj.h index 2396ef4625..d52a54f6b7 100644 --- a/usr/src/uts/common/sys/kobj.h +++ b/usr/src/uts/common/sys/kobj.h @@ -24,6 +24,9 @@ * * Copyright 2017 RackTop Systems. */ +/* + * Copyright (c) 2017 Joyent, Inc. 
+ */ #ifndef _SYS_KOBJ_H #define _SYS_KOBJ_H @@ -47,6 +50,12 @@ struct module_list { struct module *mp; }; +typedef struct hotinline_desc { + char *hid_symname; /* symbol name */ + uintptr_t hid_instr_offset; /* offset of call in text */ + struct hotinline_desc *hid_next; /* next hotinline */ +} hotinline_desc_t; + typedef unsigned short symid_t; /* symbol table index */ typedef unsigned char *reloc_dest_t; @@ -99,6 +108,8 @@ struct module { caddr_t textwin; caddr_t textwin_base; + hotinline_desc_t *hi_calls; + sdt_probedesc_t *sdt_probes; size_t sdt_nprobes; char *sdt_tab; @@ -187,6 +198,7 @@ extern int kobj_read_file(struct _buf *, char *, unsigned, unsigned); extern int kobj_get_filesize(struct _buf *, uint64_t *size); extern uintptr_t kobj_getelfsym(char *, void *, int *); extern void kobj_set_ctf(struct module *, caddr_t data, size_t size); +extern void do_hotinlines(struct module *); extern int kobj_filbuf(struct _buf *); extern void kobj_sync(void); diff --git a/usr/src/uts/common/sys/ksocket.h b/usr/src/uts/common/sys/ksocket.h index 5d8827f1ae..d720caa631 100644 --- a/usr/src/uts/common/sys/ksocket.h +++ b/usr/src/uts/common/sys/ksocket.h @@ -21,6 +21,7 @@ /* * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. 
*/ #ifndef _SYS_KSOCKET_H_ @@ -122,6 +123,11 @@ extern int ksocket_close(ksocket_t, struct cred *); extern void ksocket_hold(ksocket_t); extern void ksocket_rele(ksocket_t); +typedef boolean_t (*ksocket_krecv_f)(ksocket_t, struct msgb *, size_t, int, + void *); +extern int ksocket_krecv_set(ksocket_t, ksocket_krecv_f, void *); +extern void ksocket_krecv_unblock(ksocket_t); + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/sys/limits.h b/usr/src/uts/common/sys/limits.h new file mode 100644 index 0000000000..88625d1829 --- /dev/null +++ b/usr/src/uts/common/sys/limits.h @@ -0,0 +1,32 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ +/* + * Copyright 2015 Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_LIMITS_H +#define _SYS_LIMITS_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define IOV_MAX 1024 + +#ifdef _KERNEL +#define IOV_MAX_STACK 16 /* max. IOV on-stack allocation */ +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_LIMITS_H */ diff --git a/usr/src/uts/common/sys/mac.h b/usr/src/uts/common/sys/mac.h index 0907d6deff..1d7ddf9648 100644 --- a/usr/src/uts/common/sys/mac.h +++ b/usr/src/uts/common/sys/mac.h @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2017, Joyent, Inc. + * Copyright 2018 Joyent, Inc. * Copyright (c) 2015 Garrett D'Amore <garrett@damore.org> */ @@ -101,6 +101,14 @@ typedef struct mac_propval_uint32_range_s { } mac_propval_uint32_range_t; /* + * Defines ranges which are a series of C style strings. 
+ */ +typedef struct mac_propval_str_range_s { + uint32_t mpur_nextbyte; + char mpur_data[1]; +} mac_propval_str_range_t; + +/* * Data type of property values. */ typedef enum { @@ -120,6 +128,7 @@ typedef struct mac_propval_range_s { mac_propval_type_t mpr_type; /* type of value */ union { mac_propval_uint32_range_t mpr_uint32[1]; + mac_propval_str_range_t mpr_str; } u; } mac_propval_range_t; @@ -614,6 +623,38 @@ typedef struct mactype_register_s { } mactype_register_t; /* + * Flags to describe the hardware emulation desired from a client when + * calling mac_hw_emul(). + * + * MAC_HWCKSUM_EMUL + * + * If an mblk is marked with HCK_* flags, then calculate those + * checksums and update the checksum flags. + * + * MAC_IPCKSUM_EMUL + * + * Like MAC_HWCKSUM_EMUL, except only calculate the IPv4 header + * checksum. We still update both the IPv4 and ULP checksum + * flags. + * + * MAC_LSO_EMUL + * + * If an mblk is marked with HW_LSO, then segment the LSO mblk + * into a new chain of mblks which reference the original data + * block. This flag DOES NOT imply MAC_HWCKSUM_EMUL. If the + * caller needs both then it must set both. + */ +typedef enum mac_emul { + MAC_HWCKSUM_EMUL = (1 << 0), + MAC_IPCKSUM_EMUL = (1 << 1), + MAC_LSO_EMUL = (1 << 2) +} mac_emul_t; + +#define MAC_HWCKSUM_EMULS (MAC_HWCKSUM_EMUL | MAC_IPCKSUM_EMUL) +#define MAC_ALL_EMULS (MAC_HWCKSUM_EMUL | MAC_IPCKSUM_EMUL | \ + MAC_LSO_EMUL) + +/* * Driver interface functions. */ extern int mac_open_by_linkid(datalink_id_t, diff --git a/usr/src/uts/common/sys/mac_client.h b/usr/src/uts/common/sys/mac_client.h index 0fc4939503..8fff314bfe 100644 --- a/usr/src/uts/common/sys/mac_client.h +++ b/usr/src/uts/common/sys/mac_client.h @@ -22,7 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2013 Joyent, Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. 
*/ /* @@ -88,6 +88,7 @@ typedef enum { } mac_client_promisc_type_t; /* flags passed to mac_unicast_add() */ + #define MAC_UNICAST_NODUPCHECK 0x0001 #define MAC_UNICAST_PRIMARY 0x0002 #define MAC_UNICAST_HW 0x0004 @@ -115,6 +116,7 @@ typedef enum { #define MAC_PROMISC_FLAGS_NO_PHYS 0x0002 #define MAC_PROMISC_FLAGS_VLAN_TAG_STRIP 0x0004 #define MAC_PROMISC_FLAGS_NO_COPY 0x0008 +#define MAC_PROMISC_FLAGS_DO_FIXUPS 0x0010 /* flags passed to mac_tx() */ #define MAC_DROP_ON_NO_DESC 0x01 /* freemsg() if no tx descs */ @@ -136,6 +138,7 @@ extern void mac_multicast_remove(mac_client_handle_t, const uint8_t *); extern void mac_rx_set(mac_client_handle_t, mac_rx_t, void *); extern void mac_rx_clear(mac_client_handle_t); +extern void mac_rx_barrier(mac_client_handle_t); extern void mac_secondary_dup(mac_client_handle_t, mac_client_handle_t); extern void mac_secondary_cleanup(mac_client_handle_t); extern mac_tx_cookie_t mac_tx(mac_client_handle_t, mblk_t *, @@ -198,6 +201,8 @@ extern int mac_set_mtu(mac_handle_t, uint_t, uint_t *); extern void mac_client_set_rings(mac_client_handle_t, int, int); +extern void mac_hw_emul(mblk_t **, mblk_t **, uint_t *, mac_emul_t); + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/sys/mac_client_impl.h b/usr/src/uts/common/sys/mac_client_impl.h index 9b3b4fe369..21641b884d 100644 --- a/usr/src/uts/common/sys/mac_client_impl.h +++ b/usr/src/uts/common/sys/mac_client_impl.h @@ -24,7 +24,7 @@ * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2018 Joyent, Inc. 
*/ #ifndef _SYS_MAC_CLIENT_IMPL_H @@ -57,7 +57,7 @@ typedef struct mac_unicast_impl_s { /* Protected by */ uint16_t mui_vid; /* SL */ } mac_unicast_impl_t; -#define MAC_CLIENT_FLAGS_PRIMARY 0X0001 +#define MAC_CLIENT_FLAGS_PRIMARY 0x0001 #define MAC_CLIENT_FLAGS_VNIC_PRIMARY 0x0002 #define MAC_CLIENT_FLAGS_MULTI_PRIMARY 0x0004 #define MAC_CLIENT_FLAGS_PASSIVE_PRIMARY 0x0008 @@ -83,6 +83,7 @@ typedef struct mac_promisc_impl_s { /* Protected by */ boolean_t mpi_no_phys; /* WO */ boolean_t mpi_strip_vlan_tag; /* WO */ boolean_t mpi_no_copy; /* WO */ + boolean_t mpi_do_fixups; /* WO */ } mac_promisc_impl_t; typedef union mac_tx_percpu_s { @@ -131,12 +132,17 @@ struct mac_client_impl_s { /* Protected by */ uint32_t mci_flags; /* SL */ krwlock_t mci_rw_lock; mac_unicast_impl_t *mci_unicast_list; /* mci_rw_lock */ + /* * The mac_client_impl_t may be shared by multiple clients, i.e * multiple VLANs sharing the same MAC client. In this case the - * address/vid tubles differ and are each associated with their + * address/vid tuples differ and are each associated with their * own flow entry, but the rest underlying components SRS, etc, * are common. + * + * This is only needed to support sun4v vsw. There are several + * places in MAC we could simplify the code if we removed + * sun4v support. */ flow_entry_t *mci_flent_list; /* mci_rw_lock */ uint_t mci_nflents; /* mci_rw_lock */ @@ -313,6 +319,74 @@ extern int mac_tx_percpu_cnt; (((mcip)->mci_state_flags & MCIS_TAG_DISABLE) == 0 && \ (mcip)->mci_nvids == 1) \ +/* + * MAC Client Implementation State (mci_state_flags) + * + * MCIS_IS_VNIC + * + * The client is a VNIC. + * + * MCIS_EXCLUSIVE + * + * The client has exclusive control over the MAC, such that it is + * the sole client of the MAC. + * + * MCIS_TAG_DISABLE + * + * MAC will not add VLAN tags to outgoing traffic. If this flag + * is set it is up to the client to add the correct VLAN tag. 
+ * + * MCIS_STRIP_DISABLE + * + * MAC will not strip the VLAN tags on incoming traffic before + * passing it to mci_rx_fn. This only applies to non-bypass + * traffic. + * + * MCIS_IS_AGGR_PORT + * + * The client represents a port on an aggr. + * + * MCIS_CLIENT_POLL_CAPABLE + * + * The client is capable of polling the Rx TCP/UDP softrings. + * + * MCIS_DESC_LOGGED + * + * This flag is set when the client's link info has been logged + * by the mac_log_linkinfo() timer. This ensures that the + * client's link info is only logged once. + * + * MCIS_SHARE_BOUND + * + * This client has an HIO share bound to it. + * + * MCIS_DISABLE_TX_VID_CHECK + * + * MAC will not check the VID of the client's Tx traffic. + * + * MCIS_USE_DATALINK_NAME + * + * The client is using the same name as its underlying MAC. This + * happens when dlmgmtd is unreachable during client creation. + * + * MCIS_UNICAST_HW + * + * The client requires MAC address hardware classification. This + * is only used by sun4v vsw. + * + * MCIS_IS_AGGR_CLIENT + * + * The client sits atop an aggr. + * + * MCIS_RX_BYPASS_DISABLE + * + * Do not allow the client to enable DLS bypass. + * + * MCIS_NO_UNICAST_ADDR + * + * This client has no MAC unicast address associated with it. 
+ * + */ /* MCI state flags */ #define MCIS_IS_VNIC 0x0001 #define MCIS_EXCLUSIVE 0x0002 @@ -325,7 +399,7 @@ extern int mac_tx_percpu_cnt; #define MCIS_DISABLE_TX_VID_CHECK 0x0100 #define MCIS_USE_DATALINK_NAME 0x0200 #define MCIS_UNICAST_HW 0x0400 -#define MCIS_IS_AGGR 0x0800 +#define MCIS_IS_AGGR_CLIENT 0x0800 #define MCIS_RX_BYPASS_DISABLE 0x1000 #define MCIS_NO_UNICAST_ADDR 0x2000 @@ -337,8 +411,8 @@ extern int mac_tx_percpu_cnt; extern void mac_promisc_client_dispatch(mac_client_impl_t *, mblk_t *); extern void mac_client_init(void); extern void mac_client_fini(void); -extern void mac_promisc_dispatch(mac_impl_t *, mblk_t *, - mac_client_impl_t *); +extern void mac_promisc_dispatch(mac_impl_t *, mblk_t *, mac_client_impl_t *, + boolean_t); extern int mac_validate_props(mac_impl_t *, mac_resource_props_t *); diff --git a/usr/src/uts/common/sys/mac_client_priv.h b/usr/src/uts/common/sys/mac_client_priv.h index 6b409513a6..97b3fd685a 100644 --- a/usr/src/uts/common/sys/mac_client_priv.h +++ b/usr/src/uts/common/sys/mac_client_priv.h @@ -22,7 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2013 Joyent, Inc. All rights reserved. + * Copyright 2018 Joyent, Inc. 
*/ /* @@ -58,6 +58,9 @@ extern const mac_info_t *mac_info(mac_handle_t); extern boolean_t mac_info_get(const char *, mac_info_t *); extern boolean_t mac_promisc_get(mac_handle_t); +extern boolean_t mac_protect_check_addr(mac_client_handle_t, boolean_t, + in6_addr_t *); + extern int mac_start(mac_handle_t); extern void mac_stop(mac_handle_t); @@ -121,9 +124,17 @@ extern void mac_tx_client_quiesce(mac_client_handle_t); extern void mac_tx_client_condemn(mac_client_handle_t); extern void mac_tx_client_restart(mac_client_handle_t); extern void mac_srs_perm_quiesce(mac_client_handle_t, boolean_t); +extern uint_t mac_hwrings_idx_get(mac_handle_t, uint_t, mac_group_handle_t *, + mac_ring_handle_t *, mac_ring_type_t); extern int mac_hwrings_get(mac_client_handle_t, mac_group_handle_t *, mac_ring_handle_t *, mac_ring_type_t); extern uint_t mac_hwring_getinfo(mac_ring_handle_t); +extern void mac_hwring_set_passthru(mac_ring_handle_t, mac_rx_t, void *, + mac_resource_handle_t); +extern void mac_hwring_clear_passthru(mac_ring_handle_t); +extern void mac_client_set_flow_cb(mac_client_handle_t, mac_rx_t, void *); +extern void mac_client_clear_flow_cb(mac_client_handle_t); + extern void mac_hwring_setup(mac_ring_handle_t, mac_resource_handle_t, mac_ring_handle_t); extern void mac_hwring_teardown(mac_ring_handle_t); @@ -131,6 +142,8 @@ extern int mac_hwring_disable_intr(mac_ring_handle_t); extern int mac_hwring_enable_intr(mac_ring_handle_t); extern int mac_hwring_start(mac_ring_handle_t); extern void mac_hwring_stop(mac_ring_handle_t); +extern int mac_hwring_activate(mac_ring_handle_t); +extern void mac_hwring_quiesce(mac_ring_handle_t); extern mblk_t *mac_hwring_poll(mac_ring_handle_t, int); extern mblk_t *mac_hwring_tx(mac_ring_handle_t, mblk_t *); extern int mac_hwring_getstat(mac_ring_handle_t, uint_t, uint64_t *); @@ -144,6 +157,13 @@ extern void mac_hwring_set_default(mac_handle_t, mac_ring_handle_t); extern int mac_hwgroup_addmac(mac_group_handle_t, const uint8_t *); extern 
int mac_hwgroup_remmac(mac_group_handle_t, const uint8_t *); +extern int mac_hwgroup_addvlan(mac_group_handle_t, uint16_t); +extern int mac_hwgroup_remvlan(mac_group_handle_t, uint16_t); + +extern boolean_t mac_has_hw_vlan(mac_handle_t); + +extern uint_t mac_get_num_rx_groups(mac_handle_t); +extern int mac_set_promisc(mac_handle_t, boolean_t); extern void mac_set_upper_mac(mac_client_handle_t, mac_handle_t, mac_resource_props_t *); @@ -171,6 +191,7 @@ extern void mac_client_set_intr_cpu(void *, mac_client_handle_t, int32_t); extern void *mac_get_devinfo(mac_handle_t); extern boolean_t mac_is_vnic(mac_handle_t); +extern boolean_t mac_is_overlay(mac_handle_t); extern uint32_t mac_no_notification(mac_handle_t); extern int mac_set_prop(mac_handle_t, mac_prop_id_t, char *, void *, uint_t); diff --git a/usr/src/uts/common/sys/mac_flow.h b/usr/src/uts/common/sys/mac_flow.h index e290ba7dbe..d37752ec23 100644 --- a/usr/src/uts/common/sys/mac_flow.h +++ b/usr/src/uts/common/sys/mac_flow.h @@ -22,7 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2013 Joyent, Inc. All rights reserved. + * Copyright 2017 Joyent, Inc. All rights reserved. 
*/ #ifndef _MAC_FLOW_H @@ -155,6 +155,14 @@ typedef enum { #define MPT_MAXIPADDR MPT_MAXCNT #define MPT_MAXCID MPT_MAXCNT #define MPT_MAXCIDLEN 256 +#define MPT_FALSE 0x00000000 +#define MPT_TRUE 0x00000001 + +/* Dynamic address detection types */ +#define MPT_DYN_DHCPV4 0x00000001 +#define MPT_DYN_DHCPV6 0x00000002 +#define MPT_DYN_SLAAC 0x00000004 +#define MPT_DYN_ALL 0x00000007 typedef struct mac_ipaddr_s { uint32_t ip_version; @@ -175,11 +183,13 @@ typedef struct mac_dhcpcid_s { } mac_dhcpcid_t; typedef struct mac_protect_s { - uint32_t mp_types; - uint32_t mp_ipaddrcnt; - mac_ipaddr_t mp_ipaddrs[MPT_MAXIPADDR]; - uint32_t mp_cidcnt; - mac_dhcpcid_t mp_cids[MPT_MAXCID]; + uint32_t mp_types; /* Enabled protection types */ + uint32_t mp_ipaddrcnt; /* Count of allowed IPs */ + mac_ipaddr_t mp_ipaddrs[MPT_MAXIPADDR]; /* Allowed IPs */ + uint32_t mp_cidcnt; /* Count of allowed DHCP CIDs */ + mac_dhcpcid_t mp_cids[MPT_MAXCID]; /* Allowed DHCP CIDs */ + uint32_t mp_allcids; /* Whether to allow all CIDs through */ + uint32_t mp_dynamic; /* Enabled dynamic address methods */ } mac_protect_t; /* The default priority for links */ diff --git a/usr/src/uts/common/sys/mac_impl.h b/usr/src/uts/common/sys/mac_impl.h index 774c4fad9a..21f2c10a8e 100644 --- a/usr/src/uts/common/sys/mac_impl.h +++ b/usr/src/uts/common/sys/mac_impl.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2017, Joyent, Inc. + * Copyright 2019 Joyent, Inc. 
*/ #ifndef _SYS_MAC_IMPL_H @@ -35,6 +35,7 @@ #include <net/if.h> #include <sys/mac_flow_impl.h> #include <netinet/ip6.h> +#include <sys/pattr.h> #ifdef __cplusplus extern "C" { @@ -108,6 +109,7 @@ typedef struct mac_cb_info_s { kcondvar_t mcbi_cv; uint_t mcbi_del_cnt; /* Deleted callback cnt */ uint_t mcbi_walker_cnt; /* List walker count */ + uint_t mcbi_barrier_cnt; /* Barrier waiter count */ } mac_cb_info_t; typedef struct mac_notify_cb_s { @@ -123,40 +125,18 @@ typedef struct mac_notify_cb_s { */ typedef boolean_t (*mcb_func_t)(mac_cb_info_t *, mac_cb_t **, mac_cb_t *); -#define MAC_CALLBACK_WALKER_INC(mcbi) { \ - mutex_enter((mcbi)->mcbi_lockp); \ - (mcbi)->mcbi_walker_cnt++; \ - mutex_exit((mcbi)->mcbi_lockp); \ -} +#define MAC_CALLBACK_WALKER_INC(mcbi) \ + mac_callback_walker_enter(mcbi) -#define MAC_CALLBACK_WALKER_INC_HELD(mcbi) (mcbi)->mcbi_walker_cnt++; - -#define MAC_CALLBACK_WALKER_DCR(mcbi, headp) { \ - mac_cb_t *rmlist; \ - \ - mutex_enter((mcbi)->mcbi_lockp); \ - if (--(mcbi)->mcbi_walker_cnt == 0 && (mcbi)->mcbi_del_cnt != 0) { \ - rmlist = mac_callback_walker_cleanup((mcbi), headp); \ - mac_callback_free(rmlist); \ - cv_broadcast(&(mcbi)->mcbi_cv); \ - } \ - mutex_exit((mcbi)->mcbi_lockp); \ -} +#define MAC_CALLBACK_WALKER_DCR(mcbi, headp) \ + mac_callback_walker_exit(mcbi, headp, B_FALSE) -#define MAC_PROMISC_WALKER_INC(mip) \ - MAC_CALLBACK_WALKER_INC(&(mip)->mi_promisc_cb_info) - -#define MAC_PROMISC_WALKER_DCR(mip) { \ - mac_cb_info_t *mcbi; \ - \ - mcbi = &(mip)->mi_promisc_cb_info; \ - mutex_enter(mcbi->mcbi_lockp); \ - if (--mcbi->mcbi_walker_cnt == 0 && mcbi->mcbi_del_cnt != 0) { \ - i_mac_promisc_walker_cleanup(mip); \ - cv_broadcast(&mcbi->mcbi_cv); \ - } \ - mutex_exit(mcbi->mcbi_lockp); \ -} +#define MAC_PROMISC_WALKER_INC(mip) \ + mac_callback_walker_enter(&(mip)->mi_promisc_cb_info) + +#define MAC_PROMISC_WALKER_DCR(mip) \ + mac_callback_walker_exit(&(mip)->mi_promisc_cb_info, \ + &(mip)->mi_promisc_list, B_TRUE) typedef struct 
mactype_s { const char *mt_ident; @@ -208,9 +188,18 @@ struct mac_ring_s { mac_ring_t *mr_next; /* next ring in the chain */ mac_group_handle_t mr_gh; /* reference to group */ - mac_classify_type_t mr_classify_type; /* HW vs SW */ + mac_classify_type_t mr_classify_type; struct mac_soft_ring_set_s *mr_srs; /* associated SRS */ - mac_ring_handle_t mr_prh; /* associated pseudo ring hdl */ + mac_ring_handle_t mr_prh; /* associated pseudo ring hdl */ + + /* + * Ring passthru callback and arguments. See the + * MAC_PASSTHRU_CLASSIFIER comment in mac_provider.h. + */ + mac_rx_t mr_pt_fn; + void *mr_pt_arg1; + mac_resource_handle_t mr_pt_arg2; + uint_t mr_refcnt; /* Ring references */ /* ring generation no. to guard against drivers using stale rings */ uint64_t mr_gen_num; @@ -244,7 +233,7 @@ struct mac_ring_s { (mr)->mr_refcnt++; \ } -#define MR_REFRELE(mr) { \ +#define MR_REFRELE(mr) { \ mutex_enter(&(mr)->mr_lock); \ ASSERT((mr)->mr_refcnt != 0); \ (mr)->mr_refcnt--; \ @@ -255,8 +244,8 @@ struct mac_ring_s { } /* - * Per mac client flow information associated with a RX group. - * The entire structure is SL protected. + * Used to attach MAC clients to an Rx group. The members are SL + * protected. */ typedef struct mac_grp_client { struct mac_grp_client *mgc_next; @@ -270,15 +259,20 @@ typedef struct mac_grp_client { ((g)->mrg_clients->mgc_next == NULL)) ? \ (g)->mrg_clients->mgc_client : NULL) +#define MAC_GROUP_HW_VLAN(g) \ + (((g) != NULL) && \ + ((g)->mrg_info.mgi_addvlan != NULL) && \ + ((g)->mrg_info.mgi_remvlan != NULL)) + /* * Common ring group data structure for ring control and management. - * The entire structure is SL protected + * The entire structure is SL protected. 
*/ struct mac_group_s { int mrg_index; /* index in the list */ mac_ring_type_t mrg_type; /* ring type */ mac_group_state_t mrg_state; /* state of the group */ - mac_group_t *mrg_next; /* next ring in the chain */ + mac_group_t *mrg_next; /* next group in the chain */ mac_handle_t mrg_mh; /* reference to MAC */ mac_ring_t *mrg_rings; /* grouped rings */ uint_t mrg_cur_count; /* actual size of group */ @@ -296,54 +290,6 @@ struct mac_group_s { #define GROUP_INTR_ENABLE_FUNC(g) (g)->mrg_info.mgi_intr.mi_enable #define GROUP_INTR_DISABLE_FUNC(g) (g)->mrg_info.mgi_intr.mi_disable -#define MAC_RING_TX(mhp, rh, mp, rest) { \ - mac_ring_handle_t mrh = rh; \ - mac_impl_t *mimpl = (mac_impl_t *)mhp; \ - /* \ - * Send packets through a selected tx ring, or through the \ - * default handler if there is no selected ring. \ - */ \ - if (mrh == NULL) \ - mrh = mimpl->mi_default_tx_ring; \ - if (mrh == NULL) { \ - rest = mimpl->mi_tx(mimpl->mi_driver, mp); \ - } else { \ - rest = mac_hwring_tx(mrh, mp); \ - } \ -} - -/* - * This is the final stop before reaching the underlying driver - * or aggregation, so this is where the bridging hook is implemented. - * Packets that are bridged will return through mac_bridge_tx(), with - * rh nulled out if the bridge chooses to send output on a different - * link due to forwarding. - */ -#define MAC_TX(mip, rh, mp, src_mcip) { \ - mac_ring_handle_t rhandle = (rh); \ - /* \ - * If there is a bound Hybrid I/O share, send packets through \ - * the default tx ring. (When there's a bound Hybrid I/O share, \ - * the tx rings of this client are mapped in the guest domain \ - * and not accessible from here.) \ - */ \ - _NOTE(CONSTANTCONDITION) \ - if ((src_mcip)->mci_state_flags & MCIS_SHARE_BOUND) \ - rhandle = (mip)->mi_default_tx_ring; \ - if (mip->mi_promisc_list != NULL) \ - mac_promisc_dispatch(mip, mp, src_mcip); \ - /* \ - * Grab the proper transmit pointer and handle. 
Special \ - * optimization: we can test mi_bridge_link itself atomically, \ - * and if that indicates no bridge send packets through tx ring.\ - */ \ - if (mip->mi_bridge_link == NULL) { \ - MAC_RING_TX(mip, rhandle, mp, mp); \ - } else { \ - mp = mac_bridge_tx(mip, rhandle, mp); \ - } \ -} - /* mci_tx_flag */ #define MCI_TX_QUIESCE 0x1 @@ -360,17 +306,23 @@ typedef struct mac_mcast_addrs_s { } mac_mcast_addrs_t; typedef enum { - MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED = 1, /* hardware steering */ + MAC_ADDRESS_TYPE_UNICAST_CLASSIFIED = 1, /* HW classification */ MAC_ADDRESS_TYPE_UNICAST_PROMISC /* promiscuous mode */ } mac_address_type_t; +typedef struct mac_vlan_s { + struct mac_vlan_s *mv_next; + uint16_t mv_vid; +} mac_vlan_t; + typedef struct mac_address_s { mac_address_type_t ma_type; /* address type */ - int ma_nusers; /* number of users */ - /* of that address */ + int ma_nusers; /* num users of addr */ struct mac_address_s *ma_next; /* next address */ uint8_t ma_addr[MAXMACADDRLEN]; /* address value */ size_t ma_len; /* address length */ + mac_vlan_t *ma_vlans; /* VLANs on this addr */ + boolean_t ma_untagged; /* accept untagged? */ mac_group_t *ma_group; /* asscociated group */ mac_impl_t *ma_mip; /* MAC handle */ } mac_address_t; @@ -486,8 +438,11 @@ struct mac_impl_s { mac_led_mode_t mi_led_modes; mac_capab_led_t mi_led; + /* Cache of the Tx DB_CKSUMFLAGS that this MAC supports. */ + uint16_t mi_tx_cksum_flags; /* SL */ + /* - * MAC address list. SL protected. + * MAC address and VLAN lists. SL protected. 
*/ mac_address_t *mi_addresses; @@ -654,6 +609,7 @@ struct mac_impl_s { #define MIS_LEGACY 0x0040 #define MIS_NO_ACTIVE 0x0080 #define MIS_POLL_DISABLE 0x0100 +#define MIS_IS_OVERLAY 0x0200 #define mi_getstat mi_callbacks->mc_getstat #define mi_start mi_callbacks->mc_start @@ -722,23 +678,38 @@ typedef struct mac_client_impl_s mac_client_impl_t; extern void mac_init(void); extern int mac_fini(void); +/* + * MAC packet/chain drop functions to aggregate all dropped-packet + * debugging to a single surface. + */ +/*PRINTFLIKE2*/ +extern void mac_drop_pkt(mblk_t *, const char *, ...) + __KPRINTFLIKE(2); + +/*PRINTFLIKE2*/ +extern void mac_drop_chain(mblk_t *, const char *, ...) + __KPRINTFLIKE(2); + extern void mac_ndd_ioctl(mac_impl_t *, queue_t *, mblk_t *); extern boolean_t mac_ip_hdr_length_v6(ip6_t *, uint8_t *, uint16_t *, uint8_t *, ip6_frag_t **); extern mblk_t *mac_copymsgchain_cksum(mblk_t *); -extern mblk_t *mac_fix_cksum(mblk_t *); extern void mac_packet_print(mac_handle_t, mblk_t *); extern void mac_rx_deliver(void *, mac_resource_handle_t, mblk_t *, mac_header_info_t *); extern void mac_tx_notify(mac_impl_t *); - -extern boolean_t mac_callback_find(mac_cb_info_t *, mac_cb_t **, mac_cb_t *); -extern void mac_callback_add(mac_cb_info_t *, mac_cb_t **, mac_cb_t *); -extern boolean_t mac_callback_remove(mac_cb_info_t *, mac_cb_t **, mac_cb_t *); -extern void mac_callback_remove_wait(mac_cb_info_t *); -extern void mac_callback_free(mac_cb_t *); -extern mac_cb_t *mac_callback_walker_cleanup(mac_cb_info_t *, mac_cb_t **); +extern mblk_t *mac_ring_tx(mac_handle_t, mac_ring_handle_t, mblk_t *); +extern mblk_t *mac_provider_tx(mac_impl_t *, mac_ring_handle_t, mblk_t *, + mac_client_impl_t *); + +extern void mac_callback_add(mac_cb_info_t *, mac_cb_t **, mac_cb_t *); +extern boolean_t mac_callback_remove(mac_cb_info_t *, mac_cb_t **, mac_cb_t *); +extern void mac_callback_remove_wait(mac_cb_info_t *); +extern void mac_callback_barrier(mac_cb_info_t *); +extern void 
mac_callback_free(mac_cb_t *); +extern void mac_callback_walker_enter(mac_cb_info_t *); +extern void mac_callback_walker_exit(mac_cb_info_t *, mac_cb_t **, boolean_t); /* in mac_bcast.c */ extern void mac_bcast_init(void); @@ -759,6 +730,8 @@ extern void mac_client_bcast_refresh(mac_client_impl_t *, mac_multicst_t, */ extern int mac_group_addmac(mac_group_t *, const uint8_t *); extern int mac_group_remmac(mac_group_t *, const uint8_t *); +extern int mac_group_addvlan(mac_group_t *, uint16_t); +extern int mac_group_remvlan(mac_group_t *, uint16_t); extern int mac_rx_group_add_flow(mac_client_impl_t *, flow_entry_t *, mac_group_t *); extern mblk_t *mac_hwring_tx(mac_ring_handle_t, mblk_t *); @@ -779,6 +752,7 @@ extern void mac_rx_switch_grp_to_sw(mac_group_t *); * MAC address functions are used internally by MAC layer. */ extern mac_address_t *mac_find_macaddr(mac_impl_t *, uint8_t *); +extern mac_address_t *mac_find_macaddr_vlan(mac_impl_t *, uint8_t *, uint16_t); extern boolean_t mac_check_macaddr_shared(mac_address_t *); extern int mac_update_macaddr(mac_address_t *, uint8_t *); extern void mac_freshen_macaddr(mac_address_t *, uint8_t *); @@ -829,7 +803,7 @@ extern void mac_flow_set_name(flow_entry_t *, const char *); extern mblk_t *mac_add_vlan_tag(mblk_t *, uint_t, uint16_t); extern mblk_t *mac_add_vlan_tag_chain(mblk_t *, uint_t, uint16_t); extern mblk_t *mac_strip_vlan_tag_chain(mblk_t *); -extern void mac_pkt_drop(void *, mac_resource_handle_t, mblk_t *, boolean_t); +extern void mac_rx_def(void *, mac_resource_handle_t, mblk_t *, boolean_t); extern mblk_t *mac_rx_flow(mac_handle_t, mac_resource_handle_t, mblk_t *); extern void i_mac_share_alloc(mac_client_impl_t *); @@ -849,7 +823,6 @@ extern void mac_tx_client_block(mac_client_impl_t *); extern void mac_tx_client_unblock(mac_client_impl_t *); extern void mac_tx_invoke_callbacks(mac_client_impl_t *, mac_tx_cookie_t); extern int i_mac_promisc_set(mac_impl_t *, boolean_t); -extern void 
i_mac_promisc_walker_cleanup(mac_impl_t *); extern mactype_t *mactype_getplugin(const char *); extern void mac_addr_factory_init(mac_impl_t *); extern void mac_addr_factory_fini(mac_impl_t *); @@ -863,8 +836,9 @@ extern int mac_start_group(mac_group_t *); extern void mac_stop_group(mac_group_t *); extern int mac_start_ring(mac_ring_t *); extern void mac_stop_ring(mac_ring_t *); -extern int mac_add_macaddr(mac_impl_t *, mac_group_t *, uint8_t *, boolean_t); -extern int mac_remove_macaddr(mac_address_t *); +extern int mac_add_macaddr_vlan(mac_impl_t *, mac_group_t *, uint8_t *, + uint16_t, boolean_t); +extern int mac_remove_macaddr_vlan(mac_address_t *, uint16_t); extern void mac_set_group_state(mac_group_t *, mac_group_state_t); extern void mac_group_add_client(mac_group_t *, mac_client_impl_t *); diff --git a/usr/src/uts/common/sys/mac_provider.h b/usr/src/uts/common/sys/mac_provider.h index 4c91c03967..2dea3a4758 100644 --- a/usr/src/uts/common/sys/mac_provider.h +++ b/usr/src/uts/common/sys/mac_provider.h @@ -21,7 +21,7 @@ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2017, Joyent, Inc. + * Copyright (c) 2018, Joyent, Inc. */ #ifndef _SYS_MAC_PROVIDER_H @@ -108,6 +108,7 @@ typedef enum { MAC_CAPAB_NO_ZCOPY = 0x00100000, /* boolean only, no data */ MAC_CAPAB_LEGACY = 0x00200000, /* data is mac_capab_legacy_t */ MAC_CAPAB_VRRP = 0x00400000, /* data is mac_capab_vrrp_t */ + MAC_CAPAB_OVERLAY = 0x00800000, /* boolean only, no data */ MAC_CAPAB_TRANSCEIVER = 0x01000000, /* mac_capab_transciever_t */ MAC_CAPAB_LED = 0x02000000 /* data is mac_capab_led_t */ } mac_capab_t; @@ -242,16 +243,59 @@ typedef struct mac_callbacks_s { /* * Virtualization Capabilities */ + /* - * The ordering of entries below is important. MAC_HW_CLASSIFIER - * is the cutoff below which are entries which don't depend on - * H/W. MAC_HW_CLASSIFIER and entries after that are cases where - * H/W has been updated through add/modify/delete APIs. 
+ * The type of ring classification. This is used by MAC to determine + * what, if any, processing it has to do upon receiving traffic on a + * particular Rx ring. + * + * MAC_NO_CLASSIFIER + * + * No classification has been set. No traffic should cross an Rx + * ring in this state. + * + * MAC_SW_CLASSIFIER + * + * The driver delivers traffic for multiple clients to this ring. + * All traffic must be software classified by MAC to guarantee + * delivery to the correct client. This classification type may + * be chosen for several reasons. + * + * o The driver provides only one group and there are multiple + * clients using the MAC. + * + * o The driver provides some hardware filtering but not enough + * to fully classify the traffic. E.g., a VLAN VNIC requires L2 + * unicast address filtering as well as VLAN filtering, but + * some drivers may only support the former. + * + * o The ring belongs to the default group. The default group + * acts as a spillover for all clients that can't reserve an + * exclusive group. It also handles multicast traffic for all + * clients. For these reasons, the default group's rings are + * always software classified. + * + * MAC_HW_CLASSIFIER + * + * The driver delivers traffic for a single MAC client across + * this ring. With this guarantee, MAC can simply pass the + * traffic up the stack or even allow polling of the ring. + * + * MAC_PASSTHRU_CLASSIFIER + * + * The ring is in "passthru" mode. In this mode we bypass all of + * the typical MAC processing and pass the traffic directly to + * the mr_pt_fn callback, see mac_rx_common(). This is used in + * cases where there is another module acting as MAC provider on + * behalf of the driver. E.g., link aggregations use this mode to + * take full control of the port's rings; allowing it to enforce + * LACP protocols and aggregate rings across discrete drivers. 
*/ typedef enum { MAC_NO_CLASSIFIER = 0, MAC_SW_CLASSIFIER, - MAC_HW_CLASSIFIER + MAC_HW_CLASSIFIER, + MAC_PASSTHRU_CLASSIFIER } mac_classify_type_t; typedef void (*mac_rx_func_t)(void *, mac_resource_handle_t, mblk_t *, @@ -281,6 +325,28 @@ typedef enum { } mac_ring_type_t; /* + * The value VLAN_ID_NONE (VID 0) means a client does not have + * membership to any VLAN. However, this statement is true for both + * untagged packets and priority tagged packets leading to confusion + * over what semantic is intended. To the provider, VID 0 is a valid + * VID when priority tagging is in play. To MAC and everything above + * VLAN_ID_NONE almost universally implies untagged traffic. Thus, we + * convert VLAN_ID_NONE to a sentinel value (MAC_VLAN_UNTAGGED) at the + * border between MAC and MAC provider. This informs the provider that + * the client is interested in untagged traffic and the provider + * should set any relevant bits to receive such traffic. + * + * Currently, the API between MAC and the provider passes the VID as a + * uint16_t. In the future this could actually be the entire TCI mask + * (PCP, DEI, and VID). This current scheme is safe in that potential + * future world as well; as 0xFFFF is not a valid TCI (the 0xFFF VID + * is reserved and never transmitted across networks). + */ +#define MAC_VLAN_UNTAGGED UINT16_MAX +#define MAC_VLAN_UNTAGGED_VID(vid) \ + (((vid) == VLAN_ID_NONE) ? MAC_VLAN_UNTAGGED : (vid)) + +/* * Grouping type of a ring group * * MAC_GROUP_TYPE_STATIC: The ring group can not be re-grouped. @@ -342,6 +408,7 @@ typedef struct mac_ring_info_s { mac_ring_poll_t poll; } mrfunion; mac_ring_stat_t mri_stat; + /* * mri_flags will have some bits set to indicate some special * property/feature of a ring like serialization needed for a @@ -358,6 +425,8 @@ typedef struct mac_ring_info_s { * #defines for mri_flags. 
The flags are temporary flags that are provided * only to workaround issues in specific drivers, and they will be * removed in the future. + * + * These are consumed only by sun4v and neptune (nxge). */ #define MAC_RING_TX_SERIALIZE 0x1 #define MAC_RING_RX_ENQUEUE 0x2 @@ -366,6 +435,8 @@ typedef int (*mac_group_start_t)(mac_group_driver_t); typedef void (*mac_group_stop_t)(mac_group_driver_t); typedef int (*mac_add_mac_addr_t)(void *, const uint8_t *); typedef int (*mac_rem_mac_addr_t)(void *, const uint8_t *); +typedef int (*mac_add_vlan_filter_t)(mac_group_driver_t, uint16_t); +typedef int (*mac_rem_vlan_filter_t)(mac_group_driver_t, uint16_t); struct mac_group_info_s { mac_group_driver_t mgi_driver; /* Driver reference */ @@ -374,9 +445,11 @@ struct mac_group_info_s { uint_t mgi_count; /* Count of rings */ mac_intr_t mgi_intr; /* Optional per-group intr */ - /* Only used for rx groups */ + /* Only used for Rx groups */ mac_add_mac_addr_t mgi_addmac; /* Add a MAC address */ mac_rem_mac_addr_t mgi_remmac; /* Remove a MAC address */ + mac_add_vlan_filter_t mgi_addvlan; /* Add a VLAN filter */ + mac_rem_vlan_filter_t mgi_remvlan; /* Remove a VLAN filter */ }; /* @@ -558,11 +631,12 @@ extern void mac_prop_info_set_range_uint32( extern void mac_prop_info_set_perm(mac_prop_info_handle_t, uint8_t); -extern void mac_hcksum_get(mblk_t *, uint32_t *, +extern void mac_hcksum_get(const mblk_t *, uint32_t *, uint32_t *, uint32_t *, uint32_t *, uint32_t *); extern void mac_hcksum_set(mblk_t *, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t); +extern void mac_hcksum_clone(const mblk_t *, mblk_t *); extern void mac_lso_get(mblk_t *, uint32_t *, uint32_t *); diff --git a/usr/src/uts/common/sys/mman.h b/usr/src/uts/common/sys/mman.h index 0d49a2ff4d..65819c1209 100644 --- a/usr/src/uts/common/sys/mman.h +++ b/usr/src/uts/common/sys/mman.h @@ -340,6 +340,7 @@ struct memcntl_mha32 { #define MS_SYNC 0x4 /* wait for msync */ #define MS_ASYNC 0x1 /* return immediately */ #define 
MS_INVALIDATE 0x2 /* invalidate caches */ +#define MS_INVALCURPROC 0x8 /* invalidate cache for curproc only */ #if (_POSIX_C_SOURCE <= 2) && !defined(_XPG4_2) || defined(__EXTENSIONS__) /* functions to mctl */ diff --git a/usr/src/uts/common/sys/mntent.h b/usr/src/uts/common/sys/mntent.h index 88c98dc5a4..7196f7b3ac 100644 --- a/usr/src/uts/common/sys/mntent.h +++ b/usr/src/uts/common/sys/mntent.h @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012, Joyent, Inc. All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * * Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T @@ -47,6 +48,7 @@ extern "C" { #define MNTTYPE_PCFS "pcfs" /* PC (MSDOS) file system */ #define MNTTYPE_PC MNTTYPE_PCFS /* Deprecated name; use MNTTYPE_PCFS */ #define MNTTYPE_LOFS "lofs" /* Loop back file system */ +#define MNTTYPE_HYPRLOFS "hyprlofs" /* Hyperlofs file system */ #define MNTTYPE_LO MNTTYPE_LOFS /* Deprecated name; use MNTTYPE_LOFS */ #define MNTTYPE_HSFS "hsfs" /* High Sierra (9660) file system */ #define MNTTYPE_SWAP "swap" /* Swap file system */ diff --git a/usr/src/uts/common/sys/netconfig.h b/usr/src/uts/common/sys/netconfig.h index 6407534a3b..658f9f3f6b 100644 --- a/usr/src/uts/common/sys/netconfig.h +++ b/usr/src/uts/common/sys/netconfig.h @@ -28,6 +28,7 @@ * * Copyright 2004 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. */ #ifndef _SYS_NETCONFIG_H diff --git a/usr/src/uts/common/sys/neti.h b/usr/src/uts/common/sys/neti.h index b21504109c..92bd5b897d 100644 --- a/usr/src/uts/common/sys/neti.h +++ b/usr/src/uts/common/sys/neti.h @@ -21,6 +21,8 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * + * Copyright 2018, Joyent, Inc. 
*/ #ifndef _SYS_NETI_H @@ -46,6 +48,9 @@ struct msgb; /* avoiding sys/stream.h here */ #define NHF_INET "NHF_INET" #define NHF_INET6 "NHF_INET6" #define NHF_ARP "NHF_ARP" +#define NHF_VND_INET "NHF_VND_INET" +#define NHF_VND_INET6 "NHF_VND_INET6" +#define NHF_VIONA "NHF_VIONA" /* * Event identification @@ -61,7 +66,7 @@ struct msgb; /* avoiding sys/stream.h here */ /* * Network NIC hardware checksum capability */ -#define NET_HCK_NONE 0x00 +#define NET_HCK_NONE 0x00 #define NET_HCK_L3_FULL 0x01 #define NET_HCK_L3_PART 0x02 #define NET_HCK_L4_FULL 0x10 diff --git a/usr/src/uts/common/sys/netstack.h b/usr/src/uts/common/sys/netstack.h index 7ee33318cd..b327e69fad 100644 --- a/usr/src/uts/common/sys/netstack.h +++ b/usr/src/uts/common/sys/netstack.h @@ -88,7 +88,8 @@ typedef id_t netstackid_t; #define NS_IPSECESP 16 #define NS_IPNET 17 #define NS_ILB 18 -#define NS_MAX (NS_ILB+1) +#define NS_VND 19 +#define NS_MAX (NS_VND+1) /* * State maintained for each module which tracks the state of diff --git a/usr/src/uts/common/sys/overlay.h b/usr/src/uts/common/sys/overlay.h new file mode 100644 index 0000000000..12d0dbca51 --- /dev/null +++ b/usr/src/uts/common/sys/overlay.h @@ -0,0 +1,96 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015, Joyent, Inc. 
+ */ + +#ifndef _SYS_OVERLAY_H +#define _SYS_OVERLAY_H + +/* + * Overlay device support + */ + +#include <sys/param.h> +#include <sys/dld_ioc.h> +#include <sys/mac.h> +#include <sys/overlay_common.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define OVERLAY_IOC_CREATE OVERLAYIOC(1) +#define OVERLAY_IOC_DELETE OVERLAYIOC(2) +#define OVERLAY_IOC_PROPINFO OVERLAYIOC(3) +#define OVERLAY_IOC_GETPROP OVERLAYIOC(4) +#define OVERLAY_IOC_SETPROP OVERLAYIOC(5) +#define OVERLAY_IOC_NPROPS OVERLAYIOC(6) +#define OVERLAY_IOC_ACTIVATE OVERLAYIOC(7) +#define OVERLAY_IOC_STATUS OVERLAYIOC(8) + +typedef struct overlay_ioc_create { + datalink_id_t oic_linkid; + uint32_t oic_filler; + uint64_t oic_vnetid; + char oic_encap[MAXLINKNAMELEN]; +} overlay_ioc_create_t; + +typedef struct overlay_ioc_activate { + datalink_id_t oia_linkid; +} overlay_ioc_activate_t; + +typedef struct overlay_ioc_delete { + datalink_id_t oid_linkid; +} overlay_ioc_delete_t; + +typedef struct overlay_ioc_nprops { + datalink_id_t oipn_linkid; + int32_t oipn_nprops; +} overlay_ioc_nprops_t; + +typedef struct overlay_ioc_propinfo { + datalink_id_t oipi_linkid; + int32_t oipi_id; + char oipi_name[OVERLAY_PROP_NAMELEN]; + uint_t oipi_type; + uint_t oipi_prot; + uint8_t oipi_default[OVERLAY_PROP_SIZEMAX]; + uint32_t oipi_defsize; + uint32_t oipi_posssize; + uint8_t oipi_poss[OVERLAY_PROP_SIZEMAX]; +} overlay_ioc_propinfo_t; + +typedef struct overlay_ioc_prop { + datalink_id_t oip_linkid; + int32_t oip_id; + char oip_name[OVERLAY_PROP_NAMELEN]; + uint8_t oip_value[OVERLAY_PROP_SIZEMAX]; + uint32_t oip_size; +} overlay_ioc_prop_t; + +typedef enum overlay_status { + OVERLAY_I_OK = 0x00, + OVERLAY_I_DEGRADED = 0x01 +} overlay_status_t; + +typedef struct overlay_ioc_status { + datalink_id_t ois_linkid; + uint_t ois_status; + char ois_message[OVERLAY_STATUS_BUFLEN]; +} overlay_ioc_status_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_OVERLAY_H */ diff --git a/usr/src/uts/common/sys/overlay_common.h 
b/usr/src/uts/common/sys/overlay_common.h new file mode 100644 index 0000000000..d638096006 --- /dev/null +++ b/usr/src/uts/common/sys/overlay_common.h @@ -0,0 +1,65 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#ifndef _SYS_OVERLAY_COMMON_H +#define _SYS_OVERLAY_COMMON_H + +/* + * Common overlay definitions + */ + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum overlay_target_mode { + OVERLAY_TARGET_NONE = 0x0, + OVERLAY_TARGET_POINT, + OVERLAY_TARGET_DYNAMIC +} overlay_target_mode_t; + +typedef enum overlay_plugin_dest { + OVERLAY_PLUGIN_D_INVALID = 0x0, + OVERLAY_PLUGIN_D_ETHERNET = 0x1, + OVERLAY_PLUGIN_D_IP = 0x2, + OVERLAY_PLUGIN_D_PORT = 0x4, + OVERLAY_PLUGIN_D_MASK = 0x7 +} overlay_plugin_dest_t; + +typedef enum overlay_prop_type { + OVERLAY_PROP_T_INT = 0x1, /* signed int */ + OVERLAY_PROP_T_UINT, /* unsigned int */ + OVERLAY_PROP_T_IP, /* sinaddr6 */ + OVERLAY_PROP_T_STRING /* OVERLAY_PROP_SIZEMAX */ +} overlay_prop_type_t; + +typedef enum overlay_prop_prot { + OVERLAY_PROP_PERM_REQ = 0x1, + OVERLAY_PROP_PERM_READ = 0x2, + OVERLAY_PROP_PERM_WRITE = 0x4, + OVERLAY_PROP_PERM_RW = 0x6, + OVERLAY_PROP_PERM_RRW = 0x7, + OVERLAY_PROP_PERM_MASK = 0x7 +} overlay_prop_prot_t; + +#define OVERLAY_PROP_NAMELEN 64 +#define OVERLAY_PROP_SIZEMAX 256 +#define OVERLAY_STATUS_BUFLEN 256 + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_OVERLAY_COMMON_H */ diff --git a/usr/src/uts/common/sys/overlay_impl.h b/usr/src/uts/common/sys/overlay_impl.h new file mode 100644 index 0000000000..7fb8b8da1d --- /dev/null +++
b/usr/src/uts/common/sys/overlay_impl.h @@ -0,0 +1,205 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#ifndef _SYS_OVERLAY_IMPL_H +#define _SYS_OVERLAY_IMPL_H + +/* + * Overlay device support + */ + +#include <sys/overlay.h> +#include <sys/overlay_common.h> +#include <sys/overlay_plugin.h> +#include <sys/overlay_target.h> +#include <sys/ksynch.h> +#include <sys/list.h> +#include <sys/avl.h> +#include <sys/ksocket.h> +#include <sys/socket.h> +#include <sys/refhash.h> +#include <sys/ethernet.h> +#include <sys/list.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define OVEP_VERSION_ONE 0x1 + +typedef struct overlay_plugin { + kmutex_t ovp_mutex; + list_node_t ovp_link; /* overlay_plugin_lock */ + uint_t ovp_active; /* ovp_mutex */ + const char *ovp_name; /* RO */ + const overlay_plugin_ops_t *ovp_ops; /* RO */ + const char *const *ovp_props; /* RO */ + uint_t ovp_nprops; /* RO */ + uint_t ovp_id_size; /* RO */ + overlay_plugin_flags_t ovp_flags; /* RO */ + overlay_plugin_dest_t ovp_dest; /* RO */ +} overlay_plugin_t; + +typedef struct overlay_mux { + list_node_t omux_lnode; + ksocket_t omux_ksock; /* RO */ + overlay_plugin_t *omux_plugin; /* RO: associated encap */ + int omux_domain; /* RO: socket domain */ + int omux_family; /* RO: socket family */ + int omux_protocol; /* RO: socket protocol */ + struct sockaddr *omux_addr; /* RO: socket address */ + socklen_t omux_alen; /* RO: sockaddr len */ + kmutex_t omux_lock; /* Protects everything below */ + uint_t omux_count; /* Active instances */ + avl_tree_t omux_devices; /* Tree of devices */ +} 
overlay_mux_t; + +typedef enum overlay_target_flag { + OVERLAY_T_TEARDOWN = 0x1 +} overlay_target_flag_t; + +typedef struct overlay_target { + kmutex_t ott_lock; + kcondvar_t ott_cond; + overlay_target_mode_t ott_mode; /* RO */ + overlay_plugin_dest_t ott_dest; /* RO */ + uint64_t ott_id; /* RO */ + overlay_target_flag_t ott_flags; /* ott_lock */ + uint_t ott_ocount; /* ott_lock */ + union { /* ott_lock */ + overlay_target_point_t ott_point; + struct overlay_target_dyn { + refhash_t *ott_dhash; + avl_tree_t ott_tree; + } ott_dyn; + } ott_u; +} overlay_target_t; + +typedef enum overlay_dev_flag { + OVERLAY_F_ACTIVATED = 0x01, /* Activate ioctl completed */ + OVERLAY_F_IN_MUX = 0x02, /* Currently in a mux */ + OVERLAY_F_IN_TX = 0x04, /* Currently doing tx */ + OVERLAY_F_IN_RX = 0x08, /* Currently doing rx */ + OVERLAY_F_IOMASK = 0x0c, /* A mask for rx and tx */ + OVERLAY_F_MDDROP = 0x10, /* Drop traffic for metadata update */ + OVERLAY_F_STOPMASK = 0x1e, /* None set when stopping */ + OVERLAY_F_VARPD = 0x20, /* varpd plugin exists */ + OVERLAY_F_DEGRADED = 0x40, /* device is degraded */ + OVERLAY_F_MASK = 0x7f /* mask of everything */ +} overlay_dev_flag_t; + +typedef struct overlay_dev { + kmutex_t odd_lock; + kcondvar_t odd_iowait; + list_node_t odd_link; /* overlay_dev_lock */ + mac_handle_t odd_mh; /* RO */ + overlay_plugin_t *odd_plugin; /* RO */ + datalink_id_t odd_linkid; /* RO */ + void *odd_pvoid; /* RO -- only used by plugin */ + uint_t odd_ref; /* protected by odd_lock */ + uint_t odd_mtu; /* protected by odd_lock */ + overlay_dev_flag_t odd_flags; /* protected by odd_lock */ + uint_t odd_rxcount; /* protected by odd_lock */ + uint_t odd_txcount; /* protected by odd_lock */ + overlay_mux_t *odd_mux; /* protected by odd_lock */ + uint64_t odd_vid; /* RO if active else odd_lock */ + avl_node_t odd_muxnode; /* managed by mux */ + overlay_target_t *odd_target; /* See big theory statement */ + char odd_fmamsg[OVERLAY_STATUS_BUFLEN]; /* odd_lock */ +} 
overlay_dev_t; + +typedef enum overlay_target_entry_flags { + OVERLAY_ENTRY_F_PENDING = 0x01, /* lookup in progress */ + OVERLAY_ENTRY_F_VALID = 0x02, /* entry is currently valid */ + OVERLAY_ENTRY_F_DROP = 0x04, /* always drop target */ + OVERLAY_ENTRY_F_VALID_MASK = 0x06 +} overlay_target_entry_flags_t; + +typedef struct overlay_target_entry { + kmutex_t ote_lock; + refhash_link_t ote_reflink; /* hashtable link */ + avl_node_t ote_avllink; /* iteration link */ + list_node_t ote_qlink; + overlay_target_entry_flags_t ote_flags; /* RW: state flags */ + uint8_t ote_addr[ETHERADDRL]; /* RO: mac addr */ + overlay_target_t *ote_ott; /* RO */ + overlay_dev_t *ote_odd; /* RO */ + overlay_target_point_t ote_dest; /* RW: destination */ + mblk_t *ote_chead; /* RW: blocked mb chain head */ + mblk_t *ote_ctail; /* RW: blocked mb chain tail */ + size_t ote_mbsize; /* RW: outstanding mblk size */ + hrtime_t ote_vtime; /* RW: valid timestamp */ +} overlay_target_entry_t; + + +#define OVERLAY_CTL "overlay" + +extern dev_info_t *overlay_dip; + +extern mblk_t *overlay_m_tx(void *, mblk_t *); + +typedef int (*overlay_dev_iter_f)(overlay_dev_t *, void *); +extern void overlay_dev_iter(overlay_dev_iter_f, void *); + +extern void overlay_plugin_init(void); +extern overlay_plugin_t *overlay_plugin_lookup(const char *); +extern void overlay_plugin_rele(overlay_plugin_t *); +extern void overlay_plugin_fini(void); +typedef int (*overlay_plugin_walk_f)(overlay_plugin_t *, void *); +extern void overlay_plugin_walk(overlay_plugin_walk_f, void *); + +extern void overlay_io_start(overlay_dev_t *, overlay_dev_flag_t); +extern void overlay_io_done(overlay_dev_t *, overlay_dev_flag_t); + +extern void overlay_mux_init(void); +extern void overlay_mux_fini(void); + +extern overlay_mux_t *overlay_mux_open(overlay_plugin_t *, int, int, int, + struct sockaddr *, socklen_t, int *); +extern void overlay_mux_close(overlay_mux_t *); +extern void overlay_mux_add_dev(overlay_mux_t *, overlay_dev_t *); +extern 
void overlay_mux_remove_dev(overlay_mux_t *, overlay_dev_t *); +extern int overlay_mux_tx(overlay_mux_t *, struct msghdr *, mblk_t *); + +extern void overlay_prop_init(overlay_prop_handle_t); + +extern void overlay_target_init(void); +extern int overlay_target_busy(void); +extern int overlay_target_open(dev_t *, int, int, cred_t *); +extern int overlay_target_ioctl(dev_t, int, intptr_t, int, cred_t *, int *); +extern int overlay_target_close(dev_t, int, int, cred_t *); +extern void overlay_target_free(overlay_dev_t *); + +#define OVERLAY_TARGET_OK 0 +#define OVERLAY_TARGET_DROP 1 +#define OVERLAY_TARGET_ASYNC 2 +extern int overlay_target_lookup(overlay_dev_t *, mblk_t *, struct sockaddr *, + socklen_t *); +extern void overlay_target_quiesce(overlay_target_t *); +extern void overlay_target_fini(void); + +extern void overlay_fm_init(void); +extern void overlay_fm_fini(void); +extern void overlay_fm_degrade(overlay_dev_t *, const char *); +extern void overlay_fm_restore(overlay_dev_t *); + +extern overlay_dev_t *overlay_hold_by_dlid(datalink_id_t); +extern void overlay_hold_rele(overlay_dev_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_OVERLAY_IMPL_H */ diff --git a/usr/src/uts/common/sys/overlay_plugin.h b/usr/src/uts/common/sys/overlay_plugin.h new file mode 100644 index 0000000000..07efaa05df --- /dev/null +++ b/usr/src/uts/common/sys/overlay_plugin.h @@ -0,0 +1,324 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2015 Joyent, Inc. 
+ */ + +#ifndef _SYS_OVERLAY_PLUGIN_H +#define _SYS_OVERLAY_PLUGIN_H + +/* + * overlay plugin interface for encapsulation/decapsulation modules + * + * This header file defines how encapsulation and decapsulation plugins + * interact within the broader system. At this time, these interfaces are + * considered private to illumos and therefore are subject to change. As we gain + * more experience with a few of the different encapsulation formats, say nvgre + * or geneve, then we can move to make this a more-stable interface. + * + * A plugin is a general kernel module that uses the miscellaneous mod-linkage. + * + * In its _init(9E) routine, it must register itself with the overlay + * subsystem. To do this, it allocates an overlay_plugin_register_t via + * overlay_plugin_alloc(), that it then fills out with various required + * information and then attempts to register with the system via a call to + * overlay_plugin_register(). If that succeeds, it should then call + * mod_install(9F). If the mod_install(9F) fails, then it should call + * overlay_plugin_unregister(). Regardless of success or failure, it should call + * overlay_plugin_free() to ensure that any memory that may be associated with + * the registration is freed. + * + * When the module's _fini(9E) is called, overlay_plugin_unregister() should be + * called first. It may return an error, such as EBUSY. In such cases, it should + * be returned as the return status of _fini(9E). This is quite necessary, it + * ensures that if the module is in use it doesn't get unloaded out from under + * the broader subsystem while it's still in use. A driver can use that to + * know that there are no current instances of its private data. + * + * ------------------ + * Plugin Definitions + * ------------------ + * + * A plugin is required to fill in both an operations vector and a series of + * information to the callback routine. Here are the routines and their + * purposes.
The full signatures are available below. + * + * overlay_plugin_init_t + * + * This interface is used to create a new instance of a plugin. An instance + * of a plugin will be created for each overlay device that is created. For + * example, if a device is created with VXLAN ID 23 and ID 42, then there + * will be two different calls to this function. + * + * This function gives the plugin a chance to create a private data + * structure that will be returned on subsequent calls to the system. + * + * overlay_plugin_fini_t + * + * This is the opposite of overlay_plugin_init_t. It will be called when it + * is safe to remove any private data that is associated with this instance + * of the plugin. + * + * overlay_plugin_propinfo_t + * + * This is called with the name of a property that is registered when the + * plugin is created. This function will be called with the name of the + * property that information is being requested about. The plugin is + * responsible for filling out information such as setting the name, the + * type of property it is, the protection of the property (can a user + * update it?), whether the property is required, an optional default value + * for the property, and an optional set of values or ranges that are + * allowed. + * + * overlay_plugin_getprop_t + * + * Return the value of the named property from the current instance of the + * plugin. + * + * overlay_plugin_setprop_t + * + * Set the value of the named property to the specified value for the + * current instance of the plugin. Note, that it is the plugin's + * responsibility to ensure that the value of the property is valid and to + * update state as appropriate. + * + * overlay_plugin_socket_t + * + * Every overlay device has a corresponding socket that it uses to send and + * receive traffic. This routine is used to get the parameters that should + * be used to define such a socket. The actual socket may be multiplexed + * with other uses of it. 
+ * + * overlay_plugin_sockopt_t + * + * Allow a plugin to set any necessary socket options that it needs on the + * kernel socket that is being used by a mux. This will only be called once + * for a given mux, if additional devices are added to a mux, it will not + * be called additional times. + * + * overlay_plugin_encap_t + * + * In this routine you're given a message block and information about the + * packet, such as the identifier and are asked to fill out a message block + * that represents the encapsulation header and optionally manipulate the + * input message if required. + * + * overlay_plugin_decap_t + * + * In this routine, you're given the encapsulated message block. The + * requirement is to decapsulate it and determine what is the correct + * overlay identifier for this network and to fill in the header size so + * the broader system knows how much of this data should be considered + * consumed. + * + * ovpo_callbacks + * + * This should be set to zero, it's reserved for future use. + * + * Once these properties are defined, the module should define the following + * members in the overlay_plugin_register_t. + * + * ovep_version + * + * Should be set to the value of the macro OVEP_VERSION. + * + * ovep_name + * + * Should be set to a character string that has the name of the module. + * Generally this should match the name of the kernel module; however, this + * is the name that users will use to refer to this module when creating + * devices. + * + * overlay_plugin_ops_t + * + * Should be set to the functions as described above. + * + * ovep_props + * + * This is an array of character strings that holds the names of the + * properties of the encapsulation plugin. + * + * + * ovep_id_size + * + * This is the size in bytes of the valid range for the identifier. The + * valid identifier range is considered a ovep_id_size byte unsigned + * integer, [ 0, 1 << (ovep_id_size * 8) ). 
+ * + * ovep_flags + * + * A series of flags that indicate optional features that are supported. + * Valid flags include: + * + * OVEP_F_VLAN_TAG + * + * The encapsulation format allows for the encapsulated + * packet to maintain a VLAN tag. + * + * ovep_dest + * + * Describes the kind of destination that the overlay plugin supports for + * sending traffic. For example, vxlan uses UDP, therefore it requires both + * an IP address and a port; however, nvgre uses the gre header and + * therefore only requires an IP address. The following flags may be + * combined: + * + * OVERLAY_PLUGIN_D_ETHERNET + * + * Indicates that to send a packet to its destination, we + * require a link-layer ethernet address. + * + * OVERLAY_PLUGIN_D_IP + * + * Indicates that to send a packet to its destination, we + * require an IP address. Note, all IP addresses are + * transmitted as IPv6 addresses and for an IPv4 + * destination, using an IPv4-mapped IPv6 address is the + * expected way to transmit that. + * + * OVERLAY_PLUGIN_D_PORT + * + * Indicates that to send a packet to its destination, a + * port is required, this usually indicates that the + * protocol uses something like TCP or UDP. + * + * + * ------------------------------------------------- + * Downcalls, Upcalls, and Synchronization Guarantees + * ------------------------------------------------- + * + * Every instance of a given module is independent. The kernel only guarantees + * that it will probably perform downcalls into different instances in parallel + * at some point. No locking is provided by the framework for synchronization + * across instances. If a module finds itself needing that, it will be up to it + * to provide it. + * + * In a given instance, the kernel may call into entry points in parallel. If + * the instance has private data, it should likely synchronize it. The one + * guarantee that we do make, is that calls to getprop and setprop will be done + * synchronized by a caller holding the MAC perimeter. 
+ * + * While servicing a downcall from the general overlay device framework, a + * kernel module should not make any upcalls, excepting those functions that are + * defined in this header file, e.g. the property related callbacks. Importantly, + * it cannot make any assumptions about what locks may or may not be held by the + * broader system. The only thing that it is safe for it to use are its own + * locks. + * + * ---------------- + * Downcall Context + * ---------------- + * + * For all of the downcalls, excepting the overlay_plugin_encap_t and + * overlay_plugin_decap_t, the calls will be made either in kernel or user + * context, the module should not assume either way. + * + * overlay_plugin_encap_t and overlay_plugin_decap_t may be called in user, + * kernel or interrupt context; however, it is guaranteed that the interrupt + * will be below LOCK_LEVEL, and therefore it is safe to grab locks. + */ + +#include <sys/stream.h> +#include <sys/mac_provider.h> +#include <sys/ksocket.h> +#include <sys/overlay_common.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define OVEP_VERSION 0x1 + +typedef enum overlay_plugin_flags { + OVEP_F_VLAN_TAG = 0x01 /* Supports VLAN Tags */ +} overlay_plugin_flags_t; + +/* + * The ID space could easily be more than a 64-bit number, even + * though today it's either a 24-64 bit value. How should we future + * proof ourselves here? + */ +typedef struct ovep_encap_info { + uint64_t ovdi_id; + size_t ovdi_hdr_size; +} ovep_encap_info_t; + +typedef struct __overlay_prop_handle *overlay_prop_handle_t; +typedef struct __overlay_handle *overlay_handle_t; + +/* + * Plugins are guaranteed that calls to setprop are serialized. However, any + * number of other calls can be going on in parallel otherwise.
+ */ +typedef int (*overlay_plugin_encap_t)(void *, mblk_t *, + ovep_encap_info_t *, mblk_t **); +typedef int (*overlay_plugin_decap_t)(void *, mblk_t *, + ovep_encap_info_t *); +typedef int (*overlay_plugin_init_t)(overlay_handle_t, void **); +typedef void (*overlay_plugin_fini_t)(void *); +typedef int (*overlay_plugin_socket_t)(void *, int *, int *, int *, + struct sockaddr *, socklen_t *); +typedef int (*overlay_plugin_sockopt_t)(ksocket_t); +typedef int (*overlay_plugin_getprop_t)(void *, const char *, void *, + uint32_t *); +typedef int (*overlay_plugin_setprop_t)(void *, const char *, const void *, + uint32_t); +typedef int (*overlay_plugin_propinfo_t)(const char *, overlay_prop_handle_t); + +typedef struct overlay_plugin_ops { + uint_t ovpo_callbacks; + overlay_plugin_init_t ovpo_init; + overlay_plugin_fini_t ovpo_fini; + overlay_plugin_encap_t ovpo_encap; + overlay_plugin_decap_t ovpo_decap; + overlay_plugin_socket_t ovpo_socket; + overlay_plugin_sockopt_t ovpo_sockopt; + overlay_plugin_getprop_t ovpo_getprop; + overlay_plugin_setprop_t ovpo_setprop; + overlay_plugin_propinfo_t ovpo_propinfo; +} overlay_plugin_ops_t; + +typedef struct overlay_plugin_register { + uint_t ovep_version; + const char *ovep_name; + const overlay_plugin_ops_t *ovep_ops; + const char **ovep_props; + uint_t ovep_id_size; + uint_t ovep_flags; + uint_t ovep_dest; +} overlay_plugin_register_t; + +/* + * Functions that interact with registration + */ +extern overlay_plugin_register_t *overlay_plugin_alloc(uint_t); +extern void overlay_plugin_free(overlay_plugin_register_t *); +extern int overlay_plugin_register(overlay_plugin_register_t *); +extern int overlay_plugin_unregister(const char *); + +/* + * Property information callbacks + */ +extern void overlay_prop_set_name(overlay_prop_handle_t, const char *); +extern void overlay_prop_set_prot(overlay_prop_handle_t, overlay_prop_prot_t); +extern void overlay_prop_set_type(overlay_prop_handle_t, overlay_prop_type_t); +extern int 
overlay_prop_set_default(overlay_prop_handle_t, void *, ssize_t); +extern void overlay_prop_set_nodefault(overlay_prop_handle_t); +extern void overlay_prop_set_range_uint32(overlay_prop_handle_t, uint32_t, + uint32_t); +extern void overlay_prop_set_range_str(overlay_prop_handle_t, const char *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_OVERLAY_PLUGIN_H */ diff --git a/usr/src/uts/common/sys/overlay_target.h b/usr/src/uts/common/sys/overlay_target.h new file mode 100644 index 0000000000..ae92ef3532 --- /dev/null +++ b/usr/src/uts/common/sys/overlay_target.h @@ -0,0 +1,293 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2015 Joyent, Inc. 
+ */ + +#ifndef _OVERLAY_TARGET_H +#define _OVERLAY_TARGET_H + +/* + * Overlay device varpd ioctl interface (/dev/overlay) + */ + +#include <sys/types.h> +#include <sys/ethernet.h> +#include <netinet/in.h> +#include <sys/overlay_common.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct overlay_target_point { + uint8_t otp_mac[ETHERADDRL]; + struct in6_addr otp_ip; + uint16_t otp_port; +} overlay_target_point_t; + +#define OVERLAY_TARG_IOCTL (('o' << 24) | ('v' << 16) | ('t' << 8)) + +#define OVERLAY_TARG_INFO (OVERLAY_TARG_IOCTL | 0x01) + +typedef enum overlay_targ_info_flags { + OVERLAY_TARG_INFO_F_ACTIVE = 0x01, + OVERLAY_TARG_INFO_F_DEGRADED = 0x02 +} overlay_targ_info_flags_t; + +/* + * Get target information about an overlay device + */ +typedef struct overlay_targ_info { + datalink_id_t oti_linkid; + uint32_t oti_needs; + uint64_t oti_flags; + uint64_t oti_vnetid; +} overlay_targ_info_t; + +/* + * Declare an association between a given varpd instance and a datalink. + */ +#define OVERLAY_TARG_ASSOCIATE (OVERLAY_TARG_IOCTL | 0x02) + +typedef struct overlay_targ_associate { + datalink_id_t ota_linkid; + uint32_t ota_mode; + uint64_t ota_id; + uint32_t ota_provides; + overlay_target_point_t ota_point; +} overlay_targ_associate_t; + +/* + * Remove an association from a device. If the device has already been started, + * this implies OVERLAY_TARG_DEGRADE. + */ +#define OVERLAY_TARG_DISASSOCIATE (OVERLAY_TARG_IOCTL | 0x3) + +/* + * Tells the kernel that while a varpd instance still exists, it basically isn't + * making any forward progress, so the device should consider itself degraded. + */ +#define OVERLAY_TARG_DEGRADE (OVERLAY_TARG_IOCTL | 0x4) + +typedef struct overlay_targ_degrade { + datalink_id_t otd_linkid; + uint32_t otd_pad; + char otd_buf[OVERLAY_STATUS_BUFLEN]; +} overlay_targ_degrade_t; + +/* + * Tells the kernel to remove the degraded status that it set on a device. 
+ */ +#define OVERLAY_TARG_RESTORE (OVERLAY_TARG_IOCTL | 0x5) + +typedef struct overlay_targ_id { + datalink_id_t otid_linkid; +} overlay_targ_id_t; + +/* + * The following ioctls are all used to support dynamic lookups from userland, + * generally serviced by varpd. + * + * The way this is designed to work is that user land will have threads sitting + * in OVERLAY_TARG_LOOKUP ioctls waiting to service requests. A thread will sit + * waiting for work for up to approximately one second of time before they will + * be sent back out to user land to give user land a chance to clean itself up + * or more generally, come back into the kernel for work. Once these threads + * return, they will have a request with which more action can be done. The + * following ioctls can all be used to answer the request. + * + * OVERLAY_TARG_RESPOND - overlay_targ_resp_t + * + * The overlay_targ_resp_t has the appropriate information from + * which a reply can be generated. The information is filled into + * an overlay_target_point_t as appropriate based on the + * overlay_plugin_dest_t type. + * + * + * OVERLAY_TARG_DROP - overlay_targ_resp_t + * + * The overlay_targ_resp_t should identify a request for which to + * drop a packet. + * + * + * OVERLAY_TARG_INJECT - overlay_targ_pkt_t + * + * The overlay_targ_pkt_t injects a fully formed packet into the + * virtual network. It may either be identified by its data link id + * or by the request id. If both are specified, the + * datalink id will be used. Note, that an injection is not + * considered a reply and if this corresponds to a request, then + * that individual packet must still be dropped. + * + * + * OVERLAY_TARG_PKT - overlay_targ_pkt_t + * + * This ioctl can be used to copy data from a given request into a + * user buffer. This can be used in combination with + * OVERLAY_TARG_INJECT to implement services such as a proxy-arp.
+ * + * + * OVERLAY_TARG_RESEND - overlay_targ_pkt_t + * + * This ioctl is similar to the OVERLAY_TARG_INJECT, except instead + * of receiving it on the local mac handle, it queues it for + * retransmission again. This is useful if you have a packet that + * was originally destined for some broadcast or multicast address + * that you now want to send to a unicast address. + */ +#define OVERLAY_TARG_LOOKUP (OVERLAY_TARG_IOCTL | 0x10) +#define OVERLAY_TARG_RESPOND (OVERLAY_TARG_IOCTL | 0x11) +#define OVERLAY_TARG_DROP (OVERLAY_TARG_IOCTL | 0x12) +#define OVERLAY_TARG_INJECT (OVERLAY_TARG_IOCTL | 0x13) +#define OVERLAY_TARG_PKT (OVERLAY_TARG_IOCTL | 0x14) +#define OVERLAY_TARG_RESEND (OVERLAY_TARG_IOCTL | 0x15) + +typedef struct overlay_targ_lookup { + uint64_t otl_dlid; + uint64_t otl_reqid; + uint64_t otl_varpdid; + uint64_t otl_vnetid; + uint64_t otl_hdrsize; + uint64_t otl_pktsize; + uint8_t otl_srcaddr[ETHERADDRL]; + uint8_t otl_dstaddr[ETHERADDRL]; + uint32_t otl_dsttype; + uint32_t otl_sap; + int32_t otl_vlan; +} overlay_targ_lookup_t; + +typedef struct overlay_targ_resp { + uint64_t otr_reqid; + overlay_target_point_t otr_answer; +} overlay_targ_resp_t; + +typedef struct overlay_targ_pkt { + uint64_t otp_linkid; + uint64_t otp_reqid; + uint64_t otp_size; + void *otp_buf; +} overlay_targ_pkt_t; + +#ifdef _KERNEL + +typedef struct overlay_targ_pkt32 { + uint64_t otp_linkid; + uint64_t otp_reqid; + uint64_t otp_size; + caddr32_t otp_buf; +} overlay_targ_pkt32_t; + +#endif /* _KERNEL */ + +/* + * This provides a way to get a list of active overlay devices independently + * from dlmgmtd. At the end of the day the kernel always knows what will exist + * and this allows varpd which is an implementation of libdladm not to end up + * needing to call back into dlmgmtd via libdladm and create an unfortunate + * dependency cycle. 
+ */ + +#define OVERLAY_TARG_LIST (OVERLAY_TARG_IOCTL | 0x20) + +typedef struct overlay_targ_list { + uint32_t otl_nents; + uint32_t otl_ents[]; +} overlay_targ_list_t; + +/* + * The following family of ioctls all manipulate the target cache of a given + * device. + * + * OVERLAY_TARG_CACHE_GET - overlay_targ_cache_t + * + * The overlay_targ_cache_t should have its link identifier and + * the desired mac address filled in. On return, it will fill in + * the otce_dest member, if the entry exists in the table. + * + * + * OVERLAY_TARG_CACHE_SET - overlay_targ_cache_t + * + * The cache table entry of the mac address referred to by otce_mac + * and otc_linkid will be filled in with the details provided in + * the otce_dest member. + * + * OVERLAY_TARG_CACHE_REMOVE - overlay_targ_cache_t + * + * Removes the cache entry identified by otce_mac from the table. + * Note that this does not stop any in-flight lookups or deal with + * any data that is awaiting a lookup. + * + * + * OVERLAY_TARG_CACHE_FLUSH - overlay_targ_cache_t + * + * Similar to OVERLAY_TARG_CACHE_REMOVE, but functions on the + * entire table identified by otc_linkid. All other parameters are + * ignored. + * + * + * OVERLAY_TARG_CACHE_ITER - overlay_targ_cache_iter_t + * + * Iterates over the contents of a target cache identified by + * otci_linkid. Iteration is guaranteed to be exactly once for + * items which are in the hashtable at the beginning and end of + * iteration. For items which are added or removed after iteration + * has begun, only at most once semantics are guaranteed. Consumers + * should ensure that otci_marker is zeroed before starting + * iteration and should preserve its contents across calls. + * + * Before calling in, otci_count should be set to the number of + * entries that space has been allocated for in otci_ents. The + * value will be updated to indicate the total number written out.
+ */ + +#define OVERLAY_TARG_CACHE_GET (OVERLAY_TARG_IOCTL | 0x30) +#define OVERLAY_TARG_CACHE_SET (OVERLAY_TARG_IOCTL | 0x31) +#define OVERLAY_TARG_CACHE_REMOVE (OVERLAY_TARG_IOCTL | 0x32) +#define OVERLAY_TARG_CACHE_FLUSH (OVERLAY_TARG_IOCTL | 0x33) +#define OVERLAY_TARG_CACHE_ITER (OVERLAY_TARG_IOCTL | 0x34) + +/* + * This is a pretty arbitrary number that we're constraining ourselves to + * for iteration. Basically the goal is to make sure that we can't have a user + * ask us to allocate too much memory on their behalf at any time. A more + * dynamic form may be necessary some day. + */ +#define OVERLAY_TARGET_ITER_MAX 500 + +#define OVERLAY_TARGET_CACHE_DROP 0x01 + +typedef struct overlay_targ_cache_entry { + uint8_t otce_mac[ETHERADDRL]; + uint16_t otce_flags; + overlay_target_point_t otce_dest; +} overlay_targ_cache_entry_t; + +typedef struct overlay_targ_cache { + datalink_id_t otc_linkid; + overlay_targ_cache_entry_t otc_entry; +} overlay_targ_cache_t; + +typedef struct overlay_targ_cache_iter { + datalink_id_t otci_linkid; + uint32_t otci_pad; + uint64_t otci_marker; + uint16_t otci_count; + uint8_t otci_pad2[3]; + overlay_targ_cache_entry_t otci_ents[]; +} overlay_targ_cache_iter_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _OVERLAY_TARGET_H */ diff --git a/usr/src/uts/common/sys/param.h b/usr/src/uts/common/sys/param.h index 282d84b912..66bd91f76f 100644 --- a/usr/src/uts/common/sys/param.h +++ b/usr/src/uts/common/sys/param.h @@ -116,7 +116,7 @@ extern "C" { #define DEFAULT_MAXPID 999999 #define DEFAULT_JUMPPID 100000 #else -#define DEFAULT_MAXPID 30000 +#define DEFAULT_MAXPID 99999 #define DEFAULT_JUMPPID 0 #endif diff --git a/usr/src/uts/common/sys/pattr.h b/usr/src/uts/common/sys/pattr.h index 1269aeca10..a1fb21ad21 100644 --- a/usr/src/uts/common/sys/pattr.h +++ b/usr/src/uts/common/sys/pattr.h @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2019 Joyent, Inc. 
*/ #ifndef _SYS_PATTR_H @@ -97,6 +98,8 @@ typedef struct pattr_hcksum_s { #define HCK_FLAGS (HCK_IPV4_HDRCKSUM | HCK_PARTIALCKSUM | \ HCK_FULLCKSUM | HCK_FULLCKSUM_OK) +#define HCK_TX_FLAGS (HCK_IPV4_HDRCKSUM | HCK_PARTIALCKSUM | \ + HCK_FULLCKSUM) /* * Extended hardware offloading flags that also use hcksum_flags */ diff --git a/usr/src/uts/common/sys/pci_cap.h b/usr/src/uts/common/sys/pci_cap.h index 730e10d77b..9804913241 100644 --- a/usr/src/uts/common/sys/pci_cap.h +++ b/usr/src/uts/common/sys/pci_cap.h @@ -82,12 +82,12 @@ typedef enum { #define PCI_CAP_GET32(h, i, b, o) ((uint32_t) \ pci_cap_get(h, PCI_CAP_CFGSZ_32, i, b, o)) -#define PCI_CAP_PUT8(h, i, b, o, d) ((uint8_t) \ - pci_cap_put(h, PCI_CAP_CFGSZ_8, i, b, o, d)) -#define PCI_CAP_PUT16(h, i, b, o, d) ((uint16_t) \ - pci_cap_put(h, PCI_CAP_CFGSZ_16, i, b, o, d)) -#define PCI_CAP_PUT32(h, i, b, o, d) ((uint32_t) \ - pci_cap_put(h, PCI_CAP_CFGSZ_32, i, b, o, d)) +#define PCI_CAP_PUT8(h, i, b, o, d) \ + pci_cap_put(h, PCI_CAP_CFGSZ_8, i, b, o, d) +#define PCI_CAP_PUT16(h, i, b, o, d) \ + pci_cap_put(h, PCI_CAP_CFGSZ_16, i, b, o, d) +#define PCI_CAP_PUT32(h, i, b, o, d) \ + pci_cap_put(h, PCI_CAP_CFGSZ_32, i, b, o, d) #define PCI_XCAP_GET8(h, i, b, o) ((uint8_t) \ pci_cap_get(h, PCI_CAP_CFGSZ_8, PCI_CAP_XCFG_SPC(i), b, o)) @@ -96,12 +96,12 @@ typedef enum { #define PCI_XCAP_GET32(h, i, b, o) ((uint32_t) \ pci_cap_get(h, PCI_CAP_CFGSZ_32, PCI_CAP_XCFG_SPC(i), b, o)) -#define PCI_XCAP_PUT8(h, i, b, o, d) ((uint8_t) \ - pci_cap_put(h, PCI_CAP_CFGSZ_8, PCI_CAP_XCFG_SPC(i), b, o, d)) -#define PCI_XCAP_PUT16(h, i, b, o, d) ((uint16_t) \ - pci_cap_put(h, PCI_CAP_CFGSZ_16, PCI_CAP_XCFG_SPC(i), b, o, d)) -#define PCI_XCAP_PUT32(h, i, b, o, d) ((uint32_t) \ - pci_cap_put(h, PCI_CAP_CFGSZ_32, PCI_CAP_XCFG_SPC(i), b, o, d)) +#define PCI_XCAP_PUT8(h, i, b, o, d) \ + pci_cap_put(h, PCI_CAP_CFGSZ_8, PCI_CAP_XCFG_SPC(i), b, o, d) +#define PCI_XCAP_PUT16(h, i, b, o, d) \ + pci_cap_put(h, PCI_CAP_CFGSZ_16, 
PCI_CAP_XCFG_SPC(i), b, o, d) +#define PCI_XCAP_PUT32(h, i, b, o, d) \ + pci_cap_put(h, PCI_CAP_CFGSZ_32, PCI_CAP_XCFG_SPC(i), b, o, d) extern int pci_cap_probe(ddi_acc_handle_t h, uint16_t index, diff --git a/usr/src/uts/common/sys/pcie_impl.h b/usr/src/uts/common/sys/pcie_impl.h index d1d13625c2..442c55043c 100644 --- a/usr/src/uts/common/sys/pcie_impl.h +++ b/usr/src/uts/common/sys/pcie_impl.h @@ -166,6 +166,7 @@ extern "C" { #define PCIE_ADV_BDG_HDR(pfd_p, n) PCIE_ADV_BDG_REG(pfd_p)->pcie_sue_hdr[n] #define PCIE_ADV_RP_REG(pfd_p) \ PCIE_ADV_REG(pfd_p)->pcie_ext.pcie_adv_rp_regs +#define PCIE_SLOT_REG(pfd_p) pfd_p->pe_pcie_slot_regs #define PFD_AFFECTED_DEV(pfd_p) pfd_p->pe_affected_dev #define PFD_SET_AFFECTED_FLAG(pfd_p, aff_flag) \ PFD_AFFECTED_DEV(pfd_p)->pe_affected_flags = aff_flag @@ -262,6 +263,18 @@ typedef struct pf_pcie_err_regs { pf_pcie_adv_err_regs_t *pcie_adv_regs; /* pcie aer regs */ } pf_pcie_err_regs_t; +/* + * Slot register values for hotplug-capable Downstream Ports or Root Ports with + * the Slot Implemented capability bit set. We gather these to help determine + * whether the slot's child device is physically present. 
+ */ +typedef struct pf_pcie_slot_regs { + boolean_t pcie_slot_regs_valid; /* true if register values are valid */ + uint32_t pcie_slot_cap; /* pcie slot capabilities register */ + uint16_t pcie_slot_control; /* pcie slot control register */ + uint16_t pcie_slot_status; /* pcie slot status register */ +} pf_pcie_slot_regs_t; + typedef enum { PF_INTR_TYPE_NONE = 0, PF_INTR_TYPE_FABRIC = 1, /* Fabric Message */ @@ -431,6 +444,7 @@ struct pf_data { pf_pcie_err_regs_t *pe_pcie_regs; /* PCIe error reg */ } pe_ext; pf_pcix_bdg_err_regs_t *pe_pcix_bdg_regs; /* PCI-X bridge regs */ + pf_pcie_slot_regs_t *pe_pcie_slot_regs; /* PCIe slot regs */ pf_data_t *pe_prev; /* Next error in queue */ pf_data_t *pe_next; /* Next error in queue */ boolean_t pe_rber_fatal; diff --git a/usr/src/uts/common/sys/policy.h b/usr/src/uts/common/sys/policy.h index de15be4d60..816d6995cf 100644 --- a/usr/src/uts/common/sys/policy.h +++ b/usr/src/uts/common/sys/policy.h @@ -108,6 +108,7 @@ int secpolicy_ipc_owner(const cred_t *, const struct kipc_perm *); int secpolicy_kmdb(const cred_t *); int secpolicy_lock_memory(const cred_t *); int secpolicy_meminfo(const cred_t *); +int secpolicy_fs_import(const cred_t *); int secpolicy_modctl(const cred_t *, int); int secpolicy_net(const cred_t *, int, boolean_t); int secpolicy_net_bindmlp(const cred_t *); @@ -176,6 +177,7 @@ int secpolicy_setid_setsticky_clear(vnode_t *, vattr_t *, const vattr_t *, cred_t *); int secpolicy_xvattr(xvattr_t *, uid_t, cred_t *, vtype_t); int secpolicy_xvm_control(const cred_t *); +int secpolicy_hyprlofs_control(const cred_t *); int secpolicy_basic_exec(const cred_t *, vnode_t *); int secpolicy_basic_fork(const cred_t *); diff --git a/usr/src/uts/common/sys/poll_impl.h b/usr/src/uts/common/sys/poll_impl.h index 67b47f9a1e..3e0eb3b21f 100644 --- a/usr/src/uts/common/sys/poll_impl.h +++ b/usr/src/uts/common/sys/poll_impl.h @@ -25,7 +25,7 @@ */ /* - * Copyright 2015, Joyent, Inc. + * Copyright 2017 Joyent, Inc. 
*/ #ifndef _SYS_POLL_IMPL_H @@ -140,6 +140,7 @@ struct pollstate { pollstate_t *ps_contend_nextp; /* next in contender list */ pollstate_t **ps_contend_pnextp; /* pointer-to-previous-next */ int ps_flags; /* state flags */ + short ps_implicit_ev; /* implicit poll event interest */ }; /* pollstate flags */ @@ -225,6 +226,7 @@ struct polldat { int pd_nsets; /* num of xref sets, used by poll(2) */ xref_t *pd_ref; /* ptr to xref info, 1 for each set */ port_kevent_t *pd_portev; /* associated port event struct */ + uf_entry_gen_t pd_gen; /* fd generation at cache time */ uint64_t pd_epolldata; /* epoll data, if any */ }; @@ -256,6 +258,7 @@ struct pollcache { /* pc_flag */ #define PC_POLLWAKE 0x02 /* pollwakeup() occurred */ +#define PC_EPOLL 0x04 /* pollcache is epoll-enabled */ #if defined(_KERNEL) /* diff --git a/usr/src/uts/common/sys/proc.h b/usr/src/uts/common/sys/proc.h index 712bd7cb24..7d2209132d 100644 --- a/usr/src/uts/common/sys/proc.h +++ b/usr/src/uts/common/sys/proc.h @@ -315,6 +315,7 @@ typedef struct proc { size_t p_swrss; /* resident set size before last swap */ struct aio *p_aio; /* pointer to async I/O struct */ struct itimer **p_itimer; /* interval timers */ + uint_t p_itimer_sz; /* max allocated interval timers */ timeout_id_t p_alarmid; /* alarm's timeout id */ caddr_t p_usrstack; /* top of the process stack */ uint_t p_stkprot; /* stack memory protection */ @@ -358,6 +359,7 @@ typedef struct proc { struct zone *p_zone; /* zone in which process lives */ struct vnode *p_execdir; /* directory that p_exec came from */ struct brand *p_brand; /* process's brand */ + void *p_brand_data; /* per-process brand state */ psecflags_t p_secflags; /* per-process security flags */ @@ -374,7 +376,6 @@ typedef struct proc { */ struct user p_user; /* (see sys/user.h) */ } proc_t; - #define PROC_T /* headers relying on proc_t are OK */ #ifdef _KERNEL @@ -640,6 +641,7 @@ extern int signal_is_blocked(kthread_t *, int); extern int sigcheck(proc_t *, kthread_t *); 
extern void sigdefault(proc_t *); +extern struct pid *pid_find(pid_t pid); extern void pid_setmin(void); extern pid_t pid_allocate(proc_t *, pid_t, int); extern int pid_rele(struct pid *); @@ -655,6 +657,7 @@ extern int sprtrylock_proc(proc_t *); extern void sprwaitlock_proc(proc_t *); extern void sprlock_proc(proc_t *); extern void sprunlock(proc_t *); +extern void sprunprlock(proc_t *); extern void pid_init(void); extern proc_t *pid_entry(int); extern int pid_slot(proc_t *); @@ -729,6 +732,10 @@ extern kthread_t *thread_unpin(void); extern void thread_init(void); extern void thread_load(kthread_t *, void (*)(), caddr_t, size_t); +extern void thread_splitstack(void (*)(void *), void *, size_t); +extern void thread_splitstack_run(caddr_t, void (*)(void *), void *); +extern void thread_splitstack_cleanup(void); + extern void tsd_create(uint_t *, void (*)(void *)); extern void tsd_destroy(uint_t *); extern void *tsd_getcreate(uint_t *, void (*)(void *), void *(*)(void)); @@ -770,7 +777,7 @@ extern void pokelwps(proc_t *); extern void continuelwps(proc_t *); extern int exitlwps(int); extern void lwp_ctmpl_copy(klwp_t *, klwp_t *); -extern void lwp_ctmpl_clear(klwp_t *); +extern void lwp_ctmpl_clear(klwp_t *, boolean_t); extern klwp_t *forklwp(klwp_t *, proc_t *, id_t); extern void lwp_load(klwp_t *, gregset_t, uintptr_t); extern void lwp_setrval(klwp_t *, int, int); diff --git a/usr/src/uts/common/sys/procfs.h b/usr/src/uts/common/sys/procfs.h index dfb54eaef5..b24b2d9da1 100644 --- a/usr/src/uts/common/sys/procfs.h +++ b/usr/src/uts/common/sys/procfs.h @@ -25,7 +25,7 @@ */ /* * Copyright 2012 DEY Storage Systems, Inc. All rights reserved. - * Copyright 2018 Joyent, Inc. + * Copyright 2019 Joyent, Inc. */ #ifndef _SYS_PROCFS_H @@ -236,6 +236,7 @@ typedef struct pstatus { #define PR_FAULTED 6 #define PR_SUSPENDED 7 #define PR_CHECKPOINT 8 +#define PR_BRAND 9 /* * lwp ps(1) information file. 
/proc/<pid>/lwp/<lwpid>/lwpsinfo @@ -270,10 +271,12 @@ typedef struct lwpsinfo { int pr_filler[4]; /* reserved for future use */ } lwpsinfo_t; +#define PRARGSZ 80 /* number of chars of arguments */ +#define PRMAXARGVLEN 4096 /* max len of /proc/%s/argv */ + /* * process ps(1) information file. /proc/<pid>/psinfo */ -#define PRARGSZ 80 /* number of chars of arguments */ typedef struct psinfo { int pr_flag; /* process flags (DEPRECATED; do not use) */ int pr_nlwp; /* number of active lwps in the process */ diff --git a/usr/src/uts/common/sys/prsystm.h b/usr/src/uts/common/sys/prsystm.h index 7adc920da2..75259dc421 100644 --- a/usr/src/uts/common/sys/prsystm.h +++ b/usr/src/uts/common/sys/prsystm.h @@ -28,7 +28,7 @@ /* All Rights Reserved */ /* - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. */ #ifndef _SYS_PRSYSTM_H @@ -86,7 +86,7 @@ extern void prgetcred(proc_t *, struct prcred *); extern void prgetpriv(proc_t *, struct prpriv *); extern size_t prgetprivsize(void); extern void prgetsecflags(proc_t *, struct prsecflags *); -extern int prnsegs(struct as *, int); +extern uint_t prnsegs(struct as *, int); extern void prexit(proc_t *); extern void prfree(proc_t *); extern void prlwpexit(kthread_t *); diff --git a/usr/src/uts/common/sys/ptms.h b/usr/src/uts/common/sys/ptms.h index 55987fe6d7..8b97fd7e3b 100644 --- a/usr/src/uts/common/sys/ptms.h +++ b/usr/src/uts/common/sys/ptms.h @@ -126,6 +126,12 @@ extern void ptms_logp(char *, uintptr_t); #define DDBGP(a, b) #endif +typedef struct __ptmptsopencb_arg *ptmptsopencb_arg_t; +typedef struct ptmptsopencb { + boolean_t (*ppocb_func)(ptmptsopencb_arg_t); + ptmptsopencb_arg_t ppocb_arg; +} ptmptsopencb_t; + #endif /* _KERNEL */ typedef struct pt_own { @@ -157,6 +163,19 @@ typedef struct pt_own { #define ZONEPT (('P'<<8)|4) /* set zone of master/slave pair */ #define OWNERPT (('P'<<8)|5) /* set owner/group for slave device */ +#ifdef _KERNEL +/* + * kernel ioctl commands + * + * 
PTMPTSOPENCB: Returns a callback function pointer and opaque argument. + * The return value of the callback function when it's invoked + * with the opaque argument passed to it will indicate if the + * pts slave device is currently open. + */ +#define PTMPTSOPENCB (('P'<<8)|6) /* check if the slave is open */ + +#endif /* _KERNEL */ + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_hash.h b/usr/src/uts/common/sys/refhash.h index 2069e6d3f1..b7427a454d 100644 --- a/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_hash.h +++ b/usr/src/uts/common/sys/refhash.h @@ -10,11 +10,11 @@ */ /* - * Copyright 2014 Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. */ -#ifndef _SYS_SCSI_ADAPTERS_MPTHASH_H -#define _SYS_SCSI_ADAPTERS_MPTHASH_H +#ifndef _SYS_REFHASH_H +#define _SYS_REFHASH_H #include <sys/types.h> #include <sys/list.h> @@ -58,4 +58,4 @@ extern void *refhash_first(refhash_t *); extern void *refhash_next(refhash_t *, void *); extern boolean_t refhash_obj_valid(refhash_t *hp, const void *); -#endif /* _SYS_SCSI_ADAPTERS_MPTHASH_H */ +#endif /* _SYS_REFHASH_H */ diff --git a/usr/src/uts/common/sys/resource.h b/usr/src/uts/common/sys/resource.h index 13166f378d..d65ca00f69 100644 --- a/usr/src/uts/common/sys/resource.h +++ b/usr/src/uts/common/sys/resource.h @@ -23,6 +23,7 @@ * * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. All rights reserved. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ diff --git a/usr/src/uts/common/sys/rt.h b/usr/src/uts/common/sys/rt.h index d4233aecb5..2ed7320a09 100644 --- a/usr/src/uts/common/sys/rt.h +++ b/usr/src/uts/common/sys/rt.h @@ -22,6 +22,7 @@ /* * Copyright 2004 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. 
*/ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -75,6 +76,16 @@ typedef struct rtkparms { int rt_tqsig; /* real-time time quantum signal */ uint_t rt_cflags; /* real-time control flags */ } rtkparms_t; + +#define RTGPPRIO0 100 /* Global priority for RT priority 0 */ + +/* + * control flags (kparms->rt_cflags). + */ +#define RT_DOPRI 0x01 /* change priority */ +#define RT_DOTQ 0x02 /* change RT time quantum */ +#define RT_DOSIG 0x04 /* change RT time quantum signal */ + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mpi/mpi2_pci.h b/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mpi/mpi2_pci.h new file mode 100644 index 0000000000..afb7a94c58 --- /dev/null +++ b/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mpi/mpi2_pci.h @@ -0,0 +1,147 @@ +/*- + * Copyright (c) 2012-2015 LSI Corp. + * Copyright (c) 2013-2016 Avago Technologies + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the author nor the names of any co-contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Copyright (c) 2000-2015 LSI Corporation. + * Copyright (c) 2013-2016 Avago Technologies + * All rights reserved. + * + * + * Name: mpi2_pci.h + * Title: MPI PCIe Attached Devices structures and definitions. + * Creation Date: October 9, 2012 + * + * mpi2_pci.h Version: 02.00.02 + * + * NOTE: Names (typedefs, defines, etc.) beginning with an MPI25 or Mpi25 + * prefix are for use only on MPI v2.5 products, and must not be used + * with MPI v2.0 products. Unless otherwise noted, names beginning with + * MPI2 or Mpi2 are for use with both MPI v2.0 and MPI v2.5 products. + * + * Version History + * --------------- + * + * Date Version Description + * -------- -------- ------------------------------------------------------ + * 03-16-15 02.00.00 Initial version. + * 02-17-16 02.00.01 Removed AHCI support. + * Removed SOP support. + * 07-01-16 02.00.02 Added MPI26_NVME_FLAGS_FORCE_ADMIN_ERR_RESP to + * NVME Encapsulated Request. + * -------------------------------------------------------------------------- + */ + +#ifndef MPI2_PCI_H +#define MPI2_PCI_H + + +/* + * Values for the PCIe DeviceInfo field used in PCIe Device Status Change Event + * data and PCIe Configuration pages. 
+ */ +#define MPI26_PCIE_DEVINFO_DIRECT_ATTACH (0x00000010) + +#define MPI26_PCIE_DEVINFO_MASK_DEVICE_TYPE (0x0000000F) +#define MPI26_PCIE_DEVINFO_NO_DEVICE (0x00000000) +#define MPI26_PCIE_DEVINFO_PCI_SWITCH (0x00000001) +#define MPI26_PCIE_DEVINFO_NVME (0x00000003) + + +/**************************************************************************** +* NVMe Encapsulated message +****************************************************************************/ + +/* NVME Encapsulated Request Message */ +typedef struct _MPI26_NVME_ENCAPSULATED_REQUEST +{ + U16 DevHandle; /* 0x00 */ + U8 ChainOffset; /* 0x02 */ + U8 Function; /* 0x03 */ + U16 EncapsulatedCommandLength; /* 0x04 */ + U8 Reserved1; /* 0x06 */ + U8 MsgFlags; /* 0x07 */ + U8 VP_ID; /* 0x08 */ + U8 VF_ID; /* 0x09 */ + U16 Reserved2; /* 0x0A */ + U32 Reserved3; /* 0x0C */ + U64 ErrorResponseBaseAddress; /* 0x10 */ + U16 ErrorResponseAllocationLength; /* 0x18 */ + U16 Flags; /* 0x1A */ + U32 DataLength; /* 0x1C */ + U8 NVMe_Command[4]; /* 0x20 */ /* variable length */ + +} MPI26_NVME_ENCAPSULATED_REQUEST, MPI2_POINTER PTR_MPI26_NVME_ENCAPSULATED_REQUEST, + Mpi26NVMeEncapsulatedRequest_t, MPI2_POINTER pMpi26NVMeEncapsulatedRequest_t; + +/* defines for the Flags field */ +#define MPI26_NVME_FLAGS_FORCE_ADMIN_ERR_RESP (0x0020) +/* Submission Queue Type*/ +#define MPI26_NVME_FLAGS_SUBMISSIONQ_MASK (0x0010) +#define MPI26_NVME_FLAGS_SUBMISSIONQ_IO (0x0000) +#define MPI26_NVME_FLAGS_SUBMISSIONQ_ADMIN (0x0010) +/* Error Response Address Space */ +#define MPI26_NVME_FLAGS_MASK_ERROR_RSP_ADDR (0x000C) +#define MPI26_NVME_FLAGS_SYSTEM_RSP_ADDR (0x0000) +#define MPI26_NVME_FLAGS_IOCPLB_RSP_ADDR (0x0008) +#define MPI26_NVME_FLAGS_IOCPLBNTA_RSP_ADDR (0x000C) +/* Data Direction*/ +#define MPI26_NVME_FLAGS_DATADIRECTION_MASK (0x0003) +#define MPI26_NVME_FLAGS_NODATATRANSFER (0x0000) +#define MPI26_NVME_FLAGS_WRITE (0x0001) +#define MPI26_NVME_FLAGS_READ (0x0002) +#define MPI26_NVME_FLAGS_BIDIRECTIONAL (0x0003) + + +/* NVMe 
Encapsulated Reply Message */ +typedef struct _MPI26_NVME_ENCAPSULATED_ERROR_REPLY +{ + U16 DevHandle; /* 0x00 */ + U8 MsgLength; /* 0x02 */ + U8 Function; /* 0x03 */ + U16 EncapsulatedCommandLength; /* 0x04 */ + U8 Reserved1; /* 0x06 */ + U8 MsgFlags; /* 0x07 */ + U8 VP_ID; /* 0x08 */ + U8 VF_ID; /* 0x09 */ + U16 Reserved2; /* 0x0A */ + U16 Reserved3; /* 0x0C */ + U16 IOCStatus; /* 0x0E */ + U32 IOCLogInfo; /* 0x10 */ + U16 ErrorResponseCount; /* 0x14 */ + U16 Reserved4; /* 0x16 */ +} MPI26_NVME_ENCAPSULATED_ERROR_REPLY, + MPI2_POINTER PTR_MPI26_NVME_ENCAPSULATED_ERROR_REPLY, + Mpi26NVMeEncapsulatedErrorReply_t, + MPI2_POINTER pMpi26NVMeEncapsulatedErrorReply_t; + + +#endif + + diff --git a/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_var.h b/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_var.h index ba340549c6..5b7a3f6442 100644 --- a/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_var.h +++ b/usr/src/uts/common/sys/scsi/adapters/mpt_sas/mptsas_var.h @@ -58,11 +58,11 @@ #include <sys/byteorder.h> #include <sys/queue.h> +#include <sys/refhash.h> #include <sys/isa_defs.h> #include <sys/sunmdi.h> #include <sys/mdi_impldefs.h> #include <sys/ddi_ufm.h> -#include <sys/scsi/adapters/mpt_sas/mptsas_hash.h> #include <sys/scsi/adapters/mpt_sas/mptsas_ioctl.h> #include <sys/scsi/adapters/mpt_sas/mpi/mpi2_tool.h> #include <sys/scsi/adapters/mpt_sas/mpi/mpi2_cnfg.h> diff --git a/usr/src/uts/common/sys/scsi/adapters/smrt/smrt.h b/usr/src/uts/common/sys/scsi/adapters/smrt/smrt.h new file mode 100644 index 0000000000..5aba743834 --- /dev/null +++ b/usr/src/uts/common/sys/scsi/adapters/smrt/smrt.h @@ -0,0 +1,750 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source.
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2017, Joyent, Inc. + */ + +#ifndef _SMRT_H +#define _SMRT_H + +#include <sys/types.h> +#include <sys/pci.h> +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/conf.h> +#include <sys/map.h> +#include <sys/modctl.h> +#include <sys/kmem.h> +#include <sys/cmn_err.h> +#include <sys/stat.h> +#include <sys/scsi/scsi.h> +#include <sys/scsi/impl/spc3_types.h> +#include <sys/devops.h> +#include <sys/ddi.h> +#include <sys/sunddi.h> +#include <sys/sdt.h> +#include <sys/policy.h> +#include <sys/ctype.h> + +#if !defined(_LITTLE_ENDIAN) || !defined(_BIT_FIELDS_LTOH) +/* + * This driver contains a number of multi-byte bit fields and other structs + * that are only correct on a system with the same ordering as x86. + */ +#error "smrt: driver works only on little endian systems" +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Some structures are statically sized based on the expected number of logical + * drives and controllers in the system. These definitions are used throughout + * other driver-specific header files, and must appear prior to their + * inclusion. 
+ */ +#define SMRT_MAX_LOGDRV 64 /* Maximum number of logical drives */ +#define SMRT_MAX_PHYSDEV 128 /* Maximum number of physical devices */ + +#include <sys/scsi/adapters/smrt/smrt_ciss.h> +#include <sys/scsi/adapters/smrt/smrt_scsi.h> + +#ifdef __cplusplus +extern "C" { +#endif + +extern ddi_device_acc_attr_t smrt_dev_attributes; + +typedef enum smrt_init_level { + SMRT_INITLEVEL_BASIC = (0x1 << 0), + SMRT_INITLEVEL_I2O_MAPPED = (0x1 << 1), + SMRT_INITLEVEL_CFGTBL_MAPPED = (0x1 << 2), + SMRT_INITLEVEL_PERIODIC = (0x1 << 3), + SMRT_INITLEVEL_INT_ALLOC = (0x1 << 4), + SMRT_INITLEVEL_INT_ADDED = (0x1 << 5), + SMRT_INITLEVEL_INT_ENABLED = (0x1 << 6), + SMRT_INITLEVEL_SCSA = (0x1 << 7), + SMRT_INITLEVEL_MUTEX = (0x1 << 8), + SMRT_INITLEVEL_TASKQ = (0x1 << 9), + SMRT_INITLEVEL_ASYNC_EVENT = (0x1 << 10), +} smrt_init_level_t; + +/* + * Commands issued to the controller carry a (generally 32-bit, though with + * two reserved signalling bits) identifying tag number. In order to avoid + * having the controller confuse us by double-reporting the completion of a + * particular tag, we try to reuse them as infrequently as possible. In + * practice, this means looping through a range of values. The minimum and + * maximum value are defined below. A single command tag value is set aside + * for polled commands sent prior to full initialisation of the driver. + */ +#define SMRT_PRE_TAG_NUMBER 0x00000bad +#define SMRT_MIN_TAG_NUMBER 0x00001000 +#define SMRT_MAX_TAG_NUMBER 0x0fffffff + +/* + * Character strings that represent the names of the iports used for both + * physical and virtual volumes. + */ +#define SMRT_IPORT_PHYS "p0" +#define SMRT_IPORT_VIRT "v0" + +/* + * Definitions to support waiting for the controller to converge on a + * particular state: ready or not ready. These are used with + * smrt_ctlr_wait_for_state(). 
+ */ +#define SMRT_WAIT_DELAY_SECONDS 120 +typedef enum smrt_wait_state { + SMRT_WAIT_STATE_READY = 1, + SMRT_WAIT_STATE_UNREADY +} smrt_wait_state_t; + +typedef enum smrt_ctlr_mode { + SMRT_CTLR_MODE_UNKNOWN = 0, + SMRT_CTLR_MODE_SIMPLE +} smrt_ctlr_mode_t; + +/* + * In addition to Logical Volumes, we also expose the controller at a + * pseudo target address on the SCSI bus we are essentially pretending to be. + */ +#define SMRT_CONTROLLER_TARGET 128 + +/* + * When waiting for volume discovery to complete, we wait for a maximum + * duration (in seconds) before giving up. + */ +#define SMRT_DISCOVER_TIMEOUT 30 + +/* + * The maintenance routine which checks for controller lockup and aborts + * commands that have passed their timeout runs periodically. The time is + * expressed in seconds. + */ +#define SMRT_PERIODIC_RATE 5 + +/* + * At times, we need to check if the controller is still responding. To do + * that, we send a Nop message to the controller and make sure it completes + * successfully. So that we don't wait forever, we set a timeout (in seconds). + */ +#define SMRT_PING_CHECK_TIMEOUT 60 + +/* + * When detaching the device, we may need to have an asynchronous event + * cancellation be issued. While this should be relatively smooth, we don't + * want to wait forever for it. As such we set a timeout in seconds. + */ +#define SMRT_ASYNC_CANCEL_TIMEOUT 60 + +/* + * HP PCI vendor ID and Generation 9 device ID. Used to identify generations of + * supported controllers. + */ +#define SMRT_VENDOR_HP 0x103c +#define SMRT_DEVICE_GEN9 0x3238 + +typedef enum smrt_controller_status { + /* + * An attempt is being made to detach the controller instance. + */ + SMRT_CTLR_STATUS_DETACHING = (0x1 << 0), + + /* + * The controller is believed to be functioning correctly. The driver + * is to allow command submission, process interrupts, and perform + * periodic background maintenance. 
+ */ + SMRT_CTLR_STATUS_RUNNING = (0x1 << 1), + + /* + * The controller is currently being reset. + */ + SMRT_CTLR_STATUS_RESETTING = (0x1 << 2), + + /* + * Our async event notification command is currently in need of help + * from the broader driver. This will be set by smrt_event_complete() + * to indicate that the command is not being processed due to a + * controller reset or because another fatal error occurred. The + * periodic will have to pick up and recover this for us. It is only + * safe for the driver to manipulate the event command outside of + * smrt_event_complete() if this flag is set. + */ + SMRT_CTLR_ASYNC_INTERVENTION = (0x1 << 3), + + /* + * See the theory statement on discovery and resets in smrt_ciss.c for + * an explanation of these values. + */ + SMRT_CTLR_DISCOVERY_REQUESTED = (0x1 << 4), + SMRT_CTLR_DISCOVERY_RUNNING = (0x1 << 5), + SMRT_CTLR_DISCOVERY_PERIODIC = (0x1 << 6), + SMRT_CTLR_DISCOVERY_REQUIRED = (0x1 << 7), +} smrt_controller_status_t; + +#define SMRT_CTLR_DISCOVERY_MASK (SMRT_CTLR_DISCOVERY_REQUESTED | \ + SMRT_CTLR_DISCOVERY_RUNNING | SMRT_CTLR_DISCOVERY_PERIODIC) + +typedef struct smrt_stats { + uint64_t smrts_tran_aborts; + uint64_t smrts_tran_resets; + uint64_t smrts_tran_starts; + uint64_t smrts_ctlr_resets; + unsigned smrts_max_inflight; + uint64_t smrts_unclaimed_interrupts; + uint64_t smrts_claimed_interrupts; + uint64_t smrts_ignored_scsi_cmds; + uint64_t smrts_events_received; + uint64_t smrts_events_errors; + uint64_t smrts_events_intervened; + uint64_t smrts_discovery_tq_errors; +} smrt_stats_t; + +typedef struct smrt_versions { + uint8_t smrtv_hardware_version; + + /* + * These strings must be large enough to hold the 4 byte version string + * retrieved from an IDENTIFY CONTROLLER response, as well as the + * terminating NUL byte: + */ + char smrtv_firmware_rev[5]; + char smrtv_recovery_rev[5]; + char smrtv_bootblock_rev[5]; +} smrt_versions_t; + +typedef struct smrt smrt_t; +typedef struct smrt_command 
smrt_command_t; +typedef struct smrt_command_internal smrt_command_internal_t; +typedef struct smrt_command_scsa smrt_command_scsa_t; +typedef struct smrt_pkt smrt_pkt_t; + +/* + * Per-Controller Structure + */ +struct smrt { + dev_info_t *smrt_dip; + int smrt_instance; + smrt_controller_status_t smrt_status; + smrt_stats_t smrt_stats; + + /* + * Controller configuration discovered during initialisation. + */ + uint32_t smrt_host_support; + uint32_t smrt_bus_support; + uint32_t smrt_maxcmds; + uint32_t smrt_sg_cnt; + smrt_versions_t smrt_versions; + uint16_t smrt_pci_vendor; + uint16_t smrt_pci_device; + + /* + * iport specific data + */ + dev_info_t *smrt_virt_iport; + dev_info_t *smrt_phys_iport; + scsi_hba_tgtmap_t *smrt_virt_tgtmap; + scsi_hba_tgtmap_t *smrt_phys_tgtmap; + + /* + * The transport mode of the controller. + */ + smrt_ctlr_mode_t smrt_ctlr_mode; + + /* + * The current initialisation level of the driver. Bits in this field + * are set during initialisation and unset during cleanup of the + * allocated resources. + */ + smrt_init_level_t smrt_init_level; + + /* + * Essentially everything is protected by "smrt_mutex". When the + * completion queue is updated, threads sleeping on "smrt_cv_finishq" + * are awoken. + */ + kmutex_t smrt_mutex; + kcondvar_t smrt_cv_finishq; + + /* + * List of enumerated logical volumes (smrt_volume_t). + */ + list_t smrt_volumes; + + /* + * List of enumerated physical devices (smrt_physical_t). + */ + list_t smrt_physicals; + + /* + * List of attached SCSA target drivers (smrt_target_t). + */ + list_t smrt_targets; + + /* + * Controller Heartbeat Tracking + */ + uint32_t smrt_last_heartbeat; + hrtime_t smrt_last_heartbeat_time; + + hrtime_t smrt_last_interrupt_claimed; + hrtime_t smrt_last_interrupt_unclaimed; + hrtime_t smrt_last_reset_start; + hrtime_t smrt_last_reset_finish; + + /* + * Command object tracking. These lists, and all commands within the + * lists, are protected by "smrt_mutex". 
+ */ + uint32_t smrt_next_tag; + avl_tree_t smrt_inflight; + list_t smrt_commands; /* List of all commands. */ + list_t smrt_finishq; /* List of completed commands. */ + list_t smrt_abortq; /* List of commands to abort. */ + + /* + * Discovery coordination + */ + ddi_taskq_t *smrt_discover_taskq; + hrtime_t smrt_last_phys_discovery; + hrtime_t smrt_last_log_discovery; + uint64_t smrt_discover_gen; + + /* + * Controller interrupt handler registration. + */ + int smrt_interrupt_type; + int smrt_interrupt_cap; + uint_t smrt_interrupt_pri; + ddi_intr_handle_t smrt_interrupts[1]; + int smrt_ninterrupts; + + ddi_periodic_t smrt_periodic; + + scsi_hba_tran_t *smrt_hba_tran; + + ddi_dma_attr_t smrt_dma_attr; + + /* + * Access to the I2O Registers: + */ + unsigned smrt_i2o_bar; + caddr_t smrt_i2o_space; + ddi_acc_handle_t smrt_i2o_handle; + + /* + * Access to the Configuration Table: + */ + unsigned smrt_ct_bar; + uint32_t smrt_ct_baseaddr; + CfgTable_t *smrt_ct; + ddi_acc_handle_t smrt_ct_handle; + + /* + * Asynchronous Event State + */ + uint32_t smrt_event_count; + smrt_command_t *smrt_event_cmd; + smrt_command_t *smrt_event_cancel_cmd; + kcondvar_t smrt_event_queue; +}; + +/* + * Logical Volume Structure + */ +typedef enum smrt_volume_flags { + SMRT_VOL_FLAG_WWN = (0x1 << 0), +} smrt_volume_flags_t; + +typedef struct smrt_volume { + LUNAddr_t smlv_addr; + smrt_volume_flags_t smlv_flags; + + uint8_t smlv_wwn[16]; + uint64_t smlv_gen; + + smrt_t *smlv_ctlr; + list_node_t smlv_link; + + /* + * List of SCSA targets currently attached to this Logical Volume: + */ + list_t smlv_targets; +} smrt_volume_t; + +typedef struct smrt_physical { + LUNAddr_t smpt_addr; + uint64_t smpt_wwn; + uint8_t smpt_dtype; + uint16_t smpt_bmic; + uint64_t smpt_gen; + boolean_t smpt_supported; + boolean_t smpt_visible; + boolean_t smpt_unsup_warn; + list_node_t smpt_link; + list_t smpt_targets; + smrt_t *smpt_ctlr; + smrt_identify_physical_drive_t *smpt_info; +} smrt_physical_t; + +/* + * 
Per-Target Structure + */ +typedef struct smrt_target { + struct scsi_device *smtg_scsi_dev; + + boolean_t smtg_physical; + + /* + * This is only used when performing discovery during panic, as we need + * a mechanism to determine if the set of drives has shifted. + */ + boolean_t smtg_gone; + + /* + * Linkage back to the device that this target represents. This may be + * either a smrt_volume_t or a smrt_physical_t. We keep a pointer to the + * address, as that's the one thing we generally care about. + */ + union { + smrt_physical_t *smtg_phys; + smrt_volume_t *smtg_vol; + } smtg_lun; + list_node_t smtg_link_lun; + LUNAddr_t *smtg_addr; + + /* + * Linkage back to the controller: + */ + smrt_t *smtg_ctlr; + list_node_t smtg_link_ctlr; +} smrt_target_t; + +/* + * DMA Resource Tracking Structure + */ +typedef enum smrt_dma_level { + SMRT_DMALEVEL_HANDLE_ALLOC = (0x1 << 0), + SMRT_DMALEVEL_MEMORY_ALLOC = (0x1 << 1), + SMRT_DMALEVEL_HANDLE_BOUND = (0x1 << 2), +} smrt_dma_level_t; + +typedef struct smrt_dma { + smrt_dma_level_t smdma_level; + size_t smdma_real_size; + ddi_dma_handle_t smdma_dma_handle; + ddi_acc_handle_t smdma_acc_handle; + ddi_dma_cookie_t smdma_dma_cookies[1]; + uint_t smdma_dma_ncookies; +} smrt_dma_t; + + +typedef enum smrt_command_status { + /* + * When a command is submitted to the controller, it is marked USED + * to avoid accidental reuse of the command without reinitialising + * critical fields. The submitted command is also marked INFLIGHT + * to reflect its inclusion in the "smrt_inflight" AVL tree. When + * the command is completed by the controller, INFLIGHT is unset. + */ + SMRT_CMD_STATUS_USED = (0x1 << 0), + SMRT_CMD_STATUS_INFLIGHT = (0x1 << 1), + + /* + * This flag is set during abort queue processing to record that this + * command was aborted in response to an expired timeout, and not some + * other cancellation. 
If the controller is able to abort the command, + * we use this flag to let the SCSI framework know that the command + * timed out. + */ + SMRT_CMD_STATUS_TIMEOUT = (0x1 << 2), + + /* + * The controller set the error bit when completing this command. + * Details of the particular fault may be read from the error + * information written by the controller. + */ + SMRT_CMD_STATUS_ERROR = (0x1 << 3), + + /* + * This command has been abandoned by the original submitter. This + * could happen if the command did not complete in a timely fashion. + * When it reaches the finish queue it will be freed without further + * processing. + */ + SMRT_CMD_STATUS_ABANDONED = (0x1 << 4), + + /* + * This command has made it through the completion queue and had final + * processing performed. + */ + SMRT_CMD_STATUS_COMPLETE = (0x1 << 5), + + /* + * A polled message will be ignored by the regular processing of the + * completion queue. The blocking function doing the polling is + * responsible for watching the command on which it has set the POLLED + * flag. Regular completion queue processing (which might happen in + * the polling function, or it might happen in the interrupt handler) + * will set POLL_COMPLETE once it is out of the finish queue + * altogether. + */ + SMRT_CMD_STATUS_POLLED = (0x1 << 6), + SMRT_CMD_STATUS_POLL_COMPLETE = (0x1 << 7), + + /* + * An abort message has been sent to the controller in an attempt to + * cancel this command. + */ + SMRT_CMD_STATUS_ABORT_SENT = (0x1 << 8), + + /* + * This command has been passed to our tran_start(9E) handler. + */ + SMRT_CMD_STATUS_TRAN_START = (0x1 << 9), + + /* + * This command was for a SCSI command that we are explicitly avoiding + * sending to the controller. + */ + SMRT_CMD_STATUS_TRAN_IGNORED = (0x1 << 10), + + /* + * This command has been submitted once, and subsequently passed to + * smrt_command_reuse(). 
+ */ + SMRT_CMD_STATUS_REUSED = (0x1 << 11), + + /* + * A controller reset has been issued, so a response for this command + * is not expected. If one arrives before the controller reset has + * taken effect, it likely cannot be trusted. + */ + SMRT_CMD_STATUS_RESET_SENT = (0x1 << 12), + + /* + * Certain commands related to discovery and pinging need to be run + * during the context after a reset has occurred, but before the + * controller is considered running. Such commands can use this flag to + * bypass the normal smrt_submit() check. + */ + SMRT_CMD_IGNORE_RUNNING = (0x1 << 13), +} smrt_command_status_t; + +typedef enum smrt_command_type { + SMRT_CMDTYPE_INTERNAL = 1, + SMRT_CMDTYPE_EVENT, + SMRT_CMDTYPE_ABORTQ, + SMRT_CMDTYPE_SCSA, + SMRT_CMDTYPE_PREINIT, +} smrt_command_type_t; + +struct smrt_command { + uint32_t smcm_tag; + smrt_command_type_t smcm_type; + smrt_command_status_t smcm_status; + + smrt_t *smcm_ctlr; + smrt_target_t *smcm_target; + + list_node_t smcm_link; /* Linkage for allocated list. */ + list_node_t smcm_link_finish; /* Linkage for completion list. */ + list_node_t smcm_link_abort; /* Linkage for abort list. */ + avl_node_t smcm_node; /* Inflight AVL membership. */ + + hrtime_t smcm_time_submit; + hrtime_t smcm_time_complete; + + hrtime_t smcm_expiry; + + /* + * The time at which an abort message was sent to try and terminate + * this command, as well as the tag of the abort message itself: + */ + hrtime_t smcm_abort_time; + uint32_t smcm_abort_tag; + + /* + * Ancillary data objects. Only one of these will be allocated for any + * given command, but we nonetheless resist the temptation to use a + * union of pointers in order to make incorrect usage obvious. + */ + smrt_command_scsa_t *smcm_scsa; + smrt_command_internal_t *smcm_internal; + + /* + * Physical allocation tracking for the actual command to send to the + * controller.
+ */ + smrt_dma_t smcm_contig; + + CommandList_t *smcm_va_cmd; + uint32_t smcm_pa_cmd; + + ErrorInfo_t *smcm_va_err; + uint32_t smcm_pa_err; +}; + +/* + * Commands issued internally to the driver (as opposed to by the HBA + * framework) generally require a buffer in which to assemble the command body, + * and for receiving the response from the controller. The following object + * tracks this (optional) extra buffer. + */ +struct smrt_command_internal { + smrt_dma_t smcmi_contig; + + void *smcmi_va; + uint32_t smcmi_pa; + size_t smcmi_len; +}; + +/* + * Commands issued via the SCSI framework have a number of additional + * properties. + */ +struct smrt_command_scsa { + struct scsi_pkt *smcms_pkt; + smrt_command_t *smcms_command; +}; + + +/* + * CISS transport routines. + */ +void smrt_periodic(void *); +void smrt_lockup_check(smrt_t *); +int smrt_submit(smrt_t *, smrt_command_t *); +void smrt_submit_simple(smrt_t *, smrt_command_t *); +int smrt_retrieve(smrt_t *); +void smrt_retrieve_simple(smrt_t *); +int smrt_poll_for(smrt_t *, smrt_command_t *); +int smrt_preinit_command_simple(smrt_t *, smrt_command_t *); + +/* + * Interrupt service routines. + */ +int smrt_interrupts_setup(smrt_t *); +int smrt_interrupts_enable(smrt_t *); +void smrt_interrupts_teardown(smrt_t *); +uint32_t smrt_isr_hw_simple(caddr_t, caddr_t); + +/* + * Interrupt enable/disable routines. + */ +void smrt_intr_set(smrt_t *, boolean_t); + +/* + * Controller initialisation routines. 
+ */ +int smrt_ctlr_init(smrt_t *); +void smrt_ctlr_teardown(smrt_t *); +int smrt_ctlr_reset(smrt_t *); +int smrt_ctlr_wait_for_state(smrt_t *, smrt_wait_state_t); +int smrt_ctlr_init_simple(smrt_t *); +void smrt_ctlr_teardown_simple(smrt_t *); +int smrt_cfgtbl_flush(smrt_t *); +int smrt_cfgtbl_transport_has_support(smrt_t *, int); +void smrt_cfgtbl_transport_set(smrt_t *, int); +int smrt_cfgtbl_transport_confirm(smrt_t *, int); +uint32_t smrt_ctlr_get_cmdsoutmax(smrt_t *); +uint32_t smrt_ctlr_get_maxsgelements(smrt_t *); + +/* + * Device enumeration and lookup routines. + */ +void smrt_discover_request(smrt_t *); + +int smrt_logvol_discover(smrt_t *, uint16_t, uint64_t); +void smrt_logvol_teardown(smrt_t *); +smrt_volume_t *smrt_logvol_lookup_by_id(smrt_t *, unsigned long); +void smrt_logvol_tgtmap_activate(void *, char *, scsi_tgtmap_tgt_type_t, + void **); +boolean_t smrt_logvol_tgtmap_deactivate(void *, char *, scsi_tgtmap_tgt_type_t, + void *, scsi_tgtmap_deact_rsn_t); + +int smrt_phys_discover(smrt_t *, uint16_t, uint64_t); +smrt_physical_t *smrt_phys_lookup_by_ua(smrt_t *, const char *); +void smrt_phys_teardown(smrt_t *); +void smrt_phys_tgtmap_activate(void *, char *, scsi_tgtmap_tgt_type_t, + void **); +boolean_t smrt_phys_tgtmap_deactivate(void *, char *, scsi_tgtmap_tgt_type_t, + void *, scsi_tgtmap_deact_rsn_t); + +/* + * SCSI framework routines. + */ +int smrt_ctrl_hba_setup(smrt_t *); +void smrt_ctrl_hba_teardown(smrt_t *); + +int smrt_logvol_hba_setup(smrt_t *, dev_info_t *); +void smrt_logvol_hba_teardown(smrt_t *, dev_info_t *); +int smrt_phys_hba_setup(smrt_t *, dev_info_t *); +void smrt_phys_hba_teardown(smrt_t *, dev_info_t *); + +void smrt_hba_complete(smrt_command_t *); + +void smrt_process_finishq(smrt_t *); +void smrt_process_abortq(smrt_t *); + +/* + * Command block management. 
+ */ +smrt_command_t *smrt_command_alloc(smrt_t *, smrt_command_type_t, + int); +smrt_command_t *smrt_command_alloc_preinit(smrt_t *, size_t, int); +int smrt_command_attach_internal(smrt_t *, smrt_command_t *, size_t, + int); +void smrt_command_free(smrt_command_t *); +smrt_command_t *smrt_lookup_inflight(smrt_t *, uint32_t); +void smrt_command_reuse(smrt_command_t *); + +/* + * Device message construction routines. + */ +void smrt_write_lun_addr_phys(LUNAddr_t *, boolean_t, unsigned, unsigned); +void smrt_write_controller_lun_addr(LUNAddr_t *); +uint16_t smrt_lun_addr_to_bmic(PhysDevAddr_t *); +void smrt_write_message_abort_one(smrt_command_t *, uint32_t); +void smrt_write_message_abort_all(smrt_command_t *, LUNAddr_t *); +void smrt_write_message_nop(smrt_command_t *, int); +void smrt_write_message_event_notify(smrt_command_t *); + +/* + * Device management routines. + */ +int smrt_device_setup(smrt_t *); +void smrt_device_teardown(smrt_t *); +uint32_t smrt_get32(smrt_t *, offset_t); +void smrt_put32(smrt_t *, offset_t, uint32_t); + +/* + * SATA related routines. + */ +int smrt_sata_determine_wwn(smrt_t *, PhysDevAddr_t *, uint64_t *, uint16_t); + +/* + * Asynchronous Event Notification + */ +int smrt_event_init(smrt_t *); +void smrt_event_fini(smrt_t *); +void smrt_event_complete(smrt_command_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SMRT_H */ diff --git a/usr/src/uts/common/sys/scsi/adapters/smrt/smrt_ciss.h b/usr/src/uts/common/sys/scsi/adapters/smrt/smrt_ciss.h new file mode 100644 index 0000000000..e1f1db68b3 --- /dev/null +++ b/usr/src/uts/common/sys/scsi/adapters/smrt/smrt_ciss.h @@ -0,0 +1,345 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (C) 2013 Hewlett-Packard Development Company, L.P. + * Copyright (c) 2017, Joyent, Inc. + */ + +#ifndef _SMRT_CISS_H +#define _SMRT_CISS_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Maximum number of Scatter/Gather List entries. These entries are statically + * allocated for all commands. + */ +#define CISS_MAXSGENTRIES 64 + +/* + * If the controller advertises a value of 0 for the maximum S/G list length it + * supports, the specification states that we should assume a value of 31. + */ +#define CISS_SGCNT_FALLBACK 31 + +/* + * The CDB field in the request block is fixed at 16 bytes in length. (See + * "3.2. Request Block" in the CISS specification.) + */ +#define CISS_CDBLEN 16 + +/* + * Command Status Values. These are listed in "Table 2 Command Status" in "3.3 + * Error Info" of the CISS specification. + */ +#define CISS_CMD_SUCCESS 0x00 +#define CISS_CMD_TARGET_STATUS 0x01 +#define CISS_CMD_DATA_UNDERRUN 0x02 +#define CISS_CMD_DATA_OVERRUN 0x03 +#define CISS_CMD_INVALID 0x04 +#define CISS_CMD_PROTOCOL_ERR 0x05 +#define CISS_CMD_HARDWARE_ERR 0x06 +#define CISS_CMD_CONNECTION_LOST 0x07 +#define CISS_CMD_ABORTED 0x08 +#define CISS_CMD_ABORT_FAILED 0x09 +#define CISS_CMD_UNSOLICITED_ABORT 0x0a +#define CISS_CMD_TIMEOUT 0x0b +#define CISS_CMD_UNABORTABLE 0x0c + +/* + * Request Transfer Directions, used in "RequestBlock.Type.Direction": + */ +#define CISS_XFER_NONE 0x00 +#define CISS_XFER_WRITE 0x01 +#define CISS_XFER_READ 0x02 +#define CISS_XFER_RSVD 0x03 + +/* + * Request Attributes, used in "RequestBlock.Type.Attribute": + */ +#define CISS_ATTR_UNTAGGED 0x00 +#define CISS_ATTR_SIMPLE 0x04 +#define CISS_ATTR_HEADOFQUEUE 0x05 +#define CISS_ATTR_ORDERED 0x06 + +/* + * Request Type, used in "RequestBlock.Type.Type": + */ +#define CISS_TYPE_CMD 0x00 +#define CISS_TYPE_MSG 0x01 + +/* + * I2O Space Register Offsets + * + * The name "I2O", 
and these register offsets, appear to be amongst the last + * vestiges of a long-defunct attempt at standardising mainframe-style I/O + * channels in the Intel server space: the Intelligent Input/Output (I2O) + * Architecture Specification. + * + * The draft of version 1.5 of this specification, in section "4.2.1.5.1 + * Extensions for PCI", suggests that the following are memory offsets into + * "the memory region specified by the first base address configuration + * register indicating memory space (offset 10h, 14h, and so forth)". These + * match up with the offsets of the first two BARs in a PCI configuration space + * type 0 header. + * + * The specification also calls out the Inbound Post List FIFO, write-only at + * offset 40h; the Outbound Post List FIFO, read-only at offset 44h; the + * Interrupt Status Register, at offset 30h; and the Interrupt Mask Register, + * at offset 34h. + * + * This ill-fated attempt to increase the proprietary complexity of (and + * presumably, thus, the gross margin on) computer systems is all but extinct. + * The transport layer of this storage controller is all that's left of their + * religion. + */ +#define CISS_I2O_INBOUND_DOORBELL 0x20 +#define CISS_I2O_INTERRUPT_STATUS 0x30 +#define CISS_I2O_INTERRUPT_MASK 0x34 +#define CISS_I2O_INBOUND_POST_Q 0x40 +#define CISS_I2O_OUTBOUND_POST_Q 0x44 +#define CISS_I2O_OUTBOUND_DOORBELL_STATUS 0x9c +#define CISS_I2O_OUTBOUND_DOORBELL_CLEAR 0xa0 +#define CISS_I2O_SCRATCHPAD 0xb0 +#define CISS_I2O_CFGTBL_CFG_OFFSET 0xb4 +#define CISS_I2O_CFGTBL_MEM_OFFSET 0xb8 + +/* + * Rather than make a lot of small mappings for each part of the address + * space we wish to access, we will make one large mapping. If more + * offsets are added to the I2O list above, this space should be extended + * appropriately. + */ +#define CISS_I2O_MAP_BASE 0x20 +#define CISS_I2O_MAP_LIMIT 0x100 + +/* + * The Scratchpad Register (I2O_SCRATCHPAD) is not mentioned in the CISS + * specification. 
It serves at least two known functions: + * - Signalling controller readiness + * - Exposing a debugging code when the controller firmware locks up + */ +#define CISS_SCRATCHPAD_INITIALISED 0xffff0000 + +/* + * Outbound Doorbell Register Values. + * + * These are read from the Outbound Doorbell Set/Status Register + * (CISS_I2O_OUTBOUND_DOORBELL_STATUS), but cleared by writing to the Clear + * Register (CISS_I2O_OUTBOUND_DOORBELL_CLEAR). + */ +#define CISS_ODR_BIT_INTERRUPT (1UL << 0) +#define CISS_ODR_BIT_LOCKUP (1UL << 1) + +/* + * Inbound Doorbell Register Values. + * + * These are written to and read from the Inbound Doorbell Register + * (CISS_I2O_INBOUND_DOORBELL). + */ +#define CISS_IDR_BIT_CFGTBL_CHANGE (1UL << 0) + +/* + * Interrupt Mask Register Values. + * + * These are written to and read from the Interrupt Mask Register + * (CISS_I2O_INTERRUPT_MASK). Note that a 1 bit in this register masks or + * disables the interrupt in question; to enable the interrupt the bit must be + * set to 0. + */ +#define CISS_IMR_BIT_SIMPLE_INTR_DISABLE (1UL << 3) + +/* + * Interrupt Status Register Values. + * + * These are read from the Interrupt Status Register + * (CISS_I2O_INTERRUPT_STATUS). + */ +#define CISS_ISR_BIT_SIMPLE_INTR (1UL << 3) + +/* + * Transport Methods. + * + * These bit positions are used in the Configuration Table to detect controller + * support for a particular method, via "TransportSupport"; to request that the + * controller enable a particular method, via "TransportRequest"; and to detect + * whether the controller has acknowledged the request and enabled the desired + * method, via "TransportActive". + * + * See: "9.1 Configuration Table" in the CISS Specification. 
+ */ +#define CISS_CFGTBL_READY_FOR_COMMANDS (1UL << 0) +#define CISS_CFGTBL_XPORT_SIMPLE (1UL << 1) +#define CISS_CFGTBL_XPORT_PERFORMANT (1UL << 2) +#define CISS_CFGTBL_XPORT_MEMQ (1UL << 4) + +/* + * In the Simple Transport Method, when the appropriate interrupt status bit is + * set (CISS_ISR_BIT_SIMPLE_INTR), the Outbound Post Queue register is + * repeatedly read for notifications of the completion of commands previously + * submitted to the controller. These macros help break up the read value into + * its component fields: the tag number, and whether or not the command + * completed in error. + */ +#define CISS_OPQ_READ_TAG(x) ((x) >> 2) +#define CISS_OPQ_READ_ERROR(x) ((x) & (1UL << 1)) + +/* + * Physical devices that are reported may be marked as 'masked'. A masked device + * is one that the driver can see, but must not perform any I/O to. + */ +#define SMRT_CISS_MODE_MASKED 3 + +/* + * The following packed structures are used to ease the manipulation of + * requests and responses from the controller. 
+ */ +#pragma pack(1) + +typedef struct smrt_tag { + uint32_t reserved:1; + uint32_t error:1; + uint32_t tag_value:30; + uint32_t unused; +} smrt_tag_t; + +typedef union SCSI3Addr { + struct { + uint8_t Dev; + uint8_t Bus:6; + uint8_t Mode:2; + } PeripDev; + struct { + uint8_t DevLSB; + uint8_t DevMSB:6; + uint8_t Mode:2; + } LogDev; + struct { + uint8_t Dev:5; + uint8_t Bus:3; + uint8_t Targ:6; + uint8_t Mode:2; + } LogUnit; +} SCSI3Addr_t; + +typedef struct PhysDevAddr { + uint32_t TargetId:24; + uint32_t Bus:6; + uint32_t Mode:2; + SCSI3Addr_t Target[2]; +} PhysDevAddr_t; + +typedef struct LogDevAddr { + uint32_t VolId:30; + uint32_t Mode:2; + uint8_t reserved[4]; +} LogDevAddr_t; + +typedef union LUNAddr { + uint8_t LunAddrBytes[8]; + SCSI3Addr_t SCSI3Lun[4]; + PhysDevAddr_t PhysDev; + LogDevAddr_t LogDev; +} LUNAddr_t; + +typedef struct CommandListHeader { + uint8_t ReplyQueue; + uint8_t SGList; + uint16_t SGTotal; + smrt_tag_t Tag; + LUNAddr_t LUN; +} CommandListHeader_t; + +typedef struct RequestBlock { + uint8_t CDBLen; + struct { + uint8_t Type:3; + uint8_t Attribute:3; + uint8_t Direction:2; + } Type; + uint16_t Timeout; + uint8_t CDB[CISS_CDBLEN]; +} RequestBlock_t; + +typedef struct ErrDescriptor { + uint64_t Addr; + uint32_t Len; +} ErrDescriptor_t; + +typedef struct SGDescriptor { + uint64_t Addr; + uint32_t Len; + uint32_t Ext; +} SGDescriptor_t; + +typedef struct CommandList { + CommandListHeader_t Header; + RequestBlock_t Request; + ErrDescriptor_t ErrDesc; + SGDescriptor_t SG[CISS_MAXSGENTRIES]; +} CommandList_t; + +typedef union MoreErrInfo { + struct { + uint8_t Reserved[3]; + uint8_t Type; + uint32_t ErrorInfo; + } Common_Info; + struct { + uint8_t Reserved[2]; + uint8_t offense_size; + uint8_t offense_num; + uint32_t offense_value; + } Invalid_Cmd; +} MoreErrInfo_t; + +typedef struct ErrorInfo { + uint8_t ScsiStatus; + uint8_t SenseLen; + uint16_t CommandStatus; + uint32_t ResidualCnt; + MoreErrInfo_t MoreErrInfo; + uint8_t 
SenseInfo[MAX_SENSE_LENGTH]; +} ErrorInfo_t; + +typedef struct CfgTable { + uint8_t Signature[4]; + uint32_t SpecValence; + uint32_t TransportSupport; + uint32_t TransportActive; + uint32_t TransportRequest; + uint32_t Upper32Addr; + uint32_t CoalIntDelay; + uint32_t CoalIntCount; + uint32_t CmdsOutMax; + uint32_t BusTypes; + uint32_t TransportMethodOffset; + uint8_t ServerName[16]; + uint32_t HeartBeat; + uint32_t HostDrvrSupport; + uint32_t MaxSGElements; + uint32_t MaxLunSupport; + uint32_t MaxPhyDevSupport; + uint32_t MaxPhyDrvPerLun; + uint32_t MaxPerfModeCmdsOutMax; + uint32_t MaxBlockFetchCount; +} CfgTable_t; + +#pragma pack() + +#ifdef __cplusplus +} +#endif + +#endif /* _SMRT_CISS_H */ diff --git a/usr/src/uts/common/sys/scsi/adapters/smrt/smrt_scsi.h b/usr/src/uts/common/sys/scsi/adapters/smrt/smrt_scsi.h new file mode 100644 index 0000000000..47ef99b2e0 --- /dev/null +++ b/usr/src/uts/common/sys/scsi/adapters/smrt/smrt_scsi.h @@ -0,0 +1,371 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (C) 2013 Hewlett-Packard Development Company, L.P. + * Copyright (c) 2017 Joyent, Inc. + */ + +#ifndef _SMRT_SCSI_H +#define _SMRT_SCSI_H + +#include <sys/types.h> + +#include <sys/scsi/adapters/smrt/smrt_ciss.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* CISS LUN Addressing MODEs */ +#define PERIPHERIAL_DEV_ADDR 0x0 +#define LOGICAL_VOL_ADDR 0x1 +#define MASK_PERIPHERIAL_DEV_ADDR 0x3 +#define CISS_PHYS_MODE 0x0 + +/* + * Vendor-specific SCSI Commands + * + * These command opcodes are for use in the opcode byte of the CDB in a request + * of type CISS_TYPE_CMD. 
They are custom SCSI commands, using the + * vendor-specific part of the opcode space; i.e., 0xC0 through 0xFF. + */ +#define CISS_SCMD_READ 0xC0 +#define CISS_SCMD_WRITE 0xC1 +#define CISS_SCMD_REPORT_LOGICAL_LUNS 0xC2 +#define CISS_SCMD_REPORT_PHYSICAL_LUNS 0xC3 + +/* + * These command opcodes are _not_ in the usual vendor-specific space, but are + * nonetheless vendor-specific. They allow BMIC commands to be written to and + * read from the controller. If a command transfers no data, the specification + * suggests that BMIC_WRITE (0x27) is appropriate. + */ +#define CISS_SCMD_BMIC_READ 0x26 +#define CISS_SCMD_BMIC_WRITE 0x27 + +/* + * CISS Messages + * + * The CISS specification describes several directives that do not behave like + * SCSI commands. They are sent in requests of type CISS_TYPE_MSG. + * + * The Abort, Reset, and Nop, messages are defined in "8. Messages" in the CISS + * Specification. + */ +#define CISS_MSG_ABORT 0x0 +#define CISS_ABORT_TASK 0x0 +#define CISS_ABORT_TASKSET 0x1 + +#define CISS_MSG_RESET 0x1 +#define CISS_RESET_CTLR 0x0 +#define CISS_RESET_BUS 0x1 +#define CISS_RESET_TGT 0x3 +#define CISS_RESET_LUN 0x4 + +#define CISS_MSG_NOP 0x3 + +/* + * BMIC Commands + * + * These commands allow for the use of non-standard facilities specific to the + * Smart Array firmware. They are sent to the controller through a specially + * constructed CDB with the CISS_SCMD_BMIC_READ or CISS_SCMD_BMIC_WRITE opcode. + */ +#define CISS_BMIC_IDENTIFY_CONTROLLER 0x11 +#define CISS_BMIC_IDENTIFY_PHYSICAL_DEVICE 0x15 +#define CISS_BMIC_NOTIFY_ON_EVENT 0xD0 +#define CISS_BMIC_NOTIFY_ON_EVENT_CANCEL 0xD1 + +/* + * Device and Phy type codes. These are used across many commands, including + * IDENTIFY PHYSICAL DEVICE and the REPORT PHYSICAL LUN extended reporting. 
+ */ +#define SMRT_DTYPE_PSCSI 0x00 +#define SMRT_DTYPE_SATA 0x01 +#define SMRT_DTYPE_SAS 0x02 +#define SMRT_DTYPE_SATA_BW 0x03 +#define SMRT_DTYPE_SAS_BW 0x04 +#define SMRT_DTYPE_EXPANDER 0x05 +#define SMRT_DTYPE_SES 0x06 +#define SMRT_DTYPE_CONTROLLER 0x07 +#define SMRT_DTYPE_SGPIO 0x08 +#define SMRT_DTYPE_NVME 0x09 +#define SMRT_DTYPE_NOPHY 0xFF + +/* + * The following packed structures are used to ease the manipulation of SCSI + * and BMIC commands sent to, and status information returned from, the + * controller. + */ +#pragma pack(1) + +typedef struct smrt_report_logical_lun_ent { + LogDevAddr_t smrle_addr; +} smrt_report_logical_lun_ent_t; + +typedef struct smrt_report_logical_lun_extent { + LogDevAddr_t smrle_addr; + uint8_t smrle_wwn[16]; +} smrt_report_logical_lun_extent_t; + +typedef struct smrt_report_logical_lun { + uint32_t smrll_datasize; /* Big Endian */ + uint8_t smrll_extflag; + uint8_t smrll_reserved1[3]; + union { + smrt_report_logical_lun_ent_t ents[SMRT_MAX_LOGDRV]; + smrt_report_logical_lun_extent_t extents[SMRT_MAX_LOGDRV]; + } smrll_data; +} smrt_report_logical_lun_t; + +typedef struct smrt_report_logical_lun_req { + uint8_t smrllr_opcode; + uint8_t smrllr_extflag; + uint8_t smrllr_reserved1[4]; + uint32_t smrllr_datasize; /* Big Endian */ + uint8_t smrllr_reserved2; + uint8_t smrllr_control; +} smrt_report_logical_lun_req_t; + +typedef struct smrt_report_physical_lun_ent { + PhysDevAddr_t srple_addr; +} smrt_report_physical_lun_ent_t; + +/* + * This structure represents the 'physical node identifier' extended option for + * REPORT PHYSICAL LUNS. This is triggered when the extended flags is set to + * 0x1. Note that for SAS the other structure should always be used. + */ +typedef struct smrt_report_physical_pnid { + uint8_t srpp_node[8]; + uint8_t srpp_port[8]; +} smrt_report_physical_pnid_t; + +/* + * This structure represents the 'other physical device info' extended option + * for report physical luns. 
This is triggered when the extended flags is set + * to 0x2. + */ +typedef struct smrt_report_physical_opdi { + uint8_t srpo_wwid[8]; + uint8_t srpo_dtype; + uint8_t srpo_flags; + uint8_t srpo_multilun; + uint8_t srpo_paths; + uint32_t srpo_iohdl; +} smrt_report_physical_opdi_t; + +typedef struct smrt_report_physical_lun_extent { + PhysDevAddr_t srple_addr; + union { + smrt_report_physical_pnid_t srple_pnid; + smrt_report_physical_opdi_t srple_opdi; + } srple_extdata; +} smrt_report_physical_lun_extent_t; + +/* + * Values that can be ORed together into smrplr_extflag. smrpl_extflag indicates + * if any extended processing was done or not. + */ +#define SMRT_REPORT_PHYSICAL_LUN_EXT_NONE 0x00 +#define SMRT_REPORT_PHYSICAL_LUN_EXT_PNID 0x01 +#define SMRT_REPORT_PHYSICAL_LUN_EXT_OPDI 0x02 +#define SMRT_REPORT_PHYSICAL_LUN_EXT_MASK 0x0f +#define SMRT_REPORT_PHYSICAL_LUN_CTRL_ONLY (1 << 6) +#define SMRT_REPORT_PHYSICAL_LUN_ALL_PATHS (1 << 7) + +typedef struct smrt_report_physical_lun { + uint32_t smrpl_datasize; /* Big Endian */ + uint8_t smrpl_extflag; + uint8_t smrpl_reserved1[3]; + union { + smrt_report_physical_lun_ent_t ents[SMRT_MAX_PHYSDEV]; + smrt_report_physical_lun_extent_t extents[SMRT_MAX_PHYSDEV]; + } smrpl_data; +} smrt_report_physical_lun_t; + +/* + * Request structure (CDB) for REPORT PHYSICAL LUNS. As in + * smrt_report_logical_lun_req_t, four reserved bytes follow the extended + * flag so that, under pack(1), smrplr_datasize sits at byte offset 6 and the + * whole request is 12 bytes long. + */ +typedef struct smrt_report_physical_lun_req { + uint8_t smrplr_opcode; + uint8_t smrplr_extflag; + uint8_t smrplr_reserved[4]; + uint32_t smrplr_datasize; /* Big Endian */ + uint8_t smrplr_reserved2; + uint8_t smrplr_control; +} smrt_report_physical_lun_req_t; + +/* + * Request structure for the BMIC command IDENTIFY CONTROLLER. This structure + * is written into the CDB with the CISS_SCMD_BMIC_READ SCSI opcode. Reserved + * fields should be filled with zeroes.
+ */ +typedef struct smrt_identify_controller_req { + uint8_t smicr_opcode; + uint8_t smicr_lun; + uint8_t smicr_reserved1[4]; + uint8_t smicr_command; + uint8_t smicr_reserved2[2]; + uint8_t smicr_reserved3[1]; + uint8_t smicr_reserved4[6]; +} smrt_identify_controller_req_t; + +/* + * Response structure for IDENTIFY CONTROLLER. This structure is used to + * interpret the response the controller will write into the data buffer. + */ +typedef struct smrt_identify_controller { + uint8_t smic_logical_drive_count; + uint32_t smic_config_signature; + uint8_t smic_firmware_rev[4]; + uint8_t smic_recovery_rev[4]; + uint8_t smic_hardware_version; + uint8_t smic_bootblock_rev[4]; + + /* + * These are obsolete for SAS controllers: + */ + uint32_t smic_drive_present_map; + uint32_t smic_external_drive_map; + + uint32_t smic_board_id; +} smrt_identify_controller_t; + +/* + * Request structure for IDENTIFY PHYSICAL DEVICE. This structure is written + * into the CDB with the CISS_SCMD_BMIC_READ SCSI opcode. Reserved fields + * should be filled with zeroes. Note, the lower 8 bits of the BMIC ID are in + * index1, whereas the upper 8 bits are in index2; however, the controller may + * only support 8 bits worth of devices (and this driver does not support that + * many devices). + */ +typedef struct smrt_identify_physical_drive_req { + uint8_t sipdr_opcode; + uint8_t sipdr_lun; + uint8_t sipdr_bmic_index1; + uint8_t sipdr_reserved1[3]; + uint8_t sipdr_command; + uint8_t sipdr_reserved2[2]; + uint8_t sipdr_bmic_index2; + uint8_t sipdr_reserved4[6]; +} smrt_identify_physical_drive_req_t; + +/* + * Relevant values for the sipd_more_flags member. + */ +#define SMRT_MORE_FLAGS_LOGVOL (1 << 5) +#define SMRT_MORE_FLAGS_SPARE (1 << 6) + +/* + * Response structure for IDENTIFY PHYSICAL DEVICE. This structure is used to + * describe aspects of a physical drive. Note, not all fields are valid in all + * firmware revisions.
+ */ +typedef struct smrt_identify_physical_drive { + uint8_t sipd_scsi_bus; /* Invalid for SAS */ + uint8_t sipd_scsi_id; /* Invalid for SAS */ + uint16_t sipd_lblk_size; + uint32_t sipd_nblocks; + uint32_t sipd_rsrvd_blocsk; + uint8_t sipd_model[40]; + uint8_t sipd_serial[40]; + uint8_t sipd_firmware[8]; + uint8_t sipd_scsi_inquiry; + uint8_t sipd_compaq_stamp; + uint8_t sipd_last_failure; + uint8_t sipd_flags; + uint8_t sipd_more_flags; + uint8_t sipd_scsi_lun; /* Invalid for SAS */ + uint8_t sipd_yet_more_flags; + uint8_t sipd_even_more_flags; + uint32_t sipd_spi_speed_rules; + uint8_t sipd_phys_connector[2]; + uint8_t sipd_phys_box_on_bus; + uint8_t sipd_phys_bay_in_box; + uint32_t sipd_rpm; + uint8_t sipd_device_type; + uint8_t sipd_sata_version; + uint64_t sipd_big_nblocks; + uint64_t sipd_ris_slba; + uint32_t sipd_ris_size; + uint8_t sipd_wwid[20]; + uint8_t sipd_controller_phy_map[32]; + uint16_t sipd_phy_count; + uint8_t sipd_phy_connected_dev_type[256]; + uint8_t sipd_phy_to_drive_bay[256]; + uint16_t sipd_phy_to_attached_dev[256]; + uint8_t sipd_box_index; + uint8_t sipd_drive_support; + uint16_t sipd_extra_flags; + uint8_t sipd_neogiated_link_rate[256]; + uint8_t sipd_phy_to_phy_map[256]; + uint8_t sipd_pad[312]; +} smrt_identify_physical_drive_t; + +/* + * Note that this structure describes the CISS version of the command. There + * also exists a BMIC version, but it has a slightly different structure. This + * structure is also used for the cancellation request; however, in that case, + * the senr_flags field is reserved. + */ +typedef struct smrt_event_notify_req { + uint8_t senr_opcode; + uint8_t senr_subcode; + uint8_t senr_reserved1[2]; + uint32_t senr_flags; /* Big Endian */ + uint32_t senr_size; /* Big Endian */ + uint8_t senr_control; +} smrt_event_notify_req_t; + +/* + * When receiving event notifications, the buffer size must be 512 bytes large. 
+ * We make sure that we always allocate a buffer of this size, even though we + * define a structure that is much shorter and only uses the fields that we end + * up caring about. This size requirement comes from the specification. + */ +#define SMRT_EVENT_NOTIFY_BUFLEN 512 + +#define SMRT_EVENT_CLASS_PROTOCOL 0 +#define SMRT_EVENT_PROTOCOL_SUBCLASS_ERROR 1 + +#define SMRT_EVENT_CLASS_HOTPLUG 1 +#define SMRT_EVENT_HOTPLUG_SUBCLASS_DRIVE 0 + +#define SMRT_EVENT_CLASS_HWERROR 2 +#define SMRT_EVENT_CLASS_ENVIRONMENT 3 + +#define SMRT_EVENT_CLASS_PHYS 4 +#define SMRT_EVENT_PHYS_SUBCLASS_STATE 0 + +#define SMRT_EVENT_CLASS_LOGVOL 5 + +typedef struct smrt_event_notify { + uint32_t sen_timestamp; + uint16_t sen_class; + uint16_t sen_subclass; + uint16_t sen_detail; + uint8_t sen_data[64]; + char sen_message[80]; + uint32_t sen_tag; + uint16_t sen_date; + uint16_t sen_year; + uint32_t sen_time; + uint16_t sen_pre_power_time; + LUNAddr_t sen_addr; +} smrt_event_notify_t; + +#pragma pack() + +#ifdef __cplusplus +} +#endif + +#endif /* _SMRT_SCSI_H */ diff --git a/usr/src/uts/common/sys/scsi/generic/inquiry.h b/usr/src/uts/common/sys/scsi/generic/inquiry.h index ddfd683169..fcbf00d5dc 100644 --- a/usr/src/uts/common/sys/scsi/generic/inquiry.h +++ b/usr/src/uts/common/sys/scsi/generic/inquiry.h @@ -362,7 +362,8 @@ struct scsi_inquiry { #define DTYPE_NOTPRESENT (DPQ_NEVER | DTYPE_UNKNOWN) /* - * Defined Response Data Formats: + * Defined Versions for inquiry data. These represent the base version that a + * device supports. 
*/ #define RDF_LEVEL0 0x00 /* no conformance claim (SCSI-1) */ #define RDF_CCS 0x01 /* Obsolete (pseudo-spec) */ @@ -370,7 +371,8 @@ struct scsi_inquiry { #define RDF_SCSI_SPC 0x03 /* ANSI INCITS 301-1997 (SPC) */ #define RDF_SCSI_SPC2 0x04 /* ANSI INCITS 351-2001 (SPC-2) */ #define RDF_SCSI_SPC3 0x05 /* ANSI INCITS 408-2005 (SPC-3) */ -#define RDF_SCSI_SPC4 0x06 /* t10 (SPC-4) */ +#define RDF_SCSI_SPC4 0x06 /* ANSI INCITS 513-2015 (SPC-4) */ +#define RDF_SCSI_SPC5 0x07 /* t10 (SPC-5) */ /* * Defined Target Port Group Select values: @@ -436,6 +438,7 @@ struct vpd_desc { #define PM_CAPABLE_SPC2 RDF_SCSI_SPC2 #define PM_CAPABLE_SPC3 RDF_SCSI_SPC3 #define PM_CAPABLE_SPC4 RDF_SCSI_SPC4 +#define PM_CAPABLE_SPC5 RDF_SCSI_SPC5 #define PM_CAPABLE_LOG_MASK 0xffff0000 /* use upper 16 bit to */ /* indicate log specifics */ #define PM_CAPABLE_LOG_SUPPORTED 0x10000 /* Log page 0xE might be */ diff --git a/usr/src/uts/common/sys/scsi/targets/sddef.h b/usr/src/uts/common/sys/scsi/targets/sddef.h index fe73b1acde..df39257ee6 100644 --- a/usr/src/uts/common/sys/scsi/targets/sddef.h +++ b/usr/src/uts/common/sys/scsi/targets/sddef.h @@ -775,6 +775,12 @@ _NOTE(MUTEX_PROTECTS_DATA(sd_lun::un_fi_mutex, #define SD_FM_LOG(un) (((struct sd_fm_internal *)\ ((un)->un_fm_private))->fm_log_level) +/* + * Version Related Macros + */ +#define SD_SCSI_VERS_IS_GE_SPC_4(un) \ + (SD_INQUIRY(un)->inq_ansi == RDF_SCSI_SPC4 || \ + SD_INQUIRY(un)->inq_ansi == RDF_SCSI_SPC5) /* * Values for un_ctype @@ -1851,6 +1857,10 @@ struct sd_fm_internal { #define SD_PM_CAPABLE_IS_SPC_4(pm_cap) \ ((pm_cap & PM_CAPABLE_PM_MASK) == PM_CAPABLE_SPC4) +#define SD_PM_CAPABLE_IS_GE_SPC_4(pm_cap) \ + (((pm_cap & PM_CAPABLE_PM_MASK) == PM_CAPABLE_SPC4) || \ + ((pm_cap & PM_CAPABLE_PM_MASK) == PM_CAPABLE_SPC5)) + #define SD_PM_CAP_LOG_SUPPORTED(pm_cap) \ ((pm_cap & PM_CAPABLE_LOG_SUPPORTED) ? 
TRUE : FALSE) diff --git a/usr/src/uts/common/sys/shm.h b/usr/src/uts/common/sys/shm.h index 0219fc2cf7..8f530afda2 100644 --- a/usr/src/uts/common/sys/shm.h +++ b/usr/src/uts/common/sys/shm.h @@ -21,6 +21,7 @@ */ /* * Copyright 2014 Garrett D'Amore <garrett@damore.org> + * Copyright 2016 Joyent, Inc. * * Copyright 2003 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. @@ -120,6 +121,10 @@ struct shmid_ds { #define SHM_LOCK 3 /* Lock segment in core */ #define SHM_UNLOCK 4 /* Unlock segment */ +#if defined(_KERNEL) +#define SHM_RMID 5 /* Private RMID for lx support */ +#endif + #if !defined(_KERNEL) int shmget(key_t, size_t, int); int shmids(int *, uint_t, uint_t *); diff --git a/usr/src/uts/common/sys/shm_impl.h b/usr/src/uts/common/sys/shm_impl.h index 4d8cdcede5..1eae2ca0a4 100644 --- a/usr/src/uts/common/sys/shm_impl.h +++ b/usr/src/uts/common/sys/shm_impl.h @@ -21,13 +21,12 @@ /* * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ #ifndef _SYS_SHM_IMPL_H #define _SYS_SHM_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/ipc_impl.h> #if defined(_KERNEL) || defined(_KMEMUSER) #include <sys/shm.h> @@ -70,7 +69,11 @@ typedef struct kshmid { time_t shm_ctime; /* last change time */ struct sptinfo *shm_sptinfo; /* info about ISM segment */ struct seg *shm_sptseg; /* pointer to ISM segment */ - long shm_sptprot; /* was reserved (still a "long") */ + ulong_t shm_opts; + /* + * Composed of: sptprot (uchar_t) and + * RM_PENDING flag (1 bit). + */ } kshmid_t; /* @@ -78,6 +81,14 @@ typedef struct kshmid { */ #define SHMSA_ISM 1 /* uses shared page table */ +/* + * shm_opts definitions + * Low byte in shm_opts is used for sptprot (see PROT_ALL). The upper bits are + * used for additional options. + */ +#define SHM_PROT_MASK 0xff +#define SHM_RM_PENDING 0x100 + typedef struct sptinfo { struct as *sptas; /* dummy as ptr. 
for spt segment */ } sptinfo_t; diff --git a/usr/src/uts/common/sys/signal.h b/usr/src/uts/common/sys/signal.h index aece147bec..b12dff6034 100644 --- a/usr/src/uts/common/sys/signal.h +++ b/usr/src/uts/common/sys/signal.h @@ -22,6 +22,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015, Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -158,8 +159,8 @@ struct sigaction32 { * use of these symbols by applications is injurious * to binary compatibility */ -#define NSIG 74 /* valid signals range from 1 to NSIG-1 */ -#define MAXSIG 73 /* size of u_signal[], NSIG-1 <= MAXSIG */ +#define NSIG 75 /* valid signals range from 1 to NSIG-1 */ +#define MAXSIG 74 /* size of u_signal[], NSIG-1 <= MAXSIG */ #endif /* defined(__EXTENSIONS__) || !defined(_XPG4_2) */ #define MINSIGSTKSZ 2048 diff --git a/usr/src/uts/common/sys/socket.h b/usr/src/uts/common/sys/socket.h index 93b0af97e8..d6e13d4823 100644 --- a/usr/src/uts/common/sys/socket.h +++ b/usr/src/uts/common/sys/socket.h @@ -22,6 +22,7 @@ * Copyright 2014 Garrett D'Amore <garrett@damore.org> * * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. All rights reserved. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -39,6 +40,9 @@ /* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. */ +/* + * Copyright (c) 2014, Joyent, Inc. All rights reserved. 
+ */ #ifndef _SYS_SOCKET_H #define _SYS_SOCKET_H @@ -204,6 +208,7 @@ struct so_snd_bufinfo { #define SO_SRCADDR 0x2001 /* Internal: AF_UNIX source address */ #define SO_FILEP 0x2002 /* Internal: AF_UNIX file pointer */ #define SO_UNIX_CLOSE 0x2003 /* Internal: AF_UNIX peer closed */ +#define SO_REUSEPORT 0x2004 /* allow simultaneous port reuse */ #endif /* _KERNEL */ /* @@ -303,8 +308,9 @@ struct linger { #define AF_INET_OFFLOAD 30 /* Sun private; do not use */ #define AF_TRILL 31 /* TRILL interface */ #define AF_PACKET 32 /* PF_PACKET Linux socket interface */ +#define AF_LX_NETLINK 33 /* Linux-compatible netlink */ -#define AF_MAX 32 +#define AF_MAX 33 /* * Protocol families, same as address families for now. @@ -344,6 +350,7 @@ struct linger { #define PF_INET_OFFLOAD AF_INET_OFFLOAD /* Sun private; do not use */ #define PF_TRILL AF_TRILL #define PF_PACKET AF_PACKET +#define PF_LX_NETLINK AF_LX_NETLINK #define PF_MAX AF_MAX @@ -429,6 +436,7 @@ struct msghdr32 { /* with left over data */ #define MSG_XPG4_2 0x8000 /* Private: XPG4.2 flag */ +/* Obsolete but kept for compilation compatibility. Use IOV_MAX. */ #define MSG_MAXIOVLEN 16 #ifdef _KERNEL diff --git a/usr/src/uts/common/sys/socketvar.h b/usr/src/uts/common/sys/socketvar.h index f5c4d801de..55a182fa68 100644 --- a/usr/src/uts/common/sys/socketvar.h +++ b/usr/src/uts/common/sys/socketvar.h @@ -21,6 +21,7 @@ /* * Copyright (c) 1996, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc.
*/ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -103,6 +104,7 @@ struct sockaddr_ux { typedef struct sonodeops sonodeops_t; typedef struct sonode sonode_t; +typedef boolean_t (*so_krecv_f)(sonode_t *, mblk_t *, size_t, int, void *); struct sodirect_s; @@ -245,6 +247,10 @@ struct sonode { struct sof_instance *so_filter_top; /* top of stack */ struct sof_instance *so_filter_bottom; /* bottom of stack */ clock_t so_filter_defertime; /* time when deferred */ + + /* Kernel direct receive callbacks */ + so_krecv_f so_krecv_cb; /* recv callback */ + void *so_krecv_arg; /* recv cb arg */ }; #define SO_HAVE_DATA(so) \ @@ -298,15 +304,16 @@ struct sonode { #define SS_OOBPEND 0x00002000 /* OOB pending or present - poll */ #define SS_HAVEOOBDATA 0x00004000 /* OOB data present */ #define SS_HADOOBDATA 0x00008000 /* OOB data consumed */ -#define SS_CLOSING 0x00010000 /* in process of closing */ +#define SS_CLOSING 0x00010000 /* in process of closing */ #define SS_FIL_DEFER 0x00020000 /* filter deferred notification */ #define SS_FILOP_OK 0x00040000 /* socket can attach filters */ #define SS_FIL_RCV_FLOWCTRL 0x00080000 /* filter asserted rcv flow ctrl */ + #define SS_FIL_SND_FLOWCTRL 0x00100000 /* filter asserted snd flow ctrl */ #define SS_FIL_STOP 0x00200000 /* no more filter actions */ - #define SS_SODIRECT 0x00400000 /* transport supports sodirect */ +#define SS_FILOP_UNSF 0x00800000 /* block attaching unsafe filters */ #define SS_SENTLASTREADSIG 0x01000000 /* last rx signal has been sent */ #define SS_SENTLASTWRITESIG 0x02000000 /* last tx signal has been sent */ @@ -322,7 +329,8 @@ struct sonode { /* * Sockets that can fall back to TPI must ensure that fall back is not - * initiated while a thread is using a socket. + * initiated while a thread is using a socket. Otherwise this disables all + * future filter attachment. 
*/ #define SO_BLOCK_FALLBACK(so, fn) \ ASSERT(MUTEX_NOT_HELD(&(so)->so_lock)); \ @@ -338,6 +346,24 @@ struct sonode { } \ } +/* + * Sockets that can fall back to TPI must ensure that fall back is not + * initiated while a thread is using a socket. Otherwise this disables all + * future unsafe filter attachment. Safe filters can still attach after + * we execute the function in which this macro is used. + */ +#define SO_BLOCK_FALLBACK_SAFE(so, fn) \ + ASSERT(MUTEX_NOT_HELD(&(so)->so_lock)); \ + rw_enter(&(so)->so_fallback_rwlock, RW_READER); \ + if ((so)->so_state & SS_FALLBACK_COMP) { \ + rw_exit(&(so)->so_fallback_rwlock); \ + return (fn); \ + } else if (((so)->so_state & SS_FILOP_UNSF) == 0) { \ + mutex_enter(&(so)->so_lock); \ + (so)->so_state |= SS_FILOP_UNSF; \ + mutex_exit(&(so)->so_lock); \ + } + #define SO_UNBLOCK_FALLBACK(so) { \ rw_exit(&(so)->so_fallback_rwlock); \ } @@ -369,6 +395,7 @@ struct sonode { /* The modes below are only for non-streams sockets */ #define SM_ACCEPTSUPP 0x400 /* can handle accept() */ #define SM_SENDFILESUPP 0x800 /* Private: proto supp sendfile */ +#define SM_DEFERERR 0x1000 /* Private: defer so_error delivery */ /* * Socket versions. Used by the socket library when calling _so_socket(). @@ -947,6 +974,15 @@ extern struct sonode *socreate(struct sockparams *, int, int, int, int, extern int so_copyin(const void *, void *, size_t, int); extern int so_copyout(const void *, void *, size_t, int); +/* + * Functions to manipulate the use of direct receive callbacks. This should not + * be used outside of sockfs and ksocket. These are generally considered a use + * once interface for a socket and will cause all outstanding data on the socket + * to be flushed. 
+ */ +extern int so_krecv_set(sonode_t *, so_krecv_f, void *); +extern void so_krecv_unblock(sonode_t *); + #endif /* diff --git a/usr/src/uts/common/sys/sockfilter.h b/usr/src/uts/common/sys/sockfilter.h index 9f6d8b499b..c4dd6539de 100644 --- a/usr/src/uts/common/sys/sockfilter.h +++ b/usr/src/uts/common/sys/sockfilter.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. */ #ifndef _SYS_SOCKFILTER_H @@ -129,6 +130,15 @@ typedef struct sof_ops { #define SOF_VERSION 1 +/* + * Flag indicating that the filter module is safe to attach after bind, + * getsockname, getsockopt or setsockopt calls. By default filters are unsafe + * so may not be attached after any socket operation. However, a safe filter + * can still be attached after one of the above calls. This makes attaching + * the filter less dependent on the initial socket setup order. + */ +#define SOF_ATT_SAFE 0x1 + extern int sof_register(int, const char *, const sof_ops_t *, int); extern int sof_unregister(const char *); diff --git a/usr/src/uts/common/sys/squeue.h b/usr/src/uts/common/sys/squeue.h index f1bd429815..89b355970e 100644 --- a/usr/src/uts/common/sys/squeue.h +++ b/usr/src/uts/common/sys/squeue.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017 Joyent, Inc. */ #ifndef _SYS_SQUEUE_H @@ -29,6 +30,17 @@ extern "C" { #endif +/* + * Originally in illumos, we had an IP-centric view of the serialization queue + * abstraction. While that has useful properties, the implementation of squeues + * hardcodes various parts of the implementation of IP into it which makes it + * unsuitable for other consumers. 
To enable them, we created another interface, + * but opted not to port all of the functionality that IP uses in the form of + * ip_squeue.c As other consumers need the functionality that IP has in squeues, + * then we'll come up with more genericized methods and add that functionality + * to <sys/gsqueue.h>. Please do not continue to use this header. + */ + #include <sys/types.h> #include <sys/processor.h> #include <sys/stream.h> @@ -76,16 +88,17 @@ typedef enum { struct ip_recv_attr_s; extern void squeue_init(void); -extern squeue_t *squeue_create(clock_t, pri_t); +extern squeue_t *squeue_create(pri_t, boolean_t); extern void squeue_bind(squeue_t *, processorid_t); extern void squeue_unbind(squeue_t *); extern void squeue_enter(squeue_t *, mblk_t *, mblk_t *, uint32_t, struct ip_recv_attr_s *, int, uint8_t); extern uintptr_t *squeue_getprivate(squeue_t *, sqprivate_t); +extern void squeue_destroy(squeue_t *); struct conn_s; extern int squeue_synch_enter(struct conn_s *, mblk_t *); -extern void squeue_synch_exit(struct conn_s *); +extern void squeue_synch_exit(struct conn_s *, int); #ifdef __cplusplus } diff --git a/usr/src/uts/common/sys/squeue_impl.h b/usr/src/uts/common/sys/squeue_impl.h index 22550886eb..2bb717fb52 100644 --- a/usr/src/uts/common/sys/squeue_impl.h +++ b/usr/src/uts/common/sys/squeue_impl.h @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. 
*/ #ifndef _SYS_SQUEUE_IMPL_H @@ -84,7 +85,6 @@ typedef void (*sq_enter_proc_t)(squeue_t *, mblk_t *, mblk_t *, uint32_t, struct ip_recv_attr_s *, int, uint8_t); typedef void (*sq_drain_proc_t)(squeue_t *, uint_t, hrtime_t); -extern void squeue_worker_wakeup(squeue_t *); extern int ip_squeue_flag; struct squeue_s { @@ -99,14 +99,11 @@ struct squeue_s { ill_rx_ring_t *sq_rx_ring; /* The Rx ring tied to this sq */ ill_t *sq_ill; /* The ill this squeue is tied to */ - clock_t sq_curr_time; /* Current tick (lbolt) */ + hrtime_t sq_awoken; /* time of worker wake req */ kcondvar_t sq_worker_cv; /* cond var. worker thread blocks on */ kcondvar_t sq_poll_cv; /* cond variable poll_thr waits on */ kcondvar_t sq_synch_cv; /* cond var. synch thread waits on */ kcondvar_t sq_ctrlop_done_cv; /* cond variable for ctrl ops */ - clock_t sq_wait; /* lbolts to wait after a fill() */ - timeout_id_t sq_tid; /* timer id of pending timeout() */ - clock_t sq_awaken; /* time async thread was awakened */ processorid_t sq_bind; /* processor to bind to */ kthread_t *sq_worker; /* kernel thread id */ @@ -117,6 +114,7 @@ struct squeue_s { squeue_set_t *sq_set; /* managed by squeue creator */ pri_t sq_priority; /* squeue thread priority */ + boolean_t sq_isip; /* use IP-centric features */ /* Keep the debug-only fields at the end of the structure */ #ifdef DEBUG @@ -140,7 +138,6 @@ struct squeue_s { #define SQS_USER 0x00000010 /* A non interrupt user */ #define SQS_BOUND 0x00000020 /* Worker thread is bound */ #define SQS_REENTER 0x00000040 /* Re entered thread */ -#define SQS_TMO_PROG 0x00000080 /* Timeout is being set */ #define SQS_POLL_CAPAB 0x00000100 /* Squeue can control interrupts */ #define SQS_ILL_BOUND 0x00000200 /* Squeue bound to an ill */ @@ -165,6 +162,7 @@ struct squeue_s { #define SQS_POLL_RESTART_DONE 0x01000000 #define SQS_POLL_THR_QUIESCE 0x02000000 #define SQS_PAUSE 0x04000000 /* The squeue has been paused */ +#define SQS_EXIT 0x08000000 /* squeue is being torn down */ 
#define SQS_WORKER_THR_CONTROL \ (SQS_POLL_QUIESCE | SQS_POLL_RESTART | SQS_POLL_CLEANUP) diff --git a/usr/src/uts/common/sys/stream.h b/usr/src/uts/common/sys/stream.h index 4be8d794fc..7488d3dee8 100644 --- a/usr/src/uts/common/sys/stream.h +++ b/usr/src/uts/common/sys/stream.h @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. All rights reserved. * Copyright 2018 OmniOS Community Edition (OmniOSce) Association. */ @@ -644,16 +645,13 @@ struct stroptions { /* * Structure for rw (read/write) procedure calls. A pointer * to a struiod_t is passed as a parameter to the rwnext() call. - * - * Note: DEF_IOV_MAX is defined and used as it is in "fs/vncalls.c" - * as there isn't a formal definition of IOV_MAX ??? */ #define DEF_IOV_MAX 16 struct struiod { mblk_t *d_mp; /* pointer to mblk (chain) */ uio_t d_uio; /* uio info */ - iovec_t d_iov[DEF_IOV_MAX]; /* iov referenced by uio */ + iovec_t *d_iov; /* iov referenced by uio */ }; /* diff --git a/usr/src/uts/common/sys/strsubr.h b/usr/src/uts/common/sys/strsubr.h index 306a2f6b29..65bdfb2e17 100644 --- a/usr/src/uts/common/sys/strsubr.h +++ b/usr/src/uts/common/sys/strsubr.h @@ -25,6 +25,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2018 Joyent, Inc. */ /* @@ -1254,10 +1255,17 @@ extern void strsignal_nolock(stdata_t *, int, uchar_t); struct multidata_s; struct pdesc_s; + +/* + * Now that NIC drivers are expected to deal only with M_DATA mblks, the + * hcksum_assoc and hcksum_retrieve functions are deprecated in favor of their + * respective mac_hcksum_set and mac_hcksum_get counterparts. 
+ */ extern int hcksum_assoc(mblk_t *, struct multidata_s *, struct pdesc_s *, uint32_t, uint32_t, uint32_t, uint32_t, uint32_t, int); extern void hcksum_retrieve(mblk_t *, struct multidata_s *, struct pdesc_s *, uint32_t *, uint32_t *, uint32_t *, uint32_t *, uint32_t *); + extern void lso_info_set(mblk_t *, uint32_t, uint32_t); extern void lso_info_cleanup(mblk_t *); extern unsigned int bcksum(uchar_t *, int, unsigned int); diff --git a/usr/src/uts/common/sys/sunddi.h b/usr/src/uts/common/sys/sunddi.h index f81a391f41..8ce8508114 100644 --- a/usr/src/uts/common/sys/sunddi.h +++ b/usr/src/uts/common/sys/sunddi.h @@ -1592,8 +1592,14 @@ int ddi_ffs(long mask); int +ddi_ffsll(long long mask); + +int ddi_fls(long mask); +int +ddi_flsll(long long mask); + /* * The ddi_soft_state* routines comprise generic storage management utilities * for driver soft state structures. Two types of soft_state indexes are diff --git a/usr/src/uts/common/sys/sysconfig.h b/usr/src/uts/common/sys/sysconfig.h index 3a68d76ebe..d5b65ef78c 100644 --- a/usr/src/uts/common/sys/sysconfig.h +++ b/usr/src/uts/common/sys/sysconfig.h @@ -25,6 +25,7 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ #ifndef _SYS_SYSCONFIG_H @@ -101,6 +102,8 @@ extern int mach_sysconfig(int); #define _CONFIG_EPHID_MAX 47 /* maximum ephemeral uid */ +#define _CONFIG_NPROC_NCPU 48 /* NCPU (sometimes > NPROC_MAX) */ + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/sys/systrace.h b/usr/src/uts/common/sys/systrace.h index d43974451e..17e509d4d8 100644 --- a/usr/src/uts/common/sys/systrace.h +++ b/usr/src/uts/common/sys/systrace.h @@ -22,13 +22,12 @@ /* * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2014 Joyent, Inc. All rights reserved. 
*/ #ifndef _SYS_SYSTRACE_H #define _SYS_SYSTRACE_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/dtrace.h> #ifdef __cplusplus @@ -47,16 +46,18 @@ extern systrace_sysent_t *systrace_sysent; extern systrace_sysent_t *systrace_sysent32; extern void (*systrace_probe)(dtrace_id_t, uintptr_t, uintptr_t, - uintptr_t, uintptr_t, uintptr_t, uintptr_t); + uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t); extern void systrace_stub(dtrace_id_t, uintptr_t, uintptr_t, - uintptr_t, uintptr_t, uintptr_t, uintptr_t); + uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t, uintptr_t); extern int64_t dtrace_systrace_syscall(uintptr_t arg0, uintptr_t arg1, - uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5); + uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, + uintptr_t arg6, uintptr_t arg7); #ifdef _SYSCALL32_IMPL extern int64_t dtrace_systrace_syscall32(uintptr_t arg0, uintptr_t arg1, - uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5); + uintptr_t arg2, uintptr_t arg3, uintptr_t arg4, uintptr_t arg5, + uintptr_t arg6, uintptr_t arg7); #endif #endif diff --git a/usr/src/uts/common/sys/termios.h b/usr/src/uts/common/sys/termios.h index 0c07623ce6..b955e5f3f2 100644 --- a/usr/src/uts/common/sys/termios.h +++ b/usr/src/uts/common/sys/termios.h @@ -363,6 +363,24 @@ extern pid_t tcgetsid(int); #define TCSETSF (_TIOC|16) /* + * linux terminal ioctls we need to be aware of + */ +#define TIOCSETLD (_TIOC|123) /* set line discipline parms */ +#define TIOCGETLD (_TIOC|124) /* get line discipline parms */ + +/* + * On Solaris, VMIN and VTIME overlap with VEOF and VEOL. This is + * perfectly legal, except that Linux expects them to be separate, so we keep + * them separately.
+ */ +struct lx_cc { + unsigned char veof; /* veof value */ + unsigned char veol; /* veol value */ + unsigned char vmin; /* vmin value */ + unsigned char vtime; /* vtime value */ +}; + +/* * NTP PPS ioctls */ #define TIOCGPPS (_TIOC|125) diff --git a/usr/src/uts/common/sys/thread.h b/usr/src/uts/common/sys/thread.h index 2704bdd021..3ecb7c00b0 100644 --- a/usr/src/uts/common/sys/thread.h +++ b/usr/src/uts/common/sys/thread.h @@ -71,7 +71,10 @@ typedef struct ctxop { void (*exit_op)(void *); /* invoked during {thread,lwp}_exit() */ void (*free_op)(void *, int); /* function which frees the context */ void *arg; /* argument to above functions, ctx pointer */ - struct ctxop *next; /* next context ops */ + struct ctxop *next; /* next context ops */ + struct ctxop *prev; /* previous context ops */ + hrtime_t save_ts; /* timestamp of last save */ + hrtime_t restore_ts; /* timestamp of last restore */ } ctxop_t; /* @@ -198,16 +201,15 @@ typedef struct _kthread { * it should be grabbed only by thread_lock(). */ disp_lock_t *t_lockp; /* pointer to the dispatcher lock */ - ushort_t t_oldspl; /* spl level before dispatcher locked */ + ushort_t t_oldspl; /* spl level before dispatcher locked */ volatile char t_pre_sys; /* pre-syscall work needed */ lock_t t_lock_flush; /* for lock_mutex_flush() impl */ struct _disp *t_disp_queue; /* run queue for chosen CPU */ clock_t t_disp_time; /* last time this thread was running */ - uint_t t_kpri_req; /* kernel priority required */ /* * Post-syscall / post-trap flags. - * No lock is required to set these. + * No lock is required to set these. * These must be cleared only by the thread itself. * * t_astflag indicates that some post-trap processing is required, @@ -216,7 +218,7 @@ typedef struct _kthread { * t_post_sys indicates that some unusualy post-system call * handling is required, such as an error or tracing. * t_sig_check indicates that some condition in ISSIG() must be - * checked, but doesn't prevent returning to user. 
+ * checked, but doesn't prevent returning to user. * t_post_sys_ast is a way of checking whether any of these three * flags are set. */ @@ -358,7 +360,7 @@ typedef struct _kthread { /* * Thread flag (t_flag) definitions. * These flags must be changed only for the current thread, - * and not during preemption code, since the code being + * and not during preemption code, since the code being * preempted could be modifying the flags. * * For the most part these flags do not need locking. @@ -374,7 +376,7 @@ typedef struct _kthread { #define T_WOULDBLOCK 0x0020 /* for lockfs */ #define T_DONTBLOCK 0x0040 /* for lockfs */ #define T_DONTPEND 0x0080 /* for lockfs */ -#define T_SYS_PROF 0x0100 /* profiling on for duration of system call */ +#define T_SPLITSTK 0x0100 /* kernel stack is currently split */ #define T_WAITCVSEM 0x0200 /* waiting for a lwp_cv or lwp_sema on sleepq */ #define T_WATCHPT 0x0400 /* thread undergoing a watchpoint emulation */ #define T_PANIC 0x0800 /* thread initiated a system panic */ @@ -403,6 +405,7 @@ typedef struct _kthread { #define TP_CHANGEBIND 0x1000 /* thread has a new cpu/cpupart binding */ #define TP_ZTHREAD 0x2000 /* this is a kernel thread for a zone */ #define TP_WATCHSTOP 0x4000 /* thread is stopping via holdwatch() */ +#define TP_KTHREAD 0x8000 /* in-kernel worker thread for a process */ /* * Thread scheduler flag (t_schedflag) definitions. 
@@ -423,8 +426,9 @@ typedef struct _kthread { #define TS_RESUME 0x1000 /* setrun() by CPR resume process */ #define TS_CREATE 0x2000 /* setrun() by syslwp_create() */ #define TS_RUNQMATCH 0x4000 /* exact run queue balancing by setbackdq() */ +#define TS_BSTART 0x8000 /* setrun() by brand */ #define TS_ALLSTART \ - (TS_CSTART|TS_UNPAUSE|TS_XSTART|TS_PSTART|TS_RESUME|TS_CREATE) + (TS_CSTART|TS_UNPAUSE|TS_XSTART|TS_PSTART|TS_RESUME|TS_CREATE|TS_BSTART) #define TS_ANYWAITQ (TS_PROJWAITQ|TS_ZONEWAITQ) /* @@ -452,6 +456,10 @@ typedef struct _kthread { #define ISTOPPED(t) ((t)->t_state == TS_STOPPED && \ !((t)->t_schedflag & TS_PSTART)) +/* True if thread is stopped for a brand-specific reason */ +#define BSTOPPED(t) ((t)->t_state == TS_STOPPED && \ + !((t)->t_schedflag & TS_BSTART)) + /* True if thread is asleep and wakeable */ #define ISWAKEABLE(t) (((t)->t_state == TS_SLEEP && \ ((t)->t_flag & T_WAKEABLE))) @@ -511,10 +519,10 @@ typedef struct _kthread { * convert a thread pointer to its proc pointer. * * ttoproj(x) - * convert a thread pointer to its project pointer. + * convert a thread pointer to its project pointer. * * ttozone(x) - * convert a thread pointer to its zone pointer. + * convert a thread pointer to its zone pointer. * * lwptot(x) * convert a lwp pointer to its thread pointer. @@ -602,26 +610,13 @@ int thread_setname(kthread_t *, const char *); int thread_vsetname(kthread_t *, const char *, ...); extern int default_binding_mode; +extern int default_stksize; #endif /* _KERNEL */ #define THREAD_NAME_MAX 32 /* includes terminating NUL */ /* - * Macros to indicate that the thread holds resources that could be critical - * to other kernel threads, so this thread needs to have kernel priority - * if it blocks or is preempted. Note that this is not necessary if the - * resource is a mutex or a writer lock because of priority inheritance. 
- * - * The only way one thread may legally manipulate another thread's t_kpri_req - * is to hold the target thread's thread lock while that thread is asleep. - * (The rwlock code does this to implement direct handoff to waiting readers.) - */ -#define THREAD_KPRI_REQUEST() (curthread->t_kpri_req++) -#define THREAD_KPRI_RELEASE() (curthread->t_kpri_req--) -#define THREAD_KPRI_RELEASE_N(n) (curthread->t_kpri_req -= (n)) - -/* * Macro to change a thread's priority. */ #define THREAD_CHANGE_PRI(t, pri) { \ @@ -648,12 +643,12 @@ extern int default_binding_mode; * Point it at the transition lock, which is always held. * The previosly held lock is dropped. */ -#define THREAD_TRANSITION(tp) thread_transition(tp); +#define THREAD_TRANSITION(tp) thread_transition(tp); /* * Set the thread's lock to be the transition lock, without dropping * previosly held lock. */ -#define THREAD_TRANSITION_NOLOCK(tp) ((tp)->t_lockp = &transition_lock) +#define THREAD_TRANSITION_NOLOCK(tp) ((tp)->t_lockp = &transition_lock) /* * Put thread in run state, and set the lock pointer to the dispatcher queue diff --git a/usr/src/uts/common/sys/time.h b/usr/src/uts/common/sys/time.h index 8a36f622c3..a69bf4dd63 100644 --- a/usr/src/uts/common/sys/time.h +++ b/usr/src/uts/common/sys/time.h @@ -15,6 +15,7 @@ * Use is subject to license terms. * * Copyright 2013 Nexenta Systems, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. */ /* @@ -264,6 +265,14 @@ typedef longlong_t hrtime_t; #if defined(_KERNEL) || defined(_FAKE_KERNEL) +/* + * Unsigned counterpart to hrtime_t + */ +typedef u_longlong_t uhrtime_t; + +#define HRTIME_MAX LLONG_MAX +#define UHRTIME_MAX ULLONG_MAX + #include <sys/time_impl.h> #include <sys/mutex.h> diff --git a/usr/src/uts/common/sys/timer.h b/usr/src/uts/common/sys/timer.h index ec349c962f..748e0c0627 100644 --- a/usr/src/uts/common/sys/timer.h +++ b/usr/src/uts/common/sys/timer.h @@ -25,7 +25,7 @@ */ /* - * Copyright (c) 2015, Joyent, Inc. All rights reserved. 
+ * Copyright 2016 Joyent, Inc. */ #ifndef _SYS_TIMER_H @@ -34,6 +34,9 @@ #include <sys/types.h> #include <sys/proc.h> #include <sys/thread.h> +#include <sys/param.h> +#include <sys/siginfo.h> +#include <sys/port.h> #ifdef __cplusplus extern "C" { @@ -42,7 +45,13 @@ extern "C" { #ifdef _KERNEL #define _TIMER_MAX 32 -extern int timer_max; /* patchable via /etc/system */ +/* + * Max timers per process. This is patchable via /etc/system and can be + * updated via kmdb. Sticking to positive powers of 2 is recommended. + */ +extern int timer_max; + +#define _TIMER_ALLOC_INIT 8 /* initial size for p_itimer array */ /* * Bit values for the it_lock field. @@ -56,6 +65,7 @@ extern int timer_max; /* patchable via /etc/system */ */ #define IT_SIGNAL 0x01 #define IT_PORT 0x02 /* use event port notification */ +#define IT_CALLBACK 0x04 /* custom callback function */ struct clock_backend; @@ -83,14 +93,27 @@ struct itimer { struct clock_backend *it_backend; void (*it_fire)(itimer_t *); kmutex_t it_mutex; - void *it_portev; /* port_kevent_t pointer */ - void *it_portsrc; /* port_source_t pointer */ - int it_portfd; /* port file descriptor */ + union { + struct { + void *_it_portev; /* port_kevent_t pointer */ + void *_it_portsrc; /* port_source_t pointer */ + int _it_portfd; /* port file descriptor */ + } _it_ev_port; + struct { + void (*_it_cb_func)(itimer_t *); + uintptr_t _it_cb_data[2]; + } _it_ev_cb; + } _it_ev_data; }; #define it_sigq __data.__proc.__it_sigq #define it_lwp __data.__proc.__it_lwp #define it_frontend __data.__it_frontend +#define it_portev _it_ev_data._it_ev_port._it_portev +#define it_portsrc _it_ev_data._it_ev_port._it_portsrc +#define it_portfd _it_ev_data._it_ev_port._it_portfd +#define it_cb_func _it_ev_data._it_ev_cb._it_cb_func +#define it_cb_data _it_ev_data._it_ev_cb._it_cb_data typedef struct clock_backend { struct sigevent clk_default; @@ -107,7 +130,11 @@ typedef struct clock_backend { extern void clock_add_backend(clockid_t clock, clock_backend_t 
*backend); extern clock_backend_t *clock_get_backend(clockid_t clock); +extern void timer_release(struct proc *, itimer_t *); +extern void timer_delete_grabbed(struct proc *, timer_t tid, itimer_t *it); extern void timer_lwpbind(); +extern int timer_setup(clock_backend_t *, struct sigevent *, port_notify_t *, + itimer_t **, timer_t *); extern void timer_func(sigqueue_t *); extern void timer_exit(void); diff --git a/usr/src/uts/common/sys/ts.h b/usr/src/uts/common/sys/ts.h index 266d63a3ea..2cf5dcade3 100644 --- a/usr/src/uts/common/sys/ts.h +++ b/usr/src/uts/common/sys/ts.h @@ -22,6 +22,7 @@ /* * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2019 Joyent, Inc. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -78,7 +79,8 @@ typedef struct tsproc { } tsproc_t; /* flags */ -#define TSKPRI 0x01 /* thread at kernel mode priority */ + +/* Formerly: TSKPRI 0x01 - thread at kernel mode priority */ #define TSBACKQ 0x02 /* thread goes to back of dispq if preempted */ #define TSIA 0x04 /* thread is interactive */ #define TSIASET 0x08 /* interactive thread is "on" */ diff --git a/usr/src/uts/common/sys/uadmin.h b/usr/src/uts/common/sys/uadmin.h index 904b52cac4..75d000b831 100644 --- a/usr/src/uts/common/sys/uadmin.h +++ b/usr/src/uts/common/sys/uadmin.h @@ -23,6 +23,7 @@ * * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2011 Joyent, Inc. All rights reserved. 
*/ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ @@ -159,7 +160,7 @@ extern kmutex_t ualock; extern void mdboot(int, int, char *, boolean_t); extern void mdpreboot(int, int, char *); extern int kadmin(int, int, void *, cred_t *); -extern void killall(zoneid_t); +extern void killall(zoneid_t, boolean_t); #endif extern int uadmin(int, int, uintptr_t); diff --git a/usr/src/uts/common/sys/uio.h b/usr/src/uts/common/sys/uio.h index bca1ed1fa3..9584be559f 100644 --- a/usr/src/uts/common/sys/uio.h +++ b/usr/src/uts/common/sys/uio.h @@ -145,7 +145,8 @@ typedef struct uioa_s { */ typedef enum xuio_type { UIOTYPE_ASYNCIO, - UIOTYPE_ZEROCOPY + UIOTYPE_ZEROCOPY, + UIOTYPE_PEEKSIZE } xuio_type_t; typedef struct xuio { @@ -175,6 +176,15 @@ typedef struct xuio { int xu_zc_rw; /* read or write buffer */ void *xu_zc_priv; /* fs specific */ } xu_zc; + + /* + * Peek Size Support -- facilitate peeking at the size of a + * waiting message on a socket. + */ + struct { + ssize_t xu_ps_size; /* size of waiting msg */ + boolean_t xu_ps_set; /* was size calculated? */ + } xu_ps; } xu_ext; } xuio_t; diff --git a/usr/src/uts/common/sys/usb/clients/hid/hidminor.h b/usr/src/uts/common/sys/usb/clients/hid/hidminor.h index c96f914a70..f1b209faad 100644 --- a/usr/src/uts/common/sys/usb/clients/hid/hidminor.h +++ b/usr/src/uts/common/sys/usb/clients/hid/hidminor.h @@ -20,7 +20,7 @@ */ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. */ #ifndef _SYS_USB_HIDMINOR_H @@ -44,21 +44,28 @@ extern "C" { * transparent. * * So we change minor node numbering scheme to be: - * external node minor num == instance << 1 - * internal node minor num == instance << 1 | 0x1 + * external node minor num == instance << 9 + * internal node minor num == instance << 9 | 0x100 * (There are only internal nodes for keyboard/mouse now.) 
+ * + * The 8 bits of the LSB are used for ugen minor numbering (hence the use + * of the first bit of the next byte for the "internal" flag) */ -#define HID_MINOR_BITS_MASK 0x1 +#define HID_MINOR_BITS_MASK 0x1ff +#define HID_MINOR_UGEN_BITS_MASK 0xff #define HID_MINOR_INSTANCE_MASK ~HID_MINOR_BITS_MASK -#define HID_MINOR_INSTANCE_SHIFT 1 +#define HID_MINOR_INSTANCE_SHIFT 9 -#define HID_MINOR_INTERNAL 0x1 +#define HID_MINOR_INTERNAL 0x100 #define HID_MINOR_MAKE_INTERNAL(minor) \ ((minor) | HID_MINOR_INTERNAL) #define HID_IS_INTERNAL_OPEN(minor) \ (((minor) & HID_MINOR_INTERNAL)) +#define HID_IS_UGEN_OPEN(minor) \ + (((minor) & HID_MINOR_UGEN_BITS_MASK)) + #define HID_MINOR_TO_INSTANCE(minor) \ (((minor) & HID_MINOR_INSTANCE_MASK) >> \ HID_MINOR_INSTANCE_SHIFT) diff --git a/usr/src/uts/common/sys/usb/clients/hid/hidvar.h b/usr/src/uts/common/sys/usb/clients/hid/hidvar.h index e9a25ea894..ee68f0088a 100644 --- a/usr/src/uts/common/sys/usb/clients/hid/hidvar.h +++ b/usr/src/uts/common/sys/usb/clients/hid/hidvar.h @@ -21,7 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. */ #ifndef _SYS_USB_HIDVAR_H @@ -33,6 +33,7 @@ extern "C" { #endif #include <sys/usb/usba/usbai_private.h> +#include <sys/usb/usba/usba_ugen.h> /* * HID : This header file contains the internal structures @@ -222,6 +223,8 @@ typedef struct hid_state { queue_t *hid_inuse_rq; int hid_internal_flag; /* see below */ int hid_external_flag; /* see below */ + + usb_ugen_hdl_t hid_ugen_hdl; /* ugen support */ } hid_state_t; /* warlock directives, stable data */ diff --git a/usr/src/uts/common/sys/user.h b/usr/src/uts/common/sys/user.h index 0b997c518c..8c424e7bf3 100644 --- a/usr/src/uts/common/sys/user.h +++ b/usr/src/uts/common/sys/user.h @@ -26,7 +26,7 @@ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ /* - * Copyright (c) 2018, Joyent, Inc. + * Copyright 2019 Joyent, Inc. 
*/ @@ -82,6 +82,21 @@ extern "C" { #endif /* + * File Descriptor assignment generation. + * + * Certain file descriptor consumers (namely epoll) need to be able to detect + * when the resource underlying an fd changes due to (re)assignment. Checks + * comparing old and new file_t pointers work OK, but could easily be fooled by + * an entry freed-to and reused-from the cache. To better detect such + * assignments, a generation number is kept in the uf_entry. Whenever a + * non-NULL file_t is assigned to the entry, the generation is incremented, + * indicating the change. There is a minute possibility that a rollover of the + * value could cause assignments to evade detection by consumers, but it is + * considered acceptably small. + */ +typedef uint_t uf_entry_gen_t; + +/* * Entry in the per-process list of open files. * Note: only certain fields are copied in flist_grow() and flist_fork(). * This is indicated in brackets in the structure member comments. @@ -96,11 +111,13 @@ typedef struct uf_entry { short uf_busy; /* file is allocated [grow, fork] */ kcondvar_t uf_wanted_cv; /* waiting for setf() [never copied] */ kcondvar_t uf_closing_cv; /* waiting for close() [never copied] */ - struct portfd *uf_portfd; /* associated with port [grow] */ + struct portfd *uf_portfd; /* associated with port [grow] */ + uf_entry_gen_t uf_gen; /* assigned fd generation [grow,fork] */ /* Avoid false sharing - pad to coherency granularity (64 bytes) */ char uf_pad[64 - sizeof (kmutex_t) - 2 * sizeof (void*) - 2 * sizeof (int) - 2 * sizeof (short) - - 2 * sizeof (kcondvar_t) - sizeof (struct portfd *)]; + 2 * sizeof (kcondvar_t) - sizeof (struct portfd *) - + sizeof (uf_entry_gen_t)]; } uf_entry_t; /* @@ -185,9 +202,9 @@ typedef struct { /* kernel syscall set type */ * This value should not be changed in a patch.
*/ #if defined(__sparc) -#define __KERN_NAUXV_IMPL 20 +#define __KERN_NAUXV_IMPL 24 #elif defined(__i386) || defined(__amd64) -#define __KERN_NAUXV_IMPL 25 +#define __KERN_NAUXV_IMPL 28 #endif struct execsw; @@ -210,7 +227,11 @@ typedef struct user { char u_psargs[PSARGSZ]; /* arguments from exec */ int u_argc; /* value of argc passed to main() */ uintptr_t u_argv; /* value of argv passed to main() */ + uintptr_t u_argvstrs; /* argv string space pointer */ + size_t u_argvstrsize; /* size of argv string space */ uintptr_t u_envp; /* value of envp passed to main() */ + uintptr_t u_envstrs; /* env string space pointer */ + size_t u_envstrsize; /* size of env string space */ uintptr_t u_commpagep; /* address of mapped comm page */ /* diff --git a/usr/src/uts/common/sys/vm.h b/usr/src/uts/common/sys/vm.h index a8ca2ad377..0f7dfa9fd0 100644 --- a/usr/src/uts/common/sys/vm.h +++ b/usr/src/uts/common/sys/vm.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1983, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017 Joyent, Inc. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -57,6 +58,8 @@ int queue_io_request(struct vnode *, u_offset_t); extern kmutex_t memavail_lock; extern kcondvar_t memavail_cv; +#define WAKE_PAGEOUT_SCANNER() cv_broadcast(&proc_pageout->p_cv) + #endif /* defined(_KERNEL) */ #ifdef __cplusplus diff --git a/usr/src/uts/common/sys/vm_usage.h b/usr/src/uts/common/sys/vm_usage.h index 1aa4a8ee6d..afbf438eff 100644 --- a/usr/src/uts/common/sys/vm_usage.h +++ b/usr/src/uts/common/sys/vm_usage.h @@ -21,6 +21,7 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2017 Joyent, Inc. All rights reserved. 
*/ #ifndef _SYS_VM_USAGE_H @@ -79,8 +80,12 @@ extern "C" { /* zoneid */ #define VMUSAGE_COL_EUSERS 0x2000 /* same as VMUSAGE_COL_RUSERS, but by */ /* euser */ +#define VMUSAGE_A_ZONE 0x4000 /* rss/swap for a specified zone */ -#define VMUSAGE_MASK 0x3fff /* all valid flags for getvmusage() */ +#define VMUSAGE_MASK 0x7fff /* all valid flags for getvmusage() */ + +#define VMUSAGE_ZONE_FLAGS (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | \ + VMUSAGE_A_ZONE) typedef struct vmusage { id_t vmu_zoneid; /* zoneid, or ALL_ZONES for */ diff --git a/usr/src/uts/common/sys/vmsystm.h b/usr/src/uts/common/sys/vmsystm.h index c274bae805..2292310bda 100644 --- a/usr/src/uts/common/sys/vmsystm.h +++ b/usr/src/uts/common/sys/vmsystm.h @@ -19,6 +19,9 @@ * CDDL HEADER END */ /* + * Copyright (c) 2017, Joyent, Inc. All rights reserved. + */ +/* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -58,6 +61,9 @@ extern pgcnt_t desscan; /* desired pages scanned per second */ extern pgcnt_t slowscan; extern pgcnt_t fastscan; extern pgcnt_t pushes; /* number of pages pushed to swap device */ +extern uint64_t low_mem_scan; /* num times page scan due to low memory */ +extern uint64_t zone_cap_scan; /* num times page scan due to zone cap */ +extern uint64_t n_throttle; /* num times page create throttled */ /* writable copies of tunables */ extern pgcnt_t maxpgio; /* max paging i/o per sec before start swaps */ @@ -159,6 +165,8 @@ extern void *boot_virt_alloc(void *addr, size_t size); extern size_t exec_get_spslew(void); +extern caddr_t map_userlimit(proc_t *pp, struct as *as, int flags); + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/sys/vnd.h b/usr/src/uts/common/sys/vnd.h new file mode 100644 index 0000000000..bc7c9c3122 --- /dev/null +++ b/usr/src/uts/common/sys/vnd.h @@ -0,0 +1,141 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. 
+ * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014 Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_VND_H +#define _SYS_VND_H + +#include <sys/types.h> +#include <sys/vnd_errno.h> +#include <sys/frameio.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * We distinguish between normal ioctls and private ioctls we issue to our + * streams version. Streams ioctls have the upper bit set in the lowest byte. + * Note that there are no STREAMs ioctls for userland and all definitions + * related to them are not present in this file. + */ +#define VND_IOC (('v' << 24) | ('n' << 16) | ('d' << 8)) + +/* + * Attach the current minor instance to a given dlpi datalink identified by a + * vnd_ioc_name_t argument. This fails if it's already been attached. Note that + * unlike the other ioctls, this is passed directly as opposed to every other + * function which is passed as a pointer to the value. + */ +#define VND_IOC_ATTACH (VND_IOC | 0x1) + +#define VND_NAMELEN 32 + +typedef struct vnd_ioc_attach { + char via_name[VND_NAMELEN]; + zoneid_t via_zoneid; + uint32_t via_errno; +} vnd_ioc_attach_t; + +/* + * Link the current minor instance into the /devices name space. + * + * This ioctl adds entries into /devices with a name of the form z%d:%s vil_zid, + * vil_name. The device will be namespaced to the zone. The global zone will be + * able to see all minor nodes. In the zone, only the /dev entries will exist. + * At this time, a given device can only have one link at a time. Note that a + * user cannot specify the zone to pass in, rather it is the zone that the + * device was attached in.
+ */ +#define VND_IOC_LINK (VND_IOC | 0x2) + +typedef struct vnd_ioc_link { + char vil_name[VND_NAMELEN]; + uint32_t vil_errno; +} vnd_ioc_link_t; + +/* + * Unlink the opened minor instance from the /devices name space. A zone may use + * this to unlink an extent entry in /dev; however, they will not be able to + * link it in again. + */ +#define VND_IOC_UNLINK (VND_IOC | 0x3) +typedef struct vnd_ioc_unlink { + uint32_t viu_errno; +} vnd_ioc_unlink_t; + +/* + * Controls to get and set the current buffer receive buffer size. + */ +typedef struct vnd_ioc_buf { + uint64_t vib_size; + uint32_t vib_filler; + uint32_t vib_errno; +} vnd_ioc_buf_t; + +#define VND_IOC_GETRXBUF (VND_IOC | 0x04) +#define VND_IOC_SETRXBUF (VND_IOC | 0x05) +#define VND_IOC_GETMAXBUF (VND_IOC | 0x06) +#define VND_IOC_GETTXBUF (VND_IOC | 0x07) +#define VND_IOC_SETTXBUF (VND_IOC | 0x08) +#define VND_IOC_GETMINTU (VND_IOC | 0x09) +#define VND_IOC_GETMAXTU (VND_IOC | 0x0a) + +/* + * Information and listing ioctls + * + * This gets information about all of the active vnd instances. vl_actents is + * always updated to the number around and vl_nents is the number of + * vnd_ioc_info_t elements allocated in vl_ents. + */ +typedef struct vnd_ioc_info { + uint32_t vii_version; + zoneid_t vii_zone; + char vii_name[VND_NAMELEN]; + char vii_datalink[VND_NAMELEN]; +} vnd_ioc_info_t; + +typedef struct vnd_ioc_list { + uint_t vl_nents; + uint_t vl_actents; + vnd_ioc_info_t *vl_ents; +} vnd_ioc_list_t; + +#ifdef _KERNEL + +typedef struct vnd_ioc_list32 { + uint_t vl_nents; + uint_t vl_actents; + caddr32_t vl_ents; +} vnd_ioc_list32_t; + +#endif /* _KERNEL */ + +#define VND_IOC_LIST (VND_IOC | 0x20) + +/* + * Framed I/O ioctls + * + * Users should use the standard frameio_t as opposed to a vnd specific type. + * This is a consolidation private ioctl pending further stability in the form of + * specific system work.
+ */ +#define VND_IOC_FRAMEIO_READ (VND_IOC | 0x30) +#define VND_IOC_FRAMEIO_WRITE (VND_IOC | 0x31) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VND_H */ diff --git a/usr/src/uts/common/sys/vnd_errno.h b/usr/src/uts/common/sys/vnd_errno.h new file mode 100644 index 0000000000..89e5fc2543 --- /dev/null +++ b/usr/src/uts/common/sys/vnd_errno.h @@ -0,0 +1,72 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014 Joyent, Inc. All rights reserved. + */ + +#ifndef _SYS_VND_ERRNO_H +#define _SYS_VND_ERRNO_H + +/* + * This header contains all of the available vnd errors. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum vnd_errno { + VND_E_SUCCESS = 0, /* no error */ + VND_E_NOMEM, /* no memory */ + VND_E_NODATALINK, /* no such datalink */ + VND_E_NOTETHER, /* not DL_ETHER */ + VND_E_DLPIINVAL, /* Unknown DLPI failures */ + VND_E_ATTACHFAIL, /* DL_ATTACH_REQ failed */ + VND_E_BINDFAIL, /* DL_BIND_REQ failed */ + VND_E_PROMISCFAIL, /* DL_PROMISCON_REQ failed */ + VND_E_DIRECTFAIL, /* DLD_CAPAB_DIRECT enable failed */ + VND_E_CAPACKINVAL, /* bad dl_capability_ack_t */ + VND_E_SUBCAPINVAL, /* bad dl_capability_sub_t */ + VND_E_DLDBADVERS, /* bad dld version */ + VND_E_KSTATCREATE, /* failed to create kstats */ + VND_E_NODEV, /* no such vnd link */ + VND_E_NONETSTACK, /* netstack doesn't exist */ + VND_E_ASSOCIATED, /* device already associated */ + VND_E_ATTACHED, /* device already attached */ + VND_E_LINKED, /* device already linked */ + VND_E_BADNAME, /* invalid name */ + VND_E_PERM, /* can't touch this */ + VND_E_NOZONE, /* no such zone */ + VND_E_STRINIT, /* 
failed to initialize vnd stream module */ + VND_E_NOTATTACHED, /* device not attached */ + VND_E_NOTLINKED, /* device not linked */ + VND_E_LINKEXISTS, /* another device has the same link name */ + VND_E_MINORNODE, /* failed to create minor node */ + VND_E_BUFTOOBIG, /* requested buffer size is too large */ + VND_E_BUFTOOSMALL, /* requested buffer size is too small */ + VND_E_DLEXCL, /* unable to get dlpi excl access */ + VND_E_DIRECTNOTSUP, + /* DLD direct capability not supported over data link */ + VND_E_BADPROPSIZE, /* invalid property size */ + VND_E_BADPROP, /* invalid property */ + VND_E_PROPRDONLY, /* property is read only */ + VND_E_SYS, /* unexpected system error */ + VND_E_CAPABPASS, + /* capabilities invalid, pass-through module detected */ + VND_E_UNKNOWN /* unknown error */ +} vnd_errno_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VND_ERRNO_H */ diff --git a/usr/src/uts/common/sys/vnic_impl.h b/usr/src/uts/common/sys/vnic_impl.h index 1a91158da6..4c8d49c621 100644 --- a/usr/src/uts/common/sys/vnic_impl.h +++ b/usr/src/uts/common/sys/vnic_impl.h @@ -21,7 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. - * Copyright 2015 Joyent, Inc. + * Copyright 2018 Joyent, Inc. */ #ifndef _SYS_VNIC_IMPL_H @@ -64,6 +64,7 @@ typedef struct vnic_s { mac_notify_handle_t vn_mnh; uint32_t vn_hcksum_txflags; + mac_capab_lso_t vn_cap_lso; uint32_t vn_mtu; link_state_t vn_ls; } vnic_t; diff --git a/usr/src/uts/common/sys/vnode.h b/usr/src/uts/common/sys/vnode.h index b48db0afd6..494264731b 100644 --- a/usr/src/uts/common/sys/vnode.h +++ b/usr/src/uts/common/sys/vnode.h @@ -21,7 +21,7 @@ /* * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2017, Joyent, Inc. + * Copyright (c) 2018, Joyent, Inc. * Copyright (c) 2011, 2017 by Delphix. All rights reserved. * Copyright 2017 RackTop Systems.
*/ @@ -812,12 +812,14 @@ typedef enum vnevent { VE_RMDIR = 4, /* Remove of directory vnode's name */ VE_CREATE = 5, /* Create with vnode's name which exists */ VE_LINK = 6, /* Link with vnode's name as source */ - VE_RENAME_DEST_DIR = 7, /* Rename with vnode as target dir */ + VE_RENAME_DEST_DIR = 7, /* Rename with vnode as target dir */ VE_MOUNTEDOVER = 8, /* File or Filesystem got mounted over vnode */ VE_TRUNCATE = 9, /* Truncate */ VE_PRE_RENAME_SRC = 10, /* Pre-rename, with vnode as source */ VE_PRE_RENAME_DEST = 11, /* Pre-rename, with vnode as target/dest. */ - VE_PRE_RENAME_DEST_DIR = 12 /* Pre-rename with vnode as target dir */ + VE_PRE_RENAME_DEST_DIR = 12, /* Pre-rename with vnode as target dir */ + VE_RENAME_SRC_DIR = 13, /* Rename with vnode as source dir */ + VE_RESIZE = 14 /* Resize/truncate to non-zero offset */ } vnevent_t; /* @@ -1377,7 +1379,8 @@ void vnevent_remove(vnode_t *, vnode_t *, char *, caller_context_t *); void vnevent_rmdir(vnode_t *, vnode_t *, char *, caller_context_t *); void vnevent_create(vnode_t *, caller_context_t *); void vnevent_link(vnode_t *, caller_context_t *); -void vnevent_rename_dest_dir(vnode_t *, caller_context_t *ct); +void vnevent_rename_dest_dir(vnode_t *, vnode_t *, char *, + caller_context_t *ct); void vnevent_mountedover(vnode_t *, caller_context_t *); void vnevent_truncate(vnode_t *, caller_context_t *); int vnevent_support(vnode_t *, caller_context_t *); @@ -1387,6 +1390,7 @@ void vnevent_pre_rename_dest(vnode_t *, vnode_t *, char *, caller_context_t *); void vnevent_pre_rename_dest_dir(vnode_t *, vnode_t *, char *, caller_context_t *); +void vnevent_resize(vnode_t *, caller_context_t *); /* Vnode specific data */ void vsd_create(uint_t *, void (*)(void *)); @@ -1489,6 +1493,7 @@ extern struct vnode kvps[]; typedef enum { KV_KVP, /* vnode for all segkmem pages */ KV_ZVP, /* vnode for all ZFS pages */ + KV_VVP, /* vnode for all VMM pages */ #if defined(__sparc) KV_MPVP, /* vnode for all page_t meta-pages */ 
KV_PROMVP, /* vnode for all PROM pages */ diff --git a/usr/src/uts/common/sys/vxlan.h b/usr/src/uts/common/sys/vxlan.h new file mode 100644 index 0000000000..d87786b507 --- /dev/null +++ b/usr/src/uts/common/sys/vxlan.h @@ -0,0 +1,47 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#ifndef _SYS_VXLAN_H +#define _SYS_VXLAN_H + +/* + * Common VXLAN information + */ + +#include <sys/inttypes.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* Sizes in bytes */ +#define VXLAN_HDR_LEN 8 +#define VXLAN_ID_LEN 3 + +#define VXLAN_F_VDI 0x08000000 +#define VXLAN_ID_SHIFT 8 + +#pragma pack(1) +typedef struct vxlan_hdr { + uint32_t vxlan_flags; + uint32_t vxlan_id; +} vxlan_hdr_t; +#pragma pack() + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VXLAN_H */ diff --git a/usr/src/uts/common/sys/zfd.h b/usr/src/uts/common/sys/zfd.h new file mode 100644 index 0000000000..e08d75ecba --- /dev/null +++ b/usr/src/uts/common/sys/zfd.h @@ -0,0 +1,78 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ +/* + * Copyright 2015 Joyent, Inc. 
+ */ + +#ifndef _SYS_ZFD_H +#define _SYS_ZFD_H + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Minor node name of the global zone side (often called the "master" side) + * of the zfd dev. + */ +#define ZFD_MASTER_NAME "master" + +/* + * Minor node name of the non-global zone side (often called the "slave" + * side) of the zfd dev. + */ +#define ZFD_SLAVE_NAME "slave" + +#define ZFD_NAME_LEN 16 + +/* + * ZFD_IOC forms the base for all zfd ioctls. + */ +#define ZFD_IOC (('Z' << 24) | ('f' << 16) | ('d' << 8)) + +/* + * This ioctl tells the slave side it should push the TTY stream modules + * so that the fd looks like a tty. + */ +#define ZFD_MAKETTY (ZFD_IOC | 0) + +/* + * This ioctl puts a hangup into the stream so that the slave side sees EOF. + */ +#define ZFD_EOF (ZFD_IOC | 1) + +/* + * This ioctl succeeds if the slave side is open. + */ +#define ZFD_HAS_SLAVE (ZFD_IOC | 2) + +/* + * This ioctl links two streams into a multiplexer configuration for in-zone + * logging. + */ +#define ZFD_MUX (ZFD_IOC | 3) + +/* + * This ioctl controls the flow control setting for the log multiplexer stream + * (1 = true, 0 = false). The default is false which implies teeing into the + * log stream is "best-effort" but data will be discarded if the stream + * becomes full. If set and the log stream begins to fill up, the primary + * stream will stop flowing. 
+ */ +#define ZFD_MUX_FLOWCON (ZFD_IOC | 4) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZFD_H */ diff --git a/usr/src/uts/common/sys/zone.h b/usr/src/uts/common/sys/zone.h index 9eeb9a0db8..976841fae0 100644 --- a/usr/src/uts/common/sys/zone.h +++ b/usr/src/uts/common/sys/zone.h @@ -49,6 +49,7 @@ #include <sys/socket_impl.h> #include <sys/secflags.h> #include <sys/cpu_uarray.h> +#include <sys/nvpair.h> #include <sys/list.h> #include <sys/loadavg.h> #endif /* _KERNEL */ @@ -61,15 +62,27 @@ extern "C" { * NOTE * * The contents of this file are private to the implementation of - * Solaris and are subject to change at any time without notice. + * illumos and are subject to change at any time without notice. * Applications and drivers using these interfaces may fail to * run on future releases. */ /* Available both in kernel and for user space */ -/* zone id restrictions and special ids */ -#define MAX_ZONEID 9999 +/* + * zone id restrictions and special ids. + * See 'maxzones' for run-time zone limit. + * + * The current 8k value for MAX_ZONES was originally derived from the virtual + * interface limit in IP when "shared-stack" was the only supported networking + * for zones. The virtual interface limit is the number of addresses allowed + * on an interface (see MAX_ADDRS_PER_IF). Even with exclusive stacks, an 8k + * zone limit is still a reasonable choice at this time, given other limits + * within the kernel. Since we only support 8192 zones (which includes GZ), + * there is no point in allowing MAX_ZONEID > 8k. 
+ */ +#define MAX_ZONES 8192 +#define MAX_ZONEID (MAX_ZONES - 1) #define MIN_USERZONEID 1 /* lowest user-creatable zone ID */ #define MIN_ZONEID 0 /* minimum zone ID on system */ #define GLOBAL_ZONEID 0 @@ -108,14 +121,18 @@ extern "C" { #define ZONE_ATTR_INITNAME 9 #define ZONE_ATTR_BOOTARGS 10 #define ZONE_ATTR_BRAND 11 -#define ZONE_ATTR_PHYS_MCAP 12 -#define ZONE_ATTR_SCHED_CLASS 13 -#define ZONE_ATTR_FLAGS 14 -#define ZONE_ATTR_HOSTID 15 -#define ZONE_ATTR_FS_ALLOWED 16 -#define ZONE_ATTR_NETWORK 17 -#define ZONE_ATTR_INITNORESTART 20 +#define ZONE_ATTR_SCHED_CLASS 12 +#define ZONE_ATTR_FLAGS 13 +#define ZONE_ATTR_HOSTID 14 +#define ZONE_ATTR_FS_ALLOWED 15 +#define ZONE_ATTR_NETWORK 16 +#define ZONE_ATTR_DID 17 +#define ZONE_ATTR_INITNORESTART 18 +#define ZONE_ATTR_APP_SVC_CT 19 +#define ZONE_ATTR_SCHED_FIXEDHI 20 #define ZONE_ATTR_SECFLAGS 21 +#define ZONE_ATTR_INITRESTART0 22 +#define ZONE_ATTR_INITREBOOT 23 /* Start of the brand-specific attribute namespace */ #define ZONE_ATTR_BRAND_ATTRS 32768 @@ -131,13 +148,18 @@ extern "C" { #define ZONE_EVENT_READY "ready" #define ZONE_EVENT_RUNNING "running" #define ZONE_EVENT_SHUTTING_DOWN "shutting_down" +#define ZONE_EVENT_FREE "free" #define ZONE_CB_NAME "zonename" #define ZONE_CB_NEWSTATE "newstate" #define ZONE_CB_OLDSTATE "oldstate" +#define ZONE_CB_RESTARTS "restarts" #define ZONE_CB_TIMESTAMP "when" #define ZONE_CB_ZONEID "zoneid" +#define ZONE_EVENT_INIT_CLASS "init" +#define ZONE_EVENT_INIT_RESTART_SC "restart" + /* * Exit values that may be returned by scripts or programs invoked by various * zone commands. 
@@ -196,6 +218,7 @@ typedef struct { uint32_t doi; /* DOI for label */ caddr32_t label; /* label associated with zone */ int flags; + zoneid_t zoneid; /* requested zoneid */ } zone_def32; #endif typedef struct { @@ -212,6 +235,7 @@ typedef struct { uint32_t doi; /* DOI for label */ const bslabel_t *label; /* label associated with zone */ int flags; + zoneid_t zoneid; /* requested zoneid */ } zone_def; /* extended error information */ @@ -236,7 +260,8 @@ typedef enum { ZONE_IS_EMPTY, ZONE_IS_DOWN, ZONE_IS_DYING, - ZONE_IS_DEAD + ZONE_IS_DEAD, + ZONE_IS_FREE /* transient state for zone sysevent */ } zone_status_t; #define ZONE_MIN_STATE ZONE_IS_UNINITIALIZED #define ZONE_MAX_STATE ZONE_IS_DEAD @@ -256,9 +281,12 @@ typedef enum zone_cmd { typedef struct zone_cmd_arg { uint64_t uniqid; /* unique "generation number" */ zone_cmd_t cmd; /* requested action */ - uint32_t _pad; /* need consistent 32/64 bit alignmt */ + int status; /* init status on shutdown */ + uint32_t debug; /* enable brand hook debug */ char locale[MAXPATHLEN]; /* locale in which to render messages */ char bootbuf[BOOTARGS_MAX]; /* arguments passed to zone_boot() */ + /* Needed for 32/64 zoneadm -> zoneadmd door arg size check. 
*/ + int pad; } zone_cmd_arg_t; /* @@ -386,7 +414,7 @@ typedef struct zone_dataset { } zone_dataset_t; /* - * structure for zone kstats + * structure for rctl zone kstats */ typedef struct zone_kstat { kstat_named_t zk_zonename; @@ -397,12 +425,57 @@ typedef struct zone_kstat { struct cpucap; typedef struct { + hrtime_t cycle_start; + uint_t cycle_cnt; + hrtime_t zone_avg_cnt; +} sys_zio_cntr_t; + +typedef struct { + kstat_named_t zv_zonename; + kstat_named_t zv_nread; + kstat_named_t zv_reads; + kstat_named_t zv_rtime; + kstat_named_t zv_rlentime; + kstat_named_t zv_rcnt; + kstat_named_t zv_nwritten; + kstat_named_t zv_writes; + kstat_named_t zv_wtime; + kstat_named_t zv_wlentime; + kstat_named_t zv_wcnt; + kstat_named_t zv_10ms_ops; + kstat_named_t zv_100ms_ops; + kstat_named_t zv_1s_ops; + kstat_named_t zv_10s_ops; + kstat_named_t zv_delay_cnt; + kstat_named_t zv_delay_time; +} zone_vfs_kstat_t; + +typedef struct { + kstat_named_t zz_zonename; + kstat_named_t zz_nread; + kstat_named_t zz_reads; + kstat_named_t zz_rtime; + kstat_named_t zz_rlentime; + kstat_named_t zz_nwritten; + kstat_named_t zz_writes; + kstat_named_t zz_waittime; +} zone_zfs_kstat_t; + +typedef struct { kstat_named_t zm_zonename; + kstat_named_t zm_rss; + kstat_named_t zm_phys_cap; + kstat_named_t zm_swap; + kstat_named_t zm_swap_cap; + kstat_named_t zm_nover; + kstat_named_t zm_pagedout; kstat_named_t zm_pgpgin; kstat_named_t zm_anonpgin; kstat_named_t zm_execpgin; kstat_named_t zm_fspgin; kstat_named_t zm_anon_alloc_fail; + kstat_named_t zm_pf_throttle; + kstat_named_t zm_pf_throttle_usec; } zone_mcap_kstat_t; typedef struct { @@ -417,8 +490,10 @@ typedef struct { kstat_named_t zm_ffnoproc; kstat_named_t zm_ffnomem; kstat_named_t zm_ffmisc; + kstat_named_t zm_mfseglim; kstat_named_t zm_nested_intp; kstat_named_t zm_init_pid; + kstat_named_t zm_init_restarts; kstat_named_t zm_boot_time; } zone_misc_kstat_t; @@ -461,6 +536,7 @@ typedef struct zone { */ list_node_t zone_linkage; zoneid_t 
zone_id; /* ID of zone */ + zoneid_t zone_did; /* persistent debug ID of zone */ uint_t zone_ref; /* count of zone_hold()s on zone */ uint_t zone_cred_ref; /* count of zone_hold_cred()s on zone */ /* @@ -513,10 +589,11 @@ typedef struct zone { kcondvar_t zone_cv; /* used to signal state changes */ struct proc *zone_zsched; /* Dummy kernel "zsched" process */ pid_t zone_proc_initpid; /* pid of "init" for this zone */ - char *zone_initname; /* fs path to 'init' */ + uint_t zone_proc_init_restarts; /* times init restarted */ + char *zone_initname; /* fs path to 'init' */ + int zone_init_status; /* init's exit status */ int zone_boot_err; /* for zone_boot() if boot fails */ char *zone_bootargs; /* arguments passed via zone_boot() */ - uint64_t zone_phys_mcap; /* physical memory cap */ /* * zone_kthreads is protected by zone_status_lock. */ @@ -554,9 +631,13 @@ typedef struct zone { tsol_mlp_list_t zone_mlps; /* MLPs on zone-private addresses */ boolean_t zone_restart_init; /* Restart init if it dies? */ + boolean_t zone_reboot_on_init_exit; /* Reboot if init dies? */ + boolean_t zone_restart_init_0; /* Restart only if it exits 0 */ + boolean_t zone_setup_app_contract; /* setup contract? */ struct brand *zone_brand; /* zone's brand */ void *zone_brand_data; /* store brand specific data */ id_t zone_defaultcid; /* dflt scheduling class id */ + boolean_t zone_fixed_hipri; /* fixed sched. hi prio */ kstat_t *zone_swapresv_kstat; kstat_t *zone_lockedmem_kstat; /* @@ -565,8 +646,24 @@ typedef struct zone { list_t zone_dl_list; netstack_t *zone_netstack; struct cpucap *zone_cpucap; /* CPU caps data */ + + /* + * kstats and counters for VFS ops and bytes. + */ + kmutex_t zone_vfs_lock; /* protects VFS statistics */ + kstat_t *zone_vfs_ksp; + kstat_io_t zone_vfs_rwstats; + zone_vfs_kstat_t *zone_vfs_stats; + + /* + * kstats for ZFS I/O ops and bytes. 
+ */ + kmutex_t zone_zfs_lock; /* protects ZFS statistics */ + kstat_t *zone_zfs_ksp; + zone_zfs_kstat_t *zone_zfs_stats; + /* - * Solaris Auditing per-zone audit context + * illumos Auditing per-zone audit context */ struct au_kcontext *zone_audit_kctxt; /* @@ -583,7 +680,11 @@ typedef struct zone { /* zone_rctls->rcs_lock */ kstat_t *zone_nprocs_kstat; - kmutex_t zone_mcap_lock; /* protects mcap statistics */ + /* + * kstats and counters for physical memory capping. + */ + kstat_t *zone_physmem_kstat; + kmutex_t zone_mcap_lock; /* protects mcap statistics */ kstat_t *zone_mcap_ksp; zone_mcap_kstat_t *zone_mcap_stats; uint64_t zone_pgpgin; /* pages paged in */ @@ -608,6 +709,8 @@ typedef struct zone { uint32_t zone_ffnomem; /* as_dup/memory error */ uint32_t zone_ffmisc; /* misc. other error */ + uint32_t zone_mfseglim; /* map failure (# segs limit) */ + uint32_t zone_nested_intp; /* nested interp. kstat */ struct loadavg_s zone_loadavg; /* loadavg for this zone */ @@ -635,6 +738,53 @@ typedef struct zone { } zone_t; /* + * Data and counters used for ZFS fair-share disk IO. + */ +typedef struct zone_zfs_io { + uint16_t zpers_zfs_io_pri; /* ZFS IO priority - 16k max */ + uint_t zpers_zfs_queued[2]; /* sync I/O enqueued count */ + sys_zio_cntr_t zpers_rd_ops; /* Counters for ZFS reads, */ + sys_zio_cntr_t zpers_wr_ops; /* writes, and */ + sys_zio_cntr_t zpers_lwr_ops; /* logical writes. */ + kstat_io_t zpers_zfs_rwstats; + uint64_t zpers_io_util; /* IO utilization metric */ + uint64_t zpers_zfs_rd_waittime; + uint8_t zpers_io_delay; /* IO delay on logical r/w */ + uint8_t zpers_zfs_weight; /* used to prevent starvation */ + uint8_t zpers_io_util_above_avg; /* IO util percent > avg. */ +} zone_zfs_io_t; + +/* + * "Persistent" zone data which can be accessed independently of the zone_t.
+ */ +typedef struct zone_persist { + kmutex_t zpers_zfs_lock; /* Protects zpers_zfsp references */ + zone_zfs_io_t *zpers_zfsp; /* ZFS fair-share IO data */ + uint8_t zpers_over; /* currently over cap */ + uint32_t zpers_pg_cnt; /* current RSS in pages */ + uint32_t zpers_pg_limit; /* current RSS limit in pages */ + uint32_t zpers_nover; /* # of times over phys. cap */ +#ifndef DEBUG + uint64_t zpers_pg_out; /* # pages flushed */ +#else + /* + * To conserve memory, some detailed kstats are only kept for DEBUG + * builds. + */ + uint64_t zpers_zfs_rd_waittime; + + uint64_t zpers_pg_anon; /* # clean anon pages flushed */ + uint64_t zpers_pg_anondirty; /* # dirty anon pages flushed */ + uint64_t zpers_pg_fs; /* # clean fs pages flushed */ + uint64_t zpers_pg_fsdirty; /* # dirty fs pages flushed */ +#endif +} zone_persist_t; + +typedef enum zone_pageout_op { + ZPO_DIRTY, ZPO_FS, ZPO_ANON, ZPO_ANONDIRTY +} zone_pageout_op_t; + +/* * Special value of zone_psetid to indicate that pools are disabled. */ #define ZONE_PS_INVAL PS_MYID @@ -664,6 +814,7 @@ extern zone_t *zone_find_by_name(char *); extern zone_t *zone_find_by_any_path(const char *, boolean_t); extern zone_t *zone_find_by_path(const char *); extern zoneid_t getzoneid(void); +extern zoneid_t getzonedid(void); extern zone_t *zone_find_by_id_nolock(zoneid_t); extern int zone_datalink_walk(zoneid_t, int (*)(datalink_id_t, void *), void *); extern int zone_check_datalink(zoneid_t *, datalink_id_t); @@ -844,6 +995,7 @@ extern int zone_ncpus_online_get(zone_t *); * Returns true if the named pool/dataset is visible in the current zone.
*/ extern int zone_dataset_visible(const char *, int *); +extern int zone_dataset_visible_inzone(zone_t *, const char *, int *); /* * zone version of kadmin() @@ -856,10 +1008,25 @@ extern void mount_completed(zone_t *); extern int zone_walk(int (*)(zone_t *, void *), void *); +struct page; +extern void zone_add_page(struct page *); +extern void zone_rm_page(struct page *); +extern void zone_pageout_stat(int, zone_pageout_op_t); +extern void zone_get_physmem_data(int, pgcnt_t *, pgcnt_t *); + +/* Interfaces for page scanning */ +extern uint_t zone_num_over_cap; +extern zone_persist_t zone_pdata[MAX_ZONES]; + extern rctl_hndl_t rc_zone_locked_mem; extern rctl_hndl_t rc_zone_max_swap; +extern rctl_hndl_t rc_zone_phys_mem; extern rctl_hndl_t rc_zone_max_lofi; +/* For publishing sysevents related to a particular zone */ +extern void zone_sysevent_publish(zone_t *, const char *, const char *, + nvlist_t *); + #endif /* _KERNEL */ #ifdef __cplusplus |
