Diffstat (limited to 'usr/src/uts/common/inet/tcp/tcp.c')
-rw-r--r-- | usr/src/uts/common/inet/tcp/tcp.c | 3019 |
1 file changed, 1920 insertions, 1099 deletions
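Most of this change mechanically converts TCP's formerly global state — the kstat counter array, ndd tunables, the bind/acceptor hash fanouts, the default queue tcp_g_q — into fields of a per-netstack tcp_stack_t, threading a tcps pointer through functions and macros. Before the hunks, here is a minimal sketch of the shape of the counter conversion; all *_sim names are hypothetical stand-ins for illustration only, while the real macro, as the hunks below show, is TCP_DBGSTAT(tcps, x) and bumps a kstat ui64 with atomic_add_64().

/*
 * Illustration only: the shape of the per-stack counter conversion.
 * All *_sim names are hypothetical; the real TCP_DBGSTAT(tcps, x)
 * does atomic_add_64() on tcps->tcps_statistics.x.value.ui64.
 */
#include <stdint.h>
#include <stdio.h>

typedef struct {
	volatile uint64_t value;	/* stands in for a kstat value.ui64 */
} counter_sim_t;

typedef struct tcp_stack_sim {
	counter_sim_t tcps_time_wait;
	counter_sim_t tcps_ip_output;
	/* ...one field per counter named in the tcp_kstat2_init() template */
} tcp_stack_sim_t;

/* Before: TCP_DBGSTAT(x) bumped one global array; after: per-stack. */
#define	TCP_DBGSTAT_SIM(tcps, x)	((tcps)->x.value++)

int
main(void)
{
	tcp_stack_sim_t stack_a = { 0 }, stack_b = { 0 };

	/* Each netstack instance now accumulates its own statistics. */
	TCP_DBGSTAT_SIM(&stack_a, tcps_ip_output);
	TCP_DBGSTAT_SIM(&stack_b, tcps_ip_output);
	TCP_DBGSTAT_SIM(&stack_b, tcps_ip_output);
	(void) printf("a=%llu b=%llu\n",
	    (unsigned long long)stack_a.tcps_ip_output.value,
	    (unsigned long long)stack_b.tcps_ip_output.value);
	return (0);
}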
diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c index f55afe25f6..3c7ec52f22 100644 --- a/usr/src/uts/common/inet/tcp/tcp.c +++ b/usr/src/uts/common/inet/tcp/tcp.c @@ -57,6 +57,7 @@ const char tcp_version[] = "%Z%%M% %I% %E% SMI"; #include <sys/policy.h> #include <sys/priv.h> #include <sys/zone.h> +#include <sys/sunldi.h> #include <sys/errno.h> #include <sys/signal.h> @@ -154,7 +155,7 @@ const char tcp_version[] = "%Z%%M% %I% %E% SMI"; * * Opening a new connection: * - * The outgoing connection open is pretty simple. ip_tcpopen() does the + * The outgoing connection open is pretty simple. tcp_open() does the * work in creating the conn/tcp structure and initializing it. The * squeue assignment is done based on the CPU the application * is running on. So for outbound connections, processing is always done @@ -241,7 +242,7 @@ extern major_t TCP6_MAJ; * 2: squeue_enter * 3: squeue_fill */ -int tcp_squeue_close = 2; +int tcp_squeue_close = 2; /* Setable in /etc/system */ int tcp_squeue_wput = 2; squeue_func_t tcp_squeue_close_proc; @@ -280,7 +281,8 @@ int tcp_tx_pull_len = 16; * How to add new counters. * * 1) Add a field in the tcp_stat structure describing your counter. - * 2) Add a line in tcp_statistics with the name of the counter. + * 2) Add a line in the template in tcp_kstat2_init() with the name + * of the counter. * * IMPORTANT!! - make sure that both are in sync !! * 3) Use either TCP_STAT or TCP_DBGSTAT with the name. @@ -320,119 +322,33 @@ static uint_t tcp_clean_death_stat[TCP_MAX_CLEAN_DEATH_TAG]; #endif #if TCP_DEBUG_COUNTER -#define TCP_DBGSTAT(x) atomic_add_64(&(tcp_statistics.x.value.ui64), 1) +#define TCP_DBGSTAT(tcps, x) \ + atomic_add_64(&((tcps)->tcps_statistics.x.value.ui64), 1) +#define TCP_G_DBGSTAT(x) \ + atomic_add_64(&(tcp_g_statistics.x.value.ui64), 1) #elif defined(lint) -#define TCP_DBGSTAT(x) ASSERT(_lint_dummy_ == 0); +#define TCP_DBGSTAT(tcps, x) ASSERT(_lint_dummy_ == 0); +#define TCP_G_DBGSTAT(x) ASSERT(_lint_dummy_ == 0); #else -#define TCP_DBGSTAT(x) +#define TCP_DBGSTAT(tcps, x) +#define TCP_G_DBGSTAT(x) #endif -tcp_stat_t tcp_statistics = { - { "tcp_time_wait", KSTAT_DATA_UINT64 }, - { "tcp_time_wait_syn", KSTAT_DATA_UINT64 }, - { "tcp_time_wait_success", KSTAT_DATA_UINT64 }, - { "tcp_time_wait_fail", KSTAT_DATA_UINT64 }, - { "tcp_reinput_syn", KSTAT_DATA_UINT64 }, - { "tcp_ip_output", KSTAT_DATA_UINT64 }, - { "tcp_detach_non_time_wait", KSTAT_DATA_UINT64 }, - { "tcp_detach_time_wait", KSTAT_DATA_UINT64 }, - { "tcp_time_wait_reap", KSTAT_DATA_UINT64 }, - { "tcp_clean_death_nondetached", KSTAT_DATA_UINT64 }, - { "tcp_reinit_calls", KSTAT_DATA_UINT64 }, - { "tcp_eager_err1", KSTAT_DATA_UINT64 }, - { "tcp_eager_err2", KSTAT_DATA_UINT64 }, - { "tcp_eager_blowoff_calls", KSTAT_DATA_UINT64 }, - { "tcp_eager_blowoff_q", KSTAT_DATA_UINT64 }, - { "tcp_eager_blowoff_q0", KSTAT_DATA_UINT64 }, - { "tcp_not_hard_bound", KSTAT_DATA_UINT64 }, - { "tcp_no_listener", KSTAT_DATA_UINT64 }, - { "tcp_found_eager", KSTAT_DATA_UINT64 }, - { "tcp_wrong_queue", KSTAT_DATA_UINT64 }, - { "tcp_found_eager_binding1", KSTAT_DATA_UINT64 }, - { "tcp_found_eager_bound1", KSTAT_DATA_UINT64 }, - { "tcp_eager_has_listener1", KSTAT_DATA_UINT64 }, - { "tcp_open_alloc", KSTAT_DATA_UINT64 }, - { "tcp_open_detached_alloc", KSTAT_DATA_UINT64 }, - { "tcp_rput_time_wait", KSTAT_DATA_UINT64 }, - { "tcp_listendrop", KSTAT_DATA_UINT64 }, - { "tcp_listendropq0", KSTAT_DATA_UINT64 }, - { "tcp_wrong_rq", KSTAT_DATA_UINT64 }, - { "tcp_rsrv_calls", KSTAT_DATA_UINT64 }, 
- { "tcp_eagerfree2", KSTAT_DATA_UINT64 }, - { "tcp_eagerfree3", KSTAT_DATA_UINT64 }, - { "tcp_eagerfree4", KSTAT_DATA_UINT64 }, - { "tcp_eagerfree5", KSTAT_DATA_UINT64 }, - { "tcp_timewait_syn_fail", KSTAT_DATA_UINT64 }, - { "tcp_listen_badflags", KSTAT_DATA_UINT64 }, - { "tcp_timeout_calls", KSTAT_DATA_UINT64 }, - { "tcp_timeout_cached_alloc", KSTAT_DATA_UINT64 }, - { "tcp_timeout_cancel_reqs", KSTAT_DATA_UINT64 }, - { "tcp_timeout_canceled", KSTAT_DATA_UINT64 }, - { "tcp_timermp_alloced", KSTAT_DATA_UINT64 }, - { "tcp_timermp_freed", KSTAT_DATA_UINT64 }, - { "tcp_timermp_allocfail", KSTAT_DATA_UINT64 }, - { "tcp_timermp_allocdblfail", KSTAT_DATA_UINT64 }, - { "tcp_push_timer_cnt", KSTAT_DATA_UINT64 }, - { "tcp_ack_timer_cnt", KSTAT_DATA_UINT64 }, - { "tcp_ire_null1", KSTAT_DATA_UINT64 }, - { "tcp_ire_null", KSTAT_DATA_UINT64 }, - { "tcp_ip_send", KSTAT_DATA_UINT64 }, - { "tcp_ip_ire_send", KSTAT_DATA_UINT64 }, - { "tcp_wsrv_called", KSTAT_DATA_UINT64 }, - { "tcp_flwctl_on", KSTAT_DATA_UINT64 }, - { "tcp_timer_fire_early", KSTAT_DATA_UINT64 }, - { "tcp_timer_fire_miss", KSTAT_DATA_UINT64 }, - { "tcp_freelist_cleanup", KSTAT_DATA_UINT64 }, - { "tcp_rput_v6_error", KSTAT_DATA_UINT64 }, - { "tcp_out_sw_cksum", KSTAT_DATA_UINT64 }, - { "tcp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 }, - { "tcp_zcopy_on", KSTAT_DATA_UINT64 }, - { "tcp_zcopy_off", KSTAT_DATA_UINT64 }, - { "tcp_zcopy_backoff", KSTAT_DATA_UINT64 }, - { "tcp_zcopy_disable", KSTAT_DATA_UINT64 }, - { "tcp_mdt_pkt_out", KSTAT_DATA_UINT64 }, - { "tcp_mdt_pkt_out_v4", KSTAT_DATA_UINT64 }, - { "tcp_mdt_pkt_out_v6", KSTAT_DATA_UINT64 }, - { "tcp_mdt_discarded", KSTAT_DATA_UINT64 }, - { "tcp_mdt_conn_halted1", KSTAT_DATA_UINT64 }, - { "tcp_mdt_conn_halted2", KSTAT_DATA_UINT64 }, - { "tcp_mdt_conn_halted3", KSTAT_DATA_UINT64 }, - { "tcp_mdt_conn_resumed1", KSTAT_DATA_UINT64 }, - { "tcp_mdt_conn_resumed2", KSTAT_DATA_UINT64 }, - { "tcp_mdt_legacy_small", KSTAT_DATA_UINT64 }, - { "tcp_mdt_legacy_all", KSTAT_DATA_UINT64 }, - { "tcp_mdt_legacy_ret", KSTAT_DATA_UINT64 }, - { "tcp_mdt_allocfail", KSTAT_DATA_UINT64 }, - { "tcp_mdt_addpdescfail", KSTAT_DATA_UINT64 }, - { "tcp_mdt_allocd", KSTAT_DATA_UINT64 }, - { "tcp_mdt_linked", KSTAT_DATA_UINT64 }, - { "tcp_fusion_flowctl", KSTAT_DATA_UINT64 }, - { "tcp_fusion_backenabled", KSTAT_DATA_UINT64 }, - { "tcp_fusion_urg", KSTAT_DATA_UINT64 }, - { "tcp_fusion_putnext", KSTAT_DATA_UINT64 }, - { "tcp_fusion_unfusable", KSTAT_DATA_UINT64 }, - { "tcp_fusion_aborted", KSTAT_DATA_UINT64 }, - { "tcp_fusion_unqualified", KSTAT_DATA_UINT64 }, - { "tcp_fusion_rrw_busy", KSTAT_DATA_UINT64 }, - { "tcp_fusion_rrw_msgcnt", KSTAT_DATA_UINT64 }, - { "tcp_fusion_rrw_plugged", KSTAT_DATA_UINT64 }, - { "tcp_in_ack_unsent_drop", KSTAT_DATA_UINT64 }, - { "tcp_sock_fallback", KSTAT_DATA_UINT64 }, - { "tcp_lso_enabled", KSTAT_DATA_UINT64 }, - { "tcp_lso_disabled", KSTAT_DATA_UINT64 }, - { "tcp_lso_times", KSTAT_DATA_UINT64 }, - { "tcp_lso_pkt_out", KSTAT_DATA_UINT64 }, -}; +#define TCP_G_STAT(x) (tcp_g_statistics.x.value.ui64++) -static kstat_t *tcp_kstat; +tcp_g_stat_t tcp_g_statistics; +kstat_t *tcp_g_kstat; /* * Call either ip_output or ip_output_v6. This replaces putnext() calls on the * tcp write side. 
*/ #define CALL_IP_WPUT(connp, q, mp) { \ + tcp_stack_t *tcps; \ + \ + tcps = connp->conn_netstack->netstack_tcp; \ ASSERT(((q)->q_flag & QREADR) == 0); \ - TCP_DBGSTAT(tcp_ip_output); \ + TCP_DBGSTAT(tcps, tcp_ip_output); \ connp->conn_send(connp, (mp), (q), IP_WPUT); \ } @@ -464,15 +380,9 @@ static kstat_t *tcp_kstat; #define ISS_INCR 250000 #define ISS_NSEC_SHT 12 -static uint32_t tcp_iss_incr_extra; /* Incremented for each connection */ -static kmutex_t tcp_iss_key_lock; -static MD5_CTX tcp_iss_key; static sin_t sin_null; /* Zero address for quick clears */ static sin6_t sin6_null; /* Zero address for quick clears */ -/* Packet dropper for TCP IPsec policy drops. */ -static ipdropper_t tcp_dropper; - /* * This implementation follows the 4.3BSD interpretation of the urgent * pointer and not RFC 1122. Switching to RFC 1122 behavior would cause @@ -615,11 +525,15 @@ kmem_cache_t *tcp_iphc_cache; * The list manipulations (including tcp_time_wait_next/prev) * are protected by the tcp_time_wait_lock. The content of the * detached TIME_WAIT connections is protected by the normal perimeters. + * + * This list is per squeue and squeues are shared across the tcp_stack_t's. + * Things on tcp_time_wait_head remain associated with the tcp_stack_t + * and conn_netstack. + * The tcp_t's that are added to tcp_free_list are disassociated and + * have NULL tcp_tcps and conn_netstack pointers. */ - typedef struct tcp_squeue_priv_s { kmutex_t tcp_time_wait_lock; - /* Protects the next 3 globals */ timeout_id_t tcp_time_wait_tid; tcp_t *tcp_time_wait_head; tcp_t *tcp_time_wait_tail; @@ -832,13 +746,16 @@ static int tcp_tpistate(tcp_t *tcp); static void tcp_bind_hash_insert(tf_t *tf, tcp_t *tcp, int caller_holds_lock); static void tcp_bind_hash_remove(tcp_t *tcp); -static tcp_t *tcp_acceptor_hash_lookup(t_uscalar_t id); +static tcp_t *tcp_acceptor_hash_lookup(t_uscalar_t id, tcp_stack_t *); void tcp_acceptor_hash_insert(t_uscalar_t id, tcp_t *tcp); static void tcp_acceptor_hash_remove(tcp_t *tcp); static void tcp_capability_req(tcp_t *tcp, mblk_t *mp); static void tcp_info_req(tcp_t *tcp, mblk_t *mp); static void tcp_addr_req(tcp_t *tcp, mblk_t *mp); static void tcp_addr_req_ipv6(tcp_t *tcp, mblk_t *mp); +void tcp_g_q_setup(tcp_stack_t *); +void tcp_g_q_create(tcp_stack_t *); +void tcp_g_q_destroy(tcp_stack_t *); static int tcp_header_init_ipv4(tcp_t *tcp); static int tcp_header_init_ipv6(tcp_t *tcp); int tcp_init(tcp_t *tcp, queue_t *q); @@ -866,12 +783,13 @@ static void tcp_opt_reverse(tcp_t *tcp, ipha_t *ipha); static int tcp_opt_set_header(tcp_t *tcp, boolean_t checkonly, uchar_t *ptr, uint_t len); static int tcp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr); -static boolean_t tcp_param_register(tcpparam_t *tcppa, int cnt); +static boolean_t tcp_param_register(IDP *ndp, tcpparam_t *tcppa, int cnt, + tcp_stack_t *); static int tcp_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr); static int tcp_param_set_aligned(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr); -static void tcp_iss_key_init(uint8_t *phrase, int len); +static void tcp_iss_key_init(uint8_t *phrase, int len, tcp_stack_t *); static int tcp_1948_phrase_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr); static void tcp_process_shrunk_swnd(tcp_t *tcp, uint32_t shrunk_cnt); @@ -884,7 +802,7 @@ static void tcp_report_item(mblk_t *mp, tcp_t *tcp, int hashval, static uint_t tcp_rcv_drain(queue_t *q, tcp_t *tcp); static void tcp_sack_rxmit(tcp_t *tcp, uint_t *flags); -static boolean_t 
tcp_send_rst_chk(void); +static boolean_t tcp_send_rst_chk(tcp_stack_t *); static void tcp_ss_rexmit(tcp_t *tcp); static mblk_t *tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp); static void tcp_process_options(tcp_t *, tcph_t *); @@ -936,11 +854,11 @@ static void tcp_ack_timer(void *arg); static mblk_t *tcp_ack_mp(tcp_t *tcp); static void tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, uint32_t ack, int ctl, uint_t ip_hdr_len, - zoneid_t zoneid); + zoneid_t zoneid, tcp_stack_t *); static void tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, uint32_t ack, int ctl); -static tcp_hsp_t *tcp_hsp_lookup(ipaddr_t addr); -static tcp_hsp_t *tcp_hsp_lookup_ipv6(in6_addr_t *addr); +static tcp_hsp_t *tcp_hsp_lookup(ipaddr_t addr, tcp_stack_t *); +static tcp_hsp_t *tcp_hsp_lookup_ipv6(in6_addr_t *addr, tcp_stack_t *); static int setmaxps(queue_t *q, int maxpsz); static void tcp_set_rto(tcp_t *, time_t); static boolean_t tcp_check_policy(tcp_t *, mblk_t *, ipha_t *, ip6_t *, @@ -956,14 +874,14 @@ static void tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, boolean_t tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp); boolean_t tcp_reserved_port_add(int, in_port_t *, in_port_t *); boolean_t tcp_reserved_port_del(in_port_t, in_port_t); -boolean_t tcp_reserved_port_check(in_port_t); -static tcp_t *tcp_alloc_temp_tcp(in_port_t); +boolean_t tcp_reserved_port_check(in_port_t, tcp_stack_t *); +static tcp_t *tcp_alloc_temp_tcp(in_port_t, tcp_stack_t *); static int tcp_reserved_port_list(queue_t *, mblk_t *, caddr_t, cred_t *); static mblk_t *tcp_mdt_info_mp(mblk_t *); static void tcp_mdt_update(tcp_t *, ill_mdt_capab_t *, boolean_t); static int tcp_mdt_add_attrs(multidata_t *, const mblk_t *, const boolean_t, const uint32_t, const uint32_t, - const uint32_t, const uint32_t); + const uint32_t, const uint32_t, tcp_stack_t *); static void tcp_multisend_data(tcp_t *, ire_t *, const ill_t *, mblk_t *, const uint_t, const uint_t, boolean_t *); static mblk_t *tcp_lso_info_mp(mblk_t *); @@ -974,10 +892,15 @@ extern void tcp_timermp_free(tcp_t *); static void tcp_timer_free(tcp_t *tcp, mblk_t *mp); static void tcp_stop_lingering(tcp_t *tcp); static void tcp_close_linger_timeout(void *arg); -void tcp_ddi_init(void); -void tcp_ddi_destroy(void); -static void tcp_kstat_init(void); -static void tcp_kstat_fini(void); +static void *tcp_stack_init(netstackid_t stackid, netstack_t *ns); +static void tcp_stack_shutdown(netstackid_t stackid, void *arg); +static void tcp_stack_fini(netstackid_t stackid, void *arg); +static void *tcp_g_kstat_init(tcp_g_stat_t *); +static void tcp_g_kstat_fini(kstat_t *); +static void *tcp_kstat_init(netstackid_t, tcp_stack_t *); +static void tcp_kstat_fini(netstackid_t, kstat_t *); +static void *tcp_kstat2_init(netstackid_t, tcp_stat_t *); +static void tcp_kstat2_fini(netstackid_t, kstat_t *); static int tcp_kstat_update(kstat_t *kp, int rw); void tcp_reinput(conn_t *connp, mblk_t *mp, squeue_t *sqp); static int tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, @@ -1028,10 +951,10 @@ void tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2); static mblk_t *tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *, tcp_t *); static void tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *); static void tcp_ioctl_abort_handler(tcp_t *, mblk_t *); -static int tcp_ioctl_abort(tcp_ioc_abort_conn_t *); +static int tcp_ioctl_abort(tcp_ioc_abort_conn_t *, tcp_stack_t *tcps); static void tcp_ioctl_abort_conn(queue_t *, mblk_t *); static int 
tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *, int, int *, - boolean_t); + boolean_t, tcp_stack_t *); static struct module_info tcp_rinfo = { TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, TCP_RECV_HIWATER, TCP_RECV_LOWATER @@ -1096,49 +1019,11 @@ struct streamtab tcpinfo = { &tcp_rinit, &tcp_winit }; -extern squeue_func_t tcp_squeue_wput_proc; -extern squeue_func_t tcp_squeue_timer_proc; - -/* Protected by tcp_g_q_lock */ -static queue_t *tcp_g_q; /* Default queue used during detached closes */ -kmutex_t tcp_g_q_lock; - -/* Protected by tcp_hsp_lock */ -/* - * XXX The host param mechanism should go away and instead we should use - * the metrics associated with the routes to determine the default sndspace - * and rcvspace. - */ -static tcp_hsp_t **tcp_hsp_hash; /* Hash table for HSPs */ -krwlock_t tcp_hsp_lock; - -/* - * Extra privileged ports. In host byte order. - * Protected by tcp_epriv_port_lock. - */ -#define TCP_NUM_EPRIV_PORTS 64 -static int tcp_g_num_epriv_ports = TCP_NUM_EPRIV_PORTS; -static uint16_t tcp_g_epriv_ports[TCP_NUM_EPRIV_PORTS] = { 2049, 4045 }; -kmutex_t tcp_epriv_port_lock; - /* - * The smallest anonymous port in the privileged port range which TCP - * looks for free port. Use in the option TCP_ANONPRIVBIND. + * Have to ensure that tcp_g_q_close is not done by an + * interrupt thread. */ -static in_port_t tcp_min_anonpriv_port = 512; - -/* Only modified during _init and _fini thus no locking is needed. */ -static caddr_t tcp_g_nd; /* Head of 'named dispatch' variable list */ - -/* Hint not protected by any lock */ -static uint_t tcp_next_port_to_try; - - -/* TCP bind hash list - all tcp_t with state >= BOUND. */ -tf_t tcp_bind_fanout[TCP_BIND_FANOUT_SIZE]; - -/* TCP queue hash list - all tcp_t in case they will be an acceptor. */ -static tf_t tcp_acceptor_fanout[TCP_FANOUT_SIZE]; +static taskq_t *tcp_taskq; /* * TCP has a private interface for other kernel modules to reserve a @@ -1171,23 +1056,9 @@ typedef struct tcp_rport_s { tcp_t **temp_tcp_array; } tcp_rport_t; -/* The reserved port array. */ -static tcp_rport_t tcp_reserved_port[TCP_RESERVED_PORTS_ARRAY_MAX_SIZE]; - -/* Locks to protect the tcp_reserved_ports array. */ -static krwlock_t tcp_reserved_port_lock; - -/* The number of ranges in the array. */ -uint32_t tcp_reserved_port_array_size = 0; - -/* - * MIB-2 stuff for SNMP - * Note: tcpInErrs {tcp 15} is accumulated in ip.c - */ -mib2_tcp_t tcp_mib; /* SNMP fixed size info */ -kstat_t *tcp_mibkp; /* kstat exporting tcp_mib data */ - +/* Setable only in /etc/system. Move to ndd? */ boolean_t tcp_icmp_source_quench = B_FALSE; + /* * Following assumes TPI alignment requirements stay along 32 bit * boundaries @@ -1245,8 +1116,8 @@ static struct T_info_ack tcp_g_t_info_ack_v6 = { * tcp_wroff_xtra is the extra space in front of TCP/IP header for link * layer header. It has to be a multiple of 4. */ -static tcpparam_t tcp_wroff_xtra_param = { 0, 256, 32, "tcp_wroff_xtra" }; -#define tcp_wroff_xtra tcp_wroff_xtra_param.tcp_param_val +static tcpparam_t lcl_tcp_wroff_xtra_param = { 0, 256, 32, "tcp_wroff_xtra" }; +#define tcps_wroff_xtra tcps_wroff_xtra_param->tcp_param_val /* * All of these are alterable, within the min/max values given, at run time. @@ -1254,7 +1125,7 @@ static tcpparam_t tcp_wroff_xtra_param = { 0, 256, 32, "tcp_wroff_xtra" }; * per the TCP spec. 
*/ /* BEGIN CSTYLED */ -tcpparam_t tcp_param_arr[] = { +static tcpparam_t lcl_tcp_param_arr[] = { /*min max value name */ { 1*SECONDS, 10*MINUTES, 1*MINUTES, "tcp_time_wait_interval"}, { 1, PARAM_MAX, 128, "tcp_conn_req_max_q" }, @@ -1331,18 +1202,20 @@ tcpparam_t tcp_param_arr[] = { * each header fragment in the header buffer. Each parameter value has * to be a multiple of 4 (32-bit aligned). */ -static tcpparam_t tcp_mdt_head_param = { 32, 256, 32, "tcp_mdt_hdr_head_min" }; -static tcpparam_t tcp_mdt_tail_param = { 0, 256, 32, "tcp_mdt_hdr_tail_min" }; -#define tcp_mdt_hdr_head_min tcp_mdt_head_param.tcp_param_val -#define tcp_mdt_hdr_tail_min tcp_mdt_tail_param.tcp_param_val +static tcpparam_t lcl_tcp_mdt_head_param = + { 32, 256, 32, "tcp_mdt_hdr_head_min" }; +static tcpparam_t lcl_tcp_mdt_tail_param = + { 0, 256, 32, "tcp_mdt_hdr_tail_min" }; +#define tcps_mdt_hdr_head_min tcps_mdt_head_param->tcp_param_val +#define tcps_mdt_hdr_tail_min tcps_mdt_tail_param->tcp_param_val /* * tcp_mdt_max_pbufs is the upper limit value that tcp uses to figure out * the maximum number of payload buffers associated per Multidata. */ -static tcpparam_t tcp_mdt_max_pbufs_param = +static tcpparam_t lcl_tcp_mdt_max_pbufs_param = { 1, MULTIDATA_MAX_PBUFS, MULTIDATA_MAX_PBUFS, "tcp_mdt_max_pbufs" }; -#define tcp_mdt_max_pbufs tcp_mdt_max_pbufs_param.tcp_param_val +#define tcps_mdt_max_pbufs tcps_mdt_max_pbufs_param->tcp_param_val /* Round up the value to the nearest mss. */ #define MSS_ROUNDUP(value, mss) ((((value) - 1) / (mss) + 1) * (mss)) @@ -1373,14 +1246,6 @@ static tcpparam_t tcp_mdt_max_pbufs_param = #define DISP_PORT_ONLY 1 #define DISP_ADDR_AND_PORT 2 -/* - * This controls the rate some ndd info report functions can be used - * by non-privileged users. It stores the last time such info is - * requested. When those report functions are called again, this - * is checked with the current time and compare with the ndd param - * tcp_ndd_get_info_interval. - */ -static clock_t tcp_last_ndd_get_info_time = 0; #define NDD_TOO_QUICK_MSG \ "ndd get info rate too high for non-privileged users, try again " \ "later.\n" @@ -1389,17 +1254,6 @@ static clock_t tcp_last_ndd_get_info_time = 0; #define IS_VMLOANED_MBLK(mp) \ (((mp)->b_datap->db_struioflag & STRUIO_ZC) != 0) -/* - * These two variables control the rate for TCP to generate RSTs in - * response to segments not belonging to any connections. We limit - * TCP to sent out tcp_rst_sent_rate (ndd param) number of RSTs in - * each 1 second interval. This is to protect TCP against DoS attack. - */ -static clock_t tcp_last_rst_intrvl; -static uint32_t tcp_rst_cnt; - -/* The number of RST not sent because of the rate limit. */ -static uint32_t tcp_rst_unsent; /* Enable or disable b_cont M_MULTIDATA chaining for MDT. */ boolean_t tcp_mdt_chain = B_TRUE; @@ -1414,12 +1268,13 @@ uint_t tcp_mdt_smss_threshold = 1; uint32_t do_tcpzcopy = 1; /* 0: disable, 1: enable, 2: force */ /* - * Forces all connections to obey the value of the tcp_maxpsz_multiplier + * Forces all connections to obey the value of the tcps_maxpsz_multiplier * tunable settable via NDD. Otherwise, the per-connection behavior is * determined dynamically during tcp_adapt_ire(), which is the default. */ boolean_t tcp_static_maxpsz = B_FALSE; +/* Setable in /etc/system */ /* If set to 0, pick ephemeral port sequentially; otherwise randomly. 
*/ uint32_t tcp_random_anon_port = 1; @@ -1559,6 +1414,9 @@ extern uint32_t (*cl_inet_ipident)(uint8_t protocol, sa_family_t addr_family, */ int cl_tcp_walk_list(int (*callback)(cl_tcp_info_t *, void *), void *arg); +static int cl_tcp_walk_list_stack(int (*callback)(cl_tcp_info_t *, void *), + void *arg, tcp_stack_t *tcps); + /* * Figure out the value of window scale opton. Note that the rwnd is * ASSUMED to be rounded up to the nearest MSS before the calculation. @@ -1595,6 +1453,8 @@ tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait) squeue_getprivate(tcp->tcp_connp->conn_sqp, SQPRIVATE_TCP)); mutex_enter(&tcp_time_wait->tcp_time_wait_lock); locked = B_TRUE; + } else { + ASSERT(MUTEX_HELD(&tcp_time_wait->tcp_time_wait_lock)); } if (tcp->tcp_time_wait_expire == 0) { @@ -1646,6 +1506,7 @@ tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait) static void tcp_time_wait_append(tcp_t *tcp) { + tcp_stack_t *tcps = tcp->tcp_tcps; tcp_squeue_priv_t *tcp_time_wait = *((tcp_squeue_priv_t **)squeue_getprivate(tcp->tcp_connp->conn_sqp, SQPRIVATE_TCP)); @@ -1675,7 +1536,7 @@ tcp_time_wait_append(tcp_t *tcp) * modular arithmetic. */ tcp->tcp_time_wait_expire += - drv_usectohz(tcp_time_wait_interval * 1000); + drv_usectohz(tcps->tcps_time_wait_interval * 1000); if (tcp->tcp_time_wait_expire == 0) tcp->tcp_time_wait_expire = 1; @@ -1683,7 +1544,8 @@ tcp_time_wait_append(tcp_t *tcp) ASSERT(tcp->tcp_state == TCPS_TIME_WAIT); ASSERT(tcp->tcp_time_wait_next == NULL); ASSERT(tcp->tcp_time_wait_prev == NULL); - TCP_DBGSTAT(tcp_time_wait); + TCP_DBGSTAT(tcps, tcp_time_wait); + mutex_enter(&tcp_time_wait->tcp_time_wait_lock); if (tcp_time_wait->tcp_time_wait_head == NULL) { ASSERT(tcp_time_wait->tcp_time_wait_tail == NULL); @@ -1705,6 +1567,7 @@ tcp_timewait_output(void *arg, mblk_t *mp, void *arg2) { conn_t *connp = (conn_t *)arg; tcp_t *tcp = connp->conn_tcp; + tcp_stack_t *tcps = tcp->tcp_tcps; ASSERT(tcp != NULL); if (tcp->tcp_state == TCPS_CLOSED) { @@ -1718,7 +1581,7 @@ tcp_timewait_output(void *arg, mblk_t *mp, void *arg2) tcp->tcp_ipversion == IPV6_VERSION))); ASSERT(!tcp->tcp_listener); - TCP_STAT(tcp_time_wait_reap); + TCP_STAT(tcps, tcp_time_wait_reap); ASSERT(TCP_IS_DETACHED(tcp)); /* @@ -1728,6 +1591,32 @@ tcp_timewait_output(void *arg, mblk_t *mp, void *arg2) tcp_close_detached(tcp); } +/* + * Remove cached/latched IPsec references. + */ +void +tcp_ipsec_cleanup(tcp_t *tcp) +{ + conn_t *connp = tcp->tcp_connp; + + if (connp->conn_flags & IPCL_TCPCONN) { + if (connp->conn_latch != NULL) { + IPLATCH_REFRELE(connp->conn_latch, + connp->conn_netstack); + connp->conn_latch = NULL; + } + if (connp->conn_policy != NULL) { + IPPH_REFRELE(connp->conn_policy, connp->conn_netstack); + connp->conn_policy = NULL; + } + } +} + +/* + * Cleaup before placing on free list. + * Disassociate from the netstack/tcp_stack_t since the freelist + * is per squeue and not per netstack. 
+ */ void tcp_cleanup(tcp_t *tcp) { @@ -1737,8 +1626,14 @@ tcp_cleanup(tcp_t *tcp) int tcp_hdr_grown; tcp_sack_info_t *tcp_sack_info; conn_t *connp = tcp->tcp_connp; + tcp_stack_t *tcps = tcp->tcp_tcps; + netstack_t *ns = tcps->tcps_netstack; tcp_bind_hash_remove(tcp); + + /* Cleanup that which needs the netstack first */ + tcp_ipsec_cleanup(tcp); + tcp_free(tcp); /* Release any SSL context */ @@ -1754,12 +1649,6 @@ tcp_cleanup(tcp_t *tcp) tcp->tcp_kssl_pending = B_FALSE; conn_delete_ire(connp, NULL); - if (connp->conn_flags & IPCL_TCPCONN) { - if (connp->conn_latch != NULL) - IPLATCH_REFRELE(connp->conn_latch); - if (connp->conn_policy != NULL) - IPPH_REFRELE(connp->conn_policy); - } /* * Since we will bzero the entire structure, we need to @@ -1772,6 +1661,18 @@ tcp_cleanup(tcp_t *tcp) */ ipcl_globalhash_remove(connp); + /* + * Now it is safe to decrement the reference counts. + * This might be the last reference on the netstack and TCPS + * in which case it will cause the tcp_g_q_close and + * the freeing of the IP Instance. + */ + connp->conn_netstack = NULL; + netstack_rele(ns); + ASSERT(tcps != NULL); + tcp->tcp_tcps = NULL; + TCPS_REFRELE(tcps); + /* Save some state */ mp = tcp->tcp_timercache; @@ -1803,13 +1704,13 @@ tcp_cleanup(tcp_t *tcp) connp->conn_state_flags = CONN_INCIPIENT; connp->conn_ulp = IPPROTO_TCP; connp->conn_ref = 1; - - ipcl_globalhash_insert(connp); } /* * Blows away all tcps whose TIME_WAIT has expired. List traversal * is done forwards from the head. + * This walks all stack instances since + * tcp_time_wait remains global across all stacks. */ /* ARGSUSED */ void @@ -1831,12 +1732,15 @@ tcp_time_wait_collector(void *arg) if (tcp_time_wait->tcp_free_list != NULL && tcp_time_wait->tcp_free_list->tcp_in_free_list == B_TRUE) { - TCP_STAT(tcp_freelist_cleanup); + TCP_G_STAT(tcp_freelist_cleanup); while ((tcp = tcp_time_wait->tcp_free_list) != NULL) { tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next; + tcp->tcp_time_wait_next = NULL; + tcp_time_wait->tcp_free_list_cnt--; + ASSERT(tcp->tcp_tcps == NULL); CONN_DEC_REF(tcp->tcp_connp); } - tcp_time_wait->tcp_free_list_cnt = 0; + ASSERT(tcp_time_wait->tcp_free_list_cnt == 0); } /* @@ -1904,6 +1808,11 @@ tcp_time_wait_collector(void *arg) mutex_exit( &tcp_time_wait->tcp_time_wait_lock); tcp_cleanup(tcp); + ASSERT(connp->conn_latch == NULL); + ASSERT(connp->conn_policy == NULL); + ASSERT(tcp->tcp_tcps == NULL); + ASSERT(connp->conn_netstack == NULL); + mutex_enter( &tcp_time_wait->tcp_time_wait_lock); tcp->tcp_time_wait_next = @@ -1917,6 +1826,7 @@ tcp_time_wait_collector(void *arg) &tcp_time_wait->tcp_time_wait_lock); tcp_bind_hash_remove(tcp); conn_delete_ire(tcp->tcp_connp, NULL); + tcp_ipsec_cleanup(tcp); CONN_DEC_REF(tcp->tcp_connp); } } else { @@ -1984,7 +1894,6 @@ tcp_time_wait_collector(void *arg) timeout(tcp_time_wait_collector, sqp, TCP_TIME_WAIT_DELAY); mutex_exit(&tcp_time_wait->tcp_time_wait_lock); } - /* * Reply to a clients T_CONN_RES TPI message. This function * is used only for TLI/XTI listener. 
Sockfs sends T_CONN_RES @@ -2003,6 +1912,7 @@ tcp_accept(tcp_t *listener, mblk_t *mp) mblk_t *opt_mp = NULL; /* T_OPTMGMT_REQ messages */ mblk_t *ok_mp; mblk_t *mp1; + tcp_stack_t *tcps = listener->tcp_tcps; if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) { tcp_err_ack(listener, mp, TPROTO, 0); @@ -2071,7 +1981,7 @@ tcp_accept(tcp_t *listener, mblk_t *mp) acceptor = listener; CONN_INC_REF(acceptor->tcp_connp); } else { - acceptor = tcp_acceptor_hash_lookup(acceptor_id); + acceptor = tcp_acceptor_hash_lookup(acceptor_id, tcps); if (acceptor == NULL) { if (listener->tcp_debug) { (void) strlog(TCP_MOD_ID, 0, 1, @@ -2415,8 +2325,9 @@ tcp_accept(tcp_t *listener, mblk_t *mp) */ finish: ASSERT(acceptor->tcp_detached); - acceptor->tcp_rq = tcp_g_q; - acceptor->tcp_wq = WR(tcp_g_q); + ASSERT(tcps->tcps_g_q != NULL); + acceptor->tcp_rq = tcps->tcps_g_q; + acceptor->tcp_wq = WR(tcps->tcps_g_q); (void) tcp_clean_death(acceptor, 0, 2); CONN_DEC_REF(acceptor->tcp_connp); @@ -2515,6 +2426,9 @@ tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager) if (eager->tcp_cred != NULL) crfree(eager->tcp_cred); eager->tcp_cred = econnp->conn_cred = aconnp->conn_cred; + ASSERT(econnp->conn_netstack == aconnp->conn_netstack); + ASSERT(eager->tcp_tcps == acceptor->tcp_tcps); + aconnp->conn_cred = NULL; econnp->conn_zoneid = aconnp->conn_zoneid; @@ -2591,13 +2505,15 @@ tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp) ts_label_t *tsl = crgetlabel(CONN_CRED(connp)); ill_t *ill = NULL; boolean_t incoming = (ire_mp == NULL); + tcp_stack_t *tcps = tcp->tcp_tcps; + ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; ASSERT(connp->conn_ire_cache == NULL); if (tcp->tcp_ipversion == IPV4_VERSION) { if (CLASSD(tcp->tcp_connp->conn_rem)) { - BUMP_MIB(&ip_mib, ipIfStatsInDiscards); + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards); return (0); } /* @@ -2620,12 +2536,13 @@ tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp) if (tcp->tcp_connp->conn_nexthop_set) { ire = ire_ctable_lookup(tcp->tcp_connp->conn_rem, tcp->tcp_connp->conn_nexthop_v4, 0, NULL, zoneid, - tsl, MATCH_IRE_MARK_PRIVATE_ADDR | MATCH_IRE_GW); + tsl, MATCH_IRE_MARK_PRIVATE_ADDR | MATCH_IRE_GW, + ipst); if (ire == NULL) { ire = ire_ftable_lookup( tcp->tcp_connp->conn_nexthop_v4, 0, 0, IRE_INTERFACE, NULL, NULL, zoneid, 0, - tsl, match_flags); + tsl, match_flags, ipst); if (ire == NULL) return (0); } else { @@ -2633,7 +2550,7 @@ tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp) } } else { ire = ire_cache_lookup(tcp->tcp_connp->conn_rem, - zoneid, tsl); + zoneid, tsl, ipst); if (ire != NULL) { ire_cacheable = B_TRUE; ire_uinfo = (ire_mp != NULL) ? @@ -2646,7 +2563,7 @@ tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp) tcp->tcp_connp->conn_rem, 0, 0, 0, NULL, &sire, zoneid, 0, tsl, (MATCH_IRE_RECURSIVE | - MATCH_IRE_DEFAULT)); + MATCH_IRE_DEFAULT), ipst); if (ire == NULL) return (0); ire_uinfo = (sire != NULL) ? @@ -2695,7 +2612,7 @@ tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp) * should change. IP tells us the latest setting of * ip_path_mtu_discovery through ire_frag_flag. 
*/ - if (ip_path_mtu_discovery) { + if (ipst->ips_ip_path_mtu_discovery) { tcp->tcp_ipha->ipha_fragment_offset_and_flags = htons(IPH_DF); } @@ -2741,7 +2658,7 @@ tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp) dst_ipif = dst_ill->ill_ipif; } ire = ire_ctable_lookup_v6(&tcp->tcp_connp->conn_remv6, - 0, 0, dst_ipif, zoneid, tsl, match_flags); + 0, 0, dst_ipif, zoneid, tsl, match_flags, ipst); if (ire != NULL) { ire_cacheable = B_TRUE; @@ -2753,7 +2670,7 @@ tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp) ire = ire_ftable_lookup_v6( &tcp->tcp_connp->conn_remv6, 0, 0, 0, dst_ipif, &sire, zoneid, - 0, tsl, match_flags); + 0, tsl, match_flags, ipst); if (ire == NULL) { if (dst_ill != NULL) ill_refrele(dst_ill); @@ -2834,12 +2751,13 @@ tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp) tcp->tcp_rtt_sa = ire_uinfo->iulp_rtt; tcp->tcp_rtt_sd = ire_uinfo->iulp_rtt_sd; rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + - tcp_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5); + tcps->tcps_rexmit_interval_extra + + (tcp->tcp_rtt_sa >> 5); - if (rto > tcp_rexmit_interval_max) { - tcp->tcp_rto = tcp_rexmit_interval_max; - } else if (rto < tcp_rexmit_interval_min) { - tcp->tcp_rto = tcp_rexmit_interval_min; + if (rto > tcps->tcps_rexmit_interval_max) { + tcp->tcp_rto = tcps->tcps_rexmit_interval_max; + } else if (rto < tcps->tcps_rexmit_interval_min) { + tcp->tcp_rto = tcps->tcps_rexmit_interval_min; } else { tcp->tcp_rto = rto; } @@ -2850,10 +2768,10 @@ tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp) tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN; if (ire_uinfo->iulp_spipe > 0) { tcp->tcp_xmit_hiwater = MIN(ire_uinfo->iulp_spipe, - tcp_max_buf); - if (tcp_snd_lowat_fraction != 0) + tcps->tcps_max_buf); + if (tcps->tcps_snd_lowat_fraction != 0) tcp->tcp_xmit_lowater = tcp->tcp_xmit_hiwater / - tcp_snd_lowat_fraction; + tcps->tcps_snd_lowat_fraction; (void) tcp_maxpsz_set(tcp, B_TRUE); } /* @@ -2864,7 +2782,8 @@ tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp) * info back to the caller. */ if (ire_uinfo->iulp_rpipe > 0) { - tcp->tcp_rwnd = MIN(ire_uinfo->iulp_rpipe, tcp_max_buf); + tcp->tcp_rwnd = MIN(ire_uinfo->iulp_rpipe, + tcps->tcps_max_buf); } if (ire_uinfo->iulp_rtomax > 0) { @@ -2940,9 +2859,9 @@ tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp) /* Sanity check for MSS value. 
*/ if (tcp->tcp_ipversion == IPV4_VERSION) - mss_max = tcp_mss_max_ipv4; + mss_max = tcps->tcps_mss_max_ipv4; else - mss_max = tcp_mss_max_ipv6; + mss_max = tcps->tcps_mss_max_ipv6; if (tcp->tcp_ipversion == IPV6_VERSION && (ire->ire_frag_flag & IPH_FRAG_HDR)) { @@ -2960,8 +2879,8 @@ tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp) mss -= tcp->tcp_ipsec_overhead; - if (mss < tcp_mss_min) - mss = tcp_mss_min; + if (mss < tcps->tcps_mss_min) + mss = tcps->tcps_mss_min; if (mss > mss_max) mss = mss_max; @@ -2980,18 +2899,18 @@ tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp) tcp->tcp_loopback = B_TRUE; if (tcp->tcp_ipversion == IPV4_VERSION) { - hsp = tcp_hsp_lookup(tcp->tcp_remote); + hsp = tcp_hsp_lookup(tcp->tcp_remote, tcps); } else { - hsp = tcp_hsp_lookup_ipv6(&tcp->tcp_remote_v6); + hsp = tcp_hsp_lookup_ipv6(&tcp->tcp_remote_v6, tcps); } if (hsp != NULL) { /* Only modify if we're going to make them bigger */ if (hsp->tcp_hsp_sendspace > tcp->tcp_xmit_hiwater) { tcp->tcp_xmit_hiwater = hsp->tcp_hsp_sendspace; - if (tcp_snd_lowat_fraction != 0) + if (tcps->tcps_snd_lowat_fraction != 0) tcp->tcp_xmit_lowater = tcp->tcp_xmit_hiwater / - tcp_snd_lowat_fraction; + tcps->tcps_snd_lowat_fraction; } if (hsp->tcp_hsp_recvspace > tcp->tcp_rwnd) { @@ -3082,6 +3001,7 @@ tcp_bind(tcp_t *tcp, mblk_t *mp) zone_t *zone; cred_t *cr; in_port_t mlp_port; + tcp_stack_t *tcps = tcp->tcp_tcps; ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) { @@ -3266,7 +3186,8 @@ tcp_bind(tcp_t *tcp, mblk_t *mp) if (requested_port == 0) { requested_port = tcp->tcp_anon_priv_bind ? tcp_get_next_priv_port(tcp) : - tcp_update_next_port(tcp_next_port_to_try, tcp, B_TRUE); + tcp_update_next_port(tcps->tcps_next_port_to_try, + tcp, B_TRUE); if (requested_port == 0) { tcp_err_ack(tcp, mp, TNOADDR, 0); return; @@ -3283,7 +3204,8 @@ tcp_bind(tcp_t *tcp, mblk_t *mp) if (connp->conn_anon_mlp && is_system_labeled()) { zone = crgetzone(cr); addrtype = tsol_mlp_addr_type(zone->zone_id, - IPV6_VERSION, &v6addr); + IPV6_VERSION, &v6addr, + tcps->tcps_netstack->netstack_ip); if (addrtype == mlptSingle) { tcp_err_ack(tcp, mp, TNOADDR, 0); return; @@ -3306,12 +3228,12 @@ tcp_bind(tcp_t *tcp, mblk_t *mp) * - the atomic assignment of the elements of the array */ cr = DB_CREDDEF(mp, tcp->tcp_cred); - if (requested_port < tcp_smallest_nonpriv_port) { + if (requested_port < tcps->tcps_smallest_nonpriv_port) { priv = B_TRUE; } else { - for (i = 0; i < tcp_g_num_epriv_ports; i++) { + for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) { if (requested_port == - tcp_g_epriv_ports[i]) { + tcps->tcps_g_epriv_ports[i]) { priv = B_TRUE; break; } @@ -3335,7 +3257,8 @@ tcp_bind(tcp_t *tcp, mblk_t *mp) if (is_system_labeled()) { zone = crgetzone(cr); addrtype = tsol_mlp_addr_type(zone->zone_id, - IPV6_VERSION, &v6addr); + IPV6_VERSION, &v6addr, + tcps->tcps_netstack->netstack_ip); if (addrtype == mlptSingle) { tcp_err_ack(tcp, mp, TNOADDR, 0); return; @@ -3363,6 +3286,10 @@ tcp_bind(tcp_t *tcp, mblk_t *mp) * zone actually owns the MLP. Reject if not. */ if (mlptype == mlptShared && addrtype == mlptShared) { + /* + * No need to handle exclusive-stack zones since + * ALL_ZONES only applies to the shared stack. 
+ */ zoneid_t mlpzone; mlpzone = tsol_mlp_findzone(IPPROTO_TCP, @@ -3475,10 +3402,10 @@ do_bind: tcp->tcp_conn_req_max = tbr->CONIND_number; if (tcp->tcp_conn_req_max) { - if (tcp->tcp_conn_req_max < tcp_conn_req_min) - tcp->tcp_conn_req_max = tcp_conn_req_min; - if (tcp->tcp_conn_req_max > tcp_conn_req_max_q) - tcp->tcp_conn_req_max = tcp_conn_req_max_q; + if (tcp->tcp_conn_req_max < tcps->tcps_conn_req_min) + tcp->tcp_conn_req_max = tcps->tcps_conn_req_min; + if (tcp->tcp_conn_req_max > tcps->tcps_conn_req_max_q) + tcp->tcp_conn_req_max = tcps->tcps_conn_req_max_q; /* * If this is a listener, do not reset the eager list * and other stuffs. Note that we don't check if the @@ -3492,7 +3419,7 @@ do_bind: tcp->tcp_eager_next_drop_q0 = tcp; tcp->tcp_eager_prev_drop_q0 = tcp; tcp->tcp_second_ctimer_threshold = - tcp_ip_abort_linterval; + tcps->tcps_ip_abort_linterval; } } @@ -3552,6 +3479,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, int loopmax; conn_t *connp = tcp->tcp_connp; zoneid_t zoneid = connp->conn_zoneid; + tcp_stack_t *tcps = tcp->tcp_tcps; /* * Lookup for free addresses is done in a loop and "loopmax" @@ -3576,10 +3504,11 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, * loopmax = * (IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1 */ - loopmax = IPPORT_RESERVED - tcp_min_anonpriv_port; + loopmax = IPPORT_RESERVED - + tcps->tcps_min_anonpriv_port; } else { - loopmax = (tcp_largest_anon_port - - tcp_smallest_anon_port + 1); + loopmax = (tcps->tcps_largest_anon_port - + tcps->tcps_smallest_anon_port + 1); } } do { @@ -3602,7 +3531,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, * doing a CONN_INC_REF. */ tcp_bind_hash_remove(tcp); - tbf = &tcp_bind_fanout[TCP_BIND_HASH(lport)]; + tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(lport)]; mutex_enter(&tbf->tf_lock); for (ltcp = tbf->tf_tcp; ltcp != NULL; ltcp = ltcp->tcp_bind_hash) { @@ -3776,7 +3705,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, tcp->tcp_lport = htons(port); *(uint16_t *)tcp->tcp_tcph->th_lport = tcp->tcp_lport; - ASSERT(&tcp_bind_fanout[TCP_BIND_HASH( + ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH( tcp->tcp_lport)] == tbf); tcp_bind_hash_insert(tbf, tcp, 1); @@ -3795,7 +3724,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, * be in the valid range. */ if (!tcp->tcp_anon_priv_bind) - tcp_next_port_to_try = port + 1; + tcps->tcps_next_port_to_try = port + 1; return (port); } @@ -3808,7 +3737,8 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, * get one to start with. 
*/ port = - tcp_update_next_port(tcp_next_port_to_try, + tcp_update_next_port( + tcps->tcps_next_port_to_try, tcp, B_TRUE); user_specified = B_FALSE; } else { @@ -3859,6 +3789,7 @@ tcp_clean_death(tcp_t *tcp, int err, uint8_t tag) { mblk_t *mp; queue_t *q; + tcp_stack_t *tcps = tcp->tcp_tcps; TCP_CLD_STAT(tag); @@ -3907,7 +3838,7 @@ tcp_clean_death(tcp_t *tcp, int err, uint8_t tag) return (0); } - TCP_STAT(tcp_clean_death_nondetached); + TCP_STAT(tcps, tcp_clean_death_nondetached); /* * If T_ORDREL_IND has not been sent yet (done when service routine @@ -3960,10 +3891,10 @@ tcp_clean_death(tcp_t *tcp, int err, uint8_t tag) } if (tcp->tcp_state <= TCPS_SYN_RCVD) { /* SYN_SENT or SYN_RCVD */ - BUMP_MIB(&tcp_mib, tcpAttemptFails); + BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails); } else if (tcp->tcp_state <= TCPS_CLOSE_WAIT) { /* ESTABLISHED or CLOSE_WAIT */ - BUMP_MIB(&tcp_mib, tcpEstabResets); + BUMP_MIB(&tcps->tcps_mib, tcpEstabResets); } } @@ -3979,6 +3910,7 @@ static void tcp_stop_lingering(tcp_t *tcp) { clock_t delta = 0; + tcp_stack_t *tcps = tcp->tcp_tcps; tcp->tcp_linger_tid = 0; if (tcp->tcp_state > TCPS_LISTEN) { @@ -4002,12 +3934,13 @@ tcp_stop_lingering(tcp_t *tcp) tcp->tcp_detached = B_TRUE; - tcp->tcp_rq = tcp_g_q; - tcp->tcp_wq = WR(tcp_g_q); + ASSERT(tcps->tcps_g_q != NULL); + tcp->tcp_rq = tcps->tcps_g_q; + tcp->tcp_wq = WR(tcps->tcps_g_q); if (tcp->tcp_state == TCPS_TIME_WAIT) { tcp_time_wait_append(tcp); - TCP_DBGSTAT(tcp_detach_time_wait); + TCP_DBGSTAT(tcps, tcp_detach_time_wait); goto finish; } @@ -4028,8 +3961,9 @@ finish: /* Signal closing thread that it can complete close */ mutex_enter(&tcp->tcp_closelock); tcp->tcp_detached = B_TRUE; - tcp->tcp_rq = tcp_g_q; - tcp->tcp_wq = WR(tcp_g_q); + ASSERT(tcps->tcps_g_q != NULL); + tcp->tcp_rq = tcps->tcps_g_q; + tcp->tcp_wq = WR(tcps->tcps_g_q); tcp->tcp_closed = 1; cv_signal(&tcp->tcp_closecv); mutex_exit(&tcp->tcp_closelock); @@ -4225,6 +4159,7 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2) conn_t *connp = (conn_t *)arg; tcp_t *tcp = connp->conn_tcp; clock_t delta = 0; + tcp_stack_t *tcps = tcp->tcp_tcps; ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) || (connp->conn_fanout == NULL && connp->conn_ref >= 3)); @@ -4369,7 +4304,7 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2) tcp->tcp_detached = B_TRUE; if (tcp->tcp_state == TCPS_TIME_WAIT) { tcp_time_wait_append(tcp); - TCP_DBGSTAT(tcp_detach_time_wait); + TCP_DBGSTAT(tcps, tcp_detach_time_wait); ASSERT(connp->conn_ref >= 3); goto finish; } @@ -4391,10 +4326,10 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2) if (msg) { if (tcp->tcp_state == TCPS_ESTABLISHED || tcp->tcp_state == TCPS_CLOSE_WAIT) - BUMP_MIB(&tcp_mib, tcpEstabResets); + BUMP_MIB(&tcps->tcps_mib, tcpEstabResets); if (tcp->tcp_state == TCPS_SYN_SENT || tcp->tcp_state == TCPS_SYN_RCVD) - BUMP_MIB(&tcp_mib, tcpAttemptFails); + BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails); tcp_xmit_ctl(msg, tcp, tcp->tcp_snxt, 0, TH_RST); } @@ -4407,13 +4342,13 @@ finish: * Although packets are always processed on the correct * tcp's perimeter and access is serialized via squeue's, * IP still needs a queue when sending packets in time_wait - * state so use WR(tcp_g_q) till ip_output() can be + * state so use WR(tcps_g_q) till ip_output() can be * changed to deal with just connp. For read side, we * could have set tcp_rq to NULL but there are some cases * in tcp_rput_data() from early days of this code which * do a putnext without checking if tcp is closed. 
Those * need to be identified before both tcp_rq and tcp_wq - * can be set to NULL and tcp_q_q can disappear forever. + * can be set to NULL and tcps_g_q can disappear forever. */ mutex_enter(&tcp->tcp_closelock); /* @@ -4423,8 +4358,13 @@ finish: */ if (!tcp->tcp_wait_for_eagers) { tcp->tcp_detached = B_TRUE; - tcp->tcp_rq = tcp_g_q; - tcp->tcp_wq = WR(tcp_g_q); + /* + * When default queue is closing we set tcps_g_q to NULL + * after the close is done. + */ + ASSERT(tcps->tcps_g_q != NULL); + tcp->tcp_rq = tcps->tcps_g_q; + tcp->tcp_wq = WR(tcps->tcps_g_q); } /* Signal tcp_close() to finish closing. */ @@ -4509,13 +4449,14 @@ tcp_closei_local(tcp_t *tcp) { ire_t *ire; conn_t *connp = tcp->tcp_connp; + tcp_stack_t *tcps = tcp->tcp_tcps; if (!TCP_IS_SOCKET(tcp)) tcp_acceptor_hash_remove(tcp); - UPDATE_MIB(&tcp_mib, tcpHCInSegs, tcp->tcp_ibsegs); + UPDATE_MIB(&tcps->tcps_mib, tcpHCInSegs, tcp->tcp_ibsegs); tcp->tcp_ibsegs = 0; - UPDATE_MIB(&tcp_mib, tcpHCOutSegs, tcp->tcp_obsegs); + UPDATE_MIB(&tcps->tcps_mib, tcpHCOutSegs, tcp->tcp_obsegs); tcp->tcp_obsegs = 0; /* @@ -4544,8 +4485,9 @@ tcp_closei_local(tcp_t *tcp) * listener queue, after we have released our * reference on the listener */ - tcp->tcp_rq = tcp_g_q; - tcp->tcp_wq = WR(tcp_g_q); + ASSERT(tcps->tcps_g_q != NULL); + tcp->tcp_rq = tcps->tcps_g_q; + tcp->tcp_wq = WR(tcps->tcps_g_q); CONN_DEC_REF(listener->tcp_connp); } else { mutex_exit(&listener->tcp_eager_lock); @@ -4609,6 +4551,8 @@ tcp_closei_local(tcp_t *tcp) tcp->tcp_kssl_ctx = NULL; } tcp->tcp_kssl_pending = B_FALSE; + + tcp_ipsec_cleanup(tcp); } /* @@ -4812,6 +4756,7 @@ tcp_drop_q0(tcp_t *tcp) { tcp_t *eager; mblk_t *mp; + tcp_stack_t *tcps = tcp->tcp_tcps; ASSERT(MUTEX_HELD(&tcp->tcp_eager_lock)); ASSERT(tcp->tcp_eager_next_q0 != tcp->tcp_eager_prev_q0); @@ -4837,12 +4782,12 @@ tcp_drop_q0(tcp_t *tcp) if (tcp->tcp_debug) { (void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE, "tcp_drop_q0: listen half-open queue (max=%d) overflow" - " (%d pending) on %s, drop one", tcp_conn_req_max_q0, + " (%d pending) on %s, drop one", tcps->tcps_conn_req_max_q0, tcp->tcp_conn_req_cnt_q0, tcp_display(tcp, NULL, DISP_PORT_ONLY)); } - BUMP_MIB(&tcp_mib, tcpHalfOpenDrop); + BUMP_MIB(&tcps->tcps_mib, tcpHalfOpenDrop); /* Put a reference on the conn as we are enqueueing it in the sqeue */ CONN_INC_REF(eager->tcp_connp); @@ -4869,6 +4814,7 @@ tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, int err; int ifindex = 0; cred_t *cr; + tcp_stack_t *tcps = tcp->tcp_tcps; if (ipvers == IPV4_VERSION) { ipha = (ipha_t *)mp->b_rptr; @@ -4885,7 +4831,7 @@ tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, sin6.sin6_port = *(uint16_t *)tcph->th_lport; sin6.sin6_family = AF_INET6; sin6.__sin6_src_id = ip_srcid_find_addr(&v6dst, - lconnp->conn_zoneid); + lconnp->conn_zoneid, tcps->tcps_netstack); if (tcp->tcp_recvdstaddr) { sin6_t sin6d; @@ -4925,7 +4871,7 @@ tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, sin6.sin6_family = AF_INET6; sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; sin6.__sin6_src_id = ip_srcid_find_addr(&ip6h->ip6_dst, - lconnp->conn_zoneid); + lconnp->conn_zoneid, tcps->tcps_netstack); if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) { /* Pass up the scope_id of remote addr */ @@ -4961,7 +4907,7 @@ tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, connp->conn_flags |= (IPCL_TCP6|IPCL_EAGER); connp->conn_fully_bound = B_FALSE; - if (tcp_trace) + if (tcps->tcps_trace) tcp->tcp_tracebuf = kmem_zalloc(sizeof (tcptrch_t), KM_NOSLEEP); /* Inherit 
information from the "parent" */ @@ -4969,7 +4915,7 @@ tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, tcp->tcp_family = ltcp->tcp_family; tcp->tcp_wq = ltcp->tcp_wq; tcp->tcp_rq = ltcp->tcp_rq; - tcp->tcp_mss = tcp_mss_def_ipv6; + tcp->tcp_mss = tcps->tcps_mss_def_ipv6; tcp->tcp_detached = B_TRUE; if ((err = tcp_init_values(tcp)) != 0) { freemsg(tpi_mp); @@ -5094,7 +5040,7 @@ tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, tcp->tcp_ipha->ipha_src = ipha->ipha_dst; /* Source routing option copyover (reverse it) */ - if (tcp_rev_src_routes) + if (tcps->tcps_rev_src_routes) tcp_opt_reverse(tcp, ipha); } else { ASSERT(ip6h != NULL); @@ -5135,6 +5081,7 @@ tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha, mblk_t *tpi_mp = NULL; int err; cred_t *cr; + tcp_stack_t *tcps = tcp->tcp_tcps; sin = sin_null; sin.sin_addr.s_addr = ipha->ipha_src; @@ -5172,7 +5119,7 @@ tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha, connp->conn_fport = *(uint16_t *)tcph->th_lport; connp->conn_lport = *(uint16_t *)tcph->th_fport; - if (tcp_trace) { + if (tcps->tcps_trace) { tcp->tcp_tracebuf = kmem_zalloc(sizeof (tcptrch_t), KM_NOSLEEP); } @@ -5181,7 +5128,7 @@ tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha, tcp->tcp_family = ltcp->tcp_family; tcp->tcp_wq = ltcp->tcp_wq; tcp->tcp_rq = ltcp->tcp_rq; - tcp->tcp_mss = tcp_mss_def_ipv4; + tcp->tcp_mss = tcps->tcps_mss_def_ipv4; tcp->tcp_detached = B_TRUE; if ((err = tcp_init_values(tcp)) != 0) { freemsg(tpi_mp); @@ -5221,7 +5168,7 @@ tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha, bcopy(tcph->th_fport, tcp->tcp_tcph->th_lport, sizeof (in_port_t)); /* Source routing option copyover (reverse it) */ - if (tcp_rev_src_routes) + if (tcps->tcps_rev_src_routes) tcp_opt_reverse(tcp, ipha); ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL); @@ -5262,7 +5209,7 @@ tcp_get_ipsec_conn(tcp_t *tcp, squeue_t *sqp, mblk_t **mpp) boolean_t mctl_present = B_FALSE; uint_t ipvers; - econnp = tcp_get_conn(sqp); + econnp = tcp_get_conn(sqp, tcp->tcp_tcps); if (econnp == NULL) { freemsg(first_mp); return (NULL); @@ -5398,12 +5345,13 @@ tcp_get_ipsec_conn(tcp_t *tcp, squeue_t *sqp, mblk_t **mpp) * there for too long. 
*/ void * -tcp_get_conn(void *arg) +tcp_get_conn(void *arg, tcp_stack_t *tcps) { tcp_t *tcp = NULL; conn_t *connp = NULL; squeue_t *sqp = (squeue_t *)arg; tcp_squeue_priv_t *tcp_time_wait; + netstack_t *ns; tcp_time_wait = *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP)); @@ -5418,11 +5366,24 @@ tcp_get_conn(void *arg) tcp->tcp_time_wait_next = NULL; connp = tcp->tcp_connp; connp->conn_flags |= IPCL_REUSED; + + ASSERT(tcp->tcp_tcps == NULL); + ASSERT(connp->conn_netstack == NULL); + ns = tcps->tcps_netstack; + netstack_hold(ns); + connp->conn_netstack = ns; + tcp->tcp_tcps = tcps; + TCPS_REFHOLD(tcps); + ipcl_globalhash_insert(connp); return ((void *)connp); } mutex_exit(&tcp_time_wait->tcp_time_wait_lock); - if ((connp = ipcl_conn_create(IPCL_TCPCONN, KM_NOSLEEP)) == NULL) + if ((connp = ipcl_conn_create(IPCL_TCPCONN, KM_NOSLEEP, + tcps->tcps_netstack)) == NULL) return (NULL); + tcp = connp->conn_tcp; + tcp->tcp_tcps = tcps; + TCPS_REFHOLD(tcps); return ((void *)connp); } @@ -5441,7 +5402,8 @@ tcp_update_label(tcp_t *tcp, const cred_t *cr) int added; if (tsol_compute_label(cr, tcp->tcp_remote, optbuf, - connp->conn_mac_exempt) != 0) + connp->conn_mac_exempt, + tcp->tcp_tcps->tcps_netstack->netstack_ip) != 0) return (B_FALSE); added = tsol_remove_secopt(tcp->tcp_ipha, tcp->tcp_hdr_len); @@ -5465,7 +5427,8 @@ tcp_update_label(tcp_t *tcp, const cred_t *cr) uchar_t optbuf[TSOL_MAX_IPV6_OPTION]; if (tsol_compute_label_v6(cr, &tcp->tcp_remote_v6, optbuf, - connp->conn_mac_exempt) != 0) + connp->conn_mac_exempt, + tcp->tcp_tcps->tcps_netstack->netstack_ip) != 0) return (B_FALSE); if (tsol_update_sticky(&tcp->tcp_sticky_ipp, &tcp->tcp_label_len, optbuf) != 0) @@ -5504,7 +5467,7 @@ tcp_update_label(tcp_t *tcp, const cred_t *cr) * Sockfs ACCEPT Path: * ------------------- * - * open acceptor stream (ip_tcpopen allocates tcp_wput_accept() + * open acceptor stream (tcp_open allocates tcp_wput_accept() * as STREAM entry point) * * soaccept() sends T_CONN_RES on the acceptor STREAM to tcp_wput_accept() @@ -5616,6 +5579,8 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) tcp_t *tcp = connp->conn_tcp; ire_t *ire; cred_t *credp; + tcp_stack_t *tcps = tcp->tcp_tcps; + ip_stack_t *ipst; if (tcp->tcp_state != TCPS_LISTEN) goto error2; @@ -5625,8 +5590,8 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) mutex_enter(&tcp->tcp_eager_lock); if (tcp->tcp_conn_req_cnt_q >= tcp->tcp_conn_req_max) { mutex_exit(&tcp->tcp_eager_lock); - TCP_STAT(tcp_listendrop); - BUMP_MIB(&tcp_mib, tcpListenDrop); + TCP_STAT(tcps, tcp_listendrop); + BUMP_MIB(&tcps->tcps_mib, tcpListenDrop); if (tcp->tcp_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, "tcp_conn_request: listen backlog (max=%d) " @@ -5638,7 +5603,7 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) } if (tcp->tcp_conn_req_cnt_q0 >= - tcp->tcp_conn_req_max + tcp_conn_req_max_q0) { + tcp->tcp_conn_req_max + tcps->tcps_conn_req_max_q0) { /* * Q0 is full. Drop a pending half-open req from the queue * to make room for the new SYN req. Also mark the time we @@ -5647,16 +5612,16 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) * A more aggressive defense against SYN attack will * be to set the "tcp_syn_defense" flag now. 
*/ - TCP_STAT(tcp_listendropq0); + TCP_STAT(tcps, tcp_listendropq0); tcp->tcp_last_rcv_lbolt = lbolt64; if (!tcp_drop_q0(tcp)) { mutex_exit(&tcp->tcp_eager_lock); - BUMP_MIB(&tcp_mib, tcpListenDropQ0); + BUMP_MIB(&tcps->tcps_mib, tcpListenDropQ0); if (tcp->tcp_debug) { (void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE, "tcp_conn_request: listen half-open queue " "(max=%d) full (%d pending) on %s", - tcp_conn_req_max_q0, + tcps->tcps_conn_req_max_q0, tcp->tcp_conn_req_cnt_q0, tcp_display(tcp, NULL, DISP_PORT_ONLY)); @@ -5677,9 +5642,10 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) new_sqp = (squeue_t *)DB_CKSUMSTART(mp); DB_CKSUMSTART(mp) = 0; mp->b_datap->db_struioflag &= ~STRUIO_EAGER; - econnp = (conn_t *)tcp_get_conn(arg2); + econnp = (conn_t *)tcp_get_conn(arg2, tcps); if (econnp == NULL) goto error2; + ASSERT(econnp->conn_netstack == connp->conn_netstack); econnp->conn_sqp = new_sqp; } else if ((mp->b_datap->db_struioflag & STRUIO_POLICY) != 0) { /* @@ -5692,6 +5658,7 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) */ return; } + ASSERT(econnp->conn_netstack == connp->conn_netstack); } else { goto error2; } @@ -5804,7 +5771,7 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) eager->tcp_hard_binding = B_TRUE; - tcp_bind_hash_insert(&tcp_bind_fanout[ + tcp_bind_hash_insert(&tcps->tcps_bind_fanout[ TCP_BIND_HASH(eager->tcp_lport)], eager, 0); CL_INET_CONNECT(eager); @@ -5838,7 +5805,7 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) tcp_process_options(eager, tcph); /* Is the other end ECN capable? */ - if (tcp_ecn_permitted >= 1 && + if (tcps->tcps_ecn_permitted >= 1 && (tcph->th_flags[0] & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) { eager->tcp_ecn_ok = B_TRUE; } @@ -5949,7 +5916,7 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) eager->tcp_rack = seg_seq; eager->tcp_rnxt = seg_seq + 1; U32_TO_ABE32(eager->tcp_rnxt, eager->tcp_tcph->th_ack); - BUMP_MIB(&tcp_mib, tcpPassiveOpens); + BUMP_MIB(&tcps->tcps_mib, tcpPassiveOpens); eager->tcp_state = TCPS_SYN_RCVD; mp1 = tcp_xmit_mp(eager, eager->tcp_xmit_head, eager->tcp_mss, NULL, NULL, eager->tcp_iss, B_FALSE, NULL, B_FALSE); @@ -6043,7 +6010,9 @@ error1: * If a connection already exists, send the mp to that connections so * that it can be appropriately dealt with. */ - if ((econnp = ipcl_classify(mp, connp->conn_zoneid)) != NULL) { + ipst = tcps->tcps_netstack->netstack_ip; + + if ((econnp = ipcl_classify(mp, connp->conn_zoneid, ipst)) != NULL) { if (!IPCL_IS_CONNECTED(econnp)) { /* * Something bad happened. ipcl_conn_insert() @@ -6469,6 +6438,7 @@ tcp_connect_ipv4(tcp_t *tcp, mblk_t *mp, ipaddr_t *dstaddrp, in_port_t dstport, ipaddr_t dstaddr = *dstaddrp; int32_t oldstate; uint16_t lport; + tcp_stack_t *tcps = tcp->tcp_tcps; ASSERT(tcp->tcp_ipversion == IPV4_VERSION); @@ -6495,7 +6465,7 @@ tcp_connect_ipv4(tcp_t *tcp, mblk_t *mp, ipaddr_t *dstaddrp, in_port_t dstport, /* Handle __sin6_src_id if socket not bound to an IP address */ if (srcid != 0 && tcp->tcp_ipha->ipha_src == INADDR_ANY) { ip_srcid_find_id(srcid, &tcp->tcp_ip_src_v6, - tcp->tcp_connp->conn_zoneid); + tcp->tcp_connp->conn_zoneid, tcps->tcps_netstack); IN6_V4MAPPED_TO_IPADDR(&tcp->tcp_ip_src_v6, tcp->tcp_ipha->ipha_src); } @@ -6524,7 +6494,7 @@ tcp_connect_ipv4(tcp_t *tcp, mblk_t *mp, ipaddr_t *dstaddrp, in_port_t dstport, * included in the checksum but that ip will include the * first hop in the source route in the tcp checksum. 
*/ - tcp->tcp_sum = ip_massage_options(tcp->tcp_ipha); + tcp->tcp_sum = ip_massage_options(tcp->tcp_ipha, tcps->tcps_netstack); tcp->tcp_sum = (tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16); tcp->tcp_sum -= ((tcp->tcp_ipha->ipha_dst >> 16) + (tcp->tcp_ipha->ipha_dst & 0xffff)); @@ -6550,7 +6520,8 @@ tcp_connect_ipv4(tcp_t *tcp, mblk_t *mp, ipaddr_t *dstaddrp, in_port_t dstport, * tcp_bindi will pick an unused port, insert the connection * in the bind hash and transition to BOUND state. */ - lport = tcp_update_next_port(tcp_next_port_to_try, tcp, B_TRUE); + lport = tcp_update_next_port(tcps->tcps_next_port_to_try, + tcp, B_TRUE); lport = tcp_bindi(tcp, lport, &tcp->tcp_ip_src_v6, 0, B_TRUE, B_FALSE, B_FALSE); if (lport == 0) { @@ -6590,7 +6561,7 @@ tcp_connect_ipv4(tcp_t *tcp, mblk_t *mp, ipaddr_t *dstaddrp, in_port_t dstport, mp1 = ip_bind_v6(tcp->tcp_wq, mp1, tcp->tcp_connp, &tcp->tcp_sticky_ipp); } - BUMP_MIB(&tcp_mib, tcpActiveOpens); + BUMP_MIB(&tcps->tcps_mib, tcpActiveOpens); tcp->tcp_active_open = 1; /* * If the bind cannot complete immediately @@ -6630,6 +6601,7 @@ tcp_connect_ipv6(tcp_t *tcp, mblk_t *mp, in6_addr_t *dstaddrp, ip6_rthdr_t *rth; int32_t oldstate; uint16_t lport; + tcp_stack_t *tcps = tcp->tcp_tcps; ASSERT(tcp->tcp_family == AF_INET6); @@ -6656,7 +6628,7 @@ tcp_connect_ipv6(tcp_t *tcp, mblk_t *mp, in6_addr_t *dstaddrp, /* Handle __sin6_src_id if socket not bound to an IP address */ if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_ip6h->ip6_src)) { ip_srcid_find_id(srcid, &tcp->tcp_ip6h->ip6_src, - tcp->tcp_connp->conn_zoneid); + tcp->tcp_connp->conn_zoneid, tcps->tcps_netstack); tcp->tcp_ip_src_v6 = tcp->tcp_ip6h->ip6_src; } @@ -6723,8 +6695,8 @@ tcp_connect_ipv6(tcp_t *tcp, mblk_t *mp, in6_addr_t *dstaddrp, */ rth = ip_find_rthdr_v6(tcp->tcp_ip6h, (uint8_t *)tcp->tcp_tcph); if (rth != NULL) { - - tcp->tcp_sum = ip_massage_options_v6(tcp->tcp_ip6h, rth); + tcp->tcp_sum = ip_massage_options_v6(tcp->tcp_ip6h, rth, + tcps->tcps_netstack); tcp->tcp_sum = ntohs((tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16)); } else { @@ -6748,7 +6720,8 @@ tcp_connect_ipv6(tcp_t *tcp, mblk_t *mp, in6_addr_t *dstaddrp, * tcp_bindi will pick an unused port, insert the connection * in the bind hash and transition to BOUND state. 
*/ - lport = tcp_update_next_port(tcp_next_port_to_try, tcp, B_TRUE); + lport = tcp_update_next_port(tcps->tcps_next_port_to_try, + tcp, B_TRUE); lport = tcp_bindi(tcp, lport, &tcp->tcp_ip_src_v6, 0, B_TRUE, B_FALSE, B_FALSE); if (lport == 0) { @@ -6777,7 +6750,7 @@ tcp_connect_ipv6(tcp_t *tcp, mblk_t *mp, in6_addr_t *dstaddrp, mblk_setcred(mp1, tcp->tcp_cred); mp1 = ip_bind_v6(tcp->tcp_wq, mp1, tcp->tcp_connp, &tcp->tcp_sticky_ipp); - BUMP_MIB(&tcp_mib, tcpActiveOpens); + BUMP_MIB(&tcps->tcps_mib, tcpActiveOpens); tcp->tcp_active_open = 1; /* ip_bind_v6() may return ACK or ERROR */ if (mp1 != NULL) @@ -6810,23 +6783,28 @@ tcp_def_q_set(tcp_t *tcp, mblk_t *mp) { struct iocblk *iocp = (struct iocblk *)mp->b_rptr; queue_t *q = tcp->tcp_wq; + tcp_stack_t *tcps = tcp->tcp_tcps; +#ifdef NS_DEBUG + (void) printf("TCP_IOC_DEFAULT_Q for stack %d\n", + tcps->tcps_netstack->netstack_stackid); +#endif mp->b_datap->db_type = M_IOCACK; iocp->ioc_count = 0; - mutex_enter(&tcp_g_q_lock); - if (tcp_g_q != NULL) { - mutex_exit(&tcp_g_q_lock); + mutex_enter(&tcps->tcps_g_q_lock); + if (tcps->tcps_g_q != NULL) { + mutex_exit(&tcps->tcps_g_q_lock); iocp->ioc_error = EALREADY; } else { mblk_t *mp1; mp1 = tcp_ip_bind_mp(tcp, O_T_BIND_REQ, 0); if (mp1 == NULL) { - mutex_exit(&tcp_g_q_lock); + mutex_exit(&tcps->tcps_g_q_lock); iocp->ioc_error = ENOMEM; } else { - tcp_g_q = tcp->tcp_rq; - mutex_exit(&tcp_g_q_lock); + tcps->tcps_g_q = tcp->tcp_rq; + mutex_exit(&tcps->tcps_g_q_lock); iocp->ioc_error = 0; iocp->ioc_rval = 0; /* @@ -6852,6 +6830,7 @@ tcp_disconnect(tcp_t *tcp, mblk_t *mp) tcp_t *ltcp = NULL; t_scalar_t seqnum; conn_t *connp; + tcp_stack_t *tcps = tcp->tcp_tcps; ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_discon_req)) { @@ -6894,6 +6873,7 @@ tcp_disconnect(tcp_t *tcp, mblk_t *mp) */ int old_state = tcp->tcp_state; + ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; /* * The connection can't be on the tcp_time_wait_head list @@ -6910,14 +6890,14 @@ tcp_disconnect(tcp_t *tcp, mblk_t *mp) if (tcp->tcp_ipversion == IPV4_VERSION) { connp = ipcl_lookup_listener_v4(tcp->tcp_lport, tcp->tcp_ipha->ipha_src, - tcp->tcp_connp->conn_zoneid); + tcp->tcp_connp->conn_zoneid, ipst); if (connp != NULL) ltcp = connp->conn_tcp; } else { /* Allow tcp_bound_if listeners? */ connp = ipcl_lookup_listener_v6(tcp->tcp_lport, &tcp->tcp_ip6h->ip6_src, 0, - tcp->tcp_connp->conn_zoneid); + tcp->tcp_connp->conn_zoneid, ipst); if (connp != NULL) ltcp = connp->conn_tcp; } @@ -6930,10 +6910,10 @@ tcp_disconnect(tcp_t *tcp, mblk_t *mp) if (ltcp != NULL) CONN_DEC_REF(ltcp->tcp_connp); if (old_state == TCPS_SYN_SENT || old_state == TCPS_SYN_RCVD) { - BUMP_MIB(&tcp_mib, tcpAttemptFails); + BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails); } else if (old_state == TCPS_ESTABLISHED || old_state == TCPS_CLOSE_WAIT) { - BUMP_MIB(&tcp_mib, tcpEstabResets); + BUMP_MIB(&tcps->tcps_mib, tcpEstabResets); } if (tcp->tcp_fused) @@ -7090,6 +7070,7 @@ tcp_eager_kill(void *arg, mblk_t *mp, void *arg2) conn_t *econnp = (conn_t *)arg; tcp_t *eager = econnp->conn_tcp; tcp_t *listener = eager->tcp_listener; + tcp_stack_t *tcps = eager->tcp_tcps; /* * We could be called because listener is closing. Since @@ -7097,8 +7078,9 @@ tcp_eager_kill(void *arg, mblk_t *mp, void *arg2) * Better use the default queue just to send the TH_RST * out. 
*/ - eager->tcp_rq = tcp_g_q; - eager->tcp_wq = WR(tcp_g_q); + ASSERT(tcps->tcps_g_q != NULL); + eager->tcp_rq = tcps->tcps_g_q; + eager->tcp_wq = WR(tcps->tcps_g_q); if (eager->tcp_state > TCPS_LISTEN) { tcp_xmit_ctl("tcp_eager_kill, can't wait", @@ -7136,8 +7118,9 @@ tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum) { tcp_t *eager; mblk_t *mp; + tcp_stack_t *tcps = listener->tcp_tcps; - TCP_STAT(tcp_eager_blowoff_calls); + TCP_STAT(tcps, tcp_eager_blowoff_calls); eager = listener; mutex_enter(&listener->tcp_eager_lock); do { @@ -7171,12 +7154,13 @@ tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only) { tcp_t *eager; mblk_t *mp; + tcp_stack_t *tcps = listener->tcp_tcps; ASSERT(MUTEX_HELD(&listener->tcp_eager_lock)); if (!q0_only) { /* First cleanup q */ - TCP_STAT(tcp_eager_blowoff_q); + TCP_STAT(tcps, tcp_eager_blowoff_q); eager = listener->tcp_eager_next_q; while (eager != NULL) { if (eager->tcp_closemp_used == 0) { @@ -7192,7 +7176,7 @@ tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only) } } /* Then cleanup q0 */ - TCP_STAT(tcp_eager_blowoff_q0); + TCP_STAT(tcps, tcp_eager_blowoff_q0); eager = listener->tcp_eager_next_q0; while (eager != listener) { if (eager->tcp_closemp_used == 0) { @@ -7323,10 +7307,12 @@ static int tcp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) { int i; + tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; - for (i = 0; i < tcp_g_num_epriv_ports; i++) { - if (tcp_g_epriv_ports[i] != 0) - (void) mi_mpprintf(mp, "%d ", tcp_g_epriv_ports[i]); + for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) { + if (tcps->tcps_g_epriv_ports[i] != 0) + (void) mi_mpprintf(mp, "%d ", + tcps->tcps_g_epriv_ports[i]); } return (0); } @@ -7342,6 +7328,7 @@ tcp_extra_priv_ports_add(queue_t *q, mblk_t *mp, char *value, caddr_t cp, { long new_value; int i; + tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; /* * Fail the request if the new value does not lie within the @@ -7352,26 +7339,26 @@ tcp_extra_priv_ports_add(queue_t *q, mblk_t *mp, char *value, caddr_t cp, return (EINVAL); } - mutex_enter(&tcp_epriv_port_lock); + mutex_enter(&tcps->tcps_epriv_port_lock); /* Check if the value is already in the list */ - for (i = 0; i < tcp_g_num_epriv_ports; i++) { - if (new_value == tcp_g_epriv_ports[i]) { - mutex_exit(&tcp_epriv_port_lock); + for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) { + if (new_value == tcps->tcps_g_epriv_ports[i]) { + mutex_exit(&tcps->tcps_epriv_port_lock); return (EEXIST); } } /* Find an empty slot */ - for (i = 0; i < tcp_g_num_epriv_ports; i++) { - if (tcp_g_epriv_ports[i] == 0) + for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) { + if (tcps->tcps_g_epriv_ports[i] == 0) break; } - if (i == tcp_g_num_epriv_ports) { - mutex_exit(&tcp_epriv_port_lock); + if (i == tcps->tcps_g_num_epriv_ports) { + mutex_exit(&tcps->tcps_epriv_port_lock); return (EOVERFLOW); } /* Set the new value */ - tcp_g_epriv_ports[i] = (uint16_t)new_value; - mutex_exit(&tcp_epriv_port_lock); + tcps->tcps_g_epriv_ports[i] = (uint16_t)new_value; + mutex_exit(&tcps->tcps_epriv_port_lock); return (0); } @@ -7386,6 +7373,7 @@ tcp_extra_priv_ports_del(queue_t *q, mblk_t *mp, char *value, caddr_t cp, { long new_value; int i; + tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; /* * Fail the request if the new value does not lie within the @@ -7396,19 +7384,19 @@ tcp_extra_priv_ports_del(queue_t *q, mblk_t *mp, char *value, caddr_t cp, return (EINVAL); } - mutex_enter(&tcp_epriv_port_lock); + mutex_enter(&tcps->tcps_epriv_port_lock); /* Check that the value is already in the list */ - for (i = 
0; i < tcp_g_num_epriv_ports; i++) { - if (tcp_g_epriv_ports[i] == new_value) + for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) { + if (tcps->tcps_g_epriv_ports[i] == new_value) break; } - if (i == tcp_g_num_epriv_ports) { - mutex_exit(&tcp_epriv_port_lock); + if (i == tcps->tcps_g_num_epriv_ports) { + mutex_exit(&tcps->tcps_epriv_port_lock); return (ESRCH); } /* Clear the value */ - tcp_g_epriv_ports[i] = 0; - mutex_exit(&tcp_epriv_port_lock); + tcps->tcps_g_epriv_ports[i] = 0; + mutex_exit(&tcps->tcps_epriv_port_lock); return (0); } @@ -7473,6 +7461,8 @@ tcp_tpistate(tcp_t *tcp) static void tcp_copy_info(struct T_info_ack *tia, tcp_t *tcp) { + tcp_stack_t *tcps = tcp->tcp_tcps; + if (tcp->tcp_family == AF_INET6) *tia = tcp_g_t_info_ack_v6; else @@ -7482,9 +7472,9 @@ tcp_copy_info(struct T_info_ack *tia, tcp_t *tcp) if (tcp->tcp_mss == 0) { /* Not yet set - tcp_open does not set mss */ if (tcp->tcp_ipversion == IPV4_VERSION) - tia->TIDU_size = tcp_mss_def_ipv4; + tia->TIDU_size = tcps->tcps_mss_def_ipv4; else - tia->TIDU_size = tcp_mss_def_ipv6; + tia->TIDU_size = tcps->tcps_mss_def_ipv6; } else { tia->TIDU_size = tcp->tcp_mss; } @@ -7692,8 +7682,9 @@ tcp_reinit(tcp_t *tcp) { mblk_t *mp; int err; + tcp_stack_t *tcps = tcp->tcp_tcps; - TCP_STAT(tcp_reinit_calls); + TCP_STAT(tcps, tcp_reinit_calls); /* tcp_reinit should never be called for detached tcp_t's */ ASSERT(tcp->tcp_listener == NULL); @@ -7710,9 +7701,9 @@ tcp_reinit(tcp_t *tcp) * Reset everything in the state vector, after updating global * MIB data from instance counters. */ - UPDATE_MIB(&tcp_mib, tcpHCInSegs, tcp->tcp_ibsegs); + UPDATE_MIB(&tcps->tcps_mib, tcpHCInSegs, tcp->tcp_ibsegs); tcp->tcp_ibsegs = 0; - UPDATE_MIB(&tcp_mib, tcpHCOutSegs, tcp->tcp_obsegs); + UPDATE_MIB(&tcps->tcps_mib, tcpHCOutSegs, tcp->tcp_obsegs); tcp->tcp_obsegs = 0; tcp_close_mpp(&tcp->tcp_xmit_head); @@ -7787,6 +7778,7 @@ tcp_reinit(tcp_t *tcp) tcp_reinit_values(tcp); ipcl_hash_remove(tcp->tcp_connp); conn_delete_ire(tcp->tcp_connp, NULL); + tcp_ipsec_cleanup(tcp); if (tcp->tcp_conn_req_max != 0) { /* @@ -7844,10 +7836,10 @@ tcp_reinit(tcp_t *tcp) tcp->tcp_ip_src_v6 = tcp->tcp_bound_source_v6; ASSERT(tcp->tcp_ptpbhn != NULL); - tcp->tcp_rq->q_hiwat = tcp_recv_hiwat; - tcp->tcp_rwnd = tcp_recv_hiwat; + tcp->tcp_rq->q_hiwat = tcps->tcps_recv_hiwat; + tcp->tcp_rwnd = tcps->tcps_recv_hiwat; tcp->tcp_mss = tcp->tcp_ipversion != IPV4_VERSION ? - tcp_mss_def_ipv6 : tcp_mss_def_ipv4; + tcps->tcps_mss_def_ipv6 : tcps->tcps_mss_def_ipv4; } /* @@ -7861,6 +7853,8 @@ static void tcp_reinit_values(tcp) tcp_t *tcp; { + tcp_stack_t *tcps = tcp->tcp_tcps; + #ifndef lint #define DONTCARE(x) #define PRESERVE(x) @@ -8092,10 +8086,10 @@ tcp_reinit_values(tcp) PRESERVE(tcp->tcp_family); if (tcp->tcp_family == AF_INET6) { tcp->tcp_ipversion = IPV6_VERSION; - tcp->tcp_mss = tcp_mss_def_ipv6; + tcp->tcp_mss = tcps->tcps_mss_def_ipv6; } else { tcp->tcp_ipversion = IPV4_VERSION; - tcp->tcp_mss = tcp_mss_def_ipv4; + tcp->tcp_mss = tcps->tcps_mss_def_ipv4; } tcp->tcp_bound_if = 0; @@ -8187,6 +8181,7 @@ static int tcp_init_values(tcp_t *tcp) { int err; + tcp_stack_t *tcps = tcp->tcp_tcps; ASSERT((tcp->tcp_family == AF_INET && tcp->tcp_ipversion == IPV4_VERSION) || @@ -8201,32 +8196,32 @@ tcp_init_values(tcp_t *tcp) * during first few transmissions of a connection as seen in slow * links. 
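The seeding that follows keeps tcp_rtt_sa scaled by 8 and tcp_rtt_sd scaled by 4, so the shifts below combine into a deliberately conservative first RTO. Recomputing it standalone, with tick values that are illustrative defaults rather than authoritative ones:

    #include <stdio.h>

    int
    main(void)
    {
            long initial = 3000;    /* rexmit_interval_initial stand-in */
            long extra = 0;         /* rexmit_interval_extra stand-in */
            long grace = 0;         /* conn_grace_period stand-in */
            long rto_min = 400;     /* rexmit_interval_min stand-in */
            long sa, sd, rto;

            sa = initial << 2;      /* smoothed RTT, scaled by 8 */
            sd = initial >> 1;      /* deviation, scaled by 4 */
            rto = (sa >> 3) + sd + extra + (sa >> 5) + grace;
            if (rto < rto_min)
                    rto = rto_min;
            printf("first rto = %ld\n", rto);       /* prints 3375 */
            return (0);
    }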
*/ - tcp->tcp_rtt_sa = tcp_rexmit_interval_initial << 2; - tcp->tcp_rtt_sd = tcp_rexmit_interval_initial >> 1; + tcp->tcp_rtt_sa = tcps->tcps_rexmit_interval_initial << 2; + tcp->tcp_rtt_sd = tcps->tcps_rexmit_interval_initial >> 1; tcp->tcp_rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + - tcp_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5) + - tcp_conn_grace_period; - if (tcp->tcp_rto < tcp_rexmit_interval_min) - tcp->tcp_rto = tcp_rexmit_interval_min; + tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5) + + tcps->tcps_conn_grace_period; + if (tcp->tcp_rto < tcps->tcps_rexmit_interval_min) + tcp->tcp_rto = tcps->tcps_rexmit_interval_min; tcp->tcp_timer_backoff = 0; tcp->tcp_ms_we_have_waited = 0; tcp->tcp_last_recv_time = lbolt; - tcp->tcp_cwnd_max = tcp_cwnd_max_; + tcp->tcp_cwnd_max = tcps->tcps_cwnd_max_; tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN; tcp->tcp_snd_burst = TCP_CWND_INFINITE; - tcp->tcp_maxpsz = tcp_maxpsz_multiplier; + tcp->tcp_maxpsz = tcps->tcps_maxpsz_multiplier; - tcp->tcp_first_timer_threshold = tcp_ip_notify_interval; - tcp->tcp_first_ctimer_threshold = tcp_ip_notify_cinterval; - tcp->tcp_second_timer_threshold = tcp_ip_abort_interval; + tcp->tcp_first_timer_threshold = tcps->tcps_ip_notify_interval; + tcp->tcp_first_ctimer_threshold = tcps->tcps_ip_notify_cinterval; + tcp->tcp_second_timer_threshold = tcps->tcps_ip_abort_interval; /* * Fix it to tcp_ip_abort_linterval later if it turns out to be a * passive open. */ - tcp->tcp_second_ctimer_threshold = tcp_ip_abort_cinterval; + tcp->tcp_second_ctimer_threshold = tcps->tcps_ip_abort_cinterval; - tcp->tcp_naglim = tcp_naglim_def; + tcp->tcp_naglim = tcps->tcps_naglim_def; /* NOTE: ISS is now set in tcp_adapt_ire(). */ @@ -8259,8 +8254,8 @@ tcp_init_values(tcp_t *tcp) * down tcp_rwnd. tcp_adapt_ire() will set the right value later. */ tcp->tcp_rcv_ws = TCP_MAX_WINSHIFT; - tcp->tcp_xmit_lowater = tcp_xmit_lowat; - tcp->tcp_xmit_hiwater = tcp_xmit_hiwat; + tcp->tcp_xmit_lowater = tcps->tcps_xmit_lowat; + tcp->tcp_xmit_hiwater = tcps->tcps_xmit_hiwat; tcp->tcp_cork = B_FALSE; /* @@ -8269,10 +8264,10 @@ tcp_init_values(tcp_t *tcp) * initialization here means that this value is not inherited thru * tcp_reinit(). */ - tcp->tcp_debug = tcp_dbg; + tcp->tcp_debug = tcps->tcps_dbg; - tcp->tcp_ka_interval = tcp_keepalive_interval; - tcp->tcp_ka_abort_thres = tcp_keepalive_abort_interval; + tcp->tcp_ka_interval = tcps->tcps_keepalive_interval; + tcp->tcp_ka_abort_thres = tcps->tcps_keepalive_abort_interval; return (0); } @@ -8286,6 +8281,7 @@ tcp_header_init_ipv4(tcp_t *tcp) tcph_t *tcph; uint32_t sum; conn_t *connp; + tcp_stack_t *tcps = tcp->tcp_tcps; /* * This is a simple initialization. If there's @@ -8318,10 +8314,10 @@ tcp_header_init_ipv4(tcp_t *tcp) = (IP_VERSION << 4) | IP_SIMPLE_HDR_LENGTH_IN_WORDS; tcp->tcp_ipha->ipha_ident = 0; - tcp->tcp_ttl = (uchar_t)tcp_ipv4_ttl; + tcp->tcp_ttl = (uchar_t)tcps->tcps_ipv4_ttl; tcp->tcp_tos = 0; tcp->tcp_ipha->ipha_fragment_offset_and_flags = 0; - tcp->tcp_ipha->ipha_ttl = (uchar_t)tcp_ipv4_ttl; + tcp->tcp_ipha->ipha_ttl = (uchar_t)tcps->tcps_ipv4_ttl; tcp->tcp_ipha->ipha_protocol = IPPROTO_TCP; tcph = (tcph_t *)(tcp->tcp_iphc + sizeof (ipha_t)); @@ -8348,6 +8344,7 @@ tcp_header_init_ipv6(tcp_t *tcp) tcph_t *tcph; uint32_t sum; conn_t *connp; + tcp_stack_t *tcps = tcp->tcp_tcps; /* * This is a simple initialization. 
If there's @@ -8390,7 +8387,7 @@ tcp_header_init_ipv6(tcp_t *tcp) tcp->tcp_ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW; tcp->tcp_ip6h->ip6_plen = ntohs(sizeof (tcph_t)); tcp->tcp_ip6h->ip6_nxt = IPPROTO_TCP; - tcp->tcp_ip6h->ip6_hops = (uint8_t)tcp_ipv6_hoplimit; + tcp->tcp_ip6h->ip6_hops = (uint8_t)tcps->tcps_ipv6_hoplimit; tcph = (tcph_t *)(tcp->tcp_iphc + IPV6_HDR_LEN); tcp->tcp_tcph = tcph; @@ -8429,6 +8426,7 @@ tcp_icmp_error(tcp_t *tcp, mblk_t *mp) uint32_t ratio; size_t mp_size = MBLKL(mp); uint32_t seg_seq; + tcp_stack_t *tcps = tcp->tcp_tcps; /* Assume IP provides aligned packets - otherwise toss */ if (!OK_32PTR(mp->b_rptr)) { @@ -8571,7 +8569,7 @@ noticmpv4: * tcp_wput_data(). Need to adjust all those * params to make sure tcp_wput_data() works properly. */ - if (tcp_ignore_path_mtu) + if (tcps->tcps_ignore_path_mtu) break; /* @@ -8598,7 +8596,7 @@ noticmpv4: * or less than tcp_mss_min. * The value 68 comes from rfc 1191. */ - if (new_mss < MAX(68, tcp_mss_min)) + if (new_mss < MAX(68, tcps->tcps_mss_min)) tcp->tcp_ipha->ipha_fragment_offset_and_flags = 0; @@ -8717,6 +8715,7 @@ tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, boolean_t ipsec_mctl) mblk_t *first_mp = mp; size_t mp_size; uint32_t seg_seq; + tcp_stack_t *tcps = tcp->tcp_tcps; /* * The caller has determined if this is an IPSEC_IN packet and @@ -8842,7 +8841,7 @@ noticmpv6: * tcp_wput_data(). Need to adjust all those * params to make sure tcp_wput_data() works properly. */ - if (tcp_ignore_path_mtu) + if (tcps->tcps_ignore_path_mtu) break; /* @@ -9193,13 +9192,14 @@ tcp_keepalive_killer(void *arg) int32_t firetime; int32_t idletime; int32_t ka_intrvl; + tcp_stack_t *tcps = tcp->tcp_tcps; tcp->tcp_ka_tid = 0; if (tcp->tcp_fused) return; - BUMP_MIB(&tcp_mib, tcpTimKeepalive); + BUMP_MIB(&tcps->tcps_mib, tcpTimKeepalive); ka_intrvl = tcp->tcp_ka_interval; /* @@ -9224,7 +9224,7 @@ tcp_keepalive_killer(void *arg) */ if (tcp->tcp_ka_abort_thres != 0 && idletime > (ka_intrvl + tcp->tcp_ka_abort_thres)) { - BUMP_MIB(&tcp_mib, tcpTimKeepaliveDrop); + BUMP_MIB(&tcps->tcps_mib, tcpTimKeepaliveDrop); (void) tcp_clean_death(tcp, tcp->tcp_client_errno ? tcp->tcp_client_errno : ETIMEDOUT, 11); return; @@ -9248,18 +9248,20 @@ tcp_keepalive_killer(void *arg) TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_SEND_PKT); tcp_send_data(tcp, tcp->tcp_wq, mp); - BUMP_MIB(&tcp_mib, tcpTimKeepaliveProbe); + BUMP_MIB(&tcps->tcps_mib, + tcpTimKeepaliveProbe); if (tcp->tcp_ka_last_intrvl != 0) { + int max; /* * We should probe again at least * in ka_intrvl, but not more than * tcp_rexmit_interval_max. */ + max = tcps->tcps_rexmit_interval_max; firetime = MIN(ka_intrvl - 1, tcp->tcp_ka_last_intrvl << 1); - if (firetime > tcp_rexmit_interval_max) - firetime = - tcp_rexmit_interval_max; + if (firetime > max) + firetime = max; } else { firetime = tcp->tcp_rto; } @@ -9501,14 +9503,15 @@ static void tcp_mss_set(tcp_t *tcp, uint32_t mss) { uint32_t mss_max; + tcp_stack_t *tcps = tcp->tcp_tcps; if (tcp->tcp_ipversion == IPV4_VERSION) - mss_max = tcp_mss_max_ipv4; + mss_max = tcps->tcps_mss_max_ipv4; else - mss_max = tcp_mss_max_ipv6; + mss_max = tcps->tcps_mss_max_ipv6; - if (mss < tcp_mss_min) - mss = tcp_mss_min; + if (mss < tcps->tcps_mss_min) + mss = tcps->tcps_mss_min; if (mss > mss_max) mss = mss_max; /* @@ -9532,7 +9535,7 @@ tcp_mss_set(tcp_t *tcp, uint32_t mss) * The new tcp_cwnd should not get bigger.
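The cwnd initialization just below caps the slow-start burst with the RFC 3390-style bound MIN(4*MSS, MAX(2*MSS, 4380/MSS*MSS)). A worked recomputation with two common MSS values:

    #include <stdint.h>
    #include <stdio.h>

    #define MIN(a, b)       ((a) < (b) ? (a) : (b))
    #define MAX(a, b)       ((a) > (b) ? (a) : (b))

    static uint32_t
    initial_cwnd(uint32_t mss, uint32_t slow_start_initial)
    {
            /* integer division on purpose: 4380/mss*mss rounds down */
            return (MIN(slow_start_initial * mss,
                MIN(4 * mss, MAX(2 * mss, 4380 / mss * mss))));
    }

    int
    main(void)
    {
            printf("%u\n", initial_cwnd(1460, 4));  /* 4380 = 3 * 1460 */
            printf("%u\n", initial_cwnd(536, 4));   /* 2144 = 4 * 536 */
            return (0);
    }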
*/ if (tcp->tcp_init_cwnd == 0) { - tcp->tcp_cwnd = MIN(tcp_slow_start_initial * mss, + tcp->tcp_cwnd = MIN(tcps->tcps_slow_start_initial * mss, MIN(4 * mss, MAX(2 * mss, 4380 / mss * mss))); } else { if (tcp->tcp_mss < mss) { @@ -9554,25 +9557,60 @@ tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) conn_t *connp; int err; dev_t conn_dev; - zoneid_t zoneid = getzoneid(); - - /* - * Special case for install: miniroot needs to be able to access files - * via NFS as though it were always in the global zone. - */ - if (credp == kcred && nfs_global_client_only != 0) - zoneid = GLOBAL_ZONEID; + zoneid_t zoneid; + tcp_stack_t *tcps = NULL; if (q->q_ptr != NULL) return (0); + if (!(flag & SO_ACCEPTOR)) { + /* + * Special case for install: miniroot needs to be able to + * access files via NFS as though it were always in the + * global zone. + */ + if (credp == kcred && nfs_global_client_only != 0) { + zoneid = GLOBAL_ZONEID; + tcps = netstack_find_by_stackid(GLOBAL_NETSTACKID)-> + netstack_tcp; + ASSERT(tcps != NULL); + } else { + netstack_t *ns; + + ns = netstack_find_by_cred(credp); + ASSERT(ns != NULL); + tcps = ns->netstack_tcp; + ASSERT(tcps != NULL); + + /* + * For exclusive stacks we set the zoneid to zero + * to make TCP operate as if in the global zone. + */ + if (tcps->tcps_netstack->netstack_stackid != + GLOBAL_NETSTACKID) + zoneid = GLOBAL_ZONEID; + else + zoneid = crgetzoneid(credp); + } + /* + * For stackid zero this is done from strplumb.c, but + * non-zero stackids are handled here. + */ + if (tcps->tcps_g_q == NULL && + tcps->tcps_netstack->netstack_stackid != + GLOBAL_NETSTACKID) { + tcp_g_q_setup(tcps); + } + } if (sflag == MODOPEN) { /* * This is a special case. The purpose of a modopen * is to allow just the T_SVR4_OPTMGMT_REQ to pass * through for MIB browsers. Everything else is failed. */ - connp = (conn_t *)tcp_get_conn(IP_SQUEUE_GET(lbolt)); + connp = (conn_t *)tcp_get_conn(IP_SQUEUE_GET(lbolt), tcps); + /* tcp_get_conn incremented refcnt */ + netstack_rele(tcps->tcps_netstack); if (connp == NULL) return (ENOMEM); @@ -9580,6 +9618,8 @@ tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) connp->conn_flags |= IPCL_TCPMOD; connp->conn_cred = credp; connp->conn_zoneid = zoneid; + ASSERT(connp->conn_netstack == tcps->tcps_netstack); + ASSERT(connp->conn_netstack->netstack_tcp == tcps); q->q_ptr = WR(q)->q_ptr = connp; crhold(credp); q->q_qinfo = &tcp_mod_rinit; @@ -9587,13 +9627,17 @@ tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) qprocson(q); return (0); } - - if ((conn_dev = inet_minor_alloc(ip_minor_arena)) == 0) + if ((conn_dev = inet_minor_alloc(ip_minor_arena)) == 0) { + if (tcps != NULL) + netstack_rele(tcps->tcps_netstack); return (EBUSY); + } *devp = makedevice(getemajor(*devp), (minor_t)conn_dev); if (flag & SO_ACCEPTOR) { + /* No netstack_find_by_cred, hence no netstack_rele needed */ + ASSERT(tcps == NULL); q->q_qinfo = &tcp_acceptor_rinit; q->q_ptr = (void *)conn_dev; WR(q)->q_qinfo = &tcp_acceptor_winit; @@ -9602,7 +9646,12 @@ tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) return (0); } - connp = (conn_t *)tcp_get_conn(IP_SQUEUE_GET(lbolt)); + connp = (conn_t *)tcp_get_conn(IP_SQUEUE_GET(lbolt), tcps); + /* + * Both tcp_get_conn and netstack_find_by_cred incremented refcnt, + * so we drop it by one. 
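That comment describes a hold/release invariant that is easy to model: the lookup and the conn each take a reference on the netstack, and the opener must give back exactly the lookup's. A toy refcount model, with all names hypothetical:

    #include <assert.h>

    typedef struct { int refcnt; } ns_t;

    static void ns_hold(ns_t *ns) { ns->refcnt++; }
    static void ns_rele(ns_t *ns) { assert(ns->refcnt > 0); ns->refcnt--; }

    /* Stands in for netstack_find_by_cred(): returns with a hold. */
    static ns_t *
    find_ns(ns_t *ns)
    {
            ns_hold(ns);
            return (ns);
    }

    /* Stands in for tcp_get_conn(): the new conn keeps its own hold. */
    static void
    get_conn(ns_t *ns)
    {
            ns_hold(ns);
    }

    int
    main(void)
    {
            ns_t ns = { 1 };                /* creation reference */
            ns_t *found = find_ns(&ns);     /* +1 for the lookup */

            get_conn(found);                /* +1 held by the conn */
            ns_rele(found);                 /* drop the lookup hold only */
            assert(ns.refcnt == 2);         /* creation + conn remain */
            return (0);
    }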
+ */ + netstack_rele(tcps->tcps_netstack); if (connp == NULL) { inet_minor_free(ip_minor_arena, conn_dev); q->q_ptr = NULL; @@ -9620,7 +9669,7 @@ tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) connp->conn_src_preferences = IPV6_PREFER_SRC_DEFAULT; tcp->tcp_ipversion = IPV6_VERSION; tcp->tcp_family = AF_INET6; - tcp->tcp_mss = tcp_mss_def_ipv6; + tcp->tcp_mss = tcps->tcps_mss_def_ipv6; } else { connp->conn_flags |= IPCL_TCP4; connp->conn_send = ip_output; @@ -9628,7 +9677,7 @@ tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) connp->conn_pkt_isv6 = B_FALSE; tcp->tcp_ipversion = IPV4_VERSION; tcp->tcp_family = AF_INET; - tcp->tcp_mss = tcp_mss_def_ipv4; + tcp->tcp_mss = tcps->tcps_mss_def_ipv4; } /* @@ -9643,6 +9692,8 @@ tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) connp->conn_zoneid = zoneid; connp->conn_mlp_type = mlptSingle; connp->conn_ulp_labeled = !is_system_labeled(); + ASSERT(connp->conn_netstack == tcps->tcps_netstack); + ASSERT(tcp->tcp_tcps == tcps); /* * If the caller has the process-wide flag set, then default to MAC @@ -9675,7 +9726,7 @@ tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp); } - if (tcp_trace) + if (tcps->tcps_trace) tcp->tcp_tracebuf = kmem_zalloc(sizeof (tcptrch_t), KM_SLEEP); err = tcp_init(tcp, q); @@ -9687,8 +9738,8 @@ tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) return (err); } - RD(q)->q_hiwat = tcp_recv_hiwat; - tcp->tcp_rwnd = tcp_recv_hiwat; + RD(q)->q_hiwat = tcps->tcps_recv_hiwat; + tcp->tcp_rwnd = tcps->tcps_recv_hiwat; /* Non-zero default values */ connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; @@ -9745,21 +9796,22 @@ int tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr) { int32_t *i1 = (int32_t *)ptr; + tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; switch (level) { case IPPROTO_TCP: switch (name) { case TCP_NOTIFY_THRESHOLD: - *i1 = tcp_ip_notify_interval; + *i1 = tcps->tcps_ip_notify_interval; break; case TCP_ABORT_THRESHOLD: - *i1 = tcp_ip_abort_interval; + *i1 = tcps->tcps_ip_abort_interval; break; case TCP_CONN_NOTIFY_THRESHOLD: - *i1 = tcp_ip_notify_cinterval; + *i1 = tcps->tcps_ip_notify_cinterval; break; case TCP_CONN_ABORT_THRESHOLD: - *i1 = tcp_ip_abort_cinterval; + *i1 = tcps->tcps_ip_abort_cinterval; break; default: return (-1); @@ -9768,7 +9820,7 @@ tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr) case IPPROTO_IP: switch (name) { case IP_TTL: - *i1 = tcp_ipv4_ttl; + *i1 = tcps->tcps_ipv4_ttl; break; default: return (-1); @@ -9777,7 +9829,7 @@ tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr) case IPPROTO_IPV6: switch (name) { case IPV6_UNICAST_HOPS: - *i1 = tcp_ipv6_hoplimit; + *i1 = tcps->tcps_ipv6_hoplimit; break; default: return (-1); @@ -10093,7 +10145,8 @@ tcp_opt_get(queue_t *q, int level, int name, uchar_t *ptr) return (-1); return (ip_fill_mtuinfo(&connp->conn_remv6, - connp->conn_fport, mtuinfo)); + connp->conn_fport, mtuinfo, + connp->conn_netstack)); } default: return (-1); @@ -10121,6 +10174,7 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name, boolean_t onoff = (*i1 == 0) ? 
0 : 1; boolean_t checkonly; int reterr; + tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; switch (optset_context) { case SETFN_OPTCOM_CHECKONLY: @@ -10280,7 +10334,7 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name, tcp->tcp_dgram_errind = onoff; break; case SO_SNDBUF: { - if (*i1 > tcp_max_buf) { + if (*i1 > tcps->tcps_max_buf) { *outlenp = 0; return (ENOBUFS); } @@ -10288,10 +10342,10 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name, break; tcp->tcp_xmit_hiwater = *i1; - if (tcp_snd_lowat_fraction != 0) + if (tcps->tcps_snd_lowat_fraction != 0) tcp->tcp_xmit_lowater = tcp->tcp_xmit_hiwater / - tcp_snd_lowat_fraction; + tcps->tcps_snd_lowat_fraction; (void) tcp_maxpsz_set(tcp, B_TRUE); /* * If we are flow-controlled, recheck the condition. @@ -10308,7 +10362,7 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name, break; } case SO_RCVBUF: - if (*i1 > tcp_max_buf) { + if (*i1 > tcps->tcps_max_buf) { *outlenp = 0; return (ENOBUFS); } @@ -10419,7 +10473,7 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name, tcp->tcp_init_cwnd = init_cwnd; break; } - if ((reterr = secpolicy_net_config(cr, B_TRUE)) != 0) { + if ((reterr = secpolicy_ip_config(cr, B_TRUE)) != 0) { *outlenp = 0; return (reterr); } @@ -10434,8 +10488,8 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name, if (checkonly) break; - if (*i1 < tcp_keepalive_interval_low || - *i1 > tcp_keepalive_interval_high) { + if (*i1 < tcps->tcps_keepalive_interval_low || + *i1 > tcps->tcps_keepalive_interval_high) { *outlenp = 0; return (EINVAL); } @@ -10458,8 +10512,10 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name, break; case TCP_KEEPALIVE_ABORT_THRESHOLD: if (!checkonly) { - if (*i1 < tcp_keepalive_abort_interval_low || - *i1 > tcp_keepalive_abort_interval_high) { + if (*i1 < + tcps->tcps_keepalive_abort_interval_low || + *i1 > + tcps->tcps_keepalive_abort_interval_high) { *outlenp = 0; return (EINVAL); } @@ -10571,7 +10627,7 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name, if (*i1 == -1) { tcp->tcp_ip6h->ip6_hops = ipp->ipp_unicast_hops = - (uint8_t)tcp_ipv6_hoplimit; + (uint8_t)tcps->tcps_ipv6_hoplimit; ipp->ipp_fields &= ~IPPF_UNICAST_HOPS; /* Pass modified value to IP. */ *i1 = tcp->tcp_ip6h->ip6_hops; @@ -10973,6 +11029,7 @@ tcp_build_hdrs(queue_t *q, tcp_t *tcp) char buf[TCP_MAX_HDR_LENGTH]; ip6_pkt_t *ipp = &tcp->tcp_sticky_ipp; in6_addr_t src, dst; + tcp_stack_t *tcps = tcp->tcp_tcps; /* * save the existing tcp header and source/dest IP addresses @@ -11030,7 +11087,7 @@ tcp_build_hdrs(queue_t *q, tcp_t *tcp) * the default value for TCP. 
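Earlier in this hunk, the SO_SNDBUF case recomputes the transmit low-water mark from the new high-water mark whenever tcps_snd_lowat_fraction is nonzero. The arithmetic in isolation; the fraction of 10 is only an example value:

    #include <stdint.h>
    #include <stdio.h>

    static void
    set_sndbuf(uint32_t newval, uint32_t lowat_fraction,
        uint32_t *hiwat, uint32_t *lowat)
    {
            *hiwat = newval;
            if (lowat_fraction != 0)
                    *lowat = *hiwat / lowat_fraction;
    }

    int
    main(void)
    {
            uint32_t hi = 0, lo = 2048;

            set_sndbuf(49152, 10, &hi, &lo);
            printf("hiwat=%u lowat=%u\n", hi, lo);  /* 49152, 4915 */
            return (0);
    }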
*/ if (!(ipp->ipp_fields & IPPF_UNICAST_HOPS)) - tcp->tcp_ip6h->ip6_hops = tcp_ipv6_hoplimit; + tcp->tcp_ip6h->ip6_hops = tcps->tcps_ipv6_hoplimit; /* * If we're setting extension headers after a connection @@ -11050,14 +11107,14 @@ tcp_build_hdrs(queue_t *q, tcp_t *tcp) (uint8_t *)tcp->tcp_tcph); if (rth != NULL) { tcp->tcp_sum = ip_massage_options_v6(tcp->tcp_ip6h, - rth); + rth, tcps->tcps_netstack); tcp->tcp_sum = ntohs((tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16)); } } /* Try to get everything in a single mblk */ - (void) mi_set_sth_wroff(RD(q), hdrs_len + tcp_wroff_xtra); + (void) mi_set_sth_wroff(RD(q), hdrs_len + tcps->tcps_wroff_xtra); return (0); } @@ -11183,6 +11240,7 @@ tcp_opt_set_header(tcp_t *tcp, boolean_t checkonly, uchar_t *ptr, uint_t len) uint_t tcph_len; uint8_t *ip_optp; tcph_t *new_tcph; + tcp_stack_t *tcps = tcp->tcp_tcps; if ((len > TCP_MAX_IP_OPTIONS_LENGTH) || (len & 0x3)) return (EINVAL); @@ -11224,7 +11282,7 @@ tcp_opt_set_header(tcp_t *tcp, boolean_t checkonly, uchar_t *ptr, uint_t len) if (!TCP_IS_DETACHED(tcp)) { /* Always allocate room for all options. */ (void) mi_set_sth_wroff(tcp->tcp_rq, - TCP_MAX_COMBINED_HEADER_LENGTH + tcp_wroff_xtra); + TCP_MAX_COMBINED_HEADER_LENGTH + tcps->tcps_wroff_xtra); } return (0); } @@ -11245,100 +11303,116 @@ tcp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) * named dispatch handler. */ static boolean_t -tcp_param_register(tcpparam_t *tcppa, int cnt) +tcp_param_register(IDP *ndp, tcpparam_t *tcppa, int cnt, tcp_stack_t *tcps) { for (; cnt-- > 0; tcppa++) { if (tcppa->tcp_param_name && tcppa->tcp_param_name[0]) { - if (!nd_load(&tcp_g_nd, tcppa->tcp_param_name, + if (!nd_load(ndp, tcppa->tcp_param_name, tcp_param_get, tcp_param_set, (caddr_t)tcppa)) { - nd_free(&tcp_g_nd); + nd_free(ndp); return (B_FALSE); } } } - if (!nd_load(&tcp_g_nd, tcp_wroff_xtra_param.tcp_param_name, + tcps->tcps_wroff_xtra_param = kmem_zalloc(sizeof (tcpparam_t), + KM_SLEEP); + bcopy(&lcl_tcp_wroff_xtra_param, tcps->tcps_wroff_xtra_param, + sizeof (tcpparam_t)); + if (!nd_load(ndp, tcps->tcps_wroff_xtra_param->tcp_param_name, tcp_param_get, tcp_param_set_aligned, - (caddr_t)&tcp_wroff_xtra_param)) { - nd_free(&tcp_g_nd); + (caddr_t)tcps->tcps_wroff_xtra_param)) { + nd_free(ndp); return (B_FALSE); } - if (!nd_load(&tcp_g_nd, tcp_mdt_head_param.tcp_param_name, + tcps->tcps_mdt_head_param = kmem_zalloc(sizeof (tcpparam_t), + KM_SLEEP); + bcopy(&lcl_tcp_mdt_head_param, tcps->tcps_mdt_head_param, + sizeof (tcpparam_t)); + if (!nd_load(ndp, tcps->tcps_mdt_head_param->tcp_param_name, tcp_param_get, tcp_param_set_aligned, - (caddr_t)&tcp_mdt_head_param)) { - nd_free(&tcp_g_nd); + (caddr_t)tcps->tcps_mdt_head_param)) { + nd_free(ndp); return (B_FALSE); } - if (!nd_load(&tcp_g_nd, tcp_mdt_tail_param.tcp_param_name, + tcps->tcps_mdt_tail_param = kmem_zalloc(sizeof (tcpparam_t), + KM_SLEEP); + bcopy(&lcl_tcp_mdt_tail_param, tcps->tcps_mdt_tail_param, + sizeof (tcpparam_t)); + if (!nd_load(ndp, tcps->tcps_mdt_tail_param->tcp_param_name, tcp_param_get, tcp_param_set_aligned, - (caddr_t)&tcp_mdt_tail_param)) { - nd_free(&tcp_g_nd); + (caddr_t)tcps->tcps_mdt_tail_param)) { + nd_free(ndp); return (B_FALSE); } - if (!nd_load(&tcp_g_nd, tcp_mdt_max_pbufs_param.tcp_param_name, - tcp_param_get, tcp_param_set, - (caddr_t)&tcp_mdt_max_pbufs_param)) { - nd_free(&tcp_g_nd); + tcps->tcps_mdt_max_pbufs_param = kmem_zalloc(sizeof (tcpparam_t), + KM_SLEEP); + bcopy(&lcl_tcp_mdt_max_pbufs_param, tcps->tcps_mdt_max_pbufs_param, + sizeof (tcpparam_t)); + if 
(!nd_load(ndp, tcps->tcps_mdt_max_pbufs_param->tcp_param_name, + tcp_param_get, tcp_param_set_aligned, + (caddr_t)tcps->tcps_mdt_max_pbufs_param)) { + nd_free(ndp); return (B_FALSE); } - if (!nd_load(&tcp_g_nd, "tcp_extra_priv_ports", + if (!nd_load(ndp, "tcp_extra_priv_ports", tcp_extra_priv_ports_get, NULL, NULL)) { - nd_free(&tcp_g_nd); + nd_free(ndp); return (B_FALSE); } - if (!nd_load(&tcp_g_nd, "tcp_extra_priv_ports_add", + if (!nd_load(ndp, "tcp_extra_priv_ports_add", NULL, tcp_extra_priv_ports_add, NULL)) { - nd_free(&tcp_g_nd); + nd_free(ndp); return (B_FALSE); } - if (!nd_load(&tcp_g_nd, "tcp_extra_priv_ports_del", + if (!nd_load(ndp, "tcp_extra_priv_ports_del", NULL, tcp_extra_priv_ports_del, NULL)) { - nd_free(&tcp_g_nd); + nd_free(ndp); return (B_FALSE); } - if (!nd_load(&tcp_g_nd, "tcp_status", tcp_status_report, NULL, + if (!nd_load(ndp, "tcp_status", tcp_status_report, NULL, NULL)) { - nd_free(&tcp_g_nd); + nd_free(ndp); return (B_FALSE); } - if (!nd_load(&tcp_g_nd, "tcp_bind_hash", tcp_bind_hash_report, + if (!nd_load(ndp, "tcp_bind_hash", tcp_bind_hash_report, NULL, NULL)) { - nd_free(&tcp_g_nd); + nd_free(ndp); return (B_FALSE); } - if (!nd_load(&tcp_g_nd, "tcp_listen_hash", tcp_listen_hash_report, - NULL, NULL)) { - nd_free(&tcp_g_nd); + if (!nd_load(ndp, "tcp_listen_hash", + tcp_listen_hash_report, NULL, NULL)) { + nd_free(ndp); return (B_FALSE); } - if (!nd_load(&tcp_g_nd, "tcp_conn_hash", tcp_conn_hash_report, + if (!nd_load(ndp, "tcp_conn_hash", tcp_conn_hash_report, NULL, NULL)) { - nd_free(&tcp_g_nd); + nd_free(ndp); return (B_FALSE); } - if (!nd_load(&tcp_g_nd, "tcp_acceptor_hash", tcp_acceptor_hash_report, - NULL, NULL)) { - nd_free(&tcp_g_nd); + if (!nd_load(ndp, "tcp_acceptor_hash", + tcp_acceptor_hash_report, NULL, NULL)) { + nd_free(ndp); return (B_FALSE); } - if (!nd_load(&tcp_g_nd, "tcp_host_param", tcp_host_param_report, + if (!nd_load(ndp, "tcp_host_param", tcp_host_param_report, tcp_host_param_set, NULL)) { - nd_free(&tcp_g_nd); + nd_free(ndp); return (B_FALSE); } - if (!nd_load(&tcp_g_nd, "tcp_host_param_ipv6", tcp_host_param_report, - tcp_host_param_set_ipv6, NULL)) { - nd_free(&tcp_g_nd); + if (!nd_load(ndp, "tcp_host_param_ipv6", + tcp_host_param_report, tcp_host_param_set_ipv6, NULL)) { + nd_free(ndp); return (B_FALSE); } - if (!nd_load(&tcp_g_nd, "tcp_1948_phrase", NULL, tcp_1948_phrase_set, - NULL)) { - nd_free(&tcp_g_nd); + if (!nd_load(ndp, "tcp_1948_phrase", NULL, + tcp_1948_phrase_set, NULL)) { + nd_free(ndp); return (B_FALSE); } - if (!nd_load(&tcp_g_nd, "tcp_reserved_port_list", + if (!nd_load(ndp, "tcp_reserved_port_list", tcp_reserved_port_list, NULL, NULL)) { - nd_free(&tcp_g_nd); + nd_free(ndp); return (B_FALSE); } /* @@ -11346,10 +11420,10 @@ tcp_param_register(tcpparam_t *tcppa, int cnt) * through printing of their name (no get or set routines) * XXX Remove in future releases ? */ - if (!nd_load(&tcp_g_nd, + if (!nd_load(ndp, "tcp_close_wait_interval(obsoleted - " "use tcp_time_wait_interval)", NULL, NULL, NULL)) { - nd_free(&tcp_g_nd); + nd_free(ndp); return (B_FALSE); } return (B_TRUE); @@ -11412,6 +11486,7 @@ tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start) mblk_t *mp2; mblk_t *next_mp; uint32_t u1; + tcp_stack_t *tcps = tcp->tcp_tcps; /* Walk through all the new pieces. 
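Before the reassembly walk resumes below, note the idiom the registration code above repeats for every tunable: allocate per-stack storage, bcopy a read-only template into it, then nd_load the copy so each stack can be tuned independently. The same clone-a-template idea in portable C, with malloc standing in for kmem_zalloc and a simplified param type:

    #include <stdlib.h>
    #include <string.h>

    typedef struct {
            long    param_min;
            long    param_max;
            long    param_val;
            const char *param_name;
    } param_t;

    static const param_t wroff_xtra_template = {
            0, 256, 32, "tcp_wroff_xtra"    /* illustrative values */
    };

    /* Give the calling stack its own writable copy of the template. */
    static param_t *
    clone_param(const param_t *tmpl)
    {
            param_t *p = malloc(sizeof (*p));

            if (p != NULL)
                    memcpy(p, tmpl, sizeof (*p));
            return (p);
    }

    int
    main(void)
    {
            param_t *p = clone_param(&wroff_xtra_template);

            if (p == NULL)
                    return (1);
            p->param_val = 64;      /* tune without touching the template */
            free(p);
            return (0);
    }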
*/ do { @@ -11431,8 +11506,8 @@ tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start) if (!mp1) { tcp->tcp_reass_tail = mp; tcp->tcp_reass_head = mp; - BUMP_MIB(&tcp_mib, tcpInDataUnorderSegs); - UPDATE_MIB(&tcp_mib, + BUMP_MIB(&tcps->tcps_mib, tcpInDataUnorderSegs); + UPDATE_MIB(&tcps->tcps_mib, tcpInDataUnorderBytes, end - start); continue; } @@ -11441,8 +11516,8 @@ tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start) /* Link it on end. */ mp1->b_cont = mp; tcp->tcp_reass_tail = mp; - BUMP_MIB(&tcp_mib, tcpInDataUnorderSegs); - UPDATE_MIB(&tcp_mib, + BUMP_MIB(&tcps->tcps_mib, tcpInDataUnorderSegs); + UPDATE_MIB(&tcps->tcps_mib, tcpInDataUnorderBytes, end - start); continue; } @@ -11508,6 +11583,7 @@ tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp) uint32_t end; mblk_t *mp1; uint32_t u1; + tcp_stack_t *tcps = tcp->tcp_tcps; end = TCP_REASS_END(mp); while ((mp1 = mp->b_cont) != NULL) { @@ -11517,16 +11593,17 @@ tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp) if (!SEQ_GEQ(end, TCP_REASS_END(mp1))) { mp->b_wptr -= end - u1; TCP_REASS_SET_END(mp, u1); - BUMP_MIB(&tcp_mib, tcpInDataPartDupSegs); - UPDATE_MIB(&tcp_mib, tcpInDataPartDupBytes, end - u1); + BUMP_MIB(&tcps->tcps_mib, tcpInDataPartDupSegs); + UPDATE_MIB(&tcps->tcps_mib, + tcpInDataPartDupBytes, end - u1); break; } mp->b_cont = mp1->b_cont; TCP_REASS_SET_SEQ(mp1, 0); TCP_REASS_SET_END(mp1, 0); freeb(mp1); - BUMP_MIB(&tcp_mib, tcpInDataDupSegs); - UPDATE_MIB(&tcp_mib, tcpInDataDupBytes, end - u1); + BUMP_MIB(&tcps->tcps_mib, tcpInDataDupSegs); + UPDATE_MIB(&tcps->tcps_mib, tcpInDataDupBytes, end - u1); } if (!mp1) tcp->tcp_reass_tail = mp; @@ -11544,6 +11621,8 @@ tcp_rcv_drain(queue_t *q, tcp_t *tcp) #ifdef DEBUG uint_t cnt = 0; #endif + tcp_stack_t *tcps = tcp->tcp_tcps; + /* Can't drain on an eager connection */ if (tcp->tcp_listener != NULL) return (ret); @@ -11598,7 +11677,7 @@ tcp_rcv_drain(queue_t *q, tcp_t *tcp) * deferred acks segments, send an update immediately. */ if (thwin < tcp->tcp_rack_cur_max * tcp->tcp_mss) { - BUMP_MIB(&tcp_mib, tcpOutWinUpdate); + BUMP_MIB(&tcps->tcps_mib, tcpOutWinUpdate); ret = TH_ACK_NEEDED; } tcp->tcp_rwnd = q->q_hiwat; @@ -11684,8 +11763,9 @@ tcp_input(void *arg, mblk_t *mp, void *arg2) if (tcp->tcp_state == TCPS_CLOSED || tcp->tcp_state == TCPS_BOUND) { conn_t *new_connp; + ip_stack_t *ipst = tcp->tcp_tcps->tcps_netstack->netstack_ip; - new_connp = ipcl_classify(mp, connp->conn_zoneid); + new_connp = ipcl_classify(mp, connp->conn_zoneid, ipst); if (new_connp != NULL) { tcp_reinput(new_connp, mp, arg2); return; @@ -11809,8 +11889,9 @@ tcp_set_rto(tcp_t *tcp, clock_t rtt) clock_t sa = tcp->tcp_rtt_sa; clock_t sv = tcp->tcp_rtt_sd; clock_t rto; + tcp_stack_t *tcps = tcp->tcp_tcps; - BUMP_MIB(&tcp_mib, tcpRttUpdate); + BUMP_MIB(&tcps->tcps_mib, tcpRttUpdate); tcp->tcp_rtt_update++; /* tcp_rtt_sa is not 0 means this is a new sample. */ @@ -11877,12 +11958,12 @@ tcp_set_rto(tcp_t *tcp, clock_t rtt) * deviation of RTO to accommodate burstiness of 1/4 of * window size.
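The expression that opens the next chunk is the standard Van Jacobson retransmit timer with an extra (sa >> 5) term: tcp_rtt_sa carries 8x the smoothed RTT, tcp_rtt_sd already carries the scaled deviation, and the result is clamped to the stack's min/max bounds. Recomputed standalone, with made-up tick values:

    #include <stdio.h>

    static long
    calc_rto(long sa, long sv, long extra, long rto_min, long rto_max)
    {
            /* sa is 8 * srtt, so sa >> 3 recovers srtt and      */
            /* sa >> 5 adds the 1/4-window fudge described above */
            long rto = (sa >> 3) + sv + extra + (sa >> 5);

            if (rto > rto_max)
                    return (rto_max);
            if (rto < rto_min)
                    return (rto_min);
            return (rto);
    }

    int
    main(void)
    {
            /* 8 * 200 ticks of srtt, 200 ticks of scaled deviation */
            printf("%ld\n", calc_rto(1600, 200, 0, 400, 60000)); /* 450 */
            return (0);
    }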
*/ - rto = (sa >> 3) + sv + tcp_rexmit_interval_extra + (sa >> 5); + rto = (sa >> 3) + sv + tcps->tcps_rexmit_interval_extra + (sa >> 5); - if (rto > tcp_rexmit_interval_max) { - tcp->tcp_rto = tcp_rexmit_interval_max; - } else if (rto < tcp_rexmit_interval_min) { - tcp->tcp_rto = tcp_rexmit_interval_min; + if (rto > tcps->tcps_rexmit_interval_max) { + tcp->tcp_rto = tcps->tcps_rexmit_interval_max; + } else if (rto < tcps->tcps_rexmit_interval_min) { + tcp->tcp_rto = tcps->tcps_rexmit_interval_min; } else { tcp->tcp_rto = rto; } @@ -11952,6 +12033,7 @@ tcp_sack_rxmit(tcp_t *tcp, uint_t *flags) int32_t mss; uint32_t seg_len; mblk_t *xmit_mp; + tcp_stack_t *tcps = tcp->tcp_tcps; ASSERT(tcp->tcp_sack_info != NULL); ASSERT(tcp->tcp_notsack_list != NULL); @@ -11988,7 +12070,7 @@ tcp_sack_rxmit(tcp_t *tcp, uint_t *flags) for (; notsack_blk != NULL; notsack_blk = notsack_blk->next) { if (SEQ_GT(notsack_blk->end, begin) && (notsack_blk->sack_cnt >= - tcp_dupack_fast_retransmit)) { + tcps->tcps_dupack_fast_retransmit)) { end = notsack_blk->end; if (SEQ_LT(begin, notsack_blk->begin)) { begin = notsack_blk->begin; @@ -12046,9 +12128,9 @@ tcp_sack_rxmit(tcp_t *tcp, uint_t *flags) */ snxt_mp->b_prev = (mblk_t *)lbolt; - BUMP_MIB(&tcp_mib, tcpRetransSegs); - UPDATE_MIB(&tcp_mib, tcpRetransBytes, seg_len); - BUMP_MIB(&tcp_mib, tcpOutSackRetransSegs); + BUMP_MIB(&tcps->tcps_mib, tcpRetransSegs); + UPDATE_MIB(&tcps->tcps_mib, tcpRetransBytes, seg_len); + BUMP_MIB(&tcps->tcps_mib, tcpOutSackRetransSegs); /* * Update tcp_rexmit_max to extend this SACK recovery phase. * This happens when new data sent during fast recovery is @@ -12076,6 +12158,9 @@ tcp_check_policy(tcp_t *tcp, mblk_t *first_mp, ipha_t *ipha, ip6_t *ip6h, ipsec_in_t *ii; const char *reason; kstat_named_t *counter; + tcp_stack_t *tcps = tcp->tcp_tcps; + ipsec_stack_t *ipss; + ip_stack_t *ipst; ASSERT(mctl_present || !secure); @@ -12093,9 +12178,13 @@ tcp_check_policy(tcp_t *tcp, mblk_t *first_mp, ipha_t *ipha, ip6_t *ip6h, act->ipa_act.ipa_type == IPSEC_ACT_CLEAR) return (B_TRUE); ipsec_log_policy_failure(IPSEC_POLICY_MISMATCH, - "tcp_check_policy", ipha, ip6h, secure); + "tcp_check_policy", ipha, ip6h, secure, + tcps->tcps_netstack); + ipss = tcps->tcps_netstack->netstack_ipsec; + ip_drop_packet(first_mp, B_TRUE, NULL, NULL, - &ipdrops_tcp_clear, &tcp_dropper); + DROPPER(ipss, ipds_tcp_clear), + &tcps->tcps_dropper); return (B_FALSE); } @@ -12104,9 +12193,13 @@ tcp_check_policy(tcp_t *tcp, mblk_t *first_mp, ipha_t *ipha, ip6_t *ip6h, */ if (act == NULL) { ipsec_log_policy_failure(IPSEC_POLICY_NOT_NEEDED, - "tcp_check_policy", ipha, ip6h, secure); + "tcp_check_policy", ipha, ip6h, secure, + tcps->tcps_netstack); + ipss = tcps->tcps_netstack->netstack_ipsec; + ip_drop_packet(first_mp, B_TRUE, NULL, NULL, - &ipdrops_tcp_secure, &tcp_dropper); + DROPPER(ipss, ipds_tcp_secure), + &tcps->tcps_dropper); return (B_FALSE); } @@ -12122,17 +12215,20 @@ tcp_check_policy(tcp_t *tcp, mblk_t *first_mp, ipha_t *ipha, ip6_t *ip6h, ii = (ipsec_in_t *)first_mp->b_rptr; + ipst = tcps->tcps_netstack->netstack_ip; + if (ipsec_check_ipsecin_latch(ii, data_mp, ipl, ipha, ip6h, &reason, &counter, tcp->tcp_connp)) { - BUMP_MIB(&ip_mib, ipsecInSucceeded); + BUMP_MIB(&ipst->ips_ip_mib, ipsecInSucceeded); return (B_TRUE); } (void) strlog(TCP_MOD_ID, 0, 0, SL_ERROR|SL_WARN|SL_CONSOLE, "tcp inbound policy mismatch: %s, packet dropped\n", reason); - BUMP_MIB(&ip_mib, ipsecInFailed); + BUMP_MIB(&ipst->ips_ip_mib, ipsecInFailed); - ip_drop_packet(first_mp, B_TRUE, NULL, NULL, 
counter, &tcp_dropper); + ip_drop_packet(first_mp, B_TRUE, NULL, NULL, counter, + &tcps->tcps_dropper); return (B_FALSE); } @@ -12153,6 +12249,7 @@ tcp_ss_rexmit(tcp_t *tcp) int32_t off; int32_t burst = tcp->tcp_snd_burst; mblk_t *snxt_mp; + tcp_stack_t *tcps = tcp->tcp_tcps; /* * Note that tcp_rexmit can be set even though TCP has retransmitted @@ -12195,8 +12292,8 @@ tcp_ss_rexmit(tcp_t *tcp) * retransmission. */ old_snxt_mp->b_prev = (mblk_t *)lbolt; - BUMP_MIB(&tcp_mib, tcpRetransSegs); - UPDATE_MIB(&tcp_mib, tcpRetransBytes, cnt); + BUMP_MIB(&tcps->tcps_mib, tcpRetransSegs); + UPDATE_MIB(&tcps->tcps_mib, tcpRetransBytes, cnt); tcp->tcp_rexmit_nxt = snxt; burst--; @@ -12236,6 +12333,7 @@ tcp_process_options(tcp_t *tcp, tcph_t *tcph) tcp_opt_t tcpopt; uint32_t mss_max; char *tmp_tcph; + tcp_stack_t *tcps = tcp->tcp_tcps; tcpopt.tcp = NULL; options = tcp_parse_options(tcph, &tcpopt); @@ -12248,16 +12346,16 @@ tcp_process_options(tcp_t *tcp, tcph_t *tcph) */ if (!(options & TCP_OPT_MSS_PRESENT)) { if (tcp->tcp_ipversion == IPV4_VERSION) - tcpopt.tcp_opt_mss = tcp_mss_def_ipv4; + tcpopt.tcp_opt_mss = tcps->tcps_mss_def_ipv4; else - tcpopt.tcp_opt_mss = tcp_mss_def_ipv6; + tcpopt.tcp_opt_mss = tcps->tcps_mss_def_ipv6; } else { if (tcp->tcp_ipversion == IPV4_VERSION) - mss_max = tcp_mss_max_ipv4; + mss_max = tcps->tcps_mss_max_ipv4; else - mss_max = tcp_mss_max_ipv6; - if (tcpopt.tcp_opt_mss < tcp_mss_min) - tcpopt.tcp_opt_mss = tcp_mss_min; + mss_max = tcps->tcps_mss_max_ipv6; + if (tcpopt.tcp_opt_mss < tcps->tcps_mss_min) + tcpopt.tcp_opt_mss = tcps->tcps_mss_min; else if (tcpopt.tcp_opt_mss > mss_max) tcpopt.tcp_opt_mss = mss_max; } @@ -12317,7 +12415,7 @@ tcp_process_options(tcp_t *tcp, tcph_t *tcph) */ if ((options & TCP_OPT_SACK_OK_PRESENT) && (tcp->tcp_snd_sack_ok || - (tcp_sack_permitted != 0 && TCP_IS_DETACHED(tcp)))) { + (tcps->tcps_sack_permitted != 0 && TCP_IS_DETACHED(tcp)))) { /* This should be true only in the passive case. 
*/ if (tcp->tcp_sack_info == NULL) { ASSERT(TCP_IS_DETACHED(tcp)); @@ -12398,6 +12496,7 @@ tcp_send_conn_ind(void *arg, mblk_t *mp, void *arg2) struct T_conn_ind *conn_ind; ipaddr_t *addr_cache; boolean_t need_send_conn_ind = B_FALSE; + tcp_stack_t *tcps = listener->tcp_tcps; /* retrieve the eager */ conn_ind = (struct T_conn_ind *)mp->b_rptr; @@ -12509,7 +12608,7 @@ tcp_send_conn_ind(void *arg, mblk_t *mp, void *arg2) listener->tcp_syn_rcvd_timeout--; if (listener->tcp_syn_defense && listener->tcp_syn_rcvd_timeout <= - (tcp_conn_req_max_q0 >> 5) && + (tcps->tcps_conn_req_max_q0 >> 5) && 10*MINUTES < TICK_TO_MSEC(lbolt64 - listener->tcp_last_rcv_lbolt)) { /* @@ -12552,6 +12651,7 @@ tcp_find_pktinfo(tcp_t *tcp, mblk_t *mp, uint_t *ipversp, uint_t *ip_hdr_lenp, ip6_pkt_t ipp; uint_t ipvers; uint_t ip_hdr_len; + tcp_stack_t *tcps = tcp->tcp_tcps; rptr = mp->b_rptr; ASSERT(OK_32PTR(rptr)); @@ -12616,12 +12716,13 @@ tcp_find_pktinfo(tcp_t *tcp, mblk_t *mp, uint_t *ipversp, uint_t *ip_hdr_lenp, if (ip6h->ip6_nxt != IPPROTO_TCP) { uint8_t nexthdrp; + ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; /* Look for ifindex information */ if (ip6h->ip6_nxt == IPPROTO_RAW) { ip6i_t *ip6i = (ip6i_t *)ip6h; if ((uchar_t *)&ip6i[1] > mp->b_wptr) { - BUMP_MIB(&ip_mib, tcpInErrs); + BUMP_MIB(&ipst->ips_ip_mib, tcpInErrs); freemsg(first_mp); return (NULL); } @@ -12643,7 +12744,7 @@ tcp_find_pktinfo(tcp_t *tcp, mblk_t *mp, uint_t *ipversp, uint_t *ip_hdr_lenp, } if (MBLKL(mp) < IPV6_HDR_LEN + sizeof (tcph_t)) { - BUMP_MIB(&ip_mib, tcpInErrs); + BUMP_MIB(&ipst->ips_ip_mib, tcpInErrs); freemsg(first_mp); return (NULL); } @@ -12658,7 +12759,7 @@ tcp_find_pktinfo(tcp_t *tcp, mblk_t *mp, uint_t *ipversp, uint_t *ip_hdr_lenp, ip_hdr_len = ip_find_hdr_v6(mp, ip6h, &ipp, &nexthdrp); /* Verify if this is a TCP packet */ if (nexthdrp != IPPROTO_TCP) { - BUMP_MIB(&ip_mib, tcpInErrs); + BUMP_MIB(&ipst->ips_ip_mib, tcpInErrs); freemsg(first_mp); return (NULL); } @@ -12730,12 +12831,13 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) conn_t *connp = (conn_t *)arg; squeue_t *sqp = (squeue_t *)arg2; tcp_t *tcp = connp->conn_tcp; + tcp_stack_t *tcps = tcp->tcp_tcps; /* * RST from fused tcp loopback peer should trigger an unfuse. */ if (tcp->tcp_fused) { - TCP_STAT(tcp_fusion_aborted); + TCP_STAT(tcps, tcp_fusion_aborted); tcp_unfuse(tcp); } @@ -12755,7 +12857,7 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) mp = tcp_find_pktinfo(tcp, mp, &ipvers, &ip_hdr_len, NULL, &ipp); if (mp == NULL) { - TCP_STAT(tcp_rput_v6_error); + TCP_STAT(tcps, tcp_rput_v6_error); return; } iphdr = mp->b_rptr; @@ -12896,11 +12998,13 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) if (tcp->tcp_snd_sack_ok) { (void) mi_set_sth_wroff(tcp->tcp_rq, tcp->tcp_hdr_len + TCPOPT_MAX_SACK_LEN + - (tcp->tcp_loopback ? 0 : tcp_wroff_xtra)); + (tcp->tcp_loopback ? 0 : + tcps->tcps_wroff_xtra)); } else { (void) mi_set_sth_wroff(tcp->tcp_rq, tcp->tcp_hdr_len + - (tcp->tcp_loopback ? 0 : tcp_wroff_xtra)); + (tcp->tcp_loopback ? 0 : + tcps->tcps_wroff_xtra)); } } if (flags & TH_ACK) { @@ -12997,7 +13101,7 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) TCP_TRACE_SEND_PKT); tcp_send_data(tcp, tcp->tcp_wq, ack_mp); BUMP_LOCAL(tcp->tcp_obsegs); - BUMP_MIB(&tcp_mib, tcpOutAck); + BUMP_MIB(&tcps->tcps_mib, tcpOutAck); /* Send up T_CONN_CON */ putnext(tcp->tcp_rq, mp1); @@ -13012,7 +13116,7 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) * as usual. Mark this tcp as not capable * of fusion. 
*/ - TCP_STAT(tcp_fusion_unfusable); + TCP_STAT(tcps, tcp_fusion_unfusable); tcp->tcp_unfusable = B_TRUE; putnext(tcp->tcp_rq, mp1); } @@ -13091,8 +13195,9 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) case TCPS_CLOSED: case TCPS_BOUND: { conn_t *new_connp; + ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; - new_connp = ipcl_classify(mp, connp->conn_zoneid); + new_connp = ipcl_classify(mp, connp->conn_zoneid, ipst); if (new_connp != NULL) { tcp_reinput(new_connp, mp, connp->conn_sqp); return; @@ -13127,7 +13232,7 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) */ if (TCP_IS_DETACHED_NONEAGER(tcp) && (seg_len > 0 && SEQ_GT(seg_seq + seg_len, tcp->tcp_rnxt))) { - BUMP_MIB(&tcp_mib, tcpInClosed); + BUMP_MIB(&tcps->tcps_mib, tcpInClosed); TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_RECV_PKT); @@ -13195,8 +13300,8 @@ try_again:; /* Recompute the gaps after noting the SYN. */ goto try_again; } - BUMP_MIB(&tcp_mib, tcpInDataDupSegs); - UPDATE_MIB(&tcp_mib, tcpInDataDupBytes, + BUMP_MIB(&tcps->tcps_mib, tcpInDataDupSegs); + UPDATE_MIB(&tcps->tcps_mib, tcpInDataDupBytes, (seg_len > -gap ? -gap : seg_len)); /* Remove the old stuff from seg_len. */ seg_len += gap; @@ -13313,10 +13418,11 @@ try_again:; mblk_t *mp2; if (tcp->tcp_rwnd == 0) { - BUMP_MIB(&tcp_mib, tcpInWinProbe); + BUMP_MIB(&tcps->tcps_mib, tcpInWinProbe); } else { - BUMP_MIB(&tcp_mib, tcpInDataPastWinSegs); - UPDATE_MIB(&tcp_mib, tcpInDataPastWinBytes, -rgap); + BUMP_MIB(&tcps->tcps_mib, tcpInDataPastWinSegs); + UPDATE_MIB(&tcps->tcps_mib, + tcpInDataPastWinBytes, -rgap); } /* @@ -13533,8 +13639,8 @@ ok:; } } } else if (seg_len > 0) { - BUMP_MIB(&tcp_mib, tcpInDataInorderSegs); - UPDATE_MIB(&tcp_mib, tcpInDataInorderBytes, seg_len); + BUMP_MIB(&tcps->tcps_mib, tcpInDataInorderSegs); + UPDATE_MIB(&tcps->tcps_mib, tcpInDataInorderBytes, seg_len); /* * If an out of order FIN was received before, and the seq * num and len of the new segment match that of the FIN, @@ -13910,7 +14016,7 @@ process_ack: * simultaneous active opens. */ if (tcp->tcp_loopback) { - TCP_STAT(tcp_fusion_unfusable); + TCP_STAT(tcps, tcp_fusion_unfusable); tcp->tcp_unfusable = B_TRUE; } } @@ -14006,7 +14112,7 @@ process_ack: if (!ofo_seg && seg_len == 0 && new_swnd == tcp->tcp_swnd) { int dupack_cnt; - BUMP_MIB(&tcp_mib, tcpInDupAck); + BUMP_MIB(&tcps->tcps_mib, tcpInDupAck); /* * Fast retransmit. When we have seen exactly three * identical ACKs while we have unacked data @@ -14019,7 +14125,7 @@ process_ack: ! tcp->tcp_rexmit) { /* Do Limited Transmit */ if ((dupack_cnt = ++tcp->tcp_dupack_cnt) < - tcp_dupack_fast_retransmit) { + tcps->tcps_dupack_fast_retransmit) { /* * RFC 3042 * @@ -14050,7 +14156,7 @@ process_ack: flags |= TH_LIMIT_XMIT; } } else if (dupack_cnt == - tcp_dupack_fast_retransmit) { + tcps->tcps_dupack_fast_retransmit) { /* * If we have reduced tcp_ssthresh @@ -14178,7 +14284,7 @@ process_ack: if (new_swnd != 0) { /* tcp_suna != tcp_snxt */ /* Packet contains a window update */ - BUMP_MIB(&tcp_mib, tcpInWinUpdate); + BUMP_MIB(&tcps->tcps_mib, tcpInWinUpdate); tcp->tcp_zero_win_probe = 0; tcp->tcp_timer_backoff = 0; tcp->tcp_ms_we_have_waited = 0; @@ -14216,7 +14322,7 @@ process_ack: * Should we send ACKs in response to ACK only segments? 
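The duplicate-ACK handling earlier in this hunk hinges on tcps_dupack_fast_retransmit (three by default): below the threshold the code may do RFC 3042 limited transmit, exactly at it the missing segment is retransmitted and cwnd collapses to ssthresh plus the dup-ack allowance. The counter's state machine, stripped of the mblk plumbing and with the tunable as a constant stand-in:

    #include <stdint.h>
    #include <stdio.h>

    #define DUPACK_FAST_RETRANSMIT  3       /* per-stack tunable stand-in */

    typedef enum {
            ACK_LIMITED_XMIT,       /* may send one new segment (RFC 3042) */
            ACK_FAST_RETRANSMIT,    /* resend the missing segment now */
            ACK_IN_RECOVERY         /* already retransmitted; keep counting */
    } ack_action_t;

    static ack_action_t
    on_pure_dup_ack(uint32_t *dupack_cnt)
    {
            ++*dupack_cnt;
            if (*dupack_cnt < DUPACK_FAST_RETRANSMIT)
                    return (ACK_LIMITED_XMIT);
            if (*dupack_cnt == DUPACK_FAST_RETRANSMIT)
                    return (ACK_FAST_RETRANSMIT);
            return (ACK_IN_RECOVERY);
    }

    int
    main(void)
    {
            uint32_t cnt = 0;
            int i;

            for (i = 1; i <= 4; i++)
                    printf("dupack %d -> %d\n", i,
                        (int)on_pure_dup_ack(&cnt));
            return (0);
    }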
*/ if (SEQ_GT(seg_ack, tcp->tcp_snxt)) { - BUMP_MIB(&tcp_mib, tcpInAckUnsent); + BUMP_MIB(&tcps->tcps_mib, tcpInAckUnsent); /* drop the received segment */ freemsg(mp); @@ -14231,14 +14337,14 @@ process_ack: */ if (tcp_drop_ack_unsent_cnt > 0 && ++tcp->tcp_in_ack_unsent > tcp_drop_ack_unsent_cnt) { - TCP_STAT(tcp_in_ack_unsent_drop); + TCP_STAT(tcps, tcp_in_ack_unsent_drop); return; } mp = tcp_ack_mp(tcp); if (mp != NULL) { TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_SEND_PKT); BUMP_LOCAL(tcp->tcp_obsegs); - BUMP_MIB(&tcp_mib, tcpOutAck); + BUMP_MIB(&tcps->tcps_mib, tcpOutAck); tcp_send_data(tcp, tcp->tcp_wq, mp); } return; @@ -14259,7 +14365,7 @@ process_ack: * window was inflated to account for the other side's * cached packets, retract it. If it is, do Hoe's algorithm. */ - if (tcp->tcp_dupack_cnt >= tcp_dupack_fast_retransmit) { + if (tcp->tcp_dupack_cnt >= tcps->tcps_dupack_fast_retransmit) { ASSERT(tcp->tcp_rexmit == B_FALSE); if (SEQ_GEQ(seg_ack, tcp->tcp_rexmit_max)) { tcp->tcp_dupack_cnt = 0; @@ -14303,7 +14409,7 @@ process_ack: * segments. */ tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh + - tcp_dupack_fast_retransmit * mss; + tcps->tcps_dupack_fast_retransmit * mss; tcp->tcp_cwnd_cnt = tcp->tcp_cwnd; flags |= TH_REXMIT_NEEDED; } @@ -14342,8 +14448,8 @@ process_ack: } } - BUMP_MIB(&tcp_mib, tcpInAckSegs); - UPDATE_MIB(&tcp_mib, tcpInAckBytes, bytes_acked); + BUMP_MIB(&tcps->tcps_mib, tcpInAckSegs); + UPDATE_MIB(&tcps->tcps_mib, tcpInAckBytes, bytes_acked); tcp->tcp_suna = seg_ack; if (tcp->tcp_zero_win_probe != 0) { tcp->tcp_zero_win_probe = 0; @@ -14425,7 +14531,7 @@ process_ack: tcp_set_rto(tcp, (int32_t)lbolt - (int32_t)(intptr_t)mp1->b_prev); else - BUMP_MIB(&tcp_mib, tcpRttNoUpdate); + BUMP_MIB(&tcps->tcps_mib, tcpRttNoUpdate); /* Remember the last sequence to be ACKed */ tcp->tcp_csuna = seg_ack; @@ -14434,7 +14540,7 @@ process_ack: tcp->tcp_set_timer = 0; } } else { - BUMP_MIB(&tcp_mib, tcpRttNoUpdate); + BUMP_MIB(&tcps->tcps_mib, tcpRttNoUpdate); } /* Eat acknowledged bytes off the xmit queue. */ @@ -14605,7 +14711,7 @@ est: * flushing the FIN_WAIT_2 connection. */ TCP_TIMER_RESTART(tcp, - tcp_fin_wait_2_flush_interval); + tcps->tcps_fin_wait_2_flush_interval); } break; case TCPS_FIN_WAIT_2: @@ -14628,10 +14734,10 @@ est: tcp->tcp_exclbind = 0; if (!TCP_IS_DETACHED(tcp)) { TCP_TIMER_RESTART(tcp, - tcp_time_wait_interval); + tcps->tcps_time_wait_interval); } else { tcp_time_wait_append(tcp); - TCP_DBGSTAT(tcp_rput_time_wait); + TCP_DBGSTAT(tcps, tcp_rput_time_wait); } } /*FALLTHRU*/ @@ -14683,10 +14789,10 @@ est: tcp->tcp_exclbind = 0; if (!TCP_IS_DETACHED(tcp)) { TCP_TIMER_RESTART(tcp, - tcp_time_wait_interval); + tcps->tcps_time_wait_interval); } else { tcp_time_wait_append(tcp); - TCP_DBGSTAT(tcp_rput_time_wait); + TCP_DBGSTAT(tcps, tcp_rput_time_wait); } if (seg_len) { /* @@ -14879,9 +14985,9 @@ est: * do anything for a detached tcp.
*/ if (!TCP_IS_DETACHED(tcp)) - tcp->tcp_push_tid = TCP_TIMER(tcp, - tcp_push_timer, - MSEC_TO_TICK(tcp_push_timer_interval)); + tcp->tcp_push_tid = TCP_TIMER(tcp, + tcp_push_timer, + MSEC_TO_TICK(tcps->tcps_push_timer_interval)); } } xmit_check: @@ -14898,7 +15004,7 @@ xmit_check: if (flags & TH_REXMIT_NEEDED) { uint32_t snd_size = tcp->tcp_snxt - tcp->tcp_suna; - BUMP_MIB(&tcp_mib, tcpOutFastRetrans); + BUMP_MIB(&tcps->tcps_mib, tcpOutFastRetrans); if (snd_size > mss) snd_size = mss; if (snd_size > tcp->tcp_swnd) @@ -14910,8 +15016,9 @@ xmit_check: if (mp1 != NULL) { tcp->tcp_xmit_head->b_prev = (mblk_t *)lbolt; tcp->tcp_csuna = tcp->tcp_snxt; - BUMP_MIB(&tcp_mib, tcpRetransSegs); - UPDATE_MIB(&tcp_mib, tcpRetransBytes, snd_size); + BUMP_MIB(&tcps->tcps_mib, tcpRetransSegs); + UPDATE_MIB(&tcps->tcps_mib, + tcpRetransBytes, snd_size); TCP_RECORD_TRACE(tcp, mp1, TCP_TRACE_SEND_PKT); tcp_send_data(tcp, tcp->tcp_wq, mp1); @@ -14985,7 +15092,7 @@ ack_check: TCP_RECORD_TRACE(tcp, mp1, TCP_TRACE_SEND_PKT); tcp_send_data(tcp, tcp->tcp_wq, mp1); BUMP_LOCAL(tcp->tcp_obsegs); - BUMP_MIB(&tcp_mib, tcpOutAck); + BUMP_MIB(&tcps->tcps_mib, tcpOutAck); } if (tcp->tcp_ack_tid != 0) { (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid); @@ -15000,8 +15107,8 @@ ack_check: if (tcp->tcp_ack_tid == 0) { tcp->tcp_ack_tid = TCP_TIMER(tcp, tcp_ack_timer, MSEC_TO_TICK(tcp->tcp_localnet ? - (clock_t)tcp_local_dack_interval : - (clock_t)tcp_deferred_ack_interval)); + (clock_t)tcps->tcps_local_dack_interval : + (clock_t)tcps->tcps_deferred_ack_interval)); } } if (flags & TH_ORDREL_NEEDED) { @@ -15470,6 +15577,7 @@ tcp_rput_other(tcp_t *tcp, mblk_t *mp) mblk_t *lsoi; int retval; mblk_t *ire_mp; + tcp_stack_t *tcps = tcp->tcp_tcps; switch (mp->b_datap->db_type) { case M_PROTO: @@ -15580,12 +15688,12 @@ tcp_rput_other(tcp_t *tcp, mblk_t *mp) * round up. */ tcp->tcp_rwnd = MAX(MSS_ROUNDUP(tcp->tcp_rwnd, mss), - tcp_recv_hiwat_minmss * mss); + tcps->tcps_recv_hiwat_minmss * mss); q->q_hiwat = tcp->tcp_rwnd; tcp_set_ws_value(tcp); U32_TO_ABE16((tcp->tcp_rwnd >> tcp->tcp_rcv_ws), tcp->tcp_tcph->th_win); - if (tcp->tcp_rcv_ws > 0 || tcp_wscale_always) + if (tcp->tcp_rcv_ws > 0 || tcps->tcps_wscale_always) tcp->tcp_snd_ws_ok = B_TRUE; /* @@ -15594,8 +15702,8 @@ tcp_rput_other(tcp_t *tcp, mblk_t *mp) * include the timestamp * option in the SYN segment. */ - if (tcp_tstamp_always || - (tcp->tcp_rcv_ws && tcp_tstamp_if_wscale)) { + if (tcps->tcps_tstamp_always || + (tcp->tcp_rcv_ws && tcps->tcps_tstamp_if_wscale)) { tcp->tcp_snd_ts_ok = B_TRUE; } @@ -15604,7 +15712,7 @@ tcp_rput_other(tcp_t *tcp, mblk_t *mp) * tcp_adapt_ire() if the sack metric * is set. So check it here also. */ - if (tcp_sack_permitted == 2 || + if (tcps->tcps_sack_permitted == 2 || tcp->tcp_snd_sack_ok) { if (tcp->tcp_sack_info == NULL) { tcp->tcp_sack_info = @@ -15622,7 +15730,7 @@ tcp_rput_other(tcp_t *tcp, mblk_t *mp) * enabled IP packets. Setting it to 1 avoids * compatibility problems. 
*/ - if (tcp_ecn_permitted == 2) + if (tcps->tcps_ecn_permitted == 2) tcp->tcp_ecn_ok = B_TRUE; TCP_TIMER_RESTART(tcp, tcp->tcp_rto); @@ -15778,10 +15886,11 @@ tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2) tcp_t *tcp = connp->conn_tcp; queue_t *q = tcp->tcp_rq; uint_t thwin; + tcp_stack_t *tcps = tcp->tcp_tcps; freeb(mp); - TCP_STAT(tcp_rsrv_calls); + TCP_STAT(tcps, tcp_rsrv_calls); if (TCP_IS_DETACHED(tcp) || q == NULL) { return; @@ -15809,7 +15918,7 @@ tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2) tcp_clrqfull(peer_tcp); TCP_FUSE_SYNCSTR_UNPLUG_DRAIN(tcp); - TCP_STAT(tcp_fusion_backenabled); + TCP_STAT(tcps, tcp_fusion_backenabled); return; } @@ -15829,7 +15938,7 @@ tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2) tcp_xmit_ctl(NULL, tcp, (tcp->tcp_swnd == 0) ? tcp->tcp_suna : tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK); - BUMP_MIB(&tcp_mib, tcpOutWinUpdate); + BUMP_MIB(&tcps->tcps_mib, tcpOutWinUpdate); } } /* Handle a failure to allocate a T_ORDREL_IND here */ @@ -15879,12 +15988,13 @@ tcp_rsrv(queue_t *q) conn_t *connp = Q_TO_CONN(q); tcp_t *tcp = connp->conn_tcp; mblk_t *mp; + tcp_stack_t *tcps = tcp->tcp_tcps; /* No code does a putq on the read side */ ASSERT(q->q_first == NULL); /* Nothing to do for the default queue */ - if (q == tcp_g_q) { + if (q == tcps->tcps_g_q) { return; } @@ -15937,6 +16047,7 @@ tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd) uint32_t old_max_rwnd; uint32_t max_transmittable_rwnd; boolean_t tcp_detached = TCP_IS_DETACHED(tcp); + tcp_stack_t *tcps = tcp->tcp_tcps; if (tcp->tcp_fused) { size_t sth_hiwat; @@ -15973,7 +16084,7 @@ tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd) * funny TCP interactions of Nagle algorithm, SWS avoidance * and delayed acknowledgement. */ - rwnd = MAX(rwnd, tcp_recv_hiwat_minmss * mss); + rwnd = MAX(rwnd, tcps->tcps_recv_hiwat_minmss * mss); /* * If window size info has already been exchanged, TCP should not @@ -16005,7 +16116,7 @@ tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd) } if (tcp->tcp_localnet) { tcp->tcp_rack_abs_max = - MIN(tcp_local_dacks_max, rwnd / mss / 2); + MIN(tcps->tcps_local_dacks_max, rwnd / mss / 2); } else { /* * For a remote host on a different subnet (through a router), @@ -16013,7 +16124,7 @@ tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd) * tcp_deferred_acks_max is default to 2. */ tcp->tcp_rack_abs_max = - MIN(tcp_deferred_acks_max, rwnd / mss / 2); + MIN(tcps->tcps_deferred_acks_max, rwnd / mss / 2); } if (tcp->tcp_rack_cur_max > tcp->tcp_rack_abs_max) tcp->tcp_rack_cur_max = tcp->tcp_rack_abs_max; @@ -16042,7 +16153,8 @@ tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd) * prefer to choose these values algorithmically, with a likely * relationship to rwnd. 
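tcp_rwnd_set(), shown above, applies two independent clamps: the window is rounded up to whole MSS multiples with a floor of tcps_recv_hiwat_minmss segments, and the deferred-ACK ceiling is capped at half the window counted in segments. Both in a few lines; the constants and the MSS_ROUNDUP definition are plausible stand-ins for the per-stack tunables and the kernel macro:

    #include <stdint.h>
    #include <stdio.h>

    #define MSS_ROUNDUP(x, mss)     ((((x) - 1) / (mss) + 1) * (mss))
    #define MIN(a, b)               ((a) < (b) ? (a) : (b))
    #define MAX(a, b)               ((a) > (b) ? (a) : (b))

    int
    main(void)
    {
            uint32_t mss = 1460, rwnd = 49152;
            uint32_t minmss = 4;            /* recv_hiwat_minmss stand-in */
            uint32_t dacks_max = 2;         /* deferred_acks_max stand-in */
            uint32_t rack_abs_max;

            rwnd = MAX(MSS_ROUNDUP(rwnd, mss), minmss * mss);
            rack_abs_max = MIN(dacks_max, rwnd / mss / 2);
            printf("rwnd=%u rack_abs_max=%u\n", rwnd, rack_abs_max);
            return (0);     /* rwnd=49640 rack_abs_max=2 */
    }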
*/ - (void) mi_set_sth_hiwat(tcp->tcp_rq, MAX(rwnd, tcp_sth_rcv_hiwat)); + (void) mi_set_sth_hiwat(tcp->tcp_rq, + MAX(rwnd, tcps->tcps_sth_rcv_hiwat)); return (rwnd); } @@ -16072,6 +16184,8 @@ tcp_snmp_get(queue_t *q, mblk_t *mpctl) zoneid_t zoneid; int v4_conn_idx; int v6_conn_idx; + tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; + ip_stack_t *ipst; if (mpctl == NULL || (mpdata = mpctl->b_cont) == NULL || @@ -16087,22 +16201,23 @@ tcp_snmp_get(queue_t *q, mblk_t *mpctl) } /* build table of connections -- need count in fixed part */ - SET_MIB(tcp_mib.tcpRtoAlgorithm, 4); /* vanj */ - SET_MIB(tcp_mib.tcpRtoMin, tcp_rexmit_interval_min); - SET_MIB(tcp_mib.tcpRtoMax, tcp_rexmit_interval_max); - SET_MIB(tcp_mib.tcpMaxConn, -1); - SET_MIB(tcp_mib.tcpCurrEstab, 0); + SET_MIB(tcps->tcps_mib.tcpRtoAlgorithm, 4); /* vanj */ + SET_MIB(tcps->tcps_mib.tcpRtoMin, tcps->tcps_rexmit_interval_min); + SET_MIB(tcps->tcps_mib.tcpRtoMax, tcps->tcps_rexmit_interval_max); + SET_MIB(tcps->tcps_mib.tcpMaxConn, -1); + SET_MIB(tcps->tcps_mib.tcpCurrEstab, 0); ispriv = - secpolicy_net_config((Q_TO_CONN(q))->conn_cred, B_TRUE) == 0; + secpolicy_ip_config((Q_TO_CONN(q))->conn_cred, B_TRUE) == 0; zoneid = Q_TO_CONN(q)->conn_zoneid; v4_conn_idx = v6_conn_idx = 0; mp_conn_tail = mp_attr_tail = mp6_conn_tail = mp6_attr_tail = NULL; for (i = 0; i < CONN_G_HASH_SIZE; i++) { + ipst = tcps->tcps_netstack->netstack_ip; - connfp = &ipcl_globalhash_fanout[i]; + connfp = &ipst->ips_ipcl_globalhash_fanout[i]; connp = NULL; @@ -16115,16 +16230,18 @@ tcp_snmp_get(queue_t *q, mblk_t *mpctl) continue; /* not in this zone */ tcp = connp->conn_tcp; - UPDATE_MIB(&tcp_mib, tcpHCInSegs, tcp->tcp_ibsegs); + UPDATE_MIB(&tcps->tcps_mib, + tcpHCInSegs, tcp->tcp_ibsegs); tcp->tcp_ibsegs = 0; - UPDATE_MIB(&tcp_mib, tcpHCOutSegs, tcp->tcp_obsegs); + UPDATE_MIB(&tcps->tcps_mib, + tcpHCOutSegs, tcp->tcp_obsegs); tcp->tcp_obsegs = 0; tce6.tcp6ConnState = tce.tcpConnState = tcp_snmp_state(tcp); if (tce.tcpConnState == MIB2_TCP_established || tce.tcpConnState == MIB2_TCP_closeWait) - BUMP_MIB(&tcp_mib, tcpCurrEstab); + BUMP_MIB(&tcps->tcps_mib, tcpCurrEstab); needattr = B_FALSE; bzero(&mlp, sizeof (mlp)); @@ -16268,15 +16385,17 @@ tcp_snmp_get(queue_t *q, mblk_t *mpctl) } /* fixed length structure for IPv4 and IPv6 counters */ - SET_MIB(tcp_mib.tcpConnTableSize, sizeof (mib2_tcpConnEntry_t)); - SET_MIB(tcp_mib.tcp6ConnTableSize, sizeof (mib2_tcp6ConnEntry_t)); + SET_MIB(tcps->tcps_mib.tcpConnTableSize, sizeof (mib2_tcpConnEntry_t)); + SET_MIB(tcps->tcps_mib.tcp6ConnTableSize, + sizeof (mib2_tcp6ConnEntry_t)); /* synchronize 32- and 64-bit counters */ - SYNC32_MIB(&tcp_mib, tcpInSegs, tcpHCInSegs); - SYNC32_MIB(&tcp_mib, tcpOutSegs, tcpHCOutSegs); + SYNC32_MIB(&tcps->tcps_mib, tcpInSegs, tcpHCInSegs); + SYNC32_MIB(&tcps->tcps_mib, tcpOutSegs, tcpHCOutSegs); optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; optp->level = MIB2_TCP; optp->name = 0; - (void) snmp_append_data(mpdata, (char *)&tcp_mib, sizeof (tcp_mib)); + (void) snmp_append_data(mpdata, (char *)&tcps->tcps_mib, + sizeof (tcps->tcps_mib)); optp->len = msgdsize(mpdata); qreply(q, mpctl); @@ -16395,7 +16514,7 @@ tcp_report_item(mblk_t *mp, tcp_t *tcp, int hashval, tcp_t *thisstream, cred_t *cr) { char hash[10], addrbuf[INET6_ADDRSTRLEN]; - boolean_t ispriv = secpolicy_net_config(cr, B_TRUE) == 0; + boolean_t ispriv = secpolicy_ip_config(cr, B_TRUE) == 0; char cflag; in6_addr_t v6dst; char buf[80]; @@ -16512,6 +16631,11 @@ tcp_status_report(queue_t *q, mblk_t *mp, caddr_t cp, 
cred_t *cr) conn_t *connp; connf_t *connfp; zoneid_t zoneid; + tcp_stack_t *tcps; + ip_stack_t *ipst; + + zoneid = Q_TO_CONN(q)->conn_zoneid; + tcps = Q_TO_TCP(q)->tcp_tcps; /* * Because of the ndd constraint, at most we can have 64K buffer @@ -16521,9 +16645,9 @@ tcp_status_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) * we limit the rate of doing this using tcp_ndd_get_info_interval. * This should be OK as normal users should not do this too often. */ - if (cr == NULL || secpolicy_net_config(cr, B_TRUE) != 0) { - if (ddi_get_lbolt() - tcp_last_ndd_get_info_time < - drv_usectohz(tcp_ndd_get_info_interval * 1000)) { + if (cr == NULL || secpolicy_ip_config(cr, B_TRUE) != 0) { + if (ddi_get_lbolt() - tcps->tcps_last_ndd_get_info_time < + drv_usectohz(tcps->tcps_ndd_get_info_interval * 1000)) { (void) mi_mpprintf(mp, NDD_TOO_QUICK_MSG); return (0); } @@ -16536,10 +16660,10 @@ tcp_status_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) (void) mi_mpprintf(mp, "%s", tcp_report_header); - zoneid = Q_TO_CONN(q)->conn_zoneid; for (i = 0; i < CONN_G_HASH_SIZE; i++) { - connfp = &ipcl_globalhash_fanout[i]; + ipst = tcps->tcps_netstack->netstack_ip; + connfp = &ipst->ips_ipcl_globalhash_fanout[i]; connp = NULL; @@ -16555,7 +16679,7 @@ tcp_status_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) } - tcp_last_ndd_get_info_time = ddi_get_lbolt(); + tcps->tcps_last_ndd_get_info_time = ddi_get_lbolt(); return (0); } @@ -16568,11 +16692,14 @@ tcp_bind_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) tcp_t *tcp; int i; zoneid_t zoneid; + tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; + + zoneid = Q_TO_CONN(q)->conn_zoneid; /* Refer to comments in tcp_status_report(). */ - if (cr == NULL || secpolicy_net_config(cr, B_TRUE) != 0) { - if (ddi_get_lbolt() - tcp_last_ndd_get_info_time < - drv_usectohz(tcp_ndd_get_info_interval * 1000)) { + if (cr == NULL || secpolicy_ip_config(cr, B_TRUE) != 0) { + if (ddi_get_lbolt() - tcps->tcps_last_ndd_get_info_time < + drv_usectohz(tcps->tcps_ndd_get_info_interval * 1000)) { (void) mi_mpprintf(mp, NDD_TOO_QUICK_MSG); return (0); } @@ -16585,10 +16712,8 @@ tcp_bind_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) (void) mi_mpprintf(mp, " %s", tcp_report_header); - zoneid = Q_TO_CONN(q)->conn_zoneid; - - for (i = 0; i < A_CNT(tcp_bind_fanout); i++) { - tbf = &tcp_bind_fanout[i]; + for (i = 0; i < TCP_BIND_FANOUT_SIZE; i++) { + tbf = &tcps->tcps_bind_fanout[i]; mutex_enter(&tbf->tf_lock); for (tcp = tbf->tf_tcp; tcp != NULL; tcp = tcp->tcp_bind_hash) { @@ -16602,7 +16727,7 @@ tcp_bind_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) } mutex_exit(&tbf->tf_lock); } - tcp_last_ndd_get_info_time = ddi_get_lbolt(); + tcps->tcps_last_ndd_get_info_time = ddi_get_lbolt(); return (0); } @@ -16616,11 +16741,16 @@ tcp_listen_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) tcp_t *tcp; int i; zoneid_t zoneid; + tcp_stack_t *tcps; + ip_stack_t *ipst; + + zoneid = Q_TO_CONN(q)->conn_zoneid; + tcps = Q_TO_TCP(q)->tcp_tcps; /* Refer to comments in tcp_status_report(). 
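The unprivileged path above is throttled so that the expensive 64K-buffer status dump runs at most once per tcps_ndd_get_info_interval. A user-space sketch of the same check, assuming a millisecond monotonic clock in place of ddi_get_lbolt()/drv_usectohz(); note the kernel refreshes the timestamp only after the report completes, while this sketch refreshes it inside the check for brevity:

    #include <stdio.h>
    #include <time.h>

    static long long last_report_ms;    /* like tcps_last_ndd_get_info_time */

    static long long
    monotonic_ms(void)
    {
        struct timespec ts;

        (void) clock_gettime(CLOCK_MONOTONIC, &ts);
        return ((long long)ts.tv_sec * 1000 + ts.tv_nsec / 1000000);
    }

    static int
    report_allowed(long long interval_ms)
    {
        long long now = monotonic_ms();

        if (now - last_report_ms < interval_ms)
            return (0);            /* the NDD_TOO_QUICK_MSG case */
        last_report_ms = now;        /* kernel does this after the dump */
        return (1);
    }

    int
    main(void)
    {
        int first = report_allowed(1000);
        int second = report_allowed(1000);

        printf("first=%d second=%d\n", first, second);
        return (0);
    }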
*/ - if (cr == NULL || secpolicy_net_config(cr, B_TRUE) != 0) { - if (ddi_get_lbolt() - tcp_last_ndd_get_info_time < - drv_usectohz(tcp_ndd_get_info_interval * 1000)) { + if (cr == NULL || secpolicy_ip_config(cr, B_TRUE) != 0) { + if (ddi_get_lbolt() - tcps->tcps_last_ndd_get_info_time < + drv_usectohz(tcps->tcps_ndd_get_info_interval * 1000)) { (void) mi_mpprintf(mp, NDD_TOO_QUICK_MSG); return (0); } @@ -16635,10 +16765,10 @@ tcp_listen_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) " TCP " MI_COL_HDRPAD_STR "zone IP addr port seqnum backlog (q0/q/max)"); - zoneid = Q_TO_CONN(q)->conn_zoneid; + ipst = tcps->tcps_netstack->netstack_ip; - for (i = 0; i < ipcl_bind_fanout_size; i++) { - connfp = &ipcl_bind_fanout[i]; + for (i = 0; i < ipst->ips_ipcl_bind_fanout_size; i++) { + connfp = &ipst->ips_ipcl_bind_fanout[i]; connp = NULL; while ((connp = ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) { @@ -16650,7 +16780,7 @@ tcp_listen_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) } } - tcp_last_ndd_get_info_time = ddi_get_lbolt(); + tcps->tcps_last_ndd_get_info_time = ddi_get_lbolt(); return (0); } @@ -16664,11 +16794,17 @@ tcp_conn_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) tcp_t *tcp; int i; zoneid_t zoneid; + tcp_stack_t *tcps; + ip_stack_t *ipst; + + zoneid = Q_TO_CONN(q)->conn_zoneid; + tcps = Q_TO_TCP(q)->tcp_tcps; + ipst = tcps->tcps_netstack->netstack_ip; /* Refer to comments in tcp_status_report(). */ - if (cr == NULL || secpolicy_net_config(cr, B_TRUE) != 0) { - if (ddi_get_lbolt() - tcp_last_ndd_get_info_time < - drv_usectohz(tcp_ndd_get_info_interval * 1000)) { + if (cr == NULL || secpolicy_ip_config(cr, B_TRUE) != 0) { + if (ddi_get_lbolt() - tcps->tcps_last_ndd_get_info_time < + drv_usectohz(tcps->tcps_ndd_get_info_interval * 1000)) { (void) mi_mpprintf(mp, NDD_TOO_QUICK_MSG); return (0); } @@ -16680,13 +16816,11 @@ tcp_conn_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) } (void) mi_mpprintf(mp, "tcp_conn_hash_size = %d", - ipcl_conn_fanout_size); + ipst->ips_ipcl_conn_fanout_size); (void) mi_mpprintf(mp, " %s", tcp_report_header); - zoneid = Q_TO_CONN(q)->conn_zoneid; - - for (i = 0; i < ipcl_conn_fanout_size; i++) { - connfp = &ipcl_conn_fanout[i]; + for (i = 0; i < ipst->ips_ipcl_conn_fanout_size; i++) { + connfp = &ipst->ips_ipcl_conn_fanout[i]; connp = NULL; while ((connp = ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) { @@ -16699,7 +16833,7 @@ tcp_conn_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) } } - tcp_last_ndd_get_info_time = ddi_get_lbolt(); + tcps->tcps_last_ndd_get_info_time = ddi_get_lbolt(); return (0); } @@ -16712,11 +16846,15 @@ tcp_acceptor_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) tcp_t *tcp; int i; zoneid_t zoneid; + tcp_stack_t *tcps; + + zoneid = Q_TO_CONN(q)->conn_zoneid; + tcps = Q_TO_TCP(q)->tcp_tcps; /* Refer to comments in tcp_status_report(). 
*/ - if (cr == NULL || secpolicy_net_config(cr, B_TRUE) != 0) { - if (ddi_get_lbolt() - tcp_last_ndd_get_info_time < - drv_usectohz(tcp_ndd_get_info_interval * 1000)) { + if (cr == NULL || secpolicy_ip_config(cr, B_TRUE) != 0) { + if (ddi_get_lbolt() - tcps->tcps_last_ndd_get_info_time < + drv_usectohz(tcps->tcps_ndd_get_info_interval * 1000)) { (void) mi_mpprintf(mp, NDD_TOO_QUICK_MSG); return (0); } @@ -16729,10 +16867,8 @@ tcp_acceptor_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) (void) mi_mpprintf(mp, " %s", tcp_report_header); - zoneid = Q_TO_CONN(q)->conn_zoneid; - - for (i = 0; i < A_CNT(tcp_acceptor_fanout); i++) { - tf = &tcp_acceptor_fanout[i]; + for (i = 0; i < TCP_FANOUT_SIZE; i++) { + tf = &tcps->tcps_acceptor_fanout[i]; mutex_enter(&tf->tf_lock); for (tcp = tf->tf_tcp; tcp != NULL; tcp = tcp->tcp_acceptor_hash) { @@ -16744,7 +16880,7 @@ tcp_acceptor_hash_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) } mutex_exit(&tf->tf_lock); } - tcp_last_ndd_get_info_time = ddi_get_lbolt(); + tcps->tcps_last_ndd_get_info_time = ddi_get_lbolt(); return (0); } @@ -16764,6 +16900,7 @@ tcp_timer(void *arg) uint32_t mss; conn_t *connp = (conn_t *)arg; tcp_t *tcp = connp->conn_tcp; + tcp_stack_t *tcps = tcp->tcp_tcps; tcp->tcp_timer_tid = 0; @@ -16796,8 +16933,8 @@ tcp_timer(void *arg) } if (!listener->tcp_syn_defense && (listener->tcp_syn_rcvd_timeout > - (tcp_conn_req_max_q0 >> 2)) && - (tcp_conn_req_max_q0 > 200)) { + (tcps->tcps_conn_req_max_q0 >> 2)) && + (tcps->tcps_conn_req_max_q0 > 200)) { /* We may be under attack. Put on a defense. */ listener->tcp_syn_defense = B_TRUE; cmn_err(CE_WARN, "High TCP connect timeout " @@ -16844,7 +16981,7 @@ tcp_timer(void *arg) if (tcp->tcp_suna != tcp->tcp_snxt) { clock_t time_to_wait; - BUMP_MIB(&tcp_mib, tcpTimRetrans); + BUMP_MIB(&tcps->tcps_mib, tcpTimRetrans); if (!tcp->tcp_xmit_head) break; time_to_wait = lbolt - @@ -16856,7 +16993,7 @@ tcp_timer(void *arg) * restart the timer. */ if (time_to_wait > msec_per_tick) { - TCP_STAT(tcp_timer_fire_early); + TCP_STAT(tcps, tcp_timer_fire_early); TCP_TIMER_RESTART(tcp, time_to_wait); return; } @@ -16937,7 +17074,7 @@ tcp_timer(void *arg) /* Extend window for zero window probe */ tcp->tcp_swnd++; tcp->tcp_zero_win_probe = B_TRUE; - BUMP_MIB(&tcp_mib, tcpOutWinProbe); + BUMP_MIB(&tcps->tcps_mib, tcpOutWinProbe); } else { /* * Handle timeout from sender SWS avoidance. @@ -16965,7 +17102,7 @@ tcp_timer(void *arg) !tcp->tcp_fin_acked) break; /* Nothing to do, return without restarting timer. */ - TCP_STAT(tcp_timer_fire_miss); + TCP_STAT(tcps, tcp_timer_fire_miss); return; case TCPS_FIN_WAIT_2: /* @@ -16977,7 +17114,8 @@ tcp_timer(void *arg) if (TCP_IS_DETACHED(tcp)) { (void) tcp_clean_death(tcp, 0, 23); } else { - TCP_TIMER_RESTART(tcp, tcp_fin_wait_2_flush_interval); + TCP_TIMER_RESTART(tcp, + tcps->tcps_fin_wait_2_flush_interval); } return; case TCPS_TIME_WAIT: @@ -17001,7 +17139,7 @@ tcp_timer(void *arg) if ((tcp->tcp_zero_win_probe == 0) || (TICK_TO_MSEC(lbolt - tcp->tcp_last_recv_time) > second_threshold)) { - BUMP_MIB(&tcp_mib, tcpTimRetransDrop); + BUMP_MIB(&tcps->tcps_mib, tcpTimRetransDrop); /* * If TCP is in SYN_RCVD state, send back a * RST|ACK as BSD does. 
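The listener hunk above arms the SYN-flood defense once timed-out embryonic connections exceed a quarter of the q0 limit, and only when that limit is large enough (greater than 200) for the heuristic to mean anything. A stand-alone sketch of the trigger; the struct fields are hypothetical stand-ins for the tcp_t members, and the message text is illustrative:

    #include <stdio.h>

    struct listener {
        int syn_defense;        /* like tcp_syn_defense */
        int syn_rcvd_timeout;    /* like tcp_syn_rcvd_timeout */
    };

    static void
    check_syn_defense(struct listener *l, int conn_req_max_q0)
    {
        if (!l->syn_defense &&
            l->syn_rcvd_timeout > (conn_req_max_q0 >> 2) &&
            conn_req_max_q0 > 200) {
            l->syn_defense = 1;    /* we may be under attack */
            printf("High TCP connect timeout rate; possible SYN flood\n");
        }
    }

    int
    main(void)
    {
        struct listener l = { 0, 300 };

        check_syn_defense(&l, 1024);    /* 300 > 1024/4, so defense arms */
        printf("defense=%d\n", l.syn_defense);
        return (0);
    }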
Note that tcp_zero_win_probe @@ -17059,19 +17197,19 @@ tcp_timer(void *arg) } tcp->tcp_timer_backoff++; if ((ms = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + - tcp_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5)) < - tcp_rexmit_interval_min) { + tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5)) < + tcps->tcps_rexmit_interval_min) { /* * This means the original RTO is tcp_rexmit_interval_min. * So we will use tcp_rexmit_interval_min as the RTO value * and do the backoff. */ - ms = tcp_rexmit_interval_min << tcp->tcp_timer_backoff; + ms = tcps->tcps_rexmit_interval_min << tcp->tcp_timer_backoff; } else { ms <<= tcp->tcp_timer_backoff; } - if (ms > tcp_rexmit_interval_max) { - ms = tcp_rexmit_interval_max; + if (ms > tcps->tcps_rexmit_interval_max) { + ms = tcps->tcps_rexmit_interval_max; /* * ms is at max, decrement tcp_timer_backoff to avoid * overflow. @@ -17135,8 +17273,8 @@ tcp_timer(void *arg) } tcp->tcp_csuna = tcp->tcp_snxt; - BUMP_MIB(&tcp_mib, tcpRetransSegs); - UPDATE_MIB(&tcp_mib, tcpRetransBytes, mss); + BUMP_MIB(&tcps->tcps_mib, tcpRetransSegs); + UPDATE_MIB(&tcps->tcps_mib, tcpRetransBytes, mss); TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_SEND_PKT); tcp_send_data(tcp, tcp->tcp_wq, mp); @@ -17208,6 +17346,7 @@ tcp_update_next_port(in_port_t port, const tcp_t *tcp, boolean_t random) { int i; boolean_t restart = B_FALSE; + tcp_stack_t *tcps = tcp->tcp_tcps; if (random && tcp_random_anon_port != 0) { (void) random_get_pseudo_bytes((uint8_t *)&port, @@ -17221,29 +17360,29 @@ tcp_update_next_port(in_port_t port, const tcp_t *tcp, boolean_t random) * port to get the random port. It should fall into the * valid anon port range. */ - if (port < tcp_smallest_anon_port) { - port = tcp_smallest_anon_port + - port % (tcp_largest_anon_port - - tcp_smallest_anon_port); + if (port < tcps->tcps_smallest_anon_port) { + port = tcps->tcps_smallest_anon_port + + port % (tcps->tcps_largest_anon_port - + tcps->tcps_smallest_anon_port); } } retry: - if (port < tcp_smallest_anon_port) - port = (in_port_t)tcp_smallest_anon_port; + if (port < tcps->tcps_smallest_anon_port) + port = (in_port_t)tcps->tcps_smallest_anon_port; - if (port > tcp_largest_anon_port) { + if (port > tcps->tcps_largest_anon_port) { if (restart) return (0); restart = B_TRUE; - port = (in_port_t)tcp_smallest_anon_port; + port = (in_port_t)tcps->tcps_smallest_anon_port; } - if (port < tcp_smallest_nonpriv_port) - port = (in_port_t)tcp_smallest_nonpriv_port; + if (port < tcps->tcps_smallest_nonpriv_port) + port = (in_port_t)tcps->tcps_smallest_nonpriv_port; - for (i = 0; i < tcp_g_num_epriv_ports; i++) { - if (port == tcp_g_epriv_ports[i]) { + for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) { + if (port == tcps->tcps_g_epriv_ports[i]) { port++; /* * Make sure whether the port is in the @@ -17275,9 +17414,9 @@ tcp_get_next_priv_port(const tcp_t *tcp) static in_port_t next_priv_port = IPPORT_RESERVED - 1; in_port_t nextport; boolean_t restart = B_FALSE; - + tcp_stack_t *tcps = tcp->tcp_tcps; retry: - if (next_priv_port < tcp_min_anonpriv_port || + if (next_priv_port < tcps->tcps_min_anonpriv_port || next_priv_port >= IPPORT_RESERVED) { next_priv_port = IPPORT_RESERVED - 1; if (restart) @@ -17370,6 +17509,7 @@ tcp_output(void *arg, mblk_t *mp, void *arg2) conn_t *connp = (conn_t *)arg; tcp_t *tcp = connp->conn_tcp; uint32_t msize; + tcp_stack_t *tcps = tcp->tcp_tcps; /* * Try and ASSERT the minimum possible references on the @@ -17457,7 +17597,7 @@ tcp_output(void *arg, mblk_t *mp, void *arg2) */ if ((tcp->tcp_suna == snxt) && 
!tcp->tcp_localnet && (TICK_TO_MSEC(lbolt - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) { - SET_TCP_INIT_CWND(tcp, mss, tcp_slow_start_after_idle); + SET_TCP_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle); } usable = tcp->tcp_swnd; /* tcp window size */ @@ -17530,8 +17670,8 @@ tcp_output(void *arg, mblk_t *mp, void *arg2) U32_TO_ABE32(snxt, tcph->th_seq); - BUMP_MIB(&tcp_mib, tcpOutDataSegs); - UPDATE_MIB(&tcp_mib, tcpOutDataBytes, len); + BUMP_MIB(&tcps->tcps_mib, tcpOutDataSegs); + UPDATE_MIB(&tcps->tcps_mib, tcpOutDataBytes, len); BUMP_LOCAL(tcp->tcp_obsegs); /* Update the latest receive window size in TCP header. */ @@ -17557,7 +17697,7 @@ tcp_output(void *arg, mblk_t *mp, void *arg2) (!OK_32PTR(rptr))) { /* NOTE: we assume allocb returns an OK_32PTR */ mp = allocb(tcp->tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + - tcp_wroff_xtra, BPRI_MED); + tcps->tcps_wroff_xtra, BPRI_MED); if (!mp) { freemsg(mp1); goto no_memory; @@ -17566,7 +17706,7 @@ tcp_output(void *arg, mblk_t *mp, void *arg2) mp1 = mp; /* Leave room for Link Level header */ /* hdrlen = tcp->tcp_hdr_len; */ - rptr = &mp1->b_rptr[tcp_wroff_xtra]; + rptr = &mp1->b_rptr[tcps->tcps_wroff_xtra]; mp1->b_wptr = &rptr[hdrlen]; } mp1->b_rptr = rptr; @@ -17657,6 +17797,7 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) mblk_t *stropt_mp = mp; struct stroptions *stropt; uint_t thwin; + tcp_stack_t *tcps = tcp->tcp_tcps; /* * Drop the eager's ref on the listener, that was placed when @@ -17765,7 +17906,7 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) } stropt->so_flags = SO_HIWAT; - stropt->so_hiwat = MAX(q->q_hiwat, tcp_sth_rcv_hiwat); + stropt->so_hiwat = MAX(q->q_hiwat, tcps->tcps_sth_rcv_hiwat); stropt->so_flags |= SO_MAXBLK; stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE); @@ -17800,10 +17941,10 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) (void) tcp_maxpsz_set(tcp->tcp_loopback_peer, B_TRUE); } else if (tcp->tcp_snd_sack_ok) { stropt->so_wroff = tcp->tcp_hdr_len + TCPOPT_MAX_SACK_LEN + - (tcp->tcp_loopback ? 0 : tcp_wroff_xtra); + (tcp->tcp_loopback ? 0 : tcps->tcps_wroff_xtra); } else { stropt->so_wroff = tcp->tcp_hdr_len + (tcp->tcp_loopback ? 0 : - tcp_wroff_xtra); + tcps->tcps_wroff_xtra); } /* @@ -17851,7 +17992,7 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) tcp, (tcp->tcp_swnd == 0) ? tcp->tcp_suna : tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK); - BUMP_MIB(&tcp_mib, tcpOutWinUpdate); + BUMP_MIB(&tcps->tcps_mib, tcpOutWinUpdate); } } @@ -17880,7 +18021,7 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) } if (peer_tcp->tcp_flow_stopped) { tcp_clrqfull(peer_tcp); - TCP_STAT(tcp_fusion_backenabled); + TCP_STAT(tcps, tcp_fusion_backenabled); } mutex_exit(&peer_tcp->tcp_non_sq_lock); mutex_exit(&tcp->tcp_non_sq_lock); @@ -17982,7 +18123,7 @@ tcp_send_pending(void *arg, mblk_t *mp, void *arg2) /* * This is the STREAMS entry point for T_CONN_RES coming down on * Acceptor STREAM when sockfs listener does accept processing. - * Read the block comment on top pf tcp_conn_request(). + * Read the block comment on top of tcp_conn_request(). 
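The retransmission hunks in tcp_timer() a little further up compute the backed-off RTO as (srtt >> 3) + rttvar + extra + (srtt >> 5), shifted left once per backoff and clamped between the per-stack minimum and maximum (tcp_rtt_sa is kept scaled by 8, hence the shifts). A sketch of that computation, assuming milliseconds in plain longs where the kernel uses clock ticks:

    #include <stdio.h>

    static long
    backoff_rto(long rtt_sa, long rtt_sd, int backoff, long extra,
        long rto_min, long rto_max)
    {
        long ms = (rtt_sa >> 3) + rtt_sd + extra + (rtt_sa >> 5);

        if (ms < rto_min)
            ms = rto_min << backoff;    /* back off from the floor */
        else
            ms <<= backoff;
        if (ms > rto_max)
            ms = rto_max;        /* never exceed the ceiling */
        return (ms);
    }

    int
    main(void)
    {
        int i;

        for (i = 0; i < 6; i++)
            printf("backoff %d -> RTO %ld ms\n", i,
                backoff_rto(800, 50, i, 0, 400, 60000));
        return (0);
    }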
*/ void tcp_wput_accept(queue_t *q, mblk_t *mp) @@ -18048,6 +18189,9 @@ tcp_wput_accept(queue_t *q, mblk_t *mp) econnp->conn_zoneid = listener->tcp_connp->conn_zoneid; econnp->conn_allzones = listener->tcp_connp->conn_allzones; + ASSERT(econnp->conn_netstack == + listener->tcp_connp->conn_netstack); + ASSERT(eager->tcp_tcps == listener->tcp_tcps); /* Put the ref for IP */ CONN_INC_REF(econnp); @@ -18231,6 +18375,7 @@ tcp_wput(queue_t *q, mblk_t *mp) uchar_t *rptr; struct iocblk *iocp; uint32_t msize; + tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; ASSERT(connp->conn_ref >= 2); @@ -18315,7 +18460,7 @@ tcp_wput(queue_t *q, mblk_t *mp) case ND_SET: /* nd_getset does the necessary checks */ case ND_GET: - if (!nd_getset(q, tcp_g_nd, mp)) { + if (!nd_getset(q, tcps->tcps_g_nd, mp)) { CALL_IP_WPUT(connp, q, mp); return; } @@ -18326,7 +18471,7 @@ tcp_wput(queue_t *q, mblk_t *mp) * Wants to be the default wq. Check the credentials * first, the rest is executed via squeue. */ - if (secpolicy_net_config(iocp->ioc_cr, B_FALSE) != 0) { + if (secpolicy_ip_config(iocp->ioc_cr, B_FALSE) != 0) { iocp->ioc_error = EPERM; iocp->ioc_count = 0; mp->b_datap->db_type = M_IOCACK; @@ -18388,6 +18533,7 @@ tcp_zcopy_check(tcp_t *tcp) conn_t *connp = tcp->tcp_connp; ire_t *ire; boolean_t zc_enabled = B_FALSE; + tcp_stack_t *tcps = tcp->tcp_tcps; if (do_tcpzcopy == 2) zc_enabled = B_TRUE; @@ -18424,10 +18570,10 @@ tcp_zcopy_check(tcp_t *tcp) if (!TCP_IS_DETACHED(tcp)) { if (zc_enabled) { (void) mi_set_sth_copyopt(tcp->tcp_rq, ZCVMSAFE); - TCP_STAT(tcp_zcopy_on); + TCP_STAT(tcps, tcp_zcopy_on); } else { (void) mi_set_sth_copyopt(tcp->tcp_rq, ZCVMUNSAFE); - TCP_STAT(tcp_zcopy_off); + TCP_STAT(tcps, tcp_zcopy_off); } } return (zc_enabled); @@ -18436,13 +18582,15 @@ tcp_zcopy_check(tcp_t *tcp) static mblk_t * tcp_zcopy_disable(tcp_t *tcp, mblk_t *bp) { + tcp_stack_t *tcps = tcp->tcp_tcps; + if (do_tcpzcopy == 2) return (bp); else if (tcp->tcp_snd_zcopy_on) { tcp->tcp_snd_zcopy_on = B_FALSE; if (!TCP_IS_DETACHED(tcp)) { (void) mi_set_sth_copyopt(tcp->tcp_rq, ZCVMUNSAFE); - TCP_STAT(tcp_zcopy_disable); + TCP_STAT(tcps, tcp_zcopy_disable); } } return (tcp_zcopy_backoff(tcp, bp, 0)); @@ -18456,8 +18604,10 @@ static mblk_t * tcp_zcopy_backoff(tcp_t *tcp, mblk_t *bp, int fix_xmitlist) { mblk_t *head, *tail, *nbp; + tcp_stack_t *tcps = tcp->tcp_tcps; + if (IS_VMLOANED_MBLK(bp)) { - TCP_STAT(tcp_zcopy_backoff); + TCP_STAT(tcps, tcp_zcopy_backoff); if ((head = copyb(bp)) == NULL) { /* fail to backoff; leave it for the next backoff */ tcp->tcp_xmit_zc_clean = B_FALSE; @@ -18486,7 +18636,7 @@ tcp_zcopy_backoff(tcp_t *tcp, mblk_t *bp, int fix_xmitlist) tail = head; while (nbp) { if (IS_VMLOANED_MBLK(nbp)) { - TCP_STAT(tcp_zcopy_backoff); + TCP_STAT(tcps, tcp_zcopy_backoff); if ((tail->b_cont = copyb(nbp)) == NULL) { tcp->tcp_xmit_zc_clean = B_FALSE; tail->b_cont = nbp; @@ -18541,9 +18691,10 @@ tcp_zcopy_notify(tcp_t *tcp) static boolean_t tcp_send_find_ire(tcp_t *tcp, ipaddr_t *dst, ire_t **irep) { - ire_t *ire; - conn_t *connp = tcp->tcp_connp; - + ire_t *ire; + conn_t *connp = tcp->tcp_connp; + tcp_stack_t *tcps = tcp->tcp_tcps; + ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; mutex_enter(&connp->conn_lock); ire = connp->conn_ire_cache; @@ -18562,7 +18713,7 @@ tcp_send_find_ire(tcp_t *tcp, ipaddr_t *dst, ire_t **irep) /* force a recheck later on */ tcp->tcp_ire_ill_check_done = B_FALSE; - TCP_DBGSTAT(tcp_ire_null1); + TCP_DBGSTAT(tcps, tcp_ire_null1); connp->conn_ire_cache = NULL; mutex_exit(&connp->conn_lock); @@ -18570,12 
+18721,13 @@ tcp_send_find_ire(tcp_t *tcp, ipaddr_t *dst, ire_t **irep) IRE_REFRELE_NOTR(ire); tsl = crgetlabel(CONN_CRED(connp)); - ire = (dst ? ire_cache_lookup(*dst, connp->conn_zoneid, tsl) : + ire = (dst ? + ire_cache_lookup(*dst, connp->conn_zoneid, tsl, ipst) : ire_cache_lookup_v6(&tcp->tcp_ip6h->ip6_dst, - connp->conn_zoneid, tsl)); + connp->conn_zoneid, tsl, ipst)); if (ire == NULL) { - TCP_STAT(tcp_ire_null); + TCP_STAT(tcps, tcp_ire_null); return (B_FALSE); } @@ -18630,6 +18782,7 @@ tcp_send_find_ire_ill(tcp_t *tcp, mblk_t *mp, ire_t **irep, ill_t **illp) ill_t *ill; conn_t *connp = tcp->tcp_connp; mblk_t *ire_fp_mp; + tcp_stack_t *tcps = tcp->tcp_tcps; if (mp != NULL) ipha = (ipha_t *)mp->b_rptr; @@ -18646,7 +18799,7 @@ tcp_send_find_ire_ill(tcp_t *tcp, mblk_t *mp, ire_t **irep, ill_t **illp) ((ire_fp_mp = ire->ire_nce->nce_fp_mp) == NULL) || ((mp != NULL) && (ire->ire_max_frag < ntohs(ipha->ipha_length) || MBLKL(ire_fp_mp) > MBLKHEAD(mp)))) { - TCP_STAT(tcp_ip_ire_send); + TCP_STAT(tcps, tcp_ip_ire_send); IRE_REFRELE(ire); return (B_FALSE); } @@ -18687,6 +18840,8 @@ tcp_send_data(tcp_t *tcp, queue_t *q, mblk_t *mp) uint32_t hcksum_txflags = 0; mblk_t *ire_fp_mp; uint_t ire_fp_mp_len; + tcp_stack_t *tcps = tcp->tcp_tcps; + ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; ASSERT(DB_TYPE(mp) == M_DATA); @@ -18708,10 +18863,10 @@ tcp_send_data(tcp_t *tcp, queue_t *q, mblk_t *mp) !connp->conn_ulp_labeled || ipha->ipha_ident == IP_HDR_INCLUDED || ipha->ipha_version_and_hdr_length != IP_SIMPLE_HDR_VERSION || - IPP_ENABLED(IPP_LOCAL_OUT)) { + IPP_ENABLED(IPP_LOCAL_OUT, ipst)) { if (tcp->tcp_snd_zcopy_aware) mp = tcp_zcopy_disable(tcp, mp); - TCP_STAT(tcp_ip_send); + TCP_STAT(tcps, tcp_ip_send); CALL_IP_WPUT(connp, q, mp); return; } @@ -18746,7 +18901,7 @@ tcp_send_data(tcp_t *tcp, queue_t *q, mblk_t *mp) * Restore LSO for this connection, so that next time around * it is eligible to go through tcp_lsosend() path again. */ - TCP_STAT(tcp_lso_enabled); + TCP_STAT(tcps, tcp_lso_enabled); tcp->tcp_lso = B_TRUE; ip1dbg(("tcp_send_data: reenabling LSO for connp %p on " "interface %s\n", (void *)connp, ill->ill_name)); @@ -18755,7 +18910,7 @@ tcp_send_data(tcp_t *tcp, queue_t *q, mblk_t *mp) * Restore MDT for this connection, so that next time around * it is eligible to go through tcp_multisend() path again. */ - TCP_STAT(tcp_mdt_conn_resumed1); + TCP_STAT(tcps, tcp_mdt_conn_resumed1); tcp->tcp_mdt = B_TRUE; ip1dbg(("tcp_send_data: reenabling MDT for connp %p on " "interface %s\n", (void *)connp, ill->ill_name)); @@ -18787,8 +18942,8 @@ tcp_send_data(tcp_t *tcp, queue_t *q, mblk_t *mp) /* Software checksum? */ if (DB_CKSUMFLAGS(mp) == 0) { - TCP_STAT(tcp_out_sw_cksum); - TCP_STAT_UPDATE(tcp_out_sw_cksum_bytes, + TCP_STAT(tcps, tcp_out_sw_cksum); + TCP_STAT_UPDATE(tcps, tcp_out_sw_cksum_bytes, ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH); } @@ -18819,14 +18974,15 @@ tcp_send_data(tcp_t *tcp, queue_t *q, mblk_t *mp) * depending on the availability of transmit resources at * the media layer. 
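When no hardware offload applies, the send path above falls back to a software checksum and bumps tcp_out_sw_cksum. For reference, a minimal implementation of the Internet one's-complement sum, including the (sum & 0xFFFF) + (sum >> 16) folding step that also appears verbatim in the MDT hunks further down; the result's byte order follows the input buffer:

    #include <stdio.h>
    #include <stdint.h>
    #include <stddef.h>

    static uint16_t
    inet_cksum(const void *buf, size_t len, uint32_t sum)
    {
        const uint16_t *p = buf;

        while (len > 1) {
            sum += *p++;
            len -= 2;
        }
        if (len == 1)            /* odd trailing byte */
            sum += *(const uint8_t *)p;
        while (sum >> 16)        /* fold carries back in */
            sum = (sum & 0xFFFF) + (sum >> 16);
        return ((uint16_t)~sum);
    }

    int
    main(void)
    {
        static const uint8_t seg[4] = { 0x45, 0x00, 0x00, 0x1c };

        printf("cksum=0x%04x\n", (unsigned)inet_cksum(seg, sizeof (seg), 0));
        return (0);
    }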
*/ - IP_DLS_ILL_TX(ill, ipha, mp); + IP_DLS_ILL_TX(ill, ipha, mp, ipst); } else { ill_t *out_ill = (ill_t *)ire->ire_stq->q_ptr; DTRACE_PROBE4(ip4__physical__out__start, ill_t *, NULL, ill_t *, out_ill, ipha_t *, ipha, mblk_t *, mp); - FW_HOOKS(ip4_physical_out_event, ipv4firewall_physical_out, - NULL, out_ill, ipha, mp, mp); + FW_HOOKS(ipst->ips_ip4_physical_out_event, + ipst->ips_ipv4firewall_physical_out, + NULL, out_ill, ipha, mp, mp, ipst); DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); if (mp != NULL) putnext(ire->ire_stq, mp); @@ -18896,6 +19052,8 @@ tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent) int32_t tcp_tcp_hdr_len; int mdt_thres; int rc; + tcp_stack_t *tcps = tcp->tcp_tcps; + ip_stack_t *ipst; tcpstate = tcp->tcp_state; if (mp == NULL) { @@ -19052,7 +19210,7 @@ data_null: if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet && (TICK_TO_MSEC(lbolt - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) { - SET_TCP_INIT_CWND(tcp, mss, tcp_slow_start_after_idle); + SET_TCP_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle); } if (tcpstate == TCPS_SYN_RCVD) { /* @@ -19192,6 +19350,8 @@ data_null: * connection, stop using LSO/MDT and restore the stream head * parameters accordingly. */ + ipst = tcps->tcps_netstack->netstack_ip; + if ((tcp->tcp_lso || tcp->tcp_mdt) && ((tcp->tcp_ipversion == IPV4_VERSION && tcp->tcp_ip_hdr_len != IP_SIMPLE_HDR_LENGTH) || @@ -19200,7 +19360,7 @@ data_null: tcp->tcp_state != TCPS_ESTABLISHED || TCP_IS_DETACHED(tcp) || !CONN_IS_LSO_MD_FASTPATH(tcp->tcp_connp) || CONN_IPSEC_OUT_ENCAPSULATED(tcp->tcp_connp) || - IPP_ENABLED(IPP_LOCAL_OUT))) { + IPP_ENABLED(IPP_LOCAL_OUT, ipst))) { if (tcp->tcp_lso) { tcp->tcp_connp->conn_lso_ok = B_FALSE; tcp->tcp_lso = B_FALSE; @@ -19212,9 +19372,9 @@ data_null: /* Anything other than detached is considered pathological */ if (!TCP_IS_DETACHED(tcp)) { if (tcp->tcp_lso) - TCP_STAT(tcp_lso_disabled); + TCP_STAT(tcps, tcp_lso_disabled); else - TCP_STAT(tcp_mdt_conn_halted1); + TCP_STAT(tcps, tcp_mdt_conn_halted1); (void) tcp_maxpsz_set(tcp, B_TRUE); } } @@ -19400,7 +19560,7 @@ tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk) static int tcp_mdt_add_attrs(multidata_t *mmd, const mblk_t *dlmp, const boolean_t hwcksum, const uint32_t start, const uint32_t stuff, const uint32_t end, - const uint32_t flags) + const uint32_t flags, tcp_stack_t *tcps) { /* Add global destination address & SAP attribute */ if (dlmp == NULL || !ip_md_addr_attr(mmd, NULL, dlmp)) { @@ -19408,7 +19568,7 @@ tcp_mdt_add_attrs(multidata_t *mmd, const mblk_t *dlmp, const boolean_t hwcksum, "destination address+SAP\n")); if (dlmp != NULL) - TCP_STAT(tcp_mdt_allocfail); + TCP_STAT(tcps, tcp_mdt_allocfail); return (-1); } @@ -19418,7 +19578,7 @@ tcp_mdt_add_attrs(multidata_t *mmd, const mblk_t *dlmp, const boolean_t hwcksum, ip1dbg(("tcp_mdt_add_attrs: can't add global hardware " "checksum attribute\n")); - TCP_STAT(tcp_mdt_allocfail); + TCP_STAT(tcps, tcp_mdt_allocfail); return (-1); } @@ -19472,6 +19632,8 @@ tcp_multisend(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, conn_t *connp; mblk_t *mp, *mp1, *fw_mp_head = NULL; uchar_t *pld_start; + tcp_stack_t *tcps = tcp->tcp_tcps; + ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; #ifdef _BIG_ENDIAN #define IPVER(ip6h) ((((uint32_t *)ip6h)[0] >> 28) & 0x7) @@ -19574,7 +19736,7 @@ tcp_multisend(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, */ if (!ILL_MDT_USABLE(ill) || (ire->ire_flags & RTF_MULTIRT) != 0) { /* don't go through this path anymore for 
this connection */ - TCP_STAT(tcp_mdt_conn_halted2); + TCP_STAT(tcps, tcp_mdt_conn_halted2); tcp->tcp_mdt = B_FALSE; ip1dbg(("tcp_multisend: disabling MDT for connp %p on " "interface %s\n", (void *)connp, ill->ill_name)); @@ -19678,7 +19840,7 @@ tcp_multisend(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, * return to us once a large-size transmission is * possible. */ - TCP_STAT(tcp_mdt_legacy_small); + TCP_STAT(tcps, tcp_mdt_legacy_small); if ((err = tcp_send(q, tcp, mss, tcp_hdr_len, tcp_tcp_hdr_len, num_sack_blk, usable, snxt, tail_unsent, xmit_tail, local_time, @@ -19694,7 +19856,7 @@ tcp_multisend(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, return (0); } - TCP_STAT(tcp_mdt_legacy_ret); + TCP_STAT(tcps, tcp_mdt_legacy_ret); /* * We may have delivered the Multidata, so make sure * to re-initialize before the next round. @@ -19788,7 +19950,7 @@ tcp_multisend(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, /* hardware checksum offsets */ start, stuff, 0, /* hardware checksum flag */ - hwcksum_flags) != 0)) { + hwcksum_flags, tcps) != 0)) { legacy_send: if (md_mp != NULL) { /* Unlink message from the chain */ @@ -19807,11 +19969,11 @@ legacy_send: md_mp_head = NULL; } /* md_hbuf gets freed automatically */ - TCP_STAT(tcp_mdt_discarded); + TCP_STAT(tcps, tcp_mdt_discarded); freeb(md_mp); } else { /* Either allocb or mmd_alloc failed */ - TCP_STAT(tcp_mdt_allocfail); + TCP_STAT(tcps, tcp_mdt_allocfail); if (md_hbuf != NULL) freeb(md_hbuf); } @@ -19831,7 +19993,7 @@ legacy_send_no_md: * we gave up with the Multidata processings * and let the old path have it all. */ - TCP_STAT(tcp_mdt_legacy_all); + TCP_STAT(tcps, tcp_mdt_legacy_all); return (tcp_send(q, tcp, mss, tcp_hdr_len, tcp_tcp_hdr_len, num_sack_blk, usable, snxt, tail_unsent, xmit_tail, local_time, @@ -19839,11 +20001,11 @@ legacy_send_no_md: } /* link to any existing ones, if applicable */ - TCP_STAT(tcp_mdt_allocd); + TCP_STAT(tcps, tcp_mdt_allocd); if (md_mp_head == NULL) { md_mp_head = md_mp; } else if (tcp_mdt_chain) { - TCP_STAT(tcp_mdt_linked); + TCP_STAT(tcps, tcp_mdt_linked); linkb(md_mp_head, md_mp); } } @@ -19896,7 +20058,7 @@ legacy_send_no_md: break; /* done */ if ((md_pbuf = dupb(*xmit_tail)) == NULL) { - TCP_STAT(tcp_mdt_allocfail); + TCP_STAT(tcps, tcp_mdt_allocfail); goto legacy_send; /* out_of_mem */ } @@ -19905,7 +20067,8 @@ legacy_send_no_md: if (!ip_md_zcopy_attr(mmd, NULL, zc_cap->ill_zerocopy_flags)) { freeb(md_pbuf); - TCP_STAT(tcp_mdt_allocfail); + TCP_STAT(tcps, + tcp_mdt_allocfail); /* out_of_mem */ goto legacy_send; } @@ -19968,7 +20131,7 @@ legacy_send_no_md: max_pld > 0) { md_pbuf_nxt = dupb((*xmit_tail)->b_cont); if (md_pbuf_nxt == NULL) { - TCP_STAT(tcp_mdt_allocfail); + TCP_STAT(tcps, tcp_mdt_allocfail); goto legacy_send; /* out_of_mem */ } @@ -19977,7 +20140,8 @@ legacy_send_no_md: if (!ip_md_zcopy_attr(mmd, NULL, zc_cap->ill_zerocopy_flags)) { freeb(md_pbuf_nxt); - TCP_STAT(tcp_mdt_allocfail); + TCP_STAT(tcps, + tcp_mdt_allocfail); /* out_of_mem */ goto legacy_send; } @@ -20094,7 +20258,8 @@ legacy_send_no_md: *snxt == tcp->tcp_fss) { if (!tcp->tcp_fin_acked) { tcp->tcp_tcph->th_flags[0] |= TH_FIN; - BUMP_MIB(&tcp_mib, tcpOutControl); + BUMP_MIB(&tcps->tcps_mib, + tcpOutControl); } if (!tcp->tcp_fin_sent) { tcp->tcp_fin_sent = B_TRUE; @@ -20294,7 +20459,7 @@ legacy_send_no_md: (void *)tcp, (void *)mmd, (void *)pkt_info, err); } - TCP_STAT(tcp_mdt_addpdescfail); + TCP_STAT(tcps, tcp_mdt_addpdescfail); goto legacy_send; /* out_of_mem */ } ASSERT(pkt != 
NULL); @@ -20336,8 +20501,8 @@ legacy_send_no_md: *up = (sum & 0xFFFF) + (sum >> 16); } else { /* software checksumming */ - TCP_STAT(tcp_out_sw_cksum); - TCP_STAT_UPDATE(tcp_out_sw_cksum_bytes, + TCP_STAT(tcps, tcp_out_sw_cksum); + TCP_STAT_UPDATE(tcps, tcp_out_sw_cksum_bytes, tcp->tcp_hdr_len + tcp->tcp_last_sent_len); *up = IP_MD_CSUM(pkt, tcp->tcp_ip_hdr_len, cksum + IP_TCP_CSUM_COMP); @@ -20359,8 +20524,10 @@ legacy_send_no_md: } } - if (af == AF_INET && HOOKS4_INTERESTED_PHYSICAL_OUT|| - af == AF_INET6 && HOOKS6_INTERESTED_PHYSICAL_OUT) { + if (af == AF_INET && + HOOKS4_INTERESTED_PHYSICAL_OUT(ipst) || + af == AF_INET6 && + HOOKS6_INTERESTED_PHYSICAL_OUT(ipst)) { /* build header(IP/TCP) mblk for this segment */ if ((mp = dupb(md_hbuf)) == NULL) goto legacy_send; @@ -20387,9 +20554,10 @@ legacy_send_no_md: ill_t *, ill, ipha_t *, ipha, mblk_t *, mp); - FW_HOOKS(ip4_physical_out_event, - ipv4firewall_physical_out, - NULL, ill, ipha, mp, mp); + FW_HOOKS( + ipst->ips_ip4_physical_out_event, + ipst->ips_ipv4firewall_physical_out, + NULL, ill, ipha, mp, mp, ipst); DTRACE_PROBE1( ip4__physical__out__end, mblk_t *, mp); @@ -20400,9 +20568,10 @@ legacy_send_no_md: ill_t *, ill, ip6_t *, ip6h, mblk_t *, mp); - FW_HOOKS6(ip6_physical_out_event, - ipv6firewall_physical_out, - NULL, ill, ip6h, mp, mp); + FW_HOOKS6( + ipst->ips_ip6_physical_out_event, + ipst->ips_ipv6firewall_physical_out, + NULL, ill, ip6h, mp, mp, ipst); DTRACE_PROBE1( ip6__physical__out__end, mblk_t *, mp); @@ -20518,7 +20687,7 @@ legacy_send_no_md: freemsg(mp); } if (buf_trunked) { - TCP_STAT(tcp_mdt_discarded); + TCP_STAT(tcps, tcp_mdt_discarded); freeb(md_mp); buf_trunked = B_FALSE; } @@ -20550,6 +20719,8 @@ tcp_multisend_data(tcp_t *tcp, ire_t *ire, const ill_t *ill, mblk_t *md_mp_head, { uint64_t delta; nce_t *nce; + tcp_stack_t *tcps = tcp->tcp_tcps; + ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; ASSERT(ire != NULL && ill != NULL); ASSERT(ire->ire_stq != NULL); @@ -20559,14 +20730,14 @@ tcp_multisend_data(tcp_t *tcp, ire_t *ire, const ill_t *ill, mblk_t *md_mp_head, /* adjust MIBs and IRE timestamp */ TCP_RECORD_TRACE(tcp, md_mp_head, TCP_TRACE_SEND_PKT); tcp->tcp_obsegs += obsegs; - UPDATE_MIB(&tcp_mib, tcpOutDataSegs, obsegs); - UPDATE_MIB(&tcp_mib, tcpOutDataBytes, obbytes); - TCP_STAT_UPDATE(tcp_mdt_pkt_out, obsegs); + UPDATE_MIB(&tcps->tcps_mib, tcpOutDataSegs, obsegs); + UPDATE_MIB(&tcps->tcps_mib, tcpOutDataBytes, obbytes); + TCP_STAT_UPDATE(tcps, tcp_mdt_pkt_out, obsegs); if (tcp->tcp_ipversion == IPV4_VERSION) { - TCP_STAT_UPDATE(tcp_mdt_pkt_out_v4, obsegs); + TCP_STAT_UPDATE(tcps, tcp_mdt_pkt_out_v4, obsegs); } else { - TCP_STAT_UPDATE(tcp_mdt_pkt_out_v6, obsegs); + TCP_STAT_UPDATE(tcps, tcp_mdt_pkt_out_v6, obsegs); } UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests, obsegs); UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits, obsegs); @@ -20630,7 +20801,8 @@ tcp_multisend_data(tcp_t *tcp, ire_t *ire, const ill_t *ill, mblk_t *md_mp_head, */ nce->nce_state = ND_DELAY; mutex_exit(&nce->nce_lock); - NDP_RESTART_TIMER(nce, delay_first_probe_time); + NDP_RESTART_TIMER(nce, + ipst->ips_delay_first_probe_time); if (ip_debug > 3) { /* ip2dbg */ pr_addr_dbg("tcp_multisend_data: state " @@ -20675,6 +20847,8 @@ tcp_lsosend_data(tcp_t *tcp, mblk_t *mp, ire_t *ire, ill_t *ill, const int mss, ipaddr_t dst; uint32_t cksum; uint16_t *up; + tcp_stack_t *tcps = tcp->tcp_tcps; + ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; ASSERT(DB_TYPE(mp) == M_DATA); ASSERT(tcp->tcp_state == TCPS_ESTABLISHED); @@ -20746,14 
+20920,15 @@ tcp_lsosend_data(tcp_t *tcp, mblk_t *mp, ire_t *ire, ill_t *ill, const int mss, * depending on the availability of transmit resources at * the media layer. */ - IP_DLS_ILL_TX(ill, ipha, mp); + IP_DLS_ILL_TX(ill, ipha, mp, ipst); } else { ill_t *out_ill = (ill_t *)ire->ire_stq->q_ptr; DTRACE_PROBE4(ip4__physical__out__start, ill_t *, NULL, ill_t *, out_ill, ipha_t *, ipha, mblk_t *, mp); - FW_HOOKS(ip4_physical_out_event, ipv4firewall_physical_out, - NULL, out_ill, ipha, mp, mp); + FW_HOOKS(ipst->ips_ip4_physical_out_event, + ipst->ips_ipv4firewall_physical_out, + NULL, out_ill, ipha, mp, mp, ipst); DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); if (mp != NULL) putnext(ire->ire_stq, mp); @@ -20785,6 +20960,7 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, int num_lso_seg = 1; uint_t lso_usable; boolean_t do_lso_send = B_FALSE; + tcp_stack_t *tcps = tcp->tcp_tcps; /* * Check LSO capability before any further work. And the similar check @@ -21008,16 +21184,16 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, *snxt += len; *tail_unsent = (*xmit_tail)->b_wptr - mp1->b_wptr; BUMP_LOCAL(tcp->tcp_obsegs); - BUMP_MIB(&tcp_mib, tcpOutDataSegs); - UPDATE_MIB(&tcp_mib, tcpOutDataBytes, len); + BUMP_MIB(&tcps->tcps_mib, tcpOutDataSegs); + UPDATE_MIB(&tcps->tcps_mib, tcpOutDataBytes, len); TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_SEND_PKT); tcp_send_data(tcp, q, mp); continue; } *snxt += len; /* Adjust later if we don't send all of len */ - BUMP_MIB(&tcp_mib, tcpOutDataSegs); - UPDATE_MIB(&tcp_mib, tcpOutDataBytes, len); + BUMP_MIB(&tcps->tcps_mib, tcpOutDataSegs); + UPDATE_MIB(&tcps->tcps_mib, tcpOutDataBytes, len); if (*tail_unsent) { /* Are the bytes above us in flight? */ @@ -21097,7 +21273,7 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, must_alloc:; mp1 = allocb(tcp->tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + - tcp_wroff_xtra + ire_fp_mp_len, BPRI_MED); + tcps->tcps_wroff_xtra + ire_fp_mp_len, BPRI_MED); if (mp1 == NULL) { freemsg(mp); if (ire != NULL) @@ -21108,7 +21284,8 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, mp = mp1; /* Leave room for Link Level header */ len = tcp_hdr_len; - rptr = &mp->b_rptr[tcp_wroff_xtra + ire_fp_mp_len]; + rptr = + &mp->b_rptr[tcps->tcps_wroff_xtra + ire_fp_mp_len]; mp->b_wptr = &rptr[len]; } @@ -21197,7 +21374,8 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, *usable -= spill; *snxt += spill; tcp->tcp_last_sent_len += spill; - UPDATE_MIB(&tcp_mib, tcpOutDataBytes, spill); + UPDATE_MIB(&tcps->tcps_mib, + tcpOutDataBytes, spill); /* * Adjust the checksum */ @@ -21233,8 +21411,8 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, num_lso_seg); tcp->tcp_obsegs += num_lso_seg; - TCP_STAT(tcp_lso_times); - TCP_STAT_UPDATE(tcp_lso_pkt_out, num_lso_seg); + TCP_STAT(tcps, tcp_lso_times); + TCP_STAT_UPDATE(tcps, tcp_lso_pkt_out, num_lso_seg); } else { tcp_send_data(tcp, q, mp); BUMP_LOCAL(tcp->tcp_obsegs); @@ -21278,6 +21456,7 @@ static void tcp_mdt_update(tcp_t *tcp, ill_mdt_capab_t *mdt_capab, boolean_t first) { boolean_t prev_state; + tcp_stack_t *tcps = tcp->tcp_tcps; /* * IP is telling us to abort MDT on this connection? 
We know @@ -21292,7 +21471,7 @@ tcp_mdt_update(tcp_t *tcp, ill_mdt_capab_t *mdt_capab, boolean_t first) prev_state = tcp->tcp_mdt; tcp->tcp_mdt = (mdt_capab->ill_mdt_on != 0); if (!tcp->tcp_mdt && !first) { - TCP_STAT(tcp_mdt_conn_halted3); + TCP_STAT(tcps, tcp_mdt_conn_halted3); ip1dbg(("tcp_mdt_update: disabling MDT for connp %p\n", (void *)tcp->tcp_connp)); } @@ -21335,18 +21514,18 @@ tcp_mdt_update(tcp_t *tcp, ill_mdt_capab_t *mdt_capab, boolean_t first) /* a zero means driver wants default value */ tcp->tcp_mdt_max_pld = MIN(mdt_capab->ill_mdt_max_pld, - tcp_mdt_max_pbufs); + tcps->tcps_mdt_max_pbufs); if (tcp->tcp_mdt_max_pld == 0) - tcp->tcp_mdt_max_pld = tcp_mdt_max_pbufs; + tcp->tcp_mdt_max_pld = tcps->tcps_mdt_max_pbufs; /* ensure 32-bit alignment */ - tcp->tcp_mdt_hdr_head = roundup(MAX(tcp_mdt_hdr_head_min, + tcp->tcp_mdt_hdr_head = roundup(MAX(tcps->tcps_mdt_hdr_head_min, mdt_capab->ill_mdt_hdr_head), 4); - tcp->tcp_mdt_hdr_tail = roundup(MAX(tcp_mdt_hdr_tail_min, + tcp->tcp_mdt_hdr_tail = roundup(MAX(tcps->tcps_mdt_hdr_tail_min, mdt_capab->ill_mdt_hdr_tail), 4); if (!first && !prev_state) { - TCP_STAT(tcp_mdt_conn_resumed2); + TCP_STAT(tcps, tcp_mdt_conn_resumed2); ip1dbg(("tcp_mdt_update: reenabling MDT for connp %p\n", (void *)tcp->tcp_connp)); } @@ -21385,6 +21564,8 @@ tcp_lso_info_mp(mblk_t *mp) static void tcp_lso_update(tcp_t *tcp, ill_lso_capab_t *lso_capab) { + tcp_stack_t *tcps = tcp->tcp_tcps; + /* * IP is telling us to abort LSO on this connection? We know * this because the capability is only turned off when IP @@ -21396,7 +21577,7 @@ tcp_lso_update(tcp_t *tcp, ill_lso_capab_t *lso_capab) * will indicate that the feature is to be turned on. */ tcp->tcp_lso = (lso_capab->ill_lso_on != 0); - TCP_STAT(tcp_lso_enabled); + TCP_STAT(tcps, tcp_lso_enabled); /* * We currently only support LSO on simple TCP/IPv4, @@ -21408,7 +21589,7 @@ tcp_lso_update(tcp_t *tcp, ill_lso_capab_t *lso_capab) tcp->tcp_ip_hdr_len != IP_SIMPLE_HDR_LENGTH) || (tcp->tcp_ipversion == IPV6_VERSION)) { tcp->tcp_lso = B_FALSE; - TCP_STAT(tcp_lso_disabled); + TCP_STAT(tcps, tcp_lso_disabled); } else { tcp->tcp_lso_max = MIN(TCP_MAX_LSO_LENGTH, lso_capab->ill_lso_max); @@ -21419,6 +21600,8 @@ static void tcp_ire_ill_check(tcp_t *tcp, ire_t *ire, ill_t *ill, boolean_t check_lso_mdt) { conn_t *connp = tcp->tcp_connp; + tcp_stack_t *tcps = tcp->tcp_tcps; + ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; ASSERT(ire != NULL); @@ -21429,13 +21612,13 @@ tcp_ire_ill_check(tcp_t *tcp, ire_t *ire, ill_t *ill, boolean_t check_lso_mdt) * are only best-effort checks, and we do more thorough ones prior * to calling tcp_send()/tcp_multisend(). 
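The MDT sizing above takes the larger of the stack-wide minimum and the driver's requested header slack, then rounds up to a 32-bit boundary, as in roundup(MAX(tcps_mdt_hdr_head_min, ill_mdt_hdr_head), 4). A tiny sketch of that idiom:

    #include <stdio.h>

    #define MAX(a, b)    ((a) > (b) ? (a) : (b))
    #define ROUNDUP4(x)    (((x) + 3) & ~3u)

    static unsigned
    mdt_hdr_slack(unsigned stack_min, unsigned driver_req)
    {
        return (ROUNDUP4(MAX(stack_min, driver_req)));
    }

    int
    main(void)
    {
        /* driver silent -> stack minimum; 34 requested -> rounded to 36 */
        printf("%u %u\n", mdt_hdr_slack(32, 0), mdt_hdr_slack(32, 34));
        return (0);
    }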
*/ - if ((ip_lso_outbound || ip_multidata_outbound) && check_lso_mdt && - !(ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) && + if ((ipst->ips_ip_lso_outbound || ipst->ips_ip_multidata_outbound) && + check_lso_mdt && !(ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) && ill != NULL && !CONN_IPSEC_OUT_ENCAPSULATED(connp) && !(ire->ire_flags & RTF_MULTIRT) && - !IPP_ENABLED(IPP_LOCAL_OUT) && + !IPP_ENABLED(IPP_LOCAL_OUT, ipst) && CONN_IS_LSO_MD_FASTPATH(connp)) { - if (ip_lso_outbound && ILL_LSO_CAPABLE(ill)) { + if (ipst->ips_ip_lso_outbound && ILL_LSO_CAPABLE(ill)) { /* Cache the result */ connp->conn_lso_ok = B_TRUE; @@ -21447,7 +21630,8 @@ tcp_ire_ill_check(tcp_t *tcp, ire_t *ire, ill_t *ill, boolean_t check_lso_mdt) ill->ill_name)); } tcp_lso_update(tcp, ill->ill_lso_capab); - } else if (ip_multidata_outbound && ILL_MDT_CAPABLE(ill)) { + } else if (ipst->ips_ip_multidata_outbound && + ILL_MDT_CAPABLE(ill)) { /* Cache the result */ connp->conn_mdt_ok = B_TRUE; @@ -21720,6 +21904,7 @@ tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2) tcp_t *tcp = connp->conn_tcp; queue_t *q = tcp->tcp_wq; struct iocblk *iocp; + tcp_stack_t *tcps = tcp->tcp_tcps; ASSERT(DB_TYPE(mp) == M_IOCTL); /* @@ -21738,7 +21923,7 @@ tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2) switch (iocp->ioc_cmd) { case TCP_IOC_DEFAULT_Q: /* Wants to be the default wq. */ - if (secpolicy_net_config(iocp->ioc_cr, B_FALSE) != 0) { + if (secpolicy_ip_config(iocp->ioc_cr, B_FALSE) != 0) { iocp->ioc_error = EPERM; iocp->ioc_count = 0; mp->b_datap->db_type = M_IOCACK; @@ -21782,7 +21967,7 @@ tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2) tcp_fuse_disable_pair(tcp, B_FALSE); } tcp->tcp_issocket = B_FALSE; - TCP_STAT(tcp_sock_fallback); + TCP_STAT(tcps, tcp_sock_fallback); DB_TYPE(mp) = M_IOCACK; iocp->ioc_error = 0; @@ -21975,7 +22160,9 @@ non_urgent_data: static void tcp_wsrv(queue_t *q) { - TCP_STAT(tcp_wsrv_called); + tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; + + TCP_STAT(tcps, tcp_wsrv_called); } /* Non overlapping byte exchanger */ @@ -22006,6 +22193,7 @@ tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, uint32_t ack, int ctl) int tcp_hdr_len; int tcp_ip_hdr_len; mblk_t *mp; + tcp_stack_t *tcps = tcp->tcp_tcps; /* * Save sum for use in source route later. @@ -22021,12 +22209,12 @@ tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, uint32_t ack, int ctl) "tcp_xmit_ctl: '%s', seq 0x%x, ack 0x%x, ctl 0x%x", str, seq, ack, ctl); } - mp = allocb(tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + tcp_wroff_xtra, + mp = allocb(tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + tcps->tcps_wroff_xtra, BPRI_MED); if (mp == NULL) { return; } - rptr = &mp->b_rptr[tcp_wroff_xtra]; + rptr = &mp->b_rptr[tcps->tcps_wroff_xtra]; mp->b_rptr = rptr; mp->b_wptr = &rptr[tcp_hdr_len]; bcopy(tcp->tcp_iphc, rptr, tcp_hdr_len); @@ -22043,8 +22231,8 @@ tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, uint32_t ack, int ctl) tcph = (tcph_t *)&rptr[tcp_ip_hdr_len]; tcph->th_flags[0] = (uint8_t)ctl; if (ctl & TH_RST) { - BUMP_MIB(&tcp_mib, tcpOutRsts); - BUMP_MIB(&tcp_mib, tcpOutControl); + BUMP_MIB(&tcps->tcps_mib, tcpOutRsts); + BUMP_MIB(&tcps->tcps_mib, tcpOutControl); /* * Don't send TSopt w/ TH_RST packets per RFC 1323. 
*/ @@ -22076,7 +22264,7 @@ tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, uint32_t ack, int ctl) tcph->th_win); tcp->tcp_rack = ack; tcp->tcp_rack_cnt = 0; - BUMP_MIB(&tcp_mib, tcpOutAck); + BUMP_MIB(&tcps->tcps_mib, tcpOutAck); } BUMP_LOCAL(tcp->tcp_obsegs); U32_TO_BE32(seq, tcph->th_seq); @@ -22095,7 +22283,7 @@ tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, uint32_t ack, int ctl) * to a segment. If it returns B_FALSE, TCP should not respond. */ static boolean_t -tcp_send_rst_chk(void) +tcp_send_rst_chk(tcp_stack_t *tcps) { clock_t now; @@ -22109,14 +22297,15 @@ tcp_send_rst_chk(void) * RSTs in normal cases but when under attack, the impact is * limited. */ - if (tcp_rst_sent_rate_enabled != 0) { + if (tcps->tcps_rst_sent_rate_enabled != 0) { now = lbolt; /* lbolt can wrap around. */ - if ((tcp_last_rst_intrvl > now) || - (TICK_TO_MSEC(now - tcp_last_rst_intrvl) > 1*SECONDS)) { - tcp_last_rst_intrvl = now; - tcp_rst_cnt = 1; - } else if (++tcp_rst_cnt > tcp_rst_sent_rate) { + if ((tcps->tcps_last_rst_intrvl > now) || + (TICK_TO_MSEC(now - tcps->tcps_last_rst_intrvl) > + 1*SECONDS)) { + tcps->tcps_last_rst_intrvl = now; + tcps->tcps_rst_cnt = 1; + } else if (++tcps->tcps_rst_cnt > tcps->tcps_rst_sent_rate) { return (B_FALSE); } } @@ -22191,7 +22380,8 @@ tcp_ip_advise_mblk(void *addr, int addr_len, ipic_t **ipic) */ static void tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, - uint32_t ack, int ctl, uint_t ip_hdr_len, zoneid_t zoneid) + uint32_t ack, int ctl, uint_t ip_hdr_len, zoneid_t zoneid, + tcp_stack_t *tcps) { ipha_t *ipha = NULL; ip6_t *ip6h = NULL; @@ -22205,13 +22395,31 @@ tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, in6_addr_t v6addr; int addr_len; void *addr; - queue_t *q = tcp_g_q; - tcp_t *tcp = Q_TO_TCP(q); + queue_t *q = tcps->tcps_g_q; + tcp_t *tcp; cred_t *cr; mblk_t *nmp; + ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; - if (!tcp_send_rst_chk()) { - tcp_rst_unsent++; + if (tcps->tcps_g_q == NULL) { + /* + * For non-zero stackids the default queue isn't created + * until the first open, thus there can be a need to send + * a reset before then. But we can't do that, hence we just + * drop the packet. Later during boot, when the default queue + * has been setup, a retransmitted packet from the peer + * will result in a reset. 
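tcp_send_rst_chk() above rate-limits outgoing RSTs: within any one-second window at most tcps_rst_sent_rate segments may go out, and anything beyond that is counted in tcps_rst_unsent instead of being sent. A user-space sketch, assuming a millisecond clock where the kernel uses lbolt ticks, including the wraparound guard:

    #include <stdio.h>

    static long long last_intrvl_ms;    /* like tcps_last_rst_intrvl */
    static unsigned rst_cnt;            /* like tcps_rst_cnt */

    static int
    rst_allowed(long long now_ms, unsigned rate_per_sec)
    {
        if (last_intrvl_ms > now_ms ||        /* clock wrapped */
            now_ms - last_intrvl_ms > 1000) {    /* new 1-second window */
            last_intrvl_ms = now_ms;
            rst_cnt = 1;
        } else if (++rst_cnt > rate_per_sec) {
            return (0);        /* suppressed: tcps_rst_unsent++ */
        }
        return (1);
    }

    int
    main(void)
    {
        int i, sent = 0;

        for (i = 0; i < 100; i++)
            sent += rst_allowed(5000, 40);    /* all in one window */
        printf("sent %d of 100 RSTs\n", sent);
        return (0);
    }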
+ */ + ASSERT(tcps->tcps_netstack->netstack_stackid != + GLOBAL_NETSTACKID); + freemsg(mp); + return; + } + + tcp = Q_TO_TCP(q); + + if (!tcp_send_rst_chk(tcps)) { + tcps->tcps_rst_unsent++; freemsg(mp); return; } @@ -22225,7 +22433,7 @@ tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, mctl_present = B_FALSE; } - if (str && q && tcp_dbg) { + if (str && q && tcps->tcps_dbg) { (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, "tcp_xmit_early_reset: '%s', seq 0x%x, ack 0x%x, " "flags 0x%x", @@ -22269,7 +22477,7 @@ tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, if (ipha->ipha_src == 0 || ipha->ipha_src == INADDR_BROADCAST || CLASSD(ipha->ipha_src)) { freemsg(ipsec_mp); - BUMP_MIB(&ip_mib, ipIfStatsInDiscards); + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards); return; } } else { @@ -22278,7 +22486,7 @@ tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) || IN6_IS_ADDR_MULTICAST(&ip6h->ip6_src)) { freemsg(ipsec_mp); - BUMP_MIB(&ip6_mib, ipIfStatsInDiscards); + BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsInDiscards); return; } @@ -22309,7 +22517,7 @@ tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, ipha->ipha_src = ipha->ipha_dst; ipha->ipha_dst = v4addr; ipha->ipha_ident = 0; - ipha->ipha_ttl = (uchar_t)tcp_ipv4_ttl; + ipha->ipha_ttl = (uchar_t)tcps->tcps_ipv4_ttl; addr_len = IP_ADDR_LEN; addr = &v4addr; } else { @@ -22319,7 +22527,7 @@ tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, v6addr = ip6h->ip6_src; ip6h->ip6_src = ip6h->ip6_dst; ip6h->ip6_dst = v6addr; - ip6h->ip6_hops = (uchar_t)tcp_ipv6_hoplimit; + ip6h->ip6_hops = (uchar_t)tcps->tcps_ipv6_hoplimit; addr_len = IPV6_ADDR_LEN; addr = &v6addr; } @@ -22330,8 +22538,8 @@ tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, U16_TO_BE16(sizeof (tcph_t), tcph->th_sum); tcph->th_flags[0] = (uint8_t)ctl; if (ctl & TH_RST) { - BUMP_MIB(&tcp_mib, tcpOutRsts); - BUMP_MIB(&tcp_mib, tcpOutControl); + BUMP_MIB(&tcps->tcps_mib, tcpOutRsts); + BUMP_MIB(&tcps->tcps_mib, tcpOutControl); } /* IP trusts us to set up labels when required. */ @@ -22341,10 +22549,12 @@ tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) err = tsol_check_label(cr, &mp, &adjust, - tcp->tcp_connp->conn_mac_exempt); + tcp->tcp_connp->conn_mac_exempt, + tcps->tcps_netstack->netstack_ip); else err = tsol_check_label_v6(cr, &mp, &adjust, - tcp->tcp_connp->conn_mac_exempt); + tcp->tcp_connp->conn_mac_exempt, + tcps->tcps_netstack->netstack_ip); if (mctl_present) ipsec_mp->b_cont = mp; else @@ -22374,7 +22584,7 @@ tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, zoneid = GLOBAL_ZONEID; /* Add the zoneid so ip_output routes it properly */ - if ((nmp = ip_prepend_zoneid(ipsec_mp, zoneid)) == NULL) { + if ((nmp = ip_prepend_zoneid(ipsec_mp, zoneid, ipst)) == NULL) { freemsg(ipsec_mp); return; } @@ -22390,7 +22600,7 @@ tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, * reused by tcp_xmit_listener_reset, so it already contains * the right credentials and we don't need to call mblk_setcred. * Also the conn's cred is not right since it is associated - * with tcp_g_q. + * with tcps_g_q. 
*/ CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, ipsec_mp); @@ -22424,6 +22634,7 @@ tcp_xmit_end(tcp_t *tcp) { ipic_t *ipic; mblk_t *mp; + tcp_stack_t *tcps = tcp->tcp_tcps; if (tcp->tcp_state < TCPS_SYN_RCVD || tcp->tcp_state > TCPS_CLOSE_WAIT) { @@ -22477,7 +22688,8 @@ tcp_xmit_end(tcp_t *tcp) * If TCP does not get enough samples of RTT or tcp_rtt_updates * is 0, don't update the cache. */ - if (tcp_rtt_updates == 0 || tcp->tcp_rtt_update < tcp_rtt_updates) + if (tcps->tcps_rtt_updates == 0 || + tcp->tcp_rtt_update < tcps->tcps_rtt_updates) return (0); /* @@ -22520,7 +22732,8 @@ tcp_xmit_end(tcp_t *tcp) * RST. */ void -tcp_xmit_listeners_reset(mblk_t *mp, uint_t ip_hdr_len, zoneid_t zoneid) +tcp_xmit_listeners_reset(mblk_t *mp, uint_t ip_hdr_len, zoneid_t zoneid, + tcp_stack_t *tcps) { uchar_t *rptr; uint32_t seg_len; @@ -22534,8 +22747,9 @@ tcp_xmit_listeners_reset(mblk_t *mp, uint_t ip_hdr_len, zoneid_t zoneid) boolean_t mctl_present = B_FALSE; boolean_t check = B_TRUE; boolean_t policy_present; + ipsec_stack_t *ipss = tcps->tcps_netstack->netstack_ipsec; - TCP_STAT(tcp_no_listener); + TCP_STAT(tcps, tcp_no_listener); ipsec_mp = mp; @@ -22558,11 +22772,11 @@ tcp_xmit_listeners_reset(mblk_t *mp, uint_t ip_hdr_len, zoneid_t zoneid) } if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { - policy_present = ipsec_inbound_v4_policy_present; + policy_present = ipss->ipsec_inbound_v4_policy_present; ipha = (ipha_t *)mp->b_rptr; ip6h = NULL; } else { - policy_present = ipsec_inbound_v6_policy_present; + policy_present = ipss->ipsec_inbound_v6_policy_present; ipha = NULL; ip6h = (ip6_t *)mp->b_rptr; } @@ -22573,7 +22787,8 @@ tcp_xmit_listeners_reset(mblk_t *mp, uint_t ip_hdr_len, zoneid_t zoneid) * nobody's home. */ ipsec_mp = ipsec_check_global_policy( - ipsec_mp, (conn_t *)NULL, ipha, ip6h, mctl_present); + ipsec_mp, (conn_t *)NULL, ipha, ip6h, mctl_present, + tcps->tcps_netstack); if (ipsec_mp == NULL) return; } @@ -22599,7 +22814,7 @@ tcp_xmit_listeners_reset(mblk_t *mp, uint_t ip_hdr_len, zoneid_t zoneid) freemsg(ipsec_mp); } else if (flags & TH_ACK) { tcp_xmit_early_reset("no tcp, reset", - ipsec_mp, seg_ack, 0, TH_RST, ip_hdr_len, zoneid); + ipsec_mp, seg_ack, 0, TH_RST, ip_hdr_len, zoneid, tcps); } else { if (flags & TH_SYN) { seg_len++; @@ -22612,13 +22827,13 @@ tcp_xmit_listeners_reset(mblk_t *mp, uint_t ip_hdr_len, zoneid_t zoneid) * floor. 
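tcp_xmit_listeners_reset() above follows the RFC 793 reset rules for segments that arrive with no listener: never answer a RST, answer an ACK-bearing segment with a bare RST whose sequence number is the incoming ACK, answer a SYN with RST|ACK acknowledging seg_seq plus one, and drop everything else on the floor. A compact sketch of that decision, with the usual TCP flag bits spelled out:

    #include <stdio.h>
    #include <stdint.h>

    #define TH_SYN    0x02
    #define TH_RST    0x04
    #define TH_ACK    0x10

    static void
    reset_for(uint32_t seg_seq, uint32_t seg_ack, uint32_t seg_len, int flags)
    {
        if (flags & TH_RST) {
            /* never respond to a RST */
        } else if (flags & TH_ACK) {
            printf("RST: seq=%u\n", (unsigned)seg_ack);
        } else if (flags & TH_SYN) {
            /* a SYN consumes one sequence octet */
            printf("RST|ACK: seq=0 ack=%u\n",
                (unsigned)(seg_seq + seg_len + 1));
        } else {
            /* no ACK, no SYN: drop it on the floor */
        }
    }

    int
    main(void)
    {
        reset_for(1000, 0, 0, TH_SYN);    /* SYN to a dead port */
        reset_for(0, 4242, 0, TH_ACK);    /* stray ACK */
        return (0);
    }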
*/ freemsg(ipsec_mp); - tcp_rst_unsent++; + tcps->tcps_rst_unsent++; return; } tcp_xmit_early_reset("no tcp, reset/ack", ipsec_mp, 0, seg_seq + seg_len, - TH_RST | TH_ACK, ip_hdr_len, zoneid); + TH_RST | TH_ACK, ip_hdr_len, zoneid, tcps); } } @@ -22650,10 +22865,11 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, tcph_t *tcph; int32_t num_sack_blk = 0; int32_t sack_opt_len = 0; + tcp_stack_t *tcps = tcp->tcp_tcps; /* Allocate for our maximum TCP header + link-level */ - mp1 = allocb(tcp->tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + tcp_wroff_xtra, - BPRI_MED); + mp1 = allocb(tcp->tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + + tcps->tcps_wroff_xtra, BPRI_MED); if (!mp1) return (NULL); data_length = 0; @@ -22722,7 +22938,7 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, tcp->tcp_tcph->th_win); - rptr = mp1->b_rptr + tcp_wroff_xtra; + rptr = mp1->b_rptr + tcps->tcps_wroff_xtra; mp1->b_rptr = rptr; mp1->b_wptr = rptr + tcp->tcp_hdr_len + sack_opt_len; bcopy(tcp->tcp_iphc, rptr, tcp->tcp_hdr_len); @@ -22863,7 +23079,7 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, * the peer's calculated SMSS may be smaller * than what it can be. This should be OK. */ - if (tcp_use_smss_as_mss_opt) { + if (tcps->tcps_use_smss_as_mss_opt) { u1 = tcp->tcp_mss; U16_TO_BE16(u1, wptr); } @@ -22916,13 +23132,13 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, u1 += tcp->tcp_sum; u1 = (u1 >> 16) + (u1 & 0xFFFF); U16_TO_BE16(u1, tcph->th_sum); - BUMP_MIB(&tcp_mib, tcpOutControl); + BUMP_MIB(&tcps->tcps_mib, tcpOutControl); } if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && (seq + data_length) == tcp->tcp_fss) { if (!tcp->tcp_fin_acked) { flags |= TH_FIN; - BUMP_MIB(&tcp_mib, tcpOutControl); + BUMP_MIB(&tcps->tcps_mib, tcpOutControl); } if (!tcp->tcp_fin_sent) { tcp->tcp_fin_sent = B_TRUE; @@ -22950,7 +23166,7 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, if ((tcp->tcp_valid_bits & TCP_URG_VALID) && u1 != 0 && u1 < (uint32_t)(64 * 1024)) { flags |= TH_URG; - BUMP_MIB(&tcp_mib, tcpOutUrg); + BUMP_MIB(&tcps->tcps_mib, tcpOutUrg); U32_TO_ABE16(u1, tcph->th_urp); } } @@ -23025,8 +23241,9 @@ tcp_push_timer(void *arg) { conn_t *connp = (conn_t *)arg; tcp_t *tcp = connp->conn_tcp; + tcp_stack_t *tcps = tcp->tcp_tcps; - TCP_DBGSTAT(tcp_push_timer_cnt); + TCP_DBGSTAT(tcps, tcp_push_timer_cnt); ASSERT(tcp->tcp_listener == NULL); @@ -23051,8 +23268,9 @@ tcp_ack_timer(void *arg) conn_t *connp = (conn_t *)arg; tcp_t *tcp = connp->conn_tcp; mblk_t *mp; + tcp_stack_t *tcps = tcp->tcp_tcps; - TCP_DBGSTAT(tcp_ack_timer_cnt); + TCP_DBGSTAT(tcps, tcp_ack_timer_cnt); tcp->tcp_ack_tid = 0; @@ -23086,8 +23304,8 @@ tcp_ack_timer(void *arg) if (mp != NULL) { TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_SEND_PKT); BUMP_LOCAL(tcp->tcp_obsegs); - BUMP_MIB(&tcp_mib, tcpOutAck); - BUMP_MIB(&tcp_mib, tcpOutAckDelayed); + BUMP_MIB(&tcps->tcps_mib, tcpOutAck); + BUMP_MIB(&tcps->tcps_mib, tcpOutAckDelayed); tcp_send_data(tcp, tcp->tcp_wq, mp); } } @@ -23098,6 +23316,7 @@ static mblk_t * tcp_ack_mp(tcp_t *tcp) { uint32_t seq_no; + tcp_stack_t *tcps = tcp->tcp_tcps; /* * There are a few cases to be considered while setting the sequence no. 
@@ -23155,7 +23374,7 @@ tcp_ack_mp(tcp_t *tcp) tcp_hdr_len = tcp->tcp_hdr_len; tcp_tcp_hdr_len = tcp->tcp_tcp_hdr_len; } - mp1 = allocb(tcp_hdr_len + tcp_wroff_xtra, BPRI_MED); + mp1 = allocb(tcp_hdr_len + tcps->tcps_wroff_xtra, BPRI_MED); if (!mp1) return (NULL); @@ -23163,7 +23382,7 @@ tcp_ack_mp(tcp_t *tcp) U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, tcp->tcp_tcph->th_win); /* copy in prototype TCP + IP header */ - rptr = mp1->b_rptr + tcp_wroff_xtra; + rptr = mp1->b_rptr + tcps->tcps_wroff_xtra; mp1->b_rptr = rptr; mp1->b_wptr = rptr + tcp_hdr_len; bcopy(tcp->tcp_iphc, rptr, tcp->tcp_hdr_len); @@ -23250,16 +23469,18 @@ tcp_ack_mp(tcp_t *tcp) */ /* ARGSUSED */ static tcp_t * -tcp_alloc_temp_tcp(in_port_t port) +tcp_alloc_temp_tcp(in_port_t port, tcp_stack_t *tcps) { conn_t *connp; tcp_t *tcp; - connp = ipcl_conn_create(IPCL_TCPCONN, KM_SLEEP); + connp = ipcl_conn_create(IPCL_TCPCONN, KM_SLEEP, tcps->tcps_netstack); if (connp == NULL) return (NULL); tcp = connp->conn_tcp; + tcp->tcp_tcps = tcps; + TCPS_REFHOLD(tcps); /* * Only initialize the necessary info in those structures. Note @@ -23291,6 +23512,8 @@ tcp_alloc_temp_tcp(in_port_t port) * * Return: * B_TRUE if the deletion is successful, B_FALSE otherwise. + * + * Assumes that nca is only for zoneid=0 */ boolean_t tcp_reserved_port_del(in_port_t lo_port, in_port_t hi_port) @@ -23299,19 +23522,25 @@ tcp_reserved_port_del(in_port_t lo_port, in_port_t hi_port) int size; tcp_t **temp_tcp_array; tcp_t *tcp; + tcp_stack_t *tcps; + + tcps = netstack_find_by_stackid(GLOBAL_NETSTACKID)->netstack_tcp; + ASSERT(tcps != NULL); - rw_enter(&tcp_reserved_port_lock, RW_WRITER); + rw_enter(&tcps->tcps_reserved_port_lock, RW_WRITER); /* First make sure that the port ranage is indeed reserved. */ - for (i = 0; i < tcp_reserved_port_array_size; i++) { - if (tcp_reserved_port[i].lo_port == lo_port) { - hi_port = tcp_reserved_port[i].hi_port; - temp_tcp_array = tcp_reserved_port[i].temp_tcp_array; + for (i = 0; i < tcps->tcps_reserved_port_array_size; i++) { + if (tcps->tcps_reserved_port[i].lo_port == lo_port) { + hi_port = tcps->tcps_reserved_port[i].hi_port; + temp_tcp_array = + tcps->tcps_reserved_port[i].temp_tcp_array; break; } } - if (i == tcp_reserved_port_array_size) { - rw_exit(&tcp_reserved_port_lock); + if (i == tcps->tcps_reserved_port_array_size) { + rw_exit(&tcps->tcps_reserved_port_lock); + netstack_rele(tcps->tcps_netstack); return (B_FALSE); } @@ -23319,11 +23548,13 @@ tcp_reserved_port_del(in_port_t lo_port, in_port_t hi_port) * Remove the range from the array. This simple loop is possible * because port ranges are inserted in ascending order. */ - for (j = i; j < tcp_reserved_port_array_size - 1; j++) { - tcp_reserved_port[j].lo_port = tcp_reserved_port[j+1].lo_port; - tcp_reserved_port[j].hi_port = tcp_reserved_port[j+1].hi_port; - tcp_reserved_port[j].temp_tcp_array = - tcp_reserved_port[j+1].temp_tcp_array; + for (j = i; j < tcps->tcps_reserved_port_array_size - 1; j++) { + tcps->tcps_reserved_port[j].lo_port = + tcps->tcps_reserved_port[j+1].lo_port; + tcps->tcps_reserved_port[j].hi_port = + tcps->tcps_reserved_port[j+1].hi_port; + tcps->tcps_reserved_port[j].temp_tcp_array = + tcps->tcps_reserved_port[j+1].temp_tcp_array; } /* Remove all the temporary tcp structures. 
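The deletion loop above works only because reserved ranges are kept in ascending order, so removing one is a plain left shift of the tail. The same idiom in stand-alone form:

    #include <stdio.h>

    struct rport {
        unsigned short lo_port;
        unsigned short hi_port;
    };

    static int
    rport_del(struct rport *arr, int *size, unsigned short lo)
    {
        int i, j;

        for (i = 0; i < *size; i++)
            if (arr[i].lo_port == lo)
                break;
        if (i == *size)
            return (0);        /* range was never reserved */
        for (j = i; j < *size - 1; j++)
            arr[j] = arr[j + 1];    /* shift the tail down */
        (*size)--;
        return (1);
    }

    int
    main(void)
    {
        struct rport a[] = { { 100, 199 }, { 300, 399 }, { 500, 599 } };
        int n = 3;

        (void) rport_del(a, &n, 300);
        printf("n=%d a[1]=[%d,%d]\n", n, a[1].lo_port, a[1].hi_port);
        return (0);
    }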
*/ @@ -23336,8 +23567,9 @@ tcp_reserved_port_del(in_port_t lo_port, in_port_t hi_port) size--; } kmem_free(temp_tcp_array, (hi_port - lo_port + 1) * sizeof (tcp_t *)); - tcp_reserved_port_array_size--; - rw_exit(&tcp_reserved_port_lock); + tcps->tcps_reserved_port_array_size--; + rw_exit(&tcps->tcps_reserved_port_lock); + netstack_rele(tcps->tcps_netstack); return (B_TRUE); } @@ -23346,13 +23578,13 @@ tcp_reserved_port_del(in_port_t lo_port, in_port_t hi_port) * first parameter is the list of tcp to be removed. The second parameter * is the number of tcps in the array. */ -#define TCP_TMP_TCP_REMOVE(tcp_array, num) \ +#define TCP_TMP_TCP_REMOVE(tcp_array, num, tcps) \ { \ while ((num) > 0) { \ tcp_t *tcp = (tcp_array)[(num) - 1]; \ tf_t *tbf; \ tcp_t *tcpnext; \ - tbf = &tcp_bind_fanout[TCP_BIND_HASH(tcp->tcp_lport)]; \ + tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(tcp->tcp_lport)]; \ mutex_enter(&tbf->tf_lock); \ tcpnext = tcp->tcp_bind_hash; \ if (tcpnext) { \ @@ -23384,6 +23616,8 @@ tcp_reserved_port_del(in_port_t lo_port, in_port_t hi_port) * * Return: * B_TRUE if the port reservation is successful, B_FALSE otherwise. + * + * Assumes that nca is only for zoneid=0 */ boolean_t tcp_reserved_port_add(int size, in_port_t *lo_port, in_port_t *hi_port) @@ -23399,15 +23633,21 @@ tcp_reserved_port_add(int size, in_port_t *lo_port, in_port_t *hi_port) boolean_t used; tcp_rport_t tmp_ports[TCP_RESERVED_PORTS_ARRAY_MAX_SIZE]; zoneid_t zoneid = GLOBAL_ZONEID; + tcp_stack_t *tcps; /* Sanity check. */ if (size <= 0 || size > TCP_RESERVED_PORTS_RANGE_MAX) { return (B_FALSE); } - rw_enter(&tcp_reserved_port_lock, RW_WRITER); - if (tcp_reserved_port_array_size == TCP_RESERVED_PORTS_ARRAY_MAX_SIZE) { - rw_exit(&tcp_reserved_port_lock); + tcps = netstack_find_by_stackid(GLOBAL_NETSTACKID)->netstack_tcp; + ASSERT(tcps != NULL); + + rw_enter(&tcps->tcps_reserved_port_lock, RW_WRITER); + if (tcps->tcps_reserved_port_array_size == + TCP_RESERVED_PORTS_ARRAY_MAX_SIZE) { + rw_exit(&tcps->tcps_reserved_port_lock); + netstack_rele(tcps->tcps_netstack); return (B_FALSE); } @@ -23417,22 +23657,25 @@ tcp_reserved_port_add(int size, in_port_t *lo_port, in_port_t *hi_port) */ *lo_port = TCP_SMALLEST_RESERVED_PORT; *hi_port = TCP_LARGEST_RESERVED_PORT; - for (i = 0; i < tcp_reserved_port_array_size; - *lo_port = tcp_reserved_port[i].hi_port + 1, i++) { - if (tcp_reserved_port[i].lo_port - *lo_port >= size) { - *hi_port = tcp_reserved_port[i].lo_port - 1; + for (i = 0; i < tcps->tcps_reserved_port_array_size; + *lo_port = tcps->tcps_reserved_port[i].hi_port + 1, i++) { + if (tcps->tcps_reserved_port[i].lo_port - *lo_port >= size) { + *hi_port = tcps->tcps_reserved_port[i].lo_port - 1; break; } } /* No available port range. 
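The reservation path above does a first-fit walk over the sorted ranges, advancing the candidate low port past each existing range until a gap of the requested size appears. A sketch of that search; the smallest/largest bounds here are illustrative, not the kernel's TCP_SMALLEST_RESERVED_PORT and TCP_LARGEST_RESERVED_PORT values:

    #include <stdio.h>

    struct rport {
        int lo_port;
        int hi_port;
    };

    static int
    find_gap(const struct rport *arr, int nranges, int size,
        int smallest, int largest, int *lo, int *hi)
    {
        int i;

        *lo = smallest;
        *hi = largest;
        for (i = 0; i < nranges; *lo = arr[i].hi_port + 1, i++) {
            if (arr[i].lo_port - *lo >= size) {
                *hi = arr[i].lo_port - 1;
                break;
            }
        }
        /* the gap after the last range (or the empty list) must fit too */
        if (i == nranges && *hi - *lo < size)
            return (0);
        return (1);
    }

    int
    main(void)
    {
        struct rport a[] = { { 10000, 10099 }, { 10150, 10999 } };
        int lo, hi;

        if (find_gap(a, 2, 50, 10000, 20000, &lo, &hi))
            printf("gap [%d, %d]\n", lo, hi);
        return (0);
    }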
*/ - if (i == tcp_reserved_port_array_size && *hi_port - *lo_port < size) { - rw_exit(&tcp_reserved_port_lock); + if (i == tcps->tcps_reserved_port_array_size && + *hi_port - *lo_port < size) { + rw_exit(&tcps->tcps_reserved_port_lock); + netstack_rele(tcps->tcps_netstack); return (B_FALSE); } temp_tcp_array = kmem_zalloc(size * sizeof (tcp_t *), KM_NOSLEEP); if (temp_tcp_array == NULL) { - rw_exit(&tcp_reserved_port_lock); + rw_exit(&tcps->tcps_reserved_port_lock); + netstack_rele(tcps->tcps_netstack); return (B_FALSE); } @@ -23442,7 +23685,7 @@ tcp_reserved_port_add(int size, in_port_t *lo_port, in_port_t *hi_port) cur_size++, port++) { used = B_FALSE; net_port = htons(port); - tbf = &tcp_bind_fanout[TCP_BIND_HASH(net_port)]; + tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(net_port)]; mutex_enter(&tbf->tf_lock); for (tcp = tbf->tf_tcp; tcp != NULL; tcp = tcp->tcp_bind_hash) { @@ -23454,7 +23697,8 @@ tcp_reserved_port_add(int size, in_port_t *lo_port, in_port_t *hi_port) * temporary tcps. */ mutex_exit(&tbf->tf_lock); - TCP_TMP_TCP_REMOVE(temp_tcp_array, cur_size); + TCP_TMP_TCP_REMOVE(temp_tcp_array, cur_size, + tcps); *lo_port = port + 1; cur_size = -1; used = B_TRUE; @@ -23462,18 +23706,21 @@ tcp_reserved_port_add(int size, in_port_t *lo_port, in_port_t *hi_port) } } if (!used) { - if ((tmp_tcp = tcp_alloc_temp_tcp(net_port)) == NULL) { + if ((tmp_tcp = tcp_alloc_temp_tcp(net_port, tcps)) == + NULL) { /* * Allocation failure. Just fail the request. * Need to remove all those temporary tcp * structures. */ mutex_exit(&tbf->tf_lock); - TCP_TMP_TCP_REMOVE(temp_tcp_array, cur_size); - rw_exit(&tcp_reserved_port_lock); + TCP_TMP_TCP_REMOVE(temp_tcp_array, cur_size, + tcps); + rw_exit(&tcps->tcps_reserved_port_lock); kmem_free(temp_tcp_array, (hi_port - lo_port + 1) * sizeof (tcp_t *)); + netstack_rele(tcps->tcps_netstack); return (B_FALSE); } temp_tcp_array[cur_size] = tmp_tcp; @@ -23489,9 +23736,10 @@ tcp_reserved_port_add(int size, in_port_t *lo_port, in_port_t *hi_port) * range is available. */ if (cur_size < size) { - TCP_TMP_TCP_REMOVE(temp_tcp_array, cur_size); - rw_exit(&tcp_reserved_port_lock); + TCP_TMP_TCP_REMOVE(temp_tcp_array, cur_size, tcps); + rw_exit(&tcps->tcps_reserved_port_lock); kmem_free(temp_tcp_array, size * sizeof (tcp_t *)); + netstack_rele(tcps->tcps_netstack); return (B_FALSE); } *hi_port = port - 1; @@ -23504,32 +23752,37 @@ tcp_reserved_port_add(int size, in_port_t *lo_port, in_port_t *hi_port) * that we should provide more reserved port ranges, this function * has to be modified to be more efficient. 
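 * An illustrative trace of the insertion below: adding [10243-10246] to an array holding [10240-10242] and [10250-10252] copies [10240-10242] to tmp_ports[0], slots the new range in at tmp_ports[1] once its lo_port sorts ahead of 10250, appends [10250-10252] at tmp_ports[2], and then bcopy()s the result back, keeping the array in ascending lo_port order.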
*/ - if (tcp_reserved_port_array_size == 0) { - tcp_reserved_port[0].lo_port = *lo_port; - tcp_reserved_port[0].hi_port = *hi_port; - tcp_reserved_port[0].temp_tcp_array = temp_tcp_array; + if (tcps->tcps_reserved_port_array_size == 0) { + tcps->tcps_reserved_port[0].lo_port = *lo_port; + tcps->tcps_reserved_port[0].hi_port = *hi_port; + tcps->tcps_reserved_port[0].temp_tcp_array = temp_tcp_array; } else { - for (i = 0, j = 0; i < tcp_reserved_port_array_size; i++, j++) { - if (*lo_port < tcp_reserved_port[i].lo_port && i == j) { + for (i = 0, j = 0; i < tcps->tcps_reserved_port_array_size; + i++, j++) { + if (*lo_port < tcps->tcps_reserved_port[i].lo_port && + i == j) { tmp_ports[j].lo_port = *lo_port; tmp_ports[j].hi_port = *hi_port; tmp_ports[j].temp_tcp_array = temp_tcp_array; j++; } - tmp_ports[j].lo_port = tcp_reserved_port[i].lo_port; - tmp_ports[j].hi_port = tcp_reserved_port[i].hi_port; + tmp_ports[j].lo_port = + tcps->tcps_reserved_port[i].lo_port; + tmp_ports[j].hi_port = + tcps->tcps_reserved_port[i].hi_port; tmp_ports[j].temp_tcp_array = - tcp_reserved_port[i].temp_tcp_array; + tcps->tcps_reserved_port[i].temp_tcp_array; } if (j == i) { tmp_ports[j].lo_port = *lo_port; tmp_ports[j].hi_port = *hi_port; tmp_ports[j].temp_tcp_array = temp_tcp_array; } - bcopy(tmp_ports, tcp_reserved_port, sizeof (tmp_ports)); + bcopy(tmp_ports, tcps->tcps_reserved_port, sizeof (tmp_ports)); } - tcp_reserved_port_array_size++; - rw_exit(&tcp_reserved_port_lock); + tcps->tcps_reserved_port_array_size++; + rw_exit(&tcps->tcps_reserved_port_lock); + netstack_rele(tcps->tcps_netstack); return (B_TRUE); } @@ -23543,19 +23796,19 @@ tcp_reserved_port_add(int size, in_port_t *lo_port, in_port_t *hi_port) * B_TRUE if the port is inside a reserved port range, B_FALSE otherwise.
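 * A minimal sketch of the membership test this implies (both bounds must hold for a hit): for (i = 0; i < n; i++) { if (port >= r[i].lo_port && port <= r[i].hi_port) return (B_TRUE); } return (B_FALSE); note that with a disjunction of the two comparisons instead, any port would match as soon as a single range existed.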
*/ boolean_t -tcp_reserved_port_check(in_port_t port) +tcp_reserved_port_check(in_port_t port, tcp_stack_t *tcps) { int i; - rw_enter(&tcp_reserved_port_lock, RW_READER); - for (i = 0; i < tcp_reserved_port_array_size; i++) { - if (port >= tcp_reserved_port[i].lo_port || - port <= tcp_reserved_port[i].hi_port) { - rw_exit(&tcp_reserved_port_lock); + rw_enter(&tcps->tcps_reserved_port_lock, RW_READER); + for (i = 0; i < tcps->tcps_reserved_port_array_size; i++) { + if (port >= tcps->tcps_reserved_port[i].lo_port && + port <= tcps->tcps_reserved_port[i].hi_port) { + rw_exit(&tcps->tcps_reserved_port_lock); return (B_TRUE); } } - rw_exit(&tcp_reserved_port_lock); + rw_exit(&tcps->tcps_reserved_port_lock); return (B_FALSE); } @@ -23568,17 +23821,19 @@ static int tcp_reserved_port_list(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) { int i; + tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; - rw_enter(&tcp_reserved_port_lock, RW_READER); - if (tcp_reserved_port_array_size > 0) + rw_enter(&tcps->tcps_reserved_port_lock, RW_READER); + if (tcps->tcps_reserved_port_array_size > 0) (void) mi_mpprintf(mp, "The following ports are reserved:"); else (void) mi_mpprintf(mp, "No port is reserved."); - for (i = 0; i < tcp_reserved_port_array_size; i++) { + for (i = 0; i < tcps->tcps_reserved_port_array_size; i++) { (void) mi_mpprintf(mp, "%d-%d", - tcp_reserved_port[i].lo_port, tcp_reserved_port[i].hi_port); + tcps->tcps_reserved_port[i].lo_port, + tcps->tcps_reserved_port[i].hi_port); } - rw_exit(&tcp_reserved_port_lock); + rw_exit(&tcps->tcps_reserved_port_lock); return (0); } @@ -23639,6 +23894,7 @@ tcp_bind_hash_remove(tcp_t *tcp) { tcp_t *tcpnext; kmutex_t *lockp; + tcp_stack_t *tcps = tcp->tcp_tcps; if (tcp->tcp_ptpbhn == NULL) return; /* * hash_remove's for this instance. */ ASSERT(tcp->tcp_lport != 0); - lockp = &tcp_bind_fanout[TCP_BIND_HASH(tcp->tcp_lport)].tf_lock; + lockp = &tcps->tcps_bind_fanout[TCP_BIND_HASH(tcp->tcp_lport)].tf_lock; ASSERT(lockp != NULL); mutex_enter(lockp); @@ -23670,12 +23926,12 @@ * Returns with a CONN_INC_REF tcp structure. Caller must do a CONN_DEC_REF.
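 * A sketch of the caller pattern that contract implies: tcp = tcp_acceptor_hash_lookup(id, tcps); if (tcp != NULL) { /* ... use the connection ... */ CONN_DEC_REF(tcp->tcp_connp); } so the reference taken under tf_lock is always paired with a release.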
*/ static tcp_t * -tcp_acceptor_hash_lookup(t_uscalar_t id) +tcp_acceptor_hash_lookup(t_uscalar_t id, tcp_stack_t *tcps) { tf_t *tf; tcp_t *tcp; - tf = &tcp_acceptor_fanout[TCP_ACCEPTOR_HASH(id)]; + tf = &tcps->tcps_acceptor_fanout[TCP_ACCEPTOR_HASH(id)]; mutex_enter(&tf->tf_lock); for (tcp = tf->tf_tcp; tcp != NULL; tcp = tcp->tcp_acceptor_hash) { @@ -23699,8 +23955,9 @@ tcp_acceptor_hash_insert(t_uscalar_t id, tcp_t *tcp) tf_t *tf; tcp_t **tcpp; tcp_t *tcpnext; + tcp_stack_t *tcps = tcp->tcp_tcps; - tf = &tcp_acceptor_fanout[TCP_ACCEPTOR_HASH(id)]; + tf = &tcps->tcps_acceptor_fanout[TCP_ACCEPTOR_HASH(id)]; if (tcp->tcp_ptpahn != NULL) tcp_acceptor_hash_remove(tcp); @@ -23756,13 +24013,12 @@ tcp_host_param_setvalue(queue_t *q, mblk_t *mp, char *value, caddr_t cp, int af) int error = 0; int retval; char *end; - tcp_hsp_t *hsp; tcp_hsp_t *hspprev; - ipaddr_t addr = 0; /* Address we're looking for */ in6_addr_t v6addr; /* Address we're looking for */ uint32_t hash; /* Hash of that address */ + tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; /* * If the following variables are still zero after parsing the input @@ -23777,7 +24033,7 @@ tcp_host_param_setvalue(queue_t *q, mblk_t *mp, char *value, caddr_t cp, int af) long timestamp = 0; /* Originate TCP TSTAMP option, 1 = yes */ boolean_t delete = B_FALSE; /* User asked to delete this HSP */ - rw_enter(&tcp_hsp_lock, RW_WRITER); + rw_enter(&tcps->tcps_hsp_lock, RW_WRITER); /* Parse and validate address */ if (af == AF_INET) { @@ -23884,14 +24140,14 @@ tcp_host_param_setvalue(queue_t *q, mblk_t *mp, char *value, caddr_t cp, int af) * Note that deletes don't return an error if the thing * we're trying to delete isn't there. */ - if (tcp_hsp_hash == NULL) + if (tcps->tcps_hsp_hash == NULL) goto done; - hsp = tcp_hsp_hash[hash]; + hsp = tcps->tcps_hsp_hash[hash]; if (hsp) { if (IN6_ARE_ADDR_EQUAL(&hsp->tcp_hsp_addr_v6, &v6addr)) { - tcp_hsp_hash[hash] = hsp->tcp_hsp_next; + tcps->tcps_hsp_hash[hash] = hsp->tcp_hsp_next; mi_free((char *)hsp); } else { hspprev = hsp; @@ -23913,10 +24169,10 @@ tcp_host_param_setvalue(queue_t *q, mblk_t *mp, char *value, caddr_t cp, int af) * so, allocate the hash table. */ - if (!tcp_hsp_hash) { - tcp_hsp_hash = (tcp_hsp_t **) + if (!tcps->tcps_hsp_hash) { + tcps->tcps_hsp_hash = (tcp_hsp_t **) mi_zalloc(sizeof (tcp_hsp_t *) * TCP_HSP_HASH_SIZE); - if (!tcp_hsp_hash) { + if (!tcps->tcps_hsp_hash) { error = EINVAL; goto done; } @@ -23924,7 +24180,7 @@ tcp_host_param_setvalue(queue_t *q, mblk_t *mp, char *value, caddr_t cp, int af) /* Get head of hash chain */ - hsp = tcp_hsp_hash[hash]; + hsp = tcps->tcps_hsp_hash[hash]; /* Try to find pre-existing hsp on hash chain */ /* Doesn't handle CIDR prefixes. 
*/ @@ -23945,8 +24201,8 @@ tcp_host_param_setvalue(queue_t *q, mblk_t *mp, char *value, caddr_t cp, int af) error = EINVAL; goto done; } - hsp->tcp_hsp_next = tcp_hsp_hash[hash]; - tcp_hsp_hash[hash] = hsp; + hsp->tcp_hsp_next = tcps->tcps_hsp_hash[hash]; + tcps->tcps_hsp_hash[hash] = hsp; } /* Set values that the user asked us to change */ @@ -23966,7 +24222,7 @@ tcp_host_param_setvalue(queue_t *q, mblk_t *mp, char *value, caddr_t cp, int af) } done: - rw_exit(&tcp_hsp_lock); + rw_exit(&tcps->tcps_hsp_lock); return (error); } @@ -23993,14 +24249,15 @@ tcp_host_param_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) tcp_hsp_t *hsp; int i; char addrbuf[INET6_ADDRSTRLEN], subnetbuf[INET6_ADDRSTRLEN]; + tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; - rw_enter(&tcp_hsp_lock, RW_READER); + rw_enter(&tcps->tcps_hsp_lock, RW_READER); (void) mi_mpprintf(mp, "Hash HSP " MI_COL_HDRPAD_STR "Address Subnet Mask Send Receive TStamp"); - if (tcp_hsp_hash) { + if (tcps->tcps_hsp_hash) { for (i = 0; i < TCP_HSP_HASH_SIZE; i++) { - hsp = tcp_hsp_hash[i]; + hsp = tcps->tcps_hsp_hash[i]; while (hsp) { if (hsp->tcp_hsp_vers == IPV4_VERSION) { (void) inet_ntop(AF_INET, @@ -24032,7 +24289,7 @@ tcp_host_param_report(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) } } } - rw_exit(&tcp_hsp_lock); + rw_exit(&tcps->tcps_hsp_lock); return (0); } @@ -24051,19 +24308,19 @@ static ipaddr_t netmasks[] = { * associated with the routes to determine the default sndspace and rcvspace. */ static tcp_hsp_t * -tcp_hsp_lookup(ipaddr_t addr) +tcp_hsp_lookup(ipaddr_t addr, tcp_stack_t *tcps) { tcp_hsp_t *hsp = NULL; /* Quick check without acquiring the lock. */ - if (tcp_hsp_hash == NULL) + if (tcps->tcps_hsp_hash == NULL) return (NULL); - rw_enter(&tcp_hsp_lock, RW_READER); + rw_enter(&tcps->tcps_hsp_lock, RW_READER); /* This routine finds the best-matching HSP for address addr. */ - if (tcp_hsp_hash) { + if (tcps->tcps_hsp_hash) { int i; ipaddr_t srchaddr; tcp_hsp_t *hsp_net; @@ -24075,7 +24332,7 @@ tcp_hsp_lookup(ipaddr_t addr) for (i = 1; i <= 3; i++) { /* Look for exact match on srchaddr */ - hsp = tcp_hsp_hash[TCP_HSP_HASH(srchaddr)]; + hsp = tcps->tcps_hsp_hash[TCP_HSP_HASH(srchaddr)]; while (hsp) { if (hsp->tcp_hsp_vers == IPV4_VERSION && hsp->tcp_hsp_addr == srchaddr) @@ -24128,7 +24385,7 @@ tcp_hsp_lookup(ipaddr_t addr) } } - rw_exit(&tcp_hsp_lock); + rw_exit(&tcps->tcps_hsp_lock); return (hsp); } @@ -24137,19 +24394,19 @@ tcp_hsp_lookup(ipaddr_t addr) * match lookup. */ static tcp_hsp_t * -tcp_hsp_lookup_ipv6(in6_addr_t *v6addr) +tcp_hsp_lookup_ipv6(in6_addr_t *v6addr, tcp_stack_t *tcps) { tcp_hsp_t *hsp = NULL; /* Quick check without acquiring the lock. */ - if (tcp_hsp_hash == NULL) + if (tcps->tcps_hsp_hash == NULL) return (NULL); - rw_enter(&tcp_hsp_lock, RW_READER); + rw_enter(&tcps->tcps_hsp_lock, RW_READER); /* This routine finds the best-matching HSP for address addr. 
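 * Roughly, the i = 1..3 loop below goes from most to least specific: pass 1 looks for an exact host entry, pass 2 masks the address down to its network and retries, and pass 3 retries with the subnet mask recorded on the matching network entry, so a host entry always beats a network or subnet entry.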
*/ - if (tcp_hsp_hash) { + if (tcps->tcps_hsp_hash) { int i; in6_addr_t v6srchaddr; tcp_hsp_t *hsp_net; @@ -24161,7 +24418,7 @@ tcp_hsp_lookup_ipv6(in6_addr_t *v6addr) for (i = 1; i <= 3; i++) { /* Look for exact match on srchaddr */ - hsp = tcp_hsp_hash[TCP_HSP_HASH( + hsp = tcps->tcps_hsp_hash[TCP_HSP_HASH( V4_PART_OF_V6(v6srchaddr))]; while (hsp) { if (hsp->tcp_hsp_vers == IPV6_VERSION && @@ -24224,7 +24481,7 @@ tcp_hsp_lookup_ipv6(in6_addr_t *v6addr) } } - rw_exit(&tcp_hsp_lock); + rw_exit(&tcps->tcps_hsp_lock); return (hsp); } @@ -24450,7 +24707,7 @@ tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp, int *do_disconnectp, #define PASSWD_SIZE 16 /* MUST be multiple of 4 */ static void -tcp_iss_key_init(uint8_t *phrase, int len) +tcp_iss_key_init(uint8_t *phrase, int len, tcp_stack_t *tcps) { struct { int32_t current_time; @@ -24496,11 +24753,11 @@ tcp_iss_key_init(uint8_t *phrase, int len) /* * Hash 'em all together. The MD5Final is called per-connection. */ - mutex_enter(&tcp_iss_key_lock); - MD5Init(&tcp_iss_key); - MD5Update(&tcp_iss_key, (uchar_t *)&tcp_iss_cookie, + mutex_enter(&tcps->tcps_iss_key_lock); + MD5Init(&tcps->tcps_iss_key); + MD5Update(&tcps->tcps_iss_key, (uchar_t *)&tcp_iss_cookie, sizeof (tcp_iss_cookie)); - mutex_exit(&tcp_iss_key_lock); + mutex_exit(&tcps->tcps_iss_key_lock); } /* @@ -24511,10 +24768,12 @@ static int tcp_1948_phrase_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr) { + tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; + /* * Basically, value contains a new pass phrase. Pass it along! */ - tcp_iss_key_init((uint8_t *)value, strlen(value)); + tcp_iss_key_init((uint8_t *)value, strlen(value), tcps); return (0); } @@ -24534,45 +24793,232 @@ tcp_iphc_constructor(void *buf, void *cdrarg, int kmflags) return (0); } +/* + * Make sure we wait until the default queue is set up, yet allow + * tcp_g_q_create() to open a TCP stream. + * We need to allow tcp_g_q_create() to do an open + * of tcp, hence we compare curthread. + * All others have to wait until the tcps_g_q has been + * set up.
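+ * Condensed, this is the usual one-time-initializer shape (a hedged sketch, not the exact code below): mutex_enter(&lock); if (obj == NULL && creator == NULL) { creator = curthread; mutex_exit(&lock); create(); mutex_enter(&lock); creator = NULL; cv_signal(&cv); } else if (creator != curthread) { while (obj == NULL) cv_wait(&cv, &lock); } mutex_exit(&lock);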
+ */ void -tcp_ddi_init(void) +tcp_g_q_setup(tcp_stack_t *tcps) { - int i; + mutex_enter(&tcps->tcps_g_q_lock); + if (tcps->tcps_g_q != NULL) { + mutex_exit(&tcps->tcps_g_q_lock); + return; + } + if (tcps->tcps_g_q_creator == NULL) { + /* This thread will set it up */ + tcps->tcps_g_q_creator = curthread; + mutex_exit(&tcps->tcps_g_q_lock); + tcp_g_q_create(tcps); + mutex_enter(&tcps->tcps_g_q_lock); + ASSERT(tcps->tcps_g_q_creator == curthread); + tcps->tcps_g_q_creator = NULL; + cv_signal(&tcps->tcps_g_q_cv); + ASSERT(tcps->tcps_g_q != NULL); + mutex_exit(&tcps->tcps_g_q_lock); + return; + } + /* Everybody but the creator has to wait */ + if (tcps->tcps_g_q_creator != curthread) { + while (tcps->tcps_g_q == NULL) + cv_wait(&tcps->tcps_g_q_cv, &tcps->tcps_g_q_lock); + } + mutex_exit(&tcps->tcps_g_q_lock); +} - /* Initialize locks */ - rw_init(&tcp_hsp_lock, NULL, RW_DEFAULT, NULL); - mutex_init(&tcp_g_q_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&tcp_random_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&tcp_iss_key_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&tcp_epriv_port_lock, NULL, MUTEX_DEFAULT, NULL); - rw_init(&tcp_reserved_port_lock, NULL, RW_DEFAULT, NULL); +major_t IP_MAJ; +#define IP "ip" - for (i = 0; i < A_CNT(tcp_bind_fanout); i++) { - mutex_init(&tcp_bind_fanout[i].tf_lock, NULL, - MUTEX_DEFAULT, NULL); +#define TCP6DEV "/devices/pseudo/tcp6@0:tcp6" + +/* + * Create a default tcp queue here instead of in strplumb + */ +void +tcp_g_q_create(tcp_stack_t *tcps) +{ + int error; + ldi_handle_t lh = NULL; + ldi_ident_t li = NULL; + int rval; + cred_t *cr; + +#ifdef NS_DEBUG + (void) printf("tcp_g_q_create()\n"); +#endif + + ASSERT(tcps->tcps_g_q_creator == curthread); + + error = ldi_ident_from_major(IP_MAJ, &li); + if (error) { +#ifdef DEBUG + printf("tcp_g_q_create: lyr ident get failed error %d\n", + error); +#endif + return; } - for (i = 0; i < A_CNT(tcp_acceptor_fanout); i++) { - mutex_init(&tcp_acceptor_fanout[i].tf_lock, NULL, - MUTEX_DEFAULT, NULL); + cr = zone_get_kcred(netstackid_to_zoneid( + tcps->tcps_netstack->netstack_stackid)); + ASSERT(cr != NULL); + /* + * We set the tcp default queue to IPv6 because IPv4 falls + * back to IPv6 when it can't find a client, but + * IPv6 does not fall back to IPv4. + */ + error = ldi_open_by_name(TCP6DEV, FREAD|FWRITE, cr, &lh, li); + if (error) { +#ifdef DEBUG + printf("tcp_g_q_create: open of TCP6DEV failed error %d\n", + error); +#endif + goto out; } - /* TCP's IPsec code calls the packet dropper. */ - ip_drop_register(&tcp_dropper, "TCP IPsec policy enforcement"); + /* + * This ioctl causes the tcp framework to cache a pointer to + * this stream, so we don't want to close the stream after + * this operation. + * Use the kernel credentials that are for the zone we're in. + */ + error = ldi_ioctl(lh, TCP_IOC_DEFAULT_Q, + (intptr_t)0, FKIOCTL, cr, &rval); + if (error) { +#ifdef DEBUG + printf("tcp_g_q_create: ioctl TCP_IOC_DEFAULT_Q failed " + "error %d\n", error); +#endif + goto out; + } + tcps->tcps_g_q_lh = lh; /* For tcp_g_q_close */ + lh = NULL; +out: + /* Close layered handles */ + if (li) + ldi_ident_release(li); + /* Keep cred around until _inactive needs it */ + tcps->tcps_g_q_cr = cr; +} - if (!tcp_g_nd) { - if (!tcp_param_register(tcp_param_arr, A_CNT(tcp_param_arr))) { - nd_free(&tcp_g_nd); - } +/* + * We keep tcp_g_q set until all other tcp_t's in the zone + * have gone away, and then when tcp_g_q_inactive() is called + * we clear it.
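+ * An illustrative reference timeline, hedged: opening the default queue takes a TCPS_REFHOLD in tcp_open; this routine drops it; once the last ordinary tcp_t is gone and tcps_refcnt hits zero, tcp_g_q_inactive() takes a compensating hold, tcp_g_q_close() runs ldi_close() on the cached handle, and the resulting tcp_close drops the final reference.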
+ */ +void +tcp_g_q_destroy(tcp_stack_t *tcps) +{ +#ifdef NS_DEBUG + (void) printf("tcp_g_q_destroy() for stack %d\n", + tcps->tcps_netstack->netstack_stackid); +#endif + + if (tcps->tcps_g_q == NULL) { + return; /* Nothing to cleanup */ + } + /* + * Drop reference corresponding to the default queue. + * This reference was added from tcp_open when the default queue + * was created, hence we compensate for this extra drop in + * tcp_g_q_close. If the refcnt drops to zero here it means + * the default queue was the last one to be open, in which + * case tcp_g_q_inactive will be + * called as a result of the refrele. + */ + TCPS_REFRELE(tcps); +} + +/* + * Called when last tcp_t drops reference count using TCPS_REFRELE. + * Run by tcp_g_q_inactive using a taskq. + */ +static void +tcp_g_q_close(void *arg) +{ + tcp_stack_t *tcps = arg; + int error; + ldi_handle_t lh = NULL; + ldi_ident_t li = NULL; + cred_t *cr; + +#ifdef NS_DEBUG + (void) printf("tcp_g_q_inactive() for stack %d refcnt %d\n", + tcps->tcps_netstack->netstack_stackid, + tcps->tcps_netstack->netstack_refcnt); +#endif + lh = tcps->tcps_g_q_lh; + if (lh == NULL) + return; /* Nothing to cleanup */ + + ASSERT(tcps->tcps_refcnt == 1); + ASSERT(tcps->tcps_g_q != NULL); + + error = ldi_ident_from_major(IP_MAJ, &li); + if (error) { +#ifdef DEBUG + printf("tcp_g_q_inactive: lyr ident get failed error %d\n", + error); +#endif + return; } + cr = tcps->tcps_g_q_cr; + tcps->tcps_g_q_cr = NULL; + ASSERT(cr != NULL); + /* - * Note: To really walk the device tree you need the devinfo - * pointer to your device which is only available after probe/attach. - * The following is safe only because it uses ddi_root_node() + * Make sure we can break the recursion when tcp_close decrements + * the reference count causing g_q_inactive to be called again. */ - tcp_max_optsize = optcom_max_optsize(tcp_opt_obj.odb_opt_des_arr, - tcp_opt_obj.odb_opt_arr_cnt); + tcps->tcps_g_q_lh = NULL; + + /* close the default queue */ + (void) ldi_close(lh, FREAD|FWRITE, cr); + /* + * At this point in time tcps and the rest of netstack_t might + * have been deleted. + */ + tcps = NULL; + + /* Close layered handles */ + ldi_ident_release(li); + crfree(cr); +} + +/* + * Called when last tcp_t drops reference count using TCPS_REFRELE. + * + * Have to ensure that the ldi routines are not used by an + * interrupt thread by using a taskq.
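+ * Concretely, ldi_close() of the cached handle goes through the STREAMS close path and may sleep; an interrupt thread must never block like that, so when servicing_interrupt() is true the close is dispatched to tcp_taskq (created in tcp_ddi_g_init) and runs in kernel thread context instead.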
+ */ +void +tcp_g_q_inactive(tcp_stack_t *tcps) +{ + if (tcps->tcps_g_q_lh == NULL) + return; /* Nothing to cleanup */ + + ASSERT(tcps->tcps_refcnt == 0); + TCPS_REFHOLD(tcps); /* Compensate for what g_q_destroy did */ + + if (servicing_interrupt()) { + (void) taskq_dispatch(tcp_taskq, tcp_g_q_close, + (void *) tcps, TQ_SLEEP); + } else { + tcp_g_q_close(tcps); + } +} + +/* + * Called by IP when IP is loaded into the kernel + */ +void +tcp_ddi_g_init(void) +{ + IP_MAJ = ddi_name_to_major(IP); tcp_timercache = kmem_cache_create("tcp_timercache", sizeof (tcp_timer_t) + sizeof (mblk_t), 0, @@ -24586,13 +25032,92 @@ tcp_ddi_init(void) TCP_MAX_COMBINED_HEADER_LENGTH, 0, tcp_iphc_constructor, NULL, NULL, NULL, NULL, 0); + mutex_init(&tcp_random_lock, NULL, MUTEX_DEFAULT, NULL); + + /* Initialize the random number generator */ + tcp_random_init(); + tcp_squeue_wput_proc = tcp_squeue_switch(tcp_squeue_wput); tcp_squeue_close_proc = tcp_squeue_switch(tcp_squeue_close); + /* A single callback independently of how many netstacks we have */ ip_squeue_init(tcp_squeue_add); - /* Initialize the random number generator */ - tcp_random_init(); + tcp_g_kstat = tcp_g_kstat_init(&tcp_g_statistics); + + tcp_taskq = taskq_create("tcp_taskq", 1, minclsyspri, 1, 1, + TASKQ_PREPOPULATE); + + /* + * We want to be informed each time a stack is created or + * destroyed in the kernel, so we can maintain the + * set of tcp_stack_t's. + */ + netstack_register(NS_TCP, tcp_stack_init, tcp_stack_shutdown, + tcp_stack_fini); +} + + +/* + * Initialize the TCP stack instance. + */ +static void * +tcp_stack_init(netstackid_t stackid, netstack_t *ns) +{ + tcp_stack_t *tcps; + tcpparam_t *pa; + int i; + + tcps = (tcp_stack_t *)kmem_zalloc(sizeof (*tcps), KM_SLEEP); + tcps->tcps_netstack = ns; + + /* Initialize locks */ + rw_init(&tcps->tcps_hsp_lock, NULL, RW_DEFAULT, NULL); + mutex_init(&tcps->tcps_g_q_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&tcps->tcps_g_q_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&tcps->tcps_iss_key_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&tcps->tcps_epriv_port_lock, NULL, MUTEX_DEFAULT, NULL); + rw_init(&tcps->tcps_reserved_port_lock, NULL, RW_DEFAULT, NULL); + + tcps->tcps_g_num_epriv_ports = TCP_NUM_EPRIV_PORTS; + tcps->tcps_g_epriv_ports[0] = 2049; + tcps->tcps_g_epriv_ports[1] = 4045; + tcps->tcps_min_anonpriv_port = 512; + + tcps->tcps_bind_fanout = kmem_zalloc(sizeof (tf_t) * + TCP_BIND_FANOUT_SIZE, KM_SLEEP); + tcps->tcps_acceptor_fanout = kmem_zalloc(sizeof (tf_t) * + TCP_FANOUT_SIZE, KM_SLEEP); + tcps->tcps_reserved_port = kmem_zalloc(sizeof (tcp_rport_t) * + TCP_RESERVED_PORTS_ARRAY_MAX_SIZE, KM_SLEEP); + + for (i = 0; i < TCP_BIND_FANOUT_SIZE; i++) { + mutex_init(&tcps->tcps_bind_fanout[i].tf_lock, NULL, + MUTEX_DEFAULT, NULL); + } + + for (i = 0; i < TCP_FANOUT_SIZE; i++) { + mutex_init(&tcps->tcps_acceptor_fanout[i].tf_lock, NULL, + MUTEX_DEFAULT, NULL); + } + + /* TCP's IPsec code calls the packet dropper. */ + ip_drop_register(&tcps->tcps_dropper, "TCP IPsec policy enforcement"); + + pa = (tcpparam_t *)kmem_alloc(sizeof (lcl_tcp_param_arr), KM_SLEEP); + tcps->tcps_params = pa; + bcopy(lcl_tcp_param_arr, tcps->tcps_params, sizeof (lcl_tcp_param_arr)); + + (void) tcp_param_register(&tcps->tcps_g_nd, tcps->tcps_params, + A_CNT(lcl_tcp_param_arr), tcps); + + /* + * Note: To really walk the device tree you need the devinfo + * pointer to your device which is only available after probe/attach. 
+ * The following is safe only because it uses ddi_root_node() + */ + tcp_max_optsize = optcom_max_optsize(tcp_opt_obj.odb_opt_des_arr, + tcp_opt_obj.odb_opt_arr_cnt); /* * Initialize RFC 1948 secret values. This will probably be reset once @@ -24605,48 +25130,104 @@ tcp_ddi_init(void) */ tcp_iss_key_init((uint8_t *)&tcp_g_t_info_ack, - sizeof (tcp_g_t_info_ack)); + sizeof (tcp_g_t_info_ack), tcps); - if ((tcp_kstat = kstat_create(TCP_MOD_NAME, 0, "tcpstat", - "net", KSTAT_TYPE_NAMED, - sizeof (tcp_statistics) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL)) != NULL) { - tcp_kstat->ks_data = &tcp_statistics; - kstat_install(tcp_kstat); - } + tcps->tcps_kstat = tcp_kstat2_init(stackid, &tcps->tcps_statistics); + tcps->tcps_mibkp = tcp_kstat_init(stackid, tcps); - tcp_kstat_init(); + return (tcps); } +/* + * Called when the IP module is about to be unloaded. + */ void -tcp_ddi_destroy(void) +tcp_ddi_g_destroy(void) +{ + tcp_g_kstat_fini(tcp_g_kstat); + tcp_g_kstat = NULL; + bzero(&tcp_g_statistics, sizeof (tcp_g_statistics)); + + mutex_destroy(&tcp_random_lock); + + kmem_cache_destroy(tcp_timercache); + kmem_cache_destroy(tcp_sack_info_cache); + kmem_cache_destroy(tcp_iphc_cache); + + netstack_unregister(NS_TCP); + taskq_destroy(tcp_taskq); +} + +/* + * Shut down the TCP stack instance. + */ +/* ARGSUSED */ +static void +tcp_stack_shutdown(netstackid_t stackid, void *arg) +{ + tcp_stack_t *tcps = (tcp_stack_t *)arg; + + tcp_g_q_destroy(tcps); +} + +/* + * Free the TCP stack instance. + */ +static void +tcp_stack_fini(netstackid_t stackid, void *arg) { + tcp_stack_t *tcps = (tcp_stack_t *)arg; int i; - nd_free(&tcp_g_nd); + nd_free(&tcps->tcps_g_nd); + kmem_free(tcps->tcps_params, sizeof (lcl_tcp_param_arr)); + tcps->tcps_params = NULL; + kmem_free(tcps->tcps_wroff_xtra_param, sizeof (tcpparam_t)); + tcps->tcps_wroff_xtra_param = NULL; + kmem_free(tcps->tcps_mdt_head_param, sizeof (tcpparam_t)); + tcps->tcps_mdt_head_param = NULL; + kmem_free(tcps->tcps_mdt_tail_param, sizeof (tcpparam_t)); + tcps->tcps_mdt_tail_param = NULL; + kmem_free(tcps->tcps_mdt_max_pbufs_param, sizeof (tcpparam_t)); + tcps->tcps_mdt_max_pbufs_param = NULL; - for (i = 0; i < A_CNT(tcp_bind_fanout); i++) { - mutex_destroy(&tcp_bind_fanout[i].tf_lock); + for (i = 0; i < TCP_BIND_FANOUT_SIZE; i++) { + ASSERT(tcps->tcps_bind_fanout[i].tf_tcp == NULL); + mutex_destroy(&tcps->tcps_bind_fanout[i].tf_lock); } - for (i = 0; i < A_CNT(tcp_acceptor_fanout); i++) { - mutex_destroy(&tcp_acceptor_fanout[i].tf_lock); + for (i = 0; i < TCP_FANOUT_SIZE; i++) { + ASSERT(tcps->tcps_acceptor_fanout[i].tf_tcp == NULL); + mutex_destroy(&tcps->tcps_acceptor_fanout[i].tf_lock); } - mutex_destroy(&tcp_iss_key_lock); - rw_destroy(&tcp_hsp_lock); - mutex_destroy(&tcp_g_q_lock); - mutex_destroy(&tcp_random_lock); - mutex_destroy(&tcp_epriv_port_lock); - rw_destroy(&tcp_reserved_port_lock); + kmem_free(tcps->tcps_bind_fanout, sizeof (tf_t) * TCP_BIND_FANOUT_SIZE); + tcps->tcps_bind_fanout = NULL; - ip_drop_unregister(&tcp_dropper); + kmem_free(tcps->tcps_acceptor_fanout, sizeof (tf_t) * TCP_FANOUT_SIZE); + tcps->tcps_acceptor_fanout = NULL; - kmem_cache_destroy(tcp_timercache); - kmem_cache_destroy(tcp_sack_info_cache); - kmem_cache_destroy(tcp_iphc_cache); + kmem_free(tcps->tcps_reserved_port, sizeof (tcp_rport_t) * + TCP_RESERVED_PORTS_ARRAY_MAX_SIZE); + tcps->tcps_reserved_port = NULL; + + mutex_destroy(&tcps->tcps_iss_key_lock); + rw_destroy(&tcps->tcps_hsp_lock); + mutex_destroy(&tcps->tcps_g_q_lock); + cv_destroy(&tcps->tcps_g_q_cv); + 
mutex_destroy(&tcps->tcps_epriv_port_lock); + rw_destroy(&tcps->tcps_reserved_port_lock); + + ip_drop_unregister(&tcps->tcps_dropper); + + tcp_kstat2_fini(stackid, tcps->tcps_kstat); + tcps->tcps_kstat = NULL; + bzero(&tcps->tcps_statistics, sizeof (tcps->tcps_statistics)); + + tcp_kstat_fini(stackid, tcps->tcps_mibkp); + tcps->tcps_mibkp = NULL; - tcp_kstat_fini(); + kmem_free(tcps, sizeof (*tcps)); } /* @@ -24660,14 +25241,15 @@ tcp_iss_init(tcp_t *tcp) MD5_CTX context; struct { uint32_t ports; in6_addr_t src; in6_addr_t dst; } arg; uint32_t answer[4]; + tcp_stack_t *tcps = tcp->tcp_tcps; - tcp_iss_incr_extra += (ISS_INCR >> 1); - tcp->tcp_iss = tcp_iss_incr_extra; - switch (tcp_strong_iss) { + tcps->tcps_iss_incr_extra += (ISS_INCR >> 1); + tcp->tcp_iss = tcps->tcps_iss_incr_extra; + switch (tcps->tcps_strong_iss) { case 2: - mutex_enter(&tcp_iss_key_lock); - context = tcp_iss_key; - mutex_exit(&tcp_iss_key_lock); + mutex_enter(&tcps->tcps_iss_key_lock); + context = tcps->tcps_iss_key; + mutex_exit(&tcps->tcps_iss_key_lock); arg.ports = tcp->tcp_ports; if (tcp->tcp_ipversion == IPV4_VERSION) { IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, @@ -24713,19 +25295,38 @@ tcp_iss_init(tcp_t *tcp) * non-zero from the callback routine terminates the search. */ int -cl_tcp_walk_list(int (*callback)(cl_tcp_info_t *, void *), void *arg) +cl_tcp_walk_list(int (*cl_callback)(cl_tcp_info_t *, void *), + void *arg) +{ + netstack_handle_t nh; + netstack_t *ns; + int ret = 0; + + netstack_next_init(&nh); + while ((ns = netstack_next(&nh)) != NULL) { + ret = cl_tcp_walk_list_stack(cl_callback, arg, + ns->netstack_tcp); + netstack_rele(ns); + } + netstack_next_fini(&nh); + return (ret); +} + +static int +cl_tcp_walk_list_stack(int (*callback)(cl_tcp_info_t *, void *), void *arg, + tcp_stack_t *tcps) { tcp_t *tcp; cl_tcp_info_t cl_tcpi; connf_t *connfp; conn_t *connp; int i; + ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; ASSERT(callback != NULL); for (i = 0; i < CONN_G_HASH_SIZE; i++) { - - connfp = &ipcl_globalhash_fanout[i]; + connfp = &ipst->ips_ipcl_globalhash_fanout[i]; connp = NULL; while ((connp = @@ -24959,13 +25560,16 @@ tcp_ioctl_abort_handler(tcp_t *tcp, mblk_t *mp) */ static int tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *acp, int index, int *count, - boolean_t exact) + boolean_t exact, tcp_stack_t *tcps) { int nmatch, err = 0; tcp_t *tcp; MBLKP mp, last, listhead = NULL; conn_t *tconnp; - connf_t *connfp = &ipcl_conn_fanout[index]; + connf_t *connfp; + ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; + + connfp = &ipst->ips_ipcl_conn_fanout[index]; startover: nmatch = 0; @@ -25021,7 +25625,7 @@ startover: * Abort all connections that match the attributes specified in acp.
*/ static int -tcp_ioctl_abort(tcp_ioc_abort_conn_t *acp) +tcp_ioctl_abort(tcp_ioc_abort_conn_t *acp, tcp_stack_t *tcps) { sa_family_t af; uint32_t ports; @@ -25030,6 +25634,7 @@ tcp_ioctl_abort(tcp_ioc_abort_conn_t *acp) boolean_t exact = B_FALSE; /* set when there is no wildcard */ int index = -1; ushort_t logflags; + ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; af = acp->ac_local.ss_family; @@ -25057,14 +25662,16 @@ tcp_ioctl_abort(tcp_ioc_abort_conn_t *acp) */ if (index != -1) { err = tcp_ioctl_abort_bucket(acp, index, - &count, exact); + &count, exact, tcps); } else { /* * loop through all entries for wildcard case */ - for (index = 0; index < ipcl_conn_fanout_size; index++) { + for (index = 0; + index < ipst->ips_ipcl_conn_fanout_size; + index++) { err = tcp_ioctl_abort_bucket(acp, index, - &count, exact); + &count, exact, tcps); if (err != 0) break; } @@ -25095,8 +25702,11 @@ tcp_ioctl_abort_conn(queue_t *q, mblk_t *mp) MBLKP mp1; sa_family_t laf, raf; tcp_ioc_abort_conn_t *acp; - zone_t *zptr; - zoneid_t zoneid = Q_TO_CONN(q)->conn_zoneid; + zone_t *zptr; + conn_t *connp = Q_TO_CONN(q); + zoneid_t zoneid = connp->conn_zoneid; + tcp_t *tcp = connp->conn_tcp; + tcp_stack_t *tcps = tcp->tcp_tcps; iocp = (IOCP)mp->b_rptr; @@ -25107,7 +25717,7 @@ tcp_ioctl_abort_conn(queue_t *q, mblk_t *mp) } /* check permissions */ - if (secpolicy_net_config(iocp->ioc_cr, B_FALSE) != 0) { + if (secpolicy_ip_config(iocp->ioc_cr, B_FALSE) != 0) { err = EPERM; goto out; } @@ -25132,6 +25742,13 @@ tcp_ioctl_abort_conn(queue_t *q, mblk_t *mp) } } + /* + * For exclusive stacks we set the zoneid to zero + * to make TCP operate as if in the global zone. + */ + if (tcps->tcps_netstack->netstack_stackid != GLOBAL_NETSTACKID) + acp->ac_zoneid = GLOBAL_ZONEID; + if (acp->ac_start < TCPS_SYN_SENT || acp->ac_end > TCPS_TIME_WAIT || acp->ac_start > acp->ac_end || laf != raf || (laf != AF_INET && laf != AF_INET6)) { @@ -25140,7 +25757,7 @@ tcp_ioctl_abort_conn(queue_t *q, mblk_t *mp) } tcp_ioctl_abort_dump(acp); - err = tcp_ioctl_abort(acp); + err = tcp_ioctl_abort(acp, tcps); out: if (mp1 != NULL) { @@ -25171,6 +25788,7 @@ tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq, uint_t flags; uint32_t new_swnd = 0; conn_t *connp; + tcp_stack_t *tcps = tcp->tcp_tcps; BUMP_LOCAL(tcp->tcp_ibsegs); TCP_RECORD_TRACE(tcp, mp, TCP_TRACE_RECV_PKT); @@ -25188,8 +25806,8 @@ tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq, gap = seg_seq - tcp->tcp_rnxt; rgap = tcp->tcp_rwnd - (gap + seg_len); if (gap < 0) { - BUMP_MIB(&tcp_mib, tcpInDataDupSegs); - UPDATE_MIB(&tcp_mib, tcpInDataDupBytes, + BUMP_MIB(&tcps->tcps_mib, tcpInDataDupSegs); + UPDATE_MIB(&tcps->tcps_mib, tcpInDataDupBytes, (seg_len > -gap ? -gap : seg_len)); seg_len += gap; if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) { @@ -25208,12 +25826,13 @@ tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq, if (tcp_time_wait_remove(tcp, NULL) == B_TRUE) { tcp_time_wait_append(tcp); - TCP_DBGSTAT(tcp_rput_time_wait); + TCP_DBGSTAT(tcps, + tcp_rput_time_wait); } } else { ASSERT(tcp != NULL); TCP_TIMER_RESTART(tcp, - tcp_time_wait_interval); + tcps->tcps_time_wait_interval); } tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK); @@ -25243,10 +25862,11 @@ tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq, * The above calculation is ugly and is a * waste of CPU cycles... 
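 * Illustrative numbers, hedged: if the old incarnation's tcp_snxt is 1000000 and the freshly derived new_iss comes out at 950000, the adjustment is 1000000 - 950000 = 50000 > 0, so tcps_iss_incr_extra is advanced by 50000 and the reborn connection's ISS lands safely ahead of any sequence number the old one could have sent.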
*/ - uint32_t new_iss = tcp_iss_incr_extra; + uint32_t new_iss = tcps->tcps_iss_incr_extra; int32_t adj; + ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; - switch (tcp_strong_iss) { + switch (tcps->tcps_strong_iss) { case 2: { /* Add time and MD5 components. */ uint32_t answer[4]; @@ -25257,9 +25877,9 @@ tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq, } arg; MD5_CTX context; - mutex_enter(&tcp_iss_key_lock); - context = tcp_iss_key; - mutex_exit(&tcp_iss_key_lock); + mutex_enter(&tcps->tcps_iss_key_lock); + context = tcps->tcps_iss_key; + mutex_exit(&tcps->tcps_iss_key_lock); arg.ports = tcp->tcp_ports; /* We use MAPPED addresses in tcp_iss_init */ arg.src = tcp->tcp_ip_src_v6; @@ -25293,7 +25913,7 @@ tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq, * ahead of the current tcp_snxt, so add the * difference to tcp_iss_incr_extra. */ - tcp_iss_incr_extra += adj; + tcps->tcps_iss_incr_extra += adj; } /* * If tcp_clean_death() can not perform the task now, @@ -25314,9 +25934,9 @@ tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq, * check this time by attaching a dummy * ipsec_in with ipsec_in_dont_check set. */ - if ((connp = ipcl_classify(mp, tcp->tcp_connp->conn_zoneid)) != - NULL) { - TCP_STAT(tcp_time_wait_syn_success); + connp = ipcl_classify(mp, tcp->tcp_connp->conn_zoneid, ipst); + if (connp != NULL) { + TCP_STAT(tcps, tcp_time_wait_syn_success); tcp_reinput(connp, mp, tcp->tcp_connp->conn_sqp); return; } @@ -25328,8 +25948,8 @@ tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq, * value is the amount out of window. */ if (rgap < 0) { - BUMP_MIB(&tcp_mib, tcpInDataPastWinSegs); - UPDATE_MIB(&tcp_mib, tcpInDataPastWinBytes, -rgap); + BUMP_MIB(&tcps->tcps_mib, tcpInDataPastWinSegs); + UPDATE_MIB(&tcps->tcps_mib, tcpInDataPastWinBytes, -rgap); /* Fix seg_len and make sure there is something left. 
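 * A quick worked case: with tcp_rwnd = 8192, gap = 0 and seg_len = 9000, rgap = 8192 - (0 + 9000) = -808; adding rgap trims seg_len to 8192 so only the bytes that fit the receive window are processed.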
*/ seg_len += rgap; if (seg_len <= 0) { @@ -25358,9 +25978,9 @@ tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq, flags |= TH_ACK_NEEDED; seg_len = 0; } else if (seg_len > 0) { - BUMP_MIB(&tcp_mib, tcpInClosed); - BUMP_MIB(&tcp_mib, tcpInDataInorderSegs); - UPDATE_MIB(&tcp_mib, tcpInDataInorderBytes, seg_len); + BUMP_MIB(&tcps->tcps_mib, tcpInClosed); + BUMP_MIB(&tcps->tcps_mib, tcpInDataInorderSegs); + UPDATE_MIB(&tcps->tcps_mib, tcpInDataInorderBytes, seg_len); } if (flags & TH_RST) { (void) tcp_clean_death(tcp, 0, 28); @@ -25381,7 +26001,7 @@ process_ack: if (bytes_acked <= 0) { if (bytes_acked == 0 && seg_len == 0 && new_swnd == tcp->tcp_swnd) - BUMP_MIB(&tcp_mib, tcpInDupAck); + BUMP_MIB(&tcps->tcps_mib, tcpInDupAck); } else { /* Acks something not sent */ flags |= TH_ACK_NEEDED; @@ -25398,7 +26018,7 @@ done: if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) { DB_CKSUMSTART(mp) = 0; mp->b_datap->db_struioflag &= ~STRUIO_EAGER; - TCP_STAT(tcp_time_wait_syn_fail); + TCP_STAT(tcps, tcp_time_wait_syn_fail); } freemsg(mp); } @@ -25450,15 +26070,16 @@ tcp_timeout(conn_t *connp, void (*f)(void *), clock_t tim) mblk_t *mp; tcp_timer_t *tcpt; tcp_t *tcp = connp->conn_tcp; + tcp_stack_t *tcps = tcp->tcp_tcps; ASSERT(connp->conn_sqp != NULL); - TCP_DBGSTAT(tcp_timeout_calls); + TCP_DBGSTAT(tcps, tcp_timeout_calls); if (tcp->tcp_timercache == NULL) { mp = tcp_timermp_alloc(KM_NOSLEEP | KM_PANIC); } else { - TCP_DBGSTAT(tcp_timeout_cached_alloc); + TCP_DBGSTAT(tcps, tcp_timeout_cached_alloc); mp = tcp->tcp_timercache; tcp->tcp_timercache = mp->b_next; mp->b_next = NULL; @@ -25523,8 +26144,9 @@ tcp_timeout_cancel(conn_t *connp, timeout_id_t id) mblk_t *mp = (mblk_t *)id; tcp_timer_t *tcpt; clock_t delta; + tcp_stack_t *tcps = connp->conn_tcp->tcp_tcps; - TCP_DBGSTAT(tcp_timeout_cancel_reqs); + TCP_DBGSTAT(tcps, tcp_timeout_cancel_reqs); if (mp == NULL) return (-1); @@ -25535,7 +26157,7 @@ tcp_timeout_cancel(conn_t *connp, timeout_id_t id) delta = untimeout(tcpt->tcpt_tid); if (delta >= 0) { - TCP_DBGSTAT(tcp_timeout_canceled); + TCP_DBGSTAT(tcps, tcp_timeout_canceled); tcp_timer_free(connp->conn_tcp, mp); CONN_DEC_REF(connp); } @@ -25566,19 +26188,24 @@ tcp_timermp_alloc(int kmflags) mp->b_wptr = NULL; mp->b_datap = NULL; mp->b_queue = NULL; + mp->b_cont = NULL; } else if (kmflags & KM_PANIC) { /* * Failed to allocate memory for the timer. Try allocating from * dblock caches. */ - TCP_STAT(tcp_timermp_allocfail); + /* ipclassifier calls this from a constructor - hence no tcps */ + TCP_G_STAT(tcp_timermp_allocfail); mp = allocb_tryhard(sizeof (tcp_timer_t)); if (mp == NULL) { size_t size = 0; /* * Memory is really low. Try tryhard allocation. 
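 * A hedged summary of the fallback chain here: the normal kmem cache allocation leaves b_wptr NULL; under KM_PANIC pressure the code first retries with allocb_tryhard(), and as a last resort kmem_alloc_tryhard() builds a raw mblk whose b_wptr is set to (uchar_t *)-1 and whose size is stashed in b_datap, which is how tcp_timer_free() later tells a tryhard allocation (kmem_free it) from a cache one (recycle it).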
+ * + * ipclassifier calls this from a constructor - + * hence no tcps */ - TCP_STAT(tcp_timermp_allocdblfail); + TCP_G_STAT(tcp_timermp_allocdblfail); mp = kmem_alloc_tryhard(sizeof (mblk_t) + sizeof (tcp_timer_t), &size, kmflags); mp->b_rptr = (uchar_t *)(&mp[1]); @@ -25586,10 +26213,12 @@ tcp_timermp_alloc(int kmflags) mp->b_wptr = (uchar_t *)-1; mp->b_datap = (dblk_t *)size; mp->b_queue = NULL; + mp->b_cont = NULL; } ASSERT(mp->b_wptr != NULL); } - TCP_DBGSTAT(tcp_timermp_alloced); + /* ipclassifier calls this from a constructor - hence no tcps */ + TCP_G_DBGSTAT(tcp_timermp_alloced); return (mp); } @@ -25619,6 +26248,7 @@ static void tcp_timer_free(tcp_t *tcp, mblk_t *mp) { mblk_t *mp1 = tcp->tcp_timercache; + tcp_stack_t *tcps = tcp->tcp_tcps; if (mp->b_wptr != NULL) { /* @@ -25636,7 +26266,7 @@ tcp_timer_free(tcp_t *tcp, mblk_t *mp) tcp->tcp_timercache = mp; } else { kmem_cache_free(tcp_timercache, mp); - TCP_DBGSTAT(tcp_timermp_freed); + TCP_DBGSTAT(tcps, tcp_timermp_freed); } } @@ -25655,6 +26285,7 @@ void tcp_setqfull(tcp_t *tcp) { queue_t *q = tcp->tcp_wq; + tcp_stack_t *tcps = tcp->tcp_tcps; if (!(q->q_flag & QFULL)) { mutex_enter(QLOCK(q)); @@ -25663,7 +26294,7 @@ tcp_setqfull(tcp_t *tcp) q->q_flag |= QFULL; tcp->tcp_flow_stopped = B_TRUE; mutex_exit(QLOCK(q)); - TCP_STAT(tcp_flwctl_on); + TCP_STAT(tcps, tcp_flwctl_on); } else { mutex_exit(QLOCK(q)); } @@ -25689,12 +26320,171 @@ tcp_clrqfull(tcp_t *tcp) } } + /* - * TCP Kstats implementation + * kstats related to squeues i.e. not per IP instance */ +static void * +tcp_g_kstat_init(tcp_g_stat_t *tcp_g_statp) +{ + kstat_t *ksp; + + tcp_g_stat_t template = { + { "tcp_timermp_alloced", KSTAT_DATA_UINT64 }, + { "tcp_timermp_allocfail", KSTAT_DATA_UINT64 }, + { "tcp_timermp_allocdblfail", KSTAT_DATA_UINT64 }, + { "tcp_freelist_cleanup", KSTAT_DATA_UINT64 }, + }; + + ksp = kstat_create(TCP_MOD_NAME, 0, "tcpstat_g", "net", + KSTAT_TYPE_NAMED, sizeof (template) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (ksp == NULL) + return (NULL); + + bcopy(&template, tcp_g_statp, sizeof (template)); + ksp->ks_data = (void *)tcp_g_statp; + + kstat_install(ksp); + return (ksp); +} + +static void +tcp_g_kstat_fini(kstat_t *ksp) +{ + if (ksp != NULL) { + kstat_delete(ksp); + } +} + + +static void * +tcp_kstat2_init(netstackid_t stackid, tcp_stat_t *tcps_statisticsp) +{ + kstat_t *ksp; + + tcp_stat_t template = { + { "tcp_time_wait", KSTAT_DATA_UINT64 }, + { "tcp_time_wait_syn", KSTAT_DATA_UINT64 }, + { "tcp_time_wait_success", KSTAT_DATA_UINT64 }, + { "tcp_time_wait_fail", KSTAT_DATA_UINT64 }, + { "tcp_reinput_syn", KSTAT_DATA_UINT64 }, + { "tcp_ip_output", KSTAT_DATA_UINT64 }, + { "tcp_detach_non_time_wait", KSTAT_DATA_UINT64 }, + { "tcp_detach_time_wait", KSTAT_DATA_UINT64 }, + { "tcp_time_wait_reap", KSTAT_DATA_UINT64 }, + { "tcp_clean_death_nondetached", KSTAT_DATA_UINT64 }, + { "tcp_reinit_calls", KSTAT_DATA_UINT64 }, + { "tcp_eager_err1", KSTAT_DATA_UINT64 }, + { "tcp_eager_err2", KSTAT_DATA_UINT64 }, + { "tcp_eager_blowoff_calls", KSTAT_DATA_UINT64 }, + { "tcp_eager_blowoff_q", KSTAT_DATA_UINT64 }, + { "tcp_eager_blowoff_q0", KSTAT_DATA_UINT64 }, + { "tcp_not_hard_bound", KSTAT_DATA_UINT64 }, + { "tcp_no_listener", KSTAT_DATA_UINT64 }, + { "tcp_found_eager", KSTAT_DATA_UINT64 }, + { "tcp_wrong_queue", KSTAT_DATA_UINT64 }, + { "tcp_found_eager_binding1", KSTAT_DATA_UINT64 }, + { "tcp_found_eager_bound1", KSTAT_DATA_UINT64 }, + { "tcp_eager_has_listener1", KSTAT_DATA_UINT64 }, + { "tcp_open_alloc", KSTAT_DATA_UINT64 }, + { 
"tcp_open_detached_alloc", KSTAT_DATA_UINT64 }, + { "tcp_rput_time_wait", KSTAT_DATA_UINT64 }, + { "tcp_listendrop", KSTAT_DATA_UINT64 }, + { "tcp_listendropq0", KSTAT_DATA_UINT64 }, + { "tcp_wrong_rq", KSTAT_DATA_UINT64 }, + { "tcp_rsrv_calls", KSTAT_DATA_UINT64 }, + { "tcp_eagerfree2", KSTAT_DATA_UINT64 }, + { "tcp_eagerfree3", KSTAT_DATA_UINT64 }, + { "tcp_eagerfree4", KSTAT_DATA_UINT64 }, + { "tcp_eagerfree5", KSTAT_DATA_UINT64 }, + { "tcp_timewait_syn_fail", KSTAT_DATA_UINT64 }, + { "tcp_listen_badflags", KSTAT_DATA_UINT64 }, + { "tcp_timeout_calls", KSTAT_DATA_UINT64 }, + { "tcp_timeout_cached_alloc", KSTAT_DATA_UINT64 }, + { "tcp_timeout_cancel_reqs", KSTAT_DATA_UINT64 }, + { "tcp_timeout_canceled", KSTAT_DATA_UINT64 }, + { "tcp_timermp_freed", KSTAT_DATA_UINT64 }, + { "tcp_push_timer_cnt", KSTAT_DATA_UINT64 }, + { "tcp_ack_timer_cnt", KSTAT_DATA_UINT64 }, + { "tcp_ire_null1", KSTAT_DATA_UINT64 }, + { "tcp_ire_null", KSTAT_DATA_UINT64 }, + { "tcp_ip_send", KSTAT_DATA_UINT64 }, + { "tcp_ip_ire_send", KSTAT_DATA_UINT64 }, + { "tcp_wsrv_called", KSTAT_DATA_UINT64 }, + { "tcp_flwctl_on", KSTAT_DATA_UINT64 }, + { "tcp_timer_fire_early", KSTAT_DATA_UINT64 }, + { "tcp_timer_fire_miss", KSTAT_DATA_UINT64 }, + { "tcp_rput_v6_error", KSTAT_DATA_UINT64 }, + { "tcp_out_sw_cksum", KSTAT_DATA_UINT64 }, + { "tcp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 }, + { "tcp_zcopy_on", KSTAT_DATA_UINT64 }, + { "tcp_zcopy_off", KSTAT_DATA_UINT64 }, + { "tcp_zcopy_backoff", KSTAT_DATA_UINT64 }, + { "tcp_zcopy_disable", KSTAT_DATA_UINT64 }, + { "tcp_mdt_pkt_out", KSTAT_DATA_UINT64 }, + { "tcp_mdt_pkt_out_v4", KSTAT_DATA_UINT64 }, + { "tcp_mdt_pkt_out_v6", KSTAT_DATA_UINT64 }, + { "tcp_mdt_discarded", KSTAT_DATA_UINT64 }, + { "tcp_mdt_conn_halted1", KSTAT_DATA_UINT64 }, + { "tcp_mdt_conn_halted2", KSTAT_DATA_UINT64 }, + { "tcp_mdt_conn_halted3", KSTAT_DATA_UINT64 }, + { "tcp_mdt_conn_resumed1", KSTAT_DATA_UINT64 }, + { "tcp_mdt_conn_resumed2", KSTAT_DATA_UINT64 }, + { "tcp_mdt_legacy_small", KSTAT_DATA_UINT64 }, + { "tcp_mdt_legacy_all", KSTAT_DATA_UINT64 }, + { "tcp_mdt_legacy_ret", KSTAT_DATA_UINT64 }, + { "tcp_mdt_allocfail", KSTAT_DATA_UINT64 }, + { "tcp_mdt_addpdescfail", KSTAT_DATA_UINT64 }, + { "tcp_mdt_allocd", KSTAT_DATA_UINT64 }, + { "tcp_mdt_linked", KSTAT_DATA_UINT64 }, + { "tcp_fusion_flowctl", KSTAT_DATA_UINT64 }, + { "tcp_fusion_backenabled", KSTAT_DATA_UINT64 }, + { "tcp_fusion_urg", KSTAT_DATA_UINT64 }, + { "tcp_fusion_putnext", KSTAT_DATA_UINT64 }, + { "tcp_fusion_unfusable", KSTAT_DATA_UINT64 }, + { "tcp_fusion_aborted", KSTAT_DATA_UINT64 }, + { "tcp_fusion_unqualified", KSTAT_DATA_UINT64 }, + { "tcp_fusion_rrw_busy", KSTAT_DATA_UINT64 }, + { "tcp_fusion_rrw_msgcnt", KSTAT_DATA_UINT64 }, + { "tcp_fusion_rrw_plugged", KSTAT_DATA_UINT64 }, + { "tcp_in_ack_unsent_drop", KSTAT_DATA_UINT64 }, + { "tcp_sock_fallback", KSTAT_DATA_UINT64 }, + }; + + ksp = kstat_create_netstack(TCP_MOD_NAME, 0, "tcpstat", "net", + KSTAT_TYPE_NAMED, sizeof (template) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL, stackid); + + if (ksp == NULL) + return (NULL); + + bcopy(&template, tcps_statisticsp, sizeof (template)); + ksp->ks_data = (void *)tcps_statisticsp; + ksp->ks_private = (void *)(uintptr_t)stackid; + + kstat_install(ksp); + return (ksp); +} + static void -tcp_kstat_init(void) +tcp_kstat2_fini(netstackid_t stackid, kstat_t *ksp) { + if (ksp != NULL) { + ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private); + kstat_delete_netstack(ksp, stackid); + } +} + +/* + * TCP Kstats implementation + */ +static void * 
+tcp_kstat_init(netstackid_t stackid, tcp_stack_t *tcps) +{ + kstat_t *ksp; + tcp_named_kstat_t template = { { "rtoAlgorithm", KSTAT_DATA_INT32, 0 }, { "rtoMin", KSTAT_DATA_INT32, 0 }, @@ -25751,55 +26541,69 @@ tcp_kstat_init(void) { "connTableSize6", KSTAT_DATA_INT32, 0 } }; - tcp_mibkp = kstat_create(TCP_MOD_NAME, 0, TCP_MOD_NAME, - "mib2", KSTAT_TYPE_NAMED, NUM_OF_FIELDS(tcp_named_kstat_t), 0); + ksp = kstat_create_netstack(TCP_MOD_NAME, 0, TCP_MOD_NAME, "mib2", + KSTAT_TYPE_NAMED, NUM_OF_FIELDS(tcp_named_kstat_t), 0, stackid); - if (tcp_mibkp == NULL) - return; + if (ksp == NULL) + return (NULL); template.rtoAlgorithm.value.ui32 = 4; - template.rtoMin.value.ui32 = tcp_rexmit_interval_min; - template.rtoMax.value.ui32 = tcp_rexmit_interval_max; + template.rtoMin.value.ui32 = tcps->tcps_rexmit_interval_min; + template.rtoMax.value.ui32 = tcps->tcps_rexmit_interval_max; template.maxConn.value.i32 = -1; - bcopy(&template, tcp_mibkp->ks_data, sizeof (template)); + bcopy(&template, ksp->ks_data, sizeof (template)); + ksp->ks_update = tcp_kstat_update; + ksp->ks_private = (void *)(uintptr_t)stackid; - tcp_mibkp->ks_update = tcp_kstat_update; - - kstat_install(tcp_mibkp); + kstat_install(ksp); + return (ksp); } static void -tcp_kstat_fini(void) +tcp_kstat_fini(netstackid_t stackid, kstat_t *ksp) { - - if (tcp_mibkp != NULL) { - kstat_delete(tcp_mibkp); - tcp_mibkp = NULL; + if (ksp != NULL) { + ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private); + kstat_delete_netstack(ksp, stackid); } } static int tcp_kstat_update(kstat_t *kp, int rw) { - tcp_named_kstat_t *tcpkp; - tcp_t *tcp; - connf_t *connfp; - conn_t *connp; - int i; + tcp_named_kstat_t *tcpkp; + tcp_t *tcp; + connf_t *connfp; + conn_t *connp; + int i; + netstackid_t stackid = (netstackid_t)(uintptr_t)kp->ks_private; + netstack_t *ns; + tcp_stack_t *tcps; + ip_stack_t *ipst; - if (!kp || !kp->ks_data) + if ((kp == NULL) || (kp->ks_data == NULL)) return (EIO); if (rw == KSTAT_WRITE) return (EACCES); + ns = netstack_find_by_stackid(stackid); + if (ns == NULL) + return (-1); + tcps = ns->netstack_tcp; + if (tcps == NULL) { + netstack_rele(ns); + return (-1); + } tcpkp = (tcp_named_kstat_t *)kp->ks_data; tcpkp->currEstab.value.ui32 = 0; + ipst = ns->netstack_ip; + for (i = 0; i < CONN_G_HASH_SIZE; i++) { - connfp = &ipcl_globalhash_fanout[i]; + connfp = &ipst->ips_ipcl_globalhash_fanout[i]; connp = NULL; while ((connp = ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) { @@ -25813,55 +26617,67 @@ tcp_kstat_update(kstat_t *kp, int rw) } } - tcpkp->activeOpens.value.ui32 = tcp_mib.tcpActiveOpens; - tcpkp->passiveOpens.value.ui32 = tcp_mib.tcpPassiveOpens; - tcpkp->attemptFails.value.ui32 = tcp_mib.tcpAttemptFails; - tcpkp->estabResets.value.ui32 = tcp_mib.tcpEstabResets; - tcpkp->inSegs.value.ui64 = tcp_mib.tcpHCInSegs; - tcpkp->outSegs.value.ui64 = tcp_mib.tcpHCOutSegs; - tcpkp->retransSegs.value.ui32 = tcp_mib.tcpRetransSegs; - tcpkp->connTableSize.value.i32 = tcp_mib.tcpConnTableSize; - tcpkp->outRsts.value.ui32 = tcp_mib.tcpOutRsts; - tcpkp->outDataSegs.value.ui32 = tcp_mib.tcpOutDataSegs; - tcpkp->outDataBytes.value.ui32 = tcp_mib.tcpOutDataBytes; - tcpkp->retransBytes.value.ui32 = tcp_mib.tcpRetransBytes; - tcpkp->outAck.value.ui32 = tcp_mib.tcpOutAck; - tcpkp->outAckDelayed.value.ui32 = tcp_mib.tcpOutAckDelayed; - tcpkp->outUrg.value.ui32 = tcp_mib.tcpOutUrg; - tcpkp->outWinUpdate.value.ui32 = tcp_mib.tcpOutWinUpdate; - tcpkp->outWinProbe.value.ui32 = tcp_mib.tcpOutWinProbe; - tcpkp->outControl.value.ui32 = 
tcp_mib.tcpOutControl; - tcpkp->outFastRetrans.value.ui32 = tcp_mib.tcpOutFastRetrans; - tcpkp->inAckSegs.value.ui32 = tcp_mib.tcpInAckSegs; - tcpkp->inAckBytes.value.ui32 = tcp_mib.tcpInAckBytes; - tcpkp->inDupAck.value.ui32 = tcp_mib.tcpInDupAck; - tcpkp->inAckUnsent.value.ui32 = tcp_mib.tcpInAckUnsent; - tcpkp->inDataInorderSegs.value.ui32 = tcp_mib.tcpInDataInorderSegs; - tcpkp->inDataInorderBytes.value.ui32 = tcp_mib.tcpInDataInorderBytes; - tcpkp->inDataUnorderSegs.value.ui32 = tcp_mib.tcpInDataUnorderSegs; - tcpkp->inDataUnorderBytes.value.ui32 = tcp_mib.tcpInDataUnorderBytes; - tcpkp->inDataDupSegs.value.ui32 = tcp_mib.tcpInDataDupSegs; - tcpkp->inDataDupBytes.value.ui32 = tcp_mib.tcpInDataDupBytes; - tcpkp->inDataPartDupSegs.value.ui32 = tcp_mib.tcpInDataPartDupSegs; - tcpkp->inDataPartDupBytes.value.ui32 = tcp_mib.tcpInDataPartDupBytes; - tcpkp->inDataPastWinSegs.value.ui32 = tcp_mib.tcpInDataPastWinSegs; - tcpkp->inDataPastWinBytes.value.ui32 = tcp_mib.tcpInDataPastWinBytes; - tcpkp->inWinProbe.value.ui32 = tcp_mib.tcpInWinProbe; - tcpkp->inWinUpdate.value.ui32 = tcp_mib.tcpInWinUpdate; - tcpkp->inClosed.value.ui32 = tcp_mib.tcpInClosed; - tcpkp->rttNoUpdate.value.ui32 = tcp_mib.tcpRttNoUpdate; - tcpkp->rttUpdate.value.ui32 = tcp_mib.tcpRttUpdate; - tcpkp->timRetrans.value.ui32 = tcp_mib.tcpTimRetrans; - tcpkp->timRetransDrop.value.ui32 = tcp_mib.tcpTimRetransDrop; - tcpkp->timKeepalive.value.ui32 = tcp_mib.tcpTimKeepalive; - tcpkp->timKeepaliveProbe.value.ui32 = tcp_mib.tcpTimKeepaliveProbe; - tcpkp->timKeepaliveDrop.value.ui32 = tcp_mib.tcpTimKeepaliveDrop; - tcpkp->listenDrop.value.ui32 = tcp_mib.tcpListenDrop; - tcpkp->listenDropQ0.value.ui32 = tcp_mib.tcpListenDropQ0; - tcpkp->halfOpenDrop.value.ui32 = tcp_mib.tcpHalfOpenDrop; - tcpkp->outSackRetransSegs.value.ui32 = tcp_mib.tcpOutSackRetransSegs; - tcpkp->connTableSize6.value.i32 = tcp_mib.tcp6ConnTableSize; - + tcpkp->activeOpens.value.ui32 = tcps->tcps_mib.tcpActiveOpens; + tcpkp->passiveOpens.value.ui32 = tcps->tcps_mib.tcpPassiveOpens; + tcpkp->attemptFails.value.ui32 = tcps->tcps_mib.tcpAttemptFails; + tcpkp->estabResets.value.ui32 = tcps->tcps_mib.tcpEstabResets; + tcpkp->inSegs.value.ui64 = tcps->tcps_mib.tcpHCInSegs; + tcpkp->outSegs.value.ui64 = tcps->tcps_mib.tcpHCOutSegs; + tcpkp->retransSegs.value.ui32 = tcps->tcps_mib.tcpRetransSegs; + tcpkp->connTableSize.value.i32 = tcps->tcps_mib.tcpConnTableSize; + tcpkp->outRsts.value.ui32 = tcps->tcps_mib.tcpOutRsts; + tcpkp->outDataSegs.value.ui32 = tcps->tcps_mib.tcpOutDataSegs; + tcpkp->outDataBytes.value.ui32 = tcps->tcps_mib.tcpOutDataBytes; + tcpkp->retransBytes.value.ui32 = tcps->tcps_mib.tcpRetransBytes; + tcpkp->outAck.value.ui32 = tcps->tcps_mib.tcpOutAck; + tcpkp->outAckDelayed.value.ui32 = tcps->tcps_mib.tcpOutAckDelayed; + tcpkp->outUrg.value.ui32 = tcps->tcps_mib.tcpOutUrg; + tcpkp->outWinUpdate.value.ui32 = tcps->tcps_mib.tcpOutWinUpdate; + tcpkp->outWinProbe.value.ui32 = tcps->tcps_mib.tcpOutWinProbe; + tcpkp->outControl.value.ui32 = tcps->tcps_mib.tcpOutControl; + tcpkp->outFastRetrans.value.ui32 = tcps->tcps_mib.tcpOutFastRetrans; + tcpkp->inAckSegs.value.ui32 = tcps->tcps_mib.tcpInAckSegs; + tcpkp->inAckBytes.value.ui32 = tcps->tcps_mib.tcpInAckBytes; + tcpkp->inDupAck.value.ui32 = tcps->tcps_mib.tcpInDupAck; + tcpkp->inAckUnsent.value.ui32 = tcps->tcps_mib.tcpInAckUnsent; + tcpkp->inDataInorderSegs.value.ui32 = + tcps->tcps_mib.tcpInDataInorderSegs; + tcpkp->inDataInorderBytes.value.ui32 = + tcps->tcps_mib.tcpInDataInorderBytes; + 
tcpkp->inDataUnorderSegs.value.ui32 = + tcps->tcps_mib.tcpInDataUnorderSegs; + tcpkp->inDataUnorderBytes.value.ui32 = + tcps->tcps_mib.tcpInDataUnorderBytes; + tcpkp->inDataDupSegs.value.ui32 = tcps->tcps_mib.tcpInDataDupSegs; + tcpkp->inDataDupBytes.value.ui32 = tcps->tcps_mib.tcpInDataDupBytes; + tcpkp->inDataPartDupSegs.value.ui32 = + tcps->tcps_mib.tcpInDataPartDupSegs; + tcpkp->inDataPartDupBytes.value.ui32 = + tcps->tcps_mib.tcpInDataPartDupBytes; + tcpkp->inDataPastWinSegs.value.ui32 = + tcps->tcps_mib.tcpInDataPastWinSegs; + tcpkp->inDataPastWinBytes.value.ui32 = + tcps->tcps_mib.tcpInDataPastWinBytes; + tcpkp->inWinProbe.value.ui32 = tcps->tcps_mib.tcpInWinProbe; + tcpkp->inWinUpdate.value.ui32 = tcps->tcps_mib.tcpInWinUpdate; + tcpkp->inClosed.value.ui32 = tcps->tcps_mib.tcpInClosed; + tcpkp->rttNoUpdate.value.ui32 = tcps->tcps_mib.tcpRttNoUpdate; + tcpkp->rttUpdate.value.ui32 = tcps->tcps_mib.tcpRttUpdate; + tcpkp->timRetrans.value.ui32 = tcps->tcps_mib.tcpTimRetrans; + tcpkp->timRetransDrop.value.ui32 = tcps->tcps_mib.tcpTimRetransDrop; + tcpkp->timKeepalive.value.ui32 = tcps->tcps_mib.tcpTimKeepalive; + tcpkp->timKeepaliveProbe.value.ui32 = + tcps->tcps_mib.tcpTimKeepaliveProbe; + tcpkp->timKeepaliveDrop.value.ui32 = + tcps->tcps_mib.tcpTimKeepaliveDrop; + tcpkp->listenDrop.value.ui32 = tcps->tcps_mib.tcpListenDrop; + tcpkp->listenDropQ0.value.ui32 = tcps->tcps_mib.tcpListenDropQ0; + tcpkp->halfOpenDrop.value.ui32 = tcps->tcps_mib.tcpHalfOpenDrop; + tcpkp->outSackRetransSegs.value.ui32 = + tcps->tcps_mib.tcpOutSackRetransSegs; + tcpkp->connTableSize6.value.i32 = tcps->tcps_mib.tcp6ConnTableSize; + + netstack_rele(ns); return (0); } @@ -25872,10 +26688,11 @@ tcp_reinput(conn_t *connp, mblk_t *mp, squeue_t *sqp) ipha_t *ipha; uint8_t *nexthdrp; tcph_t *tcph; + tcp_stack_t *tcps = connp->conn_tcp->tcp_tcps; /* Already has an eager */ if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) { - TCP_STAT(tcp_reinput_syn); + TCP_STAT(tcps, tcp_reinput_syn); squeue_enter(connp->conn_sqp, mp, connp->conn_recv, connp, SQTAG_TCP_REINPUT_EAGER); return; @@ -25924,6 +26741,10 @@ tcp_squeue_switch(int val) return (rval); } +/* + * This is called once for each squeue - globally for all stack + * instances. + */ static void tcp_squeue_add(squeue_t *sqp) { |
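Since each stack instance now publishes its counters through kstat_create_netstack() under module "tcp", instance 0, name "tcpstat", they stay visible to userland in the owning zone through the ordinary libkstat interfaces. A minimal sketch of a reader, hedged (error handling trimmed; "tcp_time_wait" is just one of the counter names from the template above):

#include <stdio.h>
#include <kstat.h>

int
main(void)
{
	kstat_ctl_t *kc = kstat_open();
	kstat_t *ksp;
	kstat_named_t *kn;

	if (kc == NULL)
		return (1);
	/* Module "tcp", instance 0, name "tcpstat", as created above */
	ksp = kstat_lookup(kc, "tcp", 0, "tcpstat");
	if (ksp != NULL && kstat_read(kc, ksp, NULL) != -1) {
		kn = kstat_data_lookup(ksp, "tcp_time_wait");
		if (kn != NULL)
			(void) printf("tcp_time_wait = %llu\n",
			    (u_longlong_t)kn->value.ui64);
	}
	(void) kstat_close(kc);
	return (0);
}

Build with cc ... -lkstat; the same data is reachable from the shell as kstat -m tcp -n tcpstat.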