diff options
Diffstat (limited to 'usr/src/uts/common/inet/tcp/tcp.c')
-rw-r--r-- | usr/src/uts/common/inet/tcp/tcp.c | 12129 |
1 files changed, 3194 insertions, 8935 deletions
diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c index c9a941eab2..0e1ef43cfb 100644 --- a/usr/src/uts/common/inet/tcp/tcp.c +++ b/usr/src/uts/common/inet/tcp/tcp.c @@ -46,8 +46,6 @@ #include <sys/ethernet.h> #include <sys/cpuvar.h> #include <sys/dlpi.h> -#include <sys/multidata.h> -#include <sys/multidata_impl.h> #include <sys/pattr.h> #include <sys/policy.h> #include <sys/priv.h> @@ -87,7 +85,6 @@ #include <inet/tcp_impl.h> #include <inet/udp_impl.h> #include <net/pfkeyv2.h> -#include <inet/ipsec_info.h> #include <inet/ipdrop.h> #include <inet/ipclassifier.h> @@ -95,6 +92,7 @@ #include <inet/ip_ftable.h> #include <inet/ip_if.h> #include <inet/ipp_common.h> +#include <inet/ip_rts.h> #include <inet/ip_netinfo.h> #include <sys/squeue_impl.h> #include <sys/squeue.h> @@ -111,7 +109,7 @@ * * The entire tcp state is contained in tcp_t and conn_t structure * which are allocated in tandem using ipcl_conn_create() and passing - * IPCL_CONNTCP as a flag. We use 'conn_ref' and 'conn_lock' to protect + * IPCL_TCPCONN as a flag. We use 'conn_ref' and 'conn_lock' to protect * the references on the tcp_t. The tcp_t structure is never compressed * and packets always land on the correct TCP perimeter from the time * eager is created till the time tcp_t dies (as such the old mentat @@ -172,8 +170,8 @@ * * This is a more interesting case because of various races involved in * establishing a eager in its own perimeter. Read the meta comment on - * top of tcp_conn_request(). But briefly, the squeue is picked by - * ip_tcp_input()/ip_fanout_tcp_v6() based on the interrupted CPU. + * top of tcp_input_listener(). But briefly, the squeue is picked by + * ip_fanout based on the ring or the sender (if loopback). 
* * Closing a connection: * @@ -198,20 +196,13 @@ * * Special provisions and fast paths: * - * We make special provision for (AF_INET, SOCK_STREAM) sockets which - * can't have 'ipv6_recvpktinfo' set and for these type of sockets, IP - * will never send a M_CTL to TCP. As such, ip_tcp_input() which handles - * all TCP packets from the wire makes a IPCL_IS_TCP4_CONNECTED_NO_POLICY - * check to send packets directly to tcp_rput_data via squeue. Everyone - * else comes through tcp_input() on the read side. - * - * We also make special provisions for sockfs by marking tcp_issocket + * We make special provisions for sockfs by marking tcp_issocket * whenever we have only sockfs on top of TCP. This allows us to skip * putting the tcp in acceptor hash since a sockfs listener can never * become acceptor and also avoid allocating a tcp_t for acceptor STREAM * since eager has already been allocated and the accept now happens * on acceptor STREAM. There is a big blob of comment on top of - * tcp_conn_request explaining the new accept. When socket is POP'd, + * tcp_input_listener explaining the new accept. When socket is POP'd, * sockfs sends us an ioctl to mark the fact and we go back to old * behaviour. Once tcp_issocket is unset, its never set for the * life of that connection. @@ -224,13 +215,6 @@ * only exception is tcp_xmit_listeners_reset() which is called * directly from IP and needs to policy check to see if TH_RST * can be sent out. - * - * PFHooks notes : - * - * For mdt case, one meta buffer contains multiple packets. Mblks for every - * packet are assembled and passed to the hooks. When packets are blocked, - * or boundary of any packet is changed, the mdt processing is stopped, and - * packets of the meta buffer are send to the IP path one by one. */ /* @@ -244,7 +228,7 @@ int tcp_squeue_flag; /* * This controls how tiny a write must be before we try to copy it - * into the the mblk on the tail of the transmit queue. 
Not much + * into the mblk on the tail of the transmit queue. Not much * speedup is observed for values larger than sixteen. Zero will * disable the optimisation. */ @@ -333,16 +317,6 @@ static uint_t tcp_clean_death_stat[TCP_MAX_CLEAN_DEATH_TAG]; tcp_g_stat_t tcp_g_statistics; kstat_t *tcp_g_kstat; -/* - * Call either ip_output or ip_output_v6. This replaces putnext() calls on the - * tcp write side. - */ -#define CALL_IP_WPUT(connp, q, mp) { \ - ASSERT(((q)->q_flag & QREADR) == 0); \ - TCP_DBGSTAT(connp->conn_netstack->netstack_tcp, tcp_ip_output); \ - connp->conn_send(connp, (mp), (q), IP_WPUT); \ -} - /* Macros for timestamp comparisons */ #define TSTMP_GEQ(a, b) ((int32_t)((a)-(b)) >= 0) #define TSTMP_LT(a, b) ((int32_t)((a)-(b)) < 0) @@ -354,7 +328,7 @@ kstat_t *tcp_g_kstat; * nanoseconds (versus every 4 microseconds suggested by RFC 793, page 27); * a per-connection component which grows by 125000 for every new connection; * and an "extra" component that grows by a random amount centered - * approximately on 64000. This causes the the ISS generator to cycle every + * approximately on 64000. This causes the ISS generator to cycle every * 4.89 hours if no TCP connections are made, and faster if connections are * made. * @@ -381,8 +355,13 @@ static sin6_t sin6_null; /* Zero address for quick clears */ */ #define TCP_OLD_URP_INTERPRETATION 1 +/* + * Since tcp_listener is not cleared atomically with tcp_detached + * being cleared we need this extra bit to tell a detached connection + * apart from one that is in the process of being accepted. 
+ */ #define TCP_IS_DETACHED_NONEAGER(tcp) \ - (TCP_IS_DETACHED(tcp) && \ + (TCP_IS_DETACHED(tcp) && \ (!(tcp)->tcp_hard_binding)) /* @@ -495,7 +474,6 @@ typedef struct tcp_timer_s { static kmem_cache_t *tcp_timercache; kmem_cache_t *tcp_sack_info_cache; -kmem_cache_t *tcp_iphc_cache; /* * For scalability, we must not run a timer for every TCP connection @@ -592,17 +570,6 @@ typedef struct tcp_opt_s { } tcp_opt_t; /* - * TCP option struct passing information b/w lisenter and eager. - */ -struct tcp_options { - uint_t to_flags; - ssize_t to_boundif; /* IPV6_BOUND_IF */ -}; - -#define TCPOPT_BOUNDIF 0x00000001 /* set IPV6_BOUND_IF */ -#define TCPOPT_RECVPKTINFO 0x00000002 /* set IPV6_RECVPKTINFO */ - -/* * RFC1323-recommended phrasing of TSTAMP option, for easier parsing */ @@ -673,43 +640,53 @@ typedef struct tcpt_s { /* * Functions called directly via squeue having a prototype of edesc_t. */ -void tcp_conn_request(void *arg, mblk_t *mp, void *arg2); -static void tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2); -void tcp_accept_finish(void *arg, mblk_t *mp, void *arg2); -static void tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2); -static void tcp_wput_proto(void *arg, mblk_t *mp, void *arg2); -void tcp_input(void *arg, mblk_t *mp, void *arg2); -void tcp_rput_data(void *arg, mblk_t *mp, void *arg2); -static void tcp_close_output(void *arg, mblk_t *mp, void *arg2); -void tcp_output(void *arg, mblk_t *mp, void *arg2); -void tcp_output_urgent(void *arg, mblk_t *mp, void *arg2); -static void tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2); -static void tcp_timer_handler(void *arg, mblk_t *mp, void *arg2); -static void tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2); +void tcp_input_listener(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *ira); +static void tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy); +void tcp_accept_finish(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy); +static void 
tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy); +static void tcp_wput_proto(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy); +void tcp_input_data(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *ira); +static void tcp_close_output(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy); +void tcp_output(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy); +void tcp_output_urgent(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy); +static void tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy); +static void tcp_timer_handler(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy); +static void tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy); /* Prototype for TCP functions */ static void tcp_random_init(void); int tcp_random(void); static void tcp_tli_accept(tcp_t *tcp, mblk_t *mp); -static int tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, +static void tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager); -static int tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp); +static int tcp_set_destination(tcp_t *tcp); static in_port_t tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, int reuseaddr, boolean_t quick_connect, boolean_t bind_to_req_port_only, boolean_t user_specified); static void tcp_closei_local(tcp_t *tcp); static void tcp_close_detached(tcp_t *tcp); -static boolean_t tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph, - mblk_t *idmp, mblk_t **defermp); +static boolean_t tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, + mblk_t *idmp, mblk_t **defermp, ip_recv_attr_t *ira); static void tcp_tpi_connect(tcp_t *tcp, mblk_t *mp); static int tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp, - in_port_t dstport, uint_t srcid, cred_t *cr, pid_t pid); -static int tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp, - in_port_t dstport, uint32_t flowinfo, uint_t srcid, - uint32_t scope_id, cred_t *cr, pid_t pid); + in_port_t 
dstport, uint_t srcid); +static int tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp, + in_port_t dstport, uint32_t flowinfo, + uint_t srcid, uint32_t scope_id); static int tcp_clean_death(tcp_t *tcp, int err, uint8_t tag); -static void tcp_def_q_set(tcp_t *tcp, mblk_t *mp); static void tcp_disconnect(tcp_t *tcp, mblk_t *mp); static char *tcp_display(tcp_t *tcp, char *, char); static boolean_t tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum); @@ -735,34 +712,16 @@ static void tcp_acceptor_hash_remove(tcp_t *tcp); static void tcp_capability_req(tcp_t *tcp, mblk_t *mp); static void tcp_info_req(tcp_t *tcp, mblk_t *mp); static void tcp_addr_req(tcp_t *tcp, mblk_t *mp); -static void tcp_addr_req_ipv6(tcp_t *tcp, mblk_t *mp); -void tcp_g_q_setup(tcp_stack_t *); -void tcp_g_q_create(tcp_stack_t *); -void tcp_g_q_destroy(tcp_stack_t *); -static int tcp_header_init_ipv4(tcp_t *tcp); -static int tcp_header_init_ipv6(tcp_t *tcp); -int tcp_init(tcp_t *tcp, queue_t *q); -static int tcp_init_values(tcp_t *tcp); -static mblk_t *tcp_ip_advise_mblk(void *addr, int addr_len, ipic_t **ipic); -static void tcp_ip_ire_mark_advice(tcp_t *tcp); +static void tcp_init_values(tcp_t *tcp); static void tcp_ip_notify(tcp_t *tcp); -static mblk_t *tcp_ire_mp(mblk_t **mpp); static void tcp_iss_init(tcp_t *tcp); static void tcp_keepalive_killer(void *arg); -static int tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt); -static void tcp_mss_set(tcp_t *tcp, uint32_t size, boolean_t do_ss); +static int tcp_parse_options(tcpha_t *tcpha, tcp_opt_t *tcpopt); +static void tcp_mss_set(tcp_t *tcp, uint32_t size); static int tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp, int *do_disconnectp, int *t_errorp, int *sys_errorp); static boolean_t tcp_allow_connopt_set(int level, int name); int tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr); -int tcp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr); -int tcp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, - int name, 
uint_t inlen, uchar_t *invalp, uint_t *outlenp, - uchar_t *outvalp, void *thisdg_attrs, cred_t *cr, - mblk_t *mblk); -static void tcp_opt_reverse(tcp_t *tcp, ipha_t *ipha); -static int tcp_opt_set_header(tcp_t *tcp, boolean_t checkonly, - uchar_t *ptr, uint_t len); static int tcp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr); static boolean_t tcp_param_register(IDP *ndp, tcpparam_t *tcppa, int cnt, tcp_stack_t *); @@ -785,9 +744,9 @@ static uint_t tcp_rcv_drain(tcp_t *tcp); static void tcp_sack_rxmit(tcp_t *tcp, uint_t *flags); static boolean_t tcp_send_rst_chk(tcp_stack_t *); static void tcp_ss_rexmit(tcp_t *tcp); -static mblk_t *tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp); -static void tcp_process_options(tcp_t *, tcph_t *); -static void tcp_rput_common(tcp_t *tcp, mblk_t *mp); +static mblk_t *tcp_input_add_ancillary(tcp_t *tcp, mblk_t *mp, ip_pkt_t *ipp, + ip_recv_attr_t *); +static void tcp_process_options(tcp_t *, tcpha_t *); static void tcp_rsrv(queue_t *q); static int tcp_snmp_state(tcp_t *tcp); static void tcp_timer(void *arg); @@ -801,16 +760,10 @@ void tcp_tpi_accept(queue_t *q, mblk_t *mp); static void tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent); static void tcp_wput_flush(tcp_t *tcp, mblk_t *mp); static void tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp); -static int tcp_send(queue_t *q, tcp_t *tcp, const int mss, - const int tcp_hdr_len, const int tcp_tcp_hdr_len, +static int tcp_send(tcp_t *tcp, const int mss, + const int total_hdr_len, const int tcp_hdr_len, const int num_sack_blk, int *usable, uint_t *snxt, - int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time, - const int mdt_thres); -static int tcp_multisend(queue_t *q, tcp_t *tcp, const int mss, - const int tcp_hdr_len, const int tcp_tcp_hdr_len, - const int num_sack_blk, int *usable, uint_t *snxt, - int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time, - const int mdt_thres); + int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time); static void 
tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk); static void tcp_wsrv(queue_t *q); @@ -818,38 +771,36 @@ static int tcp_xmit_end(tcp_t *tcp); static void tcp_ack_timer(void *arg); static mblk_t *tcp_ack_mp(tcp_t *tcp); static void tcp_xmit_early_reset(char *str, mblk_t *mp, - uint32_t seq, uint32_t ack, int ctl, uint_t ip_hdr_len, - zoneid_t zoneid, tcp_stack_t *, conn_t *connp); + uint32_t seq, uint32_t ack, int ctl, ip_recv_attr_t *, + ip_stack_t *, conn_t *); static void tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, uint32_t ack, int ctl); -static int setmaxps(queue_t *q, int maxpsz); static void tcp_set_rto(tcp_t *, time_t); -static boolean_t tcp_check_policy(tcp_t *, mblk_t *, ipha_t *, ip6_t *, - boolean_t, boolean_t); -static void tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, - boolean_t ipsec_mctl); +static void tcp_icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *); +static void tcp_icmp_error_ipv6(tcp_t *, mblk_t *, ip_recv_attr_t *); +static boolean_t tcp_verifyicmp(conn_t *, void *, icmph_t *, icmp6_t *, + ip_recv_attr_t *); static int tcp_build_hdrs(tcp_t *); static void tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, - uint32_t seg_seq, uint32_t seg_ack, int seg_len, - tcph_t *tcph); -boolean_t tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp); -static mblk_t *tcp_mdt_info_mp(mblk_t *); -static void tcp_mdt_update(tcp_t *, ill_mdt_capab_t *, boolean_t); -static int tcp_mdt_add_attrs(multidata_t *, const mblk_t *, - const boolean_t, const uint32_t, const uint32_t, - const uint32_t, const uint32_t, tcp_stack_t *); -static void tcp_multisend_data(tcp_t *, ire_t *, const ill_t *, mblk_t *, - const uint_t, const uint_t, boolean_t *); -static mblk_t *tcp_lso_info_mp(mblk_t *); -static void tcp_lso_update(tcp_t *, ill_lso_capab_t *); -static void tcp_send_data(tcp_t *, queue_t *, mblk_t *); + uint32_t seg_seq, uint32_t seg_ack, int seg_len, tcpha_t *tcpha, + ip_recv_attr_t *ira); +boolean_t tcp_paws_check(tcp_t 
*tcp, tcpha_t *tcpha, tcp_opt_t *tcpoptp); +static boolean_t tcp_zcopy_check(tcp_t *); +static void tcp_zcopy_notify(tcp_t *); +static mblk_t *tcp_zcopy_backoff(tcp_t *, mblk_t *, boolean_t); +static void tcp_update_lso(tcp_t *tcp, ip_xmit_attr_t *ixa); +static void tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only); +static void tcp_update_zcopy(tcp_t *tcp); +static void tcp_notify(void *, ip_xmit_attr_t *, ixa_notify_type_t, + ixa_notify_arg_t); +static void tcp_rexmit_after_error(tcp_t *tcp); +static void tcp_send_data(tcp_t *, mblk_t *); extern mblk_t *tcp_timermp_alloc(int); extern void tcp_timermp_free(tcp_t *); static void tcp_timer_free(tcp_t *tcp, mblk_t *mp); static void tcp_stop_lingering(tcp_t *tcp); static void tcp_close_linger_timeout(void *arg); static void *tcp_stack_init(netstackid_t stackid, netstack_t *ns); -static void tcp_stack_shutdown(netstackid_t stackid, void *arg); static void tcp_stack_fini(netstackid_t stackid, void *arg); static void *tcp_g_kstat_init(tcp_g_stat_t *); static void tcp_g_kstat_fini(kstat_t *); @@ -858,11 +809,10 @@ static void tcp_kstat_fini(netstackid_t, kstat_t *); static void *tcp_kstat2_init(netstackid_t, tcp_stat_t *); static void tcp_kstat2_fini(netstackid_t, kstat_t *); static int tcp_kstat_update(kstat_t *kp, int rw); -void tcp_reinput(conn_t *connp, mblk_t *mp, squeue_t *sqp); -static int tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, - tcph_t *tcph, uint_t ipvers, mblk_t *idmp); -static int tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha, - tcph_t *tcph, mblk_t *idmp); +static mblk_t *tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, + ip_recv_attr_t *ira); +static mblk_t *tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, mblk_t *mp, + ip_recv_attr_t *ira); static int tcp_squeue_switch(int); static int tcp_open(queue_t *, dev_t *, int, int, cred_t *, boolean_t); @@ -872,21 +822,17 @@ static int tcp_tpi_close(queue_t *, int); static int tcp_tpi_close_accept(queue_t *); 
static void tcp_squeue_add(squeue_t *); -static boolean_t tcp_zcopy_check(tcp_t *); -static void tcp_zcopy_notify(tcp_t *); -static mblk_t *tcp_zcopy_disable(tcp_t *, mblk_t *); -static mblk_t *tcp_zcopy_backoff(tcp_t *, mblk_t *, int); -static void tcp_ire_ill_check(tcp_t *, ire_t *, ill_t *, boolean_t); +static void tcp_setcred_data(mblk_t *, ip_recv_attr_t *); -extern void tcp_kssl_input(tcp_t *, mblk_t *); +extern void tcp_kssl_input(tcp_t *, mblk_t *, cred_t *); -void tcp_eager_kill(void *arg, mblk_t *mp, void *arg2); -void tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2); +void tcp_eager_kill(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy); +void tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy); static int tcp_accept(sock_lower_handle_t, sock_lower_handle_t, sock_upper_handle_t, cred_t *); static int tcp_listen(sock_lower_handle_t, int, cred_t *); -static int tcp_post_ip_bind(tcp_t *, mblk_t *, int, cred_t *, pid_t); static int tcp_do_listen(conn_t *, struct sockaddr *, socklen_t, int, cred_t *, boolean_t); static int tcp_do_connect(conn_t *, const struct sockaddr *, socklen_t, @@ -922,7 +868,8 @@ static void tcp_ulp_newconn(conn_t *, conn_t *, mblk_t *); */ static mblk_t *tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *, tcp_t *); static void tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *); -static void tcp_ioctl_abort_handler(tcp_t *, mblk_t *); +static void tcp_ioctl_abort_handler(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy); static int tcp_ioctl_abort(tcp_ioc_abort_conn_t *, tcp_stack_t *tcps); static void tcp_ioctl_abort_conn(queue_t *, mblk_t *); static int tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *, int, int *, @@ -988,12 +935,6 @@ struct streamtab tcpinfov6 = { sock_downcalls_t sock_tcp_downcalls; -/* - * Have to ensure that tcp_g_q_close is not done by an - * interrupt thread. - */ -static taskq_t *tcp_taskq; - /* Setable only in /etc/system. Move to ndd? 
*/ boolean_t tcp_icmp_source_quench = B_FALSE; @@ -1042,8 +983,8 @@ static struct T_info_ack tcp_g_t_info_ack_v6 = { #define PARAM_MAX (~(uint32_t)0) /* Max size IP datagram is 64k - 1 */ -#define TCP_MSS_MAX_IPV4 (IP_MAXPACKET - (sizeof (ipha_t) + sizeof (tcph_t))) -#define TCP_MSS_MAX_IPV6 (IP_MAXPACKET - (sizeof (ip6_t) + sizeof (tcph_t))) +#define TCP_MSS_MAX_IPV4 (IP_MAXPACKET - (sizeof (ipha_t) + sizeof (tcpha_t))) +#define TCP_MSS_MAX_IPV6 (IP_MAXPACKET - (sizeof (ip6_t) + sizeof (tcpha_t))) /* Max of the above */ #define TCP_MSS_MAX TCP_MSS_MAX_IPV4 @@ -1128,29 +1069,10 @@ static tcpparam_t lcl_tcp_param_arr[] = { { 0, 100*MS, 50*MS, "tcp_push_timer_interval"}, { 0, 1, 0, "tcp_use_smss_as_mss_opt"}, { 0, PARAM_MAX, 8*MINUTES, "tcp_keepalive_abort_interval"}, + { 0, 1, 0, "tcp_dev_flow_ctl"}, }; /* END CSTYLED */ -/* - * tcp_mdt_hdr_{head,tail}_min are the leading and trailing spaces of - * each header fragment in the header buffer. Each parameter value has - * to be a multiple of 4 (32-bit aligned). - */ -static tcpparam_t lcl_tcp_mdt_head_param = - { 32, 256, 32, "tcp_mdt_hdr_head_min" }; -static tcpparam_t lcl_tcp_mdt_tail_param = - { 0, 256, 32, "tcp_mdt_hdr_tail_min" }; -#define tcps_mdt_hdr_head_min tcps_mdt_head_param->tcp_param_val -#define tcps_mdt_hdr_tail_min tcps_mdt_tail_param->tcp_param_val - -/* - * tcp_mdt_max_pbufs is the upper limit value that tcp uses to figure out - * the maximum number of payload buffers associated per Multidata. - */ -static tcpparam_t lcl_tcp_mdt_max_pbufs_param = - { 1, MULTIDATA_MAX_PBUFS, MULTIDATA_MAX_PBUFS, "tcp_mdt_max_pbufs" }; -#define tcps_mdt_max_pbufs tcps_mdt_max_pbufs_param->tcp_param_val - /* Round up the value to the nearest mss. */ #define MSS_ROUNDUP(value, mss) ((((value) - 1) / (mss) + 1) * (mss)) @@ -1162,7 +1084,7 @@ static tcpparam_t lcl_tcp_mdt_max_pbufs_param = * point ECT(0) for TCP as described in RFC 2481. 
*/ #define SET_ECT(tcp, iph) \ - if ((tcp)->tcp_ipversion == IPV4_VERSION) { \ + if ((tcp)->tcp_connp->conn_ipversion == IPV4_VERSION) { \ /* We need to clear the code point first. */ \ ((ipha_t *)(iph))->ipha_type_of_service &= 0xFC; \ ((ipha_t *)(iph))->ipha_type_of_service |= IPH_ECN_ECT0; \ @@ -1183,23 +1105,12 @@ static tcpparam_t lcl_tcp_mdt_max_pbufs_param = #define IS_VMLOANED_MBLK(mp) \ (((mp)->b_datap->db_struioflag & STRUIO_ZC) != 0) - -/* Enable or disable b_cont M_MULTIDATA chaining for MDT. */ -boolean_t tcp_mdt_chain = B_TRUE; - -/* - * MDT threshold in the form of effective send MSS multiplier; we take - * the MDT path if the amount of unsent data exceeds the threshold value - * (default threshold is 1*SMSS). - */ -uint_t tcp_mdt_smss_threshold = 1; - uint32_t do_tcpzcopy = 1; /* 0: disable, 1: enable, 2: force */ /* * Forces all connections to obey the value of the tcps_maxpsz_multiplier * tunable settable via NDD. Otherwise, the per-connection behavior is - * determined dynamically during tcp_adapt_ire(), which is the default. + * determined dynamically during tcp_set_destination(), which is the default. 
*/ boolean_t tcp_static_maxpsz = B_FALSE; @@ -1273,84 +1184,73 @@ int (*cl_inet_connect2)(netstackid_t stack_id, uint8_t protocol, uint8_t *laddrp, in_port_t lport, uint8_t *faddrp, in_port_t fport, void *args) = NULL; - void (*cl_inet_disconnect)(netstackid_t stack_id, uint8_t protocol, sa_family_t addr_family, uint8_t *laddrp, in_port_t lport, uint8_t *faddrp, in_port_t fport, void *args) = NULL; -/* - * The following are defined in ip.c - */ -extern int (*cl_inet_isclusterwide)(netstackid_t stack_id, uint8_t protocol, - sa_family_t addr_family, uint8_t *laddrp, - void *args); -extern uint32_t (*cl_inet_ipident)(netstackid_t stack_id, uint8_t protocol, - sa_family_t addr_family, uint8_t *laddrp, - uint8_t *faddrp, void *args); - /* * int CL_INET_CONNECT(conn_t *cp, tcp_t *tcp, boolean_t is_outgoing, int err) */ -#define CL_INET_CONNECT(connp, tcp, is_outgoing, err) { \ +#define CL_INET_CONNECT(connp, is_outgoing, err) { \ (err) = 0; \ if (cl_inet_connect2 != NULL) { \ /* \ * Running in cluster mode - register active connection \ * information \ */ \ - if ((tcp)->tcp_ipversion == IPV4_VERSION) { \ - if ((tcp)->tcp_ipha->ipha_src != 0) { \ + if ((connp)->conn_ipversion == IPV4_VERSION) { \ + if ((connp)->conn_laddr_v4 != 0) { \ (err) = (*cl_inet_connect2)( \ (connp)->conn_netstack->netstack_stackid,\ IPPROTO_TCP, is_outgoing, AF_INET, \ - (uint8_t *)(&((tcp)->tcp_ipha->ipha_src)),\ - (in_port_t)(tcp)->tcp_lport, \ - (uint8_t *)(&((tcp)->tcp_ipha->ipha_dst)),\ - (in_port_t)(tcp)->tcp_fport, NULL); \ + (uint8_t *)(&((connp)->conn_laddr_v4)),\ + (in_port_t)(connp)->conn_lport, \ + (uint8_t *)(&((connp)->conn_faddr_v4)),\ + (in_port_t)(connp)->conn_fport, NULL); \ } \ } else { \ if (!IN6_IS_ADDR_UNSPECIFIED( \ - &(tcp)->tcp_ip6h->ip6_src)) { \ + &(connp)->conn_laddr_v6)) { \ (err) = (*cl_inet_connect2)( \ (connp)->conn_netstack->netstack_stackid,\ IPPROTO_TCP, is_outgoing, AF_INET6, \ - (uint8_t *)(&((tcp)->tcp_ip6h->ip6_src)),\ - (in_port_t)(tcp)->tcp_lport, \ - 
(uint8_t *)(&((tcp)->tcp_ip6h->ip6_dst)),\ - (in_port_t)(tcp)->tcp_fport, NULL); \ + (uint8_t *)(&((connp)->conn_laddr_v6)),\ + (in_port_t)(connp)->conn_lport, \ + (uint8_t *)(&((connp)->conn_faddr_v6)), \ + (in_port_t)(connp)->conn_fport, NULL); \ } \ } \ } \ } -#define CL_INET_DISCONNECT(connp, tcp) { \ +#define CL_INET_DISCONNECT(connp) { \ if (cl_inet_disconnect != NULL) { \ /* \ * Running in cluster mode - deregister active \ * connection information \ */ \ - if ((tcp)->tcp_ipversion == IPV4_VERSION) { \ - if ((tcp)->tcp_ip_src != 0) { \ + if ((connp)->conn_ipversion == IPV4_VERSION) { \ + if ((connp)->conn_laddr_v4 != 0) { \ (*cl_inet_disconnect)( \ (connp)->conn_netstack->netstack_stackid,\ IPPROTO_TCP, AF_INET, \ - (uint8_t *)(&((tcp)->tcp_ip_src)), \ - (in_port_t)(tcp)->tcp_lport, \ - (uint8_t *)(&((tcp)->tcp_ipha->ipha_dst)),\ - (in_port_t)(tcp)->tcp_fport, NULL); \ + (uint8_t *)(&((connp)->conn_laddr_v4)),\ + (in_port_t)(connp)->conn_lport, \ + (uint8_t *)(&((connp)->conn_faddr_v4)),\ + (in_port_t)(connp)->conn_fport, NULL); \ } \ } else { \ if (!IN6_IS_ADDR_UNSPECIFIED( \ - &(tcp)->tcp_ip_src_v6)) { \ + &(connp)->conn_laddr_v6)) { \ (*cl_inet_disconnect)( \ (connp)->conn_netstack->netstack_stackid,\ IPPROTO_TCP, AF_INET6, \ - (uint8_t *)(&((tcp)->tcp_ip_src_v6)),\ - (in_port_t)(tcp)->tcp_lport, \ - (uint8_t *)(&((tcp)->tcp_ip6h->ip6_dst)),\ - (in_port_t)(tcp)->tcp_fport, NULL); \ + (uint8_t *)(&((connp)->conn_laddr_v6)),\ + (in_port_t)(connp)->conn_lport, \ + (uint8_t *)(&((connp)->conn_faddr_v6)), \ + (in_port_t)(connp)->conn_fport, NULL); \ } \ } \ } \ @@ -1367,11 +1267,6 @@ int cl_tcp_walk_list(netstackid_t stack_id, static int cl_tcp_walk_list_stack(int (*callback)(cl_tcp_info_t *, void *), void *arg, tcp_stack_t *tcps); -#define DTRACE_IP_FASTPATH(mp, iph, ill, ipha, ip6h) \ - DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, void_ip_t *, \ - iph, __dtrace_ipsr_ill_t *, ill, ipha_t *, ipha, \ - ip6_t *, ip6h, int, 0); - static void 
tcp_set_recv_threshold(tcp_t *tcp, uint32_t new_rcvthresh) { @@ -1540,7 +1435,7 @@ tcp_time_wait_append(tcp_t *tcp) /* ARGSUSED */ void -tcp_timewait_output(void *arg, mblk_t *mp, void *arg2) +tcp_timewait_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { conn_t *connp = (conn_t *)arg; tcp_t *tcp = connp->conn_tcp; @@ -1551,11 +1446,11 @@ tcp_timewait_output(void *arg, mblk_t *mp, void *arg2) return; } - ASSERT((tcp->tcp_family == AF_INET && - tcp->tcp_ipversion == IPV4_VERSION) || - (tcp->tcp_family == AF_INET6 && - (tcp->tcp_ipversion == IPV4_VERSION || - tcp->tcp_ipversion == IPV6_VERSION))); + ASSERT((connp->conn_family == AF_INET && + connp->conn_ipversion == IPV4_VERSION) || + (connp->conn_family == AF_INET6 && + (connp->conn_ipversion == IPV4_VERSION || + connp->conn_ipversion == IPV6_VERSION))); ASSERT(!tcp->tcp_listener); TCP_STAT(tcps, tcp_time_wait_reap); @@ -1579,10 +1474,17 @@ tcp_ipsec_cleanup(tcp_t *tcp) ASSERT(connp->conn_flags & IPCL_TCPCONN); if (connp->conn_latch != NULL) { - IPLATCH_REFRELE(connp->conn_latch, - connp->conn_netstack); + IPLATCH_REFRELE(connp->conn_latch); connp->conn_latch = NULL; } + if (connp->conn_latch_in_policy != NULL) { + IPPOL_REFRELE(connp->conn_latch_in_policy); + connp->conn_latch_in_policy = NULL; + } + if (connp->conn_latch_in_action != NULL) { + IPACT_REFRELE(connp->conn_latch_in_action); + connp->conn_latch_in_action = NULL; + } if (connp->conn_policy != NULL) { IPPH_REFRELE(connp->conn_policy, connp->conn_netstack); connp->conn_policy = NULL; @@ -1598,9 +1500,6 @@ void tcp_cleanup(tcp_t *tcp) { mblk_t *mp; - char *tcp_iphc; - int tcp_iphc_len; - int tcp_hdr_grown; tcp_sack_info_t *tcp_sack_info; conn_t *connp = tcp->tcp_connp; tcp_stack_t *tcps = tcp->tcp_tcps; @@ -1611,6 +1510,22 @@ tcp_cleanup(tcp_t *tcp) /* Cleanup that which needs the netstack first */ tcp_ipsec_cleanup(tcp); + ixa_cleanup(connp->conn_ixa); + + if (connp->conn_ht_iphc != NULL) { + kmem_free(connp->conn_ht_iphc, 
connp->conn_ht_iphc_allocated); + connp->conn_ht_iphc = NULL; + connp->conn_ht_iphc_allocated = 0; + connp->conn_ht_iphc_len = 0; + connp->conn_ht_ulp = NULL; + connp->conn_ht_ulp_len = 0; + tcp->tcp_ipha = NULL; + tcp->tcp_ip6h = NULL; + tcp->tcp_tcpha = NULL; + } + + /* We clear any IP_OPTIONS and extension headers */ + ip_pkt_free(&connp->conn_xmit_ipp); tcp_free(tcp); @@ -1626,8 +1541,6 @@ tcp_cleanup(tcp_t *tcp) } tcp->tcp_kssl_pending = B_FALSE; - conn_delete_ire(connp, NULL); - /* * Since we will bzero the entire structure, we need to * remove it and reinsert it in global hash list. We @@ -1639,46 +1552,36 @@ tcp_cleanup(tcp_t *tcp) */ ipcl_globalhash_remove(connp); - /* - * Now it is safe to decrement the reference counts. - * This might be the last reference on the netstack and TCPS - * in which case it will cause the tcp_g_q_close and - * the freeing of the IP Instance. - */ - connp->conn_netstack = NULL; - netstack_rele(ns); - ASSERT(tcps != NULL); - tcp->tcp_tcps = NULL; - TCPS_REFRELE(tcps); - /* Save some state */ mp = tcp->tcp_timercache; tcp_sack_info = tcp->tcp_sack_info; - tcp_iphc = tcp->tcp_iphc; - tcp_iphc_len = tcp->tcp_iphc_len; - tcp_hdr_grown = tcp->tcp_hdr_grown; tcp_rsrv_mp = tcp->tcp_rsrv_mp; if (connp->conn_cred != NULL) { crfree(connp->conn_cred); connp->conn_cred = NULL; } - if (connp->conn_effective_cred != NULL) { - crfree(connp->conn_effective_cred); - connp->conn_effective_cred = NULL; - } ipcl_conn_cleanup(connp); connp->conn_flags = IPCL_TCPCONN; + + /* + * Now it is safe to decrement the reference counts. + * This might be the last reference on the netstack + * in which case it will cause the freeing of the IP Instance. 
+ */ + connp->conn_netstack = NULL; + connp->conn_ixa->ixa_ipst = NULL; + netstack_rele(ns); + ASSERT(tcps != NULL); + tcp->tcp_tcps = NULL; + bzero(tcp, sizeof (tcp_t)); /* restore the state */ tcp->tcp_timercache = mp; tcp->tcp_sack_info = tcp_sack_info; - tcp->tcp_iphc = tcp_iphc; - tcp->tcp_iphc_len = tcp_iphc_len; - tcp->tcp_hdr_grown = tcp_hdr_grown; tcp->tcp_rsrv_mp = tcp_rsrv_mp; tcp->tcp_connp = connp; @@ -1686,7 +1589,7 @@ tcp_cleanup(tcp_t *tcp) ASSERT(connp->conn_tcp == tcp); ASSERT(connp->conn_flags & IPCL_TCPCONN); connp->conn_state_flags = CONN_INCIPIENT; - ASSERT(connp->conn_ulp == IPPROTO_TCP); + ASSERT(connp->conn_proto == IPPROTO_TCP); ASSERT(connp->conn_ref == 1); } @@ -1777,11 +1680,7 @@ tcp_time_wait_collector(void *arg) /* * Set the CONDEMNED flag now itself so that * the refcnt cannot increase due to any - * walker. But we have still not cleaned up - * conn_ire_cache. This is still ok since - * we are going to clean it up in tcp_cleanup - * immediately and any interface unplumb - * thread will wait till the ire is blown away + * walker. 
*/ connp->conn_state_flags |= CONN_CONDEMNED; mutex_exit(lock); @@ -1809,7 +1708,7 @@ tcp_time_wait_collector(void *arg) mutex_exit( &tcp_time_wait->tcp_time_wait_lock); tcp_bind_hash_remove(tcp); - conn_delete_ire(tcp->tcp_connp, NULL); + ixa_cleanup(tcp->tcp_connp->conn_ixa); tcp_ipsec_cleanup(tcp); CONN_DEC_REF(tcp->tcp_connp); } @@ -1839,7 +1738,7 @@ tcp_time_wait_collector(void *arg) TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15); mp = &tcp->tcp_closemp; SQUEUE_ENTER_ONE(connp->conn_sqp, mp, - tcp_timewait_output, connp, + tcp_timewait_output, connp, NULL, SQ_FILL, SQTAG_TCP_TIMEWAIT); } } else { @@ -1867,7 +1766,7 @@ tcp_time_wait_collector(void *arg) TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15); mp = &tcp->tcp_closemp; SQUEUE_ENTER_ONE(connp->conn_sqp, mp, - tcp_timewait_output, connp, + tcp_timewait_output, connp, NULL, SQ_FILL, SQTAG_TCP_TIMEWAIT); } mutex_enter(&tcp_time_wait->tcp_time_wait_lock); @@ -1886,24 +1785,23 @@ tcp_time_wait_collector(void *arg) /* * Reply to a clients T_CONN_RES TPI message. This function * is used only for TLI/XTI listener. Sockfs sends T_CONN_RES - * on the acceptor STREAM and processed in tcp_wput_accept(). - * Read the block comment on top of tcp_conn_request(). + * on the acceptor STREAM and processed in tcp_accept_common(). + * Read the block comment on top of tcp_input_listener(). */ static void tcp_tli_accept(tcp_t *listener, mblk_t *mp) { - tcp_t *acceptor; - tcp_t *eager; - tcp_t *tcp; + tcp_t *acceptor; + tcp_t *eager; + tcp_t *tcp; struct T_conn_res *tcr; t_uscalar_t acceptor_id; t_scalar_t seqnum; - mblk_t *opt_mp = NULL; /* T_OPTMGMT_REQ messages */ - struct tcp_options *tcpopt; - mblk_t *ok_mp; - mblk_t *mp1; + mblk_t *discon_mp = NULL; + mblk_t *ok_mp; + mblk_t *mp1; tcp_stack_t *tcps = listener->tcp_tcps; - int error; + conn_t *econnp; if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) { tcp_err_ack(listener, mp, TPROTO, 0); @@ -1922,8 +1820,8 @@ tcp_tli_accept(tcp_t *listener, mblk_t *mp) * fanout hash lock is held. 
* This prevents any thread from entering the acceptor queue from * below (since it has not been hard bound yet i.e. any inbound - * packets will arrive on the listener or default tcp queue and - * go through tcp_lookup). + * packets will arrive on the listener conn_t and + * go through the classifier). * The CONN_INC_REF will prevent the acceptor from closing. * * XXX It is still possible for a tli application to send down data @@ -1974,7 +1872,7 @@ tcp_tli_accept(tcp_t *listener, mblk_t *mp) } else { acceptor = tcp_acceptor_hash_lookup(acceptor_id, tcps); if (acceptor == NULL) { - if (listener->tcp_debug) { + if (listener->tcp_connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_accept: did not find acceptor 0x%x\n", @@ -2013,7 +1911,7 @@ tcp_tli_accept(tcp_t *listener, mblk_t *mp) * Rendezvous with an eager connection request packet hanging off * 'tcp' that has the 'seqnum' tag. We tagged the detached open * tcp structure when the connection packet arrived in - * tcp_conn_request(). + * tcp_input_listener(). */ seqnum = tcr->SEQ_number; eager = listener; @@ -2047,37 +1945,26 @@ tcp_tli_accept(tcp_t *listener, mblk_t *mp) */ ASSERT(eager->tcp_connp->conn_ref >= 1); - /* Pre allocate the stroptions mblk also */ - opt_mp = allocb(MAX(sizeof (struct tcp_options), - sizeof (struct T_conn_res)), BPRI_HI); - if (opt_mp == NULL) { + /* + * Pre allocate the discon_ind mblk also. tcp_accept_finish will + * use it if something failed. + */ + discon_mp = allocb(MAX(sizeof (struct T_discon_ind), + sizeof (struct stroptions)), BPRI_HI); + if (discon_mp == NULL) { CONN_DEC_REF(acceptor->tcp_connp); CONN_DEC_REF(eager->tcp_connp); tcp_err_ack(listener, mp, TSYSERR, ENOMEM); return; } - DB_TYPE(opt_mp) = M_SETOPTS; - opt_mp->b_wptr += sizeof (struct tcp_options); - tcpopt = (struct tcp_options *)opt_mp->b_rptr; - tcpopt->to_flags = 0; - /* - * Prepare for inheriting IPV6_BOUND_IF and IPV6_RECVPKTINFO - * from listener to acceptor. 
- */ - if (listener->tcp_bound_if != 0) { - tcpopt->to_flags |= TCPOPT_BOUNDIF; - tcpopt->to_boundif = listener->tcp_bound_if; - } - if (listener->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) { - tcpopt->to_flags |= TCPOPT_RECVPKTINFO; - } + econnp = eager->tcp_connp; - /* Re-use mp1 to hold a copy of mp, in case reallocb fails */ + /* Hold a copy of mp, in case reallocb fails */ if ((mp1 = copymsg(mp)) == NULL) { CONN_DEC_REF(acceptor->tcp_connp); CONN_DEC_REF(eager->tcp_connp); - freemsg(opt_mp); + freemsg(discon_mp); tcp_err_ack(listener, mp, TSYSERR, ENOMEM); return; } @@ -2093,7 +1980,7 @@ tcp_tli_accept(tcp_t *listener, mblk_t *mp) { int extra; - extra = (eager->tcp_family == AF_INET) ? + extra = (econnp->conn_family == AF_INET) ? sizeof (sin_t) : sizeof (sin6_t); /* @@ -2104,7 +1991,7 @@ tcp_tli_accept(tcp_t *listener, mblk_t *mp) if ((ok_mp = mi_tpi_ok_ack_alloc_extra(mp, extra)) == NULL) { CONN_DEC_REF(acceptor->tcp_connp); CONN_DEC_REF(eager->tcp_connp); - freemsg(opt_mp); + freemsg(discon_mp); /* Original mp has been freed by now, so use mp1 */ tcp_err_ack(listener, mp1, TSYSERR, ENOMEM); return; @@ -2114,38 +2001,32 @@ tcp_tli_accept(tcp_t *listener, mblk_t *mp) switch (extra) { case sizeof (sin_t): { - sin_t *sin = (sin_t *)ok_mp->b_wptr; + sin_t *sin = (sin_t *)ok_mp->b_wptr; - ok_mp->b_wptr += extra; - sin->sin_family = AF_INET; - sin->sin_port = eager->tcp_lport; - sin->sin_addr.s_addr = - eager->tcp_ipha->ipha_src; - break; - } + ok_mp->b_wptr += extra; + sin->sin_family = AF_INET; + sin->sin_port = econnp->conn_lport; + sin->sin_addr.s_addr = econnp->conn_laddr_v4; + break; + } case sizeof (sin6_t): { - sin6_t *sin6 = (sin6_t *)ok_mp->b_wptr; + sin6_t *sin6 = (sin6_t *)ok_mp->b_wptr; - ok_mp->b_wptr += extra; - sin6->sin6_family = AF_INET6; - sin6->sin6_port = eager->tcp_lport; - if (eager->tcp_ipversion == IPV4_VERSION) { - sin6->sin6_flowinfo = 0; - IN6_IPADDR_TO_V4MAPPED( - eager->tcp_ipha->ipha_src, - &sin6->sin6_addr); - } else { - 
ASSERT(eager->tcp_ip6h != NULL); - sin6->sin6_flowinfo = - eager->tcp_ip6h->ip6_vcf & - ~IPV6_VERS_AND_FLOW_MASK; - sin6->sin6_addr = - eager->tcp_ip6h->ip6_src; - } + ok_mp->b_wptr += extra; + sin6->sin6_family = AF_INET6; + sin6->sin6_port = econnp->conn_lport; + sin6->sin6_addr = econnp->conn_laddr_v6; + sin6->sin6_flowinfo = econnp->conn_flowinfo; + if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) && + (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) { + sin6->sin6_scope_id = + econnp->conn_ixa->ixa_scopeid; + } else { sin6->sin6_scope_id = 0; - sin6->__sin6_src_id = 0; - break; } + sin6->__sin6_src_id = 0; + break; + } default: break; } @@ -2158,15 +2039,7 @@ tcp_tli_accept(tcp_t *listener, mblk_t *mp) * the tcp_accept_swap is done since it would be dangerous to * let the application start using the new fd prior to the swap. */ - error = tcp_accept_swap(listener, acceptor, eager); - if (error != 0) { - CONN_DEC_REF(acceptor->tcp_connp); - CONN_DEC_REF(eager->tcp_connp); - freemsg(ok_mp); - /* Original mp has been freed by now, so use mp1 */ - tcp_err_ack(listener, mp1, TSYSERR, error); - return; - } + tcp_accept_swap(listener, acceptor, eager); /* * tcp_accept_swap unlinks eager from listener but does not drop @@ -2244,7 +2117,7 @@ tcp_tli_accept(tcp_t *listener, mblk_t *mp) /* We no longer need mp1, since all options processing has passed */ freemsg(mp1); - putnext(listener->tcp_rq, ok_mp); + putnext(listener->tcp_connp->conn_rq, ok_mp); mutex_enter(&listener->tcp_eager_lock); if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) { @@ -2305,7 +2178,7 @@ tcp_tli_accept(tcp_t *listener, mblk_t *mp) listener->tcp_eager_last_q = tcp; tcp->tcp_eager_next_q = NULL; mutex_exit(&listener->tcp_eager_lock); - putnext(tcp->tcp_rq, conn_ind); + putnext(tcp->tcp_connp->conn_rq, conn_ind); } else { mutex_exit(&listener->tcp_eager_lock); } @@ -2318,26 +2191,20 @@ tcp_tli_accept(tcp_t *listener, mblk_t *mp) */ finish: ASSERT(acceptor->tcp_detached); - ASSERT(tcps->tcps_g_q != 
NULL);
+	acceptor->tcp_connp->conn_rq = NULL;
 	ASSERT(!IPCL_IS_NONSTR(acceptor->tcp_connp));
-	acceptor->tcp_rq = tcps->tcps_g_q;
-	acceptor->tcp_wq = WR(tcps->tcps_g_q);
+	acceptor->tcp_connp->conn_wq = NULL;
 	(void) tcp_clean_death(acceptor, 0, 2);
 	CONN_DEC_REF(acceptor->tcp_connp);
 
 	/*
-	 * In case we already received a FIN we have to make tcp_rput send
-	 * the ordrel_ind. This will also send up a window update if the window
-	 * has opened up.
-	 *
-	 * In the normal case of a successful connection acceptance
-	 * we give the O_T_BIND_REQ to the read side put procedure as an
-	 * indication that this was just accepted. This tells tcp_rput to
-	 * pass up any data queued in tcp_rcv_list.
+	 * We pass discon_mp to tcp_accept_finish to get on the right squeue.
 	 *
-	 * In the fringe case where options sent with T_CONN_RES failed and
-	 * we required, we would be indicating a T_DISCON_IND to blow
-	 * away this connection.
+	 * It will update the setting for sockfs/stream head and also take
+	 * care of any data that arrived before accept() was called.
+	 * In case we already received a FIN then tcp_accept_finish will send up
+	 * the ordrel. It will also send up a window update if the window
+	 * has opened up.
 	 */
 
 	/*
@@ -2346,7 +2213,7 @@ finish:
 	 * and is well know but nothing can be done short of major rewrite
 	 * to fix it. Now it is possible to take care of it by assigning TLI/XTI
 	 * eager same squeue as listener (we can distinguish non socket
-	 * listeners at the time of handling a SYN in tcp_conn_request)
+	 * listeners at the time of handling a SYN in tcp_input_listener)
 	 * and do most of the work that tcp_accept_finish does here itself
 	 * and then get behind the acceptor squeue to access the acceptor
 	 * queue.
@@ -2354,52 +2221,38 @@ finish: /* * We already have a ref on tcp so no need to do one before squeue_enter */ - SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, opt_mp, tcp_accept_finish, - eager->tcp_connp, SQ_FILL, SQTAG_TCP_ACCEPT_FINISH); + SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, discon_mp, + tcp_accept_finish, eager->tcp_connp, NULL, SQ_FILL, + SQTAG_TCP_ACCEPT_FINISH); } /* * Swap information between the eager and acceptor for a TLI/XTI client. * The sockfs accept is done on the acceptor stream and control goes - * through tcp_wput_accept() and tcp_accept()/tcp_accept_swap() is not + * through tcp_tli_accept() and tcp_accept()/tcp_accept_swap() is not * called. In either case, both the eager and listener are in their own * perimeter (squeue) and the code has to deal with potential race. * - * See the block comment on top of tcp_accept() and tcp_wput_accept(). + * See the block comment on top of tcp_accept() and tcp_tli_accept(). */ -static int +static void tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager) { conn_t *econnp, *aconnp; - cred_t *effective_cred = NULL; - ASSERT(eager->tcp_rq == listener->tcp_rq); + ASSERT(eager->tcp_connp->conn_rq == listener->tcp_connp->conn_rq); ASSERT(eager->tcp_detached && !acceptor->tcp_detached); - ASSERT(!eager->tcp_hard_bound); ASSERT(!TCP_IS_SOCKET(acceptor)); ASSERT(!TCP_IS_SOCKET(eager)); ASSERT(!TCP_IS_SOCKET(listener)); - econnp = eager->tcp_connp; - aconnp = acceptor->tcp_connp; - /* * Trusted Extensions may need to use a security label that is * different from the acceptor's label on MLP and MAC-Exempt * sockets. If this is the case, the required security label - * already exists in econnp->conn_effective_cred. Use this label - * to generate a new effective cred for the acceptor. - * - * We allow for potential application level retry attempts by - * checking for transient errors before modifying eager. + * already exists in econnp->conn_ixa->ixa_tsl. 
Since we make the
+	 * acceptor stream refer to econnp we automatically get that label.
 	 */
-	if (is_system_labeled() &&
-	    aconnp->conn_cred != NULL && econnp->conn_effective_cred != NULL) {
-		effective_cred = copycred_from_tslabel(aconnp->conn_cred,
-		    crgetlabel(econnp->conn_effective_cred), KM_NOSLEEP);
-		if (effective_cred == NULL)
-			return (ENOMEM);
-	}
 
 	acceptor->tcp_detached = B_TRUE;
 	/*
@@ -2416,18 +2269,20 @@ tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager)
 	ASSERT(eager->tcp_eager_next_q0 == NULL &&
 	    eager->tcp_eager_prev_q0 == NULL);
 	mutex_exit(&listener->tcp_eager_lock);
-	eager->tcp_rq = acceptor->tcp_rq;
-	eager->tcp_wq = acceptor->tcp_wq;
-	eager->tcp_rq->q_ptr = econnp;
-	eager->tcp_wq->q_ptr = econnp;
+	econnp = eager->tcp_connp;
+	aconnp = acceptor->tcp_connp;
+	econnp->conn_rq = aconnp->conn_rq;
+	econnp->conn_wq = aconnp->conn_wq;
+	econnp->conn_rq->q_ptr = econnp;
+	econnp->conn_wq->q_ptr = econnp;
 
 	/*
 	 * In the TLI/XTI loopback case, we are inside the listener's squeue,
 	 * which might be a different squeue from our peer TCP instance.
 	 * For TCP Fusion, the peer expects that whenever tcp_detached is
 	 * clear, our TCP queues point to the acceptor's queues.  Thus, use
-	 * membar_producer() to ensure that the assignments of tcp_rq/tcp_wq
+	 * membar_producer() to ensure that the assignments of conn_rq/conn_wq
 	 * above reach global visibility prior to the clearing of tcp_detached.
*/ membar_producer(); @@ -2439,419 +2294,187 @@ tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager) econnp->conn_minor_arena = aconnp->conn_minor_arena; ASSERT(econnp->conn_minor_arena != NULL); - if (eager->tcp_cred != NULL) - crfree(eager->tcp_cred); - eager->tcp_cred = econnp->conn_cred = aconnp->conn_cred; - if (econnp->conn_effective_cred != NULL) - crfree(econnp->conn_effective_cred); - econnp->conn_effective_cred = effective_cred; + if (econnp->conn_cred != NULL) + crfree(econnp->conn_cred); + econnp->conn_cred = aconnp->conn_cred; aconnp->conn_cred = NULL; - ASSERT(aconnp->conn_effective_cred == NULL); - + econnp->conn_cpid = aconnp->conn_cpid; ASSERT(econnp->conn_netstack == aconnp->conn_netstack); ASSERT(eager->tcp_tcps == acceptor->tcp_tcps); econnp->conn_zoneid = aconnp->conn_zoneid; econnp->conn_allzones = aconnp->conn_allzones; + econnp->conn_ixa->ixa_zoneid = aconnp->conn_ixa->ixa_zoneid; + econnp->conn_mac_mode = aconnp->conn_mac_mode; + econnp->conn_zone_is_global = aconnp->conn_zone_is_global; aconnp->conn_mac_mode = CONN_MAC_DEFAULT; /* Do the IPC initialization */ CONN_INC_REF(econnp); - econnp->conn_multicast_loop = aconnp->conn_multicast_loop; - econnp->conn_af_isv6 = aconnp->conn_af_isv6; - econnp->conn_pkt_isv6 = aconnp->conn_pkt_isv6; + econnp->conn_family = aconnp->conn_family; + econnp->conn_ipversion = aconnp->conn_ipversion; /* Done with old IPC. Drop its ref on its connp */ CONN_DEC_REF(aconnp); - return (0); } /* * Adapt to the information, such as rtt and rtt_sd, provided from the - * ire cached in conn_cache_ire. If no ire cached, do a ire lookup. + * DCE and IRE maintained by IP. * * Checks for multicast and broadcast destination address. - * Returns zero on failure; non-zero if ok. + * Returns zero if ok; an errno on failure. * * Note that the MSS calculation here is based on the info given in - * the IRE. We do not do any calculation based on TCP options. 
They - * will be handled in tcp_rput_other() and tcp_rput_data() when TCP - * knows which options to use. + * the DCE and IRE. We do not do any calculation based on TCP options. They + * will be handled in tcp_input_data() when TCP knows which options to use. * * Note on how TCP gets its parameters for a connection. * * When a tcp_t structure is allocated, it gets all the default parameters. - * In tcp_adapt_ire(), it gets those metric parameters, like rtt, rtt_sd, + * In tcp_set_destination(), it gets those metric parameters, like rtt, rtt_sd, * spipe, rpipe, ... from the route metrics. Route metric overrides the * default. * - * An incoming SYN with a multicast or broadcast destination address, is dropped - * in 1 of 2 places. - * - * 1. If the packet was received over the wire it is dropped in - * ip_rput_process_broadcast() - * - * 2. If the packet was received through internal IP loopback, i.e. the packet - * was generated and received on the same machine, it is dropped in - * ip_wput_local() + * An incoming SYN with a multicast or broadcast destination address is dropped + * in ip_fanout_v4/v6. * * An incoming SYN with a multicast or broadcast source address is always - * dropped in tcp_adapt_ire. The same logic in tcp_adapt_ire also serves to + * dropped in tcp_set_destination, since IPDF_ALLOW_MCBC is not set in + * conn_connect. + * The same logic in tcp_set_destination also serves to * reject an attempt to connect to a broadcast or multicast (destination) * address. 
*/ static int -tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp) +tcp_set_destination(tcp_t *tcp) { - ire_t *ire; - ire_t *sire = NULL; - iulp_t *ire_uinfo = NULL; uint32_t mss_max; uint32_t mss; boolean_t tcp_detached = TCP_IS_DETACHED(tcp); conn_t *connp = tcp->tcp_connp; - boolean_t ire_cacheable = B_FALSE; - zoneid_t zoneid = connp->conn_zoneid; - int match_flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT | - MATCH_IRE_SECATTR; - ts_label_t *tsl = crgetlabel(CONN_CRED(connp)); - ill_t *ill = NULL; - boolean_t incoming = (ire_mp == NULL); tcp_stack_t *tcps = tcp->tcp_tcps; - ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; - - ASSERT(connp->conn_ire_cache == NULL); - - if (tcp->tcp_ipversion == IPV4_VERSION) { + iulp_t uinfo; + int error; + uint32_t flags; - if (CLASSD(tcp->tcp_connp->conn_rem)) { - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards); - return (0); - } - /* - * If IP_NEXTHOP is set, then look for an IRE_CACHE - * for the destination with the nexthop as gateway. - * ire_ctable_lookup() is used because this particular - * ire, if it exists, will be marked private. - * If that is not available, use the interface ire - * for the nexthop. - * - * TSol: tcp_update_label will detect label mismatches based - * only on the destination's label, but that would not - * detect label mismatches based on the security attributes - * of routes or next hop gateway. Hence we need to pass the - * label to ire_ftable_lookup below in order to locate the - * right prefix (and/or) ire cache. Similarly we also need - * pass the label to the ire_cache_lookup below to locate - * the right ire that also matches on the label. 
- */ - if (tcp->tcp_connp->conn_nexthop_set) { - ire = ire_ctable_lookup(tcp->tcp_connp->conn_rem, - tcp->tcp_connp->conn_nexthop_v4, 0, NULL, zoneid, - tsl, MATCH_IRE_MARK_PRIVATE_ADDR | MATCH_IRE_GW, - ipst); - if (ire == NULL) { - ire = ire_ftable_lookup( - tcp->tcp_connp->conn_nexthop_v4, - 0, 0, IRE_INTERFACE, NULL, NULL, zoneid, 0, - tsl, match_flags, ipst); - if (ire == NULL) - return (0); - } else { - ire_uinfo = &ire->ire_uinfo; - } - } else { - ire = ire_cache_lookup(tcp->tcp_connp->conn_rem, - zoneid, tsl, ipst); - if (ire != NULL) { - ire_cacheable = B_TRUE; - ire_uinfo = (ire_mp != NULL) ? - &((ire_t *)ire_mp->b_rptr)->ire_uinfo: - &ire->ire_uinfo; + flags = IPDF_LSO | IPDF_ZCOPY; + /* + * Make sure we have a dce for the destination to avoid dce_ident + * contention for connected sockets. + */ + flags |= IPDF_UNIQUE_DCE; - } else { - if (ire_mp == NULL) { - ire = ire_ftable_lookup( - tcp->tcp_connp->conn_rem, - 0, 0, 0, NULL, &sire, zoneid, 0, - tsl, (MATCH_IRE_RECURSIVE | - MATCH_IRE_DEFAULT), ipst); - if (ire == NULL) - return (0); - ire_uinfo = (sire != NULL) ? - &sire->ire_uinfo : - &ire->ire_uinfo; - } else { - ire = (ire_t *)ire_mp->b_rptr; - ire_uinfo = - &((ire_t *) - ire_mp->b_rptr)->ire_uinfo; - } - } - } - ASSERT(ire != NULL); + if (!tcps->tcps_ignore_path_mtu) + connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY; - if ((ire->ire_src_addr == INADDR_ANY) || - (ire->ire_type & IRE_BROADCAST)) { - /* - * ire->ire_mp is non null when ire_mp passed in is used - * ire->ire_mp is set in ip_bind_insert_ire[_v6](). 
- */
-		if (ire->ire_mp == NULL)
-			ire_refrele(ire);
-		if (sire != NULL)
-			ire_refrele(sire);
-		return (0);
-	}
-
-	if (tcp->tcp_ipha->ipha_src == INADDR_ANY) {
-		ipaddr_t	src_addr;
+	/* Use conn_lock to satisfy ASSERT; tcp is already serialized */
+	mutex_enter(&connp->conn_lock);
+	error = conn_connect(connp, &uinfo, flags);
+	mutex_exit(&connp->conn_lock);
+	if (error != 0)
+		return (error);
 
-		/*
-		 * ip_bind_connected() has stored the correct source
-		 * address in conn_src.
-		 */
-		src_addr = tcp->tcp_connp->conn_src;
-		tcp->tcp_ipha->ipha_src = src_addr;
-		/*
-		 * Copy of the src addr. in tcp_t is needed
-		 * for the lookup funcs.
-		 */
-		IN6_IPADDR_TO_V4MAPPED(src_addr, &tcp->tcp_ip_src_v6);
-	}
-	/*
-	 * Set the fragment bit so that IP will tell us if the MTU
-	 * should change. IP tells us the latest setting of
-	 * ip_path_mtu_discovery through ire_frag_flag.
-	 */
-	if (ipst->ips_ip_path_mtu_discovery) {
-		tcp->tcp_ipha->ipha_fragment_offset_and_flags =
-		    htons(IPH_DF);
-	}
-	/*
-	 * If ire_uinfo is NULL, this is the IRE_INTERFACE case
-	 * for IP_NEXTHOP. No cache ire has been found for the
-	 * destination and we are working with the nexthop's
-	 * interface ire. Since we need to forward all packets
-	 * to the nexthop first, we "blindly" set tcp_localnet
-	 * to false, eventhough the destination may also be
-	 * onlink.
-	 */
-	if (ire_uinfo == NULL)
-		tcp->tcp_localnet = 0;
-	else
-		tcp->tcp_localnet = (ire->ire_gateway_addr == 0);
-	} else {
-		/*
-		 * For incoming connection ire_mp = NULL
-		 * For outgoing connection ire_mp != NULL
-		 * Technically we should check conn_incoming_ill
-		 * when ire_mp is NULL and conn_outgoing_ill when
-		 * ire_mp is non-NULL. But this is performance
-		 * critical path and for IPV*_BOUND_IF, outgoing
-		 * and incoming ill are always set to the same value.
- */ - ill_t *dst_ill = NULL; - ipif_t *dst_ipif = NULL; + error = tcp_build_hdrs(tcp); + if (error != 0) + return (error); - ASSERT(connp->conn_outgoing_ill == connp->conn_incoming_ill); + tcp->tcp_localnet = uinfo.iulp_localnet; - if (connp->conn_outgoing_ill != NULL) { - /* Outgoing or incoming path */ - int err; + if (uinfo.iulp_rtt != 0) { + clock_t rto; - dst_ill = conn_get_held_ill(connp, - &connp->conn_outgoing_ill, &err); - if (err == ILL_LOOKUP_FAILED || dst_ill == NULL) { - ip1dbg(("tcp_adapt_ire: ill_lookup failed\n")); - return (0); - } - match_flags |= MATCH_IRE_ILL; - dst_ipif = dst_ill->ill_ipif; - } - ire = ire_ctable_lookup_v6(&tcp->tcp_connp->conn_remv6, - 0, 0, dst_ipif, zoneid, tsl, match_flags, ipst); + tcp->tcp_rtt_sa = uinfo.iulp_rtt; + tcp->tcp_rtt_sd = uinfo.iulp_rtt_sd; + rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + + tcps->tcps_rexmit_interval_extra + + (tcp->tcp_rtt_sa >> 5); - if (ire != NULL) { - ire_cacheable = B_TRUE; - ire_uinfo = (ire_mp != NULL) ? - &((ire_t *)ire_mp->b_rptr)->ire_uinfo: - &ire->ire_uinfo; + if (rto > tcps->tcps_rexmit_interval_max) { + tcp->tcp_rto = tcps->tcps_rexmit_interval_max; + } else if (rto < tcps->tcps_rexmit_interval_min) { + tcp->tcp_rto = tcps->tcps_rexmit_interval_min; } else { - if (ire_mp == NULL) { - ire = ire_ftable_lookup_v6( - &tcp->tcp_connp->conn_remv6, - 0, 0, 0, dst_ipif, &sire, zoneid, - 0, tsl, match_flags, ipst); - if (ire == NULL) { - if (dst_ill != NULL) - ill_refrele(dst_ill); - return (0); - } - ire_uinfo = (sire != NULL) ? 
&sire->ire_uinfo : - &ire->ire_uinfo; - } else { - ire = (ire_t *)ire_mp->b_rptr; - ire_uinfo = - &((ire_t *)ire_mp->b_rptr)->ire_uinfo; - } - } - if (dst_ill != NULL) - ill_refrele(dst_ill); - - ASSERT(ire != NULL); - ASSERT(ire_uinfo != NULL); - - if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6) || - IN6_IS_ADDR_MULTICAST(&ire->ire_addr_v6)) { - /* - * ire->ire_mp is non null when ire_mp passed in is used - * ire->ire_mp is set in ip_bind_insert_ire[_v6](). - */ - if (ire->ire_mp == NULL) - ire_refrele(ire); - if (sire != NULL) - ire_refrele(sire); - return (0); + tcp->tcp_rto = rto; } - - if (IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_ip6h->ip6_src)) { - in6_addr_t src_addr; - - /* - * ip_bind_connected_v6() has stored the correct source - * address per IPv6 addr. selection policy in - * conn_src_v6. - */ - src_addr = tcp->tcp_connp->conn_srcv6; - - tcp->tcp_ip6h->ip6_src = src_addr; - /* - * Copy of the src addr. in tcp_t is needed - * for the lookup funcs. - */ - tcp->tcp_ip_src_v6 = src_addr; - ASSERT(IN6_ARE_ADDR_EQUAL(&tcp->tcp_ip6h->ip6_src, - &connp->conn_srcv6)); + } + if (uinfo.iulp_ssthresh != 0) + tcp->tcp_cwnd_ssthresh = uinfo.iulp_ssthresh; + else + tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN; + if (uinfo.iulp_spipe > 0) { + connp->conn_sndbuf = MIN(uinfo.iulp_spipe, + tcps->tcps_max_buf); + if (tcps->tcps_snd_lowat_fraction != 0) { + connp->conn_sndlowat = connp->conn_sndbuf / + tcps->tcps_snd_lowat_fraction; } - tcp->tcp_localnet = - IN6_IS_ADDR_UNSPECIFIED(&ire->ire_gateway_addr_v6); + (void) tcp_maxpsz_set(tcp, B_TRUE); } - /* - * This allows applications to fail quickly when connections are made - * to dead hosts. Hosts can be labeled dead by adding a reject route - * with both the RTF_REJECT and RTF_PRIVATE flags set. + * Note that up till now, acceptor always inherits receive + * window from the listener. But if there is a metrics + * associated with a host, we should use that instead of + * inheriting it from listener. 
Thus we need to pass this
+	 * info back to the caller.
 	 */
-	if ((ire->ire_flags & RTF_REJECT) &&
-	    (ire->ire_flags & RTF_PRIVATE))
-		goto error;
+	if (uinfo.iulp_rpipe > 0) {
+		tcp->tcp_rwnd = MIN(uinfo.iulp_rpipe,
+		    tcps->tcps_max_buf);
+	}
+
+	if (uinfo.iulp_rtomax > 0) {
+		tcp->tcp_second_timer_threshold =
+		    uinfo.iulp_rtomax;
+	}
 
 	/*
-	 * Make use of the cached rtt and rtt_sd values to calculate the
-	 * initial RTO. Note that they are already initialized in
-	 * tcp_init_values().
-	 * If ire_uinfo is NULL, i.e., we do not have a cache ire for
-	 * IP_NEXTHOP, but instead are using the interface ire for the
-	 * nexthop, then we do not use the ire_uinfo from that ire to
-	 * do any initializations.
+	 * Use the metric option settings, iulp_tstamp_ok and
+	 * iulp_wscale_ok, only for active open. What this means
+	 * is that if the other side uses timestamp or window
+	 * scale option, TCP will also use those options. That
+	 * is for passive open. If the application sets a
+	 * large window, window scale is enabled regardless of
+	 * the value in iulp_wscale_ok. This is the behavior
+	 * since 2.6. So we keep it.
+	 * The only case left in passive open processing is the
+	 * check for SACK.
+	 * For ECN, it should probably be like SACK. But the
+	 * current value is binary, so we treat it like the other
+	 * cases. The metric only controls active open. For passive
+	 * open, the ndd param, tcp_ecn_permitted, controls the
+	 * behavior.
*/ - if (ire_uinfo != NULL) { - if (ire_uinfo->iulp_rtt != 0) { - clock_t rto; - - tcp->tcp_rtt_sa = ire_uinfo->iulp_rtt; - tcp->tcp_rtt_sd = ire_uinfo->iulp_rtt_sd; - rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + - tcps->tcps_rexmit_interval_extra + - (tcp->tcp_rtt_sa >> 5); - - if (rto > tcps->tcps_rexmit_interval_max) { - tcp->tcp_rto = tcps->tcps_rexmit_interval_max; - } else if (rto < tcps->tcps_rexmit_interval_min) { - tcp->tcp_rto = tcps->tcps_rexmit_interval_min; - } else { - tcp->tcp_rto = rto; - } - } - if (ire_uinfo->iulp_ssthresh != 0) - tcp->tcp_cwnd_ssthresh = ire_uinfo->iulp_ssthresh; - else - tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN; - if (ire_uinfo->iulp_spipe > 0) { - tcp->tcp_xmit_hiwater = MIN(ire_uinfo->iulp_spipe, - tcps->tcps_max_buf); - if (tcps->tcps_snd_lowat_fraction != 0) - tcp->tcp_xmit_lowater = tcp->tcp_xmit_hiwater / - tcps->tcps_snd_lowat_fraction; - (void) tcp_maxpsz_set(tcp, B_TRUE); - } + if (!tcp_detached) { /* - * Note that up till now, acceptor always inherits receive - * window from the listener. But if there is a metrics - * associated with a host, we should use that instead of - * inheriting it from listener. Thus we need to pass this - * info back to the caller. + * The if check means that the following can only + * be turned on by the metrics only IRE, but not off. */ - if (ire_uinfo->iulp_rpipe > 0) { - tcp->tcp_rwnd = MIN(ire_uinfo->iulp_rpipe, - tcps->tcps_max_buf); - } - - if (ire_uinfo->iulp_rtomax > 0) { - tcp->tcp_second_timer_threshold = - ire_uinfo->iulp_rtomax; - } - + if (uinfo.iulp_tstamp_ok) + tcp->tcp_snd_ts_ok = B_TRUE; + if (uinfo.iulp_wscale_ok) + tcp->tcp_snd_ws_ok = B_TRUE; + if (uinfo.iulp_sack == 2) + tcp->tcp_snd_sack_ok = B_TRUE; + if (uinfo.iulp_ecn_ok) + tcp->tcp_ecn_ok = B_TRUE; + } else { /* - * Use the metric option settings, iulp_tstamp_ok and - * iulp_wscale_ok, only for active open. 
What this means - * is that if the other side uses timestamp or window - * scale option, TCP will also use those options. That - * is for passive open. If the application sets a - * large window, window scale is enabled regardless of - * the value in iulp_wscale_ok. This is the behavior - * since 2.6. So we keep it. - * The only case left in passive open processing is the - * check for SACK. - * For ECN, it should probably be like SACK. But the - * current value is binary, so we treat it like the other - * cases. The metric only controls active open.For passive - * open, the ndd param, tcp_ecn_permitted, controls the - * behavior. + * Passive open. + * + * As above, the if check means that SACK can only be + * turned on by the metric only IRE. */ - if (!tcp_detached) { - /* - * The if check means that the following can only - * be turned on by the metrics only IRE, but not off. - */ - if (ire_uinfo->iulp_tstamp_ok) - tcp->tcp_snd_ts_ok = B_TRUE; - if (ire_uinfo->iulp_wscale_ok) - tcp->tcp_snd_ws_ok = B_TRUE; - if (ire_uinfo->iulp_sack == 2) - tcp->tcp_snd_sack_ok = B_TRUE; - if (ire_uinfo->iulp_ecn_ok) - tcp->tcp_ecn_ok = B_TRUE; - } else { - /* - * Passive open. - * - * As above, the if check means that SACK can only be - * turned on by the metric only IRE. - */ - if (ire_uinfo->iulp_sack > 0) { - tcp->tcp_snd_sack_ok = B_TRUE; - } + if (uinfo.iulp_sack > 0) { + tcp->tcp_snd_sack_ok = B_TRUE; } } - /* - * XXX: Note that currently, ire_max_frag can be as small as 68 + * XXX Note that currently, iulp_mtu can be as small as 68 * because of PMTUd. So tcp_mss may go to negative if combined * length of all those options exceeds 28 bytes. But because * of the tcp_mss_min check below, we may not have a problem if @@ -2864,31 +2487,15 @@ tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp) * We do not deal with that now. All those problems related to * PMTUd will be fixed later. 
*/ - ASSERT(ire->ire_max_frag != 0); - mss = tcp->tcp_if_mtu = ire->ire_max_frag; - if (tcp->tcp_ipp_fields & IPPF_USE_MIN_MTU) { - if (tcp->tcp_ipp_use_min_mtu == IPV6_USE_MIN_MTU_NEVER) { - mss = MIN(mss, IPV6_MIN_MTU); - } - } + ASSERT(uinfo.iulp_mtu != 0); + mss = tcp->tcp_initial_pmtu = uinfo.iulp_mtu; /* Sanity check for MSS value. */ - if (tcp->tcp_ipversion == IPV4_VERSION) + if (connp->conn_ipversion == IPV4_VERSION) mss_max = tcps->tcps_mss_max_ipv4; else mss_max = tcps->tcps_mss_max_ipv6; - if (tcp->tcp_ipversion == IPV6_VERSION && - (ire->ire_frag_flag & IPH_FRAG_HDR)) { - /* - * After receiving an ICMPv6 "packet too big" message with a - * MTU < 1280, and for multirouted IPv6 packets, the IP layer - * will insert a 8-byte fragment header in every packet; we - * reduce the MSS by that amount here. - */ - mss -= sizeof (ip6_frag_t); - } - if (tcp->tcp_ipsec_overhead == 0) tcp->tcp_ipsec_overhead = conn_ipsec_length(connp); @@ -2903,71 +2510,28 @@ tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp) tcp->tcp_mss = mss; /* + * Update the tcp connection with LSO capability. + */ + tcp_update_lso(tcp, connp->conn_ixa); + + /* * Initialize the ISS here now that we have the full connection ID. * The RFC 1948 method of initial sequence number generation requires * knowledge of the full connection ID before setting the ISS. */ - tcp_iss_init(tcp); - if (ire->ire_type & (IRE_LOOPBACK | IRE_LOCAL)) - tcp->tcp_loopback = B_TRUE; - - if (sire != NULL) - IRE_REFRELE(sire); - - /* - * If we got an IRE_CACHE and an ILL, go through their properties; - * otherwise, this is deferred until later when we have an IRE_CACHE. - */ - if (tcp->tcp_loopback || - (ire_cacheable && (ill = ire_to_ill(ire)) != NULL)) { - /* - * For incoming, see if this tcp may be MDT-capable. For - * outgoing, this process has been taken care of through - * tcp_rput_other. 
- */ - tcp_ire_ill_check(tcp, ire, ill, incoming); - tcp->tcp_ire_ill_check_done = B_TRUE; - } + tcp->tcp_loopback = (uinfo.iulp_loopback | uinfo.iulp_local); - mutex_enter(&connp->conn_lock); /* * Make sure that conn is not marked incipient * for incoming connections. A blind * removal of incipient flag is cheaper than * check and removal. */ + mutex_enter(&connp->conn_lock); connp->conn_state_flags &= ~CONN_INCIPIENT; - - /* - * Must not cache forwarding table routes - * or recache an IRE after the conn_t has - * had conn_ire_cache cleared and is flagged - * unusable, (see the CONN_CACHE_IRE() macro). - */ - if (ire_cacheable && CONN_CACHE_IRE(connp)) { - rw_enter(&ire->ire_bucket->irb_lock, RW_READER); - if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) { - connp->conn_ire_cache = ire; - IRE_UNTRACE_REF(ire); - rw_exit(&ire->ire_bucket->irb_lock); - mutex_exit(&connp->conn_lock); - return (1); - } - rw_exit(&ire->ire_bucket->irb_lock); - } mutex_exit(&connp->conn_lock); - - if (ire->ire_mp == NULL) - ire_refrele(ire); - return (1); - -error: - if (ire->ire_mp == NULL) - ire_refrele(ire); - if (sire != NULL) - ire_refrele(sire); return (0); } @@ -3001,7 +2565,7 @@ tcp_tpi_bind(tcp_t *tcp, mblk_t *mp) ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) { - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_tpi_bind: bad req, len %u", (uint_t)(mp->b_wptr - mp->b_rptr)); @@ -3010,7 +2574,7 @@ tcp_tpi_bind(tcp_t *tcp, mblk_t *mp) return; } /* Make sure the largest address fits */ - mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t) + 1, 1); + mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1); if (mp1 == NULL) { tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); return; @@ -3024,7 +2588,7 @@ tcp_tpi_bind(tcp_t *tcp, mblk_t *mp) switch (len) { case 0: /* request for a generic port */ tbr->ADDR_offset = sizeof (struct T_bind_req); - if 
(tcp->tcp_family == AF_INET) { + if (connp->conn_family == AF_INET) { tbr->ADDR_length = sizeof (sin_t); sin = (sin_t *)&tbr[1]; *sin = sin_null; @@ -3033,7 +2597,7 @@ tcp_tpi_bind(tcp_t *tcp, mblk_t *mp) len = sizeof (sin_t); mp->b_wptr = (uchar_t *)&sin[1]; } else { - ASSERT(tcp->tcp_family == AF_INET6); + ASSERT(connp->conn_family == AF_INET6); tbr->ADDR_length = sizeof (sin6_t); sin6 = (sin6_t *)&tbr[1]; *sin6 = sin6_null; @@ -3055,7 +2619,7 @@ tcp_tpi_bind(tcp_t *tcp, mblk_t *mp) break; default: - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_tpi_bind: bad address length, %d", tbr->ADDR_length); @@ -3080,16 +2644,16 @@ done: /* * Update port information as sockfs/tpi needs it for checking */ - if (tcp->tcp_family == AF_INET) { + if (connp->conn_family == AF_INET) { sin = (sin_t *)sa; - sin->sin_port = tcp->tcp_lport; + sin->sin_port = connp->conn_lport; } else { sin6 = (sin6_t *)sa; - sin6->sin6_port = tcp->tcp_lport; + sin6->sin6_port = connp->conn_lport; } mp->b_datap->db_type = M_PCPROTO; tbr->PRIM_type = T_BIND_ACK; - putnext(tcp->tcp_rq, mp); + putnext(connp->conn_rq, mp); } } @@ -3139,7 +2703,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, * Set loopmax appropriately so that one does not look * forever in the case all of the anonymous ports are in use. */ - if (tcp->tcp_anon_priv_bind) { + if (connp->conn_anon_priv_bind) { /* * loopmax = * (IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1 @@ -3175,7 +2739,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, mutex_enter(&tbf->tf_lock); for (ltcp = tbf->tf_tcp; ltcp != NULL; ltcp = ltcp->tcp_bind_hash) { - if (lport == ltcp->tcp_lport) + if (lport == ltcp->tcp_connp->conn_lport) break; } @@ -3191,7 +2755,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, * privilege as being in all zones, as there's * otherwise no way to identify the right receiver. 
*/ - if (!IPCL_BIND_ZONE_MATCH(ltcp->tcp_connp, connp)) + if (!IPCL_BIND_ZONE_MATCH(lconnp, connp)) continue; /* @@ -3227,7 +2791,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, * added. * * if (ltcp->tcp_state == TCPS_LISTEN || - * !reuseaddr || !ltcp->tcp_reuseaddr) { + * !reuseaddr || !lconnp->conn_reuseaddr) { * ... * } * @@ -3243,17 +2807,18 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, */ not_socket = !(TCP_IS_SOCKET(ltcp) && TCP_IS_SOCKET(tcp)); - exclbind = ltcp->tcp_exclbind || tcp->tcp_exclbind; + exclbind = lconnp->conn_exclbind || + connp->conn_exclbind; if ((lconnp->conn_mac_mode != CONN_MAC_DEFAULT) || (connp->conn_mac_mode != CONN_MAC_DEFAULT) || (exclbind && (not_socket || ltcp->tcp_state <= TCPS_ESTABLISHED))) { if (V6_OR_V4_INADDR_ANY( - ltcp->tcp_bound_source_v6) || + lconnp->conn_bound_addr_v6) || V6_OR_V4_INADDR_ANY(*laddr) || IN6_ARE_ADDR_EQUAL(laddr, - <cp->tcp_bound_source_v6)) { + &lconnp->conn_bound_addr_v6)) { break; } continue; @@ -3266,7 +2831,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, * specific port. We use the same autoassigned port * number space for IPv4 and IPv6 sockets. 
*/ - if (tcp->tcp_ipversion != ltcp->tcp_ipversion && + if (connp->conn_ipversion != lconnp->conn_ipversion && bind_to_req_port_only) continue; @@ -3281,9 +2846,9 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, */ if (quick_connect && (ltcp->tcp_state > TCPS_LISTEN) && - ((tcp->tcp_fport != ltcp->tcp_fport) || - !IN6_ARE_ADDR_EQUAL(&tcp->tcp_remote_v6, - <cp->tcp_remote_v6))) + ((connp->conn_fport != lconnp->conn_fport) || + !IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6, + &lconnp->conn_faddr_v6))) continue; if (!reuseaddr) { @@ -3299,9 +2864,9 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, */ if (!V6_OR_V4_INADDR_ANY(*laddr) && !V6_OR_V4_INADDR_ANY( - ltcp->tcp_bound_source_v6) && + lconnp->conn_bound_addr_v6) && !IN6_ARE_ADDR_EQUAL(laddr, - <cp->tcp_bound_source_v6)) + &lconnp->conn_bound_addr_v6)) continue; if (ltcp->tcp_state >= TCPS_BOUND) { /* @@ -3327,7 +2892,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, * SO_REUSEADDR setting, so we break. */ if (IN6_ARE_ADDR_EQUAL(laddr, - <cp->tcp_bound_source_v6) && + &lconnp->conn_bound_addr_v6) && (ltcp->tcp_state == TCPS_LISTEN || ltcp->tcp_state == TCPS_BOUND)) break; @@ -3343,11 +2908,10 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, * number. */ tcp->tcp_state = TCPS_BOUND; - tcp->tcp_lport = htons(port); - *(uint16_t *)tcp->tcp_tcph->th_lport = tcp->tcp_lport; + connp->conn_lport = htons(port); ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH( - tcp->tcp_lport)] == tbf); + connp->conn_lport)] == tbf); tcp_bind_hash_insert(tbf, tcp, 1); mutex_exit(&tbf->tf_lock); @@ -3364,12 +2928,12 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, * is updated. After the update, it may or may not * be in the valid range. 
*/ - if (!tcp->tcp_anon_priv_bind) + if (!connp->conn_anon_priv_bind) tcps->tcps_next_port_to_try = port + 1; return (port); } - if (tcp->tcp_anon_priv_bind) { + if (connp->conn_anon_priv_bind) { port = tcp_get_next_priv_port(tcp); } else { if (count == 0 && user_specified) { @@ -3402,12 +2966,13 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, * tcp_clean_death / tcp_close_detached must not be called more than once * on a tcp. Thus every function that potentially calls tcp_clean_death * must check for the tcp state before calling tcp_clean_death. - * Eg. tcp_input, tcp_rput_data, tcp_eager_kill, tcp_clean_death_wrapper, + * Eg. tcp_input_data, tcp_eager_kill, tcp_clean_death_wrapper, * tcp_timer_handler, all check for the tcp state. */ /* ARGSUSED */ void -tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2) +tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy) { tcp_t *tcp = ((conn_t *)arg)->conn_tcp; @@ -3449,11 +3014,11 @@ tcp_clean_death(tcp_t *tcp, int err, uint8_t tag) } ASSERT(tcp != NULL); - ASSERT((tcp->tcp_family == AF_INET && - tcp->tcp_ipversion == IPV4_VERSION) || - (tcp->tcp_family == AF_INET6 && - (tcp->tcp_ipversion == IPV4_VERSION || - tcp->tcp_ipversion == IPV6_VERSION))); + ASSERT((connp->conn_family == AF_INET && + connp->conn_ipversion == IPV4_VERSION) || + (connp->conn_family == AF_INET6 && + (connp->conn_ipversion == IPV4_VERSION || + connp->conn_ipversion == IPV6_VERSION))); if (TCP_IS_DETACHED(tcp)) { if (tcp->tcp_hard_binding) { @@ -3483,7 +3048,7 @@ tcp_clean_death(tcp_t *tcp, int err, uint8_t tag) TCP_STAT(tcps, tcp_clean_death_nondetached); - q = tcp->tcp_rq; + q = connp->conn_rq; /* Trash all inbound data */ if (!IPCL_IS_NONSTR(connp)) { @@ -3506,7 +3071,7 @@ tcp_clean_death(tcp_t *tcp, int err, uint8_t tag) */ (void) putnextctl1(q, M_FLUSH, FLUSHR); } - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, "tcp_clean_death: discon err 
%d", err); } @@ -3519,7 +3084,7 @@ tcp_clean_death(tcp_t *tcp, int err, uint8_t tag) if (mp != NULL) { putnext(q, mp); } else { - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_clean_death, sending M_ERROR"); @@ -3552,6 +3117,7 @@ tcp_stop_lingering(tcp_t *tcp) { clock_t delta = 0; tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; tcp->tcp_linger_tid = 0; if (tcp->tcp_state > TCPS_LISTEN) { @@ -3568,15 +3134,14 @@ tcp_stop_lingering(tcp_t *tcp) } /* * Need to cancel those timers which will not be used when - * TCP is detached. This has to be done before the tcp_wq - * is set to the global queue. + * TCP is detached. This has to be done before the conn_wq + * is cleared. */ tcp_timers_stop(tcp); tcp->tcp_detached = B_TRUE; - ASSERT(tcps->tcps_g_q != NULL); - tcp->tcp_rq = tcps->tcps_g_q; - tcp->tcp_wq = WR(tcps->tcps_g_q); + connp->conn_rq = NULL; + connp->conn_wq = NULL; if (tcp->tcp_state == TCPS_TIME_WAIT) { tcp_time_wait_append(tcp); @@ -3595,16 +3160,14 @@ tcp_stop_lingering(tcp_t *tcp) } } else { tcp_closei_local(tcp); - CONN_DEC_REF(tcp->tcp_connp); + CONN_DEC_REF(connp); } finish: /* Signal closing thread that it can complete close */ mutex_enter(&tcp->tcp_closelock); tcp->tcp_detached = B_TRUE; - ASSERT(tcps->tcps_g_q != NULL); - - tcp->tcp_rq = tcps->tcps_g_q; - tcp->tcp_wq = WR(tcps->tcps_g_q); + connp->conn_rq = NULL; + connp->conn_wq = NULL; tcp->tcp_closed = 1; cv_signal(&tcp->tcp_closecv); @@ -3636,9 +3199,9 @@ tcp_close_common(conn_t *connp, int flags) ASSERT(connp->conn_ref >= 2); /* - * Mark the conn as closing. ill_pending_mp_add will not + * Mark the conn as closing. ipsq_pending_mp_add will not * add any mp to the pending mp list, after this conn has - * started closing. Same for sq_pending_mp_add + * started closing. 
*/ mutex_enter(&connp->conn_lock); connp->conn_state_flags |= CONN_CLOSING; @@ -3664,7 +3227,7 @@ tcp_close_common(conn_t *connp, int flags) TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15); SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_close_output, connp, - tcp_squeue_flag, SQTAG_IP_TCP_CLOSE); + NULL, tcp_squeue_flag, SQTAG_IP_TCP_CLOSE); mutex_enter(&tcp->tcp_closelock); while (!tcp->tcp_closed) { @@ -3684,13 +3247,13 @@ tcp_close_common(conn_t *connp, int flags) * thread is higher priority than the squeue worker * thread and is bound to the same cpu. */ - if (tcp->tcp_linger && tcp->tcp_lingertime > 0) { + if (connp->conn_linger && connp->conn_lingertime > 0) { mutex_exit(&tcp->tcp_closelock); /* Entering squeue, bump ref count. */ CONN_INC_REF(connp); bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL); SQUEUE_ENTER_ONE(connp->conn_sqp, bp, - tcp_linger_interrupted, connp, + tcp_linger_interrupted, connp, NULL, tcp_squeue_flag, SQTAG_IP_TCP_CLOSE); mutex_enter(&tcp->tcp_closelock); } @@ -3703,8 +3266,8 @@ tcp_close_common(conn_t *connp, int flags) /* * In the case of listener streams that have eagers in the q or q0 - * we wait for the eagers to drop their reference to us. tcp_rq and - * tcp_wq of the eagers point to our queues. By waiting for the + * we wait for the eagers to drop their reference to us. conn_rq and + * conn_wq of the eagers point to our queues. By waiting for the * refcnt to drop to 1, we are sure that the eagers have cleaned * up their queue pointers and also dropped their references to us. */ @@ -3716,13 +3279,12 @@ tcp_close_common(conn_t *connp, int flags) mutex_exit(&connp->conn_lock); } /* - * ioctl cleanup. The mp is queued in the - * ill_pending_mp or in the sq_pending_mp. + * ioctl cleanup. The mp is queued in the ipx_pending_mp. 
*/ if (conn_ioctl_cleanup_reqd) conn_ioctl_cleanup(connp); - tcp->tcp_cpid = -1; + connp->conn_cpid = NOPID; } static int @@ -3799,7 +3361,7 @@ tcp_tpi_close_accept(queue_t *q) /* ARGSUSED */ static void -tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2) +tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { conn_t *connp = (conn_t *)arg; tcp_t *tcp = connp->conn_tcp; @@ -3828,7 +3390,7 @@ tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2) /* ARGSUSED */ static void -tcp_close_output(void *arg, mblk_t *mp, void *arg2) +tcp_close_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { char *msg; conn_t *connp = (conn_t *)arg; @@ -3847,10 +3409,6 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2) } mutex_exit(&tcp->tcp_eager_lock); - connp->conn_mdt_ok = B_FALSE; - tcp->tcp_mdt = B_FALSE; - - connp->conn_lso_ok = B_FALSE; tcp->tcp_lso = B_FALSE; msg = NULL; @@ -3879,12 +3437,11 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2) * If SO_LINGER has set a zero linger time, abort the * connection with a reset. */ - if (tcp->tcp_linger && tcp->tcp_lingertime == 0) { + if (connp->conn_linger && connp->conn_lingertime == 0) { msg = "tcp_close, zero lingertime"; break; } - ASSERT(tcp->tcp_hard_bound || tcp->tcp_hard_binding); /* * Abort connection if there is unread data queued. */ @@ -3893,9 +3450,6 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2) break; } /* - * tcp_hard_bound is now cleared thus all packets go through - * tcp_lookup. This fact is used by tcp_detach below. - * * We have done a qwait() above which could have possibly * drained more messages in turn causing transition to a * different state. Check whether we have to do the rest @@ -3915,7 +3469,7 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2) * If lingering on close then wait until the fin is acked, * the SO_LINGER time passes, or a reset is sent/received. 
*/ - if (tcp->tcp_linger && tcp->tcp_lingertime > 0 && + if (connp->conn_linger && connp->conn_lingertime > 0 && !(tcp->tcp_fin_acked) && tcp->tcp_state >= TCPS_ESTABLISHED) { if (tcp->tcp_closeflags & (FNDELAY|FNONBLOCK)) { @@ -3926,7 +3480,7 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2) tcp->tcp_linger_tid = TCP_TIMER(tcp, tcp_close_linger_timeout, - tcp->tcp_lingertime * hz); + connp->conn_lingertime * hz); /* tcp_close_linger_timeout will finish close */ if (tcp->tcp_linger_tid == 0) @@ -3944,8 +3498,8 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2) } /* - * Make sure that no other thread will access the tcp_rq of - * this instance (through lookups etc.) as tcp_rq will go + * Make sure that no other thread will access the conn_rq of + * this instance (through lookups etc.) as conn_rq will go * away shortly. */ tcp_acceptor_hash_remove(tcp); @@ -3962,8 +3516,8 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2) } /* * Need to cancel those timers which will not be used when - * TCP is detached. This has to be done before the tcp_wq - * is set to the global queue. + * TCP is detached. This has to be done before the conn_wq + * is set to NULL. */ tcp_timers_stop(tcp); @@ -4004,18 +3558,6 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2) ASSERT(connp->conn_ref >= 2); finish: - /* - * Although packets are always processed on the correct - * tcp's perimeter and access is serialized via squeue's, - * IP still needs a queue when sending packets in time_wait - * state so use WR(tcps_g_q) till ip_output() can be - * changed to deal with just connp. For read side, we - * could have set tcp_rq to NULL but there are some cases - * in tcp_rput_data() from early days of this code which - * do a putnext without checking if tcp is closed. Those - * need to be identified before both tcp_rq and tcp_wq - * can be set to NULL and tcps_g_q can disappear forever. 
- */ mutex_enter(&tcp->tcp_closelock); /* * Don't change the queues in the case of a listener that has @@ -4024,13 +3566,8 @@ finish: */ if (!tcp->tcp_wait_for_eagers) { tcp->tcp_detached = B_TRUE; - /* - * When default queue is closing we set tcps_g_q to NULL - * after the close is done. - */ - ASSERT(tcps->tcps_g_q != NULL); - tcp->tcp_rq = tcps->tcps_g_q; - tcp->tcp_wq = WR(tcps->tcps_g_q); + connp->conn_rq = NULL; + connp->conn_wq = NULL; } /* Signal tcp_close() to finish closing. */ @@ -4112,8 +3649,7 @@ tcp_timers_stop(tcp_t *tcp) static void tcp_closei_local(tcp_t *tcp) { - ire_t *ire; - conn_t *connp = tcp->tcp_connp; + conn_t *connp = tcp->tcp_connp; tcp_stack_t *tcps = tcp->tcp_tcps; if (!TCP_IS_SOCKET(tcp)) @@ -4138,7 +3674,7 @@ tcp_closei_local(tcp_t *tcp) * this point, eager will be closed but we * leave it in listeners eager list so that * if listener decides to close without doing - * accept, we can clean this up. In tcp_wput_accept + * accept, we can clean this up. In tcp_tli_accept * we take care of the case of accept on closed * eager. 
*/ @@ -4150,9 +3686,9 @@ tcp_closei_local(tcp_t *tcp) * listener queue, after we have released our * reference on the listener */ - ASSERT(tcps->tcps_g_q != NULL); - tcp->tcp_rq = tcps->tcps_g_q; - tcp->tcp_wq = WR(tcps->tcps_g_q); + ASSERT(tcp->tcp_detached); + connp->conn_rq = NULL; + connp->conn_wq = NULL; CONN_DEC_REF(listener->tcp_connp); } else { mutex_exit(&listener->tcp_eager_lock); @@ -4185,20 +3721,16 @@ tcp_closei_local(tcp_t *tcp) */ if (tcp->tcp_state == TCPS_TIME_WAIT) (void) tcp_time_wait_remove(tcp, NULL); - CL_INET_DISCONNECT(connp, tcp); + CL_INET_DISCONNECT(connp); ipcl_hash_remove(connp); + ixa_cleanup(connp->conn_ixa); /* - * Delete the cached ire in conn_ire_cache and also mark - * the conn as CONDEMNED + * Mark the conn as CONDEMNED */ mutex_enter(&connp->conn_lock); connp->conn_state_flags |= CONN_CONDEMNED; - ire = connp->conn_ire_cache; - connp->conn_ire_cache = NULL; mutex_exit(&connp->conn_lock); - if (ire != NULL) - IRE_REFRELE_NOTR(ire); /* Need to cleanup any pending ioctls */ ASSERT(tcp->tcp_time_wait_next == NULL); @@ -4227,14 +3759,14 @@ tcp_closei_local(tcp_t *tcp) void tcp_free(tcp_t *tcp) { - mblk_t *mp; - ip6_pkt_t *ipp; + mblk_t *mp; + conn_t *connp = tcp->tcp_connp; ASSERT(tcp != NULL); ASSERT(tcp->tcp_ptpahn == NULL && tcp->tcp_acceptor_hash == NULL); - tcp->tcp_rq = NULL; - tcp->tcp_wq = NULL; + connp->conn_rq = NULL; + connp->conn_wq = NULL; tcp_close_mpp(&tcp->tcp_xmit_head); tcp_close_mpp(&tcp->tcp_reass_head); @@ -4281,12 +3813,12 @@ tcp_free(tcp_t *tcp) tcp->tcp_dstoptslen = 0; } ASSERT(tcp->tcp_dstoptslen == 0); - if (tcp->tcp_rtdstopts != NULL) { - mi_free(tcp->tcp_rtdstopts); - tcp->tcp_rtdstopts = NULL; - tcp->tcp_rtdstoptslen = 0; + if (tcp->tcp_rthdrdstopts != NULL) { + mi_free(tcp->tcp_rthdrdstopts); + tcp->tcp_rthdrdstopts = NULL; + tcp->tcp_rthdrdstoptslen = 0; } - ASSERT(tcp->tcp_rtdstoptslen == 0); + ASSERT(tcp->tcp_rthdrdstoptslen == 0); if (tcp->tcp_rthdr != NULL) { mi_free(tcp->tcp_rthdr); tcp->tcp_rthdr 
= NULL; @@ -4294,18 +3826,6 @@ tcp_free(tcp_t *tcp) } ASSERT(tcp->tcp_rthdrlen == 0); - ipp = &tcp->tcp_sticky_ipp; - if (ipp->ipp_fields & (IPPF_HOPOPTS | IPPF_RTDSTOPTS | IPPF_DSTOPTS | - IPPF_RTHDR)) - ip6_pkt_free(ipp); - - /* - * Free memory associated with the tcp/ip header template. - */ - - if (tcp->tcp_iphc != NULL) - bzero(tcp->tcp_iphc, tcp->tcp_iphc_len); - /* * Following is really a blowing away a union. * It happens to have exactly two members of identical size @@ -4317,17 +3837,19 @@ tcp_free(tcp_t *tcp) /* * Put a connection confirmation message upstream built from the - * address information within 'iph' and 'tcph'. Report our success or failure. + * address/flowid information with the conn and iph. Report our success or + * failure. */ static boolean_t -tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph, mblk_t *idmp, - mblk_t **defermp) +tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, mblk_t *idmp, + mblk_t **defermp, ip_recv_attr_t *ira) { sin_t sin; sin6_t sin6; mblk_t *mp; char *optp = NULL; int optlen = 0; + conn_t *connp = tcp->tcp_connp; if (defermp != NULL) *defermp = NULL; @@ -4352,20 +3874,19 @@ tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph, mblk_t *idmp, } if (IPH_HDR_VERSION(iphdr) == IPV4_VERSION) { - ipha_t *ipha = (ipha_t *)iphdr; /* packet is IPv4 */ - if (tcp->tcp_family == AF_INET) { + if (connp->conn_family == AF_INET) { sin = sin_null; - sin.sin_addr.s_addr = ipha->ipha_src; - sin.sin_port = *(uint16_t *)tcph->th_lport; + sin.sin_addr.s_addr = connp->conn_faddr_v4; + sin.sin_port = connp->conn_fport; sin.sin_family = AF_INET; mp = mi_tpi_conn_con(NULL, (char *)&sin, (int)sizeof (sin_t), optp, optlen); } else { sin6 = sin6_null; - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &sin6.sin6_addr); - sin6.sin6_port = *(uint16_t *)tcph->th_lport; + sin6.sin6_addr = connp->conn_faddr_v6; + sin6.sin6_port = connp->conn_fport; sin6.sin6_family = AF_INET6; mp = mi_tpi_conn_con(NULL, (char *)&sin6, (int)sizeof (sin6_t), optp, optlen); @@ 
-4375,10 +3896,10 @@ tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph, mblk_t *idmp, ip6_t *ip6h = (ip6_t *)iphdr; ASSERT(IPH_HDR_VERSION(iphdr) == IPV6_VERSION); - ASSERT(tcp->tcp_family == AF_INET6); + ASSERT(connp->conn_family == AF_INET6); sin6 = sin6_null; - sin6.sin6_addr = ip6h->ip6_src; - sin6.sin6_port = *(uint16_t *)tcph->th_lport; + sin6.sin6_addr = connp->conn_faddr_v6; + sin6.sin6_port = connp->conn_fport; sin6.sin6_family = AF_INET6; sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; mp = mi_tpi_conn_con(NULL, (char *)&sin6, @@ -4393,16 +3914,16 @@ tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph, mblk_t *idmp, if (defermp == NULL) { conn_t *connp = tcp->tcp_connp; if (IPCL_IS_NONSTR(connp)) { - cred_t *cr; - pid_t cpid; - - cr = msg_getcred(mp, &cpid); (*connp->conn_upcalls->su_connected) - (connp->conn_upper_handle, tcp->tcp_connid, cr, - cpid); + (connp->conn_upper_handle, tcp->tcp_connid, + ira->ira_cred, ira->ira_cpid); freemsg(mp); } else { - putnext(tcp->tcp_rq, mp); + if (ira->ira_cred != NULL) { + /* So that getpeerucred works for TPI sockfs */ + mblk_setcred(mp, ira->ira_cred, ira->ira_cpid); + } + putnext(connp->conn_rq, mp); } } else { *defermp = mp; @@ -4456,7 +3977,7 @@ tcp_drop_q0(tcp_t *tcp) */ MAKE_UNDROPPABLE(eager); - if (tcp->tcp_debug) { + if (tcp->tcp_connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE, "tcp_drop_q0: listen half-open queue (max=%d) overflow" " (%d pending) on %s, drop one", tcps->tcps_conn_req_max_q0, @@ -4469,18 +3990,19 @@ tcp_drop_q0(tcp_t *tcp) /* Put a reference on the conn as we are enqueueing it in the sqeue */ CONN_INC_REF(eager->tcp_connp); - /* Mark the IRE created for this SYN request temporary */ - tcp_ip_ire_mark_advice(eager); SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, - tcp_clean_death_wrapper, eager->tcp_connp, + tcp_clean_death_wrapper, eager->tcp_connp, NULL, SQ_FILL, SQTAG_TCP_DROP_Q0); return (B_TRUE); } -int +/* + * Handle a SYN on an AF_INET6 socket; can 
be either IPv4 or IPv6 + */ +static mblk_t * tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, - tcph_t *tcph, uint_t ipvers, mblk_t *idmp) + ip_recv_attr_t *ira) { tcp_t *ltcp = lconnp->conn_tcp; tcp_t *tcp = connp->conn_tcp; @@ -4488,36 +4010,30 @@ tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, ipha_t *ipha; ip6_t *ip6h; sin6_t sin6; - in6_addr_t v6dst; - int err; - int ifindex = 0; + uint_t ifindex = ira->ira_ruifindex; tcp_stack_t *tcps = tcp->tcp_tcps; - if (ipvers == IPV4_VERSION) { + if (ira->ira_flags & IRAF_IS_IPV4) { ipha = (ipha_t *)mp->b_rptr; - connp->conn_send = ip_output; - connp->conn_recv = tcp_input; - - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, - &connp->conn_bound_source_v6); - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_srcv6); - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_remv6); + connp->conn_ipversion = IPV4_VERSION; + IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_laddr_v6); + IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_faddr_v6); + connp->conn_saddr_v6 = connp->conn_laddr_v6; sin6 = sin6_null; - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &sin6.sin6_addr); - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst); - sin6.sin6_port = *(uint16_t *)tcph->th_lport; + sin6.sin6_addr = connp->conn_faddr_v6; + sin6.sin6_port = connp->conn_fport; sin6.sin6_family = AF_INET6; - sin6.__sin6_src_id = ip_srcid_find_addr(&v6dst, - lconnp->conn_zoneid, tcps->tcps_netstack); - if (tcp->tcp_recvdstaddr) { + sin6.__sin6_src_id = ip_srcid_find_addr(&connp->conn_laddr_v6, + IPCL_ZONEID(lconnp), tcps->tcps_netstack); + + if (connp->conn_recv_ancillary.crb_recvdstaddr) { sin6_t sin6d; sin6d = sin6_null; - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, - &sin6d.sin6_addr); - sin6d.sin6_port = *(uint16_t *)tcph->th_fport; + sin6d.sin6_addr = connp->conn_laddr_v6; + sin6d.sin6_port = connp->conn_lport; sin6d.sin6_family = AF_INET; tpi_mp = mi_tpi_extconn_ind(NULL, (char *)&sin6d, sizeof (sin6_t), @@ -4534,24 +4050,18 @@ 
tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, } else { ip6h = (ip6_t *)mp->b_rptr; - connp->conn_send = ip_output_v6; - connp->conn_recv = tcp_input; - - connp->conn_bound_source_v6 = ip6h->ip6_dst; - connp->conn_srcv6 = ip6h->ip6_dst; - connp->conn_remv6 = ip6h->ip6_src; - - /* db_cksumstuff is set at ip_fanout_tcp_v6 */ - ifindex = (int)DB_CKSUMSTUFF(mp); - DB_CKSUMSTUFF(mp) = 0; + connp->conn_ipversion = IPV6_VERSION; + connp->conn_laddr_v6 = ip6h->ip6_dst; + connp->conn_faddr_v6 = ip6h->ip6_src; + connp->conn_saddr_v6 = connp->conn_laddr_v6; sin6 = sin6_null; - sin6.sin6_addr = ip6h->ip6_src; - sin6.sin6_port = *(uint16_t *)tcph->th_lport; + sin6.sin6_addr = connp->conn_faddr_v6; + sin6.sin6_port = connp->conn_fport; sin6.sin6_family = AF_INET6; sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; - sin6.__sin6_src_id = ip_srcid_find_addr(&ip6h->ip6_dst, - lconnp->conn_zoneid, tcps->tcps_netstack); + sin6.__sin6_src_id = ip_srcid_find_addr(&connp->conn_laddr_v6, + IPCL_ZONEID(lconnp), tcps->tcps_netstack); if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) { /* Pass up the scope_id of remote addr */ @@ -4559,13 +4069,16 @@ tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, } else { sin6.sin6_scope_id = 0; } - if (tcp->tcp_recvdstaddr) { + if (connp->conn_recv_ancillary.crb_recvdstaddr) { sin6_t sin6d; sin6d = sin6_null; - sin6.sin6_addr = ip6h->ip6_dst; - sin6d.sin6_port = *(uint16_t *)tcph->th_fport; - sin6d.sin6_family = AF_INET; + sin6.sin6_addr = connp->conn_laddr_v6; + sin6d.sin6_port = connp->conn_lport; + sin6d.sin6_family = AF_INET6; + if (IN6_IS_ADDR_LINKSCOPE(&connp->conn_laddr_v6)) + sin6d.sin6_scope_id = ifindex; + tpi_mp = mi_tpi_extconn_ind(NULL, (char *)&sin6d, sizeof (sin6_t), (char *)&tcp, (t_scalar_t)sizeof (intptr_t), @@ -4579,194 +4092,40 @@ tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, } } - if (tpi_mp == NULL) - return (ENOMEM); - - connp->conn_fport = *(uint16_t *)tcph->th_lport; - 
connp->conn_lport = *(uint16_t *)tcph->th_fport; - connp->conn_flags |= (IPCL_TCP6|IPCL_EAGER); - connp->conn_fully_bound = B_FALSE; - - /* Inherit information from the "parent" */ - tcp->tcp_ipversion = ltcp->tcp_ipversion; - tcp->tcp_family = ltcp->tcp_family; - - tcp->tcp_wq = ltcp->tcp_wq; - tcp->tcp_rq = ltcp->tcp_rq; - tcp->tcp_mss = tcps->tcps_mss_def_ipv6; - tcp->tcp_detached = B_TRUE; - SOCK_CONNID_INIT(tcp->tcp_connid); - if ((err = tcp_init_values(tcp)) != 0) { - freemsg(tpi_mp); - return (err); - } - - if (ipvers == IPV4_VERSION) { - if ((err = tcp_header_init_ipv4(tcp)) != 0) { - freemsg(tpi_mp); - return (err); - } - ASSERT(tcp->tcp_ipha != NULL); - } else { - /* ifindex must be already set */ - ASSERT(ifindex != 0); - - if (ltcp->tcp_bound_if != 0) - tcp->tcp_bound_if = ltcp->tcp_bound_if; - else if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) - tcp->tcp_bound_if = ifindex; - - tcp->tcp_ipv6_recvancillary = ltcp->tcp_ipv6_recvancillary; - tcp->tcp_recvifindex = 0; - tcp->tcp_recvhops = 0xffffffffU; - ASSERT(tcp->tcp_ip6h != NULL); - } - - tcp->tcp_lport = ltcp->tcp_lport; - - if (ltcp->tcp_ipversion == tcp->tcp_ipversion) { - if (tcp->tcp_iphc_len != ltcp->tcp_iphc_len) { - /* - * Listener had options of some sort; eager inherits. - * Free up the eager template and allocate one - * of the right size. 
- */ - if (tcp->tcp_hdr_grown) { - kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len); - } else { - bzero(tcp->tcp_iphc, tcp->tcp_iphc_len); - kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc); - } - tcp->tcp_iphc = kmem_zalloc(ltcp->tcp_iphc_len, - KM_NOSLEEP); - if (tcp->tcp_iphc == NULL) { - tcp->tcp_iphc_len = 0; - freemsg(tpi_mp); - return (ENOMEM); - } - tcp->tcp_iphc_len = ltcp->tcp_iphc_len; - tcp->tcp_hdr_grown = B_TRUE; - } - tcp->tcp_hdr_len = ltcp->tcp_hdr_len; - tcp->tcp_ip_hdr_len = ltcp->tcp_ip_hdr_len; - tcp->tcp_tcp_hdr_len = ltcp->tcp_tcp_hdr_len; - tcp->tcp_ip6_hops = ltcp->tcp_ip6_hops; - tcp->tcp_ip6_vcf = ltcp->tcp_ip6_vcf; - - /* - * Copy the IP+TCP header template from listener to eager - */ - bcopy(ltcp->tcp_iphc, tcp->tcp_iphc, ltcp->tcp_hdr_len); - if (tcp->tcp_ipversion == IPV6_VERSION) { - if (((ip6i_t *)(tcp->tcp_iphc))->ip6i_nxt == - IPPROTO_RAW) { - tcp->tcp_ip6h = - (ip6_t *)(tcp->tcp_iphc + - sizeof (ip6i_t)); - } else { - tcp->tcp_ip6h = - (ip6_t *)(tcp->tcp_iphc); - } - tcp->tcp_ipha = NULL; - } else { - tcp->tcp_ipha = (ipha_t *)tcp->tcp_iphc; - tcp->tcp_ip6h = NULL; - } - tcp->tcp_tcph = (tcph_t *)(tcp->tcp_iphc + - tcp->tcp_ip_hdr_len); - } else { - /* - * only valid case when ipversion of listener and - * eager differ is when listener is IPv6 and - * eager is IPv4. - * Eager header template has been initialized to the - * maximum v4 header sizes, which includes space for - * TCP and IP options. 
- */ - ASSERT((ltcp->tcp_ipversion == IPV6_VERSION) && - (tcp->tcp_ipversion == IPV4_VERSION)); - ASSERT(tcp->tcp_iphc_len >= - TCP_MAX_COMBINED_HEADER_LENGTH); - tcp->tcp_tcp_hdr_len = ltcp->tcp_tcp_hdr_len; - /* copy IP header fields individually */ - tcp->tcp_ipha->ipha_ttl = - ltcp->tcp_ip6h->ip6_hops; - bcopy(ltcp->tcp_tcph->th_lport, - tcp->tcp_tcph->th_lport, sizeof (ushort_t)); - } - - bcopy(tcph->th_lport, tcp->tcp_tcph->th_fport, sizeof (in_port_t)); - bcopy(tcp->tcp_tcph->th_fport, &tcp->tcp_fport, - sizeof (in_port_t)); - - if (ltcp->tcp_lport == 0) { - tcp->tcp_lport = *(in_port_t *)tcph->th_fport; - bcopy(tcph->th_fport, tcp->tcp_tcph->th_lport, - sizeof (in_port_t)); - } - - if (tcp->tcp_ipversion == IPV4_VERSION) { - ASSERT(ipha != NULL); - tcp->tcp_ipha->ipha_dst = ipha->ipha_src; - tcp->tcp_ipha->ipha_src = ipha->ipha_dst; - - /* Source routing option copyover (reverse it) */ - if (tcps->tcps_rev_src_routes) - tcp_opt_reverse(tcp, ipha); - } else { - ASSERT(ip6h != NULL); - tcp->tcp_ip6h->ip6_dst = ip6h->ip6_src; - tcp->tcp_ip6h->ip6_src = ip6h->ip6_dst; - } - - ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL); - ASSERT(!tcp->tcp_tconnind_started); - /* - * If the SYN contains a credential, it's a loopback packet; attach - * the credential to the TPI message. 
- */ - mblk_copycred(tpi_mp, idmp); - - tcp->tcp_conn.tcp_eager_conn_ind = tpi_mp; - - /* Inherit the listener's SSL protection state */ - - if ((tcp->tcp_kssl_ent = ltcp->tcp_kssl_ent) != NULL) { - kssl_hold_ent(tcp->tcp_kssl_ent); - tcp->tcp_kssl_pending = B_TRUE; - } - - /* Inherit the listener's non-STREAMS flag */ - if (IPCL_IS_NONSTR(lconnp)) { - connp->conn_flags |= IPCL_NONSTR; - } - - return (0); + return (tpi_mp); } - -int -tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha, - tcph_t *tcph, mblk_t *idmp) +/* Handle a SYN on an AF_INET socket */ +mblk_t * +tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, mblk_t *mp, + ip_recv_attr_t *ira) { tcp_t *ltcp = lconnp->conn_tcp; tcp_t *tcp = connp->conn_tcp; sin_t sin; mblk_t *tpi_mp = NULL; - int err; tcp_stack_t *tcps = tcp->tcp_tcps; + ipha_t *ipha; + + ASSERT(ira->ira_flags & IRAF_IS_IPV4); + ipha = (ipha_t *)mp->b_rptr; + + connp->conn_ipversion = IPV4_VERSION; + IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_laddr_v6); + IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_faddr_v6); + connp->conn_saddr_v6 = connp->conn_laddr_v6; sin = sin_null; - sin.sin_addr.s_addr = ipha->ipha_src; - sin.sin_port = *(uint16_t *)tcph->th_lport; + sin.sin_addr.s_addr = connp->conn_faddr_v4; + sin.sin_port = connp->conn_fport; sin.sin_family = AF_INET; - if (ltcp->tcp_recvdstaddr) { + if (lconnp->conn_recv_ancillary.crb_recvdstaddr) { sin_t sind; sind = sin_null; - sind.sin_addr.s_addr = ipha->ipha_dst; - sind.sin_port = *(uint16_t *)tcph->th_fport; + sind.sin_addr.s_addr = connp->conn_laddr_v4; + sind.sin_port = connp->conn_lport; sind.sin_family = AF_INET; tpi_mp = mi_tpi_extconn_ind(NULL, (char *)&sind, sizeof (sin_t), (char *)&tcp, @@ -4779,214 +4138,8 @@ tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha, (t_scalar_t)ltcp->tcp_conn_req_seqnum); } - if (tpi_mp == NULL) { - return (ENOMEM); - } - - connp->conn_flags |= (IPCL_TCP4|IPCL_EAGER); - connp->conn_send = ip_output; - connp->conn_recv 
= tcp_input; - connp->conn_fully_bound = B_FALSE; - - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_bound_source_v6); - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_srcv6); - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_remv6); - connp->conn_fport = *(uint16_t *)tcph->th_lport; - connp->conn_lport = *(uint16_t *)tcph->th_fport; - - /* Inherit information from the "parent" */ - tcp->tcp_ipversion = ltcp->tcp_ipversion; - tcp->tcp_family = ltcp->tcp_family; - tcp->tcp_wq = ltcp->tcp_wq; - tcp->tcp_rq = ltcp->tcp_rq; tcp->tcp_mss = tcps->tcps_mss_def_ipv4; - tcp->tcp_detached = B_TRUE; - SOCK_CONNID_INIT(tcp->tcp_connid); - if ((err = tcp_init_values(tcp)) != 0) { - freemsg(tpi_mp); - return (err); - } - - /* - * Let's make sure that eager tcp template has enough space to - * copy IPv4 listener's tcp template. Since the conn_t structure is - * preserved and tcp_iphc_len is also preserved, an eager conn_t may - * have a tcp_template of total len TCP_MAX_COMBINED_HEADER_LENGTH or - * more (in case of re-allocation of conn_t with tcp-IPv6 template with - * extension headers or with ip6i_t struct). Note that bcopy() below - * copies listener tcp's hdr_len which cannot be greater than TCP_MAX_ - * COMBINED_HEADER_LENGTH as this listener must be a IPv4 listener. 
- */ - ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH); - ASSERT(ltcp->tcp_hdr_len <= TCP_MAX_COMBINED_HEADER_LENGTH); - - tcp->tcp_hdr_len = ltcp->tcp_hdr_len; - tcp->tcp_ip_hdr_len = ltcp->tcp_ip_hdr_len; - tcp->tcp_tcp_hdr_len = ltcp->tcp_tcp_hdr_len; - tcp->tcp_ttl = ltcp->tcp_ttl; - tcp->tcp_tos = ltcp->tcp_tos; - - /* Copy the IP+TCP header template from listener to eager */ - bcopy(ltcp->tcp_iphc, tcp->tcp_iphc, ltcp->tcp_hdr_len); - tcp->tcp_ipha = (ipha_t *)tcp->tcp_iphc; - tcp->tcp_ip6h = NULL; - tcp->tcp_tcph = (tcph_t *)(tcp->tcp_iphc + - tcp->tcp_ip_hdr_len); - - /* Initialize the IP addresses and Ports */ - tcp->tcp_ipha->ipha_dst = ipha->ipha_src; - tcp->tcp_ipha->ipha_src = ipha->ipha_dst; - bcopy(tcph->th_lport, tcp->tcp_tcph->th_fport, sizeof (in_port_t)); - bcopy(tcph->th_fport, tcp->tcp_tcph->th_lport, sizeof (in_port_t)); - - /* Source routing option copyover (reverse it) */ - if (tcps->tcps_rev_src_routes) - tcp_opt_reverse(tcp, ipha); - - ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL); - ASSERT(!tcp->tcp_tconnind_started); - - /* - * If the SYN contains a credential, it's a loopback packet; attach - * the credential to the TPI message. - */ - mblk_copycred(tpi_mp, idmp); - - tcp->tcp_conn.tcp_eager_conn_ind = tpi_mp; - - /* Inherit the listener's SSL protection state */ - if ((tcp->tcp_kssl_ent = ltcp->tcp_kssl_ent) != NULL) { - kssl_hold_ent(tcp->tcp_kssl_ent); - tcp->tcp_kssl_pending = B_TRUE; - } - - /* Inherit the listener's non-STREAMS flag */ - if (IPCL_IS_NONSTR(lconnp)) { - connp->conn_flags |= IPCL_NONSTR; - } - - return (0); -} - -/* - * sets up conn for ipsec. - * if the first mblk is M_CTL it is consumed and mpp is updated. - * in case of error mpp is freed. 
- */ -conn_t * -tcp_get_ipsec_conn(tcp_t *tcp, squeue_t *sqp, mblk_t **mpp) -{ - conn_t *connp = tcp->tcp_connp; - conn_t *econnp; - squeue_t *new_sqp; - mblk_t *first_mp = *mpp; - mblk_t *mp = *mpp; - boolean_t mctl_present = B_FALSE; - uint_t ipvers; - - econnp = tcp_get_conn(sqp, tcp->tcp_tcps); - if (econnp == NULL) { - freemsg(first_mp); - return (NULL); - } - if (DB_TYPE(mp) == M_CTL) { - if (mp->b_cont == NULL || - mp->b_cont->b_datap->db_type != M_DATA) { - freemsg(first_mp); - return (NULL); - } - mp = mp->b_cont; - if ((mp->b_datap->db_struioflag & STRUIO_EAGER) == 0) { - freemsg(first_mp); - return (NULL); - } - - mp->b_datap->db_struioflag &= ~STRUIO_EAGER; - first_mp->b_datap->db_struioflag &= ~STRUIO_POLICY; - mctl_present = B_TRUE; - } else { - ASSERT(mp->b_datap->db_struioflag & STRUIO_POLICY); - mp->b_datap->db_struioflag &= ~STRUIO_POLICY; - } - - new_sqp = (squeue_t *)DB_CKSUMSTART(mp); - DB_CKSUMSTART(mp) = 0; - - ASSERT(OK_32PTR(mp->b_rptr)); - ipvers = IPH_HDR_VERSION(mp->b_rptr); - if (ipvers == IPV4_VERSION) { - uint16_t *up; - uint32_t ports; - ipha_t *ipha; - - ipha = (ipha_t *)mp->b_rptr; - up = (uint16_t *)((uchar_t *)ipha + - IPH_HDR_LENGTH(ipha) + TCP_PORTS_OFFSET); - ports = *(uint32_t *)up; - IPCL_TCP_EAGER_INIT(econnp, IPPROTO_TCP, - ipha->ipha_dst, ipha->ipha_src, ports); - } else { - uint16_t *up; - uint32_t ports; - uint16_t ip_hdr_len; - uint8_t *nexthdrp; - ip6_t *ip6h; - tcph_t *tcph; - - ip6h = (ip6_t *)mp->b_rptr; - if (ip6h->ip6_nxt == IPPROTO_TCP) { - ip_hdr_len = IPV6_HDR_LEN; - } else if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &ip_hdr_len, - &nexthdrp) || *nexthdrp != IPPROTO_TCP) { - CONN_DEC_REF(econnp); - freemsg(first_mp); - return (NULL); - } - tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; - up = (uint16_t *)tcph->th_lport; - ports = *(uint32_t *)up; - IPCL_TCP_EAGER_INIT_V6(econnp, IPPROTO_TCP, - ip6h->ip6_dst, ip6h->ip6_src, ports); - } - - /* - * The caller already ensured that there is a sqp present. 
- */ - econnp->conn_sqp = new_sqp; - econnp->conn_initial_sqp = new_sqp; - - if (connp->conn_policy != NULL) { - ipsec_in_t *ii; - ii = (ipsec_in_t *)(first_mp->b_rptr); - ASSERT(ii->ipsec_in_policy == NULL); - IPPH_REFHOLD(connp->conn_policy); - ii->ipsec_in_policy = connp->conn_policy; - - first_mp->b_datap->db_type = IPSEC_POLICY_SET; - if (!ip_bind_ipsec_policy_set(econnp, first_mp)) { - CONN_DEC_REF(econnp); - freemsg(first_mp); - return (NULL); - } - } - - if (ipsec_conn_cache_policy(econnp, ipvers == IPV4_VERSION) != 0) { - CONN_DEC_REF(econnp); - freemsg(first_mp); - return (NULL); - } - - /* - * If we know we have some policy, pass the "IPSEC" - * options size TCP uses this adjust the MSS. - */ - econnp->conn_tcp->tcp_ipsec_overhead = conn_ipsec_length(econnp); - if (mctl_present) { - freeb(first_mp); - *mpp = mp; - } - - return (econnp); + return (tpi_mp); } /* @@ -5002,10 +4155,8 @@ tcp_get_ipsec_conn(tcp_t *tcp, squeue_t *sqp, mblk_t **mpp) * connection sitting in the freelist. Obviously, this buys us * performance. * - * 2) Defence against DOS attack. Allocating a tcp/conn in tcp_conn_request - * has multiple disadvantages - tying up the squeue during alloc, and the - * fact that IPSec policy initialization has to happen here which - * requires us sending a M_CTL and checking for it i.e. real ugliness. + * 2) Defence against DOS attack. Allocating a tcp/conn in tcp_input_listener + * has multiple disadvantages - tying up the squeue during alloc. 
* But allocating the conn/tcp in IP land is also not the best since * we can't check the 'q' and 'q0' which are protected by squeue and * blindly allocate memory which might have to be freed here if we are @@ -5050,9 +4201,15 @@ tcp_get_conn(void *arg, tcp_stack_t *tcps) ns = tcps->tcps_netstack; netstack_hold(ns); connp->conn_netstack = ns; + connp->conn_ixa->ixa_ipst = ns->netstack_ip; tcp->tcp_tcps = tcps; - TCPS_REFHOLD(tcps); ipcl_globalhash_insert(connp); + + connp->conn_ixa->ixa_notify_cookie = tcp; + ASSERT(connp->conn_ixa->ixa_notify == tcp_notify); + connp->conn_recv = tcp_input_data; + ASSERT(connp->conn_recvicmp == tcp_icmp_input); + ASSERT(connp->conn_verifyicmp == tcp_verifyicmp); return ((void *)connp); } mutex_exit(&tcp_time_wait->tcp_time_wait_lock); @@ -5075,62 +4232,20 @@ tcp_get_conn(void *arg, tcp_stack_t *tcps) mutex_init(&tcp->tcp_rsrv_mp_lock, NULL, MUTEX_DEFAULT, NULL); tcp->tcp_tcps = tcps; - TCPS_REFHOLD(tcps); - return ((void *)connp); -} + connp->conn_recv = tcp_input_data; + connp->conn_recvicmp = tcp_icmp_input; + connp->conn_verifyicmp = tcp_verifyicmp; -/* - * Update the cached label for the given tcp_t. This should be called once per - * connection, and before any packets are sent or tcp_process_options is - * invoked. Returns B_FALSE if the correct label could not be constructed. 
- */ -static boolean_t -tcp_update_label(tcp_t *tcp, const cred_t *cr) -{ - conn_t *connp = tcp->tcp_connp; - - if (tcp->tcp_ipversion == IPV4_VERSION) { - uchar_t optbuf[IP_MAX_OPT_LENGTH]; - int added; - - if (tsol_compute_label(cr, tcp->tcp_remote, optbuf, - tcp->tcp_tcps->tcps_netstack->netstack_ip) != 0) - return (B_FALSE); - - added = tsol_remove_secopt(tcp->tcp_ipha, tcp->tcp_hdr_len); - if (added == -1) - return (B_FALSE); - tcp->tcp_hdr_len += added; - tcp->tcp_tcph = (tcph_t *)((uchar_t *)tcp->tcp_tcph + added); - tcp->tcp_ip_hdr_len += added; - if ((tcp->tcp_label_len = optbuf[IPOPT_OLEN]) != 0) { - tcp->tcp_label_len = (tcp->tcp_label_len + 3) & ~3; - added = tsol_prepend_option(optbuf, tcp->tcp_ipha, - tcp->tcp_hdr_len); - if (added == -1) - return (B_FALSE); - tcp->tcp_hdr_len += added; - tcp->tcp_tcph = (tcph_t *) - ((uchar_t *)tcp->tcp_tcph + added); - tcp->tcp_ip_hdr_len += added; - } - } else { - uchar_t optbuf[TSOL_MAX_IPV6_OPTION]; - - if (tsol_compute_label_v6(cr, &tcp->tcp_remote_v6, optbuf, - tcp->tcp_tcps->tcps_netstack->netstack_ip) != 0) - return (B_FALSE); - if (tsol_update_sticky(&tcp->tcp_sticky_ipp, - &tcp->tcp_label_len, optbuf) != 0) - return (B_FALSE); - if (tcp_build_hdrs(tcp) != 0) - return (B_FALSE); - } - - connp->conn_ulp_labeled = 1; + /* + * Register tcp_notify to listen to capability changes detected by IP. + * This upcall is made in the context of the call to conn_ip_output + * thus it is inside the squeue. + */ + connp->conn_ixa->ixa_notify = tcp_notify; + connp->conn_ixa->ixa_notify_cookie = tcp; - return (B_TRUE); + return ((void *)connp); } /* BEGIN CSTYLED */ @@ -5140,7 +4255,7 @@ tcp_update_label(tcp_t *tcp, const cred_t *cr) * ======================= * * The eager is now established in its own perimeter as soon as SYN is - * received in tcp_conn_request(). When sockfs receives conn_ind, it + * received in tcp_input_listener(). When sockfs receives conn_ind, it * completes the accept processing on the acceptor STREAM. 
The sending * of conn_ind part is common for both sockfs listener and a TLI/XTI * listener but a TLI/XTI listener completes the accept processing @@ -5149,29 +4264,28 @@ tcp_update_label(tcp_t *tcp, const cred_t *cr) * Common control flow for 3 way handshake: * ---------------------------------------- * - * incoming SYN (listener perimeter) -> tcp_rput_data() - * -> tcp_conn_request() + * incoming SYN (listener perimeter) -> tcp_input_listener() * - * incoming SYN-ACK-ACK (eager perim) -> tcp_rput_data() + * incoming SYN-ACK-ACK (eager perim) -> tcp_input_data() * send T_CONN_IND (listener perim) -> tcp_send_conn_ind() * * Sockfs ACCEPT Path: * ------------------- * - * open acceptor stream (tcp_open allocates tcp_wput_accept() + * open acceptor stream (tcp_open allocates tcp_tli_accept() * as STREAM entry point) * - * soaccept() sends T_CONN_RES on the acceptor STREAM to tcp_wput_accept() + * soaccept() sends T_CONN_RES on the acceptor STREAM to tcp_tli_accept() * - * tcp_wput_accept() extracts the eager and makes the q->q_ptr <-> eager + * tcp_tli_accept() extracts the eager and makes the q->q_ptr <-> eager * association (we are not behind eager's squeue but sockfs is protecting us * and no one knows about this stream yet. The STREAMS entry point q->q_info * is changed to point at tcp_wput(). * - * tcp_wput_accept() sends any deferred eagers via tcp_send_pending() to + * tcp_accept_common() sends any deferred eagers via tcp_send_pending() to * listener (done on listener's perimeter). * - * tcp_wput_accept() calls tcp_accept_finish() on eagers perimeter to finish + * tcp_tli_accept() calls tcp_accept_finish() on eagers perimeter to finish * accept. * * TLI/XTI client ACCEPT path: @@ -5179,8 +4293,8 @@ tcp_update_label(tcp_t *tcp, const cred_t *cr) * * soaccept() sends T_CONN_RES on the listener STREAM. * - * tcp_accept() -> tcp_accept_swap() complete the processing and send - * the bind_mp to eager perimeter to finish accept (tcp_rput_other()). 
+ * tcp_tli_accept() -> tcp_accept_swap() complete the processing and send + * a M_SETOPS mblk to eager perimeter to finish accept (tcp_accept_finish()). * * Locks: * ====== @@ -5191,7 +4305,7 @@ tcp_update_label(tcp_t *tcp, const cred_t *cr) * Referencing: * ============ * - * 1) We start out in tcp_conn_request by eager placing a ref on + * 1) We start out in tcp_input_listener by eager placing a ref on * listener and listener adding eager to listeners->tcp_eager_next_q0. * * 2) When a SYN-ACK-ACK arrives, we send the conn_ind to listener. Before @@ -5249,51 +4363,71 @@ tcp_update_label(tcp_t *tcp, const cred_t *cr) /* * THIS FUNCTION IS DIRECTLY CALLED BY IP VIA SQUEUE FOR SYN. - * tcp_rput_data will not see any SYN packets. + * tcp_input_data will not see any packets for listeners since the listener + * has conn_recv set to tcp_input_listener. */ /* ARGSUSED */ void -tcp_conn_request(void *arg, mblk_t *mp, void *arg2) +tcp_input_listener(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) { - tcph_t *tcph; + tcpha_t *tcpha; uint32_t seg_seq; tcp_t *eager; - uint_t ipvers; - ipha_t *ipha; - ip6_t *ip6h; int err; conn_t *econnp = NULL; squeue_t *new_sqp; mblk_t *mp1; uint_t ip_hdr_len; - conn_t *connp = (conn_t *)arg; - tcp_t *tcp = connp->conn_tcp; - cred_t *credp; - tcp_stack_t *tcps = tcp->tcp_tcps; - ip_stack_t *ipst; + conn_t *lconnp = (conn_t *)arg; + tcp_t *listener = lconnp->conn_tcp; + tcp_stack_t *tcps = listener->tcp_tcps; + ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; + uint_t flags; + mblk_t *tpi_mp; + uint_t ifindex = ira->ira_ruifindex; - if (tcp->tcp_state != TCPS_LISTEN) + ip_hdr_len = ira->ira_ip_hdr_length; + tcpha = (tcpha_t *)&mp->b_rptr[ip_hdr_len]; + flags = (unsigned int)tcpha->tha_flags & 0xFF; + + if (!(flags & TH_SYN)) { + if ((flags & TH_RST) || (flags & TH_URG)) { + freemsg(mp); + return; + } + if (flags & TH_ACK) { + /* Note this executes in listener's squeue */ + tcp_xmit_listeners_reset(mp, ira, ipst, lconnp); + return; 
+ } + + freemsg(mp); + return; + } + + if (listener->tcp_state != TCPS_LISTEN) goto error2; - ASSERT((tcp->tcp_connp->conn_flags & IPCL_BOUND) != 0); + ASSERT(IPCL_IS_BOUND(lconnp)); - mutex_enter(&tcp->tcp_eager_lock); - if (tcp->tcp_conn_req_cnt_q >= tcp->tcp_conn_req_max) { - mutex_exit(&tcp->tcp_eager_lock); + mutex_enter(&listener->tcp_eager_lock); + if (listener->tcp_conn_req_cnt_q >= listener->tcp_conn_req_max) { + mutex_exit(&listener->tcp_eager_lock); TCP_STAT(tcps, tcp_listendrop); BUMP_MIB(&tcps->tcps_mib, tcpListenDrop); - if (tcp->tcp_debug) { + if (lconnp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, - "tcp_conn_request: listen backlog (max=%d) " + "tcp_input_listener: listen backlog (max=%d) " "overflow (%d pending) on %s", - tcp->tcp_conn_req_max, tcp->tcp_conn_req_cnt_q, - tcp_display(tcp, NULL, DISP_PORT_ONLY)); + listener->tcp_conn_req_max, + listener->tcp_conn_req_cnt_q, + tcp_display(listener, NULL, DISP_PORT_ONLY)); } goto error2; } - if (tcp->tcp_conn_req_cnt_q0 >= - tcp->tcp_conn_req_max + tcps->tcps_conn_req_max_q0) { + if (listener->tcp_conn_req_cnt_q0 >= + listener->tcp_conn_req_max + tcps->tcps_conn_req_max_q0) { /* * Q0 is full. Drop a pending half-open req from the queue * to make room for the new SYN req. Also mark the time we @@ -5303,83 +4437,127 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) * be to set the "tcp_syn_defense" flag now. 
*/ TCP_STAT(tcps, tcp_listendropq0); - tcp->tcp_last_rcv_lbolt = lbolt64; - if (!tcp_drop_q0(tcp)) { - mutex_exit(&tcp->tcp_eager_lock); + listener->tcp_last_rcv_lbolt = lbolt64; + if (!tcp_drop_q0(listener)) { + mutex_exit(&listener->tcp_eager_lock); BUMP_MIB(&tcps->tcps_mib, tcpListenDropQ0); - if (tcp->tcp_debug) { + if (lconnp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE, - "tcp_conn_request: listen half-open queue " - "(max=%d) full (%d pending) on %s", + "tcp_input_listener: listen half-open " + "queue (max=%d) full (%d pending) on %s", tcps->tcps_conn_req_max_q0, - tcp->tcp_conn_req_cnt_q0, - tcp_display(tcp, NULL, + listener->tcp_conn_req_cnt_q0, + tcp_display(listener, NULL, DISP_PORT_ONLY)); } goto error2; } } - mutex_exit(&tcp->tcp_eager_lock); + mutex_exit(&listener->tcp_eager_lock); /* - * IP adds STRUIO_EAGER and ensures that the received packet is - * M_DATA even if conn_ipv6_recvpktinfo is enabled or for ip6 - * link local address. If IPSec is enabled, db_struioflag has - * STRUIO_POLICY set (mutually exclusive from STRUIO_EAGER); - * otherwise an error case if neither of them is set. + * IP sets ira_sqp to either the senders conn_sqp (for loopback) + * or based on the ring (for packets from GLD). Otherwise it is + * set based on lbolt i.e., a somewhat random number. */ - if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) { - new_sqp = (squeue_t *)DB_CKSUMSTART(mp); - DB_CKSUMSTART(mp) = 0; - mp->b_datap->db_struioflag &= ~STRUIO_EAGER; - econnp = (conn_t *)tcp_get_conn(arg2, tcps); - if (econnp == NULL) - goto error2; - ASSERT(econnp->conn_netstack == connp->conn_netstack); - econnp->conn_sqp = new_sqp; - econnp->conn_initial_sqp = new_sqp; - } else if ((mp->b_datap->db_struioflag & STRUIO_POLICY) != 0) { - /* - * mp is updated in tcp_get_ipsec_conn(). - */ - econnp = tcp_get_ipsec_conn(tcp, arg2, &mp); - if (econnp == NULL) { - /* - * mp freed by tcp_get_ipsec_conn. 
- */ - return; - } - ASSERT(econnp->conn_netstack == connp->conn_netstack); - } else { + ASSERT(ira->ira_sqp != NULL); + new_sqp = ira->ira_sqp; + + econnp = (conn_t *)tcp_get_conn(arg2, tcps); + if (econnp == NULL) goto error2; - } - ASSERT(DB_TYPE(mp) == M_DATA); + ASSERT(econnp->conn_netstack == lconnp->conn_netstack); + econnp->conn_sqp = new_sqp; + econnp->conn_initial_sqp = new_sqp; + econnp->conn_ixa->ixa_sqp = new_sqp; + + econnp->conn_fport = tcpha->tha_lport; + econnp->conn_lport = tcpha->tha_fport; + + err = conn_inherit_parent(lconnp, econnp); + if (err != 0) + goto error3; - ipvers = IPH_HDR_VERSION(mp->b_rptr); - ASSERT(ipvers == IPV6_VERSION || ipvers == IPV4_VERSION); ASSERT(OK_32PTR(mp->b_rptr)); - if (ipvers == IPV4_VERSION) { - ipha = (ipha_t *)mp->b_rptr; - ip_hdr_len = IPH_HDR_LENGTH(ipha); - tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; - } else { - ip6h = (ip6_t *)mp->b_rptr; - ip_hdr_len = ip_hdr_length_v6(mp, ip6h); - tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; - } + ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION || + IPH_HDR_VERSION(mp->b_rptr) == IPV6_VERSION); - if (tcp->tcp_family == AF_INET) { - ASSERT(ipvers == IPV4_VERSION); - err = tcp_conn_create_v4(connp, econnp, ipha, tcph, mp); + if (lconnp->conn_family == AF_INET) { + ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION); + tpi_mp = tcp_conn_create_v4(lconnp, econnp, mp, ira); } else { - err = tcp_conn_create_v6(connp, econnp, mp, tcph, ipvers, mp); + tpi_mp = tcp_conn_create_v6(lconnp, econnp, mp, ira); } - if (err) + if (tpi_mp == NULL) goto error3; eager = econnp->conn_tcp; + eager->tcp_detached = B_TRUE; + SOCK_CONNID_INIT(eager->tcp_connid); + + tcp_init_values(eager); + + ASSERT((econnp->conn_ixa->ixa_flags & + (IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE | + IXAF_VERIFY_PMTU | IXAF_VERIFY_LSO)) == + (IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE | + IXAF_VERIFY_PMTU | IXAF_VERIFY_LSO)); + + if (!tcps->tcps_dev_flow_ctl) + econnp->conn_ixa->ixa_flags |= IXAF_NO_DEV_FLOW_CTL; + + /* 
Prepare for diffing against previous packets */ + eager->tcp_recvifindex = 0; + eager->tcp_recvhops = 0xffffffffU; + + if (!(ira->ira_flags & IRAF_IS_IPV4) && econnp->conn_bound_if == 0) { + if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_faddr_v6) || + IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6)) { + econnp->conn_incoming_ifindex = ifindex; + econnp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET; + econnp->conn_ixa->ixa_scopeid = ifindex; + } + } + + if ((ira->ira_flags & (IRAF_IS_IPV4|IRAF_IPV4_OPTIONS)) == + (IRAF_IS_IPV4|IRAF_IPV4_OPTIONS) && + tcps->tcps_rev_src_routes) { + ipha_t *ipha = (ipha_t *)mp->b_rptr; + ip_pkt_t *ipp = &econnp->conn_xmit_ipp; + + /* Source routing option copyover (reverse it) */ + err = ip_find_hdr_v4(ipha, ipp, B_TRUE); + if (err != 0) { + freemsg(tpi_mp); + goto error3; + } + ip_pkt_source_route_reverse_v4(ipp); + } + + ASSERT(eager->tcp_conn.tcp_eager_conn_ind == NULL); + ASSERT(!eager->tcp_tconnind_started); + /* + * If the SYN came with a credential, it's a loopback packet or a + * labeled packet; attach the credential to the TPI message. 
+ */ + if (ira->ira_cred != NULL) + mblk_setcred(tpi_mp, ira->ira_cred, ira->ira_cpid); + + eager->tcp_conn.tcp_eager_conn_ind = tpi_mp; + + /* Inherit the listener's SSL protection state */ + if ((eager->tcp_kssl_ent = listener->tcp_kssl_ent) != NULL) { + kssl_hold_ent(eager->tcp_kssl_ent); + eager->tcp_kssl_pending = B_TRUE; + } + + /* Inherit the listener's non-STREAMS flag */ + if (IPCL_IS_NONSTR(lconnp)) { + econnp->conn_flags |= IPCL_NONSTR; + } + ASSERT(eager->tcp_ordrel_mp == NULL); if (!IPCL_IS_NONSTR(econnp)) { @@ -5392,127 +4570,103 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) if ((eager->tcp_ordrel_mp = mi_tpi_ordrel_ind()) == NULL) goto error3; } - /* Inherit various TCP parameters from the listener */ - eager->tcp_naglim = tcp->tcp_naglim; - eager->tcp_first_timer_threshold = tcp->tcp_first_timer_threshold; - eager->tcp_second_timer_threshold = tcp->tcp_second_timer_threshold; - - eager->tcp_first_ctimer_threshold = tcp->tcp_first_ctimer_threshold; - eager->tcp_second_ctimer_threshold = tcp->tcp_second_ctimer_threshold; - /* - * tcp_adapt_ire() may change tcp_rwnd according to the ire metrics. - * If it does not, the eager's receive window will be set to the - * listener's receive window later in this function. + * Now that the IP addresses and ports are setup in econnp we + * can do the IPsec policy work. */ - eager->tcp_rwnd = 0; + if (ira->ira_flags & IRAF_IPSEC_SECURE) { + if (lconnp->conn_policy != NULL) { + /* + * Inherit the policy from the listener; use + * actions from ira + */ + if (!ip_ipsec_policy_inherit(econnp, lconnp, ira)) { + CONN_DEC_REF(econnp); + freemsg(mp); + goto error3; + } + } + } - /* - * Inherit listener's tcp_init_cwnd. Need to do this before - * calling tcp_process_options() where tcp_mss_set() is called - * to set the initial cwnd. 
- */ - eager->tcp_init_cwnd = tcp->tcp_init_cwnd; + /* Inherit various TCP parameters from the listener */ + eager->tcp_naglim = listener->tcp_naglim; + eager->tcp_first_timer_threshold = listener->tcp_first_timer_threshold; + eager->tcp_second_timer_threshold = + listener->tcp_second_timer_threshold; + eager->tcp_first_ctimer_threshold = + listener->tcp_first_ctimer_threshold; + eager->tcp_second_ctimer_threshold = + listener->tcp_second_ctimer_threshold; /* - * Zones: tcp_adapt_ire() and tcp_send_data() both need the - * zone id before the accept is completed in tcp_wput_accept(). + * tcp_set_destination() may set tcp_rwnd according to the route + * metrics. If it does not, the eager's receive window will be set + * to the listener's receive window later in this function. */ - econnp->conn_zoneid = connp->conn_zoneid; - econnp->conn_allzones = connp->conn_allzones; - - /* Copy nexthop information from listener to eager */ - if (connp->conn_nexthop_set) { - econnp->conn_nexthop_set = connp->conn_nexthop_set; - econnp->conn_nexthop_v4 = connp->conn_nexthop_v4; - } + eager->tcp_rwnd = 0; /* - * TSOL: tsol_input_proc() needs the eager's cred before the - * eager is accepted + * Inherit listener's tcp_init_cwnd. Need to do this before + * calling tcp_process_options() which set the initial cwnd. */ - econnp->conn_cred = eager->tcp_cred = credp = connp->conn_cred; - crhold(credp); + eager->tcp_init_cwnd = listener->tcp_init_cwnd; - ASSERT(econnp->conn_effective_cred == NULL); if (is_system_labeled()) { - cred_t *cr; - ts_label_t *tsl; - - /* - * If this is an MLP connection or a MAC-Exempt connection - * with an unlabeled node, packets are to be - * exchanged using the security label of the received - * SYN packet instead of the server application's label. 
- */ - if ((cr = msg_getcred(mp, NULL)) != NULL && - (tsl = crgetlabel(cr)) != NULL && - (connp->conn_mlp_type != mlptSingle || - (connp->conn_mac_mode != CONN_MAC_AWARE && - (tsl->tsl_flags & TSLF_UNLABELED)))) { - if ((econnp->conn_effective_cred = - copycred_from_tslabel(econnp->conn_cred, - tsl, KM_NOSLEEP)) != NULL) { - DTRACE_PROBE2( - syn_accept_peerlabel, - conn_t *, econnp, cred_t *, - econnp->conn_effective_cred); - } else { - DTRACE_PROBE3( - tx__ip__log__error__set__eagercred__tcp, - char *, - "SYN mp(1) label on eager connp(2) failed", - mblk_t *, mp, conn_t *, econnp); - goto error3; - } + ip_xmit_attr_t *ixa = econnp->conn_ixa; + + ASSERT(ira->ira_tsl != NULL); + /* Discard any old label */ + if (ixa->ixa_free_flags & IXA_FREE_TSL) { + ASSERT(ixa->ixa_tsl != NULL); + label_rele(ixa->ixa_tsl); + ixa->ixa_free_flags &= ~IXA_FREE_TSL; + ixa->ixa_tsl = NULL; + } + if ((lconnp->conn_mlp_type != mlptSingle || + lconnp->conn_mac_mode != CONN_MAC_DEFAULT) && + ira->ira_tsl != NULL) { + /* + * If this is an MLP connection or a MAC-Exempt + * connection with an unlabeled node, packets are to be + * exchanged using the security label of the received + * SYN packet instead of the server application's label. + * tsol_check_dest called from ip_set_destination + * might later update TSF_UNLABELED by replacing + * ixa_tsl with a new label. + */ + label_hold(ira->ira_tsl); + ip_xmit_attr_replace_tsl(ixa, ira->ira_tsl); + DTRACE_PROBE2(mlp_syn_accept, conn_t *, + econnp, ts_label_t *, ixa->ixa_tsl) } else { + ixa->ixa_tsl = crgetlabel(econnp->conn_cred); DTRACE_PROBE2(syn_accept, conn_t *, - econnp, cred_t *, econnp->conn_cred) + econnp, ts_label_t *, ixa->ixa_tsl) } - /* - * Verify the destination is allowed to receive packets - * at the security label of the SYN-ACK we are generating. - * tsol_check_dest() may create a new effective cred for - * this connection with a modified label or label flags. 
+ * conn_connect() called from tcp_set_destination will verify + * the destination is allowed to receive packets at the + * security label of the SYN-ACK we are generating. As part of + * that, tsol_check_dest() may create a new effective label for + * this connection. + * Finally conn_connect() will call conn_update_label. + * All that remains for TCP to do is to call + * conn_build_hdr_template which is done as part of + * tcp_set_destination. */ - if (IN6_IS_ADDR_V4MAPPED(&econnp->conn_remv6)) { - uint32_t dst; - IN6_V4MAPPED_TO_IPADDR(&econnp->conn_remv6, dst); - err = tsol_check_dest(CONN_CRED(econnp), &dst, - IPV4_VERSION, B_FALSE, &cr); - } else { - err = tsol_check_dest(CONN_CRED(econnp), - &econnp->conn_remv6, IPV6_VERSION, - B_FALSE, &cr); - } - if (err != 0) - goto error3; - if (cr != NULL) { - if (econnp->conn_effective_cred != NULL) - crfree(econnp->conn_effective_cred); - econnp->conn_effective_cred = cr; - } - - /* - * Generate the security label to be used in the text of - * this connection's outgoing packets. - */ - if (!tcp_update_label(eager, CONN_CRED(econnp))) { - DTRACE_PROBE3( - tx__ip__log__error__connrequest__tcp, - char *, "eager connp(1) label on SYN mp(2) failed", - conn_t *, econnp, mblk_t *, mp); - goto error3; - } } + /* + * Since we will clear tcp_listener before we clear tcp_detached + * in the accept code we need tcp_hard_binding aka tcp_accept_inprogress + * so we can tell a TCP_DETACHED_NONEAGER apart. + */ eager->tcp_hard_binding = B_TRUE; tcp_bind_hash_insert(&tcps->tcps_bind_fanout[ - TCP_BIND_HASH(eager->tcp_lport)], eager, 0); + TCP_BIND_HASH(econnp->conn_lport)], eager, 0); - CL_INET_CONNECT(connp, eager, B_FALSE, err); + CL_INET_CONNECT(econnp, B_FALSE, err); if (err != 0) { tcp_bind_hash_remove(eager); goto error3; @@ -5528,32 +4682,27 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) SOCK_CONNID_BUMP(eager->tcp_connid); /* - * There should be no ire in the mp as we are being called after - * receiving the SYN. 
- */ - ASSERT(tcp_ire_mp(&mp) == NULL); - - /* - * Adapt our mss, ttl, ... according to information provided in IRE. + * Adapt our mss, ttl, ... based on the remote address. */ - if (tcp_adapt_ire(eager, NULL) == 0) { + if (tcp_set_destination(eager) != 0) { + BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails); /* Undo the bind_hash_insert */ tcp_bind_hash_remove(eager); goto error3; } /* Process all TCP options. */ - tcp_process_options(eager, tcph); + tcp_process_options(eager, tcpha); /* Is the other end ECN capable? */ if (tcps->tcps_ecn_permitted >= 1 && - (tcph->th_flags[0] & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) { + (tcpha->tha_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) { eager->tcp_ecn_ok = B_TRUE; } /* - * listeners tcp_recv_hiwater should be the default window size or a + * The listener's conn_rcvbuf should be the default window size or a * window size changed via SO_RCVBUF option. First round up the * eager's tcp_rwnd to the nearest MSS. Then find out the window * scale option value if needed. Call tcp_rwnd_set() to finish the @@ -5563,7 +4712,7 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) * we should not inherit receive window size from listener. */ eager->tcp_rwnd = MSS_ROUNDUP( - (eager->tcp_rwnd == 0 ? tcp->tcp_recv_hiwater: + (eager->tcp_rwnd == 0 ? econnp->conn_rcvbuf : eager->tcp_rwnd), eager->tcp_mss); if (eager->tcp_snd_ws_ok) tcp_set_ws_value(eager); @@ -5575,77 +4724,46 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) */ (void) tcp_rwnd_set(eager, eager->tcp_rwnd); - /* - * We eliminate the need for sockfs to send down a T_SVR4_OPTMGMT_REQ - * via soaccept()->soinheritoptions() which essentially applies - * all the listener options to the new STREAM. The options that we - * need to take care of are: - * SO_DEBUG, SO_REUSEADDR, SO_KEEPALIVE, SO_DONTROUTE, SO_BROADCAST, - * SO_USELOOPBACK, SO_OOBINLINE, SO_DGRAM_ERRIND, SO_LINGER, - * SO_SNDBUF, SO_RCVBUF. - * - * SO_RCVBUF: tcp_rwnd_set() above takes care of it. 
- * SO_SNDBUF: Set the tcp_xmit_hiwater for the eager. When - * tcp_maxpsz_set() gets called later from - * tcp_accept_finish(), the option takes effect. - * - */ - /* Set the TCP options */ - eager->tcp_recv_lowater = tcp->tcp_recv_lowater; - eager->tcp_xmit_hiwater = tcp->tcp_xmit_hiwater; - eager->tcp_dgram_errind = tcp->tcp_dgram_errind; - eager->tcp_oobinline = tcp->tcp_oobinline; - eager->tcp_reuseaddr = tcp->tcp_reuseaddr; - eager->tcp_broadcast = tcp->tcp_broadcast; - eager->tcp_useloopback = tcp->tcp_useloopback; - eager->tcp_dontroute = tcp->tcp_dontroute; - eager->tcp_debug = tcp->tcp_debug; - eager->tcp_linger = tcp->tcp_linger; - eager->tcp_lingertime = tcp->tcp_lingertime; - if (tcp->tcp_ka_enabled) - eager->tcp_ka_enabled = 1; - - ASSERT(eager->tcp_recv_hiwater != 0 && - eager->tcp_recv_hiwater == eager->tcp_rwnd); - - /* Set the IP options */ - econnp->conn_broadcast = connp->conn_broadcast; - econnp->conn_loopback = connp->conn_loopback; - econnp->conn_dontroute = connp->conn_dontroute; - econnp->conn_reuseaddr = connp->conn_reuseaddr; + ASSERT(eager->tcp_connp->conn_rcvbuf != 0 && + eager->tcp_connp->conn_rcvbuf == eager->tcp_rwnd); + + ASSERT(econnp->conn_rcvbuf != 0 && + econnp->conn_rcvbuf == eager->tcp_rwnd); /* Put a ref on the listener for the eager. 
*/ - CONN_INC_REF(connp); - mutex_enter(&tcp->tcp_eager_lock); - tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = eager; - eager->tcp_eager_next_q0 = tcp->tcp_eager_next_q0; - tcp->tcp_eager_next_q0 = eager; - eager->tcp_eager_prev_q0 = tcp; + CONN_INC_REF(lconnp); + mutex_enter(&listener->tcp_eager_lock); + listener->tcp_eager_next_q0->tcp_eager_prev_q0 = eager; + eager->tcp_eager_next_q0 = listener->tcp_eager_next_q0; + listener->tcp_eager_next_q0 = eager; + eager->tcp_eager_prev_q0 = listener; /* Set tcp_listener before adding it to tcp_conn_fanout */ - eager->tcp_listener = tcp; - eager->tcp_saved_listener = tcp; + eager->tcp_listener = listener; + eager->tcp_saved_listener = listener; /* * Tag this detached tcp vector for later retrieval * by our listener client in tcp_accept(). */ - eager->tcp_conn_req_seqnum = tcp->tcp_conn_req_seqnum; - tcp->tcp_conn_req_cnt_q0++; - if (++tcp->tcp_conn_req_seqnum == -1) { + eager->tcp_conn_req_seqnum = listener->tcp_conn_req_seqnum; + listener->tcp_conn_req_cnt_q0++; + if (++listener->tcp_conn_req_seqnum == -1) { /* * -1 is "special" and defined in TPI as something * that should never be used in T_CONN_IND */ - ++tcp->tcp_conn_req_seqnum; + ++listener->tcp_conn_req_seqnum; } - mutex_exit(&tcp->tcp_eager_lock); + mutex_exit(&listener->tcp_eager_lock); - if (tcp->tcp_syn_defense) { + if (listener->tcp_syn_defense) { /* Don't drop the SYN that comes from a good IP source */ - ipaddr_t *addr_cache = (ipaddr_t *)(tcp->tcp_ip_addr_cache); - if (addr_cache != NULL && eager->tcp_remote == - addr_cache[IP_ADDR_CACHE_HASH(eager->tcp_remote)]) { + ipaddr_t *addr_cache; + + addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache); + if (addr_cache != NULL && econnp->conn_faddr_v4 == + addr_cache[IP_ADDR_CACHE_HASH(econnp->conn_faddr_v4)]) { eager->tcp_dontdrop = B_TRUE; } } @@ -5655,14 +4773,14 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) * as we do that, we expose the eager to the classifier and * should not touch any field outside 
the eager's perimeter. * So do all the work necessary before inserting the eager - * in its own perimeter. Be optimistic that ipcl_conn_insert() + * in its own perimeter. Be optimistic that conn_connect() * will succeed but undo everything if it fails. */ - seg_seq = ABE32_TO_U32(tcph->th_seq); + seg_seq = ntohl(tcpha->tha_seq); eager->tcp_irs = seg_seq; eager->tcp_rack = seg_seq; eager->tcp_rnxt = seg_seq + 1; - U32_TO_ABE32(eager->tcp_rnxt, eager->tcp_tcph->th_ack); + eager->tcp_tcpha->tha_ack = htonl(eager->tcp_rnxt); BUMP_MIB(&tcps->tcps_mib, tcpPassiveOpens); eager->tcp_state = TCPS_SYN_RCVD; mp1 = tcp_xmit_mp(eager, eager->tcp_xmit_head, eager->tcp_mss, @@ -5677,24 +4795,10 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) } /* - * Note that in theory this should use the current pid - * so that getpeerucred on the client returns the actual listener - * that does accept. But accept() hasn't been called yet. We could use - * the pid of the process that did bind/listen on the server. - * However, with common usage like inetd() the bind/listen can be done - * by a different process than the accept(). - * Hence we do the simple thing of using the open pid here. - * Note that db_credp is set later in tcp_send_data(). - */ - mblk_setcred(mp1, credp, tcp->tcp_cpid); - eager->tcp_cpid = tcp->tcp_cpid; - eager->tcp_open_time = lbolt64; - - /* * We need to start the rto timer. In normal case, we start * the timer after sending the packet on the wire (or at * least believing that packet was sent by waiting for - * CALL_IP_WPUT() to return). Since this is the first packet + * conn_ip_output() to return). Since this is the first packet * being sent on the wire for the eager, our initial tcp_rto * is at least tcp_rexmit_interval_min which is a fairly * large value to allow the algorithm to adjust slowly to large @@ -5716,7 +4820,7 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) * ensure against an eager close race. 
*/ - CONN_INC_REF(eager->tcp_connp); + CONN_INC_REF(econnp); TCP_TIMER_RESTART(eager, eager->tcp_rto); @@ -5724,22 +4828,16 @@ tcp_conn_request(void *arg, mblk_t *mp, void *arg2) * Insert the eager in its own perimeter now. We are ready to deal * with any packets on eager. */ - if (eager->tcp_ipversion == IPV4_VERSION) { - if (ipcl_conn_insert(econnp, IPPROTO_TCP, 0, 0, 0) != 0) { - goto error; - } - } else { - if (ipcl_conn_insert_v6(econnp, IPPROTO_TCP, 0, 0, 0, 0) != 0) { - goto error; - } - } - - /* mark conn as fully-bound */ - econnp->conn_fully_bound = B_TRUE; + if (ipcl_conn_insert(econnp) != 0) + goto error; - /* Send the SYN-ACK */ - tcp_send_data(eager, eager->tcp_wq, mp1); - CONN_DEC_REF(eager->tcp_connp); + /* + * Send the SYN-ACK. Can't use tcp_send_data since we can't update + * pmtu etc; we are not on the eager's squeue + */ + ASSERT(econnp->conn_ixa->ixa_notify_cookie == econnp->conn_tcp); + (void) conn_ip_output(mp1, econnp->conn_ixa); + CONN_DEC_REF(econnp); freemsg(mp); return; @@ -5749,7 +4847,7 @@ error: TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15); mp1 = &eager->tcp_closemp; SQUEUE_ENTER_ONE(econnp->conn_sqp, mp1, tcp_eager_kill, - econnp, SQ_FILL, SQTAG_TCP_CONN_REQ_2); + econnp, NULL, SQ_FILL, SQTAG_TCP_CONN_REQ_2); /* * If a connection already exists, send the mp to that connections so @@ -5757,7 +4855,7 @@ error: */ ipst = tcps->tcps_netstack->netstack_ip; - if ((econnp = ipcl_classify(mp, connp->conn_zoneid, ipst)) != NULL) { + if ((econnp = ipcl_classify(mp, ira, ipst)) != NULL) { if (!IPCL_IS_CONNECTED(econnp)) { /* * Something bad happened. 
ipcl_conn_insert() @@ -5772,8 +4870,8 @@ error: CONN_DEC_REF(econnp); freemsg(mp); } else { - SQUEUE_ENTER_ONE(econnp->conn_sqp, mp, - tcp_input, econnp, SQ_FILL, SQTAG_TCP_CONN_REQ_1); + SQUEUE_ENTER_ONE(econnp->conn_sqp, mp, tcp_input_data, + econnp, ira, SQ_FILL, SQTAG_TCP_CONN_REQ_1); } } else { /* Nobody wants this packet */ @@ -5803,18 +4901,21 @@ error2: * very first time and there is no attempt to rebind them. */ void -tcp_conn_request_unbound(void *arg, mblk_t *mp, void *arg2) +tcp_input_listener_unbound(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *ira) { conn_t *connp = (conn_t *)arg; squeue_t *sqp = (squeue_t *)arg2; squeue_t *new_sqp; uint32_t conn_flags; - if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) { - new_sqp = (squeue_t *)DB_CKSUMSTART(mp); - } else { - goto done; - } + /* + * IP sets ira_sqp to either the senders conn_sqp (for loopback) + * or based on the ring (for packets from GLD). Otherwise it is + * set based on lbolt i.e., a somewhat random number. + */ + ASSERT(ira->ira_sqp != NULL); + new_sqp = ira->ira_sqp; if (connp->conn_fanout == NULL) goto done; @@ -5849,6 +4950,8 @@ tcp_conn_request_unbound(void *arg, mblk_t *mp, void *arg2) if (connp->conn_sqp != new_sqp) { while (connp->conn_sqp != new_sqp) (void) casptr(&connp->conn_sqp, sqp, new_sqp); + /* No special MT issues for outbound ixa_sqp hint */ + connp->conn_ixa->ixa_sqp = new_sqp; } do { @@ -5860,49 +4963,47 @@ tcp_conn_request_unbound(void *arg, mblk_t *mp, void *arg2) mutex_exit(&connp->conn_fanout->connf_lock); mutex_exit(&connp->conn_lock); + + /* + * Assume we have picked a good squeue for the listener. Make + * subsequent SYNs not try to change the squeue. 
+ */ + connp->conn_recv = tcp_input_listener; } done: if (connp->conn_sqp != sqp) { CONN_INC_REF(connp); SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv, connp, - SQ_FILL, SQTAG_TCP_CONN_REQ_UNBOUND); + ira, SQ_FILL, SQTAG_TCP_CONN_REQ_UNBOUND); } else { - tcp_conn_request(connp, mp, sqp); + tcp_input_listener(connp, mp, sqp, ira); } } /* * Successful connect request processing begins when our client passes - * a T_CONN_REQ message into tcp_wput() and ends when tcp_rput() passes - * our T_OK_ACK reply message upstream. The control flow looks like this: - * upstream -> tcp_wput() -> tcp_wput_proto() -> tcp_tpi_connect() -> IP - * upstream <- tcp_rput() <- IP + * a T_CONN_REQ message into tcp_wput(), which performs function calls into + * IP and the passes a T_OK_ACK (or T_ERROR_ACK upstream). + * * After various error checks are completed, tcp_tpi_connect() lays - * the target address and port into the composite header template, - * preallocates the T_OK_ACK reply message, construct a full 12 byte bind - * request followed by an IRE request, and passes the three mblk message - * down to IP looking like this: - * O_T_BIND_REQ for IP --> IRE req --> T_OK_ACK for our client - * Processing continues in tcp_rput() when we receive the following message: - * T_BIND_ACK from IP --> IRE ack --> T_OK_ACK for our client - * After consuming the first two mblks, tcp_rput() calls tcp_timer(), - * to fire off the connection request, and then passes the T_OK_ACK mblk - * upstream that we filled in below. There are, of course, numerous - * error conditions along the way which truncate the processing described - * above. + * the target address and port into the composite header template. + * Then we ask IP for information, including a source address if we didn't + * already have one. Finally we prepare to send the SYN packet, and then + * send up the T_OK_ACK reply message. 
*/ static void tcp_tpi_connect(tcp_t *tcp, mblk_t *mp) { sin_t *sin; - queue_t *q = tcp->tcp_wq; struct T_conn_req *tcr; struct sockaddr *sa; socklen_t len; int error; cred_t *cr; pid_t cpid; + conn_t *connp = tcp->tcp_connp; + queue_t *q = connp->conn_wq; /* * All Solaris components should pass a db_credp @@ -5944,7 +5045,7 @@ tcp_tpi_connect(tcp_t *tcp, mblk_t *mp) * Determine packet type based on type of address passed in * the request should contain an IPv4 or IPv6 address. * Make sure that address family matches the type of - * family of the the address passed down + * family of the address passed down. */ switch (tcr->DEST_length) { default: @@ -6022,7 +5123,7 @@ tcp_tpi_connect(tcp_t *tcp, mblk_t *mp) break; } - error = proto_verify_ip_addr(tcp->tcp_family, sa, len); + error = proto_verify_ip_addr(connp->conn_family, sa, len); if (error != 0) { tcp_err_ack(tcp, mp, TSYSERR, error); return; @@ -6111,7 +5212,7 @@ tcp_tpi_connect(tcp_t *tcp, mblk_t *mp) /* return error ack and blow away saved option results if any */ connect_failed: if (mp != NULL) - putnext(tcp->tcp_rq, mp); + putnext(connp->conn_rq, mp); else { tcp_err_ack_prim(tcp, NULL, T_CONN_REQ, TSYSERR, ENOMEM); @@ -6121,20 +5222,19 @@ connect_failed: /* * Handle connect to IPv4 destinations, including connections for AF_INET6 * sockets connecting to IPv4 mapped IPv6 destinations. + * Returns zero if OK, a positive errno, or a negative TLI error. 
*/ static int tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp, in_port_t dstport, - uint_t srcid, cred_t *cr, pid_t pid) + uint_t srcid) { - tcph_t *tcph; - mblk_t *mp; - ipaddr_t dstaddr = *dstaddrp; - int32_t oldstate; - uint16_t lport; - int error = 0; + ipaddr_t dstaddr = *dstaddrp; + uint16_t lport; + conn_t *connp = tcp->tcp_connp; tcp_stack_t *tcps = tcp->tcp_tcps; + int error; - ASSERT(tcp->tcp_ipversion == IPV4_VERSION); + ASSERT(connp->conn_ipversion == IPV4_VERSION); /* Check for attempt to connect to INADDR_ANY */ if (dstaddr == INADDR_ANY) { @@ -6157,74 +5257,21 @@ tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp, in_port_t dstport, } /* Handle __sin6_src_id if socket not bound to an IP address */ - if (srcid != 0 && tcp->tcp_ipha->ipha_src == INADDR_ANY) { - ip_srcid_find_id(srcid, &tcp->tcp_ip_src_v6, - tcp->tcp_connp->conn_zoneid, tcps->tcps_netstack); - IN6_V4MAPPED_TO_IPADDR(&tcp->tcp_ip_src_v6, - tcp->tcp_ipha->ipha_src); + if (srcid != 0 && connp->conn_laddr_v4 == INADDR_ANY) { + ip_srcid_find_id(srcid, &connp->conn_laddr_v6, + IPCL_ZONEID(connp), tcps->tcps_netstack); + connp->conn_saddr_v6 = connp->conn_laddr_v6; } - /* - * Don't let an endpoint connect to itself. Note that - * the test here does not catch the case where the - * source IP addr was left unspecified by the user. In - * this case, the source addr is set in tcp_adapt_ire() - * using the reply to the T_BIND message that we send - * down to IP here and the check is repeated in tcp_rput_other. - */ - if (dstaddr == tcp->tcp_ipha->ipha_src && - dstport == tcp->tcp_lport) { - error = -TBADADDR; - goto failed; - } + IN6_IPADDR_TO_V4MAPPED(dstaddr, &connp->conn_faddr_v6); + connp->conn_fport = dstport; /* - * Verify the destination is allowed to receive packets - * at the security label of the connection we are initiating. - * tsol_check_dest() may create a new effective cred for this - * connection with a modified label or label flags. 
- */ - if (is_system_labeled()) { - ASSERT(tcp->tcp_connp->conn_effective_cred == NULL); - if ((error = tsol_check_dest(CONN_CRED(tcp->tcp_connp), - &dstaddr, IPV4_VERSION, tcp->tcp_connp->conn_mac_mode, - &tcp->tcp_connp->conn_effective_cred)) != 0) { - if (error != EHOSTUNREACH) - error = -TSYSERR; - goto failed; - } - } - - tcp->tcp_ipha->ipha_dst = dstaddr; - IN6_IPADDR_TO_V4MAPPED(dstaddr, &tcp->tcp_remote_v6); - - /* - * Massage a source route if any putting the first hop - * in iph_dst. Compute a starting value for the checksum which - * takes into account that the original iph_dst should be - * included in the checksum but that ip will include the - * first hop in the source route in the tcp checksum. - */ - tcp->tcp_sum = ip_massage_options(tcp->tcp_ipha, tcps->tcps_netstack); - tcp->tcp_sum = (tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16); - tcp->tcp_sum -= ((tcp->tcp_ipha->ipha_dst >> 16) + - (tcp->tcp_ipha->ipha_dst & 0xffff)); - if ((int)tcp->tcp_sum < 0) - tcp->tcp_sum--; - tcp->tcp_sum = (tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16); - tcp->tcp_sum = ntohs((tcp->tcp_sum & 0xFFFF) + - (tcp->tcp_sum >> 16)); - tcph = tcp->tcp_tcph; - *(uint16_t *)tcph->th_fport = dstport; - tcp->tcp_fport = dstport; - - oldstate = tcp->tcp_state; - /* * At this point the remote destination address and remote port fields * in the tcp-four-tuple have been filled in the tcp structure. Now we - * have to see which state tcp was in so we can take apropriate action. + * have to see which state tcp was in so we can take appropriate action. 
*/ - if (oldstate == TCPS_IDLE) { + if (tcp->tcp_state == TCPS_IDLE) { /* * We support a quick connect capability here, allowing * clients to transition directly from IDLE to SYN_SENT @@ -6233,203 +5280,93 @@ tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp, in_port_t dstport, */ lport = tcp_update_next_port(tcps->tcps_next_port_to_try, tcp, B_TRUE); - lport = tcp_bindi(tcp, lport, &tcp->tcp_ip_src_v6, 0, B_TRUE, + lport = tcp_bindi(tcp, lport, &connp->conn_laddr_v6, 0, B_TRUE, B_FALSE, B_FALSE); - if (lport == 0) { - error = -TNOADDR; - goto failed; - } - } - tcp->tcp_state = TCPS_SYN_SENT; - - mp = allocb(sizeof (ire_t), BPRI_HI); - if (mp == NULL) { - tcp->tcp_state = oldstate; - error = ENOMEM; - goto failed; + if (lport == 0) + return (-TNOADDR); } - mp->b_wptr += sizeof (ire_t); - mp->b_datap->db_type = IRE_DB_REQ_TYPE; - tcp->tcp_hard_binding = 1; - /* - * We need to make sure that the conn_recv is set to a non-null - * value before we insert the conn_t into the classifier table. - * This is to avoid a race with an incoming packet which does - * an ipcl_classify(). + * Lookup the route to determine a source address and the uinfo. + * If there was a source route we have tcp_ipha->ipha_dst as the first + * hop. + * Setup TCP parameters based on the metrics/DCE. 
*/ - tcp->tcp_connp->conn_recv = tcp_input; + error = tcp_set_destination(tcp); + if (error != 0) + return (error); - if (tcp->tcp_family == AF_INET) { - error = ip_proto_bind_connected_v4(tcp->tcp_connp, &mp, - IPPROTO_TCP, &tcp->tcp_ipha->ipha_src, tcp->tcp_lport, - tcp->tcp_remote, tcp->tcp_fport, B_TRUE, B_TRUE, cr); - } else { - in6_addr_t v6src; - if (tcp->tcp_ipversion == IPV4_VERSION) { - IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, &v6src); - } else { - v6src = tcp->tcp_ip6h->ip6_src; - } - error = ip_proto_bind_connected_v6(tcp->tcp_connp, &mp, - IPPROTO_TCP, &v6src, tcp->tcp_lport, &tcp->tcp_remote_v6, - &tcp->tcp_sticky_ipp, tcp->tcp_fport, B_TRUE, B_TRUE, cr); - } - BUMP_MIB(&tcps->tcps_mib, tcpActiveOpens); - tcp->tcp_active_open = 1; + /* + * Don't let an endpoint connect to itself. + */ + if (connp->conn_faddr_v4 == connp->conn_laddr_v4 && + connp->conn_fport == connp->conn_lport) + return (-TBADADDR); + tcp->tcp_state = TCPS_SYN_SENT; - return (tcp_post_ip_bind(tcp, mp, error, cr, pid)); -failed: - /* return error ack and blow away saved option results if any */ - if (tcp->tcp_conn.tcp_opts_conn_req != NULL) - tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); - return (error); + return (ipcl_conn_insert_v4(connp)); } /* * Handle connect to IPv6 destinations. + * Returns zero if OK, a positive errno, or a negative TLI error. */ static int tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp, in_port_t dstport, - uint32_t flowinfo, uint_t srcid, uint32_t scope_id, cred_t *cr, pid_t pid) + uint32_t flowinfo, uint_t srcid, uint32_t scope_id) { - tcph_t *tcph; - mblk_t *mp; - ip6_rthdr_t *rth; - int32_t oldstate; - uint16_t lport; + uint16_t lport; + conn_t *connp = tcp->tcp_connp; tcp_stack_t *tcps = tcp->tcp_tcps; - int error = 0; - conn_t *connp = tcp->tcp_connp; + int error; - ASSERT(tcp->tcp_family == AF_INET6); + ASSERT(connp->conn_family == AF_INET6); /* * If we're here, it means that the destination address is a native - * IPv6 address. 
Return an error if tcp_ipversion is not IPv6. A + * IPv6 address. Return an error if conn_ipversion is not IPv6. A * reason why it might not be IPv6 is if the socket was bound to an * IPv4-mapped IPv6 address. */ - if (tcp->tcp_ipversion != IPV6_VERSION) { + if (connp->conn_ipversion != IPV6_VERSION) return (-TBADADDR); - } /* * Interpret a zero destination to mean loopback. * Update the T_CONN_REQ (sin/sin6) since it is used to * generate the T_CONN_CON. */ - if (IN6_IS_ADDR_UNSPECIFIED(dstaddrp)) { + if (IN6_IS_ADDR_UNSPECIFIED(dstaddrp)) *dstaddrp = ipv6_loopback; - } /* Handle __sin6_src_id if socket not bound to an IP address */ - if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_ip6h->ip6_src)) { - ip_srcid_find_id(srcid, &tcp->tcp_ip6h->ip6_src, - connp->conn_zoneid, tcps->tcps_netstack); - tcp->tcp_ip_src_v6 = tcp->tcp_ip6h->ip6_src; - } - - /* - * Take care of the scope_id now and add ip6i_t - * if ip6i_t is not already allocated through TCP - * sticky options. At this point tcp_ip6h does not - * have dst info, thus use dstaddrp. - */ - if (scope_id != 0 && - IN6_IS_ADDR_LINKSCOPE(dstaddrp)) { - ip6_pkt_t *ipp = &tcp->tcp_sticky_ipp; - ip6i_t *ip6i; - - ipp->ipp_ifindex = scope_id; - ip6i = (ip6i_t *)tcp->tcp_iphc; - - if ((ipp->ipp_fields & IPPF_HAS_IP6I) && - ip6i != NULL && (ip6i->ip6i_nxt == IPPROTO_RAW)) { - /* Already allocated */ - ip6i->ip6i_flags |= IP6I_IFINDEX; - ip6i->ip6i_ifindex = ipp->ipp_ifindex; - ipp->ipp_fields |= IPPF_SCOPE_ID; - } else { - int reterr; - - ipp->ipp_fields |= IPPF_SCOPE_ID; - if (ipp->ipp_fields & IPPF_HAS_IP6I) - ip2dbg(("tcp_connect_v6: SCOPE_ID set\n")); - reterr = tcp_build_hdrs(tcp); - if (reterr != 0) - goto failed; - ip1dbg(("tcp_connect_ipv6: tcp_bld_hdrs returned\n")); - } - } - - /* - * Don't let an endpoint connect to itself. Note that - * the test here does not catch the case where the - * source IP addr was left unspecified by the user. 
In - * this case, the source addr is set in tcp_adapt_ire() - * using the reply to the T_BIND message that we send - * down to IP here and the check is repeated in tcp_rput_other. - */ - if (IN6_ARE_ADDR_EQUAL(dstaddrp, &tcp->tcp_ip6h->ip6_src) && - (dstport == tcp->tcp_lport)) { - error = -TBADADDR; - goto failed; + if (srcid != 0 && IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { + ip_srcid_find_id(srcid, &connp->conn_laddr_v6, + IPCL_ZONEID(connp), tcps->tcps_netstack); + connp->conn_saddr_v6 = connp->conn_laddr_v6; } /* - * Verify the destination is allowed to receive packets - * at the security label of the connection we are initiating. - * check_dest may create a new effective cred for this - * connection with a modified label or label flags. + * Take care of the scope_id now. */ - if (is_system_labeled()) { - ASSERT(tcp->tcp_connp->conn_effective_cred == NULL); - if ((error = tsol_check_dest(CONN_CRED(tcp->tcp_connp), - dstaddrp, IPV6_VERSION, tcp->tcp_connp->conn_mac_mode, - &tcp->tcp_connp->conn_effective_cred)) != 0) { - if (error != EHOSTUNREACH) - error = -TSYSERR; - goto failed; - } - } - - tcp->tcp_ip6h->ip6_dst = *dstaddrp; - tcp->tcp_remote_v6 = *dstaddrp; - tcp->tcp_ip6h->ip6_vcf = - (IPV6_DEFAULT_VERS_AND_FLOW & IPV6_VERS_AND_FLOW_MASK) | - (flowinfo & ~IPV6_VERS_AND_FLOW_MASK); - - /* - * Massage a routing header (if present) putting the first hop - * in ip6_dst. Compute a starting value for the checksum which - * takes into account that the original ip6_dst should be - * included in the checksum but that ip will include the - * first hop in the source route in the tcp checksum. 
- */ - rth = ip_find_rthdr_v6(tcp->tcp_ip6h, (uint8_t *)tcp->tcp_tcph); - if (rth != NULL) { - tcp->tcp_sum = ip_massage_options_v6(tcp->tcp_ip6h, rth, - tcps->tcps_netstack); - tcp->tcp_sum = ntohs((tcp->tcp_sum & 0xFFFF) + - (tcp->tcp_sum >> 16)); + if (scope_id != 0 && IN6_IS_ADDR_LINKSCOPE(dstaddrp)) { + connp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET; + connp->conn_ixa->ixa_scopeid = scope_id; } else { - tcp->tcp_sum = 0; + connp->conn_ixa->ixa_flags &= ~IXAF_SCOPEID_SET; } - tcph = tcp->tcp_tcph; - *(uint16_t *)tcph->th_fport = dstport; - tcp->tcp_fport = dstport; + connp->conn_flowinfo = flowinfo; + connp->conn_faddr_v6 = *dstaddrp; + connp->conn_fport = dstport; - oldstate = tcp->tcp_state; /* * At this point the remote destination address and remote port fields * in the tcp-four-tuple have been filled in the tcp structure. Now we - * have to see which state tcp was in so we can take apropriate action. + * have to see which state tcp was in so we can take appropriate action. */ - if (oldstate == TCPS_IDLE) { + if (tcp->tcp_state == TCPS_IDLE) { /* * We support a quick connect capability here, allowing * clients to transition directly from IDLE to SYN_SENT @@ -6438,128 +5375,55 @@ tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp, in_port_t dstport, */ lport = tcp_update_next_port(tcps->tcps_next_port_to_try, tcp, B_TRUE); - lport = tcp_bindi(tcp, lport, &tcp->tcp_ip_src_v6, 0, B_TRUE, + lport = tcp_bindi(tcp, lport, &connp->conn_laddr_v6, 0, B_TRUE, B_FALSE, B_FALSE); - if (lport == 0) { - error = -TNOADDR; - goto failed; - } + if (lport == 0) + return (-TNOADDR); } - tcp->tcp_state = TCPS_SYN_SENT; - - mp = allocb(sizeof (ire_t), BPRI_HI); - if (mp != NULL) { - in6_addr_t v6src; - - mp->b_wptr += sizeof (ire_t); - mp->b_datap->db_type = IRE_DB_REQ_TYPE; - tcp->tcp_hard_binding = 1; - - /* - * We need to make sure that the conn_recv is set to a non-null - * value before we insert the conn_t into the classifier table. 
- * This is to avoid a race with an incoming packet which does - * an ipcl_classify(). - */ - tcp->tcp_connp->conn_recv = tcp_input; + /* + * Lookup the route to determine a source address and the uinfo. + * If there was a source route we have tcp_ip6h->ip6_dst as the first + * hop. + * Setup TCP parameters based on the metrics/DCE. + */ + error = tcp_set_destination(tcp); + if (error != 0) + return (error); - if (tcp->tcp_ipversion == IPV4_VERSION) { - IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, &v6src); - } else { - v6src = tcp->tcp_ip6h->ip6_src; - } - error = ip_proto_bind_connected_v6(connp, &mp, IPPROTO_TCP, - &v6src, tcp->tcp_lport, &tcp->tcp_remote_v6, - &tcp->tcp_sticky_ipp, tcp->tcp_fport, B_TRUE, B_TRUE, cr); - BUMP_MIB(&tcps->tcps_mib, tcpActiveOpens); - tcp->tcp_active_open = 1; + /* + * Don't let an endpoint connect to itself. + */ + if (IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6, &connp->conn_laddr_v6) && + connp->conn_fport == connp->conn_lport) + return (-TBADADDR); - return (tcp_post_ip_bind(tcp, mp, error, cr, pid)); - } - /* Error case */ - tcp->tcp_state = oldstate; - error = ENOMEM; + tcp->tcp_state = TCPS_SYN_SENT; -failed: - /* return error ack and blow away saved option results if any */ - if (tcp->tcp_conn.tcp_opts_conn_req != NULL) - tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); - return (error); + return (ipcl_conn_insert_v6(connp)); } /* - * We need a stream q for detached closing tcp connections - * to use. Our client hereby indicates that this q is the - * one to use. + * Disconnect + * Note that unlike other functions this returns a positive tli error + * when it fails; it never returns an errno. 
*/ -static void -tcp_def_q_set(tcp_t *tcp, mblk_t *mp) -{ - struct iocblk *iocp = (struct iocblk *)mp->b_rptr; - queue_t *q = tcp->tcp_wq; - tcp_stack_t *tcps = tcp->tcp_tcps; - -#ifdef NS_DEBUG - (void) printf("TCP_IOC_DEFAULT_Q for stack %d\n", - tcps->tcps_netstack->netstack_stackid); -#endif - mp->b_datap->db_type = M_IOCACK; - iocp->ioc_count = 0; - mutex_enter(&tcps->tcps_g_q_lock); - if (tcps->tcps_g_q != NULL) { - mutex_exit(&tcps->tcps_g_q_lock); - iocp->ioc_error = EALREADY; - } else { - int error = 0; - conn_t *connp = tcp->tcp_connp; - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - - tcps->tcps_g_q = tcp->tcp_rq; - mutex_exit(&tcps->tcps_g_q_lock); - iocp->ioc_error = 0; - iocp->ioc_rval = 0; - /* - * We are passing tcp_sticky_ipp as NULL - * as it is not useful for tcp_default queue - * - * Set conn_recv just in case. - */ - tcp->tcp_connp->conn_recv = tcp_conn_request; - - ASSERT(connp->conn_af_isv6); - connp->conn_ulp = IPPROTO_TCP; - - if (ipst->ips_ipcl_proto_fanout_v6[IPPROTO_TCP].connf_head != - NULL || (connp->conn_mac_mode != CONN_MAC_DEFAULT)) { - error = -TBADADDR; - } else { - connp->conn_srcv6 = ipv6_all_zeros; - ipcl_proto_insert_v6(connp, IPPROTO_TCP); - } - - (void) tcp_post_ip_bind(tcp, NULL, error, NULL, 0); - } - qreply(q, mp); -} - static int tcp_disconnect_common(tcp_t *tcp, t_scalar_t seqnum) { tcp_t *ltcp = NULL; - conn_t *connp; + conn_t *lconnp; tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; /* * Right now, upper modules pass down a T_DISCON_REQ to TCP, * when the stream is in BOUND state. Do not send a reset, * since the destination IP address is not valid, and it can * be the initialized value of all zeros (broadcast address). - * - * XXX There won't be any pending bind request to IP. 
*/ - if (tcp->tcp_state <= TCPS_BOUND) { - if (tcp->tcp_debug) { + if (tcp->tcp_state <= TCPS_BOUND || tcp->tcp_hard_binding) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_disconnect: bad state, %d", tcp->tcp_state); } @@ -6595,19 +5459,23 @@ tcp_disconnect_common(tcp_t *tcp, t_scalar_t seqnum) * If it used to be a listener, check to make sure no one else * has taken the port before switching back to LISTEN state. */ - if (tcp->tcp_ipversion == IPV4_VERSION) { - connp = ipcl_lookup_listener_v4(tcp->tcp_lport, - tcp->tcp_ipha->ipha_src, - tcp->tcp_connp->conn_zoneid, ipst); - if (connp != NULL) - ltcp = connp->conn_tcp; + if (connp->conn_ipversion == IPV4_VERSION) { + lconnp = ipcl_lookup_listener_v4(connp->conn_lport, + connp->conn_laddr_v4, IPCL_ZONEID(connp), ipst); + if (lconnp != NULL) + ltcp = lconnp->conn_tcp; } else { - /* Allow tcp_bound_if listeners? */ - connp = ipcl_lookup_listener_v6(tcp->tcp_lport, - &tcp->tcp_ip6h->ip6_src, 0, - tcp->tcp_connp->conn_zoneid, ipst); - if (connp != NULL) - ltcp = connp->conn_tcp; + uint_t ifindex = 0; + + if (connp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET) + ifindex = connp->conn_ixa->ixa_scopeid; + + /* Allow conn_bound_if listeners? 
*/ + lconnp = ipcl_lookup_listener_v6(connp->conn_lport, + &connp->conn_laddr_v6, ifindex, IPCL_ZONEID(connp), + ipst); + if (lconnp != NULL) + ltcp = lconnp->conn_tcp; } if (tcp->tcp_conn_req_max && ltcp == NULL) { tcp->tcp_state = TCPS_LISTEN; @@ -6616,7 +5484,7 @@ tcp_disconnect_common(tcp_t *tcp, t_scalar_t seqnum) tcp->tcp_state = TCPS_BOUND; } if (ltcp != NULL) - CONN_DEC_REF(ltcp->tcp_connp); + CONN_DEC_REF(lconnp); if (old_state == TCPS_SYN_SENT || old_state == TCPS_SYN_RCVD) { BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails); } else if (old_state == TCPS_ESTABLISHED || @@ -6648,7 +5516,7 @@ tcp_disconnect_common(tcp_t *tcp, t_scalar_t seqnum) /* * Our client hereby directs us to reject the connection request - * that tcp_conn_request() marked with 'seqnum'. Rejection consists + * that tcp_input_listener() marked with 'seqnum'. Rejection consists * of sending the appropriate RST, not an ICMP error. */ static void @@ -6656,6 +5524,7 @@ tcp_disconnect(tcp_t *tcp, mblk_t *mp) { t_scalar_t seqnum; int error; + conn_t *connp = tcp->tcp_connp; ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_discon_req)) { @@ -6669,11 +5538,11 @@ tcp_disconnect(tcp_t *tcp, mblk_t *mp) else { if (tcp->tcp_state >= TCPS_ESTABLISHED) { /* Send M_FLUSH according to TPI */ - (void) putnextctl1(tcp->tcp_rq, M_FLUSH, FLUSHRW); + (void) putnextctl1(connp->conn_rq, M_FLUSH, FLUSHRW); } mp = mi_tpi_ok_ack_alloc(mp); - if (mp) - putnext(tcp->tcp_rq, mp); + if (mp != NULL) + putnext(connp->conn_rq, mp); } } @@ -6695,6 +5564,7 @@ tcp_display(tcp_t *tcp, char *sup_buf, char format) in6_addr_t local, remote; char local_addrbuf[INET6_ADDRSTRLEN]; char remote_addrbuf[INET6_ADDRSTRLEN]; + conn_t *connp; if (sup_buf != NULL) buf = sup_buf; @@ -6703,6 +5573,8 @@ tcp_display(tcp_t *tcp, char *sup_buf, char format) if (tcp == NULL) return ("NULL_TCP"); + + connp = tcp->tcp_connp; switch (tcp->tcp_state) { case TCPS_CLOSED: cp = 
"TCP_CLOSED"; @@ -6750,32 +5622,32 @@ tcp_display(tcp_t *tcp, char *sup_buf, char format) } switch (format) { case DISP_ADDR_AND_PORT: - if (tcp->tcp_ipversion == IPV4_VERSION) { + if (connp->conn_ipversion == IPV4_VERSION) { /* * Note that we use the remote address in the tcp_b * structure. This means that it will print out * the real destination address, not the next hop's * address if source routing is used. */ - IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ip_src, &local); - IN6_IPADDR_TO_V4MAPPED(tcp->tcp_remote, &remote); + IN6_IPADDR_TO_V4MAPPED(connp->conn_laddr_v4, &local); + IN6_IPADDR_TO_V4MAPPED(connp->conn_faddr_v4, &remote); } else { - local = tcp->tcp_ip_src_v6; - remote = tcp->tcp_remote_v6; + local = connp->conn_laddr_v6; + remote = connp->conn_faddr_v6; } (void) inet_ntop(AF_INET6, &local, local_addrbuf, sizeof (local_addrbuf)); (void) inet_ntop(AF_INET6, &remote, remote_addrbuf, sizeof (remote_addrbuf)); (void) mi_sprintf(buf, "[%s.%u, %s.%u] %s", - local_addrbuf, ntohs(tcp->tcp_lport), remote_addrbuf, - ntohs(tcp->tcp_fport), cp); + local_addrbuf, ntohs(connp->conn_lport), remote_addrbuf, + ntohs(connp->conn_fport), cp); break; case DISP_PORT_ONLY: default: (void) mi_sprintf(buf, "[%u, %u] %s", - ntohs(tcp->tcp_lport), ntohs(tcp->tcp_fport), cp); + ntohs(connp->conn_lport), ntohs(connp->conn_fport), cp); break; } @@ -6788,26 +5660,24 @@ tcp_display(tcp_t *tcp, char *sup_buf, char format) * eager to disappear either by means of tcp_eager_blowoff() or * tcp_eager_cleanup() being called. tcp_eager_kill() can also be * called (via squeue) if the eager cannot be inserted in the - * fanout table in tcp_conn_request(). + * fanout table in tcp_input_listener(). 
*/ /* ARGSUSED */ void -tcp_eager_kill(void *arg, mblk_t *mp, void *arg2) +tcp_eager_kill(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { conn_t *econnp = (conn_t *)arg; tcp_t *eager = econnp->conn_tcp; tcp_t *listener = eager->tcp_listener; - tcp_stack_t *tcps = eager->tcp_tcps; /* * We could be called because listener is closing. Since - * the eager is using listener's queue's, its not safe. - * Better use the default queue just to send the TH_RST - * out. + * the eager was using listener's queue's, we avoid + * using the listeners queues from now on. */ - ASSERT(tcps->tcps_g_q != NULL); - eager->tcp_rq = tcps->tcps_g_q; - eager->tcp_wq = WR(tcps->tcps_g_q); + ASSERT(eager->tcp_detached); + econnp->conn_rq = NULL; + econnp->conn_wq = NULL; /* * An eager's conn_fanout will be NULL if it's a duplicate @@ -6828,7 +5698,7 @@ tcp_eager_kill(void *arg, mblk_t *mp, void *arg2) * The eager has sent a conn_ind up to the * listener but listener decides to close * instead. We need to drop the extra ref - * placed on eager in tcp_rput_data() before + * placed on eager in tcp_input_data() before * sending the conn_ind to listener. 
*/ CONN_DEC_REF(econnp); @@ -6873,7 +5743,7 @@ tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum) mutex_exit(&listener->tcp_eager_lock); mp = &eager->tcp_closemp; SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, tcp_eager_kill, - eager->tcp_connp, SQ_FILL, SQTAG_TCP_EAGER_BLOWOFF); + eager->tcp_connp, NULL, SQ_FILL, SQTAG_TCP_EAGER_BLOWOFF); return (B_TRUE); } @@ -6901,7 +5771,7 @@ tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only) CONN_INC_REF(eager->tcp_connp); mp = &eager->tcp_closemp; SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, - tcp_eager_kill, eager->tcp_connp, + tcp_eager_kill, eager->tcp_connp, NULL, SQ_FILL, SQTAG_TCP_EAGER_CLEANUP); } eager = eager->tcp_eager_next_q; @@ -6917,7 +5787,7 @@ tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only) CONN_INC_REF(eager->tcp_connp); mp = &eager->tcp_closemp; SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, - tcp_eager_kill, eager->tcp_connp, SQ_FILL, + tcp_eager_kill, eager->tcp_connp, NULL, SQ_FILL, SQTAG_TCP_EAGER_CLEANUP_Q0); } eager = eager->tcp_eager_next_q0; @@ -7008,7 +5878,7 @@ static void tcp_err_ack(tcp_t *tcp, mblk_t *mp, int t_error, int sys_error) { if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL) - putnext(tcp->tcp_rq, mp); + putnext(tcp->tcp_connp->conn_rq, mp); } /* Shorthand to generate and send TPI error acks to our client */ @@ -7024,7 +5894,7 @@ tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive, teackp->ERROR_prim = primitive; teackp->TLI_error = t_error; teackp->UNIX_error = sys_error; - putnext(tcp->tcp_rq, mp); + putnext(tcp->tcp_connp->conn_rq, mp); } } @@ -7194,8 +6064,9 @@ static void tcp_copy_info(struct T_info_ack *tia, tcp_t *tcp) { tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; - if (tcp->tcp_family == AF_INET6) + if (connp->conn_family == AF_INET6) *tia = tcp_g_t_info_ack_v6; else *tia = tcp_g_t_info_ack; @@ -7203,7 +6074,7 @@ tcp_copy_info(struct T_info_ack *tia, tcp_t *tcp) tia->OPT_size = tcp_max_optsize; if (tcp->tcp_mss == 0) 
{ /* Not yet set - tcp_open does not set mss */ - if (tcp->tcp_ipversion == IPV4_VERSION) + if (connp->conn_ipversion == IPV4_VERSION) tia->TIDU_size = tcps->tcps_mss_def_ipv4; else tia->TIDU_size = tcps->tcps_mss_def_ipv6; @@ -7258,7 +6129,7 @@ tcp_capability_req(tcp_t *tcp, mblk_t *mp) tcap = (struct T_capability_ack *)mp->b_rptr; tcp_do_capability_ack(tcp, tcap, cap_bits1); - putnext(tcp->tcp_rq, mp); + putnext(tcp->tcp_connp->conn_rq, mp); } /* @@ -7276,16 +6147,18 @@ tcp_info_req(tcp_t *tcp, mblk_t *mp) return; } tcp_copy_info((struct T_info_ack *)mp->b_rptr, tcp); - putnext(tcp->tcp_rq, mp); + putnext(tcp->tcp_connp->conn_rq, mp); } /* Respond to the TPI addr request */ static void tcp_addr_req(tcp_t *tcp, mblk_t *mp) { - sin_t *sin; + struct sockaddr *sa; mblk_t *ackmp; struct T_addr_ack *taa; + conn_t *connp = tcp->tcp_connp; + uint_t addrlen; /* Make it large enough for worst case */ ackmp = reallocb(mp, sizeof (struct T_addr_ack) + @@ -7295,10 +6168,6 @@ tcp_addr_req(tcp_t *tcp, mblk_t *mp) return; } - if (tcp->tcp_ipversion == IPV6_VERSION) { - tcp_addr_req_ipv6(tcp, ackmp); - return; - } taa = (struct T_addr_ack *)ackmp->b_rptr; bzero(taa, sizeof (struct T_addr_ack)); @@ -7307,110 +6176,38 @@ tcp_addr_req(tcp_t *tcp, mblk_t *mp) taa->PRIM_type = T_ADDR_ACK; ackmp->b_datap->db_type = M_PCPROTO; + if (connp->conn_family == AF_INET) + addrlen = sizeof (sin_t); + else + addrlen = sizeof (sin6_t); + /* * Note: Following code assumes 32 bit alignment of basic * data structures like sin_t and struct T_addr_ack. 
*/ if (tcp->tcp_state >= TCPS_BOUND) { /* - * Fill in local address + * Fill in local address first */ - taa->LOCADDR_length = sizeof (sin_t); taa->LOCADDR_offset = sizeof (*taa); - - sin = (sin_t *)&taa[1]; - - /* Fill zeroes and then intialize non-zero fields */ - *sin = sin_null; - - sin->sin_family = AF_INET; - - sin->sin_addr.s_addr = tcp->tcp_ipha->ipha_src; - sin->sin_port = *(uint16_t *)tcp->tcp_tcph->th_lport; - - ackmp->b_wptr = (uchar_t *)&sin[1]; - - if (tcp->tcp_state >= TCPS_SYN_RCVD) { - /* - * Fill in Remote address - */ - taa->REMADDR_length = sizeof (sin_t); - taa->REMADDR_offset = ROUNDUP32(taa->LOCADDR_offset + - taa->LOCADDR_length); - - sin = (sin_t *)(ackmp->b_rptr + taa->REMADDR_offset); - *sin = sin_null; - sin->sin_family = AF_INET; - sin->sin_addr.s_addr = tcp->tcp_remote; - sin->sin_port = tcp->tcp_fport; - - ackmp->b_wptr = (uchar_t *)&sin[1]; - } + taa->LOCADDR_length = addrlen; + sa = (struct sockaddr *)&taa[1]; + (void) conn_getsockname(connp, sa, &addrlen); + ackmp->b_wptr += addrlen; } - putnext(tcp->tcp_rq, ackmp); -} - -/* Assumes that tcp_addr_req gets enough space and alignment */ -static void -tcp_addr_req_ipv6(tcp_t *tcp, mblk_t *ackmp) -{ - sin6_t *sin6; - struct T_addr_ack *taa; - - ASSERT(tcp->tcp_ipversion == IPV6_VERSION); - ASSERT(OK_32PTR(ackmp->b_rptr)); - ASSERT(ackmp->b_wptr - ackmp->b_rptr >= sizeof (struct T_addr_ack) + - 2 * sizeof (sin6_t)); - - taa = (struct T_addr_ack *)ackmp->b_rptr; - - bzero(taa, sizeof (struct T_addr_ack)); - ackmp->b_wptr = (uchar_t *)&taa[1]; - - taa->PRIM_type = T_ADDR_ACK; - ackmp->b_datap->db_type = M_PCPROTO; - - /* - * Note: Following code assumes 32 bit alignment of basic - * data structures like sin6_t and struct T_addr_ack. 
- */ - if (tcp->tcp_state >= TCPS_BOUND) { + if (tcp->tcp_state >= TCPS_SYN_RCVD) { /* - * Fill in local address + * Fill in Remote address */ - taa->LOCADDR_length = sizeof (sin6_t); - taa->LOCADDR_offset = sizeof (*taa); - - sin6 = (sin6_t *)&taa[1]; - *sin6 = sin6_null; - - sin6->sin6_family = AF_INET6; - sin6->sin6_addr = tcp->tcp_ip6h->ip6_src; - sin6->sin6_port = tcp->tcp_lport; - - ackmp->b_wptr = (uchar_t *)&sin6[1]; - - if (tcp->tcp_state >= TCPS_SYN_RCVD) { - /* - * Fill in Remote address - */ - taa->REMADDR_length = sizeof (sin6_t); - taa->REMADDR_offset = ROUNDUP32(taa->LOCADDR_offset + - taa->LOCADDR_length); - - sin6 = (sin6_t *)(ackmp->b_rptr + taa->REMADDR_offset); - *sin6 = sin6_null; - sin6->sin6_family = AF_INET6; - sin6->sin6_flowinfo = - tcp->tcp_ip6h->ip6_vcf & - ~IPV6_VERS_AND_FLOW_MASK; - sin6->sin6_addr = tcp->tcp_remote_v6; - sin6->sin6_port = tcp->tcp_fport; - - ackmp->b_wptr = (uchar_t *)&sin6[1]; - } + taa->REMADDR_length = addrlen; + /* assumed 32-bit alignment */ + taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length; + sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset); + (void) conn_getpeername(connp, sa, &addrlen); + ackmp->b_wptr += addrlen; } - putnext(tcp->tcp_rq, ackmp); + ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim); + putnext(tcp->tcp_connp->conn_rq, ackmp); } /* @@ -7420,19 +6217,19 @@ tcp_addr_req_ipv6(tcp_t *tcp, mblk_t *ackmp) static void tcp_reinit(tcp_t *tcp) { - mblk_t *mp; - int err; + mblk_t *mp; tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; TCP_STAT(tcps, tcp_reinit_calls); /* tcp_reinit should never be called for detached tcp_t's */ ASSERT(tcp->tcp_listener == NULL); - ASSERT((tcp->tcp_family == AF_INET && - tcp->tcp_ipversion == IPV4_VERSION) || - (tcp->tcp_family == AF_INET6 && - (tcp->tcp_ipversion == IPV4_VERSION || - tcp->tcp_ipversion == IPV6_VERSION))); + ASSERT((connp->conn_family == AF_INET && + connp->conn_ipversion == IPV4_VERSION) || + (connp->conn_family 
== AF_INET6 && + (connp->conn_ipversion == IPV4_VERSION || + connp->conn_ipversion == IPV6_VERSION))); /* Cancel outstanding timers */ tcp_timers_stop(tcp); @@ -7453,7 +6250,7 @@ tcp_reinit(tcp_t *tcp) tcp->tcp_unsent = tcp->tcp_xmit_tail_unsent = 0; mutex_enter(&tcp->tcp_non_sq_lock); if (tcp->tcp_flow_stopped && - TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) { + TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) { tcp_clrqfull(tcp); } mutex_exit(&tcp->tcp_non_sq_lock); @@ -7494,7 +6291,7 @@ tcp_reinit(tcp_t *tcp) */ tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind); - CL_INET_DISCONNECT(tcp->tcp_connp, tcp); + CL_INET_DISCONNECT(connp); /* * The connection can't be on the tcp_time_wait_head list @@ -7522,14 +6319,12 @@ tcp_reinit(tcp_t *tcp) * Reset/preserve other values */ tcp_reinit_values(tcp); - ipcl_hash_remove(tcp->tcp_connp); - conn_delete_ire(tcp->tcp_connp, NULL); + ipcl_hash_remove(connp); + ixa_cleanup(connp->conn_ixa); tcp_ipsec_cleanup(tcp); - if (tcp->tcp_connp->conn_effective_cred != NULL) { - crfree(tcp->tcp_connp->conn_effective_cred); - tcp->tcp_connp->conn_effective_cred = NULL; - } + connp->conn_laddr_v6 = connp->conn_bound_addr_v6; + connp->conn_saddr_v6 = connp->conn_bound_addr_v6; if (tcp->tcp_conn_req_max != 0) { /* @@ -7553,44 +6348,31 @@ tcp_reinit(tcp_t *tcp) tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp; tcp->tcp_eager_next_drop_q0 = tcp; tcp->tcp_eager_prev_drop_q0 = tcp; - tcp->tcp_connp->conn_recv = tcp_conn_request; - if (tcp->tcp_family == AF_INET6) { - ASSERT(tcp->tcp_connp->conn_af_isv6); - (void) ipcl_bind_insert_v6(tcp->tcp_connp, IPPROTO_TCP, - &tcp->tcp_ip6h->ip6_src, tcp->tcp_lport); - } else { - ASSERT(!tcp->tcp_connp->conn_af_isv6); - (void) ipcl_bind_insert(tcp->tcp_connp, IPPROTO_TCP, - tcp->tcp_ipha->ipha_src, tcp->tcp_lport); - } + /* + * Initially set conn_recv to tcp_input_listener_unbound to try + * to pick a good squeue for the listener when the first SYN + * arrives. 
tcp_input_listener_unbound sets it to + * tcp_input_listener on that first SYN. + */ + connp->conn_recv = tcp_input_listener_unbound; + + connp->conn_proto = IPPROTO_TCP; + connp->conn_faddr_v6 = ipv6_all_zeros; + connp->conn_fport = 0; + + (void) ipcl_bind_insert(connp); } else { tcp->tcp_state = TCPS_BOUND; } /* * Initialize to default values - * Can't fail since enough header template space already allocated - * at open(). - */ - err = tcp_init_values(tcp); - ASSERT(err == 0); - /* Restore state in tcp_tcph */ - bcopy(&tcp->tcp_lport, tcp->tcp_tcph->th_lport, TCP_PORT_LEN); - if (tcp->tcp_ipversion == IPV4_VERSION) - tcp->tcp_ipha->ipha_src = tcp->tcp_bound_source; - else - tcp->tcp_ip6h->ip6_src = tcp->tcp_bound_source_v6; - /* - * Copy of the src addr. in tcp_t is needed in tcp_t - * since the lookup funcs can only lookup on tcp_t */ - tcp->tcp_ip_src_v6 = tcp->tcp_bound_source_v6; + tcp_init_values(tcp); ASSERT(tcp->tcp_ptpbhn != NULL); - tcp->tcp_recv_hiwater = tcps->tcps_recv_hiwat; - tcp->tcp_recv_lowater = tcp_rinfo.mi_lowat; - tcp->tcp_rwnd = tcps->tcps_recv_hiwat; - tcp->tcp_mss = tcp->tcp_ipversion != IPV4_VERSION ? + tcp->tcp_rwnd = connp->conn_rcvbuf; + tcp->tcp_mss = connp->conn_ipversion != IPV4_VERSION ? 
tcps->tcps_mss_def_ipv6 : tcps->tcps_mss_def_ipv4; } @@ -7606,6 +6388,7 @@ tcp_reinit_values(tcp) tcp_t *tcp; { tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; #ifndef lint #define DONTCARE(x) @@ -7626,8 +6409,8 @@ tcp_reinit_values(tcp) ASSERT(tcp->tcp_time_wait_prev == NULL); ASSERT(tcp->tcp_time_wait_expire == 0); PRESERVE(tcp->tcp_state); - PRESERVE(tcp->tcp_rq); - PRESERVE(tcp->tcp_wq); + PRESERVE(connp->conn_rq); + PRESERVE(connp->conn_wq); ASSERT(tcp->tcp_xmit_head == NULL); ASSERT(tcp->tcp_xmit_last == NULL); @@ -7638,26 +6421,32 @@ tcp_reinit_values(tcp) tcp->tcp_snxt = 0; /* Displayed in mib */ tcp->tcp_suna = 0; /* Displayed in mib */ tcp->tcp_swnd = 0; - DONTCARE(tcp->tcp_cwnd); /* Init in tcp_mss_set */ + DONTCARE(tcp->tcp_cwnd); /* Init in tcp_process_options */ ASSERT(tcp->tcp_ibsegs == 0); ASSERT(tcp->tcp_obsegs == 0); - if (tcp->tcp_iphc != NULL) { - ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH); - bzero(tcp->tcp_iphc, tcp->tcp_iphc_len); + if (connp->conn_ht_iphc != NULL) { + kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated); + connp->conn_ht_iphc = NULL; + connp->conn_ht_iphc_allocated = 0; + connp->conn_ht_iphc_len = 0; + connp->conn_ht_ulp = NULL; + connp->conn_ht_ulp_len = 0; + tcp->tcp_ipha = NULL; + tcp->tcp_ip6h = NULL; + tcp->tcp_tcpha = NULL; } + /* We clear any IP_OPTIONS and extension headers */ + ip_pkt_free(&connp->conn_xmit_ipp); + DONTCARE(tcp->tcp_naglim); /* Init in tcp_init_values */ - DONTCARE(tcp->tcp_hdr_len); /* Init in tcp_init_values */ DONTCARE(tcp->tcp_ipha); DONTCARE(tcp->tcp_ip6h); - DONTCARE(tcp->tcp_ip_hdr_len); - DONTCARE(tcp->tcp_tcph); - DONTCARE(tcp->tcp_tcp_hdr_len); /* Init in tcp_init_values */ + DONTCARE(tcp->tcp_tcpha); tcp->tcp_valid_bits = 0; - DONTCARE(tcp->tcp_xmit_hiwater); /* Init in tcp_init_values */ DONTCARE(tcp->tcp_timer_backoff); /* Init in tcp_init_values */ DONTCARE(tcp->tcp_last_recv_time); /* Init in tcp_init_values */ tcp->tcp_last_rcv_lbolt = 0; 
@@ -7666,38 +6455,19 @@ tcp_reinit_values(tcp) tcp->tcp_urp_last_valid = 0; tcp->tcp_hard_binding = 0; - tcp->tcp_hard_bound = 0; - PRESERVE(tcp->tcp_cred); - PRESERVE(tcp->tcp_cpid); - PRESERVE(tcp->tcp_open_time); - PRESERVE(tcp->tcp_exclbind); tcp->tcp_fin_acked = 0; tcp->tcp_fin_rcvd = 0; tcp->tcp_fin_sent = 0; tcp->tcp_ordrel_done = 0; - tcp->tcp_debug = 0; - tcp->tcp_dontroute = 0; - tcp->tcp_broadcast = 0; - - tcp->tcp_useloopback = 0; - tcp->tcp_reuseaddr = 0; - tcp->tcp_oobinline = 0; - tcp->tcp_dgram_errind = 0; - tcp->tcp_detached = 0; - tcp->tcp_bind_pending = 0; - tcp->tcp_unbind_pending = 0; tcp->tcp_snd_ws_ok = B_FALSE; tcp->tcp_snd_ts_ok = B_FALSE; - tcp->tcp_linger = 0; - tcp->tcp_ka_enabled = 0; tcp->tcp_zero_win_probe = 0; tcp->tcp_loopback = 0; - tcp->tcp_refuse = 0; tcp->tcp_localnet = 0; tcp->tcp_syn_defense = 0; tcp->tcp_set_timer = 0; @@ -7707,19 +6477,12 @@ tcp_reinit_values(tcp) tcp->tcp_xmit_zc_clean = B_FALSE; tcp->tcp_snd_sack_ok = B_FALSE; - PRESERVE(tcp->tcp_recvdstaddr); tcp->tcp_hwcksum = B_FALSE; - tcp->tcp_ire_ill_check_done = B_FALSE; - DONTCARE(tcp->tcp_maxpsz); /* Init in tcp_init_values */ - - tcp->tcp_mdt = B_FALSE; - tcp->tcp_mdt_hdr_head = 0; - tcp->tcp_mdt_hdr_tail = 0; + DONTCARE(tcp->tcp_maxpsz_multiplier); /* Init in tcp_init_values */ tcp->tcp_conn_def_q0 = 0; tcp->tcp_ip_forward_progress = B_FALSE; - tcp->tcp_anon_priv_bind = 0; tcp->tcp_ecn_ok = B_FALSE; tcp->tcp_cwr = B_FALSE; @@ -7740,7 +6503,7 @@ tcp_reinit_values(tcp) tcp->tcp_ts_recent = 0; tcp->tcp_rnxt = 0; /* Displayed in mib */ DONTCARE(tcp->tcp_rwnd); /* Set in tcp_reinit() */ - tcp->tcp_if_mtu = 0; + tcp->tcp_initial_pmtu = 0; ASSERT(tcp->tcp_reass_head == NULL); ASSERT(tcp->tcp_reass_tail == NULL); @@ -7752,7 +6515,7 @@ tcp_reinit_values(tcp) ASSERT(tcp->tcp_rcv_last_tail == NULL); ASSERT(tcp->tcp_rcv_cnt == 0); - DONTCARE(tcp->tcp_cwnd_ssthresh); /* Init in tcp_adapt_ire */ + DONTCARE(tcp->tcp_cwnd_ssthresh); /* Init in tcp_set_destination */ 
DONTCARE(tcp->tcp_cwnd_max); /* Init in tcp_init_values */ tcp->tcp_csuna = 0; @@ -7773,8 +6536,6 @@ tcp_reinit_values(tcp) ASSERT(tcp->tcp_listener == NULL); - DONTCARE(tcp->tcp_xmit_lowater); /* Init in tcp_init_values */ - DONTCARE(tcp->tcp_irs); /* tcp_valid_bits cleared */ DONTCARE(tcp->tcp_iss); /* tcp_valid_bits cleared */ DONTCARE(tcp->tcp_fss); /* tcp_valid_bits cleared */ @@ -7785,14 +6546,11 @@ tcp_reinit_values(tcp) PRESERVE(tcp->tcp_conn_req_max); PRESERVE(tcp->tcp_conn_req_seqnum); - DONTCARE(tcp->tcp_ip_hdr_len); /* Init in tcp_init_values */ DONTCARE(tcp->tcp_first_timer_threshold); /* Init in tcp_init_values */ DONTCARE(tcp->tcp_second_timer_threshold); /* Init in tcp_init_values */ DONTCARE(tcp->tcp_first_ctimer_threshold); /* Init in tcp_init_values */ DONTCARE(tcp->tcp_second_ctimer_threshold); /* in tcp_init_values */ - tcp->tcp_lingertime = 0; - DONTCARE(tcp->tcp_urp_last); /* tcp_urp_last_valid is cleared */ ASSERT(tcp->tcp_urp_mp == NULL); ASSERT(tcp->tcp_urp_mark_mp == NULL); @@ -7811,16 +6569,16 @@ tcp_reinit_values(tcp) tcp->tcp_client_errno = 0; - DONTCARE(tcp->tcp_sum); /* Init in tcp_init_values */ + DONTCARE(connp->conn_sum); /* Init in tcp_init_values */ - tcp->tcp_remote_v6 = ipv6_all_zeros; /* Displayed in MIB */ + connp->conn_faddr_v6 = ipv6_all_zeros; /* Displayed in MIB */ - PRESERVE(tcp->tcp_bound_source_v6); + PRESERVE(connp->conn_bound_addr_v6); tcp->tcp_last_sent_len = 0; tcp->tcp_dupack_cnt = 0; - tcp->tcp_fport = 0; /* Displayed in MIB */ - PRESERVE(tcp->tcp_lport); + connp->conn_fport = 0; /* Displayed in MIB */ + PRESERVE(connp->conn_lport); PRESERVE(tcp->tcp_acceptor_lockp); @@ -7828,16 +6586,18 @@ tcp_reinit_values(tcp) PRESERVE(tcp->tcp_acceptor_id); DONTCARE(tcp->tcp_ipsec_overhead); - PRESERVE(tcp->tcp_family); - if (tcp->tcp_family == AF_INET6) { + PRESERVE(connp->conn_family); + /* Remove any remnants of mapped address binding */ + if (connp->conn_family == AF_INET6) { + connp->conn_ipversion = IPV6_VERSION; 
tcp->tcp_mss = tcps->tcps_mss_def_ipv6; } else { + connp->conn_ipversion = IPV4_VERSION; tcp->tcp_mss = tcps->tcps_mss_def_ipv4; } - PRESERVE(tcp->tcp_ipversion); /* Init in tcp_init_values */ - tcp->tcp_bound_if = 0; - tcp->tcp_ipv6_recvancillary = 0; + connp->conn_bound_if = 0; + connp->conn_recv_ancillary.crb_all = 0; tcp->tcp_recvifindex = 0; tcp->tcp_recvhops = 0; tcp->tcp_closed = 0; @@ -7854,19 +6614,18 @@ tcp_reinit_values(tcp) tcp->tcp_dstoptslen = 0; } ASSERT(tcp->tcp_dstoptslen == 0); - if (tcp->tcp_rtdstopts != NULL) { - mi_free(tcp->tcp_rtdstopts); - tcp->tcp_rtdstopts = NULL; - tcp->tcp_rtdstoptslen = 0; + if (tcp->tcp_rthdrdstopts != NULL) { + mi_free(tcp->tcp_rthdrdstopts); + tcp->tcp_rthdrdstopts = NULL; + tcp->tcp_rthdrdstoptslen = 0; } - ASSERT(tcp->tcp_rtdstoptslen == 0); + ASSERT(tcp->tcp_rthdrdstoptslen == 0); if (tcp->tcp_rthdr != NULL) { mi_free(tcp->tcp_rthdr); tcp->tcp_rthdr = NULL; tcp->tcp_rthdrlen = 0; } ASSERT(tcp->tcp_rthdrlen == 0); - PRESERVE(tcp->tcp_drop_opt_ack_cnt); /* Reset fusion-related fields */ tcp->tcp_fused = B_FALSE; @@ -7902,35 +6661,17 @@ tcp_reinit_values(tcp) #undef PRESERVE } -/* - * Allocate necessary resources and initialize state vector. - * Guaranteed not to fail so that when an error is returned, - * the caller doesn't need to do any additional cleanup. 
- */ -int -tcp_init(tcp_t *tcp, queue_t *q) -{ - int err; - - tcp->tcp_rq = q; - tcp->tcp_wq = WR(q); - tcp->tcp_state = TCPS_IDLE; - if ((err = tcp_init_values(tcp)) != 0) - tcp_timers_stop(tcp); - return (err); -} - -static int +static void tcp_init_values(tcp_t *tcp) { - int err; tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; - ASSERT((tcp->tcp_family == AF_INET && - tcp->tcp_ipversion == IPV4_VERSION) || - (tcp->tcp_family == AF_INET6 && - (tcp->tcp_ipversion == IPV4_VERSION || - tcp->tcp_ipversion == IPV6_VERSION))); + ASSERT((connp->conn_family == AF_INET && + connp->conn_ipversion == IPV4_VERSION) || + (connp->conn_family == AF_INET6 && + (connp->conn_ipversion == IPV4_VERSION || + connp->conn_ipversion == IPV6_VERSION))); /* * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO @@ -7953,7 +6694,7 @@ tcp_init_values(tcp_t *tcp) tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN; tcp->tcp_snd_burst = TCP_CWND_INFINITE; - tcp->tcp_maxpsz = tcps->tcps_maxpsz_multiplier; + tcp->tcp_maxpsz_multiplier = tcps->tcps_maxpsz_multiplier; tcp->tcp_first_timer_threshold = tcps->tcps_ip_notify_interval; tcp->tcp_first_ctimer_threshold = tcps->tcps_ip_notify_cinterval; @@ -7966,10 +6707,7 @@ tcp_init_values(tcp_t *tcp) tcp->tcp_naglim = tcps->tcps_naglim_def; - /* NOTE: ISS is now set in tcp_adapt_ire(). */ - - tcp->tcp_mdt_hdr_head = 0; - tcp->tcp_mdt_hdr_tail = 0; + /* NOTE: ISS is now set in tcp_set_destination(). 
*/ /* Reset fusion-related fields */ tcp->tcp_fused = B_FALSE; @@ -7977,280 +6715,84 @@ tcp_init_values(tcp_t *tcp) tcp->tcp_fused_sigurg = B_FALSE; tcp->tcp_loopback_peer = NULL; - /* Initialize the header template */ - if (tcp->tcp_family == AF_INET) { - err = tcp_header_init_ipv4(tcp); - } else { - err = tcp_header_init_ipv6(tcp); - } - if (err) - return (err); + /* We rebuild the header template on the next connect/conn_request */ + + connp->conn_mlp_type = mlptSingle; /* * Init the window scale to the max so tcp_rwnd_set() won't pare - * down tcp_rwnd. tcp_adapt_ire() will set the right value later. + * down tcp_rwnd. tcp_set_destination() will set the right value later. */ tcp->tcp_rcv_ws = TCP_MAX_WINSHIFT; - tcp->tcp_xmit_lowater = tcps->tcps_xmit_lowat; - tcp->tcp_xmit_hiwater = tcps->tcps_xmit_hiwat; - tcp->tcp_recv_hiwater = tcps->tcps_recv_hiwat; - tcp->tcp_rwnd = tcps->tcps_recv_hiwat; - tcp->tcp_recv_lowater = tcp_rinfo.mi_lowat; + tcp->tcp_rwnd = connp->conn_rcvbuf; tcp->tcp_cork = B_FALSE; /* - * Init the tcp_debug option. This value determines whether TCP + * Init the tcp_debug option if it wasn't already set. This value + * determines whether TCP * calls strlog() to print out debug messages. Doing this * initialization here means that this value is not inherited thru * tcp_reinit(). */ - tcp->tcp_debug = tcps->tcps_dbg; + if (!connp->conn_debug) + connp->conn_debug = tcps->tcps_dbg; tcp->tcp_ka_interval = tcps->tcps_keepalive_interval; tcp->tcp_ka_abort_thres = tcps->tcps_keepalive_abort_interval; - - return (0); -} - -/* - * Initialize the IPv4 header. Loses any record of any IP options. - */ -static int -tcp_header_init_ipv4(tcp_t *tcp) -{ - tcph_t *tcph; - uint32_t sum; - conn_t *connp; - tcp_stack_t *tcps = tcp->tcp_tcps; - - /* - * This is a simple initialization. If there's - * already a template, it should never be too small, - * so reuse it. Otherwise, allocate space for the new one. 
- */ - if (tcp->tcp_iphc == NULL) { - ASSERT(tcp->tcp_iphc_len == 0); - tcp->tcp_iphc_len = TCP_MAX_COMBINED_HEADER_LENGTH; - tcp->tcp_iphc = kmem_cache_alloc(tcp_iphc_cache, KM_NOSLEEP); - if (tcp->tcp_iphc == NULL) { - tcp->tcp_iphc_len = 0; - return (ENOMEM); - } - } - - /* options are gone; may need a new label */ - connp = tcp->tcp_connp; - connp->conn_mlp_type = mlptSingle; - connp->conn_ulp_labeled = !is_system_labeled(); - ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH); - - /* - * tcp_do_get{sock,peer}name constructs the sockaddr from the - * ip header, and decides which header to use based on ip version. - * That operation happens outside the squeue, so we hold the lock - * here to ensure that the ip version and header remain consistent. - */ - mutex_enter(&connp->conn_lock); - tcp->tcp_ipversion = IPV4_VERSION; - tcp->tcp_ipha = (ipha_t *)tcp->tcp_iphc; - tcp->tcp_ip6h = NULL; - mutex_exit(&connp->conn_lock); - - tcp->tcp_hdr_len = sizeof (ipha_t) + sizeof (tcph_t); - tcp->tcp_tcp_hdr_len = sizeof (tcph_t); - tcp->tcp_ip_hdr_len = sizeof (ipha_t); - tcp->tcp_ipha->ipha_length = htons(sizeof (ipha_t) + sizeof (tcph_t)); - tcp->tcp_ipha->ipha_version_and_hdr_length - = (IP_VERSION << 4) | IP_SIMPLE_HDR_LENGTH_IN_WORDS; - tcp->tcp_ipha->ipha_ident = 0; - - tcp->tcp_ttl = (uchar_t)tcps->tcps_ipv4_ttl; - tcp->tcp_tos = 0; - tcp->tcp_ipha->ipha_fragment_offset_and_flags = 0; - tcp->tcp_ipha->ipha_ttl = (uchar_t)tcps->tcps_ipv4_ttl; - tcp->tcp_ipha->ipha_protocol = IPPROTO_TCP; - - tcph = (tcph_t *)(tcp->tcp_iphc + sizeof (ipha_t)); - tcp->tcp_tcph = tcph; - tcph->th_offset_and_rsrvd[0] = (5 << 4); - /* - * IP wants our header length in the checksum field to - * allow it to perform a single pseudo-header+checksum - * calculation on behalf of TCP. - * Include the adjustment for a source route once IP_OPTIONS is set. 
- */ - sum = sizeof (tcph_t) + tcp->tcp_sum; - sum = (sum >> 16) + (sum & 0xFFFF); - U16_TO_ABE16(sum, tcph->th_sum); - return (0); -} - -/* - * Initialize the IPv6 header. Loses any record of any IPv6 extension headers. - */ -static int -tcp_header_init_ipv6(tcp_t *tcp) -{ - tcph_t *tcph; - uint32_t sum; - conn_t *connp; - tcp_stack_t *tcps = tcp->tcp_tcps; - - /* - * This is a simple initialization. If there's - * already a template, it should never be too small, - * so reuse it. Otherwise, allocate space for the new one. - * Ensure that there is enough space to "downgrade" the tcp_t - * to an IPv4 tcp_t. This requires having space for a full load - * of IPv4 options, as well as a full load of TCP options - * (TCP_MAX_COMBINED_HEADER_LENGTH, 120 bytes); this is more space - * than a v6 header and a TCP header with a full load of TCP options - * (IPV6_HDR_LEN is 40 bytes; TCP_MAX_HDR_LENGTH is 60 bytes). - * We want to avoid reallocation in the "downgraded" case when - * processing outbound IPv4 options. - */ - if (tcp->tcp_iphc == NULL) { - ASSERT(tcp->tcp_iphc_len == 0); - tcp->tcp_iphc_len = TCP_MAX_COMBINED_HEADER_LENGTH; - tcp->tcp_iphc = kmem_cache_alloc(tcp_iphc_cache, KM_NOSLEEP); - if (tcp->tcp_iphc == NULL) { - tcp->tcp_iphc_len = 0; - return (ENOMEM); - } - } - - /* options are gone; may need a new label */ - connp = tcp->tcp_connp; - connp->conn_mlp_type = mlptSingle; - connp->conn_ulp_labeled = !is_system_labeled(); - - ASSERT(tcp->tcp_iphc_len >= TCP_MAX_COMBINED_HEADER_LENGTH); - tcp->tcp_hdr_len = IPV6_HDR_LEN + sizeof (tcph_t); - tcp->tcp_tcp_hdr_len = sizeof (tcph_t); - tcp->tcp_ip_hdr_len = IPV6_HDR_LEN; - - /* - * tcp_do_get{sock,peer}name constructs the sockaddr from the - * ip header, and decides which header to use based on ip version. - * That operation happens outside the squeue, so we hold the lock - * here to ensure that the ip version and header remain consistent. 
- */ - mutex_enter(&connp->conn_lock); - tcp->tcp_ipversion = IPV6_VERSION; - tcp->tcp_ip6h = (ip6_t *)tcp->tcp_iphc; - tcp->tcp_ipha = NULL; - mutex_exit(&connp->conn_lock); - - /* Initialize the header template */ - - tcp->tcp_ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW; - tcp->tcp_ip6h->ip6_plen = ntohs(sizeof (tcph_t)); - tcp->tcp_ip6h->ip6_nxt = IPPROTO_TCP; - tcp->tcp_ip6h->ip6_hops = (uint8_t)tcps->tcps_ipv6_hoplimit; - - tcph = (tcph_t *)(tcp->tcp_iphc + IPV6_HDR_LEN); - tcp->tcp_tcph = tcph; - tcph->th_offset_and_rsrvd[0] = (5 << 4); - /* - * IP wants our header length in the checksum field to - * allow it to perform a single psuedo-header+checksum - * calculation on behalf of TCP. - * Include the adjustment for a source route when IPV6_RTHDR is set. - */ - sum = sizeof (tcph_t) + tcp->tcp_sum; - sum = (sum >> 16) + (sum & 0xFFFF); - U16_TO_ABE16(sum, tcph->th_sum); - return (0); } /* At minimum we need 8 bytes in the TCP header for the lookup */ #define ICMP_MIN_TCP_HDR 8 /* - * tcp_icmp_error is called by tcp_rput_other to process ICMP error messages + * tcp_icmp_input is called as conn_recvicmp to process ICMP error messages * passed up by IP. The message is always received on the correct tcp_t. * Assumes that IP has pulled up everything up to and including the ICMP header. 
*/ -void -tcp_icmp_error(tcp_t *tcp, mblk_t *mp) +/* ARGSUSED2 */ +static void +tcp_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) { - icmph_t *icmph; - ipha_t *ipha; - int iph_hdr_length; - tcph_t *tcph; - boolean_t ipsec_mctl = B_FALSE; - boolean_t secure; - mblk_t *first_mp = mp; - int32_t new_mss; - uint32_t ratio; - size_t mp_size = MBLKL(mp); - uint32_t seg_seq; - tcp_stack_t *tcps = tcp->tcp_tcps; - ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; - - /* Assume IP provides aligned packets - otherwise toss */ - if (!OK_32PTR(mp->b_rptr)) { - freemsg(mp); - return; - } - - /* - * Since ICMP errors are normal data marked with M_CTL when sent - * to TCP or UDP, we have to look for a IPSEC_IN value to identify - * packets starting with an ipsec_info_t, see ipsec_info.h. - */ - if ((mp_size == sizeof (ipsec_info_t)) && - (((ipsec_info_t *)mp->b_rptr)->ipsec_info_type == IPSEC_IN)) { - ASSERT(mp->b_cont != NULL); - mp = mp->b_cont; - /* IP should have done this */ - ASSERT(OK_32PTR(mp->b_rptr)); - mp_size = MBLKL(mp); - ipsec_mctl = B_TRUE; - } + conn_t *connp = (conn_t *)arg1; + icmph_t *icmph; + ipha_t *ipha; + int iph_hdr_length; + tcpha_t *tcpha; + uint32_t seg_seq; + tcp_t *tcp = connp->conn_tcp; - /* - * Verify that we have a complete outer IP header. If not, drop it. - */ - if (mp_size < sizeof (ipha_t)) { -noticmpv4: - freemsg(first_mp); - return; - } + /* Assume IP provides aligned packets */ + ASSERT(OK_32PTR(mp->b_rptr)); + ASSERT((MBLKL(mp) >= sizeof (ipha_t))); - ipha = (ipha_t *)mp->b_rptr; /* * Verify IP version. Anything other than IPv4 or IPv6 packet is sent * upstream. ICMPv6 is handled in tcp_icmp_error_ipv6. 
*/ - switch (IPH_HDR_VERSION(ipha)) { - case IPV6_VERSION: - tcp_icmp_error_ipv6(tcp, first_mp, ipsec_mctl); + if (!(ira->ira_flags & IRAF_IS_IPV4)) { + tcp_icmp_error_ipv6(tcp, mp, ira); return; - case IPV4_VERSION: - break; - default: - goto noticmpv4; } /* Skip past the outer IP and ICMP headers */ - iph_hdr_length = IPH_HDR_LENGTH(ipha); + iph_hdr_length = ira->ira_ip_hdr_length; icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; /* - * If we don't have the correct outer IP header length or if the ULP - * is not IPPROTO_ICMP or if we don't have a complete inner IP header - * send it upstream. + * If we don't have the correct outer IP header length + * or if we don't have a complete inner IP header + * drop it. */ if (iph_hdr_length < sizeof (ipha_t) || - ipha->ipha_protocol != IPPROTO_ICMP || (ipha_t *)&icmph[1] + 1 > (ipha_t *)mp->b_wptr) { - goto noticmpv4; +noticmpv4: + freemsg(mp); + return; } ipha = (ipha_t *)&icmph[1]; /* Skip past the inner IP and find the ULP header */ iph_hdr_length = IPH_HDR_LENGTH(ipha); - tcph = (tcph_t *)((char *)ipha + iph_hdr_length); + tcpha = (tcpha_t *)((char *)ipha + iph_hdr_length); /* * If we don't have the correct inner IP header length or if the ULP * is not IPPROTO_TCP or if we don't have at least ICMP_MIN_TCP_HDR @@ -8258,166 +6800,20 @@ noticmpv4: */ if (iph_hdr_length < sizeof (ipha_t) || ipha->ipha_protocol != IPPROTO_TCP || - (uchar_t *)tcph + ICMP_MIN_TCP_HDR > mp->b_wptr) { - goto noticmpv4; - } - - if (TCP_IS_DETACHED_NONEAGER(tcp)) { - if (ipsec_mctl) { - secure = ipsec_in_is_secure(first_mp); - } else { - secure = B_FALSE; - } - if (secure) { - /* - * If we are willing to accept this in clear - * we don't have to verify policy. - */ - if (!ipsec_inbound_accept_clear(mp, ipha, NULL)) { - if (!tcp_check_policy(tcp, first_mp, - ipha, NULL, secure, ipsec_mctl)) { - /* - * tcp_check_policy called - * ip_drop_packet() on failure. 
- */ - return; - } - } - } - } else if (ipsec_mctl) { - /* - * This is a hard_bound connection. IP has already - * verified policy. We don't have to do it again. - */ - freeb(first_mp); - first_mp = mp; - ipsec_mctl = B_FALSE; - } - - seg_seq = ABE32_TO_U32(tcph->th_seq); - /* - * TCP SHOULD check that the TCP sequence number contained in - * payload of the ICMP error message is within the range - * SND.UNA <= SEG.SEQ < SND.NXT. - */ - if (SEQ_LT(seg_seq, tcp->tcp_suna) || SEQ_GEQ(seg_seq, tcp->tcp_snxt)) { - /* - * The ICMP message is bogus, just drop it. But if this is - * an ICMP too big message, IP has already changed - * the ire_max_frag to the bogus value. We need to change - * it back. - */ - if (icmph->icmph_type == ICMP_DEST_UNREACHABLE && - icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED) { - conn_t *connp = tcp->tcp_connp; - ire_t *ire; - int flag; - - if (tcp->tcp_ipversion == IPV4_VERSION) { - flag = tcp->tcp_ipha-> - ipha_fragment_offset_and_flags; - } else { - flag = 0; - } - mutex_enter(&connp->conn_lock); - if ((ire = connp->conn_ire_cache) != NULL) { - mutex_enter(&ire->ire_lock); - mutex_exit(&connp->conn_lock); - ire->ire_max_frag = tcp->tcp_if_mtu; - ire->ire_frag_flag |= flag; - mutex_exit(&ire->ire_lock); - } else { - mutex_exit(&connp->conn_lock); - } - } + (uchar_t *)tcpha + ICMP_MIN_TCP_HDR > mp->b_wptr) { goto noticmpv4; } + seg_seq = ntohl(tcpha->tha_seq); switch (icmph->icmph_type) { case ICMP_DEST_UNREACHABLE: switch (icmph->icmph_code) { case ICMP_FRAGMENTATION_NEEDED: /* - * Reduce the MSS based on the new MTU. This will - * eliminate any fragmentation locally. - * N.B. There may well be some funny side-effects on - * the local send policy and the remote receive policy. - * Pending further research, we provide - * tcp_ignore_path_mtu just in case this proves - * disastrous somewhere. - * - * After updating the MSS, retransmit part of the - * dropped segment using the new mss by calling - * tcp_wput_data(). 
Need to adjust all those - * params to make sure tcp_wput_data() work properly. - */ - if (tcps->tcps_ignore_path_mtu || - tcp->tcp_ipha->ipha_fragment_offset_and_flags == 0) - break; - - /* - * Decrease the MSS by time stamp options - * IP options and IPSEC options. tcp_hdr_len - * includes time stamp option and IP option - * length. Note that new_mss may be negative - * if tcp_ipsec_overhead is large and the - * icmph_du_mtu is the minimum value, which is 68. - */ - new_mss = ntohs(icmph->icmph_du_mtu) - - tcp->tcp_hdr_len - tcp->tcp_ipsec_overhead; - - DTRACE_PROBE2(tcp__pmtu__change, tcp_t *, tcp, int, - new_mss); - - /* - * Only update the MSS if the new one is - * smaller than the previous one. This is - * to avoid problems when getting multiple - * ICMP errors for the same MTU. - */ - if (new_mss >= tcp->tcp_mss) - break; - - /* - * Note that we are using the template header's DF - * bit in the fast path sending. So we need to compare - * the new mss with both tcps_mss_min and ip_pmtu_min. - * And stop doing IPv4 PMTUd if new_mss is less than - * MAX(tcps_mss_min, ip_pmtu_min). - */ - if (new_mss < tcps->tcps_mss_min || - new_mss < ipst->ips_ip_pmtu_min) { - tcp->tcp_ipha->ipha_fragment_offset_and_flags = - 0; - } - - ratio = tcp->tcp_cwnd / tcp->tcp_mss; - ASSERT(ratio >= 1); - tcp_mss_set(tcp, new_mss, B_TRUE); - - /* - * Make sure we have something to - * send. + * Update Path MTU, then try to send something out. */ - if (SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) && - (tcp->tcp_xmit_head != NULL)) { - /* - * Shrink tcp_cwnd in - * proportion to the old MSS/new MSS. 
- */ - tcp->tcp_cwnd = ratio * tcp->tcp_mss; - if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && - (tcp->tcp_unsent == 0)) { - tcp->tcp_rexmit_max = tcp->tcp_fss; - } else { - tcp->tcp_rexmit_max = tcp->tcp_snxt; - } - tcp->tcp_rexmit_nxt = tcp->tcp_suna; - tcp->tcp_rexmit = B_TRUE; - tcp->tcp_dupack_cnt = 0; - tcp->tcp_snd_burst = TCP_CWND_SS; - tcp_ss_rexmit(tcp); - } + tcp_update_pmtu(tcp, B_TRUE); + tcp_rexmit_after_error(tcp); break; case ICMP_PORT_UNREACHABLE: case ICMP_PROTOCOL_UNREACHABLE: @@ -8451,7 +6847,6 @@ noticmpv4: * Ditch the half-open connection if we * suspect a SYN attack is under way. */ - tcp_ip_ire_mark_advice(tcp); (void) tcp_clean_death(tcp, tcp->tcp_client_errno, 7); } @@ -8483,67 +6878,191 @@ noticmpv4: break; } } - freemsg(first_mp); + freemsg(mp); } /* - * tcp_icmp_error_ipv6 is called by tcp_rput_other to process ICMPv6 - * error messages passed up by IP. - * Assumes that IP has pulled up all the extension headers as well - * as the ICMPv6 header. + * CALLED OUTSIDE OF SQUEUE! It can not follow any pointers that tcp might + * change. But it can refer to fields like tcp_suna and tcp_snxt. + * + * Function tcp_verifyicmp is called as conn_verifyicmp to verify the ICMP + * error messages received by IP. The message is always received on the correct + * tcp_t. + */ +/* ARGSUSED */ +static boolean_t +tcp_verifyicmp(conn_t *connp, void *arg2, icmph_t *icmph, icmp6_t *icmp6, + ip_recv_attr_t *ira) +{ + tcpha_t *tcpha = (tcpha_t *)arg2; + uint32_t seq = ntohl(tcpha->tha_seq); + tcp_t *tcp = connp->conn_tcp; + + /* + * TCP sequence number contained in payload of the ICMP error message + * should be within the range SND.UNA <= SEG.SEQ < SND.NXT. Otherwise, + * the message is either a stale ICMP error, or an attack from the + * network. Fail the verification. 
+ */ + if (SEQ_LT(seq, tcp->tcp_suna) || SEQ_GEQ(seq, tcp->tcp_snxt)) + return (B_FALSE); + + /* For "too big" we also check the ignore flag */ + if (ira->ira_flags & IRAF_IS_IPV4) { + ASSERT(icmph != NULL); + if (icmph->icmph_type == ICMP_DEST_UNREACHABLE && + icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED && + tcp->tcp_tcps->tcps_ignore_path_mtu) + return (B_FALSE); + } else { + ASSERT(icmp6 != NULL); + if (icmp6->icmp6_type == ICMP6_PACKET_TOO_BIG && + tcp->tcp_tcps->tcps_ignore_path_mtu) + return (B_FALSE); + } + return (B_TRUE); +} + +/* + * Update the TCP connection according to change of PMTU. + * + * Path MTU might have changed by either increase or decrease, so need to + * adjust the MSS based on the value of ixa_pmtu. No need to handle tiny + * or negative MSS, since tcp_mss_set() will do it. */ static void -tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, boolean_t ipsec_mctl) +tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only) { - icmp6_t *icmp6; - ip6_t *ip6h; - uint16_t iph_hdr_length; - tcpha_t *tcpha; - uint8_t *nexthdrp; - uint32_t new_mss; - uint32_t ratio; - boolean_t secure; - mblk_t *first_mp = mp; - size_t mp_size; - uint32_t seg_seq; - tcp_stack_t *tcps = tcp->tcp_tcps; + uint32_t pmtu; + int32_t mss; + conn_t *connp = tcp->tcp_connp; + ip_xmit_attr_t *ixa = connp->conn_ixa; + iaflags_t ixaflags; + + if (tcp->tcp_tcps->tcps_ignore_path_mtu) + return; + + if (tcp->tcp_state < TCPS_ESTABLISHED) + return; /* - * The caller has determined if this is an IPSEC_IN packet and - * set ipsec_mctl appropriately (see tcp_icmp_error). + * Always call ip_get_pmtu() to make sure that IP has updated + * ixa_flags properly. */ - if (ipsec_mctl) - mp = mp->b_cont; + pmtu = ip_get_pmtu(ixa); + ixaflags = ixa->ixa_flags; - mp_size = MBLKL(mp); + /* + * Calculate the MSS by decreasing the PMTU by conn_ht_iphc_len and + * IPsec overhead if applied. Make sure to use the most recent + * IPsec information. 
+ */ + mss = pmtu - connp->conn_ht_iphc_len - conn_ipsec_length(connp); /* - * Verify that we have a complete IP header. If not, send it upstream. + * Nothing to change, so just return. */ - if (mp_size < sizeof (ip6_t)) { -noticmpv6: - freemsg(first_mp); + if (mss == tcp->tcp_mss) return; - } /* - * Verify this is an ICMPV6 packet, else send it upstream. + * Currently, for ICMP errors, only PMTU decrease is handled. */ - ip6h = (ip6_t *)mp->b_rptr; - if (ip6h->ip6_nxt == IPPROTO_ICMPV6) { - iph_hdr_length = IPV6_HDR_LEN; - } else if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, - &nexthdrp) || - *nexthdrp != IPPROTO_ICMPV6) { - goto noticmpv6; + if (mss > tcp->tcp_mss && decrease_only) + return; + + DTRACE_PROBE2(tcp_update_pmtu, int32_t, tcp->tcp_mss, uint32_t, mss); + + /* + * Update ixa_fragsize and ixa_pmtu. + */ + ixa->ixa_fragsize = ixa->ixa_pmtu = pmtu; + + /* + * Adjust MSS and all relevant variables. + */ + tcp_mss_set(tcp, mss); + + /* + * If the PMTU is below the min size maintained by IP, then ip_get_pmtu + * has set IXAF_PMTU_TOO_SMALL and cleared IXAF_PMTU_IPV4_DF. Since TCP + * has a (potentially different) min size we do the same. Make sure to + * clear IXAF_DONTFRAG, which is used by IP to decide whether to + * fragment the packet. + * + * LSO over IPv6 can not be fragmented. So need to disable LSO + * when IPv6 fragmentation is needed. + */ + if (mss < tcp->tcp_tcps->tcps_mss_min) + ixaflags |= IXAF_PMTU_TOO_SMALL; + + if (ixaflags & IXAF_PMTU_TOO_SMALL) + ixaflags &= ~(IXAF_DONTFRAG | IXAF_PMTU_IPV4_DF); + + if ((connp->conn_ipversion == IPV4_VERSION) && + !(ixaflags & IXAF_PMTU_IPV4_DF)) { + tcp->tcp_ipha->ipha_fragment_offset_and_flags = 0; } + ixa->ixa_flags = ixaflags; +} + +/* + * Do slow start retransmission after ICMP errors of PMTU changes. + */ +static void +tcp_rexmit_after_error(tcp_t *tcp) +{ + /* + * All sent data has been acknowledged or no data left to send, just + * to return. 
+ */ + if (!SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) || + (tcp->tcp_xmit_head == NULL)) + return; + + if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && (tcp->tcp_unsent == 0)) + tcp->tcp_rexmit_max = tcp->tcp_fss; + else + tcp->tcp_rexmit_max = tcp->tcp_snxt; + + tcp->tcp_rexmit_nxt = tcp->tcp_suna; + tcp->tcp_rexmit = B_TRUE; + tcp->tcp_dupack_cnt = 0; + tcp->tcp_snd_burst = TCP_CWND_SS; + tcp_ss_rexmit(tcp); +} + +/* + * tcp_icmp_error_ipv6 is called from tcp_icmp_input to process ICMPv6 + * error messages passed up by IP. + * Assumes that IP has pulled up all the extension headers as well + * as the ICMPv6 header. + */ +static void +tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, ip_recv_attr_t *ira) +{ + icmp6_t *icmp6; + ip6_t *ip6h; + uint16_t iph_hdr_length = ira->ira_ip_hdr_length; + tcpha_t *tcpha; + uint8_t *nexthdrp; + uint32_t seg_seq; + + /* + * Verify that we have a complete IP header. + */ + ASSERT((MBLKL(mp) >= sizeof (ip6_t))); + icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length]; ip6h = (ip6_t *)&icmp6[1]; /* * Verify if we have a complete ICMP and inner IP header. */ - if ((uchar_t *)&ip6h[1] > mp->b_wptr) - goto noticmpv6; + if ((uchar_t *)&ip6h[1] > mp->b_wptr) { +noticmpv6: + freemsg(mp); + return; + } if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) goto noticmpv6; @@ -8558,130 +7077,15 @@ noticmpv6: goto noticmpv6; } - /* - * ICMP errors come on the right queue or come on - * listener/global queue for detached connections and - * get switched to the right queue. If it comes on the - * right queue, policy check has already been done by IP - * and thus free the first_mp without verifying the policy. - * If it has come for a non-hard bound connection, we need - * to verify policy as IP may not have done it. - */ - if (!tcp->tcp_hard_bound) { - if (ipsec_mctl) { - secure = ipsec_in_is_secure(first_mp); - } else { - secure = B_FALSE; - } - if (secure) { - /* - * If we are willing to accept this in clear - * we don't have to verify policy. 
- */ - if (!ipsec_inbound_accept_clear(mp, NULL, ip6h)) { - if (!tcp_check_policy(tcp, first_mp, - NULL, ip6h, secure, ipsec_mctl)) { - /* - * tcp_check_policy called - * ip_drop_packet() on failure. - */ - return; - } - } - } - } else if (ipsec_mctl) { - /* - * This is a hard_bound connection. IP has already - * verified policy. We don't have to do it again. - */ - freeb(first_mp); - first_mp = mp; - ipsec_mctl = B_FALSE; - } - seg_seq = ntohl(tcpha->tha_seq); - /* - * TCP SHOULD check that the TCP sequence number contained in - * payload of the ICMP error message is within the range - * SND.UNA <= SEG.SEQ < SND.NXT. - */ - if (SEQ_LT(seg_seq, tcp->tcp_suna) || SEQ_GEQ(seg_seq, tcp->tcp_snxt)) { - /* - * If the ICMP message is bogus, should we kill the - * connection, or should we just drop the bogus ICMP - * message? It would probably make more sense to just - * drop the message so that if this one managed to get - * in, the real connection should not suffer. - */ - goto noticmpv6; - } - switch (icmp6->icmp6_type) { case ICMP6_PACKET_TOO_BIG: /* - * Reduce the MSS based on the new MTU. This will - * eliminate any fragmentation locally. - * N.B. There may well be some funny side-effects on - * the local send policy and the remote receive policy. - * Pending further research, we provide - * tcp_ignore_path_mtu just in case this proves - * disastrous somewhere. - * - * After updating the MSS, retransmit part of the - * dropped segment using the new mss by calling - * tcp_wput_data(). Need to adjust all those - * params to make sure tcp_wput_data() work properly. - */ - if (tcps->tcps_ignore_path_mtu) - break; - - /* - * Decrease the MSS by time stamp options - * IP options and IPSEC options. tcp_hdr_len - * includes time stamp option and IP option - * length. - */ - new_mss = ntohs(icmp6->icmp6_mtu) - tcp->tcp_hdr_len - - tcp->tcp_ipsec_overhead; - - /* - * Only update the MSS if the new one is - * smaller than the previous one. 
This is - * to avoid problems when getting multiple - * ICMP errors for the same MTU. - */ - if (new_mss >= tcp->tcp_mss) - break; - - ratio = tcp->tcp_cwnd / tcp->tcp_mss; - ASSERT(ratio >= 1); - tcp_mss_set(tcp, new_mss, B_TRUE); - - /* - * Make sure we have something to - * send. + * Update Path MTU, then try to send something out. */ - if (SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) && - (tcp->tcp_xmit_head != NULL)) { - /* - * Shrink tcp_cwnd in - * proportion to the old MSS/new MSS. - */ - tcp->tcp_cwnd = ratio * tcp->tcp_mss; - if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && - (tcp->tcp_unsent == 0)) { - tcp->tcp_rexmit_max = tcp->tcp_fss; - } else { - tcp->tcp_rexmit_max = tcp->tcp_snxt; - } - tcp->tcp_rexmit_nxt = tcp->tcp_suna; - tcp->tcp_rexmit = B_TRUE; - tcp->tcp_dupack_cnt = 0; - tcp->tcp_snd_burst = TCP_CWND_SS; - tcp_ss_rexmit(tcp); - } + tcp_update_pmtu(tcp, B_TRUE); + tcp_rexmit_after_error(tcp); break; - case ICMP6_DST_UNREACH: switch (icmp6->icmp6_code) { case ICMP6_DST_UNREACH_NOPORT: @@ -8692,7 +7096,6 @@ noticmpv6: ECONNREFUSED, 8); } break; - case ICMP6_DST_UNREACH_ADMIN: case ICMP6_DST_UNREACH_NOROUTE: case ICMP6_DST_UNREACH_BEYONDSCOPE: @@ -8708,7 +7111,6 @@ noticmpv6: * Ditch the half-open connection if we * suspect a SYN attack is under way. */ - tcp_ip_ire_mark_advice(tcp); (void) tcp_clean_death(tcp, tcp->tcp_client_errno, 9); } @@ -8720,7 +7122,6 @@ noticmpv6: break; } break; - case ICMP6_PARAM_PROB: /* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */ if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER && @@ -8739,83 +7140,42 @@ noticmpv6: default: break; } - freemsg(first_mp); + freemsg(mp); } /* * Notify IP that we are having trouble with this connection. IP should - * blow the IRE away and start over. + * make note so it can potentially use a different IRE. 
*/ static void tcp_ip_notify(tcp_t *tcp) { - struct iocblk *iocp; - ipid_t *ipid; - mblk_t *mp; - - /* IPv6 has NUD thus notification to delete the IRE is not needed */ - if (tcp->tcp_ipversion == IPV6_VERSION) - return; - - mp = mkiocb(IP_IOCTL); - if (mp == NULL) - return; - - iocp = (struct iocblk *)mp->b_rptr; - iocp->ioc_count = sizeof (ipid_t) + sizeof (tcp->tcp_ipha->ipha_dst); - - mp->b_cont = allocb(iocp->ioc_count, BPRI_HI); - if (!mp->b_cont) { - freeb(mp); - return; - } + conn_t *connp = tcp->tcp_connp; + ire_t *ire; - ipid = (ipid_t *)mp->b_cont->b_rptr; - mp->b_cont->b_wptr += iocp->ioc_count; - bzero(ipid, sizeof (*ipid)); - ipid->ipid_cmd = IP_IOC_IRE_DELETE_NO_REPLY; - ipid->ipid_ire_type = IRE_CACHE; - ipid->ipid_addr_offset = sizeof (ipid_t); - ipid->ipid_addr_length = sizeof (tcp->tcp_ipha->ipha_dst); /* * Note: in the case of source routing we want to blow away the * route to the first source route hop. */ - bcopy(&tcp->tcp_ipha->ipha_dst, &ipid[1], - sizeof (tcp->tcp_ipha->ipha_dst)); - - CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, mp); -} - -/* Unlink and return any mblk that looks like it contains an ire */ -static mblk_t * -tcp_ire_mp(mblk_t **mpp) -{ - mblk_t *mp = *mpp; - mblk_t *prev_mp = NULL; - - for (;;) { - switch (DB_TYPE(mp)) { - case IRE_DB_TYPE: - case IRE_DB_REQ_TYPE: - if (mp == *mpp) { - *mpp = mp->b_cont; - } else { - prev_mp->b_cont = mp->b_cont; - } - mp->b_cont = NULL; - return (mp); - default: - break; + ire = connp->conn_ixa->ixa_ire; + if (ire != NULL && !(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) { + if (ire->ire_ipversion == IPV4_VERSION) { + /* + * As per RFC 1122, we send an RTM_LOSING to inform + * routing protocols. 
+ */ + ip_rts_change(RTM_LOSING, ire->ire_addr, + ire->ire_gateway_addr, ire->ire_mask, + connp->conn_laddr_v4, 0, 0, 0, + (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_IFA), + ire->ire_ipst); } - prev_mp = mp; - mp = mp->b_cont; - if (mp == NULL) - break; + (void) ire_no_good(ire); } - return (mp); } +#pragma inline(tcp_send_data) + /* * Timer callback routine for keepalive probe. We do a fake resend of * last ACKed byte. Then set a timer using RTO. When the timer expires, @@ -8890,7 +7250,7 @@ tcp_keepalive_killer(void *arg) * timer back. */ if (mp != NULL) { - tcp_send_data(tcp, tcp->tcp_wq, mp); + tcp_send_data(tcp, mp); BUMP_MIB(&tcps->tcps_mib, tcpTimKeepaliveProbe); if (tcp->tcp_ka_last_intrvl != 0) { @@ -8930,17 +7290,17 @@ tcp_keepalive_killer(void *arg) int tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk) { - queue_t *q = tcp->tcp_rq; + conn_t *connp = tcp->tcp_connp; + queue_t *q = connp->conn_rq; int32_t mss = tcp->tcp_mss; int maxpsz; - conn_t *connp = tcp->tcp_connp; if (TCP_IS_DETACHED(tcp)) return (mss); if (tcp->tcp_fused) { maxpsz = tcp_fuse_maxpsz(tcp); mss = INFPSZ; - } else if (tcp->tcp_mdt || tcp->tcp_lso || tcp->tcp_maxpsz == 0) { + } else if (tcp->tcp_maxpsz_multiplier == 0) { /* * Set the sd_qn_maxpsz according to the socket send buffer * size, and sd_maxblk to INFPSZ (-1). This will essentially @@ -8948,7 +7308,7 @@ tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk) * kernel-allocated buffers without breaking it up into smaller * chunks. We round up the buffer size to the nearest SMSS. */ - maxpsz = MSS_ROUNDUP(tcp->tcp_xmit_hiwater, mss); + maxpsz = MSS_ROUNDUP(connp->conn_sndbuf, mss); if (tcp->tcp_kssl_ctx == NULL) mss = INFPSZ; else @@ -8960,21 +7320,17 @@ tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk) * head to break down larger than SMSS writes into SMSS- * size mblks, up to tcp_maxpsz_multiplier mblks at a time. 
*/ - /* XXX tune this with ndd tcp_maxpsz_multiplier */ - maxpsz = tcp->tcp_maxpsz * mss; - if (maxpsz > tcp->tcp_xmit_hiwater/2) { - maxpsz = tcp->tcp_xmit_hiwater/2; + maxpsz = tcp->tcp_maxpsz_multiplier * mss; + if (maxpsz > connp->conn_sndbuf / 2) { + maxpsz = connp->conn_sndbuf / 2; /* Round up to nearest mss */ maxpsz = MSS_ROUNDUP(maxpsz, mss); } } (void) proto_set_maxpsz(q, connp, maxpsz); - if (!(IPCL_IS_NONSTR(connp))) { - /* XXX do it in set_maxpsz()? */ - tcp->tcp_wq->q_maxpsz = maxpsz; - } - + if (!(IPCL_IS_NONSTR(connp))) + connp->conn_wq->q_maxpsz = maxpsz; if (set_maxblk) (void) proto_set_tx_maxblk(q, connp, mss); return (mss); @@ -8985,18 +7341,18 @@ tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk) * tcpopt struct and return a bitmask saying which options were found. */ static int -tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt) +tcp_parse_options(tcpha_t *tcpha, tcp_opt_t *tcpopt) { uchar_t *endp; int len; uint32_t mss; - uchar_t *up = (uchar_t *)tcph; + uchar_t *up = (uchar_t *)tcpha; int found = 0; int32_t sack_len; tcp_seq sack_begin, sack_end; tcp_t *tcp; - endp = up + TCP_HDR_LENGTH(tcph); + endp = up + TCP_HDR_LENGTH(tcpha); up += TCP_MIN_HEADER_LENGTH; while (up < endp) { len = endp - up; @@ -9135,28 +7491,20 @@ tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt) } /* - * Set the mss associated with a particular tcp based on its current value, - * and a new one passed in. Observe minimums and maximums, and reset - * other state variables that we want to view as multiples of mss. - * - * This function is called mainly because values like tcp_mss, tcp_cwnd, - * highwater marks etc. need to be initialized or adjusted. - * 1) From tcp_process_options() when the other side's SYN/SYN-ACK - * packet arrives. - * 2) We need to set a new MSS when ICMP_FRAGMENTATION_NEEDED or - * ICMP6_PACKET_TOO_BIG arrives. - * 3) From tcp_paws_check() if the other side stops sending the timestamp, - * to increase the MSS to use the extra bytes available. 
+ * Set the MSS associated with a particular tcp based on its current value, + * and a new one passed in. Observe minimums and maximums, and reset other + * state variables that we want to view as multiples of MSS. * - * Callers except tcp_paws_check() ensure that they only reduce mss. + * The value of MSS could be either increased or descreased. */ static void -tcp_mss_set(tcp_t *tcp, uint32_t mss, boolean_t do_ss) +tcp_mss_set(tcp_t *tcp, uint32_t mss) { uint32_t mss_max; tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; - if (tcp->tcp_ipversion == IPV4_VERSION) + if (connp->conn_ipversion == IPV4_VERSION) mss_max = tcps->tcps_mss_max_ipv4; else mss_max = tcps->tcps_mss_max_ipv6; @@ -9176,34 +7524,22 @@ tcp_mss_set(tcp_t *tcp, uint32_t mss, boolean_t do_ss) * TCP should be able to buffer at least 4 MSS data for obvious * performance reason. */ - if ((mss << 2) > tcp->tcp_xmit_hiwater) - tcp->tcp_xmit_hiwater = mss << 2; + if ((mss << 2) > connp->conn_sndbuf) + connp->conn_sndbuf = mss << 2; /* - * Set the xmit_lowater to at least twice of MSS. + * Set the send lowater to at least twice of MSS. */ - if ((mss << 1) > tcp->tcp_xmit_lowater) - tcp->tcp_xmit_lowater = mss << 1; + if ((mss << 1) > connp->conn_sndlowat) + connp->conn_sndlowat = mss << 1; + + /* + * Update tcp_cwnd according to the new value of MSS. Keep the + * previous ratio to preserve the transmit rate. + */ + tcp->tcp_cwnd = (tcp->tcp_cwnd / tcp->tcp_mss) * mss; + tcp->tcp_cwnd_cnt = 0; - if (do_ss) { - /* - * Either the tcp_cwnd is as yet uninitialized, or mss is - * changing due to a reduction in MTU, presumably as a - * result of a new path component, reset cwnd to its - * "initial" value, as a multiple of the new mss. - */ - SET_TCP_INIT_CWND(tcp, mss, tcps->tcps_slow_start_initial); - } else { - /* - * Called by tcp_paws_check(), the mss increased - * marginally to allow use of space previously taken - * by the timestamp option. 
It would be inappropriate - * to apply slow start or tcp_init_cwnd values to - * tcp_cwnd, simply adjust to a multiple of the new mss. - */ - tcp->tcp_cwnd = (tcp->tcp_cwnd / tcp->tcp_mss) * mss; - tcp->tcp_cwnd_cnt = 0; - } tcp->tcp_mss = mss; (void) tcp_maxpsz_set(tcp, B_TRUE); } @@ -9223,12 +7559,11 @@ tcp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) } static conn_t * -tcp_create_common(queue_t *q, cred_t *credp, boolean_t isv6, - boolean_t issocket, int *errorp) +tcp_create_common(cred_t *credp, boolean_t isv6, boolean_t issocket, + int *errorp) { tcp_t *tcp = NULL; conn_t *connp; - int err; zoneid_t zoneid; tcp_stack_t *tcps; squeue_t *sqp; @@ -9265,15 +7600,6 @@ tcp_create_common(queue_t *q, cred_t *credp, boolean_t isv6, else zoneid = crgetzoneid(credp); } - /* - * For stackid zero this is done from strplumb.c, but - * non-zero stackids are handled here. - */ - if (tcps->tcps_g_q == NULL && - tcps->tcps_netstack->netstack_stackid != - GLOBAL_NETSTACKID) { - tcp_g_q_setup(tcps); - } sqp = IP_SQUEUE_GET((uint_t)gethrtime()); connp = (conn_t *)tcp_get_conn(sqp, tcps); @@ -9286,41 +7612,50 @@ tcp_create_common(queue_t *q, cred_t *credp, boolean_t isv6, *errorp = ENOSR; return (NULL); } + ASSERT(connp->conn_ixa->ixa_protocol == connp->conn_proto); + connp->conn_sqp = sqp; connp->conn_initial_sqp = connp->conn_sqp; + connp->conn_ixa->ixa_sqp = connp->conn_sqp; tcp = connp->conn_tcp; + /* + * Besides asking IP to set the checksum for us, have conn_ip_output + * to do the following checks when necessary: + * + * IXAF_VERIFY_SOURCE: drop packets when our outer source goes invalid + * IXAF_VERIFY_PMTU: verify PMTU changes + * IXAF_VERIFY_LSO: verify LSO capability changes + */ + connp->conn_ixa->ixa_flags |= IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE | + IXAF_VERIFY_PMTU | IXAF_VERIFY_LSO; + + if (!tcps->tcps_dev_flow_ctl) + connp->conn_ixa->ixa_flags |= IXAF_NO_DEV_FLOW_CTL; + if (isv6) { - connp->conn_flags |= IPCL_TCP6; - connp->conn_send = 
ip_output_v6; - connp->conn_af_isv6 = B_TRUE; - connp->conn_pkt_isv6 = B_TRUE; - connp->conn_src_preferences = IPV6_PREFER_SRC_DEFAULT; - tcp->tcp_ipversion = IPV6_VERSION; - tcp->tcp_family = AF_INET6; + connp->conn_ixa->ixa_src_preferences = IPV6_PREFER_SRC_DEFAULT; + connp->conn_ipversion = IPV6_VERSION; + connp->conn_family = AF_INET6; tcp->tcp_mss = tcps->tcps_mss_def_ipv6; + connp->conn_default_ttl = tcps->tcps_ipv6_hoplimit; } else { - connp->conn_flags |= IPCL_TCP4; - connp->conn_send = ip_output; - connp->conn_af_isv6 = B_FALSE; - connp->conn_pkt_isv6 = B_FALSE; - tcp->tcp_ipversion = IPV4_VERSION; - tcp->tcp_family = AF_INET; + connp->conn_ipversion = IPV4_VERSION; + connp->conn_family = AF_INET; tcp->tcp_mss = tcps->tcps_mss_def_ipv4; + connp->conn_default_ttl = tcps->tcps_ipv4_ttl; } + connp->conn_xmit_ipp.ipp_unicast_hops = connp->conn_default_ttl; + + crhold(credp); + connp->conn_cred = credp; + connp->conn_cpid = curproc->p_pid; + connp->conn_open_time = lbolt64; - /* - * TCP keeps a copy of cred for cache locality reasons but - * we put a reference only once. If connp->conn_cred - * becomes invalid, tcp_cred should also be set to NULL. 
- */ - tcp->tcp_cred = connp->conn_cred = credp; - crhold(connp->conn_cred); - tcp->tcp_cpid = curproc->p_pid; - tcp->tcp_open_time = lbolt64; connp->conn_zoneid = zoneid; + /* conn_allzones can not be set this early, hence no IPCL_ZONEID */ + connp->conn_ixa->ixa_zoneid = zoneid; connp->conn_mlp_type = mlptSingle; - connp->conn_ulp_labeled = !is_system_labeled(); ASSERT(connp->conn_netstack == tcps->tcps_netstack); ASSERT(tcp->tcp_tcps == tcps); @@ -9331,38 +7666,22 @@ tcp_create_common(queue_t *q, cred_t *credp, boolean_t isv6, if (getpflags(NET_MAC_AWARE, credp) != 0) connp->conn_mac_mode = CONN_MAC_AWARE; - connp->conn_dev = NULL; + connp->conn_zone_is_global = (crgetzoneid(credp) == GLOBAL_ZONEID); + if (issocket) { - connp->conn_flags |= IPCL_SOCKET; tcp->tcp_issocket = 1; } - /* Non-zero default values */ - connp->conn_multicast_loop = IP_DEFAULT_MULTICAST_LOOP; - - if (q == NULL) { - /* - * Create a helper stream for non-STREAMS socket. - */ - err = ip_create_helper_stream(connp, tcps->tcps_ldi_ident); - if (err != 0) { - ip1dbg(("tcp_create_common: create of IP helper stream " - "failed\n")); - CONN_DEC_REF(connp); - *errorp = err; - return (NULL); - } - q = connp->conn_rq; - } + connp->conn_rcvbuf = tcps->tcps_recv_hiwat; + connp->conn_sndbuf = tcps->tcps_xmit_hiwat; + connp->conn_sndlowat = tcps->tcps_xmit_lowat; + connp->conn_so_type = SOCK_STREAM; + connp->conn_wroff = connp->conn_ht_iphc_allocated + + tcps->tcps_wroff_xtra; SOCK_CONNID_INIT(tcp->tcp_connid); - err = tcp_init(tcp, q); - if (err != 0) { - CONN_DEC_REF(connp); - *errorp = err; - return (NULL); - } - + tcp->tcp_state = TCPS_IDLE; + tcp_init_values(tcp); return (connp); } @@ -9415,7 +7734,7 @@ tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, q->q_qinfo = &tcp_acceptor_rinit; /* * the conn_dev and minor_arena will be subsequently used by - * tcp_wput_accept() and tcp_tpi_close_accept() to figure out + * tcp_tli_accept() and tcp_tpi_close_accept() to figure out * the 
minor device number for this connection from the q_ptr. */ RD(q)->q_ptr = (void *)conn_dev; @@ -9426,7 +7745,7 @@ tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, } issocket = flag & SO_SOCKSTR; - connp = tcp_create_common(q, credp, isv6, issocket, &err); + connp = tcp_create_common(credp, isv6, issocket, &err); if (connp == NULL) { inet_minor_free(minor_arena, conn_dev); @@ -9434,6 +7753,8 @@ tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, return (err); } + connp->conn_rq = q; + connp->conn_wq = WR(q); q->q_ptr = WR(q)->q_ptr = connp; connp->conn_dev = conn_dev; @@ -9500,7 +7821,7 @@ tcp_allow_connopt_set(int level, int name) } /* - * this routine gets default values of certain options whose default + * This routine gets default values of certain options whose default * values are maintained by protocol specific code */ /* ARGSUSED */ @@ -9553,321 +7874,102 @@ tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr) return (sizeof (int)); } +/* + * TCP routine to get the values of options. + */ static int tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) { int *i1 = (int *)ptr; tcp_t *tcp = connp->conn_tcp; - ip6_pkt_t *ipp = &tcp->tcp_sticky_ipp; + conn_opt_arg_t coas; + int retval; + + coas.coa_connp = connp; + coas.coa_ixa = connp->conn_ixa; + coas.coa_ipp = &connp->conn_xmit_ipp; + coas.coa_ancillary = B_FALSE; + coas.coa_changed = 0; switch (level) { case SOL_SOCKET: switch (name) { - case SO_LINGER: { - struct linger *lgr = (struct linger *)ptr; - - lgr->l_onoff = tcp->tcp_linger ? SO_LINGER : 0; - lgr->l_linger = tcp->tcp_lingertime; - } - return (sizeof (struct linger)); - case SO_DEBUG: - *i1 = tcp->tcp_debug ? SO_DEBUG : 0; - break; - case SO_KEEPALIVE: - *i1 = tcp->tcp_ka_enabled ? SO_KEEPALIVE : 0; - break; - case SO_DONTROUTE: - *i1 = tcp->tcp_dontroute ? SO_DONTROUTE : 0; - break; - case SO_USELOOPBACK: - *i1 = tcp->tcp_useloopback ? 
SO_USELOOPBACK : 0; - break; - case SO_BROADCAST: - *i1 = tcp->tcp_broadcast ? SO_BROADCAST : 0; - break; - case SO_REUSEADDR: - *i1 = tcp->tcp_reuseaddr ? SO_REUSEADDR : 0; - break; - case SO_OOBINLINE: - *i1 = tcp->tcp_oobinline ? SO_OOBINLINE : 0; - break; - case SO_DGRAM_ERRIND: - *i1 = tcp->tcp_dgram_errind ? SO_DGRAM_ERRIND : 0; - break; - case SO_TYPE: - *i1 = SOCK_STREAM; - break; - case SO_SNDBUF: - *i1 = tcp->tcp_xmit_hiwater; - break; - case SO_RCVBUF: - *i1 = tcp->tcp_recv_hiwater; - break; case SO_SND_COPYAVOID: *i1 = tcp->tcp_snd_zcopy_on ? SO_SND_COPYAVOID : 0; - break; - case SO_ALLZONES: - *i1 = connp->conn_allzones ? 1 : 0; - break; - case SO_ANON_MLP: - *i1 = connp->conn_anon_mlp; - break; - case SO_MAC_EXEMPT: - *i1 = (connp->conn_mac_mode == CONN_MAC_AWARE); - break; - case SO_MAC_IMPLICIT: - *i1 = (connp->conn_mac_mode == CONN_MAC_IMPLICIT); - break; - case SO_EXCLBIND: - *i1 = tcp->tcp_exclbind ? SO_EXCLBIND : 0; - break; - case SO_PROTOTYPE: - *i1 = IPPROTO_TCP; - break; - case SO_DOMAIN: - *i1 = tcp->tcp_family; - break; + return (sizeof (int)); case SO_ACCEPTCONN: *i1 = (tcp->tcp_state == TCPS_LISTEN); - default: - return (-1); + return (sizeof (int)); } break; case IPPROTO_TCP: switch (name) { case TCP_NODELAY: *i1 = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0; - break; + return (sizeof (int)); case TCP_MAXSEG: *i1 = tcp->tcp_mss; - break; + return (sizeof (int)); case TCP_NOTIFY_THRESHOLD: *i1 = (int)tcp->tcp_first_timer_threshold; - break; + return (sizeof (int)); case TCP_ABORT_THRESHOLD: *i1 = tcp->tcp_second_timer_threshold; - break; + return (sizeof (int)); case TCP_CONN_NOTIFY_THRESHOLD: *i1 = tcp->tcp_first_ctimer_threshold; - break; + return (sizeof (int)); case TCP_CONN_ABORT_THRESHOLD: *i1 = tcp->tcp_second_ctimer_threshold; - break; - case TCP_RECVDSTADDR: - *i1 = tcp->tcp_recvdstaddr; - break; - case TCP_ANONPRIVBIND: - *i1 = tcp->tcp_anon_priv_bind; - break; - case TCP_EXCLBIND: - *i1 = tcp->tcp_exclbind ? 
TCP_EXCLBIND : 0; - break; + return (sizeof (int)); case TCP_INIT_CWND: *i1 = tcp->tcp_init_cwnd; - break; + return (sizeof (int)); case TCP_KEEPALIVE_THRESHOLD: *i1 = tcp->tcp_ka_interval; - break; + return (sizeof (int)); case TCP_KEEPALIVE_ABORT_THRESHOLD: *i1 = tcp->tcp_ka_abort_thres; - break; + return (sizeof (int)); case TCP_CORK: *i1 = tcp->tcp_cork; - break; - default: - return (-1); + return (sizeof (int)); } break; case IPPROTO_IP: - if (tcp->tcp_family != AF_INET) + if (connp->conn_family != AF_INET) return (-1); switch (name) { case IP_OPTIONS: - case T_IP_OPTIONS: { - /* - * This is compatible with BSD in that in only return - * the reverse source route with the final destination - * as the last entry. The first 4 bytes of the option - * will contain the final destination. - */ - int opt_len; - - opt_len = (char *)tcp->tcp_tcph - (char *)tcp->tcp_ipha; - opt_len -= tcp->tcp_label_len + IP_SIMPLE_HDR_LENGTH; - ASSERT(opt_len >= 0); + case T_IP_OPTIONS: /* Caller ensures enough space */ - if (opt_len > 0) { - /* - * TODO: Do we have to handle getsockopt on an - * initiator as well? - */ - return (ip_opt_get_user(tcp->tcp_ipha, ptr)); - } - return (0); - } - case IP_TOS: - case T_IP_TOS: - *i1 = (int)tcp->tcp_ipha->ipha_type_of_service; - break; - case IP_TTL: - *i1 = (int)tcp->tcp_ipha->ipha_ttl; - break; - case IP_NEXTHOP: - /* Handled at IP level */ - return (-EINVAL); + return (ip_opt_get_user(connp, ptr)); default: - return (-1); + break; } break; + case IPPROTO_IPV6: /* * IPPROTO_IPV6 options are only supported for sockets * that are using IPv6 on the wire. 
*/ - if (tcp->tcp_ipversion != IPV6_VERSION) { + if (connp->conn_ipversion != IPV6_VERSION) { return (-1); } switch (name) { - case IPV6_UNICAST_HOPS: - *i1 = (unsigned int) tcp->tcp_ip6h->ip6_hops; - break; /* goto sizeof (int) option return */ - case IPV6_BOUND_IF: - /* Zero if not set */ - *i1 = tcp->tcp_bound_if; - break; /* goto sizeof (int) option return */ - case IPV6_RECVPKTINFO: - if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) - *i1 = 1; - else - *i1 = 0; - break; /* goto sizeof (int) option return */ - case IPV6_RECVTCLASS: - if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVTCLASS) - *i1 = 1; - else - *i1 = 0; - break; /* goto sizeof (int) option return */ - case IPV6_RECVHOPLIMIT: - if (tcp->tcp_ipv6_recvancillary & - TCP_IPV6_RECVHOPLIMIT) - *i1 = 1; - else - *i1 = 0; - break; /* goto sizeof (int) option return */ - case IPV6_RECVHOPOPTS: - if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVHOPOPTS) - *i1 = 1; - else - *i1 = 0; - break; /* goto sizeof (int) option return */ - case IPV6_RECVDSTOPTS: - if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVDSTOPTS) - *i1 = 1; - else - *i1 = 0; - break; /* goto sizeof (int) option return */ - case _OLD_IPV6_RECVDSTOPTS: - if (tcp->tcp_ipv6_recvancillary & - TCP_OLD_IPV6_RECVDSTOPTS) - *i1 = 1; - else - *i1 = 0; - break; /* goto sizeof (int) option return */ - case IPV6_RECVRTHDR: - if (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVRTHDR) - *i1 = 1; - else - *i1 = 0; - break; /* goto sizeof (int) option return */ - case IPV6_RECVRTHDRDSTOPTS: - if (tcp->tcp_ipv6_recvancillary & - TCP_IPV6_RECVRTDSTOPTS) - *i1 = 1; - else - *i1 = 0; - break; /* goto sizeof (int) option return */ - case IPV6_PKTINFO: { - /* XXX assumes that caller has room for max size! 
*/ - struct in6_pktinfo *pkti; - - pkti = (struct in6_pktinfo *)ptr; - if (ipp->ipp_fields & IPPF_IFINDEX) - pkti->ipi6_ifindex = ipp->ipp_ifindex; - else - pkti->ipi6_ifindex = 0; - if (ipp->ipp_fields & IPPF_ADDR) - pkti->ipi6_addr = ipp->ipp_addr; - else - pkti->ipi6_addr = ipv6_all_zeros; - return (sizeof (struct in6_pktinfo)); - } - case IPV6_TCLASS: - if (ipp->ipp_fields & IPPF_TCLASS) - *i1 = ipp->ipp_tclass; - else - *i1 = IPV6_FLOW_TCLASS( - IPV6_DEFAULT_VERS_AND_FLOW); - break; /* goto sizeof (int) option return */ - case IPV6_NEXTHOP: { - sin6_t *sin6 = (sin6_t *)ptr; - - if (!(ipp->ipp_fields & IPPF_NEXTHOP)) - return (0); - *sin6 = sin6_null; - sin6->sin6_family = AF_INET6; - sin6->sin6_addr = ipp->ipp_nexthop; - return (sizeof (sin6_t)); - } - case IPV6_HOPOPTS: - if (!(ipp->ipp_fields & IPPF_HOPOPTS)) - return (0); - if (ipp->ipp_hopoptslen <= tcp->tcp_label_len) - return (0); - bcopy((char *)ipp->ipp_hopopts + tcp->tcp_label_len, - ptr, ipp->ipp_hopoptslen - tcp->tcp_label_len); - if (tcp->tcp_label_len > 0) { - ptr[0] = ((char *)ipp->ipp_hopopts)[0]; - ptr[1] = (ipp->ipp_hopoptslen - - tcp->tcp_label_len + 7) / 8 - 1; - } - return (ipp->ipp_hopoptslen - tcp->tcp_label_len); - case IPV6_RTHDRDSTOPTS: - if (!(ipp->ipp_fields & IPPF_RTDSTOPTS)) - return (0); - bcopy(ipp->ipp_rtdstopts, ptr, ipp->ipp_rtdstoptslen); - return (ipp->ipp_rtdstoptslen); - case IPV6_RTHDR: - if (!(ipp->ipp_fields & IPPF_RTHDR)) - return (0); - bcopy(ipp->ipp_rthdr, ptr, ipp->ipp_rthdrlen); - return (ipp->ipp_rthdrlen); - case IPV6_DSTOPTS: - if (!(ipp->ipp_fields & IPPF_DSTOPTS)) - return (0); - bcopy(ipp->ipp_dstopts, ptr, ipp->ipp_dstoptslen); - return (ipp->ipp_dstoptslen); - case IPV6_SRC_PREFERENCES: - return (ip6_get_src_preferences(connp, - (uint32_t *)ptr)); - case IPV6_PATHMTU: { - struct ip6_mtuinfo *mtuinfo = (struct ip6_mtuinfo *)ptr; - + case IPV6_PATHMTU: if (tcp->tcp_state < TCPS_ESTABLISHED) return (-1); - - return (ip_fill_mtuinfo(&connp->conn_remv6, - 
connp->conn_fport, mtuinfo, - connp->conn_netstack)); - } - default: - return (-1); + break; } break; - default: - return (-1); } - return (sizeof (int)); + mutex_enter(&connp->conn_lock); + retval = conn_opt_get(&coas, level, name, ptr); + mutex_exit(&connp->conn_lock); + return (retval); } /* @@ -9896,7 +7998,6 @@ tcp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len, tcp_opt_obj.odb_opt_des_arr, tcp_opt_obj.odb_opt_arr_cnt, - tcp_opt_obj.odb_topmost_tpiprovider, B_FALSE, B_TRUE, cr); if (error != 0) { if (error < 0) { @@ -9909,30 +8010,28 @@ tcp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, error = squeue_synch_enter(sqp, connp, NULL); if (error == ENOMEM) { + kmem_free(optvalp_buf, max_optbuf_len); return (ENOMEM); } len = tcp_opt_get(connp, level, option_name, optvalp_buf); squeue_synch_exit(sqp, connp); - if (len < 0) { - /* - * Pass on to IP - */ + if (len == -1) { kmem_free(optvalp_buf, max_optbuf_len); - return (ip_get_options(connp, level, option_name, - optvalp, optlen, cr)); - } else { - /* - * update optlen and copy option value - */ - t_uscalar_t size = MIN(len, *optlen); - bcopy(optvalp_buf, optvalp, size); - bcopy(&size, optlen, sizeof (size)); - - kmem_free(optvalp_buf, max_optbuf_len); - return (0); + return (EINVAL); } + + /* + * update optlen and copy option value + */ + t_uscalar_t size = MIN(len, *optlen); + + bcopy(optvalp_buf, optvalp, size); + bcopy(&size, optlen, sizeof (size)); + + kmem_free(optvalp_buf, max_optbuf_len); + return (0); } /* @@ -9943,7 +8042,7 @@ tcp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, int tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, - void *thisdg_attrs, cred_t *cr, mblk_t *mblk) + void *thisdg_attrs, cred_t *cr) { tcp_t *tcp = connp->conn_tcp; int *i1 = (int *)invalp; @@ -9951,6 
+8050,13 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, boolean_t checkonly; int reterr; tcp_stack_t *tcps = tcp->tcp_tcps; + conn_opt_arg_t coas; + + coas.coa_connp = connp; + coas.coa_ixa = connp->conn_ixa; + coas.coa_ipp = &connp->conn_xmit_ipp; + coas.coa_ancillary = B_FALSE; + coas.coa_changed = 0; switch (optset_context) { case SETFN_OPTCOM_CHECKONLY: @@ -10016,37 +8122,6 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, switch (level) { case SOL_SOCKET: switch (name) { - case SO_LINGER: { - struct linger *lgr = (struct linger *)invalp; - - if (!checkonly) { - if (lgr->l_onoff) { - tcp->tcp_linger = 1; - tcp->tcp_lingertime = lgr->l_linger; - } else { - tcp->tcp_linger = 0; - tcp->tcp_lingertime = 0; - } - /* struct copy */ - *(struct linger *)outvalp = *lgr; - } else { - if (!lgr->l_onoff) { - ((struct linger *) - outvalp)->l_onoff = 0; - ((struct linger *) - outvalp)->l_linger = 0; - } else { - /* struct copy */ - *(struct linger *)outvalp = *lgr; - } - } - *outlenp = sizeof (struct linger); - return (0); - } - case SO_DEBUG: - if (!checkonly) - tcp->tcp_debug = onoff; - break; case SO_KEEPALIVE: if (checkonly) { /* check only case */ @@ -10054,65 +8129,25 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, } if (!onoff) { - if (tcp->tcp_ka_enabled) { + if (connp->conn_keepalive) { if (tcp->tcp_ka_tid != 0) { (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ka_tid); tcp->tcp_ka_tid = 0; } - tcp->tcp_ka_enabled = 0; + connp->conn_keepalive = 0; } break; } - if (!tcp->tcp_ka_enabled) { + if (!connp->conn_keepalive) { /* Crank up the keepalive timer */ tcp->tcp_ka_last_intrvl = 0; tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_killer, MSEC_TO_TICK(tcp->tcp_ka_interval)); - tcp->tcp_ka_enabled = 1; - } - break; - case SO_DONTROUTE: - /* - * SO_DONTROUTE, SO_USELOOPBACK, and SO_BROADCAST are - * only of interest to IP. We track them here only so - * that we can report their current value. 
- */ - if (!checkonly) { - tcp->tcp_dontroute = onoff; - tcp->tcp_connp->conn_dontroute = onoff; + connp->conn_keepalive = 1; } break; - case SO_USELOOPBACK: - if (!checkonly) { - tcp->tcp_useloopback = onoff; - tcp->tcp_connp->conn_loopback = onoff; - } - break; - case SO_BROADCAST: - if (!checkonly) { - tcp->tcp_broadcast = onoff; - tcp->tcp_connp->conn_broadcast = onoff; - } - break; - case SO_REUSEADDR: - if (!checkonly) { - tcp->tcp_reuseaddr = onoff; - tcp->tcp_connp->conn_reuseaddr = onoff; - } - break; - case SO_OOBINLINE: - if (!checkonly) { - tcp->tcp_oobinline = onoff; - if (IPCL_IS_NONSTR(tcp->tcp_connp)) - proto_set_rx_oob_opt(connp, onoff); - } - break; - case SO_DGRAM_ERRIND: - if (!checkonly) - tcp->tcp_dgram_errind = onoff; - break; case SO_SNDBUF: { if (*i1 > tcps->tcps_max_buf) { *outlenp = 0; @@ -10121,11 +8156,11 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, if (checkonly) break; - tcp->tcp_xmit_hiwater = *i1; - if (tcps->tcps_snd_lowat_fraction != 0) - tcp->tcp_xmit_lowater = - tcp->tcp_xmit_hiwater / + connp->conn_sndbuf = *i1; + if (tcps->tcps_snd_lowat_fraction != 0) { + connp->conn_sndlowat = connp->conn_sndbuf / tcps->tcps_snd_lowat_fraction; + } (void) tcp_maxpsz_set(tcp, B_TRUE); /* * If we are flow-controlled, recheck the condition. @@ -10135,11 +8170,12 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, */ mutex_enter(&tcp->tcp_non_sq_lock); if (tcp->tcp_flow_stopped && - TCP_UNSENT_BYTES(tcp) < tcp->tcp_xmit_hiwater) { + TCP_UNSENT_BYTES(tcp) < connp->conn_sndbuf) { tcp_clrqfull(tcp); } mutex_exit(&tcp->tcp_non_sq_lock); - break; + *outlenp = inlen; + return (0); } case SO_RCVBUF: if (*i1 > tcps->tcps_max_buf) { @@ -10155,43 +8191,20 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, * XXX should we return the rwnd here * and tcp_opt_get ? 
*/ - break; + *outlenp = inlen; + return (0); case SO_SND_COPYAVOID: if (!checkonly) { - /* we only allow enable at most once for now */ if (tcp->tcp_loopback || (tcp->tcp_kssl_ctx != NULL) || - (!tcp->tcp_snd_zcopy_aware && - (onoff != 1 || !tcp_zcopy_check(tcp)))) { + (onoff != 1) || !tcp_zcopy_check(tcp)) { *outlenp = 0; return (EOPNOTSUPP); } tcp->tcp_snd_zcopy_aware = 1; } - break; - case SO_RCVTIMEO: - case SO_SNDTIMEO: - /* - * Pass these two options in order for third part - * protocol usage. Here just return directly. - */ + *outlenp = inlen; return (0); - case SO_ALLZONES: - /* Pass option along to IP level for handling */ - return (-EINVAL); - case SO_ANON_MLP: - /* Pass option along to IP level for handling */ - return (-EINVAL); - case SO_MAC_EXEMPT: - /* Pass option along to IP level for handling */ - return (-EINVAL); - case SO_EXCLBIND: - if (!checkonly) - tcp->tcp_exclbind = onoff; - break; - default: - *outlenp = 0; - return (EINVAL); } break; case IPPROTO_TCP: @@ -10217,25 +8230,12 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, tcp->tcp_second_ctimer_threshold = *i1; break; case TCP_RECVDSTADDR: - if (tcp->tcp_state > TCPS_LISTEN) - return (EOPNOTSUPP); - if (!checkonly) - tcp->tcp_recvdstaddr = onoff; - break; - case TCP_ANONPRIVBIND: - if ((reterr = secpolicy_net_privaddr(cr, 0, - IPPROTO_TCP)) != 0) { + if (tcp->tcp_state > TCPS_LISTEN) { *outlenp = 0; - return (reterr); - } - if (!checkonly) { - tcp->tcp_anon_priv_bind = onoff; + return (EOPNOTSUPP); } + /* Setting done in conn_opt_set */ break; - case TCP_EXCLBIND: - if (!checkonly) - tcp->tcp_exclbind = onoff; - break; /* goto sizeof (int) option return */ case TCP_INIT_CWND: { uint32_t init_cwnd = *((uint32_t *)invalp); @@ -10278,7 +8278,7 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, * keepalive timer. 
*/ if (tcp->tcp_ka_tid != 0) { - ASSERT(tcp->tcp_ka_enabled); + ASSERT(connp->conn_keepalive); (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ka_tid); tcp->tcp_ka_last_intrvl = 0; @@ -10318,49 +8318,15 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, } break; default: - *outlenp = 0; - return (EINVAL); + break; } break; case IPPROTO_IP: - if (tcp->tcp_family != AF_INET) { + if (connp->conn_family != AF_INET) { *outlenp = 0; - return (ENOPROTOOPT); + return (EINVAL); } switch (name) { - case IP_OPTIONS: - case T_IP_OPTIONS: - reterr = tcp_opt_set_header(tcp, checkonly, - invalp, inlen); - if (reterr) { - *outlenp = 0; - return (reterr); - } - /* OK return - copy input buffer into output buffer */ - if (invalp != outvalp) { - /* don't trust bcopy for identical src/dst */ - bcopy(invalp, outvalp, inlen); - } - *outlenp = inlen; - return (0); - case IP_TOS: - case T_IP_TOS: - if (!checkonly) { - tcp->tcp_ipha->ipha_type_of_service = - (uchar_t)*i1; - tcp->tcp_tos = (uchar_t)*i1; - } - break; - case IP_TTL: - if (!checkonly) { - tcp->tcp_ipha->ipha_ttl = (uchar_t)*i1; - tcp->tcp_ttl = (uchar_t)*i1; - } - break; - case IP_BOUND_IF: - case IP_NEXTHOP: - /* Handled at the IP level */ - return (-EINVAL); case IP_SEC_OPT: /* * We should not allow policy setting after @@ -10368,166 +8334,42 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, */ if (tcp->tcp_state == TCPS_LISTEN) { return (EINVAL); - } else { - /* Handled at the IP level */ - return (-EINVAL); } - default: - *outlenp = 0; - return (EINVAL); + break; } break; - case IPPROTO_IPV6: { - ip6_pkt_t *ipp; - + case IPPROTO_IPV6: /* * IPPROTO_IPV6 options are only supported for sockets * that are using IPv6 on the wire. 
*/ - if (tcp->tcp_ipversion != IPV6_VERSION) { + if (connp->conn_ipversion != IPV6_VERSION) { *outlenp = 0; - return (ENOPROTOOPT); + return (EINVAL); } - /* - * Only sticky options; no ancillary data - */ - ipp = &tcp->tcp_sticky_ipp; switch (name) { - case IPV6_UNICAST_HOPS: - /* -1 means use default */ - if (*i1 < -1 || *i1 > IPV6_MAX_HOPS) { - *outlenp = 0; - return (EINVAL); - } - if (!checkonly) { - if (*i1 == -1) { - tcp->tcp_ip6h->ip6_hops = - ipp->ipp_unicast_hops = - (uint8_t)tcps->tcps_ipv6_hoplimit; - ipp->ipp_fields &= ~IPPF_UNICAST_HOPS; - /* Pass modified value to IP. */ - *i1 = tcp->tcp_ip6h->ip6_hops; - } else { - tcp->tcp_ip6h->ip6_hops = - ipp->ipp_unicast_hops = - (uint8_t)*i1; - ipp->ipp_fields |= IPPF_UNICAST_HOPS; - } - reterr = tcp_build_hdrs(tcp); - if (reterr != 0) - return (reterr); - } - break; - case IPV6_BOUND_IF: - if (!checkonly) { - tcp->tcp_bound_if = *i1; - PASS_OPT_TO_IP(connp); - } - break; - /* - * Set boolean switches for ancillary data delivery - */ case IPV6_RECVPKTINFO: if (!checkonly) { - if (onoff) - tcp->tcp_ipv6_recvancillary |= - TCP_IPV6_RECVPKTINFO; - else - tcp->tcp_ipv6_recvancillary &= - ~TCP_IPV6_RECVPKTINFO; /* Force it to be sent up with the next msg */ tcp->tcp_recvifindex = 0; - PASS_OPT_TO_IP(connp); } break; case IPV6_RECVTCLASS: if (!checkonly) { - if (onoff) - tcp->tcp_ipv6_recvancillary |= - TCP_IPV6_RECVTCLASS; - else - tcp->tcp_ipv6_recvancillary &= - ~TCP_IPV6_RECVTCLASS; - PASS_OPT_TO_IP(connp); + /* Force it to be sent up with the next msg */ + tcp->tcp_recvtclass = 0xffffffffU; } break; case IPV6_RECVHOPLIMIT: if (!checkonly) { - if (onoff) - tcp->tcp_ipv6_recvancillary |= - TCP_IPV6_RECVHOPLIMIT; - else - tcp->tcp_ipv6_recvancillary &= - ~TCP_IPV6_RECVHOPLIMIT; /* Force it to be sent up with the next msg */ tcp->tcp_recvhops = 0xffffffffU; - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_RECVHOPOPTS: - if (!checkonly) { - if (onoff) - tcp->tcp_ipv6_recvancillary |= - TCP_IPV6_RECVHOPOPTS; - else 
- tcp->tcp_ipv6_recvancillary &= - ~TCP_IPV6_RECVHOPOPTS; - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_RECVDSTOPTS: - if (!checkonly) { - if (onoff) - tcp->tcp_ipv6_recvancillary |= - TCP_IPV6_RECVDSTOPTS; - else - tcp->tcp_ipv6_recvancillary &= - ~TCP_IPV6_RECVDSTOPTS; - PASS_OPT_TO_IP(connp); - } - break; - case _OLD_IPV6_RECVDSTOPTS: - if (!checkonly) { - if (onoff) - tcp->tcp_ipv6_recvancillary |= - TCP_OLD_IPV6_RECVDSTOPTS; - else - tcp->tcp_ipv6_recvancillary &= - ~TCP_OLD_IPV6_RECVDSTOPTS; - } - break; - case IPV6_RECVRTHDR: - if (!checkonly) { - if (onoff) - tcp->tcp_ipv6_recvancillary |= - TCP_IPV6_RECVRTHDR; - else - tcp->tcp_ipv6_recvancillary &= - ~TCP_IPV6_RECVRTHDR; - PASS_OPT_TO_IP(connp); - } - break; - case IPV6_RECVRTHDRDSTOPTS: - if (!checkonly) { - if (onoff) - tcp->tcp_ipv6_recvancillary |= - TCP_IPV6_RECVRTDSTOPTS; - else - tcp->tcp_ipv6_recvancillary &= - ~TCP_IPV6_RECVRTDSTOPTS; - PASS_OPT_TO_IP(connp); } break; case IPV6_PKTINFO: - if (inlen != 0 && inlen != sizeof (struct in6_pktinfo)) - return (EINVAL); - if (checkonly) - break; - - if (inlen == 0) { - ipp->ipp_fields &= ~(IPPF_IFINDEX|IPPF_ADDR); - } else { + /* This is an extra check for TCP */ + if (inlen == sizeof (struct in6_pktinfo)) { struct in6_pktinfo *pkti; pkti = (struct in6_pktinfo *)invalp; @@ -10539,219 +8381,8 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, */ if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr)) return (EINVAL); - /* - * IP will validate the source address and - * interface index. 
- */ - if (IPCL_IS_NONSTR(tcp->tcp_connp)) { - reterr = ip_set_options(tcp->tcp_connp, - level, name, invalp, inlen, cr); - } else { - reterr = ip6_set_pktinfo(cr, - tcp->tcp_connp, pkti); - } - if (reterr != 0) - return (reterr); - ipp->ipp_ifindex = pkti->ipi6_ifindex; - ipp->ipp_addr = pkti->ipi6_addr; - if (ipp->ipp_ifindex != 0) - ipp->ipp_fields |= IPPF_IFINDEX; - else - ipp->ipp_fields &= ~IPPF_IFINDEX; - if (!IN6_IS_ADDR_UNSPECIFIED(&ipp->ipp_addr)) - ipp->ipp_fields |= IPPF_ADDR; - else - ipp->ipp_fields &= ~IPPF_ADDR; - } - reterr = tcp_build_hdrs(tcp); - if (reterr != 0) - return (reterr); - break; - case IPV6_TCLASS: - if (inlen != 0 && inlen != sizeof (int)) - return (EINVAL); - if (checkonly) - break; - - if (inlen == 0) { - ipp->ipp_fields &= ~IPPF_TCLASS; - } else { - if (*i1 > 255 || *i1 < -1) - return (EINVAL); - if (*i1 == -1) { - ipp->ipp_tclass = 0; - *i1 = 0; - } else { - ipp->ipp_tclass = *i1; - } - ipp->ipp_fields |= IPPF_TCLASS; - } - reterr = tcp_build_hdrs(tcp); - if (reterr != 0) - return (reterr); - break; - case IPV6_NEXTHOP: - /* - * IP will verify that the nexthop is reachable - * and fail for sticky options. - */ - if (inlen != 0 && inlen != sizeof (sin6_t)) - return (EINVAL); - if (checkonly) - break; - - if (inlen == 0) { - ipp->ipp_fields &= ~IPPF_NEXTHOP; - } else { - sin6_t *sin6 = (sin6_t *)invalp; - - if (sin6->sin6_family != AF_INET6) - return (EAFNOSUPPORT); - if (IN6_IS_ADDR_V4MAPPED( - &sin6->sin6_addr)) - return (EADDRNOTAVAIL); - ipp->ipp_nexthop = sin6->sin6_addr; - if (!IN6_IS_ADDR_UNSPECIFIED( - &ipp->ipp_nexthop)) - ipp->ipp_fields |= IPPF_NEXTHOP; - else - ipp->ipp_fields &= ~IPPF_NEXTHOP; - } - reterr = tcp_build_hdrs(tcp); - if (reterr != 0) - return (reterr); - PASS_OPT_TO_IP(connp); - break; - case IPV6_HOPOPTS: { - ip6_hbh_t *hopts = (ip6_hbh_t *)invalp; - - /* - * Sanity checks - minimum size, size a multiple of - * eight bytes, and matching size passed in. 
- */ - if (inlen != 0 && - inlen != (8 * (hopts->ip6h_len + 1))) - return (EINVAL); - - if (checkonly) - break; - - reterr = optcom_pkt_set(invalp, inlen, B_TRUE, - (uchar_t **)&ipp->ipp_hopopts, - &ipp->ipp_hopoptslen, tcp->tcp_label_len); - if (reterr != 0) - return (reterr); - if (ipp->ipp_hopoptslen == 0) - ipp->ipp_fields &= ~IPPF_HOPOPTS; - else - ipp->ipp_fields |= IPPF_HOPOPTS; - reterr = tcp_build_hdrs(tcp); - if (reterr != 0) - return (reterr); - break; - } - case IPV6_RTHDRDSTOPTS: { - ip6_dest_t *dopts = (ip6_dest_t *)invalp; - - /* - * Sanity checks - minimum size, size a multiple of - * eight bytes, and matching size passed in. - */ - if (inlen != 0 && - inlen != (8 * (dopts->ip6d_len + 1))) - return (EINVAL); - - if (checkonly) - break; - - reterr = optcom_pkt_set(invalp, inlen, B_TRUE, - (uchar_t **)&ipp->ipp_rtdstopts, - &ipp->ipp_rtdstoptslen, 0); - if (reterr != 0) - return (reterr); - if (ipp->ipp_rtdstoptslen == 0) - ipp->ipp_fields &= ~IPPF_RTDSTOPTS; - else - ipp->ipp_fields |= IPPF_RTDSTOPTS; - reterr = tcp_build_hdrs(tcp); - if (reterr != 0) - return (reterr); - break; - } - case IPV6_DSTOPTS: { - ip6_dest_t *dopts = (ip6_dest_t *)invalp; - - /* - * Sanity checks - minimum size, size a multiple of - * eight bytes, and matching size passed in. - */ - if (inlen != 0 && - inlen != (8 * (dopts->ip6d_len + 1))) - return (EINVAL); - - if (checkonly) - break; - - reterr = optcom_pkt_set(invalp, inlen, B_TRUE, - (uchar_t **)&ipp->ipp_dstopts, - &ipp->ipp_dstoptslen, 0); - if (reterr != 0) - return (reterr); - if (ipp->ipp_dstoptslen == 0) - ipp->ipp_fields &= ~IPPF_DSTOPTS; - else - ipp->ipp_fields |= IPPF_DSTOPTS; - reterr = tcp_build_hdrs(tcp); - if (reterr != 0) - return (reterr); - break; - } - case IPV6_RTHDR: { - ip6_rthdr_t *rt = (ip6_rthdr_t *)invalp; - - /* - * Sanity checks - minimum size, size a multiple of - * eight bytes, and matching size passed in. 
- */ - if (inlen != 0 && - inlen != (8 * (rt->ip6r_len + 1))) - return (EINVAL); - - if (checkonly) - break; - - reterr = optcom_pkt_set(invalp, inlen, B_TRUE, - (uchar_t **)&ipp->ipp_rthdr, - &ipp->ipp_rthdrlen, 0); - if (reterr != 0) - return (reterr); - if (ipp->ipp_rthdrlen == 0) - ipp->ipp_fields &= ~IPPF_RTHDR; - else - ipp->ipp_fields |= IPPF_RTHDR; - reterr = tcp_build_hdrs(tcp); - if (reterr != 0) - return (reterr); - break; - } - case IPV6_V6ONLY: - if (!checkonly) { - tcp->tcp_connp->conn_ipv6_v6only = onoff; } break; - case IPV6_USE_MIN_MTU: - if (inlen != sizeof (int)) - return (EINVAL); - - if (*i1 < -1 || *i1 > 1) - return (EINVAL); - - if (checkonly) - break; - - ipp->ipp_fields |= IPPF_USE_MIN_MTU; - ipp->ipp_use_min_mtu = *i1; - break; case IPV6_SEC_OPT: /* * We should not allow policy setting after @@ -10759,30 +8390,18 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, */ if (tcp->tcp_state == TCPS_LISTEN) { return (EINVAL); - } else { - /* Handled at the IP level */ - return (-EINVAL); - } - case IPV6_SRC_PREFERENCES: - if (inlen != sizeof (uint32_t)) - return (EINVAL); - reterr = ip6_set_src_preferences(tcp->tcp_connp, - *(uint32_t *)invalp); - if (reterr != 0) { - *outlenp = 0; - return (reterr); } break; - default: - *outlenp = 0; - return (EINVAL); } break; - } /* end IPPROTO_IPV6 */ - default: + } + reterr = conn_opt_set(&coas, level, name, inlen, invalp, + checkonly, cr); + if (reterr != 0) { *outlenp = 0; - return (EINVAL); + return (reterr); } + /* * Common case of OK return with outval same as inval */ @@ -10791,6 +8410,45 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, (void) bcopy(invalp, outvalp, inlen); } *outlenp = inlen; + + if (coas.coa_changed & COA_HEADER_CHANGED) { + reterr = tcp_build_hdrs(tcp); + if (reterr != 0) + return (reterr); + } + if (coas.coa_changed & COA_ROUTE_CHANGED) { + in6_addr_t nexthop; + + /* + * If we are connected we re-cache the information. 
+ * We ignore errors to preserve BSD behavior. + * Note that we don't redo IPsec policy lookup here + * since the final destination (or source) didn't change. + */ + ip_attr_nexthop(&connp->conn_xmit_ipp, connp->conn_ixa, + &connp->conn_faddr_v6, &nexthop); + + if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) && + !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { + (void) ip_attr_connect(connp, connp->conn_ixa, + &connp->conn_laddr_v6, &connp->conn_faddr_v6, + &nexthop, connp->conn_fport, NULL, NULL, + IPDF_VERIFY_DST); + } + } + if ((coas.coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) { + connp->conn_wq->q_hiwat = connp->conn_sndbuf; + } + if (coas.coa_changed & COA_WROFF_CHANGED) { + connp->conn_wroff = connp->conn_ht_iphc_allocated + + tcps->tcps_wroff_xtra; + (void) proto_set_tx_wroff(connp->conn_rq, connp, + connp->conn_wroff); + } + if (coas.coa_changed & COA_OOBINLINE_CHANGED) { + if (IPCL_IS_NONSTR(connp)) + proto_set_rx_oob_opt(connp, onoff); + } return (0); } @@ -10798,12 +8456,12 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, int tcp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, - void *thisdg_attrs, cred_t *cr, mblk_t *mblk) + void *thisdg_attrs, cred_t *cr) { conn_t *connp = Q_TO_CONN(q); return (tcp_opt_set(connp, optset_context, level, name, inlen, invalp, - outlenp, outvalp, thisdg_attrs, cr, mblk)); + outlenp, outvalp, thisdg_attrs, cr)); } int @@ -10843,7 +8501,6 @@ tcp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, error = proto_opt_check(level, option_name, optlen, NULL, tcp_opt_obj.odb_opt_des_arr, tcp_opt_obj.odb_opt_arr_cnt, - tcp_opt_obj.odb_topmost_tpiprovider, B_TRUE, B_FALSE, cr); if (error != 0) { @@ -10856,292 +8513,75 @@ tcp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, error = tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name, optlen, 
(uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp, - NULL, cr, NULL); + NULL, cr); squeue_synch_exit(sqp, connp); - if (error < 0) { - /* - * Pass on to ip - */ - error = ip_set_options(connp, level, option_name, optvalp, - optlen, cr); - } + ASSERT(error >= 0); + return (error); } /* - * Update tcp_sticky_hdrs based on tcp_sticky_ipp. - * The headers include ip6i_t (if needed), ip6_t, any sticky extension + * Build/update the tcp header template (in conn_ht_iphc) based on + * conn_xmit_ipp. The headers include ip6_t, any extension * headers, and the maximum size tcp header (to avoid reallocation * on the fly for additional tcp options). + * + * Assumes the caller has already set conn_{faddr,laddr,fport,lport,flowinfo}. * Returns failure if can't allocate memory. */ static int tcp_build_hdrs(tcp_t *tcp) { - char *hdrs; - uint_t hdrs_len; - ip6i_t *ip6i; - char buf[TCP_MAX_HDR_LENGTH]; - ip6_pkt_t *ipp = &tcp->tcp_sticky_ipp; - in6_addr_t src, dst; tcp_stack_t *tcps = tcp->tcp_tcps; - conn_t *connp = tcp->tcp_connp; + conn_t *connp = tcp->tcp_connp; + tcpha_t *tcpha; + uint32_t cksum; + int error; - /* - * save the existing tcp header and source/dest IP addresses - */ - bcopy(tcp->tcp_tcph, buf, tcp->tcp_tcp_hdr_len); - src = tcp->tcp_ip6h->ip6_src; - dst = tcp->tcp_ip6h->ip6_dst; - hdrs_len = ip_total_hdrs_len_v6(ipp) + TCP_MAX_HDR_LENGTH; - ASSERT(hdrs_len != 0); - if (hdrs_len > tcp->tcp_iphc_len) { - /* Need to reallocate */ - hdrs = kmem_zalloc(hdrs_len, KM_NOSLEEP); - if (hdrs == NULL) - return (ENOMEM); - if (tcp->tcp_iphc != NULL) { - if (tcp->tcp_hdr_grown) { - kmem_free(tcp->tcp_iphc, tcp->tcp_iphc_len); - } else { - bzero(tcp->tcp_iphc, tcp->tcp_iphc_len); - kmem_cache_free(tcp_iphc_cache, tcp->tcp_iphc); - } - tcp->tcp_iphc_len = 0; - } - ASSERT(tcp->tcp_iphc_len == 0); - tcp->tcp_iphc = hdrs; - tcp->tcp_iphc_len = hdrs_len; - tcp->tcp_hdr_grown = B_TRUE; - } - ip_build_hdrs_v6((uchar_t *)tcp->tcp_iphc, - hdrs_len - TCP_MAX_HDR_LENGTH, ipp, 
IPPROTO_TCP); + /* Grab lock to satisfy ASSERT; TCP is serialized using squeue */ + mutex_enter(&connp->conn_lock); + error = conn_build_hdr_template(connp, TCP_MIN_HEADER_LENGTH, + TCP_MAX_TCP_OPTIONS_LENGTH, &connp->conn_laddr_v6, + &connp->conn_faddr_v6, connp->conn_flowinfo); + mutex_exit(&connp->conn_lock); + if (error != 0) + return (error); - /* Set header fields not in ipp */ - if (ipp->ipp_fields & IPPF_HAS_IP6I) { - ip6i = (ip6i_t *)tcp->tcp_iphc; - tcp->tcp_ip6h = (ip6_t *)&ip6i[1]; - } else { - tcp->tcp_ip6h = (ip6_t *)tcp->tcp_iphc; - } /* - * tcp->tcp_ip_hdr_len will include ip6i_t if there is one. - * - * tcp->tcp_tcp_hdr_len doesn't change here. + * Any routing header/option has been massaged. The checksum difference + * is stored in conn_sum for later use. */ - tcp->tcp_ip_hdr_len = hdrs_len - TCP_MAX_HDR_LENGTH; - tcp->tcp_tcph = (tcph_t *)(tcp->tcp_iphc + tcp->tcp_ip_hdr_len); - tcp->tcp_hdr_len = tcp->tcp_ip_hdr_len + tcp->tcp_tcp_hdr_len; + tcpha = (tcpha_t *)connp->conn_ht_ulp; + tcp->tcp_tcpha = tcpha; - bcopy(buf, tcp->tcp_tcph, tcp->tcp_tcp_hdr_len); - - tcp->tcp_ip6h->ip6_src = src; - tcp->tcp_ip6h->ip6_dst = dst; + tcpha->tha_lport = connp->conn_lport; + tcpha->tha_fport = connp->conn_fport; + tcpha->tha_sum = 0; + tcpha->tha_offset_and_reserved = (5 << 4); /* - * If the hop limit was not set by ip_build_hdrs_v6(), set it to - * the default value for TCP. - */ - if (!(ipp->ipp_fields & IPPF_UNICAST_HOPS)) - tcp->tcp_ip6h->ip6_hops = tcps->tcps_ipv6_hoplimit; - - /* - * If we're setting extension headers after a connection - * has been established, and if we have a routing header - * among the extension headers, call ip_massage_options_v6 to - * manipulate the routing header/ip6_dst set the checksum - * difference in the tcp header template. - * (This happens in tcp_connect_ipv6 if the routing header - * is set prior to the connect.) - * Set the tcp_sum to zero first in case we've cleared a - * routing header or don't have one at all. 
+ * IP wants our header length in the checksum field to + * allow it to perform a single pseudo-header+checksum + * calculation on behalf of TCP. + * Include the adjustment for a source route once IP_OPTIONS is set. */ - tcp->tcp_sum = 0; - if ((tcp->tcp_state >= TCPS_SYN_SENT) && - (tcp->tcp_ipp_fields & IPPF_RTHDR)) { - ip6_rthdr_t *rth = ip_find_rthdr_v6(tcp->tcp_ip6h, - (uint8_t *)tcp->tcp_tcph); - if (rth != NULL) { - tcp->tcp_sum = ip_massage_options_v6(tcp->tcp_ip6h, - rth, tcps->tcps_netstack); - tcp->tcp_sum = ntohs((tcp->tcp_sum & 0xFFFF) + - (tcp->tcp_sum >> 16)); - } - } - - /* Try to get everything in a single mblk */ - (void) proto_set_tx_wroff(tcp->tcp_rq, connp, - hdrs_len + tcps->tcps_wroff_xtra); - return (0); -} - -/* - * Transfer any source route option from ipha to buf/dst in reversed form. - */ -static int -tcp_opt_rev_src_route(ipha_t *ipha, char *buf, uchar_t *dst) -{ - ipoptp_t opts; - uchar_t *opt; - uint8_t optval; - uint8_t optlen; - uint32_t len = 0; - - for (optval = ipoptp_first(&opts, ipha); - optval != IPOPT_EOL; - optval = ipoptp_next(&opts)) { - opt = opts.ipoptp_cur; - optlen = opts.ipoptp_len; - switch (optval) { - int off1, off2; - case IPOPT_SSRR: - case IPOPT_LSRR: - - /* Reverse source route */ - /* - * First entry should be the next to last one in the - * current source route (the last entry is our - * address.) - * The last entry should be the final destination. - */ - buf[IPOPT_OPTVAL] = (uint8_t)optval; - buf[IPOPT_OLEN] = (uint8_t)optlen; - off1 = IPOPT_MINOFF_SR - 1; - off2 = opt[IPOPT_OFFSET] - IP_ADDR_LEN - 1; - if (off2 < 0) { - /* No entries in source route */ - break; - } - bcopy(opt + off2, dst, IP_ADDR_LEN); - /* - * Note: use src since ipha has not had its src - * and dst reversed (it is in the state it was - * received. 
- */ - bcopy(&ipha->ipha_src, buf + off2, - IP_ADDR_LEN); - off2 -= IP_ADDR_LEN; - - while (off2 > 0) { - bcopy(opt + off2, buf + off1, - IP_ADDR_LEN); - off1 += IP_ADDR_LEN; - off2 -= IP_ADDR_LEN; - } - buf[IPOPT_OFFSET] = IPOPT_MINOFF_SR; - buf += optlen; - len += optlen; - break; - } - } -done: - /* Pad the resulting options */ - while (len & 0x3) { - *buf++ = IPOPT_EOL; - len++; - } - return (len); -} - - -/* - * Extract and revert a source route from ipha (if any) - * and then update the relevant fields in both tcp_t and the standard header. - */ -static void -tcp_opt_reverse(tcp_t *tcp, ipha_t *ipha) -{ - char buf[TCP_MAX_HDR_LENGTH]; - uint_t tcph_len; - int len; - - ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION); - len = IPH_HDR_LENGTH(ipha); - if (len == IP_SIMPLE_HDR_LENGTH) - /* Nothing to do */ - return; - if (len > IP_SIMPLE_HDR_LENGTH + TCP_MAX_IP_OPTIONS_LENGTH || - (len & 0x3)) - return; - - tcph_len = tcp->tcp_tcp_hdr_len; - bcopy(tcp->tcp_tcph, buf, tcph_len); - tcp->tcp_sum = (tcp->tcp_ipha->ipha_dst >> 16) + - (tcp->tcp_ipha->ipha_dst & 0xffff); - len = tcp_opt_rev_src_route(ipha, (char *)tcp->tcp_ipha + - IP_SIMPLE_HDR_LENGTH, (uchar_t *)&tcp->tcp_ipha->ipha_dst); - len += IP_SIMPLE_HDR_LENGTH; - tcp->tcp_sum -= ((tcp->tcp_ipha->ipha_dst >> 16) + - (tcp->tcp_ipha->ipha_dst & 0xffff)); - if ((int)tcp->tcp_sum < 0) - tcp->tcp_sum--; - tcp->tcp_sum = (tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16); - tcp->tcp_sum = ntohs((tcp->tcp_sum & 0xFFFF) + (tcp->tcp_sum >> 16)); - tcp->tcp_tcph = (tcph_t *)((char *)tcp->tcp_ipha + len); - bcopy(buf, tcp->tcp_tcph, tcph_len); - tcp->tcp_ip_hdr_len = len; - tcp->tcp_ipha->ipha_version_and_hdr_length = - (IP_VERSION << 4) | (len >> 2); - len += tcph_len; - tcp->tcp_hdr_len = len; -} - -/* - * Copy the standard header into its new location, - * lay in the new options and then update the relevant - * fields in both tcp_t and the standard header. 
- */ -static int -tcp_opt_set_header(tcp_t *tcp, boolean_t checkonly, uchar_t *ptr, uint_t len) -{ - uint_t tcph_len; - uint8_t *ip_optp; - tcph_t *new_tcph; - tcp_stack_t *tcps = tcp->tcp_tcps; - conn_t *connp = tcp->tcp_connp; - - if ((len > TCP_MAX_IP_OPTIONS_LENGTH) || (len & 0x3)) - return (EINVAL); - - if (len > IP_MAX_OPT_LENGTH - tcp->tcp_label_len) - return (EINVAL); - - if (checkonly) { - /* - * do not really set, just pretend to - T_CHECK - */ - return (0); - } + cksum = sizeof (tcpha_t) + connp->conn_sum; + cksum = (cksum >> 16) + (cksum & 0xFFFF); + ASSERT(cksum < 0x10000); + tcpha->tha_sum = htons(cksum); - ip_optp = (uint8_t *)tcp->tcp_ipha + IP_SIMPLE_HDR_LENGTH; - if (tcp->tcp_label_len > 0) { - int padlen; - uint8_t opt; + if (connp->conn_ipversion == IPV4_VERSION) + tcp->tcp_ipha = (ipha_t *)connp->conn_ht_iphc; + else + tcp->tcp_ip6h = (ip6_t *)connp->conn_ht_iphc; - /* convert list termination to no-ops */ - padlen = tcp->tcp_label_len - ip_optp[IPOPT_OLEN]; - ip_optp += ip_optp[IPOPT_OLEN]; - opt = len > 0 ? IPOPT_NOP : IPOPT_EOL; - while (--padlen >= 0) - *ip_optp++ = opt; - } - tcph_len = tcp->tcp_tcp_hdr_len; - new_tcph = (tcph_t *)(ip_optp + len); - ovbcopy(tcp->tcp_tcph, new_tcph, tcph_len); - tcp->tcp_tcph = new_tcph; - bcopy(ptr, ip_optp, len); - - len += IP_SIMPLE_HDR_LENGTH + tcp->tcp_label_len; - - tcp->tcp_ip_hdr_len = len; - tcp->tcp_ipha->ipha_version_and_hdr_length = - (IP_VERSION << 4) | (len >> 2); - tcp->tcp_hdr_len = len + tcph_len; - if (!TCP_IS_DETACHED(tcp)) { - /* Always allocate room for all options. 
*/ - (void) proto_set_tx_wroff(tcp->tcp_rq, connp, - TCP_MAX_COMBINED_HEADER_LENGTH + tcps->tcps_wroff_xtra); + if (connp->conn_ht_iphc_allocated + tcps->tcps_wroff_xtra > + connp->conn_wroff) { + connp->conn_wroff = connp->conn_ht_iphc_allocated + + tcps->tcps_wroff_xtra; + (void) proto_set_tx_wroff(connp->conn_rq, connp, + connp->conn_wroff); } return (0); } @@ -11184,36 +8624,6 @@ tcp_param_register(IDP *ndp, tcpparam_t *tcppa, int cnt, tcp_stack_t *tcps) nd_free(ndp); return (B_FALSE); } - tcps->tcps_mdt_head_param = kmem_zalloc(sizeof (tcpparam_t), - KM_SLEEP); - bcopy(&lcl_tcp_mdt_head_param, tcps->tcps_mdt_head_param, - sizeof (tcpparam_t)); - if (!nd_load(ndp, tcps->tcps_mdt_head_param->tcp_param_name, - tcp_param_get, tcp_param_set_aligned, - (caddr_t)tcps->tcps_mdt_head_param)) { - nd_free(ndp); - return (B_FALSE); - } - tcps->tcps_mdt_tail_param = kmem_zalloc(sizeof (tcpparam_t), - KM_SLEEP); - bcopy(&lcl_tcp_mdt_tail_param, tcps->tcps_mdt_tail_param, - sizeof (tcpparam_t)); - if (!nd_load(ndp, tcps->tcps_mdt_tail_param->tcp_param_name, - tcp_param_get, tcp_param_set_aligned, - (caddr_t)tcps->tcps_mdt_tail_param)) { - nd_free(ndp); - return (B_FALSE); - } - tcps->tcps_mdt_max_pbufs_param = kmem_zalloc(sizeof (tcpparam_t), - KM_SLEEP); - bcopy(&lcl_tcp_mdt_max_pbufs_param, tcps->tcps_mdt_max_pbufs_param, - sizeof (tcpparam_t)); - if (!nd_load(ndp, tcps->tcps_mdt_max_pbufs_param->tcp_param_name, - tcp_param_get, tcp_param_set_aligned, - (caddr_t)tcps->tcps_mdt_max_pbufs_param)) { - nd_free(ndp); - return (B_FALSE); - } if (!nd_load(ndp, "tcp_extra_priv_ports", tcp_extra_priv_ports_get, NULL, NULL)) { nd_free(ndp); @@ -11248,7 +8658,7 @@ tcp_param_register(IDP *ndp, tcpparam_t *tcppa, int cnt, tcp_stack_t *tcps) return (B_TRUE); } -/* ndd set routine for tcp_wroff_xtra, tcp_mdt_hdr_{head,tail}_min. */ +/* ndd set routine for tcp_wroff_xtra. 
*/ /* ARGSUSED */ static int tcp_param_set_aligned(queue_t *q, mblk_t *mp, char *value, caddr_t cp, @@ -11307,6 +8717,7 @@ tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start) uint32_t u1; tcp_stack_t *tcps = tcp->tcp_tcps; + /* Walk through all the new pieces. */ do { ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= @@ -11433,9 +8844,10 @@ tcp_rwnd_reopen(tcp_t *tcp) { uint_t ret = 0; uint_t thwin; + conn_t *connp = tcp->tcp_connp; /* Learn the latest rwnd information that we sent to the other side. */ - thwin = ((uint_t)BE16_TO_U16(tcp->tcp_tcph->th_win)) + thwin = ((uint_t)ntohs(tcp->tcp_tcpha->tha_win)) << tcp->tcp_rcv_ws; /* This is peer's calculated send window (our receive window). */ thwin -= tcp->tcp_rnxt - tcp->tcp_rack; @@ -11444,7 +8856,7 @@ tcp_rwnd_reopen(tcp_t *tcp) * SWS avoidance. This means that we need to check the increase of * of receive window is at least 1 MSS. */ - if (tcp->tcp_recv_hiwater - thwin >= tcp->tcp_mss) { + if (connp->conn_rcvbuf - thwin >= tcp->tcp_mss) { /* * If the window that the other side knows is less than max * deferred acks segments, send an update immediately. @@ -11453,7 +8865,7 @@ tcp_rwnd_reopen(tcp_t *tcp) BUMP_MIB(&tcp->tcp_tcps->tcps_mib, tcpOutWinUpdate); ret = TH_ACK_NEEDED; } - tcp->tcp_rwnd = tcp->tcp_recv_hiwater; + tcp->tcp_rwnd = connp->conn_rcvbuf; } return (ret); } @@ -11469,7 +8881,7 @@ tcp_rcv_drain(tcp_t *tcp) #ifdef DEBUG uint_t cnt = 0; #endif - queue_t *q = tcp->tcp_rq; + queue_t *q = tcp->tcp_connp->conn_rq; /* Can't drain on an eager connection */ if (tcp->tcp_listener != NULL) @@ -11511,7 +8923,7 @@ tcp_rcv_drain(tcp_t *tcp) if ((tcp->tcp_kssl_ctx != NULL) && (DB_TYPE(mp) == M_DATA)) { DTRACE_PROBE1(kssl_mblk__ksslinput_rcvdrain, mblk_t *, mp); - tcp_kssl_input(tcp, mp); + tcp_kssl_input(tcp, mp, NULL); continue; } putnext(q, mp); @@ -11538,11 +8950,22 @@ tcp_rcv_drain(tcp_t *tcp) * Other messages are added as new (b_next) elements. 
*/ void -tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len) +tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len, cred_t *cr) { ASSERT(seg_len == msgdsize(mp)); ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_rcv_last_head != NULL); + if (is_system_labeled()) { + ASSERT(cr != NULL || msg_getcred(mp, NULL) != NULL); + /* + * Provide for protocols above TCP such as RPC. NOPID leaves + * db_cpid unchanged. + * The cred could have already been set. + */ + if (cr != NULL) + mblk_setcred(mp, cr, NOPID); + } + if (tcp->tcp_rcv_list == NULL) { ASSERT(tcp->tcp_rcv_last_head == NULL); tcp->tcp_rcv_list = mp; @@ -11562,176 +8985,6 @@ tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len) tcp->tcp_rwnd -= seg_len; } -/* - * DEFAULT TCP ENTRY POINT via squeue on READ side. - * - * This is the default entry function into TCP on the read side. TCP is - * always entered via squeue i.e. using squeue's for mutual exclusion. - * When classifier does a lookup to find the tcp, it also puts a reference - * on the conn structure associated so the tcp is guaranteed to exist - * when we come here. We still need to check the state because it might - * as well has been closed. The squeue processing function i.e. squeue_enter, - * is responsible for doing the CONN_DEC_REF. - * - * Apart from the default entry point, IP also sends packets directly to - * tcp_rput_data for AF_INET fast path and tcp_conn_request for incoming - * connections. - */ -boolean_t tcp_outbound_squeue_switch = B_FALSE; -void -tcp_input(void *arg, mblk_t *mp, void *arg2) -{ - conn_t *connp = (conn_t *)arg; - tcp_t *tcp = (tcp_t *)connp->conn_tcp; - - /* arg2 is the sqp */ - ASSERT(arg2 != NULL); - ASSERT(mp != NULL); - - /* - * Don't accept any input on a closed tcp as this TCP logically does - * not exist on the system. Don't proceed further with this TCP. - * For eg. this packet could trigger another close of this tcp - * which would be disastrous for tcp_refcnt. 
tcp_close_detached / - * tcp_clean_death / tcp_closei_local must be called at most once - * on a TCP. In this case we need to refeed the packet into the - * classifier and figure out where the packet should go. Need to - * preserve the recv_ill somehow. Until we figure that out, for - * now just drop the packet if we can't classify the packet. - */ - if (tcp->tcp_state == TCPS_CLOSED || - tcp->tcp_state == TCPS_BOUND) { - conn_t *new_connp; - ip_stack_t *ipst = tcp->tcp_tcps->tcps_netstack->netstack_ip; - - new_connp = ipcl_classify(mp, connp->conn_zoneid, ipst); - if (new_connp != NULL) { - tcp_reinput(new_connp, mp, arg2); - return; - } - /* We failed to classify. For now just drop the packet */ - freemsg(mp); - return; - } - - if (DB_TYPE(mp) != M_DATA) { - tcp_rput_common(tcp, mp); - return; - } - - if (mp->b_datap->db_struioflag & STRUIO_CONNECT) { - squeue_t *final_sqp; - - mp->b_datap->db_struioflag &= ~STRUIO_CONNECT; - final_sqp = (squeue_t *)DB_CKSUMSTART(mp); - DB_CKSUMSTART(mp) = 0; - if (tcp->tcp_state == TCPS_SYN_SENT && - connp->conn_final_sqp == NULL && - tcp_outbound_squeue_switch) { - ASSERT(connp->conn_initial_sqp == connp->conn_sqp); - connp->conn_final_sqp = final_sqp; - if (connp->conn_final_sqp != connp->conn_sqp) { - CONN_INC_REF(connp); - SQUEUE_SWITCH(connp, connp->conn_final_sqp); - SQUEUE_ENTER_ONE(connp->conn_sqp, mp, - tcp_rput_data, connp, ip_squeue_flag, - SQTAG_CONNECT_FINISH); - return; - } - } - } - tcp_rput_data(connp, mp, arg2); -} - -/* - * The read side put procedure. - * The packets passed up by ip are assume to be aligned according to - * OK_32PTR and the IP+TCP headers fitting in the first mblk. - */ -static void -tcp_rput_common(tcp_t *tcp, mblk_t *mp) -{ - /* - * tcp_rput_data() does not expect M_CTL except for the case - * where tcp_ipv6_recvancillary is set and we get a IN_PKTINFO - * type. 
Need to make sure that any other M_CTLs don't make - * it to tcp_rput_data since it is not expecting any and doesn't - * check for it. - */ - if (DB_TYPE(mp) == M_CTL) { - switch (*(uint32_t *)(mp->b_rptr)) { - case TCP_IOC_ABORT_CONN: - /* - * Handle connection abort request. - */ - tcp_ioctl_abort_handler(tcp, mp); - return; - case IPSEC_IN: - /* - * Only secure icmp arrive in TCP and they - * don't go through data path. - */ - tcp_icmp_error(tcp, mp); - return; - case IN_PKTINFO: - /* - * Handle IPV6_RECVPKTINFO socket option on AF_INET6 - * sockets that are receiving IPv4 traffic. tcp - */ - ASSERT(tcp->tcp_family == AF_INET6); - ASSERT(tcp->tcp_ipv6_recvancillary & - TCP_IPV6_RECVPKTINFO); - tcp_rput_data(tcp->tcp_connp, mp, - tcp->tcp_connp->conn_sqp); - return; - case MDT_IOC_INFO_UPDATE: - /* - * Handle Multidata information update; the - * following routine will free the message. - */ - if (tcp->tcp_connp->conn_mdt_ok) { - tcp_mdt_update(tcp, - &((ip_mdt_info_t *)mp->b_rptr)->mdt_capab, - B_FALSE); - } - freemsg(mp); - return; - case LSO_IOC_INFO_UPDATE: - /* - * Handle LSO information update; the following - * routine will free the message. - */ - if (tcp->tcp_connp->conn_lso_ok) { - tcp_lso_update(tcp, - &((ip_lso_info_t *)mp->b_rptr)->lso_capab); - } - freemsg(mp); - return; - default: - /* - * tcp_icmp_err() will process the M_CTL packets. - * Non-ICMP packets, if any, will be discarded in - * tcp_icmp_err(). We will process the ICMP packet - * even if we are TCP_IS_DETACHED_NONEAGER as the - * incoming ICMP packet may result in changing - * the tcp_mss, which we would need if we have - * packets to retransmit. - */ - tcp_icmp_error(tcp, mp); - return; - } - } - - /* No point processing the message if tcp is already closed */ - if (TCP_IS_DETACHED_NONEAGER(tcp)) { - freemsg(mp); - return; - } - - tcp_rput_other(tcp, mp); -} - - /* The minimum of smoothed mean deviation in RTO calculation. 
*/ #define TCP_SD_MIN 400 @@ -11885,12 +9138,12 @@ tcp_get_seg_mp(tcp_t *tcp, uint32_t seq, int32_t *off) * segments. A segment is eligible if sack_cnt for that segment is greater * than or equal tcp_dupack_fast_retransmit. After it has retransmitted * all eligible segments, it checks to see if TCP can send some new segments - * (fast recovery). If it can, set the appropriate flag for tcp_rput_data(). + * (fast recovery). If it can, set the appropriate flag for tcp_input_data(). * * Parameters: * tcp_t *tcp: the tcp structure of the connection. * uint_t *flags: in return, appropriate value will be set for - * tcp_rput_data(). + * tcp_input_data(). */ static void tcp_sack_rxmit(tcp_t *tcp, uint_t *flags) @@ -11988,7 +9241,7 @@ tcp_sack_rxmit(tcp_t *tcp, uint_t *flags) tcp->tcp_pipe += seg_len; tcp->tcp_sack_snxt = begin + seg_len; - tcp_send_data(tcp, tcp->tcp_wq, xmit_mp); + tcp_send_data(tcp, xmit_mp); /* * Update the send timestamp to avoid false retransmission. @@ -12012,96 +9265,8 @@ tcp_sack_rxmit(tcp_t *tcp, uint_t *flags) } /* - * This function handles policy checking at TCP level for non-hard_bound/ - * detached connections. - */ -static boolean_t -tcp_check_policy(tcp_t *tcp, mblk_t *first_mp, ipha_t *ipha, ip6_t *ip6h, - boolean_t secure, boolean_t mctl_present) -{ - ipsec_latch_t *ipl = NULL; - ipsec_action_t *act = NULL; - mblk_t *data_mp; - ipsec_in_t *ii; - const char *reason; - kstat_named_t *counter; - tcp_stack_t *tcps = tcp->tcp_tcps; - ipsec_stack_t *ipss; - ip_stack_t *ipst; - - ASSERT(mctl_present || !secure); - - ASSERT((ipha == NULL && ip6h != NULL) || - (ip6h == NULL && ipha != NULL)); - - /* - * We don't necessarily have an ipsec_in_act action to verify - * policy because of assymetrical policy where we have only - * outbound policy and no inbound policy (possible with global - * policy). 
- */ - if (!secure) { - if (act == NULL || act->ipa_act.ipa_type == IPSEC_ACT_BYPASS || - act->ipa_act.ipa_type == IPSEC_ACT_CLEAR) - return (B_TRUE); - ipsec_log_policy_failure(IPSEC_POLICY_MISMATCH, - "tcp_check_policy", ipha, ip6h, secure, - tcps->tcps_netstack); - ipss = tcps->tcps_netstack->netstack_ipsec; - - ip_drop_packet(first_mp, B_TRUE, NULL, NULL, - DROPPER(ipss, ipds_tcp_clear), - &tcps->tcps_dropper); - return (B_FALSE); - } - - /* - * We have a secure packet. - */ - if (act == NULL) { - ipsec_log_policy_failure(IPSEC_POLICY_NOT_NEEDED, - "tcp_check_policy", ipha, ip6h, secure, - tcps->tcps_netstack); - ipss = tcps->tcps_netstack->netstack_ipsec; - - ip_drop_packet(first_mp, B_TRUE, NULL, NULL, - DROPPER(ipss, ipds_tcp_secure), - &tcps->tcps_dropper); - return (B_FALSE); - } - - /* - * XXX This whole routine is currently incorrect. ipl should - * be set to the latch pointer, but is currently not set, so - * we initialize it to NULL to avoid picking up random garbage. - */ - if (ipl == NULL) - return (B_TRUE); - - data_mp = first_mp->b_cont; - - ii = (ipsec_in_t *)first_mp->b_rptr; - - ipst = tcps->tcps_netstack->netstack_ip; - - if (ipsec_check_ipsecin_latch(ii, data_mp, ipl, ipha, ip6h, &reason, - &counter, tcp->tcp_connp)) { - BUMP_MIB(&ipst->ips_ip_mib, ipsecInSucceeded); - return (B_TRUE); - } - (void) strlog(TCP_MOD_ID, 0, 0, SL_ERROR|SL_WARN|SL_CONSOLE, - "tcp inbound policy mismatch: %s, packet dropped\n", - reason); - BUMP_MIB(&ipst->ips_ip_mib, ipsecInFailed); - - ip_drop_packet(first_mp, B_TRUE, NULL, NULL, counter, - &tcps->tcps_dropper); - return (B_FALSE); -} - -/* - * tcp_ss_rexmit() is called in tcp_rput_data() to do slow start - * retransmission after a timeout. + * tcp_ss_rexmit() is called to do slow start retransmission after a timeout + * or ICMP errors. * * To limit the number of duplicate segments, we limit the number of segment * to be sent in one time to tcp_snd_burst, the burst variable. 
@@ -12150,7 +9315,7 @@ tcp_ss_rexmit(tcp_t *tcp) if (xmit_mp == NULL) return; - tcp_send_data(tcp, tcp->tcp_wq, xmit_mp); + tcp_send_data(tcp, xmit_mp); snxt += cnt; win -= cnt; @@ -12184,7 +9349,7 @@ tcp_ss_rexmit(tcp_t *tcp) /* * Process all TCP option in SYN segment. Note that this function should - * be called after tcp_adapt_ire() is called so that the necessary info + * be called after tcp_set_destination() is called so that the necessary info * from IRE is already set in the tcp structure. * * This function sets up the correct tcp_mss value according to the @@ -12194,16 +9359,17 @@ tcp_ss_rexmit(tcp_t *tcp) * should do the appropriate change. */ void -tcp_process_options(tcp_t *tcp, tcph_t *tcph) +tcp_process_options(tcp_t *tcp, tcpha_t *tcpha) { int options; tcp_opt_t tcpopt; uint32_t mss_max; char *tmp_tcph; tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; tcpopt.tcp = NULL; - options = tcp_parse_options(tcph, &tcpopt); + options = tcp_parse_options(tcpha, &tcpopt); /* * Process MSS option. Note that MSS option value does not account @@ -12212,12 +9378,12 @@ tcp_process_options(tcp_t *tcp, tcph_t *tcph) * IPv6. */ if (!(options & TCP_OPT_MSS_PRESENT)) { - if (tcp->tcp_ipversion == IPV4_VERSION) + if (connp->conn_ipversion == IPV4_VERSION) tcpopt.tcp_opt_mss = tcps->tcps_mss_def_ipv4; else tcpopt.tcp_opt_mss = tcps->tcps_mss_def_ipv6; } else { - if (tcp->tcp_ipversion == IPV4_VERSION) + if (connp->conn_ipversion == IPV4_VERSION) mss_max = tcps->tcps_mss_max_ipv4; else mss_max = tcps->tcps_mss_max_ipv6; @@ -12240,23 +9406,23 @@ tcp_process_options(tcp_t *tcp, tcph_t *tcph) /* Process Timestamp option. 
*/ if ((options & TCP_OPT_TSTAMP_PRESENT) && (tcp->tcp_snd_ts_ok || TCP_IS_DETACHED(tcp))) { - tmp_tcph = (char *)tcp->tcp_tcph; + tmp_tcph = (char *)tcp->tcp_tcpha; tcp->tcp_snd_ts_ok = B_TRUE; tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val; tcp->tcp_last_rcv_lbolt = lbolt64; ASSERT(OK_32PTR(tmp_tcph)); - ASSERT(tcp->tcp_tcp_hdr_len == TCP_MIN_HEADER_LENGTH); + ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH); /* Fill in our template header with basic timestamp option. */ - tmp_tcph += tcp->tcp_tcp_hdr_len; + tmp_tcph += connp->conn_ht_ulp_len; tmp_tcph[0] = TCPOPT_NOP; tmp_tcph[1] = TCPOPT_NOP; tmp_tcph[2] = TCPOPT_TSTAMP; tmp_tcph[3] = TCPOPT_TSTAMP_LEN; - tcp->tcp_hdr_len += TCPOPT_REAL_TS_LEN; - tcp->tcp_tcp_hdr_len += TCPOPT_REAL_TS_LEN; - tcp->tcp_tcph->th_offset_and_rsrvd[0] += (3 << 4); + connp->conn_ht_iphc_len += TCPOPT_REAL_TS_LEN; + connp->conn_ht_ulp_len += TCPOPT_REAL_TS_LEN; + tcp->tcp_tcpha->tha_offset_and_reserved += (3 << 4); } else { tcp->tcp_snd_ts_ok = B_FALSE; } @@ -12266,12 +9432,11 @@ tcp_process_options(tcp_t *tcp, tcph_t *tcph) * then allocate the SACK info structure. Note the following ways * when tcp_snd_sack_ok is set to true. * - * For active connection: in tcp_adapt_ire() called in - * tcp_rput_other(), or in tcp_rput_other() when tcp_sack_permitted - * is checked. + * For active connection: in tcp_set_destination() called in + * tcp_connect(). * - * For passive connection: in tcp_adapt_ire() called in - * tcp_accept_comm(). + * For passive connection: in tcp_set_destination() called in + * tcp_input_listener(). * * That's the reason why the extra TCP_IS_DETACHED() check is there. * That check makes sure that if we did not send a SACK OK option, @@ -12320,7 +9485,8 @@ tcp_process_options(tcp_t *tcp, tcph_t *tcph) * Now we know the exact TCP/IP header length, subtract * that from tcp_mss to get our side's MSS. 
*/ - tcp->tcp_mss -= tcp->tcp_hdr_len; + tcp->tcp_mss -= connp->conn_ht_iphc_len; + /* * Here we assume that the other side's header size will be equal to * our header size. We calculate the real MSS accordingly. Need to @@ -12328,22 +9494,29 @@ tcp_process_options(tcp_t *tcp, tcph_t *tcph) * * Real MSS = Opt.MSS - (our TCP/IP header - min TCP/IP header) */ - tcpopt.tcp_opt_mss -= tcp->tcp_hdr_len + tcp->tcp_ipsec_overhead - - ((tcp->tcp_ipversion == IPV4_VERSION ? + tcpopt.tcp_opt_mss -= connp->conn_ht_iphc_len + + tcp->tcp_ipsec_overhead - + ((connp->conn_ipversion == IPV4_VERSION ? IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) + TCP_MIN_HEADER_LENGTH); /* * Set MSS to the smaller one of both ends of the connection. * We should not have called tcp_mss_set() before, but our * side of the MSS should have been set to a proper value - * by tcp_adapt_ire(). tcp_mss_set() will also set up the + * by tcp_set_destination(). tcp_mss_set() will also set up the * STREAM head parameters properly. * * If we have a larger-than-16-bit window but the other side * didn't want to do window scale, tcp_rwnd_set() will take * care of that. */ - tcp_mss_set(tcp, MIN(tcpopt.tcp_opt_mss, tcp->tcp_mss), B_TRUE); + tcp_mss_set(tcp, MIN(tcpopt.tcp_opt_mss, tcp->tcp_mss)); + + /* + * Initialize tcp_cwnd value. After tcp_mss_set(), tcp_mss has been + * updated properly. + */ + SET_TCP_INIT_CWND(tcp, tcp->tcp_mss, tcps->tcps_slow_start_initial); } /* @@ -12410,7 +9583,7 @@ tcp_send_conn_ind(void *arg, mblk_t *mp, void *arg2) tcp_t *tail; /* - * The eager already has an extra ref put in tcp_rput_data + * The eager already has an extra ref put in tcp_input_data * so that it stays till accept comes back even though it * might get into TCPS_CLOSED as a result of a TH_RST etc. */ @@ -12496,8 +9669,8 @@ tcp_send_conn_ind(void *arg, mblk_t *mp, void *arg2) * remote host. This proves the IP addr is good. * Cache it! 
*/ - addr_cache[IP_ADDR_CACHE_HASH( - tcp->tcp_remote)] = tcp->tcp_remote; + addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] = + tcp->tcp_connp->conn_faddr_v4; } mutex_exit(&listener->tcp_eager_lock); if (need_send_conn_ind) @@ -12513,17 +9686,16 @@ tcp_ulp_newconn(conn_t *lconnp, conn_t *econnp, mblk_t *mp) { if (IPCL_IS_NONSTR(lconnp)) { cred_t *cr; - pid_t cpid; - - cr = msg_getcred(mp, &cpid); + pid_t cpid = NOPID; ASSERT(econnp->conn_tcp->tcp_listener == lconnp->conn_tcp); ASSERT(econnp->conn_tcp->tcp_saved_listener == lconnp->conn_tcp); + cr = msg_getcred(mp, &cpid); + /* Keep the message around in case of a fallback to TPI */ econnp->conn_tcp->tcp_conn.tcp_eager_conn_ind = mp; - /* * Notify the ULP about the newconn. It is guaranteed that no * tcp_accept() call will be made for the eager if the @@ -12545,177 +9717,83 @@ tcp_ulp_newconn(conn_t *lconnp, conn_t *econnp, mblk_t *mp) econnp->conn_tcp->tcp_conn_req_seqnum); } } else { - putnext(lconnp->conn_tcp->tcp_rq, mp); + putnext(lconnp->conn_rq, mp); } } -mblk_t * -tcp_find_pktinfo(tcp_t *tcp, mblk_t *mp, uint_t *ipversp, uint_t *ip_hdr_lenp, - uint_t *ifindexp, ip6_pkt_t *ippp) +/* + * Handle a packet that has been reclassified by TCP. + * This function drops the ref on connp that the caller had. 
+ */ +static void +tcp_reinput(conn_t *connp, mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst) { - ip_pktinfo_t *pinfo; - ip6_t *ip6h; - uchar_t *rptr; - mblk_t *first_mp = mp; - boolean_t mctl_present = B_FALSE; - uint_t ifindex = 0; - ip6_pkt_t ipp; - uint_t ipvers; - uint_t ip_hdr_len; - tcp_stack_t *tcps = tcp->tcp_tcps; + ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; - rptr = mp->b_rptr; - ASSERT(OK_32PTR(rptr)); - ASSERT(tcp != NULL); - ipp.ipp_fields = 0; + if (connp->conn_incoming_ifindex != 0 && + connp->conn_incoming_ifindex != ira->ira_ruifindex) { + freemsg(mp); + CONN_DEC_REF(connp); + return; + } - switch DB_TYPE(mp) { - case M_CTL: - mp = mp->b_cont; - if (mp == NULL) { - freemsg(first_mp); - return (NULL); + if (CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss) || + (ira->ira_flags & IRAF_IPSEC_SECURE)) { + ip6_t *ip6h; + ipha_t *ipha; + + if (ira->ira_flags & IRAF_IS_IPV4) { + ipha = (ipha_t *)mp->b_rptr; + ip6h = NULL; + } else { + ipha = NULL; + ip6h = (ip6_t *)mp->b_rptr; } - if (DB_TYPE(mp) != M_DATA) { - freemsg(first_mp); - return (NULL); + mp = ipsec_check_inbound_policy(mp, connp, ipha, ip6h, ira); + if (mp == NULL) { + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards); + /* Note that mp is NULL */ + ip_drop_input("ipIfStatsInDiscards", mp, NULL); + CONN_DEC_REF(connp); + return; } - mctl_present = B_TRUE; - break; - case M_DATA: - break; - default: - cmn_err(CE_NOTE, "tcp_find_pktinfo: unknown db_type"); - freemsg(mp); - return (NULL); } - ipvers = IPH_HDR_VERSION(rptr); - if (ipvers == IPV4_VERSION) { - if (tcp == NULL) { - ip_hdr_len = IPH_HDR_LENGTH(rptr); - goto done; - } - - ipp.ipp_fields |= IPPF_HOPLIMIT; - ipp.ipp_hoplimit = ((ipha_t *)rptr)->ipha_ttl; + if (IPCL_IS_TCP(connp)) { /* - * If we have IN_PKTINFO in an M_CTL and tcp_ipv6_recvancillary - * has TCP_IPV6_RECVPKTINFO set, pass I/F index along in ipp. 
+ * do not drain, certain use cases can blow + * the stack */ - if ((tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) && - mctl_present) { - pinfo = (ip_pktinfo_t *)first_mp->b_rptr; - if ((MBLKL(first_mp) == sizeof (ip_pktinfo_t)) && - (pinfo->ip_pkt_ulp_type == IN_PKTINFO) && - (pinfo->ip_pkt_flags & IPF_RECVIF)) { - ipp.ipp_fields |= IPPF_IFINDEX; - ipp.ipp_ifindex = pinfo->ip_pkt_ifindex; - ifindex = pinfo->ip_pkt_ifindex; - } - freeb(first_mp); - mctl_present = B_FALSE; - } - ip_hdr_len = IPH_HDR_LENGTH(rptr); + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, + connp->conn_recv, connp, ira, + SQ_NODRAIN, SQTAG_IP_TCP_INPUT); } else { - ip6h = (ip6_t *)rptr; - - ASSERT(ipvers == IPV6_VERSION); - ipp.ipp_fields = IPPF_HOPLIMIT | IPPF_TCLASS; - ipp.ipp_tclass = (ip6h->ip6_flow & 0x0FF00000) >> 20; - ipp.ipp_hoplimit = ip6h->ip6_hops; - - if (ip6h->ip6_nxt != IPPROTO_TCP) { - uint8_t nexthdrp; - ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; - - /* Look for ifindex information */ - if (ip6h->ip6_nxt == IPPROTO_RAW) { - ip6i_t *ip6i = (ip6i_t *)ip6h; - if ((uchar_t *)&ip6i[1] > mp->b_wptr) { - BUMP_MIB(&ipst->ips_ip_mib, tcpInErrs); - freemsg(first_mp); - return (NULL); - } - - if (ip6i->ip6i_flags & IP6I_IFINDEX) { - ASSERT(ip6i->ip6i_ifindex != 0); - ipp.ipp_fields |= IPPF_IFINDEX; - ipp.ipp_ifindex = ip6i->ip6i_ifindex; - ifindex = ip6i->ip6i_ifindex; - } - rptr = (uchar_t *)&ip6i[1]; - mp->b_rptr = rptr; - if (rptr == mp->b_wptr) { - mblk_t *mp1; - mp1 = mp->b_cont; - freeb(mp); - mp = mp1; - rptr = mp->b_rptr; - } - if (MBLKL(mp) < IPV6_HDR_LEN + - sizeof (tcph_t)) { - BUMP_MIB(&ipst->ips_ip_mib, tcpInErrs); - freemsg(first_mp); - return (NULL); - } - ip6h = (ip6_t *)rptr; - } - - /* - * Find any potentially interesting extension headers - * as well as the length of the IPv6 + extension - * headers. 
- */ - ip_hdr_len = ip_find_hdr_v6(mp, ip6h, &ipp, &nexthdrp); - /* Verify if this is a TCP packet */ - if (nexthdrp != IPPROTO_TCP) { - BUMP_MIB(&ipst->ips_ip_mib, tcpInErrs); - freemsg(first_mp); - return (NULL); - } - } else { - ip_hdr_len = IPV6_HDR_LEN; - } + /* Not TCP; must be SOCK_RAW, IPPROTO_TCP */ + (connp->conn_recv)(connp, mp, NULL, + ira); + CONN_DEC_REF(connp); } -done: - if (ipversp != NULL) - *ipversp = ipvers; - if (ip_hdr_lenp != NULL) - *ip_hdr_lenp = ip_hdr_len; - if (ippp != NULL) - *ippp = ipp; - if (ifindexp != NULL) - *ifindexp = ifindex; - if (mctl_present) { - freeb(first_mp); - } - return (mp); } +boolean_t tcp_outbound_squeue_switch = B_FALSE; + /* * Handle M_DATA messages from IP. Its called directly from IP via - * squeue for AF_INET type sockets fast path. No M_CTL are expected - * in this path. - * - * For everything else (including AF_INET6 sockets with 'tcp_ipversion' - * v4 and v6), we are called through tcp_input() and a M_CTL can - * be present for options but tcp_find_pktinfo() deals with it. We - * only expect M_DATA packets after tcp_find_pktinfo() is done. + * squeue for received IP packets. * * The first argument is always the connp/tcp to which the mp belongs. * There are no exceptions to this rule. The caller has already put - * a reference on this connp/tcp and once tcp_rput_data() returns, + * a reference on this connp/tcp and once tcp_input_data() returns, * the squeue will do the refrele. * - * The TH_SYN for the listener directly go to tcp_conn_request via - * squeue. + * The TH_SYN for the listener directly go to tcp_input_listener via + * squeue. ICMP errors go directly to tcp_icmp_input(). 
* * sqp: NULL = recursive, sqp != NULL means called from squeue */ void -tcp_rput_data(void *arg, mblk_t *mp, void *arg2) +tcp_input_data(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) { int32_t bytes_acked; int32_t gap; @@ -12729,11 +9807,10 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) int seg_len; uint_t ip_hdr_len; uint32_t seg_seq; - tcph_t *tcph; + tcpha_t *tcpha; int urp; tcp_opt_t tcpopt; - uint_t ipvers; - ip6_pkt_t ipp; + ip_pkt_t ipp; boolean_t ofo_seg = B_FALSE; /* Out of order segment */ uint32_t cwnd; uint32_t add; @@ -12756,33 +9833,43 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) rptr = mp->b_rptr; ASSERT(OK_32PTR(rptr)); - /* - * An AF_INET socket is not capable of receiving any pktinfo. Do inline - * processing here. For rest call tcp_find_pktinfo to fill up the - * necessary information. - */ - if (IPCL_IS_TCP4(connp)) { - ipvers = IPV4_VERSION; - ip_hdr_len = IPH_HDR_LENGTH(rptr); - } else { - mp = tcp_find_pktinfo(tcp, mp, &ipvers, &ip_hdr_len, - NULL, &ipp); - if (mp == NULL) { - TCP_STAT(tcps, tcp_rput_v6_error); - return; + ip_hdr_len = ira->ira_ip_hdr_length; + if (connp->conn_recv_ancillary.crb_all != 0) { + /* + * Record packet information in the ip_pkt_t + */ + ipp.ipp_fields = 0; + if (ira->ira_flags & IRAF_IS_IPV4) { + (void) ip_find_hdr_v4((ipha_t *)rptr, &ipp, + B_FALSE); + } else { + uint8_t nexthdrp; + + /* + * IPv6 packets can only be received by applications + * that are prepared to receive IPv6 addresses. + * The IP fanout must ensure this. + */ + ASSERT(connp->conn_family == AF_INET6); + + (void) ip_find_hdr_v6(mp, (ip6_t *)rptr, B_TRUE, &ipp, + &nexthdrp); + ASSERT(nexthdrp == IPPROTO_TCP); + + /* Could have caused a pullup? 
*/ + iphdr = mp->b_rptr; + rptr = mp->b_rptr; } - iphdr = mp->b_rptr; - rptr = mp->b_rptr; } ASSERT(DB_TYPE(mp) == M_DATA); ASSERT(mp->b_next == NULL); - tcph = (tcph_t *)&rptr[ip_hdr_len]; - seg_seq = ABE32_TO_U32(tcph->th_seq); - seg_ack = ABE32_TO_U32(tcph->th_ack); + tcpha = (tcpha_t *)&rptr[ip_hdr_len]; + seg_seq = ntohl(tcpha->tha_seq); + seg_ack = ntohl(tcpha->tha_ack); ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX); seg_len = (int)(mp->b_wptr - rptr) - - (ip_hdr_len + TCP_HDR_LENGTH(tcph)); + (ip_hdr_len + TCP_HDR_LENGTH(tcpha)); if ((mp1 = mp->b_cont) != NULL && mp1->b_datap->db_type == M_DATA) { do { ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <= @@ -12794,7 +9881,7 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) if (tcp->tcp_state == TCPS_TIME_WAIT) { tcp_time_wait_processing(tcp, mp, seg_seq, seg_ack, - seg_len, tcph); + seg_len, tcpha, ira); return; } @@ -12809,7 +9896,7 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) tcp->tcp_last_recv_time = lbolt; } - flags = (unsigned int)tcph->th_flags[0] & 0xFF; + flags = (unsigned int)tcpha->tha_flags & 0xFF; BUMP_LOCAL(tcp->tcp_ibsegs); DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp); @@ -12840,7 +9927,7 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) } /* Update pointers into message */ iphdr = rptr = mp->b_rptr; - tcph = (tcph_t *)&rptr[ip_hdr_len]; + tcpha = (tcpha_t *)&rptr[ip_hdr_len]; if (SEQ_GT(seg_seq, tcp->tcp_rnxt)) { /* * Since we can't handle any data with this urgent @@ -12849,13 +9936,29 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) * the urgent mark and generate the M_PCSIG, * which we can do. 
*/ - mp->b_wptr = (uchar_t *)tcph + TCP_HDR_LENGTH(tcph); + mp->b_wptr = (uchar_t *)tcpha + TCP_HDR_LENGTH(tcpha); seg_len = 0; } } switch (tcp->tcp_state) { case TCPS_SYN_SENT: + if (connp->conn_final_sqp == NULL && + tcp_outbound_squeue_switch && sqp != NULL) { + ASSERT(connp->conn_initial_sqp == connp->conn_sqp); + connp->conn_final_sqp = sqp; + if (connp->conn_final_sqp != connp->conn_sqp) { + DTRACE_PROBE1(conn__final__sqp__switch, + conn_t *, connp); + CONN_INC_REF(connp); + SQUEUE_SWITCH(connp, connp->conn_final_sqp); + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, + tcp_input_data, connp, ira, ip_squeue_flag, + SQTAG_CONNECT_FINISH); + return; + } + DTRACE_PROBE1(conn__final__sqp__same, conn_t *, connp); + } if (flags & TH_ACK) { /* * Note that our stack cannot send data before a @@ -12887,13 +9990,13 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) } /* Process all TCP options. */ - tcp_process_options(tcp, tcph); + tcp_process_options(tcp, tcpha); /* * The following changes our rwnd to be a multiple of the * MIN(peer MSS, our MSS) for performance reason. */ - (void) tcp_rwnd_set(tcp, - MSS_ROUNDUP(tcp->tcp_recv_hiwater, tcp->tcp_mss)); + (void) tcp_rwnd_set(tcp, MSS_ROUNDUP(connp->conn_rcvbuf, + tcp->tcp_mss)); /* Is the other end ECN capable? */ if (tcp->tcp_ecn_ok) { @@ -12910,21 +10013,17 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) tcp->tcp_irs = seg_seq; tcp->tcp_rack = seg_seq; tcp->tcp_rnxt = seg_seq + 1; - U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack); + tcp->tcp_tcpha->tha_ack = htonl(tcp->tcp_rnxt); if (!TCP_IS_DETACHED(tcp)) { /* Allocate room for SACK options if needed. */ - if (tcp->tcp_snd_sack_ok) { - (void) proto_set_tx_wroff(tcp->tcp_rq, connp, - tcp->tcp_hdr_len + - TCPOPT_MAX_SACK_LEN + - (tcp->tcp_loopback ? 0 : - tcps->tcps_wroff_xtra)); - } else { - (void) proto_set_tx_wroff(tcp->tcp_rq, connp, - tcp->tcp_hdr_len + - (tcp->tcp_loopback ? 
0 : - tcps->tcps_wroff_xtra)); - } + connp->conn_wroff = connp->conn_ht_iphc_len; + if (tcp->tcp_snd_sack_ok) + connp->conn_wroff += TCPOPT_MAX_SACK_LEN; + if (!tcp->tcp_loopback) + connp->conn_wroff += tcps->tcps_wroff_xtra; + + (void) proto_set_tx_wroff(connp->conn_rq, connp, + connp->conn_wroff); } if (flags & TH_ACK) { /* @@ -12944,15 +10043,14 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) * sending up connection confirmation */ tcp->tcp_state = TCPS_ESTABLISHED; - if (!tcp_conn_con(tcp, iphdr, tcph, mp, - tcp->tcp_loopback ? &mp1 : NULL)) { + if (!tcp_conn_con(tcp, iphdr, mp, + tcp->tcp_loopback ? &mp1 : NULL, ira)) { tcp->tcp_state = TCPS_SYN_SENT; freemsg(mp); return; } /* SYN was acked - making progress */ - if (tcp->tcp_ipversion == IPV6_VERSION) - tcp->tcp_ip_forward_progress = B_TRUE; + tcp->tcp_ip_forward_progress = B_TRUE; /* One for the SYN */ tcp->tcp_suna = tcp->tcp_iss + 1; @@ -12983,7 +10081,7 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) tcp->tcp_swl1 = seg_seq; tcp->tcp_swl2 = seg_ack; - new_swnd = BE16_TO_U16(tcph->th_win); + new_swnd = ntohs(tcpha->tha_win); tcp->tcp_swnd = new_swnd; if (new_swnd > tcp->tcp_max_swnd) tcp->tcp_max_swnd = new_swnd; @@ -13022,22 +10120,25 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) tcp->tcp_ack_tid); tcp->tcp_ack_tid = 0; } - tcp_send_data(tcp, tcp->tcp_wq, ack_mp); + tcp_send_data(tcp, ack_mp); BUMP_LOCAL(tcp->tcp_obsegs); BUMP_MIB(&tcps->tcps_mib, tcpOutAck); if (!IPCL_IS_NONSTR(connp)) { /* Send up T_CONN_CON */ - putnext(tcp->tcp_rq, mp1); + if (ira->ira_cred != NULL) { + mblk_setcred(mp1, + ira->ira_cred, + ira->ira_cpid); + } + putnext(connp->conn_rq, mp1); } else { - cred_t *cr; - pid_t cpid; - - cr = msg_getcred(mp1, &cpid); (*connp->conn_upcalls-> su_connected) (connp->conn_upper_handle, - tcp->tcp_connid, cr, cpid); + tcp->tcp_connid, + ira->ira_cred, + ira->ira_cpid); freemsg(mp1); } @@ -13054,15 +10155,16 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) TCP_STAT(tcps, 
tcp_fusion_unfusable); tcp->tcp_unfusable = B_TRUE; if (!IPCL_IS_NONSTR(connp)) { - putnext(tcp->tcp_rq, mp1); + if (ira->ira_cred != NULL) { + mblk_setcred(mp1, ira->ira_cred, + ira->ira_cpid); + } + putnext(connp->conn_rq, mp1); } else { - cred_t *cr; - pid_t cpid; - - cr = msg_getcred(mp1, &cpid); (*connp->conn_upcalls->su_connected) (connp->conn_upper_handle, - tcp->tcp_connid, cr, cpid); + tcp->tcp_connid, ira->ira_cred, + ira->ira_cpid); freemsg(mp1); } } @@ -13089,13 +10191,8 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) tcp->tcp_state = TCPS_SYN_RCVD; mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, tcp->tcp_mss, NULL, NULL, tcp->tcp_iss, B_FALSE, NULL, B_FALSE); - if (mp1) { - /* - * See comment in tcp_conn_request() for why we use - * the open() time pid here. - */ - DB_CPID(mp1) = tcp->tcp_cpid; - tcp_send_data(tcp, tcp->tcp_wq, mp1); + if (mp1 != NULL) { + tcp_send_data(tcp, mp1); TCP_TIMER_RESTART(tcp, tcp->tcp_rto); } freemsg(mp); @@ -13146,9 +10243,20 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) conn_t *new_connp; ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; - new_connp = ipcl_classify(mp, connp->conn_zoneid, ipst); + /* + * Don't accept any input on a closed tcp as this TCP logically + * does not exist on the system. Don't proceed further with + * this TCP. For instance, this packet could trigger another + * close of this tcp which would be disastrous for tcp_refcnt. + * tcp_close_detached / tcp_clean_death / tcp_closei_local must + * be called at most once on a TCP. In this case we need to + * refeed the packet into the classifier and figure out where + * the packet should go. + */ + new_connp = ipcl_classify(mp, ira, ipst); if (new_connp != NULL) { - tcp_reinput(new_connp, mp, connp->conn_sqp); + /* Drops ref on new_connp */ + tcp_reinput(new_connp, mp, ira, ipst); return; } /* We failed to classify. 
For now just drop the packet */ @@ -13194,7 +10302,7 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) tcp->tcp_kssl_ctx = NULL; tcp->tcp_rnxt += seg_len; - U32_TO_ABE32(tcp->tcp_rnxt, tcp->tcp_tcph->th_ack); + tcp->tcp_tcpha->tha_ack = htonl(tcp->tcp_rnxt); flags |= TH_ACK_NEEDED; goto ack_check; } @@ -13205,13 +10313,13 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) return; } - mp->b_rptr = (uchar_t *)tcph + TCP_HDR_LENGTH(tcph); - urp = BE16_TO_U16(tcph->th_urp) - TCP_OLD_URP_INTERPRETATION; - new_swnd = BE16_TO_U16(tcph->th_win) << - ((tcph->th_flags[0] & TH_SYN) ? 0 : tcp->tcp_snd_ws); + mp->b_rptr = (uchar_t *)tcpha + TCP_HDR_LENGTH(tcpha); + urp = ntohs(tcpha->tha_urp) - TCP_OLD_URP_INTERPRETATION; + new_swnd = ntohs(tcpha->tha_win) << + ((tcpha->tha_flags & TH_SYN) ? 0 : tcp->tcp_snd_ws); if (tcp->tcp_snd_ts_ok) { - if (!tcp_paws_check(tcp, tcph, &tcpopt)) { + if (!tcp_paws_check(tcp, tcpha, &tcpopt)) { /* * This segment is not acceptable. * Drop it and send back an ACK. @@ -13227,7 +10335,7 @@ tcp_rput_data(void *arg, mblk_t *mp, void *arg2) * SACK info in already updated in tcp_parse_options. Ignore * all other TCP options... */ - (void) tcp_parse_options(tcph, &tcpopt); + (void) tcp_parse_options(tcpha, &tcpopt); } try_again:; mss = tcp->tcp_mss; @@ -13289,7 +10397,7 @@ try_again:; * Adjust seg_len to the original value for tracing. */ seg_len -= gap; - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, "tcp_rput: unacceptable, gap %d, rgap %d, " "flags 0x%x, seg_seq %u, seg_ack %u, " @@ -13436,7 +10544,7 @@ try_again:; return; } if (!TCP_IS_DETACHED(tcp) && - !putnextctl1(tcp->tcp_rq, + !putnextctl1(connp->conn_rq, M_PCSIG, SIGURG)) { /* Try again on the rexmit. */ freemsg(mp1); @@ -13505,7 +10613,7 @@ ok:; * same segment. In this case, we once again turn * on ECN_ECHO. 
*/ - if (tcp->tcp_ipversion == IPV4_VERSION) { + if (connp->conn_ipversion == IPV4_VERSION) { uchar_t tos = ((ipha_t *)rptr)->ipha_type_of_service; if ((tos & IPH_ECN_CE) == IPH_ECN_CE) { @@ -13705,7 +10813,7 @@ ok:; return; } if (!TCP_IS_DETACHED(tcp) && - !putnextctl1(tcp->tcp_rq, M_PCSIG, + !putnextctl1(connp->conn_rq, M_PCSIG, SIGURG)) { /* Try again on the rexmit. */ freemsg(mp1); @@ -13739,7 +10847,7 @@ ok:; } else if (tcp->tcp_urp_mark_mp != NULL) { /* * An allocation failure prevented the previous - * tcp_rput_data from sending up the allocated + * tcp_input_data from sending up the allocated * MSG*MARKNEXT message - send it up this time * around. */ @@ -13775,14 +10883,14 @@ ok:; */ (void) adjmsg(mp, urp - seg_len); - tcp_rput_data(connp, - mp, NULL); + tcp_input_data(connp, + mp, NULL, ira); return; } (void) adjmsg(mp1, urp - seg_len); /* Feed this piece back in. */ tmp_rnxt = tcp->tcp_rnxt; - tcp_rput_data(connp, mp1, NULL); + tcp_input_data(connp, mp1, NULL, ira); /* * If the data passed back in was not * processed (ie: bad ACK) sending @@ -13811,13 +10919,13 @@ ok:; */ (void) adjmsg(mp, urp + 1 - seg_len); - tcp_rput_data(connp, - mp, NULL); + tcp_input_data(connp, + mp, NULL, ira); return; } (void) adjmsg(mp1, urp + 1 - seg_len); tmp_rnxt = tcp->tcp_rnxt; - tcp_rput_data(connp, mp1, NULL); + tcp_input_data(connp, mp1, NULL, ira); /* * If the data passed back in was not * processed (ie: bad ACK) sending @@ -13831,7 +10939,7 @@ ok:; return; } } - tcp_rput_data(connp, mp, NULL); + tcp_input_data(connp, mp, NULL, ira); return; } /* @@ -13960,7 +11068,7 @@ process_ack: } bytes_acked = (int)(seg_ack - tcp->tcp_suna); - if (tcp->tcp_ipversion == IPV6_VERSION && bytes_acked > 0) + if (bytes_acked > 0) tcp->tcp_ip_forward_progress = B_TRUE; if (tcp->tcp_state == TCPS_SYN_RCVD) { if ((tcp->tcp_conn.tcp_eager_conn_ind != NULL) && @@ -13983,7 +11091,7 @@ process_ack: /* * The listener also exists because of the refhold - * done in tcp_conn_request. 
Its possible that it + * done in tcp_input_listener. Its possible that it * might have closed. We will check that once we * get inside listeners context. */ @@ -14005,12 +11113,12 @@ process_ack: } else if (!tcp->tcp_loopback) { SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp, mp, tcp_send_conn_ind, - listener->tcp_connp, SQ_FILL, + listener->tcp_connp, NULL, SQ_FILL, SQTAG_TCP_CONN_IND); } else { SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp, mp, tcp_send_conn_ind, - listener->tcp_connp, SQ_PROCESS, + listener->tcp_connp, NULL, SQ_PROCESS, SQTAG_TCP_CONN_IND); } } @@ -14026,7 +11134,7 @@ process_ack: */ tcp->tcp_state = TCPS_ESTABLISHED; if (tcp->tcp_active_open) { - if (!tcp_conn_con(tcp, iphdr, tcph, mp, NULL)) { + if (!tcp_conn_con(tcp, iphdr, mp, NULL, ira)) { freemsg(mp); tcp->tcp_state = TCPS_SYN_RCVD; return; @@ -14044,8 +11152,7 @@ process_ack: tcp->tcp_suna = tcp->tcp_iss + 1; /* One for the SYN */ bytes_acked--; /* SYN was acked - making progress */ - if (tcp->tcp_ipversion == IPV6_VERSION) - tcp->tcp_ip_forward_progress = B_TRUE; + tcp->tcp_ip_forward_progress = B_TRUE; /* * If SYN was retransmitted, need to reset all @@ -14083,7 +11190,7 @@ process_ack: /* Fuse when both sides are in ESTABLISHED state */ if (tcp->tcp_loopback && do_tcp_fusion) - tcp_fuse(tcp, iphdr, tcph); + tcp_fuse(tcp, iphdr, tcpha); } /* This code follows 4.4BSD-Lite2 mostly. */ @@ -14388,7 +11495,7 @@ process_ack: if (mp != NULL) { BUMP_LOCAL(tcp->tcp_obsegs); BUMP_MIB(&tcps->tcps_mib, tcpOutAck); - tcp_send_data(tcp, tcp->tcp_wq, mp); + tcp_send_data(tcp, mp); } return; } @@ -14487,7 +11594,6 @@ process_ack: } } else { tcp->tcp_rexmit = B_FALSE; - tcp->tcp_xmit_zc_clean = B_FALSE; tcp->tcp_rexmit_nxt = tcp->tcp_snxt; tcp->tcp_snd_burst = tcp->tcp_localnet ? 
TCP_CWND_INFINITE : TCP_CWND_NORMAL; @@ -14662,8 +11768,7 @@ fin_acked: tcp->tcp_xmit_tail = NULL; if (tcp->tcp_fin_sent) { /* FIN was acked - making progress */ - if (tcp->tcp_ipversion == IPV6_VERSION && - !tcp->tcp_fin_acked) + if (!tcp->tcp_fin_acked) tcp->tcp_ip_forward_progress = B_TRUE; tcp->tcp_fin_acked = B_TRUE; if (tcp->tcp_linger_tid != 0 && @@ -14781,7 +11886,7 @@ est: * bit so this TIME-WAIT connection won't * interfere with new ones. */ - tcp->tcp_exclbind = 0; + connp->conn_exclbind = 0; if (!TCP_IS_DETACHED(tcp)) { TCP_TIMER_RESTART(tcp, tcps->tcps_time_wait_interval); @@ -14805,8 +11910,8 @@ est: if (!tcp->tcp_fin_rcvd) { tcp->tcp_fin_rcvd = B_TRUE; tcp->tcp_rnxt++; - tcph = tcp->tcp_tcph; - U32_TO_ABE32(tcp->tcp_rnxt, tcph->th_ack); + tcpha = tcp->tcp_tcpha; + tcpha->tha_ack = htonl(tcp->tcp_rnxt); /* * Generate the ordrel_ind at the end unless we @@ -14815,7 +11920,7 @@ est: * after tcp_accept is done. */ if (tcp->tcp_listener == NULL && - !TCP_IS_DETACHED(tcp) && (!tcp->tcp_hard_binding)) + !TCP_IS_DETACHED(tcp) && !tcp->tcp_hard_binding) flags |= TH_ORDREL_NEEDED; switch (tcp->tcp_state) { case TCPS_SYN_RCVD: @@ -14836,7 +11941,7 @@ est: * bit so this TIME-WAIT connection won't * interfere with new ones. */ - tcp->tcp_exclbind = 0; + connp->conn_exclbind = 0; if (!TCP_IS_DETACHED(tcp)) { TCP_TIMER_RESTART(tcp, tcps->tcps_time_wait_interval); @@ -14872,7 +11977,7 @@ est: freeb(mp1); } update_ack: - tcph = tcp->tcp_tcph; + tcpha = tcp->tcp_tcpha; tcp->tcp_rack_cnt++; { uint32_t cur_max; @@ -14915,7 +12020,7 @@ update_ack: } } tcp->tcp_rnxt += seg_len; - U32_TO_ABE32(tcp->tcp_rnxt, tcph->th_ack); + tcpha->tha_ack = htonl(tcp->tcp_rnxt); if (mp == NULL) goto xmit_check; @@ -14942,12 +12047,13 @@ update_ack: /* * Check for ancillary data changes compared to last segment. 
*/ - if (tcp->tcp_ipv6_recvancillary != 0) { - mp = tcp_rput_add_ancillary(tcp, mp, &ipp); - ASSERT(mp != NULL); + if (connp->conn_recv_ancillary.crb_all != 0) { + mp = tcp_input_add_ancillary(tcp, mp, &ipp, ira); + if (mp == NULL) + return; } - if (tcp->tcp_listener || tcp->tcp_hard_binding) { + if (tcp->tcp_listener != NULL || tcp->tcp_hard_binding) { /* * Side queue inbound data until the accept happens. * tcp_accept/tcp_rput drains this when the accept happens. @@ -14961,9 +12067,9 @@ update_ack: if (tcp->tcp_kssl_pending) { DTRACE_PROBE1(kssl_mblk__ksslinput_pending, mblk_t *, mp); - tcp_kssl_input(tcp, mp); + tcp_kssl_input(tcp, mp, ira->ira_cred); } else { - tcp_rcv_enqueue(tcp, mp, seg_len); + tcp_rcv_enqueue(tcp, mp, seg_len, ira->ira_cred); } } else if (IPCL_IS_NONSTR(connp)) { /* @@ -15015,19 +12121,22 @@ update_ack: (DB_TYPE(mp) == M_DATA)) { DTRACE_PROBE1(kssl_mblk__ksslinput_data1, mblk_t *, mp); - tcp_kssl_input(tcp, mp); + tcp_kssl_input(tcp, mp, ira->ira_cred); } else { - putnext(tcp->tcp_rq, mp); - if (!canputnext(tcp->tcp_rq)) + if (is_system_labeled()) + tcp_setcred_data(mp, ira); + + putnext(connp->conn_rq, mp); + if (!canputnext(connp->conn_rq)) tcp->tcp_rwnd -= seg_len; } } else if ((tcp->tcp_kssl_ctx != NULL) && (DB_TYPE(mp) == M_DATA)) { /* Does this need SSL processing first? */ DTRACE_PROBE1(kssl_mblk__ksslinput_data2, mblk_t *, mp); - tcp_kssl_input(tcp, mp); + tcp_kssl_input(tcp, mp, ira->ira_cred); } else if ((flags & (TH_PUSH|TH_FIN)) || - tcp->tcp_rcv_cnt + seg_len >= tcp->tcp_recv_hiwater >> 3) { + tcp->tcp_rcv_cnt + seg_len >= connp->conn_rcvbuf >> 3) { if (tcp->tcp_rcv_list != NULL) { /* * Enqueue the new segment first and then @@ -15042,11 +12151,15 @@ update_ack: * canputnext() as tcp_rcv_drain() needs to * call canputnext(). 
*/ - tcp_rcv_enqueue(tcp, mp, seg_len); + tcp_rcv_enqueue(tcp, mp, seg_len, + ira->ira_cred); flags |= tcp_rcv_drain(tcp); } else { - putnext(tcp->tcp_rq, mp); - if (!canputnext(tcp->tcp_rq)) + if (is_system_labeled()) + tcp_setcred_data(mp, ira); + + putnext(connp->conn_rq, mp); + if (!canputnext(connp->conn_rq)) tcp->tcp_rwnd -= seg_len; } } else { @@ -15054,7 +12167,7 @@ update_ack: * Enqueue all packets when processing an mblk * from the co queue and also enqueue normal packets. */ - tcp_rcv_enqueue(tcp, mp, seg_len); + tcp_rcv_enqueue(tcp, mp, seg_len, ira->ira_cred); } /* * Make sure the timer is running if we have data waiting @@ -15103,7 +12216,7 @@ xmit_check: BUMP_MIB(&tcps->tcps_mib, tcpRetransSegs); UPDATE_MIB(&tcps->tcps_mib, tcpRetransBytes, snd_size); - tcp_send_data(tcp, tcp->tcp_wq, mp1); + tcp_send_data(tcp, mp1); } } if (flags & TH_NEED_SACK_REXMIT) { @@ -15155,7 +12268,10 @@ ack_check: ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg); mp1 = tcp->tcp_urp_mark_mp; tcp->tcp_urp_mark_mp = NULL; - putnext(tcp->tcp_rq, mp1); + if (is_system_labeled()) + tcp_setcred_data(mp1, ira); + + putnext(connp->conn_rq, mp1); #ifdef DEBUG (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, "tcp_rput: sending zero-length %s %s", @@ -15172,7 +12288,7 @@ ack_check: mp1 = tcp_ack_mp(tcp); if (mp1 != NULL) { - tcp_send_data(tcp, tcp->tcp_wq, mp1); + tcp_send_data(tcp, mp1); BUMP_LOCAL(tcp->tcp_obsegs); BUMP_MIB(&tcps->tcps_mib, tcpOutAck); } @@ -15200,6 +12316,7 @@ ack_check: * after tcp_accept is done. 
*/ ASSERT(tcp->tcp_listener == NULL); + ASSERT(!tcp->tcp_detached); if (IPCL_IS_NONSTR(connp)) { ASSERT(tcp->tcp_ordrel_mp == NULL); @@ -15220,7 +12337,7 @@ ack_check: mp1 = tcp->tcp_ordrel_mp; tcp->tcp_ordrel_mp = NULL; tcp->tcp_ordrel_done = B_TRUE; - putnext(tcp->tcp_rq, mp1); + putnext(connp->conn_rq, mp1); } done: ASSERT(!(flags & TH_MARKNEXT_NEEDED)); @@ -15251,21 +12368,22 @@ tcp_update_xmit_tail(tcp_t *tcp, uint32_t snxt) * segment passes the PAWS test, else returns B_FALSE. */ boolean_t -tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp) +tcp_paws_check(tcp_t *tcp, tcpha_t *tcpha, tcp_opt_t *tcpoptp) { uint8_t flags; int options; uint8_t *up; + conn_t *connp = tcp->tcp_connp; - flags = (unsigned int)tcph->th_flags[0] & 0xFF; + flags = (unsigned int)tcpha->tha_flags & 0xFF; /* * If timestamp option is aligned nicely, get values inline, * otherwise call general routine to parse. Only do that * if timestamp is the only option. */ - if (TCP_HDR_LENGTH(tcph) == (uint32_t)TCP_MIN_HEADER_LENGTH + + if (TCP_HDR_LENGTH(tcpha) == (uint32_t)TCP_MIN_HEADER_LENGTH + TCPOPT_REAL_TS_LEN && - OK_32PTR((up = ((uint8_t *)tcph) + + OK_32PTR((up = ((uint8_t *)tcpha) + TCP_MIN_HEADER_LENGTH)) && *(uint32_t *)up == TCPOPT_NOP_NOP_TSTAMP) { tcpoptp->tcp_opt_ts_val = ABE32_TO_U32((up+4)); @@ -15278,7 +12396,7 @@ tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp) } else { tcpoptp->tcp = NULL; } - options = tcp_parse_options(tcph, tcpoptp); + options = tcp_parse_options(tcpha, tcpoptp); } if (options & TCP_OPT_TSTAMP_PRESENT) { @@ -15311,16 +12429,15 @@ tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp) */ tcp->tcp_snd_ts_ok = B_FALSE; - tcp->tcp_hdr_len -= TCPOPT_REAL_TS_LEN; - tcp->tcp_tcp_hdr_len -= TCPOPT_REAL_TS_LEN; - tcp->tcp_tcph->th_offset_and_rsrvd[0] -= (3 << 4); + connp->conn_ht_iphc_len -= TCPOPT_REAL_TS_LEN; + connp->conn_ht_ulp_len -= TCPOPT_REAL_TS_LEN; + tcp->tcp_tcpha->tha_offset_and_reserved -= (3 << 4); /* - * Adjust the tcp_mss 
accordingly. We also need to - * adjust tcp_cwnd here in accordance with the new mss. - * But we avoid doing a slow start here so as to not - * to lose on the transfer rate built up so far. + * Adjust the tcp_mss and tcp_cwnd accordingly. We avoid + * doing a slow start here so as to not to lose on the + * transfer rate built up so far. */ - tcp_mss_set(tcp, tcp->tcp_mss + TCPOPT_REAL_TS_LEN, B_FALSE); + tcp_mss_set(tcp, tcp->tcp_mss + TCPOPT_REAL_TS_LEN); if (tcp->tcp_snd_sack_ok) { ASSERT(tcp->tcp_sack_info != NULL); tcp->tcp_max_sack_blk = 4; @@ -15338,38 +12455,37 @@ tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp) * when memory allocation fails we can just wait for the next data segment. */ static mblk_t * -tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp) +tcp_input_add_ancillary(tcp_t *tcp, mblk_t *mp, ip_pkt_t *ipp, + ip_recv_attr_t *ira) { struct T_optdata_ind *todi; int optlen; uchar_t *optptr; struct T_opthdr *toh; - uint_t addflag; /* Which pieces to add */ + crb_t addflag; /* Which pieces to add */ mblk_t *mp1; + conn_t *connp = tcp->tcp_connp; optlen = 0; - addflag = 0; + addflag.crb_all = 0; /* If app asked for pktinfo and the index has changed ... */ - if ((ipp->ipp_fields & IPPF_IFINDEX) && - ipp->ipp_ifindex != tcp->tcp_recvifindex && - (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO)) { + if (connp->conn_recv_ancillary.crb_ip_recvpktinfo && + ira->ira_ruifindex != tcp->tcp_recvifindex) { optlen += sizeof (struct T_opthdr) + sizeof (struct in6_pktinfo); - addflag |= TCP_IPV6_RECVPKTINFO; + addflag.crb_ip_recvpktinfo = 1; } /* If app asked for hoplimit and it has changed ... 
*/ - if ((ipp->ipp_fields & IPPF_HOPLIMIT) && - ipp->ipp_hoplimit != tcp->tcp_recvhops && - (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVHOPLIMIT)) { + if (connp->conn_recv_ancillary.crb_ipv6_recvhoplimit && + ipp->ipp_hoplimit != tcp->tcp_recvhops) { optlen += sizeof (struct T_opthdr) + sizeof (uint_t); - addflag |= TCP_IPV6_RECVHOPLIMIT; + addflag.crb_ipv6_recvhoplimit = 1; } /* If app asked for tclass and it has changed ... */ - if ((ipp->ipp_fields & IPPF_TCLASS) && - ipp->ipp_tclass != tcp->tcp_recvtclass && - (tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVTCLASS)) { + if (connp->conn_recv_ancillary.crb_ipv6_recvtclass && + ipp->ipp_tclass != tcp->tcp_recvtclass) { optlen += sizeof (struct T_opthdr) + sizeof (uint_t); - addflag |= TCP_IPV6_RECVTCLASS; + addflag.crb_ipv6_recvtclass = 1; } /* * If app asked for hopbyhop headers and it has changed ... @@ -15377,51 +12493,51 @@ tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp) * a connected socket at all, (2) we're connected to at most one peer, * (3) if anything changes, then it must be some other extra option. */ - if ((tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVHOPOPTS) && + if (connp->conn_recv_ancillary.crb_ipv6_recvhopopts && ip_cmpbuf(tcp->tcp_hopopts, tcp->tcp_hopoptslen, (ipp->ipp_fields & IPPF_HOPOPTS), ipp->ipp_hopopts, ipp->ipp_hopoptslen)) { - optlen += sizeof (struct T_opthdr) + ipp->ipp_hopoptslen - - tcp->tcp_label_len; - addflag |= TCP_IPV6_RECVHOPOPTS; + optlen += sizeof (struct T_opthdr) + ipp->ipp_hopoptslen; + addflag.crb_ipv6_recvhopopts = 1; if (!ip_allocbuf((void **)&tcp->tcp_hopopts, &tcp->tcp_hopoptslen, (ipp->ipp_fields & IPPF_HOPOPTS), ipp->ipp_hopopts, ipp->ipp_hopoptslen)) return (mp); } /* If app asked for dst headers before routing headers ... 
*/ - if ((tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVRTDSTOPTS) && - ip_cmpbuf(tcp->tcp_rtdstopts, tcp->tcp_rtdstoptslen, - (ipp->ipp_fields & IPPF_RTDSTOPTS), - ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen)) { + if (connp->conn_recv_ancillary.crb_ipv6_recvrthdrdstopts && + ip_cmpbuf(tcp->tcp_rthdrdstopts, tcp->tcp_rthdrdstoptslen, + (ipp->ipp_fields & IPPF_RTHDRDSTOPTS), + ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen)) { optlen += sizeof (struct T_opthdr) + - ipp->ipp_rtdstoptslen; - addflag |= TCP_IPV6_RECVRTDSTOPTS; - if (!ip_allocbuf((void **)&tcp->tcp_rtdstopts, - &tcp->tcp_rtdstoptslen, (ipp->ipp_fields & IPPF_RTDSTOPTS), - ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen)) + ipp->ipp_rthdrdstoptslen; + addflag.crb_ipv6_recvrthdrdstopts = 1; + if (!ip_allocbuf((void **)&tcp->tcp_rthdrdstopts, + &tcp->tcp_rthdrdstoptslen, + (ipp->ipp_fields & IPPF_RTHDRDSTOPTS), + ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen)) return (mp); } /* If app asked for routing headers and it has changed ... */ - if ((tcp->tcp_ipv6_recvancillary & TCP_IPV6_RECVRTHDR) && + if (connp->conn_recv_ancillary.crb_ipv6_recvrthdr && ip_cmpbuf(tcp->tcp_rthdr, tcp->tcp_rthdrlen, (ipp->ipp_fields & IPPF_RTHDR), ipp->ipp_rthdr, ipp->ipp_rthdrlen)) { optlen += sizeof (struct T_opthdr) + ipp->ipp_rthdrlen; - addflag |= TCP_IPV6_RECVRTHDR; + addflag.crb_ipv6_recvrthdr = 1; if (!ip_allocbuf((void **)&tcp->tcp_rthdr, &tcp->tcp_rthdrlen, (ipp->ipp_fields & IPPF_RTHDR), ipp->ipp_rthdr, ipp->ipp_rthdrlen)) return (mp); } /* If app asked for dest headers and it has changed ... 
*/ - if ((tcp->tcp_ipv6_recvancillary & - (TCP_IPV6_RECVDSTOPTS | TCP_OLD_IPV6_RECVDSTOPTS)) && + if ((connp->conn_recv_ancillary.crb_ipv6_recvdstopts || + connp->conn_recv_ancillary.crb_old_ipv6_recvdstopts) && ip_cmpbuf(tcp->tcp_dstopts, tcp->tcp_dstoptslen, (ipp->ipp_fields & IPPF_DSTOPTS), ipp->ipp_dstopts, ipp->ipp_dstoptslen)) { optlen += sizeof (struct T_opthdr) + ipp->ipp_dstoptslen; - addflag |= TCP_IPV6_RECVDSTOPTS; + addflag.crb_ipv6_recvdstopts = 1; if (!ip_allocbuf((void **)&tcp->tcp_dstopts, &tcp->tcp_dstoptslen, (ipp->ipp_fields & IPPF_DSTOPTS), ipp->ipp_dstopts, ipp->ipp_dstoptslen)) @@ -15454,9 +12570,11 @@ tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp) * If app asked for pktinfo and the index has changed ... * Note that the local address never changes for the connection. */ - if (addflag & TCP_IPV6_RECVPKTINFO) { + if (addflag.crb_ip_recvpktinfo) { struct in6_pktinfo *pkti; + uint_t ifindex; + ifindex = ira->ira_ruifindex; toh = (struct T_opthdr *)optptr; toh->level = IPPROTO_IPV6; toh->name = IPV6_PKTINFO; @@ -15464,19 +12582,15 @@ tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp) toh->status = 0; optptr += sizeof (*toh); pkti = (struct in6_pktinfo *)optptr; - if (tcp->tcp_ipversion == IPV6_VERSION) - pkti->ipi6_addr = tcp->tcp_ip6h->ip6_src; - else - IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, - &pkti->ipi6_addr); - pkti->ipi6_ifindex = ipp->ipp_ifindex; + pkti->ipi6_addr = connp->conn_laddr_v6; + pkti->ipi6_ifindex = ifindex; optptr += sizeof (*pkti); ASSERT(OK_32PTR(optptr)); /* Save as "last" value */ - tcp->tcp_recvifindex = ipp->ipp_ifindex; + tcp->tcp_recvifindex = ifindex; } /* If app asked for hoplimit and it has changed ... 
*/ - if (addflag & TCP_IPV6_RECVHOPLIMIT) { + if (addflag.crb_ipv6_recvhoplimit) { toh = (struct T_opthdr *)optptr; toh->level = IPPROTO_IPV6; toh->name = IPV6_HOPLIMIT; @@ -15490,7 +12604,7 @@ tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp) tcp->tcp_recvhops = ipp->ipp_hoplimit; } /* If app asked for tclass and it has changed ... */ - if (addflag & TCP_IPV6_RECVTCLASS) { + if (addflag.crb_ipv6_recvtclass) { toh = (struct T_opthdr *)optptr; toh->level = IPPROTO_IPV6; toh->name = IPV6_TCLASS; @@ -15503,40 +12617,38 @@ tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp) /* Save as "last" value */ tcp->tcp_recvtclass = ipp->ipp_tclass; } - if (addflag & TCP_IPV6_RECVHOPOPTS) { + if (addflag.crb_ipv6_recvhopopts) { toh = (struct T_opthdr *)optptr; toh->level = IPPROTO_IPV6; toh->name = IPV6_HOPOPTS; - toh->len = sizeof (*toh) + ipp->ipp_hopoptslen - - tcp->tcp_label_len; + toh->len = sizeof (*toh) + ipp->ipp_hopoptslen; toh->status = 0; optptr += sizeof (*toh); - bcopy((uchar_t *)ipp->ipp_hopopts + tcp->tcp_label_len, optptr, - ipp->ipp_hopoptslen - tcp->tcp_label_len); - optptr += ipp->ipp_hopoptslen - tcp->tcp_label_len; + bcopy((uchar_t *)ipp->ipp_hopopts, optptr, ipp->ipp_hopoptslen); + optptr += ipp->ipp_hopoptslen; ASSERT(OK_32PTR(optptr)); /* Save as last value */ ip_savebuf((void **)&tcp->tcp_hopopts, &tcp->tcp_hopoptslen, (ipp->ipp_fields & IPPF_HOPOPTS), ipp->ipp_hopopts, ipp->ipp_hopoptslen); } - if (addflag & TCP_IPV6_RECVRTDSTOPTS) { + if (addflag.crb_ipv6_recvrthdrdstopts) { toh = (struct T_opthdr *)optptr; toh->level = IPPROTO_IPV6; toh->name = IPV6_RTHDRDSTOPTS; - toh->len = sizeof (*toh) + ipp->ipp_rtdstoptslen; + toh->len = sizeof (*toh) + ipp->ipp_rthdrdstoptslen; toh->status = 0; optptr += sizeof (*toh); - bcopy(ipp->ipp_rtdstopts, optptr, ipp->ipp_rtdstoptslen); - optptr += ipp->ipp_rtdstoptslen; + bcopy(ipp->ipp_rthdrdstopts, optptr, ipp->ipp_rthdrdstoptslen); + optptr += ipp->ipp_rthdrdstoptslen; 
ASSERT(OK_32PTR(optptr)); /* Save as last value */ - ip_savebuf((void **)&tcp->tcp_rtdstopts, - &tcp->tcp_rtdstoptslen, - (ipp->ipp_fields & IPPF_RTDSTOPTS), - ipp->ipp_rtdstopts, ipp->ipp_rtdstoptslen); + ip_savebuf((void **)&tcp->tcp_rthdrdstopts, + &tcp->tcp_rthdrdstoptslen, + (ipp->ipp_fields & IPPF_RTHDRDSTOPTS), + ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen); } - if (addflag & TCP_IPV6_RECVRTHDR) { + if (addflag.crb_ipv6_recvrthdr) { toh = (struct T_opthdr *)optptr; toh->level = IPPROTO_IPV6; toh->name = IPV6_RTHDR; @@ -15551,7 +12663,7 @@ tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp) (ipp->ipp_fields & IPPF_RTHDR), ipp->ipp_rthdr, ipp->ipp_rthdrlen); } - if (addflag & (TCP_IPV6_RECVDSTOPTS | TCP_OLD_IPV6_RECVDSTOPTS)) { + if (addflag.crb_ipv6_recvdstopts) { toh = (struct T_opthdr *)optptr; toh->level = IPPROTO_IPV6; toh->name = IPV6_DSTOPTS; @@ -15570,99 +12682,13 @@ tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp) return (mp); } -/* - * tcp_rput_other is called by tcp_rput to handle everything other than M_DATA - * messages. 
- */ -void -tcp_rput_other(tcp_t *tcp, mblk_t *mp) -{ - uchar_t *rptr = mp->b_rptr; - queue_t *q = tcp->tcp_rq; - struct T_error_ack *tea; - - switch (mp->b_datap->db_type) { - case M_PROTO: - case M_PCPROTO: - ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX); - if ((mp->b_wptr - rptr) < sizeof (t_scalar_t)) - break; - tea = (struct T_error_ack *)rptr; - ASSERT(tea->PRIM_type != T_BIND_ACK); - ASSERT(tea->ERROR_prim != O_T_BIND_REQ && - tea->ERROR_prim != T_BIND_REQ); - switch (tea->PRIM_type) { - case T_ERROR_ACK: - if (tcp->tcp_debug) { - (void) strlog(TCP_MOD_ID, 0, 1, - SL_TRACE|SL_ERROR, - "tcp_rput_other: case T_ERROR_ACK, " - "ERROR_prim == %d", - tea->ERROR_prim); - } - switch (tea->ERROR_prim) { - case T_SVR4_OPTMGMT_REQ: - if (tcp->tcp_drop_opt_ack_cnt > 0) { - /* T_OPTMGMT_REQ generated by TCP */ - printf("T_SVR4_OPTMGMT_REQ failed " - "%d/%d - dropped (cnt %d)\n", - tea->TLI_error, tea->UNIX_error, - tcp->tcp_drop_opt_ack_cnt); - freemsg(mp); - tcp->tcp_drop_opt_ack_cnt--; - return; - } - break; - } - if (tea->ERROR_prim == T_SVR4_OPTMGMT_REQ && - tcp->tcp_drop_opt_ack_cnt > 0) { - printf("T_SVR4_OPTMGMT_REQ failed %d/%d " - "- dropped (cnt %d)\n", - tea->TLI_error, tea->UNIX_error, - tcp->tcp_drop_opt_ack_cnt); - freemsg(mp); - tcp->tcp_drop_opt_ack_cnt--; - return; - } - break; - case T_OPTMGMT_ACK: - if (tcp->tcp_drop_opt_ack_cnt > 0) { - /* T_OPTMGMT_REQ generated by TCP */ - freemsg(mp); - tcp->tcp_drop_opt_ack_cnt--; - return; - } - break; - default: - ASSERT(tea->ERROR_prim != T_UNBIND_REQ); - break; - } - break; - case M_FLUSH: - if (*rptr & FLUSHR) - flushq(q, FLUSHDATA); - break; - default: - /* M_CTL will be directly sent to tcp_icmp_error() */ - ASSERT(DB_TYPE(mp) != M_CTL); - break; - } - /* - * Make sure we set this bit before sending the ACK for - * bind. Otherwise accept could possibly run and free - * this tcp struct. 
- */ - ASSERT(q != NULL); - putnext(q, mp); -} - /* ARGSUSED */ static void -tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2) +tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { conn_t *connp = (conn_t *)arg; tcp_t *tcp = connp->conn_tcp; - queue_t *q = tcp->tcp_rq; + queue_t *q = connp->conn_rq; tcp_stack_t *tcps = tcp->tcp_tcps; ASSERT(!IPCL_IS_NONSTR(connp)); @@ -15683,7 +12709,7 @@ tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2) if (canputnext(q)) { /* Not flow-controlled, open rwnd */ - tcp->tcp_rwnd = tcp->tcp_recv_hiwater; + tcp->tcp_rwnd = connp->conn_rcvbuf; /* * Send back a window update immediately if TCP is above @@ -15712,16 +12738,10 @@ tcp_rsrv(queue_t *q) conn_t *connp = Q_TO_CONN(q); tcp_t *tcp = connp->conn_tcp; mblk_t *mp; - tcp_stack_t *tcps = tcp->tcp_tcps; /* No code does a putq on the read side */ ASSERT(q->q_first == NULL); - /* Nothing to do for the default queue */ - if (q == tcps->tcps_g_q) { - return; - } - /* * If tcp->tcp_rsrv_mp == NULL, it means that tcp_rsrv() has already * been run. So just return. @@ -15736,7 +12756,7 @@ tcp_rsrv(queue_t *q) CONN_INC_REF(connp); SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_rsrv_input, connp, - SQ_PROCESS, SQTAG_TCP_RSRV); + NULL, SQ_PROCESS, SQTAG_TCP_RSRV); } /* @@ -15746,8 +12766,8 @@ tcp_rsrv(queue_t *q) * * This function is called in 2 cases: * - * 1) Before data transfer begins, in tcp_accept_comm() for accepting a - * connection (passive open) and in tcp_rput_data() for active connect. + * 1) Before data transfer begins, in tcp_input_listener() for accepting a + * connection (passive open) and in tcp_input_data() for active connect. * This is called after tcp_mss_set() when the desired MSS value is known. * This makes sure that our window size is a mutiple of the other side's * MSS. 
@@ -15766,6 +12786,7 @@ tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd) uint32_t max_transmittable_rwnd; boolean_t tcp_detached = TCP_IS_DETACHED(tcp); tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; /* * Insist on a receive window that is at least @@ -15782,7 +12803,7 @@ tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd) ASSERT(peer_tcp != NULL); sth_hiwat = tcp_fuse_set_rcv_hiwat(tcp, rwnd); if (!tcp_detached) { - (void) proto_set_rx_hiwat(tcp->tcp_rq, tcp->tcp_connp, + (void) proto_set_rx_hiwat(connp->conn_rq, connp, sth_hiwat); tcp_set_recv_threshold(tcp, sth_hiwat >> 3); } @@ -15797,11 +12818,10 @@ tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd) return (sth_hiwat); } - if (tcp_detached) { + if (tcp_detached) old_max_rwnd = tcp->tcp_rwnd; - } else { - old_max_rwnd = tcp->tcp_recv_hiwater; - } + else + old_max_rwnd = connp->conn_rcvbuf; /* @@ -15854,9 +12874,14 @@ tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd) * connection.) */ tcp->tcp_rwnd += rwnd - old_max_rwnd; - tcp->tcp_recv_hiwater = rwnd; + connp->conn_rcvbuf = rwnd; + + /* Are we already connected? 
*/ + if (tcp->tcp_tcpha != NULL) { + tcp->tcp_tcpha->tha_win = + htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); + } - U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, tcp->tcp_tcph->th_win); if ((tcp->tcp_rcv_ws > 0) && rwnd > tcp->tcp_cwnd_max) tcp->tcp_cwnd_max = rwnd; @@ -15865,7 +12890,7 @@ tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd) tcp_set_recv_threshold(tcp, rwnd >> 3); - (void) proto_set_rx_hiwat(tcp->tcp_rq, tcp->tcp_connp, rwnd); + (void) proto_set_rx_hiwat(connp->conn_rq, connp, rwnd); return (rwnd); } @@ -15944,7 +12969,7 @@ tcp_snmp_get(queue_t *q, mblk_t *mpctl) connp = NULL; while ((connp = - ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) { + ipcl_get_next_conn(connfp, connp, IPCL_TCPCONN)) != NULL) { tcp_t *tcp; boolean_t needattr; @@ -15992,11 +13017,10 @@ tcp_snmp_get(queue_t *q, mblk_t *mpctl) needattr = B_TRUE; break; } - if (connp->conn_fully_bound && - connp->conn_effective_cred != NULL) { + if (connp->conn_ixa->ixa_tsl != NULL) { ts_label_t *tsl; - tsl = crgetlabel(connp->conn_effective_cred); + tsl = connp->conn_ixa->ixa_tsl; mlp.tme_flags |= MIB2_TMEF_IS_LABELED; mlp.tme_doi = label2doi(tsl); mlp.tme_label = *label2bslabel(tsl); @@ -16004,12 +13028,17 @@ tcp_snmp_get(queue_t *q, mblk_t *mpctl) } /* Create a message to report on IPv6 entries */ - if (tcp->tcp_ipversion == IPV6_VERSION) { - tce6.tcp6ConnLocalAddress = tcp->tcp_ip_src_v6; - tce6.tcp6ConnRemAddress = tcp->tcp_remote_v6; - tce6.tcp6ConnLocalPort = ntohs(tcp->tcp_lport); - tce6.tcp6ConnRemPort = ntohs(tcp->tcp_fport); - tce6.tcp6ConnIfIndex = tcp->tcp_bound_if; + if (connp->conn_ipversion == IPV6_VERSION) { + tce6.tcp6ConnLocalAddress = connp->conn_laddr_v6; + tce6.tcp6ConnRemAddress = connp->conn_faddr_v6; + tce6.tcp6ConnLocalPort = ntohs(connp->conn_lport); + tce6.tcp6ConnRemPort = ntohs(connp->conn_fport); + if (connp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET) { + tce6.tcp6ConnIfIndex = + connp->conn_ixa->ixa_scopeid; + } else { + tce6.tcp6ConnIfIndex = connp->conn_bound_if; + } /* 
Don't want just anybody seeing these... */ if (ispriv) { tce6.tcp6ConnEntryInfo.ce_snxt = @@ -16041,9 +13070,9 @@ tcp_snmp_get(queue_t *q, mblk_t *mpctl) tce6.tcp6ConnEntryInfo.ce_state = tcp->tcp_state; tce6.tcp6ConnCreationProcess = - (tcp->tcp_cpid < 0) ? MIB2_UNKNOWN_PROCESS : - tcp->tcp_cpid; - tce6.tcp6ConnCreationTime = tcp->tcp_open_time; + (connp->conn_cpid < 0) ? MIB2_UNKNOWN_PROCESS : + connp->conn_cpid; + tce6.tcp6ConnCreationTime = connp->conn_open_time; (void) snmp_append_data2(mp6_conn_ctl->b_cont, &mp6_conn_tail, (char *)&tce6, sizeof (tce6)); @@ -16059,21 +13088,21 @@ tcp_snmp_get(queue_t *q, mblk_t *mpctl) * but don't have IPV6_V6ONLY set. * (i.e. anything an IPv4 peer could connect to) */ - if (tcp->tcp_ipversion == IPV4_VERSION || + if (connp->conn_ipversion == IPV4_VERSION || (tcp->tcp_state <= TCPS_LISTEN && - !tcp->tcp_connp->conn_ipv6_v6only && - IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_ip_src_v6))) { - if (tcp->tcp_ipversion == IPV6_VERSION) { + !connp->conn_ipv6_v6only && + IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6))) { + if (connp->conn_ipversion == IPV6_VERSION) { tce.tcpConnRemAddress = INADDR_ANY; tce.tcpConnLocalAddress = INADDR_ANY; } else { tce.tcpConnRemAddress = - tcp->tcp_remote; + connp->conn_faddr_v4; tce.tcpConnLocalAddress = - tcp->tcp_ip_src; + connp->conn_laddr_v4; } - tce.tcpConnLocalPort = ntohs(tcp->tcp_lport); - tce.tcpConnRemPort = ntohs(tcp->tcp_fport); + tce.tcpConnLocalPort = ntohs(connp->conn_lport); + tce.tcpConnRemPort = ntohs(connp->conn_fport); /* Don't want just anybody seeing these... */ if (ispriv) { tce.tcpConnEntryInfo.ce_snxt = @@ -16107,9 +13136,10 @@ tcp_snmp_get(queue_t *q, mblk_t *mpctl) tcp->tcp_state; tce.tcpConnCreationProcess = - (tcp->tcp_cpid < 0) ? MIB2_UNKNOWN_PROCESS : - tcp->tcp_cpid; - tce.tcpConnCreationTime = tcp->tcp_open_time; + (connp->conn_cpid < 0) ? 
+ MIB2_UNKNOWN_PROCESS : + connp->conn_cpid; + tce.tcpConnCreationTime = connp->conn_open_time; (void) snmp_append_data2(mp_conn_ctl->b_cont, &mp_conn_tail, (char *)&tce, sizeof (tce)); @@ -16273,7 +13303,6 @@ tcp_timer(void *arg) tcp_t *listener = tcp->tcp_listener; if (tcp->tcp_syn_rcvd_timeout == 0 && (listener != NULL)) { - ASSERT(tcp->tcp_rq == listener->tcp_rq); /* it's our first timeout */ tcp->tcp_syn_rcvd_timeout = 1; mutex_enter(&listener->tcp_eager_lock); @@ -16295,7 +13324,7 @@ tcp_timer(void *arg) cmn_err(CE_WARN, "High TCP connect timeout " "rate! System (port %d) may be under a " "SYN flood attack!", - BE16_TO_U16(listener->tcp_tcph->th_lport)); + ntohs(listener->tcp_connp->conn_lport)); listener->tcp_ip_addr_cache = kmem_zalloc( IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t), @@ -16363,7 +13392,7 @@ tcp_timer(void *arg) * backoff. */ if (tcp->tcp_swnd == 0 || tcp->tcp_zero_win_probe) { - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, "tcp_timer: zero win"); } @@ -16415,6 +13444,13 @@ tcp_timer(void *arg) * 3. But 1 and 3 are exclusive. */ if (tcp->tcp_unsent != 0) { + /* + * Should not hold the zero-copy messages for too long. + */ + if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean) + tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp, + tcp->tcp_xmit_head, B_TRUE); + if (tcp->tcp_cwnd == 0) { /* * Set tcp_cwnd to 1 MSS so that a @@ -16477,7 +13513,7 @@ tcp_timer(void *arg) (void) tcp_clean_death(tcp, 0, 24); return; default: - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, "tcp_timer: strange state (%d) %s", tcp->tcp_state, tcp_display(tcp, NULL, @@ -16485,8 +13521,16 @@ tcp_timer(void *arg) } return; } + if ((ms = tcp->tcp_ms_we_have_waited) > second_threshold) { /* + * Should not hold the zero-copy messages for too long. 
+ */ + if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean) + tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp, + tcp->tcp_xmit_head, B_TRUE); + + /* * For zero window probe, we need to send indefinitely, * unless we have not heard from the other side for some * time... @@ -16529,11 +13573,13 @@ tcp_timer(void *arg) tcp->tcp_ms_we_have_waited = second_threshold; } } else if (ms > first_threshold) { - if (tcp->tcp_snd_zcopy_aware && (!tcp->tcp_xmit_zc_clean) && - tcp->tcp_xmit_head != NULL) { - tcp->tcp_xmit_head = - tcp_zcopy_backoff(tcp, tcp->tcp_xmit_head, 1); - } + /* + * Should not hold the zero-copy messages for too long. + */ + if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean) + tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp, + tcp->tcp_xmit_head, B_TRUE); + /* * We have been retransmitting for too long... The RTT * we calculated is probably incorrect. Reinitialize it. @@ -16618,20 +13664,11 @@ tcp_timer(void *arg) if (mp == NULL) { return; } - /* - * Attach credentials to retransmitted initial SYNs. - * In theory we should use the credentials from the connect() - * call to ensure that getpeerucred() on the peer will be correct. - * But we assume that SYN's are not dropped for loopback connections. 
- */ - if (tcp->tcp_state == TCPS_SYN_SENT) { - mblk_setcred(mp, CONN_CRED(tcp->tcp_connp), tcp->tcp_cpid); - } tcp->tcp_csuna = tcp->tcp_snxt; BUMP_MIB(&tcps->tcps_mib, tcpRetransSegs); UPDATE_MIB(&tcps->tcps_mib, tcpRetransBytes, mss); - tcp_send_data(tcp, tcp->tcp_wq, mp); + tcp_send_data(tcp, mp); } @@ -16639,7 +13676,6 @@ static int tcp_do_unbind(conn_t *connp) { tcp_t *tcp = connp->conn_tcp; - int error = 0; switch (tcp->tcp_state) { case TCPS_BOUND: @@ -16659,41 +13695,36 @@ tcp_do_unbind(conn_t *connp) } mutex_exit(&tcp->tcp_eager_lock); - if (tcp->tcp_ipversion == IPV4_VERSION) { - tcp->tcp_ipha->ipha_src = 0; - } else { - V6_SET_ZERO(tcp->tcp_ip6h->ip6_src); - } - V6_SET_ZERO(tcp->tcp_ip_src_v6); - bzero(tcp->tcp_tcph->th_lport, sizeof (tcp->tcp_tcph->th_lport)); + connp->conn_laddr_v6 = ipv6_all_zeros; + connp->conn_saddr_v6 = ipv6_all_zeros; tcp_bind_hash_remove(tcp); tcp->tcp_state = TCPS_IDLE; - tcp->tcp_mdt = B_FALSE; - connp = tcp->tcp_connp; - connp->conn_mdt_ok = B_FALSE; - ipcl_hash_remove(connp); + ip_unbind(connp); bzero(&connp->conn_ports, sizeof (connp->conn_ports)); - return (error); + return (0); } /* tcp_unbind is called by tcp_wput_proto to handle T_UNBIND_REQ messages. 
*/ static void tcp_tpi_unbind(tcp_t *tcp, mblk_t *mp) { - int error = tcp_do_unbind(tcp->tcp_connp); + conn_t *connp = tcp->tcp_connp; + int error; + error = tcp_do_unbind(connp); if (error > 0) { tcp_err_ack(tcp, mp, TSYSERR, error); } else if (error < 0) { tcp_err_ack(tcp, mp, -error, 0); } else { /* Send M_FLUSH according to TPI */ - (void) putnextctl1(tcp->tcp_rq, M_FLUSH, FLUSHRW); + (void) putnextctl1(connp->conn_rq, M_FLUSH, FLUSHRW); mp = mi_tpi_ok_ack_alloc(mp); - putnext(tcp->tcp_rq, mp); + if (mp != NULL) + putnext(connp->conn_rq, mp); } } @@ -16764,7 +13795,7 @@ retry: } } if (is_system_labeled() && - (i = tsol_next_port(crgetzone(tcp->tcp_cred), port, + (i = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred), port, IPPROTO_TCP, B_TRUE)) != 0) { port = i; goto retry; @@ -16796,7 +13827,7 @@ retry: restart = B_TRUE; } if (is_system_labeled() && - (nextport = tsol_next_port(crgetzone(tcp->tcp_cred), + (nextport = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred), next_priv_port, IPPROTO_TCP, B_FALSE)) != 0) { next_priv_port = nextport; goto retry; @@ -16820,11 +13851,10 @@ struct { */ /* ARGSUSED */ static void -tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2) +tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { conn_t *connp = (conn_t *)arg; tcp_t *tcp = connp->conn_tcp; - queue_t *q = tcp->tcp_wq; ASSERT(DB_TYPE(mp) != M_IOCTL); /* @@ -16851,7 +13881,7 @@ tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2) tcp_wput_flush(tcp, mp); break; default: - CALL_IP_WPUT(connp, q, mp); + ip_wput_nondata(connp->conn_wq, mp); break; } } @@ -16862,7 +13892,7 @@ tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2) */ /* ARGSUSED */ void -tcp_output(void *arg, mblk_t *mp, void *arg2) +tcp_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { int len; int hdrlen; @@ -16870,7 +13900,7 @@ tcp_output(void *arg, mblk_t *mp, void *arg2) mblk_t *mp1; uchar_t *rptr; uint32_t snxt; - tcph_t *tcph; + tcpha_t *tcpha; struct datab *db; 
uint32_t suna; uint32_t mss; @@ -16882,7 +13912,7 @@ tcp_output(void *arg, mblk_t *mp, void *arg2) tcp_t *tcp = connp->conn_tcp; uint32_t msize; tcp_stack_t *tcps = tcp->tcp_tcps; - ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; + ip_xmit_attr_t *ixa; /* * Try and ASSERT the minimum possible references on the @@ -16903,25 +13933,18 @@ tcp_output(void *arg, mblk_t *mp, void *arg2) tcp->tcp_squeue_bytes -= msize; mutex_exit(&tcp->tcp_non_sq_lock); - /* Check to see if this connection wants to be re-fused. */ - if (tcp->tcp_refuse) { - if (tcp->tcp_ipversion == IPV4_VERSION && - !ipst->ips_ip4_observe.he_interested) { - tcp_fuse(tcp, (uchar_t *)&tcp->tcp_saved_ipha, - &tcp->tcp_saved_tcph); - } else if (tcp->tcp_ipversion == IPV6_VERSION && - !ipst->ips_ip6_observe.he_interested) { - tcp_fuse(tcp, (uchar_t *)&tcp->tcp_saved_ip6h, - &tcp->tcp_saved_tcph); - } - } /* Bypass tcp protocol for fused tcp loopback */ if (tcp->tcp_fused && tcp_fuse_output(tcp, mp, msize)) return; mss = tcp->tcp_mss; - if (tcp->tcp_xmit_zc_clean) - mp = tcp_zcopy_backoff(tcp, mp, 0); + /* + * If ZEROCOPY has turned off, try not to send any zero-copy message + * down. Do backoff, now. + */ + if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_snd_zcopy_on) + mp = tcp_zcopy_backoff(tcp, mp, B_FALSE); + ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); len = (int)(mp->b_wptr - mp->b_rptr); @@ -16977,8 +14000,7 @@ tcp_output(void *arg, mblk_t *mp, void *arg2) * start again to get back the connection's "self-clock" as * described in VJ's paper. * - * Refer to the comment in tcp_mss_set() for the calculation - * of tcp_cwnd after idle. + * Reinitialize tcp_cwnd after idle. 
*/ if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet && (TICK_TO_MSEC(lbolt - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) { @@ -16999,7 +14021,7 @@ tcp_output(void *arg, mblk_t *mp, void *arg2) mutex_enter(&tcp->tcp_non_sq_lock); if (tcp->tcp_flow_stopped && - TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) { + TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) { tcp_clrqfull(tcp); } mutex_exit(&tcp->tcp_non_sq_lock); @@ -17046,43 +14068,43 @@ tcp_output(void *arg, mblk_t *mp, void *arg2) mp->b_next = (mblk_t *)(uintptr_t)snxt; /* adjust tcp header information */ - tcph = tcp->tcp_tcph; - tcph->th_flags[0] = (TH_ACK|TH_PUSH); + tcpha = tcp->tcp_tcpha; + tcpha->tha_flags = (TH_ACK|TH_PUSH); - sum = len + tcp->tcp_tcp_hdr_len + tcp->tcp_sum; + sum = len + connp->conn_ht_ulp_len + connp->conn_sum; sum = (sum >> 16) + (sum & 0xFFFF); - U16_TO_ABE16(sum, tcph->th_sum); + tcpha->tha_sum = htons(sum); - U32_TO_ABE32(snxt, tcph->th_seq); + tcpha->tha_seq = htonl(snxt); BUMP_MIB(&tcps->tcps_mib, tcpOutDataSegs); UPDATE_MIB(&tcps->tcps_mib, tcpOutDataBytes, len); BUMP_LOCAL(tcp->tcp_obsegs); /* Update the latest receive window size in TCP header. 
*/ - U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, - tcph->th_win); + tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); tcp->tcp_last_sent_len = (ushort_t)len; - plen = len + tcp->tcp_hdr_len; + plen = len + connp->conn_ht_iphc_len; - if (tcp->tcp_ipversion == IPV4_VERSION) { + ixa = connp->conn_ixa; + ixa->ixa_pktlen = plen; + + if (ixa->ixa_flags & IXAF_IS_IPV4) { tcp->tcp_ipha->ipha_length = htons(plen); } else { - tcp->tcp_ip6h->ip6_plen = htons(plen - - ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc)); + tcp->tcp_ip6h->ip6_plen = htons(plen - IPV6_HDR_LEN); } /* see if we need to allocate a mblk for the headers */ - hdrlen = tcp->tcp_hdr_len; + hdrlen = connp->conn_ht_iphc_len; rptr = mp1->b_rptr - hdrlen; db = mp1->b_datap; if ((db->db_ref != 2) || rptr < db->db_base || (!OK_32PTR(rptr))) { /* NOTE: we assume allocb returns an OK_32PTR */ - mp = allocb(tcp->tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + - tcps->tcps_wroff_xtra, BPRI_MED); + mp = allocb(hdrlen + tcps->tcps_wroff_xtra, BPRI_MED); if (!mp) { freemsg(mp1); goto no_memory; @@ -17090,7 +14112,6 @@ tcp_output(void *arg, mblk_t *mp, void *arg2) mp->b_cont = mp1; mp1 = mp; /* Leave room for Link Level header */ - /* hdrlen = tcp->tcp_hdr_len; */ rptr = &mp1->b_rptr[tcps->tcps_wroff_xtra]; mp1->b_wptr = &rptr[hdrlen]; } @@ -17099,16 +14120,16 @@ tcp_output(void *arg, mblk_t *mp, void *arg2) /* Fill in the timestamp option. 
*/ if (tcp->tcp_snd_ts_ok) { U32_TO_BE32((uint32_t)lbolt, - (char *)tcph+TCP_MIN_HEADER_LENGTH+4); + (char *)tcpha + TCP_MIN_HEADER_LENGTH+4); U32_TO_BE32(tcp->tcp_ts_recent, - (char *)tcph+TCP_MIN_HEADER_LENGTH+8); + (char *)tcpha + TCP_MIN_HEADER_LENGTH+8); } else { - ASSERT(tcp->tcp_tcp_hdr_len == TCP_MIN_HEADER_LENGTH); + ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH); } /* copy header into outgoing packet */ dst = (ipaddr_t *)rptr; - src = (ipaddr_t *)tcp->tcp_iphc; + src = (ipaddr_t *)connp->conn_ht_iphc; dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; @@ -17135,21 +14156,22 @@ tcp_output(void *arg, mblk_t *mp, void *arg2) if (tcp->tcp_ecn_ok) { SET_ECT(tcp, rptr); - tcph = (tcph_t *)(rptr + tcp->tcp_ip_hdr_len); + tcpha = (tcpha_t *)(rptr + ixa->ixa_ip_hdr_length); if (tcp->tcp_ecn_echo_on) - tcph->th_flags[0] |= TH_ECE; + tcpha->tha_flags |= TH_ECE; if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) { - tcph->th_flags[0] |= TH_CWR; + tcpha->tha_flags |= TH_CWR; tcp->tcp_ecn_cwr_sent = B_TRUE; } } if (tcp->tcp_ip_forward_progress) { - ASSERT(tcp->tcp_ipversion == IPV6_VERSION); - *(uint32_t *)mp1->b_rptr |= IP_FORWARD_PROG; tcp->tcp_ip_forward_progress = B_FALSE; + connp->conn_ixa->ixa_flags |= IXAF_REACH_CONF; + } else { + connp->conn_ixa->ixa_flags &= ~IXAF_REACH_CONF; } - tcp_send_data(tcp, tcp->tcp_wq, mp1); + tcp_send_data(tcp, mp1); return; /* @@ -17166,29 +14188,27 @@ slow: tcp_wput_data(tcp, NULL, B_FALSE); } +/* + * This runs at the tail end of accept processing on the squeue of the + * new connection. 
+ */ /* ARGSUSED */ void -tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) +tcp_accept_finish(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { conn_t *connp = (conn_t *)arg; tcp_t *tcp = connp->conn_tcp; - queue_t *q = tcp->tcp_rq; - struct tcp_options *tcpopt; + queue_t *q = connp->conn_rq; tcp_stack_t *tcps = tcp->tcp_tcps; - /* socket options */ - uint_t sopp_flags; - ssize_t sopp_rxhiwat; - ssize_t sopp_maxblk; - ushort_t sopp_wroff; - ushort_t sopp_tail; - ushort_t sopp_copyopt; + struct sock_proto_props sopp; - tcpopt = (struct tcp_options *)mp->b_rptr; + /* We should just receive a single mblk that fits a T_discon_ind */ + ASSERT(mp->b_cont == NULL); /* * Drop the eager's ref on the listener, that was placed when - * this eager began life in tcp_conn_request. + * this eager began life in tcp_input_listener. */ CONN_DEC_REF(tcp->tcp_saved_listener->tcp_connp); if (IPCL_IS_NONSTR(connp)) { @@ -17227,15 +14247,12 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) * memory allocation failure problems. We know * that the size of the incoming mblk i.e. * stroptions is greater than sizeof - * T_discon_ind. So the reallocb below can't - * fail. + * T_discon_ind. 
*/ - freemsg(mp->b_cont); - mp->b_cont = NULL; ASSERT(DB_REF(mp) == 1); - mp = reallocb(mp, sizeof (struct T_discon_ind), - B_FALSE); - ASSERT(mp != NULL); + ASSERT(MBLKSIZE(mp) >= + sizeof (struct T_discon_ind)); + DB_TYPE(mp) = M_PROTO; ((union T_primitives *)mp->b_rptr)->type = T_DISCON_IND; @@ -17251,41 +14268,21 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) mp->b_wptr = mp->b_rptr + sizeof (struct T_discon_ind); putnext(q, mp); - return; } } - if (tcp->tcp_hard_binding) { - tcp->tcp_hard_binding = B_FALSE; - tcp->tcp_hard_bound = B_TRUE; - } + tcp->tcp_hard_binding = B_FALSE; return; } - if (tcpopt->to_flags & TCPOPT_BOUNDIF) { - int boundif = tcpopt->to_boundif; - uint_t len = sizeof (int); - - (void) tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, IPPROTO_IPV6, - IPV6_BOUND_IF, len, (uchar_t *)&boundif, &len, - (uchar_t *)&boundif, NULL, tcp->tcp_cred, NULL); - } - if (tcpopt->to_flags & TCPOPT_RECVPKTINFO) { - uint_t on = 1; - uint_t len = sizeof (uint_t); - (void) tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, IPPROTO_IPV6, - IPV6_RECVPKTINFO, len, (uchar_t *)&on, &len, - (uchar_t *)&on, NULL, tcp->tcp_cred, NULL); - } - /* - * Set max window size (tcp_recv_hiwater) of the acceptor. + * Set max window size (conn_rcvbuf) of the acceptor. */ if (tcp->tcp_rcv_list == NULL) { /* * Recv queue is empty, tcp_rwnd should not have changed. * That means it should be equal to the listener's tcp_rwnd. */ - tcp->tcp_recv_hiwater = tcp->tcp_rwnd; + connp->conn_rcvbuf = tcp->tcp_rwnd; } else { #ifdef DEBUG mblk_t *tmp; @@ -17300,19 +14297,19 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) ASSERT(cnt != 0 && tcp->tcp_rcv_cnt == cnt); #endif /* There is some data, add them back to get the max. */ - tcp->tcp_recv_hiwater = tcp->tcp_rwnd + tcp->tcp_rcv_cnt; + connp->conn_rcvbuf = tcp->tcp_rwnd + tcp->tcp_rcv_cnt; } /* * This is the first time we run on the correct * queue after tcp_accept. So fix all the q parameters * here. 
*/ - sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_MAXBLK | SOCKOPT_WROFF; - sopp_maxblk = tcp_maxpsz_set(tcp, B_FALSE); + sopp.sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_MAXBLK | SOCKOPT_WROFF; + sopp.sopp_maxblk = tcp_maxpsz_set(tcp, B_FALSE); - sopp_rxhiwat = tcp->tcp_fused ? - tcp_fuse_set_rcv_hiwat(tcp, tcp->tcp_recv_hiwater) : - tcp->tcp_recv_hiwater; + sopp.sopp_rxhiwat = tcp->tcp_fused ? + tcp_fuse_set_rcv_hiwat(tcp, connp->conn_rcvbuf) : + connp->conn_rcvbuf; /* * Determine what write offset value to use depending on SACK and @@ -17328,18 +14325,18 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) * since it would reduce the amount of work done by kmem. * Non-fused tcp loopback case is handled separately below. */ - sopp_wroff = 0; + sopp.sopp_wroff = 0; /* * Update the peer's transmit parameters according to * our recently calculated high water mark value. */ (void) tcp_maxpsz_set(tcp->tcp_loopback_peer, B_TRUE); } else if (tcp->tcp_snd_sack_ok) { - sopp_wroff = tcp->tcp_hdr_len + TCPOPT_MAX_SACK_LEN + + sopp.sopp_wroff = connp->conn_ht_iphc_allocated + (tcp->tcp_loopback ? 0 : tcps->tcps_wroff_xtra); } else { - sopp_wroff = tcp->tcp_hdr_len + (tcp->tcp_loopback ? 0 : - tcps->tcps_wroff_xtra); + sopp.sopp_wroff = connp->conn_ht_iphc_len + + (tcp->tcp_loopback ? 0 : tcps->tcps_wroff_xtra); } /* @@ -17354,30 +14351,22 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) * costs. 
*/ if (tcp->tcp_kssl_ctx != NULL) { - sopp_wroff += SSL3_WROFFSET; + sopp.sopp_wroff += SSL3_WROFFSET; - sopp_flags |= SOCKOPT_TAIL; - sopp_tail = SSL3_MAX_TAIL_LEN; + sopp.sopp_flags |= SOCKOPT_TAIL; + sopp.sopp_tail = SSL3_MAX_TAIL_LEN; - sopp_flags |= SOCKOPT_ZCOPY; - sopp_copyopt = ZCVMUNSAFE; + sopp.sopp_flags |= SOCKOPT_ZCOPY; + sopp.sopp_zcopyflag = ZCVMUNSAFE; - sopp_maxblk = SSL3_MAX_RECORD_LEN; + sopp.sopp_maxblk = SSL3_MAX_RECORD_LEN; } /* Send the options up */ if (IPCL_IS_NONSTR(connp)) { - struct sock_proto_props sopp; - - sopp.sopp_flags = sopp_flags; - sopp.sopp_wroff = sopp_wroff; - sopp.sopp_maxblk = sopp_maxblk; - sopp.sopp_rxhiwat = sopp_rxhiwat; - if (sopp_flags & SOCKOPT_TAIL) { + if (sopp.sopp_flags & SOCKOPT_TAIL) { ASSERT(tcp->tcp_kssl_ctx != NULL); - ASSERT(sopp_flags & SOCKOPT_ZCOPY); - sopp.sopp_tail = sopp_tail; - sopp.sopp_zcopyflag = sopp_copyopt; + ASSERT(sopp.sopp_flags & SOCKOPT_ZCOPY); } if (tcp->tcp_loopback) { sopp.sopp_flags |= SOCKOPT_LOOPBACK; @@ -17385,34 +14374,40 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) } (*connp->conn_upcalls->su_set_proto_props) (connp->conn_upper_handle, &sopp); + freemsg(mp); } else { + /* + * Let us reuse the incoming mblk to avoid + * memory allocation failure problems. 
We know + * that the size of the incoming mblk is at least + * stroptions + */ struct stroptions *stropt; - mblk_t *stropt_mp = allocb(sizeof (struct stroptions), BPRI_HI); - if (stropt_mp == NULL) { - tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); - return; - } - DB_TYPE(stropt_mp) = M_SETOPTS; - stropt = (struct stroptions *)stropt_mp->b_rptr; - stropt_mp->b_wptr += sizeof (struct stroptions); + + ASSERT(DB_REF(mp) == 1); + ASSERT(MBLKSIZE(mp) >= sizeof (struct stroptions)); + + DB_TYPE(mp) = M_SETOPTS; + stropt = (struct stroptions *)mp->b_rptr; + mp->b_wptr = mp->b_rptr + sizeof (struct stroptions); + stropt = (struct stroptions *)mp->b_rptr; stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK; - stropt->so_hiwat = sopp_rxhiwat; - stropt->so_wroff = sopp_wroff; - stropt->so_maxblk = sopp_maxblk; + stropt->so_hiwat = sopp.sopp_rxhiwat; + stropt->so_wroff = sopp.sopp_wroff; + stropt->so_maxblk = sopp.sopp_maxblk; - if (sopp_flags & SOCKOPT_TAIL) { + if (sopp.sopp_flags & SOCKOPT_TAIL) { ASSERT(tcp->tcp_kssl_ctx != NULL); stropt->so_flags |= SO_TAIL | SO_COPYOPT; - stropt->so_tail = sopp_tail; - stropt->so_copyopt = sopp_copyopt; + stropt->so_tail = sopp.sopp_tail; + stropt->so_copyopt = sopp.sopp_zcopyflag; } /* Send the options up */ - putnext(q, stropt_mp); + putnext(q, mp); } - freemsg(mp); /* * Pass up any data and/or a fin that has been received. 
* @@ -17432,7 +14427,7 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) if (!tcp->tcp_fused && (*connp->conn_upcalls->su_recv) (connp->conn_upper_handle, NULL, 0, 0, &error, &push) >= 0) { - tcp->tcp_rwnd = tcp->tcp_recv_hiwater; + tcp->tcp_rwnd = connp->conn_rcvbuf; if (tcp->tcp_state >= TCPS_ESTABLISHED && tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) { tcp_xmit_ctl(NULL, @@ -17463,7 +14458,7 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) /* We drain directly in case of fused tcp loopback */ if (!tcp->tcp_fused && canputnext(q)) { - tcp->tcp_rwnd = tcp->tcp_recv_hiwater; + tcp->tcp_rwnd = connp->conn_rcvbuf; if (tcp->tcp_state >= TCPS_ESTABLISHED && tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) { tcp_xmit_ctl(NULL, @@ -17508,12 +14503,9 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) putnext(q, mp); } } - if (tcp->tcp_hard_binding) { - tcp->tcp_hard_binding = B_FALSE; - tcp->tcp_hard_bound = B_TRUE; - } + tcp->tcp_hard_binding = B_FALSE; - if (tcp->tcp_ka_enabled) { + if (connp->conn_keepalive) { tcp->tcp_ka_last_intrvl = 0; tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_killer, MSEC_TO_TICK(tcp->tcp_ka_interval)); @@ -17535,14 +14527,14 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2) /* * The function called through squeue to get behind listener's perimeter to - * send a deffered conn_ind. + * send a deferred conn_ind. */ /* ARGSUSED */ void -tcp_send_pending(void *arg, mblk_t *mp, void *arg2) +tcp_send_pending(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { - conn_t *connp = (conn_t *)arg; - tcp_t *listener = connp->conn_tcp; + conn_t *lconnp = (conn_t *)arg; + tcp_t *listener = lconnp->conn_tcp; struct T_conn_ind *conn_ind; tcp_t *tcp; @@ -17560,29 +14552,34 @@ tcp_send_pending(void *arg, mblk_t *mp, void *arg2) return; } - tcp_ulp_newconn(connp, tcp->tcp_connp, mp); + tcp_ulp_newconn(lconnp, tcp->tcp_connp, mp); } -/* ARGSUSED */ +/* + * Common to TPI and sockfs accept code. 
+ */ +/* ARGSUSED2 */ static int tcp_accept_common(conn_t *lconnp, conn_t *econnp, cred_t *cr) { tcp_t *listener, *eager; - mblk_t *opt_mp; - struct tcp_options *tcpopt; + mblk_t *discon_mp; listener = lconnp->conn_tcp; ASSERT(listener->tcp_state == TCPS_LISTEN); eager = econnp->conn_tcp; ASSERT(eager->tcp_listener != NULL); - ASSERT(eager->tcp_rq != NULL); + /* + * Pre allocate the discon_ind mblk also. tcp_accept_finish will + * use it if something failed. + */ + discon_mp = allocb(MAX(sizeof (struct T_discon_ind), + sizeof (struct stroptions)), BPRI_HI); - opt_mp = allocb(sizeof (struct tcp_options), BPRI_HI); - if (opt_mp == NULL) { + if (discon_mp == NULL) { return (-TPROTO); } - bzero((char *)opt_mp->b_rptr, sizeof (struct tcp_options)); eager->tcp_issocket = B_TRUE; econnp->conn_zoneid = listener->tcp_connp->conn_zoneid; @@ -17607,24 +14604,6 @@ tcp_accept_common(conn_t *lconnp, conn_t *econnp, cred_t *cr) */ ASSERT(econnp->conn_ref >= 3); - opt_mp->b_datap->db_type = M_SETOPTS; - opt_mp->b_wptr += sizeof (struct tcp_options); - - /* - * Prepare for inheriting IPV6_BOUND_IF and IPV6_RECVPKTINFO - * from listener to acceptor. 
- */ - tcpopt = (struct tcp_options *)opt_mp->b_rptr; - tcpopt->to_flags = 0; - - if (listener->tcp_bound_if != 0) { - tcpopt->to_flags |= TCPOPT_BOUNDIF; - tcpopt->to_boundif = listener->tcp_bound_if; - } - if (listener->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) { - tcpopt->to_flags |= TCPOPT_RECVPKTINFO; - } - mutex_enter(&listener->tcp_eager_lock); if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) { @@ -17686,7 +14665,7 @@ tcp_accept_common(conn_t *lconnp, conn_t *econnp, cred_t *cr) /* Need to get inside the listener perimeter */ CONN_INC_REF(listener->tcp_connp); SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp, mp1, - tcp_send_pending, listener->tcp_connp, SQ_FILL, + tcp_send_pending, listener->tcp_connp, NULL, SQ_FILL, SQTAG_TCP_SEND_PENDING); } no_more_eagers: @@ -17700,8 +14679,8 @@ no_more_eagers: * before sending the conn_ind in tcp_send_conn_ind. * The ref will be dropped in tcp_accept_finish(). */ - SQUEUE_ENTER_ONE(econnp->conn_sqp, opt_mp, tcp_accept_finish, - econnp, SQ_NODRAIN, SQTAG_TCP_ACCEPT_FINISH_Q0); + SQUEUE_ENTER_ONE(econnp->conn_sqp, discon_mp, tcp_accept_finish, + econnp, NULL, SQ_NODRAIN, SQTAG_TCP_ACCEPT_FINISH_Q0); return (0); } @@ -17712,7 +14691,6 @@ tcp_accept(sock_lower_handle_t lproto_handle, { conn_t *lconnp, *econnp; tcp_t *listener, *eager; - tcp_stack_t *tcps; lconnp = (conn_t *)lproto_handle; listener = lconnp->conn_tcp; @@ -17720,7 +14698,6 @@ tcp_accept(sock_lower_handle_t lproto_handle, econnp = (conn_t *)eproto_handle; eager = econnp->conn_tcp; ASSERT(eager->tcp_listener != NULL); - tcps = eager->tcp_tcps; /* * It is OK to manipulate these fields outside the eager's squeue @@ -17732,19 +14709,6 @@ tcp_accept(sock_lower_handle_t lproto_handle, econnp->conn_upper_handle = sock_handle; econnp->conn_upcalls = lconnp->conn_upcalls; ASSERT(IPCL_IS_NONSTR(econnp)); - /* - * Create helper stream if it is a non-TPI TCP connection. 
- */ - if (ip_create_helper_stream(econnp, tcps->tcps_ldi_ident)) { - ip1dbg(("tcp_accept: create of IP helper stream" - " failed\n")); - return (EPROTO); - } - eager->tcp_rq = econnp->conn_rq; - eager->tcp_wq = econnp->conn_wq; - - ASSERT(eager->tcp_rq != NULL); - return (tcp_accept_common(lconnp, econnp, cr)); } @@ -17752,7 +14716,7 @@ tcp_accept(sock_lower_handle_t lproto_handle, /* * This is the STREAMS entry point for T_CONN_RES coming down on * Acceptor STREAM when sockfs listener does accept processing. - * Read the block comment on top of tcp_conn_request(). + * Read the block comment on top of tcp_input_listener(). */ void tcp_tpi_accept(queue_t *q, mblk_t *mp) @@ -17815,8 +14779,8 @@ tcp_tpi_accept(queue_t *q, mblk_t *mp) econnp = eager->tcp_connp; econnp->conn_dev = (dev_t)RD(q)->q_ptr; econnp->conn_minor_arena = (vmem_t *)(WR(q)->q_ptr); - eager->tcp_rq = rq; - eager->tcp_wq = q; + econnp->conn_rq = rq; + econnp->conn_wq = q; rq->q_ptr = econnp; rq->q_qinfo = &tcp_rinitv4; /* No open - same as rinitv6 */ q->q_ptr = econnp; @@ -17836,7 +14800,7 @@ tcp_tpi_accept(queue_t *q, mblk_t *mp) * should already be enough space in the mp that came * down from soaccept(). 
*/ - if (eager->tcp_family == AF_INET) { + if (econnp->conn_family == AF_INET) { sin_t *sin; ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >= @@ -17844,8 +14808,8 @@ tcp_tpi_accept(queue_t *q, mblk_t *mp) sin = (sin_t *)mp->b_wptr; mp->b_wptr += sizeof (sin_t); sin->sin_family = AF_INET; - sin->sin_port = eager->tcp_lport; - sin->sin_addr.s_addr = eager->tcp_ipha->ipha_src; + sin->sin_port = econnp->conn_lport; + sin->sin_addr.s_addr = econnp->conn_laddr_v4; } else { sin6_t *sin6; @@ -17854,20 +14818,23 @@ tcp_tpi_accept(queue_t *q, mblk_t *mp) sin6 = (sin6_t *)mp->b_wptr; mp->b_wptr += sizeof (sin6_t); sin6->sin6_family = AF_INET6; - sin6->sin6_port = eager->tcp_lport; - if (eager->tcp_ipversion == IPV4_VERSION) { + sin6->sin6_port = econnp->conn_lport; + sin6->sin6_addr = econnp->conn_laddr_v6; + if (econnp->conn_ipversion == IPV4_VERSION) { sin6->sin6_flowinfo = 0; - IN6_IPADDR_TO_V4MAPPED( - eager->tcp_ipha->ipha_src, - &sin6->sin6_addr); } else { ASSERT(eager->tcp_ip6h != NULL); sin6->sin6_flowinfo = eager->tcp_ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; - sin6->sin6_addr = eager->tcp_ip6h->ip6_src; } - sin6->sin6_scope_id = 0; + if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) && + (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) { + sin6->sin6_scope_id = + econnp->conn_ixa->ixa_scopeid; + } else { + sin6->sin6_scope_id = 0; + } sin6->__sin6_src_id = 0; } @@ -17881,97 +14848,6 @@ tcp_tpi_accept(queue_t *q, mblk_t *mp) } } -static int -tcp_do_getsockname(tcp_t *tcp, struct sockaddr *sa, uint_t *salenp) -{ - sin_t *sin = (sin_t *)sa; - sin6_t *sin6 = (sin6_t *)sa; - - switch (tcp->tcp_family) { - case AF_INET: - ASSERT(tcp->tcp_ipversion == IPV4_VERSION); - - if (*salenp < sizeof (sin_t)) - return (EINVAL); - - *sin = sin_null; - sin->sin_family = AF_INET; - if (tcp->tcp_state >= TCPS_BOUND) { - sin->sin_port = tcp->tcp_lport; - sin->sin_addr.s_addr = tcp->tcp_ipha->ipha_src; - } - *salenp = sizeof (sin_t); - break; - - case AF_INET6: - if (*salenp < 
sizeof (sin6_t)) - return (EINVAL); - - *sin6 = sin6_null; - sin6->sin6_family = AF_INET6; - if (tcp->tcp_state >= TCPS_BOUND) { - sin6->sin6_port = tcp->tcp_lport; - mutex_enter(&tcp->tcp_connp->conn_lock); - if (tcp->tcp_ipversion == IPV4_VERSION) { - IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, - &sin6->sin6_addr); - } else { - sin6->sin6_addr = tcp->tcp_ip6h->ip6_src; - } - mutex_exit(&tcp->tcp_connp->conn_lock); - } - *salenp = sizeof (sin6_t); - break; - } - - return (0); -} - -static int -tcp_do_getpeername(tcp_t *tcp, struct sockaddr *sa, uint_t *salenp) -{ - sin_t *sin = (sin_t *)sa; - sin6_t *sin6 = (sin6_t *)sa; - - if (tcp->tcp_state < TCPS_SYN_RCVD) - return (ENOTCONN); - - switch (tcp->tcp_family) { - case AF_INET: - ASSERT(tcp->tcp_ipversion == IPV4_VERSION); - - if (*salenp < sizeof (sin_t)) - return (EINVAL); - - *sin = sin_null; - sin->sin_family = AF_INET; - sin->sin_port = tcp->tcp_fport; - IN6_V4MAPPED_TO_IPADDR(&tcp->tcp_remote_v6, - sin->sin_addr.s_addr); - *salenp = sizeof (sin_t); - break; - - case AF_INET6: - if (*salenp < sizeof (sin6_t)) - return (EINVAL); - - *sin6 = sin6_null; - sin6->sin6_family = AF_INET6; - sin6->sin6_port = tcp->tcp_fport; - sin6->sin6_addr = tcp->tcp_remote_v6; - mutex_enter(&tcp->tcp_connp->conn_lock); - if (tcp->tcp_ipversion == IPV6_VERSION) { - sin6->sin6_flowinfo = tcp->tcp_ip6h->ip6_vcf & - ~IPV6_VERS_AND_FLOW_MASK; - } - mutex_exit(&tcp->tcp_connp->conn_lock); - *salenp = sizeof (sin6_t); - break; - } - - return (0); -} - /* * Handle special out-of-band ioctl requests (see PSARC/2008/265). 
*/ @@ -17980,7 +14856,8 @@ tcp_wput_cmdblk(queue_t *q, mblk_t *mp) { void *data; mblk_t *datamp = mp->b_cont; - tcp_t *tcp = Q_TO_TCP(q); + conn_t *connp = Q_TO_CONN(q); + tcp_t *tcp = connp->conn_tcp; cmdblk_t *cmdp = (cmdblk_t *)mp->b_rptr; if (datamp == NULL || MBLKL(datamp) < cmdp->cb_len) { @@ -17993,10 +14870,14 @@ tcp_wput_cmdblk(queue_t *q, mblk_t *mp) switch (cmdp->cb_cmd) { case TI_GETPEERNAME: - cmdp->cb_error = tcp_do_getpeername(tcp, data, &cmdp->cb_len); + if (tcp->tcp_state < TCPS_SYN_RCVD) + cmdp->cb_error = ENOTCONN; + else + cmdp->cb_error = conn_getpeername(connp, data, + &cmdp->cb_len); break; case TI_GETMYNAME: - cmdp->cb_error = tcp_do_getsockname(tcp, data, &cmdp->cb_len); + cmdp->cb_error = conn_getsockname(connp, data, &cmdp->cb_len); break; default: cmdp->cb_error = EINVAL; @@ -18029,14 +14910,14 @@ tcp_wput(queue_t *q, mblk_t *mp) mutex_enter(&tcp->tcp_non_sq_lock); tcp->tcp_squeue_bytes += size; - if (TCP_UNSENT_BYTES(tcp) > tcp->tcp_xmit_hiwater) { + if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) { tcp_setqfull(tcp); } mutex_exit(&tcp->tcp_non_sq_lock); CONN_INC_REF(connp); SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output, connp, - tcp_squeue_flag, SQTAG_TCP_OUTPUT); + NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT); return; case M_CMD: @@ -18053,7 +14934,7 @@ tcp_wput(queue_t *q, mblk_t *mp) if ((mp->b_wptr - rptr) >= sizeof (t_scalar_t)) { type = ((union T_primitives *)rptr)->type; } else { - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_wput_proto, dropping one..."); @@ -18093,7 +14974,7 @@ tcp_wput(queue_t *q, mblk_t *mp) /* * Most ioctls can be processed right away without going via * squeues - process them right here. Those that do require - * squeue (currently TCP_IOC_DEFAULT_Q and _SIOCSOCKFALLBACK) + * squeue (currently _SIOCSOCKFALLBACK) * are processed by tcp_wput_ioctl(). 
*/ iocp = (struct iocblk *)mp->b_rptr; @@ -18111,26 +14992,13 @@ tcp_wput(queue_t *q, mblk_t *mp) case ND_SET: /* nd_getset does the necessary checks */ case ND_GET: - if (!nd_getset(q, tcps->tcps_g_nd, mp)) { - CALL_IP_WPUT(connp, q, mp); - return; - } - qreply(q, mp); - return; - case TCP_IOC_DEFAULT_Q: - /* - * Wants to be the default wq. Check the credentials - * first, the rest is executed via squeue. - */ - if (secpolicy_ip_config(iocp->ioc_cr, B_FALSE) != 0) { - iocp->ioc_error = EPERM; - iocp->ioc_count = 0; - mp->b_datap->db_type = M_IOCACK; + if (nd_getset(q, tcps->tcps_g_nd, mp)) { qreply(q, mp); return; } - output_proc = tcp_wput_ioctl; - break; + ip_wput_nondata(q, mp); + return; + default: output_proc = tcp_wput_ioctl; break; @@ -18143,7 +15011,7 @@ tcp_wput(queue_t *q, mblk_t *mp) CONN_INC_REF(connp); SQUEUE_ENTER_ONE(connp->conn_sqp, mp, output_proc, connp, - tcp_squeue_flag, SQTAG_TCP_WPUT_OTHER); + NULL, tcp_squeue_flag, SQTAG_TCP_WPUT_OTHER); } /* @@ -18188,52 +15056,32 @@ tcp_wput_fallback(queue_t *wq, mblk_t *mp) freemsg(mp); } +/* + * Check the usability of ZEROCOPY. It's instead checking the flag set by IP. + */ static boolean_t tcp_zcopy_check(tcp_t *tcp) { - conn_t *connp = tcp->tcp_connp; - ire_t *ire; + conn_t *connp = tcp->tcp_connp; + ip_xmit_attr_t *ixa = connp->conn_ixa; boolean_t zc_enabled = B_FALSE; tcp_stack_t *tcps = tcp->tcp_tcps; if (do_tcpzcopy == 2) zc_enabled = B_TRUE; - else if (tcp->tcp_ipversion == IPV4_VERSION && - IPCL_IS_CONNECTED(connp) && - (connp->conn_flags & IPCL_CHECK_POLICY) == 0 && - connp->conn_dontroute == 0 && - !connp->conn_nexthop_set && - connp->conn_outgoing_ill == NULL && - do_tcpzcopy == 1) { - /* - * the checks above closely resemble the fast path checks - * in tcp_send_data(). 
- */ - mutex_enter(&connp->conn_lock); - ire = connp->conn_ire_cache; - ASSERT(!(connp->conn_state_flags & CONN_INCIPIENT)); - if (ire != NULL && !(ire->ire_marks & IRE_MARK_CONDEMNED)) { - IRE_REFHOLD(ire); - if (ire->ire_stq != NULL) { - ill_t *ill = (ill_t *)ire->ire_stq->q_ptr; - - zc_enabled = ill && (ill->ill_capabilities & - ILL_CAPAB_ZEROCOPY) && - (ill->ill_zerocopy_capab-> - ill_zerocopy_flags != 0); - } - IRE_REFRELE(ire); - } - mutex_exit(&connp->conn_lock); - } + else if ((do_tcpzcopy == 1) && (ixa->ixa_flags & IXAF_ZCOPY_CAPAB)) + zc_enabled = B_TRUE; + tcp->tcp_snd_zcopy_on = zc_enabled; if (!TCP_IS_DETACHED(tcp)) { if (zc_enabled) { - (void) proto_set_tx_copyopt(tcp->tcp_rq, connp, + ixa->ixa_flags |= IXAF_VERIFY_ZCOPY; + (void) proto_set_tx_copyopt(connp->conn_rq, connp, ZCVMSAFE); TCP_STAT(tcps, tcp_zcopy_on); } else { - (void) proto_set_tx_copyopt(tcp->tcp_rq, connp, + ixa->ixa_flags &= ~IXAF_VERIFY_ZCOPY; + (void) proto_set_tx_copyopt(connp->conn_rq, connp, ZCVMUNSAFE); TCP_STAT(tcps, tcp_zcopy_off); } @@ -18241,99 +15089,84 @@ tcp_zcopy_check(tcp_t *tcp) return (zc_enabled); } -static mblk_t * -tcp_zcopy_disable(tcp_t *tcp, mblk_t *bp) -{ - tcp_stack_t *tcps = tcp->tcp_tcps; - - if (do_tcpzcopy == 2) - return (bp); - else if (tcp->tcp_snd_zcopy_on) { - tcp->tcp_snd_zcopy_on = B_FALSE; - if (!TCP_IS_DETACHED(tcp)) { - (void) proto_set_tx_copyopt(tcp->tcp_rq, tcp->tcp_connp, - ZCVMUNSAFE); - TCP_STAT(tcps, tcp_zcopy_disable); - } - } - return (tcp_zcopy_backoff(tcp, bp, 0)); -} - /* - * Backoff from a zero-copy mblk by copying data to a new mblk and freeing - * the original desballoca'ed segmapped mblk. + * Backoff from a zero-copy message by copying data to a new allocated + * message and freeing the original desballoca'ed segmapped message. + * + * This function is called by following two callers: + * 1. tcp_timer: fix_xmitlist is set to B_TRUE, because it's safe to free + * the origial desballoca'ed message and notify sockfs. 
This is in re- + * transmit state. + * 2. tcp_output: fix_xmitlist is set to B_FALSE. Flag STRUIO_ZCNOTIFY need + * to be copied to new message. */ static mblk_t * -tcp_zcopy_backoff(tcp_t *tcp, mblk_t *bp, int fix_xmitlist) +tcp_zcopy_backoff(tcp_t *tcp, mblk_t *bp, boolean_t fix_xmitlist) { - mblk_t *head, *tail, *nbp; + mblk_t *nbp; + mblk_t *head = NULL; + mblk_t *tail = NULL; tcp_stack_t *tcps = tcp->tcp_tcps; - if (IS_VMLOANED_MBLK(bp)) { - TCP_STAT(tcps, tcp_zcopy_backoff); - if ((head = copyb(bp)) == NULL) { - /* fail to backoff; leave it for the next backoff */ - tcp->tcp_xmit_zc_clean = B_FALSE; - return (bp); - } - if (bp->b_datap->db_struioflag & STRUIO_ZCNOTIFY) { - if (fix_xmitlist) - tcp_zcopy_notify(tcp); - else - head->b_datap->db_struioflag |= STRUIO_ZCNOTIFY; - } - nbp = bp->b_cont; - if (fix_xmitlist) { - head->b_prev = bp->b_prev; - head->b_next = bp->b_next; - if (tcp->tcp_xmit_tail == bp) - tcp->tcp_xmit_tail = head; - } - bp->b_next = NULL; - bp->b_prev = NULL; - freeb(bp); - } else { - head = bp; - nbp = bp->b_cont; - } - tail = head; - while (nbp) { - if (IS_VMLOANED_MBLK(nbp)) { + ASSERT(bp != NULL); + while (bp != NULL) { + if (IS_VMLOANED_MBLK(bp)) { TCP_STAT(tcps, tcp_zcopy_backoff); - if ((tail->b_cont = copyb(nbp)) == NULL) { + if ((nbp = copyb(bp)) == NULL) { tcp->tcp_xmit_zc_clean = B_FALSE; - tail->b_cont = nbp; - return (head); + if (tail != NULL) + tail->b_cont = bp; + return ((head == NULL) ? bp : head); } - tail = tail->b_cont; - if (nbp->b_datap->db_struioflag & STRUIO_ZCNOTIFY) { + + if (bp->b_datap->db_struioflag & STRUIO_ZCNOTIFY) { if (fix_xmitlist) tcp_zcopy_notify(tcp); else - tail->b_datap->db_struioflag |= + nbp->b_datap->db_struioflag |= STRUIO_ZCNOTIFY; } - bp = nbp; - nbp = nbp->b_cont; + nbp->b_cont = bp->b_cont; + + /* + * Copy saved information and adjust tcp_xmit_tail + * if needed. 
+ */ if (fix_xmitlist) { - tail->b_prev = bp->b_prev; - tail->b_next = bp->b_next; + nbp->b_prev = bp->b_prev; + nbp->b_next = bp->b_next; + if (tcp->tcp_xmit_tail == bp) - tcp->tcp_xmit_tail = tail; + tcp->tcp_xmit_tail = nbp; } - bp->b_next = NULL; + + /* Free the original message. */ bp->b_prev = NULL; + bp->b_next = NULL; freeb(bp); + + bp = nbp; + } + + if (head == NULL) { + head = bp; + } + if (tail == NULL) { + tail = bp; } else { - tail->b_cont = nbp; - tail = nbp; - nbp = nbp->b_cont; + tail->b_cont = bp; + tail = bp; } + + /* Move forward. */ + bp = bp->b_cont; } + if (fix_xmitlist) { tcp->tcp_xmit_last = tail; tcp->tcp_xmit_zc_clean = B_TRUE; } + return (head); } @@ -18341,7 +15174,7 @@ static void tcp_zcopy_notify(tcp_t *tcp) { struct stdata *stp; - conn_t *connp; + conn_t *connp; if (tcp->tcp_detached) return; @@ -18351,323 +15184,149 @@ tcp_zcopy_notify(tcp_t *tcp) (connp->conn_upper_handle); return; } - stp = STREAM(tcp->tcp_rq); + stp = STREAM(connp->conn_rq); mutex_enter(&stp->sd_lock); stp->sd_flag |= STZCNOTIFY; cv_broadcast(&stp->sd_zcopy_wait); mutex_exit(&stp->sd_lock); } -static boolean_t -tcp_send_find_ire(tcp_t *tcp, ipaddr_t *dst, ire_t **irep) +/* + * Update the TCP connection according to change of LSO capability. 
+ */ +static void +tcp_update_lso(tcp_t *tcp, ip_xmit_attr_t *ixa) { - ire_t *ire; - conn_t *connp = tcp->tcp_connp; - tcp_stack_t *tcps = tcp->tcp_tcps; - ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; - - mutex_enter(&connp->conn_lock); - ire = connp->conn_ire_cache; - ASSERT(!(connp->conn_state_flags & CONN_INCIPIENT)); - - if ((ire != NULL) && - (((dst != NULL) && (ire->ire_addr == *dst)) || ((dst == NULL) && - IN6_ARE_ADDR_EQUAL(&ire->ire_addr_v6, &tcp->tcp_ip6h->ip6_dst))) && - !(ire->ire_marks & IRE_MARK_CONDEMNED)) { - IRE_REFHOLD(ire); - mutex_exit(&connp->conn_lock); - } else { - boolean_t cached = B_FALSE; - ts_label_t *tsl; - - /* force a recheck later on */ - tcp->tcp_ire_ill_check_done = B_FALSE; - - TCP_DBGSTAT(tcps, tcp_ire_null1); - connp->conn_ire_cache = NULL; - mutex_exit(&connp->conn_lock); - - if (ire != NULL) - IRE_REFRELE_NOTR(ire); - - tsl = crgetlabel(CONN_CRED(connp)); - ire = (dst ? - ire_cache_lookup(*dst, connp->conn_zoneid, tsl, ipst) : - ire_cache_lookup_v6(&tcp->tcp_ip6h->ip6_dst, - connp->conn_zoneid, tsl, ipst)); + /* + * We check against IPv4 header length to preserve the old behavior + * of only enabling LSO when there are no IP options. + * But this restriction might not be necessary at all. Before removing + * it, need to verify how LSO is handled for source routing case, with + * which IP does software checksum. + * + * For IPv6, whenever any extension header is needed, LSO is supressed. + */ + if (ixa->ixa_ip_hdr_length != ((ixa->ixa_flags & IXAF_IS_IPV4) ? + IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN)) + return; - if (ire == NULL) { - TCP_STAT(tcps, tcp_ire_null); - return (B_FALSE); - } + /* + * Either the LSO capability newly became usable, or it has changed. 
+ */ + if (ixa->ixa_flags & IXAF_LSO_CAPAB) { + ill_lso_capab_t *lsoc = &ixa->ixa_lso_capab; - IRE_REFHOLD_NOTR(ire); + ASSERT(lsoc->ill_lso_max > 0); + tcp->tcp_lso_max = MIN(TCP_MAX_LSO_LENGTH, lsoc->ill_lso_max); - mutex_enter(&connp->conn_lock); - if (CONN_CACHE_IRE(connp)) { - rw_enter(&ire->ire_bucket->irb_lock, RW_READER); - if (!(ire->ire_marks & IRE_MARK_CONDEMNED)) { - TCP_CHECK_IREINFO(tcp, ire); - connp->conn_ire_cache = ire; - cached = B_TRUE; - } - rw_exit(&ire->ire_bucket->irb_lock); - } - mutex_exit(&connp->conn_lock); + DTRACE_PROBE3(tcp_update_lso, boolean_t, tcp->tcp_lso, + boolean_t, B_TRUE, uint32_t, tcp->tcp_lso_max); /* - * We can continue to use the ire but since it was - * not cached, we should drop the extra reference. + * If LSO to be enabled, notify the STREAM header with larger + * data block. */ - if (!cached) - IRE_REFRELE_NOTR(ire); + if (!tcp->tcp_lso) + tcp->tcp_maxpsz_multiplier = 0; + + tcp->tcp_lso = B_TRUE; + TCP_STAT(tcp->tcp_tcps, tcp_lso_enabled); + } else { /* LSO capability is not usable any more. */ + DTRACE_PROBE3(tcp_update_lso, boolean_t, tcp->tcp_lso, + boolean_t, B_FALSE, uint32_t, tcp->tcp_lso_max); /* - * Rampart note: no need to select a new label here, since - * labels are not allowed to change during the life of a TCP - * connection. + * If LSO to be disabled, notify the STREAM header with smaller + * data block. And need to restore fragsize to PMTU. */ + if (tcp->tcp_lso) { + tcp->tcp_maxpsz_multiplier = + tcp->tcp_tcps->tcps_maxpsz_multiplier; + ixa->ixa_fragsize = ixa->ixa_pmtu; + tcp->tcp_lso = B_FALSE; + TCP_STAT(tcp->tcp_tcps, tcp_lso_disabled); + } } - *irep = ire; - - return (B_TRUE); + (void) tcp_maxpsz_set(tcp, B_TRUE); } /* - * Called from tcp_send() or tcp_send_data() to find workable IRE. - * - * 0 = success; - * 1 = failed to find ire and ill. + * Update the TCP connection according to change of ZEROCOPY capability. 
*/ -static boolean_t -tcp_send_find_ire_ill(tcp_t *tcp, mblk_t *mp, ire_t **irep, ill_t **illp) +static void +tcp_update_zcopy(tcp_t *tcp) { - ipha_t *ipha; - ipaddr_t dst; - ire_t *ire; - ill_t *ill; - mblk_t *ire_fp_mp; + conn_t *connp = tcp->tcp_connp; tcp_stack_t *tcps = tcp->tcp_tcps; - if (mp != NULL) - ipha = (ipha_t *)mp->b_rptr; - else - ipha = tcp->tcp_ipha; - dst = ipha->ipha_dst; - - if (!tcp_send_find_ire(tcp, &dst, &ire)) - return (B_FALSE); - - if ((ire->ire_flags & RTF_MULTIRT) || - (ire->ire_stq == NULL) || - (ire->ire_nce == NULL) || - ((ire_fp_mp = ire->ire_nce->nce_fp_mp) == NULL) || - ((mp != NULL) && (ire->ire_max_frag < ntohs(ipha->ipha_length) || - MBLKL(ire_fp_mp) > MBLKHEAD(mp)))) { - TCP_STAT(tcps, tcp_ip_ire_send); - IRE_REFRELE(ire); - return (B_FALSE); + if (tcp->tcp_snd_zcopy_on) { + tcp->tcp_snd_zcopy_on = B_FALSE; + if (!TCP_IS_DETACHED(tcp)) { + (void) proto_set_tx_copyopt(connp->conn_rq, connp, + ZCVMUNSAFE); + TCP_STAT(tcps, tcp_zcopy_off); + } + } else { + tcp->tcp_snd_zcopy_on = B_TRUE; + if (!TCP_IS_DETACHED(tcp)) { + (void) proto_set_tx_copyopt(connp->conn_rq, connp, + ZCVMSAFE); + TCP_STAT(tcps, tcp_zcopy_on); + } } +} - ill = ire_to_ill(ire); - ASSERT(ill != NULL); +/* + * Notify function registered with ip_xmit_attr_t. It's called in the squeue + * so it's safe to update the TCP connection. 
+ */ +/* ARGSUSED1 */ +static void +tcp_notify(void *arg, ip_xmit_attr_t *ixa, ixa_notify_type_t ntype, + ixa_notify_arg_t narg) +{ + tcp_t *tcp = (tcp_t *)arg; + conn_t *connp = tcp->tcp_connp; - if (!tcp->tcp_ire_ill_check_done) { - tcp_ire_ill_check(tcp, ire, ill, B_TRUE); - tcp->tcp_ire_ill_check_done = B_TRUE; + switch (ntype) { + case IXAN_LSO: + tcp_update_lso(tcp, connp->conn_ixa); + break; + case IXAN_PMTU: + tcp_update_pmtu(tcp, B_FALSE); + break; + case IXAN_ZCOPY: + tcp_update_zcopy(tcp); + break; + default: + break; } - - *irep = ire; - *illp = ill; - - return (B_TRUE); } static void -tcp_send_data(tcp_t *tcp, queue_t *q, mblk_t *mp) +tcp_send_data(tcp_t *tcp, mblk_t *mp) { - ipha_t *ipha; - ipaddr_t src; - ipaddr_t dst; - uint32_t cksum; - ire_t *ire; - uint16_t *up; - ill_t *ill; conn_t *connp = tcp->tcp_connp; - uint32_t hcksum_txflags = 0; - mblk_t *ire_fp_mp; - uint_t ire_fp_mp_len; - tcp_stack_t *tcps = tcp->tcp_tcps; - ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; - cred_t *cr; - pid_t cpid; - - ASSERT(DB_TYPE(mp) == M_DATA); /* - * Here we need to handle the overloading of the cred_t for - * both getpeerucred and TX. - * If this is a SYN then the caller already set db_credp so - * that getpeerucred will work. But if TX is in use we might have - * a conn_effective_cred which is different, and we need to use that - * cred to make TX use the correct label and label dependent route. + * Check here to avoid sending zero-copy message down to IP when + * ZEROCOPY capability has turned off. We only need to deal with + * the race condition between sockfs and the notification here. + * Since we have tried to backoff the tcp_xmit_head when turning + * zero-copy off and new messages in tcp_output(), we simply drop + * the dup'ed packet here and let tcp retransmit, if tcp_xmit_zc_clean + * is not true. 
*/ - if (is_system_labeled()) { - cr = msg_getcred(mp, &cpid); - if (cr == NULL || connp->conn_effective_cred != NULL) - mblk_setcred(mp, CONN_CRED(connp), cpid); - } - - ipha = (ipha_t *)mp->b_rptr; - src = ipha->ipha_src; - dst = ipha->ipha_dst; - - ASSERT(q != NULL); - DTRACE_PROBE2(tcp__trace__send, mblk_t *, mp, tcp_t *, tcp); - - /* - * Drop off fast path for IPv6 and also if options are present or - * we need to resolve a TS label. - */ - if (tcp->tcp_ipversion != IPV4_VERSION || - !IPCL_IS_CONNECTED(connp) || - !CONN_IS_LSO_MD_FASTPATH(connp) || - (connp->conn_flags & IPCL_CHECK_POLICY) != 0 || - !connp->conn_ulp_labeled || - ipha->ipha_ident == IP_HDR_INCLUDED || - ipha->ipha_version_and_hdr_length != IP_SIMPLE_HDR_VERSION || - IPP_ENABLED(IPP_LOCAL_OUT, ipst)) { - if (tcp->tcp_snd_zcopy_aware) - mp = tcp_zcopy_disable(tcp, mp); - TCP_STAT(tcps, tcp_ip_send); - CALL_IP_WPUT(connp, q, mp); - return; - } - - if (!tcp_send_find_ire_ill(tcp, mp, &ire, &ill)) { - if (tcp->tcp_snd_zcopy_aware) - mp = tcp_zcopy_backoff(tcp, mp, 0); - CALL_IP_WPUT(connp, q, mp); + if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_snd_zcopy_on && + !tcp->tcp_xmit_zc_clean) { + ip_drop_output("TCP ZC was disabled but not clean", mp, NULL); + freemsg(mp); return; } - ire_fp_mp = ire->ire_nce->nce_fp_mp; - ire_fp_mp_len = MBLKL(ire_fp_mp); - - ASSERT(ipha->ipha_ident == 0 || ipha->ipha_ident == IP_HDR_INCLUDED); - ipha->ipha_ident = (uint16_t)atomic_add_32_nv(&ire->ire_ident, 1); -#ifndef _BIG_ENDIAN - ipha->ipha_ident = (ipha->ipha_ident << 8) | (ipha->ipha_ident >> 8); -#endif - - /* - * Check to see if we need to re-enable LSO/MDT for this connection - * because it was previously disabled due to changes in the ill; - * note that by doing it here, this re-enabling only applies when - * the packet is not dispatched through CALL_IP_WPUT(). - * - * That means for IPv4, it is worth re-enabling LSO/MDT for the fastpath - * case, since that's how we ended up here. 
For IPv6, we do the - * re-enabling work in ip_xmit_v6(), albeit indirectly via squeue. - */ - if (connp->conn_lso_ok && !tcp->tcp_lso && ILL_LSO_TCP_USABLE(ill)) { - /* - * Restore LSO for this connection, so that next time around - * it is eligible to go through tcp_lsosend() path again. - */ - TCP_STAT(tcps, tcp_lso_enabled); - tcp->tcp_lso = B_TRUE; - ip1dbg(("tcp_send_data: reenabling LSO for connp %p on " - "interface %s\n", (void *)connp, ill->ill_name)); - } else if (connp->conn_mdt_ok && !tcp->tcp_mdt && ILL_MDT_USABLE(ill)) { - /* - * Restore MDT for this connection, so that next time around - * it is eligible to go through tcp_multisend() path again. - */ - TCP_STAT(tcps, tcp_mdt_conn_resumed1); - tcp->tcp_mdt = B_TRUE; - ip1dbg(("tcp_send_data: reenabling MDT for connp %p on " - "interface %s\n", (void *)connp, ill->ill_name)); - } - - if (tcp->tcp_snd_zcopy_aware) { - if ((ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) == 0 || - (ill->ill_zerocopy_capab->ill_zerocopy_flags == 0)) - mp = tcp_zcopy_disable(tcp, mp); - /* - * we shouldn't need to reset ipha as the mp containing - * ipha should never be a zero-copy mp. - */ - } - - if (ILL_HCKSUM_CAPABLE(ill) && dohwcksum) { - ASSERT(ill->ill_hcksum_capab != NULL); - hcksum_txflags = ill->ill_hcksum_capab->ill_hcksum_txflags; - } - - /* pseudo-header checksum (do it in parts for IP header checksum) */ - cksum = (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF); - - ASSERT(ipha->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION); - up = IPH_TCPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH); - - IP_CKSUM_XMIT_FAST(ire->ire_ipversion, hcksum_txflags, mp, ipha, up, - IPPROTO_TCP, IP_SIMPLE_HDR_LENGTH, ntohs(ipha->ipha_length), cksum); - - /* Software checksum? 
*/ - if (DB_CKSUMFLAGS(mp) == 0) { - TCP_STAT(tcps, tcp_out_sw_cksum); - TCP_STAT_UPDATE(tcps, tcp_out_sw_cksum_bytes, - ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH); - } - - /* Calculate IP header checksum if hardware isn't capable */ - if (!(DB_CKSUMFLAGS(mp) & HCK_IPV4_HDRCKSUM)) { - IP_HDR_CKSUM(ipha, cksum, ((uint32_t *)ipha)[0], - ((uint16_t *)ipha)[4]); - } - ASSERT(DB_TYPE(ire_fp_mp) == M_DATA); - mp->b_rptr = (uchar_t *)ipha - ire_fp_mp_len; - bcopy(ire_fp_mp->b_rptr, mp->b_rptr, ire_fp_mp_len); - - UPDATE_OB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits); - UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets, - ntohs(ipha->ipha_length)); - - DTRACE_PROBE4(ip4__physical__out__start, - ill_t *, NULL, ill_t *, ill, ipha_t *, ipha, mblk_t *, mp); - FW_HOOKS(ipst->ips_ip4_physical_out_event, - ipst->ips_ipv4firewall_physical_out, - NULL, ill, ipha, mp, mp, 0, ipst); - DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); - DTRACE_IP_FASTPATH(mp, ipha, ill, ipha, NULL); - - if (mp != NULL) { - if (ipst->ips_ip4_observe.he_interested) { - zoneid_t szone; - - /* - * Both of these functions expect b_rptr to be - * where the IP header starts, so advance past the - * link layer header if present. 
- */ - mp->b_rptr += ire_fp_mp_len; - szone = ip_get_zoneid_v4(ipha->ipha_src, mp, - ipst, ALL_ZONES); - ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, - ALL_ZONES, ill, ipst); - mp->b_rptr -= ire_fp_mp_len; - } - - ILL_SEND_TX(ill, ire, connp, mp, 0, NULL); - } - - IRE_REFRELE(ire); + ASSERT(connp->conn_ixa->ixa_notify_cookie == connp->conn_tcp); + (void) conn_ip_output(mp, connp->conn_ixa); } /* @@ -18731,15 +15390,13 @@ tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent) int tcpstate; int usable = 0; mblk_t *xmit_tail; - queue_t *q = tcp->tcp_wq; int32_t mss; int32_t num_sack_blk = 0; + int32_t total_hdr_len; int32_t tcp_hdr_len; - int32_t tcp_tcp_hdr_len; - int mdt_thres; int rc; tcp_stack_t *tcps = tcp->tcp_tcps; - ip_stack_t *ipst; + conn_t *connp = tcp->tcp_connp; tcpstate = tcp->tcp_state; if (mp == NULL) { @@ -18771,7 +15428,7 @@ tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent) tcp_display(tcp, NULL, DISP_ADDR_AND_PORT)); #else - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, "tcp_wput_data: data after ordrel, %s\n", @@ -18781,12 +15438,12 @@ tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent) #endif /* DEBUG */ } if (tcp->tcp_snd_zcopy_aware && - (mp->b_datap->db_struioflag & STRUIO_ZCNOTIFY) != 0) + (mp->b_datap->db_struioflag & STRUIO_ZCNOTIFY)) tcp_zcopy_notify(tcp); freemsg(mp); mutex_enter(&tcp->tcp_non_sq_lock); if (tcp->tcp_flow_stopped && - TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) { + TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) { tcp_clrqfull(tcp); } mutex_exit(&tcp->tcp_non_sq_lock); @@ -18886,12 +15543,12 @@ data_null: opt_len = num_sack_blk * sizeof (sack_blk_t) + TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN; mss = tcp->tcp_mss - opt_len; - tcp_hdr_len = tcp->tcp_hdr_len + opt_len; - tcp_tcp_hdr_len = tcp->tcp_tcp_hdr_len + opt_len; + total_hdr_len = connp->conn_ht_iphc_len + opt_len; + tcp_hdr_len = connp->conn_ht_ulp_len + opt_len; } else { mss = tcp->tcp_mss; - tcp_hdr_len = 
tcp->tcp_hdr_len; - tcp_tcp_hdr_len = tcp->tcp_tcp_hdr_len; + total_hdr_len = connp->conn_ht_iphc_len; + tcp_hdr_len = connp->conn_ht_ulp_len; } if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet && @@ -18913,7 +15570,7 @@ data_null: * In the special case when cwnd is zero, which can only * happen if the connection is ECN capable, return now. * New segments is sent using tcp_timer(). The timer - * is set in tcp_rput_data(). + * is set in tcp_input_data(). */ if (tcp->tcp_cwnd == 0) { /* @@ -19023,66 +15680,12 @@ data_null: } /* Update the latest receive window size in TCP header. */ - U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, - tcp->tcp_tcph->th_win); + tcp->tcp_tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); - /* - * Determine if it's worthwhile to attempt LSO or MDT, based on: - * - * 1. Simple TCP/IP{v4,v6} (no options). - * 2. IPSEC/IPQoS processing is not needed for the TCP connection. - * 3. If the TCP connection is in ESTABLISHED state. - * 4. The TCP is not detached. - * - * If any of the above conditions have changed during the - * connection, stop using LSO/MDT and restore the stream head - * parameters accordingly. 
- */ - ipst = tcps->tcps_netstack->netstack_ip; - - if ((tcp->tcp_lso || tcp->tcp_mdt) && - ((tcp->tcp_ipversion == IPV4_VERSION && - tcp->tcp_ip_hdr_len != IP_SIMPLE_HDR_LENGTH) || - (tcp->tcp_ipversion == IPV6_VERSION && - tcp->tcp_ip_hdr_len != IPV6_HDR_LEN) || - tcp->tcp_state != TCPS_ESTABLISHED || - TCP_IS_DETACHED(tcp) || !CONN_IS_LSO_MD_FASTPATH(tcp->tcp_connp) || - CONN_IPSEC_OUT_ENCAPSULATED(tcp->tcp_connp) || - IPP_ENABLED(IPP_LOCAL_OUT, ipst))) { - if (tcp->tcp_lso) { - tcp->tcp_connp->conn_lso_ok = B_FALSE; - tcp->tcp_lso = B_FALSE; - } else { - tcp->tcp_connp->conn_mdt_ok = B_FALSE; - tcp->tcp_mdt = B_FALSE; - } - - /* Anything other than detached is considered pathological */ - if (!TCP_IS_DETACHED(tcp)) { - if (tcp->tcp_lso) - TCP_STAT(tcps, tcp_lso_disabled); - else - TCP_STAT(tcps, tcp_mdt_conn_halted1); - (void) tcp_maxpsz_set(tcp, B_TRUE); - } - } - - /* Use MDT if sendable amount is greater than the threshold */ - if (tcp->tcp_mdt && - (mdt_thres = mss << tcp_mdt_smss_threshold, usable > mdt_thres) && - (tail_unsent > mdt_thres || (xmit_tail->b_cont != NULL && - MBLKL(xmit_tail->b_cont) > mdt_thres)) && - (tcp->tcp_valid_bits == 0 || - tcp->tcp_valid_bits == TCP_FSS_VALID)) { - ASSERT(tcp->tcp_connp->conn_mdt_ok); - rc = tcp_multisend(q, tcp, mss, tcp_hdr_len, tcp_tcp_hdr_len, - num_sack_blk, &usable, &snxt, &tail_unsent, &xmit_tail, - local_time, mdt_thres); - } else { - rc = tcp_send(q, tcp, mss, tcp_hdr_len, tcp_tcp_hdr_len, - num_sack_blk, &usable, &snxt, &tail_unsent, &xmit_tail, - local_time, INT_MAX); - } + /* Send the packet. 
*/ + rc = tcp_send(tcp, mss, total_hdr_len, tcp_hdr_len, + num_sack_blk, &usable, &snxt, &tail_unsent, &xmit_tail, + local_time); /* Pretend that all we were trying to send really got sent */ if (rc < 0 && tail_unsent < 0) { @@ -19131,39 +15734,41 @@ done:; tcp->tcp_unsent += len; mutex_enter(&tcp->tcp_non_sq_lock); if (tcp->tcp_flow_stopped) { - if (TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) { + if (TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) { tcp_clrqfull(tcp); } - } else if (TCP_UNSENT_BYTES(tcp) >= tcp->tcp_xmit_hiwater) { - tcp_setqfull(tcp); + } else if (TCP_UNSENT_BYTES(tcp) >= connp->conn_sndbuf) { + if (!(tcp->tcp_detached)) + tcp_setqfull(tcp); } mutex_exit(&tcp->tcp_non_sq_lock); } /* - * tcp_fill_header is called by tcp_send() and tcp_multisend() to fill the - * outgoing TCP header with the template header, as well as other - * options such as time-stamp, ECN and/or SACK. + * tcp_fill_header is called by tcp_send() to fill the outgoing TCP header + * with the template header, as well as other options such as time-stamp, + * ECN and/or SACK. 
*/ static void tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk) { - tcph_t *tcp_tmpl, *tcp_h; + tcpha_t *tcp_tmpl, *tcpha; uint32_t *dst, *src; int hdrlen; + conn_t *connp = tcp->tcp_connp; ASSERT(OK_32PTR(rptr)); /* Template header */ - tcp_tmpl = tcp->tcp_tcph; + tcp_tmpl = tcp->tcp_tcpha; /* Header of outgoing packet */ - tcp_h = (tcph_t *)(rptr + tcp->tcp_ip_hdr_len); + tcpha = (tcpha_t *)(rptr + connp->conn_ixa->ixa_ip_hdr_length); /* dst and src are opaque 32-bit fields, used for copying */ dst = (uint32_t *)rptr; - src = (uint32_t *)tcp->tcp_iphc; - hdrlen = tcp->tcp_hdr_len; + src = (uint32_t *)connp->conn_ht_iphc; + hdrlen = connp->conn_ht_iphc_len; /* Fill time-stamp option if needed */ if (tcp->tcp_snd_ts_ok) { @@ -19172,7 +15777,7 @@ tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk) U32_TO_BE32(tcp->tcp_ts_recent, (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 8); } else { - ASSERT(tcp->tcp_tcp_hdr_len == TCP_MIN_HEADER_LENGTH); + ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH); } /* @@ -19208,16 +15813,16 @@ tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk) SET_ECT(tcp, rptr); if (tcp->tcp_ecn_echo_on) - tcp_h->th_flags[0] |= TH_ECE; + tcpha->tha_flags |= TH_ECE; if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) { - tcp_h->th_flags[0] |= TH_CWR; + tcpha->tha_flags |= TH_CWR; tcp->tcp_ecn_cwr_sent = B_TRUE; } } /* Fill in SACK options */ if (num_sack_blk > 0) { - uchar_t *wptr = rptr + tcp->tcp_hdr_len; + uchar_t *wptr = rptr + connp->conn_ht_iphc_len; sack_blk_t *tmp; int32_t i; @@ -19235,1536 +15840,62 @@ tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk) U32_TO_BE32(tmp[i].end, wptr); wptr += sizeof (tcp_seq); } - tcp_h->th_offset_and_rsrvd[0] += + tcpha->tha_offset_and_reserved += ((num_sack_blk * 2 + 1) << 4); } } /* - * tcp_mdt_add_attrs() is called by tcp_multisend() in order to attach - * the destination address and SAP attribute, and if necessary, the 
- * hardware checksum offload attribute to a Multidata message. - */ -static int -tcp_mdt_add_attrs(multidata_t *mmd, const mblk_t *dlmp, const boolean_t hwcksum, - const uint32_t start, const uint32_t stuff, const uint32_t end, - const uint32_t flags, tcp_stack_t *tcps) -{ - /* Add global destination address & SAP attribute */ - if (dlmp == NULL || !ip_md_addr_attr(mmd, NULL, dlmp)) { - ip1dbg(("tcp_mdt_add_attrs: can't add global physical " - "destination address+SAP\n")); - - if (dlmp != NULL) - TCP_STAT(tcps, tcp_mdt_allocfail); - return (-1); - } - - /* Add global hwcksum attribute */ - if (hwcksum && - !ip_md_hcksum_attr(mmd, NULL, start, stuff, end, flags)) { - ip1dbg(("tcp_mdt_add_attrs: can't add global hardware " - "checksum attribute\n")); - - TCP_STAT(tcps, tcp_mdt_allocfail); - return (-1); - } - - return (0); -} - -/* - * Smaller and private version of pdescinfo_t used specifically for TCP, - * which allows for only two payload spans per packet. - */ -typedef struct tcp_pdescinfo_s PDESCINFO_STRUCT(2) tcp_pdescinfo_t; - -/* - * tcp_multisend() is called by tcp_wput_data() for Multidata Transmit - * scheme, and returns one the following: + * tcp_send() is called by tcp_wput_data() and returns one of the following: * * -1 = failed allocation. * 0 = success; burst count reached, or usable send window is too small, * and that we'd rather wait until later before sending again. 
*/ static int -tcp_multisend(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, - const int tcp_tcp_hdr_len, const int num_sack_blk, int *usable, - uint_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time, - const int mdt_thres) -{ - mblk_t *md_mp_head, *md_mp, *md_pbuf, *md_pbuf_nxt, *md_hbuf; - multidata_t *mmd; - uint_t obsegs, obbytes, hdr_frag_sz; - uint_t cur_hdr_off, cur_pld_off, base_pld_off, first_snxt; - int num_burst_seg, max_pld; - pdesc_t *pkt; - tcp_pdescinfo_t tcp_pkt_info; - pdescinfo_t *pkt_info; - int pbuf_idx, pbuf_idx_nxt; - int seg_len, len, spill, af; - boolean_t add_buffer, zcopy, clusterwide; - boolean_t rconfirm = B_FALSE; - boolean_t done = B_FALSE; - uint32_t cksum; - uint32_t hwcksum_flags; - ire_t *ire = NULL; - ill_t *ill; - ipha_t *ipha; - ip6_t *ip6h; - ipaddr_t src, dst; - ill_zerocopy_capab_t *zc_cap = NULL; - uint16_t *up; - int err; - conn_t *connp; - tcp_stack_t *tcps = tcp->tcp_tcps; - ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; - int usable_mmd, tail_unsent_mmd; - uint_t snxt_mmd, obsegs_mmd, obbytes_mmd; - mblk_t *xmit_tail_mmd; - netstackid_t stack_id; - -#ifdef _BIG_ENDIAN -#define IPVER(ip6h) ((((uint32_t *)ip6h)[0] >> 28) & 0x7) -#else -#define IPVER(ip6h) ((((uint32_t *)ip6h)[0] >> 4) & 0x7) -#endif - -#define PREP_NEW_MULTIDATA() { \ - mmd = NULL; \ - md_mp = md_hbuf = NULL; \ - cur_hdr_off = 0; \ - max_pld = tcp->tcp_mdt_max_pld; \ - pbuf_idx = pbuf_idx_nxt = -1; \ - add_buffer = B_TRUE; \ - zcopy = B_FALSE; \ -} - -#define PREP_NEW_PBUF() { \ - md_pbuf = md_pbuf_nxt = NULL; \ - pbuf_idx = pbuf_idx_nxt = -1; \ - cur_pld_off = 0; \ - first_snxt = *snxt; \ - ASSERT(*tail_unsent > 0); \ - base_pld_off = MBLKL(*xmit_tail) - *tail_unsent; \ -} - - ASSERT(mdt_thres >= mss); - ASSERT(*usable > 0 && *usable > mdt_thres); - ASSERT(tcp->tcp_state == TCPS_ESTABLISHED); - ASSERT(!TCP_IS_DETACHED(tcp)); - ASSERT(tcp->tcp_valid_bits == 0 || - tcp->tcp_valid_bits == TCP_FSS_VALID); - 
ASSERT((tcp->tcp_ipversion == IPV4_VERSION && - tcp->tcp_ip_hdr_len == IP_SIMPLE_HDR_LENGTH) || - (tcp->tcp_ipversion == IPV6_VERSION && - tcp->tcp_ip_hdr_len == IPV6_HDR_LEN)); - - connp = tcp->tcp_connp; - ASSERT(connp != NULL); - ASSERT(CONN_IS_LSO_MD_FASTPATH(connp)); - ASSERT(!CONN_IPSEC_OUT_ENCAPSULATED(connp)); - - stack_id = connp->conn_netstack->netstack_stackid; - - usable_mmd = tail_unsent_mmd = 0; - snxt_mmd = obsegs_mmd = obbytes_mmd = 0; - xmit_tail_mmd = NULL; - /* - * Note that tcp will only declare at most 2 payload spans per - * packet, which is much lower than the maximum allowable number - * of packet spans per Multidata. For this reason, we use the - * privately declared and smaller descriptor info structure, in - * order to save some stack space. - */ - pkt_info = (pdescinfo_t *)&tcp_pkt_info; - - af = (tcp->tcp_ipversion == IPV4_VERSION) ? AF_INET : AF_INET6; - if (af == AF_INET) { - dst = tcp->tcp_ipha->ipha_dst; - src = tcp->tcp_ipha->ipha_src; - ASSERT(!CLASSD(dst)); - } - ASSERT(af == AF_INET || - !IN6_IS_ADDR_MULTICAST(&tcp->tcp_ip6h->ip6_dst)); - - obsegs = obbytes = 0; - num_burst_seg = tcp->tcp_snd_burst; - md_mp_head = NULL; - PREP_NEW_MULTIDATA(); - - /* - * Before we go on further, make sure there is an IRE that we can - * use, and that the ILL supports MDT. Otherwise, there's no point - * in proceeding any further, and we should just hand everything - * off to the legacy path. - */ - if (!tcp_send_find_ire(tcp, (af == AF_INET) ? &dst : NULL, &ire)) - goto legacy_send_no_md; - - ASSERT(ire != NULL); - ASSERT(af != AF_INET || ire->ire_ipversion == IPV4_VERSION); - ASSERT(af == AF_INET || !IN6_IS_ADDR_V4MAPPED(&(ire->ire_addr_v6))); - ASSERT(af == AF_INET || ire->ire_nce != NULL); - ASSERT(!(ire->ire_type & IRE_BROADCAST)); - /* - * If we do support loopback for MDT (which requires modifications - * to the receiving paths), the following assertions should go away, - * and we would be sending the Multidata to loopback conn later on. 
- */ - ASSERT(!IRE_IS_LOCAL(ire)); - ASSERT(ire->ire_stq != NULL); - - ill = ire_to_ill(ire); - ASSERT(ill != NULL); - ASSERT(!ILL_MDT_CAPABLE(ill) || ill->ill_mdt_capab != NULL); - - if (!tcp->tcp_ire_ill_check_done) { - tcp_ire_ill_check(tcp, ire, ill, B_TRUE); - tcp->tcp_ire_ill_check_done = B_TRUE; - } - - /* - * If the underlying interface conditions have changed, or if the - * new interface does not support MDT, go back to legacy path. - */ - if (!ILL_MDT_USABLE(ill) || (ire->ire_flags & RTF_MULTIRT) != 0) { - /* don't go through this path anymore for this connection */ - TCP_STAT(tcps, tcp_mdt_conn_halted2); - tcp->tcp_mdt = B_FALSE; - ip1dbg(("tcp_multisend: disabling MDT for connp %p on " - "interface %s\n", (void *)connp, ill->ill_name)); - /* IRE will be released prior to returning */ - goto legacy_send_no_md; - } - - if (ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) - zc_cap = ill->ill_zerocopy_capab; - - /* - * Check if we can take tcp fast-path. Note that "incomplete" - * ire's (where the link-layer for next hop is not resolved - * or where the fast-path header in nce_fp_mp is not available - * yet) are sent down the legacy (slow) path. - * NOTE: We should fix ip_xmit_v4 to handle M_MULTIDATA - */ - if (ire->ire_nce && ire->ire_nce->nce_state != ND_REACHABLE) { - /* IRE will be released prior to returning */ - goto legacy_send_no_md; - } - - /* go to legacy path if interface doesn't support zerocopy */ - if (tcp->tcp_snd_zcopy_aware && do_tcpzcopy != 2 && - (zc_cap == NULL || zc_cap->ill_zerocopy_flags == 0)) { - /* IRE will be released prior to returning */ - goto legacy_send_no_md; - } - - /* does the interface support hardware checksum offload? 
*/ - hwcksum_flags = 0; - if (ILL_HCKSUM_CAPABLE(ill) && - (ill->ill_hcksum_capab->ill_hcksum_txflags & - (HCKSUM_INET_FULL_V4 | HCKSUM_INET_FULL_V6 | HCKSUM_INET_PARTIAL | - HCKSUM_IPHDRCKSUM)) && dohwcksum) { - if (ill->ill_hcksum_capab->ill_hcksum_txflags & - HCKSUM_IPHDRCKSUM) - hwcksum_flags = HCK_IPV4_HDRCKSUM; - - if (ill->ill_hcksum_capab->ill_hcksum_txflags & - (HCKSUM_INET_FULL_V4 | HCKSUM_INET_FULL_V6)) - hwcksum_flags |= HCK_FULLCKSUM; - else if (ill->ill_hcksum_capab->ill_hcksum_txflags & - HCKSUM_INET_PARTIAL) - hwcksum_flags |= HCK_PARTIALCKSUM; - } - - /* - * Each header fragment consists of the leading extra space, - * followed by the TCP/IP header, and the trailing extra space. - * We make sure that each header fragment begins on a 32-bit - * aligned memory address (tcp_mdt_hdr_head is already 32-bit - * aligned in tcp_mdt_update). - */ - hdr_frag_sz = roundup((tcp->tcp_mdt_hdr_head + tcp_hdr_len + - tcp->tcp_mdt_hdr_tail), 4); - - /* are we starting from the beginning of data block? */ - if (*tail_unsent == 0) { - *xmit_tail = (*xmit_tail)->b_cont; - ASSERT((uintptr_t)MBLKL(*xmit_tail) <= (uintptr_t)INT_MAX); - *tail_unsent = (int)MBLKL(*xmit_tail); - } - - /* - * Here we create one or more Multidata messages, each made up of - * one header buffer and up to N payload buffers. This entire - * operation is done within two loops: - * - * The outer loop mostly deals with creating the Multidata message, - * as well as the header buffer that gets added to it. It also - * links the Multidata messages together such that all of them can - * be sent down to the lower layer in a single putnext call; this - * linking behavior depends on the tcp_mdt_chain tunable. - * - * The inner loop takes an existing Multidata message, and adds - * one or more (up to tcp_mdt_max_pld) payload buffers to it. 
It - * packetizes those buffers by filling up the corresponding header - * buffer fragments with the proper IP and TCP headers, and by - * describing the layout of each packet in the packet descriptors - * that get added to the Multidata. - */ - do { - /* - * If usable send window is too small, or data blocks in - * transmit list are smaller than our threshold (i.e. app - * performs large writes followed by small ones), we hand - * off the control over to the legacy path. Note that we'll - * get back the control once it encounters a large block. - */ - if (*usable < mss || (*tail_unsent <= mdt_thres && - (*xmit_tail)->b_cont != NULL && - MBLKL((*xmit_tail)->b_cont) <= mdt_thres)) { - /* send down what we've got so far */ - if (md_mp_head != NULL) { - tcp_multisend_data(tcp, ire, ill, md_mp_head, - obsegs, obbytes, &rconfirm); - } - /* - * Pass control over to tcp_send(), but tell it to - * return to us once a large-size transmission is - * possible. - */ - TCP_STAT(tcps, tcp_mdt_legacy_small); - if ((err = tcp_send(q, tcp, mss, tcp_hdr_len, - tcp_tcp_hdr_len, num_sack_blk, usable, snxt, - tail_unsent, xmit_tail, local_time, - mdt_thres)) <= 0) { - /* burst count reached, or alloc failed */ - IRE_REFRELE(ire); - return (err); - } - - /* tcp_send() may have sent everything, so check */ - if (*usable <= 0) { - IRE_REFRELE(ire); - return (0); - } - - TCP_STAT(tcps, tcp_mdt_legacy_ret); - /* - * We may have delivered the Multidata, so make sure - * to re-initialize before the next round. - */ - md_mp_head = NULL; - obsegs = obbytes = 0; - num_burst_seg = tcp->tcp_snd_burst; - PREP_NEW_MULTIDATA(); - - /* are we starting from the beginning of data block? */ - if (*tail_unsent == 0) { - *xmit_tail = (*xmit_tail)->b_cont; - ASSERT((uintptr_t)MBLKL(*xmit_tail) <= - (uintptr_t)INT_MAX); - *tail_unsent = (int)MBLKL(*xmit_tail); - } - } - /* - * Record current values for parameters we may need to pass - * to tcp_send() or tcp_multisend_data(). 
We checkpoint at - * each iteration of the outer loop (each multidata message - * creation). If we have a failure in the inner loop, we send - * any complete multidata messages we have before reverting - * to using the traditional non-md path. - */ - snxt_mmd = *snxt; - usable_mmd = *usable; - xmit_tail_mmd = *xmit_tail; - tail_unsent_mmd = *tail_unsent; - obsegs_mmd = obsegs; - obbytes_mmd = obbytes; - - /* - * max_pld limits the number of mblks in tcp's transmit - * queue that can be added to a Multidata message. Once - * this counter reaches zero, no more additional mblks - * can be added to it. What happens afterwards depends - * on whether or not we are set to chain the Multidata - * messages. If we are to link them together, reset - * max_pld to its original value (tcp_mdt_max_pld) and - * prepare to create a new Multidata message which will - * get linked to md_mp_head. Else, leave it alone and - * let the inner loop break on its own. - */ - if (tcp_mdt_chain && max_pld == 0) - PREP_NEW_MULTIDATA(); - - /* adding a payload buffer; re-initialize values */ - if (add_buffer) - PREP_NEW_PBUF(); - - /* - * If we don't have a Multidata, either because we just - * (re)entered this outer loop, or after we branched off - * to tcp_send above, setup the Multidata and header - * buffer to be used. - */ - if (md_mp == NULL) { - int md_hbuflen; - uint32_t start, stuff; - - /* - * Calculate Multidata header buffer size large enough - * to hold all of the headers that can possibly be - * sent at this moment. We'd rather over-estimate - * the size than running out of space; this is okay - * since this buffer is small anyway. - */ - md_hbuflen = (howmany(*usable, mss) + 1) * hdr_frag_sz; - - /* - * Start and stuff offset for partial hardware - * checksum offload; these are currently for IPv4. - * For full checksum offload, they are set to zero. 
- */ - if ((hwcksum_flags & HCK_PARTIALCKSUM)) { - if (af == AF_INET) { - start = IP_SIMPLE_HDR_LENGTH; - stuff = IP_SIMPLE_HDR_LENGTH + - TCP_CHECKSUM_OFFSET; - } else { - start = IPV6_HDR_LEN; - stuff = IPV6_HDR_LEN + - TCP_CHECKSUM_OFFSET; - } - } else { - start = stuff = 0; - } - - /* - * Create the header buffer, Multidata, as well as - * any necessary attributes (destination address, - * SAP and hardware checksum offload) that should - * be associated with the Multidata message. - */ - ASSERT(cur_hdr_off == 0); - if ((md_hbuf = allocb(md_hbuflen, BPRI_HI)) == NULL || - ((md_hbuf->b_wptr += md_hbuflen), - (mmd = mmd_alloc(md_hbuf, &md_mp, - KM_NOSLEEP)) == NULL) || (tcp_mdt_add_attrs(mmd, - /* fastpath mblk */ - ire->ire_nce->nce_res_mp, - /* hardware checksum enabled */ - (hwcksum_flags & (HCK_FULLCKSUM|HCK_PARTIALCKSUM)), - /* hardware checksum offsets */ - start, stuff, 0, - /* hardware checksum flag */ - hwcksum_flags, tcps) != 0)) { -legacy_send: - /* - * We arrive here from a failure within the - * inner (packetizer) loop or we fail one of - * the conditionals above. We restore the - * previously checkpointed values for: - * xmit_tail - * usable - * tail_unsent - * snxt - * obbytes - * obsegs - * We should then be able to dispatch any - * complete multidata before reverting to the - * traditional path with consistent parameters - * (the inner loop updates these as it - * iterates). - */ - *xmit_tail = xmit_tail_mmd; - *usable = usable_mmd; - *tail_unsent = tail_unsent_mmd; - *snxt = snxt_mmd; - obbytes = obbytes_mmd; - obsegs = obsegs_mmd; - if (md_mp != NULL) { - /* Unlink message from the chain */ - if (md_mp_head != NULL) { - err = (intptr_t)rmvb(md_mp_head, - md_mp); - /* - * We can't assert that rmvb - * did not return -1, since we - * may get here before linkb - * happens. We do, however, - * check if we just removed the - * only element in the list. 
- */ - if (err == 0) - md_mp_head = NULL; - } - /* md_hbuf gets freed automatically */ - TCP_STAT(tcps, tcp_mdt_discarded); - freeb(md_mp); - } else { - /* Either allocb or mmd_alloc failed */ - TCP_STAT(tcps, tcp_mdt_allocfail); - if (md_hbuf != NULL) - freeb(md_hbuf); - } - - /* send down what we've got so far */ - if (md_mp_head != NULL) { - tcp_multisend_data(tcp, ire, ill, - md_mp_head, obsegs, obbytes, - &rconfirm); - } -legacy_send_no_md: - if (ire != NULL) - IRE_REFRELE(ire); - /* - * Too bad; let the legacy path handle this. - * We specify INT_MAX for the threshold, since - * we gave up with the Multidata processings - * and let the old path have it all. - */ - TCP_STAT(tcps, tcp_mdt_legacy_all); - return (tcp_send(q, tcp, mss, tcp_hdr_len, - tcp_tcp_hdr_len, num_sack_blk, usable, - snxt, tail_unsent, xmit_tail, local_time, - INT_MAX)); - } - - /* link to any existing ones, if applicable */ - TCP_STAT(tcps, tcp_mdt_allocd); - if (md_mp_head == NULL) { - md_mp_head = md_mp; - } else if (tcp_mdt_chain) { - TCP_STAT(tcps, tcp_mdt_linked); - linkb(md_mp_head, md_mp); - } - } - - ASSERT(md_mp_head != NULL); - ASSERT(tcp_mdt_chain || md_mp_head->b_cont == NULL); - ASSERT(md_mp != NULL && mmd != NULL); - ASSERT(md_hbuf != NULL); - - /* - * Packetize the transmittable portion of the data block; - * each data block is essentially added to the Multidata - * as a payload buffer. We also deal with adding more - * than one payload buffers, which happens when the remaining - * packetized portion of the current payload buffer is less - * than MSS, while the next data block in transmit queue - * has enough data to make up for one. This "spillover" - * case essentially creates a split-packet, where portions - * of the packet's payload fragments may span across two - * virtually discontiguous address blocks. 
- */ - seg_len = mss; - do { - len = seg_len; - - /* one must remain NULL for DTRACE_IP_FASTPATH */ - ipha = NULL; - ip6h = NULL; - - ASSERT(len > 0); - ASSERT(max_pld >= 0); - ASSERT(!add_buffer || cur_pld_off == 0); - - /* - * First time around for this payload buffer; note - * in the case of a spillover, the following has - * been done prior to adding the split-packet - * descriptor to Multidata, and we don't want to - * repeat the process. - */ - if (add_buffer) { - ASSERT(mmd != NULL); - ASSERT(md_pbuf == NULL); - ASSERT(md_pbuf_nxt == NULL); - ASSERT(pbuf_idx == -1 && pbuf_idx_nxt == -1); - - /* - * Have we reached the limit? We'd get to - * this case when we're not chaining the - * Multidata messages together, and since - * we're done, terminate this loop. - */ - if (max_pld == 0) - break; /* done */ - - if ((md_pbuf = dupb(*xmit_tail)) == NULL) { - TCP_STAT(tcps, tcp_mdt_allocfail); - goto legacy_send; /* out_of_mem */ - } - - if (IS_VMLOANED_MBLK(md_pbuf) && !zcopy && - zc_cap != NULL) { - if (!ip_md_zcopy_attr(mmd, NULL, - zc_cap->ill_zerocopy_flags)) { - freeb(md_pbuf); - TCP_STAT(tcps, - tcp_mdt_allocfail); - /* out_of_mem */ - goto legacy_send; - } - zcopy = B_TRUE; - } - - md_pbuf->b_rptr += base_pld_off; - - /* - * Add a payload buffer to the Multidata; this - * operation must not fail, or otherwise our - * logic in this routine is broken. There - * is no memory allocation done by the - * routine, so any returned failure simply - * tells us that we've done something wrong. - * - * A failure tells us that either we're adding - * the same payload buffer more than once, or - * we're trying to add more buffers than - * allowed (max_pld calculation is wrong). - * None of the above cases should happen, and - * we panic because either there's horrible - * heap corruption, and/or programming mistake. 
- */ - pbuf_idx = mmd_addpldbuf(mmd, md_pbuf); - if (pbuf_idx < 0) { - cmn_err(CE_PANIC, "tcp_multisend: " - "payload buffer logic error " - "detected for tcp %p mmd %p " - "pbuf %p (%d)\n", - (void *)tcp, (void *)mmd, - (void *)md_pbuf, pbuf_idx); - } - - ASSERT(max_pld > 0); - --max_pld; - add_buffer = B_FALSE; - } - - ASSERT(md_mp_head != NULL); - ASSERT(md_pbuf != NULL); - ASSERT(md_pbuf_nxt == NULL); - ASSERT(pbuf_idx != -1); - ASSERT(pbuf_idx_nxt == -1); - ASSERT(*usable > 0); - - /* - * We spillover to the next payload buffer only - * if all of the following is true: - * - * 1. There is not enough data on the current - * payload buffer to make up `len', - * 2. We are allowed to send `len', - * 3. The next payload buffer length is large - * enough to accomodate `spill'. - */ - if ((spill = len - *tail_unsent) > 0 && - *usable >= len && - MBLKL((*xmit_tail)->b_cont) >= spill && - max_pld > 0) { - md_pbuf_nxt = dupb((*xmit_tail)->b_cont); - if (md_pbuf_nxt == NULL) { - TCP_STAT(tcps, tcp_mdt_allocfail); - goto legacy_send; /* out_of_mem */ - } - - if (IS_VMLOANED_MBLK(md_pbuf_nxt) && !zcopy && - zc_cap != NULL) { - if (!ip_md_zcopy_attr(mmd, NULL, - zc_cap->ill_zerocopy_flags)) { - freeb(md_pbuf_nxt); - TCP_STAT(tcps, - tcp_mdt_allocfail); - /* out_of_mem */ - goto legacy_send; - } - zcopy = B_TRUE; - } - - /* - * See comments above on the first call to - * mmd_addpldbuf for explanation on the panic. - */ - pbuf_idx_nxt = mmd_addpldbuf(mmd, md_pbuf_nxt); - if (pbuf_idx_nxt < 0) { - panic("tcp_multisend: " - "next payload buffer logic error " - "detected for tcp %p mmd %p " - "pbuf %p (%d)\n", - (void *)tcp, (void *)mmd, - (void *)md_pbuf_nxt, pbuf_idx_nxt); - } - - ASSERT(max_pld > 0); - --max_pld; - } else if (spill > 0) { - /* - * If there's a spillover, but the following - * xmit_tail couldn't give us enough octets - * to reach "len", then stop the current - * Multidata creation and let the legacy - * tcp_send() path take over. 
We don't want - * to send the tiny segment as part of this - * Multidata for performance reasons; instead, - * we let the legacy path deal with grouping - * it with the subsequent small mblks. - */ - if (*usable >= len && - MBLKL((*xmit_tail)->b_cont) < spill) { - max_pld = 0; - break; /* done */ - } - - /* - * We can't spillover, and we are near - * the end of the current payload buffer, - * so send what's left. - */ - ASSERT(*tail_unsent > 0); - len = *tail_unsent; - } - - /* tail_unsent is negated if there is a spillover */ - *tail_unsent -= len; - *usable -= len; - ASSERT(*usable >= 0); - - if (*usable < mss) - seg_len = *usable; - /* - * Sender SWS avoidance; see comments in tcp_send(); - * everything else is the same, except that we only - * do this here if there is no more data to be sent - * following the current xmit_tail. We don't check - * for 1-byte urgent data because we shouldn't get - * here if TCP_URG_VALID is set. - */ - if (*usable > 0 && *usable < mss && - ((md_pbuf_nxt == NULL && - (*xmit_tail)->b_cont == NULL) || - (md_pbuf_nxt != NULL && - (*xmit_tail)->b_cont->b_cont == NULL)) && - seg_len < (tcp->tcp_max_swnd >> 1) && - (tcp->tcp_unsent - - ((*snxt + len) - tcp->tcp_snxt)) > seg_len && - !tcp->tcp_zero_win_probe) { - if ((*snxt + len) == tcp->tcp_snxt && - (*snxt + len) == tcp->tcp_suna) { - TCP_TIMER_RESTART(tcp, tcp->tcp_rto); - } - done = B_TRUE; - } - - /* - * Prime pump for IP's checksumming on our behalf; - * include the adjustment for a source route if any. - * Do this only for software/partial hardware checksum - * offload, as this field gets zeroed out later for - * the full hardware checksum offload case. 
- */ - if (!(hwcksum_flags & HCK_FULLCKSUM)) { - cksum = len + tcp_tcp_hdr_len + tcp->tcp_sum; - cksum = (cksum >> 16) + (cksum & 0xFFFF); - U16_TO_ABE16(cksum, tcp->tcp_tcph->th_sum); - } - - U32_TO_ABE32(*snxt, tcp->tcp_tcph->th_seq); - *snxt += len; - - tcp->tcp_tcph->th_flags[0] = TH_ACK; - /* - * We set the PUSH bit only if TCP has no more buffered - * data to be transmitted (or if sender SWS avoidance - * takes place), as opposed to setting it for every - * last packet in the burst. - */ - if (done || - (tcp->tcp_unsent - (*snxt - tcp->tcp_snxt)) == 0) - tcp->tcp_tcph->th_flags[0] |= TH_PUSH; - - /* - * Set FIN bit if this is our last segment; snxt - * already includes its length, and it will not - * be adjusted after this point. - */ - if (tcp->tcp_valid_bits == TCP_FSS_VALID && - *snxt == tcp->tcp_fss) { - if (!tcp->tcp_fin_acked) { - tcp->tcp_tcph->th_flags[0] |= TH_FIN; - BUMP_MIB(&tcps->tcps_mib, - tcpOutControl); - } - if (!tcp->tcp_fin_sent) { - tcp->tcp_fin_sent = B_TRUE; - /* - * tcp state must be ESTABLISHED - * in order for us to get here in - * the first place. - */ - tcp->tcp_state = TCPS_FIN_WAIT_1; - - /* - * Upon returning from this routine, - * tcp_wput_data() will set tcp_snxt - * to be equal to snxt + tcp_fin_sent. - * This is essentially the same as - * setting it to tcp_fss + 1. 
- */ - } - } - - tcp->tcp_last_sent_len = (ushort_t)len; - - len += tcp_hdr_len; - if (tcp->tcp_ipversion == IPV4_VERSION) - tcp->tcp_ipha->ipha_length = htons(len); - else - tcp->tcp_ip6h->ip6_plen = htons(len - - ((char *)&tcp->tcp_ip6h[1] - - tcp->tcp_iphc)); - - pkt_info->flags = (PDESC_HBUF_REF | PDESC_PBUF_REF); - - /* setup header fragment */ - PDESC_HDR_ADD(pkt_info, - md_hbuf->b_rptr + cur_hdr_off, /* base */ - tcp->tcp_mdt_hdr_head, /* head room */ - tcp_hdr_len, /* len */ - tcp->tcp_mdt_hdr_tail); /* tail room */ - - ASSERT(pkt_info->hdr_lim - pkt_info->hdr_base == - hdr_frag_sz); - ASSERT(MBLKIN(md_hbuf, - (pkt_info->hdr_base - md_hbuf->b_rptr), - PDESC_HDRSIZE(pkt_info))); - - /* setup first payload fragment */ - PDESC_PLD_INIT(pkt_info); - PDESC_PLD_SPAN_ADD(pkt_info, - pbuf_idx, /* index */ - md_pbuf->b_rptr + cur_pld_off, /* start */ - tcp->tcp_last_sent_len); /* len */ - - /* create a split-packet in case of a spillover */ - if (md_pbuf_nxt != NULL) { - ASSERT(spill > 0); - ASSERT(pbuf_idx_nxt > pbuf_idx); - ASSERT(!add_buffer); - - md_pbuf = md_pbuf_nxt; - md_pbuf_nxt = NULL; - pbuf_idx = pbuf_idx_nxt; - pbuf_idx_nxt = -1; - cur_pld_off = spill; - - /* trim out first payload fragment */ - PDESC_PLD_SPAN_TRIM(pkt_info, 0, spill); - - /* setup second payload fragment */ - PDESC_PLD_SPAN_ADD(pkt_info, - pbuf_idx, /* index */ - md_pbuf->b_rptr, /* start */ - spill); /* len */ - - if ((*xmit_tail)->b_next == NULL) { - /* - * Store the lbolt used for RTT - * estimation. We can only record one - * timestamp per mblk so we do it when - * we reach the end of the payload - * buffer. Also we only take a new - * timestamp sample when the previous - * timed data from the same mblk has - * been ack'ed. 
- */ - (*xmit_tail)->b_prev = local_time; - (*xmit_tail)->b_next = - (mblk_t *)(uintptr_t)first_snxt; - } - - first_snxt = *snxt - spill; - - /* - * Advance xmit_tail; usable could be 0 by - * the time we got here, but we made sure - * above that we would only spillover to - * the next data block if usable includes - * the spilled-over amount prior to the - * subtraction. Therefore, we are sure - * that xmit_tail->b_cont can't be NULL. - */ - ASSERT((*xmit_tail)->b_cont != NULL); - *xmit_tail = (*xmit_tail)->b_cont; - ASSERT((uintptr_t)MBLKL(*xmit_tail) <= - (uintptr_t)INT_MAX); - *tail_unsent = (int)MBLKL(*xmit_tail) - spill; - } else { - cur_pld_off += tcp->tcp_last_sent_len; - } - - /* - * Fill in the header using the template header, and - * add options such as time-stamp, ECN and/or SACK, - * as needed. - */ - tcp_fill_header(tcp, pkt_info->hdr_rptr, - (clock_t)local_time, num_sack_blk); - - /* take care of some IP header businesses */ - if (af == AF_INET) { - ipha = (ipha_t *)pkt_info->hdr_rptr; - - ASSERT(OK_32PTR((uchar_t *)ipha)); - ASSERT(PDESC_HDRL(pkt_info) >= - IP_SIMPLE_HDR_LENGTH); - ASSERT(ipha->ipha_version_and_hdr_length == - IP_SIMPLE_HDR_VERSION); - - /* - * Assign ident value for current packet; see - * related comments in ip_wput_ire() about the - * contract private interface with clustering - * group. 
- */ - clusterwide = B_FALSE; - if (cl_inet_ipident != NULL) { - ASSERT(cl_inet_isclusterwide != NULL); - if ((*cl_inet_isclusterwide)(stack_id, - IPPROTO_IP, AF_INET, - (uint8_t *)(uintptr_t)src, NULL)) { - ipha->ipha_ident = - (*cl_inet_ipident)(stack_id, - IPPROTO_IP, AF_INET, - (uint8_t *)(uintptr_t)src, - (uint8_t *)(uintptr_t)dst, - NULL); - clusterwide = B_TRUE; - } - } - - if (!clusterwide) { - ipha->ipha_ident = (uint16_t) - atomic_add_32_nv( - &ire->ire_ident, 1); - } -#ifndef _BIG_ENDIAN - ipha->ipha_ident = (ipha->ipha_ident << 8) | - (ipha->ipha_ident >> 8); -#endif - } else { - ip6h = (ip6_t *)pkt_info->hdr_rptr; - - ASSERT(OK_32PTR((uchar_t *)ip6h)); - ASSERT(IPVER(ip6h) == IPV6_VERSION); - ASSERT(ip6h->ip6_nxt == IPPROTO_TCP); - ASSERT(PDESC_HDRL(pkt_info) >= - (IPV6_HDR_LEN + TCP_CHECKSUM_OFFSET + - TCP_CHECKSUM_SIZE)); - ASSERT(tcp->tcp_ipversion == IPV6_VERSION); - - if (tcp->tcp_ip_forward_progress) { - rconfirm = B_TRUE; - tcp->tcp_ip_forward_progress = B_FALSE; - } - } - - /* at least one payload span, and at most two */ - ASSERT(pkt_info->pld_cnt > 0 && pkt_info->pld_cnt < 3); - - /* add the packet descriptor to Multidata */ - if ((pkt = mmd_addpdesc(mmd, pkt_info, &err, - KM_NOSLEEP)) == NULL) { - /* - * Any failure other than ENOMEM indicates - * that we have passed in invalid pkt_info - * or parameters to mmd_addpdesc, which must - * not happen. - * - * EINVAL is a result of failure on boundary - * checks against the pkt_info contents. It - * should not happen, and we panic because - * either there's horrible heap corruption, - * and/or programming mistake. 
- */ - if (err != ENOMEM) { - cmn_err(CE_PANIC, "tcp_multisend: " - "pdesc logic error detected for " - "tcp %p mmd %p pinfo %p (%d)\n", - (void *)tcp, (void *)mmd, - (void *)pkt_info, err); - } - TCP_STAT(tcps, tcp_mdt_addpdescfail); - goto legacy_send; /* out_of_mem */ - } - ASSERT(pkt != NULL); - - /* calculate IP header and TCP checksums */ - if (af == AF_INET) { - /* calculate pseudo-header checksum */ - cksum = (dst >> 16) + (dst & 0xFFFF) + - (src >> 16) + (src & 0xFFFF); - - /* offset for TCP header checksum */ - up = IPH_TCPH_CHECKSUMP(ipha, - IP_SIMPLE_HDR_LENGTH); - } else { - up = (uint16_t *)&ip6h->ip6_src; - - /* calculate pseudo-header checksum */ - cksum = up[0] + up[1] + up[2] + up[3] + - up[4] + up[5] + up[6] + up[7] + - up[8] + up[9] + up[10] + up[11] + - up[12] + up[13] + up[14] + up[15]; - - /* Fold the initial sum */ - cksum = (cksum & 0xffff) + (cksum >> 16); - - up = (uint16_t *)(((uchar_t *)ip6h) + - IPV6_HDR_LEN + TCP_CHECKSUM_OFFSET); - } - - if (hwcksum_flags & HCK_FULLCKSUM) { - /* clear checksum field for hardware */ - *up = 0; - } else if (hwcksum_flags & HCK_PARTIALCKSUM) { - uint32_t sum; - - /* pseudo-header checksumming */ - sum = *up + cksum + IP_TCP_CSUM_COMP; - sum = (sum & 0xFFFF) + (sum >> 16); - *up = (sum & 0xFFFF) + (sum >> 16); - } else { - /* software checksumming */ - TCP_STAT(tcps, tcp_out_sw_cksum); - TCP_STAT_UPDATE(tcps, tcp_out_sw_cksum_bytes, - tcp->tcp_hdr_len + tcp->tcp_last_sent_len); - *up = IP_MD_CSUM(pkt, tcp->tcp_ip_hdr_len, - cksum + IP_TCP_CSUM_COMP); - if (*up == 0) - *up = 0xFFFF; - } - - /* IPv4 header checksum */ - if (af == AF_INET) { - if (hwcksum_flags & HCK_IPV4_HDRCKSUM) { - ipha->ipha_hdr_checksum = 0; - } else { - IP_HDR_CKSUM(ipha, cksum, - ((uint32_t *)ipha)[0], - ((uint16_t *)ipha)[4]); - } - } - - if (af == AF_INET && - HOOKS4_INTERESTED_PHYSICAL_OUT(ipst) || - af == AF_INET6 && - HOOKS6_INTERESTED_PHYSICAL_OUT(ipst)) { - mblk_t *mp, *mp1; - uchar_t *hdr_rptr, *hdr_wptr; - uchar_t 
*pld_rptr, *pld_wptr; - - /* - * We reconstruct a pseudo packet for the hooks - * framework using mmd_transform_link(). - * If it is a split packet we pullup the - * payload. FW_HOOKS expects a pkt comprising - * of two mblks: a header and the payload. - */ - if ((mp = mmd_transform_link(pkt)) == NULL) { - TCP_STAT(tcps, tcp_mdt_allocfail); - goto legacy_send; - } - - if (pkt_info->pld_cnt > 1) { - /* split payload, more than one pld */ - if ((mp1 = msgpullup(mp->b_cont, -1)) == - NULL) { - freemsg(mp); - TCP_STAT(tcps, - tcp_mdt_allocfail); - goto legacy_send; - } - freemsg(mp->b_cont); - mp->b_cont = mp1; - } else { - mp1 = mp->b_cont; - } - ASSERT(mp1 != NULL && mp1->b_cont == NULL); - - /* - * Remember the message offsets. This is so we - * can detect changes when we return from the - * FW_HOOKS callbacks. - */ - hdr_rptr = mp->b_rptr; - hdr_wptr = mp->b_wptr; - pld_rptr = mp->b_cont->b_rptr; - pld_wptr = mp->b_cont->b_wptr; - - if (af == AF_INET) { - DTRACE_PROBE4( - ip4__physical__out__start, - ill_t *, NULL, - ill_t *, ill, - ipha_t *, ipha, - mblk_t *, mp); - FW_HOOKS( - ipst->ips_ip4_physical_out_event, - ipst->ips_ipv4firewall_physical_out, - NULL, ill, ipha, mp, mp, 0, ipst); - DTRACE_PROBE1( - ip4__physical__out__end, - mblk_t *, mp); - } else { - DTRACE_PROBE4( - ip6__physical__out_start, - ill_t *, NULL, - ill_t *, ill, - ip6_t *, ip6h, - mblk_t *, mp); - FW_HOOKS6( - ipst->ips_ip6_physical_out_event, - ipst->ips_ipv6firewall_physical_out, - NULL, ill, ip6h, mp, mp, 0, ipst); - DTRACE_PROBE1( - ip6__physical__out__end, - mblk_t *, mp); - } - - if (mp == NULL || - (mp1 = mp->b_cont) == NULL || - mp->b_rptr != hdr_rptr || - mp->b_wptr != hdr_wptr || - mp1->b_rptr != pld_rptr || - mp1->b_wptr != pld_wptr || - mp1->b_cont != NULL) { - /* - * We abandon multidata processing and - * return to the normal path, either - * when a packet is blocked, or when - * the boundaries of header buffer or - * payload buffer have been changed by - * FW_HOOKS[6]. 
- */ - if (mp != NULL) - freemsg(mp); - goto legacy_send; - } - /* Finished with the pseudo packet */ - freemsg(mp); - } - DTRACE_IP_FASTPATH(md_hbuf, pkt_info->hdr_rptr, - ill, ipha, ip6h); - /* advance header offset */ - cur_hdr_off += hdr_frag_sz; - - obbytes += tcp->tcp_last_sent_len; - ++obsegs; - } while (!done && *usable > 0 && --num_burst_seg > 0 && - *tail_unsent > 0); - - if ((*xmit_tail)->b_next == NULL) { - /* - * Store the lbolt used for RTT estimation. We can only - * record one timestamp per mblk so we do it when we - * reach the end of the payload buffer. Also we only - * take a new timestamp sample when the previous timed - * data from the same mblk has been ack'ed. - */ - (*xmit_tail)->b_prev = local_time; - (*xmit_tail)->b_next = (mblk_t *)(uintptr_t)first_snxt; - } - - ASSERT(*tail_unsent >= 0); - if (*tail_unsent > 0) { - /* - * We got here because we broke out of the above - * loop due to of one of the following cases: - * - * 1. len < adjusted MSS (i.e. small), - * 2. Sender SWS avoidance, - * 3. max_pld is zero. - * - * We are done for this Multidata, so trim our - * last payload buffer (if any) accordingly. - */ - if (md_pbuf != NULL) - md_pbuf->b_wptr -= *tail_unsent; - } else if (*usable > 0) { - *xmit_tail = (*xmit_tail)->b_cont; - ASSERT((uintptr_t)MBLKL(*xmit_tail) <= - (uintptr_t)INT_MAX); - *tail_unsent = (int)MBLKL(*xmit_tail); - add_buffer = B_TRUE; - } - } while (!done && *usable > 0 && num_burst_seg > 0 && - (tcp_mdt_chain || max_pld > 0)); - - if (md_mp_head != NULL) { - /* send everything down */ - tcp_multisend_data(tcp, ire, ill, md_mp_head, obsegs, obbytes, - &rconfirm); - } - -#undef PREP_NEW_MULTIDATA -#undef PREP_NEW_PBUF -#undef IPVER - - IRE_REFRELE(ire); - return (0); -} - -/* - * A wrapper function for sending one or more Multidata messages down to - * the module below ip; this routine does not release the reference of the - * IRE (caller does that). This routine is analogous to tcp_send_data(). 
- */ -static void -tcp_multisend_data(tcp_t *tcp, ire_t *ire, const ill_t *ill, mblk_t *md_mp_head, - const uint_t obsegs, const uint_t obbytes, boolean_t *rconfirm) +tcp_send(tcp_t *tcp, const int mss, const int total_hdr_len, + const int tcp_hdr_len, const int num_sack_blk, int *usable, + uint_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time) { - uint64_t delta; - nce_t *nce; - tcp_stack_t *tcps = tcp->tcp_tcps; - ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; - - ASSERT(ire != NULL && ill != NULL); - ASSERT(ire->ire_stq != NULL); - ASSERT(md_mp_head != NULL); - ASSERT(rconfirm != NULL); - - /* adjust MIBs and IRE timestamp */ - DTRACE_PROBE2(tcp__trace__send, mblk_t *, md_mp_head, tcp_t *, tcp); - tcp->tcp_obsegs += obsegs; - UPDATE_MIB(&tcps->tcps_mib, tcpOutDataSegs, obsegs); - UPDATE_MIB(&tcps->tcps_mib, tcpOutDataBytes, obbytes); - TCP_STAT_UPDATE(tcps, tcp_mdt_pkt_out, obsegs); - - if (tcp->tcp_ipversion == IPV4_VERSION) { - TCP_STAT_UPDATE(tcps, tcp_mdt_pkt_out_v4, obsegs); - } else { - TCP_STAT_UPDATE(tcps, tcp_mdt_pkt_out_v6, obsegs); - } - UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests, obsegs); - UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits, obsegs); - UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets, obbytes); - - ire->ire_ob_pkt_count += obsegs; - if (ire->ire_ipif != NULL) - atomic_add_32(&ire->ire_ipif->ipif_ob_pkt_count, obsegs); - ire->ire_last_used_time = lbolt; - - if ((tcp->tcp_ipversion == IPV4_VERSION && - ipst->ips_ip4_observe.he_interested) || - (tcp->tcp_ipversion == IPV6_VERSION && - ipst->ips_ip6_observe.he_interested)) { - multidata_t *dlmdp = mmd_getmultidata(md_mp_head); - pdesc_t *dl_pkt; - pdescinfo_t pinfo; - mblk_t *nmp; - zoneid_t szone = tcp->tcp_connp->conn_zoneid; - - for (dl_pkt = mmd_getfirstpdesc(dlmdp, &pinfo); - (dl_pkt != NULL); - dl_pkt = mmd_getnextpdesc(dl_pkt, &pinfo)) { - if ((nmp = mmd_transform_link(dl_pkt)) == NULL) - continue; - ipobs_hook(nmp, IPOBS_HOOK_OUTBOUND, szone, - 
ALL_ZONES, ill, ipst); - freemsg(nmp); - } - } - - /* send it down */ - putnext(ire->ire_stq, md_mp_head); - - /* we're done for TCP/IPv4 */ - if (tcp->tcp_ipversion == IPV4_VERSION) - return; - - nce = ire->ire_nce; - - ASSERT(nce != NULL); - ASSERT(!(nce->nce_flags & (NCE_F_NONUD|NCE_F_PERMANENT))); - ASSERT(nce->nce_state != ND_INCOMPLETE); - - /* reachability confirmation? */ - if (*rconfirm) { - nce->nce_last = TICK_TO_MSEC(lbolt64); - if (nce->nce_state != ND_REACHABLE) { - mutex_enter(&nce->nce_lock); - nce->nce_state = ND_REACHABLE; - nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT; - mutex_exit(&nce->nce_lock); - (void) untimeout(nce->nce_timeout_id); - if (ip_debug > 2) { - /* ip1dbg */ - pr_addr_dbg("tcp_multisend_data: state " - "for %s changed to REACHABLE\n", - AF_INET6, &ire->ire_addr_v6); - } - } - /* reset transport reachability confirmation */ - *rconfirm = B_FALSE; - } - - delta = TICK_TO_MSEC(lbolt64) - nce->nce_last; - ip1dbg(("tcp_multisend_data: delta = %" PRId64 - " ill_reachable_time = %d \n", delta, ill->ill_reachable_time)); - - if (delta > (uint64_t)ill->ill_reachable_time) { - mutex_enter(&nce->nce_lock); - switch (nce->nce_state) { - case ND_REACHABLE: - case ND_STALE: - /* - * ND_REACHABLE is identical to ND_STALE in this - * specific case. If reachable time has expired for - * this neighbor (delta is greater than reachable - * time), conceptually, the neighbor cache is no - * longer in REACHABLE state, but already in STALE - * state. So the correct transition here is to - * ND_DELAY. 
- */ - nce->nce_state = ND_DELAY; - mutex_exit(&nce->nce_lock); - NDP_RESTART_TIMER(nce, - ipst->ips_delay_first_probe_time); - if (ip_debug > 3) { - /* ip2dbg */ - pr_addr_dbg("tcp_multisend_data: state " - "for %s changed to DELAY\n", - AF_INET6, &ire->ire_addr_v6); - } - break; - case ND_DELAY: - case ND_PROBE: - mutex_exit(&nce->nce_lock); - /* Timers have already started */ - break; - case ND_UNREACHABLE: - /* - * ndp timer has detected that this nce is - * unreachable and initiated deleting this nce - * and all its associated IREs. This is a race - * where we found the ire before it was deleted - * and have just sent out a packet using this - * unreachable nce. - */ - mutex_exit(&nce->nce_lock); - break; - default: - ASSERT(0); - } - } -} - -/* - * Derived from tcp_send_data(). - */ -static void -tcp_lsosend_data(tcp_t *tcp, mblk_t *mp, ire_t *ire, ill_t *ill, const int mss, - int num_lso_seg) -{ - ipha_t *ipha; - mblk_t *ire_fp_mp; - uint_t ire_fp_mp_len; - uint32_t hcksum_txflags = 0; - ipaddr_t src; - ipaddr_t dst; - uint32_t cksum; - uint16_t *up; - tcp_stack_t *tcps = tcp->tcp_tcps; - ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; - - ASSERT(DB_TYPE(mp) == M_DATA); - ASSERT(tcp->tcp_state == TCPS_ESTABLISHED); - ASSERT(tcp->tcp_ipversion == IPV4_VERSION); - ASSERT(tcp->tcp_connp != NULL); - ASSERT(CONN_IS_LSO_MD_FASTPATH(tcp->tcp_connp)); - - ipha = (ipha_t *)mp->b_rptr; - src = ipha->ipha_src; - dst = ipha->ipha_dst; - - DTRACE_PROBE2(tcp__trace__send, mblk_t *, mp, tcp_t *, tcp); - - ASSERT(ipha->ipha_ident == 0 || ipha->ipha_ident == IP_HDR_INCLUDED); - ipha->ipha_ident = (uint16_t)atomic_add_32_nv(&ire->ire_ident, - num_lso_seg); -#ifndef _BIG_ENDIAN - ipha->ipha_ident = (ipha->ipha_ident << 8) | (ipha->ipha_ident >> 8); -#endif - if (tcp->tcp_snd_zcopy_aware) { - if ((ill->ill_capabilities & ILL_CAPAB_ZEROCOPY) == 0 || - (ill->ill_zerocopy_capab->ill_zerocopy_flags == 0)) - mp = tcp_zcopy_disable(tcp, mp); - } - - if (ILL_HCKSUM_CAPABLE(ill) 
&& dohwcksum) { - ASSERT(ill->ill_hcksum_capab != NULL); - hcksum_txflags = ill->ill_hcksum_capab->ill_hcksum_txflags; - } - - /* - * Since the TCP checksum should be recalculated by h/w, we can just - * zero the checksum field for HCK_FULLCKSUM, or calculate partial - * pseudo-header checksum for HCK_PARTIALCKSUM. - * The partial pseudo-header excludes TCP length, that was calculated - * in tcp_send(), so to zero *up before further processing. - */ - cksum = (dst >> 16) + (dst & 0xFFFF) + (src >> 16) + (src & 0xFFFF); - - up = IPH_TCPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH); - *up = 0; - - IP_CKSUM_XMIT_FAST(ire->ire_ipversion, hcksum_txflags, mp, ipha, up, - IPPROTO_TCP, IP_SIMPLE_HDR_LENGTH, ntohs(ipha->ipha_length), cksum); - - /* - * Append LSO flags and mss to the mp. - */ - lso_info_set(mp, mss, HW_LSO); - - ipha->ipha_fragment_offset_and_flags |= - (uint32_t)htons(ire->ire_frag_flag); - - ire_fp_mp = ire->ire_nce->nce_fp_mp; - ire_fp_mp_len = MBLKL(ire_fp_mp); - ASSERT(DB_TYPE(ire_fp_mp) == M_DATA); - mp->b_rptr = (uchar_t *)ipha - ire_fp_mp_len; - bcopy(ire_fp_mp->b_rptr, mp->b_rptr, ire_fp_mp_len); - - UPDATE_OB_PKT_COUNT(ire); - ire->ire_last_used_time = lbolt; - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutRequests); - BUMP_MIB(ill->ill_ip_mib, ipIfStatsHCOutTransmits); - UPDATE_MIB(ill->ill_ip_mib, ipIfStatsHCOutOctets, - ntohs(ipha->ipha_length)); - - DTRACE_PROBE4(ip4__physical__out__start, - ill_t *, NULL, ill_t *, ill, ipha_t *, ipha, mblk_t *, mp); - FW_HOOKS(ipst->ips_ip4_physical_out_event, - ipst->ips_ipv4firewall_physical_out, NULL, - ill, ipha, mp, mp, 0, ipst); - DTRACE_PROBE1(ip4__physical__out__end, mblk_t *, mp); - DTRACE_IP_FASTPATH(mp, ipha, ill, ipha, NULL); - - if (mp != NULL) { - if (ipst->ips_ip4_observe.he_interested) { - zoneid_t szone; - - if (ire_fp_mp_len != 0) - mp->b_rptr += ire_fp_mp_len; - szone = ip_get_zoneid_v4(ipha->ipha_src, mp, - ipst, ALL_ZONES); - ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, - ALL_ZONES, ill, ipst); - if 
(ire_fp_mp_len != 0) - mp->b_rptr -= ire_fp_mp_len; - } - - ILL_SEND_TX(ill, ire, tcp->tcp_connp, mp, 0, NULL); - } -} - -/* - * tcp_send() is called by tcp_wput_data() for non-Multidata transmission - * scheme, and returns one of the following: - * - * -1 = failed allocation. - * 0 = success; burst count reached, or usable send window is too small, - * and that we'd rather wait until later before sending again. - * 1 = success; we are called from tcp_multisend(), and both usable send - * window and tail_unsent are greater than the MDT threshold, and thus - * Multidata Transmit should be used instead. - */ -static int -tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, - const int tcp_tcp_hdr_len, const int num_sack_blk, int *usable, - uint_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time, - const int mdt_thres) -{ - int num_burst_seg = tcp->tcp_snd_burst; - ire_t *ire = NULL; - ill_t *ill = NULL; - mblk_t *ire_fp_mp = NULL; - uint_t ire_fp_mp_len = 0; + int num_burst_seg = tcp->tcp_snd_burst; int num_lso_seg = 1; uint_t lso_usable; boolean_t do_lso_send = B_FALSE; tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; + ip_xmit_attr_t *ixa = connp->conn_ixa; /* - * Check LSO capability before any further work. And the similar check - * need to be done in for(;;) loop. - * LSO will be deployed when therer is more than one mss of available - * data and a burst transmission is allowed. + * Check LSO possibility. The value of tcp->tcp_lso indicates whether + * the underlying connection is LSO capable. Will check whether having + * enough available data to initiate LSO transmission in the for(){} + * loops. */ - if (tcp->tcp_lso && - (tcp->tcp_valid_bits == 0 || - tcp->tcp_valid_bits == TCP_FSS_VALID) && - num_burst_seg >= 2 && (*usable - 1) / mss >= 1) { - /* - * Try to find usable IRE/ILL and do basic check to the ILL. 
- * Double check LSO usability before going further, since the - * underlying interface could have been changed. In case of any - * change of LSO capability, set tcp_ire_ill_check_done to - * B_FALSE to force to check the ILL with the next send. - */ - if (tcp_send_find_ire_ill(tcp, NULL, &ire, &ill) && - tcp->tcp_lso && ILL_LSO_TCP_USABLE(ill)) { - /* - * Enable LSO with this transmission. - * Since IRE has been hold in tcp_send_find_ire_ill(), - * IRE_REFRELE(ire) should be called before return. - */ + if (tcp->tcp_lso && (tcp->tcp_valid_bits & ~TCP_FSS_VALID) == 0) do_lso_send = B_TRUE; - ire_fp_mp = ire->ire_nce->nce_fp_mp; - ire_fp_mp_len = MBLKL(ire_fp_mp); - /* Round up to multiple of 4 */ - ire_fp_mp_len = ((ire_fp_mp_len + 3) / 4) * 4; - } else { - tcp->tcp_lso = B_FALSE; - tcp->tcp_ire_ill_check_done = B_FALSE; - do_lso_send = B_FALSE; - ill = NULL; - } - } for (;;) { struct datab *db; - tcph_t *tcph; + tcpha_t *tcpha; uint32_t sum; mblk_t *mp, *mp1; uchar_t *rptr; int len; /* - * If we're called by tcp_multisend(), and the amount of - * sendable data as well as the size of current xmit_tail - * is beyond the MDT threshold, return to the caller and - * let the large data transmit be done using MDT. + * Burst count reached, return successfully. */ - if (*usable > 0 && *usable > mdt_thres && - (*tail_unsent > mdt_thres || (*tail_unsent == 0 && - MBLKL((*xmit_tail)->b_cont) > mdt_thres))) { - ASSERT(tcp->tcp_mdt); - return (1); /* success; do large send */ - } - if (num_burst_seg == 0) - break; /* success; burst count reached */ + break; /* - * Calculate the maximum payload length we can send in *one* + * Calculate the maximum payload length we can send at one * time. */ if (do_lso_send) { /* - * Check whether need to do LSO any more. + * Check whether be able to to do LSO for the current + * available data. 
*/ if (num_burst_seg >= 2 && (*usable - 1) / mss >= 1) { lso_usable = MIN(tcp->tcp_lso_max, *usable); @@ -20787,7 +15918,10 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, } ASSERT(num_lso_seg <= IP_MAXPACKET / mss + 1); - +#ifdef DEBUG + DTRACE_PROBE2(tcp_send_lso, int, num_lso_seg, boolean_t, + do_lso_send); +#endif /* * Adjust num_burst_seg here. */ @@ -20817,7 +15951,7 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, /* * If the retransmit timer is not running * we start it so that we will retransmit - * in the case when the the receiver has + * in the case when the receiver has * decremented the window. */ if (*snxt == tcp->tcp_snxt && @@ -20838,7 +15972,7 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, } } - tcph = tcp->tcp_tcph; + tcpha = tcp->tcp_tcpha; /* * The reason to adjust len here is that we need to set flags @@ -20849,19 +15983,25 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, *usable -= len; /* Approximate - can be adjusted later */ if (*usable > 0) - tcph->th_flags[0] = TH_ACK; + tcpha->tha_flags = TH_ACK; else - tcph->th_flags[0] = (TH_ACK | TH_PUSH); + tcpha->tha_flags = (TH_ACK | TH_PUSH); /* - * Prime pump for IP's checksumming on our behalf + * Prime pump for IP's checksumming on our behalf. * Include the adjustment for a source route if any. + * In case of LSO, the partial pseudo-header checksum should + * exclusive TCP length, so zero tha_sum before IP calculate + * pseudo-header checksum for partial checksum offload. 
*/ - sum = len + tcp_tcp_hdr_len + tcp->tcp_sum; - sum = (sum >> 16) + (sum & 0xFFFF); - U16_TO_ABE16(sum, tcph->th_sum); - - U32_TO_ABE32(*snxt, tcph->th_seq); + if (do_lso_send) { + sum = 0; + } else { + sum = len + tcp_hdr_len + connp->conn_sum; + sum = (sum >> 16) + (sum & 0xFFFF); + } + tcpha->tha_sum = htons(sum); + tcpha->tha_seq = htonl(*snxt); /* * Branch off to tcp_xmit_mp() if any of the VALID bits is @@ -20907,8 +16047,6 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, (*xmit_tail)->b_rptr = prev_rptr; if (mp == NULL) { - if (ire != NULL) - IRE_REFRELE(ire); return (-1); } mp1 = mp->b_cont; @@ -20927,7 +16065,7 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, BUMP_LOCAL(tcp->tcp_obsegs); BUMP_MIB(&tcps->tcps_mib, tcpOutDataSegs); UPDATE_MIB(&tcps->tcps_mib, tcpOutDataBytes, len); - tcp_send_data(tcp, q, mp); + tcp_send_data(tcp, mp); continue; } @@ -20942,18 +16080,18 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, *tail_unsent -= len; if (len <= mss) /* LSO is unusable */ tcp->tcp_last_sent_len = (ushort_t)len; - len += tcp_hdr_len; - if (tcp->tcp_ipversion == IPV4_VERSION) + len += total_hdr_len; + ixa->ixa_pktlen = len; + + if (ixa->ixa_flags & IXAF_IS_IPV4) { tcp->tcp_ipha->ipha_length = htons(len); - else + } else { tcp->tcp_ip6h->ip6_plen = - htons(len - - ((char *)&tcp->tcp_ip6h[1] - - tcp->tcp_iphc)); + htons(len - IPV6_HDR_LEN); + } + mp = dupb(*xmit_tail); if (mp == NULL) { - if (ire != NULL) - IRE_REFRELE(ire); return (-1); /* out_of_mem */ } mp->b_rptr = rptr; @@ -20983,21 +16121,21 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, if (len <= mss) /* LSO is unusable (!do_lso_send) */ tcp->tcp_last_sent_len = (ushort_t)len; - len += tcp_hdr_len; - if (tcp->tcp_ipversion == IPV4_VERSION) + len += total_hdr_len; + ixa->ixa_pktlen = len; + + if (ixa->ixa_flags & IXAF_IS_IPV4) { tcp->tcp_ipha->ipha_length = htons(len); - else - 
tcp->tcp_ip6h->ip6_plen = htons(len - - ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc)); + } else { + tcp->tcp_ip6h->ip6_plen = htons(len - IPV6_HDR_LEN); + } mp = dupb(*xmit_tail); if (mp == NULL) { - if (ire != NULL) - IRE_REFRELE(ire); return (-1); /* out_of_mem */ } - len = tcp_hdr_len; + len = total_hdr_len; /* * There are four reasons to allocate a new hdr mblk: * 1) The bytes above us are in use by another packet @@ -21008,24 +16146,21 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, rptr = mp->b_rptr - len; if (!OK_32PTR(rptr) || ((db = mp->b_datap), db->db_ref != 2) || - rptr < db->db_base + ire_fp_mp_len) { + rptr < db->db_base) { /* NOTE: we assume allocb returns an OK_32PTR */ must_alloc:; - mp1 = allocb(tcp->tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + - tcps->tcps_wroff_xtra + ire_fp_mp_len, BPRI_MED); + mp1 = allocb(connp->conn_ht_iphc_allocated + + tcps->tcps_wroff_xtra, BPRI_MED); if (mp1 == NULL) { freemsg(mp); - if (ire != NULL) - IRE_REFRELE(ire); return (-1); /* out_of_mem */ } mp1->b_cont = mp; mp = mp1; /* Leave room for Link Level header */ - len = tcp_hdr_len; - rptr = - &mp->b_rptr[tcps->tcps_wroff_xtra + ire_fp_mp_len]; + len = total_hdr_len; + rptr = &mp->b_rptr[tcps->tcps_wroff_xtra]; mp->b_wptr = &rptr[len]; } @@ -21057,18 +16192,17 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, /* * Excess data in mblk; can we split it? - * If MDT is enabled for the connection, + * If LSO is enabled for the connection, * keep on splitting as this is a transient * send path. */ - if (!do_lso_send && !tcp->tcp_mdt && - (spill + nmpsz > 0)) { + if (!do_lso_send && (spill + nmpsz > 0)) { /* * Don't split if stream head was * told to break up larger writes * into smaller ones. 
*/ - if (tcp->tcp_maxpsz > 0) + if (tcp->tcp_maxpsz_multiplier > 0) break; /* @@ -21096,8 +16230,6 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, if (mp1 == NULL) { *tail_unsent = spill; freemsg(mp); - if (ire != NULL) - IRE_REFRELE(ire); return (-1); /* out_of_mem */ } } @@ -21119,11 +16251,12 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, /* * Adjust the checksum */ - tcph = (tcph_t *)(rptr + tcp->tcp_ip_hdr_len); + tcpha = (tcpha_t *)(rptr + + ixa->ixa_ip_hdr_length); sum += spill; sum = (sum >> 16) + (sum & 0xFFFF); - U16_TO_ABE16(sum, tcph->th_sum); - if (tcp->tcp_ipversion == IPV4_VERSION) { + tcpha->tha_sum = htons(sum); + if (connp->conn_ipversion == IPV4_VERSION) { sum = ntohs( ((ipha_t *)rptr)->ipha_length) + spill; @@ -21136,311 +16269,55 @@ tcp_send(queue_t *q, tcp_t *tcp, const int mss, const int tcp_hdr_len, ((ip6_t *)rptr)->ip6_plen = htons(sum); } + ixa->ixa_pktlen += spill; *tail_unsent = 0; } } if (tcp->tcp_ip_forward_progress) { - ASSERT(tcp->tcp_ipversion == IPV6_VERSION); - *(uint32_t *)mp->b_rptr |= IP_FORWARD_PROG; tcp->tcp_ip_forward_progress = B_FALSE; + ixa->ixa_flags |= IXAF_REACH_CONF; + } else { + ixa->ixa_flags &= ~IXAF_REACH_CONF; } + /* + * Append LSO information, both flags and mss, to the mp. + */ if (do_lso_send) { - tcp_lsosend_data(tcp, mp, ire, ill, mss, - num_lso_seg); - tcp->tcp_obsegs += num_lso_seg; + lso_info_set(mp, mss, HW_LSO); + ixa->ixa_fragsize = IP_MAXPACKET; + ixa->ixa_extra_ident = num_lso_seg - 1; + DTRACE_PROBE2(tcp_send_lso, int, num_lso_seg, + boolean_t, B_TRUE); + + tcp_send_data(tcp, mp); + + /* + * Restore values of ixa_fragsize and ixa_extra_ident. 
+ */ + ixa->ixa_fragsize = ixa->ixa_pmtu; + ixa->ixa_extra_ident = 0; + tcp->tcp_obsegs += num_lso_seg; TCP_STAT(tcps, tcp_lso_times); TCP_STAT_UPDATE(tcps, tcp_lso_pkt_out, num_lso_seg); } else { - tcp_send_data(tcp, q, mp); + tcp_send_data(tcp, mp); BUMP_LOCAL(tcp->tcp_obsegs); } } - if (ire != NULL) - IRE_REFRELE(ire); return (0); } -/* Unlink and return any mblk that looks like it contains a MDT info */ -static mblk_t * -tcp_mdt_info_mp(mblk_t *mp) -{ - mblk_t *prev_mp; - - for (;;) { - prev_mp = mp; - /* no more to process? */ - if ((mp = mp->b_cont) == NULL) - break; - - switch (DB_TYPE(mp)) { - case M_CTL: - if (*(uint32_t *)mp->b_rptr != MDT_IOC_INFO_UPDATE) - continue; - ASSERT(prev_mp != NULL); - prev_mp->b_cont = mp->b_cont; - mp->b_cont = NULL; - return (mp); - default: - break; - } - } - return (mp); -} - -/* MDT info update routine, called when IP notifies us about MDT */ -static void -tcp_mdt_update(tcp_t *tcp, ill_mdt_capab_t *mdt_capab, boolean_t first) -{ - boolean_t prev_state; - tcp_stack_t *tcps = tcp->tcp_tcps; - - /* - * IP is telling us to abort MDT on this connection? We know - * this because the capability is only turned off when IP - * encounters some pathological cases, e.g. link-layer change - * where the new driver doesn't support MDT, or in situation - * where MDT usage on the link-layer has been switched off. - * IP would not have sent us the initial MDT_IOC_INFO_UPDATE - * if the link-layer doesn't support MDT, and if it does, it - * will indicate that the feature is to be turned on. - */ - prev_state = tcp->tcp_mdt; - tcp->tcp_mdt = (mdt_capab->ill_mdt_on != 0); - if (!tcp->tcp_mdt && !first) { - TCP_STAT(tcps, tcp_mdt_conn_halted3); - ip1dbg(("tcp_mdt_update: disabling MDT for connp %p\n", - (void *)tcp->tcp_connp)); - } - - /* - * We currently only support MDT on simple TCP/{IPv4,IPv6}, - * so disable MDT otherwise. The checks are done here - * and in tcp_wput_data(). 
- */ - if (tcp->tcp_mdt && - (tcp->tcp_ipversion == IPV4_VERSION && - tcp->tcp_ip_hdr_len != IP_SIMPLE_HDR_LENGTH) || - (tcp->tcp_ipversion == IPV6_VERSION && - tcp->tcp_ip_hdr_len != IPV6_HDR_LEN)) - tcp->tcp_mdt = B_FALSE; - - if (tcp->tcp_mdt) { - if (mdt_capab->ill_mdt_version != MDT_VERSION_2) { - cmn_err(CE_NOTE, "tcp_mdt_update: unknown MDT " - "version (%d), expected version is %d", - mdt_capab->ill_mdt_version, MDT_VERSION_2); - tcp->tcp_mdt = B_FALSE; - return; - } - - /* - * We need the driver to be able to handle at least three - * spans per packet in order for tcp MDT to be utilized. - * The first is for the header portion, while the rest are - * needed to handle a packet that straddles across two - * virtually non-contiguous buffers; a typical tcp packet - * therefore consists of only two spans. Note that we take - * a zero as "don't care". - */ - if (mdt_capab->ill_mdt_span_limit > 0 && - mdt_capab->ill_mdt_span_limit < 3) { - tcp->tcp_mdt = B_FALSE; - return; - } - - /* a zero means driver wants default value */ - tcp->tcp_mdt_max_pld = MIN(mdt_capab->ill_mdt_max_pld, - tcps->tcps_mdt_max_pbufs); - if (tcp->tcp_mdt_max_pld == 0) - tcp->tcp_mdt_max_pld = tcps->tcps_mdt_max_pbufs; - - /* ensure 32-bit alignment */ - tcp->tcp_mdt_hdr_head = roundup(MAX(tcps->tcps_mdt_hdr_head_min, - mdt_capab->ill_mdt_hdr_head), 4); - tcp->tcp_mdt_hdr_tail = roundup(MAX(tcps->tcps_mdt_hdr_tail_min, - mdt_capab->ill_mdt_hdr_tail), 4); - - if (!first && !prev_state) { - TCP_STAT(tcps, tcp_mdt_conn_resumed2); - ip1dbg(("tcp_mdt_update: reenabling MDT for connp %p\n", - (void *)tcp->tcp_connp)); - } - } -} - -/* Unlink and return any mblk that looks like it contains a LSO info */ -static mblk_t * -tcp_lso_info_mp(mblk_t *mp) -{ - mblk_t *prev_mp; - - for (;;) { - prev_mp = mp; - /* no more to process? 
*/ - if ((mp = mp->b_cont) == NULL) - break; - - switch (DB_TYPE(mp)) { - case M_CTL: - if (*(uint32_t *)mp->b_rptr != LSO_IOC_INFO_UPDATE) - continue; - ASSERT(prev_mp != NULL); - prev_mp->b_cont = mp->b_cont; - mp->b_cont = NULL; - return (mp); - default: - break; - } - } - - return (mp); -} - -/* LSO info update routine, called when IP notifies us about LSO */ -static void -tcp_lso_update(tcp_t *tcp, ill_lso_capab_t *lso_capab) -{ - tcp_stack_t *tcps = tcp->tcp_tcps; - - /* - * IP is telling us to abort LSO on this connection? We know - * this because the capability is only turned off when IP - * encounters some pathological cases, e.g. link-layer change - * where the new NIC/driver doesn't support LSO, or in situation - * where LSO usage on the link-layer has been switched off. - * IP would not have sent us the initial LSO_IOC_INFO_UPDATE - * if the link-layer doesn't support LSO, and if it does, it - * will indicate that the feature is to be turned on. - */ - tcp->tcp_lso = (lso_capab->ill_lso_on != 0); - TCP_STAT(tcps, tcp_lso_enabled); - - /* - * We currently only support LSO on simple TCP/IPv4, - * so disable LSO otherwise. The checks are done here - * and in tcp_wput_data(). - */ - if (tcp->tcp_lso && - (tcp->tcp_ipversion == IPV4_VERSION && - tcp->tcp_ip_hdr_len != IP_SIMPLE_HDR_LENGTH) || - (tcp->tcp_ipversion == IPV6_VERSION)) { - tcp->tcp_lso = B_FALSE; - TCP_STAT(tcps, tcp_lso_disabled); - } else { - tcp->tcp_lso_max = MIN(TCP_MAX_LSO_LENGTH, - lso_capab->ill_lso_max); - } -} - -static void -tcp_ire_ill_check(tcp_t *tcp, ire_t *ire, ill_t *ill, boolean_t check_lso_mdt) -{ - conn_t *connp = tcp->tcp_connp; - tcp_stack_t *tcps = tcp->tcp_tcps; - ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; - - ASSERT(ire != NULL); - - /* - * We may be in the fastpath here, and although we essentially do - * similar checks as in ip_bind_connected{_v6}/ip_xxinfo_return, - * we try to keep things as brief as possible. 
After all, these - * are only best-effort checks, and we do more thorough ones prior - * to calling tcp_send()/tcp_multisend(). - */ - if ((ipst->ips_ip_lso_outbound || ipst->ips_ip_multidata_outbound) && - check_lso_mdt && !(ire->ire_type & (IRE_LOCAL | IRE_LOOPBACK)) && - ill != NULL && !CONN_IPSEC_OUT_ENCAPSULATED(connp) && - !(ire->ire_flags & RTF_MULTIRT) && - !IPP_ENABLED(IPP_LOCAL_OUT, ipst) && - CONN_IS_LSO_MD_FASTPATH(connp)) { - if (ipst->ips_ip_lso_outbound && ILL_LSO_CAPABLE(ill)) { - /* Cache the result */ - connp->conn_lso_ok = B_TRUE; - - ASSERT(ill->ill_lso_capab != NULL); - if (!ill->ill_lso_capab->ill_lso_on) { - ill->ill_lso_capab->ill_lso_on = 1; - ip1dbg(("tcp_ire_ill_check: connp %p enables " - "LSO for interface %s\n", (void *)connp, - ill->ill_name)); - } - tcp_lso_update(tcp, ill->ill_lso_capab); - } else if (ipst->ips_ip_multidata_outbound && - ILL_MDT_CAPABLE(ill)) { - /* Cache the result */ - connp->conn_mdt_ok = B_TRUE; - - ASSERT(ill->ill_mdt_capab != NULL); - if (!ill->ill_mdt_capab->ill_mdt_on) { - ill->ill_mdt_capab->ill_mdt_on = 1; - ip1dbg(("tcp_ire_ill_check: connp %p enables " - "MDT for interface %s\n", (void *)connp, - ill->ill_name)); - } - tcp_mdt_update(tcp, ill->ill_mdt_capab, B_TRUE); - } - } - - /* - * The goal is to reduce the number of generated tcp segments by - * setting the maxpsz multiplier to 0; this will have an affect on - * tcp_maxpsz_set(). With this behavior, tcp will pack more data - * into each packet, up to SMSS bytes. Doing this reduces the number - * of outbound segments and incoming ACKs, thus allowing for better - * network and system performance. In contrast the legacy behavior - * may result in sending less than SMSS size, because the last mblk - * for some packets may have more data than needed to make up SMSS, - * and the legacy code refused to "split" it. 
- * - * We apply the new behavior on following situations: - * - * 1) Loopback connections, - * 2) Connections in which the remote peer is not on local subnet, - * 3) Local subnet connections over the bge interface (see below). - * - * Ideally, we would like this behavior to apply for interfaces other - * than bge. However, doing so would negatively impact drivers which - * perform dynamic mapping and unmapping of DMA resources, which are - * increased by setting the maxpsz multiplier to 0 (more mblks per - * packet will be generated by tcp). The bge driver does not suffer - * from this, as it copies the mblks into pre-mapped buffers, and - * therefore does not require more I/O resources than before. - * - * Otherwise, this behavior is present on all network interfaces when - * the destination endpoint is non-local, since reducing the number - * of packets in general is good for the network. - * - * TODO We need to remove this hard-coded conditional for bge once - * a better "self-tuning" mechanism, or a way to comprehend - * the driver transmit strategy is devised. Until the solution - * is found and well understood, we live with this hack. - */ - if (!tcp_static_maxpsz && - (tcp->tcp_loopback || !tcp->tcp_localnet || - (ill->ill_name_length > 3 && bcmp(ill->ill_name, "bge", 3) == 0))) { - /* override the default value */ - tcp->tcp_maxpsz = 0; - - ip3dbg(("tcp_ire_ill_check: connp %p tcp_maxpsz %d on " - "interface %s\n", (void *)connp, tcp->tcp_maxpsz, - ill != NULL ? ill->ill_name : ipif_loopback_name)); - } - - /* set the stream head parameters accordingly */ - (void) tcp_maxpsz_set(tcp, B_TRUE); -} - /* tcp_wput_flush is called by tcp_wput_nondata to handle M_FLUSH messages. */ static void tcp_wput_flush(tcp_t *tcp, mblk_t *mp) { uchar_t fval = *mp->b_rptr; mblk_t *tail; - queue_t *q = tcp->tcp_wq; + conn_t *connp = tcp->tcp_connp; + queue_t *q = connp->conn_wq; /* TODO: How should flush interact with urgent data? 
*/ if ((fval & FLUSHW) && tcp->tcp_xmit_head && @@ -21473,7 +16350,7 @@ tcp_wput_flush(tcp_t *tcp, mblk_t *mp) } /* * We have no unsent data, so unsent must be less than - * tcp_xmit_lowater, so re-enable flow. + * conn_sndlowat, so re-enable flow. */ mutex_enter(&tcp->tcp_non_sq_lock); if (tcp->tcp_flow_stopped) { @@ -21501,12 +16378,12 @@ tcp_wput_flush(tcp_t *tcp, mblk_t *mp) static void tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp) { - mblk_t *mp1; - struct iocblk *iocp = (struct iocblk *)mp->b_rptr; + mblk_t *mp1; + struct iocblk *iocp = (struct iocblk *)mp->b_rptr; STRUCT_HANDLE(strbuf, sb); - queue_t *q = tcp->tcp_wq; - int error; - uint_t addrlen; + uint_t addrlen; + conn_t *connp = tcp->tcp_connp; + queue_t *q = connp->conn_wq; /* Make sure it is one of ours. */ switch (iocp->ioc_cmd) { @@ -21514,7 +16391,7 @@ tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp) case TI_GETPEERNAME: break; default: - CALL_IP_WPUT(tcp->tcp_connp, q, mp); + ip_wput_nondata(q, mp); return; } switch (mi_copy_state(q, mp, &mp1)) { @@ -21541,43 +16418,56 @@ tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp) } STRUCT_SET_HANDLE(sb, iocp->ioc_flag, (void *)mp1->b_rptr); - addrlen = tcp->tcp_family == AF_INET ? 
sizeof (sin_t) : sizeof (sin6_t); + + if (connp->conn_family == AF_INET) + addrlen = sizeof (sin_t); + else + addrlen = sizeof (sin6_t); + if (STRUCT_FGET(sb, maxlen) < addrlen) { mi_copy_done(q, mp, EINVAL); return; } - mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE); - if (mp1 == NULL) - return; - switch (iocp->ioc_cmd) { case TI_GETMYNAME: - error = tcp_do_getsockname(tcp, (void *)mp1->b_rptr, &addrlen); break; case TI_GETPEERNAME: - error = tcp_do_getpeername(tcp, (void *)mp1->b_rptr, &addrlen); + if (tcp->tcp_state < TCPS_SYN_RCVD) { + mi_copy_done(q, mp, ENOTCONN); + return; + } break; } + mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE); + if (!mp1) + return; - if (error != 0) { - mi_copy_done(q, mp, error); - } else { - mp1->b_wptr += addrlen; - STRUCT_FSET(sb, len, addrlen); - - /* Copy out the address */ - mi_copyout(q, mp); + STRUCT_FSET(sb, len, addrlen); + switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { + case TI_GETMYNAME: + (void) conn_getsockname(connp, (struct sockaddr *)mp1->b_wptr, + &addrlen); + break; + case TI_GETPEERNAME: + (void) conn_getpeername(connp, (struct sockaddr *)mp1->b_wptr, + &addrlen); + break; } + mp1->b_wptr += addrlen; + /* Copy out the address */ + mi_copyout(q, mp); } static void tcp_use_pure_tpi(tcp_t *tcp) { + conn_t *connp = tcp->tcp_connp; + #ifdef _ILP32 - tcp->tcp_acceptor_id = (t_uscalar_t)tcp->tcp_rq; + tcp->tcp_acceptor_id = (t_uscalar_t)connp->conn_rq; #else - tcp->tcp_acceptor_id = tcp->tcp_connp->conn_dev; + tcp->tcp_acceptor_id = connp->conn_dev; #endif /* * Insert this socket into the acceptor hash. 
@@ -21595,11 +16485,11 @@ tcp_use_pure_tpi(tcp_t *tcp) */ /* ARGSUSED */ static void -tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2) +tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { - conn_t *connp = (conn_t *)arg; - tcp_t *tcp = connp->conn_tcp; - queue_t *q = tcp->tcp_wq; + conn_t *connp = (conn_t *)arg; + tcp_t *tcp = connp->conn_tcp; + queue_t *q = connp->conn_wq; struct iocblk *iocp; ASSERT(DB_TYPE(mp) == M_IOCTL); @@ -21617,17 +16507,6 @@ tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2) iocp = (struct iocblk *)mp->b_rptr; switch (iocp->ioc_cmd) { - case TCP_IOC_DEFAULT_Q: - /* Wants to be the default wq. */ - if (secpolicy_ip_config(iocp->ioc_cr, B_FALSE) != 0) { - iocp->ioc_error = EPERM; - iocp->ioc_count = 0; - mp->b_datap->db_type = M_IOCACK; - qreply(q, mp); - return; - } - tcp_def_q_set(tcp, mp); - return; case _SIOCSOCKFALLBACK: /* * Either sockmod is about to be popped and the socket @@ -21650,7 +16529,7 @@ tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2) qreply(q, mp); return; } - CALL_IP_WPUT(connp, q, mp); + ip_wput_nondata(q, mp); } /* @@ -21658,14 +16537,14 @@ tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2) */ /* ARGSUSED */ static void -tcp_wput_proto(void *arg, mblk_t *mp, void *arg2) +tcp_wput_proto(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { - conn_t *connp = (conn_t *)arg; - tcp_t *tcp = connp->conn_tcp; + conn_t *connp = (conn_t *)arg; + tcp_t *tcp = connp->conn_tcp; union T_primitives *tprim = (union T_primitives *)mp->b_rptr; - uchar_t *rptr; - t_scalar_t type; - cred_t *cr; + uchar_t *rptr; + t_scalar_t type; + cred_t *cr; /* * Try and ASSERT the minimum possible references on the @@ -21684,7 +16563,7 @@ tcp_wput_proto(void *arg, mblk_t *mp, void *arg2) if ((mp->b_wptr - rptr) >= sizeof (t_scalar_t)) { type = ((union T_primitives *)rptr)->type; if (type == T_EXDATA_REQ) { - tcp_output_urgent(connp, mp, arg2); + tcp_output_urgent(connp, mp, arg2, NULL); } else if (type != T_DATA_REQ) { 
goto non_urgent_data; } else { @@ -21695,7 +16574,7 @@ tcp_wput_proto(void *arg, mblk_t *mp, void *arg2) } return; } else { - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_wput_proto, dropping one..."); } @@ -21776,17 +16655,10 @@ non_urgent_data: * for subsequent processing by ip_restart_optmgmt(), which * will do the CONN_DEC_REF(). */ - CONN_INC_REF(connp); if ((int)tprim->type == T_SVR4_OPTMGMT_REQ) { - if (svr4_optcom_req(tcp->tcp_wq, mp, cr, &tcp_opt_obj, - B_TRUE) != EINPROGRESS) { - CONN_DEC_REF(connp); - } + svr4_optcom_req(connp->conn_wq, mp, cr, &tcp_opt_obj); } else { - if (tpi_optcom_req(tcp->tcp_wq, mp, cr, &tcp_opt_obj, - B_TRUE) != EINPROGRESS) { - CONN_DEC_REF(connp); - } + tpi_optcom_req(connp->conn_wq, mp, cr, &tcp_opt_obj); } break; @@ -21804,7 +16676,7 @@ non_urgent_data: * We were crossing FINs and got a reset from * the other side. Just ignore it. */ - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_wput_proto, T_ORDREL_REQ out of " @@ -21818,7 +16690,7 @@ non_urgent_data: tcp_addr_req(tcp, mp); break; default: - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_wput_proto, bogus TPI msg, type %d", tprim->type); @@ -21844,19 +16716,6 @@ tcp_wsrv(queue_t *q) TCP_STAT(tcps, tcp_wsrv_called); } -/* Non overlapping byte exchanger */ -static void -tcp_xchg(uchar_t *a, uchar_t *b, int len) -{ - uchar_t uch; - - while (len-- > 0) { - uch = a[len]; - a[len] = b[len]; - b[len] = uch; - } -} - /* * Send out a control packet on the tcp connection specified. This routine * is typically called where we need a simple ACK or RST generated. 
@@ -21865,50 +16724,51 @@ static void tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, uint32_t ack, int ctl) { uchar_t *rptr; - tcph_t *tcph; + tcpha_t *tcpha; ipha_t *ipha = NULL; ip6_t *ip6h = NULL; uint32_t sum; - int tcp_hdr_len; - int tcp_ip_hdr_len; + int total_hdr_len; + int ip_hdr_len; mblk_t *mp; tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; + ip_xmit_attr_t *ixa = connp->conn_ixa; /* * Save sum for use in source route later. */ - ASSERT(tcp != NULL); - sum = tcp->tcp_tcp_hdr_len + tcp->tcp_sum; - tcp_hdr_len = tcp->tcp_hdr_len; - tcp_ip_hdr_len = tcp->tcp_ip_hdr_len; + sum = connp->conn_ht_ulp_len + connp->conn_sum; + total_hdr_len = connp->conn_ht_iphc_len; + ip_hdr_len = ixa->ixa_ip_hdr_length; /* If a text string is passed in with the request, pass it to strlog. */ - if (str != NULL && tcp->tcp_debug) { + if (str != NULL && connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, "tcp_xmit_ctl: '%s', seq 0x%x, ack 0x%x, ctl 0x%x", str, seq, ack, ctl); } - mp = allocb(tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + tcps->tcps_wroff_xtra, + mp = allocb(connp->conn_ht_iphc_allocated + tcps->tcps_wroff_xtra, BPRI_MED); if (mp == NULL) { return; } rptr = &mp->b_rptr[tcps->tcps_wroff_xtra]; mp->b_rptr = rptr; - mp->b_wptr = &rptr[tcp_hdr_len]; - bcopy(tcp->tcp_iphc, rptr, tcp_hdr_len); + mp->b_wptr = &rptr[total_hdr_len]; + bcopy(connp->conn_ht_iphc, rptr, total_hdr_len); + + ixa->ixa_pktlen = total_hdr_len; - if (tcp->tcp_ipversion == IPV4_VERSION) { + if (ixa->ixa_flags & IXAF_IS_IPV4) { ipha = (ipha_t *)rptr; - ipha->ipha_length = htons(tcp_hdr_len); + ipha->ipha_length = htons(total_hdr_len); } else { ip6h = (ip6_t *)rptr; - ASSERT(tcp != NULL); - ip6h->ip6_plen = htons(tcp->tcp_hdr_len - - ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc)); + ip6h->ip6_plen = htons(total_hdr_len - IPV6_HDR_LEN); } - tcph = (tcph_t *)&rptr[tcp_ip_hdr_len]; - tcph->th_flags[0] = (uint8_t)ctl; + tcpha = (tcpha_t *)&rptr[ip_hdr_len]; + tcpha->tha_flags = 
(uint8_t)ctl; if (ctl & TH_RST) { BUMP_MIB(&tcps->tcps_mib, tcpOutRsts); BUMP_MIB(&tcps->tcps_mib, tcpOutControl); @@ -21917,43 +16777,45 @@ tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, uint32_t ack, int ctl) */ if (tcp->tcp_snd_ts_ok && tcp->tcp_state > TCPS_SYN_SENT) { - mp->b_wptr = &rptr[tcp_hdr_len - TCPOPT_REAL_TS_LEN]; + mp->b_wptr = &rptr[total_hdr_len - TCPOPT_REAL_TS_LEN]; *(mp->b_wptr) = TCPOPT_EOL; - if (tcp->tcp_ipversion == IPV4_VERSION) { - ipha->ipha_length = htons(tcp_hdr_len - + + ixa->ixa_pktlen = total_hdr_len - TCPOPT_REAL_TS_LEN; + + if (connp->conn_ipversion == IPV4_VERSION) { + ipha->ipha_length = htons(total_hdr_len - TCPOPT_REAL_TS_LEN); } else { - ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) - - TCPOPT_REAL_TS_LEN); + ip6h->ip6_plen = htons(total_hdr_len - + IPV6_HDR_LEN - TCPOPT_REAL_TS_LEN); } - tcph->th_offset_and_rsrvd[0] -= (3 << 4); + tcpha->tha_offset_and_reserved -= (3 << 4); sum -= TCPOPT_REAL_TS_LEN; } } if (ctl & TH_ACK) { if (tcp->tcp_snd_ts_ok) { U32_TO_BE32(lbolt, - (char *)tcph+TCP_MIN_HEADER_LENGTH+4); + (char *)tcpha + TCP_MIN_HEADER_LENGTH+4); U32_TO_BE32(tcp->tcp_ts_recent, - (char *)tcph+TCP_MIN_HEADER_LENGTH+8); + (char *)tcpha + TCP_MIN_HEADER_LENGTH+8); } /* Update the latest receive window size in TCP header. */ - U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, - tcph->th_win); + tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); tcp->tcp_rack = ack; tcp->tcp_rack_cnt = 0; BUMP_MIB(&tcps->tcps_mib, tcpOutAck); } BUMP_LOCAL(tcp->tcp_obsegs); - U32_TO_BE32(seq, tcph->th_seq); - U32_TO_BE32(ack, tcph->th_ack); + tcpha->tha_seq = htonl(seq); + tcpha->tha_ack = htonl(ack); /* * Include the adjustment for a source route if any. 
*/ sum = (sum >> 16) + (sum & 0xFFFF); - U16_TO_BE16(sum, tcph->th_sum); - tcp_send_data(tcp, tcp->tcp_wq, mp); + tcpha->tha_sum = htons(sum); + tcp_send_data(tcp, mp); } /* @@ -21991,115 +16853,32 @@ tcp_send_rst_chk(tcp_stack_t *tcps) } /* - * Send down the advice IP ioctl to tell IP to mark an IRE temporary. - */ -static void -tcp_ip_ire_mark_advice(tcp_t *tcp) -{ - mblk_t *mp; - ipic_t *ipic; - - if (tcp->tcp_ipversion == IPV4_VERSION) { - mp = tcp_ip_advise_mblk(&tcp->tcp_ipha->ipha_dst, IP_ADDR_LEN, - &ipic); - } else { - mp = tcp_ip_advise_mblk(&tcp->tcp_ip6h->ip6_dst, IPV6_ADDR_LEN, - &ipic); - } - if (mp == NULL) - return; - ipic->ipic_ire_marks |= IRE_MARK_TEMPORARY; - CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, mp); -} - -/* - * Return an IP advice ioctl mblk and set ipic to be the pointer - * to the advice structure. - */ -static mblk_t * -tcp_ip_advise_mblk(void *addr, int addr_len, ipic_t **ipic) -{ - struct iocblk *ioc; - mblk_t *mp, *mp1; - - mp = allocb(sizeof (ipic_t) + addr_len, BPRI_HI); - if (mp == NULL) - return (NULL); - bzero(mp->b_rptr, sizeof (ipic_t) + addr_len); - *ipic = (ipic_t *)mp->b_rptr; - (*ipic)->ipic_cmd = IP_IOC_IRE_ADVISE_NO_REPLY; - (*ipic)->ipic_addr_offset = sizeof (ipic_t); - - bcopy(addr, *ipic + 1, addr_len); - - (*ipic)->ipic_addr_length = addr_len; - mp->b_wptr = &mp->b_rptr[sizeof (ipic_t) + addr_len]; - - mp1 = mkiocb(IP_IOCTL); - if (mp1 == NULL) { - freemsg(mp); - return (NULL); - } - mp1->b_cont = mp; - ioc = (struct iocblk *)mp1->b_rptr; - ioc->ioc_count = sizeof (ipic_t) + addr_len; - - return (mp1); -} - -/* * Generate a reset based on an inbound packet, connp is set by caller * when RST is in response to an unexpected inbound packet for which * there is active tcp state in the system. * * IPSEC NOTE : Try to send the reply with the same protection as it came - * in. We still have the ipsec_mp that the packet was attached to. 
Thus - * the packet will go out at the same level of protection as it came in by - * converting the IPSEC_IN to IPSEC_OUT. + * in. We have the ip_recv_attr_t which is reversed to form the ip_xmit_attr_t. + * That way the packet will go out at the same level of protection as it + * came in with. */ static void -tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, - uint32_t ack, int ctl, uint_t ip_hdr_len, zoneid_t zoneid, - tcp_stack_t *tcps, conn_t *connp) +tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, uint32_t ack, int ctl, + ip_recv_attr_t *ira, ip_stack_t *ipst, conn_t *connp) { ipha_t *ipha = NULL; ip6_t *ip6h = NULL; ushort_t len; - tcph_t *tcph; + tcpha_t *tcpha; int i; - mblk_t *ipsec_mp; - boolean_t mctl_present; - ipic_t *ipic; ipaddr_t v4addr; in6_addr_t v6addr; - int addr_len; - void *addr; - queue_t *q = tcps->tcps_g_q; - tcp_t *tcp; - cred_t *cr; - pid_t pid; - mblk_t *nmp; - ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; - - if (tcps->tcps_g_q == NULL) { - /* - * For non-zero stackids the default queue isn't created - * until the first open, thus there can be a need to send - * a reset before then. But we can't do that, hence we just - * drop the packet. Later during boot, when the default queue - * has been setup, a retransmitted packet from the peer - * will result in a reset. 
- */ - ASSERT(tcps->tcps_netstack->netstack_stackid != - GLOBAL_NETSTACKID); - freemsg(mp); - return; - } - - if (connp != NULL) - tcp = connp->conn_tcp; - else - tcp = Q_TO_TCP(q); + netstack_t *ns = ipst->ips_netstack; + tcp_stack_t *tcps = ns->netstack_tcp; + ip_xmit_attr_t ixas, *ixa; + uint_t ip_hdr_len = ira->ira_ip_hdr_length; + boolean_t need_refrele = B_FALSE; /* ixa_refrele(ixa) */ + ushort_t port; if (!tcp_send_rst_chk(tcps)) { tcps->tcps_rst_unsent++; @@ -22107,16 +16886,41 @@ tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, return; } - if (mp->b_datap->db_type == M_CTL) { - ipsec_mp = mp; - mp = mp->b_cont; - mctl_present = B_TRUE; + /* + * If connp != NULL we use conn_ixa to keep IP_NEXTHOP and other + * options from the listener. In that case the caller must ensure that + * we are running on the listener = connp squeue. + * + * We get a safe copy of conn_ixa so we don't need to restore anything + * we or ip_output_simple might change in the ixa. + */ + if (connp != NULL) { + ASSERT(connp->conn_on_sqp); + + ixa = conn_get_ixa_exclusive(connp); + if (ixa == NULL) { + tcps->tcps_rst_unsent++; + freemsg(mp); + return; + } + need_refrele = B_TRUE; } else { - ipsec_mp = mp; - mctl_present = B_FALSE; + bzero(&ixas, sizeof (ixas)); + ixa = &ixas; + /* + * IXAF_VERIFY_SOURCE is overkill since we know the + * packet was for us. 
+ */ + ixa->ixa_flags |= IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE; + ixa->ixa_protocol = IPPROTO_TCP; + ixa->ixa_zoneid = ira->ira_zoneid; + ixa->ixa_ifindex = 0; + ixa->ixa_ipst = ipst; + ixa->ixa_cred = kcred; + ixa->ixa_cpid = NOPID; } - if (str && q && tcps->tcps_dbg) { + if (str && tcps->tcps_dbg) { (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, "tcp_xmit_early_reset: '%s', seq 0x%x, ack 0x%x, " "flags 0x%x", @@ -22126,20 +16930,12 @@ tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, mblk_t *mp1 = copyb(mp); freemsg(mp); mp = mp1; - if (!mp) { - if (mctl_present) - freeb(ipsec_mp); - return; - } else { - if (mctl_present) { - ipsec_mp->b_cont = mp; - } else { - ipsec_mp = mp; - } - } + if (mp == NULL) + goto done; } else if (mp->b_cont) { freemsg(mp->b_cont); mp->b_cont = NULL; + DB_CKSUMFLAGS(mp) = 0; } /* * We skip reversing source route here. @@ -22159,18 +16955,20 @@ tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, */ if (ipha->ipha_src == 0 || ipha->ipha_src == INADDR_BROADCAST || CLASSD(ipha->ipha_src)) { - freemsg(ipsec_mp); BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards); - return; + ip_drop_input("ipIfStatsInDiscards", mp, NULL); + freemsg(mp); + goto done; } } else { ip6h = (ip6_t *)mp->b_rptr; if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) || IN6_IS_ADDR_MULTICAST(&ip6h->ip6_src)) { - freemsg(ipsec_mp); BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsInDiscards); - return; + ip_drop_input("ipIfStatsInDiscards", mp, NULL); + freemsg(mp); + goto done; } /* Remove any extension headers assuming partial overlay */ @@ -22185,13 +16983,13 @@ tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, ip6h->ip6_nxt = IPPROTO_TCP; } } - tcph = (tcph_t *)&mp->b_rptr[ip_hdr_len]; - if (tcph->th_flags[0] & TH_RST) { - freemsg(ipsec_mp); - return; + tcpha = (tcpha_t *)&mp->b_rptr[ip_hdr_len]; + if (tcpha->tha_flags & TH_RST) { + freemsg(mp); + goto done; } - tcph->th_offset_and_rsrvd[0] = (5 << 4); - len = ip_hdr_len + sizeof (tcph_t); + 
tcpha->tha_offset_and_reserved = (5 << 4); + len = ip_hdr_len + sizeof (tcpha_t); mp->b_wptr = &mp->b_rptr[len]; if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { ipha->ipha_length = htons(len); @@ -22201,108 +16999,79 @@ tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, ipha->ipha_dst = v4addr; ipha->ipha_ident = 0; ipha->ipha_ttl = (uchar_t)tcps->tcps_ipv4_ttl; - addr_len = IP_ADDR_LEN; - addr = &v4addr; + ixa->ixa_flags |= IXAF_IS_IPV4; + ixa->ixa_ip_hdr_length = ip_hdr_len; } else { - /* No ip6i_t in this case */ ip6h->ip6_plen = htons(len - IPV6_HDR_LEN); /* Swap addresses */ v6addr = ip6h->ip6_src; ip6h->ip6_src = ip6h->ip6_dst; ip6h->ip6_dst = v6addr; ip6h->ip6_hops = (uchar_t)tcps->tcps_ipv6_hoplimit; - addr_len = IPV6_ADDR_LEN; - addr = &v6addr; - } - tcp_xchg(tcph->th_fport, tcph->th_lport, 2); - U32_TO_BE32(ack, tcph->th_ack); - U32_TO_BE32(seq, tcph->th_seq); - U16_TO_BE16(0, tcph->th_win); - U16_TO_BE16(sizeof (tcph_t), tcph->th_sum); - tcph->th_flags[0] = (uint8_t)ctl; + ixa->ixa_flags &= ~IXAF_IS_IPV4; + + if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_dst)) { + ixa->ixa_flags |= IXAF_SCOPEID_SET; + ixa->ixa_scopeid = ira->ira_ruifindex; + } + ixa->ixa_ip_hdr_length = IPV6_HDR_LEN; + } + ixa->ixa_pktlen = len; + + /* Swap the ports */ + port = tcpha->tha_fport; + tcpha->tha_fport = tcpha->tha_lport; + tcpha->tha_lport = port; + + tcpha->tha_ack = htonl(ack); + tcpha->tha_seq = htonl(seq); + tcpha->tha_win = 0; + tcpha->tha_sum = htons(sizeof (tcpha_t)); + tcpha->tha_flags = (uint8_t)ctl; if (ctl & TH_RST) { BUMP_MIB(&tcps->tcps_mib, tcpOutRsts); BUMP_MIB(&tcps->tcps_mib, tcpOutControl); } - /* IP trusts us to set up labels when required. 
*/ - if (is_system_labeled() && (cr = msg_getcred(mp, &pid)) != NULL && - crgetlabel(cr) != NULL) { - int err; - - if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) - err = tsol_check_label(cr, &mp, - tcp->tcp_connp->conn_mac_mode, - tcps->tcps_netstack->netstack_ip, pid); - else - err = tsol_check_label_v6(cr, &mp, - tcp->tcp_connp->conn_mac_mode, - tcps->tcps_netstack->netstack_ip, pid); - if (mctl_present) - ipsec_mp->b_cont = mp; - else - ipsec_mp = mp; - if (err != 0) { - freemsg(ipsec_mp); - return; - } - if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { - ipha = (ipha_t *)mp->b_rptr; - } else { - ip6h = (ip6_t *)mp->b_rptr; - } + /* Discard any old label */ + if (ixa->ixa_free_flags & IXA_FREE_TSL) { + ASSERT(ixa->ixa_tsl != NULL); + label_rele(ixa->ixa_tsl); + ixa->ixa_free_flags &= ~IXA_FREE_TSL; } + ixa->ixa_tsl = ira->ira_tsl; /* Behave as a multi-level responder */ - if (mctl_present) { - ipsec_in_t *ii = (ipsec_in_t *)ipsec_mp->b_rptr; - - ASSERT(ii->ipsec_in_type == IPSEC_IN); - if (!ipsec_in_to_out(ipsec_mp, ipha, ip6h, zoneid)) { - return; + if (ira->ira_flags & IRAF_IPSEC_SECURE) { + /* + * Apply IPsec based on how IPsec was applied to + * the packet that caused the RST. + */ + if (!ipsec_in_to_out(ira, ixa, mp, ipha, ip6h)) { + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); + /* Note: mp already consumed and ip_drop_packet done */ + goto done; } + } else { + /* + * This is in clear. The RST message we are building + * here should go out in clear, independent of our policy. + */ + ixa->ixa_flags |= IXAF_NO_IPSEC; } - if (zoneid == ALL_ZONES) - zoneid = GLOBAL_ZONEID; - - /* Add the zoneid so ip_output routes it properly */ - if ((nmp = ip_prepend_zoneid(ipsec_mp, zoneid, ipst)) == NULL) { - freemsg(ipsec_mp); - return; - } - ipsec_mp = nmp; /* * NOTE: one might consider tracing a TCP packet here, but * this function has no active TCP state and no tcp structure * that has a trace buffer. 
If we traced here, we would have * to keep a local trace buffer in tcp_record_trace(). - * - * TSol note: The mblk that contains the incoming packet was - * reused by tcp_xmit_listener_reset, so it already contains - * the right credentials and we don't need to call mblk_setcred. - * Also the conn's cred is not right since it is associated - * with tcps_g_q. */ - CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, ipsec_mp); - /* - * Tell IP to mark the IRE used for this destination temporary. - * This way, we can limit our exposure to DoS attack because IP - * creates an IRE for each destination. If there are too many, - * the time to do any routing lookup will be extremely long. And - * the lookup can be in interrupt context. - * - * Note that in normal circumstances, this marking should not - * affect anything. It would be nice if only 1 message is - * needed to inform IP that the IRE created for this RST should - * not be added to the cache table. But there is currently - * not such communication mechanism between TCP and IP. So - * the best we can do now is to send the advice ioctl to IP - * to mark the IRE temporary. 
- */ - if ((mp = tcp_ip_advise_mblk(addr, addr_len, &ipic)) != NULL) { - ipic->ipic_ire_marks |= IRE_MARK_TEMPORARY; - CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, mp); + (void) ip_output_simple(mp, ixa); +done: + ixa_cleanup(ixa); + if (need_refrele) { + ASSERT(ixa != &ixas); + ixa_refrele(ixa); } } @@ -22313,9 +17082,11 @@ tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, static int tcp_xmit_end(tcp_t *tcp) { - ipic_t *ipic; - mblk_t *mp; + mblk_t *mp; tcp_stack_t *tcps = tcp->tcp_tcps; + iulp_t uinfo; + ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; + conn_t *connp = tcp->tcp_connp; if (tcp->tcp_state < TCPS_SYN_RCVD || tcp->tcp_state > TCPS_CLOSE_WAIT) { @@ -22337,7 +17108,7 @@ tcp_xmit_end(tcp_t *tcp) tcp->tcp_fss, B_FALSE, NULL, B_FALSE); if (mp) { - tcp_send_data(tcp, tcp->tcp_wq, mp); + tcp_send_data(tcp, mp); } else { /* * Couldn't allocate msg. Pretend we got it out. @@ -22373,66 +17144,49 @@ tcp_xmit_end(tcp_t *tcp) return (0); /* - * NOTE: should not update if source routes i.e. if tcp_remote if - * different from the destination. + * We do not have a good algorithm to update ssthresh at this time. + * So don't do any update. + */ + bzero(&uinfo, sizeof (uinfo)); + uinfo.iulp_rtt = tcp->tcp_rtt_sa; + uinfo.iulp_rtt_sd = tcp->tcp_rtt_sd; + + /* + * Note that uinfo is kept for conn_faddr in the DCE. Could update even + * if source routed but we don't. 
*/ - if (tcp->tcp_ipversion == IPV4_VERSION) { - if (tcp->tcp_remote != tcp->tcp_ipha->ipha_dst) { + if (connp->conn_ipversion == IPV4_VERSION) { + if (connp->conn_faddr_v4 != tcp->tcp_ipha->ipha_dst) { return (0); } - mp = tcp_ip_advise_mblk(&tcp->tcp_ipha->ipha_dst, IP_ADDR_LEN, - &ipic); + (void) dce_update_uinfo_v4(connp->conn_faddr_v4, &uinfo, ipst); } else { - if (!(IN6_ARE_ADDR_EQUAL(&tcp->tcp_remote_v6, + uint_t ifindex; + + if (!(IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6, &tcp->tcp_ip6h->ip6_dst))) { return (0); } - mp = tcp_ip_advise_mblk(&tcp->tcp_ip6h->ip6_dst, IPV6_ADDR_LEN, - &ipic); - } - - /* Record route attributes in the IRE for use by future connections. */ - if (mp == NULL) - return (0); - - /* - * We do not have a good algorithm to update ssthresh at this time. - * So don't do any update. - */ - ipic->ipic_rtt = tcp->tcp_rtt_sa; - ipic->ipic_rtt_sd = tcp->tcp_rtt_sd; - - CALL_IP_WPUT(tcp->tcp_connp, tcp->tcp_wq, mp); - - return (0); -} + ifindex = 0; + if (IN6_IS_ADDR_LINKSCOPE(&connp->conn_faddr_v6)) { + ip_xmit_attr_t *ixa = connp->conn_ixa; -/* ARGSUSED */ -void -tcp_xmit_reset(void *arg, mblk_t *mp, void *arg2) -{ - conn_t *connp = (conn_t *)arg; - mblk_t *mp1; - tcp_t *tcp = connp->conn_tcp; - tcp_xmit_reset_event_t *eventp; - - ASSERT(mp->b_datap->db_type == M_PROTO && - MBLKL(mp) == sizeof (tcp_xmit_reset_event_t)); + /* + * If we are going to create a DCE we'd better have + * an ifindex + */ + if (ixa->ixa_nce != NULL) { + ifindex = ixa->ixa_nce->nce_common->ncec_ill-> + ill_phyint->phyint_ifindex; + } else { + return (0); + } + } - if (tcp->tcp_state != TCPS_LISTEN) { - freemsg(mp); - return; + (void) dce_update_uinfo(&connp->conn_faddr_v6, ifindex, &uinfo, + ipst); } - - mp1 = mp->b_cont; - mp->b_cont = NULL; - eventp = (tcp_xmit_reset_event_t *)mp->b_rptr; - ASSERT(eventp->tcp_xre_tcps->tcps_netstack == - connp->conn_netstack); - - tcp_xmit_listeners_reset(mp1, eventp->tcp_xre_iphdrlen, - eventp->tcp_xre_zoneid, eventp->tcp_xre_tcps, 
connp); - freemsg(mp); + return (0); } /* @@ -22442,45 +17196,25 @@ tcp_xmit_reset(void *arg, mblk_t *mp, void *arg2) * Note that we are reusing the incoming mp to construct the outgoing RST. */ void -tcp_xmit_listeners_reset(mblk_t *mp, uint_t ip_hdr_len, zoneid_t zoneid, - tcp_stack_t *tcps, conn_t *connp) +tcp_xmit_listeners_reset(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst, + conn_t *connp) { uchar_t *rptr; uint32_t seg_len; - tcph_t *tcph; + tcpha_t *tcpha; uint32_t seg_seq; uint32_t seg_ack; uint_t flags; - mblk_t *ipsec_mp; ipha_t *ipha; ip6_t *ip6h; - boolean_t mctl_present = B_FALSE; - boolean_t check = B_TRUE; boolean_t policy_present; + netstack_t *ns = ipst->ips_netstack; + tcp_stack_t *tcps = ns->netstack_tcp; ipsec_stack_t *ipss = tcps->tcps_netstack->netstack_ipsec; + uint_t ip_hdr_len = ira->ira_ip_hdr_length; TCP_STAT(tcps, tcp_no_listener); - ipsec_mp = mp; - - if (mp->b_datap->db_type == M_CTL) { - ipsec_in_t *ii; - - mctl_present = B_TRUE; - mp = mp->b_cont; - - ii = (ipsec_in_t *)ipsec_mp->b_rptr; - ASSERT(ii->ipsec_in_type == IPSEC_IN); - if (ii->ipsec_in_dont_check) { - check = B_FALSE; - if (!ii->ipsec_in_secure) { - freeb(ipsec_mp); - mctl_present = B_FALSE; - ipsec_mp = mp; - } - } - } - if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { policy_present = ipss->ipsec_inbound_v4_policy_present; ipha = (ipha_t *)mp->b_rptr; @@ -22491,41 +17225,39 @@ tcp_xmit_listeners_reset(mblk_t *mp, uint_t ip_hdr_len, zoneid_t zoneid, ip6h = (ip6_t *)mp->b_rptr; } - if (check && policy_present) { + if (policy_present) { /* * The conn_t parameter is NULL because we already know * nobody's home. 
*/ - ipsec_mp = ipsec_check_global_policy( - ipsec_mp, (conn_t *)NULL, ipha, ip6h, mctl_present, - tcps->tcps_netstack); - if (ipsec_mp == NULL) + mp = ipsec_check_global_policy(mp, (conn_t *)NULL, ipha, ip6h, + ira, ns); + if (mp == NULL) return; } - if (is_system_labeled() && !tsol_can_reply_error(mp)) { + if (is_system_labeled() && !tsol_can_reply_error(mp, ira)) { DTRACE_PROBE2( tx__ip__log__error__nolistener__tcp, char *, "Could not reply with RST to mp(1)", mblk_t *, mp); ip2dbg(("tcp_xmit_listeners_reset: not permitted to reply\n")); - freemsg(ipsec_mp); + freemsg(mp); return; } rptr = mp->b_rptr; - tcph = (tcph_t *)&rptr[ip_hdr_len]; - seg_seq = BE32_TO_U32(tcph->th_seq); - seg_ack = BE32_TO_U32(tcph->th_ack); - flags = tcph->th_flags[0]; + tcpha = (tcpha_t *)&rptr[ip_hdr_len]; + seg_seq = ntohl(tcpha->tha_seq); + seg_ack = ntohl(tcpha->tha_ack); + flags = tcpha->tha_flags; - seg_len = msgdsize(mp) - (TCP_HDR_LENGTH(tcph) + ip_hdr_len); + seg_len = msgdsize(mp) - (TCP_HDR_LENGTH(tcpha) + ip_hdr_len); if (flags & TH_RST) { - freemsg(ipsec_mp); + freemsg(mp); } else if (flags & TH_ACK) { - tcp_xmit_early_reset("no tcp, reset", - ipsec_mp, seg_ack, 0, TH_RST, ip_hdr_len, zoneid, tcps, - connp); + tcp_xmit_early_reset("no tcp, reset", mp, seg_ack, 0, TH_RST, + ira, ipst, connp); } else { if (flags & TH_SYN) { seg_len++; @@ -22537,14 +17269,13 @@ tcp_xmit_listeners_reset(mblk_t *mp, uint_t ip_hdr_len, zoneid_t zoneid, * segment is neither. Just drop it on the * floor. 
*/ - freemsg(ipsec_mp); + freemsg(mp); tcps->tcps_rst_unsent++; return; } - tcp_xmit_early_reset("no tcp, reset/ack", - ipsec_mp, 0, seg_seq + seg_len, - TH_RST | TH_ACK, ip_hdr_len, zoneid, tcps, connp); + tcp_xmit_early_reset("no tcp, reset/ack", mp, 0, + seg_seq + seg_len, TH_RST | TH_ACK, ira, ipst, connp); } } @@ -22573,14 +17304,16 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, mblk_t *mp1; mblk_t *mp2; uchar_t *rptr; - tcph_t *tcph; + tcpha_t *tcpha; int32_t num_sack_blk = 0; int32_t sack_opt_len = 0; tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; + ip_xmit_attr_t *ixa = connp->conn_ixa; /* Allocate for our maximum TCP header + link-level */ - mp1 = allocb(tcp->tcp_ip_hdr_len + TCP_MAX_HDR_LENGTH + - tcps->tcps_wroff_xtra, BPRI_MED); + mp1 = allocb(connp->conn_ht_iphc_allocated + tcps->tcps_wroff_xtra, + BPRI_MED); if (!mp1) return (NULL); data_length = 0; @@ -22646,15 +17379,14 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, } /* Update the latest receive window size in TCP header. */ - U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, - tcp->tcp_tcph->th_win); + tcp->tcp_tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); rptr = mp1->b_rptr + tcps->tcps_wroff_xtra; mp1->b_rptr = rptr; - mp1->b_wptr = rptr + tcp->tcp_hdr_len + sack_opt_len; - bcopy(tcp->tcp_iphc, rptr, tcp->tcp_hdr_len); - tcph = (tcph_t *)&rptr[tcp->tcp_ip_hdr_len]; - U32_TO_ABE32(seq, tcph->th_seq); + mp1->b_wptr = rptr + connp->conn_ht_iphc_len + sack_opt_len; + bcopy(connp->conn_ht_iphc, rptr, connp->conn_ht_iphc_len); + tcpha = (tcpha_t *)&rptr[ixa->ixa_ip_hdr_length]; + tcpha->tha_seq = htonl(seq); /* * Use tcp_unsent to determine if the PUSH bit should be used assumes @@ -22729,14 +17461,14 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, wptr[0] = TCPOPT_MAXSEG; wptr[1] = TCPOPT_MAXSEG_LEN; wptr += 2; - u1 = tcp->tcp_if_mtu - - (tcp->tcp_ipversion == IPV4_VERSION ? 
+ u1 = tcp->tcp_initial_pmtu - + (connp->conn_ipversion == IPV4_VERSION ? IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) - TCP_MIN_HEADER_LENGTH; U16_TO_BE16(u1, wptr); mp1->b_wptr = wptr + 2; /* Update the offset to cover the additional word */ - tcph->th_offset_and_rsrvd[0] += (1 << 4); + tcpha->tha_offset_and_reserved += (1 << 4); /* * Note that the following way of filling in @@ -22763,7 +17495,7 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, ASSERT(tcp->tcp_ts_recent == 0); U32_TO_BE32(0L, wptr); mp1->b_wptr += TCPOPT_REAL_TS_LEN; - tcph->th_offset_and_rsrvd[0] += + tcpha->tha_offset_and_reserved += (3 << 4); } @@ -22819,7 +17551,7 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, wptr[2] = TCPOPT_WS_LEN; wptr[3] = (uchar_t)tcp->tcp_rcv_ws; mp1->b_wptr += TCPOPT_REAL_WS_LEN; - tcph->th_offset_and_rsrvd[0] += (1 << 4); + tcpha->tha_offset_and_reserved += (1 << 4); } if (tcp->tcp_snd_sack_ok) { @@ -22829,7 +17561,7 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, wptr[2] = TCPOPT_SACK_PERMITTED; wptr[3] = TCPOPT_SACK_OK_LEN; mp1->b_wptr += TCPOPT_REAL_SACK_OK_LEN; - tcph->th_offset_and_rsrvd[0] += (1 << 4); + tcpha->tha_offset_and_reserved += (1 << 4); } /* allocb() of adequate mblk assures space */ @@ -22840,9 +17572,9 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, * Get IP set to checksum on our behalf * Include the adjustment for a source route if any. 
*/ - u1 += tcp->tcp_sum; + u1 += connp->conn_sum; u1 = (u1 >> 16) + (u1 & 0xFFFF); - U16_TO_BE16(u1, tcph->th_sum); + tcpha->tha_sum = htons(u1); BUMP_MIB(&tcps->tcps_mib, tcpOutControl); } if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && @@ -22878,10 +17610,10 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, u1 < (uint32_t)(64 * 1024)) { flags |= TH_URG; BUMP_MIB(&tcps->tcps_mib, tcpOutUrg); - U32_TO_ABE16(u1, tcph->th_urp); + tcpha->tha_urp = htons(u1); } } - tcph->th_flags[0] = (uchar_t)flags; + tcpha->tha_flags = (uchar_t)flags; tcp->tcp_rack = tcp->tcp_rnxt; tcp->tcp_rack_cnt = 0; @@ -22890,14 +17622,14 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, uint32_t llbolt = (uint32_t)lbolt; U32_TO_BE32(llbolt, - (char *)tcph+TCP_MIN_HEADER_LENGTH+4); + (char *)tcpha + TCP_MIN_HEADER_LENGTH+4); U32_TO_BE32(tcp->tcp_ts_recent, - (char *)tcph+TCP_MIN_HEADER_LENGTH+8); + (char *)tcpha + TCP_MIN_HEADER_LENGTH+8); } } if (num_sack_blk > 0) { - uchar_t *wptr = (uchar_t *)tcph + tcp->tcp_tcp_hdr_len; + uchar_t *wptr = (uchar_t *)tcpha + connp->conn_ht_ulp_len; sack_blk_t *tmp; int32_t i; @@ -22915,33 +17647,34 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, U32_TO_BE32(tmp[i].end, wptr); wptr += sizeof (tcp_seq); } - tcph->th_offset_and_rsrvd[0] += ((num_sack_blk * 2 + 1) << 4); + tcpha->tha_offset_and_reserved += ((num_sack_blk * 2 + 1) << 4); } ASSERT((uintptr_t)(mp1->b_wptr - rptr) <= (uintptr_t)INT_MAX); data_length += (int)(mp1->b_wptr - rptr); - if (tcp->tcp_ipversion == IPV4_VERSION) { + + ixa->ixa_pktlen = data_length; + + if (ixa->ixa_flags & IXAF_IS_IPV4) { ((ipha_t *)rptr)->ipha_length = htons(data_length); } else { - ip6_t *ip6 = (ip6_t *)(rptr + - (((ip6_t *)rptr)->ip6_nxt == IPPROTO_RAW ? 
- sizeof (ip6i_t) : 0)); + ip6_t *ip6 = (ip6_t *)rptr; - ip6->ip6_plen = htons(data_length - - ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc)); + ip6->ip6_plen = htons(data_length - IPV6_HDR_LEN); } /* * Prime pump for IP * Include the adjustment for a source route if any. */ - data_length -= tcp->tcp_ip_hdr_len; - data_length += tcp->tcp_sum; + data_length -= ixa->ixa_ip_hdr_length; + data_length += connp->conn_sum; data_length = (data_length >> 16) + (data_length & 0xFFFF); - U16_TO_ABE16(data_length, tcph->th_sum); + tcpha->tha_sum = htons(data_length); if (tcp->tcp_ip_forward_progress) { - ASSERT(tcp->tcp_ipversion == IPV6_VERSION); - *(uint32_t *)mp1->b_rptr |= IP_FORWARD_PROG; tcp->tcp_ip_forward_progress = B_FALSE; + connp->conn_ixa->ixa_flags |= IXAF_REACH_CONF; + } else { + connp->conn_ixa->ixa_flags &= ~IXAF_REACH_CONF; } return (mp1); } @@ -23012,7 +17745,7 @@ tcp_ack_timer(void *arg) BUMP_LOCAL(tcp->tcp_obsegs); BUMP_MIB(&tcps->tcps_mib, tcpOutAck); BUMP_MIB(&tcps->tcps_mib, tcpOutAckDelayed); - tcp_send_data(tcp, tcp->tcp_wq, mp); + tcp_send_data(tcp, mp); } } @@ -23023,6 +17756,7 @@ tcp_ack_mp(tcp_t *tcp) { uint32_t seq_no; tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; /* * There are a few cases to be considered while setting the sequence no. 
@@ -23058,12 +17792,13 @@ tcp_ack_mp(tcp_t *tcp) /* Generate a simple ACK */ int data_length; uchar_t *rptr; - tcph_t *tcph; + tcpha_t *tcpha; mblk_t *mp1; + int32_t total_hdr_len; int32_t tcp_hdr_len; - int32_t tcp_tcp_hdr_len; int32_t num_sack_blk = 0; int32_t sack_opt_len; + ip_xmit_attr_t *ixa = connp->conn_ixa; /* * Allocate space for TCP + IP headers @@ -23074,34 +17809,34 @@ tcp_ack_mp(tcp_t *tcp) tcp->tcp_num_sack_blk); sack_opt_len = num_sack_blk * sizeof (sack_blk_t) + TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN; - tcp_hdr_len = tcp->tcp_hdr_len + sack_opt_len; - tcp_tcp_hdr_len = tcp->tcp_tcp_hdr_len + sack_opt_len; + total_hdr_len = connp->conn_ht_iphc_len + sack_opt_len; + tcp_hdr_len = connp->conn_ht_ulp_len + sack_opt_len; } else { - tcp_hdr_len = tcp->tcp_hdr_len; - tcp_tcp_hdr_len = tcp->tcp_tcp_hdr_len; + total_hdr_len = connp->conn_ht_iphc_len; + tcp_hdr_len = connp->conn_ht_ulp_len; } - mp1 = allocb(tcp_hdr_len + tcps->tcps_wroff_xtra, BPRI_MED); + mp1 = allocb(total_hdr_len + tcps->tcps_wroff_xtra, BPRI_MED); if (!mp1) return (NULL); /* Update the latest receive window size in TCP header. */ - U32_TO_ABE16(tcp->tcp_rwnd >> tcp->tcp_rcv_ws, - tcp->tcp_tcph->th_win); + tcp->tcp_tcpha->tha_win = + htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); /* copy in prototype TCP + IP header */ rptr = mp1->b_rptr + tcps->tcps_wroff_xtra; mp1->b_rptr = rptr; - mp1->b_wptr = rptr + tcp_hdr_len; - bcopy(tcp->tcp_iphc, rptr, tcp->tcp_hdr_len); + mp1->b_wptr = rptr + total_hdr_len; + bcopy(connp->conn_ht_iphc, rptr, connp->conn_ht_iphc_len); - tcph = (tcph_t *)&rptr[tcp->tcp_ip_hdr_len]; + tcpha = (tcpha_t *)&rptr[ixa->ixa_ip_hdr_length]; /* Set the TCP sequence number. */ - U32_TO_ABE32(seq_no, tcph->th_seq); + tcpha->tha_seq = htonl(seq_no); /* Set up the TCP flag field. 
*/ - tcph->th_flags[0] = (uchar_t)TH_ACK; + tcpha->tha_flags = (uchar_t)TH_ACK; if (tcp->tcp_ecn_echo_on) - tcph->th_flags[0] |= TH_ECE; + tcpha->tha_flags |= TH_ECE; tcp->tcp_rack = tcp->tcp_rnxt; tcp->tcp_rack_cnt = 0; @@ -23111,14 +17846,15 @@ tcp_ack_mp(tcp_t *tcp) uint32_t llbolt = (uint32_t)lbolt; U32_TO_BE32(llbolt, - (char *)tcph+TCP_MIN_HEADER_LENGTH+4); + (char *)tcpha + TCP_MIN_HEADER_LENGTH+4); U32_TO_BE32(tcp->tcp_ts_recent, - (char *)tcph+TCP_MIN_HEADER_LENGTH+8); + (char *)tcpha + TCP_MIN_HEADER_LENGTH+8); } /* Fill in SACK options */ if (num_sack_blk > 0) { - uchar_t *wptr = (uchar_t *)tcph + tcp->tcp_tcp_hdr_len; + uchar_t *wptr = (uchar_t *)tcpha + + connp->conn_ht_ulp_len; sack_blk_t *tmp; int32_t i; @@ -23136,34 +17872,33 @@ tcp_ack_mp(tcp_t *tcp) U32_TO_BE32(tmp[i].end, wptr); wptr += sizeof (tcp_seq); } - tcph->th_offset_and_rsrvd[0] += ((num_sack_blk * 2 + 1) - << 4); + tcpha->tha_offset_and_reserved += + ((num_sack_blk * 2 + 1) << 4); } - if (tcp->tcp_ipversion == IPV4_VERSION) { - ((ipha_t *)rptr)->ipha_length = htons(tcp_hdr_len); + ixa->ixa_pktlen = total_hdr_len; + + if (ixa->ixa_flags & IXAF_IS_IPV4) { + ((ipha_t *)rptr)->ipha_length = htons(total_hdr_len); } else { - /* Check for ip6i_t header in sticky hdrs */ - ip6_t *ip6 = (ip6_t *)(rptr + - (((ip6_t *)rptr)->ip6_nxt == IPPROTO_RAW ? - sizeof (ip6i_t) : 0)); + ip6_t *ip6 = (ip6_t *)rptr; - ip6->ip6_plen = htons(tcp_hdr_len - - ((char *)&tcp->tcp_ip6h[1] - tcp->tcp_iphc)); + ip6->ip6_plen = htons(total_hdr_len - IPV6_HDR_LEN); } /* * Prime pump for checksum calculation in IP. Include the * adjustment for a source route if any. 
*/ - data_length = tcp_tcp_hdr_len + tcp->tcp_sum; + data_length = tcp_hdr_len + connp->conn_sum; data_length = (data_length >> 16) + (data_length & 0xFFFF); - U16_TO_ABE16(data_length, tcph->th_sum); + tcpha->tha_sum = htons(data_length); if (tcp->tcp_ip_forward_progress) { - ASSERT(tcp->tcp_ipversion == IPV6_VERSION); - *(uint32_t *)mp1->b_rptr |= IP_FORWARD_PROG; tcp->tcp_ip_forward_progress = B_FALSE; + connp->conn_ixa->ixa_flags |= IXAF_REACH_CONF; + } else { + connp->conn_ixa->ixa_flags &= ~IXAF_REACH_CONF; } return (mp1); } @@ -23183,6 +17918,8 @@ tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock) tcp_t **tcpp; tcp_t *tcpnext; tcp_t *tcphash; + conn_t *connp = tcp->tcp_connp; + conn_t *connext; if (tcp->tcp_ptpbhn != NULL) { ASSERT(!caller_holds_lock); @@ -23199,7 +17936,7 @@ tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock) if (tcphash != NULL) { /* Look for an entry using the same port */ while ((tcphash = tcpp[0]) != NULL && - tcp->tcp_lport != tcphash->tcp_lport) + connp->conn_lport != tcphash->tcp_connp->conn_lport) tcpp = &(tcphash->tcp_bind_hash); /* The port was not found, just add to the end */ @@ -23219,14 +17956,19 @@ tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock) * INADDR_ANY. 
*/ tcpnext = tcphash; + connext = tcpnext->tcp_connp; tcphash = NULL; - if (V6_OR_V4_INADDR_ANY(tcp->tcp_bound_source_v6) && - !V6_OR_V4_INADDR_ANY(tcpnext->tcp_bound_source_v6)) { - while ((tcpnext = tcpp[0]) != NULL && - !V6_OR_V4_INADDR_ANY(tcpnext->tcp_bound_source_v6)) - tcpp = &(tcpnext->tcp_bind_hash_port); - - if (tcpnext) { + if (V6_OR_V4_INADDR_ANY(connp->conn_bound_addr_v6) && + !V6_OR_V4_INADDR_ANY(connext->conn_bound_addr_v6)) { + while ((tcpnext = tcpp[0]) != NULL) { + connext = tcpnext->tcp_connp; + if (!V6_OR_V4_INADDR_ANY( + connext->conn_bound_addr_v6)) + tcpp = &(tcpnext->tcp_bind_hash_port); + else + break; + } + if (tcpnext != NULL) { tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port; tcphash = tcpnext->tcp_bind_hash; if (tcphash != NULL) { @@ -23263,6 +18005,7 @@ tcp_bind_hash_remove(tcp_t *tcp) tcp_t *tcpnext; kmutex_t *lockp; tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; if (tcp->tcp_ptpbhn == NULL) return; @@ -23271,8 +18014,9 @@ tcp_bind_hash_remove(tcp_t *tcp) * Extract the lock pointer in case there are concurrent * hash_remove's for this instance. 
*/ - ASSERT(tcp->tcp_lport != 0); - lockp = &tcps->tcps_bind_fanout[TCP_BIND_HASH(tcp->tcp_lport)].tf_lock; + ASSERT(connp->conn_lport != 0); + lockp = &tcps->tcps_bind_fanout[TCP_BIND_HASH( + connp->conn_lport)].tf_lock; ASSERT(lockp != NULL); mutex_enter(lockp); @@ -23548,7 +18292,7 @@ tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp, int *do_disconnectp, *sys_errorp = 0; *do_disconnectp = 0; - error = tpi_optcom_buf(tcp->tcp_wq, mp, opt_lenp, + error = tpi_optcom_buf(tcp->tcp_connp->conn_wq, mp, opt_lenp, opt_offset, cr, &tcp_opt_obj, NULL, &is_absreq_failure); @@ -23663,238 +18407,6 @@ tcp_sack_info_constructor(void *buf, void *cdrarg, int kmflags) return (0); } -/* ARGSUSED */ -static int -tcp_iphc_constructor(void *buf, void *cdrarg, int kmflags) -{ - bzero(buf, TCP_MAX_COMBINED_HEADER_LENGTH); - return (0); -} - -/* - * Make sure we wait until the default queue is setup, yet allow - * tcp_g_q_create() to open a TCP stream. - * We need to allow tcp_g_q_create() do do an open - * of tcp, hence we compare curhread. - * All others have to wait until the tcps_g_q has been - * setup. 
- */ -void -tcp_g_q_setup(tcp_stack_t *tcps) -{ - mutex_enter(&tcps->tcps_g_q_lock); - if (tcps->tcps_g_q != NULL) { - mutex_exit(&tcps->tcps_g_q_lock); - return; - } - if (tcps->tcps_g_q_creator == NULL) { - /* This thread will set it up */ - tcps->tcps_g_q_creator = curthread; - mutex_exit(&tcps->tcps_g_q_lock); - tcp_g_q_create(tcps); - mutex_enter(&tcps->tcps_g_q_lock); - ASSERT(tcps->tcps_g_q_creator == curthread); - tcps->tcps_g_q_creator = NULL; - cv_signal(&tcps->tcps_g_q_cv); - ASSERT(tcps->tcps_g_q != NULL); - mutex_exit(&tcps->tcps_g_q_lock); - return; - } - /* Everybody but the creator has to wait */ - if (tcps->tcps_g_q_creator != curthread) { - while (tcps->tcps_g_q == NULL) - cv_wait(&tcps->tcps_g_q_cv, &tcps->tcps_g_q_lock); - } - mutex_exit(&tcps->tcps_g_q_lock); -} - -#define IP "ip" - -#define TCP6DEV "/devices/pseudo/tcp6@0:tcp6" - -/* - * Create a default tcp queue here instead of in strplumb - */ -void -tcp_g_q_create(tcp_stack_t *tcps) -{ - int error; - ldi_handle_t lh = NULL; - ldi_ident_t li = NULL; - int rval; - cred_t *cr; - major_t IP_MAJ; - -#ifdef NS_DEBUG - (void) printf("tcp_g_q_create()\n"); -#endif - - IP_MAJ = ddi_name_to_major(IP); - - ASSERT(tcps->tcps_g_q_creator == curthread); - - error = ldi_ident_from_major(IP_MAJ, &li); - if (error) { -#ifdef DEBUG - printf("tcp_g_q_create: lyr ident get failed error %d\n", - error); -#endif - return; - } - - cr = zone_get_kcred(netstackid_to_zoneid( - tcps->tcps_netstack->netstack_stackid)); - ASSERT(cr != NULL); - /* - * We set the tcp default queue to IPv6 because IPv4 falls - * back to IPv6 when it can't find a client, but - * IPv6 does not fall back to IPv4. 
- */ - error = ldi_open_by_name(TCP6DEV, FREAD|FWRITE, cr, &lh, li); - if (error) { -#ifdef DEBUG - printf("tcp_g_q_create: open of TCP6DEV failed error %d\n", - error); -#endif - goto out; - } - - /* - * This ioctl causes the tcp framework to cache a pointer to - * this stream, so we don't want to close the stream after - * this operation. - * Use the kernel credentials that are for the zone we're in. - */ - error = ldi_ioctl(lh, TCP_IOC_DEFAULT_Q, - (intptr_t)0, FKIOCTL, cr, &rval); - if (error) { -#ifdef DEBUG - printf("tcp_g_q_create: ioctl TCP_IOC_DEFAULT_Q failed " - "error %d\n", error); -#endif - goto out; - } - tcps->tcps_g_q_lh = lh; /* For tcp_g_q_close */ - lh = NULL; -out: - /* Close layered handles */ - if (li) - ldi_ident_release(li); - /* Keep cred around until _inactive needs it */ - tcps->tcps_g_q_cr = cr; -} - -/* - * We keep tcp_g_q set until all other tcp_t's in the zone - * has gone away, and then when tcp_g_q_inactive() is called - * we clear it. - */ -void -tcp_g_q_destroy(tcp_stack_t *tcps) -{ -#ifdef NS_DEBUG - (void) printf("tcp_g_q_destroy()for stack %d\n", - tcps->tcps_netstack->netstack_stackid); -#endif - - if (tcps->tcps_g_q == NULL) { - return; /* Nothing to cleanup */ - } - /* - * Drop reference corresponding to the default queue. - * This reference was added from tcp_open when the default queue - * was created, hence we compensate for this extra drop in - * tcp_g_q_close. If the refcnt drops to zero here it means - * the default queue was the last one to be open, in which - * case, then tcp_g_q_inactive will be - * called as a result of the refrele. - */ - TCPS_REFRELE(tcps); -} - -/* - * Called when last tcp_t drops reference count using TCPS_REFRELE. - * Run by tcp_q_q_inactive using a taskq. 
- */ -static void -tcp_g_q_close(void *arg) -{ - tcp_stack_t *tcps = arg; - int error; - ldi_handle_t lh = NULL; - ldi_ident_t li = NULL; - cred_t *cr; - major_t IP_MAJ; - - IP_MAJ = ddi_name_to_major(IP); - -#ifdef NS_DEBUG - (void) printf("tcp_g_q_inactive() for stack %d refcnt %d\n", - tcps->tcps_netstack->netstack_stackid, - tcps->tcps_netstack->netstack_refcnt); -#endif - lh = tcps->tcps_g_q_lh; - if (lh == NULL) - return; /* Nothing to cleanup */ - - ASSERT(tcps->tcps_refcnt == 1); - ASSERT(tcps->tcps_g_q != NULL); - - error = ldi_ident_from_major(IP_MAJ, &li); - if (error) { -#ifdef DEBUG - printf("tcp_g_q_inactive: lyr ident get failed error %d\n", - error); -#endif - return; - } - - cr = tcps->tcps_g_q_cr; - tcps->tcps_g_q_cr = NULL; - ASSERT(cr != NULL); - - /* - * Make sure we can break the recursion when tcp_close decrements - * the reference count causing g_q_inactive to be called again. - */ - tcps->tcps_g_q_lh = NULL; - - /* close the default queue */ - (void) ldi_close(lh, FREAD|FWRITE, cr); - /* - * At this point in time tcps and the rest of netstack_t might - * have been deleted. - */ - tcps = NULL; - - /* Close layered handles */ - ldi_ident_release(li); - crfree(cr); -} - -/* - * Called when last tcp_t drops reference count using TCPS_REFRELE. - * - * Have to ensure that the ldi routines are not used by an - * interrupt thread by using a taskq. 
- */ -void -tcp_g_q_inactive(tcp_stack_t *tcps) -{ - if (tcps->tcps_g_q_lh == NULL) - return; /* Nothing to cleanup */ - - ASSERT(tcps->tcps_refcnt == 0); - TCPS_REFHOLD(tcps); /* Compensate for what g_q_destroy did */ - - if (servicing_interrupt()) { - (void) taskq_dispatch(tcp_taskq, tcp_g_q_close, - (void *) tcps, TQ_SLEEP); - } else { - tcp_g_q_close(tcps); - } -} - /* * Called by IP when IP is loaded into the kernel */ @@ -23909,10 +18421,6 @@ tcp_ddi_g_init(void) sizeof (tcp_sack_info_t), 0, tcp_sack_info_constructor, NULL, NULL, NULL, NULL, 0); - tcp_iphc_cache = kmem_cache_create("tcp_iphc_cache", - TCP_MAX_COMBINED_HEADER_LENGTH, 0, - tcp_iphc_constructor, NULL, NULL, NULL, NULL, 0); - mutex_init(&tcp_random_lock, NULL, MUTEX_DEFAULT, NULL); /* Initialize the random number generator */ @@ -23923,9 +18431,6 @@ tcp_ddi_g_init(void) tcp_g_kstat = tcp_g_kstat_init(&tcp_g_statistics); - tcp_taskq = taskq_create("tcp_taskq", 1, minclsyspri, 1, 1, - TASKQ_PREPOPULATE); - tcp_squeue_flag = tcp_squeue_switch(tcp_squeue_wput); /* @@ -23933,8 +18438,7 @@ tcp_ddi_g_init(void) * destroyed in the kernel, so we can maintain the * set of tcp_stack_t's. 
*/ - netstack_register(NS_TCP, tcp_stack_init, tcp_stack_shutdown, - tcp_stack_fini); + netstack_register(NS_TCP, tcp_stack_init, NULL, tcp_stack_fini); } @@ -23956,8 +18460,6 @@ tcp_stack_init(netstackid_t stackid, netstack_t *ns) tcps->tcps_netstack = ns; /* Initialize locks */ - mutex_init(&tcps->tcps_g_q_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&tcps->tcps_g_q_cv, NULL, CV_DEFAULT, NULL); mutex_init(&tcps->tcps_iss_key_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&tcps->tcps_epriv_port_lock, NULL, MUTEX_DEFAULT, NULL); @@ -24018,6 +18520,11 @@ tcp_stack_init(netstackid_t stackid, netstack_t *ns) major = mod_name_to_major(INET_NAME); error = ldi_ident_from_major(major, &tcps->tcps_ldi_ident); ASSERT(error == 0); + tcps->tcps_ixa_cleanup_mp = allocb_wait(0, BPRI_MED, STR_NOSIG, NULL); + ASSERT(tcps->tcps_ixa_cleanup_mp != NULL); + cv_init(&tcps->tcps_ixa_cleanup_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&tcps->tcps_ixa_cleanup_lock, NULL, MUTEX_DEFAULT, NULL); + return (tcps); } @@ -24035,22 +18542,8 @@ tcp_ddi_g_destroy(void) kmem_cache_destroy(tcp_timercache); kmem_cache_destroy(tcp_sack_info_cache); - kmem_cache_destroy(tcp_iphc_cache); netstack_unregister(NS_TCP); - taskq_destroy(tcp_taskq); -} - -/* - * Shut down the TCP stack instance. 
- */ -/* ARGSUSED */ -static void -tcp_stack_shutdown(netstackid_t stackid, void *arg) -{ - tcp_stack_t *tcps = (tcp_stack_t *)arg; - - tcp_g_q_destroy(tcps); } /* @@ -24062,17 +18555,16 @@ tcp_stack_fini(netstackid_t stackid, void *arg) tcp_stack_t *tcps = (tcp_stack_t *)arg; int i; + freeb(tcps->tcps_ixa_cleanup_mp); + tcps->tcps_ixa_cleanup_mp = NULL; + cv_destroy(&tcps->tcps_ixa_cleanup_cv); + mutex_destroy(&tcps->tcps_ixa_cleanup_lock); + nd_free(&tcps->tcps_g_nd); kmem_free(tcps->tcps_params, sizeof (lcl_tcp_param_arr)); tcps->tcps_params = NULL; kmem_free(tcps->tcps_wroff_xtra_param, sizeof (tcpparam_t)); tcps->tcps_wroff_xtra_param = NULL; - kmem_free(tcps->tcps_mdt_head_param, sizeof (tcpparam_t)); - tcps->tcps_mdt_head_param = NULL; - kmem_free(tcps->tcps_mdt_tail_param, sizeof (tcpparam_t)); - tcps->tcps_mdt_tail_param = NULL; - kmem_free(tcps->tcps_mdt_max_pbufs_param, sizeof (tcpparam_t)); - tcps->tcps_mdt_max_pbufs_param = NULL; for (i = 0; i < TCP_BIND_FANOUT_SIZE; i++) { ASSERT(tcps->tcps_bind_fanout[i].tf_tcp == NULL); @@ -24091,8 +18583,6 @@ tcp_stack_fini(netstackid_t stackid, void *arg) tcps->tcps_acceptor_fanout = NULL; mutex_destroy(&tcps->tcps_iss_key_lock); - mutex_destroy(&tcps->tcps_g_q_lock); - cv_destroy(&tcps->tcps_g_q_cv); mutex_destroy(&tcps->tcps_epriv_port_lock); ip_drop_unregister(&tcps->tcps_dropper); @@ -24120,6 +18610,7 @@ tcp_iss_init(tcp_t *tcp) struct { uint32_t ports; in6_addr_t src; in6_addr_t dst; } arg; uint32_t answer[4]; tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; tcps->tcps_iss_incr_extra += (ISS_INCR >> 1); tcp->tcp_iss = tcps->tcps_iss_incr_extra; @@ -24128,16 +18619,9 @@ tcp_iss_init(tcp_t *tcp) mutex_enter(&tcps->tcps_iss_key_lock); context = tcps->tcps_iss_key; mutex_exit(&tcps->tcps_iss_key_lock); - arg.ports = tcp->tcp_ports; - if (tcp->tcp_ipversion == IPV4_VERSION) { - IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_src, - &arg.src); - IN6_IPADDR_TO_V4MAPPED(tcp->tcp_ipha->ipha_dst, - 
&arg.dst); - } else { - arg.src = tcp->tcp_ip6h->ip6_src; - arg.dst = tcp->tcp_ip6h->ip6_dst; - } + arg.ports = connp->conn_ports; + arg.src = connp->conn_laddr_v6; + arg.dst = connp->conn_faddr_v6; MD5Update(&context, (uchar_t *)&arg, sizeof (arg)); MD5Final((uchar_t *)answer, &context); tcp->tcp_iss += answer[0] ^ answer[1] ^ answer[2] ^ answer[3]; @@ -24220,27 +18704,16 @@ cl_tcp_walk_list_stack(int (*callback)(cl_tcp_info_t *, void *), void *arg, connp = NULL; while ((connp = - ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) { + ipcl_get_next_conn(connfp, connp, IPCL_TCPCONN)) != NULL) { tcp = connp->conn_tcp; cl_tcpi.cl_tcpi_version = CL_TCPI_V1; - cl_tcpi.cl_tcpi_ipversion = tcp->tcp_ipversion; + cl_tcpi.cl_tcpi_ipversion = connp->conn_ipversion; cl_tcpi.cl_tcpi_state = tcp->tcp_state; - cl_tcpi.cl_tcpi_lport = tcp->tcp_lport; - cl_tcpi.cl_tcpi_fport = tcp->tcp_fport; - /* - * The macros tcp_laddr and tcp_faddr give the IPv4 - * addresses. They are copied implicitly below as - * mapped addresses. - */ - cl_tcpi.cl_tcpi_laddr_v6 = tcp->tcp_ip_src_v6; - if (tcp->tcp_ipversion == IPV4_VERSION) { - cl_tcpi.cl_tcpi_faddr = - tcp->tcp_ipha->ipha_dst; - } else { - cl_tcpi.cl_tcpi_faddr_v6 = - tcp->tcp_ip6h->ip6_dst; - } + cl_tcpi.cl_tcpi_lport = connp->conn_lport; + cl_tcpi.cl_tcpi_fport = connp->conn_fport; + cl_tcpi.cl_tcpi_laddr_v6 = connp->conn_laddr_v6; + cl_tcpi.cl_tcpi_faddr_v6 = connp->conn_faddr_v6; /* * If the callback returns non-zero @@ -24302,35 +18775,35 @@ cl_tcp_walk_list_stack(int (*callback)(cl_tcp_info_t *, void *), void *arg, /* * Check if a tcp structure matches the info in acp. */ -#define TCP_AC_ADDR_MATCH(acp, tcp) \ +#define TCP_AC_ADDR_MATCH(acp, connp, tcp) \ (((acp)->ac_local.ss_family == AF_INET) ? 
\ ((TCP_AC_V4LOCAL((acp)) == INADDR_ANY || \ - TCP_AC_V4LOCAL((acp)) == (tcp)->tcp_ip_src) && \ + TCP_AC_V4LOCAL((acp)) == (connp)->conn_laddr_v4) && \ (TCP_AC_V4REMOTE((acp)) == INADDR_ANY || \ - TCP_AC_V4REMOTE((acp)) == (tcp)->tcp_remote) && \ + TCP_AC_V4REMOTE((acp)) == (connp)->conn_faddr_v4) && \ (TCP_AC_V4LPORT((acp)) == 0 || \ - TCP_AC_V4LPORT((acp)) == (tcp)->tcp_lport) && \ + TCP_AC_V4LPORT((acp)) == (connp)->conn_lport) && \ (TCP_AC_V4RPORT((acp)) == 0 || \ - TCP_AC_V4RPORT((acp)) == (tcp)->tcp_fport) && \ - (acp)->ac_start <= (tcp)->tcp_state && \ - (acp)->ac_end >= (tcp)->tcp_state) : \ + TCP_AC_V4RPORT((acp)) == (connp)->conn_fport) && \ + (acp)->ac_start <= (tcp)->tcp_state && \ + (acp)->ac_end >= (tcp)->tcp_state) : \ ((IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6LOCAL((acp))) || \ IN6_ARE_ADDR_EQUAL(&TCP_AC_V6LOCAL((acp)), \ - &(tcp)->tcp_ip_src_v6)) && \ + &(connp)->conn_laddr_v6)) && \ (IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6REMOTE((acp))) || \ IN6_ARE_ADDR_EQUAL(&TCP_AC_V6REMOTE((acp)), \ - &(tcp)->tcp_remote_v6)) && \ + &(connp)->conn_faddr_v6)) && \ (TCP_AC_V6LPORT((acp)) == 0 || \ - TCP_AC_V6LPORT((acp)) == (tcp)->tcp_lport) && \ + TCP_AC_V6LPORT((acp)) == (connp)->conn_lport) && \ (TCP_AC_V6RPORT((acp)) == 0 || \ - TCP_AC_V6RPORT((acp)) == (tcp)->tcp_fport) && \ - (acp)->ac_start <= (tcp)->tcp_state && \ + TCP_AC_V6RPORT((acp)) == (connp)->conn_fport) && \ + (acp)->ac_start <= (tcp)->tcp_state && \ (acp)->ac_end >= (tcp)->tcp_state)) -#define TCP_AC_MATCH(acp, tcp) \ +#define TCP_AC_MATCH(acp, connp, tcp) \ (((acp)->ac_zoneid == ALL_ZONES || \ - (acp)->ac_zoneid == tcp->tcp_connp->conn_zoneid) ? \ - TCP_AC_ADDR_MATCH(acp, tcp) : 0) + (acp)->ac_zoneid == (connp)->conn_zoneid) ? 
\ + TCP_AC_ADDR_MATCH(acp, connp, tcp) : 0) /* * Build a message containing a tcp_ioc_abort_conn_t structure @@ -24346,8 +18819,6 @@ tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *acp, tcp_t *tp) if (mp == NULL) return (NULL); - mp->b_datap->db_type = M_CTL; - *((uint32_t *)mp->b_rptr) = TCP_IOC_ABORT_CONN; tacp = (tcp_ioc_abort_conn_t *)((uchar_t *)mp->b_rptr + sizeof (uint32_t)); @@ -24359,17 +18830,17 @@ tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *acp, tcp_t *tp) if (acp->ac_local.ss_family == AF_INET) { tacp->ac_local.ss_family = AF_INET; tacp->ac_remote.ss_family = AF_INET; - TCP_AC_V4LOCAL(tacp) = tp->tcp_ip_src; - TCP_AC_V4REMOTE(tacp) = tp->tcp_remote; - TCP_AC_V4LPORT(tacp) = tp->tcp_lport; - TCP_AC_V4RPORT(tacp) = tp->tcp_fport; + TCP_AC_V4LOCAL(tacp) = tp->tcp_connp->conn_laddr_v4; + TCP_AC_V4REMOTE(tacp) = tp->tcp_connp->conn_faddr_v4; + TCP_AC_V4LPORT(tacp) = tp->tcp_connp->conn_lport; + TCP_AC_V4RPORT(tacp) = tp->tcp_connp->conn_fport; } else { tacp->ac_local.ss_family = AF_INET6; tacp->ac_remote.ss_family = AF_INET6; - TCP_AC_V6LOCAL(tacp) = tp->tcp_ip_src_v6; - TCP_AC_V6REMOTE(tacp) = tp->tcp_remote_v6; - TCP_AC_V6LPORT(tacp) = tp->tcp_lport; - TCP_AC_V6RPORT(tacp) = tp->tcp_fport; + TCP_AC_V6LOCAL(tacp) = tp->tcp_connp->conn_laddr_v6; + TCP_AC_V6REMOTE(tacp) = tp->tcp_connp->conn_faddr_v6; + TCP_AC_V6LPORT(tacp) = tp->tcp_connp->conn_lport; + TCP_AC_V6RPORT(tacp) = tp->tcp_connp->conn_fport; } mp->b_wptr = (uchar_t *)mp->b_rptr + sizeof (uint32_t) + sizeof (*acp); return (mp); @@ -24419,14 +18890,32 @@ tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *acp) } /* - * Called inside tcp_rput when a message built using + * Called using SQ_FILL when a message built using * tcp_ioctl_abort_build_msg is put into a queue. * Note that when we get here there is no wildcard in acp any more. 
*/ +/* ARGSUSED2 */ static void -tcp_ioctl_abort_handler(tcp_t *tcp, mblk_t *mp) +tcp_ioctl_abort_handler(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy) { - tcp_ioc_abort_conn_t *acp; + conn_t *connp = (conn_t *)arg; + tcp_t *tcp = connp->conn_tcp; + tcp_ioc_abort_conn_t *acp; + + /* + * Don't accept any input on a closed tcp as this TCP logically does + * not exist on the system. Don't proceed further with this TCP. + * For eg. this packet could trigger another close of this tcp + * which would be disastrous for tcp_refcnt. tcp_close_detached / + * tcp_clean_death / tcp_closei_local must be called at most once + * on a TCP. + */ + if (tcp->tcp_state == TCPS_CLOSED || + tcp->tcp_state == TCPS_BOUND) { + freemsg(mp); + return; + } acp = (tcp_ioc_abort_conn_t *)(mp->b_rptr + sizeof (uint32_t)); if (tcp->tcp_state <= acp->ac_end) { @@ -24468,12 +18957,17 @@ startover: for (tconnp = connfp->connf_head; tconnp != NULL; tconnp = tconnp->conn_next) { tcp = tconnp->conn_tcp; - if (TCP_AC_MATCH(acp, tcp)) { - CONN_INC_REF(tcp->tcp_connp); + /* + * We are missing a check on sin6_scope_id for linklocals here, + * but current usage is just for aborting based on zoneid + * for shared-IP zones. 
+ */ + if (TCP_AC_MATCH(acp, tconnp, tcp)) { + CONN_INC_REF(tconnp); mp = tcp_ioctl_abort_build_msg(acp, tcp); if (mp == NULL) { err = ENOMEM; - CONN_DEC_REF(tcp->tcp_connp); + CONN_DEC_REF(tconnp); break; } mp->b_prev = (mblk_t *)tcp; @@ -24501,8 +18995,9 @@ startover: listhead = listhead->b_next; tcp = (tcp_t *)mp->b_prev; mp->b_next = mp->b_prev = NULL; - SQUEUE_ENTER_ONE(tcp->tcp_connp->conn_sqp, mp, tcp_input, - tcp->tcp_connp, SQ_FILL, SQTAG_TCP_ABORT_BUCKET); + SQUEUE_ENTER_ONE(tcp->tcp_connp->conn_sqp, mp, + tcp_ioctl_abort_handler, tcp->tcp_connp, NULL, + SQ_FILL, SQTAG_TCP_ABORT_BUCKET); } *count += nmatch; @@ -24669,7 +19164,7 @@ out: */ void tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq, - uint32_t seg_ack, int seg_len, tcph_t *tcph) + uint32_t seg_ack, int seg_len, tcpha_t *tcpha, ip_recv_attr_t *ira) { int32_t bytes_acked; int32_t gap; @@ -24677,17 +19172,18 @@ tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq, tcp_opt_t tcpopt; uint_t flags; uint32_t new_swnd = 0; - conn_t *connp; + conn_t *nconnp; + conn_t *connp = tcp->tcp_connp; tcp_stack_t *tcps = tcp->tcp_tcps; BUMP_LOCAL(tcp->tcp_ibsegs); DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp); - flags = (unsigned int)tcph->th_flags[0] & 0xFF; - new_swnd = BE16_TO_U16(tcph->th_win) << - ((tcph->th_flags[0] & TH_SYN) ? 0 : tcp->tcp_snd_ws); + flags = (unsigned int)tcpha->tha_flags & 0xFF; + new_swnd = ntohs(tcpha->tha_win) << + ((tcpha->tha_flags & TH_SYN) ? 
0 : tcp->tcp_snd_ws); if (tcp->tcp_snd_ts_ok) { - if (!tcp_paws_check(tcp, tcph, &tcpopt)) { + if (!tcp_paws_check(tcp, tcpha, &tcpopt)) { tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK); goto done; @@ -24770,17 +19266,10 @@ tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq, mutex_enter(&tcps->tcps_iss_key_lock); context = tcps->tcps_iss_key; mutex_exit(&tcps->tcps_iss_key_lock); - arg.ports = tcp->tcp_ports; + arg.ports = connp->conn_ports; /* We use MAPPED addresses in tcp_iss_init */ - arg.src = tcp->tcp_ip_src_v6; - if (tcp->tcp_ipversion == IPV4_VERSION) { - IN6_IPADDR_TO_V4MAPPED( - tcp->tcp_ipha->ipha_dst, - &arg.dst); - } else { - arg.dst = - tcp->tcp_ip6h->ip6_dst; - } + arg.src = connp->conn_laddr_v6; + arg.dst = connp->conn_faddr_v6; MD5Update(&context, (uchar_t *)&arg, sizeof (arg)); MD5Final((uchar_t *)answer, &context); @@ -24813,21 +19302,11 @@ tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq, */ if (tcp_clean_death(tcp, 0, 27) == -1) goto done; - /* - * We will come back to tcp_rput_data - * on the global queue. Packets destined - * for the global queue will be checked - * with global policy. But the policy for - * this packet has already been checked as - * this was destined for the detached - * connection. We need to bypass policy - * check this time by attaching a dummy - * ipsec_in with ipsec_in_dont_check set. 
- */ - connp = ipcl_classify(mp, tcp->tcp_connp->conn_zoneid, ipst); - if (connp != NULL) { + nconnp = ipcl_classify(mp, ira, ipst); + if (nconnp != NULL) { TCP_STAT(tcps, tcp_time_wait_syn_success); - tcp_reinput(connp, mp, tcp->tcp_connp->conn_sqp); + /* Drops ref on nconnp */ + tcp_reinput(nconnp, mp, ira, ipst); return; } goto done; @@ -24905,11 +19384,6 @@ process_ack: tcp->tcp_rnxt, TH_ACK); } done: - if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) { - DB_CKSUMSTART(mp) = 0; - mp->b_datap->db_struioflag &= ~STRUIO_EAGER; - TCP_STAT(tcps, tcp_time_wait_syn_fail); - } freemsg(mp); } @@ -24965,11 +19439,12 @@ tcp_timer_callback(void *arg) tcpt = (tcp_timer_t *)mp->b_rptr; connp = tcpt->connp; SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_timer_handler, connp, - SQ_FILL, SQTAG_TCP_TIMER); + NULL, SQ_FILL, SQTAG_TCP_TIMER); } +/* ARGSUSED */ static void -tcp_timer_handler(void *arg, mblk_t *mp, void *arg2) +tcp_timer_handler(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { tcp_timer_t *tcpt; conn_t *connp = (conn_t *)arg; @@ -24983,7 +19458,7 @@ tcp_timer_handler(void *arg, mblk_t *mp, void *arg2) * If the TCP has reached the closed state, don't proceed any * further. This TCP logically does not exist on the system. * tcpt_proc could for example access queues, that have already - * been qprocoff'ed off. Also see comments at the start of tcp_input + * been qprocoff'ed off. 
*/ if (tcp->tcp_state != TCPS_CLOSED) { (*tcpt->tcpt_proc)(connp); @@ -25148,26 +19623,9 @@ tcp_setqfull(tcp_t *tcp) if (tcp->tcp_closed) return; - if (IPCL_IS_NONSTR(connp)) { - (*connp->conn_upcalls->su_txq_full) - (tcp->tcp_connp->conn_upper_handle, B_TRUE); - tcp->tcp_flow_stopped = B_TRUE; - } else { - queue_t *q = tcp->tcp_wq; - - if (!(q->q_flag & QFULL)) { - mutex_enter(QLOCK(q)); - if (!(q->q_flag & QFULL)) { - /* still need to set QFULL */ - q->q_flag |= QFULL; - tcp->tcp_flow_stopped = B_TRUE; - mutex_exit(QLOCK(q)); - TCP_STAT(tcps, tcp_flwctl_on); - } else { - mutex_exit(QLOCK(q)); - } - } - } + conn_setqfull(connp, &tcp->tcp_flow_stopped); + if (tcp->tcp_flow_stopped) + TCP_STAT(tcps, tcp_flwctl_on); } void @@ -25177,27 +19635,7 @@ tcp_clrqfull(tcp_t *tcp) if (tcp->tcp_closed) return; - - if (IPCL_IS_NONSTR(connp)) { - (*connp->conn_upcalls->su_txq_full) - (tcp->tcp_connp->conn_upper_handle, B_FALSE); - tcp->tcp_flow_stopped = B_FALSE; - } else { - queue_t *q = tcp->tcp_wq; - - if (q->q_flag & QFULL) { - mutex_enter(QLOCK(q)); - if (q->q_flag & QFULL) { - q->q_flag &= ~QFULL; - tcp->tcp_flow_stopped = B_FALSE; - mutex_exit(QLOCK(q)); - if (q->q_flag & QWANTW) - qbackenable(q, 0); - } else { - mutex_exit(QLOCK(q)); - } - } - } + conn_clrqfull(connp, &tcp->tcp_flow_stopped); } /* @@ -25246,10 +19684,7 @@ tcp_kstat2_init(netstackid_t stackid, tcp_stat_t *tcps_statisticsp) tcp_stat_t template = { { "tcp_time_wait", KSTAT_DATA_UINT64 }, { "tcp_time_wait_syn", KSTAT_DATA_UINT64 }, - { "tcp_time_wait_success", KSTAT_DATA_UINT64 }, - { "tcp_time_wait_fail", KSTAT_DATA_UINT64 }, - { "tcp_reinput_syn", KSTAT_DATA_UINT64 }, - { "tcp_ip_output", KSTAT_DATA_UINT64 }, + { "tcp_time_wait_syn_success", KSTAT_DATA_UINT64 }, { "tcp_detach_non_time_wait", KSTAT_DATA_UINT64 }, { "tcp_detach_time_wait", KSTAT_DATA_UINT64 }, { "tcp_time_wait_reap", KSTAT_DATA_UINT64 }, @@ -25287,37 +19722,14 @@ tcp_kstat2_init(netstackid_t stackid, tcp_stat_t *tcps_statisticsp) { 
"tcp_timermp_freed", KSTAT_DATA_UINT64 }, { "tcp_push_timer_cnt", KSTAT_DATA_UINT64 }, { "tcp_ack_timer_cnt", KSTAT_DATA_UINT64 }, - { "tcp_ire_null1", KSTAT_DATA_UINT64 }, - { "tcp_ire_null", KSTAT_DATA_UINT64 }, - { "tcp_ip_send", KSTAT_DATA_UINT64 }, - { "tcp_ip_ire_send", KSTAT_DATA_UINT64 }, { "tcp_wsrv_called", KSTAT_DATA_UINT64 }, { "tcp_flwctl_on", KSTAT_DATA_UINT64 }, { "tcp_timer_fire_early", KSTAT_DATA_UINT64 }, { "tcp_timer_fire_miss", KSTAT_DATA_UINT64 }, { "tcp_rput_v6_error", KSTAT_DATA_UINT64 }, - { "tcp_out_sw_cksum", KSTAT_DATA_UINT64 }, - { "tcp_out_sw_cksum_bytes", KSTAT_DATA_UINT64 }, { "tcp_zcopy_on", KSTAT_DATA_UINT64 }, { "tcp_zcopy_off", KSTAT_DATA_UINT64 }, { "tcp_zcopy_backoff", KSTAT_DATA_UINT64 }, - { "tcp_zcopy_disable", KSTAT_DATA_UINT64 }, - { "tcp_mdt_pkt_out", KSTAT_DATA_UINT64 }, - { "tcp_mdt_pkt_out_v4", KSTAT_DATA_UINT64 }, - { "tcp_mdt_pkt_out_v6", KSTAT_DATA_UINT64 }, - { "tcp_mdt_discarded", KSTAT_DATA_UINT64 }, - { "tcp_mdt_conn_halted1", KSTAT_DATA_UINT64 }, - { "tcp_mdt_conn_halted2", KSTAT_DATA_UINT64 }, - { "tcp_mdt_conn_halted3", KSTAT_DATA_UINT64 }, - { "tcp_mdt_conn_resumed1", KSTAT_DATA_UINT64 }, - { "tcp_mdt_conn_resumed2", KSTAT_DATA_UINT64 }, - { "tcp_mdt_legacy_small", KSTAT_DATA_UINT64 }, - { "tcp_mdt_legacy_all", KSTAT_DATA_UINT64 }, - { "tcp_mdt_legacy_ret", KSTAT_DATA_UINT64 }, - { "tcp_mdt_allocfail", KSTAT_DATA_UINT64 }, - { "tcp_mdt_addpdescfail", KSTAT_DATA_UINT64 }, - { "tcp_mdt_allocd", KSTAT_DATA_UINT64 }, - { "tcp_mdt_linked", KSTAT_DATA_UINT64 }, { "tcp_fusion_flowctl", KSTAT_DATA_UINT64 }, { "tcp_fusion_backenabled", KSTAT_DATA_UINT64 }, { "tcp_fusion_urg", KSTAT_DATA_UINT64 }, @@ -25490,7 +19902,7 @@ tcp_kstat_update(kstat_t *kp, int rw) connfp = &ipst->ips_ipcl_globalhash_fanout[i]; connp = NULL; while ((connp = - ipcl_get_next_conn(connfp, connp, IPCL_TCP)) != NULL) { + ipcl_get_next_conn(connfp, connp, IPCL_TCPCONN)) != NULL) { tcp = connp->conn_tcp; switch (tcp_snmp_state(tcp)) { case 
MIB2_TCP_established: @@ -25565,48 +19977,6 @@ tcp_kstat_update(kstat_t *kp, int rw) return (0); } -void -tcp_reinput(conn_t *connp, mblk_t *mp, squeue_t *sqp) -{ - uint16_t hdr_len; - ipha_t *ipha; - uint8_t *nexthdrp; - tcph_t *tcph; - tcp_stack_t *tcps = connp->conn_tcp->tcp_tcps; - - /* Already has an eager */ - if ((mp->b_datap->db_struioflag & STRUIO_EAGER) != 0) { - TCP_STAT(tcps, tcp_reinput_syn); - SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv, connp, - SQ_PROCESS, SQTAG_TCP_REINPUT_EAGER); - return; - } - - switch (IPH_HDR_VERSION(mp->b_rptr)) { - case IPV4_VERSION: - ipha = (ipha_t *)mp->b_rptr; - hdr_len = IPH_HDR_LENGTH(ipha); - break; - case IPV6_VERSION: - if (!ip_hdr_length_nexthdr_v6(mp, (ip6_t *)mp->b_rptr, - &hdr_len, &nexthdrp)) { - CONN_DEC_REF(connp); - freemsg(mp); - return; - } - break; - } - - tcph = (tcph_t *)&mp->b_rptr[hdr_len]; - if ((tcph->th_flags[0] & (TH_SYN|TH_ACK|TH_RST|TH_URG)) == TH_SYN) { - mp->b_datap->db_struioflag |= STRUIO_EAGER; - DB_CKSUMSTART(mp) = (intptr_t)sqp; - } - - SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv, connp, - SQ_FILL, SQTAG_TCP_REINPUT); -} - static int tcp_squeue_switch(int val) { @@ -25653,278 +20023,20 @@ tcp_squeue_add(squeue_t *sqp) tcp_time_wait->tcp_free_list_cnt = 0; } -static int -tcp_post_ip_bind(tcp_t *tcp, mblk_t *mp, int error, cred_t *cr, pid_t pid) +/* + * On a labeled system we have some protocols above TCP, such as RPC, which + * appear to assume that every mblk in a chain has a db_credp. + */ +static void +tcp_setcred_data(mblk_t *mp, ip_recv_attr_t *ira) { - mblk_t *ire_mp = NULL; - mblk_t *syn_mp; - mblk_t *mdti; - mblk_t *lsoi; - int retval; - tcph_t *tcph; - cred_t *ecr; - ts_label_t *tsl; - uint32_t mss; - conn_t *connp = tcp->tcp_connp; - tcp_stack_t *tcps = tcp->tcp_tcps; - - if (error == 0) { - /* - * Adapt Multidata information, if any. The - * following tcp_mdt_update routine will free - * the message. 
- */ - if (mp != NULL && ((mdti = tcp_mdt_info_mp(mp)) != NULL)) { - tcp_mdt_update(tcp, &((ip_mdt_info_t *)mdti-> - b_rptr)->mdt_capab, B_TRUE); - freemsg(mdti); - } - - /* - * Check to update LSO information with tcp, and - * tcp_lso_update routine will free the message. - */ - if (mp != NULL && ((lsoi = tcp_lso_info_mp(mp)) != NULL)) { - tcp_lso_update(tcp, &((ip_lso_info_t *)lsoi-> - b_rptr)->lso_capab); - freemsg(lsoi); - } - - /* Get the IRE, if we had requested for it */ - if (mp != NULL) - ire_mp = tcp_ire_mp(&mp); - - if (tcp->tcp_hard_binding) { - tcp->tcp_hard_binding = B_FALSE; - tcp->tcp_hard_bound = B_TRUE; - CL_INET_CONNECT(tcp->tcp_connp, tcp, B_TRUE, retval); - if (retval != 0) { - error = EADDRINUSE; - goto bind_failed; - } - } else { - if (ire_mp != NULL) - freeb(ire_mp); - goto after_syn_sent; - } - - retval = tcp_adapt_ire(tcp, ire_mp); - if (ire_mp != NULL) - freeb(ire_mp); - if (retval == 0) { - error = (int)((tcp->tcp_state >= TCPS_SYN_SENT) ? - ENETUNREACH : EADDRNOTAVAIL); - goto ipcl_rm; - } - /* - * Don't let an endpoint connect to itself. - * Also checked in tcp_connect() but that - * check can't handle the case when the - * local IP address is INADDR_ANY. - */ - if (tcp->tcp_ipversion == IPV4_VERSION) { - if ((tcp->tcp_ipha->ipha_dst == - tcp->tcp_ipha->ipha_src) && - (BE16_EQL(tcp->tcp_tcph->th_lport, - tcp->tcp_tcph->th_fport))) { - error = EADDRNOTAVAIL; - goto ipcl_rm; - } - } else { - if (IN6_ARE_ADDR_EQUAL( - &tcp->tcp_ip6h->ip6_dst, - &tcp->tcp_ip6h->ip6_src) && - (BE16_EQL(tcp->tcp_tcph->th_lport, - tcp->tcp_tcph->th_fport))) { - error = EADDRNOTAVAIL; - goto ipcl_rm; - } - } - ASSERT(tcp->tcp_state == TCPS_SYN_SENT); - /* - * This should not be possible! Just for - * defensive coding... 
- */ - if (tcp->tcp_state != TCPS_SYN_SENT) - goto after_syn_sent; - - if (is_system_labeled() && - !tcp_update_label(tcp, CONN_CRED(tcp->tcp_connp))) { - error = EHOSTUNREACH; - goto ipcl_rm; - } - - /* - * tcp_adapt_ire() does not adjust - * for TCP/IP header length. - */ - mss = tcp->tcp_mss - tcp->tcp_hdr_len; - - /* - * Just make sure our rwnd is at - * least tcp_recv_hiwat_mss * MSS - * large, and round up to the nearest - * MSS. - * - * We do the round up here because - * we need to get the interface - * MTU first before we can do the - * round up. - */ - tcp->tcp_rwnd = MAX(MSS_ROUNDUP(tcp->tcp_rwnd, mss), - tcps->tcps_recv_hiwat_minmss * mss); - tcp->tcp_recv_hiwater = tcp->tcp_rwnd; - tcp_set_ws_value(tcp); - U32_TO_ABE16((tcp->tcp_rwnd >> tcp->tcp_rcv_ws), - tcp->tcp_tcph->th_win); - if (tcp->tcp_rcv_ws > 0 || tcps->tcps_wscale_always) - tcp->tcp_snd_ws_ok = B_TRUE; - - /* - * Set tcp_snd_ts_ok to true - * so that tcp_xmit_mp will - * include the timestamp - * option in the SYN segment. - */ - if (tcps->tcps_tstamp_always || - (tcp->tcp_rcv_ws && tcps->tcps_tstamp_if_wscale)) { - tcp->tcp_snd_ts_ok = B_TRUE; - } - - /* - * tcp_snd_sack_ok can be set in - * tcp_adapt_ire() if the sack metric - * is set. So check it here also. - */ - if (tcps->tcps_sack_permitted == 2 || - tcp->tcp_snd_sack_ok) { - if (tcp->tcp_sack_info == NULL) { - tcp->tcp_sack_info = - kmem_cache_alloc(tcp_sack_info_cache, - KM_SLEEP); - } - tcp->tcp_snd_sack_ok = B_TRUE; - } + ASSERT(is_system_labeled()); + ASSERT(ira->ira_cred != NULL); - /* - * Should we use ECN? Note that the current - * default value (SunOS 5.9) of tcp_ecn_permitted - * is 1. The reason for doing this is that there - * are equipments out there that will drop ECN - * enabled IP packets. Setting it to 1 avoids - * compatibility problems. 
- */ - if (tcps->tcps_ecn_permitted == 2) - tcp->tcp_ecn_ok = B_TRUE; - - TCP_TIMER_RESTART(tcp, tcp->tcp_rto); - syn_mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, - tcp->tcp_iss, B_FALSE, NULL, B_FALSE); - if (syn_mp) { - /* - * cr contains the cred from the thread calling - * connect(). - * - * If no thread cred is available, use the - * socket creator's cred instead. If still no - * cred, drop the request rather than risk a - * panic on production systems. - */ - if (cr == NULL) { - cr = CONN_CRED(connp); - pid = tcp->tcp_cpid; - ASSERT(cr != NULL); - if (cr != NULL) { - mblk_setcred(syn_mp, cr, pid); - } else { - error = ECONNABORTED; - goto ipcl_rm; - } - - /* - * If an effective security label exists for - * the connection, create a copy of the thread's - * cred but with the effective label attached. - */ - } else if (is_system_labeled() && - connp->conn_effective_cred != NULL && - (tsl = crgetlabel(connp-> - conn_effective_cred)) != NULL) { - if ((ecr = copycred_from_tslabel(cr, - tsl, KM_NOSLEEP)) == NULL) { - error = ENOMEM; - goto ipcl_rm; - } - mblk_setcred(syn_mp, ecr, pid); - crfree(ecr); - - /* - * Default to using the thread's cred unchanged. - */ - } else { - mblk_setcred(syn_mp, cr, pid); - } - - /* - * We must bump the generation before sending the syn - * to ensure that we use the right generation in case - * this thread issues a "connected" up call. - */ - SOCK_CONNID_BUMP(tcp->tcp_connid); - - tcp_send_data(tcp, tcp->tcp_wq, syn_mp); - } - after_syn_sent: - if (mp != NULL) { - ASSERT(mp->b_cont == NULL); - freeb(mp); - } - return (error); - } else { - /* error */ - if (tcp->tcp_debug) { - (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, - "tcp_post_ip_bind: error == %d", error); - } - if (mp != NULL) { - freeb(mp); - } + while (mp != NULL) { + mblk_setcred(mp, ira->ira_cred, NOPID); + mp = mp->b_cont; } - -ipcl_rm: - /* - * Need to unbind with classifier since we were just - * told that our bind succeeded. a.k.a error == 0 at the entry. 
- */ - tcp->tcp_hard_bound = B_FALSE; - tcp->tcp_hard_binding = B_FALSE; - - ipcl_hash_remove(connp); - -bind_failed: - tcp->tcp_state = TCPS_IDLE; - if (tcp->tcp_ipversion == IPV4_VERSION) - tcp->tcp_ipha->ipha_src = 0; - else - V6_SET_ZERO(tcp->tcp_ip6h->ip6_src); - /* - * Copy of the src addr. in tcp_t is needed since - * the lookup funcs. can only look at tcp_t - */ - V6_SET_ZERO(tcp->tcp_ip_src_v6); - - tcph = tcp->tcp_tcph; - tcph->th_lport[0] = 0; - tcph->th_lport[1] = 0; - tcp_bind_hash_remove(tcp); - bzero(&connp->u_port, sizeof (connp->u_port)); - /* blow away saved option results if any */ - if (tcp->tcp_conn.tcp_opts_conn_req != NULL) - tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); - - conn_delete_ire(tcp->tcp_connp, NULL); - - return (error); } static int @@ -25936,16 +20048,16 @@ tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr, boolean_t user_specified; in_port_t allocated_port; in_port_t requested_port = *requested_port_ptr; - conn_t *connp; + conn_t *connp = tcp->tcp_connp; zone_t *zone; tcp_stack_t *tcps = tcp->tcp_tcps; - in6_addr_t v6addr = tcp->tcp_ip_src_v6; + in6_addr_t v6addr = connp->conn_laddr_v6; /* * XXX It's up to the caller to specify bind_to_req_port_only or not. */ - if (cr == NULL) - cr = tcp->tcp_cred; + ASSERT(cr != NULL); + /* * Get a valid port (within the anonymous range and should not * be a privileged one) to use if the user has not given a port. @@ -25961,7 +20073,7 @@ tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr, mlptype = mlptSingle; mlp_port = requested_port; if (requested_port == 0) { - requested_port = tcp->tcp_anon_priv_bind ? + requested_port = connp->conn_anon_priv_bind ? tcp_get_next_priv_port(tcp) : tcp_update_next_port(tcps->tcps_next_port_to_try, tcp, B_TRUE); @@ -25975,7 +20087,6 @@ tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr, * this socket and RPC is MLP in this zone, then give him an * anonymous MLP. 
*/ - connp = tcp->tcp_connp; if (connp->conn_anon_mlp && is_system_labeled()) { zone = crgetzone(cr); addrtype = tsol_mlp_addr_type( @@ -26016,7 +20127,7 @@ tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr, if (priv) { if (secpolicy_net_privaddr(cr, requested_port, IPPROTO_TCP) != 0) { - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_bind: no priv for port %d", @@ -26044,7 +20155,7 @@ tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr, if (mlptype != mlptSingle) { if (secpolicy_net_bindmlp(cr) != 0) { - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_bind: no priv for multilevel port %d", @@ -26068,7 +20179,7 @@ tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr, mlpzone = tsol_mlp_findzone(IPPROTO_TCP, htons(mlp_port)); if (connp->conn_zoneid != mlpzone) { - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_bind: attempt to bind port " @@ -26083,10 +20194,10 @@ tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr, if (!user_specified) { int err; - err = tsol_mlp_anon(zone, mlptype, connp->conn_ulp, + err = tsol_mlp_anon(zone, mlptype, connp->conn_proto, requested_port, B_TRUE); if (err != 0) { - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_bind: cannot establish anon " @@ -26101,17 +20212,18 @@ tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr, } allocated_port = tcp_bindi(tcp, requested_port, &v6addr, - tcp->tcp_reuseaddr, B_FALSE, bind_to_req_port_only, user_specified); + connp->conn_reuseaddr, B_FALSE, bind_to_req_port_only, + user_specified); if (allocated_port == 0) { connp->conn_mlp_type = mlptSingle; if (connp->conn_anon_port) { connp->conn_anon_port = B_FALSE; - (void) tsol_mlp_anon(zone, mlptype, connp->conn_ulp, + (void) tsol_mlp_anon(zone, mlptype, 
connp->conn_proto, requested_port, B_FALSE); } if (bind_to_req_port_only) { - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_bind: requested addr busy"); @@ -26119,7 +20231,7 @@ tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr, return (-TADDRBUSY); } else { /* If we are out of ports, fail the bind. */ - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_bind: out of ports?"); @@ -26133,6 +20245,9 @@ tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr, return (0); } +/* + * Check the address and check/pick a local port number. + */ static int tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, boolean_t bind_to_req_port_only) @@ -26140,18 +20255,22 @@ tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, tcp_t *tcp = connp->conn_tcp; sin_t *sin; sin6_t *sin6; - in_port_t requested_port; + in_port_t requested_port; ipaddr_t v4addr; in6_addr_t v6addr; - uint_t ipversion; - int error = 0; + ip_laddr_t laddr_type = IPVL_UNICAST_UP; /* INADDR_ANY */ + zoneid_t zoneid = IPCL_ZONEID(connp); + ip_stack_t *ipst = connp->conn_netstack->netstack_ip; + uint_t scopeid = 0; + int error = 0; + ip_xmit_attr_t *ixa = connp->conn_ixa; ASSERT((uintptr_t)len <= (uintptr_t)INT_MAX); if (tcp->tcp_state == TCPS_BOUND) { return (0); } else if (tcp->tcp_state > TCPS_BOUND) { - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_bind: bad state, %d", tcp->tcp_state); } @@ -26161,7 +20280,7 @@ tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, ASSERT(sa != NULL && len != 0); if (!OK_32PTR((char *)sa)) { - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_bind: bad address parameter, " @@ -26171,38 +20290,48 @@ tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, 
cred_t *cr, return (-TPROTO); } + error = proto_verify_ip_addr(connp->conn_family, sa, len); + if (error != 0) { + return (error); + } + switch (len) { case sizeof (sin_t): /* Complete IPv4 address */ sin = (sin_t *)sa; - /* - * With sockets sockfs will accept bogus sin_family in - * bind() and replace it with the family used in the socket - * call. - */ - if (sin->sin_family != AF_INET || - tcp->tcp_family != AF_INET) { - return (EAFNOSUPPORT); - } requested_port = ntohs(sin->sin_port); - ipversion = IPV4_VERSION; v4addr = sin->sin_addr.s_addr; IN6_IPADDR_TO_V4MAPPED(v4addr, &v6addr); + if (v4addr != INADDR_ANY) { + laddr_type = ip_laddr_verify_v4(v4addr, zoneid, ipst, + B_FALSE); + } break; case sizeof (sin6_t): /* Complete IPv6 address */ sin6 = (sin6_t *)sa; - if (sin6->sin6_family != AF_INET6 || - tcp->tcp_family != AF_INET6) { - return (EAFNOSUPPORT); - } - requested_port = ntohs(sin6->sin6_port); - ipversion = IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr) ? - IPV4_VERSION : IPV6_VERSION; v6addr = sin6->sin6_addr; + requested_port = ntohs(sin6->sin6_port); + if (IN6_IS_ADDR_V4MAPPED(&v6addr)) { + if (connp->conn_ipv6_v6only) + return (EADDRNOTAVAIL); + + IN6_V4MAPPED_TO_IPADDR(&v6addr, v4addr); + if (v4addr != INADDR_ANY) { + laddr_type = ip_laddr_verify_v4(v4addr, + zoneid, ipst, B_FALSE); + } + } else { + if (!IN6_IS_ADDR_UNSPECIFIED(&v6addr)) { + if (IN6_IS_ADDR_LINKSCOPE(&v6addr)) + scopeid = sin6->sin6_scope_id; + laddr_type = ip_laddr_verify_v6(&v6addr, + zoneid, ipst, B_FALSE, scopeid); + } + } break; default: - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_bind: bad address length, %d", len); } @@ -26210,34 +20339,32 @@ tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, /* return (-TBADADDR); */ } - tcp->tcp_bound_source_v6 = v6addr; + /* Is the local address a valid unicast address? 
*/ + if (laddr_type == IPVL_BAD) + return (EADDRNOTAVAIL); - /* Check for change in ipversion */ - if (tcp->tcp_ipversion != ipversion) { - ASSERT(tcp->tcp_family == AF_INET6); - error = (ipversion == IPV6_VERSION) ? - tcp_header_init_ipv6(tcp) : tcp_header_init_ipv4(tcp); - if (error) { - return (ENOMEM); - } - } - - /* - * Initialize family specific fields. Copy of the src addr. - * in tcp_t is needed for the lookup funcs. - */ - if (tcp->tcp_ipversion == IPV6_VERSION) { - tcp->tcp_ip6h->ip6_src = v6addr; + connp->conn_bound_addr_v6 = v6addr; + if (scopeid != 0) { + ixa->ixa_flags |= IXAF_SCOPEID_SET; + ixa->ixa_scopeid = scopeid; + connp->conn_incoming_ifindex = scopeid; } else { - IN6_V4MAPPED_TO_IPADDR(&v6addr, tcp->tcp_ipha->ipha_src); + ixa->ixa_flags &= ~IXAF_SCOPEID_SET; + connp->conn_incoming_ifindex = connp->conn_bound_if; } - tcp->tcp_ip_src_v6 = v6addr; + + connp->conn_laddr_v6 = v6addr; + connp->conn_saddr_v6 = v6addr; bind_to_req_port_only = requested_port != 0 && bind_to_req_port_only; error = tcp_bind_select_lport(tcp, &requested_port, bind_to_req_port_only, cr); - + if (error != 0) { + connp->conn_laddr_v6 = ipv6_all_zeros; + connp->conn_saddr_v6 = ipv6_all_zeros; + connp->conn_bound_addr_v6 = ipv6_all_zeros; + } return (error); } @@ -26253,7 +20380,7 @@ tcp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, tcp_t *tcp = connp->conn_tcp; if (tcp->tcp_state >= TCPS_BOUND) { - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_bind: bad state, %d", tcp->tcp_state); } @@ -26265,19 +20392,8 @@ tcp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, return (error); ASSERT(tcp->tcp_state == TCPS_BOUND); - tcp->tcp_conn_req_max = 0; - - if (tcp->tcp_family == AF_INET6) { - ASSERT(tcp->tcp_connp->conn_af_isv6); - error = ip_proto_bind_laddr_v6(connp, NULL, IPPROTO_TCP, - &tcp->tcp_bound_source_v6, 0, B_FALSE); - } else { - ASSERT(!tcp->tcp_connp->conn_af_isv6); - 
error = ip_proto_bind_laddr_v4(connp, NULL, IPPROTO_TCP, - tcp->tcp_ipha->ipha_src, 0, B_FALSE); - } - return (tcp_post_ip_bind(tcp, NULL, error, NULL, 0)); + return (0); } int @@ -26337,7 +20453,14 @@ tcp_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len, ipaddr_t *dstaddrp; in_port_t dstport; uint_t srcid; - int error = 0; + int error; + uint32_t mss; + mblk_t *syn_mp; + tcp_stack_t *tcps = tcp->tcp_tcps; + int32_t oldstate; + ip_xmit_attr_t *ixa = connp->conn_ixa; + + oldstate = tcp->tcp_state; switch (len) { default: @@ -26351,7 +20474,7 @@ tcp_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len, if (sin->sin_port == 0) { return (-TBADADDR); } - if (tcp->tcp_connp && tcp->tcp_connp->conn_ipv6_v6only) { + if (connp->conn_ipv6_v6only) { return (EAFNOSUPPORT); } break; @@ -26365,23 +20488,18 @@ tcp_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len, } /* * If we're connecting to an IPv4-mapped IPv6 address, we need to - * make sure that the template IP header in the tcp structure is an - * IPv4 header, and that the tcp_ipversion is IPV4_VERSION. We + * make sure that the conn_ipversion is IPV4_VERSION. We * need to this before we call tcp_bindi() so that the port lookup * code will look for ports in the correct port space (IPv4 and * IPv6 have separate port spaces). 
*/ - if (tcp->tcp_family == AF_INET6 && tcp->tcp_ipversion == IPV6_VERSION && + if (connp->conn_family == AF_INET6 && + connp->conn_ipversion == IPV6_VERSION && IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { - int err = 0; + if (connp->conn_ipv6_v6only) + return (EADDRNOTAVAIL); - err = tcp_header_init_ipv4(tcp); - if (err != 0) { - error = ENOMEM; - goto connect_failed; - } - if (tcp->tcp_lport != 0) - *(uint16_t *)tcp->tcp_tcph->th_lport = tcp->tcp_lport; + connp->conn_ipversion = IPV4_VERSION; } switch (tcp->tcp_state) { @@ -26399,43 +20517,147 @@ tcp_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len, */ /* FALLTHRU */ case TCPS_BOUND: - if (tcp->tcp_family == AF_INET6) { - if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { - return (tcp_connect_ipv6(tcp, - &sin6->sin6_addr, - sin6->sin6_port, sin6->sin6_flowinfo, - sin6->__sin6_src_id, sin6->sin6_scope_id, - cr, pid)); - } + break; + default: + return (-TOUTSTATE); + } + + /* + * We update our cred/cpid based on the caller of connect + */ + if (connp->conn_cred != cr) { + crhold(cr); + crfree(connp->conn_cred); + connp->conn_cred = cr; + } + connp->conn_cpid = pid; + + /* Cache things in the ixa without any refhold */ + ixa->ixa_cred = cr; + ixa->ixa_cpid = pid; + if (is_system_labeled()) { + /* We need to restart with a label based on the cred */ + ip_xmit_attr_restore_tsl(ixa, ixa->ixa_cred); + } + + if (connp->conn_family == AF_INET6) { + if (!IN6_IS_ADDR_V4MAPPED(&sin6->sin6_addr)) { + error = tcp_connect_ipv6(tcp, &sin6->sin6_addr, + sin6->sin6_port, sin6->sin6_flowinfo, + sin6->__sin6_src_id, sin6->sin6_scope_id); + } else { /* * Destination adress is mapped IPv6 address. * Source bound address should be unspecified or * IPv6 mapped address as well. 
*/ if (!IN6_IS_ADDR_UNSPECIFIED( - &tcp->tcp_bound_source_v6) && - !IN6_IS_ADDR_V4MAPPED(&tcp->tcp_bound_source_v6)) { + &connp->conn_bound_addr_v6) && + !IN6_IS_ADDR_V4MAPPED(&connp->conn_bound_addr_v6)) { return (EADDRNOTAVAIL); } dstaddrp = &V4_PART_OF_V6((sin6->sin6_addr)); dstport = sin6->sin6_port; srcid = sin6->__sin6_src_id; - } else { - dstaddrp = &sin->sin_addr.s_addr; - dstport = sin->sin_port; - srcid = 0; + error = tcp_connect_ipv4(tcp, dstaddrp, dstport, + srcid); } + } else { + dstaddrp = &sin->sin_addr.s_addr; + dstport = sin->sin_port; + srcid = 0; + error = tcp_connect_ipv4(tcp, dstaddrp, dstport, srcid); + } - error = tcp_connect_ipv4(tcp, dstaddrp, dstport, srcid, cr, - pid); - break; - default: - return (-TOUTSTATE); + if (error != 0) + goto connect_failed; + + CL_INET_CONNECT(connp, B_TRUE, error); + if (error != 0) + goto connect_failed; + + /* connect succeeded */ + BUMP_MIB(&tcps->tcps_mib, tcpActiveOpens); + tcp->tcp_active_open = 1; + + /* + * tcp_set_destination() does not adjust for TCP/IP header length. + */ + mss = tcp->tcp_mss - connp->conn_ht_iphc_len; + + /* + * Just make sure our rwnd is at least rcvbuf * MSS large, and round up + * to the nearest MSS. + * + * We do the round up here because we need to get the interface MTU + * first before we can do the round up. + */ + tcp->tcp_rwnd = connp->conn_rcvbuf; + tcp->tcp_rwnd = MAX(MSS_ROUNDUP(tcp->tcp_rwnd, mss), + tcps->tcps_recv_hiwat_minmss * mss); + connp->conn_rcvbuf = tcp->tcp_rwnd; + tcp_set_ws_value(tcp); + tcp->tcp_tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); + if (tcp->tcp_rcv_ws > 0 || tcps->tcps_wscale_always) + tcp->tcp_snd_ws_ok = B_TRUE; + + /* + * Set tcp_snd_ts_ok to true + * so that tcp_xmit_mp will + * include the timestamp + * option in the SYN segment. 
+ */ + if (tcps->tcps_tstamp_always || + (tcp->tcp_rcv_ws && tcps->tcps_tstamp_if_wscale)) { + tcp->tcp_snd_ts_ok = B_TRUE; } + /* - * Note: Code below is the "failure" case + * tcp_snd_sack_ok can be set in + * tcp_set_destination() if the sack metric + * is set. So check it here also. + */ + if (tcps->tcps_sack_permitted == 2 || + tcp->tcp_snd_sack_ok) { + if (tcp->tcp_sack_info == NULL) { + tcp->tcp_sack_info = kmem_cache_alloc( + tcp_sack_info_cache, KM_SLEEP); + } + tcp->tcp_snd_sack_ok = B_TRUE; + } + + /* + * Should we use ECN? Note that the current + * default value (SunOS 5.9) of tcp_ecn_permitted + * is 1. The reason for doing this is that there + * are equipments out there that will drop ECN + * enabled IP packets. Setting it to 1 avoids + * compatibility problems. */ + if (tcps->tcps_ecn_permitted == 2) + tcp->tcp_ecn_ok = B_TRUE; + + TCP_TIMER_RESTART(tcp, tcp->tcp_rto); + syn_mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, + tcp->tcp_iss, B_FALSE, NULL, B_FALSE); + if (syn_mp != NULL) { + /* + * We must bump the generation before sending the syn + * to ensure that we use the right generation in case + * this thread issues a "connected" up call. 
+ */ + SOCK_CONNID_BUMP(tcp->tcp_connid); + tcp_send_data(tcp, syn_mp); + } + + if (tcp->tcp_conn.tcp_opts_conn_req != NULL) + tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); + return (0); + connect_failed: + connp->conn_faddr_v6 = ipv6_all_zeros; + connp->conn_fport = 0; + tcp->tcp_state = oldstate; if (tcp->tcp_conn.tcp_opts_conn_req != NULL) tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); return (error); @@ -26446,7 +20668,6 @@ tcp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, socklen_t len, sock_connid_t *id, cred_t *cr) { conn_t *connp = (conn_t *)proto_handle; - tcp_t *tcp = connp->conn_tcp; squeue_t *sqp = connp->conn_sqp; int error; @@ -26455,7 +20676,7 @@ tcp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, /* All Solaris components should pass a cred for this operation. */ ASSERT(cr != NULL); - error = proto_verify_ip_addr(tcp->tcp_family, sa, len); + error = proto_verify_ip_addr(connp->conn_family, sa, len); if (error != 0) { return (error); } @@ -26493,7 +20714,7 @@ tcp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, } } - if (tcp->tcp_loopback) { + if (connp->conn_tcp->tcp_loopback) { struct sock_proto_props sopp; sopp.sopp_flags = SOCKOPT_LOOPBACK; @@ -26521,7 +20742,7 @@ tcp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, return (NULL); } - connp = tcp_create_common(NULL, credp, isv6, B_TRUE, errorp); + connp = tcp_create_common(credp, isv6, B_TRUE, errorp); if (connp == NULL) { return (NULL); } @@ -26578,8 +20799,8 @@ tcp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle, connp->conn_upcalls = sock_upcalls; connp->conn_upper_handle = sock_handle; - ASSERT(connp->conn_tcp->tcp_recv_hiwater != 0 && - connp->conn_tcp->tcp_recv_hiwater == connp->conn_tcp->tcp_rwnd); + ASSERT(connp->conn_rcvbuf != 0 && + connp->conn_rcvbuf == connp->conn_tcp->tcp_rwnd); (*sock_upcalls->su_set_proto_props)(sock_handle, &sopp); } @@ -26663,7 +20884,7 @@ 
tcp_sendmsg(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, /* * Squeue Flow Control */ - if (TCP_UNSENT_BYTES(tcp) > tcp->tcp_xmit_hiwater) { + if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) { tcp_setqfull(tcp); } mutex_exit(&tcp->tcp_non_sq_lock); @@ -26680,12 +20901,11 @@ tcp_sendmsg(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, CONN_INC_REF(connp); if (msg->msg_flags & MSG_OOB) { - SQUEUE_ENTER_ONE(connp->conn_sqp, mp, - tcp_output_urgent, connp, tcp_squeue_flag, - SQTAG_TCP_OUTPUT); + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output_urgent, + connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT); } else { SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output, - connp, tcp_squeue_flag, SQTAG_TCP_OUTPUT); + connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT); } return (0); @@ -26698,9 +20918,9 @@ tcp_sendmsg(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, return (0); } -/* ARGSUSED */ +/* ARGSUSED2 */ void -tcp_output_urgent(void *arg, mblk_t *mp, void *arg2) +tcp_output_urgent(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { int len; uint32_t msize; @@ -26739,7 +20959,7 @@ tcp_output_urgent(void *arg, mblk_t *mp, void *arg2) tcp_wput_data(tcp, mp, B_TRUE); } -/* ARGSUSED */ +/* ARGSUSED3 */ int tcp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr, socklen_t *addrlenp, cred_t *cr) @@ -26752,24 +20972,24 @@ tcp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr, ASSERT(cr != NULL); ASSERT(tcp != NULL); + if (tcp->tcp_state < TCPS_SYN_RCVD) + return (ENOTCONN); - return (tcp_do_getpeername(tcp, addr, addrlenp)); + return (conn_getpeername(connp, addr, addrlenp)); } -/* ARGSUSED */ +/* ARGSUSED3 */ int tcp_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *addr, socklen_t *addrlenp, cred_t *cr) { conn_t *connp = (conn_t *)proto_handle; - tcp_t *tcp = connp->conn_tcp; /* All Solaris components should pass a cred for this operation. 
*/ ASSERT(cr != NULL); ASSERT(connp->conn_upper_handle != NULL); - - return (tcp_do_getsockname(tcp, addr, addrlenp)); + return (conn_getsockname(connp, addr, addrlenp)); } /* @@ -26809,8 +21029,8 @@ tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q, RD(q)->q_ptr = WR(q)->q_ptr = connp; - connp->conn_tcp->tcp_rq = connp->conn_rq = RD(q); - connp->conn_tcp->tcp_wq = connp->conn_wq = WR(q); + connp->conn_rq = RD(q); + connp->conn_wq = WR(q); WR(q)->q_qinfo = &tcp_sock_winit; @@ -26830,11 +21050,11 @@ tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q, stropt_mp->b_wptr += sizeof (struct stroptions); stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK; - stropt->so_wroff = tcp->tcp_hdr_len + (tcp->tcp_loopback ? 0 : + stropt->so_wroff = connp->conn_ht_iphc_len + (tcp->tcp_loopback ? 0 : tcp->tcp_tcps->tcps_wroff_xtra); if (tcp->tcp_snd_sack_ok) stropt->so_wroff += TCPOPT_MAX_SACK_LEN; - stropt->so_hiwat = tcp->tcp_recv_hiwater; + stropt->so_hiwat = connp->conn_rcvbuf; stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE); putnext(RD(q), stropt_mp); @@ -26845,15 +21065,17 @@ tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q, tcp_do_capability_ack(tcp, &tca, TC1_INFO|TC1_ACCEPTOR_ID); laddrlen = faddrlen = sizeof (sin6_t); - (void) tcp_do_getsockname(tcp, (struct sockaddr *)&laddr, &laddrlen); - error = tcp_do_getpeername(tcp, (struct sockaddr *)&faddr, &faddrlen); + (void) tcp_getsockname((sock_lower_handle_t)connp, + (struct sockaddr *)&laddr, &laddrlen, CRED()); + error = tcp_getpeername((sock_lower_handle_t)connp, + (struct sockaddr *)&faddr, &faddrlen, CRED()); if (error != 0) faddrlen = 0; opts = 0; - if (tcp->tcp_oobinline) + if (connp->conn_oobinline) opts |= SO_OOBINLINE; - if (tcp->tcp_dontroute) + if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE) opts |= SO_DONTROUTE; /* @@ -26868,6 +21090,7 @@ tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q, while ((mp = tcp->tcp_rcv_list) != NULL) { tcp->tcp_rcv_list = 
mp->b_next; mp->b_next = NULL; + /* We never do fallback for kernel RPC */ putnext(q, mp); } tcp->tcp_rcv_last_head = NULL; @@ -26908,7 +21131,7 @@ tcp_fallback_eager(tcp_t *eager, boolean_t direct_sockfs) * Sockfs guarantees that the listener will not be closed * during fallback. So we can safely use the listener's queue. */ - putnext(listener->tcp_rq, mp); + putnext(listener->tcp_connp->conn_rq, mp); } int @@ -26987,7 +21210,7 @@ tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q, /* ARGSUSED */ static void -tcp_shutdown_output(void *arg, mblk_t *mp, void *arg2) +tcp_shutdown_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) { conn_t *connp = (conn_t *)arg; tcp_t *tcp = connp->conn_tcp; @@ -27002,7 +21225,7 @@ tcp_shutdown_output(void *arg, mblk_t *mp, void *arg2) * We were crossing FINs and got a reset from * the other side. Just ignore it. */ - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_shutdown_output() out of state %s", @@ -27036,7 +21259,7 @@ tcp_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr) bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL); CONN_INC_REF(connp); SQUEUE_ENTER_ONE(connp->conn_sqp, bp, tcp_shutdown_output, - connp, SQ_NODRAIN, SQTAG_TCP_SHUTDOWN_OUTPUT); + connp, NULL, SQ_NODRAIN, SQTAG_TCP_SHUTDOWN_OUTPUT); (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, SOCK_OPCTL_SHUT_SEND, 0); @@ -27109,7 +21332,7 @@ tcp_do_listen(conn_t *connp, struct sockaddr *sa, socklen_t len, */ goto do_listen; } - if (tcp->tcp_debug) { + if (connp->conn_debug) { (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, "tcp_listen: bad state, %d", tcp->tcp_state); } @@ -27121,15 +21344,14 @@ tcp_do_listen(conn_t *connp, struct sockaddr *sa, socklen_t len, sin6_t *sin6; ASSERT(IPCL_IS_NONSTR(connp)); - /* Do an implicit bind: Request for a generic port. 
*/ - if (tcp->tcp_family == AF_INET) { + if (connp->conn_family == AF_INET) { len = sizeof (sin_t); sin = (sin_t *)&addr; *sin = sin_null; sin->sin_family = AF_INET; } else { - ASSERT(tcp->tcp_family == AF_INET6); + ASSERT(connp->conn_family == AF_INET6); len = sizeof (sin6_t); sin6 = (sin6_t *)&addr; *sin6 = sin6_null; @@ -27171,23 +21393,42 @@ do_listen: } /* - * We can call ip_bind directly, the processing continues - * in tcp_post_ip_bind(). - * * We need to make sure that the conn_recv is set to a non-null * value before we insert the conn into the classifier table. * This is to avoid a race with an incoming packet which does an * ipcl_classify(). + * We initially set it to tcp_input_listener_unbound to try to + * pick a good squeue for the listener when the first SYN arrives. + * tcp_input_listener_unbound sets it to tcp_input_listener on that + * first SYN. */ - connp->conn_recv = tcp_conn_request; - if (tcp->tcp_family == AF_INET) { - error = ip_proto_bind_laddr_v4(connp, NULL, - IPPROTO_TCP, tcp->tcp_bound_source, tcp->tcp_lport, B_TRUE); - } else { - error = ip_proto_bind_laddr_v6(connp, NULL, IPPROTO_TCP, - &tcp->tcp_bound_source_v6, tcp->tcp_lport, B_TRUE); + connp->conn_recv = tcp_input_listener_unbound; + + /* Insert the listener in the classifier table */ + error = ip_laddr_fanout_insert(connp); + if (error != 0) { + /* Undo the bind - release the port number */ + tcp->tcp_state = TCPS_IDLE; + connp->conn_bound_addr_v6 = ipv6_all_zeros; + + connp->conn_laddr_v6 = ipv6_all_zeros; + connp->conn_saddr_v6 = ipv6_all_zeros; + connp->conn_ports = 0; + + if (connp->conn_anon_port) { + zone_t *zone; + + zone = crgetzone(cr); + connp->conn_anon_port = B_FALSE; + (void) tsol_mlp_anon(zone, connp->conn_mlp_type, + connp->conn_proto, connp->conn_lport, B_FALSE); + } + connp->conn_mlp_type = mlptSingle; + + tcp_bind_hash_remove(tcp); + return (error); } - return (tcp_post_ip_bind(tcp, NULL, error, NULL, 0)); + return (error); } void @@ -27222,7 +21463,7 @@ 
tcp_clr_flowctrl(sock_lower_handle_t proto_handle) if (tcp->tcp_fused) { tcp_fuse_backenable(tcp); } else { - tcp->tcp_rwnd = tcp->tcp_recv_hiwater; + tcp->tcp_rwnd = connp->conn_rcvbuf; /* * Send back a window update immediately if TCP is above * ESTABLISHED state and the increase of the rcv window @@ -27253,10 +21494,28 @@ tcp_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg, /* All Solaris components should pass a cred for this operation. */ ASSERT(cr != NULL); + /* + * If we don't have a helper stream then create one. + * ip_create_helper_stream takes care of locking the conn_t, + * so this check for NULL is just a performance optimization. + */ + if (connp->conn_helper_info == NULL) { + tcp_stack_t *tcps = connp->conn_tcp->tcp_tcps; + + /* + * Create a helper stream for non-STREAMS socket. + */ + error = ip_create_helper_stream(connp, tcps->tcps_ldi_ident); + if (error != 0) { + ip0dbg(("tcp_ioctl: create of IP helper stream " + "failed %d\n", error)); + return (error); + } + } + switch (cmd) { case ND_SET: case ND_GET: - case TCP_IOC_DEFAULT_Q: case _SIOCSOCKFALLBACK: case TCP_IOC_ABORT_CONN: case TI_GETPEERNAME: |