author | Kacheong Poon <Kacheong.Poon@Sun.COM> | 2010-02-24 07:49:29 -0800
---|---|---
committer | Kacheong Poon <Kacheong.Poon@Sun.COM> | 2010-02-24 07:49:29 -0800
commit | 721fffe35d40e548a5a58dc53a2ec9c6762172d9 (patch) |
tree | 0d10f62e1ca25f6d524b97fc4240fe59fa9e8548 | /usr/src/uts
parent | 3357fc65c82fa21d1aabd8d906fb1f49810afe0b (diff) |
download | illumos-gate-721fffe35d40e548a5a58dc53a2ec9c6762172d9.tar.gz |
PSARC 2010/042 increase max TCP_INIT_CWND
6923847 Increase TCP_INIT_CWND max
6918307 Some TCP kstats are not needed
6923858 TCP connection counter
6925635 The file tcp.c is too big
Diffstat (limited to 'usr/src/uts')
31 files changed, 16330 insertions, 17305 deletions
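The headline change raises the cap on the TCP_INIT_CWND option. For context, the SET_TCP_INIT_CWND macro that this diff relocates out of tcp.c clamps the initial congestion window per RFC 3390 unless the application has set TCP_INIT_CWND explicitly. Below is a minimal userland sketch of that arithmetic, assuming illustrative names; it is not illumos code, just the same formula pulled out for inspection.

```c
/*
 * Sketch of the RFC 3390 initial-cwnd clamp implemented by the
 * SET_TCP_INIT_CWND macro moved out of tcp.c by this commit.
 * Function and variable names here are illustrative only.
 */
#include <stdio.h>
#include <stdint.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))

/*
 * If the application did not override the initial cwnd (init_cwnd == 0),
 * apply the RFC 3390 formula, further capped at def_max_init_cwnd
 * segments; otherwise honor the per-connection TCP_INIT_CWND setting.
 */
static uint32_t
initial_cwnd(uint32_t mss, uint32_t init_cwnd, uint32_t def_max_init_cwnd)
{
	if (init_cwnd == 0) {
		return (MIN(def_max_init_cwnd * mss,
		    MIN(4 * mss, MAX(2 * mss, 4380 / mss * mss))));
	}
	return (init_cwnd * mss);
}

int
main(void)
{
	/* Typical Ethernet MSS: RFC 3390 yields 3 segments (4380 bytes). */
	printf("mss=1460: cwnd=%u\n", initial_cwnd(1460, 0, 4));
	/* With TCP_INIT_CWND explicitly set to 8 segments. */
	printf("mss=1460, init_cwnd=8: cwnd=%u\n", initial_cwnd(1460, 8, 4));
	return (0);
}
```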
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index d5a5b5ea96..3dba7ff046 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -526,7 +526,9 @@ RTS_OBJS += rtsddi.o IP_ICMP_OBJS = icmp.o icmp_opt_data.o IP_RTS_OBJS = rts.o rts_opt_data.o -IP_TCP_OBJS = tcp.o tcp_fusion.o tcp_kssl.o tcp_opt_data.o tcp_sack.o +IP_TCP_OBJS = tcp.o tcp_fusion.o tcp_kssl.o tcp_opt_data.o tcp_sack.o \ + tcp_stats.o tcp_misc.o tcp_timers.o tcp_time_wait.o tcp_tpi.o \ + tcp_output.o tcp_input.o tcp_socket.o tcp_bind.o tcp_cluster.o IP_UDP_OBJS = udp.o udp_opt_data.o IP_SCTP_OBJS = sctp.o sctp_opt_data.o sctp_output.o \ sctp_init.o sctp_input.o sctp_cookie.o \ diff --git a/usr/src/uts/common/inet/Makefile b/usr/src/uts/common/inet/Makefile index 3d45e4861c..a5da360b01 100644 --- a/usr/src/uts/common/inet/Makefile +++ b/usr/src/uts/common/inet/Makefile @@ -20,7 +20,7 @@ # # -# Copyright 2009 Sun Microsystems, Inc. All rights reserved. +# Copyright 2010 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # uts/common/inet/Makefile @@ -34,7 +34,7 @@ HDRS= arp.h common.h ipclassifier.h ip.h ip6.h ipdrop.h ipnet.h \ led.h mi.h mib2.h nd.h optcom.h sadb.h sctp_itf.h snmpcom.h tcp.h \ tcp_sack.h tcp_stack.h udp_impl.h rawip_impl.h ipp_common.h \ ip_ftable.h ip_impl.h ip_stack.h ip_arp.h tcp_impl.h wifi_ioctl.h \ - ip2mac.h ip2mac_impl.h + ip2mac.h ip2mac_impl.h tcp_stats.h ROOTDIRS= $(ROOT)/usr/include/inet diff --git a/usr/src/uts/common/inet/ip.h b/usr/src/uts/common/inet/ip.h index 0cc4b522fe..ba57cf4406 100644 --- a/usr/src/uts/common/inet/ip.h +++ b/usr/src/uts/common/inet/ip.h @@ -3732,6 +3732,9 @@ extern boolean_t ip_recv_attr_is_mblk(mblk_t *); #define SQTAG_TCP_IXA_CLEANUP 44 #define SQTAG_TCP_SEND_SYNACK 45 +extern sin_t sin_null; /* Zero address for quick clears */ +extern sin6_t sin6_null; /* Zero address for quick clears */ + #endif /* _KERNEL */ #ifdef __cplusplus diff --git a/usr/src/uts/common/inet/ip/conn_opt.c b/usr/src/uts/common/inet/ip/conn_opt.c index a80a4893fb..c7a53d793e 100644 --- a/usr/src/uts/common/inet/ip/conn_opt.c +++ b/usr/src/uts/common/inet/ip/conn_opt.c @@ -101,9 +101,6 @@ #include <sys/tsol/label.h> #include <sys/tsol/tnet.h> -static sin_t sin_null; /* Zero address for quick clears */ -static sin6_t sin6_null; /* Zero address for quick clears */ - /* * Return how much size is needed for the different ancillary data items */ diff --git a/usr/src/uts/common/inet/ip/icmp.c b/usr/src/uts/common/inet/ip/icmp.c index 57a5f8690d..2b05d02458 100644 --- a/usr/src/uts/common/inet/ip/icmp.c +++ b/usr/src/uts/common/inet/ip/icmp.c @@ -203,9 +203,6 @@ struct streamtab icmpinfov6 = { &icmprinitv6, &icmpwinit }; -static sin_t sin_null; /* Zero address for quick clears */ -static sin6_t sin6_null; /* Zero address for quick clears */ - /* Default structure copied into T_INFO_ACK messages */ static struct T_info_ack icmp_g_t_info_ack = { T_INFO_ACK, diff --git a/usr/src/uts/common/inet/ip/ip_if.c b/usr/src/uts/common/inet/ip/ip_if.c index debf6bbf1f..384b56ce57 100644 --- a/usr/src/uts/common/inet/ip/ip_if.c +++ b/usr/src/uts/common/inet/ip/ip_if.c @@ -277,8 +277,10 @@ static ill_t ill_null; /* Empty ILL for init. 
*/ char ipif_loopback_name[] = "lo0"; static char *ipv4_forward_suffix = ":ip_forwarding"; static char *ipv6_forward_suffix = ":ip6_forwarding"; -static sin6_t sin6_null; /* Zero address for quick clears */ -static sin_t sin_null; /* Zero address for quick clears */ + +/* These are used by all IP network modules. */ +sin6_t sin6_null; /* Zero address for quick clears */ +sin_t sin_null; /* Zero address for quick clears */ /* When set search for unused ipif_seqid */ static ipif_t ipif_zero; diff --git a/usr/src/uts/common/inet/ip/ipclassifier.c b/usr/src/uts/common/inet/ip/ipclassifier.c index 3cd30ba4ca..bd37503d0b 100644 --- a/usr/src/uts/common/inet/ip/ipclassifier.c +++ b/usr/src/uts/common/inet/ip/ipclassifier.c @@ -317,7 +317,6 @@ typedef union itc_s { struct kmem_cache *tcp_conn_cache; struct kmem_cache *ip_conn_cache; extern struct kmem_cache *sctp_conn_cache; -extern struct kmem_cache *tcp_sack_info_cache; struct kmem_cache *udp_conn_cache; struct kmem_cache *rawip_conn_cache; struct kmem_cache *rts_conn_cache; diff --git a/usr/src/uts/common/inet/tcp.h b/usr/src/uts/common/inet/tcp.h index bf7dbf85f6..ce15f101ba 100644 --- a/usr/src/uts/common/inet/tcp.h +++ b/usr/src/uts/common/inet/tcp.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -142,15 +142,14 @@ struct tcp_listen_cnt_s; #if (defined(_KERNEL) || defined(_KMEMUSER)) typedef struct tcp_s { - /* Pointer to previous bind hash next. */ struct tcp_s *tcp_time_wait_next; /* Pointer to next T/W block */ struct tcp_s *tcp_time_wait_prev; /* Pointer to previous T/W next */ clock_t tcp_time_wait_expire; - struct conn_s *tcp_connp; - tcp_stack_t *tcp_tcps; /* Shortcut via conn_netstack */ + struct conn_s *tcp_connp; /* back pointer to conn_t */ + tcp_stack_t *tcp_tcps; /* back pointer to tcp_stack_t */ int32_t tcp_state; int32_t tcp_rcv_ws; /* My window scale power */ @@ -169,9 +168,9 @@ typedef struct tcp_s { uint32_t tcp_rwnd; /* Fields arranged in approximate access order along main paths */ - mblk_t *tcp_xmit_head; /* Head of rexmit list */ - mblk_t *tcp_xmit_last; /* last valid data seen by tcp_wput */ - mblk_t *tcp_xmit_tail; /* Last rexmit data sent */ + mblk_t *tcp_xmit_head; /* Head of xmit/rexmit list */ + mblk_t *tcp_xmit_last; /* Last valid data seen by tcp_wput */ + mblk_t *tcp_xmit_tail; /* Last data sent */ uint32_t tcp_unsent; /* # of bytes in hand that are unsent */ uint32_t tcp_xmit_tail_unsent; /* # of unsent bytes in xmit_tail */ @@ -376,7 +375,6 @@ typedef struct tcp_s { kcondvar_t tcp_closecv; uint8_t tcp_closed; uint8_t tcp_closeflags; - uint8_t tcp_cleandeathtag; mblk_t tcp_closemp; timeout_id_t tcp_linger_tid; /* Linger timer ID */ @@ -495,35 +493,28 @@ extern void tcp_conn_reclaim(void *); extern void tcp_free(tcp_t *tcp); extern void tcp_ddi_g_init(void); extern void tcp_ddi_g_destroy(void); -extern void tcp_xmit_listeners_reset(mblk_t *, ip_recv_attr_t *, - ip_stack_t *, conn_t *); -extern void tcp_input_listener(void *arg, mblk_t *mp, void *arg2, - ip_recv_attr_t *); -extern void tcp_input_listener_unbound(void *arg, mblk_t *mp, void *arg2, - ip_recv_attr_t *); -extern void tcp_input_data(void *arg, mblk_t *mp, void *arg2, - ip_recv_attr_t *); extern void *tcp_get_conn(void *arg, tcp_stack_t *); -extern void tcp_time_wait_collector(void *arg); extern mblk_t *tcp_snmp_get(queue_t *, mblk_t *); extern int 
tcp_snmp_set(queue_t *, int, int, uchar_t *, int len); -extern mblk_t *tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, - int32_t *offset, mblk_t **end_mp, uint32_t seq, - boolean_t sendall, uint32_t *seg_len, boolean_t rexmit); /* - * The TCP Fanout structure. - * The hash tables and their linkage (tcp_*_hash_next, tcp_ptp*hn) are - * protected by the per-bucket tf_lock. Each tcp_t + * The TCP Fanout structure for bind and acceptor hashes. + * The hash tables and their linkage (tcp_*_hash, tcp_ptp*hn) are + * protected by the per-bucket tf_lock. Each tcp_t * inserted in the list points back at this lock using tcp_*_lockp. * - * The listener and acceptor hash queues are lists of tcp_t. + * The bind and acceptor hash queues are lists of tcp_t. */ /* listener hash and acceptor hash queue head */ typedef struct tf_s { tcp_t *tf_tcp; kmutex_t tf_lock; } tf_t; + + +/* Also used in ipclassifier.c */ +extern struct kmem_cache *tcp_sack_info_cache; + #endif /* (defined(_KERNEL) || defined(_KMEMUSER)) */ /* Contract private interface between TCP and Clustering. */ diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c index 1b43c16b72..8c746dc33b 100644 --- a/usr/src/uts/common/inet/tcp/tcp.c +++ b/usr/src/uts/common/inet/tcp/tcp.c @@ -83,6 +83,7 @@ #include <inet/kstatcom.h> #include <inet/tcp.h> #include <inet/tcp_impl.h> +#include <inet/tcp_cluster.h> #include <inet/udp_impl.h> #include <net/pfkeyv2.h> #include <inet/ipdrop.h> @@ -102,8 +103,6 @@ #include <rpc/pmap_prot.h> #include <sys/callo.h> -#include <sys/clock_impl.h> /* For LBOLT_FASTPATH{,64} */ - /* * TCP Notes: aka FireEngine Phase I (PSARC 2002/433) * @@ -228,297 +227,9 @@ int tcp_squeue_wput = 2; /* /etc/systems */ int tcp_squeue_flag; -/* - * This controls how tiny a write must be before we try to copy it - * into the mblk on the tail of the transmit queue. Not much - * speedup is observed for values larger than sixteen. Zero will - * disable the optimisation. - */ -int tcp_tx_pull_len = 16; - -/* - * TCP Statistics. - * - * How TCP statistics work. - * - * There are two types of statistics invoked by two macros. - * - * TCP_STAT(name) does non-atomic increment of a named stat counter. It is - * supposed to be used in non MT-hot paths of the code. - * - * TCP_DBGSTAT(name) does atomic increment of a named stat counter. It is - * supposed to be used for DEBUG purposes and may be used on a hot path. - * - * Both TCP_STAT and TCP_DBGSTAT counters are available using kstat - * (use "kstat tcp" to get them). - * - * There is also additional debugging facility that marks tcp_clean_death() - * instances and saves them in tcp_t structure. It is triggered by - * TCP_TAG_CLEAN_DEATH define. Also, there is a global array of counters for - * tcp_clean_death() calls that counts the number of times each tag was hit. It - * is triggered by TCP_CLD_COUNTERS define. - * - * How to add new counters. - * - * 1) Add a field in the tcp_stat structure describing your counter. - * 2) Add a line in the template in tcp_kstat2_init() with the name - * of the counter. - * - * IMPORTANT!! - make sure that both are in sync !! - * 3) Use either TCP_STAT or TCP_DBGSTAT with the name. - * - * Please avoid using private counters which are not kstat-exported. - * - * TCP_TAG_CLEAN_DEATH set to 1 enables tagging of tcp_clean_death() instances - * in tcp_t structure. - * - * TCP_MAX_CLEAN_DEATH_TAG is the maximum number of possible clean death tags. 
- */ - -#ifndef TCP_DEBUG_COUNTER -#ifdef DEBUG -#define TCP_DEBUG_COUNTER 1 -#else -#define TCP_DEBUG_COUNTER 0 -#endif -#endif - -#define TCP_CLD_COUNTERS 0 - -#define TCP_TAG_CLEAN_DEATH 1 -#define TCP_MAX_CLEAN_DEATH_TAG 32 - -#ifdef lint -static int _lint_dummy_; -#endif - -#if TCP_CLD_COUNTERS -static uint_t tcp_clean_death_stat[TCP_MAX_CLEAN_DEATH_TAG]; -#define TCP_CLD_STAT(x) tcp_clean_death_stat[x]++ -#elif defined(lint) -#define TCP_CLD_STAT(x) ASSERT(_lint_dummy_ == 0); -#else -#define TCP_CLD_STAT(x) -#endif - -#if TCP_DEBUG_COUNTER -#define TCP_DBGSTAT(tcps, x) \ - atomic_add_64(&((tcps)->tcps_statistics.x.value.ui64), 1) -#define TCP_G_DBGSTAT(x) \ - atomic_add_64(&(tcp_g_statistics.x.value.ui64), 1) -#elif defined(lint) -#define TCP_DBGSTAT(tcps, x) ASSERT(_lint_dummy_ == 0); -#define TCP_G_DBGSTAT(x) ASSERT(_lint_dummy_ == 0); -#else -#define TCP_DBGSTAT(tcps, x) -#define TCP_G_DBGSTAT(x) -#endif - -#define TCP_G_STAT(x) (tcp_g_statistics.x.value.ui64++) - -tcp_g_stat_t tcp_g_statistics; -kstat_t *tcp_g_kstat; - -/* Macros for timestamp comparisons */ -#define TSTMP_GEQ(a, b) ((int32_t)((a)-(b)) >= 0) -#define TSTMP_LT(a, b) ((int32_t)((a)-(b)) < 0) - -/* - * Parameters for TCP Initial Send Sequence number (ISS) generation. When - * tcp_strong_iss is set to 1, which is the default, the ISS is calculated - * by adding three components: a time component which grows by 1 every 4096 - * nanoseconds (versus every 4 microseconds suggested by RFC 793, page 27); - * a per-connection component which grows by 125000 for every new connection; - * and an "extra" component that grows by a random amount centered - * approximately on 64000. This causes the ISS generator to cycle every - * 4.89 hours if no TCP connections are made, and faster if connections are - * made. - * - * When tcp_strong_iss is set to 0, ISS is calculated by adding two - * components: a time component which grows by 250000 every second; and - * a per-connection component which grows by 125000 for every new connections. - * - * A third method, when tcp_strong_iss is set to 2, for generating ISS is - * prescribed by Steve Bellovin. This involves adding time, the 125000 per - * connection, and a one-way hash (MD5) of the connection ID <sport, dport, - * src, dst>, a "truly" random (per RFC 1750) number, and a console-entered - * password. - */ -#define ISS_INCR 250000 -#define ISS_NSEC_SHT 12 - -static sin_t sin_null; /* Zero address for quick clears */ -static sin6_t sin6_null; /* Zero address for quick clears */ - -/* - * This implementation follows the 4.3BSD interpretation of the urgent - * pointer and not RFC 1122. Switching to RFC 1122 behavior would cause - * incompatible changes in protocols like telnet and rlogin. - */ -#define TCP_OLD_URP_INTERPRETATION 1 - -/* - * Since tcp_listener is not cleared atomically with tcp_detached - * being cleared we need this extra bit to tell a detached connection - * apart from one that is in the process of being accepted. - */ -#define TCP_IS_DETACHED_NONEAGER(tcp) \ - (TCP_IS_DETACHED(tcp) && \ - (!(tcp)->tcp_hard_binding)) - -/* - * TCP reassembly macros. We hide starting and ending sequence numbers in - * b_next and b_prev of messages on the reassembly queue. The messages are - * chained using b_cont. These macros are used in tcp_reass() so we don't - * have to see the ugly casts and assignments. 
- */ -#define TCP_REASS_SEQ(mp) ((uint32_t)(uintptr_t)((mp)->b_next)) -#define TCP_REASS_SET_SEQ(mp, u) ((mp)->b_next = \ - (mblk_t *)(uintptr_t)(u)) -#define TCP_REASS_END(mp) ((uint32_t)(uintptr_t)((mp)->b_prev)) -#define TCP_REASS_SET_END(mp, u) ((mp)->b_prev = \ - (mblk_t *)(uintptr_t)(u)) - -/* - * Implementation of TCP Timers. - * ============================= - * - * INTERFACE: - * - * There are two basic functions dealing with tcp timers: - * - * timeout_id_t tcp_timeout(connp, func, time) - * clock_t tcp_timeout_cancel(connp, timeout_id) - * TCP_TIMER_RESTART(tcp, intvl) - * - * tcp_timeout() starts a timer for the 'tcp' instance arranging to call 'func' - * after 'time' ticks passed. The function called by timeout() must adhere to - * the same restrictions as a driver soft interrupt handler - it must not sleep - * or call other functions that might sleep. The value returned is the opaque - * non-zero timeout identifier that can be passed to tcp_timeout_cancel() to - * cancel the request. The call to tcp_timeout() may fail in which case it - * returns zero. This is different from the timeout(9F) function which never - * fails. - * - * The call-back function 'func' always receives 'connp' as its single - * argument. It is always executed in the squeue corresponding to the tcp - * structure. The tcp structure is guaranteed to be present at the time the - * call-back is called. - * - * NOTE: The call-back function 'func' is never called if tcp is in - * the TCPS_CLOSED state. - * - * tcp_timeout_cancel() attempts to cancel a pending tcp_timeout() - * request. locks acquired by the call-back routine should not be held across - * the call to tcp_timeout_cancel() or a deadlock may result. - * - * tcp_timeout_cancel() returns -1 if it can not cancel the timeout request. - * Otherwise, it returns an integer value greater than or equal to 0. In - * particular, if the call-back function is already placed on the squeue, it can - * not be canceled. - * - * NOTE: both tcp_timeout() and tcp_timeout_cancel() should always be called - * within squeue context corresponding to the tcp instance. Since the - * call-back is also called via the same squeue, there are no race - * conditions described in untimeout(9F) manual page since all calls are - * strictly serialized. - * - * TCP_TIMER_RESTART() is a macro that attempts to cancel a pending timeout - * stored in tcp_timer_tid and starts a new one using - * MSEC_TO_TICK(intvl). It always uses tcp_timer() function as a call-back - * and stores the return value of tcp_timeout() in the tcp->tcp_timer_tid - * field. - * - * NOTE: since the timeout cancellation is not guaranteed, the cancelled - * call-back may still be called, so it is possible tcp_timer() will be - * called several times. This should not be a problem since tcp_timer() - * should always check the tcp instance state. - * - * - * IMPLEMENTATION: - * - * TCP timers are implemented using three-stage process. The call to - * tcp_timeout() uses timeout(9F) function to call tcp_timer_callback() function - * when the timer expires. The tcp_timer_callback() arranges the call of the - * tcp_timer_handler() function via squeue corresponding to the tcp - * instance. The tcp_timer_handler() calls actual requested timeout call-back - * and passes tcp instance as an argument to it. Information is passed between - * stages using the tcp_timer_t structure which contains the connp pointer, the - * tcp call-back to call and the timeout id returned by the timeout(9F). 
- * - * The tcp_timer_t structure is not used directly, it is embedded in an mblk_t - - * like structure that is used to enter an squeue. The mp->b_rptr of this pseudo - * mblk points to the beginning of tcp_timer_t structure. The tcp_timeout() - * returns the pointer to this mblk. - * - * The pseudo mblk is allocated from a special tcp_timer_cache kmem cache. It - * looks like a normal mblk without actual dblk attached to it. - * - * To optimize performance each tcp instance holds a small cache of timer - * mblocks. In the current implementation it caches up to two timer mblocks per - * tcp instance. The cache is preserved over tcp frees and is only freed when - * the whole tcp structure is destroyed by its kmem destructor. Since all tcp - * timer processing happens on a corresponding squeue, the cache manipulation - * does not require any locks. Experiments show that majority of timer mblocks - * allocations are satisfied from the tcp cache and do not involve kmem calls. - * - * The tcp_timeout() places a refhold on the connp instance which guarantees - * that it will be present at the time the call-back function fires. The - * tcp_timer_handler() drops the reference after calling the call-back, so the - * call-back function does not need to manipulate the references explicitly. - */ - -typedef struct tcp_timer_s { - conn_t *connp; - void (*tcpt_proc)(void *); - callout_id_t tcpt_tid; -} tcp_timer_t; - -static kmem_cache_t *tcp_timercache; kmem_cache_t *tcp_sack_info_cache; /* - * For scalability, we must not run a timer for every TCP connection - * in TIME_WAIT state. To see why, consider (for time wait interval of - * 4 minutes): - * 1000 connections/sec * 240 seconds/time wait = 240,000 active conn's - * - * This list is ordered by time, so you need only delete from the head - * until you get to entries which aren't old enough to delete yet. - * The list consists of only the detached TIME_WAIT connections. - * - * Note that the timer (tcp_time_wait_expire) is started when the tcp_t - * becomes detached TIME_WAIT (either by changing the state and already - * being detached or the other way around). This means that the TIME_WAIT - * state can be extended (up to doubled) if the connection doesn't become - * detached for a long time. - * - * The list manipulations (including tcp_time_wait_next/prev) - * are protected by the tcp_time_wait_lock. The content of the - * detached TIME_WAIT connections is protected by the normal perimeters. - * - * This list is per squeue and squeues are shared across the tcp_stack_t's. - * Things on tcp_time_wait_head remain associated with the tcp_stack_t - * and conn_netstack. - * The tcp_t's that are added to tcp_free_list are disassociated and - * have NULL tcp_tcps and conn_netstack pointers. - */ -typedef struct tcp_squeue_priv_s { - kmutex_t tcp_time_wait_lock; - callout_id_t tcp_time_wait_tid; - tcp_t *tcp_time_wait_head; - tcp_t *tcp_time_wait_tail; - tcp_t *tcp_free_list; - uint_t tcp_free_list_cnt; -} tcp_squeue_priv_t; - -/* - * TCP_TIME_WAIT_DELAY governs how often the time_wait_collector runs. - * Running it every 5 seconds seems to give the best results. - */ -#define TCP_TIME_WAIT_DELAY drv_usectohz(5000000) - -/* * To prevent memory hog, limit the number of entries in tcp_free_list * to 1% of available memory / number of cpus */ @@ -529,21 +240,9 @@ uint_t tcp_free_list_max_cnt = 0; #define TCP_RECV_LOWATER 2048 #define TCP_RECV_HIWATER 128000 -/* - * PAWS needs a timer for 24 days. 
This is the number of ticks in 24 days - */ -#define PAWS_TIMEOUT ((clock_t)(24*24*60*60*hz)) - #define TIDUSZ 4096 /* transport interface data unit size */ /* - * Bind hash list size and has function. It has to be a power of 2 for - * hashing. - */ -#define TCP_BIND_FANOUT_SIZE 512 -#define TCP_BIND_HASH(lport) (ntohs(lport) & (TCP_BIND_FANOUT_SIZE - 1)) - -/* * Size of acceptor hash list. It has to be a power of 2 for hashing. */ #define TCP_ACCEPTOR_FANOUT_SIZE 256 @@ -556,149 +255,11 @@ uint_t tcp_free_list_max_cnt = 0; ((uint_t)(accid) & (TCP_ACCEPTOR_FANOUT_SIZE - 1)) #endif /* _ILP32 */ -#define IP_ADDR_CACHE_SIZE 2048 -#define IP_ADDR_CACHE_HASH(faddr) \ - (ntohl(faddr) & (IP_ADDR_CACHE_SIZE -1)) - -/* - * If there is a limit set on the number of connections allowed per each - * listener, the following struct is used to store that counter. This needs - * to be separated from the listener since the listener can go away before - * all the connections are gone. When the struct is allocated, tlc_cnt is set - * to 1. When the listener goes away, tlc_cnt is decremented by one. And - * the last connection (or the listener) which decrements tlc_cnt to zero - * frees the struct. - * - * tlc_max is the threshold value tcps_conn_listen_port. It is set when the - * tcp_listen_cnt_t is allocated. - * - * tlc_report_time stores the time when cmn_err() is called to report that the - * max has been exceeeded. Report is done at most once every - * TCP_TLC_REPORT_INTERVAL mins for a listener. - * - * tlc_drop stores the number of connection attempt dropped because the - * limit has reached. - */ -typedef struct tcp_listen_cnt_s { - uint32_t tlc_max; - uint32_t tlc_cnt; - int64_t tlc_report_time; - uint32_t tlc_drop; -} tcp_listen_cnt_t; - -#define TCP_TLC_REPORT_INTERVAL (1 * MINUTES) - -#define TCP_DECR_LISTEN_CNT(tcp) \ -{ \ - ASSERT((tcp)->tcp_listen_cnt->tlc_cnt > 0); \ - if (atomic_add_32_nv(&(tcp)->tcp_listen_cnt->tlc_cnt, -1) == 0) \ - kmem_free((tcp)->tcp_listen_cnt, sizeof (tcp_listen_cnt_t)); \ - (tcp)->tcp_listen_cnt = NULL; \ -} - /* Minimum number of connections per listener. */ -uint32_t tcp_min_conn_listener = 2; - -/* - * Linked list struct to store listener connection limit configuration per - * IP stack. - */ -typedef struct tcp_listener_s { - in_port_t tl_port; - uint32_t tl_ratio; - list_node_t tl_link; -} tcp_listener_t; +static uint32_t tcp_min_conn_listener = 2; -/* - * The shift factor applied to tcp_mss to decide if the peer sends us a - * valid initial receive window. By default, if the peer receive window - * is smaller than 1 MSS (shift factor is 0), it is considered as invalid. - */ -uint32_t tcp_init_wnd_shft = 0; - -/* Control whether TCP can enter defensive mode when under memory pressure. */ -boolean_t tcp_do_reclaim = B_TRUE; - -/* - * When the system is under memory pressure, stack variable tcps_reclaim is - * true, we shorten the connection timeout abort interval to tcp_early_abort - * seconds. - */ uint32_t tcp_early_abort = 30; -/* - * TCP options struct returned from tcp_parse_options. 
- */ -typedef struct tcp_opt_s { - uint32_t tcp_opt_mss; - uint32_t tcp_opt_wscale; - uint32_t tcp_opt_ts_val; - uint32_t tcp_opt_ts_ecr; - tcp_t *tcp; -} tcp_opt_t; - -/* - * RFC1323-recommended phrasing of TSTAMP option, for easier parsing - */ - -#ifdef _BIG_ENDIAN -#define TCPOPT_NOP_NOP_TSTAMP ((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \ - (TCPOPT_TSTAMP << 8) | 10) -#else -#define TCPOPT_NOP_NOP_TSTAMP ((10 << 24) | (TCPOPT_TSTAMP << 16) | \ - (TCPOPT_NOP << 8) | TCPOPT_NOP) -#endif - -/* - * Flags returned from tcp_parse_options. - */ -#define TCP_OPT_MSS_PRESENT 1 -#define TCP_OPT_WSCALE_PRESENT 2 -#define TCP_OPT_TSTAMP_PRESENT 4 -#define TCP_OPT_SACK_OK_PRESENT 8 -#define TCP_OPT_SACK_PRESENT 16 - -/* TCP option length */ -#define TCPOPT_NOP_LEN 1 -#define TCPOPT_MAXSEG_LEN 4 -#define TCPOPT_WS_LEN 3 -#define TCPOPT_REAL_WS_LEN (TCPOPT_WS_LEN+1) -#define TCPOPT_TSTAMP_LEN 10 -#define TCPOPT_REAL_TS_LEN (TCPOPT_TSTAMP_LEN+2) -#define TCPOPT_SACK_OK_LEN 2 -#define TCPOPT_REAL_SACK_OK_LEN (TCPOPT_SACK_OK_LEN+2) -#define TCPOPT_REAL_SACK_LEN 4 -#define TCPOPT_MAX_SACK_LEN 36 -#define TCPOPT_HEADER_LEN 2 - -/* TCP cwnd burst factor. */ -#define TCP_CWND_INFINITE 65535 -#define TCP_CWND_SS 3 -#define TCP_CWND_NORMAL 5 - -/* Maximum TCP initial cwin (start/restart). */ -#define TCP_MAX_INIT_CWND 8 - -/* - * Initialize cwnd according to RFC 3390. def_max_init_cwnd is - * either tcp_slow_start_initial or tcp_slow_start_after idle - * depending on the caller. If the upper layer has not used the - * TCP_INIT_CWND option to change the initial cwnd, tcp_init_cwnd - * should be 0 and we use the formula in RFC 3390 to set tcp_cwnd. - * If the upper layer has changed set the tcp_init_cwnd, just use - * it to calculate the tcp_cwnd. - */ -#define SET_TCP_INIT_CWND(tcp, mss, def_max_init_cwnd) \ -{ \ - if ((tcp)->tcp_init_cwnd == 0) { \ - (tcp)->tcp_cwnd = MIN(def_max_init_cwnd * (mss), \ - MIN(4 * (mss), MAX(2 * (mss), 4380 / (mss) * (mss)))); \ - } else { \ - (tcp)->tcp_cwnd = (tcp)->tcp_init_cwnd * (mss); \ - } \ - tcp->tcp_cwnd_cnt = 0; \ -} - /* TCP Timer control structure */ typedef struct tcpt_s { pfv_t tcpt_pfv; /* The routine we are to call */ @@ -710,88 +271,27 @@ typedef struct tcpt_s { */ void tcp_input_listener(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira); -static void tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2, - ip_recv_attr_t *dummy); -void tcp_accept_finish(void *arg, mblk_t *mp, void *arg2, - ip_recv_attr_t *dummy); -static void tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2, - ip_recv_attr_t *dummy); -static void tcp_wput_proto(void *arg, mblk_t *mp, void *arg2, - ip_recv_attr_t *dummy); void tcp_input_data(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira); -static void tcp_close_output(void *arg, mblk_t *mp, void *arg2, - ip_recv_attr_t *dummy); -void tcp_output(void *arg, mblk_t *mp, void *arg2, - ip_recv_attr_t *dummy); -void tcp_output_urgent(void *arg, mblk_t *mp, void *arg2, - ip_recv_attr_t *dummy); -static void tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2, - ip_recv_attr_t *dummy); -static void tcp_timer_handler(void *arg, mblk_t *mp, void *arg2, - ip_recv_attr_t *dummy); static void tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy); -static void tcp_send_synack(void *arg, mblk_t *mp, void *arg2, - ip_recv_attr_t *dummy); /* Prototype for TCP functions */ static void tcp_random_init(void); int tcp_random(void); -static void tcp_tli_accept(tcp_t *tcp, mblk_t *mp); -static void tcp_accept_swap(tcp_t *listener, 
tcp_t *acceptor, - tcp_t *eager); -static int tcp_set_destination(tcp_t *tcp); -static in_port_t tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, - int reuseaddr, boolean_t quick_connect, boolean_t bind_to_req_port_only, - boolean_t user_specified); -static void tcp_closei_local(tcp_t *tcp); -static void tcp_close_detached(tcp_t *tcp); -static boolean_t tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, - mblk_t *idmp, mblk_t **defermp, ip_recv_attr_t *ira); -static void tcp_tpi_connect(tcp_t *tcp, mblk_t *mp); static int tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp, in_port_t dstport, uint_t srcid); static int tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp, in_port_t dstport, uint32_t flowinfo, uint_t srcid, uint32_t scope_id); -static int tcp_clean_death(tcp_t *tcp, int err, uint8_t tag); -static void tcp_disconnect(tcp_t *tcp, mblk_t *mp); -static char *tcp_display(tcp_t *tcp, char *, char); -static boolean_t tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum); -static void tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only); -static void tcp_eager_unlink(tcp_t *tcp); -static void tcp_err_ack(tcp_t *tcp, mblk_t *mp, int tlierr, - int unixerr); -static void tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive, - int tlierr, int unixerr); static int tcp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr); static int tcp_extra_priv_ports_add(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr); static int tcp_extra_priv_ports_del(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr); -static int tcp_tpistate(tcp_t *tcp); -static void tcp_bind_hash_insert(tf_t *tf, tcp_t *tcp, - int caller_holds_lock); -static void tcp_bind_hash_remove(tcp_t *tcp); -static tcp_t *tcp_acceptor_hash_lookup(t_uscalar_t id, tcp_stack_t *); -void tcp_acceptor_hash_insert(t_uscalar_t id, tcp_t *tcp); -static void tcp_acceptor_hash_remove(tcp_t *tcp); -static void tcp_capability_req(tcp_t *tcp, mblk_t *mp); -static void tcp_info_req(tcp_t *tcp, mblk_t *mp); -static void tcp_addr_req(tcp_t *tcp, mblk_t *mp); -static void tcp_init_values(tcp_t *tcp); -static void tcp_ip_notify(tcp_t *tcp); static void tcp_iss_init(tcp_t *tcp); -static void tcp_keepalive_killer(void *arg); -static int tcp_parse_options(tcpha_t *tcpha, tcp_opt_t *tcpopt); -static void tcp_mss_set(tcp_t *tcp, uint32_t size); -static int tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp, - int *do_disconnectp, int *t_errorp, int *sys_errorp); -static boolean_t tcp_allow_connopt_set(int level, int name); -int tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr); static int tcp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr); static boolean_t tcp_param_register(IDP *ndp, tcpparam_t *tcppa, int cnt, tcp_stack_t *); @@ -802,160 +302,26 @@ static int tcp_param_set_aligned(queue_t *q, mblk_t *mp, char *value, static void tcp_iss_key_init(uint8_t *phrase, int len, tcp_stack_t *); static int tcp_1948_phrase_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr); -static void tcp_process_shrunk_swnd(tcp_t *tcp, uint32_t shrunk_cnt); -static void tcp_update_xmit_tail(tcp_t *tcp, uint32_t snxt); -static mblk_t *tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start); -static void tcp_reass_timer(void *arg); -static void tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp); static void tcp_reinit(tcp_t *tcp); static void tcp_reinit_values(tcp_t *tcp); -static uint_t tcp_rwnd_reopen(tcp_t *tcp); -static uint_t tcp_rcv_drain(tcp_t *tcp); -static void tcp_sack_rxmit(tcp_t *tcp, uint_t *flags); -static 
boolean_t tcp_send_rst_chk(tcp_stack_t *); -static void tcp_ss_rexmit(tcp_t *tcp); -static mblk_t *tcp_input_add_ancillary(tcp_t *tcp, mblk_t *mp, ip_pkt_t *ipp, - ip_recv_attr_t *); -static void tcp_process_options(tcp_t *, tcpha_t *); -static void tcp_rsrv(queue_t *q); -static int tcp_snmp_state(tcp_t *tcp); -static void tcp_timer(void *arg); -static void tcp_timer_callback(void *); -static in_port_t tcp_update_next_port(in_port_t port, const tcp_t *tcp, - boolean_t random); -static in_port_t tcp_get_next_priv_port(const tcp_t *); -static void tcp_wput_sock(queue_t *q, mblk_t *mp); -static void tcp_wput_fallback(queue_t *q, mblk_t *mp); -void tcp_tpi_accept(queue_t *q, mblk_t *mp); -static void tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent); -static void tcp_wput_flush(tcp_t *tcp, mblk_t *mp); -static void tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp); -static int tcp_send(tcp_t *tcp, const int mss, - const int total_hdr_len, const int tcp_hdr_len, - const int num_sack_blk, int *usable, uint_t *snxt, - int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time); -static void tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, - int num_sack_blk); static void tcp_wsrv(queue_t *q); -static int tcp_xmit_end(tcp_t *tcp); -static void tcp_ack_timer(void *arg); -static mblk_t *tcp_ack_mp(tcp_t *tcp); -static void tcp_xmit_early_reset(char *str, mblk_t *mp, - uint32_t seq, uint32_t ack, int ctl, ip_recv_attr_t *, - ip_stack_t *, conn_t *); -static void tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, - uint32_t ack, int ctl); -static void tcp_set_rto(tcp_t *, time_t); -static void tcp_icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *); -static void tcp_icmp_error_ipv6(tcp_t *, mblk_t *, ip_recv_attr_t *); -static boolean_t tcp_verifyicmp(conn_t *, void *, icmph_t *, icmp6_t *, - ip_recv_attr_t *); -static int tcp_build_hdrs(tcp_t *); -static void tcp_time_wait_append(tcp_t *tcp); -static void tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, - uint32_t seg_seq, uint32_t seg_ack, int seg_len, tcpha_t *tcpha, - ip_recv_attr_t *ira); -boolean_t tcp_paws_check(tcp_t *tcp, tcpha_t *tcpha, tcp_opt_t *tcpoptp); -static boolean_t tcp_zcopy_check(tcp_t *); -static void tcp_zcopy_notify(tcp_t *); -static mblk_t *tcp_zcopy_backoff(tcp_t *, mblk_t *, boolean_t); static void tcp_update_lso(tcp_t *tcp, ip_xmit_attr_t *ixa); -static void tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only); static void tcp_update_zcopy(tcp_t *tcp); static void tcp_notify(void *, ip_xmit_attr_t *, ixa_notify_type_t, ixa_notify_arg_t); -static void tcp_rexmit_after_error(tcp_t *tcp); -static void tcp_send_data(tcp_t *, mblk_t *); -extern mblk_t *tcp_timermp_alloc(int); -extern void tcp_timermp_free(tcp_t *); -static void tcp_timer_free(tcp_t *tcp, mblk_t *mp); -static void tcp_stop_lingering(tcp_t *tcp); -static void tcp_close_linger_timeout(void *arg); static void *tcp_stack_init(netstackid_t stackid, netstack_t *ns); static void tcp_stack_fini(netstackid_t stackid, void *arg); -static void *tcp_g_kstat_init(tcp_g_stat_t *); -static void tcp_g_kstat_fini(kstat_t *); -static void *tcp_kstat_init(netstackid_t, tcp_stack_t *); -static void tcp_kstat_fini(netstackid_t, kstat_t *); -static void *tcp_kstat2_init(netstackid_t, tcp_stat_t *); -static void tcp_kstat2_fini(netstackid_t, kstat_t *); -static int tcp_kstat_update(kstat_t *kp, int rw); -static mblk_t *tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, - ip_recv_attr_t *ira); -static mblk_t *tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, mblk_t *mp, - 
ip_recv_attr_t *ira); + static int tcp_squeue_switch(int); static int tcp_open(queue_t *, dev_t *, int, int, cred_t *, boolean_t); static int tcp_openv4(queue_t *, dev_t *, int, int, cred_t *); static int tcp_openv6(queue_t *, dev_t *, int, int, cred_t *); -static int tcp_tpi_close(queue_t *, int); -static int tcp_tpi_close_accept(queue_t *); static void tcp_squeue_add(squeue_t *); -static void tcp_setcred_data(mblk_t *, ip_recv_attr_t *); - -extern void tcp_kssl_input(tcp_t *, mblk_t *, cred_t *); - -void tcp_eager_kill(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy); -void tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2, - ip_recv_attr_t *dummy); - -static int tcp_accept(sock_lower_handle_t, sock_lower_handle_t, - sock_upper_handle_t, cred_t *); -static int tcp_listen(sock_lower_handle_t, int, cred_t *); -static int tcp_do_listen(conn_t *, struct sockaddr *, socklen_t, int, cred_t *, - boolean_t); -static int tcp_do_connect(conn_t *, const struct sockaddr *, socklen_t, - cred_t *, pid_t); -static int tcp_do_bind(conn_t *, struct sockaddr *, socklen_t, cred_t *, - boolean_t); -static int tcp_do_unbind(conn_t *); -static int tcp_bind_check(conn_t *, struct sockaddr *, socklen_t, cred_t *, - boolean_t); - -static void tcp_ulp_newconn(conn_t *, conn_t *, mblk_t *); - -static uint32_t tcp_find_listener_conf(tcp_stack_t *, in_port_t); -static int tcp_listener_conf_get(queue_t *, mblk_t *, caddr_t, cred_t *); -static int tcp_listener_conf_add(queue_t *, mblk_t *, char *, caddr_t, - cred_t *); -static int tcp_listener_conf_del(queue_t *, mblk_t *, char *, caddr_t, - cred_t *); -static void tcp_listener_conf_cleanup(tcp_stack_t *); - -/* - * Routines related to the TCP_IOC_ABORT_CONN ioctl command. - * - * TCP_IOC_ABORT_CONN is a non-transparent ioctl command used for aborting - * TCP connections. To invoke this ioctl, a tcp_ioc_abort_conn_t structure - * (defined in tcp.h) needs to be filled in and passed into the kernel - * via an I_STR ioctl command (see streamio(7I)). The tcp_ioc_abort_conn_t - * structure contains the four-tuple of a TCP connection and a range of TCP - * states (specified by ac_start and ac_end). The use of wildcard addresses - * and ports is allowed. Connections with a matching four tuple and a state - * within the specified range will be aborted. The valid states for the - * ac_start and ac_end fields are in the range TCPS_SYN_SENT to TCPS_TIME_WAIT, - * inclusive. - * - * An application which has its connection aborted by this ioctl will receive - * an error that is dependent on the connection state at the time of the abort. - * If the connection state is < TCPS_TIME_WAIT, an application should behave as - * though a RST packet has been received. If the connection state is equal to - * TCPS_TIME_WAIT, the 2MSL timeout will immediately be canceled by the kernel - * and all resources associated with the connection will be freed. 
- */ -static mblk_t *tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *, tcp_t *); -static void tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *); -static void tcp_ioctl_abort_handler(void *arg, mblk_t *mp, void *arg2, - ip_recv_attr_t *dummy); -static int tcp_ioctl_abort(tcp_ioc_abort_conn_t *, tcp_stack_t *tcps); -static void tcp_ioctl_abort_conn(queue_t *, mblk_t *); -static int tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *, int, int *, - boolean_t, tcp_stack_t *); -static struct module_info tcp_rinfo = { +struct module_info tcp_rinfo = { TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, TCP_RECV_HIWATER, TCP_RECV_LOWATER }; @@ -1013,11 +379,6 @@ struct streamtab tcpinfov6 = { &tcp_rinitv6, &tcp_winit }; -sock_downcalls_t sock_tcp_downcalls; - -/* Setable only in /etc/system. Move to ndd? */ -boolean_t tcp_icmp_source_quench = B_FALSE; - /* * Following assumes TPI alignment requirements stay along 32 bit * boundaries @@ -1026,7 +387,7 @@ boolean_t tcp_icmp_source_quench = B_FALSE; (((x) + (sizeof (int32_t) - 1)) & ~(sizeof (int32_t) - 1)) /* Template for response to info request. */ -static struct T_info_ack tcp_g_t_info_ack = { +struct T_info_ack tcp_g_t_info_ack = { T_INFO_ACK, /* PRIM_type */ 0, /* TSDU_size */ T_INFINITE, /* ETSDU_size */ @@ -1040,7 +401,7 @@ static struct T_info_ack tcp_g_t_info_ack = { (XPG4_1|EXPINLINE) /* PROVIDER_flag */ }; -static struct T_info_ack tcp_g_t_info_ack_v6 = { +struct T_info_ack tcp_g_t_info_ack_v6 = { T_INFO_ACK, /* PRIM_type */ 0, /* TSDU_size */ T_INFINITE, /* ETSDU_size */ @@ -1054,12 +415,6 @@ static struct T_info_ack tcp_g_t_info_ack_v6 = { (XPG4_1|EXPINLINE) /* PROVIDER_flag */ }; -#define MS 1L -#define SECONDS (1000 * MS) -#define MINUTES (60 * SECONDS) -#define HOURS (60 * MINUTES) -#define DAYS (24 * HOURS) - #define PARAM_MAX (~(uint32_t)0) /* Max size IP datagram is 64k - 1 */ @@ -1076,7 +431,6 @@ static struct T_info_ack tcp_g_t_info_ack_v6 = { * layer header. It has to be a multiple of 4. */ static tcpparam_t lcl_tcp_wroff_xtra_param = { 0, 256, 32, "tcp_wroff_xtra" }; -#define tcps_wroff_xtra tcps_wroff_xtra_param->tcp_param_val #define MB (1024 * 1024) @@ -1153,35 +507,6 @@ static tcpparam_t lcl_tcp_param_arr[] = { }; /* END CSTYLED */ -/* Round up the value to the nearest mss. */ -#define MSS_ROUNDUP(value, mss) ((((value) - 1) / (mss) + 1) * (mss)) - -/* - * Set ECN capable transport (ECT) code point in IP header. - * - * Note that there are 2 ECT code points '01' and '10', which are called - * ECT(1) and ECT(0) respectively. Here we follow the original ECT code - * point ECT(0) for TCP as described in RFC 2481. - */ -#define SET_ECT(tcp, iph) \ - if ((tcp)->tcp_connp->conn_ipversion == IPV4_VERSION) { \ - /* We need to clear the code point first. */ \ - ((ipha_t *)(iph))->ipha_type_of_service &= 0xFC; \ - ((ipha_t *)(iph))->ipha_type_of_service |= IPH_ECN_ECT0; \ - } else { \ - ((ip6_t *)(iph))->ip6_vcf &= htonl(0xFFCFFFFF); \ - ((ip6_t *)(iph))->ip6_vcf |= htonl(IPH_ECN_ECT0 << 20); \ - } - -/* - * The format argument to pass to tcp_display(). - * DISP_PORT_ONLY means that the returned string has only port info. - * DISP_ADDR_AND_PORT means that the returned string also contains the - * remote and local IP address. 
- */ -#define DISP_PORT_ONLY 1 -#define DISP_ADDR_AND_PORT 2 - #define IS_VMLOANED_MBLK(mp) \ (((mp)->b_datap->db_struioflag & STRUIO_ZC) != 0) @@ -1194,194 +519,10 @@ uint32_t do_tcpzcopy = 1; /* 0: disable, 1: enable, 2: force */ */ boolean_t tcp_static_maxpsz = B_FALSE; -/* Setable in /etc/system */ -/* If set to 0, pick ephemeral port sequentially; otherwise randomly. */ -uint32_t tcp_random_anon_port = 1; - -/* - * To reach to an eager in Q0 which can be dropped due to an incoming - * new SYN request when Q0 is full, a new doubly linked list is - * introduced. This list allows to select an eager from Q0 in O(1) time. - * This is needed to avoid spending too much time walking through the - * long list of eagers in Q0 when tcp_drop_q0() is called. Each member of - * this new list has to be a member of Q0. - * This list is headed by listener's tcp_t. When the list is empty, - * both the pointers - tcp_eager_next_drop_q0 and tcp_eager_prev_drop_q0, - * of listener's tcp_t point to listener's tcp_t itself. - * - * Given an eager in Q0 and a listener, MAKE_DROPPABLE() puts the eager - * in the list. MAKE_UNDROPPABLE() takes the eager out of the list. - * These macros do not affect the eager's membership to Q0. - */ - - -#define MAKE_DROPPABLE(listener, eager) \ - if ((eager)->tcp_eager_next_drop_q0 == NULL) { \ - (listener)->tcp_eager_next_drop_q0->tcp_eager_prev_drop_q0\ - = (eager); \ - (eager)->tcp_eager_prev_drop_q0 = (listener); \ - (eager)->tcp_eager_next_drop_q0 = \ - (listener)->tcp_eager_next_drop_q0; \ - (listener)->tcp_eager_next_drop_q0 = (eager); \ - } - -#define MAKE_UNDROPPABLE(eager) \ - if ((eager)->tcp_eager_next_drop_q0 != NULL) { \ - (eager)->tcp_eager_next_drop_q0->tcp_eager_prev_drop_q0 \ - = (eager)->tcp_eager_prev_drop_q0; \ - (eager)->tcp_eager_prev_drop_q0->tcp_eager_next_drop_q0 \ - = (eager)->tcp_eager_next_drop_q0; \ - (eager)->tcp_eager_prev_drop_q0 = NULL; \ - (eager)->tcp_eager_next_drop_q0 = NULL; \ - } - -/* - * If tcp_drop_ack_unsent_cnt is greater than 0, when TCP receives more - * than tcp_drop_ack_unsent_cnt number of ACKs which acknowledge unsent - * data, TCP will not respond with an ACK. RFC 793 requires that - * TCP responds with an ACK for such a bogus ACK. By not following - * the RFC, we prevent TCP from getting into an ACK storm if somehow - * an attacker successfully spoofs an acceptable segment to our - * peer; or when our peer is "confused." - */ -uint32_t tcp_drop_ack_unsent_cnt = 10; - -/* - * Hook functions to enable cluster networking - * On non-clustered systems these vectors must always be NULL. 
- */ - -void (*cl_inet_listen)(netstackid_t stack_id, uint8_t protocol, - sa_family_t addr_family, uint8_t *laddrp, - in_port_t lport, void *args) = NULL; -void (*cl_inet_unlisten)(netstackid_t stack_id, uint8_t protocol, - sa_family_t addr_family, uint8_t *laddrp, - in_port_t lport, void *args) = NULL; - -int (*cl_inet_connect2)(netstackid_t stack_id, uint8_t protocol, - boolean_t is_outgoing, - sa_family_t addr_family, - uint8_t *laddrp, in_port_t lport, - uint8_t *faddrp, in_port_t fport, - void *args) = NULL; -void (*cl_inet_disconnect)(netstackid_t stack_id, uint8_t protocol, - sa_family_t addr_family, uint8_t *laddrp, - in_port_t lport, uint8_t *faddrp, - in_port_t fport, void *args) = NULL; - - -/* - * int CL_INET_CONNECT(conn_t *cp, tcp_t *tcp, boolean_t is_outgoing, int err) - */ -#define CL_INET_CONNECT(connp, is_outgoing, err) { \ - (err) = 0; \ - if (cl_inet_connect2 != NULL) { \ - /* \ - * Running in cluster mode - register active connection \ - * information \ - */ \ - if ((connp)->conn_ipversion == IPV4_VERSION) { \ - if ((connp)->conn_laddr_v4 != 0) { \ - (err) = (*cl_inet_connect2)( \ - (connp)->conn_netstack->netstack_stackid,\ - IPPROTO_TCP, is_outgoing, AF_INET, \ - (uint8_t *)(&((connp)->conn_laddr_v4)),\ - (in_port_t)(connp)->conn_lport, \ - (uint8_t *)(&((connp)->conn_faddr_v4)),\ - (in_port_t)(connp)->conn_fport, NULL); \ - } \ - } else { \ - if (!IN6_IS_ADDR_UNSPECIFIED( \ - &(connp)->conn_laddr_v6)) { \ - (err) = (*cl_inet_connect2)( \ - (connp)->conn_netstack->netstack_stackid,\ - IPPROTO_TCP, is_outgoing, AF_INET6, \ - (uint8_t *)(&((connp)->conn_laddr_v6)),\ - (in_port_t)(connp)->conn_lport, \ - (uint8_t *)(&((connp)->conn_faddr_v6)), \ - (in_port_t)(connp)->conn_fport, NULL); \ - } \ - } \ - } \ -} - -#define CL_INET_DISCONNECT(connp) { \ - if (cl_inet_disconnect != NULL) { \ - /* \ - * Running in cluster mode - deregister active \ - * connection information \ - */ \ - if ((connp)->conn_ipversion == IPV4_VERSION) { \ - if ((connp)->conn_laddr_v4 != 0) { \ - (*cl_inet_disconnect)( \ - (connp)->conn_netstack->netstack_stackid,\ - IPPROTO_TCP, AF_INET, \ - (uint8_t *)(&((connp)->conn_laddr_v4)),\ - (in_port_t)(connp)->conn_lport, \ - (uint8_t *)(&((connp)->conn_faddr_v4)),\ - (in_port_t)(connp)->conn_fport, NULL); \ - } \ - } else { \ - if (!IN6_IS_ADDR_UNSPECIFIED( \ - &(connp)->conn_laddr_v6)) { \ - (*cl_inet_disconnect)( \ - (connp)->conn_netstack->netstack_stackid,\ - IPPROTO_TCP, AF_INET6, \ - (uint8_t *)(&((connp)->conn_laddr_v6)),\ - (in_port_t)(connp)->conn_lport, \ - (uint8_t *)(&((connp)->conn_faddr_v6)), \ - (in_port_t)(connp)->conn_fport, NULL); \ - } \ - } \ - } \ -} - -/* - * Steps to do when a tcp_t moves to TIME-WAIT state. - * - * This connection is done, we don't need to account for it. Decrement - * the listener connection counter if needed. - * - * Unconditionally clear the exclusive binding bit so this TIME-WAIT - * connection won't interfere with new ones. - * - * Start the TIME-WAIT timer. If upper layer has not closed the connection, - * the timer is handled within the context of this tcp_t. When the timer - * fires, tcp_clean_death() is called. If upper layer closes the connection - * during this period, tcp_time_wait_append() will be called to add this - * tcp_t to the global TIME-WAIT list. Note that this means that the - * actual wait time in TIME-WAIT state will be longer than the - * tcps_time_wait_interval since the period before upper layer closes the - * connection is not accounted for when tcp_time_wait_append() is called. 
- * - * If uppser layer has closed the connection, call tcp_time_wait_append() - * directly. - */ -#define SET_TIME_WAIT(tcps, tcp, connp) \ -{ \ - (tcp)->tcp_state = TCPS_TIME_WAIT; \ - if ((tcp)->tcp_listen_cnt != NULL) \ - TCP_DECR_LISTEN_CNT(tcp); \ - (connp)->conn_exclbind = 0; \ - if (!TCP_IS_DETACHED(tcp)) { \ - TCP_TIMER_RESTART(tcp, (tcps)->tcps_time_wait_interval); \ - } else { \ - tcp_time_wait_append(tcp); \ - TCP_DBGSTAT(tcps, tcp_rput_time_wait); \ - } \ -} - /* - * Cluster networking hook for traversing current connection list. - * This routine is used to extract the current list of live connections - * which must continue to to be dispatched to this node. + * If the receive buffer size is changed, this function is called to update + * the upper socket layer on the new delayed receive wake up threshold. */ -int cl_tcp_walk_list(netstackid_t stack_id, - int (*callback)(cl_tcp_info_t *, void *), void *arg); - -static int cl_tcp_walk_list_stack(int (*callback)(cl_tcp_info_t *, void *), - void *arg, tcp_stack_t *tcps); - static void tcp_set_recv_threshold(tcp_t *tcp, uint32_t new_rcvthresh) { @@ -1404,6 +545,7 @@ tcp_set_recv_threshold(tcp_t *tcp, uint32_t new_rcvthresh) (connp->conn_upper_handle, &sopp); } } + /* * Figure out the value of window scale opton. Note that the rwnd is * ASSUMED to be rounded up to the nearest MSS before the calculation. @@ -1412,7 +554,7 @@ tcp_set_recv_threshold(tcp_t *tcp, uint32_t new_rcvthresh) * * Set the compiler flag to make this function inline. */ -static void +void tcp_set_ws_value(tcp_t *tcp) { int i; @@ -1425,160 +567,6 @@ tcp_set_ws_value(tcp_t *tcp) } /* - * Remove a connection from the list of detached TIME_WAIT connections. - * It returns B_FALSE if it can't remove the connection from the list - * as the connection has already been removed from the list due to an - * earlier call to tcp_time_wait_remove(); otherwise it returns B_TRUE. 
- */ -static boolean_t -tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait) -{ - boolean_t locked = B_FALSE; - - if (tcp_time_wait == NULL) { - tcp_time_wait = *((tcp_squeue_priv_t **) - squeue_getprivate(tcp->tcp_connp->conn_sqp, SQPRIVATE_TCP)); - mutex_enter(&tcp_time_wait->tcp_time_wait_lock); - locked = B_TRUE; - } else { - ASSERT(MUTEX_HELD(&tcp_time_wait->tcp_time_wait_lock)); - } - - if (tcp->tcp_time_wait_expire == 0) { - ASSERT(tcp->tcp_time_wait_next == NULL); - ASSERT(tcp->tcp_time_wait_prev == NULL); - if (locked) - mutex_exit(&tcp_time_wait->tcp_time_wait_lock); - return (B_FALSE); - } - ASSERT(TCP_IS_DETACHED(tcp)); - ASSERT(tcp->tcp_state == TCPS_TIME_WAIT); - - if (tcp == tcp_time_wait->tcp_time_wait_head) { - ASSERT(tcp->tcp_time_wait_prev == NULL); - tcp_time_wait->tcp_time_wait_head = tcp->tcp_time_wait_next; - if (tcp_time_wait->tcp_time_wait_head != NULL) { - tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev = - NULL; - } else { - tcp_time_wait->tcp_time_wait_tail = NULL; - } - } else if (tcp == tcp_time_wait->tcp_time_wait_tail) { - ASSERT(tcp != tcp_time_wait->tcp_time_wait_head); - ASSERT(tcp->tcp_time_wait_next == NULL); - tcp_time_wait->tcp_time_wait_tail = tcp->tcp_time_wait_prev; - ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL); - tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = NULL; - } else { - ASSERT(tcp->tcp_time_wait_prev->tcp_time_wait_next == tcp); - ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == tcp); - tcp->tcp_time_wait_prev->tcp_time_wait_next = - tcp->tcp_time_wait_next; - tcp->tcp_time_wait_next->tcp_time_wait_prev = - tcp->tcp_time_wait_prev; - } - tcp->tcp_time_wait_next = NULL; - tcp->tcp_time_wait_prev = NULL; - tcp->tcp_time_wait_expire = 0; - - if (locked) - mutex_exit(&tcp_time_wait->tcp_time_wait_lock); - return (B_TRUE); -} - -/* - * Add a connection to the list of detached TIME_WAIT connections - * and set its time to expire. - */ -static void -tcp_time_wait_append(tcp_t *tcp) -{ - tcp_stack_t *tcps = tcp->tcp_tcps; - tcp_squeue_priv_t *tcp_time_wait = - *((tcp_squeue_priv_t **)squeue_getprivate(tcp->tcp_connp->conn_sqp, - SQPRIVATE_TCP)); - - tcp_timers_stop(tcp); - - /* Freed above */ - ASSERT(tcp->tcp_timer_tid == 0); - ASSERT(tcp->tcp_ack_tid == 0); - - /* must have happened at the time of detaching the tcp */ - ASSERT(tcp->tcp_ptpahn == NULL); - ASSERT(tcp->tcp_flow_stopped == 0); - ASSERT(tcp->tcp_time_wait_next == NULL); - ASSERT(tcp->tcp_time_wait_prev == NULL); - ASSERT(tcp->tcp_time_wait_expire == NULL); - ASSERT(tcp->tcp_listener == NULL); - - tcp->tcp_time_wait_expire = ddi_get_lbolt(); - /* - * The value computed below in tcp->tcp_time_wait_expire may - * appear negative or wrap around. That is ok since our - * interest is only in the difference between the current lbolt - * value and tcp->tcp_time_wait_expire. But the value should not - * be zero, since it means the tcp is not in the TIME_WAIT list. - * The corresponding comparison in tcp_time_wait_collector() uses - * modular arithmetic. 
- */ - tcp->tcp_time_wait_expire += - drv_usectohz(tcps->tcps_time_wait_interval * 1000); - if (tcp->tcp_time_wait_expire == 0) - tcp->tcp_time_wait_expire = 1; - - ASSERT(TCP_IS_DETACHED(tcp)); - ASSERT(tcp->tcp_state == TCPS_TIME_WAIT); - ASSERT(tcp->tcp_time_wait_next == NULL); - ASSERT(tcp->tcp_time_wait_prev == NULL); - TCP_DBGSTAT(tcps, tcp_time_wait); - - mutex_enter(&tcp_time_wait->tcp_time_wait_lock); - if (tcp_time_wait->tcp_time_wait_head == NULL) { - ASSERT(tcp_time_wait->tcp_time_wait_tail == NULL); - tcp_time_wait->tcp_time_wait_head = tcp; - } else { - ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL); - ASSERT(tcp_time_wait->tcp_time_wait_tail->tcp_state == - TCPS_TIME_WAIT); - tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = tcp; - tcp->tcp_time_wait_prev = tcp_time_wait->tcp_time_wait_tail; - } - tcp_time_wait->tcp_time_wait_tail = tcp; - mutex_exit(&tcp_time_wait->tcp_time_wait_lock); -} - -/* ARGSUSED */ -void -tcp_timewait_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) -{ - conn_t *connp = (conn_t *)arg; - tcp_t *tcp = connp->conn_tcp; - tcp_stack_t *tcps = tcp->tcp_tcps; - - ASSERT(tcp != NULL); - if (tcp->tcp_state == TCPS_CLOSED) { - return; - } - - ASSERT((connp->conn_family == AF_INET && - connp->conn_ipversion == IPV4_VERSION) || - (connp->conn_family == AF_INET6 && - (connp->conn_ipversion == IPV4_VERSION || - connp->conn_ipversion == IPV6_VERSION))); - ASSERT(!tcp->tcp_listener); - - TCP_STAT(tcps, tcp_time_wait_reap); - ASSERT(TCP_IS_DETACHED(tcp)); - - /* - * Because they have no upstream client to rebind or tcp_close() - * them later, we axe the connection here and now. - */ - tcp_close_detached(tcp); -} - -/* * Remove cached/latched IPsec references. */ void @@ -1709,732 +697,6 @@ tcp_cleanup(tcp_t *tcp) } /* - * Blows away all tcps whose TIME_WAIT has expired. List traversal - * is done forwards from the head. - * This walks all stack instances since - * tcp_time_wait remains global across all stacks. - */ -/* ARGSUSED */ -void -tcp_time_wait_collector(void *arg) -{ - tcp_t *tcp; - clock_t now; - mblk_t *mp; - conn_t *connp; - kmutex_t *lock; - boolean_t removed; - - squeue_t *sqp = (squeue_t *)arg; - tcp_squeue_priv_t *tcp_time_wait = - *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP)); - - mutex_enter(&tcp_time_wait->tcp_time_wait_lock); - tcp_time_wait->tcp_time_wait_tid = 0; - - if (tcp_time_wait->tcp_free_list != NULL && - tcp_time_wait->tcp_free_list->tcp_in_free_list == B_TRUE) { - TCP_G_STAT(tcp_freelist_cleanup); - while ((tcp = tcp_time_wait->tcp_free_list) != NULL) { - tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next; - tcp->tcp_time_wait_next = NULL; - tcp_time_wait->tcp_free_list_cnt--; - ASSERT(tcp->tcp_tcps == NULL); - CONN_DEC_REF(tcp->tcp_connp); - } - ASSERT(tcp_time_wait->tcp_free_list_cnt == 0); - } - - /* - * In order to reap time waits reliably, we should use a - * source of time that is not adjustable by the user -- hence - * the call to ddi_get_lbolt(). - */ - now = ddi_get_lbolt(); - while ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL) { - /* - * Compare times using modular arithmetic, since - * lbolt can wrapover. 
- */ - if ((now - tcp->tcp_time_wait_expire) < 0) { - break; - } - - removed = tcp_time_wait_remove(tcp, tcp_time_wait); - ASSERT(removed); - - connp = tcp->tcp_connp; - ASSERT(connp->conn_fanout != NULL); - lock = &connp->conn_fanout->connf_lock; - /* - * This is essentially a TW reclaim fast path optimization for - * performance where the timewait collector checks under the - * fanout lock (so that no one else can get access to the - * conn_t) that the refcnt is 2 i.e. one for TCP and one for - * the classifier hash list. If ref count is indeed 2, we can - * just remove the conn under the fanout lock and avoid - * cleaning up the conn under the squeue, provided that - * clustering callbacks are not enabled. If clustering is - * enabled, we need to make the clustering callback before - * setting the CONDEMNED flag and after dropping all locks and - * so we forego this optimization and fall back to the slow - * path. Also please see the comments in tcp_closei_local - * regarding the refcnt logic. - * - * Since we are holding the tcp_time_wait_lock, its better - * not to block on the fanout_lock because other connections - * can't add themselves to time_wait list. So we do a - * tryenter instead of mutex_enter. - */ - if (mutex_tryenter(lock)) { - mutex_enter(&connp->conn_lock); - if ((connp->conn_ref == 2) && - (cl_inet_disconnect == NULL)) { - ipcl_hash_remove_locked(connp, - connp->conn_fanout); - /* - * Set the CONDEMNED flag now itself so that - * the refcnt cannot increase due to any - * walker. - */ - connp->conn_state_flags |= CONN_CONDEMNED; - mutex_exit(lock); - mutex_exit(&connp->conn_lock); - if (tcp_time_wait->tcp_free_list_cnt < - tcp_free_list_max_cnt) { - /* Add to head of tcp_free_list */ - mutex_exit( - &tcp_time_wait->tcp_time_wait_lock); - tcp_cleanup(tcp); - ASSERT(connp->conn_latch == NULL); - ASSERT(connp->conn_policy == NULL); - ASSERT(tcp->tcp_tcps == NULL); - ASSERT(connp->conn_netstack == NULL); - - mutex_enter( - &tcp_time_wait->tcp_time_wait_lock); - tcp->tcp_time_wait_next = - tcp_time_wait->tcp_free_list; - tcp_time_wait->tcp_free_list = tcp; - tcp_time_wait->tcp_free_list_cnt++; - continue; - } else { - /* Do not add to tcp_free_list */ - mutex_exit( - &tcp_time_wait->tcp_time_wait_lock); - tcp_bind_hash_remove(tcp); - ixa_cleanup(tcp->tcp_connp->conn_ixa); - tcp_ipsec_cleanup(tcp); - CONN_DEC_REF(tcp->tcp_connp); - } - } else { - CONN_INC_REF_LOCKED(connp); - mutex_exit(lock); - mutex_exit(&tcp_time_wait->tcp_time_wait_lock); - mutex_exit(&connp->conn_lock); - /* - * We can reuse the closemp here since conn has - * detached (otherwise we wouldn't even be in - * time_wait list). tcp_closemp_used can safely - * be changed without taking a lock as no other - * thread can concurrently access it at this - * point in the connection lifecycle. - */ - - if (tcp->tcp_closemp.b_prev == NULL) - tcp->tcp_closemp_used = B_TRUE; - else - cmn_err(CE_PANIC, - "tcp_timewait_collector: " - "concurrent use of tcp_closemp: " - "connp %p tcp %p\n", (void *)connp, - (void *)tcp); - - TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15); - mp = &tcp->tcp_closemp; - SQUEUE_ENTER_ONE(connp->conn_sqp, mp, - tcp_timewait_output, connp, NULL, - SQ_FILL, SQTAG_TCP_TIMEWAIT); - } - } else { - mutex_enter(&connp->conn_lock); - CONN_INC_REF_LOCKED(connp); - mutex_exit(&tcp_time_wait->tcp_time_wait_lock); - mutex_exit(&connp->conn_lock); - /* - * We can reuse the closemp here since conn has - * detached (otherwise we wouldn't even be in - * time_wait list). 
tcp_closemp_used can safely - * be changed without taking a lock as no other - * thread can concurrently access it at this - * point in the connection lifecycle. - */ - - if (tcp->tcp_closemp.b_prev == NULL) - tcp->tcp_closemp_used = B_TRUE; - else - cmn_err(CE_PANIC, "tcp_timewait_collector: " - "concurrent use of tcp_closemp: " - "connp %p tcp %p\n", (void *)connp, - (void *)tcp); - - TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15); - mp = &tcp->tcp_closemp; - SQUEUE_ENTER_ONE(connp->conn_sqp, mp, - tcp_timewait_output, connp, NULL, - SQ_FILL, SQTAG_TCP_TIMEWAIT); - } - mutex_enter(&tcp_time_wait->tcp_time_wait_lock); - } - - if (tcp_time_wait->tcp_free_list != NULL) - tcp_time_wait->tcp_free_list->tcp_in_free_list = B_TRUE; - - tcp_time_wait->tcp_time_wait_tid = - timeout_generic(CALLOUT_NORMAL, tcp_time_wait_collector, sqp, - TICK_TO_NSEC(TCP_TIME_WAIT_DELAY), CALLOUT_TCP_RESOLUTION, - CALLOUT_FLAG_ROUNDUP); - mutex_exit(&tcp_time_wait->tcp_time_wait_lock); -} - -/* - * Reply to a clients T_CONN_RES TPI message. This function - * is used only for TLI/XTI listener. Sockfs sends T_CONN_RES - * on the acceptor STREAM and processed in tcp_accept_common(). - * Read the block comment on top of tcp_input_listener(). - */ -static void -tcp_tli_accept(tcp_t *listener, mblk_t *mp) -{ - tcp_t *acceptor; - tcp_t *eager; - tcp_t *tcp; - struct T_conn_res *tcr; - t_uscalar_t acceptor_id; - t_scalar_t seqnum; - mblk_t *discon_mp = NULL; - mblk_t *ok_mp; - mblk_t *mp1; - tcp_stack_t *tcps = listener->tcp_tcps; - conn_t *econnp; - - if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) { - tcp_err_ack(listener, mp, TPROTO, 0); - return; - } - tcr = (struct T_conn_res *)mp->b_rptr; - - /* - * Under ILP32 the stream head points tcr->ACCEPTOR_id at the - * read side queue of the streams device underneath us i.e. the - * read side queue of 'ip'. Since we can't deference QUEUE_ptr we - * look it up in the queue_hash. Under LP64 it sends down the - * minor_t of the accepting endpoint. - * - * Once the acceptor/eager are modified (in tcp_accept_swap) the - * fanout hash lock is held. - * This prevents any thread from entering the acceptor queue from - * below (since it has not been hard bound yet i.e. any inbound - * packets will arrive on the listener conn_t and - * go through the classifier). - * The CONN_INC_REF will prevent the acceptor from closing. - * - * XXX It is still possible for a tli application to send down data - * on the accepting stream while another thread calls t_accept. - * This should not be a problem for well-behaved applications since - * the T_OK_ACK is sent after the queue swapping is completed. - * - * If the accepting fd is the same as the listening fd, avoid - * queue hash lookup since that will return an eager listener in a - * already established state. - */ - acceptor_id = tcr->ACCEPTOR_id; - mutex_enter(&listener->tcp_eager_lock); - if (listener->tcp_acceptor_id == acceptor_id) { - eager = listener->tcp_eager_next_q; - /* only count how many T_CONN_INDs so don't count q0 */ - if ((listener->tcp_conn_req_cnt_q != 1) || - (eager->tcp_conn_req_seqnum != tcr->SEQ_number)) { - mutex_exit(&listener->tcp_eager_lock); - tcp_err_ack(listener, mp, TBADF, 0); - return; - } - if (listener->tcp_conn_req_cnt_q0 != 0) { - /* Throw away all the eagers on q0. 
*/ - tcp_eager_cleanup(listener, 1); - } - if (listener->tcp_syn_defense) { - listener->tcp_syn_defense = B_FALSE; - if (listener->tcp_ip_addr_cache != NULL) { - kmem_free(listener->tcp_ip_addr_cache, - IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t)); - listener->tcp_ip_addr_cache = NULL; - } - } - /* - * Transfer tcp_conn_req_max to the eager so that when - * a disconnect occurs we can revert the endpoint to the - * listen state. - */ - eager->tcp_conn_req_max = listener->tcp_conn_req_max; - ASSERT(listener->tcp_conn_req_cnt_q0 == 0); - /* - * Get a reference on the acceptor just like the - * tcp_acceptor_hash_lookup below. - */ - acceptor = listener; - CONN_INC_REF(acceptor->tcp_connp); - } else { - acceptor = tcp_acceptor_hash_lookup(acceptor_id, tcps); - if (acceptor == NULL) { - if (listener->tcp_connp->conn_debug) { - (void) strlog(TCP_MOD_ID, 0, 1, - SL_ERROR|SL_TRACE, - "tcp_accept: did not find acceptor 0x%x\n", - acceptor_id); - } - mutex_exit(&listener->tcp_eager_lock); - tcp_err_ack(listener, mp, TPROVMISMATCH, 0); - return; - } - /* - * Verify acceptor state. The acceptable states for an acceptor - * include TCPS_IDLE and TCPS_BOUND. - */ - switch (acceptor->tcp_state) { - case TCPS_IDLE: - /* FALLTHRU */ - case TCPS_BOUND: - break; - default: - CONN_DEC_REF(acceptor->tcp_connp); - mutex_exit(&listener->tcp_eager_lock); - tcp_err_ack(listener, mp, TOUTSTATE, 0); - return; - } - } - - /* The listener must be in TCPS_LISTEN */ - if (listener->tcp_state != TCPS_LISTEN) { - CONN_DEC_REF(acceptor->tcp_connp); - mutex_exit(&listener->tcp_eager_lock); - tcp_err_ack(listener, mp, TOUTSTATE, 0); - return; - } - - /* - * Rendezvous with an eager connection request packet hanging off - * 'tcp' that has the 'seqnum' tag. We tagged the detached open - * tcp structure when the connection packet arrived in - * tcp_input_listener(). - */ - seqnum = tcr->SEQ_number; - eager = listener; - do { - eager = eager->tcp_eager_next_q; - if (eager == NULL) { - CONN_DEC_REF(acceptor->tcp_connp); - mutex_exit(&listener->tcp_eager_lock); - tcp_err_ack(listener, mp, TBADSEQ, 0); - return; - } - } while (eager->tcp_conn_req_seqnum != seqnum); - mutex_exit(&listener->tcp_eager_lock); - - /* - * At this point, both acceptor and listener have 2 ref - * that they begin with. Acceptor has one additional ref - * we placed in lookup while listener has 3 additional - * ref for being behind the squeue (tcp_accept() is - * done on listener's squeue); being in classifier hash; - * and eager's ref on listener. - */ - ASSERT(listener->tcp_connp->conn_ref >= 5); - ASSERT(acceptor->tcp_connp->conn_ref >= 3); - - /* - * The eager at this point is set in its own squeue and - * could easily have been killed (tcp_accept_finish will - * deal with that) because of a TH_RST so we can only - * ASSERT for a single ref. - */ - ASSERT(eager->tcp_connp->conn_ref >= 1); - - /* - * Pre allocate the discon_ind mblk also. tcp_accept_finish will - * use it if something failed. 
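[Reviewer note: the discon_mp pre-allocation above is an instance of a common pattern: reserve the failure-path resources before starting the operation, so that a later allocation failure cannot leave the error unreportable. A small user-level sketch of the pattern; the buffer and message here are illustrative, not kernel interfaces.]

#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	/*
	 * Reserve the failure-path buffer first; if even this fails,
	 * refuse the whole operation up front.
	 */
	char *discon_report = malloc(128);
	int ok;

	if (discon_report == NULL)
		return (1);

	ok = 0;			/* pretend a later step of the accept failed */
	if (!ok) {
		/* The pre-allocated buffer is guaranteed to be available. */
		(void) snprintf(discon_report, 128, "accept failed: %d", ok);
		(void) puts(discon_report);
	}
	free(discon_report);
	return (0);
}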
- */ - discon_mp = allocb(MAX(sizeof (struct T_discon_ind), - sizeof (struct stroptions)), BPRI_HI); - if (discon_mp == NULL) { - CONN_DEC_REF(acceptor->tcp_connp); - CONN_DEC_REF(eager->tcp_connp); - tcp_err_ack(listener, mp, TSYSERR, ENOMEM); - return; - } - - econnp = eager->tcp_connp; - - /* Hold a copy of mp, in case reallocb fails */ - if ((mp1 = copymsg(mp)) == NULL) { - CONN_DEC_REF(acceptor->tcp_connp); - CONN_DEC_REF(eager->tcp_connp); - freemsg(discon_mp); - tcp_err_ack(listener, mp, TSYSERR, ENOMEM); - return; - } - - tcr = (struct T_conn_res *)mp1->b_rptr; - - /* - * This is an expanded version of mi_tpi_ok_ack_alloc() - * which allocates a larger mblk and appends the new - * local address to the ok_ack. The address is copied by - * soaccept() for getsockname(). - */ - { - int extra; - - extra = (econnp->conn_family == AF_INET) ? - sizeof (sin_t) : sizeof (sin6_t); - - /* - * Try to re-use mp, if possible. Otherwise, allocate - * an mblk and return it as ok_mp. In any case, mp - * is no longer usable upon return. - */ - if ((ok_mp = mi_tpi_ok_ack_alloc_extra(mp, extra)) == NULL) { - CONN_DEC_REF(acceptor->tcp_connp); - CONN_DEC_REF(eager->tcp_connp); - freemsg(discon_mp); - /* Original mp has been freed by now, so use mp1 */ - tcp_err_ack(listener, mp1, TSYSERR, ENOMEM); - return; - } - - mp = NULL; /* We should never use mp after this point */ - - switch (extra) { - case sizeof (sin_t): { - sin_t *sin = (sin_t *)ok_mp->b_wptr; - - ok_mp->b_wptr += extra; - sin->sin_family = AF_INET; - sin->sin_port = econnp->conn_lport; - sin->sin_addr.s_addr = econnp->conn_laddr_v4; - break; - } - case sizeof (sin6_t): { - sin6_t *sin6 = (sin6_t *)ok_mp->b_wptr; - - ok_mp->b_wptr += extra; - sin6->sin6_family = AF_INET6; - sin6->sin6_port = econnp->conn_lport; - sin6->sin6_addr = econnp->conn_laddr_v6; - sin6->sin6_flowinfo = econnp->conn_flowinfo; - if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) && - (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) { - sin6->sin6_scope_id = - econnp->conn_ixa->ixa_scopeid; - } else { - sin6->sin6_scope_id = 0; - } - sin6->__sin6_src_id = 0; - break; - } - default: - break; - } - ASSERT(ok_mp->b_wptr <= ok_mp->b_datap->db_lim); - } - - /* - * If there are no options we know that the T_CONN_RES will - * succeed. However, we can't send the T_OK_ACK upstream until - * the tcp_accept_swap is done since it would be dangerous to - * let the application start using the new fd prior to the swap. - */ - tcp_accept_swap(listener, acceptor, eager); - - /* - * tcp_accept_swap unlinks eager from listener but does not drop - * the eager's reference on the listener. - */ - ASSERT(eager->tcp_listener == NULL); - ASSERT(listener->tcp_connp->conn_ref >= 5); - - /* - * The eager is now associated with its own queue. Insert in - * the hash so that the connection can be reused for a future - * T_CONN_RES. - */ - tcp_acceptor_hash_insert(acceptor_id, eager); - - /* - * We now do the processing of options with T_CONN_RES. - * We delay till now since we wanted to have queue to pass to - * option processing routines that points back to the right - * instance structure which does not happen until after - * tcp_accept_swap(). - * - * Note: - * The sanity of the logic here assumes that whatever options - * are appropriate to inherit from listner=>eager are done - * before this point, and whatever were to be overridden (or not) - * in transfer logic from eager=>acceptor in tcp_accept_swap(). 
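[Reviewer note: the mi_tpi_ok_ack_alloc_extra() expansion earlier in this hunk grows the T_OK_ACK so the eager's bound local address rides along for soaccept()/getsockname(). A rough user-level picture of the AF_INET layout, using the standard struct sockaddr_in in place of sin_t and an assumed fixed-size header; only a sketch of the layout, not the TPI message format.]

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	struct {
		char			hdr[8];	/* stand-in for the T_ok_ack header */
		struct sockaddr_in	sin;	/* appended local address */
	} msg;

	(void) memset(&msg, 0, sizeof (msg));
	msg.sin.sin_family = AF_INET;
	msg.sin.sin_port = htons(8080);				/* conn_lport */
	msg.sin.sin_addr.s_addr = inet_addr("192.0.2.1");	/* conn_laddr_v4 */

	(void) printf("ok_ack grows by %u bytes in the AF_INET case\n",
	    (unsigned int)sizeof (msg.sin));
	return (0);
}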
- * [ Warning: acceptor endpoint can have T_OPTMGMT_REQ done to it - * before its ACCEPTOR_id comes down in T_CONN_RES ] - * This may not be true at this point in time but can be fixed - * independently. This option processing code starts with - * the instantiated acceptor instance and the final queue at - * this point. - */ - - if (tcr->OPT_length != 0) { - /* Options to process */ - int t_error = 0; - int sys_error = 0; - int do_disconnect = 0; - - if (tcp_conprim_opt_process(eager, mp1, - &do_disconnect, &t_error, &sys_error) < 0) { - eager->tcp_accept_error = 1; - if (do_disconnect) { - /* - * An option failed which does not allow - * connection to be accepted. - * - * We allow T_CONN_RES to succeed and - * put a T_DISCON_IND on the eager queue. - */ - ASSERT(t_error == 0 && sys_error == 0); - eager->tcp_send_discon_ind = 1; - } else { - ASSERT(t_error != 0); - freemsg(ok_mp); - /* - * Original mp was either freed or set - * to ok_mp above, so use mp1 instead. - */ - tcp_err_ack(listener, mp1, t_error, sys_error); - goto finish; - } - } - /* - * Most likely success in setting options (except if - * eager->tcp_send_discon_ind set). - * mp1 option buffer represented by OPT_length/offset - * potentially modified and contains results of setting - * options at this point - */ - } - - /* We no longer need mp1, since all options processing has passed */ - freemsg(mp1); - - putnext(listener->tcp_connp->conn_rq, ok_mp); - - mutex_enter(&listener->tcp_eager_lock); - if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) { - tcp_t *tail; - mblk_t *conn_ind; - - /* - * This path should not be executed if listener and - * acceptor streams are the same. - */ - ASSERT(listener != acceptor); - - tcp = listener->tcp_eager_prev_q0; - /* - * listener->tcp_eager_prev_q0 points to the TAIL of the - * deferred T_conn_ind queue. We need to get to the head of - * the queue in order to send up T_conn_ind the same order as - * how the 3WHS is completed. - */ - while (tcp != listener) { - if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0) - break; - else - tcp = tcp->tcp_eager_prev_q0; - } - ASSERT(tcp != listener); - conn_ind = tcp->tcp_conn.tcp_eager_conn_ind; - ASSERT(conn_ind != NULL); - tcp->tcp_conn.tcp_eager_conn_ind = NULL; - - /* Move from q0 to q */ - ASSERT(listener->tcp_conn_req_cnt_q0 > 0); - listener->tcp_conn_req_cnt_q0--; - listener->tcp_conn_req_cnt_q++; - tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = - tcp->tcp_eager_prev_q0; - tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = - tcp->tcp_eager_next_q0; - tcp->tcp_eager_prev_q0 = NULL; - tcp->tcp_eager_next_q0 = NULL; - tcp->tcp_conn_def_q0 = B_FALSE; - - /* Make sure the tcp isn't in the list of droppables */ - ASSERT(tcp->tcp_eager_next_drop_q0 == NULL && - tcp->tcp_eager_prev_drop_q0 == NULL); - - /* - * Insert at end of the queue because sockfs sends - * down T_CONN_RES in chronological order. Leaving - * the older conn indications at front of the queue - * helps reducing search time. - */ - tail = listener->tcp_eager_last_q; - if (tail != NULL) - tail->tcp_eager_next_q = tcp; - else - listener->tcp_eager_next_q = tcp; - listener->tcp_eager_last_q = tcp; - tcp->tcp_eager_next_q = NULL; - mutex_exit(&listener->tcp_eager_lock); - putnext(tcp->tcp_connp->conn_rq, conn_ind); - } else { - mutex_exit(&listener->tcp_eager_lock); - } - - /* - * Done with the acceptor - free it - * - * Note: from this point on, no access to listener should be made - * as listener can be equal to acceptor. 
- */ -finish: - ASSERT(acceptor->tcp_detached); - acceptor->tcp_connp->conn_rq = NULL; - ASSERT(!IPCL_IS_NONSTR(acceptor->tcp_connp)); - acceptor->tcp_connp->conn_wq = NULL; - (void) tcp_clean_death(acceptor, 0, 2); - CONN_DEC_REF(acceptor->tcp_connp); - - /* - * We pass discon_mp to tcp_accept_finish to get on the right squeue. - * - * It will update the setting for sockfs/stream head and also take - * care of any data that arrived before accept() wad called. - * In case we already received a FIN then tcp_accept_finish will send up - * the ordrel. It will also send up a window update if the window - * has opened up. - */ - - /* - * XXX: we currently have a problem if XTI application closes the - * acceptor stream in between. This problem exists in on10-gate also - * and is well know but nothing can be done short of major rewrite - * to fix it. Now it is possible to take care of it by assigning TLI/XTI - * eager same squeue as listener (we can distinguish non socket - * listeners at the time of handling a SYN in tcp_input_listener) - * and do most of the work that tcp_accept_finish does here itself - * and then get behind the acceptor squeue to access the acceptor - * queue. - */ - /* - * We already have a ref on tcp so no need to do one before squeue_enter - */ - SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, discon_mp, - tcp_accept_finish, eager->tcp_connp, NULL, SQ_FILL, - SQTAG_TCP_ACCEPT_FINISH); -} - -/* - * Swap information between the eager and acceptor for a TLI/XTI client. - * The sockfs accept is done on the acceptor stream and control goes - * through tcp_tli_accept() and tcp_accept()/tcp_accept_swap() is not - * called. In either case, both the eager and listener are in their own - * perimeter (squeue) and the code has to deal with potential race. - * - * See the block comment on top of tcp_accept() and tcp_tli_accept(). - */ -static void -tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager) -{ - conn_t *econnp, *aconnp; - - ASSERT(eager->tcp_connp->conn_rq == listener->tcp_connp->conn_rq); - ASSERT(eager->tcp_detached && !acceptor->tcp_detached); - ASSERT(!TCP_IS_SOCKET(acceptor)); - ASSERT(!TCP_IS_SOCKET(eager)); - ASSERT(!TCP_IS_SOCKET(listener)); - - /* - * Trusted Extensions may need to use a security label that is - * different from the acceptor's label on MLP and MAC-Exempt - * sockets. If this is the case, the required security label - * already exists in econnp->conn_ixa->ixa_tsl. Since we make the - * acceptor stream refer to econnp we atomatically get that label. - */ - - acceptor->tcp_detached = B_TRUE; - /* - * To permit stream re-use by TLI/XTI, the eager needs a copy of - * the acceptor id. - */ - eager->tcp_acceptor_id = acceptor->tcp_acceptor_id; - - /* remove eager from listen list... */ - mutex_enter(&listener->tcp_eager_lock); - tcp_eager_unlink(eager); - ASSERT(eager->tcp_eager_next_q == NULL && - eager->tcp_eager_last_q == NULL); - ASSERT(eager->tcp_eager_next_q0 == NULL && - eager->tcp_eager_prev_q0 == NULL); - mutex_exit(&listener->tcp_eager_lock); - - econnp = eager->tcp_connp; - aconnp = acceptor->tcp_connp; - econnp->conn_rq = aconnp->conn_rq; - econnp->conn_wq = aconnp->conn_wq; - econnp->conn_rq->q_ptr = econnp; - econnp->conn_wq->q_ptr = econnp; - - /* - * In the TLI/XTI loopback case, we are inside the listener's squeue, - * which might be a different squeue from our peer TCP instance. - * For TCP Fusion, the peer expects that whenever tcp_detached is - * clear, our TCP queues point to the acceptor's queues. 
Thus, use - * membar_producer() to ensure that the assignments of conn_rq/conn_wq - * above reach global visibility prior to the clearing of tcp_detached. - */ - membar_producer(); - eager->tcp_detached = B_FALSE; - - ASSERT(eager->tcp_ack_tid == 0); - - econnp->conn_dev = aconnp->conn_dev; - econnp->conn_minor_arena = aconnp->conn_minor_arena; - - ASSERT(econnp->conn_minor_arena != NULL); - if (econnp->conn_cred != NULL) - crfree(econnp->conn_cred); - econnp->conn_cred = aconnp->conn_cred; - econnp->conn_ixa->ixa_cred = econnp->conn_cred; - aconnp->conn_cred = NULL; - econnp->conn_cpid = aconnp->conn_cpid; - ASSERT(econnp->conn_netstack == aconnp->conn_netstack); - ASSERT(eager->tcp_tcps == acceptor->tcp_tcps); - - econnp->conn_zoneid = aconnp->conn_zoneid; - econnp->conn_allzones = aconnp->conn_allzones; - econnp->conn_ixa->ixa_zoneid = aconnp->conn_ixa->ixa_zoneid; - - econnp->conn_mac_mode = aconnp->conn_mac_mode; - econnp->conn_zone_is_global = aconnp->conn_zone_is_global; - aconnp->conn_mac_mode = CONN_MAC_DEFAULT; - - /* Do the IPC initialization */ - CONN_INC_REF(econnp); - - /* Done with old IPC. Drop its ref on its connp */ - CONN_DEC_REF(aconnp); -} - - -/* * Adapt to the information, such as rtt and rtt_sd, provided from the * DCE and IRE maintained by IP. * @@ -2462,7 +724,7 @@ tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager) * reject an attempt to connect to a broadcast or multicast (destination) * address. */ -static int +int tcp_set_destination(tcp_t *tcp) { uint32_t mss_max; @@ -2648,433 +910,6 @@ tcp_set_destination(tcp_t *tcp) return (0); } -static void -tcp_tpi_bind(tcp_t *tcp, mblk_t *mp) -{ - int error; - conn_t *connp = tcp->tcp_connp; - struct sockaddr *sa; - mblk_t *mp1; - struct T_bind_req *tbr; - int backlog; - socklen_t len; - sin_t *sin; - sin6_t *sin6; - cred_t *cr; - - /* - * All Solaris components should pass a db_credp - * for this TPI message, hence we ASSERT. - * But in case there is some other M_PROTO that looks - * like a TPI message sent by some other kernel - * component, we check and return an error. 
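[Reviewer note: the membar_producer() comment in tcp_accept_swap() above is the classic publish pattern: write the payload fields, order the stores, then flip the flag that readers test first. A standalone C11 analogue where memory_order_release/acquire stand in for membar_producer() and the reader's side; none of this is the kernel's code and the struct is illustrative.]

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct endpoint {
	void		*rq;		/* set before publishing */
	void		*wq;		/* set before publishing */
	atomic_bool	detached;	/* readers test this first */
};

static void
publish_queues(struct endpoint *ep, void *rq, void *wq)
{
	ep->rq = rq;
	ep->wq = wq;
	/* Release store: the queue assignments become visible before the flag. */
	atomic_store_explicit(&ep->detached, false, memory_order_release);
}

static void *
reader_rq(struct endpoint *ep)
{
	/* Acquire load pairs with the release store above. */
	if (!atomic_load_explicit(&ep->detached, memory_order_acquire))
		return (ep->rq);
	return (NULL);
}

int
main(void)
{
	static struct endpoint ep;
	int q;

	atomic_store(&ep.detached, true);	/* starts out detached */
	publish_queues(&ep, &q, &q);
	(void) printf("rq visible: %s\n",
	    reader_rq(&ep) != NULL ? "yes" : "no");
	return (0);
}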
- */ - cr = msg_getcred(mp, NULL); - ASSERT(cr != NULL); - if (cr == NULL) { - tcp_err_ack(tcp, mp, TSYSERR, EINVAL); - return; - } - - ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); - if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) { - if (connp->conn_debug) { - (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, - "tcp_tpi_bind: bad req, len %u", - (uint_t)(mp->b_wptr - mp->b_rptr)); - } - tcp_err_ack(tcp, mp, TPROTO, 0); - return; - } - /* Make sure the largest address fits */ - mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1); - if (mp1 == NULL) { - tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); - return; - } - mp = mp1; - tbr = (struct T_bind_req *)mp->b_rptr; - - backlog = tbr->CONIND_number; - len = tbr->ADDR_length; - - switch (len) { - case 0: /* request for a generic port */ - tbr->ADDR_offset = sizeof (struct T_bind_req); - if (connp->conn_family == AF_INET) { - tbr->ADDR_length = sizeof (sin_t); - sin = (sin_t *)&tbr[1]; - *sin = sin_null; - sin->sin_family = AF_INET; - sa = (struct sockaddr *)sin; - len = sizeof (sin_t); - mp->b_wptr = (uchar_t *)&sin[1]; - } else { - ASSERT(connp->conn_family == AF_INET6); - tbr->ADDR_length = sizeof (sin6_t); - sin6 = (sin6_t *)&tbr[1]; - *sin6 = sin6_null; - sin6->sin6_family = AF_INET6; - sa = (struct sockaddr *)sin6; - len = sizeof (sin6_t); - mp->b_wptr = (uchar_t *)&sin6[1]; - } - break; - - case sizeof (sin_t): /* Complete IPv4 address */ - sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset, - sizeof (sin_t)); - break; - - case sizeof (sin6_t): /* Complete IPv6 address */ - sa = (struct sockaddr *)mi_offset_param(mp, - tbr->ADDR_offset, sizeof (sin6_t)); - break; - - default: - if (connp->conn_debug) { - (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, - "tcp_tpi_bind: bad address length, %d", - tbr->ADDR_length); - } - tcp_err_ack(tcp, mp, TBADADDR, 0); - return; - } - - if (backlog > 0) { - error = tcp_do_listen(connp, sa, len, backlog, DB_CRED(mp), - tbr->PRIM_type != O_T_BIND_REQ); - } else { - error = tcp_do_bind(connp, sa, len, DB_CRED(mp), - tbr->PRIM_type != O_T_BIND_REQ); - } -done: - if (error > 0) { - tcp_err_ack(tcp, mp, TSYSERR, error); - } else if (error < 0) { - tcp_err_ack(tcp, mp, -error, 0); - } else { - /* - * Update port information as sockfs/tpi needs it for checking - */ - if (connp->conn_family == AF_INET) { - sin = (sin_t *)sa; - sin->sin_port = connp->conn_lport; - } else { - sin6 = (sin6_t *)sa; - sin6->sin6_port = connp->conn_lport; - } - mp->b_datap->db_type = M_PCPROTO; - tbr->PRIM_type = T_BIND_ACK; - putnext(connp->conn_rq, mp); - } -} - -/* - * If the "bind_to_req_port_only" parameter is set, if the requested port - * number is available, return it, If not return 0 - * - * If "bind_to_req_port_only" parameter is not set and - * If the requested port number is available, return it. If not, return - * the first anonymous port we happen across. If no anonymous ports are - * available, return 0. addr is the requested local address, if any. - * - * In either case, when succeeding update the tcp_t to record the port number - * and insert it in the bind hash table. - * - * Note that TCP over IPv4 and IPv6 sockets can use the same port number - * without setting SO_REUSEADDR. This is needed so that they - * can be viewed as two independent transport protocols. 
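[Reviewer note: tcp_bindi() implements, inside the bind fanout, what an application observes from the outside: the requested port is granted if free, otherwise an anonymous one is picked. A much-simplified user-level analogue that just retries bind() across a range; the 32768 starting point and the linear scan are illustrative, whereas the kernel consults tcps_smallest/largest_anon_port and its hash table instead.]

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static in_port_t
bind_port(int fd, in_port_t req_port, int req_port_only)
{
	struct sockaddr_in sin;
	in_port_t port = (req_port != 0) ? req_port : 32768;

	(void) memset(&sin, 0, sizeof (sin));
	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = htonl(INADDR_ANY);

	for (;;) {
		sin.sin_port = htons(port);
		if (bind(fd, (struct sockaddr *)&sin, sizeof (sin)) == 0)
			return (port);		/* this port is ours */
		if (req_port_only || port == 65535)
			return (0);		/* give up */
		port++;				/* try the next candidate */
	}
}

int
main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	in_port_t got = bind_port(fd, 8080, 0);

	(void) printf("bound to port %u\n", (unsigned int)got);
	(void) close(fd);
	return (0);
}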
- */ -static in_port_t -tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, - int reuseaddr, boolean_t quick_connect, - boolean_t bind_to_req_port_only, boolean_t user_specified) -{ - /* number of times we have run around the loop */ - int count = 0; - /* maximum number of times to run around the loop */ - int loopmax; - conn_t *connp = tcp->tcp_connp; - tcp_stack_t *tcps = tcp->tcp_tcps; - - /* - * Lookup for free addresses is done in a loop and "loopmax" - * influences how long we spin in the loop - */ - if (bind_to_req_port_only) { - /* - * If the requested port is busy, don't bother to look - * for a new one. Setting loop maximum count to 1 has - * that effect. - */ - loopmax = 1; - } else { - /* - * If the requested port is busy, look for a free one - * in the anonymous port range. - * Set loopmax appropriately so that one does not look - * forever in the case all of the anonymous ports are in use. - */ - if (connp->conn_anon_priv_bind) { - /* - * loopmax = - * (IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1 - */ - loopmax = IPPORT_RESERVED - - tcps->tcps_min_anonpriv_port; - } else { - loopmax = (tcps->tcps_largest_anon_port - - tcps->tcps_smallest_anon_port + 1); - } - } - do { - uint16_t lport; - tf_t *tbf; - tcp_t *ltcp; - conn_t *lconnp; - - lport = htons(port); - - /* - * Ensure that the tcp_t is not currently in the bind hash. - * Hold the lock on the hash bucket to ensure that - * the duplicate check plus the insertion is an atomic - * operation. - * - * This function does an inline lookup on the bind hash list - * Make sure that we access only members of tcp_t - * and that we don't look at tcp_tcp, since we are not - * doing a CONN_INC_REF. - */ - tcp_bind_hash_remove(tcp); - tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(lport)]; - mutex_enter(&tbf->tf_lock); - for (ltcp = tbf->tf_tcp; ltcp != NULL; - ltcp = ltcp->tcp_bind_hash) { - if (lport == ltcp->tcp_connp->conn_lport) - break; - } - - for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) { - boolean_t not_socket; - boolean_t exclbind; - - lconnp = ltcp->tcp_connp; - - /* - * On a labeled system, we must treat bindings to ports - * on shared IP addresses by sockets with MAC exemption - * privilege as being in all zones, as there's - * otherwise no way to identify the right receiver. - */ - if (!IPCL_BIND_ZONE_MATCH(lconnp, connp)) - continue; - - /* - * If TCP_EXCLBIND is set for either the bound or - * binding endpoint, the semantics of bind - * is changed according to the following. - * - * spec = specified address (v4 or v6) - * unspec = unspecified address (v4 or v6) - * A = specified addresses are different for endpoints - * - * bound bind to allowed - * ------------------------------------- - * unspec unspec no - * unspec spec no - * spec unspec no - * spec spec yes if A - * - * For labeled systems, SO_MAC_EXEMPT behaves the same - * as TCP_EXCLBIND, except that zoneid is ignored. - * - * Note: - * - * 1. Because of TLI semantics, an endpoint can go - * back from, say TCP_ESTABLISHED to TCPS_LISTEN or - * TCPS_BOUND, depending on whether it is originally - * a listener or not. That is why we need to check - * for states greater than or equal to TCPS_BOUND - * here. - * - * 2. Ideally, we should only check for state equals - * to TCPS_LISTEN. And the following check should be - * added. - * - * if (ltcp->tcp_state == TCPS_LISTEN || - * !reuseaddr || !lconnp->conn_reuseaddr) { - * ... - * } - * - * The semantics will be changed to this. 
If the - * endpoint on the list is in state not equal to - * TCPS_LISTEN and both endpoints have SO_REUSEADDR - * set, let the bind succeed. - * - * Because of (1), we cannot do that for TLI - * endpoints. But we can do that for socket endpoints. - * If in future, we can change this going back - * semantics, we can use the above check for TLI also. - */ - not_socket = !(TCP_IS_SOCKET(ltcp) && - TCP_IS_SOCKET(tcp)); - exclbind = lconnp->conn_exclbind || - connp->conn_exclbind; - - if ((lconnp->conn_mac_mode != CONN_MAC_DEFAULT) || - (connp->conn_mac_mode != CONN_MAC_DEFAULT) || - (exclbind && (not_socket || - ltcp->tcp_state <= TCPS_ESTABLISHED))) { - if (V6_OR_V4_INADDR_ANY( - lconnp->conn_bound_addr_v6) || - V6_OR_V4_INADDR_ANY(*laddr) || - IN6_ARE_ADDR_EQUAL(laddr, - &lconnp->conn_bound_addr_v6)) { - break; - } - continue; - } - - /* - * Check ipversion to allow IPv4 and IPv6 sockets to - * have disjoint port number spaces, if *_EXCLBIND - * is not set and only if the application binds to a - * specific port. We use the same autoassigned port - * number space for IPv4 and IPv6 sockets. - */ - if (connp->conn_ipversion != lconnp->conn_ipversion && - bind_to_req_port_only) - continue; - - /* - * Ideally, we should make sure that the source - * address, remote address, and remote port in the - * four tuple for this tcp-connection is unique. - * However, trying to find out the local source - * address would require too much code duplication - * with IP, since IP needs needs to have that code - * to support userland TCP implementations. - */ - if (quick_connect && - (ltcp->tcp_state > TCPS_LISTEN) && - ((connp->conn_fport != lconnp->conn_fport) || - !IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6, - &lconnp->conn_faddr_v6))) - continue; - - if (!reuseaddr) { - /* - * No socket option SO_REUSEADDR. - * If existing port is bound to - * a non-wildcard IP address - * and the requesting stream is - * bound to a distinct - * different IP addresses - * (non-wildcard, also), keep - * going. - */ - if (!V6_OR_V4_INADDR_ANY(*laddr) && - !V6_OR_V4_INADDR_ANY( - lconnp->conn_bound_addr_v6) && - !IN6_ARE_ADDR_EQUAL(laddr, - &lconnp->conn_bound_addr_v6)) - continue; - if (ltcp->tcp_state >= TCPS_BOUND) { - /* - * This port is being used and - * its state is >= TCPS_BOUND, - * so we can't bind to it. - */ - break; - } - } else { - /* - * socket option SO_REUSEADDR is set on the - * binding tcp_t. - * - * If two streams are bound to - * same IP address or both addr - * and bound source are wildcards - * (INADDR_ANY), we want to stop - * searching. - * We have found a match of IP source - * address and source port, which is - * refused regardless of the - * SO_REUSEADDR setting, so we break. - */ - if (IN6_ARE_ADDR_EQUAL(laddr, - &lconnp->conn_bound_addr_v6) && - (ltcp->tcp_state == TCPS_LISTEN || - ltcp->tcp_state == TCPS_BOUND)) - break; - } - } - if (ltcp != NULL) { - /* The port number is busy */ - mutex_exit(&tbf->tf_lock); - } else { - /* - * This port is ours. Insert in fanout and mark as - * bound to prevent others from getting the port - * number. - */ - tcp->tcp_state = TCPS_BOUND; - connp->conn_lport = htons(port); - - ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH( - connp->conn_lport)] == tbf); - tcp_bind_hash_insert(tbf, tcp, 1); - - mutex_exit(&tbf->tf_lock); - - /* - * We don't want tcp_next_port_to_try to "inherit" - * a port number supplied by the user in a bind. - */ - if (user_specified) - return (port); - - /* - * This is the only place where tcp_next_port_to_try - * is updated. 
After the update, it may or may not - * be in the valid range. - */ - if (!connp->conn_anon_priv_bind) - tcps->tcps_next_port_to_try = port + 1; - return (port); - } - - if (connp->conn_anon_priv_bind) { - port = tcp_get_next_priv_port(tcp); - } else { - if (count == 0 && user_specified) { - /* - * We may have to return an anonymous port. So - * get one to start with. - */ - port = - tcp_update_next_port( - tcps->tcps_next_port_to_try, - tcp, B_TRUE); - user_specified = B_FALSE; - } else { - port = tcp_update_next_port(port + 1, tcp, - B_FALSE); - } - } - if (port == 0) - break; - - /* - * Don't let this loop run forever in the case where - * all of the anonymous ports are in use. - */ - } while (++count < loopmax); - return (0); -} - /* * tcp_clean_death / tcp_close_detached must not be called more than once * on a tcp. Thus every function that potentially calls tcp_clean_death @@ -3091,8 +926,7 @@ tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2, freemsg(mp); if (tcp->tcp_state > TCPS_BOUND) - (void) tcp_clean_death(((conn_t *)arg)->conn_tcp, - ETIMEDOUT, 5); + (void) tcp_clean_death(((conn_t *)arg)->conn_tcp, ETIMEDOUT); } /* @@ -3104,20 +938,14 @@ tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2, * TBD - Should the return value distinguish between the tcp_t being * freed and it being reinitialized? */ -static int -tcp_clean_death(tcp_t *tcp, int err, uint8_t tag) +int +tcp_clean_death(tcp_t *tcp, int err) { mblk_t *mp; queue_t *q; conn_t *connp = tcp->tcp_connp; tcp_stack_t *tcps = tcp->tcp_tcps; - TCP_CLD_STAT(tag); - -#if TCP_TAG_CLEAN_DEATH - tcp->tcp_cleandeathtag = tag; -#endif - if (tcp->tcp_fused) tcp_unfuse(tcp); @@ -3168,6 +996,16 @@ tcp_clean_death(tcp_t *tcp, int err, uint8_t tag) if (tcp->tcp_listen_cnt != NULL) TCP_DECR_LISTEN_CNT(tcp); + /* + * When a connection is moved to TIME_WAIT state, the connection + * counter is already decremented. So no need to decrement here + * again. See SET_TIME_WAIT() macro. + */ + if (tcp->tcp_state >= TCPS_ESTABLISHED && + tcp->tcp_state < TCPS_TIME_WAIT) { + TCPS_CONN_DEC(tcps); + } + q = connp->conn_rq; /* Trash all inbound data */ @@ -3214,10 +1052,10 @@ tcp_clean_death(tcp_t *tcp, int err, uint8_t tag) } if (tcp->tcp_state <= TCPS_SYN_RCVD) { /* SYN_SENT or SYN_RCVD */ - BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails); + TCPS_BUMP_MIB(tcps, tcpAttemptFails); } else if (tcp->tcp_state <= TCPS_CLOSE_WAIT) { /* ESTABLISHED or CLOSE_WAIT */ - BUMP_MIB(&tcps->tcps_mib, tcpEstabResets); + TCPS_BUMP_MIB(tcps, tcpEstabResets); } } @@ -3232,7 +1070,7 @@ tcp_clean_death(tcp_t *tcp, int err, uint8_t tag) * In case tcp is in the "lingering state" and waits for the SO_LINGER timeout * to expire, stop the wait and finish the close. */ -static void +void tcp_stop_lingering(tcp_t *tcp) { clock_t delta = 0; @@ -3294,21 +1132,7 @@ finish: mutex_exit(&tcp->tcp_closelock); } -/* - * Handle lingering timeouts. This function is called when the SO_LINGER timeout - * expires. 
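[Reviewer note: the TCPS_CONN_DEC hunk above is one half of the new per-stack connection counter added by this change: the decrement is skipped once the endpoint has reached TIME_WAIT, because the SET_TIME_WAIT() macro already decremented the counter at that transition. A toy model of the rule, using an illustrative state enum rather than the real TCPS_* values.]

#include <stdio.h>

enum state { CLOSED, LISTEN, SYN_SENT, SYN_RCVD, ESTABLISHED,
	CLOSE_WAIT, FIN_WAIT_1, CLOSING, LAST_ACK, FIN_WAIT_2, TIME_WAIT };

static int conn_cnt;

static void
enter_established(enum state *s)
{
	*s = ESTABLISHED;
	conn_cnt++;			/* counted once when fully connected */
}

static void
enter_time_wait(enum state *s)
{
	conn_cnt--;			/* SET_TIME_WAIT()-style early decrement */
	*s = TIME_WAIT;
}

static void
clean_death(enum state *s)
{
	/* Decrement only if the TIME_WAIT transition has not already done so. */
	if (*s >= ESTABLISHED && *s < TIME_WAIT)
		conn_cnt--;
	*s = CLOSED;
}

int
main(void)
{
	enum state a = CLOSED, b = CLOSED;

	enter_established(&a);
	enter_time_wait(&a);
	clean_death(&a);		/* no second decrement */

	enter_established(&b);
	clean_death(&b);		/* decremented here */

	(void) printf("remaining connections: %d\n", conn_cnt);	/* 0 */
	return (0);
}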
- */ -static void -tcp_close_linger_timeout(void *arg) -{ - conn_t *connp = (conn_t *)arg; - tcp_t *tcp = connp->conn_tcp; - - tcp->tcp_client_errno = ETIMEDOUT; - tcp_stop_lingering(tcp); -} - -static void +void tcp_close_common(conn_t *connp, int flags) { tcp_t *tcp = connp->conn_tcp; @@ -3424,73 +1248,6 @@ tcp_close_common(conn_t *connp, int flags) connp->conn_cpid = NOPID; } -static int -tcp_tpi_close(queue_t *q, int flags) -{ - conn_t *connp; - - ASSERT(WR(q)->q_next == NULL); - - if (flags & SO_FALLBACK) { - /* - * stream is being closed while in fallback - * simply free the resources that were allocated - */ - inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr)); - qprocsoff(q); - goto done; - } - - connp = Q_TO_CONN(q); - /* - * We are being closed as /dev/tcp or /dev/tcp6. - */ - tcp_close_common(connp, flags); - - qprocsoff(q); - inet_minor_free(connp->conn_minor_arena, connp->conn_dev); - - /* - * Drop IP's reference on the conn. This is the last reference - * on the connp if the state was less than established. If the - * connection has gone into timewait state, then we will have - * one ref for the TCP and one more ref (total of two) for the - * classifier connected hash list (a timewait connections stays - * in connected hash till closed). - * - * We can't assert the references because there might be other - * transient reference places because of some walkers or queued - * packets in squeue for the timewait state. - */ - CONN_DEC_REF(connp); -done: - q->q_ptr = WR(q)->q_ptr = NULL; - return (0); -} - -static int -tcp_tpi_close_accept(queue_t *q) -{ - vmem_t *minor_arena; - dev_t conn_dev; - - ASSERT(WR(q)->q_qinfo == &tcp_acceptor_winit); - - /* - * We had opened an acceptor STREAM for sockfs which is - * now being closed due to some error. - */ - qprocsoff(q); - - minor_arena = (vmem_t *)WR(q)->q_ptr; - conn_dev = (dev_t)RD(q)->q_ptr; - ASSERT(minor_arena != NULL); - ASSERT(conn_dev != 0); - inet_minor_free(minor_arena, conn_dev); - q->q_ptr = WR(q)->q_ptr = NULL; - return (0); -} - /* * Called by tcp_close() routine via squeue when lingering is * interrupted by a signal. @@ -3512,212 +1269,10 @@ tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) } /* - * Called by streams close routine via squeues when our client blows off her - * descriptor, we take this to mean: "close the stream state NOW, close the tcp - * connection politely" When SO_LINGER is set (with a non-zero linger time and - * it is not a nonblocking socket) then this routine sleeps until the FIN is - * acked. - * - * NOTE: tcp_close potentially returns error when lingering. - * However, the stream head currently does not pass these errors - * to the application. 4.4BSD only returns EINTR and EWOULDBLOCK - * errors to the application (from tsleep()) and not errors - * like ECONNRESET caused by receiving a reset packet. 
- */ - -/* ARGSUSED */ -static void -tcp_close_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) -{ - char *msg; - conn_t *connp = (conn_t *)arg; - tcp_t *tcp = connp->conn_tcp; - clock_t delta = 0; - tcp_stack_t *tcps = tcp->tcp_tcps; - - ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) || - (connp->conn_fanout == NULL && connp->conn_ref >= 3)); - - mutex_enter(&tcp->tcp_eager_lock); - if (tcp->tcp_conn_req_cnt_q0 != 0 || tcp->tcp_conn_req_cnt_q != 0) { - /* Cleanup for listener */ - tcp_eager_cleanup(tcp, 0); - tcp->tcp_wait_for_eagers = 1; - } - mutex_exit(&tcp->tcp_eager_lock); - - tcp->tcp_lso = B_FALSE; - - msg = NULL; - switch (tcp->tcp_state) { - case TCPS_CLOSED: - case TCPS_IDLE: - case TCPS_BOUND: - case TCPS_LISTEN: - break; - case TCPS_SYN_SENT: - msg = "tcp_close, during connect"; - break; - case TCPS_SYN_RCVD: - /* - * Close during the connect 3-way handshake - * but here there may or may not be pending data - * already on queue. Process almost same as in - * the ESTABLISHED state. - */ - /* FALLTHRU */ - default: - if (tcp->tcp_fused) - tcp_unfuse(tcp); - - /* - * If SO_LINGER has set a zero linger time, abort the - * connection with a reset. - */ - if (connp->conn_linger && connp->conn_lingertime == 0) { - msg = "tcp_close, zero lingertime"; - break; - } - - /* - * Abort connection if there is unread data queued. - */ - if (tcp->tcp_rcv_list || tcp->tcp_reass_head) { - msg = "tcp_close, unread data"; - break; - } - /* - * We have done a qwait() above which could have possibly - * drained more messages in turn causing transition to a - * different state. Check whether we have to do the rest - * of the processing or not. - */ - if (tcp->tcp_state <= TCPS_LISTEN) - break; - - /* - * Transmit the FIN before detaching the tcp_t. - * After tcp_detach returns this queue/perimeter - * no longer owns the tcp_t thus others can modify it. - */ - (void) tcp_xmit_end(tcp); - - /* - * If lingering on close then wait until the fin is acked, - * the SO_LINGER time passes, or a reset is sent/received. - */ - if (connp->conn_linger && connp->conn_lingertime > 0 && - !(tcp->tcp_fin_acked) && - tcp->tcp_state >= TCPS_ESTABLISHED) { - if (tcp->tcp_closeflags & (FNDELAY|FNONBLOCK)) { - tcp->tcp_client_errno = EWOULDBLOCK; - } else if (tcp->tcp_client_errno == 0) { - - ASSERT(tcp->tcp_linger_tid == 0); - - tcp->tcp_linger_tid = TCP_TIMER(tcp, - tcp_close_linger_timeout, - connp->conn_lingertime * hz); - - /* tcp_close_linger_timeout will finish close */ - if (tcp->tcp_linger_tid == 0) - tcp->tcp_client_errno = ENOSR; - else - return; - } - - /* - * Check if we need to detach or just close - * the instance. - */ - if (tcp->tcp_state <= TCPS_LISTEN) - break; - } - - /* - * Make sure that no other thread will access the conn_rq of - * this instance (through lookups etc.) as conn_rq will go - * away shortly. - */ - tcp_acceptor_hash_remove(tcp); - - mutex_enter(&tcp->tcp_non_sq_lock); - if (tcp->tcp_flow_stopped) { - tcp_clrqfull(tcp); - } - mutex_exit(&tcp->tcp_non_sq_lock); - - if (tcp->tcp_timer_tid != 0) { - delta = TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid); - tcp->tcp_timer_tid = 0; - } - /* - * Need to cancel those timers which will not be used when - * TCP is detached. This has to be done before the conn_wq - * is set to NULL. 
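[Reviewer note: from the application's side, the lingering behaviour that tcp_close_output() implements is driven by SO_LINGER. A small user-level illustration of the two cases the removed code distinguishes: a zero linger time aborts the connection with a RST, a nonzero time makes close() wait for the FIN to be acked or the linger timer to fire. The port-less socket here is only a vehicle for the setsockopt() call.]

#include <stdio.h>
#include <sys/socket.h>
#include <unistd.h>

int
main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	struct linger lg;

	lg.l_onoff = 1;		/* lingering on */
	lg.l_linger = 5;	/* wait up to 5 seconds for the FIN to be acked */
	if (setsockopt(fd, SOL_SOCKET, SO_LINGER, &lg, sizeof (lg)) != 0)
		perror("setsockopt");

	/*
	 * With l_linger == 0 this close would abort the connection with a
	 * RST (the "zero lingertime" branch above); with a nonzero value
	 * close() may block until the peer acks the FIN or the linger
	 * timer (tcp_close_linger_timeout) expires.
	 */
	(void) close(fd);
	return (0);
}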
- */ - tcp_timers_stop(tcp); - - tcp->tcp_detached = B_TRUE; - if (tcp->tcp_state == TCPS_TIME_WAIT) { - tcp_time_wait_append(tcp); - TCP_DBGSTAT(tcps, tcp_detach_time_wait); - ASSERT(connp->conn_ref >= 3); - goto finish; - } - - /* - * If delta is zero the timer event wasn't executed and was - * successfully canceled. In this case we need to restart it - * with the minimal delta possible. - */ - if (delta >= 0) - tcp->tcp_timer_tid = TCP_TIMER(tcp, tcp_timer, - delta ? delta : 1); - - ASSERT(connp->conn_ref >= 3); - goto finish; - } - - /* Detach did not complete. Still need to remove q from stream. */ - if (msg) { - if (tcp->tcp_state == TCPS_ESTABLISHED || - tcp->tcp_state == TCPS_CLOSE_WAIT) - BUMP_MIB(&tcps->tcps_mib, tcpEstabResets); - if (tcp->tcp_state == TCPS_SYN_SENT || - tcp->tcp_state == TCPS_SYN_RCVD) - BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails); - tcp_xmit_ctl(msg, tcp, tcp->tcp_snxt, 0, TH_RST); - } - - tcp_closei_local(tcp); - CONN_DEC_REF(connp); - ASSERT(connp->conn_ref >= 2); - -finish: - mutex_enter(&tcp->tcp_closelock); - /* - * Don't change the queues in the case of a listener that has - * eagers in its q or q0. It could surprise the eagers. - * Instead wait for the eagers outside the squeue. - */ - if (!tcp->tcp_wait_for_eagers) { - tcp->tcp_detached = B_TRUE; - connp->conn_rq = NULL; - connp->conn_wq = NULL; - } - - /* Signal tcp_close() to finish closing. */ - tcp->tcp_closed = 1; - cv_signal(&tcp->tcp_closecv); - mutex_exit(&tcp->tcp_closelock); -} - -/* * Clean up the b_next and b_prev fields of every mblk pointed at by *mpp. * Some stream heads get upset if they see these later on as anything but NULL. */ -static void +void tcp_close_mpp(mblk_t **mpp) { mblk_t *mp; @@ -3735,7 +1290,7 @@ tcp_close_mpp(mblk_t **mpp) } /* Do detached close. */ -static void +void tcp_close_detached(tcp_t *tcp) { if (tcp->tcp_fused) @@ -3753,41 +1308,13 @@ tcp_close_detached(tcp_t *tcp) } /* - * Stop all TCP timers, and free the timer mblks if requested. - */ -void -tcp_timers_stop(tcp_t *tcp) -{ - if (tcp->tcp_timer_tid != 0) { - (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid); - tcp->tcp_timer_tid = 0; - } - if (tcp->tcp_ka_tid != 0) { - (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ka_tid); - tcp->tcp_ka_tid = 0; - } - if (tcp->tcp_ack_tid != 0) { - (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid); - tcp->tcp_ack_tid = 0; - } - if (tcp->tcp_push_tid != 0) { - (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid); - tcp->tcp_push_tid = 0; - } - if (tcp->tcp_reass_tid != 0) { - (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_reass_tid); - tcp->tcp_reass_tid = 0; - } -} - -/* * The tcp_t is going away. Remove it from all lists and set it * to TCPS_CLOSED. The freeing up of memory is deferred until * tcp_inactive. This is needed since a thread in tcp_rput might have * done a CONN_INC_REF on this structure before it was removed from the * hashes. */ -static void +void tcp_closei_local(tcp_t *tcp) { conn_t *connp = tcp->tcp_connp; @@ -3796,12 +1323,24 @@ tcp_closei_local(tcp_t *tcp) if (!TCP_IS_SOCKET(tcp)) tcp_acceptor_hash_remove(tcp); - UPDATE_MIB(&tcps->tcps_mib, tcpHCInSegs, tcp->tcp_ibsegs); + TCPS_UPDATE_MIB(tcps, tcpHCInSegs, tcp->tcp_ibsegs); tcp->tcp_ibsegs = 0; - UPDATE_MIB(&tcps->tcps_mib, tcpHCOutSegs, tcp->tcp_obsegs); + TCPS_UPDATE_MIB(tcps, tcpHCOutSegs, tcp->tcp_obsegs); tcp->tcp_obsegs = 0; /* + * This can be called via tcp_time_wait_processing() if TCP gets a + * SYN with sequence number outside the TIME-WAIT connection's + * window. 
So we need to check for TIME-WAIT state here as the + * connection counter is already decremented. See SET_TIME_WAIT() + * macro + */ + if (tcp->tcp_state >= TCPS_ESTABLISHED && + tcp->tcp_state < TCPS_TIME_WAIT) { + TCPS_CONN_DEC(tcps); + } + + /* * If we are an eager connection hanging off a listener that * hasn't formally accepted the connection yet, get off his * list and blow off any data that we have accumulated. @@ -3979,314 +1518,6 @@ tcp_free(tcp_t *tcp) tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind); } - -/* - * Put a connection confirmation message upstream built from the - * address/flowid information with the conn and iph. Report our success or - * failure. - */ -static boolean_t -tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, mblk_t *idmp, - mblk_t **defermp, ip_recv_attr_t *ira) -{ - sin_t sin; - sin6_t sin6; - mblk_t *mp; - char *optp = NULL; - int optlen = 0; - conn_t *connp = tcp->tcp_connp; - - if (defermp != NULL) - *defermp = NULL; - - if (tcp->tcp_conn.tcp_opts_conn_req != NULL) { - /* - * Return in T_CONN_CON results of option negotiation through - * the T_CONN_REQ. Note: If there is an real end-to-end option - * negotiation, then what is received from remote end needs - * to be taken into account but there is no such thing (yet?) - * in our TCP/IP. - * Note: We do not use mi_offset_param() here as - * tcp_opts_conn_req contents do not directly come from - * an application and are either generated in kernel or - * from user input that was already verified. - */ - mp = tcp->tcp_conn.tcp_opts_conn_req; - optp = (char *)(mp->b_rptr + - ((struct T_conn_req *)mp->b_rptr)->OPT_offset); - optlen = (int) - ((struct T_conn_req *)mp->b_rptr)->OPT_length; - } - - if (IPH_HDR_VERSION(iphdr) == IPV4_VERSION) { - - /* packet is IPv4 */ - if (connp->conn_family == AF_INET) { - sin = sin_null; - sin.sin_addr.s_addr = connp->conn_faddr_v4; - sin.sin_port = connp->conn_fport; - sin.sin_family = AF_INET; - mp = mi_tpi_conn_con(NULL, (char *)&sin, - (int)sizeof (sin_t), optp, optlen); - } else { - sin6 = sin6_null; - sin6.sin6_addr = connp->conn_faddr_v6; - sin6.sin6_port = connp->conn_fport; - sin6.sin6_family = AF_INET6; - mp = mi_tpi_conn_con(NULL, (char *)&sin6, - (int)sizeof (sin6_t), optp, optlen); - - } - } else { - ip6_t *ip6h = (ip6_t *)iphdr; - - ASSERT(IPH_HDR_VERSION(iphdr) == IPV6_VERSION); - ASSERT(connp->conn_family == AF_INET6); - sin6 = sin6_null; - sin6.sin6_addr = connp->conn_faddr_v6; - sin6.sin6_port = connp->conn_fport; - sin6.sin6_family = AF_INET6; - sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; - mp = mi_tpi_conn_con(NULL, (char *)&sin6, - (int)sizeof (sin6_t), optp, optlen); - } - - if (!mp) - return (B_FALSE); - - mblk_copycred(mp, idmp); - - if (defermp == NULL) { - conn_t *connp = tcp->tcp_connp; - if (IPCL_IS_NONSTR(connp)) { - (*connp->conn_upcalls->su_connected) - (connp->conn_upper_handle, tcp->tcp_connid, - ira->ira_cred, ira->ira_cpid); - freemsg(mp); - } else { - if (ira->ira_cred != NULL) { - /* So that getpeerucred works for TPI sockfs */ - mblk_setcred(mp, ira->ira_cred, ira->ira_cpid); - } - putnext(connp->conn_rq, mp); - } - } else { - *defermp = mp; - } - - if (tcp->tcp_conn.tcp_opts_conn_req != NULL) - tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); - return (B_TRUE); -} - -/* - * Defense for the SYN attack - - * 1. When q0 is full, drop from the tail (tcp_eager_prev_drop_q0) the oldest - * one from the list of droppable eagers. This list is a subset of q0. - * see comments before the definition of MAKE_DROPPABLE(). - * 2. 
Don't drop a SYN request before its first timeout. This gives every - * request at least til the first timeout to complete its 3-way handshake. - * 3. Maintain tcp_syn_rcvd_timeout as an accurate count of how many - * requests currently on the queue that has timed out. This will be used - * as an indicator of whether an attack is under way, so that appropriate - * actions can be taken. (It's incremented in tcp_timer() and decremented - * either when eager goes into ESTABLISHED, or gets freed up.) - * 4. The current threshold is - # of timeout > q0len/4 => SYN alert on - * # of timeout drops back to <= q0len/32 => SYN alert off - */ -static boolean_t -tcp_drop_q0(tcp_t *tcp) -{ - tcp_t *eager; - mblk_t *mp; - tcp_stack_t *tcps = tcp->tcp_tcps; - - ASSERT(MUTEX_HELD(&tcp->tcp_eager_lock)); - ASSERT(tcp->tcp_eager_next_q0 != tcp->tcp_eager_prev_q0); - - /* Pick oldest eager from the list of droppable eagers */ - eager = tcp->tcp_eager_prev_drop_q0; - - /* If list is empty. return B_FALSE */ - if (eager == tcp) { - return (B_FALSE); - } - - /* If allocated, the mp will be freed in tcp_clean_death_wrapper() */ - if ((mp = allocb(0, BPRI_HI)) == NULL) - return (B_FALSE); - - /* - * Take this eager out from the list of droppable eagers since we are - * going to drop it. - */ - MAKE_UNDROPPABLE(eager); - - if (tcp->tcp_connp->conn_debug) { - (void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE, - "tcp_drop_q0: listen half-open queue (max=%d) overflow" - " (%d pending) on %s, drop one", tcps->tcps_conn_req_max_q0, - tcp->tcp_conn_req_cnt_q0, - tcp_display(tcp, NULL, DISP_PORT_ONLY)); - } - - BUMP_MIB(&tcps->tcps_mib, tcpHalfOpenDrop); - - /* Put a reference on the conn as we are enqueueing it in the sqeue */ - CONN_INC_REF(eager->tcp_connp); - - SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, - tcp_clean_death_wrapper, eager->tcp_connp, NULL, - SQ_FILL, SQTAG_TCP_DROP_Q0); - - return (B_TRUE); -} - -/* - * Handle a SYN on an AF_INET6 socket; can be either IPv4 or IPv6 - */ -static mblk_t * -tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, - ip_recv_attr_t *ira) -{ - tcp_t *ltcp = lconnp->conn_tcp; - tcp_t *tcp = connp->conn_tcp; - mblk_t *tpi_mp; - ipha_t *ipha; - ip6_t *ip6h; - sin6_t sin6; - uint_t ifindex = ira->ira_ruifindex; - tcp_stack_t *tcps = tcp->tcp_tcps; - - if (ira->ira_flags & IRAF_IS_IPV4) { - ipha = (ipha_t *)mp->b_rptr; - - connp->conn_ipversion = IPV4_VERSION; - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_laddr_v6); - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_faddr_v6); - connp->conn_saddr_v6 = connp->conn_laddr_v6; - - sin6 = sin6_null; - sin6.sin6_addr = connp->conn_faddr_v6; - sin6.sin6_port = connp->conn_fport; - sin6.sin6_family = AF_INET6; - sin6.__sin6_src_id = ip_srcid_find_addr(&connp->conn_laddr_v6, - IPCL_ZONEID(lconnp), tcps->tcps_netstack); - - if (connp->conn_recv_ancillary.crb_recvdstaddr) { - sin6_t sin6d; - - sin6d = sin6_null; - sin6d.sin6_addr = connp->conn_laddr_v6; - sin6d.sin6_port = connp->conn_lport; - sin6d.sin6_family = AF_INET; - tpi_mp = mi_tpi_extconn_ind(NULL, - (char *)&sin6d, sizeof (sin6_t), - (char *)&tcp, - (t_scalar_t)sizeof (intptr_t), - (char *)&sin6d, sizeof (sin6_t), - (t_scalar_t)ltcp->tcp_conn_req_seqnum); - } else { - tpi_mp = mi_tpi_conn_ind(NULL, - (char *)&sin6, sizeof (sin6_t), - (char *)&tcp, (t_scalar_t)sizeof (intptr_t), - (t_scalar_t)ltcp->tcp_conn_req_seqnum); - } - } else { - ip6h = (ip6_t *)mp->b_rptr; - - connp->conn_ipversion = IPV6_VERSION; - connp->conn_laddr_v6 = ip6h->ip6_dst; - connp->conn_faddr_v6 = 
ip6h->ip6_src; - connp->conn_saddr_v6 = connp->conn_laddr_v6; - - sin6 = sin6_null; - sin6.sin6_addr = connp->conn_faddr_v6; - sin6.sin6_port = connp->conn_fport; - sin6.sin6_family = AF_INET6; - sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; - sin6.__sin6_src_id = ip_srcid_find_addr(&connp->conn_laddr_v6, - IPCL_ZONEID(lconnp), tcps->tcps_netstack); - - if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) { - /* Pass up the scope_id of remote addr */ - sin6.sin6_scope_id = ifindex; - } else { - sin6.sin6_scope_id = 0; - } - if (connp->conn_recv_ancillary.crb_recvdstaddr) { - sin6_t sin6d; - - sin6d = sin6_null; - sin6.sin6_addr = connp->conn_laddr_v6; - sin6d.sin6_port = connp->conn_lport; - sin6d.sin6_family = AF_INET6; - if (IN6_IS_ADDR_LINKSCOPE(&connp->conn_laddr_v6)) - sin6d.sin6_scope_id = ifindex; - - tpi_mp = mi_tpi_extconn_ind(NULL, - (char *)&sin6d, sizeof (sin6_t), - (char *)&tcp, (t_scalar_t)sizeof (intptr_t), - (char *)&sin6d, sizeof (sin6_t), - (t_scalar_t)ltcp->tcp_conn_req_seqnum); - } else { - tpi_mp = mi_tpi_conn_ind(NULL, - (char *)&sin6, sizeof (sin6_t), - (char *)&tcp, (t_scalar_t)sizeof (intptr_t), - (t_scalar_t)ltcp->tcp_conn_req_seqnum); - } - } - - tcp->tcp_mss = tcps->tcps_mss_def_ipv6; - return (tpi_mp); -} - -/* Handle a SYN on an AF_INET socket */ -mblk_t * -tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, mblk_t *mp, - ip_recv_attr_t *ira) -{ - tcp_t *ltcp = lconnp->conn_tcp; - tcp_t *tcp = connp->conn_tcp; - sin_t sin; - mblk_t *tpi_mp = NULL; - tcp_stack_t *tcps = tcp->tcp_tcps; - ipha_t *ipha; - - ASSERT(ira->ira_flags & IRAF_IS_IPV4); - ipha = (ipha_t *)mp->b_rptr; - - connp->conn_ipversion = IPV4_VERSION; - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_laddr_v6); - IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_faddr_v6); - connp->conn_saddr_v6 = connp->conn_laddr_v6; - - sin = sin_null; - sin.sin_addr.s_addr = connp->conn_faddr_v4; - sin.sin_port = connp->conn_fport; - sin.sin_family = AF_INET; - if (lconnp->conn_recv_ancillary.crb_recvdstaddr) { - sin_t sind; - - sind = sin_null; - sind.sin_addr.s_addr = connp->conn_laddr_v4; - sind.sin_port = connp->conn_lport; - sind.sin_family = AF_INET; - tpi_mp = mi_tpi_extconn_ind(NULL, - (char *)&sind, sizeof (sin_t), (char *)&tcp, - (t_scalar_t)sizeof (intptr_t), (char *)&sind, - sizeof (sin_t), (t_scalar_t)ltcp->tcp_conn_req_seqnum); - } else { - tpi_mp = mi_tpi_conn_ind(NULL, - (char *)&sin, sizeof (sin_t), - (char *)&tcp, (t_scalar_t)sizeof (intptr_t), - (t_scalar_t)ltcp->tcp_conn_req_seqnum); - } - - tcp->tcp_mss = tcps->tcps_mss_def_ipv4; - return (tpi_mp); -} - /* * tcp_get_conn/tcp_free_conn * @@ -4393,1055 +1624,6 @@ tcp_get_conn(void *arg, tcp_stack_t *tcps) return ((void *)connp); } -/* BEGIN CSTYLED */ -/* - * - * The sockfs ACCEPT path: - * ======================= - * - * The eager is now established in its own perimeter as soon as SYN is - * received in tcp_input_listener(). When sockfs receives conn_ind, it - * completes the accept processing on the acceptor STREAM. The sending - * of conn_ind part is common for both sockfs listener and a TLI/XTI - * listener but a TLI/XTI listener completes the accept processing - * on the listener perimeter. 
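[Reviewer note: for orientation, the machinery this comment describes is what sits underneath an ordinary listener loop; the eager creation in tcp_input_listener(), the T_CONN_IND, and tcp_accept_finish() all happen below these few calls. A conventional user-level listener, nothing illumos-specific; port 8080 and the backlog of 128 are arbitrary.]

#include <arpa/inet.h>
#include <netinet/in.h>
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

int
main(void)
{
	int lfd = socket(AF_INET, SOCK_STREAM, 0);
	struct sockaddr_in sin;
	int on = 1;

	(void) setsockopt(lfd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof (on));
	(void) memset(&sin, 0, sizeof (sin));
	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = htonl(INADDR_ANY);
	sin.sin_port = htons(8080);

	if (bind(lfd, (struct sockaddr *)&sin, sizeof (sin)) != 0 ||
	    listen(lfd, 128) != 0) {	/* backlog becomes tcp_conn_req_max */
		perror("bind/listen");
		return (1);
	}

	for (;;) {
		/* accept() returns once the 3-way handshake has completed. */
		int cfd = accept(lfd, NULL, NULL);

		if (cfd < 0)
			break;
		(void) close(cfd);
	}
	return (0);
}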
- * - * Common control flow for 3 way handshake: - * ---------------------------------------- - * - * incoming SYN (listener perimeter) -> tcp_input_listener() - * - * incoming SYN-ACK-ACK (eager perim) -> tcp_input_data() - * send T_CONN_IND (listener perim) -> tcp_send_conn_ind() - * - * Sockfs ACCEPT Path: - * ------------------- - * - * open acceptor stream (tcp_open allocates tcp_tli_accept() - * as STREAM entry point) - * - * soaccept() sends T_CONN_RES on the acceptor STREAM to tcp_tli_accept() - * - * tcp_tli_accept() extracts the eager and makes the q->q_ptr <-> eager - * association (we are not behind eager's squeue but sockfs is protecting us - * and no one knows about this stream yet. The STREAMS entry point q->q_info - * is changed to point at tcp_wput(). - * - * tcp_accept_common() sends any deferred eagers via tcp_send_pending() to - * listener (done on listener's perimeter). - * - * tcp_tli_accept() calls tcp_accept_finish() on eagers perimeter to finish - * accept. - * - * TLI/XTI client ACCEPT path: - * --------------------------- - * - * soaccept() sends T_CONN_RES on the listener STREAM. - * - * tcp_tli_accept() -> tcp_accept_swap() complete the processing and send - * a M_SETOPS mblk to eager perimeter to finish accept (tcp_accept_finish()). - * - * Locks: - * ====== - * - * listener->tcp_eager_lock protects the listeners->tcp_eager_next_q0 and - * and listeners->tcp_eager_next_q. - * - * Referencing: - * ============ - * - * 1) We start out in tcp_input_listener by eager placing a ref on - * listener and listener adding eager to listeners->tcp_eager_next_q0. - * - * 2) When a SYN-ACK-ACK arrives, we send the conn_ind to listener. Before - * doing so we place a ref on the eager. This ref is finally dropped at the - * end of tcp_accept_finish() while unwinding from the squeue, i.e. the - * reference is dropped by the squeue framework. - * - * 3) The ref on listener placed in 1 above is dropped in tcp_accept_finish - * - * The reference must be released by the same entity that added the reference - * In the above scheme, the eager is the entity that adds and releases the - * references. Note that tcp_accept_finish executes in the squeue of the eager - * (albeit after it is attached to the acceptor stream). Though 1. executes - * in the listener's squeue, the eager is nascent at this point and the - * reference can be considered to have been added on behalf of the eager. - * - * Eager getting a Reset or listener closing: - * ========================================== - * - * Once the listener and eager are linked, the listener never does the unlink. - * If the listener needs to close, tcp_eager_cleanup() is called which queues - * a message on all eager perimeter. The eager then does the unlink, clears - * any pointers to the listener's queue and drops the reference to the - * listener. The listener waits in tcp_close outside the squeue until its - * refcount has dropped to 1. This ensures that the listener has waited for - * all eagers to clear their association with the listener. - * - * Similarly, if eager decides to go away, it can unlink itself and close. - * When the T_CONN_RES comes down, we check if eager has closed. Note that - * the reference to eager is still valid because of the extra ref we put - * in tcp_send_conn_ind. - * - * Listener can always locate the eager under the protection - * of the listener->tcp_eager_lock, and then do a refhold - * on the eager during the accept processing. 
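[Reviewer note: the reference scheme laid out above reduces to a hold/rele pair plus a closer that waits for the count to drain to its own reference. A stripped-down pthreads sketch; conn_hold/conn_rele/conn_close_wait are illustrative names, whereas the kernel uses CONN_INC_REF/CONN_DEC_REF and the listener waits in tcp_close outside the squeue.]

#include <pthread.h>
#include <stdio.h>

struct refconn {
	pthread_mutex_t	lock;
	pthread_cond_t	cv;
	int		ref;
};

static void
conn_hold(struct refconn *c)
{
	(void) pthread_mutex_lock(&c->lock);
	c->ref++;
	(void) pthread_mutex_unlock(&c->lock);
}

static void
conn_rele(struct refconn *c)
{
	(void) pthread_mutex_lock(&c->lock);
	if (--c->ref == 1)
		(void) pthread_cond_signal(&c->cv);	/* only the closer's ref left */
	(void) pthread_mutex_unlock(&c->lock);
}

static void
conn_close_wait(struct refconn *c)
{
	(void) pthread_mutex_lock(&c->lock);
	while (c->ref > 1)
		(void) pthread_cond_wait(&c->cv, &c->lock);
	(void) pthread_mutex_unlock(&c->lock);
}

int
main(void)
{
	static struct refconn listener = {
		PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 1
	};

	conn_hold(&listener);		/* an eager takes its reference */
	conn_rele(&listener);		/* dropped in accept-finish / eager-kill */
	conn_close_wait(&listener);	/* the closer can now tear down */
	(void) printf("remaining references: %d\n", listener.ref);
	return (0);
}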
- * - * The acceptor stream accesses the eager in the accept processing - * based on the ref placed on eager before sending T_conn_ind. - * The only entity that can negate this refhold is a listener close - * which is mutually exclusive with an active acceptor stream. - * - * Eager's reference on the listener - * =================================== - * - * If the accept happens (even on a closed eager) the eager drops its - * reference on the listener at the start of tcp_accept_finish. If the - * eager is killed due to an incoming RST before the T_conn_ind is sent up, - * the reference is dropped in tcp_closei_local. If the listener closes, - * the reference is dropped in tcp_eager_kill. In all cases the reference - * is dropped while executing in the eager's context (squeue). - */ -/* END CSTYLED */ - -/* Process the SYN packet, mp, directed at the listener 'tcp' */ - -/* - * THIS FUNCTION IS DIRECTLY CALLED BY IP VIA SQUEUE FOR SYN. - * tcp_input_data will not see any packets for listeners since the listener - * has conn_recv set to tcp_input_listener. - */ -/* ARGSUSED */ -void -tcp_input_listener(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) -{ - tcpha_t *tcpha; - uint32_t seg_seq; - tcp_t *eager; - int err; - conn_t *econnp = NULL; - squeue_t *new_sqp; - mblk_t *mp1; - uint_t ip_hdr_len; - conn_t *lconnp = (conn_t *)arg; - tcp_t *listener = lconnp->conn_tcp; - tcp_stack_t *tcps = listener->tcp_tcps; - ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; - uint_t flags; - mblk_t *tpi_mp; - uint_t ifindex = ira->ira_ruifindex; - boolean_t tlc_set = B_FALSE; - - ip_hdr_len = ira->ira_ip_hdr_length; - tcpha = (tcpha_t *)&mp->b_rptr[ip_hdr_len]; - flags = (unsigned int)tcpha->tha_flags & 0xFF; - - if (!(flags & TH_SYN)) { - if ((flags & TH_RST) || (flags & TH_URG)) { - freemsg(mp); - return; - } - if (flags & TH_ACK) { - /* Note this executes in listener's squeue */ - tcp_xmit_listeners_reset(mp, ira, ipst, lconnp); - return; - } - - freemsg(mp); - return; - } - - if (listener->tcp_state != TCPS_LISTEN) - goto error2; - - ASSERT(IPCL_IS_BOUND(lconnp)); - - mutex_enter(&listener->tcp_eager_lock); - - /* - * The system is under memory pressure, so we need to do our part - * to relieve the pressure. So we only accept new request if there - * is nothing waiting to be accepted or waiting to complete the 3-way - * handshake. This means that busy listener will not get too many - * new requests which they cannot handle in time while non-busy - * listener is still functioning properly. - */ - if (tcps->tcps_reclaim && (listener->tcp_conn_req_cnt_q > 0 || - listener->tcp_conn_req_cnt_q0 > 0)) { - mutex_exit(&listener->tcp_eager_lock); - TCP_STAT(tcps, tcp_listen_mem_drop); - goto error2; - } - - if (listener->tcp_conn_req_cnt_q >= listener->tcp_conn_req_max) { - mutex_exit(&listener->tcp_eager_lock); - TCP_STAT(tcps, tcp_listendrop); - BUMP_MIB(&tcps->tcps_mib, tcpListenDrop); - if (lconnp->conn_debug) { - (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, - "tcp_input_listener: listen backlog (max=%d) " - "overflow (%d pending) on %s", - listener->tcp_conn_req_max, - listener->tcp_conn_req_cnt_q, - tcp_display(listener, NULL, DISP_PORT_ONLY)); - } - goto error2; - } - - if (listener->tcp_conn_req_cnt_q0 >= - listener->tcp_conn_req_max + tcps->tcps_conn_req_max_q0) { - /* - * Q0 is full. Drop a pending half-open req from the queue - * to make room for the new SYN req. Also mark the time we - * drop a SYN. 
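[Reviewer note: the q0-full handling described above, reduced to its essence: when the half-open table is at capacity, evict the oldest entry to admit the new SYN. The real code only evicts from the droppable subset and also records the drop time; this toy version ignores both and uses a tiny fixed-size array purely for illustration.]

#include <stdio.h>

#define	Q0_MAX	4

static int q0[Q0_MAX];		/* pending half-open requests, oldest first */
static int q0_len;

static void
q0_add(int syn_id)
{
	int i;

	if (q0_len == Q0_MAX) {
		/* Full: evict the oldest entry to make room for the new SYN. */
		(void) printf("dropping half-open request %d\n", q0[0]);
		for (i = 1; i < Q0_MAX; i++)
			q0[i - 1] = q0[i];
		q0_len--;
	}
	q0[q0_len++] = syn_id;
}

int
main(void)
{
	int i;

	for (i = 1; i <= 6; i++)
		q0_add(i);
	(void) printf("oldest still pending: %d\n", q0[0]);	/* prints 3 */
	return (0);
}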
- * - * A more aggressive defense against SYN attack will - * be to set the "tcp_syn_defense" flag now. - */ - TCP_STAT(tcps, tcp_listendropq0); - listener->tcp_last_rcv_lbolt = ddi_get_lbolt64(); - if (!tcp_drop_q0(listener)) { - mutex_exit(&listener->tcp_eager_lock); - BUMP_MIB(&tcps->tcps_mib, tcpListenDropQ0); - if (lconnp->conn_debug) { - (void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE, - "tcp_input_listener: listen half-open " - "queue (max=%d) full (%d pending) on %s", - tcps->tcps_conn_req_max_q0, - listener->tcp_conn_req_cnt_q0, - tcp_display(listener, NULL, - DISP_PORT_ONLY)); - } - goto error2; - } - } - - /* - * Enforce the limit set on the number of connections per listener. - * Note that tlc_cnt starts with 1. So need to add 1 to tlc_max - * for comparison. - */ - if (listener->tcp_listen_cnt != NULL) { - tcp_listen_cnt_t *tlc = listener->tcp_listen_cnt; - int64_t now; - - if (atomic_add_32_nv(&tlc->tlc_cnt, 1) > tlc->tlc_max + 1) { - mutex_exit(&listener->tcp_eager_lock); - now = ddi_get_lbolt64(); - atomic_add_32(&tlc->tlc_cnt, -1); - TCP_STAT(tcps, tcp_listen_cnt_drop); - tlc->tlc_drop++; - if (now - tlc->tlc_report_time > - MSEC_TO_TICK(TCP_TLC_REPORT_INTERVAL)) { - zcmn_err(lconnp->conn_zoneid, CE_WARN, - "Listener (port %d) connection max (%u) " - "reached: %u attempts dropped total\n", - ntohs(listener->tcp_connp->conn_lport), - tlc->tlc_max, tlc->tlc_drop); - tlc->tlc_report_time = now; - } - goto error2; - } - tlc_set = B_TRUE; - } - - mutex_exit(&listener->tcp_eager_lock); - - /* - * IP sets ira_sqp to either the senders conn_sqp (for loopback) - * or based on the ring (for packets from GLD). Otherwise it is - * set based on lbolt i.e., a somewhat random number. - */ - ASSERT(ira->ira_sqp != NULL); - new_sqp = ira->ira_sqp; - - econnp = (conn_t *)tcp_get_conn(arg2, tcps); - if (econnp == NULL) - goto error2; - - ASSERT(econnp->conn_netstack == lconnp->conn_netstack); - econnp->conn_sqp = new_sqp; - econnp->conn_initial_sqp = new_sqp; - econnp->conn_ixa->ixa_sqp = new_sqp; - - econnp->conn_fport = tcpha->tha_lport; - econnp->conn_lport = tcpha->tha_fport; - - err = conn_inherit_parent(lconnp, econnp); - if (err != 0) - goto error3; - - /* We already know the laddr of the new connection is ours */ - econnp->conn_ixa->ixa_src_generation = ipst->ips_src_generation; - - ASSERT(OK_32PTR(mp->b_rptr)); - ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION || - IPH_HDR_VERSION(mp->b_rptr) == IPV6_VERSION); - - if (lconnp->conn_family == AF_INET) { - ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION); - tpi_mp = tcp_conn_create_v4(lconnp, econnp, mp, ira); - } else { - tpi_mp = tcp_conn_create_v6(lconnp, econnp, mp, ira); - } - - if (tpi_mp == NULL) - goto error3; - - eager = econnp->conn_tcp; - eager->tcp_detached = B_TRUE; - SOCK_CONNID_INIT(eager->tcp_connid); - - tcp_init_values(eager); - - ASSERT((econnp->conn_ixa->ixa_flags & - (IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE | - IXAF_VERIFY_PMTU | IXAF_VERIFY_LSO)) == - (IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE | - IXAF_VERIFY_PMTU | IXAF_VERIFY_LSO)); - - if (!tcps->tcps_dev_flow_ctl) - econnp->conn_ixa->ixa_flags |= IXAF_NO_DEV_FLOW_CTL; - - /* Prepare for diffing against previous packets */ - eager->tcp_recvifindex = 0; - eager->tcp_recvhops = 0xffffffffU; - - if (!(ira->ira_flags & IRAF_IS_IPV4) && econnp->conn_bound_if == 0) { - if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_faddr_v6) || - IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6)) { - econnp->conn_incoming_ifindex = ifindex; - econnp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET; - 
econnp->conn_ixa->ixa_scopeid = ifindex; - } - } - - if ((ira->ira_flags & (IRAF_IS_IPV4|IRAF_IPV4_OPTIONS)) == - (IRAF_IS_IPV4|IRAF_IPV4_OPTIONS) && - tcps->tcps_rev_src_routes) { - ipha_t *ipha = (ipha_t *)mp->b_rptr; - ip_pkt_t *ipp = &econnp->conn_xmit_ipp; - - /* Source routing option copyover (reverse it) */ - err = ip_find_hdr_v4(ipha, ipp, B_TRUE); - if (err != 0) { - freemsg(tpi_mp); - goto error3; - } - ip_pkt_source_route_reverse_v4(ipp); - } - - ASSERT(eager->tcp_conn.tcp_eager_conn_ind == NULL); - ASSERT(!eager->tcp_tconnind_started); - /* - * If the SYN came with a credential, it's a loopback packet or a - * labeled packet; attach the credential to the TPI message. - */ - if (ira->ira_cred != NULL) - mblk_setcred(tpi_mp, ira->ira_cred, ira->ira_cpid); - - eager->tcp_conn.tcp_eager_conn_ind = tpi_mp; - - /* Inherit the listener's SSL protection state */ - if ((eager->tcp_kssl_ent = listener->tcp_kssl_ent) != NULL) { - kssl_hold_ent(eager->tcp_kssl_ent); - eager->tcp_kssl_pending = B_TRUE; - } - - /* Inherit the listener's non-STREAMS flag */ - if (IPCL_IS_NONSTR(lconnp)) { - econnp->conn_flags |= IPCL_NONSTR; - } - - ASSERT(eager->tcp_ordrel_mp == NULL); - - if (!IPCL_IS_NONSTR(econnp)) { - /* - * Pre-allocate the T_ordrel_ind mblk for TPI socket so that - * at close time, we will always have that to send up. - * Otherwise, we need to do special handling in case the - * allocation fails at that time. - */ - if ((eager->tcp_ordrel_mp = mi_tpi_ordrel_ind()) == NULL) - goto error3; - } - /* - * Now that the IP addresses and ports are setup in econnp we - * can do the IPsec policy work. - */ - if (ira->ira_flags & IRAF_IPSEC_SECURE) { - if (lconnp->conn_policy != NULL) { - /* - * Inherit the policy from the listener; use - * actions from ira - */ - if (!ip_ipsec_policy_inherit(econnp, lconnp, ira)) { - CONN_DEC_REF(econnp); - freemsg(mp); - goto error3; - } - } - } - - /* Inherit various TCP parameters from the listener */ - eager->tcp_naglim = listener->tcp_naglim; - eager->tcp_first_timer_threshold = listener->tcp_first_timer_threshold; - eager->tcp_second_timer_threshold = - listener->tcp_second_timer_threshold; - eager->tcp_first_ctimer_threshold = - listener->tcp_first_ctimer_threshold; - eager->tcp_second_ctimer_threshold = - listener->tcp_second_ctimer_threshold; - - /* - * tcp_set_destination() may set tcp_rwnd according to the route - * metrics. If it does not, the eager's receive window will be set - * to the listener's receive window later in this function. - */ - eager->tcp_rwnd = 0; - - /* - * Inherit listener's tcp_init_cwnd. Need to do this before - * calling tcp_process_options() which set the initial cwnd. - */ - eager->tcp_init_cwnd = listener->tcp_init_cwnd; - - if (is_system_labeled()) { - ip_xmit_attr_t *ixa = econnp->conn_ixa; - - ASSERT(ira->ira_tsl != NULL); - /* Discard any old label */ - if (ixa->ixa_free_flags & IXA_FREE_TSL) { - ASSERT(ixa->ixa_tsl != NULL); - label_rele(ixa->ixa_tsl); - ixa->ixa_free_flags &= ~IXA_FREE_TSL; - ixa->ixa_tsl = NULL; - } - if ((lconnp->conn_mlp_type != mlptSingle || - lconnp->conn_mac_mode != CONN_MAC_DEFAULT) && - ira->ira_tsl != NULL) { - /* - * If this is an MLP connection or a MAC-Exempt - * connection with an unlabeled node, packets are to be - * exchanged using the security label of the received - * SYN packet instead of the server application's label. - * tsol_check_dest called from ip_set_destination - * might later update TSF_UNLABELED by replacing - * ixa_tsl with a new label. 
- */ - label_hold(ira->ira_tsl); - ip_xmit_attr_replace_tsl(ixa, ira->ira_tsl); - DTRACE_PROBE2(mlp_syn_accept, conn_t *, - econnp, ts_label_t *, ixa->ixa_tsl) - } else { - ixa->ixa_tsl = crgetlabel(econnp->conn_cred); - DTRACE_PROBE2(syn_accept, conn_t *, - econnp, ts_label_t *, ixa->ixa_tsl) - } - /* - * conn_connect() called from tcp_set_destination will verify - * the destination is allowed to receive packets at the - * security label of the SYN-ACK we are generating. As part of - * that, tsol_check_dest() may create a new effective label for - * this connection. - * Finally conn_connect() will call conn_update_label. - * All that remains for TCP to do is to call - * conn_build_hdr_template which is done as part of - * tcp_set_destination. - */ - } - - /* - * Since we will clear tcp_listener before we clear tcp_detached - * in the accept code we need tcp_hard_binding aka tcp_accept_inprogress - * so we can tell a TCP_DETACHED_NONEAGER apart. - */ - eager->tcp_hard_binding = B_TRUE; - - tcp_bind_hash_insert(&tcps->tcps_bind_fanout[ - TCP_BIND_HASH(econnp->conn_lport)], eager, 0); - - CL_INET_CONNECT(econnp, B_FALSE, err); - if (err != 0) { - tcp_bind_hash_remove(eager); - goto error3; - } - - /* - * No need to check for multicast destination since ip will only pass - * up multicasts to those that have expressed interest - * TODO: what about rejecting broadcasts? - * Also check that source is not a multicast or broadcast address. - */ - eager->tcp_state = TCPS_SYN_RCVD; - SOCK_CONNID_BUMP(eager->tcp_connid); - - /* - * Adapt our mss, ttl, ... based on the remote address. - */ - - if (tcp_set_destination(eager) != 0) { - BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails); - /* Undo the bind_hash_insert */ - tcp_bind_hash_remove(eager); - goto error3; - } - - /* Process all TCP options. */ - tcp_process_options(eager, tcpha); - - /* Is the other end ECN capable? */ - if (tcps->tcps_ecn_permitted >= 1 && - (tcpha->tha_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) { - eager->tcp_ecn_ok = B_TRUE; - } - - /* - * The listener's conn_rcvbuf should be the default window size or a - * window size changed via SO_RCVBUF option. First round up the - * eager's tcp_rwnd to the nearest MSS. Then find out the window - * scale option value if needed. Call tcp_rwnd_set() to finish the - * setting. - * - * Note if there is a rpipe metric associated with the remote host, - * we should not inherit receive window size from listener. - */ - eager->tcp_rwnd = MSS_ROUNDUP( - (eager->tcp_rwnd == 0 ? econnp->conn_rcvbuf : - eager->tcp_rwnd), eager->tcp_mss); - if (eager->tcp_snd_ws_ok) - tcp_set_ws_value(eager); - /* - * Note that this is the only place tcp_rwnd_set() is called for - * accepting a connection. We need to call it here instead of - * after the 3-way handshake because we need to tell the other - * side our rwnd in the SYN-ACK segment. - */ - (void) tcp_rwnd_set(eager, eager->tcp_rwnd); - - ASSERT(eager->tcp_connp->conn_rcvbuf != 0 && - eager->tcp_connp->conn_rcvbuf == eager->tcp_rwnd); - - ASSERT(econnp->conn_rcvbuf != 0 && - econnp->conn_rcvbuf == eager->tcp_rwnd); - - /* Put a ref on the listener for the eager. 
*/ - CONN_INC_REF(lconnp); - mutex_enter(&listener->tcp_eager_lock); - listener->tcp_eager_next_q0->tcp_eager_prev_q0 = eager; - eager->tcp_eager_next_q0 = listener->tcp_eager_next_q0; - listener->tcp_eager_next_q0 = eager; - eager->tcp_eager_prev_q0 = listener; - - /* Set tcp_listener before adding it to tcp_conn_fanout */ - eager->tcp_listener = listener; - eager->tcp_saved_listener = listener; - - /* - * Set tcp_listen_cnt so that when the connection is done, the counter - * is decremented. - */ - eager->tcp_listen_cnt = listener->tcp_listen_cnt; - - /* - * Tag this detached tcp vector for later retrieval - * by our listener client in tcp_accept(). - */ - eager->tcp_conn_req_seqnum = listener->tcp_conn_req_seqnum; - listener->tcp_conn_req_cnt_q0++; - if (++listener->tcp_conn_req_seqnum == -1) { - /* - * -1 is "special" and defined in TPI as something - * that should never be used in T_CONN_IND - */ - ++listener->tcp_conn_req_seqnum; - } - mutex_exit(&listener->tcp_eager_lock); - - if (listener->tcp_syn_defense) { - /* Don't drop the SYN that comes from a good IP source */ - ipaddr_t *addr_cache; - - addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache); - if (addr_cache != NULL && econnp->conn_faddr_v4 == - addr_cache[IP_ADDR_CACHE_HASH(econnp->conn_faddr_v4)]) { - eager->tcp_dontdrop = B_TRUE; - } - } - - /* - * We need to insert the eager in its own perimeter but as soon - * as we do that, we expose the eager to the classifier and - * should not touch any field outside the eager's perimeter. - * So do all the work necessary before inserting the eager - * in its own perimeter. Be optimistic that conn_connect() - * will succeed but undo everything if it fails. - */ - seg_seq = ntohl(tcpha->tha_seq); - eager->tcp_irs = seg_seq; - eager->tcp_rack = seg_seq; - eager->tcp_rnxt = seg_seq + 1; - eager->tcp_tcpha->tha_ack = htonl(eager->tcp_rnxt); - BUMP_MIB(&tcps->tcps_mib, tcpPassiveOpens); - eager->tcp_state = TCPS_SYN_RCVD; - mp1 = tcp_xmit_mp(eager, eager->tcp_xmit_head, eager->tcp_mss, - NULL, NULL, eager->tcp_iss, B_FALSE, NULL, B_FALSE); - if (mp1 == NULL) { - /* - * Increment the ref count as we are going to - * enqueueing an mp in squeue - */ - CONN_INC_REF(econnp); - goto error; - } - - /* - * We need to start the rto timer. In normal case, we start - * the timer after sending the packet on the wire (or at - * least believing that packet was sent by waiting for - * conn_ip_output() to return). Since this is the first packet - * being sent on the wire for the eager, our initial tcp_rto - * is at least tcp_rexmit_interval_min which is a fairly - * large value to allow the algorithm to adjust slowly to large - * fluctuations of RTT during first few transmissions. - * - * Starting the timer first and then sending the packet in this - * case shouldn't make much difference since tcp_rexmit_interval_min - * is of the order of several 100ms and starting the timer - * first and then sending the packet will result in difference - * of few micro seconds. - * - * Without this optimization, we are forced to hold the fanout - * lock across the ipcl_bind_insert() and sending the packet - * so that we don't race against an incoming packet (maybe RST) - * for this eager. - * - * It is necessary to acquire an extra reference on the eager - * at this point and hold it until after tcp_send_data() to - * ensure against an eager close race. - */ - - CONN_INC_REF(econnp); - - TCP_TIMER_RESTART(eager, eager->tcp_rto); - - /* - * Insert the eager in its own perimeter now. 
We are ready to deal - * with any packets on eager. - */ - if (ipcl_conn_insert(econnp) != 0) - goto error; - - ASSERT(econnp->conn_ixa->ixa_notify_cookie == econnp->conn_tcp); - freemsg(mp); - /* - * Send the SYN-ACK. Use the right squeue so that conn_ixa is - * only used by one thread at a time. - */ - if (econnp->conn_sqp == lconnp->conn_sqp) { - (void) conn_ip_output(mp1, econnp->conn_ixa); - CONN_DEC_REF(econnp); - } else { - SQUEUE_ENTER_ONE(econnp->conn_sqp, mp1, tcp_send_synack, - econnp, NULL, SQ_PROCESS, SQTAG_TCP_SEND_SYNACK); - } - return; -error: - freemsg(mp1); - eager->tcp_closemp_used = B_TRUE; - TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15); - mp1 = &eager->tcp_closemp; - SQUEUE_ENTER_ONE(econnp->conn_sqp, mp1, tcp_eager_kill, - econnp, NULL, SQ_FILL, SQTAG_TCP_CONN_REQ_2); - - /* - * If a connection already exists, send the mp to that connections so - * that it can be appropriately dealt with. - */ - ipst = tcps->tcps_netstack->netstack_ip; - - if ((econnp = ipcl_classify(mp, ira, ipst)) != NULL) { - if (!IPCL_IS_CONNECTED(econnp)) { - /* - * Something bad happened. ipcl_conn_insert() - * failed because a connection already existed - * in connected hash but we can't find it - * anymore (someone blew it away). Just - * free this message and hopefully remote - * will retransmit at which time the SYN can be - * treated as a new connection or dealth with - * a TH_RST if a connection already exists. - */ - CONN_DEC_REF(econnp); - freemsg(mp); - } else { - SQUEUE_ENTER_ONE(econnp->conn_sqp, mp, tcp_input_data, - econnp, ira, SQ_FILL, SQTAG_TCP_CONN_REQ_1); - } - } else { - /* Nobody wants this packet */ - freemsg(mp); - } - return; -error3: - CONN_DEC_REF(econnp); -error2: - freemsg(mp); - if (tlc_set) - atomic_add_32(&listener->tcp_listen_cnt->tlc_cnt, -1); -} - -/* ARGSUSED2 */ -void -tcp_send_synack(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) -{ - conn_t *econnp = (conn_t *)arg; - tcp_t *tcp = econnp->conn_tcp; - - /* Guard against a RST having blown it away while on the squeue */ - if (tcp->tcp_state == TCPS_CLOSED) { - freemsg(mp); - return; - } - - (void) conn_ip_output(mp, econnp->conn_ixa); -} - -/* - * In an ideal case of vertical partition in NUMA architecture, its - * beneficial to have the listener and all the incoming connections - * tied to the same squeue. The other constraint is that incoming - * connections should be tied to the squeue attached to interrupted - * CPU for obvious locality reason so this leaves the listener to - * be tied to the same squeue. Our only problem is that when listener - * is binding, the CPU that will get interrupted by the NIC whose - * IP address the listener is binding to is not even known. So - * the code below allows us to change that binding at the time the - * CPU is interrupted by virtue of incoming connection's squeue. - * - * This is usefull only in case of a listener bound to a specific IP - * address. For other kind of listeners, they get bound the - * very first time and there is no attempt to rebind them. - */ -void -tcp_input_listener_unbound(void *arg, mblk_t *mp, void *arg2, - ip_recv_attr_t *ira) -{ - conn_t *connp = (conn_t *)arg; - squeue_t *sqp = (squeue_t *)arg2; - squeue_t *new_sqp; - uint32_t conn_flags; - - /* - * IP sets ira_sqp to either the senders conn_sqp (for loopback) - * or based on the ring (for packets from GLD). Otherwise it is - * set based on lbolt i.e., a somewhat random number. 
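The rebinding performed in the body below depends on compare-and-swap retry loops (casptr/cas32). As a rough stand-alone illustration of that idiom only, using C11 atomics rather than the kernel primitives and a made-up FULLY_BOUND bit in place of IPCL_FULLY_BOUND:

#include <stdatomic.h>
#include <stdio.h>

#define FULLY_BOUND     0x1

/* Set a flag bit with a CAS retry loop, the same pattern the listener
 * code below uses for conn_flags and conn_sqp. */
static void
set_flag(_Atomic unsigned *flags, unsigned bit)
{
        unsigned old, nval;

        do {
                old = atomic_load(flags);
                nval = old | bit;
        } while (!atomic_compare_exchange_weak(flags, &old, nval));
}

int main(void)
{
        _Atomic unsigned conn_flags = 0;

        set_flag(&conn_flags, FULLY_BOUND);
        printf("flags = %#x\n", atomic_load(&conn_flags));
        return 0;
}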
- */ - ASSERT(ira->ira_sqp != NULL); - new_sqp = ira->ira_sqp; - - if (connp->conn_fanout == NULL) - goto done; - - if (!(connp->conn_flags & IPCL_FULLY_BOUND)) { - mutex_enter(&connp->conn_fanout->connf_lock); - mutex_enter(&connp->conn_lock); - /* - * No one from read or write side can access us now - * except for already queued packets on this squeue. - * But since we haven't changed the squeue yet, they - * can't execute. If they are processed after we have - * changed the squeue, they are sent back to the - * correct squeue down below. - * But a listner close can race with processing of - * incoming SYN. If incoming SYN processing changes - * the squeue then the listener close which is waiting - * to enter the squeue would operate on the wrong - * squeue. Hence we don't change the squeue here unless - * the refcount is exactly the minimum refcount. The - * minimum refcount of 4 is counted as - 1 each for - * TCP and IP, 1 for being in the classifier hash, and - * 1 for the mblk being processed. - */ - - if (connp->conn_ref != 4 || - connp->conn_tcp->tcp_state != TCPS_LISTEN) { - mutex_exit(&connp->conn_lock); - mutex_exit(&connp->conn_fanout->connf_lock); - goto done; - } - if (connp->conn_sqp != new_sqp) { - while (connp->conn_sqp != new_sqp) - (void) casptr(&connp->conn_sqp, sqp, new_sqp); - /* No special MT issues for outbound ixa_sqp hint */ - connp->conn_ixa->ixa_sqp = new_sqp; - } - - do { - conn_flags = connp->conn_flags; - conn_flags |= IPCL_FULLY_BOUND; - (void) cas32(&connp->conn_flags, connp->conn_flags, - conn_flags); - } while (!(connp->conn_flags & IPCL_FULLY_BOUND)); - - mutex_exit(&connp->conn_fanout->connf_lock); - mutex_exit(&connp->conn_lock); - - /* - * Assume we have picked a good squeue for the listener. Make - * subsequent SYNs not try to change the squeue. - */ - connp->conn_recv = tcp_input_listener; - } - -done: - if (connp->conn_sqp != sqp) { - CONN_INC_REF(connp); - SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv, connp, - ira, SQ_FILL, SQTAG_TCP_CONN_REQ_UNBOUND); - } else { - tcp_input_listener(connp, mp, sqp, ira); - } -} - -/* - * Successful connect request processing begins when our client passes - * a T_CONN_REQ message into tcp_wput(), which performs function calls into - * IP and the passes a T_OK_ACK (or T_ERROR_ACK upstream). - * - * After various error checks are completed, tcp_tpi_connect() lays - * the target address and port into the composite header template. - * Then we ask IP for information, including a source address if we didn't - * already have one. Finally we prepare to send the SYN packet, and then - * send up the T_OK_ACK reply message. - */ -static void -tcp_tpi_connect(tcp_t *tcp, mblk_t *mp) -{ - sin_t *sin; - struct T_conn_req *tcr; - struct sockaddr *sa; - socklen_t len; - int error; - cred_t *cr; - pid_t cpid; - conn_t *connp = tcp->tcp_connp; - queue_t *q = connp->conn_wq; - - /* - * All Solaris components should pass a db_credp - * for this TPI message, hence we ASSERT. - * But in case there is some other M_PROTO that looks - * like a TPI message sent by some other kernel - * component, we check and return an error. 
- */ - cr = msg_getcred(mp, &cpid); - ASSERT(cr != NULL); - if (cr == NULL) { - tcp_err_ack(tcp, mp, TSYSERR, EINVAL); - return; - } - - tcr = (struct T_conn_req *)mp->b_rptr; - - ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); - if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) { - tcp_err_ack(tcp, mp, TPROTO, 0); - return; - } - - /* - * Pre-allocate the T_ordrel_ind mblk so that at close time, we - * will always have that to send up. Otherwise, we need to do - * special handling in case the allocation fails at that time. - * If the end point is TPI, the tcp_t can be reused and the - * tcp_ordrel_mp may be allocated already. - */ - if (tcp->tcp_ordrel_mp == NULL) { - if ((tcp->tcp_ordrel_mp = mi_tpi_ordrel_ind()) == NULL) { - tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); - return; - } - } - - /* - * Determine packet type based on type of address passed in - * the request should contain an IPv4 or IPv6 address. - * Make sure that address family matches the type of - * family of the address passed down. - */ - switch (tcr->DEST_length) { - default: - tcp_err_ack(tcp, mp, TBADADDR, 0); - return; - - case (sizeof (sin_t) - sizeof (sin->sin_zero)): { - /* - * XXX: The check for valid DEST_length was not there - * in earlier releases and some buggy - * TLI apps (e.g Sybase) got away with not feeding - * in sin_zero part of address. - * We allow that bug to keep those buggy apps humming. - * Test suites require the check on DEST_length. - * We construct a new mblk with valid DEST_length - * free the original so the rest of the code does - * not have to keep track of this special shorter - * length address case. - */ - mblk_t *nmp; - struct T_conn_req *ntcr; - sin_t *nsin; - - nmp = allocb(sizeof (struct T_conn_req) + sizeof (sin_t) + - tcr->OPT_length, BPRI_HI); - if (nmp == NULL) { - tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); - return; - } - ntcr = (struct T_conn_req *)nmp->b_rptr; - bzero(ntcr, sizeof (struct T_conn_req)); /* zero fill */ - ntcr->PRIM_type = T_CONN_REQ; - ntcr->DEST_length = sizeof (sin_t); - ntcr->DEST_offset = sizeof (struct T_conn_req); - - nsin = (sin_t *)((uchar_t *)ntcr + ntcr->DEST_offset); - *nsin = sin_null; - /* Get pointer to shorter address to copy from original mp */ - sin = (sin_t *)mi_offset_param(mp, tcr->DEST_offset, - tcr->DEST_length); /* extract DEST_length worth of sin_t */ - if (sin == NULL || !OK_32PTR((char *)sin)) { - freemsg(nmp); - tcp_err_ack(tcp, mp, TSYSERR, EINVAL); - return; - } - nsin->sin_family = sin->sin_family; - nsin->sin_port = sin->sin_port; - nsin->sin_addr = sin->sin_addr; - /* Note:nsin->sin_zero zero-fill with sin_null assign above */ - nmp->b_wptr = (uchar_t *)&nsin[1]; - if (tcr->OPT_length != 0) { - ntcr->OPT_length = tcr->OPT_length; - ntcr->OPT_offset = nmp->b_wptr - nmp->b_rptr; - bcopy((uchar_t *)tcr + tcr->OPT_offset, - (uchar_t *)ntcr + ntcr->OPT_offset, - tcr->OPT_length); - nmp->b_wptr += tcr->OPT_length; - } - freemsg(mp); /* original mp freed */ - mp = nmp; /* re-initialize original variables */ - tcr = ntcr; - } - /* FALLTHRU */ - - case sizeof (sin_t): - sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset, - sizeof (sin_t)); - len = sizeof (sin_t); - break; - - case sizeof (sin6_t): - sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset, - sizeof (sin6_t)); - len = sizeof (sin6_t); - break; - } - - error = proto_verify_ip_addr(connp->conn_family, sa, len); - if (error != 0) { - tcp_err_ack(tcp, mp, TSYSERR, error); - return; - } - - /* - * TODO: If someone in TCPS_TIME_WAIT has this dst/port we 
- * should key on their sequence number and cut them loose. - */ - - /* - * If options passed in, feed it for verification and handling - */ - if (tcr->OPT_length != 0) { - mblk_t *ok_mp; - mblk_t *discon_mp; - mblk_t *conn_opts_mp; - int t_error, sys_error, do_disconnect; - - conn_opts_mp = NULL; - - if (tcp_conprim_opt_process(tcp, mp, - &do_disconnect, &t_error, &sys_error) < 0) { - if (do_disconnect) { - ASSERT(t_error == 0 && sys_error == 0); - discon_mp = mi_tpi_discon_ind(NULL, - ECONNREFUSED, 0); - if (!discon_mp) { - tcp_err_ack_prim(tcp, mp, T_CONN_REQ, - TSYSERR, ENOMEM); - return; - } - ok_mp = mi_tpi_ok_ack_alloc(mp); - if (!ok_mp) { - tcp_err_ack_prim(tcp, NULL, T_CONN_REQ, - TSYSERR, ENOMEM); - return; - } - qreply(q, ok_mp); - qreply(q, discon_mp); /* no flush! */ - } else { - ASSERT(t_error != 0); - tcp_err_ack_prim(tcp, mp, T_CONN_REQ, t_error, - sys_error); - } - return; - } - /* - * Success in setting options, the mp option buffer represented - * by OPT_length/offset has been potentially modified and - * contains results of option processing. We copy it in - * another mp to save it for potentially influencing returning - * it in T_CONN_CONN. - */ - if (tcr->OPT_length != 0) { /* there are resulting options */ - conn_opts_mp = copyb(mp); - if (!conn_opts_mp) { - tcp_err_ack_prim(tcp, mp, T_CONN_REQ, - TSYSERR, ENOMEM); - return; - } - ASSERT(tcp->tcp_conn.tcp_opts_conn_req == NULL); - tcp->tcp_conn.tcp_opts_conn_req = conn_opts_mp; - /* - * Note: - * These resulting option negotiation can include any - * end-to-end negotiation options but there no such - * thing (yet?) in our TCP/IP. - */ - } - } - - /* call the non-TPI version */ - error = tcp_do_connect(tcp->tcp_connp, sa, len, cr, cpid); - if (error < 0) { - mp = mi_tpi_err_ack_alloc(mp, -error, 0); - } else if (error > 0) { - mp = mi_tpi_err_ack_alloc(mp, TSYSERR, error); - } else { - mp = mi_tpi_ok_ack_alloc(mp); - } - - /* - * Note: Code below is the "failure" case - */ - /* return error ack and blow away saved option results if any */ -connect_failed: - if (mp != NULL) - putnext(connp->conn_rq, mp); - else { - tcp_err_ack_prim(tcp, NULL, T_CONN_REQ, - TSYSERR, ENOMEM); - } -} - /* * Handle connect to IPv4 destinations, including connections for AF_INET6 * sockets connecting to IPv4 mapped IPv6 destinations. @@ -5646,9 +1828,10 @@ tcp_disconnect_common(tcp_t *tcp, t_scalar_t seqnum) "tcp_disconnect: bad state, %d", tcp->tcp_state); } return (TOUTSTATE); + } else if (tcp->tcp_state >= TCPS_ESTABLISHED) { + TCPS_CONN_DEC(tcps); } - if (seqnum == -1 || tcp->tcp_conn_req_max == 0) { /* @@ -5708,11 +1891,15 @@ tcp_disconnect_common(tcp_t *tcp, t_scalar_t seqnum) } if (lconnp != NULL) CONN_DEC_REF(lconnp); - if (old_state == TCPS_SYN_SENT || old_state == TCPS_SYN_RCVD) { - BUMP_MIB(&tcps->tcps_mib, tcpAttemptFails); - } else if (old_state == TCPS_ESTABLISHED || - old_state == TCPS_CLOSE_WAIT) { - BUMP_MIB(&tcps->tcps_mib, tcpEstabResets); + switch (old_state) { + case TCPS_SYN_SENT: + case TCPS_SYN_RCVD: + TCPS_BUMP_MIB(tcps, tcpAttemptFails); + break; + case TCPS_ESTABLISHED: + case TCPS_CLOSE_WAIT: + TCPS_BUMP_MIB(tcps, tcpEstabResets); + break; } if (tcp->tcp_fused) @@ -5742,7 +1929,7 @@ tcp_disconnect_common(tcp_t *tcp, t_scalar_t seqnum) * that tcp_input_listener() marked with 'seqnum'. Rejection consists * of sending the appropriate RST, not an ICMP error. 
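For reference, the 'seqnum' in question reaches TCP from a TLI/XTI client: t_listen() returns the pending connect indication with its sequence member filled in, and handing the same t_call back to t_snddis() produces the T_DISCON_REQ that lands here. A sketch of that usage, assuming an already opened and bound XTI endpoint fd and omitting error handling:

#include <xti.h>

/* Reject one pending connect indication on a listening XTI endpoint. */
static void
reject_pending(int fd)
{
        struct t_call *call;

        call = (struct t_call *)t_alloc(fd, T_CALL, T_ALL);
        if (call == NULL)
                return;
        if (t_listen(fd, call) == 0) {
                /* call->sequence identifies this indication; the resulting
                 * T_DISCON_REQ becomes tcp_disconnect() with that seqnum. */
                (void) t_snddis(fd, call);
        }
        (void) t_free((char *)call, T_CALL);
}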
*/ -static void +void tcp_disconnect(tcp_t *tcp, mblk_t *mp) { t_scalar_t seqnum; @@ -5770,358 +1957,6 @@ tcp_disconnect(tcp_t *tcp, mblk_t *mp) } /* - * Diagnostic routine used to return a string associated with the tcp state. - * Note that if the caller does not supply a buffer, it will use an internal - * static string. This means that if multiple threads call this function at - * the same time, output can be corrupted... Note also that this function - * does not check the size of the supplied buffer. The caller has to make - * sure that it is big enough. - */ -static char * -tcp_display(tcp_t *tcp, char *sup_buf, char format) -{ - char buf1[30]; - static char priv_buf[INET6_ADDRSTRLEN * 2 + 80]; - char *buf; - char *cp; - in6_addr_t local, remote; - char local_addrbuf[INET6_ADDRSTRLEN]; - char remote_addrbuf[INET6_ADDRSTRLEN]; - conn_t *connp; - - if (sup_buf != NULL) - buf = sup_buf; - else - buf = priv_buf; - - if (tcp == NULL) - return ("NULL_TCP"); - - connp = tcp->tcp_connp; - switch (tcp->tcp_state) { - case TCPS_CLOSED: - cp = "TCP_CLOSED"; - break; - case TCPS_IDLE: - cp = "TCP_IDLE"; - break; - case TCPS_BOUND: - cp = "TCP_BOUND"; - break; - case TCPS_LISTEN: - cp = "TCP_LISTEN"; - break; - case TCPS_SYN_SENT: - cp = "TCP_SYN_SENT"; - break; - case TCPS_SYN_RCVD: - cp = "TCP_SYN_RCVD"; - break; - case TCPS_ESTABLISHED: - cp = "TCP_ESTABLISHED"; - break; - case TCPS_CLOSE_WAIT: - cp = "TCP_CLOSE_WAIT"; - break; - case TCPS_FIN_WAIT_1: - cp = "TCP_FIN_WAIT_1"; - break; - case TCPS_CLOSING: - cp = "TCP_CLOSING"; - break; - case TCPS_LAST_ACK: - cp = "TCP_LAST_ACK"; - break; - case TCPS_FIN_WAIT_2: - cp = "TCP_FIN_WAIT_2"; - break; - case TCPS_TIME_WAIT: - cp = "TCP_TIME_WAIT"; - break; - default: - (void) mi_sprintf(buf1, "TCPUnkState(%d)", tcp->tcp_state); - cp = buf1; - break; - } - switch (format) { - case DISP_ADDR_AND_PORT: - if (connp->conn_ipversion == IPV4_VERSION) { - /* - * Note that we use the remote address in the tcp_b - * structure. This means that it will print out - * the real destination address, not the next hop's - * address if source routing is used. - */ - IN6_IPADDR_TO_V4MAPPED(connp->conn_laddr_v4, &local); - IN6_IPADDR_TO_V4MAPPED(connp->conn_faddr_v4, &remote); - - } else { - local = connp->conn_laddr_v6; - remote = connp->conn_faddr_v6; - } - (void) inet_ntop(AF_INET6, &local, local_addrbuf, - sizeof (local_addrbuf)); - (void) inet_ntop(AF_INET6, &remote, remote_addrbuf, - sizeof (remote_addrbuf)); - (void) mi_sprintf(buf, "[%s.%u, %s.%u] %s", - local_addrbuf, ntohs(connp->conn_lport), remote_addrbuf, - ntohs(connp->conn_fport), cp); - break; - case DISP_PORT_ONLY: - default: - (void) mi_sprintf(buf, "[%u, %u] %s", - ntohs(connp->conn_lport), ntohs(connp->conn_fport), cp); - break; - } - - return (buf); -} - -/* - * Called via squeue to get on to eager's perimeter. It sends a - * TH_RST if eager is in the fanout table. The listener wants the - * eager to disappear either by means of tcp_eager_blowoff() or - * tcp_eager_cleanup() being called. tcp_eager_kill() can also be - * called (via squeue) if the eager cannot be inserted in the - * fanout table in tcp_input_listener(). - */ -/* ARGSUSED */ -void -tcp_eager_kill(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) -{ - conn_t *econnp = (conn_t *)arg; - tcp_t *eager = econnp->conn_tcp; - tcp_t *listener = eager->tcp_listener; - - /* - * We could be called because listener is closing. Since - * the eager was using listener's queue's, we avoid - * using the listeners queues from now on. 
- */ - ASSERT(eager->tcp_detached); - econnp->conn_rq = NULL; - econnp->conn_wq = NULL; - - /* - * An eager's conn_fanout will be NULL if it's a duplicate - * for an existing 4-tuples in the conn fanout table. - * We don't want to send an RST out in such case. - */ - if (econnp->conn_fanout != NULL && eager->tcp_state > TCPS_LISTEN) { - tcp_xmit_ctl("tcp_eager_kill, can't wait", - eager, eager->tcp_snxt, 0, TH_RST); - } - - /* We are here because listener wants this eager gone */ - if (listener != NULL) { - mutex_enter(&listener->tcp_eager_lock); - tcp_eager_unlink(eager); - if (eager->tcp_tconnind_started) { - /* - * The eager has sent a conn_ind up to the - * listener but listener decides to close - * instead. We need to drop the extra ref - * placed on eager in tcp_input_data() before - * sending the conn_ind to listener. - */ - CONN_DEC_REF(econnp); - } - mutex_exit(&listener->tcp_eager_lock); - CONN_DEC_REF(listener->tcp_connp); - } - - if (eager->tcp_state != TCPS_CLOSED) - tcp_close_detached(eager); -} - -/* - * Reset any eager connection hanging off this listener marked - * with 'seqnum' and then reclaim it's resources. - */ -static boolean_t -tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum) -{ - tcp_t *eager; - mblk_t *mp; - tcp_stack_t *tcps = listener->tcp_tcps; - - TCP_STAT(tcps, tcp_eager_blowoff_calls); - eager = listener; - mutex_enter(&listener->tcp_eager_lock); - do { - eager = eager->tcp_eager_next_q; - if (eager == NULL) { - mutex_exit(&listener->tcp_eager_lock); - return (B_FALSE); - } - } while (eager->tcp_conn_req_seqnum != seqnum); - - if (eager->tcp_closemp_used) { - mutex_exit(&listener->tcp_eager_lock); - return (B_TRUE); - } - eager->tcp_closemp_used = B_TRUE; - TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15); - CONN_INC_REF(eager->tcp_connp); - mutex_exit(&listener->tcp_eager_lock); - mp = &eager->tcp_closemp; - SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, tcp_eager_kill, - eager->tcp_connp, NULL, SQ_FILL, SQTAG_TCP_EAGER_BLOWOFF); - return (B_TRUE); -} - -/* - * Reset any eager connection hanging off this listener - * and then reclaim it's resources. - */ -static void -tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only) -{ - tcp_t *eager; - mblk_t *mp; - tcp_stack_t *tcps = listener->tcp_tcps; - - ASSERT(MUTEX_HELD(&listener->tcp_eager_lock)); - - if (!q0_only) { - /* First cleanup q */ - TCP_STAT(tcps, tcp_eager_blowoff_q); - eager = listener->tcp_eager_next_q; - while (eager != NULL) { - if (!eager->tcp_closemp_used) { - eager->tcp_closemp_used = B_TRUE; - TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15); - CONN_INC_REF(eager->tcp_connp); - mp = &eager->tcp_closemp; - SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, - tcp_eager_kill, eager->tcp_connp, NULL, - SQ_FILL, SQTAG_TCP_EAGER_CLEANUP); - } - eager = eager->tcp_eager_next_q; - } - } - /* Then cleanup q0 */ - TCP_STAT(tcps, tcp_eager_blowoff_q0); - eager = listener->tcp_eager_next_q0; - while (eager != listener) { - if (!eager->tcp_closemp_used) { - eager->tcp_closemp_used = B_TRUE; - TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15); - CONN_INC_REF(eager->tcp_connp); - mp = &eager->tcp_closemp; - SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, - tcp_eager_kill, eager->tcp_connp, NULL, SQ_FILL, - SQTAG_TCP_EAGER_CLEANUP_Q0); - } - eager = eager->tcp_eager_next_q0; - } -} - -/* - * If we are an eager connection hanging off a listener that hasn't - * formally accepted the connection yet, get off his list and blow off - * any data that we have accumulated. 
- */ -static void -tcp_eager_unlink(tcp_t *tcp) -{ - tcp_t *listener = tcp->tcp_listener; - - ASSERT(listener != NULL); - ASSERT(MUTEX_HELD(&listener->tcp_eager_lock)); - if (tcp->tcp_eager_next_q0 != NULL) { - ASSERT(tcp->tcp_eager_prev_q0 != NULL); - - /* Remove the eager tcp from q0 */ - tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = - tcp->tcp_eager_prev_q0; - tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = - tcp->tcp_eager_next_q0; - ASSERT(listener->tcp_conn_req_cnt_q0 > 0); - listener->tcp_conn_req_cnt_q0--; - - tcp->tcp_eager_next_q0 = NULL; - tcp->tcp_eager_prev_q0 = NULL; - - /* - * Take the eager out, if it is in the list of droppable - * eagers. - */ - MAKE_UNDROPPABLE(tcp); - - if (tcp->tcp_syn_rcvd_timeout != 0) { - /* we have timed out before */ - ASSERT(listener->tcp_syn_rcvd_timeout > 0); - listener->tcp_syn_rcvd_timeout--; - } - } else { - tcp_t **tcpp = &listener->tcp_eager_next_q; - tcp_t *prev = NULL; - - for (; tcpp[0]; tcpp = &tcpp[0]->tcp_eager_next_q) { - if (tcpp[0] == tcp) { - if (listener->tcp_eager_last_q == tcp) { - /* - * If we are unlinking the last - * element on the list, adjust - * tail pointer. Set tail pointer - * to nil when list is empty. - */ - ASSERT(tcp->tcp_eager_next_q == NULL); - if (listener->tcp_eager_last_q == - listener->tcp_eager_next_q) { - listener->tcp_eager_last_q = - NULL; - } else { - /* - * We won't get here if there - * is only one eager in the - * list. - */ - ASSERT(prev != NULL); - listener->tcp_eager_last_q = - prev; - } - } - tcpp[0] = tcp->tcp_eager_next_q; - tcp->tcp_eager_next_q = NULL; - tcp->tcp_eager_last_q = NULL; - ASSERT(listener->tcp_conn_req_cnt_q > 0); - listener->tcp_conn_req_cnt_q--; - break; - } - prev = tcpp[0]; - } - } - tcp->tcp_listener = NULL; -} - -/* Shorthand to generate and send TPI error acks to our client */ -static void -tcp_err_ack(tcp_t *tcp, mblk_t *mp, int t_error, int sys_error) -{ - if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL) - putnext(tcp->tcp_connp->conn_rq, mp); -} - -/* Shorthand to generate and send TPI error acks to our client */ -static void -tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive, - int t_error, int sys_error) -{ - struct T_error_ack *teackp; - - if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack), - M_PCPROTO, T_ERROR_ACK)) != NULL) { - teackp = (struct T_error_ack *)mp->b_rptr; - teackp->ERROR_prim = primitive; - teackp->TLI_error = t_error; - teackp->UNIX_error = sys_error; - putnext(tcp->tcp_connp->conn_rq, mp); - } -} - -/* * Note: No locks are held when inspecting tcp_g_*epriv_ports * but instead the code relies on: * - the fact that the address of the array and its size never changes @@ -6225,214 +2060,6 @@ tcp_extra_priv_ports_del(queue_t *q, mblk_t *mp, char *value, caddr_t cp, return (0); } -/* Return the TPI/TLI equivalent of our current tcp_state */ -static int -tcp_tpistate(tcp_t *tcp) -{ - switch (tcp->tcp_state) { - case TCPS_IDLE: - return (TS_UNBND); - case TCPS_LISTEN: - /* - * Return whether there are outstanding T_CONN_IND waiting - * for the matching T_CONN_RES. Therefore don't count q0. - */ - if (tcp->tcp_conn_req_cnt_q > 0) - return (TS_WRES_CIND); - else - return (TS_IDLE); - case TCPS_BOUND: - return (TS_IDLE); - case TCPS_SYN_SENT: - return (TS_WCON_CREQ); - case TCPS_SYN_RCVD: - /* - * Note: assumption: this has to the active open SYN_RCVD. - * The passive instance is detached in SYN_RCVD stage of - * incoming connection processing so we cannot get request - * for T_info_ack on it. 
- */ - return (TS_WACK_CRES); - case TCPS_ESTABLISHED: - return (TS_DATA_XFER); - case TCPS_CLOSE_WAIT: - return (TS_WREQ_ORDREL); - case TCPS_FIN_WAIT_1: - return (TS_WIND_ORDREL); - case TCPS_FIN_WAIT_2: - return (TS_WIND_ORDREL); - - case TCPS_CLOSING: - case TCPS_LAST_ACK: - case TCPS_TIME_WAIT: - case TCPS_CLOSED: - /* - * Following TS_WACK_DREQ7 is a rendition of "not - * yet TS_IDLE" TPI state. There is no best match to any - * TPI state for TCPS_{CLOSING, LAST_ACK, TIME_WAIT} but we - * choose a value chosen that will map to TLI/XTI level - * state of TSTATECHNG (state is process of changing) which - * captures what this dummy state represents. - */ - return (TS_WACK_DREQ7); - default: - cmn_err(CE_WARN, "tcp_tpistate: strange state (%d) %s", - tcp->tcp_state, tcp_display(tcp, NULL, - DISP_PORT_ONLY)); - return (TS_UNBND); - } -} - -static void -tcp_copy_info(struct T_info_ack *tia, tcp_t *tcp) -{ - tcp_stack_t *tcps = tcp->tcp_tcps; - conn_t *connp = tcp->tcp_connp; - - if (connp->conn_family == AF_INET6) - *tia = tcp_g_t_info_ack_v6; - else - *tia = tcp_g_t_info_ack; - tia->CURRENT_state = tcp_tpistate(tcp); - tia->OPT_size = tcp_max_optsize; - if (tcp->tcp_mss == 0) { - /* Not yet set - tcp_open does not set mss */ - if (connp->conn_ipversion == IPV4_VERSION) - tia->TIDU_size = tcps->tcps_mss_def_ipv4; - else - tia->TIDU_size = tcps->tcps_mss_def_ipv6; - } else { - tia->TIDU_size = tcp->tcp_mss; - } - /* TODO: Default ETSDU is 1. Is that correct for tcp? */ -} - -static void -tcp_do_capability_ack(tcp_t *tcp, struct T_capability_ack *tcap, - t_uscalar_t cap_bits1) -{ - tcap->CAP_bits1 = 0; - - if (cap_bits1 & TC1_INFO) { - tcp_copy_info(&tcap->INFO_ack, tcp); - tcap->CAP_bits1 |= TC1_INFO; - } - - if (cap_bits1 & TC1_ACCEPTOR_ID) { - tcap->ACCEPTOR_id = tcp->tcp_acceptor_id; - tcap->CAP_bits1 |= TC1_ACCEPTOR_ID; - } - -} - -/* - * This routine responds to T_CAPABILITY_REQ messages. It is called by - * tcp_wput. Much of the T_CAPABILITY_ACK information is copied from - * tcp_g_t_info_ack. The current state of the stream is copied from - * tcp_state. - */ -static void -tcp_capability_req(tcp_t *tcp, mblk_t *mp) -{ - t_uscalar_t cap_bits1; - struct T_capability_ack *tcap; - - if (MBLKL(mp) < sizeof (struct T_capability_req)) { - freemsg(mp); - return; - } - - cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1; - - mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack), - mp->b_datap->db_type, T_CAPABILITY_ACK); - if (mp == NULL) - return; - - tcap = (struct T_capability_ack *)mp->b_rptr; - tcp_do_capability_ack(tcp, tcap, cap_bits1); - - putnext(tcp->tcp_connp->conn_rq, mp); -} - -/* - * This routine responds to T_INFO_REQ messages. It is called by tcp_wput. - * Most of the T_INFO_ACK information is copied from tcp_g_t_info_ack. - * The current state of the stream is copied from tcp_state. 
- */ -static void -tcp_info_req(tcp_t *tcp, mblk_t *mp) -{ - mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO, - T_INFO_ACK); - if (!mp) { - tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); - return; - } - tcp_copy_info((struct T_info_ack *)mp->b_rptr, tcp); - putnext(tcp->tcp_connp->conn_rq, mp); -} - -/* Respond to the TPI addr request */ -static void -tcp_addr_req(tcp_t *tcp, mblk_t *mp) -{ - struct sockaddr *sa; - mblk_t *ackmp; - struct T_addr_ack *taa; - conn_t *connp = tcp->tcp_connp; - uint_t addrlen; - - /* Make it large enough for worst case */ - ackmp = reallocb(mp, sizeof (struct T_addr_ack) + - 2 * sizeof (sin6_t), 1); - if (ackmp == NULL) { - tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); - return; - } - - taa = (struct T_addr_ack *)ackmp->b_rptr; - - bzero(taa, sizeof (struct T_addr_ack)); - ackmp->b_wptr = (uchar_t *)&taa[1]; - - taa->PRIM_type = T_ADDR_ACK; - ackmp->b_datap->db_type = M_PCPROTO; - - if (connp->conn_family == AF_INET) - addrlen = sizeof (sin_t); - else - addrlen = sizeof (sin6_t); - - /* - * Note: Following code assumes 32 bit alignment of basic - * data structures like sin_t and struct T_addr_ack. - */ - if (tcp->tcp_state >= TCPS_BOUND) { - /* - * Fill in local address first - */ - taa->LOCADDR_offset = sizeof (*taa); - taa->LOCADDR_length = addrlen; - sa = (struct sockaddr *)&taa[1]; - (void) conn_getsockname(connp, sa, &addrlen); - ackmp->b_wptr += addrlen; - } - if (tcp->tcp_state >= TCPS_SYN_RCVD) { - /* - * Fill in Remote address - */ - taa->REMADDR_length = addrlen; - /* assumed 32-bit alignment */ - taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length; - sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset); - (void) conn_getpeername(connp, sa, &addrlen); - ackmp->b_wptr += addrlen; - } - ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim); - putnext(tcp->tcp_connp->conn_rq, ackmp); -} - /* * Handle reinitialization of a tcp structure. * Maintain "binding state" resetting the state to BOUND, LISTEN, or IDLE. @@ -6444,8 +2071,6 @@ tcp_reinit(tcp_t *tcp) tcp_stack_t *tcps = tcp->tcp_tcps; conn_t *connp = tcp->tcp_connp; - TCP_STAT(tcps, tcp_reinit_calls); - /* tcp_reinit should never be called for detached tcp_t's */ ASSERT(tcp->tcp_listener == NULL); ASSERT((connp->conn_family == AF_INET && @@ -6461,9 +2086,9 @@ tcp_reinit(tcp_t *tcp) * Reset everything in the state vector, after updating global * MIB data from instance counters. */ - UPDATE_MIB(&tcps->tcps_mib, tcpHCInSegs, tcp->tcp_ibsegs); + TCPS_UPDATE_MIB(tcps, tcpHCInSegs, tcp->tcp_ibsegs); tcp->tcp_ibsegs = 0; - UPDATE_MIB(&tcps->tcps_mib, tcpHCOutSegs, tcp->tcp_obsegs); + TCPS_UPDATE_MIB(tcps, tcpHCOutSegs, tcp->tcp_obsegs); tcp->tcp_obsegs = 0; tcp_close_mpp(&tcp->tcp_xmit_head); @@ -6825,7 +2450,6 @@ tcp_reinit_values(tcp) tcp->tcp_recvifindex = 0; tcp->tcp_recvhops = 0; tcp->tcp_closed = 0; - tcp->tcp_cleandeathtag = 0; if (tcp->tcp_hopopts != NULL) { mi_free(tcp->tcp_hopopts); tcp->tcp_hopopts = NULL; @@ -6887,7 +2511,7 @@ tcp_reinit_values(tcp) #undef PRESERVE } -static void +void tcp_init_values(tcp_t *tcp) { tcp_stack_t *tcps = tcp->tcp_tcps; @@ -6967,188 +2591,6 @@ tcp_init_values(tcp_t *tcp) tcp->tcp_ka_abort_thres = tcps->tcps_keepalive_abort_interval; } -/* At minimum we need 8 bytes in the TCP header for the lookup */ -#define ICMP_MIN_TCP_HDR 8 - -/* - * tcp_icmp_input is called as conn_recvicmp to process ICMP error messages - * passed up by IP. The message is always received on the correct tcp_t. 
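Of the cases handled in this routine, the ICMP source-quench branch derives a new slow-start threshold from half of the data currently in flight. A stand-alone version of that arithmetic, with illustrative values for tcp_suna, tcp_snxt and the MSS:

#include <stdint.h>
#include <stdio.h>

#define MAX(a, b)       ((a) > (b) ? (a) : (b))

int main(void)
{
        uint32_t mss = 1460;
        uint32_t suna = 1000;                   /* oldest unacked sequence */
        uint32_t snxt = suna + 20 * 1460;       /* 20 segments in flight */
        uint32_t npkt, ssthresh, cwnd;

        /* Half of the outstanding data, in segments, but at least 2. */
        npkt = ((snxt - suna) >> 1) / mss;
        ssthresh = MAX(npkt, 2) * mss;
        cwnd = mss;                             /* restart from one segment */

        printf("ssthresh %u bytes, cwnd %u bytes\n", ssthresh, cwnd);
        return 0;
}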
- * Assumes that IP has pulled up everything up to and including the ICMP header. - */ -/* ARGSUSED2 */ -static void -tcp_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) -{ - conn_t *connp = (conn_t *)arg1; - icmph_t *icmph; - ipha_t *ipha; - int iph_hdr_length; - tcpha_t *tcpha; - uint32_t seg_seq; - tcp_t *tcp = connp->conn_tcp; - - /* Assume IP provides aligned packets */ - ASSERT(OK_32PTR(mp->b_rptr)); - ASSERT((MBLKL(mp) >= sizeof (ipha_t))); - - /* - * Verify IP version. Anything other than IPv4 or IPv6 packet is sent - * upstream. ICMPv6 is handled in tcp_icmp_error_ipv6. - */ - if (!(ira->ira_flags & IRAF_IS_IPV4)) { - tcp_icmp_error_ipv6(tcp, mp, ira); - return; - } - - /* Skip past the outer IP and ICMP headers */ - iph_hdr_length = ira->ira_ip_hdr_length; - icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; - /* - * If we don't have the correct outer IP header length - * or if we don't have a complete inner IP header - * drop it. - */ - if (iph_hdr_length < sizeof (ipha_t) || - (ipha_t *)&icmph[1] + 1 > (ipha_t *)mp->b_wptr) { -noticmpv4: - freemsg(mp); - return; - } - ipha = (ipha_t *)&icmph[1]; - - /* Skip past the inner IP and find the ULP header */ - iph_hdr_length = IPH_HDR_LENGTH(ipha); - tcpha = (tcpha_t *)((char *)ipha + iph_hdr_length); - /* - * If we don't have the correct inner IP header length or if the ULP - * is not IPPROTO_TCP or if we don't have at least ICMP_MIN_TCP_HDR - * bytes of TCP header, drop it. - */ - if (iph_hdr_length < sizeof (ipha_t) || - ipha->ipha_protocol != IPPROTO_TCP || - (uchar_t *)tcpha + ICMP_MIN_TCP_HDR > mp->b_wptr) { - goto noticmpv4; - } - - seg_seq = ntohl(tcpha->tha_seq); - switch (icmph->icmph_type) { - case ICMP_DEST_UNREACHABLE: - switch (icmph->icmph_code) { - case ICMP_FRAGMENTATION_NEEDED: - /* - * Update Path MTU, then try to send something out. - */ - tcp_update_pmtu(tcp, B_TRUE); - tcp_rexmit_after_error(tcp); - break; - case ICMP_PORT_UNREACHABLE: - case ICMP_PROTOCOL_UNREACHABLE: - switch (tcp->tcp_state) { - case TCPS_SYN_SENT: - case TCPS_SYN_RCVD: - /* - * ICMP can snipe away incipient - * TCP connections as long as - * seq number is same as initial - * send seq number. - */ - if (seg_seq == tcp->tcp_iss) { - (void) tcp_clean_death(tcp, - ECONNREFUSED, 6); - } - break; - } - break; - case ICMP_HOST_UNREACHABLE: - case ICMP_NET_UNREACHABLE: - /* Record the error in case we finally time out. */ - if (icmph->icmph_code == ICMP_HOST_UNREACHABLE) - tcp->tcp_client_errno = EHOSTUNREACH; - else - tcp->tcp_client_errno = ENETUNREACH; - if (tcp->tcp_state == TCPS_SYN_RCVD) { - if (tcp->tcp_listener != NULL && - tcp->tcp_listener->tcp_syn_defense) { - /* - * Ditch the half-open connection if we - * suspect a SYN attack is under way. - */ - (void) tcp_clean_death(tcp, - tcp->tcp_client_errno, 7); - } - } - break; - default: - break; - } - break; - case ICMP_SOURCE_QUENCH: { - /* - * use a global boolean to control - * whether TCP should respond to ICMP_SOURCE_QUENCH. - * The default is false. - */ - if (tcp_icmp_source_quench) { - /* - * Reduce the sending rate as if we got a - * retransmit timeout - */ - uint32_t npkt; - - npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) / - tcp->tcp_mss; - tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * tcp->tcp_mss; - tcp->tcp_cwnd = tcp->tcp_mss; - tcp->tcp_cwnd_cnt = 0; - } - break; - } - } - freemsg(mp); -} - -/* - * CALLED OUTSIDE OF SQUEUE! It can not follow any pointers that tcp might - * change. But it can refer to fields like tcp_suna and tcp_snxt. 
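The window test performed in tcp_verifyicmp below relies on modular 32-bit sequence arithmetic. A small stand-alone version of the same check, with SEQ_LT/SEQ_GEQ spelled out as signed differences (the convention the kernel macros follow) and made-up window values:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Wraparound-safe sequence comparisons, as signed 32-bit differences. */
#define SEQ_LT(a, b)    ((int32_t)((a) - (b)) < 0)
#define SEQ_GEQ(a, b)   ((int32_t)((a) - (b)) >= 0)

/* Accept an ICMP-quoted sequence number only if SND.UNA <= seq < SND.NXT. */
static bool
icmp_seq_in_window(uint32_t seq, uint32_t suna, uint32_t snxt)
{
        return (!SEQ_LT(seq, suna) && !SEQ_GEQ(seq, snxt));
}

int main(void)
{
        /* A send window that wraps around the 2^32 boundary. */
        uint32_t suna = 0xfffffff0U, snxt = 0x00000010U;

        printf("%d\n", icmp_seq_in_window(0xfffffff8U, suna, snxt)); /* 1 */
        printf("%d\n", icmp_seq_in_window(0x00000008U, suna, snxt)); /* 1 */
        printf("%d\n", icmp_seq_in_window(0x00000020U, suna, snxt)); /* 0 */
        return 0;
}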
- * - * Function tcp_verifyicmp is called as conn_verifyicmp to verify the ICMP - * error messages received by IP. The message is always received on the correct - * tcp_t. - */ -/* ARGSUSED */ -static boolean_t -tcp_verifyicmp(conn_t *connp, void *arg2, icmph_t *icmph, icmp6_t *icmp6, - ip_recv_attr_t *ira) -{ - tcpha_t *tcpha = (tcpha_t *)arg2; - uint32_t seq = ntohl(tcpha->tha_seq); - tcp_t *tcp = connp->conn_tcp; - - /* - * TCP sequence number contained in payload of the ICMP error message - * should be within the range SND.UNA <= SEG.SEQ < SND.NXT. Otherwise, - * the message is either a stale ICMP error, or an attack from the - * network. Fail the verification. - */ - if (SEQ_LT(seq, tcp->tcp_suna) || SEQ_GEQ(seq, tcp->tcp_snxt)) - return (B_FALSE); - - /* For "too big" we also check the ignore flag */ - if (ira->ira_flags & IRAF_IS_IPV4) { - ASSERT(icmph != NULL); - if (icmph->icmph_type == ICMP_DEST_UNREACHABLE && - icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED && - tcp->tcp_tcps->tcps_ignore_path_mtu) - return (B_FALSE); - } else { - ASSERT(icmp6 != NULL); - if (icmp6->icmp6_type == ICMP6_PACKET_TOO_BIG && - tcp->tcp_tcps->tcps_ignore_path_mtu) - return (B_FALSE); - } - return (B_TRUE); -} - /* * Update the TCP connection according to change of PMTU. * @@ -7156,7 +2598,7 @@ tcp_verifyicmp(conn_t *connp, void *arg2, icmph_t *icmph, icmp6_t *icmp6, * adjust the MSS based on the value of ixa_pmtu. No need to handle tiny * or negative MSS, since tcp_mss_set() will do it. */ -static void +void tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only) { uint32_t pmtu; @@ -7232,287 +2674,6 @@ tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only) ixa->ixa_flags = ixaflags; } -/* - * Do slow start retransmission after ICMP errors of PMTU changes. - */ -static void -tcp_rexmit_after_error(tcp_t *tcp) -{ - /* - * All sent data has been acknowledged or no data left to send, just - * to return. - */ - if (!SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) || - (tcp->tcp_xmit_head == NULL)) - return; - - if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && (tcp->tcp_unsent == 0)) - tcp->tcp_rexmit_max = tcp->tcp_fss; - else - tcp->tcp_rexmit_max = tcp->tcp_snxt; - - tcp->tcp_rexmit_nxt = tcp->tcp_suna; - tcp->tcp_rexmit = B_TRUE; - tcp->tcp_dupack_cnt = 0; - tcp->tcp_snd_burst = TCP_CWND_SS; - tcp_ss_rexmit(tcp); -} - -/* - * tcp_icmp_error_ipv6 is called from tcp_icmp_input to process ICMPv6 - * error messages passed up by IP. - * Assumes that IP has pulled up all the extension headers as well - * as the ICMPv6 header. - */ -static void -tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, ip_recv_attr_t *ira) -{ - icmp6_t *icmp6; - ip6_t *ip6h; - uint16_t iph_hdr_length = ira->ira_ip_hdr_length; - tcpha_t *tcpha; - uint8_t *nexthdrp; - uint32_t seg_seq; - - /* - * Verify that we have a complete IP header. - */ - ASSERT((MBLKL(mp) >= sizeof (ip6_t))); - - icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length]; - ip6h = (ip6_t *)&icmp6[1]; - /* - * Verify if we have a complete ICMP and inner IP header. - */ - if ((uchar_t *)&ip6h[1] > mp->b_wptr) { -noticmpv6: - freemsg(mp); - return; - } - - if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) - goto noticmpv6; - tcpha = (tcpha_t *)((char *)ip6h + iph_hdr_length); - /* - * Validate inner header. If the ULP is not IPPROTO_TCP or if we don't - * have at least ICMP_MIN_TCP_HDR bytes of TCP header drop the - * packet. 
- */ - if ((*nexthdrp != IPPROTO_TCP) || - ((uchar_t *)tcpha + ICMP_MIN_TCP_HDR) > mp->b_wptr) { - goto noticmpv6; - } - - seg_seq = ntohl(tcpha->tha_seq); - switch (icmp6->icmp6_type) { - case ICMP6_PACKET_TOO_BIG: - /* - * Update Path MTU, then try to send something out. - */ - tcp_update_pmtu(tcp, B_TRUE); - tcp_rexmit_after_error(tcp); - break; - case ICMP6_DST_UNREACH: - switch (icmp6->icmp6_code) { - case ICMP6_DST_UNREACH_NOPORT: - if (((tcp->tcp_state == TCPS_SYN_SENT) || - (tcp->tcp_state == TCPS_SYN_RCVD)) && - (seg_seq == tcp->tcp_iss)) { - (void) tcp_clean_death(tcp, - ECONNREFUSED, 8); - } - break; - case ICMP6_DST_UNREACH_ADMIN: - case ICMP6_DST_UNREACH_NOROUTE: - case ICMP6_DST_UNREACH_BEYONDSCOPE: - case ICMP6_DST_UNREACH_ADDR: - /* Record the error in case we finally time out. */ - tcp->tcp_client_errno = EHOSTUNREACH; - if (((tcp->tcp_state == TCPS_SYN_SENT) || - (tcp->tcp_state == TCPS_SYN_RCVD)) && - (seg_seq == tcp->tcp_iss)) { - if (tcp->tcp_listener != NULL && - tcp->tcp_listener->tcp_syn_defense) { - /* - * Ditch the half-open connection if we - * suspect a SYN attack is under way. - */ - (void) tcp_clean_death(tcp, - tcp->tcp_client_errno, 9); - } - } - - - break; - default: - break; - } - break; - case ICMP6_PARAM_PROB: - /* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */ - if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER && - (uchar_t *)ip6h + icmp6->icmp6_pptr == - (uchar_t *)nexthdrp) { - if (tcp->tcp_state == TCPS_SYN_SENT || - tcp->tcp_state == TCPS_SYN_RCVD) { - (void) tcp_clean_death(tcp, - ECONNREFUSED, 10); - } - break; - } - break; - - case ICMP6_TIME_EXCEEDED: - default: - break; - } - freemsg(mp); -} - -/* - * Notify IP that we are having trouble with this connection. IP should - * make note so it can potentially use a different IRE. - */ -static void -tcp_ip_notify(tcp_t *tcp) -{ - conn_t *connp = tcp->tcp_connp; - ire_t *ire; - - /* - * Note: in the case of source routing we want to blow away the - * route to the first source route hop. - */ - ire = connp->conn_ixa->ixa_ire; - if (ire != NULL && !(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) { - if (ire->ire_ipversion == IPV4_VERSION) { - /* - * As per RFC 1122, we send an RTM_LOSING to inform - * routing protocols. - */ - ip_rts_change(RTM_LOSING, ire->ire_addr, - ire->ire_gateway_addr, ire->ire_mask, - connp->conn_laddr_v4, 0, 0, 0, - (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_IFA), - ire->ire_ipst); - } - (void) ire_no_good(ire); - } -} - -#pragma inline(tcp_send_data) - -/* - * Timer callback routine for keepalive probe. We do a fake resend of - * last ACKed byte. Then set a timer using RTO. When the timer expires, - * check to see if we have heard anything from the other end for the last - * RTO period. If we have, set the timer to expire for another - * tcp_keepalive_intrvl and check again. If we have not, set a timer using - * RTO << 1 and check again when it expires. Keep exponentially increasing - * the timeout if we have not heard from the other side. If for more than - * (tcp_ka_interval + tcp_ka_abort_thres) we have not heard anything, - * kill the connection unless the keepalive abort threshold is 0. In - * that case, we will probe "forever." 
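The probe interval computed in tcp_keepalive_killer below grows roughly exponentially between unanswered probes. A stand-alone model of that schedule, assuming illustrative values for the keepalive interval, the initial RTO and the retransmit ceiling:

#include <stdio.h>

#define MIN(a, b)       ((a) < (b) ? (a) : (b))

int main(void)
{
        int ka_intrvl = 7200000;        /* 2 hours, in ms (illustrative) */
        int rexmit_max = 60000;         /* retransmit interval ceiling, ms */
        int rto = 3000;                 /* first probe waits one RTO */
        int last = 0, firetime;

        for (int probe = 1; probe <= 6; probe++) {
                if (last != 0) {
                        /* Double the previous interval, bounded above. */
                        firetime = MIN(ka_intrvl - 1, last << 1);
                        if (firetime > rexmit_max)
                                firetime = rexmit_max;
                } else {
                        firetime = rto;
                }
                last = firetime;
                printf("probe %d rearm after %d ms\n", probe, firetime);
        }
        return 0;
}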
- */ -static void -tcp_keepalive_killer(void *arg) -{ - mblk_t *mp; - conn_t *connp = (conn_t *)arg; - tcp_t *tcp = connp->conn_tcp; - int32_t firetime; - int32_t idletime; - int32_t ka_intrvl; - tcp_stack_t *tcps = tcp->tcp_tcps; - - tcp->tcp_ka_tid = 0; - - if (tcp->tcp_fused) - return; - - BUMP_MIB(&tcps->tcps_mib, tcpTimKeepalive); - ka_intrvl = tcp->tcp_ka_interval; - - /* - * Keepalive probe should only be sent if the application has not - * done a close on the connection. - */ - if (tcp->tcp_state > TCPS_CLOSE_WAIT) { - return; - } - /* Timer fired too early, restart it. */ - if (tcp->tcp_state < TCPS_ESTABLISHED) { - tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_killer, - MSEC_TO_TICK(ka_intrvl)); - return; - } - - idletime = TICK_TO_MSEC(ddi_get_lbolt() - tcp->tcp_last_recv_time); - /* - * If we have not heard from the other side for a long - * time, kill the connection unless the keepalive abort - * threshold is 0. In that case, we will probe "forever." - */ - if (tcp->tcp_ka_abort_thres != 0 && - idletime > (ka_intrvl + tcp->tcp_ka_abort_thres)) { - BUMP_MIB(&tcps->tcps_mib, tcpTimKeepaliveDrop); - (void) tcp_clean_death(tcp, tcp->tcp_client_errno ? - tcp->tcp_client_errno : ETIMEDOUT, 11); - return; - } - - if (tcp->tcp_snxt == tcp->tcp_suna && - idletime >= ka_intrvl) { - /* Fake resend of last ACKed byte. */ - mblk_t *mp1 = allocb(1, BPRI_LO); - - if (mp1 != NULL) { - *mp1->b_wptr++ = '\0'; - mp = tcp_xmit_mp(tcp, mp1, 1, NULL, NULL, - tcp->tcp_suna - 1, B_FALSE, NULL, B_TRUE); - freeb(mp1); - /* - * if allocation failed, fall through to start the - * timer back. - */ - if (mp != NULL) { - tcp_send_data(tcp, mp); - BUMP_MIB(&tcps->tcps_mib, - tcpTimKeepaliveProbe); - if (tcp->tcp_ka_last_intrvl != 0) { - int max; - /* - * We should probe again at least - * in ka_intrvl, but not more than - * tcp_rexmit_interval_max. - */ - max = tcps->tcps_rexmit_interval_max; - firetime = MIN(ka_intrvl - 1, - tcp->tcp_ka_last_intrvl << 1); - if (firetime > max) - firetime = max; - } else { - firetime = tcp->tcp_rto; - } - tcp->tcp_ka_tid = TCP_TIMER(tcp, - tcp_keepalive_killer, - MSEC_TO_TICK(firetime)); - tcp->tcp_ka_last_intrvl = firetime; - return; - } - } - } else { - tcp->tcp_ka_last_intrvl = 0; - } - - /* firetime can be negative if (mp1 == NULL || mp == NULL) */ - if ((firetime = ka_intrvl - idletime) < 0) { - firetime = ka_intrvl; - } - tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_killer, - MSEC_TO_TICK(firetime)); -} - int tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk) { @@ -7562,214 +2723,6 @@ tcp_maxpsz_set(tcp_t *tcp, boolean_t set_maxblk) return (mss); } -/* - * Extract option values from a tcp header. We put any found values into the - * tcpopt struct and return a bitmask saying which options were found. 
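The parser below walks the usual kind/length TCP option layout. The following stand-alone walk over a hand-built SYN option block (MSS, NOP, window scale, SACK-permitted, timestamps) shows the same loop shape in isolation; the option kind numbers are the standard ones, everything else is made up for the example:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        /* Options as they would appear after the fixed TCP header:
         * MSS 1460, NOP, WSCALE 7, SACK-permitted, TSTAMP, EOL. */
        uint8_t opts[] = {
                2, 4, 0x05, 0xb4,               /* MSS = 1460 */
                1,                              /* NOP */
                3, 3, 7,                        /* window scale = 7 */
                4, 2,                           /* SACK permitted */
                8, 10, 0, 0, 0, 1, 0, 0, 0, 0,  /* TS val=1, ecr=0 */
                0                               /* EOL */
        };
        uint8_t *up = opts, *endp = opts + sizeof (opts);

        while (up < endp) {
                uint8_t kind = up[0];

                if (kind == 0)                  /* EOL ends the list */
                        break;
                if (kind == 1) {                /* NOP is one byte long */
                        up++;
                        continue;
                }
                if (up + 1 >= endp || up[1] < 2 || up + up[1] > endp)
                        break;                  /* malformed length */
                if (kind == 2)
                        printf("MSS %u\n", (unsigned)(up[2] << 8 | up[3]));
                else if (kind == 3)
                        printf("wscale %u\n", up[2]);
                else if (kind == 4)
                        printf("SACK permitted\n");
                else if (kind == 8)
                        printf("timestamps present\n");
                up += up[1];
        }
        return 0;
}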
- */ -static int -tcp_parse_options(tcpha_t *tcpha, tcp_opt_t *tcpopt) -{ - uchar_t *endp; - int len; - uint32_t mss; - uchar_t *up = (uchar_t *)tcpha; - int found = 0; - int32_t sack_len; - tcp_seq sack_begin, sack_end; - tcp_t *tcp; - - endp = up + TCP_HDR_LENGTH(tcpha); - up += TCP_MIN_HEADER_LENGTH; - while (up < endp) { - len = endp - up; - switch (*up) { - case TCPOPT_EOL: - break; - - case TCPOPT_NOP: - up++; - continue; - - case TCPOPT_MAXSEG: - if (len < TCPOPT_MAXSEG_LEN || - up[1] != TCPOPT_MAXSEG_LEN) - break; - - mss = BE16_TO_U16(up+2); - /* Caller must handle tcp_mss_min and tcp_mss_max_* */ - tcpopt->tcp_opt_mss = mss; - found |= TCP_OPT_MSS_PRESENT; - - up += TCPOPT_MAXSEG_LEN; - continue; - - case TCPOPT_WSCALE: - if (len < TCPOPT_WS_LEN || up[1] != TCPOPT_WS_LEN) - break; - - if (up[2] > TCP_MAX_WINSHIFT) - tcpopt->tcp_opt_wscale = TCP_MAX_WINSHIFT; - else - tcpopt->tcp_opt_wscale = up[2]; - found |= TCP_OPT_WSCALE_PRESENT; - - up += TCPOPT_WS_LEN; - continue; - - case TCPOPT_SACK_PERMITTED: - if (len < TCPOPT_SACK_OK_LEN || - up[1] != TCPOPT_SACK_OK_LEN) - break; - found |= TCP_OPT_SACK_OK_PRESENT; - up += TCPOPT_SACK_OK_LEN; - continue; - - case TCPOPT_SACK: - if (len <= 2 || up[1] <= 2 || len < up[1]) - break; - - /* If TCP is not interested in SACK blks... */ - if ((tcp = tcpopt->tcp) == NULL) { - up += up[1]; - continue; - } - sack_len = up[1] - TCPOPT_HEADER_LEN; - up += TCPOPT_HEADER_LEN; - - /* - * If the list is empty, allocate one and assume - * nothing is sack'ed. - */ - ASSERT(tcp->tcp_sack_info != NULL); - if (tcp->tcp_notsack_list == NULL) { - tcp_notsack_update(&(tcp->tcp_notsack_list), - tcp->tcp_suna, tcp->tcp_snxt, - &(tcp->tcp_num_notsack_blk), - &(tcp->tcp_cnt_notsack_list)); - - /* - * Make sure tcp_notsack_list is not NULL. - * This happens when kmem_alloc(KM_NOSLEEP) - * returns NULL. - */ - if (tcp->tcp_notsack_list == NULL) { - up += sack_len; - continue; - } - tcp->tcp_fack = tcp->tcp_suna; - } - - while (sack_len > 0) { - if (up + 8 > endp) { - up = endp; - break; - } - sack_begin = BE32_TO_U32(up); - up += 4; - sack_end = BE32_TO_U32(up); - up += 4; - sack_len -= 8; - /* - * Bounds checking. Make sure the SACK - * info is within tcp_suna and tcp_snxt. - * If this SACK blk is out of bound, ignore - * it but continue to parse the following - * blks. - */ - if (SEQ_LEQ(sack_end, sack_begin) || - SEQ_LT(sack_begin, tcp->tcp_suna) || - SEQ_GT(sack_end, tcp->tcp_snxt)) { - continue; - } - tcp_notsack_insert(&(tcp->tcp_notsack_list), - sack_begin, sack_end, - &(tcp->tcp_num_notsack_blk), - &(tcp->tcp_cnt_notsack_list)); - if (SEQ_GT(sack_end, tcp->tcp_fack)) { - tcp->tcp_fack = sack_end; - } - } - found |= TCP_OPT_SACK_PRESENT; - continue; - - case TCPOPT_TSTAMP: - if (len < TCPOPT_TSTAMP_LEN || - up[1] != TCPOPT_TSTAMP_LEN) - break; - - tcpopt->tcp_opt_ts_val = BE32_TO_U32(up+2); - tcpopt->tcp_opt_ts_ecr = BE32_TO_U32(up+6); - - found |= TCP_OPT_TSTAMP_PRESENT; - - up += TCPOPT_TSTAMP_LEN; - continue; - - default: - if (len <= 1 || len < (int)up[1] || up[1] == 0) - break; - up += up[1]; - continue; - } - break; - } - return (found); -} - -/* - * Set the MSS associated with a particular tcp based on its current value, - * and a new one passed in. Observe minimums and maximums, and reset other - * state variables that we want to view as multiples of MSS. - * - * The value of MSS could be either increased or descreased. 
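For reference, the MSS branch above consumes a four-byte option (kind 2, length 4, value in network byte order); a minimal standalone sketch with an illustrative helper name:

#include <stdint.h>

#define TCPOPT_MAXSEG           2
#define TCPOPT_MAXSEG_LEN       4

/* Sketch only: decode one MSS option, returning 0 if it is malformed. */
static uint16_t
mss_opt_value(const uint8_t *opt, int remaining)
{
        if (remaining < TCPOPT_MAXSEG_LEN || opt[0] != TCPOPT_MAXSEG ||
            opt[1] != TCPOPT_MAXSEG_LEN)
                return (0);
        return ((uint16_t)((opt[2] << 8) | opt[3]));
}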
- */ -static void -tcp_mss_set(tcp_t *tcp, uint32_t mss) -{ - uint32_t mss_max; - tcp_stack_t *tcps = tcp->tcp_tcps; - conn_t *connp = tcp->tcp_connp; - - if (connp->conn_ipversion == IPV4_VERSION) - mss_max = tcps->tcps_mss_max_ipv4; - else - mss_max = tcps->tcps_mss_max_ipv6; - - if (mss < tcps->tcps_mss_min) - mss = tcps->tcps_mss_min; - if (mss > mss_max) - mss = mss_max; - /* - * Unless naglim has been set by our client to - * a non-mss value, force naglim to track mss. - * This can help to aggregate small writes. - */ - if (mss < tcp->tcp_naglim || tcp->tcp_mss == tcp->tcp_naglim) - tcp->tcp_naglim = mss; - /* - * TCP should be able to buffer at least 4 MSS data for obvious - * performance reason. - */ - if ((mss << 2) > connp->conn_sndbuf) - connp->conn_sndbuf = mss << 2; - - /* - * Set the send lowater to at least twice of MSS. - */ - if ((mss << 1) > connp->conn_sndlowat) - connp->conn_sndlowat = mss << 1; - - /* - * Update tcp_cwnd according to the new value of MSS. Keep the - * previous ratio to preserve the transmit rate. - */ - tcp->tcp_cwnd = (tcp->tcp_cwnd / tcp->tcp_mss) * mss; - tcp->tcp_cwnd_cnt = 0; - - tcp->tcp_mss = mss; - (void) tcp_maxpsz_set(tcp, B_TRUE); -} - /* For /dev/tcp aka AF_INET open */ static int tcp_openv4(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) @@ -7784,7 +2737,7 @@ tcp_openv6(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp) return (tcp_open(q, devp, flag, sflag, credp, B_TRUE)); } -static conn_t * +conn_t * tcp_create_common(cred_t *credp, boolean_t isv6, boolean_t issocket, int *errorp) { @@ -8028,740 +2981,6 @@ tcp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp, } /* - * Some TCP options can be "set" by requesting them in the option - * buffer. This is needed for XTI feature test though we do not - * allow it in general. We interpret that this mechanism is more - * applicable to OSI protocols and need not be allowed in general. - * This routine filters out options for which it is not allowed (most) - * and lets through those (few) for which it is. [ The XTI interface - * test suite specifics will imply that any XTI_GENERIC level XTI_* if - * ever implemented will have to be allowed here ]. 
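The congestion-window update in tcp_mss_set() above keeps the window an integral number of segments; a minimal sketch of the same arithmetic (illustrative name): a window of 8 x 1460 = 11680 bytes becomes 8 x 1220 = 9760 bytes when the MSS drops to 1220.

#include <stdint.h>

/* Sketch only: rescale cwnd so its segment count survives an MSS change. */
static uint32_t
cwnd_rescale(uint32_t cwnd, uint32_t old_mss, uint32_t new_mss)
{
        return ((cwnd / old_mss) * new_mss);
}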
- */ -static boolean_t -tcp_allow_connopt_set(int level, int name) -{ - - switch (level) { - case IPPROTO_TCP: - switch (name) { - case TCP_NODELAY: - return (B_TRUE); - default: - return (B_FALSE); - } - /*NOTREACHED*/ - default: - return (B_FALSE); - } - /*NOTREACHED*/ -} - -/* - * This routine gets default values of certain options whose default - * values are maintained by protocol specific code - */ -/* ARGSUSED */ -int -tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr) -{ - int32_t *i1 = (int32_t *)ptr; - tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; - - switch (level) { - case IPPROTO_TCP: - switch (name) { - case TCP_NOTIFY_THRESHOLD: - *i1 = tcps->tcps_ip_notify_interval; - break; - case TCP_ABORT_THRESHOLD: - *i1 = tcps->tcps_ip_abort_interval; - break; - case TCP_CONN_NOTIFY_THRESHOLD: - *i1 = tcps->tcps_ip_notify_cinterval; - break; - case TCP_CONN_ABORT_THRESHOLD: - *i1 = tcps->tcps_ip_abort_cinterval; - break; - default: - return (-1); - } - break; - case IPPROTO_IP: - switch (name) { - case IP_TTL: - *i1 = tcps->tcps_ipv4_ttl; - break; - default: - return (-1); - } - break; - case IPPROTO_IPV6: - switch (name) { - case IPV6_UNICAST_HOPS: - *i1 = tcps->tcps_ipv6_hoplimit; - break; - default: - return (-1); - } - break; - default: - return (-1); - } - return (sizeof (int)); -} - -/* - * TCP routine to get the values of options. - */ -static int -tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) -{ - int *i1 = (int *)ptr; - tcp_t *tcp = connp->conn_tcp; - conn_opt_arg_t coas; - int retval; - - coas.coa_connp = connp; - coas.coa_ixa = connp->conn_ixa; - coas.coa_ipp = &connp->conn_xmit_ipp; - coas.coa_ancillary = B_FALSE; - coas.coa_changed = 0; - - switch (level) { - case SOL_SOCKET: - switch (name) { - case SO_SND_COPYAVOID: - *i1 = tcp->tcp_snd_zcopy_on ? - SO_SND_COPYAVOID : 0; - return (sizeof (int)); - case SO_ACCEPTCONN: - *i1 = (tcp->tcp_state == TCPS_LISTEN); - return (sizeof (int)); - } - break; - case IPPROTO_TCP: - switch (name) { - case TCP_NODELAY: - *i1 = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0; - return (sizeof (int)); - case TCP_MAXSEG: - *i1 = tcp->tcp_mss; - return (sizeof (int)); - case TCP_NOTIFY_THRESHOLD: - *i1 = (int)tcp->tcp_first_timer_threshold; - return (sizeof (int)); - case TCP_ABORT_THRESHOLD: - *i1 = tcp->tcp_second_timer_threshold; - return (sizeof (int)); - case TCP_CONN_NOTIFY_THRESHOLD: - *i1 = tcp->tcp_first_ctimer_threshold; - return (sizeof (int)); - case TCP_CONN_ABORT_THRESHOLD: - *i1 = tcp->tcp_second_ctimer_threshold; - return (sizeof (int)); - case TCP_INIT_CWND: - *i1 = tcp->tcp_init_cwnd; - return (sizeof (int)); - case TCP_KEEPALIVE_THRESHOLD: - *i1 = tcp->tcp_ka_interval; - return (sizeof (int)); - case TCP_KEEPALIVE_ABORT_THRESHOLD: - *i1 = tcp->tcp_ka_abort_thres; - return (sizeof (int)); - case TCP_CORK: - *i1 = tcp->tcp_cork; - return (sizeof (int)); - } - break; - case IPPROTO_IP: - if (connp->conn_family != AF_INET) - return (-1); - switch (name) { - case IP_OPTIONS: - case T_IP_OPTIONS: - /* Caller ensures enough space */ - return (ip_opt_get_user(connp, ptr)); - default: - break; - } - break; - - case IPPROTO_IPV6: - /* - * IPPROTO_IPV6 options are only supported for sockets - * that are using IPv6 on the wire. 
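From user level the values returned by tcp_opt_get() above surface through the usual socket option interface; a minimal usage sketch (error handling omitted):

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>

int
main(void)
{
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        int mss = 0, nodelay = 0;
        socklen_t len = sizeof (int);

        /* Served by the TCP_MAXSEG and TCP_NODELAY cases above. */
        (void) getsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, &mss, &len);
        len = sizeof (int);
        (void) getsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &nodelay, &len);
        (void) printf("mss %d nodelay %d\n", mss, nodelay);
        return (0);
}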
- */ - if (connp->conn_ipversion != IPV6_VERSION) { - return (-1); - } - switch (name) { - case IPV6_PATHMTU: - if (tcp->tcp_state < TCPS_ESTABLISHED) - return (-1); - break; - } - break; - } - mutex_enter(&connp->conn_lock); - retval = conn_opt_get(&coas, level, name, ptr); - mutex_exit(&connp->conn_lock); - return (retval); -} - -/* - * TCP routine to get the values of options. - */ -int -tcp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr) -{ - return (tcp_opt_get(Q_TO_CONN(q), level, name, ptr)); -} - -/* returns UNIX error, the optlen is a value-result arg */ -int -tcp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, - void *optvalp, socklen_t *optlen, cred_t *cr) -{ - conn_t *connp = (conn_t *)proto_handle; - squeue_t *sqp = connp->conn_sqp; - int error; - t_uscalar_t max_optbuf_len; - void *optvalp_buf; - int len; - - ASSERT(connp->conn_upper_handle != NULL); - - error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len, - tcp_opt_obj.odb_opt_des_arr, - tcp_opt_obj.odb_opt_arr_cnt, - B_FALSE, B_TRUE, cr); - if (error != 0) { - if (error < 0) { - error = proto_tlitosyserr(-error); - } - return (error); - } - - optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP); - - error = squeue_synch_enter(sqp, connp, NULL); - if (error == ENOMEM) { - kmem_free(optvalp_buf, max_optbuf_len); - return (ENOMEM); - } - - len = tcp_opt_get(connp, level, option_name, optvalp_buf); - squeue_synch_exit(sqp, connp); - - if (len == -1) { - kmem_free(optvalp_buf, max_optbuf_len); - return (EINVAL); - } - - /* - * update optlen and copy option value - */ - t_uscalar_t size = MIN(len, *optlen); - - bcopy(optvalp_buf, optvalp, size); - bcopy(&size, optlen, sizeof (size)); - - kmem_free(optvalp_buf, max_optbuf_len); - return (0); -} - -/* - * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements. - * Parameters are assumed to be verified by the caller. - */ -/* ARGSUSED */ -int -tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, - uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, - void *thisdg_attrs, cred_t *cr) -{ - tcp_t *tcp = connp->conn_tcp; - int *i1 = (int *)invalp; - boolean_t onoff = (*i1 == 0) ? 0 : 1; - boolean_t checkonly; - int reterr; - tcp_stack_t *tcps = tcp->tcp_tcps; - conn_opt_arg_t coas; - - coas.coa_connp = connp; - coas.coa_ixa = connp->conn_ixa; - coas.coa_ipp = &connp->conn_xmit_ipp; - coas.coa_ancillary = B_FALSE; - coas.coa_changed = 0; - - switch (optset_context) { - case SETFN_OPTCOM_CHECKONLY: - checkonly = B_TRUE; - /* - * Note: Implies T_CHECK semantics for T_OPTCOM_REQ - * inlen != 0 implies value supplied and - * we have to "pretend" to set it. - * inlen == 0 implies that there is no - * value part in T_CHECK request and just validation - * done elsewhere should be enough, we just return here. - */ - if (inlen == 0) { - *outlenp = 0; - return (0); - } - break; - case SETFN_OPTCOM_NEGOTIATE: - checkonly = B_FALSE; - break; - case SETFN_UD_NEGOTIATE: /* error on conn-oriented transports ? */ - case SETFN_CONN_NEGOTIATE: - checkonly = B_FALSE; - /* - * Negotiating local and "association-related" options - * from other (T_CONN_REQ, T_CONN_RES,T_UNITDATA_REQ) - * primitives is allowed by XTI, but we choose - * to not implement this style negotiation for Internet - * protocols (We interpret it is a must for OSI world but - * optional for Internet protocols) for all options. 
- * [ Will do only for the few options that enable test - * suites that our XTI implementation of this feature - * works for transports that do allow it ] - */ - if (!tcp_allow_connopt_set(level, name)) { - *outlenp = 0; - return (EINVAL); - } - break; - default: - /* - * We should never get here - */ - *outlenp = 0; - return (EINVAL); - } - - ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) || - (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0)); - - /* - * For TCP, we should have no ancillary data sent down - * (sendmsg isn't supported for SOCK_STREAM), so thisdg_attrs - * has to be zero. - */ - ASSERT(thisdg_attrs == NULL); - - /* - * For fixed length options, no sanity check - * of passed in length is done. It is assumed *_optcom_req() - * routines do the right thing. - */ - switch (level) { - case SOL_SOCKET: - switch (name) { - case SO_KEEPALIVE: - if (checkonly) { - /* check only case */ - break; - } - - if (!onoff) { - if (connp->conn_keepalive) { - if (tcp->tcp_ka_tid != 0) { - (void) TCP_TIMER_CANCEL(tcp, - tcp->tcp_ka_tid); - tcp->tcp_ka_tid = 0; - } - connp->conn_keepalive = 0; - } - break; - } - if (!connp->conn_keepalive) { - /* Crank up the keepalive timer */ - tcp->tcp_ka_last_intrvl = 0; - tcp->tcp_ka_tid = TCP_TIMER(tcp, - tcp_keepalive_killer, - MSEC_TO_TICK(tcp->tcp_ka_interval)); - connp->conn_keepalive = 1; - } - break; - case SO_SNDBUF: { - if (*i1 > tcps->tcps_max_buf) { - *outlenp = 0; - return (ENOBUFS); - } - if (checkonly) - break; - - connp->conn_sndbuf = *i1; - if (tcps->tcps_snd_lowat_fraction != 0) { - connp->conn_sndlowat = connp->conn_sndbuf / - tcps->tcps_snd_lowat_fraction; - } - (void) tcp_maxpsz_set(tcp, B_TRUE); - /* - * If we are flow-controlled, recheck the condition. - * There are apps that increase SO_SNDBUF size when - * flow-controlled (EWOULDBLOCK), and expect the flow - * control condition to be lifted right away. - */ - mutex_enter(&tcp->tcp_non_sq_lock); - if (tcp->tcp_flow_stopped && - TCP_UNSENT_BYTES(tcp) < connp->conn_sndbuf) { - tcp_clrqfull(tcp); - } - mutex_exit(&tcp->tcp_non_sq_lock); - *outlenp = inlen; - return (0); - } - case SO_RCVBUF: - if (*i1 > tcps->tcps_max_buf) { - *outlenp = 0; - return (ENOBUFS); - } - /* Silently ignore zero */ - if (!checkonly && *i1 != 0) { - *i1 = MSS_ROUNDUP(*i1, tcp->tcp_mss); - (void) tcp_rwnd_set(tcp, *i1); - } - /* - * XXX should we return the rwnd here - * and tcp_opt_get ? - */ - *outlenp = inlen; - return (0); - case SO_SND_COPYAVOID: - if (!checkonly) { - if (tcp->tcp_loopback || - (tcp->tcp_kssl_ctx != NULL) || - (onoff != 1) || !tcp_zcopy_check(tcp)) { - *outlenp = 0; - return (EOPNOTSUPP); - } - tcp->tcp_snd_zcopy_aware = 1; - } - *outlenp = inlen; - return (0); - } - break; - case IPPROTO_TCP: - switch (name) { - case TCP_NODELAY: - if (!checkonly) - tcp->tcp_naglim = *i1 ? 
1 : tcp->tcp_mss; - break; - case TCP_NOTIFY_THRESHOLD: - if (!checkonly) - tcp->tcp_first_timer_threshold = *i1; - break; - case TCP_ABORT_THRESHOLD: - if (!checkonly) - tcp->tcp_second_timer_threshold = *i1; - break; - case TCP_CONN_NOTIFY_THRESHOLD: - if (!checkonly) - tcp->tcp_first_ctimer_threshold = *i1; - break; - case TCP_CONN_ABORT_THRESHOLD: - if (!checkonly) - tcp->tcp_second_ctimer_threshold = *i1; - break; - case TCP_RECVDSTADDR: - if (tcp->tcp_state > TCPS_LISTEN) { - *outlenp = 0; - return (EOPNOTSUPP); - } - /* Setting done in conn_opt_set */ - break; - case TCP_INIT_CWND: { - uint32_t init_cwnd = *((uint32_t *)invalp); - - if (checkonly) - break; - - /* - * Only allow socket with network configuration - * privilege to set the initial cwnd to be larger - * than allowed by RFC 3390. - */ - if (init_cwnd <= MIN(4, MAX(2, 4380 / tcp->tcp_mss))) { - tcp->tcp_init_cwnd = init_cwnd; - break; - } - if ((reterr = secpolicy_ip_config(cr, B_TRUE)) != 0) { - *outlenp = 0; - return (reterr); - } - if (init_cwnd > TCP_MAX_INIT_CWND) { - *outlenp = 0; - return (EINVAL); - } - tcp->tcp_init_cwnd = init_cwnd; - break; - } - case TCP_KEEPALIVE_THRESHOLD: - if (checkonly) - break; - - if (*i1 < tcps->tcps_keepalive_interval_low || - *i1 > tcps->tcps_keepalive_interval_high) { - *outlenp = 0; - return (EINVAL); - } - if (*i1 != tcp->tcp_ka_interval) { - tcp->tcp_ka_interval = *i1; - /* - * Check if we need to restart the - * keepalive timer. - */ - if (tcp->tcp_ka_tid != 0) { - ASSERT(connp->conn_keepalive); - (void) TCP_TIMER_CANCEL(tcp, - tcp->tcp_ka_tid); - tcp->tcp_ka_last_intrvl = 0; - tcp->tcp_ka_tid = TCP_TIMER(tcp, - tcp_keepalive_killer, - MSEC_TO_TICK(tcp->tcp_ka_interval)); - } - } - break; - case TCP_KEEPALIVE_ABORT_THRESHOLD: - if (!checkonly) { - if (*i1 < - tcps->tcps_keepalive_abort_interval_low || - *i1 > - tcps->tcps_keepalive_abort_interval_high) { - *outlenp = 0; - return (EINVAL); - } - tcp->tcp_ka_abort_thres = *i1; - } - break; - case TCP_CORK: - if (!checkonly) { - /* - * if tcp->tcp_cork was set and is now - * being unset, we have to make sure that - * the remaining data gets sent out. Also - * unset tcp->tcp_cork so that tcp_wput_data() - * can send data even if it is less than mss - */ - if (tcp->tcp_cork && onoff == 0 && - tcp->tcp_unsent > 0) { - tcp->tcp_cork = B_FALSE; - tcp_wput_data(tcp, NULL, B_FALSE); - } - tcp->tcp_cork = onoff; - } - break; - default: - break; - } - break; - case IPPROTO_IP: - if (connp->conn_family != AF_INET) { - *outlenp = 0; - return (EINVAL); - } - switch (name) { - case IP_SEC_OPT: - /* - * We should not allow policy setting after - * we start listening for connections. - */ - if (tcp->tcp_state == TCPS_LISTEN) { - return (EINVAL); - } - break; - } - break; - case IPPROTO_IPV6: - /* - * IPPROTO_IPV6 options are only supported for sockets - * that are using IPv6 on the wire. 
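The unprivileged cap applied to TCP_INIT_CWND above is the RFC 3390 initial window expressed in segments; a minimal sketch of the same computation (illustrative name):

#include <stdint.h>

/*
 * Sketch only: RFC 3390 initial window, min(4*MSS, max(2*MSS, 4380 bytes)),
 * converted to a segment count; 1460-byte segments yield 3.
 */
static uint32_t
rfc3390_init_cwnd_segs(uint32_t mss)
{
        uint32_t segs = 4380 / mss;

        if (segs < 2)
                segs = 2;
        if (segs > 4)
                segs = 4;
        return (segs);
}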
- */ - if (connp->conn_ipversion != IPV6_VERSION) { - *outlenp = 0; - return (EINVAL); - } - - switch (name) { - case IPV6_RECVPKTINFO: - if (!checkonly) { - /* Force it to be sent up with the next msg */ - tcp->tcp_recvifindex = 0; - } - break; - case IPV6_RECVTCLASS: - if (!checkonly) { - /* Force it to be sent up with the next msg */ - tcp->tcp_recvtclass = 0xffffffffU; - } - break; - case IPV6_RECVHOPLIMIT: - if (!checkonly) { - /* Force it to be sent up with the next msg */ - tcp->tcp_recvhops = 0xffffffffU; - } - break; - case IPV6_PKTINFO: - /* This is an extra check for TCP */ - if (inlen == sizeof (struct in6_pktinfo)) { - struct in6_pktinfo *pkti; - - pkti = (struct in6_pktinfo *)invalp; - /* - * RFC 3542 states that ipi6_addr must be - * the unspecified address when setting the - * IPV6_PKTINFO sticky socket option on a - * TCP socket. - */ - if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr)) - return (EINVAL); - } - break; - case IPV6_SEC_OPT: - /* - * We should not allow policy setting after - * we start listening for connections. - */ - if (tcp->tcp_state == TCPS_LISTEN) { - return (EINVAL); - } - break; - } - break; - } - reterr = conn_opt_set(&coas, level, name, inlen, invalp, - checkonly, cr); - if (reterr != 0) { - *outlenp = 0; - return (reterr); - } - - /* - * Common case of OK return with outval same as inval - */ - if (invalp != outvalp) { - /* don't trust bcopy for identical src/dst */ - (void) bcopy(invalp, outvalp, inlen); - } - *outlenp = inlen; - - if (coas.coa_changed & COA_HEADER_CHANGED) { - /* If we are connected we rebuilt the headers */ - if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) && - !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { - reterr = tcp_build_hdrs(tcp); - if (reterr != 0) - return (reterr); - } - } - if (coas.coa_changed & COA_ROUTE_CHANGED) { - in6_addr_t nexthop; - - /* - * If we are connected we re-cache the information. - * We ignore errors to preserve BSD behavior. - * Note that we don't redo IPsec policy lookup here - * since the final destination (or source) didn't change. 
- */ - ip_attr_nexthop(&connp->conn_xmit_ipp, connp->conn_ixa, - &connp->conn_faddr_v6, &nexthop); - - if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) && - !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { - (void) ip_attr_connect(connp, connp->conn_ixa, - &connp->conn_laddr_v6, &connp->conn_faddr_v6, - &nexthop, connp->conn_fport, NULL, NULL, - IPDF_VERIFY_DST); - } - } - if ((coas.coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) { - connp->conn_wq->q_hiwat = connp->conn_sndbuf; - } - if (coas.coa_changed & COA_WROFF_CHANGED) { - connp->conn_wroff = connp->conn_ht_iphc_allocated + - tcps->tcps_wroff_xtra; - (void) proto_set_tx_wroff(connp->conn_rq, connp, - connp->conn_wroff); - } - if (coas.coa_changed & COA_OOBINLINE_CHANGED) { - if (IPCL_IS_NONSTR(connp)) - proto_set_rx_oob_opt(connp, onoff); - } - return (0); -} - -/* ARGSUSED */ -int -tcp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name, - uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, - void *thisdg_attrs, cred_t *cr) -{ - conn_t *connp = Q_TO_CONN(q); - - return (tcp_opt_set(connp, optset_context, level, name, inlen, invalp, - outlenp, outvalp, thisdg_attrs, cr)); -} - -int -tcp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, - const void *optvalp, socklen_t optlen, cred_t *cr) -{ - conn_t *connp = (conn_t *)proto_handle; - squeue_t *sqp = connp->conn_sqp; - int error; - - ASSERT(connp->conn_upper_handle != NULL); - /* - * Entering the squeue synchronously can result in a context switch, - * which can cause a rather sever performance degradation. So we try to - * handle whatever options we can without entering the squeue. - */ - if (level == IPPROTO_TCP) { - switch (option_name) { - case TCP_NODELAY: - if (optlen != sizeof (int32_t)) - return (EINVAL); - mutex_enter(&connp->conn_tcp->tcp_non_sq_lock); - connp->conn_tcp->tcp_naglim = *(int *)optvalp ? 1 : - connp->conn_tcp->tcp_mss; - mutex_exit(&connp->conn_tcp->tcp_non_sq_lock); - return (0); - default: - break; - } - } - - error = squeue_synch_enter(sqp, connp, NULL); - if (error == ENOMEM) { - return (ENOMEM); - } - - error = proto_opt_check(level, option_name, optlen, NULL, - tcp_opt_obj.odb_opt_des_arr, - tcp_opt_obj.odb_opt_arr_cnt, - B_TRUE, B_FALSE, cr); - - if (error != 0) { - if (error < 0) { - error = proto_tlitosyserr(-error); - } - squeue_synch_exit(sqp, connp); - return (error); - } - - error = tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name, - optlen, (uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp, - NULL, cr); - squeue_synch_exit(sqp, connp); - - ASSERT(error >= 0); - - return (error); -} - -/* * Build/update the tcp header template (in conn_ht_iphc) based on * conn_xmit_ipp. The headers include ip6_t, any extension * headers, and the maximum size tcp header (to avoid reallocation @@ -8770,7 +2989,7 @@ tcp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, * Assumes the caller has already set conn_{faddr,laddr,fport,lport,flowinfo}. * Returns failure if can't allocate memory. 
*/ -static int +int tcp_build_hdrs(tcp_t *tcp) { tcp_stack_t *tcps = tcp->tcp_tcps; @@ -8981,4100 +3200,6 @@ tcp_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr) return (0); } -static void -tcp_reass_timer(void *arg) -{ - conn_t *connp = (conn_t *)arg; - tcp_t *tcp = connp->conn_tcp; - - tcp->tcp_reass_tid = 0; - if (tcp->tcp_reass_head == NULL) - return; - ASSERT(tcp->tcp_reass_tail != NULL); - if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { - tcp_sack_remove(tcp->tcp_sack_list, - TCP_REASS_END(tcp->tcp_reass_tail), &tcp->tcp_num_sack_blk); - } - tcp_close_mpp(&tcp->tcp_reass_head); - tcp->tcp_reass_tail = NULL; -} - -/* - * Add a new piece to the tcp reassembly queue. If the gap at the beginning - * is filled, return as much as we can. The message passed in may be - * multi-part, chained using b_cont. "start" is the starting sequence - * number for this piece. - */ -static mblk_t * -tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start) -{ - uint32_t end; - mblk_t *mp1; - mblk_t *mp2; - mblk_t *next_mp; - uint32_t u1; - tcp_stack_t *tcps = tcp->tcp_tcps; - - - /* Walk through all the new pieces. */ - do { - ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= - (uintptr_t)INT_MAX); - end = start + (int)(mp->b_wptr - mp->b_rptr); - next_mp = mp->b_cont; - if (start == end) { - /* Empty. Blast it. */ - freeb(mp); - continue; - } - mp->b_cont = NULL; - TCP_REASS_SET_SEQ(mp, start); - TCP_REASS_SET_END(mp, end); - mp1 = tcp->tcp_reass_tail; - if (!mp1) { - tcp->tcp_reass_tail = mp; - tcp->tcp_reass_head = mp; - BUMP_MIB(&tcps->tcps_mib, tcpInDataUnorderSegs); - UPDATE_MIB(&tcps->tcps_mib, - tcpInDataUnorderBytes, end - start); - continue; - } - /* New stuff completely beyond tail? */ - if (SEQ_GEQ(start, TCP_REASS_END(mp1))) { - /* Link it on end. */ - mp1->b_cont = mp; - tcp->tcp_reass_tail = mp; - BUMP_MIB(&tcps->tcps_mib, tcpInDataUnorderSegs); - UPDATE_MIB(&tcps->tcps_mib, - tcpInDataUnorderBytes, end - start); - continue; - } - mp1 = tcp->tcp_reass_head; - u1 = TCP_REASS_SEQ(mp1); - /* New stuff at the front? */ - if (SEQ_LT(start, u1)) { - /* Yes... Check for overlap. */ - mp->b_cont = mp1; - tcp->tcp_reass_head = mp; - tcp_reass_elim_overlap(tcp, mp); - continue; - } - /* - * The new piece fits somewhere between the head and tail. - * We find our slot, where mp1 precedes us and mp2 trails. - */ - for (; (mp2 = mp1->b_cont) != NULL; mp1 = mp2) { - u1 = TCP_REASS_SEQ(mp2); - if (SEQ_LEQ(start, u1)) - break; - } - /* Link ourselves in */ - mp->b_cont = mp2; - mp1->b_cont = mp; - - /* Trim overlap with following mblk(s) first */ - tcp_reass_elim_overlap(tcp, mp); - - /* Trim overlap with preceding mblk */ - tcp_reass_elim_overlap(tcp, mp1); - - } while (start = end, mp = next_mp); - mp1 = tcp->tcp_reass_head; - /* Anything ready to go? 
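The SEQ_LT/SEQ_GEQ comparisons used throughout the reassembly code above are 32-bit serial arithmetic: ordering is decided by the sign of the difference, so the tests keep working across sequence-number wrap. A sketch of the conventional definitions:

#include <stdint.h>

#define SEQ_LT(a, b)    ((int32_t)((a) - (b)) < 0)
#define SEQ_LEQ(a, b)   ((int32_t)((a) - (b)) <= 0)
#define SEQ_GT(a, b)    ((int32_t)((a) - (b)) > 0)
#define SEQ_GEQ(a, b)   ((int32_t)((a) - (b)) >= 0)

/* e.g. SEQ_LT(0xfffffff0, 0x00000010) is true across the wrap. */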
*/ - if (TCP_REASS_SEQ(mp1) != tcp->tcp_rnxt) - return (NULL); - /* Eat what we can off the queue */ - for (;;) { - mp = mp1->b_cont; - end = TCP_REASS_END(mp1); - TCP_REASS_SET_SEQ(mp1, 0); - TCP_REASS_SET_END(mp1, 0); - if (!mp) { - tcp->tcp_reass_tail = NULL; - break; - } - if (end != TCP_REASS_SEQ(mp)) { - mp1->b_cont = NULL; - break; - } - mp1 = mp; - } - mp1 = tcp->tcp_reass_head; - tcp->tcp_reass_head = mp; - return (mp1); -} - -/* Eliminate any overlap that mp may have over later mblks */ -static void -tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp) -{ - uint32_t end; - mblk_t *mp1; - uint32_t u1; - tcp_stack_t *tcps = tcp->tcp_tcps; - - end = TCP_REASS_END(mp); - while ((mp1 = mp->b_cont) != NULL) { - u1 = TCP_REASS_SEQ(mp1); - if (!SEQ_GT(end, u1)) - break; - if (!SEQ_GEQ(end, TCP_REASS_END(mp1))) { - mp->b_wptr -= end - u1; - TCP_REASS_SET_END(mp, u1); - BUMP_MIB(&tcps->tcps_mib, tcpInDataPartDupSegs); - UPDATE_MIB(&tcps->tcps_mib, - tcpInDataPartDupBytes, end - u1); - break; - } - mp->b_cont = mp1->b_cont; - TCP_REASS_SET_SEQ(mp1, 0); - TCP_REASS_SET_END(mp1, 0); - freeb(mp1); - BUMP_MIB(&tcps->tcps_mib, tcpInDataDupSegs); - UPDATE_MIB(&tcps->tcps_mib, tcpInDataDupBytes, end - u1); - } - if (!mp1) - tcp->tcp_reass_tail = mp; -} - -static uint_t -tcp_rwnd_reopen(tcp_t *tcp) -{ - uint_t ret = 0; - uint_t thwin; - conn_t *connp = tcp->tcp_connp; - - /* Learn the latest rwnd information that we sent to the other side. */ - thwin = ((uint_t)ntohs(tcp->tcp_tcpha->tha_win)) - << tcp->tcp_rcv_ws; - /* This is peer's calculated send window (our receive window). */ - thwin -= tcp->tcp_rnxt - tcp->tcp_rack; - /* - * Increase the receive window to max. But we need to do receiver - * SWS avoidance. This means that we need to check the increase of - * of receive window is at least 1 MSS. - */ - if (connp->conn_rcvbuf - thwin >= tcp->tcp_mss) { - /* - * If the window that the other side knows is less than max - * deferred acks segments, send an update immediately. - */ - if (thwin < tcp->tcp_rack_cur_max * tcp->tcp_mss) { - BUMP_MIB(&tcp->tcp_tcps->tcps_mib, tcpOutWinUpdate); - ret = TH_ACK_NEEDED; - } - tcp->tcp_rwnd = connp->conn_rcvbuf; - } - return (ret); -} - -/* - * Send up all messages queued on tcp_rcv_list. - */ -static uint_t -tcp_rcv_drain(tcp_t *tcp) -{ - mblk_t *mp; - uint_t ret = 0; -#ifdef DEBUG - uint_t cnt = 0; -#endif - queue_t *q = tcp->tcp_connp->conn_rq; - - /* Can't drain on an eager connection */ - if (tcp->tcp_listener != NULL) - return (ret); - - /* Can't be a non-STREAMS connection */ - ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp)); - - /* No need for the push timer now. */ - if (tcp->tcp_push_tid != 0) { - (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid); - tcp->tcp_push_tid = 0; - } - - /* - * Handle two cases here: we are currently fused or we were - * previously fused and have some urgent data to be delivered - * upstream. The latter happens because we either ran out of - * memory or were detached and therefore sending the SIGURG was - * deferred until this point. In either case we pass control - * over to tcp_fuse_rcv_drain() since it may need to complete - * some work. - */ - if ((tcp->tcp_fused || tcp->tcp_fused_sigurg)) { - ASSERT(IPCL_IS_NONSTR(tcp->tcp_connp) || - tcp->tcp_fused_sigurg_mp != NULL); - if (tcp_fuse_rcv_drain(q, tcp, tcp->tcp_fused ? 
NULL : - &tcp->tcp_fused_sigurg_mp)) - return (ret); - } - - while ((mp = tcp->tcp_rcv_list) != NULL) { - tcp->tcp_rcv_list = mp->b_next; - mp->b_next = NULL; -#ifdef DEBUG - cnt += msgdsize(mp); -#endif - /* Does this need SSL processing first? */ - if ((tcp->tcp_kssl_ctx != NULL) && (DB_TYPE(mp) == M_DATA)) { - DTRACE_PROBE1(kssl_mblk__ksslinput_rcvdrain, - mblk_t *, mp); - tcp_kssl_input(tcp, mp, NULL); - continue; - } - putnext(q, mp); - } -#ifdef DEBUG - ASSERT(cnt == tcp->tcp_rcv_cnt); -#endif - tcp->tcp_rcv_last_head = NULL; - tcp->tcp_rcv_last_tail = NULL; - tcp->tcp_rcv_cnt = 0; - - if (canputnext(q)) - return (tcp_rwnd_reopen(tcp)); - - return (ret); -} - -/* - * Queue data on tcp_rcv_list which is a b_next chain. - * tcp_rcv_last_head/tail is the last element of this chain. - * Each element of the chain is a b_cont chain. - * - * M_DATA messages are added to the current element. - * Other messages are added as new (b_next) elements. - */ -void -tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len, cred_t *cr) -{ - ASSERT(seg_len == msgdsize(mp)); - ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_rcv_last_head != NULL); - - if (is_system_labeled()) { - ASSERT(cr != NULL || msg_getcred(mp, NULL) != NULL); - /* - * Provide for protocols above TCP such as RPC. NOPID leaves - * db_cpid unchanged. - * The cred could have already been set. - */ - if (cr != NULL) - mblk_setcred(mp, cr, NOPID); - } - - if (tcp->tcp_rcv_list == NULL) { - ASSERT(tcp->tcp_rcv_last_head == NULL); - tcp->tcp_rcv_list = mp; - tcp->tcp_rcv_last_head = mp; - } else if (DB_TYPE(mp) == DB_TYPE(tcp->tcp_rcv_last_head)) { - tcp->tcp_rcv_last_tail->b_cont = mp; - } else { - tcp->tcp_rcv_last_head->b_next = mp; - tcp->tcp_rcv_last_head = mp; - } - - while (mp->b_cont) - mp = mp->b_cont; - - tcp->tcp_rcv_last_tail = mp; - tcp->tcp_rcv_cnt += seg_len; - tcp->tcp_rwnd -= seg_len; -} - -/* The minimum of smoothed mean deviation in RTO calculation. */ -#define TCP_SD_MIN 400 - -/* - * Set RTO for this connection. The formula is from Jacobson and Karels' - * "Congestion Avoidance and Control" in SIGCOMM '88. The variable names - * are the same as those in Appendix A.2 of that paper. - * - * m = new measurement - * sa = smoothed RTT average (8 * average estimates). - * sv = smoothed mean deviation (mdev) of RTT (4 * deviation estimates). - */ -static void -tcp_set_rto(tcp_t *tcp, clock_t rtt) -{ - long m = TICK_TO_MSEC(rtt); - clock_t sa = tcp->tcp_rtt_sa; - clock_t sv = tcp->tcp_rtt_sd; - clock_t rto; - tcp_stack_t *tcps = tcp->tcp_tcps; - - BUMP_MIB(&tcps->tcps_mib, tcpRttUpdate); - tcp->tcp_rtt_update++; - - /* tcp_rtt_sa is not 0 means this is a new sample. */ - if (sa != 0) { - /* - * Update average estimator: - * new rtt = 7/8 old rtt + 1/8 Error - */ - - /* m is now Error in estimate. */ - m -= sa >> 3; - if ((sa += m) <= 0) { - /* - * Don't allow the smoothed average to be negative. - * We use 0 to denote reinitialization of the - * variables. - */ - sa = 1; - } - - /* - * Update deviation estimator: - * new mdev = 3/4 old mdev + 1/4 (abs(Error) - old mdev) - */ - if (m < 0) - m = -m; - m -= sv >> 2; - sv += m; - } else { - /* - * This follows BSD's implementation. So the reinitialized - * RTO is 3 * m. We cannot go less than 2 because if the - * link is bandwidth dominated, doubling the window size - * during slow start means doubling the RTT. We want to be - * more conservative when we reinitialize our estimates. 3 - * is just a convenient number. 
- */ - sa = m << 3; - sv = m << 1; - } - if (sv < TCP_SD_MIN) { - /* - * We do not know that if sa captures the delay ACK - * effect as in a long train of segments, a receiver - * does not delay its ACKs. So set the minimum of sv - * to be TCP_SD_MIN, which is default to 400 ms, twice - * of BSD DATO. That means the minimum of mean - * deviation is 100 ms. - * - */ - sv = TCP_SD_MIN; - } - tcp->tcp_rtt_sa = sa; - tcp->tcp_rtt_sd = sv; - /* - * RTO = average estimates (sa / 8) + 4 * deviation estimates (sv) - * - * Add tcp_rexmit_interval extra in case of extreme environment - * where the algorithm fails to work. The default value of - * tcp_rexmit_interval_extra should be 0. - * - * As we use a finer grained clock than BSD and update - * RTO for every ACKs, add in another .25 of RTT to the - * deviation of RTO to accomodate burstiness of 1/4 of - * window size. - */ - rto = (sa >> 3) + sv + tcps->tcps_rexmit_interval_extra + (sa >> 5); - - if (rto > tcps->tcps_rexmit_interval_max) { - tcp->tcp_rto = tcps->tcps_rexmit_interval_max; - } else if (rto < tcps->tcps_rexmit_interval_min) { - tcp->tcp_rto = tcps->tcps_rexmit_interval_min; - } else { - tcp->tcp_rto = rto; - } - - /* Now, we can reset tcp_timer_backoff to use the new RTO... */ - tcp->tcp_timer_backoff = 0; -} - -/* - * tcp_get_seg_mp() is called to get the pointer to a segment in the - * send queue which starts at the given sequence number. If the given - * sequence number is equal to last valid sequence number (tcp_snxt), the - * returned mblk is the last valid mblk, and off is set to the length of - * that mblk. - * - * send queue which starts at the given seq. no. - * - * Parameters: - * tcp_t *tcp: the tcp instance pointer. - * uint32_t seq: the starting seq. no of the requested segment. - * int32_t *off: after the execution, *off will be the offset to - * the returned mblk which points to the requested seq no. - * It is the caller's responsibility to send in a non-null off. - * - * Return: - * A mblk_t pointer pointing to the requested segment in send queue. - */ -static mblk_t * -tcp_get_seg_mp(tcp_t *tcp, uint32_t seq, int32_t *off) -{ - int32_t cnt; - mblk_t *mp; - - /* Defensive coding. Make sure we don't send incorrect data. */ - if (SEQ_LT(seq, tcp->tcp_suna) || SEQ_GT(seq, tcp->tcp_snxt)) - return (NULL); - - cnt = seq - tcp->tcp_suna; - mp = tcp->tcp_xmit_head; - while (cnt > 0 && mp != NULL) { - cnt -= mp->b_wptr - mp->b_rptr; - if (cnt <= 0) { - cnt += mp->b_wptr - mp->b_rptr; - break; - } - mp = mp->b_cont; - } - ASSERT(mp != NULL); - *off = cnt; - return (mp); -} - -/* - * This function handles all retransmissions if SACK is enabled for this - * connection. First it calculates how many segments can be retransmitted - * based on tcp_pipe. Then it goes thru the notsack list to find eligible - * segments. A segment is eligible if sack_cnt for that segment is greater - * than or equal tcp_dupack_fast_retransmit. After it has retransmitted - * all eligible segments, it checks to see if TCP can send some new segments - * (fast recovery). If it can, set the appropriate flag for tcp_input_data(). - * - * Parameters: - * tcp_t *tcp: the tcp structure of the connection. - * uint_t *flags: in return, appropriate value will be set for - * tcp_input_data(). 
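Stripped of the stack tunables and the TCP_SD_MIN floor, the estimator in tcp_set_rto() above is the classic scaled Jacobson/Karels update (sa holds 8x the smoothed RTT, sv holds 4x the mean deviation); a minimal sketch in milliseconds, names illustrative:

#include <stdint.h>

/* Sketch only: update scaled srtt/mdev with sample m; rto = srtt + 4*mdev. */
static void
rtt_update(long m, long *sa, long *sv, long *rto)
{
        if (*sa != 0) {
                m -= *sa >> 3;          /* error against the average */
                if ((*sa += m) <= 0)
                        *sa = 1;
                if (m < 0)
                        m = -m;
                m -= *sv >> 2;
                *sv += m;
        } else {
                *sa = m << 3;           /* first sample: srtt = m */
                *sv = m << 1;           /* ... so rto comes out as 3*m */
        }
        *rto = (*sa >> 3) + *sv;
}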
- */ -static void -tcp_sack_rxmit(tcp_t *tcp, uint_t *flags) -{ - notsack_blk_t *notsack_blk; - int32_t usable_swnd; - int32_t mss; - uint32_t seg_len; - mblk_t *xmit_mp; - tcp_stack_t *tcps = tcp->tcp_tcps; - - ASSERT(tcp->tcp_sack_info != NULL); - ASSERT(tcp->tcp_notsack_list != NULL); - ASSERT(tcp->tcp_rexmit == B_FALSE); - - /* Defensive coding in case there is a bug... */ - if (tcp->tcp_notsack_list == NULL) { - return; - } - notsack_blk = tcp->tcp_notsack_list; - mss = tcp->tcp_mss; - - /* - * Limit the num of outstanding data in the network to be - * tcp_cwnd_ssthresh, which is half of the original congestion wnd. - */ - usable_swnd = tcp->tcp_cwnd_ssthresh - tcp->tcp_pipe; - - /* At least retransmit 1 MSS of data. */ - if (usable_swnd <= 0) { - usable_swnd = mss; - } - - /* Make sure no new RTT samples will be taken. */ - tcp->tcp_csuna = tcp->tcp_snxt; - - notsack_blk = tcp->tcp_notsack_list; - while (usable_swnd > 0) { - mblk_t *snxt_mp, *tmp_mp; - tcp_seq begin = tcp->tcp_sack_snxt; - tcp_seq end; - int32_t off; - - for (; notsack_blk != NULL; notsack_blk = notsack_blk->next) { - if (SEQ_GT(notsack_blk->end, begin) && - (notsack_blk->sack_cnt >= - tcps->tcps_dupack_fast_retransmit)) { - end = notsack_blk->end; - if (SEQ_LT(begin, notsack_blk->begin)) { - begin = notsack_blk->begin; - } - break; - } - } - /* - * All holes are filled. Manipulate tcp_cwnd to send more - * if we can. Note that after the SACK recovery, tcp_cwnd is - * set to tcp_cwnd_ssthresh. - */ - if (notsack_blk == NULL) { - usable_swnd = tcp->tcp_cwnd_ssthresh - tcp->tcp_pipe; - if (usable_swnd <= 0 || tcp->tcp_unsent == 0) { - tcp->tcp_cwnd = tcp->tcp_snxt - tcp->tcp_suna; - ASSERT(tcp->tcp_cwnd > 0); - return; - } else { - usable_swnd = usable_swnd / mss; - tcp->tcp_cwnd = tcp->tcp_snxt - tcp->tcp_suna + - MAX(usable_swnd * mss, mss); - *flags |= TH_XMIT_NEEDED; - return; - } - } - - /* - * Note that we may send more than usable_swnd allows here - * because of round off, but no more than 1 MSS of data. - */ - seg_len = end - begin; - if (seg_len > mss) - seg_len = mss; - snxt_mp = tcp_get_seg_mp(tcp, begin, &off); - ASSERT(snxt_mp != NULL); - /* This should not happen. Defensive coding again... */ - if (snxt_mp == NULL) { - return; - } - - xmit_mp = tcp_xmit_mp(tcp, snxt_mp, seg_len, &off, - &tmp_mp, begin, B_TRUE, &seg_len, B_TRUE); - if (xmit_mp == NULL) - return; - - usable_swnd -= seg_len; - tcp->tcp_pipe += seg_len; - tcp->tcp_sack_snxt = begin + seg_len; - - tcp_send_data(tcp, xmit_mp); - - /* - * Update the send timestamp to avoid false retransmission. - */ - snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt(); - - BUMP_MIB(&tcps->tcps_mib, tcpRetransSegs); - UPDATE_MIB(&tcps->tcps_mib, tcpRetransBytes, seg_len); - BUMP_MIB(&tcps->tcps_mib, tcpOutSackRetransSegs); - /* - * Update tcp_rexmit_max to extend this SACK recovery phase. - * This happens when new data sent during fast recovery is - * also lost. If TCP retransmits those new data, it needs - * to extend SACK recover phase to avoid starting another - * fast retransmit/recovery unnecessarily. - */ - if (SEQ_GT(tcp->tcp_sack_snxt, tcp->tcp_rexmit_max)) { - tcp->tcp_rexmit_max = tcp->tcp_sack_snxt; - } - } -} - -/* - * tcp_ss_rexmit() is called to do slow start retransmission after a timeout - * or ICMP errors. - * - * To limit the number of duplicate segments, we limit the number of segment - * to be sent in one time to tcp_snd_burst, the burst variable. 
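The send budget the SACK retransmit loop above works with is a simple pipe calculation, clamped so recovery never stalls completely; a minimal sketch (illustrative name):

#include <stdint.h>

/*
 * Sketch only: bytes that may be (re)sent now during SACK recovery;
 * stay under ssthresh worth of data in flight, but always allow one MSS.
 */
static int32_t
sack_rxmit_budget(int32_t ssthresh, int32_t pipe, int32_t mss)
{
        int32_t usable = ssthresh - pipe;

        return (usable > 0 ? usable : mss);
}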
- */ -static void -tcp_ss_rexmit(tcp_t *tcp) -{ - uint32_t snxt; - uint32_t smax; - int32_t win; - int32_t mss; - int32_t off; - int32_t burst = tcp->tcp_snd_burst; - mblk_t *snxt_mp; - tcp_stack_t *tcps = tcp->tcp_tcps; - - /* - * Note that tcp_rexmit can be set even though TCP has retransmitted - * all unack'ed segments. - */ - if (SEQ_LT(tcp->tcp_rexmit_nxt, tcp->tcp_rexmit_max)) { - smax = tcp->tcp_rexmit_max; - snxt = tcp->tcp_rexmit_nxt; - if (SEQ_LT(snxt, tcp->tcp_suna)) { - snxt = tcp->tcp_suna; - } - win = MIN(tcp->tcp_cwnd, tcp->tcp_swnd); - win -= snxt - tcp->tcp_suna; - mss = tcp->tcp_mss; - snxt_mp = tcp_get_seg_mp(tcp, snxt, &off); - - while (SEQ_LT(snxt, smax) && (win > 0) && - (burst > 0) && (snxt_mp != NULL)) { - mblk_t *xmit_mp; - mblk_t *old_snxt_mp = snxt_mp; - uint32_t cnt = mss; - - if (win < cnt) { - cnt = win; - } - if (SEQ_GT(snxt + cnt, smax)) { - cnt = smax - snxt; - } - xmit_mp = tcp_xmit_mp(tcp, snxt_mp, cnt, &off, - &snxt_mp, snxt, B_TRUE, &cnt, B_TRUE); - if (xmit_mp == NULL) - return; - - tcp_send_data(tcp, xmit_mp); - - snxt += cnt; - win -= cnt; - /* - * Update the send timestamp to avoid false - * retransmission. - */ - old_snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt(); - BUMP_MIB(&tcps->tcps_mib, tcpRetransSegs); - UPDATE_MIB(&tcps->tcps_mib, tcpRetransBytes, cnt); - - tcp->tcp_rexmit_nxt = snxt; - burst--; - } - /* - * If we have transmitted all we have at the time - * we started the retranmission, we can leave - * the rest of the job to tcp_wput_data(). But we - * need to check the send window first. If the - * win is not 0, go on with tcp_wput_data(). - */ - if (SEQ_LT(snxt, smax) || win == 0) { - return; - } - } - /* Only call tcp_wput_data() if there is data to be sent. */ - if (tcp->tcp_unsent) { - tcp_wput_data(tcp, NULL, B_FALSE); - } -} - -/* - * Process all TCP option in SYN segment. Note that this function should - * be called after tcp_set_destination() is called so that the necessary info - * from IRE is already set in the tcp structure. - * - * This function sets up the correct tcp_mss value according to the - * MSS option value and our header size. It also sets up the window scale - * and timestamp values, and initialize SACK info blocks. But it does not - * change receive window size after setting the tcp_mss value. The caller - * should do the appropriate change. - */ -void -tcp_process_options(tcp_t *tcp, tcpha_t *tcpha) -{ - int options; - tcp_opt_t tcpopt; - uint32_t mss_max; - char *tmp_tcph; - tcp_stack_t *tcps = tcp->tcp_tcps; - conn_t *connp = tcp->tcp_connp; - - tcpopt.tcp = NULL; - options = tcp_parse_options(tcpha, &tcpopt); - - /* - * Process MSS option. Note that MSS option value does not account - * for IP or TCP options. This means that it is equal to MTU - minimum - * IP+TCP header size, which is 40 bytes for IPv4 and 60 bytes for - * IPv6. - */ - if (!(options & TCP_OPT_MSS_PRESENT)) { - if (connp->conn_ipversion == IPV4_VERSION) - tcpopt.tcp_opt_mss = tcps->tcps_mss_def_ipv4; - else - tcpopt.tcp_opt_mss = tcps->tcps_mss_def_ipv6; - } else { - if (connp->conn_ipversion == IPV4_VERSION) - mss_max = tcps->tcps_mss_max_ipv4; - else - mss_max = tcps->tcps_mss_max_ipv6; - if (tcpopt.tcp_opt_mss < tcps->tcps_mss_min) - tcpopt.tcp_opt_mss = tcps->tcps_mss_min; - else if (tcpopt.tcp_opt_mss > mss_max) - tcpopt.tcp_opt_mss = mss_max; - } - - /* Process Window Scale option. 
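The amount the slow-start retransmit loop above may send per invocation is bounded by both the usable window and the burst limit; a rough sketch of that bound, with illustrative names:

#include <stdint.h>

/*
 * Sketch only: approximate per-call retransmit budget, the smaller of the
 * window space left (min of cwnd and swnd, less data in flight) and the
 * burst limit expressed in bytes.
 */
static uint32_t
ss_rexmit_budget(uint32_t cwnd, uint32_t swnd, uint32_t snxt, uint32_t suna,
    uint32_t burst_segs, uint32_t mss)
{
        uint32_t win = (cwnd < swnd) ? cwnd : swnd;
        uint32_t inflight = snxt - suna;
        uint32_t by_window = (win > inflight) ? win - inflight : 0;
        uint32_t by_burst = burst_segs * mss;

        return ((by_window < by_burst) ? by_window : by_burst);
}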
*/ - if (options & TCP_OPT_WSCALE_PRESENT) { - tcp->tcp_snd_ws = tcpopt.tcp_opt_wscale; - tcp->tcp_snd_ws_ok = B_TRUE; - } else { - tcp->tcp_snd_ws = B_FALSE; - tcp->tcp_snd_ws_ok = B_FALSE; - tcp->tcp_rcv_ws = B_FALSE; - } - - /* Process Timestamp option. */ - if ((options & TCP_OPT_TSTAMP_PRESENT) && - (tcp->tcp_snd_ts_ok || TCP_IS_DETACHED(tcp))) { - tmp_tcph = (char *)tcp->tcp_tcpha; - - tcp->tcp_snd_ts_ok = B_TRUE; - tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val; - tcp->tcp_last_rcv_lbolt = ddi_get_lbolt64(); - ASSERT(OK_32PTR(tmp_tcph)); - ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH); - - /* Fill in our template header with basic timestamp option. */ - tmp_tcph += connp->conn_ht_ulp_len; - tmp_tcph[0] = TCPOPT_NOP; - tmp_tcph[1] = TCPOPT_NOP; - tmp_tcph[2] = TCPOPT_TSTAMP; - tmp_tcph[3] = TCPOPT_TSTAMP_LEN; - connp->conn_ht_iphc_len += TCPOPT_REAL_TS_LEN; - connp->conn_ht_ulp_len += TCPOPT_REAL_TS_LEN; - tcp->tcp_tcpha->tha_offset_and_reserved += (3 << 4); - } else { - tcp->tcp_snd_ts_ok = B_FALSE; - } - - /* - * Process SACK options. If SACK is enabled for this connection, - * then allocate the SACK info structure. Note the following ways - * when tcp_snd_sack_ok is set to true. - * - * For active connection: in tcp_set_destination() called in - * tcp_connect(). - * - * For passive connection: in tcp_set_destination() called in - * tcp_input_listener(). - * - * That's the reason why the extra TCP_IS_DETACHED() check is there. - * That check makes sure that if we did not send a SACK OK option, - * we will not enable SACK for this connection even though the other - * side sends us SACK OK option. For active connection, the SACK - * info structure has already been allocated. So we need to free - * it if SACK is disabled. - */ - if ((options & TCP_OPT_SACK_OK_PRESENT) && - (tcp->tcp_snd_sack_ok || - (tcps->tcps_sack_permitted != 0 && TCP_IS_DETACHED(tcp)))) { - /* This should be true only in the passive case. */ - if (tcp->tcp_sack_info == NULL) { - ASSERT(TCP_IS_DETACHED(tcp)); - tcp->tcp_sack_info = - kmem_cache_alloc(tcp_sack_info_cache, KM_NOSLEEP); - } - if (tcp->tcp_sack_info == NULL) { - tcp->tcp_snd_sack_ok = B_FALSE; - } else { - tcp->tcp_snd_sack_ok = B_TRUE; - if (tcp->tcp_snd_ts_ok) { - tcp->tcp_max_sack_blk = 3; - } else { - tcp->tcp_max_sack_blk = 4; - } - } - } else { - /* - * Resetting tcp_snd_sack_ok to B_FALSE so that - * no SACK info will be used for this - * connection. This assumes that SACK usage - * permission is negotiated. This may need - * to be changed once this is clarified. - */ - if (tcp->tcp_sack_info != NULL) { - ASSERT(tcp->tcp_notsack_list == NULL); - kmem_cache_free(tcp_sack_info_cache, - tcp->tcp_sack_info); - tcp->tcp_sack_info = NULL; - } - tcp->tcp_snd_sack_ok = B_FALSE; - } - - /* - * Now we know the exact TCP/IP header length, subtract - * that from tcp_mss to get our side's MSS. - */ - tcp->tcp_mss -= connp->conn_ht_iphc_len; - - /* - * Here we assume that the other side's header size will be equal to - * our header size. We calculate the real MSS accordingly. Need to - * take into additional stuffs IPsec puts in. - * - * Real MSS = Opt.MSS - (our TCP/IP header - min TCP/IP header) - */ - tcpopt.tcp_opt_mss -= connp->conn_ht_iphc_len + - tcp->tcp_ipsec_overhead - - ((connp->conn_ipversion == IPV4_VERSION ? - IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) + TCP_MIN_HEADER_LENGTH); - - /* - * Set MSS to the smaller one of both ends of the connection. 
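As a worked example of the adjustment above: with IPv4, the timestamp option negotiated, and no IPsec overhead, our header template is 20 + 20 + 12 = 52 bytes, so a peer MSS option of 1460 yields 1460 - (52 - 40) = 1448 bytes of payload per segment. A minimal sketch:

#include <stdint.h>

/* Sketch only: Real MSS = peer MSS option - (our headers - minimal headers). */
static uint32_t
effective_mss(uint32_t opt_mss, uint32_t our_hdr_len, uint32_t min_hdr_len)
{
        return (opt_mss - (our_hdr_len - min_hdr_len));
}

/* effective_mss(1460, 52, 40) == 1448 */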
- * We should not have called tcp_mss_set() before, but our - * side of the MSS should have been set to a proper value - * by tcp_set_destination(). tcp_mss_set() will also set up the - * STREAM head parameters properly. - * - * If we have a larger-than-16-bit window but the other side - * didn't want to do window scale, tcp_rwnd_set() will take - * care of that. - */ - tcp_mss_set(tcp, MIN(tcpopt.tcp_opt_mss, tcp->tcp_mss)); - - /* - * Initialize tcp_cwnd value. After tcp_mss_set(), tcp_mss has been - * updated properly. - */ - SET_TCP_INIT_CWND(tcp, tcp->tcp_mss, tcps->tcps_slow_start_initial); -} - -/* - * Sends the T_CONN_IND to the listener. The caller calls this - * functions via squeue to get inside the listener's perimeter - * once the 3 way hand shake is done a T_CONN_IND needs to be - * sent. As an optimization, the caller can call this directly - * if listener's perimeter is same as eager's. - */ -/* ARGSUSED */ -void -tcp_send_conn_ind(void *arg, mblk_t *mp, void *arg2) -{ - conn_t *lconnp = (conn_t *)arg; - tcp_t *listener = lconnp->conn_tcp; - tcp_t *tcp; - struct T_conn_ind *conn_ind; - ipaddr_t *addr_cache; - boolean_t need_send_conn_ind = B_FALSE; - tcp_stack_t *tcps = listener->tcp_tcps; - - /* retrieve the eager */ - conn_ind = (struct T_conn_ind *)mp->b_rptr; - ASSERT(conn_ind->OPT_offset != 0 && - conn_ind->OPT_length == sizeof (intptr_t)); - bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp, - conn_ind->OPT_length); - - /* - * TLI/XTI applications will get confused by - * sending eager as an option since it violates - * the option semantics. So remove the eager as - * option since TLI/XTI app doesn't need it anyway. - */ - if (!TCP_IS_SOCKET(listener)) { - conn_ind->OPT_length = 0; - conn_ind->OPT_offset = 0; - } - if (listener->tcp_state != TCPS_LISTEN) { - /* - * If listener has closed, it would have caused a - * a cleanup/blowoff to happen for the eager. We - * just need to return. - */ - freemsg(mp); - return; - } - - - /* - * if the conn_req_q is full defer passing up the - * T_CONN_IND until space is availabe after t_accept() - * processing - */ - mutex_enter(&listener->tcp_eager_lock); - - /* - * Take the eager out, if it is in the list of droppable eagers - * as we are here because the 3W handshake is over. - */ - MAKE_UNDROPPABLE(tcp); - - if (listener->tcp_conn_req_cnt_q < listener->tcp_conn_req_max) { - tcp_t *tail; - - /* - * The eager already has an extra ref put in tcp_input_data - * so that it stays till accept comes back even though it - * might get into TCPS_CLOSED as a result of a TH_RST etc. - */ - ASSERT(listener->tcp_conn_req_cnt_q0 > 0); - listener->tcp_conn_req_cnt_q0--; - listener->tcp_conn_req_cnt_q++; - - /* Move from SYN_RCVD to ESTABLISHED list */ - tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = - tcp->tcp_eager_prev_q0; - tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = - tcp->tcp_eager_next_q0; - tcp->tcp_eager_prev_q0 = NULL; - tcp->tcp_eager_next_q0 = NULL; - - /* - * Insert at end of the queue because sockfs - * sends down T_CONN_RES in chronological - * order. Leaving the older conn indications - * at front of the queue helps reducing search - * time. - */ - tail = listener->tcp_eager_last_q; - if (tail != NULL) - tail->tcp_eager_next_q = tcp; - else - listener->tcp_eager_next_q = tcp; - listener->tcp_eager_last_q = tcp; - tcp->tcp_eager_next_q = NULL; - /* - * Delay sending up the T_conn_ind until we are - * done with the eager. 
Once we have have sent up - * the T_conn_ind, the accept can potentially complete - * any time and release the refhold we have on the eager. - */ - need_send_conn_ind = B_TRUE; - } else { - /* - * Defer connection on q0 and set deferred - * connection bit true - */ - tcp->tcp_conn_def_q0 = B_TRUE; - - /* take tcp out of q0 ... */ - tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = - tcp->tcp_eager_next_q0; - tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = - tcp->tcp_eager_prev_q0; - - /* ... and place it at the end of q0 */ - tcp->tcp_eager_prev_q0 = listener->tcp_eager_prev_q0; - tcp->tcp_eager_next_q0 = listener; - listener->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp; - listener->tcp_eager_prev_q0 = tcp; - tcp->tcp_conn.tcp_eager_conn_ind = mp; - } - - /* we have timed out before */ - if (tcp->tcp_syn_rcvd_timeout != 0) { - tcp->tcp_syn_rcvd_timeout = 0; - listener->tcp_syn_rcvd_timeout--; - if (listener->tcp_syn_defense && - listener->tcp_syn_rcvd_timeout <= - (tcps->tcps_conn_req_max_q0 >> 5) && - 10*MINUTES < TICK_TO_MSEC(ddi_get_lbolt64() - - listener->tcp_last_rcv_lbolt)) { - /* - * Turn off the defense mode if we - * believe the SYN attack is over. - */ - listener->tcp_syn_defense = B_FALSE; - if (listener->tcp_ip_addr_cache) { - kmem_free((void *)listener->tcp_ip_addr_cache, - IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t)); - listener->tcp_ip_addr_cache = NULL; - } - } - } - addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache); - if (addr_cache != NULL) { - /* - * We have finished a 3-way handshake with this - * remote host. This proves the IP addr is good. - * Cache it! - */ - addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] = - tcp->tcp_connp->conn_faddr_v4; - } - mutex_exit(&listener->tcp_eager_lock); - if (need_send_conn_ind) - tcp_ulp_newconn(lconnp, tcp->tcp_connp, mp); -} - -/* - * Send the newconn notification to ulp. The eager is blown off if the - * notification fails. - */ -static void -tcp_ulp_newconn(conn_t *lconnp, conn_t *econnp, mblk_t *mp) -{ - if (IPCL_IS_NONSTR(lconnp)) { - cred_t *cr; - pid_t cpid = NOPID; - - ASSERT(econnp->conn_tcp->tcp_listener == lconnp->conn_tcp); - ASSERT(econnp->conn_tcp->tcp_saved_listener == - lconnp->conn_tcp); - - cr = msg_getcred(mp, &cpid); - - /* Keep the message around in case of a fallback to TPI */ - econnp->conn_tcp->tcp_conn.tcp_eager_conn_ind = mp; - /* - * Notify the ULP about the newconn. It is guaranteed that no - * tcp_accept() call will be made for the eager if the - * notification fails, so it's safe to blow it off in that - * case. - * - * The upper handle will be assigned when tcp_accept() is - * called. - */ - if ((*lconnp->conn_upcalls->su_newconn) - (lconnp->conn_upper_handle, - (sock_lower_handle_t)econnp, - &sock_tcp_downcalls, cr, cpid, - &econnp->conn_upcalls) == NULL) { - /* Failed to allocate a socket */ - BUMP_MIB(&lconnp->conn_tcp->tcp_tcps->tcps_mib, - tcpEstabResets); - (void) tcp_eager_blowoff(lconnp->conn_tcp, - econnp->conn_tcp->tcp_conn_req_seqnum); - } - } else { - putnext(lconnp->conn_rq, mp); - } -} - -/* - * Handle a packet that has been reclassified by TCP. - * This function drops the ref on connp that the caller had. 
- */ -static void -tcp_reinput(conn_t *connp, mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst) -{ - ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; - - if (connp->conn_incoming_ifindex != 0 && - connp->conn_incoming_ifindex != ira->ira_ruifindex) { - freemsg(mp); - CONN_DEC_REF(connp); - return; - } - - if (CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss) || - (ira->ira_flags & IRAF_IPSEC_SECURE)) { - ip6_t *ip6h; - ipha_t *ipha; - - if (ira->ira_flags & IRAF_IS_IPV4) { - ipha = (ipha_t *)mp->b_rptr; - ip6h = NULL; - } else { - ipha = NULL; - ip6h = (ip6_t *)mp->b_rptr; - } - mp = ipsec_check_inbound_policy(mp, connp, ipha, ip6h, ira); - if (mp == NULL) { - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards); - /* Note that mp is NULL */ - ip_drop_input("ipIfStatsInDiscards", mp, NULL); - CONN_DEC_REF(connp); - return; - } - } - - if (IPCL_IS_TCP(connp)) { - /* - * do not drain, certain use cases can blow - * the stack - */ - SQUEUE_ENTER_ONE(connp->conn_sqp, mp, - connp->conn_recv, connp, ira, - SQ_NODRAIN, SQTAG_IP_TCP_INPUT); - } else { - /* Not TCP; must be SOCK_RAW, IPPROTO_TCP */ - (connp->conn_recv)(connp, mp, NULL, - ira); - CONN_DEC_REF(connp); - } - -} - -boolean_t tcp_outbound_squeue_switch = B_FALSE; - -/* - * Handle M_DATA messages from IP. Its called directly from IP via - * squeue for received IP packets. - * - * The first argument is always the connp/tcp to which the mp belongs. - * There are no exceptions to this rule. The caller has already put - * a reference on this connp/tcp and once tcp_input_data() returns, - * the squeue will do the refrele. - * - * The TH_SYN for the listener directly go to tcp_input_listener via - * squeue. ICMP errors go directly to tcp_icmp_input(). - * - * sqp: NULL = recursive, sqp != NULL means called from squeue - */ -void -tcp_input_data(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) -{ - int32_t bytes_acked; - int32_t gap; - mblk_t *mp1; - uint_t flags; - uint32_t new_swnd = 0; - uchar_t *iphdr; - uchar_t *rptr; - int32_t rgap; - uint32_t seg_ack; - int seg_len; - uint_t ip_hdr_len; - uint32_t seg_seq; - tcpha_t *tcpha; - int urp; - tcp_opt_t tcpopt; - ip_pkt_t ipp; - boolean_t ofo_seg = B_FALSE; /* Out of order segment */ - uint32_t cwnd; - uint32_t add; - int npkt; - int mss; - conn_t *connp = (conn_t *)arg; - squeue_t *sqp = (squeue_t *)arg2; - tcp_t *tcp = connp->conn_tcp; - tcp_stack_t *tcps = tcp->tcp_tcps; - - /* - * RST from fused tcp loopback peer should trigger an unfuse. - */ - if (tcp->tcp_fused) { - TCP_STAT(tcps, tcp_fusion_aborted); - tcp_unfuse(tcp); - } - - iphdr = mp->b_rptr; - rptr = mp->b_rptr; - ASSERT(OK_32PTR(rptr)); - - ip_hdr_len = ira->ira_ip_hdr_length; - if (connp->conn_recv_ancillary.crb_all != 0) { - /* - * Record packet information in the ip_pkt_t - */ - ipp.ipp_fields = 0; - if (ira->ira_flags & IRAF_IS_IPV4) { - (void) ip_find_hdr_v4((ipha_t *)rptr, &ipp, - B_FALSE); - } else { - uint8_t nexthdrp; - - /* - * IPv6 packets can only be received by applications - * that are prepared to receive IPv6 addresses. - * The IP fanout must ensure this. - */ - ASSERT(connp->conn_family == AF_INET6); - - (void) ip_find_hdr_v6(mp, (ip6_t *)rptr, B_TRUE, &ipp, - &nexthdrp); - ASSERT(nexthdrp == IPPROTO_TCP); - - /* Could have caused a pullup? 
*/ - iphdr = mp->b_rptr; - rptr = mp->b_rptr; - } - } - ASSERT(DB_TYPE(mp) == M_DATA); - ASSERT(mp->b_next == NULL); - - tcpha = (tcpha_t *)&rptr[ip_hdr_len]; - seg_seq = ntohl(tcpha->tha_seq); - seg_ack = ntohl(tcpha->tha_ack); - ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX); - seg_len = (int)(mp->b_wptr - rptr) - - (ip_hdr_len + TCP_HDR_LENGTH(tcpha)); - if ((mp1 = mp->b_cont) != NULL && mp1->b_datap->db_type == M_DATA) { - do { - ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <= - (uintptr_t)INT_MAX); - seg_len += (int)(mp1->b_wptr - mp1->b_rptr); - } while ((mp1 = mp1->b_cont) != NULL && - mp1->b_datap->db_type == M_DATA); - } - - if (tcp->tcp_state == TCPS_TIME_WAIT) { - tcp_time_wait_processing(tcp, mp, seg_seq, seg_ack, - seg_len, tcpha, ira); - return; - } - - if (sqp != NULL) { - /* - * This is the correct place to update tcp_last_recv_time. Note - * that it is also updated for tcp structure that belongs to - * global and listener queues which do not really need updating. - * But that should not cause any harm. And it is updated for - * all kinds of incoming segments, not only for data segments. - */ - tcp->tcp_last_recv_time = LBOLT_FASTPATH; - } - - flags = (unsigned int)tcpha->tha_flags & 0xFF; - - BUMP_LOCAL(tcp->tcp_ibsegs); - DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp); - - if ((flags & TH_URG) && sqp != NULL) { - /* - * TCP can't handle urgent pointers that arrive before - * the connection has been accept()ed since it can't - * buffer OOB data. Discard segment if this happens. - * - * We can't just rely on a non-null tcp_listener to indicate - * that the accept() has completed since unlinking of the - * eager and completion of the accept are not atomic. - * tcp_detached, when it is not set (B_FALSE) indicates - * that the accept() has completed. - * - * Nor can it reassemble urgent pointers, so discard - * if it's not the next segment expected. - * - * Otherwise, collapse chain into one mblk (discard if - * that fails). This makes sure the headers, retransmitted - * data, and new data all are in the same mblk. - */ - ASSERT(mp != NULL); - if (tcp->tcp_detached || !pullupmsg(mp, -1)) { - freemsg(mp); - return; - } - /* Update pointers into message */ - iphdr = rptr = mp->b_rptr; - tcpha = (tcpha_t *)&rptr[ip_hdr_len]; - if (SEQ_GT(seg_seq, tcp->tcp_rnxt)) { - /* - * Since we can't handle any data with this urgent - * pointer that is out of sequence, we expunge - * the data. This allows us to still register - * the urgent mark and generate the M_PCSIG, - * which we can do. - */ - mp->b_wptr = (uchar_t *)tcpha + TCP_HDR_LENGTH(tcpha); - seg_len = 0; - } - } - - switch (tcp->tcp_state) { - case TCPS_SYN_SENT: - if (connp->conn_final_sqp == NULL && - tcp_outbound_squeue_switch && sqp != NULL) { - ASSERT(connp->conn_initial_sqp == connp->conn_sqp); - connp->conn_final_sqp = sqp; - if (connp->conn_final_sqp != connp->conn_sqp) { - DTRACE_PROBE1(conn__final__sqp__switch, - conn_t *, connp); - CONN_INC_REF(connp); - SQUEUE_SWITCH(connp, connp->conn_final_sqp); - SQUEUE_ENTER_ONE(connp->conn_sqp, mp, - tcp_input_data, connp, ira, ip_squeue_flag, - SQTAG_CONNECT_FINISH); - return; - } - DTRACE_PROBE1(conn__final__sqp__same, conn_t *, connp); - } - if (flags & TH_ACK) { - /* - * Note that our stack cannot send data before a - * connection is established, therefore the - * following check is valid. Otherwise, it has - * to be changed. 
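The validity check described above reduces to two serial-number comparisons: in SYN_SENT an acceptable ACK must cover the ISS and nothing beyond what has been sent. A minimal sketch with the comparisons written out (illustrative name):

#include <stdint.h>

/*
 * Sketch only: acceptable ACK in SYN_SENT; anything at or below the ISS,
 * or beyond snd_nxt, is answered with a reset by the caller.
 */
static int
synsent_ack_ok(uint32_t seg_ack, uint32_t iss, uint32_t snd_nxt)
{
        if ((int32_t)(seg_ack - iss) <= 0 ||    /* SEQ_LEQ(ack, iss) */
            (int32_t)(seg_ack - snd_nxt) > 0)   /* SEQ_GT(ack, snxt) */
                return (0);
        return (1);
}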
- */ - if (SEQ_LEQ(seg_ack, tcp->tcp_iss) || - SEQ_GT(seg_ack, tcp->tcp_snxt)) { - freemsg(mp); - if (flags & TH_RST) - return; - tcp_xmit_ctl("TCPS_SYN_SENT-Bad_seq", - tcp, seg_ack, 0, TH_RST); - return; - } - ASSERT(tcp->tcp_suna + 1 == seg_ack); - } - if (flags & TH_RST) { - freemsg(mp); - if (flags & TH_ACK) - (void) tcp_clean_death(tcp, - ECONNREFUSED, 13); - return; - } - if (!(flags & TH_SYN)) { - freemsg(mp); - return; - } - - /* Process all TCP options. */ - tcp_process_options(tcp, tcpha); - /* - * The following changes our rwnd to be a multiple of the - * MIN(peer MSS, our MSS) for performance reason. - */ - (void) tcp_rwnd_set(tcp, MSS_ROUNDUP(connp->conn_rcvbuf, - tcp->tcp_mss)); - - /* Is the other end ECN capable? */ - if (tcp->tcp_ecn_ok) { - if ((flags & (TH_ECE|TH_CWR)) != TH_ECE) { - tcp->tcp_ecn_ok = B_FALSE; - } - } - /* - * Clear ECN flags because it may interfere with later - * processing. - */ - flags &= ~(TH_ECE|TH_CWR); - - tcp->tcp_irs = seg_seq; - tcp->tcp_rack = seg_seq; - tcp->tcp_rnxt = seg_seq + 1; - tcp->tcp_tcpha->tha_ack = htonl(tcp->tcp_rnxt); - if (!TCP_IS_DETACHED(tcp)) { - /* Allocate room for SACK options if needed. */ - connp->conn_wroff = connp->conn_ht_iphc_len; - if (tcp->tcp_snd_sack_ok) - connp->conn_wroff += TCPOPT_MAX_SACK_LEN; - if (!tcp->tcp_loopback) - connp->conn_wroff += tcps->tcps_wroff_xtra; - - (void) proto_set_tx_wroff(connp->conn_rq, connp, - connp->conn_wroff); - } - if (flags & TH_ACK) { - /* - * If we can't get the confirmation upstream, pretend - * we didn't even see this one. - * - * XXX: how can we pretend we didn't see it if we - * have updated rnxt et. al. - * - * For loopback we defer sending up the T_CONN_CON - * until after some checks below. - */ - mp1 = NULL; - /* - * tcp_sendmsg() checks tcp_state without entering - * the squeue so tcp_state should be updated before - * sending up connection confirmation - */ - tcp->tcp_state = TCPS_ESTABLISHED; - if (!tcp_conn_con(tcp, iphdr, mp, - tcp->tcp_loopback ? &mp1 : NULL, ira)) { - tcp->tcp_state = TCPS_SYN_SENT; - freemsg(mp); - return; - } - /* SYN was acked - making progress */ - tcp->tcp_ip_forward_progress = B_TRUE; - - /* One for the SYN */ - tcp->tcp_suna = tcp->tcp_iss + 1; - tcp->tcp_valid_bits &= ~TCP_ISS_VALID; - - /* - * If SYN was retransmitted, need to reset all - * retransmission info. This is because this - * segment will be treated as a dup ACK. - */ - if (tcp->tcp_rexmit) { - tcp->tcp_rexmit = B_FALSE; - tcp->tcp_rexmit_nxt = tcp->tcp_snxt; - tcp->tcp_rexmit_max = tcp->tcp_snxt; - tcp->tcp_snd_burst = tcp->tcp_localnet ? - TCP_CWND_INFINITE : TCP_CWND_NORMAL; - tcp->tcp_ms_we_have_waited = 0; - - /* - * Set tcp_cwnd back to 1 MSS, per - * recommendation from - * draft-floyd-incr-init-win-01.txt, - * Increasing TCP's Initial Window. - */ - tcp->tcp_cwnd = tcp->tcp_mss; - } - - tcp->tcp_swl1 = seg_seq; - tcp->tcp_swl2 = seg_ack; - - new_swnd = ntohs(tcpha->tha_win); - tcp->tcp_swnd = new_swnd; - if (new_swnd > tcp->tcp_max_swnd) - tcp->tcp_max_swnd = new_swnd; - - /* - * Always send the three-way handshake ack immediately - * in order to make the connection complete as soon as - * possible on the accepting host. - */ - flags |= TH_ACK_NEEDED; - - /* - * Special case for loopback. At this point we have - * received SYN-ACK from the remote endpoint. In - * order to ensure that both endpoints reach the - * fused state prior to any data exchange, the final - * ACK needs to be sent before we indicate T_CONN_CON - * to the module upstream. 
- */ - if (tcp->tcp_loopback) { - mblk_t *ack_mp; - - ASSERT(!tcp->tcp_unfusable); - ASSERT(mp1 != NULL); - /* - * For loopback, we always get a pure SYN-ACK - * and only need to send back the final ACK - * with no data (this is because the other - * tcp is ours and we don't do T/TCP). This - * final ACK triggers the passive side to - * perform fusion in ESTABLISHED state. - */ - if ((ack_mp = tcp_ack_mp(tcp)) != NULL) { - if (tcp->tcp_ack_tid != 0) { - (void) TCP_TIMER_CANCEL(tcp, - tcp->tcp_ack_tid); - tcp->tcp_ack_tid = 0; - } - tcp_send_data(tcp, ack_mp); - BUMP_LOCAL(tcp->tcp_obsegs); - BUMP_MIB(&tcps->tcps_mib, tcpOutAck); - - if (!IPCL_IS_NONSTR(connp)) { - /* Send up T_CONN_CON */ - if (ira->ira_cred != NULL) { - mblk_setcred(mp1, - ira->ira_cred, - ira->ira_cpid); - } - putnext(connp->conn_rq, mp1); - } else { - (*connp->conn_upcalls-> - su_connected) - (connp->conn_upper_handle, - tcp->tcp_connid, - ira->ira_cred, - ira->ira_cpid); - freemsg(mp1); - } - - freemsg(mp); - return; - } - /* - * Forget fusion; we need to handle more - * complex cases below. Send the deferred - * T_CONN_CON message upstream and proceed - * as usual. Mark this tcp as not capable - * of fusion. - */ - TCP_STAT(tcps, tcp_fusion_unfusable); - tcp->tcp_unfusable = B_TRUE; - if (!IPCL_IS_NONSTR(connp)) { - if (ira->ira_cred != NULL) { - mblk_setcred(mp1, ira->ira_cred, - ira->ira_cpid); - } - putnext(connp->conn_rq, mp1); - } else { - (*connp->conn_upcalls->su_connected) - (connp->conn_upper_handle, - tcp->tcp_connid, ira->ira_cred, - ira->ira_cpid); - freemsg(mp1); - } - } - - /* - * Check to see if there is data to be sent. If - * yes, set the transmit flag. Then check to see - * if received data processing needs to be done. - * If not, go straight to xmit_check. This short - * cut is OK as we don't support T/TCP. - */ - if (tcp->tcp_unsent) - flags |= TH_XMIT_NEEDED; - - if (seg_len == 0 && !(flags & TH_URG)) { - freemsg(mp); - goto xmit_check; - } - - flags &= ~TH_SYN; - seg_seq++; - break; - } - tcp->tcp_state = TCPS_SYN_RCVD; - mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, tcp->tcp_mss, - NULL, NULL, tcp->tcp_iss, B_FALSE, NULL, B_FALSE); - if (mp1 != NULL) { - tcp_send_data(tcp, mp1); - TCP_TIMER_RESTART(tcp, tcp->tcp_rto); - } - freemsg(mp); - return; - case TCPS_SYN_RCVD: - if (flags & TH_ACK) { - /* - * In this state, a SYN|ACK packet is either bogus - * because the other side must be ACKing our SYN which - * indicates it has seen the ACK for their SYN and - * shouldn't retransmit it or we're crossing SYNs - * on active open. - */ - if ((flags & TH_SYN) && !tcp->tcp_active_open) { - freemsg(mp); - tcp_xmit_ctl("TCPS_SYN_RCVD-bad_syn", - tcp, seg_ack, 0, TH_RST); - return; - } - /* - * NOTE: RFC 793 pg. 72 says this should be - * tcp->tcp_suna <= seg_ack <= tcp->tcp_snxt - * but that would mean we have an ack that ignored - * our SYN. - */ - if (SEQ_LEQ(seg_ack, tcp->tcp_suna) || - SEQ_GT(seg_ack, tcp->tcp_snxt)) { - freemsg(mp); - tcp_xmit_ctl("TCPS_SYN_RCVD-bad_ack", - tcp, seg_ack, 0, TH_RST); - return; - } - /* - * No sane TCP stack will send such a small window - * without receiving any data. Just drop this invalid - * ACK. We also shorten the abort timeout in case - * this is an attack. 
- */ - if ((ntohs(tcpha->tha_win) << tcp->tcp_snd_ws) < - (tcp->tcp_mss >> tcp_init_wnd_shft)) { - freemsg(mp); - TCP_STAT(tcps, tcp_zwin_ack_syn); - tcp->tcp_second_ctimer_threshold = - tcp_early_abort * SECONDS; - return; - } - } - break; - case TCPS_LISTEN: - /* - * Only a TLI listener can come through this path when a - * acceptor is going back to be a listener and a packet - * for the acceptor hits the classifier. For a socket - * listener, this can never happen because a listener - * can never accept connection on itself and hence a - * socket acceptor can not go back to being a listener. - */ - ASSERT(!TCP_IS_SOCKET(tcp)); - /*FALLTHRU*/ - case TCPS_CLOSED: - case TCPS_BOUND: { - conn_t *new_connp; - ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; - - /* - * Don't accept any input on a closed tcp as this TCP logically - * does not exist on the system. Don't proceed further with - * this TCP. For instance, this packet could trigger another - * close of this tcp which would be disastrous for tcp_refcnt. - * tcp_close_detached / tcp_clean_death / tcp_closei_local must - * be called at most once on a TCP. In this case we need to - * refeed the packet into the classifier and figure out where - * the packet should go. - */ - new_connp = ipcl_classify(mp, ira, ipst); - if (new_connp != NULL) { - /* Drops ref on new_connp */ - tcp_reinput(new_connp, mp, ira, ipst); - return; - } - /* We failed to classify. For now just drop the packet */ - freemsg(mp); - return; - } - case TCPS_IDLE: - /* - * Handle the case where the tcp_clean_death() has happened - * on a connection (application hasn't closed yet) but a packet - * was already queued on squeue before tcp_clean_death() - * was processed. Calling tcp_clean_death() twice on same - * connection can result in weird behaviour. - */ - freemsg(mp); - return; - default: - break; - } - - /* - * Already on the correct queue/perimeter. - * If this is a detached connection and not an eager - * connection hanging off a listener then new data - * (past the FIN) will cause a reset. - * We do a special check here where it - * is out of the main line, rather than check - * if we are detached every time we see new - * data down below. - */ - if (TCP_IS_DETACHED_NONEAGER(tcp) && - (seg_len > 0 && SEQ_GT(seg_seq + seg_len, tcp->tcp_rnxt))) { - BUMP_MIB(&tcps->tcps_mib, tcpInClosed); - DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp); - - freemsg(mp); - /* - * This could be an SSL closure alert. We're detached so just - * acknowledge it this last time. - */ - if (tcp->tcp_kssl_ctx != NULL) { - kssl_release_ctx(tcp->tcp_kssl_ctx); - tcp->tcp_kssl_ctx = NULL; - - tcp->tcp_rnxt += seg_len; - tcp->tcp_tcpha->tha_ack = htonl(tcp->tcp_rnxt); - flags |= TH_ACK_NEEDED; - goto ack_check; - } - - tcp_xmit_ctl("new data when detached", tcp, - tcp->tcp_snxt, 0, TH_RST); - (void) tcp_clean_death(tcp, EPROTO, 12); - return; - } - - mp->b_rptr = (uchar_t *)tcpha + TCP_HDR_LENGTH(tcpha); - urp = ntohs(tcpha->tha_urp) - TCP_OLD_URP_INTERPRETATION; - new_swnd = ntohs(tcpha->tha_win) << - ((tcpha->tha_flags & TH_SYN) ? 0 : tcp->tcp_snd_ws); - - if (tcp->tcp_snd_ts_ok) { - if (!tcp_paws_check(tcp, tcpha, &tcpopt)) { - /* - * This segment is not acceptable. - * Drop it and send back an ACK. - */ - freemsg(mp); - flags |= TH_ACK_NEEDED; - goto ack_check; - } - } else if (tcp->tcp_snd_sack_ok) { - ASSERT(tcp->tcp_sack_info != NULL); - tcpopt.tcp = tcp; - /* - * SACK info in already updated in tcp_parse_options. Ignore - * all other TCP options... 
- */ - (void) tcp_parse_options(tcpha, &tcpopt); - } -try_again:; - mss = tcp->tcp_mss; - gap = seg_seq - tcp->tcp_rnxt; - rgap = tcp->tcp_rwnd - (gap + seg_len); - /* - * gap is the amount of sequence space between what we expect to see - * and what we got for seg_seq. A positive value for gap means - * something got lost. A negative value means we got some old stuff. - */ - if (gap < 0) { - /* Old stuff present. Is the SYN in there? */ - if (seg_seq == tcp->tcp_irs && (flags & TH_SYN) && - (seg_len != 0)) { - flags &= ~TH_SYN; - seg_seq++; - urp--; - /* Recompute the gaps after noting the SYN. */ - goto try_again; - } - BUMP_MIB(&tcps->tcps_mib, tcpInDataDupSegs); - UPDATE_MIB(&tcps->tcps_mib, tcpInDataDupBytes, - (seg_len > -gap ? -gap : seg_len)); - /* Remove the old stuff from seg_len. */ - seg_len += gap; - /* - * Anything left? - * Make sure to check for unack'd FIN when rest of data - * has been previously ack'd. - */ - if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) { - /* - * Resets are only valid if they lie within our offered - * window. If the RST bit is set, we just ignore this - * segment. - */ - if (flags & TH_RST) { - freemsg(mp); - return; - } - - /* - * The arriving of dup data packets indicate that we - * may have postponed an ack for too long, or the other - * side's RTT estimate is out of shape. Start acking - * more often. - */ - if (SEQ_GEQ(seg_seq + seg_len - gap, tcp->tcp_rack) && - tcp->tcp_rack_cnt >= 1 && - tcp->tcp_rack_abs_max > 2) { - tcp->tcp_rack_abs_max--; - } - tcp->tcp_rack_cur_max = 1; - - /* - * This segment is "unacceptable". None of its - * sequence space lies within our advertized window. - * - * Adjust seg_len to the original value for tracing. - */ - seg_len -= gap; - if (connp->conn_debug) { - (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, - "tcp_rput: unacceptable, gap %d, rgap %d, " - "flags 0x%x, seg_seq %u, seg_ack %u, " - "seg_len %d, rnxt %u, snxt %u, %s", - gap, rgap, flags, seg_seq, seg_ack, - seg_len, tcp->tcp_rnxt, tcp->tcp_snxt, - tcp_display(tcp, NULL, - DISP_ADDR_AND_PORT)); - } - - /* - * Arrange to send an ACK in response to the - * unacceptable segment per RFC 793 page 69. There - * is only one small difference between ours and the - * acceptability test in the RFC - we accept ACK-only - * packet with SEG.SEQ = RCV.NXT+RCV.WND and no ACK - * will be generated. - * - * Note that we have to ACK an ACK-only packet at least - * for stacks that send 0-length keep-alives with - * SEG.SEQ = SND.NXT-1 as recommended by RFC1122, - * section 4.2.3.6. As long as we don't ever generate - * an unacceptable packet in response to an incoming - * packet that is unacceptable, it should not cause - * "ACK wars". - */ - flags |= TH_ACK_NEEDED; - - /* - * Continue processing this segment in order to use the - * ACK information it contains, but skip all other - * sequence-number processing. Processing the ACK - * information is necessary in order to - * re-synchronize connections that may have lost - * synchronization. - * - * We clear seg_len and flag fields related to - * sequence number processing as they are not - * to be trusted for an unacceptable segment. - */ - seg_len = 0; - flags &= ~(TH_SYN | TH_FIN | TH_URG); - goto process_ack; - } - - /* Fix seg_seq, and chew the gap off the front. 
*/ - seg_seq = tcp->tcp_rnxt; - urp += gap; - do { - mblk_t *mp2; - ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= - (uintptr_t)UINT_MAX); - gap += (uint_t)(mp->b_wptr - mp->b_rptr); - if (gap > 0) { - mp->b_rptr = mp->b_wptr - gap; - break; - } - mp2 = mp; - mp = mp->b_cont; - freeb(mp2); - } while (gap < 0); - /* - * If the urgent data has already been acknowledged, we - * should ignore TH_URG below - */ - if (urp < 0) - flags &= ~TH_URG; - } - /* - * rgap is the amount of stuff received out of window. A negative - * value is the amount out of window. - */ - if (rgap < 0) { - mblk_t *mp2; - - if (tcp->tcp_rwnd == 0) { - BUMP_MIB(&tcps->tcps_mib, tcpInWinProbe); - } else { - BUMP_MIB(&tcps->tcps_mib, tcpInDataPastWinSegs); - UPDATE_MIB(&tcps->tcps_mib, - tcpInDataPastWinBytes, -rgap); - } - - /* - * seg_len does not include the FIN, so if more than - * just the FIN is out of window, we act like we don't - * see it. (If just the FIN is out of window, rgap - * will be zero and we will go ahead and acknowledge - * the FIN.) - */ - flags &= ~TH_FIN; - - /* Fix seg_len and make sure there is something left. */ - seg_len += rgap; - if (seg_len <= 0) { - /* - * Resets are only valid if they lie within our offered - * window. If the RST bit is set, we just ignore this - * segment. - */ - if (flags & TH_RST) { - freemsg(mp); - return; - } - - /* Per RFC 793, we need to send back an ACK. */ - flags |= TH_ACK_NEEDED; - - /* - * Send SIGURG as soon as possible i.e. even - * if the TH_URG was delivered in a window probe - * packet (which will be unacceptable). - * - * We generate a signal if none has been generated - * for this connection or if this is a new urgent - * byte. Also send a zero-length "unmarked" message - * to inform SIOCATMARK that this is not the mark. - * - * tcp_urp_last_valid is cleared when the T_exdata_ind - * is sent up. This plus the check for old data - * (gap >= 0) handles the wraparound of the sequence - * number space without having to always track the - * correct MAX(tcp_urp_last, tcp_rnxt). (BSD tracks - * this max in its rcv_up variable). - * - * This prevents duplicate SIGURGS due to a "late" - * zero-window probe when the T_EXDATA_IND has already - * been sent up. - */ - if ((flags & TH_URG) && - (!tcp->tcp_urp_last_valid || SEQ_GT(urp + seg_seq, - tcp->tcp_urp_last))) { - if (IPCL_IS_NONSTR(connp)) { - if (!TCP_IS_DETACHED(tcp)) { - (*connp->conn_upcalls-> - su_signal_oob) - (connp->conn_upper_handle, - urp); - } - } else { - mp1 = allocb(0, BPRI_MED); - if (mp1 == NULL) { - freemsg(mp); - return; - } - if (!TCP_IS_DETACHED(tcp) && - !putnextctl1(connp->conn_rq, - M_PCSIG, SIGURG)) { - /* Try again on the rexmit. */ - freemsg(mp1); - freemsg(mp); - return; - } - /* - * If the next byte would be the mark - * then mark with MARKNEXT else mark - * with NOTMARKNEXT. - */ - if (gap == 0 && urp == 0) - mp1->b_flag |= MSGMARKNEXT; - else - mp1->b_flag |= MSGNOTMARKNEXT; - freemsg(tcp->tcp_urp_mark_mp); - tcp->tcp_urp_mark_mp = mp1; - flags |= TH_SEND_URP_MARK; - } - tcp->tcp_urp_last_valid = B_TRUE; - tcp->tcp_urp_last = urp + seg_seq; - } - /* - * If this is a zero window probe, continue to - * process the ACK part. But we need to set seg_len - * to 0 to avoid data processing. Otherwise just - * drop the segment and send back an ACK. - */ - if (tcp->tcp_rwnd == 0 && seg_seq == tcp->tcp_rnxt) { - flags &= ~(TH_SYN | TH_URG); - seg_len = 0; - goto process_ack; - } else { - freemsg(mp); - goto ack_check; - } - } - /* Pitch out of window stuff off the end. 
*/ - rgap = seg_len; - mp2 = mp; - do { - ASSERT((uintptr_t)(mp2->b_wptr - mp2->b_rptr) <= - (uintptr_t)INT_MAX); - rgap -= (int)(mp2->b_wptr - mp2->b_rptr); - if (rgap < 0) { - mp2->b_wptr += rgap; - if ((mp1 = mp2->b_cont) != NULL) { - mp2->b_cont = NULL; - freemsg(mp1); - } - break; - } - } while ((mp2 = mp2->b_cont) != NULL); - } -ok:; - /* - * TCP should check ECN info for segments inside the window only. - * Therefore the check should be done here. - */ - if (tcp->tcp_ecn_ok) { - if (flags & TH_CWR) { - tcp->tcp_ecn_echo_on = B_FALSE; - } - /* - * Note that both ECN_CE and CWR can be set in the - * same segment. In this case, we once again turn - * on ECN_ECHO. - */ - if (connp->conn_ipversion == IPV4_VERSION) { - uchar_t tos = ((ipha_t *)rptr)->ipha_type_of_service; - - if ((tos & IPH_ECN_CE) == IPH_ECN_CE) { - tcp->tcp_ecn_echo_on = B_TRUE; - } - } else { - uint32_t vcf = ((ip6_t *)rptr)->ip6_vcf; - - if ((vcf & htonl(IPH_ECN_CE << 20)) == - htonl(IPH_ECN_CE << 20)) { - tcp->tcp_ecn_echo_on = B_TRUE; - } - } - } - - /* - * Check whether we can update tcp_ts_recent. This test is - * NOT the one in RFC 1323 3.4. It is from Braden, 1993, "TCP - * Extensions for High Performance: An Update", Internet Draft. - */ - if (tcp->tcp_snd_ts_ok && - TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) && - SEQ_LEQ(seg_seq, tcp->tcp_rack)) { - tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val; - tcp->tcp_last_rcv_lbolt = LBOLT_FASTPATH64; - } - - if (seg_seq != tcp->tcp_rnxt || tcp->tcp_reass_head) { - /* - * FIN in an out of order segment. We record this in - * tcp_valid_bits and the seq num of FIN in tcp_ofo_fin_seq. - * Clear the FIN so that any check on FIN flag will fail. - * Remember that FIN also counts in the sequence number - * space. So we need to ack out of order FIN only segments. - */ - if (flags & TH_FIN) { - tcp->tcp_valid_bits |= TCP_OFO_FIN_VALID; - tcp->tcp_ofo_fin_seq = seg_seq + seg_len; - flags &= ~TH_FIN; - flags |= TH_ACK_NEEDED; - } - if (seg_len > 0) { - /* Fill in the SACK blk list. */ - if (tcp->tcp_snd_sack_ok) { - ASSERT(tcp->tcp_sack_info != NULL); - tcp_sack_insert(tcp->tcp_sack_list, - seg_seq, seg_seq + seg_len, - &(tcp->tcp_num_sack_blk)); - } - - /* - * Attempt reassembly and see if we have something - * ready to go. - */ - mp = tcp_reass(tcp, mp, seg_seq); - /* Always ack out of order packets */ - flags |= TH_ACK_NEEDED | TH_PUSH; - if (mp) { - ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= - (uintptr_t)INT_MAX); - seg_len = mp->b_cont ? msgdsize(mp) : - (int)(mp->b_wptr - mp->b_rptr); - seg_seq = tcp->tcp_rnxt; - /* - * A gap is filled and the seq num and len - * of the gap match that of a previously - * received FIN, put the FIN flag back in. - */ - if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) && - seg_seq + seg_len == tcp->tcp_ofo_fin_seq) { - flags |= TH_FIN; - tcp->tcp_valid_bits &= - ~TCP_OFO_FIN_VALID; - } - if (tcp->tcp_reass_tid != 0) { - (void) TCP_TIMER_CANCEL(tcp, - tcp->tcp_reass_tid); - /* - * Restart the timer if there is still - * data in the reassembly queue. - */ - if (tcp->tcp_reass_head != NULL) { - tcp->tcp_reass_tid = TCP_TIMER( - tcp, tcp_reass_timer, - MSEC_TO_TICK( - tcps->tcps_reass_timeout)); - } else { - tcp->tcp_reass_tid = 0; - } - } - } else { - /* - * Keep going even with NULL mp. - * There may be a useful ACK or something else - * we don't want to miss. - * - * But TCP should not perform fast retransmit - * because of the ack number. TCP uses - * seg_len == 0 to determine if it is a pure - * ACK. And this is not a pure ACK. 
- */ - seg_len = 0; - ofo_seg = B_TRUE; - - if (tcps->tcps_reass_timeout != 0 && - tcp->tcp_reass_tid == 0) { - tcp->tcp_reass_tid = TCP_TIMER(tcp, - tcp_reass_timer, MSEC_TO_TICK( - tcps->tcps_reass_timeout)); - } - } - } - } else if (seg_len > 0) { - BUMP_MIB(&tcps->tcps_mib, tcpInDataInorderSegs); - UPDATE_MIB(&tcps->tcps_mib, tcpInDataInorderBytes, seg_len); - /* - * If an out of order FIN was received before, and the seq - * num and len of the new segment match that of the FIN, - * put the FIN flag back in. - */ - if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) && - seg_seq + seg_len == tcp->tcp_ofo_fin_seq) { - flags |= TH_FIN; - tcp->tcp_valid_bits &= ~TCP_OFO_FIN_VALID; - } - } - if ((flags & (TH_RST | TH_SYN | TH_URG | TH_ACK)) != TH_ACK) { - if (flags & TH_RST) { - freemsg(mp); - switch (tcp->tcp_state) { - case TCPS_SYN_RCVD: - (void) tcp_clean_death(tcp, ECONNREFUSED, 14); - break; - case TCPS_ESTABLISHED: - case TCPS_FIN_WAIT_1: - case TCPS_FIN_WAIT_2: - case TCPS_CLOSE_WAIT: - (void) tcp_clean_death(tcp, ECONNRESET, 15); - break; - case TCPS_CLOSING: - case TCPS_LAST_ACK: - (void) tcp_clean_death(tcp, 0, 16); - break; - default: - ASSERT(tcp->tcp_state != TCPS_TIME_WAIT); - (void) tcp_clean_death(tcp, ENXIO, 17); - break; - } - return; - } - if (flags & TH_SYN) { - /* - * See RFC 793, Page 71 - * - * The seq number must be in the window as it should - * be "fixed" above. If it is outside window, it should - * be already rejected. Note that we allow seg_seq to be - * rnxt + rwnd because we want to accept 0 window probe. - */ - ASSERT(SEQ_GEQ(seg_seq, tcp->tcp_rnxt) && - SEQ_LEQ(seg_seq, tcp->tcp_rnxt + tcp->tcp_rwnd)); - freemsg(mp); - /* - * If the ACK flag is not set, just use our snxt as the - * seq number of the RST segment. - */ - if (!(flags & TH_ACK)) { - seg_ack = tcp->tcp_snxt; - } - tcp_xmit_ctl("TH_SYN", tcp, seg_ack, seg_seq + 1, - TH_RST|TH_ACK); - ASSERT(tcp->tcp_state != TCPS_TIME_WAIT); - (void) tcp_clean_death(tcp, ECONNRESET, 18); - return; - } - /* - * urp could be -1 when the urp field in the packet is 0 - * and TCP_OLD_URP_INTERPRETATION is set. This implies that the urgent - * byte was at seg_seq - 1, in which case we ignore the urgent flag. - */ - if (flags & TH_URG && urp >= 0) { - if (!tcp->tcp_urp_last_valid || - SEQ_GT(urp + seg_seq, tcp->tcp_urp_last)) { - /* - * Non-STREAMS sockets handle the urgent data a litte - * differently from STREAMS based sockets. There is no - * need to mark any mblks with the MSG{NOT,}MARKNEXT - * flags to keep SIOCATMARK happy. Instead a - * su_signal_oob upcall is made to update the mark. - * Neither is a T_EXDATA_IND mblk needed to be - * prepended to the urgent data. The urgent data is - * delivered using the su_recv upcall, where we set - * the MSG_OOB flag to indicate that it is urg data. - * - * Neither TH_SEND_URP_MARK nor TH_MARKNEXT_NEEDED - * are used by non-STREAMS sockets. - */ - if (IPCL_IS_NONSTR(connp)) { - if (!TCP_IS_DETACHED(tcp)) { - (*connp->conn_upcalls->su_signal_oob) - (connp->conn_upper_handle, urp); - } - } else { - /* - * If we haven't generated the signal yet for - * this urgent pointer value, do it now. Also, - * send up a zero-length M_DATA indicating - * whether or not this is the mark. The latter - * is not needed when a T_EXDATA_IND is sent up. - * However, if there are allocation failures - * this code relies on the sender retransmitting - * and the socket code for determining the mark - * should not block waiting for the peer to - * transmit. 
Thus, for simplicity we always - * send up the mark indication. - */ - mp1 = allocb(0, BPRI_MED); - if (mp1 == NULL) { - freemsg(mp); - return; - } - if (!TCP_IS_DETACHED(tcp) && - !putnextctl1(connp->conn_rq, M_PCSIG, - SIGURG)) { - /* Try again on the rexmit. */ - freemsg(mp1); - freemsg(mp); - return; - } - /* - * Mark with NOTMARKNEXT for now. - * The code below will change this to MARKNEXT - * if we are at the mark. - * - * If there are allocation failures (e.g. in - * dupmsg below) the next time tcp_input_data - * sees the urgent segment it will send up the - * MSGMARKNEXT message. - */ - mp1->b_flag |= MSGNOTMARKNEXT; - freemsg(tcp->tcp_urp_mark_mp); - tcp->tcp_urp_mark_mp = mp1; - flags |= TH_SEND_URP_MARK; -#ifdef DEBUG - (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, - "tcp_rput: sent M_PCSIG 2 seq %x urp %x " - "last %x, %s", - seg_seq, urp, tcp->tcp_urp_last, - tcp_display(tcp, NULL, DISP_PORT_ONLY)); -#endif /* DEBUG */ - } - tcp->tcp_urp_last_valid = B_TRUE; - tcp->tcp_urp_last = urp + seg_seq; - } else if (tcp->tcp_urp_mark_mp != NULL) { - /* - * An allocation failure prevented the previous - * tcp_input_data from sending up the allocated - * MSG*MARKNEXT message - send it up this time - * around. - */ - flags |= TH_SEND_URP_MARK; - } - - /* - * If the urgent byte is in this segment, make sure that it is - * all by itself. This makes it much easier to deal with the - * possibility of an allocation failure on the T_exdata_ind. - * Note that seg_len is the number of bytes in the segment, and - * urp is the offset into the segment of the urgent byte. - * urp < seg_len means that the urgent byte is in this segment. - */ - if (urp < seg_len) { - if (seg_len != 1) { - uint32_t tmp_rnxt; - /* - * Break it up and feed it back in. - * Re-attach the IP header. - */ - mp->b_rptr = iphdr; - if (urp > 0) { - /* - * There is stuff before the urgent - * byte. - */ - mp1 = dupmsg(mp); - if (!mp1) { - /* - * Trim from urgent byte on. - * The rest will come back. - */ - (void) adjmsg(mp, - urp - seg_len); - tcp_input_data(connp, - mp, NULL, ira); - return; - } - (void) adjmsg(mp1, urp - seg_len); - /* Feed this piece back in. */ - tmp_rnxt = tcp->tcp_rnxt; - tcp_input_data(connp, mp1, NULL, ira); - /* - * If the data passed back in was not - * processed (ie: bad ACK) sending - * the remainder back in will cause a - * loop. In this case, drop the - * packet and let the sender try - * sending a good packet. - */ - if (tmp_rnxt == tcp->tcp_rnxt) { - freemsg(mp); - return; - } - } - if (urp != seg_len - 1) { - uint32_t tmp_rnxt; - /* - * There is stuff after the urgent - * byte. - */ - mp1 = dupmsg(mp); - if (!mp1) { - /* - * Trim everything beyond the - * urgent byte. The rest will - * come back. - */ - (void) adjmsg(mp, - urp + 1 - seg_len); - tcp_input_data(connp, - mp, NULL, ira); - return; - } - (void) adjmsg(mp1, urp + 1 - seg_len); - tmp_rnxt = tcp->tcp_rnxt; - tcp_input_data(connp, mp1, NULL, ira); - /* - * If the data passed back in was not - * processed (ie: bad ACK) sending - * the remainder back in will cause a - * loop. In this case, drop the - * packet and let the sender try - * sending a good packet. - */ - if (tmp_rnxt == tcp->tcp_rnxt) { - freemsg(mp); - return; - } - } - tcp_input_data(connp, mp, NULL, ira); - return; - } - /* - * This segment contains only the urgent byte. We - * have to allocate the T_exdata_ind, if we can. 
- */ - if (IPCL_IS_NONSTR(connp)) { - int error; - - (*connp->conn_upcalls->su_recv) - (connp->conn_upper_handle, mp, seg_len, - MSG_OOB, &error, NULL); - /* - * We should never be in middle of a - * fallback, the squeue guarantees that. - */ - ASSERT(error != EOPNOTSUPP); - mp = NULL; - goto update_ack; - } else if (!tcp->tcp_urp_mp) { - struct T_exdata_ind *tei; - mp1 = allocb(sizeof (struct T_exdata_ind), - BPRI_MED); - if (!mp1) { - /* - * Sigh... It'll be back. - * Generate any MSG*MARK message now. - */ - freemsg(mp); - seg_len = 0; - if (flags & TH_SEND_URP_MARK) { - - - ASSERT(tcp->tcp_urp_mark_mp); - tcp->tcp_urp_mark_mp->b_flag &= - ~MSGNOTMARKNEXT; - tcp->tcp_urp_mark_mp->b_flag |= - MSGMARKNEXT; - } - goto ack_check; - } - mp1->b_datap->db_type = M_PROTO; - tei = (struct T_exdata_ind *)mp1->b_rptr; - tei->PRIM_type = T_EXDATA_IND; - tei->MORE_flag = 0; - mp1->b_wptr = (uchar_t *)&tei[1]; - tcp->tcp_urp_mp = mp1; -#ifdef DEBUG - (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, - "tcp_rput: allocated exdata_ind %s", - tcp_display(tcp, NULL, - DISP_PORT_ONLY)); -#endif /* DEBUG */ - /* - * There is no need to send a separate MSG*MARK - * message since the T_EXDATA_IND will be sent - * now. - */ - flags &= ~TH_SEND_URP_MARK; - freemsg(tcp->tcp_urp_mark_mp); - tcp->tcp_urp_mark_mp = NULL; - } - /* - * Now we are all set. On the next putnext upstream, - * tcp_urp_mp will be non-NULL and will get prepended - * to what has to be this piece containing the urgent - * byte. If for any reason we abort this segment below, - * if it comes back, we will have this ready, or it - * will get blown off in close. - */ - } else if (urp == seg_len) { - /* - * The urgent byte is the next byte after this sequence - * number. If this endpoint is non-STREAMS, then there - * is nothing to do here since the socket has already - * been notified about the urg pointer by the - * su_signal_oob call above. - * - * In case of STREAMS, some more work might be needed. - * If there is data it is marked with MSGMARKNEXT and - * and any tcp_urp_mark_mp is discarded since it is not - * needed. Otherwise, if the code above just allocated - * a zero-length tcp_urp_mark_mp message, that message - * is tagged with MSGMARKNEXT. Sending up these - * MSGMARKNEXT messages makes SIOCATMARK work correctly - * even though the T_EXDATA_IND will not be sent up - * until the urgent byte arrives. 
- */ - if (!IPCL_IS_NONSTR(tcp->tcp_connp)) { - if (seg_len != 0) { - flags |= TH_MARKNEXT_NEEDED; - freemsg(tcp->tcp_urp_mark_mp); - tcp->tcp_urp_mark_mp = NULL; - flags &= ~TH_SEND_URP_MARK; - } else if (tcp->tcp_urp_mark_mp != NULL) { - flags |= TH_SEND_URP_MARK; - tcp->tcp_urp_mark_mp->b_flag &= - ~MSGNOTMARKNEXT; - tcp->tcp_urp_mark_mp->b_flag |= - MSGMARKNEXT; - } - } -#ifdef DEBUG - (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, - "tcp_rput: AT MARK, len %d, flags 0x%x, %s", - seg_len, flags, - tcp_display(tcp, NULL, DISP_PORT_ONLY)); -#endif /* DEBUG */ - } -#ifdef DEBUG - else { - /* Data left until we hit mark */ - (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, - "tcp_rput: URP %d bytes left, %s", - urp - seg_len, tcp_display(tcp, NULL, - DISP_PORT_ONLY)); - } -#endif /* DEBUG */ - } - -process_ack: - if (!(flags & TH_ACK)) { - freemsg(mp); - goto xmit_check; - } - } - bytes_acked = (int)(seg_ack - tcp->tcp_suna); - - if (bytes_acked > 0) - tcp->tcp_ip_forward_progress = B_TRUE; - if (tcp->tcp_state == TCPS_SYN_RCVD) { - if ((tcp->tcp_conn.tcp_eager_conn_ind != NULL) && - ((tcp->tcp_kssl_ent == NULL) || !tcp->tcp_kssl_pending)) { - /* 3-way handshake complete - pass up the T_CONN_IND */ - tcp_t *listener = tcp->tcp_listener; - mblk_t *mp = tcp->tcp_conn.tcp_eager_conn_ind; - - tcp->tcp_tconnind_started = B_TRUE; - tcp->tcp_conn.tcp_eager_conn_ind = NULL; - /* - * We are here means eager is fine but it can - * get a TH_RST at any point between now and till - * accept completes and disappear. We need to - * ensure that reference to eager is valid after - * we get out of eager's perimeter. So we do - * an extra refhold. - */ - CONN_INC_REF(connp); - - /* - * The listener also exists because of the refhold - * done in tcp_input_listener. Its possible that it - * might have closed. We will check that once we - * get inside listeners context. - */ - CONN_INC_REF(listener->tcp_connp); - if (listener->tcp_connp->conn_sqp == - connp->conn_sqp) { - /* - * We optimize by not calling an SQUEUE_ENTER - * on the listener since we know that the - * listener and eager squeues are the same. - * We are able to make this check safely only - * because neither the eager nor the listener - * can change its squeue. Only an active connect - * can change its squeue - */ - tcp_send_conn_ind(listener->tcp_connp, mp, - listener->tcp_connp->conn_sqp); - CONN_DEC_REF(listener->tcp_connp); - } else if (!tcp->tcp_loopback) { - SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp, - mp, tcp_send_conn_ind, - listener->tcp_connp, NULL, SQ_FILL, - SQTAG_TCP_CONN_IND); - } else { - SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp, - mp, tcp_send_conn_ind, - listener->tcp_connp, NULL, SQ_PROCESS, - SQTAG_TCP_CONN_IND); - } - } - - /* - * We are seeing the final ack in the three way - * hand shake of a active open'ed connection - * so we must send up a T_CONN_CON - * - * tcp_sendmsg() checks tcp_state without entering - * the squeue so tcp_state should be updated before - * sending up connection confirmation. - */ - tcp->tcp_state = TCPS_ESTABLISHED; - if (tcp->tcp_active_open) { - if (!tcp_conn_con(tcp, iphdr, mp, NULL, ira)) { - freemsg(mp); - tcp->tcp_state = TCPS_SYN_RCVD; - return; - } - /* - * Don't fuse the loopback endpoints for - * simultaneous active opens. 
- */ - if (tcp->tcp_loopback) { - TCP_STAT(tcps, tcp_fusion_unfusable); - tcp->tcp_unfusable = B_TRUE; - } - } - - tcp->tcp_suna = tcp->tcp_iss + 1; /* One for the SYN */ - bytes_acked--; - /* SYN was acked - making progress */ - tcp->tcp_ip_forward_progress = B_TRUE; - - /* - * If SYN was retransmitted, need to reset all - * retransmission info as this segment will be - * treated as a dup ACK. - */ - if (tcp->tcp_rexmit) { - tcp->tcp_rexmit = B_FALSE; - tcp->tcp_rexmit_nxt = tcp->tcp_snxt; - tcp->tcp_rexmit_max = tcp->tcp_snxt; - tcp->tcp_snd_burst = tcp->tcp_localnet ? - TCP_CWND_INFINITE : TCP_CWND_NORMAL; - tcp->tcp_ms_we_have_waited = 0; - tcp->tcp_cwnd = mss; - } - - /* - * We set the send window to zero here. - * This is needed if there is data to be - * processed already on the queue. - * Later (at swnd_update label), the - * "new_swnd > tcp_swnd" condition is satisfied - * the XMIT_NEEDED flag is set in the current - * (SYN_RCVD) state. This ensures tcp_wput_data() is - * called if there is already data on queue in - * this state. - */ - tcp->tcp_swnd = 0; - - if (new_swnd > tcp->tcp_max_swnd) - tcp->tcp_max_swnd = new_swnd; - tcp->tcp_swl1 = seg_seq; - tcp->tcp_swl2 = seg_ack; - tcp->tcp_valid_bits &= ~TCP_ISS_VALID; - - /* Fuse when both sides are in ESTABLISHED state */ - if (tcp->tcp_loopback && do_tcp_fusion) - tcp_fuse(tcp, iphdr, tcpha); - - } - /* This code follows 4.4BSD-Lite2 mostly. */ - if (bytes_acked < 0) - goto est; - - /* - * If TCP is ECN capable and the congestion experience bit is - * set, reduce tcp_cwnd and tcp_ssthresh. But this should only be - * done once per window (or more loosely, per RTT). - */ - if (tcp->tcp_cwr && SEQ_GT(seg_ack, tcp->tcp_cwr_snd_max)) - tcp->tcp_cwr = B_FALSE; - if (tcp->tcp_ecn_ok && (flags & TH_ECE)) { - if (!tcp->tcp_cwr) { - npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) / mss; - tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * mss; - tcp->tcp_cwnd = npkt * mss; - /* - * If the cwnd is 0, use the timer to clock out - * new segments. This is required by the ECN spec. - */ - if (npkt == 0) { - TCP_TIMER_RESTART(tcp, tcp->tcp_rto); - /* - * This makes sure that when the ACK comes - * back, we will increase tcp_cwnd by 1 MSS. - */ - tcp->tcp_cwnd_cnt = 0; - } - tcp->tcp_cwr = B_TRUE; - /* - * This marks the end of the current window of in - * flight data. That is why we don't use - * tcp_suna + tcp_swnd. Only data in flight can - * provide ECN info. - */ - tcp->tcp_cwr_snd_max = tcp->tcp_snxt; - tcp->tcp_ecn_cwr_sent = B_FALSE; - } - } - - mp1 = tcp->tcp_xmit_head; - if (bytes_acked == 0) { - if (!ofo_seg && seg_len == 0 && new_swnd == tcp->tcp_swnd) { - int dupack_cnt; - - BUMP_MIB(&tcps->tcps_mib, tcpInDupAck); - /* - * Fast retransmit. When we have seen exactly three - * identical ACKs while we have unacked data - * outstanding we take it as a hint that our peer - * dropped something. - * - * If TCP is retransmitting, don't do fast retransmit. - */ - if (mp1 && tcp->tcp_suna != tcp->tcp_snxt && - ! tcp->tcp_rexmit) { - /* Do Limited Transmit */ - if ((dupack_cnt = ++tcp->tcp_dupack_cnt) < - tcps->tcps_dupack_fast_retransmit) { - /* - * RFC 3042 - * - * What we need to do is temporarily - * increase tcp_cwnd so that new - * data can be sent if it is allowed - * by the receive window (tcp_rwnd). - * tcp_wput_data() will take care of - * the rest. - * - * If the connection is SACK capable, - * only do limited xmit when there - * is SACK info. - * - * Note how tcp_cwnd is incremented. - * The first dup ACK will increase - * it by 1 MSS. 
The second dup ACK - * will increase it by 2 MSS. This - * means that only 1 new segment will - * be sent for each dup ACK. - */ - if (tcp->tcp_unsent > 0 && - (!tcp->tcp_snd_sack_ok || - (tcp->tcp_snd_sack_ok && - tcp->tcp_notsack_list != NULL))) { - tcp->tcp_cwnd += mss << - (tcp->tcp_dupack_cnt - 1); - flags |= TH_LIMIT_XMIT; - } - } else if (dupack_cnt == - tcps->tcps_dupack_fast_retransmit) { - - /* - * If we have reduced tcp_ssthresh - * because of ECN, do not reduce it again - * unless it is already one window of data - * away. After one window of data, tcp_cwr - * should then be cleared. Note that - * for non ECN capable connection, tcp_cwr - * should always be false. - * - * Adjust cwnd since the duplicate - * ack indicates that a packet was - * dropped (due to congestion.) - */ - if (!tcp->tcp_cwr) { - npkt = ((tcp->tcp_snxt - - tcp->tcp_suna) >> 1) / mss; - tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * - mss; - tcp->tcp_cwnd = (npkt + - tcp->tcp_dupack_cnt) * mss; - } - if (tcp->tcp_ecn_ok) { - tcp->tcp_cwr = B_TRUE; - tcp->tcp_cwr_snd_max = tcp->tcp_snxt; - tcp->tcp_ecn_cwr_sent = B_FALSE; - } - - /* - * We do Hoe's algorithm. Refer to her - * paper "Improving the Start-up Behavior - * of a Congestion Control Scheme for TCP," - * appeared in SIGCOMM'96. - * - * Save highest seq no we have sent so far. - * Be careful about the invisible FIN byte. - */ - if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && - (tcp->tcp_unsent == 0)) { - tcp->tcp_rexmit_max = tcp->tcp_fss; - } else { - tcp->tcp_rexmit_max = tcp->tcp_snxt; - } - - /* - * Do not allow bursty traffic during. - * fast recovery. Refer to Fall and Floyd's - * paper "Simulation-based Comparisons of - * Tahoe, Reno and SACK TCP" (in CCR?) - * This is a best current practise. - */ - tcp->tcp_snd_burst = TCP_CWND_SS; - - /* - * For SACK: - * Calculate tcp_pipe, which is the - * estimated number of bytes in - * network. - * - * tcp_fack is the highest sack'ed seq num - * TCP has received. - * - * tcp_pipe is explained in the above quoted - * Fall and Floyd's paper. tcp_fack is - * explained in Mathis and Mahdavi's - * "Forward Acknowledgment: Refining TCP - * Congestion Control" in SIGCOMM '96. - */ - if (tcp->tcp_snd_sack_ok) { - ASSERT(tcp->tcp_sack_info != NULL); - if (tcp->tcp_notsack_list != NULL) { - tcp->tcp_pipe = tcp->tcp_snxt - - tcp->tcp_fack; - tcp->tcp_sack_snxt = seg_ack; - flags |= TH_NEED_SACK_REXMIT; - } else { - /* - * Always initialize tcp_pipe - * even though we don't have - * any SACK info. If later - * we get SACK info and - * tcp_pipe is not initialized, - * funny things will happen. - */ - tcp->tcp_pipe = - tcp->tcp_cwnd_ssthresh; - } - } else { - flags |= TH_REXMIT_NEEDED; - } /* tcp_snd_sack_ok */ - - } else { - /* - * Here we perform congestion - * avoidance, but NOT slow start. - * This is known as the Fast - * Recovery Algorithm. - */ - if (tcp->tcp_snd_sack_ok && - tcp->tcp_notsack_list != NULL) { - flags |= TH_NEED_SACK_REXMIT; - tcp->tcp_pipe -= mss; - if (tcp->tcp_pipe < 0) - tcp->tcp_pipe = 0; - } else { - /* - * We know that one more packet has - * left the pipe thus we can update - * cwnd. - */ - cwnd = tcp->tcp_cwnd + mss; - if (cwnd > tcp->tcp_cwnd_max) - cwnd = tcp->tcp_cwnd_max; - tcp->tcp_cwnd = cwnd; - if (tcp->tcp_unsent > 0) - flags |= TH_XMIT_NEEDED; - } - } - } - } else if (tcp->tcp_zero_win_probe) { - /* - * If the window has opened, need to arrange - * to send additional data. 
- */ - if (new_swnd != 0) { - /* tcp_suna != tcp_snxt */ - /* Packet contains a window update */ - BUMP_MIB(&tcps->tcps_mib, tcpInWinUpdate); - tcp->tcp_zero_win_probe = 0; - tcp->tcp_timer_backoff = 0; - tcp->tcp_ms_we_have_waited = 0; - - /* - * Transmit starting with tcp_suna since - * the one byte probe is not ack'ed. - * If TCP has sent more than one identical - * probe, tcp_rexmit will be set. That means - * tcp_ss_rexmit() will send out the one - * byte along with new data. Otherwise, - * fake the retransmission. - */ - flags |= TH_XMIT_NEEDED; - if (!tcp->tcp_rexmit) { - tcp->tcp_rexmit = B_TRUE; - tcp->tcp_dupack_cnt = 0; - tcp->tcp_rexmit_nxt = tcp->tcp_suna; - tcp->tcp_rexmit_max = tcp->tcp_suna + 1; - } - } - } - goto swnd_update; - } - - /* - * Check for "acceptability" of ACK value per RFC 793, pages 72 - 73. - * If the ACK value acks something that we have not yet sent, it might - * be an old duplicate segment. Send an ACK to re-synchronize the - * other side. - * Note: reset in response to unacceptable ACK in SYN_RECEIVE - * state is handled above, so we can always just drop the segment and - * send an ACK here. - * - * In the case where the peer shrinks the window, we see the new window - * update, but all the data sent previously is queued up by the peer. - * To account for this, in tcp_process_shrunk_swnd(), the sequence - * number, which was already sent, and within window, is recorded. - * tcp_snxt is then updated. - * - * If the window has previously shrunk, and an ACK for data not yet - * sent, according to tcp_snxt is recieved, it may still be valid. If - * the ACK is for data within the window at the time the window was - * shrunk, then the ACK is acceptable. In this case tcp_snxt is set to - * the sequence number ACK'ed. - * - * If the ACK covers all the data sent at the time the window was - * shrunk, we can now set tcp_is_wnd_shrnk to B_FALSE. - * - * Should we send ACKs in response to ACK only segments? - */ - - if (SEQ_GT(seg_ack, tcp->tcp_snxt)) { - if ((tcp->tcp_is_wnd_shrnk) && - (SEQ_LEQ(seg_ack, tcp->tcp_snxt_shrunk))) { - uint32_t data_acked_ahead_snxt; - - data_acked_ahead_snxt = seg_ack - tcp->tcp_snxt; - tcp_update_xmit_tail(tcp, seg_ack); - tcp->tcp_unsent -= data_acked_ahead_snxt; - } else { - BUMP_MIB(&tcps->tcps_mib, tcpInAckUnsent); - /* drop the received segment */ - freemsg(mp); - - /* - * Send back an ACK. If tcp_drop_ack_unsent_cnt is - * greater than 0, check if the number of such - * bogus ACks is greater than that count. If yes, - * don't send back any ACK. This prevents TCP from - * getting into an ACK storm if somehow an attacker - * successfully spoofs an acceptable segment to our - * peer. If this continues (count > 2 X threshold), - * we should abort this connection. - */ - if (tcp_drop_ack_unsent_cnt > 0 && - ++tcp->tcp_in_ack_unsent > - tcp_drop_ack_unsent_cnt) { - TCP_STAT(tcps, tcp_in_ack_unsent_drop); - if (tcp->tcp_in_ack_unsent > 2 * - tcp_drop_ack_unsent_cnt) { - (void) tcp_clean_death(tcp, EPROTO, 20); - } - return; - } - mp = tcp_ack_mp(tcp); - if (mp != NULL) { - BUMP_LOCAL(tcp->tcp_obsegs); - BUMP_MIB(&tcps->tcps_mib, tcpOutAck); - tcp_send_data(tcp, mp); - } - return; - } - } else if (tcp->tcp_is_wnd_shrnk && SEQ_GEQ(seg_ack, - tcp->tcp_snxt_shrunk)) { - tcp->tcp_is_wnd_shrnk = B_FALSE; - } - - /* - * TCP gets a new ACK, update the notsack'ed list to delete those - * blocks that are covered by this ACK. 
- */ - if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) { - tcp_notsack_remove(&(tcp->tcp_notsack_list), seg_ack, - &(tcp->tcp_num_notsack_blk), &(tcp->tcp_cnt_notsack_list)); - } - - /* - * If we got an ACK after fast retransmit, check to see - * if it is a partial ACK. If it is not and the congestion - * window was inflated to account for the other side's - * cached packets, retract it. If it is, do Hoe's algorithm. - */ - if (tcp->tcp_dupack_cnt >= tcps->tcps_dupack_fast_retransmit) { - ASSERT(tcp->tcp_rexmit == B_FALSE); - if (SEQ_GEQ(seg_ack, tcp->tcp_rexmit_max)) { - tcp->tcp_dupack_cnt = 0; - /* - * Restore the orig tcp_cwnd_ssthresh after - * fast retransmit phase. - */ - if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) { - tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh; - } - tcp->tcp_rexmit_max = seg_ack; - tcp->tcp_cwnd_cnt = 0; - tcp->tcp_snd_burst = tcp->tcp_localnet ? - TCP_CWND_INFINITE : TCP_CWND_NORMAL; - - /* - * Remove all notsack info to avoid confusion with - * the next fast retrasnmit/recovery phase. - */ - if (tcp->tcp_snd_sack_ok && - tcp->tcp_notsack_list != NULL) { - TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list, - tcp); - } - } else { - if (tcp->tcp_snd_sack_ok && - tcp->tcp_notsack_list != NULL) { - flags |= TH_NEED_SACK_REXMIT; - tcp->tcp_pipe -= mss; - if (tcp->tcp_pipe < 0) - tcp->tcp_pipe = 0; - } else { - /* - * Hoe's algorithm: - * - * Retransmit the unack'ed segment and - * restart fast recovery. Note that we - * need to scale back tcp_cwnd to the - * original value when we started fast - * recovery. This is to prevent overly - * aggressive behaviour in sending new - * segments. - */ - tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh + - tcps->tcps_dupack_fast_retransmit * mss; - tcp->tcp_cwnd_cnt = tcp->tcp_cwnd; - flags |= TH_REXMIT_NEEDED; - } - } - } else { - tcp->tcp_dupack_cnt = 0; - if (tcp->tcp_rexmit) { - /* - * TCP is retranmitting. If the ACK ack's all - * outstanding data, update tcp_rexmit_max and - * tcp_rexmit_nxt. Otherwise, update tcp_rexmit_nxt - * to the correct value. - * - * Note that SEQ_LEQ() is used. This is to avoid - * unnecessary fast retransmit caused by dup ACKs - * received when TCP does slow start retransmission - * after a time out. During this phase, TCP may - * send out segments which are already received. - * This causes dup ACKs to be sent back. - */ - if (SEQ_LEQ(seg_ack, tcp->tcp_rexmit_max)) { - if (SEQ_GT(seg_ack, tcp->tcp_rexmit_nxt)) { - tcp->tcp_rexmit_nxt = seg_ack; - } - if (seg_ack != tcp->tcp_rexmit_max) { - flags |= TH_XMIT_NEEDED; - } - } else { - tcp->tcp_rexmit = B_FALSE; - tcp->tcp_rexmit_nxt = tcp->tcp_snxt; - tcp->tcp_snd_burst = tcp->tcp_localnet ? - TCP_CWND_INFINITE : TCP_CWND_NORMAL; - } - tcp->tcp_ms_we_have_waited = 0; - } - } - - BUMP_MIB(&tcps->tcps_mib, tcpInAckSegs); - UPDATE_MIB(&tcps->tcps_mib, tcpInAckBytes, bytes_acked); - tcp->tcp_suna = seg_ack; - if (tcp->tcp_zero_win_probe != 0) { - tcp->tcp_zero_win_probe = 0; - tcp->tcp_timer_backoff = 0; - } - - /* - * If tcp_xmit_head is NULL, then it must be the FIN being ack'ed. - * Note that it cannot be the SYN being ack'ed. The code flow - * will not reach here. - */ - if (mp1 == NULL) { - goto fin_acked; - } - - /* - * Update the congestion window. - * - * If TCP is not ECN capable or TCP is ECN capable but the - * congestion experience bit is not set, increase the tcp_cwnd as - * usual. 
- */ - if (!tcp->tcp_ecn_ok || !(flags & TH_ECE)) { - cwnd = tcp->tcp_cwnd; - add = mss; - - if (cwnd >= tcp->tcp_cwnd_ssthresh) { - /* - * This is to prevent an increase of less than 1 MSS of - * tcp_cwnd. With partial increase, tcp_wput_data() - * may send out tinygrams in order to preserve mblk - * boundaries. - * - * By initializing tcp_cwnd_cnt to new tcp_cwnd and - * decrementing it by 1 MSS for every ACKs, tcp_cwnd is - * increased by 1 MSS for every RTTs. - */ - if (tcp->tcp_cwnd_cnt <= 0) { - tcp->tcp_cwnd_cnt = cwnd + add; - } else { - tcp->tcp_cwnd_cnt -= add; - add = 0; - } - } - tcp->tcp_cwnd = MIN(cwnd + add, tcp->tcp_cwnd_max); - } - - /* See if the latest urgent data has been acknowledged */ - if ((tcp->tcp_valid_bits & TCP_URG_VALID) && - SEQ_GT(seg_ack, tcp->tcp_urg)) - tcp->tcp_valid_bits &= ~TCP_URG_VALID; - - /* Can we update the RTT estimates? */ - if (tcp->tcp_snd_ts_ok) { - /* Ignore zero timestamp echo-reply. */ - if (tcpopt.tcp_opt_ts_ecr != 0) { - tcp_set_rto(tcp, (int32_t)LBOLT_FASTPATH - - (int32_t)tcpopt.tcp_opt_ts_ecr); - } - - /* If needed, restart the timer. */ - if (tcp->tcp_set_timer == 1) { - TCP_TIMER_RESTART(tcp, tcp->tcp_rto); - tcp->tcp_set_timer = 0; - } - /* - * Update tcp_csuna in case the other side stops sending - * us timestamps. - */ - tcp->tcp_csuna = tcp->tcp_snxt; - } else if (SEQ_GT(seg_ack, tcp->tcp_csuna)) { - /* - * An ACK sequence we haven't seen before, so get the RTT - * and update the RTO. But first check if the timestamp is - * valid to use. - */ - if ((mp1->b_next != NULL) && - SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next))) - tcp_set_rto(tcp, (int32_t)LBOLT_FASTPATH - - (int32_t)(intptr_t)mp1->b_prev); - else - BUMP_MIB(&tcps->tcps_mib, tcpRttNoUpdate); - - /* Remeber the last sequence to be ACKed */ - tcp->tcp_csuna = seg_ack; - if (tcp->tcp_set_timer == 1) { - TCP_TIMER_RESTART(tcp, tcp->tcp_rto); - tcp->tcp_set_timer = 0; - } - } else { - BUMP_MIB(&tcps->tcps_mib, tcpRttNoUpdate); - } - - /* Eat acknowledged bytes off the xmit queue. */ - for (;;) { - mblk_t *mp2; - uchar_t *wptr; - - wptr = mp1->b_wptr; - ASSERT((uintptr_t)(wptr - mp1->b_rptr) <= (uintptr_t)INT_MAX); - bytes_acked -= (int)(wptr - mp1->b_rptr); - if (bytes_acked < 0) { - mp1->b_rptr = wptr + bytes_acked; - /* - * Set a new timestamp if all the bytes timed by the - * old timestamp have been ack'ed. - */ - if (SEQ_GT(seg_ack, - (uint32_t)(uintptr_t)(mp1->b_next))) { - mp1->b_prev = - (mblk_t *)(uintptr_t)LBOLT_FASTPATH; - mp1->b_next = NULL; - } - break; - } - mp1->b_next = NULL; - mp1->b_prev = NULL; - mp2 = mp1; - mp1 = mp1->b_cont; - - /* - * This notification is required for some zero-copy - * clients to maintain a copy semantic. After the data - * is ack'ed, client is safe to modify or reuse the buffer. - */ - if (tcp->tcp_snd_zcopy_aware && - (mp2->b_datap->db_struioflag & STRUIO_ZCNOTIFY)) - tcp_zcopy_notify(tcp); - freeb(mp2); - if (bytes_acked == 0) { - if (mp1 == NULL) { - /* Everything is ack'ed, clear the tail. */ - tcp->tcp_xmit_tail = NULL; - /* - * Cancel the timer unless we are still - * waiting for an ACK for the FIN packet. 
- */ - if (tcp->tcp_timer_tid != 0 && - tcp->tcp_snxt == tcp->tcp_suna) { - (void) TCP_TIMER_CANCEL(tcp, - tcp->tcp_timer_tid); - tcp->tcp_timer_tid = 0; - } - goto pre_swnd_update; - } - if (mp2 != tcp->tcp_xmit_tail) - break; - tcp->tcp_xmit_tail = mp1; - ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <= - (uintptr_t)INT_MAX); - tcp->tcp_xmit_tail_unsent = (int)(mp1->b_wptr - - mp1->b_rptr); - break; - } - if (mp1 == NULL) { - /* - * More was acked but there is nothing more - * outstanding. This means that the FIN was - * just acked or that we're talking to a clown. - */ -fin_acked: - ASSERT(tcp->tcp_fin_sent); - tcp->tcp_xmit_tail = NULL; - if (tcp->tcp_fin_sent) { - /* FIN was acked - making progress */ - if (!tcp->tcp_fin_acked) - tcp->tcp_ip_forward_progress = B_TRUE; - tcp->tcp_fin_acked = B_TRUE; - if (tcp->tcp_linger_tid != 0 && - TCP_TIMER_CANCEL(tcp, - tcp->tcp_linger_tid) >= 0) { - tcp_stop_lingering(tcp); - freemsg(mp); - mp = NULL; - } - } else { - /* - * We should never get here because - * we have already checked that the - * number of bytes ack'ed should be - * smaller than or equal to what we - * have sent so far (it is the - * acceptability check of the ACK). - * We can only get here if the send - * queue is corrupted. - * - * Terminate the connection and - * panic the system. It is better - * for us to panic instead of - * continuing to avoid other disaster. - */ - tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, - tcp->tcp_rnxt, TH_RST|TH_ACK); - panic("Memory corruption " - "detected for connection %s.", - tcp_display(tcp, NULL, - DISP_ADDR_AND_PORT)); - /*NOTREACHED*/ - } - goto pre_swnd_update; - } - ASSERT(mp2 != tcp->tcp_xmit_tail); - } - if (tcp->tcp_unsent) { - flags |= TH_XMIT_NEEDED; - } -pre_swnd_update: - tcp->tcp_xmit_head = mp1; -swnd_update: - /* - * The following check is different from most other implementations. - * For bi-directional transfer, when segments are dropped, the - * "normal" check will not accept a window update in those - * retransmitted segemnts. Failing to do that, TCP may send out - * segments which are outside receiver's window. As TCP accepts - * the ack in those retransmitted segments, if the window update in - * the same segment is not accepted, TCP will incorrectly calculates - * that it can send more segments. This can create a deadlock - * with the receiver if its window becomes zero. - */ - if (SEQ_LT(tcp->tcp_swl2, seg_ack) || - SEQ_LT(tcp->tcp_swl1, seg_seq) || - (tcp->tcp_swl1 == seg_seq && new_swnd > tcp->tcp_swnd)) { - /* - * The criteria for update is: - * - * 1. the segment acknowledges some data. Or - * 2. the segment is new, i.e. it has a higher seq num. Or - * 3. the segment is not old and the advertised window is - * larger than the previous advertised window. - */ - if (tcp->tcp_unsent && new_swnd > tcp->tcp_swnd) - flags |= TH_XMIT_NEEDED; - tcp->tcp_swnd = new_swnd; - if (new_swnd > tcp->tcp_max_swnd) - tcp->tcp_max_swnd = new_swnd; - tcp->tcp_swl1 = seg_seq; - tcp->tcp_swl2 = seg_ack; - } -est: - if (tcp->tcp_state > TCPS_ESTABLISHED) { - - switch (tcp->tcp_state) { - case TCPS_FIN_WAIT_1: - if (tcp->tcp_fin_acked) { - tcp->tcp_state = TCPS_FIN_WAIT_2; - /* - * We implement the non-standard BSD/SunOS - * FIN_WAIT_2 flushing algorithm. - * If there is no user attached to this - * TCP endpoint, then this TCP struct - * could hang around forever in FIN_WAIT_2 - * state if the peer forgets to send us - * a FIN. To prevent this, we wait only - * 2*MSL (a convenient time value) for - * the FIN to arrive. 
If it doesn't show up, - * we flush the TCP endpoint. This algorithm, - * though a violation of RFC-793, has worked - * for over 10 years in BSD systems. - * Note: SunOS 4.x waits 675 seconds before - * flushing the FIN_WAIT_2 connection. - */ - TCP_TIMER_RESTART(tcp, - tcps->tcps_fin_wait_2_flush_interval); - } - break; - case TCPS_FIN_WAIT_2: - break; /* Shutdown hook? */ - case TCPS_LAST_ACK: - freemsg(mp); - if (tcp->tcp_fin_acked) { - (void) tcp_clean_death(tcp, 0, 19); - return; - } - goto xmit_check; - case TCPS_CLOSING: - if (tcp->tcp_fin_acked) - SET_TIME_WAIT(tcps, tcp, connp); - /*FALLTHRU*/ - case TCPS_CLOSE_WAIT: - freemsg(mp); - goto xmit_check; - default: - ASSERT(tcp->tcp_state != TCPS_TIME_WAIT); - break; - } - } - if (flags & TH_FIN) { - /* Make sure we ack the fin */ - flags |= TH_ACK_NEEDED; - if (!tcp->tcp_fin_rcvd) { - tcp->tcp_fin_rcvd = B_TRUE; - tcp->tcp_rnxt++; - tcpha = tcp->tcp_tcpha; - tcpha->tha_ack = htonl(tcp->tcp_rnxt); - - /* - * Generate the ordrel_ind at the end unless we - * are an eager guy. - * In the eager case tcp_rsrv will do this when run - * after tcp_accept is done. - */ - if (tcp->tcp_listener == NULL && - !TCP_IS_DETACHED(tcp) && !tcp->tcp_hard_binding) - flags |= TH_ORDREL_NEEDED; - switch (tcp->tcp_state) { - case TCPS_SYN_RCVD: - case TCPS_ESTABLISHED: - tcp->tcp_state = TCPS_CLOSE_WAIT; - /* Keepalive? */ - break; - case TCPS_FIN_WAIT_1: - if (!tcp->tcp_fin_acked) { - tcp->tcp_state = TCPS_CLOSING; - break; - } - /* FALLTHRU */ - case TCPS_FIN_WAIT_2: - SET_TIME_WAIT(tcps, tcp, connp); - if (seg_len) { - /* - * implies data piggybacked on FIN. - * break to handle data. - */ - break; - } - freemsg(mp); - goto ack_check; - } - } - } - if (mp == NULL) - goto xmit_check; - if (seg_len == 0) { - freemsg(mp); - goto xmit_check; - } - if (mp->b_rptr == mp->b_wptr) { - /* - * The header has been consumed, so we remove the - * zero-length mblk here. - */ - mp1 = mp; - mp = mp->b_cont; - freeb(mp1); - } -update_ack: - tcpha = tcp->tcp_tcpha; - tcp->tcp_rack_cnt++; - { - uint32_t cur_max; - - cur_max = tcp->tcp_rack_cur_max; - if (tcp->tcp_rack_cnt >= cur_max) { - /* - * We have more unacked data than we should - send - * an ACK now. - */ - flags |= TH_ACK_NEEDED; - cur_max++; - if (cur_max > tcp->tcp_rack_abs_max) - tcp->tcp_rack_cur_max = tcp->tcp_rack_abs_max; - else - tcp->tcp_rack_cur_max = cur_max; - } else if (TCP_IS_DETACHED(tcp)) { - /* We don't have an ACK timer for detached TCP. */ - flags |= TH_ACK_NEEDED; - } else if (seg_len < mss) { - /* - * If we get a segment that is less than an mss, and we - * already have unacknowledged data, and the amount - * unacknowledged is not a multiple of mss, then we - * better generate an ACK now. Otherwise, this may be - * the tail piece of a transaction, and we would rather - * wait for the response. 
- */ - uint32_t udif; - ASSERT((uintptr_t)(tcp->tcp_rnxt - tcp->tcp_rack) <= - (uintptr_t)INT_MAX); - udif = (int)(tcp->tcp_rnxt - tcp->tcp_rack); - if (udif && (udif % mss)) - flags |= TH_ACK_NEEDED; - else - flags |= TH_ACK_TIMER_NEEDED; - } else { - /* Start delayed ack timer */ - flags |= TH_ACK_TIMER_NEEDED; - } - } - tcp->tcp_rnxt += seg_len; - tcpha->tha_ack = htonl(tcp->tcp_rnxt); - - if (mp == NULL) - goto xmit_check; - - /* Update SACK list */ - if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { - tcp_sack_remove(tcp->tcp_sack_list, tcp->tcp_rnxt, - &(tcp->tcp_num_sack_blk)); - } - - if (tcp->tcp_urp_mp) { - tcp->tcp_urp_mp->b_cont = mp; - mp = tcp->tcp_urp_mp; - tcp->tcp_urp_mp = NULL; - /* Ready for a new signal. */ - tcp->tcp_urp_last_valid = B_FALSE; -#ifdef DEBUG - (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, - "tcp_rput: sending exdata_ind %s", - tcp_display(tcp, NULL, DISP_PORT_ONLY)); -#endif /* DEBUG */ - } - - /* - * Check for ancillary data changes compared to last segment. - */ - if (connp->conn_recv_ancillary.crb_all != 0) { - mp = tcp_input_add_ancillary(tcp, mp, &ipp, ira); - if (mp == NULL) - return; - } - - if (tcp->tcp_listener != NULL || tcp->tcp_hard_binding) { - /* - * Side queue inbound data until the accept happens. - * tcp_accept/tcp_rput drains this when the accept happens. - * M_DATA is queued on b_cont. Otherwise (T_OPTDATA_IND or - * T_EXDATA_IND) it is queued on b_next. - * XXX Make urgent data use this. Requires: - * Removing tcp_listener check for TH_URG - * Making M_PCPROTO and MARK messages skip the eager case - */ - - if (tcp->tcp_kssl_pending) { - DTRACE_PROBE1(kssl_mblk__ksslinput_pending, - mblk_t *, mp); - tcp_kssl_input(tcp, mp, ira->ira_cred); - } else { - tcp_rcv_enqueue(tcp, mp, seg_len, ira->ira_cred); - } - } else if (IPCL_IS_NONSTR(connp)) { - /* - * Non-STREAMS socket - * - * Note that no KSSL processing is done here, because - * KSSL is not supported for non-STREAMS sockets. - */ - boolean_t push = flags & (TH_PUSH|TH_FIN); - int error; - - if ((*connp->conn_upcalls->su_recv)( - connp->conn_upper_handle, - mp, seg_len, 0, &error, &push) <= 0) { - /* - * We should never be in middle of a - * fallback, the squeue guarantees that. - */ - ASSERT(error != EOPNOTSUPP); - if (error == ENOSPC) - tcp->tcp_rwnd -= seg_len; - } else if (push) { - /* PUSH bit set and sockfs is not flow controlled */ - flags |= tcp_rwnd_reopen(tcp); - } - } else { - /* STREAMS socket */ - if (mp->b_datap->db_type != M_DATA || - (flags & TH_MARKNEXT_NEEDED)) { - if (tcp->tcp_rcv_list != NULL) { - flags |= tcp_rcv_drain(tcp); - } - ASSERT(tcp->tcp_rcv_list == NULL || - tcp->tcp_fused_sigurg); - - if (flags & TH_MARKNEXT_NEEDED) { -#ifdef DEBUG - (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, - "tcp_rput: sending MSGMARKNEXT %s", - tcp_display(tcp, NULL, - DISP_PORT_ONLY)); -#endif /* DEBUG */ - mp->b_flag |= MSGMARKNEXT; - flags &= ~TH_MARKNEXT_NEEDED; - } - - /* Does this need SSL processing first? */ - if ((tcp->tcp_kssl_ctx != NULL) && - (DB_TYPE(mp) == M_DATA)) { - DTRACE_PROBE1(kssl_mblk__ksslinput_data1, - mblk_t *, mp); - tcp_kssl_input(tcp, mp, ira->ira_cred); - } else { - if (is_system_labeled()) - tcp_setcred_data(mp, ira); - - putnext(connp->conn_rq, mp); - if (!canputnext(connp->conn_rq)) - tcp->tcp_rwnd -= seg_len; - } - } else if ((tcp->tcp_kssl_ctx != NULL) && - (DB_TYPE(mp) == M_DATA)) { - /* Does this need SSL processing first? 
*/ - DTRACE_PROBE1(kssl_mblk__ksslinput_data2, mblk_t *, mp); - tcp_kssl_input(tcp, mp, ira->ira_cred); - } else if ((flags & (TH_PUSH|TH_FIN)) || - tcp->tcp_rcv_cnt + seg_len >= connp->conn_rcvbuf >> 3) { - if (tcp->tcp_rcv_list != NULL) { - /* - * Enqueue the new segment first and then - * call tcp_rcv_drain() to send all data - * up. The other way to do this is to - * send all queued data up and then call - * putnext() to send the new segment up. - * This way can remove the else part later - * on. - * - * We don't do this to avoid one more call to - * canputnext() as tcp_rcv_drain() needs to - * call canputnext(). - */ - tcp_rcv_enqueue(tcp, mp, seg_len, - ira->ira_cred); - flags |= tcp_rcv_drain(tcp); - } else { - if (is_system_labeled()) - tcp_setcred_data(mp, ira); - - putnext(connp->conn_rq, mp); - if (!canputnext(connp->conn_rq)) - tcp->tcp_rwnd -= seg_len; - } - } else { - /* - * Enqueue all packets when processing an mblk - * from the co queue and also enqueue normal packets. - */ - tcp_rcv_enqueue(tcp, mp, seg_len, ira->ira_cred); - } - /* - * Make sure the timer is running if we have data waiting - * for a push bit. This provides resiliency against - * implementations that do not correctly generate push bits. - */ - if (tcp->tcp_rcv_list != NULL && tcp->tcp_push_tid == 0) { - /* - * The connection may be closed at this point, so don't - * do anything for a detached tcp. - */ - if (!TCP_IS_DETACHED(tcp)) - tcp->tcp_push_tid = TCP_TIMER(tcp, - tcp_push_timer, - MSEC_TO_TICK( - tcps->tcps_push_timer_interval)); - } - } - -xmit_check: - /* Is there anything left to do? */ - ASSERT(!(flags & TH_MARKNEXT_NEEDED)); - if ((flags & (TH_REXMIT_NEEDED|TH_XMIT_NEEDED|TH_ACK_NEEDED| - TH_NEED_SACK_REXMIT|TH_LIMIT_XMIT|TH_ACK_TIMER_NEEDED| - TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0) - goto done; - - /* Any transmit work to do and a non-zero window? */ - if ((flags & (TH_REXMIT_NEEDED|TH_XMIT_NEEDED|TH_NEED_SACK_REXMIT| - TH_LIMIT_XMIT)) && tcp->tcp_swnd != 0) { - if (flags & TH_REXMIT_NEEDED) { - uint32_t snd_size = tcp->tcp_snxt - tcp->tcp_suna; - - BUMP_MIB(&tcps->tcps_mib, tcpOutFastRetrans); - if (snd_size > mss) - snd_size = mss; - if (snd_size > tcp->tcp_swnd) - snd_size = tcp->tcp_swnd; - mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, snd_size, - NULL, NULL, tcp->tcp_suna, B_TRUE, &snd_size, - B_TRUE); - - if (mp1 != NULL) { - tcp->tcp_xmit_head->b_prev = - (mblk_t *)LBOLT_FASTPATH; - tcp->tcp_csuna = tcp->tcp_snxt; - BUMP_MIB(&tcps->tcps_mib, tcpRetransSegs); - UPDATE_MIB(&tcps->tcps_mib, - tcpRetransBytes, snd_size); - tcp_send_data(tcp, mp1); - } - } - if (flags & TH_NEED_SACK_REXMIT) { - tcp_sack_rxmit(tcp, &flags); - } - /* - * For TH_LIMIT_XMIT, tcp_wput_data() is called to send - * out new segment. Note that tcp_rexmit should not be - * set, otherwise TH_LIMIT_XMIT should not be set. - */ - if (flags & (TH_XMIT_NEEDED|TH_LIMIT_XMIT)) { - if (!tcp->tcp_rexmit) { - tcp_wput_data(tcp, NULL, B_FALSE); - } else { - tcp_ss_rexmit(tcp); - } - } - /* - * Adjust tcp_cwnd back to normal value after sending - * new data segments. - */ - if (flags & TH_LIMIT_XMIT) { - tcp->tcp_cwnd -= mss << (tcp->tcp_dupack_cnt - 1); - /* - * This will restart the timer. Restarting the - * timer is used to avoid a timeout before the - * limited transmitted segment's ACK gets back. - */ - if (tcp->tcp_xmit_head != NULL) - tcp->tcp_xmit_head->b_prev = - (mblk_t *)LBOLT_FASTPATH; - } - - /* Anything more to do? 
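Two small computations in the xmit_check block above are easy to restate in isolation: the fast-retransmit size is clamped to the outstanding data, one MSS, and the peer's send window, and after limited transmit the congestion window gives back exactly the inflation it received. A sketch under those assumptions (hypothetical names, not the illumos functions):

#include <stdint.h>

/* Fast-retransmit size: smallest of outstanding data, one MSS, send window. */
static uint32_t
fast_rexmit_size(uint32_t snxt, uint32_t suna, uint32_t mss, uint32_t swnd)
{
	uint32_t snd_size = snxt - suna;	/* bytes still unacknowledged */

	if (snd_size > mss)
		snd_size = mss;
	if (snd_size > swnd)
		snd_size = swnd;
	return (snd_size);
}

/*
 * After limited transmit, deflate cwnd by mss << (dupack_cnt - 1),
 * mirroring the adjustment in the removed block.
 */
static uint32_t
limited_xmit_restore(uint32_t cwnd, uint32_t mss, uint32_t dupack_cnt)
{
	return (cwnd - (mss << (dupack_cnt - 1)));
}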
*/ - if ((flags & (TH_ACK_NEEDED|TH_ACK_TIMER_NEEDED| - TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0) - goto done; - } -ack_check: - if (flags & TH_SEND_URP_MARK) { - ASSERT(tcp->tcp_urp_mark_mp); - ASSERT(!IPCL_IS_NONSTR(connp)); - /* - * Send up any queued data and then send the mark message - */ - if (tcp->tcp_rcv_list != NULL) { - flags |= tcp_rcv_drain(tcp); - - } - ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg); - mp1 = tcp->tcp_urp_mark_mp; - tcp->tcp_urp_mark_mp = NULL; - if (is_system_labeled()) - tcp_setcred_data(mp1, ira); - - putnext(connp->conn_rq, mp1); -#ifdef DEBUG - (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, - "tcp_rput: sending zero-length %s %s", - ((mp1->b_flag & MSGMARKNEXT) ? "MSGMARKNEXT" : - "MSGNOTMARKNEXT"), - tcp_display(tcp, NULL, DISP_PORT_ONLY)); -#endif /* DEBUG */ - flags &= ~TH_SEND_URP_MARK; - } - if (flags & TH_ACK_NEEDED) { - /* - * Time to send an ack for some reason. - */ - mp1 = tcp_ack_mp(tcp); - - if (mp1 != NULL) { - tcp_send_data(tcp, mp1); - BUMP_LOCAL(tcp->tcp_obsegs); - BUMP_MIB(&tcps->tcps_mib, tcpOutAck); - } - if (tcp->tcp_ack_tid != 0) { - (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid); - tcp->tcp_ack_tid = 0; - } - } - if (flags & TH_ACK_TIMER_NEEDED) { - /* - * Arrange for deferred ACK or push wait timeout. - * Start timer if it is not already running. - */ - if (tcp->tcp_ack_tid == 0) { - tcp->tcp_ack_tid = TCP_TIMER(tcp, tcp_ack_timer, - MSEC_TO_TICK(tcp->tcp_localnet ? - (clock_t)tcps->tcps_local_dack_interval : - (clock_t)tcps->tcps_deferred_ack_interval)); - } - } - if (flags & TH_ORDREL_NEEDED) { - /* - * Send up the ordrel_ind unless we are an eager guy. - * In the eager case tcp_rsrv will do this when run - * after tcp_accept is done. - */ - ASSERT(tcp->tcp_listener == NULL); - ASSERT(!tcp->tcp_detached); - - if (IPCL_IS_NONSTR(connp)) { - ASSERT(tcp->tcp_ordrel_mp == NULL); - tcp->tcp_ordrel_done = B_TRUE; - (*connp->conn_upcalls->su_opctl) - (connp->conn_upper_handle, SOCK_OPCTL_SHUT_RECV, 0); - goto done; - } - - if (tcp->tcp_rcv_list != NULL) { - /* - * Push any mblk(s) enqueued from co processing. - */ - flags |= tcp_rcv_drain(tcp); - } - ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg); - - mp1 = tcp->tcp_ordrel_mp; - tcp->tcp_ordrel_mp = NULL; - tcp->tcp_ordrel_done = B_TRUE; - putnext(connp->conn_rq, mp1); - } -done: - ASSERT(!(flags & TH_MARKNEXT_NEEDED)); -} - -/* - * This routine adjusts next-to-send sequence number variables, in the - * case where the reciever has shrunk it's window. - */ -static void -tcp_update_xmit_tail(tcp_t *tcp, uint32_t snxt) -{ - mblk_t *xmit_tail; - int32_t offset; - - tcp->tcp_snxt = snxt; - - /* Get the mblk, and the offset in it, as per the shrunk window */ - xmit_tail = tcp_get_seg_mp(tcp, snxt, &offset); - ASSERT(xmit_tail != NULL); - tcp->tcp_xmit_tail = xmit_tail; - tcp->tcp_xmit_tail_unsent = xmit_tail->b_wptr - - xmit_tail->b_rptr - offset; -} - -/* - * This function does PAWS protection check. Returns B_TRUE if the - * segment passes the PAWS test, else returns B_FALSE. - */ -boolean_t -tcp_paws_check(tcp_t *tcp, tcpha_t *tcpha, tcp_opt_t *tcpoptp) -{ - uint8_t flags; - int options; - uint8_t *up; - conn_t *connp = tcp->tcp_connp; - - flags = (unsigned int)tcpha->tha_flags & 0xFF; - /* - * If timestamp option is aligned nicely, get values inline, - * otherwise call general routine to parse. Only do that - * if timestamp is the only option. 
- */ - if (TCP_HDR_LENGTH(tcpha) == (uint32_t)TCP_MIN_HEADER_LENGTH + - TCPOPT_REAL_TS_LEN && - OK_32PTR((up = ((uint8_t *)tcpha) + - TCP_MIN_HEADER_LENGTH)) && - *(uint32_t *)up == TCPOPT_NOP_NOP_TSTAMP) { - tcpoptp->tcp_opt_ts_val = ABE32_TO_U32((up+4)); - tcpoptp->tcp_opt_ts_ecr = ABE32_TO_U32((up+8)); - - options = TCP_OPT_TSTAMP_PRESENT; - } else { - if (tcp->tcp_snd_sack_ok) { - tcpoptp->tcp = tcp; - } else { - tcpoptp->tcp = NULL; - } - options = tcp_parse_options(tcpha, tcpoptp); - } - - if (options & TCP_OPT_TSTAMP_PRESENT) { - /* - * Do PAWS per RFC 1323 section 4.2. Accept RST - * regardless of the timestamp, page 18 RFC 1323.bis. - */ - if ((flags & TH_RST) == 0 && - TSTMP_LT(tcpoptp->tcp_opt_ts_val, - tcp->tcp_ts_recent)) { - if (TSTMP_LT(LBOLT_FASTPATH64, - tcp->tcp_last_rcv_lbolt + PAWS_TIMEOUT)) { - /* This segment is not acceptable. */ - return (B_FALSE); - } else { - /* - * Connection has been idle for - * too long. Reset the timestamp - * and assume the segment is valid. - */ - tcp->tcp_ts_recent = - tcpoptp->tcp_opt_ts_val; - } - } - } else { - /* - * If we don't get a timestamp on every packet, we - * figure we can't really trust 'em, so we stop sending - * and parsing them. - */ - tcp->tcp_snd_ts_ok = B_FALSE; - - connp->conn_ht_iphc_len -= TCPOPT_REAL_TS_LEN; - connp->conn_ht_ulp_len -= TCPOPT_REAL_TS_LEN; - tcp->tcp_tcpha->tha_offset_and_reserved -= (3 << 4); - /* - * Adjust the tcp_mss and tcp_cwnd accordingly. We avoid - * doing a slow start here so as to not to lose on the - * transfer rate built up so far. - */ - tcp_mss_set(tcp, tcp->tcp_mss + TCPOPT_REAL_TS_LEN); - if (tcp->tcp_snd_sack_ok) { - ASSERT(tcp->tcp_sack_info != NULL); - tcp->tcp_max_sack_blk = 4; - } - } - return (B_TRUE); -} - -/* - * Attach ancillary data to a received TCP segments for the - * ancillary pieces requested by the application that are - * different than they were in the previous data segment. - * - * Save the "current" values once memory allocation is ok so that - * when memory allocation fails we can just wait for the next data segment. - */ -static mblk_t * -tcp_input_add_ancillary(tcp_t *tcp, mblk_t *mp, ip_pkt_t *ipp, - ip_recv_attr_t *ira) -{ - struct T_optdata_ind *todi; - int optlen; - uchar_t *optptr; - struct T_opthdr *toh; - crb_t addflag; /* Which pieces to add */ - mblk_t *mp1; - conn_t *connp = tcp->tcp_connp; - - optlen = 0; - addflag.crb_all = 0; - /* If app asked for pktinfo and the index has changed ... */ - if (connp->conn_recv_ancillary.crb_ip_recvpktinfo && - ira->ira_ruifindex != tcp->tcp_recvifindex) { - optlen += sizeof (struct T_opthdr) + - sizeof (struct in6_pktinfo); - addflag.crb_ip_recvpktinfo = 1; - } - /* If app asked for hoplimit and it has changed ... */ - if (connp->conn_recv_ancillary.crb_ipv6_recvhoplimit && - ipp->ipp_hoplimit != tcp->tcp_recvhops) { - optlen += sizeof (struct T_opthdr) + sizeof (uint_t); - addflag.crb_ipv6_recvhoplimit = 1; - } - /* If app asked for tclass and it has changed ... */ - if (connp->conn_recv_ancillary.crb_ipv6_recvtclass && - ipp->ipp_tclass != tcp->tcp_recvtclass) { - optlen += sizeof (struct T_opthdr) + sizeof (uint_t); - addflag.crb_ipv6_recvtclass = 1; - } - /* - * If app asked for hopbyhop headers and it has changed ... - * For security labels, note that (1) security labels can't change on - * a connected socket at all, (2) we're connected to at most one peer, - * (3) if anything changes, then it must be some other extra option. 
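The PAWS test above rejects a non-RST segment whose timestamp is serially older than tcp_ts_recent, unless the connection has been idle longer than the PAWS window, in which case the stale state is refreshed and the segment accepted. A compact standalone rendering of that rule, with a signed serial comparison standing in for TSTMP_LT and hypothetical names throughout:

#include <stdbool.h>
#include <stdint.h>

/* RFC 1323 serial comparison: is timestamp a strictly older than b? */
#define	TS_LT(a, b)	((int32_t)((a) - (b)) < 0)

struct paws_state {
	uint32_t	ts_recent;	/* most recent in-window timestamp */
	int64_t		last_rcv_ms;	/* arrival time of the last segment */
};

/*
 * Sketch of the PAWS acceptance test described above.  RST segments are
 * accepted regardless of their timestamp; a connection that has been
 * idle past the PAWS window refreshes ts_recent instead of rejecting.
 */
static bool
paws_accept(struct paws_state *ps, uint32_t ts_val, bool is_rst,
    int64_t now_ms, int64_t paws_timeout_ms)
{
	if (is_rst)
		return (true);
	if (TS_LT(ts_val, ps->ts_recent)) {
		if (now_ms - ps->last_rcv_ms < paws_timeout_ms)
			return (false);		/* stale timestamp: reject */
		ps->ts_recent = ts_val;		/* long idle: resynchronize */
	}
	return (true);
}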
- */ - if (connp->conn_recv_ancillary.crb_ipv6_recvhopopts && - ip_cmpbuf(tcp->tcp_hopopts, tcp->tcp_hopoptslen, - (ipp->ipp_fields & IPPF_HOPOPTS), - ipp->ipp_hopopts, ipp->ipp_hopoptslen)) { - optlen += sizeof (struct T_opthdr) + ipp->ipp_hopoptslen; - addflag.crb_ipv6_recvhopopts = 1; - if (!ip_allocbuf((void **)&tcp->tcp_hopopts, - &tcp->tcp_hopoptslen, (ipp->ipp_fields & IPPF_HOPOPTS), - ipp->ipp_hopopts, ipp->ipp_hopoptslen)) - return (mp); - } - /* If app asked for dst headers before routing headers ... */ - if (connp->conn_recv_ancillary.crb_ipv6_recvrthdrdstopts && - ip_cmpbuf(tcp->tcp_rthdrdstopts, tcp->tcp_rthdrdstoptslen, - (ipp->ipp_fields & IPPF_RTHDRDSTOPTS), - ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen)) { - optlen += sizeof (struct T_opthdr) + - ipp->ipp_rthdrdstoptslen; - addflag.crb_ipv6_recvrthdrdstopts = 1; - if (!ip_allocbuf((void **)&tcp->tcp_rthdrdstopts, - &tcp->tcp_rthdrdstoptslen, - (ipp->ipp_fields & IPPF_RTHDRDSTOPTS), - ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen)) - return (mp); - } - /* If app asked for routing headers and it has changed ... */ - if (connp->conn_recv_ancillary.crb_ipv6_recvrthdr && - ip_cmpbuf(tcp->tcp_rthdr, tcp->tcp_rthdrlen, - (ipp->ipp_fields & IPPF_RTHDR), - ipp->ipp_rthdr, ipp->ipp_rthdrlen)) { - optlen += sizeof (struct T_opthdr) + ipp->ipp_rthdrlen; - addflag.crb_ipv6_recvrthdr = 1; - if (!ip_allocbuf((void **)&tcp->tcp_rthdr, - &tcp->tcp_rthdrlen, (ipp->ipp_fields & IPPF_RTHDR), - ipp->ipp_rthdr, ipp->ipp_rthdrlen)) - return (mp); - } - /* If app asked for dest headers and it has changed ... */ - if ((connp->conn_recv_ancillary.crb_ipv6_recvdstopts || - connp->conn_recv_ancillary.crb_old_ipv6_recvdstopts) && - ip_cmpbuf(tcp->tcp_dstopts, tcp->tcp_dstoptslen, - (ipp->ipp_fields & IPPF_DSTOPTS), - ipp->ipp_dstopts, ipp->ipp_dstoptslen)) { - optlen += sizeof (struct T_opthdr) + ipp->ipp_dstoptslen; - addflag.crb_ipv6_recvdstopts = 1; - if (!ip_allocbuf((void **)&tcp->tcp_dstopts, - &tcp->tcp_dstoptslen, (ipp->ipp_fields & IPPF_DSTOPTS), - ipp->ipp_dstopts, ipp->ipp_dstoptslen)) - return (mp); - } - - if (optlen == 0) { - /* Nothing to add */ - return (mp); - } - mp1 = allocb(sizeof (struct T_optdata_ind) + optlen, BPRI_MED); - if (mp1 == NULL) { - /* - * Defer sending ancillary data until the next TCP segment - * arrives. - */ - return (mp); - } - mp1->b_cont = mp; - mp = mp1; - mp->b_wptr += sizeof (*todi) + optlen; - mp->b_datap->db_type = M_PROTO; - todi = (struct T_optdata_ind *)mp->b_rptr; - todi->PRIM_type = T_OPTDATA_IND; - todi->DATA_flag = 1; /* MORE data */ - todi->OPT_length = optlen; - todi->OPT_offset = sizeof (*todi); - optptr = (uchar_t *)&todi[1]; - /* - * If app asked for pktinfo and the index has changed ... - * Note that the local address never changes for the connection. - */ - if (addflag.crb_ip_recvpktinfo) { - struct in6_pktinfo *pkti; - uint_t ifindex; - - ifindex = ira->ira_ruifindex; - toh = (struct T_opthdr *)optptr; - toh->level = IPPROTO_IPV6; - toh->name = IPV6_PKTINFO; - toh->len = sizeof (*toh) + sizeof (*pkti); - toh->status = 0; - optptr += sizeof (*toh); - pkti = (struct in6_pktinfo *)optptr; - pkti->ipi6_addr = connp->conn_laddr_v6; - pkti->ipi6_ifindex = ifindex; - optptr += sizeof (*pkti); - ASSERT(OK_32PTR(optptr)); - /* Save as "last" value */ - tcp->tcp_recvifindex = ifindex; - } - /* If app asked for hoplimit and it has changed ... 
*/ - if (addflag.crb_ipv6_recvhoplimit) { - toh = (struct T_opthdr *)optptr; - toh->level = IPPROTO_IPV6; - toh->name = IPV6_HOPLIMIT; - toh->len = sizeof (*toh) + sizeof (uint_t); - toh->status = 0; - optptr += sizeof (*toh); - *(uint_t *)optptr = ipp->ipp_hoplimit; - optptr += sizeof (uint_t); - ASSERT(OK_32PTR(optptr)); - /* Save as "last" value */ - tcp->tcp_recvhops = ipp->ipp_hoplimit; - } - /* If app asked for tclass and it has changed ... */ - if (addflag.crb_ipv6_recvtclass) { - toh = (struct T_opthdr *)optptr; - toh->level = IPPROTO_IPV6; - toh->name = IPV6_TCLASS; - toh->len = sizeof (*toh) + sizeof (uint_t); - toh->status = 0; - optptr += sizeof (*toh); - *(uint_t *)optptr = ipp->ipp_tclass; - optptr += sizeof (uint_t); - ASSERT(OK_32PTR(optptr)); - /* Save as "last" value */ - tcp->tcp_recvtclass = ipp->ipp_tclass; - } - if (addflag.crb_ipv6_recvhopopts) { - toh = (struct T_opthdr *)optptr; - toh->level = IPPROTO_IPV6; - toh->name = IPV6_HOPOPTS; - toh->len = sizeof (*toh) + ipp->ipp_hopoptslen; - toh->status = 0; - optptr += sizeof (*toh); - bcopy((uchar_t *)ipp->ipp_hopopts, optptr, ipp->ipp_hopoptslen); - optptr += ipp->ipp_hopoptslen; - ASSERT(OK_32PTR(optptr)); - /* Save as last value */ - ip_savebuf((void **)&tcp->tcp_hopopts, &tcp->tcp_hopoptslen, - (ipp->ipp_fields & IPPF_HOPOPTS), - ipp->ipp_hopopts, ipp->ipp_hopoptslen); - } - if (addflag.crb_ipv6_recvrthdrdstopts) { - toh = (struct T_opthdr *)optptr; - toh->level = IPPROTO_IPV6; - toh->name = IPV6_RTHDRDSTOPTS; - toh->len = sizeof (*toh) + ipp->ipp_rthdrdstoptslen; - toh->status = 0; - optptr += sizeof (*toh); - bcopy(ipp->ipp_rthdrdstopts, optptr, ipp->ipp_rthdrdstoptslen); - optptr += ipp->ipp_rthdrdstoptslen; - ASSERT(OK_32PTR(optptr)); - /* Save as last value */ - ip_savebuf((void **)&tcp->tcp_rthdrdstopts, - &tcp->tcp_rthdrdstoptslen, - (ipp->ipp_fields & IPPF_RTHDRDSTOPTS), - ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen); - } - if (addflag.crb_ipv6_recvrthdr) { - toh = (struct T_opthdr *)optptr; - toh->level = IPPROTO_IPV6; - toh->name = IPV6_RTHDR; - toh->len = sizeof (*toh) + ipp->ipp_rthdrlen; - toh->status = 0; - optptr += sizeof (*toh); - bcopy(ipp->ipp_rthdr, optptr, ipp->ipp_rthdrlen); - optptr += ipp->ipp_rthdrlen; - ASSERT(OK_32PTR(optptr)); - /* Save as last value */ - ip_savebuf((void **)&tcp->tcp_rthdr, &tcp->tcp_rthdrlen, - (ipp->ipp_fields & IPPF_RTHDR), - ipp->ipp_rthdr, ipp->ipp_rthdrlen); - } - if (addflag.crb_ipv6_recvdstopts) { - toh = (struct T_opthdr *)optptr; - toh->level = IPPROTO_IPV6; - toh->name = IPV6_DSTOPTS; - toh->len = sizeof (*toh) + ipp->ipp_dstoptslen; - toh->status = 0; - optptr += sizeof (*toh); - bcopy(ipp->ipp_dstopts, optptr, ipp->ipp_dstoptslen); - optptr += ipp->ipp_dstoptslen; - ASSERT(OK_32PTR(optptr)); - /* Save as last value */ - ip_savebuf((void **)&tcp->tcp_dstopts, &tcp->tcp_dstoptslen, - (ipp->ipp_fields & IPPF_DSTOPTS), - ipp->ipp_dstopts, ipp->ipp_dstoptslen); - } - ASSERT(optptr == mp->b_wptr); - return (mp); -} - -/* ARGSUSED */ -static void -tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) -{ - conn_t *connp = (conn_t *)arg; - tcp_t *tcp = connp->conn_tcp; - queue_t *q = connp->conn_rq; - tcp_stack_t *tcps = tcp->tcp_tcps; - - ASSERT(!IPCL_IS_NONSTR(connp)); - mutex_enter(&tcp->tcp_rsrv_mp_lock); - tcp->tcp_rsrv_mp = mp; - mutex_exit(&tcp->tcp_rsrv_mp_lock); - - TCP_STAT(tcps, tcp_rsrv_calls); - - if (TCP_IS_DETACHED(tcp) || q == NULL) { - return; - } - - if (tcp->tcp_fused) { - tcp_fuse_backenable(tcp); - return; - } - - if 
(canputnext(q)) { - /* Not flow-controlled, open rwnd */ - tcp->tcp_rwnd = connp->conn_rcvbuf; - - /* - * Send back a window update immediately if TCP is above - * ESTABLISHED state and the increase of the rcv window - * that the other side knows is at least 1 MSS after flow - * control is lifted. - */ - if (tcp->tcp_state >= TCPS_ESTABLISHED && - tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) { - tcp_xmit_ctl(NULL, tcp, - (tcp->tcp_swnd == 0) ? tcp->tcp_suna : - tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK); - } - } -} - -/* - * The read side service routine is called mostly when we get back-enabled as a - * result of flow control relief. Since we don't actually queue anything in - * TCP, we have no data to send out of here. What we do is clear the receive - * window, and send out a window update. - */ -static void -tcp_rsrv(queue_t *q) -{ - conn_t *connp = Q_TO_CONN(q); - tcp_t *tcp = connp->conn_tcp; - mblk_t *mp; - - /* No code does a putq on the read side */ - ASSERT(q->q_first == NULL); - - /* - * If tcp->tcp_rsrv_mp == NULL, it means that tcp_rsrv() has already - * been run. So just return. - */ - mutex_enter(&tcp->tcp_rsrv_mp_lock); - if ((mp = tcp->tcp_rsrv_mp) == NULL) { - mutex_exit(&tcp->tcp_rsrv_mp_lock); - return; - } - tcp->tcp_rsrv_mp = NULL; - mutex_exit(&tcp->tcp_rsrv_mp_lock); - - CONN_INC_REF(connp); - SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_rsrv_input, connp, - NULL, SQ_PROCESS, SQTAG_TCP_RSRV); -} - /* * tcp_rwnd_set() is called to adjust the receive window to a desired value. * We do not allow the receive window to shrink. After setting rwnd, @@ -13218,805 +3343,7 @@ tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd) return (rwnd); } -/* - * Return SNMP stuff in buffer in mpdata. - */ -mblk_t * -tcp_snmp_get(queue_t *q, mblk_t *mpctl) -{ - mblk_t *mpdata; - mblk_t *mp_conn_ctl = NULL; - mblk_t *mp_conn_tail; - mblk_t *mp_attr_ctl = NULL; - mblk_t *mp_attr_tail; - mblk_t *mp6_conn_ctl = NULL; - mblk_t *mp6_conn_tail; - mblk_t *mp6_attr_ctl = NULL; - mblk_t *mp6_attr_tail; - struct opthdr *optp; - mib2_tcpConnEntry_t tce; - mib2_tcp6ConnEntry_t tce6; - mib2_transportMLPEntry_t mlp; - connf_t *connfp; - int i; - boolean_t ispriv; - zoneid_t zoneid; - int v4_conn_idx; - int v6_conn_idx; - conn_t *connp = Q_TO_CONN(q); - tcp_stack_t *tcps; - ip_stack_t *ipst; - mblk_t *mp2ctl; - - /* - * make a copy of the original message - */ - mp2ctl = copymsg(mpctl); - - if (mpctl == NULL || - (mpdata = mpctl->b_cont) == NULL || - (mp_conn_ctl = copymsg(mpctl)) == NULL || - (mp_attr_ctl = copymsg(mpctl)) == NULL || - (mp6_conn_ctl = copymsg(mpctl)) == NULL || - (mp6_attr_ctl = copymsg(mpctl)) == NULL) { - freemsg(mp_conn_ctl); - freemsg(mp_attr_ctl); - freemsg(mp6_conn_ctl); - freemsg(mp6_attr_ctl); - freemsg(mpctl); - freemsg(mp2ctl); - return (NULL); - } - - ipst = connp->conn_netstack->netstack_ip; - tcps = connp->conn_netstack->netstack_tcp; - - /* build table of connections -- need count in fixed part */ - SET_MIB(tcps->tcps_mib.tcpRtoAlgorithm, 4); /* vanj */ - SET_MIB(tcps->tcps_mib.tcpRtoMin, tcps->tcps_rexmit_interval_min); - SET_MIB(tcps->tcps_mib.tcpRtoMax, tcps->tcps_rexmit_interval_max); - SET_MIB(tcps->tcps_mib.tcpMaxConn, -1); - SET_MIB(tcps->tcps_mib.tcpCurrEstab, 0); - - ispriv = - secpolicy_ip_config((Q_TO_CONN(q))->conn_cred, B_TRUE) == 0; - zoneid = Q_TO_CONN(q)->conn_zoneid; - - v4_conn_idx = v6_conn_idx = 0; - mp_conn_tail = mp_attr_tail = mp6_conn_tail = mp6_attr_tail = NULL; - - for (i = 0; i < CONN_G_HASH_SIZE; i++) { - ipst = tcps->tcps_netstack->netstack_ip; - - connfp = 
&ipst->ips_ipcl_globalhash_fanout[i]; - - connp = NULL; - - while ((connp = - ipcl_get_next_conn(connfp, connp, IPCL_TCPCONN)) != NULL) { - tcp_t *tcp; - boolean_t needattr; - - if (connp->conn_zoneid != zoneid) - continue; /* not in this zone */ - - tcp = connp->conn_tcp; - UPDATE_MIB(&tcps->tcps_mib, - tcpHCInSegs, tcp->tcp_ibsegs); - tcp->tcp_ibsegs = 0; - UPDATE_MIB(&tcps->tcps_mib, - tcpHCOutSegs, tcp->tcp_obsegs); - tcp->tcp_obsegs = 0; - - tce6.tcp6ConnState = tce.tcpConnState = - tcp_snmp_state(tcp); - if (tce.tcpConnState == MIB2_TCP_established || - tce.tcpConnState == MIB2_TCP_closeWait) - BUMP_MIB(&tcps->tcps_mib, tcpCurrEstab); - - needattr = B_FALSE; - bzero(&mlp, sizeof (mlp)); - if (connp->conn_mlp_type != mlptSingle) { - if (connp->conn_mlp_type == mlptShared || - connp->conn_mlp_type == mlptBoth) - mlp.tme_flags |= MIB2_TMEF_SHARED; - if (connp->conn_mlp_type == mlptPrivate || - connp->conn_mlp_type == mlptBoth) - mlp.tme_flags |= MIB2_TMEF_PRIVATE; - needattr = B_TRUE; - } - if (connp->conn_anon_mlp) { - mlp.tme_flags |= MIB2_TMEF_ANONMLP; - needattr = B_TRUE; - } - switch (connp->conn_mac_mode) { - case CONN_MAC_DEFAULT: - break; - case CONN_MAC_AWARE: - mlp.tme_flags |= MIB2_TMEF_MACEXEMPT; - needattr = B_TRUE; - break; - case CONN_MAC_IMPLICIT: - mlp.tme_flags |= MIB2_TMEF_MACIMPLICIT; - needattr = B_TRUE; - break; - } - if (connp->conn_ixa->ixa_tsl != NULL) { - ts_label_t *tsl; - - tsl = connp->conn_ixa->ixa_tsl; - mlp.tme_flags |= MIB2_TMEF_IS_LABELED; - mlp.tme_doi = label2doi(tsl); - mlp.tme_label = *label2bslabel(tsl); - needattr = B_TRUE; - } - - /* Create a message to report on IPv6 entries */ - if (connp->conn_ipversion == IPV6_VERSION) { - tce6.tcp6ConnLocalAddress = connp->conn_laddr_v6; - tce6.tcp6ConnRemAddress = connp->conn_faddr_v6; - tce6.tcp6ConnLocalPort = ntohs(connp->conn_lport); - tce6.tcp6ConnRemPort = ntohs(connp->conn_fport); - if (connp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET) { - tce6.tcp6ConnIfIndex = - connp->conn_ixa->ixa_scopeid; - } else { - tce6.tcp6ConnIfIndex = connp->conn_bound_if; - } - /* Don't want just anybody seeing these... */ - if (ispriv) { - tce6.tcp6ConnEntryInfo.ce_snxt = - tcp->tcp_snxt; - tce6.tcp6ConnEntryInfo.ce_suna = - tcp->tcp_suna; - tce6.tcp6ConnEntryInfo.ce_rnxt = - tcp->tcp_rnxt; - tce6.tcp6ConnEntryInfo.ce_rack = - tcp->tcp_rack; - } else { - /* - * Netstat, unfortunately, uses this to - * get send/receive queue sizes. How to fix? - * Why not compute the difference only? - */ - tce6.tcp6ConnEntryInfo.ce_snxt = - tcp->tcp_snxt - tcp->tcp_suna; - tce6.tcp6ConnEntryInfo.ce_suna = 0; - tce6.tcp6ConnEntryInfo.ce_rnxt = - tcp->tcp_rnxt - tcp->tcp_rack; - tce6.tcp6ConnEntryInfo.ce_rack = 0; - } - - tce6.tcp6ConnEntryInfo.ce_swnd = tcp->tcp_swnd; - tce6.tcp6ConnEntryInfo.ce_rwnd = tcp->tcp_rwnd; - tce6.tcp6ConnEntryInfo.ce_rto = tcp->tcp_rto; - tce6.tcp6ConnEntryInfo.ce_mss = tcp->tcp_mss; - tce6.tcp6ConnEntryInfo.ce_state = tcp->tcp_state; - - tce6.tcp6ConnCreationProcess = - (connp->conn_cpid < 0) ? MIB2_UNKNOWN_PROCESS : - connp->conn_cpid; - tce6.tcp6ConnCreationTime = connp->conn_open_time; - - (void) snmp_append_data2(mp6_conn_ctl->b_cont, - &mp6_conn_tail, (char *)&tce6, sizeof (tce6)); - - mlp.tme_connidx = v6_conn_idx++; - if (needattr) - (void) snmp_append_data2(mp6_attr_ctl->b_cont, - &mp6_attr_tail, (char *)&mlp, sizeof (mlp)); - } - /* - * Create an IPv4 table entry for IPv4 entries and also - * for IPv6 entries which are bound to in6addr_any - * but don't have IPV6_V6ONLY set. - * (i.e. 
anything an IPv4 peer could connect to) - */ - if (connp->conn_ipversion == IPV4_VERSION || - (tcp->tcp_state <= TCPS_LISTEN && - !connp->conn_ipv6_v6only && - IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6))) { - if (connp->conn_ipversion == IPV6_VERSION) { - tce.tcpConnRemAddress = INADDR_ANY; - tce.tcpConnLocalAddress = INADDR_ANY; - } else { - tce.tcpConnRemAddress = - connp->conn_faddr_v4; - tce.tcpConnLocalAddress = - connp->conn_laddr_v4; - } - tce.tcpConnLocalPort = ntohs(connp->conn_lport); - tce.tcpConnRemPort = ntohs(connp->conn_fport); - /* Don't want just anybody seeing these... */ - if (ispriv) { - tce.tcpConnEntryInfo.ce_snxt = - tcp->tcp_snxt; - tce.tcpConnEntryInfo.ce_suna = - tcp->tcp_suna; - tce.tcpConnEntryInfo.ce_rnxt = - tcp->tcp_rnxt; - tce.tcpConnEntryInfo.ce_rack = - tcp->tcp_rack; - } else { - /* - * Netstat, unfortunately, uses this to - * get send/receive queue sizes. How - * to fix? - * Why not compute the difference only? - */ - tce.tcpConnEntryInfo.ce_snxt = - tcp->tcp_snxt - tcp->tcp_suna; - tce.tcpConnEntryInfo.ce_suna = 0; - tce.tcpConnEntryInfo.ce_rnxt = - tcp->tcp_rnxt - tcp->tcp_rack; - tce.tcpConnEntryInfo.ce_rack = 0; - } - - tce.tcpConnEntryInfo.ce_swnd = tcp->tcp_swnd; - tce.tcpConnEntryInfo.ce_rwnd = tcp->tcp_rwnd; - tce.tcpConnEntryInfo.ce_rto = tcp->tcp_rto; - tce.tcpConnEntryInfo.ce_mss = tcp->tcp_mss; - tce.tcpConnEntryInfo.ce_state = - tcp->tcp_state; - - tce.tcpConnCreationProcess = - (connp->conn_cpid < 0) ? - MIB2_UNKNOWN_PROCESS : - connp->conn_cpid; - tce.tcpConnCreationTime = connp->conn_open_time; - - (void) snmp_append_data2(mp_conn_ctl->b_cont, - &mp_conn_tail, (char *)&tce, sizeof (tce)); - - mlp.tme_connidx = v4_conn_idx++; - if (needattr) - (void) snmp_append_data2( - mp_attr_ctl->b_cont, - &mp_attr_tail, (char *)&mlp, - sizeof (mlp)); - } - } - } - - /* fixed length structure for IPv4 and IPv6 counters */ - SET_MIB(tcps->tcps_mib.tcpConnTableSize, sizeof (mib2_tcpConnEntry_t)); - SET_MIB(tcps->tcps_mib.tcp6ConnTableSize, - sizeof (mib2_tcp6ConnEntry_t)); - /* synchronize 32- and 64-bit counters */ - SYNC32_MIB(&tcps->tcps_mib, tcpInSegs, tcpHCInSegs); - SYNC32_MIB(&tcps->tcps_mib, tcpOutSegs, tcpHCOutSegs); - optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; - optp->level = MIB2_TCP; - optp->name = 0; - (void) snmp_append_data(mpdata, (char *)&tcps->tcps_mib, - sizeof (tcps->tcps_mib)); - optp->len = msgdsize(mpdata); - qreply(q, mpctl); - - /* table of connections... */ - optp = (struct opthdr *)&mp_conn_ctl->b_rptr[ - sizeof (struct T_optmgmt_ack)]; - optp->level = MIB2_TCP; - optp->name = MIB2_TCP_CONN; - optp->len = msgdsize(mp_conn_ctl->b_cont); - qreply(q, mp_conn_ctl); - - /* table of MLP attributes... */ - optp = (struct opthdr *)&mp_attr_ctl->b_rptr[ - sizeof (struct T_optmgmt_ack)]; - optp->level = MIB2_TCP; - optp->name = EXPER_XPORT_MLP; - optp->len = msgdsize(mp_attr_ctl->b_cont); - if (optp->len == 0) - freemsg(mp_attr_ctl); - else - qreply(q, mp_attr_ctl); - - /* table of IPv6 connections... */ - optp = (struct opthdr *)&mp6_conn_ctl->b_rptr[ - sizeof (struct T_optmgmt_ack)]; - optp->level = MIB2_TCP6; - optp->name = MIB2_TCP6_CONN; - optp->len = msgdsize(mp6_conn_ctl->b_cont); - qreply(q, mp6_conn_ctl); - - /* table of IPv6 MLP attributes... 
*/ - optp = (struct opthdr *)&mp6_attr_ctl->b_rptr[ - sizeof (struct T_optmgmt_ack)]; - optp->level = MIB2_TCP6; - optp->name = EXPER_XPORT_MLP; - optp->len = msgdsize(mp6_attr_ctl->b_cont); - if (optp->len == 0) - freemsg(mp6_attr_ctl); - else - qreply(q, mp6_attr_ctl); - return (mp2ctl); -} - -/* Return 0 if invalid set request, 1 otherwise, including non-tcp requests */ -/* ARGSUSED */ int -tcp_snmp_set(queue_t *q, int level, int name, uchar_t *ptr, int len) -{ - mib2_tcpConnEntry_t *tce = (mib2_tcpConnEntry_t *)ptr; - - switch (level) { - case MIB2_TCP: - switch (name) { - case 13: - if (tce->tcpConnState != MIB2_TCP_deleteTCB) - return (0); - /* TODO: delete entry defined by tce */ - return (1); - default: - return (0); - } - default: - return (1); - } -} - -/* Translate TCP state to MIB2 TCP state. */ -static int -tcp_snmp_state(tcp_t *tcp) -{ - if (tcp == NULL) - return (0); - - switch (tcp->tcp_state) { - case TCPS_CLOSED: - case TCPS_IDLE: /* RFC1213 doesn't have analogue for IDLE & BOUND */ - case TCPS_BOUND: - return (MIB2_TCP_closed); - case TCPS_LISTEN: - return (MIB2_TCP_listen); - case TCPS_SYN_SENT: - return (MIB2_TCP_synSent); - case TCPS_SYN_RCVD: - return (MIB2_TCP_synReceived); - case TCPS_ESTABLISHED: - return (MIB2_TCP_established); - case TCPS_CLOSE_WAIT: - return (MIB2_TCP_closeWait); - case TCPS_FIN_WAIT_1: - return (MIB2_TCP_finWait1); - case TCPS_CLOSING: - return (MIB2_TCP_closing); - case TCPS_LAST_ACK: - return (MIB2_TCP_lastAck); - case TCPS_FIN_WAIT_2: - return (MIB2_TCP_finWait2); - case TCPS_TIME_WAIT: - return (MIB2_TCP_timeWait); - default: - return (0); - } -} - -/* - * tcp_timer is the timer service routine. It handles the retransmission, - * FIN_WAIT_2 flush, and zero window probe timeout events. It figures out - * from the state of the tcp instance what kind of action needs to be done - * at the time it is called. - */ -static void -tcp_timer(void *arg) -{ - mblk_t *mp; - clock_t first_threshold; - clock_t second_threshold; - clock_t ms; - uint32_t mss; - conn_t *connp = (conn_t *)arg; - tcp_t *tcp = connp->conn_tcp; - tcp_stack_t *tcps = tcp->tcp_tcps; - - tcp->tcp_timer_tid = 0; - - if (tcp->tcp_fused) - return; - - first_threshold = tcp->tcp_first_timer_threshold; - second_threshold = tcp->tcp_second_timer_threshold; - switch (tcp->tcp_state) { - case TCPS_IDLE: - case TCPS_BOUND: - case TCPS_LISTEN: - return; - case TCPS_SYN_RCVD: { - tcp_t *listener = tcp->tcp_listener; - - if (tcp->tcp_syn_rcvd_timeout == 0 && (listener != NULL)) { - /* it's our first timeout */ - tcp->tcp_syn_rcvd_timeout = 1; - mutex_enter(&listener->tcp_eager_lock); - listener->tcp_syn_rcvd_timeout++; - if (!tcp->tcp_dontdrop && !tcp->tcp_closemp_used) { - /* - * Make this eager available for drop if we - * need to drop one to accomodate a new - * incoming SYN request. - */ - MAKE_DROPPABLE(listener, tcp); - } - if (!listener->tcp_syn_defense && - (listener->tcp_syn_rcvd_timeout > - (tcps->tcps_conn_req_max_q0 >> 2)) && - (tcps->tcps_conn_req_max_q0 > 200)) { - /* We may be under attack. Put on a defense. */ - listener->tcp_syn_defense = B_TRUE; - cmn_err(CE_WARN, "High TCP connect timeout " - "rate! 
System (port %d) may be under a " - "SYN flood attack!", - ntohs(listener->tcp_connp->conn_lport)); - - listener->tcp_ip_addr_cache = kmem_zalloc( - IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t), - KM_NOSLEEP); - } - mutex_exit(&listener->tcp_eager_lock); - } else if (listener != NULL) { - mutex_enter(&listener->tcp_eager_lock); - tcp->tcp_syn_rcvd_timeout++; - if (tcp->tcp_syn_rcvd_timeout > 1 && - !tcp->tcp_closemp_used) { - /* - * This is our second timeout. Put the tcp in - * the list of droppable eagers to allow it to - * be dropped, if needed. We don't check - * whether tcp_dontdrop is set or not to - * protect ourselve from a SYN attack where a - * remote host can spoof itself as one of the - * good IP source and continue to hold - * resources too long. - */ - MAKE_DROPPABLE(listener, tcp); - } - mutex_exit(&listener->tcp_eager_lock); - } - } - /* FALLTHRU */ - case TCPS_SYN_SENT: - first_threshold = tcp->tcp_first_ctimer_threshold; - second_threshold = tcp->tcp_second_ctimer_threshold; - break; - case TCPS_ESTABLISHED: - case TCPS_FIN_WAIT_1: - case TCPS_CLOSING: - case TCPS_CLOSE_WAIT: - case TCPS_LAST_ACK: - /* If we have data to rexmit */ - if (tcp->tcp_suna != tcp->tcp_snxt) { - clock_t time_to_wait; - - BUMP_MIB(&tcps->tcps_mib, tcpTimRetrans); - if (!tcp->tcp_xmit_head) - break; - time_to_wait = ddi_get_lbolt() - - (clock_t)tcp->tcp_xmit_head->b_prev; - time_to_wait = tcp->tcp_rto - - TICK_TO_MSEC(time_to_wait); - /* - * If the timer fires too early, 1 clock tick earlier, - * restart the timer. - */ - if (time_to_wait > msec_per_tick) { - TCP_STAT(tcps, tcp_timer_fire_early); - TCP_TIMER_RESTART(tcp, time_to_wait); - return; - } - /* - * When we probe zero windows, we force the swnd open. - * If our peer acks with a closed window swnd will be - * set to zero by tcp_rput(). As long as we are - * receiving acks tcp_rput will - * reset 'tcp_ms_we_have_waited' so as not to trip the - * first and second interval actions. NOTE: the timer - * interval is allowed to continue its exponential - * backoff. - */ - if (tcp->tcp_swnd == 0 || tcp->tcp_zero_win_probe) { - if (connp->conn_debug) { - (void) strlog(TCP_MOD_ID, 0, 1, - SL_TRACE, "tcp_timer: zero win"); - } - } else { - /* - * After retransmission, we need to do - * slow start. Set the ssthresh to one - * half of current effective window and - * cwnd to one MSS. Also reset - * tcp_cwnd_cnt. - * - * Note that if tcp_ssthresh is reduced because - * of ECN, do not reduce it again unless it is - * already one window of data away (tcp_cwr - * should then be cleared) or this is a - * timeout for a retransmitted segment. - */ - uint32_t npkt; - - if (!tcp->tcp_cwr || tcp->tcp_rexmit) { - npkt = ((tcp->tcp_timer_backoff ? - tcp->tcp_cwnd_ssthresh : - tcp->tcp_snxt - - tcp->tcp_suna) >> 1) / tcp->tcp_mss; - tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * - tcp->tcp_mss; - } - tcp->tcp_cwnd = tcp->tcp_mss; - tcp->tcp_cwnd_cnt = 0; - if (tcp->tcp_ecn_ok) { - tcp->tcp_cwr = B_TRUE; - tcp->tcp_cwr_snd_max = tcp->tcp_snxt; - tcp->tcp_ecn_cwr_sent = B_FALSE; - } - } - break; - } - /* - * We have something to send yet we cannot send. The - * reason can be: - * - * 1. Zero send window: we need to do zero window probe. - * 2. Zero cwnd: because of ECN, we need to "clock out - * segments. - * 3. SWS avoidance: receiver may have shrunk window, - * reset our knowledge. - * - * Note that condition 2 can happen with either 1 or - * 3. But 1 and 3 are exclusive. - */ - if (tcp->tcp_unsent != 0) { - /* - * Should not hold the zero-copy messages for too long. 
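The timeout path above re-enters slow start: ssthresh becomes half of the effective window, measured in whole segments with a floor of two, and cwnd collapses to one MSS (subject to the ECN/CWR caveat in the comment). A minimal sketch of that arithmetic, assuming illustrative field names:

#include <stdint.h>

struct cc_state {
	uint32_t cwnd;		/* congestion window, bytes */
	uint32_t ssthresh;	/* slow-start threshold, bytes */
	uint32_t cwnd_cnt;	/* congestion-avoidance byte counter */
};

/*
 * Slow-start re-entry on a retransmission timeout: halve the effective
 * window (in whole segments, minimum two) to get the new ssthresh, then
 * restart from a single MSS.  On repeated timeouts the previous
 * ssthresh is halved instead of the flight size.
 */
static void
rto_enter_slow_start(struct cc_state *cc, uint32_t snxt, uint32_t suna,
    uint32_t mss, int first_timeout)
{
	uint32_t window = first_timeout ? (snxt - suna) : cc->ssthresh;
	uint32_t npkt = (window >> 1) / mss;

	cc->ssthresh = (npkt > 2 ? npkt : 2) * mss;
	cc->cwnd = mss;
	cc->cwnd_cnt = 0;
}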
- */ - if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean) - tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp, - tcp->tcp_xmit_head, B_TRUE); - - if (tcp->tcp_cwnd == 0) { - /* - * Set tcp_cwnd to 1 MSS so that a - * new segment can be sent out. We - * are "clocking out" new data when - * the network is really congested. - */ - ASSERT(tcp->tcp_ecn_ok); - tcp->tcp_cwnd = tcp->tcp_mss; - } - if (tcp->tcp_swnd == 0) { - /* Extend window for zero window probe */ - tcp->tcp_swnd++; - tcp->tcp_zero_win_probe = B_TRUE; - BUMP_MIB(&tcps->tcps_mib, tcpOutWinProbe); - } else { - /* - * Handle timeout from sender SWS avoidance. - * Reset our knowledge of the max send window - * since the receiver might have reduced its - * receive buffer. Avoid setting tcp_max_swnd - * to one since that will essentially disable - * the SWS checks. - * - * Note that since we don't have a SWS - * state variable, if the timeout is set - * for ECN but not for SWS, this - * code will also be executed. This is - * fine as tcp_max_swnd is updated - * constantly and it will not affect - * anything. - */ - tcp->tcp_max_swnd = MAX(tcp->tcp_swnd, 2); - } - tcp_wput_data(tcp, NULL, B_FALSE); - return; - } - /* Is there a FIN that needs to be to re retransmitted? */ - if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && - !tcp->tcp_fin_acked) - break; - /* Nothing to do, return without restarting timer. */ - TCP_STAT(tcps, tcp_timer_fire_miss); - return; - case TCPS_FIN_WAIT_2: - /* - * User closed the TCP endpoint and peer ACK'ed our FIN. - * We waited some time for for peer's FIN, but it hasn't - * arrived. We flush the connection now to avoid - * case where the peer has rebooted. - */ - if (TCP_IS_DETACHED(tcp)) { - (void) tcp_clean_death(tcp, 0, 23); - } else { - TCP_TIMER_RESTART(tcp, - tcps->tcps_fin_wait_2_flush_interval); - } - return; - case TCPS_TIME_WAIT: - (void) tcp_clean_death(tcp, 0, 24); - return; - default: - if (connp->conn_debug) { - (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, - "tcp_timer: strange state (%d) %s", - tcp->tcp_state, tcp_display(tcp, NULL, - DISP_PORT_ONLY)); - } - return; - } - - /* - * If the system is under memory pressure or the max number of - * connections have been established for the listener, be more - * aggressive in aborting connections. - */ - if (tcps->tcps_reclaim || (tcp->tcp_listen_cnt != NULL && - tcp->tcp_listen_cnt->tlc_cnt > tcp->tcp_listen_cnt->tlc_max)) { - second_threshold = tcp_early_abort * SECONDS; - } - - if ((ms = tcp->tcp_ms_we_have_waited) > second_threshold) { - /* - * Should not hold the zero-copy messages for too long. - */ - if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean) - tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp, - tcp->tcp_xmit_head, B_TRUE); - - /* - * For zero window probe, we need to send indefinitely, - * unless we have not heard from the other side for some - * time... - */ - if ((tcp->tcp_zero_win_probe == 0) || - (TICK_TO_MSEC(ddi_get_lbolt() - tcp->tcp_last_recv_time) > - second_threshold)) { - BUMP_MIB(&tcps->tcps_mib, tcpTimRetransDrop); - /* - * If TCP is in SYN_RCVD state, send back a - * RST|ACK as BSD does. Note that tcp_zero_win_probe - * should be zero in TCPS_SYN_RCVD state. - */ - if (tcp->tcp_state == TCPS_SYN_RCVD) { - tcp_xmit_ctl("tcp_timer: RST sent on timeout " - "in SYN_RCVD", - tcp, tcp->tcp_snxt, - tcp->tcp_rnxt, TH_RST | TH_ACK); - } - (void) tcp_clean_death(tcp, - tcp->tcp_client_errno ? 
- tcp->tcp_client_errno : ETIMEDOUT, 25); - return; - } else { - /* - * If the system is under memory pressure, we also - * abort connection in zero window probing. - */ - if (tcps->tcps_reclaim) { - (void) tcp_clean_death(tcp, - tcp->tcp_client_errno ? - tcp->tcp_client_errno : ETIMEDOUT, 25); - return; - } - /* - * Set tcp_ms_we_have_waited to second_threshold - * so that in next timeout, we will do the above - * check (ddi_get_lbolt() - tcp_last_recv_time). - * This is also to avoid overflow. - * - * We don't need to decrement tcp_timer_backoff - * to avoid overflow because it will be decremented - * later if new timeout value is greater than - * tcp_rexmit_interval_max. In the case when - * tcp_rexmit_interval_max is greater than - * second_threshold, it means that we will wait - * longer than second_threshold to send the next - * window probe. - */ - tcp->tcp_ms_we_have_waited = second_threshold; - } - } else if (ms > first_threshold) { - /* - * Should not hold the zero-copy messages for too long. - */ - if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean) - tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp, - tcp->tcp_xmit_head, B_TRUE); - - /* - * We have been retransmitting for too long... The RTT - * we calculated is probably incorrect. Reinitialize it. - * Need to compensate for 0 tcp_rtt_sa. Reset - * tcp_rtt_update so that we won't accidentally cache a - * bad value. But only do this if this is not a zero - * window probe. - */ - if (tcp->tcp_rtt_sa != 0 && tcp->tcp_zero_win_probe == 0) { - tcp->tcp_rtt_sd += (tcp->tcp_rtt_sa >> 3) + - (tcp->tcp_rtt_sa >> 5); - tcp->tcp_rtt_sa = 0; - tcp_ip_notify(tcp); - tcp->tcp_rtt_update = 0; - } - } - tcp->tcp_timer_backoff++; - if ((ms = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + - tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5)) < - tcps->tcps_rexmit_interval_min) { - /* - * This means the original RTO is tcp_rexmit_interval_min. - * So we will use tcp_rexmit_interval_min as the RTO value - * and do the backoff. - */ - ms = tcps->tcps_rexmit_interval_min << tcp->tcp_timer_backoff; - } else { - ms <<= tcp->tcp_timer_backoff; - } - if (ms > tcps->tcps_rexmit_interval_max) { - ms = tcps->tcps_rexmit_interval_max; - /* - * ms is at max, decrement tcp_timer_backoff to avoid - * overflow. - */ - tcp->tcp_timer_backoff--; - } - tcp->tcp_ms_we_have_waited += ms; - if (tcp->tcp_zero_win_probe == 0) { - tcp->tcp_rto = ms; - } - TCP_TIMER_RESTART(tcp, ms); - /* - * This is after a timeout and tcp_rto is backed off. Set - * tcp_set_timer to 1 so that next time RTO is updated, we will - * restart the timer with a correct value. - */ - tcp->tcp_set_timer = 1; - mss = tcp->tcp_snxt - tcp->tcp_suna; - if (mss > tcp->tcp_mss) - mss = tcp->tcp_mss; - if (mss > tcp->tcp_swnd && tcp->tcp_swnd != 0) - mss = tcp->tcp_swnd; - - if ((mp = tcp->tcp_xmit_head) != NULL) - mp->b_prev = (mblk_t *)ddi_get_lbolt(); - mp = tcp_xmit_mp(tcp, mp, mss, NULL, NULL, tcp->tcp_suna, B_TRUE, &mss, - B_TRUE); - - /* - * When slow start after retransmission begins, start with - * this seq no. tcp_rexmit_max marks the end of special slow - * start phase. tcp_snd_burst controls how many segments - * can be sent because of an ack. 
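The backoff computation above rebuilds the RTO from the scaled smoothed-RTT state, doubles it for each consecutive timeout, and clamps it to the configured bounds, decrementing the backoff counter at the ceiling so later shifts cannot overflow. A hedged sketch of that calculation (the shifts mirror the removed code; names are illustrative):

#include <stdint.h>

/*
 * Exponential retransmission backoff as described above.  rtt_sa and
 * rtt_sd are the scaled smoothed-RTT and deviation values kept by the
 * stack; extra is the per-stack additive slack.
 */
static int64_t
next_rto(int64_t rtt_sa, int64_t rtt_sd, int64_t extra,
    int64_t rto_min, int64_t rto_max, int *backoff)
{
	int64_t ms = (rtt_sa >> 3) + rtt_sd + extra + (rtt_sa >> 5);

	(*backoff)++;
	if (ms < rto_min)
		ms = rto_min << *backoff;	/* back off from the floor */
	else
		ms <<= *backoff;
	if (ms > rto_max) {
		ms = rto_max;
		(*backoff)--;	/* stop growing the shift once clamped */
	}
	return (ms);
}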
- */ - tcp->tcp_rexmit_nxt = tcp->tcp_suna; - tcp->tcp_snd_burst = TCP_CWND_SS; - if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && - (tcp->tcp_unsent == 0)) { - tcp->tcp_rexmit_max = tcp->tcp_fss; - } else { - tcp->tcp_rexmit_max = tcp->tcp_snxt; - } - tcp->tcp_rexmit = B_TRUE; - tcp->tcp_dupack_cnt = 0; - - /* - * Remove all rexmit SACK blk to start from fresh. - */ - if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) - TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list, tcp); - if (mp == NULL) { - return; - } - - tcp->tcp_csuna = tcp->tcp_snxt; - BUMP_MIB(&tcps->tcps_mib, tcpRetransSegs); - UPDATE_MIB(&tcps->tcps_mib, tcpRetransBytes, mss); - tcp_send_data(tcp, mp); - -} - -static int tcp_do_unbind(conn_t *connp) { tcp_t *tcp = connp->conn_tcp; @@ -14053,492 +3380,6 @@ tcp_do_unbind(conn_t *connp) return (0); } -/* tcp_unbind is called by tcp_wput_proto to handle T_UNBIND_REQ messages. */ -static void -tcp_tpi_unbind(tcp_t *tcp, mblk_t *mp) -{ - conn_t *connp = tcp->tcp_connp; - int error; - - error = tcp_do_unbind(connp); - if (error > 0) { - tcp_err_ack(tcp, mp, TSYSERR, error); - } else if (error < 0) { - tcp_err_ack(tcp, mp, -error, 0); - } else { - /* Send M_FLUSH according to TPI */ - (void) putnextctl1(connp->conn_rq, M_FLUSH, FLUSHRW); - - mp = mi_tpi_ok_ack_alloc(mp); - if (mp != NULL) - putnext(connp->conn_rq, mp); - } -} - -/* - * Don't let port fall into the privileged range. - * Since the extra privileged ports can be arbitrary we also - * ensure that we exclude those from consideration. - * tcp_g_epriv_ports is not sorted thus we loop over it until - * there are no changes. - * - * Note: No locks are held when inspecting tcp_g_*epriv_ports - * but instead the code relies on: - * - the fact that the address of the array and its size never changes - * - the atomic assignment of the elements of the array - * - * Returns 0 if there are no more ports available. - * - * TS note: skip multilevel ports. - */ -static in_port_t -tcp_update_next_port(in_port_t port, const tcp_t *tcp, boolean_t random) -{ - int i; - boolean_t restart = B_FALSE; - tcp_stack_t *tcps = tcp->tcp_tcps; - - if (random && tcp_random_anon_port != 0) { - (void) random_get_pseudo_bytes((uint8_t *)&port, - sizeof (in_port_t)); - /* - * Unless changed by a sys admin, the smallest anon port - * is 32768 and the largest anon port is 65535. It is - * very likely (50%) for the random port to be smaller - * than the smallest anon port. When that happens, - * add port % (anon port range) to the smallest anon - * port to get the random port. It should fall into the - * valid anon port range. - */ - if (port < tcps->tcps_smallest_anon_port) { - port = tcps->tcps_smallest_anon_port + - port % (tcps->tcps_largest_anon_port - - tcps->tcps_smallest_anon_port); - } - } - -retry: - if (port < tcps->tcps_smallest_anon_port) - port = (in_port_t)tcps->tcps_smallest_anon_port; - - if (port > tcps->tcps_largest_anon_port) { - if (restart) - return (0); - restart = B_TRUE; - port = (in_port_t)tcps->tcps_smallest_anon_port; - } - - if (port < tcps->tcps_smallest_nonpriv_port) - port = (in_port_t)tcps->tcps_smallest_nonpriv_port; - - for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) { - if (port == tcps->tcps_g_epriv_ports[i]) { - port++; - /* - * Make sure whether the port is in the - * valid range. 
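The anonymous-port logic above folds a random 16-bit value into the configured range before walking past extra-privileged ports. A small sketch of just the folding step, assuming the range bounds are passed in:

#include <stdint.h>

/*
 * Fold a random 16-bit value into [smallest, largest].  Roughly half of
 * all random values fall below the usual smallest anonymous port, so
 * those are mapped into the range with a modulo, as in the removed
 * tcp_update_next_port().
 */
static uint16_t
map_anon_port(uint16_t random_val, uint16_t smallest, uint16_t largest)
{
	uint16_t port = random_val;

	if (port < smallest)
		port = smallest + port % (largest - smallest);
	return (port);
}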
- */ - goto retry; - } - } - if (is_system_labeled() && - (i = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred), port, - IPPROTO_TCP, B_TRUE)) != 0) { - port = i; - goto retry; - } - return (port); -} - -/* - * Return the next anonymous port in the privileged port range for - * bind checking. It starts at IPPORT_RESERVED - 1 and goes - * downwards. This is the same behavior as documented in the userland - * library call rresvport(3N). - * - * TS note: skip multilevel ports. - */ -static in_port_t -tcp_get_next_priv_port(const tcp_t *tcp) -{ - static in_port_t next_priv_port = IPPORT_RESERVED - 1; - in_port_t nextport; - boolean_t restart = B_FALSE; - tcp_stack_t *tcps = tcp->tcp_tcps; -retry: - if (next_priv_port < tcps->tcps_min_anonpriv_port || - next_priv_port >= IPPORT_RESERVED) { - next_priv_port = IPPORT_RESERVED - 1; - if (restart) - return (0); - restart = B_TRUE; - } - if (is_system_labeled() && - (nextport = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred), - next_priv_port, IPPROTO_TCP, B_FALSE)) != 0) { - next_priv_port = nextport; - goto retry; - } - return (next_priv_port--); -} - -/* The write side r/w procedure. */ - -#if CCS_STATS -struct { - struct { - int64_t count, bytes; - } tot, hit; -} wrw_stats; -#endif - -/* - * Call by tcp_wput() to handle all non data, except M_PROTO and M_PCPROTO, - * messages. - */ -/* ARGSUSED */ -static void -tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) -{ - conn_t *connp = (conn_t *)arg; - tcp_t *tcp = connp->conn_tcp; - - ASSERT(DB_TYPE(mp) != M_IOCTL); - /* - * TCP is D_MP and qprocsoff() is done towards the end of the tcp_close. - * Once the close starts, streamhead and sockfs will not let any data - * packets come down (close ensures that there are no threads using the - * queue and no new threads will come down) but since qprocsoff() - * hasn't happened yet, a M_FLUSH or some non data message might - * get reflected back (in response to our own FLUSHRW) and get - * processed after tcp_close() is done. The conn would still be valid - * because a ref would have added but we need to check the state - * before actually processing the packet. - */ - if (TCP_IS_DETACHED(tcp) || (tcp->tcp_state == TCPS_CLOSED)) { - freemsg(mp); - return; - } - - switch (DB_TYPE(mp)) { - case M_IOCDATA: - tcp_wput_iocdata(tcp, mp); - break; - case M_FLUSH: - tcp_wput_flush(tcp, mp); - break; - default: - ip_wput_nondata(connp->conn_wq, mp); - break; - } -} - -/* - * The TCP fast path write put procedure. - * NOTE: the logic of the fast path is duplicated from tcp_wput_data() - */ -/* ARGSUSED */ -void -tcp_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) -{ - int len; - int hdrlen; - int plen; - mblk_t *mp1; - uchar_t *rptr; - uint32_t snxt; - tcpha_t *tcpha; - struct datab *db; - uint32_t suna; - uint32_t mss; - ipaddr_t *dst; - ipaddr_t *src; - uint32_t sum; - int usable; - conn_t *connp = (conn_t *)arg; - tcp_t *tcp = connp->conn_tcp; - uint32_t msize; - tcp_stack_t *tcps = tcp->tcp_tcps; - ip_xmit_attr_t *ixa; - clock_t now; - - /* - * Try and ASSERT the minimum possible references on the - * conn early enough. Since we are executing on write side, - * the connection is obviously not detached and that means - * there is a ref each for TCP and IP. Since we are behind - * the squeue, the minimum references needed are 3. If the - * conn is in classifier hash list, there should be an - * extra ref for that (we check both the possibilities). 
- */ - ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) || - (connp->conn_fanout == NULL && connp->conn_ref >= 3)); - - ASSERT(DB_TYPE(mp) == M_DATA); - msize = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp); - - mutex_enter(&tcp->tcp_non_sq_lock); - tcp->tcp_squeue_bytes -= msize; - mutex_exit(&tcp->tcp_non_sq_lock); - - /* Bypass tcp protocol for fused tcp loopback */ - if (tcp->tcp_fused && tcp_fuse_output(tcp, mp, msize)) - return; - - mss = tcp->tcp_mss; - /* - * If ZEROCOPY has turned off, try not to send any zero-copy message - * down. Do backoff, now. - */ - if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_snd_zcopy_on) - mp = tcp_zcopy_backoff(tcp, mp, B_FALSE); - - - ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); - len = (int)(mp->b_wptr - mp->b_rptr); - - /* - * Criteria for fast path: - * - * 1. no unsent data - * 2. single mblk in request - * 3. connection established - * 4. data in mblk - * 5. len <= mss - * 6. no tcp_valid bits - */ - if ((tcp->tcp_unsent != 0) || - (tcp->tcp_cork) || - (mp->b_cont != NULL) || - (tcp->tcp_state != TCPS_ESTABLISHED) || - (len == 0) || - (len > mss) || - (tcp->tcp_valid_bits != 0)) { - tcp_wput_data(tcp, mp, B_FALSE); - return; - } - - ASSERT(tcp->tcp_xmit_tail_unsent == 0); - ASSERT(tcp->tcp_fin_sent == 0); - - /* queue new packet onto retransmission queue */ - if (tcp->tcp_xmit_head == NULL) { - tcp->tcp_xmit_head = mp; - } else { - tcp->tcp_xmit_last->b_cont = mp; - } - tcp->tcp_xmit_last = mp; - tcp->tcp_xmit_tail = mp; - - /* find out how much we can send */ - /* BEGIN CSTYLED */ - /* - * un-acked usable - * |--------------|-----------------| - * tcp_suna tcp_snxt tcp_suna+tcp_swnd - */ - /* END CSTYLED */ - - /* start sending from tcp_snxt */ - snxt = tcp->tcp_snxt; - - /* - * Check to see if this connection has been idled for some - * time and no ACK is expected. If it is, we need to slow - * start again to get back the connection's "self-clock" as - * described in VJ's paper. - * - * Reinitialize tcp_cwnd after idle. - */ - now = LBOLT_FASTPATH; - if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet && - (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) { - SET_TCP_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle); - } - - usable = tcp->tcp_swnd; /* tcp window size */ - if (usable > tcp->tcp_cwnd) - usable = tcp->tcp_cwnd; /* congestion window smaller */ - usable -= snxt; /* subtract stuff already sent */ - suna = tcp->tcp_suna; - usable += suna; - /* usable can be < 0 if the congestion window is smaller */ - if (len > usable) { - /* Can't send complete M_DATA in one shot */ - goto slow; - } - - mutex_enter(&tcp->tcp_non_sq_lock); - if (tcp->tcp_flow_stopped && - TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) { - tcp_clrqfull(tcp); - } - mutex_exit(&tcp->tcp_non_sq_lock); - - /* - * determine if anything to send (Nagle). - * - * 1. len < tcp_mss (i.e. small) - * 2. unacknowledged data present - * 3. len < nagle limit - * 4. last packet sent < nagle limit (previous packet sent) - */ - if ((len < mss) && (snxt != suna) && - (len < (int)tcp->tcp_naglim) && - (tcp->tcp_last_sent_len < tcp->tcp_naglim)) { - /* - * This was the first unsent packet and normally - * mss < xmit_hiwater so there is no need to worry - * about flow control. The next packet will go - * through the flow control check in tcp_wput_data(). 
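The fast-path and Nagle logic above can be read as two small predicates: how much window is usable (the smaller of the send and congestion windows, minus data in flight), and whether a sub-MSS write should be deferred while earlier data is unacknowledged and both this write and the previous one are below the Nagle limit. A sketch under those assumptions, with hypothetical names:

#include <stdbool.h>
#include <stdint.h>

/* Bytes we may still send: min(send window, cwnd) minus data in flight. */
static int64_t
usable_window(uint32_t swnd, uint32_t cwnd, uint32_t snxt, uint32_t suna)
{
	uint32_t win = (swnd < cwnd) ? swnd : cwnd;

	return ((int64_t)win - (int64_t)(snxt - suna));	/* may be negative */
}

/*
 * Nagle deferral: hold back a sub-MSS write while earlier data is
 * unacknowledged and both this write and the previous one are below
 * the Nagle limit.
 */
static bool
nagle_defer(uint32_t len, uint32_t mss, uint32_t snxt, uint32_t suna,
    uint32_t naglim, uint32_t last_sent_len)
{
	return (len < mss && snxt != suna && len < naglim &&
	    last_sent_len < naglim);
}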
- */ - /* leftover work from above */ - tcp->tcp_unsent = len; - tcp->tcp_xmit_tail_unsent = len; - - return; - } - - /* len <= tcp->tcp_mss && len == unsent so no silly window */ - - if (snxt == suna) { - TCP_TIMER_RESTART(tcp, tcp->tcp_rto); - } - - /* we have always sent something */ - tcp->tcp_rack_cnt = 0; - - tcp->tcp_snxt = snxt + len; - tcp->tcp_rack = tcp->tcp_rnxt; - - if ((mp1 = dupb(mp)) == 0) - goto no_memory; - mp->b_prev = (mblk_t *)(uintptr_t)now; - mp->b_next = (mblk_t *)(uintptr_t)snxt; - - /* adjust tcp header information */ - tcpha = tcp->tcp_tcpha; - tcpha->tha_flags = (TH_ACK|TH_PUSH); - - sum = len + connp->conn_ht_ulp_len + connp->conn_sum; - sum = (sum >> 16) + (sum & 0xFFFF); - tcpha->tha_sum = htons(sum); - - tcpha->tha_seq = htonl(snxt); - - BUMP_MIB(&tcps->tcps_mib, tcpOutDataSegs); - UPDATE_MIB(&tcps->tcps_mib, tcpOutDataBytes, len); - BUMP_LOCAL(tcp->tcp_obsegs); - - /* Update the latest receive window size in TCP header. */ - tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); - - tcp->tcp_last_sent_len = (ushort_t)len; - - plen = len + connp->conn_ht_iphc_len; - - ixa = connp->conn_ixa; - ixa->ixa_pktlen = plen; - - if (ixa->ixa_flags & IXAF_IS_IPV4) { - tcp->tcp_ipha->ipha_length = htons(plen); - } else { - tcp->tcp_ip6h->ip6_plen = htons(plen - IPV6_HDR_LEN); - } - - /* see if we need to allocate a mblk for the headers */ - hdrlen = connp->conn_ht_iphc_len; - rptr = mp1->b_rptr - hdrlen; - db = mp1->b_datap; - if ((db->db_ref != 2) || rptr < db->db_base || - (!OK_32PTR(rptr))) { - /* NOTE: we assume allocb returns an OK_32PTR */ - mp = allocb(hdrlen + tcps->tcps_wroff_xtra, BPRI_MED); - if (!mp) { - freemsg(mp1); - goto no_memory; - } - mp->b_cont = mp1; - mp1 = mp; - /* Leave room for Link Level header */ - rptr = &mp1->b_rptr[tcps->tcps_wroff_xtra]; - mp1->b_wptr = &rptr[hdrlen]; - } - mp1->b_rptr = rptr; - - /* Fill in the timestamp option. */ - if (tcp->tcp_snd_ts_ok) { - uint32_t llbolt = (uint32_t)LBOLT_FASTPATH; - - U32_TO_BE32(llbolt, - (char *)tcpha + TCP_MIN_HEADER_LENGTH+4); - U32_TO_BE32(tcp->tcp_ts_recent, - (char *)tcpha + TCP_MIN_HEADER_LENGTH+8); - } else { - ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH); - } - - /* copy header into outgoing packet */ - dst = (ipaddr_t *)rptr; - src = (ipaddr_t *)connp->conn_ht_iphc; - dst[0] = src[0]; - dst[1] = src[1]; - dst[2] = src[2]; - dst[3] = src[3]; - dst[4] = src[4]; - dst[5] = src[5]; - dst[6] = src[6]; - dst[7] = src[7]; - dst[8] = src[8]; - dst[9] = src[9]; - if (hdrlen -= 40) { - hdrlen >>= 2; - dst += 10; - src += 10; - do { - *dst++ = *src++; - } while (--hdrlen); - } - - /* - * Set the ECN info in the TCP header. Note that this - * is not the template header. - */ - if (tcp->tcp_ecn_ok) { - SET_ECT(tcp, rptr); - - tcpha = (tcpha_t *)(rptr + ixa->ixa_ip_hdr_length); - if (tcp->tcp_ecn_echo_on) - tcpha->tha_flags |= TH_ECE; - if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) { - tcpha->tha_flags |= TH_CWR; - tcp->tcp_ecn_cwr_sent = B_TRUE; - } - } - - if (tcp->tcp_ip_forward_progress) { - tcp->tcp_ip_forward_progress = B_FALSE; - connp->conn_ixa->ixa_flags |= IXAF_REACH_CONF; - } else { - connp->conn_ixa->ixa_flags &= ~IXAF_REACH_CONF; - } - tcp_send_data(tcp, mp1); - return; - - /* - * If we ran out of memory, we pretend to have sent the packet - * and that it was lost on the wire. 
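The fast path above stores only a partial checksum in the header: the payload length plus the cached pseudo-header sum, folded once with (sum >> 16) + (sum & 0xFFFF); the remaining coverage is presumably completed further down the transmit path or by offload. For reference, a self-contained Internet checksum (RFC 1071) using the same carry fold, which is illustrative and not the in-kernel routine:

#include <stddef.h>
#include <stdint.h>

/*
 * One's-complement Internet checksum (RFC 1071) over a byte buffer,
 * using the same (sum >> 16) + (sum & 0xFFFF) carry fold as the
 * removed fast-path code.  Byte-order handling here is simplified
 * relative to the in-kernel routines.
 */
static uint16_t
inet_cksum(const void *buf, size_t len, uint32_t partial)
{
	const uint8_t *p = buf;
	uint32_t sum = partial;

	while (len > 1) {
		sum += (uint32_t)p[0] << 8 | p[1];
		p += 2;
		len -= 2;
	}
	if (len == 1)
		sum += (uint32_t)p[0] << 8;	/* pad the odd final byte */
	while (sum >> 16)
		sum = (sum >> 16) + (sum & 0xFFFF);
	return ((uint16_t)~sum);
}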
- */ -no_memory: - return; - -slow: - /* leftover work from above */ - tcp->tcp_unsent = len; - tcp->tcp_xmit_tail_unsent = len; - tcp_wput_data(tcp, NULL, B_FALSE); -} - /* * This runs at the tail end of accept processing on the squeue of the * new connection. @@ -14833,7 +3674,7 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) if (connp->conn_keepalive) { tcp->tcp_ka_last_intrvl = 0; - tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_killer, + tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer, MSEC_TO_TICK(tcp->tcp_ka_interval)); } @@ -14852,40 +3693,10 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) } /* - * The function called through squeue to get behind listener's perimeter to - * send a deferred conn_ind. - */ -/* ARGSUSED */ -void -tcp_send_pending(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) -{ - conn_t *lconnp = (conn_t *)arg; - tcp_t *listener = lconnp->conn_tcp; - struct T_conn_ind *conn_ind; - tcp_t *tcp; - - conn_ind = (struct T_conn_ind *)mp->b_rptr; - bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp, - conn_ind->OPT_length); - - if (listener->tcp_state != TCPS_LISTEN) { - /* - * If listener has closed, it would have caused a - * a cleanup/blowoff to happen for the eager, so - * we don't need to do anything more. - */ - freemsg(mp); - return; - } - - tcp_ulp_newconn(lconnp, tcp->tcp_connp, mp); -} - -/* * Common to TPI and sockfs accept code. */ /* ARGSUSED2 */ -static int +int tcp_accept_common(conn_t *lconnp, conn_t *econnp, cred_t *cr) { tcp_t *listener, *eager; @@ -15010,380 +3821,10 @@ no_more_eagers: return (0); } -int -tcp_accept(sock_lower_handle_t lproto_handle, - sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle, - cred_t *cr) -{ - conn_t *lconnp, *econnp; - tcp_t *listener, *eager; - - lconnp = (conn_t *)lproto_handle; - listener = lconnp->conn_tcp; - ASSERT(listener->tcp_state == TCPS_LISTEN); - econnp = (conn_t *)eproto_handle; - eager = econnp->conn_tcp; - ASSERT(eager->tcp_listener != NULL); - - /* - * It is OK to manipulate these fields outside the eager's squeue - * because they will not start being used until tcp_accept_finish - * has been called. - */ - ASSERT(lconnp->conn_upper_handle != NULL); - ASSERT(econnp->conn_upper_handle == NULL); - econnp->conn_upper_handle = sock_handle; - econnp->conn_upcalls = lconnp->conn_upcalls; - ASSERT(IPCL_IS_NONSTR(econnp)); - return (tcp_accept_common(lconnp, econnp, cr)); -} - - -/* - * This is the STREAMS entry point for T_CONN_RES coming down on - * Acceptor STREAM when sockfs listener does accept processing. - * Read the block comment on top of tcp_input_listener(). - */ -void -tcp_tpi_accept(queue_t *q, mblk_t *mp) -{ - queue_t *rq = RD(q); - struct T_conn_res *conn_res; - tcp_t *eager; - tcp_t *listener; - struct T_ok_ack *ok; - t_scalar_t PRIM_type; - conn_t *econnp; - cred_t *cr; - - ASSERT(DB_TYPE(mp) == M_PROTO); - - /* - * All Solaris components should pass a db_credp - * for this TPI message, hence we ASSERT. - * But in case there is some other M_PROTO that looks - * like a TPI message sent by some other kernel - * component, we check and return an error. 
- */ - cr = msg_getcred(mp, NULL); - ASSERT(cr != NULL); - if (cr == NULL) { - mp = mi_tpi_err_ack_alloc(mp, TSYSERR, EINVAL); - if (mp != NULL) - putnext(rq, mp); - return; - } - conn_res = (struct T_conn_res *)mp->b_rptr; - ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); - if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_res)) { - mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0); - if (mp != NULL) - putnext(rq, mp); - return; - } - switch (conn_res->PRIM_type) { - case O_T_CONN_RES: - case T_CONN_RES: - /* - * We pass up an err ack if allocb fails. This will - * cause sockfs to issue a T_DISCON_REQ which will cause - * tcp_eager_blowoff to be called. sockfs will then call - * rq->q_qinfo->qi_qclose to cleanup the acceptor stream. - * we need to do the allocb up here because we have to - * make sure rq->q_qinfo->qi_qclose still points to the - * correct function (tcp_tpi_close_accept) in case allocb - * fails. - */ - bcopy(mp->b_rptr + conn_res->OPT_offset, - &eager, conn_res->OPT_length); - PRIM_type = conn_res->PRIM_type; - mp->b_datap->db_type = M_PCPROTO; - mp->b_wptr = mp->b_rptr + sizeof (struct T_ok_ack); - ok = (struct T_ok_ack *)mp->b_rptr; - ok->PRIM_type = T_OK_ACK; - ok->CORRECT_prim = PRIM_type; - econnp = eager->tcp_connp; - econnp->conn_dev = (dev_t)RD(q)->q_ptr; - econnp->conn_minor_arena = (vmem_t *)(WR(q)->q_ptr); - econnp->conn_rq = rq; - econnp->conn_wq = q; - rq->q_ptr = econnp; - rq->q_qinfo = &tcp_rinitv4; /* No open - same as rinitv6 */ - q->q_ptr = econnp; - q->q_qinfo = &tcp_winit; - listener = eager->tcp_listener; - - if (tcp_accept_common(listener->tcp_connp, - econnp, cr) < 0) { - mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0); - if (mp != NULL) - putnext(rq, mp); - return; - } - - /* - * Send the new local address also up to sockfs. There - * should already be enough space in the mp that came - * down from soaccept(). - */ - if (econnp->conn_family == AF_INET) { - sin_t *sin; - - ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >= - (sizeof (struct T_ok_ack) + sizeof (sin_t))); - sin = (sin_t *)mp->b_wptr; - mp->b_wptr += sizeof (sin_t); - sin->sin_family = AF_INET; - sin->sin_port = econnp->conn_lport; - sin->sin_addr.s_addr = econnp->conn_laddr_v4; - } else { - sin6_t *sin6; - - ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >= - sizeof (struct T_ok_ack) + sizeof (sin6_t)); - sin6 = (sin6_t *)mp->b_wptr; - mp->b_wptr += sizeof (sin6_t); - sin6->sin6_family = AF_INET6; - sin6->sin6_port = econnp->conn_lport; - sin6->sin6_addr = econnp->conn_laddr_v6; - if (econnp->conn_ipversion == IPV4_VERSION) - sin6->sin6_flowinfo = 0; - else - sin6->sin6_flowinfo = econnp->conn_flowinfo; - if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) && - (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) { - sin6->sin6_scope_id = - econnp->conn_ixa->ixa_scopeid; - } else { - sin6->sin6_scope_id = 0; - } - sin6->__sin6_src_id = 0; - } - - putnext(rq, mp); - return; - default: - mp = mi_tpi_err_ack_alloc(mp, TNOTSUPPORT, 0); - if (mp != NULL) - putnext(rq, mp); - return; - } -} - -/* - * Handle special out-of-band ioctl requests (see PSARC/2008/265). 
- */ -static void -tcp_wput_cmdblk(queue_t *q, mblk_t *mp) -{ - void *data; - mblk_t *datamp = mp->b_cont; - conn_t *connp = Q_TO_CONN(q); - tcp_t *tcp = connp->conn_tcp; - cmdblk_t *cmdp = (cmdblk_t *)mp->b_rptr; - - if (datamp == NULL || MBLKL(datamp) < cmdp->cb_len) { - cmdp->cb_error = EPROTO; - qreply(q, mp); - return; - } - - data = datamp->b_rptr; - - switch (cmdp->cb_cmd) { - case TI_GETPEERNAME: - if (tcp->tcp_state < TCPS_SYN_RCVD) - cmdp->cb_error = ENOTCONN; - else - cmdp->cb_error = conn_getpeername(connp, data, - &cmdp->cb_len); - break; - case TI_GETMYNAME: - cmdp->cb_error = conn_getsockname(connp, data, &cmdp->cb_len); - break; - default: - cmdp->cb_error = EINVAL; - break; - } - - qreply(q, mp); -} - -void -tcp_wput(queue_t *q, mblk_t *mp) -{ - conn_t *connp = Q_TO_CONN(q); - tcp_t *tcp; - void (*output_proc)(); - t_scalar_t type; - uchar_t *rptr; - struct iocblk *iocp; - size_t size; - tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; - - ASSERT(connp->conn_ref >= 2); - - switch (DB_TYPE(mp)) { - case M_DATA: - tcp = connp->conn_tcp; - ASSERT(tcp != NULL); - - size = msgdsize(mp); - - mutex_enter(&tcp->tcp_non_sq_lock); - tcp->tcp_squeue_bytes += size; - if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) { - tcp_setqfull(tcp); - } - mutex_exit(&tcp->tcp_non_sq_lock); - - CONN_INC_REF(connp); - SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output, connp, - NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT); - return; - - case M_CMD: - tcp_wput_cmdblk(q, mp); - return; - - case M_PROTO: - case M_PCPROTO: - /* - * if it is a snmp message, don't get behind the squeue - */ - tcp = connp->conn_tcp; - rptr = mp->b_rptr; - if ((mp->b_wptr - rptr) >= sizeof (t_scalar_t)) { - type = ((union T_primitives *)rptr)->type; - } else { - if (connp->conn_debug) { - (void) strlog(TCP_MOD_ID, 0, 1, - SL_ERROR|SL_TRACE, - "tcp_wput_proto, dropping one..."); - } - freemsg(mp); - return; - } - if (type == T_SVR4_OPTMGMT_REQ) { - /* - * All Solaris components should pass a db_credp - * for this TPI message, hence we ASSERT. - * But in case there is some other M_PROTO that looks - * like a TPI message sent by some other kernel - * component, we check and return an error. - */ - cred_t *cr = msg_getcred(mp, NULL); - - ASSERT(cr != NULL); - if (cr == NULL) { - tcp_err_ack(tcp, mp, TSYSERR, EINVAL); - return; - } - if (snmpcom_req(q, mp, tcp_snmp_set, ip_snmp_get, - cr)) { - /* - * This was a SNMP request - */ - return; - } else { - output_proc = tcp_wput_proto; - } - } else { - output_proc = tcp_wput_proto; - } - break; - case M_IOCTL: - /* - * Most ioctls can be processed right away without going via - * squeues - process them right here. Those that do require - * squeue (currently _SIOCSOCKFALLBACK) - * are processed by tcp_wput_ioctl(). 
- */ - iocp = (struct iocblk *)mp->b_rptr; - tcp = connp->conn_tcp; - - switch (iocp->ioc_cmd) { - case TCP_IOC_ABORT_CONN: - tcp_ioctl_abort_conn(q, mp); - return; - case TI_GETPEERNAME: - case TI_GETMYNAME: - mi_copyin(q, mp, NULL, - SIZEOF_STRUCT(strbuf, iocp->ioc_flag)); - return; - case ND_SET: - /* nd_getset does the necessary checks */ - case ND_GET: - if (nd_getset(q, tcps->tcps_g_nd, mp)) { - qreply(q, mp); - return; - } - CONN_INC_IOCTLREF(connp); - ip_wput_nondata(q, mp); - CONN_DEC_IOCTLREF(connp); - return; - - default: - output_proc = tcp_wput_ioctl; - break; - } - break; - default: - output_proc = tcp_wput_nondata; - break; - } - - CONN_INC_REF(connp); - SQUEUE_ENTER_ONE(connp->conn_sqp, mp, output_proc, connp, - NULL, tcp_squeue_flag, SQTAG_TCP_WPUT_OTHER); -} - -/* - * Initial STREAMS write side put() procedure for sockets. It tries to - * handle the T_CAPABILITY_REQ which sockfs sends down while setting - * up the socket without using the squeue. Non T_CAPABILITY_REQ messages - * are handled by tcp_wput() as usual. - * - * All further messages will also be handled by tcp_wput() because we cannot - * be sure that the above short cut is safe later. - */ -static void -tcp_wput_sock(queue_t *wq, mblk_t *mp) -{ - conn_t *connp = Q_TO_CONN(wq); - tcp_t *tcp = connp->conn_tcp; - struct T_capability_req *car = (struct T_capability_req *)mp->b_rptr; - - ASSERT(wq->q_qinfo == &tcp_sock_winit); - wq->q_qinfo = &tcp_winit; - - ASSERT(IPCL_IS_TCP(connp)); - ASSERT(TCP_IS_SOCKET(tcp)); - - if (DB_TYPE(mp) == M_PCPROTO && - MBLKL(mp) == sizeof (struct T_capability_req) && - car->PRIM_type == T_CAPABILITY_REQ) { - tcp_capability_req(tcp, mp); - return; - } - - tcp_wput(wq, mp); -} - -/* ARGSUSED */ -static void -tcp_wput_fallback(queue_t *wq, mblk_t *mp) -{ -#ifdef DEBUG - cmn_err(CE_CONT, "tcp_wput_fallback: Message during fallback \n"); -#endif - freemsg(mp); -} - /* * Check the usability of ZEROCOPY. It's instead checking the flag set by IP. */ -static boolean_t +boolean_t tcp_zcopy_check(tcp_t *tcp) { conn_t *connp = tcp->tcp_connp; @@ -15424,7 +3865,7 @@ tcp_zcopy_check(tcp_t *tcp) * 2. tcp_output: fix_xmitlist is set to B_FALSE. Flag STRUIO_ZCNOTIFY need * to be copied to new message. */ -static mblk_t * +mblk_t * tcp_zcopy_backoff(tcp_t *tcp, mblk_t *bp, boolean_t fix_xmitlist) { mblk_t *nbp; @@ -15494,7 +3935,7 @@ tcp_zcopy_backoff(tcp_t *tcp, mblk_t *bp, boolean_t fix_xmitlist) return (head); } -static void +void tcp_zcopy_notify(tcp_t *tcp) { struct stdata *stp; @@ -15628,1445 +4069,6 @@ tcp_notify(void *arg, ip_xmit_attr_t *ixa, ixa_notify_type_t ntype, } } -static void -tcp_send_data(tcp_t *tcp, mblk_t *mp) -{ - conn_t *connp = tcp->tcp_connp; - - /* - * Check here to avoid sending zero-copy message down to IP when - * ZEROCOPY capability has turned off. We only need to deal with - * the race condition between sockfs and the notification here. - * Since we have tried to backoff the tcp_xmit_head when turning - * zero-copy off and new messages in tcp_output(), we simply drop - * the dup'ed packet here and let tcp retransmit, if tcp_xmit_zc_clean - * is not true. - */ - if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_snd_zcopy_on && - !tcp->tcp_xmit_zc_clean) { - ip_drop_output("TCP ZC was disabled but not clean", mp, NULL); - freemsg(mp); - return; - } - - ASSERT(connp->conn_ixa->ixa_notify_cookie == connp->conn_tcp); - (void) conn_ip_output(mp, connp->conn_ixa); -} - -/* - * This handles the case when the receiver has shrunk its win. 
Per RFC 1122 - * if the receiver shrinks the window, i.e. moves the right window to the - * left, the we should not send new data, but should retransmit normally the - * old unacked data between suna and suna + swnd. We might has sent data - * that is now outside the new window, pretend that we didn't send it. - */ -static void -tcp_process_shrunk_swnd(tcp_t *tcp, uint32_t shrunk_count) -{ - uint32_t snxt = tcp->tcp_snxt; - - ASSERT(shrunk_count > 0); - - if (!tcp->tcp_is_wnd_shrnk) { - tcp->tcp_snxt_shrunk = snxt; - tcp->tcp_is_wnd_shrnk = B_TRUE; - } else if (SEQ_GT(snxt, tcp->tcp_snxt_shrunk)) { - tcp->tcp_snxt_shrunk = snxt; - } - - /* Pretend we didn't send the data outside the window */ - snxt -= shrunk_count; - - /* Reset all the values per the now shrunk window */ - tcp_update_xmit_tail(tcp, snxt); - tcp->tcp_unsent += shrunk_count; - - /* - * If the SACK option is set, delete the entire list of - * notsack'ed blocks. - */ - if (tcp->tcp_sack_info != NULL) { - if (tcp->tcp_notsack_list != NULL) - TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list, tcp); - } - - if (tcp->tcp_suna == tcp->tcp_snxt && tcp->tcp_swnd == 0) - /* - * Make sure the timer is running so that we will probe a zero - * window. - */ - TCP_TIMER_RESTART(tcp, tcp->tcp_rto); -} - - -/* - * The TCP normal data output path. - * NOTE: the logic of the fast path is duplicated from this function. - */ -static void -tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent) -{ - int len; - mblk_t *local_time; - mblk_t *mp1; - uint32_t snxt; - int tail_unsent; - int tcpstate; - int usable = 0; - mblk_t *xmit_tail; - int32_t mss; - int32_t num_sack_blk = 0; - int32_t total_hdr_len; - int32_t tcp_hdr_len; - int rc; - tcp_stack_t *tcps = tcp->tcp_tcps; - conn_t *connp = tcp->tcp_connp; - clock_t now = LBOLT_FASTPATH; - - tcpstate = tcp->tcp_state; - if (mp == NULL) { - /* - * tcp_wput_data() with NULL mp should only be called when - * there is unsent data. - */ - ASSERT(tcp->tcp_unsent > 0); - /* Really tacky... but we need this for detached closes. */ - len = tcp->tcp_unsent; - goto data_null; - } - -#if CCS_STATS - wrw_stats.tot.count++; - wrw_stats.tot.bytes += msgdsize(mp); -#endif - ASSERT(mp->b_datap->db_type == M_DATA); - /* - * Don't allow data after T_ORDREL_REQ or T_DISCON_REQ, - * or before a connection attempt has begun. - */ - if (tcpstate < TCPS_SYN_SENT || tcpstate > TCPS_CLOSE_WAIT || - (tcp->tcp_valid_bits & TCP_FSS_VALID) != 0) { - if ((tcp->tcp_valid_bits & TCP_FSS_VALID) != 0) { -#ifdef DEBUG - cmn_err(CE_WARN, - "tcp_wput_data: data after ordrel, %s", - tcp_display(tcp, NULL, - DISP_ADDR_AND_PORT)); -#else - if (connp->conn_debug) { - (void) strlog(TCP_MOD_ID, 0, 1, - SL_TRACE|SL_ERROR, - "tcp_wput_data: data after ordrel, %s\n", - tcp_display(tcp, NULL, - DISP_ADDR_AND_PORT)); - } -#endif /* DEBUG */ - } - if (tcp->tcp_snd_zcopy_aware && - (mp->b_datap->db_struioflag & STRUIO_ZCNOTIFY)) - tcp_zcopy_notify(tcp); - freemsg(mp); - mutex_enter(&tcp->tcp_non_sq_lock); - if (tcp->tcp_flow_stopped && - TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) { - tcp_clrqfull(tcp); - } - mutex_exit(&tcp->tcp_non_sq_lock); - return; - } - - /* Strip empties */ - for (;;) { - ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= - (uintptr_t)INT_MAX); - len = (int)(mp->b_wptr - mp->b_rptr); - if (len > 0) - break; - mp1 = mp; - mp = mp->b_cont; - freeb(mp1); - if (!mp) { - return; - } - } - - /* If we are the first on the list ... 
*/ - if (tcp->tcp_xmit_head == NULL) { - tcp->tcp_xmit_head = mp; - tcp->tcp_xmit_tail = mp; - tcp->tcp_xmit_tail_unsent = len; - } else { - /* If tiny tx and room in txq tail, pullup to save mblks. */ - struct datab *dp; - - mp1 = tcp->tcp_xmit_last; - if (len < tcp_tx_pull_len && - (dp = mp1->b_datap)->db_ref == 1 && - dp->db_lim - mp1->b_wptr >= len) { - ASSERT(len > 0); - ASSERT(!mp1->b_cont); - if (len == 1) { - *mp1->b_wptr++ = *mp->b_rptr; - } else { - bcopy(mp->b_rptr, mp1->b_wptr, len); - mp1->b_wptr += len; - } - if (mp1 == tcp->tcp_xmit_tail) - tcp->tcp_xmit_tail_unsent += len; - mp1->b_cont = mp->b_cont; - if (tcp->tcp_snd_zcopy_aware && - (mp->b_datap->db_struioflag & STRUIO_ZCNOTIFY)) - mp1->b_datap->db_struioflag |= STRUIO_ZCNOTIFY; - freeb(mp); - mp = mp1; - } else { - tcp->tcp_xmit_last->b_cont = mp; - } - len += tcp->tcp_unsent; - } - - /* Tack on however many more positive length mblks we have */ - if ((mp1 = mp->b_cont) != NULL) { - do { - int tlen; - ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <= - (uintptr_t)INT_MAX); - tlen = (int)(mp1->b_wptr - mp1->b_rptr); - if (tlen <= 0) { - mp->b_cont = mp1->b_cont; - freeb(mp1); - } else { - len += tlen; - mp = mp1; - } - } while ((mp1 = mp->b_cont) != NULL); - } - tcp->tcp_xmit_last = mp; - tcp->tcp_unsent = len; - - if (urgent) - usable = 1; - -data_null: - snxt = tcp->tcp_snxt; - xmit_tail = tcp->tcp_xmit_tail; - tail_unsent = tcp->tcp_xmit_tail_unsent; - - /* - * Note that tcp_mss has been adjusted to take into account the - * timestamp option if applicable. Because SACK options do not - * appear in every TCP segments and they are of variable lengths, - * they cannot be included in tcp_mss. Thus we need to calculate - * the actual segment length when we need to send a segment which - * includes SACK options. - */ - if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { - int32_t opt_len; - - num_sack_blk = MIN(tcp->tcp_max_sack_blk, - tcp->tcp_num_sack_blk); - opt_len = num_sack_blk * sizeof (sack_blk_t) + TCPOPT_NOP_LEN * - 2 + TCPOPT_HEADER_LEN; - mss = tcp->tcp_mss - opt_len; - total_hdr_len = connp->conn_ht_iphc_len + opt_len; - tcp_hdr_len = connp->conn_ht_ulp_len + opt_len; - } else { - mss = tcp->tcp_mss; - total_hdr_len = connp->conn_ht_iphc_len; - tcp_hdr_len = connp->conn_ht_ulp_len; - } - - if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet && - (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) { - SET_TCP_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle); - } - if (tcpstate == TCPS_SYN_RCVD) { - /* - * The three-way connection establishment handshake is not - * complete yet. We want to queue the data for transmission - * after entering ESTABLISHED state (RFC793). A jump to - * "done" label effectively leaves data on the queue. - */ - goto done; - } else { - int usable_r; - - /* - * In the special case when cwnd is zero, which can only - * happen if the connection is ECN capable, return now. - * New segments is sent using tcp_timer(). The timer - * is set in tcp_input_data(). - */ - if (tcp->tcp_cwnd == 0) { - /* - * Note that tcp_cwnd is 0 before 3-way handshake is - * finished. - */ - ASSERT(tcp->tcp_ecn_ok || - tcp->tcp_state < TCPS_ESTABLISHED); - return; - } - - /* NOTE: trouble if xmitting while SYN not acked? */ - usable_r = snxt - tcp->tcp_suna; - usable_r = tcp->tcp_swnd - usable_r; - - /* - * Check if the receiver has shrunk the window. If - * tcp_wput_data() with NULL mp is called, tcp_fin_sent - * cannot be set as there is unsent data, so FIN cannot - * be sent out. 
Otherwise, we need to take into account - * of FIN as it consumes an "invisible" sequence number. - */ - ASSERT(tcp->tcp_fin_sent == 0); - if (usable_r < 0) { - /* - * The receiver has shrunk the window and we have sent - * -usable_r date beyond the window, re-adjust. - * - * If TCP window scaling is enabled, there can be - * round down error as the advertised receive window - * is actually right shifted n bits. This means that - * the lower n bits info is wiped out. It will look - * like the window is shrunk. Do a check here to - * see if the shrunk amount is actually within the - * error in window calculation. If it is, just - * return. Note that this check is inside the - * shrunk window check. This makes sure that even - * though tcp_process_shrunk_swnd() is not called, - * we will stop further processing. - */ - if ((-usable_r >> tcp->tcp_snd_ws) > 0) { - tcp_process_shrunk_swnd(tcp, -usable_r); - } - return; - } - - /* usable = MIN(swnd, cwnd) - unacked_bytes */ - if (tcp->tcp_swnd > tcp->tcp_cwnd) - usable_r -= tcp->tcp_swnd - tcp->tcp_cwnd; - - /* usable = MIN(usable, unsent) */ - if (usable_r > len) - usable_r = len; - - /* usable = MAX(usable, {1 for urgent, 0 for data}) */ - if (usable_r > 0) { - usable = usable_r; - } else { - /* Bypass all other unnecessary processing. */ - goto done; - } - } - - local_time = (mblk_t *)now; - - /* - * "Our" Nagle Algorithm. This is not the same as in the old - * BSD. This is more in line with the true intent of Nagle. - * - * The conditions are: - * 1. The amount of unsent data (or amount of data which can be - * sent, whichever is smaller) is less than Nagle limit. - * 2. The last sent size is also less than Nagle limit. - * 3. There is unack'ed data. - * 4. Urgent pointer is not set. Send urgent data ignoring the - * Nagle algorithm. This reduces the probability that urgent - * bytes get "merged" together. - * 5. The app has not closed the connection. This eliminates the - * wait time of the receiving side waiting for the last piece of - * (small) data. - * - * If all are satisified, exit without sending anything. Note - * that Nagle limit can be smaller than 1 MSS. Nagle limit is - * the smaller of 1 MSS and global tcp_naglim_def (default to be - * 4095). - */ - if (usable < (int)tcp->tcp_naglim && - tcp->tcp_naglim > tcp->tcp_last_sent_len && - snxt != tcp->tcp_suna && - !(tcp->tcp_valid_bits & TCP_URG_VALID) && - !(tcp->tcp_valid_bits & TCP_FSS_VALID)) { - goto done; - } - - /* - * If tcp_zero_win_probe is not set and the tcp->tcp_cork option - * is set, then we have to force TCP not to send partial segment - * (smaller than MSS bytes). We are calculating the usable now - * based on full mss and will save the rest of remaining data for - * later. When tcp_zero_win_probe is set, TCP needs to send out - * something to do zero window probe. - */ - if (tcp->tcp_cork && !tcp->tcp_zero_win_probe) { - if (usable < mss) - goto done; - usable = (usable / mss) * mss; - } - - /* Update the latest receive window size in TCP header. */ - tcp->tcp_tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); - - /* Send the packet. 
*/ - rc = tcp_send(tcp, mss, total_hdr_len, tcp_hdr_len, - num_sack_blk, &usable, &snxt, &tail_unsent, &xmit_tail, - local_time); - - /* Pretend that all we were trying to send really got sent */ - if (rc < 0 && tail_unsent < 0) { - do { - xmit_tail = xmit_tail->b_cont; - xmit_tail->b_prev = local_time; - ASSERT((uintptr_t)(xmit_tail->b_wptr - - xmit_tail->b_rptr) <= (uintptr_t)INT_MAX); - tail_unsent += (int)(xmit_tail->b_wptr - - xmit_tail->b_rptr); - } while (tail_unsent < 0); - } -done:; - tcp->tcp_xmit_tail = xmit_tail; - tcp->tcp_xmit_tail_unsent = tail_unsent; - len = tcp->tcp_snxt - snxt; - if (len) { - /* - * If new data was sent, need to update the notsack - * list, which is, afterall, data blocks that have - * not been sack'ed by the receiver. New data is - * not sack'ed. - */ - if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) { - /* len is a negative value. */ - tcp->tcp_pipe -= len; - tcp_notsack_update(&(tcp->tcp_notsack_list), - tcp->tcp_snxt, snxt, - &(tcp->tcp_num_notsack_blk), - &(tcp->tcp_cnt_notsack_list)); - } - tcp->tcp_snxt = snxt + tcp->tcp_fin_sent; - tcp->tcp_rack = tcp->tcp_rnxt; - tcp->tcp_rack_cnt = 0; - if ((snxt + len) == tcp->tcp_suna) { - TCP_TIMER_RESTART(tcp, tcp->tcp_rto); - } - } else if (snxt == tcp->tcp_suna && tcp->tcp_swnd == 0) { - /* - * Didn't send anything. Make sure the timer is running - * so that we will probe a zero window. - */ - TCP_TIMER_RESTART(tcp, tcp->tcp_rto); - } - /* Note that len is the amount we just sent but with a negative sign */ - tcp->tcp_unsent += len; - mutex_enter(&tcp->tcp_non_sq_lock); - if (tcp->tcp_flow_stopped) { - if (TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) { - tcp_clrqfull(tcp); - } - } else if (TCP_UNSENT_BYTES(tcp) >= connp->conn_sndbuf) { - if (!(tcp->tcp_detached)) - tcp_setqfull(tcp); - } - mutex_exit(&tcp->tcp_non_sq_lock); -} - -/* - * tcp_fill_header is called by tcp_send() to fill the outgoing TCP header - * with the template header, as well as other options such as time-stamp, - * ECN and/or SACK. - */ -static void -tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk) -{ - tcpha_t *tcp_tmpl, *tcpha; - uint32_t *dst, *src; - int hdrlen; - conn_t *connp = tcp->tcp_connp; - - ASSERT(OK_32PTR(rptr)); - - /* Template header */ - tcp_tmpl = tcp->tcp_tcpha; - - /* Header of outgoing packet */ - tcpha = (tcpha_t *)(rptr + connp->conn_ixa->ixa_ip_hdr_length); - - /* dst and src are opaque 32-bit fields, used for copying */ - dst = (uint32_t *)rptr; - src = (uint32_t *)connp->conn_ht_iphc; - hdrlen = connp->conn_ht_iphc_len; - - /* Fill time-stamp option if needed */ - if (tcp->tcp_snd_ts_ok) { - U32_TO_BE32((uint32_t)now, - (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 4); - U32_TO_BE32(tcp->tcp_ts_recent, - (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 8); - } else { - ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH); - } - - /* - * Copy the template header; is this really more efficient than - * calling bcopy()? For simple IPv4/TCP, it may be the case, - * but perhaps not for other scenarios. - */ - dst[0] = src[0]; - dst[1] = src[1]; - dst[2] = src[2]; - dst[3] = src[3]; - dst[4] = src[4]; - dst[5] = src[5]; - dst[6] = src[6]; - dst[7] = src[7]; - dst[8] = src[8]; - dst[9] = src[9]; - if (hdrlen -= 40) { - hdrlen >>= 2; - dst += 10; - src += 10; - do { - *dst++ = *src++; - } while (--hdrlen); - } - - /* - * Set the ECN info in the TCP header if it is not a zero - * window probe. Zero window probe is only sent in - * tcp_wput_data() and tcp_timer(). 
- */ - if (tcp->tcp_ecn_ok && !tcp->tcp_zero_win_probe) { - SET_ECT(tcp, rptr); - - if (tcp->tcp_ecn_echo_on) - tcpha->tha_flags |= TH_ECE; - if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) { - tcpha->tha_flags |= TH_CWR; - tcp->tcp_ecn_cwr_sent = B_TRUE; - } - } - - /* Fill in SACK options */ - if (num_sack_blk > 0) { - uchar_t *wptr = rptr + connp->conn_ht_iphc_len; - sack_blk_t *tmp; - int32_t i; - - wptr[0] = TCPOPT_NOP; - wptr[1] = TCPOPT_NOP; - wptr[2] = TCPOPT_SACK; - wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk * - sizeof (sack_blk_t); - wptr += TCPOPT_REAL_SACK_LEN; - - tmp = tcp->tcp_sack_list; - for (i = 0; i < num_sack_blk; i++) { - U32_TO_BE32(tmp[i].begin, wptr); - wptr += sizeof (tcp_seq); - U32_TO_BE32(tmp[i].end, wptr); - wptr += sizeof (tcp_seq); - } - tcpha->tha_offset_and_reserved += - ((num_sack_blk * 2 + 1) << 4); - } -} - -/* - * tcp_send() is called by tcp_wput_data() and returns one of the following: - * - * -1 = failed allocation. - * 0 = success; burst count reached, or usable send window is too small, - * and that we'd rather wait until later before sending again. - */ -static int -tcp_send(tcp_t *tcp, const int mss, const int total_hdr_len, - const int tcp_hdr_len, const int num_sack_blk, int *usable, - uint_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time) -{ - int num_burst_seg = tcp->tcp_snd_burst; - int num_lso_seg = 1; - uint_t lso_usable; - boolean_t do_lso_send = B_FALSE; - tcp_stack_t *tcps = tcp->tcp_tcps; - conn_t *connp = tcp->tcp_connp; - ip_xmit_attr_t *ixa = connp->conn_ixa; - - /* - * Check LSO possibility. The value of tcp->tcp_lso indicates whether - * the underlying connection is LSO capable. Will check whether having - * enough available data to initiate LSO transmission in the for(){} - * loops. - */ - if (tcp->tcp_lso && (tcp->tcp_valid_bits & ~TCP_FSS_VALID) == 0) - do_lso_send = B_TRUE; - - for (;;) { - struct datab *db; - tcpha_t *tcpha; - uint32_t sum; - mblk_t *mp, *mp1; - uchar_t *rptr; - int len; - - /* - * Burst count reached, return successfully. - */ - if (num_burst_seg == 0) - break; - - /* - * Calculate the maximum payload length we can send at one - * time. - */ - if (do_lso_send) { - /* - * Check whether be able to to do LSO for the current - * available data. - */ - if (num_burst_seg >= 2 && (*usable - 1) / mss >= 1) { - lso_usable = MIN(tcp->tcp_lso_max, *usable); - lso_usable = MIN(lso_usable, - num_burst_seg * mss); - - num_lso_seg = lso_usable / mss; - if (lso_usable % mss) { - num_lso_seg++; - tcp->tcp_last_sent_len = (ushort_t) - (lso_usable % mss); - } else { - tcp->tcp_last_sent_len = (ushort_t)mss; - } - } else { - do_lso_send = B_FALSE; - num_lso_seg = 1; - lso_usable = mss; - } - } - - ASSERT(num_lso_seg <= IP_MAXPACKET / mss + 1); -#ifdef DEBUG - DTRACE_PROBE2(tcp_send_lso, int, num_lso_seg, boolean_t, - do_lso_send); -#endif - /* - * Adjust num_burst_seg here. - */ - num_burst_seg -= num_lso_seg; - - len = mss; - if (len > *usable) { - ASSERT(do_lso_send == B_FALSE); - - len = *usable; - if (len <= 0) { - /* Terminate the loop */ - break; /* success; too small */ - } - /* - * Sender silly-window avoidance. - * Ignore this if we are going to send a - * zero window probe out. - * - * TODO: force data into microscopic window? - * ==> (!pushed || (unsent > usable)) - */ - if (len < (tcp->tcp_max_swnd >> 1) && - (tcp->tcp_unsent - (*snxt - tcp->tcp_snxt)) > len && - !((tcp->tcp_valid_bits & TCP_URG_VALID) && - len == 1) && (! 
tcp->tcp_zero_win_probe)) { - /* - * If the retransmit timer is not running - * we start it so that we will retransmit - * in the case when the receiver has - * decremented the window. - */ - if (*snxt == tcp->tcp_snxt && - *snxt == tcp->tcp_suna) { - /* - * We are not supposed to send - * anything. So let's wait a little - * bit longer before breaking SWS - * avoidance. - * - * What should the value be? - * Suggestion: MAX(init rexmit time, - * tcp->tcp_rto) - */ - TCP_TIMER_RESTART(tcp, tcp->tcp_rto); - } - break; /* success; too small */ - } - } - - tcpha = tcp->tcp_tcpha; - - /* - * The reason to adjust len here is that we need to set flags - * and calculate checksum. - */ - if (do_lso_send) - len = lso_usable; - - *usable -= len; /* Approximate - can be adjusted later */ - if (*usable > 0) - tcpha->tha_flags = TH_ACK; - else - tcpha->tha_flags = (TH_ACK | TH_PUSH); - - /* - * Prime pump for IP's checksumming on our behalf. - * Include the adjustment for a source route if any. - * In case of LSO, the partial pseudo-header checksum should - * exclusive TCP length, so zero tha_sum before IP calculate - * pseudo-header checksum for partial checksum offload. - */ - if (do_lso_send) { - sum = 0; - } else { - sum = len + tcp_hdr_len + connp->conn_sum; - sum = (sum >> 16) + (sum & 0xFFFF); - } - tcpha->tha_sum = htons(sum); - tcpha->tha_seq = htonl(*snxt); - - /* - * Branch off to tcp_xmit_mp() if any of the VALID bits is - * set. For the case when TCP_FSS_VALID is the only valid - * bit (normal active close), branch off only when we think - * that the FIN flag needs to be set. Note for this case, - * that (snxt + len) may not reflect the actual seg_len, - * as len may be further reduced in tcp_xmit_mp(). If len - * gets modified, we will end up here again. - */ - if (tcp->tcp_valid_bits != 0 && - (tcp->tcp_valid_bits != TCP_FSS_VALID || - ((*snxt + len) == tcp->tcp_fss))) { - uchar_t *prev_rptr; - uint32_t prev_snxt = tcp->tcp_snxt; - - if (*tail_unsent == 0) { - ASSERT((*xmit_tail)->b_cont != NULL); - *xmit_tail = (*xmit_tail)->b_cont; - prev_rptr = (*xmit_tail)->b_rptr; - *tail_unsent = (int)((*xmit_tail)->b_wptr - - (*xmit_tail)->b_rptr); - } else { - prev_rptr = (*xmit_tail)->b_rptr; - (*xmit_tail)->b_rptr = (*xmit_tail)->b_wptr - - *tail_unsent; - } - mp = tcp_xmit_mp(tcp, *xmit_tail, len, NULL, NULL, - *snxt, B_FALSE, (uint32_t *)&len, B_FALSE); - /* Restore tcp_snxt so we get amount sent right. */ - tcp->tcp_snxt = prev_snxt; - if (prev_rptr == (*xmit_tail)->b_rptr) { - /* - * If the previous timestamp is still in use, - * don't stomp on it. 
- */ - if ((*xmit_tail)->b_next == NULL) { - (*xmit_tail)->b_prev = local_time; - (*xmit_tail)->b_next = - (mblk_t *)(uintptr_t)(*snxt); - } - } else - (*xmit_tail)->b_rptr = prev_rptr; - - if (mp == NULL) { - return (-1); - } - mp1 = mp->b_cont; - - if (len <= mss) /* LSO is unusable (!do_lso_send) */ - tcp->tcp_last_sent_len = (ushort_t)len; - while (mp1->b_cont) { - *xmit_tail = (*xmit_tail)->b_cont; - (*xmit_tail)->b_prev = local_time; - (*xmit_tail)->b_next = - (mblk_t *)(uintptr_t)(*snxt); - mp1 = mp1->b_cont; - } - *snxt += len; - *tail_unsent = (*xmit_tail)->b_wptr - mp1->b_wptr; - BUMP_LOCAL(tcp->tcp_obsegs); - BUMP_MIB(&tcps->tcps_mib, tcpOutDataSegs); - UPDATE_MIB(&tcps->tcps_mib, tcpOutDataBytes, len); - tcp_send_data(tcp, mp); - continue; - } - - *snxt += len; /* Adjust later if we don't send all of len */ - BUMP_MIB(&tcps->tcps_mib, tcpOutDataSegs); - UPDATE_MIB(&tcps->tcps_mib, tcpOutDataBytes, len); - - if (*tail_unsent) { - /* Are the bytes above us in flight? */ - rptr = (*xmit_tail)->b_wptr - *tail_unsent; - if (rptr != (*xmit_tail)->b_rptr) { - *tail_unsent -= len; - if (len <= mss) /* LSO is unusable */ - tcp->tcp_last_sent_len = (ushort_t)len; - len += total_hdr_len; - ixa->ixa_pktlen = len; - - if (ixa->ixa_flags & IXAF_IS_IPV4) { - tcp->tcp_ipha->ipha_length = htons(len); - } else { - tcp->tcp_ip6h->ip6_plen = - htons(len - IPV6_HDR_LEN); - } - - mp = dupb(*xmit_tail); - if (mp == NULL) { - return (-1); /* out_of_mem */ - } - mp->b_rptr = rptr; - /* - * If the old timestamp is no longer in use, - * sample a new timestamp now. - */ - if ((*xmit_tail)->b_next == NULL) { - (*xmit_tail)->b_prev = local_time; - (*xmit_tail)->b_next = - (mblk_t *)(uintptr_t)(*snxt-len); - } - goto must_alloc; - } - } else { - *xmit_tail = (*xmit_tail)->b_cont; - ASSERT((uintptr_t)((*xmit_tail)->b_wptr - - (*xmit_tail)->b_rptr) <= (uintptr_t)INT_MAX); - *tail_unsent = (int)((*xmit_tail)->b_wptr - - (*xmit_tail)->b_rptr); - } - - (*xmit_tail)->b_prev = local_time; - (*xmit_tail)->b_next = (mblk_t *)(uintptr_t)(*snxt - len); - - *tail_unsent -= len; - if (len <= mss) /* LSO is unusable (!do_lso_send) */ - tcp->tcp_last_sent_len = (ushort_t)len; - - len += total_hdr_len; - ixa->ixa_pktlen = len; - - if (ixa->ixa_flags & IXAF_IS_IPV4) { - tcp->tcp_ipha->ipha_length = htons(len); - } else { - tcp->tcp_ip6h->ip6_plen = htons(len - IPV6_HDR_LEN); - } - - mp = dupb(*xmit_tail); - if (mp == NULL) { - return (-1); /* out_of_mem */ - } - - len = total_hdr_len; - /* - * There are four reasons to allocate a new hdr mblk: - * 1) The bytes above us are in use by another packet - * 2) We don't have good alignment - * 3) The mblk is being shared - * 4) We don't have enough room for a header - */ - rptr = mp->b_rptr - len; - if (!OK_32PTR(rptr) || - ((db = mp->b_datap), db->db_ref != 2) || - rptr < db->db_base) { - /* NOTE: we assume allocb returns an OK_32PTR */ - - must_alloc:; - mp1 = allocb(connp->conn_ht_iphc_allocated + - tcps->tcps_wroff_xtra, BPRI_MED); - if (mp1 == NULL) { - freemsg(mp); - return (-1); /* out_of_mem */ - } - mp1->b_cont = mp; - mp = mp1; - /* Leave room for Link Level header */ - len = total_hdr_len; - rptr = &mp->b_rptr[tcps->tcps_wroff_xtra]; - mp->b_wptr = &rptr[len]; - } - - /* - * Fill in the header using the template header, and add - * options such as time-stamp, ECN and/or SACK, as needed. 
- */ - tcp_fill_header(tcp, rptr, (clock_t)local_time, num_sack_blk); - - mp->b_rptr = rptr; - - if (*tail_unsent) { - int spill = *tail_unsent; - - mp1 = mp->b_cont; - if (mp1 == NULL) - mp1 = mp; - - /* - * If we're a little short, tack on more mblks until - * there is no more spillover. - */ - while (spill < 0) { - mblk_t *nmp; - int nmpsz; - - nmp = (*xmit_tail)->b_cont; - nmpsz = MBLKL(nmp); - - /* - * Excess data in mblk; can we split it? - * If LSO is enabled for the connection, - * keep on splitting as this is a transient - * send path. - */ - if (!do_lso_send && (spill + nmpsz > 0)) { - /* - * Don't split if stream head was - * told to break up larger writes - * into smaller ones. - */ - if (tcp->tcp_maxpsz_multiplier > 0) - break; - - /* - * Next mblk is less than SMSS/2 - * rounded up to nearest 64-byte; - * let it get sent as part of the - * next segment. - */ - if (tcp->tcp_localnet && - !tcp->tcp_cork && - (nmpsz < roundup((mss >> 1), 64))) - break; - } - - *xmit_tail = nmp; - ASSERT((uintptr_t)nmpsz <= (uintptr_t)INT_MAX); - /* Stash for rtt use later */ - (*xmit_tail)->b_prev = local_time; - (*xmit_tail)->b_next = - (mblk_t *)(uintptr_t)(*snxt - len); - mp1->b_cont = dupb(*xmit_tail); - mp1 = mp1->b_cont; - - spill += nmpsz; - if (mp1 == NULL) { - *tail_unsent = spill; - freemsg(mp); - return (-1); /* out_of_mem */ - } - } - - /* Trim back any surplus on the last mblk */ - if (spill >= 0) { - mp1->b_wptr -= spill; - *tail_unsent = spill; - } else { - /* - * We did not send everything we could in - * order to remain within the b_cont limit. - */ - *usable -= spill; - *snxt += spill; - tcp->tcp_last_sent_len += spill; - UPDATE_MIB(&tcps->tcps_mib, - tcpOutDataBytes, spill); - /* - * Adjust the checksum - */ - tcpha = (tcpha_t *)(rptr + - ixa->ixa_ip_hdr_length); - sum += spill; - sum = (sum >> 16) + (sum & 0xFFFF); - tcpha->tha_sum = htons(sum); - if (connp->conn_ipversion == IPV4_VERSION) { - sum = ntohs( - ((ipha_t *)rptr)->ipha_length) + - spill; - ((ipha_t *)rptr)->ipha_length = - htons(sum); - } else { - sum = ntohs( - ((ip6_t *)rptr)->ip6_plen) + - spill; - ((ip6_t *)rptr)->ip6_plen = - htons(sum); - } - ixa->ixa_pktlen += spill; - *tail_unsent = 0; - } - } - if (tcp->tcp_ip_forward_progress) { - tcp->tcp_ip_forward_progress = B_FALSE; - ixa->ixa_flags |= IXAF_REACH_CONF; - } else { - ixa->ixa_flags &= ~IXAF_REACH_CONF; - } - - if (do_lso_send) { - /* Append LSO information to the mp. */ - lso_info_set(mp, mss, HW_LSO); - ixa->ixa_fragsize = IP_MAXPACKET; - ixa->ixa_extra_ident = num_lso_seg - 1; - - DTRACE_PROBE2(tcp_send_lso, int, num_lso_seg, - boolean_t, B_TRUE); - - tcp_send_data(tcp, mp); - - /* - * Restore values of ixa_fragsize and ixa_extra_ident. - */ - ixa->ixa_fragsize = ixa->ixa_pmtu; - ixa->ixa_extra_ident = 0; - tcp->tcp_obsegs += num_lso_seg; - TCP_STAT(tcps, tcp_lso_times); - TCP_STAT_UPDATE(tcps, tcp_lso_pkt_out, num_lso_seg); - } else { - /* - * Make sure to clean up LSO information. Wherever a - * new mp uses the prepended header room after dupb(), - * lso_info_cleanup() should be called. - */ - lso_info_cleanup(mp); - tcp_send_data(tcp, mp); - BUMP_LOCAL(tcp->tcp_obsegs); - } - } - - return (0); -} - -/* tcp_wput_flush is called by tcp_wput_nondata to handle M_FLUSH messages. */ -static void -tcp_wput_flush(tcp_t *tcp, mblk_t *mp) -{ - uchar_t fval = *mp->b_rptr; - mblk_t *tail; - conn_t *connp = tcp->tcp_connp; - queue_t *q = connp->conn_wq; - - /* TODO: How should flush interact with urgent data? 
*/ - if ((fval & FLUSHW) && tcp->tcp_xmit_head && - !(tcp->tcp_valid_bits & TCP_URG_VALID)) { - /* - * Flush only data that has not yet been put on the wire. If - * we flush data that we have already transmitted, life, as we - * know it, may come to an end. - */ - tail = tcp->tcp_xmit_tail; - tail->b_wptr -= tcp->tcp_xmit_tail_unsent; - tcp->tcp_xmit_tail_unsent = 0; - tcp->tcp_unsent = 0; - if (tail->b_wptr != tail->b_rptr) - tail = tail->b_cont; - if (tail) { - mblk_t **excess = &tcp->tcp_xmit_head; - for (;;) { - mblk_t *mp1 = *excess; - if (mp1 == tail) - break; - tcp->tcp_xmit_tail = mp1; - tcp->tcp_xmit_last = mp1; - excess = &mp1->b_cont; - } - *excess = NULL; - tcp_close_mpp(&tail); - if (tcp->tcp_snd_zcopy_aware) - tcp_zcopy_notify(tcp); - } - /* - * We have no unsent data, so unsent must be less than - * conn_sndlowat, so re-enable flow. - */ - mutex_enter(&tcp->tcp_non_sq_lock); - if (tcp->tcp_flow_stopped) { - tcp_clrqfull(tcp); - } - mutex_exit(&tcp->tcp_non_sq_lock); - } - /* - * TODO: you can't just flush these, you have to increase rwnd for one - * thing. For another, how should urgent data interact? - */ - if (fval & FLUSHR) { - *mp->b_rptr = fval & ~FLUSHW; - /* XXX */ - qreply(q, mp); - return; - } - freemsg(mp); -} - -/* - * tcp_wput_iocdata is called by tcp_wput_nondata to handle all M_IOCDATA - * messages. - */ -static void -tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp) -{ - mblk_t *mp1; - struct iocblk *iocp = (struct iocblk *)mp->b_rptr; - STRUCT_HANDLE(strbuf, sb); - uint_t addrlen; - conn_t *connp = tcp->tcp_connp; - queue_t *q = connp->conn_wq; - - /* Make sure it is one of ours. */ - switch (iocp->ioc_cmd) { - case TI_GETMYNAME: - case TI_GETPEERNAME: - break; - default: - /* - * If the conn is closing, then error the ioctl here. Otherwise - * use the CONN_IOCTLREF_* macros to hold off tcp_close until - * we're done here. - */ - mutex_enter(&connp->conn_lock); - if (connp->conn_state_flags & CONN_CLOSING) { - mutex_exit(&connp->conn_lock); - iocp->ioc_error = EINVAL; - mp->b_datap->db_type = M_IOCNAK; - iocp->ioc_count = 0; - qreply(q, mp); - return; - } - - CONN_INC_IOCTLREF_LOCKED(connp); - ip_wput_nondata(q, mp); - CONN_DEC_IOCTLREF(connp); - return; - } - switch (mi_copy_state(q, mp, &mp1)) { - case -1: - return; - case MI_COPY_CASE(MI_COPY_IN, 1): - break; - case MI_COPY_CASE(MI_COPY_OUT, 1): - /* Copy out the strbuf. */ - mi_copyout(q, mp); - return; - case MI_COPY_CASE(MI_COPY_OUT, 2): - /* All done. 
*/ - mi_copy_done(q, mp, 0); - return; - default: - mi_copy_done(q, mp, EPROTO); - return; - } - /* Check alignment of the strbuf */ - if (!OK_32PTR(mp1->b_rptr)) { - mi_copy_done(q, mp, EINVAL); - return; - } - - STRUCT_SET_HANDLE(sb, iocp->ioc_flag, (void *)mp1->b_rptr); - - if (connp->conn_family == AF_INET) - addrlen = sizeof (sin_t); - else - addrlen = sizeof (sin6_t); - - if (STRUCT_FGET(sb, maxlen) < addrlen) { - mi_copy_done(q, mp, EINVAL); - return; - } - - switch (iocp->ioc_cmd) { - case TI_GETMYNAME: - break; - case TI_GETPEERNAME: - if (tcp->tcp_state < TCPS_SYN_RCVD) { - mi_copy_done(q, mp, ENOTCONN); - return; - } - break; - } - mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE); - if (!mp1) - return; - - STRUCT_FSET(sb, len, addrlen); - switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { - case TI_GETMYNAME: - (void) conn_getsockname(connp, (struct sockaddr *)mp1->b_wptr, - &addrlen); - break; - case TI_GETPEERNAME: - (void) conn_getpeername(connp, (struct sockaddr *)mp1->b_wptr, - &addrlen); - break; - } - mp1->b_wptr += addrlen; - /* Copy out the address */ - mi_copyout(q, mp); -} - -static void -tcp_use_pure_tpi(tcp_t *tcp) -{ - conn_t *connp = tcp->tcp_connp; - -#ifdef _ILP32 - tcp->tcp_acceptor_id = (t_uscalar_t)connp->conn_rq; -#else - tcp->tcp_acceptor_id = connp->conn_dev; -#endif - /* - * Insert this socket into the acceptor hash. - * We might need it for T_CONN_RES message - */ - tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp); - - tcp->tcp_issocket = B_FALSE; - TCP_STAT(tcp->tcp_tcps, tcp_sock_fallback); -} - -/* - * tcp_wput_ioctl is called by tcp_wput_nondata() to handle all M_IOCTL - * messages. - */ -/* ARGSUSED */ -static void -tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) -{ - conn_t *connp = (conn_t *)arg; - tcp_t *tcp = connp->conn_tcp; - queue_t *q = connp->conn_wq; - struct iocblk *iocp; - - ASSERT(DB_TYPE(mp) == M_IOCTL); - /* - * Try and ASSERT the minimum possible references on the - * conn early enough. Since we are executing on write side, - * the connection is obviously not detached and that means - * there is a ref each for TCP and IP. Since we are behind - * the squeue, the minimum references needed are 3. If the - * conn is in classifier hash list, there should be an - * extra ref for that (we check both the possibilities). - */ - ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) || - (connp->conn_fanout == NULL && connp->conn_ref >= 3)); - - iocp = (struct iocblk *)mp->b_rptr; - switch (iocp->ioc_cmd) { - case _SIOCSOCKFALLBACK: - /* - * Either sockmod is about to be popped and the socket - * would now be treated as a plain stream, or a module - * is about to be pushed so we could no longer use read- - * side synchronous streams for fused loopback tcp. - * Drain any queued data and disable direct sockfs - * interface from now on. - */ - if (!tcp->tcp_issocket) { - DB_TYPE(mp) = M_IOCNAK; - iocp->ioc_error = EINVAL; - } else { - tcp_use_pure_tpi(tcp); - DB_TYPE(mp) = M_IOCACK; - iocp->ioc_error = 0; - } - iocp->ioc_count = 0; - iocp->ioc_rval = 0; - qreply(q, mp); - return; - } - - /* - * If the conn is closing, then error the ioctl here. Otherwise bump the - * conn_ioctlref to hold off tcp_close until we're done here. 
- */ - mutex_enter(&(connp)->conn_lock); - if ((connp)->conn_state_flags & CONN_CLOSING) { - mutex_exit(&(connp)->conn_lock); - iocp->ioc_error = EINVAL; - mp->b_datap->db_type = M_IOCNAK; - iocp->ioc_count = 0; - qreply(q, mp); - return; - } - - CONN_INC_IOCTLREF_LOCKED(connp); - ip_wput_nondata(q, mp); - CONN_DEC_IOCTLREF(connp); -} - -/* - * This routine is called by tcp_wput() to handle all TPI requests. - */ -/* ARGSUSED */ -static void -tcp_wput_proto(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) -{ - conn_t *connp = (conn_t *)arg; - tcp_t *tcp = connp->conn_tcp; - union T_primitives *tprim = (union T_primitives *)mp->b_rptr; - uchar_t *rptr; - t_scalar_t type; - cred_t *cr; - - /* - * Try and ASSERT the minimum possible references on the - * conn early enough. Since we are executing on write side, - * the connection is obviously not detached and that means - * there is a ref each for TCP and IP. Since we are behind - * the squeue, the minimum references needed are 3. If the - * conn is in classifier hash list, there should be an - * extra ref for that (we check both the possibilities). - */ - ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) || - (connp->conn_fanout == NULL && connp->conn_ref >= 3)); - - rptr = mp->b_rptr; - ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX); - if ((mp->b_wptr - rptr) >= sizeof (t_scalar_t)) { - type = ((union T_primitives *)rptr)->type; - if (type == T_EXDATA_REQ) { - tcp_output_urgent(connp, mp, arg2, NULL); - } else if (type != T_DATA_REQ) { - goto non_urgent_data; - } else { - /* TODO: options, flags, ... from user */ - /* Set length to zero for reclamation below */ - tcp_wput_data(tcp, mp->b_cont, B_TRUE); - freeb(mp); - } - return; - } else { - if (connp->conn_debug) { - (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, - "tcp_wput_proto, dropping one..."); - } - freemsg(mp); - return; - } - -non_urgent_data: - - switch ((int)tprim->type) { - case T_SSL_PROXY_BIND_REQ: /* an SSL proxy endpoint bind request */ - /* - * save the kssl_ent_t from the next block, and convert this - * back to a normal bind_req. - */ - if (mp->b_cont != NULL) { - ASSERT(MBLKL(mp->b_cont) >= sizeof (kssl_ent_t)); - - if (tcp->tcp_kssl_ent != NULL) { - kssl_release_ent(tcp->tcp_kssl_ent, NULL, - KSSL_NO_PROXY); - tcp->tcp_kssl_ent = NULL; - } - bcopy(mp->b_cont->b_rptr, &tcp->tcp_kssl_ent, - sizeof (kssl_ent_t)); - kssl_hold_ent(tcp->tcp_kssl_ent); - freemsg(mp->b_cont); - mp->b_cont = NULL; - } - tprim->type = T_BIND_REQ; - - /* FALLTHROUGH */ - case O_T_BIND_REQ: /* bind request */ - case T_BIND_REQ: /* new semantics bind request */ - tcp_tpi_bind(tcp, mp); - break; - case T_UNBIND_REQ: /* unbind request */ - tcp_tpi_unbind(tcp, mp); - break; - case O_T_CONN_RES: /* old connection response XXX */ - case T_CONN_RES: /* connection response */ - tcp_tli_accept(tcp, mp); - break; - case T_CONN_REQ: /* connection request */ - tcp_tpi_connect(tcp, mp); - break; - case T_DISCON_REQ: /* disconnect request */ - tcp_disconnect(tcp, mp); - break; - case T_CAPABILITY_REQ: - tcp_capability_req(tcp, mp); /* capability request */ - break; - case T_INFO_REQ: /* information request */ - tcp_info_req(tcp, mp); - break; - case T_SVR4_OPTMGMT_REQ: /* manage options req */ - case T_OPTMGMT_REQ: - /* - * Note: no support for snmpcom_req() through new - * T_OPTMGMT_REQ. See comments in ip.c - */ - - /* - * All Solaris components should pass a db_credp - * for this TPI message, hence we ASSERT. 
- * But in case there is some other M_PROTO that looks - * like a TPI message sent by some other kernel - * component, we check and return an error. - */ - cr = msg_getcred(mp, NULL); - ASSERT(cr != NULL); - if (cr == NULL) { - tcp_err_ack(tcp, mp, TSYSERR, EINVAL); - return; - } - /* - * If EINPROGRESS is returned, the request has been queued - * for subsequent processing by ip_restart_optmgmt(), which - * will do the CONN_DEC_REF(). - */ - if ((int)tprim->type == T_SVR4_OPTMGMT_REQ) { - svr4_optcom_req(connp->conn_wq, mp, cr, &tcp_opt_obj); - } else { - tpi_optcom_req(connp->conn_wq, mp, cr, &tcp_opt_obj); - } - break; - - case T_UNITDATA_REQ: /* unitdata request */ - tcp_err_ack(tcp, mp, TNOTSUPPORT, 0); - break; - case T_ORDREL_REQ: /* orderly release req */ - freemsg(mp); - - if (tcp->tcp_fused) - tcp_unfuse(tcp); - - if (tcp_xmit_end(tcp) != 0) { - /* - * We were crossing FINs and got a reset from - * the other side. Just ignore it. - */ - if (connp->conn_debug) { - (void) strlog(TCP_MOD_ID, 0, 1, - SL_ERROR|SL_TRACE, - "tcp_wput_proto, T_ORDREL_REQ out of " - "state %s", - tcp_display(tcp, NULL, - DISP_ADDR_AND_PORT)); - } - } - break; - case T_ADDR_REQ: - tcp_addr_req(tcp, mp); - break; - default: - if (connp->conn_debug) { - (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, - "tcp_wput_proto, bogus TPI msg, type %d", - tprim->type); - } - /* - * We used to M_ERROR. Sending TNOTSUPPORT gives the user - * to recover. - */ - tcp_err_ack(tcp, mp, TNOTSUPPORT, 0); - break; - } -} - /* * The TCP write service routine should never be called... */ @@ -17080,1339 +4082,10 @@ tcp_wsrv(queue_t *q) } /* - * Send out a control packet on the tcp connection specified. This routine - * is typically called where we need a simple ACK or RST generated. - */ -static void -tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, uint32_t ack, int ctl) -{ - uchar_t *rptr; - tcpha_t *tcpha; - ipha_t *ipha = NULL; - ip6_t *ip6h = NULL; - uint32_t sum; - int total_hdr_len; - int ip_hdr_len; - mblk_t *mp; - tcp_stack_t *tcps = tcp->tcp_tcps; - conn_t *connp = tcp->tcp_connp; - ip_xmit_attr_t *ixa = connp->conn_ixa; - - /* - * Save sum for use in source route later. - */ - sum = connp->conn_ht_ulp_len + connp->conn_sum; - total_hdr_len = connp->conn_ht_iphc_len; - ip_hdr_len = ixa->ixa_ip_hdr_length; - - /* If a text string is passed in with the request, pass it to strlog. */ - if (str != NULL && connp->conn_debug) { - (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, - "tcp_xmit_ctl: '%s', seq 0x%x, ack 0x%x, ctl 0x%x", - str, seq, ack, ctl); - } - mp = allocb(connp->conn_ht_iphc_allocated + tcps->tcps_wroff_xtra, - BPRI_MED); - if (mp == NULL) { - return; - } - rptr = &mp->b_rptr[tcps->tcps_wroff_xtra]; - mp->b_rptr = rptr; - mp->b_wptr = &rptr[total_hdr_len]; - bcopy(connp->conn_ht_iphc, rptr, total_hdr_len); - - ixa->ixa_pktlen = total_hdr_len; - - if (ixa->ixa_flags & IXAF_IS_IPV4) { - ipha = (ipha_t *)rptr; - ipha->ipha_length = htons(total_hdr_len); - } else { - ip6h = (ip6_t *)rptr; - ip6h->ip6_plen = htons(total_hdr_len - IPV6_HDR_LEN); - } - tcpha = (tcpha_t *)&rptr[ip_hdr_len]; - tcpha->tha_flags = (uint8_t)ctl; - if (ctl & TH_RST) { - BUMP_MIB(&tcps->tcps_mib, tcpOutRsts); - BUMP_MIB(&tcps->tcps_mib, tcpOutControl); - /* - * Don't send TSopt w/ TH_RST packets per RFC 1323. 
- */ - if (tcp->tcp_snd_ts_ok && - tcp->tcp_state > TCPS_SYN_SENT) { - mp->b_wptr = &rptr[total_hdr_len - TCPOPT_REAL_TS_LEN]; - *(mp->b_wptr) = TCPOPT_EOL; - - ixa->ixa_pktlen = total_hdr_len - TCPOPT_REAL_TS_LEN; - - if (connp->conn_ipversion == IPV4_VERSION) { - ipha->ipha_length = htons(total_hdr_len - - TCPOPT_REAL_TS_LEN); - } else { - ip6h->ip6_plen = htons(total_hdr_len - - IPV6_HDR_LEN - TCPOPT_REAL_TS_LEN); - } - tcpha->tha_offset_and_reserved -= (3 << 4); - sum -= TCPOPT_REAL_TS_LEN; - } - } - if (ctl & TH_ACK) { - if (tcp->tcp_snd_ts_ok) { - uint32_t llbolt = (uint32_t)LBOLT_FASTPATH; - - U32_TO_BE32(llbolt, - (char *)tcpha + TCP_MIN_HEADER_LENGTH+4); - U32_TO_BE32(tcp->tcp_ts_recent, - (char *)tcpha + TCP_MIN_HEADER_LENGTH+8); - } - - /* Update the latest receive window size in TCP header. */ - tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); - /* Track what we sent to the peer */ - tcp->tcp_tcpha->tha_win = tcpha->tha_win; - tcp->tcp_rack = ack; - tcp->tcp_rack_cnt = 0; - BUMP_MIB(&tcps->tcps_mib, tcpOutAck); - } - BUMP_LOCAL(tcp->tcp_obsegs); - tcpha->tha_seq = htonl(seq); - tcpha->tha_ack = htonl(ack); - /* - * Include the adjustment for a source route if any. - */ - sum = (sum >> 16) + (sum & 0xFFFF); - tcpha->tha_sum = htons(sum); - tcp_send_data(tcp, mp); -} - -/* - * If this routine returns B_TRUE, TCP can generate a RST in response - * to a segment. If it returns B_FALSE, TCP should not respond. - */ -static boolean_t -tcp_send_rst_chk(tcp_stack_t *tcps) -{ - int64_t now; - - /* - * TCP needs to protect itself from generating too many RSTs. - * This can be a DoS attack by sending us random segments - * soliciting RSTs. - * - * What we do here is to have a limit of tcp_rst_sent_rate RSTs - * in each 1 second interval. In this way, TCP still generate - * RSTs in normal cases but when under attack, the impact is - * limited. - */ - if (tcps->tcps_rst_sent_rate_enabled != 0) { - now = ddi_get_lbolt64(); - if (TICK_TO_MSEC(now - tcps->tcps_last_rst_intrvl) > - 1*SECONDS) { - tcps->tcps_last_rst_intrvl = now; - tcps->tcps_rst_cnt = 1; - } else if (++tcps->tcps_rst_cnt > tcps->tcps_rst_sent_rate) { - return (B_FALSE); - } - } - return (B_TRUE); -} - -/* - * Generate a reset based on an inbound packet, connp is set by caller - * when RST is in response to an unexpected inbound packet for which - * there is active tcp state in the system. - * - * IPSEC NOTE : Try to send the reply with the same protection as it came - * in. We have the ip_recv_attr_t which is reversed to form the ip_xmit_attr_t. - * That way the packet will go out at the same level of protection as it - * came in with. - */ -static void -tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, uint32_t ack, int ctl, - ip_recv_attr_t *ira, ip_stack_t *ipst, conn_t *connp) -{ - ipha_t *ipha = NULL; - ip6_t *ip6h = NULL; - ushort_t len; - tcpha_t *tcpha; - int i; - ipaddr_t v4addr; - in6_addr_t v6addr; - netstack_t *ns = ipst->ips_netstack; - tcp_stack_t *tcps = ns->netstack_tcp; - ip_xmit_attr_t ixas, *ixa; - uint_t ip_hdr_len = ira->ira_ip_hdr_length; - boolean_t need_refrele = B_FALSE; /* ixa_refrele(ixa) */ - ushort_t port; - - if (!tcp_send_rst_chk(tcps)) { - TCP_STAT(tcps, tcp_rst_unsent); - freemsg(mp); - return; - } - - /* - * If connp != NULL we use conn_ixa to keep IP_NEXTHOP and other - * options from the listener. In that case the caller must ensure that - * we are running on the listener = connp squeue. 
- * - * We get a safe copy of conn_ixa so we don't need to restore anything - * we or ip_output_simple might change in the ixa. - */ - if (connp != NULL) { - ASSERT(connp->conn_on_sqp); - - ixa = conn_get_ixa_exclusive(connp); - if (ixa == NULL) { - TCP_STAT(tcps, tcp_rst_unsent); - freemsg(mp); - return; - } - need_refrele = B_TRUE; - } else { - bzero(&ixas, sizeof (ixas)); - ixa = &ixas; - /* - * IXAF_VERIFY_SOURCE is overkill since we know the - * packet was for us. - */ - ixa->ixa_flags |= IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE; - ixa->ixa_protocol = IPPROTO_TCP; - ixa->ixa_zoneid = ira->ira_zoneid; - ixa->ixa_ifindex = 0; - ixa->ixa_ipst = ipst; - ixa->ixa_cred = kcred; - ixa->ixa_cpid = NOPID; - } - - if (str && tcps->tcps_dbg) { - (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, - "tcp_xmit_early_reset: '%s', seq 0x%x, ack 0x%x, " - "flags 0x%x", - str, seq, ack, ctl); - } - if (mp->b_datap->db_ref != 1) { - mblk_t *mp1 = copyb(mp); - freemsg(mp); - mp = mp1; - if (mp == NULL) - goto done; - } else if (mp->b_cont) { - freemsg(mp->b_cont); - mp->b_cont = NULL; - DB_CKSUMFLAGS(mp) = 0; - } - /* - * We skip reversing source route here. - * (for now we replace all IP options with EOL) - */ - if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { - ipha = (ipha_t *)mp->b_rptr; - for (i = IP_SIMPLE_HDR_LENGTH; i < (int)ip_hdr_len; i++) - mp->b_rptr[i] = IPOPT_EOL; - /* - * Make sure that src address isn't flagrantly invalid. - * Not all broadcast address checking for the src address - * is possible, since we don't know the netmask of the src - * addr. No check for destination address is done, since - * IP will not pass up a packet with a broadcast dest - * address to TCP. Similar checks are done below for IPv6. - */ - if (ipha->ipha_src == 0 || ipha->ipha_src == INADDR_BROADCAST || - CLASSD(ipha->ipha_src)) { - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards); - ip_drop_input("ipIfStatsInDiscards", mp, NULL); - freemsg(mp); - goto done; - } - } else { - ip6h = (ip6_t *)mp->b_rptr; - - if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) || - IN6_IS_ADDR_MULTICAST(&ip6h->ip6_src)) { - BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsInDiscards); - ip_drop_input("ipIfStatsInDiscards", mp, NULL); - freemsg(mp); - goto done; - } - - /* Remove any extension headers assuming partial overlay */ - if (ip_hdr_len > IPV6_HDR_LEN) { - uint8_t *to; - - to = mp->b_rptr + ip_hdr_len - IPV6_HDR_LEN; - ovbcopy(ip6h, to, IPV6_HDR_LEN); - mp->b_rptr += ip_hdr_len - IPV6_HDR_LEN; - ip_hdr_len = IPV6_HDR_LEN; - ip6h = (ip6_t *)mp->b_rptr; - ip6h->ip6_nxt = IPPROTO_TCP; - } - } - tcpha = (tcpha_t *)&mp->b_rptr[ip_hdr_len]; - if (tcpha->tha_flags & TH_RST) { - freemsg(mp); - goto done; - } - tcpha->tha_offset_and_reserved = (5 << 4); - len = ip_hdr_len + sizeof (tcpha_t); - mp->b_wptr = &mp->b_rptr[len]; - if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { - ipha->ipha_length = htons(len); - /* Swap addresses */ - v4addr = ipha->ipha_src; - ipha->ipha_src = ipha->ipha_dst; - ipha->ipha_dst = v4addr; - ipha->ipha_ident = 0; - ipha->ipha_ttl = (uchar_t)tcps->tcps_ipv4_ttl; - ixa->ixa_flags |= IXAF_IS_IPV4; - ixa->ixa_ip_hdr_length = ip_hdr_len; - } else { - ip6h->ip6_plen = htons(len - IPV6_HDR_LEN); - /* Swap addresses */ - v6addr = ip6h->ip6_src; - ip6h->ip6_src = ip6h->ip6_dst; - ip6h->ip6_dst = v6addr; - ip6h->ip6_hops = (uchar_t)tcps->tcps_ipv6_hoplimit; - ixa->ixa_flags &= ~IXAF_IS_IPV4; - - if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_dst)) { - ixa->ixa_flags |= IXAF_SCOPEID_SET; - ixa->ixa_scopeid = ira->ira_ruifindex; - } - 
ixa->ixa_ip_hdr_length = IPV6_HDR_LEN; - } - ixa->ixa_pktlen = len; - - /* Swap the ports */ - port = tcpha->tha_fport; - tcpha->tha_fport = tcpha->tha_lport; - tcpha->tha_lport = port; - - tcpha->tha_ack = htonl(ack); - tcpha->tha_seq = htonl(seq); - tcpha->tha_win = 0; - tcpha->tha_sum = htons(sizeof (tcpha_t)); - tcpha->tha_flags = (uint8_t)ctl; - if (ctl & TH_RST) { - BUMP_MIB(&tcps->tcps_mib, tcpOutRsts); - BUMP_MIB(&tcps->tcps_mib, tcpOutControl); - } - - /* Discard any old label */ - if (ixa->ixa_free_flags & IXA_FREE_TSL) { - ASSERT(ixa->ixa_tsl != NULL); - label_rele(ixa->ixa_tsl); - ixa->ixa_free_flags &= ~IXA_FREE_TSL; - } - ixa->ixa_tsl = ira->ira_tsl; /* Behave as a multi-level responder */ - - if (ira->ira_flags & IRAF_IPSEC_SECURE) { - /* - * Apply IPsec based on how IPsec was applied to - * the packet that caused the RST. - */ - if (!ipsec_in_to_out(ira, ixa, mp, ipha, ip6h)) { - BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); - /* Note: mp already consumed and ip_drop_packet done */ - goto done; - } - } else { - /* - * This is in clear. The RST message we are building - * here should go out in clear, independent of our policy. - */ - ixa->ixa_flags |= IXAF_NO_IPSEC; - } - - /* - * NOTE: one might consider tracing a TCP packet here, but - * this function has no active TCP state and no tcp structure - * that has a trace buffer. If we traced here, we would have - * to keep a local trace buffer in tcp_record_trace(). - */ - - (void) ip_output_simple(mp, ixa); -done: - ixa_cleanup(ixa); - if (need_refrele) { - ASSERT(ixa != &ixas); - ixa_refrele(ixa); - } -} - -/* - * Initiate closedown sequence on an active connection. (May be called as - * writer.) Return value zero for OK return, non-zero for error return. - */ -static int -tcp_xmit_end(tcp_t *tcp) -{ - mblk_t *mp; - tcp_stack_t *tcps = tcp->tcp_tcps; - iulp_t uinfo; - ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; - conn_t *connp = tcp->tcp_connp; - - if (tcp->tcp_state < TCPS_SYN_RCVD || - tcp->tcp_state > TCPS_CLOSE_WAIT) { - /* - * Invalid state, only states TCPS_SYN_RCVD, - * TCPS_ESTABLISHED and TCPS_CLOSE_WAIT are valid - */ - return (-1); - } - - tcp->tcp_fss = tcp->tcp_snxt + tcp->tcp_unsent; - tcp->tcp_valid_bits |= TCP_FSS_VALID; - /* - * If there is nothing more unsent, send the FIN now. - * Otherwise, it will go out with the last segment. - */ - if (tcp->tcp_unsent == 0) { - mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, - tcp->tcp_fss, B_FALSE, NULL, B_FALSE); - - if (mp) { - tcp_send_data(tcp, mp); - } else { - /* - * Couldn't allocate msg. Pretend we got it out. - * Wait for rexmit timeout. - */ - tcp->tcp_snxt = tcp->tcp_fss + 1; - TCP_TIMER_RESTART(tcp, tcp->tcp_rto); - } - - /* - * If needed, update tcp_rexmit_snxt as tcp_snxt is - * changed. - */ - if (tcp->tcp_rexmit && tcp->tcp_rexmit_nxt == tcp->tcp_fss) { - tcp->tcp_rexmit_nxt = tcp->tcp_snxt; - } - } else { - /* - * If tcp->tcp_cork is set, then the data will not get sent, - * so we have to check that and unset it first. - */ - if (tcp->tcp_cork) - tcp->tcp_cork = B_FALSE; - tcp_wput_data(tcp, NULL, B_FALSE); - } - - /* - * If TCP does not get enough samples of RTT or tcp_rtt_updates - * is 0, don't update the cache. - */ - if (tcps->tcps_rtt_updates == 0 || - tcp->tcp_rtt_update < tcps->tcps_rtt_updates) - return (0); - - /* - * We do not have a good algorithm to update ssthresh at this time. - * So don't do any update. 
- */ - bzero(&uinfo, sizeof (uinfo)); - uinfo.iulp_rtt = tcp->tcp_rtt_sa; - uinfo.iulp_rtt_sd = tcp->tcp_rtt_sd; - - /* - * Note that uinfo is kept for conn_faddr in the DCE. Could update even - * if source routed but we don't. - */ - if (connp->conn_ipversion == IPV4_VERSION) { - if (connp->conn_faddr_v4 != tcp->tcp_ipha->ipha_dst) { - return (0); - } - (void) dce_update_uinfo_v4(connp->conn_faddr_v4, &uinfo, ipst); - } else { - uint_t ifindex; - - if (!(IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6, - &tcp->tcp_ip6h->ip6_dst))) { - return (0); - } - ifindex = 0; - if (IN6_IS_ADDR_LINKSCOPE(&connp->conn_faddr_v6)) { - ip_xmit_attr_t *ixa = connp->conn_ixa; - - /* - * If we are going to create a DCE we'd better have - * an ifindex - */ - if (ixa->ixa_nce != NULL) { - ifindex = ixa->ixa_nce->nce_common->ncec_ill-> - ill_phyint->phyint_ifindex; - } else { - return (0); - } - } - - (void) dce_update_uinfo(&connp->conn_faddr_v6, ifindex, &uinfo, - ipst); - } - return (0); -} - -/* - * Generate a "no listener here" RST in response to an "unknown" segment. - * connp is set by caller when RST is in response to an unexpected - * inbound packet for which there is active tcp state in the system. - * Note that we are reusing the incoming mp to construct the outgoing RST. - */ -void -tcp_xmit_listeners_reset(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst, - conn_t *connp) -{ - uchar_t *rptr; - uint32_t seg_len; - tcpha_t *tcpha; - uint32_t seg_seq; - uint32_t seg_ack; - uint_t flags; - ipha_t *ipha; - ip6_t *ip6h; - boolean_t policy_present; - netstack_t *ns = ipst->ips_netstack; - tcp_stack_t *tcps = ns->netstack_tcp; - ipsec_stack_t *ipss = tcps->tcps_netstack->netstack_ipsec; - uint_t ip_hdr_len = ira->ira_ip_hdr_length; - - TCP_STAT(tcps, tcp_no_listener); - - if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { - policy_present = ipss->ipsec_inbound_v4_policy_present; - ipha = (ipha_t *)mp->b_rptr; - ip6h = NULL; - } else { - policy_present = ipss->ipsec_inbound_v6_policy_present; - ipha = NULL; - ip6h = (ip6_t *)mp->b_rptr; - } - - if (policy_present) { - /* - * The conn_t parameter is NULL because we already know - * nobody's home. - */ - mp = ipsec_check_global_policy(mp, (conn_t *)NULL, ipha, ip6h, - ira, ns); - if (mp == NULL) - return; - } - if (is_system_labeled() && !tsol_can_reply_error(mp, ira)) { - DTRACE_PROBE2( - tx__ip__log__error__nolistener__tcp, - char *, "Could not reply with RST to mp(1)", - mblk_t *, mp); - ip2dbg(("tcp_xmit_listeners_reset: not permitted to reply\n")); - freemsg(mp); - return; - } - - rptr = mp->b_rptr; - - tcpha = (tcpha_t *)&rptr[ip_hdr_len]; - seg_seq = ntohl(tcpha->tha_seq); - seg_ack = ntohl(tcpha->tha_ack); - flags = tcpha->tha_flags; - - seg_len = msgdsize(mp) - (TCP_HDR_LENGTH(tcpha) + ip_hdr_len); - if (flags & TH_RST) { - freemsg(mp); - } else if (flags & TH_ACK) { - tcp_xmit_early_reset("no tcp, reset", mp, seg_ack, 0, TH_RST, - ira, ipst, connp); - } else { - if (flags & TH_SYN) { - seg_len++; - } else { - /* - * Here we violate the RFC. Note that a normal - * TCP will never send a segment without the ACK - * flag, except for RST or SYN segment. This - * segment is neither. Just drop it on the - * floor. - */ - freemsg(mp); - TCP_STAT(tcps, tcp_rst_unsent); - return; - } - - tcp_xmit_early_reset("no tcp, reset/ack", mp, 0, - seg_seq + seg_len, TH_RST | TH_ACK, ira, ipst, connp); - } -} - -/* - * tcp_xmit_mp is called to return a pointer to an mblk chain complete with - * ip and tcp header ready to pass down to IP. 
If the mp passed in is - * non-NULL, then up to max_to_send bytes of data will be dup'ed off that - * mblk. (If sendall is not set the dup'ing will stop at an mblk boundary - * otherwise it will dup partial mblks.) - * Otherwise, an appropriate ACK packet will be generated. This - * routine is not usually called to send new data for the first time. It - * is mostly called out of the timer for retransmits, and to generate ACKs. - * - * If offset is not NULL, the returned mblk chain's first mblk's b_rptr will - * be adjusted by *offset. And after dupb(), the offset and the ending mblk - * of the original mblk chain will be returned in *offset and *end_mp. - */ -mblk_t * -tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, - mblk_t **end_mp, uint32_t seq, boolean_t sendall, uint32_t *seg_len, - boolean_t rexmit) -{ - int data_length; - int32_t off = 0; - uint_t flags; - mblk_t *mp1; - mblk_t *mp2; - uchar_t *rptr; - tcpha_t *tcpha; - int32_t num_sack_blk = 0; - int32_t sack_opt_len = 0; - tcp_stack_t *tcps = tcp->tcp_tcps; - conn_t *connp = tcp->tcp_connp; - ip_xmit_attr_t *ixa = connp->conn_ixa; - - /* Allocate for our maximum TCP header + link-level */ - mp1 = allocb(connp->conn_ht_iphc_allocated + tcps->tcps_wroff_xtra, - BPRI_MED); - if (!mp1) - return (NULL); - data_length = 0; - - /* - * Note that tcp_mss has been adjusted to take into account the - * timestamp option if applicable. Because SACK options do not - * appear in every TCP segments and they are of variable lengths, - * they cannot be included in tcp_mss. Thus we need to calculate - * the actual segment length when we need to send a segment which - * includes SACK options. - */ - if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { - num_sack_blk = MIN(tcp->tcp_max_sack_blk, - tcp->tcp_num_sack_blk); - sack_opt_len = num_sack_blk * sizeof (sack_blk_t) + - TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN; - if (max_to_send + sack_opt_len > tcp->tcp_mss) - max_to_send -= sack_opt_len; - } - - if (offset != NULL) { - off = *offset; - /* We use offset as an indicator that end_mp is not NULL. */ - *end_mp = NULL; - } - for (mp2 = mp1; mp && data_length != max_to_send; mp = mp->b_cont) { - /* This could be faster with cooperation from downstream */ - if (mp2 != mp1 && !sendall && - data_length + (int)(mp->b_wptr - mp->b_rptr) > - max_to_send) - /* - * Don't send the next mblk since the whole mblk - * does not fit. - */ - break; - mp2->b_cont = dupb(mp); - mp2 = mp2->b_cont; - if (!mp2) { - freemsg(mp1); - return (NULL); - } - mp2->b_rptr += off; - ASSERT((uintptr_t)(mp2->b_wptr - mp2->b_rptr) <= - (uintptr_t)INT_MAX); - - data_length += (int)(mp2->b_wptr - mp2->b_rptr); - if (data_length > max_to_send) { - mp2->b_wptr -= data_length - max_to_send; - data_length = max_to_send; - off = mp2->b_wptr - mp->b_rptr; - break; - } else { - off = 0; - } - } - if (offset != NULL) { - *offset = off; - *end_mp = mp; - } - if (seg_len != NULL) { - *seg_len = data_length; - } - - /* Update the latest receive window size in TCP header. */ - tcp->tcp_tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); - - rptr = mp1->b_rptr + tcps->tcps_wroff_xtra; - mp1->b_rptr = rptr; - mp1->b_wptr = rptr + connp->conn_ht_iphc_len + sack_opt_len; - bcopy(connp->conn_ht_iphc, rptr, connp->conn_ht_iphc_len); - tcpha = (tcpha_t *)&rptr[ixa->ixa_ip_hdr_length]; - tcpha->tha_seq = htonl(seq); - - /* - * Use tcp_unsent to determine if the PUSH bit should be used assumes - * that this function was called from tcp_wput_data. 
Thus, when called - * to retransmit data the setting of the PUSH bit may appear some - * what random in that it might get set when it should not. This - * should not pose any performance issues. - */ - if (data_length != 0 && (tcp->tcp_unsent == 0 || - tcp->tcp_unsent == data_length)) { - flags = TH_ACK | TH_PUSH; - } else { - flags = TH_ACK; - } - - if (tcp->tcp_ecn_ok) { - if (tcp->tcp_ecn_echo_on) - flags |= TH_ECE; - - /* - * Only set ECT bit and ECN_CWR if a segment contains new data. - * There is no TCP flow control for non-data segments, and - * only data segment is transmitted reliably. - */ - if (data_length > 0 && !rexmit) { - SET_ECT(tcp, rptr); - if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) { - flags |= TH_CWR; - tcp->tcp_ecn_cwr_sent = B_TRUE; - } - } - } - - if (tcp->tcp_valid_bits) { - uint32_t u1; - - if ((tcp->tcp_valid_bits & TCP_ISS_VALID) && - seq == tcp->tcp_iss) { - uchar_t *wptr; - - /* - * If TCP_ISS_VALID and the seq number is tcp_iss, - * TCP can only be in SYN-SENT, SYN-RCVD or - * FIN-WAIT-1 state. It can be FIN-WAIT-1 if - * our SYN is not ack'ed but the app closes this - * TCP connection. - */ - ASSERT(tcp->tcp_state == TCPS_SYN_SENT || - tcp->tcp_state == TCPS_SYN_RCVD || - tcp->tcp_state == TCPS_FIN_WAIT_1); - - /* - * Tack on the MSS option. It is always needed - * for both active and passive open. - * - * MSS option value should be interface MTU - MIN - * TCP/IP header according to RFC 793 as it means - * the maximum segment size TCP can receive. But - * to get around some broken middle boxes/end hosts - * out there, we allow the option value to be the - * same as the MSS option size on the peer side. - * In this way, the other side will not send - * anything larger than they can receive. - * - * Note that for SYN_SENT state, the ndd param - * tcp_use_smss_as_mss_opt has no effect as we - * don't know the peer's MSS option value. So - * the only case we need to take care of is in - * SYN_RCVD state, which is done later. - */ - wptr = mp1->b_wptr; - wptr[0] = TCPOPT_MAXSEG; - wptr[1] = TCPOPT_MAXSEG_LEN; - wptr += 2; - u1 = tcp->tcp_initial_pmtu - - (connp->conn_ipversion == IPV4_VERSION ? - IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) - - TCP_MIN_HEADER_LENGTH; - U16_TO_BE16(u1, wptr); - mp1->b_wptr = wptr + 2; - /* Update the offset to cover the additional word */ - tcpha->tha_offset_and_reserved += (1 << 4); - - /* - * Note that the following way of filling in - * TCP options are not optimal. Some NOPs can - * be saved. But there is no need at this time - * to optimize it. When it is needed, we will - * do it. - */ - switch (tcp->tcp_state) { - case TCPS_SYN_SENT: - flags = TH_SYN; - - if (tcp->tcp_snd_ts_ok) { - uint32_t llbolt = - (uint32_t)LBOLT_FASTPATH; - - wptr = mp1->b_wptr; - wptr[0] = TCPOPT_NOP; - wptr[1] = TCPOPT_NOP; - wptr[2] = TCPOPT_TSTAMP; - wptr[3] = TCPOPT_TSTAMP_LEN; - wptr += 4; - U32_TO_BE32(llbolt, wptr); - wptr += 4; - ASSERT(tcp->tcp_ts_recent == 0); - U32_TO_BE32(0L, wptr); - mp1->b_wptr += TCPOPT_REAL_TS_LEN; - tcpha->tha_offset_and_reserved += - (3 << 4); - } - - /* - * Set up all the bits to tell other side - * we are ECN capable. - */ - if (tcp->tcp_ecn_ok) { - flags |= (TH_ECE | TH_CWR); - } - break; - case TCPS_SYN_RCVD: - flags |= TH_SYN; - - /* - * Reset the MSS option value to be SMSS - * We should probably add back the bytes - * for timestamp option and IPsec. We - * don't do that as this is a workaround - * for broken middle boxes/end hosts, it - * is better for us to be more cautious. 
- * They may not take these things into - * account in their SMSS calculation. Thus - * the peer's calculated SMSS may be smaller - * than what it can be. This should be OK. - */ - if (tcps->tcps_use_smss_as_mss_opt) { - u1 = tcp->tcp_mss; - U16_TO_BE16(u1, wptr); - } - - /* - * If the other side is ECN capable, reply - * that we are also ECN capable. - */ - if (tcp->tcp_ecn_ok) - flags |= TH_ECE; - break; - default: - /* - * The above ASSERT() makes sure that this - * must be FIN-WAIT-1 state. Our SYN has - * not been ack'ed so retransmit it. - */ - flags |= TH_SYN; - break; - } - - if (tcp->tcp_snd_ws_ok) { - wptr = mp1->b_wptr; - wptr[0] = TCPOPT_NOP; - wptr[1] = TCPOPT_WSCALE; - wptr[2] = TCPOPT_WS_LEN; - wptr[3] = (uchar_t)tcp->tcp_rcv_ws; - mp1->b_wptr += TCPOPT_REAL_WS_LEN; - tcpha->tha_offset_and_reserved += (1 << 4); - } - - if (tcp->tcp_snd_sack_ok) { - wptr = mp1->b_wptr; - wptr[0] = TCPOPT_NOP; - wptr[1] = TCPOPT_NOP; - wptr[2] = TCPOPT_SACK_PERMITTED; - wptr[3] = TCPOPT_SACK_OK_LEN; - mp1->b_wptr += TCPOPT_REAL_SACK_OK_LEN; - tcpha->tha_offset_and_reserved += (1 << 4); - } - - /* allocb() of adequate mblk assures space */ - ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <= - (uintptr_t)INT_MAX); - u1 = (int)(mp1->b_wptr - mp1->b_rptr); - /* - * Get IP set to checksum on our behalf - * Include the adjustment for a source route if any. - */ - u1 += connp->conn_sum; - u1 = (u1 >> 16) + (u1 & 0xFFFF); - tcpha->tha_sum = htons(u1); - BUMP_MIB(&tcps->tcps_mib, tcpOutControl); - } - if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && - (seq + data_length) == tcp->tcp_fss) { - if (!tcp->tcp_fin_acked) { - flags |= TH_FIN; - BUMP_MIB(&tcps->tcps_mib, tcpOutControl); - } - if (!tcp->tcp_fin_sent) { - tcp->tcp_fin_sent = B_TRUE; - switch (tcp->tcp_state) { - case TCPS_SYN_RCVD: - case TCPS_ESTABLISHED: - tcp->tcp_state = TCPS_FIN_WAIT_1; - break; - case TCPS_CLOSE_WAIT: - tcp->tcp_state = TCPS_LAST_ACK; - break; - } - if (tcp->tcp_suna == tcp->tcp_snxt) - TCP_TIMER_RESTART(tcp, tcp->tcp_rto); - tcp->tcp_snxt = tcp->tcp_fss + 1; - } - } - /* - * Note the trick here. u1 is unsigned. When tcp_urg - * is smaller than seq, u1 will become a very huge value. - * So the comparison will fail. Also note that tcp_urp - * should be positive, see RFC 793 page 17. 
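The unsigned-wraparound trick above can be seen in isolation with ordinary user-level C. In the sketch below the values are made up and the "+ 1" stands in for TCP_OLD_URP_INTERPRETATION (assumed here to be 1); it only demonstrates why an urgent pointer at or behind the current sequence number fails the 64K range check.

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
        uint32_t seq = 1000;            /* hypothetical send sequence */
        uint32_t urg_behind = 900;      /* urgent pointer already passed */
        uint32_t urg_ahead = 1500;      /* urgent pointer still ahead */
        uint32_t u1;

        /* 900 - 1000 wraps to ~4.29e9, so the "< 64K" test rejects it. */
        u1 = urg_behind - seq + 1;
        printf("behind: u1 = %u -> urgent %s\n", u1,
            (u1 != 0 && u1 < 64 * 1024) ? "set" : "not set");

        /* 1500 - 1000 + 1 = 501, a small positive offset, so it passes. */
        u1 = urg_ahead - seq + 1;
        printf("ahead:  u1 = %u -> urgent %s\n", u1,
            (u1 != 0 && u1 < 64 * 1024) ? "set" : "not set");
        return (0);
    }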
- */ - u1 = tcp->tcp_urg - seq + TCP_OLD_URP_INTERPRETATION; - if ((tcp->tcp_valid_bits & TCP_URG_VALID) && u1 != 0 && - u1 < (uint32_t)(64 * 1024)) { - flags |= TH_URG; - BUMP_MIB(&tcps->tcps_mib, tcpOutUrg); - tcpha->tha_urp = htons(u1); - } - } - tcpha->tha_flags = (uchar_t)flags; - tcp->tcp_rack = tcp->tcp_rnxt; - tcp->tcp_rack_cnt = 0; - - if (tcp->tcp_snd_ts_ok) { - if (tcp->tcp_state != TCPS_SYN_SENT) { - uint32_t llbolt = (uint32_t)LBOLT_FASTPATH; - - U32_TO_BE32(llbolt, - (char *)tcpha + TCP_MIN_HEADER_LENGTH+4); - U32_TO_BE32(tcp->tcp_ts_recent, - (char *)tcpha + TCP_MIN_HEADER_LENGTH+8); - } - } - - if (num_sack_blk > 0) { - uchar_t *wptr = (uchar_t *)tcpha + connp->conn_ht_ulp_len; - sack_blk_t *tmp; - int32_t i; - - wptr[0] = TCPOPT_NOP; - wptr[1] = TCPOPT_NOP; - wptr[2] = TCPOPT_SACK; - wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk * - sizeof (sack_blk_t); - wptr += TCPOPT_REAL_SACK_LEN; - - tmp = tcp->tcp_sack_list; - for (i = 0; i < num_sack_blk; i++) { - U32_TO_BE32(tmp[i].begin, wptr); - wptr += sizeof (tcp_seq); - U32_TO_BE32(tmp[i].end, wptr); - wptr += sizeof (tcp_seq); - } - tcpha->tha_offset_and_reserved += ((num_sack_blk * 2 + 1) << 4); - } - ASSERT((uintptr_t)(mp1->b_wptr - rptr) <= (uintptr_t)INT_MAX); - data_length += (int)(mp1->b_wptr - rptr); - - ixa->ixa_pktlen = data_length; - - if (ixa->ixa_flags & IXAF_IS_IPV4) { - ((ipha_t *)rptr)->ipha_length = htons(data_length); - } else { - ip6_t *ip6 = (ip6_t *)rptr; - - ip6->ip6_plen = htons(data_length - IPV6_HDR_LEN); - } - - /* - * Prime pump for IP - * Include the adjustment for a source route if any. - */ - data_length -= ixa->ixa_ip_hdr_length; - data_length += connp->conn_sum; - data_length = (data_length >> 16) + (data_length & 0xFFFF); - tcpha->tha_sum = htons(data_length); - if (tcp->tcp_ip_forward_progress) { - tcp->tcp_ip_forward_progress = B_FALSE; - connp->conn_ixa->ixa_flags |= IXAF_REACH_CONF; - } else { - connp->conn_ixa->ixa_flags &= ~IXAF_REACH_CONF; - } - return (mp1); -} - -/* This function handles the push timeout. */ -void -tcp_push_timer(void *arg) -{ - conn_t *connp = (conn_t *)arg; - tcp_t *tcp = connp->conn_tcp; - - TCP_DBGSTAT(tcp->tcp_tcps, tcp_push_timer_cnt); - - ASSERT(tcp->tcp_listener == NULL); - - ASSERT(!IPCL_IS_NONSTR(connp)); - - tcp->tcp_push_tid = 0; - - if (tcp->tcp_rcv_list != NULL && - tcp_rcv_drain(tcp) == TH_ACK_NEEDED) - tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK); -} - -/* - * This function handles delayed ACK timeout. - */ -static void -tcp_ack_timer(void *arg) -{ - conn_t *connp = (conn_t *)arg; - tcp_t *tcp = connp->conn_tcp; - mblk_t *mp; - tcp_stack_t *tcps = tcp->tcp_tcps; - - TCP_DBGSTAT(tcps, tcp_ack_timer_cnt); - - tcp->tcp_ack_tid = 0; - - if (tcp->tcp_fused) - return; - - /* - * Do not send ACK if there is no outstanding unack'ed data. - */ - if (tcp->tcp_rnxt == tcp->tcp_rack) { - return; - } - - if ((tcp->tcp_rnxt - tcp->tcp_rack) > tcp->tcp_mss) { - /* - * Make sure we don't allow deferred ACKs to result in - * timer-based ACKing. If we have held off an ACK - * when there was more than an mss here, and the timer - * goes off, we have to worry about the possibility - * that the sender isn't doing slow-start, or is out - * of step with us for some other reason. We fall - * permanently back in the direction of - * ACK-every-other-packet as suggested in RFC 1122. 
- */ - if (tcp->tcp_rack_abs_max > 2) - tcp->tcp_rack_abs_max--; - tcp->tcp_rack_cur_max = 2; - } - mp = tcp_ack_mp(tcp); - - if (mp != NULL) { - BUMP_LOCAL(tcp->tcp_obsegs); - BUMP_MIB(&tcps->tcps_mib, tcpOutAck); - BUMP_MIB(&tcps->tcps_mib, tcpOutAckDelayed); - tcp_send_data(tcp, mp); - } -} - - -/* Generate an ACK-only (no data) segment for a TCP endpoint */ -static mblk_t * -tcp_ack_mp(tcp_t *tcp) -{ - uint32_t seq_no; - tcp_stack_t *tcps = tcp->tcp_tcps; - conn_t *connp = tcp->tcp_connp; - - /* - * There are a few cases to be considered while setting the sequence no. - * Essentially, we can come here while processing an unacceptable pkt - * in the TCPS_SYN_RCVD state, in which case we set the sequence number - * to snxt (per RFC 793), note the swnd wouldn't have been set yet. - * If we are here for a zero window probe, stick with suna. In all - * other cases, we check if suna + swnd encompasses snxt and set - * the sequence number to snxt, if so. If snxt falls outside the - * window (the receiver probably shrunk its window), we will go with - * suna + swnd, otherwise the sequence no will be unacceptable to the - * receiver. - */ - if (tcp->tcp_zero_win_probe) { - seq_no = tcp->tcp_suna; - } else if (tcp->tcp_state == TCPS_SYN_RCVD) { - ASSERT(tcp->tcp_swnd == 0); - seq_no = tcp->tcp_snxt; - } else { - seq_no = SEQ_GT(tcp->tcp_snxt, - (tcp->tcp_suna + tcp->tcp_swnd)) ? - (tcp->tcp_suna + tcp->tcp_swnd) : tcp->tcp_snxt; - } - - if (tcp->tcp_valid_bits) { - /* - * For the complex case where we have to send some - * controls (FIN or SYN), let tcp_xmit_mp do it. - */ - return (tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, seq_no, B_FALSE, - NULL, B_FALSE)); - } else { - /* Generate a simple ACK */ - int data_length; - uchar_t *rptr; - tcpha_t *tcpha; - mblk_t *mp1; - int32_t total_hdr_len; - int32_t tcp_hdr_len; - int32_t num_sack_blk = 0; - int32_t sack_opt_len; - ip_xmit_attr_t *ixa = connp->conn_ixa; - - /* - * Allocate space for TCP + IP headers - * and link-level header - */ - if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { - num_sack_blk = MIN(tcp->tcp_max_sack_blk, - tcp->tcp_num_sack_blk); - sack_opt_len = num_sack_blk * sizeof (sack_blk_t) + - TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN; - total_hdr_len = connp->conn_ht_iphc_len + sack_opt_len; - tcp_hdr_len = connp->conn_ht_ulp_len + sack_opt_len; - } else { - total_hdr_len = connp->conn_ht_iphc_len; - tcp_hdr_len = connp->conn_ht_ulp_len; - } - mp1 = allocb(total_hdr_len + tcps->tcps_wroff_xtra, BPRI_MED); - if (!mp1) - return (NULL); - - /* Update the latest receive window size in TCP header. */ - tcp->tcp_tcpha->tha_win = - htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); - /* copy in prototype TCP + IP header */ - rptr = mp1->b_rptr + tcps->tcps_wroff_xtra; - mp1->b_rptr = rptr; - mp1->b_wptr = rptr + total_hdr_len; - bcopy(connp->conn_ht_iphc, rptr, connp->conn_ht_iphc_len); - - tcpha = (tcpha_t *)&rptr[ixa->ixa_ip_hdr_length]; - - /* Set the TCP sequence number. */ - tcpha->tha_seq = htonl(seq_no); - - /* Set up the TCP flag field. 
*/ - tcpha->tha_flags = (uchar_t)TH_ACK; - if (tcp->tcp_ecn_echo_on) - tcpha->tha_flags |= TH_ECE; - - tcp->tcp_rack = tcp->tcp_rnxt; - tcp->tcp_rack_cnt = 0; - - /* fill in timestamp option if in use */ - if (tcp->tcp_snd_ts_ok) { - uint32_t llbolt = (uint32_t)LBOLT_FASTPATH; - - U32_TO_BE32(llbolt, - (char *)tcpha + TCP_MIN_HEADER_LENGTH+4); - U32_TO_BE32(tcp->tcp_ts_recent, - (char *)tcpha + TCP_MIN_HEADER_LENGTH+8); - } - - /* Fill in SACK options */ - if (num_sack_blk > 0) { - uchar_t *wptr = (uchar_t *)tcpha + - connp->conn_ht_ulp_len; - sack_blk_t *tmp; - int32_t i; - - wptr[0] = TCPOPT_NOP; - wptr[1] = TCPOPT_NOP; - wptr[2] = TCPOPT_SACK; - wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk * - sizeof (sack_blk_t); - wptr += TCPOPT_REAL_SACK_LEN; - - tmp = tcp->tcp_sack_list; - for (i = 0; i < num_sack_blk; i++) { - U32_TO_BE32(tmp[i].begin, wptr); - wptr += sizeof (tcp_seq); - U32_TO_BE32(tmp[i].end, wptr); - wptr += sizeof (tcp_seq); - } - tcpha->tha_offset_and_reserved += - ((num_sack_blk * 2 + 1) << 4); - } - - ixa->ixa_pktlen = total_hdr_len; - - if (ixa->ixa_flags & IXAF_IS_IPV4) { - ((ipha_t *)rptr)->ipha_length = htons(total_hdr_len); - } else { - ip6_t *ip6 = (ip6_t *)rptr; - - ip6->ip6_plen = htons(total_hdr_len - IPV6_HDR_LEN); - } - - /* - * Prime pump for checksum calculation in IP. Include the - * adjustment for a source route if any. - */ - data_length = tcp_hdr_len + connp->conn_sum; - data_length = (data_length >> 16) + (data_length & 0xFFFF); - tcpha->tha_sum = htons(data_length); - - if (tcp->tcp_ip_forward_progress) { - tcp->tcp_ip_forward_progress = B_FALSE; - connp->conn_ixa->ixa_flags |= IXAF_REACH_CONF; - } else { - connp->conn_ixa->ixa_flags &= ~IXAF_REACH_CONF; - } - return (mp1); - } -} - -/* - * Hash list insertion routine for tcp_t structures. Each hash bucket - * contains a list of tcp_t entries, and each entry is bound to a unique - * port. If there are multiple tcp_t's that are bound to the same port, then - * one of them will be linked into the hash bucket list, and the rest will - * hang off of that one entry. For each port, entries bound to a specific IP - * address will be inserted before those those bound to INADDR_ANY. - */ -static void -tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock) -{ - tcp_t **tcpp; - tcp_t *tcpnext; - tcp_t *tcphash; - conn_t *connp = tcp->tcp_connp; - conn_t *connext; - - if (tcp->tcp_ptpbhn != NULL) { - ASSERT(!caller_holds_lock); - tcp_bind_hash_remove(tcp); - } - tcpp = &tbf->tf_tcp; - if (!caller_holds_lock) { - mutex_enter(&tbf->tf_lock); - } else { - ASSERT(MUTEX_HELD(&tbf->tf_lock)); - } - tcphash = tcpp[0]; - tcpnext = NULL; - if (tcphash != NULL) { - /* Look for an entry using the same port */ - while ((tcphash = tcpp[0]) != NULL && - connp->conn_lport != tcphash->tcp_connp->conn_lport) - tcpp = &(tcphash->tcp_bind_hash); - - /* The port was not found, just add to the end */ - if (tcphash == NULL) - goto insert; - - /* - * OK, there already exists an entry bound to the - * same port. - * - * If the new tcp bound to the INADDR_ANY address - * and the first one in the list is not bound to - * INADDR_ANY we skip all entries until we find the - * first one bound to INADDR_ANY. - * This makes sure that applications binding to a - * specific address get preference over those binding to - * INADDR_ANY. 
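The insertion code below implements the preference rule just described by splicing list pointers; stated on its own, the rule is simply that a specific-address bind sorts ahead of a wildcard bind for the same port. A minimal sketch of that predicate (IPv4 only, hypothetical helper name, not the kernel's list manipulation):

    #include <netinet/in.h>

    /*
     * Nonzero if an entry bound to address 'a' should appear before an
     * entry bound to address 'b' in a bind-hash chain for the same port,
     * i.e. 'a' is a specific address while 'b' is INADDR_ANY.
     */
    static int
    bind_sorts_before(const struct in_addr *a, const struct in_addr *b)
    {
        return (a->s_addr != htonl(INADDR_ANY) &&
            b->s_addr == htonl(INADDR_ANY));
    }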
- */ - tcpnext = tcphash; - connext = tcpnext->tcp_connp; - tcphash = NULL; - if (V6_OR_V4_INADDR_ANY(connp->conn_bound_addr_v6) && - !V6_OR_V4_INADDR_ANY(connext->conn_bound_addr_v6)) { - while ((tcpnext = tcpp[0]) != NULL) { - connext = tcpnext->tcp_connp; - if (!V6_OR_V4_INADDR_ANY( - connext->conn_bound_addr_v6)) - tcpp = &(tcpnext->tcp_bind_hash_port); - else - break; - } - if (tcpnext != NULL) { - tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port; - tcphash = tcpnext->tcp_bind_hash; - if (tcphash != NULL) { - tcphash->tcp_ptpbhn = - &(tcp->tcp_bind_hash); - tcpnext->tcp_bind_hash = NULL; - } - } - } else { - tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port; - tcphash = tcpnext->tcp_bind_hash; - if (tcphash != NULL) { - tcphash->tcp_ptpbhn = - &(tcp->tcp_bind_hash); - tcpnext->tcp_bind_hash = NULL; - } - } - } -insert: - tcp->tcp_bind_hash_port = tcpnext; - tcp->tcp_bind_hash = tcphash; - tcp->tcp_ptpbhn = tcpp; - tcpp[0] = tcp; - if (!caller_holds_lock) - mutex_exit(&tbf->tf_lock); -} - -/* - * Hash list removal routine for tcp_t structures. - */ -static void -tcp_bind_hash_remove(tcp_t *tcp) -{ - tcp_t *tcpnext; - kmutex_t *lockp; - tcp_stack_t *tcps = tcp->tcp_tcps; - conn_t *connp = tcp->tcp_connp; - - if (tcp->tcp_ptpbhn == NULL) - return; - - /* - * Extract the lock pointer in case there are concurrent - * hash_remove's for this instance. - */ - ASSERT(connp->conn_lport != 0); - lockp = &tcps->tcps_bind_fanout[TCP_BIND_HASH( - connp->conn_lport)].tf_lock; - - ASSERT(lockp != NULL); - mutex_enter(lockp); - if (tcp->tcp_ptpbhn) { - tcpnext = tcp->tcp_bind_hash_port; - if (tcpnext != NULL) { - tcp->tcp_bind_hash_port = NULL; - tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn; - tcpnext->tcp_bind_hash = tcp->tcp_bind_hash; - if (tcpnext->tcp_bind_hash != NULL) { - tcpnext->tcp_bind_hash->tcp_ptpbhn = - &(tcpnext->tcp_bind_hash); - tcp->tcp_bind_hash = NULL; - } - } else if ((tcpnext = tcp->tcp_bind_hash) != NULL) { - tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn; - tcp->tcp_bind_hash = NULL; - } - *tcp->tcp_ptpbhn = tcpnext; - tcp->tcp_ptpbhn = NULL; - } - mutex_exit(lockp); -} - - -/* * Hash list lookup routine for tcp_t structures. * Returns with a CONN_INC_REF tcp structure. Caller must do a CONN_DEC_REF. */ -static tcp_t * +tcp_t * tcp_acceptor_hash_lookup(t_uscalar_t id, tcp_stack_t *tcps) { tf_t *tf; @@ -18432,7 +4105,6 @@ tcp_acceptor_hash_lookup(t_uscalar_t id, tcp_stack_t *tcps) return (NULL); } - /* * Hash list insertion routine for tcp_t structures. */ @@ -18463,7 +4135,7 @@ tcp_acceptor_hash_insert(t_uscalar_t id, tcp_t *tcp) /* * Hash list removal routine for tcp_t structures. */ -static void +void tcp_acceptor_hash_remove(tcp_t *tcp) { tcp_t *tcpnext; @@ -18611,81 +4283,6 @@ tcp_random(void) return (i); } -static int -tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp, int *do_disconnectp, - int *t_errorp, int *sys_errorp) -{ - int error; - int is_absreq_failure; - t_scalar_t *opt_lenp; - t_scalar_t opt_offset; - int prim_type; - struct T_conn_req *tcreqp; - struct T_conn_res *tcresp; - cred_t *cr; - - /* - * All Solaris components should pass a db_credp - * for this TPI message, hence we ASSERT. - * But in case there is some other M_PROTO that looks - * like a TPI message sent by some other kernel - * component, we check and return an error. 
- */ - cr = msg_getcred(mp, NULL); - ASSERT(cr != NULL); - if (cr == NULL) - return (-1); - - prim_type = ((union T_primitives *)mp->b_rptr)->type; - ASSERT(prim_type == T_CONN_REQ || prim_type == O_T_CONN_RES || - prim_type == T_CONN_RES); - - switch (prim_type) { - case T_CONN_REQ: - tcreqp = (struct T_conn_req *)mp->b_rptr; - opt_offset = tcreqp->OPT_offset; - opt_lenp = (t_scalar_t *)&tcreqp->OPT_length; - break; - case O_T_CONN_RES: - case T_CONN_RES: - tcresp = (struct T_conn_res *)mp->b_rptr; - opt_offset = tcresp->OPT_offset; - opt_lenp = (t_scalar_t *)&tcresp->OPT_length; - break; - } - - *t_errorp = 0; - *sys_errorp = 0; - *do_disconnectp = 0; - - error = tpi_optcom_buf(tcp->tcp_connp->conn_wq, mp, opt_lenp, - opt_offset, cr, &tcp_opt_obj, - NULL, &is_absreq_failure); - - switch (error) { - case 0: /* no error */ - ASSERT(is_absreq_failure == 0); - return (0); - case ENOPROTOOPT: - *t_errorp = TBADOPT; - break; - case EACCES: - *t_errorp = TACCES; - break; - default: - *t_errorp = TSYSERR; *sys_errorp = error; - break; - } - if (is_absreq_failure != 0) { - /* - * The connection request should get the local ack - * T_OK_ACK and then a T_DISCON_IND. - */ - *do_disconnectp = 1; - } - return (-1); -} - /* * Split this function out so that if the secret changes, I'm okay. * @@ -18805,6 +4402,10 @@ tcp_ddi_g_init(void) * set of tcp_stack_t's. */ netstack_register(NS_TCP, tcp_stack_init, NULL, tcp_stack_fini); + + mutex_enter(&cpu_lock); + register_cpu_setup_func(tcp_cpu_update, NULL); + mutex_exit(&cpu_lock); } @@ -18880,8 +4481,8 @@ tcp_stack_init(netstackid_t stackid, netstack_t *ns) tcp_iss_key_init((uint8_t *)&tcp_g_t_info_ack, sizeof (tcp_g_t_info_ack), tcps); - tcps->tcps_kstat = tcp_kstat2_init(stackid, &tcps->tcps_statistics); - tcps->tcps_mibkp = tcp_kstat_init(stackid, tcps); + tcps->tcps_kstat = tcp_kstat2_init(stackid); + tcps->tcps_mibkp = tcp_kstat_init(stackid); major = mod_name_to_major(INET_NAME); error = ldi_ident_from_major(major, &tcps->tcps_ldi_ident); @@ -18894,7 +4495,26 @@ tcp_stack_init(netstackid_t stackid, netstack_t *ns) mutex_init(&tcps->tcps_reclaim_lock, NULL, MUTEX_DEFAULT, NULL); tcps->tcps_reclaim = B_FALSE; tcps->tcps_reclaim_tid = 0; - tcps->tcps_reclaim_period = tcps->tcps_rexmit_interval_max * 3; + tcps->tcps_reclaim_period = tcps->tcps_rexmit_interval_max; + + /* + * ncpus is the current number of CPUs, which can be bigger than + * boot_ncpus. But we don't want to use ncpus to allocate all the + * tcp_stats_cpu_t at system boot up time since it will be 1. While + * we handle adding CPU in tcp_cpu_update(), it will be slow if + * there are many CPUs as we will be adding them 1 by 1. + * + * Note that tcps_sc_cnt never decreases and the tcps_sc[x] pointers + * are not freed until the stack is going away. So there is no need + * to grab a lock to access the per CPU tcps_sc[x] pointer. 
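The same lock-free pattern can be shown in miniature: as long as a per-CPU slot is allocated once and never freed while the stack exists, each CPU may bump its own slot without synchronization and a reader simply sums across the slots. The sketch below uses generic names (MY_MAX_CPUS, percpu_stat_t) rather than the real max_ncpus and tcp_stats_cpu_t definitions.

    #include <stddef.h>
    #include <stdint.h>

    #define MY_MAX_CPUS     64              /* stands in for max_ncpus */

    typedef struct percpu_stat {
        uint64_t connects;                  /* one example counter */
    } percpu_stat_t;

    /* One slot per possible CPU; slots are allocated once, never freed. */
    static percpu_stat_t *stat_slot[MY_MAX_CPUS];

    /* Writer: a CPU touches only its own slot, so no lock is needed. */
    static void
    stat_bump(unsigned int cpu)
    {
        stat_slot[cpu]->connects++;
    }

    /*
     * Reader: sum every allocated slot; a plain NULL check suffices
     * because slots never disappear underneath the reader.
     */
    static uint64_t
    stat_total(void)
    {
        uint64_t sum = 0;
        unsigned int i;

        for (i = 0; i < MY_MAX_CPUS; i++) {
            if (stat_slot[i] != NULL)
                sum += stat_slot[i]->connects;
        }
        return (sum);
    }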
+ */ + tcps->tcps_sc_cnt = MAX(ncpus, boot_ncpus); + tcps->tcps_sc = kmem_zalloc(max_ncpus * sizeof (tcp_stats_cpu_t *), + KM_SLEEP); + for (i = 0; i < tcps->tcps_sc_cnt; i++) { + tcps->tcps_sc[i] = kmem_zalloc(sizeof (tcp_stats_cpu_t), + KM_SLEEP); + } mutex_init(&tcps->tcps_listener_conf_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&tcps->tcps_listener_conf, sizeof (tcp_listener_t), @@ -18909,6 +4529,10 @@ tcp_stack_init(netstackid_t stackid, netstack_t *ns) void tcp_ddi_g_destroy(void) { + mutex_enter(&cpu_lock); + unregister_cpu_setup_func(tcp_cpu_update, NULL); + mutex_exit(&cpu_lock); + tcp_g_kstat_fini(tcp_g_kstat); tcp_g_kstat = NULL; bzero(&tcp_g_statistics, sizeof (tcp_g_statistics)); @@ -18935,12 +4559,23 @@ tcp_stack_fini(netstackid_t stackid, void *arg) cv_destroy(&tcps->tcps_ixa_cleanup_cv); mutex_destroy(&tcps->tcps_ixa_cleanup_lock); + /* + * Set tcps_reclaim to false tells tcp_reclaim_timer() not to restart + * the timer. + */ + mutex_enter(&tcps->tcps_reclaim_lock); + tcps->tcps_reclaim = B_FALSE; + mutex_exit(&tcps->tcps_reclaim_lock); if (tcps->tcps_reclaim_tid != 0) (void) untimeout(tcps->tcps_reclaim_tid); mutex_destroy(&tcps->tcps_reclaim_lock); tcp_listener_conf_cleanup(tcps); + for (i = 0; i < tcps->tcps_sc_cnt; i++) + kmem_free(tcps->tcps_sc[i], sizeof (tcp_stats_cpu_t)); + kmem_free(tcps->tcps_sc, max_ncpus * sizeof (tcp_stats_cpu_t *)); + nd_free(&tcps->tcps_g_nd); kmem_free(tcps->tcps_params, sizeof (lcl_tcp_param_arr)); tcps->tcps_params = NULL; @@ -18971,7 +4606,6 @@ tcp_stack_fini(netstackid_t stackid, void *arg) tcp_kstat2_fini(stackid, tcps->tcps_kstat); tcps->tcps_kstat = NULL; - bzero(&tcps->tcps_statistics, sizeof (tcps->tcps_statistics)); tcp_kstat_fini(stackid, tcps->tcps_mibkp); tcps->tcps_mibkp = NULL; @@ -19029,968 +4663,6 @@ tcp_iss_init(tcp_t *tcp) } /* - * Exported routine for extracting active tcp connection status. - * - * This is used by the Solaris Cluster Networking software to - * gather a list of connections that need to be forwarded to - * specific nodes in the cluster when configuration changes occur. - * - * The callback is invoked for each tcp_t structure from all netstacks, - * if 'stack_id' is less than 0. Otherwise, only for tcp_t structures - * from the netstack with the specified stack_id. Returning - * non-zero from the callback routine terminates the search. 
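As a usage illustration of this interface, a cluster-side caller might count ESTABLISHED connections across every netstack with a small callback. The callback and its caller below are hypothetical, but they use only the cl_tcp_walk_list() signature and the cl_tcp_info_t fields shown in the code that follows.

    /* Illustrative callback: count connections in the ESTABLISHED state. */
    static int
    count_established(cl_tcp_info_t *infop, void *arg)
    {
        uint_t *countp = arg;

        if (infop->cl_tcpi_state == TCPS_ESTABLISHED)
            (*countp)++;
        return (0);             /* zero keeps the walk going */
    }

    static uint_t
    established_in_all_stacks(void)
    {
        uint_t count = 0;

        /* A negative stack_id walks the tcp_t's of every netstack. */
        (void) cl_tcp_walk_list(-1, count_established, &count);
        return (count);
    }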
- */ -int -cl_tcp_walk_list(netstackid_t stack_id, - int (*cl_callback)(cl_tcp_info_t *, void *), void *arg) -{ - netstack_handle_t nh; - netstack_t *ns; - int ret = 0; - - if (stack_id >= 0) { - if ((ns = netstack_find_by_stackid(stack_id)) == NULL) - return (EINVAL); - - ret = cl_tcp_walk_list_stack(cl_callback, arg, - ns->netstack_tcp); - netstack_rele(ns); - return (ret); - } - - netstack_next_init(&nh); - while ((ns = netstack_next(&nh)) != NULL) { - ret = cl_tcp_walk_list_stack(cl_callback, arg, - ns->netstack_tcp); - netstack_rele(ns); - } - netstack_next_fini(&nh); - return (ret); -} - -static int -cl_tcp_walk_list_stack(int (*callback)(cl_tcp_info_t *, void *), void *arg, - tcp_stack_t *tcps) -{ - tcp_t *tcp; - cl_tcp_info_t cl_tcpi; - connf_t *connfp; - conn_t *connp; - int i; - ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; - - ASSERT(callback != NULL); - - for (i = 0; i < CONN_G_HASH_SIZE; i++) { - connfp = &ipst->ips_ipcl_globalhash_fanout[i]; - connp = NULL; - - while ((connp = - ipcl_get_next_conn(connfp, connp, IPCL_TCPCONN)) != NULL) { - - tcp = connp->conn_tcp; - cl_tcpi.cl_tcpi_version = CL_TCPI_V1; - cl_tcpi.cl_tcpi_ipversion = connp->conn_ipversion; - cl_tcpi.cl_tcpi_state = tcp->tcp_state; - cl_tcpi.cl_tcpi_lport = connp->conn_lport; - cl_tcpi.cl_tcpi_fport = connp->conn_fport; - cl_tcpi.cl_tcpi_laddr_v6 = connp->conn_laddr_v6; - cl_tcpi.cl_tcpi_faddr_v6 = connp->conn_faddr_v6; - - /* - * If the callback returns non-zero - * we terminate the traversal. - */ - if ((*callback)(&cl_tcpi, arg) != 0) { - CONN_DEC_REF(tcp->tcp_connp); - return (1); - } - } - } - - return (0); -} - -/* - * Macros used for accessing the different types of sockaddr - * structures inside a tcp_ioc_abort_conn_t. - */ -#define TCP_AC_V4LADDR(acp) ((sin_t *)&(acp)->ac_local) -#define TCP_AC_V4RADDR(acp) ((sin_t *)&(acp)->ac_remote) -#define TCP_AC_V4LOCAL(acp) (TCP_AC_V4LADDR(acp)->sin_addr.s_addr) -#define TCP_AC_V4REMOTE(acp) (TCP_AC_V4RADDR(acp)->sin_addr.s_addr) -#define TCP_AC_V4LPORT(acp) (TCP_AC_V4LADDR(acp)->sin_port) -#define TCP_AC_V4RPORT(acp) (TCP_AC_V4RADDR(acp)->sin_port) -#define TCP_AC_V6LADDR(acp) ((sin6_t *)&(acp)->ac_local) -#define TCP_AC_V6RADDR(acp) ((sin6_t *)&(acp)->ac_remote) -#define TCP_AC_V6LOCAL(acp) (TCP_AC_V6LADDR(acp)->sin6_addr) -#define TCP_AC_V6REMOTE(acp) (TCP_AC_V6RADDR(acp)->sin6_addr) -#define TCP_AC_V6LPORT(acp) (TCP_AC_V6LADDR(acp)->sin6_port) -#define TCP_AC_V6RPORT(acp) (TCP_AC_V6RADDR(acp)->sin6_port) - -/* - * Return the correct error code to mimic the behavior - * of a connection reset. - */ -#define TCP_AC_GET_ERRCODE(state, err) { \ - switch ((state)) { \ - case TCPS_SYN_SENT: \ - case TCPS_SYN_RCVD: \ - (err) = ECONNREFUSED; \ - break; \ - case TCPS_ESTABLISHED: \ - case TCPS_FIN_WAIT_1: \ - case TCPS_FIN_WAIT_2: \ - case TCPS_CLOSE_WAIT: \ - (err) = ECONNRESET; \ - break; \ - case TCPS_CLOSING: \ - case TCPS_LAST_ACK: \ - case TCPS_TIME_WAIT: \ - (err) = 0; \ - break; \ - default: \ - (err) = ENXIO; \ - } \ - } - -/* - * Check if a tcp structure matches the info in acp. - */ -#define TCP_AC_ADDR_MATCH(acp, connp, tcp) \ - (((acp)->ac_local.ss_family == AF_INET) ? 
\ - ((TCP_AC_V4LOCAL((acp)) == INADDR_ANY || \ - TCP_AC_V4LOCAL((acp)) == (connp)->conn_laddr_v4) && \ - (TCP_AC_V4REMOTE((acp)) == INADDR_ANY || \ - TCP_AC_V4REMOTE((acp)) == (connp)->conn_faddr_v4) && \ - (TCP_AC_V4LPORT((acp)) == 0 || \ - TCP_AC_V4LPORT((acp)) == (connp)->conn_lport) && \ - (TCP_AC_V4RPORT((acp)) == 0 || \ - TCP_AC_V4RPORT((acp)) == (connp)->conn_fport) && \ - (acp)->ac_start <= (tcp)->tcp_state && \ - (acp)->ac_end >= (tcp)->tcp_state) : \ - ((IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6LOCAL((acp))) || \ - IN6_ARE_ADDR_EQUAL(&TCP_AC_V6LOCAL((acp)), \ - &(connp)->conn_laddr_v6)) && \ - (IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6REMOTE((acp))) || \ - IN6_ARE_ADDR_EQUAL(&TCP_AC_V6REMOTE((acp)), \ - &(connp)->conn_faddr_v6)) && \ - (TCP_AC_V6LPORT((acp)) == 0 || \ - TCP_AC_V6LPORT((acp)) == (connp)->conn_lport) && \ - (TCP_AC_V6RPORT((acp)) == 0 || \ - TCP_AC_V6RPORT((acp)) == (connp)->conn_fport) && \ - (acp)->ac_start <= (tcp)->tcp_state && \ - (acp)->ac_end >= (tcp)->tcp_state)) - -#define TCP_AC_MATCH(acp, connp, tcp) \ - (((acp)->ac_zoneid == ALL_ZONES || \ - (acp)->ac_zoneid == (connp)->conn_zoneid) ? \ - TCP_AC_ADDR_MATCH(acp, connp, tcp) : 0) - -/* - * Build a message containing a tcp_ioc_abort_conn_t structure - * which is filled in with information from acp and tp. - */ -static mblk_t * -tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *acp, tcp_t *tp) -{ - mblk_t *mp; - tcp_ioc_abort_conn_t *tacp; - - mp = allocb(sizeof (uint32_t) + sizeof (*acp), BPRI_LO); - if (mp == NULL) - return (NULL); - - *((uint32_t *)mp->b_rptr) = TCP_IOC_ABORT_CONN; - tacp = (tcp_ioc_abort_conn_t *)((uchar_t *)mp->b_rptr + - sizeof (uint32_t)); - - tacp->ac_start = acp->ac_start; - tacp->ac_end = acp->ac_end; - tacp->ac_zoneid = acp->ac_zoneid; - - if (acp->ac_local.ss_family == AF_INET) { - tacp->ac_local.ss_family = AF_INET; - tacp->ac_remote.ss_family = AF_INET; - TCP_AC_V4LOCAL(tacp) = tp->tcp_connp->conn_laddr_v4; - TCP_AC_V4REMOTE(tacp) = tp->tcp_connp->conn_faddr_v4; - TCP_AC_V4LPORT(tacp) = tp->tcp_connp->conn_lport; - TCP_AC_V4RPORT(tacp) = tp->tcp_connp->conn_fport; - } else { - tacp->ac_local.ss_family = AF_INET6; - tacp->ac_remote.ss_family = AF_INET6; - TCP_AC_V6LOCAL(tacp) = tp->tcp_connp->conn_laddr_v6; - TCP_AC_V6REMOTE(tacp) = tp->tcp_connp->conn_faddr_v6; - TCP_AC_V6LPORT(tacp) = tp->tcp_connp->conn_lport; - TCP_AC_V6RPORT(tacp) = tp->tcp_connp->conn_fport; - } - mp->b_wptr = (uchar_t *)mp->b_rptr + sizeof (uint32_t) + sizeof (*acp); - return (mp); -} - -/* - * Print a tcp_ioc_abort_conn_t structure. - */ -static void -tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *acp) -{ - char lbuf[128]; - char rbuf[128]; - sa_family_t af; - in_port_t lport, rport; - ushort_t logflags; - - af = acp->ac_local.ss_family; - - if (af == AF_INET) { - (void) inet_ntop(af, (const void *)&TCP_AC_V4LOCAL(acp), - lbuf, 128); - (void) inet_ntop(af, (const void *)&TCP_AC_V4REMOTE(acp), - rbuf, 128); - lport = ntohs(TCP_AC_V4LPORT(acp)); - rport = ntohs(TCP_AC_V4RPORT(acp)); - } else { - (void) inet_ntop(af, (const void *)&TCP_AC_V6LOCAL(acp), - lbuf, 128); - (void) inet_ntop(af, (const void *)&TCP_AC_V6REMOTE(acp), - rbuf, 128); - lport = ntohs(TCP_AC_V6LPORT(acp)); - rport = ntohs(TCP_AC_V6RPORT(acp)); - } - - logflags = SL_TRACE | SL_NOTE; - /* - * Don't print this message to the console if the operation was done - * to a non-global zone. 
- */ - if (acp->ac_zoneid == GLOBAL_ZONEID || acp->ac_zoneid == ALL_ZONES) - logflags |= SL_CONSOLE; - (void) strlog(TCP_MOD_ID, 0, 1, logflags, - "TCP_IOC_ABORT_CONN: local = %s:%d, remote = %s:%d, " - "start = %d, end = %d\n", lbuf, lport, rbuf, rport, - acp->ac_start, acp->ac_end); -} - -/* - * Called using SQ_FILL when a message built using - * tcp_ioctl_abort_build_msg is put into a queue. - * Note that when we get here there is no wildcard in acp any more. - */ -/* ARGSUSED2 */ -static void -tcp_ioctl_abort_handler(void *arg, mblk_t *mp, void *arg2, - ip_recv_attr_t *dummy) -{ - conn_t *connp = (conn_t *)arg; - tcp_t *tcp = connp->conn_tcp; - tcp_ioc_abort_conn_t *acp; - - /* - * Don't accept any input on a closed tcp as this TCP logically does - * not exist on the system. Don't proceed further with this TCP. - * For eg. this packet could trigger another close of this tcp - * which would be disastrous for tcp_refcnt. tcp_close_detached / - * tcp_clean_death / tcp_closei_local must be called at most once - * on a TCP. - */ - if (tcp->tcp_state == TCPS_CLOSED || - tcp->tcp_state == TCPS_BOUND) { - freemsg(mp); - return; - } - - acp = (tcp_ioc_abort_conn_t *)(mp->b_rptr + sizeof (uint32_t)); - if (tcp->tcp_state <= acp->ac_end) { - /* - * If we get here, we are already on the correct - * squeue. This ioctl follows the following path - * tcp_wput -> tcp_wput_ioctl -> tcp_ioctl_abort_conn - * ->tcp_ioctl_abort->squeue_enter (if on a - * different squeue) - */ - int errcode; - - TCP_AC_GET_ERRCODE(tcp->tcp_state, errcode); - (void) tcp_clean_death(tcp, errcode, 26); - } - freemsg(mp); -} - -/* - * Abort all matching connections on a hash chain. - */ -static int -tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *acp, int index, int *count, - boolean_t exact, tcp_stack_t *tcps) -{ - int nmatch, err = 0; - tcp_t *tcp; - MBLKP mp, last, listhead = NULL; - conn_t *tconnp; - connf_t *connfp; - ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; - - connfp = &ipst->ips_ipcl_conn_fanout[index]; - -startover: - nmatch = 0; - - mutex_enter(&connfp->connf_lock); - for (tconnp = connfp->connf_head; tconnp != NULL; - tconnp = tconnp->conn_next) { - tcp = tconnp->conn_tcp; - /* - * We are missing a check on sin6_scope_id for linklocals here, - * but current usage is just for aborting based on zoneid - * for shared-IP zones. - */ - if (TCP_AC_MATCH(acp, tconnp, tcp)) { - CONN_INC_REF(tconnp); - mp = tcp_ioctl_abort_build_msg(acp, tcp); - if (mp == NULL) { - err = ENOMEM; - CONN_DEC_REF(tconnp); - break; - } - mp->b_prev = (mblk_t *)tcp; - - if (listhead == NULL) { - listhead = mp; - last = mp; - } else { - last->b_next = mp; - last = mp; - } - nmatch++; - if (exact) - break; - } - - /* Avoid holding lock for too long. */ - if (nmatch >= 500) - break; - } - mutex_exit(&connfp->connf_lock); - - /* Pass mp into the correct tcp */ - while ((mp = listhead) != NULL) { - listhead = listhead->b_next; - tcp = (tcp_t *)mp->b_prev; - mp->b_next = mp->b_prev = NULL; - SQUEUE_ENTER_ONE(tcp->tcp_connp->conn_sqp, mp, - tcp_ioctl_abort_handler, tcp->tcp_connp, NULL, - SQ_FILL, SQTAG_TCP_ABORT_BUCKET); - } - - *count += nmatch; - if (nmatch >= 500 && err == 0) - goto startover; - return (err); -} - -/* - * Abort all connections that matches the attributes specified in acp. 
- */ -static int -tcp_ioctl_abort(tcp_ioc_abort_conn_t *acp, tcp_stack_t *tcps) -{ - sa_family_t af; - uint32_t ports; - uint16_t *pports; - int err = 0, count = 0; - boolean_t exact = B_FALSE; /* set when there is no wildcard */ - int index = -1; - ushort_t logflags; - ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; - - af = acp->ac_local.ss_family; - - if (af == AF_INET) { - if (TCP_AC_V4REMOTE(acp) != INADDR_ANY && - TCP_AC_V4LPORT(acp) != 0 && TCP_AC_V4RPORT(acp) != 0) { - pports = (uint16_t *)&ports; - pports[1] = TCP_AC_V4LPORT(acp); - pports[0] = TCP_AC_V4RPORT(acp); - exact = (TCP_AC_V4LOCAL(acp) != INADDR_ANY); - } - } else { - if (!IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6REMOTE(acp)) && - TCP_AC_V6LPORT(acp) != 0 && TCP_AC_V6RPORT(acp) != 0) { - pports = (uint16_t *)&ports; - pports[1] = TCP_AC_V6LPORT(acp); - pports[0] = TCP_AC_V6RPORT(acp); - exact = !IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6LOCAL(acp)); - } - } - - /* - * For cases where remote addr, local port, and remote port are non- - * wildcards, tcp_ioctl_abort_bucket will only be called once. - */ - if (index != -1) { - err = tcp_ioctl_abort_bucket(acp, index, - &count, exact, tcps); - } else { - /* - * loop through all entries for wildcard case - */ - for (index = 0; - index < ipst->ips_ipcl_conn_fanout_size; - index++) { - err = tcp_ioctl_abort_bucket(acp, index, - &count, exact, tcps); - if (err != 0) - break; - } - } - - logflags = SL_TRACE | SL_NOTE; - /* - * Don't print this message to the console if the operation was done - * to a non-global zone. - */ - if (acp->ac_zoneid == GLOBAL_ZONEID || acp->ac_zoneid == ALL_ZONES) - logflags |= SL_CONSOLE; - (void) strlog(TCP_MOD_ID, 0, 1, logflags, "TCP_IOC_ABORT_CONN: " - "aborted %d connection%c\n", count, ((count > 1) ? 's' : ' ')); - if (err == 0 && count == 0) - err = ENOENT; - return (err); -} - -/* - * Process the TCP_IOC_ABORT_CONN ioctl request. - */ -static void -tcp_ioctl_abort_conn(queue_t *q, mblk_t *mp) -{ - int err; - IOCP iocp; - MBLKP mp1; - sa_family_t laf, raf; - tcp_ioc_abort_conn_t *acp; - zone_t *zptr; - conn_t *connp = Q_TO_CONN(q); - zoneid_t zoneid = connp->conn_zoneid; - tcp_t *tcp = connp->conn_tcp; - tcp_stack_t *tcps = tcp->tcp_tcps; - - iocp = (IOCP)mp->b_rptr; - - if ((mp1 = mp->b_cont) == NULL || - iocp->ioc_count != sizeof (tcp_ioc_abort_conn_t)) { - err = EINVAL; - goto out; - } - - /* check permissions */ - if (secpolicy_ip_config(iocp->ioc_cr, B_FALSE) != 0) { - err = EPERM; - goto out; - } - - if (mp1->b_cont != NULL) { - freemsg(mp1->b_cont); - mp1->b_cont = NULL; - } - - acp = (tcp_ioc_abort_conn_t *)mp1->b_rptr; - laf = acp->ac_local.ss_family; - raf = acp->ac_remote.ss_family; - - /* check that a zone with the supplied zoneid exists */ - if (acp->ac_zoneid != GLOBAL_ZONEID && acp->ac_zoneid != ALL_ZONES) { - zptr = zone_find_by_id(zoneid); - if (zptr != NULL) { - zone_rele(zptr); - } else { - err = EINVAL; - goto out; - } - } - - /* - * For exclusive stacks we set the zoneid to zero - * to make TCP operate as if in the global zone. 
- */ - if (tcps->tcps_netstack->netstack_stackid != GLOBAL_NETSTACKID) - acp->ac_zoneid = GLOBAL_ZONEID; - - if (acp->ac_start < TCPS_SYN_SENT || acp->ac_end > TCPS_TIME_WAIT || - acp->ac_start > acp->ac_end || laf != raf || - (laf != AF_INET && laf != AF_INET6)) { - err = EINVAL; - goto out; - } - - tcp_ioctl_abort_dump(acp); - err = tcp_ioctl_abort(acp, tcps); - -out: - if (mp1 != NULL) { - freemsg(mp1); - mp->b_cont = NULL; - } - - if (err != 0) - miocnak(q, mp, 0, err); - else - miocack(q, mp, 0, 0); -} - -/* - * tcp_time_wait_processing() handles processing of incoming packets when - * the tcp is in the TIME_WAIT state. - * A TIME_WAIT tcp that has an associated open TCP stream is never put - * on the time wait list. - */ -void -tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq, - uint32_t seg_ack, int seg_len, tcpha_t *tcpha, ip_recv_attr_t *ira) -{ - int32_t bytes_acked; - int32_t gap; - int32_t rgap; - tcp_opt_t tcpopt; - uint_t flags; - uint32_t new_swnd = 0; - conn_t *nconnp; - conn_t *connp = tcp->tcp_connp; - tcp_stack_t *tcps = tcp->tcp_tcps; - - BUMP_LOCAL(tcp->tcp_ibsegs); - DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp); - - flags = (unsigned int)tcpha->tha_flags & 0xFF; - new_swnd = ntohs(tcpha->tha_win) << - ((tcpha->tha_flags & TH_SYN) ? 0 : tcp->tcp_snd_ws); - if (tcp->tcp_snd_ts_ok) { - if (!tcp_paws_check(tcp, tcpha, &tcpopt)) { - tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, - tcp->tcp_rnxt, TH_ACK); - goto done; - } - } - gap = seg_seq - tcp->tcp_rnxt; - rgap = tcp->tcp_rwnd - (gap + seg_len); - if (gap < 0) { - BUMP_MIB(&tcps->tcps_mib, tcpInDataDupSegs); - UPDATE_MIB(&tcps->tcps_mib, tcpInDataDupBytes, - (seg_len > -gap ? -gap : seg_len)); - seg_len += gap; - if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) { - if (flags & TH_RST) { - goto done; - } - if ((flags & TH_FIN) && seg_len == -1) { - /* - * When TCP receives a duplicate FIN in - * TIME_WAIT state, restart the 2 MSL timer. - * See page 73 in RFC 793. Make sure this TCP - * is already on the TIME_WAIT list. If not, - * just restart the timer. - */ - if (TCP_IS_DETACHED(tcp)) { - if (tcp_time_wait_remove(tcp, NULL) == - B_TRUE) { - tcp_time_wait_append(tcp); - TCP_DBGSTAT(tcps, - tcp_rput_time_wait); - } - } else { - ASSERT(tcp != NULL); - TCP_TIMER_RESTART(tcp, - tcps->tcps_time_wait_interval); - } - tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, - tcp->tcp_rnxt, TH_ACK); - goto done; - } - flags |= TH_ACK_NEEDED; - seg_len = 0; - goto process_ack; - } - - /* Fix seg_seq, and chew the gap off the front. */ - seg_seq = tcp->tcp_rnxt; - } - - if ((flags & TH_SYN) && gap > 0 && rgap < 0) { - /* - * Make sure that when we accept the connection, pick - * an ISS greater than (tcp_snxt + ISS_INCR/2) for the - * old connection. - * - * The next ISS generated is equal to tcp_iss_incr_extra - * + ISS_INCR/2 + other components depending on the - * value of tcp_strong_iss. We pre-calculate the new - * ISS here and compare with tcp_snxt to determine if - * we need to make adjustment to tcp_iss_incr_extra. - * - * The above calculation is ugly and is a - * waste of CPU cycles... - */ - uint32_t new_iss = tcps->tcps_iss_incr_extra; - int32_t adj; - ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; - - switch (tcps->tcps_strong_iss) { - case 2: { - /* Add time and MD5 components. 
*/ - uint32_t answer[4]; - struct { - uint32_t ports; - in6_addr_t src; - in6_addr_t dst; - } arg; - MD5_CTX context; - - mutex_enter(&tcps->tcps_iss_key_lock); - context = tcps->tcps_iss_key; - mutex_exit(&tcps->tcps_iss_key_lock); - arg.ports = connp->conn_ports; - /* We use MAPPED addresses in tcp_iss_init */ - arg.src = connp->conn_laddr_v6; - arg.dst = connp->conn_faddr_v6; - MD5Update(&context, (uchar_t *)&arg, - sizeof (arg)); - MD5Final((uchar_t *)answer, &context); - answer[0] ^= answer[1] ^ answer[2] ^ answer[3]; - new_iss += (gethrtime() >> ISS_NSEC_SHT) + answer[0]; - break; - } - case 1: - /* Add time component and min random (i.e. 1). */ - new_iss += (gethrtime() >> ISS_NSEC_SHT) + 1; - break; - default: - /* Add only time component. */ - new_iss += (uint32_t)gethrestime_sec() * ISS_INCR; - break; - } - if ((adj = (int32_t)(tcp->tcp_snxt - new_iss)) > 0) { - /* - * New ISS not guaranteed to be ISS_INCR/2 - * ahead of the current tcp_snxt, so add the - * difference to tcp_iss_incr_extra. - */ - tcps->tcps_iss_incr_extra += adj; - } - /* - * If tcp_clean_death() can not perform the task now, - * drop the SYN packet and let the other side re-xmit. - * Otherwise pass the SYN packet back in, since the - * old tcp state has been cleaned up or freed. - */ - if (tcp_clean_death(tcp, 0, 27) == -1) - goto done; - nconnp = ipcl_classify(mp, ira, ipst); - if (nconnp != NULL) { - TCP_STAT(tcps, tcp_time_wait_syn_success); - /* Drops ref on nconnp */ - tcp_reinput(nconnp, mp, ira, ipst); - return; - } - goto done; - } - - /* - * rgap is the amount of stuff received out of window. A negative - * value is the amount out of window. - */ - if (rgap < 0) { - BUMP_MIB(&tcps->tcps_mib, tcpInDataPastWinSegs); - UPDATE_MIB(&tcps->tcps_mib, tcpInDataPastWinBytes, -rgap); - /* Fix seg_len and make sure there is something left. */ - seg_len += rgap; - if (seg_len <= 0) { - if (flags & TH_RST) { - goto done; - } - flags |= TH_ACK_NEEDED; - seg_len = 0; - goto process_ack; - } - } - /* - * Check whether we can update tcp_ts_recent. This test is - * NOT the one in RFC 1323 3.4. It is from Braden, 1993, "TCP - * Extensions for High Performance: An Update", Internet Draft. - */ - if (tcp->tcp_snd_ts_ok && - TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) && - SEQ_LEQ(seg_seq, tcp->tcp_rack)) { - tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val; - tcp->tcp_last_rcv_lbolt = ddi_get_lbolt64(); - } - - if (seg_seq != tcp->tcp_rnxt && seg_len > 0) { - /* Always ack out of order packets */ - flags |= TH_ACK_NEEDED; - seg_len = 0; - } else if (seg_len > 0) { - BUMP_MIB(&tcps->tcps_mib, tcpInClosed); - BUMP_MIB(&tcps->tcps_mib, tcpInDataInorderSegs); - UPDATE_MIB(&tcps->tcps_mib, tcpInDataInorderBytes, seg_len); - } - if (flags & TH_RST) { - (void) tcp_clean_death(tcp, 0, 28); - goto done; - } - if (flags & TH_SYN) { - tcp_xmit_ctl("TH_SYN", tcp, seg_ack, seg_seq + 1, - TH_RST|TH_ACK); - /* - * Do not delete the TCP structure if it is in - * TIME_WAIT state. Refer to RFC 1122, 4.2.2.13. - */ - goto done; - } -process_ack: - if (flags & TH_ACK) { - bytes_acked = (int)(seg_ack - tcp->tcp_suna); - if (bytes_acked <= 0) { - if (bytes_acked == 0 && seg_len == 0 && - new_swnd == tcp->tcp_swnd) - BUMP_MIB(&tcps->tcps_mib, tcpInDupAck); - } else { - /* Acks something not sent */ - flags |= TH_ACK_NEEDED; - } - } - if (flags & TH_ACK_NEEDED) { - /* - * Time to send an ack for some reason. - */ - tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, - tcp->tcp_rnxt, TH_ACK); - } -done: - freemsg(mp); -} - -/* - * TCP Timers Implementation. 
- */ -timeout_id_t -tcp_timeout(conn_t *connp, void (*f)(void *), clock_t tim) -{ - mblk_t *mp; - tcp_timer_t *tcpt; - tcp_t *tcp = connp->conn_tcp; - - ASSERT(connp->conn_sqp != NULL); - - TCP_DBGSTAT(tcp->tcp_tcps, tcp_timeout_calls); - - if (tcp->tcp_timercache == NULL) { - mp = tcp_timermp_alloc(KM_NOSLEEP | KM_PANIC); - } else { - TCP_DBGSTAT(tcp->tcp_tcps, tcp_timeout_cached_alloc); - mp = tcp->tcp_timercache; - tcp->tcp_timercache = mp->b_next; - mp->b_next = NULL; - ASSERT(mp->b_wptr == NULL); - } - - CONN_INC_REF(connp); - tcpt = (tcp_timer_t *)mp->b_rptr; - tcpt->connp = connp; - tcpt->tcpt_proc = f; - /* - * TCP timers are normal timeouts. Plus, they do not require more than - * a 10 millisecond resolution. By choosing a coarser resolution and by - * rounding up the expiration to the next resolution boundary, we can - * batch timers in the callout subsystem to make TCP timers more - * efficient. The roundup also protects short timers from expiring too - * early before they have a chance to be cancelled. - */ - tcpt->tcpt_tid = timeout_generic(CALLOUT_NORMAL, tcp_timer_callback, mp, - TICK_TO_NSEC(tim), CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP); - - return ((timeout_id_t)mp); -} - -static void -tcp_timer_callback(void *arg) -{ - mblk_t *mp = (mblk_t *)arg; - tcp_timer_t *tcpt; - conn_t *connp; - - tcpt = (tcp_timer_t *)mp->b_rptr; - connp = tcpt->connp; - SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_timer_handler, connp, - NULL, SQ_FILL, SQTAG_TCP_TIMER); -} - -/* ARGSUSED */ -static void -tcp_timer_handler(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) -{ - tcp_timer_t *tcpt; - conn_t *connp = (conn_t *)arg; - tcp_t *tcp = connp->conn_tcp; - - tcpt = (tcp_timer_t *)mp->b_rptr; - ASSERT(connp == tcpt->connp); - ASSERT((squeue_t *)arg2 == connp->conn_sqp); - - /* - * If the TCP has reached the closed state, don't proceed any - * further. This TCP logically does not exist on the system. - * tcpt_proc could for example access queues, that have already - * been qprocoff'ed off. - */ - if (tcp->tcp_state != TCPS_CLOSED) { - (*tcpt->tcpt_proc)(connp); - } else { - tcp->tcp_timer_tid = 0; - } - tcp_timer_free(connp->conn_tcp, mp); -} - -/* - * There is potential race with untimeout and the handler firing at the same - * time. The mblock may be freed by the handler while we are trying to use - * it. But since both should execute on the same squeue, this race should not - * occur. - */ -clock_t -tcp_timeout_cancel(conn_t *connp, timeout_id_t id) -{ - mblk_t *mp = (mblk_t *)id; - tcp_timer_t *tcpt; - clock_t delta; - - TCP_DBGSTAT(connp->conn_tcp->tcp_tcps, tcp_timeout_cancel_reqs); - - if (mp == NULL) - return (-1); - - tcpt = (tcp_timer_t *)mp->b_rptr; - ASSERT(tcpt->connp == connp); - - delta = untimeout_default(tcpt->tcpt_tid, 0); - - if (delta >= 0) { - TCP_DBGSTAT(connp->conn_tcp->tcp_tcps, tcp_timeout_canceled); - tcp_timer_free(connp->conn_tcp, mp); - CONN_DEC_REF(connp); - } - - return (delta); -} - -/* - * Allocate space for the timer event. The allocation looks like mblk, but it is - * not a proper mblk. To avoid confusion we set b_wptr to NULL. - * - * Dealing with failures: If we can't allocate from the timer cache we try - * allocating from dblock caches using allocb_tryhard(). In this case b_wptr - * points to b_rptr. - * If we can't allocate anything using allocb_tryhard(), we perform a last - * attempt and use kmem_alloc_tryhard(). In this case we set b_wptr to -1 and - * save the actual allocation size in b_datap. 
- */ -mblk_t * -tcp_timermp_alloc(int kmflags) -{ - mblk_t *mp = (mblk_t *)kmem_cache_alloc(tcp_timercache, - kmflags & ~KM_PANIC); - - if (mp != NULL) { - mp->b_next = mp->b_prev = NULL; - mp->b_rptr = (uchar_t *)(&mp[1]); - mp->b_wptr = NULL; - mp->b_datap = NULL; - mp->b_queue = NULL; - mp->b_cont = NULL; - } else if (kmflags & KM_PANIC) { - /* - * Failed to allocate memory for the timer. Try allocating from - * dblock caches. - */ - /* ipclassifier calls this from a constructor - hence no tcps */ - TCP_G_STAT(tcp_timermp_allocfail); - mp = allocb_tryhard(sizeof (tcp_timer_t)); - if (mp == NULL) { - size_t size = 0; - /* - * Memory is really low. Try tryhard allocation. - * - * ipclassifier calls this from a constructor - - * hence no tcps - */ - TCP_G_STAT(tcp_timermp_allocdblfail); - mp = kmem_alloc_tryhard(sizeof (mblk_t) + - sizeof (tcp_timer_t), &size, kmflags); - mp->b_rptr = (uchar_t *)(&mp[1]); - mp->b_next = mp->b_prev = NULL; - mp->b_wptr = (uchar_t *)-1; - mp->b_datap = (dblk_t *)size; - mp->b_queue = NULL; - mp->b_cont = NULL; - } - ASSERT(mp->b_wptr != NULL); - } - /* ipclassifier calls this from a constructor - hence no tcps */ - TCP_G_DBGSTAT(tcp_timermp_alloced); - - return (mp); -} - -/* - * Free per-tcp timer cache. - * It can only contain entries from tcp_timercache. - */ -void -tcp_timermp_free(tcp_t *tcp) -{ - mblk_t *mp; - - while ((mp = tcp->tcp_timercache) != NULL) { - ASSERT(mp->b_wptr == NULL); - tcp->tcp_timercache = tcp->tcp_timercache->b_next; - kmem_cache_free(tcp_timercache, mp); - } -} - -/* - * Free timer event. Put it on the per-tcp timer cache if there is not too many - * events there already (currently at most two events are cached). - * If the event is not allocated from the timer cache, free it right away. - */ -static void -tcp_timer_free(tcp_t *tcp, mblk_t *mp) -{ - mblk_t *mp1 = tcp->tcp_timercache; - - if (mp->b_wptr != NULL) { - /* - * This allocation is not from a timer cache, free it right - * away. - */ - if (mp->b_wptr != (uchar_t *)-1) - freeb(mp); - else - kmem_free(mp, (size_t)mp->b_datap); - } else if (mp1 == NULL || mp1->b_next == NULL) { - /* Cache this timer block for future allocations */ - mp->b_rptr = (uchar_t *)(&mp[1]); - mp->b_next = mp1; - tcp->tcp_timercache = mp; - } else { - kmem_cache_free(tcp_timercache, mp); - TCP_DBGSTAT(tcp->tcp_tcps, tcp_timermp_freed); - } -} - -/* - * End of TCP Timers implementation. - */ - -/* * tcp_{set,clr}qfull() functions are used to either set or clear QFULL * on the specified backing STREAMS q. Note, the caller may make the * decision to call based on the tcp_t.tcp_flow_stopped value which @@ -20020,349 +4692,6 @@ tcp_clrqfull(tcp_t *tcp) conn_clrqfull(connp, &tcp->tcp_flow_stopped); } -/* - * kstats related to squeues i.e. 
not per IP instance - */ -static void * -tcp_g_kstat_init(tcp_g_stat_t *tcp_g_statp) -{ - kstat_t *ksp; - - tcp_g_stat_t template = { - { "tcp_timermp_alloced", KSTAT_DATA_UINT64 }, - { "tcp_timermp_allocfail", KSTAT_DATA_UINT64 }, - { "tcp_timermp_allocdblfail", KSTAT_DATA_UINT64 }, - { "tcp_freelist_cleanup", KSTAT_DATA_UINT64 }, - }; - - ksp = kstat_create(TCP_MOD_NAME, 0, "tcpstat_g", "net", - KSTAT_TYPE_NAMED, sizeof (template) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL); - - if (ksp == NULL) - return (NULL); - - bcopy(&template, tcp_g_statp, sizeof (template)); - ksp->ks_data = (void *)tcp_g_statp; - - kstat_install(ksp); - return (ksp); -} - -static void -tcp_g_kstat_fini(kstat_t *ksp) -{ - if (ksp != NULL) { - kstat_delete(ksp); - } -} - - -static void * -tcp_kstat2_init(netstackid_t stackid, tcp_stat_t *tcps_statisticsp) -{ - kstat_t *ksp; - - tcp_stat_t template = { - { "tcp_time_wait", KSTAT_DATA_UINT64 }, - { "tcp_time_wait_syn", KSTAT_DATA_UINT64 }, - { "tcp_time_wait_syn_success", KSTAT_DATA_UINT64 }, - { "tcp_detach_non_time_wait", KSTAT_DATA_UINT64 }, - { "tcp_detach_time_wait", KSTAT_DATA_UINT64 }, - { "tcp_time_wait_reap", KSTAT_DATA_UINT64 }, - { "tcp_clean_death_nondetached", KSTAT_DATA_UINT64 }, - { "tcp_reinit_calls", KSTAT_DATA_UINT64 }, - { "tcp_eager_err1", KSTAT_DATA_UINT64 }, - { "tcp_eager_err2", KSTAT_DATA_UINT64 }, - { "tcp_eager_blowoff_calls", KSTAT_DATA_UINT64 }, - { "tcp_eager_blowoff_q", KSTAT_DATA_UINT64 }, - { "tcp_eager_blowoff_q0", KSTAT_DATA_UINT64 }, - { "tcp_not_hard_bound", KSTAT_DATA_UINT64 }, - { "tcp_no_listener", KSTAT_DATA_UINT64 }, - { "tcp_found_eager", KSTAT_DATA_UINT64 }, - { "tcp_wrong_queue", KSTAT_DATA_UINT64 }, - { "tcp_found_eager_binding1", KSTAT_DATA_UINT64 }, - { "tcp_found_eager_bound1", KSTAT_DATA_UINT64 }, - { "tcp_eager_has_listener1", KSTAT_DATA_UINT64 }, - { "tcp_open_alloc", KSTAT_DATA_UINT64 }, - { "tcp_open_detached_alloc", KSTAT_DATA_UINT64 }, - { "tcp_rput_time_wait", KSTAT_DATA_UINT64 }, - { "tcp_listendrop", KSTAT_DATA_UINT64 }, - { "tcp_listendropq0", KSTAT_DATA_UINT64 }, - { "tcp_wrong_rq", KSTAT_DATA_UINT64 }, - { "tcp_rsrv_calls", KSTAT_DATA_UINT64 }, - { "tcp_eagerfree2", KSTAT_DATA_UINT64 }, - { "tcp_eagerfree3", KSTAT_DATA_UINT64 }, - { "tcp_eagerfree4", KSTAT_DATA_UINT64 }, - { "tcp_eagerfree5", KSTAT_DATA_UINT64 }, - { "tcp_timewait_syn_fail", KSTAT_DATA_UINT64 }, - { "tcp_listen_badflags", KSTAT_DATA_UINT64 }, - { "tcp_timeout_calls", KSTAT_DATA_UINT64 }, - { "tcp_timeout_cached_alloc", KSTAT_DATA_UINT64 }, - { "tcp_timeout_cancel_reqs", KSTAT_DATA_UINT64 }, - { "tcp_timeout_canceled", KSTAT_DATA_UINT64 }, - { "tcp_timermp_freed", KSTAT_DATA_UINT64 }, - { "tcp_push_timer_cnt", KSTAT_DATA_UINT64 }, - { "tcp_ack_timer_cnt", KSTAT_DATA_UINT64 }, - { "tcp_wsrv_called", KSTAT_DATA_UINT64 }, - { "tcp_flwctl_on", KSTAT_DATA_UINT64 }, - { "tcp_timer_fire_early", KSTAT_DATA_UINT64 }, - { "tcp_timer_fire_miss", KSTAT_DATA_UINT64 }, - { "tcp_rput_v6_error", KSTAT_DATA_UINT64 }, - { "tcp_zcopy_on", KSTAT_DATA_UINT64 }, - { "tcp_zcopy_off", KSTAT_DATA_UINT64 }, - { "tcp_zcopy_backoff", KSTAT_DATA_UINT64 }, - { "tcp_fusion_flowctl", KSTAT_DATA_UINT64 }, - { "tcp_fusion_backenabled", KSTAT_DATA_UINT64 }, - { "tcp_fusion_urg", KSTAT_DATA_UINT64 }, - { "tcp_fusion_putnext", KSTAT_DATA_UINT64 }, - { "tcp_fusion_unfusable", KSTAT_DATA_UINT64 }, - { "tcp_fusion_aborted", KSTAT_DATA_UINT64 }, - { "tcp_fusion_unqualified", KSTAT_DATA_UINT64 }, - { "tcp_fusion_rrw_busy", KSTAT_DATA_UINT64 }, - { "tcp_fusion_rrw_msgcnt", 
KSTAT_DATA_UINT64 }, - { "tcp_fusion_rrw_plugged", KSTAT_DATA_UINT64 }, - { "tcp_in_ack_unsent_drop", KSTAT_DATA_UINT64 }, - { "tcp_sock_fallback", KSTAT_DATA_UINT64 }, - { "tcp_lso_enabled", KSTAT_DATA_UINT64 }, - { "tcp_lso_disabled", KSTAT_DATA_UINT64 }, - { "tcp_lso_times", KSTAT_DATA_UINT64 }, - { "tcp_lso_pkt_out", KSTAT_DATA_UINT64 }, - { "tcp_listen_cnt_drop", KSTAT_DATA_UINT64 }, - { "tcp_listen_mem_drop", KSTAT_DATA_UINT64 }, - { "tcp_zwin_ack_syn", KSTAT_DATA_UINT64 }, - { "tcp_rst_unsent", KSTAT_DATA_UINT64 } - }; - - ksp = kstat_create_netstack(TCP_MOD_NAME, 0, "tcpstat", "net", - KSTAT_TYPE_NAMED, sizeof (template) / sizeof (kstat_named_t), - KSTAT_FLAG_VIRTUAL, stackid); - - if (ksp == NULL) - return (NULL); - - bcopy(&template, tcps_statisticsp, sizeof (template)); - ksp->ks_data = (void *)tcps_statisticsp; - ksp->ks_private = (void *)(uintptr_t)stackid; - - kstat_install(ksp); - return (ksp); -} - -static void -tcp_kstat2_fini(netstackid_t stackid, kstat_t *ksp) -{ - if (ksp != NULL) { - ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private); - kstat_delete_netstack(ksp, stackid); - } -} - -/* - * TCP Kstats implementation - */ -static void * -tcp_kstat_init(netstackid_t stackid, tcp_stack_t *tcps) -{ - kstat_t *ksp; - - tcp_named_kstat_t template = { - { "rtoAlgorithm", KSTAT_DATA_INT32, 0 }, - { "rtoMin", KSTAT_DATA_INT32, 0 }, - { "rtoMax", KSTAT_DATA_INT32, 0 }, - { "maxConn", KSTAT_DATA_INT32, 0 }, - { "activeOpens", KSTAT_DATA_UINT32, 0 }, - { "passiveOpens", KSTAT_DATA_UINT32, 0 }, - { "attemptFails", KSTAT_DATA_UINT32, 0 }, - { "estabResets", KSTAT_DATA_UINT32, 0 }, - { "currEstab", KSTAT_DATA_UINT32, 0 }, - { "inSegs", KSTAT_DATA_UINT64, 0 }, - { "outSegs", KSTAT_DATA_UINT64, 0 }, - { "retransSegs", KSTAT_DATA_UINT32, 0 }, - { "connTableSize", KSTAT_DATA_INT32, 0 }, - { "outRsts", KSTAT_DATA_UINT32, 0 }, - { "outDataSegs", KSTAT_DATA_UINT32, 0 }, - { "outDataBytes", KSTAT_DATA_UINT32, 0 }, - { "retransBytes", KSTAT_DATA_UINT32, 0 }, - { "outAck", KSTAT_DATA_UINT32, 0 }, - { "outAckDelayed", KSTAT_DATA_UINT32, 0 }, - { "outUrg", KSTAT_DATA_UINT32, 0 }, - { "outWinUpdate", KSTAT_DATA_UINT32, 0 }, - { "outWinProbe", KSTAT_DATA_UINT32, 0 }, - { "outControl", KSTAT_DATA_UINT32, 0 }, - { "outFastRetrans", KSTAT_DATA_UINT32, 0 }, - { "inAckSegs", KSTAT_DATA_UINT32, 0 }, - { "inAckBytes", KSTAT_DATA_UINT32, 0 }, - { "inDupAck", KSTAT_DATA_UINT32, 0 }, - { "inAckUnsent", KSTAT_DATA_UINT32, 0 }, - { "inDataInorderSegs", KSTAT_DATA_UINT32, 0 }, - { "inDataInorderBytes", KSTAT_DATA_UINT32, 0 }, - { "inDataUnorderSegs", KSTAT_DATA_UINT32, 0 }, - { "inDataUnorderBytes", KSTAT_DATA_UINT32, 0 }, - { "inDataDupSegs", KSTAT_DATA_UINT32, 0 }, - { "inDataDupBytes", KSTAT_DATA_UINT32, 0 }, - { "inDataPartDupSegs", KSTAT_DATA_UINT32, 0 }, - { "inDataPartDupBytes", KSTAT_DATA_UINT32, 0 }, - { "inDataPastWinSegs", KSTAT_DATA_UINT32, 0 }, - { "inDataPastWinBytes", KSTAT_DATA_UINT32, 0 }, - { "inWinProbe", KSTAT_DATA_UINT32, 0 }, - { "inWinUpdate", KSTAT_DATA_UINT32, 0 }, - { "inClosed", KSTAT_DATA_UINT32, 0 }, - { "rttUpdate", KSTAT_DATA_UINT32, 0 }, - { "rttNoUpdate", KSTAT_DATA_UINT32, 0 }, - { "timRetrans", KSTAT_DATA_UINT32, 0 }, - { "timRetransDrop", KSTAT_DATA_UINT32, 0 }, - { "timKeepalive", KSTAT_DATA_UINT32, 0 }, - { "timKeepaliveProbe", KSTAT_DATA_UINT32, 0 }, - { "timKeepaliveDrop", KSTAT_DATA_UINT32, 0 }, - { "listenDrop", KSTAT_DATA_UINT32, 0 }, - { "listenDropQ0", KSTAT_DATA_UINT32, 0 }, - { "halfOpenDrop", KSTAT_DATA_UINT32, 0 }, - { "outSackRetransSegs", 
KSTAT_DATA_UINT32, 0 }, - { "connTableSize6", KSTAT_DATA_INT32, 0 } - }; - - ksp = kstat_create_netstack(TCP_MOD_NAME, 0, TCP_MOD_NAME, "mib2", - KSTAT_TYPE_NAMED, NUM_OF_FIELDS(tcp_named_kstat_t), 0, stackid); - - if (ksp == NULL) - return (NULL); - - template.rtoAlgorithm.value.ui32 = 4; - template.rtoMin.value.ui32 = tcps->tcps_rexmit_interval_min; - template.rtoMax.value.ui32 = tcps->tcps_rexmit_interval_max; - template.maxConn.value.i32 = -1; - - bcopy(&template, ksp->ks_data, sizeof (template)); - ksp->ks_update = tcp_kstat_update; - ksp->ks_private = (void *)(uintptr_t)stackid; - - kstat_install(ksp); - return (ksp); -} - -static void -tcp_kstat_fini(netstackid_t stackid, kstat_t *ksp) -{ - if (ksp != NULL) { - ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private); - kstat_delete_netstack(ksp, stackid); - } -} - -static int -tcp_kstat_update(kstat_t *kp, int rw) -{ - tcp_named_kstat_t *tcpkp; - tcp_t *tcp; - connf_t *connfp; - conn_t *connp; - int i; - netstackid_t stackid = (netstackid_t)(uintptr_t)kp->ks_private; - netstack_t *ns; - tcp_stack_t *tcps; - ip_stack_t *ipst; - - if ((kp == NULL) || (kp->ks_data == NULL)) - return (EIO); - - if (rw == KSTAT_WRITE) - return (EACCES); - - ns = netstack_find_by_stackid(stackid); - if (ns == NULL) - return (-1); - tcps = ns->netstack_tcp; - if (tcps == NULL) { - netstack_rele(ns); - return (-1); - } - - tcpkp = (tcp_named_kstat_t *)kp->ks_data; - - tcpkp->currEstab.value.ui32 = 0; - - ipst = ns->netstack_ip; - - for (i = 0; i < CONN_G_HASH_SIZE; i++) { - connfp = &ipst->ips_ipcl_globalhash_fanout[i]; - connp = NULL; - while ((connp = - ipcl_get_next_conn(connfp, connp, IPCL_TCPCONN)) != NULL) { - tcp = connp->conn_tcp; - switch (tcp_snmp_state(tcp)) { - case MIB2_TCP_established: - case MIB2_TCP_closeWait: - tcpkp->currEstab.value.ui32++; - break; - } - } - } - - tcpkp->activeOpens.value.ui32 = tcps->tcps_mib.tcpActiveOpens; - tcpkp->passiveOpens.value.ui32 = tcps->tcps_mib.tcpPassiveOpens; - tcpkp->attemptFails.value.ui32 = tcps->tcps_mib.tcpAttemptFails; - tcpkp->estabResets.value.ui32 = tcps->tcps_mib.tcpEstabResets; - tcpkp->inSegs.value.ui64 = tcps->tcps_mib.tcpHCInSegs; - tcpkp->outSegs.value.ui64 = tcps->tcps_mib.tcpHCOutSegs; - tcpkp->retransSegs.value.ui32 = tcps->tcps_mib.tcpRetransSegs; - tcpkp->connTableSize.value.i32 = tcps->tcps_mib.tcpConnTableSize; - tcpkp->outRsts.value.ui32 = tcps->tcps_mib.tcpOutRsts; - tcpkp->outDataSegs.value.ui32 = tcps->tcps_mib.tcpOutDataSegs; - tcpkp->outDataBytes.value.ui32 = tcps->tcps_mib.tcpOutDataBytes; - tcpkp->retransBytes.value.ui32 = tcps->tcps_mib.tcpRetransBytes; - tcpkp->outAck.value.ui32 = tcps->tcps_mib.tcpOutAck; - tcpkp->outAckDelayed.value.ui32 = tcps->tcps_mib.tcpOutAckDelayed; - tcpkp->outUrg.value.ui32 = tcps->tcps_mib.tcpOutUrg; - tcpkp->outWinUpdate.value.ui32 = tcps->tcps_mib.tcpOutWinUpdate; - tcpkp->outWinProbe.value.ui32 = tcps->tcps_mib.tcpOutWinProbe; - tcpkp->outControl.value.ui32 = tcps->tcps_mib.tcpOutControl; - tcpkp->outFastRetrans.value.ui32 = tcps->tcps_mib.tcpOutFastRetrans; - tcpkp->inAckSegs.value.ui32 = tcps->tcps_mib.tcpInAckSegs; - tcpkp->inAckBytes.value.ui32 = tcps->tcps_mib.tcpInAckBytes; - tcpkp->inDupAck.value.ui32 = tcps->tcps_mib.tcpInDupAck; - tcpkp->inAckUnsent.value.ui32 = tcps->tcps_mib.tcpInAckUnsent; - tcpkp->inDataInorderSegs.value.ui32 = - tcps->tcps_mib.tcpInDataInorderSegs; - tcpkp->inDataInorderBytes.value.ui32 = - tcps->tcps_mib.tcpInDataInorderBytes; - tcpkp->inDataUnorderSegs.value.ui32 = - tcps->tcps_mib.tcpInDataUnorderSegs; - 
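	/*
	 * Consumer-side illustration (a sketch, not part of this routine):
	 * the counters filled in here back the named kstat created by
	 * tcp_kstat_init() above (module "tcp", name "tcp", class "mib2"),
	 * so for the default stack instance they can be read with
	 * kstat(1M), e.g. "kstat -m tcp -n tcp -s currEstab", or with
	 * libkstat along these lines:
	 *
	 *	kstat_ctl_t *kc = kstat_open();
	 *	kstat_t *ksp = kstat_lookup(kc, "tcp", 0, "tcp");
	 *	kstat_named_t *kn;
	 *
	 *	if (ksp != NULL && kstat_read(kc, ksp, NULL) != -1 &&
	 *	    (kn = kstat_data_lookup(ksp, "currEstab")) != NULL)
	 *		(void) printf("currEstab = %u\n", kn->value.ui32);
	 *	(void) kstat_close(kc);
	 *
	 * The kstat framework invokes this ks_update routine before each
	 * such read.
	 */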
tcpkp->inDataUnorderBytes.value.ui32 = - tcps->tcps_mib.tcpInDataUnorderBytes; - tcpkp->inDataDupSegs.value.ui32 = tcps->tcps_mib.tcpInDataDupSegs; - tcpkp->inDataDupBytes.value.ui32 = tcps->tcps_mib.tcpInDataDupBytes; - tcpkp->inDataPartDupSegs.value.ui32 = - tcps->tcps_mib.tcpInDataPartDupSegs; - tcpkp->inDataPartDupBytes.value.ui32 = - tcps->tcps_mib.tcpInDataPartDupBytes; - tcpkp->inDataPastWinSegs.value.ui32 = - tcps->tcps_mib.tcpInDataPastWinSegs; - tcpkp->inDataPastWinBytes.value.ui32 = - tcps->tcps_mib.tcpInDataPastWinBytes; - tcpkp->inWinProbe.value.ui32 = tcps->tcps_mib.tcpInWinProbe; - tcpkp->inWinUpdate.value.ui32 = tcps->tcps_mib.tcpInWinUpdate; - tcpkp->inClosed.value.ui32 = tcps->tcps_mib.tcpInClosed; - tcpkp->rttNoUpdate.value.ui32 = tcps->tcps_mib.tcpRttNoUpdate; - tcpkp->rttUpdate.value.ui32 = tcps->tcps_mib.tcpRttUpdate; - tcpkp->timRetrans.value.ui32 = tcps->tcps_mib.tcpTimRetrans; - tcpkp->timRetransDrop.value.ui32 = tcps->tcps_mib.tcpTimRetransDrop; - tcpkp->timKeepalive.value.ui32 = tcps->tcps_mib.tcpTimKeepalive; - tcpkp->timKeepaliveProbe.value.ui32 = - tcps->tcps_mib.tcpTimKeepaliveProbe; - tcpkp->timKeepaliveDrop.value.ui32 = - tcps->tcps_mib.tcpTimKeepaliveDrop; - tcpkp->listenDrop.value.ui32 = tcps->tcps_mib.tcpListenDrop; - tcpkp->listenDropQ0.value.ui32 = tcps->tcps_mib.tcpListenDropQ0; - tcpkp->halfOpenDrop.value.ui32 = tcps->tcps_mib.tcpHalfOpenDrop; - tcpkp->outSackRetransSegs.value.ui32 = - tcps->tcps_mib.tcpOutSackRetransSegs; - tcpkp->connTableSize6.value.i32 = tcps->tcps_mib.tcp6ConnTableSize; - - netstack_rele(ns); - return (0); -} - static int tcp_squeue_switch(int val) { @@ -20392,9 +4721,10 @@ tcp_squeue_add(squeue_t *sqp) sizeof (tcp_squeue_priv_t), KM_SLEEP); *squeue_getprivate(sqp, SQPRIVATE_TCP) = (intptr_t)tcp_time_wait; + /* Kick start the periodic TIME WAIT collector. */ tcp_time_wait->tcp_time_wait_tid = timeout_generic(CALLOUT_NORMAL, tcp_time_wait_collector, sqp, - TICK_TO_NSEC(TCP_TIME_WAIT_DELAY), CALLOUT_TCP_RESOLUTION, + (hrtime_t)10 * NANOSEC, CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP); if (tcp_free_list_max_cnt == 0) { int tcp_ncpus = ((boot_max_ncpus == -1) ? @@ -20408,352 +4738,6 @@ tcp_squeue_add(squeue_t *sqp) } tcp_time_wait->tcp_free_list_cnt = 0; } - -/* - * On a labeled system we have some protocols above TCP, such as RPC, which - * appear to assume that every mblk in a chain has a db_credp. - */ -static void -tcp_setcred_data(mblk_t *mp, ip_recv_attr_t *ira) -{ - ASSERT(is_system_labeled()); - ASSERT(ira->ira_cred != NULL); - - while (mp != NULL) { - mblk_setcred(mp, ira->ira_cred, NOPID); - mp = mp->b_cont; - } -} - -static int -tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr, - boolean_t bind_to_req_port_only, cred_t *cr) -{ - in_port_t mlp_port; - mlp_type_t addrtype, mlptype; - boolean_t user_specified; - in_port_t allocated_port; - in_port_t requested_port = *requested_port_ptr; - conn_t *connp = tcp->tcp_connp; - zone_t *zone; - tcp_stack_t *tcps = tcp->tcp_tcps; - in6_addr_t v6addr = connp->conn_laddr_v6; - - /* - * XXX It's up to the caller to specify bind_to_req_port_only or not. - */ - ASSERT(cr != NULL); - - /* - * Get a valid port (within the anonymous range and should not - * be a privileged one) to use if the user has not given a port. - * If multiple threads are here, they may all start with - * with the same initial port. But, it should be fine as long as - * tcp_bindi will ensure that no two threads will be assigned - * the same port. 
- * - * NOTE: XXX If a privileged process asks for an anonymous port, we - * still check for ports only in the range > tcp_smallest_non_priv_port, - * unless TCP_ANONPRIVBIND option is set. - */ - mlptype = mlptSingle; - mlp_port = requested_port; - if (requested_port == 0) { - requested_port = connp->conn_anon_priv_bind ? - tcp_get_next_priv_port(tcp) : - tcp_update_next_port(tcps->tcps_next_port_to_try, - tcp, B_TRUE); - if (requested_port == 0) { - return (-TNOADDR); - } - user_specified = B_FALSE; - - /* - * If the user went through one of the RPC interfaces to create - * this socket and RPC is MLP in this zone, then give him an - * anonymous MLP. - */ - if (connp->conn_anon_mlp && is_system_labeled()) { - zone = crgetzone(cr); - addrtype = tsol_mlp_addr_type( - connp->conn_allzones ? ALL_ZONES : zone->zone_id, - IPV6_VERSION, &v6addr, - tcps->tcps_netstack->netstack_ip); - if (addrtype == mlptSingle) { - return (-TNOADDR); - } - mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP, - PMAPPORT, addrtype); - mlp_port = PMAPPORT; - } - } else { - int i; - boolean_t priv = B_FALSE; - - /* - * If the requested_port is in the well-known privileged range, - * verify that the stream was opened by a privileged user. - * Note: No locks are held when inspecting tcp_g_*epriv_ports - * but instead the code relies on: - * - the fact that the address of the array and its size never - * changes - * - the atomic assignment of the elements of the array - */ - if (requested_port < tcps->tcps_smallest_nonpriv_port) { - priv = B_TRUE; - } else { - for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) { - if (requested_port == - tcps->tcps_g_epriv_ports[i]) { - priv = B_TRUE; - break; - } - } - } - if (priv) { - if (secpolicy_net_privaddr(cr, requested_port, - IPPROTO_TCP) != 0) { - if (connp->conn_debug) { - (void) strlog(TCP_MOD_ID, 0, 1, - SL_ERROR|SL_TRACE, - "tcp_bind: no priv for port %d", - requested_port); - } - return (-TACCES); - } - } - user_specified = B_TRUE; - - connp = tcp->tcp_connp; - if (is_system_labeled()) { - zone = crgetzone(cr); - addrtype = tsol_mlp_addr_type( - connp->conn_allzones ? ALL_ZONES : zone->zone_id, - IPV6_VERSION, &v6addr, - tcps->tcps_netstack->netstack_ip); - if (addrtype == mlptSingle) { - return (-TNOADDR); - } - mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP, - requested_port, addrtype); - } - } - - if (mlptype != mlptSingle) { - if (secpolicy_net_bindmlp(cr) != 0) { - if (connp->conn_debug) { - (void) strlog(TCP_MOD_ID, 0, 1, - SL_ERROR|SL_TRACE, - "tcp_bind: no priv for multilevel port %d", - requested_port); - } - return (-TACCES); - } - - /* - * If we're specifically binding a shared IP address and the - * port is MLP on shared addresses, then check to see if this - * zone actually owns the MLP. Reject if not. - */ - if (mlptype == mlptShared && addrtype == mlptShared) { - /* - * No need to handle exclusive-stack zones since - * ALL_ZONES only applies to the shared stack. 
- */ - zoneid_t mlpzone; - - mlpzone = tsol_mlp_findzone(IPPROTO_TCP, - htons(mlp_port)); - if (connp->conn_zoneid != mlpzone) { - if (connp->conn_debug) { - (void) strlog(TCP_MOD_ID, 0, 1, - SL_ERROR|SL_TRACE, - "tcp_bind: attempt to bind port " - "%d on shared addr in zone %d " - "(should be %d)", - mlp_port, connp->conn_zoneid, - mlpzone); - } - return (-TACCES); - } - } - - if (!user_specified) { - int err; - err = tsol_mlp_anon(zone, mlptype, connp->conn_proto, - requested_port, B_TRUE); - if (err != 0) { - if (connp->conn_debug) { - (void) strlog(TCP_MOD_ID, 0, 1, - SL_ERROR|SL_TRACE, - "tcp_bind: cannot establish anon " - "MLP for port %d", - requested_port); - } - return (err); - } - connp->conn_anon_port = B_TRUE; - } - connp->conn_mlp_type = mlptype; - } - - allocated_port = tcp_bindi(tcp, requested_port, &v6addr, - connp->conn_reuseaddr, B_FALSE, bind_to_req_port_only, - user_specified); - - if (allocated_port == 0) { - connp->conn_mlp_type = mlptSingle; - if (connp->conn_anon_port) { - connp->conn_anon_port = B_FALSE; - (void) tsol_mlp_anon(zone, mlptype, connp->conn_proto, - requested_port, B_FALSE); - } - if (bind_to_req_port_only) { - if (connp->conn_debug) { - (void) strlog(TCP_MOD_ID, 0, 1, - SL_ERROR|SL_TRACE, - "tcp_bind: requested addr busy"); - } - return (-TADDRBUSY); - } else { - /* If we are out of ports, fail the bind. */ - if (connp->conn_debug) { - (void) strlog(TCP_MOD_ID, 0, 1, - SL_ERROR|SL_TRACE, - "tcp_bind: out of ports?"); - } - return (-TNOADDR); - } - } - - /* Pass the allocated port back */ - *requested_port_ptr = allocated_port; - return (0); -} - -/* - * Check the address and check/pick a local port number. - */ -static int -tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, - boolean_t bind_to_req_port_only) -{ - tcp_t *tcp = connp->conn_tcp; - sin_t *sin; - sin6_t *sin6; - in_port_t requested_port; - ipaddr_t v4addr; - in6_addr_t v6addr; - ip_laddr_t laddr_type = IPVL_UNICAST_UP; /* INADDR_ANY */ - zoneid_t zoneid = IPCL_ZONEID(connp); - ip_stack_t *ipst = connp->conn_netstack->netstack_ip; - uint_t scopeid = 0; - int error = 0; - ip_xmit_attr_t *ixa = connp->conn_ixa; - - ASSERT((uintptr_t)len <= (uintptr_t)INT_MAX); - - if (tcp->tcp_state == TCPS_BOUND) { - return (0); - } else if (tcp->tcp_state > TCPS_BOUND) { - if (connp->conn_debug) { - (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, - "tcp_bind: bad state, %d", tcp->tcp_state); - } - return (-TOUTSTATE); - } - - ASSERT(sa != NULL && len != 0); - - if (!OK_32PTR((char *)sa)) { - if (connp->conn_debug) { - (void) strlog(TCP_MOD_ID, 0, 1, - SL_ERROR|SL_TRACE, - "tcp_bind: bad address parameter, " - "address %p, len %d", - (void *)sa, len); - } - return (-TPROTO); - } - - error = proto_verify_ip_addr(connp->conn_family, sa, len); - if (error != 0) { - return (error); - } - - switch (len) { - case sizeof (sin_t): /* Complete IPv4 address */ - sin = (sin_t *)sa; - requested_port = ntohs(sin->sin_port); - v4addr = sin->sin_addr.s_addr; - IN6_IPADDR_TO_V4MAPPED(v4addr, &v6addr); - if (v4addr != INADDR_ANY) { - laddr_type = ip_laddr_verify_v4(v4addr, zoneid, ipst, - B_FALSE); - } - break; - - case sizeof (sin6_t): /* Complete IPv6 address */ - sin6 = (sin6_t *)sa; - v6addr = sin6->sin6_addr; - requested_port = ntohs(sin6->sin6_port); - if (IN6_IS_ADDR_V4MAPPED(&v6addr)) { - if (connp->conn_ipv6_v6only) - return (EADDRNOTAVAIL); - - IN6_V4MAPPED_TO_IPADDR(&v6addr, v4addr); - if (v4addr != INADDR_ANY) { - laddr_type = ip_laddr_verify_v4(v4addr, - zoneid, ipst, B_FALSE); - } 
- } else { - if (!IN6_IS_ADDR_UNSPECIFIED(&v6addr)) { - if (IN6_IS_ADDR_LINKSCOPE(&v6addr)) - scopeid = sin6->sin6_scope_id; - laddr_type = ip_laddr_verify_v6(&v6addr, - zoneid, ipst, B_FALSE, scopeid); - } - } - break; - - default: - if (connp->conn_debug) { - (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, - "tcp_bind: bad address length, %d", len); - } - return (EAFNOSUPPORT); - /* return (-TBADADDR); */ - } - - /* Is the local address a valid unicast address? */ - if (laddr_type == IPVL_BAD) - return (EADDRNOTAVAIL); - - connp->conn_bound_addr_v6 = v6addr; - if (scopeid != 0) { - ixa->ixa_flags |= IXAF_SCOPEID_SET; - ixa->ixa_scopeid = scopeid; - connp->conn_incoming_ifindex = scopeid; - } else { - ixa->ixa_flags &= ~IXAF_SCOPEID_SET; - connp->conn_incoming_ifindex = connp->conn_bound_if; - } - - connp->conn_laddr_v6 = v6addr; - connp->conn_saddr_v6 = v6addr; - - bind_to_req_port_only = requested_port != 0 && bind_to_req_port_only; - - error = tcp_bind_select_lport(tcp, &requested_port, - bind_to_req_port_only, cr); - if (error != 0) { - connp->conn_laddr_v6 = ipv6_all_zeros; - connp->conn_saddr_v6 = ipv6_all_zeros; - connp->conn_bound_addr_v6 = ipv6_all_zeros; - } - return (error); -} - /* * Return unix error is tli error is TSYSERR, otherwise return a negative * tli error. @@ -20782,48 +4766,6 @@ tcp_do_bind(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, return (0); } -int -tcp_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa, - socklen_t len, cred_t *cr) -{ - int error; - conn_t *connp = (conn_t *)proto_handle; - squeue_t *sqp = connp->conn_sqp; - - /* All Solaris components should pass a cred for this operation. */ - ASSERT(cr != NULL); - - ASSERT(sqp != NULL); - ASSERT(connp->conn_upper_handle != NULL); - - error = squeue_synch_enter(sqp, connp, NULL); - if (error != 0) { - /* failed to enter */ - return (ENOSR); - } - - /* binding to a NULL address really means unbind */ - if (sa == NULL) { - if (connp->conn_tcp->tcp_state < TCPS_LISTEN) - error = tcp_do_unbind(connp); - else - error = EINVAL; - } else { - error = tcp_do_bind(connp, sa, len, cr, B_TRUE); - } - - squeue_synch_exit(sqp, connp); - - if (error < 0) { - if (error == -TOUTSTATE) - error = EINVAL; - else - error = proto_tlitosyserr(-error); - } - - return (error); -} - /* * If the return value from this function is positive, it's a UNIX error. * Otherwise, if it's negative, then the absolute value is a TLI error. @@ -20963,7 +4905,7 @@ tcp_do_connect(conn_t *connp, const struct sockaddr *sa, socklen_t len, goto connect_failed; /* connect succeeded */ - BUMP_MIB(&tcps->tcps_mib, tcpActiveOpens); + TCPS_BUMP_MIB(tcps, tcpActiveOpens); tcp->tcp_active_open = 1; /* @@ -21050,652 +4992,6 @@ connect_failed: } int -tcp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, - socklen_t len, sock_connid_t *id, cred_t *cr) -{ - conn_t *connp = (conn_t *)proto_handle; - squeue_t *sqp = connp->conn_sqp; - int error; - - ASSERT(connp->conn_upper_handle != NULL); - - /* All Solaris components should pass a cred for this operation. 
*/ - ASSERT(cr != NULL); - - error = proto_verify_ip_addr(connp->conn_family, sa, len); - if (error != 0) { - return (error); - } - - error = squeue_synch_enter(sqp, connp, NULL); - if (error != 0) { - /* failed to enter */ - return (ENOSR); - } - - /* - * TCP supports quick connect, so no need to do an implicit bind - */ - error = tcp_do_connect(connp, sa, len, cr, curproc->p_pid); - if (error == 0) { - *id = connp->conn_tcp->tcp_connid; - } else if (error < 0) { - if (error == -TOUTSTATE) { - switch (connp->conn_tcp->tcp_state) { - case TCPS_SYN_SENT: - error = EALREADY; - break; - case TCPS_ESTABLISHED: - error = EISCONN; - break; - case TCPS_LISTEN: - error = EOPNOTSUPP; - break; - default: - error = EINVAL; - break; - } - } else { - error = proto_tlitosyserr(-error); - } - } - - if (connp->conn_tcp->tcp_loopback) { - struct sock_proto_props sopp; - - sopp.sopp_flags = SOCKOPT_LOOPBACK; - sopp.sopp_loopback = B_TRUE; - - (*connp->conn_upcalls->su_set_proto_props)( - connp->conn_upper_handle, &sopp); - } -done: - squeue_synch_exit(sqp, connp); - - return ((error == 0) ? EINPROGRESS : error); -} - -/* ARGSUSED */ -sock_lower_handle_t -tcp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, - uint_t *smodep, int *errorp, int flags, cred_t *credp) -{ - conn_t *connp; - boolean_t isv6 = family == AF_INET6; - if (type != SOCK_STREAM || (family != AF_INET && family != AF_INET6) || - (proto != 0 && proto != IPPROTO_TCP)) { - *errorp = EPROTONOSUPPORT; - return (NULL); - } - - connp = tcp_create_common(credp, isv6, B_TRUE, errorp); - if (connp == NULL) { - return (NULL); - } - - /* - * Put the ref for TCP. Ref for IP was already put - * by ipcl_conn_create. Also Make the conn_t globally - * visible to walkers - */ - mutex_enter(&connp->conn_lock); - CONN_INC_REF_LOCKED(connp); - ASSERT(connp->conn_ref == 2); - connp->conn_state_flags &= ~CONN_INCIPIENT; - - connp->conn_flags |= IPCL_NONSTR; - mutex_exit(&connp->conn_lock); - - ASSERT(errorp != NULL); - *errorp = 0; - *sock_downcalls = &sock_tcp_downcalls; - *smodep = SM_CONNREQUIRED | SM_EXDATA | SM_ACCEPTSUPP | - SM_SENDFILESUPP; - - return ((sock_lower_handle_t)connp); -} - -/* ARGSUSED */ -void -tcp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle, - sock_upcalls_t *sock_upcalls, int flags, cred_t *cr) -{ - conn_t *connp = (conn_t *)proto_handle; - struct sock_proto_props sopp; - - ASSERT(connp->conn_upper_handle == NULL); - - /* All Solaris components should pass a cred for this operation. */ - ASSERT(cr != NULL); - - sopp.sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT | - SOCKOPT_MAXPSZ | SOCKOPT_MAXBLK | SOCKOPT_RCVTIMER | - SOCKOPT_RCVTHRESH | SOCKOPT_MAXADDRLEN | SOCKOPT_MINPSZ; - - sopp.sopp_rxhiwat = SOCKET_RECVHIWATER; - sopp.sopp_rxlowat = SOCKET_RECVLOWATER; - sopp.sopp_maxpsz = INFPSZ; - sopp.sopp_maxblk = INFPSZ; - sopp.sopp_rcvtimer = SOCKET_TIMER_INTERVAL; - sopp.sopp_rcvthresh = SOCKET_RECVHIWATER >> 3; - sopp.sopp_maxaddrlen = sizeof (sin6_t); - sopp.sopp_minpsz = (tcp_rinfo.mi_minpsz == 1) ? 
0 : - tcp_rinfo.mi_minpsz; - - connp->conn_upcalls = sock_upcalls; - connp->conn_upper_handle = sock_handle; - - ASSERT(connp->conn_rcvbuf != 0 && - connp->conn_rcvbuf == connp->conn_tcp->tcp_rwnd); - (*sock_upcalls->su_set_proto_props)(sock_handle, &sopp); -} - -/* ARGSUSED */ -int -tcp_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr) -{ - conn_t *connp = (conn_t *)proto_handle; - - ASSERT(connp->conn_upper_handle != NULL); - - /* All Solaris components should pass a cred for this operation. */ - ASSERT(cr != NULL); - - tcp_close_common(connp, flags); - - ip_free_helper_stream(connp); - - /* - * Drop IP's reference on the conn. This is the last reference - * on the connp if the state was less than established. If the - * connection has gone into timewait state, then we will have - * one ref for the TCP and one more ref (total of two) for the - * classifier connected hash list (a timewait connections stays - * in connected hash till closed). - * - * We can't assert the references because there might be other - * transient reference places because of some walkers or queued - * packets in squeue for the timewait state. - */ - CONN_DEC_REF(connp); - return (0); -} - -/* ARGSUSED */ -int -tcp_sendmsg(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, - cred_t *cr) -{ - tcp_t *tcp; - uint32_t msize; - conn_t *connp = (conn_t *)proto_handle; - int32_t tcpstate; - - /* All Solaris components should pass a cred for this operation. */ - ASSERT(cr != NULL); - - ASSERT(connp->conn_ref >= 2); - ASSERT(connp->conn_upper_handle != NULL); - - if (msg->msg_controllen != 0) { - freemsg(mp); - return (EOPNOTSUPP); - } - - switch (DB_TYPE(mp)) { - case M_DATA: - tcp = connp->conn_tcp; - ASSERT(tcp != NULL); - - tcpstate = tcp->tcp_state; - if (tcpstate < TCPS_ESTABLISHED) { - freemsg(mp); - /* - * We return ENOTCONN if the endpoint is trying to - * connect or has never been connected, and EPIPE if it - * has been disconnected. The connection id helps us - * distinguish between the last two cases. - */ - return ((tcpstate == TCPS_SYN_SENT) ? ENOTCONN : - ((tcp->tcp_connid > 0) ? EPIPE : ENOTCONN)); - } else if (tcpstate > TCPS_CLOSE_WAIT) { - freemsg(mp); - return (EPIPE); - } - - msize = msgdsize(mp); - - mutex_enter(&tcp->tcp_non_sq_lock); - tcp->tcp_squeue_bytes += msize; - /* - * Squeue Flow Control - */ - if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) { - tcp_setqfull(tcp); - } - mutex_exit(&tcp->tcp_non_sq_lock); - - /* - * The application may pass in an address in the msghdr, but - * we ignore the address on connection-oriented sockets. - * Just like BSD this code does not generate an error for - * TCP (a CONNREQUIRED socket) when sending to an address - * passed in with sendto/sendmsg. Instead the data is - * delivered on the connection as if no address had been - * supplied. 
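	 * That is, a sendto(3SOCKET) that supplies a destination address
	 * on a connected TCP socket behaves exactly like send(3SOCKET);
	 * the address is silently ignored rather than the call failing
	 * with, say, EISCONN.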
- */ - CONN_INC_REF(connp); - - if (msg->msg_flags & MSG_OOB) { - SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output_urgent, - connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT); - } else { - SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output, - connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT); - } - - return (0); - - default: - ASSERT(0); - } - - freemsg(mp); - return (0); -} - -/* ARGSUSED2 */ -void -tcp_output_urgent(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) -{ - int len; - uint32_t msize; - conn_t *connp = (conn_t *)arg; - tcp_t *tcp = connp->conn_tcp; - - msize = msgdsize(mp); - - len = msize - 1; - if (len < 0) { - freemsg(mp); - return; - } - - /* - * Try to force urgent data out on the wire. Even if we have unsent - * data this will at least send the urgent flag. - * XXX does not handle more flag correctly. - */ - len += tcp->tcp_unsent; - len += tcp->tcp_snxt; - tcp->tcp_urg = len; - tcp->tcp_valid_bits |= TCP_URG_VALID; - - /* Bypass tcp protocol for fused tcp loopback */ - if (tcp->tcp_fused && tcp_fuse_output(tcp, mp, msize)) - return; - - /* Strip off the T_EXDATA_REQ if the data is from TPI */ - if (DB_TYPE(mp) != M_DATA) { - mblk_t *mp1 = mp; - ASSERT(!IPCL_IS_NONSTR(connp)); - mp = mp->b_cont; - freeb(mp1); - } - tcp_wput_data(tcp, mp, B_TRUE); -} - -/* ARGSUSED3 */ -int -tcp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr, - socklen_t *addrlenp, cred_t *cr) -{ - conn_t *connp = (conn_t *)proto_handle; - tcp_t *tcp = connp->conn_tcp; - - ASSERT(connp->conn_upper_handle != NULL); - /* All Solaris components should pass a cred for this operation. */ - ASSERT(cr != NULL); - - ASSERT(tcp != NULL); - if (tcp->tcp_state < TCPS_SYN_RCVD) - return (ENOTCONN); - - return (conn_getpeername(connp, addr, addrlenp)); -} - -/* ARGSUSED3 */ -int -tcp_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *addr, - socklen_t *addrlenp, cred_t *cr) -{ - conn_t *connp = (conn_t *)proto_handle; - - /* All Solaris components should pass a cred for this operation. */ - ASSERT(cr != NULL); - - ASSERT(connp->conn_upper_handle != NULL); - return (conn_getsockname(connp, addr, addrlenp)); -} - -/* - * tcp_fallback - * - * A direct socket is falling back to using STREAMS. The queue - * that is being passed down was created using tcp_open() with - * the SO_FALLBACK flag set. As a result, the queue is not - * associated with a conn, and the q_ptrs instead contain the - * dev and minor area that should be used. - * - * The 'issocket' flag indicates whether the FireEngine - * optimizations should be used. The common case would be that - * optimizations are enabled, and they might be subsequently - * disabled using the _SIOCSOCKFALLBACK ioctl. - */ - -/* - * An active connection is falling back to TPI. Gather all the information - * required by the STREAM head and TPI sonode and send it up. 
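 * Concretely (see the body below): an M_SETOPTS message carrying the
 * write offset, high-water mark and max block size is sent to the stream
 * head; a T_capability_ack plus the local and foreign addresses and the
 * SO_OOBINLINE/SO_DONTROUTE settings are handed to the quiesced callback;
 * and finally any data queued on tcp_rcv_list is pushed upstream.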
- */ -void -tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q, - boolean_t issocket, so_proto_quiesced_cb_t quiesced_cb) -{ - conn_t *connp = tcp->tcp_connp; - struct stroptions *stropt; - struct T_capability_ack tca; - struct sockaddr_in6 laddr, faddr; - socklen_t laddrlen, faddrlen; - short opts; - int error; - mblk_t *mp; - - connp->conn_dev = (dev_t)RD(q)->q_ptr; - connp->conn_minor_arena = WR(q)->q_ptr; - - RD(q)->q_ptr = WR(q)->q_ptr = connp; - - connp->conn_rq = RD(q); - connp->conn_wq = WR(q); - - WR(q)->q_qinfo = &tcp_sock_winit; - - if (!issocket) - tcp_use_pure_tpi(tcp); - - /* - * free the helper stream - */ - ip_free_helper_stream(connp); - - /* - * Notify the STREAM head about options - */ - DB_TYPE(stropt_mp) = M_SETOPTS; - stropt = (struct stroptions *)stropt_mp->b_rptr; - stropt_mp->b_wptr += sizeof (struct stroptions); - stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK; - - stropt->so_wroff = connp->conn_ht_iphc_len + (tcp->tcp_loopback ? 0 : - tcp->tcp_tcps->tcps_wroff_xtra); - if (tcp->tcp_snd_sack_ok) - stropt->so_wroff += TCPOPT_MAX_SACK_LEN; - stropt->so_hiwat = connp->conn_rcvbuf; - stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE); - - putnext(RD(q), stropt_mp); - - /* - * Collect the information needed to sync with the sonode - */ - tcp_do_capability_ack(tcp, &tca, TC1_INFO|TC1_ACCEPTOR_ID); - - laddrlen = faddrlen = sizeof (sin6_t); - (void) tcp_getsockname((sock_lower_handle_t)connp, - (struct sockaddr *)&laddr, &laddrlen, CRED()); - error = tcp_getpeername((sock_lower_handle_t)connp, - (struct sockaddr *)&faddr, &faddrlen, CRED()); - if (error != 0) - faddrlen = 0; - - opts = 0; - if (connp->conn_oobinline) - opts |= SO_OOBINLINE; - if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE) - opts |= SO_DONTROUTE; - - /* - * Notify the socket that the protocol is now quiescent, - * and it's therefore safe move data from the socket - * to the stream head. - */ - (*quiesced_cb)(connp->conn_upper_handle, q, &tca, - (struct sockaddr *)&laddr, laddrlen, - (struct sockaddr *)&faddr, faddrlen, opts); - - while ((mp = tcp->tcp_rcv_list) != NULL) { - tcp->tcp_rcv_list = mp->b_next; - mp->b_next = NULL; - /* We never do fallback for kernel RPC */ - putnext(q, mp); - } - tcp->tcp_rcv_last_head = NULL; - tcp->tcp_rcv_last_tail = NULL; - tcp->tcp_rcv_cnt = 0; -} - -/* - * An eager is falling back to TPI. All we have to do is send - * up a T_CONN_IND. - */ -void -tcp_fallback_eager(tcp_t *eager, boolean_t direct_sockfs) -{ - tcp_t *listener = eager->tcp_listener; - mblk_t *mp = eager->tcp_conn.tcp_eager_conn_ind; - - ASSERT(listener != NULL); - ASSERT(mp != NULL); - - eager->tcp_conn.tcp_eager_conn_ind = NULL; - - /* - * TLI/XTI applications will get confused by - * sending eager as an option since it violates - * the option semantics. So remove the eager as - * option since TLI/XTI app doesn't need it anyway. - */ - if (!direct_sockfs) { - struct T_conn_ind *conn_ind; - - conn_ind = (struct T_conn_ind *)mp->b_rptr; - conn_ind->OPT_length = 0; - conn_ind->OPT_offset = 0; - } - - /* - * Sockfs guarantees that the listener will not be closed - * during fallback. So we can safely use the listener's queue. 
- */ - putnext(listener->tcp_connp->conn_rq, mp); -} - -int -tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q, - boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb) -{ - tcp_t *tcp; - conn_t *connp = (conn_t *)proto_handle; - int error; - mblk_t *stropt_mp; - mblk_t *ordrel_mp; - - tcp = connp->conn_tcp; - - stropt_mp = allocb_wait(sizeof (struct stroptions), BPRI_HI, STR_NOSIG, - NULL); - - /* Pre-allocate the T_ordrel_ind mblk. */ - ASSERT(tcp->tcp_ordrel_mp == NULL); - ordrel_mp = allocb_wait(sizeof (struct T_ordrel_ind), BPRI_HI, - STR_NOSIG, NULL); - ordrel_mp->b_datap->db_type = M_PROTO; - ((struct T_ordrel_ind *)ordrel_mp->b_rptr)->PRIM_type = T_ORDREL_IND; - ordrel_mp->b_wptr += sizeof (struct T_ordrel_ind); - - /* - * Enter the squeue so that no new packets can come in - */ - error = squeue_synch_enter(connp->conn_sqp, connp, NULL); - if (error != 0) { - /* failed to enter, free all the pre-allocated messages. */ - freeb(stropt_mp); - freeb(ordrel_mp); - /* - * We cannot process the eager, so at least send out a - * RST so the peer can reconnect. - */ - if (tcp->tcp_listener != NULL) { - (void) tcp_eager_blowoff(tcp->tcp_listener, - tcp->tcp_conn_req_seqnum); - } - return (ENOMEM); - } - - /* - * Both endpoints must be of the same type (either STREAMS or - * non-STREAMS) for fusion to be enabled. So if we are fused, - * we have to unfuse. - */ - if (tcp->tcp_fused) - tcp_unfuse(tcp); - - /* - * No longer a direct socket - */ - connp->conn_flags &= ~IPCL_NONSTR; - tcp->tcp_ordrel_mp = ordrel_mp; - - if (tcp->tcp_listener != NULL) { - /* The eager will deal with opts when accept() is called */ - freeb(stropt_mp); - tcp_fallback_eager(tcp, direct_sockfs); - } else { - tcp_fallback_noneager(tcp, stropt_mp, q, direct_sockfs, - quiesced_cb); - } - - /* - * There should be atleast two ref's (IP + TCP) - */ - ASSERT(connp->conn_ref >= 2); - squeue_synch_exit(connp->conn_sqp, connp); - - return (0); -} - -/* ARGSUSED */ -static void -tcp_shutdown_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) -{ - conn_t *connp = (conn_t *)arg; - tcp_t *tcp = connp->conn_tcp; - - freemsg(mp); - - if (tcp->tcp_fused) - tcp_unfuse(tcp); - - if (tcp_xmit_end(tcp) != 0) { - /* - * We were crossing FINs and got a reset from - * the other side. Just ignore it. - */ - if (connp->conn_debug) { - (void) strlog(TCP_MOD_ID, 0, 1, - SL_ERROR|SL_TRACE, - "tcp_shutdown_output() out of state %s", - tcp_display(tcp, NULL, DISP_ADDR_AND_PORT)); - } - } -} - -/* ARGSUSED */ -int -tcp_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr) -{ - conn_t *connp = (conn_t *)proto_handle; - tcp_t *tcp = connp->conn_tcp; - - ASSERT(connp->conn_upper_handle != NULL); - - /* All Solaris components should pass a cred for this operation. */ - ASSERT(cr != NULL); - - /* - * X/Open requires that we check the connected state. - */ - if (tcp->tcp_state < TCPS_SYN_SENT) - return (ENOTCONN); - - /* shutdown the send side */ - if (how != SHUT_RD) { - mblk_t *bp; - - bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL); - CONN_INC_REF(connp); - SQUEUE_ENTER_ONE(connp->conn_sqp, bp, tcp_shutdown_output, - connp, NULL, SQ_NODRAIN, SQTAG_TCP_SHUTDOWN_OUTPUT); - - (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, - SOCK_OPCTL_SHUT_SEND, 0); - } - - /* shutdown the recv side */ - if (how != SHUT_WR) - (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, - SOCK_OPCTL_SHUT_RECV, 0); - - return (0); -} - -/* - * SOP_LISTEN() calls into tcp_listen(). 
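 * On success, tcp_listen() issues the SOCK_OPCTL_ENAB_ACCEPT upcall so
 * sockfs starts accepting connections; negative TLI-style errors from
 * tcp_do_listen() are mapped to errnos (-TOUTSTATE becomes EINVAL, the
 * rest go through proto_tlitosyserr()).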
- */ -/* ARGSUSED */ -int -tcp_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr) -{ - conn_t *connp = (conn_t *)proto_handle; - int error; - squeue_t *sqp = connp->conn_sqp; - - ASSERT(connp->conn_upper_handle != NULL); - - /* All Solaris components should pass a cred for this operation. */ - ASSERT(cr != NULL); - - error = squeue_synch_enter(sqp, connp, NULL); - if (error != 0) { - /* failed to enter */ - return (ENOBUFS); - } - - error = tcp_do_listen(connp, NULL, 0, backlog, cr, FALSE); - if (error == 0) { - (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, - SOCK_OPCTL_ENAB_ACCEPT, (uintptr_t)backlog); - } else if (error < 0) { - if (error == -TOUTSTATE) - error = EINVAL; - else - error = proto_tlitosyserr(-error); - } - squeue_synch_exit(sqp, connp); - return (error); -} - -static int tcp_do_listen(conn_t *connp, struct sockaddr *sa, socklen_t len, int backlog, cred_t *cr, boolean_t bind_to_req_port_only) { @@ -21861,330 +5157,3 @@ do_listen: } return (error); } - -void -tcp_clr_flowctrl(sock_lower_handle_t proto_handle) -{ - conn_t *connp = (conn_t *)proto_handle; - tcp_t *tcp = connp->conn_tcp; - mblk_t *mp; - int error; - - ASSERT(connp->conn_upper_handle != NULL); - - /* - * If tcp->tcp_rsrv_mp == NULL, it means that tcp_clr_flowctrl() - * is currently running. - */ - mutex_enter(&tcp->tcp_rsrv_mp_lock); - if ((mp = tcp->tcp_rsrv_mp) == NULL) { - mutex_exit(&tcp->tcp_rsrv_mp_lock); - return; - } - tcp->tcp_rsrv_mp = NULL; - mutex_exit(&tcp->tcp_rsrv_mp_lock); - - error = squeue_synch_enter(connp->conn_sqp, connp, mp); - ASSERT(error == 0); - - mutex_enter(&tcp->tcp_rsrv_mp_lock); - tcp->tcp_rsrv_mp = mp; - mutex_exit(&tcp->tcp_rsrv_mp_lock); - - if (tcp->tcp_fused) { - tcp_fuse_backenable(tcp); - } else { - tcp->tcp_rwnd = connp->conn_rcvbuf; - /* - * Send back a window update immediately if TCP is above - * ESTABLISHED state and the increase of the rcv window - * that the other side knows is at least 1 MSS after flow - * control is lifted. - */ - if (tcp->tcp_state >= TCPS_ESTABLISHED && - tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) { - tcp_xmit_ctl(NULL, tcp, - (tcp->tcp_swnd == 0) ? tcp->tcp_suna : - tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK); - } - } - - squeue_synch_exit(connp->conn_sqp, connp); -} - -/* ARGSUSED */ -int -tcp_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg, - int mode, int32_t *rvalp, cred_t *cr) -{ - conn_t *connp = (conn_t *)proto_handle; - int error; - - ASSERT(connp->conn_upper_handle != NULL); - - /* All Solaris components should pass a cred for this operation. */ - ASSERT(cr != NULL); - - /* - * If we don't have a helper stream then create one. - * ip_create_helper_stream takes care of locking the conn_t, - * so this check for NULL is just a performance optimization. - */ - if (connp->conn_helper_info == NULL) { - tcp_stack_t *tcps = connp->conn_tcp->tcp_tcps; - - /* - * Create a helper stream for non-STREAMS socket. - */ - error = ip_create_helper_stream(connp, tcps->tcps_ldi_ident); - if (error != 0) { - ip0dbg(("tcp_ioctl: create of IP helper stream " - "failed %d\n", error)); - return (error); - } - } - - switch (cmd) { - case ND_SET: - case ND_GET: - case _SIOCSOCKFALLBACK: - case TCP_IOC_ABORT_CONN: - case TI_GETPEERNAME: - case TI_GETMYNAME: - ip1dbg(("tcp_ioctl: cmd 0x%x on non streams socket", - cmd)); - error = EINVAL; - break; - default: - /* - * If the conn is not closing, pass on to IP using - * helper stream. 
Bump the ioctlref to prevent tcp_close - * from closing the rq/wq out from underneath the ioctl - * if it ends up queued or aborted/interrupted. - */ - mutex_enter(&connp->conn_lock); - if (connp->conn_state_flags & (CONN_CLOSING)) { - mutex_exit(&connp->conn_lock); - error = EINVAL; - break; - } - CONN_INC_IOCTLREF_LOCKED(connp); - error = ldi_ioctl(connp->conn_helper_info->iphs_handle, - cmd, arg, mode, cr, rvalp); - CONN_DEC_IOCTLREF(connp); - break; - } - return (error); -} - -sock_downcalls_t sock_tcp_downcalls = { - tcp_activate, - tcp_accept, - tcp_bind, - tcp_listen, - tcp_connect, - tcp_getpeername, - tcp_getsockname, - tcp_getsockopt, - tcp_setsockopt, - tcp_sendmsg, - NULL, - NULL, - NULL, - tcp_shutdown, - tcp_clr_flowctrl, - tcp_ioctl, - tcp_close, -}; - -/* - * Timeout function to reset the TCP stack variable tcps_reclaim to false. - */ -static void -tcp_reclaim_timer(void *arg) -{ - tcp_stack_t *tcps = (tcp_stack_t *)arg; - - mutex_enter(&tcps->tcps_reclaim_lock); - tcps->tcps_reclaim = B_FALSE; - tcps->tcps_reclaim_tid = 0; - mutex_exit(&tcps->tcps_reclaim_lock); -} - -/* - * Kmem reclaim call back function. When the system is under memory - * pressure, we set the TCP stack variable tcps_reclaim to true. This - * variable is reset to false after tcps_reclaim_period msecs. During this - * period, TCP will be more aggressive in aborting connections not making - * progress, meaning retransmitting for some time (tcp_early_abort seconds). - * TCP will also not accept new connection request for those listeners whose - * q or q0 is not empty. - */ -/* ARGSUSED */ -void -tcp_conn_reclaim(void *arg) -{ - netstack_handle_t nh; - netstack_t *ns; - tcp_stack_t *tcps; - extern pgcnt_t lotsfree, needfree; - - if (!tcp_do_reclaim) - return; - - /* - * The reclaim function may be called even when the system is not - * really under memory pressure. - */ - if (freemem >= lotsfree + needfree) - return; - - netstack_next_init(&nh); - while ((ns = netstack_next(&nh)) != NULL) { - tcps = ns->netstack_tcp; - mutex_enter(&tcps->tcps_reclaim_lock); - if (!tcps->tcps_reclaim) { - tcps->tcps_reclaim = B_TRUE; - tcps->tcps_reclaim_tid = timeout(tcp_reclaim_timer, - tcps, MSEC_TO_TICK(tcps->tcps_reclaim_period)); - } - mutex_exit(&tcps->tcps_reclaim_lock); - netstack_rele(ns); - } - netstack_next_fini(&nh); -} - -/* - * Given a tcp_stack_t and a port (in host byte order), find a listener - * configuration for that port and return the ratio. - */ -static uint32_t -tcp_find_listener_conf(tcp_stack_t *tcps, in_port_t port) -{ - tcp_listener_t *tl; - uint32_t ratio = 0; - - mutex_enter(&tcps->tcps_listener_conf_lock); - for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL; - tl = list_next(&tcps->tcps_listener_conf, tl)) { - if (tl->tl_port == port) { - ratio = tl->tl_ratio; - break; - } - } - mutex_exit(&tcps->tcps_listener_conf_lock); - return (ratio); -} - -/* - * Ndd param helper routine to return the current list of listener limit - * configuration. - */ -/* ARGSUSED */ -static int -tcp_listener_conf_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) -{ - tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; - tcp_listener_t *tl; - - mutex_enter(&tcps->tcps_listener_conf_lock); - for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL; - tl = list_next(&tcps->tcps_listener_conf, tl)) { - (void) mi_mpprintf(mp, "%d:%d ", tl->tl_port, tl->tl_ratio); - } - mutex_exit(&tcps->tcps_listener_conf_lock); - return (0); -} - -/* - * Ndd param helper routine to add a new listener limit configuration. 
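 * The value is parsed as "<port>:<ratio>" (for example "80:2"); an entry
 * for an already-configured port simply has its ratio updated in place.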
- */ -/* ARGSUSED */ -static int -tcp_listener_conf_add(queue_t *q, mblk_t *mp, char *value, caddr_t cp, - cred_t *cr) -{ - tcp_listener_t *new_tl; - tcp_listener_t *tl; - long lport; - long ratio; - char *colon; - tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; - - if (ddi_strtol(value, &colon, 10, &lport) != 0 || lport <= 0 || - lport > USHRT_MAX || *colon != ':') { - return (EINVAL); - } - if (ddi_strtol(colon + 1, NULL, 10, &ratio) != 0 || ratio <= 0) - return (EINVAL); - - mutex_enter(&tcps->tcps_listener_conf_lock); - for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL; - tl = list_next(&tcps->tcps_listener_conf, tl)) { - /* There is an existing entry, so update its ratio value. */ - if (tl->tl_port == lport) { - tl->tl_ratio = ratio; - mutex_exit(&tcps->tcps_listener_conf_lock); - return (0); - } - } - - if ((new_tl = kmem_alloc(sizeof (tcp_listener_t), KM_NOSLEEP)) == - NULL) { - mutex_exit(&tcps->tcps_listener_conf_lock); - return (ENOMEM); - } - - new_tl->tl_port = lport; - new_tl->tl_ratio = ratio; - list_insert_tail(&tcps->tcps_listener_conf, new_tl); - mutex_exit(&tcps->tcps_listener_conf_lock); - return (0); -} - -/* - * Ndd param helper routine to remove a listener limit configuration. - */ -/* ARGSUSED */ -static int -tcp_listener_conf_del(queue_t *q, mblk_t *mp, char *value, caddr_t cp, - cred_t *cr) -{ - tcp_listener_t *tl; - long lport; - tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; - - if (ddi_strtol(value, NULL, 10, &lport) != 0 || lport <= 0 || - lport > USHRT_MAX) { - return (EINVAL); - } - mutex_enter(&tcps->tcps_listener_conf_lock); - for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL; - tl = list_next(&tcps->tcps_listener_conf, tl)) { - if (tl->tl_port == lport) { - list_remove(&tcps->tcps_listener_conf, tl); - mutex_exit(&tcps->tcps_listener_conf_lock); - kmem_free(tl, sizeof (tcp_listener_t)); - return (0); - } - } - mutex_exit(&tcps->tcps_listener_conf_lock); - return (ESRCH); -} - -/* - * To remove all listener limit configuration in a tcp_stack_t. - */ -static void -tcp_listener_conf_cleanup(tcp_stack_t *tcps) -{ - tcp_listener_t *tl; - - mutex_enter(&tcps->tcps_listener_conf_lock); - while ((tl = list_head(&tcps->tcps_listener_conf)) != NULL) { - list_remove(&tcps->tcps_listener_conf, tl); - kmem_free(tl, sizeof (tcp_listener_t)); - } - mutex_destroy(&tcps->tcps_listener_conf_lock); - list_destroy(&tcps->tcps_listener_conf); -} diff --git a/usr/src/uts/common/inet/tcp/tcp_bind.c b/usr/src/uts/common/inet/tcp/tcp_bind.c new file mode 100644 index 0000000000..5d91fe7a7f --- /dev/null +++ b/usr/src/uts/common/inet/tcp/tcp_bind.c @@ -0,0 +1,935 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 
+ * Use is subject to license terms. + */ + +#include <sys/types.h> +#include <sys/stream.h> +#include <sys/strsun.h> +#include <sys/strsubr.h> +#include <sys/stropts.h> +#include <sys/strlog.h> +#define _SUN_TPI_VERSION 2 +#include <sys/tihdr.h> +#include <sys/suntpi.h> +#include <sys/xti_inet.h> +#include <sys/policy.h> +#include <sys/squeue_impl.h> +#include <sys/squeue.h> +#include <sys/tsol/tnet.h> + +#include <rpc/pmap_prot.h> + +#include <inet/common.h> +#include <inet/ip.h> +#include <inet/tcp.h> +#include <inet/tcp_impl.h> +#include <inet/proto_set.h> +#include <inet/ipsec_impl.h> + +/* Setable in /etc/system */ +/* If set to 0, pick ephemeral port sequentially; otherwise randomly. */ +static uint32_t tcp_random_anon_port = 1; + +static int tcp_bind_select_lport(tcp_t *, in_port_t *, boolean_t, + cred_t *cr); +static in_port_t tcp_get_next_priv_port(const tcp_t *); + +/* + * Hash list insertion routine for tcp_t structures. Each hash bucket + * contains a list of tcp_t entries, and each entry is bound to a unique + * port. If there are multiple tcp_t's that are bound to the same port, then + * one of them will be linked into the hash bucket list, and the rest will + * hang off of that one entry. For each port, entries bound to a specific IP + * address will be inserted before those those bound to INADDR_ANY. + */ +void +tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock) +{ + tcp_t **tcpp; + tcp_t *tcpnext; + tcp_t *tcphash; + conn_t *connp = tcp->tcp_connp; + conn_t *connext; + + if (tcp->tcp_ptpbhn != NULL) { + ASSERT(!caller_holds_lock); + tcp_bind_hash_remove(tcp); + } + tcpp = &tbf->tf_tcp; + if (!caller_holds_lock) { + mutex_enter(&tbf->tf_lock); + } else { + ASSERT(MUTEX_HELD(&tbf->tf_lock)); + } + tcphash = tcpp[0]; + tcpnext = NULL; + if (tcphash != NULL) { + /* Look for an entry using the same port */ + while ((tcphash = tcpp[0]) != NULL && + connp->conn_lport != tcphash->tcp_connp->conn_lport) + tcpp = &(tcphash->tcp_bind_hash); + + /* The port was not found, just add to the end */ + if (tcphash == NULL) + goto insert; + + /* + * OK, there already exists an entry bound to the + * same port. + * + * If the new tcp bound to the INADDR_ANY address + * and the first one in the list is not bound to + * INADDR_ANY we skip all entries until we find the + * first one bound to INADDR_ANY. + * This makes sure that applications binding to a + * specific address get preference over those binding to + * INADDR_ANY. + */ + tcpnext = tcphash; + connext = tcpnext->tcp_connp; + tcphash = NULL; + if (V6_OR_V4_INADDR_ANY(connp->conn_bound_addr_v6) && + !V6_OR_V4_INADDR_ANY(connext->conn_bound_addr_v6)) { + while ((tcpnext = tcpp[0]) != NULL) { + connext = tcpnext->tcp_connp; + if (!V6_OR_V4_INADDR_ANY( + connext->conn_bound_addr_v6)) + tcpp = &(tcpnext->tcp_bind_hash_port); + else + break; + } + if (tcpnext != NULL) { + tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port; + tcphash = tcpnext->tcp_bind_hash; + if (tcphash != NULL) { + tcphash->tcp_ptpbhn = + &(tcp->tcp_bind_hash); + tcpnext->tcp_bind_hash = NULL; + } + } + } else { + tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port; + tcphash = tcpnext->tcp_bind_hash; + if (tcphash != NULL) { + tcphash->tcp_ptpbhn = + &(tcp->tcp_bind_hash); + tcpnext->tcp_bind_hash = NULL; + } + } + } +insert: + tcp->tcp_bind_hash_port = tcpnext; + tcp->tcp_bind_hash = tcphash; + tcp->tcp_ptpbhn = tcpp; + tcpp[0] = tcp; + if (!caller_holds_lock) + mutex_exit(&tbf->tf_lock); +} + +/* + * Hash list removal routine for tcp_t structures. 
+ */ +void +tcp_bind_hash_remove(tcp_t *tcp) +{ + tcp_t *tcpnext; + kmutex_t *lockp; + tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; + + if (tcp->tcp_ptpbhn == NULL) + return; + + /* + * Extract the lock pointer in case there are concurrent + * hash_remove's for this instance. + */ + ASSERT(connp->conn_lport != 0); + lockp = &tcps->tcps_bind_fanout[TCP_BIND_HASH( + connp->conn_lport)].tf_lock; + + ASSERT(lockp != NULL); + mutex_enter(lockp); + if (tcp->tcp_ptpbhn) { + tcpnext = tcp->tcp_bind_hash_port; + if (tcpnext != NULL) { + tcp->tcp_bind_hash_port = NULL; + tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn; + tcpnext->tcp_bind_hash = tcp->tcp_bind_hash; + if (tcpnext->tcp_bind_hash != NULL) { + tcpnext->tcp_bind_hash->tcp_ptpbhn = + &(tcpnext->tcp_bind_hash); + tcp->tcp_bind_hash = NULL; + } + } else if ((tcpnext = tcp->tcp_bind_hash) != NULL) { + tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn; + tcp->tcp_bind_hash = NULL; + } + *tcp->tcp_ptpbhn = tcpnext; + tcp->tcp_ptpbhn = NULL; + } + mutex_exit(lockp); +} + +/* + * Don't let port fall into the privileged range. + * Since the extra privileged ports can be arbitrary we also + * ensure that we exclude those from consideration. + * tcp_g_epriv_ports is not sorted thus we loop over it until + * there are no changes. + * + * Note: No locks are held when inspecting tcp_g_*epriv_ports + * but instead the code relies on: + * - the fact that the address of the array and its size never changes + * - the atomic assignment of the elements of the array + * + * Returns 0 if there are no more ports available. + * + * TS note: skip multilevel ports. + */ +in_port_t +tcp_update_next_port(in_port_t port, const tcp_t *tcp, boolean_t random) +{ + int i; + boolean_t restart = B_FALSE; + tcp_stack_t *tcps = tcp->tcp_tcps; + + if (random && tcp_random_anon_port != 0) { + (void) random_get_pseudo_bytes((uint8_t *)&port, + sizeof (in_port_t)); + /* + * Unless changed by a sys admin, the smallest anon port + * is 32768 and the largest anon port is 65535. It is + * very likely (50%) for the random port to be smaller + * than the smallest anon port. When that happens, + * add port % (anon port range) to the smallest anon + * port to get the random port. It should fall into the + * valid anon port range. + */ + if (port < tcps->tcps_smallest_anon_port) { + port = tcps->tcps_smallest_anon_port + + port % (tcps->tcps_largest_anon_port - + tcps->tcps_smallest_anon_port); + } + } + +retry: + if (port < tcps->tcps_smallest_anon_port) + port = (in_port_t)tcps->tcps_smallest_anon_port; + + if (port > tcps->tcps_largest_anon_port) { + if (restart) + return (0); + restart = B_TRUE; + port = (in_port_t)tcps->tcps_smallest_anon_port; + } + + if (port < tcps->tcps_smallest_nonpriv_port) + port = (in_port_t)tcps->tcps_smallest_nonpriv_port; + + for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) { + if (port == tcps->tcps_g_epriv_ports[i]) { + port++; + /* + * Make sure whether the port is in the + * valid range. + */ + goto retry; + } + } + if (is_system_labeled() && + (i = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred), port, + IPPROTO_TCP, B_TRUE)) != 0) { + port = i; + goto retry; + } + return (port); +} + +/* + * Return the next anonymous port in the privileged port range for + * bind checking. It starts at IPPORT_RESERVED - 1 and goes + * downwards. This is the same behavior as documented in the userland + * library call rresvport(3N). + * + * TS note: skip multilevel ports. 
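 * For example, on an unlabeled system successive calls hand out 1023,
 * 1022, 1021, ... down to tcps_min_anonpriv_port and then wrap back to
 * 1023 (IPPORT_RESERVED - 1).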
+ */ +static in_port_t +tcp_get_next_priv_port(const tcp_t *tcp) +{ + static in_port_t next_priv_port = IPPORT_RESERVED - 1; + in_port_t nextport; + boolean_t restart = B_FALSE; + tcp_stack_t *tcps = tcp->tcp_tcps; +retry: + if (next_priv_port < tcps->tcps_min_anonpriv_port || + next_priv_port >= IPPORT_RESERVED) { + next_priv_port = IPPORT_RESERVED - 1; + if (restart) + return (0); + restart = B_TRUE; + } + if (is_system_labeled() && + (nextport = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred), + next_priv_port, IPPROTO_TCP, B_FALSE)) != 0) { + next_priv_port = nextport; + goto retry; + } + return (next_priv_port--); +} + +static int +tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr, + boolean_t bind_to_req_port_only, cred_t *cr) +{ + in_port_t mlp_port; + mlp_type_t addrtype, mlptype; + boolean_t user_specified; + in_port_t allocated_port; + in_port_t requested_port = *requested_port_ptr; + conn_t *connp = tcp->tcp_connp; + zone_t *zone; + tcp_stack_t *tcps = tcp->tcp_tcps; + in6_addr_t v6addr = connp->conn_laddr_v6; + + /* + * XXX It's up to the caller to specify bind_to_req_port_only or not. + */ + ASSERT(cr != NULL); + + /* + * Get a valid port (within the anonymous range and should not + * be a privileged one) to use if the user has not given a port. + * If multiple threads are here, they may all start with + * with the same initial port. But, it should be fine as long as + * tcp_bindi will ensure that no two threads will be assigned + * the same port. + * + * NOTE: XXX If a privileged process asks for an anonymous port, we + * still check for ports only in the range > tcp_smallest_non_priv_port, + * unless TCP_ANONPRIVBIND option is set. + */ + mlptype = mlptSingle; + mlp_port = requested_port; + if (requested_port == 0) { + requested_port = connp->conn_anon_priv_bind ? + tcp_get_next_priv_port(tcp) : + tcp_update_next_port(tcps->tcps_next_port_to_try, + tcp, B_TRUE); + if (requested_port == 0) { + return (-TNOADDR); + } + user_specified = B_FALSE; + + /* + * If the user went through one of the RPC interfaces to create + * this socket and RPC is MLP in this zone, then give him an + * anonymous MLP. + */ + if (connp->conn_anon_mlp && is_system_labeled()) { + zone = crgetzone(cr); + addrtype = tsol_mlp_addr_type( + connp->conn_allzones ? ALL_ZONES : zone->zone_id, + IPV6_VERSION, &v6addr, + tcps->tcps_netstack->netstack_ip); + if (addrtype == mlptSingle) { + return (-TNOADDR); + } + mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP, + PMAPPORT, addrtype); + mlp_port = PMAPPORT; + } + } else { + int i; + boolean_t priv = B_FALSE; + + /* + * If the requested_port is in the well-known privileged range, + * verify that the stream was opened by a privileged user. 
+ * Note: No locks are held when inspecting tcp_g_*epriv_ports + * but instead the code relies on: + * - the fact that the address of the array and its size never + * changes + * - the atomic assignment of the elements of the array + */ + if (requested_port < tcps->tcps_smallest_nonpriv_port) { + priv = B_TRUE; + } else { + for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) { + if (requested_port == + tcps->tcps_g_epriv_ports[i]) { + priv = B_TRUE; + break; + } + } + } + if (priv) { + if (secpolicy_net_privaddr(cr, requested_port, + IPPROTO_TCP) != 0) { + if (connp->conn_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, + SL_ERROR|SL_TRACE, + "tcp_bind: no priv for port %d", + requested_port); + } + return (-TACCES); + } + } + user_specified = B_TRUE; + + connp = tcp->tcp_connp; + if (is_system_labeled()) { + zone = crgetzone(cr); + addrtype = tsol_mlp_addr_type( + connp->conn_allzones ? ALL_ZONES : zone->zone_id, + IPV6_VERSION, &v6addr, + tcps->tcps_netstack->netstack_ip); + if (addrtype == mlptSingle) { + return (-TNOADDR); + } + mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP, + requested_port, addrtype); + } + } + + if (mlptype != mlptSingle) { + if (secpolicy_net_bindmlp(cr) != 0) { + if (connp->conn_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, + SL_ERROR|SL_TRACE, + "tcp_bind: no priv for multilevel port %d", + requested_port); + } + return (-TACCES); + } + + /* + * If we're specifically binding a shared IP address and the + * port is MLP on shared addresses, then check to see if this + * zone actually owns the MLP. Reject if not. + */ + if (mlptype == mlptShared && addrtype == mlptShared) { + /* + * No need to handle exclusive-stack zones since + * ALL_ZONES only applies to the shared stack. + */ + zoneid_t mlpzone; + + mlpzone = tsol_mlp_findzone(IPPROTO_TCP, + htons(mlp_port)); + if (connp->conn_zoneid != mlpzone) { + if (connp->conn_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, + SL_ERROR|SL_TRACE, + "tcp_bind: attempt to bind port " + "%d on shared addr in zone %d " + "(should be %d)", + mlp_port, connp->conn_zoneid, + mlpzone); + } + return (-TACCES); + } + } + + if (!user_specified) { + int err; + err = tsol_mlp_anon(zone, mlptype, connp->conn_proto, + requested_port, B_TRUE); + if (err != 0) { + if (connp->conn_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, + SL_ERROR|SL_TRACE, + "tcp_bind: cannot establish anon " + "MLP for port %d", + requested_port); + } + return (err); + } + connp->conn_anon_port = B_TRUE; + } + connp->conn_mlp_type = mlptype; + } + + allocated_port = tcp_bindi(tcp, requested_port, &v6addr, + connp->conn_reuseaddr, B_FALSE, bind_to_req_port_only, + user_specified); + + if (allocated_port == 0) { + connp->conn_mlp_type = mlptSingle; + if (connp->conn_anon_port) { + connp->conn_anon_port = B_FALSE; + (void) tsol_mlp_anon(zone, mlptype, connp->conn_proto, + requested_port, B_FALSE); + } + if (bind_to_req_port_only) { + if (connp->conn_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, + SL_ERROR|SL_TRACE, + "tcp_bind: requested addr busy"); + } + return (-TADDRBUSY); + } else { + /* If we are out of ports, fail the bind. */ + if (connp->conn_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, + SL_ERROR|SL_TRACE, + "tcp_bind: out of ports?"); + } + return (-TNOADDR); + } + } + + /* Pass the allocated port back */ + *requested_port_ptr = allocated_port; + return (0); +} + +/* + * Check the address and check/pick a local port number. 
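+ *
+ * A minimal illustration of the calling convention (the address and
+ * port values here are made up for the example): binding to 127.0.0.1
+ * port 8080, and insisting on exactly that port, amounts to
+ *
+ *	sin_t	sin;
+ *
+ *	sin = sin_null;
+ *	sin.sin_family = AF_INET;
+ *	sin.sin_port = htons(8080);
+ *	sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+ *	error = tcp_bind_check(connp, (struct sockaddr *)&sin,
+ *	    sizeof (sin), cr, B_TRUE);
+ *
+ * Passing sin_port == 0 instead lets tcp_bind_select_lport() pick an
+ * anonymous port.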
+ */ +int +tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, + boolean_t bind_to_req_port_only) +{ + tcp_t *tcp = connp->conn_tcp; + sin_t *sin; + sin6_t *sin6; + in_port_t requested_port; + ipaddr_t v4addr; + in6_addr_t v6addr; + ip_laddr_t laddr_type = IPVL_UNICAST_UP; /* INADDR_ANY */ + zoneid_t zoneid = IPCL_ZONEID(connp); + ip_stack_t *ipst = connp->conn_netstack->netstack_ip; + uint_t scopeid = 0; + int error = 0; + ip_xmit_attr_t *ixa = connp->conn_ixa; + + ASSERT((uintptr_t)len <= (uintptr_t)INT_MAX); + + if (tcp->tcp_state == TCPS_BOUND) { + return (0); + } else if (tcp->tcp_state > TCPS_BOUND) { + if (connp->conn_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, + "tcp_bind: bad state, %d", tcp->tcp_state); + } + return (-TOUTSTATE); + } + + ASSERT(sa != NULL && len != 0); + + if (!OK_32PTR((char *)sa)) { + if (connp->conn_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, + SL_ERROR|SL_TRACE, + "tcp_bind: bad address parameter, " + "address %p, len %d", + (void *)sa, len); + } + return (-TPROTO); + } + + error = proto_verify_ip_addr(connp->conn_family, sa, len); + if (error != 0) { + return (error); + } + + switch (len) { + case sizeof (sin_t): /* Complete IPv4 address */ + sin = (sin_t *)sa; + requested_port = ntohs(sin->sin_port); + v4addr = sin->sin_addr.s_addr; + IN6_IPADDR_TO_V4MAPPED(v4addr, &v6addr); + if (v4addr != INADDR_ANY) { + laddr_type = ip_laddr_verify_v4(v4addr, zoneid, ipst, + B_FALSE); + } + break; + + case sizeof (sin6_t): /* Complete IPv6 address */ + sin6 = (sin6_t *)sa; + v6addr = sin6->sin6_addr; + requested_port = ntohs(sin6->sin6_port); + if (IN6_IS_ADDR_V4MAPPED(&v6addr)) { + if (connp->conn_ipv6_v6only) + return (EADDRNOTAVAIL); + + IN6_V4MAPPED_TO_IPADDR(&v6addr, v4addr); + if (v4addr != INADDR_ANY) { + laddr_type = ip_laddr_verify_v4(v4addr, + zoneid, ipst, B_FALSE); + } + } else { + if (!IN6_IS_ADDR_UNSPECIFIED(&v6addr)) { + if (IN6_IS_ADDR_LINKSCOPE(&v6addr)) + scopeid = sin6->sin6_scope_id; + laddr_type = ip_laddr_verify_v6(&v6addr, + zoneid, ipst, B_FALSE, scopeid); + } + } + break; + + default: + if (connp->conn_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, + "tcp_bind: bad address length, %d", len); + } + return (EAFNOSUPPORT); + /* return (-TBADADDR); */ + } + + /* Is the local address a valid unicast address? */ + if (laddr_type == IPVL_BAD) + return (EADDRNOTAVAIL); + + connp->conn_bound_addr_v6 = v6addr; + if (scopeid != 0) { + ixa->ixa_flags |= IXAF_SCOPEID_SET; + ixa->ixa_scopeid = scopeid; + connp->conn_incoming_ifindex = scopeid; + } else { + ixa->ixa_flags &= ~IXAF_SCOPEID_SET; + connp->conn_incoming_ifindex = connp->conn_bound_if; + } + + connp->conn_laddr_v6 = v6addr; + connp->conn_saddr_v6 = v6addr; + + bind_to_req_port_only = requested_port != 0 && bind_to_req_port_only; + + error = tcp_bind_select_lport(tcp, &requested_port, + bind_to_req_port_only, cr); + if (error != 0) { + connp->conn_laddr_v6 = ipv6_all_zeros; + connp->conn_saddr_v6 = ipv6_all_zeros; + connp->conn_bound_addr_v6 = ipv6_all_zeros; + } + return (error); +} + +/* + * If the "bind_to_req_port_only" parameter is set, if the requested port + * number is available, return it, If not return 0 + * + * If "bind_to_req_port_only" parameter is not set and + * If the requested port number is available, return it. If not, return + * the first anonymous port we happen across. If no anonymous ports are + * available, return 0. addr is the requested local address, if any. 
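+ *
+ * The outcomes described above can be summarized as follows:
+ *
+ *	bind_to_req_port_only	requested port free	returns
+ *	---------------------	-------------------	-------
+ *	set			yes			the requested port
+ *	set			no			0
+ *	not set			yes			the requested port
+ *	not set			no			an anonymous port, or
+ *							0 if none is available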
+ * + * In either case, when succeeding update the tcp_t to record the port number + * and insert it in the bind hash table. + * + * Note that TCP over IPv4 and IPv6 sockets can use the same port number + * without setting SO_REUSEADDR. This is needed so that they + * can be viewed as two independent transport protocols. + */ +in_port_t +tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, + int reuseaddr, boolean_t quick_connect, + boolean_t bind_to_req_port_only, boolean_t user_specified) +{ + /* number of times we have run around the loop */ + int count = 0; + /* maximum number of times to run around the loop */ + int loopmax; + conn_t *connp = tcp->tcp_connp; + tcp_stack_t *tcps = tcp->tcp_tcps; + + /* + * Lookup for free addresses is done in a loop and "loopmax" + * influences how long we spin in the loop + */ + if (bind_to_req_port_only) { + /* + * If the requested port is busy, don't bother to look + * for a new one. Setting loop maximum count to 1 has + * that effect. + */ + loopmax = 1; + } else { + /* + * If the requested port is busy, look for a free one + * in the anonymous port range. + * Set loopmax appropriately so that one does not look + * forever in the case all of the anonymous ports are in use. + */ + if (connp->conn_anon_priv_bind) { + /* + * loopmax = + * (IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1 + */ + loopmax = IPPORT_RESERVED - + tcps->tcps_min_anonpriv_port; + } else { + loopmax = (tcps->tcps_largest_anon_port - + tcps->tcps_smallest_anon_port + 1); + } + } + do { + uint16_t lport; + tf_t *tbf; + tcp_t *ltcp; + conn_t *lconnp; + + lport = htons(port); + + /* + * Ensure that the tcp_t is not currently in the bind hash. + * Hold the lock on the hash bucket to ensure that + * the duplicate check plus the insertion is an atomic + * operation. + * + * This function does an inline lookup on the bind hash list + * Make sure that we access only members of tcp_t + * and that we don't look at tcp_tcp, since we are not + * doing a CONN_INC_REF. + */ + tcp_bind_hash_remove(tcp); + tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(lport)]; + mutex_enter(&tbf->tf_lock); + for (ltcp = tbf->tf_tcp; ltcp != NULL; + ltcp = ltcp->tcp_bind_hash) { + if (lport == ltcp->tcp_connp->conn_lport) + break; + } + + for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) { + boolean_t not_socket; + boolean_t exclbind; + + lconnp = ltcp->tcp_connp; + + /* + * On a labeled system, we must treat bindings to ports + * on shared IP addresses by sockets with MAC exemption + * privilege as being in all zones, as there's + * otherwise no way to identify the right receiver. + */ + if (!IPCL_BIND_ZONE_MATCH(lconnp, connp)) + continue; + + /* + * If TCP_EXCLBIND is set for either the bound or + * binding endpoint, the semantics of bind + * is changed according to the following. + * + * spec = specified address (v4 or v6) + * unspec = unspecified address (v4 or v6) + * A = specified addresses are different for endpoints + * + * bound bind to allowed + * ------------------------------------- + * unspec unspec no + * unspec spec no + * spec unspec no + * spec spec yes if A + * + * For labeled systems, SO_MAC_EXEMPT behaves the same + * as TCP_EXCLBIND, except that zoneid is ignored. + * + * Note: + * + * 1. Because of TLI semantics, an endpoint can go + * back from, say TCP_ESTABLISHED to TCPS_LISTEN or + * TCPS_BOUND, depending on whether it is originally + * a listener or not. That is why we need to check + * for states greater than or equal to TCPS_BOUND + * here. + * + * 2. 
Ideally, we should only check for state equals + * to TCPS_LISTEN. And the following check should be + * added. + * + * if (ltcp->tcp_state == TCPS_LISTEN || + * !reuseaddr || !lconnp->conn_reuseaddr) { + * ... + * } + * + * The semantics will be changed to this. If the + * endpoint on the list is in state not equal to + * TCPS_LISTEN and both endpoints have SO_REUSEADDR + * set, let the bind succeed. + * + * Because of (1), we cannot do that for TLI + * endpoints. But we can do that for socket endpoints. + * If in future, we can change this going back + * semantics, we can use the above check for TLI also. + */ + not_socket = !(TCP_IS_SOCKET(ltcp) && + TCP_IS_SOCKET(tcp)); + exclbind = lconnp->conn_exclbind || + connp->conn_exclbind; + + if ((lconnp->conn_mac_mode != CONN_MAC_DEFAULT) || + (connp->conn_mac_mode != CONN_MAC_DEFAULT) || + (exclbind && (not_socket || + ltcp->tcp_state <= TCPS_ESTABLISHED))) { + if (V6_OR_V4_INADDR_ANY( + lconnp->conn_bound_addr_v6) || + V6_OR_V4_INADDR_ANY(*laddr) || + IN6_ARE_ADDR_EQUAL(laddr, + &lconnp->conn_bound_addr_v6)) { + break; + } + continue; + } + + /* + * Check ipversion to allow IPv4 and IPv6 sockets to + * have disjoint port number spaces, if *_EXCLBIND + * is not set and only if the application binds to a + * specific port. We use the same autoassigned port + * number space for IPv4 and IPv6 sockets. + */ + if (connp->conn_ipversion != lconnp->conn_ipversion && + bind_to_req_port_only) + continue; + + /* + * Ideally, we should make sure that the source + * address, remote address, and remote port in the + * four tuple for this tcp-connection is unique. + * However, trying to find out the local source + * address would require too much code duplication + * with IP, since IP needs needs to have that code + * to support userland TCP implementations. + */ + if (quick_connect && + (ltcp->tcp_state > TCPS_LISTEN) && + ((connp->conn_fport != lconnp->conn_fport) || + !IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6, + &lconnp->conn_faddr_v6))) + continue; + + if (!reuseaddr) { + /* + * No socket option SO_REUSEADDR. + * If existing port is bound to + * a non-wildcard IP address + * and the requesting stream is + * bound to a distinct + * different IP addresses + * (non-wildcard, also), keep + * going. + */ + if (!V6_OR_V4_INADDR_ANY(*laddr) && + !V6_OR_V4_INADDR_ANY( + lconnp->conn_bound_addr_v6) && + !IN6_ARE_ADDR_EQUAL(laddr, + &lconnp->conn_bound_addr_v6)) + continue; + if (ltcp->tcp_state >= TCPS_BOUND) { + /* + * This port is being used and + * its state is >= TCPS_BOUND, + * so we can't bind to it. + */ + break; + } + } else { + /* + * socket option SO_REUSEADDR is set on the + * binding tcp_t. + * + * If two streams are bound to + * same IP address or both addr + * and bound source are wildcards + * (INADDR_ANY), we want to stop + * searching. + * We have found a match of IP source + * address and source port, which is + * refused regardless of the + * SO_REUSEADDR setting, so we break. + */ + if (IN6_ARE_ADDR_EQUAL(laddr, + &lconnp->conn_bound_addr_v6) && + (ltcp->tcp_state == TCPS_LISTEN || + ltcp->tcp_state == TCPS_BOUND)) + break; + } + } + if (ltcp != NULL) { + /* The port number is busy */ + mutex_exit(&tbf->tf_lock); + } else { + /* + * This port is ours. Insert in fanout and mark as + * bound to prevent others from getting the port + * number. 
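+			 *
+			 * Note that tbf->tf_lock, taken for the duplicate
+			 * scan above, is still held here, so the lookup and
+			 * the insertion below form a single atomic step with
+			 * respect to other binders.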
+ */ + tcp->tcp_state = TCPS_BOUND; + connp->conn_lport = htons(port); + + ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH( + connp->conn_lport)] == tbf); + tcp_bind_hash_insert(tbf, tcp, 1); + + mutex_exit(&tbf->tf_lock); + + /* + * We don't want tcp_next_port_to_try to "inherit" + * a port number supplied by the user in a bind. + */ + if (user_specified) + return (port); + + /* + * This is the only place where tcp_next_port_to_try + * is updated. After the update, it may or may not + * be in the valid range. + */ + if (!connp->conn_anon_priv_bind) + tcps->tcps_next_port_to_try = port + 1; + return (port); + } + + if (connp->conn_anon_priv_bind) { + port = tcp_get_next_priv_port(tcp); + } else { + if (count == 0 && user_specified) { + /* + * We may have to return an anonymous port. So + * get one to start with. + */ + port = + tcp_update_next_port( + tcps->tcps_next_port_to_try, + tcp, B_TRUE); + user_specified = B_FALSE; + } else { + port = tcp_update_next_port(port + 1, tcp, + B_FALSE); + } + } + if (port == 0) + break; + + /* + * Don't let this loop run forever in the case where + * all of the anonymous ports are in use. + */ + } while (++count < loopmax); + return (0); +} diff --git a/usr/src/uts/common/inet/tcp/tcp_cluster.c b/usr/src/uts/common/inet/tcp/tcp_cluster.c new file mode 100644 index 0000000000..fbf2c96d41 --- /dev/null +++ b/usr/src/uts/common/inet/tcp/tcp_cluster.c @@ -0,0 +1,139 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* This file contains Solaris Cluster related TCP hooks and functions. */ + +#include <inet/tcp.h> +#include <inet/tcp_impl.h> +#include <inet/tcp_cluster.h> + +static int cl_tcp_walk_list_stack(int (*callback)(cl_tcp_info_t *, void *), + void *arg, tcp_stack_t *tcps); + +/* + * Hook functions to enable cluster networking + * On non-clustered systems these vectors must always be NULL. + */ +void (*cl_inet_listen)(netstackid_t stack_id, uint8_t protocol, + sa_family_t addr_family, uint8_t *laddrp, + in_port_t lport, void *args) = NULL; +void (*cl_inet_unlisten)(netstackid_t stack_id, uint8_t protocol, + sa_family_t addr_family, uint8_t *laddrp, + in_port_t lport, void *args) = NULL; + +int (*cl_inet_connect2)(netstackid_t stack_id, uint8_t protocol, + boolean_t is_outgoing, + sa_family_t addr_family, + uint8_t *laddrp, in_port_t lport, + uint8_t *faddrp, in_port_t fport, + void *args) = NULL; +void (*cl_inet_disconnect)(netstackid_t stack_id, uint8_t protocol, + sa_family_t addr_family, uint8_t *laddrp, + in_port_t lport, uint8_t *faddrp, + in_port_t fport, void *args) = NULL; + +/* + * Exported routine for extracting active tcp connection status. 
+ * + * This is used by the Solaris Cluster Networking software to + * gather a list of connections that need to be forwarded to + * specific nodes in the cluster when configuration changes occur. + * + * The callback is invoked for each tcp_t structure from all netstacks, + * if 'stack_id' is less than 0. Otherwise, only for tcp_t structures + * from the netstack with the specified stack_id. Returning + * non-zero from the callback routine terminates the search. + */ +int +cl_tcp_walk_list(netstackid_t stack_id, + int (*cl_callback)(cl_tcp_info_t *, void *), void *arg) +{ + netstack_handle_t nh; + netstack_t *ns; + int ret = 0; + + if (stack_id >= 0) { + if ((ns = netstack_find_by_stackid(stack_id)) == NULL) + return (EINVAL); + + ret = cl_tcp_walk_list_stack(cl_callback, arg, + ns->netstack_tcp); + netstack_rele(ns); + return (ret); + } + + netstack_next_init(&nh); + while ((ns = netstack_next(&nh)) != NULL) { + ret = cl_tcp_walk_list_stack(cl_callback, arg, + ns->netstack_tcp); + netstack_rele(ns); + } + netstack_next_fini(&nh); + return (ret); +} + +static int +cl_tcp_walk_list_stack(int (*callback)(cl_tcp_info_t *, void *), void *arg, + tcp_stack_t *tcps) +{ + tcp_t *tcp; + cl_tcp_info_t cl_tcpi; + connf_t *connfp; + conn_t *connp; + int i; + ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; + + ASSERT(callback != NULL); + + for (i = 0; i < CONN_G_HASH_SIZE; i++) { + connfp = &ipst->ips_ipcl_globalhash_fanout[i]; + connp = NULL; + + while ((connp = + ipcl_get_next_conn(connfp, connp, IPCL_TCPCONN)) != NULL) { + + tcp = connp->conn_tcp; + cl_tcpi.cl_tcpi_version = CL_TCPI_V1; + cl_tcpi.cl_tcpi_ipversion = connp->conn_ipversion; + cl_tcpi.cl_tcpi_state = tcp->tcp_state; + cl_tcpi.cl_tcpi_lport = connp->conn_lport; + cl_tcpi.cl_tcpi_fport = connp->conn_fport; + cl_tcpi.cl_tcpi_laddr_v6 = connp->conn_laddr_v6; + cl_tcpi.cl_tcpi_faddr_v6 = connp->conn_faddr_v6; + + /* + * If the callback returns non-zero + * we terminate the traversal. + */ + if ((*callback)(&cl_tcpi, arg) != 0) { + CONN_DEC_REF(tcp->tcp_connp); + return (1); + } + } + } + + return (0); +} diff --git a/usr/src/uts/common/inet/tcp/tcp_fusion.c b/usr/src/uts/common/inet/tcp/tcp_fusion.c index 93f3250fcc..f5bd5031ea 100644 --- a/usr/src/uts/common/inet/tcp/tcp_fusion.c +++ b/usr/src/uts/common/inet/tcp/tcp_fusion.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -401,7 +401,7 @@ tcp_fuse_output_urg(tcp_t *tcp, mblk_t *mp) mp->b_wptr = (uchar_t *)&tei[1]; TCP_STAT(tcps, tcp_fusion_urg); - BUMP_MIB(&tcps->tcps_mib, tcpOutUrg); + TCPS_BUMP_MIB(tcps, tcpOutUrg); head = peer_tcp->tcp_rcv_list; while (head != NULL) { @@ -645,12 +645,12 @@ tcp_fuse_output(tcp_t *tcp, mblk_t *mp, uint32_t send_size) peer_tcp->tcp_rnxt += recv_size; peer_tcp->tcp_rack = peer_tcp->tcp_rnxt; - BUMP_MIB(&tcps->tcps_mib, tcpOutDataSegs); - UPDATE_MIB(&tcps->tcps_mib, tcpOutDataBytes, send_size); + TCPS_BUMP_MIB(tcps, tcpOutDataSegs); + TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, send_size); - BUMP_MIB(&tcps->tcps_mib, tcpInSegs); - BUMP_MIB(&tcps->tcps_mib, tcpInDataInorderSegs); - UPDATE_MIB(&tcps->tcps_mib, tcpInDataInorderBytes, send_size); + TCPS_BUMP_MIB(tcps, tcpHCInSegs); + TCPS_BUMP_MIB(tcps, tcpInDataInorderSegs); + TCPS_UPDATE_MIB(tcps, tcpInDataInorderBytes, send_size); BUMP_LOCAL(tcp->tcp_obsegs); BUMP_LOCAL(peer_tcp->tcp_ibsegs); diff --git a/usr/src/uts/common/inet/tcp/tcp_input.c b/usr/src/uts/common/inet/tcp/tcp_input.c new file mode 100644 index 0000000000..0741a8292f --- /dev/null +++ b/usr/src/uts/common/inet/tcp/tcp_input.c @@ -0,0 +1,5648 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* This file contains all TCP input processing functions. */ + +#include <sys/types.h> +#include <sys/stream.h> +#include <sys/strsun.h> +#include <sys/strsubr.h> +#include <sys/stropts.h> +#include <sys/strlog.h> +#define _SUN_TPI_VERSION 2 +#include <sys/tihdr.h> +#include <sys/suntpi.h> +#include <sys/xti_inet.h> +#include <sys/squeue_impl.h> +#include <sys/squeue.h> +#include <sys/tsol/tnet.h> + +#include <inet/common.h> +#include <inet/ip.h> +#include <inet/tcp.h> +#include <inet/tcp_impl.h> +#include <inet/tcp_cluster.h> +#include <inet/proto_set.h> +#include <inet/ipsec_impl.h> + +/* + * RFC1323-recommended phrasing of TSTAMP option, for easier parsing + */ + +#ifdef _BIG_ENDIAN +#define TCPOPT_NOP_NOP_TSTAMP ((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \ + (TCPOPT_TSTAMP << 8) | 10) +#else +#define TCPOPT_NOP_NOP_TSTAMP ((10 << 24) | (TCPOPT_TSTAMP << 16) | \ + (TCPOPT_NOP << 8) | TCPOPT_NOP) +#endif + +/* + * Flags returned from tcp_parse_options. + */ +#define TCP_OPT_MSS_PRESENT 1 +#define TCP_OPT_WSCALE_PRESENT 2 +#define TCP_OPT_TSTAMP_PRESENT 4 +#define TCP_OPT_SACK_OK_PRESENT 8 +#define TCP_OPT_SACK_PRESENT 16 + +/* + * PAWS needs a timer for 24 days. 
This is the number of ticks in 24 days + */ +#define PAWS_TIMEOUT ((clock_t)(24*24*60*60*hz)) + +/* + * Since tcp_listener is not cleared atomically with tcp_detached + * being cleared we need this extra bit to tell a detached connection + * apart from one that is in the process of being accepted. + */ +#define TCP_IS_DETACHED_NONEAGER(tcp) \ + (TCP_IS_DETACHED(tcp) && \ + (!(tcp)->tcp_hard_binding)) + +/* + * Steps to do when a tcp_t moves to TIME-WAIT state. + * + * This connection is done, we don't need to account for it. Decrement + * the listener connection counter if needed. + * + * Decrement the connection counter of the stack. Note that this counter + * is per CPU. So the total number of connections in a stack is the sum of all + * of them. Since there is no lock for handling all of them exclusively, the + * resulting sum is only an approximation. + * + * Unconditionally clear the exclusive binding bit so this TIME-WAIT + * connection won't interfere with new ones. + * + * Start the TIME-WAIT timer. If upper layer has not closed the connection, + * the timer is handled within the context of this tcp_t. When the timer + * fires, tcp_clean_death() is called. If upper layer closes the connection + * during this period, tcp_time_wait_append() will be called to add this + * tcp_t to the global TIME-WAIT list. Note that this means that the + * actual wait time in TIME-WAIT state will be longer than the + * tcps_time_wait_interval since the period before upper layer closes the + * connection is not accounted for when tcp_time_wait_append() is called. + * + * If uppser layer has closed the connection, call tcp_time_wait_append() + * directly. + * + */ +#define SET_TIME_WAIT(tcps, tcp, connp) \ +{ \ + (tcp)->tcp_state = TCPS_TIME_WAIT; \ + if ((tcp)->tcp_listen_cnt != NULL) \ + TCP_DECR_LISTEN_CNT(tcp); \ + atomic_dec_64( \ + (uint64_t *)&(tcps)->tcps_sc[CPU->cpu_seqid]->tcp_sc_conn_cnt); \ + (connp)->conn_exclbind = 0; \ + if (!TCP_IS_DETACHED(tcp)) { \ + TCP_TIMER_RESTART(tcp, (tcps)->tcps_time_wait_interval); \ + } else { \ + tcp_time_wait_append(tcp); \ + TCP_DBGSTAT(tcps, tcp_rput_time_wait); \ + } \ +} + +/* + * If tcp_drop_ack_unsent_cnt is greater than 0, when TCP receives more + * than tcp_drop_ack_unsent_cnt number of ACKs which acknowledge unsent + * data, TCP will not respond with an ACK. RFC 793 requires that + * TCP responds with an ACK for such a bogus ACK. By not following + * the RFC, we prevent TCP from getting into an ACK storm if somehow + * an attacker successfully spoofs an acceptable segment to our + * peer; or when our peer is "confused." + */ +static uint32_t tcp_drop_ack_unsent_cnt = 10; + +/* + * The shift factor applied to tcp_mss to decide if the peer sends us a + * valid initial receive window. By default, if the peer receive window + * is smaller than 1 MSS (shift factor is 0), it is considered as invalid. + */ +static uint32_t tcp_init_wnd_shft = 0; + +/* Process ICMP source quench message or not. 
*/ +static boolean_t tcp_icmp_source_quench = B_FALSE; + +static boolean_t tcp_outbound_squeue_switch = B_FALSE; + +static mblk_t *tcp_conn_create_v4(conn_t *, conn_t *, mblk_t *, + ip_recv_attr_t *); +static mblk_t *tcp_conn_create_v6(conn_t *, conn_t *, mblk_t *, + ip_recv_attr_t *); +static boolean_t tcp_drop_q0(tcp_t *); +static void tcp_icmp_error_ipv6(tcp_t *, mblk_t *, ip_recv_attr_t *); +static mblk_t *tcp_input_add_ancillary(tcp_t *, mblk_t *, ip_pkt_t *, + ip_recv_attr_t *); +static void tcp_input_listener(void *, mblk_t *, void *, ip_recv_attr_t *); +static int tcp_parse_options(tcpha_t *, tcp_opt_t *); +static void tcp_process_options(tcp_t *, tcpha_t *); +static mblk_t *tcp_reass(tcp_t *, mblk_t *, uint32_t); +static void tcp_reass_elim_overlap(tcp_t *, mblk_t *); +static void tcp_rsrv_input(void *, mblk_t *, void *, ip_recv_attr_t *); +static void tcp_set_rto(tcp_t *, time_t); +static void tcp_setcred_data(mblk_t *, ip_recv_attr_t *); + +extern void tcp_kssl_input(tcp_t *, mblk_t *, cred_t *); + +/* + * Set the MSS associated with a particular tcp based on its current value, + * and a new one passed in. Observe minimums and maximums, and reset other + * state variables that we want to view as multiples of MSS. + * + * The value of MSS could be either increased or descreased. + */ +void +tcp_mss_set(tcp_t *tcp, uint32_t mss) +{ + uint32_t mss_max; + tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; + + if (connp->conn_ipversion == IPV4_VERSION) + mss_max = tcps->tcps_mss_max_ipv4; + else + mss_max = tcps->tcps_mss_max_ipv6; + + if (mss < tcps->tcps_mss_min) + mss = tcps->tcps_mss_min; + if (mss > mss_max) + mss = mss_max; + /* + * Unless naglim has been set by our client to + * a non-mss value, force naglim to track mss. + * This can help to aggregate small writes. + */ + if (mss < tcp->tcp_naglim || tcp->tcp_mss == tcp->tcp_naglim) + tcp->tcp_naglim = mss; + /* + * TCP should be able to buffer at least 4 MSS data for obvious + * performance reason. + */ + if ((mss << 2) > connp->conn_sndbuf) + connp->conn_sndbuf = mss << 2; + + /* + * Set the send lowater to at least twice of MSS. + */ + if ((mss << 1) > connp->conn_sndlowat) + connp->conn_sndlowat = mss << 1; + + /* + * Update tcp_cwnd according to the new value of MSS. Keep the + * previous ratio to preserve the transmit rate. + */ + tcp->tcp_cwnd = (tcp->tcp_cwnd / tcp->tcp_mss) * mss; + tcp->tcp_cwnd_cnt = 0; + + tcp->tcp_mss = mss; + (void) tcp_maxpsz_set(tcp, B_TRUE); +} + +/* + * Extract option values from a tcp header. We put any found values into the + * tcpopt struct and return a bitmask saying which options were found. 
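+ *
+ * A sketch of the calling convention, following the callers later in
+ * this file:
+ *
+ *	tcp_opt_t	tcpopt;
+ *	int		options;
+ *
+ *	tcpopt.tcp = tcp->tcp_snd_sack_ok ? tcp : NULL;
+ *	options = tcp_parse_options(tcpha, &tcpopt);
+ *	if (options & TCP_OPT_MSS_PRESENT)
+ *		mss = tcpopt.tcp_opt_mss;
+ *
+ * A non-NULL tcpopt.tcp lets the parser update that connection's
+ * notsack list and tcp_fack directly while walking TCPOPT_SACK blocks;
+ * a NULL value makes it skip SACK blocks entirely.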
+ */ +static int +tcp_parse_options(tcpha_t *tcpha, tcp_opt_t *tcpopt) +{ + uchar_t *endp; + int len; + uint32_t mss; + uchar_t *up = (uchar_t *)tcpha; + int found = 0; + int32_t sack_len; + tcp_seq sack_begin, sack_end; + tcp_t *tcp; + + endp = up + TCP_HDR_LENGTH(tcpha); + up += TCP_MIN_HEADER_LENGTH; + while (up < endp) { + len = endp - up; + switch (*up) { + case TCPOPT_EOL: + break; + + case TCPOPT_NOP: + up++; + continue; + + case TCPOPT_MAXSEG: + if (len < TCPOPT_MAXSEG_LEN || + up[1] != TCPOPT_MAXSEG_LEN) + break; + + mss = BE16_TO_U16(up+2); + /* Caller must handle tcp_mss_min and tcp_mss_max_* */ + tcpopt->tcp_opt_mss = mss; + found |= TCP_OPT_MSS_PRESENT; + + up += TCPOPT_MAXSEG_LEN; + continue; + + case TCPOPT_WSCALE: + if (len < TCPOPT_WS_LEN || up[1] != TCPOPT_WS_LEN) + break; + + if (up[2] > TCP_MAX_WINSHIFT) + tcpopt->tcp_opt_wscale = TCP_MAX_WINSHIFT; + else + tcpopt->tcp_opt_wscale = up[2]; + found |= TCP_OPT_WSCALE_PRESENT; + + up += TCPOPT_WS_LEN; + continue; + + case TCPOPT_SACK_PERMITTED: + if (len < TCPOPT_SACK_OK_LEN || + up[1] != TCPOPT_SACK_OK_LEN) + break; + found |= TCP_OPT_SACK_OK_PRESENT; + up += TCPOPT_SACK_OK_LEN; + continue; + + case TCPOPT_SACK: + if (len <= 2 || up[1] <= 2 || len < up[1]) + break; + + /* If TCP is not interested in SACK blks... */ + if ((tcp = tcpopt->tcp) == NULL) { + up += up[1]; + continue; + } + sack_len = up[1] - TCPOPT_HEADER_LEN; + up += TCPOPT_HEADER_LEN; + + /* + * If the list is empty, allocate one and assume + * nothing is sack'ed. + */ + ASSERT(tcp->tcp_sack_info != NULL); + if (tcp->tcp_notsack_list == NULL) { + tcp_notsack_update(&(tcp->tcp_notsack_list), + tcp->tcp_suna, tcp->tcp_snxt, + &(tcp->tcp_num_notsack_blk), + &(tcp->tcp_cnt_notsack_list)); + + /* + * Make sure tcp_notsack_list is not NULL. + * This happens when kmem_alloc(KM_NOSLEEP) + * returns NULL. + */ + if (tcp->tcp_notsack_list == NULL) { + up += sack_len; + continue; + } + tcp->tcp_fack = tcp->tcp_suna; + } + + while (sack_len > 0) { + if (up + 8 > endp) { + up = endp; + break; + } + sack_begin = BE32_TO_U32(up); + up += 4; + sack_end = BE32_TO_U32(up); + up += 4; + sack_len -= 8; + /* + * Bounds checking. Make sure the SACK + * info is within tcp_suna and tcp_snxt. + * If this SACK blk is out of bound, ignore + * it but continue to parse the following + * blks. + */ + if (SEQ_LEQ(sack_end, sack_begin) || + SEQ_LT(sack_begin, tcp->tcp_suna) || + SEQ_GT(sack_end, tcp->tcp_snxt)) { + continue; + } + tcp_notsack_insert(&(tcp->tcp_notsack_list), + sack_begin, sack_end, + &(tcp->tcp_num_notsack_blk), + &(tcp->tcp_cnt_notsack_list)); + if (SEQ_GT(sack_end, tcp->tcp_fack)) { + tcp->tcp_fack = sack_end; + } + } + found |= TCP_OPT_SACK_PRESENT; + continue; + + case TCPOPT_TSTAMP: + if (len < TCPOPT_TSTAMP_LEN || + up[1] != TCPOPT_TSTAMP_LEN) + break; + + tcpopt->tcp_opt_ts_val = BE32_TO_U32(up+2); + tcpopt->tcp_opt_ts_ecr = BE32_TO_U32(up+6); + + found |= TCP_OPT_TSTAMP_PRESENT; + + up += TCPOPT_TSTAMP_LEN; + continue; + + default: + if (len <= 1 || len < (int)up[1] || up[1] == 0) + break; + up += up[1]; + continue; + } + break; + } + return (found); +} + +/* + * Process all TCP option in SYN segment. Note that this function should + * be called after tcp_set_destination() is called so that the necessary info + * from IRE is already set in the tcp structure. + * + * This function sets up the correct tcp_mss value according to the + * MSS option value and our header size. It also sets up the window scale + * and timestamp values, and initialize SACK info blocks. 
But it does not + * change receive window size after setting the tcp_mss value. The caller + * should do the appropriate change. + */ +static void +tcp_process_options(tcp_t *tcp, tcpha_t *tcpha) +{ + int options; + tcp_opt_t tcpopt; + uint32_t mss_max; + char *tmp_tcph; + tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; + + tcpopt.tcp = NULL; + options = tcp_parse_options(tcpha, &tcpopt); + + /* + * Process MSS option. Note that MSS option value does not account + * for IP or TCP options. This means that it is equal to MTU - minimum + * IP+TCP header size, which is 40 bytes for IPv4 and 60 bytes for + * IPv6. + */ + if (!(options & TCP_OPT_MSS_PRESENT)) { + if (connp->conn_ipversion == IPV4_VERSION) + tcpopt.tcp_opt_mss = tcps->tcps_mss_def_ipv4; + else + tcpopt.tcp_opt_mss = tcps->tcps_mss_def_ipv6; + } else { + if (connp->conn_ipversion == IPV4_VERSION) + mss_max = tcps->tcps_mss_max_ipv4; + else + mss_max = tcps->tcps_mss_max_ipv6; + if (tcpopt.tcp_opt_mss < tcps->tcps_mss_min) + tcpopt.tcp_opt_mss = tcps->tcps_mss_min; + else if (tcpopt.tcp_opt_mss > mss_max) + tcpopt.tcp_opt_mss = mss_max; + } + + /* Process Window Scale option. */ + if (options & TCP_OPT_WSCALE_PRESENT) { + tcp->tcp_snd_ws = tcpopt.tcp_opt_wscale; + tcp->tcp_snd_ws_ok = B_TRUE; + } else { + tcp->tcp_snd_ws = B_FALSE; + tcp->tcp_snd_ws_ok = B_FALSE; + tcp->tcp_rcv_ws = B_FALSE; + } + + /* Process Timestamp option. */ + if ((options & TCP_OPT_TSTAMP_PRESENT) && + (tcp->tcp_snd_ts_ok || TCP_IS_DETACHED(tcp))) { + tmp_tcph = (char *)tcp->tcp_tcpha; + + tcp->tcp_snd_ts_ok = B_TRUE; + tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val; + tcp->tcp_last_rcv_lbolt = ddi_get_lbolt64(); + ASSERT(OK_32PTR(tmp_tcph)); + ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH); + + /* Fill in our template header with basic timestamp option. */ + tmp_tcph += connp->conn_ht_ulp_len; + tmp_tcph[0] = TCPOPT_NOP; + tmp_tcph[1] = TCPOPT_NOP; + tmp_tcph[2] = TCPOPT_TSTAMP; + tmp_tcph[3] = TCPOPT_TSTAMP_LEN; + connp->conn_ht_iphc_len += TCPOPT_REAL_TS_LEN; + connp->conn_ht_ulp_len += TCPOPT_REAL_TS_LEN; + tcp->tcp_tcpha->tha_offset_and_reserved += (3 << 4); + } else { + tcp->tcp_snd_ts_ok = B_FALSE; + } + + /* + * Process SACK options. If SACK is enabled for this connection, + * then allocate the SACK info structure. Note the following ways + * when tcp_snd_sack_ok is set to true. + * + * For active connection: in tcp_set_destination() called in + * tcp_connect(). + * + * For passive connection: in tcp_set_destination() called in + * tcp_input_listener(). + * + * That's the reason why the extra TCP_IS_DETACHED() check is there. + * That check makes sure that if we did not send a SACK OK option, + * we will not enable SACK for this connection even though the other + * side sends us SACK OK option. For active connection, the SACK + * info structure has already been allocated. So we need to free + * it if SACK is disabled. + */ + if ((options & TCP_OPT_SACK_OK_PRESENT) && + (tcp->tcp_snd_sack_ok || + (tcps->tcps_sack_permitted != 0 && TCP_IS_DETACHED(tcp)))) { + /* This should be true only in the passive case. 
*/ + if (tcp->tcp_sack_info == NULL) { + ASSERT(TCP_IS_DETACHED(tcp)); + tcp->tcp_sack_info = + kmem_cache_alloc(tcp_sack_info_cache, KM_NOSLEEP); + } + if (tcp->tcp_sack_info == NULL) { + tcp->tcp_snd_sack_ok = B_FALSE; + } else { + tcp->tcp_snd_sack_ok = B_TRUE; + if (tcp->tcp_snd_ts_ok) { + tcp->tcp_max_sack_blk = 3; + } else { + tcp->tcp_max_sack_blk = 4; + } + } + } else { + /* + * Resetting tcp_snd_sack_ok to B_FALSE so that + * no SACK info will be used for this + * connection. This assumes that SACK usage + * permission is negotiated. This may need + * to be changed once this is clarified. + */ + if (tcp->tcp_sack_info != NULL) { + ASSERT(tcp->tcp_notsack_list == NULL); + kmem_cache_free(tcp_sack_info_cache, + tcp->tcp_sack_info); + tcp->tcp_sack_info = NULL; + } + tcp->tcp_snd_sack_ok = B_FALSE; + } + + /* + * Now we know the exact TCP/IP header length, subtract + * that from tcp_mss to get our side's MSS. + */ + tcp->tcp_mss -= connp->conn_ht_iphc_len; + + /* + * Here we assume that the other side's header size will be equal to + * our header size. We calculate the real MSS accordingly. Need to + * take into additional stuffs IPsec puts in. + * + * Real MSS = Opt.MSS - (our TCP/IP header - min TCP/IP header) + */ + tcpopt.tcp_opt_mss -= connp->conn_ht_iphc_len + + tcp->tcp_ipsec_overhead - + ((connp->conn_ipversion == IPV4_VERSION ? + IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) + TCP_MIN_HEADER_LENGTH); + + /* + * Set MSS to the smaller one of both ends of the connection. + * We should not have called tcp_mss_set() before, but our + * side of the MSS should have been set to a proper value + * by tcp_set_destination(). tcp_mss_set() will also set up the + * STREAM head parameters properly. + * + * If we have a larger-than-16-bit window but the other side + * didn't want to do window scale, tcp_rwnd_set() will take + * care of that. + */ + tcp_mss_set(tcp, MIN(tcpopt.tcp_opt_mss, tcp->tcp_mss)); + + /* + * Initialize tcp_cwnd value. After tcp_mss_set(), tcp_mss has been + * updated properly. + */ + TCP_SET_INIT_CWND(tcp, tcp->tcp_mss, tcps->tcps_slow_start_initial); +} + +/* + * Add a new piece to the tcp reassembly queue. If the gap at the beginning + * is filled, return as much as we can. The message passed in may be + * multi-part, chained using b_cont. "start" is the starting sequence + * number for this piece. + */ +static mblk_t * +tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start) +{ + uint32_t end; + mblk_t *mp1; + mblk_t *mp2; + mblk_t *next_mp; + uint32_t u1; + tcp_stack_t *tcps = tcp->tcp_tcps; + + + /* Walk through all the new pieces. */ + do { + ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= + (uintptr_t)INT_MAX); + end = start + (int)(mp->b_wptr - mp->b_rptr); + next_mp = mp->b_cont; + if (start == end) { + /* Empty. Blast it. */ + freeb(mp); + continue; + } + mp->b_cont = NULL; + TCP_REASS_SET_SEQ(mp, start); + TCP_REASS_SET_END(mp, end); + mp1 = tcp->tcp_reass_tail; + if (!mp1) { + tcp->tcp_reass_tail = mp; + tcp->tcp_reass_head = mp; + TCPS_BUMP_MIB(tcps, tcpInDataUnorderSegs); + TCPS_UPDATE_MIB(tcps, tcpInDataUnorderBytes, + end - start); + continue; + } + /* New stuff completely beyond tail? */ + if (SEQ_GEQ(start, TCP_REASS_END(mp1))) { + /* Link it on end. */ + mp1->b_cont = mp; + tcp->tcp_reass_tail = mp; + TCPS_BUMP_MIB(tcps, tcpInDataUnorderSegs); + TCPS_UPDATE_MIB(tcps, tcpInDataUnorderBytes, + end - start); + continue; + } + mp1 = tcp->tcp_reass_head; + u1 = TCP_REASS_SEQ(mp1); + /* New stuff at the front? */ + if (SEQ_LT(start, u1)) { + /* Yes... 
Check for overlap. */ + mp->b_cont = mp1; + tcp->tcp_reass_head = mp; + tcp_reass_elim_overlap(tcp, mp); + continue; + } + /* + * The new piece fits somewhere between the head and tail. + * We find our slot, where mp1 precedes us and mp2 trails. + */ + for (; (mp2 = mp1->b_cont) != NULL; mp1 = mp2) { + u1 = TCP_REASS_SEQ(mp2); + if (SEQ_LEQ(start, u1)) + break; + } + /* Link ourselves in */ + mp->b_cont = mp2; + mp1->b_cont = mp; + + /* Trim overlap with following mblk(s) first */ + tcp_reass_elim_overlap(tcp, mp); + + /* Trim overlap with preceding mblk */ + tcp_reass_elim_overlap(tcp, mp1); + + } while (start = end, mp = next_mp); + mp1 = tcp->tcp_reass_head; + /* Anything ready to go? */ + if (TCP_REASS_SEQ(mp1) != tcp->tcp_rnxt) + return (NULL); + /* Eat what we can off the queue */ + for (;;) { + mp = mp1->b_cont; + end = TCP_REASS_END(mp1); + TCP_REASS_SET_SEQ(mp1, 0); + TCP_REASS_SET_END(mp1, 0); + if (!mp) { + tcp->tcp_reass_tail = NULL; + break; + } + if (end != TCP_REASS_SEQ(mp)) { + mp1->b_cont = NULL; + break; + } + mp1 = mp; + } + mp1 = tcp->tcp_reass_head; + tcp->tcp_reass_head = mp; + return (mp1); +} + +/* Eliminate any overlap that mp may have over later mblks */ +static void +tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp) +{ + uint32_t end; + mblk_t *mp1; + uint32_t u1; + tcp_stack_t *tcps = tcp->tcp_tcps; + + end = TCP_REASS_END(mp); + while ((mp1 = mp->b_cont) != NULL) { + u1 = TCP_REASS_SEQ(mp1); + if (!SEQ_GT(end, u1)) + break; + if (!SEQ_GEQ(end, TCP_REASS_END(mp1))) { + mp->b_wptr -= end - u1; + TCP_REASS_SET_END(mp, u1); + TCPS_BUMP_MIB(tcps, tcpInDataPartDupSegs); + TCPS_UPDATE_MIB(tcps, tcpInDataPartDupBytes, + end - u1); + break; + } + mp->b_cont = mp1->b_cont; + TCP_REASS_SET_SEQ(mp1, 0); + TCP_REASS_SET_END(mp1, 0); + freeb(mp1); + TCPS_BUMP_MIB(tcps, tcpInDataDupSegs); + TCPS_UPDATE_MIB(tcps, tcpInDataDupBytes, end - u1); + } + if (!mp1) + tcp->tcp_reass_tail = mp; +} + +/* + * This function does PAWS protection check. Returns B_TRUE if the + * segment passes the PAWS test, else returns B_FALSE. + */ +boolean_t +tcp_paws_check(tcp_t *tcp, tcpha_t *tcpha, tcp_opt_t *tcpoptp) +{ + uint8_t flags; + int options; + uint8_t *up; + conn_t *connp = tcp->tcp_connp; + + flags = (unsigned int)tcpha->tha_flags & 0xFF; + /* + * If timestamp option is aligned nicely, get values inline, + * otherwise call general routine to parse. Only do that + * if timestamp is the only option. + */ + if (TCP_HDR_LENGTH(tcpha) == (uint32_t)TCP_MIN_HEADER_LENGTH + + TCPOPT_REAL_TS_LEN && + OK_32PTR((up = ((uint8_t *)tcpha) + + TCP_MIN_HEADER_LENGTH)) && + *(uint32_t *)up == TCPOPT_NOP_NOP_TSTAMP) { + tcpoptp->tcp_opt_ts_val = ABE32_TO_U32((up+4)); + tcpoptp->tcp_opt_ts_ecr = ABE32_TO_U32((up+8)); + + options = TCP_OPT_TSTAMP_PRESENT; + } else { + if (tcp->tcp_snd_sack_ok) { + tcpoptp->tcp = tcp; + } else { + tcpoptp->tcp = NULL; + } + options = tcp_parse_options(tcpha, tcpoptp); + } + + if (options & TCP_OPT_TSTAMP_PRESENT) { + /* + * Do PAWS per RFC 1323 section 4.2. Accept RST + * regardless of the timestamp, page 18 RFC 1323.bis. + */ + if ((flags & TH_RST) == 0 && + TSTMP_LT(tcpoptp->tcp_opt_ts_val, + tcp->tcp_ts_recent)) { + if (TSTMP_LT(LBOLT_FASTPATH64, + tcp->tcp_last_rcv_lbolt + PAWS_TIMEOUT)) { + /* This segment is not acceptable. */ + return (B_FALSE); + } else { + /* + * Connection has been idle for + * too long. Reset the timestamp + * and assume the segment is valid. 
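+				 *
+				 * (PAWS_TIMEOUT above works out to
+				 * 24 * 24 * 60 * 60 = 2,073,600 seconds
+				 * worth of ticks, i.e. 24 days, so this
+				 * path is only taken after a very long
+				 * idle period.)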
+ */ + tcp->tcp_ts_recent = + tcpoptp->tcp_opt_ts_val; + } + } + } else { + /* + * If we don't get a timestamp on every packet, we + * figure we can't really trust 'em, so we stop sending + * and parsing them. + */ + tcp->tcp_snd_ts_ok = B_FALSE; + + connp->conn_ht_iphc_len -= TCPOPT_REAL_TS_LEN; + connp->conn_ht_ulp_len -= TCPOPT_REAL_TS_LEN; + tcp->tcp_tcpha->tha_offset_and_reserved -= (3 << 4); + /* + * Adjust the tcp_mss and tcp_cwnd accordingly. We avoid + * doing a slow start here so as to not to lose on the + * transfer rate built up so far. + */ + tcp_mss_set(tcp, tcp->tcp_mss + TCPOPT_REAL_TS_LEN); + if (tcp->tcp_snd_sack_ok) { + ASSERT(tcp->tcp_sack_info != NULL); + tcp->tcp_max_sack_blk = 4; + } + } + return (B_TRUE); +} + +/* + * Defense for the SYN attack - + * 1. When q0 is full, drop from the tail (tcp_eager_prev_drop_q0) the oldest + * one from the list of droppable eagers. This list is a subset of q0. + * see comments before the definition of MAKE_DROPPABLE(). + * 2. Don't drop a SYN request before its first timeout. This gives every + * request at least til the first timeout to complete its 3-way handshake. + * 3. Maintain tcp_syn_rcvd_timeout as an accurate count of how many + * requests currently on the queue that has timed out. This will be used + * as an indicator of whether an attack is under way, so that appropriate + * actions can be taken. (It's incremented in tcp_timer() and decremented + * either when eager goes into ESTABLISHED, or gets freed up.) + * 4. The current threshold is - # of timeout > q0len/4 => SYN alert on + * # of timeout drops back to <= q0len/32 => SYN alert off + */ +static boolean_t +tcp_drop_q0(tcp_t *tcp) +{ + tcp_t *eager; + mblk_t *mp; + tcp_stack_t *tcps = tcp->tcp_tcps; + + ASSERT(MUTEX_HELD(&tcp->tcp_eager_lock)); + ASSERT(tcp->tcp_eager_next_q0 != tcp->tcp_eager_prev_q0); + + /* Pick oldest eager from the list of droppable eagers */ + eager = tcp->tcp_eager_prev_drop_q0; + + /* If list is empty. return B_FALSE */ + if (eager == tcp) { + return (B_FALSE); + } + + /* If allocated, the mp will be freed in tcp_clean_death_wrapper() */ + if ((mp = allocb(0, BPRI_HI)) == NULL) + return (B_FALSE); + + /* + * Take this eager out from the list of droppable eagers since we are + * going to drop it. 
+ */ + MAKE_UNDROPPABLE(eager); + + if (tcp->tcp_connp->conn_debug) { + (void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE, + "tcp_drop_q0: listen half-open queue (max=%d) overflow" + " (%d pending) on %s, drop one", tcps->tcps_conn_req_max_q0, + tcp->tcp_conn_req_cnt_q0, + tcp_display(tcp, NULL, DISP_PORT_ONLY)); + } + + TCPS_BUMP_MIB(tcps, tcpHalfOpenDrop); + + /* Put a reference on the conn as we are enqueueing it in the sqeue */ + CONN_INC_REF(eager->tcp_connp); + + SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, + tcp_clean_death_wrapper, eager->tcp_connp, NULL, + SQ_FILL, SQTAG_TCP_DROP_Q0); + + return (B_TRUE); +} + +/* + * Handle a SYN on an AF_INET6 socket; can be either IPv4 or IPv6 + */ +static mblk_t * +tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, + ip_recv_attr_t *ira) +{ + tcp_t *ltcp = lconnp->conn_tcp; + tcp_t *tcp = connp->conn_tcp; + mblk_t *tpi_mp; + ipha_t *ipha; + ip6_t *ip6h; + sin6_t sin6; + uint_t ifindex = ira->ira_ruifindex; + tcp_stack_t *tcps = tcp->tcp_tcps; + + if (ira->ira_flags & IRAF_IS_IPV4) { + ipha = (ipha_t *)mp->b_rptr; + + connp->conn_ipversion = IPV4_VERSION; + IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_laddr_v6); + IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_faddr_v6); + connp->conn_saddr_v6 = connp->conn_laddr_v6; + + sin6 = sin6_null; + sin6.sin6_addr = connp->conn_faddr_v6; + sin6.sin6_port = connp->conn_fport; + sin6.sin6_family = AF_INET6; + sin6.__sin6_src_id = ip_srcid_find_addr(&connp->conn_laddr_v6, + IPCL_ZONEID(lconnp), tcps->tcps_netstack); + + if (connp->conn_recv_ancillary.crb_recvdstaddr) { + sin6_t sin6d; + + sin6d = sin6_null; + sin6d.sin6_addr = connp->conn_laddr_v6; + sin6d.sin6_port = connp->conn_lport; + sin6d.sin6_family = AF_INET; + tpi_mp = mi_tpi_extconn_ind(NULL, + (char *)&sin6d, sizeof (sin6_t), + (char *)&tcp, + (t_scalar_t)sizeof (intptr_t), + (char *)&sin6d, sizeof (sin6_t), + (t_scalar_t)ltcp->tcp_conn_req_seqnum); + } else { + tpi_mp = mi_tpi_conn_ind(NULL, + (char *)&sin6, sizeof (sin6_t), + (char *)&tcp, (t_scalar_t)sizeof (intptr_t), + (t_scalar_t)ltcp->tcp_conn_req_seqnum); + } + } else { + ip6h = (ip6_t *)mp->b_rptr; + + connp->conn_ipversion = IPV6_VERSION; + connp->conn_laddr_v6 = ip6h->ip6_dst; + connp->conn_faddr_v6 = ip6h->ip6_src; + connp->conn_saddr_v6 = connp->conn_laddr_v6; + + sin6 = sin6_null; + sin6.sin6_addr = connp->conn_faddr_v6; + sin6.sin6_port = connp->conn_fport; + sin6.sin6_family = AF_INET6; + sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; + sin6.__sin6_src_id = ip_srcid_find_addr(&connp->conn_laddr_v6, + IPCL_ZONEID(lconnp), tcps->tcps_netstack); + + if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_src)) { + /* Pass up the scope_id of remote addr */ + sin6.sin6_scope_id = ifindex; + } else { + sin6.sin6_scope_id = 0; + } + if (connp->conn_recv_ancillary.crb_recvdstaddr) { + sin6_t sin6d; + + sin6d = sin6_null; + sin6.sin6_addr = connp->conn_laddr_v6; + sin6d.sin6_port = connp->conn_lport; + sin6d.sin6_family = AF_INET6; + if (IN6_IS_ADDR_LINKSCOPE(&connp->conn_laddr_v6)) + sin6d.sin6_scope_id = ifindex; + + tpi_mp = mi_tpi_extconn_ind(NULL, + (char *)&sin6d, sizeof (sin6_t), + (char *)&tcp, (t_scalar_t)sizeof (intptr_t), + (char *)&sin6d, sizeof (sin6_t), + (t_scalar_t)ltcp->tcp_conn_req_seqnum); + } else { + tpi_mp = mi_tpi_conn_ind(NULL, + (char *)&sin6, sizeof (sin6_t), + (char *)&tcp, (t_scalar_t)sizeof (intptr_t), + (t_scalar_t)ltcp->tcp_conn_req_seqnum); + } + } + + tcp->tcp_mss = tcps->tcps_mss_def_ipv6; + return (tpi_mp); +} + +/* Handle a SYN on an 
AF_INET socket */ +static mblk_t * +tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, mblk_t *mp, + ip_recv_attr_t *ira) +{ + tcp_t *ltcp = lconnp->conn_tcp; + tcp_t *tcp = connp->conn_tcp; + sin_t sin; + mblk_t *tpi_mp = NULL; + tcp_stack_t *tcps = tcp->tcp_tcps; + ipha_t *ipha; + + ASSERT(ira->ira_flags & IRAF_IS_IPV4); + ipha = (ipha_t *)mp->b_rptr; + + connp->conn_ipversion = IPV4_VERSION; + IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &connp->conn_laddr_v6); + IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &connp->conn_faddr_v6); + connp->conn_saddr_v6 = connp->conn_laddr_v6; + + sin = sin_null; + sin.sin_addr.s_addr = connp->conn_faddr_v4; + sin.sin_port = connp->conn_fport; + sin.sin_family = AF_INET; + if (lconnp->conn_recv_ancillary.crb_recvdstaddr) { + sin_t sind; + + sind = sin_null; + sind.sin_addr.s_addr = connp->conn_laddr_v4; + sind.sin_port = connp->conn_lport; + sind.sin_family = AF_INET; + tpi_mp = mi_tpi_extconn_ind(NULL, + (char *)&sind, sizeof (sin_t), (char *)&tcp, + (t_scalar_t)sizeof (intptr_t), (char *)&sind, + sizeof (sin_t), (t_scalar_t)ltcp->tcp_conn_req_seqnum); + } else { + tpi_mp = mi_tpi_conn_ind(NULL, + (char *)&sin, sizeof (sin_t), + (char *)&tcp, (t_scalar_t)sizeof (intptr_t), + (t_scalar_t)ltcp->tcp_conn_req_seqnum); + } + + tcp->tcp_mss = tcps->tcps_mss_def_ipv4; + return (tpi_mp); +} + +/* + * Called via squeue to get on to eager's perimeter. It sends a + * TH_RST if eager is in the fanout table. The listener wants the + * eager to disappear either by means of tcp_eager_blowoff() or + * tcp_eager_cleanup() being called. tcp_eager_kill() can also be + * called (via squeue) if the eager cannot be inserted in the + * fanout table in tcp_input_listener(). + */ +/* ARGSUSED */ +void +tcp_eager_kill(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) +{ + conn_t *econnp = (conn_t *)arg; + tcp_t *eager = econnp->conn_tcp; + tcp_t *listener = eager->tcp_listener; + + /* + * We could be called because listener is closing. Since + * the eager was using listener's queue's, we avoid + * using the listeners queues from now on. + */ + ASSERT(eager->tcp_detached); + econnp->conn_rq = NULL; + econnp->conn_wq = NULL; + + /* + * An eager's conn_fanout will be NULL if it's a duplicate + * for an existing 4-tuples in the conn fanout table. + * We don't want to send an RST out in such case. + */ + if (econnp->conn_fanout != NULL && eager->tcp_state > TCPS_LISTEN) { + tcp_xmit_ctl("tcp_eager_kill, can't wait", + eager, eager->tcp_snxt, 0, TH_RST); + } + + /* We are here because listener wants this eager gone */ + if (listener != NULL) { + mutex_enter(&listener->tcp_eager_lock); + tcp_eager_unlink(eager); + if (eager->tcp_tconnind_started) { + /* + * The eager has sent a conn_ind up to the + * listener but listener decides to close + * instead. We need to drop the extra ref + * placed on eager in tcp_input_data() before + * sending the conn_ind to listener. + */ + CONN_DEC_REF(econnp); + } + mutex_exit(&listener->tcp_eager_lock); + CONN_DEC_REF(listener->tcp_connp); + } + + if (eager->tcp_state != TCPS_CLOSED) + tcp_close_detached(eager); +} + +/* + * Reset any eager connection hanging off this listener marked + * with 'seqnum' and then reclaim it's resources. 
+ */ +boolean_t +tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum) +{ + tcp_t *eager; + mblk_t *mp; + + eager = listener; + mutex_enter(&listener->tcp_eager_lock); + do { + eager = eager->tcp_eager_next_q; + if (eager == NULL) { + mutex_exit(&listener->tcp_eager_lock); + return (B_FALSE); + } + } while (eager->tcp_conn_req_seqnum != seqnum); + + if (eager->tcp_closemp_used) { + mutex_exit(&listener->tcp_eager_lock); + return (B_TRUE); + } + eager->tcp_closemp_used = B_TRUE; + TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15); + CONN_INC_REF(eager->tcp_connp); + mutex_exit(&listener->tcp_eager_lock); + mp = &eager->tcp_closemp; + SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, tcp_eager_kill, + eager->tcp_connp, NULL, SQ_FILL, SQTAG_TCP_EAGER_BLOWOFF); + return (B_TRUE); +} + +/* + * Reset any eager connection hanging off this listener + * and then reclaim it's resources. + */ +void +tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only) +{ + tcp_t *eager; + mblk_t *mp; + tcp_stack_t *tcps = listener->tcp_tcps; + + ASSERT(MUTEX_HELD(&listener->tcp_eager_lock)); + + if (!q0_only) { + /* First cleanup q */ + TCP_STAT(tcps, tcp_eager_blowoff_q); + eager = listener->tcp_eager_next_q; + while (eager != NULL) { + if (!eager->tcp_closemp_used) { + eager->tcp_closemp_used = B_TRUE; + TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15); + CONN_INC_REF(eager->tcp_connp); + mp = &eager->tcp_closemp; + SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, + tcp_eager_kill, eager->tcp_connp, NULL, + SQ_FILL, SQTAG_TCP_EAGER_CLEANUP); + } + eager = eager->tcp_eager_next_q; + } + } + /* Then cleanup q0 */ + TCP_STAT(tcps, tcp_eager_blowoff_q0); + eager = listener->tcp_eager_next_q0; + while (eager != listener) { + if (!eager->tcp_closemp_used) { + eager->tcp_closemp_used = B_TRUE; + TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15); + CONN_INC_REF(eager->tcp_connp); + mp = &eager->tcp_closemp; + SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, mp, + tcp_eager_kill, eager->tcp_connp, NULL, SQ_FILL, + SQTAG_TCP_EAGER_CLEANUP_Q0); + } + eager = eager->tcp_eager_next_q0; + } +} + +/* + * If we are an eager connection hanging off a listener that hasn't + * formally accepted the connection yet, get off his list and blow off + * any data that we have accumulated. + */ +void +tcp_eager_unlink(tcp_t *tcp) +{ + tcp_t *listener = tcp->tcp_listener; + + ASSERT(listener != NULL); + ASSERT(MUTEX_HELD(&listener->tcp_eager_lock)); + if (tcp->tcp_eager_next_q0 != NULL) { + ASSERT(tcp->tcp_eager_prev_q0 != NULL); + + /* Remove the eager tcp from q0 */ + tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = + tcp->tcp_eager_prev_q0; + tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = + tcp->tcp_eager_next_q0; + ASSERT(listener->tcp_conn_req_cnt_q0 > 0); + listener->tcp_conn_req_cnt_q0--; + + tcp->tcp_eager_next_q0 = NULL; + tcp->tcp_eager_prev_q0 = NULL; + + /* + * Take the eager out, if it is in the list of droppable + * eagers. + */ + MAKE_UNDROPPABLE(tcp); + + if (tcp->tcp_syn_rcvd_timeout != 0) { + /* we have timed out before */ + ASSERT(listener->tcp_syn_rcvd_timeout > 0); + listener->tcp_syn_rcvd_timeout--; + } + } else { + tcp_t **tcpp = &listener->tcp_eager_next_q; + tcp_t *prev = NULL; + + for (; tcpp[0]; tcpp = &tcpp[0]->tcp_eager_next_q) { + if (tcpp[0] == tcp) { + if (listener->tcp_eager_last_q == tcp) { + /* + * If we are unlinking the last + * element on the list, adjust + * tail pointer. Set tail pointer + * to nil when list is empty. 
+ */ + ASSERT(tcp->tcp_eager_next_q == NULL); + if (listener->tcp_eager_last_q == + listener->tcp_eager_next_q) { + listener->tcp_eager_last_q = + NULL; + } else { + /* + * We won't get here if there + * is only one eager in the + * list. + */ + ASSERT(prev != NULL); + listener->tcp_eager_last_q = + prev; + } + } + tcpp[0] = tcp->tcp_eager_next_q; + tcp->tcp_eager_next_q = NULL; + tcp->tcp_eager_last_q = NULL; + ASSERT(listener->tcp_conn_req_cnt_q > 0); + listener->tcp_conn_req_cnt_q--; + break; + } + prev = tcpp[0]; + } + } + tcp->tcp_listener = NULL; +} + +/* BEGIN CSTYLED */ +/* + * + * The sockfs ACCEPT path: + * ======================= + * + * The eager is now established in its own perimeter as soon as SYN is + * received in tcp_input_listener(). When sockfs receives conn_ind, it + * completes the accept processing on the acceptor STREAM. The sending + * of conn_ind part is common for both sockfs listener and a TLI/XTI + * listener but a TLI/XTI listener completes the accept processing + * on the listener perimeter. + * + * Common control flow for 3 way handshake: + * ---------------------------------------- + * + * incoming SYN (listener perimeter) -> tcp_input_listener() + * + * incoming SYN-ACK-ACK (eager perim) -> tcp_input_data() + * send T_CONN_IND (listener perim) -> tcp_send_conn_ind() + * + * Sockfs ACCEPT Path: + * ------------------- + * + * open acceptor stream (tcp_open allocates tcp_tli_accept() + * as STREAM entry point) + * + * soaccept() sends T_CONN_RES on the acceptor STREAM to tcp_tli_accept() + * + * tcp_tli_accept() extracts the eager and makes the q->q_ptr <-> eager + * association (we are not behind eager's squeue but sockfs is protecting us + * and no one knows about this stream yet. The STREAMS entry point q->q_info + * is changed to point at tcp_wput(). + * + * tcp_accept_common() sends any deferred eagers via tcp_send_pending() to + * listener (done on listener's perimeter). + * + * tcp_tli_accept() calls tcp_accept_finish() on eagers perimeter to finish + * accept. + * + * TLI/XTI client ACCEPT path: + * --------------------------- + * + * soaccept() sends T_CONN_RES on the listener STREAM. + * + * tcp_tli_accept() -> tcp_accept_swap() complete the processing and send + * a M_SETOPS mblk to eager perimeter to finish accept (tcp_accept_finish()). + * + * Locks: + * ====== + * + * listener->tcp_eager_lock protects the listeners->tcp_eager_next_q0 and + * and listeners->tcp_eager_next_q. + * + * Referencing: + * ============ + * + * 1) We start out in tcp_input_listener by eager placing a ref on + * listener and listener adding eager to listeners->tcp_eager_next_q0. + * + * 2) When a SYN-ACK-ACK arrives, we send the conn_ind to listener. Before + * doing so we place a ref on the eager. This ref is finally dropped at the + * end of tcp_accept_finish() while unwinding from the squeue, i.e. the + * reference is dropped by the squeue framework. + * + * 3) The ref on listener placed in 1 above is dropped in tcp_accept_finish + * + * The reference must be released by the same entity that added the reference + * In the above scheme, the eager is the entity that adds and releases the + * references. Note that tcp_accept_finish executes in the squeue of the eager + * (albeit after it is attached to the acceptor stream). Though 1. executes + * in the listener's squeue, the eager is nascent at this point and the + * reference can be considered to have been added on behalf of the eager. 
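+ *
+ * Condensing points 1) through 3): the eager's reference on the listener
+ * is taken in tcp_input_listener() and dropped in tcp_accept_finish();
+ * the reference on the eager is taken just before the conn_ind is sent
+ * up and is dropped by the squeue framework when unwinding from
+ * tcp_accept_finish().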
+ * + * Eager getting a Reset or listener closing: + * ========================================== + * + * Once the listener and eager are linked, the listener never does the unlink. + * If the listener needs to close, tcp_eager_cleanup() is called which queues + * a message on all eager perimeter. The eager then does the unlink, clears + * any pointers to the listener's queue and drops the reference to the + * listener. The listener waits in tcp_close outside the squeue until its + * refcount has dropped to 1. This ensures that the listener has waited for + * all eagers to clear their association with the listener. + * + * Similarly, if eager decides to go away, it can unlink itself and close. + * When the T_CONN_RES comes down, we check if eager has closed. Note that + * the reference to eager is still valid because of the extra ref we put + * in tcp_send_conn_ind. + * + * Listener can always locate the eager under the protection + * of the listener->tcp_eager_lock, and then do a refhold + * on the eager during the accept processing. + * + * The acceptor stream accesses the eager in the accept processing + * based on the ref placed on eager before sending T_conn_ind. + * The only entity that can negate this refhold is a listener close + * which is mutually exclusive with an active acceptor stream. + * + * Eager's reference on the listener + * =================================== + * + * If the accept happens (even on a closed eager) the eager drops its + * reference on the listener at the start of tcp_accept_finish. If the + * eager is killed due to an incoming RST before the T_conn_ind is sent up, + * the reference is dropped in tcp_closei_local. If the listener closes, + * the reference is dropped in tcp_eager_kill. In all cases the reference + * is dropped while executing in the eager's context (squeue). + */ +/* END CSTYLED */ + +/* Process the SYN packet, mp, directed at the listener 'tcp' */ + +/* + * THIS FUNCTION IS DIRECTLY CALLED BY IP VIA SQUEUE FOR SYN. + * tcp_input_data will not see any packets for listeners since the listener + * has conn_recv set to tcp_input_listener. + */ +/* ARGSUSED */ +static void +tcp_input_listener(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) +{ + tcpha_t *tcpha; + uint32_t seg_seq; + tcp_t *eager; + int err; + conn_t *econnp = NULL; + squeue_t *new_sqp; + mblk_t *mp1; + uint_t ip_hdr_len; + conn_t *lconnp = (conn_t *)arg; + tcp_t *listener = lconnp->conn_tcp; + tcp_stack_t *tcps = listener->tcp_tcps; + ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; + uint_t flags; + mblk_t *tpi_mp; + uint_t ifindex = ira->ira_ruifindex; + boolean_t tlc_set = B_FALSE; + + ip_hdr_len = ira->ira_ip_hdr_length; + tcpha = (tcpha_t *)&mp->b_rptr[ip_hdr_len]; + flags = (unsigned int)tcpha->tha_flags & 0xFF; + + if (!(flags & TH_SYN)) { + if ((flags & TH_RST) || (flags & TH_URG)) { + freemsg(mp); + return; + } + if (flags & TH_ACK) { + /* Note this executes in listener's squeue */ + tcp_xmit_listeners_reset(mp, ira, ipst, lconnp); + return; + } + + freemsg(mp); + return; + } + + if (listener->tcp_state != TCPS_LISTEN) + goto error2; + + ASSERT(IPCL_IS_BOUND(lconnp)); + + mutex_enter(&listener->tcp_eager_lock); + + /* + * The system is under memory pressure, so we need to do our part + * to relieve the pressure. So we only accept new request if there + * is nothing waiting to be accepted or waiting to complete the 3-way + * handshake. 
This means that busy listener will not get too many + * new requests which they cannot handle in time while non-busy + * listener is still functioning properly. + */ + if (tcps->tcps_reclaim && (listener->tcp_conn_req_cnt_q > 0 || + listener->tcp_conn_req_cnt_q0 > 0)) { + mutex_exit(&listener->tcp_eager_lock); + TCP_STAT(tcps, tcp_listen_mem_drop); + goto error2; + } + + if (listener->tcp_conn_req_cnt_q >= listener->tcp_conn_req_max) { + mutex_exit(&listener->tcp_eager_lock); + TCP_STAT(tcps, tcp_listendrop); + TCPS_BUMP_MIB(tcps, tcpListenDrop); + if (lconnp->conn_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, + "tcp_input_listener: listen backlog (max=%d) " + "overflow (%d pending) on %s", + listener->tcp_conn_req_max, + listener->tcp_conn_req_cnt_q, + tcp_display(listener, NULL, DISP_PORT_ONLY)); + } + goto error2; + } + + if (listener->tcp_conn_req_cnt_q0 >= + listener->tcp_conn_req_max + tcps->tcps_conn_req_max_q0) { + /* + * Q0 is full. Drop a pending half-open req from the queue + * to make room for the new SYN req. Also mark the time we + * drop a SYN. + * + * A more aggressive defense against SYN attack will + * be to set the "tcp_syn_defense" flag now. + */ + TCP_STAT(tcps, tcp_listendropq0); + listener->tcp_last_rcv_lbolt = ddi_get_lbolt64(); + if (!tcp_drop_q0(listener)) { + mutex_exit(&listener->tcp_eager_lock); + TCPS_BUMP_MIB(tcps, tcpListenDropQ0); + if (lconnp->conn_debug) { + (void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE, + "tcp_input_listener: listen half-open " + "queue (max=%d) full (%d pending) on %s", + tcps->tcps_conn_req_max_q0, + listener->tcp_conn_req_cnt_q0, + tcp_display(listener, NULL, + DISP_PORT_ONLY)); + } + goto error2; + } + } + + /* + * Enforce the limit set on the number of connections per listener. + * Note that tlc_cnt starts with 1. So need to add 1 to tlc_max + * for comparison. + */ + if (listener->tcp_listen_cnt != NULL) { + tcp_listen_cnt_t *tlc = listener->tcp_listen_cnt; + int64_t now; + + if (atomic_add_32_nv(&tlc->tlc_cnt, 1) > tlc->tlc_max + 1) { + mutex_exit(&listener->tcp_eager_lock); + now = ddi_get_lbolt64(); + atomic_add_32(&tlc->tlc_cnt, -1); + TCP_STAT(tcps, tcp_listen_cnt_drop); + tlc->tlc_drop++; + if (now - tlc->tlc_report_time > + MSEC_TO_TICK(TCP_TLC_REPORT_INTERVAL)) { + zcmn_err(lconnp->conn_zoneid, CE_WARN, + "Listener (port %d) connection max (%u) " + "reached: %u attempts dropped total\n", + ntohs(listener->tcp_connp->conn_lport), + tlc->tlc_max, tlc->tlc_drop); + tlc->tlc_report_time = now; + } + goto error2; + } + tlc_set = B_TRUE; + } + + mutex_exit(&listener->tcp_eager_lock); + + /* + * IP sets ira_sqp to either the senders conn_sqp (for loopback) + * or based on the ring (for packets from GLD). Otherwise it is + * set based on lbolt i.e., a somewhat random number. 
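The per-listener connection limit above takes a slot optimistically: tlc_cnt is bumped with atomic_add_32_nv() and the increment is rolled back if the new value exceeds tlc_max + 1 (tlc_cnt starts at 1, so the listener itself accounts for one count). A stand-alone sketch of that increment-then-undo pattern using C11 atomics; listen_cnt_t and listen_cnt_take() are illustrative names, not the kernel's tcp_listen_cnt_t.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef struct {
    atomic_uint tlc_cnt;    /* starts at 1, like the kernel counter */
    uint32_t    tlc_max;
    uint32_t    tlc_drop;
} listen_cnt_t;

/* Optimistically take a slot; undo the increment if the limit was exceeded. */
static bool
listen_cnt_take(listen_cnt_t *tlc)
{
    if (atomic_fetch_add(&tlc->tlc_cnt, 1) + 1 > tlc->tlc_max + 1) {
        atomic_fetch_sub(&tlc->tlc_cnt, 1);     /* roll back, as above */
        tlc->tlc_drop++;
        return (false);
    }
    return (true);
}

int
main(void)
{
    listen_cnt_t tlc = { 1, 2, 0 };     /* allow two connections */

    for (int i = 0; i < 4; i++)
        printf("SYN %d: %s\n", i,
            listen_cnt_take(&tlc) ? "accepted" : "dropped");
    printf("drops: %u\n", tlc.tlc_drop);        /* prints 2 */
    return (0);
}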
+ */ + ASSERT(ira->ira_sqp != NULL); + new_sqp = ira->ira_sqp; + + econnp = (conn_t *)tcp_get_conn(arg2, tcps); + if (econnp == NULL) + goto error2; + + ASSERT(econnp->conn_netstack == lconnp->conn_netstack); + econnp->conn_sqp = new_sqp; + econnp->conn_initial_sqp = new_sqp; + econnp->conn_ixa->ixa_sqp = new_sqp; + + econnp->conn_fport = tcpha->tha_lport; + econnp->conn_lport = tcpha->tha_fport; + + err = conn_inherit_parent(lconnp, econnp); + if (err != 0) + goto error3; + + /* We already know the laddr of the new connection is ours */ + econnp->conn_ixa->ixa_src_generation = ipst->ips_src_generation; + + ASSERT(OK_32PTR(mp->b_rptr)); + ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION || + IPH_HDR_VERSION(mp->b_rptr) == IPV6_VERSION); + + if (lconnp->conn_family == AF_INET) { + ASSERT(IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION); + tpi_mp = tcp_conn_create_v4(lconnp, econnp, mp, ira); + } else { + tpi_mp = tcp_conn_create_v6(lconnp, econnp, mp, ira); + } + + if (tpi_mp == NULL) + goto error3; + + eager = econnp->conn_tcp; + eager->tcp_detached = B_TRUE; + SOCK_CONNID_INIT(eager->tcp_connid); + + tcp_init_values(eager); + + ASSERT((econnp->conn_ixa->ixa_flags & + (IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE | + IXAF_VERIFY_PMTU | IXAF_VERIFY_LSO)) == + (IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE | + IXAF_VERIFY_PMTU | IXAF_VERIFY_LSO)); + + if (!tcps->tcps_dev_flow_ctl) + econnp->conn_ixa->ixa_flags |= IXAF_NO_DEV_FLOW_CTL; + + /* Prepare for diffing against previous packets */ + eager->tcp_recvifindex = 0; + eager->tcp_recvhops = 0xffffffffU; + + if (!(ira->ira_flags & IRAF_IS_IPV4) && econnp->conn_bound_if == 0) { + if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_faddr_v6) || + IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6)) { + econnp->conn_incoming_ifindex = ifindex; + econnp->conn_ixa->ixa_flags |= IXAF_SCOPEID_SET; + econnp->conn_ixa->ixa_scopeid = ifindex; + } + } + + if ((ira->ira_flags & (IRAF_IS_IPV4|IRAF_IPV4_OPTIONS)) == + (IRAF_IS_IPV4|IRAF_IPV4_OPTIONS) && + tcps->tcps_rev_src_routes) { + ipha_t *ipha = (ipha_t *)mp->b_rptr; + ip_pkt_t *ipp = &econnp->conn_xmit_ipp; + + /* Source routing option copyover (reverse it) */ + err = ip_find_hdr_v4(ipha, ipp, B_TRUE); + if (err != 0) { + freemsg(tpi_mp); + goto error3; + } + ip_pkt_source_route_reverse_v4(ipp); + } + + ASSERT(eager->tcp_conn.tcp_eager_conn_ind == NULL); + ASSERT(!eager->tcp_tconnind_started); + /* + * If the SYN came with a credential, it's a loopback packet or a + * labeled packet; attach the credential to the TPI message. + */ + if (ira->ira_cred != NULL) + mblk_setcred(tpi_mp, ira->ira_cred, ira->ira_cpid); + + eager->tcp_conn.tcp_eager_conn_ind = tpi_mp; + + /* Inherit the listener's SSL protection state */ + if ((eager->tcp_kssl_ent = listener->tcp_kssl_ent) != NULL) { + kssl_hold_ent(eager->tcp_kssl_ent); + eager->tcp_kssl_pending = B_TRUE; + } + + /* Inherit the listener's non-STREAMS flag */ + if (IPCL_IS_NONSTR(lconnp)) { + econnp->conn_flags |= IPCL_NONSTR; + } + + ASSERT(eager->tcp_ordrel_mp == NULL); + + if (!IPCL_IS_NONSTR(econnp)) { + /* + * Pre-allocate the T_ordrel_ind mblk for TPI socket so that + * at close time, we will always have that to send up. + * Otherwise, we need to do special handling in case the + * allocation fails at that time. + */ + if ((eager->tcp_ordrel_mp = mi_tpi_ordrel_ind()) == NULL) + goto error3; + } + /* + * Now that the IP addresses and ports are setup in econnp we + * can do the IPsec policy work. 
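Pre-allocating tcp_ordrel_mp here is a reserve-early pattern: the one message that must be deliverable at close time is obtained while refusing the new connection is still a cheap failure path, so teardown never depends on an allocation succeeding. A minimal user-space sketch of the same idea; conn_t, conn_setup() and conn_close() below are hypothetical.

#include <stdio.h>
#include <stdlib.h>

typedef struct conn {
    void *ordrel_msg;   /* reserved at setup, consumed at close */
} conn_t;

static int
conn_setup(conn_t *c)
{
    c->ordrel_msg = malloc(64);
    if (c->ordrel_msg == NULL)
        return (-1);    /* fail the connection now, not at close time */
    return (0);
}

static void
conn_close(conn_t *c)
{
    /* Guaranteed to have something to send up; no allocation here. */
    printf("ordered release sent using reserved buffer %p\n",
        c->ordrel_msg);
    free(c->ordrel_msg);
    c->ordrel_msg = NULL;
}

int
main(void)
{
    conn_t c;

    if (conn_setup(&c) == 0)
        conn_close(&c);
    return (0);
}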
+ */ + if (ira->ira_flags & IRAF_IPSEC_SECURE) { + if (lconnp->conn_policy != NULL) { + /* + * Inherit the policy from the listener; use + * actions from ira + */ + if (!ip_ipsec_policy_inherit(econnp, lconnp, ira)) { + CONN_DEC_REF(econnp); + freemsg(mp); + goto error3; + } + } + } + + /* Inherit various TCP parameters from the listener */ + eager->tcp_naglim = listener->tcp_naglim; + eager->tcp_first_timer_threshold = listener->tcp_first_timer_threshold; + eager->tcp_second_timer_threshold = + listener->tcp_second_timer_threshold; + eager->tcp_first_ctimer_threshold = + listener->tcp_first_ctimer_threshold; + eager->tcp_second_ctimer_threshold = + listener->tcp_second_ctimer_threshold; + + /* + * tcp_set_destination() may set tcp_rwnd according to the route + * metrics. If it does not, the eager's receive window will be set + * to the listener's receive window later in this function. + */ + eager->tcp_rwnd = 0; + + /* + * Inherit listener's tcp_init_cwnd. Need to do this before + * calling tcp_process_options() which set the initial cwnd. + */ + eager->tcp_init_cwnd = listener->tcp_init_cwnd; + + if (is_system_labeled()) { + ip_xmit_attr_t *ixa = econnp->conn_ixa; + + ASSERT(ira->ira_tsl != NULL); + /* Discard any old label */ + if (ixa->ixa_free_flags & IXA_FREE_TSL) { + ASSERT(ixa->ixa_tsl != NULL); + label_rele(ixa->ixa_tsl); + ixa->ixa_free_flags &= ~IXA_FREE_TSL; + ixa->ixa_tsl = NULL; + } + if ((lconnp->conn_mlp_type != mlptSingle || + lconnp->conn_mac_mode != CONN_MAC_DEFAULT) && + ira->ira_tsl != NULL) { + /* + * If this is an MLP connection or a MAC-Exempt + * connection with an unlabeled node, packets are to be + * exchanged using the security label of the received + * SYN packet instead of the server application's label. + * tsol_check_dest called from ip_set_destination + * might later update TSF_UNLABELED by replacing + * ixa_tsl with a new label. + */ + label_hold(ira->ira_tsl); + ip_xmit_attr_replace_tsl(ixa, ira->ira_tsl); + DTRACE_PROBE2(mlp_syn_accept, conn_t *, + econnp, ts_label_t *, ixa->ixa_tsl) + } else { + ixa->ixa_tsl = crgetlabel(econnp->conn_cred); + DTRACE_PROBE2(syn_accept, conn_t *, + econnp, ts_label_t *, ixa->ixa_tsl) + } + /* + * conn_connect() called from tcp_set_destination will verify + * the destination is allowed to receive packets at the + * security label of the SYN-ACK we are generating. As part of + * that, tsol_check_dest() may create a new effective label for + * this connection. + * Finally conn_connect() will call conn_update_label. + * All that remains for TCP to do is to call + * conn_build_hdr_template which is done as part of + * tcp_set_destination. + */ + } + + /* + * Since we will clear tcp_listener before we clear tcp_detached + * in the accept code we need tcp_hard_binding aka tcp_accept_inprogress + * so we can tell a TCP_DETACHED_NONEAGER apart. + */ + eager->tcp_hard_binding = B_TRUE; + + tcp_bind_hash_insert(&tcps->tcps_bind_fanout[ + TCP_BIND_HASH(econnp->conn_lport)], eager, 0); + + CL_INET_CONNECT(econnp, B_FALSE, err); + if (err != 0) { + tcp_bind_hash_remove(eager); + goto error3; + } + + /* + * No need to check for multicast destination since ip will only pass + * up multicasts to those that have expressed interest + * TODO: what about rejecting broadcasts? + * Also check that source is not a multicast or broadcast address. + */ + eager->tcp_state = TCPS_SYN_RCVD; + SOCK_CONNID_BUMP(eager->tcp_connid); + + /* + * Adapt our mss, ttl, ... based on the remote address. 
+ */ + + if (tcp_set_destination(eager) != 0) { + TCPS_BUMP_MIB(tcps, tcpAttemptFails); + /* Undo the bind_hash_insert */ + tcp_bind_hash_remove(eager); + goto error3; + } + + /* Process all TCP options. */ + tcp_process_options(eager, tcpha); + + /* Is the other end ECN capable? */ + if (tcps->tcps_ecn_permitted >= 1 && + (tcpha->tha_flags & (TH_ECE|TH_CWR)) == (TH_ECE|TH_CWR)) { + eager->tcp_ecn_ok = B_TRUE; + } + + /* + * The listener's conn_rcvbuf should be the default window size or a + * window size changed via SO_RCVBUF option. First round up the + * eager's tcp_rwnd to the nearest MSS. Then find out the window + * scale option value if needed. Call tcp_rwnd_set() to finish the + * setting. + * + * Note if there is a rpipe metric associated with the remote host, + * we should not inherit receive window size from listener. + */ + eager->tcp_rwnd = MSS_ROUNDUP( + (eager->tcp_rwnd == 0 ? econnp->conn_rcvbuf : + eager->tcp_rwnd), eager->tcp_mss); + if (eager->tcp_snd_ws_ok) + tcp_set_ws_value(eager); + /* + * Note that this is the only place tcp_rwnd_set() is called for + * accepting a connection. We need to call it here instead of + * after the 3-way handshake because we need to tell the other + * side our rwnd in the SYN-ACK segment. + */ + (void) tcp_rwnd_set(eager, eager->tcp_rwnd); + + ASSERT(eager->tcp_connp->conn_rcvbuf != 0 && + eager->tcp_connp->conn_rcvbuf == eager->tcp_rwnd); + + ASSERT(econnp->conn_rcvbuf != 0 && + econnp->conn_rcvbuf == eager->tcp_rwnd); + + /* Put a ref on the listener for the eager. */ + CONN_INC_REF(lconnp); + mutex_enter(&listener->tcp_eager_lock); + listener->tcp_eager_next_q0->tcp_eager_prev_q0 = eager; + eager->tcp_eager_next_q0 = listener->tcp_eager_next_q0; + listener->tcp_eager_next_q0 = eager; + eager->tcp_eager_prev_q0 = listener; + + /* Set tcp_listener before adding it to tcp_conn_fanout */ + eager->tcp_listener = listener; + eager->tcp_saved_listener = listener; + + /* + * Set tcp_listen_cnt so that when the connection is done, the counter + * is decremented. + */ + eager->tcp_listen_cnt = listener->tcp_listen_cnt; + + /* + * Tag this detached tcp vector for later retrieval + * by our listener client in tcp_accept(). + */ + eager->tcp_conn_req_seqnum = listener->tcp_conn_req_seqnum; + listener->tcp_conn_req_cnt_q0++; + if (++listener->tcp_conn_req_seqnum == -1) { + /* + * -1 is "special" and defined in TPI as something + * that should never be used in T_CONN_IND + */ + ++listener->tcp_conn_req_seqnum; + } + mutex_exit(&listener->tcp_eager_lock); + + if (listener->tcp_syn_defense) { + /* Don't drop the SYN that comes from a good IP source */ + ipaddr_t *addr_cache; + + addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache); + if (addr_cache != NULL && econnp->conn_faddr_v4 == + addr_cache[IP_ADDR_CACHE_HASH(econnp->conn_faddr_v4)]) { + eager->tcp_dontdrop = B_TRUE; + } + } + + /* + * We need to insert the eager in its own perimeter but as soon + * as we do that, we expose the eager to the classifier and + * should not touch any field outside the eager's perimeter. + * So do all the work necessary before inserting the eager + * in its own perimeter. Be optimistic that conn_connect() + * will succeed but undo everything if it fails. 
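The eager's receive window is rounded up to a whole number of segments before tcp_rwnd_set(), so the window advertised in the SYN-ACK is a multiple of the MSS. A small sketch of that rounding; mss_roundup() below is the usual round-up-to-a-multiple idiom and is only assumed to match what MSS_ROUNDUP() does.

#include <stdio.h>
#include <stdint.h>

static uint32_t
mss_roundup(uint32_t value, uint32_t mss)
{
    return (((value - 1) / mss + 1) * mss);
}

int
main(void)
{
    uint32_t rcvbuf = 128 * 1024;   /* e.g. the listener's conn_rcvbuf */
    uint32_t mss = 1460;
    uint32_t rwnd = mss_roundup(rcvbuf, mss);

    /* 131072 rounds up to 131400, i.e. exactly 90 full segments. */
    printf("rwnd = %u (%u segments)\n", rwnd, rwnd / mss);
    return (0);
}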
+ */ + seg_seq = ntohl(tcpha->tha_seq); + eager->tcp_irs = seg_seq; + eager->tcp_rack = seg_seq; + eager->tcp_rnxt = seg_seq + 1; + eager->tcp_tcpha->tha_ack = htonl(eager->tcp_rnxt); + TCPS_BUMP_MIB(tcps, tcpPassiveOpens); + eager->tcp_state = TCPS_SYN_RCVD; + mp1 = tcp_xmit_mp(eager, eager->tcp_xmit_head, eager->tcp_mss, + NULL, NULL, eager->tcp_iss, B_FALSE, NULL, B_FALSE); + if (mp1 == NULL) { + /* + * Increment the ref count as we are going to + * enqueueing an mp in squeue + */ + CONN_INC_REF(econnp); + goto error; + } + + /* + * We need to start the rto timer. In normal case, we start + * the timer after sending the packet on the wire (or at + * least believing that packet was sent by waiting for + * conn_ip_output() to return). Since this is the first packet + * being sent on the wire for the eager, our initial tcp_rto + * is at least tcp_rexmit_interval_min which is a fairly + * large value to allow the algorithm to adjust slowly to large + * fluctuations of RTT during first few transmissions. + * + * Starting the timer first and then sending the packet in this + * case shouldn't make much difference since tcp_rexmit_interval_min + * is of the order of several 100ms and starting the timer + * first and then sending the packet will result in difference + * of few micro seconds. + * + * Without this optimization, we are forced to hold the fanout + * lock across the ipcl_bind_insert() and sending the packet + * so that we don't race against an incoming packet (maybe RST) + * for this eager. + * + * It is necessary to acquire an extra reference on the eager + * at this point and hold it until after tcp_send_data() to + * ensure against an eager close race. + */ + + CONN_INC_REF(econnp); + + TCP_TIMER_RESTART(eager, eager->tcp_rto); + + /* + * Insert the eager in its own perimeter now. We are ready to deal + * with any packets on eager. + */ + if (ipcl_conn_insert(econnp) != 0) + goto error; + + ASSERT(econnp->conn_ixa->ixa_notify_cookie == econnp->conn_tcp); + freemsg(mp); + /* + * Send the SYN-ACK. Use the right squeue so that conn_ixa is + * only used by one thread at a time. + */ + if (econnp->conn_sqp == lconnp->conn_sqp) { + (void) conn_ip_output(mp1, econnp->conn_ixa); + CONN_DEC_REF(econnp); + } else { + SQUEUE_ENTER_ONE(econnp->conn_sqp, mp1, tcp_send_synack, + econnp, NULL, SQ_PROCESS, SQTAG_TCP_SEND_SYNACK); + } + return; +error: + freemsg(mp1); + eager->tcp_closemp_used = B_TRUE; + TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15); + mp1 = &eager->tcp_closemp; + SQUEUE_ENTER_ONE(econnp->conn_sqp, mp1, tcp_eager_kill, + econnp, NULL, SQ_FILL, SQTAG_TCP_CONN_REQ_2); + + /* + * If a connection already exists, send the mp to that connections so + * that it can be appropriately dealt with. + */ + ipst = tcps->tcps_netstack->netstack_ip; + + if ((econnp = ipcl_classify(mp, ira, ipst)) != NULL) { + if (!IPCL_IS_CONNECTED(econnp)) { + /* + * Something bad happened. ipcl_conn_insert() + * failed because a connection already existed + * in connected hash but we can't find it + * anymore (someone blew it away). Just + * free this message and hopefully remote + * will retransmit at which time the SYN can be + * treated as a new connection or dealth with + * a TH_RST if a connection already exists. 
+ */ + CONN_DEC_REF(econnp); + freemsg(mp); + } else { + SQUEUE_ENTER_ONE(econnp->conn_sqp, mp, tcp_input_data, + econnp, ira, SQ_FILL, SQTAG_TCP_CONN_REQ_1); + } + } else { + /* Nobody wants this packet */ + freemsg(mp); + } + return; +error3: + CONN_DEC_REF(econnp); +error2: + freemsg(mp); + if (tlc_set) + atomic_add_32(&listener->tcp_listen_cnt->tlc_cnt, -1); +} + +/* + * In an ideal case of vertical partition in NUMA architecture, its + * beneficial to have the listener and all the incoming connections + * tied to the same squeue. The other constraint is that incoming + * connections should be tied to the squeue attached to interrupted + * CPU for obvious locality reason so this leaves the listener to + * be tied to the same squeue. Our only problem is that when listener + * is binding, the CPU that will get interrupted by the NIC whose + * IP address the listener is binding to is not even known. So + * the code below allows us to change that binding at the time the + * CPU is interrupted by virtue of incoming connection's squeue. + * + * This is usefull only in case of a listener bound to a specific IP + * address. For other kind of listeners, they get bound the + * very first time and there is no attempt to rebind them. + */ +void +tcp_input_listener_unbound(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *ira) +{ + conn_t *connp = (conn_t *)arg; + squeue_t *sqp = (squeue_t *)arg2; + squeue_t *new_sqp; + uint32_t conn_flags; + + /* + * IP sets ira_sqp to either the senders conn_sqp (for loopback) + * or based on the ring (for packets from GLD). Otherwise it is + * set based on lbolt i.e., a somewhat random number. + */ + ASSERT(ira->ira_sqp != NULL); + new_sqp = ira->ira_sqp; + + if (connp->conn_fanout == NULL) + goto done; + + if (!(connp->conn_flags & IPCL_FULLY_BOUND)) { + mutex_enter(&connp->conn_fanout->connf_lock); + mutex_enter(&connp->conn_lock); + /* + * No one from read or write side can access us now + * except for already queued packets on this squeue. + * But since we haven't changed the squeue yet, they + * can't execute. If they are processed after we have + * changed the squeue, they are sent back to the + * correct squeue down below. + * But a listner close can race with processing of + * incoming SYN. If incoming SYN processing changes + * the squeue then the listener close which is waiting + * to enter the squeue would operate on the wrong + * squeue. Hence we don't change the squeue here unless + * the refcount is exactly the minimum refcount. The + * minimum refcount of 4 is counted as - 1 each for + * TCP and IP, 1 for being in the classifier hash, and + * 1 for the mblk being processed. + */ + + if (connp->conn_ref != 4 || + connp->conn_tcp->tcp_state != TCPS_LISTEN) { + mutex_exit(&connp->conn_lock); + mutex_exit(&connp->conn_fanout->connf_lock); + goto done; + } + if (connp->conn_sqp != new_sqp) { + while (connp->conn_sqp != new_sqp) + (void) casptr(&connp->conn_sqp, sqp, new_sqp); + /* No special MT issues for outbound ixa_sqp hint */ + connp->conn_ixa->ixa_sqp = new_sqp; + } + + do { + conn_flags = connp->conn_flags; + conn_flags |= IPCL_FULLY_BOUND; + (void) cas32(&connp->conn_flags, connp->conn_flags, + conn_flags); + } while (!(connp->conn_flags & IPCL_FULLY_BOUND)); + + mutex_exit(&connp->conn_fanout->connf_lock); + mutex_exit(&connp->conn_lock); + + /* + * Assume we have picked a good squeue for the listener. Make + * subsequent SYNs not try to change the squeue. 
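The squeue switch above avoids holding a lock across the store by retrying casptr()/cas32() until the pointer or the flag word holds the desired value. A stand-alone model of the cas32() retry loop using C11 atomics; the flag value and helper name are illustrative only.

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define FULLY_BOUND 0x1 /* illustrative bit, not the real IPCL flag value */

/* Retry a compare-and-swap until the flag word contains the wanted bit. */
static void
set_flag(_Atomic uint32_t *flags, uint32_t bit)
{
    uint32_t old;

    do {
        old = atomic_load(flags);
    } while (!atomic_compare_exchange_weak(flags, &old, old | bit));
}

int
main(void)
{
    _Atomic uint32_t conn_flags = 0;

    set_flag(&conn_flags, FULLY_BOUND);
    printf("conn_flags = 0x%x\n", (unsigned int)atomic_load(&conn_flags));
    return (0);
}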
+ */ + connp->conn_recv = tcp_input_listener; + } + +done: + if (connp->conn_sqp != sqp) { + CONN_INC_REF(connp); + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, connp->conn_recv, connp, + ira, SQ_FILL, SQTAG_TCP_CONN_REQ_UNBOUND); + } else { + tcp_input_listener(connp, mp, sqp, ira); + } +} + +/* + * Send up all messages queued on tcp_rcv_list. + */ +uint_t +tcp_rcv_drain(tcp_t *tcp) +{ + mblk_t *mp; + uint_t ret = 0; +#ifdef DEBUG + uint_t cnt = 0; +#endif + queue_t *q = tcp->tcp_connp->conn_rq; + + /* Can't drain on an eager connection */ + if (tcp->tcp_listener != NULL) + return (ret); + + /* Can't be a non-STREAMS connection */ + ASSERT(!IPCL_IS_NONSTR(tcp->tcp_connp)); + + /* No need for the push timer now. */ + if (tcp->tcp_push_tid != 0) { + (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid); + tcp->tcp_push_tid = 0; + } + + /* + * Handle two cases here: we are currently fused or we were + * previously fused and have some urgent data to be delivered + * upstream. The latter happens because we either ran out of + * memory or were detached and therefore sending the SIGURG was + * deferred until this point. In either case we pass control + * over to tcp_fuse_rcv_drain() since it may need to complete + * some work. + */ + if ((tcp->tcp_fused || tcp->tcp_fused_sigurg)) { + ASSERT(IPCL_IS_NONSTR(tcp->tcp_connp) || + tcp->tcp_fused_sigurg_mp != NULL); + if (tcp_fuse_rcv_drain(q, tcp, tcp->tcp_fused ? NULL : + &tcp->tcp_fused_sigurg_mp)) + return (ret); + } + + while ((mp = tcp->tcp_rcv_list) != NULL) { + tcp->tcp_rcv_list = mp->b_next; + mp->b_next = NULL; +#ifdef DEBUG + cnt += msgdsize(mp); +#endif + /* Does this need SSL processing first? */ + if ((tcp->tcp_kssl_ctx != NULL) && (DB_TYPE(mp) == M_DATA)) { + DTRACE_PROBE1(kssl_mblk__ksslinput_rcvdrain, + mblk_t *, mp); + tcp_kssl_input(tcp, mp, NULL); + continue; + } + putnext(q, mp); + } +#ifdef DEBUG + ASSERT(cnt == tcp->tcp_rcv_cnt); +#endif + tcp->tcp_rcv_last_head = NULL; + tcp->tcp_rcv_last_tail = NULL; + tcp->tcp_rcv_cnt = 0; + + if (canputnext(q)) + return (tcp_rwnd_reopen(tcp)); + + return (ret); +} + +/* + * Queue data on tcp_rcv_list which is a b_next chain. + * tcp_rcv_last_head/tail is the last element of this chain. + * Each element of the chain is a b_cont chain. + * + * M_DATA messages are added to the current element. + * Other messages are added as new (b_next) elements. + */ +void +tcp_rcv_enqueue(tcp_t *tcp, mblk_t *mp, uint_t seg_len, cred_t *cr) +{ + ASSERT(seg_len == msgdsize(mp)); + ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_rcv_last_head != NULL); + + if (is_system_labeled()) { + ASSERT(cr != NULL || msg_getcred(mp, NULL) != NULL); + /* + * Provide for protocols above TCP such as RPC. NOPID leaves + * db_cpid unchanged. + * The cred could have already been set. 
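tcp_rcv_list is a list of messages linked through b_next, each of which is itself a chain of buffers linked through b_cont; M_DATA is appended to the tail message and any other message type starts a new b_next element. A toy model of the append path, ignoring the message-type check; msg_t and rcvq_t are illustrative, not STREAMS mblks.

#include <stdio.h>
#include <stddef.h>
#include <string.h>

typedef struct msg {
    struct msg *b_next;     /* starts a new message */
    struct msg *b_cont;     /* continues the current message */
    const char *data;
} msg_t;

typedef struct {
    msg_t *rcv_list;        /* head of the b_next chain */
    msg_t *rcv_last_head;   /* last b_next element */
    msg_t *rcv_last_tail;   /* last b_cont block of that element */
    size_t rcv_cnt;         /* bytes queued */
} rcvq_t;

/* Append data the way tcp_rcv_enqueue() handles M_DATA. */
static void
rcv_enqueue(rcvq_t *q, msg_t *mp)
{
    if (q->rcv_list == NULL)
        q->rcv_list = q->rcv_last_head = mp;
    else
        q->rcv_last_tail->b_cont = mp;  /* extend the tail message */

    for (;;) {
        q->rcv_cnt += strlen(mp->data);
        if (mp->b_cont == NULL)
            break;
        mp = mp->b_cont;
    }
    q->rcv_last_tail = mp;
}

int
main(void)
{
    msg_t a = { NULL, NULL, "seg1" }, b = { NULL, NULL, "seg2" };
    rcvq_t q = { NULL, NULL, NULL, 0 };

    rcv_enqueue(&q, &a);
    rcv_enqueue(&q, &b);
    printf("%zu bytes in one b_next element of two b_cont blocks\n",
        q.rcv_cnt);
    return (0);
}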
+ */ + if (cr != NULL) + mblk_setcred(mp, cr, NOPID); + } + + if (tcp->tcp_rcv_list == NULL) { + ASSERT(tcp->tcp_rcv_last_head == NULL); + tcp->tcp_rcv_list = mp; + tcp->tcp_rcv_last_head = mp; + } else if (DB_TYPE(mp) == DB_TYPE(tcp->tcp_rcv_last_head)) { + tcp->tcp_rcv_last_tail->b_cont = mp; + } else { + tcp->tcp_rcv_last_head->b_next = mp; + tcp->tcp_rcv_last_head = mp; + } + + while (mp->b_cont) + mp = mp->b_cont; + + tcp->tcp_rcv_last_tail = mp; + tcp->tcp_rcv_cnt += seg_len; + tcp->tcp_rwnd -= seg_len; +} + +/* Generate an ACK-only (no data) segment for a TCP endpoint */ +mblk_t * +tcp_ack_mp(tcp_t *tcp) +{ + uint32_t seq_no; + tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; + + /* + * There are a few cases to be considered while setting the sequence no. + * Essentially, we can come here while processing an unacceptable pkt + * in the TCPS_SYN_RCVD state, in which case we set the sequence number + * to snxt (per RFC 793), note the swnd wouldn't have been set yet. + * If we are here for a zero window probe, stick with suna. In all + * other cases, we check if suna + swnd encompasses snxt and set + * the sequence number to snxt, if so. If snxt falls outside the + * window (the receiver probably shrunk its window), we will go with + * suna + swnd, otherwise the sequence no will be unacceptable to the + * receiver. + */ + if (tcp->tcp_zero_win_probe) { + seq_no = tcp->tcp_suna; + } else if (tcp->tcp_state == TCPS_SYN_RCVD) { + ASSERT(tcp->tcp_swnd == 0); + seq_no = tcp->tcp_snxt; + } else { + seq_no = SEQ_GT(tcp->tcp_snxt, + (tcp->tcp_suna + tcp->tcp_swnd)) ? + (tcp->tcp_suna + tcp->tcp_swnd) : tcp->tcp_snxt; + } + + if (tcp->tcp_valid_bits) { + /* + * For the complex case where we have to send some + * controls (FIN or SYN), let tcp_xmit_mp do it. + */ + return (tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, seq_no, B_FALSE, + NULL, B_FALSE)); + } else { + /* Generate a simple ACK */ + int data_length; + uchar_t *rptr; + tcpha_t *tcpha; + mblk_t *mp1; + int32_t total_hdr_len; + int32_t tcp_hdr_len; + int32_t num_sack_blk = 0; + int32_t sack_opt_len; + ip_xmit_attr_t *ixa = connp->conn_ixa; + + /* + * Allocate space for TCP + IP headers + * and link-level header + */ + if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { + num_sack_blk = MIN(tcp->tcp_max_sack_blk, + tcp->tcp_num_sack_blk); + sack_opt_len = num_sack_blk * sizeof (sack_blk_t) + + TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN; + total_hdr_len = connp->conn_ht_iphc_len + sack_opt_len; + tcp_hdr_len = connp->conn_ht_ulp_len + sack_opt_len; + } else { + total_hdr_len = connp->conn_ht_iphc_len; + tcp_hdr_len = connp->conn_ht_ulp_len; + } + mp1 = allocb(total_hdr_len + tcps->tcps_wroff_xtra, BPRI_MED); + if (!mp1) + return (NULL); + + /* Update the latest receive window size in TCP header. */ + tcp->tcp_tcpha->tha_win = + htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); + /* copy in prototype TCP + IP header */ + rptr = mp1->b_rptr + tcps->tcps_wroff_xtra; + mp1->b_rptr = rptr; + mp1->b_wptr = rptr + total_hdr_len; + bcopy(connp->conn_ht_iphc, rptr, connp->conn_ht_iphc_len); + + tcpha = (tcpha_t *)&rptr[ixa->ixa_ip_hdr_length]; + + /* Set the TCP sequence number. */ + tcpha->tha_seq = htonl(seq_no); + + /* Set up the TCP flag field. 
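The sequence number chosen above for a pure ACK can be read as: suna for a zero window probe, snxt while still in SYN_RCVD, and otherwise snxt clamped to the right edge of the peer's offered window. A compact model, assuming SEQ_GT() is the usual signed-difference comparison.

#include <stdio.h>
#include <stdint.h>

#define SEQ_GT(a, b)    ((int32_t)((a) - (b)) > 0) /* wraparound-safe */

static uint32_t
ack_seq(uint32_t snxt, uint32_t suna, uint32_t swnd,
    int zero_win_probe, int syn_rcvd)
{
    if (zero_win_probe)
        return (suna);
    if (syn_rcvd)
        return (snxt);
    return (SEQ_GT(snxt, suna + swnd) ? suna + swnd : snxt);
}

int
main(void)
{
    /* Peer shrank its window: snxt (1500) is past suna + swnd (1000). */
    printf("%u\n", ack_seq(1500, 0, 1000, 0, 0));   /* prints 1000 */
    /* Normal case: snxt lies inside the offered window. */
    printf("%u\n", ack_seq(500, 0, 1000, 0, 0));    /* prints 500 */
    return (0);
}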
*/ + tcpha->tha_flags = (uchar_t)TH_ACK; + if (tcp->tcp_ecn_echo_on) + tcpha->tha_flags |= TH_ECE; + + tcp->tcp_rack = tcp->tcp_rnxt; + tcp->tcp_rack_cnt = 0; + + /* fill in timestamp option if in use */ + if (tcp->tcp_snd_ts_ok) { + uint32_t llbolt = (uint32_t)LBOLT_FASTPATH; + + U32_TO_BE32(llbolt, + (char *)tcpha + TCP_MIN_HEADER_LENGTH+4); + U32_TO_BE32(tcp->tcp_ts_recent, + (char *)tcpha + TCP_MIN_HEADER_LENGTH+8); + } + + /* Fill in SACK options */ + if (num_sack_blk > 0) { + uchar_t *wptr = (uchar_t *)tcpha + + connp->conn_ht_ulp_len; + sack_blk_t *tmp; + int32_t i; + + wptr[0] = TCPOPT_NOP; + wptr[1] = TCPOPT_NOP; + wptr[2] = TCPOPT_SACK; + wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk * + sizeof (sack_blk_t); + wptr += TCPOPT_REAL_SACK_LEN; + + tmp = tcp->tcp_sack_list; + for (i = 0; i < num_sack_blk; i++) { + U32_TO_BE32(tmp[i].begin, wptr); + wptr += sizeof (tcp_seq); + U32_TO_BE32(tmp[i].end, wptr); + wptr += sizeof (tcp_seq); + } + tcpha->tha_offset_and_reserved += + ((num_sack_blk * 2 + 1) << 4); + } + + ixa->ixa_pktlen = total_hdr_len; + + if (ixa->ixa_flags & IXAF_IS_IPV4) { + ((ipha_t *)rptr)->ipha_length = htons(total_hdr_len); + } else { + ip6_t *ip6 = (ip6_t *)rptr; + + ip6->ip6_plen = htons(total_hdr_len - IPV6_HDR_LEN); + } + + /* + * Prime pump for checksum calculation in IP. Include the + * adjustment for a source route if any. + */ + data_length = tcp_hdr_len + connp->conn_sum; + data_length = (data_length >> 16) + (data_length & 0xFFFF); + tcpha->tha_sum = htons(data_length); + + if (tcp->tcp_ip_forward_progress) { + tcp->tcp_ip_forward_progress = B_FALSE; + connp->conn_ixa->ixa_flags |= IXAF_REACH_CONF; + } else { + connp->conn_ixa->ixa_flags &= ~IXAF_REACH_CONF; + } + return (mp1); + } +} + +/* + * Handle M_DATA messages from IP. Its called directly from IP via + * squeue for received IP packets. + * + * The first argument is always the connp/tcp to which the mp belongs. + * There are no exceptions to this rule. The caller has already put + * a reference on this connp/tcp and once tcp_input_data() returns, + * the squeue will do the refrele. + * + * The TH_SYN for the listener directly go to tcp_input_listener via + * squeue. ICMP errors go directly to tcp_icmp_input(). + * + * sqp: NULL = recursive, sqp != NULL means called from squeue + */ +void +tcp_input_data(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) +{ + int32_t bytes_acked; + int32_t gap; + mblk_t *mp1; + uint_t flags; + uint32_t new_swnd = 0; + uchar_t *iphdr; + uchar_t *rptr; + int32_t rgap; + uint32_t seg_ack; + int seg_len; + uint_t ip_hdr_len; + uint32_t seg_seq; + tcpha_t *tcpha; + int urp; + tcp_opt_t tcpopt; + ip_pkt_t ipp; + boolean_t ofo_seg = B_FALSE; /* Out of order segment */ + uint32_t cwnd; + uint32_t add; + int npkt; + int mss; + conn_t *connp = (conn_t *)arg; + squeue_t *sqp = (squeue_t *)arg2; + tcp_t *tcp = connp->conn_tcp; + tcp_stack_t *tcps = tcp->tcp_tcps; + + /* + * RST from fused tcp loopback peer should trigger an unfuse. 
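The checksum priming above adds the TCP header length to conn_sum (the precomputed partial sum, including any source-route adjustment) and folds the carry back into 16 bits so that IP only has to finish the checksum over the final packet. The fold step in isolation, with a second fold added for the general case:

#include <stdio.h>
#include <stdint.h>

/* Fold a 32-bit partial sum into 16 bits with end-around carry. */
static uint16_t
fold16(uint32_t sum)
{
    sum = (sum >> 16) + (sum & 0xFFFF);
    sum = (sum >> 16) + (sum & 0xFFFF); /* catch a carry from the first fold */
    return ((uint16_t)sum);
}

int
main(void)
{
    uint32_t partial = 0x1FFFE; /* e.g. header length + pseudo-header sum */

    printf("folded = 0x%04x\n", fold16(partial));   /* 0x0001 + 0xFFFE = 0xffff */
    return (0);
}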
+ */ + if (tcp->tcp_fused) { + TCP_STAT(tcps, tcp_fusion_aborted); + tcp_unfuse(tcp); + } + + iphdr = mp->b_rptr; + rptr = mp->b_rptr; + ASSERT(OK_32PTR(rptr)); + + ip_hdr_len = ira->ira_ip_hdr_length; + if (connp->conn_recv_ancillary.crb_all != 0) { + /* + * Record packet information in the ip_pkt_t + */ + ipp.ipp_fields = 0; + if (ira->ira_flags & IRAF_IS_IPV4) { + (void) ip_find_hdr_v4((ipha_t *)rptr, &ipp, + B_FALSE); + } else { + uint8_t nexthdrp; + + /* + * IPv6 packets can only be received by applications + * that are prepared to receive IPv6 addresses. + * The IP fanout must ensure this. + */ + ASSERT(connp->conn_family == AF_INET6); + + (void) ip_find_hdr_v6(mp, (ip6_t *)rptr, B_TRUE, &ipp, + &nexthdrp); + ASSERT(nexthdrp == IPPROTO_TCP); + + /* Could have caused a pullup? */ + iphdr = mp->b_rptr; + rptr = mp->b_rptr; + } + } + ASSERT(DB_TYPE(mp) == M_DATA); + ASSERT(mp->b_next == NULL); + + tcpha = (tcpha_t *)&rptr[ip_hdr_len]; + seg_seq = ntohl(tcpha->tha_seq); + seg_ack = ntohl(tcpha->tha_ack); + ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX); + seg_len = (int)(mp->b_wptr - rptr) - + (ip_hdr_len + TCP_HDR_LENGTH(tcpha)); + if ((mp1 = mp->b_cont) != NULL && mp1->b_datap->db_type == M_DATA) { + do { + ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <= + (uintptr_t)INT_MAX); + seg_len += (int)(mp1->b_wptr - mp1->b_rptr); + } while ((mp1 = mp1->b_cont) != NULL && + mp1->b_datap->db_type == M_DATA); + } + + if (tcp->tcp_state == TCPS_TIME_WAIT) { + tcp_time_wait_processing(tcp, mp, seg_seq, seg_ack, + seg_len, tcpha, ira); + return; + } + + if (sqp != NULL) { + /* + * This is the correct place to update tcp_last_recv_time. Note + * that it is also updated for tcp structure that belongs to + * global and listener queues which do not really need updating. + * But that should not cause any harm. And it is updated for + * all kinds of incoming segments, not only for data segments. + */ + tcp->tcp_last_recv_time = LBOLT_FASTPATH; + } + + flags = (unsigned int)tcpha->tha_flags & 0xFF; + + BUMP_LOCAL(tcp->tcp_ibsegs); + DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp); + + if ((flags & TH_URG) && sqp != NULL) { + /* + * TCP can't handle urgent pointers that arrive before + * the connection has been accept()ed since it can't + * buffer OOB data. Discard segment if this happens. + * + * We can't just rely on a non-null tcp_listener to indicate + * that the accept() has completed since unlinking of the + * eager and completion of the accept are not atomic. + * tcp_detached, when it is not set (B_FALSE) indicates + * that the accept() has completed. + * + * Nor can it reassemble urgent pointers, so discard + * if it's not the next segment expected. + * + * Otherwise, collapse chain into one mblk (discard if + * that fails). This makes sure the headers, retransmitted + * data, and new data all are in the same mblk. + */ + ASSERT(mp != NULL); + if (tcp->tcp_detached || !pullupmsg(mp, -1)) { + freemsg(mp); + return; + } + /* Update pointers into message */ + iphdr = rptr = mp->b_rptr; + tcpha = (tcpha_t *)&rptr[ip_hdr_len]; + if (SEQ_GT(seg_seq, tcp->tcp_rnxt)) { + /* + * Since we can't handle any data with this urgent + * pointer that is out of sequence, we expunge + * the data. This allows us to still register + * the urgent mark and generate the M_PCSIG, + * which we can do. 
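seg_len above is the payload carried by the whole mblk chain: the first block minus the IP and TCP headers, plus every continuation block in full. A stand-alone version of that sum; blk_t is a toy stand-in for an mblk.

#include <stdio.h>
#include <stdint.h>

typedef struct blk {
    struct blk *b_cont;     /* next block of the same packet */
    uint8_t *b_rptr;        /* first valid byte */
    uint8_t *b_wptr;        /* one past the last valid byte */
} blk_t;

/* Same arithmetic as the seg_len computation in tcp_input_data(). */
static int
payload_len(blk_t *mp, int ip_hdr_len, int tcp_hdr_len)
{
    int len = (int)(mp->b_wptr - mp->b_rptr) - (ip_hdr_len + tcp_hdr_len);

    for (blk_t *m = mp->b_cont; m != NULL; m = m->b_cont)
        len += (int)(m->b_wptr - m->b_rptr);
    return (len);
}

int
main(void)
{
    uint8_t first[60], second[100];
    blk_t b2 = { NULL, second, second + sizeof (second) };
    blk_t b1 = { &b2, first, first + sizeof (first) };

    /* 20-byte IPv4 header + 20-byte TCP header + 20 data bytes in block 1. */
    printf("seg_len = %d\n", payload_len(&b1, 20, 20)); /* 20 + 100 = 120 */
    return (0);
}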
+ */ + mp->b_wptr = (uchar_t *)tcpha + TCP_HDR_LENGTH(tcpha); + seg_len = 0; + } + } + + switch (tcp->tcp_state) { + case TCPS_SYN_SENT: + if (connp->conn_final_sqp == NULL && + tcp_outbound_squeue_switch && sqp != NULL) { + ASSERT(connp->conn_initial_sqp == connp->conn_sqp); + connp->conn_final_sqp = sqp; + if (connp->conn_final_sqp != connp->conn_sqp) { + DTRACE_PROBE1(conn__final__sqp__switch, + conn_t *, connp); + CONN_INC_REF(connp); + SQUEUE_SWITCH(connp, connp->conn_final_sqp); + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, + tcp_input_data, connp, ira, ip_squeue_flag, + SQTAG_CONNECT_FINISH); + return; + } + DTRACE_PROBE1(conn__final__sqp__same, conn_t *, connp); + } + if (flags & TH_ACK) { + /* + * Note that our stack cannot send data before a + * connection is established, therefore the + * following check is valid. Otherwise, it has + * to be changed. + */ + if (SEQ_LEQ(seg_ack, tcp->tcp_iss) || + SEQ_GT(seg_ack, tcp->tcp_snxt)) { + freemsg(mp); + if (flags & TH_RST) + return; + tcp_xmit_ctl("TCPS_SYN_SENT-Bad_seq", + tcp, seg_ack, 0, TH_RST); + return; + } + ASSERT(tcp->tcp_suna + 1 == seg_ack); + } + if (flags & TH_RST) { + freemsg(mp); + if (flags & TH_ACK) + (void) tcp_clean_death(tcp, ECONNREFUSED); + return; + } + if (!(flags & TH_SYN)) { + freemsg(mp); + return; + } + + /* Process all TCP options. */ + tcp_process_options(tcp, tcpha); + /* + * The following changes our rwnd to be a multiple of the + * MIN(peer MSS, our MSS) for performance reason. + */ + (void) tcp_rwnd_set(tcp, MSS_ROUNDUP(connp->conn_rcvbuf, + tcp->tcp_mss)); + + /* Is the other end ECN capable? */ + if (tcp->tcp_ecn_ok) { + if ((flags & (TH_ECE|TH_CWR)) != TH_ECE) { + tcp->tcp_ecn_ok = B_FALSE; + } + } + /* + * Clear ECN flags because it may interfere with later + * processing. + */ + flags &= ~(TH_ECE|TH_CWR); + + tcp->tcp_irs = seg_seq; + tcp->tcp_rack = seg_seq; + tcp->tcp_rnxt = seg_seq + 1; + tcp->tcp_tcpha->tha_ack = htonl(tcp->tcp_rnxt); + if (!TCP_IS_DETACHED(tcp)) { + /* Allocate room for SACK options if needed. */ + connp->conn_wroff = connp->conn_ht_iphc_len; + if (tcp->tcp_snd_sack_ok) + connp->conn_wroff += TCPOPT_MAX_SACK_LEN; + if (!tcp->tcp_loopback) + connp->conn_wroff += tcps->tcps_wroff_xtra; + + (void) proto_set_tx_wroff(connp->conn_rq, connp, + connp->conn_wroff); + } + if (flags & TH_ACK) { + /* + * If we can't get the confirmation upstream, pretend + * we didn't even see this one. + * + * XXX: how can we pretend we didn't see it if we + * have updated rnxt et. al. + * + * For loopback we defer sending up the T_CONN_CON + * until after some checks below. + */ + mp1 = NULL; + /* + * tcp_sendmsg() checks tcp_state without entering + * the squeue so tcp_state should be updated before + * sending up connection confirmation + */ + tcp->tcp_state = TCPS_ESTABLISHED; + if (!tcp_conn_con(tcp, iphdr, mp, + tcp->tcp_loopback ? &mp1 : NULL, ira)) { + tcp->tcp_state = TCPS_SYN_SENT; + freemsg(mp); + return; + } + TCPS_CONN_INC(tcps); + /* SYN was acked - making progress */ + tcp->tcp_ip_forward_progress = B_TRUE; + + /* One for the SYN */ + tcp->tcp_suna = tcp->tcp_iss + 1; + tcp->tcp_valid_bits &= ~TCP_ISS_VALID; + + /* + * If SYN was retransmitted, need to reset all + * retransmission info. This is because this + * segment will be treated as a dup ACK. + */ + if (tcp->tcp_rexmit) { + tcp->tcp_rexmit = B_FALSE; + tcp->tcp_rexmit_nxt = tcp->tcp_snxt; + tcp->tcp_rexmit_max = tcp->tcp_snxt; + tcp->tcp_snd_burst = tcp->tcp_localnet ? 
+ TCP_CWND_INFINITE : TCP_CWND_NORMAL; + tcp->tcp_ms_we_have_waited = 0; + + /* + * Set tcp_cwnd back to 1 MSS, per + * recommendation from + * draft-floyd-incr-init-win-01.txt, + * Increasing TCP's Initial Window. + */ + tcp->tcp_cwnd = tcp->tcp_mss; + } + + tcp->tcp_swl1 = seg_seq; + tcp->tcp_swl2 = seg_ack; + + new_swnd = ntohs(tcpha->tha_win); + tcp->tcp_swnd = new_swnd; + if (new_swnd > tcp->tcp_max_swnd) + tcp->tcp_max_swnd = new_swnd; + + /* + * Always send the three-way handshake ack immediately + * in order to make the connection complete as soon as + * possible on the accepting host. + */ + flags |= TH_ACK_NEEDED; + + /* + * Special case for loopback. At this point we have + * received SYN-ACK from the remote endpoint. In + * order to ensure that both endpoints reach the + * fused state prior to any data exchange, the final + * ACK needs to be sent before we indicate T_CONN_CON + * to the module upstream. + */ + if (tcp->tcp_loopback) { + mblk_t *ack_mp; + + ASSERT(!tcp->tcp_unfusable); + ASSERT(mp1 != NULL); + /* + * For loopback, we always get a pure SYN-ACK + * and only need to send back the final ACK + * with no data (this is because the other + * tcp is ours and we don't do T/TCP). This + * final ACK triggers the passive side to + * perform fusion in ESTABLISHED state. + */ + if ((ack_mp = tcp_ack_mp(tcp)) != NULL) { + if (tcp->tcp_ack_tid != 0) { + (void) TCP_TIMER_CANCEL(tcp, + tcp->tcp_ack_tid); + tcp->tcp_ack_tid = 0; + } + tcp_send_data(tcp, ack_mp); + BUMP_LOCAL(tcp->tcp_obsegs); + TCPS_BUMP_MIB(tcps, tcpOutAck); + + if (!IPCL_IS_NONSTR(connp)) { + /* Send up T_CONN_CON */ + if (ira->ira_cred != NULL) { + mblk_setcred(mp1, + ira->ira_cred, + ira->ira_cpid); + } + putnext(connp->conn_rq, mp1); + } else { + (*connp->conn_upcalls-> + su_connected) + (connp->conn_upper_handle, + tcp->tcp_connid, + ira->ira_cred, + ira->ira_cpid); + freemsg(mp1); + } + + freemsg(mp); + return; + } + /* + * Forget fusion; we need to handle more + * complex cases below. Send the deferred + * T_CONN_CON message upstream and proceed + * as usual. Mark this tcp as not capable + * of fusion. + */ + TCP_STAT(tcps, tcp_fusion_unfusable); + tcp->tcp_unfusable = B_TRUE; + if (!IPCL_IS_NONSTR(connp)) { + if (ira->ira_cred != NULL) { + mblk_setcred(mp1, ira->ira_cred, + ira->ira_cpid); + } + putnext(connp->conn_rq, mp1); + } else { + (*connp->conn_upcalls->su_connected) + (connp->conn_upper_handle, + tcp->tcp_connid, ira->ira_cred, + ira->ira_cpid); + freemsg(mp1); + } + } + + /* + * Check to see if there is data to be sent. If + * yes, set the transmit flag. Then check to see + * if received data processing needs to be done. + * If not, go straight to xmit_check. This short + * cut is OK as we don't support T/TCP. + */ + if (tcp->tcp_unsent) + flags |= TH_XMIT_NEEDED; + + if (seg_len == 0 && !(flags & TH_URG)) { + freemsg(mp); + goto xmit_check; + } + + flags &= ~TH_SYN; + seg_seq++; + break; + } + tcp->tcp_state = TCPS_SYN_RCVD; + mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, tcp->tcp_mss, + NULL, NULL, tcp->tcp_iss, B_FALSE, NULL, B_FALSE); + if (mp1 != NULL) { + tcp_send_data(tcp, mp1); + TCP_TIMER_RESTART(tcp, tcp->tcp_rto); + } + freemsg(mp); + return; + case TCPS_SYN_RCVD: + if (flags & TH_ACK) { + /* + * In this state, a SYN|ACK packet is either bogus + * because the other side must be ACKing our SYN which + * indicates it has seen the ACK for their SYN and + * shouldn't retransmit it or we're crossing SYNs + * on active open. 
+ */ + if ((flags & TH_SYN) && !tcp->tcp_active_open) { + freemsg(mp); + tcp_xmit_ctl("TCPS_SYN_RCVD-bad_syn", + tcp, seg_ack, 0, TH_RST); + return; + } + /* + * NOTE: RFC 793 pg. 72 says this should be + * tcp->tcp_suna <= seg_ack <= tcp->tcp_snxt + * but that would mean we have an ack that ignored + * our SYN. + */ + if (SEQ_LEQ(seg_ack, tcp->tcp_suna) || + SEQ_GT(seg_ack, tcp->tcp_snxt)) { + freemsg(mp); + tcp_xmit_ctl("TCPS_SYN_RCVD-bad_ack", + tcp, seg_ack, 0, TH_RST); + return; + } + /* + * No sane TCP stack will send such a small window + * without receiving any data. Just drop this invalid + * ACK. We also shorten the abort timeout in case + * this is an attack. + */ + if ((ntohs(tcpha->tha_win) << tcp->tcp_snd_ws) < + (tcp->tcp_mss >> tcp_init_wnd_shft)) { + freemsg(mp); + TCP_STAT(tcps, tcp_zwin_ack_syn); + tcp->tcp_second_ctimer_threshold = + tcp_early_abort * SECONDS; + return; + } + } + break; + case TCPS_LISTEN: + /* + * Only a TLI listener can come through this path when a + * acceptor is going back to be a listener and a packet + * for the acceptor hits the classifier. For a socket + * listener, this can never happen because a listener + * can never accept connection on itself and hence a + * socket acceptor can not go back to being a listener. + */ + ASSERT(!TCP_IS_SOCKET(tcp)); + /*FALLTHRU*/ + case TCPS_CLOSED: + case TCPS_BOUND: { + conn_t *new_connp; + ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; + + /* + * Don't accept any input on a closed tcp as this TCP logically + * does not exist on the system. Don't proceed further with + * this TCP. For instance, this packet could trigger another + * close of this tcp which would be disastrous for tcp_refcnt. + * tcp_close_detached / tcp_clean_death / tcp_closei_local must + * be called at most once on a TCP. In this case we need to + * refeed the packet into the classifier and figure out where + * the packet should go. + */ + new_connp = ipcl_classify(mp, ira, ipst); + if (new_connp != NULL) { + /* Drops ref on new_connp */ + tcp_reinput(new_connp, mp, ira, ipst); + return; + } + /* We failed to classify. For now just drop the packet */ + freemsg(mp); + return; + } + case TCPS_IDLE: + /* + * Handle the case where the tcp_clean_death() has happened + * on a connection (application hasn't closed yet) but a packet + * was already queued on squeue before tcp_clean_death() + * was processed. Calling tcp_clean_death() twice on same + * connection can result in weird behaviour. + */ + freemsg(mp); + return; + default: + break; + } + + /* + * Already on the correct queue/perimeter. + * If this is a detached connection and not an eager + * connection hanging off a listener then new data + * (past the FIN) will cause a reset. + * We do a special check here where it + * is out of the main line, rather than check + * if we are detached every time we see new + * data down below. + */ + if (TCP_IS_DETACHED_NONEAGER(tcp) && + (seg_len > 0 && SEQ_GT(seg_seq + seg_len, tcp->tcp_rnxt))) { + TCPS_BUMP_MIB(tcps, tcpInClosed); + DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp); + + freemsg(mp); + /* + * This could be an SSL closure alert. We're detached so just + * acknowledge it this last time. 
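The SYN_RCVD acknowledgment test above accepts only ACKs in the half-open interval (suna, snxt], and the signed-difference macros keep that test correct across 32-bit sequence wraparound. A small demonstration with an ISS chosen near the wrap point:

#include <stdio.h>
#include <stdint.h>

#define SEQ_LEQ(a, b)   ((int32_t)((a) - (b)) <= 0)
#define SEQ_GT(a, b)    ((int32_t)((a) - (b)) > 0)

/* Mirrors the check above: reject unless suna < seg_ack <= snxt. */
static int
ack_acceptable(uint32_t seg_ack, uint32_t suna, uint32_t snxt)
{
    return (!(SEQ_LEQ(seg_ack, suna) || SEQ_GT(seg_ack, snxt)));
}

int
main(void)
{
    uint32_t suna = 0xFFFFFFF0U, snxt = 0xFFFFFFF1U;

    printf("%d\n", ack_acceptable(snxt, suna, snxt));       /* 1: ACKs our SYN */
    printf("%d\n", ack_acceptable(suna, suna, snxt));       /* 0: old ACK */
    printf("%d\n", ack_acceptable(snxt + 0x20, suna, snxt)); /* 0: past snxt */
    return (0);
}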
+ */ + if (tcp->tcp_kssl_ctx != NULL) { + kssl_release_ctx(tcp->tcp_kssl_ctx); + tcp->tcp_kssl_ctx = NULL; + + tcp->tcp_rnxt += seg_len; + tcp->tcp_tcpha->tha_ack = htonl(tcp->tcp_rnxt); + flags |= TH_ACK_NEEDED; + goto ack_check; + } + + tcp_xmit_ctl("new data when detached", tcp, + tcp->tcp_snxt, 0, TH_RST); + (void) tcp_clean_death(tcp, EPROTO); + return; + } + + mp->b_rptr = (uchar_t *)tcpha + TCP_HDR_LENGTH(tcpha); + urp = ntohs(tcpha->tha_urp) - TCP_OLD_URP_INTERPRETATION; + new_swnd = ntohs(tcpha->tha_win) << + ((tcpha->tha_flags & TH_SYN) ? 0 : tcp->tcp_snd_ws); + + if (tcp->tcp_snd_ts_ok) { + if (!tcp_paws_check(tcp, tcpha, &tcpopt)) { + /* + * This segment is not acceptable. + * Drop it and send back an ACK. + */ + freemsg(mp); + flags |= TH_ACK_NEEDED; + goto ack_check; + } + } else if (tcp->tcp_snd_sack_ok) { + ASSERT(tcp->tcp_sack_info != NULL); + tcpopt.tcp = tcp; + /* + * SACK info in already updated in tcp_parse_options. Ignore + * all other TCP options... + */ + (void) tcp_parse_options(tcpha, &tcpopt); + } +try_again:; + mss = tcp->tcp_mss; + gap = seg_seq - tcp->tcp_rnxt; + rgap = tcp->tcp_rwnd - (gap + seg_len); + /* + * gap is the amount of sequence space between what we expect to see + * and what we got for seg_seq. A positive value for gap means + * something got lost. A negative value means we got some old stuff. + */ + if (gap < 0) { + /* Old stuff present. Is the SYN in there? */ + if (seg_seq == tcp->tcp_irs && (flags & TH_SYN) && + (seg_len != 0)) { + flags &= ~TH_SYN; + seg_seq++; + urp--; + /* Recompute the gaps after noting the SYN. */ + goto try_again; + } + TCPS_BUMP_MIB(tcps, tcpInDataDupSegs); + TCPS_UPDATE_MIB(tcps, tcpInDataDupBytes, + (seg_len > -gap ? -gap : seg_len)); + /* Remove the old stuff from seg_len. */ + seg_len += gap; + /* + * Anything left? + * Make sure to check for unack'd FIN when rest of data + * has been previously ack'd. + */ + if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) { + /* + * Resets are only valid if they lie within our offered + * window. If the RST bit is set, we just ignore this + * segment. + */ + if (flags & TH_RST) { + freemsg(mp); + return; + } + + /* + * The arriving of dup data packets indicate that we + * may have postponed an ack for too long, or the other + * side's RTT estimate is out of shape. Start acking + * more often. + */ + if (SEQ_GEQ(seg_seq + seg_len - gap, tcp->tcp_rack) && + tcp->tcp_rack_cnt >= 1 && + tcp->tcp_rack_abs_max > 2) { + tcp->tcp_rack_abs_max--; + } + tcp->tcp_rack_cur_max = 1; + + /* + * This segment is "unacceptable". None of its + * sequence space lies within our advertized window. + * + * Adjust seg_len to the original value for tracing. + */ + seg_len -= gap; + if (connp->conn_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, + "tcp_rput: unacceptable, gap %d, rgap %d, " + "flags 0x%x, seg_seq %u, seg_ack %u, " + "seg_len %d, rnxt %u, snxt %u, %s", + gap, rgap, flags, seg_seq, seg_ack, + seg_len, tcp->tcp_rnxt, tcp->tcp_snxt, + tcp_display(tcp, NULL, + DISP_ADDR_AND_PORT)); + } + + /* + * Arrange to send an ACK in response to the + * unacceptable segment per RFC 793 page 69. There + * is only one small difference between ours and the + * acceptability test in the RFC - we accept ACK-only + * packet with SEG.SEQ = RCV.NXT+RCV.WND and no ACK + * will be generated. + * + * Note that we have to ACK an ACK-only packet at least + * for stacks that send 0-length keep-alives with + * SEG.SEQ = SND.NXT-1 as recommended by RFC1122, + * section 4.2.3.6. 
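gap and rgap are the two distances that drive the sequence processing below: gap is how far the segment's first byte lies past rnxt, and rgap is how much of the segment still fits under the advertised window. A worked example with concrete numbers; the printed labels are informal, not the stack's terminology.

#include <stdio.h>
#include <stdint.h>

static void
classify(uint32_t seg_seq, int seg_len, uint32_t rnxt, uint32_t rwnd)
{
    int32_t gap = (int32_t)(seg_seq - rnxt);
    int32_t rgap = (int32_t)(rwnd - (uint32_t)(gap + seg_len));

    printf("seq=%u len=%d: gap=%d rgap=%d -> %s\n", seg_seq, seg_len,
        gap, rgap,
        rgap < 0 ? "extends past our window" :
        gap > 0 ? "out of order, data missing before it" :
        gap < 0 ? "overlaps already-received data" :
        "the next expected data");
}

int
main(void)
{
    uint32_t rnxt = 1000, rwnd = 8192;

    classify(1000, 1460, rnxt, rwnd);   /* gap 0, rgap 6732 */
    classify(2460, 1460, rnxt, rwnd);   /* gap 1460: a segment was lost */
    classify(500, 1460, rnxt, rwnd);    /* gap -500: retransmission overlap */
    classify(9000, 1460, rnxt, rwnd);   /* rgap -1268: beyond rnxt + rwnd */
    return (0);
}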
As long as we don't ever generate + * an unacceptable packet in response to an incoming + * packet that is unacceptable, it should not cause + * "ACK wars". + */ + flags |= TH_ACK_NEEDED; + + /* + * Continue processing this segment in order to use the + * ACK information it contains, but skip all other + * sequence-number processing. Processing the ACK + * information is necessary in order to + * re-synchronize connections that may have lost + * synchronization. + * + * We clear seg_len and flag fields related to + * sequence number processing as they are not + * to be trusted for an unacceptable segment. + */ + seg_len = 0; + flags &= ~(TH_SYN | TH_FIN | TH_URG); + goto process_ack; + } + + /* Fix seg_seq, and chew the gap off the front. */ + seg_seq = tcp->tcp_rnxt; + urp += gap; + do { + mblk_t *mp2; + ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= + (uintptr_t)UINT_MAX); + gap += (uint_t)(mp->b_wptr - mp->b_rptr); + if (gap > 0) { + mp->b_rptr = mp->b_wptr - gap; + break; + } + mp2 = mp; + mp = mp->b_cont; + freeb(mp2); + } while (gap < 0); + /* + * If the urgent data has already been acknowledged, we + * should ignore TH_URG below + */ + if (urp < 0) + flags &= ~TH_URG; + } + /* + * rgap is the amount of stuff received out of window. A negative + * value is the amount out of window. + */ + if (rgap < 0) { + mblk_t *mp2; + + if (tcp->tcp_rwnd == 0) { + TCPS_BUMP_MIB(tcps, tcpInWinProbe); + } else { + TCPS_BUMP_MIB(tcps, tcpInDataPastWinSegs); + TCPS_UPDATE_MIB(tcps, tcpInDataPastWinBytes, -rgap); + } + + /* + * seg_len does not include the FIN, so if more than + * just the FIN is out of window, we act like we don't + * see it. (If just the FIN is out of window, rgap + * will be zero and we will go ahead and acknowledge + * the FIN.) + */ + flags &= ~TH_FIN; + + /* Fix seg_len and make sure there is something left. */ + seg_len += rgap; + if (seg_len <= 0) { + /* + * Resets are only valid if they lie within our offered + * window. If the RST bit is set, we just ignore this + * segment. + */ + if (flags & TH_RST) { + freemsg(mp); + return; + } + + /* Per RFC 793, we need to send back an ACK. */ + flags |= TH_ACK_NEEDED; + + /* + * Send SIGURG as soon as possible i.e. even + * if the TH_URG was delivered in a window probe + * packet (which will be unacceptable). + * + * We generate a signal if none has been generated + * for this connection or if this is a new urgent + * byte. Also send a zero-length "unmarked" message + * to inform SIOCATMARK that this is not the mark. + * + * tcp_urp_last_valid is cleared when the T_exdata_ind + * is sent up. This plus the check for old data + * (gap >= 0) handles the wraparound of the sequence + * number space without having to always track the + * correct MAX(tcp_urp_last, tcp_rnxt). (BSD tracks + * this max in its rcv_up variable). + * + * This prevents duplicate SIGURGS due to a "late" + * zero-window probe when the T_EXDATA_IND has already + * been sent up. + */ + if ((flags & TH_URG) && + (!tcp->tcp_urp_last_valid || SEQ_GT(urp + seg_seq, + tcp->tcp_urp_last))) { + if (IPCL_IS_NONSTR(connp)) { + if (!TCP_IS_DETACHED(tcp)) { + (*connp->conn_upcalls-> + su_signal_oob) + (connp->conn_upper_handle, + urp); + } + } else { + mp1 = allocb(0, BPRI_MED); + if (mp1 == NULL) { + freemsg(mp); + return; + } + if (!TCP_IS_DETACHED(tcp) && + !putnextctl1(connp->conn_rq, + M_PCSIG, SIGURG)) { + /* Try again on the rexmit. 
*/ + freemsg(mp1); + freemsg(mp); + return; + } + /* + * If the next byte would be the mark + * then mark with MARKNEXT else mark + * with NOTMARKNEXT. + */ + if (gap == 0 && urp == 0) + mp1->b_flag |= MSGMARKNEXT; + else + mp1->b_flag |= MSGNOTMARKNEXT; + freemsg(tcp->tcp_urp_mark_mp); + tcp->tcp_urp_mark_mp = mp1; + flags |= TH_SEND_URP_MARK; + } + tcp->tcp_urp_last_valid = B_TRUE; + tcp->tcp_urp_last = urp + seg_seq; + } + /* + * If this is a zero window probe, continue to + * process the ACK part. But we need to set seg_len + * to 0 to avoid data processing. Otherwise just + * drop the segment and send back an ACK. + */ + if (tcp->tcp_rwnd == 0 && seg_seq == tcp->tcp_rnxt) { + flags &= ~(TH_SYN | TH_URG); + seg_len = 0; + goto process_ack; + } else { + freemsg(mp); + goto ack_check; + } + } + /* Pitch out of window stuff off the end. */ + rgap = seg_len; + mp2 = mp; + do { + ASSERT((uintptr_t)(mp2->b_wptr - mp2->b_rptr) <= + (uintptr_t)INT_MAX); + rgap -= (int)(mp2->b_wptr - mp2->b_rptr); + if (rgap < 0) { + mp2->b_wptr += rgap; + if ((mp1 = mp2->b_cont) != NULL) { + mp2->b_cont = NULL; + freemsg(mp1); + } + break; + } + } while ((mp2 = mp2->b_cont) != NULL); + } +ok:; + /* + * TCP should check ECN info for segments inside the window only. + * Therefore the check should be done here. + */ + if (tcp->tcp_ecn_ok) { + if (flags & TH_CWR) { + tcp->tcp_ecn_echo_on = B_FALSE; + } + /* + * Note that both ECN_CE and CWR can be set in the + * same segment. In this case, we once again turn + * on ECN_ECHO. + */ + if (connp->conn_ipversion == IPV4_VERSION) { + uchar_t tos = ((ipha_t *)rptr)->ipha_type_of_service; + + if ((tos & IPH_ECN_CE) == IPH_ECN_CE) { + tcp->tcp_ecn_echo_on = B_TRUE; + } + } else { + uint32_t vcf = ((ip6_t *)rptr)->ip6_vcf; + + if ((vcf & htonl(IPH_ECN_CE << 20)) == + htonl(IPH_ECN_CE << 20)) { + tcp->tcp_ecn_echo_on = B_TRUE; + } + } + } + + /* + * Check whether we can update tcp_ts_recent. This test is + * NOT the one in RFC 1323 3.4. It is from Braden, 1993, "TCP + * Extensions for High Performance: An Update", Internet Draft. + */ + if (tcp->tcp_snd_ts_ok && + TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) && + SEQ_LEQ(seg_seq, tcp->tcp_rack)) { + tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val; + tcp->tcp_last_rcv_lbolt = LBOLT_FASTPATH64; + } + + if (seg_seq != tcp->tcp_rnxt || tcp->tcp_reass_head) { + /* + * FIN in an out of order segment. We record this in + * tcp_valid_bits and the seq num of FIN in tcp_ofo_fin_seq. + * Clear the FIN so that any check on FIN flag will fail. + * Remember that FIN also counts in the sequence number + * space. So we need to ack out of order FIN only segments. + */ + if (flags & TH_FIN) { + tcp->tcp_valid_bits |= TCP_OFO_FIN_VALID; + tcp->tcp_ofo_fin_seq = seg_seq + seg_len; + flags &= ~TH_FIN; + flags |= TH_ACK_NEEDED; + } + if (seg_len > 0) { + /* Fill in the SACK blk list. */ + if (tcp->tcp_snd_sack_ok) { + ASSERT(tcp->tcp_sack_info != NULL); + tcp_sack_insert(tcp->tcp_sack_list, + seg_seq, seg_seq + seg_len, + &(tcp->tcp_num_sack_blk)); + } + + /* + * Attempt reassembly and see if we have something + * ready to go. + */ + mp = tcp_reass(tcp, mp, seg_seq); + /* Always ack out of order packets */ + flags |= TH_ACK_NEEDED | TH_PUSH; + if (mp) { + ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= + (uintptr_t)INT_MAX); + seg_len = mp->b_cont ? 
msgdsize(mp) : + (int)(mp->b_wptr - mp->b_rptr); + seg_seq = tcp->tcp_rnxt; + /* + * A gap is filled and the seq num and len + * of the gap match that of a previously + * received FIN, put the FIN flag back in. + */ + if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) && + seg_seq + seg_len == tcp->tcp_ofo_fin_seq) { + flags |= TH_FIN; + tcp->tcp_valid_bits &= + ~TCP_OFO_FIN_VALID; + } + if (tcp->tcp_reass_tid != 0) { + (void) TCP_TIMER_CANCEL(tcp, + tcp->tcp_reass_tid); + /* + * Restart the timer if there is still + * data in the reassembly queue. + */ + if (tcp->tcp_reass_head != NULL) { + tcp->tcp_reass_tid = TCP_TIMER( + tcp, tcp_reass_timer, + MSEC_TO_TICK( + tcps->tcps_reass_timeout)); + } else { + tcp->tcp_reass_tid = 0; + } + } + } else { + /* + * Keep going even with NULL mp. + * There may be a useful ACK or something else + * we don't want to miss. + * + * But TCP should not perform fast retransmit + * because of the ack number. TCP uses + * seg_len == 0 to determine if it is a pure + * ACK. And this is not a pure ACK. + */ + seg_len = 0; + ofo_seg = B_TRUE; + + if (tcps->tcps_reass_timeout != 0 && + tcp->tcp_reass_tid == 0) { + tcp->tcp_reass_tid = TCP_TIMER(tcp, + tcp_reass_timer, MSEC_TO_TICK( + tcps->tcps_reass_timeout)); + } + } + } + } else if (seg_len > 0) { + TCPS_BUMP_MIB(tcps, tcpInDataInorderSegs); + TCPS_UPDATE_MIB(tcps, tcpInDataInorderBytes, seg_len); + /* + * If an out of order FIN was received before, and the seq + * num and len of the new segment match that of the FIN, + * put the FIN flag back in. + */ + if ((tcp->tcp_valid_bits & TCP_OFO_FIN_VALID) && + seg_seq + seg_len == tcp->tcp_ofo_fin_seq) { + flags |= TH_FIN; + tcp->tcp_valid_bits &= ~TCP_OFO_FIN_VALID; + } + } + if ((flags & (TH_RST | TH_SYN | TH_URG | TH_ACK)) != TH_ACK) { + if (flags & TH_RST) { + freemsg(mp); + switch (tcp->tcp_state) { + case TCPS_SYN_RCVD: + (void) tcp_clean_death(tcp, ECONNREFUSED); + break; + case TCPS_ESTABLISHED: + case TCPS_FIN_WAIT_1: + case TCPS_FIN_WAIT_2: + case TCPS_CLOSE_WAIT: + (void) tcp_clean_death(tcp, ECONNRESET); + break; + case TCPS_CLOSING: + case TCPS_LAST_ACK: + (void) tcp_clean_death(tcp, 0); + break; + default: + ASSERT(tcp->tcp_state != TCPS_TIME_WAIT); + (void) tcp_clean_death(tcp, ENXIO); + break; + } + return; + } + if (flags & TH_SYN) { + /* + * See RFC 793, Page 71 + * + * The seq number must be in the window as it should + * be "fixed" above. If it is outside window, it should + * be already rejected. Note that we allow seg_seq to be + * rnxt + rwnd because we want to accept 0 window probe. + */ + ASSERT(SEQ_GEQ(seg_seq, tcp->tcp_rnxt) && + SEQ_LEQ(seg_seq, tcp->tcp_rnxt + tcp->tcp_rwnd)); + freemsg(mp); + /* + * If the ACK flag is not set, just use our snxt as the + * seq number of the RST segment. + */ + if (!(flags & TH_ACK)) { + seg_ack = tcp->tcp_snxt; + } + tcp_xmit_ctl("TH_SYN", tcp, seg_ack, seg_seq + 1, + TH_RST|TH_ACK); + ASSERT(tcp->tcp_state != TCPS_TIME_WAIT); + (void) tcp_clean_death(tcp, ECONNRESET); + return; + } + /* + * urp could be -1 when the urp field in the packet is 0 + * and TCP_OLD_URP_INTERPRETATION is set. This implies that the urgent + * byte was at seg_seq - 1, in which case we ignore the urgent flag. + */ + if (flags & TH_URG && urp >= 0) { + if (!tcp->tcp_urp_last_valid || + SEQ_GT(urp + seg_seq, tcp->tcp_urp_last)) { + /* + * Non-STREAMS sockets handle the urgent data a litte + * differently from STREAMS based sockets. There is no + * need to mark any mblks with the MSG{NOT,}MARKNEXT + * flags to keep SIOCATMARK happy. 
Instead a + * su_signal_oob upcall is made to update the mark. + * Neither is a T_EXDATA_IND mblk needed to be + * prepended to the urgent data. The urgent data is + * delivered using the su_recv upcall, where we set + * the MSG_OOB flag to indicate that it is urg data. + * + * Neither TH_SEND_URP_MARK nor TH_MARKNEXT_NEEDED + * are used by non-STREAMS sockets. + */ + if (IPCL_IS_NONSTR(connp)) { + if (!TCP_IS_DETACHED(tcp)) { + (*connp->conn_upcalls->su_signal_oob) + (connp->conn_upper_handle, urp); + } + } else { + /* + * If we haven't generated the signal yet for + * this urgent pointer value, do it now. Also, + * send up a zero-length M_DATA indicating + * whether or not this is the mark. The latter + * is not needed when a T_EXDATA_IND is sent up. + * However, if there are allocation failures + * this code relies on the sender retransmitting + * and the socket code for determining the mark + * should not block waiting for the peer to + * transmit. Thus, for simplicity we always + * send up the mark indication. + */ + mp1 = allocb(0, BPRI_MED); + if (mp1 == NULL) { + freemsg(mp); + return; + } + if (!TCP_IS_DETACHED(tcp) && + !putnextctl1(connp->conn_rq, M_PCSIG, + SIGURG)) { + /* Try again on the rexmit. */ + freemsg(mp1); + freemsg(mp); + return; + } + /* + * Mark with NOTMARKNEXT for now. + * The code below will change this to MARKNEXT + * if we are at the mark. + * + * If there are allocation failures (e.g. in + * dupmsg below) the next time tcp_input_data + * sees the urgent segment it will send up the + * MSGMARKNEXT message. + */ + mp1->b_flag |= MSGNOTMARKNEXT; + freemsg(tcp->tcp_urp_mark_mp); + tcp->tcp_urp_mark_mp = mp1; + flags |= TH_SEND_URP_MARK; +#ifdef DEBUG + (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, + "tcp_rput: sent M_PCSIG 2 seq %x urp %x " + "last %x, %s", + seg_seq, urp, tcp->tcp_urp_last, + tcp_display(tcp, NULL, DISP_PORT_ONLY)); +#endif /* DEBUG */ + } + tcp->tcp_urp_last_valid = B_TRUE; + tcp->tcp_urp_last = urp + seg_seq; + } else if (tcp->tcp_urp_mark_mp != NULL) { + /* + * An allocation failure prevented the previous + * tcp_input_data from sending up the allocated + * MSG*MARKNEXT message - send it up this time + * around. + */ + flags |= TH_SEND_URP_MARK; + } + + /* + * If the urgent byte is in this segment, make sure that it is + * all by itself. This makes it much easier to deal with the + * possibility of an allocation failure on the T_exdata_ind. + * Note that seg_len is the number of bytes in the segment, and + * urp is the offset into the segment of the urgent byte. + * urp < seg_len means that the urgent byte is in this segment. + */ + if (urp < seg_len) { + if (seg_len != 1) { + uint32_t tmp_rnxt; + /* + * Break it up and feed it back in. + * Re-attach the IP header. + */ + mp->b_rptr = iphdr; + if (urp > 0) { + /* + * There is stuff before the urgent + * byte. + */ + mp1 = dupmsg(mp); + if (!mp1) { + /* + * Trim from urgent byte on. + * The rest will come back. + */ + (void) adjmsg(mp, + urp - seg_len); + tcp_input_data(connp, + mp, NULL, ira); + return; + } + (void) adjmsg(mp1, urp - seg_len); + /* Feed this piece back in. */ + tmp_rnxt = tcp->tcp_rnxt; + tcp_input_data(connp, mp1, NULL, ira); + /* + * If the data passed back in was not + * processed (ie: bad ACK) sending + * the remainder back in will cause a + * loop. In this case, drop the + * packet and let the sender try + * sending a good packet. 
+ */ + if (tmp_rnxt == tcp->tcp_rnxt) { + freemsg(mp); + return; + } + } + if (urp != seg_len - 1) { + uint32_t tmp_rnxt; + /* + * There is stuff after the urgent + * byte. + */ + mp1 = dupmsg(mp); + if (!mp1) { + /* + * Trim everything beyond the + * urgent byte. The rest will + * come back. + */ + (void) adjmsg(mp, + urp + 1 - seg_len); + tcp_input_data(connp, + mp, NULL, ira); + return; + } + (void) adjmsg(mp1, urp + 1 - seg_len); + tmp_rnxt = tcp->tcp_rnxt; + tcp_input_data(connp, mp1, NULL, ira); + /* + * If the data passed back in was not + * processed (ie: bad ACK) sending + * the remainder back in will cause a + * loop. In this case, drop the + * packet and let the sender try + * sending a good packet. + */ + if (tmp_rnxt == tcp->tcp_rnxt) { + freemsg(mp); + return; + } + } + tcp_input_data(connp, mp, NULL, ira); + return; + } + /* + * This segment contains only the urgent byte. We + * have to allocate the T_exdata_ind, if we can. + */ + if (IPCL_IS_NONSTR(connp)) { + int error; + + (*connp->conn_upcalls->su_recv) + (connp->conn_upper_handle, mp, seg_len, + MSG_OOB, &error, NULL); + /* + * We should never be in middle of a + * fallback, the squeue guarantees that. + */ + ASSERT(error != EOPNOTSUPP); + mp = NULL; + goto update_ack; + } else if (!tcp->tcp_urp_mp) { + struct T_exdata_ind *tei; + mp1 = allocb(sizeof (struct T_exdata_ind), + BPRI_MED); + if (!mp1) { + /* + * Sigh... It'll be back. + * Generate any MSG*MARK message now. + */ + freemsg(mp); + seg_len = 0; + if (flags & TH_SEND_URP_MARK) { + + + ASSERT(tcp->tcp_urp_mark_mp); + tcp->tcp_urp_mark_mp->b_flag &= + ~MSGNOTMARKNEXT; + tcp->tcp_urp_mark_mp->b_flag |= + MSGMARKNEXT; + } + goto ack_check; + } + mp1->b_datap->db_type = M_PROTO; + tei = (struct T_exdata_ind *)mp1->b_rptr; + tei->PRIM_type = T_EXDATA_IND; + tei->MORE_flag = 0; + mp1->b_wptr = (uchar_t *)&tei[1]; + tcp->tcp_urp_mp = mp1; +#ifdef DEBUG + (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, + "tcp_rput: allocated exdata_ind %s", + tcp_display(tcp, NULL, + DISP_PORT_ONLY)); +#endif /* DEBUG */ + /* + * There is no need to send a separate MSG*MARK + * message since the T_EXDATA_IND will be sent + * now. + */ + flags &= ~TH_SEND_URP_MARK; + freemsg(tcp->tcp_urp_mark_mp); + tcp->tcp_urp_mark_mp = NULL; + } + /* + * Now we are all set. On the next putnext upstream, + * tcp_urp_mp will be non-NULL and will get prepended + * to what has to be this piece containing the urgent + * byte. If for any reason we abort this segment below, + * if it comes back, we will have this ready, or it + * will get blown off in close. + */ + } else if (urp == seg_len) { + /* + * The urgent byte is the next byte after this sequence + * number. If this endpoint is non-STREAMS, then there + * is nothing to do here since the socket has already + * been notified about the urg pointer by the + * su_signal_oob call above. + * + * In case of STREAMS, some more work might be needed. + * If there is data it is marked with MSGMARKNEXT and + * and any tcp_urp_mark_mp is discarded since it is not + * needed. Otherwise, if the code above just allocated + * a zero-length tcp_urp_mark_mp message, that message + * is tagged with MSGMARKNEXT. Sending up these + * MSGMARKNEXT messages makes SIOCATMARK work correctly + * even though the T_EXDATA_IND will not be sent up + * until the urgent byte arrives. 
+ */
+ if (!IPCL_IS_NONSTR(tcp->tcp_connp)) {
+ if (seg_len != 0) {
+ flags |= TH_MARKNEXT_NEEDED;
+ freemsg(tcp->tcp_urp_mark_mp);
+ tcp->tcp_urp_mark_mp = NULL;
+ flags &= ~TH_SEND_URP_MARK;
+ } else if (tcp->tcp_urp_mark_mp != NULL) {
+ flags |= TH_SEND_URP_MARK;
+ tcp->tcp_urp_mark_mp->b_flag &=
+ ~MSGNOTMARKNEXT;
+ tcp->tcp_urp_mark_mp->b_flag |=
+ MSGMARKNEXT;
+ }
+ }
+#ifdef DEBUG
+ (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
+ "tcp_rput: AT MARK, len %d, flags 0x%x, %s",
+ seg_len, flags,
+ tcp_display(tcp, NULL, DISP_PORT_ONLY));
+#endif /* DEBUG */
+ }
+#ifdef DEBUG
+ else {
+ /* Data left until we hit mark */
+ (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE,
+ "tcp_rput: URP %d bytes left, %s",
+ urp - seg_len, tcp_display(tcp, NULL,
+ DISP_PORT_ONLY));
+ }
+#endif /* DEBUG */
+ }
+
+process_ack:
+ if (!(flags & TH_ACK)) {
+ freemsg(mp);
+ goto xmit_check;
+ }
+ }
+ bytes_acked = (int)(seg_ack - tcp->tcp_suna);
+
+ if (bytes_acked > 0)
+ tcp->tcp_ip_forward_progress = B_TRUE;
+ if (tcp->tcp_state == TCPS_SYN_RCVD) {
+ if ((tcp->tcp_conn.tcp_eager_conn_ind != NULL) &&
+ ((tcp->tcp_kssl_ent == NULL) || !tcp->tcp_kssl_pending)) {
+ /* 3-way handshake complete - pass up the T_CONN_IND */
+ tcp_t *listener = tcp->tcp_listener;
+ mblk_t *mp = tcp->tcp_conn.tcp_eager_conn_ind;
+
+ tcp->tcp_tconnind_started = B_TRUE;
+ tcp->tcp_conn.tcp_eager_conn_ind = NULL;
+ /*
+ * Being here means the eager is fine, but it can
+ * get a TH_RST at any point between now and when
+ * accept completes, and disappear. We need to
+ * ensure that the reference to the eager is valid
+ * after we get out of the eager's perimeter. So we
+ * do an extra refhold.
+ */
+ CONN_INC_REF(connp);
+
+ /*
+ * The listener also exists because of the refhold
+ * done in tcp_input_listener. It's possible that it
+ * might have closed. We will check that once we
+ * get inside the listener's context.
+ */
+ CONN_INC_REF(listener->tcp_connp);
+ if (listener->tcp_connp->conn_sqp ==
+ connp->conn_sqp) {
+ /*
+ * We optimize by not calling an SQUEUE_ENTER
+ * on the listener since we know that the
+ * listener and eager squeues are the same.
+ * We are able to make this check safely only
+ * because neither the eager nor the listener
+ * can change its squeue. Only an active
+ * connect can change its squeue.
+ */
+ tcp_send_conn_ind(listener->tcp_connp, mp,
+ listener->tcp_connp->conn_sqp);
+ CONN_DEC_REF(listener->tcp_connp);
+ } else if (!tcp->tcp_loopback) {
+ SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp,
+ mp, tcp_send_conn_ind,
+ listener->tcp_connp, NULL, SQ_FILL,
+ SQTAG_TCP_CONN_IND);
+ } else {
+ SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp,
+ mp, tcp_send_conn_ind,
+ listener->tcp_connp, NULL, SQ_PROCESS,
+ SQTAG_TCP_CONN_IND);
+ }
+ }
+
+ /*
+ * We are seeing the final ack in the three-way
+ * handshake of an actively opened connection,
+ * so we must send up a T_CONN_CON.
+ *
+ * tcp_sendmsg() checks tcp_state without entering
+ * the squeue, so tcp_state should be updated before
+ * sending up the connection confirmation.
+ */
+ tcp->tcp_state = TCPS_ESTABLISHED;
+
+ if (tcp->tcp_active_open) {
+ if (!tcp_conn_con(tcp, iphdr, mp, NULL, ira)) {
+ freemsg(mp);
+ tcp->tcp_state = TCPS_SYN_RCVD;
+ return;
+ }
+ /*
+ * Don't fuse the loopback endpoints for
+ * simultaneous active opens. 
+ */ + if (tcp->tcp_loopback) { + TCP_STAT(tcps, tcp_fusion_unfusable); + tcp->tcp_unfusable = B_TRUE; + } + } + TCPS_CONN_INC(tcps); + + tcp->tcp_suna = tcp->tcp_iss + 1; /* One for the SYN */ + bytes_acked--; + /* SYN was acked - making progress */ + tcp->tcp_ip_forward_progress = B_TRUE; + + /* + * If SYN was retransmitted, need to reset all + * retransmission info as this segment will be + * treated as a dup ACK. + */ + if (tcp->tcp_rexmit) { + tcp->tcp_rexmit = B_FALSE; + tcp->tcp_rexmit_nxt = tcp->tcp_snxt; + tcp->tcp_rexmit_max = tcp->tcp_snxt; + tcp->tcp_snd_burst = tcp->tcp_localnet ? + TCP_CWND_INFINITE : TCP_CWND_NORMAL; + tcp->tcp_ms_we_have_waited = 0; + tcp->tcp_cwnd = mss; + } + + /* + * We set the send window to zero here. + * This is needed if there is data to be + * processed already on the queue. + * Later (at swnd_update label), the + * "new_swnd > tcp_swnd" condition is satisfied + * the XMIT_NEEDED flag is set in the current + * (SYN_RCVD) state. This ensures tcp_wput_data() is + * called if there is already data on queue in + * this state. + */ + tcp->tcp_swnd = 0; + + if (new_swnd > tcp->tcp_max_swnd) + tcp->tcp_max_swnd = new_swnd; + tcp->tcp_swl1 = seg_seq; + tcp->tcp_swl2 = seg_ack; + tcp->tcp_valid_bits &= ~TCP_ISS_VALID; + + /* Fuse when both sides are in ESTABLISHED state */ + if (tcp->tcp_loopback && do_tcp_fusion) + tcp_fuse(tcp, iphdr, tcpha); + + } + /* This code follows 4.4BSD-Lite2 mostly. */ + if (bytes_acked < 0) + goto est; + + /* + * If TCP is ECN capable and the congestion experience bit is + * set, reduce tcp_cwnd and tcp_ssthresh. But this should only be + * done once per window (or more loosely, per RTT). + */ + if (tcp->tcp_cwr && SEQ_GT(seg_ack, tcp->tcp_cwr_snd_max)) + tcp->tcp_cwr = B_FALSE; + if (tcp->tcp_ecn_ok && (flags & TH_ECE)) { + if (!tcp->tcp_cwr) { + npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) / mss; + tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * mss; + tcp->tcp_cwnd = npkt * mss; + /* + * If the cwnd is 0, use the timer to clock out + * new segments. This is required by the ECN spec. + */ + if (npkt == 0) { + TCP_TIMER_RESTART(tcp, tcp->tcp_rto); + /* + * This makes sure that when the ACK comes + * back, we will increase tcp_cwnd by 1 MSS. + */ + tcp->tcp_cwnd_cnt = 0; + } + tcp->tcp_cwr = B_TRUE; + /* + * This marks the end of the current window of in + * flight data. That is why we don't use + * tcp_suna + tcp_swnd. Only data in flight can + * provide ECN info. + */ + tcp->tcp_cwr_snd_max = tcp->tcp_snxt; + tcp->tcp_ecn_cwr_sent = B_FALSE; + } + } + + mp1 = tcp->tcp_xmit_head; + if (bytes_acked == 0) { + if (!ofo_seg && seg_len == 0 && new_swnd == tcp->tcp_swnd) { + int dupack_cnt; + + TCPS_BUMP_MIB(tcps, tcpInDupAck); + /* + * Fast retransmit. When we have seen exactly three + * identical ACKs while we have unacked data + * outstanding we take it as a hint that our peer + * dropped something. + * + * If TCP is retransmitting, don't do fast retransmit. + */ + if (mp1 && tcp->tcp_suna != tcp->tcp_snxt && + ! tcp->tcp_rexmit) { + /* Do Limited Transmit */ + if ((dupack_cnt = ++tcp->tcp_dupack_cnt) < + tcps->tcps_dupack_fast_retransmit) { + /* + * RFC 3042 + * + * What we need to do is temporarily + * increase tcp_cwnd so that new + * data can be sent if it is allowed + * by the receive window (tcp_rwnd). + * tcp_wput_data() will take care of + * the rest. + * + * If the connection is SACK capable, + * only do limited xmit when there + * is SACK info. + * + * Note how tcp_cwnd is incremented. 
+ * The first dup ACK will increase + * it by 1 MSS. The second dup ACK + * will increase it by 2 MSS. This + * means that only 1 new segment will + * be sent for each dup ACK. + */ + if (tcp->tcp_unsent > 0 && + (!tcp->tcp_snd_sack_ok || + (tcp->tcp_snd_sack_ok && + tcp->tcp_notsack_list != NULL))) { + tcp->tcp_cwnd += mss << + (tcp->tcp_dupack_cnt - 1); + flags |= TH_LIMIT_XMIT; + } + } else if (dupack_cnt == + tcps->tcps_dupack_fast_retransmit) { + + /* + * If we have reduced tcp_ssthresh + * because of ECN, do not reduce it again + * unless it is already one window of data + * away. After one window of data, tcp_cwr + * should then be cleared. Note that + * for non ECN capable connection, tcp_cwr + * should always be false. + * + * Adjust cwnd since the duplicate + * ack indicates that a packet was + * dropped (due to congestion.) + */ + if (!tcp->tcp_cwr) { + npkt = ((tcp->tcp_snxt - + tcp->tcp_suna) >> 1) / mss; + tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * + mss; + tcp->tcp_cwnd = (npkt + + tcp->tcp_dupack_cnt) * mss; + } + if (tcp->tcp_ecn_ok) { + tcp->tcp_cwr = B_TRUE; + tcp->tcp_cwr_snd_max = tcp->tcp_snxt; + tcp->tcp_ecn_cwr_sent = B_FALSE; + } + + /* + * We do Hoe's algorithm. Refer to her + * paper "Improving the Start-up Behavior + * of a Congestion Control Scheme for TCP," + * appeared in SIGCOMM'96. + * + * Save highest seq no we have sent so far. + * Be careful about the invisible FIN byte. + */ + if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && + (tcp->tcp_unsent == 0)) { + tcp->tcp_rexmit_max = tcp->tcp_fss; + } else { + tcp->tcp_rexmit_max = tcp->tcp_snxt; + } + + /* + * Do not allow bursty traffic during. + * fast recovery. Refer to Fall and Floyd's + * paper "Simulation-based Comparisons of + * Tahoe, Reno and SACK TCP" (in CCR?) + * This is a best current practise. + */ + tcp->tcp_snd_burst = TCP_CWND_SS; + + /* + * For SACK: + * Calculate tcp_pipe, which is the + * estimated number of bytes in + * network. + * + * tcp_fack is the highest sack'ed seq num + * TCP has received. + * + * tcp_pipe is explained in the above quoted + * Fall and Floyd's paper. tcp_fack is + * explained in Mathis and Mahdavi's + * "Forward Acknowledgment: Refining TCP + * Congestion Control" in SIGCOMM '96. + */ + if (tcp->tcp_snd_sack_ok) { + ASSERT(tcp->tcp_sack_info != NULL); + if (tcp->tcp_notsack_list != NULL) { + tcp->tcp_pipe = tcp->tcp_snxt - + tcp->tcp_fack; + tcp->tcp_sack_snxt = seg_ack; + flags |= TH_NEED_SACK_REXMIT; + } else { + /* + * Always initialize tcp_pipe + * even though we don't have + * any SACK info. If later + * we get SACK info and + * tcp_pipe is not initialized, + * funny things will happen. + */ + tcp->tcp_pipe = + tcp->tcp_cwnd_ssthresh; + } + } else { + flags |= TH_REXMIT_NEEDED; + } /* tcp_snd_sack_ok */ + + } else { + /* + * Here we perform congestion + * avoidance, but NOT slow start. + * This is known as the Fast + * Recovery Algorithm. + */ + if (tcp->tcp_snd_sack_ok && + tcp->tcp_notsack_list != NULL) { + flags |= TH_NEED_SACK_REXMIT; + tcp->tcp_pipe -= mss; + if (tcp->tcp_pipe < 0) + tcp->tcp_pipe = 0; + } else { + /* + * We know that one more packet has + * left the pipe thus we can update + * cwnd. + */ + cwnd = tcp->tcp_cwnd + mss; + if (cwnd > tcp->tcp_cwnd_max) + cwnd = tcp->tcp_cwnd_max; + tcp->tcp_cwnd = cwnd; + if (tcp->tcp_unsent > 0) + flags |= TH_XMIT_NEEDED; + } + } + } + } else if (tcp->tcp_zero_win_probe) { + /* + * If the window has opened, need to arrange + * to send additional data. 
+ */ + if (new_swnd != 0) { + /* tcp_suna != tcp_snxt */ + /* Packet contains a window update */ + TCPS_BUMP_MIB(tcps, tcpInWinUpdate); + tcp->tcp_zero_win_probe = 0; + tcp->tcp_timer_backoff = 0; + tcp->tcp_ms_we_have_waited = 0; + + /* + * Transmit starting with tcp_suna since + * the one byte probe is not ack'ed. + * If TCP has sent more than one identical + * probe, tcp_rexmit will be set. That means + * tcp_ss_rexmit() will send out the one + * byte along with new data. Otherwise, + * fake the retransmission. + */ + flags |= TH_XMIT_NEEDED; + if (!tcp->tcp_rexmit) { + tcp->tcp_rexmit = B_TRUE; + tcp->tcp_dupack_cnt = 0; + tcp->tcp_rexmit_nxt = tcp->tcp_suna; + tcp->tcp_rexmit_max = tcp->tcp_suna + 1; + } + } + } + goto swnd_update; + } + + /* + * Check for "acceptability" of ACK value per RFC 793, pages 72 - 73. + * If the ACK value acks something that we have not yet sent, it might + * be an old duplicate segment. Send an ACK to re-synchronize the + * other side. + * Note: reset in response to unacceptable ACK in SYN_RECEIVE + * state is handled above, so we can always just drop the segment and + * send an ACK here. + * + * In the case where the peer shrinks the window, we see the new window + * update, but all the data sent previously is queued up by the peer. + * To account for this, in tcp_process_shrunk_swnd(), the sequence + * number, which was already sent, and within window, is recorded. + * tcp_snxt is then updated. + * + * If the window has previously shrunk, and an ACK for data not yet + * sent, according to tcp_snxt is recieved, it may still be valid. If + * the ACK is for data within the window at the time the window was + * shrunk, then the ACK is acceptable. In this case tcp_snxt is set to + * the sequence number ACK'ed. + * + * If the ACK covers all the data sent at the time the window was + * shrunk, we can now set tcp_is_wnd_shrnk to B_FALSE. + * + * Should we send ACKs in response to ACK only segments? + */ + + if (SEQ_GT(seg_ack, tcp->tcp_snxt)) { + if ((tcp->tcp_is_wnd_shrnk) && + (SEQ_LEQ(seg_ack, tcp->tcp_snxt_shrunk))) { + uint32_t data_acked_ahead_snxt; + + data_acked_ahead_snxt = seg_ack - tcp->tcp_snxt; + tcp_update_xmit_tail(tcp, seg_ack); + tcp->tcp_unsent -= data_acked_ahead_snxt; + } else { + TCPS_BUMP_MIB(tcps, tcpInAckUnsent); + /* drop the received segment */ + freemsg(mp); + + /* + * Send back an ACK. If tcp_drop_ack_unsent_cnt is + * greater than 0, check if the number of such + * bogus ACks is greater than that count. If yes, + * don't send back any ACK. This prevents TCP from + * getting into an ACK storm if somehow an attacker + * successfully spoofs an acceptable segment to our + * peer. If this continues (count > 2 X threshold), + * we should abort this connection. + */ + if (tcp_drop_ack_unsent_cnt > 0 && + ++tcp->tcp_in_ack_unsent > + tcp_drop_ack_unsent_cnt) { + TCP_STAT(tcps, tcp_in_ack_unsent_drop); + if (tcp->tcp_in_ack_unsent > 2 * + tcp_drop_ack_unsent_cnt) { + (void) tcp_clean_death(tcp, EPROTO); + } + return; + } + mp = tcp_ack_mp(tcp); + if (mp != NULL) { + BUMP_LOCAL(tcp->tcp_obsegs); + TCPS_BUMP_MIB(tcps, tcpOutAck); + tcp_send_data(tcp, mp); + } + return; + } + } else if (tcp->tcp_is_wnd_shrnk && SEQ_GEQ(seg_ack, + tcp->tcp_snxt_shrunk)) { + tcp->tcp_is_wnd_shrnk = B_FALSE; + } + + /* + * TCP gets a new ACK, update the notsack'ed list to delete those + * blocks that are covered by this ACK. 
+ */ + if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) { + tcp_notsack_remove(&(tcp->tcp_notsack_list), seg_ack, + &(tcp->tcp_num_notsack_blk), &(tcp->tcp_cnt_notsack_list)); + } + + /* + * If we got an ACK after fast retransmit, check to see + * if it is a partial ACK. If it is not and the congestion + * window was inflated to account for the other side's + * cached packets, retract it. If it is, do Hoe's algorithm. + */ + if (tcp->tcp_dupack_cnt >= tcps->tcps_dupack_fast_retransmit) { + ASSERT(tcp->tcp_rexmit == B_FALSE); + if (SEQ_GEQ(seg_ack, tcp->tcp_rexmit_max)) { + tcp->tcp_dupack_cnt = 0; + /* + * Restore the orig tcp_cwnd_ssthresh after + * fast retransmit phase. + */ + if (tcp->tcp_cwnd > tcp->tcp_cwnd_ssthresh) { + tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh; + } + tcp->tcp_rexmit_max = seg_ack; + tcp->tcp_cwnd_cnt = 0; + tcp->tcp_snd_burst = tcp->tcp_localnet ? + TCP_CWND_INFINITE : TCP_CWND_NORMAL; + + /* + * Remove all notsack info to avoid confusion with + * the next fast retrasnmit/recovery phase. + */ + if (tcp->tcp_snd_sack_ok && + tcp->tcp_notsack_list != NULL) { + TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list, + tcp); + } + } else { + if (tcp->tcp_snd_sack_ok && + tcp->tcp_notsack_list != NULL) { + flags |= TH_NEED_SACK_REXMIT; + tcp->tcp_pipe -= mss; + if (tcp->tcp_pipe < 0) + tcp->tcp_pipe = 0; + } else { + /* + * Hoe's algorithm: + * + * Retransmit the unack'ed segment and + * restart fast recovery. Note that we + * need to scale back tcp_cwnd to the + * original value when we started fast + * recovery. This is to prevent overly + * aggressive behaviour in sending new + * segments. + */ + tcp->tcp_cwnd = tcp->tcp_cwnd_ssthresh + + tcps->tcps_dupack_fast_retransmit * mss; + tcp->tcp_cwnd_cnt = tcp->tcp_cwnd; + flags |= TH_REXMIT_NEEDED; + } + } + } else { + tcp->tcp_dupack_cnt = 0; + if (tcp->tcp_rexmit) { + /* + * TCP is retranmitting. If the ACK ack's all + * outstanding data, update tcp_rexmit_max and + * tcp_rexmit_nxt. Otherwise, update tcp_rexmit_nxt + * to the correct value. + * + * Note that SEQ_LEQ() is used. This is to avoid + * unnecessary fast retransmit caused by dup ACKs + * received when TCP does slow start retransmission + * after a time out. During this phase, TCP may + * send out segments which are already received. + * This causes dup ACKs to be sent back. + */ + if (SEQ_LEQ(seg_ack, tcp->tcp_rexmit_max)) { + if (SEQ_GT(seg_ack, tcp->tcp_rexmit_nxt)) { + tcp->tcp_rexmit_nxt = seg_ack; + } + if (seg_ack != tcp->tcp_rexmit_max) { + flags |= TH_XMIT_NEEDED; + } + } else { + tcp->tcp_rexmit = B_FALSE; + tcp->tcp_rexmit_nxt = tcp->tcp_snxt; + tcp->tcp_snd_burst = tcp->tcp_localnet ? + TCP_CWND_INFINITE : TCP_CWND_NORMAL; + } + tcp->tcp_ms_we_have_waited = 0; + } + } + + TCPS_BUMP_MIB(tcps, tcpInAckSegs); + TCPS_UPDATE_MIB(tcps, tcpInAckBytes, bytes_acked); + tcp->tcp_suna = seg_ack; + if (tcp->tcp_zero_win_probe != 0) { + tcp->tcp_zero_win_probe = 0; + tcp->tcp_timer_backoff = 0; + } + + /* + * If tcp_xmit_head is NULL, then it must be the FIN being ack'ed. + * Note that it cannot be the SYN being ack'ed. The code flow + * will not reach here. + */ + if (mp1 == NULL) { + goto fin_acked; + } + + /* + * Update the congestion window. + * + * If TCP is not ECN capable or TCP is ECN capable but the + * congestion experience bit is not set, increase the tcp_cwnd as + * usual. 
+ */ + if (!tcp->tcp_ecn_ok || !(flags & TH_ECE)) { + cwnd = tcp->tcp_cwnd; + add = mss; + + if (cwnd >= tcp->tcp_cwnd_ssthresh) { + /* + * This is to prevent an increase of less than 1 MSS of + * tcp_cwnd. With partial increase, tcp_wput_data() + * may send out tinygrams in order to preserve mblk + * boundaries. + * + * By initializing tcp_cwnd_cnt to new tcp_cwnd and + * decrementing it by 1 MSS for every ACKs, tcp_cwnd is + * increased by 1 MSS for every RTTs. + */ + if (tcp->tcp_cwnd_cnt <= 0) { + tcp->tcp_cwnd_cnt = cwnd + add; + } else { + tcp->tcp_cwnd_cnt -= add; + add = 0; + } + } + tcp->tcp_cwnd = MIN(cwnd + add, tcp->tcp_cwnd_max); + } + + /* See if the latest urgent data has been acknowledged */ + if ((tcp->tcp_valid_bits & TCP_URG_VALID) && + SEQ_GT(seg_ack, tcp->tcp_urg)) + tcp->tcp_valid_bits &= ~TCP_URG_VALID; + + /* Can we update the RTT estimates? */ + if (tcp->tcp_snd_ts_ok) { + /* Ignore zero timestamp echo-reply. */ + if (tcpopt.tcp_opt_ts_ecr != 0) { + tcp_set_rto(tcp, (int32_t)LBOLT_FASTPATH - + (int32_t)tcpopt.tcp_opt_ts_ecr); + } + + /* If needed, restart the timer. */ + if (tcp->tcp_set_timer == 1) { + TCP_TIMER_RESTART(tcp, tcp->tcp_rto); + tcp->tcp_set_timer = 0; + } + /* + * Update tcp_csuna in case the other side stops sending + * us timestamps. + */ + tcp->tcp_csuna = tcp->tcp_snxt; + } else if (SEQ_GT(seg_ack, tcp->tcp_csuna)) { + /* + * An ACK sequence we haven't seen before, so get the RTT + * and update the RTO. But first check if the timestamp is + * valid to use. + */ + if ((mp1->b_next != NULL) && + SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next))) + tcp_set_rto(tcp, (int32_t)LBOLT_FASTPATH - + (int32_t)(intptr_t)mp1->b_prev); + else + TCPS_BUMP_MIB(tcps, tcpRttNoUpdate); + + /* Remeber the last sequence to be ACKed */ + tcp->tcp_csuna = seg_ack; + if (tcp->tcp_set_timer == 1) { + TCP_TIMER_RESTART(tcp, tcp->tcp_rto); + tcp->tcp_set_timer = 0; + } + } else { + TCPS_BUMP_MIB(tcps, tcpRttNoUpdate); + } + + /* Eat acknowledged bytes off the xmit queue. */ + for (;;) { + mblk_t *mp2; + uchar_t *wptr; + + wptr = mp1->b_wptr; + ASSERT((uintptr_t)(wptr - mp1->b_rptr) <= (uintptr_t)INT_MAX); + bytes_acked -= (int)(wptr - mp1->b_rptr); + if (bytes_acked < 0) { + mp1->b_rptr = wptr + bytes_acked; + /* + * Set a new timestamp if all the bytes timed by the + * old timestamp have been ack'ed. + */ + if (SEQ_GT(seg_ack, + (uint32_t)(uintptr_t)(mp1->b_next))) { + mp1->b_prev = + (mblk_t *)(uintptr_t)LBOLT_FASTPATH; + mp1->b_next = NULL; + } + break; + } + mp1->b_next = NULL; + mp1->b_prev = NULL; + mp2 = mp1; + mp1 = mp1->b_cont; + + /* + * This notification is required for some zero-copy + * clients to maintain a copy semantic. After the data + * is ack'ed, client is safe to modify or reuse the buffer. + */ + if (tcp->tcp_snd_zcopy_aware && + (mp2->b_datap->db_struioflag & STRUIO_ZCNOTIFY)) + tcp_zcopy_notify(tcp); + freeb(mp2); + if (bytes_acked == 0) { + if (mp1 == NULL) { + /* Everything is ack'ed, clear the tail. */ + tcp->tcp_xmit_tail = NULL; + /* + * Cancel the timer unless we are still + * waiting for an ACK for the FIN packet. 
+ */
+ if (tcp->tcp_timer_tid != 0 &&
+ tcp->tcp_snxt == tcp->tcp_suna) {
+ (void) TCP_TIMER_CANCEL(tcp,
+ tcp->tcp_timer_tid);
+ tcp->tcp_timer_tid = 0;
+ }
+ goto pre_swnd_update;
+ }
+ if (mp2 != tcp->tcp_xmit_tail)
+ break;
+ tcp->tcp_xmit_tail = mp1;
+ ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <=
+ (uintptr_t)INT_MAX);
+ tcp->tcp_xmit_tail_unsent = (int)(mp1->b_wptr -
+ mp1->b_rptr);
+ break;
+ }
+ if (mp1 == NULL) {
+ /*
+ * More was acked but there is nothing more
+ * outstanding. This means that the FIN was
+ * just acked or that we're talking to a clown.
+ */
+fin_acked:
+ ASSERT(tcp->tcp_fin_sent);
+ tcp->tcp_xmit_tail = NULL;
+ if (tcp->tcp_fin_sent) {
+ /* FIN was acked - making progress */
+ if (!tcp->tcp_fin_acked)
+ tcp->tcp_ip_forward_progress = B_TRUE;
+ tcp->tcp_fin_acked = B_TRUE;
+ if (tcp->tcp_linger_tid != 0 &&
+ TCP_TIMER_CANCEL(tcp,
+ tcp->tcp_linger_tid) >= 0) {
+ tcp_stop_lingering(tcp);
+ freemsg(mp);
+ mp = NULL;
+ }
+ } else {
+ /*
+ * We should never get here because
+ * we have already checked that the
+ * number of bytes ack'ed should be
+ * smaller than or equal to what we
+ * have sent so far (it is the
+ * acceptability check of the ACK).
+ * We can only get here if the send
+ * queue is corrupted.
+ *
+ * Terminate the connection and
+ * panic the system. It is better
+ * for us to panic instead of
+ * continuing, to avoid further disaster.
+ */
+ tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt,
+ tcp->tcp_rnxt, TH_RST|TH_ACK);
+ panic("Memory corruption "
+ "detected for connection %s.",
+ tcp_display(tcp, NULL,
+ DISP_ADDR_AND_PORT));
+ /*NOTREACHED*/
+ }
+ goto pre_swnd_update;
+ }
+ ASSERT(mp2 != tcp->tcp_xmit_tail);
+ }
+ if (tcp->tcp_unsent) {
+ flags |= TH_XMIT_NEEDED;
+ }
+pre_swnd_update:
+ tcp->tcp_xmit_head = mp1;
+swnd_update:
+ /*
+ * The following check is different from most other implementations.
+ * For bi-directional transfer, when segments are dropped, the
+ * "normal" check will not accept a window update in those
+ * retransmitted segments. Failing to do that, TCP may send out
+ * segments which are outside the receiver's window. As TCP accepts
+ * the ack in those retransmitted segments, if the window update in
+ * the same segment is not accepted, TCP will incorrectly calculate
+ * that it can send more segments. This can create a deadlock
+ * with the receiver if its window becomes zero.
+ */
+ if (SEQ_LT(tcp->tcp_swl2, seg_ack) ||
+ SEQ_LT(tcp->tcp_swl1, seg_seq) ||
+ (tcp->tcp_swl1 == seg_seq && new_swnd > tcp->tcp_swnd)) {
+ /*
+ * The criteria for update are:
+ *
+ * 1. the segment acknowledges some data. Or
+ * 2. the segment is new, i.e. it has a higher seq num. Or
+ * 3. the segment is not old and the advertised window is
+ * larger than the previous advertised window.
+ */
+ if (tcp->tcp_unsent && new_swnd > tcp->tcp_swnd)
+ flags |= TH_XMIT_NEEDED;
+ tcp->tcp_swnd = new_swnd;
+ if (new_swnd > tcp->tcp_max_swnd)
+ tcp->tcp_max_swnd = new_swnd;
+ tcp->tcp_swl1 = seg_seq;
+ tcp->tcp_swl2 = seg_ack;
+ }
+est:
+ if (tcp->tcp_state > TCPS_ESTABLISHED) {
+
+ switch (tcp->tcp_state) {
+ case TCPS_FIN_WAIT_1:
+ if (tcp->tcp_fin_acked) {
+ tcp->tcp_state = TCPS_FIN_WAIT_2;
+ /*
+ * We implement the non-standard BSD/SunOS
+ * FIN_WAIT_2 flushing algorithm.
+ * If there is no user attached to this
+ * TCP endpoint, then this TCP struct
+ * could hang around forever in FIN_WAIT_2
+ * state if the peer forgets to send us
+ * a FIN. To prevent this, we wait only
+ * 2*MSL (a convenient time value) for
+ * the FIN to arrive. 
If it doesn't show up, + * we flush the TCP endpoint. This algorithm, + * though a violation of RFC-793, has worked + * for over 10 years in BSD systems. + * Note: SunOS 4.x waits 675 seconds before + * flushing the FIN_WAIT_2 connection. + */ + TCP_TIMER_RESTART(tcp, + tcps->tcps_fin_wait_2_flush_interval); + } + break; + case TCPS_FIN_WAIT_2: + break; /* Shutdown hook? */ + case TCPS_LAST_ACK: + freemsg(mp); + if (tcp->tcp_fin_acked) { + (void) tcp_clean_death(tcp, 0); + return; + } + goto xmit_check; + case TCPS_CLOSING: + if (tcp->tcp_fin_acked) + SET_TIME_WAIT(tcps, tcp, connp); + /*FALLTHRU*/ + case TCPS_CLOSE_WAIT: + freemsg(mp); + goto xmit_check; + default: + ASSERT(tcp->tcp_state != TCPS_TIME_WAIT); + break; + } + } + if (flags & TH_FIN) { + /* Make sure we ack the fin */ + flags |= TH_ACK_NEEDED; + if (!tcp->tcp_fin_rcvd) { + tcp->tcp_fin_rcvd = B_TRUE; + tcp->tcp_rnxt++; + tcpha = tcp->tcp_tcpha; + tcpha->tha_ack = htonl(tcp->tcp_rnxt); + + /* + * Generate the ordrel_ind at the end unless we + * are an eager guy. + * In the eager case tcp_rsrv will do this when run + * after tcp_accept is done. + */ + if (tcp->tcp_listener == NULL && + !TCP_IS_DETACHED(tcp) && !tcp->tcp_hard_binding) + flags |= TH_ORDREL_NEEDED; + switch (tcp->tcp_state) { + case TCPS_SYN_RCVD: + case TCPS_ESTABLISHED: + tcp->tcp_state = TCPS_CLOSE_WAIT; + /* Keepalive? */ + break; + case TCPS_FIN_WAIT_1: + if (!tcp->tcp_fin_acked) { + tcp->tcp_state = TCPS_CLOSING; + break; + } + /* FALLTHRU */ + case TCPS_FIN_WAIT_2: + SET_TIME_WAIT(tcps, tcp, connp); + if (seg_len) { + /* + * implies data piggybacked on FIN. + * break to handle data. + */ + break; + } + freemsg(mp); + goto ack_check; + } + } + } + if (mp == NULL) + goto xmit_check; + if (seg_len == 0) { + freemsg(mp); + goto xmit_check; + } + if (mp->b_rptr == mp->b_wptr) { + /* + * The header has been consumed, so we remove the + * zero-length mblk here. + */ + mp1 = mp; + mp = mp->b_cont; + freeb(mp1); + } +update_ack: + tcpha = tcp->tcp_tcpha; + tcp->tcp_rack_cnt++; + { + uint32_t cur_max; + + cur_max = tcp->tcp_rack_cur_max; + if (tcp->tcp_rack_cnt >= cur_max) { + /* + * We have more unacked data than we should - send + * an ACK now. + */ + flags |= TH_ACK_NEEDED; + cur_max++; + if (cur_max > tcp->tcp_rack_abs_max) + tcp->tcp_rack_cur_max = tcp->tcp_rack_abs_max; + else + tcp->tcp_rack_cur_max = cur_max; + } else if (TCP_IS_DETACHED(tcp)) { + /* We don't have an ACK timer for detached TCP. */ + flags |= TH_ACK_NEEDED; + } else if (seg_len < mss) { + /* + * If we get a segment that is less than an mss, and we + * already have unacknowledged data, and the amount + * unacknowledged is not a multiple of mss, then we + * better generate an ACK now. Otherwise, this may be + * the tail piece of a transaction, and we would rather + * wait for the response. 
+ */ + uint32_t udif; + ASSERT((uintptr_t)(tcp->tcp_rnxt - tcp->tcp_rack) <= + (uintptr_t)INT_MAX); + udif = (int)(tcp->tcp_rnxt - tcp->tcp_rack); + if (udif && (udif % mss)) + flags |= TH_ACK_NEEDED; + else + flags |= TH_ACK_TIMER_NEEDED; + } else { + /* Start delayed ack timer */ + flags |= TH_ACK_TIMER_NEEDED; + } + } + tcp->tcp_rnxt += seg_len; + tcpha->tha_ack = htonl(tcp->tcp_rnxt); + + if (mp == NULL) + goto xmit_check; + + /* Update SACK list */ + if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { + tcp_sack_remove(tcp->tcp_sack_list, tcp->tcp_rnxt, + &(tcp->tcp_num_sack_blk)); + } + + if (tcp->tcp_urp_mp) { + tcp->tcp_urp_mp->b_cont = mp; + mp = tcp->tcp_urp_mp; + tcp->tcp_urp_mp = NULL; + /* Ready for a new signal. */ + tcp->tcp_urp_last_valid = B_FALSE; +#ifdef DEBUG + (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, + "tcp_rput: sending exdata_ind %s", + tcp_display(tcp, NULL, DISP_PORT_ONLY)); +#endif /* DEBUG */ + } + + /* + * Check for ancillary data changes compared to last segment. + */ + if (connp->conn_recv_ancillary.crb_all != 0) { + mp = tcp_input_add_ancillary(tcp, mp, &ipp, ira); + if (mp == NULL) + return; + } + + if (tcp->tcp_listener != NULL || tcp->tcp_hard_binding) { + /* + * Side queue inbound data until the accept happens. + * tcp_accept/tcp_rput drains this when the accept happens. + * M_DATA is queued on b_cont. Otherwise (T_OPTDATA_IND or + * T_EXDATA_IND) it is queued on b_next. + * XXX Make urgent data use this. Requires: + * Removing tcp_listener check for TH_URG + * Making M_PCPROTO and MARK messages skip the eager case + */ + + if (tcp->tcp_kssl_pending) { + DTRACE_PROBE1(kssl_mblk__ksslinput_pending, + mblk_t *, mp); + tcp_kssl_input(tcp, mp, ira->ira_cred); + } else { + tcp_rcv_enqueue(tcp, mp, seg_len, ira->ira_cred); + } + } else if (IPCL_IS_NONSTR(connp)) { + /* + * Non-STREAMS socket + * + * Note that no KSSL processing is done here, because + * KSSL is not supported for non-STREAMS sockets. + */ + boolean_t push = flags & (TH_PUSH|TH_FIN); + int error; + + if ((*connp->conn_upcalls->su_recv)( + connp->conn_upper_handle, + mp, seg_len, 0, &error, &push) <= 0) { + /* + * We should never be in middle of a + * fallback, the squeue guarantees that. + */ + ASSERT(error != EOPNOTSUPP); + if (error == ENOSPC) + tcp->tcp_rwnd -= seg_len; + } else if (push) { + /* PUSH bit set and sockfs is not flow controlled */ + flags |= tcp_rwnd_reopen(tcp); + } + } else { + /* STREAMS socket */ + if (mp->b_datap->db_type != M_DATA || + (flags & TH_MARKNEXT_NEEDED)) { + if (tcp->tcp_rcv_list != NULL) { + flags |= tcp_rcv_drain(tcp); + } + ASSERT(tcp->tcp_rcv_list == NULL || + tcp->tcp_fused_sigurg); + + if (flags & TH_MARKNEXT_NEEDED) { +#ifdef DEBUG + (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, + "tcp_rput: sending MSGMARKNEXT %s", + tcp_display(tcp, NULL, + DISP_PORT_ONLY)); +#endif /* DEBUG */ + mp->b_flag |= MSGMARKNEXT; + flags &= ~TH_MARKNEXT_NEEDED; + } + + /* Does this need SSL processing first? */ + if ((tcp->tcp_kssl_ctx != NULL) && + (DB_TYPE(mp) == M_DATA)) { + DTRACE_PROBE1(kssl_mblk__ksslinput_data1, + mblk_t *, mp); + tcp_kssl_input(tcp, mp, ira->ira_cred); + } else { + if (is_system_labeled()) + tcp_setcred_data(mp, ira); + + putnext(connp->conn_rq, mp); + if (!canputnext(connp->conn_rq)) + tcp->tcp_rwnd -= seg_len; + } + } else if ((tcp->tcp_kssl_ctx != NULL) && + (DB_TYPE(mp) == M_DATA)) { + /* Does this need SSL processing first? 
*/ + DTRACE_PROBE1(kssl_mblk__ksslinput_data2, mblk_t *, mp); + tcp_kssl_input(tcp, mp, ira->ira_cred); + } else if ((flags & (TH_PUSH|TH_FIN)) || + tcp->tcp_rcv_cnt + seg_len >= connp->conn_rcvbuf >> 3) { + if (tcp->tcp_rcv_list != NULL) { + /* + * Enqueue the new segment first and then + * call tcp_rcv_drain() to send all data + * up. The other way to do this is to + * send all queued data up and then call + * putnext() to send the new segment up. + * This way can remove the else part later + * on. + * + * We don't do this to avoid one more call to + * canputnext() as tcp_rcv_drain() needs to + * call canputnext(). + */ + tcp_rcv_enqueue(tcp, mp, seg_len, + ira->ira_cred); + flags |= tcp_rcv_drain(tcp); + } else { + if (is_system_labeled()) + tcp_setcred_data(mp, ira); + + putnext(connp->conn_rq, mp); + if (!canputnext(connp->conn_rq)) + tcp->tcp_rwnd -= seg_len; + } + } else { + /* + * Enqueue all packets when processing an mblk + * from the co queue and also enqueue normal packets. + */ + tcp_rcv_enqueue(tcp, mp, seg_len, ira->ira_cred); + } + /* + * Make sure the timer is running if we have data waiting + * for a push bit. This provides resiliency against + * implementations that do not correctly generate push bits. + */ + if (tcp->tcp_rcv_list != NULL && tcp->tcp_push_tid == 0) { + /* + * The connection may be closed at this point, so don't + * do anything for a detached tcp. + */ + if (!TCP_IS_DETACHED(tcp)) + tcp->tcp_push_tid = TCP_TIMER(tcp, + tcp_push_timer, + MSEC_TO_TICK( + tcps->tcps_push_timer_interval)); + } + } + +xmit_check: + /* Is there anything left to do? */ + ASSERT(!(flags & TH_MARKNEXT_NEEDED)); + if ((flags & (TH_REXMIT_NEEDED|TH_XMIT_NEEDED|TH_ACK_NEEDED| + TH_NEED_SACK_REXMIT|TH_LIMIT_XMIT|TH_ACK_TIMER_NEEDED| + TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0) + goto done; + + /* Any transmit work to do and a non-zero window? */ + if ((flags & (TH_REXMIT_NEEDED|TH_XMIT_NEEDED|TH_NEED_SACK_REXMIT| + TH_LIMIT_XMIT)) && tcp->tcp_swnd != 0) { + if (flags & TH_REXMIT_NEEDED) { + uint32_t snd_size = tcp->tcp_snxt - tcp->tcp_suna; + + TCPS_BUMP_MIB(tcps, tcpOutFastRetrans); + if (snd_size > mss) + snd_size = mss; + if (snd_size > tcp->tcp_swnd) + snd_size = tcp->tcp_swnd; + mp1 = tcp_xmit_mp(tcp, tcp->tcp_xmit_head, snd_size, + NULL, NULL, tcp->tcp_suna, B_TRUE, &snd_size, + B_TRUE); + + if (mp1 != NULL) { + tcp->tcp_xmit_head->b_prev = + (mblk_t *)LBOLT_FASTPATH; + tcp->tcp_csuna = tcp->tcp_snxt; + TCPS_BUMP_MIB(tcps, tcpRetransSegs); + TCPS_UPDATE_MIB(tcps, tcpRetransBytes, + snd_size); + tcp_send_data(tcp, mp1); + } + } + if (flags & TH_NEED_SACK_REXMIT) { + tcp_sack_rexmit(tcp, &flags); + } + /* + * For TH_LIMIT_XMIT, tcp_wput_data() is called to send + * out new segment. Note that tcp_rexmit should not be + * set, otherwise TH_LIMIT_XMIT should not be set. + */ + if (flags & (TH_XMIT_NEEDED|TH_LIMIT_XMIT)) { + if (!tcp->tcp_rexmit) { + tcp_wput_data(tcp, NULL, B_FALSE); + } else { + tcp_ss_rexmit(tcp); + } + } + /* + * Adjust tcp_cwnd back to normal value after sending + * new data segments. + */ + if (flags & TH_LIMIT_XMIT) { + tcp->tcp_cwnd -= mss << (tcp->tcp_dupack_cnt - 1); + /* + * This will restart the timer. Restarting the + * timer is used to avoid a timeout before the + * limited transmitted segment's ACK gets back. + */ + if (tcp->tcp_xmit_head != NULL) + tcp->tcp_xmit_head->b_prev = + (mblk_t *)LBOLT_FASTPATH; + } + + /* Anything more to do? 
*/ + if ((flags & (TH_ACK_NEEDED|TH_ACK_TIMER_NEEDED| + TH_ORDREL_NEEDED|TH_SEND_URP_MARK)) == 0) + goto done; + } +ack_check: + if (flags & TH_SEND_URP_MARK) { + ASSERT(tcp->tcp_urp_mark_mp); + ASSERT(!IPCL_IS_NONSTR(connp)); + /* + * Send up any queued data and then send the mark message + */ + if (tcp->tcp_rcv_list != NULL) { + flags |= tcp_rcv_drain(tcp); + + } + ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg); + mp1 = tcp->tcp_urp_mark_mp; + tcp->tcp_urp_mark_mp = NULL; + if (is_system_labeled()) + tcp_setcred_data(mp1, ira); + + putnext(connp->conn_rq, mp1); +#ifdef DEBUG + (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, + "tcp_rput: sending zero-length %s %s", + ((mp1->b_flag & MSGMARKNEXT) ? "MSGMARKNEXT" : + "MSGNOTMARKNEXT"), + tcp_display(tcp, NULL, DISP_PORT_ONLY)); +#endif /* DEBUG */ + flags &= ~TH_SEND_URP_MARK; + } + if (flags & TH_ACK_NEEDED) { + /* + * Time to send an ack for some reason. + */ + mp1 = tcp_ack_mp(tcp); + + if (mp1 != NULL) { + tcp_send_data(tcp, mp1); + BUMP_LOCAL(tcp->tcp_obsegs); + TCPS_BUMP_MIB(tcps, tcpOutAck); + } + if (tcp->tcp_ack_tid != 0) { + (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid); + tcp->tcp_ack_tid = 0; + } + } + if (flags & TH_ACK_TIMER_NEEDED) { + /* + * Arrange for deferred ACK or push wait timeout. + * Start timer if it is not already running. + */ + if (tcp->tcp_ack_tid == 0) { + tcp->tcp_ack_tid = TCP_TIMER(tcp, tcp_ack_timer, + MSEC_TO_TICK(tcp->tcp_localnet ? + (clock_t)tcps->tcps_local_dack_interval : + (clock_t)tcps->tcps_deferred_ack_interval)); + } + } + if (flags & TH_ORDREL_NEEDED) { + /* + * Send up the ordrel_ind unless we are an eager guy. + * In the eager case tcp_rsrv will do this when run + * after tcp_accept is done. + */ + ASSERT(tcp->tcp_listener == NULL); + ASSERT(!tcp->tcp_detached); + + if (IPCL_IS_NONSTR(connp)) { + ASSERT(tcp->tcp_ordrel_mp == NULL); + tcp->tcp_ordrel_done = B_TRUE; + (*connp->conn_upcalls->su_opctl) + (connp->conn_upper_handle, SOCK_OPCTL_SHUT_RECV, 0); + goto done; + } + + if (tcp->tcp_rcv_list != NULL) { + /* + * Push any mblk(s) enqueued from co processing. + */ + flags |= tcp_rcv_drain(tcp); + } + ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg); + + mp1 = tcp->tcp_ordrel_mp; + tcp->tcp_ordrel_mp = NULL; + tcp->tcp_ordrel_done = B_TRUE; + putnext(connp->conn_rq, mp1); + } +done: + ASSERT(!(flags & TH_MARKNEXT_NEEDED)); +} + +/* + * Attach ancillary data to a received TCP segments for the + * ancillary pieces requested by the application that are + * different than they were in the previous data segment. + * + * Save the "current" values once memory allocation is ok so that + * when memory allocation fails we can just wait for the next data segment. + */ +static mblk_t * +tcp_input_add_ancillary(tcp_t *tcp, mblk_t *mp, ip_pkt_t *ipp, + ip_recv_attr_t *ira) +{ + struct T_optdata_ind *todi; + int optlen; + uchar_t *optptr; + struct T_opthdr *toh; + crb_t addflag; /* Which pieces to add */ + mblk_t *mp1; + conn_t *connp = tcp->tcp_connp; + + optlen = 0; + addflag.crb_all = 0; + /* If app asked for pktinfo and the index has changed ... */ + if (connp->conn_recv_ancillary.crb_ip_recvpktinfo && + ira->ira_ruifindex != tcp->tcp_recvifindex) { + optlen += sizeof (struct T_opthdr) + + sizeof (struct in6_pktinfo); + addflag.crb_ip_recvpktinfo = 1; + } + /* If app asked for hoplimit and it has changed ... 
*/ + if (connp->conn_recv_ancillary.crb_ipv6_recvhoplimit && + ipp->ipp_hoplimit != tcp->tcp_recvhops) { + optlen += sizeof (struct T_opthdr) + sizeof (uint_t); + addflag.crb_ipv6_recvhoplimit = 1; + } + /* If app asked for tclass and it has changed ... */ + if (connp->conn_recv_ancillary.crb_ipv6_recvtclass && + ipp->ipp_tclass != tcp->tcp_recvtclass) { + optlen += sizeof (struct T_opthdr) + sizeof (uint_t); + addflag.crb_ipv6_recvtclass = 1; + } + /* + * If app asked for hopbyhop headers and it has changed ... + * For security labels, note that (1) security labels can't change on + * a connected socket at all, (2) we're connected to at most one peer, + * (3) if anything changes, then it must be some other extra option. + */ + if (connp->conn_recv_ancillary.crb_ipv6_recvhopopts && + ip_cmpbuf(tcp->tcp_hopopts, tcp->tcp_hopoptslen, + (ipp->ipp_fields & IPPF_HOPOPTS), + ipp->ipp_hopopts, ipp->ipp_hopoptslen)) { + optlen += sizeof (struct T_opthdr) + ipp->ipp_hopoptslen; + addflag.crb_ipv6_recvhopopts = 1; + if (!ip_allocbuf((void **)&tcp->tcp_hopopts, + &tcp->tcp_hopoptslen, (ipp->ipp_fields & IPPF_HOPOPTS), + ipp->ipp_hopopts, ipp->ipp_hopoptslen)) + return (mp); + } + /* If app asked for dst headers before routing headers ... */ + if (connp->conn_recv_ancillary.crb_ipv6_recvrthdrdstopts && + ip_cmpbuf(tcp->tcp_rthdrdstopts, tcp->tcp_rthdrdstoptslen, + (ipp->ipp_fields & IPPF_RTHDRDSTOPTS), + ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen)) { + optlen += sizeof (struct T_opthdr) + + ipp->ipp_rthdrdstoptslen; + addflag.crb_ipv6_recvrthdrdstopts = 1; + if (!ip_allocbuf((void **)&tcp->tcp_rthdrdstopts, + &tcp->tcp_rthdrdstoptslen, + (ipp->ipp_fields & IPPF_RTHDRDSTOPTS), + ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen)) + return (mp); + } + /* If app asked for routing headers and it has changed ... */ + if (connp->conn_recv_ancillary.crb_ipv6_recvrthdr && + ip_cmpbuf(tcp->tcp_rthdr, tcp->tcp_rthdrlen, + (ipp->ipp_fields & IPPF_RTHDR), + ipp->ipp_rthdr, ipp->ipp_rthdrlen)) { + optlen += sizeof (struct T_opthdr) + ipp->ipp_rthdrlen; + addflag.crb_ipv6_recvrthdr = 1; + if (!ip_allocbuf((void **)&tcp->tcp_rthdr, + &tcp->tcp_rthdrlen, (ipp->ipp_fields & IPPF_RTHDR), + ipp->ipp_rthdr, ipp->ipp_rthdrlen)) + return (mp); + } + /* If app asked for dest headers and it has changed ... */ + if ((connp->conn_recv_ancillary.crb_ipv6_recvdstopts || + connp->conn_recv_ancillary.crb_old_ipv6_recvdstopts) && + ip_cmpbuf(tcp->tcp_dstopts, tcp->tcp_dstoptslen, + (ipp->ipp_fields & IPPF_DSTOPTS), + ipp->ipp_dstopts, ipp->ipp_dstoptslen)) { + optlen += sizeof (struct T_opthdr) + ipp->ipp_dstoptslen; + addflag.crb_ipv6_recvdstopts = 1; + if (!ip_allocbuf((void **)&tcp->tcp_dstopts, + &tcp->tcp_dstoptslen, (ipp->ipp_fields & IPPF_DSTOPTS), + ipp->ipp_dstopts, ipp->ipp_dstoptslen)) + return (mp); + } + + if (optlen == 0) { + /* Nothing to add */ + return (mp); + } + mp1 = allocb(sizeof (struct T_optdata_ind) + optlen, BPRI_MED); + if (mp1 == NULL) { + /* + * Defer sending ancillary data until the next TCP segment + * arrives. + */ + return (mp); + } + mp1->b_cont = mp; + mp = mp1; + mp->b_wptr += sizeof (*todi) + optlen; + mp->b_datap->db_type = M_PROTO; + todi = (struct T_optdata_ind *)mp->b_rptr; + todi->PRIM_type = T_OPTDATA_IND; + todi->DATA_flag = 1; /* MORE data */ + todi->OPT_length = optlen; + todi->OPT_offset = sizeof (*todi); + optptr = (uchar_t *)&todi[1]; + /* + * If app asked for pktinfo and the index has changed ... + * Note that the local address never changes for the connection. 
+ */ + if (addflag.crb_ip_recvpktinfo) { + struct in6_pktinfo *pkti; + uint_t ifindex; + + ifindex = ira->ira_ruifindex; + toh = (struct T_opthdr *)optptr; + toh->level = IPPROTO_IPV6; + toh->name = IPV6_PKTINFO; + toh->len = sizeof (*toh) + sizeof (*pkti); + toh->status = 0; + optptr += sizeof (*toh); + pkti = (struct in6_pktinfo *)optptr; + pkti->ipi6_addr = connp->conn_laddr_v6; + pkti->ipi6_ifindex = ifindex; + optptr += sizeof (*pkti); + ASSERT(OK_32PTR(optptr)); + /* Save as "last" value */ + tcp->tcp_recvifindex = ifindex; + } + /* If app asked for hoplimit and it has changed ... */ + if (addflag.crb_ipv6_recvhoplimit) { + toh = (struct T_opthdr *)optptr; + toh->level = IPPROTO_IPV6; + toh->name = IPV6_HOPLIMIT; + toh->len = sizeof (*toh) + sizeof (uint_t); + toh->status = 0; + optptr += sizeof (*toh); + *(uint_t *)optptr = ipp->ipp_hoplimit; + optptr += sizeof (uint_t); + ASSERT(OK_32PTR(optptr)); + /* Save as "last" value */ + tcp->tcp_recvhops = ipp->ipp_hoplimit; + } + /* If app asked for tclass and it has changed ... */ + if (addflag.crb_ipv6_recvtclass) { + toh = (struct T_opthdr *)optptr; + toh->level = IPPROTO_IPV6; + toh->name = IPV6_TCLASS; + toh->len = sizeof (*toh) + sizeof (uint_t); + toh->status = 0; + optptr += sizeof (*toh); + *(uint_t *)optptr = ipp->ipp_tclass; + optptr += sizeof (uint_t); + ASSERT(OK_32PTR(optptr)); + /* Save as "last" value */ + tcp->tcp_recvtclass = ipp->ipp_tclass; + } + if (addflag.crb_ipv6_recvhopopts) { + toh = (struct T_opthdr *)optptr; + toh->level = IPPROTO_IPV6; + toh->name = IPV6_HOPOPTS; + toh->len = sizeof (*toh) + ipp->ipp_hopoptslen; + toh->status = 0; + optptr += sizeof (*toh); + bcopy((uchar_t *)ipp->ipp_hopopts, optptr, ipp->ipp_hopoptslen); + optptr += ipp->ipp_hopoptslen; + ASSERT(OK_32PTR(optptr)); + /* Save as last value */ + ip_savebuf((void **)&tcp->tcp_hopopts, &tcp->tcp_hopoptslen, + (ipp->ipp_fields & IPPF_HOPOPTS), + ipp->ipp_hopopts, ipp->ipp_hopoptslen); + } + if (addflag.crb_ipv6_recvrthdrdstopts) { + toh = (struct T_opthdr *)optptr; + toh->level = IPPROTO_IPV6; + toh->name = IPV6_RTHDRDSTOPTS; + toh->len = sizeof (*toh) + ipp->ipp_rthdrdstoptslen; + toh->status = 0; + optptr += sizeof (*toh); + bcopy(ipp->ipp_rthdrdstopts, optptr, ipp->ipp_rthdrdstoptslen); + optptr += ipp->ipp_rthdrdstoptslen; + ASSERT(OK_32PTR(optptr)); + /* Save as last value */ + ip_savebuf((void **)&tcp->tcp_rthdrdstopts, + &tcp->tcp_rthdrdstoptslen, + (ipp->ipp_fields & IPPF_RTHDRDSTOPTS), + ipp->ipp_rthdrdstopts, ipp->ipp_rthdrdstoptslen); + } + if (addflag.crb_ipv6_recvrthdr) { + toh = (struct T_opthdr *)optptr; + toh->level = IPPROTO_IPV6; + toh->name = IPV6_RTHDR; + toh->len = sizeof (*toh) + ipp->ipp_rthdrlen; + toh->status = 0; + optptr += sizeof (*toh); + bcopy(ipp->ipp_rthdr, optptr, ipp->ipp_rthdrlen); + optptr += ipp->ipp_rthdrlen; + ASSERT(OK_32PTR(optptr)); + /* Save as last value */ + ip_savebuf((void **)&tcp->tcp_rthdr, &tcp->tcp_rthdrlen, + (ipp->ipp_fields & IPPF_RTHDR), + ipp->ipp_rthdr, ipp->ipp_rthdrlen); + } + if (addflag.crb_ipv6_recvdstopts) { + toh = (struct T_opthdr *)optptr; + toh->level = IPPROTO_IPV6; + toh->name = IPV6_DSTOPTS; + toh->len = sizeof (*toh) + ipp->ipp_dstoptslen; + toh->status = 0; + optptr += sizeof (*toh); + bcopy(ipp->ipp_dstopts, optptr, ipp->ipp_dstoptslen); + optptr += ipp->ipp_dstoptslen; + ASSERT(OK_32PTR(optptr)); + /* Save as last value */ + ip_savebuf((void **)&tcp->tcp_dstopts, &tcp->tcp_dstoptslen, + (ipp->ipp_fields & IPPF_DSTOPTS), + ipp->ipp_dstopts, ipp->ipp_dstoptslen); + } + 
ASSERT(optptr == mp->b_wptr);
+ return (mp);
+}
+
+/* The minimum of smoothed mean deviation in RTO calculation. */
+#define TCP_SD_MIN 400
+
+/*
+ * Set RTO for this connection. The formula is from Jacobson and Karels'
+ * "Congestion Avoidance and Control" in SIGCOMM '88. The variable names
+ * are the same as those in Appendix A.2 of that paper.
+ *
+ * m = new measurement
+ * sa = smoothed RTT average (8 * average estimates).
+ * sv = smoothed mean deviation (mdev) of RTT (4 * deviation estimates).
+ */
+static void
+tcp_set_rto(tcp_t *tcp, clock_t rtt)
+{
+ long m = TICK_TO_MSEC(rtt);
+ clock_t sa = tcp->tcp_rtt_sa;
+ clock_t sv = tcp->tcp_rtt_sd;
+ clock_t rto;
+ tcp_stack_t *tcps = tcp->tcp_tcps;
+
+ TCPS_BUMP_MIB(tcps, tcpRttUpdate);
+ tcp->tcp_rtt_update++;
+
+ /* A non-zero tcp_rtt_sa means there is an existing estimate to update. */
+ if (sa != 0) {
+ /*
+ * Update average estimator:
+ * new rtt = 7/8 old rtt + 1/8 Error
+ */
+
+ /* m is now Error in estimate. */
+ m -= sa >> 3;
+ if ((sa += m) <= 0) {
+ /*
+ * Don't allow the smoothed average to be negative.
+ * We use 0 to denote reinitialization of the
+ * variables.
+ */
+ sa = 1;
+ }
+
+ /*
+ * Update deviation estimator:
+ * new mdev = 3/4 old mdev + 1/4 (abs(Error) - old mdev)
+ */
+ if (m < 0)
+ m = -m;
+ m -= sv >> 2;
+ sv += m;
+ } else {
+ /*
+ * This follows BSD's implementation. So the reinitialized
+ * RTO is 3 * m. We cannot go less than 2 because if the
+ * link is bandwidth dominated, doubling the window size
+ * during slow start means doubling the RTT. We want to be
+ * more conservative when we reinitialize our estimates. 3
+ * is just a convenient number.
+ */
+ sa = m << 3;
+ sv = m << 1;
+ }
+ if (sv < TCP_SD_MIN) {
+ /*
+ * We do not know whether sa captures the delayed ACK
+ * effect, since in a long train of segments a receiver
+ * does not delay its ACKs. So set the minimum of sv
+ * to TCP_SD_MIN, which defaults to 400 ms, twice the
+ * BSD DATO. That means the minimum of the mean
+ * deviation is 100 ms.
+ */
+ sv = TCP_SD_MIN;
+ }
+ tcp->tcp_rtt_sa = sa;
+ tcp->tcp_rtt_sd = sv;
+ /*
+ * RTO = average estimates (sa / 8) + 4 * deviation estimates (sv)
+ *
+ * Add tcp_rexmit_interval_extra in case of an extreme environment
+ * where the algorithm fails to work. The default value of
+ * tcp_rexmit_interval_extra should be 0.
+ *
+ * As we use a finer grained clock than BSD and update
+ * RTO for every ACK, add in another .25 of RTT to the
+ * deviation of RTO to accommodate burstiness of 1/4 of
+ * the window size.
+ */
+ rto = (sa >> 3) + sv + tcps->tcps_rexmit_interval_extra + (sa >> 5);
+
+ if (rto > tcps->tcps_rexmit_interval_max) {
+ tcp->tcp_rto = tcps->tcps_rexmit_interval_max;
+ } else if (rto < tcps->tcps_rexmit_interval_min) {
+ tcp->tcp_rto = tcps->tcps_rexmit_interval_min;
+ } else {
+ tcp->tcp_rto = rto;
+ }
+
+ /* Now, we can reset tcp_timer_backoff to use the new RTO... */
+ tcp->tcp_timer_backoff = 0;
+}
+
+/*
+ * On a labeled system we have some protocols above TCP, such as RPC, which
+ * appear to assume that every mblk in a chain has a db_credp.
+ */
+static void
+tcp_setcred_data(mblk_t *mp, ip_recv_attr_t *ira)
+{
+ ASSERT(is_system_labeled());
+ ASSERT(ira->ira_cred != NULL);
+
+ while (mp != NULL) {
+ mblk_setcred(mp, ira->ira_cred, NOPID);
+ mp = mp->b_cont;
+ }
+}
+
+uint_t
+tcp_rwnd_reopen(tcp_t *tcp)
+{
+ uint_t ret = 0;
+ uint_t thwin;
+ conn_t *connp = tcp->tcp_connp;
+
+ /* Learn the latest rwnd information that we sent to the other side. 
*/ + thwin = ((uint_t)ntohs(tcp->tcp_tcpha->tha_win)) + << tcp->tcp_rcv_ws; + /* This is peer's calculated send window (our receive window). */ + thwin -= tcp->tcp_rnxt - tcp->tcp_rack; + /* + * Increase the receive window to max. But we need to do receiver + * SWS avoidance. This means that we need to check the increase of + * of receive window is at least 1 MSS. + */ + if (connp->conn_rcvbuf - thwin >= tcp->tcp_mss) { + /* + * If the window that the other side knows is less than max + * deferred acks segments, send an update immediately. + */ + if (thwin < tcp->tcp_rack_cur_max * tcp->tcp_mss) { + TCPS_BUMP_MIB(tcp->tcp_tcps, tcpOutWinUpdate); + ret = TH_ACK_NEEDED; + } + tcp->tcp_rwnd = connp->conn_rcvbuf; + } + return (ret); +} + +/* + * Handle a packet that has been reclassified by TCP. + * This function drops the ref on connp that the caller had. + */ +void +tcp_reinput(conn_t *connp, mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst) +{ + ipsec_stack_t *ipss = ipst->ips_netstack->netstack_ipsec; + + if (connp->conn_incoming_ifindex != 0 && + connp->conn_incoming_ifindex != ira->ira_ruifindex) { + freemsg(mp); + CONN_DEC_REF(connp); + return; + } + + if (CONN_INBOUND_POLICY_PRESENT_V6(connp, ipss) || + (ira->ira_flags & IRAF_IPSEC_SECURE)) { + ip6_t *ip6h; + ipha_t *ipha; + + if (ira->ira_flags & IRAF_IS_IPV4) { + ipha = (ipha_t *)mp->b_rptr; + ip6h = NULL; + } else { + ipha = NULL; + ip6h = (ip6_t *)mp->b_rptr; + } + mp = ipsec_check_inbound_policy(mp, connp, ipha, ip6h, ira); + if (mp == NULL) { + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards); + /* Note that mp is NULL */ + ip_drop_input("ipIfStatsInDiscards", mp, NULL); + CONN_DEC_REF(connp); + return; + } + } + + if (IPCL_IS_TCP(connp)) { + /* + * do not drain, certain use cases can blow + * the stack + */ + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, + connp->conn_recv, connp, ira, + SQ_NODRAIN, SQTAG_IP_TCP_INPUT); + } else { + /* Not TCP; must be SOCK_RAW, IPPROTO_TCP */ + (connp->conn_recv)(connp, mp, NULL, + ira); + CONN_DEC_REF(connp); + } + +} + +/* ARGSUSED */ +static void +tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) +{ + conn_t *connp = (conn_t *)arg; + tcp_t *tcp = connp->conn_tcp; + queue_t *q = connp->conn_rq; + + ASSERT(!IPCL_IS_NONSTR(connp)); + mutex_enter(&tcp->tcp_rsrv_mp_lock); + tcp->tcp_rsrv_mp = mp; + mutex_exit(&tcp->tcp_rsrv_mp_lock); + + if (TCP_IS_DETACHED(tcp) || q == NULL) { + return; + } + + if (tcp->tcp_fused) { + tcp_fuse_backenable(tcp); + return; + } + + if (canputnext(q)) { + /* Not flow-controlled, open rwnd */ + tcp->tcp_rwnd = connp->conn_rcvbuf; + + /* + * Send back a window update immediately if TCP is above + * ESTABLISHED state and the increase of the rcv window + * that the other side knows is at least 1 MSS after flow + * control is lifted. + */ + if (tcp->tcp_state >= TCPS_ESTABLISHED && + tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) { + tcp_xmit_ctl(NULL, tcp, + (tcp->tcp_swnd == 0) ? tcp->tcp_suna : + tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK); + } + } +} + +/* + * The read side service routine is called mostly when we get back-enabled as a + * result of flow control relief. Since we don't actually queue anything in + * TCP, we have no data to send out of here. What we do is clear the receive + * window, and send out a window update. 
+ */ +void +tcp_rsrv(queue_t *q) +{ + conn_t *connp = Q_TO_CONN(q); + tcp_t *tcp = connp->conn_tcp; + mblk_t *mp; + + /* No code does a putq on the read side */ + ASSERT(q->q_first == NULL); + + /* + * If tcp->tcp_rsrv_mp == NULL, it means that tcp_rsrv() has already + * been run. So just return. + */ + mutex_enter(&tcp->tcp_rsrv_mp_lock); + if ((mp = tcp->tcp_rsrv_mp) == NULL) { + mutex_exit(&tcp->tcp_rsrv_mp_lock); + return; + } + tcp->tcp_rsrv_mp = NULL; + mutex_exit(&tcp->tcp_rsrv_mp_lock); + + CONN_INC_REF(connp); + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_rsrv_input, connp, + NULL, SQ_PROCESS, SQTAG_TCP_RSRV); +} + +/* At minimum we need 8 bytes in the TCP header for the lookup */ +#define ICMP_MIN_TCP_HDR 8 + +/* + * tcp_icmp_input is called as conn_recvicmp to process ICMP error messages + * passed up by IP. The message is always received on the correct tcp_t. + * Assumes that IP has pulled up everything up to and including the ICMP header. + */ +/* ARGSUSED2 */ +void +tcp_icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) +{ + conn_t *connp = (conn_t *)arg1; + icmph_t *icmph; + ipha_t *ipha; + int iph_hdr_length; + tcpha_t *tcpha; + uint32_t seg_seq; + tcp_t *tcp = connp->conn_tcp; + + /* Assume IP provides aligned packets */ + ASSERT(OK_32PTR(mp->b_rptr)); + ASSERT((MBLKL(mp) >= sizeof (ipha_t))); + + /* + * Verify IP version. Anything other than IPv4 or IPv6 packet is sent + * upstream. ICMPv6 is handled in tcp_icmp_error_ipv6. + */ + if (!(ira->ira_flags & IRAF_IS_IPV4)) { + tcp_icmp_error_ipv6(tcp, mp, ira); + return; + } + + /* Skip past the outer IP and ICMP headers */ + iph_hdr_length = ira->ira_ip_hdr_length; + icmph = (icmph_t *)&mp->b_rptr[iph_hdr_length]; + /* + * If we don't have the correct outer IP header length + * or if we don't have a complete inner IP header + * drop it. + */ + if (iph_hdr_length < sizeof (ipha_t) || + (ipha_t *)&icmph[1] + 1 > (ipha_t *)mp->b_wptr) { +noticmpv4: + freemsg(mp); + return; + } + ipha = (ipha_t *)&icmph[1]; + + /* Skip past the inner IP and find the ULP header */ + iph_hdr_length = IPH_HDR_LENGTH(ipha); + tcpha = (tcpha_t *)((char *)ipha + iph_hdr_length); + /* + * If we don't have the correct inner IP header length or if the ULP + * is not IPPROTO_TCP or if we don't have at least ICMP_MIN_TCP_HDR + * bytes of TCP header, drop it. + */ + if (iph_hdr_length < sizeof (ipha_t) || + ipha->ipha_protocol != IPPROTO_TCP || + (uchar_t *)tcpha + ICMP_MIN_TCP_HDR > mp->b_wptr) { + goto noticmpv4; + } + + seg_seq = ntohl(tcpha->tha_seq); + switch (icmph->icmph_type) { + case ICMP_DEST_UNREACHABLE: + switch (icmph->icmph_code) { + case ICMP_FRAGMENTATION_NEEDED: + /* + * Update Path MTU, then try to send something out. + */ + tcp_update_pmtu(tcp, B_TRUE); + tcp_rexmit_after_error(tcp); + break; + case ICMP_PORT_UNREACHABLE: + case ICMP_PROTOCOL_UNREACHABLE: + switch (tcp->tcp_state) { + case TCPS_SYN_SENT: + case TCPS_SYN_RCVD: + /* + * ICMP can snipe away incipient + * TCP connections as long as + * seq number is same as initial + * send seq number. + */ + if (seg_seq == tcp->tcp_iss) { + (void) tcp_clean_death(tcp, + ECONNREFUSED); + } + break; + } + break; + case ICMP_HOST_UNREACHABLE: + case ICMP_NET_UNREACHABLE: + /* Record the error in case we finally time out. 
*/ + if (icmph->icmph_code == ICMP_HOST_UNREACHABLE) + tcp->tcp_client_errno = EHOSTUNREACH; + else + tcp->tcp_client_errno = ENETUNREACH; + if (tcp->tcp_state == TCPS_SYN_RCVD) { + if (tcp->tcp_listener != NULL && + tcp->tcp_listener->tcp_syn_defense) { + /* + * Ditch the half-open connection if we + * suspect a SYN attack is under way. + */ + (void) tcp_clean_death(tcp, + tcp->tcp_client_errno); + } + } + break; + default: + break; + } + break; + case ICMP_SOURCE_QUENCH: { + /* + * use a global boolean to control + * whether TCP should respond to ICMP_SOURCE_QUENCH. + * The default is false. + */ + if (tcp_icmp_source_quench) { + /* + * Reduce the sending rate as if we got a + * retransmit timeout + */ + uint32_t npkt; + + npkt = ((tcp->tcp_snxt - tcp->tcp_suna) >> 1) / + tcp->tcp_mss; + tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * tcp->tcp_mss; + tcp->tcp_cwnd = tcp->tcp_mss; + tcp->tcp_cwnd_cnt = 0; + } + break; + } + } + freemsg(mp); +} + +/* + * tcp_icmp_error_ipv6 is called from tcp_icmp_input to process ICMPv6 + * error messages passed up by IP. + * Assumes that IP has pulled up all the extension headers as well + * as the ICMPv6 header. + */ +static void +tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp, ip_recv_attr_t *ira) +{ + icmp6_t *icmp6; + ip6_t *ip6h; + uint16_t iph_hdr_length = ira->ira_ip_hdr_length; + tcpha_t *tcpha; + uint8_t *nexthdrp; + uint32_t seg_seq; + + /* + * Verify that we have a complete IP header. + */ + ASSERT((MBLKL(mp) >= sizeof (ip6_t))); + + icmp6 = (icmp6_t *)&mp->b_rptr[iph_hdr_length]; + ip6h = (ip6_t *)&icmp6[1]; + /* + * Verify if we have a complete ICMP and inner IP header. + */ + if ((uchar_t *)&ip6h[1] > mp->b_wptr) { +noticmpv6: + freemsg(mp); + return; + } + + if (!ip_hdr_length_nexthdr_v6(mp, ip6h, &iph_hdr_length, &nexthdrp)) + goto noticmpv6; + tcpha = (tcpha_t *)((char *)ip6h + iph_hdr_length); + /* + * Validate inner header. If the ULP is not IPPROTO_TCP or if we don't + * have at least ICMP_MIN_TCP_HDR bytes of TCP header drop the + * packet. + */ + if ((*nexthdrp != IPPROTO_TCP) || + ((uchar_t *)tcpha + ICMP_MIN_TCP_HDR) > mp->b_wptr) { + goto noticmpv6; + } + + seg_seq = ntohl(tcpha->tha_seq); + switch (icmp6->icmp6_type) { + case ICMP6_PACKET_TOO_BIG: + /* + * Update Path MTU, then try to send something out. + */ + tcp_update_pmtu(tcp, B_TRUE); + tcp_rexmit_after_error(tcp); + break; + case ICMP6_DST_UNREACH: + switch (icmp6->icmp6_code) { + case ICMP6_DST_UNREACH_NOPORT: + if (((tcp->tcp_state == TCPS_SYN_SENT) || + (tcp->tcp_state == TCPS_SYN_RCVD)) && + (seg_seq == tcp->tcp_iss)) { + (void) tcp_clean_death(tcp, ECONNREFUSED); + } + break; + case ICMP6_DST_UNREACH_ADMIN: + case ICMP6_DST_UNREACH_NOROUTE: + case ICMP6_DST_UNREACH_BEYONDSCOPE: + case ICMP6_DST_UNREACH_ADDR: + /* Record the error in case we finally time out. */ + tcp->tcp_client_errno = EHOSTUNREACH; + if (((tcp->tcp_state == TCPS_SYN_SENT) || + (tcp->tcp_state == TCPS_SYN_RCVD)) && + (seg_seq == tcp->tcp_iss)) { + if (tcp->tcp_listener != NULL && + tcp->tcp_listener->tcp_syn_defense) { + /* + * Ditch the half-open connection if we + * suspect a SYN attack is under way. 
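The (disabled-by-default) ICMP_SOURCE_QUENCH handling above reacts exactly like a retransmission timeout: slow-start threshold drops to half the data in flight, with a two-segment floor, and cwnd collapses to one MSS. A small sketch of that arithmetic, with illustrative names:

#include <stdio.h>
#include <stdint.h>

#define MAX(a, b)       ((a) > (b) ? (a) : (b))

static void
source_quench(uint32_t snxt, uint32_t suna, uint32_t mss,
    uint32_t *ssthresh, uint32_t *cwnd)
{
        /* Half the outstanding data, expressed in segments. */
        uint32_t npkt = ((snxt - suna) >> 1) / mss;

        *ssthresh = MAX(npkt, 2) * mss;
        *cwnd = mss;
}

int
main(void)
{
        uint32_t ssthresh, cwnd;

        source_quench(100000, 80000, 1460, &ssthresh, &cwnd);
        printf("ssthresh=%u cwnd=%u\n", ssthresh, cwnd);
        return (0);
}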
+ */ + (void) tcp_clean_death(tcp, + tcp->tcp_client_errno); + } + } + + + break; + default: + break; + } + break; + case ICMP6_PARAM_PROB: + /* If this corresponds to an ICMP_PROTOCOL_UNREACHABLE */ + if (icmp6->icmp6_code == ICMP6_PARAMPROB_NEXTHEADER && + (uchar_t *)ip6h + icmp6->icmp6_pptr == + (uchar_t *)nexthdrp) { + if (tcp->tcp_state == TCPS_SYN_SENT || + tcp->tcp_state == TCPS_SYN_RCVD) { + (void) tcp_clean_death(tcp, ECONNREFUSED); + } + break; + } + break; + + case ICMP6_TIME_EXCEEDED: + default: + break; + } + freemsg(mp); +} + +/* + * CALLED OUTSIDE OF SQUEUE! It can not follow any pointers that tcp might + * change. But it can refer to fields like tcp_suna and tcp_snxt. + * + * Function tcp_verifyicmp is called as conn_verifyicmp to verify the ICMP + * error messages received by IP. The message is always received on the correct + * tcp_t. + */ +/* ARGSUSED */ +boolean_t +tcp_verifyicmp(conn_t *connp, void *arg2, icmph_t *icmph, icmp6_t *icmp6, + ip_recv_attr_t *ira) +{ + tcpha_t *tcpha = (tcpha_t *)arg2; + uint32_t seq = ntohl(tcpha->tha_seq); + tcp_t *tcp = connp->conn_tcp; + + /* + * TCP sequence number contained in payload of the ICMP error message + * should be within the range SND.UNA <= SEG.SEQ < SND.NXT. Otherwise, + * the message is either a stale ICMP error, or an attack from the + * network. Fail the verification. + */ + if (SEQ_LT(seq, tcp->tcp_suna) || SEQ_GEQ(seq, tcp->tcp_snxt)) + return (B_FALSE); + + /* For "too big" we also check the ignore flag */ + if (ira->ira_flags & IRAF_IS_IPV4) { + ASSERT(icmph != NULL); + if (icmph->icmph_type == ICMP_DEST_UNREACHABLE && + icmph->icmph_code == ICMP_FRAGMENTATION_NEEDED && + tcp->tcp_tcps->tcps_ignore_path_mtu) + return (B_FALSE); + } else { + ASSERT(icmp6 != NULL); + if (icmp6->icmp6_type == ICMP6_PACKET_TOO_BIG && + tcp->tcp_tcps->tcps_ignore_path_mtu) + return (B_FALSE); + } + return (B_TRUE); +} diff --git a/usr/src/uts/common/inet/tcp/tcp_kssl.c b/usr/src/uts/common/inet/tcp/tcp_kssl.c index 5d9051aed1..0b84f2d91e 100644 --- a/usr/src/uts/common/inet/tcp/tcp_kssl.c +++ b/usr/src/uts/common/inet/tcp/tcp_kssl.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -67,8 +67,6 @@ static void tcp_kssl_input_asynch(void *, mblk_t *, void *, extern void tcp_output(void *, mblk_t *, void *, ip_recv_attr_t *); extern void tcp_send_conn_ind(void *, mblk_t *, void *); -extern int tcp_squeue_flag; - /* * tcp_input_data() calls this routine for all packet destined to a * connection to the SSL port, when the SSL kernel proxy is configured diff --git a/usr/src/uts/common/inet/tcp/tcp_misc.c b/usr/src/uts/common/inet/tcp/tcp_misc.c new file mode 100644 index 0000000000..4ec8d1d5f6 --- /dev/null +++ b/usr/src/uts/common/inet/tcp/tcp_misc.c @@ -0,0 +1,885 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/types.h> +#include <sys/strlog.h> +#include <sys/policy.h> +#include <sys/strsun.h> +#include <sys/squeue_impl.h> +#include <sys/squeue.h> + +#include <inet/common.h> +#include <inet/ip.h> +#include <inet/tcp.h> +#include <inet/tcp_impl.h> + +/* Control whether TCP can enter defensive mode when under memory pressure. */ +static boolean_t tcp_do_reclaim = B_TRUE; + +/* + * Routines related to the TCP_IOC_ABORT_CONN ioctl command. + * + * TCP_IOC_ABORT_CONN is a non-transparent ioctl command used for aborting + * TCP connections. To invoke this ioctl, a tcp_ioc_abort_conn_t structure + * (defined in tcp.h) needs to be filled in and passed into the kernel + * via an I_STR ioctl command (see streamio(7I)). The tcp_ioc_abort_conn_t + * structure contains the four-tuple of a TCP connection and a range of TCP + * states (specified by ac_start and ac_end). The use of wildcard addresses + * and ports is allowed. Connections with a matching four tuple and a state + * within the specified range will be aborted. The valid states for the + * ac_start and ac_end fields are in the range TCPS_SYN_SENT to TCPS_TIME_WAIT, + * inclusive. + * + * An application which has its connection aborted by this ioctl will receive + * an error that is dependent on the connection state at the time of the abort. + * If the connection state is < TCPS_TIME_WAIT, an application should behave as + * though a RST packet has been received. If the connection state is equal to + * TCPS_TIME_WAIT, the 2MSL timeout will immediately be canceled by the kernel + * and all resources associated with the connection will be freed. + */ +static mblk_t *tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *, tcp_t *); +static void tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *); +static void tcp_ioctl_abort_handler(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy); +static int tcp_ioctl_abort(tcp_ioc_abort_conn_t *, tcp_stack_t *tcps); +void tcp_ioctl_abort_conn(queue_t *, mblk_t *); +static int tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *, int, int *, + boolean_t, tcp_stack_t *); + +/* + * Macros used for accessing the different types of sockaddr + * structures inside a tcp_ioc_abort_conn_t. + */ +#define TCP_AC_V4LADDR(acp) ((sin_t *)&(acp)->ac_local) +#define TCP_AC_V4RADDR(acp) ((sin_t *)&(acp)->ac_remote) +#define TCP_AC_V4LOCAL(acp) (TCP_AC_V4LADDR(acp)->sin_addr.s_addr) +#define TCP_AC_V4REMOTE(acp) (TCP_AC_V4RADDR(acp)->sin_addr.s_addr) +#define TCP_AC_V4LPORT(acp) (TCP_AC_V4LADDR(acp)->sin_port) +#define TCP_AC_V4RPORT(acp) (TCP_AC_V4RADDR(acp)->sin_port) +#define TCP_AC_V6LADDR(acp) ((sin6_t *)&(acp)->ac_local) +#define TCP_AC_V6RADDR(acp) ((sin6_t *)&(acp)->ac_remote) +#define TCP_AC_V6LOCAL(acp) (TCP_AC_V6LADDR(acp)->sin6_addr) +#define TCP_AC_V6REMOTE(acp) (TCP_AC_V6RADDR(acp)->sin6_addr) +#define TCP_AC_V6LPORT(acp) (TCP_AC_V6LADDR(acp)->sin6_port) +#define TCP_AC_V6RPORT(acp) (TCP_AC_V6RADDR(acp)->sin6_port) + +/* + * Return the correct error code to mimic the behavior + * of a connection reset. 
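The TCP_IOC_ABORT_CONN description above (fill in a tcp_ioc_abort_conn_t and push it down with an I_STR ioctl) translates to a user-level caller roughly as follows. This is a sketch only: it assumes the conventional /dev/tcp stream device, that tcp_ioc_abort_conn_t, TCP_IOC_ABORT_CONN and the TCPS_* states are reachable via <inet/tcp.h> from userland, and that the caller holds the required privilege; error handling is minimal.

#include <sys/types.h>
#include <sys/stropts.h>
#include <netinet/in.h>
#include <inet/tcp.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <stdio.h>
#include <zone.h>

int
main(void)
{
        tcp_ioc_abort_conn_t ac;
        struct strioctl ic;
        struct sockaddr_in *lsin, *rsin;
        int fd;

        (void) memset(&ac, 0, sizeof (ac));
        lsin = (struct sockaddr_in *)&ac.ac_local;
        rsin = (struct sockaddr_in *)&ac.ac_remote;

        /* Abort every connection to local port 8080, any peer, any state. */
        lsin->sin_family = AF_INET;
        lsin->sin_addr.s_addr = INADDR_ANY;
        lsin->sin_port = htons(8080);
        rsin->sin_family = AF_INET;
        rsin->sin_addr.s_addr = INADDR_ANY;
        rsin->sin_port = 0;
        ac.ac_start = TCPS_SYN_SENT;
        ac.ac_end = TCPS_TIME_WAIT;
        ac.ac_zoneid = getzoneid();

        if ((fd = open("/dev/tcp", O_RDWR)) < 0) {
                perror("open /dev/tcp");
                return (1);
        }
        ic.ic_cmd = TCP_IOC_ABORT_CONN;
        ic.ic_timout = -1;
        ic.ic_len = sizeof (ac);
        ic.ic_dp = (char *)&ac;
        if (ioctl(fd, I_STR, &ic) < 0)
                perror("TCP_IOC_ABORT_CONN");
        (void) close(fd);
        return (0);
}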
+ */ +#define TCP_AC_GET_ERRCODE(state, err) { \ + switch ((state)) { \ + case TCPS_SYN_SENT: \ + case TCPS_SYN_RCVD: \ + (err) = ECONNREFUSED; \ + break; \ + case TCPS_ESTABLISHED: \ + case TCPS_FIN_WAIT_1: \ + case TCPS_FIN_WAIT_2: \ + case TCPS_CLOSE_WAIT: \ + (err) = ECONNRESET; \ + break; \ + case TCPS_CLOSING: \ + case TCPS_LAST_ACK: \ + case TCPS_TIME_WAIT: \ + (err) = 0; \ + break; \ + default: \ + (err) = ENXIO; \ + } \ + } + +/* + * Check if a tcp structure matches the info in acp. + */ +#define TCP_AC_ADDR_MATCH(acp, connp, tcp) \ + (((acp)->ac_local.ss_family == AF_INET) ? \ + ((TCP_AC_V4LOCAL((acp)) == INADDR_ANY || \ + TCP_AC_V4LOCAL((acp)) == (connp)->conn_laddr_v4) && \ + (TCP_AC_V4REMOTE((acp)) == INADDR_ANY || \ + TCP_AC_V4REMOTE((acp)) == (connp)->conn_faddr_v4) && \ + (TCP_AC_V4LPORT((acp)) == 0 || \ + TCP_AC_V4LPORT((acp)) == (connp)->conn_lport) && \ + (TCP_AC_V4RPORT((acp)) == 0 || \ + TCP_AC_V4RPORT((acp)) == (connp)->conn_fport) && \ + (acp)->ac_start <= (tcp)->tcp_state && \ + (acp)->ac_end >= (tcp)->tcp_state) : \ + ((IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6LOCAL((acp))) || \ + IN6_ARE_ADDR_EQUAL(&TCP_AC_V6LOCAL((acp)), \ + &(connp)->conn_laddr_v6)) && \ + (IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6REMOTE((acp))) || \ + IN6_ARE_ADDR_EQUAL(&TCP_AC_V6REMOTE((acp)), \ + &(connp)->conn_faddr_v6)) && \ + (TCP_AC_V6LPORT((acp)) == 0 || \ + TCP_AC_V6LPORT((acp)) == (connp)->conn_lport) && \ + (TCP_AC_V6RPORT((acp)) == 0 || \ + TCP_AC_V6RPORT((acp)) == (connp)->conn_fport) && \ + (acp)->ac_start <= (tcp)->tcp_state && \ + (acp)->ac_end >= (tcp)->tcp_state)) + +#define TCP_AC_MATCH(acp, connp, tcp) \ + (((acp)->ac_zoneid == ALL_ZONES || \ + (acp)->ac_zoneid == (connp)->conn_zoneid) ? \ + TCP_AC_ADDR_MATCH(acp, connp, tcp) : 0) + +/* + * Build a message containing a tcp_ioc_abort_conn_t structure + * which is filled in with information from acp and tp. + */ +static mblk_t * +tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *acp, tcp_t *tp) +{ + mblk_t *mp; + tcp_ioc_abort_conn_t *tacp; + + mp = allocb(sizeof (uint32_t) + sizeof (*acp), BPRI_LO); + if (mp == NULL) + return (NULL); + + *((uint32_t *)mp->b_rptr) = TCP_IOC_ABORT_CONN; + tacp = (tcp_ioc_abort_conn_t *)((uchar_t *)mp->b_rptr + + sizeof (uint32_t)); + + tacp->ac_start = acp->ac_start; + tacp->ac_end = acp->ac_end; + tacp->ac_zoneid = acp->ac_zoneid; + + if (acp->ac_local.ss_family == AF_INET) { + tacp->ac_local.ss_family = AF_INET; + tacp->ac_remote.ss_family = AF_INET; + TCP_AC_V4LOCAL(tacp) = tp->tcp_connp->conn_laddr_v4; + TCP_AC_V4REMOTE(tacp) = tp->tcp_connp->conn_faddr_v4; + TCP_AC_V4LPORT(tacp) = tp->tcp_connp->conn_lport; + TCP_AC_V4RPORT(tacp) = tp->tcp_connp->conn_fport; + } else { + tacp->ac_local.ss_family = AF_INET6; + tacp->ac_remote.ss_family = AF_INET6; + TCP_AC_V6LOCAL(tacp) = tp->tcp_connp->conn_laddr_v6; + TCP_AC_V6REMOTE(tacp) = tp->tcp_connp->conn_faddr_v6; + TCP_AC_V6LPORT(tacp) = tp->tcp_connp->conn_lport; + TCP_AC_V6RPORT(tacp) = tp->tcp_connp->conn_fport; + } + mp->b_wptr = (uchar_t *)mp->b_rptr + sizeof (uint32_t) + sizeof (*acp); + return (mp); +} + +/* + * Print a tcp_ioc_abort_conn_t structure. 
+ */ +static void +tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *acp) +{ + char lbuf[128]; + char rbuf[128]; + sa_family_t af; + in_port_t lport, rport; + ushort_t logflags; + + af = acp->ac_local.ss_family; + + if (af == AF_INET) { + (void) inet_ntop(af, (const void *)&TCP_AC_V4LOCAL(acp), + lbuf, 128); + (void) inet_ntop(af, (const void *)&TCP_AC_V4REMOTE(acp), + rbuf, 128); + lport = ntohs(TCP_AC_V4LPORT(acp)); + rport = ntohs(TCP_AC_V4RPORT(acp)); + } else { + (void) inet_ntop(af, (const void *)&TCP_AC_V6LOCAL(acp), + lbuf, 128); + (void) inet_ntop(af, (const void *)&TCP_AC_V6REMOTE(acp), + rbuf, 128); + lport = ntohs(TCP_AC_V6LPORT(acp)); + rport = ntohs(TCP_AC_V6RPORT(acp)); + } + + logflags = SL_TRACE | SL_NOTE; + /* + * Don't print this message to the console if the operation was done + * to a non-global zone. + */ + if (acp->ac_zoneid == GLOBAL_ZONEID || acp->ac_zoneid == ALL_ZONES) + logflags |= SL_CONSOLE; + (void) strlog(TCP_MOD_ID, 0, 1, logflags, + "TCP_IOC_ABORT_CONN: local = %s:%d, remote = %s:%d, " + "start = %d, end = %d\n", lbuf, lport, rbuf, rport, + acp->ac_start, acp->ac_end); +} + +/* + * Called using SQ_FILL when a message built using + * tcp_ioctl_abort_build_msg is put into a queue. + * Note that when we get here there is no wildcard in acp any more. + */ +/* ARGSUSED2 */ +static void +tcp_ioctl_abort_handler(void *arg, mblk_t *mp, void *arg2, + ip_recv_attr_t *dummy) +{ + conn_t *connp = (conn_t *)arg; + tcp_t *tcp = connp->conn_tcp; + tcp_ioc_abort_conn_t *acp; + + /* + * Don't accept any input on a closed tcp as this TCP logically does + * not exist on the system. Don't proceed further with this TCP. + * For eg. this packet could trigger another close of this tcp + * which would be disastrous for tcp_refcnt. tcp_close_detached / + * tcp_clean_death / tcp_closei_local must be called at most once + * on a TCP. + */ + if (tcp->tcp_state == TCPS_CLOSED || + tcp->tcp_state == TCPS_BOUND) { + freemsg(mp); + return; + } + + acp = (tcp_ioc_abort_conn_t *)(mp->b_rptr + sizeof (uint32_t)); + if (tcp->tcp_state <= acp->ac_end) { + /* + * If we get here, we are already on the correct + * squeue. This ioctl follows the following path + * tcp_wput -> tcp_wput_ioctl -> tcp_ioctl_abort_conn + * ->tcp_ioctl_abort->squeue_enter (if on a + * different squeue) + */ + int errcode; + + TCP_AC_GET_ERRCODE(tcp->tcp_state, errcode); + (void) tcp_clean_death(tcp, errcode); + } + freemsg(mp); +} + +/* + * Abort all matching connections on a hash chain. + */ +static int +tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *acp, int index, int *count, + boolean_t exact, tcp_stack_t *tcps) +{ + int nmatch, err = 0; + tcp_t *tcp; + MBLKP mp, last, listhead = NULL; + conn_t *tconnp; + connf_t *connfp; + ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; + + connfp = &ipst->ips_ipcl_conn_fanout[index]; + +startover: + nmatch = 0; + + mutex_enter(&connfp->connf_lock); + for (tconnp = connfp->connf_head; tconnp != NULL; + tconnp = tconnp->conn_next) { + tcp = tconnp->conn_tcp; + /* + * We are missing a check on sin6_scope_id for linklocals here, + * but current usage is just for aborting based on zoneid + * for shared-IP zones. 
+ */ + if (TCP_AC_MATCH(acp, tconnp, tcp)) { + CONN_INC_REF(tconnp); + mp = tcp_ioctl_abort_build_msg(acp, tcp); + if (mp == NULL) { + err = ENOMEM; + CONN_DEC_REF(tconnp); + break; + } + mp->b_prev = (mblk_t *)tcp; + + if (listhead == NULL) { + listhead = mp; + last = mp; + } else { + last->b_next = mp; + last = mp; + } + nmatch++; + if (exact) + break; + } + + /* Avoid holding lock for too long. */ + if (nmatch >= 500) + break; + } + mutex_exit(&connfp->connf_lock); + + /* Pass mp into the correct tcp */ + while ((mp = listhead) != NULL) { + listhead = listhead->b_next; + tcp = (tcp_t *)mp->b_prev; + mp->b_next = mp->b_prev = NULL; + SQUEUE_ENTER_ONE(tcp->tcp_connp->conn_sqp, mp, + tcp_ioctl_abort_handler, tcp->tcp_connp, NULL, + SQ_FILL, SQTAG_TCP_ABORT_BUCKET); + } + + *count += nmatch; + if (nmatch >= 500 && err == 0) + goto startover; + return (err); +} + +/* + * Abort all connections that matches the attributes specified in acp. + */ +static int +tcp_ioctl_abort(tcp_ioc_abort_conn_t *acp, tcp_stack_t *tcps) +{ + sa_family_t af; + uint32_t ports; + uint16_t *pports; + int err = 0, count = 0; + boolean_t exact = B_FALSE; /* set when there is no wildcard */ + int index = -1; + ushort_t logflags; + ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; + + af = acp->ac_local.ss_family; + + if (af == AF_INET) { + if (TCP_AC_V4REMOTE(acp) != INADDR_ANY && + TCP_AC_V4LPORT(acp) != 0 && TCP_AC_V4RPORT(acp) != 0) { + pports = (uint16_t *)&ports; + pports[1] = TCP_AC_V4LPORT(acp); + pports[0] = TCP_AC_V4RPORT(acp); + exact = (TCP_AC_V4LOCAL(acp) != INADDR_ANY); + } + } else { + if (!IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6REMOTE(acp)) && + TCP_AC_V6LPORT(acp) != 0 && TCP_AC_V6RPORT(acp) != 0) { + pports = (uint16_t *)&ports; + pports[1] = TCP_AC_V6LPORT(acp); + pports[0] = TCP_AC_V6RPORT(acp); + exact = !IN6_IS_ADDR_UNSPECIFIED(&TCP_AC_V6LOCAL(acp)); + } + } + + /* + * For cases where remote addr, local port, and remote port are non- + * wildcards, tcp_ioctl_abort_bucket will only be called once. + */ + if (index != -1) { + err = tcp_ioctl_abort_bucket(acp, index, + &count, exact, tcps); + } else { + /* + * loop through all entries for wildcard case + */ + for (index = 0; + index < ipst->ips_ipcl_conn_fanout_size; + index++) { + err = tcp_ioctl_abort_bucket(acp, index, + &count, exact, tcps); + if (err != 0) + break; + } + } + + logflags = SL_TRACE | SL_NOTE; + /* + * Don't print this message to the console if the operation was done + * to a non-global zone. + */ + if (acp->ac_zoneid == GLOBAL_ZONEID || acp->ac_zoneid == ALL_ZONES) + logflags |= SL_CONSOLE; + (void) strlog(TCP_MOD_ID, 0, 1, logflags, "TCP_IOC_ABORT_CONN: " + "aborted %d connection%c\n", count, ((count > 1) ? 's' : ' ')); + if (err == 0 && count == 0) + err = ENOENT; + return (err); +} + +/* + * Process the TCP_IOC_ABORT_CONN ioctl request. 
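The single-bucket ("exact") path in tcp_ioctl_abort() above relies on index being the conn fanout slot that a fully specified remote address and port pair hash to; in the kernel that is the job of the ipclassifier fanout hash. The following stand-in, whose hash is purely illustrative and not the ipclassifier one, only shows why an exact {remote addr, local port, remote port} lets the abort scan stop after a single bucket:

#include <stdio.h>
#include <stdint.h>

static int
conn_fanout_index(uint32_t faddr, uint16_t lport, uint16_t fport,
    uint32_t fanout_size)
{
        /* Toy hash over the remote address and both ports. */
        uint32_t h = faddr ^ (faddr >> 8) ^ ((uint32_t)lport << 16) ^ fport;

        return ((int)(h % fanout_size));
}

int
main(void)
{
        printf("%d\n", conn_fanout_index(0xc0a80001, 8080, 54321, 512));
        return (0);
}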
+ */ +void +tcp_ioctl_abort_conn(queue_t *q, mblk_t *mp) +{ + int err; + IOCP iocp; + MBLKP mp1; + sa_family_t laf, raf; + tcp_ioc_abort_conn_t *acp; + zone_t *zptr; + conn_t *connp = Q_TO_CONN(q); + zoneid_t zoneid = connp->conn_zoneid; + tcp_t *tcp = connp->conn_tcp; + tcp_stack_t *tcps = tcp->tcp_tcps; + + iocp = (IOCP)mp->b_rptr; + + if ((mp1 = mp->b_cont) == NULL || + iocp->ioc_count != sizeof (tcp_ioc_abort_conn_t)) { + err = EINVAL; + goto out; + } + + /* check permissions */ + if (secpolicy_ip_config(iocp->ioc_cr, B_FALSE) != 0) { + err = EPERM; + goto out; + } + + if (mp1->b_cont != NULL) { + freemsg(mp1->b_cont); + mp1->b_cont = NULL; + } + + acp = (tcp_ioc_abort_conn_t *)mp1->b_rptr; + laf = acp->ac_local.ss_family; + raf = acp->ac_remote.ss_family; + + /* check that a zone with the supplied zoneid exists */ + if (acp->ac_zoneid != GLOBAL_ZONEID && acp->ac_zoneid != ALL_ZONES) { + zptr = zone_find_by_id(zoneid); + if (zptr != NULL) { + zone_rele(zptr); + } else { + err = EINVAL; + goto out; + } + } + + /* + * For exclusive stacks we set the zoneid to zero + * to make TCP operate as if in the global zone. + */ + if (tcps->tcps_netstack->netstack_stackid != GLOBAL_NETSTACKID) + acp->ac_zoneid = GLOBAL_ZONEID; + + if (acp->ac_start < TCPS_SYN_SENT || acp->ac_end > TCPS_TIME_WAIT || + acp->ac_start > acp->ac_end || laf != raf || + (laf != AF_INET && laf != AF_INET6)) { + err = EINVAL; + goto out; + } + + tcp_ioctl_abort_dump(acp); + err = tcp_ioctl_abort(acp, tcps); + +out: + if (mp1 != NULL) { + freemsg(mp1); + mp->b_cont = NULL; + } + + if (err != 0) + miocnak(q, mp, 0, err); + else + miocack(q, mp, 0, 0); +} + +/* + * Timeout function to reset the TCP stack variable tcps_reclaim to false. + */ +void +tcp_reclaim_timer(void *arg) +{ + tcp_stack_t *tcps = (tcp_stack_t *)arg; + int64_t tot_conn = 0; + int i; + extern pgcnt_t lotsfree, needfree; + + for (i = 0; i < tcps->tcps_sc_cnt; i++) + tot_conn += tcps->tcps_sc[i]->tcp_sc_conn_cnt; + + /* + * This happens only when a stack is going away. tcps_reclaim_tid + * should not be reset to 0 when returning in this case. + */ + mutex_enter(&tcps->tcps_reclaim_lock); + if (!tcps->tcps_reclaim) { + mutex_exit(&tcps->tcps_reclaim_lock); + return; + } + + if ((freemem >= lotsfree + needfree) || tot_conn < maxusers) { + tcps->tcps_reclaim = B_FALSE; + tcps->tcps_reclaim_tid = 0; + } else { + /* Stay in defensive mode and restart the timer */ + tcps->tcps_reclaim_tid = timeout(tcp_reclaim_timer, + tcps, MSEC_TO_TICK(tcps->tcps_reclaim_period)); + } + mutex_exit(&tcps->tcps_reclaim_lock); +} + +/* + * Kmem reclaim call back function. When the system is under memory + * pressure, we set the TCP stack variable tcps_reclaim to true. This + * variable is reset to false after tcps_reclaim_period msecs. During this + * period, TCP will be more aggressive in aborting connections not making + * progress, meaning retransmitting for some time (tcp_early_abort seconds). + * TCP will also not accept new connection request for those listeners whose + * q or q0 is not empty. + */ +/* ARGSUSED */ +void +tcp_conn_reclaim(void *arg) +{ + netstack_handle_t nh; + netstack_t *ns; + tcp_stack_t *tcps; + extern pgcnt_t lotsfree, needfree; + + if (!tcp_do_reclaim) + return; + + /* + * The reclaim function may be called even when the system is not + * really under memory pressure. 
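The defensive ("reclaim") mode described above is entered only when two conditions line up: the system is genuinely short of memory and the stack is carrying enough connections for TCP to be a plausible cause. A minimal sketch of that predicate, with parameter names local to the sketch (in the kernel the inputs are freemem, lotsfree + needfree, the per-CPU connection counters and maxusers):

#include <stdio.h>
#include <stdint.h>

static int
should_enter_reclaim(uint64_t freemem, uint64_t lotsfree, uint64_t needfree,
    int64_t stack_conn_cnt, int64_t maxusers)
{
        if (freemem >= lotsfree + needfree)
                return (0);             /* no real memory pressure */
        return (stack_conn_cnt >= maxusers);
}

int
main(void)
{
        printf("%d\n", should_enter_reclaim(1000, 2000, 500, 4096, 2048));
        return (0);
}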
+ */ + if (freemem >= lotsfree + needfree) + return; + + netstack_next_init(&nh); + while ((ns = netstack_next(&nh)) != NULL) { + int i; + int64_t tot_conn = 0; + + tcps = ns->netstack_tcp; + + /* + * Even if the system is under memory pressure, the reason may + * not be because of TCP activity. Check the number of + * connections in each stack. If the number exceeds the + * threshold (maxusers), turn on defensive mode. + */ + for (i = 0; i < tcps->tcps_sc_cnt; i++) + tot_conn += tcps->tcps_sc[i]->tcp_sc_conn_cnt; + if (tot_conn < maxusers) { + netstack_rele(ns); + continue; + } + + mutex_enter(&tcps->tcps_reclaim_lock); + if (!tcps->tcps_reclaim) { + tcps->tcps_reclaim = B_TRUE; + tcps->tcps_reclaim_tid = timeout(tcp_reclaim_timer, + tcps, MSEC_TO_TICK(tcps->tcps_reclaim_period)); + TCP_STAT(tcps, tcp_reclaim_cnt); + } + mutex_exit(&tcps->tcps_reclaim_lock); + netstack_rele(ns); + } + netstack_next_fini(&nh); +} + +/* + * Given a tcp_stack_t and a port (in host byte order), find a listener + * configuration for that port and return the ratio. + */ +uint32_t +tcp_find_listener_conf(tcp_stack_t *tcps, in_port_t port) +{ + tcp_listener_t *tl; + uint32_t ratio = 0; + + mutex_enter(&tcps->tcps_listener_conf_lock); + for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL; + tl = list_next(&tcps->tcps_listener_conf, tl)) { + if (tl->tl_port == port) { + ratio = tl->tl_ratio; + break; + } + } + mutex_exit(&tcps->tcps_listener_conf_lock); + return (ratio); +} + +/* + * Ndd param helper routine to return the current list of listener limit + * configuration. + */ +/* ARGSUSED */ +int +tcp_listener_conf_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr) +{ + tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; + tcp_listener_t *tl; + + mutex_enter(&tcps->tcps_listener_conf_lock); + for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL; + tl = list_next(&tcps->tcps_listener_conf, tl)) { + (void) mi_mpprintf(mp, "%d:%d ", tl->tl_port, tl->tl_ratio); + } + mutex_exit(&tcps->tcps_listener_conf_lock); + return (0); +} + +/* + * Ndd param helper routine to add a new listener limit configuration. + */ +/* ARGSUSED */ +int +tcp_listener_conf_add(queue_t *q, mblk_t *mp, char *value, caddr_t cp, + cred_t *cr) +{ + tcp_listener_t *new_tl; + tcp_listener_t *tl; + long lport; + long ratio; + char *colon; + tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; + + if (ddi_strtol(value, &colon, 10, &lport) != 0 || lport <= 0 || + lport > USHRT_MAX || *colon != ':') { + return (EINVAL); + } + if (ddi_strtol(colon + 1, NULL, 10, &ratio) != 0 || ratio <= 0) + return (EINVAL); + + mutex_enter(&tcps->tcps_listener_conf_lock); + for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL; + tl = list_next(&tcps->tcps_listener_conf, tl)) { + /* There is an existing entry, so update its ratio value. */ + if (tl->tl_port == lport) { + tl->tl_ratio = ratio; + mutex_exit(&tcps->tcps_listener_conf_lock); + return (0); + } + } + + if ((new_tl = kmem_alloc(sizeof (tcp_listener_t), KM_NOSLEEP)) == + NULL) { + mutex_exit(&tcps->tcps_listener_conf_lock); + return (ENOMEM); + } + + new_tl->tl_port = lport; + new_tl->tl_ratio = ratio; + list_insert_tail(&tcps->tcps_listener_conf, new_tl); + mutex_exit(&tcps->tcps_listener_conf_lock); + return (0); +} + +/* + * Ndd param helper routine to remove a listener limit configuration. 
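The listener limit value handled by tcp_listener_conf_add() above is a "port:ratio" string, e.g. "8080:4". The user-level sketch below mirrors the same validation (port in 1..65535, ratio at least 1); how the parameter is actually exposed to administrators is not shown here.

#include <stdio.h>
#include <stdlib.h>
#include <limits.h>

static int
parse_listener_conf(const char *value, long *port, long *ratio)
{
        char *colon;

        *port = strtol(value, &colon, 10);
        if (*port <= 0 || *port > USHRT_MAX || *colon != ':')
                return (-1);
        *ratio = strtol(colon + 1, NULL, 10);
        if (*ratio <= 0)
                return (-1);
        return (0);
}

int
main(void)
{
        long port, ratio;

        if (parse_listener_conf("8080:4", &port, &ratio) == 0)
                printf("port %ld, ratio %ld\n", port, ratio);
        return (0);
}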
+ */ +/* ARGSUSED */ +int +tcp_listener_conf_del(queue_t *q, mblk_t *mp, char *value, caddr_t cp, + cred_t *cr) +{ + tcp_listener_t *tl; + long lport; + tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; + + if (ddi_strtol(value, NULL, 10, &lport) != 0 || lport <= 0 || + lport > USHRT_MAX) { + return (EINVAL); + } + mutex_enter(&tcps->tcps_listener_conf_lock); + for (tl = list_head(&tcps->tcps_listener_conf); tl != NULL; + tl = list_next(&tcps->tcps_listener_conf, tl)) { + if (tl->tl_port == lport) { + list_remove(&tcps->tcps_listener_conf, tl); + mutex_exit(&tcps->tcps_listener_conf_lock); + kmem_free(tl, sizeof (tcp_listener_t)); + return (0); + } + } + mutex_exit(&tcps->tcps_listener_conf_lock); + return (ESRCH); +} + +/* + * To remove all listener limit configuration in a tcp_stack_t. + */ +void +tcp_listener_conf_cleanup(tcp_stack_t *tcps) +{ + tcp_listener_t *tl; + + mutex_enter(&tcps->tcps_listener_conf_lock); + while ((tl = list_head(&tcps->tcps_listener_conf)) != NULL) { + list_remove(&tcps->tcps_listener_conf, tl); + kmem_free(tl, sizeof (tcp_listener_t)); + } + mutex_destroy(&tcps->tcps_listener_conf_lock); + list_destroy(&tcps->tcps_listener_conf); +} + +/* + * Call back function for CPU state change. + */ +/* ARGSUSED */ +int +tcp_cpu_update(cpu_setup_t what, int id, void *arg) +{ + cpu_t *cp; + netstack_handle_t nh; + netstack_t *ns; + tcp_stack_t *tcps; + int i; + + ASSERT(MUTEX_HELD(&cpu_lock)); + cp = cpu[id]; + + switch (what) { + case CPU_CONFIG: + case CPU_ON: + case CPU_INIT: + case CPU_CPUPART_IN: + netstack_next_init(&nh); + while ((ns = netstack_next(&nh)) != NULL) { + tcps = ns->netstack_tcp; + if (cp->cpu_seqid >= tcps->tcps_sc_cnt) { + for (i = tcps->tcps_sc_cnt; i <= cp->cpu_seqid; + i++) { + ASSERT(tcps->tcps_sc[i] == NULL); + tcps->tcps_sc[i] = kmem_zalloc( + sizeof (tcp_stats_cpu_t), KM_SLEEP); + } + membar_producer(); + tcps->tcps_sc_cnt = cp->cpu_seqid + 1; + } + netstack_rele(ns); + } + netstack_next_fini(&nh); + break; + case CPU_UNCONFIG: + case CPU_OFF: + case CPU_CPUPART_OUT: + /* Nothing to do */ + break; + default: + break; + } + return (0); +} + +/* + * Diagnostic routine used to return a string associated with the tcp state. + * Note that if the caller does not supply a buffer, it will use an internal + * static string. This means that if multiple threads call this function at + * the same time, output can be corrupted... Note also that this function + * does not check the size of the supplied buffer. The caller has to make + * sure that it is big enough. 
+ */ +char * +tcp_display(tcp_t *tcp, char *sup_buf, char format) +{ + char buf1[30]; + static char priv_buf[INET6_ADDRSTRLEN * 2 + 80]; + char *buf; + char *cp; + in6_addr_t local, remote; + char local_addrbuf[INET6_ADDRSTRLEN]; + char remote_addrbuf[INET6_ADDRSTRLEN]; + conn_t *connp; + + if (sup_buf != NULL) + buf = sup_buf; + else + buf = priv_buf; + + if (tcp == NULL) + return ("NULL_TCP"); + + connp = tcp->tcp_connp; + switch (tcp->tcp_state) { + case TCPS_CLOSED: + cp = "TCP_CLOSED"; + break; + case TCPS_IDLE: + cp = "TCP_IDLE"; + break; + case TCPS_BOUND: + cp = "TCP_BOUND"; + break; + case TCPS_LISTEN: + cp = "TCP_LISTEN"; + break; + case TCPS_SYN_SENT: + cp = "TCP_SYN_SENT"; + break; + case TCPS_SYN_RCVD: + cp = "TCP_SYN_RCVD"; + break; + case TCPS_ESTABLISHED: + cp = "TCP_ESTABLISHED"; + break; + case TCPS_CLOSE_WAIT: + cp = "TCP_CLOSE_WAIT"; + break; + case TCPS_FIN_WAIT_1: + cp = "TCP_FIN_WAIT_1"; + break; + case TCPS_CLOSING: + cp = "TCP_CLOSING"; + break; + case TCPS_LAST_ACK: + cp = "TCP_LAST_ACK"; + break; + case TCPS_FIN_WAIT_2: + cp = "TCP_FIN_WAIT_2"; + break; + case TCPS_TIME_WAIT: + cp = "TCP_TIME_WAIT"; + break; + default: + (void) mi_sprintf(buf1, "TCPUnkState(%d)", tcp->tcp_state); + cp = buf1; + break; + } + switch (format) { + case DISP_ADDR_AND_PORT: + if (connp->conn_ipversion == IPV4_VERSION) { + /* + * Note that we use the remote address in the tcp_b + * structure. This means that it will print out + * the real destination address, not the next hop's + * address if source routing is used. + */ + IN6_IPADDR_TO_V4MAPPED(connp->conn_laddr_v4, &local); + IN6_IPADDR_TO_V4MAPPED(connp->conn_faddr_v4, &remote); + + } else { + local = connp->conn_laddr_v6; + remote = connp->conn_faddr_v6; + } + (void) inet_ntop(AF_INET6, &local, local_addrbuf, + sizeof (local_addrbuf)); + (void) inet_ntop(AF_INET6, &remote, remote_addrbuf, + sizeof (remote_addrbuf)); + (void) mi_sprintf(buf, "[%s.%u, %s.%u] %s", + local_addrbuf, ntohs(connp->conn_lport), remote_addrbuf, + ntohs(connp->conn_fport), cp); + break; + case DISP_PORT_ONLY: + default: + (void) mi_sprintf(buf, "[%u, %u] %s", + ntohs(connp->conn_lport), ntohs(connp->conn_fport), cp); + break; + } + + return (buf); +} diff --git a/usr/src/uts/common/inet/tcp/tcp_opt_data.c b/usr/src/uts/common/inet/tcp/tcp_opt_data.c index d15ff4ffcd..b85cf30ff4 100644 --- a/usr/src/uts/common/inet/tcp/tcp_opt_data.c +++ b/usr/src/uts/common/inet/tcp/tcp_opt_data.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -30,6 +30,7 @@ #include <sys/socket.h> #include <sys/xti_xtiopt.h> #include <sys/xti_inet.h> +#include <sys/policy.h> #include <inet/common.h> #include <netinet/ip6.h> @@ -38,7 +39,7 @@ #include <netinet/in.h> #include <netinet/tcp.h> #include <inet/optcom.h> - +#include <inet/proto_set.h> #include <inet/tcp_impl.h> /* @@ -251,3 +252,611 @@ optdb_obj_t tcp_opt_obj = { TCP_VALID_LEVELS_CNT, /* TCP valid level count of entries */ tcp_valid_levels_arr /* TCP valid level array */ }; + +/* Maximum TCP initial cwin (start/restart). */ +#define TCP_MAX_INIT_CWND 16 + +static int tcp_max_init_cwnd = TCP_MAX_INIT_CWND; + +/* + * Some TCP options can be "set" by requesting them in the option + * buffer. This is needed for XTI feature test though we do not + * allow it in general. We interpret that this mechanism is more + * applicable to OSI protocols and need not be allowed in general. 
+ * This routine filters out options for which it is not allowed (most) + * and lets through those (few) for which it is. [ The XTI interface + * test suite specifics will imply that any XTI_GENERIC level XTI_* if + * ever implemented will have to be allowed here ]. + */ +static boolean_t +tcp_allow_connopt_set(int level, int name) +{ + + switch (level) { + case IPPROTO_TCP: + switch (name) { + case TCP_NODELAY: + return (B_TRUE); + default: + return (B_FALSE); + } + /*NOTREACHED*/ + default: + return (B_FALSE); + } + /*NOTREACHED*/ +} + +/* + * This routine gets default values of certain options whose default + * values are maintained by protocol specific code + */ +/* ARGSUSED */ +int +tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr) +{ + int32_t *i1 = (int32_t *)ptr; + tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; + + switch (level) { + case IPPROTO_TCP: + switch (name) { + case TCP_NOTIFY_THRESHOLD: + *i1 = tcps->tcps_ip_notify_interval; + break; + case TCP_ABORT_THRESHOLD: + *i1 = tcps->tcps_ip_abort_interval; + break; + case TCP_CONN_NOTIFY_THRESHOLD: + *i1 = tcps->tcps_ip_notify_cinterval; + break; + case TCP_CONN_ABORT_THRESHOLD: + *i1 = tcps->tcps_ip_abort_cinterval; + break; + default: + return (-1); + } + break; + case IPPROTO_IP: + switch (name) { + case IP_TTL: + *i1 = tcps->tcps_ipv4_ttl; + break; + default: + return (-1); + } + break; + case IPPROTO_IPV6: + switch (name) { + case IPV6_UNICAST_HOPS: + *i1 = tcps->tcps_ipv6_hoplimit; + break; + default: + return (-1); + } + break; + default: + return (-1); + } + return (sizeof (int)); +} + +/* + * TCP routine to get the values of options. + */ +int +tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) +{ + int *i1 = (int *)ptr; + tcp_t *tcp = connp->conn_tcp; + conn_opt_arg_t coas; + int retval; + + coas.coa_connp = connp; + coas.coa_ixa = connp->conn_ixa; + coas.coa_ipp = &connp->conn_xmit_ipp; + coas.coa_ancillary = B_FALSE; + coas.coa_changed = 0; + + switch (level) { + case SOL_SOCKET: + switch (name) { + case SO_SND_COPYAVOID: + *i1 = tcp->tcp_snd_zcopy_on ? + SO_SND_COPYAVOID : 0; + return (sizeof (int)); + case SO_ACCEPTCONN: + *i1 = (tcp->tcp_state == TCPS_LISTEN); + return (sizeof (int)); + } + break; + case IPPROTO_TCP: + switch (name) { + case TCP_NODELAY: + *i1 = (tcp->tcp_naglim == 1) ? TCP_NODELAY : 0; + return (sizeof (int)); + case TCP_MAXSEG: + *i1 = tcp->tcp_mss; + return (sizeof (int)); + case TCP_NOTIFY_THRESHOLD: + *i1 = (int)tcp->tcp_first_timer_threshold; + return (sizeof (int)); + case TCP_ABORT_THRESHOLD: + *i1 = tcp->tcp_second_timer_threshold; + return (sizeof (int)); + case TCP_CONN_NOTIFY_THRESHOLD: + *i1 = tcp->tcp_first_ctimer_threshold; + return (sizeof (int)); + case TCP_CONN_ABORT_THRESHOLD: + *i1 = tcp->tcp_second_ctimer_threshold; + return (sizeof (int)); + case TCP_INIT_CWND: + *i1 = tcp->tcp_init_cwnd; + return (sizeof (int)); + case TCP_KEEPALIVE_THRESHOLD: + *i1 = tcp->tcp_ka_interval; + return (sizeof (int)); + case TCP_KEEPALIVE_ABORT_THRESHOLD: + *i1 = tcp->tcp_ka_abort_thres; + return (sizeof (int)); + case TCP_CORK: + *i1 = tcp->tcp_cork; + return (sizeof (int)); + } + break; + case IPPROTO_IP: + if (connp->conn_family != AF_INET) + return (-1); + switch (name) { + case IP_OPTIONS: + case T_IP_OPTIONS: + /* Caller ensures enough space */ + return (ip_opt_get_user(connp, ptr)); + default: + break; + } + break; + + case IPPROTO_IPV6: + /* + * IPPROTO_IPV6 options are only supported for sockets + * that are using IPv6 on the wire. 
+ */ + if (connp->conn_ipversion != IPV6_VERSION) { + return (-1); + } + switch (name) { + case IPV6_PATHMTU: + if (tcp->tcp_state < TCPS_ESTABLISHED) + return (-1); + break; + } + break; + } + mutex_enter(&connp->conn_lock); + retval = conn_opt_get(&coas, level, name, ptr); + mutex_exit(&connp->conn_lock); + return (retval); +} + +/* + * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements. + * Parameters are assumed to be verified by the caller. + */ +/* ARGSUSED */ +int +tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, + uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, + void *thisdg_attrs, cred_t *cr) +{ + tcp_t *tcp = connp->conn_tcp; + int *i1 = (int *)invalp; + boolean_t onoff = (*i1 == 0) ? 0 : 1; + boolean_t checkonly; + int reterr; + tcp_stack_t *tcps = tcp->tcp_tcps; + conn_opt_arg_t coas; + + coas.coa_connp = connp; + coas.coa_ixa = connp->conn_ixa; + coas.coa_ipp = &connp->conn_xmit_ipp; + coas.coa_ancillary = B_FALSE; + coas.coa_changed = 0; + + switch (optset_context) { + case SETFN_OPTCOM_CHECKONLY: + checkonly = B_TRUE; + /* + * Note: Implies T_CHECK semantics for T_OPTCOM_REQ + * inlen != 0 implies value supplied and + * we have to "pretend" to set it. + * inlen == 0 implies that there is no + * value part in T_CHECK request and just validation + * done elsewhere should be enough, we just return here. + */ + if (inlen == 0) { + *outlenp = 0; + return (0); + } + break; + case SETFN_OPTCOM_NEGOTIATE: + checkonly = B_FALSE; + break; + case SETFN_UD_NEGOTIATE: /* error on conn-oriented transports ? */ + case SETFN_CONN_NEGOTIATE: + checkonly = B_FALSE; + /* + * Negotiating local and "association-related" options + * from other (T_CONN_REQ, T_CONN_RES,T_UNITDATA_REQ) + * primitives is allowed by XTI, but we choose + * to not implement this style negotiation for Internet + * protocols (We interpret it is a must for OSI world but + * optional for Internet protocols) for all options. + * [ Will do only for the few options that enable test + * suites that our XTI implementation of this feature + * works for transports that do allow it ] + */ + if (!tcp_allow_connopt_set(level, name)) { + *outlenp = 0; + return (EINVAL); + } + break; + default: + /* + * We should never get here + */ + *outlenp = 0; + return (EINVAL); + } + + ASSERT((optset_context != SETFN_OPTCOM_CHECKONLY) || + (optset_context == SETFN_OPTCOM_CHECKONLY && inlen != 0)); + + /* + * For TCP, we should have no ancillary data sent down + * (sendmsg isn't supported for SOCK_STREAM), so thisdg_attrs + * has to be zero. + */ + ASSERT(thisdg_attrs == NULL); + + /* + * For fixed length options, no sanity check + * of passed in length is done. It is assumed *_optcom_req() + * routines do the right thing. 
+ */ + switch (level) { + case SOL_SOCKET: + switch (name) { + case SO_KEEPALIVE: + if (checkonly) { + /* check only case */ + break; + } + + if (!onoff) { + if (connp->conn_keepalive) { + if (tcp->tcp_ka_tid != 0) { + (void) TCP_TIMER_CANCEL(tcp, + tcp->tcp_ka_tid); + tcp->tcp_ka_tid = 0; + } + connp->conn_keepalive = 0; + } + break; + } + if (!connp->conn_keepalive) { + /* Crank up the keepalive timer */ + tcp->tcp_ka_last_intrvl = 0; + tcp->tcp_ka_tid = TCP_TIMER(tcp, + tcp_keepalive_timer, + MSEC_TO_TICK(tcp->tcp_ka_interval)); + connp->conn_keepalive = 1; + } + break; + case SO_SNDBUF: { + if (*i1 > tcps->tcps_max_buf) { + *outlenp = 0; + return (ENOBUFS); + } + if (checkonly) + break; + + connp->conn_sndbuf = *i1; + if (tcps->tcps_snd_lowat_fraction != 0) { + connp->conn_sndlowat = connp->conn_sndbuf / + tcps->tcps_snd_lowat_fraction; + } + (void) tcp_maxpsz_set(tcp, B_TRUE); + /* + * If we are flow-controlled, recheck the condition. + * There are apps that increase SO_SNDBUF size when + * flow-controlled (EWOULDBLOCK), and expect the flow + * control condition to be lifted right away. + */ + mutex_enter(&tcp->tcp_non_sq_lock); + if (tcp->tcp_flow_stopped && + TCP_UNSENT_BYTES(tcp) < connp->conn_sndbuf) { + tcp_clrqfull(tcp); + } + mutex_exit(&tcp->tcp_non_sq_lock); + *outlenp = inlen; + return (0); + } + case SO_RCVBUF: + if (*i1 > tcps->tcps_max_buf) { + *outlenp = 0; + return (ENOBUFS); + } + /* Silently ignore zero */ + if (!checkonly && *i1 != 0) { + *i1 = MSS_ROUNDUP(*i1, tcp->tcp_mss); + (void) tcp_rwnd_set(tcp, *i1); + } + /* + * XXX should we return the rwnd here + * and tcp_opt_get ? + */ + *outlenp = inlen; + return (0); + case SO_SND_COPYAVOID: + if (!checkonly) { + if (tcp->tcp_loopback || + (tcp->tcp_kssl_ctx != NULL) || + (onoff != 1) || !tcp_zcopy_check(tcp)) { + *outlenp = 0; + return (EOPNOTSUPP); + } + tcp->tcp_snd_zcopy_aware = 1; + } + *outlenp = inlen; + return (0); + } + break; + case IPPROTO_TCP: + switch (name) { + case TCP_NODELAY: + if (!checkonly) + tcp->tcp_naglim = *i1 ? 1 : tcp->tcp_mss; + break; + case TCP_NOTIFY_THRESHOLD: + if (!checkonly) + tcp->tcp_first_timer_threshold = *i1; + break; + case TCP_ABORT_THRESHOLD: + if (!checkonly) + tcp->tcp_second_timer_threshold = *i1; + break; + case TCP_CONN_NOTIFY_THRESHOLD: + if (!checkonly) + tcp->tcp_first_ctimer_threshold = *i1; + break; + case TCP_CONN_ABORT_THRESHOLD: + if (!checkonly) + tcp->tcp_second_ctimer_threshold = *i1; + break; + case TCP_RECVDSTADDR: + if (tcp->tcp_state > TCPS_LISTEN) { + *outlenp = 0; + return (EOPNOTSUPP); + } + /* Setting done in conn_opt_set */ + break; + case TCP_INIT_CWND: { + uint32_t init_cwnd = *((uint32_t *)invalp); + + if (checkonly) + break; + + /* + * Only allow socket with network configuration + * privilege to set the initial cwnd to be larger + * than allowed by RFC 3390. + */ + if (init_cwnd <= MIN(4, MAX(2, 4380 / tcp->tcp_mss))) { + tcp->tcp_init_cwnd = init_cwnd; + break; + } + if ((reterr = secpolicy_ip_config(cr, B_TRUE)) != 0) { + *outlenp = 0; + return (reterr); + } + if (init_cwnd > tcp_max_init_cwnd) { + *outlenp = 0; + return (EINVAL); + } + tcp->tcp_init_cwnd = init_cwnd; + break; + } + case TCP_KEEPALIVE_THRESHOLD: + if (checkonly) + break; + + if (*i1 < tcps->tcps_keepalive_interval_low || + *i1 > tcps->tcps_keepalive_interval_high) { + *outlenp = 0; + return (EINVAL); + } + if (*i1 != tcp->tcp_ka_interval) { + tcp->tcp_ka_interval = *i1; + /* + * Check if we need to restart the + * keepalive timer. 
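The TCP_INIT_CWND handling above lets an unprivileged caller raise the initial cwnd only up to the RFC 3390 value for the connection's MSS; anything larger requires network configuration privilege and is further capped (16 segments by default with this change). A rough user-level sketch, assuming TCP_INIT_CWND is visible through <netinet/tcp.h>:

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>

#define MIN(a, b)       ((a) < (b) ? (a) : (b))
#define MAX(a, b)       ((a) > (b) ? (a) : (b))

int
main(void)
{
        int fd = socket(AF_INET, SOCK_STREAM, 0);
        uint32_t mss = 1460;
        /* RFC 3390 initial window in segments: 3 for a 1460-byte MSS. */
        uint32_t cwnd = MIN(4, MAX(2, 4380 / mss));

        if (fd < 0)
                return (1);
        if (setsockopt(fd, IPPROTO_TCP, TCP_INIT_CWND, &cwnd,
            sizeof (cwnd)) != 0)
                perror("TCP_INIT_CWND");
        printf("requested initial cwnd: %u segments\n", cwnd);
        return (0);
}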
+ */ + if (tcp->tcp_ka_tid != 0) { + ASSERT(connp->conn_keepalive); + (void) TCP_TIMER_CANCEL(tcp, + tcp->tcp_ka_tid); + tcp->tcp_ka_last_intrvl = 0; + tcp->tcp_ka_tid = TCP_TIMER(tcp, + tcp_keepalive_timer, + MSEC_TO_TICK(tcp->tcp_ka_interval)); + } + } + break; + case TCP_KEEPALIVE_ABORT_THRESHOLD: + if (!checkonly) { + if (*i1 < + tcps->tcps_keepalive_abort_interval_low || + *i1 > + tcps->tcps_keepalive_abort_interval_high) { + *outlenp = 0; + return (EINVAL); + } + tcp->tcp_ka_abort_thres = *i1; + } + break; + case TCP_CORK: + if (!checkonly) { + /* + * if tcp->tcp_cork was set and is now + * being unset, we have to make sure that + * the remaining data gets sent out. Also + * unset tcp->tcp_cork so that tcp_wput_data() + * can send data even if it is less than mss + */ + if (tcp->tcp_cork && onoff == 0 && + tcp->tcp_unsent > 0) { + tcp->tcp_cork = B_FALSE; + tcp_wput_data(tcp, NULL, B_FALSE); + } + tcp->tcp_cork = onoff; + } + break; + default: + break; + } + break; + case IPPROTO_IP: + if (connp->conn_family != AF_INET) { + *outlenp = 0; + return (EINVAL); + } + switch (name) { + case IP_SEC_OPT: + /* + * We should not allow policy setting after + * we start listening for connections. + */ + if (tcp->tcp_state == TCPS_LISTEN) { + return (EINVAL); + } + break; + } + break; + case IPPROTO_IPV6: + /* + * IPPROTO_IPV6 options are only supported for sockets + * that are using IPv6 on the wire. + */ + if (connp->conn_ipversion != IPV6_VERSION) { + *outlenp = 0; + return (EINVAL); + } + + switch (name) { + case IPV6_RECVPKTINFO: + if (!checkonly) { + /* Force it to be sent up with the next msg */ + tcp->tcp_recvifindex = 0; + } + break; + case IPV6_RECVTCLASS: + if (!checkonly) { + /* Force it to be sent up with the next msg */ + tcp->tcp_recvtclass = 0xffffffffU; + } + break; + case IPV6_RECVHOPLIMIT: + if (!checkonly) { + /* Force it to be sent up with the next msg */ + tcp->tcp_recvhops = 0xffffffffU; + } + break; + case IPV6_PKTINFO: + /* This is an extra check for TCP */ + if (inlen == sizeof (struct in6_pktinfo)) { + struct in6_pktinfo *pkti; + + pkti = (struct in6_pktinfo *)invalp; + /* + * RFC 3542 states that ipi6_addr must be + * the unspecified address when setting the + * IPV6_PKTINFO sticky socket option on a + * TCP socket. + */ + if (!IN6_IS_ADDR_UNSPECIFIED(&pkti->ipi6_addr)) + return (EINVAL); + } + break; + case IPV6_SEC_OPT: + /* + * We should not allow policy setting after + * we start listening for connections. + */ + if (tcp->tcp_state == TCPS_LISTEN) { + return (EINVAL); + } + break; + } + break; + } + reterr = conn_opt_set(&coas, level, name, inlen, invalp, + checkonly, cr); + if (reterr != 0) { + *outlenp = 0; + return (reterr); + } + + /* + * Common case of OK return with outval same as inval + */ + if (invalp != outvalp) { + /* don't trust bcopy for identical src/dst */ + (void) bcopy(invalp, outvalp, inlen); + } + *outlenp = inlen; + + if (coas.coa_changed & COA_HEADER_CHANGED) { + /* If we are connected we rebuilt the headers */ + if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) && + !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { + reterr = tcp_build_hdrs(tcp); + if (reterr != 0) + return (reterr); + } + } + if (coas.coa_changed & COA_ROUTE_CHANGED) { + in6_addr_t nexthop; + + /* + * If we are connected we re-cache the information. + * We ignore errors to preserve BSD behavior. + * Note that we don't redo IPsec policy lookup here + * since the final destination (or source) didn't change. 
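The TCP_CORK behavior implemented above, where clearing the option flushes whatever sub-MSS data is still queued, looks like this from user space. This is a sketch only; it assumes an already-connected stream socket fd and that TCP_CORK is exposed through <netinet/tcp.h>.

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <unistd.h>
#include <string.h>

static void
send_corked(int fd, const char *hdr, const char *body)
{
        int on = 1, off = 0;

        (void) setsockopt(fd, IPPROTO_TCP, TCP_CORK, &on, sizeof (on));
        (void) write(fd, hdr, strlen(hdr));     /* held back while < MSS */
        (void) write(fd, body, strlen(body));
        /* Clearing the option pushes out the remaining partial segment. */
        (void) setsockopt(fd, IPPROTO_TCP, TCP_CORK, &off, sizeof (off));
}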
+ */ + ip_attr_nexthop(&connp->conn_xmit_ipp, connp->conn_ixa, + &connp->conn_faddr_v6, &nexthop); + + if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6) && + !IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { + (void) ip_attr_connect(connp, connp->conn_ixa, + &connp->conn_laddr_v6, &connp->conn_faddr_v6, + &nexthop, connp->conn_fport, NULL, NULL, + IPDF_VERIFY_DST); + } + } + if ((coas.coa_changed & COA_SNDBUF_CHANGED) && !IPCL_IS_NONSTR(connp)) { + connp->conn_wq->q_hiwat = connp->conn_sndbuf; + } + if (coas.coa_changed & COA_WROFF_CHANGED) { + connp->conn_wroff = connp->conn_ht_iphc_allocated + + tcps->tcps_wroff_xtra; + (void) proto_set_tx_wroff(connp->conn_rq, connp, + connp->conn_wroff); + } + if (coas.coa_changed & COA_OOBINLINE_CHANGED) { + if (IPCL_IS_NONSTR(connp)) + proto_set_rx_oob_opt(connp, onoff); + } + return (0); +} diff --git a/usr/src/uts/common/inet/tcp/tcp_output.c b/usr/src/uts/common/inet/tcp/tcp_output.c new file mode 100644 index 0000000000..01b383bb34 --- /dev/null +++ b/usr/src/uts/common/inet/tcp/tcp_output.c @@ -0,0 +1,3612 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* This file contains all TCP output processing functions. */ + +#include <sys/types.h> +#include <sys/stream.h> +#include <sys/strsun.h> +#include <sys/strsubr.h> +#include <sys/stropts.h> +#include <sys/strlog.h> +#define _SUN_TPI_VERSION 2 +#include <sys/tihdr.h> +#include <sys/suntpi.h> +#include <sys/xti_inet.h> +#include <sys/timod.h> +#include <sys/pattr.h> +#include <sys/squeue_impl.h> +#include <sys/squeue.h> +#include <sys/sockio.h> +#include <sys/tsol/tnet.h> + +#include <inet/common.h> +#include <inet/ip.h> +#include <inet/tcp.h> +#include <inet/tcp_impl.h> +#include <inet/snmpcom.h> +#include <inet/proto_set.h> +#include <inet/ipsec_impl.h> +#include <inet/ip_ndp.h> + +static mblk_t *tcp_get_seg_mp(tcp_t *, uint32_t, int32_t *); +static void tcp_wput_cmdblk(queue_t *, mblk_t *); +static void tcp_wput_flush(tcp_t *, mblk_t *); +static void tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp); +static int tcp_xmit_end(tcp_t *); +static int tcp_send(tcp_t *, const int, const int, const int, + const int, int *, uint_t *, int *, mblk_t **, mblk_t *); +static void tcp_xmit_early_reset(char *, mblk_t *, uint32_t, uint32_t, + int, ip_recv_attr_t *, ip_stack_t *, conn_t *); +static boolean_t tcp_send_rst_chk(tcp_stack_t *); +static void tcp_process_shrunk_swnd(tcp_t *, uint32_t); +static void tcp_fill_header(tcp_t *, uchar_t *, clock_t, int); + +/* + * Functions called directly via squeue having a prototype of edesc_t. 
+ */ +static void tcp_wput_nondata(void *, mblk_t *, void *, ip_recv_attr_t *); +static void tcp_wput_ioctl(void *, mblk_t *, void *, ip_recv_attr_t *); +static void tcp_wput_proto(void *, mblk_t *, void *, ip_recv_attr_t *); + +/* + * This controls how tiny a write must be before we try to copy it + * into the mblk on the tail of the transmit queue. Not much + * speedup is observed for values larger than sixteen. Zero will + * disable the optimisation. + */ +static int tcp_tx_pull_len = 16; + +void +tcp_wput(queue_t *q, mblk_t *mp) +{ + conn_t *connp = Q_TO_CONN(q); + tcp_t *tcp; + void (*output_proc)(); + t_scalar_t type; + uchar_t *rptr; + struct iocblk *iocp; + size_t size; + tcp_stack_t *tcps = Q_TO_TCP(q)->tcp_tcps; + + ASSERT(connp->conn_ref >= 2); + + switch (DB_TYPE(mp)) { + case M_DATA: + tcp = connp->conn_tcp; + ASSERT(tcp != NULL); + + size = msgdsize(mp); + + mutex_enter(&tcp->tcp_non_sq_lock); + tcp->tcp_squeue_bytes += size; + if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) { + tcp_setqfull(tcp); + } + mutex_exit(&tcp->tcp_non_sq_lock); + + CONN_INC_REF(connp); + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output, connp, + NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT); + return; + + case M_CMD: + tcp_wput_cmdblk(q, mp); + return; + + case M_PROTO: + case M_PCPROTO: + /* + * if it is a snmp message, don't get behind the squeue + */ + tcp = connp->conn_tcp; + rptr = mp->b_rptr; + if ((mp->b_wptr - rptr) >= sizeof (t_scalar_t)) { + type = ((union T_primitives *)rptr)->type; + } else { + if (connp->conn_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, + SL_ERROR|SL_TRACE, + "tcp_wput_proto, dropping one..."); + } + freemsg(mp); + return; + } + if (type == T_SVR4_OPTMGMT_REQ) { + /* + * All Solaris components should pass a db_credp + * for this TPI message, hence we ASSERT. + * But in case there is some other M_PROTO that looks + * like a TPI message sent by some other kernel + * component, we check and return an error. + */ + cred_t *cr = msg_getcred(mp, NULL); + + ASSERT(cr != NULL); + if (cr == NULL) { + tcp_err_ack(tcp, mp, TSYSERR, EINVAL); + return; + } + if (snmpcom_req(q, mp, tcp_snmp_set, ip_snmp_get, + cr)) { + /* + * This was a SNMP request + */ + return; + } else { + output_proc = tcp_wput_proto; + } + } else { + output_proc = tcp_wput_proto; + } + break; + case M_IOCTL: + /* + * Most ioctls can be processed right away without going via + * squeues - process them right here. Those that do require + * squeue (currently _SIOCSOCKFALLBACK) + * are processed by tcp_wput_ioctl(). + */ + iocp = (struct iocblk *)mp->b_rptr; + tcp = connp->conn_tcp; + + switch (iocp->ioc_cmd) { + case TCP_IOC_ABORT_CONN: + tcp_ioctl_abort_conn(q, mp); + return; + case TI_GETPEERNAME: + case TI_GETMYNAME: + mi_copyin(q, mp, NULL, + SIZEOF_STRUCT(strbuf, iocp->ioc_flag)); + return; + case ND_SET: + /* nd_getset does the necessary checks */ + case ND_GET: + if (nd_getset(q, tcps->tcps_g_nd, mp)) { + qreply(q, mp); + return; + } + CONN_INC_IOCTLREF(connp); + ip_wput_nondata(q, mp); + CONN_DEC_IOCTLREF(connp); + return; + + default: + output_proc = tcp_wput_ioctl; + break; + } + break; + default: + output_proc = tcp_wput_nondata; + break; + } + + CONN_INC_REF(connp); + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, output_proc, connp, + NULL, tcp_squeue_flag, SQTAG_TCP_WPUT_OTHER); +} + +/* + * The TCP normal data output path. + * NOTE: the logic of the fast path is duplicated from this function. 
+ */ +void +tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent) +{ + int len; + mblk_t *local_time; + mblk_t *mp1; + uint32_t snxt; + int tail_unsent; + int tcpstate; + int usable = 0; + mblk_t *xmit_tail; + int32_t mss; + int32_t num_sack_blk = 0; + int32_t total_hdr_len; + int32_t tcp_hdr_len; + int rc; + tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; + clock_t now = LBOLT_FASTPATH; + + tcpstate = tcp->tcp_state; + if (mp == NULL) { + /* + * tcp_wput_data() with NULL mp should only be called when + * there is unsent data. + */ + ASSERT(tcp->tcp_unsent > 0); + /* Really tacky... but we need this for detached closes. */ + len = tcp->tcp_unsent; + goto data_null; + } + + ASSERT(mp->b_datap->db_type == M_DATA); + /* + * Don't allow data after T_ORDREL_REQ or T_DISCON_REQ, + * or before a connection attempt has begun. + */ + if (tcpstate < TCPS_SYN_SENT || tcpstate > TCPS_CLOSE_WAIT || + (tcp->tcp_valid_bits & TCP_FSS_VALID) != 0) { + if ((tcp->tcp_valid_bits & TCP_FSS_VALID) != 0) { +#ifdef DEBUG + cmn_err(CE_WARN, + "tcp_wput_data: data after ordrel, %s", + tcp_display(tcp, NULL, + DISP_ADDR_AND_PORT)); +#else + if (connp->conn_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, + SL_TRACE|SL_ERROR, + "tcp_wput_data: data after ordrel, %s\n", + tcp_display(tcp, NULL, + DISP_ADDR_AND_PORT)); + } +#endif /* DEBUG */ + } + if (tcp->tcp_snd_zcopy_aware && + (mp->b_datap->db_struioflag & STRUIO_ZCNOTIFY)) + tcp_zcopy_notify(tcp); + freemsg(mp); + mutex_enter(&tcp->tcp_non_sq_lock); + if (tcp->tcp_flow_stopped && + TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) { + tcp_clrqfull(tcp); + } + mutex_exit(&tcp->tcp_non_sq_lock); + return; + } + + /* Strip empties */ + for (;;) { + ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= + (uintptr_t)INT_MAX); + len = (int)(mp->b_wptr - mp->b_rptr); + if (len > 0) + break; + mp1 = mp; + mp = mp->b_cont; + freeb(mp1); + if (mp == NULL) { + return; + } + } + + /* If we are the first on the list ... */ + if (tcp->tcp_xmit_head == NULL) { + tcp->tcp_xmit_head = mp; + tcp->tcp_xmit_tail = mp; + tcp->tcp_xmit_tail_unsent = len; + } else { + /* If tiny tx and room in txq tail, pullup to save mblks. */ + struct datab *dp; + + mp1 = tcp->tcp_xmit_last; + if (len < tcp_tx_pull_len && + (dp = mp1->b_datap)->db_ref == 1 && + dp->db_lim - mp1->b_wptr >= len) { + ASSERT(len > 0); + ASSERT(!mp1->b_cont); + if (len == 1) { + *mp1->b_wptr++ = *mp->b_rptr; + } else { + bcopy(mp->b_rptr, mp1->b_wptr, len); + mp1->b_wptr += len; + } + if (mp1 == tcp->tcp_xmit_tail) + tcp->tcp_xmit_tail_unsent += len; + mp1->b_cont = mp->b_cont; + if (tcp->tcp_snd_zcopy_aware && + (mp->b_datap->db_struioflag & STRUIO_ZCNOTIFY)) + mp1->b_datap->db_struioflag |= STRUIO_ZCNOTIFY; + freeb(mp); + mp = mp1; + } else { + tcp->tcp_xmit_last->b_cont = mp; + } + len += tcp->tcp_unsent; + } + + /* Tack on however many more positive length mblks we have */ + if ((mp1 = mp->b_cont) != NULL) { + do { + int tlen; + ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <= + (uintptr_t)INT_MAX); + tlen = (int)(mp1->b_wptr - mp1->b_rptr); + if (tlen <= 0) { + mp->b_cont = mp1->b_cont; + freeb(mp1); + } else { + len += tlen; + mp = mp1; + } + } while ((mp1 = mp->b_cont) != NULL); + } + tcp->tcp_xmit_last = mp; + tcp->tcp_unsent = len; + + if (urgent) + usable = 1; + +data_null: + snxt = tcp->tcp_snxt; + xmit_tail = tcp->tcp_xmit_tail; + tail_unsent = tcp->tcp_xmit_tail_unsent; + + /* + * Note that tcp_mss has been adjusted to take into account the + * timestamp option if applicable. 
Because SACK options do not + * appear in every TCP segments and they are of variable lengths, + * they cannot be included in tcp_mss. Thus we need to calculate + * the actual segment length when we need to send a segment which + * includes SACK options. + */ + if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { + int32_t opt_len; + + num_sack_blk = MIN(tcp->tcp_max_sack_blk, + tcp->tcp_num_sack_blk); + opt_len = num_sack_blk * sizeof (sack_blk_t) + TCPOPT_NOP_LEN * + 2 + TCPOPT_HEADER_LEN; + mss = tcp->tcp_mss - opt_len; + total_hdr_len = connp->conn_ht_iphc_len + opt_len; + tcp_hdr_len = connp->conn_ht_ulp_len + opt_len; + } else { + mss = tcp->tcp_mss; + total_hdr_len = connp->conn_ht_iphc_len; + tcp_hdr_len = connp->conn_ht_ulp_len; + } + + if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet && + (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) { + TCP_SET_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle); + } + if (tcpstate == TCPS_SYN_RCVD) { + /* + * The three-way connection establishment handshake is not + * complete yet. We want to queue the data for transmission + * after entering ESTABLISHED state (RFC793). A jump to + * "done" label effectively leaves data on the queue. + */ + goto done; + } else { + int usable_r; + + /* + * In the special case when cwnd is zero, which can only + * happen if the connection is ECN capable, return now. + * New segments is sent using tcp_timer(). The timer + * is set in tcp_input_data(). + */ + if (tcp->tcp_cwnd == 0) { + /* + * Note that tcp_cwnd is 0 before 3-way handshake is + * finished. + */ + ASSERT(tcp->tcp_ecn_ok || + tcp->tcp_state < TCPS_ESTABLISHED); + return; + } + + /* NOTE: trouble if xmitting while SYN not acked? */ + usable_r = snxt - tcp->tcp_suna; + usable_r = tcp->tcp_swnd - usable_r; + + /* + * Check if the receiver has shrunk the window. If + * tcp_wput_data() with NULL mp is called, tcp_fin_sent + * cannot be set as there is unsent data, so FIN cannot + * be sent out. Otherwise, we need to take into account + * of FIN as it consumes an "invisible" sequence number. + */ + ASSERT(tcp->tcp_fin_sent == 0); + if (usable_r < 0) { + /* + * The receiver has shrunk the window and we have sent + * -usable_r date beyond the window, re-adjust. + * + * If TCP window scaling is enabled, there can be + * round down error as the advertised receive window + * is actually right shifted n bits. This means that + * the lower n bits info is wiped out. It will look + * like the window is shrunk. Do a check here to + * see if the shrunk amount is actually within the + * error in window calculation. If it is, just + * return. Note that this check is inside the + * shrunk window check. This makes sure that even + * though tcp_process_shrunk_swnd() is not called, + * we will stop further processing. + */ + if ((-usable_r >> tcp->tcp_snd_ws) > 0) { + tcp_process_shrunk_swnd(tcp, -usable_r); + } + return; + } + + /* usable = MIN(swnd, cwnd) - unacked_bytes */ + if (tcp->tcp_swnd > tcp->tcp_cwnd) + usable_r -= tcp->tcp_swnd - tcp->tcp_cwnd; + + /* usable = MIN(usable, unsent) */ + if (usable_r > len) + usable_r = len; + + /* usable = MAX(usable, {1 for urgent, 0 for data}) */ + if (usable_r > 0) { + usable = usable_r; + } else { + /* Bypass all other unnecessary processing. */ + goto done; + } + } + + local_time = (mblk_t *)now; + + /* + * "Our" Nagle Algorithm. This is not the same as in the old + * BSD. This is more in line with the true intent of Nagle. + * + * The conditions are: + * 1. 
The amount of unsent data (or amount of data which can be + * sent, whichever is smaller) is less than Nagle limit. + * 2. The last sent size is also less than Nagle limit. + * 3. There is unack'ed data. + * 4. Urgent pointer is not set. Send urgent data ignoring the + * Nagle algorithm. This reduces the probability that urgent + * bytes get "merged" together. + * 5. The app has not closed the connection. This eliminates the + * wait time of the receiving side waiting for the last piece of + * (small) data. + * + * If all are satisified, exit without sending anything. Note + * that Nagle limit can be smaller than 1 MSS. Nagle limit is + * the smaller of 1 MSS and global tcp_naglim_def (default to be + * 4095). + */ + if (usable < (int)tcp->tcp_naglim && + tcp->tcp_naglim > tcp->tcp_last_sent_len && + snxt != tcp->tcp_suna && + !(tcp->tcp_valid_bits & TCP_URG_VALID) && + !(tcp->tcp_valid_bits & TCP_FSS_VALID)) { + goto done; + } + + /* + * If tcp_zero_win_probe is not set and the tcp->tcp_cork option + * is set, then we have to force TCP not to send partial segment + * (smaller than MSS bytes). We are calculating the usable now + * based on full mss and will save the rest of remaining data for + * later. When tcp_zero_win_probe is set, TCP needs to send out + * something to do zero window probe. + */ + if (tcp->tcp_cork && !tcp->tcp_zero_win_probe) { + if (usable < mss) + goto done; + usable = (usable / mss) * mss; + } + + /* Update the latest receive window size in TCP header. */ + tcp->tcp_tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); + + /* Send the packet. */ + rc = tcp_send(tcp, mss, total_hdr_len, tcp_hdr_len, + num_sack_blk, &usable, &snxt, &tail_unsent, &xmit_tail, + local_time); + + /* Pretend that all we were trying to send really got sent */ + if (rc < 0 && tail_unsent < 0) { + do { + xmit_tail = xmit_tail->b_cont; + xmit_tail->b_prev = local_time; + ASSERT((uintptr_t)(xmit_tail->b_wptr - + xmit_tail->b_rptr) <= (uintptr_t)INT_MAX); + tail_unsent += (int)(xmit_tail->b_wptr - + xmit_tail->b_rptr); + } while (tail_unsent < 0); + } +done:; + tcp->tcp_xmit_tail = xmit_tail; + tcp->tcp_xmit_tail_unsent = tail_unsent; + len = tcp->tcp_snxt - snxt; + if (len) { + /* + * If new data was sent, need to update the notsack + * list, which is, afterall, data blocks that have + * not been sack'ed by the receiver. New data is + * not sack'ed. + */ + if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) { + /* len is a negative value. */ + tcp->tcp_pipe -= len; + tcp_notsack_update(&(tcp->tcp_notsack_list), + tcp->tcp_snxt, snxt, + &(tcp->tcp_num_notsack_blk), + &(tcp->tcp_cnt_notsack_list)); + } + tcp->tcp_snxt = snxt + tcp->tcp_fin_sent; + tcp->tcp_rack = tcp->tcp_rnxt; + tcp->tcp_rack_cnt = 0; + if ((snxt + len) == tcp->tcp_suna) { + TCP_TIMER_RESTART(tcp, tcp->tcp_rto); + } + } else if (snxt == tcp->tcp_suna && tcp->tcp_swnd == 0) { + /* + * Didn't send anything. Make sure the timer is running + * so that we will probe a zero window. + */ + TCP_TIMER_RESTART(tcp, tcp->tcp_rto); + } + /* Note that len is the amount we just sent but with a negative sign */ + tcp->tcp_unsent += len; + mutex_enter(&tcp->tcp_non_sq_lock); + if (tcp->tcp_flow_stopped) { + if (TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) { + tcp_clrqfull(tcp); + } + } else if (TCP_UNSENT_BYTES(tcp) >= connp->conn_sndbuf) { + if (!(tcp->tcp_detached)) + tcp_setqfull(tcp); + } + mutex_exit(&tcp->tcp_non_sq_lock); +} + +/* + * Initial STREAMS write side put() procedure for sockets. 
It tries to + * handle the T_CAPABILITY_REQ which sockfs sends down while setting + * up the socket without using the squeue. Non T_CAPABILITY_REQ messages + * are handled by tcp_wput() as usual. + * + * All further messages will also be handled by tcp_wput() because we cannot + * be sure that the above short cut is safe later. + */ +void +tcp_wput_sock(queue_t *wq, mblk_t *mp) +{ + conn_t *connp = Q_TO_CONN(wq); + tcp_t *tcp = connp->conn_tcp; + struct T_capability_req *car = (struct T_capability_req *)mp->b_rptr; + + ASSERT(wq->q_qinfo == &tcp_sock_winit); + wq->q_qinfo = &tcp_winit; + + ASSERT(IPCL_IS_TCP(connp)); + ASSERT(TCP_IS_SOCKET(tcp)); + + if (DB_TYPE(mp) == M_PCPROTO && + MBLKL(mp) == sizeof (struct T_capability_req) && + car->PRIM_type == T_CAPABILITY_REQ) { + tcp_capability_req(tcp, mp); + return; + } + + tcp_wput(wq, mp); +} + +/* ARGSUSED */ +void +tcp_wput_fallback(queue_t *wq, mblk_t *mp) +{ +#ifdef DEBUG + cmn_err(CE_CONT, "tcp_wput_fallback: Message during fallback \n"); +#endif + freemsg(mp); +} + +/* + * Call by tcp_wput() to handle misc non M_DATA messages. + */ +/* ARGSUSED */ +static void +tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) +{ + conn_t *connp = (conn_t *)arg; + tcp_t *tcp = connp->conn_tcp; + + ASSERT(DB_TYPE(mp) != M_IOCTL); + /* + * TCP is D_MP and qprocsoff() is done towards the end of the tcp_close. + * Once the close starts, streamhead and sockfs will not let any data + * packets come down (close ensures that there are no threads using the + * queue and no new threads will come down) but since qprocsoff() + * hasn't happened yet, a M_FLUSH or some non data message might + * get reflected back (in response to our own FLUSHRW) and get + * processed after tcp_close() is done. The conn would still be valid + * because a ref would have added but we need to check the state + * before actually processing the packet. + */ + if (TCP_IS_DETACHED(tcp) || (tcp->tcp_state == TCPS_CLOSED)) { + freemsg(mp); + return; + } + + switch (DB_TYPE(mp)) { + case M_IOCDATA: + tcp_wput_iocdata(tcp, mp); + break; + case M_FLUSH: + tcp_wput_flush(tcp, mp); + break; + default: + ip_wput_nondata(connp->conn_wq, mp); + break; + } +} + +/* tcp_wput_flush is called by tcp_wput_nondata to handle M_FLUSH messages. */ +static void +tcp_wput_flush(tcp_t *tcp, mblk_t *mp) +{ + uchar_t fval = *mp->b_rptr; + mblk_t *tail; + conn_t *connp = tcp->tcp_connp; + queue_t *q = connp->conn_wq; + + /* TODO: How should flush interact with urgent data? */ + if ((fval & FLUSHW) && tcp->tcp_xmit_head != NULL && + !(tcp->tcp_valid_bits & TCP_URG_VALID)) { + /* + * Flush only data that has not yet been put on the wire. If + * we flush data that we have already transmitted, life, as we + * know it, may come to an end. + */ + tail = tcp->tcp_xmit_tail; + tail->b_wptr -= tcp->tcp_xmit_tail_unsent; + tcp->tcp_xmit_tail_unsent = 0; + tcp->tcp_unsent = 0; + if (tail->b_wptr != tail->b_rptr) + tail = tail->b_cont; + if (tail) { + mblk_t **excess = &tcp->tcp_xmit_head; + for (;;) { + mblk_t *mp1 = *excess; + if (mp1 == tail) + break; + tcp->tcp_xmit_tail = mp1; + tcp->tcp_xmit_last = mp1; + excess = &mp1->b_cont; + } + *excess = NULL; + tcp_close_mpp(&tail); + if (tcp->tcp_snd_zcopy_aware) + tcp_zcopy_notify(tcp); + } + /* + * We have no unsent data, so unsent must be less than + * conn_sndlowat, so re-enable flow. 
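+ * (tcp_unsent and tcp_xmit_tail_unsent were both zeroed just above,
+ * hence the unconditional tcp_clrqfull() here.)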
+ */ + mutex_enter(&tcp->tcp_non_sq_lock); + if (tcp->tcp_flow_stopped) { + tcp_clrqfull(tcp); + } + mutex_exit(&tcp->tcp_non_sq_lock); + } + /* + * TODO: you can't just flush these, you have to increase rwnd for one + * thing. For another, how should urgent data interact? + */ + if (fval & FLUSHR) { + *mp->b_rptr = fval & ~FLUSHW; + /* XXX */ + qreply(q, mp); + return; + } + freemsg(mp); +} + +/* + * tcp_wput_iocdata is called by tcp_wput_nondata to handle all M_IOCDATA + * messages. + */ +static void +tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp) +{ + mblk_t *mp1; + struct iocblk *iocp = (struct iocblk *)mp->b_rptr; + STRUCT_HANDLE(strbuf, sb); + uint_t addrlen; + conn_t *connp = tcp->tcp_connp; + queue_t *q = connp->conn_wq; + + /* Make sure it is one of ours. */ + switch (iocp->ioc_cmd) { + case TI_GETMYNAME: + case TI_GETPEERNAME: + break; + default: + /* + * If the conn is closing, then error the ioctl here. Otherwise + * use the CONN_IOCTLREF_* macros to hold off tcp_close until + * we're done here. + */ + mutex_enter(&connp->conn_lock); + if (connp->conn_state_flags & CONN_CLOSING) { + mutex_exit(&connp->conn_lock); + iocp->ioc_error = EINVAL; + mp->b_datap->db_type = M_IOCNAK; + iocp->ioc_count = 0; + qreply(q, mp); + return; + } + + CONN_INC_IOCTLREF_LOCKED(connp); + ip_wput_nondata(q, mp); + CONN_DEC_IOCTLREF(connp); + return; + } + switch (mi_copy_state(q, mp, &mp1)) { + case -1: + return; + case MI_COPY_CASE(MI_COPY_IN, 1): + break; + case MI_COPY_CASE(MI_COPY_OUT, 1): + /* Copy out the strbuf. */ + mi_copyout(q, mp); + return; + case MI_COPY_CASE(MI_COPY_OUT, 2): + /* All done. */ + mi_copy_done(q, mp, 0); + return; + default: + mi_copy_done(q, mp, EPROTO); + return; + } + /* Check alignment of the strbuf */ + if (!OK_32PTR(mp1->b_rptr)) { + mi_copy_done(q, mp, EINVAL); + return; + } + + STRUCT_SET_HANDLE(sb, iocp->ioc_flag, (void *)mp1->b_rptr); + + if (connp->conn_family == AF_INET) + addrlen = sizeof (sin_t); + else + addrlen = sizeof (sin6_t); + + if (STRUCT_FGET(sb, maxlen) < addrlen) { + mi_copy_done(q, mp, EINVAL); + return; + } + + switch (iocp->ioc_cmd) { + case TI_GETMYNAME: + break; + case TI_GETPEERNAME: + if (tcp->tcp_state < TCPS_SYN_RCVD) { + mi_copy_done(q, mp, ENOTCONN); + return; + } + break; + } + mp1 = mi_copyout_alloc(q, mp, STRUCT_FGETP(sb, buf), addrlen, B_TRUE); + if (!mp1) + return; + + STRUCT_FSET(sb, len, addrlen); + switch (((struct iocblk *)mp->b_rptr)->ioc_cmd) { + case TI_GETMYNAME: + (void) conn_getsockname(connp, (struct sockaddr *)mp1->b_wptr, + &addrlen); + break; + case TI_GETPEERNAME: + (void) conn_getpeername(connp, (struct sockaddr *)mp1->b_wptr, + &addrlen); + break; + } + mp1->b_wptr += addrlen; + /* Copy out the address */ + mi_copyout(q, mp); +} + +/* + * tcp_wput_ioctl is called by tcp_wput_nondata() to handle all M_IOCTL + * messages. + */ +/* ARGSUSED */ +static void +tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) +{ + conn_t *connp = (conn_t *)arg; + tcp_t *tcp = connp->conn_tcp; + queue_t *q = connp->conn_wq; + struct iocblk *iocp; + + ASSERT(DB_TYPE(mp) == M_IOCTL); + /* + * Try and ASSERT the minimum possible references on the + * conn early enough. Since we are executing on write side, + * the connection is obviously not detached and that means + * there is a ref each for TCP and IP. Since we are behind + * the squeue, the minimum references needed are 3. If the + * conn is in classifier hash list, there should be an + * extra ref for that (we check both the possibilities). 
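+ * That is: one ref held by TCP, one by IP, one by the squeue caller,
+ * plus one more when the conn is on the classifier fanout list.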
+ */ + ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) || + (connp->conn_fanout == NULL && connp->conn_ref >= 3)); + + iocp = (struct iocblk *)mp->b_rptr; + switch (iocp->ioc_cmd) { + case _SIOCSOCKFALLBACK: + /* + * Either sockmod is about to be popped and the socket + * would now be treated as a plain stream, or a module + * is about to be pushed so we could no longer use read- + * side synchronous streams for fused loopback tcp. + * Drain any queued data and disable direct sockfs + * interface from now on. + */ + if (!tcp->tcp_issocket) { + DB_TYPE(mp) = M_IOCNAK; + iocp->ioc_error = EINVAL; + } else { + tcp_use_pure_tpi(tcp); + DB_TYPE(mp) = M_IOCACK; + iocp->ioc_error = 0; + } + iocp->ioc_count = 0; + iocp->ioc_rval = 0; + qreply(q, mp); + return; + } + + /* + * If the conn is closing, then error the ioctl here. Otherwise bump the + * conn_ioctlref to hold off tcp_close until we're done here. + */ + mutex_enter(&(connp)->conn_lock); + if ((connp)->conn_state_flags & CONN_CLOSING) { + mutex_exit(&(connp)->conn_lock); + iocp->ioc_error = EINVAL; + mp->b_datap->db_type = M_IOCNAK; + iocp->ioc_count = 0; + qreply(q, mp); + return; + } + + CONN_INC_IOCTLREF_LOCKED(connp); + ip_wput_nondata(q, mp); + CONN_DEC_IOCTLREF(connp); +} + +/* + * This routine is called by tcp_wput() to handle all TPI requests. + */ +/* ARGSUSED */ +static void +tcp_wput_proto(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) +{ + conn_t *connp = (conn_t *)arg; + tcp_t *tcp = connp->conn_tcp; + union T_primitives *tprim = (union T_primitives *)mp->b_rptr; + uchar_t *rptr; + t_scalar_t type; + cred_t *cr; + + /* + * Try and ASSERT the minimum possible references on the + * conn early enough. Since we are executing on write side, + * the connection is obviously not detached and that means + * there is a ref each for TCP and IP. Since we are behind + * the squeue, the minimum references needed are 3. If the + * conn is in classifier hash list, there should be an + * extra ref for that (we check both the possibilities). + */ + ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) || + (connp->conn_fanout == NULL && connp->conn_ref >= 3)); + + rptr = mp->b_rptr; + ASSERT((uintptr_t)(mp->b_wptr - rptr) <= (uintptr_t)INT_MAX); + if ((mp->b_wptr - rptr) >= sizeof (t_scalar_t)) { + type = ((union T_primitives *)rptr)->type; + if (type == T_EXDATA_REQ) { + tcp_output_urgent(connp, mp, arg2, NULL); + } else if (type != T_DATA_REQ) { + goto non_urgent_data; + } else { + /* TODO: options, flags, ... from user */ + /* Set length to zero for reclamation below */ + tcp_wput_data(tcp, mp->b_cont, B_TRUE); + freeb(mp); + } + return; + } else { + if (connp->conn_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, + "tcp_wput_proto, dropping one..."); + } + freemsg(mp); + return; + } + +non_urgent_data: + + switch ((int)tprim->type) { + case T_SSL_PROXY_BIND_REQ: /* an SSL proxy endpoint bind request */ + /* + * save the kssl_ent_t from the next block, and convert this + * back to a normal bind_req. 
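+ * (Once the entry is stashed in tcp_kssl_ent, the primitive is
+ * rewritten to T_BIND_REQ and falls through to tcp_tpi_bind() below.)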
+ */ + if (mp->b_cont != NULL) { + ASSERT(MBLKL(mp->b_cont) >= sizeof (kssl_ent_t)); + + if (tcp->tcp_kssl_ent != NULL) { + kssl_release_ent(tcp->tcp_kssl_ent, NULL, + KSSL_NO_PROXY); + tcp->tcp_kssl_ent = NULL; + } + bcopy(mp->b_cont->b_rptr, &tcp->tcp_kssl_ent, + sizeof (kssl_ent_t)); + kssl_hold_ent(tcp->tcp_kssl_ent); + freemsg(mp->b_cont); + mp->b_cont = NULL; + } + tprim->type = T_BIND_REQ; + + /* FALLTHROUGH */ + case O_T_BIND_REQ: /* bind request */ + case T_BIND_REQ: /* new semantics bind request */ + tcp_tpi_bind(tcp, mp); + break; + case T_UNBIND_REQ: /* unbind request */ + tcp_tpi_unbind(tcp, mp); + break; + case O_T_CONN_RES: /* old connection response XXX */ + case T_CONN_RES: /* connection response */ + tcp_tli_accept(tcp, mp); + break; + case T_CONN_REQ: /* connection request */ + tcp_tpi_connect(tcp, mp); + break; + case T_DISCON_REQ: /* disconnect request */ + tcp_disconnect(tcp, mp); + break; + case T_CAPABILITY_REQ: + tcp_capability_req(tcp, mp); /* capability request */ + break; + case T_INFO_REQ: /* information request */ + tcp_info_req(tcp, mp); + break; + case T_SVR4_OPTMGMT_REQ: /* manage options req */ + case T_OPTMGMT_REQ: + /* + * Note: no support for snmpcom_req() through new + * T_OPTMGMT_REQ. See comments in ip.c + */ + + /* + * All Solaris components should pass a db_credp + * for this TPI message, hence we ASSERT. + * But in case there is some other M_PROTO that looks + * like a TPI message sent by some other kernel + * component, we check and return an error. + */ + cr = msg_getcred(mp, NULL); + ASSERT(cr != NULL); + if (cr == NULL) { + tcp_err_ack(tcp, mp, TSYSERR, EINVAL); + return; + } + /* + * If EINPROGRESS is returned, the request has been queued + * for subsequent processing by ip_restart_optmgmt(), which + * will do the CONN_DEC_REF(). + */ + if ((int)tprim->type == T_SVR4_OPTMGMT_REQ) { + svr4_optcom_req(connp->conn_wq, mp, cr, &tcp_opt_obj); + } else { + tpi_optcom_req(connp->conn_wq, mp, cr, &tcp_opt_obj); + } + break; + + case T_UNITDATA_REQ: /* unitdata request */ + tcp_err_ack(tcp, mp, TNOTSUPPORT, 0); + break; + case T_ORDREL_REQ: /* orderly release req */ + freemsg(mp); + + if (tcp->tcp_fused) + tcp_unfuse(tcp); + + if (tcp_xmit_end(tcp) != 0) { + /* + * We were crossing FINs and got a reset from + * the other side. Just ignore it. + */ + if (connp->conn_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, + SL_ERROR|SL_TRACE, + "tcp_wput_proto, T_ORDREL_REQ out of " + "state %s", + tcp_display(tcp, NULL, + DISP_ADDR_AND_PORT)); + } + } + break; + case T_ADDR_REQ: + tcp_addr_req(tcp, mp); + break; + default: + if (connp->conn_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, + "tcp_wput_proto, bogus TPI msg, type %d", + tprim->type); + } + /* + * We used to M_ERROR. Sending TNOTSUPPORT gives the user + * to recover. + */ + tcp_err_ack(tcp, mp, TNOTSUPPORT, 0); + break; + } +} + +/* + * Handle special out-of-band ioctl requests (see PSARC/2008/265). 
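+ * These arrive as M_CMD blocks carrying a cmdblk_t and are answered
+ * directly with qreply(), without going through the squeue.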
+ */ +static void +tcp_wput_cmdblk(queue_t *q, mblk_t *mp) +{ + void *data; + mblk_t *datamp = mp->b_cont; + conn_t *connp = Q_TO_CONN(q); + tcp_t *tcp = connp->conn_tcp; + cmdblk_t *cmdp = (cmdblk_t *)mp->b_rptr; + + if (datamp == NULL || MBLKL(datamp) < cmdp->cb_len) { + cmdp->cb_error = EPROTO; + qreply(q, mp); + return; + } + + data = datamp->b_rptr; + + switch (cmdp->cb_cmd) { + case TI_GETPEERNAME: + if (tcp->tcp_state < TCPS_SYN_RCVD) + cmdp->cb_error = ENOTCONN; + else + cmdp->cb_error = conn_getpeername(connp, data, + &cmdp->cb_len); + break; + case TI_GETMYNAME: + cmdp->cb_error = conn_getsockname(connp, data, &cmdp->cb_len); + break; + default: + cmdp->cb_error = EINVAL; + break; + } + + qreply(q, mp); +} + +/* + * The TCP fast path write put procedure. + * NOTE: the logic of the fast path is duplicated from tcp_wput_data() + */ +/* ARGSUSED */ +void +tcp_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) +{ + int len; + int hdrlen; + int plen; + mblk_t *mp1; + uchar_t *rptr; + uint32_t snxt; + tcpha_t *tcpha; + struct datab *db; + uint32_t suna; + uint32_t mss; + ipaddr_t *dst; + ipaddr_t *src; + uint32_t sum; + int usable; + conn_t *connp = (conn_t *)arg; + tcp_t *tcp = connp->conn_tcp; + uint32_t msize; + tcp_stack_t *tcps = tcp->tcp_tcps; + ip_xmit_attr_t *ixa; + clock_t now; + + /* + * Try and ASSERT the minimum possible references on the + * conn early enough. Since we are executing on write side, + * the connection is obviously not detached and that means + * there is a ref each for TCP and IP. Since we are behind + * the squeue, the minimum references needed are 3. If the + * conn is in classifier hash list, there should be an + * extra ref for that (we check both the possibilities). + */ + ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) || + (connp->conn_fanout == NULL && connp->conn_ref >= 3)); + + ASSERT(DB_TYPE(mp) == M_DATA); + msize = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp); + + mutex_enter(&tcp->tcp_non_sq_lock); + tcp->tcp_squeue_bytes -= msize; + mutex_exit(&tcp->tcp_non_sq_lock); + + /* Bypass tcp protocol for fused tcp loopback */ + if (tcp->tcp_fused && tcp_fuse_output(tcp, mp, msize)) + return; + + mss = tcp->tcp_mss; + /* + * If ZEROCOPY has turned off, try not to send any zero-copy message + * down. Do backoff, now. + */ + if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_snd_zcopy_on) + mp = tcp_zcopy_backoff(tcp, mp, B_FALSE); + + + ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); + len = (int)(mp->b_wptr - mp->b_rptr); + + /* + * Criteria for fast path: + * + * 1. no unsent data + * 2. single mblk in request + * 3. connection established + * 4. data in mblk + * 5. len <= mss + * 6. 
no tcp_valid bits + */ + if ((tcp->tcp_unsent != 0) || + (tcp->tcp_cork) || + (mp->b_cont != NULL) || + (tcp->tcp_state != TCPS_ESTABLISHED) || + (len == 0) || + (len > mss) || + (tcp->tcp_valid_bits != 0)) { + tcp_wput_data(tcp, mp, B_FALSE); + return; + } + + ASSERT(tcp->tcp_xmit_tail_unsent == 0); + ASSERT(tcp->tcp_fin_sent == 0); + + /* queue new packet onto retransmission queue */ + if (tcp->tcp_xmit_head == NULL) { + tcp->tcp_xmit_head = mp; + } else { + tcp->tcp_xmit_last->b_cont = mp; + } + tcp->tcp_xmit_last = mp; + tcp->tcp_xmit_tail = mp; + + /* find out how much we can send */ + /* BEGIN CSTYLED */ + /* + * un-acked usable + * |--------------|-----------------| + * tcp_suna tcp_snxt tcp_suna+tcp_swnd + */ + /* END CSTYLED */ + + /* start sending from tcp_snxt */ + snxt = tcp->tcp_snxt; + + /* + * Check to see if this connection has been idled for some + * time and no ACK is expected. If it is, we need to slow + * start again to get back the connection's "self-clock" as + * described in VJ's paper. + * + * Reinitialize tcp_cwnd after idle. + */ + now = LBOLT_FASTPATH; + if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet && + (TICK_TO_MSEC(now - tcp->tcp_last_recv_time) >= tcp->tcp_rto)) { + TCP_SET_INIT_CWND(tcp, mss, tcps->tcps_slow_start_after_idle); + } + + usable = tcp->tcp_swnd; /* tcp window size */ + if (usable > tcp->tcp_cwnd) + usable = tcp->tcp_cwnd; /* congestion window smaller */ + usable -= snxt; /* subtract stuff already sent */ + suna = tcp->tcp_suna; + usable += suna; + /* usable can be < 0 if the congestion window is smaller */ + if (len > usable) { + /* Can't send complete M_DATA in one shot */ + goto slow; + } + + mutex_enter(&tcp->tcp_non_sq_lock); + if (tcp->tcp_flow_stopped && + TCP_UNSENT_BYTES(tcp) <= connp->conn_sndlowat) { + tcp_clrqfull(tcp); + } + mutex_exit(&tcp->tcp_non_sq_lock); + + /* + * determine if anything to send (Nagle). + * + * 1. len < tcp_mss (i.e. small) + * 2. unacknowledged data present + * 3. len < nagle limit + * 4. last packet sent < nagle limit (previous packet sent) + */ + if ((len < mss) && (snxt != suna) && + (len < (int)tcp->tcp_naglim) && + (tcp->tcp_last_sent_len < tcp->tcp_naglim)) { + /* + * This was the first unsent packet and normally + * mss < xmit_hiwater so there is no need to worry + * about flow control. The next packet will go + * through the flow control check in tcp_wput_data(). + */ + /* leftover work from above */ + tcp->tcp_unsent = len; + tcp->tcp_xmit_tail_unsent = len; + + return; + } + + /* + * len <= tcp->tcp_mss && len == unsent so no sender silly window. Can + * send now. + */ + + if (snxt == suna) { + TCP_TIMER_RESTART(tcp, tcp->tcp_rto); + } + + /* we have always sent something */ + tcp->tcp_rack_cnt = 0; + + tcp->tcp_snxt = snxt + len; + tcp->tcp_rack = tcp->tcp_rnxt; + + if ((mp1 = dupb(mp)) == 0) + goto no_memory; + mp->b_prev = (mblk_t *)(uintptr_t)now; + mp->b_next = (mblk_t *)(uintptr_t)snxt; + + /* adjust tcp header information */ + tcpha = tcp->tcp_tcpha; + tcpha->tha_flags = (TH_ACK|TH_PUSH); + + sum = len + connp->conn_ht_ulp_len + connp->conn_sum; + sum = (sum >> 16) + (sum & 0xFFFF); + tcpha->tha_sum = htons(sum); + + tcpha->tha_seq = htonl(snxt); + + TCPS_BUMP_MIB(tcps, tcpOutDataSegs); + TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len); + BUMP_LOCAL(tcp->tcp_obsegs); + + /* Update the latest receive window size in TCP header. 
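+ * (tcp_rwnd is advertised right-shifted by tcp_rcv_ws, the receive
+ * window scale factor in use on this connection.)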
*/ + tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); + + tcp->tcp_last_sent_len = (ushort_t)len; + + plen = len + connp->conn_ht_iphc_len; + + ixa = connp->conn_ixa; + ixa->ixa_pktlen = plen; + + if (ixa->ixa_flags & IXAF_IS_IPV4) { + tcp->tcp_ipha->ipha_length = htons(plen); + } else { + tcp->tcp_ip6h->ip6_plen = htons(plen - IPV6_HDR_LEN); + } + + /* see if we need to allocate a mblk for the headers */ + hdrlen = connp->conn_ht_iphc_len; + rptr = mp1->b_rptr - hdrlen; + db = mp1->b_datap; + if ((db->db_ref != 2) || rptr < db->db_base || + (!OK_32PTR(rptr))) { + /* NOTE: we assume allocb returns an OK_32PTR */ + mp = allocb(hdrlen + tcps->tcps_wroff_xtra, BPRI_MED); + if (!mp) { + freemsg(mp1); + goto no_memory; + } + mp->b_cont = mp1; + mp1 = mp; + /* Leave room for Link Level header */ + rptr = &mp1->b_rptr[tcps->tcps_wroff_xtra]; + mp1->b_wptr = &rptr[hdrlen]; + } + mp1->b_rptr = rptr; + + /* Fill in the timestamp option. */ + if (tcp->tcp_snd_ts_ok) { + uint32_t llbolt = (uint32_t)LBOLT_FASTPATH; + + U32_TO_BE32(llbolt, + (char *)tcpha + TCP_MIN_HEADER_LENGTH+4); + U32_TO_BE32(tcp->tcp_ts_recent, + (char *)tcpha + TCP_MIN_HEADER_LENGTH+8); + } else { + ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH); + } + + /* copy header into outgoing packet */ + dst = (ipaddr_t *)rptr; + src = (ipaddr_t *)connp->conn_ht_iphc; + dst[0] = src[0]; + dst[1] = src[1]; + dst[2] = src[2]; + dst[3] = src[3]; + dst[4] = src[4]; + dst[5] = src[5]; + dst[6] = src[6]; + dst[7] = src[7]; + dst[8] = src[8]; + dst[9] = src[9]; + if (hdrlen -= 40) { + hdrlen >>= 2; + dst += 10; + src += 10; + do { + *dst++ = *src++; + } while (--hdrlen); + } + + /* + * Set the ECN info in the TCP header. Note that this + * is not the template header. + */ + if (tcp->tcp_ecn_ok) { + TCP_SET_ECT(tcp, rptr); + + tcpha = (tcpha_t *)(rptr + ixa->ixa_ip_hdr_length); + if (tcp->tcp_ecn_echo_on) + tcpha->tha_flags |= TH_ECE; + if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) { + tcpha->tha_flags |= TH_CWR; + tcp->tcp_ecn_cwr_sent = B_TRUE; + } + } + + if (tcp->tcp_ip_forward_progress) { + tcp->tcp_ip_forward_progress = B_FALSE; + connp->conn_ixa->ixa_flags |= IXAF_REACH_CONF; + } else { + connp->conn_ixa->ixa_flags &= ~IXAF_REACH_CONF; + } + tcp_send_data(tcp, mp1); + return; + + /* + * If we ran out of memory, we pretend to have sent the packet + * and that it was lost on the wire. + */ +no_memory: + return; + +slow: + /* leftover work from above */ + tcp->tcp_unsent = len; + tcp->tcp_xmit_tail_unsent = len; + tcp_wput_data(tcp, NULL, B_FALSE); +} + +/* ARGSUSED2 */ +void +tcp_output_urgent(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) +{ + int len; + uint32_t msize; + conn_t *connp = (conn_t *)arg; + tcp_t *tcp = connp->conn_tcp; + + msize = msgdsize(mp); + + len = msize - 1; + if (len < 0) { + freemsg(mp); + return; + } + + /* + * Try to force urgent data out on the wire. Even if we have unsent + * data this will at least send the urgent flag. + * XXX does not handle more flag correctly. 
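+ * The urgent sequence number computed below is
+ * snxt + unsent + (msize - 1), i.e. the last byte of the data being
+ * queued here.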
+ */ + len += tcp->tcp_unsent; + len += tcp->tcp_snxt; + tcp->tcp_urg = len; + tcp->tcp_valid_bits |= TCP_URG_VALID; + + /* Bypass tcp protocol for fused tcp loopback */ + if (tcp->tcp_fused && tcp_fuse_output(tcp, mp, msize)) + return; + + /* Strip off the T_EXDATA_REQ if the data is from TPI */ + if (DB_TYPE(mp) != M_DATA) { + mblk_t *mp1 = mp; + ASSERT(!IPCL_IS_NONSTR(connp)); + mp = mp->b_cont; + freeb(mp1); + } + tcp_wput_data(tcp, mp, B_TRUE); +} + +/* + * Called by streams close routine via squeues when our client blows off her + * descriptor, we take this to mean: "close the stream state NOW, close the tcp + * connection politely" When SO_LINGER is set (with a non-zero linger time and + * it is not a nonblocking socket) then this routine sleeps until the FIN is + * acked. + * + * NOTE: tcp_close potentially returns error when lingering. + * However, the stream head currently does not pass these errors + * to the application. 4.4BSD only returns EINTR and EWOULDBLOCK + * errors to the application (from tsleep()) and not errors + * like ECONNRESET caused by receiving a reset packet. + */ + +/* ARGSUSED */ +void +tcp_close_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) +{ + char *msg; + conn_t *connp = (conn_t *)arg; + tcp_t *tcp = connp->conn_tcp; + clock_t delta = 0; + tcp_stack_t *tcps = tcp->tcp_tcps; + + ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) || + (connp->conn_fanout == NULL && connp->conn_ref >= 3)); + + mutex_enter(&tcp->tcp_eager_lock); + if (tcp->tcp_conn_req_cnt_q0 != 0 || tcp->tcp_conn_req_cnt_q != 0) { + /* Cleanup for listener */ + tcp_eager_cleanup(tcp, 0); + tcp->tcp_wait_for_eagers = 1; + } + mutex_exit(&tcp->tcp_eager_lock); + + tcp->tcp_lso = B_FALSE; + + msg = NULL; + switch (tcp->tcp_state) { + case TCPS_CLOSED: + case TCPS_IDLE: + case TCPS_BOUND: + case TCPS_LISTEN: + break; + case TCPS_SYN_SENT: + msg = "tcp_close, during connect"; + break; + case TCPS_SYN_RCVD: + /* + * Close during the connect 3-way handshake + * but here there may or may not be pending data + * already on queue. Process almost same as in + * the ESTABLISHED state. + */ + /* FALLTHRU */ + default: + if (tcp->tcp_fused) + tcp_unfuse(tcp); + + /* + * If SO_LINGER has set a zero linger time, abort the + * connection with a reset. + */ + if (connp->conn_linger && connp->conn_lingertime == 0) { + msg = "tcp_close, zero lingertime"; + break; + } + + /* + * Abort connection if there is unread data queued. + */ + if (tcp->tcp_rcv_list || tcp->tcp_reass_head) { + msg = "tcp_close, unread data"; + break; + } + /* + * We have done a qwait() above which could have possibly + * drained more messages in turn causing transition to a + * different state. Check whether we have to do the rest + * of the processing or not. + */ + if (tcp->tcp_state <= TCPS_LISTEN) + break; + + /* + * Transmit the FIN before detaching the tcp_t. + * After tcp_detach returns this queue/perimeter + * no longer owns the tcp_t thus others can modify it. + */ + (void) tcp_xmit_end(tcp); + + /* + * If lingering on close then wait until the fin is acked, + * the SO_LINGER time passes, or a reset is sent/received. 
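+ * For a non-blocking socket we give up immediately with EWOULDBLOCK
+ * rather than arming the linger timer.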
+ */ + if (connp->conn_linger && connp->conn_lingertime > 0 && + !(tcp->tcp_fin_acked) && + tcp->tcp_state >= TCPS_ESTABLISHED) { + if (tcp->tcp_closeflags & (FNDELAY|FNONBLOCK)) { + tcp->tcp_client_errno = EWOULDBLOCK; + } else if (tcp->tcp_client_errno == 0) { + + ASSERT(tcp->tcp_linger_tid == 0); + + tcp->tcp_linger_tid = TCP_TIMER(tcp, + tcp_close_linger_timeout, + connp->conn_lingertime * hz); + + /* tcp_close_linger_timeout will finish close */ + if (tcp->tcp_linger_tid == 0) + tcp->tcp_client_errno = ENOSR; + else + return; + } + + /* + * Check if we need to detach or just close + * the instance. + */ + if (tcp->tcp_state <= TCPS_LISTEN) + break; + } + + /* + * Make sure that no other thread will access the conn_rq of + * this instance (through lookups etc.) as conn_rq will go + * away shortly. + */ + tcp_acceptor_hash_remove(tcp); + + mutex_enter(&tcp->tcp_non_sq_lock); + if (tcp->tcp_flow_stopped) { + tcp_clrqfull(tcp); + } + mutex_exit(&tcp->tcp_non_sq_lock); + + if (tcp->tcp_timer_tid != 0) { + delta = TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid); + tcp->tcp_timer_tid = 0; + } + /* + * Need to cancel those timers which will not be used when + * TCP is detached. This has to be done before the conn_wq + * is set to NULL. + */ + tcp_timers_stop(tcp); + + tcp->tcp_detached = B_TRUE; + if (tcp->tcp_state == TCPS_TIME_WAIT) { + tcp_time_wait_append(tcp); + TCP_DBGSTAT(tcps, tcp_detach_time_wait); + ASSERT(connp->conn_ref >= 3); + goto finish; + } + + /* + * If delta is zero the timer event wasn't executed and was + * successfully canceled. In this case we need to restart it + * with the minimal delta possible. + */ + if (delta >= 0) + tcp->tcp_timer_tid = TCP_TIMER(tcp, tcp_timer, + delta ? delta : 1); + + ASSERT(connp->conn_ref >= 3); + goto finish; + } + + /* Detach did not complete. Still need to remove q from stream. */ + if (msg) { + if (tcp->tcp_state == TCPS_ESTABLISHED || + tcp->tcp_state == TCPS_CLOSE_WAIT) + TCPS_BUMP_MIB(tcps, tcpEstabResets); + if (tcp->tcp_state == TCPS_SYN_SENT || + tcp->tcp_state == TCPS_SYN_RCVD) + TCPS_BUMP_MIB(tcps, tcpAttemptFails); + tcp_xmit_ctl(msg, tcp, tcp->tcp_snxt, 0, TH_RST); + } + + tcp_closei_local(tcp); + CONN_DEC_REF(connp); + ASSERT(connp->conn_ref >= 2); + +finish: + mutex_enter(&tcp->tcp_closelock); + /* + * Don't change the queues in the case of a listener that has + * eagers in its q or q0. It could surprise the eagers. + * Instead wait for the eagers outside the squeue. + */ + if (!tcp->tcp_wait_for_eagers) { + tcp->tcp_detached = B_TRUE; + connp->conn_rq = NULL; + connp->conn_wq = NULL; + } + + /* Signal tcp_close() to finish closing. */ + tcp->tcp_closed = 1; + cv_signal(&tcp->tcp_closecv); + mutex_exit(&tcp->tcp_closelock); +} + +/* ARGSUSED */ +void +tcp_shutdown_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) +{ + conn_t *connp = (conn_t *)arg; + tcp_t *tcp = connp->conn_tcp; + + freemsg(mp); + + if (tcp->tcp_fused) + tcp_unfuse(tcp); + + if (tcp_xmit_end(tcp) != 0) { + /* + * We were crossing FINs and got a reset from + * the other side. Just ignore it. + */ + if (connp->conn_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, + SL_ERROR|SL_TRACE, + "tcp_shutdown_output() out of state %s", + tcp_display(tcp, NULL, DISP_ADDR_AND_PORT)); + } + } +} + +#pragma inline(tcp_send_data) + +void +tcp_send_data(tcp_t *tcp, mblk_t *mp) +{ + conn_t *connp = tcp->tcp_connp; + + /* + * Check here to avoid sending zero-copy message down to IP when + * ZEROCOPY capability has turned off. 
We only need to deal with + * the race condition between sockfs and the notification here. + * Since we have tried to backoff the tcp_xmit_head when turning + * zero-copy off and new messages in tcp_output(), we simply drop + * the dup'ed packet here and let tcp retransmit, if tcp_xmit_zc_clean + * is not true. + */ + if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_snd_zcopy_on && + !tcp->tcp_xmit_zc_clean) { + ip_drop_output("TCP ZC was disabled but not clean", mp, NULL); + freemsg(mp); + return; + } + + ASSERT(connp->conn_ixa->ixa_notify_cookie == connp->conn_tcp); + (void) conn_ip_output(mp, connp->conn_ixa); +} + +/* ARGSUSED2 */ +void +tcp_send_synack(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) +{ + conn_t *econnp = (conn_t *)arg; + tcp_t *tcp = econnp->conn_tcp; + + /* Guard against a RST having blown it away while on the squeue */ + if (tcp->tcp_state == TCPS_CLOSED) { + freemsg(mp); + return; + } + + (void) conn_ip_output(mp, econnp->conn_ixa); +} + +/* + * tcp_send() is called by tcp_wput_data() and returns one of the following: + * + * -1 = failed allocation. + * 0 = success; burst count reached, or usable send window is too small, + * and that we'd rather wait until later before sending again. + */ +static int +tcp_send(tcp_t *tcp, const int mss, const int total_hdr_len, + const int tcp_hdr_len, const int num_sack_blk, int *usable, + uint_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time) +{ + int num_burst_seg = tcp->tcp_snd_burst; + int num_lso_seg = 1; + uint_t lso_usable; + boolean_t do_lso_send = B_FALSE; + tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; + ip_xmit_attr_t *ixa = connp->conn_ixa; + + /* + * Check LSO possibility. The value of tcp->tcp_lso indicates whether + * the underlying connection is LSO capable. Will check whether having + * enough available data to initiate LSO transmission in the for(){} + * loops. + */ + if (tcp->tcp_lso && (tcp->tcp_valid_bits & ~TCP_FSS_VALID) == 0) + do_lso_send = B_TRUE; + + for (;;) { + struct datab *db; + tcpha_t *tcpha; + uint32_t sum; + mblk_t *mp, *mp1; + uchar_t *rptr; + int len; + + /* + * Burst count reached, return successfully. + */ + if (num_burst_seg == 0) + break; + + /* + * Calculate the maximum payload length we can send at one + * time. + */ + if (do_lso_send) { + /* + * Check whether be able to to do LSO for the current + * available data. + */ + if (num_burst_seg >= 2 && (*usable - 1) / mss >= 1) { + lso_usable = MIN(tcp->tcp_lso_max, *usable); + lso_usable = MIN(lso_usable, + num_burst_seg * mss); + + num_lso_seg = lso_usable / mss; + if (lso_usable % mss) { + num_lso_seg++; + tcp->tcp_last_sent_len = (ushort_t) + (lso_usable % mss); + } else { + tcp->tcp_last_sent_len = (ushort_t)mss; + } + } else { + do_lso_send = B_FALSE; + num_lso_seg = 1; + lso_usable = mss; + } + } + + ASSERT(num_lso_seg <= IP_MAXPACKET / mss + 1); +#ifdef DEBUG + DTRACE_PROBE2(tcp_send_lso, int, num_lso_seg, boolean_t, + do_lso_send); +#endif + /* + * Adjust num_burst_seg here. + */ + num_burst_seg -= num_lso_seg; + + len = mss; + if (len > *usable) { + ASSERT(do_lso_send == B_FALSE); + + len = *usable; + if (len <= 0) { + /* Terminate the loop */ + break; /* success; too small */ + } + /* + * Sender silly-window avoidance. + * Ignore this if we are going to send a + * zero window probe out. + * + * TODO: force data into microscopic window? 
+ * ==> (!pushed || (unsent > usable)) + */ + if (len < (tcp->tcp_max_swnd >> 1) && + (tcp->tcp_unsent - (*snxt - tcp->tcp_snxt)) > len && + !((tcp->tcp_valid_bits & TCP_URG_VALID) && + len == 1) && (! tcp->tcp_zero_win_probe)) { + /* + * If the retransmit timer is not running + * we start it so that we will retransmit + * in the case when the receiver has + * decremented the window. + */ + if (*snxt == tcp->tcp_snxt && + *snxt == tcp->tcp_suna) { + /* + * We are not supposed to send + * anything. So let's wait a little + * bit longer before breaking SWS + * avoidance. + * + * What should the value be? + * Suggestion: MAX(init rexmit time, + * tcp->tcp_rto) + */ + TCP_TIMER_RESTART(tcp, tcp->tcp_rto); + } + break; /* success; too small */ + } + } + + tcpha = tcp->tcp_tcpha; + + /* + * The reason to adjust len here is that we need to set flags + * and calculate checksum. + */ + if (do_lso_send) + len = lso_usable; + + *usable -= len; /* Approximate - can be adjusted later */ + if (*usable > 0) + tcpha->tha_flags = TH_ACK; + else + tcpha->tha_flags = (TH_ACK | TH_PUSH); + + /* + * Prime pump for IP's checksumming on our behalf. + * Include the adjustment for a source route if any. + * In case of LSO, the partial pseudo-header checksum should + * exclusive TCP length, so zero tha_sum before IP calculate + * pseudo-header checksum for partial checksum offload. + */ + if (do_lso_send) { + sum = 0; + } else { + sum = len + tcp_hdr_len + connp->conn_sum; + sum = (sum >> 16) + (sum & 0xFFFF); + } + tcpha->tha_sum = htons(sum); + tcpha->tha_seq = htonl(*snxt); + + /* + * Branch off to tcp_xmit_mp() if any of the VALID bits is + * set. For the case when TCP_FSS_VALID is the only valid + * bit (normal active close), branch off only when we think + * that the FIN flag needs to be set. Note for this case, + * that (snxt + len) may not reflect the actual seg_len, + * as len may be further reduced in tcp_xmit_mp(). If len + * gets modified, we will end up here again. + */ + if (tcp->tcp_valid_bits != 0 && + (tcp->tcp_valid_bits != TCP_FSS_VALID || + ((*snxt + len) == tcp->tcp_fss))) { + uchar_t *prev_rptr; + uint32_t prev_snxt = tcp->tcp_snxt; + + if (*tail_unsent == 0) { + ASSERT((*xmit_tail)->b_cont != NULL); + *xmit_tail = (*xmit_tail)->b_cont; + prev_rptr = (*xmit_tail)->b_rptr; + *tail_unsent = (int)((*xmit_tail)->b_wptr - + (*xmit_tail)->b_rptr); + } else { + prev_rptr = (*xmit_tail)->b_rptr; + (*xmit_tail)->b_rptr = (*xmit_tail)->b_wptr - + *tail_unsent; + } + mp = tcp_xmit_mp(tcp, *xmit_tail, len, NULL, NULL, + *snxt, B_FALSE, (uint32_t *)&len, B_FALSE); + /* Restore tcp_snxt so we get amount sent right. */ + tcp->tcp_snxt = prev_snxt; + if (prev_rptr == (*xmit_tail)->b_rptr) { + /* + * If the previous timestamp is still in use, + * don't stomp on it. 
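+ * (b_prev stashes the transmit timestamp for later RTT use and b_next
+ * the starting sequence number, so they are only reset here while
+ * b_next is still NULL.)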
+ */ + if ((*xmit_tail)->b_next == NULL) { + (*xmit_tail)->b_prev = local_time; + (*xmit_tail)->b_next = + (mblk_t *)(uintptr_t)(*snxt); + } + } else + (*xmit_tail)->b_rptr = prev_rptr; + + if (mp == NULL) { + return (-1); + } + mp1 = mp->b_cont; + + if (len <= mss) /* LSO is unusable (!do_lso_send) */ + tcp->tcp_last_sent_len = (ushort_t)len; + while (mp1->b_cont) { + *xmit_tail = (*xmit_tail)->b_cont; + (*xmit_tail)->b_prev = local_time; + (*xmit_tail)->b_next = + (mblk_t *)(uintptr_t)(*snxt); + mp1 = mp1->b_cont; + } + *snxt += len; + *tail_unsent = (*xmit_tail)->b_wptr - mp1->b_wptr; + BUMP_LOCAL(tcp->tcp_obsegs); + TCPS_BUMP_MIB(tcps, tcpOutDataSegs); + TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len); + tcp_send_data(tcp, mp); + continue; + } + + *snxt += len; /* Adjust later if we don't send all of len */ + TCPS_BUMP_MIB(tcps, tcpOutDataSegs); + TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, len); + + if (*tail_unsent) { + /* Are the bytes above us in flight? */ + rptr = (*xmit_tail)->b_wptr - *tail_unsent; + if (rptr != (*xmit_tail)->b_rptr) { + *tail_unsent -= len; + if (len <= mss) /* LSO is unusable */ + tcp->tcp_last_sent_len = (ushort_t)len; + len += total_hdr_len; + ixa->ixa_pktlen = len; + + if (ixa->ixa_flags & IXAF_IS_IPV4) { + tcp->tcp_ipha->ipha_length = htons(len); + } else { + tcp->tcp_ip6h->ip6_plen = + htons(len - IPV6_HDR_LEN); + } + + mp = dupb(*xmit_tail); + if (mp == NULL) { + return (-1); /* out_of_mem */ + } + mp->b_rptr = rptr; + /* + * If the old timestamp is no longer in use, + * sample a new timestamp now. + */ + if ((*xmit_tail)->b_next == NULL) { + (*xmit_tail)->b_prev = local_time; + (*xmit_tail)->b_next = + (mblk_t *)(uintptr_t)(*snxt-len); + } + goto must_alloc; + } + } else { + *xmit_tail = (*xmit_tail)->b_cont; + ASSERT((uintptr_t)((*xmit_tail)->b_wptr - + (*xmit_tail)->b_rptr) <= (uintptr_t)INT_MAX); + *tail_unsent = (int)((*xmit_tail)->b_wptr - + (*xmit_tail)->b_rptr); + } + + (*xmit_tail)->b_prev = local_time; + (*xmit_tail)->b_next = (mblk_t *)(uintptr_t)(*snxt - len); + + *tail_unsent -= len; + if (len <= mss) /* LSO is unusable (!do_lso_send) */ + tcp->tcp_last_sent_len = (ushort_t)len; + + len += total_hdr_len; + ixa->ixa_pktlen = len; + + if (ixa->ixa_flags & IXAF_IS_IPV4) { + tcp->tcp_ipha->ipha_length = htons(len); + } else { + tcp->tcp_ip6h->ip6_plen = htons(len - IPV6_HDR_LEN); + } + + mp = dupb(*xmit_tail); + if (mp == NULL) { + return (-1); /* out_of_mem */ + } + + len = total_hdr_len; + /* + * There are four reasons to allocate a new hdr mblk: + * 1) The bytes above us are in use by another packet + * 2) We don't have good alignment + * 3) The mblk is being shared + * 4) We don't have enough room for a header + */ + rptr = mp->b_rptr - len; + if (!OK_32PTR(rptr) || + ((db = mp->b_datap), db->db_ref != 2) || + rptr < db->db_base) { + /* NOTE: we assume allocb returns an OK_32PTR */ + + must_alloc:; + mp1 = allocb(connp->conn_ht_iphc_allocated + + tcps->tcps_wroff_xtra, BPRI_MED); + if (mp1 == NULL) { + freemsg(mp); + return (-1); /* out_of_mem */ + } + mp1->b_cont = mp; + mp = mp1; + /* Leave room for Link Level header */ + len = total_hdr_len; + rptr = &mp->b_rptr[tcps->tcps_wroff_xtra]; + mp->b_wptr = &rptr[len]; + } + + /* + * Fill in the header using the template header, and add + * options such as time-stamp, ECN and/or SACK, as needed. 
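+ * (rptr points at the header room set up above; num_sack_blk is
+ * non-zero only when SACK blocks need to be appended to this segment.)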
+ */ + tcp_fill_header(tcp, rptr, (clock_t)local_time, num_sack_blk); + + mp->b_rptr = rptr; + + if (*tail_unsent) { + int spill = *tail_unsent; + + mp1 = mp->b_cont; + if (mp1 == NULL) + mp1 = mp; + + /* + * If we're a little short, tack on more mblks until + * there is no more spillover. + */ + while (spill < 0) { + mblk_t *nmp; + int nmpsz; + + nmp = (*xmit_tail)->b_cont; + nmpsz = MBLKL(nmp); + + /* + * Excess data in mblk; can we split it? + * If LSO is enabled for the connection, + * keep on splitting as this is a transient + * send path. + */ + if (!do_lso_send && (spill + nmpsz > 0)) { + /* + * Don't split if stream head was + * told to break up larger writes + * into smaller ones. + */ + if (tcp->tcp_maxpsz_multiplier > 0) + break; + + /* + * Next mblk is less than SMSS/2 + * rounded up to nearest 64-byte; + * let it get sent as part of the + * next segment. + */ + if (tcp->tcp_localnet && + !tcp->tcp_cork && + (nmpsz < roundup((mss >> 1), 64))) + break; + } + + *xmit_tail = nmp; + ASSERT((uintptr_t)nmpsz <= (uintptr_t)INT_MAX); + /* Stash for rtt use later */ + (*xmit_tail)->b_prev = local_time; + (*xmit_tail)->b_next = + (mblk_t *)(uintptr_t)(*snxt - len); + mp1->b_cont = dupb(*xmit_tail); + mp1 = mp1->b_cont; + + spill += nmpsz; + if (mp1 == NULL) { + *tail_unsent = spill; + freemsg(mp); + return (-1); /* out_of_mem */ + } + } + + /* Trim back any surplus on the last mblk */ + if (spill >= 0) { + mp1->b_wptr -= spill; + *tail_unsent = spill; + } else { + /* + * We did not send everything we could in + * order to remain within the b_cont limit. + */ + *usable -= spill; + *snxt += spill; + tcp->tcp_last_sent_len += spill; + TCPS_UPDATE_MIB(tcps, tcpOutDataBytes, spill); + /* + * Adjust the checksum + */ + tcpha = (tcpha_t *)(rptr + + ixa->ixa_ip_hdr_length); + sum += spill; + sum = (sum >> 16) + (sum & 0xFFFF); + tcpha->tha_sum = htons(sum); + if (connp->conn_ipversion == IPV4_VERSION) { + sum = ntohs( + ((ipha_t *)rptr)->ipha_length) + + spill; + ((ipha_t *)rptr)->ipha_length = + htons(sum); + } else { + sum = ntohs( + ((ip6_t *)rptr)->ip6_plen) + + spill; + ((ip6_t *)rptr)->ip6_plen = + htons(sum); + } + ixa->ixa_pktlen += spill; + *tail_unsent = 0; + } + } + if (tcp->tcp_ip_forward_progress) { + tcp->tcp_ip_forward_progress = B_FALSE; + ixa->ixa_flags |= IXAF_REACH_CONF; + } else { + ixa->ixa_flags &= ~IXAF_REACH_CONF; + } + + if (do_lso_send) { + /* Append LSO information to the mp. */ + lso_info_set(mp, mss, HW_LSO); + ixa->ixa_fragsize = IP_MAXPACKET; + ixa->ixa_extra_ident = num_lso_seg - 1; + + DTRACE_PROBE2(tcp_send_lso, int, num_lso_seg, + boolean_t, B_TRUE); + + tcp_send_data(tcp, mp); + + /* + * Restore values of ixa_fragsize and ixa_extra_ident. + */ + ixa->ixa_fragsize = ixa->ixa_pmtu; + ixa->ixa_extra_ident = 0; + tcp->tcp_obsegs += num_lso_seg; + TCP_STAT(tcps, tcp_lso_times); + TCP_STAT_UPDATE(tcps, tcp_lso_pkt_out, num_lso_seg); + } else { + /* + * Make sure to clean up LSO information. Wherever a + * new mp uses the prepended header room after dupb(), + * lso_info_cleanup() should be called. + */ + lso_info_cleanup(mp); + tcp_send_data(tcp, mp); + BUMP_LOCAL(tcp->tcp_obsegs); + } + } + + return (0); +} + +/* + * Initiate closedown sequence on an active connection. (May be called as + * writer.) Return value zero for OK return, non-zero for error return. 
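+ * The only non-zero return today is -1, for a connection that is not
+ * in a state where a FIN may be sent (see the state check below).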
+ */ +static int +tcp_xmit_end(tcp_t *tcp) +{ + mblk_t *mp; + tcp_stack_t *tcps = tcp->tcp_tcps; + iulp_t uinfo; + ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; + conn_t *connp = tcp->tcp_connp; + + if (tcp->tcp_state < TCPS_SYN_RCVD || + tcp->tcp_state > TCPS_CLOSE_WAIT) { + /* + * Invalid state, only states TCPS_SYN_RCVD, + * TCPS_ESTABLISHED and TCPS_CLOSE_WAIT are valid + */ + return (-1); + } + + tcp->tcp_fss = tcp->tcp_snxt + tcp->tcp_unsent; + tcp->tcp_valid_bits |= TCP_FSS_VALID; + /* + * If there is nothing more unsent, send the FIN now. + * Otherwise, it will go out with the last segment. + */ + if (tcp->tcp_unsent == 0) { + mp = tcp_xmit_mp(tcp, NULL, 0, NULL, NULL, + tcp->tcp_fss, B_FALSE, NULL, B_FALSE); + + if (mp) { + tcp_send_data(tcp, mp); + } else { + /* + * Couldn't allocate msg. Pretend we got it out. + * Wait for rexmit timeout. + */ + tcp->tcp_snxt = tcp->tcp_fss + 1; + TCP_TIMER_RESTART(tcp, tcp->tcp_rto); + } + + /* + * If needed, update tcp_rexmit_snxt as tcp_snxt is + * changed. + */ + if (tcp->tcp_rexmit && tcp->tcp_rexmit_nxt == tcp->tcp_fss) { + tcp->tcp_rexmit_nxt = tcp->tcp_snxt; + } + } else { + /* + * If tcp->tcp_cork is set, then the data will not get sent, + * so we have to check that and unset it first. + */ + if (tcp->tcp_cork) + tcp->tcp_cork = B_FALSE; + tcp_wput_data(tcp, NULL, B_FALSE); + } + + /* + * If TCP does not get enough samples of RTT or tcp_rtt_updates + * is 0, don't update the cache. + */ + if (tcps->tcps_rtt_updates == 0 || + tcp->tcp_rtt_update < tcps->tcps_rtt_updates) + return (0); + + /* + * We do not have a good algorithm to update ssthresh at this time. + * So don't do any update. + */ + bzero(&uinfo, sizeof (uinfo)); + uinfo.iulp_rtt = tcp->tcp_rtt_sa; + uinfo.iulp_rtt_sd = tcp->tcp_rtt_sd; + + /* + * Note that uinfo is kept for conn_faddr in the DCE. Could update even + * if source routed but we don't. + */ + if (connp->conn_ipversion == IPV4_VERSION) { + if (connp->conn_faddr_v4 != tcp->tcp_ipha->ipha_dst) { + return (0); + } + (void) dce_update_uinfo_v4(connp->conn_faddr_v4, &uinfo, ipst); + } else { + uint_t ifindex; + + if (!(IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6, + &tcp->tcp_ip6h->ip6_dst))) { + return (0); + } + ifindex = 0; + if (IN6_IS_ADDR_LINKSCOPE(&connp->conn_faddr_v6)) { + ip_xmit_attr_t *ixa = connp->conn_ixa; + + /* + * If we are going to create a DCE we'd better have + * an ifindex + */ + if (ixa->ixa_nce != NULL) { + ifindex = ixa->ixa_nce->nce_common->ncec_ill-> + ill_phyint->phyint_ifindex; + } else { + return (0); + } + } + + (void) dce_update_uinfo(&connp->conn_faddr_v6, ifindex, &uinfo, + ipst); + } + return (0); +} + +/* + * Send out a control packet on the tcp connection specified. This routine + * is typically called where we need a simple ACK or RST generated. + */ +void +tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq, uint32_t ack, int ctl) +{ + uchar_t *rptr; + tcpha_t *tcpha; + ipha_t *ipha = NULL; + ip6_t *ip6h = NULL; + uint32_t sum; + int total_hdr_len; + int ip_hdr_len; + mblk_t *mp; + tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; + ip_xmit_attr_t *ixa = connp->conn_ixa; + + /* + * Save sum for use in source route later. + */ + sum = connp->conn_ht_ulp_len + connp->conn_sum; + total_hdr_len = connp->conn_ht_iphc_len; + ip_hdr_len = ixa->ixa_ip_hdr_length; + + /* If a text string is passed in with the request, pass it to strlog. 
*/ + if (str != NULL && connp->conn_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, + "tcp_xmit_ctl: '%s', seq 0x%x, ack 0x%x, ctl 0x%x", + str, seq, ack, ctl); + } + mp = allocb(connp->conn_ht_iphc_allocated + tcps->tcps_wroff_xtra, + BPRI_MED); + if (mp == NULL) { + return; + } + rptr = &mp->b_rptr[tcps->tcps_wroff_xtra]; + mp->b_rptr = rptr; + mp->b_wptr = &rptr[total_hdr_len]; + bcopy(connp->conn_ht_iphc, rptr, total_hdr_len); + + ixa->ixa_pktlen = total_hdr_len; + + if (ixa->ixa_flags & IXAF_IS_IPV4) { + ipha = (ipha_t *)rptr; + ipha->ipha_length = htons(total_hdr_len); + } else { + ip6h = (ip6_t *)rptr; + ip6h->ip6_plen = htons(total_hdr_len - IPV6_HDR_LEN); + } + tcpha = (tcpha_t *)&rptr[ip_hdr_len]; + tcpha->tha_flags = (uint8_t)ctl; + if (ctl & TH_RST) { + TCPS_BUMP_MIB(tcps, tcpOutRsts); + TCPS_BUMP_MIB(tcps, tcpOutControl); + /* + * Don't send TSopt w/ TH_RST packets per RFC 1323. + */ + if (tcp->tcp_snd_ts_ok && + tcp->tcp_state > TCPS_SYN_SENT) { + mp->b_wptr = &rptr[total_hdr_len - TCPOPT_REAL_TS_LEN]; + *(mp->b_wptr) = TCPOPT_EOL; + + ixa->ixa_pktlen = total_hdr_len - TCPOPT_REAL_TS_LEN; + + if (connp->conn_ipversion == IPV4_VERSION) { + ipha->ipha_length = htons(total_hdr_len - + TCPOPT_REAL_TS_LEN); + } else { + ip6h->ip6_plen = htons(total_hdr_len - + IPV6_HDR_LEN - TCPOPT_REAL_TS_LEN); + } + tcpha->tha_offset_and_reserved -= (3 << 4); + sum -= TCPOPT_REAL_TS_LEN; + } + } + if (ctl & TH_ACK) { + if (tcp->tcp_snd_ts_ok) { + uint32_t llbolt = (uint32_t)LBOLT_FASTPATH; + + U32_TO_BE32(llbolt, + (char *)tcpha + TCP_MIN_HEADER_LENGTH+4); + U32_TO_BE32(tcp->tcp_ts_recent, + (char *)tcpha + TCP_MIN_HEADER_LENGTH+8); + } + + /* Update the latest receive window size in TCP header. */ + tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); + /* Track what we sent to the peer */ + tcp->tcp_tcpha->tha_win = tcpha->tha_win; + tcp->tcp_rack = ack; + tcp->tcp_rack_cnt = 0; + TCPS_BUMP_MIB(tcps, tcpOutAck); + } + BUMP_LOCAL(tcp->tcp_obsegs); + tcpha->tha_seq = htonl(seq); + tcpha->tha_ack = htonl(ack); + /* + * Include the adjustment for a source route if any. + */ + sum = (sum >> 16) + (sum & 0xFFFF); + tcpha->tha_sum = htons(sum); + tcp_send_data(tcp, mp); +} + +/* + * Generate a reset based on an inbound packet, connp is set by caller + * when RST is in response to an unexpected inbound packet for which + * there is active tcp state in the system. + * + * IPSEC NOTE : Try to send the reply with the same protection as it came + * in. We have the ip_recv_attr_t which is reversed to form the ip_xmit_attr_t. + * That way the packet will go out at the same level of protection as it + * came in with. + */ +static void +tcp_xmit_early_reset(char *str, mblk_t *mp, uint32_t seq, uint32_t ack, int ctl, + ip_recv_attr_t *ira, ip_stack_t *ipst, conn_t *connp) +{ + ipha_t *ipha = NULL; + ip6_t *ip6h = NULL; + ushort_t len; + tcpha_t *tcpha; + int i; + ipaddr_t v4addr; + in6_addr_t v6addr; + netstack_t *ns = ipst->ips_netstack; + tcp_stack_t *tcps = ns->netstack_tcp; + ip_xmit_attr_t ixas, *ixa; + uint_t ip_hdr_len = ira->ira_ip_hdr_length; + boolean_t need_refrele = B_FALSE; /* ixa_refrele(ixa) */ + ushort_t port; + + if (!tcp_send_rst_chk(tcps)) { + TCP_STAT(tcps, tcp_rst_unsent); + freemsg(mp); + return; + } + + /* + * If connp != NULL we use conn_ixa to keep IP_NEXTHOP and other + * options from the listener. In that case the caller must ensure that + * we are running on the listener = connp squeue. 
+ * + * We get a safe copy of conn_ixa so we don't need to restore anything + * we or ip_output_simple might change in the ixa. + */ + if (connp != NULL) { + ASSERT(connp->conn_on_sqp); + + ixa = conn_get_ixa_exclusive(connp); + if (ixa == NULL) { + TCP_STAT(tcps, tcp_rst_unsent); + freemsg(mp); + return; + } + need_refrele = B_TRUE; + } else { + bzero(&ixas, sizeof (ixas)); + ixa = &ixas; + /* + * IXAF_VERIFY_SOURCE is overkill since we know the + * packet was for us. + */ + ixa->ixa_flags |= IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE; + ixa->ixa_protocol = IPPROTO_TCP; + ixa->ixa_zoneid = ira->ira_zoneid; + ixa->ixa_ifindex = 0; + ixa->ixa_ipst = ipst; + ixa->ixa_cred = kcred; + ixa->ixa_cpid = NOPID; + } + + if (str && tcps->tcps_dbg) { + (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE, + "tcp_xmit_early_reset: '%s', seq 0x%x, ack 0x%x, " + "flags 0x%x", + str, seq, ack, ctl); + } + if (mp->b_datap->db_ref != 1) { + mblk_t *mp1 = copyb(mp); + freemsg(mp); + mp = mp1; + if (mp == NULL) + goto done; + } else if (mp->b_cont) { + freemsg(mp->b_cont); + mp->b_cont = NULL; + DB_CKSUMFLAGS(mp) = 0; + } + /* + * We skip reversing source route here. + * (for now we replace all IP options with EOL) + */ + if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { + ipha = (ipha_t *)mp->b_rptr; + for (i = IP_SIMPLE_HDR_LENGTH; i < (int)ip_hdr_len; i++) + mp->b_rptr[i] = IPOPT_EOL; + /* + * Make sure that src address isn't flagrantly invalid. + * Not all broadcast address checking for the src address + * is possible, since we don't know the netmask of the src + * addr. No check for destination address is done, since + * IP will not pass up a packet with a broadcast dest + * address to TCP. Similar checks are done below for IPv6. + */ + if (ipha->ipha_src == 0 || ipha->ipha_src == INADDR_BROADCAST || + CLASSD(ipha->ipha_src)) { + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, NULL); + freemsg(mp); + goto done; + } + } else { + ip6h = (ip6_t *)mp->b_rptr; + + if (IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) || + IN6_IS_ADDR_MULTICAST(&ip6h->ip6_src)) { + BUMP_MIB(&ipst->ips_ip6_mib, ipIfStatsInDiscards); + ip_drop_input("ipIfStatsInDiscards", mp, NULL); + freemsg(mp); + goto done; + } + + /* Remove any extension headers assuming partial overlay */ + if (ip_hdr_len > IPV6_HDR_LEN) { + uint8_t *to; + + to = mp->b_rptr + ip_hdr_len - IPV6_HDR_LEN; + ovbcopy(ip6h, to, IPV6_HDR_LEN); + mp->b_rptr += ip_hdr_len - IPV6_HDR_LEN; + ip_hdr_len = IPV6_HDR_LEN; + ip6h = (ip6_t *)mp->b_rptr; + ip6h->ip6_nxt = IPPROTO_TCP; + } + } + tcpha = (tcpha_t *)&mp->b_rptr[ip_hdr_len]; + if (tcpha->tha_flags & TH_RST) { + freemsg(mp); + goto done; + } + tcpha->tha_offset_and_reserved = (5 << 4); + len = ip_hdr_len + sizeof (tcpha_t); + mp->b_wptr = &mp->b_rptr[len]; + if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { + ipha->ipha_length = htons(len); + /* Swap addresses */ + v4addr = ipha->ipha_src; + ipha->ipha_src = ipha->ipha_dst; + ipha->ipha_dst = v4addr; + ipha->ipha_ident = 0; + ipha->ipha_ttl = (uchar_t)tcps->tcps_ipv4_ttl; + ixa->ixa_flags |= IXAF_IS_IPV4; + ixa->ixa_ip_hdr_length = ip_hdr_len; + } else { + ip6h->ip6_plen = htons(len - IPV6_HDR_LEN); + /* Swap addresses */ + v6addr = ip6h->ip6_src; + ip6h->ip6_src = ip6h->ip6_dst; + ip6h->ip6_dst = v6addr; + ip6h->ip6_hops = (uchar_t)tcps->tcps_ipv6_hoplimit; + ixa->ixa_flags &= ~IXAF_IS_IPV4; + + if (IN6_IS_ADDR_LINKSCOPE(&ip6h->ip6_dst)) { + ixa->ixa_flags |= IXAF_SCOPEID_SET; + ixa->ixa_scopeid = ira->ira_ruifindex; + } + 
ixa->ixa_ip_hdr_length = IPV6_HDR_LEN; + } + ixa->ixa_pktlen = len; + + /* Swap the ports */ + port = tcpha->tha_fport; + tcpha->tha_fport = tcpha->tha_lport; + tcpha->tha_lport = port; + + tcpha->tha_ack = htonl(ack); + tcpha->tha_seq = htonl(seq); + tcpha->tha_win = 0; + tcpha->tha_sum = htons(sizeof (tcpha_t)); + tcpha->tha_flags = (uint8_t)ctl; + if (ctl & TH_RST) { + TCPS_BUMP_MIB(tcps, tcpOutRsts); + TCPS_BUMP_MIB(tcps, tcpOutControl); + } + + /* Discard any old label */ + if (ixa->ixa_free_flags & IXA_FREE_TSL) { + ASSERT(ixa->ixa_tsl != NULL); + label_rele(ixa->ixa_tsl); + ixa->ixa_free_flags &= ~IXA_FREE_TSL; + } + ixa->ixa_tsl = ira->ira_tsl; /* Behave as a multi-level responder */ + + if (ira->ira_flags & IRAF_IPSEC_SECURE) { + /* + * Apply IPsec based on how IPsec was applied to + * the packet that caused the RST. + */ + if (!ipsec_in_to_out(ira, ixa, mp, ipha, ip6h)) { + BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); + /* Note: mp already consumed and ip_drop_packet done */ + goto done; + } + } else { + /* + * This is in clear. The RST message we are building + * here should go out in clear, independent of our policy. + */ + ixa->ixa_flags |= IXAF_NO_IPSEC; + } + + /* + * NOTE: one might consider tracing a TCP packet here, but + * this function has no active TCP state and no tcp structure + * that has a trace buffer. If we traced here, we would have + * to keep a local trace buffer in tcp_record_trace(). + */ + + (void) ip_output_simple(mp, ixa); +done: + ixa_cleanup(ixa); + if (need_refrele) { + ASSERT(ixa != &ixas); + ixa_refrele(ixa); + } +} + +/* + * Generate a "no listener here" RST in response to an "unknown" segment. + * connp is set by caller when RST is in response to an unexpected + * inbound packet for which there is active tcp state in the system. + * Note that we are reusing the incoming mp to construct the outgoing RST. + */ +void +tcp_xmit_listeners_reset(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst, + conn_t *connp) +{ + uchar_t *rptr; + uint32_t seg_len; + tcpha_t *tcpha; + uint32_t seg_seq; + uint32_t seg_ack; + uint_t flags; + ipha_t *ipha; + ip6_t *ip6h; + boolean_t policy_present; + netstack_t *ns = ipst->ips_netstack; + tcp_stack_t *tcps = ns->netstack_tcp; + ipsec_stack_t *ipss = tcps->tcps_netstack->netstack_ipsec; + uint_t ip_hdr_len = ira->ira_ip_hdr_length; + + TCP_STAT(tcps, tcp_no_listener); + + if (IPH_HDR_VERSION(mp->b_rptr) == IPV4_VERSION) { + policy_present = ipss->ipsec_inbound_v4_policy_present; + ipha = (ipha_t *)mp->b_rptr; + ip6h = NULL; + } else { + policy_present = ipss->ipsec_inbound_v6_policy_present; + ipha = NULL; + ip6h = (ip6_t *)mp->b_rptr; + } + + if (policy_present) { + /* + * The conn_t parameter is NULL because we already know + * nobody's home. 
+ */ + mp = ipsec_check_global_policy(mp, (conn_t *)NULL, ipha, ip6h, + ira, ns); + if (mp == NULL) + return; + } + if (is_system_labeled() && !tsol_can_reply_error(mp, ira)) { + DTRACE_PROBE2( + tx__ip__log__error__nolistener__tcp, + char *, "Could not reply with RST to mp(1)", + mblk_t *, mp); + ip2dbg(("tcp_xmit_listeners_reset: not permitted to reply\n")); + freemsg(mp); + return; + } + + rptr = mp->b_rptr; + + tcpha = (tcpha_t *)&rptr[ip_hdr_len]; + seg_seq = ntohl(tcpha->tha_seq); + seg_ack = ntohl(tcpha->tha_ack); + flags = tcpha->tha_flags; + + seg_len = msgdsize(mp) - (TCP_HDR_LENGTH(tcpha) + ip_hdr_len); + if (flags & TH_RST) { + freemsg(mp); + } else if (flags & TH_ACK) { + tcp_xmit_early_reset("no tcp, reset", mp, seg_ack, 0, TH_RST, + ira, ipst, connp); + } else { + if (flags & TH_SYN) { + seg_len++; + } else { + /* + * Here we violate the RFC. Note that a normal + * TCP will never send a segment without the ACK + * flag, except for RST or SYN segment. This + * segment is neither. Just drop it on the + * floor. + */ + freemsg(mp); + TCP_STAT(tcps, tcp_rst_unsent); + return; + } + + tcp_xmit_early_reset("no tcp, reset/ack", mp, 0, + seg_seq + seg_len, TH_RST | TH_ACK, ira, ipst, connp); + } +} + +/* + * tcp_xmit_mp is called to return a pointer to an mblk chain complete with + * ip and tcp header ready to pass down to IP. If the mp passed in is + * non-NULL, then up to max_to_send bytes of data will be dup'ed off that + * mblk. (If sendall is not set the dup'ing will stop at an mblk boundary + * otherwise it will dup partial mblks.) + * Otherwise, an appropriate ACK packet will be generated. This + * routine is not usually called to send new data for the first time. It + * is mostly called out of the timer for retransmits, and to generate ACKs. + * + * If offset is not NULL, the returned mblk chain's first mblk's b_rptr will + * be adjusted by *offset. And after dupb(), the offset and the ending mblk + * of the original mblk chain will be returned in *offset and *end_mp. + */ +mblk_t * +tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset, + mblk_t **end_mp, uint32_t seq, boolean_t sendall, uint32_t *seg_len, + boolean_t rexmit) +{ + int data_length; + int32_t off = 0; + uint_t flags; + mblk_t *mp1; + mblk_t *mp2; + uchar_t *rptr; + tcpha_t *tcpha; + int32_t num_sack_blk = 0; + int32_t sack_opt_len = 0; + tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; + ip_xmit_attr_t *ixa = connp->conn_ixa; + + /* Allocate for our maximum TCP header + link-level */ + mp1 = allocb(connp->conn_ht_iphc_allocated + tcps->tcps_wroff_xtra, + BPRI_MED); + if (!mp1) + return (NULL); + data_length = 0; + + /* + * Note that tcp_mss has been adjusted to take into account the + * timestamp option if applicable. Because SACK options do not + * appear in every TCP segments and they are of variable lengths, + * they cannot be included in tcp_mss. Thus we need to calculate + * the actual segment length when we need to send a segment which + * includes SACK options. + */ + if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { + num_sack_blk = MIN(tcp->tcp_max_sack_blk, + tcp->tcp_num_sack_blk); + sack_opt_len = num_sack_blk * sizeof (sack_blk_t) + + TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN; + if (max_to_send + sack_opt_len > tcp->tcp_mss) + max_to_send -= sack_opt_len; + } + + if (offset != NULL) { + off = *offset; + /* We use offset as an indicator that end_mp is not NULL. 
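
[Editor's aside, not part of the commit: the SACK option sizing computed just above in tcp_xmit_mp() (two NOPs for alignment, a two-byte kind/length header, then eight bytes per block) works out as in this small sketch; the constants use the conventional TCP option sizes and are written out here only for illustration:]

#include <stdio.h>

#define	TCPOPT_NOP_LEN		1
#define	TCPOPT_HEADER_LEN	2
#define	SACK_BLK_LEN		8	/* two 32-bit sequence numbers */

int
main(void)
{
	int num_sack_blk;

	for (num_sack_blk = 1; num_sack_blk <= 4; num_sack_blk++) {
		int sack_opt_len = num_sack_blk * SACK_BLK_LEN +
		    TCPOPT_NOP_LEN * 2 + TCPOPT_HEADER_LEN;
		printf("%d SACK block(s) -> %d option bytes\n",
		    num_sack_blk, sack_opt_len);	/* 12, 20, 28, 36 */
	}
	return (0);
}
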
*/ + *end_mp = NULL; + } + for (mp2 = mp1; mp && data_length != max_to_send; mp = mp->b_cont) { + /* This could be faster with cooperation from downstream */ + if (mp2 != mp1 && !sendall && + data_length + (int)(mp->b_wptr - mp->b_rptr) > + max_to_send) + /* + * Don't send the next mblk since the whole mblk + * does not fit. + */ + break; + mp2->b_cont = dupb(mp); + mp2 = mp2->b_cont; + if (!mp2) { + freemsg(mp1); + return (NULL); + } + mp2->b_rptr += off; + ASSERT((uintptr_t)(mp2->b_wptr - mp2->b_rptr) <= + (uintptr_t)INT_MAX); + + data_length += (int)(mp2->b_wptr - mp2->b_rptr); + if (data_length > max_to_send) { + mp2->b_wptr -= data_length - max_to_send; + data_length = max_to_send; + off = mp2->b_wptr - mp->b_rptr; + break; + } else { + off = 0; + } + } + if (offset != NULL) { + *offset = off; + *end_mp = mp; + } + if (seg_len != NULL) { + *seg_len = data_length; + } + + /* Update the latest receive window size in TCP header. */ + tcp->tcp_tcpha->tha_win = htons(tcp->tcp_rwnd >> tcp->tcp_rcv_ws); + + rptr = mp1->b_rptr + tcps->tcps_wroff_xtra; + mp1->b_rptr = rptr; + mp1->b_wptr = rptr + connp->conn_ht_iphc_len + sack_opt_len; + bcopy(connp->conn_ht_iphc, rptr, connp->conn_ht_iphc_len); + tcpha = (tcpha_t *)&rptr[ixa->ixa_ip_hdr_length]; + tcpha->tha_seq = htonl(seq); + + /* + * Use tcp_unsent to determine if the PUSH bit should be used assumes + * that this function was called from tcp_wput_data. Thus, when called + * to retransmit data the setting of the PUSH bit may appear some + * what random in that it might get set when it should not. This + * should not pose any performance issues. + */ + if (data_length != 0 && (tcp->tcp_unsent == 0 || + tcp->tcp_unsent == data_length)) { + flags = TH_ACK | TH_PUSH; + } else { + flags = TH_ACK; + } + + if (tcp->tcp_ecn_ok) { + if (tcp->tcp_ecn_echo_on) + flags |= TH_ECE; + + /* + * Only set ECT bit and ECN_CWR if a segment contains new data. + * There is no TCP flow control for non-data segments, and + * only data segment is transmitted reliably. + */ + if (data_length > 0 && !rexmit) { + TCP_SET_ECT(tcp, rptr); + if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) { + flags |= TH_CWR; + tcp->tcp_ecn_cwr_sent = B_TRUE; + } + } + } + + if (tcp->tcp_valid_bits) { + uint32_t u1; + + if ((tcp->tcp_valid_bits & TCP_ISS_VALID) && + seq == tcp->tcp_iss) { + uchar_t *wptr; + + /* + * If TCP_ISS_VALID and the seq number is tcp_iss, + * TCP can only be in SYN-SENT, SYN-RCVD or + * FIN-WAIT-1 state. It can be FIN-WAIT-1 if + * our SYN is not ack'ed but the app closes this + * TCP connection. + */ + ASSERT(tcp->tcp_state == TCPS_SYN_SENT || + tcp->tcp_state == TCPS_SYN_RCVD || + tcp->tcp_state == TCPS_FIN_WAIT_1); + + /* + * Tack on the MSS option. It is always needed + * for both active and passive open. + * + * MSS option value should be interface MTU - MIN + * TCP/IP header according to RFC 793 as it means + * the maximum segment size TCP can receive. But + * to get around some broken middle boxes/end hosts + * out there, we allow the option value to be the + * same as the MSS option size on the peer side. + * In this way, the other side will not send + * anything larger than they can receive. + * + * Note that for SYN_SENT state, the ndd param + * tcp_use_smss_as_mss_opt has no effect as we + * don't know the peer's MSS option value. So + * the only case we need to take care of is in + * SYN_RCVD state, which is done later. 
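
[Editor's aside, not part of the commit: the MSS option value described above boils down to the path MTU minus the minimum IP and TCP headers, so a plain IPv4 Ethernet path gives 1500 - 20 - 20 = 1460 and IPv6 gives 1500 - 40 - 20 = 1440. A quick stand-alone check; the helper name and the fixed MTU are illustrative, not the kernel's tcp_initial_pmtu handling:]

#include <stdio.h>

#define	IP_SIMPLE_HDR_LENGTH	20
#define	IPV6_HDR_LEN		40
#define	TCP_MIN_HEADER_LENGTH	20

static int
mss_option(int pmtu, int is_ipv4)
{
	return (pmtu - (is_ipv4 ? IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) -
	    TCP_MIN_HEADER_LENGTH);
}

int
main(void)
{
	printf("IPv4, MTU 1500: MSS option %d\n", mss_option(1500, 1)); /* 1460 */
	printf("IPv6, MTU 1500: MSS option %d\n", mss_option(1500, 0)); /* 1440 */
	return (0);
}
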
+ */ + wptr = mp1->b_wptr; + wptr[0] = TCPOPT_MAXSEG; + wptr[1] = TCPOPT_MAXSEG_LEN; + wptr += 2; + u1 = tcp->tcp_initial_pmtu - + (connp->conn_ipversion == IPV4_VERSION ? + IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) - + TCP_MIN_HEADER_LENGTH; + U16_TO_BE16(u1, wptr); + mp1->b_wptr = wptr + 2; + /* Update the offset to cover the additional word */ + tcpha->tha_offset_and_reserved += (1 << 4); + + /* + * Note that the following way of filling in + * TCP options are not optimal. Some NOPs can + * be saved. But there is no need at this time + * to optimize it. When it is needed, we will + * do it. + */ + switch (tcp->tcp_state) { + case TCPS_SYN_SENT: + flags = TH_SYN; + + if (tcp->tcp_snd_ts_ok) { + uint32_t llbolt = + (uint32_t)LBOLT_FASTPATH; + + wptr = mp1->b_wptr; + wptr[0] = TCPOPT_NOP; + wptr[1] = TCPOPT_NOP; + wptr[2] = TCPOPT_TSTAMP; + wptr[3] = TCPOPT_TSTAMP_LEN; + wptr += 4; + U32_TO_BE32(llbolt, wptr); + wptr += 4; + ASSERT(tcp->tcp_ts_recent == 0); + U32_TO_BE32(0L, wptr); + mp1->b_wptr += TCPOPT_REAL_TS_LEN; + tcpha->tha_offset_and_reserved += + (3 << 4); + } + + /* + * Set up all the bits to tell other side + * we are ECN capable. + */ + if (tcp->tcp_ecn_ok) { + flags |= (TH_ECE | TH_CWR); + } + break; + case TCPS_SYN_RCVD: + flags |= TH_SYN; + + /* + * Reset the MSS option value to be SMSS + * We should probably add back the bytes + * for timestamp option and IPsec. We + * don't do that as this is a workaround + * for broken middle boxes/end hosts, it + * is better for us to be more cautious. + * They may not take these things into + * account in their SMSS calculation. Thus + * the peer's calculated SMSS may be smaller + * than what it can be. This should be OK. + */ + if (tcps->tcps_use_smss_as_mss_opt) { + u1 = tcp->tcp_mss; + U16_TO_BE16(u1, wptr); + } + + /* + * If the other side is ECN capable, reply + * that we are also ECN capable. + */ + if (tcp->tcp_ecn_ok) + flags |= TH_ECE; + break; + default: + /* + * The above ASSERT() makes sure that this + * must be FIN-WAIT-1 state. Our SYN has + * not been ack'ed so retransmit it. + */ + flags |= TH_SYN; + break; + } + + if (tcp->tcp_snd_ws_ok) { + wptr = mp1->b_wptr; + wptr[0] = TCPOPT_NOP; + wptr[1] = TCPOPT_WSCALE; + wptr[2] = TCPOPT_WS_LEN; + wptr[3] = (uchar_t)tcp->tcp_rcv_ws; + mp1->b_wptr += TCPOPT_REAL_WS_LEN; + tcpha->tha_offset_and_reserved += (1 << 4); + } + + if (tcp->tcp_snd_sack_ok) { + wptr = mp1->b_wptr; + wptr[0] = TCPOPT_NOP; + wptr[1] = TCPOPT_NOP; + wptr[2] = TCPOPT_SACK_PERMITTED; + wptr[3] = TCPOPT_SACK_OK_LEN; + mp1->b_wptr += TCPOPT_REAL_SACK_OK_LEN; + tcpha->tha_offset_and_reserved += (1 << 4); + } + + /* allocb() of adequate mblk assures space */ + ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <= + (uintptr_t)INT_MAX); + u1 = (int)(mp1->b_wptr - mp1->b_rptr); + /* + * Get IP set to checksum on our behalf + * Include the adjustment for a source route if any. 
+ */ + u1 += connp->conn_sum; + u1 = (u1 >> 16) + (u1 & 0xFFFF); + tcpha->tha_sum = htons(u1); + TCPS_BUMP_MIB(tcps, tcpOutControl); + } + if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && + (seq + data_length) == tcp->tcp_fss) { + if (!tcp->tcp_fin_acked) { + flags |= TH_FIN; + TCPS_BUMP_MIB(tcps, tcpOutControl); + } + if (!tcp->tcp_fin_sent) { + tcp->tcp_fin_sent = B_TRUE; + switch (tcp->tcp_state) { + case TCPS_SYN_RCVD: + case TCPS_ESTABLISHED: + tcp->tcp_state = TCPS_FIN_WAIT_1; + break; + case TCPS_CLOSE_WAIT: + tcp->tcp_state = TCPS_LAST_ACK; + break; + } + if (tcp->tcp_suna == tcp->tcp_snxt) + TCP_TIMER_RESTART(tcp, tcp->tcp_rto); + tcp->tcp_snxt = tcp->tcp_fss + 1; + } + } + /* + * Note the trick here. u1 is unsigned. When tcp_urg + * is smaller than seq, u1 will become a very huge value. + * So the comparison will fail. Also note that tcp_urp + * should be positive, see RFC 793 page 17. + */ + u1 = tcp->tcp_urg - seq + TCP_OLD_URP_INTERPRETATION; + if ((tcp->tcp_valid_bits & TCP_URG_VALID) && u1 != 0 && + u1 < (uint32_t)(64 * 1024)) { + flags |= TH_URG; + TCPS_BUMP_MIB(tcps, tcpOutUrg); + tcpha->tha_urp = htons(u1); + } + } + tcpha->tha_flags = (uchar_t)flags; + tcp->tcp_rack = tcp->tcp_rnxt; + tcp->tcp_rack_cnt = 0; + + if (tcp->tcp_snd_ts_ok) { + if (tcp->tcp_state != TCPS_SYN_SENT) { + uint32_t llbolt = (uint32_t)LBOLT_FASTPATH; + + U32_TO_BE32(llbolt, + (char *)tcpha + TCP_MIN_HEADER_LENGTH+4); + U32_TO_BE32(tcp->tcp_ts_recent, + (char *)tcpha + TCP_MIN_HEADER_LENGTH+8); + } + } + + if (num_sack_blk > 0) { + uchar_t *wptr = (uchar_t *)tcpha + connp->conn_ht_ulp_len; + sack_blk_t *tmp; + int32_t i; + + wptr[0] = TCPOPT_NOP; + wptr[1] = TCPOPT_NOP; + wptr[2] = TCPOPT_SACK; + wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk * + sizeof (sack_blk_t); + wptr += TCPOPT_REAL_SACK_LEN; + + tmp = tcp->tcp_sack_list; + for (i = 0; i < num_sack_blk; i++) { + U32_TO_BE32(tmp[i].begin, wptr); + wptr += sizeof (tcp_seq); + U32_TO_BE32(tmp[i].end, wptr); + wptr += sizeof (tcp_seq); + } + tcpha->tha_offset_and_reserved += ((num_sack_blk * 2 + 1) << 4); + } + ASSERT((uintptr_t)(mp1->b_wptr - rptr) <= (uintptr_t)INT_MAX); + data_length += (int)(mp1->b_wptr - rptr); + + ixa->ixa_pktlen = data_length; + + if (ixa->ixa_flags & IXAF_IS_IPV4) { + ((ipha_t *)rptr)->ipha_length = htons(data_length); + } else { + ip6_t *ip6 = (ip6_t *)rptr; + + ip6->ip6_plen = htons(data_length - IPV6_HDR_LEN); + } + + /* + * Prime pump for IP + * Include the adjustment for a source route if any. + */ + data_length -= ixa->ixa_ip_hdr_length; + data_length += connp->conn_sum; + data_length = (data_length >> 16) + (data_length & 0xFFFF); + tcpha->tha_sum = htons(data_length); + if (tcp->tcp_ip_forward_progress) { + tcp->tcp_ip_forward_progress = B_FALSE; + connp->conn_ixa->ixa_flags |= IXAF_REACH_CONF; + } else { + connp->conn_ixa->ixa_flags &= ~IXAF_REACH_CONF; + } + return (mp1); +} + +/* + * If this routine returns B_TRUE, TCP can generate a RST in response + * to a segment. If it returns B_FALSE, TCP should not respond. + */ +static boolean_t +tcp_send_rst_chk(tcp_stack_t *tcps) +{ + int64_t now; + + /* + * TCP needs to protect itself from generating too many RSTs. + * This can be a DoS attack by sending us random segments + * soliciting RSTs. + * + * What we do here is to have a limit of tcp_rst_sent_rate RSTs + * in each 1 second interval. In this way, TCP still generate + * RSTs in normal cases but when under attack, the impact is + * limited. 
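
[Editor's aside, not part of the commit: the rate-limiting policy described above for tcp_send_rst_chk() allows at most tcps_rst_sent_rate RSTs per one-second interval and silently drops the rest until the next interval starts. This stand-alone sketch mirrors that logic with a millisecond clock and an illustrative limit instead of the kernel's lbolt-based bookkeeping:]

#include <stdio.h>
#include <stdint.h>

#define	RST_RATE	10	/* illustrative limit, not the kernel default */

static int64_t last_intrvl;	/* start of the current 1-second window (ms) */
static uint32_t rst_cnt;

/* Return 1 if a RST may be sent at time now_ms, 0 if it must be dropped. */
static int
send_rst_chk(int64_t now_ms)
{
	if (now_ms - last_intrvl > 1000) {
		last_intrvl = now_ms;
		rst_cnt = 1;
	} else if (++rst_cnt > RST_RATE) {
		return (0);
	}
	return (1);
}

int
main(void)
{
	int64_t t;
	int sent = 0;

	/* 50 solicitations within one second: only RST_RATE get through. */
	for (t = 0; t < 1000; t += 20)
		sent += send_rst_chk(t);
	printf("sent %d of 50 RSTs\n", sent);	/* 10 */
	return (0);
}
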
+ */ + if (tcps->tcps_rst_sent_rate_enabled != 0) { + now = ddi_get_lbolt64(); + if (TICK_TO_MSEC(now - tcps->tcps_last_rst_intrvl) > + 1*SECONDS) { + tcps->tcps_last_rst_intrvl = now; + tcps->tcps_rst_cnt = 1; + } else if (++tcps->tcps_rst_cnt > tcps->tcps_rst_sent_rate) { + return (B_FALSE); + } + } + return (B_TRUE); +} + +/* + * This function handles all retransmissions if SACK is enabled for this + * connection. First it calculates how many segments can be retransmitted + * based on tcp_pipe. Then it goes thru the notsack list to find eligible + * segments. A segment is eligible if sack_cnt for that segment is greater + * than or equal tcp_dupack_fast_retransmit. After it has retransmitted + * all eligible segments, it checks to see if TCP can send some new segments + * (fast recovery). If it can, set the appropriate flag for tcp_input_data(). + * + * Parameters: + * tcp_t *tcp: the tcp structure of the connection. + * uint_t *flags: in return, appropriate value will be set for + * tcp_input_data(). + */ +void +tcp_sack_rexmit(tcp_t *tcp, uint_t *flags) +{ + notsack_blk_t *notsack_blk; + int32_t usable_swnd; + int32_t mss; + uint32_t seg_len; + mblk_t *xmit_mp; + tcp_stack_t *tcps = tcp->tcp_tcps; + + ASSERT(tcp->tcp_sack_info != NULL); + ASSERT(tcp->tcp_notsack_list != NULL); + ASSERT(tcp->tcp_rexmit == B_FALSE); + + /* Defensive coding in case there is a bug... */ + if (tcp->tcp_notsack_list == NULL) { + return; + } + notsack_blk = tcp->tcp_notsack_list; + mss = tcp->tcp_mss; + + /* + * Limit the num of outstanding data in the network to be + * tcp_cwnd_ssthresh, which is half of the original congestion wnd. + */ + usable_swnd = tcp->tcp_cwnd_ssthresh - tcp->tcp_pipe; + + /* At least retransmit 1 MSS of data. */ + if (usable_swnd <= 0) { + usable_swnd = mss; + } + + /* Make sure no new RTT samples will be taken. */ + tcp->tcp_csuna = tcp->tcp_snxt; + + notsack_blk = tcp->tcp_notsack_list; + while (usable_swnd > 0) { + mblk_t *snxt_mp, *tmp_mp; + tcp_seq begin = tcp->tcp_sack_snxt; + tcp_seq end; + int32_t off; + + for (; notsack_blk != NULL; notsack_blk = notsack_blk->next) { + if (SEQ_GT(notsack_blk->end, begin) && + (notsack_blk->sack_cnt >= + tcps->tcps_dupack_fast_retransmit)) { + end = notsack_blk->end; + if (SEQ_LT(begin, notsack_blk->begin)) { + begin = notsack_blk->begin; + } + break; + } + } + /* + * All holes are filled. Manipulate tcp_cwnd to send more + * if we can. Note that after the SACK recovery, tcp_cwnd is + * set to tcp_cwnd_ssthresh. + */ + if (notsack_blk == NULL) { + usable_swnd = tcp->tcp_cwnd_ssthresh - tcp->tcp_pipe; + if (usable_swnd <= 0 || tcp->tcp_unsent == 0) { + tcp->tcp_cwnd = tcp->tcp_snxt - tcp->tcp_suna; + ASSERT(tcp->tcp_cwnd > 0); + return; + } else { + usable_swnd = usable_swnd / mss; + tcp->tcp_cwnd = tcp->tcp_snxt - tcp->tcp_suna + + MAX(usable_swnd * mss, mss); + *flags |= TH_XMIT_NEEDED; + return; + } + } + + /* + * Note that we may send more than usable_swnd allows here + * because of round off, but no more than 1 MSS of data. + */ + seg_len = end - begin; + if (seg_len > mss) + seg_len = mss; + snxt_mp = tcp_get_seg_mp(tcp, begin, &off); + ASSERT(snxt_mp != NULL); + /* This should not happen. Defensive coding again... 
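
[Editor's aside, not part of the commit: the eligibility rule used by tcp_sack_rexmit() above retransmits a not-yet-SACKed hole once it has been reported missing at least tcps_dupack_fast_retransmit times. A reduced model of that scan follows; the list layout is simplified and sequence-number wraparound is deliberately ignored here:]

#include <stdio.h>
#include <stdint.h>

#define	DUPACK_FAST_RETRANSMIT	3	/* conventional threshold */

typedef struct hole {
	uint32_t begin;
	uint32_t end;
	int sack_cnt;		/* times this hole was reported missing */
	struct hole *next;
} hole_t;

/* Find the first eligible hole ending after snxt, or NULL if none. */
static hole_t *
first_eligible(hole_t *list, uint32_t snxt)
{
	hole_t *h;

	for (h = list; h != NULL; h = h->next) {
		if (h->end > snxt && h->sack_cnt >= DUPACK_FAST_RETRANSMIT)
			return (h);
	}
	return (NULL);
}

int
main(void)
{
	hole_t h2 = { 3000, 4000, 3, NULL };	/* eligible */
	hole_t h1 = { 1000, 2000, 1, &h2 };	/* not yet eligible */
	hole_t *h = first_eligible(&h1, 0);

	if (h != NULL)
		printf("retransmit [%u, %u)\n", h->begin, h->end);
	return (0);
}
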
*/ + if (snxt_mp == NULL) { + return; + } + + xmit_mp = tcp_xmit_mp(tcp, snxt_mp, seg_len, &off, + &tmp_mp, begin, B_TRUE, &seg_len, B_TRUE); + if (xmit_mp == NULL) + return; + + usable_swnd -= seg_len; + tcp->tcp_pipe += seg_len; + tcp->tcp_sack_snxt = begin + seg_len; + + tcp_send_data(tcp, xmit_mp); + + /* + * Update the send timestamp to avoid false retransmission. + */ + snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt(); + + TCPS_BUMP_MIB(tcps, tcpRetransSegs); + TCPS_UPDATE_MIB(tcps, tcpRetransBytes, seg_len); + TCPS_BUMP_MIB(tcps, tcpOutSackRetransSegs); + /* + * Update tcp_rexmit_max to extend this SACK recovery phase. + * This happens when new data sent during fast recovery is + * also lost. If TCP retransmits those new data, it needs + * to extend SACK recover phase to avoid starting another + * fast retransmit/recovery unnecessarily. + */ + if (SEQ_GT(tcp->tcp_sack_snxt, tcp->tcp_rexmit_max)) { + tcp->tcp_rexmit_max = tcp->tcp_sack_snxt; + } + } +} + +/* + * tcp_ss_rexmit() is called to do slow start retransmission after a timeout + * or ICMP errors. + * + * To limit the number of duplicate segments, we limit the number of segment + * to be sent in one time to tcp_snd_burst, the burst variable. + */ +void +tcp_ss_rexmit(tcp_t *tcp) +{ + uint32_t snxt; + uint32_t smax; + int32_t win; + int32_t mss; + int32_t off; + int32_t burst = tcp->tcp_snd_burst; + mblk_t *snxt_mp; + tcp_stack_t *tcps = tcp->tcp_tcps; + + /* + * Note that tcp_rexmit can be set even though TCP has retransmitted + * all unack'ed segments. + */ + if (SEQ_LT(tcp->tcp_rexmit_nxt, tcp->tcp_rexmit_max)) { + smax = tcp->tcp_rexmit_max; + snxt = tcp->tcp_rexmit_nxt; + if (SEQ_LT(snxt, tcp->tcp_suna)) { + snxt = tcp->tcp_suna; + } + win = MIN(tcp->tcp_cwnd, tcp->tcp_swnd); + win -= snxt - tcp->tcp_suna; + mss = tcp->tcp_mss; + snxt_mp = tcp_get_seg_mp(tcp, snxt, &off); + + while (SEQ_LT(snxt, smax) && (win > 0) && + (burst > 0) && (snxt_mp != NULL)) { + mblk_t *xmit_mp; + mblk_t *old_snxt_mp = snxt_mp; + uint32_t cnt = mss; + + if (win < cnt) { + cnt = win; + } + if (SEQ_GT(snxt + cnt, smax)) { + cnt = smax - snxt; + } + xmit_mp = tcp_xmit_mp(tcp, snxt_mp, cnt, &off, + &snxt_mp, snxt, B_TRUE, &cnt, B_TRUE); + if (xmit_mp == NULL) + return; + + tcp_send_data(tcp, xmit_mp); + + snxt += cnt; + win -= cnt; + /* + * Update the send timestamp to avoid false + * retransmission. + */ + old_snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt(); + TCPS_BUMP_MIB(tcps, tcpRetransSegs); + TCPS_UPDATE_MIB(tcps, tcpRetransBytes, cnt); + + tcp->tcp_rexmit_nxt = snxt; + burst--; + } + /* + * If we have transmitted all we have at the time + * we started the retranmission, we can leave + * the rest of the job to tcp_wput_data(). But we + * need to check the send window first. If the + * win is not 0, go on with tcp_wput_data(). + */ + if (SEQ_LT(snxt, smax) || win == 0) { + return; + } + } + /* Only call tcp_wput_data() if there is data to be sent. */ + if (tcp->tcp_unsent) { + tcp_wput_data(tcp, NULL, B_FALSE); + } +} + +/* + * Do slow start retransmission after ICMP errors of PMTU changes. + */ +void +tcp_rexmit_after_error(tcp_t *tcp) +{ + /* + * All sent data has been acknowledged or no data left to send, just + * to return. 
+ */
+	if (!SEQ_LT(tcp->tcp_suna, tcp->tcp_snxt) ||
+	    (tcp->tcp_xmit_head == NULL))
+		return;
+
+	if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && (tcp->tcp_unsent == 0))
+		tcp->tcp_rexmit_max = tcp->tcp_fss;
+	else
+		tcp->tcp_rexmit_max = tcp->tcp_snxt;
+
+	tcp->tcp_rexmit_nxt = tcp->tcp_suna;
+	tcp->tcp_rexmit = B_TRUE;
+	tcp->tcp_dupack_cnt = 0;
+	tcp->tcp_snd_burst = TCP_CWND_SS;
+	tcp_ss_rexmit(tcp);
+}
+
+/*
+ * tcp_get_seg_mp() is called to get the pointer to a segment in the
+ * send queue which starts at the given sequence number. If the given
+ * sequence number is equal to the last valid sequence number (tcp_snxt),
+ * the returned mblk is the last valid mblk, and off is set to the length
+ * of that mblk.
+ *
+ * Parameters:
+ *	tcp_t *tcp: the tcp instance pointer.
+ *	uint32_t seq: the starting seq. no. of the requested segment.
+ *	int32_t *off: after the execution, *off will be the offset to
+ *		the returned mblk which points to the requested seq no.
+ *		It is the caller's responsibility to send in a non-null off.
+ *
+ * Return:
+ *	An mblk_t pointer pointing to the requested segment in the send queue.
+ */
+static mblk_t *
+tcp_get_seg_mp(tcp_t *tcp, uint32_t seq, int32_t *off)
+{
+	int32_t	cnt;
+	mblk_t	*mp;
+
+	/* Defensive coding. Make sure we don't send incorrect data. */
+	if (SEQ_LT(seq, tcp->tcp_suna) || SEQ_GT(seq, tcp->tcp_snxt))
+		return (NULL);
+
+	cnt = seq - tcp->tcp_suna;
+	mp = tcp->tcp_xmit_head;
+	while (cnt > 0 && mp != NULL) {
+		cnt -= mp->b_wptr - mp->b_rptr;
+		if (cnt <= 0) {
+			cnt += mp->b_wptr - mp->b_rptr;
+			break;
+		}
+		mp = mp->b_cont;
+	}
+	ASSERT(mp != NULL);
+	*off = cnt;
+	return (mp);
+}
+
+/*
+ * This routine adjusts next-to-send sequence number variables, in the
+ * case where the receiver has shrunk its window.
+ */
+void
+tcp_update_xmit_tail(tcp_t *tcp, uint32_t snxt)
+{
+	mblk_t *xmit_tail;
+	int32_t offset;
+
+	tcp->tcp_snxt = snxt;
+
+	/* Get the mblk, and the offset in it, as per the shrunk window */
+	xmit_tail = tcp_get_seg_mp(tcp, snxt, &offset);
+	ASSERT(xmit_tail != NULL);
+	tcp->tcp_xmit_tail = xmit_tail;
+	tcp->tcp_xmit_tail_unsent = xmit_tail->b_wptr -
+	    xmit_tail->b_rptr - offset;
+}
+
+/*
+ * This handles the case when the receiver has shrunk its window. Per
+ * RFC 1122, if the receiver shrinks the window, i.e. moves the right edge
+ * of the window to the left, then we should not send new data, but should
+ * retransmit normally the old unacked data between suna and suna + swnd.
+ * We might have sent data that is now outside the new window; pretend that
+ * we didn't send it.
+ */
+static void
+tcp_process_shrunk_swnd(tcp_t *tcp, uint32_t shrunk_count)
+{
+	uint32_t snxt = tcp->tcp_snxt;
+
+	ASSERT(shrunk_count > 0);
+
+	if (!tcp->tcp_is_wnd_shrnk) {
+		tcp->tcp_snxt_shrunk = snxt;
+		tcp->tcp_is_wnd_shrnk = B_TRUE;
+	} else if (SEQ_GT(snxt, tcp->tcp_snxt_shrunk)) {
+		tcp->tcp_snxt_shrunk = snxt;
+	}
+
+	/* Pretend we didn't send the data outside the window */
+	snxt -= shrunk_count;
+
+	/* Reset all the values per the now shrunk window */
+	tcp_update_xmit_tail(tcp, snxt);
+	tcp->tcp_unsent += shrunk_count;
+
+	/*
+	 * If the SACK option is set, delete the entire list of
+	 * notsack'ed blocks.
+	 */
+	if (tcp->tcp_sack_info != NULL) {
+		if (tcp->tcp_notsack_list != NULL)
+			TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list, tcp);
+	}
+
+	if (tcp->tcp_suna == tcp->tcp_snxt && tcp->tcp_swnd == 0)
+		/*
+		 * Make sure the timer is running so that we will probe a zero
+		 * window.
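
[Editor's aside, not part of the commit: the retransmission and shrunk-window logic above leans throughout on SEQ_LT/SEQ_GT style comparisons, which stay correct when 32-bit sequence numbers wrap. A quick self-contained check of that idiom; the macros are written out here in the spirit of the kernel's definitions, for illustration only:]

#include <stdio.h>
#include <stdint.h>

/*
 * Serial-number comparisons: correct modulo 2^32 as long as the two
 * values are less than 2^31 apart.
 */
#define	SEQ_LT(a, b)	((int32_t)((a) - (b)) < 0)
#define	SEQ_GT(a, b)	((int32_t)((a) - (b)) > 0)

int
main(void)
{
	uint32_t suna = 0xfffffff0;	/* just before the wrap */
	uint32_t snxt = 0x00000010;	/* just after the wrap */

	printf("SEQ_LT(suna, snxt) = %d\n", SEQ_LT(suna, snxt));  /* 1 */
	printf("plain  suna < snxt = %d\n", suna < snxt);         /* 0: fooled by the wrap */
	return (0);
}
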
+ */ + TCP_TIMER_RESTART(tcp, tcp->tcp_rto); +} + +/* + * tcp_fill_header is called by tcp_send() to fill the outgoing TCP header + * with the template header, as well as other options such as time-stamp, + * ECN and/or SACK. + */ +static void +tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk) +{ + tcpha_t *tcp_tmpl, *tcpha; + uint32_t *dst, *src; + int hdrlen; + conn_t *connp = tcp->tcp_connp; + + ASSERT(OK_32PTR(rptr)); + + /* Template header */ + tcp_tmpl = tcp->tcp_tcpha; + + /* Header of outgoing packet */ + tcpha = (tcpha_t *)(rptr + connp->conn_ixa->ixa_ip_hdr_length); + + /* dst and src are opaque 32-bit fields, used for copying */ + dst = (uint32_t *)rptr; + src = (uint32_t *)connp->conn_ht_iphc; + hdrlen = connp->conn_ht_iphc_len; + + /* Fill time-stamp option if needed */ + if (tcp->tcp_snd_ts_ok) { + U32_TO_BE32((uint32_t)now, + (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 4); + U32_TO_BE32(tcp->tcp_ts_recent, + (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 8); + } else { + ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH); + } + + /* + * Copy the template header; is this really more efficient than + * calling bcopy()? For simple IPv4/TCP, it may be the case, + * but perhaps not for other scenarios. + */ + dst[0] = src[0]; + dst[1] = src[1]; + dst[2] = src[2]; + dst[3] = src[3]; + dst[4] = src[4]; + dst[5] = src[5]; + dst[6] = src[6]; + dst[7] = src[7]; + dst[8] = src[8]; + dst[9] = src[9]; + if (hdrlen -= 40) { + hdrlen >>= 2; + dst += 10; + src += 10; + do { + *dst++ = *src++; + } while (--hdrlen); + } + + /* + * Set the ECN info in the TCP header if it is not a zero + * window probe. Zero window probe is only sent in + * tcp_wput_data() and tcp_timer(). + */ + if (tcp->tcp_ecn_ok && !tcp->tcp_zero_win_probe) { + TCP_SET_ECT(tcp, rptr); + + if (tcp->tcp_ecn_echo_on) + tcpha->tha_flags |= TH_ECE; + if (tcp->tcp_cwr && !tcp->tcp_ecn_cwr_sent) { + tcpha->tha_flags |= TH_CWR; + tcp->tcp_ecn_cwr_sent = B_TRUE; + } + } + + /* Fill in SACK options */ + if (num_sack_blk > 0) { + uchar_t *wptr = rptr + connp->conn_ht_iphc_len; + sack_blk_t *tmp; + int32_t i; + + wptr[0] = TCPOPT_NOP; + wptr[1] = TCPOPT_NOP; + wptr[2] = TCPOPT_SACK; + wptr[3] = TCPOPT_HEADER_LEN + num_sack_blk * + sizeof (sack_blk_t); + wptr += TCPOPT_REAL_SACK_LEN; + + tmp = tcp->tcp_sack_list; + for (i = 0; i < num_sack_blk; i++) { + U32_TO_BE32(tmp[i].begin, wptr); + wptr += sizeof (tcp_seq); + U32_TO_BE32(tmp[i].end, wptr); + wptr += sizeof (tcp_seq); + } + tcpha->tha_offset_and_reserved += + ((num_sack_blk * 2 + 1) << 4); + } +} diff --git a/usr/src/uts/common/inet/tcp/tcp_socket.c b/usr/src/uts/common/inet/tcp/tcp_socket.c new file mode 100644 index 0000000000..7f96a851ef --- /dev/null +++ b/usr/src/uts/common/inet/tcp/tcp_socket.c @@ -0,0 +1,820 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* This file contains all TCP kernel socket related functions. */ + +#include <sys/types.h> +#include <sys/strlog.h> +#include <sys/policy.h> +#include <sys/sockio.h> +#include <sys/strsubr.h> +#include <sys/strsun.h> +#include <sys/squeue_impl.h> +#include <sys/squeue.h> +#include <sys/tihdr.h> +#include <sys/timod.h> +#include <sys/tpicommon.h> +#include <sys/socketvar.h> + +#include <inet/common.h> +#include <inet/proto_set.h> +#include <inet/ip.h> +#include <inet/tcp.h> +#include <inet/tcp_impl.h> + +static void tcp_activate(sock_lower_handle_t, sock_upper_handle_t, + sock_upcalls_t *, int, cred_t *); +static int tcp_accept(sock_lower_handle_t, sock_lower_handle_t, + sock_upper_handle_t, cred_t *); +static int tcp_bind(sock_lower_handle_t, struct sockaddr *, + socklen_t, cred_t *); +static int tcp_listen(sock_lower_handle_t, int, cred_t *); +static int tcp_connect(sock_lower_handle_t, const struct sockaddr *, + socklen_t, sock_connid_t *, cred_t *); +static int tcp_getsockopt(sock_lower_handle_t, int, int, void *, + socklen_t *, cred_t *); +static int tcp_setsockopt(sock_lower_handle_t, int, int, const void *, + socklen_t, cred_t *); +static int tcp_sendmsg(sock_lower_handle_t, mblk_t *, struct nmsghdr *, + cred_t *cr); +static int tcp_shutdown(sock_lower_handle_t, int, cred_t *); +static void tcp_clr_flowctrl(sock_lower_handle_t); +static int tcp_ioctl(sock_lower_handle_t, int, intptr_t, int, int32_t *, + cred_t *); +static int tcp_close(sock_lower_handle_t, int, cred_t *); + +sock_downcalls_t sock_tcp_downcalls = { + tcp_activate, + tcp_accept, + tcp_bind, + tcp_listen, + tcp_connect, + tcp_getpeername, + tcp_getsockname, + tcp_getsockopt, + tcp_setsockopt, + tcp_sendmsg, + NULL, + NULL, + NULL, + tcp_shutdown, + tcp_clr_flowctrl, + tcp_ioctl, + tcp_close, +}; + +/* ARGSUSED */ +static void +tcp_activate(sock_lower_handle_t proto_handle, sock_upper_handle_t sock_handle, + sock_upcalls_t *sock_upcalls, int flags, cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + struct sock_proto_props sopp; + extern struct module_info tcp_rinfo; + + ASSERT(connp->conn_upper_handle == NULL); + + /* All Solaris components should pass a cred for this operation. */ + ASSERT(cr != NULL); + + sopp.sopp_flags = SOCKOPT_RCVHIWAT | SOCKOPT_RCVLOWAT | + SOCKOPT_MAXPSZ | SOCKOPT_MAXBLK | SOCKOPT_RCVTIMER | + SOCKOPT_RCVTHRESH | SOCKOPT_MAXADDRLEN | SOCKOPT_MINPSZ; + + sopp.sopp_rxhiwat = SOCKET_RECVHIWATER; + sopp.sopp_rxlowat = SOCKET_RECVLOWATER; + sopp.sopp_maxpsz = INFPSZ; + sopp.sopp_maxblk = INFPSZ; + sopp.sopp_rcvtimer = SOCKET_TIMER_INTERVAL; + sopp.sopp_rcvthresh = SOCKET_RECVHIWATER >> 3; + sopp.sopp_maxaddrlen = sizeof (sin6_t); + sopp.sopp_minpsz = (tcp_rinfo.mi_minpsz == 1) ? 
0 : + tcp_rinfo.mi_minpsz; + + connp->conn_upcalls = sock_upcalls; + connp->conn_upper_handle = sock_handle; + + ASSERT(connp->conn_rcvbuf != 0 && + connp->conn_rcvbuf == connp->conn_tcp->tcp_rwnd); + (*sock_upcalls->su_set_proto_props)(sock_handle, &sopp); +} + +static int +tcp_accept(sock_lower_handle_t lproto_handle, + sock_lower_handle_t eproto_handle, sock_upper_handle_t sock_handle, + cred_t *cr) +{ + conn_t *lconnp, *econnp; + tcp_t *listener, *eager; + + lconnp = (conn_t *)lproto_handle; + listener = lconnp->conn_tcp; + ASSERT(listener->tcp_state == TCPS_LISTEN); + econnp = (conn_t *)eproto_handle; + eager = econnp->conn_tcp; + ASSERT(eager->tcp_listener != NULL); + + /* + * It is OK to manipulate these fields outside the eager's squeue + * because they will not start being used until tcp_accept_finish + * has been called. + */ + ASSERT(lconnp->conn_upper_handle != NULL); + ASSERT(econnp->conn_upper_handle == NULL); + econnp->conn_upper_handle = sock_handle; + econnp->conn_upcalls = lconnp->conn_upcalls; + ASSERT(IPCL_IS_NONSTR(econnp)); + return (tcp_accept_common(lconnp, econnp, cr)); +} + +static int +tcp_bind(sock_lower_handle_t proto_handle, struct sockaddr *sa, + socklen_t len, cred_t *cr) +{ + int error; + conn_t *connp = (conn_t *)proto_handle; + squeue_t *sqp = connp->conn_sqp; + + /* All Solaris components should pass a cred for this operation. */ + ASSERT(cr != NULL); + + ASSERT(sqp != NULL); + ASSERT(connp->conn_upper_handle != NULL); + + error = squeue_synch_enter(sqp, connp, NULL); + if (error != 0) { + /* failed to enter */ + return (ENOSR); + } + + /* binding to a NULL address really means unbind */ + if (sa == NULL) { + if (connp->conn_tcp->tcp_state < TCPS_LISTEN) + error = tcp_do_unbind(connp); + else + error = EINVAL; + } else { + error = tcp_do_bind(connp, sa, len, cr, B_TRUE); + } + + squeue_synch_exit(sqp, connp); + + if (error < 0) { + if (error == -TOUTSTATE) + error = EINVAL; + else + error = proto_tlitosyserr(-error); + } + + return (error); +} + +/* + * SOP_LISTEN() calls into tcp_listen(). + */ +/* ARGSUSED */ +static int +tcp_listen(sock_lower_handle_t proto_handle, int backlog, cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + int error; + squeue_t *sqp = connp->conn_sqp; + + ASSERT(connp->conn_upper_handle != NULL); + + /* All Solaris components should pass a cred for this operation. */ + ASSERT(cr != NULL); + + error = squeue_synch_enter(sqp, connp, NULL); + if (error != 0) { + /* failed to enter */ + return (ENOBUFS); + } + + error = tcp_do_listen(connp, NULL, 0, backlog, cr, B_FALSE); + if (error == 0) { + (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, + SOCK_OPCTL_ENAB_ACCEPT, (uintptr_t)backlog); + } else if (error < 0) { + if (error == -TOUTSTATE) + error = EINVAL; + else + error = proto_tlitosyserr(-error); + } + squeue_synch_exit(sqp, connp); + return (error); +} + +static int +tcp_connect(sock_lower_handle_t proto_handle, const struct sockaddr *sa, + socklen_t len, sock_connid_t *id, cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + squeue_t *sqp = connp->conn_sqp; + int error; + + ASSERT(connp->conn_upper_handle != NULL); + + /* All Solaris components should pass a cred for this operation. 
*/ + ASSERT(cr != NULL); + + error = proto_verify_ip_addr(connp->conn_family, sa, len); + if (error != 0) { + return (error); + } + + error = squeue_synch_enter(sqp, connp, NULL); + if (error != 0) { + /* failed to enter */ + return (ENOSR); + } + + /* + * TCP supports quick connect, so no need to do an implicit bind + */ + error = tcp_do_connect(connp, sa, len, cr, curproc->p_pid); + if (error == 0) { + *id = connp->conn_tcp->tcp_connid; + } else if (error < 0) { + if (error == -TOUTSTATE) { + switch (connp->conn_tcp->tcp_state) { + case TCPS_SYN_SENT: + error = EALREADY; + break; + case TCPS_ESTABLISHED: + error = EISCONN; + break; + case TCPS_LISTEN: + error = EOPNOTSUPP; + break; + default: + error = EINVAL; + break; + } + } else { + error = proto_tlitosyserr(-error); + } + } + + if (connp->conn_tcp->tcp_loopback) { + struct sock_proto_props sopp; + + sopp.sopp_flags = SOCKOPT_LOOPBACK; + sopp.sopp_loopback = B_TRUE; + + (*connp->conn_upcalls->su_set_proto_props)( + connp->conn_upper_handle, &sopp); + } +done: + squeue_synch_exit(sqp, connp); + + return ((error == 0) ? EINPROGRESS : error); +} + +/* ARGSUSED3 */ +int +tcp_getpeername(sock_lower_handle_t proto_handle, struct sockaddr *addr, + socklen_t *addrlenp, cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + tcp_t *tcp = connp->conn_tcp; + + ASSERT(connp->conn_upper_handle != NULL); + /* All Solaris components should pass a cred for this operation. */ + ASSERT(cr != NULL); + + ASSERT(tcp != NULL); + if (tcp->tcp_state < TCPS_SYN_RCVD) + return (ENOTCONN); + + return (conn_getpeername(connp, addr, addrlenp)); +} + +/* ARGSUSED3 */ +int +tcp_getsockname(sock_lower_handle_t proto_handle, struct sockaddr *addr, + socklen_t *addrlenp, cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + + /* All Solaris components should pass a cred for this operation. 
*/ + ASSERT(cr != NULL); + + ASSERT(connp->conn_upper_handle != NULL); + return (conn_getsockname(connp, addr, addrlenp)); +} + +/* returns UNIX error, the optlen is a value-result arg */ +static int +tcp_getsockopt(sock_lower_handle_t proto_handle, int level, int option_name, + void *optvalp, socklen_t *optlen, cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + squeue_t *sqp = connp->conn_sqp; + int error; + t_uscalar_t max_optbuf_len; + void *optvalp_buf; + int len; + + ASSERT(connp->conn_upper_handle != NULL); + + error = proto_opt_check(level, option_name, *optlen, &max_optbuf_len, + tcp_opt_obj.odb_opt_des_arr, + tcp_opt_obj.odb_opt_arr_cnt, + B_FALSE, B_TRUE, cr); + if (error != 0) { + if (error < 0) { + error = proto_tlitosyserr(-error); + } + return (error); + } + + optvalp_buf = kmem_alloc(max_optbuf_len, KM_SLEEP); + + error = squeue_synch_enter(sqp, connp, NULL); + if (error == ENOMEM) { + kmem_free(optvalp_buf, max_optbuf_len); + return (ENOMEM); + } + + len = tcp_opt_get(connp, level, option_name, optvalp_buf); + squeue_synch_exit(sqp, connp); + + if (len == -1) { + kmem_free(optvalp_buf, max_optbuf_len); + return (EINVAL); + } + + /* + * update optlen and copy option value + */ + t_uscalar_t size = MIN(len, *optlen); + + bcopy(optvalp_buf, optvalp, size); + bcopy(&size, optlen, sizeof (size)); + + kmem_free(optvalp_buf, max_optbuf_len); + return (0); +} + +static int +tcp_setsockopt(sock_lower_handle_t proto_handle, int level, int option_name, + const void *optvalp, socklen_t optlen, cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + squeue_t *sqp = connp->conn_sqp; + int error; + + ASSERT(connp->conn_upper_handle != NULL); + /* + * Entering the squeue synchronously can result in a context switch, + * which can cause a rather sever performance degradation. So we try to + * handle whatever options we can without entering the squeue. + */ + if (level == IPPROTO_TCP) { + switch (option_name) { + case TCP_NODELAY: + if (optlen != sizeof (int32_t)) + return (EINVAL); + mutex_enter(&connp->conn_tcp->tcp_non_sq_lock); + connp->conn_tcp->tcp_naglim = *(int *)optvalp ? 1 : + connp->conn_tcp->tcp_mss; + mutex_exit(&connp->conn_tcp->tcp_non_sq_lock); + return (0); + default: + break; + } + } + + error = squeue_synch_enter(sqp, connp, NULL); + if (error == ENOMEM) { + return (ENOMEM); + } + + error = proto_opt_check(level, option_name, optlen, NULL, + tcp_opt_obj.odb_opt_des_arr, + tcp_opt_obj.odb_opt_arr_cnt, + B_TRUE, B_FALSE, cr); + + if (error != 0) { + if (error < 0) { + error = proto_tlitosyserr(-error); + } + squeue_synch_exit(sqp, connp); + return (error); + } + + error = tcp_opt_set(connp, SETFN_OPTCOM_NEGOTIATE, level, option_name, + optlen, (uchar_t *)optvalp, (uint_t *)&optlen, (uchar_t *)optvalp, + NULL, cr); + squeue_synch_exit(sqp, connp); + + ASSERT(error >= 0); + + return (error); +} + +/* ARGSUSED */ +static int +tcp_sendmsg(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, + cred_t *cr) +{ + tcp_t *tcp; + uint32_t msize; + conn_t *connp = (conn_t *)proto_handle; + int32_t tcpstate; + + /* All Solaris components should pass a cred for this operation. 
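
[Editor's aside, not part of the commit: the TCP_NODELAY fast path handled above without entering the squeue sets tcp_naglim to 1 when Nagle is disabled, or back to the MSS when it is re-enabled. From an application's point of view the knob is the ordinary socket option, as in this small sketch:]

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <unistd.h>

int
main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int on = 1;

	if (fd == -1) {
		perror("socket");
		return (1);
	}
	/* Disable Nagle: small writes go out without coalescing delay. */
	if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &on, sizeof (on)) == -1)
		perror("setsockopt(TCP_NODELAY)");

	(void) close(fd);
	return (0);
}
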
*/ + ASSERT(cr != NULL); + + ASSERT(connp->conn_ref >= 2); + ASSERT(connp->conn_upper_handle != NULL); + + if (msg->msg_controllen != 0) { + freemsg(mp); + return (EOPNOTSUPP); + } + + switch (DB_TYPE(mp)) { + case M_DATA: + tcp = connp->conn_tcp; + ASSERT(tcp != NULL); + + tcpstate = tcp->tcp_state; + if (tcpstate < TCPS_ESTABLISHED) { + freemsg(mp); + /* + * We return ENOTCONN if the endpoint is trying to + * connect or has never been connected, and EPIPE if it + * has been disconnected. The connection id helps us + * distinguish between the last two cases. + */ + return ((tcpstate == TCPS_SYN_SENT) ? ENOTCONN : + ((tcp->tcp_connid > 0) ? EPIPE : ENOTCONN)); + } else if (tcpstate > TCPS_CLOSE_WAIT) { + freemsg(mp); + return (EPIPE); + } + + msize = msgdsize(mp); + + mutex_enter(&tcp->tcp_non_sq_lock); + tcp->tcp_squeue_bytes += msize; + /* + * Squeue Flow Control + */ + if (TCP_UNSENT_BYTES(tcp) > connp->conn_sndbuf) { + tcp_setqfull(tcp); + } + mutex_exit(&tcp->tcp_non_sq_lock); + + /* + * The application may pass in an address in the msghdr, but + * we ignore the address on connection-oriented sockets. + * Just like BSD this code does not generate an error for + * TCP (a CONNREQUIRED socket) when sending to an address + * passed in with sendto/sendmsg. Instead the data is + * delivered on the connection as if no address had been + * supplied. + */ + CONN_INC_REF(connp); + + if (msg->msg_flags & MSG_OOB) { + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output_urgent, + connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT); + } else { + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_output, + connp, NULL, tcp_squeue_flag, SQTAG_TCP_OUTPUT); + } + + return (0); + + default: + ASSERT(0); + } + + freemsg(mp); + return (0); +} + +/* ARGSUSED */ +static int +tcp_shutdown(sock_lower_handle_t proto_handle, int how, cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + tcp_t *tcp = connp->conn_tcp; + + ASSERT(connp->conn_upper_handle != NULL); + + /* All Solaris components should pass a cred for this operation. */ + ASSERT(cr != NULL); + + /* + * X/Open requires that we check the connected state. + */ + if (tcp->tcp_state < TCPS_SYN_SENT) + return (ENOTCONN); + + /* shutdown the send side */ + if (how != SHUT_RD) { + mblk_t *bp; + + bp = allocb_wait(0, BPRI_HI, STR_NOSIG, NULL); + CONN_INC_REF(connp); + SQUEUE_ENTER_ONE(connp->conn_sqp, bp, tcp_shutdown_output, + connp, NULL, SQ_NODRAIN, SQTAG_TCP_SHUTDOWN_OUTPUT); + + (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, + SOCK_OPCTL_SHUT_SEND, 0); + } + + /* shutdown the recv side */ + if (how != SHUT_WR) + (*connp->conn_upcalls->su_opctl)(connp->conn_upper_handle, + SOCK_OPCTL_SHUT_RECV, 0); + + return (0); +} + +static void +tcp_clr_flowctrl(sock_lower_handle_t proto_handle) +{ + conn_t *connp = (conn_t *)proto_handle; + tcp_t *tcp = connp->conn_tcp; + mblk_t *mp; + int error; + + ASSERT(connp->conn_upper_handle != NULL); + + /* + * If tcp->tcp_rsrv_mp == NULL, it means that tcp_clr_flowctrl() + * is currently running. 
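
[Editor's aside, not part of the commit: the tcp_rsrv_mp handling that follows uses a pre-allocated message as a combined resource and busy flag; whoever swaps the pointer to NULL under the lock owns the work, and concurrent callers simply return. A reduced pthread model of that pattern, with illustrative names:]

#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>

static pthread_mutex_t rsrv_lock = PTHREAD_MUTEX_INITIALIZER;
static void *rsrv_token;	/* pre-allocated; NULL while someone is working */

static void
clr_flowctrl(void)
{
	void *tok;

	pthread_mutex_lock(&rsrv_lock);
	if ((tok = rsrv_token) == NULL) {	/* someone is already in here */
		pthread_mutex_unlock(&rsrv_lock);
		return;
	}
	rsrv_token = NULL;
	pthread_mutex_unlock(&rsrv_lock);

	/* ... do the back-enable work using tok ... */

	pthread_mutex_lock(&rsrv_lock);
	rsrv_token = tok;			/* give the token back */
	pthread_mutex_unlock(&rsrv_lock);
}

int
main(void)
{
	rsrv_token = malloc(1);
	clr_flowctrl();
	printf("token restored: %s\n", rsrv_token != NULL ? "yes" : "no");
	free(rsrv_token);
	return (0);
}
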
+ */ + mutex_enter(&tcp->tcp_rsrv_mp_lock); + if ((mp = tcp->tcp_rsrv_mp) == NULL) { + mutex_exit(&tcp->tcp_rsrv_mp_lock); + return; + } + tcp->tcp_rsrv_mp = NULL; + mutex_exit(&tcp->tcp_rsrv_mp_lock); + + error = squeue_synch_enter(connp->conn_sqp, connp, mp); + ASSERT(error == 0); + + mutex_enter(&tcp->tcp_rsrv_mp_lock); + tcp->tcp_rsrv_mp = mp; + mutex_exit(&tcp->tcp_rsrv_mp_lock); + + if (tcp->tcp_fused) { + tcp_fuse_backenable(tcp); + } else { + tcp->tcp_rwnd = connp->conn_rcvbuf; + /* + * Send back a window update immediately if TCP is above + * ESTABLISHED state and the increase of the rcv window + * that the other side knows is at least 1 MSS after flow + * control is lifted. + */ + if (tcp->tcp_state >= TCPS_ESTABLISHED && + tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) { + tcp_xmit_ctl(NULL, tcp, + (tcp->tcp_swnd == 0) ? tcp->tcp_suna : + tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK); + } + } + + squeue_synch_exit(connp->conn_sqp, connp); +} + +/* ARGSUSED */ +static int +tcp_ioctl(sock_lower_handle_t proto_handle, int cmd, intptr_t arg, + int mode, int32_t *rvalp, cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + int error; + + ASSERT(connp->conn_upper_handle != NULL); + + /* All Solaris components should pass a cred for this operation. */ + ASSERT(cr != NULL); + + /* + * If we don't have a helper stream then create one. + * ip_create_helper_stream takes care of locking the conn_t, + * so this check for NULL is just a performance optimization. + */ + if (connp->conn_helper_info == NULL) { + tcp_stack_t *tcps = connp->conn_tcp->tcp_tcps; + + /* + * Create a helper stream for non-STREAMS socket. + */ + error = ip_create_helper_stream(connp, tcps->tcps_ldi_ident); + if (error != 0) { + ip0dbg(("tcp_ioctl: create of IP helper stream " + "failed %d\n", error)); + return (error); + } + } + + switch (cmd) { + case ND_SET: + case ND_GET: + case _SIOCSOCKFALLBACK: + case TCP_IOC_ABORT_CONN: + case TI_GETPEERNAME: + case TI_GETMYNAME: + ip1dbg(("tcp_ioctl: cmd 0x%x on non streams socket", + cmd)); + error = EINVAL; + break; + default: + /* + * If the conn is not closing, pass on to IP using + * helper stream. Bump the ioctlref to prevent tcp_close + * from closing the rq/wq out from underneath the ioctl + * if it ends up queued or aborted/interrupted. + */ + mutex_enter(&connp->conn_lock); + if (connp->conn_state_flags & (CONN_CLOSING)) { + mutex_exit(&connp->conn_lock); + error = EINVAL; + break; + } + CONN_INC_IOCTLREF_LOCKED(connp); + error = ldi_ioctl(connp->conn_helper_info->iphs_handle, + cmd, arg, mode, cr, rvalp); + CONN_DEC_IOCTLREF(connp); + break; + } + return (error); +} + +/* ARGSUSED */ +static int +tcp_close(sock_lower_handle_t proto_handle, int flags, cred_t *cr) +{ + conn_t *connp = (conn_t *)proto_handle; + + ASSERT(connp->conn_upper_handle != NULL); + + /* All Solaris components should pass a cred for this operation. */ + ASSERT(cr != NULL); + + tcp_close_common(connp, flags); + + ip_free_helper_stream(connp); + + /* + * Drop IP's reference on the conn. This is the last reference + * on the connp if the state was less than established. If the + * connection has gone into timewait state, then we will have + * one ref for the TCP and one more ref (total of two) for the + * classifier connected hash list (a timewait connections stays + * in connected hash till closed). + * + * We can't assert the references because there might be other + * transient reference places because of some walkers or queued + * packets in squeue for the timewait state. 
+ */ + CONN_DEC_REF(connp); + return (0); +} + +/* ARGSUSED */ +sock_lower_handle_t +tcp_create(int family, int type, int proto, sock_downcalls_t **sock_downcalls, + uint_t *smodep, int *errorp, int flags, cred_t *credp) +{ + conn_t *connp; + boolean_t isv6 = family == AF_INET6; + if (type != SOCK_STREAM || (family != AF_INET && family != AF_INET6) || + (proto != 0 && proto != IPPROTO_TCP)) { + *errorp = EPROTONOSUPPORT; + return (NULL); + } + + connp = tcp_create_common(credp, isv6, B_TRUE, errorp); + if (connp == NULL) { + return (NULL); + } + + /* + * Put the ref for TCP. Ref for IP was already put + * by ipcl_conn_create. Also Make the conn_t globally + * visible to walkers + */ + mutex_enter(&connp->conn_lock); + CONN_INC_REF_LOCKED(connp); + ASSERT(connp->conn_ref == 2); + connp->conn_state_flags &= ~CONN_INCIPIENT; + + connp->conn_flags |= IPCL_NONSTR; + mutex_exit(&connp->conn_lock); + + ASSERT(errorp != NULL); + *errorp = 0; + *sock_downcalls = &sock_tcp_downcalls; + *smodep = SM_CONNREQUIRED | SM_EXDATA | SM_ACCEPTSUPP | + SM_SENDFILESUPP; + + return ((sock_lower_handle_t)connp); +} + +int +tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q, + boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb) +{ + tcp_t *tcp; + conn_t *connp = (conn_t *)proto_handle; + int error; + mblk_t *stropt_mp; + mblk_t *ordrel_mp; + + tcp = connp->conn_tcp; + + stropt_mp = allocb_wait(sizeof (struct stroptions), BPRI_HI, STR_NOSIG, + NULL); + + /* Pre-allocate the T_ordrel_ind mblk. */ + ASSERT(tcp->tcp_ordrel_mp == NULL); + ordrel_mp = allocb_wait(sizeof (struct T_ordrel_ind), BPRI_HI, + STR_NOSIG, NULL); + ordrel_mp->b_datap->db_type = M_PROTO; + ((struct T_ordrel_ind *)ordrel_mp->b_rptr)->PRIM_type = T_ORDREL_IND; + ordrel_mp->b_wptr += sizeof (struct T_ordrel_ind); + + /* + * Enter the squeue so that no new packets can come in + */ + error = squeue_synch_enter(connp->conn_sqp, connp, NULL); + if (error != 0) { + /* failed to enter, free all the pre-allocated messages. */ + freeb(stropt_mp); + freeb(ordrel_mp); + /* + * We cannot process the eager, so at least send out a + * RST so the peer can reconnect. + */ + if (tcp->tcp_listener != NULL) { + (void) tcp_eager_blowoff(tcp->tcp_listener, + tcp->tcp_conn_req_seqnum); + } + return (ENOMEM); + } + + /* + * Both endpoints must be of the same type (either STREAMS or + * non-STREAMS) for fusion to be enabled. So if we are fused, + * we have to unfuse. + */ + if (tcp->tcp_fused) + tcp_unfuse(tcp); + + /* + * No longer a direct socket + */ + connp->conn_flags &= ~IPCL_NONSTR; + tcp->tcp_ordrel_mp = ordrel_mp; + + if (tcp->tcp_listener != NULL) { + /* The eager will deal with opts when accept() is called */ + freeb(stropt_mp); + tcp_fallback_eager(tcp, direct_sockfs); + } else { + tcp_fallback_noneager(tcp, stropt_mp, q, direct_sockfs, + quiesced_cb); + } + + /* + * There should be atleast two ref's (IP + TCP) + */ + ASSERT(connp->conn_ref >= 2); + squeue_synch_exit(connp->conn_sqp, connp); + + return (0); +} diff --git a/usr/src/uts/common/inet/tcp/tcp_stats.c b/usr/src/uts/common/inet/tcp/tcp_stats.c new file mode 100644 index 0000000000..3993f09d3f --- /dev/null +++ b/usr/src/uts/common/inet/tcp/tcp_stats.c @@ -0,0 +1,1021 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/types.h> +#include <sys/tihdr.h> +#include <sys/policy.h> +#include <sys/tsol/tnet.h> + +#include <inet/common.h> +#include <inet/ip.h> +#include <inet/tcp.h> +#include <inet/tcp_impl.h> +#include <inet/tcp_stats.h> +#include <inet/kstatcom.h> +#include <inet/snmpcom.h> + +static int tcp_kstat_update(kstat_t *kp, int rw); +static int tcp_kstat2_update(kstat_t *kp, int rw); +static void tcp_sum_mib(tcp_stack_t *, mib2_tcp_t *); + +static void tcp_cp_mib(mib2_tcp_t *, mib2_tcp_t *); +static void tcp_cp_stats(tcp_stat_t *, tcp_stat_t *); +static void tcp_clr_stats(tcp_stat_t *); + +tcp_g_stat_t tcp_g_statistics; +kstat_t *tcp_g_kstat; + +/* Translate TCP state to MIB2 TCP state. */ +static int +tcp_snmp_state(tcp_t *tcp) +{ + if (tcp == NULL) + return (0); + + switch (tcp->tcp_state) { + case TCPS_CLOSED: + case TCPS_IDLE: /* RFC1213 doesn't have analogue for IDLE & BOUND */ + case TCPS_BOUND: + return (MIB2_TCP_closed); + case TCPS_LISTEN: + return (MIB2_TCP_listen); + case TCPS_SYN_SENT: + return (MIB2_TCP_synSent); + case TCPS_SYN_RCVD: + return (MIB2_TCP_synReceived); + case TCPS_ESTABLISHED: + return (MIB2_TCP_established); + case TCPS_CLOSE_WAIT: + return (MIB2_TCP_closeWait); + case TCPS_FIN_WAIT_1: + return (MIB2_TCP_finWait1); + case TCPS_CLOSING: + return (MIB2_TCP_closing); + case TCPS_LAST_ACK: + return (MIB2_TCP_lastAck); + case TCPS_FIN_WAIT_2: + return (MIB2_TCP_finWait2); + case TCPS_TIME_WAIT: + return (MIB2_TCP_timeWait); + default: + return (0); + } +} + +/* + * Return SNMP stuff in buffer in mpdata. 
+ */ +mblk_t * +tcp_snmp_get(queue_t *q, mblk_t *mpctl) +{ + mblk_t *mpdata; + mblk_t *mp_conn_ctl = NULL; + mblk_t *mp_conn_tail; + mblk_t *mp_attr_ctl = NULL; + mblk_t *mp_attr_tail; + mblk_t *mp6_conn_ctl = NULL; + mblk_t *mp6_conn_tail; + mblk_t *mp6_attr_ctl = NULL; + mblk_t *mp6_attr_tail; + struct opthdr *optp; + mib2_tcpConnEntry_t tce; + mib2_tcp6ConnEntry_t tce6; + mib2_transportMLPEntry_t mlp; + connf_t *connfp; + int i; + boolean_t ispriv; + zoneid_t zoneid; + int v4_conn_idx; + int v6_conn_idx; + conn_t *connp = Q_TO_CONN(q); + tcp_stack_t *tcps; + ip_stack_t *ipst; + mblk_t *mp2ctl; + mib2_tcp_t tcp_mib; + + /* + * make a copy of the original message + */ + mp2ctl = copymsg(mpctl); + + if (mpctl == NULL || + (mpdata = mpctl->b_cont) == NULL || + (mp_conn_ctl = copymsg(mpctl)) == NULL || + (mp_attr_ctl = copymsg(mpctl)) == NULL || + (mp6_conn_ctl = copymsg(mpctl)) == NULL || + (mp6_attr_ctl = copymsg(mpctl)) == NULL) { + freemsg(mp_conn_ctl); + freemsg(mp_attr_ctl); + freemsg(mp6_conn_ctl); + freemsg(mp6_attr_ctl); + freemsg(mpctl); + freemsg(mp2ctl); + return (NULL); + } + + ipst = connp->conn_netstack->netstack_ip; + tcps = connp->conn_netstack->netstack_tcp; + + bzero(&tcp_mib, sizeof (tcp_mib)); + + /* build table of connections -- need count in fixed part */ + SET_MIB(tcp_mib.tcpRtoAlgorithm, 4); /* vanj */ + SET_MIB(tcp_mib.tcpRtoMin, tcps->tcps_rexmit_interval_min); + SET_MIB(tcp_mib.tcpRtoMax, tcps->tcps_rexmit_interval_max); + SET_MIB(tcp_mib.tcpMaxConn, -1); + SET_MIB(tcp_mib.tcpCurrEstab, 0); + + ispriv = + secpolicy_ip_config((Q_TO_CONN(q))->conn_cred, B_TRUE) == 0; + zoneid = Q_TO_CONN(q)->conn_zoneid; + + v4_conn_idx = v6_conn_idx = 0; + mp_conn_tail = mp_attr_tail = mp6_conn_tail = mp6_attr_tail = NULL; + + for (i = 0; i < CONN_G_HASH_SIZE; i++) { + ipst = tcps->tcps_netstack->netstack_ip; + + connfp = &ipst->ips_ipcl_globalhash_fanout[i]; + + connp = NULL; + + while ((connp = + ipcl_get_next_conn(connfp, connp, IPCL_TCPCONN)) != NULL) { + tcp_t *tcp; + boolean_t needattr; + + if (connp->conn_zoneid != zoneid) + continue; /* not in this zone */ + + tcp = connp->conn_tcp; + TCPS_UPDATE_MIB(tcps, tcpHCInSegs, tcp->tcp_ibsegs); + tcp->tcp_ibsegs = 0; + TCPS_UPDATE_MIB(tcps, tcpHCOutSegs, tcp->tcp_obsegs); + tcp->tcp_obsegs = 0; + + tce6.tcp6ConnState = tce.tcpConnState = + tcp_snmp_state(tcp); + if (tce.tcpConnState == MIB2_TCP_established || + tce.tcpConnState == MIB2_TCP_closeWait) + TCPS_BUMP_MIB(tcps, tcpCurrEstab); + + needattr = B_FALSE; + bzero(&mlp, sizeof (mlp)); + if (connp->conn_mlp_type != mlptSingle) { + if (connp->conn_mlp_type == mlptShared || + connp->conn_mlp_type == mlptBoth) + mlp.tme_flags |= MIB2_TMEF_SHARED; + if (connp->conn_mlp_type == mlptPrivate || + connp->conn_mlp_type == mlptBoth) + mlp.tme_flags |= MIB2_TMEF_PRIVATE; + needattr = B_TRUE; + } + if (connp->conn_anon_mlp) { + mlp.tme_flags |= MIB2_TMEF_ANONMLP; + needattr = B_TRUE; + } + switch (connp->conn_mac_mode) { + case CONN_MAC_DEFAULT: + break; + case CONN_MAC_AWARE: + mlp.tme_flags |= MIB2_TMEF_MACEXEMPT; + needattr = B_TRUE; + break; + case CONN_MAC_IMPLICIT: + mlp.tme_flags |= MIB2_TMEF_MACIMPLICIT; + needattr = B_TRUE; + break; + } + if (connp->conn_ixa->ixa_tsl != NULL) { + ts_label_t *tsl; + + tsl = connp->conn_ixa->ixa_tsl; + mlp.tme_flags |= MIB2_TMEF_IS_LABELED; + mlp.tme_doi = label2doi(tsl); + mlp.tme_label = *label2bslabel(tsl); + needattr = B_TRUE; + } + + /* Create a message to report on IPv6 entries */ + if (connp->conn_ipversion == IPV6_VERSION) { + 
tce6.tcp6ConnLocalAddress = connp->conn_laddr_v6; + tce6.tcp6ConnRemAddress = connp->conn_faddr_v6; + tce6.tcp6ConnLocalPort = ntohs(connp->conn_lport); + tce6.tcp6ConnRemPort = ntohs(connp->conn_fport); + if (connp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET) { + tce6.tcp6ConnIfIndex = + connp->conn_ixa->ixa_scopeid; + } else { + tce6.tcp6ConnIfIndex = connp->conn_bound_if; + } + /* Don't want just anybody seeing these... */ + if (ispriv) { + tce6.tcp6ConnEntryInfo.ce_snxt = + tcp->tcp_snxt; + tce6.tcp6ConnEntryInfo.ce_suna = + tcp->tcp_suna; + tce6.tcp6ConnEntryInfo.ce_rnxt = + tcp->tcp_rnxt; + tce6.tcp6ConnEntryInfo.ce_rack = + tcp->tcp_rack; + } else { + /* + * Netstat, unfortunately, uses this to + * get send/receive queue sizes. How to fix? + * Why not compute the difference only? + */ + tce6.tcp6ConnEntryInfo.ce_snxt = + tcp->tcp_snxt - tcp->tcp_suna; + tce6.tcp6ConnEntryInfo.ce_suna = 0; + tce6.tcp6ConnEntryInfo.ce_rnxt = + tcp->tcp_rnxt - tcp->tcp_rack; + tce6.tcp6ConnEntryInfo.ce_rack = 0; + } + + tce6.tcp6ConnEntryInfo.ce_swnd = tcp->tcp_swnd; + tce6.tcp6ConnEntryInfo.ce_rwnd = tcp->tcp_rwnd; + tce6.tcp6ConnEntryInfo.ce_rto = tcp->tcp_rto; + tce6.tcp6ConnEntryInfo.ce_mss = tcp->tcp_mss; + tce6.tcp6ConnEntryInfo.ce_state = tcp->tcp_state; + + tce6.tcp6ConnCreationProcess = + (connp->conn_cpid < 0) ? MIB2_UNKNOWN_PROCESS : + connp->conn_cpid; + tce6.tcp6ConnCreationTime = connp->conn_open_time; + + (void) snmp_append_data2(mp6_conn_ctl->b_cont, + &mp6_conn_tail, (char *)&tce6, sizeof (tce6)); + + mlp.tme_connidx = v6_conn_idx++; + if (needattr) + (void) snmp_append_data2(mp6_attr_ctl->b_cont, + &mp6_attr_tail, (char *)&mlp, sizeof (mlp)); + } + /* + * Create an IPv4 table entry for IPv4 entries and also + * for IPv6 entries which are bound to in6addr_any + * but don't have IPV6_V6ONLY set. + * (i.e. anything an IPv4 peer could connect to) + */ + if (connp->conn_ipversion == IPV4_VERSION || + (tcp->tcp_state <= TCPS_LISTEN && + !connp->conn_ipv6_v6only && + IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6))) { + if (connp->conn_ipversion == IPV6_VERSION) { + tce.tcpConnRemAddress = INADDR_ANY; + tce.tcpConnLocalAddress = INADDR_ANY; + } else { + tce.tcpConnRemAddress = + connp->conn_faddr_v4; + tce.tcpConnLocalAddress = + connp->conn_laddr_v4; + } + tce.tcpConnLocalPort = ntohs(connp->conn_lport); + tce.tcpConnRemPort = ntohs(connp->conn_fport); + /* Don't want just anybody seeing these... */ + if (ispriv) { + tce.tcpConnEntryInfo.ce_snxt = + tcp->tcp_snxt; + tce.tcpConnEntryInfo.ce_suna = + tcp->tcp_suna; + tce.tcpConnEntryInfo.ce_rnxt = + tcp->tcp_rnxt; + tce.tcpConnEntryInfo.ce_rack = + tcp->tcp_rack; + } else { + /* + * Netstat, unfortunately, uses this to + * get send/receive queue sizes. How + * to fix? + * Why not compute the difference only? + */ + tce.tcpConnEntryInfo.ce_snxt = + tcp->tcp_snxt - tcp->tcp_suna; + tce.tcpConnEntryInfo.ce_suna = 0; + tce.tcpConnEntryInfo.ce_rnxt = + tcp->tcp_rnxt - tcp->tcp_rack; + tce.tcpConnEntryInfo.ce_rack = 0; + } + + tce.tcpConnEntryInfo.ce_swnd = tcp->tcp_swnd; + tce.tcpConnEntryInfo.ce_rwnd = tcp->tcp_rwnd; + tce.tcpConnEntryInfo.ce_rto = tcp->tcp_rto; + tce.tcpConnEntryInfo.ce_mss = tcp->tcp_mss; + tce.tcpConnEntryInfo.ce_state = + tcp->tcp_state; + + tce.tcpConnCreationProcess = + (connp->conn_cpid < 0) ? 
+ MIB2_UNKNOWN_PROCESS : + connp->conn_cpid; + tce.tcpConnCreationTime = connp->conn_open_time; + + (void) snmp_append_data2(mp_conn_ctl->b_cont, + &mp_conn_tail, (char *)&tce, sizeof (tce)); + + mlp.tme_connidx = v4_conn_idx++; + if (needattr) + (void) snmp_append_data2( + mp_attr_ctl->b_cont, + &mp_attr_tail, (char *)&mlp, + sizeof (mlp)); + } + } + } + + /* fixed length structure for IPv4 and IPv6 counters */ + SET_MIB(tcp_mib.tcpConnTableSize, sizeof (mib2_tcpConnEntry_t)); + SET_MIB(tcp_mib.tcp6ConnTableSize, sizeof (mib2_tcp6ConnEntry_t)); + + /* synchronize 32- and 64-bit counters */ + SYNC32_MIB(&tcp_mib, tcpInSegs, tcpHCInSegs); + SYNC32_MIB(&tcp_mib, tcpOutSegs, tcpHCOutSegs); + + tcp_sum_mib(tcps, &tcp_mib); + + optp = (struct opthdr *)&mpctl->b_rptr[sizeof (struct T_optmgmt_ack)]; + optp->level = MIB2_TCP; + optp->name = 0; + (void) snmp_append_data(mpdata, (char *)&tcp_mib, sizeof (tcp_mib)); + optp->len = msgdsize(mpdata); + qreply(q, mpctl); + + /* table of connections... */ + optp = (struct opthdr *)&mp_conn_ctl->b_rptr[ + sizeof (struct T_optmgmt_ack)]; + optp->level = MIB2_TCP; + optp->name = MIB2_TCP_CONN; + optp->len = msgdsize(mp_conn_ctl->b_cont); + qreply(q, mp_conn_ctl); + + /* table of MLP attributes... */ + optp = (struct opthdr *)&mp_attr_ctl->b_rptr[ + sizeof (struct T_optmgmt_ack)]; + optp->level = MIB2_TCP; + optp->name = EXPER_XPORT_MLP; + optp->len = msgdsize(mp_attr_ctl->b_cont); + if (optp->len == 0) + freemsg(mp_attr_ctl); + else + qreply(q, mp_attr_ctl); + + /* table of IPv6 connections... */ + optp = (struct opthdr *)&mp6_conn_ctl->b_rptr[ + sizeof (struct T_optmgmt_ack)]; + optp->level = MIB2_TCP6; + optp->name = MIB2_TCP6_CONN; + optp->len = msgdsize(mp6_conn_ctl->b_cont); + qreply(q, mp6_conn_ctl); + + /* table of IPv6 MLP attributes... 
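The ce_snxt/ce_suna encoding built above is what lets an unprivileged consumer such as netstat recover queue depths: the unprivileged branch reports ce_snxt as snxt - suna and zeroes ce_suna (likewise on the receive side), so the same subtraction works against either view. A minimal consumer-side sketch, using a simplified stand-in for the real mib2 entry rather than the actual structure:

	#include <stdint.h>
	#include <stdio.h>

	/* Simplified stand-in for the ce_* fields of a mib2 connection entry. */
	typedef struct {
		uint32_t ce_snxt;
		uint32_t ce_suna;
		uint32_t ce_rnxt;
		uint32_t ce_rack;
	} ce_info_t;

	int
	main(void)
	{
		ce_info_t priv = { 5000, 4200, 9100, 9100 };	/* privileged view */
		ce_info_t unpriv = { 800, 0, 0, 0 };		/* unprivileged view */

		/* Both views yield Send-Q 800, Recv-Q 0. */
		(void) printf("Send-Q %u Recv-Q %u\n",
		    priv.ce_snxt - priv.ce_suna, priv.ce_rnxt - priv.ce_rack);
		(void) printf("Send-Q %u Recv-Q %u\n",
		    unpriv.ce_snxt - unpriv.ce_suna,
		    unpriv.ce_rnxt - unpriv.ce_rack);
		return (0);
	}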
*/ + optp = (struct opthdr *)&mp6_attr_ctl->b_rptr[ + sizeof (struct T_optmgmt_ack)]; + optp->level = MIB2_TCP6; + optp->name = EXPER_XPORT_MLP; + optp->len = msgdsize(mp6_attr_ctl->b_cont); + if (optp->len == 0) + freemsg(mp6_attr_ctl); + else + qreply(q, mp6_attr_ctl); + return (mp2ctl); +} + +/* Return 0 if invalid set request, 1 otherwise, including non-tcp requests */ +/* ARGSUSED */ +int +tcp_snmp_set(queue_t *q, int level, int name, uchar_t *ptr, int len) +{ + mib2_tcpConnEntry_t *tce = (mib2_tcpConnEntry_t *)ptr; + + switch (level) { + case MIB2_TCP: + switch (name) { + case 13: + if (tce->tcpConnState != MIB2_TCP_deleteTCB) + return (0); + /* TODO: delete entry defined by tce */ + return (1); + default: + return (0); + } + default: + return (1); + } +} + +/* + * TCP Kstats implementation + */ +void * +tcp_kstat_init(netstackid_t stackid) +{ + kstat_t *ksp; + + tcp_named_kstat_t template = { + { "rtoAlgorithm", KSTAT_DATA_INT32, 0 }, + { "rtoMin", KSTAT_DATA_INT32, 0 }, + { "rtoMax", KSTAT_DATA_INT32, 0 }, + { "maxConn", KSTAT_DATA_INT32, 0 }, + { "activeOpens", KSTAT_DATA_UINT32, 0 }, + { "passiveOpens", KSTAT_DATA_UINT32, 0 }, + { "attemptFails", KSTAT_DATA_UINT32, 0 }, + { "estabResets", KSTAT_DATA_UINT32, 0 }, + { "currEstab", KSTAT_DATA_UINT32, 0 }, + { "inSegs", KSTAT_DATA_UINT64, 0 }, + { "outSegs", KSTAT_DATA_UINT64, 0 }, + { "retransSegs", KSTAT_DATA_UINT32, 0 }, + { "connTableSize", KSTAT_DATA_INT32, 0 }, + { "outRsts", KSTAT_DATA_UINT32, 0 }, + { "outDataSegs", KSTAT_DATA_UINT32, 0 }, + { "outDataBytes", KSTAT_DATA_UINT32, 0 }, + { "retransBytes", KSTAT_DATA_UINT32, 0 }, + { "outAck", KSTAT_DATA_UINT32, 0 }, + { "outAckDelayed", KSTAT_DATA_UINT32, 0 }, + { "outUrg", KSTAT_DATA_UINT32, 0 }, + { "outWinUpdate", KSTAT_DATA_UINT32, 0 }, + { "outWinProbe", KSTAT_DATA_UINT32, 0 }, + { "outControl", KSTAT_DATA_UINT32, 0 }, + { "outFastRetrans", KSTAT_DATA_UINT32, 0 }, + { "inAckSegs", KSTAT_DATA_UINT32, 0 }, + { "inAckBytes", KSTAT_DATA_UINT32, 0 }, + { "inDupAck", KSTAT_DATA_UINT32, 0 }, + { "inAckUnsent", KSTAT_DATA_UINT32, 0 }, + { "inDataInorderSegs", KSTAT_DATA_UINT32, 0 }, + { "inDataInorderBytes", KSTAT_DATA_UINT32, 0 }, + { "inDataUnorderSegs", KSTAT_DATA_UINT32, 0 }, + { "inDataUnorderBytes", KSTAT_DATA_UINT32, 0 }, + { "inDataDupSegs", KSTAT_DATA_UINT32, 0 }, + { "inDataDupBytes", KSTAT_DATA_UINT32, 0 }, + { "inDataPartDupSegs", KSTAT_DATA_UINT32, 0 }, + { "inDataPartDupBytes", KSTAT_DATA_UINT32, 0 }, + { "inDataPastWinSegs", KSTAT_DATA_UINT32, 0 }, + { "inDataPastWinBytes", KSTAT_DATA_UINT32, 0 }, + { "inWinProbe", KSTAT_DATA_UINT32, 0 }, + { "inWinUpdate", KSTAT_DATA_UINT32, 0 }, + { "inClosed", KSTAT_DATA_UINT32, 0 }, + { "rttUpdate", KSTAT_DATA_UINT32, 0 }, + { "rttNoUpdate", KSTAT_DATA_UINT32, 0 }, + { "timRetrans", KSTAT_DATA_UINT32, 0 }, + { "timRetransDrop", KSTAT_DATA_UINT32, 0 }, + { "timKeepalive", KSTAT_DATA_UINT32, 0 }, + { "timKeepaliveProbe", KSTAT_DATA_UINT32, 0 }, + { "timKeepaliveDrop", KSTAT_DATA_UINT32, 0 }, + { "listenDrop", KSTAT_DATA_UINT32, 0 }, + { "listenDropQ0", KSTAT_DATA_UINT32, 0 }, + { "halfOpenDrop", KSTAT_DATA_UINT32, 0 }, + { "outSackRetransSegs", KSTAT_DATA_UINT32, 0 }, + { "connTableSize6", KSTAT_DATA_INT32, 0 } + }; + + ksp = kstat_create_netstack(TCP_MOD_NAME, 0, TCP_MOD_NAME, "mib2", + KSTAT_TYPE_NAMED, NUM_OF_FIELDS(tcp_named_kstat_t), 0, stackid); + + if (ksp == NULL) + return (NULL); + + template.rtoAlgorithm.value.ui32 = 4; + template.maxConn.value.i32 = -1; + + bcopy(&template, ksp->ks_data, sizeof (template)); + 
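Once installed, the named kstat built from the template above (the usual tcp:0:tcp kstat, class "mib2") can be read from userland with libkstat(3LIB). A rough sketch of such a reader, assuming the global zone's instance 0 and linking with -lkstat:

	#include <kstat.h>
	#include <stdio.h>

	int
	main(void)
	{
		kstat_ctl_t *kc;
		kstat_t *ksp;
		kstat_named_t *kn;

		if ((kc = kstat_open()) == NULL)
			return (1);
		/* tcp:0:tcp, as created by tcp_kstat_init() above */
		if ((ksp = kstat_lookup(kc, "tcp", 0, "tcp")) == NULL ||
		    kstat_read(kc, ksp, NULL) == -1) {
			(void) kstat_close(kc);
			return (1);
		}
		if ((kn = kstat_data_lookup(ksp, "currEstab")) != NULL)
			(void) printf("currEstab = %u\n", kn->value.ui32);
		if ((kn = kstat_data_lookup(ksp, "inSegs")) != NULL)
			(void) printf("inSegs = %llu\n",
			    (unsigned long long)kn->value.ui64);
		(void) kstat_close(kc);
		return (0);
	}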
ksp->ks_update = tcp_kstat_update; + ksp->ks_private = (void *)(uintptr_t)stackid; + + kstat_install(ksp); + return (ksp); +} + +void +tcp_kstat_fini(netstackid_t stackid, kstat_t *ksp) +{ + if (ksp != NULL) { + ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private); + kstat_delete_netstack(ksp, stackid); + } +} + +static int +tcp_kstat_update(kstat_t *kp, int rw) +{ + tcp_named_kstat_t *tcpkp; + tcp_t *tcp; + connf_t *connfp; + conn_t *connp; + int i; + netstackid_t stackid = (netstackid_t)(uintptr_t)kp->ks_private; + netstack_t *ns; + tcp_stack_t *tcps; + ip_stack_t *ipst; + mib2_tcp_t tcp_mib; + + if (rw == KSTAT_WRITE) + return (EACCES); + + ns = netstack_find_by_stackid(stackid); + if (ns == NULL) + return (-1); + tcps = ns->netstack_tcp; + if (tcps == NULL) { + netstack_rele(ns); + return (-1); + } + + tcpkp = (tcp_named_kstat_t *)kp->ks_data; + + tcpkp->currEstab.value.ui32 = 0; + tcpkp->rtoMin.value.ui32 = tcps->tcps_rexmit_interval_min; + tcpkp->rtoMax.value.ui32 = tcps->tcps_rexmit_interval_max; + + ipst = ns->netstack_ip; + + for (i = 0; i < CONN_G_HASH_SIZE; i++) { + connfp = &ipst->ips_ipcl_globalhash_fanout[i]; + connp = NULL; + while ((connp = + ipcl_get_next_conn(connfp, connp, IPCL_TCPCONN)) != NULL) { + tcp = connp->conn_tcp; + switch (tcp_snmp_state(tcp)) { + case MIB2_TCP_established: + case MIB2_TCP_closeWait: + tcpkp->currEstab.value.ui32++; + break; + } + } + } + bzero(&tcp_mib, sizeof (tcp_mib)); + tcp_sum_mib(tcps, &tcp_mib); + + tcpkp->activeOpens.value.ui32 = tcp_mib.tcpActiveOpens; + tcpkp->passiveOpens.value.ui32 = tcp_mib.tcpPassiveOpens; + tcpkp->attemptFails.value.ui32 = tcp_mib.tcpAttemptFails; + tcpkp->estabResets.value.ui32 = tcp_mib.tcpEstabResets; + tcpkp->inSegs.value.ui64 = tcp_mib.tcpHCInSegs; + tcpkp->outSegs.value.ui64 = tcp_mib.tcpHCOutSegs; + tcpkp->retransSegs.value.ui32 = tcp_mib.tcpRetransSegs; + tcpkp->connTableSize.value.i32 = tcp_mib.tcpConnTableSize; + tcpkp->outRsts.value.ui32 = tcp_mib.tcpOutRsts; + tcpkp->outDataSegs.value.ui32 = tcp_mib.tcpOutDataSegs; + tcpkp->outDataBytes.value.ui32 = tcp_mib.tcpOutDataBytes; + tcpkp->retransBytes.value.ui32 = tcp_mib.tcpRetransBytes; + tcpkp->outAck.value.ui32 = tcp_mib.tcpOutAck; + tcpkp->outAckDelayed.value.ui32 = tcp_mib.tcpOutAckDelayed; + tcpkp->outUrg.value.ui32 = tcp_mib.tcpOutUrg; + tcpkp->outWinUpdate.value.ui32 = tcp_mib.tcpOutWinUpdate; + tcpkp->outWinProbe.value.ui32 = tcp_mib.tcpOutWinProbe; + tcpkp->outControl.value.ui32 = tcp_mib.tcpOutControl; + tcpkp->outFastRetrans.value.ui32 = tcp_mib.tcpOutFastRetrans; + tcpkp->inAckSegs.value.ui32 = tcp_mib.tcpInAckSegs; + tcpkp->inAckBytes.value.ui32 = tcp_mib.tcpInAckBytes; + tcpkp->inDupAck.value.ui32 = tcp_mib.tcpInDupAck; + tcpkp->inAckUnsent.value.ui32 = tcp_mib.tcpInAckUnsent; + tcpkp->inDataInorderSegs.value.ui32 = tcp_mib.tcpInDataInorderSegs; + tcpkp->inDataInorderBytes.value.ui32 = tcp_mib.tcpInDataInorderBytes; + tcpkp->inDataUnorderSegs.value.ui32 = tcp_mib.tcpInDataUnorderSegs; + tcpkp->inDataUnorderBytes.value.ui32 = tcp_mib.tcpInDataUnorderBytes; + tcpkp->inDataDupSegs.value.ui32 = tcp_mib.tcpInDataDupSegs; + tcpkp->inDataDupBytes.value.ui32 = tcp_mib.tcpInDataDupBytes; + tcpkp->inDataPartDupSegs.value.ui32 = tcp_mib.tcpInDataPartDupSegs; + tcpkp->inDataPartDupBytes.value.ui32 = tcp_mib.tcpInDataPartDupBytes; + tcpkp->inDataPastWinSegs.value.ui32 = tcp_mib.tcpInDataPastWinSegs; + tcpkp->inDataPastWinBytes.value.ui32 = tcp_mib.tcpInDataPastWinBytes; + tcpkp->inWinProbe.value.ui32 = tcp_mib.tcpInWinProbe; + 
tcpkp->inWinUpdate.value.ui32 = tcp_mib.tcpInWinUpdate; + tcpkp->inClosed.value.ui32 = tcp_mib.tcpInClosed; + tcpkp->rttNoUpdate.value.ui32 = tcp_mib.tcpRttNoUpdate; + tcpkp->rttUpdate.value.ui32 = tcp_mib.tcpRttUpdate; + tcpkp->timRetrans.value.ui32 = tcp_mib.tcpTimRetrans; + tcpkp->timRetransDrop.value.ui32 = tcp_mib.tcpTimRetransDrop; + tcpkp->timKeepalive.value.ui32 = tcp_mib.tcpTimKeepalive; + tcpkp->timKeepaliveProbe.value.ui32 = tcp_mib.tcpTimKeepaliveProbe; + tcpkp->timKeepaliveDrop.value.ui32 = tcp_mib.tcpTimKeepaliveDrop; + tcpkp->listenDrop.value.ui32 = tcp_mib.tcpListenDrop; + tcpkp->listenDropQ0.value.ui32 = tcp_mib.tcpListenDropQ0; + tcpkp->halfOpenDrop.value.ui32 = tcp_mib.tcpHalfOpenDrop; + tcpkp->outSackRetransSegs.value.ui32 = tcp_mib.tcpOutSackRetransSegs; + tcpkp->connTableSize6.value.i32 = tcp_mib.tcp6ConnTableSize; + + netstack_rele(ns); + return (0); +} + +/* + * kstats related to squeues i.e. not per IP instance + */ +void * +tcp_g_kstat_init(tcp_g_stat_t *tcp_g_statp) +{ + kstat_t *ksp; + + tcp_g_stat_t template = { + { "tcp_timermp_alloced", KSTAT_DATA_UINT64 }, + { "tcp_timermp_allocfail", KSTAT_DATA_UINT64 }, + { "tcp_timermp_allocdblfail", KSTAT_DATA_UINT64 }, + { "tcp_freelist_cleanup", KSTAT_DATA_UINT64 }, + }; + + ksp = kstat_create(TCP_MOD_NAME, 0, "tcpstat_g", "net", + KSTAT_TYPE_NAMED, sizeof (template) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (ksp == NULL) + return (NULL); + + bcopy(&template, tcp_g_statp, sizeof (template)); + ksp->ks_data = (void *)tcp_g_statp; + + kstat_install(ksp); + return (ksp); +} + +void +tcp_g_kstat_fini(kstat_t *ksp) +{ + if (ksp != NULL) { + kstat_delete(ksp); + } +} + +void * +tcp_kstat2_init(netstackid_t stackid) +{ + kstat_t *ksp; + + tcp_stat_t template = { + { "tcp_time_wait_syn_success", KSTAT_DATA_UINT64, 0 }, + { "tcp_clean_death_nondetached", KSTAT_DATA_UINT64, 0 }, + { "tcp_eager_blowoff_q", KSTAT_DATA_UINT64, 0 }, + { "tcp_eager_blowoff_q0", KSTAT_DATA_UINT64, 0 }, + { "tcp_no_listener", KSTAT_DATA_UINT64, 0 }, + { "tcp_listendrop", KSTAT_DATA_UINT64, 0 }, + { "tcp_listendropq0", KSTAT_DATA_UINT64, 0 }, + { "tcp_wsrv_called", KSTAT_DATA_UINT64, 0 }, + { "tcp_flwctl_on", KSTAT_DATA_UINT64, 0 }, + { "tcp_timer_fire_early", KSTAT_DATA_UINT64, 0 }, + { "tcp_timer_fire_miss", KSTAT_DATA_UINT64, 0 }, + { "tcp_zcopy_on", KSTAT_DATA_UINT64, 0 }, + { "tcp_zcopy_off", KSTAT_DATA_UINT64, 0 }, + { "tcp_zcopy_backoff", KSTAT_DATA_UINT64, 0 }, + { "tcp_fusion_flowctl", KSTAT_DATA_UINT64, 0 }, + { "tcp_fusion_backenabled", KSTAT_DATA_UINT64, 0 }, + { "tcp_fusion_urg", KSTAT_DATA_UINT64, 0 }, + { "tcp_fusion_putnext", KSTAT_DATA_UINT64, 0 }, + { "tcp_fusion_unfusable", KSTAT_DATA_UINT64, 0 }, + { "tcp_fusion_aborted", KSTAT_DATA_UINT64, 0 }, + { "tcp_fusion_unqualified", KSTAT_DATA_UINT64, 0 }, + { "tcp_fusion_rrw_busy", KSTAT_DATA_UINT64, 0 }, + { "tcp_fusion_rrw_msgcnt", KSTAT_DATA_UINT64, 0 }, + { "tcp_fusion_rrw_plugged", KSTAT_DATA_UINT64, 0 }, + { "tcp_in_ack_unsent_drop", KSTAT_DATA_UINT64, 0 }, + { "tcp_sock_fallback", KSTAT_DATA_UINT64, 0 }, + { "tcp_lso_enabled", KSTAT_DATA_UINT64, 0 }, + { "tcp_lso_disabled", KSTAT_DATA_UINT64, 0 }, + { "tcp_lso_times", KSTAT_DATA_UINT64, 0 }, + { "tcp_lso_pkt_out", KSTAT_DATA_UINT64, 0 }, + { "tcp_listen_cnt_drop", KSTAT_DATA_UINT64, 0 }, + { "tcp_listen_mem_drop", KSTAT_DATA_UINT64, 0 }, + { "tcp_zwin_mem_drop", KSTAT_DATA_UINT64, 0 }, + { "tcp_zwin_ack_syn", KSTAT_DATA_UINT64, 0 }, + { "tcp_rst_unsent", KSTAT_DATA_UINT64, 0 }, + { "tcp_reclaim_cnt", KSTAT_DATA_UINT64, 
0 }, + { "tcp_reass_timeout", KSTAT_DATA_UINT64, 0 }, +#ifdef TCP_DEBUG_COUNTER + { "tcp_time_wait", KSTAT_DATA_UINT64, 0 }, + { "tcp_rput_time_wait", KSTAT_DATA_UINT64, 0 }, + { "tcp_detach_time_wait", KSTAT_DATA_UINT64, 0 }, + { "tcp_timeout_calls", KSTAT_DATA_UINT64, 0 }, + { "tcp_timeout_cached_alloc", KSTAT_DATA_UINT64, 0 }, + { "tcp_timeout_cancel_reqs", KSTAT_DATA_UINT64, 0 }, + { "tcp_timeout_canceled", KSTAT_DATA_UINT64, 0 }, + { "tcp_timermp_freed", KSTAT_DATA_UINT64, 0 }, + { "tcp_push_timer_cnt", KSTAT_DATA_UINT64, 0 }, + { "tcp_ack_timer_cnt", KSTAT_DATA_UINT64, 0 }, +#endif + }; + + ksp = kstat_create_netstack(TCP_MOD_NAME, 0, "tcpstat", "net", + KSTAT_TYPE_NAMED, sizeof (template) / sizeof (kstat_named_t), 0, + stackid); + + if (ksp == NULL) + return (NULL); + + bcopy(&template, ksp->ks_data, sizeof (template)); + ksp->ks_private = (void *)(uintptr_t)stackid; + ksp->ks_update = tcp_kstat2_update; + + kstat_install(ksp); + return (ksp); +} + +void +tcp_kstat2_fini(netstackid_t stackid, kstat_t *ksp) +{ + if (ksp != NULL) { + ASSERT(stackid == (netstackid_t)(uintptr_t)ksp->ks_private); + kstat_delete_netstack(ksp, stackid); + } +} + +/* + * Sum up all per CPU tcp_stat_t kstat counters. + */ +static int +tcp_kstat2_update(kstat_t *kp, int rw) +{ + netstackid_t stackid = (netstackid_t)(uintptr_t)kp->ks_private; + netstack_t *ns; + tcp_stack_t *tcps; + tcp_stat_t *stats; + int i; + int cnt; + + if (rw == KSTAT_WRITE) + return (EACCES); + + ns = netstack_find_by_stackid(stackid); + if (ns == NULL) + return (-1); + tcps = ns->netstack_tcp; + if (tcps == NULL) { + netstack_rele(ns); + return (-1); + } + + stats = (tcp_stat_t *)kp->ks_data; + tcp_clr_stats(stats); + + /* + * tcps_sc_cnt may change in the middle of the loop. It is better + * to get its value first. + */ + cnt = tcps->tcps_sc_cnt; + for (i = 0; i < cnt; i++) + tcp_cp_stats(&tcps->tcps_sc[i]->tcp_sc_stats, stats); + + netstack_rele(ns); + return (0); +} + +/* + * To copy stats from one mib2_tcp_t to another. Static fields are not copied. + * The caller should set them up propertly. 
+ */ +void +tcp_cp_mib(mib2_tcp_t *from, mib2_tcp_t *to) +{ + to->tcpActiveOpens += from->tcpActiveOpens; + to->tcpPassiveOpens += from->tcpPassiveOpens; + to->tcpAttemptFails += from->tcpAttemptFails; + to->tcpEstabResets += from->tcpEstabResets; + to->tcpInSegs += from->tcpInSegs; + to->tcpOutSegs += from->tcpOutSegs; + to->tcpRetransSegs += from->tcpRetransSegs; + to->tcpOutRsts += from->tcpOutRsts; + + to->tcpOutDataSegs += from->tcpOutDataSegs; + to->tcpOutDataBytes += from->tcpOutDataBytes; + to->tcpRetransBytes += from->tcpRetransBytes; + to->tcpOutAck += from->tcpOutAck; + to->tcpOutAckDelayed += from->tcpOutAckDelayed; + to->tcpOutUrg += from->tcpOutUrg; + to->tcpOutWinUpdate += from->tcpOutWinUpdate; + to->tcpOutWinProbe += from->tcpOutWinProbe; + to->tcpOutControl += from->tcpOutControl; + to->tcpOutFastRetrans += from->tcpOutFastRetrans; + + to->tcpInAckBytes += from->tcpInAckBytes; + to->tcpInDupAck += from->tcpInDupAck; + to->tcpInAckUnsent += from->tcpInAckUnsent; + to->tcpInDataInorderSegs += from->tcpInDataInorderSegs; + to->tcpInDataInorderBytes += from->tcpInDataInorderBytes; + to->tcpInDataUnorderSegs += from->tcpInDataUnorderSegs; + to->tcpInDataUnorderBytes += from->tcpInDataUnorderBytes; + to->tcpInDataDupSegs += from->tcpInDataDupSegs; + to->tcpInDataDupBytes += from->tcpInDataDupBytes; + to->tcpInDataPartDupSegs += from->tcpInDataPartDupSegs; + to->tcpInDataPartDupBytes += from->tcpInDataPartDupBytes; + to->tcpInDataPastWinSegs += from->tcpInDataPastWinSegs; + to->tcpInDataPastWinBytes += from->tcpInDataPastWinBytes; + to->tcpInWinProbe += from->tcpInWinProbe; + to->tcpInWinUpdate += from->tcpInWinUpdate; + to->tcpInClosed += from->tcpInClosed; + + to->tcpRttNoUpdate += from->tcpRttNoUpdate; + to->tcpRttUpdate += from->tcpRttUpdate; + to->tcpTimRetrans += from->tcpTimRetrans; + to->tcpTimRetransDrop += from->tcpTimRetransDrop; + to->tcpTimKeepalive += from->tcpTimKeepalive; + to->tcpTimKeepaliveProbe += from->tcpTimKeepaliveProbe; + to->tcpTimKeepaliveDrop += from->tcpTimKeepaliveDrop; + to->tcpListenDrop += from->tcpListenDrop; + to->tcpListenDropQ0 += from->tcpListenDropQ0; + to->tcpHalfOpenDrop += from->tcpHalfOpenDrop; + to->tcpOutSackRetransSegs += from->tcpOutSackRetransSegs; + to->tcpHCInSegs += from->tcpHCInSegs; + to->tcpHCOutSegs += from->tcpHCOutSegs; +} + +/* + * To sum up all MIB2 stats for a tcp_stack_t from all per CPU stats. The + * caller should initialize the target mib2_tcp_t properly as this function + * just adds up all the per CPU stats. + */ +static void +tcp_sum_mib(tcp_stack_t *tcps, mib2_tcp_t *tcp_mib) +{ + int i; + int cnt; + + /* + * tcps_sc_cnt may change in the middle of the loop. It is better + * to get its value first. + */ + cnt = tcps->tcps_sc_cnt; + for (i = 0; i < cnt; i++) + tcp_cp_mib(&tcps->tcps_sc[i]->tcp_sc_mib, tcp_mib); +} + +/* + * To set all tcp_stat_t counters to 0. 
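tcp_cp_mib() and tcp_sum_mib() above follow the usual per-CPU counter pattern: the caller initializes one target structure, the per-CPU count is latched once before the loop, and each CPU-local copy is folded into the target. A small userland analogue of that fold, with hypothetical structure and field names:

	#include <stdint.h>
	#include <stdio.h>

	#define	NCPU	4

	typedef struct {
		uint64_t in_segs;
		uint64_t out_segs;
	} counters_t;

	/* Fold one per-CPU copy into the running total (cf. tcp_cp_mib()). */
	static void
	counters_add(const counters_t *from, counters_t *to)
	{
		to->in_segs += from->in_segs;
		to->out_segs += from->out_segs;
	}

	int
	main(void)
	{
		counters_t percpu[NCPU] = {
			{ 10, 7 }, { 3, 2 }, { 0, 1 }, { 5, 5 }
		};
		counters_t total = { 0, 0 };	/* caller initializes the target */
		int i, cnt = NCPU;	/* latch the count once, cf. tcp_sum_mib() */

		for (i = 0; i < cnt; i++)
			counters_add(&percpu[i], &total);
		(void) printf("in=%llu out=%llu\n",
		    (unsigned long long)total.in_segs,
		    (unsigned long long)total.out_segs);
		return (0);
	}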
+ */ +void +tcp_clr_stats(tcp_stat_t *stats) +{ + stats->tcp_time_wait_syn_success.value.ui64 = 0; + stats->tcp_clean_death_nondetached.value.ui64 = 0; + stats->tcp_eager_blowoff_q.value.ui64 = 0; + stats->tcp_eager_blowoff_q0.value.ui64 = 0; + stats->tcp_no_listener.value.ui64 = 0; + stats->tcp_listendrop.value.ui64 = 0; + stats->tcp_listendropq0.value.ui64 = 0; + stats->tcp_wsrv_called.value.ui64 = 0; + stats->tcp_flwctl_on.value.ui64 = 0; + stats->tcp_timer_fire_early.value.ui64 = 0; + stats->tcp_timer_fire_miss.value.ui64 = 0; + stats->tcp_zcopy_on.value.ui64 = 0; + stats->tcp_zcopy_off.value.ui64 = 0; + stats->tcp_zcopy_backoff.value.ui64 = 0; + stats->tcp_fusion_flowctl.value.ui64 = 0; + stats->tcp_fusion_backenabled.value.ui64 = 0; + stats->tcp_fusion_urg.value.ui64 = 0; + stats->tcp_fusion_putnext.value.ui64 = 0; + stats->tcp_fusion_unfusable.value.ui64 = 0; + stats->tcp_fusion_aborted.value.ui64 = 0; + stats->tcp_fusion_unqualified.value.ui64 = 0; + stats->tcp_fusion_rrw_busy.value.ui64 = 0; + stats->tcp_fusion_rrw_msgcnt.value.ui64 = 0; + stats->tcp_fusion_rrw_plugged.value.ui64 = 0; + stats->tcp_in_ack_unsent_drop.value.ui64 = 0; + stats->tcp_sock_fallback.value.ui64 = 0; + stats->tcp_lso_enabled.value.ui64 = 0; + stats->tcp_lso_disabled.value.ui64 = 0; + stats->tcp_lso_times.value.ui64 = 0; + stats->tcp_lso_pkt_out.value.ui64 = 0; + stats->tcp_listen_cnt_drop.value.ui64 = 0; + stats->tcp_listen_mem_drop.value.ui64 = 0; + stats->tcp_zwin_mem_drop.value.ui64 = 0; + stats->tcp_zwin_ack_syn.value.ui64 = 0; + stats->tcp_rst_unsent.value.ui64 = 0; + stats->tcp_reclaim_cnt.value.ui64 = 0; + stats->tcp_reass_timeout.value.ui64 = 0; + +#ifdef TCP_DEBUG_COUNTER + stats->tcp_time_wait.value.ui64 = 0; + stats->tcp_rput_time_wait.value.ui64 = 0; + stats->tcp_detach_time_wait.value.ui64 = 0; + stats->tcp_timeout_calls.value.ui64 = 0; + stats->tcp_timeout_cached_alloc.value.ui64 = 0; + stats->tcp_timeout_cancel_reqs.value.ui64 = 0; + stats->tcp_timeout_canceled.value.ui64 = 0; + stats->tcp_timermp_freed.value.ui64 = 0; + stats->tcp_push_timer_cnt.value.ui64 = 0; + stats->tcp_ack_timer_cnt.value.ui64 = 0; +#endif +} + +/* + * To copy counters from one tcp_stat_t to another. 
+ */ +void +tcp_cp_stats(tcp_stat_t *from, tcp_stat_t *to) +{ + to->tcp_time_wait_syn_success.value.ui64 += + from->tcp_time_wait_syn_success.value.ui64; + to->tcp_clean_death_nondetached.value.ui64 += + from->tcp_clean_death_nondetached.value.ui64; + to->tcp_eager_blowoff_q.value.ui64 += + from->tcp_eager_blowoff_q.value.ui64; + to->tcp_eager_blowoff_q0.value.ui64 += + from->tcp_eager_blowoff_q0.value.ui64; + to->tcp_no_listener.value.ui64 += + from->tcp_no_listener.value.ui64; + to->tcp_listendrop.value.ui64 += + from->tcp_listendrop.value.ui64; + to->tcp_listendropq0.value.ui64 += + from->tcp_listendropq0.value.ui64; + to->tcp_wsrv_called.value.ui64 += + from->tcp_wsrv_called.value.ui64; + to->tcp_flwctl_on.value.ui64 += + from->tcp_flwctl_on.value.ui64; + to->tcp_timer_fire_early.value.ui64 += + from->tcp_timer_fire_early.value.ui64; + to->tcp_timer_fire_miss.value.ui64 += + from->tcp_timer_fire_miss.value.ui64; + to->tcp_zcopy_on.value.ui64 += + from->tcp_zcopy_on.value.ui64; + to->tcp_zcopy_off.value.ui64 += + from->tcp_zcopy_off.value.ui64; + to->tcp_zcopy_backoff.value.ui64 += + from->tcp_zcopy_backoff.value.ui64; + to->tcp_fusion_flowctl.value.ui64 += + from->tcp_fusion_flowctl.value.ui64; + to->tcp_fusion_backenabled.value.ui64 += + from->tcp_fusion_backenabled.value.ui64; + to->tcp_fusion_urg.value.ui64 += + from->tcp_fusion_urg.value.ui64; + to->tcp_fusion_putnext.value.ui64 += + from->tcp_fusion_putnext.value.ui64; + to->tcp_fusion_unfusable.value.ui64 += + from->tcp_fusion_unfusable.value.ui64; + to->tcp_fusion_aborted.value.ui64 += + from->tcp_fusion_aborted.value.ui64; + to->tcp_fusion_unqualified.value.ui64 += + from->tcp_fusion_unqualified.value.ui64; + to->tcp_fusion_rrw_busy.value.ui64 += + from->tcp_fusion_rrw_busy.value.ui64; + to->tcp_fusion_rrw_msgcnt.value.ui64 += + from->tcp_fusion_rrw_msgcnt.value.ui64; + to->tcp_fusion_rrw_plugged.value.ui64 += + from->tcp_fusion_rrw_plugged.value.ui64; + to->tcp_in_ack_unsent_drop.value.ui64 += + from->tcp_in_ack_unsent_drop.value.ui64; + to->tcp_sock_fallback.value.ui64 += + from->tcp_sock_fallback.value.ui64; + to->tcp_lso_enabled.value.ui64 += + from->tcp_lso_enabled.value.ui64; + to->tcp_lso_disabled.value.ui64 += + from->tcp_lso_disabled.value.ui64; + to->tcp_lso_times.value.ui64 += + from->tcp_lso_times.value.ui64; + to->tcp_lso_pkt_out.value.ui64 += + from->tcp_lso_pkt_out.value.ui64; + to->tcp_listen_cnt_drop.value.ui64 += + from->tcp_listen_cnt_drop.value.ui64; + to->tcp_listen_mem_drop.value.ui64 += + from->tcp_listen_mem_drop.value.ui64; + to->tcp_zwin_mem_drop.value.ui64 += + from->tcp_zwin_mem_drop.value.ui64; + to->tcp_zwin_ack_syn.value.ui64 += + from->tcp_zwin_ack_syn.value.ui64; + to->tcp_rst_unsent.value.ui64 += + from->tcp_rst_unsent.value.ui64; + to->tcp_reclaim_cnt.value.ui64 += + from->tcp_reclaim_cnt.value.ui64; + to->tcp_reass_timeout.value.ui64 += + from->tcp_reass_timeout.value.ui64; + +#ifdef TCP_DEBUG_COUNTER + to->tcp_time_wait.value.ui64 += + from->tcp_time_wait.value.ui64; + to->tcp_rput_time_wait.value.ui64 += + from->tcp_rput_time_wait.value.ui64; + to->tcp_detach_time_wait.value.ui64 += + from->tcp_detach_time_wait.value.ui64; + to->tcp_timeout_calls.value.ui64 += + from->tcp_timeout_calls.value.ui64; + to->tcp_timeout_cached_alloc.value.ui64 += + from->tcp_timeout_cached_alloc.value.ui64; + to->tcp_timeout_cancel_reqs.value.ui64 += + from->tcp_timeout_cancel_reqs.value.ui64; + to->tcp_timeout_canceled.value.ui64 += + from->tcp_timeout_canceled.value.ui64; + to->tcp_timermp_freed.value.ui64 += 
+ from->tcp_timermp_freed.value.ui64; + to->tcp_push_timer_cnt.value.ui64 += + from->tcp_push_timer_cnt.value.ui64; + to->tcp_ack_timer_cnt.value.ui64 += + from->tcp_ack_timer_cnt.value.ui64; +#endif +} diff --git a/usr/src/uts/common/inet/tcp/tcp_time_wait.c b/usr/src/uts/common/inet/tcp/tcp_time_wait.c new file mode 100644 index 0000000000..be241bd6cf --- /dev/null +++ b/usr/src/uts/common/inet/tcp/tcp_time_wait.c @@ -0,0 +1,629 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * This file contains functions related to TCP time wait processing. Also + * refer to the time wait handling comments in tcp_impl.h. + */ + +#include <sys/types.h> +#include <sys/strsun.h> +#include <sys/squeue_impl.h> +#include <sys/squeue.h> +#include <sys/callo.h> + +#include <inet/common.h> +#include <inet/ip.h> +#include <inet/tcp.h> +#include <inet/tcp_impl.h> +#include <inet/tcp_cluster.h> + +static void tcp_timewait_close(void *, mblk_t *, void *, ip_recv_attr_t *); + +/* + * TCP_TIME_WAIT_DELAY governs how often the time_wait_collector runs. + * Running it every 5 seconds seems to give the best results. + */ +#define TCP_TIME_WAIT_DELAY ((hrtime_t)5 * NANOSEC) + +/* + * Remove a connection from the list of detached TIME_WAIT connections. + * It returns B_FALSE if it can't remove the connection from the list + * as the connection has already been removed from the list due to an + * earlier call to tcp_time_wait_remove(); otherwise it returns B_TRUE. + */ +boolean_t +tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait) +{ + boolean_t locked = B_FALSE; + + if (tcp_time_wait == NULL) { + tcp_time_wait = *((tcp_squeue_priv_t **) + squeue_getprivate(tcp->tcp_connp->conn_sqp, SQPRIVATE_TCP)); + mutex_enter(&tcp_time_wait->tcp_time_wait_lock); + locked = B_TRUE; + } else { + ASSERT(MUTEX_HELD(&tcp_time_wait->tcp_time_wait_lock)); + } + + /* 0 means that the tcp_t has not been added to the time wait list. 
*/ + if (tcp->tcp_time_wait_expire == 0) { + ASSERT(tcp->tcp_time_wait_next == NULL); + ASSERT(tcp->tcp_time_wait_prev == NULL); + if (locked) + mutex_exit(&tcp_time_wait->tcp_time_wait_lock); + return (B_FALSE); + } + ASSERT(TCP_IS_DETACHED(tcp)); + ASSERT(tcp->tcp_state == TCPS_TIME_WAIT); + + if (tcp == tcp_time_wait->tcp_time_wait_head) { + ASSERT(tcp->tcp_time_wait_prev == NULL); + tcp_time_wait->tcp_time_wait_head = tcp->tcp_time_wait_next; + if (tcp_time_wait->tcp_time_wait_head != NULL) { + tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev = + NULL; + } else { + tcp_time_wait->tcp_time_wait_tail = NULL; + } + } else if (tcp == tcp_time_wait->tcp_time_wait_tail) { + ASSERT(tcp->tcp_time_wait_next == NULL); + tcp_time_wait->tcp_time_wait_tail = tcp->tcp_time_wait_prev; + ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL); + tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = NULL; + } else { + ASSERT(tcp->tcp_time_wait_prev->tcp_time_wait_next == tcp); + ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == tcp); + tcp->tcp_time_wait_prev->tcp_time_wait_next = + tcp->tcp_time_wait_next; + tcp->tcp_time_wait_next->tcp_time_wait_prev = + tcp->tcp_time_wait_prev; + } + tcp->tcp_time_wait_next = NULL; + tcp->tcp_time_wait_prev = NULL; + tcp->tcp_time_wait_expire = 0; + + if (locked) + mutex_exit(&tcp_time_wait->tcp_time_wait_lock); + return (B_TRUE); +} + +/* + * Add a connection to the list of detached TIME_WAIT connections + * and set its time to expire. + */ +void +tcp_time_wait_append(tcp_t *tcp) +{ + tcp_stack_t *tcps = tcp->tcp_tcps; + tcp_squeue_priv_t *tcp_time_wait = + *((tcp_squeue_priv_t **)squeue_getprivate(tcp->tcp_connp->conn_sqp, + SQPRIVATE_TCP)); + + tcp_timers_stop(tcp); + + /* Freed above */ + ASSERT(tcp->tcp_timer_tid == 0); + ASSERT(tcp->tcp_ack_tid == 0); + + /* must have happened at the time of detaching the tcp */ + ASSERT(tcp->tcp_ptpahn == NULL); + ASSERT(tcp->tcp_flow_stopped == 0); + ASSERT(tcp->tcp_time_wait_next == NULL); + ASSERT(tcp->tcp_time_wait_prev == NULL); + ASSERT(tcp->tcp_time_wait_expire == NULL); + ASSERT(tcp->tcp_listener == NULL); + + tcp->tcp_time_wait_expire = ddi_get_lbolt(); + /* + * The value computed below in tcp->tcp_time_wait_expire may + * appear negative or wrap around. That is ok since our + * interest is only in the difference between the current lbolt + * value and tcp->tcp_time_wait_expire. But the value should not + * be zero, since it means the tcp is not in the TIME_WAIT list. + * The corresponding comparison in tcp_time_wait_collector() uses + * modular arithmetic. 
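The wrap-around concern above is handled by signed (modular) difference: two tick counters compare correctly as long as they lie within half the counter range of each other. A self-contained illustration of the shape of that test, using a 32-bit counter rather than the kernel's clock_t and ddi_get_lbolt():

	#include <stdint.h>
	#include <stdio.h>

	/* Wrap-safe "has the deadline passed?" check. */
	static int
	deadline_passed(uint32_t now, uint32_t expire)
	{
		return ((int32_t)(now - expire) >= 0);
	}

	int
	main(void)
	{
		uint32_t expire = 0xfffffff0u;	/* just before wrap */
		uint32_t now = 0x00000010u;	/* just after wrap */

		/* 0x20 ticks have elapsed even though now < expire numerically. */
		(void) printf("%d\n", deadline_passed(now, expire));	/* prints 1 */
		return (0);
	}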
+ */ + tcp->tcp_time_wait_expire += MSEC_TO_TICK( + tcps->tcps_time_wait_interval); + if (tcp->tcp_time_wait_expire == 0) + tcp->tcp_time_wait_expire = 1; + + ASSERT(TCP_IS_DETACHED(tcp)); + ASSERT(tcp->tcp_state == TCPS_TIME_WAIT); + ASSERT(tcp->tcp_time_wait_next == NULL); + ASSERT(tcp->tcp_time_wait_prev == NULL); + TCP_DBGSTAT(tcps, tcp_time_wait); + + mutex_enter(&tcp_time_wait->tcp_time_wait_lock); + if (tcp_time_wait->tcp_time_wait_head == NULL) { + ASSERT(tcp_time_wait->tcp_time_wait_tail == NULL); + tcp_time_wait->tcp_time_wait_head = tcp; + } else { + ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL); + ASSERT(tcp_time_wait->tcp_time_wait_tail->tcp_state == + TCPS_TIME_WAIT); + tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = tcp; + tcp->tcp_time_wait_prev = tcp_time_wait->tcp_time_wait_tail; + } + tcp_time_wait->tcp_time_wait_tail = tcp; + mutex_exit(&tcp_time_wait->tcp_time_wait_lock); +} + +/* + * Wrapper to call tcp_close_detached() via squeue to clean up TIME-WAIT + * tcp_t. Used in tcp_time_wait_collector(). + */ +/* ARGSUSED */ +static void +tcp_timewait_close(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) +{ + conn_t *connp = (conn_t *)arg; + tcp_t *tcp = connp->conn_tcp; + + ASSERT(tcp != NULL); + if (tcp->tcp_state == TCPS_CLOSED) { + return; + } + + ASSERT((connp->conn_family == AF_INET && + connp->conn_ipversion == IPV4_VERSION) || + (connp->conn_family == AF_INET6 && + (connp->conn_ipversion == IPV4_VERSION || + connp->conn_ipversion == IPV6_VERSION))); + ASSERT(!tcp->tcp_listener); + + ASSERT(TCP_IS_DETACHED(tcp)); + + /* + * Because they have no upstream client to rebind or tcp_close() + * them later, we axe the connection here and now. + */ + tcp_close_detached(tcp); +} + +/* + * Blows away all tcps whose TIME_WAIT has expired. List traversal + * is done forwards from the head. + * This walks all stack instances since + * tcp_time_wait remains global across all stacks. + */ +/* ARGSUSED */ +void +tcp_time_wait_collector(void *arg) +{ + tcp_t *tcp; + clock_t now; + mblk_t *mp; + conn_t *connp; + kmutex_t *lock; + boolean_t removed; + extern void (*cl_inet_disconnect)(netstackid_t, uint8_t, sa_family_t, + uint8_t *, in_port_t, uint8_t *, in_port_t, void *); + + squeue_t *sqp = (squeue_t *)arg; + tcp_squeue_priv_t *tcp_time_wait = + *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP)); + + mutex_enter(&tcp_time_wait->tcp_time_wait_lock); + tcp_time_wait->tcp_time_wait_tid = 0; + + if (tcp_time_wait->tcp_free_list != NULL && + tcp_time_wait->tcp_free_list->tcp_in_free_list == B_TRUE) { + TCP_G_STAT(tcp_freelist_cleanup); + while ((tcp = tcp_time_wait->tcp_free_list) != NULL) { + tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next; + tcp->tcp_time_wait_next = NULL; + tcp_time_wait->tcp_free_list_cnt--; + ASSERT(tcp->tcp_tcps == NULL); + CONN_DEC_REF(tcp->tcp_connp); + } + ASSERT(tcp_time_wait->tcp_free_list_cnt == 0); + } + + /* + * In order to reap time waits reliably, we should use a + * source of time that is not adjustable by the user -- hence + * the call to ddi_get_lbolt(). + */ + now = ddi_get_lbolt(); + while ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL) { + /* + * Compare times using modular arithmetic, since + * lbolt can wrapover. 
+ */ + if ((now - tcp->tcp_time_wait_expire) < 0) { + break; + } + + removed = tcp_time_wait_remove(tcp, tcp_time_wait); + ASSERT(removed); + + connp = tcp->tcp_connp; + ASSERT(connp->conn_fanout != NULL); + lock = &connp->conn_fanout->connf_lock; + /* + * This is essentially a TW reclaim fast path optimization for + * performance where the timewait collector checks under the + * fanout lock (so that no one else can get access to the + * conn_t) that the refcnt is 2 i.e. one for TCP and one for + * the classifier hash list. If ref count is indeed 2, we can + * just remove the conn under the fanout lock and avoid + * cleaning up the conn under the squeue, provided that + * clustering callbacks are not enabled. If clustering is + * enabled, we need to make the clustering callback before + * setting the CONDEMNED flag and after dropping all locks and + * so we forego this optimization and fall back to the slow + * path. Also please see the comments in tcp_closei_local + * regarding the refcnt logic. + * + * Since we are holding the tcp_time_wait_lock, its better + * not to block on the fanout_lock because other connections + * can't add themselves to time_wait list. So we do a + * tryenter instead of mutex_enter. + */ + if (mutex_tryenter(lock)) { + mutex_enter(&connp->conn_lock); + if ((connp->conn_ref == 2) && + (cl_inet_disconnect == NULL)) { + ipcl_hash_remove_locked(connp, + connp->conn_fanout); + /* + * Set the CONDEMNED flag now itself so that + * the refcnt cannot increase due to any + * walker. + */ + connp->conn_state_flags |= CONN_CONDEMNED; + mutex_exit(lock); + mutex_exit(&connp->conn_lock); + if (tcp_time_wait->tcp_free_list_cnt < + tcp_free_list_max_cnt) { + /* Add to head of tcp_free_list */ + mutex_exit( + &tcp_time_wait->tcp_time_wait_lock); + tcp_cleanup(tcp); + ASSERT(connp->conn_latch == NULL); + ASSERT(connp->conn_policy == NULL); + ASSERT(tcp->tcp_tcps == NULL); + ASSERT(connp->conn_netstack == NULL); + + mutex_enter( + &tcp_time_wait->tcp_time_wait_lock); + tcp->tcp_time_wait_next = + tcp_time_wait->tcp_free_list; + tcp_time_wait->tcp_free_list = tcp; + tcp_time_wait->tcp_free_list_cnt++; + continue; + } else { + /* Do not add to tcp_free_list */ + mutex_exit( + &tcp_time_wait->tcp_time_wait_lock); + tcp_bind_hash_remove(tcp); + ixa_cleanup(tcp->tcp_connp->conn_ixa); + tcp_ipsec_cleanup(tcp); + CONN_DEC_REF(tcp->tcp_connp); + } + } else { + CONN_INC_REF_LOCKED(connp); + mutex_exit(lock); + mutex_exit(&tcp_time_wait->tcp_time_wait_lock); + mutex_exit(&connp->conn_lock); + /* + * We can reuse the closemp here since conn has + * detached (otherwise we wouldn't even be in + * time_wait list). tcp_closemp_used can safely + * be changed without taking a lock as no other + * thread can concurrently access it at this + * point in the connection lifecycle. + */ + + if (tcp->tcp_closemp.b_prev == NULL) + tcp->tcp_closemp_used = B_TRUE; + else + cmn_err(CE_PANIC, + "tcp_timewait_collector: " + "concurrent use of tcp_closemp: " + "connp %p tcp %p\n", (void *)connp, + (void *)tcp); + + TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15); + mp = &tcp->tcp_closemp; + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, + tcp_timewait_close, connp, NULL, + SQ_FILL, SQTAG_TCP_TIMEWAIT); + } + } else { + mutex_enter(&connp->conn_lock); + CONN_INC_REF_LOCKED(connp); + mutex_exit(&tcp_time_wait->tcp_time_wait_lock); + mutex_exit(&connp->conn_lock); + /* + * We can reuse the closemp here since conn has + * detached (otherwise we wouldn't even be in + * time_wait list). 
tcp_closemp_used can safely + * be changed without taking a lock as no other + * thread can concurrently access it at this + * point in the connection lifecycle. + */ + + if (tcp->tcp_closemp.b_prev == NULL) + tcp->tcp_closemp_used = B_TRUE; + else + cmn_err(CE_PANIC, "tcp_timewait_collector: " + "concurrent use of tcp_closemp: " + "connp %p tcp %p\n", (void *)connp, + (void *)tcp); + + TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15); + mp = &tcp->tcp_closemp; + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, + tcp_timewait_close, connp, NULL, + SQ_FILL, SQTAG_TCP_TIMEWAIT); + } + mutex_enter(&tcp_time_wait->tcp_time_wait_lock); + } + + if (tcp_time_wait->tcp_free_list != NULL) + tcp_time_wait->tcp_free_list->tcp_in_free_list = B_TRUE; + + tcp_time_wait->tcp_time_wait_tid = + timeout_generic(CALLOUT_NORMAL, tcp_time_wait_collector, sqp, + TCP_TIME_WAIT_DELAY, CALLOUT_TCP_RESOLUTION, + CALLOUT_FLAG_ROUNDUP); + mutex_exit(&tcp_time_wait->tcp_time_wait_lock); +} + +/* + * tcp_time_wait_processing() handles processing of incoming packets when + * the tcp_t is in the TIME_WAIT state. + * + * A TIME_WAIT tcp_t that has an associated open TCP end point (not in + * detached state) is never put on the time wait list. + */ +void +tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq, + uint32_t seg_ack, int seg_len, tcpha_t *tcpha, ip_recv_attr_t *ira) +{ + int32_t bytes_acked; + int32_t gap; + int32_t rgap; + tcp_opt_t tcpopt; + uint_t flags; + uint32_t new_swnd = 0; + conn_t *nconnp; + conn_t *connp = tcp->tcp_connp; + tcp_stack_t *tcps = tcp->tcp_tcps; + + BUMP_LOCAL(tcp->tcp_ibsegs); + DTRACE_PROBE2(tcp__trace__recv, mblk_t *, mp, tcp_t *, tcp); + + flags = (unsigned int)tcpha->tha_flags & 0xFF; + new_swnd = ntohs(tcpha->tha_win) << + ((tcpha->tha_flags & TH_SYN) ? 0 : tcp->tcp_snd_ws); + if (tcp->tcp_snd_ts_ok) { + if (!tcp_paws_check(tcp, tcpha, &tcpopt)) { + tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, + tcp->tcp_rnxt, TH_ACK); + goto done; + } + } + gap = seg_seq - tcp->tcp_rnxt; + rgap = tcp->tcp_rwnd - (gap + seg_len); + if (gap < 0) { + TCPS_BUMP_MIB(tcps, tcpInDataDupSegs); + TCPS_UPDATE_MIB(tcps, tcpInDataDupBytes, + (seg_len > -gap ? -gap : seg_len)); + seg_len += gap; + if (seg_len < 0 || (seg_len == 0 && !(flags & TH_FIN))) { + if (flags & TH_RST) { + goto done; + } + if ((flags & TH_FIN) && seg_len == -1) { + /* + * When TCP receives a duplicate FIN in + * TIME_WAIT state, restart the 2 MSL timer. + * See page 73 in RFC 793. Make sure this TCP + * is already on the TIME_WAIT list. If not, + * just restart the timer. + */ + if (TCP_IS_DETACHED(tcp)) { + if (tcp_time_wait_remove(tcp, NULL) == + B_TRUE) { + tcp_time_wait_append(tcp); + TCP_DBGSTAT(tcps, + tcp_rput_time_wait); + } + } else { + ASSERT(tcp != NULL); + TCP_TIMER_RESTART(tcp, + tcps->tcps_time_wait_interval); + } + tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, + tcp->tcp_rnxt, TH_ACK); + goto done; + } + flags |= TH_ACK_NEEDED; + seg_len = 0; + goto process_ack; + } + + /* Fix seg_seq, and chew the gap off the front. */ + seg_seq = tcp->tcp_rnxt; + } + + if ((flags & TH_SYN) && gap > 0 && rgap < 0) { + /* + * Make sure that when we accept the connection, pick + * an ISS greater than (tcp_snxt + ISS_INCR/2) for the + * old connection. + * + * The next ISS generated is equal to tcp_iss_incr_extra + * + ISS_INCR/2 + other components depending on the + * value of tcp_strong_iss. We pre-calculate the new + * ISS here and compare with tcp_snxt to determine if + * we need to make adjustment to tcp_iss_incr_extra. 
+ * + * The above calculation is ugly and is a + * waste of CPU cycles... + */ + uint32_t new_iss = tcps->tcps_iss_incr_extra; + int32_t adj; + ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip; + + switch (tcps->tcps_strong_iss) { + case 2: { + /* Add time and MD5 components. */ + uint32_t answer[4]; + struct { + uint32_t ports; + in6_addr_t src; + in6_addr_t dst; + } arg; + MD5_CTX context; + + mutex_enter(&tcps->tcps_iss_key_lock); + context = tcps->tcps_iss_key; + mutex_exit(&tcps->tcps_iss_key_lock); + arg.ports = connp->conn_ports; + /* We use MAPPED addresses in tcp_iss_init */ + arg.src = connp->conn_laddr_v6; + arg.dst = connp->conn_faddr_v6; + MD5Update(&context, (uchar_t *)&arg, + sizeof (arg)); + MD5Final((uchar_t *)answer, &context); + answer[0] ^= answer[1] ^ answer[2] ^ answer[3]; + new_iss += (gethrtime() >> ISS_NSEC_SHT) + answer[0]; + break; + } + case 1: + /* Add time component and min random (i.e. 1). */ + new_iss += (gethrtime() >> ISS_NSEC_SHT) + 1; + break; + default: + /* Add only time component. */ + new_iss += (uint32_t)gethrestime_sec() * ISS_INCR; + break; + } + if ((adj = (int32_t)(tcp->tcp_snxt - new_iss)) > 0) { + /* + * New ISS not guaranteed to be ISS_INCR/2 + * ahead of the current tcp_snxt, so add the + * difference to tcp_iss_incr_extra. + */ + tcps->tcps_iss_incr_extra += adj; + } + /* + * If tcp_clean_death() can not perform the task now, + * drop the SYN packet and let the other side re-xmit. + * Otherwise pass the SYN packet back in, since the + * old tcp state has been cleaned up or freed. + */ + if (tcp_clean_death(tcp, 0) == -1) + goto done; + nconnp = ipcl_classify(mp, ira, ipst); + if (nconnp != NULL) { + TCP_STAT(tcps, tcp_time_wait_syn_success); + /* Drops ref on nconnp */ + tcp_reinput(nconnp, mp, ira, ipst); + return; + } + goto done; + } + + /* + * rgap is the amount of stuff received out of window. A negative + * value is the amount out of window. + */ + if (rgap < 0) { + TCPS_BUMP_MIB(tcps, tcpInDataPastWinSegs); + TCPS_UPDATE_MIB(tcps, tcpInDataPastWinBytes, -rgap); + /* Fix seg_len and make sure there is something left. */ + seg_len += rgap; + if (seg_len <= 0) { + if (flags & TH_RST) { + goto done; + } + flags |= TH_ACK_NEEDED; + seg_len = 0; + goto process_ack; + } + } + /* + * Check whether we can update tcp_ts_recent. This test is + * NOT the one in RFC 1323 3.4. It is from Braden, 1993, "TCP + * Extensions for High Performance: An Update", Internet Draft. + */ + if (tcp->tcp_snd_ts_ok && + TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) && + SEQ_LEQ(seg_seq, tcp->tcp_rack)) { + tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val; + tcp->tcp_last_rcv_lbolt = ddi_get_lbolt64(); + } + + if (seg_seq != tcp->tcp_rnxt && seg_len > 0) { + /* Always ack out of order packets */ + flags |= TH_ACK_NEEDED; + seg_len = 0; + } else if (seg_len > 0) { + TCPS_BUMP_MIB(tcps, tcpInClosed); + TCPS_BUMP_MIB(tcps, tcpInDataInorderSegs); + TCPS_UPDATE_MIB(tcps, tcpInDataInorderBytes, seg_len); + } + if (flags & TH_RST) { + (void) tcp_clean_death(tcp, 0); + goto done; + } + if (flags & TH_SYN) { + tcp_xmit_ctl("TH_SYN", tcp, seg_ack, seg_seq + 1, + TH_RST|TH_ACK); + /* + * Do not delete the TCP structure if it is in + * TIME_WAIT state. Refer to RFC 1122, 4.2.2.13. 
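The strongest setting shown above (tcps_strong_iss == 2) combines a time-derived component with a keyed MD5 digest of the connection 4-tuple, on top of tcps_iss_incr_extra. The sketch below captures only that shape; toy_mix() is a placeholder for the keyed MD5, and the shift and constants are made up rather than the real ISS_NSEC_SHT/ISS_INCR values:

	#include <stdint.h>
	#include <time.h>

	/* Placeholder mix standing in for the keyed MD5 digest of the 4-tuple. */
	static uint32_t
	toy_mix(uint32_t ports, uint32_t laddr, uint32_t faddr)
	{
		uint32_t h = ports ^ laddr ^ faddr;

		h ^= h >> 16;
		h *= 0x45d9f3bU;
		h ^= h >> 16;
		return (h);
	}

	/* ISS shape: carried-over extra + time component + per-tuple offset. */
	uint32_t
	pick_iss(uint32_t iss_incr_extra, uint32_t ports, uint32_t laddr,
	    uint32_t faddr)
	{
		uint64_t now_ns = (uint64_t)time(NULL) * 1000000000ULL;

		return (iss_incr_extra + (uint32_t)(now_ns >> 12) +
		    toy_mix(ports, laddr, faddr));
	}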
+ */ + goto done; + } +process_ack: + if (flags & TH_ACK) { + bytes_acked = (int)(seg_ack - tcp->tcp_suna); + if (bytes_acked <= 0) { + if (bytes_acked == 0 && seg_len == 0 && + new_swnd == tcp->tcp_swnd) + TCPS_BUMP_MIB(tcps, tcpInDupAck); + } else { + /* Acks something not sent */ + flags |= TH_ACK_NEEDED; + } + } + if (flags & TH_ACK_NEEDED) { + /* + * Time to send an ack for some reason. + */ + tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, + tcp->tcp_rnxt, TH_ACK); + } +done: + freemsg(mp); +} diff --git a/usr/src/uts/common/inet/tcp/tcp_timers.c b/usr/src/uts/common/inet/tcp/tcp_timers.c new file mode 100644 index 0000000000..5c87620fca --- /dev/null +++ b/usr/src/uts/common/inet/tcp/tcp_timers.c @@ -0,0 +1,1046 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <sys/types.h> +#include <sys/strlog.h> +#include <sys/strsun.h> +#include <sys/squeue_impl.h> +#include <sys/squeue.h> +#include <sys/callo.h> +#include <sys/strsubr.h> + +#include <inet/common.h> +#include <inet/ip.h> +#include <inet/ip_ire.h> +#include <inet/ip_rts.h> +#include <inet/tcp.h> +#include <inet/tcp_impl.h> + +/* + * Implementation of TCP Timers. + * ============================= + * + * INTERFACE: + * + * There are two basic functions dealing with tcp timers: + * + * timeout_id_t tcp_timeout(connp, func, time) + * clock_t tcp_timeout_cancel(connp, timeout_id) + * TCP_TIMER_RESTART(tcp, intvl) + * + * tcp_timeout() starts a timer for the 'tcp' instance arranging to call 'func' + * after 'time' ticks passed. The function called by timeout() must adhere to + * the same restrictions as a driver soft interrupt handler - it must not sleep + * or call other functions that might sleep. The value returned is the opaque + * non-zero timeout identifier that can be passed to tcp_timeout_cancel() to + * cancel the request. The call to tcp_timeout() may fail in which case it + * returns zero. This is different from the timeout(9F) function which never + * fails. + * + * The call-back function 'func' always receives 'connp' as its single + * argument. It is always executed in the squeue corresponding to the tcp + * structure. The tcp structure is guaranteed to be present at the time the + * call-back is called. + * + * NOTE: The call-back function 'func' is never called if tcp is in + * the TCPS_CLOSED state. + * + * tcp_timeout_cancel() attempts to cancel a pending tcp_timeout() + * request. locks acquired by the call-back routine should not be held across + * the call to tcp_timeout_cancel() or a deadlock may result. + * + * tcp_timeout_cancel() returns -1 if it can not cancel the timeout request. 
+ * Otherwise, it returns an integer value greater than or equal to 0. In + * particular, if the call-back function is already placed on the squeue, it can + * not be canceled. + * + * NOTE: both tcp_timeout() and tcp_timeout_cancel() should always be called + * within squeue context corresponding to the tcp instance. Since the + * call-back is also called via the same squeue, there are no race + * conditions described in untimeout(9F) manual page since all calls are + * strictly serialized. + * + * TCP_TIMER_RESTART() is a macro that attempts to cancel a pending timeout + * stored in tcp_timer_tid and starts a new one using + * MSEC_TO_TICK(intvl). It always uses tcp_timer() function as a call-back + * and stores the return value of tcp_timeout() in the tcp->tcp_timer_tid + * field. + * + * NOTE: since the timeout cancellation is not guaranteed, the cancelled + * call-back may still be called, so it is possible tcp_timer() will be + * called several times. This should not be a problem since tcp_timer() + * should always check the tcp instance state. + * + * + * IMPLEMENTATION: + * + * TCP timers are implemented using three-stage process. The call to + * tcp_timeout() uses timeout(9F) function to call tcp_timer_callback() function + * when the timer expires. The tcp_timer_callback() arranges the call of the + * tcp_timer_handler() function via squeue corresponding to the tcp + * instance. The tcp_timer_handler() calls actual requested timeout call-back + * and passes tcp instance as an argument to it. Information is passed between + * stages using the tcp_timer_t structure which contains the connp pointer, the + * tcp call-back to call and the timeout id returned by the timeout(9F). + * + * The tcp_timer_t structure is not used directly, it is embedded in an mblk_t - + * like structure that is used to enter an squeue. The mp->b_rptr of this pseudo + * mblk points to the beginning of tcp_timer_t structure. The tcp_timeout() + * returns the pointer to this mblk. + * + * The pseudo mblk is allocated from a special tcp_timer_cache kmem cache. It + * looks like a normal mblk without actual dblk attached to it. + * + * To optimize performance each tcp instance holds a small cache of timer + * mblocks. In the current implementation it caches up to two timer mblocks per + * tcp instance. The cache is preserved over tcp frees and is only freed when + * the whole tcp structure is destroyed by its kmem destructor. Since all tcp + * timer processing happens on a corresponding squeue, the cache manipulation + * does not require any locks. Experiments show that majority of timer mblocks + * allocations are satisfied from the tcp cache and do not involve kmem calls. + * + * The tcp_timeout() places a refhold on the connp instance which guarantees + * that it will be present at the time the call-back function fires. The + * tcp_timer_handler() drops the reference after calling the call-back, so the + * call-back function does not need to manipulate the references explicitly. 
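Given the interface described above, a hypothetical caller would arm and disarm a per-connection timer roughly as sketched below, always from the connection's squeue. The example_* functions are illustrative only and assume the kernel context of this file; they are not part of this changeset:

	static void
	example_arm_rexmit_timer(tcp_t *tcp)
	{
		/* Arm tcp_timer() to fire after one RTO; 0 means allocation failed. */
		tcp->tcp_timer_tid = tcp_timeout(tcp->tcp_connp, tcp_timer,
		    MSEC_TO_TICK(tcp->tcp_rto));
	}

	static void
	example_ack_received(tcp_t *tcp)
	{
		/*
		 * A return of zero or greater means the timeout was cancelled
		 * and the timer mblk and conn_t reference were released; -1
		 * means it could not be cancelled, e.g. because the handler is
		 * already queued on the squeue.
		 */
		if (tcp->tcp_timer_tid != 0 &&
		    tcp_timeout_cancel(tcp->tcp_connp, tcp->tcp_timer_tid) >= 0)
			tcp->tcp_timer_tid = 0;
	}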
+ */ + +kmem_cache_t *tcp_timercache; + +static void tcp_ip_notify(tcp_t *); +static void tcp_timer_callback(void *); +static void tcp_timer_free(tcp_t *, mblk_t *); +static void tcp_timer_handler(void *, mblk_t *, void *, ip_recv_attr_t *); + +timeout_id_t +tcp_timeout(conn_t *connp, void (*f)(void *), clock_t tim) +{ + mblk_t *mp; + tcp_timer_t *tcpt; + tcp_t *tcp = connp->conn_tcp; + + ASSERT(connp->conn_sqp != NULL); + + TCP_DBGSTAT(tcp->tcp_tcps, tcp_timeout_calls); + + if (tcp->tcp_timercache == NULL) { + mp = tcp_timermp_alloc(KM_NOSLEEP | KM_PANIC); + } else { + TCP_DBGSTAT(tcp->tcp_tcps, tcp_timeout_cached_alloc); + mp = tcp->tcp_timercache; + tcp->tcp_timercache = mp->b_next; + mp->b_next = NULL; + ASSERT(mp->b_wptr == NULL); + } + + CONN_INC_REF(connp); + tcpt = (tcp_timer_t *)mp->b_rptr; + tcpt->connp = connp; + tcpt->tcpt_proc = f; + /* + * TCP timers are normal timeouts. Plus, they do not require more than + * a 10 millisecond resolution. By choosing a coarser resolution and by + * rounding up the expiration to the next resolution boundary, we can + * batch timers in the callout subsystem to make TCP timers more + * efficient. The roundup also protects short timers from expiring too + * early before they have a chance to be cancelled. + */ + tcpt->tcpt_tid = timeout_generic(CALLOUT_NORMAL, tcp_timer_callback, mp, + TICK_TO_NSEC(tim), CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP); + + return ((timeout_id_t)mp); +} + +static void +tcp_timer_callback(void *arg) +{ + mblk_t *mp = (mblk_t *)arg; + tcp_timer_t *tcpt; + conn_t *connp; + + tcpt = (tcp_timer_t *)mp->b_rptr; + connp = tcpt->connp; + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_timer_handler, connp, + NULL, SQ_FILL, SQTAG_TCP_TIMER); +} + +/* ARGSUSED */ +static void +tcp_timer_handler(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) +{ + tcp_timer_t *tcpt; + conn_t *connp = (conn_t *)arg; + tcp_t *tcp = connp->conn_tcp; + + tcpt = (tcp_timer_t *)mp->b_rptr; + ASSERT(connp == tcpt->connp); + ASSERT((squeue_t *)arg2 == connp->conn_sqp); + + /* + * If the TCP has reached the closed state, don't proceed any + * further. This TCP logically does not exist on the system. + * tcpt_proc could for example access queues, that have already + * been qprocoff'ed off. + */ + if (tcp->tcp_state != TCPS_CLOSED) { + (*tcpt->tcpt_proc)(connp); + } else { + tcp->tcp_timer_tid = 0; + } + tcp_timer_free(connp->conn_tcp, mp); +} + +/* + * There is potential race with untimeout and the handler firing at the same + * time. The mblock may be freed by the handler while we are trying to use + * it. But since both should execute on the same squeue, this race should not + * occur. + */ +clock_t +tcp_timeout_cancel(conn_t *connp, timeout_id_t id) +{ + mblk_t *mp = (mblk_t *)id; + tcp_timer_t *tcpt; + clock_t delta; + + TCP_DBGSTAT(connp->conn_tcp->tcp_tcps, tcp_timeout_cancel_reqs); + + if (mp == NULL) + return (-1); + + tcpt = (tcp_timer_t *)mp->b_rptr; + ASSERT(tcpt->connp == connp); + + delta = untimeout_default(tcpt->tcpt_tid, 0); + + if (delta >= 0) { + TCP_DBGSTAT(connp->conn_tcp->tcp_tcps, tcp_timeout_canceled); + tcp_timer_free(connp->conn_tcp, mp); + CONN_DEC_REF(connp); + } + + return (delta); +} + +/* + * Allocate space for the timer event. The allocation looks like mblk, but it is + * not a proper mblk. To avoid confusion we set b_wptr to NULL. + * + * Dealing with failures: If we can't allocate from the timer cache we try + * allocating from dblock caches using allocb_tryhard(). In this case b_wptr + * points to b_rptr. 
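The per-connection timer mblk cache described above is a tiny LIFO: at most two entries are kept, frees prefer the cache, and allocations drain it before falling back to the allocator. A generic userland analogue of that policy, where node_t and owner_t are illustrative rather than the kernel structures:

	#include <stdlib.h>

	typedef struct node {
		struct node *n_next;
		/* payload ... */
	} node_t;

	typedef struct owner {
		node_t *o_cache;		/* at most two cached nodes */
	} owner_t;

	node_t *
	node_alloc(owner_t *o)
	{
		node_t *n;

		if ((n = o->o_cache) != NULL) {		/* cache hit */
			o->o_cache = n->n_next;
			n->n_next = NULL;
			return (n);
		}
		return (calloc(1, sizeof (node_t)));	/* fall back to allocator */
	}

	void
	node_free(owner_t *o, node_t *n)
	{
		if (o->o_cache == NULL || o->o_cache->n_next == NULL) {
			n->n_next = o->o_cache;		/* keep up to two entries */
			o->o_cache = n;
		} else {
			free(n);
		}
	}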
+ * If we can't allocate anything using allocb_tryhard(), we perform a last + * attempt and use kmem_alloc_tryhard(). In this case we set b_wptr to -1 and + * save the actual allocation size in b_datap. + */ +mblk_t * +tcp_timermp_alloc(int kmflags) +{ + mblk_t *mp = (mblk_t *)kmem_cache_alloc(tcp_timercache, + kmflags & ~KM_PANIC); + + if (mp != NULL) { + mp->b_next = mp->b_prev = NULL; + mp->b_rptr = (uchar_t *)(&mp[1]); + mp->b_wptr = NULL; + mp->b_datap = NULL; + mp->b_queue = NULL; + mp->b_cont = NULL; + } else if (kmflags & KM_PANIC) { + /* + * Failed to allocate memory for the timer. Try allocating from + * dblock caches. + */ + /* ipclassifier calls this from a constructor - hence no tcps */ + TCP_G_STAT(tcp_timermp_allocfail); + mp = allocb_tryhard(sizeof (tcp_timer_t)); + if (mp == NULL) { + size_t size = 0; + /* + * Memory is really low. Try tryhard allocation. + * + * ipclassifier calls this from a constructor - + * hence no tcps + */ + TCP_G_STAT(tcp_timermp_allocdblfail); + mp = kmem_alloc_tryhard(sizeof (mblk_t) + + sizeof (tcp_timer_t), &size, kmflags); + mp->b_rptr = (uchar_t *)(&mp[1]); + mp->b_next = mp->b_prev = NULL; + mp->b_wptr = (uchar_t *)-1; + mp->b_datap = (dblk_t *)size; + mp->b_queue = NULL; + mp->b_cont = NULL; + } + ASSERT(mp->b_wptr != NULL); + } + /* ipclassifier calls this from a constructor - hence no tcps */ + TCP_G_DBGSTAT(tcp_timermp_alloced); + + return (mp); +} + +/* + * Free per-tcp timer cache. + * It can only contain entries from tcp_timercache. + */ +void +tcp_timermp_free(tcp_t *tcp) +{ + mblk_t *mp; + + while ((mp = tcp->tcp_timercache) != NULL) { + ASSERT(mp->b_wptr == NULL); + tcp->tcp_timercache = tcp->tcp_timercache->b_next; + kmem_cache_free(tcp_timercache, mp); + } +} + +/* + * Free timer event. Put it on the per-tcp timer cache if there is not too many + * events there already (currently at most two events are cached). + * If the event is not allocated from the timer cache, free it right away. + */ +static void +tcp_timer_free(tcp_t *tcp, mblk_t *mp) +{ + mblk_t *mp1 = tcp->tcp_timercache; + + if (mp->b_wptr != NULL) { + /* + * This allocation is not from a timer cache, free it right + * away. + */ + if (mp->b_wptr != (uchar_t *)-1) + freeb(mp); + else + kmem_free(mp, (size_t)mp->b_datap); + } else if (mp1 == NULL || mp1->b_next == NULL) { + /* Cache this timer block for future allocations */ + mp->b_rptr = (uchar_t *)(&mp[1]); + mp->b_next = mp1; + tcp->tcp_timercache = mp; + } else { + kmem_cache_free(tcp_timercache, mp); + TCP_DBGSTAT(tcp->tcp_tcps, tcp_timermp_freed); + } +} + +/* + * Stop all TCP timers. + */ +void +tcp_timers_stop(tcp_t *tcp) +{ + if (tcp->tcp_timer_tid != 0) { + (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid); + tcp->tcp_timer_tid = 0; + } + if (tcp->tcp_ka_tid != 0) { + (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ka_tid); + tcp->tcp_ka_tid = 0; + } + if (tcp->tcp_ack_tid != 0) { + (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid); + tcp->tcp_ack_tid = 0; + } + if (tcp->tcp_push_tid != 0) { + (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid); + tcp->tcp_push_tid = 0; + } + if (tcp->tcp_reass_tid != 0) { + (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_reass_tid); + tcp->tcp_reass_tid = 0; + } +} + +/* + * Timer callback routine for keepalive probe. We do a fake resend of + * last ACKed byte. Then set a timer using RTO. When the timer expires, + * check to see if we have heard anything from the other end for the last + * RTO period. If we have, set the timer to expire for another + * tcp_keepalive_intrvl and check again. 
If we have not, set a timer using + * RTO << 1 and check again when it expires. Keep exponentially increasing + * the timeout if we have not heard from the other side. If for more than + * (tcp_ka_interval + tcp_ka_abort_thres) we have not heard anything, + * kill the connection unless the keepalive abort threshold is 0. In + * that case, we will probe "forever." + */ +void +tcp_keepalive_timer(void *arg) +{ + mblk_t *mp; + conn_t *connp = (conn_t *)arg; + tcp_t *tcp = connp->conn_tcp; + int32_t firetime; + int32_t idletime; + int32_t ka_intrvl; + tcp_stack_t *tcps = tcp->tcp_tcps; + + tcp->tcp_ka_tid = 0; + + if (tcp->tcp_fused) + return; + + TCPS_BUMP_MIB(tcps, tcpTimKeepalive); + ka_intrvl = tcp->tcp_ka_interval; + + /* + * Keepalive probe should only be sent if the application has not + * done a close on the connection. + */ + if (tcp->tcp_state > TCPS_CLOSE_WAIT) { + return; + } + /* Timer fired too early, restart it. */ + if (tcp->tcp_state < TCPS_ESTABLISHED) { + tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer, + MSEC_TO_TICK(ka_intrvl)); + return; + } + + idletime = TICK_TO_MSEC(ddi_get_lbolt() - tcp->tcp_last_recv_time); + /* + * If we have not heard from the other side for a long + * time, kill the connection unless the keepalive abort + * threshold is 0. In that case, we will probe "forever." + */ + if (tcp->tcp_ka_abort_thres != 0 && + idletime > (ka_intrvl + tcp->tcp_ka_abort_thres)) { + TCPS_BUMP_MIB(tcps, tcpTimKeepaliveDrop); + (void) tcp_clean_death(tcp, tcp->tcp_client_errno ? + tcp->tcp_client_errno : ETIMEDOUT); + return; + } + + if (tcp->tcp_snxt == tcp->tcp_suna && + idletime >= ka_intrvl) { + /* Fake resend of last ACKed byte. */ + mblk_t *mp1 = allocb(1, BPRI_LO); + + if (mp1 != NULL) { + *mp1->b_wptr++ = '\0'; + mp = tcp_xmit_mp(tcp, mp1, 1, NULL, NULL, + tcp->tcp_suna - 1, B_FALSE, NULL, B_TRUE); + freeb(mp1); + /* + * if allocation failed, fall through to start the + * timer back. + */ + if (mp != NULL) { + tcp_send_data(tcp, mp); + TCPS_BUMP_MIB(tcps, tcpTimKeepaliveProbe); + if (tcp->tcp_ka_last_intrvl != 0) { + int max; + /* + * We should probe again at least + * in ka_intrvl, but not more than + * tcp_rexmit_interval_max. + */ + max = tcps->tcps_rexmit_interval_max; + firetime = MIN(ka_intrvl - 1, + tcp->tcp_ka_last_intrvl << 1); + if (firetime > max) + firetime = max; + } else { + firetime = tcp->tcp_rto; + } + tcp->tcp_ka_tid = TCP_TIMER(tcp, + tcp_keepalive_timer, + MSEC_TO_TICK(firetime)); + tcp->tcp_ka_last_intrvl = firetime; + return; + } + } + } else { + tcp->tcp_ka_last_intrvl = 0; + } + + /* firetime can be negative if (mp1 == NULL || mp == NULL) */ + if ((firetime = ka_intrvl - idletime) < 0) { + firetime = ka_intrvl; + } + tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer, + MSEC_TO_TICK(firetime)); +} + +void +tcp_reass_timer(void *arg) +{ + conn_t *connp = (conn_t *)arg; + tcp_t *tcp = connp->conn_tcp; + + tcp->tcp_reass_tid = 0; + if (tcp->tcp_reass_head == NULL) + return; + ASSERT(tcp->tcp_reass_tail != NULL); + if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) { + tcp_sack_remove(tcp->tcp_sack_list, + TCP_REASS_END(tcp->tcp_reass_tail), &tcp->tcp_num_sack_blk); + } + tcp_close_mpp(&tcp->tcp_reass_head); + tcp->tcp_reass_tail = NULL; + TCP_STAT(tcp->tcp_tcps, tcp_reass_timeout); +} + +/* This function handles the push timeout. 
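The probe pacing in tcp_keepalive_timer() above doubles the previous interval, but never waits much longer than one keepalive interval and never exceeds the retransmit maximum. The policy in isolation looks like the sketch below; the function and parameter names are mine, with all values in milliseconds:

	#include <stdint.h>

	int32_t
	next_probe_interval(int32_t ka_intrvl, int32_t last_intrvl, int32_t rto,
	    int32_t rexmit_max)
	{
		int32_t firetime;

		if (last_intrvl == 0)
			return (rto);		/* first unanswered probe */
		firetime = last_intrvl << 1;	/* exponential backoff */
		if (firetime > ka_intrvl - 1)	/* MIN(ka_intrvl - 1, last << 1) */
			firetime = ka_intrvl - 1;
		if (firetime > rexmit_max)	/* capped at the rexmit maximum */
			firetime = rexmit_max;
		return (firetime);
	}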
*/ +void +tcp_push_timer(void *arg) +{ + conn_t *connp = (conn_t *)arg; + tcp_t *tcp = connp->conn_tcp; + + TCP_DBGSTAT(tcp->tcp_tcps, tcp_push_timer_cnt); + + ASSERT(tcp->tcp_listener == NULL); + + ASSERT(!IPCL_IS_NONSTR(connp)); + + tcp->tcp_push_tid = 0; + + if (tcp->tcp_rcv_list != NULL && + tcp_rcv_drain(tcp) == TH_ACK_NEEDED) + tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK); +} + +/* + * This function handles delayed ACK timeout. + */ +void +tcp_ack_timer(void *arg) +{ + conn_t *connp = (conn_t *)arg; + tcp_t *tcp = connp->conn_tcp; + mblk_t *mp; + tcp_stack_t *tcps = tcp->tcp_tcps; + + TCP_DBGSTAT(tcps, tcp_ack_timer_cnt); + + tcp->tcp_ack_tid = 0; + + if (tcp->tcp_fused) + return; + + /* + * Do not send ACK if there is no outstanding unack'ed data. + */ + if (tcp->tcp_rnxt == tcp->tcp_rack) { + return; + } + + if ((tcp->tcp_rnxt - tcp->tcp_rack) > tcp->tcp_mss) { + /* + * Make sure we don't allow deferred ACKs to result in + * timer-based ACKing. If we have held off an ACK + * when there was more than an mss here, and the timer + * goes off, we have to worry about the possibility + * that the sender isn't doing slow-start, or is out + * of step with us for some other reason. We fall + * permanently back in the direction of + * ACK-every-other-packet as suggested in RFC 1122. + */ + if (tcp->tcp_rack_abs_max > 2) + tcp->tcp_rack_abs_max--; + tcp->tcp_rack_cur_max = 2; + } + mp = tcp_ack_mp(tcp); + + if (mp != NULL) { + BUMP_LOCAL(tcp->tcp_obsegs); + TCPS_BUMP_MIB(tcps, tcpOutAck); + TCPS_BUMP_MIB(tcps, tcpOutAckDelayed); + tcp_send_data(tcp, mp); + } +} + +/* + * Notify IP that we are having trouble with this connection. IP should + * make note so it can potentially use a different IRE. + */ +static void +tcp_ip_notify(tcp_t *tcp) +{ + conn_t *connp = tcp->tcp_connp; + ire_t *ire; + + /* + * Note: in the case of source routing we want to blow away the + * route to the first source route hop. + */ + ire = connp->conn_ixa->ixa_ire; + if (ire != NULL && !(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) { + if (ire->ire_ipversion == IPV4_VERSION) { + /* + * As per RFC 1122, we send an RTM_LOSING to inform + * routing protocols. + */ + ip_rts_change(RTM_LOSING, ire->ire_addr, + ire->ire_gateway_addr, ire->ire_mask, + connp->conn_laddr_v4, 0, 0, 0, + (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_IFA), + ire->ire_ipst); + } + (void) ire_no_good(ire); + } +} + +/* + * tcp_timer is the timer service routine. It handles the retransmission, + * FIN_WAIT_2 flush, and zero window probe timeout events. It figures out + * from the state of the tcp instance what kind of action needs to be done + * at the time it is called. 
+ */ +void +tcp_timer(void *arg) +{ + mblk_t *mp; + clock_t first_threshold; + clock_t second_threshold; + clock_t ms; + uint32_t mss; + conn_t *connp = (conn_t *)arg; + tcp_t *tcp = connp->conn_tcp; + tcp_stack_t *tcps = tcp->tcp_tcps; + + tcp->tcp_timer_tid = 0; + + if (tcp->tcp_fused) + return; + + first_threshold = tcp->tcp_first_timer_threshold; + second_threshold = tcp->tcp_second_timer_threshold; + switch (tcp->tcp_state) { + case TCPS_IDLE: + case TCPS_BOUND: + case TCPS_LISTEN: + return; + case TCPS_SYN_RCVD: { + tcp_t *listener = tcp->tcp_listener; + + if (tcp->tcp_syn_rcvd_timeout == 0 && (listener != NULL)) { + /* it's our first timeout */ + tcp->tcp_syn_rcvd_timeout = 1; + mutex_enter(&listener->tcp_eager_lock); + listener->tcp_syn_rcvd_timeout++; + if (!tcp->tcp_dontdrop && !tcp->tcp_closemp_used) { + /* + * Make this eager available for drop if we + * need to drop one to accomodate a new + * incoming SYN request. + */ + MAKE_DROPPABLE(listener, tcp); + } + if (!listener->tcp_syn_defense && + (listener->tcp_syn_rcvd_timeout > + (tcps->tcps_conn_req_max_q0 >> 2)) && + (tcps->tcps_conn_req_max_q0 > 200)) { + /* We may be under attack. Put on a defense. */ + listener->tcp_syn_defense = B_TRUE; + cmn_err(CE_WARN, "High TCP connect timeout " + "rate! System (port %d) may be under a " + "SYN flood attack!", + ntohs(listener->tcp_connp->conn_lport)); + + listener->tcp_ip_addr_cache = kmem_zalloc( + IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t), + KM_NOSLEEP); + } + mutex_exit(&listener->tcp_eager_lock); + } else if (listener != NULL) { + mutex_enter(&listener->tcp_eager_lock); + tcp->tcp_syn_rcvd_timeout++; + if (tcp->tcp_syn_rcvd_timeout > 1 && + !tcp->tcp_closemp_used) { + /* + * This is our second timeout. Put the tcp in + * the list of droppable eagers to allow it to + * be dropped, if needed. We don't check + * whether tcp_dontdrop is set or not to + * protect ourselve from a SYN attack where a + * remote host can spoof itself as one of the + * good IP source and continue to hold + * resources too long. + */ + MAKE_DROPPABLE(listener, tcp); + } + mutex_exit(&listener->tcp_eager_lock); + } + } + /* FALLTHRU */ + case TCPS_SYN_SENT: + first_threshold = tcp->tcp_first_ctimer_threshold; + second_threshold = tcp->tcp_second_ctimer_threshold; + break; + case TCPS_ESTABLISHED: + case TCPS_FIN_WAIT_1: + case TCPS_CLOSING: + case TCPS_CLOSE_WAIT: + case TCPS_LAST_ACK: + /* If we have data to rexmit */ + if (tcp->tcp_suna != tcp->tcp_snxt) { + clock_t time_to_wait; + + TCPS_BUMP_MIB(tcps, tcpTimRetrans); + if (!tcp->tcp_xmit_head) + break; + time_to_wait = ddi_get_lbolt() - + (clock_t)tcp->tcp_xmit_head->b_prev; + time_to_wait = tcp->tcp_rto - + TICK_TO_MSEC(time_to_wait); + /* + * If the timer fires too early, 1 clock tick earlier, + * restart the timer. + */ + if (time_to_wait > msec_per_tick) { + TCP_STAT(tcps, tcp_timer_fire_early); + TCP_TIMER_RESTART(tcp, time_to_wait); + return; + } + /* + * When we probe zero windows, we force the swnd open. + * If our peer acks with a closed window swnd will be + * set to zero by tcp_rput(). As long as we are + * receiving acks tcp_rput will + * reset 'tcp_ms_we_have_waited' so as not to trip the + * first and second interval actions. NOTE: the timer + * interval is allowed to continue its exponential + * backoff. + */ + if (tcp->tcp_swnd == 0 || tcp->tcp_zero_win_probe) { + if (connp->conn_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, + SL_TRACE, "tcp_timer: zero win"); + } + } else { + /* + * After retransmission, we need to do + * slow start. 
Set the ssthresh to one + * half of current effective window and + * cwnd to one MSS. Also reset + * tcp_cwnd_cnt. + * + * Note that if tcp_ssthresh is reduced because + * of ECN, do not reduce it again unless it is + * already one window of data away (tcp_cwr + * should then be cleared) or this is a + * timeout for a retransmitted segment. + */ + uint32_t npkt; + + if (!tcp->tcp_cwr || tcp->tcp_rexmit) { + npkt = ((tcp->tcp_timer_backoff ? + tcp->tcp_cwnd_ssthresh : + tcp->tcp_snxt - + tcp->tcp_suna) >> 1) / tcp->tcp_mss; + tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) * + tcp->tcp_mss; + } + tcp->tcp_cwnd = tcp->tcp_mss; + tcp->tcp_cwnd_cnt = 0; + if (tcp->tcp_ecn_ok) { + tcp->tcp_cwr = B_TRUE; + tcp->tcp_cwr_snd_max = tcp->tcp_snxt; + tcp->tcp_ecn_cwr_sent = B_FALSE; + } + } + break; + } + /* + * We have something to send yet we cannot send. The + * reason can be: + * + * 1. Zero send window: we need to do zero window probe. + * 2. Zero cwnd: because of ECN, we need to "clock out + * segments. + * 3. SWS avoidance: receiver may have shrunk window, + * reset our knowledge. + * + * Note that condition 2 can happen with either 1 or + * 3. But 1 and 3 are exclusive. + */ + if (tcp->tcp_unsent != 0) { + /* + * Should not hold the zero-copy messages for too long. + */ + if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean) + tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp, + tcp->tcp_xmit_head, B_TRUE); + + if (tcp->tcp_cwnd == 0) { + /* + * Set tcp_cwnd to 1 MSS so that a + * new segment can be sent out. We + * are "clocking out" new data when + * the network is really congested. + */ + ASSERT(tcp->tcp_ecn_ok); + tcp->tcp_cwnd = tcp->tcp_mss; + } + if (tcp->tcp_swnd == 0) { + /* Extend window for zero window probe */ + tcp->tcp_swnd++; + tcp->tcp_zero_win_probe = B_TRUE; + TCPS_BUMP_MIB(tcps, tcpOutWinProbe); + } else { + /* + * Handle timeout from sender SWS avoidance. + * Reset our knowledge of the max send window + * since the receiver might have reduced its + * receive buffer. Avoid setting tcp_max_swnd + * to one since that will essentially disable + * the SWS checks. + * + * Note that since we don't have a SWS + * state variable, if the timeout is set + * for ECN but not for SWS, this + * code will also be executed. This is + * fine as tcp_max_swnd is updated + * constantly and it will not affect + * anything. + */ + tcp->tcp_max_swnd = MAX(tcp->tcp_swnd, 2); + } + tcp_wput_data(tcp, NULL, B_FALSE); + return; + } + /* Is there a FIN that needs to be to re retransmitted? */ + if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && + !tcp->tcp_fin_acked) + break; + /* Nothing to do, return without restarting timer. */ + TCP_STAT(tcps, tcp_timer_fire_miss); + return; + case TCPS_FIN_WAIT_2: + /* + * User closed the TCP endpoint and peer ACK'ed our FIN. + * We waited some time for for peer's FIN, but it hasn't + * arrived. We flush the connection now to avoid + * case where the peer has rebooted. + */ + if (TCP_IS_DETACHED(tcp)) { + (void) tcp_clean_death(tcp, 0); + } else { + TCP_TIMER_RESTART(tcp, + tcps->tcps_fin_wait_2_flush_interval); + } + return; + case TCPS_TIME_WAIT: + (void) tcp_clean_death(tcp, 0); + return; + default: + if (connp->conn_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR, + "tcp_timer: strange state (%d) %s", + tcp->tcp_state, tcp_display(tcp, NULL, + DISP_PORT_ONLY)); + } + return; + } + + /* + * If the system is under memory pressure or the max number of + * connections have been established for the listener, be more + * aggressive in aborting connections. 
+ */ + if (tcps->tcps_reclaim || (tcp->tcp_listen_cnt != NULL && + tcp->tcp_listen_cnt->tlc_cnt > tcp->tcp_listen_cnt->tlc_max)) { + second_threshold = tcp_early_abort * SECONDS; + } + + if ((ms = tcp->tcp_ms_we_have_waited) > second_threshold) { + /* + * Should not hold the zero-copy messages for too long. + */ + if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean) + tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp, + tcp->tcp_xmit_head, B_TRUE); + + /* + * For zero window probe, we need to send indefinitely, + * unless we have not heard from the other side for some + * time... + */ + if ((tcp->tcp_zero_win_probe == 0) || + (TICK_TO_MSEC(ddi_get_lbolt() - tcp->tcp_last_recv_time) > + second_threshold)) { + TCPS_BUMP_MIB(tcps, tcpTimRetransDrop); + /* + * If TCP is in SYN_RCVD state, send back a + * RST|ACK as BSD does. Note that tcp_zero_win_probe + * should be zero in TCPS_SYN_RCVD state. + */ + if (tcp->tcp_state == TCPS_SYN_RCVD) { + tcp_xmit_ctl("tcp_timer: RST sent on timeout " + "in SYN_RCVD", + tcp, tcp->tcp_snxt, + tcp->tcp_rnxt, TH_RST | TH_ACK); + } + (void) tcp_clean_death(tcp, + tcp->tcp_client_errno ? + tcp->tcp_client_errno : ETIMEDOUT); + return; + } else { + /* + * If the system is under memory pressure, we also + * abort connection in zero window probing. + */ + if (tcps->tcps_reclaim) { + (void) tcp_clean_death(tcp, + tcp->tcp_client_errno ? + tcp->tcp_client_errno : ETIMEDOUT); + TCP_STAT(tcps, tcp_zwin_mem_drop); + return; + } + /* + * Set tcp_ms_we_have_waited to second_threshold + * so that in next timeout, we will do the above + * check (ddi_get_lbolt() - tcp_last_recv_time). + * This is also to avoid overflow. + * + * We don't need to decrement tcp_timer_backoff + * to avoid overflow because it will be decremented + * later if new timeout value is greater than + * tcp_rexmit_interval_max. In the case when + * tcp_rexmit_interval_max is greater than + * second_threshold, it means that we will wait + * longer than second_threshold to send the next + * window probe. + */ + tcp->tcp_ms_we_have_waited = second_threshold; + } + } else if (ms > first_threshold) { + /* + * Should not hold the zero-copy messages for too long. + */ + if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean) + tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp, + tcp->tcp_xmit_head, B_TRUE); + + /* + * We have been retransmitting for too long... The RTT + * we calculated is probably incorrect. Reinitialize it. + * Need to compensate for 0 tcp_rtt_sa. Reset + * tcp_rtt_update so that we won't accidentally cache a + * bad value. But only do this if this is not a zero + * window probe. + */ + if (tcp->tcp_rtt_sa != 0 && tcp->tcp_zero_win_probe == 0) { + tcp->tcp_rtt_sd += (tcp->tcp_rtt_sa >> 3) + + (tcp->tcp_rtt_sa >> 5); + tcp->tcp_rtt_sa = 0; + tcp_ip_notify(tcp); + tcp->tcp_rtt_update = 0; + } + } + tcp->tcp_timer_backoff++; + if ((ms = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + + tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5)) < + tcps->tcps_rexmit_interval_min) { + /* + * This means the original RTO is tcp_rexmit_interval_min. + * So we will use tcp_rexmit_interval_min as the RTO value + * and do the backoff. + */ + ms = tcps->tcps_rexmit_interval_min << tcp->tcp_timer_backoff; + } else { + ms <<= tcp->tcp_timer_backoff; + } + if (ms > tcps->tcps_rexmit_interval_max) { + ms = tcps->tcps_rexmit_interval_max; + /* + * ms is at max, decrement tcp_timer_backoff to avoid + * overflow. 
+ */ + tcp->tcp_timer_backoff--; + } + tcp->tcp_ms_we_have_waited += ms; + if (tcp->tcp_zero_win_probe == 0) { + tcp->tcp_rto = ms; + } + TCP_TIMER_RESTART(tcp, ms); + /* + * This is after a timeout and tcp_rto is backed off. Set + * tcp_set_timer to 1 so that next time RTO is updated, we will + * restart the timer with a correct value. + */ + tcp->tcp_set_timer = 1; + mss = tcp->tcp_snxt - tcp->tcp_suna; + if (mss > tcp->tcp_mss) + mss = tcp->tcp_mss; + if (mss > tcp->tcp_swnd && tcp->tcp_swnd != 0) + mss = tcp->tcp_swnd; + + if ((mp = tcp->tcp_xmit_head) != NULL) + mp->b_prev = (mblk_t *)ddi_get_lbolt(); + mp = tcp_xmit_mp(tcp, mp, mss, NULL, NULL, tcp->tcp_suna, B_TRUE, &mss, + B_TRUE); + + /* + * When slow start after retransmission begins, start with + * this seq no. tcp_rexmit_max marks the end of special slow + * start phase. tcp_snd_burst controls how many segments + * can be sent because of an ack. + */ + tcp->tcp_rexmit_nxt = tcp->tcp_suna; + tcp->tcp_snd_burst = TCP_CWND_SS; + if ((tcp->tcp_valid_bits & TCP_FSS_VALID) && + (tcp->tcp_unsent == 0)) { + tcp->tcp_rexmit_max = tcp->tcp_fss; + } else { + tcp->tcp_rexmit_max = tcp->tcp_snxt; + } + tcp->tcp_rexmit = B_TRUE; + tcp->tcp_dupack_cnt = 0; + + /* + * Remove all rexmit SACK blk to start from fresh. + */ + if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL) + TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list, tcp); + if (mp == NULL) { + return; + } + + tcp->tcp_csuna = tcp->tcp_snxt; + TCPS_BUMP_MIB(tcps, tcpRetransSegs); + TCPS_UPDATE_MIB(tcps, tcpRetransBytes, mss); + tcp_send_data(tcp, mp); + +} + +/* + * Handle lingering timeouts. This function is called when the SO_LINGER timeout + * expires. + */ +void +tcp_close_linger_timeout(void *arg) +{ + conn_t *connp = (conn_t *)arg; + tcp_t *tcp = connp->conn_tcp; + + tcp->tcp_client_errno = ETIMEDOUT; + tcp_stop_lingering(tcp); +} diff --git a/usr/src/uts/common/inet/tcp_cluster.h b/usr/src/uts/common/inet/tcp_cluster.h new file mode 100644 index 0000000000..90efc1be34 --- /dev/null +++ b/usr/src/uts/common/inet/tcp_cluster.h @@ -0,0 +1,128 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _INET_TCP_CLUSTER_H +#define _INET_TCP_CLUSTER_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +/* + * Cluster hooks defined in tcp_cluster.c. 
+ */ +extern void (*cl_inet_listen)(netstackid_t, uint8_t, sa_family_t, uint8_t *, + in_port_t, void *); +extern void (*cl_inet_unlisten)(netstackid_t, uint8_t, sa_family_t, uint8_t *, + in_port_t, void *); +extern int (*cl_inet_connect2)(netstackid_t, uint8_t, boolean_t, sa_family_t, + uint8_t *, in_port_t, uint8_t *, in_port_t, void *); +extern void (*cl_inet_disconnect)(netstackid_t, uint8_t, sa_family_t, + uint8_t *, in_port_t, uint8_t *, in_port_t, void *); + + +/* + * Cluster networking hook for traversing current connection list. + * This routine is used to extract the current list of live connections + * which must continue to to be dispatched to this node. + */ +extern int cl_tcp_walk_list(netstackid_t, + int (*callback)(cl_tcp_info_t *, void *), void *); + +/* + * int CL_INET_CONNECT(conn_t *cp, tcp_t *tcp, boolean_t is_outgoing, int err) + */ +#define CL_INET_CONNECT(connp, is_outgoing, err) { \ + (err) = 0; \ + if (cl_inet_connect2 != NULL) { \ + /* \ + * Running in cluster mode - register active connection \ + * information \ + */ \ + if ((connp)->conn_ipversion == IPV4_VERSION) { \ + if ((connp)->conn_laddr_v4 != 0) { \ + (err) = (*cl_inet_connect2)( \ + (connp)->conn_netstack->netstack_stackid,\ + IPPROTO_TCP, is_outgoing, AF_INET, \ + (uint8_t *)(&((connp)->conn_laddr_v4)),\ + (in_port_t)(connp)->conn_lport, \ + (uint8_t *)(&((connp)->conn_faddr_v4)),\ + (in_port_t)(connp)->conn_fport, NULL); \ + } \ + } else { \ + if (!IN6_IS_ADDR_UNSPECIFIED( \ + &(connp)->conn_laddr_v6)) { \ + (err) = (*cl_inet_connect2)( \ + (connp)->conn_netstack->netstack_stackid,\ + IPPROTO_TCP, is_outgoing, AF_INET6, \ + (uint8_t *)(&((connp)->conn_laddr_v6)),\ + (in_port_t)(connp)->conn_lport, \ + (uint8_t *)(&((connp)->conn_faddr_v6)), \ + (in_port_t)(connp)->conn_fport, NULL); \ + } \ + } \ + } \ +} + +#define CL_INET_DISCONNECT(connp) { \ + if (cl_inet_disconnect != NULL) { \ + /* \ + * Running in cluster mode - deregister active \ + * connection information \ + */ \ + if ((connp)->conn_ipversion == IPV4_VERSION) { \ + if ((connp)->conn_laddr_v4 != 0) { \ + (*cl_inet_disconnect)( \ + (connp)->conn_netstack->netstack_stackid,\ + IPPROTO_TCP, AF_INET, \ + (uint8_t *)(&((connp)->conn_laddr_v4)),\ + (in_port_t)(connp)->conn_lport, \ + (uint8_t *)(&((connp)->conn_faddr_v4)),\ + (in_port_t)(connp)->conn_fport, NULL); \ + } \ + } else { \ + if (!IN6_IS_ADDR_UNSPECIFIED( \ + &(connp)->conn_laddr_v6)) { \ + (*cl_inet_disconnect)( \ + (connp)->conn_netstack->netstack_stackid,\ + IPPROTO_TCP, AF_INET6, \ + (uint8_t *)(&((connp)->conn_laddr_v6)),\ + (in_port_t)(connp)->conn_lport, \ + (uint8_t *)(&((connp)->conn_faddr_v6)), \ + (in_port_t)(connp)->conn_fport, NULL); \ + } \ + } \ + } \ +} + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _INET_TCP_CLUSTER_H */ diff --git a/usr/src/uts/common/inet/tcp_impl.h b/usr/src/uts/common/inet/tcp_impl.h index a54557cee1..2ee2b6cb39 100644 --- a/usr/src/uts/common/inet/tcp_impl.h +++ b/usr/src/uts/common/inet/tcp_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -39,11 +39,55 @@ extern "C" { #ifdef _KERNEL +#include <sys/cpuvar.h> +#include <sys/clock_impl.h> /* For LBOLT_FASTPATH{,64} */ #include <inet/optcom.h> #include <inet/tcp.h> #define TCP_MOD_ID 5105 +extern struct qinit tcp_sock_winit; +extern struct qinit tcp_winit; + +extern sock_downcalls_t sock_tcp_downcalls; + +/* + * Bind hash list size and has function. It has to be a power of 2 for + * hashing. + */ +#define TCP_BIND_FANOUT_SIZE 512 +#define TCP_BIND_HASH(lport) (ntohs(lport) & (TCP_BIND_FANOUT_SIZE - 1)) + +/* + * This implementation follows the 4.3BSD interpretation of the urgent + * pointer and not RFC 1122. Switching to RFC 1122 behavior would cause + * incompatible changes in protocols like telnet and rlogin. + */ +#define TCP_OLD_URP_INTERPRETATION 1 + +/* Handy time related macros. */ +#define MS 1L +#define SECONDS (1000 * MS) +#define MINUTES (60 * SECONDS) +#define HOURS (60 * MINUTES) +#define DAYS (24 * HOURS) + +/* TCP option length */ +#define TCPOPT_NOP_LEN 1 +#define TCPOPT_MAXSEG_LEN 4 +#define TCPOPT_WS_LEN 3 +#define TCPOPT_REAL_WS_LEN (TCPOPT_WS_LEN+1) +#define TCPOPT_TSTAMP_LEN 10 +#define TCPOPT_REAL_TS_LEN (TCPOPT_TSTAMP_LEN+2) +#define TCPOPT_SACK_OK_LEN 2 +#define TCPOPT_REAL_SACK_OK_LEN (TCPOPT_SACK_OK_LEN+2) +#define TCPOPT_REAL_SACK_LEN 4 +#define TCPOPT_MAX_SACK_LEN 36 +#define TCPOPT_HEADER_LEN 2 + +/* Round up the value to the nearest mss. */ +#define MSS_ROUNDUP(value, mss) ((((value) - 1) / (mss) + 1) * (mss)) + /* * Was this tcp created via socket() interface? */ @@ -54,6 +98,19 @@ extern "C" { */ #define TCP_IS_DETACHED(tcp) ((tcp)->tcp_detached) +/* TCP timers related data strucutres. Refer to tcp_timers.c. */ +typedef struct tcp_timer_s { + conn_t *connp; + void (*tcpt_proc)(void *); + callout_id_t tcpt_tid; +} tcp_timer_t; + +extern kmem_cache_t *tcp_timercache; + +/* + * Macro for starting various timers. Retransmission timer has its own macro, + * TCP_TIMER_RESTART(). + */ #define TCP_TIMER(tcp, f, tim) \ tcp_timeout(tcp->tcp_connp, f, tim) #define TCP_TIMER_CANCEL(tcp, id) \ @@ -70,6 +127,119 @@ extern "C" { } /* + * For scalability, we must not run a timer for every TCP connection + * in TIME_WAIT state. To see why, consider (for time wait interval of + * 1 minutes): + * 10,000 connections/sec * 60 seconds/time wait = 600,000 active conn's + * + * This list is ordered by time, so you need only delete from the head + * until you get to entries which aren't old enough to delete yet. + * The list consists of only the detached TIME_WAIT connections. + * + * When a tcp_t enters TIME_WAIT state, a timer is started (timeout is + * tcps_time_wait_interval). When the tcp_t is detached (upper layer closes + * the end point), it is moved to the time wait list and another timer is + * started (expiry time is set at tcp_time_wait_expire, which is + * also calculated using tcps_time_wait_interval). This means that the + * TIME_WAIT state can be extended (up to doubled) if the tcp_t doesn't + * become detached for a long time. + * + * The list manipulations (including tcp_time_wait_next/prev) + * are protected by the tcp_time_wait_lock. The content of the + * detached TIME_WAIT connections is protected by the normal perimeters. + * + * This list is per squeue and squeues are shared across the tcp_stack_t's. + * Things on tcp_time_wait_head remain associated with the tcp_stack_t + * and conn_netstack. + * The tcp_t's that are added to tcp_free_list are disassociated and + * have NULL tcp_tcps and conn_netstack pointers. 
+ */ +typedef struct tcp_squeue_priv_s { + kmutex_t tcp_time_wait_lock; + callout_id_t tcp_time_wait_tid; + tcp_t *tcp_time_wait_head; + tcp_t *tcp_time_wait_tail; + tcp_t *tcp_free_list; + uint_t tcp_free_list_cnt; +} tcp_squeue_priv_t; + +/* + * Parameters for TCP Initial Send Sequence number (ISS) generation. When + * tcp_strong_iss is set to 1, which is the default, the ISS is calculated + * by adding three components: a time component which grows by 1 every 4096 + * nanoseconds (versus every 4 microseconds suggested by RFC 793, page 27); + * a per-connection component which grows by 125000 for every new connection; + * and an "extra" component that grows by a random amount centered + * approximately on 64000. This causes the ISS generator to cycle every + * 4.89 hours if no TCP connections are made, and faster if connections are + * made. + * + * When tcp_strong_iss is set to 0, ISS is calculated by adding two + * components: a time component which grows by 250000 every second; and + * a per-connection component which grows by 125000 for every new connections. + * + * A third method, when tcp_strong_iss is set to 2, for generating ISS is + * prescribed by Steve Bellovin. This involves adding time, the 125000 per + * connection, and a one-way hash (MD5) of the connection ID <sport, dport, + * src, dst>, a "truly" random (per RFC 1750) number, and a console-entered + * password. + */ +#define ISS_INCR 250000 +#define ISS_NSEC_SHT 12 + +/* Macros for timestamp comparisons */ +#define TSTMP_GEQ(a, b) ((int32_t)((a)-(b)) >= 0) +#define TSTMP_LT(a, b) ((int32_t)((a)-(b)) < 0) + +/* + * Initialize cwnd according to RFC 3390. def_max_init_cwnd is + * either tcp_slow_start_initial or tcp_slow_start_after idle + * depending on the caller. If the upper layer has not used the + * TCP_INIT_CWND option to change the initial cwnd, tcp_init_cwnd + * should be 0 and we use the formula in RFC 3390 to set tcp_cwnd. + * If the upper layer has changed set the tcp_init_cwnd, just use + * it to calculate the tcp_cwnd. + */ +#define TCP_SET_INIT_CWND(tcp, mss, def_max_init_cwnd) \ +{ \ + if ((tcp)->tcp_init_cwnd == 0) { \ + (tcp)->tcp_cwnd = MIN(def_max_init_cwnd * (mss), \ + MIN(4 * (mss), MAX(2 * (mss), 4380 / (mss) * (mss)))); \ + } else { \ + (tcp)->tcp_cwnd = (tcp)->tcp_init_cwnd * (mss); \ + } \ + tcp->tcp_cwnd_cnt = 0; \ +} + +/* + * Set ECN capable transport (ECT) code point in IP header. + * + * Note that there are 2 ECT code points '01' and '10', which are called + * ECT(1) and ECT(0) respectively. Here we follow the original ECT code + * point ECT(0) for TCP as described in RFC 2481. + */ +#define TCP_SET_ECT(tcp, iph) \ + if ((tcp)->tcp_connp->conn_ipversion == IPV4_VERSION) { \ + /* We need to clear the code point first. */ \ + ((ipha_t *)(iph))->ipha_type_of_service &= 0xFC; \ + ((ipha_t *)(iph))->ipha_type_of_service |= IPH_ECN_ECT0; \ + } else { \ + ((ip6_t *)(iph))->ip6_vcf &= htonl(0xFFCFFFFF); \ + ((ip6_t *)(iph))->ip6_vcf |= htonl(IPH_ECN_ECT0 << 20); \ + } + +/* + * TCP options struct returned from tcp_parse_options. 
+ */ +typedef struct tcp_opt_s { + uint32_t tcp_opt_mss; + uint32_t tcp_opt_wscale; + uint32_t tcp_opt_ts_val; + uint32_t tcp_opt_ts_ecr; + tcp_t *tcp; +} tcp_opt_t; + +/* * Write-side flow-control is implemented via the per instance STREAMS * write-side Q by explicitly setting QFULL to stop the flow of mblk_t(s) * and clearing QFULL and calling qbackenable() to restart the flow based @@ -97,6 +267,146 @@ extern void tcp_setqfull(tcp_t *); #define TCP_UNSENT_BYTES(tcp) \ ((tcp)->tcp_squeue_bytes + (tcp)->tcp_unsent) +/* + * Linked list struct to store listener connection limit configuration per + * IP stack. The list is stored at tcps_listener_conf in tcp_stack_t. + * + * tl_port: the listener port of this limit configuration + * tl_ratio: the maximum amount of memory consumed by all concurrent TCP + * connections created by a listener does not exceed 1/tl_ratio + * of the total system memory. Note that this is only an + * approximation. + * tl_link: linked list struct + */ +typedef struct tcp_listener_s { + in_port_t tl_port; + uint32_t tl_ratio; + list_node_t tl_link; +} tcp_listener_t; + +/* + * If there is a limit set on the number of connections allowed per each + * listener, the following struct is used to store that counter. It keeps + * the number of TCP connection created by a listener. Note that this needs + * to be separated from the listener since the listener can go away before + * all the connections are gone. + * + * When the struct is allocated, tlc_cnt is set to 1. When a new connection + * is created by the listener, tlc_cnt is incremented by 1. When a connection + * created by the listener goes away, tlc_count is decremented by 1. When the + * listener itself goes away, tlc_cnt is decremented by one. The last + * connection (or the listener) which decrements tlc_cnt to zero frees the + * struct. + * + * tlc_max is the threshold value tcps_conn_listen_port. It is set when the + * tcp_listen_cnt_t is allocated. + * + * tlc_report_time stores the time when cmn_err() is called to report that the + * max has been exceeeded. Report is done at most once every + * TCP_TLC_REPORT_INTERVAL mins for a listener. + * + * tlc_drop stores the number of connection attempt dropped because the + * limit has reached. + */ +typedef struct tcp_listen_cnt_s { + uint32_t tlc_max; + uint32_t tlc_cnt; + int64_t tlc_report_time; + uint32_t tlc_drop; +} tcp_listen_cnt_t; + +#define TCP_TLC_REPORT_INTERVAL (30 * MINUTES) + +#define TCP_DECR_LISTEN_CNT(tcp) \ +{ \ + ASSERT((tcp)->tcp_listen_cnt->tlc_cnt > 0); \ + if (atomic_add_32_nv(&(tcp)->tcp_listen_cnt->tlc_cnt, -1) == 0) \ + kmem_free((tcp)->tcp_listen_cnt, sizeof (tcp_listen_cnt_t)); \ + (tcp)->tcp_listen_cnt = NULL; \ +} + +/* Increment and decrement the number of connections in tcp_stack_t. */ +#define TCPS_CONN_INC(tcps) \ + atomic_inc_64( \ + (uint64_t *)&(tcps)->tcps_sc[CPU->cpu_seqid]->tcp_sc_conn_cnt) + +#define TCPS_CONN_DEC(tcps) \ + atomic_dec_64( \ + (uint64_t *)&(tcps)->tcps_sc[CPU->cpu_seqid]->tcp_sc_conn_cnt) + +/* + * When the system is under memory pressure, stack variable tcps_reclaim is + * true, we shorten the connection timeout abort interval to tcp_early_abort + * seconds. Defined in tcp.c. + */ +extern uint32_t tcp_early_abort; + +/* + * To reach to an eager in Q0 which can be dropped due to an incoming + * new SYN request when Q0 is full, a new doubly linked list is + * introduced. This list allows to select an eager from Q0 in O(1) time. 
+ * This is needed to avoid spending too much time walking through the + * long list of eagers in Q0 when tcp_drop_q0() is called. Each member of + * this new list has to be a member of Q0. + * This list is headed by listener's tcp_t. When the list is empty, + * both the pointers - tcp_eager_next_drop_q0 and tcp_eager_prev_drop_q0, + * of listener's tcp_t point to listener's tcp_t itself. + * + * Given an eager in Q0 and a listener, MAKE_DROPPABLE() puts the eager + * in the list. MAKE_UNDROPPABLE() takes the eager out of the list. + * These macros do not affect the eager's membership to Q0. + */ +#define MAKE_DROPPABLE(listener, eager) \ + if ((eager)->tcp_eager_next_drop_q0 == NULL) { \ + (listener)->tcp_eager_next_drop_q0->tcp_eager_prev_drop_q0\ + = (eager); \ + (eager)->tcp_eager_prev_drop_q0 = (listener); \ + (eager)->tcp_eager_next_drop_q0 = \ + (listener)->tcp_eager_next_drop_q0; \ + (listener)->tcp_eager_next_drop_q0 = (eager); \ + } + +#define MAKE_UNDROPPABLE(eager) \ + if ((eager)->tcp_eager_next_drop_q0 != NULL) { \ + (eager)->tcp_eager_next_drop_q0->tcp_eager_prev_drop_q0 \ + = (eager)->tcp_eager_prev_drop_q0; \ + (eager)->tcp_eager_prev_drop_q0->tcp_eager_next_drop_q0 \ + = (eager)->tcp_eager_next_drop_q0; \ + (eager)->tcp_eager_prev_drop_q0 = NULL; \ + (eager)->tcp_eager_next_drop_q0 = NULL; \ + } + +/* + * The format argument to pass to tcp_display(). + * DISP_PORT_ONLY means that the returned string has only port info. + * DISP_ADDR_AND_PORT means that the returned string also contains the + * remote and local IP address. + */ +#define DISP_PORT_ONLY 1 +#define DISP_ADDR_AND_PORT 2 + +#define IP_ADDR_CACHE_SIZE 2048 +#define IP_ADDR_CACHE_HASH(faddr) \ + (ntohl(faddr) & (IP_ADDR_CACHE_SIZE -1)) + +/* TCP cwnd burst factor. */ +#define TCP_CWND_INFINITE 65535 +#define TCP_CWND_SS 3 +#define TCP_CWND_NORMAL 5 + +/* + * TCP reassembly macros. We hide starting and ending sequence numbers in + * b_next and b_prev of messages on the reassembly queue. The messages are + * chained using b_cont. These macros are used in tcp_reass() so we don't + * have to see the ugly casts and assignments. + */ +#define TCP_REASS_SEQ(mp) ((uint32_t)(uintptr_t)((mp)->b_next)) +#define TCP_REASS_SET_SEQ(mp, u) ((mp)->b_next = \ + (mblk_t *)(uintptr_t)(u)) +#define TCP_REASS_END(mp) ((uint32_t)(uintptr_t)((mp)->b_prev)) +#define TCP_REASS_SET_END(mp, u) ((mp)->b_prev = \ + (mblk_t *)(uintptr_t)(u)) + /* Named Dispatch Parameter Management Structure */ typedef struct tcpparam_s { uint32_t tcp_param_min; @@ -170,16 +480,92 @@ typedef struct tcpparam_s { #define tcps_dev_flow_ctl tcps_params[57].tcp_param_val #define tcps_reass_timeout tcps_params[58].tcp_param_val +#define tcps_wroff_xtra tcps_wroff_xtra_param->tcp_param_val + extern struct qinit tcp_rinitv4, tcp_rinitv6; extern boolean_t do_tcp_fusion; +/* + * Object to represent database of options to search passed to + * {sock,tpi}optcom_req() interface routine to take care of option + * management and associated methods. + */ +extern optdb_obj_t tcp_opt_obj; +extern uint_t tcp_max_optsize; + +extern int tcp_squeue_flag; + +extern uint_t tcp_free_list_max_cnt; + +/* + * Functions in tcp.c. 
+ */ +extern int tcp_accept_common(conn_t *, conn_t *, cred_t *); +extern void tcp_accept_finish(void *, mblk_t *, void *, ip_recv_attr_t *); +extern void tcp_acceptor_hash_insert(t_uscalar_t, tcp_t *); +extern tcp_t *tcp_acceptor_hash_lookup(t_uscalar_t, tcp_stack_t *); +extern void tcp_acceptor_hash_remove(tcp_t *); +extern mblk_t *tcp_ack_mp(tcp_t *); +extern int tcp_build_hdrs(tcp_t *); +extern void tcp_cleanup(tcp_t *); +extern int tcp_clean_death(tcp_t *, int); +extern void tcp_clean_death_wrapper(void *, mblk_t *, void *, + ip_recv_attr_t *); +extern void tcp_close_common(conn_t *, int); +extern void tcp_close_detached(tcp_t *); +extern void tcp_close_mpp(mblk_t **); +extern void tcp_closei_local(tcp_t *); +extern sock_lower_handle_t tcp_create(int, int, int, sock_downcalls_t **, + uint_t *, int *, int, cred_t *); +extern conn_t *tcp_create_common(cred_t *, boolean_t, boolean_t, int *); +extern void tcp_disconnect(tcp_t *, mblk_t *); +extern char *tcp_display(tcp_t *, char *, char); +extern int tcp_do_bind(conn_t *, struct sockaddr *, socklen_t, cred_t *, + boolean_t); +extern int tcp_do_connect(conn_t *, const struct sockaddr *, socklen_t, + cred_t *, pid_t); +extern int tcp_do_listen(conn_t *, struct sockaddr *, socklen_t, int, + cred_t *, boolean_t); +extern int tcp_do_unbind(conn_t *); +extern boolean_t tcp_eager_blowoff(tcp_t *, t_scalar_t); +extern void tcp_eager_cleanup(tcp_t *, boolean_t); +extern void tcp_eager_kill(void *, mblk_t *, void *, ip_recv_attr_t *); +extern void tcp_eager_unlink(tcp_t *); +extern int tcp_getpeername(sock_lower_handle_t, struct sockaddr *, + socklen_t *, cred_t *); +extern int tcp_getsockname(sock_lower_handle_t, struct sockaddr *, + socklen_t *, cred_t *); +extern void tcp_init_values(tcp_t *); +extern void tcp_ipsec_cleanup(tcp_t *); extern int tcp_maxpsz_set(tcp_t *, boolean_t); -extern void tcp_timers_stop(tcp_t *); -extern void tcp_rcv_enqueue(tcp_t *, mblk_t *, uint_t, cred_t *); -extern void tcp_push_timer(void *); -extern timeout_id_t tcp_timeout(conn_t *, void (*)(void *), clock_t); -extern clock_t tcp_timeout_cancel(conn_t *, timeout_id_t); +extern void tcp_mss_set(tcp_t *, uint32_t); +extern void tcp_reinput(conn_t *, mblk_t *, ip_recv_attr_t *, ip_stack_t *); +extern void tcp_rsrv(queue_t *); +extern uint_t tcp_rwnd_reopen(tcp_t *); +extern int tcp_rwnd_set(tcp_t *, uint32_t); +extern int tcp_set_destination(tcp_t *); +extern void tcp_set_ws_value(tcp_t *); +extern void tcp_stop_lingering(tcp_t *); +extern void tcp_update_pmtu(tcp_t *, boolean_t); +extern mblk_t *tcp_zcopy_backoff(tcp_t *, mblk_t *, boolean_t); +extern boolean_t tcp_zcopy_check(tcp_t *); +extern void tcp_zcopy_notify(tcp_t *); + +/* + * Bind related functions in tcp_bind.c + */ +extern int tcp_bind_check(conn_t *, struct sockaddr *, socklen_t, + cred_t *, boolean_t); +extern void tcp_bind_hash_insert(tf_t *, tcp_t *, int); +extern void tcp_bind_hash_remove(tcp_t *); +extern in_port_t tcp_bindi(tcp_t *, in_port_t, const in6_addr_t *, + int, boolean_t, boolean_t, boolean_t); +extern in_port_t tcp_update_next_port(in_port_t, const tcp_t *, + boolean_t); +/* + * Fusion related functions in tcp_fusion.c. 
+ */ extern void tcp_fuse(tcp_t *, uchar_t *, tcpha_t *); extern void tcp_unfuse(tcp_t *); extern boolean_t tcp_fuse_output(tcp_t *, mblk_t *, uint32_t); @@ -188,28 +574,120 @@ extern boolean_t tcp_fuse_rcv_drain(queue_t *, tcp_t *, mblk_t **); extern size_t tcp_fuse_set_rcv_hiwat(tcp_t *, size_t); extern int tcp_fuse_maxpsz(tcp_t *); extern void tcp_fuse_backenable(tcp_t *); -extern int tcp_rwnd_set(tcp_t *, uint32_t); /* - * Object to represent database of options to search passed to - * {sock,tpi}optcom_req() interface routine to take care of option - * management and associated methods. + * Output related functions in tcp_output.c. */ -extern optdb_obj_t tcp_opt_obj; -extern uint_t tcp_max_optsize; +extern void tcp_close_output(void *, mblk_t *, void *, ip_recv_attr_t *); +extern void tcp_output(void *, mblk_t *, void *, ip_recv_attr_t *); +extern void tcp_output_urgent(void *, mblk_t *, void *, ip_recv_attr_t *); +extern void tcp_rexmit_after_error(tcp_t *); +extern void tcp_sack_rexmit(tcp_t *, uint_t *); +extern void tcp_send_data(tcp_t *, mblk_t *); +extern void tcp_send_synack(void *, mblk_t *, void *, ip_recv_attr_t *); +extern void tcp_shutdown_output(void *, mblk_t *, void *, ip_recv_attr_t *); +extern void tcp_ss_rexmit(tcp_t *); +extern void tcp_update_xmit_tail(tcp_t *, uint32_t); +extern void tcp_wput(queue_t *, mblk_t *); +extern void tcp_wput_data(tcp_t *, mblk_t *, boolean_t); +extern void tcp_wput_sock(queue_t *, mblk_t *); +extern void tcp_wput_fallback(queue_t *, mblk_t *); +extern void tcp_xmit_ctl(char *, tcp_t *, uint32_t, uint32_t, int); +extern void tcp_xmit_listeners_reset(mblk_t *, ip_recv_attr_t *, + ip_stack_t *i, conn_t *); +extern mblk_t *tcp_xmit_mp(tcp_t *, mblk_t *, int32_t, int32_t *, + mblk_t **, uint32_t, boolean_t, uint32_t *, boolean_t); -extern sock_lower_handle_t tcp_create(int, int, int, sock_downcalls_t **, - uint_t *, int *, int, cred_t *); +/* + * Input related functions in tcp_input.c. + */ +extern void tcp_icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *); +extern void tcp_input_data(void *, mblk_t *, void *, ip_recv_attr_t *); +extern void tcp_input_listener_unbound(void *, mblk_t *, void *, + ip_recv_attr_t *); +extern boolean_t tcp_paws_check(tcp_t *, tcpha_t *, tcp_opt_t *); +extern uint_t tcp_rcv_drain(tcp_t *); +extern void tcp_rcv_enqueue(tcp_t *, mblk_t *, uint_t, cred_t *); +extern boolean_t tcp_verifyicmp(conn_t *, void *, icmph_t *, icmp6_t *, + ip_recv_attr_t *); + +/* + * Kernel socket related functions in tcp_socket.c. + */ extern int tcp_fallback(sock_lower_handle_t, queue_t *, boolean_t, so_proto_quiesced_cb_t); -extern sock_downcalls_t sock_tcp_downcalls; - +/* + * Timer related functions in tcp_timers.c. + */ +extern void tcp_ack_timer(void *); +extern void tcp_close_linger_timeout(void *); +extern void tcp_keepalive_timer(void *); +extern void tcp_push_timer(void *); +extern void tcp_reass_timer(void *); +extern mblk_t *tcp_timermp_alloc(int); +extern void tcp_timermp_free(tcp_t *); +extern timeout_id_t tcp_timeout(conn_t *, void (*)(void *), clock_t); +extern clock_t tcp_timeout_cancel(conn_t *, timeout_id_t); +extern void tcp_timer(void *arg); +extern void tcp_timers_stop(tcp_t *); -extern int tcp_opt_default(queue_t *, t_scalar_t, t_scalar_t, uchar_t *); +/* + * TCP TPI related functions in tcp_tpi.c. 
+ */ +extern void tcp_addr_req(tcp_t *, mblk_t *); +extern void tcp_capability_req(tcp_t *, mblk_t *); +extern boolean_t tcp_conn_con(tcp_t *, uchar_t *, mblk_t *, + mblk_t **, ip_recv_attr_t *); +extern void tcp_err_ack(tcp_t *, mblk_t *, int, int); +extern void tcp_err_ack_prim(tcp_t *, mblk_t *, int, int, int); +extern void tcp_fallback_eager(tcp_t *, boolean_t); +extern void tcp_fallback_noneager(tcp_t *, mblk_t *, queue_t *, + boolean_t, so_proto_quiesced_cb_t); +extern void tcp_info_req(tcp_t *, mblk_t *); +extern void tcp_send_conn_ind(void *, mblk_t *, void *); +extern void tcp_send_pending(void *, mblk_t *, void *, ip_recv_attr_t *); +extern void tcp_tpi_accept(queue_t *, mblk_t *); +extern void tcp_tpi_bind(tcp_t *, mblk_t *); +extern int tcp_tpi_close(queue_t *, int); +extern int tcp_tpi_close_accept(queue_t *); +extern void tcp_tpi_connect(tcp_t *, mblk_t *); extern int tcp_tpi_opt_get(queue_t *, t_scalar_t, t_scalar_t, uchar_t *); extern int tcp_tpi_opt_set(queue_t *, uint_t, int, int, uint_t, uchar_t *, uint_t *, uchar_t *, void *, cred_t *); +extern void tcp_tpi_unbind(tcp_t *, mblk_t *); +extern void tcp_tli_accept(tcp_t *, mblk_t *); +extern void tcp_use_pure_tpi(tcp_t *); + +/* + * TCP option processing related functions in tcp_opt_data.c + */ +extern int tcp_opt_default(queue_t *, t_scalar_t, t_scalar_t, uchar_t *); +extern int tcp_opt_get(conn_t *, int, int, uchar_t *); +extern int tcp_opt_set(conn_t *, uint_t, int, int, uint_t, uchar_t *, + uint_t *, uchar_t *, void *, cred_t *); + +/* + * TCP time wait processing related functions in tcp_time_wait.c. + */ +extern void tcp_time_wait_append(tcp_t *); +extern void tcp_time_wait_collector(void *); +extern boolean_t tcp_time_wait_remove(tcp_t *, tcp_squeue_priv_t *); +extern void tcp_time_wait_processing(tcp_t *, mblk_t *, uint32_t, + uint32_t, int, tcpha_t *, ip_recv_attr_t *); + +/* + * Misc functions in tcp_misc.c. + */ +extern int tcp_cpu_update(cpu_setup_t, int, void *); +extern void tcp_ioctl_abort_conn(queue_t *, mblk_t *); +extern uint32_t tcp_find_listener_conf(tcp_stack_t *, in_port_t); +extern int tcp_listener_conf_get(queue_t *, mblk_t *, caddr_t, cred_t *); +extern int tcp_listener_conf_add(queue_t *, mblk_t *, char *, caddr_t, + cred_t *); +extern int tcp_listener_conf_del(queue_t *, mblk_t *, char *, caddr_t, + cred_t *); +extern void tcp_listener_conf_cleanup(tcp_stack_t *); #endif /* _KERNEL */ diff --git a/usr/src/uts/common/inet/tcp_stack.h b/usr/src/uts/common/inet/tcp_stack.h index 34d5e087fa..1a6e374f3e 100644 --- a/usr/src/uts/common/inet/tcp_stack.h +++ b/usr/src/uts/common/inet/tcp_stack.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -30,6 +30,7 @@ #include <sys/netstack.h> #include <inet/ip.h> #include <inet/ipdrop.h> +#include <inet/tcp_stats.h> #include <sys/sunddi.h> #include <sys/sunldi.h> @@ -37,91 +38,6 @@ extern "C" { #endif -/* Kstats */ -typedef struct tcp_stat { - kstat_named_t tcp_time_wait; - kstat_named_t tcp_time_wait_syn; - kstat_named_t tcp_time_wait_syn_success; - kstat_named_t tcp_detach_non_time_wait; - kstat_named_t tcp_detach_time_wait; - kstat_named_t tcp_time_wait_reap; - kstat_named_t tcp_clean_death_nondetached; - kstat_named_t tcp_reinit_calls; - kstat_named_t tcp_eager_err1; - kstat_named_t tcp_eager_err2; - kstat_named_t tcp_eager_blowoff_calls; - kstat_named_t tcp_eager_blowoff_q; - kstat_named_t tcp_eager_blowoff_q0; - kstat_named_t tcp_not_hard_bound; - kstat_named_t tcp_no_listener; - kstat_named_t tcp_found_eager; - kstat_named_t tcp_wrong_queue; - kstat_named_t tcp_found_eager_binding1; - kstat_named_t tcp_found_eager_bound1; - kstat_named_t tcp_eager_has_listener1; - kstat_named_t tcp_open_alloc; - kstat_named_t tcp_open_detached_alloc; - kstat_named_t tcp_rput_time_wait; - kstat_named_t tcp_listendrop; - kstat_named_t tcp_listendropq0; - kstat_named_t tcp_wrong_rq; - kstat_named_t tcp_rsrv_calls; - kstat_named_t tcp_eagerfree2; - kstat_named_t tcp_eagerfree3; - kstat_named_t tcp_eagerfree4; - kstat_named_t tcp_eagerfree5; - kstat_named_t tcp_timewait_syn_fail; - kstat_named_t tcp_listen_badflags; - kstat_named_t tcp_timeout_calls; - kstat_named_t tcp_timeout_cached_alloc; - kstat_named_t tcp_timeout_cancel_reqs; - kstat_named_t tcp_timeout_canceled; - kstat_named_t tcp_timermp_freed; - kstat_named_t tcp_push_timer_cnt; - kstat_named_t tcp_ack_timer_cnt; - kstat_named_t tcp_wsrv_called; - kstat_named_t tcp_flwctl_on; - kstat_named_t tcp_timer_fire_early; - kstat_named_t tcp_timer_fire_miss; - kstat_named_t tcp_rput_v6_error; - kstat_named_t tcp_zcopy_on; - kstat_named_t tcp_zcopy_off; - kstat_named_t tcp_zcopy_backoff; - kstat_named_t tcp_fusion_flowctl; - kstat_named_t tcp_fusion_backenabled; - kstat_named_t tcp_fusion_urg; - kstat_named_t tcp_fusion_putnext; - kstat_named_t tcp_fusion_unfusable; - kstat_named_t tcp_fusion_aborted; - kstat_named_t tcp_fusion_unqualified; - kstat_named_t tcp_fusion_rrw_busy; - kstat_named_t tcp_fusion_rrw_msgcnt; - kstat_named_t tcp_fusion_rrw_plugged; - kstat_named_t tcp_in_ack_unsent_drop; - kstat_named_t tcp_sock_fallback; - kstat_named_t tcp_lso_enabled; - kstat_named_t tcp_lso_disabled; - kstat_named_t tcp_lso_times; - kstat_named_t tcp_lso_pkt_out; - kstat_named_t tcp_listen_cnt_drop; - kstat_named_t tcp_listen_mem_drop; - kstat_named_t tcp_zwin_ack_syn; - kstat_named_t tcp_rst_unsent; -} tcp_stat_t; - -#define TCP_STAT(tcps, x) ((tcps)->tcps_statistics.x.value.ui64++) -#define TCP_STAT_UPDATE(tcps, x, n) \ - ((tcps)->tcps_statistics.x.value.ui64 += (n)) -#define TCP_STAT_SET(tcps, x, n) \ - ((tcps)->tcps_statistics.x.value.ui64 = (n)) - -typedef struct tcp_g_stat { - kstat_named_t tcp_timermp_alloced; - kstat_named_t tcp_timermp_allocfail; - kstat_named_t tcp_timermp_allocdblfail; - kstat_named_t tcp_freelist_cleanup; -} tcp_g_stat_t; - #ifdef _KERNEL /* @@ -130,8 +46,6 @@ typedef struct tcp_g_stat { struct tcp_stack { netstack_t *tcps_netstack; /* Common netstack */ - mib2_tcp_t tcps_mib; - /* * Extra privileged ports. In host byte order. * Protected by tcp_epriv_port_lock. 
@@ -167,7 +81,6 @@ struct tcp_stack { */ kstat_t *tcps_mibkp; /* kstat exporting tcp_mib data */ kstat_t *tcps_kstat; - tcp_stat_t tcps_statistics; uint32_t tcps_iss_incr_extra; /* Incremented for each connection */ @@ -202,7 +115,18 @@ struct tcp_stack { /* Listener connection limit configuration. */ kmutex_t tcps_listener_conf_lock; list_t tcps_listener_conf; + + /* + * Per CPU stats + * + * tcps_sc: array of pointer to per CPU stats. The i-th element in the + * array represents the stats of the CPU with cpu_seqid. + * tcps_sc_cnt: number of CPU stats in the tcps_sc array. + */ + tcp_stats_cpu_t **tcps_sc; + int tcps_sc_cnt; }; + typedef struct tcp_stack tcp_stack_t; #endif /* _KERNEL */ diff --git a/usr/src/uts/common/inet/tcp_stats.h b/usr/src/uts/common/inet/tcp_stats.h new file mode 100644 index 0000000000..fa6a80f47a --- /dev/null +++ b/usr/src/uts/common/inet/tcp_stats.h @@ -0,0 +1,194 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _INET_TCP_STATS_H +#define _INET_TCP_STATS_H + +/* + * TCP private kernel statistics declarations. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +/* + * TCP Statistics. + * + * How TCP statistics work. + * + * There are two types of statistics invoked by two macros. + * + * TCP_STAT(name) does non-atomic increment of a named stat counter. It is + * supposed to be used in non MT-hot paths of the code. + * + * TCP_DBGSTAT(name) does atomic increment of a named stat counter. It is + * supposed to be used for DEBUG purposes and may be used on a hot path. + * These counters are only available in a debugged kerel. They are grouped + * under the TCP_DEBUG_COUNTER C pre-processor condition. + * + * Both TCP_STAT and TCP_DBGSTAT counters are available using kstat + * (use "kstat tcp" to get them). + * + * How to add new counters. + * + * 1) Add a field in the tcp_stat structure describing your counter. + * 2) Add a line in the template in tcp_kstat2_init() with the name + * of the counter. + * 3) Update tcp_clr_stats() and tcp_cp_stats() with the new counters. + * IMPORTANT!! - make sure that all the above functions are in sync !! + * 4) Use either TCP_STAT or TCP_DBGSTAT with the name. + * + * Please avoid using private counters which are not kstat-exported. + * + * Implementation note. + * + * Both the MIB2 and tcp_stat_t counters are kept per CPU in the array + * tcps_sc in tcp_stack_t. Each array element is a pointer to a + * tcp_stats_cpu_t struct. Once allocated, the tcp_stats_cpu_t struct is + * not freed until the tcp_stack_t is going away. So there is no need to + * acquire a lock before accessing the stats counters. 
+ */ + +#ifndef TCP_DEBUG_COUNTER +#ifdef DEBUG +#define TCP_DEBUG_COUNTER 1 +#else +#define TCP_DEBUG_COUNTER 0 +#endif +#endif + +/* Kstats */ +typedef struct tcp_stat { + kstat_named_t tcp_time_wait_syn_success; + kstat_named_t tcp_clean_death_nondetached; + kstat_named_t tcp_eager_blowoff_q; + kstat_named_t tcp_eager_blowoff_q0; + kstat_named_t tcp_no_listener; + kstat_named_t tcp_listendrop; + kstat_named_t tcp_listendropq0; + kstat_named_t tcp_wsrv_called; + kstat_named_t tcp_flwctl_on; + kstat_named_t tcp_timer_fire_early; + kstat_named_t tcp_timer_fire_miss; + kstat_named_t tcp_zcopy_on; + kstat_named_t tcp_zcopy_off; + kstat_named_t tcp_zcopy_backoff; + kstat_named_t tcp_fusion_flowctl; + kstat_named_t tcp_fusion_backenabled; + kstat_named_t tcp_fusion_urg; + kstat_named_t tcp_fusion_putnext; + kstat_named_t tcp_fusion_unfusable; + kstat_named_t tcp_fusion_aborted; + kstat_named_t tcp_fusion_unqualified; + kstat_named_t tcp_fusion_rrw_busy; + kstat_named_t tcp_fusion_rrw_msgcnt; + kstat_named_t tcp_fusion_rrw_plugged; + kstat_named_t tcp_in_ack_unsent_drop; + kstat_named_t tcp_sock_fallback; + kstat_named_t tcp_lso_enabled; + kstat_named_t tcp_lso_disabled; + kstat_named_t tcp_lso_times; + kstat_named_t tcp_lso_pkt_out; + kstat_named_t tcp_listen_cnt_drop; + kstat_named_t tcp_listen_mem_drop; + kstat_named_t tcp_zwin_mem_drop; + kstat_named_t tcp_zwin_ack_syn; + kstat_named_t tcp_rst_unsent; + kstat_named_t tcp_reclaim_cnt; + kstat_named_t tcp_reass_timeout; +#ifdef TCP_DEBUG_COUNTER + kstat_named_t tcp_time_wait; + kstat_named_t tcp_rput_time_wait; + kstat_named_t tcp_detach_time_wait; + kstat_named_t tcp_timeout_calls; + kstat_named_t tcp_timeout_cached_alloc; + kstat_named_t tcp_timeout_cancel_reqs; + kstat_named_t tcp_timeout_canceled; + kstat_named_t tcp_timermp_freed; + kstat_named_t tcp_push_timer_cnt; + kstat_named_t tcp_ack_timer_cnt; +#endif +} tcp_stat_t; + +typedef struct tcp_g_stat { + kstat_named_t tcp_timermp_alloced; + kstat_named_t tcp_timermp_allocfail; + kstat_named_t tcp_timermp_allocdblfail; + kstat_named_t tcp_freelist_cleanup; +} tcp_g_stat_t; + +/* Per CPU stats: TCP MIB2, TCP kstat and connection counter. 
*/ +typedef struct { + int64_t tcp_sc_conn_cnt; + mib2_tcp_t tcp_sc_mib; + tcp_stat_t tcp_sc_stats; +} tcp_stats_cpu_t; + +#define TCPS_BUMP_MIB(tcps, x) \ + BUMP_MIB(&(tcps)->tcps_sc[CPU->cpu_seqid]->tcp_sc_mib, x) + +#define TCPS_UPDATE_MIB(tcps, x, y) \ + UPDATE_MIB(&(tcps)->tcps_sc[CPU->cpu_seqid]->tcp_sc_mib, x, y) + +#if TCP_DEBUG_COUNTER +#define TCP_DBGSTAT(tcps, x) \ + atomic_inc_64( \ + &((tcps)->tcps_sc[CPU->cpu_seqid]->tcp_sc_stats.x.value.ui64)) +#define TCP_G_DBGSTAT(x) \ + atomic_inc_64(&(tcp_g_statistics.x.value.ui64)) +#else +#define TCP_DBGSTAT(tcps, x) +#define TCP_G_DBGSTAT(x) +#endif + +#define TCP_G_STAT(x) (tcp_g_statistics.x.value.ui64++) + +#define TCP_STAT(tcps, x) \ + ((tcps)->tcps_sc[CPU->cpu_seqid]->tcp_sc_stats.x.value.ui64++) +#define TCP_STAT_UPDATE(tcps, x, n) \ + ((tcps)->tcps_sc[CPU->cpu_seqid]->tcp_sc_stats.x.value.ui64 += (n)) +#define TCP_STAT_SET(tcps, x, n) \ + ((tcps)->tcps_sc[CPU->cpu_seqid]->tcp_sc_stats.x.value.ui64 = (n)) + +extern tcp_g_stat_t tcp_g_statistics; +extern kstat_t *tcp_g_kstat; + +extern void *tcp_g_kstat_init(tcp_g_stat_t *); +extern void tcp_g_kstat_fini(kstat_t *); +extern void *tcp_kstat_init(netstackid_t); +extern void tcp_kstat_fini(netstackid_t, kstat_t *); +extern void *tcp_kstat2_init(netstackid_t); +extern void tcp_kstat2_fini(netstackid_t, kstat_t *); + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _INET_TCP_STATS_H */ diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c index bd0599c115..53e1185fa1 100644 --- a/usr/src/uts/common/inet/udp/udp.c +++ b/usr/src/uts/common/inet/udp/udp.c @@ -303,9 +303,6 @@ struct streamtab udpinfov6 = { &udp_rinitv6, &udp_winit, &udp_lrinit, &udp_lwinit }; -static sin_t sin_null; /* Zero address for quick clears */ -static sin6_t sin6_null; /* Zero address for quick clears */ - #define UDP_MAXPACKET_IPV4 (IP_MAXPACKET - UDPH_SIZE - IP_SIMPLE_HDR_LENGTH) /* Default structure copied into T_INFO_ACK messages */ diff --git a/usr/src/uts/common/io/ib/clients/rds/rds_ioctl.c b/usr/src/uts/common/io/ib/clients/rds/rds_ioctl.c index 130c51cc15..f8ce819437 100644 --- a/usr/src/uts/common/io/ib/clients/rds/rds_ioctl.c +++ b/usr/src/uts/common/io/ib/clients/rds/rds_ioctl.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -41,8 +41,6 @@ #include <sys/ib/clients/rds/rds.h> #include <sys/ib/clients/rds/rds_transport.h> -static sin_t sin_null; /* Zero address for quick clears */ - /* * Just pass the ioctl to IP and the result to the caller. 
*/ diff --git a/usr/src/uts/intel/ip/ip.global-objs.debug64 b/usr/src/uts/intel/ip/ip.global-objs.debug64 index 5162254c12..bfad66f0a2 100644 --- a/usr/src/uts/intel/ip/ip.global-objs.debug64 +++ b/usr/src/uts/intel/ip/ip.global-objs.debug64 @@ -266,6 +266,7 @@ tcp_g_t_info_ack tcp_g_t_info_ack_v6 tcp_icmp_source_quench tcp_init_wnd_shft +tcp_max_init_cwnd tcp_max_optsize tcp_min_conn_listener tcp_opt_arr diff --git a/usr/src/uts/intel/ip/ip.global-objs.obj64 b/usr/src/uts/intel/ip/ip.global-objs.obj64 index 84d29916c6..0d3f7a73f7 100644 --- a/usr/src/uts/intel/ip/ip.global-objs.obj64 +++ b/usr/src/uts/intel/ip/ip.global-objs.obj64 @@ -263,6 +263,7 @@ tcp_g_t_info_ack tcp_g_t_info_ack_v6 tcp_icmp_source_quench tcp_init_wnd_shft +tcp_max_init_cwnd tcp_max_optsize tcp_min_conn_listener tcp_opt_arr diff --git a/usr/src/uts/sparc/ip/ip.global-objs.debug64 b/usr/src/uts/sparc/ip/ip.global-objs.debug64 index 5162254c12..bfad66f0a2 100644 --- a/usr/src/uts/sparc/ip/ip.global-objs.debug64 +++ b/usr/src/uts/sparc/ip/ip.global-objs.debug64 @@ -266,6 +266,7 @@ tcp_g_t_info_ack tcp_g_t_info_ack_v6 tcp_icmp_source_quench tcp_init_wnd_shft +tcp_max_init_cwnd tcp_max_optsize tcp_min_conn_listener tcp_opt_arr diff --git a/usr/src/uts/sparc/ip/ip.global-objs.obj64 b/usr/src/uts/sparc/ip/ip.global-objs.obj64 index 84d29916c6..0d3f7a73f7 100644 --- a/usr/src/uts/sparc/ip/ip.global-objs.obj64 +++ b/usr/src/uts/sparc/ip/ip.global-objs.obj64 @@ -263,6 +263,7 @@ tcp_g_t_info_ack tcp_g_t_info_ack_v6 tcp_icmp_source_quench tcp_init_wnd_shft +tcp_max_init_cwnd tcp_max_optsize tcp_min_conn_listener tcp_opt_arr |
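
Editorial aside, not part of the commit: the TCP_SET_INIT_CWND macro added to tcp_impl.h above computes the initial congestion window per RFC 3390 unless the application overrode it with TCP_INIT_CWND (the option whose maximum this PSARC case raises, hence the new tcp_max_init_cwnd global in the object lists). The following stand-alone sketch reproduces only the arithmetic of that macro in user space; the function and variable names (initial_cwnd, def_max_init_cwnd, init_cwnd) are illustrative, not kernel symbols.

#include <stdio.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))
#define	MAX(a, b)	((a) > (b) ? (a) : (b))

/* Initial congestion window in bytes for a given MSS (RFC 3390 clamp). */
static unsigned int
initial_cwnd(unsigned int mss, unsigned int init_cwnd,
    unsigned int def_max_init_cwnd)
{
	if (init_cwnd == 0) {
		/* RFC 3390: min(4*MSS, max(2*MSS, 4380 bytes)), then cap. */
		return (MIN(def_max_init_cwnd * mss,
		    MIN(4 * mss, MAX(2 * mss, 4380 / mss * mss))));
	}
	/* Upper layer set TCP_INIT_CWND: honor it directly. */
	return (init_cwnd * mss);
}

int
main(void)
{
	unsigned int mss_tab[] = { 536, 1460, 4380, 8960 };
	unsigned int i;

	for (i = 0; i < sizeof (mss_tab) / sizeof (mss_tab[0]); i++) {
		printf("mss %5u -> initial cwnd %6u bytes\n", mss_tab[i],
		    initial_cwnd(mss_tab[i], 0, 4));
	}
	return (0);
}

With the default path (init_cwnd of 0) the window never exceeds four segments regardless of def_max_init_cwnd; only an explicit TCP_INIT_CWND setting can go higher, which is why the cap on that option is the tunable being enlarged.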
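
Editorial aside, not part of the commit: the tail of tcp_timer() in the new tcp_timers.c recomputes the retransmission timeout from the smoothed RTT and its deviation, shifts it left by the back-off count, and clamps it to the stack's minimum and maximum. A minimal user-space sketch of that back-off, assuming illustrative tunable values rather than the stack's tcps_rexmit_interval_* parameters:

#include <stdio.h>

/*
 * rtt_sa is the smoothed RTT kept scaled by 8, as tcp_rtt_sa is, so
 * (rtt_sa >> 3) yields the RTT estimate in milliseconds.
 */
static long
next_rto(long rtt_sa, long rtt_sd, int *backoff, long extra,
    long rto_min, long rto_max)
{
	long ms;

	(*backoff)++;
	ms = (rtt_sa >> 3) + rtt_sd + extra + (rtt_sa >> 5);
	if (ms < rto_min)
		ms = rto_min << *backoff;	/* the floor was the real RTO */
	else
		ms <<= *backoff;
	if (ms > rto_max) {
		ms = rto_max;
		(*backoff)--;			/* keep the shift from overflowing */
	}
	return (ms);
}

int
main(void)
{
	int backoff = 0;
	int i;

	/* 800 scaled means a 100 ms smoothed RTT, 50 ms deviation. */
	for (i = 0; i < 8; i++)
		printf("timeout %d: rto %ld ms\n", i + 1,
		    next_rto(800, 50, &backoff, 0, 400, 60000));
	return (0);
}

The timeout roughly doubles per expiry until it hits the ceiling, at which point the back-off counter is stepped back so later shifts stay bounded, mirroring the overflow comment in the kernel routine.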
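
Editorial aside, not part of the commit: the new tcp_stats.h keeps the MIB2 counters, the kstat counters, and the connection counter per CPU (tcps_sc indexed by cpu_seqid), so the hot path never takes a shared lock and a reader sums the slots when the value is needed. In the kernel the connection counter uses atomic_inc_64 on the local slot; the sketch below only shows the aggregation idea, with NCPU and the field names as stand-ins.

#include <stdio.h>
#include <stdint.h>

#define	NCPU	4

typedef struct {
	int64_t	conn_cnt;	/* stand-in for tcp_sc_conn_cnt */
} stats_cpu_t;

static stats_cpu_t sc[NCPU];

/* Hot path: touch only the caller's CPU slot, no shared cache line. */
static void
conn_inc(int cpu)
{
	sc[cpu].conn_cnt++;
}

/* Read side: the logical total is the sum over every per-CPU slot. */
static int64_t
conn_total(void)
{
	int64_t total = 0;
	int i;

	for (i = 0; i < NCPU; i++)
		total += sc[i].conn_cnt;
	return (total);
}

int
main(void)
{
	conn_inc(0);
	conn_inc(0);
	conn_inc(2);
	conn_inc(3);
	printf("connections: %lld\n", (long long)conn_total());
	return (0);
}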