From 707e74bc53cd429bcd731df722227c7dc2de47c6 Mon Sep 17 00:00:00 2001 From: Kacheong Poon Date: Thu, 3 Jun 2010 09:14:53 -0700 Subject: PSARC 2010/151 new socket options for TCP timers 6955557 Various new TCP socket options --- usr/src/cmd/truss/print.c | 4 + usr/src/uts/common/inet/tcp.h | 6 ++ usr/src/uts/common/inet/tcp/tcp.c | 93 ++++++++++++++------ usr/src/uts/common/inet/tcp/tcp_input.c | 32 ++----- usr/src/uts/common/inet/tcp/tcp_opt_data.c | 136 +++++++++++++++++++++++++++-- usr/src/uts/common/inet/tcp/tcp_timers.c | 54 +++++++++--- usr/src/uts/common/inet/tcp/tcp_tunables.c | 7 +- usr/src/uts/common/inet/tcp_impl.h | 25 +++++- usr/src/uts/common/netinet/tcp.h | 29 +++++- 9 files changed, 304 insertions(+), 82 deletions(-) diff --git a/usr/src/cmd/truss/print.c b/usr/src/cmd/truss/print.c index c4ba8b2abd..5de1342c0e 100644 --- a/usr/src/cmd/truss/print.c +++ b/usr/src/cmd/truss/print.c @@ -1832,6 +1832,10 @@ tcp_optname(private_t *pri, long val) case TCP_KEEPALIVE_ABORT_THRESHOLD: return ("TCP_KEEPALIVE_ABORT_THRESHOLD"); case TCP_CORK: return ("TCP_CORK"); + case TCP_RTO_INITIAL: return ("TCP_RTO_INITIAL"); + case TCP_RTO_MIN: return ("TCP_RTO_MIN"); + case TCP_RTO_MAX: return ("TCP_RTO_MAX"); + case TCP_LINGER2: return ("TCP_LINGER2"); default: (void) snprintf(pri->code_buf, sizeof (pri->code_buf), diff --git a/usr/src/uts/common/inet/tcp.h b/usr/src/uts/common/inet/tcp.h index 23dbb1a687..92b7a8ed67 100644 --- a/usr/src/uts/common/inet/tcp.h +++ b/usr/src/uts/common/inet/tcp.h @@ -158,6 +158,9 @@ typedef struct tcp_s { clock_t tcp_rto; /* Round trip timeout */ clock_t tcp_last_rcv_lbolt; /* lbolt on last packet, used for PAWS */ + uint32_t tcp_rto_initial; /* Initial RTO */ + uint32_t tcp_rto_min; /* Minimum RTO */ + uint32_t tcp_rto_max; /* Maximum RTO */ uint32_t tcp_snxt; /* Senders next seq num */ uint32_t tcp_swnd; /* Senders window (relative to suna) */ @@ -478,6 +481,9 @@ typedef struct tcp_s { /* Segment reassembly timer. 
*/ timeout_id_t tcp_reass_tid; + /* FIN-WAIT-2 flush timeout */ + uint32_t tcp_fin_wait_2_flush_interval; + #ifdef DEBUG pc_t tcmp_stk[15]; #endif diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c index 51ee3be794..441722acd4 100644 --- a/usr/src/uts/common/inet/tcp/tcp.c +++ b/usr/src/uts/common/inet/tcp/tcp.c @@ -661,13 +661,7 @@ tcp_set_destination(tcp_t *tcp) tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5); - if (rto > tcps->tcps_rexmit_interval_max) { - tcp->tcp_rto = tcps->tcps_rexmit_interval_max; - } else if (rto < tcps->tcps_rexmit_interval_min) { - tcp->tcp_rto = tcps->tcps_rexmit_interval_min; - } else { - tcp->tcp_rto = rto; - } + TCP_SET_RTO(tcp, rto); } if (uinfo.iulp_ssthresh != 0) tcp->tcp_cwnd_ssthresh = uinfo.iulp_ssthresh; @@ -2021,7 +2015,7 @@ tcp_reinit(tcp_t *tcp) /* * Initialize to default values */ - tcp_init_values(tcp); + tcp_init_values(tcp, NULL); DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *, connp->conn_ixa, void, NULL, tcp_t *, tcp, void, NULL, @@ -2313,11 +2307,16 @@ tcp_reinit_values(tcp) #undef PRESERVE } +/* + * Initialize the various fields in tcp_t. If parent (the listener) is non + * NULL, certain values will be inherited from it. 
+ */ void -tcp_init_values(tcp_t *tcp) +tcp_init_values(tcp_t *tcp, tcp_t *parent) { tcp_stack_t *tcps = tcp->tcp_tcps; conn_t *connp = tcp->tcp_connp; + clock_t rto; ASSERT((connp->conn_family == AF_INET && connp->conn_ipversion == IPV4_VERSION) || @@ -2325,6 +2324,56 @@ tcp_init_values(tcp_t *tcp) (connp->conn_ipversion == IPV4_VERSION || connp->conn_ipversion == IPV6_VERSION))); + if (parent == NULL) { + tcp->tcp_naglim = tcps->tcps_naglim_def; + + tcp->tcp_rto_initial = tcps->tcps_rexmit_interval_initial; + tcp->tcp_rto_min = tcps->tcps_rexmit_interval_min; + tcp->tcp_rto_max = tcps->tcps_rexmit_interval_max; + + tcp->tcp_first_ctimer_threshold = + tcps->tcps_ip_notify_cinterval; + tcp->tcp_second_ctimer_threshold = + tcps->tcps_ip_abort_cinterval; + tcp->tcp_first_timer_threshold = tcps->tcps_ip_notify_interval; + tcp->tcp_second_timer_threshold = tcps->tcps_ip_abort_interval; + + tcp->tcp_fin_wait_2_flush_interval = + tcps->tcps_fin_wait_2_flush_interval; + + tcp->tcp_ka_interval = tcps->tcps_keepalive_interval; + tcp->tcp_ka_abort_thres = tcps->tcps_keepalive_abort_interval; + + /* + * Default value of tcp_init_cwnd is 0, so no need to set here + * if parent is NULL. But we need to inherit it from parent. + */ + } else { + /* Inherit various TCP parameters from the parent. 
*/ + tcp->tcp_naglim = parent->tcp_naglim; + + tcp->tcp_rto_initial = parent->tcp_rto_initial; + tcp->tcp_rto_min = parent->tcp_rto_min; + tcp->tcp_rto_max = parent->tcp_rto_max; + + tcp->tcp_first_ctimer_threshold = + parent->tcp_first_ctimer_threshold; + tcp->tcp_second_ctimer_threshold = + parent->tcp_second_ctimer_threshold; + tcp->tcp_first_timer_threshold = + parent->tcp_first_timer_threshold; + tcp->tcp_second_timer_threshold = + parent->tcp_second_timer_threshold; + + tcp->tcp_fin_wait_2_flush_interval = + parent->tcp_fin_wait_2_flush_interval; + + tcp->tcp_ka_interval = parent->tcp_ka_interval; + tcp->tcp_ka_abort_thres = parent->tcp_ka_abort_thres; + + tcp->tcp_init_cwnd = parent->tcp_init_cwnd; + } + /* * Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO * will be close to tcp_rexmit_interval_initial. By doing this, we @@ -2332,13 +2381,13 @@ tcp_init_values(tcp_t *tcp) * during first few transmissions of a connection as seen in slow * links. */ - tcp->tcp_rtt_sa = tcps->tcps_rexmit_interval_initial << 2; - tcp->tcp_rtt_sd = tcps->tcps_rexmit_interval_initial >> 1; - tcp->tcp_rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + + tcp->tcp_rtt_sa = tcp->tcp_rto_initial << 2; + tcp->tcp_rtt_sd = tcp->tcp_rto_initial >> 1; + rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5) + tcps->tcps_conn_grace_period; - if (tcp->tcp_rto < tcps->tcps_rexmit_interval_min) - tcp->tcp_rto = tcps->tcps_rexmit_interval_min; + TCP_SET_RTO(tcp, rto); + tcp->tcp_timer_backoff = 0; tcp->tcp_ms_we_have_waited = 0; tcp->tcp_last_recv_time = ddi_get_lbolt(); @@ -2348,17 +2397,6 @@ tcp_init_values(tcp_t *tcp) tcp->tcp_maxpsz_multiplier = tcps->tcps_maxpsz_multiplier; - tcp->tcp_first_timer_threshold = tcps->tcps_ip_notify_interval; - tcp->tcp_first_ctimer_threshold = tcps->tcps_ip_notify_cinterval; - tcp->tcp_second_timer_threshold = tcps->tcps_ip_abort_interval; - /* - * Fix it to tcp_ip_abort_linterval later if it 
turns out to be a - * passive open. - */ - tcp->tcp_second_ctimer_threshold = tcps->tcps_ip_abort_cinterval; - - tcp->tcp_naglim = tcps->tcps_naglim_def; - /* NOTE: ISS is now set in tcp_set_destination(). */ /* Reset fusion-related fields */ @@ -2388,9 +2426,6 @@ tcp_init_values(tcp_t *tcp) */ if (!connp->conn_debug) connp->conn_debug = tcps->tcps_dbg; - - tcp->tcp_ka_interval = tcps->tcps_keepalive_interval; - tcp->tcp_ka_abort_thres = tcps->tcps_keepalive_abort_interval; } /* @@ -2674,7 +2709,7 @@ tcp_create_common(cred_t *credp, boolean_t isv6, boolean_t issocket, SOCK_CONNID_INIT(tcp->tcp_connid); /* DTrace ignores this - it isn't a tcp:::state-change */ tcp->tcp_state = TCPS_IDLE; - tcp_init_values(tcp); + tcp_init_values(tcp, NULL); return (connp); } diff --git a/usr/src/uts/common/inet/tcp/tcp_input.c b/usr/src/uts/common/inet/tcp/tcp_input.c index 14f34a1591..ce00372741 100644 --- a/usr/src/uts/common/inet/tcp/tcp_input.c +++ b/usr/src/uts/common/inet/tcp/tcp_input.c @@ -1482,7 +1482,11 @@ tcp_input_listener(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) eager->tcp_detached = B_TRUE; SOCK_CONNID_INIT(eager->tcp_connid); - tcp_init_values(eager); + /* + * Initialize the eager's tcp_t and inherit some parameters from + * the listener. + */ + tcp_init_values(eager, listener); ASSERT((econnp->conn_ixa->ixa_flags & (IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE | @@ -1573,16 +1577,6 @@ tcp_input_listener(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) } } - /* Inherit various TCP parameters from the listener */ - eager->tcp_naglim = listener->tcp_naglim; - eager->tcp_first_timer_threshold = listener->tcp_first_timer_threshold; - eager->tcp_second_timer_threshold = - listener->tcp_second_timer_threshold; - eager->tcp_first_ctimer_threshold = - listener->tcp_first_ctimer_threshold; - eager->tcp_second_ctimer_threshold = - listener->tcp_second_ctimer_threshold; - /* * tcp_set_destination() may set tcp_rwnd according to the route * metrics. 
If it does not, the eager's receive window will be set @@ -1590,12 +1584,6 @@ tcp_input_listener(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) */ eager->tcp_rwnd = 0; - /* - * Inherit listener's tcp_init_cwnd. Need to do this before - * calling tcp_process_options() which set the initial cwnd. - */ - eager->tcp_init_cwnd = listener->tcp_init_cwnd; - if (is_system_labeled()) { ip_xmit_attr_t *ixa = econnp->conn_ixa; @@ -4427,7 +4415,7 @@ est: * flushing the FIN_WAIT_2 connection. */ TCP_TIMER_RESTART(tcp, - tcps->tcps_fin_wait_2_flush_interval); + tcp->tcp_fin_wait_2_flush_interval); } break; case TCPS_FIN_WAIT_2: @@ -5228,13 +5216,7 @@ tcp_set_rto(tcp_t *tcp, clock_t rtt) */ rto = (sa >> 3) + sv + tcps->tcps_rexmit_interval_extra + (sa >> 5); - if (rto > tcps->tcps_rexmit_interval_max) { - tcp->tcp_rto = tcps->tcps_rexmit_interval_max; - } else if (rto < tcps->tcps_rexmit_interval_min) { - tcp->tcp_rto = tcps->tcps_rexmit_interval_min; - } else { - tcp->tcp_rto = rto; - } + TCP_SET_RTO(tcp, rto); /* Now, we can reset tcp_timer_backoff to use the new RTO... 
*/ tcp->tcp_timer_backoff = 0; diff --git a/usr/src/uts/common/inet/tcp/tcp_opt_data.c b/usr/src/uts/common/inet/tcp/tcp_opt_data.c index 0f46bf4a08..cdb7305a45 100644 --- a/usr/src/uts/common/inet/tcp/tcp_opt_data.c +++ b/usr/src/uts/common/inet/tcp/tcp_opt_data.c @@ -122,6 +122,14 @@ opdes_t tcp_opt_arr[] = { { TCP_CORK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ TCP_RTO_INITIAL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 }, + +{ TCP_RTO_MIN, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 }, + +{ TCP_RTO_MAX, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 }, + +{ TCP_LINGER2, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, + { IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, (OP_VARLEN|OP_NODEFAULT), IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ }, @@ -401,6 +409,18 @@ tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) case TCP_CORK: *i1 = tcp->tcp_cork; return (sizeof (int)); + case TCP_RTO_INITIAL: + *i1 = tcp->tcp_rto_initial; + return (sizeof (uint32_t)); + case TCP_RTO_MIN: + *i1 = tcp->tcp_rto_min; + return (sizeof (uint32_t)); + case TCP_RTO_MAX: + *i1 = tcp->tcp_rto_max; + return (sizeof (uint32_t)); + case TCP_LINGER2: + *i1 = tcp->tcp_fin_wait_2_flush_interval / SECONDS; + return (sizeof (int)); } break; case IPPROTO_IP: @@ -455,6 +475,7 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, int reterr; tcp_stack_t *tcps = tcp->tcp_tcps; conn_opt_arg_t coas; + uint32_t val = *((uint32_t *)invalp); coas.coa_connp = connp; coas.coa_ixa = connp->conn_ixa; @@ -639,9 +660,7 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, } /* Setting done in conn_opt_set */ break; - case TCP_INIT_CWND: { - uint32_t init_cwnd = *((uint32_t *)invalp); - + case TCP_INIT_CWND: if (checkonly) break; @@ -650,21 +669,20 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, * privilege to set the initial cwnd to be larger * than allowed by 
RFC 3390. */ - if (init_cwnd <= MIN(4, MAX(2, 4380 / tcp->tcp_mss))) { - tcp->tcp_init_cwnd = init_cwnd; + if (val <= MIN(4, MAX(2, 4380 / tcp->tcp_mss))) { + tcp->tcp_init_cwnd = val; break; } if ((reterr = secpolicy_ip_config(cr, B_TRUE)) != 0) { *outlenp = 0; return (reterr); } - if (init_cwnd > tcp_max_init_cwnd) { + if (val > tcp_max_init_cwnd) { *outlenp = 0; return (EINVAL); } - tcp->tcp_init_cwnd = init_cwnd; + tcp->tcp_init_cwnd = val; break; - } case TCP_KEEPALIVE_THRESHOLD: if (checkonly) break; @@ -720,6 +738,108 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, tcp->tcp_cork = onoff; } break; + case TCP_RTO_INITIAL: { + clock_t rto; + + if (checkonly || val == 0) + break; + + /* + * Sanity checks + * + * The initial RTO should be bounded by the minimum + * and maximum RTO. And it should also be smaller + * than the connect attempt abort timeout. Otherwise, + * the connection won't be aborted in a period + * reasonably close to that timeout. + */ + if (val < tcp->tcp_rto_min || val > tcp->tcp_rto_max || + val > tcp->tcp_second_ctimer_threshold || + val < tcps->tcps_rexmit_interval_initial_low || + val > tcps->tcps_rexmit_interval_initial_high) { + *outlenp = 0; + return (EINVAL); + } + tcp->tcp_rto_initial = val; + + /* + * If TCP has not sent anything, need to re-calculate + * tcp_rto. Otherwise, this option change does not + * really affect anything. 
+ */ + if (tcp->tcp_state >= TCPS_SYN_SENT) + break; + + tcp->tcp_rtt_sa = tcp->tcp_rto_initial << 2; + tcp->tcp_rtt_sd = tcp->tcp_rto_initial >> 1; + rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + + tcps->tcps_rexmit_interval_extra + + (tcp->tcp_rtt_sa >> 5) + + tcps->tcps_conn_grace_period; + TCP_SET_RTO(tcp, rto); + break; + } + case TCP_RTO_MIN: + if (checkonly || val == 0) + break; + + if (val < tcps->tcps_rexmit_interval_min_low || + val > tcps->tcps_rexmit_interval_min_high || + val > tcp->tcp_rto_max) { + *outlenp = 0; + return (EINVAL); + } + tcp->tcp_rto_min = val; + if (tcp->tcp_rto < val) + tcp->tcp_rto = val; + break; + case TCP_RTO_MAX: + if (checkonly || val == 0) + break; + + /* + * Sanity checks + * + * The maximum RTO should not be larger than the + * connection abort timeout. Otherwise, the + * connection won't be aborted in a period reasonably + * close to that timeout. + */ + if (val < tcps->tcps_rexmit_interval_max_low || + val > tcps->tcps_rexmit_interval_max_high || + val < tcp->tcp_rto_min || + val > tcp->tcp_second_timer_threshold) { + *outlenp = 0; + return (EINVAL); + } + tcp->tcp_rto_max = val; + if (tcp->tcp_rto > val) + tcp->tcp_rto = val; + break; + case TCP_LINGER2: + if (checkonly || *i1 == 0) + break; + + /* + * Note that the option value's unit is second. And + * the value should be bigger than the private + * parameter tcp_fin_wait_2_flush_interval's lower + * bound and smaller than the current value of that + * parameter. It should be smaller than the current + * value to avoid an app setting TCP_LINGER2 to a big + * value, causing resource to be held up too long in + * FIN-WAIT-2 state. 
+ */ + if (*i1 < 0 || + tcps->tcps_fin_wait_2_flush_interval_low/SECONDS > + *i1 || + tcps->tcps_fin_wait_2_flush_interval/SECONDS < + *i1) { + *outlenp = 0; + return (EINVAL); + } + tcp->tcp_fin_wait_2_flush_interval = *i1 * SECONDS; + break; default: break; } diff --git a/usr/src/uts/common/inet/tcp/tcp_timers.c b/usr/src/uts/common/inet/tcp/tcp_timers.c index f2eaa3a958..d0e0401857 100644 --- a/usr/src/uts/common/inet/tcp/tcp_timers.c +++ b/usr/src/uts/common/inet/tcp/tcp_timers.c @@ -460,9 +460,9 @@ tcp_keepalive_timer(void *arg) /* * We should probe again at least * in ka_intrvl, but not more than - * tcp_rexmit_interval_max. + * tcp_rto_max. */ - max = tcps->tcps_rexmit_interval_max; + max = tcp->tcp_rto_max; firetime = MIN(ka_intrvl - 1, tcp->tcp_ka_last_intrvl << 1); if (firetime > max) @@ -624,6 +624,7 @@ tcp_timer(void *arg) conn_t *connp = (conn_t *)arg; tcp_t *tcp = connp->conn_tcp; tcp_stack_t *tcps = tcp->tcp_tcps; + boolean_t dont_timeout = B_FALSE; tcp->tcp_timer_tid = 0; @@ -693,11 +694,29 @@ tcp_timer(void *arg) case TCPS_SYN_SENT: first_threshold = tcp->tcp_first_ctimer_threshold; second_threshold = tcp->tcp_second_ctimer_threshold; + + /* Retransmit forever unless this is a passive open... */ + if (second_threshold == 0) { + if (!tcp->tcp_active_open) { + second_threshold = + tcps->tcps_ip_abort_linterval; + } else { + dont_timeout = B_TRUE; + } + } break; case TCPS_ESTABLISHED: + case TCPS_CLOSE_WAIT: + /* + * If the end point has not been closed, TCP can retransmit + * forever. But if the end point is closed, the normal + * timeout applies. 
+ */ + if (second_threshold == 0) + dont_timeout = B_TRUE; + /* FALLTHRU */ case TCPS_FIN_WAIT_1: case TCPS_CLOSING: - case TCPS_CLOSE_WAIT: case TCPS_LAST_ACK: /* If we have data to rexmit */ if (tcp->tcp_suna != tcp->tcp_snxt) { @@ -844,7 +863,7 @@ tcp_timer(void *arg) (void) tcp_clean_death(tcp, 0); } else { TCP_TIMER_RESTART(tcp, - tcps->tcps_fin_wait_2_flush_interval); + tcp->tcp_fin_wait_2_flush_interval); } return; case TCPS_TIME_WAIT: @@ -868,8 +887,14 @@ tcp_timer(void *arg) if (tcps->tcps_reclaim || (tcp->tcp_listen_cnt != NULL && tcp->tcp_listen_cnt->tlc_cnt > tcp->tcp_listen_cnt->tlc_max)) { second_threshold = tcp_early_abort * SECONDS; + + /* We will ignore the never timeout promise in this case... */ + dont_timeout = B_FALSE; } + if (!dont_timeout && second_threshold == 0) + second_threshold = tcps->tcps_ip_abort_interval; + if ((ms = tcp->tcp_ms_we_have_waited) > second_threshold) { /* * Should not hold the zero-copy messages for too long. @@ -878,6 +903,9 @@ tcp_timer(void *arg) tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp, tcp->tcp_xmit_head, B_TRUE); + if (dont_timeout) + goto timer_rexmit; + /* * For zero window probe, we need to send indefinitely, * unless we have not heard from the other side for some @@ -923,10 +951,10 @@ tcp_timer(void *arg) * We don't need to decrement tcp_timer_backoff * to avoid overflow because it will be decremented * later if new timeout value is greater than - * tcp_rexmit_interval_max. In the case when - * tcp_rexmit_interval_max is greater than - * second_threshold, it means that we will wait - * longer than second_threshold to send the next + * tcp_rto_max. In the case when tcp_rto_max is + * greater than second_threshold, it means that we + * will wait longer than second_threshold to send + * the next * window probe. 
*/ tcp->tcp_ms_we_have_waited = second_threshold; @@ -955,21 +983,23 @@ tcp_timer(void *arg) tcp->tcp_rtt_update = 0; } } + +timer_rexmit: tcp->tcp_timer_backoff++; if ((ms = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5)) < - tcps->tcps_rexmit_interval_min) { + tcp->tcp_rto_min) { /* * This means the original RTO is tcp_rexmit_interval_min. * So we will use tcp_rexmit_interval_min as the RTO value * and do the backoff. */ - ms = tcps->tcps_rexmit_interval_min << tcp->tcp_timer_backoff; + ms = tcp->tcp_rto_min << tcp->tcp_timer_backoff; } else { ms <<= tcp->tcp_timer_backoff; } - if (ms > tcps->tcps_rexmit_interval_max) { - ms = tcps->tcps_rexmit_interval_max; + if (ms > tcp->tcp_rto_max) { + ms = tcp->tcp_rto_max; /* * ms is at max, decrement tcp_timer_backoff to avoid * overflow. diff --git a/usr/src/uts/common/inet/tcp/tcp_tunables.c b/usr/src/uts/common/inet/tcp/tcp_tunables.c index 1d01d9a1b1..9f54799fa1 100644 --- a/usr/src/uts/common/inet/tcp/tcp_tunables.c +++ b/usr/src/uts/common/inet/tcp/tcp_tunables.c @@ -19,8 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2010 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. */ #include @@ -337,8 +336,8 @@ mod_prop_info_t tcp_propinfo_tbl[] = { { "tcp_fin_wait_2_flush_interval", MOD_PROTO_TCP, mod_set_uint32, mod_get_uint32, - {1*SECONDS, UINT32_MAX, 675*SECONDS}, - {675*SECONDS} }, + {1*SECONDS, 2*HOURS, 60*SECONDS}, + {60*SECONDS} }, { "tcp_max_buf", MOD_PROTO_TCP, mod_set_uint32, mod_get_uint32, diff --git a/usr/src/uts/common/inet/tcp_impl.h b/usr/src/uts/common/inet/tcp_impl.h index 9d99d933bd..46b12b27f0 100644 --- a/usr/src/uts/common/inet/tcp_impl.h +++ b/usr/src/uts/common/inet/tcp_impl.h @@ -227,6 +227,17 @@ typedef struct tcp_squeue_priv_s { ((ip6_t *)(iph))->ip6_vcf |= htonl(IPH_ECN_ECT0 << 20); \ } +/* + * Set tcp_rto with boundary checking. 
+ */ +#define TCP_SET_RTO(tcp, rto) \ + if ((rto) < (tcp)->tcp_rto_min) \ + (tcp)->tcp_rto = (tcp)->tcp_rto_min; \ + else if ((rto) > (tcp)->tcp_rto_max) \ + (tcp)->tcp_rto = (tcp)->tcp_rto_max; \ + else \ + (tcp)->tcp_rto = (rto); + /* * TCP options struct returned from tcp_parse_options. */ @@ -428,9 +439,17 @@ extern uint32_t tcp_early_abort; #define tcps_mss_max_ipv4 tcps_propinfo_tbl[17].prop_cur_uval #define tcps_mss_min tcps_propinfo_tbl[18].prop_cur_uval #define tcps_naglim_def tcps_propinfo_tbl[19].prop_cur_uval +#define tcps_rexmit_interval_initial_high \ + tcps_propinfo_tbl[20].prop_max_uval #define tcps_rexmit_interval_initial tcps_propinfo_tbl[20].prop_cur_uval +#define tcps_rexmit_interval_initial_low \ + tcps_propinfo_tbl[20].prop_min_uval +#define tcps_rexmit_interval_max_high tcps_propinfo_tbl[21].prop_max_uval #define tcps_rexmit_interval_max tcps_propinfo_tbl[21].prop_cur_uval +#define tcps_rexmit_interval_max_low tcps_propinfo_tbl[21].prop_min_uval +#define tcps_rexmit_interval_min_high tcps_propinfo_tbl[22].prop_max_uval #define tcps_rexmit_interval_min tcps_propinfo_tbl[22].prop_cur_uval +#define tcps_rexmit_interval_min_low tcps_propinfo_tbl[22].prop_min_uval #define tcps_deferred_ack_interval tcps_propinfo_tbl[23].prop_cur_uval #define tcps_snd_lowat_fraction tcps_propinfo_tbl[24].prop_cur_uval #define tcps_dupack_fast_retransmit tcps_propinfo_tbl[25].prop_cur_uval @@ -441,7 +460,11 @@ extern uint32_t tcp_early_abort; #define tcps_xmit_lowat tcps_propinfo_tbl[30].prop_cur_uval #define tcps_recv_hiwat tcps_propinfo_tbl[31].prop_cur_uval #define tcps_recv_hiwat_minmss tcps_propinfo_tbl[32].prop_cur_uval +#define tcps_fin_wait_2_flush_interval_high \ + tcps_propinfo_tbl[33].prop_max_uval #define tcps_fin_wait_2_flush_interval tcps_propinfo_tbl[33].prop_cur_uval +#define tcps_fin_wait_2_flush_interval_low \ + tcps_propinfo_tbl[33].prop_min_uval #define tcps_max_buf tcps_propinfo_tbl[34].prop_cur_uval #define tcps_strong_iss 
tcps_propinfo_tbl[35].prop_cur_uval #define tcps_rtt_updates tcps_propinfo_tbl[36].prop_cur_uval @@ -527,7 +550,7 @@ extern int tcp_getpeername(sock_lower_handle_t, struct sockaddr *, socklen_t *, cred_t *); extern int tcp_getsockname(sock_lower_handle_t, struct sockaddr *, socklen_t *, cred_t *); -extern void tcp_init_values(tcp_t *); +extern void tcp_init_values(tcp_t *, tcp_t *); extern void tcp_ipsec_cleanup(tcp_t *); extern int tcp_maxpsz_set(tcp_t *, boolean_t); extern void tcp_mss_set(tcp_t *, uint32_t); diff --git a/usr/src/uts/common/netinet/tcp.h b/usr/src/uts/common/netinet/tcp.h index 046015d7c9..9a08545ab7 100644 --- a/usr/src/uts/common/netinet/tcp.h +++ b/usr/src/uts/common/netinet/tcp.h @@ -1,6 +1,26 @@ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. - * Use is subject to license terms. + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ /* @@ -12,7 +32,6 @@ #ifndef _NETINET_TCP_H #define _NETINET_TCP_H -#pragma ident "%Z%%M% %I% %E% SMI" /* tcp.h 1.11 88/08/19 SMI; from UCB 7.2 10/28/86 */ @@ -98,6 +117,10 @@ struct tcphdr { #define TCP_KEEPALIVE_THRESHOLD 0x16 #define TCP_KEEPALIVE_ABORT_THRESHOLD 0x17 #define TCP_CORK 0x18 +#define TCP_RTO_INITIAL 0x19 +#define TCP_RTO_MIN 0x1A +#define TCP_RTO_MAX 0x1B +#define TCP_LINGER2 0x1C /* gap for expansion of ``standard'' options */ #define TCP_ANONPRIVBIND 0x20 /* for internal use only */ -- cgit v1.2.3