summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Kacheong Poon <Kacheong.Poon@Sun.COM> 2010-06-03 09:14:53 -0700
committer Kacheong Poon <Kacheong.Poon@Sun.COM> 2010-06-03 09:14:53 -0700
commit 707e74bc53cd429bcd731df722227c7dc2de47c6 (patch)
tree 0f2a109c8010346e27f6d1836d8dca9444a89204
parent ef497ae340a97bcb1bdee3babfa67414def6d8ca (diff)
download illumos-gate-707e74bc53cd429bcd731df722227c7dc2de47c6.tar.gz
PSARC 2010/151 new socket options for TCP timers
6955557 Various new TCP socket options
-rw-r--r-- usr/src/cmd/truss/print.c 4
-rw-r--r-- usr/src/uts/common/inet/tcp.h 6
-rw-r--r-- usr/src/uts/common/inet/tcp/tcp.c 93
-rw-r--r-- usr/src/uts/common/inet/tcp/tcp_input.c 32
-rw-r--r-- usr/src/uts/common/inet/tcp/tcp_opt_data.c 136
-rw-r--r-- usr/src/uts/common/inet/tcp/tcp_timers.c 54
-rw-r--r-- usr/src/uts/common/inet/tcp/tcp_tunables.c 7
-rw-r--r-- usr/src/uts/common/inet/tcp_impl.h 25
-rw-r--r-- usr/src/uts/common/netinet/tcp.h 29
9 files changed, 304 insertions, 82 deletions
diff --git a/usr/src/cmd/truss/print.c b/usr/src/cmd/truss/print.c
index c4ba8b2abd..5de1342c0e 100644
--- a/usr/src/cmd/truss/print.c
+++ b/usr/src/cmd/truss/print.c
@@ -1832,6 +1832,10 @@ tcp_optname(private_t *pri, long val)
case TCP_KEEPALIVE_ABORT_THRESHOLD:
return ("TCP_KEEPALIVE_ABORT_THRESHOLD");
case TCP_CORK: return ("TCP_CORK");
+ case TCP_RTO_INITIAL: return ("TCP_RTO_INITIAL");
+ case TCP_RTO_MIN: return ("TCP_RTO_MIN");
+ case TCP_RTO_MAX: return ("TCP_RTO_MAX");
+ case TCP_LINGER2: return ("TCP_LINGER2");
default: (void) snprintf(pri->code_buf,
sizeof (pri->code_buf),
diff --git a/usr/src/uts/common/inet/tcp.h b/usr/src/uts/common/inet/tcp.h
index 23dbb1a687..92b7a8ed67 100644
--- a/usr/src/uts/common/inet/tcp.h
+++ b/usr/src/uts/common/inet/tcp.h
@@ -158,6 +158,9 @@ typedef struct tcp_s {
clock_t tcp_rto; /* Round trip timeout */
clock_t tcp_last_rcv_lbolt;
/* lbolt on last packet, used for PAWS */
+ uint32_t tcp_rto_initial; /* Initial RTO */
+ uint32_t tcp_rto_min; /* Minimum RTO */
+ uint32_t tcp_rto_max; /* Maximum RTO */
uint32_t tcp_snxt; /* Senders next seq num */
uint32_t tcp_swnd; /* Senders window (relative to suna) */
@@ -478,6 +481,9 @@ typedef struct tcp_s {
/* Segment reassembly timer. */
timeout_id_t tcp_reass_tid;
+ /* FIN-WAIT-2 flush timeout */
+ uint32_t tcp_fin_wait_2_flush_interval;
+
#ifdef DEBUG
pc_t tcmp_stk[15];
#endif
diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c
index 51ee3be794..441722acd4 100644
--- a/usr/src/uts/common/inet/tcp/tcp.c
+++ b/usr/src/uts/common/inet/tcp/tcp.c
@@ -661,13 +661,7 @@ tcp_set_destination(tcp_t *tcp)
tcps->tcps_rexmit_interval_extra +
(tcp->tcp_rtt_sa >> 5);
- if (rto > tcps->tcps_rexmit_interval_max) {
- tcp->tcp_rto = tcps->tcps_rexmit_interval_max;
- } else if (rto < tcps->tcps_rexmit_interval_min) {
- tcp->tcp_rto = tcps->tcps_rexmit_interval_min;
- } else {
- tcp->tcp_rto = rto;
- }
+ TCP_SET_RTO(tcp, rto);
}
if (uinfo.iulp_ssthresh != 0)
tcp->tcp_cwnd_ssthresh = uinfo.iulp_ssthresh;
@@ -2021,7 +2015,7 @@ tcp_reinit(tcp_t *tcp)
/*
* Initialize to default values
*/
- tcp_init_values(tcp);
+ tcp_init_values(tcp, NULL);
DTRACE_TCP6(state__change, void, NULL, ip_xmit_attr_t *,
connp->conn_ixa, void, NULL, tcp_t *, tcp, void, NULL,
@@ -2313,11 +2307,16 @@ tcp_reinit_values(tcp)
#undef PRESERVE
}
+/*
+ * Initialize the various fields in tcp_t. If parent (the listener) is
+ * non-NULL, certain values will be inherited from it.
+ */
void
-tcp_init_values(tcp_t *tcp)
+tcp_init_values(tcp_t *tcp, tcp_t *parent)
{
tcp_stack_t *tcps = tcp->tcp_tcps;
conn_t *connp = tcp->tcp_connp;
+ clock_t rto;
ASSERT((connp->conn_family == AF_INET &&
connp->conn_ipversion == IPV4_VERSION) ||
@@ -2325,6 +2324,56 @@ tcp_init_values(tcp_t *tcp)
(connp->conn_ipversion == IPV4_VERSION ||
connp->conn_ipversion == IPV6_VERSION)));
+ if (parent == NULL) {
+ tcp->tcp_naglim = tcps->tcps_naglim_def;
+
+ tcp->tcp_rto_initial = tcps->tcps_rexmit_interval_initial;
+ tcp->tcp_rto_min = tcps->tcps_rexmit_interval_min;
+ tcp->tcp_rto_max = tcps->tcps_rexmit_interval_max;
+
+ tcp->tcp_first_ctimer_threshold =
+ tcps->tcps_ip_notify_cinterval;
+ tcp->tcp_second_ctimer_threshold =
+ tcps->tcps_ip_abort_cinterval;
+ tcp->tcp_first_timer_threshold = tcps->tcps_ip_notify_interval;
+ tcp->tcp_second_timer_threshold = tcps->tcps_ip_abort_interval;
+
+ tcp->tcp_fin_wait_2_flush_interval =
+ tcps->tcps_fin_wait_2_flush_interval;
+
+ tcp->tcp_ka_interval = tcps->tcps_keepalive_interval;
+ tcp->tcp_ka_abort_thres = tcps->tcps_keepalive_abort_interval;
+
+ /*
+ * Default value of tcp_init_cwnd is 0, so no need to set here
+ * if parent is NULL. But we need to inherit it from parent.
+ */
+ } else {
+ /* Inherit various TCP parameters from the parent. */
+ tcp->tcp_naglim = parent->tcp_naglim;
+
+ tcp->tcp_rto_initial = parent->tcp_rto_initial;
+ tcp->tcp_rto_min = parent->tcp_rto_min;
+ tcp->tcp_rto_max = parent->tcp_rto_max;
+
+ tcp->tcp_first_ctimer_threshold =
+ parent->tcp_first_ctimer_threshold;
+ tcp->tcp_second_ctimer_threshold =
+ parent->tcp_second_ctimer_threshold;
+ tcp->tcp_first_timer_threshold =
+ parent->tcp_first_timer_threshold;
+ tcp->tcp_second_timer_threshold =
+ parent->tcp_second_timer_threshold;
+
+ tcp->tcp_fin_wait_2_flush_interval =
+ parent->tcp_fin_wait_2_flush_interval;
+
+ tcp->tcp_ka_interval = parent->tcp_ka_interval;
+ tcp->tcp_ka_abort_thres = parent->tcp_ka_abort_thres;
+
+ tcp->tcp_init_cwnd = parent->tcp_init_cwnd;
+ }
+
/*
* Initialize tcp_rtt_sa and tcp_rtt_sd so that the calculated RTO
* will be close to tcp_rexmit_interval_initial. By doing this, we
@@ -2332,13 +2381,13 @@ tcp_init_values(tcp_t *tcp)
* during first few transmissions of a connection as seen in slow
* links.
*/
- tcp->tcp_rtt_sa = tcps->tcps_rexmit_interval_initial << 2;
- tcp->tcp_rtt_sd = tcps->tcps_rexmit_interval_initial >> 1;
- tcp->tcp_rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
+ tcp->tcp_rtt_sa = tcp->tcp_rto_initial << 2;
+ tcp->tcp_rtt_sd = tcp->tcp_rto_initial >> 1;
+ rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5) +
tcps->tcps_conn_grace_period;
- if (tcp->tcp_rto < tcps->tcps_rexmit_interval_min)
- tcp->tcp_rto = tcps->tcps_rexmit_interval_min;
+ TCP_SET_RTO(tcp, rto);
+
tcp->tcp_timer_backoff = 0;
tcp->tcp_ms_we_have_waited = 0;
tcp->tcp_last_recv_time = ddi_get_lbolt();
@@ -2348,17 +2397,6 @@ tcp_init_values(tcp_t *tcp)
tcp->tcp_maxpsz_multiplier = tcps->tcps_maxpsz_multiplier;
- tcp->tcp_first_timer_threshold = tcps->tcps_ip_notify_interval;
- tcp->tcp_first_ctimer_threshold = tcps->tcps_ip_notify_cinterval;
- tcp->tcp_second_timer_threshold = tcps->tcps_ip_abort_interval;
- /*
- * Fix it to tcp_ip_abort_linterval later if it turns out to be a
- * passive open.
- */
- tcp->tcp_second_ctimer_threshold = tcps->tcps_ip_abort_cinterval;
-
- tcp->tcp_naglim = tcps->tcps_naglim_def;
-
/* NOTE: ISS is now set in tcp_set_destination(). */
/* Reset fusion-related fields */
@@ -2388,9 +2426,6 @@ tcp_init_values(tcp_t *tcp)
*/
if (!connp->conn_debug)
connp->conn_debug = tcps->tcps_dbg;
-
- tcp->tcp_ka_interval = tcps->tcps_keepalive_interval;
- tcp->tcp_ka_abort_thres = tcps->tcps_keepalive_abort_interval;
}
/*
@@ -2674,7 +2709,7 @@ tcp_create_common(cred_t *credp, boolean_t isv6, boolean_t issocket,
SOCK_CONNID_INIT(tcp->tcp_connid);
/* DTrace ignores this - it isn't a tcp:::state-change */
tcp->tcp_state = TCPS_IDLE;
- tcp_init_values(tcp);
+ tcp_init_values(tcp, NULL);
return (connp);
}
diff --git a/usr/src/uts/common/inet/tcp/tcp_input.c b/usr/src/uts/common/inet/tcp/tcp_input.c
index 14f34a1591..ce00372741 100644
--- a/usr/src/uts/common/inet/tcp/tcp_input.c
+++ b/usr/src/uts/common/inet/tcp/tcp_input.c
@@ -1482,7 +1482,11 @@ tcp_input_listener(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
eager->tcp_detached = B_TRUE;
SOCK_CONNID_INIT(eager->tcp_connid);
- tcp_init_values(eager);
+ /*
+ * Initialize the eager's tcp_t and inherit some parameters from
+ * the listener.
+ */
+ tcp_init_values(eager, listener);
ASSERT((econnp->conn_ixa->ixa_flags &
(IXAF_SET_ULP_CKSUM | IXAF_VERIFY_SOURCE |
@@ -1573,16 +1577,6 @@ tcp_input_listener(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
}
}
- /* Inherit various TCP parameters from the listener */
- eager->tcp_naglim = listener->tcp_naglim;
- eager->tcp_first_timer_threshold = listener->tcp_first_timer_threshold;
- eager->tcp_second_timer_threshold =
- listener->tcp_second_timer_threshold;
- eager->tcp_first_ctimer_threshold =
- listener->tcp_first_ctimer_threshold;
- eager->tcp_second_ctimer_threshold =
- listener->tcp_second_ctimer_threshold;
-
/*
* tcp_set_destination() may set tcp_rwnd according to the route
* metrics. If it does not, the eager's receive window will be set
@@ -1590,12 +1584,6 @@ tcp_input_listener(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
*/
eager->tcp_rwnd = 0;
- /*
- * Inherit listener's tcp_init_cwnd. Need to do this before
- * calling tcp_process_options() which set the initial cwnd.
- */
- eager->tcp_init_cwnd = listener->tcp_init_cwnd;
-
if (is_system_labeled()) {
ip_xmit_attr_t *ixa = econnp->conn_ixa;
@@ -4427,7 +4415,7 @@ est:
* flushing the FIN_WAIT_2 connection.
*/
TCP_TIMER_RESTART(tcp,
- tcps->tcps_fin_wait_2_flush_interval);
+ tcp->tcp_fin_wait_2_flush_interval);
}
break;
case TCPS_FIN_WAIT_2:
@@ -5228,13 +5216,7 @@ tcp_set_rto(tcp_t *tcp, clock_t rtt)
*/
rto = (sa >> 3) + sv + tcps->tcps_rexmit_interval_extra + (sa >> 5);
- if (rto > tcps->tcps_rexmit_interval_max) {
- tcp->tcp_rto = tcps->tcps_rexmit_interval_max;
- } else if (rto < tcps->tcps_rexmit_interval_min) {
- tcp->tcp_rto = tcps->tcps_rexmit_interval_min;
- } else {
- tcp->tcp_rto = rto;
- }
+ TCP_SET_RTO(tcp, rto);
/* Now, we can reset tcp_timer_backoff to use the new RTO... */
tcp->tcp_timer_backoff = 0;
diff --git a/usr/src/uts/common/inet/tcp/tcp_opt_data.c b/usr/src/uts/common/inet/tcp/tcp_opt_data.c
index 0f46bf4a08..cdb7305a45 100644
--- a/usr/src/uts/common/inet/tcp/tcp_opt_data.c
+++ b/usr/src/uts/common/inet/tcp/tcp_opt_data.c
@@ -122,6 +122,14 @@ opdes_t tcp_opt_arr[] = {
{ TCP_CORK, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ TCP_RTO_INITIAL, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
+
+{ TCP_RTO_MIN, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
+
+{ TCP_RTO_MAX, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (uint32_t), 0 },
+
+{ TCP_LINGER2, IPPROTO_TCP, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+
{ IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP,
(OP_VARLEN|OP_NODEFAULT),
IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
@@ -401,6 +409,18 @@ tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
case TCP_CORK:
*i1 = tcp->tcp_cork;
return (sizeof (int));
+ case TCP_RTO_INITIAL:
+ *i1 = tcp->tcp_rto_initial;
+ return (sizeof (uint32_t));
+ case TCP_RTO_MIN:
+ *i1 = tcp->tcp_rto_min;
+ return (sizeof (uint32_t));
+ case TCP_RTO_MAX:
+ *i1 = tcp->tcp_rto_max;
+ return (sizeof (uint32_t));
+ case TCP_LINGER2:
+ *i1 = tcp->tcp_fin_wait_2_flush_interval / SECONDS;
+ return (sizeof (int));
}
break;
case IPPROTO_IP:
@@ -455,6 +475,7 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
int reterr;
tcp_stack_t *tcps = tcp->tcp_tcps;
conn_opt_arg_t coas;
+ uint32_t val = *((uint32_t *)invalp);
coas.coa_connp = connp;
coas.coa_ixa = connp->conn_ixa;
@@ -639,9 +660,7 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
}
/* Setting done in conn_opt_set */
break;
- case TCP_INIT_CWND: {
- uint32_t init_cwnd = *((uint32_t *)invalp);
-
+ case TCP_INIT_CWND:
if (checkonly)
break;
@@ -650,21 +669,20 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
* privilege to set the initial cwnd to be larger
* than allowed by RFC 3390.
*/
- if (init_cwnd <= MIN(4, MAX(2, 4380 / tcp->tcp_mss))) {
- tcp->tcp_init_cwnd = init_cwnd;
+ if (val <= MIN(4, MAX(2, 4380 / tcp->tcp_mss))) {
+ tcp->tcp_init_cwnd = val;
break;
}
if ((reterr = secpolicy_ip_config(cr, B_TRUE)) != 0) {
*outlenp = 0;
return (reterr);
}
- if (init_cwnd > tcp_max_init_cwnd) {
+ if (val > tcp_max_init_cwnd) {
*outlenp = 0;
return (EINVAL);
}
- tcp->tcp_init_cwnd = init_cwnd;
+ tcp->tcp_init_cwnd = val;
break;
- }
case TCP_KEEPALIVE_THRESHOLD:
if (checkonly)
break;
@@ -720,6 +738,108 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
tcp->tcp_cork = onoff;
}
break;
+ case TCP_RTO_INITIAL: {
+ clock_t rto;
+
+ if (checkonly || val == 0)
+ break;
+
+ /*
+ * Sanity checks
+ *
+ * The initial RTO should be bounded by the minimum
+ * and maximum RTO. And it should also be smaller
+ * than the connect attempt abort timeout. Otherwise,
+ * the connection won't be aborted in a period
+ * reasonably close to that timeout.
+ */
+ if (val < tcp->tcp_rto_min || val > tcp->tcp_rto_max ||
+ val > tcp->tcp_second_ctimer_threshold ||
+ val < tcps->tcps_rexmit_interval_initial_low ||
+ val > tcps->tcps_rexmit_interval_initial_high) {
+ *outlenp = 0;
+ return (EINVAL);
+ }
+ tcp->tcp_rto_initial = val;
+
+ /*
+ * If TCP has not sent anything, need to re-calculate
+ * tcp_rto. Otherwise, this option change does not
+ * really affect anything.
+ */
+ if (tcp->tcp_state >= TCPS_SYN_SENT)
+ break;
+
+ tcp->tcp_rtt_sa = tcp->tcp_rto_initial << 2;
+ tcp->tcp_rtt_sd = tcp->tcp_rto_initial >> 1;
+ rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
+ tcps->tcps_rexmit_interval_extra +
+ (tcp->tcp_rtt_sa >> 5) +
+ tcps->tcps_conn_grace_period;
+ TCP_SET_RTO(tcp, rto);
+ break;
+ }
+ case TCP_RTO_MIN:
+ if (checkonly || val == 0)
+ break;
+
+ if (val < tcps->tcps_rexmit_interval_min_low ||
+ val > tcps->tcps_rexmit_interval_min_high ||
+ val > tcp->tcp_rto_max) {
+ *outlenp = 0;
+ return (EINVAL);
+ }
+ tcp->tcp_rto_min = val;
+ if (tcp->tcp_rto < val)
+ tcp->tcp_rto = val;
+ break;
+ case TCP_RTO_MAX:
+ if (checkonly || val == 0)
+ break;
+
+ /*
+ * Sanity checks
+ *
+ * The maximum RTO should not be larger than the
+ * connection abort timeout. Otherwise, the
+ * connection won't be aborted in a period reasonably
+ * close to that timeout.
+ */
+ if (val < tcps->tcps_rexmit_interval_max_low ||
+ val > tcps->tcps_rexmit_interval_max_high ||
+ val < tcp->tcp_rto_min ||
+ val > tcp->tcp_second_timer_threshold) {
+ *outlenp = 0;
+ return (EINVAL);
+ }
+ tcp->tcp_rto_max = val;
+ if (tcp->tcp_rto > val)
+ tcp->tcp_rto = val;
+ break;
+ case TCP_LINGER2:
+ if (checkonly || *i1 == 0)
+ break;
+
+ /*
+ * Note that the option value's unit is seconds. The
+ * value must be no smaller than the lower bound of the
+ * private parameter tcp_fin_wait_2_flush_interval and
+ * no larger than the current value of that parameter.
+ * It is capped at the current value to prevent an app
+ * from setting TCP_LINGER2 to a big value, causing
+ * resources to be held up too long in FIN-WAIT-2
+ * state.
+ */
+ if (*i1 < 0 ||
+ tcps->tcps_fin_wait_2_flush_interval_low/SECONDS >
+ *i1 ||
+ tcps->tcps_fin_wait_2_flush_interval/SECONDS <
+ *i1) {
+ *outlenp = 0;
+ return (EINVAL);
+ }
+ tcp->tcp_fin_wait_2_flush_interval = *i1 * SECONDS;
+ break;
default:
break;
}
diff --git a/usr/src/uts/common/inet/tcp/tcp_timers.c b/usr/src/uts/common/inet/tcp/tcp_timers.c
index f2eaa3a958..d0e0401857 100644
--- a/usr/src/uts/common/inet/tcp/tcp_timers.c
+++ b/usr/src/uts/common/inet/tcp/tcp_timers.c
@@ -460,9 +460,9 @@ tcp_keepalive_timer(void *arg)
/*
* We should probe again at least
* in ka_intrvl, but not more than
- * tcp_rexmit_interval_max.
+ * tcp_rto_max.
*/
- max = tcps->tcps_rexmit_interval_max;
+ max = tcp->tcp_rto_max;
firetime = MIN(ka_intrvl - 1,
tcp->tcp_ka_last_intrvl << 1);
if (firetime > max)
@@ -624,6 +624,7 @@ tcp_timer(void *arg)
conn_t *connp = (conn_t *)arg;
tcp_t *tcp = connp->conn_tcp;
tcp_stack_t *tcps = tcp->tcp_tcps;
+ boolean_t dont_timeout = B_FALSE;
tcp->tcp_timer_tid = 0;
@@ -693,11 +694,29 @@ tcp_timer(void *arg)
case TCPS_SYN_SENT:
first_threshold = tcp->tcp_first_ctimer_threshold;
second_threshold = tcp->tcp_second_ctimer_threshold;
+
+ /* Retransmit forever unless this is a passive open... */
+ if (second_threshold == 0) {
+ if (!tcp->tcp_active_open) {
+ second_threshold =
+ tcps->tcps_ip_abort_linterval;
+ } else {
+ dont_timeout = B_TRUE;
+ }
+ }
break;
case TCPS_ESTABLISHED:
+ case TCPS_CLOSE_WAIT:
+ /*
+ * If the end point has not been closed, TCP can retransmit
+ * forever. But if the end point is closed, the normal
+ * timeout applies.
+ */
+ if (second_threshold == 0)
+ dont_timeout = B_TRUE;
+ /* FALLTHRU */
case TCPS_FIN_WAIT_1:
case TCPS_CLOSING:
- case TCPS_CLOSE_WAIT:
case TCPS_LAST_ACK:
/* If we have data to rexmit */
if (tcp->tcp_suna != tcp->tcp_snxt) {
@@ -844,7 +863,7 @@ tcp_timer(void *arg)
(void) tcp_clean_death(tcp, 0);
} else {
TCP_TIMER_RESTART(tcp,
- tcps->tcps_fin_wait_2_flush_interval);
+ tcp->tcp_fin_wait_2_flush_interval);
}
return;
case TCPS_TIME_WAIT:
@@ -868,8 +887,14 @@ tcp_timer(void *arg)
if (tcps->tcps_reclaim || (tcp->tcp_listen_cnt != NULL &&
tcp->tcp_listen_cnt->tlc_cnt > tcp->tcp_listen_cnt->tlc_max)) {
second_threshold = tcp_early_abort * SECONDS;
+
+ /* We will ignore the never timeout promise in this case... */
+ dont_timeout = B_FALSE;
}
+ if (!dont_timeout && second_threshold == 0)
+ second_threshold = tcps->tcps_ip_abort_interval;
+
if ((ms = tcp->tcp_ms_we_have_waited) > second_threshold) {
/*
* Should not hold the zero-copy messages for too long.
@@ -878,6 +903,9 @@ tcp_timer(void *arg)
tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp,
tcp->tcp_xmit_head, B_TRUE);
+ if (dont_timeout)
+ goto timer_rexmit;
+
/*
* For zero window probe, we need to send indefinitely,
* unless we have not heard from the other side for some
@@ -923,10 +951,10 @@ tcp_timer(void *arg)
* We don't need to decrement tcp_timer_backoff
* to avoid overflow because it will be decremented
* later if new timeout value is greater than
- * tcp_rexmit_interval_max. In the case when
- * tcp_rexmit_interval_max is greater than
- * second_threshold, it means that we will wait
- * longer than second_threshold to send the next
+ * tcp_rto_max. In the case when tcp_rto_max is
+ * greater than second_threshold, it means that we
+ * will wait longer than second_threshold to send
+ * the next
* window probe.
*/
tcp->tcp_ms_we_have_waited = second_threshold;
@@ -955,21 +983,23 @@ tcp_timer(void *arg)
tcp->tcp_rtt_update = 0;
}
}
+
+timer_rexmit:
tcp->tcp_timer_backoff++;
if ((ms = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5)) <
- tcps->tcps_rexmit_interval_min) {
+ tcp->tcp_rto_min) {
/*
* This means the original RTO is the connection's minimum RTO
* (tcp_rto_min). So we will use tcp_rto_min as the RTO value
* and do the backoff.
*/
- ms = tcps->tcps_rexmit_interval_min << tcp->tcp_timer_backoff;
+ ms = tcp->tcp_rto_min << tcp->tcp_timer_backoff;
} else {
ms <<= tcp->tcp_timer_backoff;
}
- if (ms > tcps->tcps_rexmit_interval_max) {
- ms = tcps->tcps_rexmit_interval_max;
+ if (ms > tcp->tcp_rto_max) {
+ ms = tcp->tcp_rto_max;
/*
* ms is at max, decrement tcp_timer_backoff to avoid
* overflow.
diff --git a/usr/src/uts/common/inet/tcp/tcp_tunables.c b/usr/src/uts/common/inet/tcp/tcp_tunables.c
index 1d01d9a1b1..9f54799fa1 100644
--- a/usr/src/uts/common/inet/tcp/tcp_tunables.c
+++ b/usr/src/uts/common/inet/tcp/tcp_tunables.c
@@ -19,8 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
*/
#include <inet/ip.h>
@@ -337,8 +336,8 @@ mod_prop_info_t tcp_propinfo_tbl[] = {
{ "tcp_fin_wait_2_flush_interval", MOD_PROTO_TCP,
mod_set_uint32, mod_get_uint32,
- {1*SECONDS, UINT32_MAX, 675*SECONDS},
- {675*SECONDS} },
+ {1*SECONDS, 2*HOURS, 60*SECONDS},
+ {60*SECONDS} },
{ "tcp_max_buf", MOD_PROTO_TCP,
mod_set_uint32, mod_get_uint32,
diff --git a/usr/src/uts/common/inet/tcp_impl.h b/usr/src/uts/common/inet/tcp_impl.h
index 9d99d933bd..46b12b27f0 100644
--- a/usr/src/uts/common/inet/tcp_impl.h
+++ b/usr/src/uts/common/inet/tcp_impl.h
@@ -228,6 +228,17 @@ typedef struct tcp_squeue_priv_s {
}
/*
+ * Set tcp_rto with boundary checking.
+ */
+#define TCP_SET_RTO(tcp, rto) \
+ if ((rto) < (tcp)->tcp_rto_min) \
+ (tcp)->tcp_rto = (tcp)->tcp_rto_min; \
+ else if ((rto) > (tcp)->tcp_rto_max) \
+ (tcp)->tcp_rto = (tcp)->tcp_rto_max; \
+ else \
+ (tcp)->tcp_rto = (rto);
+
+/*
* TCP options struct returned from tcp_parse_options.
*/
typedef struct tcp_opt_s {
@@ -428,9 +439,17 @@ extern uint32_t tcp_early_abort;
#define tcps_mss_max_ipv4 tcps_propinfo_tbl[17].prop_cur_uval
#define tcps_mss_min tcps_propinfo_tbl[18].prop_cur_uval
#define tcps_naglim_def tcps_propinfo_tbl[19].prop_cur_uval
+#define tcps_rexmit_interval_initial_high \
+ tcps_propinfo_tbl[20].prop_max_uval
#define tcps_rexmit_interval_initial tcps_propinfo_tbl[20].prop_cur_uval
+#define tcps_rexmit_interval_initial_low \
+ tcps_propinfo_tbl[20].prop_min_uval
+#define tcps_rexmit_interval_max_high tcps_propinfo_tbl[21].prop_max_uval
#define tcps_rexmit_interval_max tcps_propinfo_tbl[21].prop_cur_uval
+#define tcps_rexmit_interval_max_low tcps_propinfo_tbl[21].prop_min_uval
+#define tcps_rexmit_interval_min_high tcps_propinfo_tbl[22].prop_max_uval
#define tcps_rexmit_interval_min tcps_propinfo_tbl[22].prop_cur_uval
+#define tcps_rexmit_interval_min_low tcps_propinfo_tbl[22].prop_min_uval
#define tcps_deferred_ack_interval tcps_propinfo_tbl[23].prop_cur_uval
#define tcps_snd_lowat_fraction tcps_propinfo_tbl[24].prop_cur_uval
#define tcps_dupack_fast_retransmit tcps_propinfo_tbl[25].prop_cur_uval
@@ -441,7 +460,11 @@ extern uint32_t tcp_early_abort;
#define tcps_xmit_lowat tcps_propinfo_tbl[30].prop_cur_uval
#define tcps_recv_hiwat tcps_propinfo_tbl[31].prop_cur_uval
#define tcps_recv_hiwat_minmss tcps_propinfo_tbl[32].prop_cur_uval
+#define tcps_fin_wait_2_flush_interval_high \
+ tcps_propinfo_tbl[33].prop_max_uval
#define tcps_fin_wait_2_flush_interval tcps_propinfo_tbl[33].prop_cur_uval
+#define tcps_fin_wait_2_flush_interval_low \
+ tcps_propinfo_tbl[33].prop_min_uval
#define tcps_max_buf tcps_propinfo_tbl[34].prop_cur_uval
#define tcps_strong_iss tcps_propinfo_tbl[35].prop_cur_uval
#define tcps_rtt_updates tcps_propinfo_tbl[36].prop_cur_uval
@@ -527,7 +550,7 @@ extern int tcp_getpeername(sock_lower_handle_t, struct sockaddr *,
socklen_t *, cred_t *);
extern int tcp_getsockname(sock_lower_handle_t, struct sockaddr *,
socklen_t *, cred_t *);
-extern void tcp_init_values(tcp_t *);
+extern void tcp_init_values(tcp_t *, tcp_t *);
extern void tcp_ipsec_cleanup(tcp_t *);
extern int tcp_maxpsz_set(tcp_t *, boolean_t);
extern void tcp_mss_set(tcp_t *, uint32_t);
diff --git a/usr/src/uts/common/netinet/tcp.h b/usr/src/uts/common/netinet/tcp.h
index 046015d7c9..9a08545ab7 100644
--- a/usr/src/uts/common/netinet/tcp.h
+++ b/usr/src/uts/common/netinet/tcp.h
@@ -1,6 +1,26 @@
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
*/
/*
@@ -12,7 +32,6 @@
#ifndef _NETINET_TCP_H
#define _NETINET_TCP_H
-#pragma ident "%Z%%M% %I% %E% SMI"
/* tcp.h 1.11 88/08/19 SMI; from UCB 7.2 10/28/86 */
@@ -98,6 +117,10 @@ struct tcphdr {
#define TCP_KEEPALIVE_THRESHOLD 0x16
#define TCP_KEEPALIVE_ABORT_THRESHOLD 0x17
#define TCP_CORK 0x18
+#define TCP_RTO_INITIAL 0x19
+#define TCP_RTO_MIN 0x1A
+#define TCP_RTO_MAX 0x1B
+#define TCP_LINGER2 0x1C
/* gap for expansion of ``standard'' options */
#define TCP_ANONPRIVBIND 0x20 /* for internal use only */