summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorSebastien Roy <seb@delphix.com>2016-01-29 14:43:39 -0500
committerRichard Lowe <richlowe@richlowe.net>2019-08-19 22:32:46 +0000
commitc12492cf73149aa0aa845af5d59966b0eb5aa910 (patch)
tree871b7cc8c1d5f4ee1e09b69771731adbca73b64e
parent519cca71df494bfdf951168b57893cdbe961647f (diff)
downloadillumos-joyent-c12492cf73149aa0aa845af5d59966b0eb5aa910.tar.gz
11546 Track TCP round-trip time in nanoseconds
Portions contributed by: Cody Peter Mello <cody.mello@joyent.com> Portions contributed by: Brandon Baker <bbaker@delphix.com> Reviewed by: Jason King <jason.king@joyent.com> Reviewed by: Robert Mustacchi <rm@joyent.com> Reviewed by: Dan McDonald <danmcd@joyent.com> Approved by: Richard Lowe <richlowe@richlowe.net>
-rw-r--r--usr/src/uts/common/inet/tcp.h8
-rw-r--r--usr/src/uts/common/inet/tcp/tcp.c27
-rw-r--r--usr/src/uts/common/inet/tcp/tcp_input.c133
-rw-r--r--usr/src/uts/common/inet/tcp/tcp_opt_data.c18
-rw-r--r--usr/src/uts/common/inet/tcp/tcp_output.c46
-rw-r--r--usr/src/uts/common/inet/tcp/tcp_timers.c42
-rw-r--r--usr/src/uts/common/inet/tcp_impl.h70
-rw-r--r--usr/src/uts/common/sys/time.h6
8 files changed, 185 insertions, 165 deletions
diff --git a/usr/src/uts/common/inet/tcp.h b/usr/src/uts/common/inet/tcp.h
index b2b9973291..9c5ffed2eb 100644
--- a/usr/src/uts/common/inet/tcp.h
+++ b/usr/src/uts/common/inet/tcp.h
@@ -22,7 +22,7 @@
* Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, Joyent, Inc. All rights reserved.
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -282,9 +282,9 @@ typedef struct tcp_s {
uint32_t tcp_cwnd_max;
uint32_t tcp_csuna; /* Clear (no rexmits in window) suna */
- clock_t tcp_rtt_sa; /* Round trip smoothed average */
- clock_t tcp_rtt_sd; /* Round trip smoothed deviation */
- clock_t tcp_rtt_update; /* Round trip update(s) */
+ hrtime_t tcp_rtt_sa; /* Round trip smoothed average */
+ hrtime_t tcp_rtt_sd; /* Round trip smoothed deviation */
+ uint32_t tcp_rtt_update; /* Round trip update(s) */
clock_t tcp_ms_we_have_waited; /* Total retrans time */
uint32_t tcp_swl1; /* These help us avoid using stale */
diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c
index d340aff2a5..ee1d75924e 100644
--- a/usr/src/uts/common/inet/tcp/tcp.c
+++ b/usr/src/uts/common/inet/tcp/tcp.c
@@ -23,7 +23,7 @@
* Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, Joyent Inc. All rights reserved.
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2013,2014 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
* Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -266,8 +266,6 @@ typedef struct tcpt_s {
/*
* Functions called directly via squeue having a prototype of edesc_t.
*/
-void tcp_input_listener(void *arg, mblk_t *mp, void *arg2,
- ip_recv_attr_t *ira);
void tcp_input_data(void *arg, mblk_t *mp, void *arg2,
ip_recv_attr_t *ira);
static void tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2,
@@ -640,15 +638,9 @@ tcp_set_destination(tcp_t *tcp)
tcp->tcp_localnet = uinfo.iulp_localnet;
if (uinfo.iulp_rtt != 0) {
- clock_t rto;
-
- tcp->tcp_rtt_sa = uinfo.iulp_rtt;
- tcp->tcp_rtt_sd = uinfo.iulp_rtt_sd;
- rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
- tcps->tcps_rexmit_interval_extra +
- (tcp->tcp_rtt_sa >> 5);
-
- TCP_SET_RTO(tcp, rto);
+ tcp->tcp_rtt_sa = MSEC2NSEC(uinfo.iulp_rtt);
+ tcp->tcp_rtt_sd = MSEC2NSEC(uinfo.iulp_rtt_sd);
+ tcp->tcp_rto = tcp_calculate_rto(tcp, tcps, 0);
}
if (uinfo.iulp_ssthresh != 0)
tcp->tcp_cwnd_ssthresh = uinfo.iulp_ssthresh;
@@ -2334,7 +2326,6 @@ tcp_init_values(tcp_t *tcp, tcp_t *parent)
{
tcp_stack_t *tcps = tcp->tcp_tcps;
conn_t *connp = tcp->tcp_connp;
- clock_t rto;
ASSERT((connp->conn_family == AF_INET &&
connp->conn_ipversion == IPV4_VERSION) ||
@@ -2403,12 +2394,10 @@ tcp_init_values(tcp_t *tcp, tcp_t *parent)
* during first few transmissions of a connection as seen in slow
* links.
*/
- tcp->tcp_rtt_sa = tcp->tcp_rto_initial << 2;
- tcp->tcp_rtt_sd = tcp->tcp_rto_initial >> 1;
- rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
- tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5) +
- tcps->tcps_conn_grace_period;
- TCP_SET_RTO(tcp, rto);
+ tcp->tcp_rtt_sa = MSEC2NSEC(tcp->tcp_rto_initial) << 2;
+ tcp->tcp_rtt_sd = MSEC2NSEC(tcp->tcp_rto_initial) >> 1;
+ tcp->tcp_rto = tcp_calculate_rto(tcp, tcps,
+ tcps->tcps_conn_grace_period);
tcp->tcp_timer_backoff = 0;
tcp->tcp_ms_we_have_waited = 0;
diff --git a/usr/src/uts/common/inet/tcp/tcp_input.c b/usr/src/uts/common/inet/tcp/tcp_input.c
index e917f7c774..0e12d23c3e 100644
--- a/usr/src/uts/common/inet/tcp/tcp_input.c
+++ b/usr/src/uts/common/inet/tcp/tcp_input.c
@@ -22,8 +22,8 @@
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright 2016 Joyent, Inc.
- * Copyright (c) 2014 by Delphix. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
+ * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
*/
/* This file contains all TCP input processing functions. */
@@ -166,7 +166,7 @@ static void tcp_process_options(tcp_t *, tcpha_t *);
static mblk_t *tcp_reass(tcp_t *, mblk_t *, uint32_t);
static void tcp_reass_elim_overlap(tcp_t *, mblk_t *);
static void tcp_rsrv_input(void *, mblk_t *, void *, ip_recv_attr_t *);
-static void tcp_set_rto(tcp_t *, time_t);
+static void tcp_set_rto(tcp_t *, hrtime_t);
static void tcp_setcred_data(mblk_t *, ip_recv_attr_t *);
/*
@@ -3362,7 +3362,7 @@ ok:;
* and TCP_OLD_URP_INTERPRETATION is set. This implies that the urgent
* byte was at seg_seq - 1, in which case we ignore the urgent flag.
*/
- if (flags & TH_URG && urp >= 0) {
+ if ((flags & TH_URG) && urp >= 0) {
if (!tcp->tcp_urp_last_valid ||
SEQ_GT(urp + seg_seq, tcp->tcp_urp_last)) {
/*
@@ -4304,36 +4304,29 @@ process_ack:
SEQ_GT(seg_ack, tcp->tcp_urg))
tcp->tcp_valid_bits &= ~TCP_URG_VALID;
- /* Can we update the RTT estimates? */
- if (tcp->tcp_snd_ts_ok) {
- /* Ignore zero timestamp echo-reply. */
- if (tcpopt.tcp_opt_ts_ecr != 0) {
- tcp_set_rto(tcp, (int32_t)LBOLT_FASTPATH -
- (int32_t)tcpopt.tcp_opt_ts_ecr);
- }
-
- /* If needed, restart the timer. */
- if (tcp->tcp_set_timer == 1) {
- TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
- tcp->tcp_set_timer = 0;
- }
- /*
- * Update tcp_csuna in case the other side stops sending
- * us timestamps.
- */
- tcp->tcp_csuna = tcp->tcp_snxt;
- } else if (SEQ_GT(seg_ack, tcp->tcp_csuna)) {
+ /*
+ * Update the RTT estimates. Note that we don't use the TCP
+ * timestamp option to calculate RTT even if one is present. This is
+ * because the timestamp option's resolution (CPU tick) is
+ * too coarse to measure modern datacenter networks' microsecond
+ * latencies. The timestamp field's resolution is limited by its
+ * 4-byte width (see RFC1323), and since we always store a
* high-resolution nanosecond precision timestamp along with the data,
+ * there is no point to ever using the timestamp option.
+ */
+ if (SEQ_GT(seg_ack, tcp->tcp_csuna)) {
/*
* An ACK sequence we haven't seen before, so get the RTT
* and update the RTO. But first check if the timestamp is
* valid to use.
*/
if ((mp1->b_next != NULL) &&
- SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next)))
- tcp_set_rto(tcp, (int32_t)LBOLT_FASTPATH -
- (int32_t)(intptr_t)mp1->b_prev);
- else
+ SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next))) {
+ tcp_set_rto(tcp, gethrtime() -
+ (hrtime_t)(intptr_t)mp1->b_prev);
+ } else {
TCPS_BUMP_MIB(tcps, tcpRttNoUpdate);
+ }
/* Remeber the last sequence to be ACKed */
tcp->tcp_csuna = seg_ack;
@@ -4362,7 +4355,7 @@ process_ack:
if (SEQ_GT(seg_ack,
(uint32_t)(uintptr_t)(mp1->b_next))) {
mp1->b_prev =
- (mblk_t *)(uintptr_t)LBOLT_FASTPATH;
+ (mblk_t *)(intptr_t)gethrtime();
mp1->b_next = NULL;
}
break;
@@ -4839,7 +4832,7 @@ xmit_check:
if (mp1 != NULL) {
tcp->tcp_xmit_head->b_prev =
- (mblk_t *)LBOLT_FASTPATH;
+ (mblk_t *)(intptr_t)gethrtime();
tcp->tcp_csuna = tcp->tcp_snxt;
TCPS_BUMP_MIB(tcps, tcpRetransSegs);
TCPS_UPDATE_MIB(tcps, tcpRetransBytes,
@@ -4873,9 +4866,10 @@ xmit_check:
* timer is used to avoid a timeout before the
* limited transmitted segment's ACK gets back.
*/
- if (tcp->tcp_xmit_head != NULL)
+ if (tcp->tcp_xmit_head != NULL) {
tcp->tcp_xmit_head->b_prev =
- (mblk_t *)LBOLT_FASTPATH;
+ (mblk_t *)(intptr_t)gethrtime();
+ }
}
/* Anything more to do? */
@@ -5211,26 +5205,26 @@ tcp_input_add_ancillary(tcp_t *tcp, mblk_t *mp, ip_pkt_t *ipp,
return (mp);
}
-/* The minimum of smoothed mean deviation in RTO calculation. */
-#define TCP_SD_MIN 400
+/* The minimum of smoothed mean deviation in RTO calculation (nsec). */
+#define TCP_SD_MIN 400000000
/*
- * Set RTO for this connection. The formula is from Jacobson and Karels'
- * "Congestion Avoidance and Control" in SIGCOMM '88. The variable names
- * are the same as those in Appendix A.2 of that paper.
+ * Set RTO for this connection based on a new round-trip time measurement.
+ * The formula is from Jacobson and Karels' "Congestion Avoidance and Control"
+ * in SIGCOMM '88. The variable names are the same as those in Appendix A.2
+ * of that paper.
*
* m = new measurement
* sa = smoothed RTT average (8 * average estimates).
* sv = smoothed mean deviation (mdev) of RTT (4 * deviation estimates).
*/
static void
-tcp_set_rto(tcp_t *tcp, clock_t rtt)
+tcp_set_rto(tcp_t *tcp, hrtime_t rtt)
{
- long m = TICK_TO_MSEC(rtt);
- clock_t sa = tcp->tcp_rtt_sa;
- clock_t sv = tcp->tcp_rtt_sd;
- clock_t rto;
- tcp_stack_t *tcps = tcp->tcp_tcps;
+ hrtime_t m = rtt;
+ hrtime_t sa = tcp->tcp_rtt_sa;
+ hrtime_t sv = tcp->tcp_rtt_sd;
+ tcp_stack_t *tcps = tcp->tcp_tcps;
TCPS_BUMP_MIB(tcps, tcpRttUpdate);
tcp->tcp_rtt_update++;
@@ -5238,11 +5232,24 @@ tcp_set_rto(tcp_t *tcp, clock_t rtt)
/* tcp_rtt_sa is not 0 means this is a new sample. */
if (sa != 0) {
/*
- * Update average estimator:
- * new rtt = 7/8 old rtt + 1/8 Error
+ * Update average estimator (see section 2.3 of RFC6298):
+ * SRTT = 7/8 SRTT + 1/8 rtt
+ *
+ * We maintain tcp_rtt_sa as 8 * SRTT, so this reduces to:
+ * tcp_rtt_sa = 7 * SRTT + rtt
+ * tcp_rtt_sa = 7 * (tcp_rtt_sa / 8) + rtt
+ * tcp_rtt_sa = tcp_rtt_sa - (tcp_rtt_sa / 8) + rtt
+ * tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa / 8))
+ * tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa / 2^3))
+ * tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa >> 3))
+ *
+ * (rtt - tcp_rtt_sa / 8) is simply the difference
+ * between the new rtt measurement and the existing smoothed
+ * RTT average. This is referred to as "Error" in subsequent
+ * calculations.
*/
- /* m is now Error in estimate. */
+ /* m is now Error. */
m -= sa >> 3;
if ((sa += m) <= 0) {
/*
@@ -5255,7 +5262,13 @@ tcp_set_rto(tcp_t *tcp, clock_t rtt)
/*
* Update deviation estimator:
- * new mdev = 3/4 old mdev + 1/4 (abs(Error) - old mdev)
+ * mdev = 3/4 mdev + 1/4 abs(Error)
+ *
+ * We maintain tcp_rtt_sd as 4 * mdev, so this reduces to:
+ * tcp_rtt_sd = 3 * mdev + abs(Error)
+ * tcp_rtt_sd = tcp_rtt_sd - (tcp_rtt_sd / 4) + abs(Error)
+ * tcp_rtt_sd = tcp_rtt_sd - (tcp_rtt_sd / 2^2) + abs(Error)
+ * tcp_rtt_sd = tcp_rtt_sd - (tcp_rtt_sd >> 2) + abs(Error)
*/
if (m < 0)
m = -m;
@@ -5275,33 +5288,21 @@ tcp_set_rto(tcp_t *tcp, clock_t rtt)
}
if (sv < TCP_SD_MIN) {
/*
- * We do not know that if sa captures the delay ACK
- * effect as in a long train of segments, a receiver
- * does not delay its ACKs. So set the minimum of sv
- * to be TCP_SD_MIN, which is default to 400 ms, twice
- * of BSD DATO. That means the minimum of mean
+ * Since a receiver doesn't delay its ACKs during a long run of
+ * segments, sa may not have captured the effect of delayed ACK
+ * timeouts on the RTT. To make sure we always account for the
+ * possible delay (and avoid the unnecessary retransmission),
+ * TCP_SD_MIN is set to 400ms, twice the delayed ACK timeout of
+ * 200ms on older SunOS/BSD systems and modern Windows systems
+ * (as of 2019). This means that the minimum possible mean
* deviation is 100 ms.
- *
*/
sv = TCP_SD_MIN;
}
tcp->tcp_rtt_sa = sa;
tcp->tcp_rtt_sd = sv;
- /*
- * RTO = average estimates (sa / 8) + 4 * deviation estimates (sv)
- *
- * Add tcp_rexmit_interval extra in case of extreme environment
- * where the algorithm fails to work. The default value of
- * tcp_rexmit_interval_extra should be 0.
- *
- * As we use a finer grained clock than BSD and update
- * RTO for every ACKs, add in another .25 of RTT to the
- * deviation of RTO to accomodate burstiness of 1/4 of
- * window size.
- */
- rto = (sa >> 3) + sv + tcps->tcps_rexmit_interval_extra + (sa >> 5);
- TCP_SET_RTO(tcp, rto);
+ tcp->tcp_rto = tcp_calculate_rto(tcp, tcps, 0);
/* Now, we can reset tcp_timer_backoff to use the new RTO... */
tcp->tcp_timer_backoff = 0;
diff --git a/usr/src/uts/common/inet/tcp/tcp_opt_data.c b/usr/src/uts/common/inet/tcp/tcp_opt_data.c
index 40148b416a..5be23a2ad2 100644
--- a/usr/src/uts/common/inet/tcp/tcp_opt_data.c
+++ b/usr/src/uts/common/inet/tcp/tcp_opt_data.c
@@ -21,6 +21,8 @@
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
+ * Copyright (c) 2016 by Delphix. All rights reserved.
*/
#include <sys/types.h>
@@ -869,9 +871,7 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
tcp->tcp_cork = onoff;
}
break;
- case TCP_RTO_INITIAL: {
- clock_t rto;
-
+ case TCP_RTO_INITIAL:
if (checkonly || val == 0)
break;
@@ -901,15 +901,11 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
if (tcp->tcp_state >= TCPS_SYN_SENT)
break;
- tcp->tcp_rtt_sa = tcp->tcp_rto_initial << 2;
- tcp->tcp_rtt_sd = tcp->tcp_rto_initial >> 1;
- rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
- tcps->tcps_rexmit_interval_extra +
- (tcp->tcp_rtt_sa >> 5) +
- tcps->tcps_conn_grace_period;
- TCP_SET_RTO(tcp, rto);
+ tcp->tcp_rtt_sa = MSEC2NSEC(tcp->tcp_rto_initial) << 2;
+ tcp->tcp_rtt_sd = MSEC2NSEC(tcp->tcp_rto_initial) >> 1;
+ tcp->tcp_rto = tcp_calculate_rto(tcp, tcps,
+ tcps->tcps_conn_grace_period);
break;
- }
case TCP_RTO_MIN:
if (checkonly || val == 0)
break;
diff --git a/usr/src/uts/common/inet/tcp/tcp_output.c b/usr/src/uts/common/inet/tcp/tcp_output.c
index 60840a3d54..c836076430 100644
--- a/usr/src/uts/common/inet/tcp/tcp_output.c
+++ b/usr/src/uts/common/inet/tcp/tcp_output.c
@@ -21,7 +21,8 @@
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
*/
/* This file contains all TCP output processing functions. */
@@ -58,12 +59,12 @@ static void tcp_wput_flush(tcp_t *, mblk_t *);
static void tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp);
static int tcp_xmit_end(tcp_t *);
static int tcp_send(tcp_t *, const int, const int, const int,
- const int, int *, uint_t *, int *, mblk_t **, mblk_t *);
+ const int, int *, uint32_t *, int *, mblk_t **, mblk_t *);
static void tcp_xmit_early_reset(char *, mblk_t *, uint32_t, uint32_t,
int, ip_recv_attr_t *, ip_stack_t *, conn_t *);
static boolean_t tcp_send_rst_chk(tcp_stack_t *);
static void tcp_process_shrunk_swnd(tcp_t *, uint32_t);
-static void tcp_fill_header(tcp_t *, uchar_t *, clock_t, int);
+static void tcp_fill_header(tcp_t *, uchar_t *, int);
/*
* Functions called directly via squeue having a prototype of edesc_t.
@@ -454,7 +455,7 @@ data_null:
}
}
- local_time = (mblk_t *)now;
+ local_time = (mblk_t *)(intptr_t)gethrtime();
/*
* "Our" Nagle Algorithm. This is not the same as in the old
@@ -1183,12 +1184,13 @@ tcp_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
snxt = tcp->tcp_snxt;
/*
- * Check to see if this connection has been idled for some
- * time and no ACK is expected. If it is, we need to slow
- * start again to get back the connection's "self-clock" as
- * described in VJ's paper.
+ * Check to see if this connection has been idle for some time and no
+ * ACK is expected. If so, then the congestion window size is no longer
+ * meaningfully tied to current network conditions.
*
- * Reinitialize tcp_cwnd after idle.
+ * We reinitialize tcp_cwnd, and slow start again to get back the
+ * connection's "self-clock" as described in Van Jacobson's 1988 paper
+ * "Congestion avoidance and control".
*/
now = LBOLT_FASTPATH;
if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet &&
@@ -1256,7 +1258,7 @@ tcp_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
if ((mp1 = dupb(mp)) == 0)
goto no_memory;
- mp->b_prev = (mblk_t *)(uintptr_t)now;
+ mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
mp->b_next = (mblk_t *)(uintptr_t)snxt;
/* adjust tcp header information */
@@ -1311,12 +1313,10 @@ tcp_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
/* Fill in the timestamp option. */
if (tcp->tcp_snd_ts_ok) {
- uint32_t llbolt = (uint32_t)LBOLT_FASTPATH;
-
- U32_TO_BE32(llbolt,
- (char *)tcpha + TCP_MIN_HEADER_LENGTH+4);
+ U32_TO_BE32(now,
+ (char *)tcpha + TCP_MIN_HEADER_LENGTH + 4);
U32_TO_BE32(tcp->tcp_ts_recent,
- (char *)tcpha + TCP_MIN_HEADER_LENGTH+8);
+ (char *)tcpha + TCP_MIN_HEADER_LENGTH + 8);
} else {
ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH);
}
@@ -1771,7 +1771,7 @@ tcp_send_synack(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
static int
tcp_send(tcp_t *tcp, const int mss, const int total_hdr_len,
const int tcp_hdr_len, const int num_sack_blk, int *usable,
- uint_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time)
+ uint32_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time)
{
int num_lso_seg = 1;
uint_t lso_usable;
@@ -2066,7 +2066,7 @@ tcp_send(tcp_t *tcp, const int mss, const int total_hdr_len,
* Fill in the header using the template header, and add
* options such as time-stamp, ECN and/or SACK, as needed.
*/
- tcp_fill_header(tcp, rptr, (clock_t)local_time, num_sack_blk);
+ tcp_fill_header(tcp, rptr, num_sack_blk);
mp->b_rptr = rptr;
@@ -2284,8 +2284,8 @@ tcp_xmit_end(tcp_t *tcp)
* So don't do any update.
*/
bzero(&uinfo, sizeof (uinfo));
- uinfo.iulp_rtt = tcp->tcp_rtt_sa;
- uinfo.iulp_rtt_sd = tcp->tcp_rtt_sd;
+ uinfo.iulp_rtt = NSEC2MSEC(tcp->tcp_rtt_sa);
+ uinfo.iulp_rtt_sd = NSEC2MSEC(tcp->tcp_rtt_sd);
/*
* Note that uinfo is kept for conn_faddr in the DCE. Could update even
@@ -3389,7 +3389,7 @@ tcp_sack_rexmit(tcp_t *tcp, uint_t *flags)
/*
* Update the send timestamp to avoid false retransmission.
*/
- snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt();
+ snxt_mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
TCPS_BUMP_MIB(tcps, tcpRetransSegs);
TCPS_UPDATE_MIB(tcps, tcpRetransBytes, seg_len);
@@ -3461,7 +3461,7 @@ tcp_ss_rexmit(tcp_t *tcp)
* Update the send timestamp to avoid false
* retransmission.
*/
- old_snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt();
+ old_snxt_mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
TCPS_BUMP_MIB(tcps, tcpRetransSegs);
TCPS_UPDATE_MIB(tcps, tcpRetransBytes, cnt);
@@ -3621,7 +3621,7 @@ tcp_process_shrunk_swnd(tcp_t *tcp, uint32_t shrunk_count)
* ECN and/or SACK.
*/
static void
-tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk)
+tcp_fill_header(tcp_t *tcp, uchar_t *rptr, int num_sack_blk)
{
tcpha_t *tcp_tmpl, *tcpha;
uint32_t *dst, *src;
@@ -3643,7 +3643,7 @@ tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk)
/* Fill time-stamp option if needed */
if (tcp->tcp_snd_ts_ok) {
- U32_TO_BE32((uint32_t)now,
+ U32_TO_BE32(LBOLT_FASTPATH,
(char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 4);
U32_TO_BE32(tcp->tcp_ts_recent,
(char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 8);
diff --git a/usr/src/uts/common/inet/tcp/tcp_timers.c b/usr/src/uts/common/inet/tcp/tcp_timers.c
index e3dba42c9b..b890bf6142 100644
--- a/usr/src/uts/common/inet/tcp/tcp_timers.c
+++ b/usr/src/uts/common/inet/tcp/tcp_timers.c
@@ -23,7 +23,7 @@
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright 2011 Joyent, Inc. All rights reserved.
- * Copyright (c) 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2014, 2016 by Delphix. All rights reserved.
*/
#include <sys/types.h>
@@ -751,15 +751,14 @@ tcp_timer(void *arg)
case TCPS_LAST_ACK:
/* If we have data to rexmit */
if (tcp->tcp_suna != tcp->tcp_snxt) {
- clock_t time_to_wait;
+ clock_t time_to_wait;
TCPS_BUMP_MIB(tcps, tcpTimRetrans);
if (!tcp->tcp_xmit_head)
break;
- time_to_wait = ddi_get_lbolt() -
- (clock_t)tcp->tcp_xmit_head->b_prev;
- time_to_wait = tcp->tcp_rto -
- TICK_TO_MSEC(time_to_wait);
+ time_to_wait = NSEC2MSEC(gethrtime() -
+ (hrtime_t)(intptr_t)tcp->tcp_xmit_head->b_prev);
+ time_to_wait = tcp->tcp_rto - time_to_wait;
/*
* If the timer fires too early, 1 clock tick earlier,
* restart the timer.
@@ -1012,8 +1011,8 @@ tcp_timer(void *arg)
* window probe.
*/
if (tcp->tcp_rtt_sa != 0 && tcp->tcp_zero_win_probe == 0) {
- tcp->tcp_rtt_sd += (tcp->tcp_rtt_sa >> 3) +
- (tcp->tcp_rtt_sa >> 5);
+ tcp->tcp_rtt_sd += (tcp->tcp_rtt_sa >> 3) +
+ (tcp->tcp_rtt_sa >> 5);
tcp->tcp_rtt_sa = 0;
tcp_ip_notify(tcp);
tcp->tcp_rtt_update = 0;
@@ -1022,24 +1021,14 @@ tcp_timer(void *arg)
timer_rexmit:
tcp->tcp_timer_backoff++;
- if ((ms = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
- tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5)) <
- tcp->tcp_rto_min) {
- /*
- * This means the original RTO is tcp_rexmit_interval_min.
- * So we will use tcp_rexmit_interval_min as the RTO value
- * and do the backoff.
- */
- ms = tcp->tcp_rto_min << tcp->tcp_timer_backoff;
- } else {
- ms <<= tcp->tcp_timer_backoff;
- }
+ /*
+ * Calculate the backed off retransmission timeout. If the shift brings
+ * us back over the max, then we repin the value, and decrement the
+ * backoff to avoid overflow.
+ */
+ ms = tcp_calculate_rto(tcp, tcps, 0) << tcp->tcp_timer_backoff;
if (ms > tcp->tcp_rto_max) {
ms = tcp->tcp_rto_max;
- /*
- * ms is at max, decrement tcp_timer_backoff to avoid
- * overflow.
- */
tcp->tcp_timer_backoff--;
}
tcp->tcp_ms_we_have_waited += ms;
@@ -1059,8 +1048,9 @@ timer_rexmit:
if (mss > tcp->tcp_swnd && tcp->tcp_swnd != 0)
mss = tcp->tcp_swnd;
- if ((mp = tcp->tcp_xmit_head) != NULL)
- mp->b_prev = (mblk_t *)ddi_get_lbolt();
+ if ((mp = tcp->tcp_xmit_head) != NULL) {
+ mp->b_prev = (mblk_t *)(intptr_t)gethrtime();
+ }
mp = tcp_xmit_mp(tcp, mp, mss, NULL, NULL, tcp->tcp_suna, B_TRUE, &mss,
B_TRUE);
diff --git a/usr/src/uts/common/inet/tcp_impl.h b/usr/src/uts/common/inet/tcp_impl.h
index 4ef1886bae..b110a60fab 100644
--- a/usr/src/uts/common/inet/tcp_impl.h
+++ b/usr/src/uts/common/inet/tcp_impl.h
@@ -20,9 +20,9 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2016 Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
- * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
*/
#ifndef _INET_TCP_IMPL_H
@@ -300,17 +300,6 @@ typedef struct tcp_squeue_priv_s {
}
/*
- * Set tcp_rto with boundary checking.
- */
-#define TCP_SET_RTO(tcp, rto) \
- if ((rto) < (tcp)->tcp_rto_min) \
- (tcp)->tcp_rto = (tcp)->tcp_rto_min; \
- else if ((rto) > (tcp)->tcp_rto_max) \
- (tcp)->tcp_rto = (tcp)->tcp_rto_max; \
- else \
- (tcp)->tcp_rto = (rto);
-
-/*
* TCP options struct returned from tcp_parse_options.
*/
typedef struct tcp_opt_s {
@@ -574,6 +563,61 @@ extern uint32_t tcp_early_abort;
#define tcps_reass_timeout tcps_propinfo_tbl[59].prop_cur_uval
#define tcps_iss_incr tcps_propinfo_tbl[65].prop_cur_uval
+
+/*
+ * As defined in RFC 6298, the RTO is the average estimates (SRTT) plus a
+ * multiple of the deviation estimates (K * RTTVAR):
+ *
+ * RTO = SRTT + max(G, K * RTTVAR)
+ *
+ * K is defined in the RFC as 4, and G is the clock granularity. We constrain
+ * the minimum mean deviation to TCP_SD_MIN when processing new RTTs, so this
+ * becomes:
+ *
+ * RTO = SRTT + 4 * RTTVAR
+ *
+ * In practice, however, we make several additions to it. As we use a finer
+ * grained clock than BSD and update RTO for every ACK, we add in another 1/4 of
+ * RTT to the deviation of RTO to accommodate burstiness of 1/4 of window size:
+ *
+ * RTO = SRTT + (SRTT / 4) + 4 * RTTVAR
+ *
+ * Since tcp_rtt_sa is 8 times the SRTT, and tcp_rtt_sd is 4 times the RTTVAR,
+ * this becomes:
+ *
+ * RTO = (tcp_rtt_sa / 8) + ((tcp_rtt_sa / 8) / 4) + tcp_rtt_sd
+ * RTO = (tcp_rtt_sa / 2^3) + (tcp_rtt_sa / 2^5) + tcp_rtt_sd
+ * RTO = (tcp_rtt_sa >> 3) + (tcp_rtt_sa >> 5) + tcp_rtt_sd
+ *
+ * The "tcp_rexmit_interval_extra" and "tcp_conn_grace_period" tunables are
+ * used to help account for extreme environments where the algorithm fails to
+ * work; by default they should be 0. (The latter tunable is only used for
+ * calculating the initial RTO, and so is optionally passed in as "extra".) We
+ * add them here:
+ *
+ * RTO = (tcp_rtt_sa >> 3) + (tcp_rtt_sa >> 5) + tcp_rtt_sd +
+ * tcps_rexmit_interval_extra + tcps_conn_grace_period
+ *
+ * We then pin the RTO within our configured boundaries (sections 2.4 and 2.5
+ * of RFC 6298).
+ */
+static __GNU_INLINE clock_t
+tcp_calculate_rto(tcp_t *tcp, tcp_stack_t *tcps, uint32_t extra)
+{
+ clock_t rto;
+
+ rto = NSEC2MSEC((tcp->tcp_rtt_sa >> 3) + (tcp->tcp_rtt_sa >> 5) +
+ tcp->tcp_rtt_sd) + tcps->tcps_rexmit_interval_extra + extra;
+
+ if (rto < tcp->tcp_rto_min) {
+ rto = tcp->tcp_rto_min;
+ } else if (rto > tcp->tcp_rto_max) {
+ rto = tcp->tcp_rto_max;
+ }
+
+ return (rto);
+}
+
extern struct qinit tcp_rinitv4, tcp_rinitv6;
extern boolean_t do_tcp_fusion;
diff --git a/usr/src/uts/common/sys/time.h b/usr/src/uts/common/sys/time.h
index 81b4753049..8a36f622c3 100644
--- a/usr/src/uts/common/sys/time.h
+++ b/usr/src/uts/common/sys/time.h
@@ -18,7 +18,7 @@
*/
/*
- * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
*/
#ifndef _SYS_TIME_H
@@ -247,8 +247,8 @@ struct itimerval32 {
#define MSEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MILLISEC))
#define NSEC2MSEC(n) ((n) / (NANOSEC / MILLISEC))
-#define USEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MICROSEC))
-#define NSEC2USEC(n) ((n) / (NANOSEC / MICROSEC))
+#define USEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MICROSEC))
+#define NSEC2USEC(n) ((n) / (NANOSEC / MICROSEC))
#define NSEC2SEC(n) ((n) / (NANOSEC / SEC))
#define SEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / SEC))