diff options
author | Sebastien Roy <seb@delphix.com> | 2016-01-29 14:43:39 -0500 |
---|---|---|
committer | Richard Lowe <richlowe@richlowe.net> | 2019-08-19 22:32:46 +0000 |
commit | c12492cf73149aa0aa845af5d59966b0eb5aa910 (patch) | |
tree | 871b7cc8c1d5f4ee1e09b69771731adbca73b64e | |
parent | 519cca71df494bfdf951168b57893cdbe961647f (diff) | |
download | illumos-joyent-c12492cf73149aa0aa845af5d59966b0eb5aa910.tar.gz |
11546 Track TCP round-trip time in nanoseconds
Portions contributed by: Cody Peter Mello <cody.mello@joyent.com>
Portions contributed by: Brandon Baker <bbaker@delphix.com>
Reviewed by: Jason King <jason.king@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Dan McDonald <danmcd@joyent.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
-rw-r--r-- | usr/src/uts/common/inet/tcp.h | 8 | ||||
-rw-r--r-- | usr/src/uts/common/inet/tcp/tcp.c | 27 | ||||
-rw-r--r-- | usr/src/uts/common/inet/tcp/tcp_input.c | 133 | ||||
-rw-r--r-- | usr/src/uts/common/inet/tcp/tcp_opt_data.c | 18 | ||||
-rw-r--r-- | usr/src/uts/common/inet/tcp/tcp_output.c | 46 | ||||
-rw-r--r-- | usr/src/uts/common/inet/tcp/tcp_timers.c | 42 | ||||
-rw-r--r-- | usr/src/uts/common/inet/tcp_impl.h | 70 | ||||
-rw-r--r-- | usr/src/uts/common/sys/time.h | 6 |
8 files changed, 185 insertions, 165 deletions
diff --git a/usr/src/uts/common/inet/tcp.h b/usr/src/uts/common/inet/tcp.h index b2b9973291..9c5ffed2eb 100644 --- a/usr/src/uts/common/inet/tcp.h +++ b/usr/src/uts/common/inet/tcp.h @@ -22,7 +22,7 @@ * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, Joyent, Inc. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2014 by Delphix. All rights reserved. + * Copyright (c) 2014, 2016 by Delphix. All rights reserved. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -282,9 +282,9 @@ typedef struct tcp_s { uint32_t tcp_cwnd_max; uint32_t tcp_csuna; /* Clear (no rexmits in window) suna */ - clock_t tcp_rtt_sa; /* Round trip smoothed average */ - clock_t tcp_rtt_sd; /* Round trip smoothed deviation */ - clock_t tcp_rtt_update; /* Round trip update(s) */ + hrtime_t tcp_rtt_sa; /* Round trip smoothed average */ + hrtime_t tcp_rtt_sd; /* Round trip smoothed deviation */ + uint32_t tcp_rtt_update; /* Round trip update(s) */ clock_t tcp_ms_we_have_waited; /* Total retrans time */ uint32_t tcp_swl1; /* These help us avoid using stale */ diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c index d340aff2a5..ee1d75924e 100644 --- a/usr/src/uts/common/inet/tcp/tcp.c +++ b/usr/src/uts/common/inet/tcp/tcp.c @@ -23,7 +23,7 @@ * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, Joyent Inc. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2013,2014 by Delphix. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -266,8 +266,6 @@ typedef struct tcpt_s { /* * Functions called directly via squeue having a prototype of edesc_t. 
*/ -void tcp_input_listener(void *arg, mblk_t *mp, void *arg2, - ip_recv_attr_t *ira); void tcp_input_data(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira); static void tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2, @@ -640,15 +638,9 @@ tcp_set_destination(tcp_t *tcp) tcp->tcp_localnet = uinfo.iulp_localnet; if (uinfo.iulp_rtt != 0) { - clock_t rto; - - tcp->tcp_rtt_sa = uinfo.iulp_rtt; - tcp->tcp_rtt_sd = uinfo.iulp_rtt_sd; - rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + - tcps->tcps_rexmit_interval_extra + - (tcp->tcp_rtt_sa >> 5); - - TCP_SET_RTO(tcp, rto); + tcp->tcp_rtt_sa = MSEC2NSEC(uinfo.iulp_rtt); + tcp->tcp_rtt_sd = MSEC2NSEC(uinfo.iulp_rtt_sd); + tcp->tcp_rto = tcp_calculate_rto(tcp, tcps, 0); } if (uinfo.iulp_ssthresh != 0) tcp->tcp_cwnd_ssthresh = uinfo.iulp_ssthresh; @@ -2334,7 +2326,6 @@ tcp_init_values(tcp_t *tcp, tcp_t *parent) { tcp_stack_t *tcps = tcp->tcp_tcps; conn_t *connp = tcp->tcp_connp; - clock_t rto; ASSERT((connp->conn_family == AF_INET && connp->conn_ipversion == IPV4_VERSION) || @@ -2403,12 +2394,10 @@ tcp_init_values(tcp_t *tcp, tcp_t *parent) * during first few transmissions of a connection as seen in slow * links. 
*/ - tcp->tcp_rtt_sa = tcp->tcp_rto_initial << 2; - tcp->tcp_rtt_sd = tcp->tcp_rto_initial >> 1; - rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + - tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5) + - tcps->tcps_conn_grace_period; - TCP_SET_RTO(tcp, rto); + tcp->tcp_rtt_sa = MSEC2NSEC(tcp->tcp_rto_initial) << 2; + tcp->tcp_rtt_sd = MSEC2NSEC(tcp->tcp_rto_initial) >> 1; + tcp->tcp_rto = tcp_calculate_rto(tcp, tcps, + tcps->tcps_conn_grace_period); tcp->tcp_timer_backoff = 0; tcp->tcp_ms_we_have_waited = 0; diff --git a/usr/src/uts/common/inet/tcp/tcp_input.c b/usr/src/uts/common/inet/tcp/tcp_input.c index e917f7c774..0e12d23c3e 100644 --- a/usr/src/uts/common/inet/tcp/tcp_input.c +++ b/usr/src/uts/common/inet/tcp/tcp_input.c @@ -22,8 +22,8 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright 2016 Joyent, Inc. - * Copyright (c) 2014 by Delphix. All rights reserved. + * Copyright 2019 Joyent, Inc. + * Copyright (c) 2014, 2016 by Delphix. All rights reserved. */ /* This file contains all TCP input processing functions. */ @@ -166,7 +166,7 @@ static void tcp_process_options(tcp_t *, tcpha_t *); static mblk_t *tcp_reass(tcp_t *, mblk_t *, uint32_t); static void tcp_reass_elim_overlap(tcp_t *, mblk_t *); static void tcp_rsrv_input(void *, mblk_t *, void *, ip_recv_attr_t *); -static void tcp_set_rto(tcp_t *, time_t); +static void tcp_set_rto(tcp_t *, hrtime_t); static void tcp_setcred_data(mblk_t *, ip_recv_attr_t *); /* @@ -3362,7 +3362,7 @@ ok:; * and TCP_OLD_URP_INTERPRETATION is set. This implies that the urgent * byte was at seg_seq - 1, in which case we ignore the urgent flag. 
*/ - if (flags & TH_URG && urp >= 0) { + if ((flags & TH_URG) && urp >= 0) { if (!tcp->tcp_urp_last_valid || SEQ_GT(urp + seg_seq, tcp->tcp_urp_last)) { /* @@ -4304,36 +4304,29 @@ process_ack: SEQ_GT(seg_ack, tcp->tcp_urg)) tcp->tcp_valid_bits &= ~TCP_URG_VALID; - /* Can we update the RTT estimates? */ - if (tcp->tcp_snd_ts_ok) { - /* Ignore zero timestamp echo-reply. */ - if (tcpopt.tcp_opt_ts_ecr != 0) { - tcp_set_rto(tcp, (int32_t)LBOLT_FASTPATH - - (int32_t)tcpopt.tcp_opt_ts_ecr); - } - - /* If needed, restart the timer. */ - if (tcp->tcp_set_timer == 1) { - TCP_TIMER_RESTART(tcp, tcp->tcp_rto); - tcp->tcp_set_timer = 0; - } - /* - * Update tcp_csuna in case the other side stops sending - * us timestamps. - */ - tcp->tcp_csuna = tcp->tcp_snxt; - } else if (SEQ_GT(seg_ack, tcp->tcp_csuna)) { + /* + * Update the RTT estimates. Note that we don't use the TCP + * timestamp option to calculate RTT even if one is present. This is + * because the timestamp option's resolution (CPU tick) is + * too coarse to measure modern datacenter networks' microsecond + * latencies. The timestamp field's resolution is limited by its + * 4-byte width (see RFC1323), and since we always store a + * high-resolution nanosecond precision timestamp along with the data, + * there is no point to ever using the timestamp option. + */ + if (SEQ_GT(seg_ack, tcp->tcp_csuna)) { /* * An ACK sequence we haven't seen before, so get the RTT * and update the RTO. But first check if the timestamp is * valid to use.
*/ if ((mp1->b_next != NULL) && - SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next))) - tcp_set_rto(tcp, (int32_t)LBOLT_FASTPATH - - (int32_t)(intptr_t)mp1->b_prev); - else + SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next))) { + tcp_set_rto(tcp, gethrtime() - + (hrtime_t)(intptr_t)mp1->b_prev); + } else { TCPS_BUMP_MIB(tcps, tcpRttNoUpdate); + } /* Remeber the last sequence to be ACKed */ tcp->tcp_csuna = seg_ack; @@ -4362,7 +4355,7 @@ process_ack: if (SEQ_GT(seg_ack, (uint32_t)(uintptr_t)(mp1->b_next))) { mp1->b_prev = - (mblk_t *)(uintptr_t)LBOLT_FASTPATH; + (mblk_t *)(intptr_t)gethrtime(); mp1->b_next = NULL; } break; @@ -4839,7 +4832,7 @@ xmit_check: if (mp1 != NULL) { tcp->tcp_xmit_head->b_prev = - (mblk_t *)LBOLT_FASTPATH; + (mblk_t *)(intptr_t)gethrtime(); tcp->tcp_csuna = tcp->tcp_snxt; TCPS_BUMP_MIB(tcps, tcpRetransSegs); TCPS_UPDATE_MIB(tcps, tcpRetransBytes, @@ -4873,9 +4866,10 @@ xmit_check: * timer is used to avoid a timeout before the * limited transmitted segment's ACK gets back. */ - if (tcp->tcp_xmit_head != NULL) + if (tcp->tcp_xmit_head != NULL) { tcp->tcp_xmit_head->b_prev = - (mblk_t *)LBOLT_FASTPATH; + (mblk_t *)(intptr_t)gethrtime(); + } } /* Anything more to do? */ @@ -5211,26 +5205,26 @@ tcp_input_add_ancillary(tcp_t *tcp, mblk_t *mp, ip_pkt_t *ipp, return (mp); } -/* The minimum of smoothed mean deviation in RTO calculation. */ -#define TCP_SD_MIN 400 +/* The minimum of smoothed mean deviation in RTO calculation (nsec). */ +#define TCP_SD_MIN 400000000 /* - * Set RTO for this connection. The formula is from Jacobson and Karels' - * "Congestion Avoidance and Control" in SIGCOMM '88. The variable names - * are the same as those in Appendix A.2 of that paper. + * Set RTO for this connection based on a new round-trip time measurement. + * The formula is from Jacobson and Karels' "Congestion Avoidance and Control" + * in SIGCOMM '88. The variable names are the same as those in Appendix A.2 + * of that paper. 
* * m = new measurement * sa = smoothed RTT average (8 * average estimates). * sv = smoothed mean deviation (mdev) of RTT (4 * deviation estimates). */ static void -tcp_set_rto(tcp_t *tcp, clock_t rtt) +tcp_set_rto(tcp_t *tcp, hrtime_t rtt) { - long m = TICK_TO_MSEC(rtt); - clock_t sa = tcp->tcp_rtt_sa; - clock_t sv = tcp->tcp_rtt_sd; - clock_t rto; - tcp_stack_t *tcps = tcp->tcp_tcps; + hrtime_t m = rtt; + hrtime_t sa = tcp->tcp_rtt_sa; + hrtime_t sv = tcp->tcp_rtt_sd; + tcp_stack_t *tcps = tcp->tcp_tcps; TCPS_BUMP_MIB(tcps, tcpRttUpdate); tcp->tcp_rtt_update++; @@ -5238,11 +5232,24 @@ tcp_set_rto(tcp_t *tcp, clock_t rtt) /* tcp_rtt_sa is not 0 means this is a new sample. */ if (sa != 0) { /* - * Update average estimator: - * new rtt = 7/8 old rtt + 1/8 Error + * Update average estimator (see section 2.3 of RFC6298): + * SRTT = 7/8 SRTT + 1/8 rtt + * + * We maintain tcp_rtt_sa as 8 * SRTT, so this reduces to: + * tcp_rtt_sa = 7 * SRTT + rtt + * tcp_rtt_sa = 7 * (tcp_rtt_sa / 8) + rtt + * tcp_rtt_sa = tcp_rtt_sa - (tcp_rtt_sa / 8) + rtt + * tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa / 8)) + * tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa / 2^3)) + * tcp_rtt_sa = tcp_rtt_sa + (rtt - (tcp_rtt_sa >> 3)) + * + * (rtt - tcp_rtt_sa / 8) is simply the difference + * between the new rtt measurement and the existing smoothed + * RTT average. This is referred to as "Error" in subsequent + * calculations. */ - /* m is now Error in estimate. */ + /* m is now Error. 
*/ m -= sa >> 3; if ((sa += m) <= 0) { /* @@ -5255,7 +5262,13 @@ tcp_set_rto(tcp_t *tcp, clock_t rtt) /* * Update deviation estimator: - * new mdev = 3/4 old mdev + 1/4 (abs(Error) - old mdev) + * mdev = 3/4 mdev + 1/4 abs(Error) + * + * We maintain tcp_rtt_sd as 4 * mdev, so this reduces to: + * tcp_rtt_sd = 3 * mdev + abs(Error) + * tcp_rtt_sd = tcp_rtt_sd - (tcp_rtt_sd / 4) + abs(Error) + * tcp_rtt_sd = tcp_rtt_sd - (tcp_rtt_sd / 2^2) + abs(Error) + * tcp_rtt_sd = tcp_rtt_sd - (tcp_rtt_sd >> 2) + abs(Error) */ if (m < 0) m = -m; @@ -5275,33 +5288,21 @@ tcp_set_rto(tcp_t *tcp, clock_t rtt) } if (sv < TCP_SD_MIN) { /* - * We do not know that if sa captures the delay ACK - * effect as in a long train of segments, a receiver - * does not delay its ACKs. So set the minimum of sv - * to be TCP_SD_MIN, which is default to 400 ms, twice - * of BSD DATO. That means the minimum of mean + * Since a receiver doesn't delay its ACKs during a long run of + * segments, sa may not have captured the effect of delayed ACK + * timeouts on the RTT. To make sure we always account for the + * possible delay (and avoid the unnecessary retransmission), + * TCP_SD_MIN is set to 400ms, twice the delayed ACK timeout of + * 200ms on older SunOS/BSD systems and modern Windows systems + * (as of 2019). This means that the minimum possible mean * deviation is 100 ms. - * */ sv = TCP_SD_MIN; } tcp->tcp_rtt_sa = sa; tcp->tcp_rtt_sd = sv; - /* - * RTO = average estimates (sa / 8) + 4 * deviation estimates (sv) - * - * Add tcp_rexmit_interval extra in case of extreme environment - * where the algorithm fails to work. The default value of - * tcp_rexmit_interval_extra should be 0. - * - * As we use a finer grained clock than BSD and update - * RTO for every ACKs, add in another .25 of RTT to the - * deviation of RTO to accomodate burstiness of 1/4 of - * window size. 
- */ - rto = (sa >> 3) + sv + tcps->tcps_rexmit_interval_extra + (sa >> 5); - TCP_SET_RTO(tcp, rto); + tcp->tcp_rto = tcp_calculate_rto(tcp, tcps, 0); /* Now, we can reset tcp_timer_backoff to use the new RTO... */ tcp->tcp_timer_backoff = 0; diff --git a/usr/src/uts/common/inet/tcp/tcp_opt_data.c b/usr/src/uts/common/inet/tcp/tcp_opt_data.c index 40148b416a..5be23a2ad2 100644 --- a/usr/src/uts/common/inet/tcp/tcp_opt_data.c +++ b/usr/src/uts/common/inet/tcp/tcp_opt_data.c @@ -21,6 +21,8 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. + * Copyright (c) 2016 by Delphix. All rights reserved. */ #include <sys/types.h> @@ -869,9 +871,7 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, tcp->tcp_cork = onoff; } break; - case TCP_RTO_INITIAL: { - clock_t rto; - + case TCP_RTO_INITIAL: if (checkonly || val == 0) break; @@ -901,15 +901,11 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, if (tcp->tcp_state >= TCPS_SYN_SENT) break; - tcp->tcp_rtt_sa = tcp->tcp_rto_initial << 2; - tcp->tcp_rtt_sd = tcp->tcp_rto_initial >> 1; - rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + - tcps->tcps_rexmit_interval_extra + - (tcp->tcp_rtt_sa >> 5) + - tcps->tcps_conn_grace_period; - TCP_SET_RTO(tcp, rto); + tcp->tcp_rtt_sa = MSEC2NSEC(tcp->tcp_rto_initial) << 2; + tcp->tcp_rtt_sd = MSEC2NSEC(tcp->tcp_rto_initial) >> 1; + tcp->tcp_rto = tcp_calculate_rto(tcp, tcps, + tcps->tcps_conn_grace_period); break; - } case TCP_RTO_MIN: if (checkonly || val == 0) break; diff --git a/usr/src/uts/common/inet/tcp/tcp_output.c b/usr/src/uts/common/inet/tcp/tcp_output.c index 60840a3d54..c836076430 100644 --- a/usr/src/uts/common/inet/tcp/tcp_output.c +++ b/usr/src/uts/common/inet/tcp/tcp_output.c @@ -21,7 +21,8 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2014 by Delphix. 
All rights reserved. + * Copyright (c) 2014, 2016 by Delphix. All rights reserved. + * Copyright 2019 Joyent, Inc. */ /* This file contains all TCP output processing functions. */ @@ -58,12 +59,12 @@ static void tcp_wput_flush(tcp_t *, mblk_t *); static void tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp); static int tcp_xmit_end(tcp_t *); static int tcp_send(tcp_t *, const int, const int, const int, - const int, int *, uint_t *, int *, mblk_t **, mblk_t *); + const int, int *, uint32_t *, int *, mblk_t **, mblk_t *); static void tcp_xmit_early_reset(char *, mblk_t *, uint32_t, uint32_t, int, ip_recv_attr_t *, ip_stack_t *, conn_t *); static boolean_t tcp_send_rst_chk(tcp_stack_t *); static void tcp_process_shrunk_swnd(tcp_t *, uint32_t); -static void tcp_fill_header(tcp_t *, uchar_t *, clock_t, int); +static void tcp_fill_header(tcp_t *, uchar_t *, int); /* * Functions called directly via squeue having a prototype of edesc_t. @@ -454,7 +455,7 @@ data_null: } } - local_time = (mblk_t *)now; + local_time = (mblk_t *)(intptr_t)gethrtime(); /* * "Our" Nagle Algorithm. This is not the same as in the old @@ -1183,12 +1184,13 @@ tcp_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) snxt = tcp->tcp_snxt; /* - * Check to see if this connection has been idled for some - * time and no ACK is expected. If it is, we need to slow - * start again to get back the connection's "self-clock" as - * described in VJ's paper. + * Check to see if this connection has been idle for some time and no + * ACK is expected. If so, then the congestion window size is no longer + * meaningfully tied to current network conditions. * - * Reinitialize tcp_cwnd after idle. + * We reinitialize tcp_cwnd, and slow start again to get back the + * connection's "self-clock" as described in Van Jacobson's 1988 paper + * "Congestion avoidance and control". 
*/ now = LBOLT_FASTPATH; if ((tcp->tcp_suna == snxt) && !tcp->tcp_localnet && @@ -1256,7 +1258,7 @@ tcp_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) if ((mp1 = dupb(mp)) == 0) goto no_memory; - mp->b_prev = (mblk_t *)(uintptr_t)now; + mp->b_prev = (mblk_t *)(intptr_t)gethrtime(); mp->b_next = (mblk_t *)(uintptr_t)snxt; /* adjust tcp header information */ @@ -1311,12 +1313,10 @@ tcp_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) /* Fill in the timestamp option. */ if (tcp->tcp_snd_ts_ok) { - uint32_t llbolt = (uint32_t)LBOLT_FASTPATH; - - U32_TO_BE32(llbolt, - (char *)tcpha + TCP_MIN_HEADER_LENGTH+4); + U32_TO_BE32(now, + (char *)tcpha + TCP_MIN_HEADER_LENGTH + 4); U32_TO_BE32(tcp->tcp_ts_recent, - (char *)tcpha + TCP_MIN_HEADER_LENGTH+8); + (char *)tcpha + TCP_MIN_HEADER_LENGTH + 8); } else { ASSERT(connp->conn_ht_ulp_len == TCP_MIN_HEADER_LENGTH); } @@ -1771,7 +1771,7 @@ tcp_send_synack(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) static int tcp_send(tcp_t *tcp, const int mss, const int total_hdr_len, const int tcp_hdr_len, const int num_sack_blk, int *usable, - uint_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time) + uint32_t *snxt, int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time) { int num_lso_seg = 1; uint_t lso_usable; @@ -2066,7 +2066,7 @@ tcp_send(tcp_t *tcp, const int mss, const int total_hdr_len, * Fill in the header using the template header, and add * options such as time-stamp, ECN and/or SACK, as needed. */ - tcp_fill_header(tcp, rptr, (clock_t)local_time, num_sack_blk); + tcp_fill_header(tcp, rptr, num_sack_blk); mp->b_rptr = rptr; @@ -2284,8 +2284,8 @@ tcp_xmit_end(tcp_t *tcp) * So don't do any update. */ bzero(&uinfo, sizeof (uinfo)); - uinfo.iulp_rtt = tcp->tcp_rtt_sa; - uinfo.iulp_rtt_sd = tcp->tcp_rtt_sd; + uinfo.iulp_rtt = NSEC2MSEC(tcp->tcp_rtt_sa); + uinfo.iulp_rtt_sd = NSEC2MSEC(tcp->tcp_rtt_sd); /* * Note that uinfo is kept for conn_faddr in the DCE. 
Could update even @@ -3389,7 +3389,7 @@ tcp_sack_rexmit(tcp_t *tcp, uint_t *flags) /* * Update the send timestamp to avoid false retransmission. */ - snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt(); + snxt_mp->b_prev = (mblk_t *)(intptr_t)gethrtime(); TCPS_BUMP_MIB(tcps, tcpRetransSegs); TCPS_UPDATE_MIB(tcps, tcpRetransBytes, seg_len); @@ -3461,7 +3461,7 @@ tcp_ss_rexmit(tcp_t *tcp) * Update the send timestamp to avoid false * retransmission. */ - old_snxt_mp->b_prev = (mblk_t *)ddi_get_lbolt(); + old_snxt_mp->b_prev = (mblk_t *)(intptr_t)gethrtime(); TCPS_BUMP_MIB(tcps, tcpRetransSegs); TCPS_UPDATE_MIB(tcps, tcpRetransBytes, cnt); @@ -3621,7 +3621,7 @@ tcp_process_shrunk_swnd(tcp_t *tcp, uint32_t shrunk_count) * ECN and/or SACK. */ static void -tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk) +tcp_fill_header(tcp_t *tcp, uchar_t *rptr, int num_sack_blk) { tcpha_t *tcp_tmpl, *tcpha; uint32_t *dst, *src; @@ -3643,7 +3643,7 @@ tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now, int num_sack_blk) /* Fill time-stamp option if needed */ if (tcp->tcp_snd_ts_ok) { - U32_TO_BE32((uint32_t)now, + U32_TO_BE32(LBOLT_FASTPATH, (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 4); U32_TO_BE32(tcp->tcp_ts_recent, (char *)tcp_tmpl + TCP_MIN_HEADER_LENGTH + 8); diff --git a/usr/src/uts/common/inet/tcp/tcp_timers.c b/usr/src/uts/common/inet/tcp/tcp_timers.c index e3dba42c9b..b890bf6142 100644 --- a/usr/src/uts/common/inet/tcp/tcp_timers.c +++ b/usr/src/uts/common/inet/tcp/tcp_timers.c @@ -23,7 +23,7 @@ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright 2011 Joyent, Inc. All rights reserved. - * Copyright (c) 2014 by Delphix. All rights reserved. + * Copyright (c) 2014, 2016 by Delphix. All rights reserved. 
*/ #include <sys/types.h> @@ -751,15 +751,14 @@ tcp_timer(void *arg) case TCPS_LAST_ACK: /* If we have data to rexmit */ if (tcp->tcp_suna != tcp->tcp_snxt) { - clock_t time_to_wait; + clock_t time_to_wait; TCPS_BUMP_MIB(tcps, tcpTimRetrans); if (!tcp->tcp_xmit_head) break; - time_to_wait = ddi_get_lbolt() - - (clock_t)tcp->tcp_xmit_head->b_prev; - time_to_wait = tcp->tcp_rto - - TICK_TO_MSEC(time_to_wait); + time_to_wait = NSEC2MSEC(gethrtime() - + (hrtime_t)(intptr_t)tcp->tcp_xmit_head->b_prev); + time_to_wait = tcp->tcp_rto - time_to_wait; /* * If the timer fires too early, 1 clock tick earlier, * restart the timer. @@ -1012,8 +1011,8 @@ tcp_timer(void *arg) * window probe. */ if (tcp->tcp_rtt_sa != 0 && tcp->tcp_zero_win_probe == 0) { - tcp->tcp_rtt_sd += (tcp->tcp_rtt_sa >> 3) + - (tcp->tcp_rtt_sa >> 5); + tcp->tcp_rtt_sd += (tcp->tcp_rtt_sa >> 3) + + (tcp->tcp_rtt_sa >> 5); tcp->tcp_rtt_sa = 0; tcp_ip_notify(tcp); tcp->tcp_rtt_update = 0; @@ -1022,24 +1021,14 @@ tcp_timer(void *arg) timer_rexmit: tcp->tcp_timer_backoff++; - if ((ms = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd + - tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5)) < - tcp->tcp_rto_min) { - /* - * This means the original RTO is tcp_rexmit_interval_min. - * So we will use tcp_rexmit_interval_min as the RTO value - * and do the backoff. - */ - ms = tcp->tcp_rto_min << tcp->tcp_timer_backoff; - } else { - ms <<= tcp->tcp_timer_backoff; - } + /* + * Calculate the backed off retransmission timeout. If the shift brings + * us back over the max, then we repin the value, and decrement the + * backoff to avoid overflow. + */ + ms = tcp_calculate_rto(tcp, tcps, 0) << tcp->tcp_timer_backoff; if (ms > tcp->tcp_rto_max) { ms = tcp->tcp_rto_max; - /* - * ms is at max, decrement tcp_timer_backoff to avoid - * overflow.
- */ tcp->tcp_timer_backoff--; } tcp->tcp_ms_we_have_waited += ms; @@ -1059,8 +1048,9 @@ timer_rexmit: if (mss > tcp->tcp_swnd && tcp->tcp_swnd != 0) mss = tcp->tcp_swnd; - if ((mp = tcp->tcp_xmit_head) != NULL) - mp->b_prev = (mblk_t *)ddi_get_lbolt(); + if ((mp = tcp->tcp_xmit_head) != NULL) { + mp->b_prev = (mblk_t *)(intptr_t)gethrtime(); + } mp = tcp_xmit_mp(tcp, mp, mss, NULL, NULL, tcp->tcp_suna, B_TRUE, &mss, B_TRUE); diff --git a/usr/src/uts/common/inet/tcp_impl.h b/usr/src/uts/common/inet/tcp_impl.h index 4ef1886bae..b110a60fab 100644 --- a/usr/src/uts/common/inet/tcp_impl.h +++ b/usr/src/uts/common/inet/tcp_impl.h @@ -20,9 +20,9 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2016 Joyent, Inc. + * Copyright 2019 Joyent, Inc. * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. - * Copyright (c) 2013, 2014 by Delphix. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. */ #ifndef _INET_TCP_IMPL_H @@ -300,17 +300,6 @@ typedef struct tcp_squeue_priv_s { } /* - * Set tcp_rto with boundary checking. - */ -#define TCP_SET_RTO(tcp, rto) \ - if ((rto) < (tcp)->tcp_rto_min) \ - (tcp)->tcp_rto = (tcp)->tcp_rto_min; \ - else if ((rto) > (tcp)->tcp_rto_max) \ - (tcp)->tcp_rto = (tcp)->tcp_rto_max; \ - else \ - (tcp)->tcp_rto = (rto); - -/* * TCP options struct returned from tcp_parse_options. */ typedef struct tcp_opt_s { @@ -574,6 +563,61 @@ extern uint32_t tcp_early_abort; #define tcps_reass_timeout tcps_propinfo_tbl[59].prop_cur_uval #define tcps_iss_incr tcps_propinfo_tbl[65].prop_cur_uval + +/* + * As defined in RFC 6298, the RTO is the average estimates (SRTT) plus a + * multiple of the deviation estimates (K * RTTVAR): + * + * RTO = SRTT + max(G, K * RTTVAR) + * + * K is defined in the RFC as 4, and G is the clock granularity. 
We constrain + * the minimum mean deviation to TCP_SD_MIN when processing new RTTs, so this + * becomes: + * + * RTO = SRTT + 4 * RTTVAR + * + * In practice, however, we make several additions to it. As we use a finer + * grained clock than BSD and update RTO for every ACK, we add in another 1/4 of + * RTT to the deviation of RTO to accommodate burstiness of 1/4 of window size: + * + * RTO = SRTT + (SRTT / 4) + 4 * RTTVAR + * + * Since tcp_rtt_sa is 8 times the SRTT, and tcp_rtt_sd is 4 times the RTTVAR, + * this becomes: + * + * RTO = (tcp_rtt_sa / 8) + ((tcp_rtt_sa / 8) / 4) + tcp_rtt_sd + * RTO = (tcp_rtt_sa / 2^3) + (tcp_rtt_sa / 2^5) + tcp_rtt_sd + * RTO = (tcp_rtt_sa >> 3) + (tcp_rtt_sa >> 5) + tcp_rtt_sd + * + * The "tcp_rexmit_interval_extra" and "tcp_conn_grace_period" tunables are + * used to help account for extreme environments where the algorithm fails to + * work; by default they should be 0. (The latter tunable is only used for + * calculating the initial RTO, and so is optionally passed in as "extra".) We + * add them here: + * + * RTO = (tcp_rtt_sa >> 3) + (tcp_rtt_sa >> 5) + tcp_rtt_sd + + * tcps_rexmit_interval_extra + tcps_conn_grace_period + * + * We then pin the RTO within our configured boundaries (sections 2.4 and 2.5 + * of RFC 6298).
+ */ +static __GNU_INLINE clock_t +tcp_calculate_rto(tcp_t *tcp, tcp_stack_t *tcps, uint32_t extra) +{ + clock_t rto; + + rto = NSEC2MSEC((tcp->tcp_rtt_sa >> 3) + (tcp->tcp_rtt_sa >> 5) + + tcp->tcp_rtt_sd) + tcps->tcps_rexmit_interval_extra + extra; + + if (rto < tcp->tcp_rto_min) { + rto = tcp->tcp_rto_min; + } else if (rto > tcp->tcp_rto_max) { + rto = tcp->tcp_rto_max; + } + + return (rto); +} + extern struct qinit tcp_rinitv4, tcp_rinitv6; extern boolean_t do_tcp_fusion; diff --git a/usr/src/uts/common/sys/time.h b/usr/src/uts/common/sys/time.h index 81b4753049..8a36f622c3 100644 --- a/usr/src/uts/common/sys/time.h +++ b/usr/src/uts/common/sys/time.h @@ -18,7 +18,7 @@ */ /* - * Copyright (c) 2013, 2015 by Delphix. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. */ #ifndef _SYS_TIME_H @@ -247,8 +247,8 @@ struct itimerval32 { #define MSEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MILLISEC)) #define NSEC2MSEC(n) ((n) / (NANOSEC / MILLISEC)) -#define USEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MICROSEC)) -#define NSEC2USEC(n) ((n) / (NANOSEC / MICROSEC)) +#define USEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MICROSEC)) +#define NSEC2USEC(n) ((n) / (NANOSEC / MICROSEC)) #define NSEC2SEC(n) ((n) / (NANOSEC / SEC)) #define SEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / SEC)) |