summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Lauri Tirkkonen <lotheac@iki.fi> 2015-04-15 16:30:46 +0300
committer Robert Mustacchi <rm@joyent.com> 2015-07-30 08:33:51 -0700
commit 1f183ba0b0be3e10202501aa3740753df6512804 (patch)
tree 95dd728b664493dad65dbd359044daebdadf8410
parent 11779b4caed6449da7f5cc8550aef185a2d06ba3 (diff)
download illumos-joyent-1f183ba0b0be3e10202501aa3740753df6512804.tar.gz
5850 tcp timestamping behavior changed mid-connection
Reviewed by: Dan McDonald <danmcd@omniti.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Sebastien Roy <sebastien.roy@delphix.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
-rw-r--r-- usr/src/man/man7p/tcp.7p 22
-rw-r--r-- usr/src/uts/common/inet/tcp/tcp_input.c 173
-rw-r--r-- usr/src/uts/common/inet/tcp/tcp_time_wait.c 25
-rw-r--r-- usr/src/uts/common/inet/tcp_impl.h 12
4 files changed, 110 insertions, 122 deletions
diff --git a/usr/src/man/man7p/tcp.7p b/usr/src/man/man7p/tcp.7p
index 6101fae25e..db739fc754 100644
--- a/usr/src/man/man7p/tcp.7p
+++ b/usr/src/man/man7p/tcp.7p
@@ -5,7 +5,7 @@
.\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License.
.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. See the License for the specific language governing permissions and limitations under the License.
.\" When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner]
-.TH TCP 7P "May 27, 2014"
+.TH TCP 7P "Apr 27, 2015"
.SH NAME
tcp, TCP \- Internet Transmission Control Protocol
.SH SYNOPSIS
@@ -40,7 +40,6 @@ tcp, TCP \- Internet Transmission Control Protocol
.fi
.SH DESCRIPTION
-.sp
.LP
\fBTCP\fR is the virtual circuit protocol of the Internet protocol family. It
provides reliable, flow-controlled, in order, two-way transmission of data. It
@@ -183,11 +182,13 @@ PRIV_SYS_NET_CONFIG privilege if it wants to specify a number greater than that
calculated by \fIRFC 3390\fR.
.sp
.LP
-SunOS supports \fBTCP\fR Extensions for High Performance (\fIRFC 1323\fR) which
-includes the window scale and timestamp options, and Protection Against Wrap
-Around Sequence Numbers (PAWS). SunOS also supports Selective Acknowledgment
-(SACK) capabilities (RFC 2018) and Explicit Congestion Notification (ECN)
-mechanism (\fIRFC 3168\fR).
+illumos supports \fBTCP\fR Extensions for High Performance (\fIRFC 7323\fR)
+which includes the window scale and timestamp options, and Protection Against
+Wrap Around Sequence Numbers (PAWS). Note that if timestamps are negotiated on
+a connection, received segments without timestamps on that connection are
+silently dropped per the suggestion in the RFC. illumos also supports Selective
+Acknowledgment (SACK) capabilities (RFC 2018) and Explicit Congestion
+Notification (ECN) mechanism (\fIRFC 3168\fR).
.sp
.LP
Turn on the window scale option in one of the following ways:
@@ -354,7 +355,6 @@ specifies the number of keep-alive probes to be sent before aborting the
connection in the event of no response from peer. TCP_KEEPINTVL specifies the
interval in seconds between successive keep-alive probes.
.SH SEE ALSO
-.sp
.LP
\fBsvcs\fR(1), \fBndd\fR(1M), \fBioctl\fR(2), \fBread\fR(2), \fBsvcadm\fR(1M),
\fBwrite\fR(2), \fBaccept\fR(3SOCKET), \fBbind\fR(3SOCKET),
@@ -376,15 +376,14 @@ Bellovin, S., \fIRFC 1948, Defending Against Sequence Number Attacks\fR, May
1996.
.sp
.LP
-Jacobson, V., Braden, R., and Borman, D., \fIRFC 1323, TCP Extensions for High
-Performance\fR, May 1992.
+D. Borman, B. Braden, V. Jacobson and R. Scheffenegger, Ed., \fIRFC 7323, TCP
+Extensions for High Performance\fR, September 2014.
.sp
.LP
Postel, Jon, \fIRFC 793, Transmission Control Protocol - DARPA Internet Program
Protocol Specification\fR, Network Information Center, SRI International, Menlo
Park, CA., September 1981.
.SH DIAGNOSTICS
-.sp
.LP
A socket operation may fail if:
.sp
@@ -466,7 +465,6 @@ The system ran out of memory for internal data structures.
.RE
.SH NOTES
-.sp
.LP
The \fBtcp\fR service is managed by the service management facility,
\fBsmf\fR(5), under the service identifier:
diff --git a/usr/src/uts/common/inet/tcp/tcp_input.c b/usr/src/uts/common/inet/tcp/tcp_input.c
index 45337d83d9..cf8e0c6bd4 100644
--- a/usr/src/uts/common/inet/tcp/tcp_input.c
+++ b/usr/src/uts/common/inet/tcp/tcp_input.c
@@ -51,7 +51,7 @@
#include <inet/ipsec_impl.h>
/*
- * RFC1323-recommended phrasing of TSTAMP option, for easier parsing
+ * RFC7323-recommended phrasing of TSTAMP option, for easier parsing
*/
#ifdef _BIG_ENDIAN
@@ -63,15 +63,6 @@
#endif
/*
- * Flags returned from tcp_parse_options.
- */
-#define TCP_OPT_MSS_PRESENT 1
-#define TCP_OPT_WSCALE_PRESENT 2
-#define TCP_OPT_TSTAMP_PRESENT 4
-#define TCP_OPT_SACK_OK_PRESENT 8
-#define TCP_OPT_SACK_PRESENT 16
-
-/*
* PAWS needs a timer for 24 days. This is the number of ticks in 24 days
*/
#define PAWS_TIMEOUT ((clock_t)(24*24*60*60*hz))
@@ -171,7 +162,6 @@ static void tcp_icmp_error_ipv6(tcp_t *, mblk_t *, ip_recv_attr_t *);
static mblk_t *tcp_input_add_ancillary(tcp_t *, mblk_t *, ip_pkt_t *,
ip_recv_attr_t *);
static void tcp_input_listener(void *, mblk_t *, void *, ip_recv_attr_t *);
-static int tcp_parse_options(tcpha_t *, tcp_opt_t *);
static void tcp_process_options(tcp_t *, tcpha_t *);
static mblk_t *tcp_reass(tcp_t *, mblk_t *, uint32_t);
static void tcp_reass_elim_overlap(tcp_t *, mblk_t *);
@@ -237,7 +227,7 @@ tcp_mss_set(tcp_t *tcp, uint32_t mss)
* Extract option values from a tcp header. We put any found values into the
* tcpopt struct and return a bitmask saying which options were found.
*/
-static int
+int
tcp_parse_options(tcpha_t *tcpha, tcp_opt_t *tcpopt)
{
uchar_t *endp;
@@ -251,6 +241,19 @@ tcp_parse_options(tcpha_t *tcpha, tcp_opt_t *tcpopt)
endp = up + TCP_HDR_LENGTH(tcpha);
up += TCP_MIN_HEADER_LENGTH;
+ /*
+ * If timestamp option is aligned as recommended in RFC 7323 Appendix
+ * A, and is the only option, return quickly.
+ */
+ if (TCP_HDR_LENGTH(tcpha) == (uint32_t)TCP_MIN_HEADER_LENGTH +
+ TCPOPT_REAL_TS_LEN &&
+ OK_32PTR(up) &&
+ *(uint32_t *)up == TCPOPT_NOP_NOP_TSTAMP) {
+ tcpopt->tcp_opt_ts_val = ABE32_TO_U32((up+4));
+ tcpopt->tcp_opt_ts_ecr = ABE32_TO_U32((up+8));
+
+ return (TCP_OPT_TSTAMP_PRESENT);
+ }
while (up < endp) {
len = endp - up;
switch (*up) {
@@ -686,82 +689,27 @@ tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp)
}
/*
- * This function does PAWS protection check. Returns B_TRUE if the
- * segment passes the PAWS test, else returns B_FALSE.
+ * This function does PAWS protection check, per RFC 7323 section 5. Requires
+ * that timestamp options are already processed into tcpoptp. Returns B_TRUE if
+ * the segment passes the PAWS test, else returns B_FALSE.
*/
boolean_t
-tcp_paws_check(tcp_t *tcp, tcpha_t *tcpha, tcp_opt_t *tcpoptp)
+tcp_paws_check(tcp_t *tcp, const tcp_opt_t *tcpoptp)
{
- uint8_t flags;
- int options;
- uint8_t *up;
- conn_t *connp = tcp->tcp_connp;
-
- flags = (unsigned int)tcpha->tha_flags & 0xFF;
- /*
- * If timestamp option is aligned nicely, get values inline,
- * otherwise call general routine to parse. Only do that
- * if timestamp is the only option.
- */
- if (TCP_HDR_LENGTH(tcpha) == (uint32_t)TCP_MIN_HEADER_LENGTH +
- TCPOPT_REAL_TS_LEN &&
- OK_32PTR((up = ((uint8_t *)tcpha) +
- TCP_MIN_HEADER_LENGTH)) &&
- *(uint32_t *)up == TCPOPT_NOP_NOP_TSTAMP) {
- tcpoptp->tcp_opt_ts_val = ABE32_TO_U32((up+4));
- tcpoptp->tcp_opt_ts_ecr = ABE32_TO_U32((up+8));
-
- options = TCP_OPT_TSTAMP_PRESENT;
- } else {
- if (tcp->tcp_snd_sack_ok) {
- tcpoptp->tcp = tcp;
+ if (TSTMP_LT(tcpoptp->tcp_opt_ts_val,
+ tcp->tcp_ts_recent)) {
+ if (LBOLT_FASTPATH64 <
+ (tcp->tcp_last_rcv_lbolt + PAWS_TIMEOUT)) {
+ /* This segment is not acceptable. */
+ return (B_FALSE);
} else {
- tcpoptp->tcp = NULL;
- }
- options = tcp_parse_options(tcpha, tcpoptp);
- }
-
- if (options & TCP_OPT_TSTAMP_PRESENT) {
- /*
- * Do PAWS per RFC 1323 section 4.2. Accept RST
- * regardless of the timestamp, page 18 RFC 1323.bis.
- */
- if ((flags & TH_RST) == 0 &&
- TSTMP_LT(tcpoptp->tcp_opt_ts_val,
- tcp->tcp_ts_recent)) {
- if (LBOLT_FASTPATH64 <
- (tcp->tcp_last_rcv_lbolt + PAWS_TIMEOUT)) {
- /* This segment is not acceptable. */
- return (B_FALSE);
- } else {
- /*
- * Connection has been idle for
- * too long. Reset the timestamp
- * and assume the segment is valid.
- */
- tcp->tcp_ts_recent =
- tcpoptp->tcp_opt_ts_val;
- }
+ /*
+ * Connection has been idle for
+ * too long. Reset the timestamp
+ */
+ tcp->tcp_ts_recent =
+ tcpoptp->tcp_opt_ts_val;
}
- } else {
- /*
- * If we don't get a timestamp on every packet, we
- * figure we can't really trust 'em, so we stop sending
- * and parsing them.
- */
- tcp->tcp_snd_ts_ok = B_FALSE;
-
- connp->conn_ht_iphc_len -= TCPOPT_REAL_TS_LEN;
- connp->conn_ht_ulp_len -= TCPOPT_REAL_TS_LEN;
- tcp->tcp_tcpha->tha_offset_and_reserved -= (3 << 4);
- /*
- * Adjust the tcp_mss and tcp_cwnd accordingly. We avoid
- * doing a slow start here so as to not to lose on the
- * transfer rate built up so far.
- */
- tcp_mss_set(tcp, tcp->tcp_mss + TCPOPT_REAL_TS_LEN);
- if (tcp->tcp_snd_sack_ok)
- tcp->tcp_max_sack_blk = 4;
}
return (B_TRUE);
}
@@ -2912,23 +2860,47 @@ tcp_input_data(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
new_swnd = ntohs(tcpha->tha_win) <<
((tcpha->tha_flags & TH_SYN) ? 0 : tcp->tcp_snd_ws);
- if (tcp->tcp_snd_ts_ok) {
- if (!tcp_paws_check(tcp, tcpha, &tcpopt)) {
+ /*
+ * We are interested in two TCP options: timestamps (if negotiated) and
+ * SACK (if negotiated). Skip option parsing if neither is negotiated.
+ */
+ if (tcp->tcp_snd_ts_ok || tcp->tcp_snd_sack_ok) {
+ int options;
+ if (tcp->tcp_snd_sack_ok)
+ tcpopt.tcp = tcp;
+ else
+ tcpopt.tcp = NULL;
+ options = tcp_parse_options(tcpha, &tcpopt);
+ /*
+ * RST segments must not be subject to PAWS and are not
+ * required to have timestamps.
+ */
+ if (tcp->tcp_snd_ts_ok && !(flags & TH_RST)) {
/*
- * This segment is not acceptable.
- * Drop it and send back an ACK.
+ * Per RFC 7323 section 3.2., silently drop non-RST
+ * segments without expected TSopt. This is a 'SHOULD'
+ * requirement.
*/
- freemsg(mp);
- flags |= TH_ACK_NEEDED;
- goto ack_check;
+ if (!(options & TCP_OPT_TSTAMP_PRESENT)) {
+ /*
+ * Leave a breadcrumb for people to detect this
+ * behavior.
+ */
+ DTRACE_TCP1(droppedtimestamp, tcp_t *, tcp);
+ freemsg(mp);
+ return;
+ }
+
+ if (!tcp_paws_check(tcp, &tcpopt)) {
+ /*
+ * This segment is not acceptable.
+ * Drop it and send back an ACK.
+ */
+ freemsg(mp);
+ flags |= TH_ACK_NEEDED;
+ goto ack_check;
+ }
}
- } else if (tcp->tcp_snd_sack_ok) {
- tcpopt.tcp = tcp;
- /*
- * SACK info in already updated in tcp_parse_options. Ignore
- * all other TCP options...
- */
- (void) tcp_parse_options(tcpha, &tcpopt);
}
try_again:;
mss = tcp->tcp_mss;
@@ -3221,11 +3193,10 @@ ok:;
}
/*
- * Check whether we can update tcp_ts_recent. This test is
- * NOT the one in RFC 1323 3.4. It is from Braden, 1993, "TCP
- * Extensions for High Performance: An Update", Internet Draft.
+ * Check whether we can update tcp_ts_recent. This test is from RFC
+ * 7323, section 5.3.
*/
- if (tcp->tcp_snd_ts_ok &&
+ if (tcp->tcp_snd_ts_ok && !(flags & TH_RST) &&
TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) &&
SEQ_LEQ(seg_seq, tcp->tcp_rack)) {
tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val;
diff --git a/usr/src/uts/common/inet/tcp/tcp_time_wait.c b/usr/src/uts/common/inet/tcp/tcp_time_wait.c
index adde51e745..b470934da0 100644
--- a/usr/src/uts/common/inet/tcp/tcp_time_wait.c
+++ b/usr/src/uts/common/inet/tcp/tcp_time_wait.c
@@ -517,10 +517,20 @@ tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq,
flags = (unsigned int)tcpha->tha_flags & 0xFF;
new_swnd = ntohs(tcpha->tha_win) <<
((tcpha->tha_flags & TH_SYN) ? 0 : tcp->tcp_snd_ws);
- if (tcp->tcp_snd_ts_ok) {
- if (!tcp_paws_check(tcp, tcpha, &tcpopt)) {
- tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt,
- tcp->tcp_rnxt, TH_ACK);
+
+ if (tcp->tcp_snd_ts_ok && !(tcpha->tha_flags & TH_RST)) {
+ int options;
+ if (tcp->tcp_snd_sack_ok)
+ tcpopt.tcp = tcp;
+ else
+ tcpopt.tcp = NULL;
+ options = tcp_parse_options(tcpha, &tcpopt);
+ if (!(options & TCP_OPT_TSTAMP_PRESENT)) {
+ DTRACE_TCP1(droppedtimestamp, tcp_t *, tcp);
+ goto done;
+ } else if (!tcp_paws_check(tcp, &tcpopt)) {
+ tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt,
+ TH_ACK);
goto done;
}
}
@@ -667,11 +677,10 @@ tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq,
}
}
/*
- * Check whether we can update tcp_ts_recent. This test is
- * NOT the one in RFC 1323 3.4. It is from Braden, 1993, "TCP
- * Extensions for High Performance: An Update", Internet Draft.
+ * Check whether we can update tcp_ts_recent. This test is from RFC
+ * 7323, section 5.3.
*/
- if (tcp->tcp_snd_ts_ok &&
+ if (tcp->tcp_snd_ts_ok && !(flags & TH_RST) &&
TSTMP_GEQ(tcpopt.tcp_opt_ts_val, tcp->tcp_ts_recent) &&
SEQ_LEQ(seg_seq, tcp->tcp_rack)) {
tcp->tcp_ts_recent = tcpopt.tcp_opt_ts_val;
diff --git a/usr/src/uts/common/inet/tcp_impl.h b/usr/src/uts/common/inet/tcp_impl.h
index 1b20e40aca..0f0f915a2b 100644
--- a/usr/src/uts/common/inet/tcp_impl.h
+++ b/usr/src/uts/common/inet/tcp_impl.h
@@ -291,6 +291,15 @@ typedef struct tcp_opt_s {
} tcp_opt_t;
/*
+ * Flags returned from tcp_parse_options.
+ */
+#define TCP_OPT_MSS_PRESENT 1
+#define TCP_OPT_WSCALE_PRESENT 2
+#define TCP_OPT_TSTAMP_PRESENT 4
+#define TCP_OPT_SACK_OK_PRESENT 8
+#define TCP_OPT_SACK_PRESENT 16
+
+/*
* Write-side flow-control is implemented via the per instance STREAMS
* write-side Q by explicitly setting QFULL to stop the flow of mblk_t(s)
* and clearing QFULL and calling qbackenable() to restart the flow based
@@ -653,7 +662,8 @@ extern void tcp_icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *);
extern void tcp_input_data(void *, mblk_t *, void *, ip_recv_attr_t *);
extern void tcp_input_listener_unbound(void *, mblk_t *, void *,
ip_recv_attr_t *);
-extern boolean_t tcp_paws_check(tcp_t *, tcpha_t *, tcp_opt_t *);
+extern boolean_t tcp_paws_check(tcp_t *, const tcp_opt_t *);
+extern int tcp_parse_options(tcpha_t *, tcp_opt_t *);
extern uint_t tcp_rcv_drain(tcp_t *);
extern void tcp_rcv_enqueue(tcp_t *, mblk_t *, uint_t, cred_t *);
extern boolean_t tcp_verifyicmp(conn_t *, void *, icmph_t *, icmp6_t *,