summaryrefslogtreecommitdiff
path: root/usr/src/uts/common/inet/tcp/tcp_output.c
diff options
context:
space:
mode:
authorKacheong Poon <Kacheong.Poon@Sun.COM>2010-08-07 02:11:24 -0700
committerKacheong Poon <Kacheong.Poon@Sun.COM>2010-08-07 02:11:24 -0700
commitb7de80ed83be7d7a6c226533d3dfa88b4e2d85c1 (patch)
tree1ccbbc068c408600220a2b1ba51a88a90a198f36 /usr/src/uts/common/inet/tcp/tcp_output.c
parent63d763c84bef3a708cb75b6de1f5f809681e788c (diff)
downloadillumos-joyent-b7de80ed83be7d7a6c226533d3dfa88b4e2d85c1.tar.gz
6970847 Special option handling can be pulled out of tcp_xmit_mp()
Diffstat (limited to 'usr/src/uts/common/inet/tcp/tcp_output.c')
-rw-r--r--usr/src/uts/common/inet/tcp/tcp_output.c433
1 files changed, 245 insertions, 188 deletions
diff --git a/usr/src/uts/common/inet/tcp/tcp_output.c b/usr/src/uts/common/inet/tcp/tcp_output.c
index d562bfc462..017a6fb51a 100644
--- a/usr/src/uts/common/inet/tcp/tcp_output.c
+++ b/usr/src/uts/common/inet/tcp/tcp_output.c
@@ -2786,6 +2786,238 @@ tcp_xmit_listeners_reset(mblk_t *mp, ip_recv_attr_t *ira, ip_stack_t *ipst,
}
/*
+ * Helper function for tcp_xmit_mp() in handling connection set up flag
+ * options setting.
+ */
+static void
+tcp_xmit_mp_aux_iss(tcp_t *tcp, conn_t *connp, tcpha_t *tcpha, mblk_t *mp,
+ uint_t *flags)
+{
+ uint32_t u1;
+ uint8_t *wptr = mp->b_wptr;
+ tcp_stack_t *tcps = tcp->tcp_tcps;
+ boolean_t add_sack = B_FALSE;
+
+ /*
+ * If TCP_ISS_VALID and the seq number is tcp_iss,
+ * TCP can only be in SYN-SENT, SYN-RCVD or
+ * FIN-WAIT-1 state. It can be FIN-WAIT-1 if
+ * our SYN is not ack'ed but the app closes this
+ * TCP connection.
+ */
+ ASSERT(tcp->tcp_state == TCPS_SYN_SENT ||
+ tcp->tcp_state == TCPS_SYN_RCVD ||
+ tcp->tcp_state == TCPS_FIN_WAIT_1);
+
+ /*
+ * Tack on the MSS option. It is always needed
+ * for both active and passive open.
+ *
+ * MSS option value should be interface MTU - MIN
+ * TCP/IP header according to RFC 793 as it means
+ * the maximum segment size TCP can receive. But
+ * to get around some broken middle boxes/end hosts
+ * out there, we allow the option value to be the
+ * same as the MSS option size on the peer side.
+ * In this way, the other side will not send
+ * anything larger than they can receive.
+ *
+ * Note that for SYN_SENT state, the ndd param
+ * tcp_use_smss_as_mss_opt has no effect as we
+ * don't know the peer's MSS option value. So
+ * the only case we need to take care of is in
+ * SYN_RCVD state, which is done later.
+ */
+ wptr[0] = TCPOPT_MAXSEG;
+ wptr[1] = TCPOPT_MAXSEG_LEN;
+ wptr += 2;
+ u1 = tcp->tcp_initial_pmtu - (connp->conn_ipversion == IPV4_VERSION ?
+ IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) - TCP_MIN_HEADER_LENGTH;
+ U16_TO_BE16(u1, wptr);
+ wptr += 2;
+
+ /* Update the offset to cover the additional word */
+ tcpha->tha_offset_and_reserved += (1 << 4);
+
+ switch (tcp->tcp_state) {
+ case TCPS_SYN_SENT:
+ *flags = TH_SYN;
+
+ if (tcp->tcp_snd_sack_ok)
+ add_sack = B_TRUE;
+
+ if (tcp->tcp_snd_ts_ok) {
+ uint32_t llbolt = (uint32_t)LBOLT_FASTPATH;
+
+ if (add_sack) {
+ wptr[0] = TCPOPT_SACK_PERMITTED;
+ wptr[1] = TCPOPT_SACK_OK_LEN;
+ add_sack = B_FALSE;
+ } else {
+ wptr[0] = TCPOPT_NOP;
+ wptr[1] = TCPOPT_NOP;
+ }
+ wptr[2] = TCPOPT_TSTAMP;
+ wptr[3] = TCPOPT_TSTAMP_LEN;
+ wptr += 4;
+ U32_TO_BE32(llbolt, wptr);
+ wptr += 4;
+ ASSERT(tcp->tcp_ts_recent == 0);
+ U32_TO_BE32(0L, wptr);
+ wptr += 4;
+ tcpha->tha_offset_and_reserved += (3 << 4);
+ }
+
+ /*
+ * Set up all the bits to tell other side
+ * we are ECN capable.
+ */
+ if (tcp->tcp_ecn_ok)
+ *flags |= (TH_ECE | TH_CWR);
+
+ break;
+
+ case TCPS_SYN_RCVD:
+ *flags |= TH_SYN;
+
+ /*
+ * Reset the MSS option value to be SMSS
+ * We should probably add back the bytes
+ * for timestamp option and IPsec. We
+ * don't do that as this is a workaround
+ * for broken middle boxes/end hosts, it
+ * is better for us to be more cautious.
+ * They may not take these things into
+ * account in their SMSS calculation. Thus
+ * the peer's calculated SMSS may be smaller
+ * than what it can be. This should be OK.
+ */
+ if (tcps->tcps_use_smss_as_mss_opt) {
+ u1 = tcp->tcp_mss;
+ /*
+ * Note that wptr points just past the MSS
+ * option value.
+ */
+ U16_TO_BE16(u1, wptr - 2);
+ }
+
+ /*
+ * tcp_snd_ts_ok can only be set in TCPS_SYN_RCVD
+ * when the peer also uses timestamps option. And
+ * the TCP header template must have already been
+ * updated to include the timestamps option.
+ */
+ if (tcp->tcp_snd_sack_ok) {
+ if (tcp->tcp_snd_ts_ok) {
+ uint8_t *tmp_wptr;
+
+ /*
+ * Use the NOP in the header just
+ * before timestamps opton.
+ */
+ tmp_wptr = (uint8_t *)tcpha +
+ TCP_MIN_HEADER_LENGTH;
+ ASSERT(tmp_wptr[0] == TCPOPT_NOP &&
+ tmp_wptr[1] == TCPOPT_NOP);
+ tmp_wptr[0] = TCPOPT_SACK_PERMITTED;
+ tmp_wptr[1] = TCPOPT_SACK_OK_LEN;
+ } else {
+ add_sack = B_TRUE;
+ }
+ }
+
+
+ /*
+ * If the other side is ECN capable, reply
+ * that we are also ECN capable.
+ */
+ if (tcp->tcp_ecn_ok)
+ *flags |= TH_ECE;
+ break;
+
+ default:
+ /*
+ * The above ASSERT() makes sure that this
+ * must be FIN-WAIT-1 state. Our SYN has
+ * not been ack'ed so retransmit it.
+ */
+ *flags |= TH_SYN;
+ break;
+ }
+
+ if (add_sack) {
+ wptr[0] = TCPOPT_NOP;
+ wptr[1] = TCPOPT_NOP;
+ wptr[2] = TCPOPT_SACK_PERMITTED;
+ wptr[3] = TCPOPT_SACK_OK_LEN;
+ wptr += TCPOPT_REAL_SACK_OK_LEN;
+ tcpha->tha_offset_and_reserved += (1 << 4);
+ }
+
+ if (tcp->tcp_snd_ws_ok) {
+ wptr[0] = TCPOPT_NOP;
+ wptr[1] = TCPOPT_WSCALE;
+ wptr[2] = TCPOPT_WS_LEN;
+ wptr[3] = (uchar_t)tcp->tcp_rcv_ws;
+ wptr += TCPOPT_REAL_WS_LEN;
+ tcpha->tha_offset_and_reserved += (1 << 4);
+ }
+
+ mp->b_wptr = wptr;
+ u1 = (int)(mp->b_wptr - mp->b_rptr);
+ /*
+ * Get IP set to checksum on our behalf
+ * Include the adjustment for a source route if any.
+ */
+ u1 += connp->conn_sum;
+ u1 = (u1 >> 16) + (u1 & 0xFFFF);
+ tcpha->tha_sum = htons(u1);
+ TCPS_BUMP_MIB(tcps, tcpOutControl);
+}
+
+/*
+ * Helper function for tcp_xmit_mp() in handling connection tear down
+ * flag setting and state changes.
+ */
+static void
+tcp_xmit_mp_aux_fss(tcp_t *tcp, ip_xmit_attr_t *ixa, uint_t *flags)
+{
+ if (!tcp->tcp_fin_acked) {
+ *flags |= TH_FIN;
+ TCPS_BUMP_MIB(tcp->tcp_tcps, tcpOutControl);
+ }
+ if (!tcp->tcp_fin_sent) {
+ tcp->tcp_fin_sent = B_TRUE;
+ switch (tcp->tcp_state) {
+ case TCPS_SYN_RCVD:
+ tcp->tcp_state = TCPS_FIN_WAIT_1;
+ DTRACE_TCP6(state__change, void, NULL,
+ ip_xmit_attr_t *, ixa, void, NULL,
+ tcp_t *, tcp, void, NULL,
+ int32_t, TCPS_SYN_RCVD);
+ break;
+ case TCPS_ESTABLISHED:
+ tcp->tcp_state = TCPS_FIN_WAIT_1;
+ DTRACE_TCP6(state__change, void, NULL,
+ ip_xmit_attr_t *, ixa, void, NULL,
+ tcp_t *, tcp, void, NULL,
+ int32_t, TCPS_ESTABLISHED);
+ break;
+ case TCPS_CLOSE_WAIT:
+ tcp->tcp_state = TCPS_LAST_ACK;
+ DTRACE_TCP6(state__change, void, NULL,
+ ip_xmit_attr_t *, ixa, void, NULL,
+ tcp_t *, tcp, void, NULL,
+ int32_t, TCPS_CLOSE_WAIT);
+ break;
+ }
+ if (tcp->tcp_suna == tcp->tcp_snxt)
+ TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
+ tcp->tcp_snxt = tcp->tcp_fss + 1;
+ }
+}
+
+/*
* tcp_xmit_mp is called to return a pointer to an mblk chain complete with
* ip and tcp header ready to pass down to IP. If the mp passed in is
* non-NULL, then up to max_to_send bytes of data will be dup'ed off that
@@ -2820,7 +3052,7 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset,
/* Allocate for our maximum TCP header + link-level */
mp1 = allocb(connp->conn_ht_iphc_allocated + tcps->tcps_wroff_xtra,
BPRI_MED);
- if (!mp1)
+ if (mp1 == NULL)
return (NULL);
data_length = 0;
@@ -2926,201 +3158,24 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset,
}
}
+ /* Check if there is any special processing needs to be done. */
if (tcp->tcp_valid_bits) {
uint32_t u1;
+ /* We don't allow having SYN and FIN in the same segment... */
if ((tcp->tcp_valid_bits & TCP_ISS_VALID) &&
seq == tcp->tcp_iss) {
- uchar_t *wptr;
-
- /*
- * If TCP_ISS_VALID and the seq number is tcp_iss,
- * TCP can only be in SYN-SENT, SYN-RCVD or
- * FIN-WAIT-1 state. It can be FIN-WAIT-1 if
- * our SYN is not ack'ed but the app closes this
- * TCP connection.
- */
- ASSERT(tcp->tcp_state == TCPS_SYN_SENT ||
- tcp->tcp_state == TCPS_SYN_RCVD ||
- tcp->tcp_state == TCPS_FIN_WAIT_1);
-
- /*
- * Tack on the MSS option. It is always needed
- * for both active and passive open.
- *
- * MSS option value should be interface MTU - MIN
- * TCP/IP header according to RFC 793 as it means
- * the maximum segment size TCP can receive. But
- * to get around some broken middle boxes/end hosts
- * out there, we allow the option value to be the
- * same as the MSS option size on the peer side.
- * In this way, the other side will not send
- * anything larger than they can receive.
- *
- * Note that for SYN_SENT state, the ndd param
- * tcp_use_smss_as_mss_opt has no effect as we
- * don't know the peer's MSS option value. So
- * the only case we need to take care of is in
- * SYN_RCVD state, which is done later.
- */
- wptr = mp1->b_wptr;
- wptr[0] = TCPOPT_MAXSEG;
- wptr[1] = TCPOPT_MAXSEG_LEN;
- wptr += 2;
- u1 = tcp->tcp_initial_pmtu -
- (connp->conn_ipversion == IPV4_VERSION ?
- IP_SIMPLE_HDR_LENGTH : IPV6_HDR_LEN) -
- TCP_MIN_HEADER_LENGTH;
- U16_TO_BE16(u1, wptr);
- mp1->b_wptr = wptr + 2;
- /* Update the offset to cover the additional word */
- tcpha->tha_offset_and_reserved += (1 << 4);
-
- /*
- * Note that the following way of filling in
- * TCP options are not optimal. Some NOPs can
- * be saved. But there is no need at this time
- * to optimize it. When it is needed, we will
- * do it.
- */
- switch (tcp->tcp_state) {
- case TCPS_SYN_SENT:
- flags = TH_SYN;
-
- if (tcp->tcp_snd_ts_ok) {
- uint32_t llbolt =
- (uint32_t)LBOLT_FASTPATH;
-
- wptr = mp1->b_wptr;
- wptr[0] = TCPOPT_NOP;
- wptr[1] = TCPOPT_NOP;
- wptr[2] = TCPOPT_TSTAMP;
- wptr[3] = TCPOPT_TSTAMP_LEN;
- wptr += 4;
- U32_TO_BE32(llbolt, wptr);
- wptr += 4;
- ASSERT(tcp->tcp_ts_recent == 0);
- U32_TO_BE32(0L, wptr);
- mp1->b_wptr += TCPOPT_REAL_TS_LEN;
- tcpha->tha_offset_and_reserved +=
- (3 << 4);
- }
-
- /*
- * Set up all the bits to tell other side
- * we are ECN capable.
- */
- if (tcp->tcp_ecn_ok) {
- flags |= (TH_ECE | TH_CWR);
- }
- break;
- case TCPS_SYN_RCVD:
- flags |= TH_SYN;
-
- /*
- * Reset the MSS option value to be SMSS
- * We should probably add back the bytes
- * for timestamp option and IPsec. We
- * don't do that as this is a workaround
- * for broken middle boxes/end hosts, it
- * is better for us to be more cautious.
- * They may not take these things into
- * account in their SMSS calculation. Thus
- * the peer's calculated SMSS may be smaller
- * than what it can be. This should be OK.
- */
- if (tcps->tcps_use_smss_as_mss_opt) {
- u1 = tcp->tcp_mss;
- U16_TO_BE16(u1, wptr);
- }
-
- /*
- * If the other side is ECN capable, reply
- * that we are also ECN capable.
- */
- if (tcp->tcp_ecn_ok)
- flags |= TH_ECE;
- break;
- default:
- /*
- * The above ASSERT() makes sure that this
- * must be FIN-WAIT-1 state. Our SYN has
- * not been ack'ed so retransmit it.
- */
- flags |= TH_SYN;
- break;
- }
-
- if (tcp->tcp_snd_ws_ok) {
- wptr = mp1->b_wptr;
- wptr[0] = TCPOPT_NOP;
- wptr[1] = TCPOPT_WSCALE;
- wptr[2] = TCPOPT_WS_LEN;
- wptr[3] = (uchar_t)tcp->tcp_rcv_ws;
- mp1->b_wptr += TCPOPT_REAL_WS_LEN;
- tcpha->tha_offset_and_reserved += (1 << 4);
- }
-
- if (tcp->tcp_snd_sack_ok) {
- wptr = mp1->b_wptr;
- wptr[0] = TCPOPT_NOP;
- wptr[1] = TCPOPT_NOP;
- wptr[2] = TCPOPT_SACK_PERMITTED;
- wptr[3] = TCPOPT_SACK_OK_LEN;
- mp1->b_wptr += TCPOPT_REAL_SACK_OK_LEN;
- tcpha->tha_offset_and_reserved += (1 << 4);
- }
-
- /* allocb() of adequate mblk assures space */
- ASSERT((uintptr_t)(mp1->b_wptr - mp1->b_rptr) <=
- (uintptr_t)INT_MAX);
- u1 = (int)(mp1->b_wptr - mp1->b_rptr);
- /*
- * Get IP set to checksum on our behalf
- * Include the adjustment for a source route if any.
- */
- u1 += connp->conn_sum;
- u1 = (u1 >> 16) + (u1 & 0xFFFF);
- tcpha->tha_sum = htons(u1);
- TCPS_BUMP_MIB(tcps, tcpOutControl);
- }
- if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
+ /* Need to do connection set up processing. */
+ tcp_xmit_mp_aux_iss(tcp, connp, tcpha, mp1, &flags);
+ } else if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
(seq + data_length) == tcp->tcp_fss) {
- if (!tcp->tcp_fin_acked) {
- flags |= TH_FIN;
- TCPS_BUMP_MIB(tcps, tcpOutControl);
- }
- if (!tcp->tcp_fin_sent) {
- tcp->tcp_fin_sent = B_TRUE;
- switch (tcp->tcp_state) {
- case TCPS_SYN_RCVD:
- tcp->tcp_state = TCPS_FIN_WAIT_1;
- DTRACE_TCP6(state__change, void, NULL,
- ip_xmit_attr_t *, ixa, void, NULL,
- tcp_t *, tcp, void, NULL,
- int32_t, TCPS_SYN_RCVD);
- break;
- case TCPS_ESTABLISHED:
- tcp->tcp_state = TCPS_FIN_WAIT_1;
- DTRACE_TCP6(state__change, void, NULL,
- ip_xmit_attr_t *, ixa, void, NULL,
- tcp_t *, tcp, void, NULL,
- int32_t, TCPS_ESTABLISHED);
- break;
- case TCPS_CLOSE_WAIT:
- tcp->tcp_state = TCPS_LAST_ACK;
- DTRACE_TCP6(state__change, void, NULL,
- ip_xmit_attr_t *, ixa, void, NULL,
- tcp_t *, tcp, void, NULL,
- int32_t, TCPS_CLOSE_WAIT);
- break;
- }
- if (tcp->tcp_suna == tcp->tcp_snxt)
- TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
- tcp->tcp_snxt = tcp->tcp_fss + 1;
- }
+ /* Need to do connection tear down processing. */
+ tcp_xmit_mp_aux_fss(tcp, ixa, &flags);
}
+
/*
+ * Need to do urgent pointer processing.
+ *
* Note the trick here. u1 is unsigned. When tcp_urg
* is smaller than seq, u1 will become a very huge value.
* So the comparison will fail. Also note that tcp_urp
@@ -3138,6 +3193,7 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset,
tcp->tcp_rack = tcp->tcp_rnxt;
tcp->tcp_rack_cnt = 0;
+ /* Fill in the current value of timestamps option. */
if (tcp->tcp_snd_ts_ok) {
if (tcp->tcp_state != TCPS_SYN_SENT) {
uint32_t llbolt = (uint32_t)LBOLT_FASTPATH;
@@ -3149,6 +3205,7 @@ tcp_xmit_mp(tcp_t *tcp, mblk_t *mp, int32_t max_to_send, int32_t *offset,
}
}
+ /* Fill in the SACK blocks. */
if (num_sack_blk > 0) {
uchar_t *wptr = (uchar_t *)tcpha + connp->conn_ht_ulp_len;
sack_blk_t *tmp;