Diffstat (limited to 'usr/src')
-rw-r--r--  usr/src/uts/common/inet/ip.h                 1
-rw-r--r--  usr/src/uts/common/inet/ip/ip.c             20
-rw-r--r--  usr/src/uts/common/inet/ip/ip6.c            21
-rw-r--r--  usr/src/uts/common/inet/ip/ip_if.c           7
-rw-r--r--  usr/src/uts/common/inet/ip/ip_squeue.c      22
-rw-r--r--  usr/src/uts/common/inet/ip/ipclassifier.c    3
-rw-r--r--  usr/src/uts/common/inet/ipclassifier.h      13
-rw-r--r--  usr/src/uts/common/inet/tcp.h               33
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp.c          300
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp_kssl.c       9
10 files changed, 360 insertions(+), 69 deletions(-)
diff --git a/usr/src/uts/common/inet/ip.h b/usr/src/uts/common/inet/ip.h
index 0c853ef4cd..4e588a67c7 100644
--- a/usr/src/uts/common/inet/ip.h
+++ b/usr/src/uts/common/inet/ip.h
@@ -3513,6 +3513,7 @@ typedef void (*ipsq_func_t)(ipsq_t *, queue_t *, mblk_t *, void *);
#define SQTAG_UDP_WPUT 34
#define SQTAG_UDP_OUTPUT 35
#define SQTAG_TCP_KSSL_INPUT 36
+#define SQTAG_TCP_DROP_Q0 37
#endif /* _KERNEL */
diff --git a/usr/src/uts/common/inet/ip/ip.c b/usr/src/uts/common/inet/ip/ip.c
index 7896e52d23..080c503d27 100644
--- a/usr/src/uts/common/inet/ip/ip.c
+++ b/usr/src/uts/common/inet/ip/ip.c
@@ -4469,8 +4469,6 @@ ip_bind_v4(queue_t *q, mblk_t *mp, conn_t *connp)
sin = (sin_t *)ucp;
error = ip_bind_laddr(connp, mp, sin->sin_addr.s_addr,
sin->sin_port, ire_requested, ipsec_policy_set, B_TRUE);
- if (protocol == IPPROTO_TCP)
- connp->conn_recv = tcp_conn_request;
break;
case sizeof (ipa_conn_t):
@@ -4482,8 +4480,6 @@ ip_bind_v4(queue_t *q, mblk_t *mp, conn_t *connp)
error = ip_bind_connected(connp, mp, &ac->ac_laddr,
ac->ac_lport, ac->ac_faddr, ac->ac_fport, ire_requested,
ipsec_policy_set, B_TRUE, B_TRUE);
- if (protocol == IPPROTO_TCP)
- connp->conn_recv = tcp_input;
break;
case sizeof (ipa_conn_x_t):
@@ -4496,8 +4492,6 @@ ip_bind_v4(queue_t *q, mblk_t *mp, conn_t *connp)
acx->acx_conn.ac_lport, acx->acx_conn.ac_faddr,
acx->acx_conn.ac_fport, ire_requested, ipsec_policy_set,
B_TRUE, (acx->acx_flags & ACX_VERIFY_DST) != 0);
- if (protocol == IPPROTO_TCP)
- connp->conn_recv = tcp_input;
break;
}
if (error == EINPROGRESS)
@@ -4691,7 +4685,14 @@ ip_bind_laddr(conn_t *connp, mblk_t *mp, ipaddr_t src_addr, uint16_t lport,
connp->conn_fport = 0;
/*
* Do we need to add a check to reject Multicast packets?
+ *
+ * We need to make sure that the conn_recv is set to a non-null
+ * value before we insert the conn into the classifier table.
+ * This is to avoid a race with an incoming packet which does an
+ * ipcl_classify().
*/
+ if (*mp->b_wptr == IPPROTO_TCP)
+ connp->conn_recv = tcp_conn_request;
error = ipcl_bind_insert(connp, *mp->b_wptr, src_addr, lport);
}
@@ -4707,6 +4708,8 @@ ip_bind_laddr(conn_t *connp, mblk_t *mp, ipaddr_t src_addr, uint16_t lport,
/* Falls through to bad_addr */
}
}
+ } else if (connp->conn_ulp == IPPROTO_TCP) {
+ connp->conn_recv = tcp_input;
}
bad_addr:
if (error != 0) {
@@ -5139,7 +5142,12 @@ ip_bind_connected(conn_t *connp, mblk_t *mp, ipaddr_t *src_addrp,
/*
* The addresses have been verified. Time to insert in
* the correct fanout list.
+ * We need to make sure that the conn_recv is set to a non-null
+ * value before we insert into the classifier table to avoid a
+ * race with an incoming packet which does an ipcl_classify().
*/
+ if (protocol == IPPROTO_TCP)
+ connp->conn_recv = tcp_input;
error = ipcl_conn_insert(connp, protocol, src_addr,
dst_addr, connp->conn_ports);
}
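
Every hunk in ip.c above makes the same ordering fix: conn_recv is assigned
before the conn is inserted into the classifier table, so an incoming packet
that finds the conn via ipcl_classify() can never see a NULL handler. Below
is a minimal, single-threaded userspace sketch of that publish-before-insert
discipline; conn_t, fanout_head, and tcp_input_stub are simplified stand-ins
for the kernel definitions, not the real types or API.

    #include <stddef.h>

    typedef struct conn_s {
            void (*conn_recv)(struct conn_s *, void *); /* packet handler */
            struct conn_s *next;                        /* hash-chain link */
    } conn_t;

    static conn_t *fanout_head;         /* one classifier hash bucket */

    static void
    tcp_input_stub(conn_t *connp, void *mp)
    {
            (void) connp;               /* consume the packet */
            (void) mp;
    }

    static void
    classifier_insert(conn_t *connp)
    {
            /*
             * From here on, a concurrent lookup can find the conn and
             * call conn_recv, so the handler must already be in place.
             */
            connp->next = fanout_head;
            fanout_head = connp;
    }

    static void
    bind_conn(conn_t *connp)
    {
            connp->conn_recv = tcp_input_stub;  /* publish handler first */
            classifier_insert(connp);           /* then make conn visible */
    }

In the kernel the insertion happens under the fanout lock, which also
supplies the memory ordering this single-threaded sketch glosses over.
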
diff --git a/usr/src/uts/common/inet/ip/ip6.c b/usr/src/uts/common/inet/ip/ip6.c
index 913bcf307d..9e57397740 100644
--- a/usr/src/uts/common/inet/ip/ip6.c
+++ b/usr/src/uts/common/inet/ip/ip6.c
@@ -2329,8 +2329,6 @@ ip_bind_v6(queue_t *q, mblk_t *mp, conn_t *connp, ip6_pkt_t *ipp)
goto bad_addr;
connp->conn_pkt_isv6 = B_TRUE;
}
- if (protocol == IPPROTO_TCP)
- connp->conn_recv = tcp_conn_request;
} else {
/*
* Bind to local and remote address. Local might be
@@ -2377,8 +2375,6 @@ ip_bind_v6(queue_t *q, mblk_t *mp, conn_t *connp, ip6_pkt_t *ipp)
goto bad_addr;
connp->conn_pkt_isv6 = B_TRUE;
}
- if (protocol == IPPROTO_TCP)
- connp->conn_recv = tcp_input;
}
/* Update qinfo if v4/v6 changed */
if ((orig_pkt_isv6 != connp->conn_pkt_isv6) &&
@@ -2571,6 +2567,15 @@ ip_bind_laddr_v6(conn_t *connp, mblk_t *mp, const in6_addr_t *v6src,
connp->conn_remv6 = ipv6_all_zeros;
connp->conn_lport = lport;
connp->conn_fport = 0;
+
+ /*
+ * We need to make sure that the conn_recv is set to a non-null
+ * value before we insert the conn_t into the classifier table.
+ * This is to avoid a race with an incoming packet which does
+ * an ipcl_classify().
+ */
+ if (*mp->b_wptr == IPPROTO_TCP)
+ connp->conn_recv = tcp_conn_request;
error = ipcl_bind_insert_v6(connp, *mp->b_wptr, v6src, lport);
}
if (error == 0) {
@@ -2585,6 +2590,8 @@ ip_bind_laddr_v6(conn_t *connp, mblk_t *mp, const in6_addr_t *v6src,
goto bad_addr;
}
}
+ } else if (connp->conn_ulp == IPPROTO_TCP) {
+ connp->conn_recv = tcp_input;
}
bad_addr:
if (error != 0) {
@@ -3048,7 +3055,13 @@ ip_bind_connected_v6(conn_t *connp, mblk_t *mp, in6_addr_t *v6src,
/*
* The addresses have been verified. Time to insert in
* the correct fanout list.
+ * We need to make sure that the conn_recv is set to a non-null
+ * value before we insert the conn_t into the classifier table.
+ * This is to avoid a race with an incoming packet which does
+ * an ipcl_classify().
*/
+ if (protocol == IPPROTO_TCP)
+ connp->conn_recv = tcp_input;
error = ipcl_conn_insert_v6(connp, protocol, v6src, v6dst,
connp->conn_ports,
IPCL_IS_TCP(connp) ? connp->conn_tcp->tcp_bound_if : 0);
diff --git a/usr/src/uts/common/inet/ip/ip_if.c b/usr/src/uts/common/inet/ip/ip_if.c
index 8aea44af15..af650a20cc 100644
--- a/usr/src/uts/common/inet/ip/ip_if.c
+++ b/usr/src/uts/common/inet/ip/ip_if.c
@@ -7843,6 +7843,13 @@ ipsq_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
* Use the preallocated ill_unbind_conn for this purpose
*/
connp = ill->ill_dls_capab->ill_unbind_conn;
+
+ ASSERT(!connp->conn_tcp->tcp_closemp.b_prev);
+ TCP_DEBUG_GETPCSTACK(connp->conn_tcp->tcmp_stk, 15);
+ if (connp->conn_tcp->tcp_closemp.b_prev == NULL)
+ connp->conn_tcp->tcp_closemp_used = 1;
+ else
+ connp->conn_tcp->tcp_closemp_used++;
mp = &connp->conn_tcp->tcp_closemp;
CONN_INC_REF(connp);
squeue_enter(sqp, mp, ip_squeue_clean, connp, NULL);
diff --git a/usr/src/uts/common/inet/ip/ip_squeue.c b/usr/src/uts/common/inet/ip/ip_squeue.c
index 5c1da7c964..d11098ec0e 100644
--- a/usr/src/uts/common/inet/ip/ip_squeue.c
+++ b/usr/src/uts/common/inet/ip/ip_squeue.c
@@ -785,15 +785,29 @@ ip_squeue_get(ill_rx_ring_t *ill_rx_ring)
KM_NOSLEEP);
mutex_enter(&ill->ill_lock);
- if (!interrupt || ill_rx_ring->rr_ring_state != ILL_RING_INUSE ||
- taskq_arg == NULL) {
+ /*
+ * Check sqp under the lock again for atomicity. Possible race with
+ * a previously scheduled ip_squeue_get -> ip_squeue_extend.
+ * Do the ring to squeue binding only if we are in interrupt context
+ * AND the ring is not already bound AND there is no one else trying
+ * the bind already.
+ */
+ sqp = ill_rx_ring->rr_sqp;
+ if (sqp != NULL || !interrupt ||
+ ill_rx_ring->rr_ring_state != ILL_RING_INUSE || taskq_arg == NULL) {
/*
- * Do the ring to squeue binding only if we are in interrupt
- * context and there is no one else trying the bind already.
+ * Note that the ring might get bound once we drop the lock
+ * below, if a previous request is in progress i.e. if the ring
+ * state is ILL_RING_INPROC. The incoming connection on whose
+ * behalf we are currently here might get a suboptimal squeue
+ * via the call to IP_SQUEUE_GET below, but there is no
+ * correctness issue.
*/
mutex_exit(&ill->ill_lock);
if (taskq_arg != NULL)
kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
+ if (sqp != NULL)
+ return (sqp);
return (IP_SQUEUE_GET(lbolt));
}
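
The rewritten check in ip_squeue_get() re-reads rr_sqp under ill_lock and
returns early when the ring is already bound, rather than racing to bind it
a second time. Here is a compact pthreads sketch of that check-under-the-
lock, then-fall-back pattern; the ring and squeue types are simplified
stand-ins for the kernel's.

    #include <pthread.h>
    #include <stddef.h>

    typedef struct squeue { int sq_id; } squeue_t;

    typedef struct {
            squeue_t *rr_sqp;   /* squeue bound to this rx ring, or NULL */
    } rx_ring_t;

    static pthread_mutex_t ill_lock = PTHREAD_MUTEX_INITIALIZER;
    static squeue_t default_sq;         /* stand-in for IP_SQUEUE_GET() */

    static squeue_t *
    ring_get_squeue(rx_ring_t *ring)
    {
            squeue_t *sqp;

            pthread_mutex_lock(&ill_lock);
            /* Re-check under the lock: a racer may have bound the ring. */
            sqp = ring->rr_sqp;
            pthread_mutex_unlock(&ill_lock);

            if (sqp != NULL)
                    return (sqp);       /* already bound; reuse it */
            return (&default_sq);       /* fall back to a default squeue */
    }
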
diff --git a/usr/src/uts/common/inet/ip/ipclassifier.c b/usr/src/uts/common/inet/ip/ipclassifier.c
index 4507496c97..f90b7a844c 100644
--- a/usr/src/uts/common/inet/ip/ipclassifier.c
+++ b/usr/src/uts/common/inet/ip/ipclassifier.c
@@ -1155,6 +1155,9 @@ ipcl_conn_insert(conn_t *connp, uint8_t protocol, ipaddr_t src,
IPCL_HASH_REMOVE(connp);
mutex_enter(&connfp->connf_lock);
}
+
+ ASSERT(connp->conn_recv != NULL);
+
IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp);
mutex_exit(&connfp->connf_lock);
break;
diff --git a/usr/src/uts/common/inet/ipclassifier.h b/usr/src/uts/common/inet/ipclassifier.h
index 9b959f219a..03d510fdbf 100644
--- a/usr/src/uts/common/inet/ipclassifier.h
+++ b/usr/src/uts/common/inet/ipclassifier.h
@@ -295,8 +295,8 @@ struct connf_s {
};
#define CONN_INC_REF(connp) { \
- DTRACE_PROBE1(conn__inc__ref, conn_t *, connp); \
mutex_enter(&(connp)->conn_lock); \
+ DTRACE_PROBE1(conn__inc__ref, conn_t *, connp); \
ASSERT(conn_trace_ref(connp)); \
(connp)->conn_ref++; \
ASSERT((connp)->conn_ref != 0); \
@@ -312,9 +312,16 @@ struct connf_s {
}
#define CONN_DEC_REF(connp) { \
- DTRACE_PROBE1(conn__dec__ref, conn_t *, connp); \
mutex_enter(&(connp)->conn_lock); \
- if ((connp)->conn_ref <= 0) \
+ DTRACE_PROBE1(conn__dec__ref, conn_t *, connp); \
+ /* \
+ * The squeue framework always does a CONN_DEC_REF after return \
+ * from TCP. Hence the refcnt must be at least 2 if conn_on_sqp \
+ * is B_TRUE and conn_ref is being decremented. This is to \
+ * account for the mblk being currently processed. \
+ */ \
+ if ((connp)->conn_ref <= 0 || \
+ ((connp)->conn_ref == 1 && (connp)->conn_on_sqp)) \
cmn_err(CE_PANIC, "CONN_DEC_REF: connp(%p) has ref " \
"= %d\n", (void *)(connp), (connp)->conn_ref); \
ASSERT(conn_untrace_ref(connp)); \
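
The CONN_DEC_REF change moves the DTrace probe inside conn_lock and
strengthens the sanity check: because the squeue framework itself does a
CONN_DEC_REF after TCP returns, a conn with conn_on_sqp set must hold at
least two references while one is being dropped. A small sketch of that
invariant using plain asserts over stand-in fields:

    #include <assert.h>

    typedef struct {
            int conn_ref;       /* refcount; under conn_lock in the kernel */
            int conn_on_sqp;    /* nonzero while an mblk is in flight */
    } conn_t;

    static void
    conn_dec_ref(conn_t *connp)
    {
            /*
             * With conn_on_sqp set, one reference belongs to the mblk
             * being processed, so decrementing from 1 means a reference
             * was released too early.
             */
            assert(connp->conn_ref > 0);
            assert(!(connp->conn_ref == 1 && connp->conn_on_sqp));
            connp->conn_ref--;
    }
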
diff --git a/usr/src/uts/common/inet/tcp.h b/usr/src/uts/common/inet/tcp.h
index 54d46f7f1b..7552e53600 100644
--- a/usr/src/uts/common/inet/tcp.h
+++ b/usr/src/uts/common/inet/tcp.h
@@ -287,7 +287,8 @@ typedef struct tcp_s {
tcp_send_discon_ind : 1, /* TLI accept err, send discon ind */
tcp_cork : 1, /* tcp_cork option */
- tcp_pad_to_bit_31 : 18;
+ tcp_tconnind_started : 1, /* conn_ind message is being sent */
+ tcp_pad_to_bit_31 : 17;
uint32_t tcp_if_mtu; /* Outgoing interface MTU. */
@@ -553,8 +554,38 @@ typedef struct tcp_s {
kssl_ent_t tcp_kssl_ent; /* SSL table entry */
kssl_ctx_t tcp_kssl_ctx; /* SSL session */
uint_t tcp_label_len; /* length of cached label */
+
+ /*
+ * tcp_closemp_used is protected by listener's tcp_eager_lock
+ * when used for eagers. When used for a tcp in TIME_WAIT state
+ * or in tcp_close(), it is not protected by any lock as we
+ * do not expect any other thread to use it concurrently.
+ * Since we do allow re-use of tcp_closemp at certain places,
+ * tcp_closemp_used is declared as uint32_t instead of boolean_t
+ * to record any attempt to re-use tcp_closemp while it is still
+ * in use. This would facilitate debugging in non-debug kernels.
+ */
+ uint32_t tcp_closemp_used;
+
+ /*
+ * previous and next eagers in the list of droppable eagers. See
+ * the comments before MAKE_DROPPABLE(). These pointers are
+ * protected by listener's tcp_eager_lock.
+ */
+ struct tcp_s *tcp_eager_prev_drop_q0;
+ struct tcp_s *tcp_eager_next_drop_q0;
+#ifdef DEBUG
+ pc_t tcmp_stk[15];
+#endif
} tcp_t;
+#ifdef DEBUG
+#define TCP_DEBUG_GETPCSTACK(buffer, depth) ((void) getpcstack(buffer, \
+ depth))
+#else
+#define TCP_DEBUG_GETPCSTACK(buffer, depth)
+#endif
+
extern void tcp_free(tcp_t *tcp);
extern void tcp_ddi_init(void);
extern void tcp_ddi_destroy(void);
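
tcp_closemp_used is deliberately a counter rather than a boolean_t: the
first legitimate use sets it to 1, and any attempt to reuse tcp_closemp
while it is still queued (b_prev non-NULL) increments it further, leaving
evidence in a crash dump even on non-debug kernels. Here is that idiom
extracted into a stand-alone sketch, with mblk_t and tcp_t pared down to
just the fields involved:

    #include <stddef.h>
    #include <stdint.h>

    typedef struct {
            void *b_prev;       /* non-NULL while the mblk sits on a queue */
    } mblk_t;

    typedef struct {
            mblk_t   tcp_closemp;           /* preallocated close mblk */
            uint32_t tcp_closemp_used;      /* counts uses; not a flag */
    } tcp_t;

    static void
    mark_closemp_used(tcp_t *tcp)
    {
            if (tcp->tcp_closemp.b_prev == NULL)
                    tcp->tcp_closemp_used = 1;  /* first, legitimate use */
            else
                    tcp->tcp_closemp_used++;    /* reuse while queued: a
                                                   bug, but recorded */
    }
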
diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c
index 458265cfa3..aa80594733 100644
--- a/usr/src/uts/common/inet/tcp/tcp.c
+++ b/usr/src/uts/common/inet/tcp/tcp.c
@@ -995,6 +995,9 @@ static void tcp_ire_ill_check(tcp_t *, ire_t *, ill_t *, boolean_t);
extern void tcp_kssl_input(tcp_t *, mblk_t *);
+void tcp_eager_kill(void *arg, mblk_t *mp, void *arg2);
+void tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2);
+
/*
* Routines related to the TCP_IOC_ABORT_CONN ioctl command.
*
@@ -1415,6 +1418,43 @@ boolean_t tcp_static_maxpsz = B_FALSE;
uint32_t tcp_random_anon_port = 1;
/*
+ * To reach an eager in Q0 that can be dropped when a new SYN request
+ * arrives while Q0 is full, a new doubly linked list is introduced.
+ * This list allows an eager to be selected from Q0 in O(1) time,
+ * avoiding a costly walk through the long list of eagers in Q0 when
+ * tcp_drop_q0() is called. Every member of this new list must also
+ * be a member of Q0.
+ * The list is headed by the listener's tcp_t. When the list is empty,
+ * both of the listener's pointers, tcp_eager_next_drop_q0 and
+ * tcp_eager_prev_drop_q0, point back at the listener's tcp_t itself.
+ *
+ * Given an eager in Q0 and a listener, MAKE_DROPPABLE() puts the eager
+ * on the list and MAKE_UNDROPPABLE() takes it off. Neither macro
+ * affects the eager's membership in Q0.
+ */
+
+#define MAKE_DROPPABLE(listener, eager) \
+ if ((eager)->tcp_eager_next_drop_q0 == NULL) { \
+ (listener)->tcp_eager_next_drop_q0->tcp_eager_prev_drop_q0\
+ = (eager); \
+ (eager)->tcp_eager_prev_drop_q0 = (listener); \
+ (eager)->tcp_eager_next_drop_q0 = \
+ (listener)->tcp_eager_next_drop_q0; \
+ (listener)->tcp_eager_next_drop_q0 = (eager); \
+ }
+
+#define MAKE_UNDROPPABLE(eager) \
+ if ((eager)->tcp_eager_next_drop_q0 != NULL) { \
+ (eager)->tcp_eager_next_drop_q0->tcp_eager_prev_drop_q0 \
+ = (eager)->tcp_eager_prev_drop_q0; \
+ (eager)->tcp_eager_prev_drop_q0->tcp_eager_next_drop_q0 \
+ = (eager)->tcp_eager_next_drop_q0; \
+ (eager)->tcp_eager_prev_drop_q0 = NULL; \
+ (eager)->tcp_eager_next_drop_q0 = NULL; \
+ }
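
The two macros above implement an intrusive, circular, doubly linked list
headed by the listener: new entries go in at the head, so the oldest
droppable eager is always at the listener's tcp_eager_prev_drop_q0, which
is exactly the element tcp_drop_q0() picks. The same structure as a
compilable userspace model, with tcp_t pared down to just the two link
fields:

    #include <stddef.h>

    typedef struct tcp_s {
            struct tcp_s *tcp_eager_next_drop_q0;
            struct tcp_s *tcp_eager_prev_drop_q0;
    } tcp_t;

    static void
    drop_list_init(tcp_t *listener)
    {
            /* An empty list points back at the listener itself. */
            listener->tcp_eager_next_drop_q0 = listener;
            listener->tcp_eager_prev_drop_q0 = listener;
    }

    static void
    make_droppable(tcp_t *listener, tcp_t *eager)
    {
            if (eager->tcp_eager_next_drop_q0 != NULL)
                    return;             /* already on the list */
            listener->tcp_eager_next_drop_q0->tcp_eager_prev_drop_q0 =
                eager;
            eager->tcp_eager_prev_drop_q0 = listener;
            eager->tcp_eager_next_drop_q0 =
                listener->tcp_eager_next_drop_q0;
            listener->tcp_eager_next_drop_q0 = eager;
    }

    static void
    make_undroppable(tcp_t *eager)
    {
            if (eager->tcp_eager_next_drop_q0 == NULL)
                    return;             /* not on the list */
            eager->tcp_eager_next_drop_q0->tcp_eager_prev_drop_q0 =
                eager->tcp_eager_prev_drop_q0;
            eager->tcp_eager_prev_drop_q0->tcp_eager_next_drop_q0 =
                eager->tcp_eager_next_drop_q0;
            eager->tcp_eager_prev_drop_q0 = NULL;
            eager->tcp_eager_next_drop_q0 = NULL;
    }

The NULL guards make both operations idempotent, which is why callers such
as tcp_eager_unlink() can apply MAKE_UNDROPPABLE() unconditionally.
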
+
+/*
* If tcp_drop_ack_unsent_cnt is greater than 0, when TCP receives more
* than tcp_drop_ack_unsent_cnt number of ACKs which acknowledge unsent
* data, TCP will not respond with an ACK. RFC 793 requires that
@@ -1535,8 +1575,11 @@ tcp_set_ws_value(tcp_t *tcp)
/*
* Remove a connection from the list of detached TIME_WAIT connections.
+ * Returns B_FALSE if the connection has already been removed from
+ * the list by an earlier call to tcp_time_wait_remove(); otherwise
+ * it returns B_TRUE.
*/
-static void
+static boolean_t
tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait)
{
boolean_t locked = B_FALSE;
@@ -1553,7 +1596,7 @@ tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait)
ASSERT(tcp->tcp_time_wait_prev == NULL);
if (locked)
mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
- return;
+ return (B_FALSE);
}
ASSERT(TCP_IS_DETACHED(tcp));
ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
@@ -1587,6 +1630,7 @@ tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait)
if (locked)
mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
+ return (B_TRUE);
}
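
tcp_time_wait_remove() now reports whether this caller actually unlinked
the connection, so two racing removers cannot both proceed as if they had.
A sketch of that idempotent-removal contract on a generic doubly linked
list; the fields are simplified stand-ins for the tcp_time_wait_* members,
and head-pointer bookkeeping is omitted for brevity:

    #include <stdbool.h>
    #include <stddef.h>

    typedef struct twe {
            struct twe *tw_next;
            struct twe *tw_prev;
            bool        tw_on_list;     /* cleared exactly once on removal */
    } twe_t;

    static bool
    tw_remove(twe_t *e)
    {
            if (!e->tw_on_list)
                    return (false);     /* a racing caller got here first */
            if (e->tw_prev != NULL)
                    e->tw_prev->tw_next = e->tw_next;
            if (e->tw_next != NULL)
                    e->tw_next->tw_prev = e->tw_prev;
            e->tw_next = NULL;
            e->tw_prev = NULL;
            e->tw_on_list = false;
            return (true);
    }
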
/*
@@ -1770,6 +1814,7 @@ tcp_time_wait_collector(void *arg)
mblk_t *mp;
conn_t *connp;
kmutex_t *lock;
+ boolean_t removed;
squeue_t *sqp = (squeue_t *)arg;
tcp_squeue_priv_t *tcp_time_wait =
@@ -1803,7 +1848,8 @@ tcp_time_wait_collector(void *arg)
break;
}
- tcp_time_wait_remove(tcp, tcp_time_wait);
+ removed = tcp_time_wait_remove(tcp, tcp_time_wait);
+ ASSERT(removed);
connp = tcp->tcp_connp;
ASSERT(connp->conn_fanout != NULL);
@@ -1875,8 +1921,21 @@ tcp_time_wait_collector(void *arg)
/*
* We can reuse the closemp here since conn has
* detached (otherwise we wouldn't even be in
- * time_wait list).
+ * time_wait list). tcp_closemp_used can safely
+ * be changed without taking a lock as no other
+ * thread can concurrently access it at this
+ * point in the connection lifecycle. We
+ * increment tcp_closemp_used to record any
+ * attempt to reuse tcp_closemp while it is
+ * still in use.
*/
+
+ if (tcp->tcp_closemp.b_prev == NULL)
+ tcp->tcp_closemp_used = 1;
+ else
+ tcp->tcp_closemp_used++;
+
+ TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
mp = &tcp->tcp_closemp;
squeue_fill(connp->conn_sqp, mp,
tcp_timewait_output, connp,
@@ -1890,8 +1949,21 @@ tcp_time_wait_collector(void *arg)
/*
* We can reuse the closemp here since conn has
* detached (otherwise we wouldn't even be in
- * time_wait list).
+ * time_wait list). tcp_closemp_used can safely
+ * be changed without taking a lock as no other
+ * thread can concurrently access it at this
+ * point in the connection lifecycle. We
+ * increment tcp_closemp_used to record any
+ * attempt to reuse tcp_closemp while it is
+ * still in use.
*/
+
+ if (tcp->tcp_closemp.b_prev == NULL)
+ tcp->tcp_closemp_used = 1;
+ else
+ tcp->tcp_closemp_used++;
+
+ TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
mp = &tcp->tcp_closemp;
squeue_fill(connp->conn_sqp, mp,
tcp_timewait_output, connp, 0);
@@ -2306,6 +2378,10 @@ tcp_accept(tcp_t *listener, mblk_t *mp)
tcp->tcp_eager_next_q0 = NULL;
tcp->tcp_conn_def_q0 = B_FALSE;
+ /* Make sure the tcp isn't in the list of droppables */
+ ASSERT(tcp->tcp_eager_next_drop_q0 == NULL &&
+ tcp->tcp_eager_prev_drop_q0 == NULL);
+
/*
* Insert at end of the queue because sockfs sends
* down T_CONN_RES in chronological order. Leaving
@@ -3407,6 +3483,8 @@ do_bind:
tcp->tcp_state = TCPS_LISTEN;
/* Initialize the chain. Don't need the eager_lock */
tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp;
+ tcp->tcp_eager_next_drop_q0 = tcp;
+ tcp->tcp_eager_prev_drop_q0 = tcp;
tcp->tcp_second_ctimer_threshold =
tcp_ip_abort_linterval;
}
@@ -3744,6 +3822,24 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
}
/*
+ * tcp_clean_death / tcp_close_detached must not be called more than once
+ * on a tcp. Thus every function that potentially calls tcp_clean_death
+ * must check for the tcp state before calling tcp_clean_death.
+ * E.g. tcp_input, tcp_rput_data, tcp_eager_kill, tcp_clean_death_wrapper,
+ * tcp_timer_handler, all check for the tcp state.
+ */
+/* ARGSUSED */
+void
+tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2)
+{
+ tcp_t *tcp = ((conn_t *)arg)->conn_tcp;
+
+ freemsg(mp);
+ if (tcp->tcp_state > TCPS_BOUND)
+ (void) tcp_clean_death(((conn_t *)arg)->conn_tcp, ETIMEDOUT, 5);
+}
+
+/*
* We are dying for some reason. Try to do it gracefully. (May be called
* as writer.)
*
@@ -3794,7 +3890,7 @@ tcp_clean_death(tcp_t *tcp, int err, uint8_t tag)
* RST and will send a DISCON_IND to the application.
*/
tcp_closei_local(tcp);
- if (tcp->tcp_conn.tcp_eager_conn_ind != NULL) {
+ if (!tcp->tcp_tconnind_started) {
CONN_DEC_REF(tcp->tcp_connp);
} else {
tcp->tcp_state = TCPS_BOUND;
@@ -3975,6 +4071,23 @@ tcp_close(queue_t *q, int flags)
tcp->tcp_closeflags = (uint8_t)flags;
ASSERT(connp->conn_ref >= 3);
+ /*
+ * tcp_closemp_used is used below without the protection of a lock,
+ * as we don't expect anyone else to use it concurrently at this
+ * point (it would be a major defect if they did). We nevertheless
+ * increment tcp_closemp_used to record any attempt to reuse
+ * tcp_closemp while it is still in use, which helps debugging.
+ */
+
+ if (mp->b_prev == NULL) {
+ tcp->tcp_closemp_used = 1;
+ } else {
+ tcp->tcp_closemp_used++;
+ ASSERT(mp->b_prev == NULL);
+ }
+
+ TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
+
(*tcp_squeue_close_proc)(connp->conn_sqp, mp,
tcp_close_output, connp, SQTAG_IP_TCP_CLOSE);
@@ -4401,16 +4514,16 @@ tcp_closei_local(tcp_t *tcp)
tcp_t *listener = tcp->tcp_listener;
mutex_enter(&listener->tcp_eager_lock);
/*
- * tcp_eager_conn_ind == NULL means that the
+ * tcp_tconnind_started == B_TRUE means that the
* conn_ind has already gone to listener. At
* this point, eager will be closed but we
* leave it in listeners eager list so that
* if listener decides to close without doing
* accept, we can clean this up. In tcp_wput_accept
- * we take case of the case of accept on closed
+ * we take care of the case of accept on closed
* eager.
*/
- if (tcp->tcp_conn.tcp_eager_conn_ind != NULL) {
+ if (!tcp->tcp_tconnind_started) {
tcp_eager_unlink(tcp);
mutex_exit(&listener->tcp_eager_lock);
/*
@@ -4449,7 +4562,7 @@ tcp_closei_local(tcp_t *tcp)
* tcp_time_wait_remove for the refcnt checks to work correctly.
*/
if (tcp->tcp_state == TCPS_TIME_WAIT)
- tcp_time_wait_remove(tcp, NULL);
+ (void) tcp_time_wait_remove(tcp, NULL);
CL_INET_DISCONNECT(tcp);
ipcl_hash_remove(connp);
@@ -4666,8 +4779,9 @@ tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph, mblk_t *idmp,
/*
* Defense for the SYN attack -
- * 1. When q0 is full, drop from the tail (tcp_eager_prev_q0) the oldest
- * one that doesn't have the dontdrop bit set.
+ * 1. When q0 is full, drop from the tail (tcp_eager_prev_drop_q0) the oldest
+ * one from the list of droppable eagers. This list is a subset of q0.
+ * See the comments before the definition of MAKE_DROPPABLE().
* 2. Don't drop a SYN request before its first timeout. This gives every
* request at least until the first timeout to complete its 3-way handshake.
* 3. Maintain tcp_syn_rcvd_timeout as an accurate count of how many
@@ -4682,26 +4796,29 @@ static boolean_t
tcp_drop_q0(tcp_t *tcp)
{
tcp_t *eager;
+ mblk_t *mp;
ASSERT(MUTEX_HELD(&tcp->tcp_eager_lock));
ASSERT(tcp->tcp_eager_next_q0 != tcp->tcp_eager_prev_q0);
- /*
- * New one is added after next_q0 so prev_q0 points to the oldest
- * Also do not drop any established connections that are deferred on
- * q0 due to q being full
- */
- eager = tcp->tcp_eager_prev_q0;
- while (eager->tcp_dontdrop || eager->tcp_conn_def_q0) {
- eager = eager->tcp_eager_prev_q0;
- if (eager == tcp) {
- eager = tcp->tcp_eager_prev_q0;
- break;
- }
+ /* Pick oldest eager from the list of droppable eagers */
+ eager = tcp->tcp_eager_prev_drop_q0;
+
+ /* If the list is empty, return B_FALSE */
+ if (eager == tcp) {
+ return (B_FALSE);
}
- if (eager->tcp_syn_rcvd_timeout == 0)
+
+ /* If allocated, the mp will be freed in tcp_clean_death_wrapper() */
+ if ((mp = allocb(0, BPRI_HI)) == NULL)
return (B_FALSE);
+ /*
+ * Take this eager out from the list of droppable eagers since we are
+ * going to drop it.
+ */
+ MAKE_UNDROPPABLE(eager);
+
if (tcp->tcp_debug) {
(void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE,
"tcp_drop_q0: listen half-open queue (max=%d) overflow"
@@ -4712,19 +4829,14 @@ tcp_drop_q0(tcp_t *tcp)
BUMP_MIB(&tcp_mib, tcpHalfOpenDrop);
- /*
- * need to do refhold here because the selected eager could
- * be removed by someone else if we release the eager lock.
- */
+ /* Put a reference on the conn as we are enqueueing it on the squeue */
CONN_INC_REF(eager->tcp_connp);
- mutex_exit(&tcp->tcp_eager_lock);
/* Mark the IRE created for this SYN request temporary */
tcp_ip_ire_mark_advice(eager);
- (void) tcp_clean_death(eager, ETIMEDOUT, 5);
- CONN_DEC_REF(eager->tcp_connp);
+ squeue_fill(eager->tcp_connp->conn_sqp, mp,
+ tcp_clean_death_wrapper, eager->tcp_connp, SQTAG_TCP_DROP_Q0);
- mutex_enter(&tcp->tcp_eager_lock);
return (B_TRUE);
}
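
The structural change in tcp_drop_q0() is that the eager is no longer
killed synchronously (which required dropping tcp_eager_lock); instead an
mblk is allocated and tcp_clean_death_wrapper() is queued on the eager's
own squeue, so the lock is held across the whole operation. The toy queue
below models only that defer-to-a-serial-queue idea; it is not the kernel
squeue API, and the real squeue_fill() signature differs.

    #include <stdlib.h>

    typedef struct work {
            void        (*wk_func)(void *); /* e.g. the clean-death wrapper */
            void         *wk_arg;           /* its conn_t in the real code */
            struct work  *wk_next;
    } work_t;

    typedef struct {
            work_t  *sq_head;
            work_t **sq_tail;
    } toy_squeue_t;

    static void
    toy_squeue_init(toy_squeue_t *sqp)
    {
            sqp->sq_head = NULL;
            sqp->sq_tail = &sqp->sq_head;
    }

    /*
     * Enqueue a callback for the squeue worker to run later, one entry
     * at a time; the caller may keep holding its own locks, which is
     * the point of the tcp_drop_q0() change.
     */
    static int
    toy_squeue_fill(toy_squeue_t *sqp, void (*func)(void *), void *arg)
    {
            work_t *wk = malloc(sizeof (*wk));

            if (wk == NULL)
                    return (-1);        /* mirrors the allocb() failure */
            wk->wk_func = func;
            wk->wk_arg = arg;
            wk->wk_next = NULL;
            *sqp->sq_tail = wk;
            sqp->sq_tail = &wk->wk_next;
            return (0);
    }
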
@@ -4976,6 +5088,7 @@ tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp,
}
ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL);
+ ASSERT(!tcp->tcp_tconnind_started);
/*
* If the SYN contains a credential, it's a loopback packet; attach
* the credential to the TPI message.
@@ -5097,6 +5210,7 @@ tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha,
tcp_opt_reverse(tcp, ipha);
ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL);
+ ASSERT(!tcp->tcp_tconnind_started);
/*
* If the SYN contains a credential, it's a loopback packet; attach
@@ -5924,6 +6038,7 @@ error1:
* treated as a new connection or dealt with
* a TH_RST if a connection already exists.
*/
+ CONN_DEC_REF(econnp);
freemsg(mp);
} else {
squeue_fill(econnp->conn_sqp, mp, tcp_input,
@@ -5985,7 +6100,23 @@ tcp_conn_request_unbound(void *arg, mblk_t *mp, void *arg2)
* can't execute. If they are processed after we have
* changed the squeue, they are sent back to the
* correct squeue down below.
+ * But a listener close can race with the processing
+ * of an incoming SYN. If the SYN processing changes
+ * the squeue, a listener close that is waiting to
+ * enter the squeue would operate on the wrong squeue.
+ * Hence we don't change the squeue here unless the
+ * refcount is exactly the minimum refcount of 4,
+ * accounted for as: 1 each for TCP and IP, 1 for
+ * being in the classifier hash, and 1 for the mblk
+ * being processed.
*/
+
+ if (connp->conn_ref != 4 ||
+ connp->conn_tcp->tcp_state != TCPS_LISTEN) {
+ mutex_exit(&connp->conn_lock);
+ mutex_exit(&connp->conn_fanout->connf_lock);
+ goto done;
+ }
if (connp->conn_sqp != new_sqp) {
while (connp->conn_sqp != new_sqp)
(void) casptr(&connp->conn_sqp, sqp, new_sqp);
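
The new guard only migrates the conn to the interrupt CPU's squeue when
the refcount is exactly the documented minimum of 4, which guarantees no
listener close is already waiting to enter the old squeue. A sketch of
the predicate, with the constant spelled out; the fields and the
TCPS_LISTEN value are simplified stand-ins:

    #include <stdbool.h>

    enum { TCPS_LISTEN = 1 };   /* stand-in value, not the kernel's */

    typedef struct {
            int conn_ref;       /* under conn_lock in the kernel */
            int tcp_state;
    } conn_t;

    static bool
    can_migrate_squeue(const conn_t *connp)
    {
            /*
             * 4 == 1 (TCP) + 1 (IP) + 1 (classifier hash) +
             * 1 (the mblk currently being processed).
             */
            return (connp->conn_ref == 4 &&
                connp->tcp_state == TCPS_LISTEN);
    }
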
@@ -6961,7 +7092,7 @@ tcp_eager_kill(void *arg, mblk_t *mp, void *arg2)
if (listener != NULL) {
mutex_enter(&listener->tcp_eager_lock);
tcp_eager_unlink(eager);
- if (eager->tcp_conn.tcp_eager_conn_ind == NULL) {
+ if (eager->tcp_tconnind_started) {
/*
* The eager has sent a conn_ind up to the
* listener but listener decides to close
@@ -6999,6 +7130,13 @@ tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum)
return (B_FALSE);
}
} while (eager->tcp_conn_req_seqnum != seqnum);
+
+ if (eager->tcp_closemp_used > 0) {
+ mutex_exit(&listener->tcp_eager_lock);
+ return (B_TRUE);
+ }
+ eager->tcp_closemp_used = 1;
+ TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
CONN_INC_REF(eager->tcp_connp);
mutex_exit(&listener->tcp_eager_lock);
mp = &eager->tcp_closemp;
@@ -7024,11 +7162,15 @@ tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only)
TCP_STAT(tcp_eager_blowoff_q);
eager = listener->tcp_eager_next_q;
while (eager != NULL) {
- CONN_INC_REF(eager->tcp_connp);
- mp = &eager->tcp_closemp;
- squeue_fill(eager->tcp_connp->conn_sqp, mp,
- tcp_eager_kill, eager->tcp_connp,
- SQTAG_TCP_EAGER_CLEANUP);
+ if (eager->tcp_closemp_used == 0) {
+ eager->tcp_closemp_used = 1;
+ TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
+ CONN_INC_REF(eager->tcp_connp);
+ mp = &eager->tcp_closemp;
+ squeue_fill(eager->tcp_connp->conn_sqp, mp,
+ tcp_eager_kill, eager->tcp_connp,
+ SQTAG_TCP_EAGER_CLEANUP);
+ }
eager = eager->tcp_eager_next_q;
}
}
@@ -7036,11 +7178,15 @@ tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only)
TCP_STAT(tcp_eager_blowoff_q0);
eager = listener->tcp_eager_next_q0;
while (eager != listener) {
- CONN_INC_REF(eager->tcp_connp);
- mp = &eager->tcp_closemp;
- squeue_fill(eager->tcp_connp->conn_sqp, mp,
- tcp_eager_kill, eager->tcp_connp,
- SQTAG_TCP_EAGER_CLEANUP_Q0);
+ if (eager->tcp_closemp_used == 0) {
+ eager->tcp_closemp_used = 1;
+ TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15);
+ CONN_INC_REF(eager->tcp_connp);
+ mp = &eager->tcp_closemp;
+ squeue_fill(eager->tcp_connp->conn_sqp, mp,
+ tcp_eager_kill, eager->tcp_connp,
+ SQTAG_TCP_EAGER_CLEANUP_Q0);
+ }
eager = eager->tcp_eager_next_q0;
}
}
@@ -7071,6 +7217,12 @@ tcp_eager_unlink(tcp_t *tcp)
tcp->tcp_eager_next_q0 = NULL;
tcp->tcp_eager_prev_q0 = NULL;
+ /*
+ * Take the eager out if it is on the list of droppable
+ * eagers.
+ */
+ MAKE_UNDROPPABLE(tcp);
+
if (tcp->tcp_syn_rcvd_timeout != 0) {
/* we have timed out before */
ASSERT(listener->tcp_syn_rcvd_timeout > 0);
@@ -7637,6 +7789,8 @@ tcp_reinit(tcp_t *tcp)
*/
tcp->tcp_state = TCPS_LISTEN;
tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp;
+ tcp->tcp_eager_next_drop_q0 = tcp;
+ tcp->tcp_eager_prev_drop_q0 = tcp;
tcp->tcp_connp->conn_recv = tcp_conn_request;
if (tcp->tcp_family == AF_INET6) {
ASSERT(tcp->tcp_connp->conn_af_isv6);
@@ -7883,6 +8037,10 @@ tcp_reinit_values(tcp)
tcp->tcp_eager_next_q0 == tcp->tcp_eager_prev_q0);
ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL);
+ ASSERT((tcp->tcp_eager_next_drop_q0 == NULL &&
+ tcp->tcp_eager_prev_drop_q0 == NULL) ||
+ tcp->tcp_eager_next_drop_q0 == tcp->tcp_eager_prev_drop_q0);
+
tcp->tcp_client_errno = 0;
DONTCARE(tcp->tcp_sum); /* Init in tcp_init_values */
@@ -7966,6 +8124,7 @@ tcp_reinit_values(tcp)
tcp->tcp_in_ack_unsent = 0;
tcp->tcp_cork = B_FALSE;
+ tcp->tcp_tconnind_started = B_FALSE;
PRESERVE(tcp->tcp_squeue_bytes);
@@ -7973,6 +8132,13 @@ tcp_reinit_values(tcp)
ASSERT(!tcp->tcp_kssl_pending);
PRESERVE(tcp->tcp_kssl_ent);
+ tcp->tcp_closemp_used = 0;
+
+#ifdef DEBUG
+ DONTCARE(tcp->tcmp_stk[0]);
+#endif
+
#undef DONTCARE
#undef PRESERVE
}
@@ -12237,6 +12403,13 @@ tcp_send_conn_ind(void *arg, mblk_t *mp, void *arg2)
* processing
*/
mutex_enter(&listener->tcp_eager_lock);
+
+ /*
+ * Take the eager out of the list of droppable eagers, if it is
+ * on it, as we are here because its 3-way handshake is over.
+ */
+ MAKE_UNDROPPABLE(tcp);
+
if (listener->tcp_conn_req_cnt_q < listener->tcp_conn_req_max) {
tcp_t *tail;
@@ -13656,6 +13829,7 @@ process_ack:
tcp_t *listener = tcp->tcp_listener;
mblk_t *mp = tcp->tcp_conn.tcp_eager_conn_ind;
+ tcp->tcp_tconnind_started = B_TRUE;
tcp->tcp_conn.tcp_eager_conn_ind = NULL;
/*
* We are here means eager is fine but it can
@@ -16561,6 +16735,14 @@ tcp_timer(void *arg)
tcp->tcp_syn_rcvd_timeout = 1;
mutex_enter(&listener->tcp_eager_lock);
listener->tcp_syn_rcvd_timeout++;
+ if (!tcp->tcp_dontdrop && tcp->tcp_closemp_used == 0) {
+ /*
+ * Make this eager available for drop if we
+ * need to drop one to accomodate a new
+ * incoming SYN request.
+ */
+ MAKE_DROPPABLE(listener, tcp);
+ }
if (!listener->tcp_syn_defense &&
(listener->tcp_syn_rcvd_timeout >
(tcp_conn_req_max_q0 >> 2)) &&
@@ -16577,6 +16759,24 @@ tcp_timer(void *arg)
KM_NOSLEEP);
}
mutex_exit(&listener->tcp_eager_lock);
+ } else if (listener != NULL) {
+ mutex_enter(&listener->tcp_eager_lock);
+ tcp->tcp_syn_rcvd_timeout++;
+ if (tcp->tcp_syn_rcvd_timeout > 1 &&
+ tcp->tcp_closemp_used == 0) {
+ /*
+ * This is our second timeout. Put the tcp in
+ * the list of droppable eagers to allow it to
+ * be dropped, if needed. We don't check
+ * whether tcp_dontdrop is set, so as to
+ * protect ourselves from a SYN attack in
+ * which a remote host spoofs a good IP
+ * source address and continues to hold
+ * resources for too long.
+ */
+ MAKE_DROPPABLE(listener, tcp);
+ }
+ mutex_exit(&listener->tcp_eager_lock);
}
}
/* FALLTHRU */
@@ -17907,6 +18107,10 @@ tcp_wput_accept(queue_t *q, mblk_t *mp)
tcp->tcp_eager_next_q0 = NULL;
tcp->tcp_conn_def_q0 = B_FALSE;
+ /* Make sure the tcp isn't in the list of droppables */
+ ASSERT(tcp->tcp_eager_next_drop_q0 == NULL &&
+ tcp->tcp_eager_prev_drop_q0 == NULL);
+
/*
* Insert at end of the queue because sockfs sends
* down T_CONN_RES in chronological order. Leaving
@@ -24635,9 +24839,11 @@ tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq,
* just restart the timer.
*/
if (TCP_IS_DETACHED(tcp)) {
- tcp_time_wait_remove(tcp, NULL);
- tcp_time_wait_append(tcp);
- TCP_DBGSTAT(tcp_rput_time_wait);
+ if (tcp_time_wait_remove(tcp, NULL) ==
+ B_TRUE) {
+ tcp_time_wait_append(tcp);
+ TCP_DBGSTAT(tcp_rput_time_wait);
+ }
} else {
ASSERT(tcp != NULL);
TCP_TIMER_RESTART(tcp,
diff --git a/usr/src/uts/common/inet/tcp/tcp_kssl.c b/usr/src/uts/common/inet/tcp/tcp_kssl.c
index 74022fcef6..3e1982453a 100644
--- a/usr/src/uts/common/inet/tcp/tcp_kssl.c
+++ b/usr/src/uts/common/inet/tcp/tcp_kssl.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -194,6 +193,7 @@ tcp_kssl_input(tcp_t *tcp, mblk_t *mp)
* and all conn ref cnt comments apply.
*/
tcp->tcp_conn.tcp_eager_conn_ind = NULL;
+ tcp->tcp_tconnind_started = B_TRUE;
CONN_INC_REF(connp);
@@ -274,6 +274,7 @@ no_can_do:
* and all conn ref cnt comments apply.
*/
tcp->tcp_conn.tcp_eager_conn_ind = NULL;
+ tcp->tcp_tconnind_started = B_TRUE;
CONN_INC_REF(connp);