diff options
Diffstat (limited to 'usr/src')
-rw-r--r-- | usr/src/uts/common/inet/ip.h | 1 | ||||
-rw-r--r-- | usr/src/uts/common/inet/ip/ip.c | 20 | ||||
-rw-r--r-- | usr/src/uts/common/inet/ip/ip6.c | 21 | ||||
-rw-r--r-- | usr/src/uts/common/inet/ip/ip_if.c | 7 | ||||
-rw-r--r-- | usr/src/uts/common/inet/ip/ip_squeue.c | 22 | ||||
-rw-r--r-- | usr/src/uts/common/inet/ip/ipclassifier.c | 3 | ||||
-rw-r--r-- | usr/src/uts/common/inet/ipclassifier.h | 13 | ||||
-rw-r--r-- | usr/src/uts/common/inet/tcp.h | 33 | ||||
-rw-r--r-- | usr/src/uts/common/inet/tcp/tcp.c | 300 | ||||
-rw-r--r-- | usr/src/uts/common/inet/tcp/tcp_kssl.c | 9 |
10 files changed, 360 insertions, 69 deletions
diff --git a/usr/src/uts/common/inet/ip.h b/usr/src/uts/common/inet/ip.h index 0c853ef4cd..4e588a67c7 100644 --- a/usr/src/uts/common/inet/ip.h +++ b/usr/src/uts/common/inet/ip.h @@ -3513,6 +3513,7 @@ typedef void (*ipsq_func_t)(ipsq_t *, queue_t *, mblk_t *, void *); #define SQTAG_UDP_WPUT 34 #define SQTAG_UDP_OUTPUT 35 #define SQTAG_TCP_KSSL_INPUT 36 +#define SQTAG_TCP_DROP_Q0 37 #endif /* _KERNEL */ diff --git a/usr/src/uts/common/inet/ip/ip.c b/usr/src/uts/common/inet/ip/ip.c index 7896e52d23..080c503d27 100644 --- a/usr/src/uts/common/inet/ip/ip.c +++ b/usr/src/uts/common/inet/ip/ip.c @@ -4469,8 +4469,6 @@ ip_bind_v4(queue_t *q, mblk_t *mp, conn_t *connp) sin = (sin_t *)ucp; error = ip_bind_laddr(connp, mp, sin->sin_addr.s_addr, sin->sin_port, ire_requested, ipsec_policy_set, B_TRUE); - if (protocol == IPPROTO_TCP) - connp->conn_recv = tcp_conn_request; break; case sizeof (ipa_conn_t): @@ -4482,8 +4480,6 @@ ip_bind_v4(queue_t *q, mblk_t *mp, conn_t *connp) error = ip_bind_connected(connp, mp, &ac->ac_laddr, ac->ac_lport, ac->ac_faddr, ac->ac_fport, ire_requested, ipsec_policy_set, B_TRUE, B_TRUE); - if (protocol == IPPROTO_TCP) - connp->conn_recv = tcp_input; break; case sizeof (ipa_conn_x_t): @@ -4496,8 +4492,6 @@ ip_bind_v4(queue_t *q, mblk_t *mp, conn_t *connp) acx->acx_conn.ac_lport, acx->acx_conn.ac_faddr, acx->acx_conn.ac_fport, ire_requested, ipsec_policy_set, B_TRUE, (acx->acx_flags & ACX_VERIFY_DST) != 0); - if (protocol == IPPROTO_TCP) - connp->conn_recv = tcp_input; break; } if (error == EINPROGRESS) @@ -4691,7 +4685,14 @@ ip_bind_laddr(conn_t *connp, mblk_t *mp, ipaddr_t src_addr, uint16_t lport, connp->conn_fport = 0; /* * Do we need to add a check to reject Multicast packets + * + * We need to make sure that the conn_recv is set to a non-null + * value before we insert the conn into the classifier table. + * This is to avoid a race with an incoming packet which does an + * ipcl_classify(). 
*/ + if (*mp->b_wptr == IPPROTO_TCP) + connp->conn_recv = tcp_conn_request; error = ipcl_bind_insert(connp, *mp->b_wptr, src_addr, lport); } @@ -4707,6 +4708,8 @@ ip_bind_laddr(conn_t *connp, mblk_t *mp, ipaddr_t src_addr, uint16_t lport, /* Falls through to bad_addr */ } } + } else if (connp->conn_ulp == IPPROTO_TCP) { + connp->conn_recv = tcp_input; } bad_addr: if (error != 0) { @@ -5139,7 +5142,12 @@ ip_bind_connected(conn_t *connp, mblk_t *mp, ipaddr_t *src_addrp, /* * The addresses have been verified. Time to insert in * the correct fanout list. + * We need to make sure that the conn_recv is set to a non-null + * value before we insert into the classifier table to avoid a + * race with an incoming packet which does an ipcl_classify(). */ + if (protocol == IPPROTO_TCP) + connp->conn_recv = tcp_input; error = ipcl_conn_insert(connp, protocol, src_addr, dst_addr, connp->conn_ports); } diff --git a/usr/src/uts/common/inet/ip/ip6.c b/usr/src/uts/common/inet/ip/ip6.c index 913bcf307d..9e57397740 100644 --- a/usr/src/uts/common/inet/ip/ip6.c +++ b/usr/src/uts/common/inet/ip/ip6.c @@ -2329,8 +2329,6 @@ ip_bind_v6(queue_t *q, mblk_t *mp, conn_t *connp, ip6_pkt_t *ipp) goto bad_addr; connp->conn_pkt_isv6 = B_TRUE; } - if (protocol == IPPROTO_TCP) - connp->conn_recv = tcp_conn_request; } else { /* * Bind to local and remote address. Local might be @@ -2377,8 +2375,6 @@ ip_bind_v6(queue_t *q, mblk_t *mp, conn_t *connp, ip6_pkt_t *ipp) goto bad_addr; connp->conn_pkt_isv6 = B_TRUE; } - if (protocol == IPPROTO_TCP) - connp->conn_recv = tcp_input; } /* Update qinfo if v4/v6 changed */ if ((orig_pkt_isv6 != connp->conn_pkt_isv6) && @@ -2571,6 +2567,15 @@ ip_bind_laddr_v6(conn_t *connp, mblk_t *mp, const in6_addr_t *v6src, connp->conn_remv6 = ipv6_all_zeros; connp->conn_lport = lport; connp->conn_fport = 0; + + /* + * We need to make sure that the conn_recv is set to a non-null + * value before we insert the conn_t into the classifier table. 
+ * This is to avoid a race with an incoming packet which does + * an ipcl_classify(). + */ + if (*mp->b_wptr == IPPROTO_TCP) + connp->conn_recv = tcp_conn_request; error = ipcl_bind_insert_v6(connp, *mp->b_wptr, v6src, lport); } if (error == 0) { @@ -2585,6 +2590,8 @@ ip_bind_laddr_v6(conn_t *connp, mblk_t *mp, const in6_addr_t *v6src, goto bad_addr; } } + } else if (connp->conn_ulp == IPPROTO_TCP) { + connp->conn_recv = tcp_input; } bad_addr: if (error != 0) { @@ -3048,7 +3055,13 @@ ip_bind_connected_v6(conn_t *connp, mblk_t *mp, in6_addr_t *v6src, /* * The addresses have been verified. Time to insert in * the correct fanout list. + * We need to make sure that the conn_recv is set to a non-null + * value before we insert the conn_t into the classifier table. + * This is to avoid a race with an incoming packet which does + * an ipcl_classify(). */ + if (protocol == IPPROTO_TCP) + connp->conn_recv = tcp_input; error = ipcl_conn_insert_v6(connp, protocol, v6src, v6dst, connp->conn_ports, IPCL_IS_TCP(connp) ? 
connp->conn_tcp->tcp_bound_if : 0); diff --git a/usr/src/uts/common/inet/ip/ip_if.c b/usr/src/uts/common/inet/ip/ip_if.c index 8aea44af15..af650a20cc 100644 --- a/usr/src/uts/common/inet/ip/ip_if.c +++ b/usr/src/uts/common/inet/ip/ip_if.c @@ -7843,6 +7843,13 @@ ipsq_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring) * Use the preallocated ill_unbind_conn for this purpose */ connp = ill->ill_dls_capab->ill_unbind_conn; + + ASSERT(!connp->conn_tcp->tcp_closemp.b_prev); + TCP_DEBUG_GETPCSTACK(connp->conn_tcp->tcmp_stk, 15); + if (connp->conn_tcp->tcp_closemp.b_prev == NULL) + connp->conn_tcp->tcp_closemp_used = 1; + else + connp->conn_tcp->tcp_closemp_used++; mp = &connp->conn_tcp->tcp_closemp; CONN_INC_REF(connp); squeue_enter(sqp, mp, ip_squeue_clean, connp, NULL); diff --git a/usr/src/uts/common/inet/ip/ip_squeue.c b/usr/src/uts/common/inet/ip/ip_squeue.c index 5c1da7c964..d11098ec0e 100644 --- a/usr/src/uts/common/inet/ip/ip_squeue.c +++ b/usr/src/uts/common/inet/ip/ip_squeue.c @@ -785,15 +785,29 @@ ip_squeue_get(ill_rx_ring_t *ill_rx_ring) KM_NOSLEEP); mutex_enter(&ill->ill_lock); - if (!interrupt || ill_rx_ring->rr_ring_state != ILL_RING_INUSE || - taskq_arg == NULL) { + /* + * Check sqp under the lock again for atomicity. Possible race with + * a previously scheduled ip_squeue_get -> ip_squeue_extend. + * Do the ring to squeue binding only if we are in interrupt context + * AND the ring is not already bound AND there is no one else trying + * the bind already. + */ + sqp = ill_rx_ring->rr_sqp; + if (sqp != NULL || !interrupt || + ill_rx_ring->rr_ring_state != ILL_RING_INUSE || taskq_arg == NULL) { /* - * Do the ring to squeue binding only if we are in interrupt - * context and there is no one else trying the bind already. + * Note that the ring might get bound once we drop the lock + * below, if a previous request is in progress i.e. if the ring + * state is ILL_RING_INPROC. 
The incoming connection on whose + * behalf we are currently here might get a suboptimal squeue + * via the call to IP_SQUEUE_GET below, but there is no + * correctness issue. */ mutex_exit(&ill->ill_lock); if (taskq_arg != NULL) kmem_free(taskq_arg, sizeof (ip_taskq_arg_t)); + if (sqp != NULL) + return (sqp); return (IP_SQUEUE_GET(lbolt)); } diff --git a/usr/src/uts/common/inet/ip/ipclassifier.c b/usr/src/uts/common/inet/ip/ipclassifier.c index 4507496c97..f90b7a844c 100644 --- a/usr/src/uts/common/inet/ip/ipclassifier.c +++ b/usr/src/uts/common/inet/ip/ipclassifier.c @@ -1155,6 +1155,9 @@ ipcl_conn_insert(conn_t *connp, uint8_t protocol, ipaddr_t src, IPCL_HASH_REMOVE(connp); mutex_enter(&connfp->connf_lock); } + + ASSERT(connp->conn_recv != NULL); + IPCL_HASH_INSERT_CONNECTED_LOCKED(connfp, connp); mutex_exit(&connfp->connf_lock); break; diff --git a/usr/src/uts/common/inet/ipclassifier.h b/usr/src/uts/common/inet/ipclassifier.h index 9b959f219a..03d510fdbf 100644 --- a/usr/src/uts/common/inet/ipclassifier.h +++ b/usr/src/uts/common/inet/ipclassifier.h @@ -295,8 +295,8 @@ struct connf_s { }; #define CONN_INC_REF(connp) { \ - DTRACE_PROBE1(conn__inc__ref, conn_t *, connp); \ mutex_enter(&(connp)->conn_lock); \ + DTRACE_PROBE1(conn__inc__ref, conn_t *, connp); \ ASSERT(conn_trace_ref(connp)); \ (connp)->conn_ref++; \ ASSERT((connp)->conn_ref != 0); \ @@ -312,9 +312,16 @@ struct connf_s { } #define CONN_DEC_REF(connp) { \ - DTRACE_PROBE1(conn__dec__ref, conn_t *, connp); \ mutex_enter(&(connp)->conn_lock); \ - if ((connp)->conn_ref <= 0) \ + DTRACE_PROBE1(conn__dec__ref, conn_t *, connp); \ + /* \ + * The squeue framework always does a CONN_DEC_REF after return \ + * from TCP. Hence the refcnt must be at least 2 if conn_on_sqp \ + * is B_TRUE and conn_ref is being decremented. This is to \ + * account for the mblk being currently processed. 
\ + */ \ + if ((connp)->conn_ref <= 0 || \ + ((connp)->conn_ref == 1 && (connp)->conn_on_sqp)) \ cmn_err(CE_PANIC, "CONN_DEC_REF: connp(%p) has ref " \ "= %d\n", (void *)(connp), (connp)->conn_ref); \ ASSERT(conn_untrace_ref(connp)); \ diff --git a/usr/src/uts/common/inet/tcp.h b/usr/src/uts/common/inet/tcp.h index 54d46f7f1b..7552e53600 100644 --- a/usr/src/uts/common/inet/tcp.h +++ b/usr/src/uts/common/inet/tcp.h @@ -287,7 +287,8 @@ typedef struct tcp_s { tcp_send_discon_ind : 1, /* TLI accept err, send discon ind */ tcp_cork : 1, /* tcp_cork option */ - tcp_pad_to_bit_31 : 18; + tcp_tconnind_started : 1, /* conn_ind message is being sent */ + tcp_pad_to_bit_31 : 17; uint32_t tcp_if_mtu; /* Outgoing interface MTU. */ @@ -553,8 +554,38 @@ typedef struct tcp_s { kssl_ent_t tcp_kssl_ent; /* SSL table entry */ kssl_ctx_t tcp_kssl_ctx; /* SSL session */ uint_t tcp_label_len; /* length of cached label */ + + /* + * tcp_closemp_used is protected by listener's tcp_eager_lock + * when used for eagers. When used for a tcp in TIME_WAIT state + * or in tcp_close(), it is not protected by any lock as we + * do not expect any other thread to use it concurrently. + * Since we do allow re-use of tcp_closemp at certain places, + * tcp_closemp_used is declared as uint32_t instead of boolean_t + * to record any attempt to re-use tcp_closemp while it is still + * in use. This would facilitate debugging in non-debug kernels. + */ + uint32_t tcp_closemp_used; + + /* + * previous and next eagers in the list of droppable eagers. See + * the comments before MAKE_DROPPABLE(). These pointers are + * protected by listener's tcp_eager_lock. 
+ */ + struct tcp_s *tcp_eager_prev_drop_q0; + struct tcp_s *tcp_eager_next_drop_q0; +#ifdef DEBUG + pc_t tcmp_stk[15]; +#endif } tcp_t; +#ifdef DEBUG +#define TCP_DEBUG_GETPCSTACK(buffer, depth) ((void) getpcstack(buffer, \ + depth)) +#else +#define TCP_DEBUG_GETPCSTACK(buffer, depth) +#endif + extern void tcp_free(tcp_t *tcp); extern void tcp_ddi_init(void); extern void tcp_ddi_destroy(void); diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c index 458265cfa3..aa80594733 100644 --- a/usr/src/uts/common/inet/tcp/tcp.c +++ b/usr/src/uts/common/inet/tcp/tcp.c @@ -995,6 +995,9 @@ static void tcp_ire_ill_check(tcp_t *, ire_t *, ill_t *, boolean_t); extern void tcp_kssl_input(tcp_t *, mblk_t *); +void tcp_eager_kill(void *arg, mblk_t *mp, void *arg2); +void tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2); + /* * Routines related to the TCP_IOC_ABORT_CONN ioctl command. * @@ -1415,6 +1418,43 @@ boolean_t tcp_static_maxpsz = B_FALSE; uint32_t tcp_random_anon_port = 1; /* + * To reach to an eager in Q0 which can be dropped due to an incoming + * new SYN request when Q0 is full, a new doubly linked list is + * introduced. This list allows to select an eager from Q0 in O(1) time. + * This is needed to avoid spending too much time walking through the + * long list of eagers in Q0 when tcp_drop_q0() is called. Each member of + * this new list has to be a member of Q0. + * This list is headed by listener's tcp_t. When the list is empty, + * both the pointers - tcp_eager_next_drop_q0 and tcp_eager_prev_drop_q0, + * of listener's tcp_t point to listener's tcp_t itself. + * + * Given an eager in Q0 and a listener, MAKE_DROPPABLE() puts the eager + * in the list. MAKE_UNDROPPABLE() takes the eager out of the list. + * These macros do not affect the eager's membership to Q0. 
+ */ + + +#define MAKE_DROPPABLE(listener, eager) \ + if ((eager)->tcp_eager_next_drop_q0 == NULL) { \ + (listener)->tcp_eager_next_drop_q0->tcp_eager_prev_drop_q0\ + = (eager); \ + (eager)->tcp_eager_prev_drop_q0 = (listener); \ + (eager)->tcp_eager_next_drop_q0 = \ + (listener)->tcp_eager_next_drop_q0; \ + (listener)->tcp_eager_next_drop_q0 = (eager); \ + } + +#define MAKE_UNDROPPABLE(eager) \ + if ((eager)->tcp_eager_next_drop_q0 != NULL) { \ + (eager)->tcp_eager_next_drop_q0->tcp_eager_prev_drop_q0 \ + = (eager)->tcp_eager_prev_drop_q0; \ + (eager)->tcp_eager_prev_drop_q0->tcp_eager_next_drop_q0 \ + = (eager)->tcp_eager_next_drop_q0; \ + (eager)->tcp_eager_prev_drop_q0 = NULL; \ + (eager)->tcp_eager_next_drop_q0 = NULL; \ + } + +/* * If tcp_drop_ack_unsent_cnt is greater than 0, when TCP receives more * than tcp_drop_ack_unsent_cnt number of ACKs which acknowledge unsent * data, TCP will not respond with an ACK. RFC 793 requires that @@ -1535,8 +1575,11 @@ tcp_set_ws_value(tcp_t *tcp) /* * Remove a connection from the list of detached TIME_WAIT connections. + * It returns B_FALSE if it can't remove the connection from the list + * as the connection has already been removed from the list due to an + * earlier call to tcp_time_wait_remove(); otherwise it returns B_TRUE. 
*/ -static void +static boolean_t tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait) { boolean_t locked = B_FALSE; @@ -1553,7 +1596,7 @@ tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait) ASSERT(tcp->tcp_time_wait_prev == NULL); if (locked) mutex_exit(&tcp_time_wait->tcp_time_wait_lock); - return; + return (B_FALSE); } ASSERT(TCP_IS_DETACHED(tcp)); ASSERT(tcp->tcp_state == TCPS_TIME_WAIT); @@ -1587,6 +1630,7 @@ tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait) if (locked) mutex_exit(&tcp_time_wait->tcp_time_wait_lock); + return (B_TRUE); } /* @@ -1770,6 +1814,7 @@ tcp_time_wait_collector(void *arg) mblk_t *mp; conn_t *connp; kmutex_t *lock; + boolean_t removed; squeue_t *sqp = (squeue_t *)arg; tcp_squeue_priv_t *tcp_time_wait = @@ -1803,7 +1848,8 @@ tcp_time_wait_collector(void *arg) break; } - tcp_time_wait_remove(tcp, tcp_time_wait); + removed = tcp_time_wait_remove(tcp, tcp_time_wait); + ASSERT(removed); connp = tcp->tcp_connp; ASSERT(connp->conn_fanout != NULL); @@ -1875,8 +1921,21 @@ tcp_time_wait_collector(void *arg) /* * We can reuse the closemp here since conn has * detached (otherwise we wouldn't even be in - * time_wait list). + * time_wait list). tcp_closemp_used can safely + * be changed without taking a lock as no other + * thread can concurrently access it at this + * point in the connection lifecycle. We + * increment tcp_closemp_used to record any + * attempt to reuse tcp_closemp while it is + * still in use. */ + + if (tcp->tcp_closemp.b_prev == NULL) + tcp->tcp_closemp_used = 1; + else + tcp->tcp_closemp_used++; + + TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15); mp = &tcp->tcp_closemp; squeue_fill(connp->conn_sqp, mp, tcp_timewait_output, connp, @@ -1890,8 +1949,21 @@ tcp_time_wait_collector(void *arg) /* * We can reuse the closemp here since conn has * detached (otherwise we wouldn't even be in - * time_wait list). + * time_wait list). 
tcp_closemp_used can safely + * be changed without taking a lock as no other + * thread can concurrently access it at this + * point in the connection lifecycle. We + * increment tcp_closemp_used to record any + * attempt to reuse tcp_closemp while it is + * still in use. */ + + if (tcp->tcp_closemp.b_prev == NULL) + tcp->tcp_closemp_used = 1; + else + tcp->tcp_closemp_used++; + + TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15); mp = &tcp->tcp_closemp; squeue_fill(connp->conn_sqp, mp, tcp_timewait_output, connp, 0); @@ -2306,6 +2378,10 @@ tcp_accept(tcp_t *listener, mblk_t *mp) tcp->tcp_eager_next_q0 = NULL; tcp->tcp_conn_def_q0 = B_FALSE; + /* Make sure the tcp isn't in the list of droppables */ + ASSERT(tcp->tcp_eager_next_drop_q0 == NULL && + tcp->tcp_eager_prev_drop_q0 == NULL); + /* * Insert at end of the queue because sockfs sends * down T_CONN_RES in chronological order. Leaving @@ -3407,6 +3483,8 @@ do_bind: tcp->tcp_state = TCPS_LISTEN; /* Initialize the chain. Don't need the eager_lock */ tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp; + tcp->tcp_eager_next_drop_q0 = tcp; + tcp->tcp_eager_prev_drop_q0 = tcp; tcp->tcp_second_ctimer_threshold = tcp_ip_abort_linterval; } @@ -3744,6 +3822,24 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, } /* + * tcp_clean_death / tcp_close_detached must not be called more than once + * on a tcp. Thus every function that potentially calls tcp_clean_death + * must check for the tcp state before calling tcp_clean_death. + * Eg. tcp_input, tcp_rput_data, tcp_eager_kill, tcp_clean_death_wrapper, + * tcp_timer_handler, all check for the tcp state. + */ +/* ARGSUSED */ +void +tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2) +{ + tcp_t *tcp = ((conn_t *)arg)->conn_tcp; + + freemsg(mp); + if (tcp->tcp_state > TCPS_BOUND) + (void) tcp_clean_death(((conn_t *)arg)->conn_tcp, ETIMEDOUT, 5); +} + +/* * We are dying for some reason. Try to do it gracefully. (May be called * as writer.) 
* @@ -3794,7 +3890,7 @@ tcp_clean_death(tcp_t *tcp, int err, uint8_t tag) * RST and will send a DISCON_IND to the application. */ tcp_closei_local(tcp); - if (tcp->tcp_conn.tcp_eager_conn_ind != NULL) { + if (!tcp->tcp_tconnind_started) { CONN_DEC_REF(tcp->tcp_connp); } else { tcp->tcp_state = TCPS_BOUND; @@ -3975,6 +4071,23 @@ tcp_close(queue_t *q, int flags) tcp->tcp_closeflags = (uint8_t)flags; ASSERT(connp->conn_ref >= 3); + /* + * tcp_closemp_used is used below without any protection of a lock + * as we don't expect any one else to use it concurrently at this + * point otherwise it would be a major defect, though we do + * increment tcp_closemp_used to record any attempt to reuse + * tcp_closemp while it is still in use. This would help debugging. + */ + + if (mp->b_prev == NULL) { + tcp->tcp_closemp_used = 1; + } else { + tcp->tcp_closemp_used++; + ASSERT(mp->b_prev == NULL); + } + + TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15); + (*tcp_squeue_close_proc)(connp->conn_sqp, mp, tcp_close_output, connp, SQTAG_IP_TCP_CLOSE); @@ -4401,16 +4514,16 @@ tcp_closei_local(tcp_t *tcp) tcp_t *listener = tcp->tcp_listener; mutex_enter(&listener->tcp_eager_lock); /* - * tcp_eager_conn_ind == NULL means that the + * tcp_tconnind_started == B_TRUE means that the * conn_ind has already gone to listener. At * this point, eager will be closed but we * leave it in listeners eager list so that * if listener decides to close without doing * accept, we can clean this up. In tcp_wput_accept - * we take case of the case of accept on closed + * we take care of the case of accept on closed * eager. */ - if (tcp->tcp_conn.tcp_eager_conn_ind != NULL) { + if (!tcp->tcp_tconnind_started) { tcp_eager_unlink(tcp); mutex_exit(&listener->tcp_eager_lock); /* @@ -4449,7 +4562,7 @@ tcp_closei_local(tcp_t *tcp) * tcp_time_wait_remove for the refcnt checks to work correctly. 
*/ if (tcp->tcp_state == TCPS_TIME_WAIT) - tcp_time_wait_remove(tcp, NULL); + (void) tcp_time_wait_remove(tcp, NULL); CL_INET_DISCONNECT(tcp); ipcl_hash_remove(connp); @@ -4666,8 +4779,9 @@ tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph, mblk_t *idmp, /* * Defense for the SYN attack - - * 1. When q0 is full, drop from the tail (tcp_eager_prev_q0) the oldest - * one that doesn't have the dontdrop bit set. + * 1. When q0 is full, drop from the tail (tcp_eager_prev_drop_q0) the oldest + * one from the list of droppable eagers. This list is a subset of q0. + * see comments before the definition of MAKE_DROPPABLE(). * 2. Don't drop a SYN request before its first timeout. This gives every * request at least til the first timeout to complete its 3-way handshake. * 3. Maintain tcp_syn_rcvd_timeout as an accurate count of how many @@ -4682,26 +4796,29 @@ static boolean_t tcp_drop_q0(tcp_t *tcp) { tcp_t *eager; + mblk_t *mp; ASSERT(MUTEX_HELD(&tcp->tcp_eager_lock)); ASSERT(tcp->tcp_eager_next_q0 != tcp->tcp_eager_prev_q0); - /* - * New one is added after next_q0 so prev_q0 points to the oldest - * Also do not drop any established connections that are deferred on - * q0 due to q being full - */ - eager = tcp->tcp_eager_prev_q0; - while (eager->tcp_dontdrop || eager->tcp_conn_def_q0) { - eager = eager->tcp_eager_prev_q0; - if (eager == tcp) { - eager = tcp->tcp_eager_prev_q0; - break; - } + /* Pick oldest eager from the list of droppable eagers */ + eager = tcp->tcp_eager_prev_drop_q0; + + /* If list is empty. return B_FALSE */ + if (eager == tcp) { + return (B_FALSE); } - if (eager->tcp_syn_rcvd_timeout == 0) + + /* If allocated, the mp will be freed in tcp_clean_death_wrapper() */ + if ((mp = allocb(0, BPRI_HI)) == NULL) return (B_FALSE); + /* + * Take this eager out from the list of droppable eagers since we are + * going to drop it. 
+ */ + MAKE_UNDROPPABLE(eager); + if (tcp->tcp_debug) { (void) strlog(TCP_MOD_ID, 0, 3, SL_TRACE, "tcp_drop_q0: listen half-open queue (max=%d) overflow" @@ -4712,19 +4829,14 @@ tcp_drop_q0(tcp_t *tcp) BUMP_MIB(&tcp_mib, tcpHalfOpenDrop); - /* - * need to do refhold here because the selected eager could - * be removed by someone else if we release the eager lock. - */ + /* Put a reference on the conn as we are enqueueing it in the squeue */ CONN_INC_REF(eager->tcp_connp); - mutex_exit(&tcp->tcp_eager_lock); /* Mark the IRE created for this SYN request temporary */ tcp_ip_ire_mark_advice(eager); - (void) tcp_clean_death(eager, ETIMEDOUT, 5); - CONN_DEC_REF(eager->tcp_connp); + squeue_fill(eager->tcp_connp->conn_sqp, mp, + tcp_clean_death_wrapper, eager->tcp_connp, SQTAG_TCP_DROP_Q0); - mutex_enter(&tcp->tcp_eager_lock); return (B_TRUE); } @@ -4976,6 +5088,7 @@ tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp, } ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL); + ASSERT(!tcp->tcp_tconnind_started); /* * If the SYN contains a credential, it's a loopback packet; attach * the credential to the TPI message. @@ -5097,6 +5210,7 @@ tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha, tcp_opt_reverse(tcp, ipha); ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL); + ASSERT(!tcp->tcp_tconnind_started); /* * If the SYN contains a credential, it's a loopback packet; attach @@ -5924,6 +6038,7 @@ error1: * treated as a new connection or dealth with * a TH_RST if a connection already exists. */ + CONN_DEC_REF(econnp); freemsg(mp); } else { squeue_fill(econnp->conn_sqp, mp, tcp_input, @@ -5985,7 +6100,23 @@ tcp_conn_request_unbound(void *arg, mblk_t *mp, void *arg2) * can't execute. If they are processed after we have * changed the squeue, they are sent back to the * correct squeue down below. + * But a listener close can race with processing of + * incoming SYN. 
If incoming SYN processing changes + * the squeue then the listener close which is waiting + * to enter the squeue would operate on the wrong + * squeue. Hence we don't change the squeue here unless + * the refcount is exactly the minimum refcount. The + * minimum refcount of 4 is counted as - 1 each for + * TCP and IP, 1 for being in the classifier hash, and + * 1 for the mblk being processed. */ + + if (connp->conn_ref != 4 || + connp->conn_tcp->tcp_state != TCPS_LISTEN) { + mutex_exit(&connp->conn_lock); + mutex_exit(&connp->conn_fanout->connf_lock); + goto done; + } if (connp->conn_sqp != new_sqp) { while (connp->conn_sqp != new_sqp) (void) casptr(&connp->conn_sqp, sqp, new_sqp); @@ -6961,7 +7092,7 @@ tcp_eager_kill(void *arg, mblk_t *mp, void *arg2) if (listener != NULL) { mutex_enter(&listener->tcp_eager_lock); tcp_eager_unlink(eager); - if (eager->tcp_conn.tcp_eager_conn_ind == NULL) { + if (eager->tcp_tconnind_started) { /* * The eager has sent a conn_ind up to the * listener but listener decides to close @@ -6999,6 +7130,13 @@ tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum) return (B_FALSE); } } while (eager->tcp_conn_req_seqnum != seqnum); + + if (eager->tcp_closemp_used > 0) { + mutex_exit(&listener->tcp_eager_lock); + return (B_TRUE); + } + eager->tcp_closemp_used = 1; + TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15); CONN_INC_REF(eager->tcp_connp); mutex_exit(&listener->tcp_eager_lock); mp = &eager->tcp_closemp; @@ -7024,11 +7162,15 @@ tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only) TCP_STAT(tcp_eager_blowoff_q); eager = listener->tcp_eager_next_q; while (eager != NULL) { - CONN_INC_REF(eager->tcp_connp); - mp = &eager->tcp_closemp; - squeue_fill(eager->tcp_connp->conn_sqp, mp, - tcp_eager_kill, eager->tcp_connp, - SQTAG_TCP_EAGER_CLEANUP); + if (eager->tcp_closemp_used == 0) { + eager->tcp_closemp_used = 1; + TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15); + CONN_INC_REF(eager->tcp_connp); + mp = &eager->tcp_closemp; + 
squeue_fill(eager->tcp_connp->conn_sqp, mp, + tcp_eager_kill, eager->tcp_connp, + SQTAG_TCP_EAGER_CLEANUP); + } eager = eager->tcp_eager_next_q; } } @@ -7036,11 +7178,15 @@ tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only) TCP_STAT(tcp_eager_blowoff_q0); eager = listener->tcp_eager_next_q0; while (eager != listener) { - CONN_INC_REF(eager->tcp_connp); - mp = &eager->tcp_closemp; - squeue_fill(eager->tcp_connp->conn_sqp, mp, - tcp_eager_kill, eager->tcp_connp, - SQTAG_TCP_EAGER_CLEANUP_Q0); + if (eager->tcp_closemp_used == 0) { + eager->tcp_closemp_used = 1; + TCP_DEBUG_GETPCSTACK(eager->tcmp_stk, 15); + CONN_INC_REF(eager->tcp_connp); + mp = &eager->tcp_closemp; + squeue_fill(eager->tcp_connp->conn_sqp, mp, + tcp_eager_kill, eager->tcp_connp, + SQTAG_TCP_EAGER_CLEANUP_Q0); + } eager = eager->tcp_eager_next_q0; } } @@ -7071,6 +7217,12 @@ tcp_eager_unlink(tcp_t *tcp) tcp->tcp_eager_next_q0 = NULL; tcp->tcp_eager_prev_q0 = NULL; + /* + * Take the eager out, if it is in the list of droppable + * eagers. 
+ */ + MAKE_UNDROPPABLE(tcp); + if (tcp->tcp_syn_rcvd_timeout != 0) { /* we have timed out before */ ASSERT(listener->tcp_syn_rcvd_timeout > 0); @@ -7637,6 +7789,8 @@ tcp_reinit(tcp_t *tcp) */ tcp->tcp_state = TCPS_LISTEN; tcp->tcp_eager_next_q0 = tcp->tcp_eager_prev_q0 = tcp; + tcp->tcp_eager_next_drop_q0 = tcp; + tcp->tcp_eager_prev_drop_q0 = tcp; tcp->tcp_connp->conn_recv = tcp_conn_request; if (tcp->tcp_family == AF_INET6) { ASSERT(tcp->tcp_connp->conn_af_isv6); @@ -7883,6 +8037,10 @@ tcp_reinit_values(tcp) tcp->tcp_eager_next_q0 == tcp->tcp_eager_prev_q0); ASSERT(tcp->tcp_conn.tcp_eager_conn_ind == NULL); + ASSERT((tcp->tcp_eager_next_drop_q0 == NULL && + tcp->tcp_eager_prev_drop_q0 == NULL) || + tcp->tcp_eager_next_drop_q0 == tcp->tcp_eager_prev_drop_q0); + tcp->tcp_client_errno = 0; DONTCARE(tcp->tcp_sum); /* Init in tcp_init_values */ @@ -7966,6 +8124,7 @@ tcp_reinit_values(tcp) tcp->tcp_in_ack_unsent = 0; tcp->tcp_cork = B_FALSE; + tcp->tcp_tconnind_started = B_FALSE; PRESERVE(tcp->tcp_squeue_bytes); @@ -7973,6 +8132,13 @@ tcp_reinit_values(tcp) ASSERT(!tcp->tcp_kssl_pending); PRESERVE(tcp->tcp_kssl_ent); + tcp->tcp_closemp_used = 0; + +#ifdef DEBUG + DONTCARE(tcp->tcmp_stk[0]); +#endif + + #undef DONTCARE #undef PRESERVE } @@ -12237,6 +12403,13 @@ tcp_send_conn_ind(void *arg, mblk_t *mp, void *arg2) * processing */ mutex_enter(&listener->tcp_eager_lock); + + /* + * Take the eager out, if it is in the list of droppable eagers + * as we are here because the 3W handshake is over. 
+ */ + MAKE_UNDROPPABLE(tcp); + if (listener->tcp_conn_req_cnt_q < listener->tcp_conn_req_max) { tcp_t *tail; @@ -13656,6 +13829,7 @@ process_ack: tcp_t *listener = tcp->tcp_listener; mblk_t *mp = tcp->tcp_conn.tcp_eager_conn_ind; + tcp->tcp_tconnind_started = B_TRUE; tcp->tcp_conn.tcp_eager_conn_ind = NULL; /* * We are here means eager is fine but it can @@ -16561,6 +16735,14 @@ tcp_timer(void *arg) tcp->tcp_syn_rcvd_timeout = 1; mutex_enter(&listener->tcp_eager_lock); listener->tcp_syn_rcvd_timeout++; + if (!tcp->tcp_dontdrop && tcp->tcp_closemp_used == 0) { + /* + * Make this eager available for drop if we + * need to drop one to accommodate a new + * incoming SYN request. + */ + MAKE_DROPPABLE(listener, tcp); + } if (!listener->tcp_syn_defense && (listener->tcp_syn_rcvd_timeout > (tcp_conn_req_max_q0 >> 2)) && @@ -16577,6 +16759,24 @@ tcp_timer(void *arg) KM_NOSLEEP); } mutex_exit(&listener->tcp_eager_lock); + } else if (listener != NULL) { + mutex_enter(&listener->tcp_eager_lock); + tcp->tcp_syn_rcvd_timeout++; + if (tcp->tcp_syn_rcvd_timeout > 1 && + tcp->tcp_closemp_used == 0) { + /* + * This is our second timeout. Put the tcp in + * the list of droppable eagers to allow it to + * be dropped, if needed. We don't check + * whether tcp_dontdrop is set or not to + * protect ourselves from a SYN attack where a + * remote host can spoof itself as one of the + * good IP sources and continue to hold + * resources too long. + */ + MAKE_DROPPABLE(listener, tcp); + } + mutex_exit(&listener->tcp_eager_lock); } } /* FALLTHRU */ @@ -17907,6 +18107,10 @@ tcp_wput_accept(queue_t *q, mblk_t *mp) tcp->tcp_eager_next_q0 = NULL; tcp->tcp_conn_def_q0 = B_FALSE; + /* Make sure the tcp isn't in the list of droppables */ + ASSERT(tcp->tcp_eager_next_drop_q0 == NULL && + tcp->tcp_eager_prev_drop_q0 == NULL); + /* * Insert at end of the queue because sockfs sends * down T_CONN_RES in chronological order. 
Leaving @@ -24635,9 +24839,11 @@ tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp, uint32_t seg_seq, * just restart the timer. */ if (TCP_IS_DETACHED(tcp)) { - tcp_time_wait_remove(tcp, NULL); - tcp_time_wait_append(tcp); - TCP_DBGSTAT(tcp_rput_time_wait); + if (tcp_time_wait_remove(tcp, NULL) == + B_TRUE) { + tcp_time_wait_append(tcp); + TCP_DBGSTAT(tcp_rput_time_wait); + } } else { ASSERT(tcp != NULL); TCP_TIMER_RESTART(tcp, diff --git a/usr/src/uts/common/inet/tcp/tcp_kssl.c b/usr/src/uts/common/inet/tcp/tcp_kssl.c index 74022fcef6..3e1982453a 100644 --- a/usr/src/uts/common/inet/tcp/tcp_kssl.c +++ b/usr/src/uts/common/inet/tcp/tcp_kssl.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -194,6 +193,7 @@ tcp_kssl_input(tcp_t *tcp, mblk_t *mp) * and all conn ref cnt comments apply. */ tcp->tcp_conn.tcp_eager_conn_ind = NULL; + tcp->tcp_tconnind_started = B_TRUE; CONN_INC_REF(connp); @@ -274,6 +274,7 @@ no_can_do: * and all conn ref cnt comments apply. */ tcp->tcp_conn.tcp_eager_conn_ind = NULL; + tcp->tcp_tconnind_started = B_TRUE; CONN_INC_REF(connp); |