Diffstat (limited to 'usr/src')
-rw-r--r--	usr/src/uts/common/inet/tcp/tcp_input.c	|   4
-rw-r--r--	usr/src/uts/common/inet/tcp/tcp_time_wait.c	| 668
-rw-r--r--	usr/src/uts/common/inet/tcp/tcp_tunables.c	|   4
-rw-r--r--	usr/src/uts/common/inet/tcp_impl.h	|  79
4 files changed, 435 insertions, 320 deletions
diff --git a/usr/src/uts/common/inet/tcp/tcp_input.c b/usr/src/uts/common/inet/tcp/tcp_input.c
index cf8e0c6bd4..7cfdb9a4a2 100644
--- a/usr/src/uts/common/inet/tcp/tcp_input.c
+++ b/usr/src/uts/common/inet/tcp/tcp_input.c
@@ -22,7 +22,7 @@
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2011 Joyent, Inc. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
  * Copyright (c) 2014 by Delphix. All rights reserved.
  */
@@ -99,7 +99,7 @@
  * tcps_time_wait_interval since the period before upper layer closes the
  * connection is not accounted for when tcp_time_wait_append() is called.
  *
- * If uppser layer has closed the connection, call tcp_time_wait_append()
+ * If upper layer has closed the connection, call tcp_time_wait_append()
  * directly.
  *
  */
diff --git a/usr/src/uts/common/inet/tcp/tcp_time_wait.c b/usr/src/uts/common/inet/tcp/tcp_time_wait.c
index b470934da0..6600296b18 100644
--- a/usr/src/uts/common/inet/tcp/tcp_time_wait.c
+++ b/usr/src/uts/common/inet/tcp/tcp_time_wait.c
@@ -21,7 +21,7 @@
 /*
  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, Joyent Inc. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
  */
 
 /*
@@ -41,13 +41,13 @@
 #include <inet/tcp_impl.h>
 #include <inet/tcp_cluster.h>
 
-static void tcp_timewait_close(void *, mblk_t *, void *, ip_recv_attr_t *);
+static void tcp_time_wait_purge(tcp_t *, tcp_squeue_priv_t *);
+
+#define	TW_BUCKET(t)	\
+	(((t) / MSEC_TO_TICK(TCP_TIME_WAIT_DELAY)) % TCP_TIME_WAIT_BUCKETS)
+
+#define	TW_BUCKET_NEXT(b)	(((b) + 1) % TCP_TIME_WAIT_BUCKETS)
 
-/*
- * TCP_TIME_WAIT_DELAY governs how often the time_wait_collector runs.
- * Running it every 5 seconds seems to give the best results.
- */
-#define TCP_TIME_WAIT_DELAY ((hrtime_t)5 * NANOSEC)
 
 /*
  * Remove a connection from the list of detached TIME_WAIT connections.
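[Editor's illustration] The TW_BUCKET macros added above are the heart of the new timing wheel: an expiry timestamp is mapped to a slot by dividing it by the bucket width and wrapping modulo the bucket count. A minimal userland sketch of the same arithmetic follows; the 100 Hz tick rate (so MSEC_TO_TICK of 5 seconds becomes 500 ticks) and the BUCKET_TICKS/NUM_BUCKETS names are assumptions of this sketch, not kernel definitions.

#include <stdio.h>

/*
 * Userland model of TW_BUCKET/TW_BUCKET_NEXT, assuming hz = 100 so that
 * a 5-second bucket spans 500 ticks and 121 buckets cover 10 minutes.
 */
#define	BUCKET_TICKS	500
#define	NUM_BUCKETS	121
#define	TW_BUCKET(t)		(((t) / BUCKET_TICKS) % NUM_BUCKETS)
#define	TW_BUCKET_NEXT(b)	(((b) + 1) % NUM_BUCKETS)

int
main(void)
{
	long long expire;

	/* Expiry times one bucket width apart land in adjacent slots. */
	for (expire = 60000; expire <= 61000; expire += BUCKET_TICKS) {
		printf("expire=%lld -> bucket %lld\n", expire,
		    (long long)TW_BUCKET(expire));
	}
	/* The wheel wraps: the slot after the last one is slot 0. */
	printf("next after %d is %d\n", NUM_BUCKETS - 1,
	    TW_BUCKET_NEXT(NUM_BUCKETS - 1));
	return (0);
}

Note how 60000 maps to bucket 120 while 60500 wraps to bucket 0 — the "+ 1" rollover bucket in the sizing is what keeps a full-length interval from colliding with the bucket currently being reaped.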
@@ -56,17 +56,17 @@ static void tcp_timewait_close(void *, mblk_t *, void *, ip_recv_attr_t *);
  * earlier call to tcp_time_wait_remove(); otherwise it returns B_TRUE.
  */
 boolean_t
-tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait)
+tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tsp)
 {
 	boolean_t locked = B_FALSE;
 
-	if (tcp_time_wait == NULL) {
-		tcp_time_wait = *((tcp_squeue_priv_t **)
+	if (tsp == NULL) {
+		tsp = *((tcp_squeue_priv_t **)
 		    squeue_getprivate(tcp->tcp_connp->conn_sqp, SQPRIVATE_TCP));
-		mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
+		mutex_enter(&tsp->tcp_time_wait_lock);
 		locked = B_TRUE;
 	} else {
-		ASSERT(MUTEX_HELD(&tcp_time_wait->tcp_time_wait_lock));
+		ASSERT(MUTEX_HELD(&tsp->tcp_time_wait_lock));
 	}
 
 	/* 0 means that the tcp_t has not been added to the time wait list. */
@@ -74,40 +74,34 @@ tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait)
 		ASSERT(tcp->tcp_time_wait_next == NULL);
 		ASSERT(tcp->tcp_time_wait_prev == NULL);
 		if (locked)
-			mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
+			mutex_exit(&tsp->tcp_time_wait_lock);
 		return (B_FALSE);
 	}
 	ASSERT(TCP_IS_DETACHED(tcp));
 	ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
+	ASSERT(tsp->tcp_time_wait_cnt > 0);
 
-	if (tcp == tcp_time_wait->tcp_time_wait_head) {
-		ASSERT(tcp->tcp_time_wait_prev == NULL);
-		tcp_time_wait->tcp_time_wait_head = tcp->tcp_time_wait_next;
-		if (tcp_time_wait->tcp_time_wait_head != NULL) {
-			tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev =
-			    NULL;
-		} else {
-			tcp_time_wait->tcp_time_wait_tail = NULL;
-		}
-	} else if (tcp == tcp_time_wait->tcp_time_wait_tail) {
-		ASSERT(tcp->tcp_time_wait_next == NULL);
-		tcp_time_wait->tcp_time_wait_tail = tcp->tcp_time_wait_prev;
-		ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL);
-		tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = NULL;
-	} else {
-		ASSERT(tcp->tcp_time_wait_prev->tcp_time_wait_next == tcp);
-		ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == tcp);
-		tcp->tcp_time_wait_prev->tcp_time_wait_next =
-		    tcp->tcp_time_wait_next;
+	if (tcp->tcp_time_wait_next != NULL) {
 		tcp->tcp_time_wait_next->tcp_time_wait_prev =
 		    tcp->tcp_time_wait_prev;
 	}
+	if (tcp->tcp_time_wait_prev != NULL) {
+		tcp->tcp_time_wait_prev->tcp_time_wait_next =
+		    tcp->tcp_time_wait_next;
+	} else {
+		unsigned int bucket;
+
+		bucket = TW_BUCKET(tcp->tcp_time_wait_expire);
+		ASSERT(tsp->tcp_time_wait_bucket[bucket] == tcp);
+		tsp->tcp_time_wait_bucket[bucket] = tcp->tcp_time_wait_next;
+	}
 	tcp->tcp_time_wait_next = NULL;
 	tcp->tcp_time_wait_prev = NULL;
 	tcp->tcp_time_wait_expire = 0;
+	tsp->tcp_time_wait_cnt--;
 
 	if (locked)
-		mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
+		mutex_exit(&tsp->tcp_time_wait_lock);
 
 	return (B_TRUE);
 }
@@ -126,6 +120,7 @@ tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait)
 	((x)->tcp_connp->conn_ipversion == IPV6_VERSION &&	\
 	IN6_IS_ADDR_LOOPBACK(&(x)->tcp_connp->conn_laddr_v6)))
 
+
 /*
  * Add a connection to the list of detached TIME_WAIT connections
  * and set its time to expire.
  */
@@ -135,9 +130,10 @@ tcp_time_wait_append(tcp_t *tcp)
 {
 	tcp_stack_t *tcps = tcp->tcp_tcps;
 	squeue_t *sqp = tcp->tcp_connp->conn_sqp;
-	tcp_squeue_priv_t *tcp_time_wait =
+	tcp_squeue_priv_t *tsp =
 	    *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
-	hrtime_t firetime = 0;
+	int64_t now, schedule;
+	unsigned int bucket;
 
 	tcp_timers_stop(tcp);
@@ -146,6 +142,8 @@ tcp_time_wait_append(tcp_t *tcp)
 	ASSERT(tcp->tcp_ack_tid == 0);
 
 	/* must have happened at the time of detaching the tcp */
+	ASSERT(TCP_IS_DETACHED(tcp));
+	ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
 	ASSERT(tcp->tcp_ptpahn == NULL);
 	ASSERT(tcp->tcp_flow_stopped == 0);
 	ASSERT(tcp->tcp_time_wait_next == NULL);
@@ -153,97 +151,112 @@ tcp_time_wait_append(tcp_t *tcp)
 	ASSERT(tcp->tcp_time_wait_expire == 0);
 	ASSERT(tcp->tcp_listener == NULL);
 
-	tcp->tcp_time_wait_expire = ddi_get_lbolt64();
-	if (IS_LOCAL_HOST(tcp)) {
-		/*
-		 * This is the fastpath for handling localhost connections.
-		 * Since we don't have to worry about packets on the localhost
-		 * showing up after a long network delay, we want to expire
-		 * these quickly so the port range on the localhost doesn't
-		 * get starved by short-running, local apps.
-		 *
-		 * Leave tcp_time_wait_expire at the current time. This
-		 * essentially means the connection is expired now and it will
-		 * clean up the next time tcp_time_wait_collector runs.  We set
-		 * firetime to use a short delay so that if we have to start a
-		 * tcp_time_wait_collector thread below, it runs soon instead
-		 * of after a delay of time_wait_interval.  firetime being set
-		 * to a non-0 value is also our indicator that we should add
-		 * this connection to the head of the time wait list (since we
-		 * are already expired) so that its sure to get cleaned up on
-		 * the next run of tcp_time_wait_collector (which expects the
-		 * entries to appear in time-order and stops when it hits the
-		 * first non-expired entry).
-		 */
-		firetime = TCP_TIME_WAIT_DELAY;
-	} else {
-		/*
-		 * Since tcp_time_wait_expire is lbolt64, it should not wrap
-		 * around in practice.  Hence it cannot be 0.  Note that zero
-		 * means that the tcp_t is not in the TIME_WAIT list.
-		 */
-		tcp->tcp_time_wait_expire += MSEC_TO_TICK(
-		    tcps->tcps_time_wait_interval);
+	TCP_DBGSTAT(tcps, tcp_time_wait);
+	mutex_enter(&tsp->tcp_time_wait_lock);
+
+	/*
+	 * Immediately expire loopback connections.  Since there is no worry
+	 * about packets on the local host showing up after a long network
+	 * delay, this is safe and allows much higher rates of connection churn
+	 * for applications operating locally.
+	 *
+	 * This typically bypasses the tcp_free_list fast path due to squeue
+	 * re-entry for the loopback close operation.
+	 */
+	if (tcp->tcp_loopback) {
+		tcp_time_wait_purge(tcp, tsp);
+		mutex_exit(&tsp->tcp_time_wait_lock);
+		return;
 	}
 
-	ASSERT(TCP_IS_DETACHED(tcp));
-	ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
-	ASSERT(tcp->tcp_time_wait_next == NULL);
-	ASSERT(tcp->tcp_time_wait_prev == NULL);
-	TCP_DBGSTAT(tcps, tcp_time_wait);
+	/*
+	 * In order to reap TIME_WAITs reliably, we should use a source of time
+	 * that is not adjustable by the user.  While it would be more accurate
+	 * to grab this timestamp before (potentially) sleeping on the
+	 * tcp_time_wait_lock, doing so complicates bucket addressing later.
+	 */
+	now = ddi_get_lbolt64();
 
-	mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
-	if (tcp_time_wait->tcp_time_wait_head == NULL) {
-		ASSERT(tcp_time_wait->tcp_time_wait_tail == NULL);
-		tcp_time_wait->tcp_time_wait_head = tcp;
+	/*
+	 * Each squeue uses an arbitrary time offset when scheduling
+	 * expiration timers.  This prevents the bucketing from forcing
+	 * tcp_time_wait_collector to run in lockstep across squeues.
+	 *
+	 * This offset is (re)initialized when a new TIME_WAIT connection is
+	 * added to an squeue which has no connections waiting to expire.
+	 */
+	if (tsp->tcp_time_wait_tid == 0) {
+		ASSERT(tsp->tcp_time_wait_cnt == 0);
+		tsp->tcp_time_wait_offset =
+		    now % MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
+	}
+	now -= tsp->tcp_time_wait_offset;
+
+	/*
+	 * Use the netstack-defined timeout, rounded up to the minimum
+	 * time_wait_collector interval.
+	 */
+	schedule = now + MSEC_TO_TICK(tcps->tcps_time_wait_interval);
+	tcp->tcp_time_wait_expire = schedule;
+
+	/*
+	 * Append the connection into the appropriate bucket.
+	 */
+	bucket = TW_BUCKET(tcp->tcp_time_wait_expire);
+	tcp->tcp_time_wait_next = tsp->tcp_time_wait_bucket[bucket];
+	tsp->tcp_time_wait_bucket[bucket] = tcp;
+	if (tcp->tcp_time_wait_next != NULL) {
+		ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == NULL);
+		tcp->tcp_time_wait_next->tcp_time_wait_prev = tcp;
+	}
+	tsp->tcp_time_wait_cnt++;
+
+	/*
+	 * Round delay up to the nearest bucket boundary.
+	 */
+	schedule += MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
+	schedule -= schedule % MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
+
+	/*
+	 * The newly inserted entry may require a tighter schedule for the
+	 * expiration timer.
+	 */
+	if (schedule < tsp->tcp_time_wait_schedule) {
+		callout_id_t old_tid = tsp->tcp_time_wait_tid;
+
+		tsp->tcp_time_wait_schedule = schedule;
+		tsp->tcp_time_wait_tid =
+		    timeout_generic(CALLOUT_NORMAL,
+		    tcp_time_wait_collector, sqp,
+		    TICK_TO_NSEC(schedule - now),
+		    CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
 		/*
-		 * Even if the list was empty before, there may be a timer
-		 * running since a tcp_t can be removed from the list
-		 * in other places, such as tcp_clean_death().  So check if
-		 * a timer is needed.
-		 */
-		if (tcp_time_wait->tcp_time_wait_tid == 0) {
-			if (firetime == 0)
-				firetime = (hrtime_t)
-				    (tcps->tcps_time_wait_interval + 1) *
-				    MICROSEC;
-
-			tcp_time_wait->tcp_time_wait_tid =
-			    timeout_generic(CALLOUT_NORMAL,
-			    tcp_time_wait_collector, sqp, firetime,
-			    CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
-		}
-		tcp_time_wait->tcp_time_wait_tail = tcp;
-	} else {
-		/*
-		 * The list is not empty, so a timer must be running.  If not,
-		 * tcp_time_wait_collector() must be running on this
-		 * tcp_time_wait list at the same time.
+		 * It is possible for the timer to fire before the untimeout
+		 * action is able to complete.  In that case, the exclusion
+		 * offered by the tcp_time_wait_collector_active flag will
+		 * prevent multiple collector threads from processing records
+		 * simultaneously from the same squeue.
 		 */
-		ASSERT(tcp_time_wait->tcp_time_wait_tid != 0 ||
-		    tcp_time_wait->tcp_time_wait_running);
-		ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL);
-		ASSERT(tcp_time_wait->tcp_time_wait_tail->tcp_state ==
-		    TCPS_TIME_WAIT);
-
-		if (firetime == 0) {
-			/* add at end */
-			tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next =
-			    tcp;
-			tcp->tcp_time_wait_prev =
-			    tcp_time_wait->tcp_time_wait_tail;
-			tcp_time_wait->tcp_time_wait_tail = tcp;
-		} else {
-			/* add at head */
-			tcp->tcp_time_wait_next =
-			    tcp_time_wait->tcp_time_wait_head;
-			tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev =
-			    tcp;
-			tcp_time_wait->tcp_time_wait_head = tcp;
-		}
+		mutex_exit(&tsp->tcp_time_wait_lock);
+		(void) untimeout_default(old_tid, 0);
+		return;
+	}
+
+	/*
+	 * Start a fresh timer if none exists.
+	 */
+	if (tsp->tcp_time_wait_schedule == 0) {
+		ASSERT(tsp->tcp_time_wait_tid == 0);
+
+		tsp->tcp_time_wait_schedule = schedule;
+		tsp->tcp_time_wait_tid =
+		    timeout_generic(CALLOUT_NORMAL,
+		    tcp_time_wait_collector, sqp,
+		    TICK_TO_NSEC(schedule - now),
+		    CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
 	}
-	mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
+	mutex_exit(&tsp->tcp_time_wait_lock);
 }
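[Editor's illustration] The round-up in tcp_time_wait_append above is what guarantees the collector never fires before every connection in its target bucket has expired: a timestamp already on a boundary is pushed a full bucket ahead, never left in place. A small sketch of that arithmetic, with plain tick values standing in for MSEC_TO_TICK(TCP_TIME_WAIT_DELAY) and the helper name round_up_bucket being mine, not the kernel's:

#include <assert.h>
#include <stdio.h>

/*
 * Mirror of the kernel's rounding:
 *	schedule += MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
 *	schedule -= schedule % MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
 * The result is strictly greater than the input, so the timer always
 * lands at or after the expiry of everything in the bucket it sweeps.
 */
static long long
round_up_bucket(long long schedule, long long bucket_ticks)
{
	schedule += bucket_ticks;
	schedule -= schedule % bucket_ticks;
	return (schedule);
}

int
main(void)
{
	assert(round_up_bucket(1201, 500) == 1500);
	assert(round_up_bucket(1500, 500) == 2000);	/* exact boundary */
	printf("rounding checks passed\n");
	return (0);
}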
 
 /*
@@ -278,216 +291,287 @@ tcp_timewait_close(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
 
 	tcp_close_detached(tcp);
 }
 
+
+static void
+tcp_time_wait_purge(tcp_t *tcp, tcp_squeue_priv_t *tsp)
+{
+	mblk_t *mp;
+	conn_t *connp = tcp->tcp_connp;
+	kmutex_t *lock;
+
+	ASSERT(MUTEX_HELD(&tsp->tcp_time_wait_lock));
+	ASSERT(connp->conn_fanout != NULL);
+
+	lock = &connp->conn_fanout->connf_lock;
+
+	/*
+	 * This is essentially a TIME_WAIT reclaim fast path optimization for
+	 * performance where the connection is checked under the fanout lock
+	 * (so that no one else can get access to the conn_t) that the refcnt
+	 * is 2 (one each for TCP and the classifier hash list).  If that is
+	 * the case and clustering callbacks are not enabled, the conn can be
+	 * removed under the fanout lock and avoid clean-up under the squeue.
+	 *
+	 * This optimization is forgone when clustering is enabled since the
+	 * clustering callback must be made before setting the CONDEMNED flag
+	 * and after dropping all locks.
+	 *
+	 * See the comments in tcp_closei_local for additional information
+	 * regarding the refcnt logic.
+	 */
+	if (mutex_tryenter(lock)) {
+		mutex_enter(&connp->conn_lock);
+		if (connp->conn_ref == 2 && cl_inet_disconnect == NULL) {
+			ipcl_hash_remove_locked(connp, connp->conn_fanout);
+			/*
+			 * Set the CONDEMNED flag now itself so that the refcnt
+			 * cannot increase due to any walker.
+			 */
+			connp->conn_state_flags |= CONN_CONDEMNED;
+			mutex_exit(&connp->conn_lock);
+			mutex_exit(lock);
+			if (tsp->tcp_free_list_cnt < tcp_free_list_max_cnt) {
+				/*
+				 * Add to head of tcp_free_list
+				 */
+				tcp_cleanup(tcp);
+				ASSERT(connp->conn_latch == NULL);
+				ASSERT(connp->conn_policy == NULL);
+				ASSERT(tcp->tcp_tcps == NULL);
+				ASSERT(connp->conn_netstack == NULL);
+
+				tcp->tcp_time_wait_next = tsp->tcp_free_list;
+				tcp->tcp_in_free_list = B_TRUE;
+				tsp->tcp_free_list = tcp;
+				tsp->tcp_free_list_cnt++;
+			} else {
+				/*
+				 * Do not add to tcp_free_list
+				 */
+				tcp_bind_hash_remove(tcp);
+				ixa_cleanup(tcp->tcp_connp->conn_ixa);
+				tcp_ipsec_cleanup(tcp);
+				CONN_DEC_REF(tcp->tcp_connp);
+			}
+
+			/*
+			 * With the fast-path complete, we can bail.
+			 */
+			return;
+		} else {
+			/*
+			 * Fall back to slow path.
+			 */
+			CONN_INC_REF_LOCKED(connp);
+			mutex_exit(&connp->conn_lock);
+			mutex_exit(lock);
+		}
+	} else {
+		CONN_INC_REF(connp);
+	}
+
+	/*
+	 * We can reuse the closemp here since conn has detached (otherwise we
+	 * wouldn't even be in time_wait list).  It is safe to change
+	 * tcp_closemp_used without taking a lock as no other thread can
+	 * concurrently access it at this point in the connection lifecycle.
+	 */
+	if (tcp->tcp_closemp.b_prev == NULL) {
+		tcp->tcp_closemp_used = B_TRUE;
+	} else {
+		cmn_err(CE_PANIC,
+		    "tcp_timewait_collector: concurrent use of tcp_closemp: "
+		    "connp %p tcp %p\n", (void *)connp, (void *)tcp);
+	}
+
+	TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
+	mp = &tcp->tcp_closemp;
+	mutex_exit(&tsp->tcp_time_wait_lock);
+	SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_timewait_close, connp, NULL,
+	    SQ_FILL, SQTAG_TCP_TIMEWAIT);
+	mutex_enter(&tsp->tcp_time_wait_lock);
+}
+
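[Editor's illustration] Both tcp_time_wait_purge and tcp_time_wait_remove lean on one invariant of the wheel's doubly linked buckets: an entry whose prev pointer is NULL must be its bucket's head. A self-contained sketch of that insert/unlink discipline; the node_t type, wheel_insert, and wheel_remove names are illustrative stand-ins, not kernel structures:

#include <assert.h>
#include <stddef.h>

/* Illustrative stand-in for a tcp_t parked on the TIME_WAIT wheel. */
typedef struct node {
	struct node *next;
	struct node *prev;
	int bucket;
} node_t;

static node_t *buckets[121];

static void
wheel_insert(node_t *n, int b)
{
	n->bucket = b;
	n->next = buckets[b];
	n->prev = NULL;
	if (n->next != NULL)
		n->next->prev = n;
	buckets[b] = n;
}

/* Mirrors tcp_time_wait_remove: a NULL prev means "head of bucket". */
static void
wheel_remove(node_t *n)
{
	if (n->next != NULL)
		n->next->prev = n->prev;
	if (n->prev != NULL)
		n->prev->next = n->next;
	else
		buckets[n->bucket] = n->next;
	n->next = n->prev = NULL;
}

int
main(void)
{
	node_t a = { 0 }, b = { 0 };

	wheel_insert(&a, 7);
	wheel_insert(&b, 7);	/* b becomes the new bucket head */
	wheel_remove(&b);
	assert(buckets[7] == &a && a.prev == NULL);
	wheel_remove(&a);
	assert(buckets[7] == NULL);
	return (0);
}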
 /*
- * Blows away all tcps whose TIME_WAIT has expired. List traversal
- * is done forwards from the head.
- * This walks all stack instances since
- * tcp_time_wait remains global across all stacks.
+ * Purge any tcp_t instances associated with this squeue which have expired
+ * from the TIME_WAIT state.
  */
-/* ARGSUSED */
 void
 tcp_time_wait_collector(void *arg)
 {
 	tcp_t *tcp;
-	int64_t now;
-	mblk_t *mp;
-	conn_t *connp;
-	kmutex_t *lock;
-	boolean_t removed;
-	extern void (*cl_inet_disconnect)(netstackid_t, uint8_t, sa_family_t,
-	    uint8_t *, in_port_t, uint8_t *, in_port_t, void *);
+	int64_t now, active_schedule, new_schedule;
+	unsigned int idx;
 	squeue_t *sqp = (squeue_t *)arg;
-	tcp_squeue_priv_t *tcp_time_wait =
+	tcp_squeue_priv_t *tsp =
 	    *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
 
-	mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
-	tcp_time_wait->tcp_time_wait_tid = 0;
-#ifdef DEBUG
-	tcp_time_wait->tcp_time_wait_running = B_TRUE;
-#endif
+	mutex_enter(&tsp->tcp_time_wait_lock);
+
+	/*
+	 * Because of timer scheduling complexity and the fact that the
+	 * tcp_time_wait_lock is dropped during tcp_time_wait_purge, it is
+	 * possible for multiple tcp_time_wait_collector threads to run against
+	 * the same squeue.  This flag is used to exclude other collectors from
+	 * the squeue during execution.
+	 */
+	if (tsp->tcp_time_wait_collector_active) {
+		mutex_exit(&tsp->tcp_time_wait_lock);
+		return;
+	}
+	tsp->tcp_time_wait_collector_active = B_TRUE;
 
-	if (tcp_time_wait->tcp_free_list != NULL &&
-	    tcp_time_wait->tcp_free_list->tcp_in_free_list == B_TRUE) {
+	/*
+	 * Purge the free list if necessary
+	 */
+	if (tsp->tcp_free_list != NULL) {
 		TCP_G_STAT(tcp_freelist_cleanup);
-		while ((tcp = tcp_time_wait->tcp_free_list) != NULL) {
-			tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next;
+		while ((tcp = tsp->tcp_free_list) != NULL) {
+			tsp->tcp_free_list = tcp->tcp_time_wait_next;
 			tcp->tcp_time_wait_next = NULL;
-			tcp_time_wait->tcp_free_list_cnt--;
+			tsp->tcp_free_list_cnt--;
 			ASSERT(tcp->tcp_tcps == NULL);
 			CONN_DEC_REF(tcp->tcp_connp);
 		}
-		ASSERT(tcp_time_wait->tcp_free_list_cnt == 0);
+		ASSERT(tsp->tcp_free_list_cnt == 0);
 	}
 
 	/*
-	 * In order to reap time waits reliably, we should use a
-	 * source of time that is not adjustable by the user -- hence
-	 * the call to ddi_get_lbolt64().
+	 * If there are no connections pending, clear timer-related state to be
+	 * reinitialized by the next caller.
 	 */
-	now = ddi_get_lbolt64();
-	while ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL) {
+	if (tsp->tcp_time_wait_cnt == 0) {
+		tsp->tcp_time_wait_offset = 0;
+		tsp->tcp_time_wait_schedule = 0;
+		tsp->tcp_time_wait_tid = 0;
+		tsp->tcp_time_wait_collector_active = B_FALSE;
+		mutex_exit(&tsp->tcp_time_wait_lock);
+		return;
+	}
+
+	/*
+	 * Grab the bucket which we were scheduled to cleanse.
+	 */
+	active_schedule = tsp->tcp_time_wait_schedule;
+	idx = TW_BUCKET(active_schedule - 1);
+	now = ddi_get_lbolt64() - tsp->tcp_time_wait_offset;
+retry:
+	tcp = tsp->tcp_time_wait_bucket[idx];
+
+	while (tcp != NULL) {
 		/*
-		 * lbolt64 should not wrap around in practice...  So we can
-		 * do a direct comparison.
+		 * Since the bucket count is sized to prevent wrap-around
+		 * during typical operation and timers are scheduled to process
+		 * buckets with only expired connections, there is only one
+		 * reason to encounter a connection expiring in the future:
+		 * The tcp_time_wait_collector thread has been so delayed in
+		 * its processing that connections have wrapped around the
+		 * timing wheel into this bucket.
+		 *
+		 * In that case, the remaining entries in the bucket can be
+		 * ignored since, being appended sequentially, they should all
+		 * expire in the future.
 		 */
-		if (now < tcp->tcp_time_wait_expire)
+		if (now < tcp->tcp_time_wait_expire) {
 			break;
+		}
 
-		removed = tcp_time_wait_remove(tcp, tcp_time_wait);
-		ASSERT(removed);
+		/*
+		 * Pull the connection out of the bucket.
+		 */
+		VERIFY(tcp_time_wait_remove(tcp, tsp));
 
-		connp = tcp->tcp_connp;
-		ASSERT(connp->conn_fanout != NULL);
-		lock = &connp->conn_fanout->connf_lock;
 		/*
-		 * This is essentially a TW reclaim fast path optimization for
-		 * performance where the timewait collector checks under the
-		 * fanout lock (so that no one else can get access to the
-		 * conn_t) that the refcnt is 2 i.e. one for TCP and one for
-		 * the classifier hash list. If ref count is indeed 2, we can
-		 * just remove the conn under the fanout lock and avoid
-		 * cleaning up the conn under the squeue, provided that
-		 * clustering callbacks are not enabled. If clustering is
-		 * enabled, we need to make the clustering callback before
-		 * setting the CONDEMNED flag and after dropping all locks and
-		 * so we forego this optimization and fall back to the slow
-		 * path. Also please see the comments in tcp_closei_local
-		 * regarding the refcnt logic.
+		 * Purge the connection.
 		 *
-		 * Since we are holding the tcp_time_wait_lock, its better
-		 * not to block on the fanout_lock because other connections
-		 * can't add themselves to time_wait list. So we do a
-		 * tryenter instead of mutex_enter.
+		 * While tcp_time_wait_lock will be temporarily dropped as part
+		 * of the process, there is no risk of the timer being
+		 * (re)scheduled while the collector is running since a value
+		 * corresponding to the past is left in tcp_time_wait_schedule.
 		 */
-		if (mutex_tryenter(lock)) {
-			mutex_enter(&connp->conn_lock);
-			if ((connp->conn_ref == 2) &&
-			    (cl_inet_disconnect == NULL)) {
-				ipcl_hash_remove_locked(connp,
-				    connp->conn_fanout);
-				/*
-				 * Set the CONDEMNED flag now itself so that
-				 * the refcnt cannot increase due to any
-				 * walker.
-				 */
-				connp->conn_state_flags |= CONN_CONDEMNED;
-				mutex_exit(lock);
-				mutex_exit(&connp->conn_lock);
-				if (tcp_time_wait->tcp_free_list_cnt <
-				    tcp_free_list_max_cnt) {
-					/* Add to head of tcp_free_list */
-					mutex_exit(
-					    &tcp_time_wait->tcp_time_wait_lock);
-					tcp_cleanup(tcp);
-					ASSERT(connp->conn_latch == NULL);
-					ASSERT(connp->conn_policy == NULL);
-					ASSERT(tcp->tcp_tcps == NULL);
-					ASSERT(connp->conn_netstack == NULL);
-
-					mutex_enter(
-					    &tcp_time_wait->tcp_time_wait_lock);
-					tcp->tcp_time_wait_next =
-					    tcp_time_wait->tcp_free_list;
-					tcp_time_wait->tcp_free_list = tcp;
-					tcp_time_wait->tcp_free_list_cnt++;
-					continue;
-				} else {
-					/* Do not add to tcp_free_list */
-					mutex_exit(
-					    &tcp_time_wait->tcp_time_wait_lock);
-					tcp_bind_hash_remove(tcp);
-					ixa_cleanup(tcp->tcp_connp->conn_ixa);
-					tcp_ipsec_cleanup(tcp);
-					CONN_DEC_REF(tcp->tcp_connp);
-				}
-			} else {
-				CONN_INC_REF_LOCKED(connp);
-				mutex_exit(lock);
-				mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
-				mutex_exit(&connp->conn_lock);
-				/*
-				 * We can reuse the closemp here since conn has
-				 * detached (otherwise we wouldn't even be in
-				 * time_wait list). tcp_closemp_used can safely
-				 * be changed without taking a lock as no other
-				 * thread can concurrently access it at this
-				 * point in the connection lifecycle.
-				 */
+		tcp_time_wait_purge(tcp, tsp);
 
-				if (tcp->tcp_closemp.b_prev == NULL)
-					tcp->tcp_closemp_used = B_TRUE;
-				else
-					cmn_err(CE_PANIC,
-					    "tcp_timewait_collector: "
-					    "concurrent use of tcp_closemp: "
-					    "connp %p tcp %p\n", (void *)connp,
-					    (void *)tcp);
-
-				TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
-				mp = &tcp->tcp_closemp;
-				SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
-				    tcp_timewait_close, connp, NULL,
-				    SQ_FILL, SQTAG_TCP_TIMEWAIT);
-			}
-		} else {
-			mutex_enter(&connp->conn_lock);
-			CONN_INC_REF_LOCKED(connp);
-			mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
-			mutex_exit(&connp->conn_lock);
-			/*
-			 * We can reuse the closemp here since conn has
-			 * detached (otherwise we wouldn't even be in
-			 * time_wait list). tcp_closemp_used can safely
-			 * be changed without taking a lock as no other
-			 * thread can concurrently access it at this
-			 * point in the connection lifecycle.
-			 */
+		/*
+		 * Because tcp_time_wait_remove clears the tcp_time_wait_next
+		 * field, the next item must be grabbed directly from the
+		 * bucket itself.
+		 */
+		tcp = tsp->tcp_time_wait_bucket[idx];
+	}
+
+	if (tsp->tcp_time_wait_cnt == 0) {
+		/*
+		 * There is no need for the collector to schedule a new
+		 * timer if no pending items remain.  The timer state can be
+		 * cleared only if it was untouched while the collector dropped
+		 * its locks during tcp_time_wait_purge.
+		 */
+		if (tsp->tcp_time_wait_schedule == active_schedule) {
+			tsp->tcp_time_wait_offset = 0;
+			tsp->tcp_time_wait_schedule = 0;
+			tsp->tcp_time_wait_tid = 0;
+		}
+		tsp->tcp_time_wait_collector_active = B_FALSE;
+		mutex_exit(&tsp->tcp_time_wait_lock);
+		return;
+	} else {
+		unsigned int nidx;
 
-			if (tcp->tcp_closemp.b_prev == NULL)
-				tcp->tcp_closemp_used = B_TRUE;
-			else
-				cmn_err(CE_PANIC, "tcp_timewait_collector: "
-				    "concurrent use of tcp_closemp: "
-				    "connp %p tcp %p\n", (void *)connp,
-				    (void *)tcp);
-
-			TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
-			mp = &tcp->tcp_closemp;
-			SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
-			    tcp_timewait_close, connp, NULL,
-			    SQ_FILL, SQTAG_TCP_TIMEWAIT);
+		/*
+		 * Locate the next bucket containing entries.
+		 */
+		new_schedule = active_schedule +
+		    MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
+		nidx = TW_BUCKET_NEXT(idx);
+		while (tsp->tcp_time_wait_bucket[nidx] == NULL) {
+			if (nidx == idx) {
+				break;
+			}
+			nidx = TW_BUCKET_NEXT(nidx);
+			new_schedule += MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
 		}
-		mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
+		ASSERT(tsp->tcp_time_wait_bucket[nidx] != NULL);
 	}
 
-	if (tcp_time_wait->tcp_free_list != NULL)
-		tcp_time_wait->tcp_free_list->tcp_in_free_list = B_TRUE;
+	/*
+	 * It is possible that the system is under such dire load that between
+	 * the timer scheduling and TIME_WAIT processing delay, execution
+	 * overran the interval allocated to this bucket.
+	 */
+	now = ddi_get_lbolt64() - tsp->tcp_time_wait_offset;
+	if (new_schedule <= now) {
+		/*
+		 * Attempt to right the situation by immediately performing a
+		 * purge on the next bucket.  This loop will continue as needed
+		 * until the schedule can be pushed out ahead of the clock.
+		 */
+		idx = TW_BUCKET(new_schedule - 1);
+		goto retry;
+	}
 
 	/*
-	 * If the time wait list is not empty and there is no timer running,
-	 * restart it.
+	 * Another thread may have snuck in to reschedule the timer while locks
+	 * were dropped during tcp_time_wait_purge.  Defer to the running timer
+	 * if that is the case.
 	 */
-	if ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL &&
-	    tcp_time_wait->tcp_time_wait_tid == 0) {
-		hrtime_t firetime;
-
-		/* shouldn't be necessary, but just in case */
-		if (tcp->tcp_time_wait_expire < now)
-			tcp->tcp_time_wait_expire = now;
-
-		firetime = TICK_TO_NSEC(tcp->tcp_time_wait_expire - now);
-		/* This ensures that we won't wake up too often. */
-		firetime = MAX(TCP_TIME_WAIT_DELAY, firetime);
-		tcp_time_wait->tcp_time_wait_tid =
-		    timeout_generic(CALLOUT_NORMAL, tcp_time_wait_collector,
-		    sqp, firetime, CALLOUT_TCP_RESOLUTION,
-		    CALLOUT_FLAG_ROUNDUP);
+	if (tsp->tcp_time_wait_schedule != active_schedule) {
+		tsp->tcp_time_wait_collector_active = B_FALSE;
+		mutex_exit(&tsp->tcp_time_wait_lock);
+		return;
 	}
-#ifdef DEBUG
-	tcp_time_wait->tcp_time_wait_running = B_FALSE;
-#endif
-	mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
+
+	/*
+	 * Schedule the next timer.
+	 */
+	tsp->tcp_time_wait_schedule = new_schedule;
+	tsp->tcp_time_wait_tid =
+	    timeout_generic(CALLOUT_NORMAL,
+	    tcp_time_wait_collector, sqp,
+	    TICK_TO_NSEC(new_schedule - now),
+	    CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
+	tsp->tcp_time_wait_collector_active = B_FALSE;
+	mutex_exit(&tsp->tcp_time_wait_lock);
 }
 
 /*
diff --git a/usr/src/uts/common/inet/tcp/tcp_tunables.c b/usr/src/uts/common/inet/tcp/tcp_tunables.c
index e81d68b423..f4d6c71914 100644
--- a/usr/src/uts/common/inet/tcp/tcp_tunables.c
+++ b/usr/src/uts/common/inet/tcp/tcp_tunables.c
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2015, Joyent, Inc.
+ * Copyright 2016 Joyent, Inc.
  * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2013 by Delphix. All rights reserved.
  */
@@ -249,7 +249,7 @@ mod_prop_info_t tcp_propinfo_tbl[] = {
 	/* tunable - 0 */
 	{ "_time_wait_interval", MOD_PROTO_TCP,
 	    mod_set_uint32, mod_get_uint32,
-	    {1*SECONDS, 10*MINUTES, 1*MINUTES}, {1*MINUTES} },
+	    {1*SECONDS, TCP_TIME_WAIT_MAX, 1*MINUTES}, {1*MINUTES} },
 
 	{ "_conn_req_max_q", MOD_PROTO_TCP,
 	    mod_set_uint32, mod_get_uint32,
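[Editor's illustration] The tunable change above replaces the hard-coded 10-minute ceiling with TCP_TIME_WAIT_MAX, tying the largest settable _time_wait_interval to the wheel's capacity. A quick check of that sizing relationship; SECONDS and MINUTES here are plain stand-ins for the kernel's time macros, so only the ratios matter:

#include <stdio.h>

#define	SECONDS	1
#define	MINUTES	(60 * SECONDS)

#define	TCP_TIME_WAIT_MAX	(10 * MINUTES)
#define	TCP_TIME_WAIT_DELAY	(5 * SECONDS)
#define	TCP_TIME_WAIT_BUCKETS	\
	((TCP_TIME_WAIT_MAX / TCP_TIME_WAIT_DELAY) + 1)

int
main(void)
{
	/* 600s / 5s = 120 live buckets, plus one rollover bucket = 121. */
	printf("buckets = %d\n", TCP_TIME_WAIT_BUCKETS);

	/*
	 * Because the tunable cannot exceed TCP_TIME_WAIT_MAX, the farthest
	 * future expiry is at most 120 buckets ahead of "now" and can never
	 * wrap onto the bucket currently being reaped.
	 */
	printf("max interval spans %d buckets\n",
	    TCP_TIME_WAIT_MAX / TCP_TIME_WAIT_DELAY);
	return (0);
}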
diff --git a/usr/src/uts/common/inet/tcp_impl.h b/usr/src/uts/common/inet/tcp_impl.h
index 848d27a1e8..cb83b91fad 100644
--- a/usr/src/uts/common/inet/tcp_impl.h
+++ b/usr/src/uts/common/inet/tcp_impl.h
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2015 Joyent Inc.
+ * Copyright 2016 Joyent, Inc.
  * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
  * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
  */
@@ -105,7 +105,7 @@ extern sock_downcalls_t sock_tcp_downcalls;
  */
 #define	TCP_IS_DETACHED(tcp)	((tcp)->tcp_detached)
 
-/* TCP timers related data strucutres. Refer to tcp_timers.c. */
+/* TCP timers related data structures. Refer to tcp_timers.c. */
 typedef struct tcp_timer_s {
 	conn_t	*connp;
 	void	(*tcpt_proc)(void *);
@@ -132,48 +132,79 @@ extern kmem_cache_t *tcp_timercache;
 	(tcp)->tcp_timer_tid = TCP_TIMER((tcp), tcp_timer, (intvl));	\
 }
 
+
+/*
+ * Maximum TIME_WAIT timeout.  It is defined here (instead of tcp_tunables.c)
+ * so that other parameters can be derived from it.
+ */
+#define	TCP_TIME_WAIT_MAX	(10 * MINUTES)
+
+/*
+ * TCP_TIME_WAIT_DELAY governs how often the time_wait_collector runs.
+ * Running it every 5 seconds seems to yield a reasonable balance between
+ * cleanup liveliness and system load.
+ */
+#define	TCP_TIME_WAIT_DELAY	(5 * SECONDS)
+
+#define	TCP_TIME_WAIT_BUCKETS	((TCP_TIME_WAIT_MAX / TCP_TIME_WAIT_DELAY) + 1)
+
 /*
  * For scalability, we must not run a timer for every TCP connection
  * in TIME_WAIT state.  To see why, consider (for time wait interval of
  * 1 minutes):
  *	10,000 connections/sec * 60 seconds/time wait = 600,000 active conn's
  *
- * This list is ordered by time, so you need only delete from the head
- * until you get to entries which aren't old enough to delete yet.
- * The list consists of only the detached TIME_WAIT connections.
+ * Since TIME_WAIT expiration occurs on a per-squeue basis, handling
+ * connections from all netstacks on the system, a simple queue is inadequate
+ * for pending entries.  This is because tcp_time_wait_interval may differ
+ * between connections, causing tail insertion to violate expiration order.
+ *
+ * Instead of performing expensive sorting or unnecessary list traversal to
+ * counteract interval variance between netstacks, a timing wheel structure is
+ * used.  The duration covered by each bucket in the wheel is determined by the
+ * TCP_TIME_WAIT_DELAY (5 seconds).  The number of buckets in the wheel is
+ * determined by dividing the maximum TIME_WAIT interval (10 minutes) by
+ * TCP_TIME_WAIT_DELAY, with one added bucket for rollover protection.
+ * (Yielding 121 buckets with the current parameters.)  When items are
+ * inserted into the set of buckets, they are indexed by using their expiration
+ * time divided by the bucket size, modulo the number of buckets.  This means
+ * that when each bucket is processed, all items within should have expired
+ * within the last TCP_TIME_WAIT_DELAY interval.
+ *
+ * Since bucket timer schedules are rounded to the nearest TCP_TIME_WAIT_DELAY
+ * interval to ensure all connections in the pending bucket will be expired, a
+ * per-squeue offset is used when doing TIME_WAIT scheduling.  This offset is
+ * between 0 and the TCP_TIME_WAIT_DELAY and is designed to avoid scheduling
+ * all of the tcp_time_wait_collector threads to run in lock-step.  The offset
+ * is fixed while there are any connections present in the buckets.
  *
  * When a tcp_t enters TIME_WAIT state, a timer is started (timeout is
  * tcps_time_wait_interval). When the tcp_t is detached (upper layer closes
- * the end point), it is moved to the time wait list and another timer is
- * started (expiry time is set at tcp_time_wait_expire, which is
- * also calculated using tcps_time_wait_interval). This means that the
- * TIME_WAIT state can be extended (up to doubled) if the tcp_t doesn't
- * become detached for a long time.
+ * the end point), it is scheduled to be cleaned up by the squeue-driving
+ * tcp_time_wait_collector (also using tcps_time_wait_interval).  This means
+ * that the TIME_WAIT state can be extended (up to doubled) if the tcp_t
+ * doesn't become detached for a long time.
  *
  * The list manipulations (including tcp_time_wait_next/prev)
  * are protected by the tcp_time_wait_lock. The content of the
  * detached TIME_WAIT connections is protected by the normal perimeters.
  *
- * This list is per squeue and squeues are shared across the tcp_stack_t's.
- * Things on tcp_time_wait_head remain associated with the tcp_stack_t
- * and conn_netstack.
- * The tcp_t's that are added to tcp_free_list are disassociated and
- * have NULL tcp_tcps and conn_netstack pointers.
+ * These connection lists are per squeue and squeues are shared across the
+ * tcp_stack_t instances.  Things in a tcp_time_wait_bucket remain associated
+ * with the tcp_stack_t and conn_netstack.  Any tcp_t connections stored in the
+ * tcp_free_list are disassociated and have NULL tcp_tcps and conn_netstack
+ * pointers.
  */
 typedef struct tcp_squeue_priv_s {
 	kmutex_t	tcp_time_wait_lock;
+	boolean_t	tcp_time_wait_collector_active;
 	callout_id_t	tcp_time_wait_tid;
-	tcp_t		*tcp_time_wait_head;
-	tcp_t		*tcp_time_wait_tail;
+	uint64_t	tcp_time_wait_cnt;
+	int64_t		tcp_time_wait_schedule;
+	int64_t		tcp_time_wait_offset;
+	tcp_t		*tcp_time_wait_bucket[TCP_TIME_WAIT_BUCKETS];
 	tcp_t		*tcp_free_list;
 	uint_t		tcp_free_list_cnt;
-#ifdef DEBUG
-	/*
-	 * For debugging purpose, true when tcp_time_wait_collector() is
-	 * running.
-	 */
-	boolean_t	tcp_time_wait_running;
-#endif
 } tcp_squeue_priv_t;
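[Editor's illustration] Putting the tcp_squeue_priv_t pieces together, here is a compact userland model of the whole mechanism: insertion applies a per-wheel offset, expiry is bucketed, and a sweep drains the bucket whose rounded deadline has arrived. The wheel_t/entry_t types, the 100 Hz tick assumption, and every function name below are illustrative only, not kernel interfaces:

#include <stdio.h>

#define	BUCKET_TICKS	500	/* 5s at an assumed 100 Hz */
#define	NUM_BUCKETS	121

typedef struct entry {
	struct entry *next;
	long long expire;
} entry_t;

typedef struct wheel {
	long long offset;	/* fixed while entries are pending */
	entry_t *bucket[NUM_BUCKETS];
	unsigned int cnt;
} wheel_t;

static unsigned int
wheel_idx(long long t)
{
	return ((t / BUCKET_TICKS) % NUM_BUCKETS);
}

static void
wheel_insert(wheel_t *w, entry_t *e, long long now, long long interval)
{
	unsigned int b;

	if (w->cnt == 0)
		w->offset = now % BUCKET_TICKS;	/* (re)arm the offset */
	e->expire = (now - w->offset) + interval;
	b = wheel_idx(e->expire);
	e->next = w->bucket[b];
	w->bucket[b] = e;
	w->cnt++;
}

/* Drain the bucket owning timestamp (deadline - 1), like the collector. */
static void
wheel_sweep(wheel_t *w, long long deadline)
{
	unsigned int b = wheel_idx(deadline - 1);
	entry_t *e;

	while ((e = w->bucket[b]) != NULL) {
		w->bucket[b] = e->next;
		w->cnt--;
		printf("reaped entry expiring at %lld\n", e->expire);
	}
}

int
main(void)
{
	wheel_t w = { 0 };
	entry_t e1, e2;
	long long now = 12345, deadline;

	wheel_insert(&w, &e1, now, 6000);	/* 60s interval */
	wheel_insert(&w, &e2, now, 6000);
	/* The collector fires at the rounded-up bucket deadline. */
	deadline = e1.expire + BUCKET_TICKS;
	deadline -= deadline % BUCKET_TICKS;
	wheel_sweep(&w, deadline);
	printf("remaining: %u\n", w.cnt);
	return (0);
}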
