Diffstat (limited to 'usr/src')
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp_input.c      |   4
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp_time_wait.c  | 668
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp_tunables.c   |   4
-rw-r--r--  usr/src/uts/common/inet/tcp_impl.h           |  79
4 files changed, 435 insertions, 320 deletions
diff --git a/usr/src/uts/common/inet/tcp/tcp_input.c b/usr/src/uts/common/inet/tcp/tcp_input.c
index cf8e0c6bd4..7cfdb9a4a2 100644
--- a/usr/src/uts/common/inet/tcp/tcp_input.c
+++ b/usr/src/uts/common/inet/tcp/tcp_input.c
@@ -22,7 +22,7 @@
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2011 Joyent, Inc. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
* Copyright (c) 2014 by Delphix. All rights reserved.
*/
@@ -99,7 +99,7 @@
* tcps_time_wait_interval since the period before upper layer closes the
* connection is not accounted for when tcp_time_wait_append() is called.
*
- * If uppser layer has closed the connection, call tcp_time_wait_append()
+ * If upper layer has closed the connection, call tcp_time_wait_append()
* directly.
*
*/
diff --git a/usr/src/uts/common/inet/tcp/tcp_time_wait.c b/usr/src/uts/common/inet/tcp/tcp_time_wait.c
index b470934da0..6600296b18 100644
--- a/usr/src/uts/common/inet/tcp/tcp_time_wait.c
+++ b/usr/src/uts/common/inet/tcp/tcp_time_wait.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, Joyent Inc. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
*/
/*
@@ -41,13 +41,13 @@
#include <inet/tcp_impl.h>
#include <inet/tcp_cluster.h>
-static void tcp_timewait_close(void *, mblk_t *, void *, ip_recv_attr_t *);
+static void tcp_time_wait_purge(tcp_t *, tcp_squeue_priv_t *);
+
+#define TW_BUCKET(t) \
+ (((t) / MSEC_TO_TICK(TCP_TIME_WAIT_DELAY)) % TCP_TIME_WAIT_BUCKETS)
+
+#define TW_BUCKET_NEXT(b) (((b) + 1) % TCP_TIME_WAIT_BUCKETS)
-/*
- * TCP_TIME_WAIT_DELAY governs how often the time_wait_collector runs.
- * Running it every 5 seconds seems to give the best results.
- */
-#define TCP_TIME_WAIT_DELAY ((hrtime_t)5 * NANOSEC)
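As an aside on the new macros: TW_BUCKET maps an absolute expiration time (in
ticks) onto a slot in the timing wheel, and TW_BUCKET_NEXT steps around it. A
small user-space sketch of the arithmetic, assuming hz = 100, a simplified
MSEC_TO_TICK, and the 121-bucket sizing from tcp_impl.h (all stand-in values,
not the kernel definitions):

#include <stdio.h>
#include <stdint.h>

#define HZ                      100     /* assumed tick rate */
#define MSEC_TO_TICK(ms)        ((ms) / (1000 / HZ))
#define TCP_TIME_WAIT_DELAY     5000    /* 5 seconds, in milliseconds */
#define TCP_TIME_WAIT_BUCKETS   121

#define TW_BUCKET(t) \
        (((t) / MSEC_TO_TICK(TCP_TIME_WAIT_DELAY)) % TCP_TIME_WAIT_BUCKETS)
#define TW_BUCKET_NEXT(b)       (((b) + 1) % TCP_TIME_WAIT_BUCKETS)

int
main(void)
{
        int64_t a = MSEC_TO_TICK(600000LL);     /* expires at t = 600s */
        int64_t b = a + MSEC_TO_TICK(60000LL);  /* expires 60s later */

        /* 600s / 5s = 120, so 'a' lands in the last bucket ... */
        printf("bucket(a) = %lld\n", (long long)TW_BUCKET(a));  /* 120 */
        /* ... and 'b', twelve slots further on, wraps around to 11. */
        printf("bucket(b) = %lld\n", (long long)TW_BUCKET(b));  /* 11 */
        printf("next(120) = %d\n", TW_BUCKET_NEXT(120));        /* 0 */
        return (0);
}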
/*
* Remove a connection from the list of detached TIME_WAIT connections.
@@ -56,17 +56,17 @@ static void tcp_timewait_close(void *, mblk_t *, void *, ip_recv_attr_t *);
* earlier call to tcp_time_wait_remove(); otherwise it returns B_TRUE.
*/
boolean_t
-tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait)
+tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tsp)
{
boolean_t locked = B_FALSE;
- if (tcp_time_wait == NULL) {
- tcp_time_wait = *((tcp_squeue_priv_t **)
+ if (tsp == NULL) {
+ tsp = *((tcp_squeue_priv_t **)
squeue_getprivate(tcp->tcp_connp->conn_sqp, SQPRIVATE_TCP));
- mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
+ mutex_enter(&tsp->tcp_time_wait_lock);
locked = B_TRUE;
} else {
- ASSERT(MUTEX_HELD(&tcp_time_wait->tcp_time_wait_lock));
+ ASSERT(MUTEX_HELD(&tsp->tcp_time_wait_lock));
}
/* 0 means that the tcp_t has not been added to the time wait list. */
@@ -74,40 +74,34 @@ tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait)
ASSERT(tcp->tcp_time_wait_next == NULL);
ASSERT(tcp->tcp_time_wait_prev == NULL);
if (locked)
- mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
+ mutex_exit(&tsp->tcp_time_wait_lock);
return (B_FALSE);
}
ASSERT(TCP_IS_DETACHED(tcp));
ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
+ ASSERT(tsp->tcp_time_wait_cnt > 0);
- if (tcp == tcp_time_wait->tcp_time_wait_head) {
- ASSERT(tcp->tcp_time_wait_prev == NULL);
- tcp_time_wait->tcp_time_wait_head = tcp->tcp_time_wait_next;
- if (tcp_time_wait->tcp_time_wait_head != NULL) {
- tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev =
- NULL;
- } else {
- tcp_time_wait->tcp_time_wait_tail = NULL;
- }
- } else if (tcp == tcp_time_wait->tcp_time_wait_tail) {
- ASSERT(tcp->tcp_time_wait_next == NULL);
- tcp_time_wait->tcp_time_wait_tail = tcp->tcp_time_wait_prev;
- ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL);
- tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = NULL;
- } else {
- ASSERT(tcp->tcp_time_wait_prev->tcp_time_wait_next == tcp);
- ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == tcp);
- tcp->tcp_time_wait_prev->tcp_time_wait_next =
- tcp->tcp_time_wait_next;
+ if (tcp->tcp_time_wait_next != NULL) {
tcp->tcp_time_wait_next->tcp_time_wait_prev =
tcp->tcp_time_wait_prev;
}
+ if (tcp->tcp_time_wait_prev != NULL) {
+ tcp->tcp_time_wait_prev->tcp_time_wait_next =
+ tcp->tcp_time_wait_next;
+ } else {
+ unsigned int bucket;
+
+ bucket = TW_BUCKET(tcp->tcp_time_wait_expire);
+ ASSERT(tsp->tcp_time_wait_bucket[bucket] == tcp);
+ tsp->tcp_time_wait_bucket[bucket] = tcp->tcp_time_wait_next;
+ }
tcp->tcp_time_wait_next = NULL;
tcp->tcp_time_wait_prev = NULL;
tcp->tcp_time_wait_expire = 0;
+ tsp->tcp_time_wait_cnt--;
if (locked)
- mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
+ mutex_exit(&tsp->tcp_time_wait_lock);
return (B_TRUE);
}
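The rewritten removal logic above relies on a wheel invariant: a tcp_t with a
NULL tcp_time_wait_prev must be the head of its bucket, so the bucket slot
itself (recomputed from the expiration time) is repointed rather than a
neighboring node. A minimal stand-alone sketch of that unlink pattern, with
illustrative types and names rather than the kernel's:

#include <stddef.h>

typedef struct node {
        struct node     *next;
        struct node     *prev;
        long long       expire;         /* absolute expiry, in ticks */
} node_t;

static void
bucket_remove(node_t **buckets, unsigned int nbuckets, long long delay_ticks,
    node_t *n)
{
        if (n->next != NULL)
                n->next->prev = n->prev;
        if (n->prev != NULL) {
                n->prev->next = n->next;
        } else {
                /* No predecessor: recompute the bucket index from expire. */
                unsigned int idx =
                    (unsigned int)((n->expire / delay_ticks) % nbuckets);

                buckets[idx] = n->next;
        }
        n->next = n->prev = NULL;
        n->expire = 0;  /* zero marks "not on the wheel", as in the tcp_t */
}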
@@ -126,6 +120,7 @@ tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait)
((x)->tcp_connp->conn_ipversion == IPV6_VERSION && \
IN6_IS_ADDR_LOOPBACK(&(x)->tcp_connp->conn_laddr_v6)))
+
/*
* Add a connection to the list of detached TIME_WAIT connections
* and set its time to expire.
@@ -135,9 +130,10 @@ tcp_time_wait_append(tcp_t *tcp)
{
tcp_stack_t *tcps = tcp->tcp_tcps;
squeue_t *sqp = tcp->tcp_connp->conn_sqp;
- tcp_squeue_priv_t *tcp_time_wait =
+ tcp_squeue_priv_t *tsp =
*((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
- hrtime_t firetime = 0;
+ int64_t now, schedule;
+ unsigned int bucket;
tcp_timers_stop(tcp);
@@ -146,6 +142,8 @@ tcp_time_wait_append(tcp_t *tcp)
ASSERT(tcp->tcp_ack_tid == 0);
/* must have happened at the time of detaching the tcp */
+ ASSERT(TCP_IS_DETACHED(tcp));
+ ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
ASSERT(tcp->tcp_ptpahn == NULL);
ASSERT(tcp->tcp_flow_stopped == 0);
ASSERT(tcp->tcp_time_wait_next == NULL);
@@ -153,97 +151,112 @@ tcp_time_wait_append(tcp_t *tcp)
ASSERT(tcp->tcp_time_wait_expire == 0);
ASSERT(tcp->tcp_listener == NULL);
- tcp->tcp_time_wait_expire = ddi_get_lbolt64();
- if (IS_LOCAL_HOST(tcp)) {
- /*
- * This is the fastpath for handling localhost connections.
- * Since we don't have to worry about packets on the localhost
- * showing up after a long network delay, we want to expire
- * these quickly so the port range on the localhost doesn't
- * get starved by short-running, local apps.
- *
- * Leave tcp_time_wait_expire at the current time. This
- * essentially means the connection is expired now and it will
- * clean up the next time tcp_time_wait_collector runs. We set
- * firetime to use a short delay so that if we have to start a
- * tcp_time_wait_collector thread below, it runs soon instead
- * of after a delay of time_wait_interval. firetime being set
- * to a non-0 value is also our indicator that we should add
- * this connection to the head of the time wait list (since we
- * are already expired) so that its sure to get cleaned up on
- * the next run of tcp_time_wait_collector (which expects the
- * entries to appear in time-order and stops when it hits the
- * first non-expired entry).
- */
- firetime = TCP_TIME_WAIT_DELAY;
- } else {
- /*
- * Since tcp_time_wait_expire is lbolt64, it should not wrap
- * around in practice. Hence it cannot be 0. Note that zero
- * means that the tcp_t is not in the TIME_WAIT list.
- */
- tcp->tcp_time_wait_expire += MSEC_TO_TICK(
- tcps->tcps_time_wait_interval);
+ TCP_DBGSTAT(tcps, tcp_time_wait);
+ mutex_enter(&tsp->tcp_time_wait_lock);
+
+ /*
+ * Immediately expire loopback connections. Since there is no worry
+ * about packets on the local host showing up after a long network
+ * delay, this is safe and allows much higher rates of connection churn
+ * for applications operating locally.
+ *
+ * This typically bypasses the tcp_free_list fast path due to squeue
+ * re-entry for the loopback close operation.
+ */
+ if (tcp->tcp_loopback) {
+ tcp_time_wait_purge(tcp, tsp);
+ mutex_exit(&tsp->tcp_time_wait_lock);
+ return;
}
- ASSERT(TCP_IS_DETACHED(tcp));
- ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
- ASSERT(tcp->tcp_time_wait_next == NULL);
- ASSERT(tcp->tcp_time_wait_prev == NULL);
- TCP_DBGSTAT(tcps, tcp_time_wait);
+ /*
+ * In order to reap TIME_WAITs reliably, we should use a source of time
+ * that is not adjustable by the user. While it would be more accurate
+ * to grab this timestamp before (potentially) sleeping on the
+ * tcp_time_wait_lock, doing so complicates bucket addressing later.
+ */
+ now = ddi_get_lbolt64();
- mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
- if (tcp_time_wait->tcp_time_wait_head == NULL) {
- ASSERT(tcp_time_wait->tcp_time_wait_tail == NULL);
- tcp_time_wait->tcp_time_wait_head = tcp;
+ /*
+ * Each squeue uses an arbitrary time offset when scheduling
+ * expiration timers. This prevents the bucketing from forcing
+ * tcp_time_wait_collector to run in lock-step across squeues.
+ *
+ * This offset is (re)initialized when a new TIME_WAIT connection is
+ * added to an squeue which has no connections waiting to expire.
+ */
+ if (tsp->tcp_time_wait_tid == 0) {
+ ASSERT(tsp->tcp_time_wait_cnt == 0);
+ tsp->tcp_time_wait_offset =
+ now % MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
+ }
+ now -= tsp->tcp_time_wait_offset;
+
+ /*
+ * Use the netstack-defined timeout, rounded up to the minimum
+ * time_wait_collector interval.
+ */
+ schedule = now + MSEC_TO_TICK(tcps->tcps_time_wait_interval);
+ tcp->tcp_time_wait_expire = schedule;
+
+ /*
+ * Append the connection into the appropriate bucket.
+ */
+ bucket = TW_BUCKET(tcp->tcp_time_wait_expire);
+ tcp->tcp_time_wait_next = tsp->tcp_time_wait_bucket[bucket];
+ tsp->tcp_time_wait_bucket[bucket] = tcp;
+ if (tcp->tcp_time_wait_next != NULL) {
+ ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == NULL);
+ tcp->tcp_time_wait_next->tcp_time_wait_prev = tcp;
+ }
+ tsp->tcp_time_wait_cnt++;
+
+ /*
+ * Round delay up to the nearest bucket boundary.
+ */
+ schedule += MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
+ schedule -= schedule % MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
+
+ /*
+ * The newly inserted entry may require a tighter schedule for the
+ * expiration timer.
+ */
+ if (schedule < tsp->tcp_time_wait_schedule) {
+ callout_id_t old_tid = tsp->tcp_time_wait_tid;
+
+ tsp->tcp_time_wait_schedule = schedule;
+ tsp->tcp_time_wait_tid =
+ timeout_generic(CALLOUT_NORMAL,
+ tcp_time_wait_collector, sqp,
+ TICK_TO_NSEC(schedule - now),
+ CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
/*
- * Even if the list was empty before, there may be a timer
- * running since a tcp_t can be removed from the list
- * in other places, such as tcp_clean_death(). So check if
- * a timer is needed.
- */
- if (tcp_time_wait->tcp_time_wait_tid == 0) {
- if (firetime == 0)
- firetime = (hrtime_t)
- (tcps->tcps_time_wait_interval + 1) *
- MICROSEC;
-
- tcp_time_wait->tcp_time_wait_tid =
- timeout_generic(CALLOUT_NORMAL,
- tcp_time_wait_collector, sqp, firetime,
- CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
- }
- tcp_time_wait->tcp_time_wait_tail = tcp;
- } else {
- /*
- * The list is not empty, so a timer must be running. If not,
- * tcp_time_wait_collector() must be running on this
- * tcp_time_wait list at the same time.
+ * It is possible for the timer to fire before the untimeout
+ * action is able to complete. In that case, the exclusion
+ * offered by the tcp_time_wait_collector_active flag will
+ * prevent multiple collector threads from processing records
+ * simultaneously from the same squeue.
*/
- ASSERT(tcp_time_wait->tcp_time_wait_tid != 0 ||
- tcp_time_wait->tcp_time_wait_running);
- ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL);
- ASSERT(tcp_time_wait->tcp_time_wait_tail->tcp_state ==
- TCPS_TIME_WAIT);
-
- if (firetime == 0) {
- /* add at end */
- tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next =
- tcp;
- tcp->tcp_time_wait_prev =
- tcp_time_wait->tcp_time_wait_tail;
- tcp_time_wait->tcp_time_wait_tail = tcp;
- } else {
- /* add at head */
- tcp->tcp_time_wait_next =
- tcp_time_wait->tcp_time_wait_head;
- tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev =
- tcp;
- tcp_time_wait->tcp_time_wait_head = tcp;
- }
+ mutex_exit(&tsp->tcp_time_wait_lock);
+ (void) untimeout_default(old_tid, 0);
+ return;
+ }
+
+ /*
+ * Start a fresh timer if none exists.
+ */
+ if (tsp->tcp_time_wait_schedule == 0) {
+ ASSERT(tsp->tcp_time_wait_tid == 0);
+
+ tsp->tcp_time_wait_schedule = schedule;
+ tsp->tcp_time_wait_tid =
+ timeout_generic(CALLOUT_NORMAL,
+ tcp_time_wait_collector, sqp,
+ TICK_TO_NSEC(schedule - now),
+ CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
}
- mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
+ mutex_exit(&tsp->tcp_time_wait_lock);
}
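The round-up in tcp_time_wait_append() guarantees the callout fires only once
every connection in the target bucket has expired: the deadline is pushed to
the next TCP_TIME_WAIT_DELAY boundary strictly after the expiration time. A
sketch of that arithmetic, with delay_ticks standing in for
MSEC_TO_TICK(TCP_TIME_WAIT_DELAY):

/* Round 'schedule' up to the next multiple of delay_ticks. */
static long long
tw_round_up(long long schedule, long long delay_ticks)
{
        schedule += delay_ticks;
        schedule -= schedule % delay_ticks;
        return (schedule);
}

/*
 * With delay_ticks = 500: tw_round_up(60001, 500) == 60500, and an exact
 * boundary is pushed out a full interval: tw_round_up(60500, 500) == 61000.
 */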
/*
@@ -278,216 +291,287 @@ tcp_timewait_close(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
tcp_close_detached(tcp);
}
+
+static void
+tcp_time_wait_purge(tcp_t *tcp, tcp_squeue_priv_t *tsp)
+{
+ mblk_t *mp;
+ conn_t *connp = tcp->tcp_connp;
+ kmutex_t *lock;
+
+ ASSERT(MUTEX_HELD(&tsp->tcp_time_wait_lock));
+ ASSERT(connp->conn_fanout != NULL);
+
+ lock = &connp->conn_fanout->connf_lock;
+
+ /*
+ * This is essentially a TIME_WAIT reclaim fast path optimization for
+ * performance where the connection is checked under the fanout lock
+ * (so that no one else can get access to the conn_t) to verify that the
+ * refcnt is 2 (one each for TCP and the classifier hash list). If that is
+ * the case and clustering callbacks are not enabled, the conn can be
+ * removed under the fanout lock, avoiding clean-up under the squeue.
+ *
+ * This optimization is forgone when clustering is enabled since the
+ * clustering callback must be made before setting the CONDEMNED flag
+ * and after dropping all locks.
+ *
+ * See the comments in tcp_closei_local for additional information
+ * regarding the refcnt logic.
+ */
+ if (mutex_tryenter(lock)) {
+ mutex_enter(&connp->conn_lock);
+ if (connp->conn_ref == 2 && cl_inet_disconnect == NULL) {
+ ipcl_hash_remove_locked(connp, connp->conn_fanout);
+ /*
+ * Set the CONDEMNED flag now itself so that the refcnt
+ * cannot increase due to any walker.
+ */
+ connp->conn_state_flags |= CONN_CONDEMNED;
+ mutex_exit(&connp->conn_lock);
+ mutex_exit(lock);
+ if (tsp->tcp_free_list_cnt < tcp_free_list_max_cnt) {
+ /*
+ * Add to head of tcp_free_list
+ */
+ tcp_cleanup(tcp);
+ ASSERT(connp->conn_latch == NULL);
+ ASSERT(connp->conn_policy == NULL);
+ ASSERT(tcp->tcp_tcps == NULL);
+ ASSERT(connp->conn_netstack == NULL);
+
+ tcp->tcp_time_wait_next = tsp->tcp_free_list;
+ tcp->tcp_in_free_list = B_TRUE;
+ tsp->tcp_free_list = tcp;
+ tsp->tcp_free_list_cnt++;
+ } else {
+ /*
+ * Do not add to tcp_free_list
+ */
+ tcp_bind_hash_remove(tcp);
+ ixa_cleanup(tcp->tcp_connp->conn_ixa);
+ tcp_ipsec_cleanup(tcp);
+ CONN_DEC_REF(tcp->tcp_connp);
+ }
+
+ /*
+ * With the fast-path complete, we can bail.
+ */
+ return;
+ } else {
+ /*
+ * Fall back to slow path.
+ */
+ CONN_INC_REF_LOCKED(connp);
+ mutex_exit(&connp->conn_lock);
+ mutex_exit(lock);
+ }
+ } else {
+ CONN_INC_REF(connp);
+ }
+
+ /*
+ * We can reuse the closemp here since conn has detached (otherwise we
+ * wouldn't even be in time_wait list). It is safe to change
+ * tcp_closemp_used without taking a lock as no other thread can
+ * concurrently access it at this point in the connection lifecycle.
+ */
+ if (tcp->tcp_closemp.b_prev == NULL) {
+ tcp->tcp_closemp_used = B_TRUE;
+ } else {
+ cmn_err(CE_PANIC,
+ "tcp_timewait_collector: concurrent use of tcp_closemp: "
+ "connp %p tcp %p\n", (void *)connp, (void *)tcp);
+ }
+
+ TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
+ mp = &tcp->tcp_closemp;
+ mutex_exit(&tsp->tcp_time_wait_lock);
+ SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_timewait_close, connp, NULL,
+ SQ_FILL, SQTAG_TCP_TIMEWAIT);
+ mutex_enter(&tsp->tcp_time_wait_lock);
+}
+
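The fast path in tcp_time_wait_purge() pairs a try-lock with a refcount check;
blocking on the fanout lock here would stall every thread waiting to append to
this squeue's TIME_WAIT wheel. A reduced pthreads sketch of the pattern
(obj_t, take_ref, destroy_now, and defer_to_squeue are hypothetical stand-ins,
and the lock ordering is simplified relative to the kernel code):

#include <pthread.h>

typedef struct obj {
        pthread_mutex_t lock;   /* stands in for connf_lock */
        int             ref;    /* stands in for conn_ref */
} obj_t;

/* CONN_INC_REF takes conn_lock internally; this sketch elides that. */
static void take_ref(obj_t *o) { o->ref++; }
static void destroy_now(obj_t *o) { (void)o; }          /* inline teardown */
static void defer_to_squeue(obj_t *o) { (void)o; }      /* SQUEUE_ENTER_ONE */

static void
reclaim(obj_t *o)
{
        if (pthread_mutex_trylock(&o->lock) == 0) {
                /* Only TCP and the hash list hold references: fast path. */
                if (o->ref == 2) {
                        pthread_mutex_unlock(&o->lock);
                        destroy_now(o);
                        return;
                }
                take_ref(o);
                pthread_mutex_unlock(&o->lock);
        } else {
                /* Contended: don't block, just take a reference. */
                take_ref(o);
        }
        defer_to_squeue(o);     /* slow path */
}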
/*
- * Blows away all tcps whose TIME_WAIT has expired. List traversal
- * is done forwards from the head.
- * This walks all stack instances since
- * tcp_time_wait remains global across all stacks.
+ * Purge any tcp_t instances associated with this squeue which have expired
+ * from the TIME_WAIT state.
*/
-/* ARGSUSED */
void
tcp_time_wait_collector(void *arg)
{
tcp_t *tcp;
- int64_t now;
- mblk_t *mp;
- conn_t *connp;
- kmutex_t *lock;
- boolean_t removed;
- extern void (*cl_inet_disconnect)(netstackid_t, uint8_t, sa_family_t,
- uint8_t *, in_port_t, uint8_t *, in_port_t, void *);
+ int64_t now, active_schedule, new_schedule;
+ unsigned int idx;
squeue_t *sqp = (squeue_t *)arg;
- tcp_squeue_priv_t *tcp_time_wait =
+ tcp_squeue_priv_t *tsp =
*((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
- mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
- tcp_time_wait->tcp_time_wait_tid = 0;
-#ifdef DEBUG
- tcp_time_wait->tcp_time_wait_running = B_TRUE;
-#endif
+ mutex_enter(&tsp->tcp_time_wait_lock);
+
+ /*
+ * Because of timer scheduling complexity and the fact that the
+ * tcp_time_wait_lock is dropped during tcp_time_wait_purge, it is
+ * possible for multiple tcp_time_wait_collector threads to run against
+ * the same squeue. This flag is used to exclude other collectors from
+ * the squeue during execution.
+ */
+ if (tsp->tcp_time_wait_collector_active) {
+ mutex_exit(&tsp->tcp_time_wait_lock);
+ return;
+ }
+ tsp->tcp_time_wait_collector_active = B_TRUE;
- if (tcp_time_wait->tcp_free_list != NULL &&
- tcp_time_wait->tcp_free_list->tcp_in_free_list == B_TRUE) {
+ /*
+ * Purge the free list if necessary
+ */
+ if (tsp->tcp_free_list != NULL) {
TCP_G_STAT(tcp_freelist_cleanup);
- while ((tcp = tcp_time_wait->tcp_free_list) != NULL) {
- tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next;
+ while ((tcp = tsp->tcp_free_list) != NULL) {
+ tsp->tcp_free_list = tcp->tcp_time_wait_next;
tcp->tcp_time_wait_next = NULL;
- tcp_time_wait->tcp_free_list_cnt--;
+ tsp->tcp_free_list_cnt--;
ASSERT(tcp->tcp_tcps == NULL);
CONN_DEC_REF(tcp->tcp_connp);
}
- ASSERT(tcp_time_wait->tcp_free_list_cnt == 0);
+ ASSERT(tsp->tcp_free_list_cnt == 0);
}
/*
- * In order to reap time waits reliably, we should use a
- * source of time that is not adjustable by the user -- hence
- * the call to ddi_get_lbolt64().
+ * If there are no connections pending, clear timer-related state to be
+ * reinitialized by the next caller.
*/
- now = ddi_get_lbolt64();
- while ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL) {
+ if (tsp->tcp_time_wait_cnt == 0) {
+ tsp->tcp_time_wait_offset = 0;
+ tsp->tcp_time_wait_schedule = 0;
+ tsp->tcp_time_wait_tid = 0;
+ tsp->tcp_time_wait_collector_active = B_FALSE;
+ mutex_exit(&tsp->tcp_time_wait_lock);
+ return;
+ }
+
+ /*
+ * Grab the bucket which we were scheduled to cleanse.
+ */
+ active_schedule = tsp->tcp_time_wait_schedule;
+ idx = TW_BUCKET(active_schedule - 1);
+ now = ddi_get_lbolt64() - tsp->tcp_time_wait_offset;
+retry:
+ tcp = tsp->tcp_time_wait_bucket[idx];
+
+ while (tcp != NULL) {
/*
- * lbolt64 should not wrap around in practice... So we can
- * do a direct comparison.
+ * Since the bucket count is sized to prevent wrap-around
+ * during typical operation and timers are scheduled to process
+ * buckets with only expired connections, there is only one
+ * reason to encounter a connection expiring in the future:
+ * The tcp_time_wait_collector thread has been so delayed in
+ * its processing that connections have wrapped around the
+ * timing wheel into this bucket.
+ *
+ * In that case, the remaining entries in the bucket can be
+ * ignored since, being appended sequentially, they should all
+ * expire in the future.
*/
- if (now < tcp->tcp_time_wait_expire)
+ if (now < tcp->tcp_time_wait_expire) {
break;
+ }
- removed = tcp_time_wait_remove(tcp, tcp_time_wait);
- ASSERT(removed);
+ /*
+ * Pull the connection out of the bucket.
+ */
+ VERIFY(tcp_time_wait_remove(tcp, tsp));
- connp = tcp->tcp_connp;
- ASSERT(connp->conn_fanout != NULL);
- lock = &connp->conn_fanout->connf_lock;
/*
- * This is essentially a TW reclaim fast path optimization for
- * performance where the timewait collector checks under the
- * fanout lock (so that no one else can get access to the
- * conn_t) that the refcnt is 2 i.e. one for TCP and one for
- * the classifier hash list. If ref count is indeed 2, we can
- * just remove the conn under the fanout lock and avoid
- * cleaning up the conn under the squeue, provided that
- * clustering callbacks are not enabled. If clustering is
- * enabled, we need to make the clustering callback before
- * setting the CONDEMNED flag and after dropping all locks and
- * so we forego this optimization and fall back to the slow
- * path. Also please see the comments in tcp_closei_local
- * regarding the refcnt logic.
+ * Purge the connection.
*
- * Since we are holding the tcp_time_wait_lock, its better
- * not to block on the fanout_lock because other connections
- * can't add themselves to time_wait list. So we do a
- * tryenter instead of mutex_enter.
+ * While tcp_time_wait_lock will be temporarily dropped as part
+ * of the process, there is no risk of the timer being
+ * (re)scheduled while the collector is running since a value
+ * corresponding to the past is left in tcp_time_wait_schedule.
*/
- if (mutex_tryenter(lock)) {
- mutex_enter(&connp->conn_lock);
- if ((connp->conn_ref == 2) &&
- (cl_inet_disconnect == NULL)) {
- ipcl_hash_remove_locked(connp,
- connp->conn_fanout);
- /*
- * Set the CONDEMNED flag now itself so that
- * the refcnt cannot increase due to any
- * walker.
- */
- connp->conn_state_flags |= CONN_CONDEMNED;
- mutex_exit(lock);
- mutex_exit(&connp->conn_lock);
- if (tcp_time_wait->tcp_free_list_cnt <
- tcp_free_list_max_cnt) {
- /* Add to head of tcp_free_list */
- mutex_exit(
- &tcp_time_wait->tcp_time_wait_lock);
- tcp_cleanup(tcp);
- ASSERT(connp->conn_latch == NULL);
- ASSERT(connp->conn_policy == NULL);
- ASSERT(tcp->tcp_tcps == NULL);
- ASSERT(connp->conn_netstack == NULL);
-
- mutex_enter(
- &tcp_time_wait->tcp_time_wait_lock);
- tcp->tcp_time_wait_next =
- tcp_time_wait->tcp_free_list;
- tcp_time_wait->tcp_free_list = tcp;
- tcp_time_wait->tcp_free_list_cnt++;
- continue;
- } else {
- /* Do not add to tcp_free_list */
- mutex_exit(
- &tcp_time_wait->tcp_time_wait_lock);
- tcp_bind_hash_remove(tcp);
- ixa_cleanup(tcp->tcp_connp->conn_ixa);
- tcp_ipsec_cleanup(tcp);
- CONN_DEC_REF(tcp->tcp_connp);
- }
- } else {
- CONN_INC_REF_LOCKED(connp);
- mutex_exit(lock);
- mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
- mutex_exit(&connp->conn_lock);
- /*
- * We can reuse the closemp here since conn has
- * detached (otherwise we wouldn't even be in
- * time_wait list). tcp_closemp_used can safely
- * be changed without taking a lock as no other
- * thread can concurrently access it at this
- * point in the connection lifecycle.
- */
+ tcp_time_wait_purge(tcp, tsp);
- if (tcp->tcp_closemp.b_prev == NULL)
- tcp->tcp_closemp_used = B_TRUE;
- else
- cmn_err(CE_PANIC,
- "tcp_timewait_collector: "
- "concurrent use of tcp_closemp: "
- "connp %p tcp %p\n", (void *)connp,
- (void *)tcp);
-
- TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
- mp = &tcp->tcp_closemp;
- SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
- tcp_timewait_close, connp, NULL,
- SQ_FILL, SQTAG_TCP_TIMEWAIT);
- }
- } else {
- mutex_enter(&connp->conn_lock);
- CONN_INC_REF_LOCKED(connp);
- mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
- mutex_exit(&connp->conn_lock);
- /*
- * We can reuse the closemp here since conn has
- * detached (otherwise we wouldn't even be in
- * time_wait list). tcp_closemp_used can safely
- * be changed without taking a lock as no other
- * thread can concurrently access it at this
- * point in the connection lifecycle.
- */
+ /*
+ * Because tcp_time_wait_remove clears the tcp_time_wait_next
+ * field, the next item must be grabbed directly from the
+ * bucket itself.
+ */
+ tcp = tsp->tcp_time_wait_bucket[idx];
+ }
+
+ if (tsp->tcp_time_wait_cnt == 0) {
+ /*
+ * There is no need for the collector to schedule a new
+ * timer if no pending items remain. The timer state can be
+ * cleared only if it was untouched while the collector dropped
+ * its locks during tcp_time_wait_purge.
+ */
+ if (tsp->tcp_time_wait_schedule == active_schedule) {
+ tsp->tcp_time_wait_offset = 0;
+ tsp->tcp_time_wait_schedule = 0;
+ tsp->tcp_time_wait_tid = 0;
+ }
+ tsp->tcp_time_wait_collector_active = B_FALSE;
+ mutex_exit(&tsp->tcp_time_wait_lock);
+ return;
+ } else {
+ unsigned int nidx;
- if (tcp->tcp_closemp.b_prev == NULL)
- tcp->tcp_closemp_used = B_TRUE;
- else
- cmn_err(CE_PANIC, "tcp_timewait_collector: "
- "concurrent use of tcp_closemp: "
- "connp %p tcp %p\n", (void *)connp,
- (void *)tcp);
-
- TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
- mp = &tcp->tcp_closemp;
- SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
- tcp_timewait_close, connp, NULL,
- SQ_FILL, SQTAG_TCP_TIMEWAIT);
+ /*
+ * Locate the next bucket containing entries.
+ */
+ new_schedule = active_schedule
+ + MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
+ nidx = TW_BUCKET_NEXT(idx);
+ while (tsp->tcp_time_wait_bucket[nidx] == NULL) {
+ if (nidx == idx) {
+ break;
+ }
+ nidx = TW_BUCKET_NEXT(nidx);
+ new_schedule += MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
}
- mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
+ ASSERT(tsp->tcp_time_wait_bucket[nidx] != NULL);
}
- if (tcp_time_wait->tcp_free_list != NULL)
- tcp_time_wait->tcp_free_list->tcp_in_free_list = B_TRUE;
+ /*
+ * It is possible that the system is under such dire load that, between
+ * the scheduling of the timer and the start of TIME_WAIT processing,
+ * execution overran the interval allocated to this bucket.
+ */
+ now = ddi_get_lbolt64() - tsp->tcp_time_wait_offset;
+ if (new_schedule <= now) {
+ /*
+ * Attempt to right the situation by immediately performing a
+ * purge on the next bucket. This loop will continue as needed
+ * until the schedule can be pushed out ahead of the clock.
+ */
+ idx = TW_BUCKET(new_schedule - 1);
+ goto retry;
+ }
/*
- * If the time wait list is not empty and there is no timer running,
- * restart it.
+ * Another thread may have snuck in to reschedule the timer while locks
+ * were dropped during tcp_time_wait_purge. Defer to the running timer
+ * if that is the case.
*/
- if ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL &&
- tcp_time_wait->tcp_time_wait_tid == 0) {
- hrtime_t firetime;
-
- /* shouldn't be necessary, but just in case */
- if (tcp->tcp_time_wait_expire < now)
- tcp->tcp_time_wait_expire = now;
-
- firetime = TICK_TO_NSEC(tcp->tcp_time_wait_expire - now);
- /* This ensures that we won't wake up too often. */
- firetime = MAX(TCP_TIME_WAIT_DELAY, firetime);
- tcp_time_wait->tcp_time_wait_tid =
- timeout_generic(CALLOUT_NORMAL, tcp_time_wait_collector,
- sqp, firetime, CALLOUT_TCP_RESOLUTION,
- CALLOUT_FLAG_ROUNDUP);
+ if (tsp->tcp_time_wait_schedule != active_schedule) {
+ tsp->tcp_time_wait_collector_active = B_FALSE;
+ mutex_exit(&tsp->tcp_time_wait_lock);
+ return;
}
-#ifdef DEBUG
- tcp_time_wait->tcp_time_wait_running = B_FALSE;
-#endif
- mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
+
+ /*
+ * Schedule the next timer.
+ */
+ tsp->tcp_time_wait_schedule = new_schedule;
+ tsp->tcp_time_wait_tid =
+ timeout_generic(CALLOUT_NORMAL,
+ tcp_time_wait_collector, sqp,
+ TICK_TO_NSEC(new_schedule - now),
+ CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
+ tsp->tcp_time_wait_collector_active = B_FALSE;
+ mutex_exit(&tsp->tcp_time_wait_lock);
}
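Distilled, the collector's post-drain walk steps forward bucket by bucket
until it finds a populated slot, adding one TCP_TIME_WAIT_DELAY per step; the
caller then compares the resulting deadline against the clock and re-drains
immediately (the goto retry above) when it has already passed. A sketch with
hypothetical names:

/*
 * Find the deadline of the next populated bucket after 'idx', starting
 * from the schedule that just fired. buckets[] holds opaque list heads.
 */
static long long
tw_next_schedule(void *const *buckets, unsigned int nbuckets,
    long long delay_ticks, long long active_schedule, unsigned int idx)
{
        long long sched = active_schedule + delay_ticks;
        unsigned int nidx = (idx + 1) % nbuckets;

        while (buckets[nidx] == NULL) {
                if (nidx == idx)
                        break;  /* full lap: only 'idx' was populated */
                nidx = (nidx + 1) % nbuckets;
                sched += delay_ticks;
        }
        return (sched);
}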
/*
diff --git a/usr/src/uts/common/inet/tcp/tcp_tunables.c b/usr/src/uts/common/inet/tcp/tcp_tunables.c
index e81d68b423..f4d6c71914 100644
--- a/usr/src/uts/common/inet/tcp/tcp_tunables.c
+++ b/usr/src/uts/common/inet/tcp/tcp_tunables.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2015, Joyent, Inc.
+ * Copyright 2016 Joyent, Inc.
* Copyright 2013 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
*/
@@ -249,7 +249,7 @@ mod_prop_info_t tcp_propinfo_tbl[] = {
/* tunable - 0 */
{ "_time_wait_interval", MOD_PROTO_TCP,
mod_set_uint32, mod_get_uint32,
- {1*SECONDS, 10*MINUTES, 1*MINUTES}, {1*MINUTES} },
+ {1*SECONDS, TCP_TIME_WAIT_MAX, 1*MINUTES}, {1*MINUTES} },
{ "_conn_req_max_q", MOD_PROTO_TCP,
mod_set_uint32, mod_get_uint32,
diff --git a/usr/src/uts/common/inet/tcp_impl.h b/usr/src/uts/common/inet/tcp_impl.h
index 848d27a1e8..cb83b91fad 100644
--- a/usr/src/uts/common/inet/tcp_impl.h
+++ b/usr/src/uts/common/inet/tcp_impl.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2015 Joyent Inc.
+ * Copyright 2016 Joyent, Inc.
* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
* Copyright (c) 2013, 2014 by Delphix. All rights reserved.
*/
@@ -105,7 +105,7 @@ extern sock_downcalls_t sock_tcp_downcalls;
*/
#define TCP_IS_DETACHED(tcp) ((tcp)->tcp_detached)
-/* TCP timers related data strucutres. Refer to tcp_timers.c. */
+/* TCP timers related data structures. Refer to tcp_timers.c. */
typedef struct tcp_timer_s {
conn_t *connp;
void (*tcpt_proc)(void *);
@@ -132,48 +132,79 @@ extern kmem_cache_t *tcp_timercache;
(tcp)->tcp_timer_tid = TCP_TIMER((tcp), tcp_timer, (intvl)); \
}
+
+/*
+ * Maximum TIME_WAIT timeout. It is defined here (instead of tcp_tunables.c)
+ * so that other parameters can be derived from it.
+ */
+#define TCP_TIME_WAIT_MAX (10 * MINUTES)
+
+/*
+ * TCP_TIME_WAIT_DELAY governs how often the time_wait_collector runs.
+ * Running it every 5 seconds seems to yield a reasonable balance between
+ * cleanup liveliness and system load.
+ */
+#define TCP_TIME_WAIT_DELAY (5 * SECONDS)
+
+#define TCP_TIME_WAIT_BUCKETS ((TCP_TIME_WAIT_MAX / TCP_TIME_WAIT_DELAY) + 1)
+
/*
* For scalability, we must not run a timer for every TCP connection
* in TIME_WAIT state. To see why, consider (for time wait interval of
* 1 minute):
* 10,000 connections/sec * 60 seconds/time wait = 600,000 active conn's
*
- * This list is ordered by time, so you need only delete from the head
- * until you get to entries which aren't old enough to delete yet.
- * The list consists of only the detached TIME_WAIT connections.
+ * Since TIME_WAIT expiration occurs on a per-squeue basis, handling
+ * connections from all netstacks on the system, a simple queue is inadequate
+ * for pending entries. This is because tcp_time_wait_interval may differ
+ * between connections, causing tail insertion to violate expiration order.
+ *
+ * Instead of performing expensive sorting or unnecessary list traversal to
+ * counteract interval variance between netstacks, a timing wheel structure is
+ * used. The duration covered by each bucket in the wheel is determined by the
+ * TCP_TIME_WAIT_DELAY (5 seconds). The number of buckets in the wheel is
+ * determined by dividing the maximum TIME_WAIT interval (10 minutes) by
+ * TCP_TIME_WAIT_DELAY, with one added bucket for rollover protection.
+ * (Yielding 121 buckets with the current parameters) When items are inserted
+ * into the set of buckets, they are indexed by using their expiration time
+ * divided by the bucket size, modulo the number of buckets. This means that
+ * when each bucket is processed, all items within should have expired within
+ * the last TCP_TIME_WAIT_DELAY interval.
+ *
+ * Since bucket timer schedules are rounded to the nearest TCP_TIME_WAIT_DELAY
+ * interval to ensure all connections in the pending bucket will be expired, a
+ * per-squeue offset is used when doing TIME_WAIT scheduling. This offset is
+ * between 0 and TCP_TIME_WAIT_DELAY and is designed to avoid scheduling
+ * all of the tcp_time_wait_collector threads to run in lock-step. The offset
+ * is fixed while there are any connections present in the buckets.
*
* When a tcp_t enters TIME_WAIT state, a timer is started (timeout is
* tcps_time_wait_interval). When the tcp_t is detached (upper layer closes
- * the end point), it is moved to the time wait list and another timer is
- * started (expiry time is set at tcp_time_wait_expire, which is
- * also calculated using tcps_time_wait_interval). This means that the
- * TIME_WAIT state can be extended (up to doubled) if the tcp_t doesn't
- * become detached for a long time.
+ * the end point), it is scheduled to be cleaned up by the squeue-driving
+ * tcp_time_wait_collector (also using tcps_time_wait_interval). This means
+ * that the TIME_WAIT state can be extended (up to doubled) if the tcp_t
+ * doesn't become detached for a long time.
*
* The list manipulations (including tcp_time_wait_next/prev)
* are protected by the tcp_time_wait_lock. The content of the
* detached TIME_WAIT connections is protected by the normal perimeters.
*
- * This list is per squeue and squeues are shared across the tcp_stack_t's.
- * Things on tcp_time_wait_head remain associated with the tcp_stack_t
- * and conn_netstack.
- * The tcp_t's that are added to tcp_free_list are disassociated and
- * have NULL tcp_tcps and conn_netstack pointers.
+ * These connection lists are per squeue and squeues are shared across the
+ * tcp_stack_t instances. Things in a tcp_time_wait_bucket remain associated
+ * with the tcp_stack_t and conn_netstack. Any tcp_t connections stored in the
+ * tcp_free_list are disassociated and have NULL tcp_tcps and conn_netstack
+ * pointers.
*/
typedef struct tcp_squeue_priv_s {
kmutex_t tcp_time_wait_lock;
+ boolean_t tcp_time_wait_collector_active;
callout_id_t tcp_time_wait_tid;
- tcp_t *tcp_time_wait_head;
- tcp_t *tcp_time_wait_tail;
+ uint64_t tcp_time_wait_cnt;
+ int64_t tcp_time_wait_schedule;
+ int64_t tcp_time_wait_offset;
+ tcp_t *tcp_time_wait_bucket[TCP_TIME_WAIT_BUCKETS];
tcp_t *tcp_free_list;
uint_t tcp_free_list_cnt;
-#ifdef DEBUG
- /*
- * For debugging purpose, true when tcp_time_wait_collector() is
- * running.
- */
- boolean_t tcp_time_wait_running;
-#endif
} tcp_squeue_priv_t;
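The sizing relationship described in the block comment above can be checked
mechanically. A compile-time sketch, assuming SECONDS and MINUTES expand to
millisecond counts, as their use in the tunable bounds (10 * MINUTES)
suggests:

#include <assert.h>

#define SECONDS                 1000            /* assumed: milliseconds */
#define MINUTES                 (60 * SECONDS)

#define TCP_TIME_WAIT_MAX       (10 * MINUTES)  /* 600,000 ms */
#define TCP_TIME_WAIT_DELAY     (5 * SECONDS)   /*   5,000 ms */
#define TCP_TIME_WAIT_BUCKETS \
        ((TCP_TIME_WAIT_MAX / TCP_TIME_WAIT_DELAY) + 1)

/* 600000 / 5000 = 120 buckets, plus one rollover bucket = 121. */
static_assert(TCP_TIME_WAIT_BUCKETS == 121, "wheel must cover max interval");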
/*