Diffstat (limited to 'usr/src/uts/common/inet/tcp/tcp_timers.c')
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp_timers.c  1046
1 file changed, 1046 insertions, 0 deletions
diff --git a/usr/src/uts/common/inet/tcp/tcp_timers.c b/usr/src/uts/common/inet/tcp/tcp_timers.c
new file mode 100644
index 0000000000..5c87620fca
--- /dev/null
+++ b/usr/src/uts/common/inet/tcp/tcp_timers.c
@@ -0,0 +1,1046 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/types.h>
+#include <sys/strlog.h>
+#include <sys/strsun.h>
+#include <sys/squeue_impl.h>
+#include <sys/squeue.h>
+#include <sys/callo.h>
+#include <sys/strsubr.h>
+
+#include <inet/common.h>
+#include <inet/ip.h>
+#include <inet/ip_ire.h>
+#include <inet/ip_rts.h>
+#include <inet/tcp.h>
+#include <inet/tcp_impl.h>
+
+/*
+ * Implementation of TCP Timers.
+ * =============================
+ *
+ * INTERFACE:
+ *
+ * There are two basic functions dealing with tcp timers, plus a restart macro:
+ *
+ * timeout_id_t tcp_timeout(connp, func, time)
+ * clock_t tcp_timeout_cancel(connp, timeout_id)
+ * TCP_TIMER_RESTART(tcp, intvl)
+ *
+ * tcp_timeout() starts a timer for the 'tcp' instance, arranging to call
+ * 'func' after 'time' ticks have passed. The call-back function must adhere
+ * to the same restrictions as a driver soft interrupt handler - it must not
+ * sleep or call other functions that might sleep. The value returned is an
+ * opaque non-zero timeout identifier that can be passed to
+ * tcp_timeout_cancel() to cancel the request. The call to tcp_timeout() may
+ * fail, in which case it returns zero. This is different from the
+ * timeout(9F) function, which never fails.
+ *
+ * The call-back function 'func' always receives 'connp' as its single
+ * argument. It is always executed in the squeue corresponding to the tcp
+ * structure. The tcp structure is guaranteed to be present at the time the
+ * call-back is called.
+ *
+ * NOTE: The call-back function 'func' is never called if tcp is in
+ * the TCPS_CLOSED state.
+ *
+ * tcp_timeout_cancel() attempts to cancel a pending tcp_timeout()
+ * request. Locks acquired by the call-back routine should not be held across
+ * the call to tcp_timeout_cancel() or a deadlock may result.
+ *
+ * tcp_timeout_cancel() returns -1 if it cannot cancel the timeout request.
+ * Otherwise, it returns an integer value greater than or equal to 0. In
+ * particular, if the call-back function has already been placed on the
+ * squeue, it cannot be canceled.
+ *
+ * NOTE: both tcp_timeout() and tcp_timeout_cancel() should always be called
+ * within the squeue context corresponding to the tcp instance. Since the
+ * call-back is also called via the same squeue, all calls are strictly
+ * serialized, so none of the race conditions described in the untimeout(9F)
+ * manual page can occur.
+ *
+ * TCP_TIMER_RESTART() is a macro that attempts to cancel a pending timeout
+ * stored in tcp_timer_tid and starts a new one using
+ * MSEC_TO_TICK(intvl). It always uses tcp_timer() function as a call-back
+ * and stores the return value of tcp_timeout() in the tcp->tcp_timer_tid
+ * field.
+ *
+ * NOTE: since timeout cancellation is not guaranteed, the cancelled
+ * call-back may still be invoked, so it is possible that tcp_timer() will
+ * be called several times. This should not be a problem since tcp_timer()
+ * should always check the tcp instance state.
+ *
+ *
+ * IMPLEMENTATION:
+ *
+ * TCP timers are implemented using a three-stage process. The call to
+ * tcp_timeout() uses the timeout(9F) function to call tcp_timer_callback()
+ * when the timer expires. The tcp_timer_callback() then arranges for
+ * tcp_timer_handler() to be called via the squeue corresponding to the tcp
+ * instance. The tcp_timer_handler() calls the actual requested call-back
+ * and passes the tcp instance as an argument to it. Information is passed
+ * between stages using the tcp_timer_t structure, which contains the connp
+ * pointer, the tcp call-back to call, and the timeout id returned by
+ * timeout(9F).
+ *
+ * The tcp_timer_t structure is not used directly; it is embedded in an
+ * mblk_t-like structure that is used to enter the squeue. The mp->b_rptr of
+ * this pseudo mblk points to the beginning of the tcp_timer_t structure, and
+ * tcp_timeout() returns a pointer to this mblk.
+ *
+ * The pseudo mblk is allocated from a special tcp_timercache kmem cache. It
+ * looks like a normal mblk without an actual dblk attached to it.
+ *
+ * To optimize performance each tcp instance holds a small cache of timer
+ * mblocks. In the current implementation it caches up to two timer mblocks
+ * per tcp instance. The cache is preserved over tcp frees and is only freed
+ * when the whole tcp structure is destroyed by its kmem destructor. Since all
+ * tcp timer processing happens on the corresponding squeue, the cache
+ * manipulation does not require any locks. Experiments show that the majority
+ * of timer mblock allocations are satisfied from the tcp cache and do not
+ * involve kmem calls.
+ *
+ * The tcp_timeout() places a refhold on the connp instance which guarantees
+ * that it will be present at the time the call-back function fires. The
+ * tcp_timer_handler() drops the reference after calling the call-back, so the
+ * call-back function does not need to manipulate the references explicitly.
+ */
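+
+/*
+ * Illustrative usage sketch (not compiled; tcp_example_timer and 'intvl'
+ * are hypothetical names used only here): how code running in the squeue
+ * context of a tcp instance would typically arm and disarm a timer. A real
+ * call-back must obey the soft-interrupt restrictions described above.
+ *
+ *	static void
+ *	tcp_example_timer(void *arg)
+ *	{
+ *		conn_t *connp = (conn_t *)arg;
+ *		tcp_t *tcp = connp->conn_tcp;
+ *
+ *		tcp->tcp_timer_tid = 0;
+ *		if (tcp->tcp_state == TCPS_CLOSED)
+ *			return;
+ *	}
+ *
+ * Arming stores the returned id so the timer can be cancelled later:
+ *
+ *	tcp->tcp_timer_tid = tcp_timeout(connp, tcp_example_timer,
+ *	    MSEC_TO_TICK(intvl));
+ *
+ * Disarming; a negative return from tcp_timeout_cancel() means the
+ * call-back already fired or is on the squeue and cannot be cancelled:
+ *
+ *	if (tcp->tcp_timer_tid != 0) {
+ *		(void) tcp_timeout_cancel(connp, tcp->tcp_timer_tid);
+ *		tcp->tcp_timer_tid = 0;
+ *	}
+ */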
+
+kmem_cache_t *tcp_timercache;
+
+static void tcp_ip_notify(tcp_t *);
+static void tcp_timer_callback(void *);
+static void tcp_timer_free(tcp_t *, mblk_t *);
+static void tcp_timer_handler(void *, mblk_t *, void *, ip_recv_attr_t *);
+
+timeout_id_t
+tcp_timeout(conn_t *connp, void (*f)(void *), clock_t tim)
+{
+ mblk_t *mp;
+ tcp_timer_t *tcpt;
+ tcp_t *tcp = connp->conn_tcp;
+
+ ASSERT(connp->conn_sqp != NULL);
+
+ TCP_DBGSTAT(tcp->tcp_tcps, tcp_timeout_calls);
+
+ if (tcp->tcp_timercache == NULL) {
+ mp = tcp_timermp_alloc(KM_NOSLEEP | KM_PANIC);
+ } else {
+ TCP_DBGSTAT(tcp->tcp_tcps, tcp_timeout_cached_alloc);
+ mp = tcp->tcp_timercache;
+ tcp->tcp_timercache = mp->b_next;
+ mp->b_next = NULL;
+ ASSERT(mp->b_wptr == NULL);
+ }
+
+ CONN_INC_REF(connp);
+ tcpt = (tcp_timer_t *)mp->b_rptr;
+ tcpt->connp = connp;
+ tcpt->tcpt_proc = f;
+ /*
+ * TCP timers are normal timeouts. Plus, they do not require more than
+ * a 10 millisecond resolution. By choosing a coarser resolution and by
+ * rounding up the expiration to the next resolution boundary, we can
+ * batch timers in the callout subsystem to make TCP timers more
+ * efficient. The roundup also protects short timers from expiring too
+ * early before they have a chance to be cancelled.
+ */
+ tcpt->tcpt_tid = timeout_generic(CALLOUT_NORMAL, tcp_timer_callback, mp,
+ TICK_TO_NSEC(tim), CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
+
+ return ((timeout_id_t)mp);
+}
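+
+/*
+ * Illustrative effect of CALLOUT_FLAG_ROUNDUP above (example values only,
+ * assuming a 10 ms CALLOUT_TCP_RESOLUTION): timers requested for 13 ms and
+ * 17 ms both round up to the 20 ms boundary and are serviced by a single
+ * callout expiration instead of two.
+ */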
+
+static void
+tcp_timer_callback(void *arg)
+{
+ mblk_t *mp = (mblk_t *)arg;
+ tcp_timer_t *tcpt;
+ conn_t *connp;
+
+ tcpt = (tcp_timer_t *)mp->b_rptr;
+ connp = tcpt->connp;
+ SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_timer_handler, connp,
+ NULL, SQ_FILL, SQTAG_TCP_TIMER);
+}
+
+/* ARGSUSED */
+static void
+tcp_timer_handler(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
+{
+ tcp_timer_t *tcpt;
+ conn_t *connp = (conn_t *)arg;
+ tcp_t *tcp = connp->conn_tcp;
+
+ tcpt = (tcp_timer_t *)mp->b_rptr;
+ ASSERT(connp == tcpt->connp);
+ ASSERT((squeue_t *)arg2 == connp->conn_sqp);
+
+ /*
+ * If the TCP has reached the closed state, don't proceed any
+ * further. This TCP logically does not exist on the system.
+ * tcpt_proc could, for example, access queues that have
+ * already been qprocsoff'ed.
+ */
+ if (tcp->tcp_state != TCPS_CLOSED) {
+ (*tcpt->tcpt_proc)(connp);
+ } else {
+ tcp->tcp_timer_tid = 0;
+ }
+ tcp_timer_free(connp->conn_tcp, mp);
+}
+
+/*
+ * There is a potential race between untimeout and the handler firing at the
+ * same time: the mblk may be freed by the handler while we are trying to use
+ * it. But since both execute on the same squeue, this race should not
+ * occur.
+ */
+clock_t
+tcp_timeout_cancel(conn_t *connp, timeout_id_t id)
+{
+ mblk_t *mp = (mblk_t *)id;
+ tcp_timer_t *tcpt;
+ clock_t delta;
+
+ TCP_DBGSTAT(connp->conn_tcp->tcp_tcps, tcp_timeout_cancel_reqs);
+
+ if (mp == NULL)
+ return (-1);
+
+ tcpt = (tcp_timer_t *)mp->b_rptr;
+ ASSERT(tcpt->connp == connp);
+
+ delta = untimeout_default(tcpt->tcpt_tid, 0);
+
+ if (delta >= 0) {
+ TCP_DBGSTAT(connp->conn_tcp->tcp_tcps, tcp_timeout_canceled);
+ tcp_timer_free(connp->conn_tcp, mp);
+ CONN_DEC_REF(connp);
+ }
+
+ return (delta);
+}
+
+/*
+ * Allocate space for the timer event. The allocation looks like an mblk, but
+ * it is not a proper mblk. To avoid confusion we set b_wptr to NULL.
+ *
+ * Dealing with failures: If we can't allocate from the timer cache, we try
+ * allocating from the dblock caches using allocb_tryhard(). In this case
+ * b_wptr points to b_rptr.
+ * If we can't allocate anything using allocb_tryhard(), we perform a last
+ * attempt and use kmem_alloc_tryhard(). In this case we set b_wptr to -1 and
+ * save the actual allocation size in b_datap.
+ */
+mblk_t *
+tcp_timermp_alloc(int kmflags)
+{
+ mblk_t *mp = (mblk_t *)kmem_cache_alloc(tcp_timercache,
+ kmflags & ~KM_PANIC);
+
+ if (mp != NULL) {
+ mp->b_next = mp->b_prev = NULL;
+ mp->b_rptr = (uchar_t *)(&mp[1]);
+ mp->b_wptr = NULL;
+ mp->b_datap = NULL;
+ mp->b_queue = NULL;
+ mp->b_cont = NULL;
+ } else if (kmflags & KM_PANIC) {
+ /*
+ * Failed to allocate memory for the timer. Try allocating from
+ * dblock caches.
+ */
+ /* ipclassifier calls this from a constructor - hence no tcps */
+ TCP_G_STAT(tcp_timermp_allocfail);
+ mp = allocb_tryhard(sizeof (tcp_timer_t));
+ if (mp == NULL) {
+ size_t size = 0;
+ /*
+ * Memory is really low. Try tryhard allocation.
+ *
+ * ipclassifier calls this from a constructor -
+ * hence no tcps
+ */
+ TCP_G_STAT(tcp_timermp_allocdblfail);
+ mp = kmem_alloc_tryhard(sizeof (mblk_t) +
+ sizeof (tcp_timer_t), &size, kmflags);
+ mp->b_rptr = (uchar_t *)(&mp[1]);
+ mp->b_next = mp->b_prev = NULL;
+ mp->b_wptr = (uchar_t *)-1;
+ mp->b_datap = (dblk_t *)size;
+ mp->b_queue = NULL;
+ mp->b_cont = NULL;
+ }
+ ASSERT(mp->b_wptr != NULL);
+ }
+ /* ipclassifier calls this from a constructor - hence no tcps */
+ TCP_G_DBGSTAT(tcp_timermp_alloced);
+
+ return (mp);
+}
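+
+/*
+ * Summary of the b_wptr encoding established above; tcp_timer_free() below
+ * relies on exactly this encoding to pick the matching free routine:
+ *
+ *	b_wptr == NULL		allocated from tcp_timercache
+ *	b_wptr == b_rptr	allocated by allocb_tryhard()
+ *	b_wptr == (uchar_t *)-1	allocated by kmem_alloc_tryhard();
+ *				b_datap holds the allocation size
+ */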
+
+/*
+ * Free per-tcp timer cache.
+ * It can only contain entries from tcp_timercache.
+ */
+void
+tcp_timermp_free(tcp_t *tcp)
+{
+ mblk_t *mp;
+
+ while ((mp = tcp->tcp_timercache) != NULL) {
+ ASSERT(mp->b_wptr == NULL);
+ tcp->tcp_timercache = tcp->tcp_timercache->b_next;
+ kmem_cache_free(tcp_timercache, mp);
+ }
+}
+
+/*
+ * Free a timer event. Put it on the per-tcp timer cache if there are not too
+ * many events there already (currently at most two events are cached).
+ * If the event was not allocated from the timer cache, free it right away.
+ */
+static void
+tcp_timer_free(tcp_t *tcp, mblk_t *mp)
+{
+ mblk_t *mp1 = tcp->tcp_timercache;
+
+ if (mp->b_wptr != NULL) {
+ /*
+ * This allocation is not from a timer cache, free it right
+ * away.
+ */
+ if (mp->b_wptr != (uchar_t *)-1)
+ freeb(mp);
+ else
+ kmem_free(mp, (size_t)mp->b_datap);
+ } else if (mp1 == NULL || mp1->b_next == NULL) {
+ /* Cache this timer block for future allocations */
+ mp->b_rptr = (uchar_t *)(&mp[1]);
+ mp->b_next = mp1;
+ tcp->tcp_timercache = mp;
+ } else {
+ kmem_cache_free(tcp_timercache, mp);
+ TCP_DBGSTAT(tcp->tcp_tcps, tcp_timermp_freed);
+ }
+}
+
+/*
+ * Stop all TCP timers.
+ */
+void
+tcp_timers_stop(tcp_t *tcp)
+{
+ if (tcp->tcp_timer_tid != 0) {
+ (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_timer_tid);
+ tcp->tcp_timer_tid = 0;
+ }
+ if (tcp->tcp_ka_tid != 0) {
+ (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ka_tid);
+ tcp->tcp_ka_tid = 0;
+ }
+ if (tcp->tcp_ack_tid != 0) {
+ (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_ack_tid);
+ tcp->tcp_ack_tid = 0;
+ }
+ if (tcp->tcp_push_tid != 0) {
+ (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_push_tid);
+ tcp->tcp_push_tid = 0;
+ }
+ if (tcp->tcp_reass_tid != 0) {
+ (void) TCP_TIMER_CANCEL(tcp, tcp->tcp_reass_tid);
+ tcp->tcp_reass_tid = 0;
+ }
+}
+
+/*
+ * Timer callback routine for the keepalive probe. We do a fake resend of
+ * the last ACKed byte, then set a timer using the RTO. When the timer
+ * expires, we check whether we have heard anything from the other end
+ * during the last RTO period. If we have, we set the timer to expire after
+ * another tcp_ka_interval and check again. If we have not, we set a timer
+ * using RTO << 1 and check again when it expires, exponentially increasing
+ * the timeout as long as we do not hear from the other side. If we have not
+ * heard anything for more than (tcp_ka_interval + tcp_ka_abort_thres), we
+ * kill the connection, unless the keepalive abort threshold is 0, in which
+ * case we will probe "forever."
+ */
+void
+tcp_keepalive_timer(void *arg)
+{
+ mblk_t *mp;
+ conn_t *connp = (conn_t *)arg;
+ tcp_t *tcp = connp->conn_tcp;
+ int32_t firetime;
+ int32_t idletime;
+ int32_t ka_intrvl;
+ tcp_stack_t *tcps = tcp->tcp_tcps;
+
+ tcp->tcp_ka_tid = 0;
+
+ if (tcp->tcp_fused)
+ return;
+
+ TCPS_BUMP_MIB(tcps, tcpTimKeepalive);
+ ka_intrvl = tcp->tcp_ka_interval;
+
+ /*
+ * Keepalive probe should only be sent if the application has not
+ * done a close on the connection.
+ */
+ if (tcp->tcp_state > TCPS_CLOSE_WAIT) {
+ return;
+ }
+ /* Timer fired too early, restart it. */
+ if (tcp->tcp_state < TCPS_ESTABLISHED) {
+ tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer,
+ MSEC_TO_TICK(ka_intrvl));
+ return;
+ }
+
+ idletime = TICK_TO_MSEC(ddi_get_lbolt() - tcp->tcp_last_recv_time);
+ /*
+ * If we have not heard from the other side for a long
+ * time, kill the connection unless the keepalive abort
+ * threshold is 0. In that case, we will probe "forever."
+ */
+ if (tcp->tcp_ka_abort_thres != 0 &&
+ idletime > (ka_intrvl + tcp->tcp_ka_abort_thres)) {
+ TCPS_BUMP_MIB(tcps, tcpTimKeepaliveDrop);
+ (void) tcp_clean_death(tcp, tcp->tcp_client_errno ?
+ tcp->tcp_client_errno : ETIMEDOUT);
+ return;
+ }
+
+ if (tcp->tcp_snxt == tcp->tcp_suna &&
+ idletime >= ka_intrvl) {
+ /* Fake resend of last ACKed byte. */
+ mblk_t *mp1 = allocb(1, BPRI_LO);
+
+ if (mp1 != NULL) {
+ *mp1->b_wptr++ = '\0';
+ mp = tcp_xmit_mp(tcp, mp1, 1, NULL, NULL,
+ tcp->tcp_suna - 1, B_FALSE, NULL, B_TRUE);
+ freeb(mp1);
+ /*
+ * If the allocation failed, fall through to
+ * restart the timer.
+ */
+ if (mp != NULL) {
+ tcp_send_data(tcp, mp);
+ TCPS_BUMP_MIB(tcps, tcpTimKeepaliveProbe);
+ if (tcp->tcp_ka_last_intrvl != 0) {
+ int max;
+ /*
+ * We should probe again at least
+ * in ka_intrvl, but not more than
+ * tcp_rexmit_interval_max.
+ */
+ max = tcps->tcps_rexmit_interval_max;
+ firetime = MIN(ka_intrvl - 1,
+ tcp->tcp_ka_last_intrvl << 1);
+ if (firetime > max)
+ firetime = max;
+ } else {
+ firetime = tcp->tcp_rto;
+ }
+ tcp->tcp_ka_tid = TCP_TIMER(tcp,
+ tcp_keepalive_timer,
+ MSEC_TO_TICK(firetime));
+ tcp->tcp_ka_last_intrvl = firetime;
+ return;
+ }
+ }
+ } else {
+ tcp->tcp_ka_last_intrvl = 0;
+ }
+
+ /* firetime can be negative if (mp1 == NULL || mp == NULL) */
+ if ((firetime = ka_intrvl - idletime) < 0) {
+ firetime = ka_intrvl;
+ }
+ tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer,
+ MSEC_TO_TICK(firetime));
+}
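+
+/*
+ * Illustrative probe timeline for the logic above (example values only,
+ * assuming a ka_intrvl of 7200000 ms, a tcp_rto of 400 ms and a
+ * tcps_rexmit_interval_max of 60000 ms): after two hours of silence the
+ * first probe is sent and tcp_ka_last_intrvl becomes 400; each further
+ * silent interval doubles it to 800, 1600, 3200, ... until it is capped at
+ * 60000, while the idletime check above eventually aborts the connection
+ * once tcp_ka_abort_thres is exceeded.
+ */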
+
+void
+tcp_reass_timer(void *arg)
+{
+ conn_t *connp = (conn_t *)arg;
+ tcp_t *tcp = connp->conn_tcp;
+
+ tcp->tcp_reass_tid = 0;
+ if (tcp->tcp_reass_head == NULL)
+ return;
+ ASSERT(tcp->tcp_reass_tail != NULL);
+ if (tcp->tcp_snd_sack_ok && tcp->tcp_num_sack_blk > 0) {
+ tcp_sack_remove(tcp->tcp_sack_list,
+ TCP_REASS_END(tcp->tcp_reass_tail), &tcp->tcp_num_sack_blk);
+ }
+ tcp_close_mpp(&tcp->tcp_reass_head);
+ tcp->tcp_reass_tail = NULL;
+ TCP_STAT(tcp->tcp_tcps, tcp_reass_timeout);
+}
+
+/* This function handles the push timeout. */
+void
+tcp_push_timer(void *arg)
+{
+ conn_t *connp = (conn_t *)arg;
+ tcp_t *tcp = connp->conn_tcp;
+
+ TCP_DBGSTAT(tcp->tcp_tcps, tcp_push_timer_cnt);
+
+ ASSERT(tcp->tcp_listener == NULL);
+
+ ASSERT(!IPCL_IS_NONSTR(connp));
+
+ tcp->tcp_push_tid = 0;
+
+ if (tcp->tcp_rcv_list != NULL &&
+ tcp_rcv_drain(tcp) == TH_ACK_NEEDED)
+ tcp_xmit_ctl(NULL, tcp, tcp->tcp_snxt, tcp->tcp_rnxt, TH_ACK);
+}
+
+/*
+ * This function handles delayed ACK timeout.
+ */
+void
+tcp_ack_timer(void *arg)
+{
+ conn_t *connp = (conn_t *)arg;
+ tcp_t *tcp = connp->conn_tcp;
+ mblk_t *mp;
+ tcp_stack_t *tcps = tcp->tcp_tcps;
+
+ TCP_DBGSTAT(tcps, tcp_ack_timer_cnt);
+
+ tcp->tcp_ack_tid = 0;
+
+ if (tcp->tcp_fused)
+ return;
+
+ /*
+ * Do not send an ACK if there is no received data that is
+ * still waiting to be acknowledged.
+ */
+ if (tcp->tcp_rnxt == tcp->tcp_rack) {
+ return;
+ }
+
+ if ((tcp->tcp_rnxt - tcp->tcp_rack) > tcp->tcp_mss) {
+ /*
+ * Make sure we don't allow deferred ACKs to result in
+ * timer-based ACKing. If we have held off an ACK
+ * when there was more than an mss here, and the timer
+ * goes off, we have to worry about the possibility
+ * that the sender isn't doing slow-start, or is out
+ * of step with us for some other reason. We fall
+ * permanently back in the direction of
+ * ACK-every-other-packet as suggested in RFC 1122.
+ */
+ if (tcp->tcp_rack_abs_max > 2)
+ tcp->tcp_rack_abs_max--;
+ tcp->tcp_rack_cur_max = 2;
+ }
+ mp = tcp_ack_mp(tcp);
+
+ if (mp != NULL) {
+ BUMP_LOCAL(tcp->tcp_obsegs);
+ TCPS_BUMP_MIB(tcps, tcpOutAck);
+ TCPS_BUMP_MIB(tcps, tcpOutAckDelayed);
+ tcp_send_data(tcp, mp);
+ }
+}
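+
+/*
+ * Example of the fallback above, with illustrative numbers: if
+ * tcp_rack_cur_max had grown to, say, 8 segments and the delayed ACK timer
+ * fires with more than an MSS unacknowledged, tcp_rack_cur_max drops back
+ * to 2 (ACK every other segment, per RFC 1122) and tcp_rack_abs_max is
+ * ratcheted down by one, permanently lowering the ceiling for future
+ * increases.
+ */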
+
+/*
+ * Notify IP that we are having trouble with this connection. IP should
+ * make note so it can potentially use a different IRE.
+ */
+static void
+tcp_ip_notify(tcp_t *tcp)
+{
+ conn_t *connp = tcp->tcp_connp;
+ ire_t *ire;
+
+ /*
+ * Note: in the case of source routing we want to blow away the
+ * route to the first source route hop.
+ */
+ ire = connp->conn_ixa->ixa_ire;
+ if (ire != NULL && !(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
+ if (ire->ire_ipversion == IPV4_VERSION) {
+ /*
+ * As per RFC 1122, we send an RTM_LOSING to inform
+ * routing protocols.
+ */
+ ip_rts_change(RTM_LOSING, ire->ire_addr,
+ ire->ire_gateway_addr, ire->ire_mask,
+ connp->conn_laddr_v4, 0, 0, 0,
+ (RTA_DST | RTA_GATEWAY | RTA_NETMASK | RTA_IFA),
+ ire->ire_ipst);
+ }
+ (void) ire_no_good(ire);
+ }
+}
+
+/*
+ * tcp_timer is the timer service routine. It handles the retransmission,
+ * FIN_WAIT_2 flush, and zero window probe timeout events. It figures out
+ * from the state of the tcp instance what kind of action needs to be done
+ * at the time it is called.
+ */
+void
+tcp_timer(void *arg)
+{
+ mblk_t *mp;
+ clock_t first_threshold;
+ clock_t second_threshold;
+ clock_t ms;
+ uint32_t mss;
+ conn_t *connp = (conn_t *)arg;
+ tcp_t *tcp = connp->conn_tcp;
+ tcp_stack_t *tcps = tcp->tcp_tcps;
+
+ tcp->tcp_timer_tid = 0;
+
+ if (tcp->tcp_fused)
+ return;
+
+ first_threshold = tcp->tcp_first_timer_threshold;
+ second_threshold = tcp->tcp_second_timer_threshold;
+ switch (tcp->tcp_state) {
+ case TCPS_IDLE:
+ case TCPS_BOUND:
+ case TCPS_LISTEN:
+ return;
+ case TCPS_SYN_RCVD: {
+ tcp_t *listener = tcp->tcp_listener;
+
+ if (tcp->tcp_syn_rcvd_timeout == 0 && (listener != NULL)) {
+ /* it's our first timeout */
+ tcp->tcp_syn_rcvd_timeout = 1;
+ mutex_enter(&listener->tcp_eager_lock);
+ listener->tcp_syn_rcvd_timeout++;
+ if (!tcp->tcp_dontdrop && !tcp->tcp_closemp_used) {
+ /*
+ * Make this eager available for drop if we
+ * need to drop one to accommodate a new
+ * incoming SYN request.
+ */
+ MAKE_DROPPABLE(listener, tcp);
+ }
+ if (!listener->tcp_syn_defense &&
+ (listener->tcp_syn_rcvd_timeout >
+ (tcps->tcps_conn_req_max_q0 >> 2)) &&
+ (tcps->tcps_conn_req_max_q0 > 200)) {
+ /* We may be under attack. Put on a defense. */
+ listener->tcp_syn_defense = B_TRUE;
+ cmn_err(CE_WARN, "High TCP connect timeout "
+ "rate! System (port %d) may be under a "
+ "SYN flood attack!",
+ ntohs(listener->tcp_connp->conn_lport));
+
+ listener->tcp_ip_addr_cache = kmem_zalloc(
+ IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t),
+ KM_NOSLEEP);
+ }
+ mutex_exit(&listener->tcp_eager_lock);
+ } else if (listener != NULL) {
+ mutex_enter(&listener->tcp_eager_lock);
+ tcp->tcp_syn_rcvd_timeout++;
+ if (tcp->tcp_syn_rcvd_timeout > 1 &&
+ !tcp->tcp_closemp_used) {
+ /*
+ * This is our second timeout. Put the tcp in
+ * the list of droppable eagers to allow it to
+ * be dropped, if needed. We don't check
+ * whether tcp_dontdrop is set or not, to
+ * protect ourselves from a SYN attack where a
+ * remote host can spoof itself as one of the
+ * good IP sources and continue to hold
+ * resources for too long.
+ */
+ MAKE_DROPPABLE(listener, tcp);
+ }
+ mutex_exit(&listener->tcp_eager_lock);
+ }
+ }
+ /* FALLTHRU */
+ case TCPS_SYN_SENT:
+ first_threshold = tcp->tcp_first_ctimer_threshold;
+ second_threshold = tcp->tcp_second_ctimer_threshold;
+ break;
+ case TCPS_ESTABLISHED:
+ case TCPS_FIN_WAIT_1:
+ case TCPS_CLOSING:
+ case TCPS_CLOSE_WAIT:
+ case TCPS_LAST_ACK:
+ /* If we have data to rexmit */
+ if (tcp->tcp_suna != tcp->tcp_snxt) {
+ clock_t time_to_wait;
+
+ TCPS_BUMP_MIB(tcps, tcpTimRetrans);
+ if (!tcp->tcp_xmit_head)
+ break;
+ time_to_wait = ddi_get_lbolt() -
+ (clock_t)tcp->tcp_xmit_head->b_prev;
+ time_to_wait = tcp->tcp_rto -
+ TICK_TO_MSEC(time_to_wait);
+ /*
+ * If the timer fired more than one clock tick too early,
+ * restart it.
+ */
+ if (time_to_wait > msec_per_tick) {
+ TCP_STAT(tcps, tcp_timer_fire_early);
+ TCP_TIMER_RESTART(tcp, time_to_wait);
+ return;
+ }
+ /*
+ * When we probe zero windows, we force the swnd open.
+ * If our peer acks with a closed window, swnd will be
+ * set to zero by tcp_rput(). As long as we are
+ * receiving acks, tcp_rput() will reset
+ * 'tcp_ms_we_have_waited' so as not to trip the
+ * first and second interval actions. NOTE: the timer
+ * interval is allowed to continue its exponential
+ * backoff.
+ */
+ if (tcp->tcp_swnd == 0 || tcp->tcp_zero_win_probe) {
+ if (connp->conn_debug) {
+ (void) strlog(TCP_MOD_ID, 0, 1,
+ SL_TRACE, "tcp_timer: zero win");
+ }
+ } else {
+ /*
+ * After retransmission, we need to do
+ * slow start. Set the ssthresh to one
+ * half of current effective window and
+ * cwnd to one MSS. Also reset
+ * tcp_cwnd_cnt.
+ *
+ * Note that if tcp_ssthresh is reduced because
+ * of ECN, do not reduce it again unless it is
+ * already one window of data away (tcp_cwr
+ * should then be cleared) or this is a
+ * timeout for a retransmitted segment.
+ */
+ uint32_t npkt;
+
+ if (!tcp->tcp_cwr || tcp->tcp_rexmit) {
+ npkt = ((tcp->tcp_timer_backoff ?
+ tcp->tcp_cwnd_ssthresh :
+ tcp->tcp_snxt -
+ tcp->tcp_suna) >> 1) / tcp->tcp_mss;
+ tcp->tcp_cwnd_ssthresh = MAX(npkt, 2) *
+ tcp->tcp_mss;
+ }
+ tcp->tcp_cwnd = tcp->tcp_mss;
+ tcp->tcp_cwnd_cnt = 0;
+ if (tcp->tcp_ecn_ok) {
+ tcp->tcp_cwr = B_TRUE;
+ tcp->tcp_cwr_snd_max = tcp->tcp_snxt;
+ tcp->tcp_ecn_cwr_sent = B_FALSE;
+ }
+ }
+ break;
+ }
+ /*
+ * We have something to send yet we cannot send. The
+ * reason can be:
+ *
+ * 1. Zero send window: we need to do a zero window probe.
+ * 2. Zero cwnd: because of ECN, we need to "clock out"
+ * segments.
+ * 3. SWS avoidance: the receiver may have shrunk the
+ * window; reset our knowledge of it.
+ *
+ * Note that condition 2 can happen together with either 1
+ * or 3. But 1 and 3 are exclusive.
+ */
+ if (tcp->tcp_unsent != 0) {
+ /*
+ * Should not hold the zero-copy messages for too long.
+ */
+ if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean)
+ tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp,
+ tcp->tcp_xmit_head, B_TRUE);
+
+ if (tcp->tcp_cwnd == 0) {
+ /*
+ * Set tcp_cwnd to 1 MSS so that a
+ * new segment can be sent out. We
+ * are "clocking out" new data when
+ * the network is really congested.
+ */
+ ASSERT(tcp->tcp_ecn_ok);
+ tcp->tcp_cwnd = tcp->tcp_mss;
+ }
+ if (tcp->tcp_swnd == 0) {
+ /* Extend window for zero window probe */
+ tcp->tcp_swnd++;
+ tcp->tcp_zero_win_probe = B_TRUE;
+ TCPS_BUMP_MIB(tcps, tcpOutWinProbe);
+ } else {
+ /*
+ * Handle timeout from sender SWS avoidance.
+ * Reset our knowledge of the max send window
+ * since the receiver might have reduced its
+ * receive buffer. Avoid setting tcp_max_swnd
+ * to one since that will essentially disable
+ * the SWS checks.
+ *
+ * Note that since we don't have a SWS
+ * state variable, if the timeout is set
+ * for ECN but not for SWS, this
+ * code will also be executed. This is
+ * fine as tcp_max_swnd is updated
+ * constantly and it will not affect
+ * anything.
+ */
+ tcp->tcp_max_swnd = MAX(tcp->tcp_swnd, 2);
+ }
+ tcp_wput_data(tcp, NULL, B_FALSE);
+ return;
+ }
+ /* Is there a FIN that needs to be retransmitted? */
+ if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
+ !tcp->tcp_fin_acked)
+ break;
+ /* Nothing to do, return without restarting timer. */
+ TCP_STAT(tcps, tcp_timer_fire_miss);
+ return;
+ case TCPS_FIN_WAIT_2:
+ /*
+ * The user closed the TCP endpoint and the peer ACK'ed our
+ * FIN. We waited some time for the peer's FIN, but it hasn't
+ * arrived. We now flush the connection to avoid the case
+ * where the peer has rebooted.
+ */
+ if (TCP_IS_DETACHED(tcp)) {
+ (void) tcp_clean_death(tcp, 0);
+ } else {
+ TCP_TIMER_RESTART(tcp,
+ tcps->tcps_fin_wait_2_flush_interval);
+ }
+ return;
+ case TCPS_TIME_WAIT:
+ (void) tcp_clean_death(tcp, 0);
+ return;
+ default:
+ if (connp->conn_debug) {
+ (void) strlog(TCP_MOD_ID, 0, 1, SL_TRACE|SL_ERROR,
+ "tcp_timer: strange state (%d) %s",
+ tcp->tcp_state, tcp_display(tcp, NULL,
+ DISP_PORT_ONLY));
+ }
+ return;
+ }
+
+ /*
+ * If the system is under memory pressure or the maximum number of
+ * connections has been established for the listener, be more
+ * aggressive in aborting connections.
+ */
+ if (tcps->tcps_reclaim || (tcp->tcp_listen_cnt != NULL &&
+ tcp->tcp_listen_cnt->tlc_cnt > tcp->tcp_listen_cnt->tlc_max)) {
+ second_threshold = tcp_early_abort * SECONDS;
+ }
+
+ if ((ms = tcp->tcp_ms_we_have_waited) > second_threshold) {
+ /*
+ * Should not hold the zero-copy messages for too long.
+ */
+ if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean)
+ tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp,
+ tcp->tcp_xmit_head, B_TRUE);
+
+ /*
+ * For a zero window probe, we need to keep sending
+ * indefinitely, unless we have not heard from the other
+ * side for some time...
+ */
+ if ((tcp->tcp_zero_win_probe == 0) ||
+ (TICK_TO_MSEC(ddi_get_lbolt() - tcp->tcp_last_recv_time) >
+ second_threshold)) {
+ TCPS_BUMP_MIB(tcps, tcpTimRetransDrop);
+ /*
+ * If TCP is in SYN_RCVD state, send back a
+ * RST|ACK as BSD does. Note that tcp_zero_win_probe
+ * should be zero in TCPS_SYN_RCVD state.
+ */
+ if (tcp->tcp_state == TCPS_SYN_RCVD) {
+ tcp_xmit_ctl("tcp_timer: RST sent on timeout "
+ "in SYN_RCVD",
+ tcp, tcp->tcp_snxt,
+ tcp->tcp_rnxt, TH_RST | TH_ACK);
+ }
+ (void) tcp_clean_death(tcp,
+ tcp->tcp_client_errno ?
+ tcp->tcp_client_errno : ETIMEDOUT);
+ return;
+ } else {
+ /*
+ * If the system is under memory pressure, we also
+ * abort the connection during zero window probing.
+ */
+ if (tcps->tcps_reclaim) {
+ (void) tcp_clean_death(tcp,
+ tcp->tcp_client_errno ?
+ tcp->tcp_client_errno : ETIMEDOUT);
+ TCP_STAT(tcps, tcp_zwin_mem_drop);
+ return;
+ }
+ /*
+ * Set tcp_ms_we_have_waited to second_threshold
+ * so that in next timeout, we will do the above
+ * check (ddi_get_lbolt() - tcp_last_recv_time).
+ * This is also to avoid overflow.
+ *
+ * We don't need to decrement tcp_timer_backoff
+ * to avoid overflow because it will be decremented
+ * later if new timeout value is greater than
+ * tcp_rexmit_interval_max. In the case when
+ * tcp_rexmit_interval_max is greater than
+ * second_threshold, it means that we will wait
+ * longer than second_threshold to send the next
+ * window probe.
+ */
+ tcp->tcp_ms_we_have_waited = second_threshold;
+ }
+ } else if (ms > first_threshold) {
+ /*
+ * Should not hold the zero-copy messages for too long.
+ */
+ if (tcp->tcp_snd_zcopy_aware && !tcp->tcp_xmit_zc_clean)
+ tcp->tcp_xmit_head = tcp_zcopy_backoff(tcp,
+ tcp->tcp_xmit_head, B_TRUE);
+
+ /*
+ * We have been retransmitting for too long... The RTT
+ * we calculated is probably incorrect. Reinitialize it.
+ * Need to compensate for 0 tcp_rtt_sa. Reset
+ * tcp_rtt_update so that we won't accidentally cache a
+ * bad value. But only do this if this is not a zero
+ * window probe.
+ */
+ if (tcp->tcp_rtt_sa != 0 && tcp->tcp_zero_win_probe == 0) {
+ tcp->tcp_rtt_sd += (tcp->tcp_rtt_sa >> 3) +
+ (tcp->tcp_rtt_sa >> 5);
+ tcp->tcp_rtt_sa = 0;
+ tcp_ip_notify(tcp);
+ tcp->tcp_rtt_update = 0;
+ }
+ }
+ tcp->tcp_timer_backoff++;
+ if ((ms = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
+ tcps->tcps_rexmit_interval_extra + (tcp->tcp_rtt_sa >> 5)) <
+ tcps->tcps_rexmit_interval_min) {
+ /*
+ * This means the original RTO is tcp_rexmit_interval_min.
+ * So we will use tcp_rexmit_interval_min as the RTO value
+ * and do the backoff.
+ */
+ ms = tcps->tcps_rexmit_interval_min << tcp->tcp_timer_backoff;
+ } else {
+ ms <<= tcp->tcp_timer_backoff;
+ }
+ if (ms > tcps->tcps_rexmit_interval_max) {
+ ms = tcps->tcps_rexmit_interval_max;
+ /*
+ * ms is at max, decrement tcp_timer_backoff to avoid
+ * overflow.
+ */
+ tcp->tcp_timer_backoff--;
+ }
+ tcp->tcp_ms_we_have_waited += ms;
+ if (tcp->tcp_zero_win_probe == 0) {
+ tcp->tcp_rto = ms;
+ }
+ TCP_TIMER_RESTART(tcp, ms);
+ /*
+ * This is after a timeout and tcp_rto is backed off. Set
+ * tcp_set_timer to 1 so that next time RTO is updated, we will
+ * restart the timer with a correct value.
+ */
+ tcp->tcp_set_timer = 1;
+ mss = tcp->tcp_snxt - tcp->tcp_suna;
+ if (mss > tcp->tcp_mss)
+ mss = tcp->tcp_mss;
+ if (mss > tcp->tcp_swnd && tcp->tcp_swnd != 0)
+ mss = tcp->tcp_swnd;
+
+ if ((mp = tcp->tcp_xmit_head) != NULL)
+ mp->b_prev = (mblk_t *)ddi_get_lbolt();
+ mp = tcp_xmit_mp(tcp, mp, mss, NULL, NULL, tcp->tcp_suna, B_TRUE, &mss,
+ B_TRUE);
+
+ /*
+ * When slow start after retransmission begins, start with
+ * this seq no. tcp_rexmit_max marks the end of special slow
+ * start phase. tcp_snd_burst controls how many segments
+ * can be sent because of an ack.
+ */
+ tcp->tcp_rexmit_nxt = tcp->tcp_suna;
+ tcp->tcp_snd_burst = TCP_CWND_SS;
+ if ((tcp->tcp_valid_bits & TCP_FSS_VALID) &&
+ (tcp->tcp_unsent == 0)) {
+ tcp->tcp_rexmit_max = tcp->tcp_fss;
+ } else {
+ tcp->tcp_rexmit_max = tcp->tcp_snxt;
+ }
+ tcp->tcp_rexmit = B_TRUE;
+ tcp->tcp_dupack_cnt = 0;
+
+ /*
+ * Remove all rexmit SACK blocks to start fresh.
+ */
+ if (tcp->tcp_snd_sack_ok && tcp->tcp_notsack_list != NULL)
+ TCP_NOTSACK_REMOVE_ALL(tcp->tcp_notsack_list, tcp);
+ if (mp == NULL) {
+ return;
+ }
+
+ tcp->tcp_csuna = tcp->tcp_snxt;
+ TCPS_BUMP_MIB(tcps, tcpRetransSegs);
+ TCPS_UPDATE_MIB(tcps, tcpRetransBytes, mss);
+ tcp_send_data(tcp, mp);
+}
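+
+/*
+ * Worked example of the RTO backoff computed in tcp_timer() above
+ * (illustrative values only): with tcp_rtt_sa = 800 (smoothed RTT scaled
+ * by 8, i.e. ~100 ms), tcp_rtt_sd = 60 and tcps_rexmit_interval_extra = 0,
+ * the base value is (800 >> 3) + 60 + 0 + (800 >> 5) = 185 ms. If that
+ * falls below a tcps_rexmit_interval_min of, say, 400 ms, the code instead
+ * uses 400 << tcp_timer_backoff, so a backoff of 1 restarts the timer
+ * after 800 ms; the result is always clamped to tcps_rexmit_interval_max.
+ */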
+
+/*
+ * Handle lingering timeouts. This function is called when the SO_LINGER timeout
+ * expires.
+ */
+void
+tcp_close_linger_timeout(void *arg)
+{
+ conn_t *connp = (conn_t *)arg;
+ tcp_t *tcp = connp->conn_tcp;
+
+ tcp->tcp_client_errno = ETIMEDOUT;
+ tcp_stop_lingering(tcp);
+}