author		xy158873 <none@none>	2005-10-17 11:11:52 -0700
committer	xy158873 <none@none>	2005-10-17 11:11:52 -0700
commit		a66ba7b33369c7bd064dcae0e279fd236eee6dbf (patch)
tree		111927198ae119f055eb30d6402cfa5c72c5ce6e /usr/src/uts/common/inet/tcp/tcp.c
parent		c0f937da2a16c696966e32fa43ce9d1eeda946ec (diff)
download	illumos-joyent-a66ba7b33369c7bd064dcae0e279fd236eee6dbf.tar.gz
6275398 Galaxy hangs when running lmbench
Diffstat (limited to 'usr/src/uts/common/inet/tcp/tcp.c')
-rw-r--r--	usr/src/uts/common/inet/tcp/tcp.c | 130
1 file changed, 102 insertions, 28 deletions
diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c
index 8c651d1443..b3ab0208ac 100644
--- a/usr/src/uts/common/inet/tcp/tcp.c
+++ b/usr/src/uts/common/inet/tcp/tcp.c
@@ -1155,10 +1155,39 @@ static void tcp_ioctl_abort_conn(queue_t *, mblk_t *);
static int tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *, int, int *,
boolean_t);
+/*
+ * Write-side flow control is implemented via the per-instance STREAMS
+ * write-side Q: QFULL is explicitly set to stop the flow of mblk_t's,
+ * and cleared (followed by a call to qbackenable()) to restart the
+ * flow, based on the number of TCP unsent bytes (i.e. those not yet
+ * on the wire waiting for a remote ACK).
+ *
+ * This is different from a standard STREAMS kmod, where the framework
+ * would automatically flow-control against the defined hiwat/lowat
+ * values as mblk_t's are enqueued on/dequeued from the STREAMS Q.
+ *
+ * As of FireEngine, TCP write-side flow control needs to take into
+ * account not only the unsent tcp_xmit list bytes but also any
+ * squeue_t enqueued bytes (i.e. from tcp_wput() -> tcp_output()).
+ *
+ * This is accomplished by adding a new tcp_t field, tcp_squeue_bytes,
+ * incremented by the bytes enqueued by tcp_wput() and decremented by
+ * the bytes dequeued and processed by tcp_output().
+ *
+ * So, the total number of unsent bytes is (tcp_squeue_bytes + tcp_unsent),
+ * with all flow-control uses of tcp_unsent replaced by TCP_UNSENT_BYTES.
+ */
static void tcp_clrqfull(tcp_t *);
static void tcp_setqfull(tcp_t *);
+#define TCP_UNSENT_BYTES(tcp) \
+ ((tcp)->tcp_squeue_bytes + (tcp)->tcp_unsent)
+
+/*
+ * STREAMS kmod stuff ...
+ */
+
static struct module_info tcp_rinfo = {
#define TCP_MODULE_ID 5105
TCP_MODULE_ID, "tcp", 0, INFPSZ, TCP_RECV_HIWATER, TCP_RECV_LOWATER
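To make the accounting concrete, here is a minimal user-level sketch of the two counters and the combined-backlog macro described in the block comment above. The type and field names (flow_tcp_t, squeue_bytes, unsent, FLOW_UNSENT_BYTES) are hypothetical stand-ins for the kernel's tcp_t and TCP_UNSENT_BYTES, not actual kernel declarations:

#include <stdint.h>

/* Hypothetical stand-in for the two tcp_t byte counters. */
typedef struct flow_tcp {
    uint32_t squeue_bytes;  /* enqueued by the write side, not yet processed */
    uint32_t unsent;        /* on the xmit list, not yet on the wire */
} flow_tcp_t;

/* Mirrors the idea of TCP_UNSENT_BYTES(): everything not yet sent. */
#define FLOW_UNSENT_BYTES(t)    ((t)->squeue_bytes + (t)->unsent)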
@@ -1968,12 +1997,10 @@ tcp_unfuse(tcp_t *tcp)
/* Lift up any flow-control conditions */
if (tcp->tcp_flow_stopped) {
tcp_clrqfull(tcp);
- tcp->tcp_flow_stopped = B_FALSE;
TCP_STAT(tcp_fusion_backenabled);
}
if (peer_tcp->tcp_flow_stopped) {
tcp_clrqfull(peer_tcp);
- peer_tcp->tcp_flow_stopped = B_FALSE;
TCP_STAT(tcp_fusion_backenabled);
}
@@ -2165,7 +2192,6 @@ tcp_fuse_output(tcp_t *tcp, mblk_t *mp)
if (TCP_IS_DETACHED(peer_tcp) &&
peer_tcp->tcp_rcv_cnt > peer_rq->q_hiwat) {
tcp_setqfull(tcp);
- tcp->tcp_flow_stopped = B_TRUE;
TCP_STAT(tcp_fusion_flowctl);
}
@@ -2203,7 +2229,6 @@ tcp_fuse_output(tcp_t *tcp, mblk_t *mp)
} else if (!tcp->tcp_flow_stopped) {
if (!canputnext(peer_rq)) {
tcp_setqfull(tcp);
- tcp->tcp_flow_stopped = B_TRUE;
TCP_STAT(tcp_fusion_flowctl);
} else {
ASSERT(peer_tcp->tcp_rcv_list != NULL);
@@ -2211,6 +2236,8 @@ tcp_fuse_output(tcp_t *tcp, mblk_t *mp)
peer_tcp, NULL);
TCP_STAT(tcp_fusion_putnext);
}
+ } else if (TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) {
+ tcp_clrqfull(tcp);
}
}
return (B_TRUE);
@@ -4476,7 +4503,6 @@ tcp_stop_lingering(tcp_t *tcp)
if (tcp->tcp_state > TCPS_LISTEN) {
tcp_acceptor_hash_remove(tcp);
if (tcp->tcp_flow_stopped) {
- tcp->tcp_flow_stopped = B_FALSE;
tcp_clrqfull(tcp);
}
@@ -4798,7 +4824,6 @@ tcp_close_output(void *arg, mblk_t *mp, void *arg2)
tcp_acceptor_hash_remove(tcp);
if (tcp->tcp_flow_stopped) {
- tcp->tcp_flow_stopped = B_FALSE;
tcp_clrqfull(tcp);
}
@@ -7989,7 +8014,6 @@ tcp_reinit(tcp_t *tcp)
tcp_timers_stop(tcp);
if (tcp->tcp_flow_stopped) {
- tcp->tcp_flow_stopped = B_FALSE;
tcp_clrqfull(tcp);
}
/*
@@ -8398,6 +8422,8 @@ tcp_reinit_values(tcp)
tcp->tcp_in_ack_unsent = 0;
tcp->tcp_cork = B_FALSE;
+ tcp->tcp_squeue_bytes = 0;
+
#undef DONTCARE
#undef PRESERVE
}
@@ -10472,8 +10498,8 @@ tcp_opt_set(queue_t *q, uint_t optset_context, int level, int name,
* condition to be lifted right away.
*/
if (tcp->tcp_flow_stopped &&
- tcp->tcp_unsent < tcp->tcp_xmit_hiwater) {
- tcp->tcp_flow_stopped = B_FALSE;
+ TCP_UNSENT_BYTES(tcp)
+ < tcp->tcp_xmit_hiwater) {
tcp_clrqfull(tcp);
}
}
@@ -15988,7 +16014,6 @@ tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2)
(void) tcp_rcv_drain(tcp->tcp_rq, tcp);
tcp_clrqfull(peer_tcp);
- peer_tcp->tcp_flow_stopped = B_FALSE;
TCP_STAT(tcp_fusion_backenabled);
return;
}
@@ -17441,6 +17466,7 @@ tcp_output(void *arg, mblk_t *mp, void *arg2)
int usable;
conn_t *connp = (conn_t *)arg;
tcp_t *tcp = connp->conn_tcp;
+ uint32_t msize;
/*
* Try and ASSERT the minimum possible references on the
@@ -17455,8 +17481,15 @@ tcp_output(void *arg, mblk_t *mp, void *arg2)
(connp->conn_fanout == NULL && connp->conn_ref >= 3));
/* Bypass tcp protocol for fused tcp loopback */
- if (tcp->tcp_fused && tcp_fuse_output(tcp, mp))
- return;
+ if (tcp->tcp_fused) {
+ msize = msgdsize(mp);
+ mutex_enter(&connp->conn_lock);
+ tcp->tcp_squeue_bytes -= msize;
+ mutex_exit(&connp->conn_lock);
+
+ if (tcp_fuse_output(tcp, mp))
+ return;
+ }
mss = tcp->tcp_mss;
if (tcp->tcp_xmit_zc_clean)
@@ -17482,6 +17515,11 @@ tcp_output(void *arg, mblk_t *mp, void *arg2)
(len == 0) ||
(len > mss) ||
(tcp->tcp_valid_bits != 0)) {
+ msize = msgdsize(mp);
+ mutex_enter(&connp->conn_lock);
+ tcp->tcp_squeue_bytes -= msize;
+ mutex_exit(&connp->conn_lock);
+
tcp_wput_data(tcp, mp, B_FALSE);
return;
}
@@ -17489,6 +17527,10 @@ tcp_output(void *arg, mblk_t *mp, void *arg2)
ASSERT(tcp->tcp_xmit_tail_unsent == 0);
ASSERT(tcp->tcp_fin_sent == 0);
+ mutex_enter(&connp->conn_lock);
+ tcp->tcp_squeue_bytes -= len;
+ mutex_exit(&connp->conn_lock);
+
/* queue new packet onto retransmission queue */
if (tcp->tcp_xmit_head == NULL) {
tcp->tcp_xmit_head = mp;
@@ -17536,6 +17578,11 @@ tcp_output(void *arg, mblk_t *mp, void *arg2)
goto slow;
}
+ if (tcp->tcp_flow_stopped &&
+ TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) {
+ tcp_clrqfull(tcp);
+ }
+
/*
* determine if anything to send (Nagle).
*
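Every early-return path in tcp_output() above follows the same consumer-side pattern: compute the message size, subtract it from tcp_squeue_bytes under conn_lock, and back-enable once the combined backlog drains to the low-water mark. A user-level sketch of that step, with a pthread mutex standing in for the kernel's conn_lock (all names here are hypothetical; the unlocked threshold check mirrors the diff, where tcp_flow_stopped is advisory outside the lock):

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>

/* Hypothetical connection state; 'lock' stands in for conn_lock. */
typedef struct conn_sketch {
    pthread_mutex_t lock;
    uint32_t squeue_bytes;  /* squeue backlog in bytes */
    uint32_t unsent;        /* xmit-list backlog in bytes */
    uint32_t xmit_lowater;  /* back-enable threshold */
    uint32_t xmit_hiwater;  /* flow-control threshold */
    bool     flow_stopped;  /* stands in for tcp_flow_stopped */
} conn_sketch_t;

/* Consumer side: charge 'msize' bytes as leaving the squeue, then lift
 * flow control if the remaining total backlog is at or below lowat. */
static void
consume_bytes(conn_sketch_t *c, uint32_t msize)
{
    pthread_mutex_lock(&c->lock);
    c->squeue_bytes -= msize;
    pthread_mutex_unlock(&c->lock);

    if (c->flow_stopped &&
        c->squeue_bytes + c->unsent <= c->xmit_lowater)
        c->flow_stopped = false;    /* i.e. tcp_clrqfull() */
}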
@@ -17909,7 +17956,6 @@ tcp_accept_finish(void *arg, mblk_t *mp, void *arg2)
ASSERT(peer_tcp->tcp_fused);
tcp_clrqfull(peer_tcp);
- peer_tcp->tcp_flow_stopped = B_FALSE;
TCP_STAT(tcp_fusion_backenabled);
}
}
@@ -18245,12 +18291,27 @@ tcp_wput(queue_t *q, mblk_t *mp)
t_scalar_t type;
uchar_t *rptr;
struct iocblk *iocp;
+ uint32_t msize;
ASSERT(connp->conn_ref >= 2);
switch (DB_TYPE(mp)) {
case M_DATA:
- CONN_INC_REF(connp);
+ tcp = connp->conn_tcp;
+ ASSERT(tcp != NULL);
+
+ msize = msgdsize(mp);
+
+ mutex_enter(&connp->conn_lock);
+ CONN_INC_REF_LOCKED(connp);
+
+ tcp->tcp_squeue_bytes += msize;
+ if (TCP_UNSENT_BYTES(tcp) > tcp->tcp_xmit_hiwater) {
+ mutex_exit(&connp->conn_lock);
+ tcp_setqfull(tcp);
+ } else
+ mutex_exit(&connp->conn_lock);
+
(*tcp_squeue_wput_proc)(connp->conn_sqp, mp,
tcp_output, connp, SQTAG_TCP_OUTPUT);
return;
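The tcp_wput() M_DATA path above is the matching producer side: it measures the mblk with msgdsize(), charges it to tcp_squeue_bytes under conn_lock, and asserts flow control when the combined backlog exceeds the high-water mark, all before handing the message to the squeue. A sketch of that step, reusing the hypothetical conn_sketch_t type from the previous sketch:

/* Producer side: charge 'msize' bytes as entering the squeue, then
 * assert flow control once the total backlog exceeds hiwat. As in the
 * diff, the flag is changed only after the lock is dropped. */
static void
produce_bytes(conn_sketch_t *c, uint32_t msize)
{
    bool over;

    pthread_mutex_lock(&c->lock);
    c->squeue_bytes += msize;
    over = (c->squeue_bytes + c->unsent > c->xmit_hiwater);
    pthread_mutex_unlock(&c->lock);

    if (over)
        c->flow_stopped = true;     /* i.e. tcp_setqfull() */
    /* ... then hand the message to the squeue for consume_bytes() ... */
}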
@@ -19214,15 +19275,12 @@ done:;
TCP_TIMER_RESTART(tcp, tcp->tcp_rto);
}
/* Note that len is the amount we just sent but with a negative sign */
- len += tcp->tcp_unsent;
- tcp->tcp_unsent = len;
+ tcp->tcp_unsent += len;
if (tcp->tcp_flow_stopped) {
- if (len <= tcp->tcp_xmit_lowater) {
- tcp->tcp_flow_stopped = B_FALSE;
+ if (TCP_UNSENT_BYTES(tcp) <= tcp->tcp_xmit_lowater) {
tcp_clrqfull(tcp);
}
- } else if (len >= tcp->tcp_xmit_hiwater) {
- tcp->tcp_flow_stopped = B_TRUE;
+ } else if (TCP_UNSENT_BYTES(tcp) >= tcp->tcp_xmit_hiwater) {
tcp_setqfull(tcp);
}
}
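Note the two distinct thresholds in the hunk above: flow control is asserted only once the backlog reaches tcp_xmit_hiwater, but cleared only once it drains to tcp_xmit_lowater. That hysteresis keeps a backlog hovering near a single threshold from toggling QFULL on every mblk. A sketch of the dual-threshold check, again using the hypothetical conn_sketch_t:

/* Dual-threshold hysteresis over the combined backlog (sketch). */
static void
update_flow(conn_sketch_t *c)
{
    uint32_t backlog = c->squeue_bytes + c->unsent;

    if (c->flow_stopped) {
        if (backlog <= c->xmit_lowater)
            c->flow_stopped = false;    /* tcp_clrqfull() */
    } else if (backlog >= c->xmit_hiwater) {
        c->flow_stopped = true;         /* tcp_setqfull() */
    }
}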
@@ -21112,7 +21170,6 @@ tcp_wput_flush(tcp_t *tcp, mblk_t *mp)
* tcp_xmit_lowater, so re-enable flow.
*/
if (tcp->tcp_flow_stopped) {
- tcp->tcp_flow_stopped = B_FALSE;
tcp_clrqfull(tcp);
}
}
@@ -25165,16 +25222,29 @@ tcp_timer_free(tcp_t *tcp, mblk_t *mp)
* End of TCP Timers implementation.
*/
+/*
+ * The tcp_{set,clr}qfull() functions are used to either set or clear
+ * QFULL on the specified backing STREAMS q. Note that the caller may
+ * decide to call based on the tcp_t.tcp_flow_stopped value, which when
+ * checked outside the q's lock is only advisory ...
+ */
+
static void
tcp_setqfull(tcp_t *tcp)
{
queue_t *q = tcp->tcp_wq;
if (!(q->q_flag & QFULL)) {
- TCP_STAT(tcp_flwctl_on);
mutex_enter(QLOCK(q));
- q->q_flag |= QFULL;
- mutex_exit(QLOCK(q));
+ if (!(q->q_flag & QFULL)) {
+ /* still need to set QFULL */
+ q->q_flag |= QFULL;
+ tcp->tcp_flow_stopped = B_TRUE;
+ mutex_exit(QLOCK(q));
+ TCP_STAT(tcp_flwctl_on);
+ } else {
+ mutex_exit(QLOCK(q));
+ }
}
}
@@ -25185,10 +25255,14 @@ tcp_clrqfull(tcp_t *tcp)
if (q->q_flag & QFULL) {
mutex_enter(QLOCK(q));
- q->q_flag &= ~QFULL;
- mutex_exit(QLOCK(q));
- if (q->q_flag & QWANTW)
- qbackenable(q, 0);
+ if (q->q_flag & QFULL) {
+ q->q_flag &= ~QFULL;
+ tcp->tcp_flow_stopped = B_FALSE;
+ mutex_exit(QLOCK(q));
+ if (q->q_flag & QWANTW)
+ qbackenable(q, 0);
+ } else
+ mutex_exit(QLOCK(q));
}
}
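Both rewritten functions use the same check-lock-recheck pattern: the unlocked test of QFULL is the cheap advisory filter the comment above mentions, and the test is repeated under QLOCK before QFULL and tcp_flow_stopped are changed, so two racing callers cannot both observe and perform the same transition. A user-level sketch of the pattern with pthreads (all names hypothetical):

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t qlock = PTHREAD_MUTEX_INITIALIZER;
static bool qfull;          /* stands in for q->q_flag & QFULL */
static bool flow_stopped;   /* stands in for tcp_flow_stopped */

static void
set_qfull(void)
{
    if (!qfull) {                   /* unlocked, advisory check */
        pthread_mutex_lock(&qlock);
        if (!qfull) {               /* recheck under the lock */
            qfull = true;
            flow_stopped = true;
        }
        pthread_mutex_unlock(&qlock);
    }
}

static void
clr_qfull(void)
{
    if (qfull) {                    /* unlocked, advisory check */
        pthread_mutex_lock(&qlock);
        if (qfull) {                /* recheck under the lock */
            qfull = false;
            flow_stopped = false;
            /* the kernel code would qbackenable() here */
        }
        pthread_mutex_unlock(&qlock);
    }
}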