path: root/usr/src/uts/common/inet/tcp/tcp_tpi.c
author:    Anders Persson <Anders.Persson@Sun.COM>  2010-06-17 17:22:09 -0700
committer: Anders Persson <Anders.Persson@Sun.COM>  2010-06-17 17:22:09 -0700
commit:    3e95bd4ab92abca814bd28e854607d1975c7dc88 (patch)
tree:      9f3088b26f62207198f0d44feca65b701fafb8dc /usr/src/uts/common/inet/tcp/tcp_tpi.c
parent:    8e51227711fb29b69b8f42a3953e759963432065 (diff)
download:  illumos-gate-3e95bd4ab92abca814bd28e854607d1975c7dc88.tar.gz
PSARC/2009/590 Socket Filter Framework
6939085 Socket Filter Framework
6802067 connect_failed kernel socket callback is not triggered
6776450 time spent in tcp_close could be reduced/deferred to a worker thread
6828586 assertion failed: family == 26, file: ../../common/fs/sockfs/socksyscalls.c, line: 1608
6802078 kernel socket 'newconn' callback is passing rcv queue size as an argument
Diffstat (limited to 'usr/src/uts/common/inet/tcp/tcp_tpi.c')
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp_tpi.c | 492
1 file changed, 298 insertions(+), 194 deletions(-)
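For context: tcp_tpi.c implements the pure TLI/XTI (STREAMS) side of TCP, and this commit reworks its accept path so it no longer shares tcp_accept_common() with the socket code. The sketch below is editorial, not part of the commit — device name and error handling are simplified — and shows the user-level TLI accept sequence that ultimately drives the kernel paths changed here:

#include <fcntl.h>
#include <tiuser.h>

int
main(void)
{
	struct t_bind *bind;
	struct t_call *call;
	int lfd, afd;

	/* Open a TLI endpoint over TCP and bind it with qlen > 0 */
	lfd = t_open("/dev/tcp", O_RDWR, NULL);
	bind = (struct t_bind *)t_alloc(lfd, T_BIND, T_ALL);
	bind->qlen = 1;		/* accept one pending connect indication */
	bind->addr.len = 0;	/* let the transport pick the address */
	(void) t_bind(lfd, bind, NULL);

	/* t_listen() returns once the kernel sends a T_CONN_IND upstream */
	call = (struct t_call *)t_alloc(lfd, T_CALL, T_ALL);
	if (t_listen(lfd, call) < 0) {
		t_error("t_listen");
		return (1);
	}

	/*
	 * t_accept() on a second endpoint sends T_CONN_RES down the
	 * acceptor STREAM; that is the message tcp_tpi_accept() handles.
	 */
	afd = t_open("/dev/tcp", O_RDWR, NULL);
	(void) t_bind(afd, NULL, NULL);
	if (t_accept(lfd, afd, call) < 0) {
		t_error("t_accept");
		return (1);
	}
	return (0);
}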
diff --git a/usr/src/uts/common/inet/tcp/tcp_tpi.c b/usr/src/uts/common/inet/tcp/tcp_tpi.c
index bcaa1595ec..8c645425b7 100644
--- a/usr/src/uts/common/inet/tcp/tcp_tpi.c
+++ b/usr/src/uts/common/inet/tcp/tcp_tpi.c
@@ -20,8 +20,7 @@
*/
/*
- * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
- * Use is subject to license terms.
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
*/
/* This file contains all TCP TLI/TPI related functions */
@@ -47,7 +46,6 @@
static void tcp_accept_swap(tcp_t *, tcp_t *, tcp_t *);
static int tcp_conprim_opt_process(tcp_t *, mblk_t *, int *, int *, int *);
-static void tcp_ulp_newconn(conn_t *, conn_t *, mblk_t *);
void
tcp_use_pure_tpi(tcp_t *tcp)
@@ -823,7 +821,7 @@ tcp_copy_info(struct T_info_ack *tia, tcp_t *tcp)
/* TODO: Default ETSDU is 1. Is that correct for tcp? */
}
-static void
+void
tcp_do_capability_ack(tcp_t *tcp, struct T_capability_ack *tcap,
t_uscalar_t cap_bits1)
{
@@ -950,148 +948,6 @@ tcp_addr_req(tcp_t *tcp, mblk_t *mp)
}
/*
- * tcp_fallback
- *
- * A direct socket is falling back to using STREAMS. The queue
- * that is being passed down was created using tcp_open() with
- * the SO_FALLBACK flag set. As a result, the queue is not
- * associated with a conn, and the q_ptrs instead contain the
- * dev and minor area that should be used.
- *
- * The 'issocket' flag indicates whether the FireEngine
- * optimizations should be used. The common case would be that
- * optimizations are enabled, and they might be subsequently
- * disabled using the _SIOCSOCKFALLBACK ioctl.
- */
-
-/*
- * An active connection is falling back to TPI. Gather all the information
- * required by the STREAM head and TPI sonode and send it up.
- */
-void
-tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q,
- boolean_t issocket, so_proto_quiesced_cb_t quiesced_cb)
-{
- conn_t *connp = tcp->tcp_connp;
- struct stroptions *stropt;
- struct T_capability_ack tca;
- struct sockaddr_in6 laddr, faddr;
- socklen_t laddrlen, faddrlen;
- short opts;
- int error;
- mblk_t *mp;
-
- connp->conn_dev = (dev_t)RD(q)->q_ptr;
- connp->conn_minor_arena = WR(q)->q_ptr;
-
- RD(q)->q_ptr = WR(q)->q_ptr = connp;
-
- connp->conn_rq = RD(q);
- connp->conn_wq = WR(q);
-
- WR(q)->q_qinfo = &tcp_sock_winit;
-
- if (!issocket)
- tcp_use_pure_tpi(tcp);
-
- /*
- * free the helper stream
- */
- ip_free_helper_stream(connp);
-
- /*
- * Notify the STREAM head about options
- */
- DB_TYPE(stropt_mp) = M_SETOPTS;
- stropt = (struct stroptions *)stropt_mp->b_rptr;
- stropt_mp->b_wptr += sizeof (struct stroptions);
- stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK;
-
- stropt->so_wroff = connp->conn_ht_iphc_len + (tcp->tcp_loopback ? 0 :
- tcp->tcp_tcps->tcps_wroff_xtra);
- if (tcp->tcp_snd_sack_ok)
- stropt->so_wroff += TCPOPT_MAX_SACK_LEN;
- stropt->so_hiwat = connp->conn_rcvbuf;
- stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE);
-
- putnext(RD(q), stropt_mp);
-
- /*
- * Collect the information needed to sync with the sonode
- */
- tcp_do_capability_ack(tcp, &tca, TC1_INFO|TC1_ACCEPTOR_ID);
-
- laddrlen = faddrlen = sizeof (sin6_t);
- (void) tcp_getsockname((sock_lower_handle_t)connp,
- (struct sockaddr *)&laddr, &laddrlen, CRED());
- error = tcp_getpeername((sock_lower_handle_t)connp,
- (struct sockaddr *)&faddr, &faddrlen, CRED());
- if (error != 0)
- faddrlen = 0;
-
- opts = 0;
- if (connp->conn_oobinline)
- opts |= SO_OOBINLINE;
- if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
- opts |= SO_DONTROUTE;
-
- /*
- * Notify the socket that the protocol is now quiescent,
- * and it's therefore safe to move data from the socket
- * to the stream head.
- */
- (*quiesced_cb)(connp->conn_upper_handle, q, &tca,
- (struct sockaddr *)&laddr, laddrlen,
- (struct sockaddr *)&faddr, faddrlen, opts);
-
- while ((mp = tcp->tcp_rcv_list) != NULL) {
- tcp->tcp_rcv_list = mp->b_next;
- mp->b_next = NULL;
- /* We never do fallback for kernel RPC */
- putnext(q, mp);
- }
- tcp->tcp_rcv_last_head = NULL;
- tcp->tcp_rcv_last_tail = NULL;
- tcp->tcp_rcv_cnt = 0;
-}
-
-/*
- * An eager is falling back to TPI. All we have to do is send
- * up a T_CONN_IND.
- */
-void
-tcp_fallback_eager(tcp_t *eager, boolean_t direct_sockfs)
-{
- tcp_t *listener = eager->tcp_listener;
- mblk_t *mp = eager->tcp_conn.tcp_eager_conn_ind;
-
- ASSERT(listener != NULL);
- ASSERT(mp != NULL);
-
- eager->tcp_conn.tcp_eager_conn_ind = NULL;
-
- /*
- * TLI/XTI applications will get confused by
- * sending the eager as an option, since that violates
- * the option semantics. So remove the eager as an
- * option; TLI/XTI applications don't need it anyway.
- */
- if (!direct_sockfs) {
- struct T_conn_ind *conn_ind;
-
- conn_ind = (struct T_conn_ind *)mp->b_rptr;
- conn_ind->OPT_length = 0;
- conn_ind->OPT_offset = 0;
- }
-
- /*
- * Sockfs guarantees that the listener will not be closed
- * during fallback. So we can safely use the listener's queue.
- */
- putnext(listener->tcp_connp->conn_rq, mp);
-}
-
-/*
* Swap information between the eager and acceptor for a TLI/XTI client.
* The sockfs accept is done on the acceptor stream and control goes
* through tcp_tli_accept() and tcp_accept()/tcp_accept_swap() is not
@@ -1185,6 +1041,191 @@ tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager)
}
/*
+ * This runs at the tail end of accept processing on the squeue of the
+ * new connection.
+ */
+/* ARGSUSED */
+static void
+tcp_accept_finish(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
+{
+ conn_t *connp = (conn_t *)arg;
+ tcp_t *tcp = connp->conn_tcp;
+ queue_t *q = connp->conn_rq;
+ tcp_stack_t *tcps = tcp->tcp_tcps;
+ struct stroptions *stropt;
+ struct sock_proto_props sopp;
+
+ /* Should never be called for non-STREAMS sockets */
+ ASSERT(!IPCL_IS_NONSTR(connp));
+
+ /* We should just receive a single mblk that fits a T_discon_ind */
+ ASSERT(mp->b_cont == NULL);
+
+ /*
+ * Drop the eager's ref on the listener, that was placed when
+ * this eager began life in tcp_input_listener.
+ */
+ CONN_DEC_REF(tcp->tcp_saved_listener->tcp_connp);
+
+ tcp->tcp_detached = B_FALSE;
+
+ if (tcp->tcp_state <= TCPS_BOUND || tcp->tcp_accept_error) {
+ /*
+ * Someone blew off the eager before we could finish
+ * the accept.
+ *
+ * The only reason the eager exists is because we put a
+ * ref on it when the conn ind went up. We need to send
+ * a disconnect indication up, and the last reference
+ * on the eager will be dropped by the squeue when we
+ * return.
+ */
+ ASSERT(tcp->tcp_listener == NULL);
+ if (tcp->tcp_issocket || tcp->tcp_send_discon_ind) {
+ struct T_discon_ind *tdi;
+
+ (void) putnextctl1(q, M_FLUSH, FLUSHRW);
+ /*
+ * Reuse the incoming mblk to avoid memory
+ * allocation failure problems. We know that the
+ * incoming mblk, sized for struct stroptions, is
+ * at least as large as a struct T_discon_ind.
+ */
+ ASSERT(DB_REF(mp) == 1);
+ ASSERT(MBLKSIZE(mp) >=
+ sizeof (struct T_discon_ind));
+
+ DB_TYPE(mp) = M_PROTO;
+ ((union T_primitives *)mp->b_rptr)->type =
+ T_DISCON_IND;
+ tdi = (struct T_discon_ind *)mp->b_rptr;
+ if (tcp->tcp_issocket) {
+ tdi->DISCON_reason = ECONNREFUSED;
+ tdi->SEQ_number = 0;
+ } else {
+ tdi->DISCON_reason = ENOPROTOOPT;
+ tdi->SEQ_number =
+ tcp->tcp_conn_req_seqnum;
+ }
+ mp->b_wptr = mp->b_rptr +
+ sizeof (struct T_discon_ind);
+ putnext(q, mp);
+ }
+ tcp->tcp_hard_binding = B_FALSE;
+ return;
+ }
+
+ /*
+ * This is the first time we run on the correct
+ * queue after tcp_accept, so fix all the q parameters
+ * here.
+ *
+ * Reuse the incoming mblk to avoid memory
+ * allocation failure problems; we know that the
+ * incoming mblk is at least as large as struct
+ * stroptions.
+ */
+ tcp_get_proto_props(tcp, &sopp);
+
+ ASSERT(DB_REF(mp) == 1);
+ ASSERT(MBLKSIZE(mp) >= sizeof (struct stroptions));
+
+ DB_TYPE(mp) = M_SETOPTS;
+ mp->b_wptr = mp->b_rptr + sizeof (struct stroptions);
+ stropt = (struct stroptions *)mp->b_rptr;
+ ASSERT(sopp.sopp_flags & (SO_HIWAT|SO_WROFF|SO_MAXBLK));
+ stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK;
+ stropt->so_hiwat = sopp.sopp_rxhiwat;
+ stropt->so_wroff = sopp.sopp_wroff;
+ stropt->so_maxblk = sopp.sopp_maxblk;
+
+ if (sopp.sopp_flags & SOCKOPT_TAIL) {
+ ASSERT(tcp->tcp_kssl_ctx != NULL);
+
+ stropt->so_flags |= SO_TAIL | SO_COPYOPT;
+ stropt->so_tail = sopp.sopp_tail;
+ stropt->so_copyopt = sopp.sopp_zcopyflag;
+ }
+
+ /* Send the options up */
+ putnext(q, mp);
+
+ /*
+ * Pass up any data and/or a fin that has been received.
+ *
+ * Adjust receive window in case it had decreased
+ * (because there is data <=> tcp_rcv_list != NULL)
+ * while the connection was detached. Note that
+ * in case the eager was flow-controlled, w/o this
+ * code, the rwnd may never open up again!
+ */
+ if (tcp->tcp_rcv_list != NULL) {
+ /* We drain directly in case of fused tcp loopback */
+
+ if (!tcp->tcp_fused && canputnext(q)) {
+ tcp->tcp_rwnd = connp->conn_rcvbuf;
+ if (tcp->tcp_state >= TCPS_ESTABLISHED &&
+ tcp_rwnd_reopen(tcp) == TH_ACK_NEEDED) {
+ tcp_xmit_ctl(NULL,
+ tcp, (tcp->tcp_swnd == 0) ?
+ tcp->tcp_suna : tcp->tcp_snxt,
+ tcp->tcp_rnxt, TH_ACK);
+ }
+ }
+
+ (void) tcp_rcv_drain(tcp);
+
+ /*
+ * For fused tcp loopback, back-enable peer endpoint
+ * if it's currently flow-controlled.
+ */
+ if (tcp->tcp_fused) {
+ tcp_t *peer_tcp = tcp->tcp_loopback_peer;
+
+ ASSERT(peer_tcp != NULL);
+ ASSERT(peer_tcp->tcp_fused);
+
+ mutex_enter(&peer_tcp->tcp_non_sq_lock);
+ if (peer_tcp->tcp_flow_stopped) {
+ tcp_clrqfull(peer_tcp);
+ TCP_STAT(tcps, tcp_fusion_backenabled);
+ }
+ mutex_exit(&peer_tcp->tcp_non_sq_lock);
+ }
+ }
+ ASSERT(tcp->tcp_rcv_list == NULL || tcp->tcp_fused_sigurg);
+ if (tcp->tcp_fin_rcvd && !tcp->tcp_ordrel_done) {
+ tcp->tcp_ordrel_done = B_TRUE;
+ mp = tcp->tcp_ordrel_mp;
+ tcp->tcp_ordrel_mp = NULL;
+ putnext(q, mp);
+ }
+ tcp->tcp_hard_binding = B_FALSE;
+
+ if (connp->conn_keepalive) {
+ tcp->tcp_ka_last_intrvl = 0;
+ tcp->tcp_ka_tid = TCP_TIMER(tcp, tcp_keepalive_timer,
+ tcp->tcp_ka_interval);
+ }
+
+ /*
+ * At this point, eager is fully established and will
+ * have the following references -
+ *
+ * 2 references for connection to exist (1 for TCP and 1 for IP).
+ * 1 reference for the squeue which will be dropped by the squeue as
+ * soon as this function returns.
+ * There will be 1 additional reference for being in the classifier
+ * hash list, provided nothing bad has happened.
+ */
+ ASSERT((connp->conn_fanout != NULL && connp->conn_ref >= 4) ||
+ (connp->conn_fanout == NULL && connp->conn_ref >= 3));
+}
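/*
 * Editorial aside (not part of the patch): a condensed sketch of the
 * mblk-reuse pattern tcp_accept_finish() relies on. The caller
 * preallocates a single mblk big enough for either payload, so the
 * failure path never has to allocate memory. Helper names are
 * hypothetical; the DDI calls (allocb, putnext, DB_TYPE) are real.
 */
mblk_t *
accept_mp_alloc(void)
{
	return (allocb(MAX(sizeof (struct T_discon_ind),
	    sizeof (struct stroptions)), BPRI_HI));
}

static void
send_discon(queue_t *q, mblk_t *mp, t_scalar_t reason, t_scalar_t seqnum)
{
	struct T_discon_ind *tdi;

	/* Safe to recycle: sole reference, sized for the larger payload */
	ASSERT(DB_REF(mp) == 1);
	ASSERT(MBLKSIZE(mp) >= sizeof (struct T_discon_ind));

	DB_TYPE(mp) = M_PROTO;
	tdi = (struct T_discon_ind *)mp->b_rptr;
	tdi->PRIM_type = T_DISCON_IND;
	tdi->DISCON_reason = reason;
	tdi->SEQ_number = seqnum;
	mp->b_wptr = mp->b_rptr + sizeof (struct T_discon_ind);
	putnext(q, mp);
}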
+
+
+/*
* Reply to a clients T_CONN_RES TPI message. This function
* is used only for TLI/XTI listener. Sockfs sends T_CONN_RES
* on the acceptor STREAM and processed in tcp_accept_common().
@@ -1643,6 +1684,7 @@ tcp_tpi_accept(queue_t *q, mblk_t *mp)
tcp_t *listener;
struct T_ok_ack *ok;
t_scalar_t PRIM_type;
+ mblk_t *discon_mp;
conn_t *econnp;
cred_t *cr;
@@ -1703,14 +1745,120 @@ tcp_tpi_accept(queue_t *q, mblk_t *mp)
q->q_qinfo = &tcp_winit;
listener = eager->tcp_listener;
- if (tcp_accept_common(listener->tcp_connp,
- econnp, cr) < 0) {
+ /*
+ * Preallocate the discon_ind mblk as well; tcp_accept_finish will
+ * use it if something fails.
+ */
+ discon_mp = allocb(MAX(sizeof (struct T_discon_ind),
+ sizeof (struct stroptions)), BPRI_HI);
+
+ if (discon_mp == NULL) {
mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0);
if (mp != NULL)
putnext(rq, mp);
return;
}
+ eager->tcp_issocket = B_TRUE;
+
+ ASSERT(econnp->conn_netstack ==
+ listener->tcp_connp->conn_netstack);
+ ASSERT(eager->tcp_tcps == listener->tcp_tcps);
+
+ /* Put the ref for IP */
+ CONN_INC_REF(econnp);
+
+ /*
+ * We should have minimum of 3 references on the conn
+ * at this point. One each for TCP and IP and one for
+ * the T_conn_ind that was sent up when the 3-way handshake
+ * completed. In the normal case we would also have another
+ * reference (making a total of 4) for the conn being in the
+ * classifier hash list. However the eager could have received
+ * an RST subsequently and tcp_closei_local could have removed
+ * the eager from the classifier hash list, hence we can't
+ * assert that reference.
+ */
+ ASSERT(econnp->conn_ref >= 3);
+
+ mutex_enter(&listener->tcp_eager_lock);
+ if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) {
+
+ tcp_t *tail;
+ tcp_t *tcp;
+ mblk_t *mp1;
+
+ tcp = listener->tcp_eager_prev_q0;
+ /*
+ * listener->tcp_eager_prev_q0 points to the TAIL of the
+ * deferred T_conn_ind queue. We need to get to the head
+ * of the queue in order to send up T_conn_ind the same
+ * order as how the 3WHS is completed.
+ */
+ while (tcp != listener) {
+ if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0 &&
+ !tcp->tcp_kssl_pending)
+ break;
+ else
+ tcp = tcp->tcp_eager_prev_q0;
+ }
+ /* None of the pending eagers can be sent up now */
+ if (tcp == listener)
+ goto no_more_eagers;
+
+ mp1 = tcp->tcp_conn.tcp_eager_conn_ind;
+ tcp->tcp_conn.tcp_eager_conn_ind = NULL;
+ /* Move from q0 to q */
+ ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
+ listener->tcp_conn_req_cnt_q0--;
+ listener->tcp_conn_req_cnt_q++;
+ tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
+ tcp->tcp_eager_prev_q0;
+ tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
+ tcp->tcp_eager_next_q0;
+ tcp->tcp_eager_prev_q0 = NULL;
+ tcp->tcp_eager_next_q0 = NULL;
+ tcp->tcp_conn_def_q0 = B_FALSE;
+
+ /* Make sure the tcp isn't in the list of droppables */
+ ASSERT(tcp->tcp_eager_next_drop_q0 == NULL &&
+ tcp->tcp_eager_prev_drop_q0 == NULL);
+
+ /*
+ * Insert at end of the queue because sockfs sends
+ * down T_CONN_RES in chronological order. Leaving
+ * the older conn indications at the front of the queue
+ * helps reduce search time.
+ */
+ tail = listener->tcp_eager_last_q;
+ if (tail != NULL) {
+ tail->tcp_eager_next_q = tcp;
+ } else {
+ listener->tcp_eager_next_q = tcp;
+ }
+ listener->tcp_eager_last_q = tcp;
+ tcp->tcp_eager_next_q = NULL;
+
+ /* Need to get inside the listener perimeter */
+ CONN_INC_REF(listener->tcp_connp);
+ SQUEUE_ENTER_ONE(listener->tcp_connp->conn_sqp, mp1,
+ tcp_send_pending, listener->tcp_connp, NULL,
+ SQ_FILL, SQTAG_TCP_SEND_PENDING);
+ }
+no_more_eagers:
+ tcp_eager_unlink(eager);
+ mutex_exit(&listener->tcp_eager_lock);
+
+ /*
+ * At this point, the eager is detached from the listener
+ * but we still have an extra ref on the eager (apart from the
+ * usual tcp references). The ref was placed in tcp_input_data
+ * before sending the conn_ind in tcp_send_conn_ind.
+ * The ref will be dropped in tcp_accept_finish().
+ */
+ SQUEUE_ENTER_ONE(econnp->conn_sqp, discon_mp, tcp_accept_finish,
+ econnp, NULL, SQ_NODRAIN, SQTAG_TCP_ACCEPT_FINISH_Q0);
+
/*
* Send the new local address also up to sockfs. There
* should already be enough space in the mp that came
@@ -1761,50 +1909,6 @@ tcp_tpi_accept(queue_t *q, mblk_t *mp)
}
/*
- * Send the newconn notification to ulp. The eager is blown off if the
- * notification fails.
- */
-static void
-tcp_ulp_newconn(conn_t *lconnp, conn_t *econnp, mblk_t *mp)
-{
- if (IPCL_IS_NONSTR(lconnp)) {
- cred_t *cr;
- pid_t cpid = NOPID;
-
- ASSERT(econnp->conn_tcp->tcp_listener == lconnp->conn_tcp);
- ASSERT(econnp->conn_tcp->tcp_saved_listener ==
- lconnp->conn_tcp);
-
- cr = msg_getcred(mp, &cpid);
-
- /* Keep the message around in case of a fallback to TPI */
- econnp->conn_tcp->tcp_conn.tcp_eager_conn_ind = mp;
- /*
- * Notify the ULP about the newconn. It is guaranteed that no
- * tcp_accept() call will be made for the eager if the
- * notification fails, so it's safe to blow it off in that
- * case.
- *
- * The upper handle will be assigned when tcp_accept() is
- * called.
- */
- if ((*lconnp->conn_upcalls->su_newconn)
- (lconnp->conn_upper_handle,
- (sock_lower_handle_t)econnp,
- &sock_tcp_downcalls, cr, cpid,
- &econnp->conn_upcalls) == NULL) {
- /* Failed to allocate a socket */
- TCPS_BUMP_MIB(lconnp->conn_tcp->tcp_tcps,
- tcpEstabResets);
- (void) tcp_eager_blowoff(lconnp->conn_tcp,
- econnp->conn_tcp->tcp_conn_req_seqnum);
- }
- } else {
- putnext(lconnp->conn_rq, mp);
- }
-}
-
-/*
* The function called through squeue to get behind listener's perimeter to
* send a deferred conn_ind.
*/
@@ -1831,7 +1935,7 @@ tcp_send_pending(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
return;
}
- tcp_ulp_newconn(lconnp, tcp->tcp_connp, mp);
+ putnext(lconnp->conn_rq, mp);
}
/*
@@ -1989,5 +2093,5 @@ tcp_send_conn_ind(void *arg, mblk_t *mp, void *arg2)
}
mutex_exit(&listener->tcp_eager_lock);
if (need_send_conn_ind)
- tcp_ulp_newconn(lconnp, tcp->tcp_connp, mp);
+ putnext(lconnp->conn_rq, mp);
}