summaryrefslogtreecommitdiff
path: root/usr/src/uts/common/inet/tcp/tcp_tpi.c
diff options
context:
space:
mode:
authorKacheong Poon <Kacheong.Poon@Sun.COM>2010-02-24 09:06:37 -0800
committerKacheong Poon <Kacheong.Poon@Sun.COM>2010-02-24 09:06:37 -0800
commit439b3dea6216344ffd0d8911f76442c317de8ca3 (patch)
tree38985c16986ffc01fae6761c01a8f7ad19b41edc /usr/src/uts/common/inet/tcp/tcp_tpi.c
parent721fffe35d40e548a5a58dc53a2ec9c6762172d9 (diff)
downloadillumos-joyent-439b3dea6216344ffd0d8911f76442c317de8ca3.tar.gz
6925635 The file tcp.c is too big
Diffstat (limited to 'usr/src/uts/common/inet/tcp/tcp_tpi.c')
-rw-r--r--usr/src/uts/common/inet/tcp/tcp_tpi.c1992
1 files changed, 1992 insertions, 0 deletions
diff --git a/usr/src/uts/common/inet/tcp/tcp_tpi.c b/usr/src/uts/common/inet/tcp/tcp_tpi.c
new file mode 100644
index 0000000000..d46a05ef08
--- /dev/null
+++ b/usr/src/uts/common/inet/tcp/tcp_tpi.c
@@ -0,0 +1,1992 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* This file contains all TCP TLI/TPI related functions */
+
+#include <sys/types.h>
+#include <sys/stream.h>
+#include <sys/strsun.h>
+#include <sys/strsubr.h>
+#include <sys/stropts.h>
+#include <sys/strlog.h>
+#define _SUN_TPI_VERSION 2
+#include <sys/tihdr.h>
+#include <sys/suntpi.h>
+#include <sys/xti_inet.h>
+#include <sys/squeue_impl.h>
+#include <sys/squeue.h>
+
+#include <inet/common.h>
+#include <inet/ip.h>
+#include <inet/tcp.h>
+#include <inet/tcp_impl.h>
+#include <inet/proto_set.h>
+
+static void tcp_accept_swap(tcp_t *, tcp_t *, tcp_t *);
+static int tcp_conprim_opt_process(tcp_t *, mblk_t *, int *, int *, int *);
+static void tcp_ulp_newconn(conn_t *, conn_t *, mblk_t *);
+
+/*
+ * Switch a TCP endpoint to pure TPI (non-socket) operation: compute its
+ * acceptor id, register the endpoint in the acceptor hash under that id,
+ * clear tcp_issocket and bump the tcp_sock_fallback statistic.
+ */
+void
+tcp_use_pure_tpi(tcp_t *tcp)
+{
+ conn_t *connp = tcp->tcp_connp;
+
+ /* The acceptor id is the read queue pointer on ILP32, else conn_dev. */
+#ifdef _ILP32
+ tcp->tcp_acceptor_id = (t_uscalar_t)connp->conn_rq;
+#else
+ tcp->tcp_acceptor_id = connp->conn_dev;
+#endif
+ /*
+ * Insert this socket into the acceptor hash.
+ * We might need it for T_CONN_RES message
+ */
+ tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp);
+
+ tcp->tcp_issocket = B_FALSE;
+ TCP_STAT(tcp->tcp_tcps, tcp_sock_fallback);
+}
+
+/*
+ * Turn mp into a TPI error ack carrying t_error/sys_error and pass it up
+ * the read side to our client.  Nothing is sent if the ack cannot be built.
+ */
+void
+tcp_err_ack(tcp_t *tcp, mblk_t *mp, int t_error, int sys_error)
+{
+	mblk_t *ackmp;
+
+	ackmp = mi_tpi_err_ack_alloc(mp, t_error, sys_error);
+	if (ackmp == NULL)
+		return;
+
+	putnext(tcp->tcp_connp->conn_rq, ackmp);
+}
+
+/*
+ * Like tcp_err_ack(), but the primitive being rejected is named explicitly
+ * by the caller instead of being taken from the message.  Builds a
+ * T_ERROR_ACK (M_PCPROTO) and sends it to our client; nothing is sent if
+ * the ack cannot be allocated.
+ */
+void
+tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive,
+    int t_error, int sys_error)
+{
+	struct T_error_ack *ackp;
+	mblk_t *ackmp;
+
+	ackmp = tpi_ack_alloc(mp, sizeof (struct T_error_ack), M_PCPROTO,
+	    T_ERROR_ACK);
+	if (ackmp == NULL)
+		return;
+
+	ackp = (struct T_error_ack *)ackmp->b_rptr;
+	ackp->ERROR_prim = primitive;
+	ackp->TLI_error = t_error;
+	ackp->UNIX_error = sys_error;
+	putnext(tcp->tcp_connp->conn_rq, ackmp);
+}
+
+/*
+ * TPI entry point for reading an option value: resolve the conn from the
+ * queue and defer to tcp_opt_get().
+ */
+int
+tcp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
+{
+	conn_t *connp = Q_TO_CONN(q);
+
+	return (tcp_opt_get(connp, level, name, ptr));
+}
+
+/*
+ * TPI entry point for setting an option: resolve the conn from the queue
+ * and pass every argument through to tcp_opt_set() unchanged.
+ */
+/* ARGSUSED */
+int
+tcp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name,
+    uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp,
+    void *thisdg_attrs, cred_t *cr)
+{
+	return (tcp_opt_set(Q_TO_CONN(q), optset_context, level, name,
+	    inlen, invalp, outlenp, outvalp, thisdg_attrs, cr));
+}
+
+/*
+ * Process the option buffer carried by a T_CONN_REQ, O_T_CONN_RES or
+ * T_CONN_RES message by handing it to tpi_optcom_buf().
+ *
+ * Returns 0 on success and -1 on failure.  The three output parameters
+ * are always written before returning, so callers may inspect them after
+ * any failure:
+ *   *t_errorp       TLI error (TBADOPT, TACCES or TSYSERR), 0 on success
+ *   *sys_errorp     errno value accompanying TSYSERR, otherwise 0
+ *   *do_disconnectp 1 if tpi_optcom_buf() reported an absolute-requirement
+ *                   failure (T_OK_ACK followed by T_DISCON_IND), else 0
+ */
+static int
+tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp, int *do_disconnectp,
+    int *t_errorp, int *sys_errorp)
+{
+	int error;
+	int is_absreq_failure;
+	t_scalar_t *opt_lenp;
+	t_scalar_t opt_offset;
+	int prim_type;
+	struct T_conn_req *tcreqp;
+	struct T_conn_res *tcresp;
+	cred_t *cr;
+
+	/*
+	 * Give the outputs defined values up front; every early return
+	 * below would otherwise leave them uninitialized for the caller.
+	 */
+	*t_errorp = 0;
+	*sys_errorp = 0;
+	*do_disconnectp = 0;
+
+	/*
+	 * All Solaris components should pass a db_credp
+	 * for this TPI message, hence we ASSERT.
+	 * But in case there is some other M_PROTO that looks
+	 * like a TPI message sent by some other kernel
+	 * component, we check and return an error.
+	 */
+	cr = msg_getcred(mp, NULL);
+	ASSERT(cr != NULL);
+	if (cr == NULL) {
+		*t_errorp = TSYSERR;
+		*sys_errorp = EINVAL;
+		return (-1);
+	}
+
+	prim_type = ((union T_primitives *)mp->b_rptr)->type;
+	ASSERT(prim_type == T_CONN_REQ || prim_type == O_T_CONN_RES ||
+	    prim_type == T_CONN_RES);
+
+	switch (prim_type) {
+	case T_CONN_REQ:
+		tcreqp = (struct T_conn_req *)mp->b_rptr;
+		opt_offset = tcreqp->OPT_offset;
+		opt_lenp = (t_scalar_t *)&tcreqp->OPT_length;
+		break;
+	case O_T_CONN_RES:
+	case T_CONN_RES:
+		tcresp = (struct T_conn_res *)mp->b_rptr;
+		opt_offset = tcresp->OPT_offset;
+		opt_lenp = (t_scalar_t *)&tcresp->OPT_length;
+		break;
+	default:
+		/*
+		 * Not a connect primitive.  The ASSERT above catches this on
+		 * DEBUG kernels; on others, fail instead of using
+		 * uninitialized option pointers.
+		 */
+		*t_errorp = TSYSERR;
+		*sys_errorp = EINVAL;
+		return (-1);
+	}
+
+	error = tpi_optcom_buf(tcp->tcp_connp->conn_wq, mp, opt_lenp,
+	    opt_offset, cr, &tcp_opt_obj,
+	    NULL, &is_absreq_failure);
+
+	switch (error) {
+	case 0:		/* no error */
+		ASSERT(is_absreq_failure == 0);
+		return (0);
+	case ENOPROTOOPT:
+		*t_errorp = TBADOPT;
+		break;
+	case EACCES:
+		*t_errorp = TACCES;
+		break;
+	default:
+		*t_errorp = TSYSERR;
+		*sys_errorp = error;
+		break;
+	}
+	if (is_absreq_failure != 0) {
+		/*
+		 * The connection request should get the local ack
+		 * T_OK_ACK and then a T_DISCON_IND.
+		 */
+		*do_disconnectp = 1;
+	}
+	return (-1);
+}
+
+/*
+ * Handle a TPI bind request (struct T_bind_req) for this endpoint.
+ *
+ * The request mblk is grown so that it can hold a T_bind_ack plus the
+ * largest address (sin6_t), the address is located (or synthesized when
+ * ADDR_length is 0), and the work is passed to tcp_do_listen() when
+ * CONIND_number > 0 or to tcp_do_bind() otherwise.  On success the same
+ * mblk is turned into a T_BIND_ACK and sent upstream; on failure an error
+ * ack is sent instead (positive error => TSYSERR + errno, negative error
+ * => negated value used as the TLI error).
+ */
+void
+tcp_tpi_bind(tcp_t *tcp, mblk_t *mp)
+{
+ int error;
+ conn_t *connp = tcp->tcp_connp;
+ struct sockaddr *sa;
+ mblk_t *mp1;
+ struct T_bind_req *tbr;
+ int backlog;
+ socklen_t len;
+ sin_t *sin;
+ sin6_t *sin6;
+ cred_t *cr;
+
+ /*
+ * All Solaris components should pass a db_credp
+ * for this TPI message, hence we ASSERT.
+ * But in case there is some other M_PROTO that looks
+ * like a TPI message sent by some other kernel
+ * component, we check and return an error.
+ */
+ cr = msg_getcred(mp, NULL);
+ ASSERT(cr != NULL);
+ if (cr == NULL) {
+ tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
+ return;
+ }
+
+ /* The message must be at least as large as a T_bind_req. */
+ ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
+ if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
+ if (connp->conn_debug) {
+ (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
+ "tcp_tpi_bind: bad req, len %u",
+ (uint_t)(mp->b_wptr - mp->b_rptr));
+ }
+ tcp_err_ack(tcp, mp, TPROTO, 0);
+ return;
+ }
+ /* Make sure the largest address fits */
+ mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1);
+ if (mp1 == NULL) {
+ tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
+ return;
+ }
+ mp = mp1;
+ tbr = (struct T_bind_req *)mp->b_rptr;
+
+ backlog = tbr->CONIND_number;
+ len = tbr->ADDR_length;
+
+ switch (len) {
+ case 0: /* request for a generic port */
+ /* Build a zeroed address of the conn's family right after tbr. */
+ tbr->ADDR_offset = sizeof (struct T_bind_req);
+ if (connp->conn_family == AF_INET) {
+ tbr->ADDR_length = sizeof (sin_t);
+ sin = (sin_t *)&tbr[1];
+ *sin = sin_null;
+ sin->sin_family = AF_INET;
+ sa = (struct sockaddr *)sin;
+ len = sizeof (sin_t);
+ mp->b_wptr = (uchar_t *)&sin[1];
+ } else {
+ ASSERT(connp->conn_family == AF_INET6);
+ tbr->ADDR_length = sizeof (sin6_t);
+ sin6 = (sin6_t *)&tbr[1];
+ *sin6 = sin6_null;
+ sin6->sin6_family = AF_INET6;
+ sa = (struct sockaddr *)sin6;
+ len = sizeof (sin6_t);
+ mp->b_wptr = (uchar_t *)&sin6[1];
+ }
+ break;
+
+ /*
+ * NOTE(review): in the two cases below sa is whatever
+ * mi_offset_param() returns and is not checked here; presumably
+ * tcp_do_bind()/tcp_do_listen() reject a NULL address - confirm.
+ */
+ case sizeof (sin_t): /* Complete IPv4 address */
+ sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset,
+ sizeof (sin_t));
+ break;
+
+ case sizeof (sin6_t): /* Complete IPv6 address */
+ sa = (struct sockaddr *)mi_offset_param(mp,
+ tbr->ADDR_offset, sizeof (sin6_t));
+ break;
+
+ default:
+ if (connp->conn_debug) {
+ (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
+ "tcp_tpi_bind: bad address length, %d",
+ tbr->ADDR_length);
+ }
+ tcp_err_ack(tcp, mp, TBADADDR, 0);
+ return;
+ }
+
+ /* A positive backlog means the endpoint is to listen, not just bind. */
+ if (backlog > 0) {
+ error = tcp_do_listen(connp, sa, len, backlog, DB_CRED(mp),
+ tbr->PRIM_type != O_T_BIND_REQ);
+ } else {
+ error = tcp_do_bind(connp, sa, len, DB_CRED(mp),
+ tbr->PRIM_type != O_T_BIND_REQ);
+ }
+ /* NOTE(review): no goto in this function targets the label below. */
+done:
+ if (error > 0) {
+ tcp_err_ack(tcp, mp, TSYSERR, error);
+ } else if (error < 0) {
+ tcp_err_ack(tcp, mp, -error, 0);
+ } else {
+ /*
+ * Update port information as sockfs/tpi needs it for checking
+ */
+ if (connp->conn_family == AF_INET) {
+ sin = (sin_t *)sa;
+ sin->sin_port = connp->conn_lport;
+ } else {
+ sin6 = (sin6_t *)sa;
+ sin6->sin6_port = connp->conn_lport;
+ }
+ /* Reuse the request mblk, in place, as the T_BIND_ACK. */
+ mp->b_datap->db_type = M_PCPROTO;
+ tbr->PRIM_type = T_BIND_ACK;
+ putnext(connp->conn_rq, mp);
+ }
+}
+
+/*
+ * tcp_tpi_unbind is called by tcp_wput_proto to handle T_UNBIND_REQ
+ * messages.  The unbind itself is done by tcp_do_unbind(); a positive
+ * result is reported as TSYSERR with that errno, a negative result is
+ * negated and reported as a TLI error, and success is answered with an
+ * M_FLUSH followed by a T_OK_ACK.
+ */
+void
+tcp_tpi_unbind(tcp_t *tcp, mblk_t *mp)
+{
+	conn_t *connp = tcp->tcp_connp;
+	mblk_t *ackmp;
+	int error;
+
+	error = tcp_do_unbind(connp);
+	if (error > 0) {
+		tcp_err_ack(tcp, mp, TSYSERR, error);
+		return;
+	}
+	if (error < 0) {
+		tcp_err_ack(tcp, mp, -error, 0);
+		return;
+	}
+
+	/* Send M_FLUSH according to TPI */
+	(void) putnextctl1(connp->conn_rq, M_FLUSH, FLUSHRW);
+
+	ackmp = mi_tpi_ok_ack_alloc(mp);
+	if (ackmp != NULL)
+		putnext(connp->conn_rq, ackmp);
+}
+
+/*
+ * STREAMS close routine for a TCP device stream.  q must be a queue whose
+ * write side has no next queue.  Always returns 0.
+ *
+ * When SO_FALLBACK is set in flags the stream never got a conn attached:
+ * its q_ptrs still hold the minor arena (write side) and dev (read side),
+ * so only the minor number is released.  Otherwise the conn is closed via
+ * tcp_close_common(), its minor number is released and one reference on
+ * the conn is dropped.
+ */
+int
+tcp_tpi_close(queue_t *q, int flags)
+{
+ conn_t *connp;
+
+ ASSERT(WR(q)->q_next == NULL);
+
+ if (flags & SO_FALLBACK) {
+ /*
+ * stream is being closed while in fallback
+ * simply free the resources that were allocated
+ */
+ inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr));
+ qprocsoff(q);
+ goto done;
+ }
+
+ connp = Q_TO_CONN(q);
+ /*
+ * We are being closed as /dev/tcp or /dev/tcp6.
+ */
+ tcp_close_common(connp, flags);
+
+ qprocsoff(q);
+ inet_minor_free(connp->conn_minor_arena, connp->conn_dev);
+
+ /*
+ * Drop IP's reference on the conn. This is the last reference
+ * on the connp if the state was less than established. If the
+ * connection has gone into timewait state, then we will have
+ * one ref for the TCP and one more ref (total of two) for the
+ * classifier connected hash list (a timewait connections stays
+ * in connected hash till closed).
+ *
+ * We can't assert the references because there might be other
+ * transient reference places because of some walkers or queued
+ * packets in squeue for the timewait state.
+ */
+ CONN_DEC_REF(connp);
+done:
+ /* Both sides of the queue pair forget their private pointer. */
+ q->q_ptr = WR(q)->q_ptr = NULL;
+ return (0);
+}
+
+/*
+ * Close routine for an acceptor STREAM (one whose write queue uses
+ * tcp_acceptor_winit).  Such a stream carries the minor arena in the
+ * write-side q_ptr and the dev in the read-side q_ptr; the minor number
+ * is released and both q_ptrs are cleared.  Always returns 0.
+ */
+int
+tcp_tpi_close_accept(queue_t *q)
+{
+ vmem_t *minor_arena;
+ dev_t conn_dev;
+ extern struct qinit tcp_acceptor_winit;
+
+ ASSERT(WR(q)->q_qinfo == &tcp_acceptor_winit);
+
+ /*
+ * We had opened an acceptor STREAM for sockfs which is
+ * now being closed due to some error.
+ */
+ qprocsoff(q);
+
+ minor_arena = (vmem_t *)WR(q)->q_ptr;
+ conn_dev = (dev_t)RD(q)->q_ptr;
+ ASSERT(minor_arena != NULL);
+ ASSERT(conn_dev != 0);
+ inet_minor_free(minor_arena, conn_dev);
+ q->q_ptr = WR(q)->q_ptr = NULL;
+ return (0);
+}
+
+/*
+ * Put a connection confirmation message upstream built from the
+ * address/flowid information with the conn and iph. Report our success or
+ * failure.
+ *
+ * tcp     - the connection being confirmed
+ * iphdr   - IP header of the packet; its version selects the address
+ *           format and, for IPv6, supplies the flow information
+ * idmp    - message whose credentials are copied onto the T_CONN_CON
+ * defermp - if non-NULL, the T_CONN_CON is returned through it instead
+ *           of being delivered here
+ * ira     - receive attributes supplying the peer cred and pid
+ *
+ * Returns B_FALSE if the T_CONN_CON message could not be allocated,
+ * B_TRUE otherwise.
+ */
+boolean_t
+tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, mblk_t *idmp,
+ mblk_t **defermp, ip_recv_attr_t *ira)
+{
+ sin_t sin;
+ sin6_t sin6;
+ mblk_t *mp;
+ char *optp = NULL;
+ int optlen = 0;
+ conn_t *connp = tcp->tcp_connp;
+
+ if (defermp != NULL)
+ *defermp = NULL;
+
+ if (tcp->tcp_conn.tcp_opts_conn_req != NULL) {
+ /*
+ * Return in T_CONN_CON results of option negotiation through
+ * the T_CONN_REQ. Note: If there is a real end-to-end option
+ * negotiation, then what is received from remote end needs
+ * to be taken into account but there is no such thing (yet?)
+ * in our TCP/IP.
+ * Note: We do not use mi_offset_param() here as
+ * tcp_opts_conn_req contents do not directly come from
+ * an application and are either generated in kernel or
+ * from user input that was already verified.
+ */
+ mp = tcp->tcp_conn.tcp_opts_conn_req;
+ optp = (char *)(mp->b_rptr +
+ ((struct T_conn_req *)mp->b_rptr)->OPT_offset);
+ optlen = (int)
+ ((struct T_conn_req *)mp->b_rptr)->OPT_length;
+ }
+
+ if (IPH_HDR_VERSION(iphdr) == IPV4_VERSION) {
+
+ /* packet is IPv4 */
+ if (connp->conn_family == AF_INET) {
+ sin = sin_null;
+ sin.sin_addr.s_addr = connp->conn_faddr_v4;
+ sin.sin_port = connp->conn_fport;
+ sin.sin_family = AF_INET;
+ mp = mi_tpi_conn_con(NULL, (char *)&sin,
+ (int)sizeof (sin_t), optp, optlen);
+ } else {
+ /* IPv4 packet on an AF_INET6 conn: report a sin6_t. */
+ sin6 = sin6_null;
+ sin6.sin6_addr = connp->conn_faddr_v6;
+ sin6.sin6_port = connp->conn_fport;
+ sin6.sin6_family = AF_INET6;
+ mp = mi_tpi_conn_con(NULL, (char *)&sin6,
+ (int)sizeof (sin6_t), optp, optlen);
+
+ }
+ } else {
+ ip6_t *ip6h = (ip6_t *)iphdr;
+
+ ASSERT(IPH_HDR_VERSION(iphdr) == IPV6_VERSION);
+ ASSERT(connp->conn_family == AF_INET6);
+ sin6 = sin6_null;
+ sin6.sin6_addr = connp->conn_faddr_v6;
+ sin6.sin6_port = connp->conn_fport;
+ sin6.sin6_family = AF_INET6;
+ sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK;
+ mp = mi_tpi_conn_con(NULL, (char *)&sin6,
+ (int)sizeof (sin6_t), optp, optlen);
+ }
+
+ if (!mp)
+ return (B_FALSE);
+
+ mblk_copycred(mp, idmp);
+
+ if (defermp == NULL) {
+ /* NOTE(review): this declaration shadows the outer connp. */
+ conn_t *connp = tcp->tcp_connp;
+ if (IPCL_IS_NONSTR(connp)) {
+ /* Non-STREAMS socket: use the upcall, drop the message. */
+ (*connp->conn_upcalls->su_connected)
+ (connp->conn_upper_handle, tcp->tcp_connid,
+ ira->ira_cred, ira->ira_cpid);
+ freemsg(mp);
+ } else {
+ if (ira->ira_cred != NULL) {
+ /* So that getpeerucred works for TPI sockfs */
+ mblk_setcred(mp, ira->ira_cred, ira->ira_cpid);
+ }
+ putnext(connp->conn_rq, mp);
+ }
+ } else {
+ *defermp = mp;
+ }
+
+ /* The saved T_CONN_REQ option results are no longer needed. */
+ if (tcp->tcp_conn.tcp_opts_conn_req != NULL)
+ tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req);
+ return (B_TRUE);
+}
+
+/*
+ * Successful connect request processing begins when our client passes
+ * a T_CONN_REQ message into tcp_wput(), which performs function calls into
+ * IP and then passes a T_OK_ACK (or T_ERROR_ACK) upstream.
+ *
+ * After various error checks are completed, tcp_tpi_connect() lays
+ * the target address and port into the composite header template.
+ * Then we ask IP for information, including a source address if we didn't
+ * already have one. Finally we prepare to send the SYN packet, and then
+ * send up the T_OK_ACK reply message.
+ *
+ * tcp - endpoint on which the connect was requested
+ * mp  - the T_CONN_REQ message; it is consumed on every path (either
+ *       reused for the ack or freed)
+ *
+ * Nothing is returned; the outcome is reported to the client with a
+ * T_OK_ACK, a T_ERROR_ACK, or T_OK_ACK followed by T_DISCON_IND.
+ */
+void
+tcp_tpi_connect(tcp_t *tcp, mblk_t *mp)
+{
+	sin_t *sin;
+	struct T_conn_req *tcr;
+	struct sockaddr *sa;
+	socklen_t len;
+	int error;
+	cred_t *cr;
+	pid_t cpid;
+	conn_t *connp = tcp->tcp_connp;
+	queue_t *q = connp->conn_wq;
+
+	/*
+	 * All Solaris components should pass a db_credp
+	 * for this TPI message, hence we ASSERT.
+	 * But in case there is some other M_PROTO that looks
+	 * like a TPI message sent by some other kernel
+	 * component, we check and return an error.
+	 */
+	cr = msg_getcred(mp, &cpid);
+	ASSERT(cr != NULL);
+	if (cr == NULL) {
+		tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
+		return;
+	}
+
+	tcr = (struct T_conn_req *)mp->b_rptr;
+
+	ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
+	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) {
+		tcp_err_ack(tcp, mp, TPROTO, 0);
+		return;
+	}
+
+	/*
+	 * Pre-allocate the T_ordrel_ind mblk so that at close time, we
+	 * will always have that to send up.  Otherwise, we need to do
+	 * special handling in case the allocation fails at that time.
+	 * If the end point is TPI, the tcp_t can be reused and the
+	 * tcp_ordrel_mp may be allocated already.
+	 */
+	if (tcp->tcp_ordrel_mp == NULL) {
+		if ((tcp->tcp_ordrel_mp = mi_tpi_ordrel_ind()) == NULL) {
+			tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
+			return;
+		}
+	}
+
+	/*
+	 * Determine packet type based on type of address passed in
+	 * the request should contain an IPv4 or IPv6 address.
+	 * Make sure that address family matches the type of
+	 * family of the address passed down.
+	 */
+	switch (tcr->DEST_length) {
+	default:
+		tcp_err_ack(tcp, mp, TBADADDR, 0);
+		return;
+
+	case (sizeof (sin_t) - sizeof (sin->sin_zero)): {
+		/*
+		 * XXX: The check for valid DEST_length was not there
+		 * in earlier releases and some buggy
+		 * TLI apps (e.g Sybase) got away with not feeding
+		 * in sin_zero part of address.
+		 * We allow that bug to keep those buggy apps humming.
+		 * Test suites require the check on DEST_length.
+		 * We construct a new mblk with valid DEST_length
+		 * free the original so the rest of the code does
+		 * not have to keep track of this special shorter
+		 * length address case.
+		 */
+		mblk_t *nmp;
+		struct T_conn_req *ntcr;
+		sin_t *nsin;
+		uchar_t *optp = NULL;
+
+		/* Get pointer to shorter address to copy from original mp */
+		sin = (sin_t *)mi_offset_param(mp, tcr->DEST_offset,
+		    tcr->DEST_length); /* extract DEST_length worth of sin_t */
+		if (sin == NULL || !OK_32PTR((char *)sin)) {
+			tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
+			return;
+		}
+
+		/*
+		 * OPT_offset/OPT_length come straight from the request and
+		 * are used below to size the new mblk and to copy from the
+		 * old one, so they get the same mi_offset_param() check as
+		 * the address.  A negative length is rejected explicitly.
+		 */
+		if (tcr->OPT_length != 0) {
+			if (tcr->OPT_length > 0) {
+				optp = (uchar_t *)mi_offset_param(mp,
+				    tcr->OPT_offset, tcr->OPT_length);
+			}
+			if (optp == NULL) {
+				tcp_err_ack(tcp, mp, TBADOPT, 0);
+				return;
+			}
+		}
+
+		nmp = allocb(sizeof (struct T_conn_req) + sizeof (sin_t) +
+		    tcr->OPT_length, BPRI_HI);
+		if (nmp == NULL) {
+			tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
+			return;
+		}
+		ntcr = (struct T_conn_req *)nmp->b_rptr;
+		bzero(ntcr, sizeof (struct T_conn_req)); /* zero fill */
+		ntcr->PRIM_type = T_CONN_REQ;
+		ntcr->DEST_length = sizeof (sin_t);
+		ntcr->DEST_offset = sizeof (struct T_conn_req);
+
+		nsin = (sin_t *)((uchar_t *)ntcr + ntcr->DEST_offset);
+		*nsin = sin_null;
+		nsin->sin_family = sin->sin_family;
+		nsin->sin_port = sin->sin_port;
+		nsin->sin_addr = sin->sin_addr;
+		/* Note:nsin->sin_zero zero-fill with sin_null assign above */
+		nmp->b_wptr = (uchar_t *)&nsin[1];
+		if (tcr->OPT_length != 0) {
+			ntcr->OPT_length = tcr->OPT_length;
+			ntcr->OPT_offset = nmp->b_wptr - nmp->b_rptr;
+			bcopy(optp, (uchar_t *)ntcr + ntcr->OPT_offset,
+			    tcr->OPT_length);
+			nmp->b_wptr += tcr->OPT_length;
+		}
+		freemsg(mp);	/* original mp freed */
+		mp = nmp;	/* re-initialize original variables */
+		tcr = ntcr;
+	}
+	/* FALLTHRU */
+
+	case sizeof (sin_t):
+		sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
+		    sizeof (sin_t));
+		len = sizeof (sin_t);
+		break;
+
+	case sizeof (sin6_t):
+		sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset,
+		    sizeof (sin6_t));
+		len = sizeof (sin6_t);
+		break;
+	}
+
+	error = proto_verify_ip_addr(connp->conn_family, sa, len);
+	if (error != 0) {
+		tcp_err_ack(tcp, mp, TSYSERR, error);
+		return;
+	}
+
+	/*
+	 * TODO: If someone in TCPS_TIME_WAIT has this dst/port we
+	 * should key on their sequence number and cut them loose.
+	 */
+
+	/*
+	 * If options passed in, feed it for verification and handling
+	 */
+	if (tcr->OPT_length != 0) {
+		mblk_t *ok_mp;
+		mblk_t *discon_mp;
+		mblk_t *conn_opts_mp;
+		int t_error, sys_error, do_disconnect;
+
+		conn_opts_mp = NULL;
+
+		if (tcp_conprim_opt_process(tcp, mp,
+		    &do_disconnect, &t_error, &sys_error) < 0) {
+			if (do_disconnect) {
+				ASSERT(t_error == 0 && sys_error == 0);
+				discon_mp = mi_tpi_discon_ind(NULL,
+				    ECONNREFUSED, 0);
+				if (!discon_mp) {
+					tcp_err_ack_prim(tcp, mp, T_CONN_REQ,
+					    TSYSERR, ENOMEM);
+					return;
+				}
+				ok_mp = mi_tpi_ok_ack_alloc(mp);
+				if (!ok_mp) {
+					/*
+					 * The disconnect indication will
+					 * never be sent; free it rather
+					 * than leak it.
+					 */
+					freemsg(discon_mp);
+					tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
+					    TSYSERR, ENOMEM);
+					return;
+				}
+				qreply(q, ok_mp);
+				qreply(q, discon_mp); /* no flush! */
+			} else {
+				ASSERT(t_error != 0);
+				tcp_err_ack_prim(tcp, mp, T_CONN_REQ, t_error,
+				    sys_error);
+			}
+			return;
+		}
+		/*
+		 * Success in setting options, the mp option buffer represented
+		 * by OPT_length/offset has been potentially modified and
+		 * contains results of option processing. We copy it in
+		 * another mp to save it for potentially influencing returning
+		 * it in T_CONN_CONN.
+		 */
+		if (tcr->OPT_length != 0) { /* there are resulting options */
+			conn_opts_mp = copyb(mp);
+			if (!conn_opts_mp) {
+				tcp_err_ack_prim(tcp, mp, T_CONN_REQ,
+				    TSYSERR, ENOMEM);
+				return;
+			}
+			ASSERT(tcp->tcp_conn.tcp_opts_conn_req == NULL);
+			tcp->tcp_conn.tcp_opts_conn_req = conn_opts_mp;
+			/*
+			 * Note:
+			 * These resulting option negotiation can include any
+			 * end-to-end negotiation options but there no such
+			 * thing (yet?) in our TCP/IP.
+			 */
+		}
+	}
+
+	/* call the non-TPI version */
+	error = tcp_do_connect(tcp->tcp_connp, sa, len, cr, cpid);
+	if (error < 0) {
+		mp = mi_tpi_err_ack_alloc(mp, -error, 0);
+	} else if (error > 0) {
+		mp = mi_tpi_err_ack_alloc(mp, TSYSERR, error);
+	} else {
+		mp = mi_tpi_ok_ack_alloc(mp);
+	}
+
+	/*
+	 * Send whichever ack was built above.  If it could not be
+	 * allocated, try to report ENOMEM with a freshly allocated ack.
+	 */
+	if (mp != NULL) {
+		putnext(connp->conn_rq, mp);
+	} else {
+		tcp_err_ack_prim(tcp, NULL, T_CONN_REQ,
+		    TSYSERR, ENOMEM);
+	}
+}
+
+/*
+ * Map our current tcp_state onto the closest TPI/TLI state and return it.
+ * An unrecognized tcp_state is logged and reported as TS_UNBND.
+ */
+static int
+tcp_tpistate(tcp_t *tcp)
+{
+	int tpi_state;
+
+	switch (tcp->tcp_state) {
+	case TCPS_IDLE:
+		tpi_state = TS_UNBND;
+		break;
+	case TCPS_LISTEN:
+		/*
+		 * Only connection indications still awaiting their
+		 * T_CONN_RES matter here, which is why q0 is not counted.
+		 */
+		tpi_state = (tcp->tcp_conn_req_cnt_q > 0) ?
+		    TS_WRES_CIND : TS_IDLE;
+		break;
+	case TCPS_BOUND:
+		tpi_state = TS_IDLE;
+		break;
+	case TCPS_SYN_SENT:
+		tpi_state = TS_WCON_CREQ;
+		break;
+	case TCPS_SYN_RCVD:
+		/*
+		 * Assumed to be the active-open SYN_RCVD: a passive
+		 * instance is detached at this stage of incoming
+		 * connection processing, so no T_info_ack request can
+		 * arrive on it.
+		 */
+		tpi_state = TS_WACK_CRES;
+		break;
+	case TCPS_ESTABLISHED:
+		tpi_state = TS_DATA_XFER;
+		break;
+	case TCPS_CLOSE_WAIT:
+		tpi_state = TS_WREQ_ORDREL;
+		break;
+	case TCPS_FIN_WAIT_1:
+	case TCPS_FIN_WAIT_2:
+		tpi_state = TS_WIND_ORDREL;
+		break;
+	case TCPS_CLOSING:
+	case TCPS_LAST_ACK:
+	case TCPS_TIME_WAIT:
+	case TCPS_CLOSED:
+		/*
+		 * TS_WACK_DREQ7 stands in for "not yet TS_IDLE".  No TPI
+		 * state matches TCPS_{CLOSING, LAST_ACK, TIME_WAIT} well;
+		 * this value maps to the TLI/XTI level state TSTATECHNG
+		 * (state is in the process of changing), which captures
+		 * what this dummy state represents.
+		 */
+		tpi_state = TS_WACK_DREQ7;
+		break;
+	default:
+		cmn_err(CE_WARN, "tcp_tpistate: strange state (%d) %s",
+		    tcp->tcp_state, tcp_display(tcp, NULL,
+		    DISP_PORT_ONLY));
+		tpi_state = TS_UNBND;
+		break;
+	}
+	return (tpi_state);
+}
+
+/*
+ * Fill *tia from the global T_info_ack template matching the conn's
+ * address family, then overwrite the per-connection fields: the current
+ * TPI state, the maximum option size and the TIDU size.
+ */
+static void
+tcp_copy_info(struct T_info_ack *tia, tcp_t *tcp)
+{
+	extern struct T_info_ack tcp_g_t_info_ack;
+	extern struct T_info_ack tcp_g_t_info_ack_v6;
+	conn_t *connp = tcp->tcp_connp;
+	tcp_stack_t *tcps = tcp->tcp_tcps;
+
+	if (connp->conn_family != AF_INET6)
+		*tia = tcp_g_t_info_ack;
+	else
+		*tia = tcp_g_t_info_ack_v6;
+
+	tia->CURRENT_state = tcp_tpistate(tcp);
+	tia->OPT_size = tcp_max_optsize;
+
+	if (tcp->tcp_mss != 0) {
+		tia->TIDU_size = tcp->tcp_mss;
+	} else if (connp->conn_ipversion == IPV4_VERSION) {
+		/* Not yet set - tcp_open does not set mss */
+		tia->TIDU_size = tcps->tcps_mss_def_ipv4;
+	} else {
+		/* Not yet set - tcp_open does not set mss */
+		tia->TIDU_size = tcps->tcps_mss_def_ipv6;
+	}
+	/* TODO: Default ETSDU is 1.  Is that correct for tcp? */
+}
+
+/*
+ * Fill in a T_capability_ack for tcp.  Only the capabilities requested in
+ * cap_bits1 (TC1_INFO, TC1_ACCEPTOR_ID) are supplied, and CAP_bits1 in the
+ * ack reports exactly which ones were.
+ */
+static void
+tcp_do_capability_ack(tcp_t *tcp, struct T_capability_ack *tcap,
+    t_uscalar_t cap_bits1)
+{
+	t_uscalar_t granted = 0;
+
+	if ((cap_bits1 & TC1_INFO) != 0) {
+		tcp_copy_info(&tcap->INFO_ack, tcp);
+		granted |= TC1_INFO;
+	}
+
+	if ((cap_bits1 & TC1_ACCEPTOR_ID) != 0) {
+		tcap->ACCEPTOR_id = tcp->tcp_acceptor_id;
+		granted |= TC1_ACCEPTOR_ID;
+	}
+
+	tcap->CAP_bits1 = granted;
+}
+
+/*
+ * Answer a T_CAPABILITY_REQ.  It is called by tcp_wput.  A request shorter
+ * than struct T_capability_req is dropped silently.  Otherwise the message
+ * is converted into a T_CAPABILITY_ACK of the same message type, filled in
+ * by tcp_do_capability_ack() (template data from tcp_g_t_info_ack, stream
+ * state from tcp_state) and sent upstream.
+ */
+void
+tcp_capability_req(tcp_t *tcp, mblk_t *mp)
+{
+	struct T_capability_req *reqp;
+	t_uscalar_t wanted;
+	mblk_t *ackmp;
+
+	if (MBLKL(mp) < sizeof (struct T_capability_req)) {
+		freemsg(mp);
+		return;
+	}
+
+	/* Read the request bits before the mblk is recycled as the ack. */
+	reqp = (struct T_capability_req *)mp->b_rptr;
+	wanted = reqp->CAP_bits1;
+
+	ackmp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack),
+	    mp->b_datap->db_type, T_CAPABILITY_ACK);
+	if (ackmp == NULL)
+		return;
+
+	tcp_do_capability_ack(tcp,
+	    (struct T_capability_ack *)ackmp->b_rptr, wanted);
+
+	putnext(tcp->tcp_connp->conn_rq, ackmp);
+}
+
+/*
+ * This routine responds to T_INFO_REQ messages.  It is called by tcp_wput.
+ * Most of the T_INFO_ACK information is copied from tcp_g_t_info_ack.
+ * The current state of the stream is copied from tcp_state.
+ *
+ * tcp - endpoint being queried
+ * mp  - the T_INFO_REQ message, reused for the T_INFO_ACK
+ *
+ * If the ack cannot be allocated, a T_ERROR_ACK (TSYSERR/ENOMEM) naming
+ * T_INFO_REQ is attempted instead.
+ */
+void
+tcp_info_req(tcp_t *tcp, mblk_t *mp)
+{
+	mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO,
+	    T_INFO_ACK);
+	if (mp == NULL) {
+		/*
+		 * mp is NULL at this point, so it cannot be handed to
+		 * tcp_err_ack() to be turned into an error ack.  Name the
+		 * primitive explicitly and let tcp_err_ack_prim() allocate
+		 * a fresh message, as the other ENOMEM paths in this file do.
+		 */
+		tcp_err_ack_prim(tcp, NULL, T_INFO_REQ, TSYSERR, ENOMEM);
+		return;
+	}
+	tcp_copy_info((struct T_info_ack *)mp->b_rptr, tcp);
+	putnext(tcp->tcp_connp->conn_rq, mp);
+}
+
+/*
+ * Respond to the TPI addr request.
+ *
+ * The request mblk is grown to hold a T_addr_ack plus two sin6_t
+ * addresses and turned into a T_ADDR_ACK.  The local address is included
+ * once tcp_state is at least TCPS_BOUND, the remote address once it is at
+ * least TCPS_SYN_RCVD; otherwise the corresponding length/offset fields
+ * stay zero.  On allocation failure a TSYSERR/ENOMEM error ack is sent.
+ */
+void
+tcp_addr_req(tcp_t *tcp, mblk_t *mp)
+{
+ struct sockaddr *sa;
+ mblk_t *ackmp;
+ struct T_addr_ack *taa;
+ conn_t *connp = tcp->tcp_connp;
+ uint_t addrlen;
+
+ /* Make it large enough for worst case */
+ ackmp = reallocb(mp, sizeof (struct T_addr_ack) +
+ 2 * sizeof (sin6_t), 1);
+ if (ackmp == NULL) {
+ tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
+ return;
+ }
+
+ taa = (struct T_addr_ack *)ackmp->b_rptr;
+
+ /* Start from an all-zero ack that contains only the header. */
+ bzero(taa, sizeof (struct T_addr_ack));
+ ackmp->b_wptr = (uchar_t *)&taa[1];
+
+ taa->PRIM_type = T_ADDR_ACK;
+ ackmp->b_datap->db_type = M_PCPROTO;
+
+ if (connp->conn_family == AF_INET)
+ addrlen = sizeof (sin_t);
+ else
+ addrlen = sizeof (sin6_t);
+
+ /*
+ * Note: Following code assumes 32 bit alignment of basic
+ * data structures like sin_t and struct T_addr_ack.
+ */
+ if (tcp->tcp_state >= TCPS_BOUND) {
+ /*
+ * Fill in local address first
+ */
+ taa->LOCADDR_offset = sizeof (*taa);
+ taa->LOCADDR_length = addrlen;
+ sa = (struct sockaddr *)&taa[1];
+ (void) conn_getsockname(connp, sa, &addrlen);
+ ackmp->b_wptr += addrlen;
+ }
+ if (tcp->tcp_state >= TCPS_SYN_RCVD) {
+ /*
+ * Fill in Remote address
+ */
+ taa->REMADDR_length = addrlen;
+ /* assumed 32-bit alignment */
+ taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length;
+ sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset);
+ (void) conn_getpeername(connp, sa, &addrlen);
+ ackmp->b_wptr += addrlen;
+ }
+ ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim);
+ putnext(tcp->tcp_connp->conn_rq, ackmp);
+}
+
+/*
+ * tcp_fallback
+ *
+ * A direct socket is falling back to using STREAMS. The queue
+ * that is being passed down was created using tcp_open() with
+ * the SO_FALLBACK flag set. As a result, the queue is not
+ * associated with a conn, and the q_ptrs instead contain the
+ * dev and minor area that should be used.
+ *
+ * The 'issocket' flag indicates whether the FireEngine
+ * optimizations should be used. The common case would be that
+ * optimizations are enabled, and they might be subsequently
+ * disabled using the _SIOCSOCKFALLBACK ioctl.
+ */
+
+/*
+ * An active connection is falling back to TPI. Gather all the information
+ * required by the STREAM head and TPI sonode and send it up.
+ *
+ * tcp         - the connection falling back
+ * stropt_mp   - mblk that is filled in as an M_SETOPTS message and sent
+ *               to the stream head
+ * q           - queue of the new stream; on entry its read-side q_ptr
+ *               holds the dev and its write-side q_ptr the minor arena
+ * issocket    - if B_FALSE the endpoint is switched to pure TPI
+ * quiesced_cb - callback invoked with the capability ack, the local and
+ *               peer addresses and the socket option flags
+ */
+void
+tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q,
+ boolean_t issocket, so_proto_quiesced_cb_t quiesced_cb)
+{
+ conn_t *connp = tcp->tcp_connp;
+ struct stroptions *stropt;
+ struct T_capability_ack tca;
+ struct sockaddr_in6 laddr, faddr;
+ socklen_t laddrlen, faddrlen;
+ short opts;
+ int error;
+ mblk_t *mp;
+
+ /* Take the dev and minor arena out of the q_ptrs before reusing them. */
+ connp->conn_dev = (dev_t)RD(q)->q_ptr;
+ connp->conn_minor_arena = WR(q)->q_ptr;
+
+ RD(q)->q_ptr = WR(q)->q_ptr = connp;
+
+ connp->conn_rq = RD(q);
+ connp->conn_wq = WR(q);
+
+ WR(q)->q_qinfo = &tcp_sock_winit;
+
+ if (!issocket)
+ tcp_use_pure_tpi(tcp);
+
+ /*
+ * free the helper stream
+ */
+ ip_free_helper_stream(connp);
+
+ /*
+ * Notify the STREAM head about options
+ */
+ DB_TYPE(stropt_mp) = M_SETOPTS;
+ stropt = (struct stroptions *)stropt_mp->b_rptr;
+ stropt_mp->b_wptr += sizeof (struct stroptions);
+ stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK;
+
+ /* Write offset: header template length, plus extra unless loopback. */
+ stropt->so_wroff = connp->conn_ht_iphc_len + (tcp->tcp_loopback ? 0 :
+ tcp->tcp_tcps->tcps_wroff_xtra);
+ if (tcp->tcp_snd_sack_ok)
+ stropt->so_wroff += TCPOPT_MAX_SACK_LEN;
+ stropt->so_hiwat = connp->conn_rcvbuf;
+ stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE);
+
+ putnext(RD(q), stropt_mp);
+
+ /*
+ * Collect the information needed to sync with the sonode
+ */
+ tcp_do_capability_ack(tcp, &tca, TC1_INFO|TC1_ACCEPTOR_ID);
+
+ laddrlen = faddrlen = sizeof (sin6_t);
+ (void) tcp_getsockname((sock_lower_handle_t)connp,
+ (struct sockaddr *)&laddr, &laddrlen, CRED());
+ error = tcp_getpeername((sock_lower_handle_t)connp,
+ (struct sockaddr *)&faddr, &faddrlen, CRED());
+ /* A failed peer-name lookup is reported as a zero-length address. */
+ if (error != 0)
+ faddrlen = 0;
+
+ opts = 0;
+ if (connp->conn_oobinline)
+ opts |= SO_OOBINLINE;
+ if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE)
+ opts |= SO_DONTROUTE;
+
+ /*
+ * Notify the socket that the protocol is now quiescent,
+ * and it's therefore safe move data from the socket
+ * to the stream head.
+ */
+ (*quiesced_cb)(connp->conn_upper_handle, q, &tca,
+ (struct sockaddr *)&laddr, laddrlen,
+ (struct sockaddr *)&faddr, faddrlen, opts);
+
+ /* Drain the pending receive list upstream, one message at a time. */
+ while ((mp = tcp->tcp_rcv_list) != NULL) {
+ tcp->tcp_rcv_list = mp->b_next;
+ mp->b_next = NULL;
+ /* We never do fallback for kernel RPC */
+ putnext(q, mp);
+ }
+ tcp->tcp_rcv_last_head = NULL;
+ tcp->tcp_rcv_last_tail = NULL;
+ tcp->tcp_rcv_cnt = 0;
+}
+
+/*
+ * An eager is falling back to TPI. All we have to do is send
+ * up a T_CONN_IND.
+ *
+ * eager         - the eager; its saved tcp_eager_conn_ind message is
+ *                 detached from it and sent up the listener's read queue
+ * direct_sockfs - if B_FALSE, the option part of the T_CONN_IND is
+ *                 cleared before the message is sent
+ */
+void
+tcp_fallback_eager(tcp_t *eager, boolean_t direct_sockfs)
+{
+ tcp_t *listener = eager->tcp_listener;
+ mblk_t *mp = eager->tcp_conn.tcp_eager_conn_ind;
+
+ ASSERT(listener != NULL);
+ ASSERT(mp != NULL);
+
+ /* The message now belongs to this function, not to the eager. */
+ eager->tcp_conn.tcp_eager_conn_ind = NULL;
+
+ /*
+ * TLI/XTI applications will get confused by
+ * sending eager as an option since it violates
+ * the option semantics. So remove the eager as
+ * option since TLI/XTI app doesn't need it anyway.
+ */
+ if (!direct_sockfs) {
+ struct T_conn_ind *conn_ind;
+
+ conn_ind = (struct T_conn_ind *)mp->b_rptr;
+ conn_ind->OPT_length = 0;
+ conn_ind->OPT_offset = 0;
+ }
+
+ /*
+ * Sockfs guarantees that the listener will not be closed
+ * during fallback. So we can safely use the listener's queue.
+ */
+ putnext(listener->tcp_connp->conn_rq, mp);
+}
+
+/*
+ * Swap information between the eager and acceptor for a TLI/XTI client.
+ * The sockfs accept is done on the acceptor stream and control goes
+ * through tcp_tli_accept() and tcp_accept()/tcp_accept_swap() is not
+ * called. In either case, both the eager and listener are in their own
+ * perimeter (squeue) and the code has to deal with potential race.
+ *
+ * See the block comment on top of tcp_accept() and tcp_tli_accept().
+ *
+ * On return the eager's conn owns the acceptor's queues, dev, minor
+ * arena, credentials, pid, zone and MAC-mode settings; the eager has been
+ * unlinked from the listener and is no longer detached, while the
+ * acceptor is marked detached and has lost one conn reference.
+ */
+static void
+tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager)
+{
+ conn_t *econnp, *aconnp;
+
+ ASSERT(eager->tcp_connp->conn_rq == listener->tcp_connp->conn_rq);
+ ASSERT(eager->tcp_detached && !acceptor->tcp_detached);
+ ASSERT(!TCP_IS_SOCKET(acceptor));
+ ASSERT(!TCP_IS_SOCKET(eager));
+ ASSERT(!TCP_IS_SOCKET(listener));
+
+ /*
+ * Trusted Extensions may need to use a security label that is
+ * different from the acceptor's label on MLP and MAC-Exempt
+ * sockets. If this is the case, the required security label
+ * already exists in econnp->conn_ixa->ixa_tsl. Since we make the
+ * acceptor stream refer to econnp we automatically get that label.
+ */
+
+ acceptor->tcp_detached = B_TRUE;
+ /*
+ * To permit stream re-use by TLI/XTI, the eager needs a copy of
+ * the acceptor id.
+ */
+ eager->tcp_acceptor_id = acceptor->tcp_acceptor_id;
+
+ /* remove eager from listen list... */
+ mutex_enter(&listener->tcp_eager_lock);
+ tcp_eager_unlink(eager);
+ ASSERT(eager->tcp_eager_next_q == NULL &&
+ eager->tcp_eager_last_q == NULL);
+ ASSERT(eager->tcp_eager_next_q0 == NULL &&
+ eager->tcp_eager_prev_q0 == NULL);
+ mutex_exit(&listener->tcp_eager_lock);
+
+ /* Point the eager's conn at the acceptor's queue pair, and back. */
+ econnp = eager->tcp_connp;
+ aconnp = acceptor->tcp_connp;
+ econnp->conn_rq = aconnp->conn_rq;
+ econnp->conn_wq = aconnp->conn_wq;
+ econnp->conn_rq->q_ptr = econnp;
+ econnp->conn_wq->q_ptr = econnp;
+
+ /*
+ * In the TLI/XTI loopback case, we are inside the listener's squeue,
+ * which might be a different squeue from our peer TCP instance.
+ * For TCP Fusion, the peer expects that whenever tcp_detached is
+ * clear, our TCP queues point to the acceptor's queues. Thus, use
+ * membar_producer() to ensure that the assignments of conn_rq/conn_wq
+ * above reach global visibility prior to the clearing of tcp_detached.
+ */
+ membar_producer();
+ eager->tcp_detached = B_FALSE;
+
+ ASSERT(eager->tcp_ack_tid == 0);
+
+ econnp->conn_dev = aconnp->conn_dev;
+ econnp->conn_minor_arena = aconnp->conn_minor_arena;
+
+ ASSERT(econnp->conn_minor_arena != NULL);
+ /* The acceptor's cred moves to the eager; the eager's old one goes. */
+ if (econnp->conn_cred != NULL)
+ crfree(econnp->conn_cred);
+ econnp->conn_cred = aconnp->conn_cred;
+ econnp->conn_ixa->ixa_cred = econnp->conn_cred;
+ aconnp->conn_cred = NULL;
+ econnp->conn_cpid = aconnp->conn_cpid;
+ ASSERT(econnp->conn_netstack == aconnp->conn_netstack);
+ ASSERT(eager->tcp_tcps == acceptor->tcp_tcps);
+
+ econnp->conn_zoneid = aconnp->conn_zoneid;
+ econnp->conn_allzones = aconnp->conn_allzones;
+ econnp->conn_ixa->ixa_zoneid = aconnp->conn_ixa->ixa_zoneid;
+
+ econnp->conn_mac_mode = aconnp->conn_mac_mode;
+ econnp->conn_zone_is_global = aconnp->conn_zone_is_global;
+ aconnp->conn_mac_mode = CONN_MAC_DEFAULT;
+
+ /* Do the IPC initialization */
+ CONN_INC_REF(econnp);
+
+ /* Done with old IPC. Drop its ref on its connp */
+ CONN_DEC_REF(aconnp);
+}
+
+/*
+ * Reply to a client's T_CONN_RES TPI message. This function
+ * is used only for a TLI/XTI listener. Sockfs instead sends T_CONN_RES
+ * on the acceptor STREAM, where it is handled by tcp_tpi_accept() and
+ * tcp_accept_common().
+ * Read the block comment on top of tcp_input_listener().
+ *
+ * listener - the TLI/XTI listening endpoint the T_CONN_RES arrived on
+ * mp - the T_CONN_RES message. On the early error paths it is handed to
+ * tcp_err_ack(); otherwise it is given to mi_tpi_ok_ack_alloc_extra()
+ * and must not be used afterwards (a private copy, mp1, is kept).
+ *
+ * Outline:
+ * 1. Check the message is at least sizeof (struct T_conn_res).
+ * 2. Resolve the acceptor: either the listener itself (ACCEPTOR_id equals
+ * the listener's own id) or an endpoint found through
+ * tcp_acceptor_hash_lookup(), which must be in TCPS_IDLE or TCPS_BOUND.
+ * 3. Find the eager on the listener's q whose tcp_conn_req_seqnum matches
+ * SEQ_number.
+ * 4. Pre-allocate discon_mp, copy mp, and build the T_OK_ACK with the
+ * eager's local address appended.
+ * 5. tcp_accept_swap() and insert the eager into the acceptor hash.
+ * 6. Process any options carried in the T_CONN_RES.
+ * 7. Send the T_OK_ACK up the listener's read queue and, if a deferred
+ * connection indication is waiting on q0, move it to q and send it up.
+ * 8. Tear down the acceptor and queue tcp_accept_finish() on the eager's
+ * squeue, passing discon_mp.
+ */
+void
+tcp_tli_accept(tcp_t *listener, mblk_t *mp)
+{
+ tcp_t *acceptor;
+ tcp_t *eager;
+ tcp_t *tcp;
+ struct T_conn_res *tcr;
+ t_uscalar_t acceptor_id;
+ t_scalar_t seqnum;
+ mblk_t *discon_mp = NULL;
+ mblk_t *ok_mp;
+ mblk_t *mp1;
+ tcp_stack_t *tcps = listener->tcp_tcps;
+ conn_t *econnp;
+
+ if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) {
+ tcp_err_ack(listener, mp, TPROTO, 0);
+ return;
+ }
+ tcr = (struct T_conn_res *)mp->b_rptr;
+
+ /*
+ * Under ILP32 the stream head points tcr->ACCEPTOR_id at the
+ * read side queue of the streams device underneath us i.e. the
+ * read side queue of 'ip'. Since we can't dereference QUEUE_ptr we
+ * look it up in the queue_hash. Under LP64 it sends down the
+ * minor_t of the accepting endpoint.
+ *
+ * Once the acceptor/eager are modified (in tcp_accept_swap) the
+ * fanout hash lock is held.
+ * This prevents any thread from entering the acceptor queue from
+ * below (since it has not been hard bound yet i.e. any inbound
+ * packets will arrive on the listener conn_t and
+ * go through the classifier).
+ * The CONN_INC_REF will prevent the acceptor from closing.
+ *
+ * XXX It is still possible for a tli application to send down data
+ * on the accepting stream while another thread calls t_accept.
+ * This should not be a problem for well-behaved applications since
+ * the T_OK_ACK is sent after the queue swapping is completed.
+ *
+ * If the accepting fd is the same as the listening fd, avoid
+ * queue hash lookup since that will return an eager listener in an
+ * already established state.
+ */
+ acceptor_id = tcr->ACCEPTOR_id;
+ mutex_enter(&listener->tcp_eager_lock);
+ if (listener->tcp_acceptor_id == acceptor_id) {
+ eager = listener->tcp_eager_next_q;
+ /* only count how many T_CONN_INDs so don't count q0 */
+ if ((listener->tcp_conn_req_cnt_q != 1) ||
+ (eager->tcp_conn_req_seqnum != tcr->SEQ_number)) {
+ mutex_exit(&listener->tcp_eager_lock);
+ tcp_err_ack(listener, mp, TBADF, 0);
+ return;
+ }
+ if (listener->tcp_conn_req_cnt_q0 != 0) {
+ /* Throw away all the eagers on q0. */
+ tcp_eager_cleanup(listener, 1);
+ }
+ if (listener->tcp_syn_defense) {
+ listener->tcp_syn_defense = B_FALSE;
+ if (listener->tcp_ip_addr_cache != NULL) {
+ kmem_free(listener->tcp_ip_addr_cache,
+ IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
+ listener->tcp_ip_addr_cache = NULL;
+ }
+ }
+ /*
+ * Transfer tcp_conn_req_max to the eager so that when
+ * a disconnect occurs we can revert the endpoint to the
+ * listen state.
+ */
+ eager->tcp_conn_req_max = listener->tcp_conn_req_max;
+ ASSERT(listener->tcp_conn_req_cnt_q0 == 0);
+ /*
+ * Get a reference on the acceptor just like the
+ * tcp_acceptor_hash_lookup below.
+ */
+ acceptor = listener;
+ CONN_INC_REF(acceptor->tcp_connp);
+ } else {
+ acceptor = tcp_acceptor_hash_lookup(acceptor_id, tcps);
+ if (acceptor == NULL) {
+ if (listener->tcp_connp->conn_debug) {
+ (void) strlog(TCP_MOD_ID, 0, 1,
+ SL_ERROR|SL_TRACE,
+ "tcp_accept: did not find acceptor 0x%x\n",
+ acceptor_id);
+ }
+ mutex_exit(&listener->tcp_eager_lock);
+ tcp_err_ack(listener, mp, TPROVMISMATCH, 0);
+ return;
+ }
+ /*
+ * Verify acceptor state. The acceptable states for an acceptor
+ * include TCPS_IDLE and TCPS_BOUND.
+ */
+ switch (acceptor->tcp_state) {
+ case TCPS_IDLE:
+ /* FALLTHRU */
+ case TCPS_BOUND:
+ break;
+ default:
+ CONN_DEC_REF(acceptor->tcp_connp);
+ mutex_exit(&listener->tcp_eager_lock);
+ tcp_err_ack(listener, mp, TOUTSTATE, 0);
+ return;
+ }
+ }
+
+ /* The listener must be in TCPS_LISTEN */
+ if (listener->tcp_state != TCPS_LISTEN) {
+ CONN_DEC_REF(acceptor->tcp_connp);
+ mutex_exit(&listener->tcp_eager_lock);
+ tcp_err_ack(listener, mp, TOUTSTATE, 0);
+ return;
+ }
+
+ /*
+ * Rendezvous with an eager connection request packet hanging off
+ * 'tcp' that has the 'seqnum' tag. We tagged the detached open
+ * tcp structure when the connection packet arrived in
+ * tcp_input_listener().
+ *
+ * The walk below follows tcp_eager_next_q starting from the listener
+ * and fails with TBADSEQ if the end of the list is reached.
+ */
+ seqnum = tcr->SEQ_number;
+ eager = listener;
+ do {
+ eager = eager->tcp_eager_next_q;
+ if (eager == NULL) {
+ CONN_DEC_REF(acceptor->tcp_connp);
+ mutex_exit(&listener->tcp_eager_lock);
+ tcp_err_ack(listener, mp, TBADSEQ, 0);
+ return;
+ }
+ } while (eager->tcp_conn_req_seqnum != seqnum);
+ mutex_exit(&listener->tcp_eager_lock);
+
+ /*
+ * At this point, both acceptor and listener have 2 ref
+ * that they begin with. Acceptor has one additional ref
+ * we placed in lookup while listener has 3 additional
+ * ref for being behind the squeue (tcp_accept() is
+ * done on listener's squeue); being in classifier hash;
+ * and eager's ref on listener.
+ */
+ ASSERT(listener->tcp_connp->conn_ref >= 5);
+ ASSERT(acceptor->tcp_connp->conn_ref >= 3);
+
+ /*
+ * The eager at this point is set in its own squeue and
+ * could easily have been killed (tcp_accept_finish will
+ * deal with that) because of a TH_RST so we can only
+ * ASSERT for a single ref.
+ */
+ ASSERT(eager->tcp_connp->conn_ref >= 1);
+
+ /*
+ * Pre allocate the discon_ind mblk also. tcp_accept_finish will
+ * use it if something failed.
+ *
+ * NOTE(review): the failure paths below drop a reference on the
+ * eager although no CONN_INC_REF on the eager is visible in this
+ * function; presumably this releases the hold taken when the eager
+ * was queued for accept - confirm.
+ */
+ discon_mp = allocb(MAX(sizeof (struct T_discon_ind),
+ sizeof (struct stroptions)), BPRI_HI);
+ if (discon_mp == NULL) {
+ CONN_DEC_REF(acceptor->tcp_connp);
+ CONN_DEC_REF(eager->tcp_connp);
+ tcp_err_ack(listener, mp, TSYSERR, ENOMEM);
+ return;
+ }
+
+ econnp = eager->tcp_connp;
+
+ /* Hold a copy of mp, in case reallocb fails */
+ if ((mp1 = copymsg(mp)) == NULL) {
+ CONN_DEC_REF(acceptor->tcp_connp);
+ CONN_DEC_REF(eager->tcp_connp);
+ freemsg(discon_mp);
+ tcp_err_ack(listener, mp, TSYSERR, ENOMEM);
+ return;
+ }
+
+ tcr = (struct T_conn_res *)mp1->b_rptr; /* tcr now refers to the copy */
+
+ /*
+ * This is an expanded version of mi_tpi_ok_ack_alloc()
+ * which allocates a larger mblk and appends the new
+ * local address to the ok_ack. The address is copied by
+ * soaccept() for getsockname().
+ */
+ {
+ int extra;
+
+ extra = (econnp->conn_family == AF_INET) ?
+ sizeof (sin_t) : sizeof (sin6_t);
+
+ /*
+ * Try to re-use mp, if possible. Otherwise, allocate
+ * an mblk and return it as ok_mp. In any case, mp
+ * is no longer usable upon return.
+ */
+ if ((ok_mp = mi_tpi_ok_ack_alloc_extra(mp, extra)) == NULL) {
+ CONN_DEC_REF(acceptor->tcp_connp);
+ CONN_DEC_REF(eager->tcp_connp);
+ freemsg(discon_mp);
+ /* Original mp has been freed by now, so use mp1 */
+ tcp_err_ack(listener, mp1, TSYSERR, ENOMEM);
+ return;
+ }
+
+ mp = NULL; /* We should never use mp after this point */
+
+ switch (extra) {
+ case sizeof (sin_t): {
+ sin_t *sin = (sin_t *)ok_mp->b_wptr;
+
+ ok_mp->b_wptr += extra;
+ sin->sin_family = AF_INET;
+ sin->sin_port = econnp->conn_lport;
+ sin->sin_addr.s_addr = econnp->conn_laddr_v4;
+ break;
+ }
+ case sizeof (sin6_t): {
+ sin6_t *sin6 = (sin6_t *)ok_mp->b_wptr;
+
+ ok_mp->b_wptr += extra;
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_port = econnp->conn_lport;
+ sin6->sin6_addr = econnp->conn_laddr_v6;
+ sin6->sin6_flowinfo = econnp->conn_flowinfo;
+ if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) &&
+ (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) {
+ sin6->sin6_scope_id =
+ econnp->conn_ixa->ixa_scopeid;
+ } else {
+ sin6->sin6_scope_id = 0;
+ }
+ sin6->__sin6_src_id = 0;
+ break;
+ }
+ default:
+ break;
+ }
+ ASSERT(ok_mp->b_wptr <= ok_mp->b_datap->db_lim);
+ }
+
+ /*
+ * If there are no options we know that the T_CONN_RES will
+ * succeed. However, we can't send the T_OK_ACK upstream until
+ * the tcp_accept_swap is done since it would be dangerous to
+ * let the application start using the new fd prior to the swap.
+ */
+ tcp_accept_swap(listener, acceptor, eager);
+
+ /*
+ * tcp_accept_swap unlinks eager from listener but does not drop
+ * the eager's reference on the listener.
+ */
+ ASSERT(eager->tcp_listener == NULL);
+ ASSERT(listener->tcp_connp->conn_ref >= 5);
+
+ /*
+ * The eager is now associated with its own queue. Insert in
+ * the hash so that the connection can be reused for a future
+ * T_CONN_RES.
+ */
+ tcp_acceptor_hash_insert(acceptor_id, eager);
+
+ /*
+ * We now do the processing of options with T_CONN_RES.
+ * We delay till now since we wanted to have queue to pass to
+ * option processing routines that points back to the right
+ * instance structure which does not happen until after
+ * tcp_accept_swap().
+ *
+ * Note:
+ * The sanity of the logic here assumes that whatever options
+ * are appropriate to inherit from listener=>eager are done
+ * before this point, and whatever were to be overridden (or not)
+ * in transfer logic from eager=>acceptor in tcp_accept_swap().
+ * [ Warning: acceptor endpoint can have T_OPTMGMT_REQ done to it
+ * before its ACCEPTOR_id comes down in T_CONN_RES ]
+ * This may not be true at this point in time but can be fixed
+ * independently. This option processing code starts with
+ * the instantiated acceptor instance and the final queue at
+ * this point.
+ */
+
+ if (tcr->OPT_length != 0) {
+ /* Options to process */
+ int t_error = 0;
+ int sys_error = 0;
+ int do_disconnect = 0;
+
+ if (tcp_conprim_opt_process(eager, mp1,
+ &do_disconnect, &t_error, &sys_error) < 0) {
+ eager->tcp_accept_error = 1;
+ if (do_disconnect) {
+ /*
+ * An option failed which does not allow
+ * connection to be accepted.
+ *
+ * We allow T_CONN_RES to succeed and
+ * put a T_DISCON_IND on the eager queue.
+ */
+ ASSERT(t_error == 0 && sys_error == 0);
+ eager->tcp_send_discon_ind = 1;
+ } else {
+ ASSERT(t_error != 0);
+ freemsg(ok_mp);
+ /*
+ * Original mp was either freed or set
+ * to ok_mp above, so use mp1 instead.
+ */
+ tcp_err_ack(listener, mp1, t_error, sys_error);
+ goto finish;
+ }
+ }
+ /*
+ * Most likely success in setting options (except if
+ * eager->tcp_send_discon_ind set).
+ * mp1 option buffer represented by OPT_length/offset
+ * potentially modified and contains results of setting
+ * options at this point
+ */
+ }
+
+ /* We no longer need mp1, since all options processing has passed */
+ freemsg(mp1);
+
+ putnext(listener->tcp_connp->conn_rq, ok_mp);
+
+ mutex_enter(&listener->tcp_eager_lock);
+ if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) {
+ tcp_t *tail;
+ mblk_t *conn_ind;
+
+ /*
+ * This path should not be executed if listener and
+ * acceptor streams are the same.
+ */
+ ASSERT(listener != acceptor);
+
+ tcp = listener->tcp_eager_prev_q0;
+ /*
+ * listener->tcp_eager_prev_q0 points to the TAIL of the
+ * deferred T_conn_ind queue. We need to get to the head of
+ * the queue in order to send up T_conn_ind the same order as
+ * how the 3WHS is completed.
+ */
+ while (tcp != listener) {
+ if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0)
+ break;
+ else
+ tcp = tcp->tcp_eager_prev_q0;
+ }
+ ASSERT(tcp != listener);
+ conn_ind = tcp->tcp_conn.tcp_eager_conn_ind;
+ ASSERT(conn_ind != NULL);
+ tcp->tcp_conn.tcp_eager_conn_ind = NULL;
+
+ /* Move from q0 to q */
+ ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
+ listener->tcp_conn_req_cnt_q0--;
+ listener->tcp_conn_req_cnt_q++;
+ tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
+ tcp->tcp_eager_prev_q0;
+ tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
+ tcp->tcp_eager_next_q0;
+ tcp->tcp_eager_prev_q0 = NULL;
+ tcp->tcp_eager_next_q0 = NULL;
+ tcp->tcp_conn_def_q0 = B_FALSE;
+
+ /* Make sure the tcp isn't in the list of droppables */
+ ASSERT(tcp->tcp_eager_next_drop_q0 == NULL &&
+ tcp->tcp_eager_prev_drop_q0 == NULL);
+
+ /*
+ * Insert at end of the queue because sockfs sends
+ * down T_CONN_RES in chronological order. Leaving
+ * the older conn indications at front of the queue
+ * helps reducing search time.
+ */
+ tail = listener->tcp_eager_last_q;
+ if (tail != NULL)
+ tail->tcp_eager_next_q = tcp;
+ else
+ listener->tcp_eager_next_q = tcp;
+ listener->tcp_eager_last_q = tcp;
+ tcp->tcp_eager_next_q = NULL;
+ mutex_exit(&listener->tcp_eager_lock);
+ putnext(tcp->tcp_connp->conn_rq, conn_ind);
+ } else {
+ mutex_exit(&listener->tcp_eager_lock);
+ }
+
+ /*
+ * Done with the acceptor - free it
+ *
+ * Note: from this point on, no access to listener should be made
+ * as listener can be equal to acceptor.
+ */
+finish:
+ ASSERT(acceptor->tcp_detached);
+ acceptor->tcp_connp->conn_rq = NULL;
+ ASSERT(!IPCL_IS_NONSTR(acceptor->tcp_connp));
+ acceptor->tcp_connp->conn_wq = NULL;
+ (void) tcp_clean_death(acceptor, 0);
+ CONN_DEC_REF(acceptor->tcp_connp);
+
+ /*
+ * We pass discon_mp to tcp_accept_finish to get on the right squeue.
+ *
+ * It will update the setting for sockfs/stream head and also take
+ * care of any data that arrived before accept() was called.
+ * In case we already received a FIN then tcp_accept_finish will send up
+ * the ordrel. It will also send up a window update if the window
+ * has opened up.
+ */
+
+ /*
+ * XXX: we currently have a problem if XTI application closes the
+ * acceptor stream in between. This problem exists in on10-gate also
+ * and is well known but nothing can be done short of major rewrite
+ * to fix it. Now it is possible to take care of it by assigning TLI/XTI
+ * eager same squeue as listener (we can distinguish non socket
+ * listeners at the time of handling a SYN in tcp_input_listener)
+ * and do most of the work that tcp_accept_finish does here itself
+ * and then get behind the acceptor squeue to access the acceptor
+ * queue.
+ */
+ /*
+ * We already have a ref on tcp so no need to do one before squeue_enter
+ */
+ SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, discon_mp,
+ tcp_accept_finish, eager->tcp_connp, NULL, SQ_FILL,
+ SQTAG_TCP_ACCEPT_FINISH);
+}
+
+
+/*
+ * This is the STREAMS entry point for T_CONN_RES coming down on
+ * Acceptor STREAM when sockfs listener does accept processing.
+ * Read the block comment on top of tcp_input_listener().
+ *
+ * q - the acceptor's write queue (its read side is obtained with RD(q))
+ * mp - the M_PROTO message; only O_T_CONN_RES and T_CONN_RES are handled,
+ * any other primitive gets a TNOTSUPPORT error ack.
+ *
+ * The eager's tcp_t pointer is copied out of the message's option area.
+ * The message is then rewritten in place as a T_OK_ACK, the eager's conn_t
+ * is attached to both sides of this queue pair, and tcp_accept_common() is
+ * called. On success the eager's local address is appended to the
+ * T_OK_ACK and the message is sent upstream. On failure an error ack is
+ * sent upstream instead, provided mi_tpi_err_ack_alloc() returns one.
+ */
+void
+tcp_tpi_accept(queue_t *q, mblk_t *mp)
+{
+ queue_t *rq = RD(q);
+ struct T_conn_res *conn_res;
+ tcp_t *eager;
+ tcp_t *listener;
+ struct T_ok_ack *ok;
+ t_scalar_t PRIM_type;
+ conn_t *econnp;
+ cred_t *cr;
+
+ ASSERT(DB_TYPE(mp) == M_PROTO);
+
+ /*
+ * All Solaris components should pass a db_credp
+ * for this TPI message, hence we ASSERT.
+ * But in case there is some other M_PROTO that looks
+ * like a TPI message sent by some other kernel
+ * component, we check and return an error.
+ */
+ cr = msg_getcred(mp, NULL);
+ ASSERT(cr != NULL);
+ if (cr == NULL) {
+ mp = mi_tpi_err_ack_alloc(mp, TSYSERR, EINVAL);
+ if (mp != NULL)
+ putnext(rq, mp);
+ return;
+ }
+ conn_res = (struct T_conn_res *)mp->b_rptr;
+ ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
+ if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_res)) {
+ mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0);
+ if (mp != NULL)
+ putnext(rq, mp);
+ return;
+ }
+ switch (conn_res->PRIM_type) {
+ case O_T_CONN_RES:
+ case T_CONN_RES:
+ /*
+ * We pass up an err ack if allocb fails. This will
+ * cause sockfs to issue a T_DISCON_REQ which will cause
+ * tcp_eager_blowoff to be called. sockfs will then call
+ * rq->q_qinfo->qi_qclose to cleanup the acceptor stream.
+ * we need to do the allocb up here because we have to
+ * make sure rq->q_qinfo->qi_qclose still points to the
+ * correct function (tcp_tpi_close_accept) in case allocb
+ * fails.
+ *
+ * NOTE(review): no allocb is visible in this function any
+ * more (mp is rewritten in place below), so the paragraph
+ * above may be stale - confirm.
+ *
+ * NOTE(review): OPT_offset/OPT_length are taken from the
+ * message without validation and OPT_length bytes are copied
+ * into the pointer-sized local 'eager'; this presumably relies
+ * on sockfs always sending exactly sizeof (eager) - confirm.
+ *
+ * NOTE(review): if tcp_accept_common() fails, mp has already
+ * had its PRIM_type overwritten with T_OK_ACK when it is
+ * passed to mi_tpi_err_ack_alloc(); confirm that the error
+ * ack still names the intended primitive.
+ */
+ bcopy(mp->b_rptr + conn_res->OPT_offset,
+ &eager, conn_res->OPT_length);
+ PRIM_type = conn_res->PRIM_type;
+ mp->b_datap->db_type = M_PCPROTO;
+ mp->b_wptr = mp->b_rptr + sizeof (struct T_ok_ack);
+ ok = (struct T_ok_ack *)mp->b_rptr;
+ ok->PRIM_type = T_OK_ACK;
+ ok->CORRECT_prim = PRIM_type;
+ econnp = eager->tcp_connp;
+ econnp->conn_dev = (dev_t)RD(q)->q_ptr;
+ econnp->conn_minor_arena = (vmem_t *)(WR(q)->q_ptr);
+ econnp->conn_rq = rq;
+ econnp->conn_wq = q;
+ rq->q_ptr = econnp;
+ rq->q_qinfo = &tcp_rinitv4; /* No open - same as rinitv6 */
+ q->q_ptr = econnp;
+ q->q_qinfo = &tcp_winit;
+ listener = eager->tcp_listener;
+
+ if (tcp_accept_common(listener->tcp_connp,
+ econnp, cr) < 0) {
+ mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0);
+ if (mp != NULL)
+ putnext(rq, mp);
+ return;
+ }
+
+ /*
+ * Send the new local address also up to sockfs. There
+ * should already be enough space in the mp that came
+ * down from soaccept().
+ */
+ if (econnp->conn_family == AF_INET) {
+ sin_t *sin;
+
+ ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >=
+ (sizeof (struct T_ok_ack) + sizeof (sin_t)));
+ sin = (sin_t *)mp->b_wptr;
+ mp->b_wptr += sizeof (sin_t);
+ sin->sin_family = AF_INET;
+ sin->sin_port = econnp->conn_lport;
+ sin->sin_addr.s_addr = econnp->conn_laddr_v4;
+ } else {
+ sin6_t *sin6;
+
+ ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >=
+ sizeof (struct T_ok_ack) + sizeof (sin6_t));
+ sin6 = (sin6_t *)mp->b_wptr;
+ mp->b_wptr += sizeof (sin6_t);
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_port = econnp->conn_lport;
+ sin6->sin6_addr = econnp->conn_laddr_v6;
+ if (econnp->conn_ipversion == IPV4_VERSION)
+ sin6->sin6_flowinfo = 0;
+ else
+ sin6->sin6_flowinfo = econnp->conn_flowinfo;
+ if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) &&
+ (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) {
+ sin6->sin6_scope_id =
+ econnp->conn_ixa->ixa_scopeid;
+ } else {
+ sin6->sin6_scope_id = 0;
+ }
+ sin6->__sin6_src_id = 0;
+ }
+
+ putnext(rq, mp);
+ return;
+ default:
+ mp = mi_tpi_err_ack_alloc(mp, TNOTSUPPORT, 0);
+ if (mp != NULL)
+ putnext(rq, mp);
+ return;
+ }
+}
+
+/*
+ * Tell the upper layer that a new connection is ready on a listener.
+ *
+ * For a STREAMS listener, mp is simply passed up the listener's read
+ * queue. For a non-STREAMS listener, mp is parked on the eager (so it is
+ * still around if there is a later fallback to TPI) and the su_newconn
+ * upcall is made; if that upcall returns NULL the eager is blown off.
+ *
+ * lconnp - conn_t of the listener
+ * econnp - conn_t of the eager being announced
+ * mp - the connection indication message
+ */
+static void
+tcp_ulp_newconn(conn_t *lconnp, conn_t *econnp, mblk_t *mp)
+{
+	cred_t *credp;
+	pid_t creator_pid = NOPID;
+
+	/* STREAMS listener: nothing more to do than send the message up. */
+	if (!IPCL_IS_NONSTR(lconnp)) {
+		putnext(lconnp->conn_rq, mp);
+		return;
+	}
+
+	ASSERT(econnp->conn_tcp->tcp_listener == lconnp->conn_tcp);
+	ASSERT(econnp->conn_tcp->tcp_saved_listener ==
+	    lconnp->conn_tcp);
+
+	credp = msg_getcred(mp, &creator_pid);
+
+	/* Keep the message around in case of a fallback to TPI */
+	econnp->conn_tcp->tcp_conn.tcp_eager_conn_ind = mp;
+
+	/*
+	 * Notify the ULP about the newconn. The upper handle will be
+	 * assigned when tcp_accept() is called, so a non-NULL return
+	 * here means we are done.
+	 */
+	if ((*lconnp->conn_upcalls->su_newconn)
+	    (lconnp->conn_upper_handle,
+	    (sock_lower_handle_t)econnp,
+	    &sock_tcp_downcalls, credp, creator_pid,
+	    &econnp->conn_upcalls) != NULL) {
+		return;
+	}
+
+	/*
+	 * Failed to allocate a socket. It is guaranteed that no
+	 * tcp_accept() call will be made for the eager if the
+	 * notification fails, so it's safe to blow it off.
+	 */
+	TCPS_BUMP_MIB(lconnp->conn_tcp->tcp_tcps,
+	    tcpEstabResets);
+	(void) tcp_eager_blowoff(lconnp->conn_tcp,
+	    econnp->conn_tcp->tcp_conn_req_seqnum);
+}
+
+/*
+ * Squeue callback used to get behind the listener's perimeter and deliver
+ * a deferred connection indication.
+ *
+ * arg - conn_t of the listener
+ * mp - the T_conn_ind message; its option area holds the eager's tcp_t
+ * pointer
+ * arg2, dummy - unused
+ */
+/* ARGSUSED */
+void
+tcp_send_pending(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
+{
+	conn_t *lconnp = (conn_t *)arg;
+	struct T_conn_ind *tci = (struct T_conn_ind *)mp->b_rptr;
+	tcp_t *eager;
+
+	/* Recover the eager pointer stashed in the option area. */
+	bcopy(mp->b_rptr + tci->OPT_offset, &eager,
+	    tci->OPT_length);
+
+	if (lconnp->conn_tcp->tcp_state == TCPS_LISTEN) {
+		tcp_ulp_newconn(lconnp, eager->tcp_connp, mp);
+		return;
+	}
+
+	/*
+	 * If listener has closed, it would have caused a
+	 * cleanup/blowoff to happen for the eager, so
+	 * we don't need to do anything more than drop the message.
+	 */
+	freemsg(mp);
+}
+
+/*
+ * Sends the T_CONN_IND to the listener. The caller calls this
+ * function via squeue to get inside the listener's perimeter
+ * once the 3 way handshake is done and a T_CONN_IND needs to be
+ * sent. As an optimization, the caller can call this directly
+ * if listener's perimeter is same as eager's.
+ *
+ * arg - conn_t of the listener
+ * mp - the T_CONN_IND; its option area carries the eager's tcp_t pointer
+ * arg2 - unused
+ *
+ * If the listener's q has room, the eager is moved from q0 to the tail of
+ * q and the indication is delivered through tcp_ulp_newconn() after the
+ * eager lock is dropped. Otherwise the eager is moved to the tail of q0,
+ * flagged with tcp_conn_def_q0, and mp is saved on it for later delivery.
+ * If the listener is no longer in TCPS_LISTEN, mp is freed and nothing
+ * else is done.
+ */
+/* ARGSUSED */
+void
+tcp_send_conn_ind(void *arg, mblk_t *mp, void *arg2)
+{
+ conn_t *lconnp = (conn_t *)arg;
+ tcp_t *listener = lconnp->conn_tcp;
+ tcp_t *tcp;
+ struct T_conn_ind *conn_ind;
+ ipaddr_t *addr_cache;
+ boolean_t need_send_conn_ind = B_FALSE;
+ tcp_stack_t *tcps = listener->tcp_tcps;
+
+ /* retrieve the eager */
+ conn_ind = (struct T_conn_ind *)mp->b_rptr;
+ ASSERT(conn_ind->OPT_offset != 0 &&
+ conn_ind->OPT_length == sizeof (intptr_t));
+ bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp,
+ conn_ind->OPT_length);
+
+ /*
+ * TLI/XTI applications will get confused by
+ * sending eager as an option since it violates
+ * the option semantics. So remove the eager as
+ * option since TLI/XTI app doesn't need it anyway.
+ */
+ if (!TCP_IS_SOCKET(listener)) {
+ conn_ind->OPT_length = 0;
+ conn_ind->OPT_offset = 0;
+ }
+ if (listener->tcp_state != TCPS_LISTEN) {
+ /*
+ * If listener has closed, it would have caused a
+ * cleanup/blowoff to happen for the eager. We
+ * just need to return.
+ */
+ freemsg(mp);
+ return;
+ }
+
+
+ /*
+ * if the conn_req_q is full defer passing up the
+ * T_CONN_IND until space is available after t_accept()
+ * processing
+ */
+ mutex_enter(&listener->tcp_eager_lock);
+
+ /*
+ * Take the eager out, if it is in the list of droppable eagers
+ * as we are here because the 3W handshake is over.
+ */
+ MAKE_UNDROPPABLE(tcp);
+
+ if (listener->tcp_conn_req_cnt_q < listener->tcp_conn_req_max) {
+ tcp_t *tail;
+
+ /*
+ * The eager already has an extra ref put in tcp_input_data
+ * so that it stays till accept comes back even though it
+ * might get into TCPS_CLOSED as a result of a TH_RST etc.
+ */
+ ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
+ listener->tcp_conn_req_cnt_q0--;
+ listener->tcp_conn_req_cnt_q++;
+
+ /* Move from SYN_RCVD to ESTABLISHED list */
+ tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
+ tcp->tcp_eager_prev_q0;
+ tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
+ tcp->tcp_eager_next_q0;
+ tcp->tcp_eager_prev_q0 = NULL;
+ tcp->tcp_eager_next_q0 = NULL;
+
+ /*
+ * Insert at end of the queue because sockfs
+ * sends down T_CONN_RES in chronological
+ * order. Leaving the older conn indications
+ * at front of the queue helps reducing search
+ * time.
+ */
+ tail = listener->tcp_eager_last_q;
+ if (tail != NULL)
+ tail->tcp_eager_next_q = tcp;
+ else
+ listener->tcp_eager_next_q = tcp;
+ listener->tcp_eager_last_q = tcp;
+ tcp->tcp_eager_next_q = NULL;
+ /*
+ * Delay sending up the T_conn_ind until we are
+ * done with the eager. Once we have sent up
+ * the T_conn_ind, the accept can potentially complete
+ * any time and release the refhold we have on the eager.
+ */
+ need_send_conn_ind = B_TRUE;
+ } else {
+ /*
+ * Defer connection on q0 and set deferred
+ * connection bit true
+ */
+ tcp->tcp_conn_def_q0 = B_TRUE;
+
+ /* take tcp out of q0 ... */
+ tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
+ tcp->tcp_eager_next_q0;
+ tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
+ tcp->tcp_eager_prev_q0;
+
+ /* ... and place it at the end of q0 */
+ tcp->tcp_eager_prev_q0 = listener->tcp_eager_prev_q0;
+ tcp->tcp_eager_next_q0 = listener;
+ listener->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp;
+ listener->tcp_eager_prev_q0 = tcp;
+ tcp->tcp_conn.tcp_eager_conn_ind = mp; /* kept for later delivery */
+ }
+
+ /* we have timed out before */
+ if (tcp->tcp_syn_rcvd_timeout != 0) {
+ tcp->tcp_syn_rcvd_timeout = 0;
+ listener->tcp_syn_rcvd_timeout--;
+ if (listener->tcp_syn_defense &&
+ listener->tcp_syn_rcvd_timeout <=
+ (tcps->tcps_conn_req_max_q0 >> 5) &&
+ 10*MINUTES < TICK_TO_MSEC(ddi_get_lbolt64() -
+ listener->tcp_last_rcv_lbolt)) {
+ /*
+ * Turn off the defense mode if we
+ * believe the SYN attack is over.
+ */
+ listener->tcp_syn_defense = B_FALSE;
+ if (listener->tcp_ip_addr_cache) {
+ kmem_free((void *)listener->tcp_ip_addr_cache,
+ IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
+ listener->tcp_ip_addr_cache = NULL;
+ }
+ }
+ }
+ addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache);
+ if (addr_cache != NULL) {
+ /*
+ * We have finished a 3-way handshake with this
+ * remote host. This proves the IP addr is good.
+ * Cache it!
+ */
+ addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] =
+ tcp->tcp_connp->conn_faddr_v4;
+ }
+ mutex_exit(&listener->tcp_eager_lock);
+ if (need_send_conn_ind)
+ tcp_ulp_newconn(lconnp, tcp->tcp_connp, mp);
+}