diff options
author | Kacheong Poon <Kacheong.Poon@Sun.COM> | 2010-02-24 09:06:37 -0800 |
---|---|---|
committer | Kacheong Poon <Kacheong.Poon@Sun.COM> | 2010-02-24 09:06:37 -0800 |
commit | 439b3dea6216344ffd0d8911f76442c317de8ca3 (patch) | |
tree | 38985c16986ffc01fae6761c01a8f7ad19b41edc /usr/src/uts/common/inet/tcp/tcp_tpi.c | |
parent | 721fffe35d40e548a5a58dc53a2ec9c6762172d9 (diff) | |
download | illumos-joyent-439b3dea6216344ffd0d8911f76442c317de8ca3.tar.gz |
6925635 The file tcp.c is too big
Diffstat (limited to 'usr/src/uts/common/inet/tcp/tcp_tpi.c')
-rw-r--r-- | usr/src/uts/common/inet/tcp/tcp_tpi.c | 1992 |
1 file changed, 1992 insertions, 0 deletions
diff --git a/usr/src/uts/common/inet/tcp/tcp_tpi.c b/usr/src/uts/common/inet/tcp/tcp_tpi.c new file mode 100644 index 0000000000..d46a05ef08 --- /dev/null +++ b/usr/src/uts/common/inet/tcp/tcp_tpi.c @@ -0,0 +1,1992 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +/* This files contains all TCP TLI/TPI related functions */ + +#include <sys/types.h> +#include <sys/stream.h> +#include <sys/strsun.h> +#include <sys/strsubr.h> +#include <sys/stropts.h> +#include <sys/strlog.h> +#define _SUN_TPI_VERSION 2 +#include <sys/tihdr.h> +#include <sys/suntpi.h> +#include <sys/xti_inet.h> +#include <sys/squeue_impl.h> +#include <sys/squeue.h> + +#include <inet/common.h> +#include <inet/ip.h> +#include <inet/tcp.h> +#include <inet/tcp_impl.h> +#include <inet/proto_set.h> + +static void tcp_accept_swap(tcp_t *, tcp_t *, tcp_t *); +static int tcp_conprim_opt_process(tcp_t *, mblk_t *, int *, int *, int *); +static void tcp_ulp_newconn(conn_t *, conn_t *, mblk_t *); + +void +tcp_use_pure_tpi(tcp_t *tcp) +{ + conn_t *connp = tcp->tcp_connp; + +#ifdef _ILP32 + tcp->tcp_acceptor_id = (t_uscalar_t)connp->conn_rq; +#else + tcp->tcp_acceptor_id = connp->conn_dev; +#endif + /* + * Insert this socket into the acceptor hash. + * We might need it for T_CONN_RES message + */ + tcp_acceptor_hash_insert(tcp->tcp_acceptor_id, tcp); + + tcp->tcp_issocket = B_FALSE; + TCP_STAT(tcp->tcp_tcps, tcp_sock_fallback); +} + +/* Shorthand to generate and send TPI error acks to our client */ +void +tcp_err_ack(tcp_t *tcp, mblk_t *mp, int t_error, int sys_error) +{ + if ((mp = mi_tpi_err_ack_alloc(mp, t_error, sys_error)) != NULL) + putnext(tcp->tcp_connp->conn_rq, mp); +} + +/* Shorthand to generate and send TPI error acks to our client */ +void +tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive, + int t_error, int sys_error) +{ + struct T_error_ack *teackp; + + if ((mp = tpi_ack_alloc(mp, sizeof (struct T_error_ack), + M_PCPROTO, T_ERROR_ACK)) != NULL) { + teackp = (struct T_error_ack *)mp->b_rptr; + teackp->ERROR_prim = primitive; + teackp->TLI_error = t_error; + teackp->UNIX_error = sys_error; + putnext(tcp->tcp_connp->conn_rq, mp); + } +} + +/* + * TCP routine to get the values of options. 
+ */ +int +tcp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr) +{ + return (tcp_opt_get(Q_TO_CONN(q), level, name, ptr)); +} + +/* ARGSUSED */ +int +tcp_tpi_opt_set(queue_t *q, uint_t optset_context, int level, int name, + uint_t inlen, uchar_t *invalp, uint_t *outlenp, uchar_t *outvalp, + void *thisdg_attrs, cred_t *cr) +{ + conn_t *connp = Q_TO_CONN(q); + + return (tcp_opt_set(connp, optset_context, level, name, inlen, invalp, + outlenp, outvalp, thisdg_attrs, cr)); +} + +static int +tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp, int *do_disconnectp, + int *t_errorp, int *sys_errorp) +{ + int error; + int is_absreq_failure; + t_scalar_t *opt_lenp; + t_scalar_t opt_offset; + int prim_type; + struct T_conn_req *tcreqp; + struct T_conn_res *tcresp; + cred_t *cr; + + /* + * All Solaris components should pass a db_credp + * for this TPI message, hence we ASSERT. + * But in case there is some other M_PROTO that looks + * like a TPI message sent by some other kernel + * component, we check and return an error. 
+ */ + cr = msg_getcred(mp, NULL); + ASSERT(cr != NULL); + if (cr == NULL) + return (-1); + + prim_type = ((union T_primitives *)mp->b_rptr)->type; + ASSERT(prim_type == T_CONN_REQ || prim_type == O_T_CONN_RES || + prim_type == T_CONN_RES); + + switch (prim_type) { + case T_CONN_REQ: + tcreqp = (struct T_conn_req *)mp->b_rptr; + opt_offset = tcreqp->OPT_offset; + opt_lenp = (t_scalar_t *)&tcreqp->OPT_length; + break; + case O_T_CONN_RES: + case T_CONN_RES: + tcresp = (struct T_conn_res *)mp->b_rptr; + opt_offset = tcresp->OPT_offset; + opt_lenp = (t_scalar_t *)&tcresp->OPT_length; + break; + } + + *t_errorp = 0; + *sys_errorp = 0; + *do_disconnectp = 0; + + error = tpi_optcom_buf(tcp->tcp_connp->conn_wq, mp, opt_lenp, + opt_offset, cr, &tcp_opt_obj, + NULL, &is_absreq_failure); + + switch (error) { + case 0: /* no error */ + ASSERT(is_absreq_failure == 0); + return (0); + case ENOPROTOOPT: + *t_errorp = TBADOPT; + break; + case EACCES: + *t_errorp = TACCES; + break; + default: + *t_errorp = TSYSERR; *sys_errorp = error; + break; + } + if (is_absreq_failure != 0) { + /* + * The connection request should get the local ack + * T_OK_ACK and then a T_DISCON_IND. + */ + *do_disconnectp = 1; + } + return (-1); +} + +void +tcp_tpi_bind(tcp_t *tcp, mblk_t *mp) +{ + int error; + conn_t *connp = tcp->tcp_connp; + struct sockaddr *sa; + mblk_t *mp1; + struct T_bind_req *tbr; + int backlog; + socklen_t len; + sin_t *sin; + sin6_t *sin6; + cred_t *cr; + + /* + * All Solaris components should pass a db_credp + * for this TPI message, hence we ASSERT. + * But in case there is some other M_PROTO that looks + * like a TPI message sent by some other kernel + * component, we check and return an error. 
+ */ + cr = msg_getcred(mp, NULL); + ASSERT(cr != NULL); + if (cr == NULL) { + tcp_err_ack(tcp, mp, TSYSERR, EINVAL); + return; + } + + ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); + if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) { + if (connp->conn_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, + "tcp_tpi_bind: bad req, len %u", + (uint_t)(mp->b_wptr - mp->b_rptr)); + } + tcp_err_ack(tcp, mp, TPROTO, 0); + return; + } + /* Make sure the largest address fits */ + mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1); + if (mp1 == NULL) { + tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); + return; + } + mp = mp1; + tbr = (struct T_bind_req *)mp->b_rptr; + + backlog = tbr->CONIND_number; + len = tbr->ADDR_length; + + switch (len) { + case 0: /* request for a generic port */ + tbr->ADDR_offset = sizeof (struct T_bind_req); + if (connp->conn_family == AF_INET) { + tbr->ADDR_length = sizeof (sin_t); + sin = (sin_t *)&tbr[1]; + *sin = sin_null; + sin->sin_family = AF_INET; + sa = (struct sockaddr *)sin; + len = sizeof (sin_t); + mp->b_wptr = (uchar_t *)&sin[1]; + } else { + ASSERT(connp->conn_family == AF_INET6); + tbr->ADDR_length = sizeof (sin6_t); + sin6 = (sin6_t *)&tbr[1]; + *sin6 = sin6_null; + sin6->sin6_family = AF_INET6; + sa = (struct sockaddr *)sin6; + len = sizeof (sin6_t); + mp->b_wptr = (uchar_t *)&sin6[1]; + } + break; + + case sizeof (sin_t): /* Complete IPv4 address */ + sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset, + sizeof (sin_t)); + break; + + case sizeof (sin6_t): /* Complete IPv6 address */ + sa = (struct sockaddr *)mi_offset_param(mp, + tbr->ADDR_offset, sizeof (sin6_t)); + break; + + default: + if (connp->conn_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, + "tcp_tpi_bind: bad address length, %d", + tbr->ADDR_length); + } + tcp_err_ack(tcp, mp, TBADADDR, 0); + return; + } + + if (backlog > 0) { + error = tcp_do_listen(connp, sa, len, backlog, DB_CRED(mp), + tbr->PRIM_type 
!= O_T_BIND_REQ); + } else { + error = tcp_do_bind(connp, sa, len, DB_CRED(mp), + tbr->PRIM_type != O_T_BIND_REQ); + } +done: + if (error > 0) { + tcp_err_ack(tcp, mp, TSYSERR, error); + } else if (error < 0) { + tcp_err_ack(tcp, mp, -error, 0); + } else { + /* + * Update port information as sockfs/tpi needs it for checking + */ + if (connp->conn_family == AF_INET) { + sin = (sin_t *)sa; + sin->sin_port = connp->conn_lport; + } else { + sin6 = (sin6_t *)sa; + sin6->sin6_port = connp->conn_lport; + } + mp->b_datap->db_type = M_PCPROTO; + tbr->PRIM_type = T_BIND_ACK; + putnext(connp->conn_rq, mp); + } +} + +/* tcp_unbind is called by tcp_wput_proto to handle T_UNBIND_REQ messages. */ +void +tcp_tpi_unbind(tcp_t *tcp, mblk_t *mp) +{ + conn_t *connp = tcp->tcp_connp; + int error; + + error = tcp_do_unbind(connp); + if (error > 0) { + tcp_err_ack(tcp, mp, TSYSERR, error); + } else if (error < 0) { + tcp_err_ack(tcp, mp, -error, 0); + } else { + /* Send M_FLUSH according to TPI */ + (void) putnextctl1(connp->conn_rq, M_FLUSH, FLUSHRW); + + mp = mi_tpi_ok_ack_alloc(mp); + if (mp != NULL) + putnext(connp->conn_rq, mp); + } +} + +int +tcp_tpi_close(queue_t *q, int flags) +{ + conn_t *connp; + + ASSERT(WR(q)->q_next == NULL); + + if (flags & SO_FALLBACK) { + /* + * stream is being closed while in fallback + * simply free the resources that were allocated + */ + inet_minor_free(WR(q)->q_ptr, (dev_t)(RD(q)->q_ptr)); + qprocsoff(q); + goto done; + } + + connp = Q_TO_CONN(q); + /* + * We are being closed as /dev/tcp or /dev/tcp6. + */ + tcp_close_common(connp, flags); + + qprocsoff(q); + inet_minor_free(connp->conn_minor_arena, connp->conn_dev); + + /* + * Drop IP's reference on the conn. This is the last reference + * on the connp if the state was less than established. 
If the + * connection has gone into timewait state, then we will have + * one ref for the TCP and one more ref (total of two) for the + * classifier connected hash list (a timewait connections stays + * in connected hash till closed). + * + * We can't assert the references because there might be other + * transient reference places because of some walkers or queued + * packets in squeue for the timewait state. + */ + CONN_DEC_REF(connp); +done: + q->q_ptr = WR(q)->q_ptr = NULL; + return (0); +} + +int +tcp_tpi_close_accept(queue_t *q) +{ + vmem_t *minor_arena; + dev_t conn_dev; + extern struct qinit tcp_acceptor_winit; + + ASSERT(WR(q)->q_qinfo == &tcp_acceptor_winit); + + /* + * We had opened an acceptor STREAM for sockfs which is + * now being closed due to some error. + */ + qprocsoff(q); + + minor_arena = (vmem_t *)WR(q)->q_ptr; + conn_dev = (dev_t)RD(q)->q_ptr; + ASSERT(minor_arena != NULL); + ASSERT(conn_dev != 0); + inet_minor_free(minor_arena, conn_dev); + q->q_ptr = WR(q)->q_ptr = NULL; + return (0); +} + +/* + * Put a connection confirmation message upstream built from the + * address/flowid information with the conn and iph. Report our success or + * failure. + */ +boolean_t +tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, mblk_t *idmp, + mblk_t **defermp, ip_recv_attr_t *ira) +{ + sin_t sin; + sin6_t sin6; + mblk_t *mp; + char *optp = NULL; + int optlen = 0; + conn_t *connp = tcp->tcp_connp; + + if (defermp != NULL) + *defermp = NULL; + + if (tcp->tcp_conn.tcp_opts_conn_req != NULL) { + /* + * Return in T_CONN_CON results of option negotiation through + * the T_CONN_REQ. Note: If there is an real end-to-end option + * negotiation, then what is received from remote end needs + * to be taken into account but there is no such thing (yet?) + * in our TCP/IP. 
+ * Note: We do not use mi_offset_param() here as + * tcp_opts_conn_req contents do not directly come from + * an application and are either generated in kernel or + * from user input that was already verified. + */ + mp = tcp->tcp_conn.tcp_opts_conn_req; + optp = (char *)(mp->b_rptr + + ((struct T_conn_req *)mp->b_rptr)->OPT_offset); + optlen = (int) + ((struct T_conn_req *)mp->b_rptr)->OPT_length; + } + + if (IPH_HDR_VERSION(iphdr) == IPV4_VERSION) { + + /* packet is IPv4 */ + if (connp->conn_family == AF_INET) { + sin = sin_null; + sin.sin_addr.s_addr = connp->conn_faddr_v4; + sin.sin_port = connp->conn_fport; + sin.sin_family = AF_INET; + mp = mi_tpi_conn_con(NULL, (char *)&sin, + (int)sizeof (sin_t), optp, optlen); + } else { + sin6 = sin6_null; + sin6.sin6_addr = connp->conn_faddr_v6; + sin6.sin6_port = connp->conn_fport; + sin6.sin6_family = AF_INET6; + mp = mi_tpi_conn_con(NULL, (char *)&sin6, + (int)sizeof (sin6_t), optp, optlen); + + } + } else { + ip6_t *ip6h = (ip6_t *)iphdr; + + ASSERT(IPH_HDR_VERSION(iphdr) == IPV6_VERSION); + ASSERT(connp->conn_family == AF_INET6); + sin6 = sin6_null; + sin6.sin6_addr = connp->conn_faddr_v6; + sin6.sin6_port = connp->conn_fport; + sin6.sin6_family = AF_INET6; + sin6.sin6_flowinfo = ip6h->ip6_vcf & ~IPV6_VERS_AND_FLOW_MASK; + mp = mi_tpi_conn_con(NULL, (char *)&sin6, + (int)sizeof (sin6_t), optp, optlen); + } + + if (!mp) + return (B_FALSE); + + mblk_copycred(mp, idmp); + + if (defermp == NULL) { + conn_t *connp = tcp->tcp_connp; + if (IPCL_IS_NONSTR(connp)) { + (*connp->conn_upcalls->su_connected) + (connp->conn_upper_handle, tcp->tcp_connid, + ira->ira_cred, ira->ira_cpid); + freemsg(mp); + } else { + if (ira->ira_cred != NULL) { + /* So that getpeerucred works for TPI sockfs */ + mblk_setcred(mp, ira->ira_cred, ira->ira_cpid); + } + putnext(connp->conn_rq, mp); + } + } else { + *defermp = mp; + } + + if (tcp->tcp_conn.tcp_opts_conn_req != NULL) + tcp_close_mpp(&tcp->tcp_conn.tcp_opts_conn_req); + return (B_TRUE); 
+} + +/* + * Successful connect request processing begins when our client passes + * a T_CONN_REQ message into tcp_wput(), which performs function calls into + * IP and the passes a T_OK_ACK (or T_ERROR_ACK upstream). + * + * After various error checks are completed, tcp_tpi_connect() lays + * the target address and port into the composite header template. + * Then we ask IP for information, including a source address if we didn't + * already have one. Finally we prepare to send the SYN packet, and then + * send up the T_OK_ACK reply message. + */ +void +tcp_tpi_connect(tcp_t *tcp, mblk_t *mp) +{ + sin_t *sin; + struct T_conn_req *tcr; + struct sockaddr *sa; + socklen_t len; + int error; + cred_t *cr; + pid_t cpid; + conn_t *connp = tcp->tcp_connp; + queue_t *q = connp->conn_wq; + + /* + * All Solaris components should pass a db_credp + * for this TPI message, hence we ASSERT. + * But in case there is some other M_PROTO that looks + * like a TPI message sent by some other kernel + * component, we check and return an error. + */ + cr = msg_getcred(mp, &cpid); + ASSERT(cr != NULL); + if (cr == NULL) { + tcp_err_ack(tcp, mp, TSYSERR, EINVAL); + return; + } + + tcr = (struct T_conn_req *)mp->b_rptr; + + ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); + if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) { + tcp_err_ack(tcp, mp, TPROTO, 0); + return; + } + + /* + * Pre-allocate the T_ordrel_ind mblk so that at close time, we + * will always have that to send up. Otherwise, we need to do + * special handling in case the allocation fails at that time. + * If the end point is TPI, the tcp_t can be reused and the + * tcp_ordrel_mp may be allocated already. + */ + if (tcp->tcp_ordrel_mp == NULL) { + if ((tcp->tcp_ordrel_mp = mi_tpi_ordrel_ind()) == NULL) { + tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); + return; + } + } + + /* + * Determine packet type based on type of address passed in + * the request should contain an IPv4 or IPv6 address. 
+ * Make sure that address family matches the type of + * family of the address passed down. + */ + switch (tcr->DEST_length) { + default: + tcp_err_ack(tcp, mp, TBADADDR, 0); + return; + + case (sizeof (sin_t) - sizeof (sin->sin_zero)): { + /* + * XXX: The check for valid DEST_length was not there + * in earlier releases and some buggy + * TLI apps (e.g Sybase) got away with not feeding + * in sin_zero part of address. + * We allow that bug to keep those buggy apps humming. + * Test suites require the check on DEST_length. + * We construct a new mblk with valid DEST_length + * free the original so the rest of the code does + * not have to keep track of this special shorter + * length address case. + */ + mblk_t *nmp; + struct T_conn_req *ntcr; + sin_t *nsin; + + nmp = allocb(sizeof (struct T_conn_req) + sizeof (sin_t) + + tcr->OPT_length, BPRI_HI); + if (nmp == NULL) { + tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); + return; + } + ntcr = (struct T_conn_req *)nmp->b_rptr; + bzero(ntcr, sizeof (struct T_conn_req)); /* zero fill */ + ntcr->PRIM_type = T_CONN_REQ; + ntcr->DEST_length = sizeof (sin_t); + ntcr->DEST_offset = sizeof (struct T_conn_req); + + nsin = (sin_t *)((uchar_t *)ntcr + ntcr->DEST_offset); + *nsin = sin_null; + /* Get pointer to shorter address to copy from original mp */ + sin = (sin_t *)mi_offset_param(mp, tcr->DEST_offset, + tcr->DEST_length); /* extract DEST_length worth of sin_t */ + if (sin == NULL || !OK_32PTR((char *)sin)) { + freemsg(nmp); + tcp_err_ack(tcp, mp, TSYSERR, EINVAL); + return; + } + nsin->sin_family = sin->sin_family; + nsin->sin_port = sin->sin_port; + nsin->sin_addr = sin->sin_addr; + /* Note:nsin->sin_zero zero-fill with sin_null assign above */ + nmp->b_wptr = (uchar_t *)&nsin[1]; + if (tcr->OPT_length != 0) { + ntcr->OPT_length = tcr->OPT_length; + ntcr->OPT_offset = nmp->b_wptr - nmp->b_rptr; + bcopy((uchar_t *)tcr + tcr->OPT_offset, + (uchar_t *)ntcr + ntcr->OPT_offset, + tcr->OPT_length); + nmp->b_wptr += tcr->OPT_length; + } 
+ freemsg(mp); /* original mp freed */ + mp = nmp; /* re-initialize original variables */ + tcr = ntcr; + } + /* FALLTHRU */ + + case sizeof (sin_t): + sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset, + sizeof (sin_t)); + len = sizeof (sin_t); + break; + + case sizeof (sin6_t): + sa = (struct sockaddr *)mi_offset_param(mp, tcr->DEST_offset, + sizeof (sin6_t)); + len = sizeof (sin6_t); + break; + } + + error = proto_verify_ip_addr(connp->conn_family, sa, len); + if (error != 0) { + tcp_err_ack(tcp, mp, TSYSERR, error); + return; + } + + /* + * TODO: If someone in TCPS_TIME_WAIT has this dst/port we + * should key on their sequence number and cut them loose. + */ + + /* + * If options passed in, feed it for verification and handling + */ + if (tcr->OPT_length != 0) { + mblk_t *ok_mp; + mblk_t *discon_mp; + mblk_t *conn_opts_mp; + int t_error, sys_error, do_disconnect; + + conn_opts_mp = NULL; + + if (tcp_conprim_opt_process(tcp, mp, + &do_disconnect, &t_error, &sys_error) < 0) { + if (do_disconnect) { + ASSERT(t_error == 0 && sys_error == 0); + discon_mp = mi_tpi_discon_ind(NULL, + ECONNREFUSED, 0); + if (!discon_mp) { + tcp_err_ack_prim(tcp, mp, T_CONN_REQ, + TSYSERR, ENOMEM); + return; + } + ok_mp = mi_tpi_ok_ack_alloc(mp); + if (!ok_mp) { + tcp_err_ack_prim(tcp, NULL, T_CONN_REQ, + TSYSERR, ENOMEM); + return; + } + qreply(q, ok_mp); + qreply(q, discon_mp); /* no flush! */ + } else { + ASSERT(t_error != 0); + tcp_err_ack_prim(tcp, mp, T_CONN_REQ, t_error, + sys_error); + } + return; + } + /* + * Success in setting options, the mp option buffer represented + * by OPT_length/offset has been potentially modified and + * contains results of option processing. We copy it in + * another mp to save it for potentially influencing returning + * it in T_CONN_CONN. 
+ */ + if (tcr->OPT_length != 0) { /* there are resulting options */ + conn_opts_mp = copyb(mp); + if (!conn_opts_mp) { + tcp_err_ack_prim(tcp, mp, T_CONN_REQ, + TSYSERR, ENOMEM); + return; + } + ASSERT(tcp->tcp_conn.tcp_opts_conn_req == NULL); + tcp->tcp_conn.tcp_opts_conn_req = conn_opts_mp; + /* + * Note: + * These resulting option negotiation can include any + * end-to-end negotiation options but there no such + * thing (yet?) in our TCP/IP. + */ + } + } + + /* call the non-TPI version */ + error = tcp_do_connect(tcp->tcp_connp, sa, len, cr, cpid); + if (error < 0) { + mp = mi_tpi_err_ack_alloc(mp, -error, 0); + } else if (error > 0) { + mp = mi_tpi_err_ack_alloc(mp, TSYSERR, error); + } else { + mp = mi_tpi_ok_ack_alloc(mp); + } + + /* + * Note: Code below is the "failure" case + */ + /* return error ack and blow away saved option results if any */ +connect_failed: + if (mp != NULL) + putnext(connp->conn_rq, mp); + else { + tcp_err_ack_prim(tcp, NULL, T_CONN_REQ, + TSYSERR, ENOMEM); + } +} + +/* Return the TPI/TLI equivalent of our current tcp_state */ +static int +tcp_tpistate(tcp_t *tcp) +{ + switch (tcp->tcp_state) { + case TCPS_IDLE: + return (TS_UNBND); + case TCPS_LISTEN: + /* + * Return whether there are outstanding T_CONN_IND waiting + * for the matching T_CONN_RES. Therefore don't count q0. + */ + if (tcp->tcp_conn_req_cnt_q > 0) + return (TS_WRES_CIND); + else + return (TS_IDLE); + case TCPS_BOUND: + return (TS_IDLE); + case TCPS_SYN_SENT: + return (TS_WCON_CREQ); + case TCPS_SYN_RCVD: + /* + * Note: assumption: this has to the active open SYN_RCVD. + * The passive instance is detached in SYN_RCVD stage of + * incoming connection processing so we cannot get request + * for T_info_ack on it. 
+ */ + return (TS_WACK_CRES); + case TCPS_ESTABLISHED: + return (TS_DATA_XFER); + case TCPS_CLOSE_WAIT: + return (TS_WREQ_ORDREL); + case TCPS_FIN_WAIT_1: + return (TS_WIND_ORDREL); + case TCPS_FIN_WAIT_2: + return (TS_WIND_ORDREL); + + case TCPS_CLOSING: + case TCPS_LAST_ACK: + case TCPS_TIME_WAIT: + case TCPS_CLOSED: + /* + * Following TS_WACK_DREQ7 is a rendition of "not + * yet TS_IDLE" TPI state. There is no best match to any + * TPI state for TCPS_{CLOSING, LAST_ACK, TIME_WAIT} but we + * choose a value chosen that will map to TLI/XTI level + * state of TSTATECHNG (state is process of changing) which + * captures what this dummy state represents. + */ + return (TS_WACK_DREQ7); + default: + cmn_err(CE_WARN, "tcp_tpistate: strange state (%d) %s", + tcp->tcp_state, tcp_display(tcp, NULL, + DISP_PORT_ONLY)); + return (TS_UNBND); + } +} + +static void +tcp_copy_info(struct T_info_ack *tia, tcp_t *tcp) +{ + tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; + extern struct T_info_ack tcp_g_t_info_ack; + extern struct T_info_ack tcp_g_t_info_ack_v6; + + if (connp->conn_family == AF_INET6) + *tia = tcp_g_t_info_ack_v6; + else + *tia = tcp_g_t_info_ack; + tia->CURRENT_state = tcp_tpistate(tcp); + tia->OPT_size = tcp_max_optsize; + if (tcp->tcp_mss == 0) { + /* Not yet set - tcp_open does not set mss */ + if (connp->conn_ipversion == IPV4_VERSION) + tia->TIDU_size = tcps->tcps_mss_def_ipv4; + else + tia->TIDU_size = tcps->tcps_mss_def_ipv6; + } else { + tia->TIDU_size = tcp->tcp_mss; + } + /* TODO: Default ETSDU is 1. Is that correct for tcp? 
*/ +} + +static void +tcp_do_capability_ack(tcp_t *tcp, struct T_capability_ack *tcap, + t_uscalar_t cap_bits1) +{ + tcap->CAP_bits1 = 0; + + if (cap_bits1 & TC1_INFO) { + tcp_copy_info(&tcap->INFO_ack, tcp); + tcap->CAP_bits1 |= TC1_INFO; + } + + if (cap_bits1 & TC1_ACCEPTOR_ID) { + tcap->ACCEPTOR_id = tcp->tcp_acceptor_id; + tcap->CAP_bits1 |= TC1_ACCEPTOR_ID; + } + +} + +/* + * This routine responds to T_CAPABILITY_REQ messages. It is called by + * tcp_wput. Much of the T_CAPABILITY_ACK information is copied from + * tcp_g_t_info_ack. The current state of the stream is copied from + * tcp_state. + */ +void +tcp_capability_req(tcp_t *tcp, mblk_t *mp) +{ + t_uscalar_t cap_bits1; + struct T_capability_ack *tcap; + + if (MBLKL(mp) < sizeof (struct T_capability_req)) { + freemsg(mp); + return; + } + + cap_bits1 = ((struct T_capability_req *)mp->b_rptr)->CAP_bits1; + + mp = tpi_ack_alloc(mp, sizeof (struct T_capability_ack), + mp->b_datap->db_type, T_CAPABILITY_ACK); + if (mp == NULL) + return; + + tcap = (struct T_capability_ack *)mp->b_rptr; + tcp_do_capability_ack(tcp, tcap, cap_bits1); + + putnext(tcp->tcp_connp->conn_rq, mp); +} + +/* + * This routine responds to T_INFO_REQ messages. It is called by tcp_wput. + * Most of the T_INFO_ACK information is copied from tcp_g_t_info_ack. + * The current state of the stream is copied from tcp_state. 
+ */ +void +tcp_info_req(tcp_t *tcp, mblk_t *mp) +{ + mp = tpi_ack_alloc(mp, sizeof (struct T_info_ack), M_PCPROTO, + T_INFO_ACK); + if (!mp) { + tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); + return; + } + tcp_copy_info((struct T_info_ack *)mp->b_rptr, tcp); + putnext(tcp->tcp_connp->conn_rq, mp); +} + +/* Respond to the TPI addr request */ +void +tcp_addr_req(tcp_t *tcp, mblk_t *mp) +{ + struct sockaddr *sa; + mblk_t *ackmp; + struct T_addr_ack *taa; + conn_t *connp = tcp->tcp_connp; + uint_t addrlen; + + /* Make it large enough for worst case */ + ackmp = reallocb(mp, sizeof (struct T_addr_ack) + + 2 * sizeof (sin6_t), 1); + if (ackmp == NULL) { + tcp_err_ack(tcp, mp, TSYSERR, ENOMEM); + return; + } + + taa = (struct T_addr_ack *)ackmp->b_rptr; + + bzero(taa, sizeof (struct T_addr_ack)); + ackmp->b_wptr = (uchar_t *)&taa[1]; + + taa->PRIM_type = T_ADDR_ACK; + ackmp->b_datap->db_type = M_PCPROTO; + + if (connp->conn_family == AF_INET) + addrlen = sizeof (sin_t); + else + addrlen = sizeof (sin6_t); + + /* + * Note: Following code assumes 32 bit alignment of basic + * data structures like sin_t and struct T_addr_ack. + */ + if (tcp->tcp_state >= TCPS_BOUND) { + /* + * Fill in local address first + */ + taa->LOCADDR_offset = sizeof (*taa); + taa->LOCADDR_length = addrlen; + sa = (struct sockaddr *)&taa[1]; + (void) conn_getsockname(connp, sa, &addrlen); + ackmp->b_wptr += addrlen; + } + if (tcp->tcp_state >= TCPS_SYN_RCVD) { + /* + * Fill in Remote address + */ + taa->REMADDR_length = addrlen; + /* assumed 32-bit alignment */ + taa->REMADDR_offset = taa->LOCADDR_offset + taa->LOCADDR_length; + sa = (struct sockaddr *)(ackmp->b_rptr + taa->REMADDR_offset); + (void) conn_getpeername(connp, sa, &addrlen); + ackmp->b_wptr += addrlen; + } + ASSERT(ackmp->b_wptr <= ackmp->b_datap->db_lim); + putnext(tcp->tcp_connp->conn_rq, ackmp); +} + +/* + * tcp_fallback + * + * A direct socket is falling back to using STREAMS. 
The queue + * that is being passed down was created using tcp_open() with + * the SO_FALLBACK flag set. As a result, the queue is not + * associated with a conn, and the q_ptrs instead contain the + * dev and minor area that should be used. + * + * The 'issocket' flag indicates whether the FireEngine + * optimizations should be used. The common case would be that + * optimizations are enabled, and they might be subsequently + * disabled using the _SIOCSOCKFALLBACK ioctl. + */ + +/* + * An active connection is falling back to TPI. Gather all the information + * required by the STREAM head and TPI sonode and send it up. + */ +void +tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q, + boolean_t issocket, so_proto_quiesced_cb_t quiesced_cb) +{ + conn_t *connp = tcp->tcp_connp; + struct stroptions *stropt; + struct T_capability_ack tca; + struct sockaddr_in6 laddr, faddr; + socklen_t laddrlen, faddrlen; + short opts; + int error; + mblk_t *mp; + + connp->conn_dev = (dev_t)RD(q)->q_ptr; + connp->conn_minor_arena = WR(q)->q_ptr; + + RD(q)->q_ptr = WR(q)->q_ptr = connp; + + connp->conn_rq = RD(q); + connp->conn_wq = WR(q); + + WR(q)->q_qinfo = &tcp_sock_winit; + + if (!issocket) + tcp_use_pure_tpi(tcp); + + /* + * free the helper stream + */ + ip_free_helper_stream(connp); + + /* + * Notify the STREAM head about options + */ + DB_TYPE(stropt_mp) = M_SETOPTS; + stropt = (struct stroptions *)stropt_mp->b_rptr; + stropt_mp->b_wptr += sizeof (struct stroptions); + stropt->so_flags = SO_HIWAT | SO_WROFF | SO_MAXBLK; + + stropt->so_wroff = connp->conn_ht_iphc_len + (tcp->tcp_loopback ? 
0 : + tcp->tcp_tcps->tcps_wroff_xtra); + if (tcp->tcp_snd_sack_ok) + stropt->so_wroff += TCPOPT_MAX_SACK_LEN; + stropt->so_hiwat = connp->conn_rcvbuf; + stropt->so_maxblk = tcp_maxpsz_set(tcp, B_FALSE); + + putnext(RD(q), stropt_mp); + + /* + * Collect the information needed to sync with the sonode + */ + tcp_do_capability_ack(tcp, &tca, TC1_INFO|TC1_ACCEPTOR_ID); + + laddrlen = faddrlen = sizeof (sin6_t); + (void) tcp_getsockname((sock_lower_handle_t)connp, + (struct sockaddr *)&laddr, &laddrlen, CRED()); + error = tcp_getpeername((sock_lower_handle_t)connp, + (struct sockaddr *)&faddr, &faddrlen, CRED()); + if (error != 0) + faddrlen = 0; + + opts = 0; + if (connp->conn_oobinline) + opts |= SO_OOBINLINE; + if (connp->conn_ixa->ixa_flags & IXAF_DONTROUTE) + opts |= SO_DONTROUTE; + + /* + * Notify the socket that the protocol is now quiescent, + * and it's therefore safe move data from the socket + * to the stream head. + */ + (*quiesced_cb)(connp->conn_upper_handle, q, &tca, + (struct sockaddr *)&laddr, laddrlen, + (struct sockaddr *)&faddr, faddrlen, opts); + + while ((mp = tcp->tcp_rcv_list) != NULL) { + tcp->tcp_rcv_list = mp->b_next; + mp->b_next = NULL; + /* We never do fallback for kernel RPC */ + putnext(q, mp); + } + tcp->tcp_rcv_last_head = NULL; + tcp->tcp_rcv_last_tail = NULL; + tcp->tcp_rcv_cnt = 0; +} + +/* + * An eager is falling back to TPI. All we have to do is send + * up a T_CONN_IND. + */ +void +tcp_fallback_eager(tcp_t *eager, boolean_t direct_sockfs) +{ + tcp_t *listener = eager->tcp_listener; + mblk_t *mp = eager->tcp_conn.tcp_eager_conn_ind; + + ASSERT(listener != NULL); + ASSERT(mp != NULL); + + eager->tcp_conn.tcp_eager_conn_ind = NULL; + + /* + * TLI/XTI applications will get confused by + * sending eager as an option since it violates + * the option semantics. So remove the eager as + * option since TLI/XTI app doesn't need it anyway. 
+ */ + if (!direct_sockfs) { + struct T_conn_ind *conn_ind; + + conn_ind = (struct T_conn_ind *)mp->b_rptr; + conn_ind->OPT_length = 0; + conn_ind->OPT_offset = 0; + } + + /* + * Sockfs guarantees that the listener will not be closed + * during fallback. So we can safely use the listener's queue. + */ + putnext(listener->tcp_connp->conn_rq, mp); +} + +/* + * Swap information between the eager and acceptor for a TLI/XTI client. + * The sockfs accept is done on the acceptor stream and control goes + * through tcp_tli_accept() and tcp_accept()/tcp_accept_swap() is not + * called. In either case, both the eager and listener are in their own + * perimeter (squeue) and the code has to deal with potential race. + * + * See the block comment on top of tcp_accept() and tcp_tli_accept(). + */ +static void +tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager) +{ + conn_t *econnp, *aconnp; + + ASSERT(eager->tcp_connp->conn_rq == listener->tcp_connp->conn_rq); + ASSERT(eager->tcp_detached && !acceptor->tcp_detached); + ASSERT(!TCP_IS_SOCKET(acceptor)); + ASSERT(!TCP_IS_SOCKET(eager)); + ASSERT(!TCP_IS_SOCKET(listener)); + + /* + * Trusted Extensions may need to use a security label that is + * different from the acceptor's label on MLP and MAC-Exempt + * sockets. If this is the case, the required security label + * already exists in econnp->conn_ixa->ixa_tsl. Since we make the + * acceptor stream refer to econnp we atomatically get that label. + */ + + acceptor->tcp_detached = B_TRUE; + /* + * To permit stream re-use by TLI/XTI, the eager needs a copy of + * the acceptor id. + */ + eager->tcp_acceptor_id = acceptor->tcp_acceptor_id; + + /* remove eager from listen list... 
*/ + mutex_enter(&listener->tcp_eager_lock); + tcp_eager_unlink(eager); + ASSERT(eager->tcp_eager_next_q == NULL && + eager->tcp_eager_last_q == NULL); + ASSERT(eager->tcp_eager_next_q0 == NULL && + eager->tcp_eager_prev_q0 == NULL); + mutex_exit(&listener->tcp_eager_lock); + + econnp = eager->tcp_connp; + aconnp = acceptor->tcp_connp; + econnp->conn_rq = aconnp->conn_rq; + econnp->conn_wq = aconnp->conn_wq; + econnp->conn_rq->q_ptr = econnp; + econnp->conn_wq->q_ptr = econnp; + + /* + * In the TLI/XTI loopback case, we are inside the listener's squeue, + * which might be a different squeue from our peer TCP instance. + * For TCP Fusion, the peer expects that whenever tcp_detached is + * clear, our TCP queues point to the acceptor's queues. Thus, use + * membar_producer() to ensure that the assignments of conn_rq/conn_wq + * above reach global visibility prior to the clearing of tcp_detached. + */ + membar_producer(); + eager->tcp_detached = B_FALSE; + + ASSERT(eager->tcp_ack_tid == 0); + + econnp->conn_dev = aconnp->conn_dev; + econnp->conn_minor_arena = aconnp->conn_minor_arena; + + ASSERT(econnp->conn_minor_arena != NULL); + if (econnp->conn_cred != NULL) + crfree(econnp->conn_cred); + econnp->conn_cred = aconnp->conn_cred; + econnp->conn_ixa->ixa_cred = econnp->conn_cred; + aconnp->conn_cred = NULL; + econnp->conn_cpid = aconnp->conn_cpid; + ASSERT(econnp->conn_netstack == aconnp->conn_netstack); + ASSERT(eager->tcp_tcps == acceptor->tcp_tcps); + + econnp->conn_zoneid = aconnp->conn_zoneid; + econnp->conn_allzones = aconnp->conn_allzones; + econnp->conn_ixa->ixa_zoneid = aconnp->conn_ixa->ixa_zoneid; + + econnp->conn_mac_mode = aconnp->conn_mac_mode; + econnp->conn_zone_is_global = aconnp->conn_zone_is_global; + aconnp->conn_mac_mode = CONN_MAC_DEFAULT; + + /* Do the IPC initialization */ + CONN_INC_REF(econnp); + + /* Done with old IPC. Drop its ref on its connp */ + CONN_DEC_REF(aconnp); +} + +/* + * Reply to a clients T_CONN_RES TPI message. 
This function
+ * is used only for TLI/XTI listener. Sockfs sends T_CONN_RES
+ * on the acceptor STREAM and processed in tcp_accept_common().
+ * Read the block comment on top of tcp_input_listener().
+ */
+void
+tcp_tli_accept(tcp_t *listener, mblk_t *mp)
+{
+ tcp_t *acceptor;
+ tcp_t *eager;
+ tcp_t *tcp;
+ struct T_conn_res *tcr;
+ t_uscalar_t acceptor_id;
+ t_scalar_t seqnum;
+ mblk_t *discon_mp = NULL;
+ mblk_t *ok_mp;
+ mblk_t *mp1;
+ tcp_stack_t *tcps = listener->tcp_tcps;
+ conn_t *econnp;
+
+ if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) {
+ tcp_err_ack(listener, mp, TPROTO, 0);
+ return;
+ }
+ tcr = (struct T_conn_res *)mp->b_rptr;
+
+ /*
+ * Under ILP32 the stream head points tcr->ACCEPTOR_id at the
+ * read side queue of the streams device underneath us i.e. the
+ * read side queue of 'ip'. Since we can't dereference QUEUE_ptr we
+ * look it up in the queue_hash. Under LP64 it sends down the
+ * minor_t of the accepting endpoint.
+ *
+ * Once the acceptor/eager are modified (in tcp_accept_swap) the
+ * fanout hash lock is held.
+ * This prevents any thread from entering the acceptor queue from
+ * below (since it has not been hard bound yet i.e. any inbound
+ * packets will arrive on the listener conn_t and
+ * go through the classifier).
+ * The CONN_INC_REF will prevent the acceptor from closing.
+ *
+ * XXX It is still possible for a tli application to send down data
+ * on the accepting stream while another thread calls t_accept.
+ * This should not be a problem for well-behaved applications since
+ * the T_OK_ACK is sent after the queue swapping is completed.
+ *
+ * If the accepting fd is the same as the listening fd, avoid
+ * queue hash lookup since that will return an eager listener in an
+ * already established state.
+ */ + acceptor_id = tcr->ACCEPTOR_id; + mutex_enter(&listener->tcp_eager_lock); + if (listener->tcp_acceptor_id == acceptor_id) { + eager = listener->tcp_eager_next_q; + /* only count how many T_CONN_INDs so don't count q0 */ + if ((listener->tcp_conn_req_cnt_q != 1) || + (eager->tcp_conn_req_seqnum != tcr->SEQ_number)) { + mutex_exit(&listener->tcp_eager_lock); + tcp_err_ack(listener, mp, TBADF, 0); + return; + } + if (listener->tcp_conn_req_cnt_q0 != 0) { + /* Throw away all the eagers on q0. */ + tcp_eager_cleanup(listener, 1); + } + if (listener->tcp_syn_defense) { + listener->tcp_syn_defense = B_FALSE; + if (listener->tcp_ip_addr_cache != NULL) { + kmem_free(listener->tcp_ip_addr_cache, + IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t)); + listener->tcp_ip_addr_cache = NULL; + } + } + /* + * Transfer tcp_conn_req_max to the eager so that when + * a disconnect occurs we can revert the endpoint to the + * listen state. + */ + eager->tcp_conn_req_max = listener->tcp_conn_req_max; + ASSERT(listener->tcp_conn_req_cnt_q0 == 0); + /* + * Get a reference on the acceptor just like the + * tcp_acceptor_hash_lookup below. + */ + acceptor = listener; + CONN_INC_REF(acceptor->tcp_connp); + } else { + acceptor = tcp_acceptor_hash_lookup(acceptor_id, tcps); + if (acceptor == NULL) { + if (listener->tcp_connp->conn_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, + SL_ERROR|SL_TRACE, + "tcp_accept: did not find acceptor 0x%x\n", + acceptor_id); + } + mutex_exit(&listener->tcp_eager_lock); + tcp_err_ack(listener, mp, TPROVMISMATCH, 0); + return; + } + /* + * Verify acceptor state. The acceptable states for an acceptor + * include TCPS_IDLE and TCPS_BOUND. 
+ */ + switch (acceptor->tcp_state) { + case TCPS_IDLE: + /* FALLTHRU */ + case TCPS_BOUND: + break; + default: + CONN_DEC_REF(acceptor->tcp_connp); + mutex_exit(&listener->tcp_eager_lock); + tcp_err_ack(listener, mp, TOUTSTATE, 0); + return; + } + } + + /* The listener must be in TCPS_LISTEN */ + if (listener->tcp_state != TCPS_LISTEN) { + CONN_DEC_REF(acceptor->tcp_connp); + mutex_exit(&listener->tcp_eager_lock); + tcp_err_ack(listener, mp, TOUTSTATE, 0); + return; + } + + /* + * Rendezvous with an eager connection request packet hanging off + * 'tcp' that has the 'seqnum' tag. We tagged the detached open + * tcp structure when the connection packet arrived in + * tcp_input_listener(). + */ + seqnum = tcr->SEQ_number; + eager = listener; + do { + eager = eager->tcp_eager_next_q; + if (eager == NULL) { + CONN_DEC_REF(acceptor->tcp_connp); + mutex_exit(&listener->tcp_eager_lock); + tcp_err_ack(listener, mp, TBADSEQ, 0); + return; + } + } while (eager->tcp_conn_req_seqnum != seqnum); + mutex_exit(&listener->tcp_eager_lock); + + /* + * At this point, both acceptor and listener have 2 ref + * that they begin with. Acceptor has one additional ref + * we placed in lookup while listener has 3 additional + * ref for being behind the squeue (tcp_accept() is + * done on listener's squeue); being in classifier hash; + * and eager's ref on listener. + */ + ASSERT(listener->tcp_connp->conn_ref >= 5); + ASSERT(acceptor->tcp_connp->conn_ref >= 3); + + /* + * The eager at this point is set in its own squeue and + * could easily have been killed (tcp_accept_finish will + * deal with that) because of a TH_RST so we can only + * ASSERT for a single ref. + */ + ASSERT(eager->tcp_connp->conn_ref >= 1); + + /* + * Pre allocate the discon_ind mblk also. tcp_accept_finish will + * use it if something failed. 
+ */ + discon_mp = allocb(MAX(sizeof (struct T_discon_ind), + sizeof (struct stroptions)), BPRI_HI); + if (discon_mp == NULL) { + CONN_DEC_REF(acceptor->tcp_connp); + CONN_DEC_REF(eager->tcp_connp); + tcp_err_ack(listener, mp, TSYSERR, ENOMEM); + return; + } + + econnp = eager->tcp_connp; + + /* Hold a copy of mp, in case reallocb fails */ + if ((mp1 = copymsg(mp)) == NULL) { + CONN_DEC_REF(acceptor->tcp_connp); + CONN_DEC_REF(eager->tcp_connp); + freemsg(discon_mp); + tcp_err_ack(listener, mp, TSYSERR, ENOMEM); + return; + } + + tcr = (struct T_conn_res *)mp1->b_rptr; + + /* + * This is an expanded version of mi_tpi_ok_ack_alloc() + * which allocates a larger mblk and appends the new + * local address to the ok_ack. The address is copied by + * soaccept() for getsockname(). + */ + { + int extra; + + extra = (econnp->conn_family == AF_INET) ? + sizeof (sin_t) : sizeof (sin6_t); + + /* + * Try to re-use mp, if possible. Otherwise, allocate + * an mblk and return it as ok_mp. In any case, mp + * is no longer usable upon return. 
+ */ + if ((ok_mp = mi_tpi_ok_ack_alloc_extra(mp, extra)) == NULL) { + CONN_DEC_REF(acceptor->tcp_connp); + CONN_DEC_REF(eager->tcp_connp); + freemsg(discon_mp); + /* Original mp has been freed by now, so use mp1 */ + tcp_err_ack(listener, mp1, TSYSERR, ENOMEM); + return; + } + + mp = NULL; /* We should never use mp after this point */ + + switch (extra) { + case sizeof (sin_t): { + sin_t *sin = (sin_t *)ok_mp->b_wptr; + + ok_mp->b_wptr += extra; + sin->sin_family = AF_INET; + sin->sin_port = econnp->conn_lport; + sin->sin_addr.s_addr = econnp->conn_laddr_v4; + break; + } + case sizeof (sin6_t): { + sin6_t *sin6 = (sin6_t *)ok_mp->b_wptr; + + ok_mp->b_wptr += extra; + sin6->sin6_family = AF_INET6; + sin6->sin6_port = econnp->conn_lport; + sin6->sin6_addr = econnp->conn_laddr_v6; + sin6->sin6_flowinfo = econnp->conn_flowinfo; + if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) && + (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) { + sin6->sin6_scope_id = + econnp->conn_ixa->ixa_scopeid; + } else { + sin6->sin6_scope_id = 0; + } + sin6->__sin6_src_id = 0; + break; + } + default: + break; + } + ASSERT(ok_mp->b_wptr <= ok_mp->b_datap->db_lim); + } + + /* + * If there are no options we know that the T_CONN_RES will + * succeed. However, we can't send the T_OK_ACK upstream until + * the tcp_accept_swap is done since it would be dangerous to + * let the application start using the new fd prior to the swap. + */ + tcp_accept_swap(listener, acceptor, eager); + + /* + * tcp_accept_swap unlinks eager from listener but does not drop + * the eager's reference on the listener. + */ + ASSERT(eager->tcp_listener == NULL); + ASSERT(listener->tcp_connp->conn_ref >= 5); + + /* + * The eager is now associated with its own queue. Insert in + * the hash so that the connection can be reused for a future + * T_CONN_RES. + */ + tcp_acceptor_hash_insert(acceptor_id, eager); + + /* + * We now do the processing of options with T_CONN_RES. 
+ * We delay till now since we wanted to have queue to pass to
+ * option processing routines that points back to the right
+ * instance structure which does not happen until after
+ * tcp_accept_swap().
+ *
+ * Note:
+ * The sanity of the logic here assumes that whatever options
+ * are appropriate to inherit from listener=>eager are done
+ * before this point, and whatever were to be overridden (or not)
+ * in transfer logic from eager=>acceptor in tcp_accept_swap().
+ * [ Warning: acceptor endpoint can have T_OPTMGMT_REQ done to it
+ * before its ACCEPTOR_id comes down in T_CONN_RES ]
+ * This may not be true at this point in time but can be fixed
+ * independently. This option processing code starts with
+ * the instantiated acceptor instance and the final queue at
+ * this point.
+ */
+
+ if (tcr->OPT_length != 0) {
+ /* Options to process */
+ int t_error = 0;
+ int sys_error = 0;
+ int do_disconnect = 0;
+
+ if (tcp_conprim_opt_process(eager, mp1,
+ &do_disconnect, &t_error, &sys_error) < 0) {
+ eager->tcp_accept_error = 1;
+ if (do_disconnect) {
+ /*
+ * An option failed which does not allow
+ * connection to be accepted.
+ *
+ * We allow T_CONN_RES to succeed and
+ * put a T_DISCON_IND on the eager queue.
+ */
+ ASSERT(t_error == 0 && sys_error == 0);
+ eager->tcp_send_discon_ind = 1;
+ } else {
+ ASSERT(t_error != 0);
+ freemsg(ok_mp);
+ /*
+ * Original mp was either freed or set
+ * to ok_mp above, so use mp1 instead.
+ */
+ tcp_err_ack(listener, mp1, t_error, sys_error);
+ goto finish;
+ }
+ }
+ /*
+ * Most likely success in setting options (except if
+ * eager->tcp_send_discon_ind set).
+ * mp1 option buffer represented by OPT_length/offset + * potentially modified and contains results of setting + * options at this point + */ + } + + /* We no longer need mp1, since all options processing has passed */ + freemsg(mp1); + + putnext(listener->tcp_connp->conn_rq, ok_mp); + + mutex_enter(&listener->tcp_eager_lock); + if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) { + tcp_t *tail; + mblk_t *conn_ind; + + /* + * This path should not be executed if listener and + * acceptor streams are the same. + */ + ASSERT(listener != acceptor); + + tcp = listener->tcp_eager_prev_q0; + /* + * listener->tcp_eager_prev_q0 points to the TAIL of the + * deferred T_conn_ind queue. We need to get to the head of + * the queue in order to send up T_conn_ind the same order as + * how the 3WHS is completed. + */ + while (tcp != listener) { + if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0) + break; + else + tcp = tcp->tcp_eager_prev_q0; + } + ASSERT(tcp != listener); + conn_ind = tcp->tcp_conn.tcp_eager_conn_ind; + ASSERT(conn_ind != NULL); + tcp->tcp_conn.tcp_eager_conn_ind = NULL; + + /* Move from q0 to q */ + ASSERT(listener->tcp_conn_req_cnt_q0 > 0); + listener->tcp_conn_req_cnt_q0--; + listener->tcp_conn_req_cnt_q++; + tcp->tcp_eager_next_q0->tcp_eager_prev_q0 = + tcp->tcp_eager_prev_q0; + tcp->tcp_eager_prev_q0->tcp_eager_next_q0 = + tcp->tcp_eager_next_q0; + tcp->tcp_eager_prev_q0 = NULL; + tcp->tcp_eager_next_q0 = NULL; + tcp->tcp_conn_def_q0 = B_FALSE; + + /* Make sure the tcp isn't in the list of droppables */ + ASSERT(tcp->tcp_eager_next_drop_q0 == NULL && + tcp->tcp_eager_prev_drop_q0 == NULL); + + /* + * Insert at end of the queue because sockfs sends + * down T_CONN_RES in chronological order. Leaving + * the older conn indications at front of the queue + * helps reducing search time. 
+ */
+ tail = listener->tcp_eager_last_q;
+ if (tail != NULL)
+ tail->tcp_eager_next_q = tcp;
+ else
+ listener->tcp_eager_next_q = tcp;
+ listener->tcp_eager_last_q = tcp;
+ tcp->tcp_eager_next_q = NULL;
+ mutex_exit(&listener->tcp_eager_lock);
+ putnext(tcp->tcp_connp->conn_rq, conn_ind);
+ } else {
+ mutex_exit(&listener->tcp_eager_lock);
+ }
+
+ /*
+ * Done with the acceptor - free it
+ *
+ * Note: from this point on, no access to listener should be made
+ * as listener can be equal to acceptor.
+ */
+finish:
+ ASSERT(acceptor->tcp_detached);
+ acceptor->tcp_connp->conn_rq = NULL;
+ ASSERT(!IPCL_IS_NONSTR(acceptor->tcp_connp));
+ acceptor->tcp_connp->conn_wq = NULL;
+ (void) tcp_clean_death(acceptor, 0);
+ CONN_DEC_REF(acceptor->tcp_connp);
+
+ /*
+ * We pass discon_mp to tcp_accept_finish to get on the right squeue.
+ *
+ * It will update the setting for sockfs/stream head and also take
+ * care of any data that arrived before accept() was called.
+ * In case we already received a FIN then tcp_accept_finish will send up
+ * the ordrel. It will also send up a window update if the window
+ * has opened up.
+ */
+
+ /*
+ * XXX: we currently have a problem if XTI application closes the
+ * acceptor stream in between. This problem exists in on10-gate also
+ * and is well known but nothing can be done short of major rewrite
+ * to fix it. Now it is possible to take care of it by assigning TLI/XTI
+ * eager same squeue as listener (we can distinguish non socket
+ * listeners at the time of handling a SYN in tcp_input_listener)
+ * and do most of the work that tcp_accept_finish does here itself
+ * and then get behind the acceptor squeue to access the acceptor
+ * queue.
+ */ + /* + * We already have a ref on tcp so no need to do one before squeue_enter + */ + SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, discon_mp, + tcp_accept_finish, eager->tcp_connp, NULL, SQ_FILL, + SQTAG_TCP_ACCEPT_FINISH); +} + + +/* + * This is the STREAMS entry point for T_CONN_RES coming down on + * Acceptor STREAM when sockfs listener does accept processing. + * Read the block comment on top of tcp_input_listener(). + */ +void +tcp_tpi_accept(queue_t *q, mblk_t *mp) +{ + queue_t *rq = RD(q); + struct T_conn_res *conn_res; + tcp_t *eager; + tcp_t *listener; + struct T_ok_ack *ok; + t_scalar_t PRIM_type; + conn_t *econnp; + cred_t *cr; + + ASSERT(DB_TYPE(mp) == M_PROTO); + + /* + * All Solaris components should pass a db_credp + * for this TPI message, hence we ASSERT. + * But in case there is some other M_PROTO that looks + * like a TPI message sent by some other kernel + * component, we check and return an error. + */ + cr = msg_getcred(mp, NULL); + ASSERT(cr != NULL); + if (cr == NULL) { + mp = mi_tpi_err_ack_alloc(mp, TSYSERR, EINVAL); + if (mp != NULL) + putnext(rq, mp); + return; + } + conn_res = (struct T_conn_res *)mp->b_rptr; + ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX); + if ((mp->b_wptr - mp->b_rptr) < sizeof (struct T_conn_res)) { + mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0); + if (mp != NULL) + putnext(rq, mp); + return; + } + switch (conn_res->PRIM_type) { + case O_T_CONN_RES: + case T_CONN_RES: + /* + * We pass up an err ack if allocb fails. This will + * cause sockfs to issue a T_DISCON_REQ which will cause + * tcp_eager_blowoff to be called. sockfs will then call + * rq->q_qinfo->qi_qclose to cleanup the acceptor stream. + * we need to do the allocb up here because we have to + * make sure rq->q_qinfo->qi_qclose still points to the + * correct function (tcp_tpi_close_accept) in case allocb + * fails. 
+ */ + bcopy(mp->b_rptr + conn_res->OPT_offset, + &eager, conn_res->OPT_length); + PRIM_type = conn_res->PRIM_type; + mp->b_datap->db_type = M_PCPROTO; + mp->b_wptr = mp->b_rptr + sizeof (struct T_ok_ack); + ok = (struct T_ok_ack *)mp->b_rptr; + ok->PRIM_type = T_OK_ACK; + ok->CORRECT_prim = PRIM_type; + econnp = eager->tcp_connp; + econnp->conn_dev = (dev_t)RD(q)->q_ptr; + econnp->conn_minor_arena = (vmem_t *)(WR(q)->q_ptr); + econnp->conn_rq = rq; + econnp->conn_wq = q; + rq->q_ptr = econnp; + rq->q_qinfo = &tcp_rinitv4; /* No open - same as rinitv6 */ + q->q_ptr = econnp; + q->q_qinfo = &tcp_winit; + listener = eager->tcp_listener; + + if (tcp_accept_common(listener->tcp_connp, + econnp, cr) < 0) { + mp = mi_tpi_err_ack_alloc(mp, TPROTO, 0); + if (mp != NULL) + putnext(rq, mp); + return; + } + + /* + * Send the new local address also up to sockfs. There + * should already be enough space in the mp that came + * down from soaccept(). + */ + if (econnp->conn_family == AF_INET) { + sin_t *sin; + + ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >= + (sizeof (struct T_ok_ack) + sizeof (sin_t))); + sin = (sin_t *)mp->b_wptr; + mp->b_wptr += sizeof (sin_t); + sin->sin_family = AF_INET; + sin->sin_port = econnp->conn_lport; + sin->sin_addr.s_addr = econnp->conn_laddr_v4; + } else { + sin6_t *sin6; + + ASSERT((mp->b_datap->db_lim - mp->b_datap->db_base) >= + sizeof (struct T_ok_ack) + sizeof (sin6_t)); + sin6 = (sin6_t *)mp->b_wptr; + mp->b_wptr += sizeof (sin6_t); + sin6->sin6_family = AF_INET6; + sin6->sin6_port = econnp->conn_lport; + sin6->sin6_addr = econnp->conn_laddr_v6; + if (econnp->conn_ipversion == IPV4_VERSION) + sin6->sin6_flowinfo = 0; + else + sin6->sin6_flowinfo = econnp->conn_flowinfo; + if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) && + (econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) { + sin6->sin6_scope_id = + econnp->conn_ixa->ixa_scopeid; + } else { + sin6->sin6_scope_id = 0; + } + sin6->__sin6_src_id = 0; + } + + putnext(rq, mp); + 
return; + default: + mp = mi_tpi_err_ack_alloc(mp, TNOTSUPPORT, 0); + if (mp != NULL) + putnext(rq, mp); + return; + } +} + +/* + * Send the newconn notification to ulp. The eager is blown off if the + * notification fails. + */ +static void +tcp_ulp_newconn(conn_t *lconnp, conn_t *econnp, mblk_t *mp) +{ + if (IPCL_IS_NONSTR(lconnp)) { + cred_t *cr; + pid_t cpid = NOPID; + + ASSERT(econnp->conn_tcp->tcp_listener == lconnp->conn_tcp); + ASSERT(econnp->conn_tcp->tcp_saved_listener == + lconnp->conn_tcp); + + cr = msg_getcred(mp, &cpid); + + /* Keep the message around in case of a fallback to TPI */ + econnp->conn_tcp->tcp_conn.tcp_eager_conn_ind = mp; + /* + * Notify the ULP about the newconn. It is guaranteed that no + * tcp_accept() call will be made for the eager if the + * notification fails, so it's safe to blow it off in that + * case. + * + * The upper handle will be assigned when tcp_accept() is + * called. + */ + if ((*lconnp->conn_upcalls->su_newconn) + (lconnp->conn_upper_handle, + (sock_lower_handle_t)econnp, + &sock_tcp_downcalls, cr, cpid, + &econnp->conn_upcalls) == NULL) { + /* Failed to allocate a socket */ + TCPS_BUMP_MIB(lconnp->conn_tcp->tcp_tcps, + tcpEstabResets); + (void) tcp_eager_blowoff(lconnp->conn_tcp, + econnp->conn_tcp->tcp_conn_req_seqnum); + } + } else { + putnext(lconnp->conn_rq, mp); + } +} + +/* + * The function called through squeue to get behind listener's perimeter to + * send a deferred conn_ind. + */ +/* ARGSUSED */ +void +tcp_send_pending(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) +{ + conn_t *lconnp = (conn_t *)arg; + tcp_t *listener = lconnp->conn_tcp; + struct T_conn_ind *conn_ind; + tcp_t *tcp; + + conn_ind = (struct T_conn_ind *)mp->b_rptr; + bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp, + conn_ind->OPT_length); + + if (listener->tcp_state != TCPS_LISTEN) { + /* + * If listener has closed, it would have caused a + * a cleanup/blowoff to happen for the eager, so + * we don't need to do anything more. 
+ */
+ freemsg(mp);
+ return;
+ }
+
+ tcp_ulp_newconn(lconnp, tcp->tcp_connp, mp);
+}
+
+/*
+ * Sends the T_CONN_IND to the listener. The caller calls this
+ * function via squeue to get inside the listener's perimeter;
+ * once the 3-way handshake is done a T_CONN_IND needs to be
+ * sent. As an optimization, the caller can call this directly
+ * if listener's perimeter is same as eager's.
+ */
+/* ARGSUSED */
+void
+tcp_send_conn_ind(void *arg, mblk_t *mp, void *arg2)
+{
+ conn_t *lconnp = (conn_t *)arg;
+ tcp_t *listener = lconnp->conn_tcp;
+ tcp_t *tcp;
+ struct T_conn_ind *conn_ind;
+ ipaddr_t *addr_cache;
+ boolean_t need_send_conn_ind = B_FALSE;
+ tcp_stack_t *tcps = listener->tcp_tcps;
+
+ /* retrieve the eager */
+ conn_ind = (struct T_conn_ind *)mp->b_rptr;
+ ASSERT(conn_ind->OPT_offset != 0 &&
+ conn_ind->OPT_length == sizeof (intptr_t));
+ bcopy(mp->b_rptr + conn_ind->OPT_offset, &tcp,
+ conn_ind->OPT_length);
+
+ /*
+ * TLI/XTI applications will get confused by
+ * sending eager as an option since it violates
+ * the option semantics. So remove the eager as
+ * option since TLI/XTI app doesn't need it anyway.
+ */
+ if (!TCP_IS_SOCKET(listener)) {
+ conn_ind->OPT_length = 0;
+ conn_ind->OPT_offset = 0;
+ }
+ if (listener->tcp_state != TCPS_LISTEN) {
+ /*
+ * If listener has closed, it would have caused
+ * a cleanup/blowoff to happen for the eager. We
+ * just need to return.
+ */
+ freemsg(mp);
+ return;
+ }
+
+
+ /*
+ * if the conn_req_q is full defer passing up the
+ * T_CONN_IND until space is available after t_accept()
+ * processing
+ */
+ mutex_enter(&listener->tcp_eager_lock);
+
+ /*
+ * Take the eager out, if it is in the list of droppable eagers
+ * as we are here because the 3W handshake is over.
+ */
+ MAKE_UNDROPPABLE(tcp);
+
+ if (listener->tcp_conn_req_cnt_q < listener->tcp_conn_req_max) {
+ tcp_t *tail;
+
+ /*
+ * The eager already has an extra ref put in tcp_input_data
+ * so that it stays till accept comes back even though it
+ * might get into TCPS_CLOSED as a result of a TH_RST etc.
+ */
+ ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
+ listener->tcp_conn_req_cnt_q0--;
+ listener->tcp_conn_req_cnt_q++;
+
+ /* Move from SYN_RCVD to ESTABLISHED list */
+ tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
+ tcp->tcp_eager_prev_q0;
+ tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
+ tcp->tcp_eager_next_q0;
+ tcp->tcp_eager_prev_q0 = NULL;
+ tcp->tcp_eager_next_q0 = NULL;
+
+ /*
+ * Insert at end of the queue because sockfs
+ * sends down T_CONN_RES in chronological
+ * order. Leaving the older conn indications
+ * at front of the queue helps reducing search
+ * time.
+ */
+ tail = listener->tcp_eager_last_q;
+ if (tail != NULL)
+ tail->tcp_eager_next_q = tcp;
+ else
+ listener->tcp_eager_next_q = tcp;
+ listener->tcp_eager_last_q = tcp;
+ tcp->tcp_eager_next_q = NULL;
+ /*
+ * Delay sending up the T_conn_ind until we are
+ * done with the eager. Once we have sent up
+ * the T_conn_ind, the accept can potentially complete
+ * any time and release the refhold we have on the eager.
+ */
+ need_send_conn_ind = B_TRUE;
+ } else {
+ /*
+ * Defer connection on q0 and set deferred
+ * connection bit true
+ */
+ tcp->tcp_conn_def_q0 = B_TRUE;
+
+ /* take tcp out of q0 ... */
+ tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
+ tcp->tcp_eager_next_q0;
+ tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
+ tcp->tcp_eager_prev_q0;
+
+ /* ...
and place it at the end of q0 */ + tcp->tcp_eager_prev_q0 = listener->tcp_eager_prev_q0; + tcp->tcp_eager_next_q0 = listener; + listener->tcp_eager_prev_q0->tcp_eager_next_q0 = tcp; + listener->tcp_eager_prev_q0 = tcp; + tcp->tcp_conn.tcp_eager_conn_ind = mp; + } + + /* we have timed out before */ + if (tcp->tcp_syn_rcvd_timeout != 0) { + tcp->tcp_syn_rcvd_timeout = 0; + listener->tcp_syn_rcvd_timeout--; + if (listener->tcp_syn_defense && + listener->tcp_syn_rcvd_timeout <= + (tcps->tcps_conn_req_max_q0 >> 5) && + 10*MINUTES < TICK_TO_MSEC(ddi_get_lbolt64() - + listener->tcp_last_rcv_lbolt)) { + /* + * Turn off the defense mode if we + * believe the SYN attack is over. + */ + listener->tcp_syn_defense = B_FALSE; + if (listener->tcp_ip_addr_cache) { + kmem_free((void *)listener->tcp_ip_addr_cache, + IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t)); + listener->tcp_ip_addr_cache = NULL; + } + } + } + addr_cache = (ipaddr_t *)(listener->tcp_ip_addr_cache); + if (addr_cache != NULL) { + /* + * We have finished a 3-way handshake with this + * remote host. This proves the IP addr is good. + * Cache it! + */ + addr_cache[IP_ADDR_CACHE_HASH(tcp->tcp_connp->conn_faddr_v4)] = + tcp->tcp_connp->conn_faddr_v4; + } + mutex_exit(&listener->tcp_eager_lock); + if (need_send_conn_ind) + tcp_ulp_newconn(lconnp, tcp->tcp_connp, mp); +} |