From 721fffe35d40e548a5a58dc53a2ec9c6762172d9 Mon Sep 17 00:00:00 2001 From: Kacheong Poon Date: Wed, 24 Feb 2010 07:49:29 -0800 Subject: PSARC 2010/042 increase max TCP_INIT_CWND 6923847 Increase TCP_INIT_CWND max 6918307 Some TCP kstats are not needed 6923858 TCP connection counter 6925635 The file tcp.c is too big --- usr/src/uts/common/inet/tcp/tcp_bind.c | 935 +++++++++++++++++++++++++++++++++ 1 file changed, 935 insertions(+) create mode 100644 usr/src/uts/common/inet/tcp/tcp_bind.c (limited to 'usr/src/uts/common/inet/tcp/tcp_bind.c') diff --git a/usr/src/uts/common/inet/tcp/tcp_bind.c b/usr/src/uts/common/inet/tcp/tcp_bind.c new file mode 100644 index 0000000000..5d91fe7a7f --- /dev/null +++ b/usr/src/uts/common/inet/tcp/tcp_bind.c @@ -0,0 +1,935 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include +#include +#include +#include +#include +#include +#define _SUN_TPI_VERSION 2 +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +/* Setable in /etc/system */ +/* If set to 0, pick ephemeral port sequentially; otherwise randomly. */ +static uint32_t tcp_random_anon_port = 1; + +static int tcp_bind_select_lport(tcp_t *, in_port_t *, boolean_t, + cred_t *cr); +static in_port_t tcp_get_next_priv_port(const tcp_t *); + +/* + * Hash list insertion routine for tcp_t structures. Each hash bucket + * contains a list of tcp_t entries, and each entry is bound to a unique + * port. If there are multiple tcp_t's that are bound to the same port, then + * one of them will be linked into the hash bucket list, and the rest will + * hang off of that one entry. For each port, entries bound to a specific IP + * address will be inserted before those those bound to INADDR_ANY. + */ +void +tcp_bind_hash_insert(tf_t *tbf, tcp_t *tcp, int caller_holds_lock) +{ + tcp_t **tcpp; + tcp_t *tcpnext; + tcp_t *tcphash; + conn_t *connp = tcp->tcp_connp; + conn_t *connext; + + if (tcp->tcp_ptpbhn != NULL) { + ASSERT(!caller_holds_lock); + tcp_bind_hash_remove(tcp); + } + tcpp = &tbf->tf_tcp; + if (!caller_holds_lock) { + mutex_enter(&tbf->tf_lock); + } else { + ASSERT(MUTEX_HELD(&tbf->tf_lock)); + } + tcphash = tcpp[0]; + tcpnext = NULL; + if (tcphash != NULL) { + /* Look for an entry using the same port */ + while ((tcphash = tcpp[0]) != NULL && + connp->conn_lport != tcphash->tcp_connp->conn_lport) + tcpp = &(tcphash->tcp_bind_hash); + + /* The port was not found, just add to the end */ + if (tcphash == NULL) + goto insert; + + /* + * OK, there already exists an entry bound to the + * same port. + * + * If the new tcp bound to the INADDR_ANY address + * and the first one in the list is not bound to + * INADDR_ANY we skip all entries until we find the + * first one bound to INADDR_ANY. + * This makes sure that applications binding to a + * specific address get preference over those binding to + * INADDR_ANY. + */ + tcpnext = tcphash; + connext = tcpnext->tcp_connp; + tcphash = NULL; + if (V6_OR_V4_INADDR_ANY(connp->conn_bound_addr_v6) && + !V6_OR_V4_INADDR_ANY(connext->conn_bound_addr_v6)) { + while ((tcpnext = tcpp[0]) != NULL) { + connext = tcpnext->tcp_connp; + if (!V6_OR_V4_INADDR_ANY( + connext->conn_bound_addr_v6)) + tcpp = &(tcpnext->tcp_bind_hash_port); + else + break; + } + if (tcpnext != NULL) { + tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port; + tcphash = tcpnext->tcp_bind_hash; + if (tcphash != NULL) { + tcphash->tcp_ptpbhn = + &(tcp->tcp_bind_hash); + tcpnext->tcp_bind_hash = NULL; + } + } + } else { + tcpnext->tcp_ptpbhn = &tcp->tcp_bind_hash_port; + tcphash = tcpnext->tcp_bind_hash; + if (tcphash != NULL) { + tcphash->tcp_ptpbhn = + &(tcp->tcp_bind_hash); + tcpnext->tcp_bind_hash = NULL; + } + } + } +insert: + tcp->tcp_bind_hash_port = tcpnext; + tcp->tcp_bind_hash = tcphash; + tcp->tcp_ptpbhn = tcpp; + tcpp[0] = tcp; + if (!caller_holds_lock) + mutex_exit(&tbf->tf_lock); +} + +/* + * Hash list removal routine for tcp_t structures. + */ +void +tcp_bind_hash_remove(tcp_t *tcp) +{ + tcp_t *tcpnext; + kmutex_t *lockp; + tcp_stack_t *tcps = tcp->tcp_tcps; + conn_t *connp = tcp->tcp_connp; + + if (tcp->tcp_ptpbhn == NULL) + return; + + /* + * Extract the lock pointer in case there are concurrent + * hash_remove's for this instance. + */ + ASSERT(connp->conn_lport != 0); + lockp = &tcps->tcps_bind_fanout[TCP_BIND_HASH( + connp->conn_lport)].tf_lock; + + ASSERT(lockp != NULL); + mutex_enter(lockp); + if (tcp->tcp_ptpbhn) { + tcpnext = tcp->tcp_bind_hash_port; + if (tcpnext != NULL) { + tcp->tcp_bind_hash_port = NULL; + tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn; + tcpnext->tcp_bind_hash = tcp->tcp_bind_hash; + if (tcpnext->tcp_bind_hash != NULL) { + tcpnext->tcp_bind_hash->tcp_ptpbhn = + &(tcpnext->tcp_bind_hash); + tcp->tcp_bind_hash = NULL; + } + } else if ((tcpnext = tcp->tcp_bind_hash) != NULL) { + tcpnext->tcp_ptpbhn = tcp->tcp_ptpbhn; + tcp->tcp_bind_hash = NULL; + } + *tcp->tcp_ptpbhn = tcpnext; + tcp->tcp_ptpbhn = NULL; + } + mutex_exit(lockp); +} + +/* + * Don't let port fall into the privileged range. + * Since the extra privileged ports can be arbitrary we also + * ensure that we exclude those from consideration. + * tcp_g_epriv_ports is not sorted thus we loop over it until + * there are no changes. + * + * Note: No locks are held when inspecting tcp_g_*epriv_ports + * but instead the code relies on: + * - the fact that the address of the array and its size never changes + * - the atomic assignment of the elements of the array + * + * Returns 0 if there are no more ports available. + * + * TS note: skip multilevel ports. + */ +in_port_t +tcp_update_next_port(in_port_t port, const tcp_t *tcp, boolean_t random) +{ + int i; + boolean_t restart = B_FALSE; + tcp_stack_t *tcps = tcp->tcp_tcps; + + if (random && tcp_random_anon_port != 0) { + (void) random_get_pseudo_bytes((uint8_t *)&port, + sizeof (in_port_t)); + /* + * Unless changed by a sys admin, the smallest anon port + * is 32768 and the largest anon port is 65535. It is + * very likely (50%) for the random port to be smaller + * than the smallest anon port. When that happens, + * add port % (anon port range) to the smallest anon + * port to get the random port. It should fall into the + * valid anon port range. + */ + if (port < tcps->tcps_smallest_anon_port) { + port = tcps->tcps_smallest_anon_port + + port % (tcps->tcps_largest_anon_port - + tcps->tcps_smallest_anon_port); + } + } + +retry: + if (port < tcps->tcps_smallest_anon_port) + port = (in_port_t)tcps->tcps_smallest_anon_port; + + if (port > tcps->tcps_largest_anon_port) { + if (restart) + return (0); + restart = B_TRUE; + port = (in_port_t)tcps->tcps_smallest_anon_port; + } + + if (port < tcps->tcps_smallest_nonpriv_port) + port = (in_port_t)tcps->tcps_smallest_nonpriv_port; + + for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) { + if (port == tcps->tcps_g_epriv_ports[i]) { + port++; + /* + * Make sure whether the port is in the + * valid range. + */ + goto retry; + } + } + if (is_system_labeled() && + (i = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred), port, + IPPROTO_TCP, B_TRUE)) != 0) { + port = i; + goto retry; + } + return (port); +} + +/* + * Return the next anonymous port in the privileged port range for + * bind checking. It starts at IPPORT_RESERVED - 1 and goes + * downwards. This is the same behavior as documented in the userland + * library call rresvport(3N). + * + * TS note: skip multilevel ports. + */ +static in_port_t +tcp_get_next_priv_port(const tcp_t *tcp) +{ + static in_port_t next_priv_port = IPPORT_RESERVED - 1; + in_port_t nextport; + boolean_t restart = B_FALSE; + tcp_stack_t *tcps = tcp->tcp_tcps; +retry: + if (next_priv_port < tcps->tcps_min_anonpriv_port || + next_priv_port >= IPPORT_RESERVED) { + next_priv_port = IPPORT_RESERVED - 1; + if (restart) + return (0); + restart = B_TRUE; + } + if (is_system_labeled() && + (nextport = tsol_next_port(crgetzone(tcp->tcp_connp->conn_cred), + next_priv_port, IPPROTO_TCP, B_FALSE)) != 0) { + next_priv_port = nextport; + goto retry; + } + return (next_priv_port--); +} + +static int +tcp_bind_select_lport(tcp_t *tcp, in_port_t *requested_port_ptr, + boolean_t bind_to_req_port_only, cred_t *cr) +{ + in_port_t mlp_port; + mlp_type_t addrtype, mlptype; + boolean_t user_specified; + in_port_t allocated_port; + in_port_t requested_port = *requested_port_ptr; + conn_t *connp = tcp->tcp_connp; + zone_t *zone; + tcp_stack_t *tcps = tcp->tcp_tcps; + in6_addr_t v6addr = connp->conn_laddr_v6; + + /* + * XXX It's up to the caller to specify bind_to_req_port_only or not. + */ + ASSERT(cr != NULL); + + /* + * Get a valid port (within the anonymous range and should not + * be a privileged one) to use if the user has not given a port. + * If multiple threads are here, they may all start with + * with the same initial port. But, it should be fine as long as + * tcp_bindi will ensure that no two threads will be assigned + * the same port. + * + * NOTE: XXX If a privileged process asks for an anonymous port, we + * still check for ports only in the range > tcp_smallest_non_priv_port, + * unless TCP_ANONPRIVBIND option is set. + */ + mlptype = mlptSingle; + mlp_port = requested_port; + if (requested_port == 0) { + requested_port = connp->conn_anon_priv_bind ? + tcp_get_next_priv_port(tcp) : + tcp_update_next_port(tcps->tcps_next_port_to_try, + tcp, B_TRUE); + if (requested_port == 0) { + return (-TNOADDR); + } + user_specified = B_FALSE; + + /* + * If the user went through one of the RPC interfaces to create + * this socket and RPC is MLP in this zone, then give him an + * anonymous MLP. + */ + if (connp->conn_anon_mlp && is_system_labeled()) { + zone = crgetzone(cr); + addrtype = tsol_mlp_addr_type( + connp->conn_allzones ? ALL_ZONES : zone->zone_id, + IPV6_VERSION, &v6addr, + tcps->tcps_netstack->netstack_ip); + if (addrtype == mlptSingle) { + return (-TNOADDR); + } + mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP, + PMAPPORT, addrtype); + mlp_port = PMAPPORT; + } + } else { + int i; + boolean_t priv = B_FALSE; + + /* + * If the requested_port is in the well-known privileged range, + * verify that the stream was opened by a privileged user. + * Note: No locks are held when inspecting tcp_g_*epriv_ports + * but instead the code relies on: + * - the fact that the address of the array and its size never + * changes + * - the atomic assignment of the elements of the array + */ + if (requested_port < tcps->tcps_smallest_nonpriv_port) { + priv = B_TRUE; + } else { + for (i = 0; i < tcps->tcps_g_num_epriv_ports; i++) { + if (requested_port == + tcps->tcps_g_epriv_ports[i]) { + priv = B_TRUE; + break; + } + } + } + if (priv) { + if (secpolicy_net_privaddr(cr, requested_port, + IPPROTO_TCP) != 0) { + if (connp->conn_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, + SL_ERROR|SL_TRACE, + "tcp_bind: no priv for port %d", + requested_port); + } + return (-TACCES); + } + } + user_specified = B_TRUE; + + connp = tcp->tcp_connp; + if (is_system_labeled()) { + zone = crgetzone(cr); + addrtype = tsol_mlp_addr_type( + connp->conn_allzones ? ALL_ZONES : zone->zone_id, + IPV6_VERSION, &v6addr, + tcps->tcps_netstack->netstack_ip); + if (addrtype == mlptSingle) { + return (-TNOADDR); + } + mlptype = tsol_mlp_port_type(zone, IPPROTO_TCP, + requested_port, addrtype); + } + } + + if (mlptype != mlptSingle) { + if (secpolicy_net_bindmlp(cr) != 0) { + if (connp->conn_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, + SL_ERROR|SL_TRACE, + "tcp_bind: no priv for multilevel port %d", + requested_port); + } + return (-TACCES); + } + + /* + * If we're specifically binding a shared IP address and the + * port is MLP on shared addresses, then check to see if this + * zone actually owns the MLP. Reject if not. + */ + if (mlptype == mlptShared && addrtype == mlptShared) { + /* + * No need to handle exclusive-stack zones since + * ALL_ZONES only applies to the shared stack. + */ + zoneid_t mlpzone; + + mlpzone = tsol_mlp_findzone(IPPROTO_TCP, + htons(mlp_port)); + if (connp->conn_zoneid != mlpzone) { + if (connp->conn_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, + SL_ERROR|SL_TRACE, + "tcp_bind: attempt to bind port " + "%d on shared addr in zone %d " + "(should be %d)", + mlp_port, connp->conn_zoneid, + mlpzone); + } + return (-TACCES); + } + } + + if (!user_specified) { + int err; + err = tsol_mlp_anon(zone, mlptype, connp->conn_proto, + requested_port, B_TRUE); + if (err != 0) { + if (connp->conn_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, + SL_ERROR|SL_TRACE, + "tcp_bind: cannot establish anon " + "MLP for port %d", + requested_port); + } + return (err); + } + connp->conn_anon_port = B_TRUE; + } + connp->conn_mlp_type = mlptype; + } + + allocated_port = tcp_bindi(tcp, requested_port, &v6addr, + connp->conn_reuseaddr, B_FALSE, bind_to_req_port_only, + user_specified); + + if (allocated_port == 0) { + connp->conn_mlp_type = mlptSingle; + if (connp->conn_anon_port) { + connp->conn_anon_port = B_FALSE; + (void) tsol_mlp_anon(zone, mlptype, connp->conn_proto, + requested_port, B_FALSE); + } + if (bind_to_req_port_only) { + if (connp->conn_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, + SL_ERROR|SL_TRACE, + "tcp_bind: requested addr busy"); + } + return (-TADDRBUSY); + } else { + /* If we are out of ports, fail the bind. */ + if (connp->conn_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, + SL_ERROR|SL_TRACE, + "tcp_bind: out of ports?"); + } + return (-TNOADDR); + } + } + + /* Pass the allocated port back */ + *requested_port_ptr = allocated_port; + return (0); +} + +/* + * Check the address and check/pick a local port number. + */ +int +tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, + boolean_t bind_to_req_port_only) +{ + tcp_t *tcp = connp->conn_tcp; + sin_t *sin; + sin6_t *sin6; + in_port_t requested_port; + ipaddr_t v4addr; + in6_addr_t v6addr; + ip_laddr_t laddr_type = IPVL_UNICAST_UP; /* INADDR_ANY */ + zoneid_t zoneid = IPCL_ZONEID(connp); + ip_stack_t *ipst = connp->conn_netstack->netstack_ip; + uint_t scopeid = 0; + int error = 0; + ip_xmit_attr_t *ixa = connp->conn_ixa; + + ASSERT((uintptr_t)len <= (uintptr_t)INT_MAX); + + if (tcp->tcp_state == TCPS_BOUND) { + return (0); + } else if (tcp->tcp_state > TCPS_BOUND) { + if (connp->conn_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, + "tcp_bind: bad state, %d", tcp->tcp_state); + } + return (-TOUTSTATE); + } + + ASSERT(sa != NULL && len != 0); + + if (!OK_32PTR((char *)sa)) { + if (connp->conn_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, + SL_ERROR|SL_TRACE, + "tcp_bind: bad address parameter, " + "address %p, len %d", + (void *)sa, len); + } + return (-TPROTO); + } + + error = proto_verify_ip_addr(connp->conn_family, sa, len); + if (error != 0) { + return (error); + } + + switch (len) { + case sizeof (sin_t): /* Complete IPv4 address */ + sin = (sin_t *)sa; + requested_port = ntohs(sin->sin_port); + v4addr = sin->sin_addr.s_addr; + IN6_IPADDR_TO_V4MAPPED(v4addr, &v6addr); + if (v4addr != INADDR_ANY) { + laddr_type = ip_laddr_verify_v4(v4addr, zoneid, ipst, + B_FALSE); + } + break; + + case sizeof (sin6_t): /* Complete IPv6 address */ + sin6 = (sin6_t *)sa; + v6addr = sin6->sin6_addr; + requested_port = ntohs(sin6->sin6_port); + if (IN6_IS_ADDR_V4MAPPED(&v6addr)) { + if (connp->conn_ipv6_v6only) + return (EADDRNOTAVAIL); + + IN6_V4MAPPED_TO_IPADDR(&v6addr, v4addr); + if (v4addr != INADDR_ANY) { + laddr_type = ip_laddr_verify_v4(v4addr, + zoneid, ipst, B_FALSE); + } + } else { + if (!IN6_IS_ADDR_UNSPECIFIED(&v6addr)) { + if (IN6_IS_ADDR_LINKSCOPE(&v6addr)) + scopeid = sin6->sin6_scope_id; + laddr_type = ip_laddr_verify_v6(&v6addr, + zoneid, ipst, B_FALSE, scopeid); + } + } + break; + + default: + if (connp->conn_debug) { + (void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE, + "tcp_bind: bad address length, %d", len); + } + return (EAFNOSUPPORT); + /* return (-TBADADDR); */ + } + + /* Is the local address a valid unicast address? */ + if (laddr_type == IPVL_BAD) + return (EADDRNOTAVAIL); + + connp->conn_bound_addr_v6 = v6addr; + if (scopeid != 0) { + ixa->ixa_flags |= IXAF_SCOPEID_SET; + ixa->ixa_scopeid = scopeid; + connp->conn_incoming_ifindex = scopeid; + } else { + ixa->ixa_flags &= ~IXAF_SCOPEID_SET; + connp->conn_incoming_ifindex = connp->conn_bound_if; + } + + connp->conn_laddr_v6 = v6addr; + connp->conn_saddr_v6 = v6addr; + + bind_to_req_port_only = requested_port != 0 && bind_to_req_port_only; + + error = tcp_bind_select_lport(tcp, &requested_port, + bind_to_req_port_only, cr); + if (error != 0) { + connp->conn_laddr_v6 = ipv6_all_zeros; + connp->conn_saddr_v6 = ipv6_all_zeros; + connp->conn_bound_addr_v6 = ipv6_all_zeros; + } + return (error); +} + +/* + * If the "bind_to_req_port_only" parameter is set, if the requested port + * number is available, return it, If not return 0 + * + * If "bind_to_req_port_only" parameter is not set and + * If the requested port number is available, return it. If not, return + * the first anonymous port we happen across. If no anonymous ports are + * available, return 0. addr is the requested local address, if any. + * + * In either case, when succeeding update the tcp_t to record the port number + * and insert it in the bind hash table. + * + * Note that TCP over IPv4 and IPv6 sockets can use the same port number + * without setting SO_REUSEADDR. This is needed so that they + * can be viewed as two independent transport protocols. + */ +in_port_t +tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, + int reuseaddr, boolean_t quick_connect, + boolean_t bind_to_req_port_only, boolean_t user_specified) +{ + /* number of times we have run around the loop */ + int count = 0; + /* maximum number of times to run around the loop */ + int loopmax; + conn_t *connp = tcp->tcp_connp; + tcp_stack_t *tcps = tcp->tcp_tcps; + + /* + * Lookup for free addresses is done in a loop and "loopmax" + * influences how long we spin in the loop + */ + if (bind_to_req_port_only) { + /* + * If the requested port is busy, don't bother to look + * for a new one. Setting loop maximum count to 1 has + * that effect. + */ + loopmax = 1; + } else { + /* + * If the requested port is busy, look for a free one + * in the anonymous port range. + * Set loopmax appropriately so that one does not look + * forever in the case all of the anonymous ports are in use. + */ + if (connp->conn_anon_priv_bind) { + /* + * loopmax = + * (IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1 + */ + loopmax = IPPORT_RESERVED - + tcps->tcps_min_anonpriv_port; + } else { + loopmax = (tcps->tcps_largest_anon_port - + tcps->tcps_smallest_anon_port + 1); + } + } + do { + uint16_t lport; + tf_t *tbf; + tcp_t *ltcp; + conn_t *lconnp; + + lport = htons(port); + + /* + * Ensure that the tcp_t is not currently in the bind hash. + * Hold the lock on the hash bucket to ensure that + * the duplicate check plus the insertion is an atomic + * operation. + * + * This function does an inline lookup on the bind hash list + * Make sure that we access only members of tcp_t + * and that we don't look at tcp_tcp, since we are not + * doing a CONN_INC_REF. + */ + tcp_bind_hash_remove(tcp); + tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(lport)]; + mutex_enter(&tbf->tf_lock); + for (ltcp = tbf->tf_tcp; ltcp != NULL; + ltcp = ltcp->tcp_bind_hash) { + if (lport == ltcp->tcp_connp->conn_lport) + break; + } + + for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) { + boolean_t not_socket; + boolean_t exclbind; + + lconnp = ltcp->tcp_connp; + + /* + * On a labeled system, we must treat bindings to ports + * on shared IP addresses by sockets with MAC exemption + * privilege as being in all zones, as there's + * otherwise no way to identify the right receiver. + */ + if (!IPCL_BIND_ZONE_MATCH(lconnp, connp)) + continue; + + /* + * If TCP_EXCLBIND is set for either the bound or + * binding endpoint, the semantics of bind + * is changed according to the following. + * + * spec = specified address (v4 or v6) + * unspec = unspecified address (v4 or v6) + * A = specified addresses are different for endpoints + * + * bound bind to allowed + * ------------------------------------- + * unspec unspec no + * unspec spec no + * spec unspec no + * spec spec yes if A + * + * For labeled systems, SO_MAC_EXEMPT behaves the same + * as TCP_EXCLBIND, except that zoneid is ignored. + * + * Note: + * + * 1. Because of TLI semantics, an endpoint can go + * back from, say TCP_ESTABLISHED to TCPS_LISTEN or + * TCPS_BOUND, depending on whether it is originally + * a listener or not. That is why we need to check + * for states greater than or equal to TCPS_BOUND + * here. + * + * 2. Ideally, we should only check for state equals + * to TCPS_LISTEN. And the following check should be + * added. + * + * if (ltcp->tcp_state == TCPS_LISTEN || + * !reuseaddr || !lconnp->conn_reuseaddr) { + * ... + * } + * + * The semantics will be changed to this. If the + * endpoint on the list is in state not equal to + * TCPS_LISTEN and both endpoints have SO_REUSEADDR + * set, let the bind succeed. + * + * Because of (1), we cannot do that for TLI + * endpoints. But we can do that for socket endpoints. + * If in future, we can change this going back + * semantics, we can use the above check for TLI also. + */ + not_socket = !(TCP_IS_SOCKET(ltcp) && + TCP_IS_SOCKET(tcp)); + exclbind = lconnp->conn_exclbind || + connp->conn_exclbind; + + if ((lconnp->conn_mac_mode != CONN_MAC_DEFAULT) || + (connp->conn_mac_mode != CONN_MAC_DEFAULT) || + (exclbind && (not_socket || + ltcp->tcp_state <= TCPS_ESTABLISHED))) { + if (V6_OR_V4_INADDR_ANY( + lconnp->conn_bound_addr_v6) || + V6_OR_V4_INADDR_ANY(*laddr) || + IN6_ARE_ADDR_EQUAL(laddr, + &lconnp->conn_bound_addr_v6)) { + break; + } + continue; + } + + /* + * Check ipversion to allow IPv4 and IPv6 sockets to + * have disjoint port number spaces, if *_EXCLBIND + * is not set and only if the application binds to a + * specific port. We use the same autoassigned port + * number space for IPv4 and IPv6 sockets. + */ + if (connp->conn_ipversion != lconnp->conn_ipversion && + bind_to_req_port_only) + continue; + + /* + * Ideally, we should make sure that the source + * address, remote address, and remote port in the + * four tuple for this tcp-connection is unique. + * However, trying to find out the local source + * address would require too much code duplication + * with IP, since IP needs needs to have that code + * to support userland TCP implementations. + */ + if (quick_connect && + (ltcp->tcp_state > TCPS_LISTEN) && + ((connp->conn_fport != lconnp->conn_fport) || + !IN6_ARE_ADDR_EQUAL(&connp->conn_faddr_v6, + &lconnp->conn_faddr_v6))) + continue; + + if (!reuseaddr) { + /* + * No socket option SO_REUSEADDR. + * If existing port is bound to + * a non-wildcard IP address + * and the requesting stream is + * bound to a distinct + * different IP addresses + * (non-wildcard, also), keep + * going. + */ + if (!V6_OR_V4_INADDR_ANY(*laddr) && + !V6_OR_V4_INADDR_ANY( + lconnp->conn_bound_addr_v6) && + !IN6_ARE_ADDR_EQUAL(laddr, + &lconnp->conn_bound_addr_v6)) + continue; + if (ltcp->tcp_state >= TCPS_BOUND) { + /* + * This port is being used and + * its state is >= TCPS_BOUND, + * so we can't bind to it. + */ + break; + } + } else { + /* + * socket option SO_REUSEADDR is set on the + * binding tcp_t. + * + * If two streams are bound to + * same IP address or both addr + * and bound source are wildcards + * (INADDR_ANY), we want to stop + * searching. + * We have found a match of IP source + * address and source port, which is + * refused regardless of the + * SO_REUSEADDR setting, so we break. + */ + if (IN6_ARE_ADDR_EQUAL(laddr, + &lconnp->conn_bound_addr_v6) && + (ltcp->tcp_state == TCPS_LISTEN || + ltcp->tcp_state == TCPS_BOUND)) + break; + } + } + if (ltcp != NULL) { + /* The port number is busy */ + mutex_exit(&tbf->tf_lock); + } else { + /* + * This port is ours. Insert in fanout and mark as + * bound to prevent others from getting the port + * number. + */ + tcp->tcp_state = TCPS_BOUND; + connp->conn_lport = htons(port); + + ASSERT(&tcps->tcps_bind_fanout[TCP_BIND_HASH( + connp->conn_lport)] == tbf); + tcp_bind_hash_insert(tbf, tcp, 1); + + mutex_exit(&tbf->tf_lock); + + /* + * We don't want tcp_next_port_to_try to "inherit" + * a port number supplied by the user in a bind. + */ + if (user_specified) + return (port); + + /* + * This is the only place where tcp_next_port_to_try + * is updated. After the update, it may or may not + * be in the valid range. + */ + if (!connp->conn_anon_priv_bind) + tcps->tcps_next_port_to_try = port + 1; + return (port); + } + + if (connp->conn_anon_priv_bind) { + port = tcp_get_next_priv_port(tcp); + } else { + if (count == 0 && user_specified) { + /* + * We may have to return an anonymous port. So + * get one to start with. + */ + port = + tcp_update_next_port( + tcps->tcps_next_port_to_try, + tcp, B_TRUE); + user_specified = B_FALSE; + } else { + port = tcp_update_next_port(port + 1, tcp, + B_FALSE); + } + } + if (port == 0) + break; + + /* + * Don't let this loop run forever in the case where + * all of the anonymous ports are in use. + */ + } while (++count < loopmax); + return (0); +} -- cgit v1.2.3