summaryrefslogtreecommitdiff
path: root/usr/src/uts/common/inet/tcp
diff options
context:
space:
mode:
Diffstat (limited to 'usr/src/uts/common/inet/tcp')
-rw-r--r--usr/src/uts/common/inet/tcp/tcp.c39
-rw-r--r--usr/src/uts/common/inet/tcp/tcp_bind.c226
-rw-r--r--usr/src/uts/common/inet/tcp/tcp_input.c19
-rw-r--r--usr/src/uts/common/inet/tcp/tcp_opt_data.c110
-rw-r--r--usr/src/uts/common/inet/tcp/tcp_socket.c10
-rw-r--r--usr/src/uts/common/inet/tcp/tcp_stats.c9
-rw-r--r--usr/src/uts/common/inet/tcp/tcp_timers.c2
7 files changed, 359 insertions, 56 deletions
diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c
index 9348ea3d0f..427a6df274 100644
--- a/usr/src/uts/common/inet/tcp/tcp.c
+++ b/usr/src/uts/common/inet/tcp/tcp.c
@@ -961,8 +961,7 @@ void
tcp_stop_lingering(tcp_t *tcp)
{
clock_t delta = 0;
- tcp_stack_t *tcps = tcp->tcp_tcps;
- conn_t *connp = tcp->tcp_connp;
+ conn_t *connp = tcp->tcp_connp;
tcp->tcp_linger_tid = 0;
if (tcp->tcp_state > TCPS_LISTEN) {
@@ -990,7 +989,7 @@ tcp_stop_lingering(tcp_t *tcp)
if (tcp->tcp_state == TCPS_TIME_WAIT) {
tcp_time_wait_append(tcp);
- TCP_DBGSTAT(tcps, tcp_detach_time_wait);
+ TCP_DBGSTAT(tcp->tcp_tcps, tcp_detach_time_wait);
goto finish;
}
@@ -1429,6 +1428,21 @@ tcp_free(tcp_t *tcp)
tcp->tcp_cc_algo->cb_destroy(&tcp->tcp_ccv);
/*
+ * Destroy any association with SO_REUSEPORT group.
+ */
+ if (tcp->tcp_rg_bind != NULL) {
+ /*
+ * This is only necessary for connections which enabled
+ * SO_REUSEPORT but were never bound. Such connections should
+ * be the one and only member of the tcp_rg_tp to which they
+ * have been associated.
+ */
+ VERIFY(tcp_rg_remove(tcp->tcp_rg_bind, tcp));
+ tcp_rg_destroy(tcp->tcp_rg_bind);
+ tcp->tcp_rg_bind = NULL;
+ }
+
+ /*
* If this is a non-STREAM socket still holding on to an upper
* handle, release it. As a result of fallback we might also see
* STREAMS based conns with upper handles, in which case there is
@@ -2477,8 +2491,10 @@ tcp_init_values(tcp_t *tcp, tcp_t *parent)
* Path MTU might have changed by either increase or decrease, so need to
* adjust the MSS based on the value of ixa_pmtu. No need to handle tiny
* or negative MSS, since tcp_mss_set() will do it.
+ *
+ * Returns B_TRUE when the connection PMTU changes, otherwise B_FALSE.
*/
-void
+boolean_t
tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only)
{
uint32_t pmtu;
@@ -2488,10 +2504,10 @@ tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only)
iaflags_t ixaflags;
if (tcp->tcp_tcps->tcps_ignore_path_mtu)
- return;
+ return (B_FALSE);
if (tcp->tcp_state < TCPS_ESTABLISHED)
- return;
+ return (B_FALSE);
/*
* Always call ip_get_pmtu() to make sure that IP has updated
@@ -2511,13 +2527,13 @@ tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only)
* Nothing to change, so just return.
*/
if (mss == tcp->tcp_mss)
- return;
+ return (B_FALSE);
/*
* Currently, for ICMP errors, only PMTU decrease is handled.
*/
if (mss > tcp->tcp_mss && decrease_only)
- return;
+ return (B_FALSE);
DTRACE_PROBE2(tcp_update_pmtu, int32_t, tcp->tcp_mss, uint32_t, mss);
@@ -2552,6 +2568,7 @@ tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only)
tcp->tcp_ipha->ipha_fragment_offset_and_flags = 0;
}
ixa->ixa_flags = ixaflags;
+ return (B_TRUE);
}
int
@@ -3424,7 +3441,7 @@ tcp_notify(void *arg, ip_xmit_attr_t *ixa, ixa_notify_type_t ntype,
tcp_update_lso(tcp, connp->conn_ixa);
break;
case IXAN_PMTU:
- tcp_update_pmtu(tcp, B_FALSE);
+ (void) tcp_update_pmtu(tcp, B_FALSE);
break;
case IXAN_ZCOPY:
tcp_update_zcopy(tcp);
@@ -3755,7 +3772,6 @@ tcp_stack_init(netstackid_t stackid, netstack_t *ns)
{
tcp_stack_t *tcps;
int i;
- int error = 0;
major_t major;
size_t arrsz;
@@ -3819,8 +3835,7 @@ tcp_stack_init(netstackid_t stackid, netstack_t *ns)
tcps->tcps_mibkp = tcp_kstat_init(stackid);
major = mod_name_to_major(INET_NAME);
- error = ldi_ident_from_major(major, &tcps->tcps_ldi_ident);
- ASSERT(error == 0);
+ VERIFY0(ldi_ident_from_major(major, &tcps->tcps_ldi_ident));
tcps->tcps_ixa_cleanup_mp = allocb_wait(0, BPRI_MED, STR_NOSIG, NULL);
ASSERT(tcps->tcps_ixa_cleanup_mp != NULL);
cv_init(&tcps->tcps_ixa_cleanup_ready_cv, NULL, CV_DEFAULT, NULL);
diff --git a/usr/src/uts/common/inet/tcp/tcp_bind.c b/usr/src/uts/common/inet/tcp/tcp_bind.c
index 86242fc944..5c2e1e1932 100644
--- a/usr/src/uts/common/inet/tcp/tcp_bind.c
+++ b/usr/src/uts/common/inet/tcp/tcp_bind.c
@@ -22,6 +22,7 @@
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2013 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
* Copyright (c) 2016 by Delphix. All rights reserved.
*/
@@ -56,6 +57,7 @@ static uint32_t tcp_random_anon_port = 1;
static int tcp_bind_select_lport(tcp_t *, in_port_t *, boolean_t,
cred_t *cr);
static in_port_t tcp_get_next_priv_port(const tcp_t *);
+static int tcp_rg_insert(tcp_rg_t *, struct tcp_s *);
/*
* Hash list insertion routine for tcp_t structures. Each hash bucket
@@ -173,6 +175,16 @@ tcp_bind_hash_remove(tcp_t *tcp)
ASSERT(lockp != NULL);
mutex_enter(lockp);
+
+ /* destroy any association with SO_REUSEPORT group */
+ if (tcp->tcp_rg_bind != NULL) {
+ if (tcp_rg_remove(tcp->tcp_rg_bind, tcp)) {
+ /* Last one out turns off the lights */
+ tcp_rg_destroy(tcp->tcp_rg_bind);
+ }
+ tcp->tcp_rg_bind = NULL;
+ }
+
if (tcp->tcp_ptpbhn) {
tcpnext = tcp->tcp_bind_hash_port;
if (tcpnext != NULL) {
@@ -638,13 +650,12 @@ tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
}
/*
- * If the "bind_to_req_port_only" parameter is set, if the requested port
- * number is available, return it, If not return 0
+ * If the "bind_to_req_port_only" parameter is set and the requested port
+ * number is available, return it (else return 0).
*
- * If "bind_to_req_port_only" parameter is not set and
- * If the requested port number is available, return it. If not, return
- * the first anonymous port we happen across. If no anonymous ports are
- * available, return 0. addr is the requested local address, if any.
+ * If "bind_to_req_port_only" parameter is not set and the requested port
+ * number is available, return it. If not, return the first anonymous port we
+ * happen across. If no anonymous ports are available, return 0.
*
* In either case, when succeeding update the tcp_t to record the port number
* and insert it in the bind hash table.
@@ -664,6 +675,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
int loopmax;
conn_t *connp = tcp->tcp_connp;
tcp_stack_t *tcps = tcp->tcp_tcps;
+ boolean_t reuseport = connp->conn_reuseport;
/*
* Lookup for free addresses is done in a loop and "loopmax"
@@ -700,6 +712,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
tf_t *tbf;
tcp_t *ltcp;
conn_t *lconnp;
+ boolean_t attempt_reuse = B_FALSE;
lport = htons(port);
@@ -726,6 +739,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) {
boolean_t not_socket;
boolean_t exclbind;
+ boolean_t addrmatch;
lconnp = ltcp->tcp_connp;
@@ -831,22 +845,35 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
&lconnp->conn_faddr_v6)))
continue;
+ addrmatch = IN6_ARE_ADDR_EQUAL(laddr,
+ &lconnp->conn_bound_addr_v6);
+
+ if (addrmatch && reuseport && bind_to_req_port_only &&
+ (ltcp->tcp_state == TCPS_BOUND ||
+ ltcp->tcp_state == TCPS_LISTEN)) {
+ /*
+ * This entry is bound to the exact same
+ * address and port. If SO_REUSEPORT is set on
+ * the calling socket, attempt to reuse this
+ * binding if it too had SO_REUSEPORT enabled
+ * when it was bound.
+ */
+ attempt_reuse = (ltcp->tcp_rg_bind != NULL);
+ break;
+ }
+
if (!reuseaddr) {
/*
- * No socket option SO_REUSEADDR.
- * If existing port is bound to
- * a non-wildcard IP address
- * and the requesting stream is
- * bound to a distinct
- * different IP addresses
- * (non-wildcard, also), keep
- * going.
+ * No socket option SO_REUSEADDR. If an
+ * existing port is bound to a non-wildcard IP
+ * address and the requesting stream is bound
+ * to a distinct different IP address
+ * (non-wildcard, also), keep going.
*/
if (!V6_OR_V4_INADDR_ANY(*laddr) &&
!V6_OR_V4_INADDR_ANY(
lconnp->conn_bound_addr_v6) &&
- !IN6_ARE_ADDR_EQUAL(laddr,
- &lconnp->conn_bound_addr_v6))
+ !addrmatch)
continue;
if (ltcp->tcp_state >= TCPS_BOUND) {
/*
@@ -861,27 +888,49 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
* socket option SO_REUSEADDR is set on the
* binding tcp_t.
*
- * If two streams are bound to
- * same IP address or both addr
- * and bound source are wildcards
- * (INADDR_ANY), we want to stop
- * searching.
- * We have found a match of IP source
- * address and source port, which is
- * refused regardless of the
- * SO_REUSEADDR setting, so we break.
+ * If two streams are bound to the same IP
+ * address or both addr and bound source are
+ * wildcards (INADDR_ANY), we want to stop
+ * searching. We have found a match of IP
+ * source address and source port, which is
+ * refused regardless of the SO_REUSEADDR
+ * setting, so we break.
*/
- if (IN6_ARE_ADDR_EQUAL(laddr,
- &lconnp->conn_bound_addr_v6) &&
+ if (addrmatch &&
(ltcp->tcp_state == TCPS_LISTEN ||
ltcp->tcp_state == TCPS_BOUND))
break;
}
}
- if (ltcp != NULL) {
+ if (ltcp != NULL && !attempt_reuse) {
/* The port number is busy */
mutex_exit(&tbf->tf_lock);
} else {
+ if (attempt_reuse) {
+ int err;
+ struct tcp_rg_s *rg;
+
+ ASSERT(ltcp != NULL);
+ ASSERT(ltcp->tcp_rg_bind != NULL);
+ ASSERT(tcp->tcp_rg_bind != NULL);
+ ASSERT(ltcp->tcp_rg_bind != tcp->tcp_rg_bind);
+
+ err = tcp_rg_insert(ltcp->tcp_rg_bind, tcp);
+ if (err != 0) {
+ mutex_exit(&tbf->tf_lock);
+ return (0);
+ }
+ /*
+ * Now that the newly-binding socket has joined
+ * the existing reuseport group on ltcp, it
+ * should clean up its own (empty) group.
+ */
+ rg = tcp->tcp_rg_bind;
+ tcp->tcp_rg_bind = ltcp->tcp_rg_bind;
+ VERIFY(tcp_rg_remove(rg, tcp));
+ tcp_rg_destroy(rg);
+ }
+
/*
* This port is ours. Insert in fanout and mark as
* bound to prevent others from getting the port
@@ -946,3 +995,124 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
} while (++count < loopmax);
return (0);
}
+
+/* Max number of members in TCP SO_REUSEPORT group */
+#define TCP_RG_SIZE_MAX 64
+/* Step size when expanding members array */
+#define TCP_RG_SIZE_STEP 2
+
+
+tcp_rg_t *
+tcp_rg_init(tcp_t *tcp)
+{
+ tcp_rg_t *rg;
+ rg = kmem_alloc(sizeof (tcp_rg_t), KM_NOSLEEP_LAZY);
+ if (rg == NULL)
+ return (NULL);
+ rg->tcprg_members = kmem_zalloc(2 * sizeof (tcp_t *), KM_NOSLEEP_LAZY);
+ if (rg->tcprg_members == NULL) {
+ kmem_free(rg, sizeof (tcp_rg_t));
+ return (NULL);
+ }
+
+ mutex_init(&rg->tcprg_lock, NULL, MUTEX_DEFAULT, NULL);
+ rg->tcprg_size = 2;
+ rg->tcprg_count = 1;
+ rg->tcprg_active = 1;
+ rg->tcprg_members[0] = tcp;
+ return (rg);
+}
+
+void
+tcp_rg_destroy(tcp_rg_t *rg)
+{
+ mutex_enter(&rg->tcprg_lock);
+ ASSERT(rg->tcprg_count == 0);
+ ASSERT(rg->tcprg_active == 0);
+ kmem_free(rg->tcprg_members, rg->tcprg_size * sizeof (tcp_t *));
+ mutex_destroy(&rg->tcprg_lock);
+ kmem_free(rg, sizeof (struct tcp_rg_s));
+}
+
+static int
+tcp_rg_insert(tcp_rg_t *rg, tcp_t *tcp)
+{
+ mutex_enter(&rg->tcprg_lock);
+
+ VERIFY(rg->tcprg_size > 0);
+ VERIFY(rg->tcprg_count <= rg->tcprg_size);
+ if (rg->tcprg_count != 0) {
+ cred_t *oldcred = rg->tcprg_members[0]->tcp_connp->conn_cred;
+ cred_t *newcred = tcp->tcp_connp->conn_cred;
+
+ if (crgetuid(oldcred) != crgetuid(newcred) ||
+ crgetzoneid(oldcred) != crgetzoneid(newcred)) {
+ mutex_exit(&rg->tcprg_lock);
+ return (EPERM);
+ }
+ }
+
+ if (rg->tcprg_count == rg->tcprg_size) {
+ unsigned int oldalloc = rg->tcprg_size * sizeof (tcp_t *);
+ unsigned int newsize = rg->tcprg_size + TCP_RG_SIZE_STEP;
+ tcp_t **newmembers;
+
+ if (newsize > TCP_RG_SIZE_MAX) {
+ mutex_exit(&rg->tcprg_lock);
+ return (EINVAL);
+ }
+ newmembers = kmem_zalloc(newsize * sizeof (tcp_t *),
+ KM_NOSLEEP_LAZY);
+ if (newmembers == NULL) {
+ mutex_exit(&rg->tcprg_lock);
+ return (ENOMEM);
+ }
+ bcopy(rg->tcprg_members, newmembers, oldalloc);
+ kmem_free(rg->tcprg_members, oldalloc);
+ rg->tcprg_members = newmembers;
+ rg->tcprg_size = newsize;
+ }
+
+ rg->tcprg_members[rg->tcprg_count] = tcp;
+ rg->tcprg_count++;
+ rg->tcprg_active++;
+
+ mutex_exit(&rg->tcprg_lock);
+ return (0);
+}
+
+boolean_t
+tcp_rg_remove(tcp_rg_t *rg, tcp_t *tcp)
+{
+ int i;
+ boolean_t is_empty;
+
+ mutex_enter(&rg->tcprg_lock);
+ for (i = 0; i < rg->tcprg_count; i++) {
+ if (rg->tcprg_members[i] == tcp)
+ break;
+ }
+ /* The item should be present */
+ ASSERT(i < rg->tcprg_count);
+ /* Move the last member into this position */
+ rg->tcprg_count--;
+ rg->tcprg_members[i] = rg->tcprg_members[rg->tcprg_count];
+ rg->tcprg_members[rg->tcprg_count] = NULL;
+ if (tcp->tcp_connp->conn_reuseport != 0)
+ rg->tcprg_active--;
+ is_empty = (rg->tcprg_count == 0);
+ mutex_exit(&rg->tcprg_lock);
+ return (is_empty);
+}
+
+void
+tcp_rg_setactive(tcp_rg_t *rg, boolean_t is_active)
+{
+ mutex_enter(&rg->tcprg_lock);
+ if (is_active) {
+ rg->tcprg_active++;
+ } else {
+ rg->tcprg_active--;
+ }
+ mutex_exit(&rg->tcprg_lock);
+}
diff --git a/usr/src/uts/common/inet/tcp/tcp_input.c b/usr/src/uts/common/inet/tcp/tcp_input.c
index dd264528fc..22b0019a6a 100644
--- a/usr/src/uts/common/inet/tcp/tcp_input.c
+++ b/usr/src/uts/common/inet/tcp/tcp_input.c
@@ -5715,10 +5715,12 @@ noticmpv4:
switch (icmph->icmph_code) {
case ICMP_FRAGMENTATION_NEEDED:
/*
- * Update Path MTU, then try to send something out.
+ * Attempt to update path MTU and, if the MSS of the
+ * connection is altered, retransmit outstanding data.
*/
- tcp_update_pmtu(tcp, B_TRUE);
- tcp_rexmit_after_error(tcp);
+ if (tcp_update_pmtu(tcp, B_TRUE)) {
+ tcp_rexmit_after_error(tcp);
+ }
break;
case ICMP_PORT_UNREACHABLE:
case ICMP_PROTOCOL_UNREACHABLE:
@@ -5761,7 +5763,7 @@ noticmpv4:
break;
}
break;
- case ICMP_SOURCE_QUENCH: {
+ case ICMP_SOURCE_QUENCH:
/*
* use a global boolean to control
* whether TCP should respond to ICMP_SOURCE_QUENCH.
@@ -5786,7 +5788,6 @@ noticmpv4:
}
break;
}
- }
freemsg(mp);
}
@@ -5839,10 +5840,12 @@ noticmpv6:
switch (icmp6->icmp6_type) {
case ICMP6_PACKET_TOO_BIG:
/*
- * Update Path MTU, then try to send something out.
+ * Attempt to update path MTU and, if the MSS of the connection
+ * is altered, retransmit outstanding data.
*/
- tcp_update_pmtu(tcp, B_TRUE);
- tcp_rexmit_after_error(tcp);
+ if (tcp_update_pmtu(tcp, B_TRUE)) {
+ tcp_rexmit_after_error(tcp);
+ }
break;
case ICMP6_DST_UNREACH:
switch (icmp6->icmp6_code) {
diff --git a/usr/src/uts/common/inet/tcp/tcp_opt_data.c b/usr/src/uts/common/inet/tcp/tcp_opt_data.c
index 8687b52d53..15e49ae070 100644
--- a/usr/src/uts/common/inet/tcp/tcp_opt_data.c
+++ b/usr/src/uts/common/inet/tcp/tcp_opt_data.c
@@ -67,7 +67,8 @@ opdes_t tcp_opt_arr[] = {
{ SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
},
{ SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
-{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_REUSEPORT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
{ SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
@@ -505,6 +506,104 @@ tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
}
/*
+ * Set a TCP connection's participation in SO_REUSEPORT. This operation is
+ * performed under the protection of the squeue via tcp_setsockopt.
+ * The manipulation of tcp_rg_bind, as part of this operation, is subject to
+ * these constraints:
+ * 1. Prior to bind(), tcp_rg_bind can be set/cleared in tcp_set_reuseport
+ * under the protection of the squeue.
+ * 2. Once the connection has been bound, the tcp_rg_bind pointer must not be
+ * altered until such time as tcp_free() cleans up the connection.
+ * 3. A connection undergoing bind, which matches to a connection participating
+ * in port-reuse, will switch its tcp_rg_bind pointer when it joins the
+ * group of an existing connection in tcp_bindi().
+ */
+static int
+tcp_set_reuseport(conn_t *connp, boolean_t do_enable)
+{
+ tcp_t *tcp = connp->conn_tcp;
+ struct tcp_rg_s *rg;
+
+ if (!IPCL_IS_NONSTR(connp)) {
+ if (do_enable) {
+ /*
+ * SO_REUSEPORT cannot be enabled on sockets which have
+ * fallen back to the STREAMS API.
+ */
+ return (EINVAL);
+ } else {
+ /*
+ * A connection with SO_REUSEPORT enabled should be
+ * prevented from falling back to STREAMS mode via
+ * logic in tcp_fallback. It is legal, however, for
+ * fallen-back connections to affirm the disabled state
+ * of SO_REUSEPORT.
+ */
+ ASSERT(connp->conn_reuseport == 0);
+ return (0);
+ }
+ }
+ if (tcp->tcp_state <= TCPS_CLOSED) {
+ return (EINVAL);
+ }
+ if (connp->conn_reuseport == 0 && do_enable) {
+ /* disabled -> enabled */
+ if (tcp->tcp_rg_bind != NULL) {
+ tcp_rg_setactive(tcp->tcp_rg_bind, do_enable);
+ } else {
+ /*
+ * Connection state is not a concern when initially
+ * populating tcp_rg_bind. Setting it to non-NULL on a
+ * bound or listening connection would only mean that
+ * new reused-port binds become a possibility.
+ */
+ if ((rg = tcp_rg_init(tcp)) == NULL) {
+ return (ENOMEM);
+ }
+ tcp->tcp_rg_bind = rg;
+ }
+ connp->conn_reuseport = 1;
+ } else if (connp->conn_reuseport != 0 && !do_enable) {
+ /* enabled -> disabled */
+ ASSERT(tcp->tcp_rg_bind != NULL);
+ if (tcp->tcp_state == TCPS_IDLE) {
+ /*
+ * If the connection has not been bound yet, discard
+ * the reuse group state. Since disabling SO_REUSEPORT
+ * on a bound socket will _not_ prevent others from
+ * reusing the port, the presence of tcp_rg_bind is
+ * used to determine reuse availability, not
+ * conn_reuseport.
+ *
+ * This allows proper behavior for examples such as:
+ *
+ * setsockopt(fd1, ... SO_REUSEPORT, &on_val...);
+ * bind(fd1, &myaddr, ...);
+ * setsockopt(fd1, ... SO_REUSEPORT, &off_val...);
+ *
+ * setsockopt(fd2, ... SO_REUSEPORT, &on_val...);
+ * bind(fd2, &myaddr, ...); // <- SHOULD SUCCEED
+ *
+ */
+ rg = tcp->tcp_rg_bind;
+ tcp->tcp_rg_bind = NULL;
+ VERIFY(tcp_rg_remove(rg, tcp));
+ tcp_rg_destroy(rg);
+ } else {
+ /*
+ * If a connection has been bound, it's no longer safe
+ * to manipulate tcp_rg_bind until connection clean-up
+ * during tcp_free. Just mark the member status of the
+ * connection as inactive.
+ */
+ tcp_rg_setactive(tcp->tcp_rg_bind, do_enable);
+ }
+ connp->conn_reuseport = 0;
+ }
+ return (0);
+}
+
+/*
* We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements.
* Parameters are assumed to be verified by the caller.
*/
@@ -674,6 +773,11 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
}
*outlenp = inlen;
return (0);
+ case SO_REUSEPORT:
+ if (!checkonly) {
+ return (tcp_set_reuseport(connp, *i1 != 0));
+ }
+ return (0);
}
break;
case IPPROTO_TCP:
@@ -1031,10 +1135,6 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
}
break;
case IPPROTO_IP:
- if (connp->conn_family != AF_INET) {
- *outlenp = 0;
- return (EINVAL);
- }
switch (name) {
case IP_SEC_OPT:
/*
diff --git a/usr/src/uts/common/inet/tcp/tcp_socket.c b/usr/src/uts/common/inet/tcp/tcp_socket.c
index 9b6c0daac3..32422be675 100644
--- a/usr/src/uts/common/inet/tcp/tcp_socket.c
+++ b/usr/src/uts/common/inet/tcp/tcp_socket.c
@@ -1029,6 +1029,16 @@ tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
}
/*
+ * Do not allow fallback on connections making use of SO_REUSEPORT.
+ */
+ if (tcp->tcp_rg_bind != NULL) {
+ freeb(stropt_mp);
+ freeb(ordrel_mp);
+ squeue_synch_exit(connp, SQ_NODRAIN);
+ return (EINVAL);
+ }
+
+ /*
* Both endpoints must be of the same type (either STREAMS or
* non-STREAMS) for fusion to be enabled. So if we are fused,
* we have to unfuse.
diff --git a/usr/src/uts/common/inet/tcp/tcp_stats.c b/usr/src/uts/common/inet/tcp/tcp_stats.c
index e29c76a696..226467e167 100644
--- a/usr/src/uts/common/inet/tcp/tcp_stats.c
+++ b/usr/src/uts/common/inet/tcp/tcp_stats.c
@@ -21,8 +21,8 @@
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, Joyent Inc. All rights reserved.
* Copyright (c) 2015, 2016 by Delphix. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
* Copyright 2019 OmniOS Community Edition (OmniOSce) Association.
*/
@@ -131,9 +131,14 @@ tcp_set_conninfo(tcp_t *tcp, struct tcpConnEntryInfo_s *tcei, boolean_t ispriv)
tcei->ce_rto = tcp->tcp_rto;
tcei->ce_mss = tcp->tcp_mss;
tcei->ce_state = tcp->tcp_state;
- tcei->ce_rtt_sa = NSEC2USEC(tcp->tcp_rtt_sa >> 3);
tcei->ce_rtt_sum = NSEC2USEC(tcp->tcp_rtt_sum);
tcei->ce_rtt_cnt = tcp->tcp_rtt_cnt;
+
+ /* tcp_rtt_sa is stored as 8 times the average RTT */
+ tcei->ce_rtt_sa = NSEC2USEC(tcp->tcp_rtt_sa >> 3);
+
+ /* tcp_rtt_sd is stored as 4 times the average RTTVAR */
+ tcei->ce_rtt_sd = NSEC2USEC(tcp->tcp_rtt_sd >> 2);
}
/*
diff --git a/usr/src/uts/common/inet/tcp/tcp_timers.c b/usr/src/uts/common/inet/tcp/tcp_timers.c
index 5793a7fd27..7d9b449392 100644
--- a/usr/src/uts/common/inet/tcp/tcp_timers.c
+++ b/usr/src/uts/common/inet/tcp/tcp_timers.c
@@ -22,7 +22,7 @@
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright 2011 Joyent, Inc. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
* Copyright (c) 2014, 2017 by Delphix. All rights reserved.
*/