path: root/usr/src/uts/common/inet
Diffstat (limited to 'usr/src/uts/common/inet')
-rw-r--r--  usr/src/uts/common/inet/inet_hash.h               37
-rw-r--r--  usr/src/uts/common/inet/ip/conn_opt.c             22
-rw-r--r--  usr/src/uts/common/inet/ip/ip.c                   12
-rw-r--r--  usr/src/uts/common/inet/ip/ip_attr.c               5
-rw-r--r--  usr/src/uts/common/inet/ip/ip_squeue.c             2
-rw-r--r--  usr/src/uts/common/inet/ip/ipclassifier.c        163
-rw-r--r--  usr/src/uts/common/inet/ip/ipsecesp.c              3
-rw-r--r--  usr/src/uts/common/inet/ipclassifier.h             4
-rw-r--r--  usr/src/uts/common/inet/ipf/ip_fil_solaris.c     122
-rw-r--r--  usr/src/uts/common/inet/ipf/ipf.conf               5
-rw-r--r--  usr/src/uts/common/inet/ipf/netinet/ipf_stack.h   10
-rw-r--r--  usr/src/uts/common/inet/ipf/solaris.c              1
-rw-r--r--  usr/src/uts/common/inet/sockmods/datafilt.c      116
-rw-r--r--  usr/src/uts/common/inet/squeue.c                 100
-rw-r--r--  usr/src/uts/common/inet/tcp.h                     10
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp.c                 20
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp_bind.c           224
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp_input.c            4
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp_opt_data.c        86
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp_socket.c          11
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp_time_wait.c      668
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp_tunables.c         6
-rw-r--r--  usr/src/uts/common/inet/tcp_impl.h               103
-rw-r--r--  usr/src/uts/common/inet/udp/udp.c                165
-rw-r--r--  usr/src/uts/common/inet/udp/udp_opt_data.c         4
-rw-r--r--  usr/src/uts/common/inet/udp_impl.h                 7
26 files changed, 1440 insertions, 470 deletions
diff --git a/usr/src/uts/common/inet/inet_hash.h b/usr/src/uts/common/inet/inet_hash.h
new file mode 100644
index 0000000000..a790a797d1
--- /dev/null
+++ b/usr/src/uts/common/inet/inet_hash.h
@@ -0,0 +1,37 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Joyent, Inc.
+ */
+
+#ifndef _INET_INET_HASH_H
+#define _INET_INET_HASH_H
+
+/*
+ * Common packet hashing routines shared across MAC, UDP, and others.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define INET_PKT_HASH_L2 0x01
+#define INET_PKT_HASH_L3 0x02
+#define INET_PKT_HASH_L4 0x04
+
+extern uint64_t inet_pkt_hash(uint_t, mblk_t *, uint8_t);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _INET_INET_HASH_H */
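
The new header exposes one entry point, inet_pkt_hash(), plus flag bits
selecting which headers feed the hash. A minimal consumer sketch (not part of
this patch; DL_ETHER, the ring count, and the function name are illustrative
assumptions):

#include <sys/stream.h>
#include <sys/dlpi.h>
#include <inet/inet_hash.h>

/* Sketch: hash on the L3+L4 headers and pick one of nrings fanout rings. */
static uint_t
pick_ring(uint_t nrings, mblk_t *mp)
{
	uint64_t hv;

	hv = inet_pkt_hash(DL_ETHER, mp,
	    INET_PKT_HASH_L3 | INET_PKT_HASH_L4);
	return ((uint_t)(hv % nrings));
}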
diff --git a/usr/src/uts/common/inet/ip/conn_opt.c b/usr/src/uts/common/inet/ip/conn_opt.c
index bcbc1c4949..b4bff4d7b4 100644
--- a/usr/src/uts/common/inet/ip/conn_opt.c
+++ b/usr/src/uts/common/inet/ip/conn_opt.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -619,6 +620,9 @@ conn_opt_get(conn_opt_arg_t *coa, t_scalar_t level, t_scalar_t name,
case SO_REUSEADDR:
*i1 = connp->conn_reuseaddr ? SO_REUSEADDR : 0;
break; /* goto sizeof (int) option return */
+ case SO_REUSEPORT:
+ *i1 = connp->conn_reuseport;
+ break; /* goto sizeof (int) option return */
case SO_TYPE:
*i1 = connp->conn_so_type;
break; /* goto sizeof (int) option return */
@@ -1186,8 +1190,24 @@ conn_opt_set_ip(conn_opt_arg_t *coa, t_scalar_t name, uint_t inlen,
ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
int error;
- if (connp->conn_family != AF_INET)
+ if (connp->conn_family == AF_INET6 &&
+ connp->conn_ipversion == IPV4_VERSION) {
+ /*
+ * Allow certain IPv4 options to be set on an AF_INET6 socket
+ * if the connection is still IPv4.
+ */
+ switch (name) {
+ case IP_TOS:
+ case T_IP_TOS:
+ case IP_TTL:
+ case IP_DONTFRAG:
+ break;
+ default:
+ return (EINVAL);
+ }
+ } else if (connp->conn_family != AF_INET) {
return (EINVAL);
+ }
switch (name) {
case IP_TTL:
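
Seen from userland, the conn_opt_set_ip() change above means an AF_INET6
socket whose connection is still IPv4 (e.g. one using a v4-mapped address)
may now set the short list of IPv4-level options. A hedged sketch; the TOS
value is illustrative:

#include <sys/socket.h>
#include <netinet/in.h>

/* Previously this failed with EINVAL for any AF_INET6 socket. */
static int
set_v4_tos(int fd)
{
	int tos = 0x10;	/* IPTOS_LOWDELAY, for illustration */

	return (setsockopt(fd, IPPROTO_IP, IP_TOS, &tos, sizeof (tos)));
}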
diff --git a/usr/src/uts/common/inet/ip/ip.c b/usr/src/uts/common/inet/ip/ip.c
index f006e83a1f..73081b9c1c 100644
--- a/usr/src/uts/common/inet/ip/ip.c
+++ b/usr/src/uts/common/inet/ip/ip.c
@@ -12577,6 +12577,7 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
ip_ioctl_cmd_t *ipip = arg;
ip_extract_func_t *extract_funcp;
+ ill_t *ill;
cmd_info_t ci;
int err;
boolean_t entered_ipsq = B_FALSE;
@@ -12697,6 +12698,13 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
ipsq_current_start(ipsq, ci.ci_ipif, ipip->ipi_cmd);
/*
+ * We need to cache the ill_t that we're going to use as the argument
+ * to the ipif-ioctl DTrace probe (below) because the ci_ipif can be
+ * blown away by calling ipi_func.
+ */
+ ill = ci.ci_ipif == NULL ? NULL : ci.ci_ipif->ipif_ill;
+
+ /*
* A return value of EINPROGRESS means the ioctl is
* either queued and waiting for some reason or has
* already completed.
@@ -12704,9 +12712,7 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
err = (*ipip->ipi_func)(ci.ci_ipif, ci.ci_sin, q, mp, ipip, ci.ci_lifr);
DTRACE_PROBE4(ipif__ioctl, char *, "ip_process_ioctl finish WR",
- int, ipip->ipi_cmd,
- ill_t *, ci.ci_ipif == NULL ? NULL : ci.ci_ipif->ipif_ill,
- ipif_t *, ci.ci_ipif);
+ int, ipip->ipi_cmd, ill_t *, ill, ipif_t *, ci.ci_ipif);
ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq);
if (entered_ipsq)
diff --git a/usr/src/uts/common/inet/ip/ip_attr.c b/usr/src/uts/common/inet/ip/ip_attr.c
index 85ee142dfc..c350d67c2d 100644
--- a/usr/src/uts/common/inet/ip/ip_attr.c
+++ b/usr/src/uts/common/inet/ip/ip_attr.c
@@ -909,6 +909,11 @@ ixa_safe_copy(ip_xmit_attr_t *src, ip_xmit_attr_t *ixa)
*/
if (ixa->ixa_free_flags & IXA_FREE_CRED)
crhold(ixa->ixa_cred);
+
+ /*
+ * There is no cleanup in progress on this new copy.
+ */
+ ixa->ixa_tcpcleanup = IXATC_IDLE;
}
/*
diff --git a/usr/src/uts/common/inet/ip/ip_squeue.c b/usr/src/uts/common/inet/ip/ip_squeue.c
index 33a2fa5935..dedb4dadcc 100644
--- a/usr/src/uts/common/inet/ip/ip_squeue.c
+++ b/usr/src/uts/common/inet/ip/ip_squeue.c
@@ -163,7 +163,7 @@ ip_squeue_create(pri_t pri)
{
squeue_t *sqp;
- sqp = squeue_create(ip_squeue_worker_wait, pri);
+ sqp = squeue_create(ip_squeue_worker_wait, pri, B_TRUE);
ASSERT(sqp != NULL);
if (ip_squeue_create_callback != NULL)
ip_squeue_create_callback(sqp);
diff --git a/usr/src/uts/common/inet/ip/ipclassifier.c b/usr/src/uts/common/inet/ip/ipclassifier.c
index bc2173ff24..3a12e58c3a 100644
--- a/usr/src/uts/common/inet/ip/ipclassifier.c
+++ b/usr/src/uts/common/inet/ip/ipclassifier.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
/*
@@ -868,67 +869,91 @@ ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp)
mutex_exit(&(connfp)->connf_lock); \
}
-#define IPCL_HASH_INSERT_BOUND(connfp, connp) { \
- conn_t *pconnp = NULL, *nconnp; \
- IPCL_HASH_REMOVE((connp)); \
- mutex_enter(&(connfp)->connf_lock); \
- nconnp = (connfp)->connf_head; \
- while (nconnp != NULL && \
- !_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6)) { \
- pconnp = nconnp; \
- nconnp = nconnp->conn_next; \
- } \
- if (pconnp != NULL) { \
- pconnp->conn_next = (connp); \
- (connp)->conn_prev = pconnp; \
- } else { \
- (connfp)->connf_head = (connp); \
- } \
- if (nconnp != NULL) { \
- (connp)->conn_next = nconnp; \
- nconnp->conn_prev = (connp); \
- } \
- (connp)->conn_fanout = (connfp); \
- (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \
- IPCL_BOUND; \
- CONN_INC_REF(connp); \
- mutex_exit(&(connfp)->connf_lock); \
-}
+/*
+ * When inserting bound or wildcard entries into the hash, ordering rules are
+ * used to facilitate timely and correct lookups. The order is as follows:
+ * 1. Entries bound to a specific address
+ * 2. Entries bound to INADDR_ANY
+ * 3. Entries bound to ADDR_UNSPECIFIED
+ * Entries in a category which share conn_lport (such as those using
+ * SO_REUSEPORT) are ordered such that the most recently inserted is first.
+ */
-#define IPCL_HASH_INSERT_WILDCARD(connfp, connp) { \
- conn_t **list, *prev, *next; \
- boolean_t isv4mapped = \
- IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6); \
- IPCL_HASH_REMOVE((connp)); \
- mutex_enter(&(connfp)->connf_lock); \
- list = &(connfp)->connf_head; \
- prev = NULL; \
- while ((next = *list) != NULL) { \
- if (isv4mapped && \
- IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) && \
- connp->conn_zoneid == next->conn_zoneid) { \
- (connp)->conn_next = next; \
- if (prev != NULL) \
- prev = next->conn_prev; \
- next->conn_prev = (connp); \
- break; \
- } \
- list = &next->conn_next; \
- prev = next; \
- } \
- (connp)->conn_prev = prev; \
- *list = (connp); \
- (connp)->conn_fanout = (connfp); \
- (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \
- IPCL_BOUND; \
- CONN_INC_REF((connp)); \
- mutex_exit(&(connfp)->connf_lock); \
+void
+ipcl_hash_insert_bound(connf_t *connfp, conn_t *connp)
+{
+ conn_t *pconnp, *nconnp;
+
+ IPCL_HASH_REMOVE(connp);
+ mutex_enter(&connfp->connf_lock);
+ nconnp = connfp->connf_head;
+ pconnp = NULL;
+ while (nconnp != NULL) {
+ /*
+		 * Walk through entries associated with the fanout until one is
+ * found which fulfills any of these conditions:
+ * 1. Listen address of ADDR_ANY/ADDR_UNSPECIFIED
+ * 2. Listen port the same as connp
+ */
+ if (_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6) ||
+ connp->conn_lport == nconnp->conn_lport)
+ break;
+ pconnp = nconnp;
+ nconnp = nconnp->conn_next;
+ }
+ if (pconnp != NULL) {
+ pconnp->conn_next = connp;
+ connp->conn_prev = pconnp;
+ } else {
+ connfp->connf_head = connp;
+ }
+ if (nconnp != NULL) {
+ connp->conn_next = nconnp;
+ nconnp->conn_prev = connp;
+ }
+ connp->conn_fanout = connfp;
+ connp->conn_flags = (connp->conn_flags & ~IPCL_REMOVED) | IPCL_BOUND;
+ CONN_INC_REF(connp);
+ mutex_exit(&connfp->connf_lock);
}
void
ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
{
- IPCL_HASH_INSERT_WILDCARD(connfp, connp);
+ conn_t **list, *prev, *next;
+ conn_t *pconnp = NULL, *nconnp;
+ boolean_t isv4mapped = IN6_IS_ADDR_V4MAPPED(&connp->conn_laddr_v6);
+
+ IPCL_HASH_REMOVE(connp);
+ mutex_enter(&connfp->connf_lock);
+ nconnp = connfp->connf_head;
+ pconnp = NULL;
+ while (nconnp != NULL) {
+ if (IN6_IS_ADDR_V4MAPPED_ANY(&nconnp->conn_laddr_v6) &&
+ isv4mapped && connp->conn_lport == nconnp->conn_lport)
+ break;
+ if (IN6_IS_ADDR_UNSPECIFIED(&nconnp->conn_laddr_v6) &&
+ (isv4mapped ||
+ connp->conn_lport == nconnp->conn_lport))
+ break;
+
+ pconnp = nconnp;
+ nconnp = nconnp->conn_next;
+ }
+ if (pconnp != NULL) {
+ pconnp->conn_next = connp;
+ connp->conn_prev = pconnp;
+ } else {
+ connfp->connf_head = connp;
+ }
+ if (nconnp != NULL) {
+ connp->conn_next = nconnp;
+ nconnp->conn_prev = connp;
+ }
+ connp->conn_fanout = connfp;
+ connp->conn_flags = (connp->conn_flags & ~IPCL_REMOVED) | IPCL_BOUND;
+ CONN_INC_REF(connp);
+ mutex_exit(&connfp->connf_lock);
}
/*
@@ -1034,9 +1059,9 @@ ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) {
- IPCL_HASH_INSERT_WILDCARD(connfp, connp);
+ ipcl_hash_insert_wildcard(connfp, connp);
} else {
- IPCL_HASH_INSERT_BOUND(connfp, connp);
+ ipcl_hash_insert_bound(connfp, connp);
}
} else {
IPCL_HASH_INSERT_CONNECTED(connfp, connp);
@@ -1205,9 +1230,9 @@ ipcl_bind_insert_v4(conn_t *connp)
if (connp->conn_faddr_v4 != INADDR_ANY) {
IPCL_HASH_INSERT_CONNECTED(connfp, connp);
} else if (connp->conn_laddr_v4 != INADDR_ANY) {
- IPCL_HASH_INSERT_BOUND(connfp, connp);
+ ipcl_hash_insert_bound(connfp, connp);
} else {
- IPCL_HASH_INSERT_WILDCARD(connfp, connp);
+ ipcl_hash_insert_wildcard(connfp, connp);
}
if (protocol == IPPROTO_RSVP)
ill_set_inputfn_all(ipst);
@@ -1219,9 +1244,9 @@ ipcl_bind_insert_v4(conn_t *connp)
connfp = &ipst->ips_ipcl_bind_fanout[
IPCL_BIND_HASH(lport, ipst)];
if (connp->conn_laddr_v4 != INADDR_ANY) {
- IPCL_HASH_INSERT_BOUND(connfp, connp);
+ ipcl_hash_insert_bound(connfp, connp);
} else {
- IPCL_HASH_INSERT_WILDCARD(connfp, connp);
+ ipcl_hash_insert_wildcard(connfp, connp);
}
if (cl_inet_listen != NULL) {
ASSERT(connp->conn_ipversion == IPV4_VERSION);
@@ -1271,9 +1296,9 @@ ipcl_bind_insert_v6(conn_t *connp)
if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
IPCL_HASH_INSERT_CONNECTED(connfp, connp);
} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
- IPCL_HASH_INSERT_BOUND(connfp, connp);
+ ipcl_hash_insert_bound(connfp, connp);
} else {
- IPCL_HASH_INSERT_WILDCARD(connfp, connp);
+ ipcl_hash_insert_wildcard(connfp, connp);
}
break;
@@ -1283,9 +1308,9 @@ ipcl_bind_insert_v6(conn_t *connp)
connfp = &ipst->ips_ipcl_bind_fanout[
IPCL_BIND_HASH(lport, ipst)];
if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
- IPCL_HASH_INSERT_BOUND(connfp, connp);
+ ipcl_hash_insert_bound(connfp, connp);
} else {
- IPCL_HASH_INSERT_WILDCARD(connfp, connp);
+ ipcl_hash_insert_wildcard(connfp, connp);
}
if (cl_inet_listen != NULL) {
sa_family_t addr_family;
@@ -1416,9 +1441,9 @@ ipcl_conn_insert_v4(conn_t *connp)
if (connp->conn_faddr_v4 != INADDR_ANY) {
IPCL_HASH_INSERT_CONNECTED(connfp, connp);
} else if (connp->conn_laddr_v4 != INADDR_ANY) {
- IPCL_HASH_INSERT_BOUND(connfp, connp);
+ ipcl_hash_insert_bound(connfp, connp);
} else {
- IPCL_HASH_INSERT_WILDCARD(connfp, connp);
+ ipcl_hash_insert_wildcard(connfp, connp);
}
break;
}
@@ -1504,9 +1529,9 @@ ipcl_conn_insert_v6(conn_t *connp)
if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
IPCL_HASH_INSERT_CONNECTED(connfp, connp);
} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
- IPCL_HASH_INSERT_BOUND(connfp, connp);
+ ipcl_hash_insert_bound(connfp, connp);
} else {
- IPCL_HASH_INSERT_WILDCARD(connfp, connp);
+ ipcl_hash_insert_wildcard(connfp, connp);
}
break;
}
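
The ordering maintained by ipcl_hash_insert_bound() and
ipcl_hash_insert_wildcard() pays off at lookup time: a walk of the fanout can
stop at the first match and still prefer specific binds over wildcards. A
simplified sketch of such a walk (not the kernel's actual lookup code;
_IPCL_V4_MATCH_ANY is used as in the macros above):

/* First match wins, so specific binds must precede wildcards in the list. */
static conn_t *
fanout_lookup(connf_t *connfp, const in6_addr_t *dst, in_port_t lport)
{
	conn_t *connp;

	for (connp = connfp->connf_head; connp != NULL;
	    connp = connp->conn_next) {
		if (connp->conn_lport != lport)
			continue;
		if (IN6_ARE_ADDR_EQUAL(&connp->conn_laddr_v6, dst) ||
		    _IPCL_V4_MATCH_ANY(connp->conn_laddr_v6))
			return (connp);
	}
	return (NULL);
}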
diff --git a/usr/src/uts/common/inet/ip/ipsecesp.c b/usr/src/uts/common/inet/ip/ipsecesp.c
index c325e8dc26..2ca770ebe9 100644
--- a/usr/src/uts/common/inet/ip/ipsecesp.c
+++ b/usr/src/uts/common/inet/ip/ipsecesp.c
@@ -234,8 +234,7 @@ esp_kstat_init(ipsecesp_stack_t *espstack, netstackid_t stackid)
{
espstack->esp_ksp = kstat_create_netstack("ipsecesp", 0, "esp_stat",
"net", KSTAT_TYPE_NAMED,
- sizeof (esp_kstats_t) / sizeof (kstat_named_t),
- KSTAT_FLAG_PERSISTENT, stackid);
+ sizeof (esp_kstats_t) / sizeof (kstat_named_t), 0, stackid);
if (espstack->esp_ksp == NULL || espstack->esp_ksp->ks_data == NULL)
return (B_FALSE);
diff --git a/usr/src/uts/common/inet/ipclassifier.h b/usr/src/uts/common/inet/ipclassifier.h
index f6466434f6..c3139d9288 100644
--- a/usr/src/uts/common/inet/ipclassifier.h
+++ b/usr/src/uts/common/inet/ipclassifier.h
@@ -21,6 +21,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2015 Joyent, Inc.
*/
#ifndef _INET_IPCLASSIFIER_H
@@ -293,7 +294,8 @@ struct conn_s {
conn_ipv6_recvpathmtu : 1, /* IPV6_RECVPATHMTU */
conn_mcbc_bind : 1, /* Bound to multi/broadcast */
- conn_pad_to_bit_31 : 12;
+ conn_reuseport : 1, /* SO_REUSEPORT state */
+ conn_pad_to_bit_31 : 11;
boolean_t conn_blocked; /* conn is flow-controlled */
diff --git a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c
index f958ca2261..227d2075f8 100644
--- a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c
+++ b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c
@@ -83,6 +83,14 @@ static int ipf_hook6_loop_out __P((hook_event_token_t, hook_data_t,
static int ipf_hook6_loop_in __P((hook_event_token_t, hook_data_t,
void *));
static int ipf_hook6 __P((hook_data_t, int, int, void *));
+static int ipf_hookvndl3v4_in __P((hook_event_token_t, hook_data_t,
+ void *));
+static int ipf_hookvndl3v6_in __P((hook_event_token_t, hook_data_t,
+ void *));
+static int ipf_hookvndl3v4_out __P((hook_event_token_t, hook_data_t,
+ void *));
+static int ipf_hookvndl3v6_out __P((hook_event_token_t, hook_data_t,
+ void *));
extern int ipf_geniter __P((ipftoken_t *, ipfgeniter_t *, ipf_stack_t *));
extern int ipf_frruleiter __P((void *, int, void *, ipf_stack_t *));
@@ -152,6 +160,16 @@ char *hook6_loop_in_gz = "ipfilter_hook6_loop_in_gz";
char *hook6_loop_out = "ipfilter_hook6_loop_out";
char *hook6_loop_out_gz = "ipfilter_hook6_loop_out_gz";
+/* vnd IPv4/v6 hook names */
+char *hook4_vnd_in = "ipfilter_hookvndl3v4_in";
+char *hook4_vnd_in_gz = "ipfilter_hookvndl3v4_in_gz";
+char *hook6_vnd_in = "ipfilter_hookvndl3v6_in";
+char *hook6_vnd_in_gz = "ipfilter_hookvndl3v6_in_gz";
+char *hook4_vnd_out = "ipfilter_hookvndl3v4_out";
+char *hook4_vnd_out_gz = "ipfilter_hookvndl3v4_out_gz";
+char *hook6_vnd_out = "ipfilter_hookvndl3v6_out";
+char *hook6_vnd_out_gz = "ipfilter_hookvndl3v6_out_gz";
+
/* ------------------------------------------------------------------------ */
/* Function: ipldetach */
/* Returns: int - 0 == success, else error. */
@@ -248,6 +266,31 @@ ipf_stack_t *ifs;
ifs->ifs_ipf_ipv4 = NULL;
}
+ /*
+ * Remove VND hooks
+ */
+ if (ifs->ifs_ipf_vndl3v4 != NULL) {
+ UNDO_HOOK(ifs_ipf_vndl3v4, ifs_hookvndl3v4_physical_in,
+ NH_PHYSICAL_IN, ifs_ipfhookvndl3v4_in);
+ UNDO_HOOK(ifs_ipf_vndl3v4, ifs_hookvndl3v4_physical_out,
+ NH_PHYSICAL_OUT, ifs_ipfhookvndl3v4_out);
+
+ if (net_protocol_release(ifs->ifs_ipf_vndl3v4) != 0)
+ goto detach_failed;
+ ifs->ifs_ipf_vndl3v4 = NULL;
+ }
+
+ if (ifs->ifs_ipf_vndl3v6 != NULL) {
+ UNDO_HOOK(ifs_ipf_vndl3v6, ifs_hookvndl3v6_physical_in,
+ NH_PHYSICAL_IN, ifs_ipfhookvndl3v6_in);
+ UNDO_HOOK(ifs_ipf_vndl3v6, ifs_hookvndl3v6_physical_out,
+ NH_PHYSICAL_OUT, ifs_ipfhookvndl3v6_out);
+
+ if (net_protocol_release(ifs->ifs_ipf_vndl3v6) != 0)
+ goto detach_failed;
+ ifs->ifs_ipf_vndl3v6 = NULL;
+ }
+
#undef UNDO_HOOK
#ifdef IPFDEBUG
@@ -445,6 +488,48 @@ ipf_stack_t *ifs;
}
/*
+ * Add VND INET hooks
+ */
+ ifs->ifs_ipf_vndl3v4 = net_protocol_lookup(id, NHF_VND_INET);
+ if (ifs->ifs_ipf_vndl3v4 == NULL)
+ goto hookup_failed;
+
+ HOOK_INIT_GZ_BEFORE(ifs->ifs_ipfhookvndl3v4_in, ipf_hookvndl3v4_in,
+ hook4_vnd_in, hook4_vnd_in_gz, ifs);
+ HOOK_INIT_GZ_AFTER(ifs->ifs_ipfhookvndl3v4_out, ipf_hookvndl3v4_out,
+ hook4_vnd_out, hook4_vnd_out_gz, ifs);
+ ifs->ifs_hookvndl3v4_physical_in = (net_hook_register(ifs->ifs_ipf_vndl3v4,
+ NH_PHYSICAL_IN, ifs->ifs_ipfhookvndl3v4_in) == 0);
+ if (!ifs->ifs_hookvndl3v4_physical_in)
+ goto hookup_failed;
+
+ ifs->ifs_hookvndl3v4_physical_out = (net_hook_register(ifs->ifs_ipf_vndl3v4,
+ NH_PHYSICAL_OUT, ifs->ifs_ipfhookvndl3v4_out) == 0);
+ if (!ifs->ifs_hookvndl3v4_physical_out)
+ goto hookup_failed;
+
+
+ /*
+ * VND INET6 hooks
+ */
+ ifs->ifs_ipf_vndl3v6 = net_protocol_lookup(id, NHF_VND_INET6);
+ if (ifs->ifs_ipf_vndl3v6 == NULL)
+ goto hookup_failed;
+
+ HOOK_INIT_GZ_BEFORE(ifs->ifs_ipfhookvndl3v6_in, ipf_hookvndl3v6_in,
+ hook6_vnd_in, hook6_vnd_in_gz, ifs);
+ HOOK_INIT_GZ_AFTER(ifs->ifs_ipfhookvndl3v6_out, ipf_hookvndl3v6_out,
+ hook6_vnd_out, hook6_vnd_out_gz, ifs);
+ ifs->ifs_hookvndl3v6_physical_in = (net_hook_register(ifs->ifs_ipf_vndl3v6,
+ NH_PHYSICAL_IN, ifs->ifs_ipfhookvndl3v6_in) == 0);
+ if (!ifs->ifs_hookvndl3v6_physical_in)
+ goto hookup_failed;
+
+ ifs->ifs_hookvndl3v6_physical_out = (net_hook_register(ifs->ifs_ipf_vndl3v6,
+ NH_PHYSICAL_OUT, ifs->ifs_ipfhookvndl3v6_out) == 0);
+ if (!ifs->ifs_hookvndl3v6_physical_out)
+ goto hookup_failed;
+ /*
* Reacquire ipf_global, now it is safe.
*/
WRITE_ENTER(&ifs->ifs_ipf_global);
@@ -1011,7 +1096,6 @@ cred_t *cp;
return ENXIO;
unit = isp->ipfs_minor;
-
/*
* ipf_find_stack returns with a read lock on ifs_ipf_global
*/
@@ -2045,6 +2129,42 @@ int ipf_hook6_loop_out(hook_event_token_t token, hook_data_t info, void *arg)
}
/* ------------------------------------------------------------------------ */
+/* Function: ipf_hookvndl3_in */
+/* Returns: int - 0 == packet ok, else problem, free packet if not done */
+/* Parameters: event(I) - pointer to event */
+/* info(I) - pointer to hook information for firewalling */
+/* */
+/* The vnd hooks are private hooks to ON. They represent a layer 2 */
+/* datapath generally used to implement virtual machines. The driver sends */
+/* along L3 packets of either type IP or IPv6. The ethertype to distinguish */
+/* them is in the upper 16 bits while the remaining bits are the */
+/* traditional packet hook flags. */
+/* */
+/* They end up calling the appropriate traditional ip hooks. */
+/* ------------------------------------------------------------------------ */
+/*ARGSUSED*/
+int ipf_hookvndl3v4_in(hook_event_token_t token, hook_data_t info, void *arg)
+{
+ return ipf_hook4_in(token, info, arg);
+}
+
+int ipf_hookvndl3v6_in(hook_event_token_t token, hook_data_t info, void *arg)
+{
+ return ipf_hook6_in(token, info, arg);
+}
+
+/*ARGSUSED*/
+int ipf_hookvndl3v4_out(hook_event_token_t token, hook_data_t info, void *arg)
+{
+ return ipf_hook4_out(token, info, arg);
+}
+
+int ipf_hookvndl3v6_out(hook_event_token_t token, hook_data_t info, void *arg)
+{
+ return ipf_hook6_out(token, info, arg);
+}
+
+/* ------------------------------------------------------------------------ */
/* Function: ipf_hook4_loop_in */
/* Returns: int - 0 == packet ok, else problem, free packet if not done */
/* Parameters: event(I) - pointer to event */
diff --git a/usr/src/uts/common/inet/ipf/ipf.conf b/usr/src/uts/common/inet/ipf/ipf.conf
index 6b36f9fdbf..f49e024a72 100644
--- a/usr/src/uts/common/inet/ipf/ipf.conf
+++ b/usr/src/uts/common/inet/ipf/ipf.conf
@@ -1,3 +1,8 @@
#
#
name="ipf" parent="pseudo" instance=0;
+
+# Increase the state table limits. fr_statemax should be ~70% of fr_statesize,
+# and both should be prime numbers
+fr_statesize=151007;
+fr_statemax=113279;
diff --git a/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h b/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h
index a239f1c1ca..9aa2478c6a 100644
--- a/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h
+++ b/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h
@@ -125,6 +125,10 @@ struct ipf_stack {
hook_t *ifs_ipfhook6_loop_in;
hook_t *ifs_ipfhook6_loop_out;
hook_t *ifs_ipfhook6_nicevents;
+ hook_t *ifs_ipfhookvndl3v4_in;
+ hook_t *ifs_ipfhookvndl3v6_in;
+ hook_t *ifs_ipfhookvndl3v4_out;
+ hook_t *ifs_ipfhookvndl3v6_out;
/* flags to indicate whether hooks are registered. */
boolean_t ifs_hook4_physical_in;
@@ -137,10 +141,16 @@ struct ipf_stack {
boolean_t ifs_hook6_nic_events;
boolean_t ifs_hook6_loopback_in;
boolean_t ifs_hook6_loopback_out;
+ boolean_t ifs_hookvndl3v4_physical_in;
+ boolean_t ifs_hookvndl3v6_physical_in;
+ boolean_t ifs_hookvndl3v4_physical_out;
+ boolean_t ifs_hookvndl3v6_physical_out;
int ifs_ipf_loopback;
net_handle_t ifs_ipf_ipv4;
net_handle_t ifs_ipf_ipv6;
+ net_handle_t ifs_ipf_vndl3v4;
+ net_handle_t ifs_ipf_vndl3v6;
/* ip_auth.c */
int ifs_fr_authsize;
diff --git a/usr/src/uts/common/inet/ipf/solaris.c b/usr/src/uts/common/inet/ipf/solaris.c
index c541f4dddc..5d56debc31 100644
--- a/usr/src/uts/common/inet/ipf/solaris.c
+++ b/usr/src/uts/common/inet/ipf/solaris.c
@@ -625,7 +625,6 @@ ipf_stack_shutdown(const netid_t id, void *arg)
/*
* Destroy things for ipf for one stack.
*/
-/* ARGSUSED */
static void
ipf_stack_destroy_one(const netid_t id, ipf_stack_t *ifs)
{
diff --git a/usr/src/uts/common/inet/sockmods/datafilt.c b/usr/src/uts/common/inet/sockmods/datafilt.c
new file mode 100644
index 0000000000..6e1171de46
--- /dev/null
+++ b/usr/src/uts/common/inet/sockmods/datafilt.c
@@ -0,0 +1,116 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2012, OmniTI Computer Consulting, Inc. All rights reserved.
+ */
+
+/*
+ * This file implements a socket filter used to defer TCP connections.
+ * To defer a connection means to delay the return of accept(3SOCKET)
+ * until at least one byte is ready to be read(2). This filter may be
+ * applied automatically or programmatically through the use of
+ * soconfig(1M) and setsockopt(3SOCKET).
+ */
+
+#include <sys/kmem.h>
+#include <sys/systm.h>
+#include <sys/stropts.h>
+#include <sys/strsun.h>
+#include <sys/socketvar.h>
+#include <sys/sockfilter.h>
+#include <sys/note.h>
+#include <sys/taskq.h>
+
+#define DATAFILT_MODULE "datafilt"
+
+static struct modlmisc dataf_modlmisc = {
+ &mod_miscops,
+ "Kernel data-ready socket filter"
+};
+
+static struct modlinkage dataf_modlinkage = {
+ MODREV_1,
+ &dataf_modlmisc,
+ NULL
+};
+
+static sof_rval_t
+dataf_attach_passive_cb(sof_handle_t handle, sof_handle_t ph,
+ void *parg, struct sockaddr *laddr, socklen_t laddrlen,
+ struct sockaddr *faddr, socklen_t faddrlen, void **cookiep)
+{
+ _NOTE(ARGUNUSED(handle, ph, parg, laddr, laddrlen, faddr, faddrlen,
+ cookiep));
+ return (SOF_RVAL_DEFER);
+}
+
+static void
+dataf_detach_cb(sof_handle_t handle, void *cookie, cred_t *cr)
+{
+ _NOTE(ARGUNUSED(handle, cookie, cr));
+}
+
+static mblk_t *
+dataf_data_in_cb(sof_handle_t handle, void *cookie, mblk_t *mp, int flags,
+ size_t *lenp)
+{
+ _NOTE(ARGUNUSED(cookie, flags, lenp));
+
+ if (mp != NULL && MBLKL(mp) > 0) {
+ sof_newconn_ready(handle);
+ sof_bypass(handle);
+ }
+
+ return (mp);
+}
+
+static sof_ops_t dataf_ops = {
+ .sofop_attach_passive = dataf_attach_passive_cb,
+ .sofop_detach = dataf_detach_cb,
+ .sofop_data_in = dataf_data_in_cb
+};
+
+int
+_init(void)
+{
+ int err;
+
+ /*
+ * This module is safe to attach even after some preliminary socket
+ * setup calls have taken place. See the comment for SOF_ATT_SAFE.
+ */
+ err = sof_register(SOF_VERSION, DATAFILT_MODULE, &dataf_ops,
+ SOF_ATT_SAFE);
+ if (err != 0)
+ return (err);
+ if ((err = mod_install(&dataf_modlinkage)) != 0)
+ (void) sof_unregister(DATAFILT_MODULE);
+
+ return (err);
+}
+
+int
+_fini(void)
+{
+ int err;
+
+ if ((err = sof_unregister(DATAFILT_MODULE)) != 0)
+ return (err);
+
+ return (mod_remove(&dataf_modlinkage));
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&dataf_modlinkage, modinfop));
+}
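
For reference, a userland sketch of attaching this filter to a listener
before listen(3SOCKET), as an alternative to a soconfig(1M) entry. The
SOL_FILTER/FIL_ATTACH pairing is assumed from the socket-filter framework;
treat it as illustrative:

#include <sys/socket.h>

static int
attach_datafilt(int lsock)
{
	/* Defer accept() completion until the peer sends data. */
	return (setsockopt(lsock, SOL_FILTER, FIL_ATTACH,
	    "datafilt", sizeof ("datafilt")));
}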
diff --git a/usr/src/uts/common/inet/squeue.c b/usr/src/uts/common/inet/squeue.c
index 2e08dc359b..1009f0700f 100644
--- a/usr/src/uts/common/inet/squeue.c
+++ b/usr/src/uts/common/inet/squeue.c
@@ -23,7 +23,7 @@
*/
/*
- * Copyright 2012 Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Joyent, Inc. All rights reserved.
*/
/*
@@ -61,6 +61,10 @@
* connection are processed on that squeue. The connection ("conn") to
* squeue mapping is stored in "conn_t" member "conn_sqp".
*
+ * If the squeue is not related to TCP/IP, then the value of sqp->sq_isip is
+ * false and it will not have an associated conn_t, which means many aspects of
+ * the system, such as polling and switching squeues, will not be used.
+ *
* Since the processing of the connection cuts across multiple layers
 * but still allows packets for different connections to be processed on
* other CPU/squeues, squeues are also termed as "Vertical Perimeter" or
@@ -244,7 +248,7 @@ squeue_init(void)
/* ARGSUSED */
squeue_t *
-squeue_create(clock_t wait, pri_t pri)
+squeue_create(clock_t wait, pri_t pri, boolean_t isip)
{
squeue_t *sqp = kmem_cache_alloc(squeue_cache, KM_SLEEP);
@@ -260,11 +264,36 @@ squeue_create(clock_t wait, pri_t pri)
sqp->sq_enter = squeue_enter;
sqp->sq_drain = squeue_drain;
+ sqp->sq_isip = isip;
return (sqp);
}
/*
+ * We need to kill the threads and then clean up. We should VERIFY that
+ * polling is disabled so we don't have to worry about disassociating from
+ * MAC/IP/etc.
+ */
+void
+squeue_destroy(squeue_t *sqp)
+{
+ kt_did_t worker, poll;
+ mutex_enter(&sqp->sq_lock);
+ VERIFY(!(sqp->sq_state & (SQS_POLL_THR_QUIESCED |
+ SQS_POLL_QUIESCE_DONE | SQS_PAUSE | SQS_EXIT)));
+ worker = sqp->sq_worker->t_did;
+ poll = sqp->sq_poll_thr->t_did;
+ sqp->sq_state |= SQS_EXIT;
+ cv_signal(&sqp->sq_poll_cv);
+ cv_signal(&sqp->sq_worker_cv);
+ mutex_exit(&sqp->sq_lock);
+
+ thread_join(poll);
+ thread_join(worker);
+ kmem_cache_free(squeue_cache, sqp);
+}
+
+/*
* Bind squeue worker thread to the specified CPU, given by CPU id.
* If the CPU id value is -1, bind the worker thread to the value
* specified in sq_bind field. If a thread is already bound to a
@@ -475,18 +504,21 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
* Handle squeue switching. More details in the
* block comment at the top of the file
*/
- if (connp->conn_sqp == sqp) {
+ if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) {
SQUEUE_DBG_SET(sqp, mp, proc, connp,
tag);
- connp->conn_on_sqp = B_TRUE;
+ if (sqp->sq_isip == B_TRUE)
+ connp->conn_on_sqp = B_TRUE;
DTRACE_PROBE3(squeue__proc__start, squeue_t *,
sqp, mblk_t *, mp, conn_t *, connp);
(*proc)(connp, mp, sqp, ira);
DTRACE_PROBE2(squeue__proc__end, squeue_t *,
sqp, conn_t *, connp);
- connp->conn_on_sqp = B_FALSE;
+ if (sqp->sq_isip == B_TRUE) {
+ connp->conn_on_sqp = B_FALSE;
+ CONN_DEC_REF(connp);
+ }
SQUEUE_DBG_CLEAR(sqp);
- CONN_DEC_REF(connp);
} else {
SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc,
connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE);
@@ -513,7 +545,7 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
return;
}
} else {
- if (ira != NULL) {
+ if (sqp->sq_isip == B_TRUE && ira != NULL) {
mblk_t *attrmp;
ASSERT(cnt == 1);
@@ -587,7 +619,8 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
if (!(sqp->sq_state & SQS_REENTER) &&
(process_flag != SQ_FILL) && (sqp->sq_first == NULL) &&
(sqp->sq_run == curthread) && (cnt == 1) &&
- (connp->conn_on_sqp == B_FALSE)) {
+ (sqp->sq_isip == B_FALSE ||
+ connp->conn_on_sqp == B_FALSE)) {
sqp->sq_state |= SQS_REENTER;
mutex_exit(&sqp->sq_lock);
@@ -602,15 +635,21 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
* Handle squeue switching. More details in the
* block comment at the top of the file
*/
- if (connp->conn_sqp == sqp) {
- connp->conn_on_sqp = B_TRUE;
+ if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) {
+ SQUEUE_DBG_SET(sqp, mp, proc, connp,
+ tag);
+ if (sqp->sq_isip == B_TRUE)
+ connp->conn_on_sqp = B_TRUE;
DTRACE_PROBE3(squeue__proc__start, squeue_t *,
sqp, mblk_t *, mp, conn_t *, connp);
(*proc)(connp, mp, sqp, ira);
DTRACE_PROBE2(squeue__proc__end, squeue_t *,
sqp, conn_t *, connp);
- connp->conn_on_sqp = B_FALSE;
- CONN_DEC_REF(connp);
+ if (sqp->sq_isip == B_TRUE) {
+ connp->conn_on_sqp = B_FALSE;
+ CONN_DEC_REF(connp);
+ }
+ SQUEUE_DBG_CLEAR(sqp);
} else {
SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc,
connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE);
@@ -631,7 +670,7 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
#ifdef DEBUG
mp->b_tag = tag;
#endif
- if (ira != NULL) {
+ if (sqp->sq_isip && ira != NULL) {
mblk_t *attrmp;
ASSERT(cnt == 1);
@@ -779,7 +818,7 @@ again:
mp->b_prev = NULL;
/* Is there an ip_recv_attr_t to handle? */
- if (ip_recv_attr_is_mblk(mp)) {
+ if (sqp->sq_isip == B_TRUE && ip_recv_attr_is_mblk(mp)) {
mblk_t *attrmp = mp;
ASSERT(attrmp->b_cont != NULL);
@@ -804,20 +843,25 @@ again:
/*
- * Handle squeue switching. More details in the
- * block comment at the top of the file
+ * Handle squeue switching. More details in the block comment at
+	 * the top of the file. Non-IP squeues cannot switch, as there
+ * is no conn_t.
*/
- if (connp->conn_sqp == sqp) {
+ if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) {
SQUEUE_DBG_SET(sqp, mp, proc, connp,
mp->b_tag);
- connp->conn_on_sqp = B_TRUE;
+ if (sqp->sq_isip == B_TRUE)
+ connp->conn_on_sqp = B_TRUE;
DTRACE_PROBE3(squeue__proc__start, squeue_t *,
sqp, mblk_t *, mp, conn_t *, connp);
(*proc)(connp, mp, sqp, ira);
DTRACE_PROBE2(squeue__proc__end, squeue_t *,
sqp, conn_t *, connp);
- connp->conn_on_sqp = B_FALSE;
- CONN_DEC_REF(connp);
+ if (sqp->sq_isip == B_TRUE) {
+ connp->conn_on_sqp = B_FALSE;
+ CONN_DEC_REF(connp);
+ }
+ SQUEUE_DBG_CLEAR(sqp);
} else {
SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, connp, ira,
SQ_FILL, SQTAG_SQUEUE_CHANGE);
@@ -1051,6 +1095,11 @@ squeue_polling_thread(squeue_t *sqp)
cv_wait(async, lock);
CALLB_CPR_SAFE_END(&cprinfo, lock);
+ if (sqp->sq_state & SQS_EXIT) {
+ mutex_exit(lock);
+ thread_exit();
+ }
+
ctl_state = sqp->sq_state & (SQS_POLL_THR_CONTROL |
SQS_POLL_THR_QUIESCED);
if (ctl_state != 0) {
@@ -1076,6 +1125,9 @@ squeue_polling_thread(squeue_t *sqp)
(SQS_PROC|SQS_POLLING|SQS_GET_PKTS)) ==
(SQS_PROC|SQS_POLLING|SQS_GET_PKTS));
+ /* Only IP related squeues should reach this point */
+ VERIFY(sqp->sq_isip == B_TRUE);
+
poll_again:
sq_rx_ring = sqp->sq_rx_ring;
sq_get_pkts = sq_rx_ring->rr_rx;
@@ -1205,6 +1257,7 @@ squeue_worker_thr_control(squeue_t *sqp)
ill_rx_ring_t *rx_ring;
ASSERT(MUTEX_HELD(&sqp->sq_lock));
+ VERIFY(sqp->sq_isip == B_TRUE);
if (sqp->sq_state & SQS_POLL_RESTART) {
/* Restart implies a previous quiesce. */
@@ -1316,6 +1369,11 @@ squeue_worker(squeue_t *sqp)
for (;;) {
for (;;) {
+ if (sqp->sq_state & SQS_EXIT) {
+ mutex_exit(lock);
+ thread_exit();
+ }
+
/*
* If the poll thread has handed control to us
* we need to break out of the wait.
@@ -1412,6 +1470,7 @@ squeue_synch_enter(conn_t *connp, mblk_t *use_mp)
again:
sqp = connp->conn_sqp;
+ VERIFY(sqp->sq_isip == B_TRUE);
mutex_enter(&sqp->sq_lock);
if (sqp->sq_first == NULL && !(sqp->sq_state & SQS_PROC)) {
@@ -1487,6 +1546,7 @@ void
squeue_synch_exit(conn_t *connp)
{
squeue_t *sqp = connp->conn_sqp;
+ VERIFY(sqp->sq_isip == B_TRUE);
mutex_enter(&sqp->sq_lock);
if (sqp->sq_run == curthread) {
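
Taken together, the squeue changes allow a non-IP consumer to drive the full
squeue lifecycle without a conn_t. A hedged sketch (the wait and priority
values are illustrative):

squeue_t *sqp;

sqp = squeue_create(0, minclsyspri, B_FALSE);	/* sq_isip == B_FALSE */
/* ... enqueue mblks whose proc callbacks do not expect a conn_t ... */
squeue_destroy(sqp);	/* sets SQS_EXIT, joins worker and poll threads */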
diff --git a/usr/src/uts/common/inet/tcp.h b/usr/src/uts/common/inet/tcp.h
index b2b9973291..6ec2e6b2d7 100644
--- a/usr/src/uts/common/inet/tcp.h
+++ b/usr/src/uts/common/inet/tcp.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, Joyent, Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2014 by Delphix. All rights reserved.
*/
@@ -134,6 +134,7 @@ typedef struct tcphdra_s {
struct conn_s;
struct tcp_listen_cnt_s;
+struct tcp_rg_s;
/*
* Control structure for each open TCP stream,
@@ -404,6 +405,13 @@ typedef struct tcp_s {
struct tcp_s *tcp_bind_hash_port; /* tcp_t's bound to the same lport */
struct tcp_s **tcp_ptpbhn;
+ /*
+	 * Group of tcp_t entries bound to the same address and port via
+ * SO_REUSEPORT. The pointer itself is protected by tf_lock in the
+ * containing tcps_bind_fanout slot.
+ */
+ struct tcp_rg_s *tcp_rg_bind;
+
uint_t tcp_maxpsz_multiplier;
uint32_t tcp_lso_max; /* maximum LSO payload */
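
struct tcp_rg_s itself is defined in tcp_impl.h (changed by this patch but
not shown here). All five field names below appear in tcp_bind.c later in
this diff; the types and ordering are inferred:

typedef struct tcp_rg_s {
	kmutex_t	tcprg_lock;
	unsigned int	tcprg_size;	/* allocated tcprg_members slots */
	unsigned int	tcprg_count;	/* members bound into the group */
	unsigned int	tcprg_active;	/* members with SO_REUSEPORT set */
	struct tcp_s	**tcprg_members;
} tcp_rg_t;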
diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c
index fba7125690..cf046c968e 100644
--- a/usr/src/uts/common/inet/tcp/tcp.c
+++ b/usr/src/uts/common/inet/tcp/tcp.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, Joyent Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2013,2014 by Delphix. All rights reserved.
* Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved.
@@ -1423,6 +1423,21 @@ tcp_free(tcp_t *tcp)
tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind);
/*
+ * Destroy any association with SO_REUSEPORT group.
+ */
+ if (tcp->tcp_rg_bind != NULL) {
+ /*
+ * This is only necessary for connections which enabled
+ * SO_REUSEPORT but were never bound. Such connections should
+		 * be the one and only member of the tcp_rg_t with which they
+		 * have been associated.
+ */
+ VERIFY(tcp_rg_remove(tcp->tcp_rg_bind, tcp));
+ tcp_rg_destroy(tcp->tcp_rg_bind);
+ tcp->tcp_rg_bind = NULL;
+ }
+
+ /*
* If this is a non-STREAM socket still holding on to an upper
* handle, release it. As a result of fallback we might also see
* STREAMS based conns with upper handles, in which case there is
@@ -2054,8 +2069,7 @@ tcp_reinit(tcp_t *tcp)
* structure!
*/
static void
-tcp_reinit_values(tcp)
- tcp_t *tcp;
+tcp_reinit_values(tcp_t *tcp)
{
tcp_stack_t *tcps = tcp->tcp_tcps;
conn_t *connp = tcp->tcp_connp;
diff --git a/usr/src/uts/common/inet/tcp/tcp_bind.c b/usr/src/uts/common/inet/tcp/tcp_bind.c
index c6df39b91e..adc201eebb 100644
--- a/usr/src/uts/common/inet/tcp/tcp_bind.c
+++ b/usr/src/uts/common/inet/tcp/tcp_bind.c
@@ -22,6 +22,7 @@
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2013 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
#include <sys/types.h>
@@ -55,6 +56,7 @@ static uint32_t tcp_random_anon_port = 1;
static int tcp_bind_select_lport(tcp_t *, in_port_t *, boolean_t,
cred_t *cr);
static in_port_t tcp_get_next_priv_port(const tcp_t *);
+static int tcp_rg_insert(tcp_rg_t *, struct tcp_s *);
/*
* Hash list insertion routine for tcp_t structures. Each hash bucket
@@ -172,6 +174,16 @@ tcp_bind_hash_remove(tcp_t *tcp)
ASSERT(lockp != NULL);
mutex_enter(lockp);
+
+ /* destroy any association with SO_REUSEPORT group */
+ if (tcp->tcp_rg_bind != NULL) {
+ if (tcp_rg_remove(tcp->tcp_rg_bind, tcp)) {
+ /* Last one out turns off the lights */
+ tcp_rg_destroy(tcp->tcp_rg_bind);
+ }
+ tcp->tcp_rg_bind = NULL;
+ }
+
if (tcp->tcp_ptpbhn) {
tcpnext = tcp->tcp_bind_hash_port;
if (tcpnext != NULL) {
@@ -636,13 +648,12 @@ tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
}
/*
- * If the "bind_to_req_port_only" parameter is set, if the requested port
- * number is available, return it, If not return 0
+ * If the "bind_to_req_port_only" parameter is set and the requested port
+ * number is available, return it (else return 0).
*
- * If "bind_to_req_port_only" parameter is not set and
- * If the requested port number is available, return it. If not, return
- * the first anonymous port we happen across. If no anonymous ports are
- * available, return 0. addr is the requested local address, if any.
+ * If "bind_to_req_port_only" parameter is not set and the requested port
+ * number is available, return it. If not, return the first anonymous port we
+ * happen across. If no anonymous ports are available, return 0.
*
* In either case, when succeeding update the tcp_t to record the port number
* and insert it in the bind hash table.
@@ -662,6 +673,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
int loopmax;
conn_t *connp = tcp->tcp_connp;
tcp_stack_t *tcps = tcp->tcp_tcps;
+ boolean_t reuseport = connp->conn_reuseport;
/*
* Lookup for free addresses is done in a loop and "loopmax"
@@ -698,6 +710,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
tf_t *tbf;
tcp_t *ltcp;
conn_t *lconnp;
+ boolean_t attempt_reuse = B_FALSE;
lport = htons(port);
@@ -724,6 +737,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) {
boolean_t not_socket;
boolean_t exclbind;
+ boolean_t addrmatch;
lconnp = ltcp->tcp_connp;
@@ -829,22 +843,34 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
&lconnp->conn_faddr_v6)))
continue;
+ addrmatch = IN6_ARE_ADDR_EQUAL(laddr,
+ &lconnp->conn_bound_addr_v6);
+
+ if (addrmatch && reuseport && bind_to_req_port_only &&
+ (ltcp->tcp_state == TCPS_BOUND ||
+ ltcp->tcp_state == TCPS_LISTEN)) {
+ /*
+ * This entry is bound to the exact same
+ * address and port. If SO_REUSEPORT is set on
+ * the calling socket, attempt to reuse this
+ * binding if it too appears to be willing.
+ */
+ attempt_reuse = B_TRUE;
+ break;
+ }
+
if (!reuseaddr) {
/*
- * No socket option SO_REUSEADDR.
- * If existing port is bound to
- * a non-wildcard IP address
- * and the requesting stream is
- * bound to a distinct
- * different IP addresses
- * (non-wildcard, also), keep
- * going.
+ * No socket option SO_REUSEADDR. If an
+ * existing port is bound to a non-wildcard IP
+ * address and the requesting stream is bound
+			 * to a different IP address
+ * (non-wildcard, also), keep going.
*/
if (!V6_OR_V4_INADDR_ANY(*laddr) &&
!V6_OR_V4_INADDR_ANY(
lconnp->conn_bound_addr_v6) &&
- !IN6_ARE_ADDR_EQUAL(laddr,
- &lconnp->conn_bound_addr_v6))
+ !addrmatch)
continue;
if (ltcp->tcp_state >= TCPS_BOUND) {
/*
@@ -859,27 +885,47 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
* socket option SO_REUSEADDR is set on the
* binding tcp_t.
*
- * If two streams are bound to
- * same IP address or both addr
- * and bound source are wildcards
- * (INADDR_ANY), we want to stop
- * searching.
- * We have found a match of IP source
- * address and source port, which is
- * refused regardless of the
- * SO_REUSEADDR setting, so we break.
+ * If two streams are bound to the same IP
+ * address or both addr and bound source are
+ * wildcards (INADDR_ANY), we want to stop
+ * searching. We have found a match of IP
+ * source address and source port, which is
+ * refused regardless of the SO_REUSEADDR
+ * setting, so we break.
*/
- if (IN6_ARE_ADDR_EQUAL(laddr,
- &lconnp->conn_bound_addr_v6) &&
+ if (addrmatch &&
(ltcp->tcp_state == TCPS_LISTEN ||
ltcp->tcp_state == TCPS_BOUND))
break;
}
}
- if (ltcp != NULL) {
+ if (ltcp != NULL && !attempt_reuse) {
/* The port number is busy */
mutex_exit(&tbf->tf_lock);
} else {
+ if (attempt_reuse) {
+ int err;
+
+ ASSERT(ltcp != NULL);
+ ASSERT(ltcp->tcp_rg_bind != NULL);
+ ASSERT(tcp->tcp_rg_bind != NULL);
+ ASSERT(ltcp->tcp_rg_bind != tcp->tcp_rg_bind);
+
+ err = tcp_rg_insert(ltcp->tcp_rg_bind, tcp);
+ if (err != 0) {
+ mutex_exit(&tbf->tf_lock);
+ return (0);
+ }
+ /*
+ * Now that the newly-binding socket has joined
+ * the existing reuseport group on ltcp, it
+ * should clean up its own (empty) group.
+ */
+ VERIFY(tcp_rg_remove(tcp->tcp_rg_bind, tcp));
+ tcp_rg_destroy(tcp->tcp_rg_bind);
+ tcp->tcp_rg_bind = ltcp->tcp_rg_bind;
+ }
+
/*
* This port is ours. Insert in fanout and mark as
* bound to prevent others from getting the port
@@ -944,3 +990,125 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
} while (++count < loopmax);
return (0);
}
+
+/* Max number of members in TCP SO_REUSEPORT group */
+#define TCP_RG_SIZE_MAX 64
+/* Step size when expanding members array */
+#define TCP_RG_SIZE_STEP 2
+
+tcp_rg_t *
+tcp_rg_init(tcp_t *tcp)
+{
+ tcp_rg_t *rg;
+ rg = kmem_alloc(sizeof (tcp_rg_t), KM_NOSLEEP|KM_NORMALPRI);
+ if (rg == NULL)
+ return (NULL);
+ rg->tcprg_members = kmem_zalloc(2 * sizeof (tcp_t *),
+ KM_NOSLEEP|KM_NORMALPRI);
+ if (rg->tcprg_members == NULL) {
+ kmem_free(rg, sizeof (tcp_rg_t));
+ return (NULL);
+ }
+
+ mutex_init(&rg->tcprg_lock, NULL, MUTEX_DEFAULT, NULL);
+ rg->tcprg_size = 2;
+ rg->tcprg_count = 1;
+ rg->tcprg_active = 1;
+ rg->tcprg_members[0] = tcp;
+ return (rg);
+}
+
+void
+tcp_rg_destroy(tcp_rg_t *rg)
+{
+ mutex_enter(&rg->tcprg_lock);
+ ASSERT(rg->tcprg_count == 0);
+ ASSERT(rg->tcprg_active == 0);
+ kmem_free(rg->tcprg_members, rg->tcprg_size * sizeof (tcp_t *));
+ mutex_destroy(&rg->tcprg_lock);
+ kmem_free(rg, sizeof (struct tcp_rg_s));
+}
+
+static int
+tcp_rg_insert(tcp_rg_t *rg, tcp_t *tcp)
+{
+ mutex_enter(&rg->tcprg_lock);
+
+ VERIFY(rg->tcprg_size > 0);
+ VERIFY(rg->tcprg_count <= rg->tcprg_size);
+ if (rg->tcprg_count != 0) {
+ cred_t *oldcred = rg->tcprg_members[0]->tcp_connp->conn_cred;
+ cred_t *newcred = tcp->tcp_connp->conn_cred;
+
+ if (crgetuid(oldcred) != crgetuid(newcred) ||
+ crgetzoneid(oldcred) != crgetzoneid(newcred)) {
+ mutex_exit(&rg->tcprg_lock);
+ return (EPERM);
+ }
+ }
+
+ if (rg->tcprg_count == rg->tcprg_size) {
+ unsigned int oldalloc = rg->tcprg_size * sizeof (tcp_t *);
+ unsigned int newsize = rg->tcprg_size + TCP_RG_SIZE_STEP;
+ tcp_t **newmembers;
+
+ if (newsize > TCP_RG_SIZE_MAX) {
+ mutex_exit(&rg->tcprg_lock);
+ return (EINVAL);
+ }
+ newmembers = kmem_zalloc(newsize * sizeof (tcp_t *),
+ KM_NOSLEEP|KM_NORMALPRI);
+ if (newmembers == NULL) {
+ mutex_exit(&rg->tcprg_lock);
+ return (ENOMEM);
+ }
+ bcopy(rg->tcprg_members, newmembers, oldalloc);
+ kmem_free(rg->tcprg_members, oldalloc);
+ rg->tcprg_members = newmembers;
+ rg->tcprg_size = newsize;
+ }
+
+ rg->tcprg_members[rg->tcprg_count] = tcp;
+ rg->tcprg_count++;
+ rg->tcprg_active++;
+
+ mutex_exit(&rg->tcprg_lock);
+ return (0);
+}
+
+boolean_t
+tcp_rg_remove(tcp_rg_t *rg, tcp_t *tcp)
+{
+ int i;
+ boolean_t is_empty;
+
+ mutex_enter(&rg->tcprg_lock);
+ for (i = 0; i < rg->tcprg_count; i++) {
+ if (rg->tcprg_members[i] == tcp)
+ break;
+ }
+ /* The item should be present */
+ ASSERT(i < rg->tcprg_count);
+ /* Move the last member into this position */
+ rg->tcprg_count--;
+ rg->tcprg_members[i] = rg->tcprg_members[rg->tcprg_count];
+ rg->tcprg_members[rg->tcprg_count] = NULL;
+ if (tcp->tcp_connp->conn_reuseport != 0)
+ rg->tcprg_active--;
+ is_empty = (rg->tcprg_count == 0);
+ mutex_exit(&rg->tcprg_lock);
+ return (is_empty);
+}
+
+void
+tcp_rg_setactive(tcp_rg_t *rg, boolean_t is_active)
+{
+ mutex_enter(&rg->tcprg_lock);
+ if (is_active) {
+ rg->tcprg_active++;
+ } else {
+ rg->tcprg_active--;
+ }
+ mutex_exit(&rg->tcprg_lock);
+}
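
From userland, the group machinery above is driven entirely by enabling
SO_REUSEPORT before bind(3SOCKET). A sketch of one of two (or more)
cooperating listeners; the port and backlog are illustrative, and per
tcp_rg_insert() all members must share a uid and zone:

#include <sys/socket.h>
#include <netinet/in.h>
#include <strings.h>

static int
reuseport_listener(in_port_t port)
{
	struct sockaddr_in sin;
	int s, on = 1;

	if ((s = socket(AF_INET, SOCK_STREAM, 0)) < 0)
		return (-1);
	if (setsockopt(s, SOL_SOCKET, SO_REUSEPORT, &on, sizeof (on)) != 0)
		return (-1);
	bzero(&sin, sizeof (sin));
	sin.sin_family = AF_INET;
	sin.sin_port = htons(port);
	sin.sin_addr.s_addr = htonl(INADDR_ANY);
	if (bind(s, (struct sockaddr *)&sin, sizeof (sin)) != 0 ||
	    listen(s, 128) != 0)
		return (-1);
	return (s);	/* call again for each additional listener */
}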
diff --git a/usr/src/uts/common/inet/tcp/tcp_input.c b/usr/src/uts/common/inet/tcp/tcp_input.c
index cf8e0c6bd4..7cfdb9a4a2 100644
--- a/usr/src/uts/common/inet/tcp/tcp_input.c
+++ b/usr/src/uts/common/inet/tcp/tcp_input.c
@@ -22,7 +22,7 @@
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2011 Joyent, Inc. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
* Copyright (c) 2014 by Delphix. All rights reserved.
*/
@@ -99,7 +99,7 @@
* tcps_time_wait_interval since the period before upper layer closes the
* connection is not accounted for when tcp_time_wait_append() is called.
*
- * If uppser layer has closed the connection, call tcp_time_wait_append()
+ * If upper layer has closed the connection, call tcp_time_wait_append()
* directly.
*
*/
diff --git a/usr/src/uts/common/inet/tcp/tcp_opt_data.c b/usr/src/uts/common/inet/tcp/tcp_opt_data.c
index 1a5363bedc..835acd1b12 100644
--- a/usr/src/uts/common/inet/tcp/tcp_opt_data.c
+++ b/usr/src/uts/common/inet/tcp/tcp_opt_data.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
*/
#include <sys/types.h>
@@ -62,7 +63,8 @@ opdes_t tcp_opt_arr[] = {
{ SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
},
{ SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
-{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_REUSEPORT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
{ SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
@@ -483,6 +485,42 @@ tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
return (retval);
}
+static int
+tcp_set_reuseport(conn_t *connp, boolean_t do_enable)
+{
+ tcp_t *tcp = connp->conn_tcp;
+ struct tcp_rg_s *rg;
+
+ if (do_enable && !IPCL_IS_NONSTR(connp)) {
+ /*
+ * SO_REUSEPORT cannot be enabled on sockets which have fallen
+ * back to the STREAMS API.
+ */
+ return (EINVAL);
+ }
+ if (connp->conn_reuseport == 0 && do_enable) {
+ /* disabled -> enabled */
+ if (tcp->tcp_rg_bind != NULL) {
+ tcp_rg_setactive(tcp->tcp_rg_bind, do_enable);
+ } else {
+ if (tcp->tcp_state >= TCPS_BOUND ||
+ tcp->tcp_state <= TCPS_CLOSED)
+ return (EINVAL);
+ if ((rg = tcp_rg_init(tcp)) == NULL)
+ return (ENOMEM);
+ tcp->tcp_rg_bind = rg;
+ }
+ connp->conn_reuseport = 1;
+ } else if (connp->conn_reuseport != 0 && !do_enable) {
+ /* enabled -> disabled */
+ if (tcp->tcp_rg_bind != NULL) {
+ tcp_rg_setactive(tcp->tcp_rg_bind, do_enable);
+ }
+ connp->conn_reuseport = 0;
+ }
+ return (0);
+}
+
/*
* We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements.
* Parameters are assumed to be verified by the caller.
@@ -653,6 +691,11 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
}
*outlenp = inlen;
return (0);
+ case SO_REUSEPORT:
+ if (!checkonly) {
+ return (tcp_set_reuseport(connp, *i1 != 0));
+ }
+ return (0);
}
break;
case IPPROTO_TCP:
@@ -769,14 +812,37 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
if (*i1 == 0) {
return (EINVAL);
} else if (tcp->tcp_ka_rinterval == 0) {
- if ((tcp->tcp_ka_abort_thres / *i1) <
- tcp->tcp_rto_min ||
- (tcp->tcp_ka_abort_thres / *i1) >
- tcp->tcp_rto_max)
- return (EINVAL);
+ /*
+ * When TCP_KEEPCNT is specified without first
+ * specifying a TCP_KEEPINTVL, we infer an
+ * interval based on a tunable specific to our
+ * stack: the tcp_keepalive_abort_interval.
+ * (Or the TCP_KEEPALIVE_ABORT_THRESHOLD, in
+ * the unlikely event that that has been set.)
+ * Given the abort interval's default value of
+ * 480 seconds, low TCP_KEEPCNT values can
+ * result in intervals that exceed the default
+ * maximum RTO of 60 seconds. Rather than
+ * fail in these cases, we (implicitly) clamp
+ * the interval at the maximum RTO; if the
+ * TCP_KEEPCNT is shortly followed by a
+ * TCP_KEEPINTVL (as we expect), the abort
+ * threshold will be recalculated correctly --
+ * and if a TCP_KEEPINTVL is not forthcoming,
+ * keep-alive will at least operate reasonably
+ * given the underconfigured state.
+ */
+ uint32_t interval;
- tcp->tcp_ka_rinterval =
- tcp->tcp_ka_abort_thres / *i1;
+ interval = tcp->tcp_ka_abort_thres / *i1;
+
+ if (interval < tcp->tcp_rto_min)
+ interval = tcp->tcp_rto_min;
+
+ if (interval > tcp->tcp_rto_max)
+ interval = tcp->tcp_rto_max;
+
+ tcp->tcp_ka_rinterval = interval;
} else {
if ((*i1 * tcp->tcp_ka_rinterval) <
tcps->tcps_keepalive_abort_interval_low ||
@@ -953,10 +1019,6 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
}
break;
case IPPROTO_IP:
- if (connp->conn_family != AF_INET) {
- *outlenp = 0;
- return (EINVAL);
- }
switch (name) {
case IP_SEC_OPT:
/*
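
The TCP_KEEPCNT comment above recommends following it with an explicit
TCP_KEEPINTVL, so the abort threshold is recomputed from real values rather
than the clamped, inferred interval. A sketch of that ordering (the values
are illustrative):

#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

static int
tune_keepalive(int s)
{
	int cnt = 4, intvl = 10;	/* 4 probes, 10 seconds apart */

	if (setsockopt(s, IPPROTO_TCP, TCP_KEEPCNT,
	    &cnt, sizeof (cnt)) != 0)
		return (-1);
	return (setsockopt(s, IPPROTO_TCP, TCP_KEEPINTVL,
	    &intvl, sizeof (intvl)));
}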
diff --git a/usr/src/uts/common/inet/tcp/tcp_socket.c b/usr/src/uts/common/inet/tcp/tcp_socket.c
index a431bf63d1..8f535a5dd1 100644
--- a/usr/src/uts/common/inet/tcp/tcp_socket.c
+++ b/usr/src/uts/common/inet/tcp/tcp_socket.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
*/
/* This file contains all TCP kernel socket related functions. */
@@ -1022,6 +1023,16 @@ tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
}
/*
+ * Do not allow fallback on connections making use of SO_REUSEPORT.
+ */
+ if (tcp->tcp_rg_bind != NULL) {
+ freeb(stropt_mp);
+ freeb(ordrel_mp);
+ squeue_synch_exit(connp);
+ return (EINVAL);
+ }
+
+ /*
* Both endpoints must be of the same type (either STREAMS or
* non-STREAMS) for fusion to be enabled. So if we are fused,
* we have to unfuse.
diff --git a/usr/src/uts/common/inet/tcp/tcp_time_wait.c b/usr/src/uts/common/inet/tcp/tcp_time_wait.c
index b470934da0..6600296b18 100644
--- a/usr/src/uts/common/inet/tcp/tcp_time_wait.c
+++ b/usr/src/uts/common/inet/tcp/tcp_time_wait.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, Joyent Inc. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
*/
/*
@@ -41,13 +41,13 @@
#include <inet/tcp_impl.h>
#include <inet/tcp_cluster.h>
-static void tcp_timewait_close(void *, mblk_t *, void *, ip_recv_attr_t *);
+static void tcp_time_wait_purge(tcp_t *, tcp_squeue_priv_t *);
+
+#define TW_BUCKET(t) \
+ (((t) / MSEC_TO_TICK(TCP_TIME_WAIT_DELAY)) % TCP_TIME_WAIT_BUCKETS)
+
+#define TW_BUCKET_NEXT(b) (((b) + 1) % TCP_TIME_WAIT_BUCKETS)
-/*
- * TCP_TIME_WAIT_DELAY governs how often the time_wait_collector runs.
- * Running it every 5 seconds seems to give the best results.
- */
-#define TCP_TIME_WAIT_DELAY ((hrtime_t)5 * NANOSEC)
/*
* Remove a connection from the list of detached TIME_WAIT connections.
@@ -56,17 +56,17 @@ static void tcp_timewait_close(void *, mblk_t *, void *, ip_recv_attr_t *);
* earlier call to tcp_time_wait_remove(); otherwise it returns B_TRUE.
*/
boolean_t
-tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait)
+tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tsp)
{
boolean_t locked = B_FALSE;
- if (tcp_time_wait == NULL) {
- tcp_time_wait = *((tcp_squeue_priv_t **)
+ if (tsp == NULL) {
+ tsp = *((tcp_squeue_priv_t **)
squeue_getprivate(tcp->tcp_connp->conn_sqp, SQPRIVATE_TCP));
- mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
+ mutex_enter(&tsp->tcp_time_wait_lock);
locked = B_TRUE;
} else {
- ASSERT(MUTEX_HELD(&tcp_time_wait->tcp_time_wait_lock));
+ ASSERT(MUTEX_HELD(&tsp->tcp_time_wait_lock));
}
/* 0 means that the tcp_t has not been added to the time wait list. */
@@ -74,40 +74,34 @@ tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait)
ASSERT(tcp->tcp_time_wait_next == NULL);
ASSERT(tcp->tcp_time_wait_prev == NULL);
if (locked)
- mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
+ mutex_exit(&tsp->tcp_time_wait_lock);
return (B_FALSE);
}
ASSERT(TCP_IS_DETACHED(tcp));
ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
+ ASSERT(tsp->tcp_time_wait_cnt > 0);
- if (tcp == tcp_time_wait->tcp_time_wait_head) {
- ASSERT(tcp->tcp_time_wait_prev == NULL);
- tcp_time_wait->tcp_time_wait_head = tcp->tcp_time_wait_next;
- if (tcp_time_wait->tcp_time_wait_head != NULL) {
- tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev =
- NULL;
- } else {
- tcp_time_wait->tcp_time_wait_tail = NULL;
- }
- } else if (tcp == tcp_time_wait->tcp_time_wait_tail) {
- ASSERT(tcp->tcp_time_wait_next == NULL);
- tcp_time_wait->tcp_time_wait_tail = tcp->tcp_time_wait_prev;
- ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL);
- tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = NULL;
- } else {
- ASSERT(tcp->tcp_time_wait_prev->tcp_time_wait_next == tcp);
- ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == tcp);
- tcp->tcp_time_wait_prev->tcp_time_wait_next =
- tcp->tcp_time_wait_next;
+ if (tcp->tcp_time_wait_next != NULL) {
tcp->tcp_time_wait_next->tcp_time_wait_prev =
tcp->tcp_time_wait_prev;
}
+ if (tcp->tcp_time_wait_prev != NULL) {
+ tcp->tcp_time_wait_prev->tcp_time_wait_next =
+ tcp->tcp_time_wait_next;
+ } else {
+ unsigned int bucket;
+
+ bucket = TW_BUCKET(tcp->tcp_time_wait_expire);
+ ASSERT(tsp->tcp_time_wait_bucket[bucket] == tcp);
+ tsp->tcp_time_wait_bucket[bucket] = tcp->tcp_time_wait_next;
+ }
tcp->tcp_time_wait_next = NULL;
tcp->tcp_time_wait_prev = NULL;
tcp->tcp_time_wait_expire = 0;
+ tsp->tcp_time_wait_cnt--;
if (locked)
- mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
+ mutex_exit(&tsp->tcp_time_wait_lock);
return (B_TRUE);
}
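
A worked example of the TW_BUCKET math above, assuming illustrative values of
TCP_TIME_WAIT_DELAY = 1000 (ms) and TCP_TIME_WAIT_BUCKETS = 16 (the real
constants live in tcp_impl.h, which this patch changes but which is not shown
here), with hz = 100 so that MSEC_TO_TICK(1000) == 100:

	TW_BUCKET(12345)   == (12345 / 100) % 16 == 123 % 16 == 11
	TW_BUCKET_NEXT(11) == 12

Connections expiring within the same delay window therefore land in the same
bucket and are reaped by a single pass of the expiration timer.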
@@ -126,6 +120,7 @@ tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait)
((x)->tcp_connp->conn_ipversion == IPV6_VERSION && \
IN6_IS_ADDR_LOOPBACK(&(x)->tcp_connp->conn_laddr_v6)))
+
/*
* Add a connection to the list of detached TIME_WAIT connections
* and set its time to expire.
@@ -135,9 +130,10 @@ tcp_time_wait_append(tcp_t *tcp)
{
tcp_stack_t *tcps = tcp->tcp_tcps;
squeue_t *sqp = tcp->tcp_connp->conn_sqp;
- tcp_squeue_priv_t *tcp_time_wait =
+ tcp_squeue_priv_t *tsp =
*((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
- hrtime_t firetime = 0;
+ int64_t now, schedule;
+ unsigned int bucket;
tcp_timers_stop(tcp);
@@ -146,6 +142,8 @@ tcp_time_wait_append(tcp_t *tcp)
ASSERT(tcp->tcp_ack_tid == 0);
/* must have happened at the time of detaching the tcp */
+ ASSERT(TCP_IS_DETACHED(tcp));
+ ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
ASSERT(tcp->tcp_ptpahn == NULL);
ASSERT(tcp->tcp_flow_stopped == 0);
ASSERT(tcp->tcp_time_wait_next == NULL);
@@ -153,97 +151,112 @@ tcp_time_wait_append(tcp_t *tcp)
ASSERT(tcp->tcp_time_wait_expire == 0);
ASSERT(tcp->tcp_listener == NULL);
- tcp->tcp_time_wait_expire = ddi_get_lbolt64();
- if (IS_LOCAL_HOST(tcp)) {
- /*
- * This is the fastpath for handling localhost connections.
- * Since we don't have to worry about packets on the localhost
- * showing up after a long network delay, we want to expire
- * these quickly so the port range on the localhost doesn't
- * get starved by short-running, local apps.
- *
- * Leave tcp_time_wait_expire at the current time. This
- * essentially means the connection is expired now and it will
- * clean up the next time tcp_time_wait_collector runs. We set
- * firetime to use a short delay so that if we have to start a
- * tcp_time_wait_collector thread below, it runs soon instead
- * of after a delay of time_wait_interval. firetime being set
- * to a non-0 value is also our indicator that we should add
- * this connection to the head of the time wait list (since we
- * are already expired) so that its sure to get cleaned up on
- * the next run of tcp_time_wait_collector (which expects the
- * entries to appear in time-order and stops when it hits the
- * first non-expired entry).
- */
- firetime = TCP_TIME_WAIT_DELAY;
- } else {
- /*
- * Since tcp_time_wait_expire is lbolt64, it should not wrap
- * around in practice. Hence it cannot be 0. Note that zero
- * means that the tcp_t is not in the TIME_WAIT list.
- */
- tcp->tcp_time_wait_expire += MSEC_TO_TICK(
- tcps->tcps_time_wait_interval);
+ TCP_DBGSTAT(tcps, tcp_time_wait);
+ mutex_enter(&tsp->tcp_time_wait_lock);
+
+ /*
+ * Immediately expire loopback connections. Since there is no worry
+ * about packets on the local host showing up after a long network
+ * delay, this is safe and allows much higher rates of connection churn
+ * for applications operating locally.
+ *
+ * This typically bypasses the tcp_free_list fast path due to squeue
+ * re-entry for the loopback close operation.
+ */
+ if (tcp->tcp_loopback) {
+ tcp_time_wait_purge(tcp, tsp);
+ mutex_exit(&tsp->tcp_time_wait_lock);
+ return;
}
- ASSERT(TCP_IS_DETACHED(tcp));
- ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
- ASSERT(tcp->tcp_time_wait_next == NULL);
- ASSERT(tcp->tcp_time_wait_prev == NULL);
- TCP_DBGSTAT(tcps, tcp_time_wait);
+ /*
+ * In order to reap TIME_WAITs reliably, we should use a source of time
+ * that is not adjustable by the user. While it would be more accurate
+ * to grab this timestamp before (potentially) sleeping on the
+ * tcp_time_wait_lock, doing so complicates bucket addressing later.
+ */
+ now = ddi_get_lbolt64();
- mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
- if (tcp_time_wait->tcp_time_wait_head == NULL) {
- ASSERT(tcp_time_wait->tcp_time_wait_tail == NULL);
- tcp_time_wait->tcp_time_wait_head = tcp;
+ /*
+ * Each squeue uses an arbitrary time offset when scheduling
+ * expiration timers. This prevents the bucketing from forcing
+	 * tcp_time_wait_collector to run in lock-step across squeues.
+ *
+ * This offset is (re)initialized when a new TIME_WAIT connection is
+	 * added to a squeue which has no connections waiting to expire.
+ */
+ if (tsp->tcp_time_wait_tid == 0) {
+ ASSERT(tsp->tcp_time_wait_cnt == 0);
+ tsp->tcp_time_wait_offset =
+ now % MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
+ }
+ now -= tsp->tcp_time_wait_offset;
+
+ /*
+ * Use the netstack-defined timeout, rounded up to the minimum
+ * time_wait_collector interval.
+ */
+ schedule = now + MSEC_TO_TICK(tcps->tcps_time_wait_interval);
+ tcp->tcp_time_wait_expire = schedule;
+
+ /*
+ * Append the connection into the appropriate bucket.
+ */
+ bucket = TW_BUCKET(tcp->tcp_time_wait_expire);
+ tcp->tcp_time_wait_next = tsp->tcp_time_wait_bucket[bucket];
+ tsp->tcp_time_wait_bucket[bucket] = tcp;
+ if (tcp->tcp_time_wait_next != NULL) {
+ ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == NULL);
+ tcp->tcp_time_wait_next->tcp_time_wait_prev = tcp;
+ }
+ tsp->tcp_time_wait_cnt++;
+
+ /*
+ * Round delay up to the nearest bucket boundary.
+ */
+ schedule += MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
+ schedule -= schedule % MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
+
+ /*
+ * The newly inserted entry may require a tighter schedule for the
+ * expiration timer.
+ */
+ if (schedule < tsp->tcp_time_wait_schedule) {
+ callout_id_t old_tid = tsp->tcp_time_wait_tid;
+
+ tsp->tcp_time_wait_schedule = schedule;
+ tsp->tcp_time_wait_tid =
+ timeout_generic(CALLOUT_NORMAL,
+ tcp_time_wait_collector, sqp,
+ TICK_TO_NSEC(schedule - now),
+ CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
/*
- * Even if the list was empty before, there may be a timer
- * running since a tcp_t can be removed from the list
- * in other places, such as tcp_clean_death(). So check if
- * a timer is needed.
- */
- if (tcp_time_wait->tcp_time_wait_tid == 0) {
- if (firetime == 0)
- firetime = (hrtime_t)
- (tcps->tcps_time_wait_interval + 1) *
- MICROSEC;
-
- tcp_time_wait->tcp_time_wait_tid =
- timeout_generic(CALLOUT_NORMAL,
- tcp_time_wait_collector, sqp, firetime,
- CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
- }
- tcp_time_wait->tcp_time_wait_tail = tcp;
- } else {
- /*
- * The list is not empty, so a timer must be running. If not,
- * tcp_time_wait_collector() must be running on this
- * tcp_time_wait list at the same time.
+ * It is possible for the timer to fire before the untimeout
+ * action is able to complete. In that case, the exclusion
+ * offered by the tcp_time_wait_collector_active flag will
+ * prevent multiple collector threads from processing records
+ * simultaneously from the same squeue.
*/
- ASSERT(tcp_time_wait->tcp_time_wait_tid != 0 ||
- tcp_time_wait->tcp_time_wait_running);
- ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL);
- ASSERT(tcp_time_wait->tcp_time_wait_tail->tcp_state ==
- TCPS_TIME_WAIT);
-
- if (firetime == 0) {
- /* add at end */
- tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next =
- tcp;
- tcp->tcp_time_wait_prev =
- tcp_time_wait->tcp_time_wait_tail;
- tcp_time_wait->tcp_time_wait_tail = tcp;
- } else {
- /* add at head */
- tcp->tcp_time_wait_next =
- tcp_time_wait->tcp_time_wait_head;
- tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev =
- tcp;
- tcp_time_wait->tcp_time_wait_head = tcp;
- }
+ mutex_exit(&tsp->tcp_time_wait_lock);
+ (void) untimeout_default(old_tid, 0);
+ return;
+ }
+
+ /*
+ * Start a fresh timer if none exists.
+ */
+ if (tsp->tcp_time_wait_schedule == 0) {
+ ASSERT(tsp->tcp_time_wait_tid == 0);
+
+ tsp->tcp_time_wait_schedule = schedule;
+ tsp->tcp_time_wait_tid =
+ timeout_generic(CALLOUT_NORMAL,
+ tcp_time_wait_collector, sqp,
+ TICK_TO_NSEC(schedule - now),
+ CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
}
- mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
+ mutex_exit(&tsp->tcp_time_wait_lock);
}
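The bucket-boundary rounding above is easier to follow with concrete numbers. A minimal sketch, assuming a hypothetical tick rate at which MSEC_TO_TICK(TCP_TIME_WAIT_DELAY) comes out to 500 ticks:

	/* Worked example of the rounding, using hypothetical tick values. */
	static int64_t
	tw_round_sketch(void)
	{
		int64_t delay = 500;		/* MSEC_TO_TICK(TCP_TIME_WAIT_DELAY) */
		int64_t schedule = 1234 + 6000;	/* now + interval = 7234 */

		schedule += delay;		/* 7734 */
		schedule -= schedule % delay;	/* 7500, end of bucket [7000, 7500) */
		return (schedule);
	}

Rounding the timer up to the end of the bucket holding the raw expiration guarantees that, by the time tcp_time_wait_collector fires for that bucket, every entry indexed into it has already expired.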
/*
@@ -278,216 +291,287 @@ tcp_timewait_close(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
tcp_close_detached(tcp);
}
+
+static void
+tcp_time_wait_purge(tcp_t *tcp, tcp_squeue_priv_t *tsp)
+{
+ mblk_t *mp;
+ conn_t *connp = tcp->tcp_connp;
+ kmutex_t *lock;
+
+ ASSERT(MUTEX_HELD(&tsp->tcp_time_wait_lock));
+ ASSERT(connp->conn_fanout != NULL);
+
+ lock = &connp->conn_fanout->connf_lock;
+
+ /*
+	 * This is essentially a TIME_WAIT reclaim fast path optimization for
+	 * performance: the connection is checked under the fanout lock (so
+	 * that no one else can get access to the conn_t) to verify that the
+	 * refcnt is 2, one each for TCP and the classifier hash list. If that
+	 * is the case and clustering callbacks are not enabled, the conn can
+	 * be removed under the fanout lock, avoiding clean-up under the
+	 * squeue.
+	 *
+	 * This optimization is forgone when clustering is enabled, since the
+	 * clustering callback must be made before setting the CONDEMNED flag
+	 * and after dropping all locks.
+ *
+ * See the comments in tcp_closei_local for additional information
+ * regarding the refcnt logic.
+ */
+ if (mutex_tryenter(lock)) {
+ mutex_enter(&connp->conn_lock);
+ if (connp->conn_ref == 2 && cl_inet_disconnect == NULL) {
+ ipcl_hash_remove_locked(connp, connp->conn_fanout);
+ /*
+ * Set the CONDEMNED flag now itself so that the refcnt
+ * cannot increase due to any walker.
+ */
+ connp->conn_state_flags |= CONN_CONDEMNED;
+ mutex_exit(&connp->conn_lock);
+ mutex_exit(lock);
+ if (tsp->tcp_free_list_cnt < tcp_free_list_max_cnt) {
+ /*
+ * Add to head of tcp_free_list
+ */
+ tcp_cleanup(tcp);
+ ASSERT(connp->conn_latch == NULL);
+ ASSERT(connp->conn_policy == NULL);
+ ASSERT(tcp->tcp_tcps == NULL);
+ ASSERT(connp->conn_netstack == NULL);
+
+ tcp->tcp_time_wait_next = tsp->tcp_free_list;
+ tcp->tcp_in_free_list = B_TRUE;
+ tsp->tcp_free_list = tcp;
+ tsp->tcp_free_list_cnt++;
+ } else {
+ /*
+ * Do not add to tcp_free_list
+ */
+ tcp_bind_hash_remove(tcp);
+ ixa_cleanup(tcp->tcp_connp->conn_ixa);
+ tcp_ipsec_cleanup(tcp);
+ CONN_DEC_REF(tcp->tcp_connp);
+ }
+
+ /*
+ * With the fast-path complete, we can bail.
+ */
+ return;
+ } else {
+ /*
+ * Fall back to slow path.
+ */
+ CONN_INC_REF_LOCKED(connp);
+ mutex_exit(&connp->conn_lock);
+ mutex_exit(lock);
+ }
+ } else {
+ CONN_INC_REF(connp);
+ }
+
+ /*
+ * We can reuse the closemp here since conn has detached (otherwise we
+	 * wouldn't even be in the time_wait list). It is safe to change
+ * tcp_closemp_used without taking a lock as no other thread can
+ * concurrently access it at this point in the connection lifecycle.
+ */
+ if (tcp->tcp_closemp.b_prev == NULL) {
+ tcp->tcp_closemp_used = B_TRUE;
+ } else {
+ cmn_err(CE_PANIC,
+ "tcp_timewait_collector: concurrent use of tcp_closemp: "
+ "connp %p tcp %p\n", (void *)connp, (void *)tcp);
+ }
+
+ TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
+ mp = &tcp->tcp_closemp;
+ mutex_exit(&tsp->tcp_time_wait_lock);
+ SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_timewait_close, connp, NULL,
+ SQ_FILL, SQTAG_TCP_TIMEWAIT);
+ mutex_enter(&tsp->tcp_time_wait_lock);
+}
+
/*
- * Blows away all tcps whose TIME_WAIT has expired. List traversal
- * is done forwards from the head.
- * This walks all stack instances since
- * tcp_time_wait remains global across all stacks.
+ * Purge any tcp_t instances associated with this squeue which have expired
+ * from the TIME_WAIT state.
*/
-/* ARGSUSED */
void
tcp_time_wait_collector(void *arg)
{
tcp_t *tcp;
- int64_t now;
- mblk_t *mp;
- conn_t *connp;
- kmutex_t *lock;
- boolean_t removed;
- extern void (*cl_inet_disconnect)(netstackid_t, uint8_t, sa_family_t,
- uint8_t *, in_port_t, uint8_t *, in_port_t, void *);
+ int64_t now, active_schedule, new_schedule;
+ unsigned int idx;
squeue_t *sqp = (squeue_t *)arg;
- tcp_squeue_priv_t *tcp_time_wait =
+ tcp_squeue_priv_t *tsp =
*((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
- mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
- tcp_time_wait->tcp_time_wait_tid = 0;
-#ifdef DEBUG
- tcp_time_wait->tcp_time_wait_running = B_TRUE;
-#endif
+ mutex_enter(&tsp->tcp_time_wait_lock);
+
+ /*
+ * Because of timer scheduling complexity and the fact that the
+ * tcp_time_wait_lock is dropped during tcp_time_wait_purge, it is
+ * possible for multiple tcp_time_wait_collector threads to run against
+ * the same squeue. This flag is used to exclude other collectors from
+ * the squeue during execution.
+ */
+ if (tsp->tcp_time_wait_collector_active) {
+ mutex_exit(&tsp->tcp_time_wait_lock);
+ return;
+ }
+ tsp->tcp_time_wait_collector_active = B_TRUE;
- if (tcp_time_wait->tcp_free_list != NULL &&
- tcp_time_wait->tcp_free_list->tcp_in_free_list == B_TRUE) {
+ /*
+ * Purge the free list if necessary
+ */
+ if (tsp->tcp_free_list != NULL) {
TCP_G_STAT(tcp_freelist_cleanup);
- while ((tcp = tcp_time_wait->tcp_free_list) != NULL) {
- tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next;
+ while ((tcp = tsp->tcp_free_list) != NULL) {
+ tsp->tcp_free_list = tcp->tcp_time_wait_next;
tcp->tcp_time_wait_next = NULL;
- tcp_time_wait->tcp_free_list_cnt--;
+ tsp->tcp_free_list_cnt--;
ASSERT(tcp->tcp_tcps == NULL);
CONN_DEC_REF(tcp->tcp_connp);
}
- ASSERT(tcp_time_wait->tcp_free_list_cnt == 0);
+ ASSERT(tsp->tcp_free_list_cnt == 0);
}
/*
- * In order to reap time waits reliably, we should use a
- * source of time that is not adjustable by the user -- hence
- * the call to ddi_get_lbolt64().
+ * If there are no connections pending, clear timer-related state to be
+ * reinitialized by the next caller.
*/
- now = ddi_get_lbolt64();
- while ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL) {
+ if (tsp->tcp_time_wait_cnt == 0) {
+ tsp->tcp_time_wait_offset = 0;
+ tsp->tcp_time_wait_schedule = 0;
+ tsp->tcp_time_wait_tid = 0;
+ tsp->tcp_time_wait_collector_active = B_FALSE;
+ mutex_exit(&tsp->tcp_time_wait_lock);
+ return;
+ }
+
+ /*
+ * Grab the bucket which we were scheduled to cleanse.
+ */
+ active_schedule = tsp->tcp_time_wait_schedule;
+ idx = TW_BUCKET(active_schedule - 1);
+ now = ddi_get_lbolt64() - tsp->tcp_time_wait_offset;
+retry:
+ tcp = tsp->tcp_time_wait_bucket[idx];
+
+ while (tcp != NULL) {
/*
- * lbolt64 should not wrap around in practice... So we can
- * do a direct comparison.
+ * Since the bucket count is sized to prevent wrap-around
+		 * during typical operation and timers are scheduled to process
+ * buckets with only expired connections, there is only one
+ * reason to encounter a connection expiring in the future:
+ * The tcp_time_wait_collector thread has been so delayed in
+ * its processing that connections have wrapped around the
+ * timing wheel into this bucket.
+ *
+		 * In that case, the remaining entries in the bucket can be
+ * ignored since, being appended sequentially, they should all
+ * expire in the future.
*/
- if (now < tcp->tcp_time_wait_expire)
+ if (now < tcp->tcp_time_wait_expire) {
break;
+ }
- removed = tcp_time_wait_remove(tcp, tcp_time_wait);
- ASSERT(removed);
+ /*
+ * Pull the connection out of the bucket.
+ */
+ VERIFY(tcp_time_wait_remove(tcp, tsp));
- connp = tcp->tcp_connp;
- ASSERT(connp->conn_fanout != NULL);
- lock = &connp->conn_fanout->connf_lock;
/*
- * This is essentially a TW reclaim fast path optimization for
- * performance where the timewait collector checks under the
- * fanout lock (so that no one else can get access to the
- * conn_t) that the refcnt is 2 i.e. one for TCP and one for
- * the classifier hash list. If ref count is indeed 2, we can
- * just remove the conn under the fanout lock and avoid
- * cleaning up the conn under the squeue, provided that
- * clustering callbacks are not enabled. If clustering is
- * enabled, we need to make the clustering callback before
- * setting the CONDEMNED flag and after dropping all locks and
- * so we forego this optimization and fall back to the slow
- * path. Also please see the comments in tcp_closei_local
- * regarding the refcnt logic.
+ * Purge the connection.
*
- * Since we are holding the tcp_time_wait_lock, its better
- * not to block on the fanout_lock because other connections
- * can't add themselves to time_wait list. So we do a
- * tryenter instead of mutex_enter.
+ * While tcp_time_wait_lock will be temporarily dropped as part
+ * of the process, there is no risk of the timer being
+ * (re)scheduled while the collector is running since a value
+ * corresponding to the past is left in tcp_time_wait_schedule.
*/
- if (mutex_tryenter(lock)) {
- mutex_enter(&connp->conn_lock);
- if ((connp->conn_ref == 2) &&
- (cl_inet_disconnect == NULL)) {
- ipcl_hash_remove_locked(connp,
- connp->conn_fanout);
- /*
- * Set the CONDEMNED flag now itself so that
- * the refcnt cannot increase due to any
- * walker.
- */
- connp->conn_state_flags |= CONN_CONDEMNED;
- mutex_exit(lock);
- mutex_exit(&connp->conn_lock);
- if (tcp_time_wait->tcp_free_list_cnt <
- tcp_free_list_max_cnt) {
- /* Add to head of tcp_free_list */
- mutex_exit(
- &tcp_time_wait->tcp_time_wait_lock);
- tcp_cleanup(tcp);
- ASSERT(connp->conn_latch == NULL);
- ASSERT(connp->conn_policy == NULL);
- ASSERT(tcp->tcp_tcps == NULL);
- ASSERT(connp->conn_netstack == NULL);
-
- mutex_enter(
- &tcp_time_wait->tcp_time_wait_lock);
- tcp->tcp_time_wait_next =
- tcp_time_wait->tcp_free_list;
- tcp_time_wait->tcp_free_list = tcp;
- tcp_time_wait->tcp_free_list_cnt++;
- continue;
- } else {
- /* Do not add to tcp_free_list */
- mutex_exit(
- &tcp_time_wait->tcp_time_wait_lock);
- tcp_bind_hash_remove(tcp);
- ixa_cleanup(tcp->tcp_connp->conn_ixa);
- tcp_ipsec_cleanup(tcp);
- CONN_DEC_REF(tcp->tcp_connp);
- }
- } else {
- CONN_INC_REF_LOCKED(connp);
- mutex_exit(lock);
- mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
- mutex_exit(&connp->conn_lock);
- /*
- * We can reuse the closemp here since conn has
- * detached (otherwise we wouldn't even be in
- * time_wait list). tcp_closemp_used can safely
- * be changed without taking a lock as no other
- * thread can concurrently access it at this
- * point in the connection lifecycle.
- */
+ tcp_time_wait_purge(tcp, tsp);
- if (tcp->tcp_closemp.b_prev == NULL)
- tcp->tcp_closemp_used = B_TRUE;
- else
- cmn_err(CE_PANIC,
- "tcp_timewait_collector: "
- "concurrent use of tcp_closemp: "
- "connp %p tcp %p\n", (void *)connp,
- (void *)tcp);
-
- TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
- mp = &tcp->tcp_closemp;
- SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
- tcp_timewait_close, connp, NULL,
- SQ_FILL, SQTAG_TCP_TIMEWAIT);
- }
- } else {
- mutex_enter(&connp->conn_lock);
- CONN_INC_REF_LOCKED(connp);
- mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
- mutex_exit(&connp->conn_lock);
- /*
- * We can reuse the closemp here since conn has
- * detached (otherwise we wouldn't even be in
- * time_wait list). tcp_closemp_used can safely
- * be changed without taking a lock as no other
- * thread can concurrently access it at this
- * point in the connection lifecycle.
- */
+ /*
+ * Because tcp_time_wait_remove clears the tcp_time_wait_next
+ * field, the next item must be grabbed directly from the
+ * bucket itself.
+ */
+ tcp = tsp->tcp_time_wait_bucket[idx];
+ }
+
+ if (tsp->tcp_time_wait_cnt == 0) {
+ /*
+		 * There is no need for the collector to schedule a new
+ * timer if no pending items remain. The timer state can be
+ * cleared only if it was untouched while the collector dropped
+ * its locks during tcp_time_wait_purge.
+ */
+ if (tsp->tcp_time_wait_schedule == active_schedule) {
+ tsp->tcp_time_wait_offset = 0;
+ tsp->tcp_time_wait_schedule = 0;
+ tsp->tcp_time_wait_tid = 0;
+ }
+ tsp->tcp_time_wait_collector_active = B_FALSE;
+ mutex_exit(&tsp->tcp_time_wait_lock);
+ return;
+ } else {
+ unsigned int nidx;
- if (tcp->tcp_closemp.b_prev == NULL)
- tcp->tcp_closemp_used = B_TRUE;
- else
- cmn_err(CE_PANIC, "tcp_timewait_collector: "
- "concurrent use of tcp_closemp: "
- "connp %p tcp %p\n", (void *)connp,
- (void *)tcp);
-
- TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
- mp = &tcp->tcp_closemp;
- SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
- tcp_timewait_close, connp, NULL,
- SQ_FILL, SQTAG_TCP_TIMEWAIT);
+ /*
+ * Locate the next bucket containing entries.
+ */
+ new_schedule = active_schedule
+ + MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
+ nidx = TW_BUCKET_NEXT(idx);
+ while (tsp->tcp_time_wait_bucket[nidx] == NULL) {
+ if (nidx == idx) {
+ break;
+ }
+ nidx = TW_BUCKET_NEXT(nidx);
+ new_schedule += MSEC_TO_TICK(TCP_TIME_WAIT_DELAY);
}
- mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
+ ASSERT(tsp->tcp_time_wait_bucket[nidx] != NULL);
}
- if (tcp_time_wait->tcp_free_list != NULL)
- tcp_time_wait->tcp_free_list->tcp_in_free_list = B_TRUE;
+ /*
+	 * It is possible that the system is under such dire load that,
+	 * between scheduling the timer and processing TIME_WAIT entries,
+	 * execution overran the interval allocated to this bucket.
+ */
+ now = ddi_get_lbolt64() - tsp->tcp_time_wait_offset;
+ if (new_schedule <= now) {
+ /*
+ * Attempt to right the situation by immediately performing a
+ * purge on the next bucket. This loop will continue as needed
+ * until the schedule can be pushed out ahead of the clock.
+ */
+ idx = TW_BUCKET(new_schedule - 1);
+ goto retry;
+ }
/*
- * If the time wait list is not empty and there is no timer running,
- * restart it.
+ * Another thread may have snuck in to reschedule the timer while locks
+ * were dropped during tcp_time_wait_purge. Defer to the running timer
+ * if that is the case.
*/
- if ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL &&
- tcp_time_wait->tcp_time_wait_tid == 0) {
- hrtime_t firetime;
-
- /* shouldn't be necessary, but just in case */
- if (tcp->tcp_time_wait_expire < now)
- tcp->tcp_time_wait_expire = now;
-
- firetime = TICK_TO_NSEC(tcp->tcp_time_wait_expire - now);
- /* This ensures that we won't wake up too often. */
- firetime = MAX(TCP_TIME_WAIT_DELAY, firetime);
- tcp_time_wait->tcp_time_wait_tid =
- timeout_generic(CALLOUT_NORMAL, tcp_time_wait_collector,
- sqp, firetime, CALLOUT_TCP_RESOLUTION,
- CALLOUT_FLAG_ROUNDUP);
+ if (tsp->tcp_time_wait_schedule != active_schedule) {
+ tsp->tcp_time_wait_collector_active = B_FALSE;
+ mutex_exit(&tsp->tcp_time_wait_lock);
+ return;
}
-#ifdef DEBUG
- tcp_time_wait->tcp_time_wait_running = B_FALSE;
-#endif
- mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
+
+ /*
+ * Schedule the next timer.
+ */
+ tsp->tcp_time_wait_schedule = new_schedule;
+ tsp->tcp_time_wait_tid =
+ timeout_generic(CALLOUT_NORMAL,
+ tcp_time_wait_collector, sqp,
+ TICK_TO_NSEC(new_schedule - now),
+ CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP);
+ tsp->tcp_time_wait_collector_active = B_FALSE;
+ mutex_exit(&tsp->tcp_time_wait_lock);
}
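The TW_BUCKET(active_schedule - 1) indexing near the top of the collector pairs with the rounding in tcp_time_wait_append: active_schedule always sits on a bucket boundary, so backing up one tick selects the bucket whose interval just ended. Continuing the hypothetical 500-ticks-per-bucket, 121-bucket numbers from the earlier sketch (TW_BUCKET itself is defined outside the hunks shown here; see the assumed definition after tcp_squeue_priv_t below):

	/*
	 * A timer scheduled for tick 7500 reaps the bucket covering
	 * [7000, 7500), so the index is taken one tick shy of the boundary.
	 */
	idx = TW_BUCKET(7500 - 1);	/* 7499 / 500 = 14; 14 % 121 = 14 */

Every connection whose tcp_time_wait_expire fell in [7000, 7500) was indexed into bucket 14 and scheduled against the 7500 boundary, so all of its entries have expired by the time this timer runs.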
/*
diff --git a/usr/src/uts/common/inet/tcp/tcp_tunables.c b/usr/src/uts/common/inet/tcp/tcp_tunables.c
index be75f1f663..f4d6c71914 100644
--- a/usr/src/uts/common/inet/tcp/tcp_tunables.c
+++ b/usr/src/uts/common/inet/tcp/tcp_tunables.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, Joyent Inc. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
* Copyright 2013 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
*/
@@ -249,7 +249,7 @@ mod_prop_info_t tcp_propinfo_tbl[] = {
/* tunable - 0 */
{ "_time_wait_interval", MOD_PROTO_TCP,
mod_set_uint32, mod_get_uint32,
- {1*SECONDS, 10*MINUTES, 1*MINUTES}, {1*MINUTES} },
+ {1*SECONDS, TCP_TIME_WAIT_MAX, 1*MINUTES}, {1*MINUTES} },
{ "_conn_req_max_q", MOD_PROTO_TCP,
mod_set_uint32, mod_get_uint32,
@@ -307,7 +307,7 @@ mod_prop_info_t tcp_propinfo_tbl[] = {
{ "_keepalive_interval", MOD_PROTO_TCP,
mod_set_uint32, mod_get_uint32,
- {10*SECONDS, 10*DAYS, 2*HOURS}, {2*HOURS} },
+ {1*SECONDS, 10*DAYS, 2*HOURS}, {2*HOURS} },
{ "_maxpsz_multiplier", MOD_PROTO_TCP,
mod_set_uint32, mod_get_uint32,
diff --git a/usr/src/uts/common/inet/tcp_impl.h b/usr/src/uts/common/inet/tcp_impl.h
index 0f0f915a2b..cb83b91fad 100644
--- a/usr/src/uts/common/inet/tcp_impl.h
+++ b/usr/src/uts/common/inet/tcp_impl.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, Joyent Inc. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
* Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved.
* Copyright (c) 2013, 2014 by Delphix. All rights reserved.
*/
@@ -61,9 +61,9 @@ extern sock_downcalls_t sock_tcp_downcalls;
* by setting it to 0.
*/
#define TCP_XMIT_LOWATER 4096
-#define TCP_XMIT_HIWATER 49152
+#define TCP_XMIT_HIWATER 128000
#define TCP_RECV_LOWATER 2048
-#define TCP_RECV_HIWATER 128000
+#define TCP_RECV_HIWATER 1048576
/*
* Bind hash list size and hash function. It has to be a power of 2 for
@@ -105,7 +105,7 @@ extern sock_downcalls_t sock_tcp_downcalls;
*/
#define TCP_IS_DETACHED(tcp) ((tcp)->tcp_detached)
-/* TCP timers related data strucutres. Refer to tcp_timers.c. */
+/* TCP timers related data structures. Refer to tcp_timers.c. */
typedef struct tcp_timer_s {
conn_t *connp;
void (*tcpt_proc)(void *);
@@ -132,48 +132,79 @@ extern kmem_cache_t *tcp_timercache;
(tcp)->tcp_timer_tid = TCP_TIMER((tcp), tcp_timer, (intvl)); \
}
+
+/*
+ * Maximum TIME_WAIT timeout. It is defined here (instead of tcp_tunables.c)
+ * so that other parameters can be derived from it.
+ */
+#define TCP_TIME_WAIT_MAX (10 * MINUTES)
+
+/*
+ * TCP_TIME_WAIT_DELAY governs how often the time_wait_collector runs.
+ * Running it every 5 seconds seems to yield a reasonable balance between
+ * cleanup liveliness and system load.
+ */
+#define TCP_TIME_WAIT_DELAY (5 * SECONDS)
+
+#define TCP_TIME_WAIT_BUCKETS ((TCP_TIME_WAIT_MAX / TCP_TIME_WAIT_DELAY) + 1)
+
/*
* For scalability, we must not run a timer for every TCP connection
* in TIME_WAIT state. To see why, consider (for time wait interval of
* 1 minutes):
* 10,000 connections/sec * 60 seconds/time wait = 600,000 active conn's
*
- * This list is ordered by time, so you need only delete from the head
- * until you get to entries which aren't old enough to delete yet.
- * The list consists of only the detached TIME_WAIT connections.
+ * Since TIME_WAIT expiration occurs on a per-squeue basis, handling
+ * connections from all netstacks on the system, a simple queue is inadequate
+ * for pending entries. This is because tcp_time_wait_interval may differ
+ * between connections, causing tail insertion to violate expiration order.
+ *
+ * Instead of performing expensive sorting or unnecessary list traversal to
+ * counteract interval variance between netstacks, a timing wheel structure is
+ * used. The duration covered by each bucket in the wheel is determined by the
+ * TCP_TIME_WAIT_DELAY (5 seconds). The number of buckets in the wheel is
+ * determined by dividing the maximum TIME_WAIT interval (10 minutes) by
+ * TCP_TIME_WAIT_DELAY, with one added bucket for rollover protection
+ * (yielding 121 buckets with the current parameters). When items are inserted
+ * into the set of buckets, they are indexed by using their expiration time
+ * divided by the bucket size, modulo the number of buckets. This means that
+ * when each bucket is processed, all items within should have expired within
+ * the last TCP_TIME_WAIT_DELAY interval.
+ *
+ * Since bucket timer schedules are rounded to the nearest TCP_TIME_WAIT_DELAY
+ * interval to ensure all connections in the pending bucket will be expired, a
+ * per-squeue offset is used when doing TIME_WAIT scheduling. This offset is
+ * between 0 and TCP_TIME_WAIT_DELAY and is designed to avoid scheduling
+ * all of the tcp_time_wait_collector threads to run in lock-step. The offset
+ * is fixed while there are any connections present in the buckets.
*
* When a tcp_t enters TIME_WAIT state, a timer is started (timeout is
* tcps_time_wait_interval). When the tcp_t is detached (upper layer closes
- * the end point), it is moved to the time wait list and another timer is
- * started (expiry time is set at tcp_time_wait_expire, which is
- * also calculated using tcps_time_wait_interval). This means that the
- * TIME_WAIT state can be extended (up to doubled) if the tcp_t doesn't
- * become detached for a long time.
+ * the end point), it is scheduled to be cleaned up by the squeue-driving
+ * tcp_time_wait_collector (also using tcps_time_wait_interval). This means
+ * that the TIME_WAIT state can be extended (up to doubled) if the tcp_t
+ * doesn't become detached for a long time.
*
* The list manipulations (including tcp_time_wait_next/prev)
* are protected by the tcp_time_wait_lock. The content of the
* detached TIME_WAIT connections is protected by the normal perimeters.
*
- * This list is per squeue and squeues are shared across the tcp_stack_t's.
- * Things on tcp_time_wait_head remain associated with the tcp_stack_t
- * and conn_netstack.
- * The tcp_t's that are added to tcp_free_list are disassociated and
- * have NULL tcp_tcps and conn_netstack pointers.
+ * These connection lists are per squeue and squeues are shared across the
+ * tcp_stack_t instances. Things in a tcp_time_wait_bucket remain associated
+ * with the tcp_stack_t and conn_netstack. Any tcp_t connections stored in the
+ * tcp_free_list are disassociated and have NULL tcp_tcps and conn_netstack
+ * pointers.
*/
typedef struct tcp_squeue_priv_s {
kmutex_t tcp_time_wait_lock;
+ boolean_t tcp_time_wait_collector_active;
callout_id_t tcp_time_wait_tid;
- tcp_t *tcp_time_wait_head;
- tcp_t *tcp_time_wait_tail;
+ uint64_t tcp_time_wait_cnt;
+ int64_t tcp_time_wait_schedule;
+ int64_t tcp_time_wait_offset;
+ tcp_t *tcp_time_wait_bucket[TCP_TIME_WAIT_BUCKETS];
tcp_t *tcp_free_list;
uint_t tcp_free_list_cnt;
-#ifdef DEBUG
- /*
- * For debugging purpose, true when tcp_time_wait_collector() is
- * running.
- */
- boolean_t tcp_time_wait_running;
-#endif
} tcp_squeue_priv_t;
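The TW_BUCKET and TW_BUCKET_NEXT macros used throughout tcp_time_wait.c are defined outside the hunks shown in this diff. A minimal sketch consistent with the indexing described above (expiration time divided by the bucket span, modulo the bucket count) would be:

	#define	TW_BUCKET(time)						\
		(((time) / MSEC_TO_TICK(TCP_TIME_WAIT_DELAY)) %		\
		TCP_TIME_WAIT_BUCKETS)

	#define	TW_BUCKET_NEXT(bucket)					\
		(((bucket) + 1) % TCP_TIME_WAIT_BUCKETS)

With TCP_TIME_WAIT_MAX at 10 minutes and TCP_TIME_WAIT_DELAY at 5 seconds, TCP_TIME_WAIT_BUCKETS evaluates to (600 / 5) + 1 = 121, the figure quoted in the comment above.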
/*
@@ -375,6 +406,22 @@ typedef struct tcp_listen_cnt_s {
uint32_t tlc_drop;
} tcp_listen_cnt_t;
+/*
+ * Track tcp_t entities bound to the same port/address tuple via SO_REUSEPORT.
+ * - tcprg_lock: Protects the other fields
+ * - tcprg_size: Allocated size (in entries) of tcprg_members array
+ * - tcprg_count: Count of occupied tcprg_members slots
+ * - tcprg_active: Count of members which still have SO_REUSEPORT set
+ * - tcprg_members: Connections associated with address/port group
+ */
+typedef struct tcp_rg_s {
+ kmutex_t tcprg_lock;
+ unsigned int tcprg_size;
+ unsigned int tcprg_count;
+ unsigned int tcprg_active;
+ tcp_t **tcprg_members;
+} tcp_rg_t;
+
#define TCP_TLC_REPORT_INTERVAL (30 * MINUTES)
#define TCP_DECR_LISTEN_CNT(tcp) \
@@ -618,6 +665,10 @@ extern in_port_t tcp_bindi(tcp_t *, in_port_t, const in6_addr_t *,
int, boolean_t, boolean_t, boolean_t);
extern in_port_t tcp_update_next_port(in_port_t, const tcp_t *,
boolean_t);
+extern tcp_rg_t *tcp_rg_init(tcp_t *);
+extern boolean_t tcp_rg_remove(tcp_rg_t *, tcp_t *);
+extern void tcp_rg_destroy(tcp_rg_t *);
+extern void tcp_rg_setactive(tcp_rg_t *, boolean_t);
/*
* Fusion related functions in tcp_fusion.c.
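The tcp_rg_* implementations live in tcp_bind.c (per the diffstat) and are outside the hunks shown here. As a rough illustration of the bookkeeping the structure implies, a hypothetical member-insertion helper (not the committed code) might look like:

	/*
	 * Hypothetical sketch: add a member under tcprg_lock, doubling the
	 * array when full. Invariants follow from the field comments:
	 * tcprg_active <= tcprg_count <= tcprg_size.
	 */
	static boolean_t
	tcp_rg_add_sketch(tcp_rg_t *rg, tcp_t *tcp)
	{
		mutex_enter(&rg->tcprg_lock);
		if (rg->tcprg_count == rg->tcprg_size) {
			unsigned int nsize = rg->tcprg_size * 2;
			tcp_t **nmem = kmem_zalloc(nsize * sizeof (tcp_t *),
			    KM_NOSLEEP);

			if (nmem == NULL) {
				mutex_exit(&rg->tcprg_lock);
				return (B_FALSE);
			}
			bcopy(rg->tcprg_members, nmem,
			    rg->tcprg_count * sizeof (tcp_t *));
			kmem_free(rg->tcprg_members,
			    rg->tcprg_size * sizeof (tcp_t *));
			rg->tcprg_members = nmem;
			rg->tcprg_size = nsize;
		}
		rg->tcprg_members[rg->tcprg_count++] = tcp;
		rg->tcprg_active++;	/* new members have SO_REUSEPORT set */
		mutex_exit(&rg->tcprg_lock);
		return (B_TRUE);
	}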
diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c
index 5a15aea4de..a88bac932c 100644
--- a/usr/src/uts/common/inet/udp/udp.c
+++ b/usr/src/uts/common/inet/udp/udp.c
@@ -22,6 +22,7 @@
* Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2013 Nexenta Systems, Inc. All rights reserved.
* Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -76,7 +77,8 @@
#include <inet/ipclassifier.h>
#include <sys/squeue_impl.h>
#include <inet/ipnet.h>
-#include <sys/ethernet.h>
+#include <sys/vxlan.h>
+#include <inet/inet_hash.h>
#include <sys/tsol/label.h>
#include <sys/tsol/tnet.h>
@@ -346,6 +348,89 @@ void (*cl_inet_unbind)(netstackid_t stack_id, uint8_t protocol,
typedef union T_primitives *t_primp_t;
/*
+ * Various protocols that encapsulate UDP have no real use for the source port.
+ * Instead, they want to vary the source port to provide better equal-cost
+ * multipathing and to benefit other systems that use fanout. Consider
+ * something like VXLAN: if you send multiple different streams to a single
+ * host without varying the source port, then the tuple of (SRC IP, DST IP,
+ * SRC Port, DST Port) will always be the same.
+ *
+ * Here, we return a port to hash this to, if we know how to hash it. If for
+ * some reason we can't perform an L4 hash, then we just return the default
+ * value, usually the default port. After we determine the hash we transform it
+ * so that it's in the range of [min, max].
+ *
+ * We'd like to avoid a pull up for the sake of performing the hash. If the
+ * first mblk_t doesn't have the full protocol header, then we just send it to
+ * the default. If for some reason we have an encapsulated packet that has its
+ * protocol header in different parts of an mblk_t, then we'll go with the
+ * default port. This means that if a driver isn't consistent about how it
+ * generates the frames for a given flow, it will not always be consistently
+ * hashed. That should be an uncommon event.
+ */
+uint16_t
+udp_srcport_hash(mblk_t *mp, int type, uint16_t min, uint16_t max,
+ uint16_t def)
+{
+ size_t szused = 0;
+ struct ether_header *ether;
+ struct ether_vlan_header *vether;
+ ip6_t *ip6h;
+ ipha_t *ipha;
+ uint16_t sap;
+ uint64_t hash;
+ uint32_t mod;
+
+ ASSERT(min <= max);
+
+ if (type != UDP_HASH_VXLAN)
+ return (def);
+
+ if (!IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t)))
+ return (def);
+
+ /*
+	 * The following logic is VXLAN-specific in how it gets at the header;
+	 * if we have other formats, e.g. GENEVE, then we should ignore this.
+ *
+ * The kernel overlay device often puts a first mblk_t for the data
+ * which is just the encap. If so, then we're going to use that and try
+ * to avoid a pull up.
+ */
+ if (MBLKL(mp) == VXLAN_HDR_LEN) {
+ if (mp->b_cont == NULL)
+ return (def);
+ mp = mp->b_cont;
+ ether = (struct ether_header *)mp->b_rptr;
+ } else if (MBLKL(mp) < VXLAN_HDR_LEN) {
+ return (def);
+ } else {
+ szused = VXLAN_HDR_LEN;
+ ether = (struct ether_header *)((uintptr_t)mp->b_rptr + szused);
+ }
+
+ /* Can we hold a MAC header? */
+	if (MBLKL(mp) - szused < sizeof (struct ether_header))
+ return (def);
+
+ /*
+ * We need to lie about the starting offset into the message block for
+ * convenience. Undo it at the end. We know that inet_pkt_hash() won't
+ * modify the mblk_t.
+ */
+ mp->b_rptr += szused;
+ hash = inet_pkt_hash(DL_ETHER, mp, INET_PKT_HASH_L2 |
+ INET_PKT_HASH_L3 | INET_PKT_HASH_L4);
+ mp->b_rptr -= szused;
+
+ if (hash == 0)
+ return (def);
+
+ mod = max - min + 1;
+ return ((hash % mod) + min);
+}
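For callers, the effect of udp_srcport_hash() is controlled through the UDP_SRCPORT_HASH socket option handled below in udp_opt_get()/udp_do_opt_set(). A hedged userland sketch, assuming the UDP_SRCPORT_HASH and UDP_HASH_VXLAN constants are exported through <netinet/udp.h>; the set path checks secpolicy_ip_config(), so it is expected to fail for unprivileged callers:

	#include <stdio.h>
	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <netinet/udp.h>	/* assumed home of UDP_SRCPORT_HASH */

	int
	main(void)
	{
		int fd = socket(AF_INET, SOCK_DGRAM, 0);
		int opt = UDP_HASH_VXLAN;

		/* Privileged operation; unprivileged callers get an error. */
		if (setsockopt(fd, IPPROTO_UDP, UDP_SRCPORT_HASH, &opt,
		    sizeof (opt)) != 0)
			perror("UDP_SRCPORT_HASH");
		return (0);
	}

Once enabled, udp_prepend_hdr() and udp_prepend_header_template() substitute the hash result for conn_lport, constrained to [IPPORT_DYNAMIC_MIN, IPPORT_DYNAMIC_MAX] by the mod/min arithmetic above, and fall back to the bound port whenever the encapsulated headers cannot be walked.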
+
+/*
* Return the next anonymous port in the privileged port range for
* bind checking.
*
@@ -1583,6 +1668,16 @@ udp_opt_get(conn_t *connp, t_scalar_t level, t_scalar_t name,
*i1 = udp->udp_rcvhdr ? 1 : 0;
mutex_exit(&connp->conn_lock);
return (sizeof (int));
+ case UDP_SRCPORT_HASH:
+ mutex_enter(&connp->conn_lock);
+ *i1 = udp->udp_vxlanhash;
+ mutex_exit(&connp->conn_lock);
+ return (sizeof (int));
+ case UDP_SND_TO_CONNECTED:
+ mutex_enter(&connp->conn_lock);
+ *i1 = udp->udp_snd_to_conn ? 1 : 0;
+ mutex_exit(&connp->conn_lock);
+ return (sizeof (int));
}
}
mutex_enter(&connp->conn_lock);
@@ -1718,6 +1813,31 @@ udp_do_opt_set(conn_opt_arg_t *coa, int level, int name,
udp->udp_rcvhdr = onoff;
mutex_exit(&connp->conn_lock);
return (0);
+ case UDP_SRCPORT_HASH:
+ /*
+ * This should have already been verified, but double
+ * check.
+ */
+ if ((error = secpolicy_ip_config(cr, B_FALSE)) != 0) {
+ return (error);
+ }
+
+ /* First see if the val is something we understand */
+ if (*i1 != UDP_HASH_DISABLE && *i1 != UDP_HASH_VXLAN)
+ return (EINVAL);
+
+ if (!checkonly) {
+ mutex_enter(&connp->conn_lock);
+ udp->udp_vxlanhash = *i1;
+ mutex_exit(&connp->conn_lock);
+ }
+ /* Fully handled this option. */
+ return (0);
+ case UDP_SND_TO_CONNECTED:
+ mutex_enter(&connp->conn_lock);
+ udp->udp_snd_to_conn = onoff;
+ mutex_exit(&connp->conn_lock);
+ return (0);
}
break;
}
@@ -2001,13 +2121,25 @@ udp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp,
uint32_t cksum;
udp_t *udp = connp->conn_udp;
boolean_t insert_spi = udp->udp_nat_t_endpoint;
+ boolean_t hash_srcport = udp->udp_vxlanhash;
uint_t ulp_hdr_len;
+ uint16_t srcport;
data_len = msgdsize(data_mp);
ulp_hdr_len = UDPH_SIZE;
if (insert_spi)
ulp_hdr_len += sizeof (uint32_t);
+ /*
+ * If we have source port hashing going on, determine the hash before
+ * we modify the mblk_t.
+ */
+ if (hash_srcport == B_TRUE) {
+ srcport = udp_srcport_hash(mp, UDP_HASH_VXLAN,
+ IPPORT_DYNAMIC_MIN, IPPORT_DYNAMIC_MAX,
+ ntohs(connp->conn_lport));
+ }
+
mp = conn_prepend_hdr(ixa, ipp, v6src, v6dst, IPPROTO_UDP, flowinfo,
ulp_hdr_len, data_mp, data_len, us->us_wroff_extra, &cksum, errorp);
if (mp == NULL) {
@@ -2019,7 +2151,11 @@ udp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp,
ixa->ixa_pktlen = data_len + ixa->ixa_ip_hdr_length;
udpha = (udpha_t *)(mp->b_rptr + ixa->ixa_ip_hdr_length);
- udpha->uha_src_port = connp->conn_lport;
+ if (hash_srcport == B_TRUE) {
+ udpha->uha_src_port = htons(srcport);
+ } else {
+ udpha->uha_src_port = connp->conn_lport;
+ }
udpha->uha_dst_port = dstport;
udpha->uha_checksum = 0;
udpha->uha_length = htons(data_len);
@@ -3194,6 +3330,7 @@ udp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp,
udp_t *udp = connp->conn_udp;
udp_stack_t *us = udp->udp_us;
boolean_t insert_spi = udp->udp_nat_t_endpoint;
+ boolean_t hash_srcport = udp->udp_vxlanhash;
uint_t pktlen;
uint_t alloclen;
uint_t copylen;
@@ -3202,10 +3339,21 @@ udp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp,
udpha_t *udpha;
uint32_t cksum;
ip_pkt_t *ipp;
+ uint16_t srcport;
ASSERT(MUTEX_HELD(&connp->conn_lock));
/*
+ * If we have source port hashing going on, determine the hash before
+ * we modify the mblk_t.
+ */
+ if (hash_srcport == B_TRUE) {
+ srcport = udp_srcport_hash(mp, UDP_HASH_VXLAN,
+ IPPORT_DYNAMIC_MIN, IPPORT_DYNAMIC_MAX,
+ ntohs(connp->conn_lport));
+ }
+
+ /*
* Copy the header template and leave space for an SPI
*/
copylen = connp->conn_ht_iphc_len;
@@ -3303,6 +3451,9 @@ udp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp,
*((uint32_t *)(udpha + 1)) = 0;
udpha->uha_dst_port = dstport;
+ if (hash_srcport == B_TRUE)
+ udpha->uha_src_port = htons(srcport);
+
return (mp);
}
@@ -5947,10 +6098,18 @@ udp_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
else
return (error);
}
- if (udp->udp_state == TS_DATA_XFER) {
+
+ /*
+ * Check if we're allowed to send to a connection on which we've
+ * already called 'connect'. The posix spec. allows both behaviors but
+ * historically we've returned an error if already connected. The
+ * client can allow this via a sockopt.
+ */
+ if (udp->udp_state == TS_DATA_XFER && !udp->udp_snd_to_conn) {
UDPS_BUMP_MIB(us, udpOutErrors);
return (EISCONN);
}
+
error = proto_verify_ip_addr(connp->conn_family,
(struct sockaddr *)msg->msg_name, msg->msg_namelen);
if (error != 0) {
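A hedged sketch of the behavioral change in udp_send(): with UDP_SND_TO_CONNECTED set, sendto() on a connected socket no longer draws EISCONN (the constant is assumed to be exported through <netinet/udp.h>, and the destination must still pass proto_verify_ip_addr()):

	#include <stdio.h>
	#include <string.h>
	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <netinet/udp.h>	/* assumed home of UDP_SND_TO_CONNECTED */

	int
	main(void)
	{
		int fd = socket(AF_INET, SOCK_DGRAM, 0);
		int on = 1;
		struct sockaddr_in peer;

		(void) memset(&peer, 0, sizeof (peer));
		peer.sin_family = AF_INET;
		peer.sin_port = htons(9000);
		peer.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
		(void) connect(fd, (struct sockaddr *)&peer, sizeof (peer));

		/* Historically this sendto() would fail with EISCONN. */
		(void) setsockopt(fd, IPPROTO_UDP, UDP_SND_TO_CONNECTED, &on,
		    sizeof (on));
		if (sendto(fd, "x", 1, 0, (struct sockaddr *)&peer,
		    sizeof (peer)) < 0)
			perror("sendto");
		return (0);
	}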
diff --git a/usr/src/uts/common/inet/udp/udp_opt_data.c b/usr/src/uts/common/inet/udp/udp_opt_data.c
index c279bb4a21..847e2cdde6 100644
--- a/usr/src/uts/common/inet/udp/udp_opt_data.c
+++ b/usr/src/uts/common/inet/udp/udp_opt_data.c
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2015, Joyent, Inc.
*/
#include <sys/types.h>
@@ -292,6 +293,9 @@ opdes_t udp_opt_arr[] = {
},
{ UDP_NAT_T_ENDPOINT, IPPROTO_UDP, OA_RW, OA_RW, OP_PRIVPORT, 0, sizeof (int),
0 },
+{ UDP_SRCPORT_HASH, IPPROTO_UDP, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), 0 },
+{ UDP_SND_TO_CONNECTED, IPPROTO_UDP, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int),
+ 0 }
};
/*
diff --git a/usr/src/uts/common/inet/udp_impl.h b/usr/src/uts/common/inet/udp_impl.h
index 6a31ce5c22..ebba10c0f7 100644
--- a/usr/src/uts/common/inet/udp_impl.h
+++ b/usr/src/uts/common/inet/udp_impl.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
*/
#ifndef _UDP_IMPL_H
@@ -178,8 +179,12 @@ typedef struct udp_s {
udp_issocket : 1, /* socket mode; sockfs is on top */
udp_nat_t_endpoint : 1, /* UDP_NAT_T_ENDPOINT option */
udp_rcvhdr : 1, /* UDP_RCVHDR option */
+ udp_vxlanhash: 1, /* UDP_SRCPORT_HASH option */
+ /* Because there's only VXLAN, cheat */
+ /* and only use a single bit */
+ udp_snd_to_conn: 1, /* UDP_SND_TO_CONNECTED option */
- udp_pad_to_bit_31 : 29;
+ udp_pad_to_bit_31 : 27;
/* Following 2 fields protected by the uf_lock */
struct udp_s *udp_bind_hash; /* Bind hash chain */