Diffstat (limited to 'usr/src/uts/common/inet')
26 files changed, 1440 insertions, 470 deletions
diff --git a/usr/src/uts/common/inet/inet_hash.h b/usr/src/uts/common/inet/inet_hash.h new file mode 100644 index 0000000000..a790a797d1 --- /dev/null +++ b/usr/src/uts/common/inet/inet_hash.h @@ -0,0 +1,37 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#ifndef _INET_INET_HASH_H +#define _INET_INET_HASH_H + +/* + * Common packet hashing routines shared across MAC, UDP, and others. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#define INET_PKT_HASH_L2 0x01 +#define INET_PKT_HASH_L3 0x02 +#define INET_PKT_HASH_L4 0x04 + +extern uint64_t inet_pkt_hash(uint_t, mblk_t *, uint8_t); + +#ifdef __cplusplus +} +#endif + +#endif /* _INET_INET_HASH_H */ diff --git a/usr/src/uts/common/inet/ip/conn_opt.c b/usr/src/uts/common/inet/ip/conn_opt.c index bcbc1c4949..b4bff4d7b4 100644 --- a/usr/src/uts/common/inet/ip/conn_opt.c +++ b/usr/src/uts/common/inet/ip/conn_opt.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -619,6 +620,9 @@ conn_opt_get(conn_opt_arg_t *coa, t_scalar_t level, t_scalar_t name, case SO_REUSEADDR: *i1 = connp->conn_reuseaddr ? SO_REUSEADDR : 0; break; /* goto sizeof (int) option return */ + case SO_REUSEPORT: + *i1 = connp->conn_reuseport; + break; /* goto sizeof (int) option return */ case SO_TYPE: *i1 = connp->conn_so_type; break; /* goto sizeof (int) option return */ @@ -1186,8 +1190,24 @@ conn_opt_set_ip(conn_opt_arg_t *coa, t_scalar_t name, uint_t inlen, ip_stack_t *ipst = connp->conn_netstack->netstack_ip; int error; - if (connp->conn_family != AF_INET) + if (connp->conn_family == AF_INET6 && + connp->conn_ipversion == IPV4_VERSION) { + /* + * Allow certain IPv4 options to be set on an AF_INET6 socket + * if the connection is still IPv4. + */ + switch (name) { + case IP_TOS: + case T_IP_TOS: + case IP_TTL: + case IP_DONTFRAG: + break; + default: + return (EINVAL); + } + } else if (connp->conn_family != AF_INET) { return (EINVAL); + } switch (name) { case IP_TTL: diff --git a/usr/src/uts/common/inet/ip/ip.c b/usr/src/uts/common/inet/ip/ip.c index f006e83a1f..73081b9c1c 100644 --- a/usr/src/uts/common/inet/ip/ip.c +++ b/usr/src/uts/common/inet/ip/ip.c @@ -12577,6 +12577,7 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) struct iocblk *iocp = (struct iocblk *)mp->b_rptr; ip_ioctl_cmd_t *ipip = arg; ip_extract_func_t *extract_funcp; + ill_t *ill; cmd_info_t ci; int err; boolean_t entered_ipsq = B_FALSE; @@ -12697,6 +12698,13 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) ipsq_current_start(ipsq, ci.ci_ipif, ipip->ipi_cmd); /* + * We need to cache the ill_t that we're going to use as the argument + * to the ipif-ioctl DTrace probe (below) because the ci_ipif can be + * blown away by calling ipi_func. + */ + ill = ci.ci_ipif == NULL ? NULL : ci.ci_ipif->ipif_ill; + + /* * A return value of EINPROGRESS means the ioctl is * either queued and waiting for some reason or has * already completed. 
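The inet_hash.h header introduced at the top of this diff only declares the shared hashing interface; no caller appears in this section. As a rough, hypothetical sketch of how a consumer might combine the flag bits, a dispatcher wanting per-flow fanout could hash on the L3 and L4 headers together. The dispatch_pkt() and worker_enqueue() names, the worker count, and the meaning of the first inet_pkt_hash() argument are all invented for the illustration and are not part of this change.

#include <sys/types.h>
#include <sys/stream.h>
#include <inet/inet_hash.h>

/* Hypothetical consumer queue; assumed to exist for this sketch only. */
extern void worker_enqueue(uint64_t, mblk_t *);

/*
 * Illustrative only: spread inbound mblks across nworkers queues so that
 * packets belonging to one flow always land on the same worker.
 */
static void
dispatch_pkt(uint_t hint, mblk_t *mp, uint_t nworkers)
{
	uint64_t hash;

	/* Hash on the L3 and L4 headers; the first argument is passed through. */
	hash = inet_pkt_hash(hint, mp, INET_PKT_HASH_L3 | INET_PKT_HASH_L4);
	worker_enqueue(hash % nworkers, mp);
}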
@@ -12704,9 +12712,7 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) err = (*ipip->ipi_func)(ci.ci_ipif, ci.ci_sin, q, mp, ipip, ci.ci_lifr); DTRACE_PROBE4(ipif__ioctl, char *, "ip_process_ioctl finish WR", - int, ipip->ipi_cmd, - ill_t *, ci.ci_ipif == NULL ? NULL : ci.ci_ipif->ipif_ill, - ipif_t *, ci.ci_ipif); + int, ipip->ipi_cmd, ill_t *, ill, ipif_t *, ci.ci_ipif); ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq); if (entered_ipsq) diff --git a/usr/src/uts/common/inet/ip/ip_attr.c b/usr/src/uts/common/inet/ip/ip_attr.c index 85ee142dfc..c350d67c2d 100644 --- a/usr/src/uts/common/inet/ip/ip_attr.c +++ b/usr/src/uts/common/inet/ip/ip_attr.c @@ -909,6 +909,11 @@ ixa_safe_copy(ip_xmit_attr_t *src, ip_xmit_attr_t *ixa) */ if (ixa->ixa_free_flags & IXA_FREE_CRED) crhold(ixa->ixa_cred); + + /* + * There is no cleanup in progress on this new copy. + */ + ixa->ixa_tcpcleanup = IXATC_IDLE; } /* diff --git a/usr/src/uts/common/inet/ip/ip_squeue.c b/usr/src/uts/common/inet/ip/ip_squeue.c index 33a2fa5935..dedb4dadcc 100644 --- a/usr/src/uts/common/inet/ip/ip_squeue.c +++ b/usr/src/uts/common/inet/ip/ip_squeue.c @@ -163,7 +163,7 @@ ip_squeue_create(pri_t pri) { squeue_t *sqp; - sqp = squeue_create(ip_squeue_worker_wait, pri); + sqp = squeue_create(ip_squeue_worker_wait, pri, B_TRUE); ASSERT(sqp != NULL); if (ip_squeue_create_callback != NULL) ip_squeue_create_callback(sqp); diff --git a/usr/src/uts/common/inet/ip/ipclassifier.c b/usr/src/uts/common/inet/ip/ipclassifier.c index bc2173ff24..3a12e58c3a 100644 --- a/usr/src/uts/common/inet/ip/ipclassifier.c +++ b/usr/src/uts/common/inet/ip/ipclassifier.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. */ /* @@ -868,67 +869,91 @@ ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp) mutex_exit(&(connfp)->connf_lock); \ } -#define IPCL_HASH_INSERT_BOUND(connfp, connp) { \ - conn_t *pconnp = NULL, *nconnp; \ - IPCL_HASH_REMOVE((connp)); \ - mutex_enter(&(connfp)->connf_lock); \ - nconnp = (connfp)->connf_head; \ - while (nconnp != NULL && \ - !_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6)) { \ - pconnp = nconnp; \ - nconnp = nconnp->conn_next; \ - } \ - if (pconnp != NULL) { \ - pconnp->conn_next = (connp); \ - (connp)->conn_prev = pconnp; \ - } else { \ - (connfp)->connf_head = (connp); \ - } \ - if (nconnp != NULL) { \ - (connp)->conn_next = nconnp; \ - nconnp->conn_prev = (connp); \ - } \ - (connp)->conn_fanout = (connfp); \ - (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ - IPCL_BOUND; \ - CONN_INC_REF(connp); \ - mutex_exit(&(connfp)->connf_lock); \ -} +/* + * When inserting bound or wildcard entries into the hash, ordering rules are + * used to facilitate timely and correct lookups. The order is as follows: + * 1. Entries bound to a specific address + * 2. Entries bound to INADDR_ANY + * 3. Entries bound to ADDR_UNSPECIFIED + * Entries in a category which share conn_lport (such as those using + * SO_REUSEPORT) will be ordered such that the newest inserted is first. 
+ */ -#define IPCL_HASH_INSERT_WILDCARD(connfp, connp) { \ - conn_t **list, *prev, *next; \ - boolean_t isv4mapped = \ - IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6); \ - IPCL_HASH_REMOVE((connp)); \ - mutex_enter(&(connfp)->connf_lock); \ - list = &(connfp)->connf_head; \ - prev = NULL; \ - while ((next = *list) != NULL) { \ - if (isv4mapped && \ - IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) && \ - connp->conn_zoneid == next->conn_zoneid) { \ - (connp)->conn_next = next; \ - if (prev != NULL) \ - prev = next->conn_prev; \ - next->conn_prev = (connp); \ - break; \ - } \ - list = &next->conn_next; \ - prev = next; \ - } \ - (connp)->conn_prev = prev; \ - *list = (connp); \ - (connp)->conn_fanout = (connfp); \ - (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ - IPCL_BOUND; \ - CONN_INC_REF((connp)); \ - mutex_exit(&(connfp)->connf_lock); \ +void +ipcl_hash_insert_bound(connf_t *connfp, conn_t *connp) +{ + conn_t *pconnp, *nconnp; + + IPCL_HASH_REMOVE(connp); + mutex_enter(&connfp->connf_lock); + nconnp = connfp->connf_head; + pconnp = NULL; + while (nconnp != NULL) { + /* + * Walk though entries associated with the fanout until one is + * found which fulfills any of these conditions: + * 1. Listen address of ADDR_ANY/ADDR_UNSPECIFIED + * 2. Listen port the same as connp + */ + if (_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6) || + connp->conn_lport == nconnp->conn_lport) + break; + pconnp = nconnp; + nconnp = nconnp->conn_next; + } + if (pconnp != NULL) { + pconnp->conn_next = connp; + connp->conn_prev = pconnp; + } else { + connfp->connf_head = connp; + } + if (nconnp != NULL) { + connp->conn_next = nconnp; + nconnp->conn_prev = connp; + } + connp->conn_fanout = connfp; + connp->conn_flags = (connp->conn_flags & ~IPCL_REMOVED) | IPCL_BOUND; + CONN_INC_REF(connp); + mutex_exit(&connfp->connf_lock); } void ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp) { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + conn_t **list, *prev, *next; + conn_t *pconnp = NULL, *nconnp; + boolean_t isv4mapped = IN6_IS_ADDR_V4MAPPED(&connp->conn_laddr_v6); + + IPCL_HASH_REMOVE(connp); + mutex_enter(&connfp->connf_lock); + nconnp = connfp->connf_head; + pconnp = NULL; + while (nconnp != NULL) { + if (IN6_IS_ADDR_V4MAPPED_ANY(&nconnp->conn_laddr_v6) && + isv4mapped && connp->conn_lport == nconnp->conn_lport) + break; + if (IN6_IS_ADDR_UNSPECIFIED(&nconnp->conn_laddr_v6) && + (isv4mapped || + connp->conn_lport == nconnp->conn_lport)) + break; + + pconnp = nconnp; + nconnp = nconnp->conn_next; + } + if (pconnp != NULL) { + pconnp->conn_next = connp; + connp->conn_prev = pconnp; + } else { + connfp->connf_head = connp; + } + if (nconnp != NULL) { + connp->conn_next = nconnp; + nconnp->conn_prev = connp; + } + connp->conn_fanout = connfp; + connp->conn_flags = (connp->conn_flags & ~IPCL_REMOVED) | IPCL_BOUND; + CONN_INC_REF(connp); + mutex_exit(&connfp->connf_lock); } /* @@ -1034,9 +1059,9 @@ ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport) IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) || IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } else { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } } else { IPCL_HASH_INSERT_CONNECTED(connfp, connp); @@ -1205,9 +1230,9 @@ ipcl_bind_insert_v4(conn_t *connp) if (connp->conn_faddr_v4 != INADDR_ANY) { IPCL_HASH_INSERT_CONNECTED(connfp, connp); } else if (connp->conn_laddr_v4 
!= INADDR_ANY) { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } if (protocol == IPPROTO_RSVP) ill_set_inputfn_all(ipst); @@ -1219,9 +1244,9 @@ ipcl_bind_insert_v4(conn_t *connp) connfp = &ipst->ips_ipcl_bind_fanout[ IPCL_BIND_HASH(lport, ipst)]; if (connp->conn_laddr_v4 != INADDR_ANY) { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } if (cl_inet_listen != NULL) { ASSERT(connp->conn_ipversion == IPV4_VERSION); @@ -1271,9 +1296,9 @@ ipcl_bind_insert_v6(conn_t *connp) if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) { IPCL_HASH_INSERT_CONNECTED(connfp, connp); } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } break; @@ -1283,9 +1308,9 @@ ipcl_bind_insert_v6(conn_t *connp) connfp = &ipst->ips_ipcl_bind_fanout[ IPCL_BIND_HASH(lport, ipst)]; if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } if (cl_inet_listen != NULL) { sa_family_t addr_family; @@ -1416,9 +1441,9 @@ ipcl_conn_insert_v4(conn_t *connp) if (connp->conn_faddr_v4 != INADDR_ANY) { IPCL_HASH_INSERT_CONNECTED(connfp, connp); } else if (connp->conn_laddr_v4 != INADDR_ANY) { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } break; } @@ -1504,9 +1529,9 @@ ipcl_conn_insert_v6(conn_t *connp) if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) { IPCL_HASH_INSERT_CONNECTED(connfp, connp); } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } break; } diff --git a/usr/src/uts/common/inet/ip/ipsecesp.c b/usr/src/uts/common/inet/ip/ipsecesp.c index c325e8dc26..2ca770ebe9 100644 --- a/usr/src/uts/common/inet/ip/ipsecesp.c +++ b/usr/src/uts/common/inet/ip/ipsecesp.c @@ -234,8 +234,7 @@ esp_kstat_init(ipsecesp_stack_t *espstack, netstackid_t stackid) { espstack->esp_ksp = kstat_create_netstack("ipsecesp", 0, "esp_stat", "net", KSTAT_TYPE_NAMED, - sizeof (esp_kstats_t) / sizeof (kstat_named_t), - KSTAT_FLAG_PERSISTENT, stackid); + sizeof (esp_kstats_t) / sizeof (kstat_named_t), 0, stackid); if (espstack->esp_ksp == NULL || espstack->esp_ksp->ks_data == NULL) return (B_FALSE); diff --git a/usr/src/uts/common/inet/ipclassifier.h b/usr/src/uts/common/inet/ipclassifier.h index f6466434f6..c3139d9288 100644 --- a/usr/src/uts/common/inet/ipclassifier.h +++ b/usr/src/uts/common/inet/ipclassifier.h @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. 
*/ #ifndef _INET_IPCLASSIFIER_H @@ -293,7 +294,8 @@ struct conn_s { conn_ipv6_recvpathmtu : 1, /* IPV6_RECVPATHMTU */ conn_mcbc_bind : 1, /* Bound to multi/broadcast */ - conn_pad_to_bit_31 : 12; + conn_reuseport : 1, /* SO_REUSEPORT state */ + conn_pad_to_bit_31 : 11; boolean_t conn_blocked; /* conn is flow-controlled */ diff --git a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c index f958ca2261..227d2075f8 100644 --- a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c +++ b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c @@ -83,6 +83,14 @@ static int ipf_hook6_loop_out __P((hook_event_token_t, hook_data_t, static int ipf_hook6_loop_in __P((hook_event_token_t, hook_data_t, void *)); static int ipf_hook6 __P((hook_data_t, int, int, void *)); +static int ipf_hookvndl3v4_in __P((hook_event_token_t, hook_data_t, + void *)); +static int ipf_hookvndl3v6_in __P((hook_event_token_t, hook_data_t, + void *)); +static int ipf_hookvndl3v4_out __P((hook_event_token_t, hook_data_t, + void *)); +static int ipf_hookvndl3v6_out __P((hook_event_token_t, hook_data_t, + void *)); extern int ipf_geniter __P((ipftoken_t *, ipfgeniter_t *, ipf_stack_t *)); extern int ipf_frruleiter __P((void *, int, void *, ipf_stack_t *)); @@ -152,6 +160,16 @@ char *hook6_loop_in_gz = "ipfilter_hook6_loop_in_gz"; char *hook6_loop_out = "ipfilter_hook6_loop_out"; char *hook6_loop_out_gz = "ipfilter_hook6_loop_out_gz"; +/* vnd IPv4/v6 hook names */ +char *hook4_vnd_in = "ipfilter_hookvndl3v4_in"; +char *hook4_vnd_in_gz = "ipfilter_hookvndl3v4_in_gz"; +char *hook6_vnd_in = "ipfilter_hookvndl3v6_in"; +char *hook6_vnd_in_gz = "ipfilter_hookvndl3v6_in_gz"; +char *hook4_vnd_out = "ipfilter_hookvndl3v4_out"; +char *hook4_vnd_out_gz = "ipfilter_hookvndl3v4_out_gz"; +char *hook6_vnd_out = "ipfilter_hookvndl3v6_out"; +char *hook6_vnd_out_gz = "ipfilter_hookvndl3v6_out_gz"; + /* ------------------------------------------------------------------------ */ /* Function: ipldetach */ /* Returns: int - 0 == success, else error. 
*/ @@ -248,6 +266,31 @@ ipf_stack_t *ifs; ifs->ifs_ipf_ipv4 = NULL; } + /* + * Remove VND hooks + */ + if (ifs->ifs_ipf_vndl3v4 != NULL) { + UNDO_HOOK(ifs_ipf_vndl3v4, ifs_hookvndl3v4_physical_in, + NH_PHYSICAL_IN, ifs_ipfhookvndl3v4_in); + UNDO_HOOK(ifs_ipf_vndl3v4, ifs_hookvndl3v4_physical_out, + NH_PHYSICAL_OUT, ifs_ipfhookvndl3v4_out); + + if (net_protocol_release(ifs->ifs_ipf_vndl3v4) != 0) + goto detach_failed; + ifs->ifs_ipf_vndl3v4 = NULL; + } + + if (ifs->ifs_ipf_vndl3v6 != NULL) { + UNDO_HOOK(ifs_ipf_vndl3v6, ifs_hookvndl3v6_physical_in, + NH_PHYSICAL_IN, ifs_ipfhookvndl3v6_in); + UNDO_HOOK(ifs_ipf_vndl3v6, ifs_hookvndl3v6_physical_out, + NH_PHYSICAL_OUT, ifs_ipfhookvndl3v6_out); + + if (net_protocol_release(ifs->ifs_ipf_vndl3v6) != 0) + goto detach_failed; + ifs->ifs_ipf_vndl3v6 = NULL; + } + #undef UNDO_HOOK #ifdef IPFDEBUG @@ -445,6 +488,48 @@ ipf_stack_t *ifs; } /* + * Add VND INET hooks + */ + ifs->ifs_ipf_vndl3v4 = net_protocol_lookup(id, NHF_VND_INET); + if (ifs->ifs_ipf_vndl3v4 == NULL) + goto hookup_failed; + + HOOK_INIT_GZ_BEFORE(ifs->ifs_ipfhookvndl3v4_in, ipf_hookvndl3v4_in, + hook4_vnd_in, hook4_vnd_in_gz, ifs); + HOOK_INIT_GZ_AFTER(ifs->ifs_ipfhookvndl3v4_out, ipf_hookvndl3v4_out, + hook4_vnd_out, hook4_vnd_out_gz, ifs); + ifs->ifs_hookvndl3v4_physical_in = (net_hook_register(ifs->ifs_ipf_vndl3v4, + NH_PHYSICAL_IN, ifs->ifs_ipfhookvndl3v4_in) == 0); + if (!ifs->ifs_hookvndl3v4_physical_in) + goto hookup_failed; + + ifs->ifs_hookvndl3v4_physical_out = (net_hook_register(ifs->ifs_ipf_vndl3v4, + NH_PHYSICAL_OUT, ifs->ifs_ipfhookvndl3v4_out) == 0); + if (!ifs->ifs_hookvndl3v4_physical_out) + goto hookup_failed; + + + /* + * VND INET6 hooks + */ + ifs->ifs_ipf_vndl3v6 = net_protocol_lookup(id, NHF_VND_INET6); + if (ifs->ifs_ipf_vndl3v6 == NULL) + goto hookup_failed; + + HOOK_INIT_GZ_BEFORE(ifs->ifs_ipfhookvndl3v6_in, ipf_hookvndl3v6_in, + hook6_vnd_in, hook6_vnd_in_gz, ifs); + HOOK_INIT_GZ_AFTER(ifs->ifs_ipfhookvndl3v6_out, ipf_hookvndl3v6_out, + hook6_vnd_out, hook6_vnd_out_gz, ifs); + ifs->ifs_hookvndl3v6_physical_in = (net_hook_register(ifs->ifs_ipf_vndl3v6, + NH_PHYSICAL_IN, ifs->ifs_ipfhookvndl3v6_in) == 0); + if (!ifs->ifs_hookvndl3v6_physical_in) + goto hookup_failed; + + ifs->ifs_hookvndl3v6_physical_out = (net_hook_register(ifs->ifs_ipf_vndl3v6, + NH_PHYSICAL_OUT, ifs->ifs_ipfhookvndl3v6_out) == 0); + if (!ifs->ifs_hookvndl3v6_physical_out) + goto hookup_failed; + /* * Reacquire ipf_global, now it is safe. */ WRITE_ENTER(&ifs->ifs_ipf_global); @@ -1011,7 +1096,6 @@ cred_t *cp; return ENXIO; unit = isp->ipfs_minor; - /* * ipf_find_stack returns with a read lock on ifs_ipf_global */ @@ -2045,6 +2129,42 @@ int ipf_hook6_loop_out(hook_event_token_t token, hook_data_t info, void *arg) } /* ------------------------------------------------------------------------ */ +/* Function: ipf_hookvndl3_in */ +/* Returns: int - 0 == packet ok, else problem, free packet if not done */ +/* Parameters: event(I) - pointer to event */ +/* info(I) - pointer to hook information for firewalling */ +/* */ +/* The vnd hooks are private hooks to ON. They represents a layer 2 */ +/* datapath generally used to implement virtual machines. The driver sends */ +/* along L3 packets of either type IP or IPv6. The ethertype to distinguish */ +/* them is in the upper 16 bits while the remaining bits are the */ +/* traditional packet hook flags. */ +/* */ +/* They end up calling the appropriate traditional ip hooks. 
*/ +/* ------------------------------------------------------------------------ */ +/*ARGSUSED*/ +int ipf_hookvndl3v4_in(hook_event_token_t token, hook_data_t info, void *arg) +{ + return ipf_hook4_in(token, info, arg); +} + +int ipf_hookvndl3v6_in(hook_event_token_t token, hook_data_t info, void *arg) +{ + return ipf_hook6_in(token, info, arg); +} + +/*ARGSUSED*/ +int ipf_hookvndl3v4_out(hook_event_token_t token, hook_data_t info, void *arg) +{ + return ipf_hook4_out(token, info, arg); +} + +int ipf_hookvndl3v6_out(hook_event_token_t token, hook_data_t info, void *arg) +{ + return ipf_hook6_out(token, info, arg); +} + +/* ------------------------------------------------------------------------ */ /* Function: ipf_hook4_loop_in */ /* Returns: int - 0 == packet ok, else problem, free packet if not done */ /* Parameters: event(I) - pointer to event */ diff --git a/usr/src/uts/common/inet/ipf/ipf.conf b/usr/src/uts/common/inet/ipf/ipf.conf index 6b36f9fdbf..f49e024a72 100644 --- a/usr/src/uts/common/inet/ipf/ipf.conf +++ b/usr/src/uts/common/inet/ipf/ipf.conf @@ -1,3 +1,8 @@ # # name="ipf" parent="pseudo" instance=0; + +# Increase the state table limits. fr_statemax should be ~70% of fr_statesize, +# and both should be prime numbers +fr_statesize=151007; +fr_statemax=113279; diff --git a/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h b/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h index a239f1c1ca..9aa2478c6a 100644 --- a/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h +++ b/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h @@ -125,6 +125,10 @@ struct ipf_stack { hook_t *ifs_ipfhook6_loop_in; hook_t *ifs_ipfhook6_loop_out; hook_t *ifs_ipfhook6_nicevents; + hook_t *ifs_ipfhookvndl3v4_in; + hook_t *ifs_ipfhookvndl3v6_in; + hook_t *ifs_ipfhookvndl3v4_out; + hook_t *ifs_ipfhookvndl3v6_out; /* flags to indicate whether hooks are registered. */ boolean_t ifs_hook4_physical_in; @@ -137,10 +141,16 @@ struct ipf_stack { boolean_t ifs_hook6_nic_events; boolean_t ifs_hook6_loopback_in; boolean_t ifs_hook6_loopback_out; + boolean_t ifs_hookvndl3v4_physical_in; + boolean_t ifs_hookvndl3v6_physical_in; + boolean_t ifs_hookvndl3v4_physical_out; + boolean_t ifs_hookvndl3v6_physical_out; int ifs_ipf_loopback; net_handle_t ifs_ipf_ipv4; net_handle_t ifs_ipf_ipv6; + net_handle_t ifs_ipf_vndl3v4; + net_handle_t ifs_ipf_vndl3v6; /* ip_auth.c */ int ifs_fr_authsize; diff --git a/usr/src/uts/common/inet/ipf/solaris.c b/usr/src/uts/common/inet/ipf/solaris.c index c541f4dddc..5d56debc31 100644 --- a/usr/src/uts/common/inet/ipf/solaris.c +++ b/usr/src/uts/common/inet/ipf/solaris.c @@ -625,7 +625,6 @@ ipf_stack_shutdown(const netid_t id, void *arg) /* * Destroy things for ipf for one stack. */ -/* ARGSUSED */ static void ipf_stack_destroy_one(const netid_t id, ipf_stack_t *ifs) { diff --git a/usr/src/uts/common/inet/sockmods/datafilt.c b/usr/src/uts/common/inet/sockmods/datafilt.c new file mode 100644 index 0000000000..6e1171de46 --- /dev/null +++ b/usr/src/uts/common/inet/sockmods/datafilt.c @@ -0,0 +1,116 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2012, OmniTI Computer Consulting, Inc. All rights reserved. 
+ */ + +/* + * This file implements a socketfilter used to deter TCP connections. + * To defer a connection means to delay the return of accept(3SOCKET) + * until at least one byte is ready to be read(2). This filter may be + * applied automatically or programmatically through the use of + * soconfig(1M) and setsockopt(3SOCKET). + */ + +#include <sys/kmem.h> +#include <sys/systm.h> +#include <sys/stropts.h> +#include <sys/strsun.h> +#include <sys/socketvar.h> +#include <sys/sockfilter.h> +#include <sys/note.h> +#include <sys/taskq.h> + +#define DATAFILT_MODULE "datafilt" + +static struct modlmisc dataf_modlmisc = { + &mod_miscops, + "Kernel data-ready socket filter" +}; + +static struct modlinkage dataf_modlinkage = { + MODREV_1, + &dataf_modlmisc, + NULL +}; + +static sof_rval_t +dataf_attach_passive_cb(sof_handle_t handle, sof_handle_t ph, + void *parg, struct sockaddr *laddr, socklen_t laddrlen, + struct sockaddr *faddr, socklen_t faddrlen, void **cookiep) +{ + _NOTE(ARGUNUSED(handle, ph, parg, laddr, laddrlen, faddr, faddrlen, + cookiep)); + return (SOF_RVAL_DEFER); +} + +static void +dataf_detach_cb(sof_handle_t handle, void *cookie, cred_t *cr) +{ + _NOTE(ARGUNUSED(handle, cookie, cr)); +} + +static mblk_t * +dataf_data_in_cb(sof_handle_t handle, void *cookie, mblk_t *mp, int flags, + size_t *lenp) +{ + _NOTE(ARGUNUSED(cookie, flags, lenp)); + + if (mp != NULL && MBLKL(mp) > 0) { + sof_newconn_ready(handle); + sof_bypass(handle); + } + + return (mp); +} + +static sof_ops_t dataf_ops = { + .sofop_attach_passive = dataf_attach_passive_cb, + .sofop_detach = dataf_detach_cb, + .sofop_data_in = dataf_data_in_cb +}; + +int +_init(void) +{ + int err; + + /* + * This module is safe to attach even after some preliminary socket + * setup calls have taken place. See the comment for SOF_ATT_SAFE. + */ + err = sof_register(SOF_VERSION, DATAFILT_MODULE, &dataf_ops, + SOF_ATT_SAFE); + if (err != 0) + return (err); + if ((err = mod_install(&dataf_modlinkage)) != 0) + (void) sof_unregister(DATAFILT_MODULE); + + return (err); +} + +int +_fini(void) +{ + int err; + + if ((err = sof_unregister(DATAFILT_MODULE)) != 0) + return (err); + + return (mod_remove(&dataf_modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&dataf_modlinkage, modinfop)); +} diff --git a/usr/src/uts/common/inet/squeue.c b/usr/src/uts/common/inet/squeue.c index 2e08dc359b..1009f0700f 100644 --- a/usr/src/uts/common/inet/squeue.c +++ b/usr/src/uts/common/inet/squeue.c @@ -23,7 +23,7 @@ */ /* - * Copyright 2012 Joyent, Inc. All rights reserved. + * Copyright (c) 2014 Joyent, Inc. All rights reserved. */ /* @@ -61,6 +61,10 @@ * connection are processed on that squeue. The connection ("conn") to * squeue mapping is stored in "conn_t" member "conn_sqp". * + * If the squeue is not related to TCP/IP, then the value of sqp->sq_isip is + * false and it will not have an associated conn_t, which means many aspects of + * the system, such as polling and swtiching squeues will not be used. 
+ * * Since the processing of the connection cuts across multiple layers * but still allows packets for different connnection to be processed on * other CPU/squeues, squeues are also termed as "Vertical Perimeter" or @@ -244,7 +248,7 @@ squeue_init(void) /* ARGSUSED */ squeue_t * -squeue_create(clock_t wait, pri_t pri) +squeue_create(clock_t wait, pri_t pri, boolean_t isip) { squeue_t *sqp = kmem_cache_alloc(squeue_cache, KM_SLEEP); @@ -260,11 +264,36 @@ squeue_create(clock_t wait, pri_t pri) sqp->sq_enter = squeue_enter; sqp->sq_drain = squeue_drain; + sqp->sq_isip = isip; return (sqp); } /* + * We need to kill the threads and then clean up. We should VERIFY that + * polling is disabled so we don't have to worry about disassociating from + * MAC/IP/etc. + */ +void +squeue_destroy(squeue_t *sqp) +{ + kt_did_t worker, poll; + mutex_enter(&sqp->sq_lock); + VERIFY(!(sqp->sq_state & (SQS_POLL_THR_QUIESCED | + SQS_POLL_QUIESCE_DONE | SQS_PAUSE | SQS_EXIT))); + worker = sqp->sq_worker->t_did; + poll = sqp->sq_poll_thr->t_did; + sqp->sq_state |= SQS_EXIT; + cv_signal(&sqp->sq_poll_cv); + cv_signal(&sqp->sq_worker_cv); + mutex_exit(&sqp->sq_lock); + + thread_join(poll); + thread_join(worker); + kmem_cache_free(squeue_cache, sqp); +} + +/* * Bind squeue worker thread to the specified CPU, given by CPU id. * If the CPU id value is -1, bind the worker thread to the value * specified in sq_bind field. If a thread is already bound to a @@ -475,18 +504,21 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, * Handle squeue switching. More details in the * block comment at the top of the file */ - if (connp->conn_sqp == sqp) { + if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) { SQUEUE_DBG_SET(sqp, mp, proc, connp, tag); - connp->conn_on_sqp = B_TRUE; + if (sqp->sq_isip == B_TRUE) + connp->conn_on_sqp = B_TRUE; DTRACE_PROBE3(squeue__proc__start, squeue_t *, sqp, mblk_t *, mp, conn_t *, connp); (*proc)(connp, mp, sqp, ira); DTRACE_PROBE2(squeue__proc__end, squeue_t *, sqp, conn_t *, connp); - connp->conn_on_sqp = B_FALSE; + if (sqp->sq_isip == B_TRUE) { + connp->conn_on_sqp = B_FALSE; + CONN_DEC_REF(connp); + } SQUEUE_DBG_CLEAR(sqp); - CONN_DEC_REF(connp); } else { SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE); @@ -513,7 +545,7 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, return; } } else { - if (ira != NULL) { + if (sqp->sq_isip == B_TRUE && ira != NULL) { mblk_t *attrmp; ASSERT(cnt == 1); @@ -587,7 +619,8 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, if (!(sqp->sq_state & SQS_REENTER) && (process_flag != SQ_FILL) && (sqp->sq_first == NULL) && (sqp->sq_run == curthread) && (cnt == 1) && - (connp->conn_on_sqp == B_FALSE)) { + (sqp->sq_isip == B_FALSE || + connp->conn_on_sqp == B_FALSE)) { sqp->sq_state |= SQS_REENTER; mutex_exit(&sqp->sq_lock); @@ -602,15 +635,21 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, * Handle squeue switching. 
More details in the * block comment at the top of the file */ - if (connp->conn_sqp == sqp) { - connp->conn_on_sqp = B_TRUE; + if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) { + SQUEUE_DBG_SET(sqp, mp, proc, connp, + tag); + if (sqp->sq_isip == B_TRUE) + connp->conn_on_sqp = B_TRUE; DTRACE_PROBE3(squeue__proc__start, squeue_t *, sqp, mblk_t *, mp, conn_t *, connp); (*proc)(connp, mp, sqp, ira); DTRACE_PROBE2(squeue__proc__end, squeue_t *, sqp, conn_t *, connp); - connp->conn_on_sqp = B_FALSE; - CONN_DEC_REF(connp); + if (sqp->sq_isip == B_TRUE) { + connp->conn_on_sqp = B_FALSE; + CONN_DEC_REF(connp); + } + SQUEUE_DBG_CLEAR(sqp); } else { SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE); @@ -631,7 +670,7 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, #ifdef DEBUG mp->b_tag = tag; #endif - if (ira != NULL) { + if (sqp->sq_isip && ira != NULL) { mblk_t *attrmp; ASSERT(cnt == 1); @@ -779,7 +818,7 @@ again: mp->b_prev = NULL; /* Is there an ip_recv_attr_t to handle? */ - if (ip_recv_attr_is_mblk(mp)) { + if (sqp->sq_isip == B_TRUE && ip_recv_attr_is_mblk(mp)) { mblk_t *attrmp = mp; ASSERT(attrmp->b_cont != NULL); @@ -804,20 +843,25 @@ again: /* - * Handle squeue switching. More details in the - * block comment at the top of the file + * Handle squeue switching. More details in the block comment at + * the top of the file. non-IP squeues cannot switch, as there + * is no conn_t. */ - if (connp->conn_sqp == sqp) { + if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) { SQUEUE_DBG_SET(sqp, mp, proc, connp, mp->b_tag); - connp->conn_on_sqp = B_TRUE; + if (sqp->sq_isip == B_TRUE) + connp->conn_on_sqp = B_TRUE; DTRACE_PROBE3(squeue__proc__start, squeue_t *, sqp, mblk_t *, mp, conn_t *, connp); (*proc)(connp, mp, sqp, ira); DTRACE_PROBE2(squeue__proc__end, squeue_t *, sqp, conn_t *, connp); - connp->conn_on_sqp = B_FALSE; - CONN_DEC_REF(connp); + if (sqp->sq_isip == B_TRUE) { + connp->conn_on_sqp = B_FALSE; + CONN_DEC_REF(connp); + } + SQUEUE_DBG_CLEAR(sqp); } else { SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE); @@ -1051,6 +1095,11 @@ squeue_polling_thread(squeue_t *sqp) cv_wait(async, lock); CALLB_CPR_SAFE_END(&cprinfo, lock); + if (sqp->sq_state & SQS_EXIT) { + mutex_exit(lock); + thread_exit(); + } + ctl_state = sqp->sq_state & (SQS_POLL_THR_CONTROL | SQS_POLL_THR_QUIESCED); if (ctl_state != 0) { @@ -1076,6 +1125,9 @@ squeue_polling_thread(squeue_t *sqp) (SQS_PROC|SQS_POLLING|SQS_GET_PKTS)) == (SQS_PROC|SQS_POLLING|SQS_GET_PKTS)); + /* Only IP related squeues should reach this point */ + VERIFY(sqp->sq_isip == B_TRUE); + poll_again: sq_rx_ring = sqp->sq_rx_ring; sq_get_pkts = sq_rx_ring->rr_rx; @@ -1205,6 +1257,7 @@ squeue_worker_thr_control(squeue_t *sqp) ill_rx_ring_t *rx_ring; ASSERT(MUTEX_HELD(&sqp->sq_lock)); + VERIFY(sqp->sq_isip == B_TRUE); if (sqp->sq_state & SQS_POLL_RESTART) { /* Restart implies a previous quiesce. */ @@ -1316,6 +1369,11 @@ squeue_worker(squeue_t *sqp) for (;;) { for (;;) { + if (sqp->sq_state & SQS_EXIT) { + mutex_exit(lock); + thread_exit(); + } + /* * If the poll thread has handed control to us * we need to break out of the wait. 
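Every squeue_create() caller now states whether the squeue carries TCP/IP traffic, and non-IP squeues can be torn down again with the new squeue_destroy(). The consumer below is hypothetical (the actual non-IP user of this interface lies outside this section), and the wait value of 0 and the priority are arbitrary; it is only meant to show the shape of the new calls.

/*
 * Hypothetical non-IP consumer of the updated squeue interface.  With
 * isip == B_FALSE there is no conn_t behind the traffic, so polling and
 * squeue switching are never engaged for this squeue.
 */
static squeue_t *
consumer_squeue_init(pri_t pri)
{
	return (squeue_create(0, pri, B_FALSE));
}

static void
consumer_squeue_fini(squeue_t *sqp)
{
	/* Joins the worker and poll threads before the squeue is freed. */
	squeue_destroy(sqp);
}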
@@ -1412,6 +1470,7 @@ squeue_synch_enter(conn_t *connp, mblk_t *use_mp) again: sqp = connp->conn_sqp; + VERIFY(sqp->sq_isip == B_TRUE); mutex_enter(&sqp->sq_lock); if (sqp->sq_first == NULL && !(sqp->sq_state & SQS_PROC)) { @@ -1487,6 +1546,7 @@ void squeue_synch_exit(conn_t *connp) { squeue_t *sqp = connp->conn_sqp; + VERIFY(sqp->sq_isip == B_TRUE); mutex_enter(&sqp->sq_lock); if (sqp->sq_run == curthread) { diff --git a/usr/src/uts/common/inet/tcp.h b/usr/src/uts/common/inet/tcp.h index b2b9973291..6ec2e6b2d7 100644 --- a/usr/src/uts/common/inet/tcp.h +++ b/usr/src/uts/common/inet/tcp.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 by Delphix. All rights reserved. */ @@ -134,6 +134,7 @@ typedef struct tcphdra_s { struct conn_s; struct tcp_listen_cnt_s; +struct tcp_rg_s; /* * Control structure for each open TCP stream, @@ -404,6 +405,13 @@ typedef struct tcp_s { struct tcp_s *tcp_bind_hash_port; /* tcp_t's bound to the same lport */ struct tcp_s **tcp_ptpbhn; + /* + * Group of tcp_t entries bound to the same adress and port via + * SO_REUSEPORT. The pointer itself is protected by tf_lock in the + * containing tcps_bind_fanout slot. + */ + struct tcp_rg_s *tcp_rg_bind; + uint_t tcp_maxpsz_multiplier; uint32_t tcp_lso_max; /* maximum LSO payload */ diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c index fba7125690..cf046c968e 100644 --- a/usr/src/uts/common/inet/tcp/tcp.c +++ b/usr/src/uts/common/inet/tcp/tcp.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, Joyent Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013,2014 by Delphix. All rights reserved. * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved. @@ -1423,6 +1423,21 @@ tcp_free(tcp_t *tcp) tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind); /* + * Destroy any association with SO_REUSEPORT group. + */ + if (tcp->tcp_rg_bind != NULL) { + /* + * This is only necessary for connections which enabled + * SO_REUSEPORT but were never bound. Such connections should + * be the one and only member of the tcp_rg_tp to which they + * have been associated. + */ + VERIFY(tcp_rg_remove(tcp->tcp_rg_bind, tcp)); + tcp_rg_destroy(tcp->tcp_rg_bind); + tcp->tcp_rg_bind = NULL; + } + + /* * If this is a non-STREAM socket still holding on to an upper * handle, release it. As a result of fallback we might also see * STREAMS based conns with upper handles, in which case there is @@ -2054,8 +2069,7 @@ tcp_reinit(tcp_t *tcp) * structure! */ static void -tcp_reinit_values(tcp) - tcp_t *tcp; +tcp_reinit_values(tcp_t *tcp) { tcp_stack_t *tcps = tcp->tcp_tcps; conn_t *connp = tcp->tcp_connp; diff --git a/usr/src/uts/common/inet/tcp/tcp_bind.c b/usr/src/uts/common/inet/tcp/tcp_bind.c index c6df39b91e..adc201eebb 100644 --- a/usr/src/uts/common/inet/tcp/tcp_bind.c +++ b/usr/src/uts/common/inet/tcp/tcp_bind.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2013 Nexenta Systems, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. 
*/ #include <sys/types.h> @@ -55,6 +56,7 @@ static uint32_t tcp_random_anon_port = 1; static int tcp_bind_select_lport(tcp_t *, in_port_t *, boolean_t, cred_t *cr); static in_port_t tcp_get_next_priv_port(const tcp_t *); +static int tcp_rg_insert(tcp_rg_t *, struct tcp_s *); /* * Hash list insertion routine for tcp_t structures. Each hash bucket @@ -172,6 +174,16 @@ tcp_bind_hash_remove(tcp_t *tcp) ASSERT(lockp != NULL); mutex_enter(lockp); + + /* destroy any association with SO_REUSEPORT group */ + if (tcp->tcp_rg_bind != NULL) { + if (tcp_rg_remove(tcp->tcp_rg_bind, tcp)) { + /* Last one out turns off the lights */ + tcp_rg_destroy(tcp->tcp_rg_bind); + } + tcp->tcp_rg_bind = NULL; + } + if (tcp->tcp_ptpbhn) { tcpnext = tcp->tcp_bind_hash_port; if (tcpnext != NULL) { @@ -636,13 +648,12 @@ tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, } /* - * If the "bind_to_req_port_only" parameter is set, if the requested port - * number is available, return it, If not return 0 + * If the "bind_to_req_port_only" parameter is set and the requested port + * number is available, return it (else return 0). * - * If "bind_to_req_port_only" parameter is not set and - * If the requested port number is available, return it. If not, return - * the first anonymous port we happen across. If no anonymous ports are - * available, return 0. addr is the requested local address, if any. + * If "bind_to_req_port_only" parameter is not set and the requested port + * number is available, return it. If not, return the first anonymous port we + * happen across. If no anonymous ports are available, return 0. * * In either case, when succeeding update the tcp_t to record the port number * and insert it in the bind hash table. @@ -662,6 +673,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, int loopmax; conn_t *connp = tcp->tcp_connp; tcp_stack_t *tcps = tcp->tcp_tcps; + boolean_t reuseport = connp->conn_reuseport; /* * Lookup for free addresses is done in a loop and "loopmax" @@ -698,6 +710,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, tf_t *tbf; tcp_t *ltcp; conn_t *lconnp; + boolean_t attempt_reuse = B_FALSE; lport = htons(port); @@ -724,6 +737,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) { boolean_t not_socket; boolean_t exclbind; + boolean_t addrmatch; lconnp = ltcp->tcp_connp; @@ -829,22 +843,34 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, &lconnp->conn_faddr_v6))) continue; + addrmatch = IN6_ARE_ADDR_EQUAL(laddr, + &lconnp->conn_bound_addr_v6); + + if (addrmatch && reuseport && bind_to_req_port_only && + (ltcp->tcp_state == TCPS_BOUND || + ltcp->tcp_state == TCPS_LISTEN)) { + /* + * This entry is bound to the exact same + * address and port. If SO_REUSEPORT is set on + * the calling socket, attempt to reuse this + * binding if it too appears to be willing. + */ + attempt_reuse = B_TRUE; + break; + } + if (!reuseaddr) { /* - * No socket option SO_REUSEADDR. - * If existing port is bound to - * a non-wildcard IP address - * and the requesting stream is - * bound to a distinct - * different IP addresses - * (non-wildcard, also), keep - * going. + * No socket option SO_REUSEADDR. If an + * existing port is bound to a non-wildcard IP + * address and the requesting stream is bound + * to a distinct different IP address + * (non-wildcard, also), keep going. 
*/ if (!V6_OR_V4_INADDR_ANY(*laddr) && !V6_OR_V4_INADDR_ANY( lconnp->conn_bound_addr_v6) && - !IN6_ARE_ADDR_EQUAL(laddr, - &lconnp->conn_bound_addr_v6)) + !addrmatch) continue; if (ltcp->tcp_state >= TCPS_BOUND) { /* @@ -859,27 +885,47 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, * socket option SO_REUSEADDR is set on the * binding tcp_t. * - * If two streams are bound to - * same IP address or both addr - * and bound source are wildcards - * (INADDR_ANY), we want to stop - * searching. - * We have found a match of IP source - * address and source port, which is - * refused regardless of the - * SO_REUSEADDR setting, so we break. + * If two streams are bound to the same IP + * address or both addr and bound source are + * wildcards (INADDR_ANY), we want to stop + * searching. We have found a match of IP + * source address and source port, which is + * refused regardless of the SO_REUSEADDR + * setting, so we break. */ - if (IN6_ARE_ADDR_EQUAL(laddr, - &lconnp->conn_bound_addr_v6) && + if (addrmatch && (ltcp->tcp_state == TCPS_LISTEN || ltcp->tcp_state == TCPS_BOUND)) break; } } - if (ltcp != NULL) { + if (ltcp != NULL && !attempt_reuse) { /* The port number is busy */ mutex_exit(&tbf->tf_lock); } else { + if (attempt_reuse) { + int err; + + ASSERT(ltcp != NULL); + ASSERT(ltcp->tcp_rg_bind != NULL); + ASSERT(tcp->tcp_rg_bind != NULL); + ASSERT(ltcp->tcp_rg_bind != tcp->tcp_rg_bind); + + err = tcp_rg_insert(ltcp->tcp_rg_bind, tcp); + if (err != 0) { + mutex_exit(&tbf->tf_lock); + return (0); + } + /* + * Now that the newly-binding socket has joined + * the existing reuseport group on ltcp, it + * should clean up its own (empty) group. + */ + VERIFY(tcp_rg_remove(tcp->tcp_rg_bind, tcp)); + tcp_rg_destroy(tcp->tcp_rg_bind); + tcp->tcp_rg_bind = ltcp->tcp_rg_bind; + } + /* * This port is ours. 
Insert in fanout and mark as * bound to prevent others from getting the port @@ -944,3 +990,125 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, } while (++count < loopmax); return (0); } + +/* Max number of members in TCP SO_REUSEPORT group */ +#define TCP_RG_SIZE_MAX 64 +/* Step size when expanding members array */ +#define TCP_RG_SIZE_STEP 2 + + +tcp_rg_t * +tcp_rg_init(tcp_t *tcp) +{ + tcp_rg_t *rg; + rg = kmem_alloc(sizeof (tcp_rg_t), KM_NOSLEEP|KM_NORMALPRI); + if (rg == NULL) + return (NULL); + rg->tcprg_members = kmem_zalloc(2 * sizeof (tcp_t *), + KM_NOSLEEP|KM_NORMALPRI); + if (rg->tcprg_members == NULL) { + kmem_free(rg, sizeof (tcp_rg_t)); + return (NULL); + } + + mutex_init(&rg->tcprg_lock, NULL, MUTEX_DEFAULT, NULL); + rg->tcprg_size = 2; + rg->tcprg_count = 1; + rg->tcprg_active = 1; + rg->tcprg_members[0] = tcp; + return (rg); +} + +void +tcp_rg_destroy(tcp_rg_t *rg) +{ + mutex_enter(&rg->tcprg_lock); + ASSERT(rg->tcprg_count == 0); + ASSERT(rg->tcprg_active == 0); + kmem_free(rg->tcprg_members, rg->tcprg_size * sizeof (tcp_t *)); + mutex_destroy(&rg->tcprg_lock); + kmem_free(rg, sizeof (struct tcp_rg_s)); +} + +static int +tcp_rg_insert(tcp_rg_t *rg, tcp_t *tcp) +{ + mutex_enter(&rg->tcprg_lock); + + VERIFY(rg->tcprg_size > 0); + VERIFY(rg->tcprg_count <= rg->tcprg_size); + if (rg->tcprg_count != 0) { + cred_t *oldcred = rg->tcprg_members[0]->tcp_connp->conn_cred; + cred_t *newcred = tcp->tcp_connp->conn_cred; + + if (crgetuid(oldcred) != crgetuid(newcred) || + crgetzoneid(oldcred) != crgetzoneid(newcred)) { + mutex_exit(&rg->tcprg_lock); + return (EPERM); + } + } + + if (rg->tcprg_count == rg->tcprg_size) { + unsigned int oldalloc = rg->tcprg_size * sizeof (tcp_t *); + unsigned int newsize = rg->tcprg_size + TCP_RG_SIZE_STEP; + tcp_t **newmembers; + + if (newsize > TCP_RG_SIZE_MAX) { + mutex_exit(&rg->tcprg_lock); + return (EINVAL); + } + newmembers = kmem_zalloc(newsize * sizeof (tcp_t *), + KM_NOSLEEP|KM_NORMALPRI); + if (newmembers == NULL) { + mutex_exit(&rg->tcprg_lock); + return (ENOMEM); + } + bcopy(rg->tcprg_members, newmembers, oldalloc); + kmem_free(rg->tcprg_members, oldalloc); + rg->tcprg_members = newmembers; + rg->tcprg_size = newsize; + } + + rg->tcprg_members[rg->tcprg_count] = tcp; + rg->tcprg_count++; + rg->tcprg_active++; + + mutex_exit(&rg->tcprg_lock); + return (0); +} + +boolean_t +tcp_rg_remove(tcp_rg_t *rg, tcp_t *tcp) +{ + int i; + boolean_t is_empty; + + mutex_enter(&rg->tcprg_lock); + for (i = 0; i < rg->tcprg_count; i++) { + if (rg->tcprg_members[i] == tcp) + break; + } + /* The item should be present */ + ASSERT(i < rg->tcprg_count); + /* Move the last member into this position */ + rg->tcprg_count--; + rg->tcprg_members[i] = rg->tcprg_members[rg->tcprg_count]; + rg->tcprg_members[rg->tcprg_count] = NULL; + if (tcp->tcp_connp->conn_reuseport != 0) + rg->tcprg_active--; + is_empty = (rg->tcprg_count == 0); + mutex_exit(&rg->tcprg_lock); + return (is_empty); +} + +void +tcp_rg_setactive(tcp_rg_t *rg, boolean_t is_active) +{ + mutex_enter(&rg->tcprg_lock); + if (is_active) { + rg->tcprg_active++; + } else { + rg->tcprg_active--; + } + mutex_exit(&rg->tcprg_lock); +} diff --git a/usr/src/uts/common/inet/tcp/tcp_input.c b/usr/src/uts/common/inet/tcp/tcp_input.c index cf8e0c6bd4..7cfdb9a4a2 100644 --- a/usr/src/uts/common/inet/tcp/tcp_input.c +++ b/usr/src/uts/common/inet/tcp/tcp_input.c @@ -22,7 +22,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. 
All rights reserved. - * Copyright (c) 2011 Joyent, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. * Copyright (c) 2014 by Delphix. All rights reserved. */ @@ -99,7 +99,7 @@ * tcps_time_wait_interval since the period before upper layer closes the * connection is not accounted for when tcp_time_wait_append() is called. * - * If uppser layer has closed the connection, call tcp_time_wait_append() + * If upper layer has closed the connection, call tcp_time_wait_append() * directly. * */ diff --git a/usr/src/uts/common/inet/tcp/tcp_opt_data.c b/usr/src/uts/common/inet/tcp/tcp_opt_data.c index 1a5363bedc..835acd1b12 100644 --- a/usr/src/uts/common/inet/tcp/tcp_opt_data.c +++ b/usr/src/uts/common/inet/tcp/tcp_opt_data.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. */ #include <sys/types.h> @@ -62,7 +63,8 @@ opdes_t tcp_opt_arr[] = { { SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_REUSEPORT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, { SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, @@ -483,6 +485,42 @@ tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) return (retval); } +static int +tcp_set_reuseport(conn_t *connp, boolean_t do_enable) +{ + tcp_t *tcp = connp->conn_tcp; + struct tcp_rg_s *rg; + + if (do_enable && !IPCL_IS_NONSTR(connp)) { + /* + * SO_REUSEPORT cannot be enabled on sockets which have fallen + * back to the STREAMS API. + */ + return (EINVAL); + } + if (connp->conn_reuseport == 0 && do_enable) { + /* disabled -> enabled */ + if (tcp->tcp_rg_bind != NULL) { + tcp_rg_setactive(tcp->tcp_rg_bind, do_enable); + } else { + if (tcp->tcp_state >= TCPS_BOUND || + tcp->tcp_state <= TCPS_CLOSED) + return (EINVAL); + if ((rg = tcp_rg_init(tcp)) == NULL) + return (ENOMEM); + tcp->tcp_rg_bind = rg; + } + connp->conn_reuseport = 1; + } else if (connp->conn_reuseport != 0 && !do_enable) { + /* enabled -> disabled */ + if (tcp->tcp_rg_bind != NULL) { + tcp_rg_setactive(tcp->tcp_rg_bind, do_enable); + } + connp->conn_reuseport = 0; + } + return (0); +} + /* * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements. * Parameters are assumed to be verified by the caller. @@ -653,6 +691,11 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, } *outlenp = inlen; return (0); + case SO_REUSEPORT: + if (!checkonly) { + return (tcp_set_reuseport(connp, *i1 != 0)); + } + return (0); } break; case IPPROTO_TCP: @@ -769,14 +812,37 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, if (*i1 == 0) { return (EINVAL); } else if (tcp->tcp_ka_rinterval == 0) { - if ((tcp->tcp_ka_abort_thres / *i1) < - tcp->tcp_rto_min || - (tcp->tcp_ka_abort_thres / *i1) > - tcp->tcp_rto_max) - return (EINVAL); + /* + * When TCP_KEEPCNT is specified without first + * specifying a TCP_KEEPINTVL, we infer an + * interval based on a tunable specific to our + * stack: the tcp_keepalive_abort_interval. + * (Or the TCP_KEEPALIVE_ABORT_THRESHOLD, in + * the unlikely event that that has been set.) 
+ * Given the abort interval's default value of + * 480 seconds, low TCP_KEEPCNT values can + * result in intervals that exceed the default + * maximum RTO of 60 seconds. Rather than + * fail in these cases, we (implicitly) clamp + * the interval at the maximum RTO; if the + * TCP_KEEPCNT is shortly followed by a + * TCP_KEEPINTVL (as we expect), the abort + * threshold will be recalculated correctly -- + * and if a TCP_KEEPINTVL is not forthcoming, + * keep-alive will at least operate reasonably + * given the underconfigured state. + */ + uint32_t interval; - tcp->tcp_ka_rinterval = - tcp->tcp_ka_abort_thres / *i1; + interval = tcp->tcp_ka_abort_thres / *i1; + + if (interval < tcp->tcp_rto_min) + interval = tcp->tcp_rto_min; + + if (interval > tcp->tcp_rto_max) + interval = tcp->tcp_rto_max; + + tcp->tcp_ka_rinterval = interval; } else { if ((*i1 * tcp->tcp_ka_rinterval) < tcps->tcps_keepalive_abort_interval_low || @@ -953,10 +1019,6 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, } break; case IPPROTO_IP: - if (connp->conn_family != AF_INET) { - *outlenp = 0; - return (EINVAL); - } switch (name) { case IP_SEC_OPT: /* diff --git a/usr/src/uts/common/inet/tcp/tcp_socket.c b/usr/src/uts/common/inet/tcp/tcp_socket.c index a431bf63d1..8f535a5dd1 100644 --- a/usr/src/uts/common/inet/tcp/tcp_socket.c +++ b/usr/src/uts/common/inet/tcp/tcp_socket.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. */ /* This file contains all TCP kernel socket related functions. */ @@ -1022,6 +1023,16 @@ tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q, } /* + * Do not allow fallback on connections making use of SO_REUSEPORT. + */ + if (tcp->tcp_rg_bind != NULL) { + freeb(stropt_mp); + freeb(ordrel_mp); + squeue_synch_exit(connp); + return (EINVAL); + } + + /* * Both endpoints must be of the same type (either STREAMS or * non-STREAMS) for fusion to be enabled. So if we are fused, * we have to unfuse. diff --git a/usr/src/uts/common/inet/tcp/tcp_time_wait.c b/usr/src/uts/common/inet/tcp/tcp_time_wait.c index b470934da0..6600296b18 100644 --- a/usr/src/uts/common/inet/tcp/tcp_time_wait.c +++ b/usr/src/uts/common/inet/tcp/tcp_time_wait.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, Joyent Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. */ /* @@ -41,13 +41,13 @@ #include <inet/tcp_impl.h> #include <inet/tcp_cluster.h> -static void tcp_timewait_close(void *, mblk_t *, void *, ip_recv_attr_t *); +static void tcp_time_wait_purge(tcp_t *, tcp_squeue_priv_t *); + +#define TW_BUCKET(t) \ + (((t) / MSEC_TO_TICK(TCP_TIME_WAIT_DELAY)) % TCP_TIME_WAIT_BUCKETS) + +#define TW_BUCKET_NEXT(b) (((b) + 1) % TCP_TIME_WAIT_BUCKETS) -/* - * TCP_TIME_WAIT_DELAY governs how often the time_wait_collector runs. - * Running it every 5 seconds seems to give the best results. - */ -#define TCP_TIME_WAIT_DELAY ((hrtime_t)5 * NANOSEC) /* * Remove a connection from the list of detached TIME_WAIT connections. @@ -56,17 +56,17 @@ static void tcp_timewait_close(void *, mblk_t *, void *, ip_recv_attr_t *); * earlier call to tcp_time_wait_remove(); otherwise it returns B_TRUE. 
*/ boolean_t -tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait) +tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tsp) { boolean_t locked = B_FALSE; - if (tcp_time_wait == NULL) { - tcp_time_wait = *((tcp_squeue_priv_t **) + if (tsp == NULL) { + tsp = *((tcp_squeue_priv_t **) squeue_getprivate(tcp->tcp_connp->conn_sqp, SQPRIVATE_TCP)); - mutex_enter(&tcp_time_wait->tcp_time_wait_lock); + mutex_enter(&tsp->tcp_time_wait_lock); locked = B_TRUE; } else { - ASSERT(MUTEX_HELD(&tcp_time_wait->tcp_time_wait_lock)); + ASSERT(MUTEX_HELD(&tsp->tcp_time_wait_lock)); } /* 0 means that the tcp_t has not been added to the time wait list. */ @@ -74,40 +74,34 @@ tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait) ASSERT(tcp->tcp_time_wait_next == NULL); ASSERT(tcp->tcp_time_wait_prev == NULL); if (locked) - mutex_exit(&tcp_time_wait->tcp_time_wait_lock); + mutex_exit(&tsp->tcp_time_wait_lock); return (B_FALSE); } ASSERT(TCP_IS_DETACHED(tcp)); ASSERT(tcp->tcp_state == TCPS_TIME_WAIT); + ASSERT(tsp->tcp_time_wait_cnt > 0); - if (tcp == tcp_time_wait->tcp_time_wait_head) { - ASSERT(tcp->tcp_time_wait_prev == NULL); - tcp_time_wait->tcp_time_wait_head = tcp->tcp_time_wait_next; - if (tcp_time_wait->tcp_time_wait_head != NULL) { - tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev = - NULL; - } else { - tcp_time_wait->tcp_time_wait_tail = NULL; - } - } else if (tcp == tcp_time_wait->tcp_time_wait_tail) { - ASSERT(tcp->tcp_time_wait_next == NULL); - tcp_time_wait->tcp_time_wait_tail = tcp->tcp_time_wait_prev; - ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL); - tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = NULL; - } else { - ASSERT(tcp->tcp_time_wait_prev->tcp_time_wait_next == tcp); - ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == tcp); - tcp->tcp_time_wait_prev->tcp_time_wait_next = - tcp->tcp_time_wait_next; + if (tcp->tcp_time_wait_next != NULL) { tcp->tcp_time_wait_next->tcp_time_wait_prev = tcp->tcp_time_wait_prev; } + if (tcp->tcp_time_wait_prev != NULL) { + tcp->tcp_time_wait_prev->tcp_time_wait_next = + tcp->tcp_time_wait_next; + } else { + unsigned int bucket; + + bucket = TW_BUCKET(tcp->tcp_time_wait_expire); + ASSERT(tsp->tcp_time_wait_bucket[bucket] == tcp); + tsp->tcp_time_wait_bucket[bucket] = tcp->tcp_time_wait_next; + } tcp->tcp_time_wait_next = NULL; tcp->tcp_time_wait_prev = NULL; tcp->tcp_time_wait_expire = 0; + tsp->tcp_time_wait_cnt--; if (locked) - mutex_exit(&tcp_time_wait->tcp_time_wait_lock); + mutex_exit(&tsp->tcp_time_wait_lock); return (B_TRUE); } @@ -126,6 +120,7 @@ tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait) ((x)->tcp_connp->conn_ipversion == IPV6_VERSION && \ IN6_IS_ADDR_LOOPBACK(&(x)->tcp_connp->conn_laddr_v6))) + /* * Add a connection to the list of detached TIME_WAIT connections * and set its time to expire. 
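As an aside on the TW_BUCKET() macros introduced above: an expiration time is truncated to a TCP_TIME_WAIT_DELAY-sized window and then wrapped into the fixed ring of buckets, so every connection expiring within one window shares a bucket. The stand-in constants below are invented purely to show the arithmetic; the real definitions are not part of this section.

#include <sys/types.h>

#define	EX_TW_DELAY_TICKS	100	/* assumed width of one bucket, in ticks */
#define	EX_TW_BUCKETS		16	/* assumed number of buckets in the ring */

static uint_t
ex_tw_bucket(int64_t expire)
{
	/* Same shape as TW_BUCKET(): truncate to a window, then wrap. */
	return ((uint_t)((expire / EX_TW_DELAY_TICKS) % EX_TW_BUCKETS));
}

Because everything expiring in the same window maps to the same index, the collector can reap a whole bucket at a time rather than walking a single time-ordered list, which is the point of replacing the old head/tail list.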
@@ -135,9 +130,10 @@ tcp_time_wait_append(tcp_t *tcp) { tcp_stack_t *tcps = tcp->tcp_tcps; squeue_t *sqp = tcp->tcp_connp->conn_sqp; - tcp_squeue_priv_t *tcp_time_wait = + tcp_squeue_priv_t *tsp = *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP)); - hrtime_t firetime = 0; + int64_t now, schedule; + unsigned int bucket; tcp_timers_stop(tcp); @@ -146,6 +142,8 @@ tcp_time_wait_append(tcp_t *tcp) ASSERT(tcp->tcp_ack_tid == 0); /* must have happened at the time of detaching the tcp */ + ASSERT(TCP_IS_DETACHED(tcp)); + ASSERT(tcp->tcp_state == TCPS_TIME_WAIT); ASSERT(tcp->tcp_ptpahn == NULL); ASSERT(tcp->tcp_flow_stopped == 0); ASSERT(tcp->tcp_time_wait_next == NULL); @@ -153,97 +151,112 @@ tcp_time_wait_append(tcp_t *tcp) ASSERT(tcp->tcp_time_wait_expire == 0); ASSERT(tcp->tcp_listener == NULL); - tcp->tcp_time_wait_expire = ddi_get_lbolt64(); - if (IS_LOCAL_HOST(tcp)) { - /* - * This is the fastpath for handling localhost connections. - * Since we don't have to worry about packets on the localhost - * showing up after a long network delay, we want to expire - * these quickly so the port range on the localhost doesn't - * get starved by short-running, local apps. - * - * Leave tcp_time_wait_expire at the current time. This - * essentially means the connection is expired now and it will - * clean up the next time tcp_time_wait_collector runs. We set - * firetime to use a short delay so that if we have to start a - * tcp_time_wait_collector thread below, it runs soon instead - * of after a delay of time_wait_interval. firetime being set - * to a non-0 value is also our indicator that we should add - * this connection to the head of the time wait list (since we - * are already expired) so that its sure to get cleaned up on - * the next run of tcp_time_wait_collector (which expects the - * entries to appear in time-order and stops when it hits the - * first non-expired entry). - */ - firetime = TCP_TIME_WAIT_DELAY; - } else { - /* - * Since tcp_time_wait_expire is lbolt64, it should not wrap - * around in practice. Hence it cannot be 0. Note that zero - * means that the tcp_t is not in the TIME_WAIT list. - */ - tcp->tcp_time_wait_expire += MSEC_TO_TICK( - tcps->tcps_time_wait_interval); + TCP_DBGSTAT(tcps, tcp_time_wait); + mutex_enter(&tsp->tcp_time_wait_lock); + + /* + * Immediately expire loopback connections. Since there is no worry + * about packets on the local host showing up after a long network + * delay, this is safe and allows much higher rates of connection churn + * for applications operating locally. + * + * This typically bypasses the tcp_free_list fast path due to squeue + * re-entry for the loopback close operation. + */ + if (tcp->tcp_loopback) { + tcp_time_wait_purge(tcp, tsp); + mutex_exit(&tsp->tcp_time_wait_lock); + return; } - ASSERT(TCP_IS_DETACHED(tcp)); - ASSERT(tcp->tcp_state == TCPS_TIME_WAIT); - ASSERT(tcp->tcp_time_wait_next == NULL); - ASSERT(tcp->tcp_time_wait_prev == NULL); - TCP_DBGSTAT(tcps, tcp_time_wait); + /* + * In order to reap TIME_WAITs reliably, we should use a source of time + * that is not adjustable by the user. While it would be more accurate + * to grab this timestamp before (potentially) sleeping on the + * tcp_time_wait_lock, doing so complicates bucket addressing later. 
+ */ + now = ddi_get_lbolt64(); - mutex_enter(&tcp_time_wait->tcp_time_wait_lock); - if (tcp_time_wait->tcp_time_wait_head == NULL) { - ASSERT(tcp_time_wait->tcp_time_wait_tail == NULL); - tcp_time_wait->tcp_time_wait_head = tcp; + /* + * Each squeue uses an arbitrary time offset when scheduling + * expiration timers. This prevents the bucketing from forcing + * tcp_time_wait_collector to run in locksetup across squeues. + * + * This offset is (re)initialized when a new TIME_WAIT connection is + * added to an squeue which has no connections waiting to expire. + */ + if (tsp->tcp_time_wait_tid == 0) { + ASSERT(tsp->tcp_time_wait_cnt == 0); + tsp->tcp_time_wait_offset = + now % MSEC_TO_TICK(TCP_TIME_WAIT_DELAY); + } + now -= tsp->tcp_time_wait_offset; + + /* + * Use the netstack-defined timeout, rounded up to the minimum + * time_wait_collector interval. + */ + schedule = now + MSEC_TO_TICK(tcps->tcps_time_wait_interval); + tcp->tcp_time_wait_expire = schedule; + + /* + * Append the connection into the appropriate bucket. + */ + bucket = TW_BUCKET(tcp->tcp_time_wait_expire); + tcp->tcp_time_wait_next = tsp->tcp_time_wait_bucket[bucket]; + tsp->tcp_time_wait_bucket[bucket] = tcp; + if (tcp->tcp_time_wait_next != NULL) { + ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == NULL); + tcp->tcp_time_wait_next->tcp_time_wait_prev = tcp; + } + tsp->tcp_time_wait_cnt++; + + /* + * Round delay up to the nearest bucket boundary. + */ + schedule += MSEC_TO_TICK(TCP_TIME_WAIT_DELAY); + schedule -= schedule % MSEC_TO_TICK(TCP_TIME_WAIT_DELAY); + + /* + * The newly inserted entry may require a tighter schedule for the + * expiration timer. + */ + if (schedule < tsp->tcp_time_wait_schedule) { + callout_id_t old_tid = tsp->tcp_time_wait_tid; + + tsp->tcp_time_wait_schedule = schedule; + tsp->tcp_time_wait_tid = + timeout_generic(CALLOUT_NORMAL, + tcp_time_wait_collector, sqp, + TICK_TO_NSEC(schedule - now), + CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP); /* - * Even if the list was empty before, there may be a timer - * running since a tcp_t can be removed from the list - * in other places, such as tcp_clean_death(). So check if - * a timer is needed. - */ - if (tcp_time_wait->tcp_time_wait_tid == 0) { - if (firetime == 0) - firetime = (hrtime_t) - (tcps->tcps_time_wait_interval + 1) * - MICROSEC; - - tcp_time_wait->tcp_time_wait_tid = - timeout_generic(CALLOUT_NORMAL, - tcp_time_wait_collector, sqp, firetime, - CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP); - } - tcp_time_wait->tcp_time_wait_tail = tcp; - } else { - /* - * The list is not empty, so a timer must be running. If not, - * tcp_time_wait_collector() must be running on this - * tcp_time_wait list at the same time. + * It is possible for the timer to fire before the untimeout + * action is able to complete. In that case, the exclusion + * offered by the tcp_time_wait_collector_active flag will + * prevent multiple collector threads from processing records + * simultaneously from the same squeue. 
*/ - ASSERT(tcp_time_wait->tcp_time_wait_tid != 0 || - tcp_time_wait->tcp_time_wait_running); - ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL); - ASSERT(tcp_time_wait->tcp_time_wait_tail->tcp_state == - TCPS_TIME_WAIT); - - if (firetime == 0) { - /* add at end */ - tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = - tcp; - tcp->tcp_time_wait_prev = - tcp_time_wait->tcp_time_wait_tail; - tcp_time_wait->tcp_time_wait_tail = tcp; - } else { - /* add at head */ - tcp->tcp_time_wait_next = - tcp_time_wait->tcp_time_wait_head; - tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev = - tcp; - tcp_time_wait->tcp_time_wait_head = tcp; - } + mutex_exit(&tsp->tcp_time_wait_lock); + (void) untimeout_default(old_tid, 0); + return; + } + + /* + * Start a fresh timer if none exists. + */ + if (tsp->tcp_time_wait_schedule == 0) { + ASSERT(tsp->tcp_time_wait_tid == 0); + + tsp->tcp_time_wait_schedule = schedule; + tsp->tcp_time_wait_tid = + timeout_generic(CALLOUT_NORMAL, + tcp_time_wait_collector, sqp, + TICK_TO_NSEC(schedule - now), + CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP); } - mutex_exit(&tcp_time_wait->tcp_time_wait_lock); + mutex_exit(&tsp->tcp_time_wait_lock); } /* @@ -278,216 +291,287 @@ tcp_timewait_close(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy) tcp_close_detached(tcp); } + +static void +tcp_time_wait_purge(tcp_t *tcp, tcp_squeue_priv_t *tsp) +{ + mblk_t *mp; + conn_t *connp = tcp->tcp_connp; + kmutex_t *lock; + + ASSERT(MUTEX_HELD(&tsp->tcp_time_wait_lock)); + ASSERT(connp->conn_fanout != NULL); + + lock = &connp->conn_fanout->connf_lock; + + /* + * This is essentially a TIME_WAIT reclaim fast path optimization for + * performance where the connection is checked under the fanout lock + * (so that no one else can get access to the conn_t) that the refcnt + * is 2 (one each for TCP and the classifier hash list). That is the + * case and clustering callbacks are not enabled, the conn can be + * removed under the fanout lock and avoid clean-up under the squeue. + * + * This optimization is forgone when clustering is enabled since the + * clustering callback must be made before setting the CONDEMNED flag + * and after dropping all locks + * + * See the comments in tcp_closei_local for additional information + * regarding the refcnt logic. + */ + if (mutex_tryenter(lock)) { + mutex_enter(&connp->conn_lock); + if (connp->conn_ref == 2 && cl_inet_disconnect == NULL) { + ipcl_hash_remove_locked(connp, connp->conn_fanout); + /* + * Set the CONDEMNED flag now itself so that the refcnt + * cannot increase due to any walker. + */ + connp->conn_state_flags |= CONN_CONDEMNED; + mutex_exit(&connp->conn_lock); + mutex_exit(lock); + if (tsp->tcp_free_list_cnt < tcp_free_list_max_cnt) { + /* + * Add to head of tcp_free_list + */ + tcp_cleanup(tcp); + ASSERT(connp->conn_latch == NULL); + ASSERT(connp->conn_policy == NULL); + ASSERT(tcp->tcp_tcps == NULL); + ASSERT(connp->conn_netstack == NULL); + + tcp->tcp_time_wait_next = tsp->tcp_free_list; + tcp->tcp_in_free_list = B_TRUE; + tsp->tcp_free_list = tcp; + tsp->tcp_free_list_cnt++; + } else { + /* + * Do not add to tcp_free_list + */ + tcp_bind_hash_remove(tcp); + ixa_cleanup(tcp->tcp_connp->conn_ixa); + tcp_ipsec_cleanup(tcp); + CONN_DEC_REF(tcp->tcp_connp); + } + + /* + * With the fast-path complete, we can bail. + */ + return; + } else { + /* + * Fall back to slow path. 
+ */ + CONN_INC_REF_LOCKED(connp); + mutex_exit(&connp->conn_lock); + mutex_exit(lock); + } + } else { + CONN_INC_REF(connp); + } + + /* + * We can reuse the closemp here since conn has detached (otherwise we + * wouldn't even be in time_wait list). It is safe to change + * tcp_closemp_used without taking a lock as no other thread can + * concurrently access it at this point in the connection lifecycle. + */ + if (tcp->tcp_closemp.b_prev == NULL) { + tcp->tcp_closemp_used = B_TRUE; + } else { + cmn_err(CE_PANIC, + "tcp_timewait_collector: concurrent use of tcp_closemp: " + "connp %p tcp %p\n", (void *)connp, (void *)tcp); + } + + TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15); + mp = &tcp->tcp_closemp; + mutex_exit(&tsp->tcp_time_wait_lock); + SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_timewait_close, connp, NULL, + SQ_FILL, SQTAG_TCP_TIMEWAIT); + mutex_enter(&tsp->tcp_time_wait_lock); +} + /* - * Blows away all tcps whose TIME_WAIT has expired. List traversal - * is done forwards from the head. - * This walks all stack instances since - * tcp_time_wait remains global across all stacks. + * Purge any tcp_t instances associated with this squeue which have expired + * from the TIME_WAIT state. */ -/* ARGSUSED */ void tcp_time_wait_collector(void *arg) { tcp_t *tcp; - int64_t now; - mblk_t *mp; - conn_t *connp; - kmutex_t *lock; - boolean_t removed; - extern void (*cl_inet_disconnect)(netstackid_t, uint8_t, sa_family_t, - uint8_t *, in_port_t, uint8_t *, in_port_t, void *); + int64_t now, active_schedule, new_schedule; + unsigned int idx; squeue_t *sqp = (squeue_t *)arg; - tcp_squeue_priv_t *tcp_time_wait = + tcp_squeue_priv_t *tsp = *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP)); - mutex_enter(&tcp_time_wait->tcp_time_wait_lock); - tcp_time_wait->tcp_time_wait_tid = 0; -#ifdef DEBUG - tcp_time_wait->tcp_time_wait_running = B_TRUE; -#endif + mutex_enter(&tsp->tcp_time_wait_lock); + + /* + * Because of timer scheduling complexity and the fact that the + * tcp_time_wait_lock is dropped during tcp_time_wait_purge, it is + * possible for multiple tcp_time_wait_collector threads to run against + * the same squeue. This flag is used to exclude other collectors from + * the squeue during execution. + */ + if (tsp->tcp_time_wait_collector_active) { + mutex_exit(&tsp->tcp_time_wait_lock); + return; + } + tsp->tcp_time_wait_collector_active = B_TRUE; - if (tcp_time_wait->tcp_free_list != NULL && - tcp_time_wait->tcp_free_list->tcp_in_free_list == B_TRUE) { + /* + * Purge the free list if necessary + */ + if (tsp->tcp_free_list != NULL) { TCP_G_STAT(tcp_freelist_cleanup); - while ((tcp = tcp_time_wait->tcp_free_list) != NULL) { - tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next; + while ((tcp = tsp->tcp_free_list) != NULL) { + tsp->tcp_free_list = tcp->tcp_time_wait_next; tcp->tcp_time_wait_next = NULL; - tcp_time_wait->tcp_free_list_cnt--; + tsp->tcp_free_list_cnt--; ASSERT(tcp->tcp_tcps == NULL); CONN_DEC_REF(tcp->tcp_connp); } - ASSERT(tcp_time_wait->tcp_free_list_cnt == 0); + ASSERT(tsp->tcp_free_list_cnt == 0); } /* - * In order to reap time waits reliably, we should use a - * source of time that is not adjustable by the user -- hence - * the call to ddi_get_lbolt64(). + * If there are no connections pending, clear timer-related state to be + * reinitialized by the next caller. 
*/ - now = ddi_get_lbolt64(); - while ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL) { + if (tsp->tcp_time_wait_cnt == 0) { + tsp->tcp_time_wait_offset = 0; + tsp->tcp_time_wait_schedule = 0; + tsp->tcp_time_wait_tid = 0; + tsp->tcp_time_wait_collector_active = B_FALSE; + mutex_exit(&tsp->tcp_time_wait_lock); + return; + } + + /* + * Grab the bucket which we were scheduled to cleanse. + */ + active_schedule = tsp->tcp_time_wait_schedule; + idx = TW_BUCKET(active_schedule - 1); + now = ddi_get_lbolt64() - tsp->tcp_time_wait_offset; +retry: + tcp = tsp->tcp_time_wait_bucket[idx]; + + while (tcp != NULL) { /* - * lbolt64 should not wrap around in practice... So we can - * do a direct comparison. + * Since the bucket count is sized to prevent wrap-around + * during typical operation and timers are schedule to process + * buckets with only expired connections, there is only one + * reason to encounter a connection expiring in the future: + * The tcp_time_wait_collector thread has been so delayed in + * its processing that connections have wrapped around the + * timing wheel into this bucket. + * + * In that case, the remaining entires in the bucket can be + * ignored since, being appended sequentially, they should all + * expire in the future. */ - if (now < tcp->tcp_time_wait_expire) + if (now < tcp->tcp_time_wait_expire) { break; + } - removed = tcp_time_wait_remove(tcp, tcp_time_wait); - ASSERT(removed); + /* + * Pull the connection out of the bucket. + */ + VERIFY(tcp_time_wait_remove(tcp, tsp)); - connp = tcp->tcp_connp; - ASSERT(connp->conn_fanout != NULL); - lock = &connp->conn_fanout->connf_lock; /* - * This is essentially a TW reclaim fast path optimization for - * performance where the timewait collector checks under the - * fanout lock (so that no one else can get access to the - * conn_t) that the refcnt is 2 i.e. one for TCP and one for - * the classifier hash list. If ref count is indeed 2, we can - * just remove the conn under the fanout lock and avoid - * cleaning up the conn under the squeue, provided that - * clustering callbacks are not enabled. If clustering is - * enabled, we need to make the clustering callback before - * setting the CONDEMNED flag and after dropping all locks and - * so we forego this optimization and fall back to the slow - * path. Also please see the comments in tcp_closei_local - * regarding the refcnt logic. + * Purge the connection. * - * Since we are holding the tcp_time_wait_lock, its better - * not to block on the fanout_lock because other connections - * can't add themselves to time_wait list. So we do a - * tryenter instead of mutex_enter. + * While tcp_time_wait_lock will be temporarily dropped as part + * of the process, there is no risk of the timer being + * (re)scheduled while the collector is running since a value + * corresponding to the past is left in tcp_time_wait_schedule. */ - if (mutex_tryenter(lock)) { - mutex_enter(&connp->conn_lock); - if ((connp->conn_ref == 2) && - (cl_inet_disconnect == NULL)) { - ipcl_hash_remove_locked(connp, - connp->conn_fanout); - /* - * Set the CONDEMNED flag now itself so that - * the refcnt cannot increase due to any - * walker. 
- */ - connp->conn_state_flags |= CONN_CONDEMNED; - mutex_exit(lock); - mutex_exit(&connp->conn_lock); - if (tcp_time_wait->tcp_free_list_cnt < - tcp_free_list_max_cnt) { - /* Add to head of tcp_free_list */ - mutex_exit( - &tcp_time_wait->tcp_time_wait_lock); - tcp_cleanup(tcp); - ASSERT(connp->conn_latch == NULL); - ASSERT(connp->conn_policy == NULL); - ASSERT(tcp->tcp_tcps == NULL); - ASSERT(connp->conn_netstack == NULL); - - mutex_enter( - &tcp_time_wait->tcp_time_wait_lock); - tcp->tcp_time_wait_next = - tcp_time_wait->tcp_free_list; - tcp_time_wait->tcp_free_list = tcp; - tcp_time_wait->tcp_free_list_cnt++; - continue; - } else { - /* Do not add to tcp_free_list */ - mutex_exit( - &tcp_time_wait->tcp_time_wait_lock); - tcp_bind_hash_remove(tcp); - ixa_cleanup(tcp->tcp_connp->conn_ixa); - tcp_ipsec_cleanup(tcp); - CONN_DEC_REF(tcp->tcp_connp); - } - } else { - CONN_INC_REF_LOCKED(connp); - mutex_exit(lock); - mutex_exit(&tcp_time_wait->tcp_time_wait_lock); - mutex_exit(&connp->conn_lock); - /* - * We can reuse the closemp here since conn has - * detached (otherwise we wouldn't even be in - * time_wait list). tcp_closemp_used can safely - * be changed without taking a lock as no other - * thread can concurrently access it at this - * point in the connection lifecycle. - */ + tcp_time_wait_purge(tcp, tsp); - if (tcp->tcp_closemp.b_prev == NULL) - tcp->tcp_closemp_used = B_TRUE; - else - cmn_err(CE_PANIC, - "tcp_timewait_collector: " - "concurrent use of tcp_closemp: " - "connp %p tcp %p\n", (void *)connp, - (void *)tcp); - - TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15); - mp = &tcp->tcp_closemp; - SQUEUE_ENTER_ONE(connp->conn_sqp, mp, - tcp_timewait_close, connp, NULL, - SQ_FILL, SQTAG_TCP_TIMEWAIT); - } - } else { - mutex_enter(&connp->conn_lock); - CONN_INC_REF_LOCKED(connp); - mutex_exit(&tcp_time_wait->tcp_time_wait_lock); - mutex_exit(&connp->conn_lock); - /* - * We can reuse the closemp here since conn has - * detached (otherwise we wouldn't even be in - * time_wait list). tcp_closemp_used can safely - * be changed without taking a lock as no other - * thread can concurrently access it at this - * point in the connection lifecycle. - */ + /* + * Because tcp_time_wait_remove clears the tcp_time_wait_next + * field, the next item must be grabbed directly from the + * bucket itself. + */ + tcp = tsp->tcp_time_wait_bucket[idx]; + } + + if (tsp->tcp_time_wait_cnt == 0) { + /* + * There is not a need for the collector to schedule a new + * timer if no pending items remain. The timer state can be + * cleared only if it was untouched while the collector dropped + * its locks during tcp_time_wait_purge. + */ + if (tsp->tcp_time_wait_schedule == active_schedule) { + tsp->tcp_time_wait_offset = 0; + tsp->tcp_time_wait_schedule = 0; + tsp->tcp_time_wait_tid = 0; + } + tsp->tcp_time_wait_collector_active = B_FALSE; + mutex_exit(&tsp->tcp_time_wait_lock); + return; + } else { + unsigned int nidx; - if (tcp->tcp_closemp.b_prev == NULL) - tcp->tcp_closemp_used = B_TRUE; - else - cmn_err(CE_PANIC, "tcp_timewait_collector: " - "concurrent use of tcp_closemp: " - "connp %p tcp %p\n", (void *)connp, - (void *)tcp); - - TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15); - mp = &tcp->tcp_closemp; - SQUEUE_ENTER_ONE(connp->conn_sqp, mp, - tcp_timewait_close, connp, NULL, - SQ_FILL, SQTAG_TCP_TIMEWAIT); + /* + * Locate the next bucket containing entries. 
+ */ + new_schedule = active_schedule + + MSEC_TO_TICK(TCP_TIME_WAIT_DELAY); + nidx = TW_BUCKET_NEXT(idx); + while (tsp->tcp_time_wait_bucket[nidx] == NULL) { + if (nidx == idx) { + break; + } + nidx = TW_BUCKET_NEXT(nidx); + new_schedule += MSEC_TO_TICK(TCP_TIME_WAIT_DELAY); } - mutex_enter(&tcp_time_wait->tcp_time_wait_lock); + ASSERT(tsp->tcp_time_wait_bucket[nidx] != NULL); } - if (tcp_time_wait->tcp_free_list != NULL) - tcp_time_wait->tcp_free_list->tcp_in_free_list = B_TRUE; + /* + * It is possible that the system is under such dire load that between + * the timer scheduling and TIME_WAIT processing delay, execution + * overran the interval allocated to this bucket. + */ + now = ddi_get_lbolt64() - tsp->tcp_time_wait_offset; + if (new_schedule <= now) { + /* + * Attempt to right the situation by immediately performing a + * purge on the next bucket. This loop will continue as needed + * until the schedule can be pushed out ahead of the clock. + */ + idx = TW_BUCKET(new_schedule - 1); + goto retry; + } /* - * If the time wait list is not empty and there is no timer running, - * restart it. + * Another thread may have snuck in to reschedule the timer while locks + * were dropped during tcp_time_wait_purge. Defer to the running timer + * if that is the case. */ - if ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL && - tcp_time_wait->tcp_time_wait_tid == 0) { - hrtime_t firetime; - - /* shouldn't be necessary, but just in case */ - if (tcp->tcp_time_wait_expire < now) - tcp->tcp_time_wait_expire = now; - - firetime = TICK_TO_NSEC(tcp->tcp_time_wait_expire - now); - /* This ensures that we won't wake up too often. */ - firetime = MAX(TCP_TIME_WAIT_DELAY, firetime); - tcp_time_wait->tcp_time_wait_tid = - timeout_generic(CALLOUT_NORMAL, tcp_time_wait_collector, - sqp, firetime, CALLOUT_TCP_RESOLUTION, - CALLOUT_FLAG_ROUNDUP); + if (tsp->tcp_time_wait_schedule != active_schedule) { + tsp->tcp_time_wait_collector_active = B_FALSE; + mutex_exit(&tsp->tcp_time_wait_lock); + return; } -#ifdef DEBUG - tcp_time_wait->tcp_time_wait_running = B_FALSE; -#endif - mutex_exit(&tcp_time_wait->tcp_time_wait_lock); + + /* + * Schedule the next timer. + */ + tsp->tcp_time_wait_schedule = new_schedule; + tsp->tcp_time_wait_tid = + timeout_generic(CALLOUT_NORMAL, + tcp_time_wait_collector, sqp, + TICK_TO_NSEC(new_schedule - now), + CALLOUT_TCP_RESOLUTION, CALLOUT_FLAG_ROUNDUP); + tsp->tcp_time_wait_collector_active = B_FALSE; + mutex_exit(&tsp->tcp_time_wait_lock); } /* diff --git a/usr/src/uts/common/inet/tcp/tcp_tunables.c b/usr/src/uts/common/inet/tcp/tcp_tunables.c index be75f1f663..f4d6c71914 100644 --- a/usr/src/uts/common/inet/tcp/tcp_tunables.c +++ b/usr/src/uts/common/inet/tcp/tcp_tunables.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, Joyent Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. * Copyright 2013 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. 
*/ @@ -249,7 +249,7 @@ mod_prop_info_t tcp_propinfo_tbl[] = { /* tunable - 0 */ { "_time_wait_interval", MOD_PROTO_TCP, mod_set_uint32, mod_get_uint32, - {1*SECONDS, 10*MINUTES, 1*MINUTES}, {1*MINUTES} }, + {1*SECONDS, TCP_TIME_WAIT_MAX, 1*MINUTES}, {1*MINUTES} }, { "_conn_req_max_q", MOD_PROTO_TCP, mod_set_uint32, mod_get_uint32, @@ -307,7 +307,7 @@ mod_prop_info_t tcp_propinfo_tbl[] = { { "_keepalive_interval", MOD_PROTO_TCP, mod_set_uint32, mod_get_uint32, - {10*SECONDS, 10*DAYS, 2*HOURS}, {2*HOURS} }, + {1*SECONDS, 10*DAYS, 2*HOURS}, {2*HOURS} }, { "_maxpsz_multiplier", MOD_PROTO_TCP, mod_set_uint32, mod_get_uint32, diff --git a/usr/src/uts/common/inet/tcp_impl.h b/usr/src/uts/common/inet/tcp_impl.h index 0f0f915a2b..cb83b91fad 100644 --- a/usr/src/uts/common/inet/tcp_impl.h +++ b/usr/src/uts/common/inet/tcp_impl.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, Joyent Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. * Copyright (c) 2013, OmniTI Computer Consulting, Inc. All rights reserved. * Copyright (c) 2013, 2014 by Delphix. All rights reserved. */ @@ -61,9 +61,9 @@ extern sock_downcalls_t sock_tcp_downcalls; * by setting it to 0. */ #define TCP_XMIT_LOWATER 4096 -#define TCP_XMIT_HIWATER 49152 +#define TCP_XMIT_HIWATER 128000 #define TCP_RECV_LOWATER 2048 -#define TCP_RECV_HIWATER 128000 +#define TCP_RECV_HIWATER 1048576 /* * Bind hash list size and has function. It has to be a power of 2 for @@ -105,7 +105,7 @@ extern sock_downcalls_t sock_tcp_downcalls; */ #define TCP_IS_DETACHED(tcp) ((tcp)->tcp_detached) -/* TCP timers related data strucutres. Refer to tcp_timers.c. */ +/* TCP timers related data structures. Refer to tcp_timers.c. */ typedef struct tcp_timer_s { conn_t *connp; void (*tcpt_proc)(void *); @@ -132,48 +132,79 @@ extern kmem_cache_t *tcp_timercache; (tcp)->tcp_timer_tid = TCP_TIMER((tcp), tcp_timer, (intvl)); \ } + +/* + * Maximum TIME_WAIT timeout. It is defined here (instead of tcp_tunables.c) + * so that other parameters can be derived from it. + */ +#define TCP_TIME_WAIT_MAX (10 * MINUTES) + +/* + * TCP_TIME_WAIT_DELAY governs how often the time_wait_collector runs. + * Running it every 5 seconds seems to yield a reasonable balance between + * cleanup liveliness and system load. + */ +#define TCP_TIME_WAIT_DELAY (5 * SECONDS) + +#define TCP_TIME_WAIT_BUCKETS ((TCP_TIME_WAIT_MAX / TCP_TIME_WAIT_DELAY) + 1) + /* * For scalability, we must not run a timer for every TCP connection * in TIME_WAIT state. To see why, consider (for time wait interval of * 1 minutes): * 10,000 connections/sec * 60 seconds/time wait = 600,000 active conn's * - * This list is ordered by time, so you need only delete from the head - * until you get to entries which aren't old enough to delete yet. - * The list consists of only the detached TIME_WAIT connections. + * Since TIME_WAIT expiration occurs on a per-squeue basis, handling + * connections from all netstacks on the system, a simple queue is inadequate + * for pending entries. This is because tcp_time_wait_interval may differ + * between connections, causing tail insertion to violate expiration order. + * + * Instead of performing expensive sorting or unnecessary list traversal to + * counteract interval variance between netstacks, a timing wheel structure is + * used. The duration covered by each bucket in the wheel is determined by the + * TCP_TIME_WAIT_DELAY (5 seconds). 
The number of buckets in the wheel is + * determined by dividing the maximum TIME_WAIT interval (10 minutes) by + * TCP_TIME_WAIT_DELAY, with one added bucket for rollover protection. + * (Yielding 121 buckets with the current parameters) When items are inserted + * into the set of buckets, they are indexed by using their expiration time + * divided by the bucket size, modulo the number of buckets. This means that + * when each bucket is processed, all items within should have expired within + * the last TCP_TIME_WAIT_DELAY interval. + * + * Since bucket timer schedules are rounded to the nearest TCP_TIME_WAIT_DELAY + * interval to ensure all connections in the pending bucket will be expired, a + * per-squeue offset is used when doing TIME_WAIT scheduling. This offset is + * between 0 and the TCP_TIME_WAIT_DELAY and is designed to avoid scheduling + * all of the tcp_time_wait_collector threads to run in lock-step. The offset + * is fixed while there are any connections present in the buckets. * * When a tcp_t enters TIME_WAIT state, a timer is started (timeout is * tcps_time_wait_interval). When the tcp_t is detached (upper layer closes - * the end point), it is moved to the time wait list and another timer is - * started (expiry time is set at tcp_time_wait_expire, which is - * also calculated using tcps_time_wait_interval). This means that the - * TIME_WAIT state can be extended (up to doubled) if the tcp_t doesn't - * become detached for a long time. + * the end point), it is scheduled to be cleaned up by the squeue-driving + * tcp_time_wait_collector (also using tcps_time_wait_interval). This means + * that the TIME_WAIT state can be extended (up to doubled) if the tcp_t + * doesn't become detached for a long time. * * The list manipulations (including tcp_time_wait_next/prev) * are protected by the tcp_time_wait_lock. The content of the * detached TIME_WAIT connections is protected by the normal perimeters. * - * This list is per squeue and squeues are shared across the tcp_stack_t's. - * Things on tcp_time_wait_head remain associated with the tcp_stack_t - * and conn_netstack. - * The tcp_t's that are added to tcp_free_list are disassociated and - * have NULL tcp_tcps and conn_netstack pointers. + * These connection lists are per squeue and squeues are shared across the + * tcp_stack_t instances. Things in a tcp_time_wait_bucket remain associated + * with the tcp_stack_t and conn_netstack. Any tcp_t connections stored in the + * tcp_free_list are disassociated and have NULL tcp_tcps and conn_netstack + * pointers. */ typedef struct tcp_squeue_priv_s { kmutex_t tcp_time_wait_lock; + boolean_t tcp_time_wait_collector_active; callout_id_t tcp_time_wait_tid; - tcp_t *tcp_time_wait_head; - tcp_t *tcp_time_wait_tail; + uint64_t tcp_time_wait_cnt; + int64_t tcp_time_wait_schedule; + int64_t tcp_time_wait_offset; + tcp_t *tcp_time_wait_bucket[TCP_TIME_WAIT_BUCKETS]; tcp_t *tcp_free_list; uint_t tcp_free_list_cnt; -#ifdef DEBUG - /* - * For debugging purpose, true when tcp_time_wait_collector() is - * running. - */ - boolean_t tcp_time_wait_running; -#endif } tcp_squeue_priv_t; /* @@ -375,6 +406,22 @@ typedef struct tcp_listen_cnt_s { uint32_t tlc_drop; } tcp_listen_cnt_t; +/* + * Track tcp_t entities bound to the same port/address tuple via SO_REUSEPORT. 
+ * - tcprg_lock: Protects the other fields + * - tcprg_size: Allocated size (in entries) of tcprg_members array + * - tcprg_count: Count of occupied tcprg_members slots + * - tcprg_active: Count of members which still have SO_REUSEPORT set + * - tcprg_members: Connections associated with address/port group + */ +typedef struct tcp_rg_s { + kmutex_t tcprg_lock; + unsigned int tcprg_size; + unsigned int tcprg_count; + unsigned int tcprg_active; + tcp_t **tcprg_members; +} tcp_rg_t; + #define TCP_TLC_REPORT_INTERVAL (30 * MINUTES) #define TCP_DECR_LISTEN_CNT(tcp) \ @@ -618,6 +665,10 @@ extern in_port_t tcp_bindi(tcp_t *, in_port_t, const in6_addr_t *, int, boolean_t, boolean_t, boolean_t); extern in_port_t tcp_update_next_port(in_port_t, const tcp_t *, boolean_t); +extern tcp_rg_t *tcp_rg_init(tcp_t *); +extern boolean_t tcp_rg_remove(tcp_rg_t *, tcp_t *); +extern void tcp_rg_destroy(tcp_rg_t *); +extern void tcp_rg_setactive(tcp_rg_t *, boolean_t); /* * Fusion related functions in tcp_fusion.c. diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c index 5a15aea4de..a88bac932c 100644 --- a/usr/src/uts/common/inet/udp/udp.c +++ b/usr/src/uts/common/inet/udp/udp.c @@ -22,6 +22,7 @@ * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2013 Nexenta Systems, Inc. All rights reserved. * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -76,7 +77,8 @@ #include <inet/ipclassifier.h> #include <sys/squeue_impl.h> #include <inet/ipnet.h> -#include <sys/ethernet.h> +#include <sys/vxlan.h> +#include <inet/inet_hash.h> #include <sys/tsol/label.h> #include <sys/tsol/tnet.h> @@ -346,6 +348,89 @@ void (*cl_inet_unbind)(netstackid_t stack_id, uint8_t protocol, typedef union T_primitives *t_primp_t; /* + * Various protocols that encapsulate UDP have no real use for the source port. + * Instead, they want to vary the source port to provide better equal-cost + * multipathing and other systems that use fanout. Consider something like + * VXLAN. If you're actually sending multiple different streams to a single + * host, if you don't vary the source port, then the tuple of ( SRC IP, DST IP, + * SRC Port, DST Port) will always be the same. + * + * Here, we return a port to hash this to, if we know how to hash it. If for + * some reason we can't perform an L4 hash, then we just return the default + * value, usually the default port. After we determine the hash we transform it + * so that it's in the range of [ min, max ]. + * + * We'd like to avoid a pull up for the sake of performing the hash. If the + * first mblk_t doesn't have the full protocol header, then we just send it to + * the default. If for some reason we have an encapsulated packet that has its + * protocol header in different parts of an mblk_t, then we'll go with the + * default port. This means that that if a driver isn't consistent about how it + * generates the frames for a given flow, it will not always be consistently + * hashed. That should be an uncommon event. 
+ */ +uint16_t +udp_srcport_hash(mblk_t *mp, int type, uint16_t min, uint16_t max, + uint16_t def) +{ + size_t szused = 0; + struct ether_header *ether; + struct ether_vlan_header *vether; + ip6_t *ip6h; + ipha_t *ipha; + uint16_t sap; + uint64_t hash; + uint32_t mod; + + ASSERT(min <= max); + + if (type != UDP_HASH_VXLAN) + return (def); + + if (!IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t))) + return (def); + + /* + * The following logic is VXLAN specific to get at the header, if we + * have formats, eg. GENEVE, then we should ignore this. + * + * The kernel overlay device often puts a first mblk_t for the data + * which is just the encap. If so, then we're going to use that and try + * to avoid a pull up. + */ + if (MBLKL(mp) == VXLAN_HDR_LEN) { + if (mp->b_cont == NULL) + return (def); + mp = mp->b_cont; + ether = (struct ether_header *)mp->b_rptr; + } else if (MBLKL(mp) < VXLAN_HDR_LEN) { + return (def); + } else { + szused = VXLAN_HDR_LEN; + ether = (struct ether_header *)((uintptr_t)mp->b_rptr + szused); + } + + /* Can we hold a MAC header? */ + if (MBLKL(mp) + szused < sizeof (struct ether_header)) + return (def); + + /* + * We need to lie about the starting offset into the message block for + * convenience. Undo it at the end. We know that inet_pkt_hash() won't + * modify the mblk_t. + */ + mp->b_rptr += szused; + hash = inet_pkt_hash(DL_ETHER, mp, INET_PKT_HASH_L2 | + INET_PKT_HASH_L3 | INET_PKT_HASH_L4); + mp->b_rptr -= szused; + + if (hash == 0) + return (def); + + mod = max - min + 1; + return ((hash % mod) + min); +} + +/* * Return the next anonymous port in the privileged port range for * bind checking. * @@ -1583,6 +1668,16 @@ udp_opt_get(conn_t *connp, t_scalar_t level, t_scalar_t name, *i1 = udp->udp_rcvhdr ? 1 : 0; mutex_exit(&connp->conn_lock); return (sizeof (int)); + case UDP_SRCPORT_HASH: + mutex_enter(&connp->conn_lock); + *i1 = udp->udp_vxlanhash; + mutex_exit(&connp->conn_lock); + return (sizeof (int)); + case UDP_SND_TO_CONNECTED: + mutex_enter(&connp->conn_lock); + *i1 = udp->udp_snd_to_conn ? 1 : 0; + mutex_exit(&connp->conn_lock); + return (sizeof (int)); } } mutex_enter(&connp->conn_lock); @@ -1718,6 +1813,31 @@ udp_do_opt_set(conn_opt_arg_t *coa, int level, int name, udp->udp_rcvhdr = onoff; mutex_exit(&connp->conn_lock); return (0); + case UDP_SRCPORT_HASH: + /* + * This should have already been verified, but double + * check. + */ + if ((error = secpolicy_ip_config(cr, B_FALSE)) != 0) { + return (error); + } + + /* First see if the val is something we understand */ + if (*i1 != UDP_HASH_DISABLE && *i1 != UDP_HASH_VXLAN) + return (EINVAL); + + if (!checkonly) { + mutex_enter(&connp->conn_lock); + udp->udp_vxlanhash = *i1; + mutex_exit(&connp->conn_lock); + } + /* Fully handled this option. */ + return (0); + case UDP_SND_TO_CONNECTED: + mutex_enter(&connp->conn_lock); + udp->udp_snd_to_conn = onoff; + mutex_exit(&connp->conn_lock); + return (0); } break; } @@ -2001,13 +2121,25 @@ udp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp, uint32_t cksum; udp_t *udp = connp->conn_udp; boolean_t insert_spi = udp->udp_nat_t_endpoint; + boolean_t hash_srcport = udp->udp_vxlanhash; uint_t ulp_hdr_len; + uint16_t srcport; data_len = msgdsize(data_mp); ulp_hdr_len = UDPH_SIZE; if (insert_spi) ulp_hdr_len += sizeof (uint32_t); + /* + * If we have source port hashing going on, determine the hash before + * we modify the mblk_t. 
+ */ + if (hash_srcport == B_TRUE) { + srcport = udp_srcport_hash(mp, UDP_HASH_VXLAN, + IPPORT_DYNAMIC_MIN, IPPORT_DYNAMIC_MAX, + ntohs(connp->conn_lport)); + } + mp = conn_prepend_hdr(ixa, ipp, v6src, v6dst, IPPROTO_UDP, flowinfo, ulp_hdr_len, data_mp, data_len, us->us_wroff_extra, &cksum, errorp); if (mp == NULL) { @@ -2019,7 +2151,11 @@ udp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp, ixa->ixa_pktlen = data_len + ixa->ixa_ip_hdr_length; udpha = (udpha_t *)(mp->b_rptr + ixa->ixa_ip_hdr_length); - udpha->uha_src_port = connp->conn_lport; + if (hash_srcport == B_TRUE) { + udpha->uha_src_port = htons(srcport); + } else { + udpha->uha_src_port = connp->conn_lport; + } udpha->uha_dst_port = dstport; udpha->uha_checksum = 0; udpha->uha_length = htons(data_len); @@ -3194,6 +3330,7 @@ udp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp, udp_t *udp = connp->conn_udp; udp_stack_t *us = udp->udp_us; boolean_t insert_spi = udp->udp_nat_t_endpoint; + boolean_t hash_srcport = udp->udp_vxlanhash; uint_t pktlen; uint_t alloclen; uint_t copylen; @@ -3202,10 +3339,21 @@ udp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp, udpha_t *udpha; uint32_t cksum; ip_pkt_t *ipp; + uint16_t srcport; ASSERT(MUTEX_HELD(&connp->conn_lock)); /* + * If we have source port hashing going on, determine the hash before + * we modify the mblk_t. + */ + if (hash_srcport == B_TRUE) { + srcport = udp_srcport_hash(mp, UDP_HASH_VXLAN, + IPPORT_DYNAMIC_MIN, IPPORT_DYNAMIC_MAX, + ntohs(connp->conn_lport)); + } + + /* * Copy the header template and leave space for an SPI */ copylen = connp->conn_ht_iphc_len; @@ -3303,6 +3451,9 @@ udp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp, *((uint32_t *)(udpha + 1)) = 0; udpha->uha_dst_port = dstport; + if (hash_srcport == B_TRUE) + udpha->uha_src_port = htons(srcport); + return (mp); } @@ -5947,10 +6098,18 @@ udp_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, else return (error); } - if (udp->udp_state == TS_DATA_XFER) { + + /* + * Check if we're allowed to send to a connection on which we've + * already called 'connect'. The posix spec. allows both behaviors but + * historically we've returned an error if already connected. The + * client can allow this via a sockopt. + */ + if (udp->udp_state == TS_DATA_XFER && !udp->udp_snd_to_conn) { UDPS_BUMP_MIB(us, udpOutErrors); return (EISCONN); } + error = proto_verify_ip_addr(connp->conn_family, (struct sockaddr *)msg->msg_name, msg->msg_namelen); if (error != 0) { diff --git a/usr/src/uts/common/inet/udp/udp_opt_data.c b/usr/src/uts/common/inet/udp/udp_opt_data.c index c279bb4a21..847e2cdde6 100644 --- a/usr/src/uts/common/inet/udp/udp_opt_data.c +++ b/usr/src/uts/common/inet/udp/udp_opt_data.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015, Joyent, Inc. 
*/ #include <sys/types.h> @@ -292,6 +293,9 @@ opdes_t udp_opt_arr[] = { }, { UDP_NAT_T_ENDPOINT, IPPROTO_UDP, OA_RW, OA_RW, OP_PRIVPORT, 0, sizeof (int), 0 }, +{ UDP_SRCPORT_HASH, IPPROTO_UDP, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), 0 }, +{ UDP_SND_TO_CONNECTED, IPPROTO_UDP, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), + 0 } }; /* diff --git a/usr/src/uts/common/inet/udp_impl.h b/usr/src/uts/common/inet/udp_impl.h index 6a31ce5c22..ebba10c0f7 100644 --- a/usr/src/uts/common/inet/udp_impl.h +++ b/usr/src/uts/common/inet/udp_impl.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. */ #ifndef _UDP_IMPL_H @@ -178,8 +179,12 @@ typedef struct udp_s { udp_issocket : 1, /* socket mode; sockfs is on top */ udp_nat_t_endpoint : 1, /* UDP_NAT_T_ENDPOINT option */ udp_rcvhdr : 1, /* UDP_RCVHDR option */ + udp_vxlanhash: 1, /* UDP_SRCPORT_HASH option */ + /* Because there's only VXLAN, cheat */ + /* and only use a single bit */ + udp_snd_to_conn: 1, /* UDP_SND_TO_CONNECTED option */ - udp_pad_to_bit_31 : 29; + udp_pad_to_bit_31 : 27; /* Following 2 fields protected by the uf_lock */ struct udp_s *udp_bind_hash; /* Bind hash chain */ |
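Taken together, the two new bitfields above back the UDP options added earlier in this diff: udp_vxlanhash records whether UDP_SRCPORT_HASH (a privileged, configuration-level option aimed at encapsulating consumers such as VXLAN) is enabled, and udp_snd_to_conn records UDP_SND_TO_CONNECTED, which lets sendto() be used on an already-connected UDP socket instead of failing with EISCONN. A hedged usage sketch for the latter follows; the assumptions are that the option constant is exported through <netinet/udp.h> on builds carrying this change and that it takes an int boolean like the other IPPROTO_UDP-level entries in udp_opt_arr[].

/*
 * Sketch only: enable UDP_SND_TO_CONNECTED so that a later sendto() on
 * this (connected) socket is not rejected with EISCONN.  Error handling
 * is minimal, and the include path for the option constant is an
 * assumption.
 */
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/udp.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int	fd, on = 1;

	if ((fd = socket(AF_INET, SOCK_DGRAM, 0)) < 0) {
		perror("socket");
		return (1);
	}
	if (setsockopt(fd, IPPROTO_UDP, UDP_SND_TO_CONNECTED, &on,
	    sizeof (on)) != 0)
		perror("setsockopt(UDP_SND_TO_CONNECTED)");

	/* connect(fd, ...) here; sendto(fd, ...) remains usable afterward. */

	(void) close(fd);
	return (0);
}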