Diffstat (limited to 'usr/src/uts/common/inet')
41 files changed, 2753 insertions, 342 deletions
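The changes below add classic BPF socket-filter support (SO_ATTACH_FILTER and SO_DETACH_FILTER) to raw IP/ICMP sockets, SO_REUSEPORT-aware fanout insertion, a DLD "ipcheck" capability replacing the ill_allowed_ips[] address check, a synchronous ill_dl_up() that waits for the bind and capability negotiation, and the ipf "Cloud Firewall" event ring exposed through /dev/ipfev. As a rough consumer-side sketch of the new filtering surface (illustrative only, not part of this change; it assumes a 20-byte IPv4 header with no options ahead of the ICMP message, which matches what icmp_input() hands the filter):

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <net/bpf.h>

/*
 * Attach a filter accepting only ICMP echo requests (type 8) to a raw
 * socket, e.g. one from socket(AF_INET, SOCK_RAW, IPPROTO_ICMP). The
 * filter sees the packet starting at the IPv4 header, so without IP
 * options the ICMP type byte sits at offset 20.
 */
static int
attach_echo_filter(int s)
{
	static struct bpf_insn insns[] = {
		BPF_STMT(BPF_LD | BPF_B | BPF_ABS, 20),
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 8, 0, 1),
		BPF_STMT(BPF_RET | BPF_K, 0xffffffffU),	/* accept */
		BPF_STMT(BPF_RET | BPF_K, 0),		/* drop */
	};
	struct bpf_program prog;

	prog.bf_len = sizeof (insns) / sizeof (insns[0]);
	prog.bf_insns = insns;

	return (setsockopt(s, SOL_SOCKET, SO_ATTACH_FILTER,
	    &prog, sizeof (prog)));
}

A later setsockopt(s, SOL_SOCKET, SO_DETACH_FILTER, NULL, 0) removes the program again.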
diff --git a/usr/src/uts/common/inet/bpf.h b/usr/src/uts/common/inet/bpf.h new file mode 100644 index 0000000000..e3eac799e5 --- /dev/null +++ b/usr/src/uts/common/inet/bpf.h @@ -0,0 +1,49 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#ifndef _INET_BPF_H +#define _INET_BPF_H + +#ifdef __cplusplus +extern "C" { +#endif + + +#ifdef _KERNEL + +#include <sys/types.h> + +/* + * Clone bpf_insn definition so that consumers don't need net/bpf.h to reason + * about struct sizing. + */ +typedef struct ip_bpf_insn { + uint16_t code; + uint8_t jt; + uint8_t jf; + uint32_t k; +} ip_bpf_insn_t; + +extern uint32_t ip_bpf_filter(ip_bpf_insn_t *, uchar_t *, uint_t, uint_t); +extern boolean_t ip_bpf_validate(ip_bpf_insn_t *, uint_t); + + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _INET_BPF_H */ diff --git a/usr/src/uts/common/inet/bpf_filter.c b/usr/src/uts/common/inet/bpf_filter.c new file mode 100644 index 0000000000..5a9ba38da6 --- /dev/null +++ b/usr/src/uts/common/inet/bpf_filter.c @@ -0,0 +1,572 @@ +/* $NetBSD: bpf_filter.c,v 1.35 2008/08/20 13:01:54 joerg Exp $ */ + +/* + * Copyright (c) 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from the Stanford/CMU enet packet filter, + * (net/enet.c) distributed as part of 4.3BSD, and code contributed + * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence + * Berkeley Laboratory. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)bpf_filter.c 8.1 (Berkeley) 6/10/93 + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ * Copyright 2016 Joyent, Inc. + */ + +#include <sys/param.h> +#include <sys/time.h> +#include <sys/stream.h> +#include <sys/byteorder.h> +#include <sys/sdt.h> +#include <inet/bpf.h> +#include <net/bpf.h> + +#define EXTRACT_SHORT(p) BE_IN16(p) +#define EXTRACT_LONG(p) BE_IN32(p) + +#define M_LEN(_m) ((_m)->b_wptr - (_m)->b_rptr) +#define mtod(_a, _t) ((_t)((_a)->b_rptr)) +#define MINDEX(len, m, k) \ +{ \ + len = M_LEN(m); \ + while (k >= len) { \ + k -= len; \ + m = m->b_cont; \ + if (m == 0) \ + return (0); \ + len = M_LEN(m); \ + } \ +} + +static int m_xword(mblk_t *, uint32_t, int *); +static int m_xhalf(mblk_t *, uint32_t, int *); + +static int +m_xword(mblk_t *m, uint32_t k, int *err) +{ + int len; + uchar_t *cp, *np; + mblk_t *m0; + + *err = 1; + MINDEX(len, m, k); + cp = mtod(m, uchar_t *) + k; + if (len >= k + 4) { + *err = 0; + return (EXTRACT_LONG(cp)); + } + m0 = m->b_cont; + if (m0 == 0 || M_LEN(m0) + len - k < 4) { + DTRACE_PROBE3(mblk_xword_fail, mblk_t *, m0, int, len, int, k); + return (0); + } + *err = 0; + np = mtod(m0, uchar_t *); + switch (len - k) { + + case 1: + return ((cp[0] << 24) | (np[0] << 16) | (np[1] << 8) | np[2]); + + case 2: + return ((cp[0] << 24) | (cp[1] << 16) | (np[0] << 8) | np[1]); + + default: + return ((cp[0] << 24) | (cp[1] << 16) | (cp[2] << 8) | np[0]); + } +} + +static int +m_xhalf(mblk_t *m, uint32_t k, int *err) +{ + int len; + uchar_t *cp; + mblk_t *m0; + + *err = 1; + MINDEX(len, m, k); + cp = mtod(m, uchar_t *) + k; + if (len >= k + 2) { + *err = 0; + return (EXTRACT_SHORT(cp)); + } + m0 = m->b_cont; + if (m0 == 0) { + DTRACE_PROBE3(mblk_xhalf_fail, mblk_t *, m0, int, len, int, k); + return (0); + } + *err = 0; + return ((cp[0] << 8) | mtod(m0, uchar_t *)[0]); +} + + +/* + * Execute the filter program starting at pc on the packet p + * wirelen is the length of the original packet + * buflen is the amount of data present + * When buflen is non-0, p is a pointer to the start of the packet and the + * packet is only in one mblk_t. + * When buflen is 0, p is an mblk_t pointer. + */ +uint32_t +ip_bpf_filter(ip_bpf_insn_t *pc, uchar_t *p, uint_t wirelen, uint_t buflen) +{ + uint32_t A, X, k; + uint32_t mem[BPF_MEMWORDS]; + + if (pc == 0) + /* + * No filter means accept all.
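+ * (A BPF program's return value is the number of packet bytes to
+ * accept, 0 meaning drop, so the all-ones value returned here
+ * accepts the entire packet.)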
+ */ + return ((uint32_t)-1); + A = 0; + X = 0; + --pc; + /* CONSTCOND */ + while (1) { + ++pc; + switch (pc->code) { + + default: +#ifdef _KERNEL + DTRACE_PROBE1(bpf_insn_unknown, + struct bpf_insn *, pc); + return (0); +#else + abort(); +#endif + case BPF_RET|BPF_K: + return (pc->k); + + case BPF_RET|BPF_A: + return (A); + + case BPF_LD|BPF_W|BPF_ABS: + k = pc->k; + if (k + sizeof (int32_t) > buflen) { +#ifdef _KERNEL + int merr = 0; + + if (buflen != 0) + return (0); + A = m_xword((mblk_t *)p, k, &merr); + if (merr != 0) + return (0); + continue; +#else + return (0); +#endif + } + A = EXTRACT_LONG(&p[k]); + continue; + + case BPF_LD|BPF_H|BPF_ABS: + k = pc->k; + if (k + sizeof (int16_t) > buflen) { +#ifdef _KERNEL + int merr; + + if (buflen != 0) + return (0); + A = m_xhalf((mblk_t *)p, k, &merr); + if (merr != 0) + return (0); + continue; +#else + return (0); +#endif + } + A = EXTRACT_SHORT(&p[k]); + continue; + + case BPF_LD|BPF_B|BPF_ABS: + k = pc->k; + if (k >= buflen) { +#ifdef _KERNEL + mblk_t *m; + int len; + + if (buflen != 0) + return (0); + m = (mblk_t *)p; + MINDEX(len, m, k); + A = mtod(m, uchar_t *)[k]; + continue; +#else + return (0); +#endif + } + A = p[k]; + continue; + + case BPF_LD|BPF_W|BPF_LEN: + A = wirelen; + continue; + + case BPF_LDX|BPF_W|BPF_LEN: + X = wirelen; + continue; + + case BPF_LD|BPF_W|BPF_IND: + k = X + pc->k; + if (k + sizeof (int32_t) > buflen) { +#ifdef _KERNEL + int merr = 0; + + if (buflen != 0) + return (0); + A = m_xword((mblk_t *)p, k, &merr); + if (merr != 0) + return (0); + continue; +#else + return (0); +#endif + } + A = EXTRACT_LONG(&p[k]); + continue; + + case BPF_LD|BPF_H|BPF_IND: + k = X + pc->k; + if (k + sizeof (int16_t) > buflen) { +#ifdef _KERNEL + int merr = 0; + + if (buflen != 0) + return (0); + A = m_xhalf((mblk_t *)p, k, &merr); + if (merr != 0) + return (0); + continue; +#else + return (0); +#endif + } + A = EXTRACT_SHORT(&p[k]); + continue; + + case BPF_LD|BPF_B|BPF_IND: + k = X + pc->k; + if (k >= buflen) { +#ifdef _KERNEL + mblk_t *m; + int len; + + if (buflen != 0) + return (0); + m = (mblk_t *)p; + MINDEX(len, m, k); + A = mtod(m, uchar_t *)[k]; + continue; +#else + return (0); +#endif + } + A = p[k]; + continue; + + case BPF_LDX|BPF_MSH|BPF_B: + k = pc->k; + if (k >= buflen) { +#ifdef _KERNEL + mblk_t *m; + int len; + + if (buflen != 0) + return (0); + m = (mblk_t *)p; + MINDEX(len, m, k); + X = (mtod(m, char *)[k] & 0xf) << 2; + continue; +#else + return (0); +#endif + } + X = (p[pc->k] & 0xf) << 2; + continue; + + case BPF_LD|BPF_IMM: + A = pc->k; + continue; + + case BPF_LDX|BPF_IMM: + X = pc->k; + continue; + + case BPF_LD|BPF_MEM: + A = mem[pc->k]; + continue; + + case BPF_LDX|BPF_MEM: + X = mem[pc->k]; + continue; + + case BPF_ST: + mem[pc->k] = A; + continue; + + case BPF_STX: + mem[pc->k] = X; + continue; + + case BPF_JMP|BPF_JA: + pc += pc->k; + continue; + + case BPF_JMP|BPF_JGT|BPF_K: + pc += (A > pc->k) ? pc->jt : pc->jf; + continue; + + case BPF_JMP|BPF_JGE|BPF_K: + pc += (A >= pc->k) ? pc->jt : pc->jf; + continue; + + case BPF_JMP|BPF_JEQ|BPF_K: + pc += (A == pc->k) ? pc->jt : pc->jf; + continue; + + case BPF_JMP|BPF_JSET|BPF_K: + pc += (A & pc->k) ? pc->jt : pc->jf; + continue; + + case BPF_JMP|BPF_JGT|BPF_X: + pc += (A > X) ? pc->jt : pc->jf; + continue; + + case BPF_JMP|BPF_JGE|BPF_X: + pc += (A >= X) ? pc->jt : pc->jf; + continue; + + case BPF_JMP|BPF_JEQ|BPF_X: + pc += (A == X) ? pc->jt : pc->jf; + continue; + + case BPF_JMP|BPF_JSET|BPF_X: + pc += (A & X) ? 
pc->jt : pc->jf; + continue; + + case BPF_ALU|BPF_ADD|BPF_X: + A += X; + continue; + + case BPF_ALU|BPF_SUB|BPF_X: + A -= X; + continue; + + case BPF_ALU|BPF_MUL|BPF_X: + A *= X; + continue; + + case BPF_ALU|BPF_DIV|BPF_X: + if (X == 0) + return (0); + A /= X; + continue; + + case BPF_ALU|BPF_AND|BPF_X: + A &= X; + continue; + + case BPF_ALU|BPF_OR|BPF_X: + A |= X; + continue; + + case BPF_ALU|BPF_LSH|BPF_X: + A <<= X; + continue; + + case BPF_ALU|BPF_RSH|BPF_X: + A >>= X; + continue; + + case BPF_ALU|BPF_ADD|BPF_K: + A += pc->k; + continue; + + case BPF_ALU|BPF_SUB|BPF_K: + A -= pc->k; + continue; + + case BPF_ALU|BPF_MUL|BPF_K: + A *= pc->k; + continue; + + case BPF_ALU|BPF_DIV|BPF_K: + A /= pc->k; + continue; + + case BPF_ALU|BPF_AND|BPF_K: + A &= pc->k; + continue; + + case BPF_ALU|BPF_OR|BPF_K: + A |= pc->k; + continue; + + case BPF_ALU|BPF_LSH|BPF_K: + A <<= pc->k; + continue; + + case BPF_ALU|BPF_RSH|BPF_K: + A >>= pc->k; + continue; + + case BPF_ALU|BPF_NEG: + A = -A; + continue; + + case BPF_MISC|BPF_TAX: + X = A; + continue; + + case BPF_MISC|BPF_TXA: + A = X; + continue; + } + } + /* NOTREACHED */ +} + +/* + * Return true if the 'fcode' is a valid filter program. + * The constraints are that each jump be forward and to a valid + * code, that memory accesses are within valid ranges (to the + * extent that this can be checked statically; loads of packet + * data have to be, and are, also checked at run time), and that + * the code terminates with either an accept or reject. + * + * The kernel needs to be able to verify an application's filter code. + * Otherwise, a bogus program could easily crash the system. + */ +boolean_t +ip_bpf_validate(ip_bpf_insn_t *f, uint_t len) +{ + uint_t i, from; + ip_bpf_insn_t *p; + + if (len < 1 || len > BPF_MAXINSNS) + return (B_FALSE); + + for (i = 0; i < len; ++i) { + p = &f[i]; + DTRACE_PROBE1(bpf_valid_insn, struct bpf_insn *, p); + switch (BPF_CLASS(p->code)) { + /* + * Check that memory operations use valid addresses. + */ + case BPF_LD: + case BPF_LDX: + switch (BPF_MODE(p->code)) { + case BPF_MEM: + if (p->k >= BPF_MEMWORDS) + return (B_FALSE); + break; + case BPF_ABS: + case BPF_IND: + case BPF_MSH: + case BPF_IMM: + case BPF_LEN: + break; + default: + return (B_FALSE); + } + break; + case BPF_ST: + case BPF_STX: + if (p->k >= BPF_MEMWORDS) + return (B_FALSE); + break; + case BPF_ALU: + switch (BPF_OP(p->code)) { + case BPF_ADD: + case BPF_SUB: + case BPF_MUL: + case BPF_OR: + case BPF_AND: + case BPF_LSH: + case BPF_RSH: + case BPF_NEG: + break; + case BPF_DIV: + /* + * Check for constant division by 0. + */ + if (BPF_RVAL(p->code) == BPF_K && p->k == 0) + return (B_FALSE); + break; + default: + return (B_FALSE); + } + break; + case BPF_JMP: + /* + * Check that jumps are within the code block, + * and that unconditional branches don't go + * backwards as a result of an overflow. + * Unconditional branches have a 32-bit offset, + * so they could overflow; we check to make + * sure they don't. Conditional branches have + * an 8-bit offset, and the from address is <= + * BPF_MAXINSNS, and we assume that BPF_MAXINSNS + * is sufficiently small that adding 255 to it + * won't overflow. + * + * We know that len is <= BPF_MAXINSNS, and we + * assume that BPF_MAXINSNS is < the maximum size + * of a uint_t, so that i + 1 doesn't overflow. 
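+ *
+ * For example, with len == 3 a BPF_JA at i == 0 (from == 1) may
+ * use k == 0 or k == 1 (targeting instructions 1 and 2); k == 2
+ * would point past the end of the program and is rejected.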
+ */ + from = i + 1; + switch (BPF_OP(p->code)) { + case BPF_JA: + if (from + p->k < from || from + p->k >= len) + return (B_FALSE); + break; + case BPF_JEQ: + case BPF_JGT: + case BPF_JGE: + case BPF_JSET: + if (from + p->jt >= len || from + p->jf >= len) + return (B_FALSE); + break; + default: + return (B_FALSE); + } + break; + case BPF_RET: + break; + case BPF_MISC: + break; + default: + return (B_FALSE); + } + } + + return (BPF_CLASS(f[len - 1].code) == BPF_RET); +} diff --git a/usr/src/uts/common/inet/ip.h b/usr/src/uts/common/inet/ip.h index c081c44a04..ebf2574363 100644 --- a/usr/src/uts/common/inet/ip.h +++ b/usr/src/uts/common/inet/ip.h @@ -1416,6 +1416,7 @@ typedef union ill_g_head_u { #define ILL_CAPAB_DLD 0x20 /* DLD capabilities */ #define ILL_CAPAB_DLD_POLL 0x40 /* Polling */ #define ILL_CAPAB_DLD_DIRECT 0x80 /* Direct function call */ +#define ILL_CAPAB_DLD_IPCHECK 0x100 /* Check if IPs are permitted */ /* * Per-ill Hardware Checksumming capbilities. @@ -1772,6 +1773,10 @@ typedef struct ill_s { * Used to save errors that occur during plumbing */ uint_t ill_ifname_pending_err; + /* + * Used to save errors that occur during binding + */ + uint_t ill_dl_bind_err; avl_node_t ill_avl_byppa; /* avl node based on ppa */ uint_t ill_mcast_nces; /* Number of NCEs that are multicast. */ list_t ill_nce; /* pointer to nce_s list */ @@ -1938,6 +1943,7 @@ typedef struct ill_s { * ill_nd_lla_len ipsq + down ill only when ill is up * ill_phys_addr_pend ipsq + down ill only when ill is up * ill_ifname_pending_err ipsq ipsq + * ill_dl_bind_err ipsq ipsq * ill_avl_byppa ipsq, ill_g_lock write once * * ill_fastpath_list ill_lock ill_lock @@ -3580,6 +3586,8 @@ typedef void (*ip_flow_enable_t)(void *, ip_mac_tx_cookie_t); typedef void *(*ip_dld_callb_t)(void *, ip_flow_enable_t, void *); typedef boolean_t (*ip_dld_fctl_t)(void *, ip_mac_tx_cookie_t); +typedef boolean_t (*ip_mac_ipcheck_t)(void *, boolean_t, + in6_addr_t *); typedef int (*ip_capab_func_t)(void *, uint_t, void *, uint_t); @@ -3632,6 +3640,12 @@ typedef struct ill_dld_direct_s { /* DLD provided driver Tx */ void *idd_tx_fctl_dh; /* mac_client_handle */ } ill_dld_direct_t; +/* IP - DLD direct function call to check if an IP is allowed */ +typedef struct ill_dld_ipcheck_s { + ip_mac_ipcheck_t idi_allowed_df; + void *idi_allowed_dh; +} ill_dld_ipcheck_t; + /* IP - DLD polling capability */ typedef struct ill_dld_poll_s { ill_rx_ring_t idp_ring_tbl[ILL_MAX_RINGS]; @@ -3643,6 +3657,7 @@ struct ill_dld_capab_s { void *idc_capab_dh; /* dld_str_t *dsp */ ill_dld_direct_t idc_direct; ill_dld_poll_t idc_poll; + ill_dld_ipcheck_t idc_ipcheck; }; /* diff --git a/usr/src/uts/common/inet/ip/conn_opt.c b/usr/src/uts/common/inet/ip/conn_opt.c index 7aac9b655a..eeec56b162 100644 --- a/usr/src/uts/common/inet/ip/conn_opt.c +++ b/usr/src/uts/common/inet/ip/conn_opt.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc. * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -644,6 +645,9 @@ conn_opt_get(conn_opt_arg_t *coa, t_scalar_t level, t_scalar_t name, case SO_REUSEADDR: *i1 = connp->conn_reuseaddr ? 
SO_REUSEADDR : 0; break; /* goto sizeof (int) option return */ + case SO_REUSEPORT: + *i1 = connp->conn_reuseport; + break; /* goto sizeof (int) option return */ case SO_TYPE: *i1 = connp->conn_so_type; break; /* goto sizeof (int) option return */ @@ -1214,8 +1218,24 @@ conn_opt_set_ip(conn_opt_arg_t *coa, t_scalar_t name, uint_t inlen, ip_stack_t *ipst = connp->conn_netstack->netstack_ip; int error; - if (connp->conn_family != AF_INET) + if (connp->conn_family == AF_INET6 && + connp->conn_ipversion == IPV4_VERSION) { + /* + * Allow certain IPv4 options to be set on an AF_INET6 socket + * if the connection is still IPv4. + */ + switch (name) { + case IP_TOS: + case T_IP_TOS: + case IP_TTL: + case IP_DONTFRAG: + break; + default: + return (EINVAL); + } + } else if (connp->conn_family != AF_INET) { return (EINVAL); + } ifindex = UINT_MAX; switch (name) { diff --git a/usr/src/uts/common/inet/ip/icmp.c b/usr/src/uts/common/inet/ip/icmp.c index 57ee0c5585..46c791298a 100644 --- a/usr/src/uts/common/inet/ip/icmp.c +++ b/usr/src/uts/common/inet/ip/icmp.c @@ -81,6 +81,7 @@ #include <sys/tsol/tnet.h> #include <inet/rawip_impl.h> +#include <net/bpf.h> #include <sys/disp.h> @@ -1018,6 +1019,12 @@ icmp_close_free(conn_t *connp) icmp->icmp_filter = NULL; } + if (icmp->icmp_bpf_len != 0) { + kmem_free(icmp->icmp_bpf_prog, icmp->icmp_bpf_len); + icmp->icmp_bpf_len = 0; + icmp->icmp_bpf_prog = NULL; + } + /* * Clear any fields which the kmem_cache constructor clears. * Only icmp_connp needs to be preserved. @@ -1971,6 +1978,104 @@ icmp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr) return (err); } +static int +icmp_attach_filter(icmp_t *icmp, uint_t inlen, const uchar_t *invalp) +{ + struct bpf_program prog; + ip_bpf_insn_t *insns = NULL; + unsigned int size; + +#ifdef _LP64 + if (get_udatamodel() != DATAMODEL_NATIVE) { + struct bpf_program32 *prog32; + + if (inlen != sizeof (struct bpf_program32)) { + return (EINVAL); + } + prog32 = (struct bpf_program32 *)invalp; + prog.bf_len = prog32->bf_len; + prog.bf_insns = (void *)(uint64_t)prog32->bf_insns; + } else +#endif + if (inlen == sizeof (struct bpf_program)) { + bcopy(invalp, &prog, sizeof (prog)); + } else { + return (EINVAL); + } + + if (prog.bf_len > BPF_MAXINSNS || prog.bf_len == 0) { + return (EINVAL); + } + size = prog.bf_len * sizeof (struct bpf_insn); + insns = kmem_alloc(size, KM_SLEEP); + if (copyin(prog.bf_insns, insns, size) != 0) { + kmem_free(insns, size); + return (EFAULT); + } + if (!ip_bpf_validate(insns, prog.bf_len)) { + kmem_free(insns, size); + return (EINVAL); + } + + rw_enter(&icmp->icmp_bpf_lock, RW_WRITER); + if (icmp->icmp_bpf_len != 0) { + ASSERT(icmp->icmp_bpf_prog != NULL); + + kmem_free(icmp->icmp_bpf_prog, icmp->icmp_bpf_len); + } + icmp->icmp_bpf_len = size; + icmp->icmp_bpf_prog = insns; + rw_exit(&icmp->icmp_bpf_lock); + return (0); +} + +static int +icmp_detach_filter(icmp_t *icmp) +{ + int error; + + rw_enter(&icmp->icmp_bpf_lock, RW_WRITER); + if (icmp->icmp_bpf_len == 0) { + ASSERT(icmp->icmp_bpf_prog == NULL); + error = ENOENT; + } else { + kmem_free(icmp->icmp_bpf_prog, + icmp->icmp_bpf_len); + icmp->icmp_bpf_len = 0; + icmp->icmp_bpf_prog = NULL; + error = 0; + } + rw_exit(&icmp->icmp_bpf_lock); + return (error); +} + +static boolean_t +icmp_eval_filter(icmp_t *icmp, mblk_t *mp, ip_recv_attr_t *ira) +{ + boolean_t res; + uchar_t *buf = mp->b_rptr; + uint_t wirelen, len = MBLKL(mp); + + rw_enter(&icmp->icmp_bpf_lock, RW_READER); + if (icmp->icmp_bpf_len == 0) { + rw_exit(&icmp->icmp_bpf_lock); + return 
(B_FALSE); + } + if (ira->ira_flags & IRAF_IS_IPV4) { + ipha_t *ipha = (ipha_t *)buf; + + wirelen = ntohs(ipha->ipha_length); + } else { + ip6_t *ip6h = (ip6_t *)buf; + + wirelen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN; + } + res = !ip_bpf_filter(icmp->icmp_bpf_prog, buf, wirelen, len); + rw_exit(&icmp->icmp_bpf_lock); + + return (res); +} + /* * This routine sets socket options. */ @@ -2060,6 +2165,10 @@ icmp_do_opt_set(conn_opt_arg_t *coa, int level, int name, return (ENOBUFS); } break; + case SO_ATTACH_FILTER: + return (icmp_attach_filter(icmp, inlen, invalp)); + case SO_DETACH_FILTER: + return (icmp_detach_filter(icmp)); } break; @@ -2605,6 +2714,14 @@ icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) /* Initialize regardless of IP version */ ipps.ipp_fields = 0; + /* Apply socket filter, if needed */ + if (icmp->icmp_bpf_len != 0) { + if (icmp_eval_filter(icmp, mp, ira)) { + freemsg(mp); + return; + } + } + if (ira->ira_flags & IRAF_IS_IPV4) { ASSERT(IPH_HDR_VERSION(rptr) == IPV4_VERSION); ASSERT(MBLKL(mp) >= sizeof (ipha_t)); diff --git a/usr/src/uts/common/inet/ip/icmp_opt_data.c b/usr/src/uts/common/inet/ip/icmp_opt_data.c index ff0310de0c..d65d3164d3 100644 --- a/usr/src/uts/common/inet/ip/icmp_opt_data.c +++ b/usr/src/uts/common/inet/ip/icmp_opt_data.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ #include <sys/types.h> @@ -41,6 +42,7 @@ #include <netinet/ip_mroute.h> #include <inet/optcom.h> #include <inet/rawip_impl.h> +#include <net/bpf.h> /* * Table of all known options handled on a ICMP protocol stack. @@ -86,6 +88,10 @@ opdes_t icmp_opt_arr[] = { 0 }, { SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, +{ SO_ATTACH_FILTER, SOL_SOCKET, OA_W, OA_W, OP_NP, 0, + sizeof (struct bpf_program), 0 }, +{ SO_DETACH_FILTER, SOL_SOCKET, OA_W, OA_W, OP_NP, 0, 0, 0 }, + { IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, (OP_VARLEN|OP_NODEFAULT), IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ }, diff --git a/usr/src/uts/common/inet/ip/ip.c b/usr/src/uts/common/inet/ip/ip.c index 6063fa01d2..704f152bb9 100644 --- a/usr/src/uts/common/inet/ip/ip.c +++ b/usr/src/uts/common/inet/ip/ip.c @@ -8235,7 +8235,6 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) conn_t *connp = NULL; t_uscalar_t paddrreq; mblk_t *mp_hw; - boolean_t success; boolean_t ioctl_aborted = B_FALSE; boolean_t log = B_TRUE; @@ -8335,7 +8334,8 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) ill->ill_state_flags &= ~ILL_DOWN_IN_PROGRESS; mutex_exit(&ill->ill_lock); /* - * Something went wrong with the bind. We presumably + * Something went wrong with the bind. If this was the + * result of a DL_NOTE_REPLUMB, then we presumably * have an IOCTL hanging out waiting for completion. * Find it, take down the interface that was coming * up, and complete the IOCTL with the error noted. @@ -8352,6 +8352,15 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) (void) ipif_down(ipif, NULL, NULL); /* error is set below the switch */ + } else { + /* + * There's no pending IOCTL, so the bind was + * most likely started by ill_dl_up(). We save + * the error and let it take care of responding + * to the IOCTL. + */ + ill->ill_dl_bind_err = dlea->dl_unix_errno ? 
+ dlea->dl_unix_errno : ENXIO; } break; case DL_ENABMULTI_REQ: @@ -8475,55 +8484,7 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) DTRACE_PROBE1(ip__rput__dlpi__bind__ack, ill_t *, ill); ill_nic_event_dispatch(ill, 0, NE_UP, NULL, 0); - /* - * Now bring up the resolver; when that is complete, we'll - * create IREs. Note that we intentionally mirror what - * ipif_up() would have done, because we got here by way of - * ill_dl_up(), which stopped ipif_up()'s processing. - */ - if (ill->ill_isv6) { - /* - * v6 interfaces. - * Unlike ARP which has to do another bind - * and attach, once we get here we are - * done with NDP - */ - (void) ipif_resolver_up(ipif, Res_act_initial); - if ((err = ipif_ndp_up(ipif, B_TRUE)) == 0) - err = ipif_up_done_v6(ipif); - } else if (ill->ill_net_type == IRE_IF_RESOLVER) { - /* - * ARP and other v4 external resolvers. - * Leave the pending mblk intact so that - * the ioctl completes in ip_rput(). - */ - if (connp != NULL) - mutex_enter(&connp->conn_lock); - mutex_enter(&ill->ill_lock); - success = ipsq_pending_mp_add(connp, ipif, q, mp1, 0); - mutex_exit(&ill->ill_lock); - if (connp != NULL) - mutex_exit(&connp->conn_lock); - if (success) { - err = ipif_resolver_up(ipif, Res_act_initial); - if (err == EINPROGRESS) { - freemsg(mp); - return; - } - mp1 = ipsq_pending_mp_get(ipsq, &connp); - } else { - /* The conn has started closing */ - err = EINTR; - } - } else { - /* - * This one is complete. Reply to pending ioctl. - */ - (void) ipif_resolver_up(ipif, Res_act_initial); - err = ipif_up_done(ipif); - } - - if ((err == 0) && (ill->ill_up_ipifs)) { + if (ill->ill_up_ipifs) { err = ill_up_ipifs(ill, q, mp1); if (err == EINPROGRESS) { freemsg(mp); @@ -8531,25 +8492,6 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) } } - /* - * If we have a moved ipif to bring up, and everything has - * succeeded to this point, bring it up on the IPMP ill. - * Otherwise, leave it down -- the admin can try to bring it - * up by hand if need be. - */ - if (ill->ill_move_ipif != NULL) { - if (err != 0) { - ill->ill_move_ipif = NULL; - } else { - ipif = ill->ill_move_ipif; - ill->ill_move_ipif = NULL; - err = ipif_up(ipif, q, mp1); - if (err == EINPROGRESS) { - freemsg(mp); - return; - } - } - } break; case DL_NOTIFY_IND: { @@ -12621,6 +12563,7 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) struct iocblk *iocp = (struct iocblk *)mp->b_rptr; ip_ioctl_cmd_t *ipip = arg; ip_extract_func_t *extract_funcp; + ill_t *ill; cmd_info_t ci; int err; boolean_t entered_ipsq = B_FALSE; @@ -12742,6 +12685,13 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) ipsq_current_start(ipsq, ci.ci_ipif, ipip->ipi_cmd); /* + * We need to cache the ill_t that we're going to use as the argument + * to the ipif-ioctl DTrace probe (below) because the ci_ipif can be + * blown away by calling ipi_func. + */ + ill = ci.ci_ipif == NULL ? NULL : ci.ci_ipif->ipif_ill; + + /* * A return value of EINPROGRESS means the ioctl is * either queued and waiting for some reason or has * already completed. @@ -12749,9 +12699,7 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) err = (*ipip->ipi_func)(ci.ci_ipif, ci.ci_sin, q, mp, ipip, ci.ci_lifr); DTRACE_PROBE4(ipif__ioctl, char *, "ip_process_ioctl finish WR", - int, ipip->ipi_cmd, - ill_t *, ci.ci_ipif == NULL ? 
NULL : ci.ci_ipif->ipif_ill, - ipif_t *, ci.ci_ipif); + int, ipip->ipi_cmd, ill_t *, ill, ipif_t *, ci.ci_ipif); ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq); if (entered_ipsq) diff --git a/usr/src/uts/common/inet/ip/ip_if.c b/usr/src/uts/common/inet/ip/ip_if.c index cc67299a1b..2307837eb8 100644 --- a/usr/src/uts/common/inet/ip/ip_if.c +++ b/usr/src/uts/common/inet/ip/ip_if.c @@ -22,7 +22,7 @@ * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 1990 Mentat Inc. * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright (c) 2016, Joyent, Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. * Copyright (c) 2014, OmniTI Computer Consulting, Inc. All rights reserved. */ @@ -174,7 +174,7 @@ static ipif_t *ipif_lookup_on_name_async(char *name, size_t namelen, static int ill_alloc_ppa(ill_if_t *, ill_t *); static void ill_delete_interface_type(ill_if_t *); -static int ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q); +static int ill_dl_up(ill_t *ill, ipif_t *ipif); static void ill_dl_down(ill_t *ill); static void ill_down(ill_t *ill); static void ill_down_ipifs(ill_t *, boolean_t); @@ -1380,6 +1380,36 @@ ill_capability_probe(ill_t *ill) ill->ill_dlpi_capab_state = IDCS_PROBE_SENT; } +static boolean_t +ill_capability_wait(ill_t *ill) +{ + /* + * I'm in this ill's squeue, aka a writer. The ILL_CONDEMNED flag can + * only be set by someone who is the writer. Since we + * drop-and-reacquire the squeue in this loop, we need to check for + * ILL_CONDEMNED, which if set means nothing can signal our capability + * condition variable. + */ + ASSERT(IAM_WRITER_ILL(ill)); + + while (ill->ill_capab_pending_cnt != 0 && + (ill->ill_state_flags & ILL_CONDEMNED) == 0) { + /* This may enable blocked callers of ill_capability_done(). */ + ipsq_exit(ill->ill_phyint->phyint_ipsq); + /* Pause a bit (1msec) before we re-enter the squeue. */ + delay(drv_usectohz(1000000)); + + /* + * If ipsq_enter() fails, someone set ILL_CONDEMNED + * while we dropped the squeue. Indicate such to the caller. + */ + if (!ipsq_enter(ill, B_FALSE, CUR_OP)) + return (B_FALSE); + } + + return ((ill->ill_state_flags & ILL_CONDEMNED) == 0); +} + void ill_capability_reset(ill_t *ill, boolean_t reneg) { @@ -1390,6 +1420,8 @@ ill_capability_reset(ill_t *ill, boolean_t reneg) ill->ill_dlpi_capab_state = reneg ? IDCS_RENEG : IDCS_RESET_SENT; + ASSERT(ill->ill_capab_reset_mp != NULL); + ill_capability_send(ill, ill->ill_capab_reset_mp); ill->ill_capab_reset_mp = NULL; /* @@ -2109,6 +2141,49 @@ ill_capability_lso_enable(ill_t *ill) } } +/* + * Check whether or not mac will prevent us from sending with a given IP + * address. This requires having the IPCHECK capability, which we should + * always be able to successfully negotiate, but if it's somehow missing + * then we just permit the caller to use the address, since mac does the + * actual enforcement and ip is just performing a courtesy check to help + * prevent users from unwittingly setting and attempting to use blocked + * addresses. 
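+ *
+ * This check is applied from ip_sioctl_addr() when an address is
+ * assigned and again from ipif_up() when the interface is brought up.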
+ */ +static boolean_t +ill_ipcheck_addr(ill_t *ill, in6_addr_t *v6addr) +{ + if ((ill->ill_capabilities & ILL_CAPAB_DLD_IPCHECK) == 0) + return (B_TRUE); + + ill_dld_ipcheck_t *idi = &ill->ill_dld_capab->idc_ipcheck; + ip_mac_ipcheck_t ipcheck = idi->idi_allowed_df; + return (ipcheck(idi->idi_allowed_dh, ill->ill_isv6, v6addr)); +} + +static void +ill_capability_ipcheck_enable(ill_t *ill) +{ + ill_dld_capab_t *idc = ill->ill_dld_capab; + ill_dld_ipcheck_t *idi = &idc->idc_ipcheck; + dld_capab_ipcheck_t spoof; + int rc; + + ASSERT(IAM_WRITER_ILL(ill)); + + bzero(&spoof, sizeof (spoof)); + if ((rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_IPCHECK, + &spoof, DLD_ENABLE)) == 0) { + idi->idi_allowed_df = (ip_mac_ipcheck_t)spoof.ipc_allowed_df; + idi->idi_allowed_dh = spoof.ipc_allowed_dh; + ill->ill_capabilities |= ILL_CAPAB_DLD_IPCHECK; + } else { + cmn_err(CE_WARN, "warning: could not enable IPCHECK " + "capability, rc = %d\n", rc); + DTRACE_PROBE2(ipcheck__off, (ill_t *), ill, (int), rc); + } +} + static void ill_capability_dld_enable(ill_t *ill) { @@ -2121,6 +2196,8 @@ ill_capability_dld_enable(ill_t *ill) ill_capability_direct_enable(ill); ill_capability_poll_enable(ill); } + + ill_capability_ipcheck_enable(ill); ill_capability_lso_enable(ill); ill->ill_capabilities |= ILL_CAPAB_DLD; ill_mac_perim_exit(ill, mph); @@ -2186,6 +2263,15 @@ ill_capability_dld_disable(ill_t *ill) NULL, DLD_DISABLE); } + if ((ill->ill_capabilities & ILL_CAPAB_DLD_IPCHECK) != 0) { + ASSERT(ill->ill_dld_capab->idc_ipcheck.idi_allowed_df != NULL); + ASSERT(ill->ill_dld_capab->idc_ipcheck.idi_allowed_dh != NULL); + + ill->ill_capabilities &= ~ILL_CAPAB_DLD_IPCHECK; + (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_IPCHECK, + NULL, DLD_DISABLE); + } + ill->ill_capabilities &= ~ILL_CAPAB_DLD; ill_mac_perim_exit(ill, mph); } @@ -9676,7 +9762,6 @@ ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, in6_addr_t v6addr; boolean_t need_up = B_FALSE; ill_t *ill; - int i; ip1dbg(("ip_sioctl_addr(%s:%u %p)\n", ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); @@ -9751,20 +9836,9 @@ ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); } - /* - * verify that the address being configured is permitted by the - * ill_allowed_ips[] for the interface. - */ - if (ill->ill_allowed_ips_cnt > 0) { - for (i = 0; i < ill->ill_allowed_ips_cnt; i++) { - if (IN6_ARE_ADDR_EQUAL(&ill->ill_allowed_ips[i], - &v6addr)) - break; - } - if (i == ill->ill_allowed_ips_cnt) { - pr_addr_dbg("!allowed addr %s\n", AF_INET6, &v6addr); - return (EPERM); - } + /* verify that the address being configured is permitted by mac */ + if (!ill_ipcheck_addr(ill, &v6addr)) { + return (EPERM); } /* * Even if there is no change we redo things just to rerun @@ -12704,6 +12778,12 @@ ill_dl_down(ill_t *ill) } ill->ill_unbind_mp = NULL; + + mutex_enter(&ill->ill_lock); + ill->ill_dl_up = 0; + ill_nic_event_dispatch(ill, 0, NE_DOWN, NULL, 0); + mutex_exit(&ill->ill_lock); + if (mp != NULL) { ip1dbg(("ill_dl_down: %s (%u) for %s\n", dl_primstr(*(int *)mp->b_rptr), *(int *)mp->b_rptr, @@ -12726,11 +12806,13 @@ ill_dl_down(ill_t *ill) ill_capability_dld_disable(ill); ill_capability_reset(ill, B_FALSE); ill_dlpi_send(ill, mp); + + /* + * Wait for the capability reset to finish. + * In this case, it doesn't matter WHY or HOW it finished. 
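+ * (That is why the return value is cast away below: even if the wait
+ * ended because the ill was condemned, the interface is on its way
+ * down regardless.)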
+ */ + (void) ill_capability_wait(ill); } - mutex_enter(&ill->ill_lock); - ill->ill_dl_up = 0; - ill_nic_event_dispatch(ill, 0, NE_DOWN, NULL, 0); - mutex_exit(&ill->ill_lock); } void @@ -12852,6 +12934,7 @@ void ill_capability_done(ill_t *ill) { ASSERT(ill->ill_capab_pending_cnt != 0); + ASSERT(IAM_WRITER_ILL(ill)); ill_dlpi_done(ill, DL_CAPABILITY_REQ); @@ -14480,7 +14563,14 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) * address/netmask etc cause a down/up dance, but * does not cause an unbind (DL_UNBIND) with the driver */ - return (ill_dl_up(ill, ipif, mp, q)); + if ((err = ill_dl_up(ill, ipif)) != 0) { + return (err); + } + } + + /* Reject bringing up interfaces with unusable IP addresses */ + if (!ill_ipcheck_addr(ill, &ipif->ipif_v6lcl_addr)) { + return (EPERM); } /* @@ -14593,24 +14683,22 @@ ill_delete_ires(ill_t *ill) /* * Perform a bind for the physical device. - * When the routine returns EINPROGRESS then mp has been consumed and - * the ioctl will be acked from ip_rput_dlpi. - * Allocate an unbind message and save it until ipif_down. + * + * When the routine returns successfully then dlpi has been bound and + * capabilities negotiated. An unbind message will have been allocated + * for later use in ipif_down. */ static int -ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) +ill_dl_up(ill_t *ill, ipif_t *ipif) { mblk_t *bind_mp = NULL; mblk_t *unbind_mp = NULL; - conn_t *connp; - boolean_t success; int err; DTRACE_PROBE2(ill__downup, char *, "ill_dl_up", ill_t *, ill); ip1dbg(("ill_dl_up(%s)\n", ill->ill_name)); ASSERT(IAM_WRITER_ILL(ill)); - ASSERT(mp != NULL); /* * Make sure we have an IRE_MULTICAST in case we immediately @@ -14645,19 +14733,6 @@ ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) if (unbind_mp == NULL) goto bad; } - /* - * Record state needed to complete this operation when the - * DL_BIND_ACK shows up. Also remember the pre-allocated mblks. - */ - connp = CONN_Q(q) ? Q_TO_CONN(q) : NULL; - ASSERT(connp != NULL || !CONN_Q(q)); - GRAB_CONN_LOCK(q); - mutex_enter(&ipif->ipif_ill->ill_lock); - success = ipsq_pending_mp_add(connp, ipif, q, mp, 0); - mutex_exit(&ipif->ipif_ill->ill_lock); - RELEASE_CONN_LOCK(q); - if (!success) - goto bad; /* * Save the unbind message for ill_dl_down(); it will be consumed when @@ -14669,6 +14744,18 @@ ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) ill_dlpi_send(ill, bind_mp); /* Send down link-layer capabilities probe if not already done. */ ill_capability_probe(ill); + /* + * Wait for DLPI to be bound and the capability probe to finish. + * The call drops-and-reacquires the squeue. If it couldn't because + * ILL_CONDEMNED got set, bail. + */ + if (!ill_capability_wait(ill)) + return (ENXIO); + + /* DLPI failed to bind. Return the saved error */ + if (!ill->ill_dl_up) { + return (ill->ill_dl_bind_err); + } /* * Sysid used to rely on the fact that netboots set domainname @@ -14686,11 +14773,7 @@ ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) cmn_err(CE_WARN, "no cached dhcp response"); } - /* - * This operation will complete in ip_rput_dlpi with either - * a DL_BIND_ACK or DL_ERROR_ACK. 
- */ - return (EINPROGRESS); + return (0); bad: ip1dbg(("ill_dl_up(%s) FAILED\n", ill->ill_name)); diff --git a/usr/src/uts/common/inet/ip/ip_squeue.c b/usr/src/uts/common/inet/ip/ip_squeue.c index 13e961333c..b6565d9c1f 100644 --- a/usr/src/uts/common/inet/ip/ip_squeue.c +++ b/usr/src/uts/common/inet/ip/ip_squeue.c @@ -153,7 +153,7 @@ ip_squeue_create(pri_t pri) { squeue_t *sqp; - sqp = squeue_create(pri); + sqp = squeue_create(pri, B_TRUE); ASSERT(sqp != NULL); if (ip_squeue_create_callback != NULL) ip_squeue_create_callback(sqp); diff --git a/usr/src/uts/common/inet/ip/ipclassifier.c b/usr/src/uts/common/inet/ip/ipclassifier.c index 34832d56e5..d47997a4aa 100644 --- a/usr/src/uts/common/inet/ip/ipclassifier.c +++ b/usr/src/uts/common/inet/ip/ipclassifier.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc. * Copyright 2019 OmniOS Community Edition (OmniOSce) Association. * Copyright 2022 Joyent, Inc. */ @@ -871,67 +872,91 @@ ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp) mutex_exit(&(connfp)->connf_lock); \ } -#define IPCL_HASH_INSERT_BOUND(connfp, connp) { \ - conn_t *pconnp = NULL, *nconnp; \ - IPCL_HASH_REMOVE((connp)); \ - mutex_enter(&(connfp)->connf_lock); \ - nconnp = (connfp)->connf_head; \ - while (nconnp != NULL && \ - !_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6)) { \ - pconnp = nconnp; \ - nconnp = nconnp->conn_next; \ - } \ - if (pconnp != NULL) { \ - pconnp->conn_next = (connp); \ - (connp)->conn_prev = pconnp; \ - } else { \ - (connfp)->connf_head = (connp); \ - } \ - if (nconnp != NULL) { \ - (connp)->conn_next = nconnp; \ - nconnp->conn_prev = (connp); \ - } \ - (connp)->conn_fanout = (connfp); \ - (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ - IPCL_BOUND; \ - CONN_INC_REF(connp); \ - mutex_exit(&(connfp)->connf_lock); \ -} +/* + * When inserting bound or wildcard entries into the hash, ordering rules are + * used to facilitate timely and correct lookups. The order is as follows: + * 1. Entries bound to a specific address + * 2. Entries bound to INADDR_ANY + * 3. Entries bound to ADDR_UNSPECIFIED + * Entries in a category which share conn_lport (such as those using + * SO_REUSEPORT) will be ordered such that the newest inserted is first. + */ -#define IPCL_HASH_INSERT_WILDCARD(connfp, connp) { \ - conn_t **list, *prev, *next; \ - boolean_t isv4mapped = \ - IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6); \ - IPCL_HASH_REMOVE((connp)); \ - mutex_enter(&(connfp)->connf_lock); \ - list = &(connfp)->connf_head; \ - prev = NULL; \ - while ((next = *list) != NULL) { \ - if (isv4mapped && \ - IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) && \ - connp->conn_zoneid == next->conn_zoneid) { \ - (connp)->conn_next = next; \ - if (prev != NULL) \ - prev = next->conn_prev; \ - next->conn_prev = (connp); \ - break; \ - } \ - list = &next->conn_next; \ - prev = next; \ - } \ - (connp)->conn_prev = prev; \ - *list = (connp); \ - (connp)->conn_fanout = (connfp); \ - (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ - IPCL_BOUND; \ - CONN_INC_REF((connp)); \ - mutex_exit(&(connfp)->connf_lock); \ +void +ipcl_hash_insert_bound(connf_t *connfp, conn_t *connp) +{ + conn_t *pconnp, *nconnp; + + IPCL_HASH_REMOVE(connp); + mutex_enter(&connfp->connf_lock); + nconnp = connfp->connf_head; + pconnp = NULL; + while (nconnp != NULL) { + /* + * Walk though entries associated with the fanout until one is + * found which fulfills any of these conditions: + * 1. 
Listen address of ADDR_ANY/ADDR_UNSPECIFIED + * 2. Listen port the same as connp + */ + if (_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6) || + connp->conn_lport == nconnp->conn_lport) + break; + pconnp = nconnp; + nconnp = nconnp->conn_next; + } + if (pconnp != NULL) { + pconnp->conn_next = connp; + connp->conn_prev = pconnp; + } else { + connfp->connf_head = connp; + } + if (nconnp != NULL) { + connp->conn_next = nconnp; + nconnp->conn_prev = connp; + } + connp->conn_fanout = connfp; + connp->conn_flags = (connp->conn_flags & ~IPCL_REMOVED) | IPCL_BOUND; + CONN_INC_REF(connp); + mutex_exit(&connfp->connf_lock); } void ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp) { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + conn_t **list, *prev, *next; + conn_t *pconnp = NULL, *nconnp; + boolean_t isv4mapped = IN6_IS_ADDR_V4MAPPED(&connp->conn_laddr_v6); + + IPCL_HASH_REMOVE(connp); + mutex_enter(&connfp->connf_lock); + nconnp = connfp->connf_head; + pconnp = NULL; + while (nconnp != NULL) { + if (IN6_IS_ADDR_V4MAPPED_ANY(&nconnp->conn_laddr_v6) && + isv4mapped && connp->conn_lport == nconnp->conn_lport) + break; + if (IN6_IS_ADDR_UNSPECIFIED(&nconnp->conn_laddr_v6) && + (isv4mapped || + connp->conn_lport == nconnp->conn_lport)) + break; + + pconnp = nconnp; + nconnp = nconnp->conn_next; + } + if (pconnp != NULL) { + pconnp->conn_next = connp; + connp->conn_prev = pconnp; + } else { + connfp->connf_head = connp; + } + if (nconnp != NULL) { + connp->conn_next = nconnp; + nconnp->conn_prev = connp; + } + connp->conn_fanout = connfp; + connp->conn_flags = (connp->conn_flags & ~IPCL_REMOVED) | IPCL_BOUND; + CONN_INC_REF(connp); + mutex_exit(&connfp->connf_lock); } /* @@ -1037,9 +1062,9 @@ ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport) IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) || IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } else { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } } else { IPCL_HASH_INSERT_CONNECTED(connfp, connp); @@ -1208,9 +1233,9 @@ ipcl_bind_insert_v4(conn_t *connp) if (connp->conn_faddr_v4 != INADDR_ANY) { IPCL_HASH_INSERT_CONNECTED(connfp, connp); } else if (connp->conn_laddr_v4 != INADDR_ANY) { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } if (protocol == IPPROTO_RSVP) ill_set_inputfn_all(ipst); @@ -1222,9 +1247,9 @@ ipcl_bind_insert_v4(conn_t *connp) connfp = &ipst->ips_ipcl_bind_fanout[ IPCL_BIND_HASH(lport, ipst)]; if (connp->conn_laddr_v4 != INADDR_ANY) { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } if (cl_inet_listen != NULL) { ASSERT(connp->conn_ipversion == IPV4_VERSION); @@ -1274,9 +1299,9 @@ ipcl_bind_insert_v6(conn_t *connp) if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) { IPCL_HASH_INSERT_CONNECTED(connfp, connp); } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } break; @@ -1286,9 +1311,9 @@ ipcl_bind_insert_v6(conn_t *connp) connfp = &ipst->ips_ipcl_bind_fanout[ IPCL_BIND_HASH(lport, ipst)]; if 
(!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } if (cl_inet_listen != NULL) { sa_family_t addr_family; @@ -1419,9 +1444,9 @@ ipcl_conn_insert_v4(conn_t *connp) if (connp->conn_faddr_v4 != INADDR_ANY) { IPCL_HASH_INSERT_CONNECTED(connfp, connp); } else if (connp->conn_laddr_v4 != INADDR_ANY) { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } break; } @@ -1507,9 +1532,9 @@ ipcl_conn_insert_v6(conn_t *connp) if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) { IPCL_HASH_INSERT_CONNECTED(connfp, connp); } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } break; } @@ -2095,6 +2120,7 @@ rawip_conn_constructor(void *buf, void *cdrarg, int kmflags) connp->conn_flags = IPCL_RAWIPCONN; connp->conn_proto = IPPROTO_ICMP; icmp->icmp_connp = connp; + rw_init(&icmp->icmp_bpf_lock, NULL, RW_DEFAULT, NULL); rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL); connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags); if (connp->conn_ixa == NULL) @@ -2119,6 +2145,7 @@ rawip_conn_destructor(void *buf, void *cdrarg) mutex_destroy(&connp->conn_lock); cv_destroy(&connp->conn_cv); rw_destroy(&connp->conn_ilg_lock); + rw_destroy(&icmp->icmp_bpf_lock); /* Can be NULL if constructor failed */ if (connp->conn_ixa != NULL) { diff --git a/usr/src/uts/common/inet/ipclassifier.h b/usr/src/uts/common/inet/ipclassifier.h index 89968826b3..70cff374a4 100644 --- a/usr/src/uts/common/inet/ipclassifier.h +++ b/usr/src/uts/common/inet/ipclassifier.h @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. */ /* @@ -299,7 +300,8 @@ struct conn_s { conn_ipv6_recvpathmtu : 1, /* IPV6_RECVPATHMTU */ conn_mcbc_bind : 1, /* Bound to multi/broadcast */ - conn_pad_to_bit_31 : 12; + conn_reuseport : 1, /* SO_REUSEPORT state */ + conn_pad_to_bit_31 : 11; boolean_t conn_blocked; /* conn is flow-controlled */ diff --git a/usr/src/uts/common/inet/ipd/ipd.c b/usr/src/uts/common/inet/ipd/ipd.c index 104603d840..22f2d79d24 100644 --- a/usr/src/uts/common/inet/ipd/ipd.c +++ b/usr/src/uts/common/inet/ipd/ipd.c @@ -9,7 +9,7 @@ * http://www.illumos.org/license/CDDL. */ /* - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2018, Joyent, Inc. All rights reserved. */ /* @@ -222,7 +222,7 @@ typedef struct ipd_netstack { net_handle_t ipdn_v6hdl; /* IPv4 net handle */ int ipdn_hooked; /* are hooks registered */ hook_t *ipdn_v4in; /* IPv4 traffic in hook */ - hook_t *ipdn_v4out; /* IPv4 traffice out hook */ + hook_t *ipdn_v4out; /* IPv4 traffic out hook */ hook_t *ipdn_v6in; /* IPv6 traffic in hook */ hook_t *ipdn_v6out; /* IPv6 traffic out hook */ int ipdn_enabled; /* which perturbs are on */ @@ -613,7 +613,7 @@ ipd_toggle_delay(ipd_netstack_t *ins, uint32_t delay) /* * If ipd_check_hooks_failed, that must mean that we failed to set up * the hooks, so we are going to effectively zero out and fail the - * request to enable corruption. + * request to enable packet delays. 
*/ if (rval != 0) ins->ipdn_delay = 0; diff --git a/usr/src/uts/common/inet/ipf/cfw.c b/usr/src/uts/common/inet/ipf/cfw.c new file mode 100644 index 0000000000..941aeac328 --- /dev/null +++ b/usr/src/uts/common/inet/ipf/cfw.c @@ -0,0 +1,659 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019, Joyent, Inc. + */ + +/* IPF oddness for compilation in userland for IPF tests. */ +#if defined(KERNEL) || defined(_KERNEL) +#undef KERNEL +#undef _KERNEL +#define KERNEL 1 +#define _KERNEL 1 +#endif + +#include <sys/errno.h> +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/socket.h> +#include <net/if.h> +#include <net/route.h> +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> +#include <netinet/tcp.h> +#include "netinet/ip_compat.h" +#ifdef USE_INET6 +#include <netinet/icmp6.h> +#endif +#include <netinet/tcpip.h> +#include "netinet/ip_fil.h" +#include "netinet/ip_nat.h" +#include "netinet/ip_frag.h" +#include "netinet/ip_state.h" +#include "netinet/ip_proxy.h" +#include "netinet/ip_auth.h" +#include "netinet/ipf_stack.h" +#ifdef IPFILTER_SCAN +#include "netinet/ip_scan.h" +#endif +#ifdef IPFILTER_SYNC +#include "netinet/ip_sync.h" +#endif +#include "netinet/ip_pool.h" +#include "netinet/ip_htable.h" +#ifdef IPFILTER_COMPILED +#include "netinet/ip_rules.h" +#endif +#if defined(_KERNEL) +#include <sys/sunddi.h> +#endif + +#include "netinet/ipf_cfw.h" +#include <sys/file.h> +#include <sys/uio.h> +#include <sys/cred.h> +#include <sys/ddi.h> + +/* + * cfw == Cloud Firewall ==> routines for a global-zone data collector about + * ipf events for SmartOS. The only ones that CFW cares about are ones + * enforced by global-zone-controlled rulesets. + * + * The variable below is tied into the GZ-only ipf device /dev/ipfev, that + * flips this on when there is an open instance. This feature will also + * consume an fr_flag to have per-rule granularity. + */ +boolean_t ipf_cfwlog_enabled; + +/* + * Because ipf's test tools in $SRC/cmd insert all of these files, we need to + * stub out what we can vs. drag in even more headers and who knows what else. + */ +#ifdef _KERNEL + +/* + * CFW event ring buffer. Remember, this is for ALL ZONES because only a + * global-zone event-reader will be consuming these. In other words, it's + * not something to instantiate per-netstack. + * + * We may want to get more sophisticated and performant (e.g. per-processor), + * but for now keep the ring buffer simple and stupid. + * Must be a power of 2, to be bitmaskable, and must be countable by a uint_t + * + * Resizeable, see ipf_cfw_ring_resize() below. + */ +#define IPF_CFW_DEFAULT_RING_BUFS 1024 +#define IPF_CFW_MIN_RING_BUFS 8 +#define IPF_CFW_MAX_RING_BUFS (1U << 31U) + +/* Assume C's init-to-zero is sufficient for these types... */ +static kmutex_t cfw_ringlock; +static kcondvar_t cfw_ringcv; + +static cfwev_t *cfw_ring; /* NULL by default. */ +static uint32_t cfw_ringsize; /* 0 by default, number of array elements. */ +static uint32_t cfw_ringmask; /* 0 by default. */ + +/* If these are equal, we're either empty or full. 
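+ * On its own, cfw_ringstart == cfw_ringend is ambiguous; cfw_ringfull
+ * below is what distinguishes a full ring from an empty one.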
*/ +static uint_t cfw_ringstart, cfw_ringend; +static boolean_t cfw_ringfull; /* Tell the difference here! */ +/* Bean-counters. */ +static uint64_t cfw_evreports; +static uint64_t cfw_evdrops; + +/* + * Place an event in the CFW event ring buffer. + * + * For now, be simple and drop the oldest event if we overflow. We may wish to + * selectively drop older events based on type in the future. + */ +static void +ipf_cfwev_report(cfwev_t *event) +{ + mutex_enter(&cfw_ringlock); + cfw_ring[cfw_ringend] = *event; + cfw_ringend++; + cfw_ringend &= cfw_ringmask; + if (cfw_ringfull) { + cfw_ringstart++; + cfw_ringstart &= cfw_ringmask; + ASSERT3U(cfw_ringstart, ==, cfw_ringend); + DTRACE_PROBE(ipf__cfw__evdrop); + cfw_evdrops++; + } else { + cfw_ringfull = (cfw_ringend == cfw_ringstart); + } + cfw_evreports++; + cv_broadcast(&cfw_ringcv); + mutex_exit(&cfw_ringlock); +} + +/* + * Provide access to multiple CFW events that can allow copying straight from + * the ring buffer up to userland. Requires a callback (which could call + * uiomove() directly, OR to a local still-in-kernel buffer) that must do the + * data copying-out. + * + * Callback function is of the form: + * + * uint_t cfw_many_cb(cfwev_t *evptr, int num_avail, void *cbarg); + * + * The function must return how many events got consumed, which MUST be <= the + * number available. The function must ALSO UNDERSTAND that cfw_ringlock is + * held and must not be released during this time. The function may be called + * more than once, if the available buffers wrap-around OR "block" is set and + * we don't have enough buffers. If any callback returns 0, exit the function + * with however many were consumed. + * + * This function, like the callback, returns the number of events *CONSUMED*. + * + * . . . + * + * Tunables for ipf_cfwev_consume_many(). + * + * If you wish to attempt to coalesce reads (to reduce the likelihood of one + * event at a time during high load) change the number of tries below to + * something not 0. Early experiments set this to 10. + * + * The wait between tries is in usecs in cfw_timeout_wait. The pessimal + * case for this is a timeout_wait-spaced trickle of one event at a time. + */ +uint_t cfw_timeout_tries = 0; +uint_t cfw_timeout_wait = 10000; /* 10ms wait. */ + +typedef struct uio_error_s { + struct uio *ue_uio; + int ue_error; +} uio_error_t; + +static uint_t +ipf_cfwev_consume_many(uint_t num_requested, boolean_t block, + cfwmanycb_t cfw_many_cb, void *cbarg) +{ + uint_t consumed = 0, cb_consumed, contig_size; + uint_t timeout_tries = cfw_timeout_tries; + boolean_t eintr = B_FALSE; + + mutex_enter(&cfw_ringlock); + + while (num_requested > 0) { + clock_t delta; + + /* Silly reality checks */ + ASSERT3U(cfw_ringstart, <, cfw_ringsize); + ASSERT3U(cfw_ringend, <, cfw_ringsize); + + if (cfw_ringstart > cfw_ringend || cfw_ringfull) { + /* We have from ringstart to the buffer's end. */ + contig_size = cfw_ringsize - cfw_ringstart; + } else if (cfw_ringstart < cfw_ringend) { + /* We have no potential wrapping at this time. */ + contig_size = cfw_ringend - cfw_ringstart; + } else if (block && cv_wait_sig(&cfw_ringcv, &cfw_ringlock)) { + /* Maybe something to consume now, try again. */ + continue; + } else { + /* Nothing (more) to consume, return! */ + eintr = (block && consumed == 0); + break; + } + + /* Less asked-for than what we needed. 
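+ * (That is, the caller requested fewer events than are contiguously
+ * available, so clamp contig_size down to num_requested.)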
*/ + if (num_requested < contig_size) + contig_size = num_requested; + + cb_consumed = + cfw_many_cb(&(cfw_ring[cfw_ringstart]), contig_size, cbarg); + ASSERT3U(cb_consumed, <=, contig_size); + + cfw_ringstart += cb_consumed; + ASSERT3U(cfw_ringstart, <=, cfw_ringmask + 1); + cfw_ringstart &= cfw_ringmask; /* In case of wraparound. */ + consumed += cb_consumed; + cfw_ringfull = (cfw_ringfull && cb_consumed == 0); + if (cb_consumed < contig_size) { + /* + * Callback returned less than given. + * This is likely a uio error, but we have + * something. Get out of here. + */ + break; + } + ASSERT3U(cb_consumed, ==, contig_size); + num_requested -= contig_size; + + if (num_requested == 0) { + /* All done! */ + break; + } + + if (cfw_ringstart != cfw_ringend) { + /* + * We wrapped around the end of the buffer, and + * we have more available to fill our request. + */ + ASSERT0(cfw_ringstart); + ASSERT(!cfw_ringfull); + continue; + } + + /* + * We obtained some of the events we requested, but not all. + * Since we have nothing to consume, wait *a little* longer. + */ + if (timeout_tries == 0) + break; /* Don't bother... */ + delta = drv_usectohz(cfw_timeout_wait); + timeout_tries--; + + switch (cv_reltimedwait_sig(&cfw_ringcv, &cfw_ringlock, delta, + TR_CLOCK_TICK)) { + case 0: + /* + * Received signal! Return what we have OR if we have + * nothing, EINTR. + */ + DTRACE_PROBE1(ipf__cfw__timedsignal, int, consumed); + eintr = (consumed == 0); + num_requested = 0; + break; + case -1: + /* Time reached! Bail with what we got. */ + DTRACE_PROBE(ipf__cfw__timedexpired); + num_requested = 0; + break; + default: + /* Aha! We've got more! */ + DTRACE_PROBE(ipf__cfw__moredata); + break; + } + } + + mutex_exit(&cfw_ringlock); + if (eintr) + ((uio_error_t *)cbarg)->ue_error = EINTR; + return (consumed); +} + +/* + * SmartOS likes using the zone's debug id. Make sure we squirrel that away in + * the ipf netstack instance if it's not there. + */ +static inline zoneid_t +ifs_to_did(ipf_stack_t *ifs) +{ + if (ifs->ifs_zone_did == 0) { + zone_t *zone; + + /* + * We can't get the zone_did at initialization time because + * most zone data isn't readily available then, cement the did + * in place now. + */ + VERIFY3U(ifs->ifs_zone, !=, GLOBAL_ZONEID); + zone = zone_find_by_id(ifs->ifs_zone); + if (zone != NULL) { + ifs->ifs_zone_did = zone->zone_did; + zone_rele(zone); + } + /* Else we are either in shutdown or something weirder. */ + } + return (ifs->ifs_zone_did); +} + +/* + * ipf_block_cfwlog() + * + * Called by fr_check(). Record drop events for the global-zone data + * collector. Use rest-of-ipf-style names for the parameters. + */ +void +ipf_block_cfwlog(frentry_t *fr, fr_info_t *fin, ipf_stack_t *ifs) +{ + cfwev_t event = {0}; + + /* + * We need a rule. + * Capture failure by using dtrace on this function's entry. + * 'ipf_block_cfwlog:entry /arg0 == NULL/ { printf("GOTCHA!\n"); }' + */ + if (fr == NULL) + return; + + event.cfwev_type = CFWEV_BLOCK; + event.cfwev_length = sizeof (event); + /* + * IPF code elsewhere does the cheesy single-flag check, even though + * there are two flags in a rule (one for in, one for out). + */ + event.cfwev_direction = (fr->fr_flags & FR_INQUE) ? + CFWDIR_IN : CFWDIR_OUT; + + event.cfwev_protocol = fin->fin_p; + /* + * NOTE: fin_*port is in host/native order, and ICMP info is here too. 
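+ * The htons() calls below keep the event's ports in network byte
+ * order, consistent with what ipf_log_cfwlog() records from is_*port.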
+ */ + event.cfwev_sport = htons(fin->fin_sport); + event.cfwev_dport = htons(fin->fin_dport); + + switch (fin->fin_v) { + case IPV4_VERSION: + IN6_INADDR_TO_V4MAPPED(&fin->fin_src, &event.cfwev_saddr); + IN6_INADDR_TO_V4MAPPED(&fin->fin_dst, &event.cfwev_daddr); + break; + case IPV6_VERSION: + event.cfwev_saddr = fin->fin_src6.in6; + event.cfwev_daddr = fin->fin_dst6.in6; + break; + default: + /* We should never reach here, but mark it if we do. */ + DTRACE_PROBE1(ipf__cfw__frinfo__badipversion, frinfo_t *, fin); + return; + } + + /* + * uniqtime() is what ipf's GETKTIME() uses. + * If cfwev_tstamp needs to be sourced from elsewhere, fix that here. + */ + uniqtime(&event.cfwev_tstamp); + event.cfwev_zonedid = ifs_to_did(ifs); + event.cfwev_ruleid = fin->fin_rule; + memcpy(event.cfwev_ruleuuid, fr->fr_uuid, sizeof (uuid_t)); + + ipf_cfwev_report(&event); +} + +/* + * ipf_log_cfwlog() + * + * Twin of ipstate_log(), but records state events for the global-zone data + * collector. + */ +void +ipf_log_cfwlog(struct ipstate *is, uint_t type, ipf_stack_t *ifs) +{ + cfwev_t event = {0}; + + switch (type) { + case ISL_NEW: + case ISL_CLONE: + event.cfwev_type = CFWEV_BEGIN; + break; + case ISL_EXPIRE: + case ISL_FLUSH: + case ISL_REMOVE: + case ISL_KILLED: + case ISL_ORPHAN: + /* + * We don't care about session disappearances in CFW logging + * for now. (Possible future: CFWEV_END) + */ + return; + default: + event.cfwev_type = CFWEV_BLOCK; + break; + } + + /* + * IPF code elsewhere does the cheesy single-flag check, even though + * there are two flags in a rule (one for in, one for out). Follow + * suit here. + */ + event.cfwev_length = sizeof (event); + ASSERT(is->is_rule != NULL); + event.cfwev_direction = (is->is_rule->fr_flags & FR_INQUE) ? + CFWDIR_IN : CFWDIR_OUT; + event.cfwev_protocol = is->is_p; + switch (is->is_p) { + case IPPROTO_TCP: + case IPPROTO_UDP: + /* NOTE: is_*port is in network order. */ + event.cfwev_sport = is->is_sport; + event.cfwev_dport = is->is_dport; + break; + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: + /* Scribble the ICMP type in sport... */ + event.cfwev_sport = is->is_icmp.ici_type; + break; + /* Other protocols leave the event's port fields empty. */ + } + + switch(is->is_v) { + case IPV4_VERSION: + IN6_INADDR_TO_V4MAPPED(&is->is_src.in4, &event.cfwev_saddr); + IN6_INADDR_TO_V4MAPPED(&is->is_dst.in4, &event.cfwev_daddr); + break; + case IPV6_VERSION: + event.cfwev_saddr = is->is_src.in6; + event.cfwev_daddr = is->is_dst.in6; + break; + default: + /* Can't parse addresses if we don't know the version. Drop. */ + DTRACE_PROBE1(ipf__cfw__ipstate__badipversion, + struct ipstate *, is); + return; + } + + /* + * uniqtime() is what ipf's GETKTIME() uses. + * If cfwev_tstamp needs to be sourced from elsewhere, fix that here. + */ + uniqtime(&event.cfwev_tstamp); + event.cfwev_zonedid = ifs_to_did(ifs); + event.cfwev_ruleid = is->is_rulen; + memcpy(event.cfwev_ruleuuid, is->is_uuid, sizeof (uuid_t)); + + ipf_cfwev_report(&event); +} + +/* + * Callback routine we use for ipf_cfwev_consume_many(). + * Returning 0 means error indication. + */ +static uint_t +cfwlog_read_manycb(cfwev_t *evptr, uint_t num_avail, void *cbarg) +{ + uio_error_t *ue = (uio_error_t *)cbarg; + + ASSERT(MUTEX_HELD(&cfw_ringlock)); + + if (ue->ue_error != 0) + return (0); + + ue->ue_error = uiomove((caddr_t)evptr, num_avail * sizeof (*evptr), + UIO_READ, ue->ue_uio); + if (ue->ue_error != 0) + return (0); + + return (num_avail); +} + +/* + * Resize the CFW event ring buffer. 
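+ * (Used for first-time creation and netstack-unload destruction as
+ * well as from the SIOCIPFCFWNEWSZ ioctl handler below.)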
+ * + * The caller must ensure the new size is a power of 2 between + * IPF_CFW_{MIN,MAX}_RING_BUFS (inclusive) or the special values + * IPF_CFW_RING_ALLOCATE (first-time creation) or IPF_CFW_RING_DESTROY + * (netstack-unload destruction). + * + * Everything in the current ring will be destroyed (and reported as a drop) + * upon resize. + */ +int +ipf_cfw_ring_resize(uint32_t newsize) +{ + ASSERT(MUTEX_HELD(&cfw_ringlock) || newsize == IPF_CFW_RING_ALLOCATE || + newsize == IPF_CFW_RING_DESTROY); + + if (newsize == IPF_CFW_RING_ALLOCATE) { + if (cfw_ring != NULL) + return (EBUSY); + newsize = IPF_CFW_DEFAULT_RING_BUFS; + /* Fall through to allocating a new ring buffer. */ + } else { + /* We may be called during error cleanup, so be liberal here. */ + if ((cfw_ring == NULL && newsize == IPF_CFW_RING_DESTROY) || + newsize == cfw_ringsize) { + return (0); + } + kmem_free(cfw_ring, cfw_ringsize * sizeof (cfwev_t)); + cfw_ring = NULL; + if (cfw_ringfull) { + cfw_evdrops += cfw_ringsize; + } else if (cfw_ringstart > cfw_ringend) { + cfw_evdrops += cfw_ringend + + (cfw_ringsize - cfw_ringstart); + } else { + cfw_evdrops += cfw_ringend - cfw_ringstart; + } + cfw_ringsize = cfw_ringmask = cfw_ringstart = cfw_ringend = 0; + cfw_ringfull = B_FALSE; + + if (newsize == IPF_CFW_RING_DESTROY) + return (0); + /* + * Keep the reports & drops around because if we're just + * resizing, we need to know what we lost. + */ + } + + ASSERT(ISP2(newsize)); + cfw_ring = kmem_alloc(newsize * sizeof (cfwev_t), KM_SLEEP); + /* KM_SLEEP means we always succeed. */ + cfw_ringsize = newsize; + cfw_ringmask = cfw_ringsize - 1; + + return (0); +} + +/* + * ioctl handler for /dev/ipfev. Only supports SIOCIPFCFWCFG (get data + * collector statistics and configuration), and SIOCIPFCFWNEWSZ (resize the + * event ring buffer). + */ +/* ARGSUSED */ +int +ipf_cfwlog_ioctl(dev_t dev, int cmd, intptr_t data, int mode, cred_t *cp, + int *rp) +{ + ipfcfwcfg_t cfginfo; + int error; + + if (cmd != SIOCIPFCFWCFG && cmd != SIOCIPFCFWNEWSZ) + return (EIO); + + if (crgetzoneid(cp) != GLOBAL_ZONEID) + return (EACCES); + + error = COPYIN((caddr_t)data, (caddr_t)&cfginfo, sizeof (cfginfo)); + if (error != 0) + return (EFAULT); + + cfginfo.ipfcfwc_maxevsize = sizeof (cfwev_t); + mutex_enter(&cfw_ringlock); + cfginfo.ipfcfwc_evreports = cfw_evreports; + if (cmd == SIOCIPFCFWNEWSZ) { + uint32_t newsize = cfginfo.ipfcfwc_evringsize; + + /* Do ioctl parameter checking here, then call the resizer. */ + if (newsize < IPF_CFW_MIN_RING_BUFS || + newsize > IPF_CFW_MAX_RING_BUFS || !ISP2(newsize)) { + error = EINVAL; + } else { + error = ipf_cfw_ring_resize(cfginfo.ipfcfwc_evringsize); + } + } else { + error = 0; + } + /* Both cfw_evdrops and cfw_ringsize are affected by resize. */ + cfginfo.ipfcfwc_evdrops = cfw_evdrops; + cfginfo.ipfcfwc_evringsize = cfw_ringsize; + mutex_exit(&cfw_ringlock); + + if (error != 0) + return (error); + + error = COPYOUT((caddr_t)&cfginfo, (caddr_t)data, sizeof (cfginfo)); + if (error != 0) + return (EFAULT); + + return (0); +} + +/* + * Send events up via /dev/ipfev reads. Will return only complete events. 
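 * An illustrative global-zone reader of those events (hypothetical, and
 * 64-bit only per the struct timeval note in ipf_cfw.h); process_event()
 * is made up for the sketch:
 *
 *	cfwev_t evbuf[32];
 *	int fd = open("/dev/ipfev", O_RDONLY);
 *	ssize_t n = read(fd, evbuf, sizeof (evbuf));
 *	for (i = 0; i < n / sizeof (cfwev_t); i++)
 *		process_event(&evbuf[i]);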
+ */ +/* ARGSUSED */ +int +ipf_cfwlog_read(dev_t dev, struct uio *uio, cred_t *cp) +{ + uint_t requested, consumed; + uio_error_t ue = {uio, 0}; + boolean_t block; + + if (uio->uio_resid == 0) + return (0); + if (uio->uio_resid < sizeof (cfwev_t)) + return (EINVAL); + + block = ((uio->uio_fmode & (FNDELAY | FNONBLOCK)) == 0); + requested = uio->uio_resid / sizeof (cfwev_t); + + /* + * As stated earlier, ipf_cfwev_consume_many() takes a callback. + * The callback may be called multiple times before we return. + * The callback will execute uiomove(). + */ + consumed = ipf_cfwev_consume_many(requested, block, cfwlog_read_manycb, + &ue); + ASSERT3U(consumed, <=, requested); + if (!block && consumed == 0 && ue.ue_error == 0) { + /* No data available. */ + ue.ue_error = EWOULDBLOCK; + } else if (ue.ue_error != 0 && ue.ue_error != EINTR) { + /* + * We had a problem that wasn't simply a + * case of cv_wait_sig() receiving a signal. + */ + DTRACE_PROBE1(ipf__cfw__uiodiscard, int, consumed); + mutex_enter(&cfw_ringlock); + cfw_evdrops += consumed; + mutex_exit(&cfw_ringlock); + } + return (ue.ue_error); +} + +#else /* _KERNEL */ + +/* Blank stubs to satisfy userland's test compilations. */ + +int +ipf_cfw_ring_resize(uint32_t a) +{ + return (0); +} + +void +ipf_log_cfwlog(struct ipstate *a, uint_t b, ipf_stack_t *c) +{ +} + +void +ipf_block_cfwlog(frentry_t *a, fr_info_t *b, ipf_stack_t *c) +{ +} + +#endif /* _KERNEL */ diff --git a/usr/src/uts/common/inet/ipf/fil.c b/usr/src/uts/common/inet/ipf/fil.c index 78980be106..48fa6e7325 100644 --- a/usr/src/uts/common/inet/ipf/fil.c +++ b/usr/src/uts/common/inet/ipf/fil.c @@ -5,7 +5,7 @@ * * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. */ #if defined(KERNEL) || defined(_KERNEL) @@ -2588,6 +2588,9 @@ ipf_stack_t *ifs; } #endif + if (IFS_CFWLOG(ifs, fr) && FR_ISBLOCK(pass)) + ipf_block_cfwlog(fr, fin, ifs); + /* * The FI_STATE flag is cleared here so that calling fr_checkstate * will work when called from inside of fr_fastroute. Although diff --git a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c index c9d5f03e13..0d34e0fce3 100644 --- a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c +++ b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c @@ -5,7 +5,7 @@ * * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright 2018 Joyent, Inc. + * Copyright 2019 Joyent, Inc. 
*/ #if !defined(lint) @@ -85,6 +85,14 @@ static int ipf_hook6_loop_out __P((hook_event_token_t, hook_data_t, static int ipf_hook6_loop_in __P((hook_event_token_t, hook_data_t, void *)); static int ipf_hook6 __P((hook_data_t, int, int, void *)); +static int ipf_hookvndl3v4_in __P((hook_event_token_t, hook_data_t, + void *)); +static int ipf_hookvndl3v6_in __P((hook_event_token_t, hook_data_t, + void *)); +static int ipf_hookvndl3v4_out __P((hook_event_token_t, hook_data_t, + void *)); +static int ipf_hookvndl3v6_out __P((hook_event_token_t, hook_data_t, + void *)); static int ipf_hookviona_in __P((hook_event_token_t, hook_data_t, void *)); static int ipf_hookviona_out __P((hook_event_token_t, hook_data_t, @@ -116,7 +124,7 @@ u_long *ip_forwarding = NULL; #endif vmem_t *ipf_minor; /* minor number arena */ -void *ipf_state; /* DDI state */ +void *ipf_state; /* DDI state */ /* * GZ-controlled and per-zone stacks: @@ -141,28 +149,38 @@ void *ipf_state; /* DDI state */ */ /* IPv4 hook names */ -char *hook4_nicevents = "ipfilter_hook4_nicevents"; -char *hook4_nicevents_gz = "ipfilter_hook4_nicevents_gz"; -char *hook4_in = "ipfilter_hook4_in"; -char *hook4_in_gz = "ipfilter_hook4_in_gz"; -char *hook4_out = "ipfilter_hook4_out"; -char *hook4_out_gz = "ipfilter_hook4_out_gz"; -char *hook4_loop_in = "ipfilter_hook4_loop_in"; -char *hook4_loop_in_gz = "ipfilter_hook4_loop_in_gz"; -char *hook4_loop_out = "ipfilter_hook4_loop_out"; -char *hook4_loop_out_gz = "ipfilter_hook4_loop_out_gz"; +char *hook4_nicevents = "ipfilter_hook4_nicevents"; +char *hook4_nicevents_gz = "ipfilter_hook4_nicevents_gz"; +char *hook4_in = "ipfilter_hook4_in"; +char *hook4_in_gz = "ipfilter_hook4_in_gz"; +char *hook4_out = "ipfilter_hook4_out"; +char *hook4_out_gz = "ipfilter_hook4_out_gz"; +char *hook4_loop_in = "ipfilter_hook4_loop_in"; +char *hook4_loop_in_gz = "ipfilter_hook4_loop_in_gz"; +char *hook4_loop_out = "ipfilter_hook4_loop_out"; +char *hook4_loop_out_gz = "ipfilter_hook4_loop_out_gz"; /* IPv6 hook names */ -char *hook6_nicevents = "ipfilter_hook6_nicevents"; -char *hook6_nicevents_gz = "ipfilter_hook6_nicevents_gz"; -char *hook6_in = "ipfilter_hook6_in"; -char *hook6_in_gz = "ipfilter_hook6_in_gz"; -char *hook6_out = "ipfilter_hook6_out"; -char *hook6_out_gz = "ipfilter_hook6_out_gz"; -char *hook6_loop_in = "ipfilter_hook6_loop_in"; -char *hook6_loop_in_gz = "ipfilter_hook6_loop_in_gz"; -char *hook6_loop_out = "ipfilter_hook6_loop_out"; -char *hook6_loop_out_gz = "ipfilter_hook6_loop_out_gz"; +char *hook6_nicevents = "ipfilter_hook6_nicevents"; +char *hook6_nicevents_gz = "ipfilter_hook6_nicevents_gz"; +char *hook6_in = "ipfilter_hook6_in"; +char *hook6_in_gz = "ipfilter_hook6_in_gz"; +char *hook6_out = "ipfilter_hook6_out"; +char *hook6_out_gz = "ipfilter_hook6_out_gz"; +char *hook6_loop_in = "ipfilter_hook6_loop_in"; +char *hook6_loop_in_gz = "ipfilter_hook6_loop_in_gz"; +char *hook6_loop_out = "ipfilter_hook6_loop_out"; +char *hook6_loop_out_gz = "ipfilter_hook6_loop_out_gz"; + +/* vnd IPv4/v6 hook names */ +char *hook4_vnd_in = "ipfilter_hookvndl3v4_in"; +char *hook4_vnd_in_gz = "ipfilter_hookvndl3v4_in_gz"; +char *hook6_vnd_in = "ipfilter_hookvndl3v6_in"; +char *hook6_vnd_in_gz = "ipfilter_hookvndl3v6_in_gz"; +char *hook4_vnd_out = "ipfilter_hookvndl3v4_out"; +char *hook4_vnd_out_gz = "ipfilter_hookvndl3v4_out_gz"; +char *hook6_vnd_out = "ipfilter_hookvndl3v6_out"; +char *hook6_vnd_out_gz = "ipfilter_hookvndl3v6_out_gz"; /* viona hook names */ char *hook_viona_in = "ipfilter_hookviona_in"; @@ -170,6 +188,39 
@@ char *hook_viona_in_gz = "ipfilter_hookviona_in_gz"; char *hook_viona_out = "ipfilter_hookviona_out"; char *hook_viona_out_gz = "ipfilter_hookviona_out_gz"; +/* + * For VIONA. The net_{instance,protocol}_notify_register() functions only + * deal with per-callback-function granularity. We need two wrapper functions + * for GZ-controlled and per-zone instances. + */ +static int +ipf_hook_instance_notify_gz(hook_notify_cmd_t command, void *arg, + const char *netid, const char *dummy, const char *instance) +{ + return (ipf_hook_instance_notify(command, arg, netid, dummy, instance)); +} + +static int +ipf_hook_instance_notify_ngz(hook_notify_cmd_t command, void *arg, + const char *netid, const char *dummy, const char *instance) +{ + return (ipf_hook_instance_notify(command, arg, netid, dummy, instance)); +} + +static int +ipf_hook_protocol_notify_gz(hook_notify_cmd_t command, void *arg, + const char *name, const char *dummy, const char *he_name) +{ + return (ipf_hook_protocol_notify(command, arg, name, dummy, he_name)); +} + +static int +ipf_hook_protocol_notify_ngz(hook_notify_cmd_t command, void *arg, + const char *name, const char *dummy, const char *he_name) +{ + return (ipf_hook_protocol_notify(command, arg, name, dummy, he_name)); +} + /* ------------------------------------------------------------------------ */ /* Function: ipldetach */ /* Returns: int - 0 == success, else error. */ @@ -267,10 +318,36 @@ ipf_stack_t *ifs; } /* + * Remove VND hooks + */ + if (ifs->ifs_ipf_vndl3v4 != NULL) { + UNDO_HOOK(ifs_ipf_vndl3v4, ifs_hookvndl3v4_physical_in, + NH_PHYSICAL_IN, ifs_ipfhookvndl3v4_in); + UNDO_HOOK(ifs_ipf_vndl3v4, ifs_hookvndl3v4_physical_out, + NH_PHYSICAL_OUT, ifs_ipfhookvndl3v4_out); + + if (net_protocol_release(ifs->ifs_ipf_vndl3v4) != 0) + goto detach_failed; + ifs->ifs_ipf_vndl3v4 = NULL; + } + + if (ifs->ifs_ipf_vndl3v6 != NULL) { + UNDO_HOOK(ifs_ipf_vndl3v6, ifs_hookvndl3v6_physical_in, + NH_PHYSICAL_IN, ifs_ipfhookvndl3v6_in); + UNDO_HOOK(ifs_ipf_vndl3v6, ifs_hookvndl3v6_physical_out, + NH_PHYSICAL_OUT, ifs_ipfhookvndl3v6_out); + + if (net_protocol_release(ifs->ifs_ipf_vndl3v6) != 0) + goto detach_failed; + ifs->ifs_ipf_vndl3v6 = NULL; + } + + /* * Remove notification of viona hooks */ net_instance_notify_unregister(ifs->ifs_netid, - ipf_hook_instance_notify); + ifs->ifs_gz_controlled ? ipf_hook_instance_notify_gz : + ipf_hook_instance_notify_ngz); #undef UNDO_HOOK @@ -278,6 +355,10 @@ ipf_stack_t *ifs; * Normally, viona will unregister itself before ipldetach() is called, * so these will be no-ops, but out of caution, we try to make sure * we've removed any of our references. + * + * For now, the _gz and _ngz versions are both wrappers to what's + * below. Just call it directly, but if that changes fix here as + * well. */ (void) ipf_hook_protocol_notify(HN_UNREGISTER, ifs, Hn_VIONA, NULL, NH_PHYSICAL_IN); @@ -295,6 +376,10 @@ ipf_stack_t *ifs; * traced, we pass the same value the nethook framework would * pass, even though the callback does not currently use the * value. + * + * For now, the _gz and _ngz versions are both wrappers to + * what's below. Just call it directly, but if that changes + * fix here as well. 
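+ * The split exists because the nethook framework distinguishes
+ * registrations only by callback pointer; a sketch of the matching
+ * unregister call (mirroring the register path above, not a new API):
+ *
+ *	net_instance_notify_unregister(ifs->ifs_netid,
+ *	    ifs->ifs_gz_controlled ? ipf_hook_instance_notify_gz :
+ *	    ipf_hook_instance_notify_ngz);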
*/ (void) ipf_hook_instance_notify(HN_UNREGISTER, ifs, netidstr, NULL, Hn_VIONA); @@ -495,6 +580,49 @@ ipf_stack_t *ifs; } /* + * Add VND INET hooks + */ + ifs->ifs_ipf_vndl3v4 = net_protocol_lookup(id, NHF_VND_INET); + if (ifs->ifs_ipf_vndl3v4 == NULL) + goto hookup_failed; + + HOOK_INIT_GZ_BEFORE(ifs->ifs_ipfhookvndl3v4_in, ipf_hookvndl3v4_in, + hook4_vnd_in, hook4_vnd_in_gz, ifs); + HOOK_INIT_GZ_AFTER(ifs->ifs_ipfhookvndl3v4_out, ipf_hookvndl3v4_out, + hook4_vnd_out, hook4_vnd_out_gz, ifs); + ifs->ifs_hookvndl3v4_physical_in = (net_hook_register(ifs->ifs_ipf_vndl3v4, + NH_PHYSICAL_IN, ifs->ifs_ipfhookvndl3v4_in) == 0); + if (!ifs->ifs_hookvndl3v4_physical_in) + goto hookup_failed; + + ifs->ifs_hookvndl3v4_physical_out = (net_hook_register(ifs->ifs_ipf_vndl3v4, + NH_PHYSICAL_OUT, ifs->ifs_ipfhookvndl3v4_out) == 0); + if (!ifs->ifs_hookvndl3v4_physical_out) + goto hookup_failed; + + + /* + * VND INET6 hooks + */ + ifs->ifs_ipf_vndl3v6 = net_protocol_lookup(id, NHF_VND_INET6); + if (ifs->ifs_ipf_vndl3v6 == NULL) + goto hookup_failed; + + HOOK_INIT_GZ_BEFORE(ifs->ifs_ipfhookvndl3v6_in, ipf_hookvndl3v6_in, + hook6_vnd_in, hook6_vnd_in_gz, ifs); + HOOK_INIT_GZ_AFTER(ifs->ifs_ipfhookvndl3v6_out, ipf_hookvndl3v6_out, + hook6_vnd_out, hook6_vnd_out_gz, ifs); + ifs->ifs_hookvndl3v6_physical_in = (net_hook_register(ifs->ifs_ipf_vndl3v6, + NH_PHYSICAL_IN, ifs->ifs_ipfhookvndl3v6_in) == 0); + if (!ifs->ifs_hookvndl3v6_physical_in) + goto hookup_failed; + + ifs->ifs_hookvndl3v6_physical_out = (net_hook_register(ifs->ifs_ipf_vndl3v6, + NH_PHYSICAL_OUT, ifs->ifs_ipfhookvndl3v6_out) == 0); + if (!ifs->ifs_hookvndl3v6_physical_out) + goto hookup_failed; + + /* * VIONA INET hooks. While the nethook framework allows us to register * hooks for events that haven't been registered yet, we instead * register and unregister our hooks in response to notifications @@ -504,9 +632,15 @@ ipf_stack_t *ifs; * is unloaded, the viona module cannot later re-register them if it * gets reloaded. As the ip, vnd, and ipf modules are rarely unloaded * even on DEBUG kernels, they do not experience this issue. + * + * Today, the per-zone ones don't matter for a BHYVE-branded zone, BUT + * the ipf_hook_protocol_notify() function is GZ vs. per-zone aware. + * Employ two different versions of ipf_hook_instance_notify(), one for + * the GZ-controlled, and one for the per-zone one. */ - if (net_instance_notify_register(id, ipf_hook_instance_notify, - ifs) != 0) + if (net_instance_notify_register(id, ifs->ifs_gz_controlled ? + ipf_hook_instance_notify_gz : ipf_hook_instance_notify_ngz, ifs) != + 0) goto hookup_failed; /* @@ -688,6 +822,7 @@ ipf_hook_instance_notify(hook_notify_cmd_t command, void *arg, { ipf_stack_t *ifs = arg; int ret = 0; + const boolean_t gz = ifs->ifs_gz_controlled; /* We currently only care about viona hooks */ if (strcmp(instance, Hn_VIONA) != 0) @@ -705,14 +840,16 @@ ipf_hook_instance_notify(hook_notify_cmd_t command, void *arg, return (EPROTONOSUPPORT); ret = net_protocol_notify_register(ifs->ifs_ipf_viona, - ipf_hook_protocol_notify, ifs); + gz ? ipf_hook_protocol_notify_gz : + ipf_hook_protocol_notify_ngz, ifs); VERIFY(ret == 0 || ret == ESHUTDOWN); break; case HN_UNREGISTER: if (ifs->ifs_ipf_viona == NULL) break; VERIFY0(net_protocol_notify_unregister(ifs->ifs_ipf_viona, - ipf_hook_protocol_notify)); + gz ? 
ipf_hook_protocol_notify_gz :
+	    ipf_hook_protocol_notify_ngz));
 		VERIFY0(net_protocol_release(ifs->ifs_ipf_viona));
 		ifs->ifs_ipf_viona = NULL;
 		break;
@@ -821,6 +958,9 @@ int *rp;
 		return ENXIO;
 	unit = isp->ipfs_minor;
+	if (unit == IPL_LOGEV)
+		return (ipf_cfwlog_ioctl(dev, cmd, data, mode, cp, rp));
+
 	zid = crgetzoneid(cp);
 	if (cmd == SIOCIPFZONESET) {
 		if (zid == GLOBAL_ZONEID)
@@ -1129,14 +1269,14 @@ ipf_stack_t *ifs;
 {
 	net_handle_t nif;
-	if (v == 4)
-		nif = ifs->ifs_ipf_ipv4;
-	else if (v == 6)
-		nif = ifs->ifs_ipf_ipv6;
-	else
-		return 0;
-
-	return (net_phylookup(nif, name));
+	if (v == 4)
+		nif = ifs->ifs_ipf_ipv4;
+	else if (v == 6)
+		nif = ifs->ifs_ipf_ipv6;
+	else
+		return 0;
+
+	return (net_phylookup(nif, name));
 }
 /*
@@ -1161,11 +1301,35 @@ cred_t *cred;
 	if (IPL_LOGMAX < min)
 		return ENXIO;
+	/* Special-case ipfev: global-zone-open only. */
+	if (min == IPL_LOGEV) {
+		if (crgetzoneid(cred) != GLOBAL_ZONEID)
+			return (ENXIO);
+		/*
+		 * Else enable the CFW logging of events.
+		 * NOTE: For now, we only allow one open at a time.
+		 * Use atomic_cas to confirm/deny. And also for now,
+		 * assume sizeof (boolean_t) == sizeof (uint_t).
+		 *
+		 * Per the *_{refrele,REFRELE}() in other parts of inet,
+		 * ensure all loads/stores complete before calling cas.
+		 * membar_exit() does this.
+		 */
+		membar_exit();
+		if (atomic_cas_uint(&ipf_cfwlog_enabled, 0, 1) != 0)
+			return (EBUSY);
+	}
+
 	minor = (minor_t)(uintptr_t)vmem_alloc(ipf_minor, 1,
 	    VM_BESTFIT | VM_SLEEP);
 	if (ddi_soft_state_zalloc(ipf_state, minor) != 0) {
 		vmem_free(ipf_minor, (void *)(uintptr_t)minor, 1);
+		if (min == IPL_LOGEV) {
+			/* See above... */
+			membar_exit();
+			VERIFY(atomic_cas_uint(&ipf_cfwlog_enabled, 1, 0) == 1);
+		}
 		return ENXIO;
 	}
@@ -1187,6 +1351,7 @@ int flags, otype;
 cred_t *cred;
 {
 	minor_t min = getminor(dev);
+	ipf_devstate_t *isp;
 #ifdef IPFDEBUG
 	cmn_err(CE_CONT, "iplclose(%x,%x,%x,%x)\n", dev, flags, otype, cred);
@@ -1195,6 +1360,15 @@ cred_t *cred;
 	if (IPL_LOGMAX < min)
 		return ENXIO;
+	isp = ddi_get_soft_state(ipf_state, min);
+	if (isp != NULL && isp->ipfs_minor == IPL_LOGEV) {
+		/*
+		 * Disable CFW logging. See iplopen() for details.
+		 */
+		membar_exit();
+		VERIFY(atomic_cas_uint(&ipf_cfwlog_enabled, 1, 0) == 1);
+	}
+
 	ddi_soft_state_free(ipf_state, min);
 	vmem_free(ipf_minor, (void *)(uintptr_t)min, 1);
@@ -1225,6 +1399,8 @@ cred_t *cp;
 		return ENXIO;
 	unit = isp->ipfs_minor;
+	if (unit == IPL_LOGEV)
+		return (ipf_cfwlog_read(dev, uio, cp));
 	/*
 	 * ipf_find_stack returns with a read lock on ifs_ipf_global
@@ -1277,6 +1453,9 @@ cred_t *cp;
 		return ENXIO;
 	unit = isp->ipfs_minor;
+	if (unit == IPL_LOGEV)
+		return (EIO); /* ipfev doesn't support write yet. */
+
 	/*
 	 * ipf_find_stack returns with a read lock on ifs_ipf_global
 	 */
@@ -2068,8 +2247,11 @@ frdest_t *fdp;
 		return (-1);
 	}
-	/* Check the src here, fin_ifp is the src interface. */
-	if (!(fin->fin_flx & FI_GENERATED) &&
+	/*
+	 * If we're forwarding (vs. injecting), check the src here, fin_ifp is
+	 * the src interface.
+	 */
+	if (fdp != NULL && !(fin->fin_flx & FI_GENERATED) &&
 	    !fr_forwarding_enabled((phy_if_t)fin->fin_ifp, net_data_p)) {
 		return (-1);
 	}
@@ -2138,8 +2320,8 @@ frdest_t *fdp;
 		inj->ni_physical = net_routeto(net_data_p, sinp, NULL);
 	}
-	/* we're checking the destination here */
-	if (!(fin->fin_flx & FI_GENERATED) &&
+	/* If we're forwarding (vs. injecting), check the destination here.
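+	 * (A NULL fdp is how callers ask to inject rather than forward;
+	 * an illustrative call for a packet ipf generated itself, with
+	 * the exact argument list elided:
+	 *
+	 *	(void) fr_fastroute(m, &m, fin, NULL);
+	 *
+	 * so only genuinely forwarded traffic is held to the interface's
+	 * forwarding setting.)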
 */
+	if (fdp != NULL && !(fin->fin_flx & FI_GENERATED) &&
 	    !fr_forwarding_enabled(inj->ni_physical, net_data_p)) {
 		goto bad_fastroute;
 	}
@@ -2355,6 +2537,42 @@ int ipf_hook_ether(hook_event_token_t token, hook_data_t info, void *arg,
 }
 /* ------------------------------------------------------------------------ */
+/* Function:    ipf_hookvndl3_in                                            */
+/* Returns:     int - 0 == packet ok, else problem, free packet if not done */
+/* Parameters:  event(I)     - pointer to event                             */
+/*              info(I)      - pointer to hook information for firewalling  */
+/*                                                                          */
+/* The vnd hooks are private hooks to ON. They represent a layer 2          */
+/* datapath generally used to implement virtual machines. The driver sends  */
+/* along L3 packets of either type IP or IPv6. The ethertype to distinguish */
+/* them is in the upper 16 bits while the remaining bits are the            */
+/* traditional packet hook flags.                                           */
+/*                                                                          */
+/* They end up calling the appropriate traditional ip hooks.                */
+/* ------------------------------------------------------------------------ */
+/*ARGSUSED*/
+int ipf_hookvndl3v4_in(hook_event_token_t token, hook_data_t info, void *arg)
+{
+	return ipf_hook4_in(token, info, arg);
+}
+
+int ipf_hookvndl3v6_in(hook_event_token_t token, hook_data_t info, void *arg)
+{
+	return ipf_hook6_in(token, info, arg);
+}
+
+/*ARGSUSED*/
+int ipf_hookvndl3v4_out(hook_event_token_t token, hook_data_t info, void *arg)
+{
+	return ipf_hook4_out(token, info, arg);
+}
+
+int ipf_hookvndl3v6_out(hook_event_token_t token, hook_data_t info, void *arg)
+{
+	return ipf_hook6_out(token, info, arg);
+}
+
+/* ------------------------------------------------------------------------ */
 /* Function:    ipf_hookviona_{in,out}                                      */
 /* Returns:     int - 0 == packet ok, else problem, free packet if not done */
 /* Parameters:  event(I)     - pointer to event                             */
@@ -3120,16 +3338,16 @@ fr_info_t *fin;
 /* both IP versions. The details are going to be explained here.            */
 /*                                                                          */
 /* The packet looks as follows:                                             */
-/*    xxx | IP hdr | IP payload ...  |                                      */
-/*    ^   ^        ^                 ^                                      */
-/*    |   |        |                 |                                      */
+/*    xxx | IP hdr | IP payload ...  |                                      */
+/*    ^   ^        ^                 ^                                      */
+/*    |   |        |                 |                                      */
 /*    |   |        |                 fin_m->b_wptr = fin->fin_dp + fin->fin_dlen */
 /*    |   |        |                                                        */
 /*    |   |        `- fin_m->fin_dp (in case of IPv4 points to L4 header)   */
 /*    |   |                                                                 */
 /*    |   `- fin_m->b_rptr + fin_ipoff (fin_ipoff is most likely 0 in case  */
 /*    |      of loopback)                                                   */
-/*    |                                                                     */
+/*    |                                                                     */
 /*    `- fin_m->b_rptr - points to L2 header in case of physical NIC        */
 /*                                                                          */
 /* All relevant IP headers are pulled up into the first mblk. It happened   */
diff --git a/usr/src/uts/common/inet/ipf/ip_log.c b/usr/src/uts/common/inet/ipf/ip_log.c
index 584ee42d9a..b70e320def 100644
--- a/usr/src/uts/common/inet/ipf/ip_log.c
+++ b/usr/src/uts/common/inet/ipf/ip_log.c
@@ -8,7 +8,7 @@
 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 *
- * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
*/ #include <sys/param.h> @@ -373,9 +373,11 @@ u_int flags; if (fin->fin_fr != NULL) { ipfl.fl_loglevel = fin->fin_fr->fr_loglevel; ipfl.fl_logtag = fin->fin_fr->fr_logtag; + bcopy(fin->fin_fr->fr_uuid, ipfl.fl_uuid, sizeof (uuid_t)); } else { ipfl.fl_loglevel = 0xffff; ipfl.fl_logtag = FR_NOLOGTAG; + bzero(ipfl.fl_uuid, sizeof (uuid_t)); } if (fin->fin_nattag != NULL) bcopy(fin->fin_nattag, (void *)&ipfl.fl_nattag, diff --git a/usr/src/uts/common/inet/ipf/ip_state.c b/usr/src/uts/common/inet/ipf/ip_state.c index 184f8775b6..a45bcbfdaf 100644 --- a/usr/src/uts/common/inet/ipf/ip_state.c +++ b/usr/src/uts/common/inet/ipf/ip_state.c @@ -5,7 +5,7 @@ * * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. */ #if defined(KERNEL) || defined(_KERNEL) @@ -108,6 +108,7 @@ struct file; # include <sys/systm.h> # endif #endif +#include <sys/uuid.h> /* END OF INCLUDES */ @@ -1445,6 +1446,7 @@ u_int flags; is->is_sti.tqe_flags |= TQE_RULEBASED; } is->is_tag = fr->fr_logtag; + memcpy(is->is_uuid, fr->fr_uuid, sizeof (uuid_t)); is->is_ifp[(out << 1) + 1] = fr->fr_ifas[1]; is->is_ifp[(1 - out) << 1] = fr->fr_ifas[2]; @@ -1524,6 +1526,9 @@ u_int flags; if (ifs->ifs_ipstate_logging) ipstate_log(is, ISL_NEW, ifs); + if (IFS_CFWLOG(ifs, is->is_rule)) + ipf_log_cfwlog(is, ISL_NEW, ifs); + RWLOCK_EXIT(&ifs->ifs_ipf_state); fin->fin_rev = IP6_NEQ(&is->is_dst, &fin->fin_daddr); fin->fin_flx |= FI_STATE; @@ -2314,6 +2319,8 @@ u_32_t cmask; is->is_flags &= ~(SI_W_SPORT|SI_W_DPORT); if ((flags & SI_CLONED) && ifs->ifs_ipstate_logging) ipstate_log(is, ISL_CLONE, ifs); + if ((flags & SI_CLONED) && IFS_CFWLOG(ifs, is->is_rule)) + ipf_log_cfwlog(is, ISL_CLONE, ifs); } ret = -1; @@ -3397,6 +3404,15 @@ ipf_stack_t *ifs; if (ifs->ifs_ipstate_logging != 0 && why != 0) ipstate_log(is, why, ifs); + /* + * For now, ipf_log_cfwlog() copes with all "why" values. Strictly + * speaking, though, they all map to one event (CFWEV_END), which for + * now is not supported, hence no code calling ipf_log_cfwlog() like + * below: + * + * if (why != 0 && IFS_CFWLOG(ifs, is->is_rule)) + * ipf_log_cfwlog(is, why, ifs); + */ if (is->is_rule != NULL) { is->is_rule->fr_statecnt--; @@ -3931,7 +3947,6 @@ int flags; return rval; } - /* ------------------------------------------------------------------------ */ /* Function: ipstate_log */ /* Returns: Nil */ diff --git a/usr/src/uts/common/inet/ipf/ipf.conf b/usr/src/uts/common/inet/ipf/ipf.conf index 6b36f9fdbf..f49e024a72 100644 --- a/usr/src/uts/common/inet/ipf/ipf.conf +++ b/usr/src/uts/common/inet/ipf/ipf.conf @@ -1,3 +1,8 @@ # # name="ipf" parent="pseudo" instance=0; + +# Increase the state table limits. fr_statemax should be ~70% of fr_statesize, +# and both should be prime numbers +fr_statesize=151007; +fr_statemax=113279; diff --git a/usr/src/uts/common/inet/ipf/netinet/Makefile b/usr/src/uts/common/inet/ipf/netinet/Makefile index cca3b48ac4..88f91e633f 100644 --- a/usr/src/uts/common/inet/ipf/netinet/Makefile +++ b/usr/src/uts/common/inet/ipf/netinet/Makefile @@ -1,16 +1,15 @@ # -#ident "%Z%%M% %I% %E% SMI" -# # Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. +# Copyright 2019 Joyent, Inc. 
# # uts/common/inet/ipf/netinet/Makefile # # include global definitions include ../../../../../Makefile.master -HDRS= ipl.h ip_compat.h ip_fil.h ip_nat.h ip_proxy.h ip_state.h \ - ip_frag.h ip_auth.h ip_lookup.h ip_pool.h ip_htable.h ipf_stack.h +HDRS= ipl.h ip_compat.h ip_fil.h ip_nat.h ip_proxy.h ip_state.h ip_frag.h \ + ip_auth.h ip_lookup.h ip_pool.h ip_htable.h ipf_stack.h ipf_cfw.h ROOTDIRS= $(ROOT)/usr/include/netinet diff --git a/usr/src/uts/common/inet/ipf/netinet/ip_fil.h b/usr/src/uts/common/inet/ipf/netinet/ip_fil.h index 4c3c5683b5..bb5ce7bd6c 100644 --- a/usr/src/uts/common/inet/ipf/netinet/ip_fil.h +++ b/usr/src/uts/common/inet/ipf/netinet/ip_fil.h @@ -8,7 +8,7 @@ * * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2019, Joyent, Inc. */ #ifndef __IP_FIL_H__ @@ -16,6 +16,7 @@ #include "netinet/ip_compat.h" #include <sys/zone.h> +#include <sys/uuid.h> #ifdef SOLARIS #undef SOLARIS @@ -115,6 +116,8 @@ #define SIOCDELFR SIOCRMAFR #define SIOCINSFR SIOCINAFR # define SIOCIPFZONESET _IOWR('r', 97, struct ipfzoneobj) +# define SIOCIPFCFWCFG _IOR('r', 98, struct ipfcfwcfg) +# define SIOCIPFCFWNEWSZ _IOWR('r', 99, struct ipfcfwcfg) /* * What type of table is getting flushed? @@ -600,6 +603,7 @@ typedef struct frentry { u_32_t fr_flags; /* per-rule flags && options (see below) */ u_32_t fr_logtag; /* user defined log tag # */ u_32_t fr_collect; /* collection number */ + uuid_t fr_uuid; /* user defined uuid */ u_int fr_arg; /* misc. numeric arg for rule */ u_int fr_loglevel; /* syslog log facility + priority */ u_int fr_age[2]; /* non-TCP timeouts */ @@ -728,6 +732,7 @@ typedef struct frentry { #define FR_NEWISN 0x400000 /* new ISN for outgoing TCP */ #define FR_NOICMPERR 0x800000 /* do not match ICMP errors in state */ #define FR_STATESYNC 0x1000000 /* synchronize state to slave */ +#define FR_CFWLOG 0x2000000 /* Global CFW logging enabled */ #define FR_NOMATCH 0x8000000 /* no match occured */ /* 0x10000000 FF_LOGPASS */ /* 0x20000000 FF_LOGBLOCK */ @@ -883,6 +888,7 @@ typedef struct ipflog { u_32_t fl_lflags; u_32_t fl_logtag; ipftag_t fl_nattag; + uuid_t fl_uuid; u_short fl_plen; /* extra data after hlen */ u_short fl_loglevel; /* syslog log level */ char fl_group[FR_GROUPLEN]; @@ -931,6 +937,7 @@ typedef struct ipflog { #define IPSYNC_NAME "/dev/ipsync" #define IPSCAN_NAME "/dev/ipscan" #define IPLOOKUP_NAME "/dev/iplookup" +#define IPFEV_NAME "/dev/ipfev" #define IPL_LOGIPF 0 /* Minor device #'s for accessing logs */ #define IPL_LOGNAT 1 @@ -939,8 +946,9 @@ typedef struct ipflog { #define IPL_LOGSYNC 4 #define IPL_LOGSCAN 5 #define IPL_LOGLOOKUP 6 -#define IPL_LOGCOUNT 7 -#define IPL_LOGMAX 7 +#define IPL_LOGEV 7 +#define IPL_LOGCOUNT 8 +#define IPL_LOGMAX 8 #define IPL_LOGSIZE (IPL_LOGMAX + 1) #define IPL_LOGALL -1 #define IPL_LOGNONE -2 @@ -1181,6 +1189,21 @@ typedef struct ipfzoneobj { char ipfz_zonename[ZONENAME_MAX]; /* zone to act on */ } ipfzoneobj_t; +/* ioctl to grab CFW logging parameters */ +typedef struct ipfcfwcfg { + /* CFG => Max event size, NEWSZ => ignored in, like CFG out. */ + uint32_t ipfcfwc_maxevsize; + /* + * CFG => Current ring size, + * NEWSZ => New ring size, must be 2^N for 3 <= N <= 31. + */ + uint32_t ipfcfwc_evringsize; + /* CFG => Number of event reports, NEWSZ => ignored in, like CFG out. */ + uint64_t ipfcfwc_evreports; + /* CFG => Number of event drops, NEWSZ => ignored in, like CFG out. 
*/ + uint64_t ipfcfwc_evdrops; +} ipfcfwcfg_t; + #if defined(_KERNEL) /* Set ipfs_zoneid to this if no zone has been set: */ #define IPFS_ZONE_UNSET -2 @@ -1560,6 +1583,23 @@ extern int ipllog __P((int, fr_info_t *, void **, size_t *, int *, int, ipf_stack_t *)); extern void fr_logunload __P((ipf_stack_t *)); +/* SmartOS single-FD global-zone state accumulator (see cfw.c) */ +extern boolean_t ipf_cfwlog_enabled; +struct ipstate; /* Ugggh. */ +extern void ipf_log_cfwlog __P((struct ipstate *, uint_t, ipf_stack_t *)); +extern void ipf_block_cfwlog __P((frentry_t *, fr_info_t *, ipf_stack_t *)); +#define IFS_CFWLOG(ifs, fr) ((ifs)->ifs_gz_controlled && ipf_cfwlog_enabled &&\ + fr != NULL && ((fr)->fr_flags & FR_CFWLOG)) +struct cfwev_s; /* See ipf_cfw.h */ +extern boolean_t ipf_cfwev_consume __P((struct cfwev_s *, boolean_t)); +/* See cfw.c's ipf_cfwev_consume_many() for details. */ +typedef uint_t (*cfwmanycb_t) __P((struct cfwev_s *, uint_t, void *)); +extern int ipf_cfwlog_read __P((dev_t, struct uio *, struct cred *)); +extern int ipf_cfwlog_ioctl __P((dev_t, int, intptr_t, int, cred_t *, int *)); +#define IPF_CFW_RING_ALLOCATE 0 +#define IPF_CFW_RING_DESTROY 1 +extern int ipf_cfw_ring_resize(uint32_t); + extern frentry_t *fr_acctpkt __P((fr_info_t *, u_32_t *)); extern int fr_copytolog __P((int, char *, int)); extern u_short fr_cksum __P((mb_t *, ip_t *, int, void *)); diff --git a/usr/src/uts/common/inet/ipf/netinet/ip_state.h b/usr/src/uts/common/inet/ipf/netinet/ip_state.h index 4c605c1b89..ef315d5ef1 100644 --- a/usr/src/uts/common/inet/ipf/netinet/ip_state.h +++ b/usr/src/uts/common/inet/ipf/netinet/ip_state.h @@ -8,11 +8,14 @@ * * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2019 Joyent, Inc. */ #ifndef __IP_STATE_H__ #define __IP_STATE_H__ +#include <sys/uuid.h> + #if defined(__STDC__) || defined(__GNUC__) || defined(_AIX51) # define SIOCDELST _IOW('r', 61, struct ipfobj) #else @@ -66,6 +69,7 @@ typedef struct ipstate { /* in both directions */ u_32_t is_optmsk[2]; /* " " mask */ /* in both directions */ + uuid_t is_uuid; u_short is_sec; /* security options set */ u_short is_secmsk; /* " " mask */ u_short is_auth; /* authentication options set */ diff --git a/usr/src/uts/common/inet/ipf/netinet/ipf_cfw.h b/usr/src/uts/common/inet/ipf/netinet/ipf_cfw.h new file mode 100644 index 0000000000..1972d2b3f7 --- /dev/null +++ b/usr/src/uts/common/inet/ipf/netinet/ipf_cfw.h @@ -0,0 +1,69 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#ifndef __IPF_CFW_H__ +#define __IPF_CFW_H__ + +#include <sys/types.h> +#include <inet/ip6.h> +#include <sys/uuid.h> + +/* Because ipf compiles this kernel file in userland testing... */ +#ifndef ASSERT3U +#define ASSERT3U(a, b, c) ASSERT(a ## b ## c); +#endif /* ASSERT3U */ + +/* + * CFW Event, which is emitted to a global-zone listener. The global-zone + * listener solves the one-fd-per-zone problem of using each zone's ipmon. + * + * These must be 64-bit aligned because they form an array in-kernel. There + * might be reserved fields to ensure that alignment. 
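+ * One way to keep that invariant honest at build time (illustrative,
+ * not part of the original header) is a compile-time assertion:
+ *
+ *	CTASSERT((sizeof (cfwev_t) % 8) == 0);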
+ */ +#define CFWEV_BLOCK 1 +#define CFWEV_BEGIN 2 +#define CFWEV_END 3 +#define CFWDIR_IN 1 +#define CFWDIR_OUT 2 + +typedef struct cfwev_s { + uint16_t cfwev_type; /* BEGIN, END, BLOCK */ + uint16_t cfwev_length; /* in bytes, so capped to 65535 bytes */ + zoneid_t cfwev_zonedid; /* Pullable from ipf_stack_t. */ + + uint32_t cfwev_ruleid; /* Pullable from fr_info_t. */ + uint16_t cfwev_sport; /* Source port (network order) */ + uint16_t cfwev_dport; /* Dest. port (network order) */ + + uint8_t cfwev_protocol; /* IPPROTO_* */ + /* "direction" informs if src/dst are local/remote or remote/local. */ + uint8_t cfwev_direction; + uint8_t cfwev_reserved[6]; /* Ensures 64-bit alignment. */ + + in6_addr_t cfwev_saddr; /* IPv4 addresses are V4MAPPED. */ + in6_addr_t cfwev_daddr; + + /* + * Because of 'struct timeval' being different between 32-bit and + * 64-bit ABIs, this interface is only usable by 64-bit binaries. + */ + struct timeval cfwev_tstamp; + + uuid_t cfwev_ruleuuid; /* Pullable from fr_info_t. */ +} cfwev_t; + + + +#endif /* __IPF_CFW_H__ */ diff --git a/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h b/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h index 0ceea1e921..0b2a8d826f 100644 --- a/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h +++ b/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h @@ -6,7 +6,7 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * - * Copyright 2018 Joyent, Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. */ #ifndef __IPF_STACK_H__ @@ -46,6 +46,7 @@ struct ipf_stack { struct ipf_stack *ifs_gz_cont_ifs; netid_t ifs_netid; zoneid_t ifs_zone; + zoneid_t ifs_zone_did; boolean_t ifs_gz_controlled; /* ipf module */ @@ -126,6 +127,11 @@ struct ipf_stack { hook_t *ifs_ipfhook6_loop_out; hook_t *ifs_ipfhook6_nicevents; + hook_t *ifs_ipfhookvndl3v4_in; + hook_t *ifs_ipfhookvndl3v6_in; + hook_t *ifs_ipfhookvndl3v4_out; + hook_t *ifs_ipfhookvndl3v6_out; + hook_t *ifs_ipfhookviona_in; hook_t *ifs_ipfhookviona_out; @@ -140,12 +146,18 @@ struct ipf_stack { boolean_t ifs_hook6_nic_events; boolean_t ifs_hook6_loopback_in; boolean_t ifs_hook6_loopback_out; + boolean_t ifs_hookvndl3v4_physical_in; + boolean_t ifs_hookvndl3v6_physical_in; + boolean_t ifs_hookvndl3v4_physical_out; + boolean_t ifs_hookvndl3v6_physical_out; boolean_t ifs_hookviona_physical_in; boolean_t ifs_hookviona_physical_out; int ifs_ipf_loopback; net_handle_t ifs_ipf_ipv4; net_handle_t ifs_ipf_ipv6; + net_handle_t ifs_ipf_vndl3v4; + net_handle_t ifs_ipf_vndl3v6; net_handle_t ifs_ipf_viona; /* ip_auth.c */ @@ -305,6 +317,7 @@ struct ipf_stack { char *ifs_addmask_key; char *ifs_rn_zeros; char *ifs_rn_ones; + #ifdef KERNEL /* kstats for inbound and outbound */ kstat_t *ifs_kstatp[2]; diff --git a/usr/src/uts/common/inet/ipf/solaris.c b/usr/src/uts/common/inet/ipf/solaris.c index c541f4dddc..5ccbfa3188 100644 --- a/usr/src/uts/common/inet/ipf/solaris.c +++ b/usr/src/uts/common/inet/ipf/solaris.c @@ -6,7 +6,7 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. 
 */
 /*
@@ -116,7 +116,7 @@ static void ipf_stack_shutdown __P((const netid_t, void *));
 static int ipf_property_g_update __P((dev_info_t *));
 static char *ipf_devfiles[] = { IPL_NAME, IPNAT_NAME, IPSTATE_NAME, IPAUTH_NAME, IPSYNC_NAME, IPSCAN_NAME,
-	IPLOOKUP_NAME, NULL };
+	IPLOOKUP_NAME, IPFEV_NAME, NULL };
 extern void *ipf_state;	/* DDI state */
 extern vmem_t *ipf_minor;	/* minor number arena */
@@ -625,7 +625,6 @@ ipf_stack_shutdown(const netid_t id, void *arg)
 /*
 * Destroy things for ipf for one stack.
 */
-/* ARGSUSED */
 static void
 ipf_stack_destroy_one(const netid_t id, ipf_stack_t *ifs)
 {
@@ -742,6 +741,9 @@ ddi_attach_cmd_t cmd;
 		ipf_dev_info = dip;
+		if (ipf_cfw_ring_resize(IPF_CFW_RING_ALLOCATE) != 0)
+			goto attach_failed;
+
 		ipfncb = net_instance_alloc(NETINFO_VERSION);
 		if (ipfncb == NULL)
 			goto attach_failed;
@@ -769,6 +771,7 @@ ddi_attach_cmd_t cmd;
 	}
 attach_failed:
+	(void) ipf_cfw_ring_resize(IPF_CFW_RING_DESTROY);
 	ddi_remove_minor_node(dip, NULL);
 	ddi_prop_remove_all(dip);
 	ddi_soft_state_fini(&ipf_state);
@@ -796,6 +799,7 @@ ddi_detach_cmd_t cmd;
 		 * framework guarantees we are not active with this devinfo
 		 * node in any other entry points at this time.
 		 */
+		(void) ipf_cfw_ring_resize(IPF_CFW_RING_DESTROY);
 		ddi_prop_remove_all(dip);
 		i = ddi_get_instance(dip);
 		ddi_remove_minor_node(dip, NULL);
diff --git a/usr/src/uts/common/inet/mib2.h b/usr/src/uts/common/inet/mib2.h
index 5a168523ee..85ca5ebdec 100644
--- a/usr/src/uts/common/inet/mib2.h
+++ b/usr/src/uts/common/inet/mib2.h
@@ -23,6 +23,7 @@
 /*
 * Copyright (c) 1990 Mentat Inc.
 * Copyright (c) 2015, 2016 by Delphix. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
 */
 /*
@@ -1400,6 +1401,8 @@ typedef struct tcpConnEntryInfo_s {
 			/* round-trip time smoothed average (us) */
 	Gauge		ce_rtt_sa;
+			/* round-trip time smoothed deviation (us) */
+	Gauge		ce_rtt_sd;
 			/* current rto (retransmit timeout) */
 	Gauge		ce_rto;
 			/* round-trip time count */
 	Gauge		ce_rtt_cnt;
diff --git a/usr/src/uts/common/inet/rawip_impl.h b/usr/src/uts/common/inet/rawip_impl.h
index 6fb72d1d08..ddb482db78 100644
--- a/usr/src/uts/common/inet/rawip_impl.h
+++ b/usr/src/uts/common/inet/rawip_impl.h
@@ -20,6 +20,7 @@
 */
 /*
 * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
 */
 /* Copyright (c) 1990 Mentat Inc. */
@@ -43,6 +44,7 @@ extern "C" {
 #include <inet/ip.h>
 #include <inet/optcom.h>
 #include <inet/tunables.h>
+#include <inet/bpf.h>
 /*
 * ICMP stack instances
@@ -84,6 +86,10 @@ typedef struct icmp_s {
 	mblk_t	*icmp_fallback_queue_head;
 	mblk_t	*icmp_fallback_queue_tail;
 	struct sockaddr_storage	icmp_delayed_addr;
+
+	krwlock_t	icmp_bpf_lock;		/* protects icmp_bpf */
+	ip_bpf_insn_t	*icmp_bpf_prog;		/* SO_ATTACH_FILTER bpf */
+	uint_t		icmp_bpf_len;
 } icmp_t;
 /*
diff --git a/usr/src/uts/common/inet/sockmods/datafilt.c b/usr/src/uts/common/inet/sockmods/datafilt.c
new file mode 100644
index 0000000000..6e1171de46
--- /dev/null
+++ b/usr/src/uts/common/inet/sockmods/datafilt.c
@@ -0,0 +1,116 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2012, OmniTI Computer Consulting, Inc. All rights reserved.
+ */ + +/* + * This file implements a socketfilter used to deter TCP connections. + * To defer a connection means to delay the return of accept(3SOCKET) + * until at least one byte is ready to be read(2). This filter may be + * applied automatically or programmatically through the use of + * soconfig(1M) and setsockopt(3SOCKET). + */ + +#include <sys/kmem.h> +#include <sys/systm.h> +#include <sys/stropts.h> +#include <sys/strsun.h> +#include <sys/socketvar.h> +#include <sys/sockfilter.h> +#include <sys/note.h> +#include <sys/taskq.h> + +#define DATAFILT_MODULE "datafilt" + +static struct modlmisc dataf_modlmisc = { + &mod_miscops, + "Kernel data-ready socket filter" +}; + +static struct modlinkage dataf_modlinkage = { + MODREV_1, + &dataf_modlmisc, + NULL +}; + +static sof_rval_t +dataf_attach_passive_cb(sof_handle_t handle, sof_handle_t ph, + void *parg, struct sockaddr *laddr, socklen_t laddrlen, + struct sockaddr *faddr, socklen_t faddrlen, void **cookiep) +{ + _NOTE(ARGUNUSED(handle, ph, parg, laddr, laddrlen, faddr, faddrlen, + cookiep)); + return (SOF_RVAL_DEFER); +} + +static void +dataf_detach_cb(sof_handle_t handle, void *cookie, cred_t *cr) +{ + _NOTE(ARGUNUSED(handle, cookie, cr)); +} + +static mblk_t * +dataf_data_in_cb(sof_handle_t handle, void *cookie, mblk_t *mp, int flags, + size_t *lenp) +{ + _NOTE(ARGUNUSED(cookie, flags, lenp)); + + if (mp != NULL && MBLKL(mp) > 0) { + sof_newconn_ready(handle); + sof_bypass(handle); + } + + return (mp); +} + +static sof_ops_t dataf_ops = { + .sofop_attach_passive = dataf_attach_passive_cb, + .sofop_detach = dataf_detach_cb, + .sofop_data_in = dataf_data_in_cb +}; + +int +_init(void) +{ + int err; + + /* + * This module is safe to attach even after some preliminary socket + * setup calls have taken place. See the comment for SOF_ATT_SAFE. + */ + err = sof_register(SOF_VERSION, DATAFILT_MODULE, &dataf_ops, + SOF_ATT_SAFE); + if (err != 0) + return (err); + if ((err = mod_install(&dataf_modlinkage)) != 0) + (void) sof_unregister(DATAFILT_MODULE); + + return (err); +} + +int +_fini(void) +{ + int err; + + if ((err = sof_unregister(DATAFILT_MODULE)) != 0) + return (err); + + return (mod_remove(&dataf_modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&dataf_modlinkage, modinfop)); +} diff --git a/usr/src/uts/common/inet/sockmods/sockmod_pfp.c b/usr/src/uts/common/inet/sockmods/sockmod_pfp.c index 586d7f06f8..76191e93b8 100644 --- a/usr/src/uts/common/inet/sockmods/sockmod_pfp.c +++ b/usr/src/uts/common/inet/sockmods/sockmod_pfp.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2015 Joyent, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. 
 */
 #include <sys/types.h>
@@ -51,6 +51,7 @@
 #include <sys/mac_client.h>
 #include <sys/mac_provider.h>
 #include <sys/mac_client_priv.h>
+#include <inet/bpf.h>
 #include <netpacket/packet.h>
@@ -448,7 +449,7 @@ pfp_packet(void *arg, mac_resource_handle_t mrh, mblk_t *mp, boolean_t flag)
 		buffer = (uchar_t *)mp;
 	}
 	rw_enter(&ps->ps_bpflock, RW_READER);
-	if (bpf_filter(ps->ps_bpf.bf_insns, buffer,
+	if (ip_bpf_filter((ip_bpf_insn_t *)ps->ps_bpf.bf_insns, buffer,
 	    hdr.mhi_pktsize, buflen) == 0) {
 		rw_exit(&ps->ps_bpflock);
 		ps->ps_stats.tp_drops++;
@@ -1336,7 +1337,7 @@ pfp_setsocket_sockopt(sock_lower_handle_t handle, int option_name,
     const void *optval, socklen_t optlen)
 {
 	struct bpf_program prog;
-	struct bpf_insn *fcode;
+	ip_bpf_insn_t *fcode;
 	struct pfpsock *ps;
 	struct sock_proto_props sopp;
 	int error = 0;
@@ -1370,10 +1371,10 @@ pfp_setsocket_sockopt(sock_lower_handle_t handle, int option_name,
 		return (EFAULT);
 	}
-	if (bpf_validate(fcode, (int)prog.bf_len)) {
+	if (ip_bpf_validate(fcode, prog.bf_len)) {
 		rw_enter(&ps->ps_bpflock, RW_WRITER);
 		pfp_release_bpf(ps);
-		ps->ps_bpf.bf_insns = fcode;
+		ps->ps_bpf.bf_insns = (struct bpf_insn *)fcode;
 		ps->ps_bpf.bf_len = size;
 		rw_exit(&ps->ps_bpflock);
diff --git a/usr/src/uts/common/inet/squeue.c b/usr/src/uts/common/inet/squeue.c
index 9fa40eccb6..e65af832eb 100644
--- a/usr/src/uts/common/inet/squeue.c
+++ b/usr/src/uts/common/inet/squeue.c
@@ -61,6 +61,10 @@
 * connection are processed on that squeue. The connection ("conn") to
 * squeue mapping is stored in "conn_t" member "conn_sqp".
 *
+ * If the squeue is not related to TCP/IP, then the value of sqp->sq_isip is
+ * false and it will not have an associated conn_t, which means many aspects of
+ * the system, such as polling and switching squeues, will not be used.
+ *
 * Since the processing of the connection cuts across multiple layers
 * but still allows packets for different connnection to be processed on
 * other CPU/squeues, squeues are also termed as "Vertical Perimeter" or
@@ -241,7 +245,7 @@ squeue_init(void)
 }
 squeue_t *
-squeue_create(pri_t pri)
+squeue_create(pri_t pri, boolean_t isip)
 {
 	squeue_t *sqp = kmem_cache_alloc(squeue_cache, KM_SLEEP);
@@ -256,11 +260,36 @@ squeue_create(pri_t pri)
 	sqp->sq_enter = squeue_enter;
 	sqp->sq_drain = squeue_drain;
+	sqp->sq_isip = isip;
 	return (sqp);
 }
 /*
+ * We need to kill the threads and then clean up. We should VERIFY that
+ * polling is disabled so we don't have to worry about disassociating from
+ * MAC/IP/etc.
+ */
+void
+squeue_destroy(squeue_t *sqp)
+{
+	kt_did_t worker, poll;
+	mutex_enter(&sqp->sq_lock);
+	VERIFY(!(sqp->sq_state & (SQS_POLL_THR_QUIESCED |
+	    SQS_POLL_QUIESCE_DONE | SQS_PAUSE | SQS_EXIT)));
+	worker = sqp->sq_worker->t_did;
+	poll = sqp->sq_poll_thr->t_did;
+	sqp->sq_state |= SQS_EXIT;
+	cv_signal(&sqp->sq_poll_cv);
+	cv_signal(&sqp->sq_worker_cv);
+	mutex_exit(&sqp->sq_lock);
+
+	thread_join(poll);
+	thread_join(worker);
+	kmem_cache_free(squeue_cache, sqp);
+}
+
+/*
 * Bind squeue worker thread to the specified CPU, given by CPU id.
 * If the CPU id value is -1, bind the worker thread to the value
 * specified in sq_bind field. If a thread is already bound to a
@@ -380,18 +409,21 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
 		 * Handle squeue switching.
More details in the * block comment at the top of the file */ - if (connp->conn_sqp == sqp) { + if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) { SQUEUE_DBG_SET(sqp, mp, proc, connp, tag); - connp->conn_on_sqp = B_TRUE; + if (sqp->sq_isip == B_TRUE) + connp->conn_on_sqp = B_TRUE; DTRACE_PROBE3(squeue__proc__start, squeue_t *, sqp, mblk_t *, mp, conn_t *, connp); (*proc)(connp, mp, sqp, ira); DTRACE_PROBE2(squeue__proc__end, squeue_t *, sqp, conn_t *, connp); - connp->conn_on_sqp = B_FALSE; + if (sqp->sq_isip == B_TRUE) { + connp->conn_on_sqp = B_FALSE; + CONN_DEC_REF(connp); + } SQUEUE_DBG_CLEAR(sqp); - CONN_DEC_REF(connp); } else { SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE); @@ -407,7 +439,7 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, * still be best to process a single queued * item if it matches the active connection. */ - if (sqp->sq_first != NULL) { + if (sqp->sq_first != NULL && sqp->sq_isip) { squeue_try_drain_one(sqp, connp); } @@ -423,7 +455,7 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, return; } } else { - if (ira != NULL) { + if (sqp->sq_isip == B_TRUE && ira != NULL) { mblk_t *attrmp; ASSERT(cnt == 1); @@ -496,7 +528,8 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, if (!(sqp->sq_state & SQS_REENTER) && (process_flag != SQ_FILL) && (sqp->sq_first == NULL) && (sqp->sq_run == curthread) && (cnt == 1) && - (connp->conn_on_sqp == B_FALSE)) { + (sqp->sq_isip == B_FALSE || + connp->conn_on_sqp == B_FALSE)) { sqp->sq_state |= SQS_REENTER; mutex_exit(&sqp->sq_lock); @@ -511,15 +544,21 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, * Handle squeue switching. More details in the * block comment at the top of the file */ - if (connp->conn_sqp == sqp) { - connp->conn_on_sqp = B_TRUE; + if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) { + SQUEUE_DBG_SET(sqp, mp, proc, connp, + tag); + if (sqp->sq_isip == B_TRUE) + connp->conn_on_sqp = B_TRUE; DTRACE_PROBE3(squeue__proc__start, squeue_t *, sqp, mblk_t *, mp, conn_t *, connp); (*proc)(connp, mp, sqp, ira); DTRACE_PROBE2(squeue__proc__end, squeue_t *, sqp, conn_t *, connp); - connp->conn_on_sqp = B_FALSE; - CONN_DEC_REF(connp); + if (sqp->sq_isip == B_TRUE) { + connp->conn_on_sqp = B_FALSE; + CONN_DEC_REF(connp); + } + SQUEUE_DBG_CLEAR(sqp); } else { SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE); @@ -540,7 +579,7 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, #ifdef DEBUG mp->b_tag = tag; #endif - if (ira != NULL) { + if (sqp->sq_isip && ira != NULL) { mblk_t *attrmp; ASSERT(cnt == 1); @@ -658,7 +697,7 @@ again: mp->b_prev = NULL; /* Is there an ip_recv_attr_t to handle? */ - if (ip_recv_attr_is_mblk(mp)) { + if (sqp->sq_isip == B_TRUE && ip_recv_attr_is_mblk(mp)) { mblk_t *attrmp = mp; ASSERT(attrmp->b_cont != NULL); @@ -683,20 +722,25 @@ again: /* - * Handle squeue switching. More details in the - * block comment at the top of the file + * Handle squeue switching. More details in the block comment at + * the top of the file. non-IP squeues cannot switch, as there + * is no conn_t. 
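+		 * An illustrative non-IP consumer (hypothetical) never
+		 * migrates and passes no ip_recv_attr_t:
+		 *
+		 *	sqp = squeue_create(pri, B_FALSE);
+		 *	squeue_enter(sqp, mp, mp, 1, NULL, SQ_FILL, tag);
+		 *	...
+		 *	squeue_destroy(sqp);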
*/ - if (connp->conn_sqp == sqp) { + if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) { SQUEUE_DBG_SET(sqp, mp, proc, connp, mp->b_tag); - connp->conn_on_sqp = B_TRUE; + if (sqp->sq_isip == B_TRUE) + connp->conn_on_sqp = B_TRUE; DTRACE_PROBE3(squeue__proc__start, squeue_t *, sqp, mblk_t *, mp, conn_t *, connp); (*proc)(connp, mp, sqp, ira); DTRACE_PROBE2(squeue__proc__end, squeue_t *, sqp, conn_t *, connp); - connp->conn_on_sqp = B_FALSE; - CONN_DEC_REF(connp); + if (sqp->sq_isip == B_TRUE) { + connp->conn_on_sqp = B_FALSE; + CONN_DEC_REF(connp); + } + SQUEUE_DBG_CLEAR(sqp); } else { SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE); @@ -925,6 +969,11 @@ squeue_polling_thread(squeue_t *sqp) cv_wait(async, lock); CALLB_CPR_SAFE_END(&cprinfo, lock); + if (sqp->sq_state & SQS_EXIT) { + mutex_exit(lock); + thread_exit(); + } + ctl_state = sqp->sq_state & (SQS_POLL_THR_CONTROL | SQS_POLL_THR_QUIESCED); if (ctl_state != 0) { @@ -950,6 +999,9 @@ squeue_polling_thread(squeue_t *sqp) (SQS_PROC|SQS_POLLING|SQS_GET_PKTS)) == (SQS_PROC|SQS_POLLING|SQS_GET_PKTS)); + /* Only IP related squeues should reach this point */ + VERIFY(sqp->sq_isip == B_TRUE); + poll_again: sq_rx_ring = sqp->sq_rx_ring; sq_get_pkts = sq_rx_ring->rr_rx; @@ -1079,6 +1131,7 @@ squeue_worker_thr_control(squeue_t *sqp) ill_rx_ring_t *rx_ring; ASSERT(MUTEX_HELD(&sqp->sq_lock)); + VERIFY(sqp->sq_isip == B_TRUE); if (sqp->sq_state & SQS_POLL_RESTART) { /* Restart implies a previous quiesce. */ @@ -1190,6 +1243,11 @@ squeue_worker(squeue_t *sqp) for (;;) { for (;;) { + if (sqp->sq_state & SQS_EXIT) { + mutex_exit(lock); + thread_exit(); + } + /* * If the poll thread has handed control to us * we need to break out of the wait. @@ -1286,6 +1344,7 @@ squeue_synch_enter(conn_t *connp, mblk_t *use_mp) again: sqp = connp->conn_sqp; + VERIFY(sqp->sq_isip == B_TRUE); mutex_enter(&sqp->sq_lock); if (sqp->sq_first == NULL && !(sqp->sq_state & SQS_PROC)) { @@ -1374,6 +1433,7 @@ squeue_try_drain_one(squeue_t *sqp, conn_t *compare_conn) ASSERT(MUTEX_HELD(&sqp->sq_lock)); ASSERT((sqp->sq_state & SQS_PROC) == 0); ASSERT(sqp->sq_run == NULL); + ASSERT(sqp->sq_isip); VERIFY(mp != NULL); /* @@ -1440,6 +1500,9 @@ squeue_try_drain_one(squeue_t *sqp, conn_t *compare_conn) CONN_DEC_REF(connp); SQUEUE_DBG_CLEAR(sqp); + if (ira != NULL) + ira_cleanup(ira, B_TRUE); + done: mutex_enter(&sqp->sq_lock); sqp->sq_state &= ~(SQS_PROC); @@ -1451,6 +1514,7 @@ squeue_synch_exit(conn_t *connp, int flag) { squeue_t *sqp = connp->conn_sqp; + VERIFY(sqp->sq_isip == B_TRUE); ASSERT(flag == SQ_NODRAIN || flag == SQ_PROCESS); mutex_enter(&sqp->sq_lock); diff --git a/usr/src/uts/common/inet/tcp.h b/usr/src/uts/common/inet/tcp.h index 775c5abe6b..3ed2b7174a 100644 --- a/usr/src/uts/common/inet/tcp.h +++ b/usr/src/uts/common/inet/tcp.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014, 2017 by Delphix. All rights reserved. * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. 
@@ -137,6 +137,7 @@ typedef struct tcphdra_s {
 struct conn_s;
 struct tcp_listen_cnt_s;
+struct tcp_rg_s;
 /*
 * Control structure for each open TCP stream,
@@ -407,6 +408,13 @@ typedef struct tcp_s {
 	struct tcp_s *tcp_bind_hash_port; /* tcp_t's bound to the same lport */
 	struct tcp_s **tcp_ptpbhn;
+	/*
+	 * Group of tcp_t entries bound to the same address and port via
+	 * SO_REUSEPORT. The pointer itself is protected by tf_lock in the
+	 * containing tcps_bind_fanout slot.
+	 */
+	struct tcp_rg_s *tcp_rg_bind;
+
 	uint_t tcp_maxpsz_multiplier;
 	uint32_t tcp_lso_max; /* maximum LSO payload */
diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c
index 9348ea3d0f..427a6df274 100644
--- a/usr/src/uts/common/inet/tcp/tcp.c
+++ b/usr/src/uts/common/inet/tcp/tcp.c
@@ -961,8 +961,7 @@ void
 tcp_stop_lingering(tcp_t *tcp)
 {
 	clock_t delta = 0;
-	tcp_stack_t *tcps = tcp->tcp_tcps;
-	conn_t *connp = tcp->tcp_connp;
+	conn_t *connp = tcp->tcp_connp;
 	tcp->tcp_linger_tid = 0;
 	if (tcp->tcp_state > TCPS_LISTEN) {
@@ -990,7 +989,7 @@ tcp_stop_lingering(tcp_t *tcp)
 	if (tcp->tcp_state == TCPS_TIME_WAIT) {
 		tcp_time_wait_append(tcp);
-		TCP_DBGSTAT(tcps, tcp_detach_time_wait);
+		TCP_DBGSTAT(tcp->tcp_tcps, tcp_detach_time_wait);
 		goto finish;
 	}
@@ -1429,6 +1428,21 @@ tcp_free(tcp_t *tcp)
 		tcp->tcp_cc_algo->cb_destroy(&tcp->tcp_ccv);
 	/*
+	 * Destroy any association with SO_REUSEPORT group.
+	 */
+	if (tcp->tcp_rg_bind != NULL) {
+		/*
+		 * This is only necessary for connections which enabled
+		 * SO_REUSEPORT but were never bound. Such connections should
+		 * be the one and only member of the tcp_rg_t to which they
+		 * have been associated.
+		 */
+		VERIFY(tcp_rg_remove(tcp->tcp_rg_bind, tcp));
+		tcp_rg_destroy(tcp->tcp_rg_bind);
+		tcp->tcp_rg_bind = NULL;
+	}
+
+	/*
 	 * If this is a non-STREAM socket still holding on to an upper
 	 * handle, release it. As a result of fallback we might also see
 	 * STREAMS based conns with upper handles, in which case there is
@@ -2477,8 +2491,10 @@ tcp_init_values(tcp_t *tcp, tcp_t *parent)
 * Path MTU might have changed by either increase or decrease, so need to
 * adjust the MSS based on the value of ixa_pmtu. No need to handle tiny
 * or negative MSS, since tcp_mss_set() will do it.
+ *
+ * Returns B_TRUE when the connection PMTU changes, otherwise B_FALSE.
 */
-void
+boolean_t
 tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only)
 {
 	uint32_t pmtu;
@@ -2488,10 +2504,10 @@ tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only)
 	iaflags_t ixaflags;
 	if (tcp->tcp_tcps->tcps_ignore_path_mtu)
-		return;
+		return (B_FALSE);
 	if (tcp->tcp_state < TCPS_ESTABLISHED)
-		return;
+		return (B_FALSE);
 	/*
 	 * Always call ip_get_pmtu() to make sure that IP has updated
@@ -2511,13 +2527,13 @@ tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only)
 	 * Nothing to change, so just return.
 	 */
 	if (mss == tcp->tcp_mss)
-		return;
+		return (B_FALSE);
 	/*
 	 * Currently, for ICMP errors, only PMTU decrease is handled.
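	 * The boolean return added above exists so an ICMP "packet too
	 * big" path can react only when the MSS really changed; a
	 * hypothetical caller (the retransmit helper's name is made up
	 * for the sketch):
	 *
	 *	if (tcp_update_pmtu(tcp, B_TRUE))
	 *		(void) tcp_rexmit_after_pmtu_change(tcp);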
*/ if (mss > tcp->tcp_mss && decrease_only) - return; + return (B_FALSE); DTRACE_PROBE2(tcp_update_pmtu, int32_t, tcp->tcp_mss, uint32_t, mss); @@ -2552,6 +2568,7 @@ tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only) tcp->tcp_ipha->ipha_fragment_offset_and_flags = 0; } ixa->ixa_flags = ixaflags; + return (B_TRUE); } int @@ -3424,7 +3441,7 @@ tcp_notify(void *arg, ip_xmit_attr_t *ixa, ixa_notify_type_t ntype, tcp_update_lso(tcp, connp->conn_ixa); break; case IXAN_PMTU: - tcp_update_pmtu(tcp, B_FALSE); + (void) tcp_update_pmtu(tcp, B_FALSE); break; case IXAN_ZCOPY: tcp_update_zcopy(tcp); @@ -3755,7 +3772,6 @@ tcp_stack_init(netstackid_t stackid, netstack_t *ns) { tcp_stack_t *tcps; int i; - int error = 0; major_t major; size_t arrsz; @@ -3819,8 +3835,7 @@ tcp_stack_init(netstackid_t stackid, netstack_t *ns) tcps->tcps_mibkp = tcp_kstat_init(stackid); major = mod_name_to_major(INET_NAME); - error = ldi_ident_from_major(major, &tcps->tcps_ldi_ident); - ASSERT(error == 0); + VERIFY0(ldi_ident_from_major(major, &tcps->tcps_ldi_ident)); tcps->tcps_ixa_cleanup_mp = allocb_wait(0, BPRI_MED, STR_NOSIG, NULL); ASSERT(tcps->tcps_ixa_cleanup_mp != NULL); cv_init(&tcps->tcps_ixa_cleanup_ready_cv, NULL, CV_DEFAULT, NULL); diff --git a/usr/src/uts/common/inet/tcp/tcp_bind.c b/usr/src/uts/common/inet/tcp/tcp_bind.c index 86242fc944..5c2e1e1932 100644 --- a/usr/src/uts/common/inet/tcp/tcp_bind.c +++ b/usr/src/uts/common/inet/tcp/tcp_bind.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2013 Nexenta Systems, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. * Copyright (c) 2016 by Delphix. All rights reserved. */ @@ -56,6 +57,7 @@ static uint32_t tcp_random_anon_port = 1; static int tcp_bind_select_lport(tcp_t *, in_port_t *, boolean_t, cred_t *cr); static in_port_t tcp_get_next_priv_port(const tcp_t *); +static int tcp_rg_insert(tcp_rg_t *, struct tcp_s *); /* * Hash list insertion routine for tcp_t structures. Each hash bucket @@ -173,6 +175,16 @@ tcp_bind_hash_remove(tcp_t *tcp) ASSERT(lockp != NULL); mutex_enter(lockp); + + /* destroy any association with SO_REUSEPORT group */ + if (tcp->tcp_rg_bind != NULL) { + if (tcp_rg_remove(tcp->tcp_rg_bind, tcp)) { + /* Last one out turns off the lights */ + tcp_rg_destroy(tcp->tcp_rg_bind); + } + tcp->tcp_rg_bind = NULL; + } + if (tcp->tcp_ptpbhn) { tcpnext = tcp->tcp_bind_hash_port; if (tcpnext != NULL) { @@ -638,13 +650,12 @@ tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, } /* - * If the "bind_to_req_port_only" parameter is set, if the requested port - * number is available, return it, If not return 0 + * If the "bind_to_req_port_only" parameter is set and the requested port + * number is available, return it (else return 0). * - * If "bind_to_req_port_only" parameter is not set and - * If the requested port number is available, return it. If not, return - * the first anonymous port we happen across. If no anonymous ports are - * available, return 0. addr is the requested local address, if any. + * If "bind_to_req_port_only" parameter is not set and the requested port + * number is available, return it. If not, return the first anonymous port we + * happen across. If no anonymous ports are available, return 0. * * In either case, when succeeding update the tcp_t to record the port number * and insert it in the bind hash table. 
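 * An illustrative reading of that contract (arguments elided, not the
 * exact signature):
 *
 *	port = tcp_bindi(tcp, requested_port, &laddr, ...);
 *	if (port == 0)
 *		... the bind fails, e.g. with EADDRINUSE ...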
@@ -664,6 +675,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, int loopmax; conn_t *connp = tcp->tcp_connp; tcp_stack_t *tcps = tcp->tcp_tcps; + boolean_t reuseport = connp->conn_reuseport; /* * Lookup for free addresses is done in a loop and "loopmax" @@ -700,6 +712,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, tf_t *tbf; tcp_t *ltcp; conn_t *lconnp; + boolean_t attempt_reuse = B_FALSE; lport = htons(port); @@ -726,6 +739,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) { boolean_t not_socket; boolean_t exclbind; + boolean_t addrmatch; lconnp = ltcp->tcp_connp; @@ -831,22 +845,35 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, &lconnp->conn_faddr_v6))) continue; + addrmatch = IN6_ARE_ADDR_EQUAL(laddr, + &lconnp->conn_bound_addr_v6); + + if (addrmatch && reuseport && bind_to_req_port_only && + (ltcp->tcp_state == TCPS_BOUND || + ltcp->tcp_state == TCPS_LISTEN)) { + /* + * This entry is bound to the exact same + * address and port. If SO_REUSEPORT is set on + * the calling socket, attempt to reuse this + * binding if it too had SO_REUSEPORT enabled + * when it was bound. + */ + attempt_reuse = (ltcp->tcp_rg_bind != NULL); + break; + } + if (!reuseaddr) { /* - * No socket option SO_REUSEADDR. - * If existing port is bound to - * a non-wildcard IP address - * and the requesting stream is - * bound to a distinct - * different IP addresses - * (non-wildcard, also), keep - * going. + * No socket option SO_REUSEADDR. If an + * existing port is bound to a non-wildcard IP + * address and the requesting stream is bound + * to a distinct different IP address + * (non-wildcard, also), keep going. */ if (!V6_OR_V4_INADDR_ANY(*laddr) && !V6_OR_V4_INADDR_ANY( lconnp->conn_bound_addr_v6) && - !IN6_ARE_ADDR_EQUAL(laddr, - &lconnp->conn_bound_addr_v6)) + !addrmatch) continue; if (ltcp->tcp_state >= TCPS_BOUND) { /* @@ -861,27 +888,49 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, * socket option SO_REUSEADDR is set on the * binding tcp_t. * - * If two streams are bound to - * same IP address or both addr - * and bound source are wildcards - * (INADDR_ANY), we want to stop - * searching. - * We have found a match of IP source - * address and source port, which is - * refused regardless of the - * SO_REUSEADDR setting, so we break. + * If two streams are bound to the same IP + * address or both addr and bound source are + * wildcards (INADDR_ANY), we want to stop + * searching. We have found a match of IP + * source address and source port, which is + * refused regardless of the SO_REUSEADDR + * setting, so we break. */ - if (IN6_ARE_ADDR_EQUAL(laddr, - &lconnp->conn_bound_addr_v6) && + if (addrmatch && (ltcp->tcp_state == TCPS_LISTEN || ltcp->tcp_state == TCPS_BOUND)) break; } } - if (ltcp != NULL) { + if (ltcp != NULL && !attempt_reuse) { /* The port number is busy */ mutex_exit(&tbf->tf_lock); } else { + if (attempt_reuse) { + int err; + struct tcp_rg_s *rg; + + ASSERT(ltcp != NULL); + ASSERT(ltcp->tcp_rg_bind != NULL); + ASSERT(tcp->tcp_rg_bind != NULL); + ASSERT(ltcp->tcp_rg_bind != tcp->tcp_rg_bind); + + err = tcp_rg_insert(ltcp->tcp_rg_bind, tcp); + if (err != 0) { + mutex_exit(&tbf->tf_lock); + return (0); + } + /* + * Now that the newly-binding socket has joined + * the existing reuseport group on ltcp, it + * should clean up its own (empty) group. 
+ */ + rg = tcp->tcp_rg_bind; + tcp->tcp_rg_bind = ltcp->tcp_rg_bind; + VERIFY(tcp_rg_remove(rg, tcp)); + tcp_rg_destroy(rg); + } + /* * This port is ours. Insert in fanout and mark as * bound to prevent others from getting the port @@ -946,3 +995,124 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, } while (++count < loopmax); return (0); } + +/* Max number of members in TCP SO_REUSEPORT group */ +#define TCP_RG_SIZE_MAX 64 +/* Step size when expanding members array */ +#define TCP_RG_SIZE_STEP 2 + + +tcp_rg_t * +tcp_rg_init(tcp_t *tcp) +{ + tcp_rg_t *rg; + rg = kmem_alloc(sizeof (tcp_rg_t), KM_NOSLEEP_LAZY); + if (rg == NULL) + return (NULL); + rg->tcprg_members = kmem_zalloc(2 * sizeof (tcp_t *), KM_NOSLEEP_LAZY); + if (rg->tcprg_members == NULL) { + kmem_free(rg, sizeof (tcp_rg_t)); + return (NULL); + } + + mutex_init(&rg->tcprg_lock, NULL, MUTEX_DEFAULT, NULL); + rg->tcprg_size = 2; + rg->tcprg_count = 1; + rg->tcprg_active = 1; + rg->tcprg_members[0] = tcp; + return (rg); +} + +void +tcp_rg_destroy(tcp_rg_t *rg) +{ + mutex_enter(&rg->tcprg_lock); + ASSERT(rg->tcprg_count == 0); + ASSERT(rg->tcprg_active == 0); + kmem_free(rg->tcprg_members, rg->tcprg_size * sizeof (tcp_t *)); + mutex_destroy(&rg->tcprg_lock); + kmem_free(rg, sizeof (struct tcp_rg_s)); +} + +static int +tcp_rg_insert(tcp_rg_t *rg, tcp_t *tcp) +{ + mutex_enter(&rg->tcprg_lock); + + VERIFY(rg->tcprg_size > 0); + VERIFY(rg->tcprg_count <= rg->tcprg_size); + if (rg->tcprg_count != 0) { + cred_t *oldcred = rg->tcprg_members[0]->tcp_connp->conn_cred; + cred_t *newcred = tcp->tcp_connp->conn_cred; + + if (crgetuid(oldcred) != crgetuid(newcred) || + crgetzoneid(oldcred) != crgetzoneid(newcred)) { + mutex_exit(&rg->tcprg_lock); + return (EPERM); + } + } + + if (rg->tcprg_count == rg->tcprg_size) { + unsigned int oldalloc = rg->tcprg_size * sizeof (tcp_t *); + unsigned int newsize = rg->tcprg_size + TCP_RG_SIZE_STEP; + tcp_t **newmembers; + + if (newsize > TCP_RG_SIZE_MAX) { + mutex_exit(&rg->tcprg_lock); + return (EINVAL); + } + newmembers = kmem_zalloc(newsize * sizeof (tcp_t *), + KM_NOSLEEP_LAZY); + if (newmembers == NULL) { + mutex_exit(&rg->tcprg_lock); + return (ENOMEM); + } + bcopy(rg->tcprg_members, newmembers, oldalloc); + kmem_free(rg->tcprg_members, oldalloc); + rg->tcprg_members = newmembers; + rg->tcprg_size = newsize; + } + + rg->tcprg_members[rg->tcprg_count] = tcp; + rg->tcprg_count++; + rg->tcprg_active++; + + mutex_exit(&rg->tcprg_lock); + return (0); +} + +boolean_t +tcp_rg_remove(tcp_rg_t *rg, tcp_t *tcp) +{ + int i; + boolean_t is_empty; + + mutex_enter(&rg->tcprg_lock); + for (i = 0; i < rg->tcprg_count; i++) { + if (rg->tcprg_members[i] == tcp) + break; + } + /* The item should be present */ + ASSERT(i < rg->tcprg_count); + /* Move the last member into this position */ + rg->tcprg_count--; + rg->tcprg_members[i] = rg->tcprg_members[rg->tcprg_count]; + rg->tcprg_members[rg->tcprg_count] = NULL; + if (tcp->tcp_connp->conn_reuseport != 0) + rg->tcprg_active--; + is_empty = (rg->tcprg_count == 0); + mutex_exit(&rg->tcprg_lock); + return (is_empty); +} + +void +tcp_rg_setactive(tcp_rg_t *rg, boolean_t is_active) +{ + mutex_enter(&rg->tcprg_lock); + if (is_active) { + rg->tcprg_active++; + } else { + rg->tcprg_active--; + } + mutex_exit(&rg->tcprg_lock); +} diff --git a/usr/src/uts/common/inet/tcp/tcp_input.c b/usr/src/uts/common/inet/tcp/tcp_input.c index dd264528fc..22b0019a6a 100644 --- a/usr/src/uts/common/inet/tcp/tcp_input.c +++ b/usr/src/uts/common/inet/tcp/tcp_input.c @@ -5715,10 
+5715,12 @@ noticmpv4: switch (icmph->icmph_code) { case ICMP_FRAGMENTATION_NEEDED: /* - * Update Path MTU, then try to send something out. + * Attempt to update path MTU and, if the MSS of the + * connection is altered, retransmit outstanding data. */ - tcp_update_pmtu(tcp, B_TRUE); - tcp_rexmit_after_error(tcp); + if (tcp_update_pmtu(tcp, B_TRUE)) { + tcp_rexmit_after_error(tcp); + } break; case ICMP_PORT_UNREACHABLE: case ICMP_PROTOCOL_UNREACHABLE: @@ -5761,7 +5763,7 @@ noticmpv4: break; } break; - case ICMP_SOURCE_QUENCH: { + case ICMP_SOURCE_QUENCH: /* * use a global boolean to control * whether TCP should respond to ICMP_SOURCE_QUENCH. @@ -5786,7 +5788,6 @@ noticmpv4: } break; } - } freemsg(mp); } @@ -5839,10 +5840,12 @@ noticmpv6: switch (icmp6->icmp6_type) { case ICMP6_PACKET_TOO_BIG: /* - * Update Path MTU, then try to send something out. + * Attempt to update path MTU and, if the MSS of the connection + * is altered, retransmit outstanding data. */ - tcp_update_pmtu(tcp, B_TRUE); - tcp_rexmit_after_error(tcp); + if (tcp_update_pmtu(tcp, B_TRUE)) { + tcp_rexmit_after_error(tcp); + } break; case ICMP6_DST_UNREACH: switch (icmp6->icmp6_code) { diff --git a/usr/src/uts/common/inet/tcp/tcp_opt_data.c b/usr/src/uts/common/inet/tcp/tcp_opt_data.c index 8687b52d53..15e49ae070 100644 --- a/usr/src/uts/common/inet/tcp/tcp_opt_data.c +++ b/usr/src/uts/common/inet/tcp/tcp_opt_data.c @@ -67,7 +67,8 @@ opdes_t tcp_opt_arr[] = { { SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_REUSEPORT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, { SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, @@ -505,6 +506,104 @@ tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) } /* + * Set a TCP connection's participation in SO_REUSEPORT. This operation is + * performed under the protection of the squeue via tcp_setsockopt. + * The manipulation of tcp_rg_bind, as part of this operation, is subject to + * these constraints: + * 1. Prior to bind(), tcp_rg_bind can be set/cleared in tcp_set_reuseport + * under the protection of the squeue. + * 2. Once the connection has been bound, the tcp_rg_bind pointer must not be + * altered until such time as tcp_free() cleans up the connection. + * 3. A connection undergoing bind, which matches to a connection participating + * in port-reuse, will switch its tcp_rg_bind pointer when it joins the + * group of an existing connection in tcp_bindi(). + */ +static int +tcp_set_reuseport(conn_t *connp, boolean_t do_enable) +{ + tcp_t *tcp = connp->conn_tcp; + struct tcp_rg_s *rg; + + if (!IPCL_IS_NONSTR(connp)) { + if (do_enable) { + /* + * SO_REUSEPORT cannot be enabled on sockets which have + * fallen back to the STREAMS API. + */ + return (EINVAL); + } else { + /* + * A connection with SO_REUSEPORT enabled should be + * prevented from falling back to STREAMS mode via + * logic in tcp_fallback. It is legal, however, for + * fallen-back connections to affirm the disabled state + * of SO_REUSEPORT. 
+ */ + ASSERT(connp->conn_reuseport == 0); + return (0); + } + } + if (tcp->tcp_state <= TCPS_CLOSED) { + return (EINVAL); + } + if (connp->conn_reuseport == 0 && do_enable) { + /* disabled -> enabled */ + if (tcp->tcp_rg_bind != NULL) { + tcp_rg_setactive(tcp->tcp_rg_bind, do_enable); + } else { + /* + * Connection state is not a concern when initially + * populating tcp_rg_bind. Setting it to non-NULL on a + * bound or listening connection would only mean that + * new reused-port binds become a possibility. + */ + if ((rg = tcp_rg_init(tcp)) == NULL) { + return (ENOMEM); + } + tcp->tcp_rg_bind = rg; + } + connp->conn_reuseport = 1; + } else if (connp->conn_reuseport != 0 && !do_enable) { + /* enabled -> disabled */ + ASSERT(tcp->tcp_rg_bind != NULL); + if (tcp->tcp_state == TCPS_IDLE) { + /* + * If the connection has not been bound yet, discard + * the reuse group state. Since disabling SO_REUSEPORT + * on a bound socket will _not_ prevent others from + * reusing the port, the presence of tcp_rg_bind is + * used to determine reuse availability, not + * conn_reuseport. + * + * This allows proper behavior for examples such as: + * + * setsockopt(fd1, ... SO_REUSEPORT, &on_val...); + * bind(fd1, &myaddr, ...); + * setsockopt(fd1, ... SO_REUSEPORT, &off_val...); + * + * setsockopt(fd2, ... SO_REUSEPORT, &on_val...); + * bind(fd2, &myaddr, ...); // <- SHOULD SUCCEED + * + */ + rg = tcp->tcp_rg_bind; + tcp->tcp_rg_bind = NULL; + VERIFY(tcp_rg_remove(rg, tcp)); + tcp_rg_destroy(rg); + } else { + /* + * If a connection has been bound, it's no longer safe + * to manipulate tcp_rg_bind until connection clean-up + * during tcp_free. Just mark the member status of the + * connection as inactive. + */ + tcp_rg_setactive(tcp->tcp_rg_bind, do_enable); + } + connp->conn_reuseport = 0; + } + return (0); +} + +/* * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements. * Parameters are assumed to be verified by the caller. */ @@ -674,6 +773,11 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, } *outlenp = inlen; return (0); + case SO_REUSEPORT: + if (!checkonly) { + return (tcp_set_reuseport(connp, *i1 != 0)); + } + return (0); } break; case IPPROTO_TCP: @@ -1031,10 +1135,6 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, } break; case IPPROTO_IP: - if (connp->conn_family != AF_INET) { - *outlenp = 0; - return (EINVAL); - } switch (name) { case IP_SEC_OPT: /* diff --git a/usr/src/uts/common/inet/tcp/tcp_socket.c b/usr/src/uts/common/inet/tcp/tcp_socket.c index 9b6c0daac3..32422be675 100644 --- a/usr/src/uts/common/inet/tcp/tcp_socket.c +++ b/usr/src/uts/common/inet/tcp/tcp_socket.c @@ -1029,6 +1029,16 @@ tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q, } /* + * Do not allow fallback on connections making use of SO_REUSEPORT. + */ + if (tcp->tcp_rg_bind != NULL) { + freeb(stropt_mp); + freeb(ordrel_mp); + squeue_synch_exit(connp, SQ_NODRAIN); + return (EINVAL); + } + + /* * Both endpoints must be of the same type (either STREAMS or * non-STREAMS) for fusion to be enabled. So if we are fused, * we have to unfuse. diff --git a/usr/src/uts/common/inet/tcp/tcp_stats.c b/usr/src/uts/common/inet/tcp/tcp_stats.c index e29c76a696..226467e167 100644 --- a/usr/src/uts/common/inet/tcp/tcp_stats.c +++ b/usr/src/uts/common/inet/tcp/tcp_stats.c @@ -21,8 +21,8 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, Joyent Inc. All rights reserved. * Copyright (c) 2015, 2016 by Delphix. 
All rights reserved. + * Copyright 2019 Joyent, Inc. * Copyright 2019 OmniOS Community Edition (OmniOSce) Association. */ @@ -131,9 +131,14 @@ tcp_set_conninfo(tcp_t *tcp, struct tcpConnEntryInfo_s *tcei, boolean_t ispriv) tcei->ce_rto = tcp->tcp_rto; tcei->ce_mss = tcp->tcp_mss; tcei->ce_state = tcp->tcp_state; - tcei->ce_rtt_sa = NSEC2USEC(tcp->tcp_rtt_sa >> 3); tcei->ce_rtt_sum = NSEC2USEC(tcp->tcp_rtt_sum); tcei->ce_rtt_cnt = tcp->tcp_rtt_cnt; + + /* tcp_rtt_sa is stored as 8 times the average RTT */ + tcei->ce_rtt_sa = NSEC2USEC(tcp->tcp_rtt_sa >> 3); + + /* tcp_rtt_sd is stored as 4 times the average RTTVAR */ + tcei->ce_rtt_sd = NSEC2USEC(tcp->tcp_rtt_sd >> 2); } /* diff --git a/usr/src/uts/common/inet/tcp/tcp_timers.c b/usr/src/uts/common/inet/tcp/tcp_timers.c index 5793a7fd27..7d9b449392 100644 --- a/usr/src/uts/common/inet/tcp/tcp_timers.c +++ b/usr/src/uts/common/inet/tcp/tcp_timers.c @@ -22,7 +22,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright 2011 Joyent, Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. * Copyright (c) 2014, 2017 by Delphix. All rights reserved. */ diff --git a/usr/src/uts/common/inet/tcp_impl.h b/usr/src/uts/common/inet/tcp_impl.h index 5669592cff..61af05f749 100644 --- a/usr/src/uts/common/inet/tcp_impl.h +++ b/usr/src/uts/common/inet/tcp_impl.h @@ -61,9 +61,9 @@ extern sock_downcalls_t sock_tcp_downcalls; * by setting it to 0. */ #define TCP_XMIT_LOWATER 4096 -#define TCP_XMIT_HIWATER 49152 +#define TCP_XMIT_HIWATER 128000 #define TCP_RECV_LOWATER 2048 -#define TCP_RECV_HIWATER 128000 +#define TCP_RECV_HIWATER 1048576 /* * Bind hash list size and hash function. It has to be a power of 2 for @@ -395,6 +395,22 @@ typedef struct tcp_listen_cnt_s { uint32_t tlc_drop; } tcp_listen_cnt_t; +/* + * Track tcp_t entities bound to the same port/address tuple via SO_REUSEPORT. + * - tcprg_lock: Protects the other fields + * - tcprg_size: Allocated size (in entries) of tcprg_members array + * - tcprg_count: Count of occupied tcprg_members slots + * - tcprg_active: Count of members which still have SO_REUSEPORT set + * - tcprg_members: Connections associated with address/port group + */ +typedef struct tcp_rg_s { + kmutex_t tcprg_lock; + unsigned int tcprg_size; + unsigned int tcprg_count; + unsigned int tcprg_active; + tcp_t **tcprg_members; +} tcp_rg_t; + #define TCP_TLC_REPORT_INTERVAL (30 * MINUTES) #define TCP_DECR_LISTEN_CNT(tcp) \ @@ -678,7 +694,7 @@ extern int tcp_rwnd_set(tcp_t *, uint32_t); extern int tcp_set_destination(tcp_t *); extern void tcp_set_ws_value(tcp_t *); extern void tcp_stop_lingering(tcp_t *); -extern void tcp_update_pmtu(tcp_t *, boolean_t); +extern boolean_t tcp_update_pmtu(tcp_t *, boolean_t); extern mblk_t *tcp_zcopy_backoff(tcp_t *, mblk_t *, boolean_t); extern boolean_t tcp_zcopy_check(tcp_t *); extern void tcp_zcopy_notify(tcp_t *); @@ -695,6 +711,10 @@ extern in_port_t tcp_bindi(tcp_t *, in_port_t, const in6_addr_t *, int, boolean_t, boolean_t, boolean_t); extern in_port_t tcp_update_next_port(in_port_t, const tcp_t *, boolean_t); +extern tcp_rg_t *tcp_rg_init(tcp_t *); +extern boolean_t tcp_rg_remove(tcp_rg_t *, tcp_t *); +extern void tcp_rg_destroy(tcp_rg_t *); +extern void tcp_rg_setactive(tcp_rg_t *, boolean_t); /* * Fusion related functions in tcp_fusion.c.
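The scaled fixed-point RTT fields exported by tcp_set_conninfo() above are easier to see with concrete numbers. The following standalone sketch (not part of the patch) mirrors the two conversions; it assumes the kernel's NSEC2USEC macro simply divides nanoseconds by 1000, and the SRTT/RTTVAR values are made up for illustration.

#include <stdio.h>
#include <stdint.h>

#define	NSEC2USEC(n)	((n) / 1000)

int
main(void)
{
	/* Classic Van Jacobson scaling: sa = 8 * SRTT, sd = 4 * RTTVAR. */
	int64_t tcp_rtt_sa = 8LL * 42 * 1000000;  /* SRTT = 42 ms, in nsec */
	int64_t tcp_rtt_sd = 4LL * 5 * 1000000;   /* RTTVAR = 5 ms, in nsec */

	/* Mirror the conversions performed for tcpConnEntryInfo_s. */
	(void) printf("ce_rtt_sa = %lld usec\n",
	    (long long)NSEC2USEC(tcp_rtt_sa >> 3));	/* prints 42000 */
	(void) printf("ce_rtt_sd = %lld usec\n",
	    (long long)NSEC2USEC(tcp_rtt_sd >> 2));	/* prints 5000 */
	return (0);
}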
diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c index 5d42a69fa2..4e208465f2 100644 --- a/usr/src/uts/common/inet/udp/udp.c +++ b/usr/src/uts/common/inet/udp/udp.c @@ -1671,6 +1671,11 @@ udp_opt_get(conn_t *connp, t_scalar_t level, t_scalar_t name, *i1 = udp->udp_vxlanhash; mutex_exit(&connp->conn_lock); return (sizeof (int)); + case UDP_SND_TO_CONNECTED: + mutex_enter(&connp->conn_lock); + *i1 = udp->udp_snd_to_conn ? 1 : 0; + mutex_exit(&connp->conn_lock); + return (sizeof (int)); } } mutex_enter(&connp->conn_lock); @@ -1826,6 +1831,11 @@ udp_do_opt_set(conn_opt_arg_t *coa, int level, int name, } /* Fully handled this option. */ return (0); + case UDP_SND_TO_CONNECTED: + mutex_enter(&connp->conn_lock); + udp->udp_snd_to_conn = onoff; + mutex_exit(&connp->conn_lock); + return (0); } break; } @@ -6096,10 +6106,18 @@ udp_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, else return (error); } - if (udp->udp_state == TS_DATA_XFER) { + + /* + * Check if we're allowed to send to a connection on which we've + * already called 'connect'. POSIX allows both behaviors, but + * historically we've returned an error if already connected. The + * client can allow this via a socket option. + */ + if (udp->udp_state == TS_DATA_XFER && !udp->udp_snd_to_conn) { UDPS_BUMP_MIB(us, udpOutErrors); return (EISCONN); } + error = proto_verify_ip_addr(connp->conn_family, (struct sockaddr *)msg->msg_name, msg->msg_namelen); if (error != 0) { diff --git a/usr/src/uts/common/inet/udp/udp_opt_data.c b/usr/src/uts/common/inet/udp/udp_opt_data.c index c8e7d79e47..9c05b8c876 100644 --- a/usr/src/uts/common/inet/udp/udp_opt_data.c +++ b/usr/src/uts/common/inet/udp/udp_opt_data.c @@ -294,7 +294,9 @@ opdes_t udp_opt_arr[] = { }, { UDP_NAT_T_ENDPOINT, IPPROTO_UDP, OA_RW, OA_RW, OP_PRIVPORT, 0, sizeof (int), 0 }, -{ UDP_SRCPORT_HASH, IPPROTO_UDP, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), 0 } +{ UDP_SRCPORT_HASH, IPPROTO_UDP, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), 0 }, +{ UDP_SND_TO_CONNECTED, IPPROTO_UDP, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), + 0 } }; /* diff --git a/usr/src/uts/common/inet/udp_impl.h b/usr/src/uts/common/inet/udp_impl.h index 0fc597ccf3..ef11973707 100644 --- a/usr/src/uts/common/inet/udp_impl.h +++ b/usr/src/uts/common/inet/udp_impl.h @@ -179,12 +179,12 @@ typedef struct udp_s { udp_issocket : 1, /* socket mode; sockfs is on top */ udp_nat_t_endpoint : 1, /* UDP_NAT_T_ENDPOINT option */ udp_rcvhdr : 1, /* UDP_RCVHDR option */ udp_vxlanhash: 1, /* UDP_SRCPORT_HASH option */ /* Because there's only VXLAN, cheat */ /* and only use a single bit */ + udp_snd_to_conn: 1, /* UDP_SND_TO_CONNECTED option */ - udp_pad_to_bit_31 : 28; + udp_pad_to_bit_31 : 27; /* Following 2 fields protected by the uf_lock */ struct udp_s *udp_bind_hash; /* Bind hash chain */
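To round out the UDP_SND_TO_CONNECTED change, here is a hedged userland sketch (not part of the patch) of how a client might use the option: after connect(), a sendto() with an explicit destination would historically fail with EISCONN, while with the option enabled it is permitted. It assumes UDP_SND_TO_CONNECTED is visible via <netinet/udp.h> on a build carrying this change; note the OP_CONFIG marking in udp_opt_arr above suggests setting the option may require privilege.

#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/udp.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	struct sockaddr_in dst;
	int fd, on = 1;

	if ((fd = socket(AF_INET, SOCK_DGRAM, 0)) < 0) {
		perror("socket");
		return (1);
	}
	(void) memset(&dst, 0, sizeof (dst));
	dst.sin_family = AF_INET;
	dst.sin_port = htons(5353);
	dst.sin_addr.s_addr = htonl(INADDR_LOOPBACK);

	if (connect(fd, (struct sockaddr *)&dst, sizeof (dst)) != 0) {
		perror("connect");
		return (1);
	}
	if (setsockopt(fd, IPPROTO_UDP, UDP_SND_TO_CONNECTED, &on,
	    sizeof (on)) != 0) {
		perror("setsockopt");	/* may fail without privilege */
		return (1);
	}
	/* Without the option, this sendto() would return EISCONN. */
	if (sendto(fd, "ping", 4, 0, (struct sockaddr *)&dst,
	    sizeof (dst)) < 0) {
		perror("sendto");
		return (1);
	}
	return (0);
}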