Diffstat (limited to 'usr/src/uts/common/inet')
-rw-r--r--  usr/src/uts/common/inet/bpf.h  49
-rw-r--r--  usr/src/uts/common/inet/bpf_filter.c  572
-rw-r--r--  usr/src/uts/common/inet/ip.h  15
-rw-r--r--  usr/src/uts/common/inet/ip/conn_opt.c  22
-rw-r--r--  usr/src/uts/common/inet/ip/icmp.c  117
-rw-r--r--  usr/src/uts/common/inet/ip/icmp_opt_data.c  6
-rw-r--r--  usr/src/uts/common/inet/ip/ip.c  94
-rw-r--r--  usr/src/uts/common/inet/ip/ip_if.c  177
-rw-r--r--  usr/src/uts/common/inet/ip/ip_squeue.c  2
-rw-r--r--  usr/src/uts/common/inet/ip/ipclassifier.c  165
-rw-r--r--  usr/src/uts/common/inet/ipclassifier.h  4
-rw-r--r--  usr/src/uts/common/inet/ipd/ipd.c  6
-rw-r--r--  usr/src/uts/common/inet/ipf/cfw.c  659
-rw-r--r--  usr/src/uts/common/inet/ipf/fil.c  5
-rw-r--r--  usr/src/uts/common/inet/ipf/ip_fil_solaris.c  304
-rw-r--r--  usr/src/uts/common/inet/ipf/ip_log.c  4
-rw-r--r--  usr/src/uts/common/inet/ipf/ip_state.c  19
-rw-r--r--  usr/src/uts/common/inet/ipf/ipf.conf  5
-rw-r--r--  usr/src/uts/common/inet/ipf/netinet/Makefile  7
-rw-r--r--  usr/src/uts/common/inet/ipf/netinet/ip_fil.h  46
-rw-r--r--  usr/src/uts/common/inet/ipf/netinet/ip_state.h  4
-rw-r--r--  usr/src/uts/common/inet/ipf/netinet/ipf_cfw.h  69
-rw-r--r--  usr/src/uts/common/inet/ipf/netinet/ipf_stack.h  15
-rw-r--r--  usr/src/uts/common/inet/ipf/solaris.c  10
-rw-r--r--  usr/src/uts/common/inet/mib2.h  3
-rw-r--r--  usr/src/uts/common/inet/rawip_impl.h  6
-rw-r--r--  usr/src/uts/common/inet/sockmods/datafilt.c  116
-rw-r--r--  usr/src/uts/common/inet/sockmods/sockmod_pfp.c  11
-rw-r--r--  usr/src/uts/common/inet/squeue.c  104
-rw-r--r--  usr/src/uts/common/inet/tcp.h  10
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp.c  39
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp_bind.c  226
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp_input.c  19
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp_opt_data.c  110
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp_socket.c  10
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp_stats.c  9
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp_timers.c  2
-rw-r--r--  usr/src/uts/common/inet/tcp_impl.h  26
-rw-r--r--  usr/src/uts/common/inet/udp/udp.c  20
-rw-r--r--  usr/src/uts/common/inet/udp/udp_opt_data.c  4
-rw-r--r--  usr/src/uts/common/inet/udp_impl.h  4
41 files changed, 2753 insertions, 342 deletions
diff --git a/usr/src/uts/common/inet/bpf.h b/usr/src/uts/common/inet/bpf.h
new file mode 100644
index 0000000000..e3eac799e5
--- /dev/null
+++ b/usr/src/uts/common/inet/bpf.h
@@ -0,0 +1,49 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#ifndef _INET_BPF_H
+#define _INET_BPF_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#ifdef _KERNEL
+
+#include <sys/types.h>
+
+/*
+ * Clone bpf_insn definition so that consumers don't need net/bpf.h to reason
+ * about struct sizing.
+ */
+typedef struct ip_bpf_insn {
+ uint16_t code;
+ uint8_t jt;
+ uint8_t jf;
+ uint32_t k;
+} ip_bpf_insn_t;
+
+extern uint32_t ip_bpf_filter(ip_bpf_insn_t *, uchar_t *, uint_t, uint_t);
+extern boolean_t ip_bpf_validate(ip_bpf_insn_t *, uint_t);
+
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _INET_BPF_H */
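A minimal sketch of how an in-kernel consumer might drive this new API (the helper name is hypothetical; the buflen == 0 mblk_t calling convention is documented in bpf_filter.c below):

#include <sys/types.h>
#include <sys/stream.h>
#include <inet/bpf.h>

/* Hypothetical helper: does 'prog' accept the packet in chain 'mp'? */
static boolean_t
bpf_accepts(ip_bpf_insn_t *prog, uint_t proglen, mblk_t *mp, uint_t wirelen)
{
	/* A program that fails static validation must never be run. */
	if (!ip_bpf_validate(prog, proglen))
		return (B_FALSE);

	/*
	 * With buflen == 0, ip_bpf_filter() treats its second argument as
	 * an mblk_t pointer and walks b_cont as needed; a non-zero return
	 * is the number of bytes to accept.
	 */
	return (ip_bpf_filter(prog, (uchar_t *)mp, wirelen, 0) != 0 ?
	    B_TRUE : B_FALSE);
}

In practice validation happens once at attach time (see icmp_attach_filter() in icmp.c below), not per packet.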
diff --git a/usr/src/uts/common/inet/bpf_filter.c b/usr/src/uts/common/inet/bpf_filter.c
new file mode 100644
index 0000000000..5a9ba38da6
--- /dev/null
+++ b/usr/src/uts/common/inet/bpf_filter.c
@@ -0,0 +1,572 @@
+/* $NetBSD: bpf_filter.c,v 1.35 2008/08/20 13:01:54 joerg Exp $ */
+
+/*
+ * Copyright (c) 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from the Stanford/CMU enet packet filter,
+ * (net/enet.c) distributed as part of 4.3BSD, and code contributed
+ * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence
+ * Berkeley Laboratory.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)bpf_filter.c 8.1 (Berkeley) 6/10/93
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
+ */
+
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/stream.h>
+#include <sys/byteorder.h>
+#include <sys/sdt.h>
+#include <inet/bpf.h>
+#include <net/bpf.h>
+
+#define EXTRACT_SHORT(p) BE_IN16(p)
+#define EXTRACT_LONG(p) BE_IN32(p)
+
+#define M_LEN(_m) ((_m)->b_wptr - (_m)->b_rptr)
+#define mtod(_a, _t) ((_t)((_a)->b_rptr))
+#define MINDEX(len, m, k) \
+{ \
+ len = M_LEN(m); \
+ while (k >= len) { \
+ k -= len; \
+ m = m->b_cont; \
+ if (m == 0) \
+ return (0); \
+ len = M_LEN(m); \
+ } \
+}
+
+static int m_xword(mblk_t *, uint32_t, int *);
+static int m_xhalf(mblk_t *, uint32_t, int *);
+
+static int
+m_xword(mblk_t *m, uint32_t k, int *err)
+{
+ int len;
+ uchar_t *cp, *np;
+ mblk_t *m0;
+
+ *err = 1;
+ MINDEX(len, m, k);
+ cp = mtod(m, uchar_t *) + k;
+ if (len >= k + 4) {
+ *err = 0;
+ return (EXTRACT_LONG(cp));
+ }
+ m0 = m->b_cont;
+ if (m0 == 0 || M_LEN(m0) + len - k < 4) {
+ DTRACE_PROBE3(mblk_xword_fail, mblk_t *, m0, int, len, int, k);
+ return (0);
+ }
+ *err = 0;
+ np = mtod(m0, uchar_t *);
+ switch (len - k) {
+
+ case 1:
+ return ((cp[0] << 24) | (np[0] << 16) | (np[1] << 8) | np[2]);
+
+ case 2:
+ return ((cp[0] << 24) | (cp[1] << 16) | (np[0] << 8) | np[1]);
+
+ default:
+ return ((cp[0] << 24) | (cp[1] << 16) | (cp[2] << 8) | np[0]);
+ }
+}
+
+static int
+m_xhalf(mblk_t *m, uint32_t k, int *err)
+{
+ int len;
+ uchar_t *cp;
+ mblk_t *m0;
+
+ *err = 1;
+ MINDEX(len, m, k);
+ cp = mtod(m, uchar_t *) + k;
+ if (len >= k + 2) {
+ *err = 0;
+ return (EXTRACT_SHORT(cp));
+ }
+ m0 = m->b_cont;
+ if (m0 == 0) {
+ DTRACE_PROBE3(mblk_xhalf_fail, mblk_t *, m0, int, len, int, k);
+ return (0);
+ }
+ *err = 0;
+ return ((cp[0] << 8) | mtod(m0, uchar_t *)[0]);
+}
+
+
+/*
+ * Execute the filter program starting at pc on the packet p.
+ * wirelen is the length of the original packet; buflen is the amount of
+ * data present.
+ * When buflen is non-0, p is a pointer to the start of the packet and the
+ * packet is contained entirely in one mblk_t.
+ * When buflen is 0, p is an mblk_t pointer.
+ */
+uint32_t
+ip_bpf_filter(ip_bpf_insn_t *pc, uchar_t *p, uint_t wirelen, uint_t buflen)
+{
+ uint32_t A, X, k;
+ uint32_t mem[BPF_MEMWORDS];
+
+ if (pc == 0)
+ /*
+ * No filter means accept all.
+ */
+ return ((uint32_t)-1);
+ A = 0;
+ X = 0;
+ --pc;
+ /* CONSTCOND */
+ while (1) {
+ ++pc;
+ switch (pc->code) {
+
+ default:
+#ifdef _KERNEL
+ DTRACE_PROBE1(bpf_insn_unknown,
+ struct bpf_insn *, pc);
+ return (0);
+#else
+ abort();
+#endif
+ case BPF_RET|BPF_K:
+ return (pc->k);
+
+ case BPF_RET|BPF_A:
+ return (A);
+
+ case BPF_LD|BPF_W|BPF_ABS:
+ k = pc->k;
+ if (k + sizeof (int32_t) > buflen) {
+#ifdef _KERNEL
+ int merr = 0;
+
+ if (buflen != 0)
+ return (0);
+ A = m_xword((mblk_t *)p, k, &merr);
+ if (merr != 0)
+ return (0);
+ continue;
+#else
+ return (0);
+#endif
+ }
+ A = EXTRACT_LONG(&p[k]);
+ continue;
+
+ case BPF_LD|BPF_H|BPF_ABS:
+ k = pc->k;
+ if (k + sizeof (int16_t) > buflen) {
+#ifdef _KERNEL
+ int merr;
+
+ if (buflen != 0)
+ return (0);
+ A = m_xhalf((mblk_t *)p, k, &merr);
+ if (merr != 0)
+ return (0);
+ continue;
+#else
+ return (0);
+#endif
+ }
+ A = EXTRACT_SHORT(&p[k]);
+ continue;
+
+ case BPF_LD|BPF_B|BPF_ABS:
+ k = pc->k;
+ if (k >= buflen) {
+#ifdef _KERNEL
+ mblk_t *m;
+ int len;
+
+ if (buflen != 0)
+ return (0);
+ m = (mblk_t *)p;
+ MINDEX(len, m, k);
+ A = mtod(m, uchar_t *)[k];
+ continue;
+#else
+ return (0);
+#endif
+ }
+ A = p[k];
+ continue;
+
+ case BPF_LD|BPF_W|BPF_LEN:
+ A = wirelen;
+ continue;
+
+ case BPF_LDX|BPF_W|BPF_LEN:
+ X = wirelen;
+ continue;
+
+ case BPF_LD|BPF_W|BPF_IND:
+ k = X + pc->k;
+ if (k + sizeof (int32_t) > buflen) {
+#ifdef _KERNEL
+ int merr = 0;
+
+ if (buflen != 0)
+ return (0);
+ A = m_xword((mblk_t *)p, k, &merr);
+ if (merr != 0)
+ return (0);
+ continue;
+#else
+ return (0);
+#endif
+ }
+ A = EXTRACT_LONG(&p[k]);
+ continue;
+
+ case BPF_LD|BPF_H|BPF_IND:
+ k = X + pc->k;
+ if (k + sizeof (int16_t) > buflen) {
+#ifdef _KERNEL
+ int merr = 0;
+
+ if (buflen != 0)
+ return (0);
+ A = m_xhalf((mblk_t *)p, k, &merr);
+ if (merr != 0)
+ return (0);
+ continue;
+#else
+ return (0);
+#endif
+ }
+ A = EXTRACT_SHORT(&p[k]);
+ continue;
+
+ case BPF_LD|BPF_B|BPF_IND:
+ k = X + pc->k;
+ if (k >= buflen) {
+#ifdef _KERNEL
+ mblk_t *m;
+ int len;
+
+ if (buflen != 0)
+ return (0);
+ m = (mblk_t *)p;
+ MINDEX(len, m, k);
+ A = mtod(m, uchar_t *)[k];
+ continue;
+#else
+ return (0);
+#endif
+ }
+ A = p[k];
+ continue;
+
+ case BPF_LDX|BPF_MSH|BPF_B:
+ k = pc->k;
+ if (k >= buflen) {
+#ifdef _KERNEL
+ mblk_t *m;
+ int len;
+
+ if (buflen != 0)
+ return (0);
+ m = (mblk_t *)p;
+ MINDEX(len, m, k);
+ X = (mtod(m, char *)[k] & 0xf) << 2;
+ continue;
+#else
+ return (0);
+#endif
+ }
+ X = (p[pc->k] & 0xf) << 2;
+ continue;
+
+ case BPF_LD|BPF_IMM:
+ A = pc->k;
+ continue;
+
+ case BPF_LDX|BPF_IMM:
+ X = pc->k;
+ continue;
+
+ case BPF_LD|BPF_MEM:
+ A = mem[pc->k];
+ continue;
+
+ case BPF_LDX|BPF_MEM:
+ X = mem[pc->k];
+ continue;
+
+ case BPF_ST:
+ mem[pc->k] = A;
+ continue;
+
+ case BPF_STX:
+ mem[pc->k] = X;
+ continue;
+
+ case BPF_JMP|BPF_JA:
+ pc += pc->k;
+ continue;
+
+ case BPF_JMP|BPF_JGT|BPF_K:
+ pc += (A > pc->k) ? pc->jt : pc->jf;
+ continue;
+
+ case BPF_JMP|BPF_JGE|BPF_K:
+ pc += (A >= pc->k) ? pc->jt : pc->jf;
+ continue;
+
+ case BPF_JMP|BPF_JEQ|BPF_K:
+ pc += (A == pc->k) ? pc->jt : pc->jf;
+ continue;
+
+ case BPF_JMP|BPF_JSET|BPF_K:
+ pc += (A & pc->k) ? pc->jt : pc->jf;
+ continue;
+
+ case BPF_JMP|BPF_JGT|BPF_X:
+ pc += (A > X) ? pc->jt : pc->jf;
+ continue;
+
+ case BPF_JMP|BPF_JGE|BPF_X:
+ pc += (A >= X) ? pc->jt : pc->jf;
+ continue;
+
+ case BPF_JMP|BPF_JEQ|BPF_X:
+ pc += (A == X) ? pc->jt : pc->jf;
+ continue;
+
+ case BPF_JMP|BPF_JSET|BPF_X:
+ pc += (A & X) ? pc->jt : pc->jf;
+ continue;
+
+ case BPF_ALU|BPF_ADD|BPF_X:
+ A += X;
+ continue;
+
+ case BPF_ALU|BPF_SUB|BPF_X:
+ A -= X;
+ continue;
+
+ case BPF_ALU|BPF_MUL|BPF_X:
+ A *= X;
+ continue;
+
+ case BPF_ALU|BPF_DIV|BPF_X:
+ if (X == 0)
+ return (0);
+ A /= X;
+ continue;
+
+ case BPF_ALU|BPF_AND|BPF_X:
+ A &= X;
+ continue;
+
+ case BPF_ALU|BPF_OR|BPF_X:
+ A |= X;
+ continue;
+
+ case BPF_ALU|BPF_LSH|BPF_X:
+ A <<= X;
+ continue;
+
+ case BPF_ALU|BPF_RSH|BPF_X:
+ A >>= X;
+ continue;
+
+ case BPF_ALU|BPF_ADD|BPF_K:
+ A += pc->k;
+ continue;
+
+ case BPF_ALU|BPF_SUB|BPF_K:
+ A -= pc->k;
+ continue;
+
+ case BPF_ALU|BPF_MUL|BPF_K:
+ A *= pc->k;
+ continue;
+
+ case BPF_ALU|BPF_DIV|BPF_K:
+ A /= pc->k;
+ continue;
+
+ case BPF_ALU|BPF_AND|BPF_K:
+ A &= pc->k;
+ continue;
+
+ case BPF_ALU|BPF_OR|BPF_K:
+ A |= pc->k;
+ continue;
+
+ case BPF_ALU|BPF_LSH|BPF_K:
+ A <<= pc->k;
+ continue;
+
+ case BPF_ALU|BPF_RSH|BPF_K:
+ A >>= pc->k;
+ continue;
+
+ case BPF_ALU|BPF_NEG:
+ A = -A;
+ continue;
+
+ case BPF_MISC|BPF_TAX:
+ X = A;
+ continue;
+
+ case BPF_MISC|BPF_TXA:
+ A = X;
+ continue;
+ }
+ }
+ /* NOTREACHED */
+}
+
+/*
+ * Return true if the 'fcode' is a valid filter program.
+ * The constraints are that each jump be forward and to a valid
+ * code, that memory accesses are within valid ranges (to the
+ * extent that this can be checked statically; loads of packet
+ * data have to be, and are, also checked at run time), and that
+ * the code terminates with either an accept or reject.
+ *
+ * The kernel needs to be able to verify an application's filter code.
+ * Otherwise, a bogus program could easily crash the system.
+ */
+boolean_t
+ip_bpf_validate(ip_bpf_insn_t *f, uint_t len)
+{
+ uint_t i, from;
+ ip_bpf_insn_t *p;
+
+ if (len < 1 || len > BPF_MAXINSNS)
+ return (B_FALSE);
+
+ for (i = 0; i < len; ++i) {
+ p = &f[i];
+ DTRACE_PROBE1(bpf_valid_insn, struct bpf_insn *, p);
+ switch (BPF_CLASS(p->code)) {
+ /*
+ * Check that memory operations use valid addresses.
+ */
+ case BPF_LD:
+ case BPF_LDX:
+ switch (BPF_MODE(p->code)) {
+ case BPF_MEM:
+ if (p->k >= BPF_MEMWORDS)
+ return (B_FALSE);
+ break;
+ case BPF_ABS:
+ case BPF_IND:
+ case BPF_MSH:
+ case BPF_IMM:
+ case BPF_LEN:
+ break;
+ default:
+ return (B_FALSE);
+ }
+ break;
+ case BPF_ST:
+ case BPF_STX:
+ if (p->k >= BPF_MEMWORDS)
+ return (B_FALSE);
+ break;
+ case BPF_ALU:
+ switch (BPF_OP(p->code)) {
+ case BPF_ADD:
+ case BPF_SUB:
+ case BPF_MUL:
+ case BPF_OR:
+ case BPF_AND:
+ case BPF_LSH:
+ case BPF_RSH:
+ case BPF_NEG:
+ break;
+ case BPF_DIV:
+ /*
+ * Check for constant division by 0.
+ */
+ if (BPF_RVAL(p->code) == BPF_K && p->k == 0)
+ return (B_FALSE);
+ break;
+ default:
+ return (B_FALSE);
+ }
+ break;
+ case BPF_JMP:
+ /*
+ * Check that jumps are within the code block,
+ * and that unconditional branches don't go
+ * backwards as a result of an overflow.
+ * Unconditional branches have a 32-bit offset,
+ * so they could overflow; we check to make
+ * sure they don't. Conditional branches have
+ * an 8-bit offset, and the from address is <=
+ * BPF_MAXINSNS, and we assume that BPF_MAXINSNS
+ * is sufficiently small that adding 255 to it
+ * won't overflow.
+ *
+ * We know that len is <= BPF_MAXINSNS, and we
+ * assume that BPF_MAXINSNS is < the maximum size
+ * of a uint_t, so that i + 1 doesn't overflow.
+ */
+ from = i + 1;
+ switch (BPF_OP(p->code)) {
+ case BPF_JA:
+ if (from + p->k < from || from + p->k >= len)
+ return (B_FALSE);
+ break;
+ case BPF_JEQ:
+ case BPF_JGT:
+ case BPF_JGE:
+ case BPF_JSET:
+ if (from + p->jt >= len || from + p->jf >= len)
+ return (B_FALSE);
+ break;
+ default:
+ return (B_FALSE);
+ }
+ break;
+ case BPF_RET:
+ break;
+ case BPF_MISC:
+ break;
+ default:
+ return (B_FALSE);
+ }
+ }
+
+ return (BPF_CLASS(f[len - 1].code) == BPF_RET);
+}
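For reference, a program in this representation is just an array of ip_bpf_insn_t built from the classic BPF opcode macros in <net/bpf.h>. A small hand-assembled example (illustrative only, not from this changeset) that accepts IPv4 packets and rejects everything else:

/* Accept a packet only if the version nibble of its first byte is 4. */
static ip_bpf_insn_t v4only[] = {
	{ BPF_LD | BPF_B | BPF_ABS, 0, 0, 0 },		/* A = pkt[0] */
	{ BPF_ALU | BPF_RSH | BPF_K, 0, 0, 4 },	/* A >>= 4 */
	{ BPF_JMP | BPF_JEQ | BPF_K, 0, 1, 4 },	/* A != 4? skip one */
	{ BPF_RET | BPF_K, 0, 0, (uint32_t)-1 },	/* accept packet */
	{ BPF_RET | BPF_K, 0, 0, 0 },			/* reject packet */
};

ip_bpf_validate(v4only, 5) accepts this program: every jump lands forward and in range, and it terminates with a BPF_RET.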
diff --git a/usr/src/uts/common/inet/ip.h b/usr/src/uts/common/inet/ip.h
index c081c44a04..ebf2574363 100644
--- a/usr/src/uts/common/inet/ip.h
+++ b/usr/src/uts/common/inet/ip.h
@@ -1416,6 +1416,7 @@ typedef union ill_g_head_u {
#define ILL_CAPAB_DLD 0x20 /* DLD capabilities */
#define ILL_CAPAB_DLD_POLL 0x40 /* Polling */
#define ILL_CAPAB_DLD_DIRECT 0x80 /* Direct function call */
+#define ILL_CAPAB_DLD_IPCHECK 0x100 /* Check if IPs are permitted */
/*
* Per-ill Hardware Checksumming capabilities.
@@ -1772,6 +1773,10 @@ typedef struct ill_s {
* Used to save errors that occur during plumbing
*/
uint_t ill_ifname_pending_err;
+ /*
+ * Used to save errors that occur during binding
+ */
+ uint_t ill_dl_bind_err;
avl_node_t ill_avl_byppa; /* avl node based on ppa */
uint_t ill_mcast_nces; /* Number of NCEs that are multicast. */
list_t ill_nce; /* pointer to nce_s list */
@@ -1938,6 +1943,7 @@ typedef struct ill_s {
* ill_nd_lla_len ipsq + down ill only when ill is up
* ill_phys_addr_pend ipsq + down ill only when ill is up
* ill_ifname_pending_err ipsq ipsq
+ * ill_dl_bind_err ipsq ipsq
* ill_avl_byppa ipsq, ill_g_lock write once
*
* ill_fastpath_list ill_lock ill_lock
@@ -3580,6 +3586,8 @@ typedef void (*ip_flow_enable_t)(void *, ip_mac_tx_cookie_t);
typedef void *(*ip_dld_callb_t)(void *,
ip_flow_enable_t, void *);
typedef boolean_t (*ip_dld_fctl_t)(void *, ip_mac_tx_cookie_t);
+typedef boolean_t (*ip_mac_ipcheck_t)(void *, boolean_t,
+ in6_addr_t *);
typedef int (*ip_capab_func_t)(void *, uint_t,
void *, uint_t);
@@ -3632,6 +3640,12 @@ typedef struct ill_dld_direct_s { /* DLD provided driver Tx */
void *idd_tx_fctl_dh; /* mac_client_handle */
} ill_dld_direct_t;
+/* IP - DLD direct function call to check if an IP is allowed */
+typedef struct ill_dld_ipcheck_s {
+ ip_mac_ipcheck_t idi_allowed_df;
+ void *idi_allowed_dh;
+} ill_dld_ipcheck_t;
+
/* IP - DLD polling capability */
typedef struct ill_dld_poll_s {
ill_rx_ring_t idp_ring_tbl[ILL_MAX_RINGS];
@@ -3643,6 +3657,7 @@ struct ill_dld_capab_s {
void *idc_capab_dh; /* dld_str_t *dsp */
ill_dld_direct_t idc_direct;
ill_dld_poll_t idc_poll;
+ ill_dld_ipcheck_t idc_ipcheck;
};
/*
diff --git a/usr/src/uts/common/inet/ip/conn_opt.c b/usr/src/uts/common/inet/ip/conn_opt.c
index 7aac9b655a..eeec56b162 100644
--- a/usr/src/uts/common/inet/ip/conn_opt.c
+++ b/usr/src/uts/common/inet/ip/conn_opt.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
* Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -644,6 +645,9 @@ conn_opt_get(conn_opt_arg_t *coa, t_scalar_t level, t_scalar_t name,
case SO_REUSEADDR:
*i1 = connp->conn_reuseaddr ? SO_REUSEADDR : 0;
break; /* goto sizeof (int) option return */
+ case SO_REUSEPORT:
+ *i1 = connp->conn_reuseport;
+ break; /* goto sizeof (int) option return */
case SO_TYPE:
*i1 = connp->conn_so_type;
break; /* goto sizeof (int) option return */
@@ -1214,8 +1218,24 @@ conn_opt_set_ip(conn_opt_arg_t *coa, t_scalar_t name, uint_t inlen,
ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
int error;
- if (connp->conn_family != AF_INET)
+ if (connp->conn_family == AF_INET6 &&
+ connp->conn_ipversion == IPV4_VERSION) {
+ /*
+ * Allow certain IPv4 options to be set on an AF_INET6 socket
+ * if the connection is still IPv4.
+ */
+ switch (name) {
+ case IP_TOS:
+ case T_IP_TOS:
+ case IP_TTL:
+ case IP_DONTFRAG:
+ break;
+ default:
+ return (EINVAL);
+ }
+ } else if (connp->conn_family != AF_INET) {
return (EINVAL);
+ }
ifindex = UINT_MAX;
switch (name) {
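The conn_opt_set_ip() change above lets a few IPv4-level options through on an AF_INET6 socket whose connection is currently IPv4 (e.g. one accepted from a v4-mapped peer). A hypothetical userland illustration, where accepted_fd is assumed to be such a connection:

#include <sys/socket.h>
#include <netinet/in.h>

/*
 * Sketch: setting an IPPROTO_IP-level option on an AF_INET6 socket
 * carrying an IPv4 connection now succeeds instead of returning EINVAL.
 */
static void
set_v4_ttl(int accepted_fd)
{
	int ttl = 64;

	(void) setsockopt(accepted_fd, IPPROTO_IP, IP_TTL, &ttl,
	    sizeof (ttl));
}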
diff --git a/usr/src/uts/common/inet/ip/icmp.c b/usr/src/uts/common/inet/ip/icmp.c
index 57ee0c5585..46c791298a 100644
--- a/usr/src/uts/common/inet/ip/icmp.c
+++ b/usr/src/uts/common/inet/ip/icmp.c
@@ -81,6 +81,7 @@
#include <sys/tsol/tnet.h>
#include <inet/rawip_impl.h>
+#include <net/bpf.h>
#include <sys/disp.h>
@@ -1018,6 +1019,12 @@ icmp_close_free(conn_t *connp)
icmp->icmp_filter = NULL;
}
+ if (icmp->icmp_bpf_len != 0) {
+ kmem_free(icmp->icmp_bpf_prog, icmp->icmp_bpf_len);
+ icmp->icmp_bpf_len = 0;
+ icmp->icmp_bpf_prog = NULL;
+ }
+
/*
* Clear any fields which the kmem_cache constructor clears.
* Only icmp_connp needs to be preserved.
@@ -1971,6 +1978,104 @@ icmp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr)
return (err);
}
+static int
+icmp_attach_filter(icmp_t *icmp, uint_t inlen, const uchar_t *invalp)
+{
+ struct bpf_program prog;
+ ip_bpf_insn_t *insns = NULL;
+ unsigned int size;
+
+#ifdef _LP64
+ if (get_udatamodel() != DATAMODEL_NATIVE) {
+ struct bpf_program32 *prog32;
+
+ if (inlen != sizeof (struct bpf_program32)) {
+ return (EINVAL);
+ }
+ prog32 = (struct bpf_program32 *)invalp;
+ prog.bf_len = prog32->bf_len;
+ prog.bf_insns = (void *)(uint64_t)prog32->bf_insns;
+ } else
+#endif
+ if (inlen == sizeof (struct bpf_program)) {
+ bcopy(invalp, &prog, sizeof (prog));
+ } else {
+ return (EINVAL);
+ }
+
+ if (prog.bf_len > BPF_MAXINSNS || prog.bf_len == 0) {
+ return (EINVAL);
+ }
+ size = prog.bf_len * sizeof (struct bpf_insn);
+ insns = kmem_alloc(size, KM_SLEEP);
+ if (copyin(prog.bf_insns, insns, size) != 0) {
+ kmem_free(insns, size);
+ return (EFAULT);
+ }
+ if (!ip_bpf_validate(insns, prog.bf_len)) {
+ kmem_free(insns, size);
+ return (EINVAL);
+ }
+
+ rw_enter(&icmp->icmp_bpf_lock, RW_WRITER);
+ if (icmp->icmp_bpf_len != 0) {
+ ASSERT(icmp->icmp_bpf_prog != NULL);
+
+ kmem_free(icmp->icmp_bpf_prog, icmp->icmp_bpf_len);
+ }
+ icmp->icmp_bpf_len = size;
+ icmp->icmp_bpf_prog = insns;
+ rw_exit(&icmp->icmp_bpf_lock);
+ return (0);
+}
+
+static int
+icmp_detach_filter(icmp_t *icmp)
+{
+ int error;
+
+ rw_enter(&icmp->icmp_bpf_lock, RW_WRITER);
+ if (icmp->icmp_bpf_len == 0) {
+ ASSERT(icmp->icmp_bpf_prog == NULL);
+ error = ENOENT;
+ } else {
+ kmem_free(icmp->icmp_bpf_prog,
+ icmp->icmp_bpf_len);
+ icmp->icmp_bpf_len = 0;
+ icmp->icmp_bpf_prog = NULL;
+ error = 0;
+ }
+ rw_exit(&icmp->icmp_bpf_lock);
+ return (error);
+}
+
+static boolean_t
+icmp_eval_filter(icmp_t *icmp, mblk_t *mp, ip_recv_attr_t *ira)
+{
+ boolean_t res;
+ uchar_t *buf = mp->b_rptr;
+ uint_t wirelen, len = MBLKL(mp);
+
+ rw_enter(&icmp->icmp_bpf_lock, RW_READER);
+ if (icmp->icmp_bpf_len == 0) {
+ rw_exit(&icmp->icmp_bpf_lock);
+ return (B_FALSE);
+ }
+ if (ira->ira_flags & IRAF_IS_IPV4) {
+ ipha_t *ipha = (ipha_t *)buf;
+
+ wirelen = ntohs(ipha->ipha_length);
+ } else {
+ ip6_t *ip6h = (ip6_t *)buf;
+
+ wirelen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN;
+ }
+ res = !ip_bpf_filter(icmp->icmp_bpf_prog, buf, wirelen, len);
+ rw_exit(&icmp->icmp_bpf_lock);
+
+ return (res);
+}
+
/*
* This routine sets socket options.
*/
@@ -2060,6 +2165,10 @@ icmp_do_opt_set(conn_opt_arg_t *coa, int level, int name,
return (ENOBUFS);
}
break;
+ case SO_ATTACH_FILTER:
+ return (icmp_attach_filter(icmp, inlen, invalp));
+ case SO_DETACH_FILTER:
+ return (icmp_detach_filter(icmp));
}
break;
@@ -2605,6 +2714,14 @@ icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira)
/* Initialize regardless of IP version */
ipps.ipp_fields = 0;
+ /* Apply socket filter, if needed */
+ if (icmp->icmp_bpf_len != 0) {
+ if (icmp_eval_filter(icmp, mp, ira)) {
+ freemsg(mp);
+ return;
+ }
+ }
+
if (ira->ira_flags & IRAF_IS_IPV4) {
ASSERT(IPH_HDR_VERSION(rptr) == IPV4_VERSION);
ASSERT(MBLKL(mp) >= sizeof (ipha_t));
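A hedged userland sketch of the new rawip filter path: SO_ATTACH_FILTER takes a struct bpf_program, which icmp_attach_filter() above copies in and validates, and SO_DETACH_FILTER takes no argument. The function name here is illustrative:

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <net/bpf.h>

static int
attach_accept_all(int s)
{
	/* Trivial accept-all program; a real filter would match headers. */
	struct bpf_insn insns[] = {
		{ BPF_RET | BPF_K, 0, 0, (uint_t)-1 },
	};
	struct bpf_program prog;

	prog.bf_len = 1;
	prog.bf_insns = insns;
	return (setsockopt(s, SOL_SOCKET, SO_ATTACH_FILTER,
	    &prog, sizeof (prog)));
}

Here s would be an ICMP socket, e.g. socket(AF_INET, SOCK_RAW, IPPROTO_ICMP); detaching is setsockopt(s, SOL_SOCKET, SO_DETACH_FILTER, NULL, 0).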
diff --git a/usr/src/uts/common/inet/ip/icmp_opt_data.c b/usr/src/uts/common/inet/ip/icmp_opt_data.c
index ff0310de0c..d65d3164d3 100644
--- a/usr/src/uts/common/inet/ip/icmp_opt_data.c
+++ b/usr/src/uts/common/inet/ip/icmp_opt_data.c
@@ -21,6 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2016 Joyent, Inc.
*/
#include <sys/types.h>
@@ -41,6 +42,7 @@
#include <netinet/ip_mroute.h>
#include <inet/optcom.h>
#include <inet/rawip_impl.h>
+#include <net/bpf.h>
/*
* Table of all known options handled on a ICMP protocol stack.
@@ -86,6 +88,10 @@ opdes_t icmp_opt_arr[] = {
0 },
{ SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
+{ SO_ATTACH_FILTER, SOL_SOCKET, OA_W, OA_W, OP_NP, 0,
+ sizeof (struct bpf_program), 0 },
+{ SO_DETACH_FILTER, SOL_SOCKET, OA_W, OA_W, OP_NP, 0, 0, 0 },
+
{ IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP,
(OP_VARLEN|OP_NODEFAULT),
IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ },
diff --git a/usr/src/uts/common/inet/ip/ip.c b/usr/src/uts/common/inet/ip/ip.c
index 6063fa01d2..704f152bb9 100644
--- a/usr/src/uts/common/inet/ip/ip.c
+++ b/usr/src/uts/common/inet/ip/ip.c
@@ -8235,7 +8235,6 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
conn_t *connp = NULL;
t_uscalar_t paddrreq;
mblk_t *mp_hw;
- boolean_t success;
boolean_t ioctl_aborted = B_FALSE;
boolean_t log = B_TRUE;
@@ -8335,7 +8334,8 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
ill->ill_state_flags &= ~ILL_DOWN_IN_PROGRESS;
mutex_exit(&ill->ill_lock);
/*
- * Something went wrong with the bind. We presumably
+ * Something went wrong with the bind. If this was the
+ * result of a DL_NOTE_REPLUMB, then we presumably
* have an IOCTL hanging out waiting for completion.
* Find it, take down the interface that was coming
* up, and complete the IOCTL with the error noted.
@@ -8352,6 +8352,15 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
(void) ipif_down(ipif, NULL, NULL);
/* error is set below the switch */
+ } else {
+ /*
+ * There's no pending IOCTL, so the bind was
+ * most likely started by ill_dl_up(). We save
+ * the error and let it take care of responding
+ * to the IOCTL.
+ */
+ ill->ill_dl_bind_err = dlea->dl_unix_errno ?
+ dlea->dl_unix_errno : ENXIO;
}
break;
case DL_ENABMULTI_REQ:
@@ -8475,55 +8484,7 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
DTRACE_PROBE1(ip__rput__dlpi__bind__ack, ill_t *, ill);
ill_nic_event_dispatch(ill, 0, NE_UP, NULL, 0);
- /*
- * Now bring up the resolver; when that is complete, we'll
- * create IREs. Note that we intentionally mirror what
- * ipif_up() would have done, because we got here by way of
- * ill_dl_up(), which stopped ipif_up()'s processing.
- */
- if (ill->ill_isv6) {
- /*
- * v6 interfaces.
- * Unlike ARP which has to do another bind
- * and attach, once we get here we are
- * done with NDP
- */
- (void) ipif_resolver_up(ipif, Res_act_initial);
- if ((err = ipif_ndp_up(ipif, B_TRUE)) == 0)
- err = ipif_up_done_v6(ipif);
- } else if (ill->ill_net_type == IRE_IF_RESOLVER) {
- /*
- * ARP and other v4 external resolvers.
- * Leave the pending mblk intact so that
- * the ioctl completes in ip_rput().
- */
- if (connp != NULL)
- mutex_enter(&connp->conn_lock);
- mutex_enter(&ill->ill_lock);
- success = ipsq_pending_mp_add(connp, ipif, q, mp1, 0);
- mutex_exit(&ill->ill_lock);
- if (connp != NULL)
- mutex_exit(&connp->conn_lock);
- if (success) {
- err = ipif_resolver_up(ipif, Res_act_initial);
- if (err == EINPROGRESS) {
- freemsg(mp);
- return;
- }
- mp1 = ipsq_pending_mp_get(ipsq, &connp);
- } else {
- /* The conn has started closing */
- err = EINTR;
- }
- } else {
- /*
- * This one is complete. Reply to pending ioctl.
- */
- (void) ipif_resolver_up(ipif, Res_act_initial);
- err = ipif_up_done(ipif);
- }
-
- if ((err == 0) && (ill->ill_up_ipifs)) {
+ if (ill->ill_up_ipifs) {
err = ill_up_ipifs(ill, q, mp1);
if (err == EINPROGRESS) {
freemsg(mp);
@@ -8531,25 +8492,6 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg)
}
}
- /*
- * If we have a moved ipif to bring up, and everything has
- * succeeded to this point, bring it up on the IPMP ill.
- * Otherwise, leave it down -- the admin can try to bring it
- * up by hand if need be.
- */
- if (ill->ill_move_ipif != NULL) {
- if (err != 0) {
- ill->ill_move_ipif = NULL;
- } else {
- ipif = ill->ill_move_ipif;
- ill->ill_move_ipif = NULL;
- err = ipif_up(ipif, q, mp1);
- if (err == EINPROGRESS) {
- freemsg(mp);
- return;
- }
- }
- }
break;
case DL_NOTIFY_IND: {
@@ -12621,6 +12563,7 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
ip_ioctl_cmd_t *ipip = arg;
ip_extract_func_t *extract_funcp;
+ ill_t *ill;
cmd_info_t ci;
int err;
boolean_t entered_ipsq = B_FALSE;
@@ -12742,6 +12685,13 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
ipsq_current_start(ipsq, ci.ci_ipif, ipip->ipi_cmd);
/*
+ * We need to cache the ill_t that we're going to use as the argument
+ * to the ipif-ioctl DTrace probe (below) because the ci_ipif can be
+ * blown away by calling ipi_func.
+ */
+ ill = ci.ci_ipif == NULL ? NULL : ci.ci_ipif->ipif_ill;
+
+ /*
* A return value of EINPROGRESS means the ioctl is
* either queued and waiting for some reason or has
* already completed.
@@ -12749,9 +12699,7 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg)
err = (*ipip->ipi_func)(ci.ci_ipif, ci.ci_sin, q, mp, ipip, ci.ci_lifr);
DTRACE_PROBE4(ipif__ioctl, char *, "ip_process_ioctl finish WR",
- int, ipip->ipi_cmd,
- ill_t *, ci.ci_ipif == NULL ? NULL : ci.ci_ipif->ipif_ill,
- ipif_t *, ci.ci_ipif);
+ int, ipip->ipi_cmd, ill_t *, ill, ipif_t *, ci.ci_ipif);
ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq);
if (entered_ipsq)
diff --git a/usr/src/uts/common/inet/ip/ip_if.c b/usr/src/uts/common/inet/ip/ip_if.c
index cc67299a1b..2307837eb8 100644
--- a/usr/src/uts/common/inet/ip/ip_if.c
+++ b/usr/src/uts/common/inet/ip/ip_if.c
@@ -22,7 +22,7 @@
* Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1990 Mentat Inc.
* Copyright (c) 2013 by Delphix. All rights reserved.
- * Copyright (c) 2016, Joyent, Inc. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
* Copyright (c) 2014, OmniTI Computer Consulting, Inc. All rights reserved.
*/
@@ -174,7 +174,7 @@ static ipif_t *ipif_lookup_on_name_async(char *name, size_t namelen,
static int ill_alloc_ppa(ill_if_t *, ill_t *);
static void ill_delete_interface_type(ill_if_t *);
-static int ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q);
+static int ill_dl_up(ill_t *ill, ipif_t *ipif);
static void ill_dl_down(ill_t *ill);
static void ill_down(ill_t *ill);
static void ill_down_ipifs(ill_t *, boolean_t);
@@ -1380,6 +1380,36 @@ ill_capability_probe(ill_t *ill)
ill->ill_dlpi_capab_state = IDCS_PROBE_SENT;
}
+static boolean_t
+ill_capability_wait(ill_t *ill)
+{
+ /*
+ * I'm in this ill's squeue, aka a writer. The ILL_CONDEMNED flag can
+ * only be set by someone who is the writer. Since we
+ * drop-and-reacquire the squeue in this loop, we need to check for
+ * ILL_CONDEMNED, which if set means nothing can signal our capability
+ * condition variable.
+ */
+ ASSERT(IAM_WRITER_ILL(ill));
+
+ while (ill->ill_capab_pending_cnt != 0 &&
+ (ill->ill_state_flags & ILL_CONDEMNED) == 0) {
+ /* This may enable blocked callers of ill_capability_done(). */
+ ipsq_exit(ill->ill_phyint->phyint_ipsq);
+ /* Pause a bit (1msec) before we re-enter the squeue. */
+ delay(drv_usectohz(1000));
+
+ /*
+ * If ipsq_enter() fails, someone set ILL_CONDEMNED
+ * while we dropped the squeue. Indicate such to the caller.
+ */
+ if (!ipsq_enter(ill, B_FALSE, CUR_OP))
+ return (B_FALSE);
+ }
+
+ return ((ill->ill_state_flags & ILL_CONDEMNED) == 0);
+}
+
void
ill_capability_reset(ill_t *ill, boolean_t reneg)
{
@@ -1390,6 +1420,8 @@ ill_capability_reset(ill_t *ill, boolean_t reneg)
ill->ill_dlpi_capab_state = reneg ? IDCS_RENEG : IDCS_RESET_SENT;
+ ASSERT(ill->ill_capab_reset_mp != NULL);
+
ill_capability_send(ill, ill->ill_capab_reset_mp);
ill->ill_capab_reset_mp = NULL;
/*
@@ -2109,6 +2141,49 @@ ill_capability_lso_enable(ill_t *ill)
}
}
+/*
+ * Check whether or not mac will prevent us from sending with a given IP
+ * address. This requires having the IPCHECK capability, which we should
+ * always be able to successfully negotiate, but if it's somehow missing
+ * then we just permit the caller to use the address, since mac does the
+ * actual enforcement and ip is just performing a courtesy check to help
+ * prevent users from unwittingly setting and attempting to use blocked
+ * addresses.
+ */
+static boolean_t
+ill_ipcheck_addr(ill_t *ill, in6_addr_t *v6addr)
+{
+ if ((ill->ill_capabilities & ILL_CAPAB_DLD_IPCHECK) == 0)
+ return (B_TRUE);
+
+ ill_dld_ipcheck_t *idi = &ill->ill_dld_capab->idc_ipcheck;
+ ip_mac_ipcheck_t ipcheck = idi->idi_allowed_df;
+ return (ipcheck(idi->idi_allowed_dh, ill->ill_isv6, v6addr));
+}
+
+static void
+ill_capability_ipcheck_enable(ill_t *ill)
+{
+ ill_dld_capab_t *idc = ill->ill_dld_capab;
+ ill_dld_ipcheck_t *idi = &idc->idc_ipcheck;
+ dld_capab_ipcheck_t spoof;
+ int rc;
+
+ ASSERT(IAM_WRITER_ILL(ill));
+
+ bzero(&spoof, sizeof (spoof));
+ if ((rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_IPCHECK,
+ &spoof, DLD_ENABLE)) == 0) {
+ idi->idi_allowed_df = (ip_mac_ipcheck_t)spoof.ipc_allowed_df;
+ idi->idi_allowed_dh = spoof.ipc_allowed_dh;
+ ill->ill_capabilities |= ILL_CAPAB_DLD_IPCHECK;
+ } else {
+ cmn_err(CE_WARN, "warning: could not enable IPCHECK "
+ "capability, rc = %d\n", rc);
+ DTRACE_PROBE2(ipcheck__off, (ill_t *), ill, (int), rc);
+ }
+}
+
static void
ill_capability_dld_enable(ill_t *ill)
{
@@ -2121,6 +2196,8 @@ ill_capability_dld_enable(ill_t *ill)
ill_capability_direct_enable(ill);
ill_capability_poll_enable(ill);
}
+
+ ill_capability_ipcheck_enable(ill);
ill_capability_lso_enable(ill);
ill->ill_capabilities |= ILL_CAPAB_DLD;
ill_mac_perim_exit(ill, mph);
@@ -2186,6 +2263,15 @@ ill_capability_dld_disable(ill_t *ill)
NULL, DLD_DISABLE);
}
+ if ((ill->ill_capabilities & ILL_CAPAB_DLD_IPCHECK) != 0) {
+ ASSERT(ill->ill_dld_capab->idc_ipcheck.idi_allowed_df != NULL);
+ ASSERT(ill->ill_dld_capab->idc_ipcheck.idi_allowed_dh != NULL);
+
+ ill->ill_capabilities &= ~ILL_CAPAB_DLD_IPCHECK;
+ (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_IPCHECK,
+ NULL, DLD_DISABLE);
+ }
+
ill->ill_capabilities &= ~ILL_CAPAB_DLD;
ill_mac_perim_exit(ill, mph);
}
@@ -9676,7 +9762,6 @@ ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
in6_addr_t v6addr;
boolean_t need_up = B_FALSE;
ill_t *ill;
- int i;
ip1dbg(("ip_sioctl_addr(%s:%u %p)\n",
ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif));
@@ -9751,20 +9836,9 @@ ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
IN6_IPADDR_TO_V4MAPPED(addr, &v6addr);
}
- /*
- * verify that the address being configured is permitted by the
- * ill_allowed_ips[] for the interface.
- */
- if (ill->ill_allowed_ips_cnt > 0) {
- for (i = 0; i < ill->ill_allowed_ips_cnt; i++) {
- if (IN6_ARE_ADDR_EQUAL(&ill->ill_allowed_ips[i],
- &v6addr))
- break;
- }
- if (i == ill->ill_allowed_ips_cnt) {
- pr_addr_dbg("!allowed addr %s\n", AF_INET6, &v6addr);
- return (EPERM);
- }
+ /* verify that the address being configured is permitted by mac */
+ if (!ill_ipcheck_addr(ill, &v6addr)) {
+ return (EPERM);
}
/*
* Even if there is no change we redo things just to rerun
@@ -12704,6 +12778,12 @@ ill_dl_down(ill_t *ill)
}
ill->ill_unbind_mp = NULL;
+
+ mutex_enter(&ill->ill_lock);
+ ill->ill_dl_up = 0;
+ ill_nic_event_dispatch(ill, 0, NE_DOWN, NULL, 0);
+ mutex_exit(&ill->ill_lock);
+
if (mp != NULL) {
ip1dbg(("ill_dl_down: %s (%u) for %s\n",
dl_primstr(*(int *)mp->b_rptr), *(int *)mp->b_rptr,
@@ -12726,11 +12806,13 @@ ill_dl_down(ill_t *ill)
ill_capability_dld_disable(ill);
ill_capability_reset(ill, B_FALSE);
ill_dlpi_send(ill, mp);
+
+ /*
+ * Wait for the capability reset to finish.
+ * In this case, it doesn't matter WHY or HOW it finished.
+ */
+ (void) ill_capability_wait(ill);
}
- mutex_enter(&ill->ill_lock);
- ill->ill_dl_up = 0;
- ill_nic_event_dispatch(ill, 0, NE_DOWN, NULL, 0);
- mutex_exit(&ill->ill_lock);
}
void
@@ -12852,6 +12934,7 @@ void
ill_capability_done(ill_t *ill)
{
ASSERT(ill->ill_capab_pending_cnt != 0);
+ ASSERT(IAM_WRITER_ILL(ill));
ill_dlpi_done(ill, DL_CAPABILITY_REQ);
@@ -14480,7 +14563,14 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp)
* address/netmask etc cause a down/up dance, but
* does not cause an unbind (DL_UNBIND) with the driver
*/
- return (ill_dl_up(ill, ipif, mp, q));
+ if ((err = ill_dl_up(ill, ipif)) != 0) {
+ return (err);
+ }
+ }
+
+ /* Reject bringing up interfaces with unusable IP addresses */
+ if (!ill_ipcheck_addr(ill, &ipif->ipif_v6lcl_addr)) {
+ return (EPERM);
}
/*
@@ -14593,24 +14683,22 @@ ill_delete_ires(ill_t *ill)
/*
* Perform a bind for the physical device.
- * When the routine returns EINPROGRESS then mp has been consumed and
- * the ioctl will be acked from ip_rput_dlpi.
- * Allocate an unbind message and save it until ipif_down.
+ *
+ * When the routine returns successfully, DLPI has been bound and
+ * capabilities have been negotiated. An unbind message will have been
+ * allocated for later use in ipif_down.
*/
static int
-ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
+ill_dl_up(ill_t *ill, ipif_t *ipif)
{
mblk_t *bind_mp = NULL;
mblk_t *unbind_mp = NULL;
- conn_t *connp;
- boolean_t success;
int err;
DTRACE_PROBE2(ill__downup, char *, "ill_dl_up", ill_t *, ill);
ip1dbg(("ill_dl_up(%s)\n", ill->ill_name));
ASSERT(IAM_WRITER_ILL(ill));
- ASSERT(mp != NULL);
/*
* Make sure we have an IRE_MULTICAST in case we immediately
@@ -14645,19 +14733,6 @@ ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
if (unbind_mp == NULL)
goto bad;
}
- /*
- * Record state needed to complete this operation when the
- * DL_BIND_ACK shows up. Also remember the pre-allocated mblks.
- */
- connp = CONN_Q(q) ? Q_TO_CONN(q) : NULL;
- ASSERT(connp != NULL || !CONN_Q(q));
- GRAB_CONN_LOCK(q);
- mutex_enter(&ipif->ipif_ill->ill_lock);
- success = ipsq_pending_mp_add(connp, ipif, q, mp, 0);
- mutex_exit(&ipif->ipif_ill->ill_lock);
- RELEASE_CONN_LOCK(q);
- if (!success)
- goto bad;
/*
* Save the unbind message for ill_dl_down(); it will be consumed when
@@ -14669,6 +14744,18 @@ ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
ill_dlpi_send(ill, bind_mp);
/* Send down link-layer capabilities probe if not already done. */
ill_capability_probe(ill);
+ /*
+ * Wait for DLPI to be bound and the capability probe to finish.
+ * The call drops-and-reacquires the squeue. If it couldn't because
+ * ILL_CONDEMNED got set, bail.
+ */
+ if (!ill_capability_wait(ill))
+ return (ENXIO);
+
+ /* DLPI failed to bind. Return the saved error */
+ if (!ill->ill_dl_up) {
+ return (ill->ill_dl_bind_err);
+ }
/*
* Sysid used to rely on the fact that netboots set domainname
@@ -14686,11 +14773,7 @@ ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q)
cmn_err(CE_WARN, "no cached dhcp response");
}
- /*
- * This operation will complete in ip_rput_dlpi with either
- * a DL_BIND_ACK or DL_ERROR_ACK.
- */
- return (EINPROGRESS);
+ return (0);
bad:
ip1dbg(("ill_dl_up(%s) FAILED\n", ill->ill_name));
diff --git a/usr/src/uts/common/inet/ip/ip_squeue.c b/usr/src/uts/common/inet/ip/ip_squeue.c
index 13e961333c..b6565d9c1f 100644
--- a/usr/src/uts/common/inet/ip/ip_squeue.c
+++ b/usr/src/uts/common/inet/ip/ip_squeue.c
@@ -153,7 +153,7 @@ ip_squeue_create(pri_t pri)
{
squeue_t *sqp;
- sqp = squeue_create(pri);
+ sqp = squeue_create(pri, B_TRUE);
ASSERT(sqp != NULL);
if (ip_squeue_create_callback != NULL)
ip_squeue_create_callback(sqp);
diff --git a/usr/src/uts/common/inet/ip/ipclassifier.c b/usr/src/uts/common/inet/ip/ipclassifier.c
index 34832d56e5..d47997a4aa 100644
--- a/usr/src/uts/common/inet/ip/ipclassifier.c
+++ b/usr/src/uts/common/inet/ip/ipclassifier.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
* Copyright 2019 OmniOS Community Edition (OmniOSce) Association.
* Copyright 2022 Joyent, Inc.
*/
@@ -871,67 +872,91 @@ ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp)
mutex_exit(&(connfp)->connf_lock); \
}
-#define IPCL_HASH_INSERT_BOUND(connfp, connp) { \
- conn_t *pconnp = NULL, *nconnp; \
- IPCL_HASH_REMOVE((connp)); \
- mutex_enter(&(connfp)->connf_lock); \
- nconnp = (connfp)->connf_head; \
- while (nconnp != NULL && \
- !_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6)) { \
- pconnp = nconnp; \
- nconnp = nconnp->conn_next; \
- } \
- if (pconnp != NULL) { \
- pconnp->conn_next = (connp); \
- (connp)->conn_prev = pconnp; \
- } else { \
- (connfp)->connf_head = (connp); \
- } \
- if (nconnp != NULL) { \
- (connp)->conn_next = nconnp; \
- nconnp->conn_prev = (connp); \
- } \
- (connp)->conn_fanout = (connfp); \
- (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \
- IPCL_BOUND; \
- CONN_INC_REF(connp); \
- mutex_exit(&(connfp)->connf_lock); \
-}
+/*
+ * When inserting bound or wildcard entries into the hash, ordering rules are
+ * used to facilitate timely and correct lookups. The order is as follows:
+ * 1. Entries bound to a specific address
+ * 2. Entries bound to INADDR_ANY
+ * 3. Entries bound to ADDR_UNSPECIFIED
+ * Entries in a category which share conn_lport (such as those using
+ * SO_REUSEPORT) will be ordered such that the newest inserted is first.
+ */
-#define IPCL_HASH_INSERT_WILDCARD(connfp, connp) { \
- conn_t **list, *prev, *next; \
- boolean_t isv4mapped = \
- IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6); \
- IPCL_HASH_REMOVE((connp)); \
- mutex_enter(&(connfp)->connf_lock); \
- list = &(connfp)->connf_head; \
- prev = NULL; \
- while ((next = *list) != NULL) { \
- if (isv4mapped && \
- IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) && \
- connp->conn_zoneid == next->conn_zoneid) { \
- (connp)->conn_next = next; \
- if (prev != NULL) \
- prev = next->conn_prev; \
- next->conn_prev = (connp); \
- break; \
- } \
- list = &next->conn_next; \
- prev = next; \
- } \
- (connp)->conn_prev = prev; \
- *list = (connp); \
- (connp)->conn_fanout = (connfp); \
- (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \
- IPCL_BOUND; \
- CONN_INC_REF((connp)); \
- mutex_exit(&(connfp)->connf_lock); \
+void
+ipcl_hash_insert_bound(connf_t *connfp, conn_t *connp)
+{
+ conn_t *pconnp, *nconnp;
+
+ IPCL_HASH_REMOVE(connp);
+ mutex_enter(&connfp->connf_lock);
+ nconnp = connfp->connf_head;
+ pconnp = NULL;
+ while (nconnp != NULL) {
+ /*
+ * Walk through entries associated with the fanout until one is
+ * found which fulfills any of these conditions:
+ * 1. Listen address of ADDR_ANY/ADDR_UNSPECIFIED
+ * 2. Listen port the same as connp
+ */
+ if (_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6) ||
+ connp->conn_lport == nconnp->conn_lport)
+ break;
+ pconnp = nconnp;
+ nconnp = nconnp->conn_next;
+ }
+ if (pconnp != NULL) {
+ pconnp->conn_next = connp;
+ connp->conn_prev = pconnp;
+ } else {
+ connfp->connf_head = connp;
+ }
+ if (nconnp != NULL) {
+ connp->conn_next = nconnp;
+ nconnp->conn_prev = connp;
+ }
+ connp->conn_fanout = connfp;
+ connp->conn_flags = (connp->conn_flags & ~IPCL_REMOVED) | IPCL_BOUND;
+ CONN_INC_REF(connp);
+ mutex_exit(&connfp->connf_lock);
}
void
ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp)
{
- IPCL_HASH_INSERT_WILDCARD(connfp, connp);
+ conn_t *pconnp = NULL, *nconnp;
+ boolean_t isv4mapped = IN6_IS_ADDR_V4MAPPED(&connp->conn_laddr_v6);
+
+ IPCL_HASH_REMOVE(connp);
+ mutex_enter(&connfp->connf_lock);
+ nconnp = connfp->connf_head;
+ pconnp = NULL;
+ while (nconnp != NULL) {
+ if (IN6_IS_ADDR_V4MAPPED_ANY(&nconnp->conn_laddr_v6) &&
+ isv4mapped && connp->conn_lport == nconnp->conn_lport)
+ break;
+ if (IN6_IS_ADDR_UNSPECIFIED(&nconnp->conn_laddr_v6) &&
+ (isv4mapped ||
+ connp->conn_lport == nconnp->conn_lport))
+ break;
+
+ pconnp = nconnp;
+ nconnp = nconnp->conn_next;
+ }
+ if (pconnp != NULL) {
+ pconnp->conn_next = connp;
+ connp->conn_prev = pconnp;
+ } else {
+ connfp->connf_head = connp;
+ }
+ if (nconnp != NULL) {
+ connp->conn_next = nconnp;
+ nconnp->conn_prev = connp;
+ }
+ connp->conn_fanout = connfp;
+ connp->conn_flags = (connp->conn_flags & ~IPCL_REMOVED) | IPCL_BOUND;
+ CONN_INC_REF(connp);
+ mutex_exit(&connfp->connf_lock);
}
/*
@@ -1037,9 +1062,9 @@ ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport)
IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) {
if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) ||
IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) {
- IPCL_HASH_INSERT_WILDCARD(connfp, connp);
+ ipcl_hash_insert_wildcard(connfp, connp);
} else {
- IPCL_HASH_INSERT_BOUND(connfp, connp);
+ ipcl_hash_insert_bound(connfp, connp);
}
} else {
IPCL_HASH_INSERT_CONNECTED(connfp, connp);
@@ -1208,9 +1233,9 @@ ipcl_bind_insert_v4(conn_t *connp)
if (connp->conn_faddr_v4 != INADDR_ANY) {
IPCL_HASH_INSERT_CONNECTED(connfp, connp);
} else if (connp->conn_laddr_v4 != INADDR_ANY) {
- IPCL_HASH_INSERT_BOUND(connfp, connp);
+ ipcl_hash_insert_bound(connfp, connp);
} else {
- IPCL_HASH_INSERT_WILDCARD(connfp, connp);
+ ipcl_hash_insert_wildcard(connfp, connp);
}
if (protocol == IPPROTO_RSVP)
ill_set_inputfn_all(ipst);
@@ -1222,9 +1247,9 @@ ipcl_bind_insert_v4(conn_t *connp)
connfp = &ipst->ips_ipcl_bind_fanout[
IPCL_BIND_HASH(lport, ipst)];
if (connp->conn_laddr_v4 != INADDR_ANY) {
- IPCL_HASH_INSERT_BOUND(connfp, connp);
+ ipcl_hash_insert_bound(connfp, connp);
} else {
- IPCL_HASH_INSERT_WILDCARD(connfp, connp);
+ ipcl_hash_insert_wildcard(connfp, connp);
}
if (cl_inet_listen != NULL) {
ASSERT(connp->conn_ipversion == IPV4_VERSION);
@@ -1274,9 +1299,9 @@ ipcl_bind_insert_v6(conn_t *connp)
if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
IPCL_HASH_INSERT_CONNECTED(connfp, connp);
} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
- IPCL_HASH_INSERT_BOUND(connfp, connp);
+ ipcl_hash_insert_bound(connfp, connp);
} else {
- IPCL_HASH_INSERT_WILDCARD(connfp, connp);
+ ipcl_hash_insert_wildcard(connfp, connp);
}
break;
@@ -1286,9 +1311,9 @@ ipcl_bind_insert_v6(conn_t *connp)
connfp = &ipst->ips_ipcl_bind_fanout[
IPCL_BIND_HASH(lport, ipst)];
if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
- IPCL_HASH_INSERT_BOUND(connfp, connp);
+ ipcl_hash_insert_bound(connfp, connp);
} else {
- IPCL_HASH_INSERT_WILDCARD(connfp, connp);
+ ipcl_hash_insert_wildcard(connfp, connp);
}
if (cl_inet_listen != NULL) {
sa_family_t addr_family;
@@ -1419,9 +1444,9 @@ ipcl_conn_insert_v4(conn_t *connp)
if (connp->conn_faddr_v4 != INADDR_ANY) {
IPCL_HASH_INSERT_CONNECTED(connfp, connp);
} else if (connp->conn_laddr_v4 != INADDR_ANY) {
- IPCL_HASH_INSERT_BOUND(connfp, connp);
+ ipcl_hash_insert_bound(connfp, connp);
} else {
- IPCL_HASH_INSERT_WILDCARD(connfp, connp);
+ ipcl_hash_insert_wildcard(connfp, connp);
}
break;
}
@@ -1507,9 +1532,9 @@ ipcl_conn_insert_v6(conn_t *connp)
if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) {
IPCL_HASH_INSERT_CONNECTED(connfp, connp);
} else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) {
- IPCL_HASH_INSERT_BOUND(connfp, connp);
+ ipcl_hash_insert_bound(connfp, connp);
} else {
- IPCL_HASH_INSERT_WILDCARD(connfp, connp);
+ ipcl_hash_insert_wildcard(connfp, connp);
}
break;
}
@@ -2095,6 +2120,7 @@ rawip_conn_constructor(void *buf, void *cdrarg, int kmflags)
connp->conn_flags = IPCL_RAWIPCONN;
connp->conn_proto = IPPROTO_ICMP;
icmp->icmp_connp = connp;
+ rw_init(&icmp->icmp_bpf_lock, NULL, RW_DEFAULT, NULL);
rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL);
connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags);
if (connp->conn_ixa == NULL)
@@ -2119,6 +2145,7 @@ rawip_conn_destructor(void *buf, void *cdrarg)
mutex_destroy(&connp->conn_lock);
cv_destroy(&connp->conn_cv);
rw_destroy(&connp->conn_ilg_lock);
+ rw_destroy(&icmp->icmp_bpf_lock);
/* Can be NULL if constructor failed */
if (connp->conn_ixa != NULL) {
diff --git a/usr/src/uts/common/inet/ipclassifier.h b/usr/src/uts/common/inet/ipclassifier.h
index 89968826b3..70cff374a4 100644
--- a/usr/src/uts/common/inet/ipclassifier.h
+++ b/usr/src/uts/common/inet/ipclassifier.h
@@ -21,6 +21,7 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2015 Joyent, Inc.
*/
/*
@@ -299,7 +300,8 @@ struct conn_s {
conn_ipv6_recvpathmtu : 1, /* IPV6_RECVPATHMTU */
conn_mcbc_bind : 1, /* Bound to multi/broadcast */
- conn_pad_to_bit_31 : 12;
+ conn_reuseport : 1, /* SO_REUSEPORT state */
+ conn_pad_to_bit_31 : 11;
boolean_t conn_blocked; /* conn is flow-controlled */
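The new conn_reuseport bit carries SO_REUSEPORT state; together with the fanout-ordering rework in ipclassifier.c above, it lets several sockets bind the same local port. A minimal userland sketch (error handling elided):

#include <sys/socket.h>
#include <netinet/in.h>
#include <string.h>

/* Sketch: a listener that other processes may share via SO_REUSEPORT. */
static int
make_shared_listener(in_port_t port)
{
	struct sockaddr_in sin;
	int on = 1;
	int s = socket(AF_INET, SOCK_STREAM, 0);

	(void) setsockopt(s, SOL_SOCKET, SO_REUSEPORT, &on, sizeof (on));
	(void) memset(&sin, 0, sizeof (sin));
	sin.sin_family = AF_INET;
	sin.sin_port = htons(port);
	sin.sin_addr.s_addr = htonl(INADDR_ANY);
	(void) bind(s, (struct sockaddr *)&sin, sizeof (sin));
	(void) listen(s, 128);
	return (s);
}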
diff --git a/usr/src/uts/common/inet/ipd/ipd.c b/usr/src/uts/common/inet/ipd/ipd.c
index 104603d840..22f2d79d24 100644
--- a/usr/src/uts/common/inet/ipd/ipd.c
+++ b/usr/src/uts/common/inet/ipd/ipd.c
@@ -9,7 +9,7 @@
* http://www.illumos.org/license/CDDL.
*/
/*
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2018, Joyent, Inc. All rights reserved.
*/
/*
@@ -222,7 +222,7 @@ typedef struct ipd_netstack {
net_handle_t ipdn_v6hdl; /* IPv6 net handle */
int ipdn_hooked; /* are hooks registered */
hook_t *ipdn_v4in; /* IPv4 traffic in hook */
- hook_t *ipdn_v4out; /* IPv4 traffice out hook */
+ hook_t *ipdn_v4out; /* IPv4 traffic out hook */
hook_t *ipdn_v6in; /* IPv6 traffic in hook */
hook_t *ipdn_v6out; /* IPv6 traffic out hook */
int ipdn_enabled; /* which perturbs are on */
@@ -613,7 +613,7 @@ ipd_toggle_delay(ipd_netstack_t *ins, uint32_t delay)
/*
* If ipd_check_hooks_failed, that must mean that we failed to set up
* the hooks, so we are going to effectively zero out and fail the
- * request to enable corruption.
+ * request to enable packet delays.
*/
if (rval != 0)
ins->ipdn_delay = 0;
diff --git a/usr/src/uts/common/inet/ipf/cfw.c b/usr/src/uts/common/inet/ipf/cfw.c
new file mode 100644
index 0000000000..941aeac328
--- /dev/null
+++ b/usr/src/uts/common/inet/ipf/cfw.c
@@ -0,0 +1,659 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2019, Joyent, Inc.
+ */
+
+/* IPF oddness for compilation in userland for IPF tests. */
+#if defined(KERNEL) || defined(_KERNEL)
+#undef KERNEL
+#undef _KERNEL
+#define KERNEL 1
+#define _KERNEL 1
+#endif
+
+#include <sys/errno.h>
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/time.h>
+#include <sys/socket.h>
+#include <net/if.h>
+#include <net/route.h>
+#include <netinet/in.h>
+#include <netinet/in_systm.h>
+#include <netinet/ip.h>
+#include <netinet/ip_var.h>
+#include <netinet/tcp.h>
+#include "netinet/ip_compat.h"
+#ifdef USE_INET6
+#include <netinet/icmp6.h>
+#endif
+#include <netinet/tcpip.h>
+#include "netinet/ip_fil.h"
+#include "netinet/ip_nat.h"
+#include "netinet/ip_frag.h"
+#include "netinet/ip_state.h"
+#include "netinet/ip_proxy.h"
+#include "netinet/ip_auth.h"
+#include "netinet/ipf_stack.h"
+#ifdef IPFILTER_SCAN
+#include "netinet/ip_scan.h"
+#endif
+#ifdef IPFILTER_SYNC
+#include "netinet/ip_sync.h"
+#endif
+#include "netinet/ip_pool.h"
+#include "netinet/ip_htable.h"
+#ifdef IPFILTER_COMPILED
+#include "netinet/ip_rules.h"
+#endif
+#if defined(_KERNEL)
+#include <sys/sunddi.h>
+#endif
+
+#include "netinet/ipf_cfw.h"
+#include <sys/file.h>
+#include <sys/uio.h>
+#include <sys/cred.h>
+#include <sys/ddi.h>
+
+/*
+ * cfw == Cloud Firewall ==> routines for a global-zone data collector about
+ * ipf events for SmartOS. The only events that CFW cares about are ones
+ * enforced by global-zone-controlled rulesets.
+ *
+ * The variable below is tied into the GZ-only ipf device /dev/ipfev, which
+ * flips this on when there is an open instance. This feature will also
+ * consume an fr_flag to provide per-rule granularity.
+ */
+boolean_t ipf_cfwlog_enabled;
+
+/*
+ * Because ipf's test tools in $SRC/cmd insert all of these files, we need to
+ * stub out what we can vs. drag in even more headers and who knows what else.
+ */
+#ifdef _KERNEL
+
+/*
+ * CFW event ring buffer. Remember, this is for ALL ZONES because only a
+ * global-zone event-reader will be consuming these. In other words, it's
+ * not something to instantiate per-netstack.
+ *
+ * We may want to get more sophisticated and performant (e.g. per-processor),
+ * but for now keep the ring buffer simple and stupid.
+ * Must be a power of 2, to be bitmaskable, and must be countable by a uint_t.
+ *
+ * Resizeable, see ipf_cfw_ring_resize() below.
+ */
+#define IPF_CFW_DEFAULT_RING_BUFS 1024
+#define IPF_CFW_MIN_RING_BUFS 8
+#define IPF_CFW_MAX_RING_BUFS (1U << 31U)
+
+/* Assume C's init-to-zero is sufficient for these types... */
+static kmutex_t cfw_ringlock;
+static kcondvar_t cfw_ringcv;
+
+static cfwev_t *cfw_ring; /* NULL by default. */
+static uint32_t cfw_ringsize; /* 0 by default, number of array elements. */
+static uint32_t cfw_ringmask; /* 0 by default. */
+
+/* If these are equal, we're either empty or full. */
+static uint_t cfw_ringstart, cfw_ringend;
+static boolean_t cfw_ringfull; /* Tell the difference here! */
+/* Bean-counters. */
+static uint64_t cfw_evreports;
+static uint64_t cfw_evdrops;
+
+/*
+ * Place an event in the CFW event ring buffer.
+ *
+ * For now, be simple and drop the oldest event if we overflow. We may wish to
+ * selectively drop older events based on type in the future.
+ */
+static void
+ipf_cfwev_report(cfwev_t *event)
+{
+ mutex_enter(&cfw_ringlock);
+ cfw_ring[cfw_ringend] = *event;
+ cfw_ringend++;
+ cfw_ringend &= cfw_ringmask;
+ if (cfw_ringfull) {
+ cfw_ringstart++;
+ cfw_ringstart &= cfw_ringmask;
+ ASSERT3U(cfw_ringstart, ==, cfw_ringend);
+ DTRACE_PROBE(ipf__cfw__evdrop);
+ cfw_evdrops++;
+ } else {
+ cfw_ringfull = (cfw_ringend == cfw_ringstart);
+ }
+ cfw_evreports++;
+ cv_broadcast(&cfw_ringcv);
+ mutex_exit(&cfw_ringlock);
+}
+
+/*
+ * Provide access to multiple CFW events that can allow copying straight from
+ * the ring buffer up to userland. Requires a callback (which could call
+ * uiomove() directly, OR to a local still-in-kernel buffer) that must do the
+ * data copying-out.
+ *
+ * Callback function is of the form:
+ *
+ * uint_t cfw_many_cb(cfwev_t *evptr, int num_avail, void *cbarg);
+ *
+ * The function must return how many events got consumed, which MUST be <= the
+ * number available. The function must ALSO UNDERSTAND that cfw_ringlock is
+ * held and must not be released during this time. The function may be called
+ * more than once, if the available buffers wrap-around OR "block" is set and
+ * we don't have enough buffers. If any callback returns 0, exit the function
+ * with however many were consumed.
+ *
+ * This function, like the callback, returns the number of events *CONSUMED*.
+ *
+ * . . .
+ *
+ * Tunables for ipf_cfwev_consume_many().
+ *
+ * If you wish to attempt to coalesce reads (to reduce the likelihood of one
+ * event at a time during high load) change the number of tries below to
+ * something not 0. Early experiments set this to 10.
+ *
+ * The wait between tries is in usecs in cfw_timeout_wait. The pessimal
+ * case for this is a timeout_wait-spaced trickle of one event at a time.
+ */
+uint_t cfw_timeout_tries = 0;
+uint_t cfw_timeout_wait = 10000; /* 10ms wait. */
+
+typedef struct uio_error_s {
+ struct uio *ue_uio;
+ int ue_error;
+} uio_error_t;
+
+static uint_t
+ipf_cfwev_consume_many(uint_t num_requested, boolean_t block,
+ cfwmanycb_t cfw_many_cb, void *cbarg)
+{
+ uint_t consumed = 0, cb_consumed, contig_size;
+ uint_t timeout_tries = cfw_timeout_tries;
+ boolean_t eintr = B_FALSE;
+
+ mutex_enter(&cfw_ringlock);
+
+ while (num_requested > 0) {
+ clock_t delta;
+
+ /* Silly reality checks */
+ ASSERT3U(cfw_ringstart, <, cfw_ringsize);
+ ASSERT3U(cfw_ringend, <, cfw_ringsize);
+
+ if (cfw_ringstart > cfw_ringend || cfw_ringfull) {
+ /* We have from ringstart to the buffer's end. */
+ contig_size = cfw_ringsize - cfw_ringstart;
+ } else if (cfw_ringstart < cfw_ringend) {
+ /* We have no potential wrapping at this time. */
+ contig_size = cfw_ringend - cfw_ringstart;
+ } else if (block && cv_wait_sig(&cfw_ringcv, &cfw_ringlock)) {
+ /* Maybe something to consume now, try again. */
+ continue;
+ } else {
+ /* Nothing (more) to consume, return! */
+ eintr = (block && consumed == 0);
+ break;
+ }
+
+ /* Don't consume more than the caller requested. */
+ if (num_requested < contig_size)
+ contig_size = num_requested;
+
+ cb_consumed =
+ cfw_many_cb(&(cfw_ring[cfw_ringstart]), contig_size, cbarg);
+ ASSERT3U(cb_consumed, <=, contig_size);
+
+ cfw_ringstart += cb_consumed;
+ ASSERT3U(cfw_ringstart, <=, cfw_ringmask + 1);
+ cfw_ringstart &= cfw_ringmask; /* In case of wraparound. */
+ consumed += cb_consumed;
+ cfw_ringfull = (cfw_ringfull && cb_consumed == 0);
+ if (cb_consumed < contig_size) {
+ /*
+ * Callback returned less than given.
+ * This is likely a uio error, but we have
+ * something. Get out of here.
+ */
+ break;
+ }
+ ASSERT3U(cb_consumed, ==, contig_size);
+ num_requested -= contig_size;
+
+ if (num_requested == 0) {
+ /* All done! */
+ break;
+ }
+
+ if (cfw_ringstart != cfw_ringend) {
+ /*
+ * We wrapped around the end of the buffer, and
+ * we have more available to fill our request.
+ */
+ ASSERT0(cfw_ringstart);
+ ASSERT(!cfw_ringfull);
+ continue;
+ }
+
+ /*
+ * We obtained some of the events we requested, but not all,
+ * and the ring is now drained. Wait *a little* longer for more.
+ */
+ if (timeout_tries == 0)
+ break; /* Don't bother... */
+ delta = drv_usectohz(cfw_timeout_wait);
+ timeout_tries--;
+
+ switch (cv_reltimedwait_sig(&cfw_ringcv, &cfw_ringlock, delta,
+ TR_CLOCK_TICK)) {
+ case 0:
+ /*
+ * Received signal! Return what we have OR if we have
+ * nothing, EINTR.
+ */
+ DTRACE_PROBE1(ipf__cfw__timedsignal, int, consumed);
+ eintr = (consumed == 0);
+ num_requested = 0;
+ break;
+ case -1:
+ /* Time reached! Bail with what we got. */
+ DTRACE_PROBE(ipf__cfw__timedexpired);
+ num_requested = 0;
+ break;
+ default:
+ /* Aha! We've got more! */
+ DTRACE_PROBE(ipf__cfw__moredata);
+ break;
+ }
+ }
+
+ mutex_exit(&cfw_ringlock);
+ if (eintr)
+ ((uio_error_t *)cbarg)->ue_error = EINTR;
+ return (consumed);
+}
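For illustration, a minimal callback satisfying the contract documented above; count_cb and its counter argument are hypothetical, not part of this patch:

/*
 * Hypothetical cfwmanycb_t: tallies events without copying them out.
 * cfw_ringlock is held by ipf_cfwev_consume_many() across each call.
 */
static uint_t
count_cb(cfwev_t *evptr, uint_t num_avail, void *cbarg)
{
	uint_t *totalp = cbarg;

	ASSERT(MUTEX_HELD(&cfw_ringlock));
	*totalp += num_avail;
	return (num_avail);	/* Consume all offered; returning 0 aborts. */
}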
+
+/*
+ * SmartOS likes using the zone's debug id. Make sure we squirrel that away in
+ * the ipf netstack instance if it's not there.
+ */
+static inline zoneid_t
+ifs_to_did(ipf_stack_t *ifs)
+{
+ if (ifs->ifs_zone_did == 0) {
+ zone_t *zone;
+
+ /*
+		 * We can't get the zone_did at initialization time because
+		 * most zone data isn't readily available then, so cement the
+		 * did in place now.
+ */
+ VERIFY3U(ifs->ifs_zone, !=, GLOBAL_ZONEID);
+ zone = zone_find_by_id(ifs->ifs_zone);
+ if (zone != NULL) {
+ ifs->ifs_zone_did = zone->zone_did;
+ zone_rele(zone);
+ }
+ /* Else we are either in shutdown or something weirder. */
+ }
+ return (ifs->ifs_zone_did);
+}
+
+/*
+ * ipf_block_cfwlog()
+ *
+ * Called by fr_check(). Record drop events for the global-zone data
+ * collector. Use rest-of-ipf-style names for the parameters.
+ */
+void
+ipf_block_cfwlog(frentry_t *fr, fr_info_t *fin, ipf_stack_t *ifs)
+{
+ cfwev_t event = {0};
+
+ /*
+	 * We need a rule. If we somehow arrive without one, capture that
+	 * failure with dtrace on this function's entry:
+ * 'ipf_block_cfwlog:entry /arg0 == NULL/ { printf("GOTCHA!\n"); }'
+ */
+ if (fr == NULL)
+ return;
+
+ event.cfwev_type = CFWEV_BLOCK;
+ event.cfwev_length = sizeof (event);
+ /*
+ * IPF code elsewhere does the cheesy single-flag check, even though
+ * there are two flags in a rule (one for in, one for out).
+ */
+ event.cfwev_direction = (fr->fr_flags & FR_INQUE) ?
+ CFWDIR_IN : CFWDIR_OUT;
+
+ event.cfwev_protocol = fin->fin_p;
+ /*
+ * NOTE: fin_*port is in host/native order, and ICMP info is here too.
+ */
+ event.cfwev_sport = htons(fin->fin_sport);
+ event.cfwev_dport = htons(fin->fin_dport);
+
+ switch (fin->fin_v) {
+ case IPV4_VERSION:
+ IN6_INADDR_TO_V4MAPPED(&fin->fin_src, &event.cfwev_saddr);
+ IN6_INADDR_TO_V4MAPPED(&fin->fin_dst, &event.cfwev_daddr);
+ break;
+ case IPV6_VERSION:
+ event.cfwev_saddr = fin->fin_src6.in6;
+ event.cfwev_daddr = fin->fin_dst6.in6;
+ break;
+ default:
+ /* We should never reach here, but mark it if we do. */
+		DTRACE_PROBE1(ipf__cfw__frinfo__badipversion, fr_info_t *, fin);
+ return;
+ }
+
+ /*
+ * uniqtime() is what ipf's GETKTIME() uses.
+ * If cfwev_tstamp needs to be sourced from elsewhere, fix that here.
+ */
+ uniqtime(&event.cfwev_tstamp);
+ event.cfwev_zonedid = ifs_to_did(ifs);
+ event.cfwev_ruleid = fin->fin_rule;
+ memcpy(event.cfwev_ruleuuid, fr->fr_uuid, sizeof (uuid_t));
+
+ ipf_cfwev_report(&event);
+}
+
+/*
+ * ipf_log_cfwlog()
+ *
+ * Twin of ipstate_log(), but records state events for the global-zone data
+ * collector.
+ */
+void
+ipf_log_cfwlog(struct ipstate *is, uint_t type, ipf_stack_t *ifs)
+{
+ cfwev_t event = {0};
+
+ switch (type) {
+ case ISL_NEW:
+ case ISL_CLONE:
+ event.cfwev_type = CFWEV_BEGIN;
+ break;
+ case ISL_EXPIRE:
+ case ISL_FLUSH:
+ case ISL_REMOVE:
+ case ISL_KILLED:
+ case ISL_ORPHAN:
+ /*
+ * We don't care about session disappearances in CFW logging
+ * for now. (Possible future: CFWEV_END)
+ */
+ return;
+ default:
+ event.cfwev_type = CFWEV_BLOCK;
+ break;
+ }
+
+ /*
+ * IPF code elsewhere does the cheesy single-flag check, even though
+ * there are two flags in a rule (one for in, one for out). Follow
+ * suit here.
+ */
+ event.cfwev_length = sizeof (event);
+ ASSERT(is->is_rule != NULL);
+ event.cfwev_direction = (is->is_rule->fr_flags & FR_INQUE) ?
+ CFWDIR_IN : CFWDIR_OUT;
+ event.cfwev_protocol = is->is_p;
+ switch (is->is_p) {
+ case IPPROTO_TCP:
+ case IPPROTO_UDP:
+ /* NOTE: is_*port is in network order. */
+ event.cfwev_sport = is->is_sport;
+ event.cfwev_dport = is->is_dport;
+ break;
+ case IPPROTO_ICMP:
+ case IPPROTO_ICMPV6:
+ /* Scribble the ICMP type in sport... */
+ event.cfwev_sport = is->is_icmp.ici_type;
+ break;
+ /* Other protocols leave the event's port fields empty. */
+ }
+
+	switch (is->is_v) {
+ case IPV4_VERSION:
+ IN6_INADDR_TO_V4MAPPED(&is->is_src.in4, &event.cfwev_saddr);
+ IN6_INADDR_TO_V4MAPPED(&is->is_dst.in4, &event.cfwev_daddr);
+ break;
+ case IPV6_VERSION:
+ event.cfwev_saddr = is->is_src.in6;
+ event.cfwev_daddr = is->is_dst.in6;
+ break;
+ default:
+ /* Can't parse addresses if we don't know the version. Drop. */
+ DTRACE_PROBE1(ipf__cfw__ipstate__badipversion,
+ struct ipstate *, is);
+ return;
+ }
+
+ /*
+ * uniqtime() is what ipf's GETKTIME() uses.
+ * If cfwev_tstamp needs to be sourced from elsewhere, fix that here.
+ */
+ uniqtime(&event.cfwev_tstamp);
+ event.cfwev_zonedid = ifs_to_did(ifs);
+ event.cfwev_ruleid = is->is_rulen;
+ memcpy(event.cfwev_ruleuuid, is->is_uuid, sizeof (uuid_t));
+
+ ipf_cfwev_report(&event);
+}
+
+/*
+ * Callback routine we use for ipf_cfwev_consume_many().
+ * A return value of 0 indicates an error.
+ */
+static uint_t
+cfwlog_read_manycb(cfwev_t *evptr, uint_t num_avail, void *cbarg)
+{
+ uio_error_t *ue = (uio_error_t *)cbarg;
+
+ ASSERT(MUTEX_HELD(&cfw_ringlock));
+
+ if (ue->ue_error != 0)
+ return (0);
+
+ ue->ue_error = uiomove((caddr_t)evptr, num_avail * sizeof (*evptr),
+ UIO_READ, ue->ue_uio);
+ if (ue->ue_error != 0)
+ return (0);
+
+ return (num_avail);
+}
+
+/*
+ * Resize the CFW event ring buffer.
+ *
+ * The caller must ensure the new size is a power of 2 between
+ * IPF_CFW_{MIN,MAX}_RING_BUFS (inclusive) or the special values
+ * IPF_CFW_RING_ALLOCATE (first-time creation) or IPF_CFW_RING_DESTROY
+ * (netstack-unload destruction).
+ *
+ * Everything in the current ring will be destroyed (and reported as a drop)
+ * upon resize.
+ */
+int
+ipf_cfw_ring_resize(uint32_t newsize)
+{
+ ASSERT(MUTEX_HELD(&cfw_ringlock) || newsize == IPF_CFW_RING_ALLOCATE ||
+ newsize == IPF_CFW_RING_DESTROY);
+
+ if (newsize == IPF_CFW_RING_ALLOCATE) {
+ if (cfw_ring != NULL)
+ return (EBUSY);
+ newsize = IPF_CFW_DEFAULT_RING_BUFS;
+ /* Fall through to allocating a new ring buffer. */
+ } else {
+ /* We may be called during error cleanup, so be liberal here. */
+ if ((cfw_ring == NULL && newsize == IPF_CFW_RING_DESTROY) ||
+ newsize == cfw_ringsize) {
+ return (0);
+ }
+ kmem_free(cfw_ring, cfw_ringsize * sizeof (cfwev_t));
+ cfw_ring = NULL;
+ if (cfw_ringfull) {
+ cfw_evdrops += cfw_ringsize;
+ } else if (cfw_ringstart > cfw_ringend) {
+ cfw_evdrops += cfw_ringend +
+ (cfw_ringsize - cfw_ringstart);
+ } else {
+ cfw_evdrops += cfw_ringend - cfw_ringstart;
+ }
+ cfw_ringsize = cfw_ringmask = cfw_ringstart = cfw_ringend = 0;
+ cfw_ringfull = B_FALSE;
+
+ if (newsize == IPF_CFW_RING_DESTROY)
+ return (0);
+ /*
+ * Keep the reports & drops around because if we're just
+ * resizing, we need to know what we lost.
+ */
+ }
+
+ ASSERT(ISP2(newsize));
+ cfw_ring = kmem_alloc(newsize * sizeof (cfwev_t), KM_SLEEP);
+ /* KM_SLEEP means we always succeed. */
+ cfw_ringsize = newsize;
+ cfw_ringmask = cfw_ringsize - 1;
+
+ return (0);
+}
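The drop accounting above is the standard ring-occupancy computation; restated as a hypothetical helper (not in the patch) for clarity:

/* Hypothetical: number of events currently queued in the ring. */
static uint_t
cfw_ring_count(void)
{
	ASSERT(MUTEX_HELD(&cfw_ringlock));
	if (cfw_ringfull)
		return (cfw_ringsize);
	if (cfw_ringstart > cfw_ringend)
		return (cfw_ringend + (cfw_ringsize - cfw_ringstart));
	return (cfw_ringend - cfw_ringstart);
}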
+
+/*
+ * ioctl handler for /dev/ipfev. Only supports SIOCIPFCFWCFG (get data
+ * collector statistics and configuration), and SIOCIPFCFWNEWSZ (resize the
+ * event ring buffer).
+ */
+/* ARGSUSED */
+int
+ipf_cfwlog_ioctl(dev_t dev, int cmd, intptr_t data, int mode, cred_t *cp,
+ int *rp)
+{
+ ipfcfwcfg_t cfginfo;
+ int error;
+
+ if (cmd != SIOCIPFCFWCFG && cmd != SIOCIPFCFWNEWSZ)
+ return (EIO);
+
+ if (crgetzoneid(cp) != GLOBAL_ZONEID)
+ return (EACCES);
+
+ error = COPYIN((caddr_t)data, (caddr_t)&cfginfo, sizeof (cfginfo));
+ if (error != 0)
+ return (EFAULT);
+
+ cfginfo.ipfcfwc_maxevsize = sizeof (cfwev_t);
+ mutex_enter(&cfw_ringlock);
+ cfginfo.ipfcfwc_evreports = cfw_evreports;
+ if (cmd == SIOCIPFCFWNEWSZ) {
+ uint32_t newsize = cfginfo.ipfcfwc_evringsize;
+
+ /* Do ioctl parameter checking here, then call the resizer. */
+ if (newsize < IPF_CFW_MIN_RING_BUFS ||
+ newsize > IPF_CFW_MAX_RING_BUFS || !ISP2(newsize)) {
+ error = EINVAL;
+ } else {
+ error = ipf_cfw_ring_resize(cfginfo.ipfcfwc_evringsize);
+ }
+ } else {
+ error = 0;
+ }
+ /* Both cfw_evdrops and cfw_ringsize are affected by resize. */
+ cfginfo.ipfcfwc_evdrops = cfw_evdrops;
+ cfginfo.ipfcfwc_evringsize = cfw_ringsize;
+ mutex_exit(&cfw_ringlock);
+
+ if (error != 0)
+ return (error);
+
+ error = COPYOUT((caddr_t)&cfginfo, (caddr_t)data, sizeof (cfginfo));
+ if (error != 0)
+ return (EFAULT);
+
+ return (0);
+}
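A hedged userland sketch of driving the resize ioctl (error handling trimmed); the command and struct names come from ip_fil.h later in this change, while the flow itself is an assumption:

#include <sys/ioctl.h>
#include <fcntl.h>
#include <unistd.h>
#include <netinet/ip_fil.h>

int
cfw_resize_ring(int fd, uint32_t newsize)	/* fd: open /dev/ipfev */
{
	ipfcfwcfg_t cfg;

	cfg.ipfcfwc_evringsize = newsize;	/* Must be 2^N, in bounds. */
	if (ioctl(fd, SIOCIPFCFWNEWSZ, &cfg) != 0)
		return (-1);
	/* On return, cfg holds the new size and cumulative drop count. */
	return (0);
}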
+
+/*
+ * Send events up via /dev/ipfev reads. Will return only complete events.
+ */
+/* ARGSUSED */
+int
+ipf_cfwlog_read(dev_t dev, struct uio *uio, cred_t *cp)
+{
+ uint_t requested, consumed;
+ uio_error_t ue = {uio, 0};
+ boolean_t block;
+
+ if (uio->uio_resid == 0)
+ return (0);
+ if (uio->uio_resid < sizeof (cfwev_t))
+ return (EINVAL);
+
+ block = ((uio->uio_fmode & (FNDELAY | FNONBLOCK)) == 0);
+ requested = uio->uio_resid / sizeof (cfwev_t);
+
+ /*
+ * As stated earlier, ipf_cfwev_consume_many() takes a callback.
+ * The callback may be called multiple times before we return.
+ * The callback will execute uiomove().
+ */
+ consumed = ipf_cfwev_consume_many(requested, block, cfwlog_read_manycb,
+ &ue);
+ ASSERT3U(consumed, <=, requested);
+ if (!block && consumed == 0 && ue.ue_error == 0) {
+ /* No data available. */
+ ue.ue_error = EWOULDBLOCK;
+ } else if (ue.ue_error != 0 && ue.ue_error != EINTR) {
+ /*
+ * We had a problem that wasn't simply a
+ * case of cv_wait_sig() receiving a signal.
+ */
+ DTRACE_PROBE1(ipf__cfw__uiodiscard, int, consumed);
+ mutex_enter(&cfw_ringlock);
+ cfw_evdrops += consumed;
+ mutex_exit(&cfw_ringlock);
+ }
+ return (ue.ue_error);
+}
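Tying the read path together, a minimal sketch of a global-zone consumer of /dev/ipfev (64-bit only, per the timestamp caveat in ipf_cfw.h below; the output format is illustrative):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <netinet/ipf_cfw.h>

int
main(void)
{
	cfwev_t evs[32];
	ssize_t n, i;
	int fd = open("/dev/ipfev", O_RDONLY);	/* Blocking reads. */

	if (fd < 0)
		return (1);
	/* Only whole events are returned; a too-small buffer gets EINVAL. */
	while ((n = read(fd, evs, sizeof (evs))) > 0) {
		for (i = 0; i < n / (ssize_t)sizeof (cfwev_t); i++) {
			(void) printf("zone-did %d type %u\n",
			    evs[i].cfwev_zonedid,
			    (unsigned int)evs[i].cfwev_type);
		}
	}
	return (0);
}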
+
+#else /* _KERNEL */
+
+/* Blank stubs to satisfy userland's test compilations. */
+
+int
+ipf_cfw_ring_resize(uint32_t a)
+{
+ return (0);
+}
+
+void
+ipf_log_cfwlog(struct ipstate *a, uint_t b, ipf_stack_t *c)
+{
+}
+
+void
+ipf_block_cfwlog(frentry_t *a, fr_info_t *b, ipf_stack_t *c)
+{
+}
+
+#endif /* _KERNEL */
diff --git a/usr/src/uts/common/inet/ipf/fil.c b/usr/src/uts/common/inet/ipf/fil.c
index 78980be106..48fa6e7325 100644
--- a/usr/src/uts/common/inet/ipf/fil.c
+++ b/usr/src/uts/common/inet/ipf/fil.c
@@ -5,7 +5,7 @@
*
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
*
- * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
*/
#if defined(KERNEL) || defined(_KERNEL)
@@ -2588,6 +2588,9 @@ ipf_stack_t *ifs;
}
#endif
+ if (IFS_CFWLOG(ifs, fr) && FR_ISBLOCK(pass))
+ ipf_block_cfwlog(fr, fin, ifs);
+
/*
* The FI_STATE flag is cleared here so that calling fr_checkstate
* will work when called from inside of fr_fastroute. Although
diff --git a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c
index c9d5f03e13..0d34e0fce3 100644
--- a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c
+++ b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c
@@ -5,7 +5,7 @@
*
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
*
- * Copyright 2018 Joyent, Inc.
+ * Copyright 2019 Joyent, Inc.
*/
#if !defined(lint)
@@ -85,6 +85,14 @@ static int ipf_hook6_loop_out __P((hook_event_token_t, hook_data_t,
static int ipf_hook6_loop_in __P((hook_event_token_t, hook_data_t,
void *));
static int ipf_hook6 __P((hook_data_t, int, int, void *));
+static int ipf_hookvndl3v4_in __P((hook_event_token_t, hook_data_t,
+ void *));
+static int ipf_hookvndl3v6_in __P((hook_event_token_t, hook_data_t,
+ void *));
+static int ipf_hookvndl3v4_out __P((hook_event_token_t, hook_data_t,
+ void *));
+static int ipf_hookvndl3v6_out __P((hook_event_token_t, hook_data_t,
+ void *));
static int ipf_hookviona_in __P((hook_event_token_t, hook_data_t, void *));
static int ipf_hookviona_out __P((hook_event_token_t, hook_data_t,
@@ -116,7 +124,7 @@ u_long *ip_forwarding = NULL;
#endif
vmem_t *ipf_minor; /* minor number arena */
-void *ipf_state; /* DDI state */
+void *ipf_state; /* DDI state */
/*
* GZ-controlled and per-zone stacks:
@@ -141,28 +149,38 @@ void *ipf_state; /* DDI state */
*/
/* IPv4 hook names */
-char *hook4_nicevents = "ipfilter_hook4_nicevents";
-char *hook4_nicevents_gz = "ipfilter_hook4_nicevents_gz";
-char *hook4_in = "ipfilter_hook4_in";
-char *hook4_in_gz = "ipfilter_hook4_in_gz";
-char *hook4_out = "ipfilter_hook4_out";
-char *hook4_out_gz = "ipfilter_hook4_out_gz";
-char *hook4_loop_in = "ipfilter_hook4_loop_in";
-char *hook4_loop_in_gz = "ipfilter_hook4_loop_in_gz";
-char *hook4_loop_out = "ipfilter_hook4_loop_out";
-char *hook4_loop_out_gz = "ipfilter_hook4_loop_out_gz";
+char *hook4_nicevents = "ipfilter_hook4_nicevents";
+char *hook4_nicevents_gz = "ipfilter_hook4_nicevents_gz";
+char *hook4_in = "ipfilter_hook4_in";
+char *hook4_in_gz = "ipfilter_hook4_in_gz";
+char *hook4_out = "ipfilter_hook4_out";
+char *hook4_out_gz = "ipfilter_hook4_out_gz";
+char *hook4_loop_in = "ipfilter_hook4_loop_in";
+char *hook4_loop_in_gz = "ipfilter_hook4_loop_in_gz";
+char *hook4_loop_out = "ipfilter_hook4_loop_out";
+char *hook4_loop_out_gz = "ipfilter_hook4_loop_out_gz";
/* IPv6 hook names */
-char *hook6_nicevents = "ipfilter_hook6_nicevents";
-char *hook6_nicevents_gz = "ipfilter_hook6_nicevents_gz";
-char *hook6_in = "ipfilter_hook6_in";
-char *hook6_in_gz = "ipfilter_hook6_in_gz";
-char *hook6_out = "ipfilter_hook6_out";
-char *hook6_out_gz = "ipfilter_hook6_out_gz";
-char *hook6_loop_in = "ipfilter_hook6_loop_in";
-char *hook6_loop_in_gz = "ipfilter_hook6_loop_in_gz";
-char *hook6_loop_out = "ipfilter_hook6_loop_out";
-char *hook6_loop_out_gz = "ipfilter_hook6_loop_out_gz";
+char *hook6_nicevents = "ipfilter_hook6_nicevents";
+char *hook6_nicevents_gz = "ipfilter_hook6_nicevents_gz";
+char *hook6_in = "ipfilter_hook6_in";
+char *hook6_in_gz = "ipfilter_hook6_in_gz";
+char *hook6_out = "ipfilter_hook6_out";
+char *hook6_out_gz = "ipfilter_hook6_out_gz";
+char *hook6_loop_in = "ipfilter_hook6_loop_in";
+char *hook6_loop_in_gz = "ipfilter_hook6_loop_in_gz";
+char *hook6_loop_out = "ipfilter_hook6_loop_out";
+char *hook6_loop_out_gz = "ipfilter_hook6_loop_out_gz";
+
+/* vnd IPv4/v6 hook names */
+char *hook4_vnd_in = "ipfilter_hookvndl3v4_in";
+char *hook4_vnd_in_gz = "ipfilter_hookvndl3v4_in_gz";
+char *hook6_vnd_in = "ipfilter_hookvndl3v6_in";
+char *hook6_vnd_in_gz = "ipfilter_hookvndl3v6_in_gz";
+char *hook4_vnd_out = "ipfilter_hookvndl3v4_out";
+char *hook4_vnd_out_gz = "ipfilter_hookvndl3v4_out_gz";
+char *hook6_vnd_out = "ipfilter_hookvndl3v6_out";
+char *hook6_vnd_out_gz = "ipfilter_hookvndl3v6_out_gz";
/* viona hook names */
char *hook_viona_in = "ipfilter_hookviona_in";
@@ -170,6 +188,39 @@ char *hook_viona_in_gz = "ipfilter_hookviona_in_gz";
char *hook_viona_out = "ipfilter_hookviona_out";
char *hook_viona_out_gz = "ipfilter_hookviona_out_gz";
+/*
+ * For VIONA. The net_{instance,protocol}_notify_register() functions only
+ * deal with per-callback-function granularity. We need two wrapper functions
+ * for GZ-controlled and per-zone instances.
+ */
+static int
+ipf_hook_instance_notify_gz(hook_notify_cmd_t command, void *arg,
+ const char *netid, const char *dummy, const char *instance)
+{
+ return (ipf_hook_instance_notify(command, arg, netid, dummy, instance));
+}
+
+static int
+ipf_hook_instance_notify_ngz(hook_notify_cmd_t command, void *arg,
+ const char *netid, const char *dummy, const char *instance)
+{
+ return (ipf_hook_instance_notify(command, arg, netid, dummy, instance));
+}
+
+static int
+ipf_hook_protocol_notify_gz(hook_notify_cmd_t command, void *arg,
+ const char *name, const char *dummy, const char *he_name)
+{
+ return (ipf_hook_protocol_notify(command, arg, name, dummy, he_name));
+}
+
+static int
+ipf_hook_protocol_notify_ngz(hook_notify_cmd_t command, void *arg,
+ const char *name, const char *dummy, const char *he_name)
+{
+ return (ipf_hook_protocol_notify(command, arg, name, dummy, he_name));
+}
+
/* ------------------------------------------------------------------------ */
/* Function: ipldetach */
/* Returns: int - 0 == success, else error. */
@@ -267,10 +318,36 @@ ipf_stack_t *ifs;
}
/*
+ * Remove VND hooks
+ */
+ if (ifs->ifs_ipf_vndl3v4 != NULL) {
+ UNDO_HOOK(ifs_ipf_vndl3v4, ifs_hookvndl3v4_physical_in,
+ NH_PHYSICAL_IN, ifs_ipfhookvndl3v4_in);
+ UNDO_HOOK(ifs_ipf_vndl3v4, ifs_hookvndl3v4_physical_out,
+ NH_PHYSICAL_OUT, ifs_ipfhookvndl3v4_out);
+
+ if (net_protocol_release(ifs->ifs_ipf_vndl3v4) != 0)
+ goto detach_failed;
+ ifs->ifs_ipf_vndl3v4 = NULL;
+ }
+
+ if (ifs->ifs_ipf_vndl3v6 != NULL) {
+ UNDO_HOOK(ifs_ipf_vndl3v6, ifs_hookvndl3v6_physical_in,
+ NH_PHYSICAL_IN, ifs_ipfhookvndl3v6_in);
+ UNDO_HOOK(ifs_ipf_vndl3v6, ifs_hookvndl3v6_physical_out,
+ NH_PHYSICAL_OUT, ifs_ipfhookvndl3v6_out);
+
+ if (net_protocol_release(ifs->ifs_ipf_vndl3v6) != 0)
+ goto detach_failed;
+ ifs->ifs_ipf_vndl3v6 = NULL;
+ }
+
+ /*
* Remove notification of viona hooks
*/
net_instance_notify_unregister(ifs->ifs_netid,
- ipf_hook_instance_notify);
+ ifs->ifs_gz_controlled ? ipf_hook_instance_notify_gz :
+ ipf_hook_instance_notify_ngz);
#undef UNDO_HOOK
@@ -278,6 +355,10 @@ ipf_stack_t *ifs;
* Normally, viona will unregister itself before ipldetach() is called,
* so these will be no-ops, but out of caution, we try to make sure
* we've removed any of our references.
+ *
+ * For now, the _gz and _ngz versions are both wrappers to what's
+ * below. Just call it directly, but if that changes fix here as
+ * well.
*/
(void) ipf_hook_protocol_notify(HN_UNREGISTER, ifs, Hn_VIONA, NULL,
NH_PHYSICAL_IN);
@@ -295,6 +376,10 @@ ipf_stack_t *ifs;
* traced, we pass the same value the nethook framework would
* pass, even though the callback does not currently use the
* value.
+ *
+ * For now, the _gz and _ngz versions are both wrappers to
+ * what's below. Just call it directly, but if that changes
+ * fix here as well.
*/
(void) ipf_hook_instance_notify(HN_UNREGISTER, ifs, netidstr,
NULL, Hn_VIONA);
@@ -495,6 +580,49 @@ ipf_stack_t *ifs;
}
/*
+ * Add VND INET hooks
+ */
+ ifs->ifs_ipf_vndl3v4 = net_protocol_lookup(id, NHF_VND_INET);
+ if (ifs->ifs_ipf_vndl3v4 == NULL)
+ goto hookup_failed;
+
+ HOOK_INIT_GZ_BEFORE(ifs->ifs_ipfhookvndl3v4_in, ipf_hookvndl3v4_in,
+ hook4_vnd_in, hook4_vnd_in_gz, ifs);
+ HOOK_INIT_GZ_AFTER(ifs->ifs_ipfhookvndl3v4_out, ipf_hookvndl3v4_out,
+ hook4_vnd_out, hook4_vnd_out_gz, ifs);
+ ifs->ifs_hookvndl3v4_physical_in = (net_hook_register(ifs->ifs_ipf_vndl3v4,
+ NH_PHYSICAL_IN, ifs->ifs_ipfhookvndl3v4_in) == 0);
+ if (!ifs->ifs_hookvndl3v4_physical_in)
+ goto hookup_failed;
+
+ ifs->ifs_hookvndl3v4_physical_out = (net_hook_register(ifs->ifs_ipf_vndl3v4,
+ NH_PHYSICAL_OUT, ifs->ifs_ipfhookvndl3v4_out) == 0);
+ if (!ifs->ifs_hookvndl3v4_physical_out)
+ goto hookup_failed;
+
+
+ /*
+ * VND INET6 hooks
+ */
+ ifs->ifs_ipf_vndl3v6 = net_protocol_lookup(id, NHF_VND_INET6);
+ if (ifs->ifs_ipf_vndl3v6 == NULL)
+ goto hookup_failed;
+
+ HOOK_INIT_GZ_BEFORE(ifs->ifs_ipfhookvndl3v6_in, ipf_hookvndl3v6_in,
+ hook6_vnd_in, hook6_vnd_in_gz, ifs);
+ HOOK_INIT_GZ_AFTER(ifs->ifs_ipfhookvndl3v6_out, ipf_hookvndl3v6_out,
+ hook6_vnd_out, hook6_vnd_out_gz, ifs);
+ ifs->ifs_hookvndl3v6_physical_in = (net_hook_register(ifs->ifs_ipf_vndl3v6,
+ NH_PHYSICAL_IN, ifs->ifs_ipfhookvndl3v6_in) == 0);
+ if (!ifs->ifs_hookvndl3v6_physical_in)
+ goto hookup_failed;
+
+ ifs->ifs_hookvndl3v6_physical_out = (net_hook_register(ifs->ifs_ipf_vndl3v6,
+ NH_PHYSICAL_OUT, ifs->ifs_ipfhookvndl3v6_out) == 0);
+ if (!ifs->ifs_hookvndl3v6_physical_out)
+ goto hookup_failed;
+
+ /*
* VIONA INET hooks. While the nethook framework allows us to register
* hooks for events that haven't been registered yet, we instead
* register and unregister our hooks in response to notifications
@@ -504,9 +632,15 @@ ipf_stack_t *ifs;
* is unloaded, the viona module cannot later re-register them if it
* gets reloaded. As the ip, vnd, and ipf modules are rarely unloaded
* even on DEBUG kernels, they do not experience this issue.
+ *
+ * Today, the per-zone ones don't matter for a BHYVE-branded zone, BUT
+ * the ipf_hook_protocol_notify() function is GZ vs. per-zone aware.
+	 * Employ two different versions of ipf_hook_instance_notify(): one
+	 * for GZ-controlled instances, and one for per-zone ones.
*/
- if (net_instance_notify_register(id, ipf_hook_instance_notify,
- ifs) != 0)
+ if (net_instance_notify_register(id, ifs->ifs_gz_controlled ?
+ ipf_hook_instance_notify_gz : ipf_hook_instance_notify_ngz, ifs) !=
+ 0)
goto hookup_failed;
/*
@@ -688,6 +822,7 @@ ipf_hook_instance_notify(hook_notify_cmd_t command, void *arg,
{
ipf_stack_t *ifs = arg;
int ret = 0;
+ const boolean_t gz = ifs->ifs_gz_controlled;
/* We currently only care about viona hooks */
if (strcmp(instance, Hn_VIONA) != 0)
@@ -705,14 +840,16 @@ ipf_hook_instance_notify(hook_notify_cmd_t command, void *arg,
return (EPROTONOSUPPORT);
ret = net_protocol_notify_register(ifs->ifs_ipf_viona,
- ipf_hook_protocol_notify, ifs);
+ gz ? ipf_hook_protocol_notify_gz :
+ ipf_hook_protocol_notify_ngz, ifs);
VERIFY(ret == 0 || ret == ESHUTDOWN);
break;
case HN_UNREGISTER:
if (ifs->ifs_ipf_viona == NULL)
break;
VERIFY0(net_protocol_notify_unregister(ifs->ifs_ipf_viona,
- ipf_hook_protocol_notify));
+ gz ? ipf_hook_protocol_notify_gz :
+ ipf_hook_protocol_notify_ngz));
VERIFY0(net_protocol_release(ifs->ifs_ipf_viona));
ifs->ifs_ipf_viona = NULL;
break;
@@ -821,6 +958,9 @@ int *rp;
return ENXIO;
unit = isp->ipfs_minor;
+ if (unit == IPL_LOGEV)
+ return (ipf_cfwlog_ioctl(dev, cmd, data, mode, cp, rp));
+
zid = crgetzoneid(cp);
if (cmd == SIOCIPFZONESET) {
if (zid == GLOBAL_ZONEID)
@@ -1129,14 +1269,14 @@ ipf_stack_t *ifs;
{
net_handle_t nif;
- if (v == 4)
- nif = ifs->ifs_ipf_ipv4;
- else if (v == 6)
- nif = ifs->ifs_ipf_ipv6;
- else
- return 0;
-
- return (net_phylookup(nif, name));
+ if (v == 4)
+ nif = ifs->ifs_ipf_ipv4;
+ else if (v == 6)
+ nif = ifs->ifs_ipf_ipv6;
+ else
+ return 0;
+
+ return (net_phylookup(nif, name));
}
/*
@@ -1161,11 +1301,35 @@ cred_t *cred;
if (IPL_LOGMAX < min)
return ENXIO;
+ /* Special-case ipfev: global-zone-open only. */
+ if (min == IPL_LOGEV) {
+ if (crgetzoneid(cred) != GLOBAL_ZONEID)
+ return (ENXIO);
+ /*
+ * Else enable the CFW logging of events.
+ * NOTE: For now, we only allow one open at a time.
+ * Use atomic_cas to confirm/deny. And also for now,
+ * assume sizeof (boolean_t) == sizeof (uint_t).
+ *
+ * Per the *_{refrele,REFRELE}() in other parts of inet,
+ * ensure all loads/stores complete before calling cas.
+ * membar_exit() does this.
+ */
+ membar_exit();
+ if (atomic_cas_uint(&ipf_cfwlog_enabled, 0, 1) != 0)
+ return (EBUSY);
+ }
+
minor = (minor_t)(uintptr_t)vmem_alloc(ipf_minor, 1,
VM_BESTFIT | VM_SLEEP);
if (ddi_soft_state_zalloc(ipf_state, minor) != 0) {
vmem_free(ipf_minor, (void *)(uintptr_t)minor, 1);
+ if (min == IPL_LOGEV) {
+ /* See above... */
+ membar_exit();
+ VERIFY(atomic_cas_uint(&ipf_cfwlog_enabled, 1, 0) == 1);
+ }
return ENXIO;
}
@@ -1187,6 +1351,7 @@ int flags, otype;
cred_t *cred;
{
minor_t min = getminor(dev);
+ ipf_devstate_t *isp;
#ifdef IPFDEBUG
cmn_err(CE_CONT, "iplclose(%x,%x,%x,%x)\n", dev, flags, otype, cred);
@@ -1195,6 +1360,15 @@ cred_t *cred;
if (IPL_LOGMAX < min)
return ENXIO;
+ isp = ddi_get_soft_state(ipf_state, min);
+ if (isp != NULL && isp->ipfs_minor == IPL_LOGEV) {
+ /*
+ * Disable CFW logging. See iplopen() for details.
+ */
+ membar_exit();
+ VERIFY(atomic_cas_uint(&ipf_cfwlog_enabled, 1, 0) == 1);
+ }
+
ddi_soft_state_free(ipf_state, min);
vmem_free(ipf_minor, (void *)(uintptr_t)min, 1);
@@ -1225,6 +1399,8 @@ cred_t *cp;
return ENXIO;
unit = isp->ipfs_minor;
+ if (unit == IPL_LOGEV)
+ return (ipf_cfwlog_read(dev, uio, cp));
/*
* ipf_find_stack returns with a read lock on ifs_ipf_global
@@ -1277,6 +1453,9 @@ cred_t *cp;
return ENXIO;
unit = isp->ipfs_minor;
+ if (unit == IPL_LOGEV)
+ return (EIO); /* ipfev doesn't support write yet. */
+
/*
* ipf_find_stack returns with a read lock on ifs_ipf_global
*/
@@ -2068,8 +2247,11 @@ frdest_t *fdp;
return (-1);
}
- /* Check the src here, fin_ifp is the src interface. */
- if (!(fin->fin_flx & FI_GENERATED) &&
+ /*
+ * If we're forwarding (vs. injecting), check the src here, fin_ifp is
+ * the src interface.
+ */
+ if (fdp != NULL && !(fin->fin_flx & FI_GENERATED) &&
!fr_forwarding_enabled((phy_if_t)fin->fin_ifp, net_data_p)) {
return (-1);
}
@@ -2138,8 +2320,8 @@ frdest_t *fdp;
inj->ni_physical = net_routeto(net_data_p, sinp, NULL);
}
- /* we're checking the destination here */
- if (!(fin->fin_flx & FI_GENERATED) &&
+	/* If we're forwarding (vs. injecting), check the destination here. */
+ if (fdp != NULL && !(fin->fin_flx & FI_GENERATED) &&
!fr_forwarding_enabled(inj->ni_physical, net_data_p)) {
goto bad_fastroute;
}
@@ -2355,6 +2537,42 @@ int ipf_hook_ether(hook_event_token_t token, hook_data_t info, void *arg,
}
/* ------------------------------------------------------------------------ */
+/* Function: ipf_hookvndl3_in */
+/* Returns: int - 0 == packet ok, else problem, free packet if not done */
+/* Parameters: event(I) - pointer to event */
+/* info(I) - pointer to hook information for firewalling */
+/* */
+/* The vnd hooks are private hooks to ON. They represent a layer 2 */
+/* datapath generally used to implement virtual machines. The driver sends */
+/* along L3 packets of either type IP or IPv6. The ethertype to distinguish */
+/* them is in the upper 16 bits while the remaining bits are the */
+/* traditional packet hook flags. */
+/* */
+/* They end up calling the appropriate traditional ip hooks. */
+/* ------------------------------------------------------------------------ */
+/*ARGSUSED*/
+int ipf_hookvndl3v4_in(hook_event_token_t token, hook_data_t info, void *arg)
+{
+ return ipf_hook4_in(token, info, arg);
+}
+
+int ipf_hookvndl3v6_in(hook_event_token_t token, hook_data_t info, void *arg)
+{
+ return ipf_hook6_in(token, info, arg);
+}
+
+/*ARGSUSED*/
+int ipf_hookvndl3v4_out(hook_event_token_t token, hook_data_t info, void *arg)
+{
+ return ipf_hook4_out(token, info, arg);
+}
+
+int ipf_hookvndl3v6_out(hook_event_token_t token, hook_data_t info, void *arg)
+{
+ return ipf_hook6_out(token, info, arg);
+}
+
+/* ------------------------------------------------------------------------ */
/* Function: ipf_hookviona_{in,out} */
/* Returns: int - 0 == packet ok, else problem, free packet if not done */
/* Parameters: event(I) - pointer to event */
@@ -3120,16 +3338,16 @@ fr_info_t *fin;
/* both IP versions. The details are going to be explained here. */
/* */
/* The packet looks as follows: */
-/* xxx | IP hdr | IP payload ... | */
-/* ^ ^ ^ ^ */
-/* | | | | */
+/* xxx | IP hdr | IP payload ... | */
+/* ^ ^ ^ ^ */
+/* | | | | */
/* | | | fin_m->b_wptr = fin->fin_dp + fin->fin_dlen */
/* | | | */
/* | | `- fin_m->fin_dp (in case of IPv4 points to L4 header) */
/* | | */
/* | `- fin_m->b_rptr + fin_ipoff (fin_ipoff is most likely 0 in case */
/* | of loopback) */
-/* | */
+/* | */
/* `- fin_m->b_rptr - points to L2 header in case of physical NIC */
/* */
/* All relevant IP headers are pulled up into the first mblk. It happened */
diff --git a/usr/src/uts/common/inet/ipf/ip_log.c b/usr/src/uts/common/inet/ipf/ip_log.c
index 584ee42d9a..b70e320def 100644
--- a/usr/src/uts/common/inet/ipf/ip_log.c
+++ b/usr/src/uts/common/inet/ipf/ip_log.c
@@ -8,7 +8,7 @@
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*
- * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
*/
#include <sys/param.h>
@@ -373,9 +373,11 @@ u_int flags;
if (fin->fin_fr != NULL) {
ipfl.fl_loglevel = fin->fin_fr->fr_loglevel;
ipfl.fl_logtag = fin->fin_fr->fr_logtag;
+ bcopy(fin->fin_fr->fr_uuid, ipfl.fl_uuid, sizeof (uuid_t));
} else {
ipfl.fl_loglevel = 0xffff;
ipfl.fl_logtag = FR_NOLOGTAG;
+ bzero(ipfl.fl_uuid, sizeof (uuid_t));
}
if (fin->fin_nattag != NULL)
bcopy(fin->fin_nattag, (void *)&ipfl.fl_nattag,
diff --git a/usr/src/uts/common/inet/ipf/ip_state.c b/usr/src/uts/common/inet/ipf/ip_state.c
index 184f8775b6..a45bcbfdaf 100644
--- a/usr/src/uts/common/inet/ipf/ip_state.c
+++ b/usr/src/uts/common/inet/ipf/ip_state.c
@@ -5,7 +5,7 @@
*
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
*
- * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
*/
#if defined(KERNEL) || defined(_KERNEL)
@@ -108,6 +108,7 @@ struct file;
# include <sys/systm.h>
# endif
#endif
+#include <sys/uuid.h>
/* END OF INCLUDES */
@@ -1445,6 +1446,7 @@ u_int flags;
is->is_sti.tqe_flags |= TQE_RULEBASED;
}
is->is_tag = fr->fr_logtag;
+ memcpy(is->is_uuid, fr->fr_uuid, sizeof (uuid_t));
is->is_ifp[(out << 1) + 1] = fr->fr_ifas[1];
is->is_ifp[(1 - out) << 1] = fr->fr_ifas[2];
@@ -1524,6 +1526,9 @@ u_int flags;
if (ifs->ifs_ipstate_logging)
ipstate_log(is, ISL_NEW, ifs);
+ if (IFS_CFWLOG(ifs, is->is_rule))
+ ipf_log_cfwlog(is, ISL_NEW, ifs);
+
RWLOCK_EXIT(&ifs->ifs_ipf_state);
fin->fin_rev = IP6_NEQ(&is->is_dst, &fin->fin_daddr);
fin->fin_flx |= FI_STATE;
@@ -2314,6 +2319,8 @@ u_32_t cmask;
is->is_flags &= ~(SI_W_SPORT|SI_W_DPORT);
if ((flags & SI_CLONED) && ifs->ifs_ipstate_logging)
ipstate_log(is, ISL_CLONE, ifs);
+ if ((flags & SI_CLONED) && IFS_CFWLOG(ifs, is->is_rule))
+ ipf_log_cfwlog(is, ISL_CLONE, ifs);
}
ret = -1;
@@ -3397,6 +3404,15 @@ ipf_stack_t *ifs;
if (ifs->ifs_ipstate_logging != 0 && why != 0)
ipstate_log(is, why, ifs);
+ /*
+ * For now, ipf_log_cfwlog() copes with all "why" values. Strictly
+ * speaking, though, they all map to one event (CFWEV_END), which for
+ * now is not supported, hence no code calling ipf_log_cfwlog() like
+ * below:
+ *
+ * if (why != 0 && IFS_CFWLOG(ifs, is->is_rule))
+ * ipf_log_cfwlog(is, why, ifs);
+ */
if (is->is_rule != NULL) {
is->is_rule->fr_statecnt--;
@@ -3931,7 +3947,6 @@ int flags;
return rval;
}
-
/* ------------------------------------------------------------------------ */
/* Function: ipstate_log */
/* Returns: Nil */
diff --git a/usr/src/uts/common/inet/ipf/ipf.conf b/usr/src/uts/common/inet/ipf/ipf.conf
index 6b36f9fdbf..f49e024a72 100644
--- a/usr/src/uts/common/inet/ipf/ipf.conf
+++ b/usr/src/uts/common/inet/ipf/ipf.conf
@@ -1,3 +1,8 @@
#
#
name="ipf" parent="pseudo" instance=0;
+
+# Increase the state table limits. fr_statemax should be ~70% of fr_statesize,
+# and both should be prime numbers.
+fr_statesize=151007;
+fr_statemax=113279;
diff --git a/usr/src/uts/common/inet/ipf/netinet/Makefile b/usr/src/uts/common/inet/ipf/netinet/Makefile
index cca3b48ac4..88f91e633f 100644
--- a/usr/src/uts/common/inet/ipf/netinet/Makefile
+++ b/usr/src/uts/common/inet/ipf/netinet/Makefile
@@ -1,16 +1,15 @@
#
-#ident "%Z%%M% %I% %E% SMI"
-#
# Copyright 2008 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
+# Copyright 2019 Joyent, Inc.
#
# uts/common/inet/ipf/netinet/Makefile
#
# include global definitions
include ../../../../../Makefile.master
-HDRS= ipl.h ip_compat.h ip_fil.h ip_nat.h ip_proxy.h ip_state.h \
- ip_frag.h ip_auth.h ip_lookup.h ip_pool.h ip_htable.h ipf_stack.h
+HDRS= ipl.h ip_compat.h ip_fil.h ip_nat.h ip_proxy.h ip_state.h ip_frag.h \
+ ip_auth.h ip_lookup.h ip_pool.h ip_htable.h ipf_stack.h ipf_cfw.h
ROOTDIRS= $(ROOT)/usr/include/netinet
diff --git a/usr/src/uts/common/inet/ipf/netinet/ip_fil.h b/usr/src/uts/common/inet/ipf/netinet/ip_fil.h
index 4c3c5683b5..bb5ce7bd6c 100644
--- a/usr/src/uts/common/inet/ipf/netinet/ip_fil.h
+++ b/usr/src/uts/common/inet/ipf/netinet/ip_fil.h
@@ -8,7 +8,7 @@
*
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
*
- * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright 2019, Joyent, Inc.
*/
#ifndef __IP_FIL_H__
@@ -16,6 +16,7 @@
#include "netinet/ip_compat.h"
#include <sys/zone.h>
+#include <sys/uuid.h>
#ifdef SOLARIS
#undef SOLARIS
@@ -115,6 +116,8 @@
#define SIOCDELFR SIOCRMAFR
#define SIOCINSFR SIOCINAFR
# define SIOCIPFZONESET _IOWR('r', 97, struct ipfzoneobj)
+# define SIOCIPFCFWCFG _IOR('r', 98, struct ipfcfwcfg)
+# define SIOCIPFCFWNEWSZ _IOWR('r', 99, struct ipfcfwcfg)
/*
* What type of table is getting flushed?
@@ -600,6 +603,7 @@ typedef struct frentry {
u_32_t fr_flags; /* per-rule flags && options (see below) */
u_32_t fr_logtag; /* user defined log tag # */
u_32_t fr_collect; /* collection number */
+ uuid_t fr_uuid; /* user defined uuid */
u_int fr_arg; /* misc. numeric arg for rule */
u_int fr_loglevel; /* syslog log facility + priority */
u_int fr_age[2]; /* non-TCP timeouts */
@@ -728,6 +732,7 @@ typedef struct frentry {
#define FR_NEWISN 0x400000 /* new ISN for outgoing TCP */
#define FR_NOICMPERR 0x800000 /* do not match ICMP errors in state */
#define FR_STATESYNC 0x1000000 /* synchronize state to slave */
+#define FR_CFWLOG 0x2000000 /* Global CFW logging enabled */
#define FR_NOMATCH 0x8000000 /* no match occurred */
/* 0x10000000 FF_LOGPASS */
/* 0x20000000 FF_LOGBLOCK */
@@ -883,6 +888,7 @@ typedef struct ipflog {
u_32_t fl_lflags;
u_32_t fl_logtag;
ipftag_t fl_nattag;
+ uuid_t fl_uuid;
u_short fl_plen; /* extra data after hlen */
u_short fl_loglevel; /* syslog log level */
char fl_group[FR_GROUPLEN];
@@ -931,6 +937,7 @@ typedef struct ipflog {
#define IPSYNC_NAME "/dev/ipsync"
#define IPSCAN_NAME "/dev/ipscan"
#define IPLOOKUP_NAME "/dev/iplookup"
+#define IPFEV_NAME "/dev/ipfev"
#define IPL_LOGIPF 0 /* Minor device #'s for accessing logs */
#define IPL_LOGNAT 1
@@ -939,8 +946,9 @@ typedef struct ipflog {
#define IPL_LOGSYNC 4
#define IPL_LOGSCAN 5
#define IPL_LOGLOOKUP 6
-#define IPL_LOGCOUNT 7
-#define IPL_LOGMAX 7
+#define IPL_LOGEV 7
+#define IPL_LOGCOUNT 8
+#define IPL_LOGMAX 8
#define IPL_LOGSIZE (IPL_LOGMAX + 1)
#define IPL_LOGALL -1
#define IPL_LOGNONE -2
@@ -1181,6 +1189,21 @@ typedef struct ipfzoneobj {
char ipfz_zonename[ZONENAME_MAX]; /* zone to act on */
} ipfzoneobj_t;
+/* ioctl to grab CFW logging parameters */
+typedef struct ipfcfwcfg {
+ /* CFG => Max event size, NEWSZ => ignored in, like CFG out. */
+ uint32_t ipfcfwc_maxevsize;
+ /*
+ * CFG => Current ring size,
+ * NEWSZ => New ring size, must be 2^N for 3 <= N <= 31.
+ */
+ uint32_t ipfcfwc_evringsize;
+ /* CFG => Number of event reports, NEWSZ => ignored in, like CFG out. */
+ uint64_t ipfcfwc_evreports;
+ /* CFG => Number of event drops, NEWSZ => ignored in, like CFG out. */
+ uint64_t ipfcfwc_evdrops;
+} ipfcfwcfg_t;
+
#if defined(_KERNEL)
/* Set ipfs_zoneid to this if no zone has been set: */
#define IPFS_ZONE_UNSET -2
@@ -1560,6 +1583,23 @@ extern int ipllog __P((int, fr_info_t *, void **, size_t *, int *, int,
ipf_stack_t *));
extern void fr_logunload __P((ipf_stack_t *));
+/* SmartOS single-FD global-zone state accumulator (see cfw.c) */
+extern boolean_t ipf_cfwlog_enabled;
+struct ipstate; /* Ugggh. */
+extern void ipf_log_cfwlog __P((struct ipstate *, uint_t, ipf_stack_t *));
+extern void ipf_block_cfwlog __P((frentry_t *, fr_info_t *, ipf_stack_t *));
+#define IFS_CFWLOG(ifs, fr) ((ifs)->ifs_gz_controlled && ipf_cfwlog_enabled &&\
+ fr != NULL && ((fr)->fr_flags & FR_CFWLOG))
+struct cfwev_s; /* See ipf_cfw.h */
+extern boolean_t ipf_cfwev_consume __P((struct cfwev_s *, boolean_t));
+/* See cfw.c's ipf_cfwev_consume_many() for details. */
+typedef uint_t (*cfwmanycb_t) __P((struct cfwev_s *, uint_t, void *));
+extern int ipf_cfwlog_read __P((dev_t, struct uio *, struct cred *));
+extern int ipf_cfwlog_ioctl __P((dev_t, int, intptr_t, int, cred_t *, int *));
+#define IPF_CFW_RING_ALLOCATE 0
+#define IPF_CFW_RING_DESTROY 1
+extern int ipf_cfw_ring_resize(uint32_t);
+
extern frentry_t *fr_acctpkt __P((fr_info_t *, u_32_t *));
extern int fr_copytolog __P((int, char *, int));
extern u_short fr_cksum __P((mb_t *, ip_t *, int, void *));
diff --git a/usr/src/uts/common/inet/ipf/netinet/ip_state.h b/usr/src/uts/common/inet/ipf/netinet/ip_state.h
index 4c605c1b89..ef315d5ef1 100644
--- a/usr/src/uts/common/inet/ipf/netinet/ip_state.h
+++ b/usr/src/uts/common/inet/ipf/netinet/ip_state.h
@@ -8,11 +8,14 @@
*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright 2019 Joyent, Inc.
*/
#ifndef __IP_STATE_H__
#define __IP_STATE_H__
+#include <sys/uuid.h>
+
#if defined(__STDC__) || defined(__GNUC__) || defined(_AIX51)
# define SIOCDELST _IOW('r', 61, struct ipfobj)
#else
@@ -66,6 +69,7 @@ typedef struct ipstate {
/* in both directions */
u_32_t is_optmsk[2]; /* " " mask */
/* in both directions */
+ uuid_t is_uuid;
u_short is_sec; /* security options set */
u_short is_secmsk; /* " " mask */
u_short is_auth; /* authentication options set */
diff --git a/usr/src/uts/common/inet/ipf/netinet/ipf_cfw.h b/usr/src/uts/common/inet/ipf/netinet/ipf_cfw.h
new file mode 100644
index 0000000000..1972d2b3f7
--- /dev/null
+++ b/usr/src/uts/common/inet/ipf/netinet/ipf_cfw.h
@@ -0,0 +1,69 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2019 Joyent, Inc.
+ */
+
+#ifndef __IPF_CFW_H__
+#define __IPF_CFW_H__
+
+#include <sys/types.h>
+#include <inet/ip6.h>
+#include <sys/uuid.h>
+
+/* Because ipf compiles this kernel file in userland testing... */
+#ifndef ASSERT3U
+#define ASSERT3U(a, b, c) ASSERT((a) b (c))
+#endif /* ASSERT3U */
+
+/*
+ * CFW Event, which is emitted to a global-zone listener. The global-zone
+ * listener solves the one-fd-per-zone problem of using each zone's ipmon.
+ *
+ * These must be 64-bit aligned because they form an array in-kernel. There
+ * might be reserved fields to ensure that alignment.
+ */
+#define CFWEV_BLOCK 1
+#define CFWEV_BEGIN 2
+#define CFWEV_END 3
+#define CFWDIR_IN 1
+#define CFWDIR_OUT 2
+
+typedef struct cfwev_s {
+ uint16_t cfwev_type; /* BEGIN, END, BLOCK */
+ uint16_t cfwev_length; /* in bytes, so capped to 65535 bytes */
+ zoneid_t cfwev_zonedid; /* Pullable from ipf_stack_t. */
+
+ uint32_t cfwev_ruleid; /* Pullable from fr_info_t. */
+ uint16_t cfwev_sport; /* Source port (network order) */
+ uint16_t cfwev_dport; /* Dest. port (network order) */
+
+ uint8_t cfwev_protocol; /* IPPROTO_* */
+ /* "direction" informs if src/dst are local/remote or remote/local. */
+ uint8_t cfwev_direction;
+ uint8_t cfwev_reserved[6]; /* Ensures 64-bit alignment. */
+
+ in6_addr_t cfwev_saddr; /* IPv4 addresses are V4MAPPED. */
+ in6_addr_t cfwev_daddr;
+
+ /*
+ * Because of 'struct timeval' being different between 32-bit and
+ * 64-bit ABIs, this interface is only usable by 64-bit binaries.
+ */
+ struct timeval cfwev_tstamp;
+
+ uuid_t cfwev_ruleuuid; /* Pullable from fr_info_t. */
+} cfwev_t;
+
+
+
+#endif /* __IPF_CFW_H__ */
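Given the 64-bit-alignment requirement stated in the comment above, a compile-time guard is cheap insurance; a sketch (not in this header), assuming CTASSERT from sys/debug.h is available:

#include <sys/debug.h>

/* Ring slots form a dense in-kernel array; each must stay 64-bit sized. */
CTASSERT(sizeof (cfwev_t) % 8 == 0);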
diff --git a/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h b/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h
index 0ceea1e921..0b2a8d826f 100644
--- a/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h
+++ b/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h
@@ -6,7 +6,7 @@
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*
- * Copyright 2018 Joyent, Inc. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
*/
#ifndef __IPF_STACK_H__
@@ -46,6 +46,7 @@ struct ipf_stack {
struct ipf_stack *ifs_gz_cont_ifs;
netid_t ifs_netid;
zoneid_t ifs_zone;
+ zoneid_t ifs_zone_did;
boolean_t ifs_gz_controlled;
/* ipf module */
@@ -126,6 +127,11 @@ struct ipf_stack {
hook_t *ifs_ipfhook6_loop_out;
hook_t *ifs_ipfhook6_nicevents;
+ hook_t *ifs_ipfhookvndl3v4_in;
+ hook_t *ifs_ipfhookvndl3v6_in;
+ hook_t *ifs_ipfhookvndl3v4_out;
+ hook_t *ifs_ipfhookvndl3v6_out;
+
hook_t *ifs_ipfhookviona_in;
hook_t *ifs_ipfhookviona_out;
@@ -140,12 +146,18 @@ struct ipf_stack {
boolean_t ifs_hook6_nic_events;
boolean_t ifs_hook6_loopback_in;
boolean_t ifs_hook6_loopback_out;
+ boolean_t ifs_hookvndl3v4_physical_in;
+ boolean_t ifs_hookvndl3v6_physical_in;
+ boolean_t ifs_hookvndl3v4_physical_out;
+ boolean_t ifs_hookvndl3v6_physical_out;
boolean_t ifs_hookviona_physical_in;
boolean_t ifs_hookviona_physical_out;
int ifs_ipf_loopback;
net_handle_t ifs_ipf_ipv4;
net_handle_t ifs_ipf_ipv6;
+ net_handle_t ifs_ipf_vndl3v4;
+ net_handle_t ifs_ipf_vndl3v6;
net_handle_t ifs_ipf_viona;
/* ip_auth.c */
@@ -305,6 +317,7 @@ struct ipf_stack {
char *ifs_addmask_key;
char *ifs_rn_zeros;
char *ifs_rn_ones;
+
#ifdef KERNEL
/* kstats for inbound and outbound */
kstat_t *ifs_kstatp[2];
diff --git a/usr/src/uts/common/inet/ipf/solaris.c b/usr/src/uts/common/inet/ipf/solaris.c
index c541f4dddc..5ccbfa3188 100644
--- a/usr/src/uts/common/inet/ipf/solaris.c
+++ b/usr/src/uts/common/inet/ipf/solaris.c
@@ -6,7 +6,7 @@
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*
- * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
*/
/*
@@ -116,7 +116,7 @@ static void ipf_stack_shutdown __P((const netid_t, void *));
static int ipf_property_g_update __P((dev_info_t *));
static char *ipf_devfiles[] = { IPL_NAME, IPNAT_NAME, IPSTATE_NAME,
IPAUTH_NAME, IPSYNC_NAME, IPSCAN_NAME,
- IPLOOKUP_NAME, NULL };
+ IPLOOKUP_NAME, IPFEV_NAME, NULL };
extern void *ipf_state; /* DDI state */
extern vmem_t *ipf_minor; /* minor number arena */
@@ -625,7 +625,6 @@ ipf_stack_shutdown(const netid_t id, void *arg)
/*
* Destroy things for ipf for one stack.
*/
-/* ARGSUSED */
static void
ipf_stack_destroy_one(const netid_t id, ipf_stack_t *ifs)
{
@@ -742,6 +741,9 @@ ddi_attach_cmd_t cmd;
ipf_dev_info = dip;
+ if (ipf_cfw_ring_resize(IPF_CFW_RING_ALLOCATE) != 0)
+ goto attach_failed;
+
ipfncb = net_instance_alloc(NETINFO_VERSION);
if (ipfncb == NULL)
goto attach_failed;
@@ -769,6 +771,7 @@ ddi_attach_cmd_t cmd;
}
attach_failed:
+ (void) ipf_cfw_ring_resize(IPF_CFW_RING_DESTROY);
ddi_remove_minor_node(dip, NULL);
ddi_prop_remove_all(dip);
ddi_soft_state_fini(&ipf_state);
@@ -796,6 +799,7 @@ ddi_detach_cmd_t cmd;
* framework guarantees we are not active with this devinfo
* node in any other entry points at this time.
*/
+ (void) ipf_cfw_ring_resize(IPF_CFW_RING_DESTROY);
ddi_prop_remove_all(dip);
i = ddi_get_instance(dip);
ddi_remove_minor_node(dip, NULL);
diff --git a/usr/src/uts/common/inet/mib2.h b/usr/src/uts/common/inet/mib2.h
index 5a168523ee..85ca5ebdec 100644
--- a/usr/src/uts/common/inet/mib2.h
+++ b/usr/src/uts/common/inet/mib2.h
@@ -23,6 +23,7 @@
/*
* Copyright (c) 1990 Mentat Inc.
* Copyright (c) 2015, 2016 by Delphix. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
*/
/*
@@ -1400,6 +1401,8 @@ typedef struct tcpConnEntryInfo_s {
/* round-trip time smoothed average (us) */
Gauge ce_rtt_sa;
+ /* round-trip time smoothed deviation (us) */
+ Gauge ce_rtt_sd;
/* current rto (retransmit timeout) */
Gauge ce_rto;
/* round-trip time count */
Gauge ce_rtt_cnt;
diff --git a/usr/src/uts/common/inet/rawip_impl.h b/usr/src/uts/common/inet/rawip_impl.h
index 6fb72d1d08..ddb482db78 100644
--- a/usr/src/uts/common/inet/rawip_impl.h
+++ b/usr/src/uts/common/inet/rawip_impl.h
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -43,6 +44,7 @@ extern "C" {
#include <inet/ip.h>
#include <inet/optcom.h>
#include <inet/tunables.h>
+#include <inet/bpf.h>
/*
* ICMP stack instances
@@ -84,6 +86,10 @@ typedef struct icmp_s {
mblk_t *icmp_fallback_queue_head;
mblk_t *icmp_fallback_queue_tail;
struct sockaddr_storage icmp_delayed_addr;
+
+ krwlock_t icmp_bpf_lock; /* protects icmp_bpf */
+ ip_bpf_insn_t *icmp_bpf_prog; /* SO_ATTACH_FILTER bpf */
+ uint_t icmp_bpf_len;
} icmp_t;
/*
diff --git a/usr/src/uts/common/inet/sockmods/datafilt.c b/usr/src/uts/common/inet/sockmods/datafilt.c
new file mode 100644
index 0000000000..6e1171de46
--- /dev/null
+++ b/usr/src/uts/common/inet/sockmods/datafilt.c
@@ -0,0 +1,116 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2012, OmniTI Computer Consulting, Inc. All rights reserved.
+ */
+
+/*
+ * This file implements a socketfilter used to deter TCP connections.
+ * To defer a connection means to delay the return of accept(3SOCKET)
+ * until at least one byte is ready to be read(2). This filter may be
+ * applied automatically or programmatically through the use of
+ * soconfig(1M) and setsockopt(3SOCKET).
+ */
+
+#include <sys/kmem.h>
+#include <sys/systm.h>
+#include <sys/stropts.h>
+#include <sys/strsun.h>
+#include <sys/socketvar.h>
+#include <sys/sockfilter.h>
+#include <sys/note.h>
+#include <sys/taskq.h>
+
+#define DATAFILT_MODULE "datafilt"
+
+static struct modlmisc dataf_modlmisc = {
+ &mod_miscops,
+ "Kernel data-ready socket filter"
+};
+
+static struct modlinkage dataf_modlinkage = {
+ MODREV_1,
+ &dataf_modlmisc,
+ NULL
+};
+
+static sof_rval_t
+dataf_attach_passive_cb(sof_handle_t handle, sof_handle_t ph,
+ void *parg, struct sockaddr *laddr, socklen_t laddrlen,
+ struct sockaddr *faddr, socklen_t faddrlen, void **cookiep)
+{
+ _NOTE(ARGUNUSED(handle, ph, parg, laddr, laddrlen, faddr, faddrlen,
+ cookiep));
+ return (SOF_RVAL_DEFER);
+}
+
+static void
+dataf_detach_cb(sof_handle_t handle, void *cookie, cred_t *cr)
+{
+ _NOTE(ARGUNUSED(handle, cookie, cr));
+}
+
+static mblk_t *
+dataf_data_in_cb(sof_handle_t handle, void *cookie, mblk_t *mp, int flags,
+ size_t *lenp)
+{
+ _NOTE(ARGUNUSED(cookie, flags, lenp));
+
+ if (mp != NULL && MBLKL(mp) > 0) {
+ sof_newconn_ready(handle);
+ sof_bypass(handle);
+ }
+
+ return (mp);
+}
+
+static sof_ops_t dataf_ops = {
+ .sofop_attach_passive = dataf_attach_passive_cb,
+ .sofop_detach = dataf_detach_cb,
+ .sofop_data_in = dataf_data_in_cb
+};
+
+int
+_init(void)
+{
+ int err;
+
+ /*
+ * This module is safe to attach even after some preliminary socket
+ * setup calls have taken place. See the comment for SOF_ATT_SAFE.
+ */
+ err = sof_register(SOF_VERSION, DATAFILT_MODULE, &dataf_ops,
+ SOF_ATT_SAFE);
+ if (err != 0)
+ return (err);
+ if ((err = mod_install(&dataf_modlinkage)) != 0)
+ (void) sof_unregister(DATAFILT_MODULE);
+
+ return (err);
+}
+
+int
+_fini(void)
+{
+ int err;
+
+ if ((err = sof_unregister(DATAFILT_MODULE)) != 0)
+ return (err);
+
+ return (mod_remove(&dataf_modlinkage));
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&dataf_modlinkage, modinfop));
+}
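A short sketch of the programmatic attach mentioned in the comment above, using the standard sockfilter option (listener setup assumed):

#include <sys/socket.h>

/*
 * Attach "datafilt" to a listening socket; afterward, accept()
 * returns only connections that already have data to read.
 */
static int
defer_accept(int lfd)
{
	return (setsockopt(lfd, SOL_FILTER, FIL_ATTACH,
	    "datafilt", sizeof ("datafilt")));
}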
diff --git a/usr/src/uts/common/inet/sockmods/sockmod_pfp.c b/usr/src/uts/common/inet/sockmods/sockmod_pfp.c
index 586d7f06f8..76191e93b8 100644
--- a/usr/src/uts/common/inet/sockmods/sockmod_pfp.c
+++ b/usr/src/uts/common/inet/sockmods/sockmod_pfp.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2015 Joyent, Inc. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
*/
#include <sys/types.h>
@@ -51,6 +51,7 @@
#include <sys/mac_client.h>
#include <sys/mac_provider.h>
#include <sys/mac_client_priv.h>
+#include <inet/bpf.h>
#include <netpacket/packet.h>
@@ -448,7 +449,7 @@ pfp_packet(void *arg, mac_resource_handle_t mrh, mblk_t *mp, boolean_t flag)
buffer = (uchar_t *)mp;
}
rw_enter(&ps->ps_bpflock, RW_READER);
- if (bpf_filter(ps->ps_bpf.bf_insns, buffer,
+ if (ip_bpf_filter((ip_bpf_insn_t *)ps->ps_bpf.bf_insns, buffer,
hdr.mhi_pktsize, buflen) == 0) {
rw_exit(&ps->ps_bpflock);
ps->ps_stats.tp_drops++;
@@ -1336,7 +1337,7 @@ pfp_setsocket_sockopt(sock_lower_handle_t handle, int option_name,
const void *optval, socklen_t optlen)
{
struct bpf_program prog;
- struct bpf_insn *fcode;
+ ip_bpf_insn_t *fcode;
struct pfpsock *ps;
struct sock_proto_props sopp;
int error = 0;
@@ -1370,10 +1371,10 @@ pfp_setsocket_sockopt(sock_lower_handle_t handle, int option_name,
return (EFAULT);
}
- if (bpf_validate(fcode, (int)prog.bf_len)) {
+ if (ip_bpf_validate(fcode, prog.bf_len)) {
rw_enter(&ps->ps_bpflock, RW_WRITER);
pfp_release_bpf(ps);
- ps->ps_bpf.bf_insns = fcode;
+ ps->ps_bpf.bf_insns = (struct bpf_insn *)fcode;
ps->ps_bpf.bf_len = size;
rw_exit(&ps->ps_bpflock);
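For context, the program being validated above arrives via the socket-option path; a hedged userland sketch with a trivial accept-all classic-BPF program (the option level/name are assumed to be the Linux-compatible SO_ATTACH_FILTER):

#include <sys/socket.h>
#include <net/bpf.h>

/* One-instruction program: return (accept) the whole packet. */
static struct bpf_insn accept_all[] = {
	BPF_STMT(BPF_RET | BPF_K, (u_int)-1),
};

static int
attach_accept_all(int fd)
{
	struct bpf_program prog;

	prog.bf_len = sizeof (accept_all) / sizeof (accept_all[0]);
	prog.bf_insns = accept_all;
	return (setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER,
	    &prog, sizeof (prog)));
}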
diff --git a/usr/src/uts/common/inet/squeue.c b/usr/src/uts/common/inet/squeue.c
index 9fa40eccb6..e65af832eb 100644
--- a/usr/src/uts/common/inet/squeue.c
+++ b/usr/src/uts/common/inet/squeue.c
@@ -61,6 +61,10 @@
* connection are processed on that squeue. The connection ("conn") to
* squeue mapping is stored in "conn_t" member "conn_sqp".
*
+ * If the squeue is not related to TCP/IP, then the value of sqp->sq_isip is
+ * false and it will not have an associated conn_t, which means many aspects of
+ * the system, such as polling and switching squeues, will not be used.
+ *
* Since the processing of the connection cuts across multiple layers
 * but still allows packets for different connections to be processed on
* other CPU/squeues, squeues are also termed as "Vertical Perimeter" or
@@ -241,7 +245,7 @@ squeue_init(void)
}
squeue_t *
-squeue_create(pri_t pri)
+squeue_create(pri_t pri, boolean_t isip)
{
squeue_t *sqp = kmem_cache_alloc(squeue_cache, KM_SLEEP);
@@ -256,11 +260,36 @@ squeue_create(pri_t pri)
sqp->sq_enter = squeue_enter;
sqp->sq_drain = squeue_drain;
+ sqp->sq_isip = isip;
return (sqp);
}
/*
+ * We need to kill the threads and then clean up. We should VERIFY that
+ * polling is disabled so we don't have to worry about disassociating from
+ * MAC/IP/etc.
+ */
+void
+squeue_destroy(squeue_t *sqp)
+{
+ kt_did_t worker, poll;
+ mutex_enter(&sqp->sq_lock);
+ VERIFY(!(sqp->sq_state & (SQS_POLL_THR_QUIESCED |
+ SQS_POLL_QUIESCE_DONE | SQS_PAUSE | SQS_EXIT)));
+ worker = sqp->sq_worker->t_did;
+ poll = sqp->sq_poll_thr->t_did;
+ sqp->sq_state |= SQS_EXIT;
+ cv_signal(&sqp->sq_poll_cv);
+ cv_signal(&sqp->sq_worker_cv);
+ mutex_exit(&sqp->sq_lock);
+
+ thread_join(poll);
+ thread_join(worker);
+ kmem_cache_free(squeue_cache, sqp);
+}
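A sketch of the non-IP squeue lifecycle these changes enable (the consumer and its priority are hypothetical):

static void
example_nonip_squeue(void)
{
	/* No conn_t, no polling, no squeue switching on this squeue. */
	squeue_t *sqp = squeue_create(maxclsyspri, B_FALSE);

	/* ... feed it work via squeue_enter(); arg is not a conn_t ... */

	squeue_destroy(sqp);	/* Quiesces and joins both threads. */
}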
+
+/*
* Bind squeue worker thread to the specified CPU, given by CPU id.
* If the CPU id value is -1, bind the worker thread to the value
* specified in sq_bind field. If a thread is already bound to a
@@ -380,18 +409,21 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
* Handle squeue switching. More details in the
* block comment at the top of the file
*/
- if (connp->conn_sqp == sqp) {
+ if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) {
SQUEUE_DBG_SET(sqp, mp, proc, connp,
tag);
- connp->conn_on_sqp = B_TRUE;
+ if (sqp->sq_isip == B_TRUE)
+ connp->conn_on_sqp = B_TRUE;
DTRACE_PROBE3(squeue__proc__start, squeue_t *,
sqp, mblk_t *, mp, conn_t *, connp);
(*proc)(connp, mp, sqp, ira);
DTRACE_PROBE2(squeue__proc__end, squeue_t *,
sqp, conn_t *, connp);
- connp->conn_on_sqp = B_FALSE;
+ if (sqp->sq_isip == B_TRUE) {
+ connp->conn_on_sqp = B_FALSE;
+ CONN_DEC_REF(connp);
+ }
SQUEUE_DBG_CLEAR(sqp);
- CONN_DEC_REF(connp);
} else {
SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc,
connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE);
@@ -407,7 +439,7 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
* still be best to process a single queued
* item if it matches the active connection.
*/
- if (sqp->sq_first != NULL) {
+ if (sqp->sq_first != NULL && sqp->sq_isip) {
squeue_try_drain_one(sqp, connp);
}
@@ -423,7 +455,7 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
return;
}
} else {
- if (ira != NULL) {
+ if (sqp->sq_isip == B_TRUE && ira != NULL) {
mblk_t *attrmp;
ASSERT(cnt == 1);
@@ -496,7 +528,8 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
if (!(sqp->sq_state & SQS_REENTER) &&
(process_flag != SQ_FILL) && (sqp->sq_first == NULL) &&
(sqp->sq_run == curthread) && (cnt == 1) &&
- (connp->conn_on_sqp == B_FALSE)) {
+ (sqp->sq_isip == B_FALSE ||
+ connp->conn_on_sqp == B_FALSE)) {
sqp->sq_state |= SQS_REENTER;
mutex_exit(&sqp->sq_lock);
@@ -511,15 +544,21 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
* Handle squeue switching. More details in the
* block comment at the top of the file
*/
- if (connp->conn_sqp == sqp) {
- connp->conn_on_sqp = B_TRUE;
+ if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) {
+ SQUEUE_DBG_SET(sqp, mp, proc, connp,
+ tag);
+ if (sqp->sq_isip == B_TRUE)
+ connp->conn_on_sqp = B_TRUE;
DTRACE_PROBE3(squeue__proc__start, squeue_t *,
sqp, mblk_t *, mp, conn_t *, connp);
(*proc)(connp, mp, sqp, ira);
DTRACE_PROBE2(squeue__proc__end, squeue_t *,
sqp, conn_t *, connp);
- connp->conn_on_sqp = B_FALSE;
- CONN_DEC_REF(connp);
+ if (sqp->sq_isip == B_TRUE) {
+ connp->conn_on_sqp = B_FALSE;
+ CONN_DEC_REF(connp);
+ }
+ SQUEUE_DBG_CLEAR(sqp);
} else {
SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc,
connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE);
@@ -540,7 +579,7 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
#ifdef DEBUG
mp->b_tag = tag;
#endif
- if (ira != NULL) {
+ if (sqp->sq_isip && ira != NULL) {
mblk_t *attrmp;
ASSERT(cnt == 1);
@@ -658,7 +697,7 @@ again:
mp->b_prev = NULL;
/* Is there an ip_recv_attr_t to handle? */
- if (ip_recv_attr_is_mblk(mp)) {
+ if (sqp->sq_isip == B_TRUE && ip_recv_attr_is_mblk(mp)) {
mblk_t *attrmp = mp;
ASSERT(attrmp->b_cont != NULL);
@@ -683,20 +722,25 @@ again:
/*
- * Handle squeue switching. More details in the
- * block comment at the top of the file
+ * Handle squeue switching. More details in the block comment at
+ * the top of the file. non-IP squeues cannot switch, as there
+ * is no conn_t.
*/
- if (connp->conn_sqp == sqp) {
+ if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) {
SQUEUE_DBG_SET(sqp, mp, proc, connp,
mp->b_tag);
- connp->conn_on_sqp = B_TRUE;
+ if (sqp->sq_isip == B_TRUE)
+ connp->conn_on_sqp = B_TRUE;
DTRACE_PROBE3(squeue__proc__start, squeue_t *,
sqp, mblk_t *, mp, conn_t *, connp);
(*proc)(connp, mp, sqp, ira);
DTRACE_PROBE2(squeue__proc__end, squeue_t *,
sqp, conn_t *, connp);
- connp->conn_on_sqp = B_FALSE;
- CONN_DEC_REF(connp);
+ if (sqp->sq_isip == B_TRUE) {
+ connp->conn_on_sqp = B_FALSE;
+ CONN_DEC_REF(connp);
+ }
+ SQUEUE_DBG_CLEAR(sqp);
} else {
SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, connp, ira,
SQ_FILL, SQTAG_SQUEUE_CHANGE);
@@ -925,6 +969,11 @@ squeue_polling_thread(squeue_t *sqp)
cv_wait(async, lock);
CALLB_CPR_SAFE_END(&cprinfo, lock);
+ if (sqp->sq_state & SQS_EXIT) {
+ mutex_exit(lock);
+ thread_exit();
+ }
+
ctl_state = sqp->sq_state & (SQS_POLL_THR_CONTROL |
SQS_POLL_THR_QUIESCED);
if (ctl_state != 0) {
@@ -950,6 +999,9 @@ squeue_polling_thread(squeue_t *sqp)
(SQS_PROC|SQS_POLLING|SQS_GET_PKTS)) ==
(SQS_PROC|SQS_POLLING|SQS_GET_PKTS));
+ /* Only IP related squeues should reach this point */
+ VERIFY(sqp->sq_isip == B_TRUE);
+
poll_again:
sq_rx_ring = sqp->sq_rx_ring;
sq_get_pkts = sq_rx_ring->rr_rx;
@@ -1079,6 +1131,7 @@ squeue_worker_thr_control(squeue_t *sqp)
ill_rx_ring_t *rx_ring;
ASSERT(MUTEX_HELD(&sqp->sq_lock));
+ VERIFY(sqp->sq_isip == B_TRUE);
if (sqp->sq_state & SQS_POLL_RESTART) {
/* Restart implies a previous quiesce. */
@@ -1190,6 +1243,11 @@ squeue_worker(squeue_t *sqp)
for (;;) {
for (;;) {
+ if (sqp->sq_state & SQS_EXIT) {
+ mutex_exit(lock);
+ thread_exit();
+ }
+
/*
* If the poll thread has handed control to us
* we need to break out of the wait.
@@ -1286,6 +1344,7 @@ squeue_synch_enter(conn_t *connp, mblk_t *use_mp)
again:
sqp = connp->conn_sqp;
+ VERIFY(sqp->sq_isip == B_TRUE);
mutex_enter(&sqp->sq_lock);
if (sqp->sq_first == NULL && !(sqp->sq_state & SQS_PROC)) {
@@ -1374,6 +1433,7 @@ squeue_try_drain_one(squeue_t *sqp, conn_t *compare_conn)
ASSERT(MUTEX_HELD(&sqp->sq_lock));
ASSERT((sqp->sq_state & SQS_PROC) == 0);
ASSERT(sqp->sq_run == NULL);
+ ASSERT(sqp->sq_isip);
VERIFY(mp != NULL);
/*
@@ -1440,6 +1500,9 @@ squeue_try_drain_one(squeue_t *sqp, conn_t *compare_conn)
CONN_DEC_REF(connp);
SQUEUE_DBG_CLEAR(sqp);
+ if (ira != NULL)
+ ira_cleanup(ira, B_TRUE);
+
done:
mutex_enter(&sqp->sq_lock);
sqp->sq_state &= ~(SQS_PROC);
@@ -1451,6 +1514,7 @@ squeue_synch_exit(conn_t *connp, int flag)
{
squeue_t *sqp = connp->conn_sqp;
+ VERIFY(sqp->sq_isip == B_TRUE);
ASSERT(flag == SQ_NODRAIN || flag == SQ_PROCESS);
mutex_enter(&sqp->sq_lock);
diff --git a/usr/src/uts/common/inet/tcp.h b/usr/src/uts/common/inet/tcp.h
index 775c5abe6b..3ed2b7174a 100644
--- a/usr/src/uts/common/inet/tcp.h
+++ b/usr/src/uts/common/inet/tcp.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, Joyent, Inc. All rights reserved.
+ * Copyright 2015 Joyent, Inc.
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2014, 2017 by Delphix. All rights reserved.
* Copyright 2020 OmniOS Community Edition (OmniOSce) Association.
@@ -137,6 +137,7 @@ typedef struct tcphdra_s {
struct conn_s;
struct tcp_listen_cnt_s;
+struct tcp_rg_s;
/*
* Control structure for each open TCP stream,
@@ -407,6 +408,13 @@ typedef struct tcp_s {
struct tcp_s *tcp_bind_hash_port; /* tcp_t's bound to the same lport */
struct tcp_s **tcp_ptpbhn;
+ /*
+ * Group of tcp_t entries bound to the same address and port via
+ * SO_REUSEPORT. The pointer itself is protected by tf_lock in the
+ * containing tcps_bind_fanout slot.
+ */
+ struct tcp_rg_s *tcp_rg_bind;
+
uint_t tcp_maxpsz_multiplier;
uint32_t tcp_lso_max; /* maximum LSO payload */
diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c
index 9348ea3d0f..427a6df274 100644
--- a/usr/src/uts/common/inet/tcp/tcp.c
+++ b/usr/src/uts/common/inet/tcp/tcp.c
@@ -961,8 +961,7 @@ void
tcp_stop_lingering(tcp_t *tcp)
{
clock_t delta = 0;
- tcp_stack_t *tcps = tcp->tcp_tcps;
- conn_t *connp = tcp->tcp_connp;
+ conn_t *connp = tcp->tcp_connp;
tcp->tcp_linger_tid = 0;
if (tcp->tcp_state > TCPS_LISTEN) {
@@ -990,7 +989,7 @@ tcp_stop_lingering(tcp_t *tcp)
if (tcp->tcp_state == TCPS_TIME_WAIT) {
tcp_time_wait_append(tcp);
- TCP_DBGSTAT(tcps, tcp_detach_time_wait);
+ TCP_DBGSTAT(tcp->tcp_tcps, tcp_detach_time_wait);
goto finish;
}
@@ -1429,6 +1428,21 @@ tcp_free(tcp_t *tcp)
tcp->tcp_cc_algo->cb_destroy(&tcp->tcp_ccv);
/*
+ * Destroy any association with SO_REUSEPORT group.
+ */
+ if (tcp->tcp_rg_bind != NULL) {
+ /*
+ * This is only necessary for connections which enabled
+ * SO_REUSEPORT but were never bound. Such connections should
+ * be the one and only member of the tcp_rg_t to which they
+ * have been associated.
+ */
+ VERIFY(tcp_rg_remove(tcp->tcp_rg_bind, tcp));
+ tcp_rg_destroy(tcp->tcp_rg_bind);
+ tcp->tcp_rg_bind = NULL;
+ }
+
+ /*
* If this is a non-STREAM socket still holding on to an upper
* handle, release it. As a result of fallback we might also see
* STREAMS based conns with upper handles, in which case there is
@@ -2477,8 +2491,10 @@ tcp_init_values(tcp_t *tcp, tcp_t *parent)
* Path MTU might have changed by either increase or decrease, so need to
* adjust the MSS based on the value of ixa_pmtu. No need to handle tiny
* or negative MSS, since tcp_mss_set() will do it.
+ *
+ * Returns B_TRUE when the connection PMTU changes, otherwise B_FALSE.
*/
-void
+boolean_t
tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only)
{
uint32_t pmtu;
@@ -2488,10 +2504,10 @@ tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only)
iaflags_t ixaflags;
if (tcp->tcp_tcps->tcps_ignore_path_mtu)
- return;
+ return (B_FALSE);
if (tcp->tcp_state < TCPS_ESTABLISHED)
- return;
+ return (B_FALSE);
/*
* Always call ip_get_pmtu() to make sure that IP has updated
@@ -2511,13 +2527,13 @@ tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only)
* Nothing to change, so just return.
*/
if (mss == tcp->tcp_mss)
- return;
+ return (B_FALSE);
/*
* Currently, for ICMP errors, only PMTU decrease is handled.
*/
if (mss > tcp->tcp_mss && decrease_only)
- return;
+ return (B_FALSE);
DTRACE_PROBE2(tcp_update_pmtu, int32_t, tcp->tcp_mss, uint32_t, mss);
@@ -2552,6 +2568,7 @@ tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only)
tcp->tcp_ipha->ipha_fragment_offset_and_flags = 0;
}
ixa->ixa_flags = ixaflags;
+ return (B_TRUE);
}
int
@@ -3424,7 +3441,7 @@ tcp_notify(void *arg, ip_xmit_attr_t *ixa, ixa_notify_type_t ntype,
tcp_update_lso(tcp, connp->conn_ixa);
break;
case IXAN_PMTU:
- tcp_update_pmtu(tcp, B_FALSE);
+ (void) tcp_update_pmtu(tcp, B_FALSE);
break;
case IXAN_ZCOPY:
tcp_update_zcopy(tcp);
@@ -3755,7 +3772,6 @@ tcp_stack_init(netstackid_t stackid, netstack_t *ns)
{
tcp_stack_t *tcps;
int i;
- int error = 0;
major_t major;
size_t arrsz;
@@ -3819,8 +3835,7 @@ tcp_stack_init(netstackid_t stackid, netstack_t *ns)
tcps->tcps_mibkp = tcp_kstat_init(stackid);
major = mod_name_to_major(INET_NAME);
- error = ldi_ident_from_major(major, &tcps->tcps_ldi_ident);
- ASSERT(error == 0);
+ VERIFY0(ldi_ident_from_major(major, &tcps->tcps_ldi_ident));
tcps->tcps_ixa_cleanup_mp = allocb_wait(0, BPRI_MED, STR_NOSIG, NULL);
ASSERT(tcps->tcps_ixa_cleanup_mp != NULL);
cv_init(&tcps->tcps_ixa_cleanup_ready_cv, NULL, CV_DEFAULT, NULL);
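
tcp_update_pmtu() now reports whether the MSS actually changed, so its
callers can decide whether a retransmit is warranted (see the tcp_input.c
hunks below). A compact sketch of that return-value pattern; the conn
struct and the update_mss()/on_icmp_frag_needed() helpers are hypothetical,
illustrative names only:

#include <stdbool.h>

struct conn {
	unsigned mss;			/* current effective MSS */
	bool ignore_pmtu;		/* analogue of tcps_ignore_path_mtu */
};

/* Returns true only when the MSS was actually changed. */
static bool
update_mss(struct conn *c, unsigned new_mss, bool decrease_only)
{
	if (c->ignore_pmtu)
		return (false);		/* tunable says leave it alone */
	if (new_mss == c->mss)
		return (false);		/* nothing to change */
	if (new_mss > c->mss && decrease_only)
		return (false);		/* ICMP may only shrink the MSS */
	c->mss = new_mss;
	return (true);
}

void
on_icmp_frag_needed(struct conn *c, unsigned mss_from_pmtu)
{
	/* Retransmit only when the MSS moved, mirroring tcp_input.c. */
	if (update_mss(c, mss_from_pmtu, true)) {
		/* retransmit outstanding data here */
	}
}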
diff --git a/usr/src/uts/common/inet/tcp/tcp_bind.c b/usr/src/uts/common/inet/tcp/tcp_bind.c
index 86242fc944..5c2e1e1932 100644
--- a/usr/src/uts/common/inet/tcp/tcp_bind.c
+++ b/usr/src/uts/common/inet/tcp/tcp_bind.c
@@ -22,6 +22,7 @@
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2013 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
* Copyright (c) 2016 by Delphix. All rights reserved.
*/
@@ -56,6 +57,7 @@ static uint32_t tcp_random_anon_port = 1;
static int tcp_bind_select_lport(tcp_t *, in_port_t *, boolean_t,
cred_t *cr);
static in_port_t tcp_get_next_priv_port(const tcp_t *);
+static int tcp_rg_insert(tcp_rg_t *, struct tcp_s *);
/*
* Hash list insertion routine for tcp_t structures. Each hash bucket
@@ -173,6 +175,16 @@ tcp_bind_hash_remove(tcp_t *tcp)
ASSERT(lockp != NULL);
mutex_enter(lockp);
+
+ /* destroy any association with SO_REUSEPORT group */
+ if (tcp->tcp_rg_bind != NULL) {
+ if (tcp_rg_remove(tcp->tcp_rg_bind, tcp)) {
+ /* Last one out turns off the lights */
+ tcp_rg_destroy(tcp->tcp_rg_bind);
+ }
+ tcp->tcp_rg_bind = NULL;
+ }
+
if (tcp->tcp_ptpbhn) {
tcpnext = tcp->tcp_bind_hash_port;
if (tcpnext != NULL) {
@@ -638,13 +650,12 @@ tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr,
}
/*
- * If the "bind_to_req_port_only" parameter is set, if the requested port
- * number is available, return it, If not return 0
+ * If the "bind_to_req_port_only" parameter is set and the requested port
+ * number is available, return it (else return 0).
*
- * If "bind_to_req_port_only" parameter is not set and
- * If the requested port number is available, return it. If not, return
- * the first anonymous port we happen across. If no anonymous ports are
- * available, return 0. addr is the requested local address, if any.
+ * If "bind_to_req_port_only" parameter is not set and the requested port
+ * number is available, return it. If not, return the first anonymous port we
+ * happen across. If no anonymous ports are available, return 0.
*
* In either case, when succeeding update the tcp_t to record the port number
* and insert it in the bind hash table.
@@ -664,6 +675,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
int loopmax;
conn_t *connp = tcp->tcp_connp;
tcp_stack_t *tcps = tcp->tcp_tcps;
+ boolean_t reuseport = connp->conn_reuseport;
/*
* Lookup for free addresses is done in a loop and "loopmax"
@@ -700,6 +712,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
tf_t *tbf;
tcp_t *ltcp;
conn_t *lconnp;
+ boolean_t attempt_reuse = B_FALSE;
lport = htons(port);
@@ -726,6 +739,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) {
boolean_t not_socket;
boolean_t exclbind;
+ boolean_t addrmatch;
lconnp = ltcp->tcp_connp;
@@ -831,22 +845,35 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
&lconnp->conn_faddr_v6)))
continue;
+ addrmatch = IN6_ARE_ADDR_EQUAL(laddr,
+ &lconnp->conn_bound_addr_v6);
+
+ if (addrmatch && reuseport && bind_to_req_port_only &&
+ (ltcp->tcp_state == TCPS_BOUND ||
+ ltcp->tcp_state == TCPS_LISTEN)) {
+ /*
+ * This entry is bound to the exact same
+ * address and port. If SO_REUSEPORT is set on
+ * the calling socket, attempt to reuse this
+ * binding if it too had SO_REUSEPORT enabled
+ * when it was bound.
+ */
+ attempt_reuse = (ltcp->tcp_rg_bind != NULL);
+ break;
+ }
+
if (!reuseaddr) {
/*
- * No socket option SO_REUSEADDR.
- * If existing port is bound to
- * a non-wildcard IP address
- * and the requesting stream is
- * bound to a distinct
- * different IP addresses
- * (non-wildcard, also), keep
- * going.
+ * No socket option SO_REUSEADDR. If an
+ * existing port is bound to a non-wildcard IP
+ * address and the requesting stream is bound
+ * to a distinct IP address
+ * (non-wildcard, also), keep going.
*/
if (!V6_OR_V4_INADDR_ANY(*laddr) &&
!V6_OR_V4_INADDR_ANY(
lconnp->conn_bound_addr_v6) &&
- !IN6_ARE_ADDR_EQUAL(laddr,
- &lconnp->conn_bound_addr_v6))
+ !addrmatch)
continue;
if (ltcp->tcp_state >= TCPS_BOUND) {
/*
@@ -861,27 +888,49 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
* socket option SO_REUSEADDR is set on the
* binding tcp_t.
*
- * If two streams are bound to
- * same IP address or both addr
- * and bound source are wildcards
- * (INADDR_ANY), we want to stop
- * searching.
- * We have found a match of IP source
- * address and source port, which is
- * refused regardless of the
- * SO_REUSEADDR setting, so we break.
+ * If two streams are bound to the same IP
+ * address or both addr and bound source are
+ * wildcards (INADDR_ANY), we want to stop
+ * searching. We have found a match of IP
+ * source address and source port, which is
+ * refused regardless of the SO_REUSEADDR
+ * setting, so we break.
*/
- if (IN6_ARE_ADDR_EQUAL(laddr,
- &lconnp->conn_bound_addr_v6) &&
+ if (addrmatch &&
(ltcp->tcp_state == TCPS_LISTEN ||
ltcp->tcp_state == TCPS_BOUND))
break;
}
}
- if (ltcp != NULL) {
+ if (ltcp != NULL && !attempt_reuse) {
/* The port number is busy */
mutex_exit(&tbf->tf_lock);
} else {
+ if (attempt_reuse) {
+ int err;
+ struct tcp_rg_s *rg;
+
+ ASSERT(ltcp != NULL);
+ ASSERT(ltcp->tcp_rg_bind != NULL);
+ ASSERT(tcp->tcp_rg_bind != NULL);
+ ASSERT(ltcp->tcp_rg_bind != tcp->tcp_rg_bind);
+
+ err = tcp_rg_insert(ltcp->tcp_rg_bind, tcp);
+ if (err != 0) {
+ mutex_exit(&tbf->tf_lock);
+ return (0);
+ }
+ /*
+ * Now that the newly-binding socket has joined
+ * the existing reuseport group on ltcp, it
+ * should clean up its own (empty) group.
+ */
+ rg = tcp->tcp_rg_bind;
+ tcp->tcp_rg_bind = ltcp->tcp_rg_bind;
+ VERIFY(tcp_rg_remove(rg, tcp));
+ tcp_rg_destroy(rg);
+ }
+
/*
* This port is ours. Insert in fanout and mark as
* bound to prevent others from getting the port
@@ -946,3 +995,124 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
} while (++count < loopmax);
return (0);
}
+
+/* Max number of members in TCP SO_REUSEPORT group */
+#define TCP_RG_SIZE_MAX 64
+/* Step size when expanding members array */
+#define TCP_RG_SIZE_STEP 2
+
+
+tcp_rg_t *
+tcp_rg_init(tcp_t *tcp)
+{
+ tcp_rg_t *rg;
+ rg = kmem_alloc(sizeof (tcp_rg_t), KM_NOSLEEP_LAZY);
+ if (rg == NULL)
+ return (NULL);
+ rg->tcprg_members = kmem_zalloc(2 * sizeof (tcp_t *), KM_NOSLEEP_LAZY);
+ if (rg->tcprg_members == NULL) {
+ kmem_free(rg, sizeof (tcp_rg_t));
+ return (NULL);
+ }
+
+ mutex_init(&rg->tcprg_lock, NULL, MUTEX_DEFAULT, NULL);
+ rg->tcprg_size = 2;
+ rg->tcprg_count = 1;
+ rg->tcprg_active = 1;
+ rg->tcprg_members[0] = tcp;
+ return (rg);
+}
+
+void
+tcp_rg_destroy(tcp_rg_t *rg)
+{
+ mutex_enter(&rg->tcprg_lock);
+ ASSERT(rg->tcprg_count == 0);
+ ASSERT(rg->tcprg_active == 0);
+ kmem_free(rg->tcprg_members, rg->tcprg_size * sizeof (tcp_t *));
+ mutex_destroy(&rg->tcprg_lock);
+ kmem_free(rg, sizeof (struct tcp_rg_s));
+}
+
+static int
+tcp_rg_insert(tcp_rg_t *rg, tcp_t *tcp)
+{
+ mutex_enter(&rg->tcprg_lock);
+
+ VERIFY(rg->tcprg_size > 0);
+ VERIFY(rg->tcprg_count <= rg->tcprg_size);
+ if (rg->tcprg_count != 0) {
+ cred_t *oldcred = rg->tcprg_members[0]->tcp_connp->conn_cred;
+ cred_t *newcred = tcp->tcp_connp->conn_cred;
+
+ if (crgetuid(oldcred) != crgetuid(newcred) ||
+ crgetzoneid(oldcred) != crgetzoneid(newcred)) {
+ mutex_exit(&rg->tcprg_lock);
+ return (EPERM);
+ }
+ }
+
+ if (rg->tcprg_count == rg->tcprg_size) {
+ unsigned int oldalloc = rg->tcprg_size * sizeof (tcp_t *);
+ unsigned int newsize = rg->tcprg_size + TCP_RG_SIZE_STEP;
+ tcp_t **newmembers;
+
+ if (newsize > TCP_RG_SIZE_MAX) {
+ mutex_exit(&rg->tcprg_lock);
+ return (EINVAL);
+ }
+ newmembers = kmem_zalloc(newsize * sizeof (tcp_t *),
+ KM_NOSLEEP_LAZY);
+ if (newmembers == NULL) {
+ mutex_exit(&rg->tcprg_lock);
+ return (ENOMEM);
+ }
+ bcopy(rg->tcprg_members, newmembers, oldalloc);
+ kmem_free(rg->tcprg_members, oldalloc);
+ rg->tcprg_members = newmembers;
+ rg->tcprg_size = newsize;
+ }
+
+ rg->tcprg_members[rg->tcprg_count] = tcp;
+ rg->tcprg_count++;
+ rg->tcprg_active++;
+
+ mutex_exit(&rg->tcprg_lock);
+ return (0);
+}
+
+boolean_t
+tcp_rg_remove(tcp_rg_t *rg, tcp_t *tcp)
+{
+ int i;
+ boolean_t is_empty;
+
+ mutex_enter(&rg->tcprg_lock);
+ for (i = 0; i < rg->tcprg_count; i++) {
+ if (rg->tcprg_members[i] == tcp)
+ break;
+ }
+ /* The item should be present */
+ ASSERT(i < rg->tcprg_count);
+ /* Move the last member into this position */
+ rg->tcprg_count--;
+ rg->tcprg_members[i] = rg->tcprg_members[rg->tcprg_count];
+ rg->tcprg_members[rg->tcprg_count] = NULL;
+ if (tcp->tcp_connp->conn_reuseport != 0)
+ rg->tcprg_active--;
+ is_empty = (rg->tcprg_count == 0);
+ mutex_exit(&rg->tcprg_lock);
+ return (is_empty);
+}
+
+void
+tcp_rg_setactive(tcp_rg_t *rg, boolean_t is_active)
+{
+ mutex_enter(&rg->tcprg_lock);
+ if (is_active) {
+ rg->tcprg_active++;
+ } else {
+ rg->tcprg_active--;
+ }
+ mutex_exit(&rg->tcprg_lock);
+}
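
The tcp_rg_* routines above keep group members in a flat array: inserts grow
the array by TCP_RG_SIZE_STEP up to TCP_RG_SIZE_MAX, and removal is O(1)
because the last member is swapped into the vacated slot. A userspace sketch
of just that bookkeeping, minus the locking and credential checks (rg_t and
the helper names are illustrative, not from the patch):

#include <stdlib.h>
#include <string.h>

#define	RG_SIZE_MAX	64		/* mirrors TCP_RG_SIZE_MAX */
#define	RG_SIZE_STEP	2		/* mirrors TCP_RG_SIZE_STEP */

typedef struct rg {
	unsigned size;			/* allocated slots */
	unsigned count;			/* occupied slots */
	void **members;
} rg_t;

static int
rg_insert(rg_t *rg, void *item)
{
	if (rg->count == rg->size) {
		unsigned newsize = rg->size + RG_SIZE_STEP;
		void **newmem;

		if (newsize > RG_SIZE_MAX)
			return (-1);	/* group full: EINVAL above */
		if ((newmem = calloc(newsize, sizeof (void *))) == NULL)
			return (-1);	/* ENOMEM above */
		memcpy(newmem, rg->members, rg->size * sizeof (void *));
		free(rg->members);
		rg->members = newmem;
		rg->size = newsize;
	}
	rg->members[rg->count++] = item;
	return (0);
}

/* Returns nonzero when the group became empty (caller destroys it). */
static int
rg_remove(rg_t *rg, void *item)
{
	unsigned i;

	for (i = 0; i < rg->count; i++) {
		if (rg->members[i] == item)
			break;
	}
	if (i == rg->count)
		return (0);		/* kernel version ASSERTs instead */
	/* O(1) removal: move the last member into the vacated slot. */
	rg->members[i] = rg->members[--rg->count];
	rg->members[rg->count] = NULL;
	return (rg->count == 0);
}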
diff --git a/usr/src/uts/common/inet/tcp/tcp_input.c b/usr/src/uts/common/inet/tcp/tcp_input.c
index dd264528fc..22b0019a6a 100644
--- a/usr/src/uts/common/inet/tcp/tcp_input.c
+++ b/usr/src/uts/common/inet/tcp/tcp_input.c
@@ -5715,10 +5715,12 @@ noticmpv4:
switch (icmph->icmph_code) {
case ICMP_FRAGMENTATION_NEEDED:
/*
- * Update Path MTU, then try to send something out.
+ * Attempt to update path MTU and, if the MSS of the
+ * connection is altered, retransmit outstanding data.
*/
- tcp_update_pmtu(tcp, B_TRUE);
- tcp_rexmit_after_error(tcp);
+ if (tcp_update_pmtu(tcp, B_TRUE)) {
+ tcp_rexmit_after_error(tcp);
+ }
break;
case ICMP_PORT_UNREACHABLE:
case ICMP_PROTOCOL_UNREACHABLE:
@@ -5761,7 +5763,7 @@ noticmpv4:
break;
}
break;
- case ICMP_SOURCE_QUENCH: {
+ case ICMP_SOURCE_QUENCH:
/*
* use a global boolean to control
* whether TCP should respond to ICMP_SOURCE_QUENCH.
@@ -5786,7 +5788,6 @@ noticmpv4:
}
break;
}
- }
freemsg(mp);
}
@@ -5839,10 +5840,12 @@ noticmpv6:
switch (icmp6->icmp6_type) {
case ICMP6_PACKET_TOO_BIG:
/*
- * Update Path MTU, then try to send something out.
+ * Attempt to update path MTU and, if the MSS of the connection
+ * is altered, retransmit outstanding data.
*/
- tcp_update_pmtu(tcp, B_TRUE);
- tcp_rexmit_after_error(tcp);
+ if (tcp_update_pmtu(tcp, B_TRUE)) {
+ tcp_rexmit_after_error(tcp);
+ }
break;
case ICMP6_DST_UNREACH:
switch (icmp6->icmp6_code) {
diff --git a/usr/src/uts/common/inet/tcp/tcp_opt_data.c b/usr/src/uts/common/inet/tcp/tcp_opt_data.c
index 8687b52d53..15e49ae070 100644
--- a/usr/src/uts/common/inet/tcp/tcp_opt_data.c
+++ b/usr/src/uts/common/inet/tcp/tcp_opt_data.c
@@ -67,7 +67,8 @@ opdes_t tcp_opt_arr[] = {
{ SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0
},
{ SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
-{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
+{ SO_REUSEPORT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
{ SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 },
{ SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 },
@@ -505,6 +506,104 @@ tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr)
}
/*
+ * Set a TCP connection's participation in SO_REUSEPORT. This operation is
+ * performed under the protection of the squeue via tcp_setsockopt.
+ * The manipulation of tcp_rg_bind, as part of this operation, is subject to
+ * these constraints:
+ * 1. Prior to bind(), tcp_rg_bind can be set/cleared in tcp_set_reuseport
+ * under the protection of the squeue.
+ * 2. Once the connection has been bound, the tcp_rg_bind pointer must not be
+ * altered until such time as tcp_free() cleans up the connection.
+ * 3. A connection undergoing bind, which matches to a connection participating
+ * in port-reuse, will switch its tcp_rg_bind pointer when it joins the
+ * group of an existing connection in tcp_bindi().
+ */
+static int
+tcp_set_reuseport(conn_t *connp, boolean_t do_enable)
+{
+ tcp_t *tcp = connp->conn_tcp;
+ struct tcp_rg_s *rg;
+
+ if (!IPCL_IS_NONSTR(connp)) {
+ if (do_enable) {
+ /*
+ * SO_REUSEPORT cannot be enabled on sockets which have
+ * fallen back to the STREAMS API.
+ */
+ return (EINVAL);
+ } else {
+ /*
+ * A connection with SO_REUSEPORT enabled should be
+ * prevented from falling back to STREAMS mode via
+ * logic in tcp_fallback. It is legal, however, for
+ * fallen-back connections to affirm the disabled state
+ * of SO_REUSEPORT.
+ */
+ ASSERT(connp->conn_reuseport == 0);
+ return (0);
+ }
+ }
+ if (tcp->tcp_state <= TCPS_CLOSED) {
+ return (EINVAL);
+ }
+ if (connp->conn_reuseport == 0 && do_enable) {
+ /* disabled -> enabled */
+ if (tcp->tcp_rg_bind != NULL) {
+ tcp_rg_setactive(tcp->tcp_rg_bind, do_enable);
+ } else {
+ /*
+ * Connection state is not a concern when initially
+ * populating tcp_rg_bind. Setting it to non-NULL on a
+ * bound or listening connection would only mean that
+ * new reused-port binds become a possibility.
+ */
+ if ((rg = tcp_rg_init(tcp)) == NULL) {
+ return (ENOMEM);
+ }
+ tcp->tcp_rg_bind = rg;
+ }
+ connp->conn_reuseport = 1;
+ } else if (connp->conn_reuseport != 0 && !do_enable) {
+ /* enabled -> disabled */
+ ASSERT(tcp->tcp_rg_bind != NULL);
+ if (tcp->tcp_state == TCPS_IDLE) {
+ /*
+ * If the connection has not been bound yet, discard
+ * the reuse group state. Since disabling SO_REUSEPORT
+ * on a bound socket will _not_ prevent others from
+ * reusing the port, the presence of tcp_rg_bind is
+ * used to determine reuse availability, not
+ * conn_reuseport.
+ *
+ * This allows proper behavior for examples such as:
+ *
+ * setsockopt(fd1, ... SO_REUSEPORT, &on_val...);
+ * bind(fd1, &myaddr, ...);
+ * setsockopt(fd1, ... SO_REUSEPORT, &off_val...);
+ *
+ * setsockopt(fd2, ... SO_REUSEPORT, &on_val...);
+ * bind(fd2, &myaddr, ...); // <- SHOULD SUCCEED
+ *
+ */
+ rg = tcp->tcp_rg_bind;
+ tcp->tcp_rg_bind = NULL;
+ VERIFY(tcp_rg_remove(rg, tcp));
+ tcp_rg_destroy(rg);
+ } else {
+ /*
+ * If a connection has been bound, it's no longer safe
+ * to manipulate tcp_rg_bind until connection clean-up
+ * during tcp_free. Just mark the member status of the
+ * connection as inactive.
+ */
+ tcp_rg_setactive(tcp->tcp_rg_bind, do_enable);
+ }
+ connp->conn_reuseport = 0;
+ }
+ return (0);
+}
+
+/*
* We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements.
* Parameters are assumed to be verified by the caller.
*/
@@ -674,6 +773,11 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
}
*outlenp = inlen;
return (0);
+ case SO_REUSEPORT:
+ if (!checkonly) {
+ return (tcp_set_reuseport(connp, *i1 != 0));
+ }
+ return (0);
}
break;
case IPPROTO_TCP:
@@ -1031,10 +1135,6 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name,
}
break;
case IPPROTO_IP:
- if (connp->conn_family != AF_INET) {
- *outlenp = 0;
- return (EINVAL);
- }
switch (name) {
case IP_SEC_OPT:
/*
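
The setsockopt()/bind() interplay documented in tcp_set_reuseport() can be
exercised from user level roughly as follows; this is a sketch assuming a
sockets environment where SO_REUSEPORT is defined, with error handling
elided:

#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <string.h>

int
reuseport_pair(in_port_t port)
{
	struct sockaddr_in addr;
	int on = 1, off = 0;
	int fd1 = socket(AF_INET, SOCK_STREAM, 0);
	int fd2 = socket(AF_INET, SOCK_STREAM, 0);

	(void) memset(&addr, 0, sizeof (addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
	addr.sin_port = htons(port);

	/* Both sockets opt in before binding the same address/port. */
	(void) setsockopt(fd1, SOL_SOCKET, SO_REUSEPORT, &on, sizeof (on));
	(void) bind(fd1, (struct sockaddr *)&addr, sizeof (addr));

	/*
	 * Turning SO_REUSEPORT off on fd1 after bind() does not evict it
	 * from the group (tcp_rg_bind stays put until tcp_free()), so the
	 * bind below is still expected to succeed.
	 */
	(void) setsockopt(fd1, SOL_SOCKET, SO_REUSEPORT, &off, sizeof (off));

	(void) setsockopt(fd2, SOL_SOCKET, SO_REUSEPORT, &on, sizeof (on));
	return (bind(fd2, (struct sockaddr *)&addr, sizeof (addr)));
}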
diff --git a/usr/src/uts/common/inet/tcp/tcp_socket.c b/usr/src/uts/common/inet/tcp/tcp_socket.c
index 9b6c0daac3..32422be675 100644
--- a/usr/src/uts/common/inet/tcp/tcp_socket.c
+++ b/usr/src/uts/common/inet/tcp/tcp_socket.c
@@ -1029,6 +1029,16 @@ tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
}
/*
+ * Do not allow fallback on connections making use of SO_REUSEPORT.
+ */
+ if (tcp->tcp_rg_bind != NULL) {
+ freeb(stropt_mp);
+ freeb(ordrel_mp);
+ squeue_synch_exit(connp, SQ_NODRAIN);
+ return (EINVAL);
+ }
+
+ /*
* Both endpoints must be of the same type (either STREAMS or
* non-STREAMS) for fusion to be enabled. So if we are fused,
* we have to unfuse.
diff --git a/usr/src/uts/common/inet/tcp/tcp_stats.c b/usr/src/uts/common/inet/tcp/tcp_stats.c
index e29c76a696..226467e167 100644
--- a/usr/src/uts/common/inet/tcp/tcp_stats.c
+++ b/usr/src/uts/common/inet/tcp/tcp_stats.c
@@ -21,8 +21,8 @@
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, Joyent Inc. All rights reserved.
* Copyright (c) 2015, 2016 by Delphix. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
* Copyright 2019 OmniOS Community Edition (OmniOSce) Association.
*/
@@ -131,9 +131,14 @@ tcp_set_conninfo(tcp_t *tcp, struct tcpConnEntryInfo_s *tcei, boolean_t ispriv)
tcei->ce_rto = tcp->tcp_rto;
tcei->ce_mss = tcp->tcp_mss;
tcei->ce_state = tcp->tcp_state;
- tcei->ce_rtt_sa = NSEC2USEC(tcp->tcp_rtt_sa >> 3);
tcei->ce_rtt_sum = NSEC2USEC(tcp->tcp_rtt_sum);
tcei->ce_rtt_cnt = tcp->tcp_rtt_cnt;
+
+ /* tcp_rtt_sa is stored as 8 times the average RTT */
+ tcei->ce_rtt_sa = NSEC2USEC(tcp->tcp_rtt_sa >> 3);
+
+ /* tcp_rtt_sd is stored as 4 times the average RTTVAR */
+ tcei->ce_rtt_sd = NSEC2USEC(tcp->tcp_rtt_sd >> 2);
}
/*
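
The new comments make the fixed-point convention explicit: following the
classic BSD scheme, the smoothed RTT is stored as 8 times the estimate and
the deviation as 4 times it, so reporting code shifts them down before
converting nanoseconds to microseconds. A small standalone sketch
(NSEC2USEC is redefined locally here for illustration):

#include <stdint.h>

#define	NSEC2USEC(n)	((n) / 1000)

/* tcp_rtt_sa analogue: stored as 8x the smoothed RTT, in nanoseconds. */
static uint64_t
report_srtt_usec(uint64_t rtt_sa)
{
	return (NSEC2USEC(rtt_sa >> 3));	/* >> 3 unscales the 8x */
}

/* tcp_rtt_sd analogue: stored as 4x the RTT deviation, in nanoseconds. */
static uint64_t
report_rttvar_usec(uint64_t rtt_sd)
{
	return (NSEC2USEC(rtt_sd >> 2));	/* >> 2 unscales the 4x */
}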
diff --git a/usr/src/uts/common/inet/tcp/tcp_timers.c b/usr/src/uts/common/inet/tcp/tcp_timers.c
index 5793a7fd27..7d9b449392 100644
--- a/usr/src/uts/common/inet/tcp/tcp_timers.c
+++ b/usr/src/uts/common/inet/tcp/tcp_timers.c
@@ -22,7 +22,7 @@
/*
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright 2011 Joyent, Inc. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
* Copyright (c) 2014, 2017 by Delphix. All rights reserved.
*/
diff --git a/usr/src/uts/common/inet/tcp_impl.h b/usr/src/uts/common/inet/tcp_impl.h
index 5669592cff..61af05f749 100644
--- a/usr/src/uts/common/inet/tcp_impl.h
+++ b/usr/src/uts/common/inet/tcp_impl.h
@@ -61,9 +61,9 @@ extern sock_downcalls_t sock_tcp_downcalls;
* by setting it to 0.
*/
#define TCP_XMIT_LOWATER 4096
-#define TCP_XMIT_HIWATER 49152
+#define TCP_XMIT_HIWATER 128000
#define TCP_RECV_LOWATER 2048
-#define TCP_RECV_HIWATER 128000
+#define TCP_RECV_HIWATER 1048576
/*
* Bind hash list size and hash function. It has to be a power of 2 for
@@ -395,6 +395,22 @@ typedef struct tcp_listen_cnt_s {
uint32_t tlc_drop;
} tcp_listen_cnt_t;
+/*
+ * Track tcp_t entities bound to the same port/address tuple via SO_REUSEPORT.
+ * - tcprg_lock: Protects the other fields
+ * - tcprg_size: Allocated size (in entries) of tcprg_members array
+ * - tcprg_count: Count of occupied tcprg_members slots
+ * - tcprg_active: Count of members which still have SO_REUSEPORT set
+ * - tcprg_members: Connections associated with address/port group
+ */
+typedef struct tcp_rg_s {
+ kmutex_t tcprg_lock;
+ unsigned int tcprg_size;
+ unsigned int tcprg_count;
+ unsigned int tcprg_active;
+ tcp_t **tcprg_members;
+} tcp_rg_t;
+
#define TCP_TLC_REPORT_INTERVAL (30 * MINUTES)
#define TCP_DECR_LISTEN_CNT(tcp) \
@@ -678,7 +694,7 @@ extern int tcp_rwnd_set(tcp_t *, uint32_t);
extern int tcp_set_destination(tcp_t *);
extern void tcp_set_ws_value(tcp_t *);
extern void tcp_stop_lingering(tcp_t *);
-extern void tcp_update_pmtu(tcp_t *, boolean_t);
+extern boolean_t tcp_update_pmtu(tcp_t *, boolean_t);
extern mblk_t *tcp_zcopy_backoff(tcp_t *, mblk_t *, boolean_t);
extern boolean_t tcp_zcopy_check(tcp_t *);
extern void tcp_zcopy_notify(tcp_t *);
@@ -695,6 +711,10 @@ extern in_port_t tcp_bindi(tcp_t *, in_port_t, const in6_addr_t *,
int, boolean_t, boolean_t, boolean_t);
extern in_port_t tcp_update_next_port(in_port_t, const tcp_t *,
boolean_t);
+extern tcp_rg_t *tcp_rg_init(tcp_t *);
+extern boolean_t tcp_rg_remove(tcp_rg_t *, tcp_t *);
+extern void tcp_rg_destroy(tcp_rg_t *);
+extern void tcp_rg_setactive(tcp_rg_t *, boolean_t);
/*
* Fusion related functions in tcp_fusion.c.
diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c
index 5d42a69fa2..4e208465f2 100644
--- a/usr/src/uts/common/inet/udp/udp.c
+++ b/usr/src/uts/common/inet/udp/udp.c
@@ -1671,6 +1671,11 @@ udp_opt_get(conn_t *connp, t_scalar_t level, t_scalar_t name,
*i1 = udp->udp_vxlanhash;
mutex_exit(&connp->conn_lock);
return (sizeof (int));
+ case UDP_SND_TO_CONNECTED:
+ mutex_enter(&connp->conn_lock);
+ *i1 = udp->udp_snd_to_conn ? 1 : 0;
+ mutex_exit(&connp->conn_lock);
+ return (sizeof (int));
}
}
mutex_enter(&connp->conn_lock);
@@ -1826,6 +1831,11 @@ udp_do_opt_set(conn_opt_arg_t *coa, int level, int name,
}
/* Fully handled this option. */
return (0);
+ case UDP_SND_TO_CONNECTED:
+ mutex_enter(&connp->conn_lock);
+ udp->udp_snd_to_conn = onoff;
+ mutex_exit(&connp->conn_lock);
+ return (0);
}
break;
}
@@ -6096,10 +6106,18 @@ udp_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg,
else
return (error);
}
- if (udp->udp_state == TS_DATA_XFER) {
+
+ /*
+ * Check if we're allowed to send to a connection on which we've
+ * already called 'connect'. The POSIX spec allows both behaviors, but
+ * historically we've returned an error if already connected. The
+ * client can allow this via a sockopt.
+ */
+ if (udp->udp_state == TS_DATA_XFER && !udp->udp_snd_to_conn) {
UDPS_BUMP_MIB(us, udpOutErrors);
return (EISCONN);
}
+
error = proto_verify_ip_addr(connp->conn_family,
(struct sockaddr *)msg->msg_name, msg->msg_namelen);
if (error != 0) {
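
UDP_SND_TO_CONNECTED relaxes the historical behavior in which sendto() with
an explicit destination failed with EISCONN on a connected UDP socket. A
user-level sketch, assuming the option constant is exposed through
<netinet/udp.h> and eliding error handling:

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/udp.h>

ssize_t
send_elsewhere(int fd, const struct sockaddr_in *peer,
    const struct sockaddr_in *other, const void *buf, size_t len)
{
	int on = 1;

	/* Connect the socket to its usual peer. */
	(void) connect(fd, (const struct sockaddr *)peer, sizeof (*peer));

	/* Without this option, the sendto() below returns EISCONN. */
	(void) setsockopt(fd, IPPROTO_UDP, UDP_SND_TO_CONNECTED,
	    &on, sizeof (on));

	/* With it, sending past the connected address is permitted. */
	return (sendto(fd, buf, len, 0,
	    (const struct sockaddr *)other, sizeof (*other)));
}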
diff --git a/usr/src/uts/common/inet/udp/udp_opt_data.c b/usr/src/uts/common/inet/udp/udp_opt_data.c
index c8e7d79e47..9c05b8c876 100644
--- a/usr/src/uts/common/inet/udp/udp_opt_data.c
+++ b/usr/src/uts/common/inet/udp/udp_opt_data.c
@@ -294,7 +294,9 @@ opdes_t udp_opt_arr[] = {
},
{ UDP_NAT_T_ENDPOINT, IPPROTO_UDP, OA_RW, OA_RW, OP_PRIVPORT, 0, sizeof (int),
0 },
-{ UDP_SRCPORT_HASH, IPPROTO_UDP, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), 0 }
+{ UDP_SRCPORT_HASH, IPPROTO_UDP, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), 0 },
+{ UDP_SND_TO_CONNECTED, IPPROTO_UDP, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int),
+ 0 }
};
/*
diff --git a/usr/src/uts/common/inet/udp_impl.h b/usr/src/uts/common/inet/udp_impl.h
index 0fc597ccf3..ef11973707 100644
--- a/usr/src/uts/common/inet/udp_impl.h
+++ b/usr/src/uts/common/inet/udp_impl.h
@@ -179,12 +179,12 @@ typedef struct udp_s {
udp_issocket : 1, /* socket mode; sockfs is on top */
udp_nat_t_endpoint : 1, /* UDP_NAT_T_ENDPOINT option */
udp_rcvhdr : 1, /* UDP_RCVHDR option */
-
udp_vxlanhash: 1, /* UDP_SRCPORT_HASH option */
/* Because there's only VXLAN, cheat */
/* and only use a single bit */
+ udp_snd_to_conn: 1, /* UDP_SND_TO_CONNECTED option */
- udp_pad_to_bit_31 : 28;
+ udp_pad_to_bit_31 : 27;
/* Following 2 fields protected by the uf_lock */
struct udp_s *udp_bind_hash; /* Bind hash chain */