Diffstat (limited to 'usr/src/uts/common/inet')
41 files changed, 2753 insertions, 342 deletions
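The changes below add classic BPF socket-filter support (SO_ATTACH_FILTER and SO_DETACH_FILTER) to raw IP/ICMP sockets, SO_REUSEPORT-aware fanout insertion, a DLD "ipcheck" capability replacing the ill_allowed_ips[] address check, a synchronous ill_dl_up() that waits for the bind and capability negotiation, and the ipf "Cloud Firewall" event ring exposed through /dev/ipfev. As a rough consumer-side sketch of the new filtering surface (illustrative only, not part of this change; it assumes a 20-byte IPv4 header with no options ahead of the ICMP message, which matches what icmp_input() hands the filter):

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <net/bpf.h>

/*
 * Attach a filter accepting only ICMP echo requests (type 8) to a raw
 * socket, e.g. one from socket(AF_INET, SOCK_RAW, IPPROTO_ICMP). The
 * filter sees the packet starting at the IPv4 header, so without IP
 * options the ICMP type byte sits at offset 20.
 */
static int
attach_echo_filter(int s)
{
	static struct bpf_insn insns[] = {
		BPF_STMT(BPF_LD | BPF_B | BPF_ABS, 20),
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, 8, 0, 1),
		BPF_STMT(BPF_RET | BPF_K, 0xffffffffU),	/* accept */
		BPF_STMT(BPF_RET | BPF_K, 0),		/* drop */
	};
	struct bpf_program prog;

	prog.bf_len = sizeof (insns) / sizeof (insns[0]);
	prog.bf_insns = insns;

	return (setsockopt(s, SOL_SOCKET, SO_ATTACH_FILTER,
	    &prog, sizeof (prog)));
}

A later setsockopt(s, SOL_SOCKET, SO_DETACH_FILTER, NULL, 0) removes the program again.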
diff --git a/usr/src/uts/common/inet/bpf.h b/usr/src/uts/common/inet/bpf.h new file mode 100644 index 0000000000..e3eac799e5 --- /dev/null +++ b/usr/src/uts/common/inet/bpf.h @@ -0,0 +1,49 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#ifndef _INET_BPF_H +#define _INET_BPF_H + +#ifdef __cplusplus +extern "C" { +#endif + + +#ifdef _KERNEL + +#include <sys/types.h> + +/* + * Clone bpf_insn definition so that consumers don't need net/bpf.h to reason + * about struct sizing. + */ +typedef struct ip_bpf_insn { + uint16_t code; + uint8_t jt; + uint8_t jf; + uint32_t k; +} ip_bpf_insn_t; + +extern uint32_t ip_bpf_filter(ip_bpf_insn_t *, uchar_t *, uint_t, uint_t); +extern boolean_t ip_bpf_validate(ip_bpf_insn_t *, uint_t); + + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _INET_BPF_H */ diff --git a/usr/src/uts/common/inet/bpf_filter.c b/usr/src/uts/common/inet/bpf_filter.c new file mode 100644 index 0000000000..5a9ba38da6 --- /dev/null +++ b/usr/src/uts/common/inet/bpf_filter.c @@ -0,0 +1,572 @@ +/* $NetBSD: bpf_filter.c,v 1.35 2008/08/20 13:01:54 joerg Exp $ */ + +/* + * Copyright (c) 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from the Stanford/CMU enet packet filter, + * (net/enet.c) distributed as part of 4.3BSD, and code contributed + * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence + * Berkeley Laboratory. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)bpf_filter.c 8.1 (Berkeley) 6/10/93 + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ * Copyright 2016 Joyent, Inc. + */ + +#include <sys/param.h> +#include <sys/time.h> +#include <sys/stream.h> +#include <sys/byteorder.h> +#include <sys/sdt.h> +#include <inet/bpf.h> +#include <net/bpf.h> + +#define EXTRACT_SHORT(p) BE_IN16(p) +#define EXTRACT_LONG(p) BE_IN32(p) + +#define M_LEN(_m) ((_m)->b_wptr - (_m)->b_rptr) +#define mtod(_a, _t) ((_t)((_a)->b_rptr)) +#define MINDEX(len, m, k) \ +{ \ + len = M_LEN(m); \ + while (k >= len) { \ + k -= len; \ + m = m->b_cont; \ + if (m == 0) \ + return (0); \ + len = M_LEN(m); \ + } \ +} + +static int m_xword(mblk_t *, uint32_t, int *); +static int m_xhalf(mblk_t *, uint32_t, int *); + +static int +m_xword(mblk_t *m, uint32_t k, int *err) +{ + int len; + uchar_t *cp, *np; + mblk_t *m0; + + *err = 1; + MINDEX(len, m, k); + cp = mtod(m, uchar_t *) + k; + if (len >= k + 4) { + *err = 0; + return (EXTRACT_LONG(cp)); + } + m0 = m->b_cont; + if (m0 == 0 || M_LEN(m0) + len - k < 4) { + DTRACE_PROBE3(mblk_xword_fail, mblk_t *, m0, int, len, int, k); + return (0); + } + *err = 0; + np = mtod(m0, uchar_t *); + switch (len - k) { + + case 1: + return ((cp[0] << 24) | (np[0] << 16) | (np[1] << 8) | np[2]); + + case 2: + return ((cp[0] << 24) | (cp[1] << 16) | (np[0] << 8) | np[1]); + + default: + return ((cp[0] << 24) | (cp[1] << 16) | (cp[2] << 8) | np[0]); + } +} + +static int +m_xhalf(mblk_t *m, uint32_t k, int *err) +{ + int len; + uchar_t *cp; + mblk_t *m0; + + *err = 1; + MINDEX(len, m, k); + cp = mtod(m, uchar_t *) + k; + if (len >= k + 2) { + *err = 0; + return (EXTRACT_SHORT(cp)); + } + m0 = m->b_cont; + if (m0 == 0) { + DTRACE_PROBE3(mblk_xhalf_fail, mblk_t *, m0, int, len, int, k); + return (0); + } + *err = 0; + return ((cp[0] << 8) | mtod(m0, uchar_t *)[0]); +} + + +/* + * Execute the filter program starting at pc on the packet p + * wirelen is the length of the original packet + * buflen is the amount of data present + * When buflen is non-0, p is a pointer to the start of the packet and the + * packet is only in one mblk_t. + * When buflen is 0, p is an mblk_t pointer. + */ +uint32_t +ip_bpf_filter(ip_bpf_insn_t *pc, uchar_t *p, uint_t wirelen, uint_t buflen) +{ + uint32_t A, X, k; + uint32_t mem[BPF_MEMWORDS]; + + if (pc == 0) + /* + * No filter means accept all.
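+ * (A BPF program's return value is the number of packet bytes to
+ * accept, 0 meaning drop, so the all-ones value returned here
+ * accepts the entire packet.)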
+ */ + return ((uint32_t)-1); + A = 0; + X = 0; + --pc; + /* CONSTCOND */ + while (1) { + ++pc; + switch (pc->code) { + + default: +#ifdef _KERNEL + DTRACE_PROBE1(bpf_insn_unknown, + struct bpf_insn *, pc); + return (0); +#else + abort(); +#endif + case BPF_RET|BPF_K: + return (pc->k); + + case BPF_RET|BPF_A: + return (A); + + case BPF_LD|BPF_W|BPF_ABS: + k = pc->k; + if (k + sizeof (int32_t) > buflen) { +#ifdef _KERNEL + int merr = 0; + + if (buflen != 0) + return (0); + A = m_xword((mblk_t *)p, k, &merr); + if (merr != 0) + return (0); + continue; +#else + return (0); +#endif + } + A = EXTRACT_LONG(&p[k]); + continue; + + case BPF_LD|BPF_H|BPF_ABS: + k = pc->k; + if (k + sizeof (int16_t) > buflen) { +#ifdef _KERNEL + int merr; + + if (buflen != 0) + return (0); + A = m_xhalf((mblk_t *)p, k, &merr); + if (merr != 0) + return (0); + continue; +#else + return (0); +#endif + } + A = EXTRACT_SHORT(&p[k]); + continue; + + case BPF_LD|BPF_B|BPF_ABS: + k = pc->k; + if (k >= buflen) { +#ifdef _KERNEL + mblk_t *m; + int len; + + if (buflen != 0) + return (0); + m = (mblk_t *)p; + MINDEX(len, m, k); + A = mtod(m, uchar_t *)[k]; + continue; +#else + return (0); +#endif + } + A = p[k]; + continue; + + case BPF_LD|BPF_W|BPF_LEN: + A = wirelen; + continue; + + case BPF_LDX|BPF_W|BPF_LEN: + X = wirelen; + continue; + + case BPF_LD|BPF_W|BPF_IND: + k = X + pc->k; + if (k + sizeof (int32_t) > buflen) { +#ifdef _KERNEL + int merr = 0; + + if (buflen != 0) + return (0); + A = m_xword((mblk_t *)p, k, &merr); + if (merr != 0) + return (0); + continue; +#else + return (0); +#endif + } + A = EXTRACT_LONG(&p[k]); + continue; + + case BPF_LD|BPF_H|BPF_IND: + k = X + pc->k; + if (k + sizeof (int16_t) > buflen) { +#ifdef _KERNEL + int merr = 0; + + if (buflen != 0) + return (0); + A = m_xhalf((mblk_t *)p, k, &merr); + if (merr != 0) + return (0); + continue; +#else + return (0); +#endif + } + A = EXTRACT_SHORT(&p[k]); + continue; + + case BPF_LD|BPF_B|BPF_IND: + k = X + pc->k; + if (k >= buflen) { +#ifdef _KERNEL + mblk_t *m; + int len; + + if (buflen != 0) + return (0); + m = (mblk_t *)p; + MINDEX(len, m, k); + A = mtod(m, uchar_t *)[k]; + continue; +#else + return (0); +#endif + } + A = p[k]; + continue; + + case BPF_LDX|BPF_MSH|BPF_B: + k = pc->k; + if (k >= buflen) { +#ifdef _KERNEL + mblk_t *m; + int len; + + if (buflen != 0) + return (0); + m = (mblk_t *)p; + MINDEX(len, m, k); + X = (mtod(m, char *)[k] & 0xf) << 2; + continue; +#else + return (0); +#endif + } + X = (p[pc->k] & 0xf) << 2; + continue; + + case BPF_LD|BPF_IMM: + A = pc->k; + continue; + + case BPF_LDX|BPF_IMM: + X = pc->k; + continue; + + case BPF_LD|BPF_MEM: + A = mem[pc->k]; + continue; + + case BPF_LDX|BPF_MEM: + X = mem[pc->k]; + continue; + + case BPF_ST: + mem[pc->k] = A; + continue; + + case BPF_STX: + mem[pc->k] = X; + continue; + + case BPF_JMP|BPF_JA: + pc += pc->k; + continue; + + case BPF_JMP|BPF_JGT|BPF_K: + pc += (A > pc->k) ? pc->jt : pc->jf; + continue; + + case BPF_JMP|BPF_JGE|BPF_K: + pc += (A >= pc->k) ? pc->jt : pc->jf; + continue; + + case BPF_JMP|BPF_JEQ|BPF_K: + pc += (A == pc->k) ? pc->jt : pc->jf; + continue; + + case BPF_JMP|BPF_JSET|BPF_K: + pc += (A & pc->k) ? pc->jt : pc->jf; + continue; + + case BPF_JMP|BPF_JGT|BPF_X: + pc += (A > X) ? pc->jt : pc->jf; + continue; + + case BPF_JMP|BPF_JGE|BPF_X: + pc += (A >= X) ? pc->jt : pc->jf; + continue; + + case BPF_JMP|BPF_JEQ|BPF_X: + pc += (A == X) ? pc->jt : pc->jf; + continue; + + case BPF_JMP|BPF_JSET|BPF_X: + pc += (A & X) ? 
pc->jt : pc->jf; + continue; + + case BPF_ALU|BPF_ADD|BPF_X: + A += X; + continue; + + case BPF_ALU|BPF_SUB|BPF_X: + A -= X; + continue; + + case BPF_ALU|BPF_MUL|BPF_X: + A *= X; + continue; + + case BPF_ALU|BPF_DIV|BPF_X: + if (X == 0) + return (0); + A /= X; + continue; + + case BPF_ALU|BPF_AND|BPF_X: + A &= X; + continue; + + case BPF_ALU|BPF_OR|BPF_X: + A |= X; + continue; + + case BPF_ALU|BPF_LSH|BPF_X: + A <<= X; + continue; + + case BPF_ALU|BPF_RSH|BPF_X: + A >>= X; + continue; + + case BPF_ALU|BPF_ADD|BPF_K: + A += pc->k; + continue; + + case BPF_ALU|BPF_SUB|BPF_K: + A -= pc->k; + continue; + + case BPF_ALU|BPF_MUL|BPF_K: + A *= pc->k; + continue; + + case BPF_ALU|BPF_DIV|BPF_K: + A /= pc->k; + continue; + + case BPF_ALU|BPF_AND|BPF_K: + A &= pc->k; + continue; + + case BPF_ALU|BPF_OR|BPF_K: + A |= pc->k; + continue; + + case BPF_ALU|BPF_LSH|BPF_K: + A <<= pc->k; + continue; + + case BPF_ALU|BPF_RSH|BPF_K: + A >>= pc->k; + continue; + + case BPF_ALU|BPF_NEG: + A = -A; + continue; + + case BPF_MISC|BPF_TAX: + X = A; + continue; + + case BPF_MISC|BPF_TXA: + A = X; + continue; + } + } + /* NOTREACHED */ +} + +/* + * Return true if the 'fcode' is a valid filter program. + * The constraints are that each jump be forward and to a valid + * code, that memory accesses are within valid ranges (to the + * extent that this can be checked statically; loads of packet + * data have to be, and are, also checked at run time), and that + * the code terminates with either an accept or reject. + * + * The kernel needs to be able to verify an application's filter code. + * Otherwise, a bogus program could easily crash the system. + */ +boolean_t +ip_bpf_validate(ip_bpf_insn_t *f, uint_t len) +{ + uint_t i, from; + ip_bpf_insn_t *p; + + if (len < 1 || len > BPF_MAXINSNS) + return (B_FALSE); + + for (i = 0; i < len; ++i) { + p = &f[i]; + DTRACE_PROBE1(bpf_valid_insn, struct bpf_insn *, p); + switch (BPF_CLASS(p->code)) { + /* + * Check that memory operations use valid addresses. + */ + case BPF_LD: + case BPF_LDX: + switch (BPF_MODE(p->code)) { + case BPF_MEM: + if (p->k >= BPF_MEMWORDS) + return (B_FALSE); + break; + case BPF_ABS: + case BPF_IND: + case BPF_MSH: + case BPF_IMM: + case BPF_LEN: + break; + default: + return (B_FALSE); + } + break; + case BPF_ST: + case BPF_STX: + if (p->k >= BPF_MEMWORDS) + return (B_FALSE); + break; + case BPF_ALU: + switch (BPF_OP(p->code)) { + case BPF_ADD: + case BPF_SUB: + case BPF_MUL: + case BPF_OR: + case BPF_AND: + case BPF_LSH: + case BPF_RSH: + case BPF_NEG: + break; + case BPF_DIV: + /* + * Check for constant division by 0. + */ + if (BPF_RVAL(p->code) == BPF_K && p->k == 0) + return (B_FALSE); + break; + default: + return (B_FALSE); + } + break; + case BPF_JMP: + /* + * Check that jumps are within the code block, + * and that unconditional branches don't go + * backwards as a result of an overflow. + * Unconditional branches have a 32-bit offset, + * so they could overflow; we check to make + * sure they don't. Conditional branches have + * an 8-bit offset, and the from address is <= + * BPF_MAXINSNS, and we assume that BPF_MAXINSNS + * is sufficiently small that adding 255 to it + * won't overflow. + * + * We know that len is <= BPF_MAXINSNS, and we + * assume that BPF_MAXINSNS is < the maximum size + * of a uint_t, so that i + 1 doesn't overflow. 
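+ *
+ * For example, with len == 3 a BPF_JA at i == 0 (from == 1) may
+ * use k == 0 or k == 1 (targeting instructions 1 and 2); k == 2
+ * would point past the end of the program and is rejected.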
+ */ + from = i + 1; + switch (BPF_OP(p->code)) { + case BPF_JA: + if (from + p->k < from || from + p->k >= len) + return (B_FALSE); + break; + case BPF_JEQ: + case BPF_JGT: + case BPF_JGE: + case BPF_JSET: + if (from + p->jt >= len || from + p->jf >= len) + return (B_FALSE); + break; + default: + return (B_FALSE); + } + break; + case BPF_RET: + break; + case BPF_MISC: + break; + default: + return (B_FALSE); + } + } + + return (BPF_CLASS(f[len - 1].code) == BPF_RET); +} diff --git a/usr/src/uts/common/inet/ip.h b/usr/src/uts/common/inet/ip.h index c081c44a04..ebf2574363 100644 --- a/usr/src/uts/common/inet/ip.h +++ b/usr/src/uts/common/inet/ip.h @@ -1416,6 +1416,7 @@ typedef union ill_g_head_u { #define ILL_CAPAB_DLD 0x20 /* DLD capabilities */ #define ILL_CAPAB_DLD_POLL 0x40 /* Polling */ #define ILL_CAPAB_DLD_DIRECT 0x80 /* Direct function call */ +#define ILL_CAPAB_DLD_IPCHECK 0x100 /* Check if IPs are permitted */ /* * Per-ill Hardware Checksumming capbilities. @@ -1772,6 +1773,10 @@ typedef struct ill_s { * Used to save errors that occur during plumbing */ uint_t ill_ifname_pending_err; + /* + * Used to save errors that occur during binding + */ + uint_t ill_dl_bind_err; avl_node_t ill_avl_byppa; /* avl node based on ppa */ uint_t ill_mcast_nces; /* Number of NCEs that are multicast. */ list_t ill_nce; /* pointer to nce_s list */ @@ -1938,6 +1943,7 @@ typedef struct ill_s { * ill_nd_lla_len ipsq + down ill only when ill is up * ill_phys_addr_pend ipsq + down ill only when ill is up * ill_ifname_pending_err ipsq ipsq + * ill_dl_bind_err ipsq ipsq * ill_avl_byppa ipsq, ill_g_lock write once * * ill_fastpath_list ill_lock ill_lock @@ -3580,6 +3586,8 @@ typedef void (*ip_flow_enable_t)(void *, ip_mac_tx_cookie_t); typedef void *(*ip_dld_callb_t)(void *, ip_flow_enable_t, void *); typedef boolean_t (*ip_dld_fctl_t)(void *, ip_mac_tx_cookie_t); +typedef boolean_t (*ip_mac_ipcheck_t)(void *, boolean_t, + in6_addr_t *); typedef int (*ip_capab_func_t)(void *, uint_t, void *, uint_t); @@ -3632,6 +3640,12 @@ typedef struct ill_dld_direct_s { /* DLD provided driver Tx */ void *idd_tx_fctl_dh; /* mac_client_handle */ } ill_dld_direct_t; +/* IP - DLD direct function call to check if an IP is allowed */ +typedef struct ill_dld_ipcheck_s { + ip_mac_ipcheck_t idi_allowed_df; + void *idi_allowed_dh; +} ill_dld_ipcheck_t; + /* IP - DLD polling capability */ typedef struct ill_dld_poll_s { ill_rx_ring_t idp_ring_tbl[ILL_MAX_RINGS]; @@ -3643,6 +3657,7 @@ struct ill_dld_capab_s { void *idc_capab_dh; /* dld_str_t *dsp */ ill_dld_direct_t idc_direct; ill_dld_poll_t idc_poll; + ill_dld_ipcheck_t idc_ipcheck; }; /* diff --git a/usr/src/uts/common/inet/ip/conn_opt.c b/usr/src/uts/common/inet/ip/conn_opt.c index 7aac9b655a..eeec56b162 100644 --- a/usr/src/uts/common/inet/ip/conn_opt.c +++ b/usr/src/uts/common/inet/ip/conn_opt.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc. * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -644,6 +645,9 @@ conn_opt_get(conn_opt_arg_t *coa, t_scalar_t level, t_scalar_t name, case SO_REUSEADDR: *i1 = connp->conn_reuseaddr ? 
SO_REUSEADDR : 0; break; /* goto sizeof (int) option return */ + case SO_REUSEPORT: + *i1 = connp->conn_reuseport; + break; /* goto sizeof (int) option return */ case SO_TYPE: *i1 = connp->conn_so_type; break; /* goto sizeof (int) option return */ @@ -1214,8 +1218,24 @@ conn_opt_set_ip(conn_opt_arg_t *coa, t_scalar_t name, uint_t inlen, ip_stack_t *ipst = connp->conn_netstack->netstack_ip; int error; - if (connp->conn_family != AF_INET) + if (connp->conn_family == AF_INET6 && + connp->conn_ipversion == IPV4_VERSION) { + /* + * Allow certain IPv4 options to be set on an AF_INET6 socket + * if the connection is still IPv4. + */ + switch (name) { + case IP_TOS: + case T_IP_TOS: + case IP_TTL: + case IP_DONTFRAG: + break; + default: + return (EINVAL); + } + } else if (connp->conn_family != AF_INET) { return (EINVAL); + } ifindex = UINT_MAX; switch (name) { diff --git a/usr/src/uts/common/inet/ip/icmp.c b/usr/src/uts/common/inet/ip/icmp.c index 57ee0c5585..46c791298a 100644 --- a/usr/src/uts/common/inet/ip/icmp.c +++ b/usr/src/uts/common/inet/ip/icmp.c @@ -81,6 +81,7 @@ #include <sys/tsol/tnet.h> #include <inet/rawip_impl.h> +#include <net/bpf.h> #include <sys/disp.h> @@ -1018,6 +1019,12 @@ icmp_close_free(conn_t *connp) icmp->icmp_filter = NULL; } + if (icmp->icmp_bpf_len != 0) { + kmem_free(icmp->icmp_bpf_prog, icmp->icmp_bpf_len); + icmp->icmp_bpf_len = 0; + icmp->icmp_bpf_prog = NULL; + } + /* * Clear any fields which the kmem_cache constructor clears. * Only icmp_connp needs to be preserved. @@ -1971,6 +1978,104 @@ icmp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr) return (err); } +static int +icmp_attach_filter(icmp_t *icmp, uint_t inlen, const uchar_t *invalp) +{ + struct bpf_program prog; + ip_bpf_insn_t *insns = NULL; + unsigned int size; + +#ifdef _LP64 + if (get_udatamodel() != DATAMODEL_NATIVE) { + struct bpf_program32 *prog32; + + if (inlen != sizeof (struct bpf_program32)) { + return (EINVAL); + } + prog32 = (struct bpf_program32 *)invalp; + prog.bf_len = prog32->bf_len; + prog.bf_insns = (void *)(uint64_t)prog32->bf_insns; + } else +#endif + if (inlen == sizeof (struct bpf_program)) { + bcopy(invalp, &prog, sizeof (prog)); + } else { + return (EINVAL); + } + + if (prog.bf_len > BPF_MAXINSNS || prog.bf_len == 0) { + return (EINVAL); + } + size = prog.bf_len * sizeof (struct bpf_insn); + insns = kmem_alloc(size, KM_SLEEP); + if (copyin(prog.bf_insns, insns, size) != 0) { + kmem_free(insns, size); + return (EFAULT); + } + if (!ip_bpf_validate(insns, prog.bf_len)) { + kmem_free(insns, size); + return (EINVAL); + } + + rw_enter(&icmp->icmp_bpf_lock, RW_WRITER); + if (icmp->icmp_bpf_len != 0) { + ASSERT(icmp->icmp_bpf_prog != NULL); + + kmem_free(icmp->icmp_bpf_prog, icmp->icmp_bpf_len); + } + icmp->icmp_bpf_len = size; + icmp->icmp_bpf_prog = insns; + rw_exit(&icmp->icmp_bpf_lock); + return (0); +} + +static int +icmp_detach_filter(icmp_t *icmp) +{ + int error; + + rw_enter(&icmp->icmp_bpf_lock, RW_WRITER); + if (icmp->icmp_bpf_len == 0) { + ASSERT(icmp->icmp_bpf_prog == NULL); + error = ENOENT; + } else { + kmem_free(icmp->icmp_bpf_prog, + icmp->icmp_bpf_len); + icmp->icmp_bpf_len = 0; + icmp->icmp_bpf_prog = NULL; + error = 0; + } + rw_exit(&icmp->icmp_bpf_lock); + return (error); +} + +static boolean_t +icmp_eval_filter(icmp_t *icmp, mblk_t *mp, ip_recv_attr_t *ira) +{ + boolean_t res; + uchar_t *buf = mp->b_rptr; + uint_t wirelen, len = MBLKL(mp); + + rw_enter(&icmp->icmp_bpf_lock, RW_READER); + if (icmp->icmp_bpf_len == 0) { + rw_exit(&icmp->icmp_bpf_lock); + return 
(B_FALSE); + } + if (ira->ira_flags & IRAF_IS_IPV4) { + ipha_t *ipha = (ipha_t *)buf; + + wirelen = ntohs(ipha->ipha_length); + } else { + ip6_t *ip6h = (ip6_t *)buf; + + wirelen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN; + } + res = !ip_bpf_filter(icmp->icmp_bpf_prog, buf, wirelen, len); + rw_exit(&icmp->icmp_bpf_lock); + + return (res); +} + /* * This routine sets socket options. */ @@ -2060,6 +2165,10 @@ icmp_do_opt_set(conn_opt_arg_t *coa, int level, int name, return (ENOBUFS); } break; + case SO_ATTACH_FILTER: + return (icmp_attach_filter(icmp, inlen, invalp)); + case SO_DETACH_FILTER: + return (icmp_detach_filter(icmp)); } break; @@ -2605,6 +2714,14 @@ icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) /* Initialize regardless of IP version */ ipps.ipp_fields = 0; + /* Apply socket filter, if needed */ + if (icmp->icmp_bpf_len != 0) { + if (icmp_eval_filter(icmp, mp, ira)) { + freemsg(mp); + return; + } + } + if (ira->ira_flags & IRAF_IS_IPV4) { ASSERT(IPH_HDR_VERSION(rptr) == IPV4_VERSION); ASSERT(MBLKL(mp) >= sizeof (ipha_t)); diff --git a/usr/src/uts/common/inet/ip/icmp_opt_data.c b/usr/src/uts/common/inet/ip/icmp_opt_data.c index ff0310de0c..d65d3164d3 100644 --- a/usr/src/uts/common/inet/ip/icmp_opt_data.c +++ b/usr/src/uts/common/inet/ip/icmp_opt_data.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. */ #include <sys/types.h> @@ -41,6 +42,7 @@ #include <netinet/ip_mroute.h> #include <inet/optcom.h> #include <inet/rawip_impl.h> +#include <net/bpf.h> /* * Table of all known options handled on a ICMP protocol stack. @@ -86,6 +88,10 @@ opdes_t icmp_opt_arr[] = { 0 }, { SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, +{ SO_ATTACH_FILTER, SOL_SOCKET, OA_W, OA_W, OP_NP, 0, + sizeof (struct bpf_program), 0 }, +{ SO_DETACH_FILTER, SOL_SOCKET, OA_W, OA_W, OP_NP, 0, 0, 0 }, + { IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, (OP_VARLEN|OP_NODEFAULT), IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ }, diff --git a/usr/src/uts/common/inet/ip/ip.c b/usr/src/uts/common/inet/ip/ip.c index 6063fa01d2..704f152bb9 100644 --- a/usr/src/uts/common/inet/ip/ip.c +++ b/usr/src/uts/common/inet/ip/ip.c @@ -8235,7 +8235,6 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) conn_t *connp = NULL; t_uscalar_t paddrreq; mblk_t *mp_hw; - boolean_t success; boolean_t ioctl_aborted = B_FALSE; boolean_t log = B_TRUE; @@ -8335,7 +8334,8 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) ill->ill_state_flags &= ~ILL_DOWN_IN_PROGRESS; mutex_exit(&ill->ill_lock); /* - * Something went wrong with the bind. We presumably + * Something went wrong with the bind. If this was the + * result of a DL_NOTE_REPLUMB, then we presumably * have an IOCTL hanging out waiting for completion. * Find it, take down the interface that was coming * up, and complete the IOCTL with the error noted. @@ -8352,6 +8352,15 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) (void) ipif_down(ipif, NULL, NULL); /* error is set below the switch */ + } else { + /* + * There's no pending IOCTL, so the bind was + * most likely started by ill_dl_up(). We save + * the error and let it take care of responding + * to the IOCTL. + */ + ill->ill_dl_bind_err = dlea->dl_unix_errno ? 
+ dlea->dl_unix_errno : ENXIO; } break; case DL_ENABMULTI_REQ: @@ -8475,55 +8484,7 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) DTRACE_PROBE1(ip__rput__dlpi__bind__ack, ill_t *, ill); ill_nic_event_dispatch(ill, 0, NE_UP, NULL, 0); - /* - * Now bring up the resolver; when that is complete, we'll - * create IREs. Note that we intentionally mirror what - * ipif_up() would have done, because we got here by way of - * ill_dl_up(), which stopped ipif_up()'s processing. - */ - if (ill->ill_isv6) { - /* - * v6 interfaces. - * Unlike ARP which has to do another bind - * and attach, once we get here we are - * done with NDP - */ - (void) ipif_resolver_up(ipif, Res_act_initial); - if ((err = ipif_ndp_up(ipif, B_TRUE)) == 0) - err = ipif_up_done_v6(ipif); - } else if (ill->ill_net_type == IRE_IF_RESOLVER) { - /* - * ARP and other v4 external resolvers. - * Leave the pending mblk intact so that - * the ioctl completes in ip_rput(). - */ - if (connp != NULL) - mutex_enter(&connp->conn_lock); - mutex_enter(&ill->ill_lock); - success = ipsq_pending_mp_add(connp, ipif, q, mp1, 0); - mutex_exit(&ill->ill_lock); - if (connp != NULL) - mutex_exit(&connp->conn_lock); - if (success) { - err = ipif_resolver_up(ipif, Res_act_initial); - if (err == EINPROGRESS) { - freemsg(mp); - return; - } - mp1 = ipsq_pending_mp_get(ipsq, &connp); - } else { - /* The conn has started closing */ - err = EINTR; - } - } else { - /* - * This one is complete. Reply to pending ioctl. - */ - (void) ipif_resolver_up(ipif, Res_act_initial); - err = ipif_up_done(ipif); - } - - if ((err == 0) && (ill->ill_up_ipifs)) { + if (ill->ill_up_ipifs) { err = ill_up_ipifs(ill, q, mp1); if (err == EINPROGRESS) { freemsg(mp); @@ -8531,25 +8492,6 @@ ip_rput_dlpi_writer(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *dummy_arg) } } - /* - * If we have a moved ipif to bring up, and everything has - * succeeded to this point, bring it up on the IPMP ill. - * Otherwise, leave it down -- the admin can try to bring it - * up by hand if need be. - */ - if (ill->ill_move_ipif != NULL) { - if (err != 0) { - ill->ill_move_ipif = NULL; - } else { - ipif = ill->ill_move_ipif; - ill->ill_move_ipif = NULL; - err = ipif_up(ipif, q, mp1); - if (err == EINPROGRESS) { - freemsg(mp); - return; - } - } - } break; case DL_NOTIFY_IND: { @@ -12621,6 +12563,7 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) struct iocblk *iocp = (struct iocblk *)mp->b_rptr; ip_ioctl_cmd_t *ipip = arg; ip_extract_func_t *extract_funcp; + ill_t *ill; cmd_info_t ci; int err; boolean_t entered_ipsq = B_FALSE; @@ -12742,6 +12685,13 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) ipsq_current_start(ipsq, ci.ci_ipif, ipip->ipi_cmd); /* + * We need to cache the ill_t that we're going to use as the argument + * to the ipif-ioctl DTrace probe (below) because the ci_ipif can be + * blown away by calling ipi_func. + */ + ill = ci.ci_ipif == NULL ? NULL : ci.ci_ipif->ipif_ill; + + /* * A return value of EINPROGRESS means the ioctl is * either queued and waiting for some reason or has * already completed. @@ -12749,9 +12699,7 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) err = (*ipip->ipi_func)(ci.ci_ipif, ci.ci_sin, q, mp, ipip, ci.ci_lifr); DTRACE_PROBE4(ipif__ioctl, char *, "ip_process_ioctl finish WR", - int, ipip->ipi_cmd, - ill_t *, ci.ci_ipif == NULL ? 
NULL : ci.ci_ipif->ipif_ill, - ipif_t *, ci.ci_ipif); + int, ipip->ipi_cmd, ill_t *, ill, ipif_t *, ci.ci_ipif); ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq); if (entered_ipsq) diff --git a/usr/src/uts/common/inet/ip/ip_if.c b/usr/src/uts/common/inet/ip/ip_if.c index cc67299a1b..2307837eb8 100644 --- a/usr/src/uts/common/inet/ip/ip_if.c +++ b/usr/src/uts/common/inet/ip/ip_if.c @@ -22,7 +22,7 @@ * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 1990 Mentat Inc. * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright (c) 2016, Joyent, Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. * Copyright (c) 2014, OmniTI Computer Consulting, Inc. All rights reserved. */ @@ -174,7 +174,7 @@ static ipif_t *ipif_lookup_on_name_async(char *name, size_t namelen, static int ill_alloc_ppa(ill_if_t *, ill_t *); static void ill_delete_interface_type(ill_if_t *); -static int ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q); +static int ill_dl_up(ill_t *ill, ipif_t *ipif); static void ill_dl_down(ill_t *ill); static void ill_down(ill_t *ill); static void ill_down_ipifs(ill_t *, boolean_t); @@ -1380,6 +1380,36 @@ ill_capability_probe(ill_t *ill) ill->ill_dlpi_capab_state = IDCS_PROBE_SENT; } +static boolean_t +ill_capability_wait(ill_t *ill) +{ + /* + * I'm in this ill's squeue, aka a writer. The ILL_CONDEMNED flag can + * only be set by someone who is the writer. Since we + * drop-and-reacquire the squeue in this loop, we need to check for + * ILL_CONDEMNED, which if set means nothing can signal our capability + * condition variable. + */ + ASSERT(IAM_WRITER_ILL(ill)); + + while (ill->ill_capab_pending_cnt != 0 && + (ill->ill_state_flags & ILL_CONDEMNED) == 0) { + /* This may enable blocked callers of ill_capability_done(). */ + ipsq_exit(ill->ill_phyint->phyint_ipsq); + /* Pause a bit (1msec) before we re-enter the squeue. */ + delay(drv_usectohz(1000000)); + + /* + * If ipsq_enter() fails, someone set ILL_CONDEMNED + * while we dropped the squeue. Indicate such to the caller. + */ + if (!ipsq_enter(ill, B_FALSE, CUR_OP)) + return (B_FALSE); + } + + return ((ill->ill_state_flags & ILL_CONDEMNED) == 0); +} + void ill_capability_reset(ill_t *ill, boolean_t reneg) { @@ -1390,6 +1420,8 @@ ill_capability_reset(ill_t *ill, boolean_t reneg) ill->ill_dlpi_capab_state = reneg ? IDCS_RENEG : IDCS_RESET_SENT; + ASSERT(ill->ill_capab_reset_mp != NULL); + ill_capability_send(ill, ill->ill_capab_reset_mp); ill->ill_capab_reset_mp = NULL; /* @@ -2109,6 +2141,49 @@ ill_capability_lso_enable(ill_t *ill) } } +/* + * Check whether or not mac will prevent us from sending with a given IP + * address. This requires having the IPCHECK capability, which we should + * always be able to successfully negotiate, but if it's somehow missing + * then we just permit the caller to use the address, since mac does the + * actual enforcement and ip is just performing a courtesy check to help + * prevent users from unwittingly setting and attempting to use blocked + * addresses. 
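+ *
+ * This check is applied from ip_sioctl_addr() when an address is
+ * assigned and again from ipif_up() when the interface is brought up.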
+ */ +static boolean_t +ill_ipcheck_addr(ill_t *ill, in6_addr_t *v6addr) +{ + if ((ill->ill_capabilities & ILL_CAPAB_DLD_IPCHECK) == 0) + return (B_TRUE); + + ill_dld_ipcheck_t *idi = &ill->ill_dld_capab->idc_ipcheck; + ip_mac_ipcheck_t ipcheck = idi->idi_allowed_df; + return (ipcheck(idi->idi_allowed_dh, ill->ill_isv6, v6addr)); +} + +static void +ill_capability_ipcheck_enable(ill_t *ill) +{ + ill_dld_capab_t *idc = ill->ill_dld_capab; + ill_dld_ipcheck_t *idi = &idc->idc_ipcheck; + dld_capab_ipcheck_t spoof; + int rc; + + ASSERT(IAM_WRITER_ILL(ill)); + + bzero(&spoof, sizeof (spoof)); + if ((rc = idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_IPCHECK, + &spoof, DLD_ENABLE)) == 0) { + idi->idi_allowed_df = (ip_mac_ipcheck_t)spoof.ipc_allowed_df; + idi->idi_allowed_dh = spoof.ipc_allowed_dh; + ill->ill_capabilities |= ILL_CAPAB_DLD_IPCHECK; + } else { + cmn_err(CE_WARN, "warning: could not enable IPCHECK " + "capability, rc = %d\n", rc); + DTRACE_PROBE2(ipcheck__off, (ill_t *), ill, (int), rc); + } +} + static void ill_capability_dld_enable(ill_t *ill) { @@ -2121,6 +2196,8 @@ ill_capability_dld_enable(ill_t *ill) ill_capability_direct_enable(ill); ill_capability_poll_enable(ill); } + + ill_capability_ipcheck_enable(ill); ill_capability_lso_enable(ill); ill->ill_capabilities |= ILL_CAPAB_DLD; ill_mac_perim_exit(ill, mph); @@ -2186,6 +2263,15 @@ ill_capability_dld_disable(ill_t *ill) NULL, DLD_DISABLE); } + if ((ill->ill_capabilities & ILL_CAPAB_DLD_IPCHECK) != 0) { + ASSERT(ill->ill_dld_capab->idc_ipcheck.idi_allowed_df != NULL); + ASSERT(ill->ill_dld_capab->idc_ipcheck.idi_allowed_dh != NULL); + + ill->ill_capabilities &= ~ILL_CAPAB_DLD_IPCHECK; + (void) idc->idc_capab_df(idc->idc_capab_dh, DLD_CAPAB_IPCHECK, + NULL, DLD_DISABLE); + } + ill->ill_capabilities &= ~ILL_CAPAB_DLD; ill_mac_perim_exit(ill, mph); } @@ -9676,7 +9762,6 @@ ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, in6_addr_t v6addr; boolean_t need_up = B_FALSE; ill_t *ill; - int i; ip1dbg(("ip_sioctl_addr(%s:%u %p)\n", ipif->ipif_ill->ill_name, ipif->ipif_id, (void *)ipif)); @@ -9751,20 +9836,9 @@ ip_sioctl_addr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, IN6_IPADDR_TO_V4MAPPED(addr, &v6addr); } - /* - * verify that the address being configured is permitted by the - * ill_allowed_ips[] for the interface. - */ - if (ill->ill_allowed_ips_cnt > 0) { - for (i = 0; i < ill->ill_allowed_ips_cnt; i++) { - if (IN6_ARE_ADDR_EQUAL(&ill->ill_allowed_ips[i], - &v6addr)) - break; - } - if (i == ill->ill_allowed_ips_cnt) { - pr_addr_dbg("!allowed addr %s\n", AF_INET6, &v6addr); - return (EPERM); - } + /* verify that the address being configured is permitted by mac */ + if (!ill_ipcheck_addr(ill, &v6addr)) { + return (EPERM); } /* * Even if there is no change we redo things just to rerun @@ -12704,6 +12778,12 @@ ill_dl_down(ill_t *ill) } ill->ill_unbind_mp = NULL; + + mutex_enter(&ill->ill_lock); + ill->ill_dl_up = 0; + ill_nic_event_dispatch(ill, 0, NE_DOWN, NULL, 0); + mutex_exit(&ill->ill_lock); + if (mp != NULL) { ip1dbg(("ill_dl_down: %s (%u) for %s\n", dl_primstr(*(int *)mp->b_rptr), *(int *)mp->b_rptr, @@ -12726,11 +12806,13 @@ ill_dl_down(ill_t *ill) ill_capability_dld_disable(ill); ill_capability_reset(ill, B_FALSE); ill_dlpi_send(ill, mp); + + /* + * Wait for the capability reset to finish. + * In this case, it doesn't matter WHY or HOW it finished. 
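+ * (That is why the return value is cast away below: even if the wait
+ * ended because the ill was condemned, the interface is on its way
+ * down regardless.)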
+ */ + (void) ill_capability_wait(ill); } - mutex_enter(&ill->ill_lock); - ill->ill_dl_up = 0; - ill_nic_event_dispatch(ill, 0, NE_DOWN, NULL, 0); - mutex_exit(&ill->ill_lock); } void @@ -12852,6 +12934,7 @@ void ill_capability_done(ill_t *ill) { ASSERT(ill->ill_capab_pending_cnt != 0); + ASSERT(IAM_WRITER_ILL(ill)); ill_dlpi_done(ill, DL_CAPABILITY_REQ); @@ -14480,7 +14563,14 @@ ipif_up(ipif_t *ipif, queue_t *q, mblk_t *mp) * address/netmask etc cause a down/up dance, but * does not cause an unbind (DL_UNBIND) with the driver */ - return (ill_dl_up(ill, ipif, mp, q)); + if ((err = ill_dl_up(ill, ipif)) != 0) { + return (err); + } + } + + /* Reject bringing up interfaces with unusable IP addresses */ + if (!ill_ipcheck_addr(ill, &ipif->ipif_v6lcl_addr)) { + return (EPERM); } /* @@ -14593,24 +14683,22 @@ ill_delete_ires(ill_t *ill) /* * Perform a bind for the physical device. - * When the routine returns EINPROGRESS then mp has been consumed and - * the ioctl will be acked from ip_rput_dlpi. - * Allocate an unbind message and save it until ipif_down. + * + * When the routine returns successfully then dlpi has been bound and + * capabilities negotiated. An unbind message will have been allocated + * for later use in ipif_down. */ static int -ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) +ill_dl_up(ill_t *ill, ipif_t *ipif) { mblk_t *bind_mp = NULL; mblk_t *unbind_mp = NULL; - conn_t *connp; - boolean_t success; int err; DTRACE_PROBE2(ill__downup, char *, "ill_dl_up", ill_t *, ill); ip1dbg(("ill_dl_up(%s)\n", ill->ill_name)); ASSERT(IAM_WRITER_ILL(ill)); - ASSERT(mp != NULL); /* * Make sure we have an IRE_MULTICAST in case we immediately @@ -14645,19 +14733,6 @@ ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) if (unbind_mp == NULL) goto bad; } - /* - * Record state needed to complete this operation when the - * DL_BIND_ACK shows up. Also remember the pre-allocated mblks. - */ - connp = CONN_Q(q) ? Q_TO_CONN(q) : NULL; - ASSERT(connp != NULL || !CONN_Q(q)); - GRAB_CONN_LOCK(q); - mutex_enter(&ipif->ipif_ill->ill_lock); - success = ipsq_pending_mp_add(connp, ipif, q, mp, 0); - mutex_exit(&ipif->ipif_ill->ill_lock); - RELEASE_CONN_LOCK(q); - if (!success) - goto bad; /* * Save the unbind message for ill_dl_down(); it will be consumed when @@ -14669,6 +14744,18 @@ ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) ill_dlpi_send(ill, bind_mp); /* Send down link-layer capabilities probe if not already done. */ ill_capability_probe(ill); + /* + * Wait for DLPI to be bound and the capability probe to finish. + * The call drops-and-reacquires the squeue. If it couldn't because + * ILL_CONDEMNED got set, bail. + */ + if (!ill_capability_wait(ill)) + return (ENXIO); + + /* DLPI failed to bind. Return the saved error */ + if (!ill->ill_dl_up) { + return (ill->ill_dl_bind_err); + } /* * Sysid used to rely on the fact that netboots set domainname @@ -14686,11 +14773,7 @@ ill_dl_up(ill_t *ill, ipif_t *ipif, mblk_t *mp, queue_t *q) cmn_err(CE_WARN, "no cached dhcp response"); } - /* - * This operation will complete in ip_rput_dlpi with either - * a DL_BIND_ACK or DL_ERROR_ACK. 
- */ - return (EINPROGRESS); + return (0); bad: ip1dbg(("ill_dl_up(%s) FAILED\n", ill->ill_name)); diff --git a/usr/src/uts/common/inet/ip/ip_squeue.c b/usr/src/uts/common/inet/ip/ip_squeue.c index 13e961333c..b6565d9c1f 100644 --- a/usr/src/uts/common/inet/ip/ip_squeue.c +++ b/usr/src/uts/common/inet/ip/ip_squeue.c @@ -153,7 +153,7 @@ ip_squeue_create(pri_t pri) { squeue_t *sqp; - sqp = squeue_create(pri); + sqp = squeue_create(pri, B_TRUE); ASSERT(sqp != NULL); if (ip_squeue_create_callback != NULL) ip_squeue_create_callback(sqp); diff --git a/usr/src/uts/common/inet/ip/ipclassifier.c b/usr/src/uts/common/inet/ip/ipclassifier.c index 34832d56e5..d47997a4aa 100644 --- a/usr/src/uts/common/inet/ip/ipclassifier.c +++ b/usr/src/uts/common/inet/ip/ipclassifier.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc. * Copyright 2019 OmniOS Community Edition (OmniOSce) Association. * Copyright 2022 Joyent, Inc. */ @@ -871,67 +872,91 @@ ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp) mutex_exit(&(connfp)->connf_lock); \ } -#define IPCL_HASH_INSERT_BOUND(connfp, connp) { \ - conn_t *pconnp = NULL, *nconnp; \ - IPCL_HASH_REMOVE((connp)); \ - mutex_enter(&(connfp)->connf_lock); \ - nconnp = (connfp)->connf_head; \ - while (nconnp != NULL && \ - !_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6)) { \ - pconnp = nconnp; \ - nconnp = nconnp->conn_next; \ - } \ - if (pconnp != NULL) { \ - pconnp->conn_next = (connp); \ - (connp)->conn_prev = pconnp; \ - } else { \ - (connfp)->connf_head = (connp); \ - } \ - if (nconnp != NULL) { \ - (connp)->conn_next = nconnp; \ - nconnp->conn_prev = (connp); \ - } \ - (connp)->conn_fanout = (connfp); \ - (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ - IPCL_BOUND; \ - CONN_INC_REF(connp); \ - mutex_exit(&(connfp)->connf_lock); \ -} +/* + * When inserting bound or wildcard entries into the hash, ordering rules are + * used to facilitate timely and correct lookups. The order is as follows: + * 1. Entries bound to a specific address + * 2. Entries bound to INADDR_ANY + * 3. Entries bound to ADDR_UNSPECIFIED + * Entries in a category which share conn_lport (such as those using + * SO_REUSEPORT) will be ordered such that the newest inserted is first. + */ -#define IPCL_HASH_INSERT_WILDCARD(connfp, connp) { \ - conn_t **list, *prev, *next; \ - boolean_t isv4mapped = \ - IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6); \ - IPCL_HASH_REMOVE((connp)); \ - mutex_enter(&(connfp)->connf_lock); \ - list = &(connfp)->connf_head; \ - prev = NULL; \ - while ((next = *list) != NULL) { \ - if (isv4mapped && \ - IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) && \ - connp->conn_zoneid == next->conn_zoneid) { \ - (connp)->conn_next = next; \ - if (prev != NULL) \ - prev = next->conn_prev; \ - next->conn_prev = (connp); \ - break; \ - } \ - list = &next->conn_next; \ - prev = next; \ - } \ - (connp)->conn_prev = prev; \ - *list = (connp); \ - (connp)->conn_fanout = (connfp); \ - (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ - IPCL_BOUND; \ - CONN_INC_REF((connp)); \ - mutex_exit(&(connfp)->connf_lock); \ +void +ipcl_hash_insert_bound(connf_t *connfp, conn_t *connp) +{ + conn_t *pconnp, *nconnp; + + IPCL_HASH_REMOVE(connp); + mutex_enter(&connfp->connf_lock); + nconnp = connfp->connf_head; + pconnp = NULL; + while (nconnp != NULL) { + /* + * Walk though entries associated with the fanout until one is + * found which fulfills any of these conditions: + * 1. 
Listen address of ADDR_ANY/ADDR_UNSPECIFIED + * 2. Listen port the same as connp + */ + if (_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6) || + connp->conn_lport == nconnp->conn_lport) + break; + pconnp = nconnp; + nconnp = nconnp->conn_next; + } + if (pconnp != NULL) { + pconnp->conn_next = connp; + connp->conn_prev = pconnp; + } else { + connfp->connf_head = connp; + } + if (nconnp != NULL) { + connp->conn_next = nconnp; + nconnp->conn_prev = connp; + } + connp->conn_fanout = connfp; + connp->conn_flags = (connp->conn_flags & ~IPCL_REMOVED) | IPCL_BOUND; + CONN_INC_REF(connp); + mutex_exit(&connfp->connf_lock); } void ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp) { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + conn_t **list, *prev, *next; + conn_t *pconnp = NULL, *nconnp; + boolean_t isv4mapped = IN6_IS_ADDR_V4MAPPED(&connp->conn_laddr_v6); + + IPCL_HASH_REMOVE(connp); + mutex_enter(&connfp->connf_lock); + nconnp = connfp->connf_head; + pconnp = NULL; + while (nconnp != NULL) { + if (IN6_IS_ADDR_V4MAPPED_ANY(&nconnp->conn_laddr_v6) && + isv4mapped && connp->conn_lport == nconnp->conn_lport) + break; + if (IN6_IS_ADDR_UNSPECIFIED(&nconnp->conn_laddr_v6) && + (isv4mapped || + connp->conn_lport == nconnp->conn_lport)) + break; + + pconnp = nconnp; + nconnp = nconnp->conn_next; + } + if (pconnp != NULL) { + pconnp->conn_next = connp; + connp->conn_prev = pconnp; + } else { + connfp->connf_head = connp; + } + if (nconnp != NULL) { + connp->conn_next = nconnp; + nconnp->conn_prev = connp; + } + connp->conn_fanout = connfp; + connp->conn_flags = (connp->conn_flags & ~IPCL_REMOVED) | IPCL_BOUND; + CONN_INC_REF(connp); + mutex_exit(&connfp->connf_lock); } /* @@ -1037,9 +1062,9 @@ ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport) IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) || IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } else { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } } else { IPCL_HASH_INSERT_CONNECTED(connfp, connp); @@ -1208,9 +1233,9 @@ ipcl_bind_insert_v4(conn_t *connp) if (connp->conn_faddr_v4 != INADDR_ANY) { IPCL_HASH_INSERT_CONNECTED(connfp, connp); } else if (connp->conn_laddr_v4 != INADDR_ANY) { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } if (protocol == IPPROTO_RSVP) ill_set_inputfn_all(ipst); @@ -1222,9 +1247,9 @@ ipcl_bind_insert_v4(conn_t *connp) connfp = &ipst->ips_ipcl_bind_fanout[ IPCL_BIND_HASH(lport, ipst)]; if (connp->conn_laddr_v4 != INADDR_ANY) { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } if (cl_inet_listen != NULL) { ASSERT(connp->conn_ipversion == IPV4_VERSION); @@ -1274,9 +1299,9 @@ ipcl_bind_insert_v6(conn_t *connp) if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) { IPCL_HASH_INSERT_CONNECTED(connfp, connp); } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } break; @@ -1286,9 +1311,9 @@ ipcl_bind_insert_v6(conn_t *connp) connfp = &ipst->ips_ipcl_bind_fanout[ IPCL_BIND_HASH(lport, ipst)]; if 
(!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } if (cl_inet_listen != NULL) { sa_family_t addr_family; @@ -1419,9 +1444,9 @@ ipcl_conn_insert_v4(conn_t *connp) if (connp->conn_faddr_v4 != INADDR_ANY) { IPCL_HASH_INSERT_CONNECTED(connfp, connp); } else if (connp->conn_laddr_v4 != INADDR_ANY) { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } break; } @@ -1507,9 +1532,9 @@ ipcl_conn_insert_v6(conn_t *connp) if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) { IPCL_HASH_INSERT_CONNECTED(connfp, connp); } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } break; } @@ -2095,6 +2120,7 @@ rawip_conn_constructor(void *buf, void *cdrarg, int kmflags) connp->conn_flags = IPCL_RAWIPCONN; connp->conn_proto = IPPROTO_ICMP; icmp->icmp_connp = connp; + rw_init(&icmp->icmp_bpf_lock, NULL, RW_DEFAULT, NULL); rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL); connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags); if (connp->conn_ixa == NULL) @@ -2119,6 +2145,7 @@ rawip_conn_destructor(void *buf, void *cdrarg) mutex_destroy(&connp->conn_lock); cv_destroy(&connp->conn_cv); rw_destroy(&connp->conn_ilg_lock); + rw_destroy(&icmp->icmp_bpf_lock); /* Can be NULL if constructor failed */ if (connp->conn_ixa != NULL) { diff --git a/usr/src/uts/common/inet/ipclassifier.h b/usr/src/uts/common/inet/ipclassifier.h index 89968826b3..70cff374a4 100644 --- a/usr/src/uts/common/inet/ipclassifier.h +++ b/usr/src/uts/common/inet/ipclassifier.h @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. */ /* @@ -299,7 +300,8 @@ struct conn_s { conn_ipv6_recvpathmtu : 1, /* IPV6_RECVPATHMTU */ conn_mcbc_bind : 1, /* Bound to multi/broadcast */ - conn_pad_to_bit_31 : 12; + conn_reuseport : 1, /* SO_REUSEPORT state */ + conn_pad_to_bit_31 : 11; boolean_t conn_blocked; /* conn is flow-controlled */ diff --git a/usr/src/uts/common/inet/ipd/ipd.c b/usr/src/uts/common/inet/ipd/ipd.c index 104603d840..22f2d79d24 100644 --- a/usr/src/uts/common/inet/ipd/ipd.c +++ b/usr/src/uts/common/inet/ipd/ipd.c @@ -9,7 +9,7 @@ * http://www.illumos.org/license/CDDL. */ /* - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2018, Joyent, Inc. All rights reserved. */ /* @@ -222,7 +222,7 @@ typedef struct ipd_netstack { net_handle_t ipdn_v6hdl; /* IPv4 net handle */ int ipdn_hooked; /* are hooks registered */ hook_t *ipdn_v4in; /* IPv4 traffic in hook */ - hook_t *ipdn_v4out; /* IPv4 traffice out hook */ + hook_t *ipdn_v4out; /* IPv4 traffic out hook */ hook_t *ipdn_v6in; /* IPv6 traffic in hook */ hook_t *ipdn_v6out; /* IPv6 traffic out hook */ int ipdn_enabled; /* which perturbs are on */ @@ -613,7 +613,7 @@ ipd_toggle_delay(ipd_netstack_t *ins, uint32_t delay) /* * If ipd_check_hooks_failed, that must mean that we failed to set up * the hooks, so we are going to effectively zero out and fail the - * request to enable corruption. + * request to enable packet delays. 
*/ if (rval != 0) ins->ipdn_delay = 0; diff --git a/usr/src/uts/common/inet/ipf/cfw.c b/usr/src/uts/common/inet/ipf/cfw.c new file mode 100644 index 0000000000..941aeac328 --- /dev/null +++ b/usr/src/uts/common/inet/ipf/cfw.c @@ -0,0 +1,659 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019, Joyent, Inc. + */ + +/* IPF oddness for compilation in userland for IPF tests. */ +#if defined(KERNEL) || defined(_KERNEL) +#undef KERNEL +#undef _KERNEL +#define KERNEL 1 +#define _KERNEL 1 +#endif + +#include <sys/errno.h> +#include <sys/types.h> +#include <sys/param.h> +#include <sys/time.h> +#include <sys/socket.h> +#include <net/if.h> +#include <net/route.h> +#include <netinet/in.h> +#include <netinet/in_systm.h> +#include <netinet/ip.h> +#include <netinet/ip_var.h> +#include <netinet/tcp.h> +#include "netinet/ip_compat.h" +#ifdef USE_INET6 +#include <netinet/icmp6.h> +#endif +#include <netinet/tcpip.h> +#include "netinet/ip_fil.h" +#include "netinet/ip_nat.h" +#include "netinet/ip_frag.h" +#include "netinet/ip_state.h" +#include "netinet/ip_proxy.h" +#include "netinet/ip_auth.h" +#include "netinet/ipf_stack.h" +#ifdef IPFILTER_SCAN +#include "netinet/ip_scan.h" +#endif +#ifdef IPFILTER_SYNC +#include "netinet/ip_sync.h" +#endif +#include "netinet/ip_pool.h" +#include "netinet/ip_htable.h" +#ifdef IPFILTER_COMPILED +#include "netinet/ip_rules.h" +#endif +#if defined(_KERNEL) +#include <sys/sunddi.h> +#endif + +#include "netinet/ipf_cfw.h" +#include <sys/file.h> +#include <sys/uio.h> +#include <sys/cred.h> +#include <sys/ddi.h> + +/* + * cfw == Cloud Firewall ==> routines for a global-zone data collector about + * ipf events for SmartOS. The only ones that CFW cares about are ones + * enforced by global-zone-controlled rulesets. + * + * The variable below is tied into the GZ-only ipf device /dev/ipfev, that + * flips this on when there is an open instance. This feature will also + * consume an fr_flag to have per-rule granularity. + */ +boolean_t ipf_cfwlog_enabled; + +/* + * Because ipf's test tools in $SRC/cmd insert all of these files, we need to + * stub out what we can vs. drag in even more headers and who knows what else. + */ +#ifdef _KERNEL + +/* + * CFW event ring buffer. Remember, this is for ALL ZONES because only a + * global-zone event-reader will be consuming these. In other words, it's + * not something to instantiate per-netstack. + * + * We may want to get more sophisticated and performant (e.g. per-processor), + * but for now keep the ring buffer simple and stupid. + * Must be a power of 2, to be bitmaskable, and must be countable by a uint_t + * + * Resizeable, see ipf_cfw_ring_resize() below. + */ +#define IPF_CFW_DEFAULT_RING_BUFS 1024 +#define IPF_CFW_MIN_RING_BUFS 8 +#define IPF_CFW_MAX_RING_BUFS (1U << 31U) + +/* Assume C's init-to-zero is sufficient for these types... */ +static kmutex_t cfw_ringlock; +static kcondvar_t cfw_ringcv; + +static cfwev_t *cfw_ring; /* NULL by default. */ +static uint32_t cfw_ringsize; /* 0 by default, number of array elements. */ +static uint32_t cfw_ringmask; /* 0 by default. */ + +/* If these are equal, we're either empty or full. 
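+ * On its own, cfw_ringstart == cfw_ringend is ambiguous; cfw_ringfull
+ * below is what distinguishes a full ring from an empty one.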
*/ +static uint_t cfw_ringstart, cfw_ringend; +static boolean_t cfw_ringfull; /* Tell the difference here! */ +/* Bean-counters. */ +static uint64_t cfw_evreports; +static uint64_t cfw_evdrops; + +/* + * Place an event in the CFW event ring buffer. + * + * For now, be simple and drop the oldest event if we overflow. We may wish to + * selectively drop older events based on type in the future. + */ +static void +ipf_cfwev_report(cfwev_t *event) +{ + mutex_enter(&cfw_ringlock); + cfw_ring[cfw_ringend] = *event; + cfw_ringend++; + cfw_ringend &= cfw_ringmask; + if (cfw_ringfull) { + cfw_ringstart++; + cfw_ringstart &= cfw_ringmask; + ASSERT3U(cfw_ringstart, ==, cfw_ringend); + DTRACE_PROBE(ipf__cfw__evdrop); + cfw_evdrops++; + } else { + cfw_ringfull = (cfw_ringend == cfw_ringstart); + } + cfw_evreports++; + cv_broadcast(&cfw_ringcv); + mutex_exit(&cfw_ringlock); +} + +/* + * Provide access to multiple CFW events that can allow copying straight from + * the ring buffer up to userland. Requires a callback (which could call + * uiomove() directly, OR to a local still-in-kernel buffer) that must do the + * data copying-out. + * + * Callback function is of the form: + * + * uint_t cfw_many_cb(cfwev_t *evptr, int num_avail, void *cbarg); + * + * The function must return how many events got consumed, which MUST be <= the + * number available. The function must ALSO UNDERSTAND that cfw_ringlock is + * held and must not be released during this time. The function may be called + * more than once, if the available buffers wrap-around OR "block" is set and + * we don't have enough buffers. If any callback returns 0, exit the function + * with however many were consumed. + * + * This function, like the callback, returns the number of events *CONSUMED*. + * + * . . . + * + * Tunables for ipf_cfwev_consume_many(). + * + * If you wish to attempt to coalesce reads (to reduce the likelihood of one + * event at a time during high load) change the number of tries below to + * something not 0. Early experiments set this to 10. + * + * The wait between tries is in usecs in cfw_timeout_wait. The pessimal + * case for this is a timeout_wait-spaced trickle of one event at a time. + */ +uint_t cfw_timeout_tries = 0; +uint_t cfw_timeout_wait = 10000; /* 10ms wait. */ + +typedef struct uio_error_s { + struct uio *ue_uio; + int ue_error; +} uio_error_t; + +static uint_t +ipf_cfwev_consume_many(uint_t num_requested, boolean_t block, + cfwmanycb_t cfw_many_cb, void *cbarg) +{ + uint_t consumed = 0, cb_consumed, contig_size; + uint_t timeout_tries = cfw_timeout_tries; + boolean_t eintr = B_FALSE; + + mutex_enter(&cfw_ringlock); + + while (num_requested > 0) { + clock_t delta; + + /* Silly reality checks */ + ASSERT3U(cfw_ringstart, <, cfw_ringsize); + ASSERT3U(cfw_ringend, <, cfw_ringsize); + + if (cfw_ringstart > cfw_ringend || cfw_ringfull) { + /* We have from ringstart to the buffer's end. */ + contig_size = cfw_ringsize - cfw_ringstart; + } else if (cfw_ringstart < cfw_ringend) { + /* We have no potential wrapping at this time. */ + contig_size = cfw_ringend - cfw_ringstart; + } else if (block && cv_wait_sig(&cfw_ringcv, &cfw_ringlock)) { + /* Maybe something to consume now, try again. */ + continue; + } else { + /* Nothing (more) to consume, return! */ + eintr = (block && consumed == 0); + break; + } + + /* Less asked-for than what we needed. 
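+ * (That is, the caller requested fewer events than are contiguously
+ * available, so clamp contig_size down to num_requested.)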
*/ + if (num_requested < contig_size) + contig_size = num_requested; + + cb_consumed = + cfw_many_cb(&(cfw_ring[cfw_ringstart]), contig_size, cbarg); + ASSERT3U(cb_consumed, <=, contig_size); + + cfw_ringstart += cb_consumed; + ASSERT3U(cfw_ringstart, <=, cfw_ringmask + 1); + cfw_ringstart &= cfw_ringmask; /* In case of wraparound. */ + consumed += cb_consumed; + cfw_ringfull = (cfw_ringfull && cb_consumed == 0); + if (cb_consumed < contig_size) { + /* + * Callback returned less than given. + * This is likely a uio error, but we have + * something. Get out of here. + */ + break; + } + ASSERT3U(cb_consumed, ==, contig_size); + num_requested -= contig_size; + + if (num_requested == 0) { + /* All done! */ + break; + } + + if (cfw_ringstart != cfw_ringend) { + /* + * We wrapped around the end of the buffer, and + * we have more available to fill our request. + */ + ASSERT0(cfw_ringstart); + ASSERT(!cfw_ringfull); + continue; + } + + /* + * We obtained some of the events we requested, but not all. + * Since we have nothing to consume, wait *a little* longer. + */ + if (timeout_tries == 0) + break; /* Don't bother... */ + delta = drv_usectohz(cfw_timeout_wait); + timeout_tries--; + + switch (cv_reltimedwait_sig(&cfw_ringcv, &cfw_ringlock, delta, + TR_CLOCK_TICK)) { + case 0: + /* + * Received signal! Return what we have OR if we have + * nothing, EINTR. + */ + DTRACE_PROBE1(ipf__cfw__timedsignal, int, consumed); + eintr = (consumed == 0); + num_requested = 0; + break; + case -1: + /* Time reached! Bail with what we got. */ + DTRACE_PROBE(ipf__cfw__timedexpired); + num_requested = 0; + break; + default: + /* Aha! We've got more! */ + DTRACE_PROBE(ipf__cfw__moredata); + break; + } + } + + mutex_exit(&cfw_ringlock); + if (eintr) + ((uio_error_t *)cbarg)->ue_error = EINTR; + return (consumed); +} + +/* + * SmartOS likes using the zone's debug id. Make sure we squirrel that away in + * the ipf netstack instance if it's not there. + */ +static inline zoneid_t +ifs_to_did(ipf_stack_t *ifs) +{ + if (ifs->ifs_zone_did == 0) { + zone_t *zone; + + /* + * We can't get the zone_did at initialization time because + * most zone data isn't readily available then, cement the did + * in place now. + */ + VERIFY3U(ifs->ifs_zone, !=, GLOBAL_ZONEID); + zone = zone_find_by_id(ifs->ifs_zone); + if (zone != NULL) { + ifs->ifs_zone_did = zone->zone_did; + zone_rele(zone); + } + /* Else we are either in shutdown or something weirder. */ + } + return (ifs->ifs_zone_did); +} + +/* + * ipf_block_cfwlog() + * + * Called by fr_check(). Record drop events for the global-zone data + * collector. Use rest-of-ipf-style names for the parameters. + */ +void +ipf_block_cfwlog(frentry_t *fr, fr_info_t *fin, ipf_stack_t *ifs) +{ + cfwev_t event = {0}; + + /* + * We need a rule. + * Capture failure by using dtrace on this function's entry. + * 'ipf_block_cfwlog:entry /arg0 == NULL/ { printf("GOTCHA!\n"); }' + */ + if (fr == NULL) + return; + + event.cfwev_type = CFWEV_BLOCK; + event.cfwev_length = sizeof (event); + /* + * IPF code elsewhere does the cheesy single-flag check, even though + * there are two flags in a rule (one for in, one for out). + */ + event.cfwev_direction = (fr->fr_flags & FR_INQUE) ? + CFWDIR_IN : CFWDIR_OUT; + + event.cfwev_protocol = fin->fin_p; + /* + * NOTE: fin_*port is in host/native order, and ICMP info is here too. 
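+ * The htons() calls below keep the event's ports in network byte
+ * order, consistent with what ipf_log_cfwlog() records from is_*port.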
+ */ + event.cfwev_sport = htons(fin->fin_sport); + event.cfwev_dport = htons(fin->fin_dport); + + switch (fin->fin_v) { + case IPV4_VERSION: + IN6_INADDR_TO_V4MAPPED(&fin->fin_src, &event.cfwev_saddr); + IN6_INADDR_TO_V4MAPPED(&fin->fin_dst, &event.cfwev_daddr); + break; + case IPV6_VERSION: + event.cfwev_saddr = fin->fin_src6.in6; + event.cfwev_daddr = fin->fin_dst6.in6; + break; + default: + /* We should never reach here, but mark it if we do. */ + DTRACE_PROBE1(ipf__cfw__frinfo__badipversion, frinfo_t *, fin); + return; + } + + /* + * uniqtime() is what ipf's GETKTIME() uses. + * If cfwev_tstamp needs to be sourced from elsewhere, fix that here. + */ + uniqtime(&event.cfwev_tstamp); + event.cfwev_zonedid = ifs_to_did(ifs); + event.cfwev_ruleid = fin->fin_rule; + memcpy(event.cfwev_ruleuuid, fr->fr_uuid, sizeof (uuid_t)); + + ipf_cfwev_report(&event); +} + +/* + * ipf_log_cfwlog() + * + * Twin of ipstate_log(), but records state events for the global-zone data + * collector. + */ +void +ipf_log_cfwlog(struct ipstate *is, uint_t type, ipf_stack_t *ifs) +{ + cfwev_t event = {0}; + + switch (type) { + case ISL_NEW: + case ISL_CLONE: + event.cfwev_type = CFWEV_BEGIN; + break; + case ISL_EXPIRE: + case ISL_FLUSH: + case ISL_REMOVE: + case ISL_KILLED: + case ISL_ORPHAN: + /* + * We don't care about session disappearances in CFW logging + * for now. (Possible future: CFWEV_END) + */ + return; + default: + event.cfwev_type = CFWEV_BLOCK; + break; + } + + /* + * IPF code elsewhere does the cheesy single-flag check, even though + * there are two flags in a rule (one for in, one for out). Follow + * suit here. + */ + event.cfwev_length = sizeof (event); + ASSERT(is->is_rule != NULL); + event.cfwev_direction = (is->is_rule->fr_flags & FR_INQUE) ? + CFWDIR_IN : CFWDIR_OUT; + event.cfwev_protocol = is->is_p; + switch (is->is_p) { + case IPPROTO_TCP: + case IPPROTO_UDP: + /* NOTE: is_*port is in network order. */ + event.cfwev_sport = is->is_sport; + event.cfwev_dport = is->is_dport; + break; + case IPPROTO_ICMP: + case IPPROTO_ICMPV6: + /* Scribble the ICMP type in sport... */ + event.cfwev_sport = is->is_icmp.ici_type; + break; + /* Other protocols leave the event's port fields empty. */ + } + + switch(is->is_v) { + case IPV4_VERSION: + IN6_INADDR_TO_V4MAPPED(&is->is_src.in4, &event.cfwev_saddr); + IN6_INADDR_TO_V4MAPPED(&is->is_dst.in4, &event.cfwev_daddr); + break; + case IPV6_VERSION: + event.cfwev_saddr = is->is_src.in6; + event.cfwev_daddr = is->is_dst.in6; + break; + default: + /* Can't parse addresses if we don't know the version. Drop. */ + DTRACE_PROBE1(ipf__cfw__ipstate__badipversion, + struct ipstate *, is); + return; + } + + /* + * uniqtime() is what ipf's GETKTIME() uses. + * If cfwev_tstamp needs to be sourced from elsewhere, fix that here. + */ + uniqtime(&event.cfwev_tstamp); + event.cfwev_zonedid = ifs_to_did(ifs); + event.cfwev_ruleid = is->is_rulen; + memcpy(event.cfwev_ruleuuid, is->is_uuid, sizeof (uuid_t)); + + ipf_cfwev_report(&event); +} + +/* + * Callback routine we use for ipf_cfwev_consume_many(). + * Returning 0 means error indication. + */ +static uint_t +cfwlog_read_manycb(cfwev_t *evptr, uint_t num_avail, void *cbarg) +{ + uio_error_t *ue = (uio_error_t *)cbarg; + + ASSERT(MUTEX_HELD(&cfw_ringlock)); + + if (ue->ue_error != 0) + return (0); + + ue->ue_error = uiomove((caddr_t)evptr, num_avail * sizeof (*evptr), + UIO_READ, ue->ue_uio); + if (ue->ue_error != 0) + return (0); + + return (num_avail); +} + +/* + * Resize the CFW event ring buffer. 
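+ * (Used for first-time creation and netstack-unload destruction as
+ * well as from the SIOCIPFCFWNEWSZ ioctl handler below.)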
+ * + * The caller must ensure the new size is a power of 2 between + * IPF_CFW_{MIN,MAX}_RING_BUFS (inclusive) or the special values + * IPF_CFW_RING_ALLOCATE (first-time creation) or IPF_CFW_RING_DESTROY + * (netstack-unload destruction). + * + * Everything in the current ring will be destroyed (and reported as a drop) + * upon resize. + */ +int +ipf_cfw_ring_resize(uint32_t newsize) +{ + ASSERT(MUTEX_HELD(&cfw_ringlock) || newsize == IPF_CFW_RING_ALLOCATE || + newsize == IPF_CFW_RING_DESTROY); + + if (newsize == IPF_CFW_RING_ALLOCATE) { + if (cfw_ring != NULL) + return (EBUSY); + newsize = IPF_CFW_DEFAULT_RING_BUFS; + /* Fall through to allocating a new ring buffer. */ + } else { + /* We may be called during error cleanup, so be liberal here. */ + if ((cfw_ring == NULL && newsize == IPF_CFW_RING_DESTROY) || + newsize == cfw_ringsize) { + return (0); + } + kmem_free(cfw_ring, cfw_ringsize * sizeof (cfwev_t)); + cfw_ring = NULL; + if (cfw_ringfull) { + cfw_evdrops += cfw_ringsize; + } else if (cfw_ringstart > cfw_ringend) { + cfw_evdrops += cfw_ringend + + (cfw_ringsize - cfw_ringstart); + } else { + cfw_evdrops += cfw_ringend - cfw_ringstart; + } + cfw_ringsize = cfw_ringmask = cfw_ringstart = cfw_ringend = 0; + cfw_ringfull = B_FALSE; + + if (newsize == IPF_CFW_RING_DESTROY) + return (0); + /* + * Keep the reports & drops around because if we're just + * resizing, we need to know what we lost. + */ + } + + ASSERT(ISP2(newsize)); + cfw_ring = kmem_alloc(newsize * sizeof (cfwev_t), KM_SLEEP); + /* KM_SLEEP means we always succeed. */ + cfw_ringsize = newsize; + cfw_ringmask = cfw_ringsize - 1; + + return (0); +} + +/* + * ioctl handler for /dev/ipfev. Only supports SIOCIPFCFWCFG (get data + * collector statistics and configuration), and SIOCIPFCFWNEWSZ (resize the + * event ring buffer). + */ +/* ARGSUSED */ +int +ipf_cfwlog_ioctl(dev_t dev, int cmd, intptr_t data, int mode, cred_t *cp, + int *rp) +{ + ipfcfwcfg_t cfginfo; + int error; + + if (cmd != SIOCIPFCFWCFG && cmd != SIOCIPFCFWNEWSZ) + return (EIO); + + if (crgetzoneid(cp) != GLOBAL_ZONEID) + return (EACCES); + + error = COPYIN((caddr_t)data, (caddr_t)&cfginfo, sizeof (cfginfo)); + if (error != 0) + return (EFAULT); + + cfginfo.ipfcfwc_maxevsize = sizeof (cfwev_t); + mutex_enter(&cfw_ringlock); + cfginfo.ipfcfwc_evreports = cfw_evreports; + if (cmd == SIOCIPFCFWNEWSZ) { + uint32_t newsize = cfginfo.ipfcfwc_evringsize; + + /* Do ioctl parameter checking here, then call the resizer. */ + if (newsize < IPF_CFW_MIN_RING_BUFS || + newsize > IPF_CFW_MAX_RING_BUFS || !ISP2(newsize)) { + error = EINVAL; + } else { + error = ipf_cfw_ring_resize(cfginfo.ipfcfwc_evringsize); + } + } else { + error = 0; + } + /* Both cfw_evdrops and cfw_ringsize are affected by resize. */ + cfginfo.ipfcfwc_evdrops = cfw_evdrops; + cfginfo.ipfcfwc_evringsize = cfw_ringsize; + mutex_exit(&cfw_ringlock); + + if (error != 0) + return (error); + + error = COPYOUT((caddr_t)&cfginfo, (caddr_t)data, sizeof (cfginfo)); + if (error != 0) + return (EFAULT); + + return (0); +} + +/* + * Send events up via /dev/ipfev reads. Will return only complete events. 
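 * An illustrative global-zone reader of those events (hypothetical, and
 * 64-bit only per the struct timeval note in ipf_cfw.h); process_event()
 * is made up for the sketch:
 *
 *	cfwev_t evbuf[32];
 *	int fd = open("/dev/ipfev", O_RDONLY);
 *	ssize_t n = read(fd, evbuf, sizeof (evbuf));
 *	for (i = 0; i < n / sizeof (cfwev_t); i++)
 *		process_event(&evbuf[i]);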
+ */ +/* ARGSUSED */ +int +ipf_cfwlog_read(dev_t dev, struct uio *uio, cred_t *cp) +{ + uint_t requested, consumed; + uio_error_t ue = {uio, 0}; + boolean_t block; + + if (uio->uio_resid == 0) + return (0); + if (uio->uio_resid < sizeof (cfwev_t)) + return (EINVAL); + + block = ((uio->uio_fmode & (FNDELAY | FNONBLOCK)) == 0); + requested = uio->uio_resid / sizeof (cfwev_t); + + /* + * As stated earlier, ipf_cfwev_consume_many() takes a callback. + * The callback may be called multiple times before we return. + * The callback will execute uiomove(). + */ + consumed = ipf_cfwev_consume_many(requested, block, cfwlog_read_manycb, + &ue); + ASSERT3U(consumed, <=, requested); + if (!block && consumed == 0 && ue.ue_error == 0) { + /* No data available. */ + ue.ue_error = EWOULDBLOCK; + } else if (ue.ue_error != 0 && ue.ue_error != EINTR) { + /* + * We had a problem that wasn't simply a + * case of cv_wait_sig() receiving a signal. + */ + DTRACE_PROBE1(ipf__cfw__uiodiscard, int, consumed); + mutex_enter(&cfw_ringlock); + cfw_evdrops += consumed; + mutex_exit(&cfw_ringlock); + } + return (ue.ue_error); +} + +#else /* _KERNEL */ + +/* Blank stubs to satisfy userland's test compilations. */ + +int +ipf_cfw_ring_resize(uint32_t a) +{ + return (0); +} + +void +ipf_log_cfwlog(struct ipstate *a, uint_t b, ipf_stack_t *c) +{ +} + +void +ipf_block_cfwlog(frentry_t *a, fr_info_t *b, ipf_stack_t *c) +{ +} + +#endif /* _KERNEL */ diff --git a/usr/src/uts/common/inet/ipf/fil.c b/usr/src/uts/common/inet/ipf/fil.c index 78980be106..48fa6e7325 100644 --- a/usr/src/uts/common/inet/ipf/fil.c +++ b/usr/src/uts/common/inet/ipf/fil.c @@ -5,7 +5,7 @@ * * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. */ #if defined(KERNEL) || defined(_KERNEL) @@ -2588,6 +2588,9 @@ ipf_stack_t *ifs; } #endif + if (IFS_CFWLOG(ifs, fr) && FR_ISBLOCK(pass)) + ipf_block_cfwlog(fr, fin, ifs); + /* * The FI_STATE flag is cleared here so that calling fr_checkstate * will work when called from inside of fr_fastroute. Although diff --git a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c index c9d5f03e13..0d34e0fce3 100644 --- a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c +++ b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c @@ -5,7 +5,7 @@ * * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright 2018 Joyent, Inc. + * Copyright 2019 Joyent, Inc. 
*/ #if !defined(lint) @@ -85,6 +85,14 @@ static int ipf_hook6_loop_out __P((hook_event_token_t, hook_data_t, static int ipf_hook6_loop_in __P((hook_event_token_t, hook_data_t, void *)); static int ipf_hook6 __P((hook_data_t, int, int, void *)); +static int ipf_hookvndl3v4_in __P((hook_event_token_t, hook_data_t, + void *)); +static int ipf_hookvndl3v6_in __P((hook_event_token_t, hook_data_t, + void *)); +static int ipf_hookvndl3v4_out __P((hook_event_token_t, hook_data_t, + void *)); +static int ipf_hookvndl3v6_out __P((hook_event_token_t, hook_data_t, + void *)); static int ipf_hookviona_in __P((hook_event_token_t, hook_data_t, void *)); static int ipf_hookviona_out __P((hook_event_token_t, hook_data_t, @@ -116,7 +124,7 @@ u_long *ip_forwarding = NULL; #endif vmem_t *ipf_minor; /* minor number arena */ -void *ipf_state; /* DDI state */ +void *ipf_state; /* DDI state */ /* * GZ-controlled and per-zone stacks: @@ -141,28 +149,38 @@ void *ipf_state; /* DDI state */ */ /* IPv4 hook names */ -char *hook4_nicevents = "ipfilter_hook4_nicevents"; -char *hook4_nicevents_gz = "ipfilter_hook4_nicevents_gz"; -char *hook4_in = "ipfilter_hook4_in"; -char *hook4_in_gz = "ipfilter_hook4_in_gz"; -char *hook4_out = "ipfilter_hook4_out"; -char *hook4_out_gz = "ipfilter_hook4_out_gz"; -char *hook4_loop_in = "ipfilter_hook4_loop_in"; -char *hook4_loop_in_gz = "ipfilter_hook4_loop_in_gz"; -char *hook4_loop_out = "ipfilter_hook4_loop_out"; -char *hook4_loop_out_gz = "ipfilter_hook4_loop_out_gz"; +char *hook4_nicevents = "ipfilter_hook4_nicevents"; +char *hook4_nicevents_gz = "ipfilter_hook4_nicevents_gz"; +char *hook4_in = "ipfilter_hook4_in"; +char *hook4_in_gz = "ipfilter_hook4_in_gz"; +char *hook4_out = "ipfilter_hook4_out"; +char *hook4_out_gz = "ipfilter_hook4_out_gz"; +char *hook4_loop_in = "ipfilter_hook4_loop_in"; +char *hook4_loop_in_gz = "ipfilter_hook4_loop_in_gz"; +char *hook4_loop_out = "ipfilter_hook4_loop_out"; +char *hook4_loop_out_gz = "ipfilter_hook4_loop_out_gz"; /* IPv6 hook names */ -char *hook6_nicevents = "ipfilter_hook6_nicevents"; -char *hook6_nicevents_gz = "ipfilter_hook6_nicevents_gz"; -char *hook6_in = "ipfilter_hook6_in"; -char *hook6_in_gz = "ipfilter_hook6_in_gz"; -char *hook6_out = "ipfilter_hook6_out"; -char *hook6_out_gz = "ipfilter_hook6_out_gz"; -char *hook6_loop_in = "ipfilter_hook6_loop_in"; -char *hook6_loop_in_gz = "ipfilter_hook6_loop_in_gz"; -char *hook6_loop_out = "ipfilter_hook6_loop_out"; -char *hook6_loop_out_gz = "ipfilter_hook6_loop_out_gz"; +char *hook6_nicevents = "ipfilter_hook6_nicevents"; +char *hook6_nicevents_gz = "ipfilter_hook6_nicevents_gz"; +char *hook6_in = "ipfilter_hook6_in"; +char *hook6_in_gz = "ipfilter_hook6_in_gz"; +char *hook6_out = "ipfilter_hook6_out"; +char *hook6_out_gz = "ipfilter_hook6_out_gz"; +char *hook6_loop_in = "ipfilter_hook6_loop_in"; +char *hook6_loop_in_gz = "ipfilter_hook6_loop_in_gz"; +char *hook6_loop_out = "ipfilter_hook6_loop_out"; +char *hook6_loop_out_gz = "ipfilter_hook6_loop_out_gz"; + +/* vnd IPv4/v6 hook names */ +char *hook4_vnd_in = "ipfilter_hookvndl3v4_in"; +char *hook4_vnd_in_gz = "ipfilter_hookvndl3v4_in_gz"; +char *hook6_vnd_in = "ipfilter_hookvndl3v6_in"; +char *hook6_vnd_in_gz = "ipfilter_hookvndl3v6_in_gz"; +char *hook4_vnd_out = "ipfilter_hookvndl3v4_out"; +char *hook4_vnd_out_gz = "ipfilter_hookvndl3v4_out_gz"; +char *hook6_vnd_out = "ipfilter_hookvndl3v6_out"; +char *hook6_vnd_out_gz = "ipfilter_hookvndl3v6_out_gz"; /* viona hook names */ char *hook_viona_in = "ipfilter_hookviona_in"; @@ -170,6 +188,39 
@@ char *hook_viona_in_gz = "ipfilter_hookviona_in_gz"; char *hook_viona_out = "ipfilter_hookviona_out"; char *hook_viona_out_gz = "ipfilter_hookviona_out_gz"; +/* + * For VIONA. The net_{instance,protocol}_notify_register() functions only + * deal with per-callback-function granularity. We need two wrapper functions + * for GZ-controlled and per-zone instances. + */ +static int +ipf_hook_instance_notify_gz(hook_notify_cmd_t command, void *arg, + const char *netid, const char *dummy, const char *instance) +{ + return (ipf_hook_instance_notify(command, arg, netid, dummy, instance)); +} + +static int +ipf_hook_instance_notify_ngz(hook_notify_cmd_t command, void *arg, + const char *netid, const char *dummy, const char *instance) +{ + return (ipf_hook_instance_notify(command, arg, netid, dummy, instance)); +} + +static int +ipf_hook_protocol_notify_gz(hook_notify_cmd_t command, void *arg, + const char *name, const char *dummy, const char *he_name) +{ + return (ipf_hook_protocol_notify(command, arg, name, dummy, he_name)); +} + +static int +ipf_hook_protocol_notify_ngz(hook_notify_cmd_t command, void *arg, + const char *name, const char *dummy, const char *he_name) +{ + return (ipf_hook_protocol_notify(command, arg, name, dummy, he_name)); +} + /* ------------------------------------------------------------------------ */ /* Function: ipldetach */ /* Returns: int - 0 == success, else error. */ @@ -267,10 +318,36 @@ ipf_stack_t *ifs; } /* + * Remove VND hooks + */ + if (ifs->ifs_ipf_vndl3v4 != NULL) { + UNDO_HOOK(ifs_ipf_vndl3v4, ifs_hookvndl3v4_physical_in, + NH_PHYSICAL_IN, ifs_ipfhookvndl3v4_in); + UNDO_HOOK(ifs_ipf_vndl3v4, ifs_hookvndl3v4_physical_out, + NH_PHYSICAL_OUT, ifs_ipfhookvndl3v4_out); + + if (net_protocol_release(ifs->ifs_ipf_vndl3v4) != 0) + goto detach_failed; + ifs->ifs_ipf_vndl3v4 = NULL; + } + + if (ifs->ifs_ipf_vndl3v6 != NULL) { + UNDO_HOOK(ifs_ipf_vndl3v6, ifs_hookvndl3v6_physical_in, + NH_PHYSICAL_IN, ifs_ipfhookvndl3v6_in); + UNDO_HOOK(ifs_ipf_vndl3v6, ifs_hookvndl3v6_physical_out, + NH_PHYSICAL_OUT, ifs_ipfhookvndl3v6_out); + + if (net_protocol_release(ifs->ifs_ipf_vndl3v6) != 0) + goto detach_failed; + ifs->ifs_ipf_vndl3v6 = NULL; + } + + /* * Remove notification of viona hooks */ net_instance_notify_unregister(ifs->ifs_netid, - ipf_hook_instance_notify); + ifs->ifs_gz_controlled ? ipf_hook_instance_notify_gz : + ipf_hook_instance_notify_ngz); #undef UNDO_HOOK @@ -278,6 +355,10 @@ ipf_stack_t *ifs; * Normally, viona will unregister itself before ipldetach() is called, * so these will be no-ops, but out of caution, we try to make sure * we've removed any of our references. + * + * For now, the _gz and _ngz versions are both wrappers to what's + * below. Just call it directly, but if that changes fix here as + * well. */ (void) ipf_hook_protocol_notify(HN_UNREGISTER, ifs, Hn_VIONA, NULL, NH_PHYSICAL_IN); @@ -295,6 +376,10 @@ ipf_stack_t *ifs; * traced, we pass the same value the nethook framework would * pass, even though the callback does not currently use the * value. + * + * For now, the _gz and _ngz versions are both wrappers to + * what's below. Just call it directly, but if that changes + * fix here as well. 
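+ * The split exists because the nethook framework distinguishes
+ * registrations only by callback pointer; a sketch of the matching
+ * unregister call (mirroring the register path above, not a new API):
+ *
+ *	net_instance_notify_unregister(ifs->ifs_netid,
+ *	    ifs->ifs_gz_controlled ? ipf_hook_instance_notify_gz :
+ *	    ipf_hook_instance_notify_ngz);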
*/ (void) ipf_hook_instance_notify(HN_UNREGISTER, ifs, netidstr, NULL, Hn_VIONA); @@ -495,6 +580,49 @@ ipf_stack_t *ifs; } /* + * Add VND INET hooks + */ + ifs->ifs_ipf_vndl3v4 = net_protocol_lookup(id, NHF_VND_INET); + if (ifs->ifs_ipf_vndl3v4 == NULL) + goto hookup_failed; + + HOOK_INIT_GZ_BEFORE(ifs->ifs_ipfhookvndl3v4_in, ipf_hookvndl3v4_in, + hook4_vnd_in, hook4_vnd_in_gz, ifs); + HOOK_INIT_GZ_AFTER(ifs->ifs_ipfhookvndl3v4_out, ipf_hookvndl3v4_out, + hook4_vnd_out, hook4_vnd_out_gz, ifs); + ifs->ifs_hookvndl3v4_physical_in = (net_hook_register(ifs->ifs_ipf_vndl3v4, + NH_PHYSICAL_IN, ifs->ifs_ipfhookvndl3v4_in) == 0); + if (!ifs->ifs_hookvndl3v4_physical_in) + goto hookup_failed; + + ifs->ifs_hookvndl3v4_physical_out = (net_hook_register(ifs->ifs_ipf_vndl3v4, + NH_PHYSICAL_OUT, ifs->ifs_ipfhookvndl3v4_out) == 0); + if (!ifs->ifs_hookvndl3v4_physical_out) + goto hookup_failed; + + + /* + * VND INET6 hooks + */ + ifs->ifs_ipf_vndl3v6 = net_protocol_lookup(id, NHF_VND_INET6); + if (ifs->ifs_ipf_vndl3v6 == NULL) + goto hookup_failed; + + HOOK_INIT_GZ_BEFORE(ifs->ifs_ipfhookvndl3v6_in, ipf_hookvndl3v6_in, + hook6_vnd_in, hook6_vnd_in_gz, ifs); + HOOK_INIT_GZ_AFTER(ifs->ifs_ipfhookvndl3v6_out, ipf_hookvndl3v6_out, + hook6_vnd_out, hook6_vnd_out_gz, ifs); + ifs->ifs_hookvndl3v6_physical_in = (net_hook_register(ifs->ifs_ipf_vndl3v6, + NH_PHYSICAL_IN, ifs->ifs_ipfhookvndl3v6_in) == 0); + if (!ifs->ifs_hookvndl3v6_physical_in) + goto hookup_failed; + + ifs->ifs_hookvndl3v6_physical_out = (net_hook_register(ifs->ifs_ipf_vndl3v6, + NH_PHYSICAL_OUT, ifs->ifs_ipfhookvndl3v6_out) == 0); + if (!ifs->ifs_hookvndl3v6_physical_out) + goto hookup_failed; + + /* * VIONA INET hooks. While the nethook framework allows us to register * hooks for events that haven't been registered yet, we instead * register and unregister our hooks in response to notifications @@ -504,9 +632,15 @@ ipf_stack_t *ifs; * is unloaded, the viona module cannot later re-register them if it * gets reloaded. As the ip, vnd, and ipf modules are rarely unloaded * even on DEBUG kernels, they do not experience this issue. + * + * Today, the per-zone ones don't matter for a BHYVE-branded zone, BUT + * the ipf_hook_protocol_notify() function is GZ vs. per-zone aware. + * Employ two different versions of ipf_hook_instance_notify(), one for + * the GZ-controlled, and one for the per-zone one. */ - if (net_instance_notify_register(id, ipf_hook_instance_notify, - ifs) != 0) + if (net_instance_notify_register(id, ifs->ifs_gz_controlled ? + ipf_hook_instance_notify_gz : ipf_hook_instance_notify_ngz, ifs) != + 0) goto hookup_failed; /* @@ -688,6 +822,7 @@ ipf_hook_instance_notify(hook_notify_cmd_t command, void *arg, { ipf_stack_t *ifs = arg; int ret = 0; + const boolean_t gz = ifs->ifs_gz_controlled; /* We currently only care about viona hooks */ if (strcmp(instance, Hn_VIONA) != 0) @@ -705,14 +840,16 @@ ipf_hook_instance_notify(hook_notify_cmd_t command, void *arg, return (EPROTONOSUPPORT); ret = net_protocol_notify_register(ifs->ifs_ipf_viona, - ipf_hook_protocol_notify, ifs); + gz ? ipf_hook_protocol_notify_gz : + ipf_hook_protocol_notify_ngz, ifs); VERIFY(ret == 0 || ret == ESHUTDOWN); break; case HN_UNREGISTER: if (ifs->ifs_ipf_viona == NULL) break; VERIFY0(net_protocol_notify_unregister(ifs->ifs_ipf_viona, - ipf_hook_protocol_notify)); + gz ? 
ipf_hook_protocol_notify_gz :
+	    ipf_hook_protocol_notify_ngz));
 		VERIFY0(net_protocol_release(ifs->ifs_ipf_viona));
 		ifs->ifs_ipf_viona = NULL;
 		break;
@@ -821,6 +958,9 @@ int *rp;
 		return ENXIO;
 	unit = isp->ipfs_minor;
+	if (unit == IPL_LOGEV)
+		return (ipf_cfwlog_ioctl(dev, cmd, data, mode, cp, rp));
+
 	zid = crgetzoneid(cp);
 	if (cmd == SIOCIPFZONESET) {
 		if (zid == GLOBAL_ZONEID)
@@ -1129,14 +1269,14 @@ ipf_stack_t *ifs;
 {
 	net_handle_t nif;
-	if (v == 4)
-		nif = ifs->ifs_ipf_ipv4;
-	else if (v == 6)
-		nif = ifs->ifs_ipf_ipv6;
-	else
-		return 0;
-
-	return (net_phylookup(nif, name));
+	if (v == 4)
+		nif = ifs->ifs_ipf_ipv4;
+	else if (v == 6)
+		nif = ifs->ifs_ipf_ipv6;
+	else
+		return 0;
+
+	return (net_phylookup(nif, name));
 }
 /*
@@ -1161,11 +1301,35 @@ cred_t *cred;
 	if (IPL_LOGMAX < min)
 		return ENXIO;
+	/* Special-case ipfev: global-zone-open only. */
+	if (min == IPL_LOGEV) {
+		if (crgetzoneid(cred) != GLOBAL_ZONEID)
+			return (ENXIO);
+		/*
+		 * Else enable the CFW logging of events.
+		 * NOTE: For now, we only allow one open at a time.
+		 * Use atomic_cas to confirm/deny. And also for now,
+		 * assume sizeof (boolean_t) == sizeof (uint_t).
+		 *
+		 * Per the *_{refrele,REFRELE}() in other parts of inet,
+		 * ensure all loads/stores complete before calling cas.
+		 * membar_exit() does this.
+		 */
+		membar_exit();
+		if (atomic_cas_uint(&ipf_cfwlog_enabled, 0, 1) != 0)
+			return (EBUSY);
+	}
+
 	minor = (minor_t)(uintptr_t)vmem_alloc(ipf_minor, 1,
 	    VM_BESTFIT | VM_SLEEP);
 	if (ddi_soft_state_zalloc(ipf_state, minor) != 0) {
 		vmem_free(ipf_minor, (void *)(uintptr_t)minor, 1);
+		if (min == IPL_LOGEV) {
+			/* See above... */
+			membar_exit();
+			VERIFY(atomic_cas_uint(&ipf_cfwlog_enabled, 1, 0) == 1);
+		}
 		return ENXIO;
 	}
@@ -1187,6 +1351,7 @@ int flags, otype;
 cred_t *cred;
 {
 	minor_t min = getminor(dev);
+	ipf_devstate_t *isp;
 #ifdef IPFDEBUG
 	cmn_err(CE_CONT, "iplclose(%x,%x,%x,%x)\n", dev, flags, otype, cred);
@@ -1195,6 +1360,15 @@ cred_t *cred;
 	if (IPL_LOGMAX < min)
 		return ENXIO;
+	isp = ddi_get_soft_state(ipf_state, min);
+	if (isp != NULL && isp->ipfs_minor == IPL_LOGEV) {
+		/*
+		 * Disable CFW logging. See iplopen() for details.
+		 */
+		membar_exit();
+		VERIFY(atomic_cas_uint(&ipf_cfwlog_enabled, 1, 0) == 1);
+	}
+
 	ddi_soft_state_free(ipf_state, min);
 	vmem_free(ipf_minor, (void *)(uintptr_t)min, 1);
@@ -1225,6 +1399,8 @@ cred_t *cp;
 		return ENXIO;
 	unit = isp->ipfs_minor;
+	if (unit == IPL_LOGEV)
+		return (ipf_cfwlog_read(dev, uio, cp));
 	/*
 	 * ipf_find_stack returns with a read lock on ifs_ipf_global
@@ -1277,6 +1453,9 @@ cred_t *cp;
 		return ENXIO;
 	unit = isp->ipfs_minor;
+	if (unit == IPL_LOGEV)
+		return (EIO); /* ipfev doesn't support write yet. */
+
 	/*
 	 * ipf_find_stack returns with a read lock on ifs_ipf_global
 	 */
@@ -2068,8 +2247,11 @@ frdest_t *fdp;
 		return (-1);
 	}
-	/* Check the src here, fin_ifp is the src interface. */
-	if (!(fin->fin_flx & FI_GENERATED) &&
+	/*
+	 * If we're forwarding (vs. injecting), check the src here, fin_ifp is
+	 * the src interface.
+	 */
+	if (fdp != NULL && !(fin->fin_flx & FI_GENERATED) &&
 	    !fr_forwarding_enabled((phy_if_t)fin->fin_ifp, net_data_p)) {
 		return (-1);
 	}
@@ -2138,8 +2320,8 @@ frdest_t *fdp;
 		inj->ni_physical = net_routeto(net_data_p, sinp, NULL);
 	}
-	/* we're checking the destination here */
-	if (!(fin->fin_flx & FI_GENERATED) &&
+	/* If we're forwarding (vs. injecting), check the destination here.
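+	 * (A NULL fdp is how callers ask to inject rather than forward;
+	 * an illustrative call for a packet ipf generated itself, with
+	 * the exact argument list elided:
+	 *
+	 *	(void) fr_fastroute(m, &m, fin, NULL);
+	 *
+	 * so only genuinely forwarded traffic is held to the interface's
+	 * forwarding setting.)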
 */
+	if (fdp != NULL && !(fin->fin_flx & FI_GENERATED) &&
 	    !fr_forwarding_enabled(inj->ni_physical, net_data_p)) {
 		goto bad_fastroute;
 	}
@@ -2355,6 +2537,42 @@ int ipf_hook_ether(hook_event_token_t token, hook_data_t info, void *arg,
 }
 /* ------------------------------------------------------------------------ */
+/* Function:    ipf_hookvndl3_in                                            */
+/* Returns:     int - 0 == packet ok, else problem, free packet if not done */
+/* Parameters:  event(I)     - pointer to event                             */
+/*              info(I)      - pointer to hook information for firewalling  */
+/*                                                                          */
+/* The vnd hooks are private hooks to ON. They represent a layer 2          */
+/* datapath generally used to implement virtual machines. The driver sends  */
+/* along L3 packets of either type IP or IPv6. The ethertype to distinguish */
+/* them is in the upper 16 bits while the remaining bits are the            */
+/* traditional packet hook flags.                                           */
+/*                                                                          */
+/* They end up calling the appropriate traditional ip hooks.                */
+/* ------------------------------------------------------------------------ */
+/*ARGSUSED*/
+int ipf_hookvndl3v4_in(hook_event_token_t token, hook_data_t info, void *arg)
+{
+	return ipf_hook4_in(token, info, arg);
+}
+
+int ipf_hookvndl3v6_in(hook_event_token_t token, hook_data_t info, void *arg)
+{
+	return ipf_hook6_in(token, info, arg);
+}
+
+/*ARGSUSED*/
+int ipf_hookvndl3v4_out(hook_event_token_t token, hook_data_t info, void *arg)
+{
+	return ipf_hook4_out(token, info, arg);
+}
+
+int ipf_hookvndl3v6_out(hook_event_token_t token, hook_data_t info, void *arg)
+{
+	return ipf_hook6_out(token, info, arg);
+}
+
+/* ------------------------------------------------------------------------ */
 /* Function:    ipf_hookviona_{in,out}                                      */
 /* Returns:     int - 0 == packet ok, else problem, free packet if not done */
 /* Parameters:  event(I)     - pointer to event                             */
@@ -3120,16 +3338,16 @@ fr_info_t *fin;
 /* both IP versions. The details are going to be explained here.            */
 /*                                                                          */
 /* The packet looks as follows:                                             */
-/*    xxx | IP hdr | IP payload ...  |                                      */
-/*    ^   ^        ^                 ^                                      */
-/*    |   |        |                 |                                      */
+/*    xxx | IP hdr | IP payload ...  |                                      */
+/*    ^   ^        ^                 ^                                      */
+/*    |   |        |                 |                                      */
 /*    |   |        |                 fin_m->b_wptr = fin->fin_dp + fin->fin_dlen */
 /*    |   |        |                                                        */
 /*    |   |        `- fin_m->fin_dp (in case of IPv4 points to L4 header)   */
 /*    |   |                                                                 */
 /*    |   `- fin_m->b_rptr + fin_ipoff (fin_ipoff is most likely 0 in case  */
 /*    |      of loopback)                                                   */
-/*    |                                                                     */
+/*    |                                                                     */
 /*    `- fin_m->b_rptr - points to L2 header in case of physical NIC        */
 /*                                                                          */
 /* All relevant IP headers are pulled up into the first mblk. It happened   */
diff --git a/usr/src/uts/common/inet/ipf/ip_log.c b/usr/src/uts/common/inet/ipf/ip_log.c
index 584ee42d9a..b70e320def 100644
--- a/usr/src/uts/common/inet/ipf/ip_log.c
+++ b/usr/src/uts/common/inet/ipf/ip_log.c
@@ -8,7 +8,7 @@
 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 *
- * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
*/ #include <sys/param.h> @@ -373,9 +373,11 @@ u_int flags; if (fin->fin_fr != NULL) { ipfl.fl_loglevel = fin->fin_fr->fr_loglevel; ipfl.fl_logtag = fin->fin_fr->fr_logtag; + bcopy(fin->fin_fr->fr_uuid, ipfl.fl_uuid, sizeof (uuid_t)); } else { ipfl.fl_loglevel = 0xffff; ipfl.fl_logtag = FR_NOLOGTAG; + bzero(ipfl.fl_uuid, sizeof (uuid_t)); } if (fin->fin_nattag != NULL) bcopy(fin->fin_nattag, (void *)&ipfl.fl_nattag, diff --git a/usr/src/uts/common/inet/ipf/ip_state.c b/usr/src/uts/common/inet/ipf/ip_state.c index 184f8775b6..a45bcbfdaf 100644 --- a/usr/src/uts/common/inet/ipf/ip_state.c +++ b/usr/src/uts/common/inet/ipf/ip_state.c @@ -5,7 +5,7 @@ * * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. */ #if defined(KERNEL) || defined(_KERNEL) @@ -108,6 +108,7 @@ struct file; # include <sys/systm.h> # endif #endif +#include <sys/uuid.h> /* END OF INCLUDES */ @@ -1445,6 +1446,7 @@ u_int flags; is->is_sti.tqe_flags |= TQE_RULEBASED; } is->is_tag = fr->fr_logtag; + memcpy(is->is_uuid, fr->fr_uuid, sizeof (uuid_t)); is->is_ifp[(out << 1) + 1] = fr->fr_ifas[1]; is->is_ifp[(1 - out) << 1] = fr->fr_ifas[2]; @@ -1524,6 +1526,9 @@ u_int flags; if (ifs->ifs_ipstate_logging) ipstate_log(is, ISL_NEW, ifs); + if (IFS_CFWLOG(ifs, is->is_rule)) + ipf_log_cfwlog(is, ISL_NEW, ifs); + RWLOCK_EXIT(&ifs->ifs_ipf_state); fin->fin_rev = IP6_NEQ(&is->is_dst, &fin->fin_daddr); fin->fin_flx |= FI_STATE; @@ -2314,6 +2319,8 @@ u_32_t cmask; is->is_flags &= ~(SI_W_SPORT|SI_W_DPORT); if ((flags & SI_CLONED) && ifs->ifs_ipstate_logging) ipstate_log(is, ISL_CLONE, ifs); + if ((flags & SI_CLONED) && IFS_CFWLOG(ifs, is->is_rule)) + ipf_log_cfwlog(is, ISL_CLONE, ifs); } ret = -1; @@ -3397,6 +3404,15 @@ ipf_stack_t *ifs; if (ifs->ifs_ipstate_logging != 0 && why != 0) ipstate_log(is, why, ifs); + /* + * For now, ipf_log_cfwlog() copes with all "why" values. Strictly + * speaking, though, they all map to one event (CFWEV_END), which for + * now is not supported, hence no code calling ipf_log_cfwlog() like + * below: + * + * if (why != 0 && IFS_CFWLOG(ifs, is->is_rule)) + * ipf_log_cfwlog(is, why, ifs); + */ if (is->is_rule != NULL) { is->is_rule->fr_statecnt--; @@ -3931,7 +3947,6 @@ int flags; return rval; } - /* ------------------------------------------------------------------------ */ /* Function: ipstate_log */ /* Returns: Nil */ diff --git a/usr/src/uts/common/inet/ipf/ipf.conf b/usr/src/uts/common/inet/ipf/ipf.conf index 6b36f9fdbf..f49e024a72 100644 --- a/usr/src/uts/common/inet/ipf/ipf.conf +++ b/usr/src/uts/common/inet/ipf/ipf.conf @@ -1,3 +1,8 @@ # # name="ipf" parent="pseudo" instance=0; + +# Increase the state table limits. fr_statemax should be ~70% of fr_statesize, +# and both should be prime numbers +fr_statesize=151007; +fr_statemax=113279; diff --git a/usr/src/uts/common/inet/ipf/netinet/Makefile b/usr/src/uts/common/inet/ipf/netinet/Makefile index cca3b48ac4..88f91e633f 100644 --- a/usr/src/uts/common/inet/ipf/netinet/Makefile +++ b/usr/src/uts/common/inet/ipf/netinet/Makefile @@ -1,16 +1,15 @@ # -#ident "%Z%%M% %I% %E% SMI" -# # Copyright 2008 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. +# Copyright 2019 Joyent, Inc. 
# # uts/common/inet/ipf/netinet/Makefile # # include global definitions include ../../../../../Makefile.master -HDRS= ipl.h ip_compat.h ip_fil.h ip_nat.h ip_proxy.h ip_state.h \ - ip_frag.h ip_auth.h ip_lookup.h ip_pool.h ip_htable.h ipf_stack.h +HDRS= ipl.h ip_compat.h ip_fil.h ip_nat.h ip_proxy.h ip_state.h ip_frag.h \ + ip_auth.h ip_lookup.h ip_pool.h ip_htable.h ipf_stack.h ipf_cfw.h ROOTDIRS= $(ROOT)/usr/include/netinet diff --git a/usr/src/uts/common/inet/ipf/netinet/ip_fil.h b/usr/src/uts/common/inet/ipf/netinet/ip_fil.h index 4c3c5683b5..bb5ce7bd6c 100644 --- a/usr/src/uts/common/inet/ipf/netinet/ip_fil.h +++ b/usr/src/uts/common/inet/ipf/netinet/ip_fil.h @@ -8,7 +8,7 @@ * * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2019, Joyent, Inc. */ #ifndef __IP_FIL_H__ @@ -16,6 +16,7 @@ #include "netinet/ip_compat.h" #include <sys/zone.h> +#include <sys/uuid.h> #ifdef SOLARIS #undef SOLARIS @@ -115,6 +116,8 @@ #define SIOCDELFR SIOCRMAFR #define SIOCINSFR SIOCINAFR # define SIOCIPFZONESET _IOWR('r', 97, struct ipfzoneobj) +# define SIOCIPFCFWCFG _IOR('r', 98, struct ipfcfwcfg) +# define SIOCIPFCFWNEWSZ _IOWR('r', 99, struct ipfcfwcfg) /* * What type of table is getting flushed? @@ -600,6 +603,7 @@ typedef struct frentry { u_32_t fr_flags; /* per-rule flags && options (see below) */ u_32_t fr_logtag; /* user defined log tag # */ u_32_t fr_collect; /* collection number */ + uuid_t fr_uuid; /* user defined uuid */ u_int fr_arg; /* misc. numeric arg for rule */ u_int fr_loglevel; /* syslog log facility + priority */ u_int fr_age[2]; /* non-TCP timeouts */ @@ -728,6 +732,7 @@ typedef struct frentry { #define FR_NEWISN 0x400000 /* new ISN for outgoing TCP */ #define FR_NOICMPERR 0x800000 /* do not match ICMP errors in state */ #define FR_STATESYNC 0x1000000 /* synchronize state to slave */ +#define FR_CFWLOG 0x2000000 /* Global CFW logging enabled */ #define FR_NOMATCH 0x8000000 /* no match occured */ /* 0x10000000 FF_LOGPASS */ /* 0x20000000 FF_LOGBLOCK */ @@ -883,6 +888,7 @@ typedef struct ipflog { u_32_t fl_lflags; u_32_t fl_logtag; ipftag_t fl_nattag; + uuid_t fl_uuid; u_short fl_plen; /* extra data after hlen */ u_short fl_loglevel; /* syslog log level */ char fl_group[FR_GROUPLEN]; @@ -931,6 +937,7 @@ typedef struct ipflog { #define IPSYNC_NAME "/dev/ipsync" #define IPSCAN_NAME "/dev/ipscan" #define IPLOOKUP_NAME "/dev/iplookup" +#define IPFEV_NAME "/dev/ipfev" #define IPL_LOGIPF 0 /* Minor device #'s for accessing logs */ #define IPL_LOGNAT 1 @@ -939,8 +946,9 @@ typedef struct ipflog { #define IPL_LOGSYNC 4 #define IPL_LOGSCAN 5 #define IPL_LOGLOOKUP 6 -#define IPL_LOGCOUNT 7 -#define IPL_LOGMAX 7 +#define IPL_LOGEV 7 +#define IPL_LOGCOUNT 8 +#define IPL_LOGMAX 8 #define IPL_LOGSIZE (IPL_LOGMAX + 1) #define IPL_LOGALL -1 #define IPL_LOGNONE -2 @@ -1181,6 +1189,21 @@ typedef struct ipfzoneobj { char ipfz_zonename[ZONENAME_MAX]; /* zone to act on */ } ipfzoneobj_t; +/* ioctl to grab CFW logging parameters */ +typedef struct ipfcfwcfg { + /* CFG => Max event size, NEWSZ => ignored in, like CFG out. */ + uint32_t ipfcfwc_maxevsize; + /* + * CFG => Current ring size, + * NEWSZ => New ring size, must be 2^N for 3 <= N <= 31. + */ + uint32_t ipfcfwc_evringsize; + /* CFG => Number of event reports, NEWSZ => ignored in, like CFG out. */ + uint64_t ipfcfwc_evreports; + /* CFG => Number of event drops, NEWSZ => ignored in, like CFG out. 
*/ + uint64_t ipfcfwc_evdrops; +} ipfcfwcfg_t; + #if defined(_KERNEL) /* Set ipfs_zoneid to this if no zone has been set: */ #define IPFS_ZONE_UNSET -2 @@ -1560,6 +1583,23 @@ extern int ipllog __P((int, fr_info_t *, void **, size_t *, int *, int, ipf_stack_t *)); extern void fr_logunload __P((ipf_stack_t *)); +/* SmartOS single-FD global-zone state accumulator (see cfw.c) */ +extern boolean_t ipf_cfwlog_enabled; +struct ipstate; /* Ugggh. */ +extern void ipf_log_cfwlog __P((struct ipstate *, uint_t, ipf_stack_t *)); +extern void ipf_block_cfwlog __P((frentry_t *, fr_info_t *, ipf_stack_t *)); +#define IFS_CFWLOG(ifs, fr) ((ifs)->ifs_gz_controlled && ipf_cfwlog_enabled &&\ + fr != NULL && ((fr)->fr_flags & FR_CFWLOG)) +struct cfwev_s; /* See ipf_cfw.h */ +extern boolean_t ipf_cfwev_consume __P((struct cfwev_s *, boolean_t)); +/* See cfw.c's ipf_cfwev_consume_many() for details. */ +typedef uint_t (*cfwmanycb_t) __P((struct cfwev_s *, uint_t, void *)); +extern int ipf_cfwlog_read __P((dev_t, struct uio *, struct cred *)); +extern int ipf_cfwlog_ioctl __P((dev_t, int, intptr_t, int, cred_t *, int *)); +#define IPF_CFW_RING_ALLOCATE 0 +#define IPF_CFW_RING_DESTROY 1 +extern int ipf_cfw_ring_resize(uint32_t); + extern frentry_t *fr_acctpkt __P((fr_info_t *, u_32_t *)); extern int fr_copytolog __P((int, char *, int)); extern u_short fr_cksum __P((mb_t *, ip_t *, int, void *)); diff --git a/usr/src/uts/common/inet/ipf/netinet/ip_state.h b/usr/src/uts/common/inet/ipf/netinet/ip_state.h index 4c605c1b89..ef315d5ef1 100644 --- a/usr/src/uts/common/inet/ipf/netinet/ip_state.h +++ b/usr/src/uts/common/inet/ipf/netinet/ip_state.h @@ -8,11 +8,14 @@ * * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2019 Joyent, Inc. */ #ifndef __IP_STATE_H__ #define __IP_STATE_H__ +#include <sys/uuid.h> + #if defined(__STDC__) || defined(__GNUC__) || defined(_AIX51) # define SIOCDELST _IOW('r', 61, struct ipfobj) #else @@ -66,6 +69,7 @@ typedef struct ipstate { /* in both directions */ u_32_t is_optmsk[2]; /* " " mask */ /* in both directions */ + uuid_t is_uuid; u_short is_sec; /* security options set */ u_short is_secmsk; /* " " mask */ u_short is_auth; /* authentication options set */ diff --git a/usr/src/uts/common/inet/ipf/netinet/ipf_cfw.h b/usr/src/uts/common/inet/ipf/netinet/ipf_cfw.h new file mode 100644 index 0000000000..1972d2b3f7 --- /dev/null +++ b/usr/src/uts/common/inet/ipf/netinet/ipf_cfw.h @@ -0,0 +1,69 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#ifndef __IPF_CFW_H__ +#define __IPF_CFW_H__ + +#include <sys/types.h> +#include <inet/ip6.h> +#include <sys/uuid.h> + +/* Because ipf compiles this kernel file in userland testing... */ +#ifndef ASSERT3U +#define ASSERT3U(a, b, c) ASSERT(a ## b ## c); +#endif /* ASSERT3U */ + +/* + * CFW Event, which is emitted to a global-zone listener. The global-zone + * listener solves the one-fd-per-zone problem of using each zone's ipmon. + * + * These must be 64-bit aligned because they form an array in-kernel. There + * might be reserved fields to ensure that alignment. 
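+ * One way to keep that invariant honest at build time (illustrative,
+ * not part of the original header) is a compile-time assertion:
+ *
+ *	CTASSERT((sizeof (cfwev_t) % 8) == 0);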
+ */ +#define CFWEV_BLOCK 1 +#define CFWEV_BEGIN 2 +#define CFWEV_END 3 +#define CFWDIR_IN 1 +#define CFWDIR_OUT 2 + +typedef struct cfwev_s { + uint16_t cfwev_type; /* BEGIN, END, BLOCK */ + uint16_t cfwev_length; /* in bytes, so capped to 65535 bytes */ + zoneid_t cfwev_zonedid; /* Pullable from ipf_stack_t. */ + + uint32_t cfwev_ruleid; /* Pullable from fr_info_t. */ + uint16_t cfwev_sport; /* Source port (network order) */ + uint16_t cfwev_dport; /* Dest. port (network order) */ + + uint8_t cfwev_protocol; /* IPPROTO_* */ + /* "direction" informs if src/dst are local/remote or remote/local. */ + uint8_t cfwev_direction; + uint8_t cfwev_reserved[6]; /* Ensures 64-bit alignment. */ + + in6_addr_t cfwev_saddr; /* IPv4 addresses are V4MAPPED. */ + in6_addr_t cfwev_daddr; + + /* + * Because of 'struct timeval' being different between 32-bit and + * 64-bit ABIs, this interface is only usable by 64-bit binaries. + */ + struct timeval cfwev_tstamp; + + uuid_t cfwev_ruleuuid; /* Pullable from fr_info_t. */ +} cfwev_t; + + + +#endif /* __IPF_CFW_H__ */ diff --git a/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h b/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h index 0ceea1e921..0b2a8d826f 100644 --- a/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h +++ b/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h @@ -6,7 +6,7 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * - * Copyright 2018 Joyent, Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. */ #ifndef __IPF_STACK_H__ @@ -46,6 +46,7 @@ struct ipf_stack { struct ipf_stack *ifs_gz_cont_ifs; netid_t ifs_netid; zoneid_t ifs_zone; + zoneid_t ifs_zone_did; boolean_t ifs_gz_controlled; /* ipf module */ @@ -126,6 +127,11 @@ struct ipf_stack { hook_t *ifs_ipfhook6_loop_out; hook_t *ifs_ipfhook6_nicevents; + hook_t *ifs_ipfhookvndl3v4_in; + hook_t *ifs_ipfhookvndl3v6_in; + hook_t *ifs_ipfhookvndl3v4_out; + hook_t *ifs_ipfhookvndl3v6_out; + hook_t *ifs_ipfhookviona_in; hook_t *ifs_ipfhookviona_out; @@ -140,12 +146,18 @@ struct ipf_stack { boolean_t ifs_hook6_nic_events; boolean_t ifs_hook6_loopback_in; boolean_t ifs_hook6_loopback_out; + boolean_t ifs_hookvndl3v4_physical_in; + boolean_t ifs_hookvndl3v6_physical_in; + boolean_t ifs_hookvndl3v4_physical_out; + boolean_t ifs_hookvndl3v6_physical_out; boolean_t ifs_hookviona_physical_in; boolean_t ifs_hookviona_physical_out; int ifs_ipf_loopback; net_handle_t ifs_ipf_ipv4; net_handle_t ifs_ipf_ipv6; + net_handle_t ifs_ipf_vndl3v4; + net_handle_t ifs_ipf_vndl3v6; net_handle_t ifs_ipf_viona; /* ip_auth.c */ @@ -305,6 +317,7 @@ struct ipf_stack { char *ifs_addmask_key; char *ifs_rn_zeros; char *ifs_rn_ones; + #ifdef KERNEL /* kstats for inbound and outbound */ kstat_t *ifs_kstatp[2]; diff --git a/usr/src/uts/common/inet/ipf/solaris.c b/usr/src/uts/common/inet/ipf/solaris.c index c541f4dddc..5ccbfa3188 100644 --- a/usr/src/uts/common/inet/ipf/solaris.c +++ b/usr/src/uts/common/inet/ipf/solaris.c @@ -6,7 +6,7 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. 
 */
 /*
@@ -116,7 +116,7 @@ static void ipf_stack_shutdown __P((const netid_t, void *));
 static int ipf_property_g_update __P((dev_info_t *));
 static char *ipf_devfiles[] = { IPL_NAME, IPNAT_NAME, IPSTATE_NAME, IPAUTH_NAME, IPSYNC_NAME, IPSCAN_NAME,
-	IPLOOKUP_NAME, NULL };
+	IPLOOKUP_NAME, IPFEV_NAME, NULL };
 extern void *ipf_state;	/* DDI state */
 extern vmem_t *ipf_minor;	/* minor number arena */
@@ -625,7 +625,6 @@ ipf_stack_shutdown(const netid_t id, void *arg)
 /*
 * Destroy things for ipf for one stack.
 */
-/* ARGSUSED */
 static void
 ipf_stack_destroy_one(const netid_t id, ipf_stack_t *ifs)
 {
@@ -742,6 +741,9 @@ ddi_attach_cmd_t cmd;
 		ipf_dev_info = dip;
+		if (ipf_cfw_ring_resize(IPF_CFW_RING_ALLOCATE) != 0)
+			goto attach_failed;
+
 		ipfncb = net_instance_alloc(NETINFO_VERSION);
 		if (ipfncb == NULL)
 			goto attach_failed;
@@ -769,6 +771,7 @@ ddi_attach_cmd_t cmd;
 	}
 attach_failed:
+	(void) ipf_cfw_ring_resize(IPF_CFW_RING_DESTROY);
 	ddi_remove_minor_node(dip, NULL);
 	ddi_prop_remove_all(dip);
 	ddi_soft_state_fini(&ipf_state);
@@ -796,6 +799,7 @@ ddi_detach_cmd_t cmd;
 		 * framework guarantees we are not active with this devinfo
 		 * node in any other entry points at this time.
 		 */
+		(void) ipf_cfw_ring_resize(IPF_CFW_RING_DESTROY);
 		ddi_prop_remove_all(dip);
 		i = ddi_get_instance(dip);
 		ddi_remove_minor_node(dip, NULL);
diff --git a/usr/src/uts/common/inet/mib2.h b/usr/src/uts/common/inet/mib2.h
index 5a168523ee..85ca5ebdec 100644
--- a/usr/src/uts/common/inet/mib2.h
+++ b/usr/src/uts/common/inet/mib2.h
@@ -23,6 +23,7 @@
 /*
 * Copyright (c) 1990 Mentat Inc.
 * Copyright (c) 2015, 2016 by Delphix. All rights reserved.
+ * Copyright 2019 Joyent, Inc.
 */
 /*
@@ -1400,6 +1401,8 @@ typedef struct tcpConnEntryInfo_s {
 			/* round-trip time smoothed average (us) */
 	Gauge		ce_rtt_sa;
+			/* round-trip time smoothed deviation (us) */
+	Gauge		ce_rtt_sd;
 			/* current rto (retransmit timeout) */
 	Gauge		ce_rto;
 			/* round-trip time count */
 	Gauge		ce_rtt_cnt;
diff --git a/usr/src/uts/common/inet/rawip_impl.h b/usr/src/uts/common/inet/rawip_impl.h
index 6fb72d1d08..ddb482db78 100644
--- a/usr/src/uts/common/inet/rawip_impl.h
+++ b/usr/src/uts/common/inet/rawip_impl.h
@@ -20,6 +20,7 @@
 */
 /*
 * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2016 Joyent, Inc.
 */
 /* Copyright (c) 1990 Mentat Inc. */
@@ -43,6 +44,7 @@ extern "C" {
 #include <inet/ip.h>
 #include <inet/optcom.h>
 #include <inet/tunables.h>
+#include <inet/bpf.h>
 /*
 * ICMP stack instances
@@ -84,6 +86,10 @@ typedef struct icmp_s {
 	mblk_t	*icmp_fallback_queue_head;
 	mblk_t	*icmp_fallback_queue_tail;
 	struct sockaddr_storage	icmp_delayed_addr;
+
+	krwlock_t	icmp_bpf_lock;		/* protects icmp_bpf */
+	ip_bpf_insn_t	*icmp_bpf_prog;		/* SO_ATTACH_FILTER bpf */
+	uint_t		icmp_bpf_len;
 } icmp_t;
 /*
diff --git a/usr/src/uts/common/inet/sockmods/datafilt.c b/usr/src/uts/common/inet/sockmods/datafilt.c
new file mode 100644
index 0000000000..6e1171de46
--- /dev/null
+++ b/usr/src/uts/common/inet/sockmods/datafilt.c
@@ -0,0 +1,116 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2012, OmniTI Computer Consulting, Inc. All rights reserved.
+ */ + +/* + * This file implements a socketfilter used to deter TCP connections. + * To defer a connection means to delay the return of accept(3SOCKET) + * until at least one byte is ready to be read(2). This filter may be + * applied automatically or programmatically through the use of + * soconfig(1M) and setsockopt(3SOCKET). + */ + +#include <sys/kmem.h> +#include <sys/systm.h> +#include <sys/stropts.h> +#include <sys/strsun.h> +#include <sys/socketvar.h> +#include <sys/sockfilter.h> +#include <sys/note.h> +#include <sys/taskq.h> + +#define DATAFILT_MODULE "datafilt" + +static struct modlmisc dataf_modlmisc = { + &mod_miscops, + "Kernel data-ready socket filter" +}; + +static struct modlinkage dataf_modlinkage = { + MODREV_1, + &dataf_modlmisc, + NULL +}; + +static sof_rval_t +dataf_attach_passive_cb(sof_handle_t handle, sof_handle_t ph, + void *parg, struct sockaddr *laddr, socklen_t laddrlen, + struct sockaddr *faddr, socklen_t faddrlen, void **cookiep) +{ + _NOTE(ARGUNUSED(handle, ph, parg, laddr, laddrlen, faddr, faddrlen, + cookiep)); + return (SOF_RVAL_DEFER); +} + +static void +dataf_detach_cb(sof_handle_t handle, void *cookie, cred_t *cr) +{ + _NOTE(ARGUNUSED(handle, cookie, cr)); +} + +static mblk_t * +dataf_data_in_cb(sof_handle_t handle, void *cookie, mblk_t *mp, int flags, + size_t *lenp) +{ + _NOTE(ARGUNUSED(cookie, flags, lenp)); + + if (mp != NULL && MBLKL(mp) > 0) { + sof_newconn_ready(handle); + sof_bypass(handle); + } + + return (mp); +} + +static sof_ops_t dataf_ops = { + .sofop_attach_passive = dataf_attach_passive_cb, + .sofop_detach = dataf_detach_cb, + .sofop_data_in = dataf_data_in_cb +}; + +int +_init(void) +{ + int err; + + /* + * This module is safe to attach even after some preliminary socket + * setup calls have taken place. See the comment for SOF_ATT_SAFE. + */ + err = sof_register(SOF_VERSION, DATAFILT_MODULE, &dataf_ops, + SOF_ATT_SAFE); + if (err != 0) + return (err); + if ((err = mod_install(&dataf_modlinkage)) != 0) + (void) sof_unregister(DATAFILT_MODULE); + + return (err); +} + +int +_fini(void) +{ + int err; + + if ((err = sof_unregister(DATAFILT_MODULE)) != 0) + return (err); + + return (mod_remove(&dataf_modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&dataf_modlinkage, modinfop)); +} diff --git a/usr/src/uts/common/inet/sockmods/sockmod_pfp.c b/usr/src/uts/common/inet/sockmods/sockmod_pfp.c index 586d7f06f8..76191e93b8 100644 --- a/usr/src/uts/common/inet/sockmods/sockmod_pfp.c +++ b/usr/src/uts/common/inet/sockmods/sockmod_pfp.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2015 Joyent, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. 
 */
 #include <sys/types.h>
@@ -51,6 +51,7 @@
 #include <sys/mac_client.h>
 #include <sys/mac_provider.h>
 #include <sys/mac_client_priv.h>
+#include <inet/bpf.h>
 #include <netpacket/packet.h>
@@ -448,7 +449,7 @@ pfp_packet(void *arg, mac_resource_handle_t mrh, mblk_t *mp, boolean_t flag)
 		buffer = (uchar_t *)mp;
 	}
 	rw_enter(&ps->ps_bpflock, RW_READER);
-	if (bpf_filter(ps->ps_bpf.bf_insns, buffer,
+	if (ip_bpf_filter((ip_bpf_insn_t *)ps->ps_bpf.bf_insns, buffer,
 	    hdr.mhi_pktsize, buflen) == 0) {
 		rw_exit(&ps->ps_bpflock);
 		ps->ps_stats.tp_drops++;
@@ -1336,7 +1337,7 @@ pfp_setsocket_sockopt(sock_lower_handle_t handle, int option_name,
     const void *optval, socklen_t optlen)
 {
 	struct bpf_program prog;
-	struct bpf_insn *fcode;
+	ip_bpf_insn_t *fcode;
 	struct pfpsock *ps;
 	struct sock_proto_props sopp;
 	int error = 0;
@@ -1370,10 +1371,10 @@ pfp_setsocket_sockopt(sock_lower_handle_t handle, int option_name,
 		return (EFAULT);
 	}
-	if (bpf_validate(fcode, (int)prog.bf_len)) {
+	if (ip_bpf_validate(fcode, prog.bf_len)) {
 		rw_enter(&ps->ps_bpflock, RW_WRITER);
 		pfp_release_bpf(ps);
-		ps->ps_bpf.bf_insns = fcode;
+		ps->ps_bpf.bf_insns = (struct bpf_insn *)fcode;
 		ps->ps_bpf.bf_len = size;
 		rw_exit(&ps->ps_bpflock);
diff --git a/usr/src/uts/common/inet/squeue.c b/usr/src/uts/common/inet/squeue.c
index 9fa40eccb6..e65af832eb 100644
--- a/usr/src/uts/common/inet/squeue.c
+++ b/usr/src/uts/common/inet/squeue.c
@@ -61,6 +61,10 @@
 * connection are processed on that squeue. The connection ("conn") to
 * squeue mapping is stored in "conn_t" member "conn_sqp".
 *
+ * If the squeue is not related to TCP/IP, then the value of sqp->sq_isip is
+ * false and it will not have an associated conn_t, which means many aspects of
+ * the system, such as polling and switching squeues, will not be used.
+ *
 * Since the processing of the connection cuts across multiple layers
 * but still allows packets for different connnection to be processed on
 * other CPU/squeues, squeues are also termed as "Vertical Perimeter" or
@@ -241,7 +245,7 @@ squeue_init(void)
 }
 squeue_t *
-squeue_create(pri_t pri)
+squeue_create(pri_t pri, boolean_t isip)
 {
 	squeue_t *sqp = kmem_cache_alloc(squeue_cache, KM_SLEEP);
@@ -256,11 +260,36 @@ squeue_create(pri_t pri)
 	sqp->sq_enter = squeue_enter;
 	sqp->sq_drain = squeue_drain;
+	sqp->sq_isip = isip;
 	return (sqp);
 }
 /*
+ * We need to kill the threads and then clean up. We should VERIFY that
+ * polling is disabled so we don't have to worry about disassociating from
+ * MAC/IP/etc.
+ */
+void
+squeue_destroy(squeue_t *sqp)
+{
+	kt_did_t worker, poll;
+	mutex_enter(&sqp->sq_lock);
+	VERIFY(!(sqp->sq_state & (SQS_POLL_THR_QUIESCED |
+	    SQS_POLL_QUIESCE_DONE | SQS_PAUSE | SQS_EXIT)));
+	worker = sqp->sq_worker->t_did;
+	poll = sqp->sq_poll_thr->t_did;
+	sqp->sq_state |= SQS_EXIT;
+	cv_signal(&sqp->sq_poll_cv);
+	cv_signal(&sqp->sq_worker_cv);
+	mutex_exit(&sqp->sq_lock);
+
+	thread_join(poll);
+	thread_join(worker);
+	kmem_cache_free(squeue_cache, sqp);
+}
+
+/*
 * Bind squeue worker thread to the specified CPU, given by CPU id.
 * If the CPU id value is -1, bind the worker thread to the value
 * specified in sq_bind field. If a thread is already bound to a
@@ -380,18 +409,21 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
 		 * Handle squeue switching.
More details in the * block comment at the top of the file */ - if (connp->conn_sqp == sqp) { + if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) { SQUEUE_DBG_SET(sqp, mp, proc, connp, tag); - connp->conn_on_sqp = B_TRUE; + if (sqp->sq_isip == B_TRUE) + connp->conn_on_sqp = B_TRUE; DTRACE_PROBE3(squeue__proc__start, squeue_t *, sqp, mblk_t *, mp, conn_t *, connp); (*proc)(connp, mp, sqp, ira); DTRACE_PROBE2(squeue__proc__end, squeue_t *, sqp, conn_t *, connp); - connp->conn_on_sqp = B_FALSE; + if (sqp->sq_isip == B_TRUE) { + connp->conn_on_sqp = B_FALSE; + CONN_DEC_REF(connp); + } SQUEUE_DBG_CLEAR(sqp); - CONN_DEC_REF(connp); } else { SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE); @@ -407,7 +439,7 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, * still be best to process a single queued * item if it matches the active connection. */ - if (sqp->sq_first != NULL) { + if (sqp->sq_first != NULL && sqp->sq_isip) { squeue_try_drain_one(sqp, connp); } @@ -423,7 +455,7 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, return; } } else { - if (ira != NULL) { + if (sqp->sq_isip == B_TRUE && ira != NULL) { mblk_t *attrmp; ASSERT(cnt == 1); @@ -496,7 +528,8 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, if (!(sqp->sq_state & SQS_REENTER) && (process_flag != SQ_FILL) && (sqp->sq_first == NULL) && (sqp->sq_run == curthread) && (cnt == 1) && - (connp->conn_on_sqp == B_FALSE)) { + (sqp->sq_isip == B_FALSE || + connp->conn_on_sqp == B_FALSE)) { sqp->sq_state |= SQS_REENTER; mutex_exit(&sqp->sq_lock); @@ -511,15 +544,21 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, * Handle squeue switching. More details in the * block comment at the top of the file */ - if (connp->conn_sqp == sqp) { - connp->conn_on_sqp = B_TRUE; + if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) { + SQUEUE_DBG_SET(sqp, mp, proc, connp, + tag); + if (sqp->sq_isip == B_TRUE) + connp->conn_on_sqp = B_TRUE; DTRACE_PROBE3(squeue__proc__start, squeue_t *, sqp, mblk_t *, mp, conn_t *, connp); (*proc)(connp, mp, sqp, ira); DTRACE_PROBE2(squeue__proc__end, squeue_t *, sqp, conn_t *, connp); - connp->conn_on_sqp = B_FALSE; - CONN_DEC_REF(connp); + if (sqp->sq_isip == B_TRUE) { + connp->conn_on_sqp = B_FALSE; + CONN_DEC_REF(connp); + } + SQUEUE_DBG_CLEAR(sqp); } else { SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE); @@ -540,7 +579,7 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, #ifdef DEBUG mp->b_tag = tag; #endif - if (ira != NULL) { + if (sqp->sq_isip && ira != NULL) { mblk_t *attrmp; ASSERT(cnt == 1); @@ -658,7 +697,7 @@ again: mp->b_prev = NULL; /* Is there an ip_recv_attr_t to handle? */ - if (ip_recv_attr_is_mblk(mp)) { + if (sqp->sq_isip == B_TRUE && ip_recv_attr_is_mblk(mp)) { mblk_t *attrmp = mp; ASSERT(attrmp->b_cont != NULL); @@ -683,20 +722,25 @@ again: /* - * Handle squeue switching. More details in the - * block comment at the top of the file + * Handle squeue switching. More details in the block comment at + * the top of the file. non-IP squeues cannot switch, as there + * is no conn_t. 
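+		 * An illustrative non-IP consumer (hypothetical) never
+		 * migrates and passes no ip_recv_attr_t:
+		 *
+		 *	sqp = squeue_create(pri, B_FALSE);
+		 *	squeue_enter(sqp, mp, mp, 1, NULL, SQ_FILL, tag);
+		 *	...
+		 *	squeue_destroy(sqp);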
*/ - if (connp->conn_sqp == sqp) { + if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) { SQUEUE_DBG_SET(sqp, mp, proc, connp, mp->b_tag); - connp->conn_on_sqp = B_TRUE; + if (sqp->sq_isip == B_TRUE) + connp->conn_on_sqp = B_TRUE; DTRACE_PROBE3(squeue__proc__start, squeue_t *, sqp, mblk_t *, mp, conn_t *, connp); (*proc)(connp, mp, sqp, ira); DTRACE_PROBE2(squeue__proc__end, squeue_t *, sqp, conn_t *, connp); - connp->conn_on_sqp = B_FALSE; - CONN_DEC_REF(connp); + if (sqp->sq_isip == B_TRUE) { + connp->conn_on_sqp = B_FALSE; + CONN_DEC_REF(connp); + } + SQUEUE_DBG_CLEAR(sqp); } else { SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE); @@ -925,6 +969,11 @@ squeue_polling_thread(squeue_t *sqp) cv_wait(async, lock); CALLB_CPR_SAFE_END(&cprinfo, lock); + if (sqp->sq_state & SQS_EXIT) { + mutex_exit(lock); + thread_exit(); + } + ctl_state = sqp->sq_state & (SQS_POLL_THR_CONTROL | SQS_POLL_THR_QUIESCED); if (ctl_state != 0) { @@ -950,6 +999,9 @@ squeue_polling_thread(squeue_t *sqp) (SQS_PROC|SQS_POLLING|SQS_GET_PKTS)) == (SQS_PROC|SQS_POLLING|SQS_GET_PKTS)); + /* Only IP related squeues should reach this point */ + VERIFY(sqp->sq_isip == B_TRUE); + poll_again: sq_rx_ring = sqp->sq_rx_ring; sq_get_pkts = sq_rx_ring->rr_rx; @@ -1079,6 +1131,7 @@ squeue_worker_thr_control(squeue_t *sqp) ill_rx_ring_t *rx_ring; ASSERT(MUTEX_HELD(&sqp->sq_lock)); + VERIFY(sqp->sq_isip == B_TRUE); if (sqp->sq_state & SQS_POLL_RESTART) { /* Restart implies a previous quiesce. */ @@ -1190,6 +1243,11 @@ squeue_worker(squeue_t *sqp) for (;;) { for (;;) { + if (sqp->sq_state & SQS_EXIT) { + mutex_exit(lock); + thread_exit(); + } + /* * If the poll thread has handed control to us * we need to break out of the wait. @@ -1286,6 +1344,7 @@ squeue_synch_enter(conn_t *connp, mblk_t *use_mp) again: sqp = connp->conn_sqp; + VERIFY(sqp->sq_isip == B_TRUE); mutex_enter(&sqp->sq_lock); if (sqp->sq_first == NULL && !(sqp->sq_state & SQS_PROC)) { @@ -1374,6 +1433,7 @@ squeue_try_drain_one(squeue_t *sqp, conn_t *compare_conn) ASSERT(MUTEX_HELD(&sqp->sq_lock)); ASSERT((sqp->sq_state & SQS_PROC) == 0); ASSERT(sqp->sq_run == NULL); + ASSERT(sqp->sq_isip); VERIFY(mp != NULL); /* @@ -1440,6 +1500,9 @@ squeue_try_drain_one(squeue_t *sqp, conn_t *compare_conn) CONN_DEC_REF(connp); SQUEUE_DBG_CLEAR(sqp); + if (ira != NULL) + ira_cleanup(ira, B_TRUE); + done: mutex_enter(&sqp->sq_lock); sqp->sq_state &= ~(SQS_PROC); @@ -1451,6 +1514,7 @@ squeue_synch_exit(conn_t *connp, int flag) { squeue_t *sqp = connp->conn_sqp; + VERIFY(sqp->sq_isip == B_TRUE); ASSERT(flag == SQ_NODRAIN || flag == SQ_PROCESS); mutex_enter(&sqp->sq_lock); diff --git a/usr/src/uts/common/inet/tcp.h b/usr/src/uts/common/inet/tcp.h index 775c5abe6b..3ed2b7174a 100644 --- a/usr/src/uts/common/inet/tcp.h +++ b/usr/src/uts/common/inet/tcp.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014, 2017 by Delphix. All rights reserved. * Copyright 2020 OmniOS Community Edition (OmniOSce) Association. 
@@ -137,6 +137,7 @@ typedef struct tcphdra_s {
 struct conn_s;
 struct tcp_listen_cnt_s;
+struct tcp_rg_s;
 /*
 * Control structure for each open TCP stream,
@@ -407,6 +408,13 @@ typedef struct tcp_s {
 	struct tcp_s *tcp_bind_hash_port; /* tcp_t's bound to the same lport */
 	struct tcp_s **tcp_ptpbhn;
+	/*
+	 * Group of tcp_t entries bound to the same address and port via
+	 * SO_REUSEPORT. The pointer itself is protected by tf_lock in the
+	 * containing tcps_bind_fanout slot.
+	 */
+	struct tcp_rg_s *tcp_rg_bind;
+
 	uint_t tcp_maxpsz_multiplier;
 	uint32_t tcp_lso_max; /* maximum LSO payload */
diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c
index 9348ea3d0f..427a6df274 100644
--- a/usr/src/uts/common/inet/tcp/tcp.c
+++ b/usr/src/uts/common/inet/tcp/tcp.c
@@ -961,8 +961,7 @@ void
 tcp_stop_lingering(tcp_t *tcp)
 {
 	clock_t delta = 0;
-	tcp_stack_t *tcps = tcp->tcp_tcps;
-	conn_t *connp = tcp->tcp_connp;
+	conn_t *connp = tcp->tcp_connp;
 	tcp->tcp_linger_tid = 0;
 	if (tcp->tcp_state > TCPS_LISTEN) {
@@ -990,7 +989,7 @@ tcp_stop_lingering(tcp_t *tcp)
 	if (tcp->tcp_state == TCPS_TIME_WAIT) {
 		tcp_time_wait_append(tcp);
-		TCP_DBGSTAT(tcps, tcp_detach_time_wait);
+		TCP_DBGSTAT(tcp->tcp_tcps, tcp_detach_time_wait);
 		goto finish;
 	}
@@ -1429,6 +1428,21 @@ tcp_free(tcp_t *tcp)
 		tcp->tcp_cc_algo->cb_destroy(&tcp->tcp_ccv);
 	/*
+	 * Destroy any association with SO_REUSEPORT group.
+	 */
+	if (tcp->tcp_rg_bind != NULL) {
+		/*
+		 * This is only necessary for connections which enabled
+		 * SO_REUSEPORT but were never bound. Such connections should
+		 * be the one and only member of the tcp_rg_t to which they
+		 * have been associated.
+		 */
+		VERIFY(tcp_rg_remove(tcp->tcp_rg_bind, tcp));
+		tcp_rg_destroy(tcp->tcp_rg_bind);
+		tcp->tcp_rg_bind = NULL;
+	}
+
+	/*
 	 * If this is a non-STREAM socket still holding on to an upper
 	 * handle, release it. As a result of fallback we might also see
 	 * STREAMS based conns with upper handles, in which case there is
@@ -2477,8 +2491,10 @@ tcp_init_values(tcp_t *tcp, tcp_t *parent)
 * Path MTU might have changed by either increase or decrease, so need to
 * adjust the MSS based on the value of ixa_pmtu. No need to handle tiny
 * or negative MSS, since tcp_mss_set() will do it.
+ *
+ * Returns B_TRUE when the connection PMTU changes, otherwise B_FALSE.
 */
-void
+boolean_t
 tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only)
 {
 	uint32_t pmtu;
@@ -2488,10 +2504,10 @@ tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only)
 	iaflags_t ixaflags;
 	if (tcp->tcp_tcps->tcps_ignore_path_mtu)
-		return;
+		return (B_FALSE);
 	if (tcp->tcp_state < TCPS_ESTABLISHED)
-		return;
+		return (B_FALSE);
 	/*
 	 * Always call ip_get_pmtu() to make sure that IP has updated
@@ -2511,13 +2527,13 @@ tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only)
 	 * Nothing to change, so just return.
 	 */
 	if (mss == tcp->tcp_mss)
-		return;
+		return (B_FALSE);
 	/*
 	 * Currently, for ICMP errors, only PMTU decrease is handled.
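	 * The boolean return added above exists so an ICMP "packet too
	 * big" path can react only when the MSS really changed; a
	 * hypothetical caller (the retransmit helper's name is made up
	 * for the sketch):
	 *
	 *	if (tcp_update_pmtu(tcp, B_TRUE))
	 *		(void) tcp_rexmit_after_pmtu_change(tcp);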
*/ if (mss > tcp->tcp_mss && decrease_only) - return; + return (B_FALSE); DTRACE_PROBE2(tcp_update_pmtu, int32_t, tcp->tcp_mss, uint32_t, mss); @@ -2552,6 +2568,7 @@ tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only) tcp->tcp_ipha->ipha_fragment_offset_and_flags = 0; } ixa->ixa_flags = ixaflags; + return (B_TRUE); } int @@ -3424,7 +3441,7 @@ tcp_notify(void *arg, ip_xmit_attr_t *ixa, ixa_notify_type_t ntype, tcp_update_lso(tcp, connp->conn_ixa); break; case IXAN_PMTU: - tcp_update_pmtu(tcp, B_FALSE); + (void) tcp_update_pmtu(tcp, B_FALSE); break; case IXAN_ZCOPY: tcp_update_zcopy(tcp); @@ -3755,7 +3772,6 @@ tcp_stack_init(netstackid_t stackid, netstack_t *ns) { tcp_stack_t *tcps; int i; - int error = 0; major_t major; size_t arrsz; @@ -3819,8 +3835,7 @@ tcp_stack_init(netstackid_t stackid, netstack_t *ns) tcps->tcps_mibkp = tcp_kstat_init(stackid); major = mod_name_to_major(INET_NAME); - error = ldi_ident_from_major(major, &tcps->tcps_ldi_ident); - ASSERT(error == 0); + VERIFY0(ldi_ident_from_major(major, &tcps->tcps_ldi_ident)); tcps->tcps_ixa_cleanup_mp = allocb_wait(0, BPRI_MED, STR_NOSIG, NULL); ASSERT(tcps->tcps_ixa_cleanup_mp != NULL); cv_init(&tcps->tcps_ixa_cleanup_ready_cv, NULL, CV_DEFAULT, NULL); diff --git a/usr/src/uts/common/inet/tcp/tcp_bind.c b/usr/src/uts/common/inet/tcp/tcp_bind.c index 86242fc944..5c2e1e1932 100644 --- a/usr/src/uts/common/inet/tcp/tcp_bind.c +++ b/usr/src/uts/common/inet/tcp/tcp_bind.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2013 Nexenta Systems, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. * Copyright (c) 2016 by Delphix. All rights reserved. */ @@ -56,6 +57,7 @@ static uint32_t tcp_random_anon_port = 1; static int tcp_bind_select_lport(tcp_t *, in_port_t *, boolean_t, cred_t *cr); static in_port_t tcp_get_next_priv_port(const tcp_t *); +static int tcp_rg_insert(tcp_rg_t *, struct tcp_s *); /* * Hash list insertion routine for tcp_t structures. Each hash bucket @@ -173,6 +175,16 @@ tcp_bind_hash_remove(tcp_t *tcp) ASSERT(lockp != NULL); mutex_enter(lockp); + + /* destroy any association with SO_REUSEPORT group */ + if (tcp->tcp_rg_bind != NULL) { + if (tcp_rg_remove(tcp->tcp_rg_bind, tcp)) { + /* Last one out turns off the lights */ + tcp_rg_destroy(tcp->tcp_rg_bind); + } + tcp->tcp_rg_bind = NULL; + } + if (tcp->tcp_ptpbhn) { tcpnext = tcp->tcp_bind_hash_port; if (tcpnext != NULL) { @@ -638,13 +650,12 @@ tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, } /* - * If the "bind_to_req_port_only" parameter is set, if the requested port - * number is available, return it, If not return 0 + * If the "bind_to_req_port_only" parameter is set and the requested port + * number is available, return it (else return 0). * - * If "bind_to_req_port_only" parameter is not set and - * If the requested port number is available, return it. If not, return - * the first anonymous port we happen across. If no anonymous ports are - * available, return 0. addr is the requested local address, if any. + * If "bind_to_req_port_only" parameter is not set and the requested port + * number is available, return it. If not, return the first anonymous port we + * happen across. If no anonymous ports are available, return 0. * * In either case, when succeeding update the tcp_t to record the port number * and insert it in the bind hash table. 
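 * An illustrative reading of that contract (arguments elided, not the
 * exact signature):
 *
 *	port = tcp_bindi(tcp, requested_port, &laddr, ...);
 *	if (port == 0)
 *		... the bind fails, e.g. with EADDRINUSE ...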
@@ -664,6 +675,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, int loopmax; conn_t *connp = tcp->tcp_connp; tcp_stack_t *tcps = tcp->tcp_tcps; + boolean_t reuseport = connp->conn_reuseport; /* * Lookup for free addresses is done in a loop and "loopmax" @@ -700,6 +712,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, tf_t *tbf; tcp_t *ltcp; conn_t *lconnp; + boolean_t attempt_reuse = B_FALSE; lport = htons(port); @@ -726,6 +739,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) { boolean_t not_socket; boolean_t exclbind; + boolean_t addrmatch; lconnp = ltcp->tcp_connp; @@ -831,22 +845,35 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, &lconnp->conn_faddr_v6))) continue; + addrmatch = IN6_ARE_ADDR_EQUAL(laddr, + &lconnp->conn_bound_addr_v6); + + if (addrmatch && reuseport && bind_to_req_port_only && + (ltcp->tcp_state == TCPS_BOUND || + ltcp->tcp_state == TCPS_LISTEN)) { + /* + * This entry is bound to the exact same + * address and port. If SO_REUSEPORT is set on + * the calling socket, attempt to reuse this + * binding if it too had SO_REUSEPORT enabled + * when it was bound. + */ + attempt_reuse = (ltcp->tcp_rg_bind != NULL); + break; + } + if (!reuseaddr) { /* - * No socket option SO_REUSEADDR. - * If existing port is bound to - * a non-wildcard IP address - * and the requesting stream is - * bound to a distinct - * different IP addresses - * (non-wildcard, also), keep - * going. + * No socket option SO_REUSEADDR. If an + * existing port is bound to a non-wildcard IP + * address and the requesting stream is bound + * to a distinct different IP address + * (non-wildcard, also), keep going. */ if (!V6_OR_V4_INADDR_ANY(*laddr) && !V6_OR_V4_INADDR_ANY( lconnp->conn_bound_addr_v6) && - !IN6_ARE_ADDR_EQUAL(laddr, - &lconnp->conn_bound_addr_v6)) + !addrmatch) continue; if (ltcp->tcp_state >= TCPS_BOUND) { /* @@ -861,27 +888,49 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, * socket option SO_REUSEADDR is set on the * binding tcp_t. * - * If two streams are bound to - * same IP address or both addr - * and bound source are wildcards - * (INADDR_ANY), we want to stop - * searching. - * We have found a match of IP source - * address and source port, which is - * refused regardless of the - * SO_REUSEADDR setting, so we break. + * If two streams are bound to the same IP + * address or both addr and bound source are + * wildcards (INADDR_ANY), we want to stop + * searching. We have found a match of IP + * source address and source port, which is + * refused regardless of the SO_REUSEADDR + * setting, so we break. */ - if (IN6_ARE_ADDR_EQUAL(laddr, - &lconnp->conn_bound_addr_v6) && + if (addrmatch && (ltcp->tcp_state == TCPS_LISTEN || ltcp->tcp_state == TCPS_BOUND)) break; } } - if (ltcp != NULL) { + if (ltcp != NULL && !attempt_reuse) { /* The port number is busy */ mutex_exit(&tbf->tf_lock); } else { + if (attempt_reuse) { + int err; + struct tcp_rg_s *rg; + + ASSERT(ltcp != NULL); + ASSERT(ltcp->tcp_rg_bind != NULL); + ASSERT(tcp->tcp_rg_bind != NULL); + ASSERT(ltcp->tcp_rg_bind != tcp->tcp_rg_bind); + + err = tcp_rg_insert(ltcp->tcp_rg_bind, tcp); + if (err != 0) { + mutex_exit(&tbf->tf_lock); + return (0); + } + /* + * Now that the newly-binding socket has joined + * the existing reuseport group on ltcp, it + * should clean up its own (empty) group. 
+ */ + rg = tcp->tcp_rg_bind; + tcp->tcp_rg_bind = ltcp->tcp_rg_bind; + VERIFY(tcp_rg_remove(rg, tcp)); + tcp_rg_destroy(rg); + } + /* * This port is ours. Insert in fanout and mark as * bound to prevent others from getting the port @@ -946,3 +995,124 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, } while (++count < loopmax); return (0); } + +/* Max number of members in TCP SO_REUSEPORT group */ +#define TCP_RG_SIZE_MAX 64 +/* Step size when expanding members array */ +#define TCP_RG_SIZE_STEP 2 + + +tcp_rg_t * +tcp_rg_init(tcp_t *tcp) +{ + tcp_rg_t *rg; + rg = kmem_alloc(sizeof (tcp_rg_t), KM_NOSLEEP_LAZY); + if (rg == NULL) + return (NULL); + rg->tcprg_members = kmem_zalloc(2 * sizeof (tcp_t *), KM_NOSLEEP_LAZY); + if (rg->tcprg_members == NULL) { + kmem_free(rg, sizeof (tcp_rg_t)); + return (NULL); + } + + mutex_init(&rg->tcprg_lock, NULL, MUTEX_DEFAULT, NULL); + rg->tcprg_size = 2; + rg->tcprg_count = 1; + rg->tcprg_active = 1; + rg->tcprg_members[0] = tcp; + return (rg); +} + +void +tcp_rg_destroy(tcp_rg_t *rg) +{ + mutex_enter(&rg->tcprg_lock); + ASSERT(rg->tcprg_count == 0); + ASSERT(rg->tcprg_active == 0); + kmem_free(rg->tcprg_members, rg->tcprg_size * sizeof (tcp_t *)); + mutex_destroy(&rg->tcprg_lock); + kmem_free(rg, sizeof (struct tcp_rg_s)); +} + +static int +tcp_rg_insert(tcp_rg_t *rg, tcp_t *tcp) +{ + mutex_enter(&rg->tcprg_lock); + + VERIFY(rg->tcprg_size > 0); + VERIFY(rg->tcprg_count <= rg->tcprg_size); + if (rg->tcprg_count != 0) { + cred_t *oldcred = rg->tcprg_members[0]->tcp_connp->conn_cred; + cred_t *newcred = tcp->tcp_connp->conn_cred; + + if (crgetuid(oldcred) != crgetuid(newcred) || + crgetzoneid(oldcred) != crgetzoneid(newcred)) { + mutex_exit(&rg->tcprg_lock); + return (EPERM); + } + } + + if (rg->tcprg_count == rg->tcprg_size) { + unsigned int oldalloc = rg->tcprg_size * sizeof (tcp_t *); + unsigned int newsize = rg->tcprg_size + TCP_RG_SIZE_STEP; + tcp_t **newmembers; + + if (newsize > TCP_RG_SIZE_MAX) { + mutex_exit(&rg->tcprg_lock); + return (EINVAL); + } + newmembers = kmem_zalloc(newsize * sizeof (tcp_t *), + KM_NOSLEEP_LAZY); + if (newmembers == NULL) { + mutex_exit(&rg->tcprg_lock); + return (ENOMEM); + } + bcopy(rg->tcprg_members, newmembers, oldalloc); + kmem_free(rg->tcprg_members, oldalloc); + rg->tcprg_members = newmembers; + rg->tcprg_size = newsize; + } + + rg->tcprg_members[rg->tcprg_count] = tcp; + rg->tcprg_count++; + rg->tcprg_active++; + + mutex_exit(&rg->tcprg_lock); + return (0); +} + +boolean_t +tcp_rg_remove(tcp_rg_t *rg, tcp_t *tcp) +{ + int i; + boolean_t is_empty; + + mutex_enter(&rg->tcprg_lock); + for (i = 0; i < rg->tcprg_count; i++) { + if (rg->tcprg_members[i] == tcp) + break; + } + /* The item should be present */ + ASSERT(i < rg->tcprg_count); + /* Move the last member into this position */ + rg->tcprg_count--; + rg->tcprg_members[i] = rg->tcprg_members[rg->tcprg_count]; + rg->tcprg_members[rg->tcprg_count] = NULL; + if (tcp->tcp_connp->conn_reuseport != 0) + rg->tcprg_active--; + is_empty = (rg->tcprg_count == 0); + mutex_exit(&rg->tcprg_lock); + return (is_empty); +} + +void +tcp_rg_setactive(tcp_rg_t *rg, boolean_t is_active) +{ + mutex_enter(&rg->tcprg_lock); + if (is_active) { + rg->tcprg_active++; + } else { + rg->tcprg_active--; + } + mutex_exit(&rg->tcprg_lock); +} diff --git a/usr/src/uts/common/inet/tcp/tcp_input.c b/usr/src/uts/common/inet/tcp/tcp_input.c index dd264528fc..22b0019a6a 100644 --- a/usr/src/uts/common/inet/tcp/tcp_input.c +++ b/usr/src/uts/common/inet/tcp/tcp_input.c @@ -5715,10 
+5715,12 @@ noticmpv4: switch (icmph->icmph_code) { case ICMP_FRAGMENTATION_NEEDED: /* - * Update Path MTU, then try to send something out. + * Attempt to update path MTU and, if the MSS of the + * connection is altered, retransmit outstanding data. */ - tcp_update_pmtu(tcp, B_TRUE); - tcp_rexmit_after_error(tcp); + if (tcp_update_pmtu(tcp, B_TRUE)) { + tcp_rexmit_after_error(tcp); + } break; case ICMP_PORT_UNREACHABLE: case ICMP_PROTOCOL_UNREACHABLE: @@ -5761,7 +5763,7 @@ noticmpv4: break; } break; - case ICMP_SOURCE_QUENCH: { + case ICMP_SOURCE_QUENCH: /* * use a global boolean to control * whether TCP should respond to ICMP_SOURCE_QUENCH. @@ -5786,7 +5788,6 @@ noticmpv4: } break; } - } freemsg(mp); } @@ -5839,10 +5840,12 @@ noticmpv6: switch (icmp6->icmp6_type) { case ICMP6_PACKET_TOO_BIG: /* - * Update Path MTU, then try to send something out. + * Attempt to update path MTU and, if the MSS of the connection + * is altered, retransmit outstanding data. */ - tcp_update_pmtu(tcp, B_TRUE); - tcp_rexmit_after_error(tcp); + if (tcp_update_pmtu(tcp, B_TRUE)) { + tcp_rexmit_after_error(tcp); + } break; case ICMP6_DST_UNREACH: switch (icmp6->icmp6_code) { diff --git a/usr/src/uts/common/inet/tcp/tcp_opt_data.c b/usr/src/uts/common/inet/tcp/tcp_opt_data.c index 8687b52d53..15e49ae070 100644 --- a/usr/src/uts/common/inet/tcp/tcp_opt_data.c +++ b/usr/src/uts/common/inet/tcp/tcp_opt_data.c @@ -67,7 +67,8 @@ opdes_t tcp_opt_arr[] = { { SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_REUSEPORT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, { SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, @@ -505,6 +506,104 @@ tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) } /* + * Set a TCP connection's participation in SO_REUSEPORT. This operation is + * performed under the protection of the squeue via tcp_setsockopt. + * The manipulation of tcp_rg_bind, as part of this operation, is subject to + * these constraints: + * 1. Prior to bind(), tcp_rg_bind can be set/cleared in tcp_set_reuseport + * under the protection of the squeue. + * 2. Once the connection has been bound, the tcp_rg_bind pointer must not be + * altered until such time as tcp_free() cleans up the connection. + * 3. A connection undergoing bind, which matches to a connection participating + * in port-reuse, will switch its tcp_rg_bind pointer when it joins the + * group of an existing connection in tcp_bindi(). + */ +static int +tcp_set_reuseport(conn_t *connp, boolean_t do_enable) +{ + tcp_t *tcp = connp->conn_tcp; + struct tcp_rg_s *rg; + + if (!IPCL_IS_NONSTR(connp)) { + if (do_enable) { + /* + * SO_REUSEPORT cannot be enabled on sockets which have + * fallen back to the STREAMS API. + */ + return (EINVAL); + } else { + /* + * A connection with SO_REUSEPORT enabled should be + * prevented from falling back to STREAMS mode via + * logic in tcp_fallback. It is legal, however, for + * fallen-back connections to affirm the disabled state + * of SO_REUSEPORT. 
+ */ + ASSERT(connp->conn_reuseport == 0); + return (0); + } + } + if (tcp->tcp_state <= TCPS_CLOSED) { + return (EINVAL); + } + if (connp->conn_reuseport == 0 && do_enable) { + /* disabled -> enabled */ + if (tcp->tcp_rg_bind != NULL) { + tcp_rg_setactive(tcp->tcp_rg_bind, do_enable); + } else { + /* + * Connection state is not a concern when initially + * populating tcp_rg_bind. Setting it to non-NULL on a + * bound or listening connection would only mean that + * new reused-port binds become a possibility. + */ + if ((rg = tcp_rg_init(tcp)) == NULL) { + return (ENOMEM); + } + tcp->tcp_rg_bind = rg; + } + connp->conn_reuseport = 1; + } else if (connp->conn_reuseport != 0 && !do_enable) { + /* enabled -> disabled */ + ASSERT(tcp->tcp_rg_bind != NULL); + if (tcp->tcp_state == TCPS_IDLE) { + /* + * If the connection has not been bound yet, discard + * the reuse group state. Since disabling SO_REUSEPORT + * on a bound socket will _not_ prevent others from + * reusing the port, the presence of tcp_rg_bind is + * used to determine reuse availability, not + * conn_reuseport. + * + * This allows proper behavior for examples such as: + * + * setsockopt(fd1, ... SO_REUSEPORT, &on_val...); + * bind(fd1, &myaddr, ...); + * setsockopt(fd1, ... SO_REUSEPORT, &off_val...); + * + * setsockopt(fd2, ... SO_REUSEPORT, &on_val...); + * bind(fd2, &myaddr, ...); // <- SHOULD SUCCEED + * + */ + rg = tcp->tcp_rg_bind; + tcp->tcp_rg_bind = NULL; + VERIFY(tcp_rg_remove(rg, tcp)); + tcp_rg_destroy(rg); + } else { + /* + * If a connection has been bound, it's no longer safe + * to manipulate tcp_rg_bind until connection clean-up + * during tcp_free. Just mark the member status of the + * connection as inactive. + */ + tcp_rg_setactive(tcp->tcp_rg_bind, do_enable); + } + connp->conn_reuseport = 0; + } + return (0); +} + +/* * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements. * Parameters are assumed to be verified by the caller. */ @@ -674,6 +773,11 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, } *outlenp = inlen; return (0); + case SO_REUSEPORT: + if (!checkonly) { + return (tcp_set_reuseport(connp, *i1 != 0)); + } + return (0); } break; case IPPROTO_TCP: @@ -1031,10 +1135,6 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, } break; case IPPROTO_IP: - if (connp->conn_family != AF_INET) { - *outlenp = 0; - return (EINVAL); - } switch (name) { case IP_SEC_OPT: /* diff --git a/usr/src/uts/common/inet/tcp/tcp_socket.c b/usr/src/uts/common/inet/tcp/tcp_socket.c index 9b6c0daac3..32422be675 100644 --- a/usr/src/uts/common/inet/tcp/tcp_socket.c +++ b/usr/src/uts/common/inet/tcp/tcp_socket.c @@ -1029,6 +1029,16 @@ tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q, } /* + * Do not allow fallback on connections making use of SO_REUSEPORT. + */ + if (tcp->tcp_rg_bind != NULL) { + freeb(stropt_mp); + freeb(ordrel_mp); + squeue_synch_exit(connp, SQ_NODRAIN); + return (EINVAL); + } + + /* * Both endpoints must be of the same type (either STREAMS or * non-STREAMS) for fusion to be enabled. So if we are fused, * we have to unfuse. diff --git a/usr/src/uts/common/inet/tcp/tcp_stats.c b/usr/src/uts/common/inet/tcp/tcp_stats.c index e29c76a696..226467e167 100644 --- a/usr/src/uts/common/inet/tcp/tcp_stats.c +++ b/usr/src/uts/common/inet/tcp/tcp_stats.c @@ -21,8 +21,8 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, Joyent Inc. All rights reserved. * Copyright (c) 2015, 2016 by Delphix. 
All rights reserved. + * Copyright 2019 Joyent, Inc. * Copyright 2019 OmniOS Community Edition (OmniOSce) Association. */ @@ -131,9 +131,14 @@ tcp_set_conninfo(tcp_t *tcp, struct tcpConnEntryInfo_s *tcei, boolean_t ispriv) tcei->ce_rto = tcp->tcp_rto; tcei->ce_mss = tcp->tcp_mss; tcei->ce_state = tcp->tcp_state; - tcei->ce_rtt_sa = NSEC2USEC(tcp->tcp_rtt_sa >> 3); tcei->ce_rtt_sum = NSEC2USEC(tcp->tcp_rtt_sum); tcei->ce_rtt_cnt = tcp->tcp_rtt_cnt; + + /* tcp_rtt_sa is stored as 8 times the average RTT */ + tcei->ce_rtt_sa = NSEC2USEC(tcp->tcp_rtt_sa >> 3); + + /* tcp_rtt_sd is stored as 4 times the average RTTVAR */ + tcei->ce_rtt_sd = NSEC2USEC(tcp->tcp_rtt_sd >> 2); } /* diff --git a/usr/src/uts/common/inet/tcp/tcp_timers.c b/usr/src/uts/common/inet/tcp/tcp_timers.c index 5793a7fd27..7d9b449392 100644 --- a/usr/src/uts/common/inet/tcp/tcp_timers.c +++ b/usr/src/uts/common/inet/tcp/tcp_timers.c @@ -22,7 +22,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright 2011 Joyent, Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. * Copyright (c) 2014, 2017 by Delphix. All rights reserved. */ diff --git a/usr/src/uts/common/inet/tcp_impl.h b/usr/src/uts/common/inet/tcp_impl.h index 5669592cff..61af05f749 100644 --- a/usr/src/uts/common/inet/tcp_impl.h +++ b/usr/src/uts/common/inet/tcp_impl.h @@ -61,9 +61,9 @@ extern sock_downcalls_t sock_tcp_downcalls; * by setting it to 0. */ #define TCP_XMIT_LOWATER 4096 -#define TCP_XMIT_HIWATER 49152 +#define TCP_XMIT_HIWATER 128000 #define TCP_RECV_LOWATER 2048 -#define TCP_RECV_HIWATER 128000 +#define TCP_RECV_HIWATER 1048576 /* * Bind hash list size and hash function. It has to be a power of 2 for @@ -395,6 +395,22 @@ typedef struct tcp_listen_cnt_s { uint32_t tlc_drop; } tcp_listen_cnt_t; +/* + * Track tcp_t entities bound to the same port/address tuple via SO_REUSEPORT. + * - tcprg_lock: Protects the other fields + * - tcprg_size: Allocated size (in entries) of tcprg_members array + * - tcprg_count: Count of occupied tcprg_members slots + * - tcprg_active: Count of members which still have SO_REUSEPORT set + * - tcprg_members: Connections associated with address/port group + */ +typedef struct tcp_rg_s { + kmutex_t tcprg_lock; + unsigned int tcprg_size; + unsigned int tcprg_count; + unsigned int tcprg_active; + tcp_t **tcprg_members; +} tcp_rg_t; + #define TCP_TLC_REPORT_INTERVAL (30 * MINUTES) #define TCP_DECR_LISTEN_CNT(tcp) \ @@ -678,7 +694,7 @@ extern int tcp_rwnd_set(tcp_t *, uint32_t); extern int tcp_set_destination(tcp_t *); extern void tcp_set_ws_value(tcp_t *); extern void tcp_stop_lingering(tcp_t *); -extern void tcp_update_pmtu(tcp_t *, boolean_t); +extern boolean_t tcp_update_pmtu(tcp_t *, boolean_t); extern mblk_t *tcp_zcopy_backoff(tcp_t *, mblk_t *, boolean_t); extern boolean_t tcp_zcopy_check(tcp_t *); extern void tcp_zcopy_notify(tcp_t *); @@ -695,6 +711,10 @@ extern in_port_t tcp_bindi(tcp_t *, in_port_t, const in6_addr_t *, int, boolean_t, boolean_t, boolean_t); extern in_port_t tcp_update_next_port(in_port_t, const tcp_t *, boolean_t); +extern tcp_rg_t *tcp_rg_init(tcp_t *); +extern boolean_t tcp_rg_remove(tcp_rg_t *, tcp_t *); +extern void tcp_rg_destroy(tcp_rg_t *); +extern void tcp_rg_setactive(tcp_rg_t *, boolean_t); /* * Fusion related functions in tcp_fusion.c.
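The scaled fixed-point RTT fields exported by tcp_set_conninfo() above are easier to see with concrete numbers. The following standalone sketch (not part of the patch) mirrors the two conversions; it assumes the kernel's NSEC2USEC macro simply divides nanoseconds by 1000, and the SRTT/RTTVAR values are made up for illustration.

#include <stdio.h>
#include <stdint.h>

#define	NSEC2USEC(n)	((n) / 1000)

int
main(void)
{
	/* Classic Van Jacobson scaling: sa = 8 * SRTT, sd = 4 * RTTVAR. */
	int64_t tcp_rtt_sa = 8LL * 42 * 1000000;  /* SRTT = 42 ms, in nsec */
	int64_t tcp_rtt_sd = 4LL * 5 * 1000000;   /* RTTVAR = 5 ms, in nsec */

	/* Mirror the conversions performed for tcpConnEntryInfo_s. */
	(void) printf("ce_rtt_sa = %lld usec\n",
	    (long long)NSEC2USEC(tcp_rtt_sa >> 3));	/* prints 42000 */
	(void) printf("ce_rtt_sd = %lld usec\n",
	    (long long)NSEC2USEC(tcp_rtt_sd >> 2));	/* prints 5000 */
	return (0);
}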
diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c index 5d42a69fa2..4e208465f2 100644 --- a/usr/src/uts/common/inet/udp/udp.c +++ b/usr/src/uts/common/inet/udp/udp.c @@ -1671,6 +1671,11 @@ udp_opt_get(conn_t *connp, t_scalar_t level, t_scalar_t name, *i1 = udp->udp_vxlanhash; mutex_exit(&connp->conn_lock); return (sizeof (int)); + case UDP_SND_TO_CONNECTED: + mutex_enter(&connp->conn_lock); + *i1 = udp->udp_snd_to_conn ? 1 : 0; + mutex_exit(&connp->conn_lock); + return (sizeof (int)); } } mutex_enter(&connp->conn_lock); @@ -1826,6 +1831,11 @@ udp_do_opt_set(conn_opt_arg_t *coa, int level, int name, } /* Fully handled this option. */ return (0); + case UDP_SND_TO_CONNECTED: + mutex_enter(&connp->conn_lock); + udp->udp_snd_to_conn = onoff; + mutex_exit(&connp->conn_lock); + return (0); } break; } @@ -6096,10 +6106,18 @@ udp_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, else return (error); } - if (udp->udp_state == TS_DATA_XFER) { + + /* + * Check if we're allowed to send to a connection on which we've + * already called 'connect'. POSIX allows both behaviors, but + * historically we've returned an error if already connected. The + * client can allow this via a socket option. + */ + if (udp->udp_state == TS_DATA_XFER && !udp->udp_snd_to_conn) { UDPS_BUMP_MIB(us, udpOutErrors); return (EISCONN); } + error = proto_verify_ip_addr(connp->conn_family, (struct sockaddr *)msg->msg_name, msg->msg_namelen); if (error != 0) { diff --git a/usr/src/uts/common/inet/udp/udp_opt_data.c b/usr/src/uts/common/inet/udp/udp_opt_data.c index c8e7d79e47..9c05b8c876 100644 --- a/usr/src/uts/common/inet/udp/udp_opt_data.c +++ b/usr/src/uts/common/inet/udp/udp_opt_data.c @@ -294,7 +294,9 @@ opdes_t udp_opt_arr[] = { }, { UDP_NAT_T_ENDPOINT, IPPROTO_UDP, OA_RW, OA_RW, OP_PRIVPORT, 0, sizeof (int), 0 }, -{ UDP_SRCPORT_HASH, IPPROTO_UDP, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), 0 } +{ UDP_SRCPORT_HASH, IPPROTO_UDP, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), 0 }, +{ UDP_SND_TO_CONNECTED, IPPROTO_UDP, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), + 0 } }; /* diff --git a/usr/src/uts/common/inet/udp_impl.h b/usr/src/uts/common/inet/udp_impl.h index 0fc597ccf3..ef11973707 100644 --- a/usr/src/uts/common/inet/udp_impl.h +++ b/usr/src/uts/common/inet/udp_impl.h @@ -179,12 +179,12 @@ typedef struct udp_s { udp_issocket : 1, /* socket mode; sockfs is on top */ udp_nat_t_endpoint : 1, /* UDP_NAT_T_ENDPOINT option */ udp_rcvhdr : 1, /* UDP_RCVHDR option */ udp_vxlanhash: 1, /* UDP_SRCPORT_HASH option */ /* Because there's only VXLAN, cheat */ /* and only use a single bit */ + udp_snd_to_conn: 1, /* UDP_SND_TO_CONNECTED option */ - udp_pad_to_bit_31 : 28; + udp_pad_to_bit_31 : 27; /* Following 2 fields protected by the uf_lock */ struct udp_s *udp_bind_hash; /* Bind hash chain */
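To round out the UDP_SND_TO_CONNECTED change, here is a hedged userland sketch (not part of the patch) of how a client might use the option: after connect(), a sendto() with an explicit destination would historically fail with EISCONN, while with the option enabled it is permitted. It assumes UDP_SND_TO_CONNECTED is visible via <netinet/udp.h> on a build carrying this change; note the OP_CONFIG marking in udp_opt_arr above suggests setting the option may require privilege.

#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/udp.h>
#include <stdio.h>
#include <string.h>

int
main(void)
{
	struct sockaddr_in dst;
	int fd, on = 1;

	if ((fd = socket(AF_INET, SOCK_DGRAM, 0)) < 0) {
		perror("socket");
		return (1);
	}
	(void) memset(&dst, 0, sizeof (dst));
	dst.sin_family = AF_INET;
	dst.sin_port = htons(5353);
	dst.sin_addr.s_addr = htonl(INADDR_LOOPBACK);

	if (connect(fd, (struct sockaddr *)&dst, sizeof (dst)) != 0) {
		perror("connect");
		return (1);
	}
	if (setsockopt(fd, IPPROTO_UDP, UDP_SND_TO_CONNECTED, &on,
	    sizeof (on)) != 0) {
		perror("setsockopt");	/* may fail without privilege */
		return (1);
	}
	/* Without the option, this sendto() would return EISCONN. */
	if (sendto(fd, "ping", 4, 0, (struct sockaddr *)&dst,
	    sizeof (dst)) < 0) {
		perror("sendto");
		return (1);
	}
	return (0);
}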