diff options
Diffstat (limited to 'usr/src/uts/common/inet')
32 files changed, 1865 insertions, 178 deletions
diff --git a/usr/src/uts/common/inet/bpf.h b/usr/src/uts/common/inet/bpf.h new file mode 100644 index 0000000000..e3eac799e5 --- /dev/null +++ b/usr/src/uts/common/inet/bpf.h @@ -0,0 +1,49 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Joyent, Inc. + */ + +#ifndef _INET_BPF_H +#define _INET_BPF_H + +#ifdef __cplusplus +extern "C" { +#endif + + +#ifdef _KERNEL + +#include <sys/types.h> + +/* + * Clone bpf_insn definition so that consumers don't need net/bpf.h to reason + * about struct sizing. + */ +typedef struct ip_bpf_insn { + uint16_t code; + uint8_t jt; + uint8_t jf; + uint32_t k; +} ip_bpf_insn_t; + +extern uint32_t ip_bpf_filter(ip_bpf_insn_t *, uchar_t *, uint_t, uint_t); +extern boolean_t ip_bpf_validate(ip_bpf_insn_t *, uint_t); + + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _INET_BPF_H */ diff --git a/usr/src/uts/common/inet/bpf_filter.c b/usr/src/uts/common/inet/bpf_filter.c new file mode 100644 index 0000000000..5a9ba38da6 --- /dev/null +++ b/usr/src/uts/common/inet/bpf_filter.c @@ -0,0 +1,572 @@ +/* $NetBSD: bpf_filter.c,v 1.35 2008/08/20 13:01:54 joerg Exp $ */ + +/* + * Copyright (c) 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from the Stanford/CMU enet packet filter, + * (net/enet.c) distributed as part of 4.3BSD, and code contributed + * to Berkeley by Steven McCanne and Van Jacobson both of Lawrence + * Berkeley Laboratory. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)bpf_filter.c 8.1 (Berkeley) 6/10/93 + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2016 Joyent, Inc. 
+ */ + +#include <sys/param.h> +#include <sys/time.h> +#include <sys/stream.h> +#include <sys/byteorder.h> +#include <sys/sdt.h> +#include <inet/bpf.h> +#include <net/bpf.h> + +#define EXTRACT_SHORT(p) BE_IN16(p) +#define EXTRACT_LONG(p) BE_IN32(p) + +#define M_LEN(_m) ((_m)->b_wptr - (_m)->b_rptr) +#define mtod(_a, _t) ((_t)((_a)->b_rptr)) +#define MINDEX(len, m, k) \ +{ \ + len = M_LEN(m); \ + while (k >= len) { \ + k -= len; \ + m = m->b_cont; \ + if (m == 0) \ + return (0); \ + len = M_LEN(m); \ + } \ +} + +static int m_xword(mblk_t *, uint32_t, int *); +static int m_xhalf(mblk_t *, uint32_t, int *); + +static int +m_xword(mblk_t *m, uint32_t k, int *err) +{ + int len; + uchar_t *cp, *np; + mblk_t *m0; + + *err = 1; + MINDEX(len, m, k); + cp = mtod(m, uchar_t *) + k; + if (len >= k + 4) { + *err = 0; + return (EXTRACT_LONG(cp)); + } + m0 = m->b_cont; + if (m0 == 0 || M_LEN(m0) + len - k < 4) { + DTRACE_PROBE3(mblk_xword_fail, mblk_t *, m0, int, len, int, k); + return (0); + } + *err = 0; + np = mtod(m0, uchar_t *); + switch (len - k) { + + case 1: + return ((cp[0] << 24) | (np[0] << 16) | (np[1] << 8) | np[2]); + + case 2: + return ((cp[0] << 24) | (cp[1] << 16) | (np[0] << 8) | np[1]); + + default: + return ((cp[0] << 24) | (cp[1] << 16) | (cp[2] << 8) | np[0]); + } +} + +static int +m_xhalf(mblk_t *m, uint32_t k, int *err) +{ + int len; + uchar_t *cp; + mblk_t *m0; + + *err = 1; + MINDEX(len, m, k); + cp = mtod(m, uchar_t *) + k; + if (len >= k + 2) { + *err = 0; + return (EXTRACT_SHORT(cp)); + } + m0 = m->b_cont; + if (m0 == 0) { + DTRACE_PROBE3(mblk_xhalf_fail, mblk_t *, m0, int, len, int, k); + return (0); + } + *err = 0; + return ((cp[0] << 8) | mtod(m0, uchar_t *)[0]); +} + + +/* + * Execute the filter program starting at pc on the packet p + * wirelen is the length of the original packet + * buflen is the amount of data present + * When buflen is non-0, p is a pointer to a the start of the packet and the + * packet is only in one mblk_t. 
+ * When buflen is 0, p is an mblk_t pointer. + */ +uint32_t +ip_bpf_filter(ip_bpf_insn_t *pc, uchar_t *p, uint_t wirelen, uint_t buflen) +{ + uint32_t A, X, k; + uint32_t mem[BPF_MEMWORDS]; + + if (pc == 0) + /* + * No filter means accept all. + */ + return ((uint32_t)-1); + A = 0; + X = 0; + --pc; + /* CONSTCOND */ + while (1) { + ++pc; + switch (pc->code) { + + default: +#ifdef _KERNEL + DTRACE_PROBE1(bpf_insn_unknown, + struct bpf_insn *, pc); + return (0); +#else + abort(); +#endif + case BPF_RET|BPF_K: + return (pc->k); + + case BPF_RET|BPF_A: + return (A); + + case BPF_LD|BPF_W|BPF_ABS: + k = pc->k; + if (k + sizeof (int32_t) > buflen) { +#ifdef _KERNEL + int merr = 0; + + if (buflen != 0) + return (0); + A = m_xword((mblk_t *)p, k, &merr); + if (merr != 0) + return (0); + continue; +#else + return (0); +#endif + } + A = EXTRACT_LONG(&p[k]); + continue; + + case BPF_LD|BPF_H|BPF_ABS: + k = pc->k; + if (k + sizeof (int16_t) > buflen) { +#ifdef _KERNEL + int merr; + + if (buflen != 0) + return (0); + A = m_xhalf((mblk_t *)p, k, &merr); + if (merr != 0) + return (0); + continue; +#else + return (0); +#endif + } + A = EXTRACT_SHORT(&p[k]); + continue; + + case BPF_LD|BPF_B|BPF_ABS: + k = pc->k; + if (k >= buflen) { +#ifdef _KERNEL + mblk_t *m; + int len; + + if (buflen != 0) + return (0); + m = (mblk_t *)p; + MINDEX(len, m, k); + A = mtod(m, uchar_t *)[k]; + continue; +#else + return (0); +#endif + } + A = p[k]; + continue; + + case BPF_LD|BPF_W|BPF_LEN: + A = wirelen; + continue; + + case BPF_LDX|BPF_W|BPF_LEN: + X = wirelen; + continue; + + case BPF_LD|BPF_W|BPF_IND: + k = X + pc->k; + if (k + sizeof (int32_t) > buflen) { +#ifdef _KERNEL + int merr = 0; + + if (buflen != 0) + return (0); + A = m_xword((mblk_t *)p, k, &merr); + if (merr != 0) + return (0); + continue; +#else + return (0); +#endif + } + A = EXTRACT_LONG(&p[k]); + continue; + + case BPF_LD|BPF_H|BPF_IND: + k = X + pc->k; + if (k + sizeof (int16_t) > buflen) { +#ifdef _KERNEL + int merr = 0; + + 
if (buflen != 0) + return (0); + A = m_xhalf((mblk_t *)p, k, &merr); + if (merr != 0) + return (0); + continue; +#else + return (0); +#endif + } + A = EXTRACT_SHORT(&p[k]); + continue; + + case BPF_LD|BPF_B|BPF_IND: + k = X + pc->k; + if (k >= buflen) { +#ifdef _KERNEL + mblk_t *m; + int len; + + if (buflen != 0) + return (0); + m = (mblk_t *)p; + MINDEX(len, m, k); + A = mtod(m, uchar_t *)[k]; + continue; +#else + return (0); +#endif + } + A = p[k]; + continue; + + case BPF_LDX|BPF_MSH|BPF_B: + k = pc->k; + if (k >= buflen) { +#ifdef _KERNEL + mblk_t *m; + int len; + + if (buflen != 0) + return (0); + m = (mblk_t *)p; + MINDEX(len, m, k); + X = (mtod(m, char *)[k] & 0xf) << 2; + continue; +#else + return (0); +#endif + } + X = (p[pc->k] & 0xf) << 2; + continue; + + case BPF_LD|BPF_IMM: + A = pc->k; + continue; + + case BPF_LDX|BPF_IMM: + X = pc->k; + continue; + + case BPF_LD|BPF_MEM: + A = mem[pc->k]; + continue; + + case BPF_LDX|BPF_MEM: + X = mem[pc->k]; + continue; + + case BPF_ST: + mem[pc->k] = A; + continue; + + case BPF_STX: + mem[pc->k] = X; + continue; + + case BPF_JMP|BPF_JA: + pc += pc->k; + continue; + + case BPF_JMP|BPF_JGT|BPF_K: + pc += (A > pc->k) ? pc->jt : pc->jf; + continue; + + case BPF_JMP|BPF_JGE|BPF_K: + pc += (A >= pc->k) ? pc->jt : pc->jf; + continue; + + case BPF_JMP|BPF_JEQ|BPF_K: + pc += (A == pc->k) ? pc->jt : pc->jf; + continue; + + case BPF_JMP|BPF_JSET|BPF_K: + pc += (A & pc->k) ? pc->jt : pc->jf; + continue; + + case BPF_JMP|BPF_JGT|BPF_X: + pc += (A > X) ? pc->jt : pc->jf; + continue; + + case BPF_JMP|BPF_JGE|BPF_X: + pc += (A >= X) ? pc->jt : pc->jf; + continue; + + case BPF_JMP|BPF_JEQ|BPF_X: + pc += (A == X) ? pc->jt : pc->jf; + continue; + + case BPF_JMP|BPF_JSET|BPF_X: + pc += (A & X) ? 
pc->jt : pc->jf; + continue; + + case BPF_ALU|BPF_ADD|BPF_X: + A += X; + continue; + + case BPF_ALU|BPF_SUB|BPF_X: + A -= X; + continue; + + case BPF_ALU|BPF_MUL|BPF_X: + A *= X; + continue; + + case BPF_ALU|BPF_DIV|BPF_X: + if (X == 0) + return (0); + A /= X; + continue; + + case BPF_ALU|BPF_AND|BPF_X: + A &= X; + continue; + + case BPF_ALU|BPF_OR|BPF_X: + A |= X; + continue; + + case BPF_ALU|BPF_LSH|BPF_X: + A <<= X; + continue; + + case BPF_ALU|BPF_RSH|BPF_X: + A >>= X; + continue; + + case BPF_ALU|BPF_ADD|BPF_K: + A += pc->k; + continue; + + case BPF_ALU|BPF_SUB|BPF_K: + A -= pc->k; + continue; + + case BPF_ALU|BPF_MUL|BPF_K: + A *= pc->k; + continue; + + case BPF_ALU|BPF_DIV|BPF_K: + A /= pc->k; + continue; + + case BPF_ALU|BPF_AND|BPF_K: + A &= pc->k; + continue; + + case BPF_ALU|BPF_OR|BPF_K: + A |= pc->k; + continue; + + case BPF_ALU|BPF_LSH|BPF_K: + A <<= pc->k; + continue; + + case BPF_ALU|BPF_RSH|BPF_K: + A >>= pc->k; + continue; + + case BPF_ALU|BPF_NEG: + A = -A; + continue; + + case BPF_MISC|BPF_TAX: + X = A; + continue; + + case BPF_MISC|BPF_TXA: + A = X; + continue; + } + } + /* NOTREACHED */ +} + +/* + * Return true if the 'fcode' is a valid filter program. + * The constraints are that each jump be forward and to a valid + * code, that memory accesses are within valid ranges (to the + * extent that this can be checked statically; loads of packet + * data have to be, and are, also checked at run time), and that + * the code terminates with either an accept or reject. + * + * The kernel needs to be able to verify an application's filter code. + * Otherwise, a bogus program could easily crash the system. + */ +boolean_t +ip_bpf_validate(ip_bpf_insn_t *f, uint_t len) +{ + uint_t i, from; + ip_bpf_insn_t *p; + + if (len < 1 || len > BPF_MAXINSNS) + return (B_FALSE); + + for (i = 0; i < len; ++i) { + p = &f[i]; + DTRACE_PROBE1(bpf_valid_insn, struct bpf_insn *, p); + switch (BPF_CLASS(p->code)) { + /* + * Check that memory operations use valid addresses. 
+ */ + case BPF_LD: + case BPF_LDX: + switch (BPF_MODE(p->code)) { + case BPF_MEM: + if (p->k >= BPF_MEMWORDS) + return (B_FALSE); + break; + case BPF_ABS: + case BPF_IND: + case BPF_MSH: + case BPF_IMM: + case BPF_LEN: + break; + default: + return (B_FALSE); + } + break; + case BPF_ST: + case BPF_STX: + if (p->k >= BPF_MEMWORDS) + return (B_FALSE); + break; + case BPF_ALU: + switch (BPF_OP(p->code)) { + case BPF_ADD: + case BPF_SUB: + case BPF_MUL: + case BPF_OR: + case BPF_AND: + case BPF_LSH: + case BPF_RSH: + case BPF_NEG: + break; + case BPF_DIV: + /* + * Check for constant division by 0. + */ + if (BPF_RVAL(p->code) == BPF_K && p->k == 0) + return (B_FALSE); + break; + default: + return (B_FALSE); + } + break; + case BPF_JMP: + /* + * Check that jumps are within the code block, + * and that unconditional branches don't go + * backwards as a result of an overflow. + * Unconditional branches have a 32-bit offset, + * so they could overflow; we check to make + * sure they don't. Conditional branches have + * an 8-bit offset, and the from address is <= + * BPF_MAXINSNS, and we assume that BPF_MAXINSNS + * is sufficiently small that adding 255 to it + * won't overflow. + * + * We know that len is <= BPF_MAXINSNS, and we + * assume that BPF_MAXINSNS is < the maximum size + * of a uint_t, so that i + 1 doesn't overflow. 
+ */ + from = i + 1; + switch (BPF_OP(p->code)) { + case BPF_JA: + if (from + p->k < from || from + p->k >= len) + return (B_FALSE); + break; + case BPF_JEQ: + case BPF_JGT: + case BPF_JGE: + case BPF_JSET: + if (from + p->jt >= len || from + p->jf >= len) + return (B_FALSE); + break; + default: + return (B_FALSE); + } + break; + case BPF_RET: + break; + case BPF_MISC: + break; + default: + return (B_FALSE); + } + } + + return (BPF_CLASS(f[len - 1].code) == BPF_RET); +} diff --git a/usr/src/uts/common/inet/inet_hash.h b/usr/src/uts/common/inet/inet_hash.h new file mode 100644 index 0000000000..a790a797d1 --- /dev/null +++ b/usr/src/uts/common/inet/inet_hash.h @@ -0,0 +1,37 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2015 Joyent, Inc. + */ + +#ifndef _INET_INET_HASH_H +#define _INET_INET_HASH_H + +/* + * Common packet hashing routines shared across MAC, UDP, and others. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#define INET_PKT_HASH_L2 0x01 +#define INET_PKT_HASH_L3 0x02 +#define INET_PKT_HASH_L4 0x04 + +extern uint64_t inet_pkt_hash(uint_t, mblk_t *, uint8_t); + +#ifdef __cplusplus +} +#endif + +#endif /* _INET_INET_HASH_H */ diff --git a/usr/src/uts/common/inet/ip/conn_opt.c b/usr/src/uts/common/inet/ip/conn_opt.c index bcbc1c4949..b4bff4d7b4 100644 --- a/usr/src/uts/common/inet/ip/conn_opt.c +++ b/usr/src/uts/common/inet/ip/conn_opt.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc. */ /* Copyright (c) 1990 Mentat Inc. 
*/ @@ -619,6 +620,9 @@ conn_opt_get(conn_opt_arg_t *coa, t_scalar_t level, t_scalar_t name, case SO_REUSEADDR: *i1 = connp->conn_reuseaddr ? SO_REUSEADDR : 0; break; /* goto sizeof (int) option return */ + case SO_REUSEPORT: + *i1 = connp->conn_reuseport; + break; /* goto sizeof (int) option return */ case SO_TYPE: *i1 = connp->conn_so_type; break; /* goto sizeof (int) option return */ @@ -1186,8 +1190,24 @@ conn_opt_set_ip(conn_opt_arg_t *coa, t_scalar_t name, uint_t inlen, ip_stack_t *ipst = connp->conn_netstack->netstack_ip; int error; - if (connp->conn_family != AF_INET) + if (connp->conn_family == AF_INET6 && + connp->conn_ipversion == IPV4_VERSION) { + /* + * Allow certain IPv4 options to be set on an AF_INET6 socket + * if the connection is still IPv4. + */ + switch (name) { + case IP_TOS: + case T_IP_TOS: + case IP_TTL: + case IP_DONTFRAG: + break; + default: + return (EINVAL); + } + } else if (connp->conn_family != AF_INET) { return (EINVAL); + } switch (name) { case IP_TTL: diff --git a/usr/src/uts/common/inet/ip/icmp.c b/usr/src/uts/common/inet/ip/icmp.c index a4abdbd130..01119bf65b 100644 --- a/usr/src/uts/common/inet/ip/icmp.c +++ b/usr/src/uts/common/inet/ip/icmp.c @@ -22,6 +22,7 @@ * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -80,6 +81,7 @@ #include <sys/tsol/tnet.h> #include <inet/rawip_impl.h> +#include <net/bpf.h> #include <sys/disp.h> @@ -1011,6 +1013,12 @@ icmp_close_free(conn_t *connp) icmp->icmp_filter = NULL; } + if (icmp->icmp_bpf_len != 0) { + kmem_free(icmp->icmp_bpf_prog, icmp->icmp_bpf_len); + icmp->icmp_bpf_len = 0; + icmp->icmp_bpf_prog = NULL; + } + /* * Clear any fields which the kmem_cache constructor clears. * Only icmp_connp needs to be preserved. 
@@ -1964,6 +1972,104 @@ icmp_tpi_opt_get(queue_t *q, int level, int name, uchar_t *ptr) return (err); } +static int +icmp_attach_filter(icmp_t *icmp, uint_t inlen, const uchar_t *invalp) +{ + struct bpf_program prog; + ip_bpf_insn_t *insns = NULL; + unsigned int size; + +#ifdef _LP64 + if (get_udatamodel() != DATAMODEL_NATIVE) { + struct bpf_program32 *prog32; + + if (inlen != sizeof (struct bpf_program32)) { + return (EINVAL); + } + prog32 = (struct bpf_program32 *)invalp; + prog.bf_len = prog32->bf_len; + prog.bf_insns = (void *)(uint64_t)prog32->bf_insns; + } else +#endif + if (inlen == sizeof (struct bpf_program)) { + bcopy(invalp, &prog, sizeof (prog)); + } else { + return (EINVAL); + } + + if (prog.bf_len > BPF_MAXINSNS || prog.bf_len == 0) { + return (EINVAL); + } + size = prog.bf_len * sizeof (struct bpf_insn); + insns = kmem_alloc(size, KM_SLEEP); + if (copyin(prog.bf_insns, insns, size) != 0) { + kmem_free(insns, size); + return (EFAULT); + } + if (!ip_bpf_validate(insns, prog.bf_len)) { + kmem_free(insns, size); + return (EINVAL); + } + + rw_enter(&icmp->icmp_bpf_lock, RW_WRITER); + if (icmp->icmp_bpf_len != 0) { + ASSERT(icmp->icmp_bpf_prog != NULL); + + kmem_free(icmp->icmp_bpf_prog, icmp->icmp_bpf_len); + } + icmp->icmp_bpf_len = size; + icmp->icmp_bpf_prog = insns; + rw_exit(&icmp->icmp_bpf_lock); + return (0); +} + +static int +icmp_detach_filter(icmp_t *icmp) +{ + int error; + + rw_enter(&icmp->icmp_bpf_lock, RW_WRITER); + if (icmp->icmp_bpf_len == 0) { + ASSERT(icmp->icmp_bpf_prog == NULL); + error = ENOENT; + } else { + kmem_free(icmp->icmp_bpf_prog, + icmp->icmp_bpf_len); + icmp->icmp_bpf_len = 0; + icmp->icmp_bpf_prog = NULL; + error = 0; + } + rw_exit(&icmp->icmp_bpf_lock); + return (error); +} + +static boolean_t +icmp_eval_filter(icmp_t *icmp, mblk_t *mp, ip_recv_attr_t *ira) +{ + boolean_t res; + uchar_t *buf = mp->b_rptr; + uint_t wirelen, len = MBLKL(mp); + + rw_enter(&icmp->icmp_bpf_lock, RW_READER); + if (icmp->icmp_bpf_len == 0) { + 
rw_exit(&icmp->icmp_bpf_lock); + return (B_FALSE); + } + if (ira->ira_flags & IRAF_IS_IPV4) { + ipha_t *ipha = (ipha_t *)buf; + + wirelen = ntohs(ipha->ipha_length); + } else { + ip6_t *ip6h = (ip6_t *)buf; + + wirelen = ntohs(ip6h->ip6_plen) + IPV6_HDR_LEN; + } + res = !ip_bpf_filter(icmp->icmp_bpf_prog, buf, wirelen, len); + rw_exit(&icmp->icmp_bpf_lock); + + return (res); +} + /* * This routine sets socket options. */ @@ -2053,6 +2159,10 @@ icmp_do_opt_set(conn_opt_arg_t *coa, int level, int name, return (ENOBUFS); } break; + case SO_ATTACH_FILTER: + return (icmp_attach_filter(icmp, inlen, invalp)); + case SO_DETACH_FILTER: + return (icmp_detach_filter(icmp)); } break; @@ -2598,6 +2708,14 @@ icmp_input(void *arg1, mblk_t *mp, void *arg2, ip_recv_attr_t *ira) /* Initialize regardless of IP version */ ipps.ipp_fields = 0; + /* Apply socket filter, if needed */ + if (icmp->icmp_bpf_len != 0) { + if (icmp_eval_filter(icmp, mp, ira)) { + freemsg(mp); + return; + } + } + if (ira->ira_flags & IRAF_IS_IPV4) { ASSERT(IPH_HDR_VERSION(rptr) == IPV4_VERSION); ASSERT(MBLKL(mp) >= sizeof (ipha_t)); @@ -5027,7 +5145,8 @@ rawip_stack_fini(netstackid_t stackid, void *arg) } static void * -rawip_kstat_init(netstackid_t stackid) { +rawip_kstat_init(netstackid_t stackid) +{ kstat_t *ksp; rawip_named_kstat_t template = { @@ -5039,9 +5158,7 @@ rawip_kstat_init(netstackid_t stackid) { }; ksp = kstat_create_netstack("icmp", 0, "rawip", "mib2", - KSTAT_TYPE_NAMED, - NUM_OF_FIELDS(rawip_named_kstat_t), - 0, stackid); + KSTAT_TYPE_NAMED, NUM_OF_FIELDS(rawip_named_kstat_t), 0, stackid); if (ksp == NULL || ksp->ks_data == NULL) return (NULL); diff --git a/usr/src/uts/common/inet/ip/icmp_opt_data.c b/usr/src/uts/common/inet/ip/icmp_opt_data.c index ff0310de0c..d65d3164d3 100644 --- a/usr/src/uts/common/inet/ip/icmp_opt_data.c +++ b/usr/src/uts/common/inet/ip/icmp_opt_data.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
+ * Copyright 2016 Joyent, Inc. */ #include <sys/types.h> @@ -41,6 +42,7 @@ #include <netinet/ip_mroute.h> #include <inet/optcom.h> #include <inet/rawip_impl.h> +#include <net/bpf.h> /* * Table of all known options handled on a ICMP protocol stack. @@ -86,6 +88,10 @@ opdes_t icmp_opt_arr[] = { 0 }, { SO_DOMAIN, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, +{ SO_ATTACH_FILTER, SOL_SOCKET, OA_W, OA_W, OP_NP, 0, + sizeof (struct bpf_program), 0 }, +{ SO_DETACH_FILTER, SOL_SOCKET, OA_W, OA_W, OP_NP, 0, 0, 0 }, + { IP_OPTIONS, IPPROTO_IP, OA_RW, OA_RW, OP_NP, (OP_VARLEN|OP_NODEFAULT), IP_MAX_OPT_LENGTH + IP_ADDR_LEN, -1 /* not initialized */ }, diff --git a/usr/src/uts/common/inet/ip/ip.c b/usr/src/uts/common/inet/ip/ip.c index f006e83a1f..73081b9c1c 100644 --- a/usr/src/uts/common/inet/ip/ip.c +++ b/usr/src/uts/common/inet/ip/ip.c @@ -12577,6 +12577,7 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) struct iocblk *iocp = (struct iocblk *)mp->b_rptr; ip_ioctl_cmd_t *ipip = arg; ip_extract_func_t *extract_funcp; + ill_t *ill; cmd_info_t ci; int err; boolean_t entered_ipsq = B_FALSE; @@ -12697,6 +12698,13 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) ipsq_current_start(ipsq, ci.ci_ipif, ipip->ipi_cmd); /* + * We need to cache the ill_t that we're going to use as the argument + * to the ipif-ioctl DTrace probe (below) because the ci_ipif can be + * blown away by calling ipi_func. + */ + ill = ci.ci_ipif == NULL ? NULL : ci.ci_ipif->ipif_ill; + + /* * A return value of EINPROGRESS means the ioctl is * either queued and waiting for some reason or has * already completed. @@ -12704,9 +12712,7 @@ ip_process_ioctl(ipsq_t *ipsq, queue_t *q, mblk_t *mp, void *arg) err = (*ipip->ipi_func)(ci.ci_ipif, ci.ci_sin, q, mp, ipip, ci.ci_lifr); DTRACE_PROBE4(ipif__ioctl, char *, "ip_process_ioctl finish WR", - int, ipip->ipi_cmd, - ill_t *, ci.ci_ipif == NULL ? 
NULL : ci.ci_ipif->ipif_ill, - ipif_t *, ci.ci_ipif); + int, ipip->ipi_cmd, ill_t *, ill, ipif_t *, ci.ci_ipif); ip_ioctl_finish(q, mp, err, IPI2MODE(ipip), ipsq); if (entered_ipsq) diff --git a/usr/src/uts/common/inet/ip/ip_attr.c b/usr/src/uts/common/inet/ip/ip_attr.c index 85ee142dfc..c350d67c2d 100644 --- a/usr/src/uts/common/inet/ip/ip_attr.c +++ b/usr/src/uts/common/inet/ip/ip_attr.c @@ -909,6 +909,11 @@ ixa_safe_copy(ip_xmit_attr_t *src, ip_xmit_attr_t *ixa) */ if (ixa->ixa_free_flags & IXA_FREE_CRED) crhold(ixa->ixa_cred); + + /* + * There is no cleanup in progress on this new copy. + */ + ixa->ixa_tcpcleanup = IXATC_IDLE; } /* diff --git a/usr/src/uts/common/inet/ip/ip_if.c b/usr/src/uts/common/inet/ip/ip_if.c index 62d85b202b..b88dcae2d1 100644 --- a/usr/src/uts/common/inet/ip/ip_if.c +++ b/usr/src/uts/common/inet/ip/ip_if.c @@ -22,7 +22,7 @@ * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 1990 Mentat Inc. * Copyright (c) 2013 by Delphix. All rights reserved. - * Copyright 2013 Joyent, Inc. + * Copyright (c) 2016, Joyent, Inc. All rights reserved. * Copyright (c) 2014, OmniTI Computer Consulting, Inc. All rights reserved. 
*/ @@ -718,7 +718,7 @@ ill_dlur_copy_address(uchar_t *phys_src, uint_t phys_length, */ mblk_t * ill_dlur_gen(uchar_t *addr, uint_t addr_length, t_uscalar_t sap, - t_scalar_t sap_length) + t_scalar_t sap_length) { dl_unitdata_req_t *dlur; mblk_t *mp; @@ -3855,15 +3855,18 @@ ill_lookup_on_ifindex_global_instance(uint_t index, boolean_t isv6) { ip_stack_t *ipst; ill_t *ill; + netstack_t *ns; - ipst = netstack_find_by_stackid(GLOBAL_NETSTACKID)->netstack_ip; - if (ipst == NULL) { + ns = netstack_find_by_stackid(GLOBAL_NETSTACKID); + + if ((ipst = ns->netstack_ip) == NULL) { cmn_err(CE_WARN, "No ip_stack_t for zoneid zero!\n"); + netstack_rele(ns); return (NULL); } ill = ill_lookup_on_ifindex(index, isv6, ipst); - netstack_rele(ipst->ips_netstack); + netstack_rele(ns); return (ill); } @@ -10828,7 +10831,7 @@ ip_sioctl_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, /* ARGSUSED */ int ip_sioctl_get_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, - ip_ioctl_cmd_t *ipip, void *if_req) + ip_ioctl_cmd_t *ipip, void *if_req) { struct ifreq *ifr; struct lifreq *lifr; @@ -10854,7 +10857,7 @@ ip_sioctl_get_mtu(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, /* ARGSUSED2 */ int ip_sioctl_brdaddr(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, - ip_ioctl_cmd_t *ipip, void *if_req) + ip_ioctl_cmd_t *ipip, void *if_req) { ipaddr_t addr; ire_t *ire; @@ -15590,7 +15593,7 @@ ip_select_source_v4(ill_t *ill, ipaddr_t setsrc, ipaddr_t dst, /* ARGSUSED */ int if_unitsel_restart(ipif_t *ipif, sin_t *dummy_sin, queue_t *q, mblk_t *mp, - ip_ioctl_cmd_t *ipip, void *dummy_ifreq) + ip_ioctl_cmd_t *ipip, void *dummy_ifreq) { /* * ill_phyint_reinit merged the v4 and v6 into a single @@ -16247,7 +16250,7 @@ ill_ptpaddr_cnt(const ill_t *ill) /* ARGSUSED */ int ip_sioctl_get_lifusesrc(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp, - ip_ioctl_cmd_t *ipip, void *ifreq) + ip_ioctl_cmd_t *ipip, void *ifreq) { struct lifreq *lifr = ifreq; diff --git 
a/usr/src/uts/common/inet/ip/ip_squeue.c b/usr/src/uts/common/inet/ip/ip_squeue.c index 33a2fa5935..dedb4dadcc 100644 --- a/usr/src/uts/common/inet/ip/ip_squeue.c +++ b/usr/src/uts/common/inet/ip/ip_squeue.c @@ -163,7 +163,7 @@ ip_squeue_create(pri_t pri) { squeue_t *sqp; - sqp = squeue_create(ip_squeue_worker_wait, pri); + sqp = squeue_create(ip_squeue_worker_wait, pri, B_TRUE); ASSERT(sqp != NULL); if (ip_squeue_create_callback != NULL) ip_squeue_create_callback(sqp); diff --git a/usr/src/uts/common/inet/ip/ipclassifier.c b/usr/src/uts/common/inet/ip/ipclassifier.c index bc2173ff24..a59027801f 100644 --- a/usr/src/uts/common/inet/ip/ipclassifier.c +++ b/usr/src/uts/common/inet/ip/ipclassifier.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc. */ /* @@ -868,67 +869,91 @@ ipcl_hash_remove_locked(conn_t *connp, connf_t *connfp) mutex_exit(&(connfp)->connf_lock); \ } -#define IPCL_HASH_INSERT_BOUND(connfp, connp) { \ - conn_t *pconnp = NULL, *nconnp; \ - IPCL_HASH_REMOVE((connp)); \ - mutex_enter(&(connfp)->connf_lock); \ - nconnp = (connfp)->connf_head; \ - while (nconnp != NULL && \ - !_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6)) { \ - pconnp = nconnp; \ - nconnp = nconnp->conn_next; \ - } \ - if (pconnp != NULL) { \ - pconnp->conn_next = (connp); \ - (connp)->conn_prev = pconnp; \ - } else { \ - (connfp)->connf_head = (connp); \ - } \ - if (nconnp != NULL) { \ - (connp)->conn_next = nconnp; \ - nconnp->conn_prev = (connp); \ - } \ - (connp)->conn_fanout = (connfp); \ - (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ - IPCL_BOUND; \ - CONN_INC_REF(connp); \ - mutex_exit(&(connfp)->connf_lock); \ -} +/* + * When inserting bound or wildcard entries into the hash, ordering rules are + * used to facilitate timely and correct lookups. The order is as follows: + * 1. Entries bound to a specific address + * 2. Entries bound to INADDR_ANY + * 3. 
Entries bound to ADDR_UNSPECIFIED + * Entries in a category which share conn_lport (such as those using + * SO_REUSEPORT) will be ordered such that the newest inserted is first. + */ -#define IPCL_HASH_INSERT_WILDCARD(connfp, connp) { \ - conn_t **list, *prev, *next; \ - boolean_t isv4mapped = \ - IN6_IS_ADDR_V4MAPPED(&(connp)->conn_laddr_v6); \ - IPCL_HASH_REMOVE((connp)); \ - mutex_enter(&(connfp)->connf_lock); \ - list = &(connfp)->connf_head; \ - prev = NULL; \ - while ((next = *list) != NULL) { \ - if (isv4mapped && \ - IN6_IS_ADDR_UNSPECIFIED(&next->conn_laddr_v6) && \ - connp->conn_zoneid == next->conn_zoneid) { \ - (connp)->conn_next = next; \ - if (prev != NULL) \ - prev = next->conn_prev; \ - next->conn_prev = (connp); \ - break; \ - } \ - list = &next->conn_next; \ - prev = next; \ - } \ - (connp)->conn_prev = prev; \ - *list = (connp); \ - (connp)->conn_fanout = (connfp); \ - (connp)->conn_flags = ((connp)->conn_flags & ~IPCL_REMOVED) | \ - IPCL_BOUND; \ - CONN_INC_REF((connp)); \ - mutex_exit(&(connfp)->connf_lock); \ +void +ipcl_hash_insert_bound(connf_t *connfp, conn_t *connp) +{ + conn_t *pconnp, *nconnp; + + IPCL_HASH_REMOVE(connp); + mutex_enter(&connfp->connf_lock); + nconnp = connfp->connf_head; + pconnp = NULL; + while (nconnp != NULL) { + /* + * Walk though entries associated with the fanout until one is + * found which fulfills any of these conditions: + * 1. Listen address of ADDR_ANY/ADDR_UNSPECIFIED + * 2. 
Listen port the same as connp + */ + if (_IPCL_V4_MATCH_ANY(nconnp->conn_laddr_v6) || + connp->conn_lport == nconnp->conn_lport) + break; + pconnp = nconnp; + nconnp = nconnp->conn_next; + } + if (pconnp != NULL) { + pconnp->conn_next = connp; + connp->conn_prev = pconnp; + } else { + connfp->connf_head = connp; + } + if (nconnp != NULL) { + connp->conn_next = nconnp; + nconnp->conn_prev = connp; + } + connp->conn_fanout = connfp; + connp->conn_flags = (connp->conn_flags & ~IPCL_REMOVED) | IPCL_BOUND; + CONN_INC_REF(connp); + mutex_exit(&connfp->connf_lock); } void ipcl_hash_insert_wildcard(connf_t *connfp, conn_t *connp) { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + conn_t **list, *prev, *next; + conn_t *pconnp = NULL, *nconnp; + boolean_t isv4mapped = IN6_IS_ADDR_V4MAPPED(&connp->conn_laddr_v6); + + IPCL_HASH_REMOVE(connp); + mutex_enter(&connfp->connf_lock); + nconnp = connfp->connf_head; + pconnp = NULL; + while (nconnp != NULL) { + if (IN6_IS_ADDR_V4MAPPED_ANY(&nconnp->conn_laddr_v6) && + isv4mapped && connp->conn_lport == nconnp->conn_lport) + break; + if (IN6_IS_ADDR_UNSPECIFIED(&nconnp->conn_laddr_v6) && + (isv4mapped || + connp->conn_lport == nconnp->conn_lport)) + break; + + pconnp = nconnp; + nconnp = nconnp->conn_next; + } + if (pconnp != NULL) { + pconnp->conn_next = connp; + connp->conn_prev = pconnp; + } else { + connfp->connf_head = connp; + } + if (nconnp != NULL) { + connp->conn_next = nconnp; + nconnp->conn_prev = connp; + } + connp->conn_fanout = connfp; + connp->conn_flags = (connp->conn_flags & ~IPCL_REMOVED) | IPCL_BOUND; + CONN_INC_REF(connp); + mutex_exit(&connfp->connf_lock); } /* @@ -1034,9 +1059,9 @@ ipcl_sctp_hash_insert(conn_t *connp, in_port_t lport) IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_faddr_v6)) { if (IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6) || IN6_IS_ADDR_V4MAPPED_ANY(&connp->conn_laddr_v6)) { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } else { - IPCL_HASH_INSERT_BOUND(connfp, 
connp); + ipcl_hash_insert_bound(connfp, connp); } } else { IPCL_HASH_INSERT_CONNECTED(connfp, connp); @@ -1205,9 +1230,9 @@ ipcl_bind_insert_v4(conn_t *connp) if (connp->conn_faddr_v4 != INADDR_ANY) { IPCL_HASH_INSERT_CONNECTED(connfp, connp); } else if (connp->conn_laddr_v4 != INADDR_ANY) { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } if (protocol == IPPROTO_RSVP) ill_set_inputfn_all(ipst); @@ -1219,9 +1244,9 @@ ipcl_bind_insert_v4(conn_t *connp) connfp = &ipst->ips_ipcl_bind_fanout[ IPCL_BIND_HASH(lport, ipst)]; if (connp->conn_laddr_v4 != INADDR_ANY) { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } if (cl_inet_listen != NULL) { ASSERT(connp->conn_ipversion == IPV4_VERSION); @@ -1271,9 +1296,9 @@ ipcl_bind_insert_v6(conn_t *connp) if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) { IPCL_HASH_INSERT_CONNECTED(connfp, connp); } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } break; @@ -1283,9 +1308,9 @@ ipcl_bind_insert_v6(conn_t *connp) connfp = &ipst->ips_ipcl_bind_fanout[ IPCL_BIND_HASH(lport, ipst)]; if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } if (cl_inet_listen != NULL) { sa_family_t addr_family; @@ -1416,9 +1441,9 @@ ipcl_conn_insert_v4(conn_t *connp) if (connp->conn_faddr_v4 != INADDR_ANY) { IPCL_HASH_INSERT_CONNECTED(connfp, connp); } else if (connp->conn_laddr_v4 != INADDR_ANY) { - 
IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } break; } @@ -1504,9 +1529,9 @@ ipcl_conn_insert_v6(conn_t *connp) if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_faddr_v6)) { IPCL_HASH_INSERT_CONNECTED(connfp, connp); } else if (!IN6_IS_ADDR_UNSPECIFIED(&connp->conn_laddr_v6)) { - IPCL_HASH_INSERT_BOUND(connfp, connp); + ipcl_hash_insert_bound(connfp, connp); } else { - IPCL_HASH_INSERT_WILDCARD(connfp, connp); + ipcl_hash_insert_wildcard(connfp, connp); } break; } @@ -2092,6 +2117,7 @@ rawip_conn_constructor(void *buf, void *cdrarg, int kmflags) connp->conn_flags = IPCL_RAWIPCONN; connp->conn_proto = IPPROTO_ICMP; icmp->icmp_connp = connp; + rw_init(&icmp->icmp_bpf_lock, NULL, RW_DEFAULT, NULL); rw_init(&connp->conn_ilg_lock, NULL, RW_DEFAULT, NULL); connp->conn_ixa = kmem_zalloc(sizeof (ip_xmit_attr_t), kmflags); if (connp->conn_ixa == NULL) @@ -2116,6 +2142,7 @@ rawip_conn_destructor(void *buf, void *cdrarg) mutex_destroy(&connp->conn_lock); cv_destroy(&connp->conn_cv); rw_destroy(&connp->conn_ilg_lock); + rw_destroy(&icmp->icmp_bpf_lock); /* Can be NULL if constructor failed */ if (connp->conn_ixa != NULL) { diff --git a/usr/src/uts/common/inet/ip/ipsecesp.c b/usr/src/uts/common/inet/ip/ipsecesp.c index c325e8dc26..2ca770ebe9 100644 --- a/usr/src/uts/common/inet/ip/ipsecesp.c +++ b/usr/src/uts/common/inet/ip/ipsecesp.c @@ -234,8 +234,7 @@ esp_kstat_init(ipsecesp_stack_t *espstack, netstackid_t stackid) { espstack->esp_ksp = kstat_create_netstack("ipsecesp", 0, "esp_stat", "net", KSTAT_TYPE_NAMED, - sizeof (esp_kstats_t) / sizeof (kstat_named_t), - KSTAT_FLAG_PERSISTENT, stackid); + sizeof (esp_kstats_t) / sizeof (kstat_named_t), 0, stackid); if (espstack->esp_ksp == NULL || espstack->esp_ksp->ks_data == NULL) return (B_FALSE); diff --git a/usr/src/uts/common/inet/ipclassifier.h b/usr/src/uts/common/inet/ipclassifier.h index 
f6466434f6..c3139d9288 100644 --- a/usr/src/uts/common/inet/ipclassifier.h +++ b/usr/src/uts/common/inet/ipclassifier.h @@ -21,6 +21,7 @@ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015 Joyent, Inc. */ #ifndef _INET_IPCLASSIFIER_H @@ -293,7 +294,8 @@ struct conn_s { conn_ipv6_recvpathmtu : 1, /* IPV6_RECVPATHMTU */ conn_mcbc_bind : 1, /* Bound to multi/broadcast */ - conn_pad_to_bit_31 : 12; + conn_reuseport : 1, /* SO_REUSEPORT state */ + conn_pad_to_bit_31 : 11; boolean_t conn_blocked; /* conn is flow-controlled */ diff --git a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c index f958ca2261..2a32ccc940 100644 --- a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c +++ b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c @@ -5,7 +5,7 @@ * * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * - * Copyright (c) 2015, Joyent, Inc. All rights reserved. + * Copyright 2016, Joyent, Inc. 
*/ #if !defined(lint) @@ -83,6 +83,14 @@ static int ipf_hook6_loop_out __P((hook_event_token_t, hook_data_t, static int ipf_hook6_loop_in __P((hook_event_token_t, hook_data_t, void *)); static int ipf_hook6 __P((hook_data_t, int, int, void *)); +static int ipf_hookvndl3v4_in __P((hook_event_token_t, hook_data_t, + void *)); +static int ipf_hookvndl3v6_in __P((hook_event_token_t, hook_data_t, + void *)); +static int ipf_hookvndl3v4_out __P((hook_event_token_t, hook_data_t, + void *)); +static int ipf_hookvndl3v6_out __P((hook_event_token_t, hook_data_t, + void *)); extern int ipf_geniter __P((ipftoken_t *, ipfgeniter_t *, ipf_stack_t *)); extern int ipf_frruleiter __P((void *, int, void *, ipf_stack_t *)); @@ -152,6 +160,16 @@ char *hook6_loop_in_gz = "ipfilter_hook6_loop_in_gz"; char *hook6_loop_out = "ipfilter_hook6_loop_out"; char *hook6_loop_out_gz = "ipfilter_hook6_loop_out_gz"; +/* vnd IPv4/v6 hook names */ +char *hook4_vnd_in = "ipfilter_hookvndl3v4_in"; +char *hook4_vnd_in_gz = "ipfilter_hookvndl3v4_in_gz"; +char *hook6_vnd_in = "ipfilter_hookvndl3v6_in"; +char *hook6_vnd_in_gz = "ipfilter_hookvndl3v6_in_gz"; +char *hook4_vnd_out = "ipfilter_hookvndl3v4_out"; +char *hook4_vnd_out_gz = "ipfilter_hookvndl3v4_out_gz"; +char *hook6_vnd_out = "ipfilter_hookvndl3v6_out"; +char *hook6_vnd_out_gz = "ipfilter_hookvndl3v6_out_gz"; + /* ------------------------------------------------------------------------ */ /* Function: ipldetach */ /* Returns: int - 0 == success, else error. 
*/ @@ -248,6 +266,31 @@ ipf_stack_t *ifs; ifs->ifs_ipf_ipv4 = NULL; } + /* + * Remove VND hooks + */ + if (ifs->ifs_ipf_vndl3v4 != NULL) { + UNDO_HOOK(ifs_ipf_vndl3v4, ifs_hookvndl3v4_physical_in, + NH_PHYSICAL_IN, ifs_ipfhookvndl3v4_in); + UNDO_HOOK(ifs_ipf_vndl3v4, ifs_hookvndl3v4_physical_out, + NH_PHYSICAL_OUT, ifs_ipfhookvndl3v4_out); + + if (net_protocol_release(ifs->ifs_ipf_vndl3v4) != 0) + goto detach_failed; + ifs->ifs_ipf_vndl3v4 = NULL; + } + + if (ifs->ifs_ipf_vndl3v6 != NULL) { + UNDO_HOOK(ifs_ipf_vndl3v6, ifs_hookvndl3v6_physical_in, + NH_PHYSICAL_IN, ifs_ipfhookvndl3v6_in); + UNDO_HOOK(ifs_ipf_vndl3v6, ifs_hookvndl3v6_physical_out, + NH_PHYSICAL_OUT, ifs_ipfhookvndl3v6_out); + + if (net_protocol_release(ifs->ifs_ipf_vndl3v6) != 0) + goto detach_failed; + ifs->ifs_ipf_vndl3v6 = NULL; + } + #undef UNDO_HOOK #ifdef IPFDEBUG @@ -445,6 +488,48 @@ ipf_stack_t *ifs; } /* + * Add VND INET hooks + */ + ifs->ifs_ipf_vndl3v4 = net_protocol_lookup(id, NHF_VND_INET); + if (ifs->ifs_ipf_vndl3v4 == NULL) + goto hookup_failed; + + HOOK_INIT_GZ_BEFORE(ifs->ifs_ipfhookvndl3v4_in, ipf_hookvndl3v4_in, + hook4_vnd_in, hook4_vnd_in_gz, ifs); + HOOK_INIT_GZ_AFTER(ifs->ifs_ipfhookvndl3v4_out, ipf_hookvndl3v4_out, + hook4_vnd_out, hook4_vnd_out_gz, ifs); + ifs->ifs_hookvndl3v4_physical_in = (net_hook_register(ifs->ifs_ipf_vndl3v4, + NH_PHYSICAL_IN, ifs->ifs_ipfhookvndl3v4_in) == 0); + if (!ifs->ifs_hookvndl3v4_physical_in) + goto hookup_failed; + + ifs->ifs_hookvndl3v4_physical_out = (net_hook_register(ifs->ifs_ipf_vndl3v4, + NH_PHYSICAL_OUT, ifs->ifs_ipfhookvndl3v4_out) == 0); + if (!ifs->ifs_hookvndl3v4_physical_out) + goto hookup_failed; + + + /* + * VND INET6 hooks + */ + ifs->ifs_ipf_vndl3v6 = net_protocol_lookup(id, NHF_VND_INET6); + if (ifs->ifs_ipf_vndl3v6 == NULL) + goto hookup_failed; + + HOOK_INIT_GZ_BEFORE(ifs->ifs_ipfhookvndl3v6_in, ipf_hookvndl3v6_in, + hook6_vnd_in, hook6_vnd_in_gz, ifs); + HOOK_INIT_GZ_AFTER(ifs->ifs_ipfhookvndl3v6_out, ipf_hookvndl3v6_out, + 
hook6_vnd_out, hook6_vnd_out_gz, ifs); + ifs->ifs_hookvndl3v6_physical_in = (net_hook_register(ifs->ifs_ipf_vndl3v6, + NH_PHYSICAL_IN, ifs->ifs_ipfhookvndl3v6_in) == 0); + if (!ifs->ifs_hookvndl3v6_physical_in) + goto hookup_failed; + + ifs->ifs_hookvndl3v6_physical_out = (net_hook_register(ifs->ifs_ipf_vndl3v6, + NH_PHYSICAL_OUT, ifs->ifs_ipfhookvndl3v6_out) == 0); + if (!ifs->ifs_hookvndl3v6_physical_out) + goto hookup_failed; + /* * Reacquire ipf_global, now it is safe. */ WRITE_ENTER(&ifs->ifs_ipf_global); @@ -1011,7 +1096,6 @@ cred_t *cp; return ENXIO; unit = isp->ipfs_minor; - /* * ipf_find_stack returns with a read lock on ifs_ipf_global */ @@ -1856,8 +1940,12 @@ frdest_t *fdp; return (-1); } - /* Check the src here, fin_ifp is the src interface. */ - if (!fr_forwarding_enabled((phy_if_t)fin->fin_ifp, net_data_p)) + /* + * If we're forwarding (vs. injecting), check the src here, fin_ifp is + * the src interface. + */ + if (fdp != NULL && + !fr_forwarding_enabled((phy_if_t)fin->fin_ifp, net_data_p)) return (-1); inj = net_inject_alloc(NETINFO_VERSION); @@ -1924,8 +2012,8 @@ frdest_t *fdp; inj->ni_physical = net_routeto(net_data_p, sinp, NULL); } - /* we're checking the destinatation here */ - if (!fr_forwarding_enabled(inj->ni_physical, net_data_p)) + /* If we're forwarding (vs. injecting), check the destinatation here. */ + if (fdp != NULL && !fr_forwarding_enabled(inj->ni_physical, net_data_p)) goto bad_fastroute; /* @@ -2045,6 +2133,42 @@ int ipf_hook6_loop_out(hook_event_token_t token, hook_data_t info, void *arg) } /* ------------------------------------------------------------------------ */ +/* Function: ipf_hookvndl3_in */ +/* Returns: int - 0 == packet ok, else problem, free packet if not done */ +/* Parameters: event(I) - pointer to event */ +/* info(I) - pointer to hook information for firewalling */ +/* */ +/* The vnd hooks are private hooks to ON. They represents a layer 2 */ +/* datapath generally used to implement virtual machines. 
The driver sends */ +/* along L3 packets of either type IP or IPv6. The ethertype to distinguish */ +/* them is in the upper 16 bits while the remaining bits are the */ +/* traditional packet hook flags. */ +/* */ +/* They end up calling the appropriate traditional ip hooks. */ +/* ------------------------------------------------------------------------ */ +/*ARGSUSED*/ +int ipf_hookvndl3v4_in(hook_event_token_t token, hook_data_t info, void *arg) +{ + return ipf_hook4_in(token, info, arg); +} + +int ipf_hookvndl3v6_in(hook_event_token_t token, hook_data_t info, void *arg) +{ + return ipf_hook6_in(token, info, arg); +} + +/*ARGSUSED*/ +int ipf_hookvndl3v4_out(hook_event_token_t token, hook_data_t info, void *arg) +{ + return ipf_hook4_out(token, info, arg); +} + +int ipf_hookvndl3v6_out(hook_event_token_t token, hook_data_t info, void *arg) +{ + return ipf_hook6_out(token, info, arg); +} + +/* ------------------------------------------------------------------------ */ /* Function: ipf_hook4_loop_in */ /* Returns: int - 0 == packet ok, else problem, free packet if not done */ /* Parameters: event(I) - pointer to event */ diff --git a/usr/src/uts/common/inet/ipf/ipf.conf b/usr/src/uts/common/inet/ipf/ipf.conf index 6b36f9fdbf..f49e024a72 100644 --- a/usr/src/uts/common/inet/ipf/ipf.conf +++ b/usr/src/uts/common/inet/ipf/ipf.conf @@ -1,3 +1,8 @@ # # name="ipf" parent="pseudo" instance=0; + +# Increase the state table limits. 
fr_statemax should be ~70% of fr_statesize, +# and both should be prime numbers +fr_statesize=151007; +fr_statemax=113279; diff --git a/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h b/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h index a239f1c1ca..9aa2478c6a 100644 --- a/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h +++ b/usr/src/uts/common/inet/ipf/netinet/ipf_stack.h @@ -125,6 +125,10 @@ struct ipf_stack { hook_t *ifs_ipfhook6_loop_in; hook_t *ifs_ipfhook6_loop_out; hook_t *ifs_ipfhook6_nicevents; + hook_t *ifs_ipfhookvndl3v4_in; + hook_t *ifs_ipfhookvndl3v6_in; + hook_t *ifs_ipfhookvndl3v4_out; + hook_t *ifs_ipfhookvndl3v6_out; /* flags to indicate whether hooks are registered. */ boolean_t ifs_hook4_physical_in; @@ -137,10 +141,16 @@ struct ipf_stack { boolean_t ifs_hook6_nic_events; boolean_t ifs_hook6_loopback_in; boolean_t ifs_hook6_loopback_out; + boolean_t ifs_hookvndl3v4_physical_in; + boolean_t ifs_hookvndl3v6_physical_in; + boolean_t ifs_hookvndl3v4_physical_out; + boolean_t ifs_hookvndl3v6_physical_out; int ifs_ipf_loopback; net_handle_t ifs_ipf_ipv4; net_handle_t ifs_ipf_ipv6; + net_handle_t ifs_ipf_vndl3v4; + net_handle_t ifs_ipf_vndl3v6; /* ip_auth.c */ int ifs_fr_authsize; diff --git a/usr/src/uts/common/inet/ipf/solaris.c b/usr/src/uts/common/inet/ipf/solaris.c index c541f4dddc..5d56debc31 100644 --- a/usr/src/uts/common/inet/ipf/solaris.c +++ b/usr/src/uts/common/inet/ipf/solaris.c @@ -625,7 +625,6 @@ ipf_stack_shutdown(const netid_t id, void *arg) /* * Destroy things for ipf for one stack. */ -/* ARGSUSED */ static void ipf_stack_destroy_one(const netid_t id, ipf_stack_t *ifs) { diff --git a/usr/src/uts/common/inet/ipnet/ipnet.c b/usr/src/uts/common/inet/ipnet/ipnet.c index 75c30120f6..2f53a48d80 100644 --- a/usr/src/uts/common/inet/ipnet/ipnet.c +++ b/usr/src/uts/common/inet/ipnet/ipnet.c @@ -25,6 +25,10 @@ */ /* + * Copyright (c) 2016, Joyent, Inc. All rights reserved. 
+ */ + +/* * The ipnet device defined here provides access to packets at the IP layer. To * provide access to packets at this layer it registers a callback function in * the ip module and when there are open instances of the device ip will pass @@ -2181,14 +2185,15 @@ ipnet_promisc_add(void *handle, uint_t how, void *data, uintptr_t *mhandle, int error; ifp = (ipnetif_t *)handle; - ns = netstack_find_by_zoneid(ifp->if_zoneid); - if ((how == DL_PROMISC_PHYS) || (how == DL_PROMISC_MULTI)) { - error = ipnet_join_allmulti(ifp, ns->netstack_ipnet); - if (error != 0) - return (error); - } else { + if (how != DL_PROMISC_PHYS && how != DL_PROMISC_MULTI) return (EINVAL); + + ns = netstack_find_by_zoneid(ifp->if_zoneid); + + if ((error = ipnet_join_allmulti(ifp, ns->netstack_ipnet)) != 0) { + netstack_rele(ns); + return (error); } ipnet = kmem_zalloc(sizeof (*ipnet), KM_SLEEP); diff --git a/usr/src/uts/common/inet/iptun/iptun.c b/usr/src/uts/common/inet/iptun/iptun.c index c933efb470..fb4402dc17 100644 --- a/usr/src/uts/common/inet/iptun/iptun.c +++ b/usr/src/uts/common/inet/iptun/iptun.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2016, Joyent, Inc. All rights reserved. */ /* @@ -1355,6 +1356,7 @@ iptun_free(iptun_t *iptun) iptun->iptun_connp = NULL; } + netstack_rele(iptun->iptun_ns); kmem_cache_free(iptun_cache, iptun); atomic_dec_32(&iptun_tunnelcount); } diff --git a/usr/src/uts/common/inet/rawip_impl.h b/usr/src/uts/common/inet/rawip_impl.h index 6fb72d1d08..ddb482db78 100644 --- a/usr/src/uts/common/inet/rawip_impl.h +++ b/usr/src/uts/common/inet/rawip_impl.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2016 Joyent, Inc. */ /* Copyright (c) 1990 Mentat Inc. 
*/ @@ -43,6 +44,7 @@ extern "C" { #include <inet/ip.h> #include <inet/optcom.h> #include <inet/tunables.h> +#include <inet/bpf.h> /* * ICMP stack instances @@ -84,6 +86,10 @@ typedef struct icmp_s { mblk_t *icmp_fallback_queue_head; mblk_t *icmp_fallback_queue_tail; struct sockaddr_storage icmp_delayed_addr; + + krwlock_t icmp_bpf_lock; /* protects icmp_bpf */ + ip_bpf_insn_t *icmp_bpf_prog; /* SO_ATTACH_FILTER bpf */ + uint_t icmp_bpf_len; } icmp_t; /* diff --git a/usr/src/uts/common/inet/sockmods/datafilt.c b/usr/src/uts/common/inet/sockmods/datafilt.c new file mode 100644 index 0000000000..6e1171de46 --- /dev/null +++ b/usr/src/uts/common/inet/sockmods/datafilt.c @@ -0,0 +1,116 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2012, OmniTI Computer Consulting, Inc. All rights reserved. + */ + +/* + * This file implements a socketfilter used to deter TCP connections. + * To defer a connection means to delay the return of accept(3SOCKET) + * until at least one byte is ready to be read(2). This filter may be + * applied automatically or programmatically through the use of + * soconfig(1M) and setsockopt(3SOCKET). 
+ */ + +#include <sys/kmem.h> +#include <sys/systm.h> +#include <sys/stropts.h> +#include <sys/strsun.h> +#include <sys/socketvar.h> +#include <sys/sockfilter.h> +#include <sys/note.h> +#include <sys/taskq.h> + +#define DATAFILT_MODULE "datafilt" + +static struct modlmisc dataf_modlmisc = { + &mod_miscops, + "Kernel data-ready socket filter" +}; + +static struct modlinkage dataf_modlinkage = { + MODREV_1, + &dataf_modlmisc, + NULL +}; + +static sof_rval_t +dataf_attach_passive_cb(sof_handle_t handle, sof_handle_t ph, + void *parg, struct sockaddr *laddr, socklen_t laddrlen, + struct sockaddr *faddr, socklen_t faddrlen, void **cookiep) +{ + _NOTE(ARGUNUSED(handle, ph, parg, laddr, laddrlen, faddr, faddrlen, + cookiep)); + return (SOF_RVAL_DEFER); +} + +static void +dataf_detach_cb(sof_handle_t handle, void *cookie, cred_t *cr) +{ + _NOTE(ARGUNUSED(handle, cookie, cr)); +} + +static mblk_t * +dataf_data_in_cb(sof_handle_t handle, void *cookie, mblk_t *mp, int flags, + size_t *lenp) +{ + _NOTE(ARGUNUSED(cookie, flags, lenp)); + + if (mp != NULL && MBLKL(mp) > 0) { + sof_newconn_ready(handle); + sof_bypass(handle); + } + + return (mp); +} + +static sof_ops_t dataf_ops = { + .sofop_attach_passive = dataf_attach_passive_cb, + .sofop_detach = dataf_detach_cb, + .sofop_data_in = dataf_data_in_cb +}; + +int +_init(void) +{ + int err; + + /* + * This module is safe to attach even after some preliminary socket + * setup calls have taken place. See the comment for SOF_ATT_SAFE. 
+ */ + err = sof_register(SOF_VERSION, DATAFILT_MODULE, &dataf_ops, + SOF_ATT_SAFE); + if (err != 0) + return (err); + if ((err = mod_install(&dataf_modlinkage)) != 0) + (void) sof_unregister(DATAFILT_MODULE); + + return (err); +} + +int +_fini(void) +{ + int err; + + if ((err = sof_unregister(DATAFILT_MODULE)) != 0) + return (err); + + return (mod_remove(&dataf_modlinkage)); +} + +int +_info(struct modinfo *modinfop) +{ + return (mod_info(&dataf_modlinkage, modinfop)); +} diff --git a/usr/src/uts/common/inet/sockmods/sockmod_pfp.c b/usr/src/uts/common/inet/sockmods/sockmod_pfp.c index 586d7f06f8..76191e93b8 100644 --- a/usr/src/uts/common/inet/sockmods/sockmod_pfp.c +++ b/usr/src/uts/common/inet/sockmods/sockmod_pfp.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2015 Joyent, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. */ #include <sys/types.h> @@ -51,6 +51,7 @@ #include <sys/mac_client.h> #include <sys/mac_provider.h> #include <sys/mac_client_priv.h> +#include <inet/bpf.h> #include <netpacket/packet.h> @@ -448,7 +449,7 @@ pfp_packet(void *arg, mac_resource_handle_t mrh, mblk_t *mp, boolean_t flag) buffer = (uchar_t *)mp; } rw_enter(&ps->ps_bpflock, RW_READER); - if (bpf_filter(ps->ps_bpf.bf_insns, buffer, + if (ip_bpf_filter((ip_bpf_insn_t *)ps->ps_bpf.bf_insns, buffer, hdr.mhi_pktsize, buflen) == 0) { rw_exit(&ps->ps_bpflock); ps->ps_stats.tp_drops++; @@ -1336,7 +1337,7 @@ pfp_setsocket_sockopt(sock_lower_handle_t handle, int option_name, const void *optval, socklen_t optlen) { struct bpf_program prog; - struct bpf_insn *fcode; + ip_bpf_insn_t *fcode; struct pfpsock *ps; struct sock_proto_props sopp; int error = 0; @@ -1370,10 +1371,10 @@ pfp_setsocket_sockopt(sock_lower_handle_t handle, int option_name, return (EFAULT); } - if (bpf_validate(fcode, (int)prog.bf_len)) { + if (ip_bpf_validate(fcode, prog.bf_len)) { rw_enter(&ps->ps_bpflock, RW_WRITER); pfp_release_bpf(ps); - 
ps->ps_bpf.bf_insns = fcode; + ps->ps_bpf.bf_insns = (struct bpf_insn *)fcode; ps->ps_bpf.bf_len = size; rw_exit(&ps->ps_bpflock); diff --git a/usr/src/uts/common/inet/squeue.c b/usr/src/uts/common/inet/squeue.c index 2e08dc359b..1009f0700f 100644 --- a/usr/src/uts/common/inet/squeue.c +++ b/usr/src/uts/common/inet/squeue.c @@ -23,7 +23,7 @@ */ /* - * Copyright 2012 Joyent, Inc. All rights reserved. + * Copyright (c) 2014 Joyent, Inc. All rights reserved. */ /* @@ -61,6 +61,10 @@ * connection are processed on that squeue. The connection ("conn") to * squeue mapping is stored in "conn_t" member "conn_sqp". * + * If the squeue is not related to TCP/IP, then the value of sqp->sq_isip is + * false and it will not have an associated conn_t, which means many aspects of + * the system, such as polling and swtiching squeues will not be used. + * * Since the processing of the connection cuts across multiple layers * but still allows packets for different connnection to be processed on * other CPU/squeues, squeues are also termed as "Vertical Perimeter" or @@ -244,7 +248,7 @@ squeue_init(void) /* ARGSUSED */ squeue_t * -squeue_create(clock_t wait, pri_t pri) +squeue_create(clock_t wait, pri_t pri, boolean_t isip) { squeue_t *sqp = kmem_cache_alloc(squeue_cache, KM_SLEEP); @@ -260,11 +264,36 @@ squeue_create(clock_t wait, pri_t pri) sqp->sq_enter = squeue_enter; sqp->sq_drain = squeue_drain; + sqp->sq_isip = isip; return (sqp); } /* + * We need to kill the threads and then clean up. We should VERIFY that + * polling is disabled so we don't have to worry about disassociating from + * MAC/IP/etc. 
+ */ +void +squeue_destroy(squeue_t *sqp) +{ + kt_did_t worker, poll; + mutex_enter(&sqp->sq_lock); + VERIFY(!(sqp->sq_state & (SQS_POLL_THR_QUIESCED | + SQS_POLL_QUIESCE_DONE | SQS_PAUSE | SQS_EXIT))); + worker = sqp->sq_worker->t_did; + poll = sqp->sq_poll_thr->t_did; + sqp->sq_state |= SQS_EXIT; + cv_signal(&sqp->sq_poll_cv); + cv_signal(&sqp->sq_worker_cv); + mutex_exit(&sqp->sq_lock); + + thread_join(poll); + thread_join(worker); + kmem_cache_free(squeue_cache, sqp); +} + +/* * Bind squeue worker thread to the specified CPU, given by CPU id. * If the CPU id value is -1, bind the worker thread to the value * specified in sq_bind field. If a thread is already bound to a @@ -475,18 +504,21 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, * Handle squeue switching. More details in the * block comment at the top of the file */ - if (connp->conn_sqp == sqp) { + if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) { SQUEUE_DBG_SET(sqp, mp, proc, connp, tag); - connp->conn_on_sqp = B_TRUE; + if (sqp->sq_isip == B_TRUE) + connp->conn_on_sqp = B_TRUE; DTRACE_PROBE3(squeue__proc__start, squeue_t *, sqp, mblk_t *, mp, conn_t *, connp); (*proc)(connp, mp, sqp, ira); DTRACE_PROBE2(squeue__proc__end, squeue_t *, sqp, conn_t *, connp); - connp->conn_on_sqp = B_FALSE; + if (sqp->sq_isip == B_TRUE) { + connp->conn_on_sqp = B_FALSE; + CONN_DEC_REF(connp); + } SQUEUE_DBG_CLEAR(sqp); - CONN_DEC_REF(connp); } else { SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE); @@ -513,7 +545,7 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, return; } } else { - if (ira != NULL) { + if (sqp->sq_isip == B_TRUE && ira != NULL) { mblk_t *attrmp; ASSERT(cnt == 1); @@ -587,7 +619,8 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, if (!(sqp->sq_state & SQS_REENTER) && (process_flag != SQ_FILL) && (sqp->sq_first == NULL) && (sqp->sq_run == curthread) && (cnt == 1) && - (connp->conn_on_sqp == 
B_FALSE)) { + (sqp->sq_isip == B_FALSE || + connp->conn_on_sqp == B_FALSE)) { sqp->sq_state |= SQS_REENTER; mutex_exit(&sqp->sq_lock); @@ -602,15 +635,21 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, * Handle squeue switching. More details in the * block comment at the top of the file */ - if (connp->conn_sqp == sqp) { - connp->conn_on_sqp = B_TRUE; + if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) { + SQUEUE_DBG_SET(sqp, mp, proc, connp, + tag); + if (sqp->sq_isip == B_TRUE) + connp->conn_on_sqp = B_TRUE; DTRACE_PROBE3(squeue__proc__start, squeue_t *, sqp, mblk_t *, mp, conn_t *, connp); (*proc)(connp, mp, sqp, ira); DTRACE_PROBE2(squeue__proc__end, squeue_t *, sqp, conn_t *, connp); - connp->conn_on_sqp = B_FALSE; - CONN_DEC_REF(connp); + if (sqp->sq_isip == B_TRUE) { + connp->conn_on_sqp = B_FALSE; + CONN_DEC_REF(connp); + } + SQUEUE_DBG_CLEAR(sqp); } else { SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE); @@ -631,7 +670,7 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt, #ifdef DEBUG mp->b_tag = tag; #endif - if (ira != NULL) { + if (sqp->sq_isip && ira != NULL) { mblk_t *attrmp; ASSERT(cnt == 1); @@ -779,7 +818,7 @@ again: mp->b_prev = NULL; /* Is there an ip_recv_attr_t to handle? */ - if (ip_recv_attr_is_mblk(mp)) { + if (sqp->sq_isip == B_TRUE && ip_recv_attr_is_mblk(mp)) { mblk_t *attrmp = mp; ASSERT(attrmp->b_cont != NULL); @@ -804,20 +843,25 @@ again: /* - * Handle squeue switching. More details in the - * block comment at the top of the file + * Handle squeue switching. More details in the block comment at + * the top of the file. non-IP squeues cannot switch, as there + * is no conn_t. 
*/ - if (connp->conn_sqp == sqp) { + if (sqp->sq_isip == B_FALSE || connp->conn_sqp == sqp) { SQUEUE_DBG_SET(sqp, mp, proc, connp, mp->b_tag); - connp->conn_on_sqp = B_TRUE; + if (sqp->sq_isip == B_TRUE) + connp->conn_on_sqp = B_TRUE; DTRACE_PROBE3(squeue__proc__start, squeue_t *, sqp, mblk_t *, mp, conn_t *, connp); (*proc)(connp, mp, sqp, ira); DTRACE_PROBE2(squeue__proc__end, squeue_t *, sqp, conn_t *, connp); - connp->conn_on_sqp = B_FALSE; - CONN_DEC_REF(connp); + if (sqp->sq_isip == B_TRUE) { + connp->conn_on_sqp = B_FALSE; + CONN_DEC_REF(connp); + } + SQUEUE_DBG_CLEAR(sqp); } else { SQUEUE_ENTER_ONE(connp->conn_sqp, mp, proc, connp, ira, SQ_FILL, SQTAG_SQUEUE_CHANGE); @@ -1051,6 +1095,11 @@ squeue_polling_thread(squeue_t *sqp) cv_wait(async, lock); CALLB_CPR_SAFE_END(&cprinfo, lock); + if (sqp->sq_state & SQS_EXIT) { + mutex_exit(lock); + thread_exit(); + } + ctl_state = sqp->sq_state & (SQS_POLL_THR_CONTROL | SQS_POLL_THR_QUIESCED); if (ctl_state != 0) { @@ -1076,6 +1125,9 @@ squeue_polling_thread(squeue_t *sqp) (SQS_PROC|SQS_POLLING|SQS_GET_PKTS)) == (SQS_PROC|SQS_POLLING|SQS_GET_PKTS)); + /* Only IP related squeues should reach this point */ + VERIFY(sqp->sq_isip == B_TRUE); + poll_again: sq_rx_ring = sqp->sq_rx_ring; sq_get_pkts = sq_rx_ring->rr_rx; @@ -1205,6 +1257,7 @@ squeue_worker_thr_control(squeue_t *sqp) ill_rx_ring_t *rx_ring; ASSERT(MUTEX_HELD(&sqp->sq_lock)); + VERIFY(sqp->sq_isip == B_TRUE); if (sqp->sq_state & SQS_POLL_RESTART) { /* Restart implies a previous quiesce. */ @@ -1316,6 +1369,11 @@ squeue_worker(squeue_t *sqp) for (;;) { for (;;) { + if (sqp->sq_state & SQS_EXIT) { + mutex_exit(lock); + thread_exit(); + } + /* * If the poll thread has handed control to us * we need to break out of the wait. 
@@ -1412,6 +1470,7 @@ squeue_synch_enter(conn_t *connp, mblk_t *use_mp) again: sqp = connp->conn_sqp; + VERIFY(sqp->sq_isip == B_TRUE); mutex_enter(&sqp->sq_lock); if (sqp->sq_first == NULL && !(sqp->sq_state & SQS_PROC)) { @@ -1487,6 +1546,7 @@ void squeue_synch_exit(conn_t *connp) { squeue_t *sqp = connp->conn_sqp; + VERIFY(sqp->sq_isip == B_TRUE); mutex_enter(&sqp->sq_lock); if (sqp->sq_run == curthread) { diff --git a/usr/src/uts/common/inet/tcp.h b/usr/src/uts/common/inet/tcp.h index b2b9973291..6ec2e6b2d7 100644 --- a/usr/src/uts/common/inet/tcp.h +++ b/usr/src/uts/common/inet/tcp.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, Joyent, Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014 by Delphix. All rights reserved. */ @@ -134,6 +134,7 @@ typedef struct tcphdra_s { struct conn_s; struct tcp_listen_cnt_s; +struct tcp_rg_s; /* * Control structure for each open TCP stream, @@ -404,6 +405,13 @@ typedef struct tcp_s { struct tcp_s *tcp_bind_hash_port; /* tcp_t's bound to the same lport */ struct tcp_s **tcp_ptpbhn; + /* + * Group of tcp_t entries bound to the same adress and port via + * SO_REUSEPORT. The pointer itself is protected by tf_lock in the + * containing tcps_bind_fanout slot. + */ + struct tcp_rg_s *tcp_rg_bind; + uint_t tcp_maxpsz_multiplier; uint32_t tcp_lso_max; /* maximum LSO payload */ diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c index fba7125690..cf046c968e 100644 --- a/usr/src/uts/common/inet/tcp/tcp.c +++ b/usr/src/uts/common/inet/tcp/tcp.c @@ -21,7 +21,7 @@ /* * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, Joyent Inc. All rights reserved. + * Copyright 2015 Joyent, Inc. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013,2014 by Delphix. 
All rights reserved. * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved. @@ -1423,6 +1423,21 @@ tcp_free(tcp_t *tcp) tcp_close_mpp(&tcp->tcp_conn.tcp_eager_conn_ind); /* + * Destroy any association with SO_REUSEPORT group. + */ + if (tcp->tcp_rg_bind != NULL) { + /* + * This is only necessary for connections which enabled + * SO_REUSEPORT but were never bound. Such connections should + * be the one and only member of the tcp_rg_tp to which they + * have been associated. + */ + VERIFY(tcp_rg_remove(tcp->tcp_rg_bind, tcp)); + tcp_rg_destroy(tcp->tcp_rg_bind); + tcp->tcp_rg_bind = NULL; + } + + /* * If this is a non-STREAM socket still holding on to an upper * handle, release it. As a result of fallback we might also see * STREAMS based conns with upper handles, in which case there is @@ -2054,8 +2069,7 @@ tcp_reinit(tcp_t *tcp) * structure! */ static void -tcp_reinit_values(tcp) - tcp_t *tcp; +tcp_reinit_values(tcp_t *tcp) { tcp_stack_t *tcps = tcp->tcp_tcps; conn_t *connp = tcp->tcp_connp; diff --git a/usr/src/uts/common/inet/tcp/tcp_bind.c b/usr/src/uts/common/inet/tcp/tcp_bind.c index c6df39b91e..7ea9dc3413 100644 --- a/usr/src/uts/common/inet/tcp/tcp_bind.c +++ b/usr/src/uts/common/inet/tcp/tcp_bind.c @@ -22,6 +22,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2013 Nexenta Systems, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. */ #include <sys/types.h> @@ -55,6 +56,7 @@ static uint32_t tcp_random_anon_port = 1; static int tcp_bind_select_lport(tcp_t *, in_port_t *, boolean_t, cred_t *cr); static in_port_t tcp_get_next_priv_port(const tcp_t *); +static int tcp_rg_insert(tcp_rg_t *, struct tcp_s *); /* * Hash list insertion routine for tcp_t structures. 
Each hash bucket @@ -172,6 +174,16 @@ tcp_bind_hash_remove(tcp_t *tcp) ASSERT(lockp != NULL); mutex_enter(lockp); + + /* destroy any association with SO_REUSEPORT group */ + if (tcp->tcp_rg_bind != NULL) { + if (tcp_rg_remove(tcp->tcp_rg_bind, tcp)) { + /* Last one out turns off the lights */ + tcp_rg_destroy(tcp->tcp_rg_bind); + } + tcp->tcp_rg_bind = NULL; + } + if (tcp->tcp_ptpbhn) { tcpnext = tcp->tcp_bind_hash_port; if (tcpnext != NULL) { @@ -636,13 +648,12 @@ tcp_bind_check(conn_t *connp, struct sockaddr *sa, socklen_t len, cred_t *cr, } /* - * If the "bind_to_req_port_only" parameter is set, if the requested port - * number is available, return it, If not return 0 + * If the "bind_to_req_port_only" parameter is set and the requested port + * number is available, return it (else return 0). * - * If "bind_to_req_port_only" parameter is not set and - * If the requested port number is available, return it. If not, return - * the first anonymous port we happen across. If no anonymous ports are - * available, return 0. addr is the requested local address, if any. + * If "bind_to_req_port_only" parameter is not set and the requested port + * number is available, return it. If not, return the first anonymous port we + * happen across. If no anonymous ports are available, return 0. * * In either case, when succeeding update the tcp_t to record the port number * and insert it in the bind hash table. 
@@ -662,6 +673,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, int loopmax; conn_t *connp = tcp->tcp_connp; tcp_stack_t *tcps = tcp->tcp_tcps; + boolean_t reuseport = connp->conn_reuseport; /* * Lookup for free addresses is done in a loop and "loopmax" @@ -698,6 +710,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, tf_t *tbf; tcp_t *ltcp; conn_t *lconnp; + boolean_t attempt_reuse = B_FALSE; lport = htons(port); @@ -724,6 +737,7 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) { boolean_t not_socket; boolean_t exclbind; + boolean_t addrmatch; lconnp = ltcp->tcp_connp; @@ -829,22 +843,35 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, &lconnp->conn_faddr_v6))) continue; + addrmatch = IN6_ARE_ADDR_EQUAL(laddr, + &lconnp->conn_bound_addr_v6); + + if (addrmatch && reuseport && bind_to_req_port_only && + (ltcp->tcp_state == TCPS_BOUND || + ltcp->tcp_state == TCPS_LISTEN)) { + /* + * This entry is bound to the exact same + * address and port. If SO_REUSEPORT is set on + * the calling socket, attempt to reuse this + * binding if it too had SO_REUSEPORT enabled + * when it was bound. + */ + attempt_reuse = (ltcp->tcp_rg_bind != NULL); + break; + } + if (!reuseaddr) { /* - * No socket option SO_REUSEADDR. - * If existing port is bound to - * a non-wildcard IP address - * and the requesting stream is - * bound to a distinct - * different IP addresses - * (non-wildcard, also), keep - * going. + * No socket option SO_REUSEADDR. If an + * existing port is bound to a non-wildcard IP + * address and the requesting stream is bound + * to a distinct different IP address + * (non-wildcard, also), keep going. 
*/ if (!V6_OR_V4_INADDR_ANY(*laddr) && !V6_OR_V4_INADDR_ANY( lconnp->conn_bound_addr_v6) && - !IN6_ARE_ADDR_EQUAL(laddr, - &lconnp->conn_bound_addr_v6)) + !addrmatch) continue; if (ltcp->tcp_state >= TCPS_BOUND) { /* @@ -859,27 +886,49 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, * socket option SO_REUSEADDR is set on the * binding tcp_t. * - * If two streams are bound to - * same IP address or both addr - * and bound source are wildcards - * (INADDR_ANY), we want to stop - * searching. - * We have found a match of IP source - * address and source port, which is - * refused regardless of the - * SO_REUSEADDR setting, so we break. + * If two streams are bound to the same IP + * address or both addr and bound source are + * wildcards (INADDR_ANY), we want to stop + * searching. We have found a match of IP + * source address and source port, which is + * refused regardless of the SO_REUSEADDR + * setting, so we break. */ - if (IN6_ARE_ADDR_EQUAL(laddr, - &lconnp->conn_bound_addr_v6) && + if (addrmatch && (ltcp->tcp_state == TCPS_LISTEN || ltcp->tcp_state == TCPS_BOUND)) break; } } - if (ltcp != NULL) { + if (ltcp != NULL && !attempt_reuse) { /* The port number is busy */ mutex_exit(&tbf->tf_lock); } else { + if (attempt_reuse) { + int err; + struct tcp_rg_s *rg; + + ASSERT(ltcp != NULL); + ASSERT(ltcp->tcp_rg_bind != NULL); + ASSERT(tcp->tcp_rg_bind != NULL); + ASSERT(ltcp->tcp_rg_bind != tcp->tcp_rg_bind); + + err = tcp_rg_insert(ltcp->tcp_rg_bind, tcp); + if (err != 0) { + mutex_exit(&tbf->tf_lock); + return (0); + } + /* + * Now that the newly-binding socket has joined + * the existing reuseport group on ltcp, it + * should clean up its own (empty) group. + */ + rg = tcp->tcp_rg_bind; + tcp->tcp_rg_bind = ltcp->tcp_rg_bind; + VERIFY(tcp_rg_remove(rg, tcp)); + tcp_rg_destroy(rg); + } + /* * This port is ours. 
Insert in fanout and mark as * bound to prevent others from getting the port @@ -944,3 +993,125 @@ tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr, } while (++count < loopmax); return (0); } + +/* Max number of members in TCP SO_REUSEPORT group */ +#define TCP_RG_SIZE_MAX 64 +/* Step size when expanding members array */ +#define TCP_RG_SIZE_STEP 2 + + +tcp_rg_t * +tcp_rg_init(tcp_t *tcp) +{ + tcp_rg_t *rg; + rg = kmem_alloc(sizeof (tcp_rg_t), KM_NOSLEEP|KM_NORMALPRI); + if (rg == NULL) + return (NULL); + rg->tcprg_members = kmem_zalloc(2 * sizeof (tcp_t *), + KM_NOSLEEP|KM_NORMALPRI); + if (rg->tcprg_members == NULL) { + kmem_free(rg, sizeof (tcp_rg_t)); + return (NULL); + } + + mutex_init(&rg->tcprg_lock, NULL, MUTEX_DEFAULT, NULL); + rg->tcprg_size = 2; + rg->tcprg_count = 1; + rg->tcprg_active = 1; + rg->tcprg_members[0] = tcp; + return (rg); +} + +void +tcp_rg_destroy(tcp_rg_t *rg) +{ + mutex_enter(&rg->tcprg_lock); + ASSERT(rg->tcprg_count == 0); + ASSERT(rg->tcprg_active == 0); + kmem_free(rg->tcprg_members, rg->tcprg_size * sizeof (tcp_t *)); + mutex_destroy(&rg->tcprg_lock); + kmem_free(rg, sizeof (struct tcp_rg_s)); +} + +static int +tcp_rg_insert(tcp_rg_t *rg, tcp_t *tcp) +{ + mutex_enter(&rg->tcprg_lock); + + VERIFY(rg->tcprg_size > 0); + VERIFY(rg->tcprg_count <= rg->tcprg_size); + if (rg->tcprg_count != 0) { + cred_t *oldcred = rg->tcprg_members[0]->tcp_connp->conn_cred; + cred_t *newcred = tcp->tcp_connp->conn_cred; + + if (crgetuid(oldcred) != crgetuid(newcred) || + crgetzoneid(oldcred) != crgetzoneid(newcred)) { + mutex_exit(&rg->tcprg_lock); + return (EPERM); + } + } + + if (rg->tcprg_count == rg->tcprg_size) { + unsigned int oldalloc = rg->tcprg_size * sizeof (tcp_t *); + unsigned int newsize = rg->tcprg_size + TCP_RG_SIZE_STEP; + tcp_t **newmembers; + + if (newsize > TCP_RG_SIZE_MAX) { + mutex_exit(&rg->tcprg_lock); + return (EINVAL); + } + newmembers = kmem_zalloc(newsize * sizeof (tcp_t *), + KM_NOSLEEP|KM_NORMALPRI); + if 
(newmembers == NULL) { + mutex_exit(&rg->tcprg_lock); + return (ENOMEM); + } + bcopy(rg->tcprg_members, newmembers, oldalloc); + kmem_free(rg->tcprg_members, oldalloc); + rg->tcprg_members = newmembers; + rg->tcprg_size = newsize; + } + + rg->tcprg_members[rg->tcprg_count] = tcp; + rg->tcprg_count++; + rg->tcprg_active++; + + mutex_exit(&rg->tcprg_lock); + return (0); +} + +boolean_t +tcp_rg_remove(tcp_rg_t *rg, tcp_t *tcp) +{ + int i; + boolean_t is_empty; + + mutex_enter(&rg->tcprg_lock); + for (i = 0; i < rg->tcprg_count; i++) { + if (rg->tcprg_members[i] == tcp) + break; + } + /* The item should be present */ + ASSERT(i < rg->tcprg_count); + /* Move the last member into this position */ + rg->tcprg_count--; + rg->tcprg_members[i] = rg->tcprg_members[rg->tcprg_count]; + rg->tcprg_members[rg->tcprg_count] = NULL; + if (tcp->tcp_connp->conn_reuseport != 0) + rg->tcprg_active--; + is_empty = (rg->tcprg_count == 0); + mutex_exit(&rg->tcprg_lock); + return (is_empty); +} + +void +tcp_rg_setactive(tcp_rg_t *rg, boolean_t is_active) +{ + mutex_enter(&rg->tcprg_lock); + if (is_active) { + rg->tcprg_active++; + } else { + rg->tcprg_active--; + } + mutex_exit(&rg->tcprg_lock); +} diff --git a/usr/src/uts/common/inet/tcp/tcp_opt_data.c b/usr/src/uts/common/inet/tcp/tcp_opt_data.c index 1a5363bedc..50d97b6ea2 100644 --- a/usr/src/uts/common/inet/tcp/tcp_opt_data.c +++ b/usr/src/uts/common/inet/tcp/tcp_opt_data.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright 2016 Joyent, Inc. 
*/ #include <sys/types.h> @@ -62,7 +63,8 @@ opdes_t tcp_opt_arr[] = { { SO_USELOOPBACK, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { SO_BROADCAST, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, -{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_REUSEADDR, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, +{ SO_REUSEPORT, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { SO_OOBINLINE, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, { SO_TYPE, SOL_SOCKET, OA_R, OA_R, OP_NP, 0, sizeof (int), 0 }, { SO_SNDBUF, SOL_SOCKET, OA_RW, OA_RW, OP_NP, 0, sizeof (int), 0 }, @@ -484,6 +486,104 @@ tcp_opt_get(conn_t *connp, int level, int name, uchar_t *ptr) } /* + * Set a TCP connection's participation in SO_REUSEPORT. This operation is + * performed under the protection of the squeue via tcp_setsockopt. + * The manipulation of tcp_rg_bind, as part of this operation, is subject to + * these constraints: + * 1. Prior to bind(), tcp_rg_bind can be set/cleared in tcp_set_reuseport + * under the protection of the squeue. + * 2. Once the connection has been bound, the tcp_rg_bind pointer must not be + * altered until such time as tcp_free() cleans up the connection. + * 3. A connection undergoing bind, which matches to a connection participating + * in port-reuse, will switch its tcp_rg_bind pointer when it joins the + * group of an existing connection in tcp_bindi(). + */ +static int +tcp_set_reuseport(conn_t *connp, boolean_t do_enable) +{ + tcp_t *tcp = connp->conn_tcp; + struct tcp_rg_s *rg; + + if (!IPCL_IS_NONSTR(connp)) { + if (do_enable) { + /* + * SO_REUSEPORT cannot be enabled on sockets which have + * fallen back to the STREAMS API. + */ + return (EINVAL); + } else { + /* + * A connection with SO_REUSEPORT enabled should be + * prevented from falling back to STREAMS mode via + * logic in tcp_fallback. It is legal, however, for + * fallen-back connections to affirm the disabled state + * of SO_REUSEPORT. 
+ */ + ASSERT(connp->conn_reuseport == 0); + return (0); + } + } + if (tcp->tcp_state <= TCPS_CLOSED) { + return (EINVAL); + } + if (connp->conn_reuseport == 0 && do_enable) { + /* disabled -> enabled */ + if (tcp->tcp_rg_bind != NULL) { + tcp_rg_setactive(tcp->tcp_rg_bind, do_enable); + } else { + /* + * Connection state is not a concern when initially + * populating tcp_rg_bind. Setting it to non-NULL on a + * bound or listening connection would only mean that + * new reused-port binds become a possibility. + */ + if ((rg = tcp_rg_init(tcp)) == NULL) { + return (ENOMEM); + } + tcp->tcp_rg_bind = rg; + } + connp->conn_reuseport = 1; + } else if (connp->conn_reuseport != 0 && !do_enable) { + /* enabled -> disabled */ + ASSERT(tcp->tcp_rg_bind != NULL); + if (tcp->tcp_state == TCPS_IDLE) { + /* + * If the connection has not been bound yet, discard + * the reuse group state. Since disabling SO_REUSEPORT + * on a bound socket will _not_ prevent others from + * reusing the port, the presence of tcp_rg_bind is + * used to determine reuse availability, not + * conn_reuseport. + * + * This allows proper behavior for examples such as: + * + * setsockopt(fd1, ... SO_REUSEPORT, &on_val...); + * bind(fd1, &myaddr, ...); + * setsockopt(fd1, ... SO_REUSEPORT, &off_val...); + * + * setsockopt(fd2, ... SO_REUSEPORT, &on_val...); + * bind(fd2, &myaddr, ...); // <- SHOULD SUCCEED + * + */ + rg = tcp->tcp_rg_bind; + tcp->tcp_rg_bind = NULL; + VERIFY(tcp_rg_remove(rg, tcp)); + tcp_rg_destroy(rg); + } else { + /* + * If a connection has been bound, it's no longer safe + * to manipulate tcp_rg_bind until connection clean-up + * during tcp_free. Just mark the member status of the + * connection as inactive. + */ + tcp_rg_setactive(tcp->tcp_rg_bind, do_enable); + } + connp->conn_reuseport = 0; + } + return (0); +} + +/* * We declare as 'int' rather than 'void' to satisfy pfi_t arg requirements. * Parameters are assumed to be verified by the caller. 
*/ @@ -653,6 +753,11 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, } *outlenp = inlen; return (0); + case SO_REUSEPORT: + if (!checkonly) { + return (tcp_set_reuseport(connp, *i1 != 0)); + } + return (0); } break; case IPPROTO_TCP: @@ -769,14 +874,37 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, if (*i1 == 0) { return (EINVAL); } else if (tcp->tcp_ka_rinterval == 0) { - if ((tcp->tcp_ka_abort_thres / *i1) < - tcp->tcp_rto_min || - (tcp->tcp_ka_abort_thres / *i1) > - tcp->tcp_rto_max) - return (EINVAL); + /* + * When TCP_KEEPCNT is specified without first + * specifying a TCP_KEEPINTVL, we infer an + * interval based on a tunable specific to our + * stack: the tcp_keepalive_abort_interval. + * (Or the TCP_KEEPALIVE_ABORT_THRESHOLD, in + * the unlikely event that that has been set.) + * Given the abort interval's default value of + * 480 seconds, low TCP_KEEPCNT values can + * result in intervals that exceed the default + * maximum RTO of 60 seconds. Rather than + * fail in these cases, we (implicitly) clamp + * the interval at the maximum RTO; if the + * TCP_KEEPCNT is shortly followed by a + * TCP_KEEPINTVL (as we expect), the abort + * threshold will be recalculated correctly -- + * and if a TCP_KEEPINTVL is not forthcoming, + * keep-alive will at least operate reasonably + * given the underconfigured state. 
+ */ + uint32_t interval; + + interval = tcp->tcp_ka_abort_thres / *i1; - tcp->tcp_ka_rinterval = - tcp->tcp_ka_abort_thres / *i1; + if (interval < tcp->tcp_rto_min) + interval = tcp->tcp_rto_min; + + if (interval > tcp->tcp_rto_max) + interval = tcp->tcp_rto_max; + + tcp->tcp_ka_rinterval = interval; } else { if ((*i1 * tcp->tcp_ka_rinterval) < tcps->tcps_keepalive_abort_interval_low || @@ -953,10 +1081,6 @@ tcp_opt_set(conn_t *connp, uint_t optset_context, int level, int name, } break; case IPPROTO_IP: - if (connp->conn_family != AF_INET) { - *outlenp = 0; - return (EINVAL); - } switch (name) { case IP_SEC_OPT: /* diff --git a/usr/src/uts/common/inet/tcp/tcp_socket.c b/usr/src/uts/common/inet/tcp/tcp_socket.c index a431bf63d1..8f535a5dd1 100644 --- a/usr/src/uts/common/inet/tcp/tcp_socket.c +++ b/usr/src/uts/common/inet/tcp/tcp_socket.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Joyent, Inc. */ /* This file contains all TCP kernel socket related functions. */ @@ -1022,6 +1023,16 @@ tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q, } /* + * Do not allow fallback on connections making use of SO_REUSEPORT. + */ + if (tcp->tcp_rg_bind != NULL) { + freeb(stropt_mp); + freeb(ordrel_mp); + squeue_synch_exit(connp); + return (EINVAL); + } + + /* * Both endpoints must be of the same type (either STREAMS or * non-STREAMS) for fusion to be enabled. So if we are fused, * we have to unfuse. diff --git a/usr/src/uts/common/inet/tcp_impl.h b/usr/src/uts/common/inet/tcp_impl.h index ab7ffa4594..cb83b91fad 100644 --- a/usr/src/uts/common/inet/tcp_impl.h +++ b/usr/src/uts/common/inet/tcp_impl.h @@ -61,9 +61,9 @@ extern sock_downcalls_t sock_tcp_downcalls; * by setting it to 0. 
*/ #define TCP_XMIT_LOWATER 4096 -#define TCP_XMIT_HIWATER 49152 +#define TCP_XMIT_HIWATER 128000 #define TCP_RECV_LOWATER 2048 -#define TCP_RECV_HIWATER 128000 +#define TCP_RECV_HIWATER 1048576 /* * Bind hash list size and has function. It has to be a power of 2 for @@ -406,6 +406,22 @@ typedef struct tcp_listen_cnt_s { uint32_t tlc_drop; } tcp_listen_cnt_t; +/* + * Track tcp_t entities bound to the same port/address tuple via SO_REUSEPORT. + * - tcprg_lock: Protects the other fields + * - tcprg_size: Allocated size (in entries) of tcprg_members array + * - tcprg_count: Count of occupied tcprg_members slots + * - tcprg_active: Count of members which still have SO_REUSEPORT set + * - tcprg_members: Connections associated with address/port group + */ +typedef struct tcp_rg_s { + kmutex_t tcprg_lock; + unsigned int tcprg_size; + unsigned int tcprg_count; + unsigned int tcprg_active; + tcp_t **tcprg_members; +} tcp_rg_t; + #define TCP_TLC_REPORT_INTERVAL (30 * MINUTES) #define TCP_DECR_LISTEN_CNT(tcp) \ @@ -649,6 +665,10 @@ extern in_port_t tcp_bindi(tcp_t *, in_port_t, const in6_addr_t *, int, boolean_t, boolean_t, boolean_t); extern in_port_t tcp_update_next_port(in_port_t, const tcp_t *, boolean_t); +extern tcp_rg_t *tcp_rg_init(tcp_t *); +extern boolean_t tcp_rg_remove(tcp_rg_t *, tcp_t *); +extern void tcp_rg_destroy(tcp_rg_t *); +extern void tcp_rg_setactive(tcp_rg_t *, boolean_t); /* * Fusion related functions in tcp_fusion.c. diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c index 5a15aea4de..a88bac932c 100644 --- a/usr/src/uts/common/inet/udp/udp.c +++ b/usr/src/uts/common/inet/udp/udp.c @@ -22,6 +22,7 @@ * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2013 Nexenta Systems, Inc. All rights reserved. * Copyright 2014, OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ /* Copyright (c) 1990 Mentat Inc. 
*/ @@ -76,7 +77,8 @@ #include <inet/ipclassifier.h> #include <sys/squeue_impl.h> #include <inet/ipnet.h> -#include <sys/ethernet.h> +#include <sys/vxlan.h> +#include <inet/inet_hash.h> #include <sys/tsol/label.h> #include <sys/tsol/tnet.h> @@ -346,6 +348,89 @@ void (*cl_inet_unbind)(netstackid_t stack_id, uint8_t protocol, typedef union T_primitives *t_primp_t; /* + * Various protocols that encapsulate UDP have no real use for the source port. + * Instead, they want to vary the source port to provide better equal-cost + * multipathing and other systems that use fanout. Consider something like + * VXLAN. If you're actually sending multiple different streams to a single + * host, if you don't vary the source port, then the tuple of ( SRC IP, DST IP, + * SRC Port, DST Port) will always be the same. + * + * Here, we return a port to hash this to, if we know how to hash it. If for + * some reason we can't perform an L4 hash, then we just return the default + * value, usually the default port. After we determine the hash we transform it + * so that it's in the range of [ min, max ]. + * + * We'd like to avoid a pull up for the sake of performing the hash. If the + * first mblk_t doesn't have the full protocol header, then we just send it to + * the default. If for some reason we have an encapsulated packet that has its + * protocol header in different parts of an mblk_t, then we'll go with the + * default port. This means that if a driver isn't consistent about how it + * generates the frames for a given flow, it will not always be consistently + * hashed. That should be an uncommon event. 
+ */ +uint16_t +udp_srcport_hash(mblk_t *mp, int type, uint16_t min, uint16_t max, + uint16_t def) +{ + size_t szused = 0; + struct ether_header *ether; + struct ether_vlan_header *vether; + ip6_t *ip6h; + ipha_t *ipha; + uint16_t sap; + uint64_t hash; + uint32_t mod; + + ASSERT(min <= max); + + if (type != UDP_HASH_VXLAN) + return (def); + + if (!IS_P2ALIGNED(mp->b_rptr, sizeof (uint16_t))) + return (def); + + /* + * The following logic is VXLAN specific to get at the header, if we + * have formats, eg. GENEVE, then we should ignore this. + * + * The kernel overlay device often puts a first mblk_t for the data + * which is just the encap. If so, then we're going to use that and try + * to avoid a pull up. + */ + if (MBLKL(mp) == VXLAN_HDR_LEN) { + if (mp->b_cont == NULL) + return (def); + mp = mp->b_cont; + ether = (struct ether_header *)mp->b_rptr; + } else if (MBLKL(mp) < VXLAN_HDR_LEN) { + return (def); + } else { + szused = VXLAN_HDR_LEN; + ether = (struct ether_header *)((uintptr_t)mp->b_rptr + szused); + } + + /* Can we hold a MAC header? */ + if (MBLKL(mp) + szused < sizeof (struct ether_header)) + return (def); + + /* + * We need to lie about the starting offset into the message block for + * convenience. Undo it at the end. We know that inet_pkt_hash() won't + * modify the mblk_t. + */ + mp->b_rptr += szused; + hash = inet_pkt_hash(DL_ETHER, mp, INET_PKT_HASH_L2 | + INET_PKT_HASH_L3 | INET_PKT_HASH_L4); + mp->b_rptr -= szused; + + if (hash == 0) + return (def); + + mod = max - min + 1; + return ((hash % mod) + min); +} + +/* * Return the next anonymous port in the privileged port range for * bind checking. * @@ -1583,6 +1668,16 @@ udp_opt_get(conn_t *connp, t_scalar_t level, t_scalar_t name, *i1 = udp->udp_rcvhdr ? 
1 : 0; mutex_exit(&connp->conn_lock); return (sizeof (int)); + case UDP_SRCPORT_HASH: + mutex_enter(&connp->conn_lock); + *i1 = udp->udp_vxlanhash; + mutex_exit(&connp->conn_lock); + return (sizeof (int)); + case UDP_SND_TO_CONNECTED: + mutex_enter(&connp->conn_lock); + *i1 = udp->udp_snd_to_conn ? 1 : 0; + mutex_exit(&connp->conn_lock); + return (sizeof (int)); } } mutex_enter(&connp->conn_lock); @@ -1718,6 +1813,31 @@ udp_do_opt_set(conn_opt_arg_t *coa, int level, int name, udp->udp_rcvhdr = onoff; mutex_exit(&connp->conn_lock); return (0); + case UDP_SRCPORT_HASH: + /* + * This should have already been verified, but double + * check. + */ + if ((error = secpolicy_ip_config(cr, B_FALSE)) != 0) { + return (error); + } + + /* First see if the val is something we understand */ + if (*i1 != UDP_HASH_DISABLE && *i1 != UDP_HASH_VXLAN) + return (EINVAL); + + if (!checkonly) { + mutex_enter(&connp->conn_lock); + udp->udp_vxlanhash = *i1; + mutex_exit(&connp->conn_lock); + } + /* Fully handled this option. */ + return (0); + case UDP_SND_TO_CONNECTED: + mutex_enter(&connp->conn_lock); + udp->udp_snd_to_conn = onoff; + mutex_exit(&connp->conn_lock); + return (0); } break; } @@ -2001,13 +2121,25 @@ udp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp, uint32_t cksum; udp_t *udp = connp->conn_udp; boolean_t insert_spi = udp->udp_nat_t_endpoint; + boolean_t hash_srcport = udp->udp_vxlanhash; uint_t ulp_hdr_len; + uint16_t srcport; data_len = msgdsize(data_mp); ulp_hdr_len = UDPH_SIZE; if (insert_spi) ulp_hdr_len += sizeof (uint32_t); + /* + * If we have source port hashing going on, determine the hash before + * we modify the mblk_t. 
+ */ + if (hash_srcport == B_TRUE) { + srcport = udp_srcport_hash(mp, UDP_HASH_VXLAN, + IPPORT_DYNAMIC_MIN, IPPORT_DYNAMIC_MAX, + ntohs(connp->conn_lport)); + } + mp = conn_prepend_hdr(ixa, ipp, v6src, v6dst, IPPROTO_UDP, flowinfo, ulp_hdr_len, data_mp, data_len, us->us_wroff_extra, &cksum, errorp); if (mp == NULL) { @@ -2019,7 +2151,11 @@ udp_prepend_hdr(conn_t *connp, ip_xmit_attr_t *ixa, const ip_pkt_t *ipp, ixa->ixa_pktlen = data_len + ixa->ixa_ip_hdr_length; udpha = (udpha_t *)(mp->b_rptr + ixa->ixa_ip_hdr_length); - udpha->uha_src_port = connp->conn_lport; + if (hash_srcport == B_TRUE) { + udpha->uha_src_port = htons(srcport); + } else { + udpha->uha_src_port = connp->conn_lport; + } udpha->uha_dst_port = dstport; udpha->uha_checksum = 0; udpha->uha_length = htons(data_len); @@ -3194,6 +3330,7 @@ udp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp, udp_t *udp = connp->conn_udp; udp_stack_t *us = udp->udp_us; boolean_t insert_spi = udp->udp_nat_t_endpoint; + boolean_t hash_srcport = udp->udp_vxlanhash; uint_t pktlen; uint_t alloclen; uint_t copylen; @@ -3202,10 +3339,21 @@ udp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp, udpha_t *udpha; uint32_t cksum; ip_pkt_t *ipp; + uint16_t srcport; ASSERT(MUTEX_HELD(&connp->conn_lock)); /* + * If we have source port hashing going on, determine the hash before + * we modify the mblk_t. 
+ */ + if (hash_srcport == B_TRUE) { + srcport = udp_srcport_hash(mp, UDP_HASH_VXLAN, + IPPORT_DYNAMIC_MIN, IPPORT_DYNAMIC_MAX, + ntohs(connp->conn_lport)); + } + + /* * Copy the header template and leave space for an SPI */ copylen = connp->conn_ht_iphc_len; @@ -3303,6 +3451,9 @@ udp_prepend_header_template(conn_t *connp, ip_xmit_attr_t *ixa, mblk_t *mp, *((uint32_t *)(udpha + 1)) = 0; udpha->uha_dst_port = dstport; + if (hash_srcport == B_TRUE) + udpha->uha_src_port = htons(srcport); + return (mp); } @@ -5947,10 +6098,18 @@ udp_send(sock_lower_handle_t proto_handle, mblk_t *mp, struct nmsghdr *msg, else return (error); } - if (udp->udp_state == TS_DATA_XFER) { + + /* + * Check if we're allowed to send to a connection on which we've + * already called 'connect'. The posix spec. allows both behaviors but + * historically we've returned an error if already connected. The + * client can allow this via a sockopt. + */ + if (udp->udp_state == TS_DATA_XFER && !udp->udp_snd_to_conn) { UDPS_BUMP_MIB(us, udpOutErrors); return (EISCONN); } + error = proto_verify_ip_addr(connp->conn_family, (struct sockaddr *)msg->msg_name, msg->msg_namelen); if (error != 0) { diff --git a/usr/src/uts/common/inet/udp/udp_opt_data.c b/usr/src/uts/common/inet/udp/udp_opt_data.c index c279bb4a21..847e2cdde6 100644 --- a/usr/src/uts/common/inet/udp/udp_opt_data.c +++ b/usr/src/uts/common/inet/udp/udp_opt_data.c @@ -21,6 +21,7 @@ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2015, Joyent, Inc. 
*/ #include <sys/types.h> @@ -292,6 +293,9 @@ opdes_t udp_opt_arr[] = { }, { UDP_NAT_T_ENDPOINT, IPPROTO_UDP, OA_RW, OA_RW, OP_PRIVPORT, 0, sizeof (int), 0 }, +{ UDP_SRCPORT_HASH, IPPROTO_UDP, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), 0 }, +{ UDP_SND_TO_CONNECTED, IPPROTO_UDP, OA_R, OA_RW, OP_CONFIG, 0, sizeof (int), + 0 } }; /* diff --git a/usr/src/uts/common/inet/udp_impl.h b/usr/src/uts/common/inet/udp_impl.h index 6a31ce5c22..ebba10c0f7 100644 --- a/usr/src/uts/common/inet/udp_impl.h +++ b/usr/src/uts/common/inet/udp_impl.h @@ -20,6 +20,7 @@ */ /* * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. */ #ifndef _UDP_IMPL_H @@ -178,8 +179,12 @@ typedef struct udp_s { udp_issocket : 1, /* socket mode; sockfs is on top */ udp_nat_t_endpoint : 1, /* UDP_NAT_T_ENDPOINT option */ udp_rcvhdr : 1, /* UDP_RCVHDR option */ + udp_vxlanhash: 1, /* UDP_SRCPORT_HASH option */ + /* Because there's only VXLAN, cheat */ + /* and only use a single bit */ + udp_snd_to_conn: 1, /* UDP_SND_TO_CONNECTED option */ - udp_pad_to_bit_31 : 29; + udp_pad_to_bit_31 : 27; /* Following 2 fields protected by the uf_lock */ struct udp_s *udp_bind_hash; /* Bind hash chain */ |