Diffstat (limited to 'usr/src/uts/common/inet')
-rw-r--r-- | usr/src/uts/common/inet/ip.h                 |   12
-rw-r--r-- | usr/src/uts/common/inet/ip/ip_attr.c         |  112
-rw-r--r-- | usr/src/uts/common/inet/ip/ip_dce.c          |  123
-rw-r--r-- | usr/src/uts/common/inet/ip/ip_tunables.c     |    6
-rw-r--r-- | usr/src/uts/common/inet/ip/ipsecesp.c        |    3
-rw-r--r-- | usr/src/uts/common/inet/ip_stack.h           |    1
-rw-r--r-- | usr/src/uts/common/inet/ipd/ipd.c            | 1226
-rw-r--r-- | usr/src/uts/common/inet/ipd/ipd.conf         |   27
-rw-r--r-- | usr/src/uts/common/inet/ipf/ip_fil_solaris.c |    8
-rw-r--r-- | usr/src/uts/common/inet/squeue.c             |   24
-rw-r--r-- | usr/src/uts/common/inet/tcp/tcp.c            |    6
-rw-r--r-- | usr/src/uts/common/inet/tcp/tcp_stats.c      |   20
-rw-r--r-- | usr/src/uts/common/inet/tcp_stack.h          |    3
13 files changed, 1522 insertions, 49 deletions
diff --git a/usr/src/uts/common/inet/ip.h b/usr/src/uts/common/inet/ip.h index 42adb4c451..bd50364310 100644 --- a/usr/src/uts/common/inet/ip.h +++ b/usr/src/uts/common/inet/ip.h @@ -21,6 +21,7 @@ /* * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright (c) 1990 Mentat Inc. */ @@ -2195,6 +2196,8 @@ struct ip_xmit_attr_s { */ ixa_notify_t ixa_notify; /* Registered upcall notify function */ void *ixa_notify_cookie; /* ULP cookie for ixa_notify */ + + uint_t ixa_tcpcleanup; /* Used by conn_ixa_cleanup */ }; /* @@ -2266,6 +2269,14 @@ struct ip_xmit_attr_s { #define IXA_FREE_TSL 0x00000002 /* ixa_tsl needs to be rele */ /* + * Trivial state machine used to synchronize IXA cleanup for TCP connections. + * See conn_ixa_cleanup(). + */ +#define IXATC_IDLE 0x00000000 +#define IXATC_INPROGRESS 0x00000001 +#define IXATC_COMPLETE 0x00000002 + +/* * Simplistic way to set the ixa_xmit_hint for locally generated traffic * and forwarded traffic. The shift amount are based on the size of the * structs to discard the low order bits which don't have much if any variation @@ -3030,6 +3041,7 @@ extern vmem_t *ip_minor_arena_la; #define ips_ip_strict_src_multihoming ips_propinfo_tbl[80].prop_cur_uval #define ips_ipv6_strict_src_multihoming ips_propinfo_tbl[81].prop_cur_uval #define ips_ipv6_drop_inbound_icmpv6 ips_propinfo_tbl[82].prop_cur_bval +#define ips_ip_dce_reclaim_threshold ips_propinfo_tbl[83].prop_cur_uval extern int dohwcksum; /* use h/w cksum if supported by the h/w */ #ifdef ZC_TEST diff --git a/usr/src/uts/common/inet/ip/ip_attr.c b/usr/src/uts/common/inet/ip/ip_attr.c index 3197858f8e..e040af14ba 100644 --- a/usr/src/uts/common/inet/ip/ip_attr.c +++ b/usr/src/uts/common/inet/ip/ip_attr.c @@ -909,6 +909,11 @@ ixa_safe_copy(ip_xmit_attr_t *src, ip_xmit_attr_t *ixa) */ if (ixa->ixa_free_flags & IXA_FREE_CRED) crhold(ixa->ixa_cred); + + /* + * There is no cleanup in progress on this new copy. + */ + ixa->ixa_tcpcleanup = IXATC_IDLE; } /* @@ -1176,6 +1181,59 @@ ixa_cleanup_stale(ip_xmit_attr_t *ixa) } } +static mblk_t * +tcp_ixa_cleanup_getmblk(conn_t *connp) +{ + tcp_stack_t *tcps = connp->conn_netstack->netstack_tcp; + int need_retry; + mblk_t *mp; + + mutex_enter(&tcps->tcps_ixa_cleanup_lock); + + /* + * It's possible that someone else came in and started cleaning up + * another connection between the time we verified this one is not being + * cleaned up and the time we actually get the shared mblk. If that's + * the case, we've dropped the lock, and some other thread may have + * cleaned up this connection again, and is still waiting for + * notification of that cleanup's completion. Therefore we need to + * recheck. + */ + do { + need_retry = 0; + while (connp->conn_ixa->ixa_tcpcleanup != IXATC_IDLE) { + cv_wait(&tcps->tcps_ixa_cleanup_done_cv, + &tcps->tcps_ixa_cleanup_lock); + } + + while ((mp = tcps->tcps_ixa_cleanup_mp) == NULL) { + /* + * Multiple concurrent cleanups; need to have the last + * one run since it could be an unplumb. + */ + need_retry = 1; + cv_wait(&tcps->tcps_ixa_cleanup_ready_cv, + &tcps->tcps_ixa_cleanup_lock); + } + } while (need_retry); + + /* + * We now have the lock and the mblk; now make sure that no one else can + * try to clean up this connection or enqueue it for cleanup, clear the + * mblk pointer for this stack, drop the lock, and return the mblk. 
+ */ + ASSERT(MUTEX_HELD(&tcps->tcps_ixa_cleanup_lock)); + ASSERT(connp->conn_ixa->ixa_tcpcleanup == IXATC_IDLE); + ASSERT(tcps->tcps_ixa_cleanup_mp == mp); + ASSERT(mp != NULL); + + connp->conn_ixa->ixa_tcpcleanup = IXATC_INPROGRESS; + tcps->tcps_ixa_cleanup_mp = NULL; + mutex_exit(&tcps->tcps_ixa_cleanup_lock); + + return (mp); +} + /* * Used to run ixa_cleanup_stale inside the tcp squeue. * When done we hand the mp back by assigning it to tcps_ixa_cleanup_mp @@ -1195,11 +1253,39 @@ tcp_ixa_cleanup(void *arg, mblk_t *mp, void *arg2, mutex_enter(&tcps->tcps_ixa_cleanup_lock); ASSERT(tcps->tcps_ixa_cleanup_mp == NULL); + connp->conn_ixa->ixa_tcpcleanup = IXATC_COMPLETE; tcps->tcps_ixa_cleanup_mp = mp; - cv_signal(&tcps->tcps_ixa_cleanup_cv); + cv_signal(&tcps->tcps_ixa_cleanup_ready_cv); + /* + * It is possible for any number of threads to be waiting for cleanup of + * different connections. Absent a per-connection (or per-IXA) CV, we + * need to wake them all up even though only one can be waiting on this + * particular cleanup. + */ + cv_broadcast(&tcps->tcps_ixa_cleanup_done_cv); mutex_exit(&tcps->tcps_ixa_cleanup_lock); } +static void +tcp_ixa_cleanup_wait_and_finish(conn_t *connp) +{ + tcp_stack_t *tcps = connp->conn_netstack->netstack_tcp; + + mutex_enter(&tcps->tcps_ixa_cleanup_lock); + + ASSERT(connp->conn_ixa->ixa_tcpcleanup != IXATC_IDLE); + + while (connp->conn_ixa->ixa_tcpcleanup == IXATC_INPROGRESS) { + cv_wait(&tcps->tcps_ixa_cleanup_done_cv, + &tcps->tcps_ixa_cleanup_lock); + } + + ASSERT(connp->conn_ixa->ixa_tcpcleanup == IXATC_COMPLETE); + connp->conn_ixa->ixa_tcpcleanup = IXATC_IDLE; + cv_broadcast(&tcps->tcps_ixa_cleanup_done_cv); + + mutex_exit(&tcps->tcps_ixa_cleanup_lock); +} /* * ipcl_walk() function to help release any IRE, NCE, or DCEs that @@ -1214,21 +1300,8 @@ conn_ixa_cleanup(conn_t *connp, void *arg) if (IPCL_IS_TCP(connp)) { mblk_t *mp; - tcp_stack_t *tcps; - - tcps = connp->conn_netstack->netstack_tcp; - mutex_enter(&tcps->tcps_ixa_cleanup_lock); - while ((mp = tcps->tcps_ixa_cleanup_mp) == NULL) { - /* - * Multiple concurrent cleanups; need to have the last - * one run since it could be an unplumb. - */ - cv_wait(&tcps->tcps_ixa_cleanup_cv, - &tcps->tcps_ixa_cleanup_lock); - } - tcps->tcps_ixa_cleanup_mp = NULL; - mutex_exit(&tcps->tcps_ixa_cleanup_lock); + mp = tcp_ixa_cleanup_getmblk(connp); if (connp->conn_sqp->sq_run == curthread) { /* Already on squeue */ @@ -1237,15 +1310,8 @@ conn_ixa_cleanup(conn_t *connp, void *arg) CONN_INC_REF(connp); SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_ixa_cleanup, connp, NULL, SQ_PROCESS, SQTAG_TCP_IXA_CLEANUP); - - /* Wait until tcp_ixa_cleanup has run */ - mutex_enter(&tcps->tcps_ixa_cleanup_lock); - while (tcps->tcps_ixa_cleanup_mp == NULL) { - cv_wait(&tcps->tcps_ixa_cleanup_cv, - &tcps->tcps_ixa_cleanup_lock); - } - mutex_exit(&tcps->tcps_ixa_cleanup_lock); } + tcp_ixa_cleanup_wait_and_finish(connp); } else if (IPCL_IS_SCTP(connp)) { sctp_t *sctp; sctp_faddr_t *fp; diff --git a/usr/src/uts/common/inet/ip/ip_dce.c b/usr/src/uts/common/inet/ip/ip_dce.c index 215bc4675f..502ee8a735 100644 --- a/usr/src/uts/common/inet/ip/ip_dce.c +++ b/usr/src/uts/common/inet/ip/ip_dce.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
*/ #include <sys/types.h> @@ -32,6 +33,7 @@ #include <sys/cmn_err.h> #include <sys/debug.h> #include <sys/atomic.h> +#include <sys/callb.h> #define _SUN_TPI_VERSION 2 #include <sys/tihdr.h> @@ -102,7 +104,19 @@ static void dce_delete_locked(dcb_t *, dce_t *); static void dce_make_condemned(dce_t *); static kmem_cache_t *dce_cache; +static kthread_t *dce_reclaim_thread; +static kmutex_t dce_reclaim_lock; +static kcondvar_t dce_reclaim_cv; +static int dce_reclaim_shutdown; +/* Global so it can be tuned in /etc/system. This must be a power of two. */ +uint_t ip_dce_hash_size = 1024; + +/* The time in seconds between executions of the IP DCE reclaim worker. */ +uint_t ip_dce_reclaim_interval = 60; + +/* The factor of the DCE threshold at which to start hard reclaims */ +uint_t ip_dce_reclaim_threshold_hard = 2; /* Operates on a uint64_t */ #define RANDOM_HASH(p) ((p) ^ ((p)>>16) ^ ((p)>>32) ^ ((p)>>48)) @@ -117,6 +131,11 @@ dcb_reclaim(dcb_t *dcb, ip_stack_t *ipst, uint_t fraction) uint_t fraction_pmtu = fraction*4; uint_t hash; dce_t *dce, *nextdce; + hrtime_t seed = gethrtime(); + uint_t retained = 0; + uint_t max = ipst->ips_ip_dce_reclaim_threshold; + + max *= ip_dce_reclaim_threshold_hard; rw_enter(&dcb->dcb_lock, RW_WRITER); for (dce = dcb->dcb_dce; dce != NULL; dce = nextdce) { @@ -132,13 +151,21 @@ dcb_reclaim(dcb_t *dcb, ip_stack_t *ipst, uint_t fraction) } else { mutex_exit(&dce->dce_lock); } - hash = RANDOM_HASH((uint64_t)(uintptr_t)dce); - if (dce->dce_flags & DCEF_PMTU) { - if (hash % fraction_pmtu != 0) - continue; - } else { - if (hash % fraction != 0) - continue; + + if (max == 0 || retained < max) { + hash = RANDOM_HASH((uint64_t)((uintptr_t)dce | seed)); + + if (dce->dce_flags & DCEF_PMTU) { + if (hash % fraction_pmtu != 0) { + retained++; + continue; + } + } else { + if (hash % fraction != 0) { + retained++; + continue; + } + } } IP_STAT(ipst, ip_dce_reclaim_deleted); @@ -175,17 +202,19 @@ ip_dce_reclaim_stack(ip_stack_t *ipst) } /* - * Called by the memory allocator subsystem directly, when the system - * is running low on memory. + * Called by dce_reclaim_worker() below, and no one else. Typically this will + * mean that the number of entries in the hash buckets has exceeded a tunable + * threshold. 
*/ -/* ARGSUSED */ -void -ip_dce_reclaim(void *args) +static void +ip_dce_reclaim(void) { netstack_handle_t nh; netstack_t *ns; ip_stack_t *ipst; + ASSERT(curthread == dce_reclaim_thread); + netstack_next_init(&nh); while ((ns = netstack_next(&nh)) != NULL) { /* @@ -196,26 +225,75 @@ ip_dce_reclaim(void *args) netstack_rele(ns); continue; } - ip_dce_reclaim_stack(ipst); + if (atomic_swap_uint(&ipst->ips_dce_reclaim_needed, 0) != 0) + ip_dce_reclaim_stack(ipst); netstack_rele(ns); } netstack_next_fini(&nh); } +/* ARGSUSED */ +static void +dce_reclaim_worker(void *arg) +{ + callb_cpr_t cprinfo; + + CALLB_CPR_INIT(&cprinfo, &dce_reclaim_lock, callb_generic_cpr, + "dce_reclaim_worker"); + + mutex_enter(&dce_reclaim_lock); + while (!dce_reclaim_shutdown) { + CALLB_CPR_SAFE_BEGIN(&cprinfo); + (void) cv_timedwait(&dce_reclaim_cv, &dce_reclaim_lock, + ddi_get_lbolt() + ip_dce_reclaim_interval * hz); + CALLB_CPR_SAFE_END(&cprinfo, &dce_reclaim_lock); + + if (dce_reclaim_shutdown) + break; + + mutex_exit(&dce_reclaim_lock); + ip_dce_reclaim(); + mutex_enter(&dce_reclaim_lock); + } + + ASSERT(MUTEX_HELD(&dce_reclaim_lock)); + dce_reclaim_thread = NULL; + dce_reclaim_shutdown = 0; + cv_broadcast(&dce_reclaim_cv); + CALLB_CPR_EXIT(&cprinfo); /* drops the lock */ + + thread_exit(); +} + void dce_g_init(void) { dce_cache = kmem_cache_create("dce_cache", - sizeof (dce_t), 0, NULL, NULL, ip_dce_reclaim, NULL, NULL, 0); + sizeof (dce_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + + mutex_init(&dce_reclaim_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&dce_reclaim_cv, NULL, CV_DEFAULT, NULL); + + dce_reclaim_thread = thread_create(NULL, 0, dce_reclaim_worker, + NULL, 0, &p0, TS_RUN, minclsyspri); } void dce_g_destroy(void) { + mutex_enter(&dce_reclaim_lock); + dce_reclaim_shutdown = 1; + cv_signal(&dce_reclaim_cv); + while (dce_reclaim_thread != NULL) + cv_wait(&dce_reclaim_cv, &dce_reclaim_lock); + mutex_exit(&dce_reclaim_lock); + + cv_destroy(&dce_reclaim_cv); + mutex_destroy(&dce_reclaim_lock); + kmem_cache_destroy(dce_cache); } - /* * Allocate a default DCE and a hash table for per-IP address DCEs */ @@ -234,7 +312,7 @@ dce_stack_init(ip_stack_t *ipst) ipst->ips_dce_default->dce_ipst = ipst; /* This must be a power of two since we are using IRE_ADDR_HASH macro */ - ipst->ips_dce_hashsize = 256; + ipst->ips_dce_hashsize = ip_dce_hash_size; ipst->ips_dce_hash_v4 = kmem_zalloc(ipst->ips_dce_hashsize * sizeof (dcb_t), KM_SLEEP); ipst->ips_dce_hash_v6 = kmem_zalloc(ipst->ips_dce_hashsize * @@ -414,6 +492,12 @@ dce_lookup_and_add_v4(ipaddr_t dst, ip_stack_t *ipst) hash = IRE_ADDR_HASH(dst, ipst->ips_dce_hashsize); dcb = &ipst->ips_dce_hash_v4[hash]; + /* + * Assuming that we get fairly even distribution across all of the + * buckets, once one bucket is overly full, prune the whole cache. 
+ */ + if (dcb->dcb_cnt > ipst->ips_ip_dce_reclaim_threshold) + atomic_or_uint(&ipst->ips_dce_reclaim_needed, 1); rw_enter(&dcb->dcb_lock, RW_WRITER); for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) { if (dce->dce_v4addr == dst) { @@ -447,6 +531,7 @@ dce_lookup_and_add_v4(ipaddr_t dst, ip_stack_t *ipst) dce->dce_ptpn = &dcb->dcb_dce; dcb->dcb_dce = dce; dce->dce_bucket = dcb; + atomic_add_32(&dcb->dcb_cnt, 1); dce_refhold(dce); /* For the caller */ rw_exit(&dcb->dcb_lock); @@ -476,6 +561,12 @@ dce_lookup_and_add_v6(const in6_addr_t *dst, uint_t ifindex, ip_stack_t *ipst) hash = IRE_ADDR_HASH_V6(*dst, ipst->ips_dce_hashsize); dcb = &ipst->ips_dce_hash_v6[hash]; + /* + * Assuming that we get fairly even distribution across all of the + * buckets, once one bucket is overly full, prune the whole cache. + */ + if (dcb->dcb_cnt > ipst->ips_ip_dce_reclaim_threshold) + atomic_or_uint(&ipst->ips_dce_reclaim_needed, 1); rw_enter(&dcb->dcb_lock, RW_WRITER); for (dce = dcb->dcb_dce; dce != NULL; dce = dce->dce_next) { if (IN6_ARE_ADDR_EQUAL(&dce->dce_v6addr, dst) && diff --git a/usr/src/uts/common/inet/ip/ip_tunables.c b/usr/src/uts/common/inet/ip/ip_tunables.c index 516d6c1a21..1e249b493e 100644 --- a/usr/src/uts/common/inet/ip/ip_tunables.c +++ b/usr/src/uts/common/inet/ip/ip_tunables.c @@ -20,6 +20,7 @@ */ /* * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. */ /* Copyright (c) 1990 Mentat Inc. */ @@ -908,6 +909,11 @@ mod_prop_info_t ip_propinfo_tbl[] = { #else { "", 0, NULL, NULL, {0}, {0} }, #endif + + { "_dce_reclaim_threshold", MOD_PROTO_IP, + mod_set_uint32, mod_get_uint32, + {1, 100000, 32}, {32} }, + { "mtu", MOD_PROTO_IPV4, NULL, ip_get_mtu, {0}, {0} }, { "mtu", MOD_PROTO_IPV6, NULL, ip_get_mtu, {0}, {0} }, diff --git a/usr/src/uts/common/inet/ip/ipsecesp.c b/usr/src/uts/common/inet/ip/ipsecesp.c index 47972a8c1a..96a0457678 100644 --- a/usr/src/uts/common/inet/ip/ipsecesp.c +++ b/usr/src/uts/common/inet/ip/ipsecesp.c @@ -234,8 +234,7 @@ esp_kstat_init(ipsecesp_stack_t *espstack, netstackid_t stackid) { espstack->esp_ksp = kstat_create_netstack("ipsecesp", 0, "esp_stat", "net", KSTAT_TYPE_NAMED, - sizeof (esp_kstats_t) / sizeof (kstat_named_t), - KSTAT_FLAG_PERSISTENT, stackid); + sizeof (esp_kstats_t) / sizeof (kstat_named_t), 0, stackid); if (espstack->esp_ksp == NULL || espstack->esp_ksp->ks_data == NULL) return (B_FALSE); diff --git a/usr/src/uts/common/inet/ip_stack.h b/usr/src/uts/common/inet/ip_stack.h index a564376cfb..706752b236 100644 --- a/usr/src/uts/common/inet/ip_stack.h +++ b/usr/src/uts/common/inet/ip_stack.h @@ -269,6 +269,7 @@ struct ip_stack { uint_t ips_dce_hashsize; struct dcb_s *ips_dce_hash_v4; struct dcb_s *ips_dce_hash_v6; + uint_t ips_dce_reclaim_needed; /* pending binds */ mblk_t *ips_ip6_asp_pending_ops; diff --git a/usr/src/uts/common/inet/ipd/ipd.c b/usr/src/uts/common/inet/ipd/ipd.c new file mode 100644 index 0000000000..a6a09b043e --- /dev/null +++ b/usr/src/uts/common/inet/ipd/ipd.c @@ -0,0 +1,1226 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * ipd: Internet packet disturber
+ *
+ * The purpose of ipd is to simulate congested and lossy networks when they
+ * don't actually exist. Such networks exhibit events that lead to
+ * retransmits and thus kick us out of the TCP/IP fastpath. Since normally
+ * this would require an actually congested network, which can be
+ * problematic, we instead simulate this behavior.
+ *
+ * 1. ipd's operations and restrictions
+ *
+ * ipd currently has facilities to cause IP traffic to be:
+ *
+ *   - Corrupted with some probability.
+ *   - Delayed for a set number of microseconds.
+ *   - Dropped with some probability.
+ *
+ * Each of these features is enabled on a per-zone basis. The current
+ * implementation restricts this specifically to exclusive stack zones.
+ * Enabling ipd on a given zone causes pfhooks to be installed for that
+ * zone's netstack. Because of the nature of ipd, it currently only supports
+ * exclusive stack zones and, as a further restriction, it only allows the
+ * global zone administrative access. ipd can be enabled for the global
+ * zone, but doing so will cause all shared-stack zones to also be affected.
+ *
+ * 2. General architecture and locking
+ *
+ * ipd consists of a few components. There is a per-netstack data structure
+ * that is created and destroyed with the creation and destruction of each
+ * exclusive stack zone. Each of these netstacks is stored in a global list
+ * which is accessed for control of ipd via ioctls. The following diagram
+ * touches on the data structures that are used throughout ipd.
+ *
+ *      ADMINISTRATIVE                         DATA PATH
+ *
+ *      +--------+                      +------+      +------+
+ *      | ipdadm |                      |  ip  |      | nics |
+ *      +--------+                      +------+      +------+
+ *        |    ^                            |            |
+ *        |    | ioctl(2)                   |            |
+ *        V    |                            V            V
+ *      +----------+              +-------------------------+
+ *      | /dev/ipd |              | pfhooks packet callback |  == ipd_hook()
+ *      +----------+              +-------------------------+
+ *            |                                 |
+ *            |                                 |
+ *            V                                 |
+ *      +----------------+                      |
+ *      | list_t ipd_nsl |------+               |
+ *      +----------------+      |               |
+ *                              | per netstack  |
+ *                              V               V
+ *                      +----------------------------+
+ *                      |       ipd_netstack_t       |
+ *                      +----------------------------+
+ *
+ * ipd has two different entry points, one administrative, the other the
+ * data path. The administrative path is accessed by a userland component
+ * called ipdadm(1M). It communicates with the kernel component via ioctls
+ * to /dev/ipd. If the administrative path enables a specific zone, then the
+ * data path will become active for that zone. Any packet that leaves that
+ * zone's IP stack or is going to enter it comes through the callback
+ * specified in the hook_t(9S) structure. This will cause each packet to go
+ * through ipd_hook().
+ *
+ * While the locking inside of ipd should be straightforward, unfortunately,
+ * the pfhooks subsystem necessarily complicates this a little bit. There
+ * are currently three different sets of locks in ipd.
+ *
+ *   - Global lock N on the netstack list.
+ *   - Global lock A on the active count.
+ *   - Per-netstack data structure lock Z.
+ *
+ * # Locking rules
+ *
+ * L.1a N must always be acquired first and released last.
+ *
+ * If you need to acquire the netstack list lock, either for reading or
+ * writing, then N must be acquired first and before any other locks. It may
+ * not be dropped before any other lock.
+ *
+ * L.1b N must only be acquired from the administrative path and the zone
+ * creation, shutdown, and destruct callbacks.
+ *
+ * The data path, i.e. the per-packet callbacks, should never grab the list
+ * lock. If it does, then the architecture here needs to be reconsidered.
+ *
+ * L.2 Z cannot be held across calls to the pfhooks subsystem if packet
+ * hooks are active.
+ *
+ * The way the pfhooks subsystem is designed is that a reference count is
+ * present on the hook_t while it is active. As long as that reference count
+ * is non-zero, a call to net_hook_unregister will block until it is
+ * lowered. Because the callbacks want the same per-netstack lock that is
+ * held by the administrative path calling into net_hook_unregister, we
+ * deadlock:
+ *
+ *  ioctl from ipdadm remove      hook_t cb (from nic)   hook_t cb (from IP)
+ *  ------------------------      --------------------   -------------------
+ *             |                            |                      |
+ *             |                  bump hook_t refcount             |
+ *  mutex_enter(ipd_nsl_lock);      enter ipd_hook()     bump hook_t refcount
+ *  mutex acquired        mutex_enter(ins->ipdn_lock);             |
+ *             |                    mutex acquired         enter ipd_hook()
+ *  mutex_enter(ins->ipdn_lock);            |    mutex_enter(ins->ipdn_lock);
+ *             |                            |                      |
+ *             |                            |                      |
+ *             |           mutex_exit(ins->ipdn_lock);             |
+ *             |                            |                      |
+ *      mutex acquired             leave ipd_hook()                |
+ *             |              decrement hook_t refcount            |
+ *             |                            |                      |
+ *  ipd_teardown_hooks()                    |                      |
+ *  net_hook_unregister()                   |                      |
+ *  cv_wait() if refcount                   |                      |
+ *             |                            |                      |
+ *  ------------------------------------------------------------------------
+ *
+ * At this point, we can see that the second hook callback still doesn't
+ * have the mutex, but it has bumped the hook_t refcount. However, it will
+ * never acquire the mutex that it needs to finish its operation and
+ * decrement the refcount.
+ *
+ * Obviously, deadlocking is not acceptable, thus the following corollary to
+ * the second locking rule:
+ *
+ * L.2 Corollary: If Z is being released across a call to the pfhooks
+ * subsystem, N must be held.
+ *
+ * There is currently only one path where we have to worry about this: when
+ * we are removing a hook while the zone is not being shut down, i.e. while
+ * hooks are currently active. The only place that this currently happens is
+ * in ipd_check_hooks().
+ *
+ */
+
+#include <sys/types.h>
+#include <sys/kmem.h>
+#include <sys/conf.h>
+#include <sys/stat.h>
+#include <sys/cmn_err.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/modctl.h>
+#include <sys/kstat.h>
+#include <sys/neti.h>
+#include <sys/list.h>
+#include <sys/ksynch.h>
+#include <sys/sysmacros.h>
+#include <sys/policy.h>
+#include <sys/atomic.h>
+#include <sys/model.h>
+#include <sys/strsun.h>
+
+#include <sys/netstack.h>
+#include <sys/hook.h>
+#include <sys/hook_event.h>
+
+#include <sys/ipd.h>
+
+#define	IPDN_STATUS_DISABLED	0x1
+#define	IPDN_STATUS_ENABLED	0x2
+#define	IPDN_STATUS_CONDEMNED	0x4
+
+/*
+ * These flags are used to determine whether or not the hooks are registered.
+ */
+#define	IPDN_HOOK_NONE		0x0
+#define	IPDN_HOOK_V4IN		0x1
+#define	IPDN_HOOK_V4OUT		0x2
+#define	IPDN_HOOK_V6IN		0x4
+#define	IPDN_HOOK_V6OUT		0x8
+#define	IPDN_HOOK_ALL		0xf
+
+/*
+ * Per-netstack kstats.
+ */
+typedef struct ipd_nskstat {
+	kstat_named_t	ink_ndrops;
+	kstat_named_t	ink_ncorrupts;
+	kstat_named_t	ink_ndelays;
+} ipd_nskstat_t;
+
+/*
+ * Different parts of this structure have different locking semantics. The
+ * list node is not normally referenced; if it is, one has to hold the
+ * ipd_nsl_lock. The following members are read only: ipdn_netid and
+ * ipdn_zoneid. The members of the kstat structure are always accessible in
+ * the data path, but the counters must be bumped with atomic operations.
+ * The ipdn_lock protects every other aspect of this structure. Please see
+ * the big theory statement on the requirements for lock ordering.
+ */
+typedef struct ipd_netstack {
+	list_node_t	ipdn_link;	/* link on ipd_nsl */
+	netid_t		ipdn_netid;	/* netstack id */
+	zoneid_t	ipdn_zoneid;	/* zone id */
+	kstat_t		*ipdn_kstat;	/* kstat_t ptr */
+	ipd_nskstat_t	ipdn_ksdata;	/* kstat data */
+	kmutex_t	ipdn_lock;	/* protects following members */
+	int		ipdn_status;	/* status flags */
+	net_handle_t	ipdn_v4hdl;	/* IPv4 net handle */
+	net_handle_t	ipdn_v6hdl;	/* IPv6 net handle */
+	int		ipdn_hooked;	/* are hooks registered */
+	hook_t		*ipdn_v4in;	/* IPv4 traffic in hook */
+	hook_t		*ipdn_v4out;	/* IPv4 traffic out hook */
+	hook_t		*ipdn_v6in;	/* IPv6 traffic in hook */
+	hook_t		*ipdn_v6out;	/* IPv6 traffic out hook */
+	int		ipdn_enabled;	/* which perturbs are on */
+	int		ipdn_corrupt;	/* corrupt percentage */
+	int		ipdn_drop;	/* drop percentage */
+	uint_t		ipdn_delay;	/* delay us */
+	long		ipdn_rand;	/* random seed */
+} ipd_netstack_t;
+
+/*
+ * ipd internal variables
+ */
+static dev_info_t	*ipd_devi;	/* device info */
+static net_instance_t	*ipd_neti;	/* net_instance for hooks */
+static unsigned int	ipd_max_delay = IPD_MAX_DELAY;	/* max delay in us */
+static kmutex_t		ipd_nsl_lock;	/* lock for the netstack list */
+static list_t		ipd_nsl;	/* list of netstacks */
+static kmutex_t		ipd_nactive_lock;	/* lock for nactive */
+static unsigned int	ipd_nactive;	/* number of active netstacks */
+static int	ipd_nactive_fudge = 4;	/* amount to fudge by in list */
+
+/*
+ * Note that this random number implementation is based upon the old BSD 4.1
+ * rand. It's good enough for us!
+ */
+static int
+ipd_nextrand(ipd_netstack_t *ins)
+{
+	ins->ipdn_rand = ins->ipdn_rand * 1103515245L + 12345;
+	return (ins->ipdn_rand & 0x7fffffff);
+}
+
+static void
+ipd_ksbump(kstat_named_t *nkp)
+{
+	atomic_inc_64(&nkp->value.ui64);
+}
+
+/*
+ * This is where all the magic actually happens. The way that this works is
+ * we grab the ins lock to basically get a copy of all the data that we need
+ * to do our job and then let it go to minimize contention. In terms of
+ * actual work on the packet, we apply the perturbations in the following
+ * order:
+ *
+ *   - drop
+ *   - delay
+ *   - corrupt
+ */
+/*ARGSUSED*/
+static int
+ipd_hook(hook_event_token_t event, hook_data_t data, void *arg)
+{
+	unsigned char *crp;
+	int dwait, corrupt, drop, rand, off, status;
+	mblk_t *mbp;
+	ipd_netstack_t *ins = arg;
+	hook_pkt_event_t *pkt = (hook_pkt_event_t *)data;
+
+	mutex_enter(&ins->ipdn_lock);
+	status = ins->ipdn_status;
+	dwait = ins->ipdn_delay;
+	corrupt = ins->ipdn_corrupt;
+	drop = ins->ipdn_drop;
+	rand = ipd_nextrand(ins);
+	mutex_exit(&ins->ipdn_lock);
+
+	/*
+	 * This probably cannot happen, but we'll do an extra guard just in
+	 * case.
+	 */
+	if (status & IPDN_STATUS_CONDEMNED)
+		return (0);
+
+	if (drop != 0 && rand % 100 < drop) {
+		freemsg(*pkt->hpe_mp);
+		*pkt->hpe_mp = NULL;
+		pkt->hpe_mb = NULL;
+		pkt->hpe_hdr = NULL;
+		ipd_ksbump(&ins->ipdn_ksdata.ink_ndrops);
+
+		return (1);
+	}
+
+	if (dwait != 0) {
+		if (dwait < TICK_TO_USEC(1))
+			drv_usecwait(dwait);
+		else
+			delay(drv_usectohz(dwait));
+		ipd_ksbump(&ins->ipdn_ksdata.ink_ndelays);
+	}
+
+	if (corrupt != 0 && rand % 100 < corrupt) {
+		/*
+		 * Since we're corrupting the mblk, just corrupt everything in
+		 * the chain. While we could corrupt the entire packet, that's
+		 * a little strong. Instead we're going to just change one of
+		 * the bytes in each mblk. Note that we must advance to the
+		 * next mblk before every continue, lest we spin forever on a
+		 * zero-length or non-M_DATA block.
+		 */
+		mbp = *pkt->hpe_mp;
+		while (mbp != NULL) {
+			if (mbp->b_wptr == mbp->b_rptr) {
+				mbp = mbp->b_cont;
+				continue;
+			}
+
+			/*
+			 * While pfhooks probably won't send us anything else,
+			 * let's just be extra careful. The stack probably
+			 * isn't as resilient to corruption of control
+			 * messages.
+			 */
+			if (DB_TYPE(mbp) != M_DATA) {
+				mbp = mbp->b_cont;
+				continue;
+			}
+
+			off = rand % ((uintptr_t)mbp->b_wptr -
+			    (uintptr_t)mbp->b_rptr);
+			crp = mbp->b_rptr + off;
+			off = rand % 8;
+			*crp = *crp ^ (1 << off);
+
+			mbp = mbp->b_cont;
+		}
+		ipd_ksbump(&ins->ipdn_ksdata.ink_ncorrupts);
+	}
+
+	return (0);
+}
+
+/*
+ * Sets up and registers all the proper hooks needed for the netstack to
+ * capture packets. Callers are assumed to already be holding the
+ * ipd_netstack_t's lock. If there is a failure in setting something up, it
+ * is the responsibility of this function to clean it up. Once this function
+ * has been called, it should not be called again until a corresponding call
+ * to tear down the hooks has been done.
+ */
+static int
+ipd_setup_hooks(ipd_netstack_t *ins)
+{
+	ASSERT(MUTEX_HELD(&ins->ipdn_lock));
+	ins->ipdn_v4hdl = net_protocol_lookup(ins->ipdn_netid, NHF_INET);
+	if (ins->ipdn_v4hdl == NULL)
+		goto cleanup;
+
+	ins->ipdn_v6hdl = net_protocol_lookup(ins->ipdn_netid, NHF_INET6);
+	if (ins->ipdn_v6hdl == NULL)
+		goto cleanup;
+
+	ins->ipdn_v4in = hook_alloc(HOOK_VERSION);
+	if (ins->ipdn_v4in == NULL)
+		goto cleanup;
+
+	ins->ipdn_v4in->h_flags = 0;
+	ins->ipdn_v4in->h_hint = HH_NONE;
+	ins->ipdn_v4in->h_hintvalue = 0;
+	ins->ipdn_v4in->h_func = ipd_hook;
+	ins->ipdn_v4in->h_arg = ins;
+	ins->ipdn_v4in->h_name = "ipd IPv4 in";
+
+	if (net_hook_register(ins->ipdn_v4hdl, NH_PHYSICAL_IN,
+	    ins->ipdn_v4in) != 0)
+		goto cleanup;
+	ins->ipdn_hooked |= IPDN_HOOK_V4IN;
+
+	ins->ipdn_v4out = hook_alloc(HOOK_VERSION);
+	if (ins->ipdn_v4out == NULL)
+		goto cleanup;
+	ins->ipdn_v4out->h_flags = 0;
+	ins->ipdn_v4out->h_hint = HH_NONE;
+	ins->ipdn_v4out->h_hintvalue = 0;
+	ins->ipdn_v4out->h_func = ipd_hook;
+	ins->ipdn_v4out->h_arg = ins;
+	ins->ipdn_v4out->h_name = "ipd IPv4 out";
+
+	if (net_hook_register(ins->ipdn_v4hdl, NH_PHYSICAL_OUT,
+	    ins->ipdn_v4out) != 0)
+		goto cleanup;
+	ins->ipdn_hooked |= IPDN_HOOK_V4OUT;
+
+	ins->ipdn_v6in = hook_alloc(HOOK_VERSION);
+	if (ins->ipdn_v6in == NULL)
+		goto cleanup;
+	ins->ipdn_v6in->h_flags = 0;
+	ins->ipdn_v6in->h_hint = HH_NONE;
+	ins->ipdn_v6in->h_hintvalue = 0;
+	ins->ipdn_v6in->h_func = ipd_hook;
+	ins->ipdn_v6in->h_arg = ins;
+	ins->ipdn_v6in->h_name = "ipd IPv6 in";
+
+	if (net_hook_register(ins->ipdn_v6hdl, NH_PHYSICAL_IN,
+	    ins->ipdn_v6in) != 0)
+		goto cleanup;
+	ins->ipdn_hooked |= IPDN_HOOK_V6IN;
+
+	ins->ipdn_v6out = hook_alloc(HOOK_VERSION);
+	if (ins->ipdn_v6out == NULL)
+		goto cleanup;
+	ins->ipdn_v6out->h_flags = 0;
+	ins->ipdn_v6out->h_hint = HH_NONE;
+	ins->ipdn_v6out->h_hintvalue = 0;
+	ins->ipdn_v6out->h_func = ipd_hook;
+	ins->ipdn_v6out->h_arg = ins;
+	ins->ipdn_v6out->h_name = "ipd IPv6 out";
+
+	if (net_hook_register(ins->ipdn_v6hdl, NH_PHYSICAL_OUT,
+	    ins->ipdn_v6out) != 0)
+		goto cleanup;
+	ins->ipdn_hooked |= IPDN_HOOK_V6OUT;
+	mutex_enter(&ipd_nactive_lock);
+	ipd_nactive++;
+	mutex_exit(&ipd_nactive_lock);
+
+	return (0);
+
+cleanup:
+	if (ins->ipdn_hooked & IPDN_HOOK_V6OUT)
+		(void) net_hook_unregister(ins->ipdn_v6hdl, NH_PHYSICAL_OUT,
+		    ins->ipdn_v6out);
+
+	if (ins->ipdn_hooked & IPDN_HOOK_V6IN)
+		(void) net_hook_unregister(ins->ipdn_v6hdl, NH_PHYSICAL_IN,
+		    ins->ipdn_v6in);
+
+	if (ins->ipdn_hooked & IPDN_HOOK_V4OUT)
+		(void) net_hook_unregister(ins->ipdn_v4hdl, NH_PHYSICAL_OUT,
+		    ins->ipdn_v4out);
+
+	if (ins->ipdn_hooked & IPDN_HOOK_V4IN)
+		(void) net_hook_unregister(ins->ipdn_v4hdl, NH_PHYSICAL_IN,
+		    ins->ipdn_v4in);
+
+	ins->ipdn_hooked = IPDN_HOOK_NONE;
+
+	if (ins->ipdn_v6out != NULL)
+		hook_free(ins->ipdn_v6out);
+
+	if (ins->ipdn_v6in != NULL)
+		hook_free(ins->ipdn_v6in);
+
+	if (ins->ipdn_v4out != NULL)
+		hook_free(ins->ipdn_v4out);
+
+	if (ins->ipdn_v4in != NULL)
+		hook_free(ins->ipdn_v4in);
+
+	if (ins->ipdn_v6hdl != NULL)
+		(void) net_protocol_release(ins->ipdn_v6hdl);
+
+	if (ins->ipdn_v4hdl != NULL)
+		(void) net_protocol_release(ins->ipdn_v4hdl);
+
+	return (1);
+}
+
+static void
+ipd_teardown_hooks(ipd_netstack_t *ins)
+{
+	ASSERT(ins->ipdn_hooked == IPDN_HOOK_ALL);
+	VERIFY(net_hook_unregister(ins->ipdn_v6hdl, NH_PHYSICAL_OUT,
+	    ins->ipdn_v6out) == 0);
+	VERIFY(net_hook_unregister(ins->ipdn_v6hdl, NH_PHYSICAL_IN,
+	    ins->ipdn_v6in) == 0);
+	VERIFY(net_hook_unregister(ins->ipdn_v4hdl, NH_PHYSICAL_OUT,
+	    ins->ipdn_v4out) == 0);
+	VERIFY(net_hook_unregister(ins->ipdn_v4hdl, NH_PHYSICAL_IN,
+	    ins->ipdn_v4in) == 0);
+
+	ins->ipdn_hooked = IPDN_HOOK_NONE;
+
+	hook_free(ins->ipdn_v6out);
+	hook_free(ins->ipdn_v6in);
+	hook_free(ins->ipdn_v4out);
+	hook_free(ins->ipdn_v4in);
+
+	VERIFY(net_protocol_release(ins->ipdn_v6hdl) == 0);
+	VERIFY(net_protocol_release(ins->ipdn_v4hdl) == 0);
+
+	mutex_enter(&ipd_nactive_lock);
+	ipd_nactive--;
+	mutex_exit(&ipd_nactive_lock);
+}
+
+static int
+ipd_check_hooks(ipd_netstack_t *ins, int type, boolean_t enable)
+{
+	int olden, rval;
+	olden = ins->ipdn_enabled;
+
+	if (enable)
+		ins->ipdn_enabled |= type;
+	else
+		ins->ipdn_enabled &= ~type;
+
+	/*
+	 * If hooks were previously disabled and something is enabled now,
+	 * set them up.
+	 */
+	if (olden == 0 && ins->ipdn_enabled != 0) {
+		rval = ipd_setup_hooks(ins);
+		if (rval != 0) {
+			ins->ipdn_enabled &= ~type;
+			ASSERT(ins->ipdn_enabled == 0);
+			return (rval);
+		}
+
+		return (0);
+	}
+
+	if (olden != 0 && ins->ipdn_enabled == 0) {
+		ASSERT(olden != 0);
+
+		/*
+		 * We have to drop the lock here, lest we cause a deadlock.
+		 * Unfortunately, there may be hooks that are running and are
+		 * actively in flight and we have to call the unregister
+		 * function. Due to the hooks framework, if there is an
+		 * inflight hook (most likely right now), and we are holding
+		 * the netstack's lock, those hooks will never return. This
+		 * is unfortunate.
+		 *
+		 * Because we only come into this path holding the list lock,
+		 * we know that the only way that someone else can come in
+		 * and get to this structure is via the hook callbacks, which
+		 * are going to only be doing reads. They'll also see that
+		 * everything has been disabled and return. So while this is
+		 * unfortunate, it should be relatively safe.
+		 */
+		mutex_exit(&ins->ipdn_lock);
+		ipd_teardown_hooks(ins);
+		mutex_enter(&ins->ipdn_lock);
+		return (0);
+	}
+
+	/*
+	 * Otherwise, nothing should have changed here.
+	 */
+	ASSERT((olden == 0) == (ins->ipdn_enabled == 0));
+	return (0);
+}
+
+static int
+ipd_toggle_corrupt(ipd_netstack_t *ins, int percent)
+{
+	int rval;
+
+	ASSERT(MUTEX_HELD(&ins->ipdn_lock));
+
+	if (percent < 0 || percent > 100)
+		return (ERANGE);
+
+	/*
+	 * If we've been asked to set the value to a value that we already
+	 * have, great, then we're done.
+	 */
+	if (percent == ins->ipdn_corrupt)
+		return (0);
+
+	ins->ipdn_corrupt = percent;
+	rval = ipd_check_hooks(ins, IPD_CORRUPT, percent != 0);
+
+	/*
+	 * If ipd_check_hooks failed, that must mean that we failed to set up
+	 * the hooks, so we are going to effectively zero out and fail the
+	 * request to enable corruption.
+	 */
+	if (rval != 0)
+		ins->ipdn_corrupt = 0;
+
+	return (rval);
+}
+
+static int
+ipd_toggle_delay(ipd_netstack_t *ins, uint32_t delay)
+{
+	int rval;
+
+	ASSERT(MUTEX_HELD(&ins->ipdn_lock));
+
+	if (delay > ipd_max_delay)
+		return (ERANGE);
+
+	/*
+	 * If we've been asked to set the value to a value that we already
+	 * have, great, then we're done.
+	 */
+	if (delay == ins->ipdn_delay)
+		return (0);
+
+	ins->ipdn_delay = delay;
+	rval = ipd_check_hooks(ins, IPD_DELAY, delay != 0);
+
+	/*
+	 * If ipd_check_hooks failed, that must mean that we failed to set up
+	 * the hooks, so we are going to effectively zero out and fail the
+	 * request to enable delay.
+	 */
+	if (rval != 0)
+		ins->ipdn_delay = 0;
+
+	return (rval);
+}
+
+static int
+ipd_toggle_drop(ipd_netstack_t *ins, int percent)
+{
+	int rval;
+
+	ASSERT(MUTEX_HELD(&ins->ipdn_lock));
+
+	if (percent < 0 || percent > 100)
+		return (ERANGE);
+
+	/*
+	 * If we've been asked to set the value to a value that we already
+	 * have, great, then we're done.
+	 */
+	if (percent == ins->ipdn_drop)
+		return (0);
+
+	ins->ipdn_drop = percent;
+	rval = ipd_check_hooks(ins, IPD_DROP, percent != 0);
+
+	/*
+	 * If ipd_check_hooks failed, that must mean that we failed to set up
+	 * the hooks, so we are going to effectively zero out and fail the
+	 * request to enable drop.
+	 */
+	if (rval != 0)
+		ins->ipdn_drop = 0;
+
+	return (rval);
+}
+
+static int
+ipd_ioctl_perturb(ipd_ioc_perturb_t *ipi, cred_t *cr, intptr_t cmd)
+{
+	zoneid_t zid;
+	ipd_netstack_t *ins;
+	int rval = 0;
+
+	/*
+	 * If the zone that we're coming from is not the GZ, then we ignore
+	 * the zoneid in the request completely and instead set it to that of
+	 * the caller. If the zoneid is that of the GZ, then we don't touch
+	 * this value.
+	 */
+	zid = crgetzoneid(cr);
+	if (zid != GLOBAL_ZONEID)
+		ipi->ipip_zoneid = zid;
+
+	if (zoneid_to_netstackid(ipi->ipip_zoneid) == GLOBAL_NETSTACKID &&
+	    zid != GLOBAL_ZONEID)
+		return (EPERM);
+
+	/*
+	 * We need to hold the ipd_nsl_lock throughout the entire operation,
+	 * otherwise someone else could come in and remove us from the list
+	 * and free us, e.g. the netstack destroy handler. By holding the
+	 * lock, we stop it from being able to do anything wrong.
+	 */
+	mutex_enter(&ipd_nsl_lock);
+	for (ins = list_head(&ipd_nsl); ins != NULL;
+	    ins = list_next(&ipd_nsl, ins)) {
+		if (ins->ipdn_zoneid == ipi->ipip_zoneid)
+			break;
+	}
+
+	if (ins == NULL) {
+		mutex_exit(&ipd_nsl_lock);
+		return (EINVAL);
+	}
+
+	mutex_enter(&ins->ipdn_lock);
+
+	if (ins->ipdn_status & IPDN_STATUS_CONDEMNED) {
+		rval = ESHUTDOWN;
+		goto cleanup;
+	}
+
+	switch (cmd) {
+	case IPDIOC_CORRUPT:
+		rval = ipd_toggle_corrupt(ins, ipi->ipip_arg);
+		break;
+	case IPDIOC_DELAY:
+		rval = ipd_toggle_delay(ins, ipi->ipip_arg);
+		break;
+	case IPDIOC_DROP:
+		rval = ipd_toggle_drop(ins, ipi->ipip_arg);
+		break;
+	}
+
+cleanup:
+	mutex_exit(&ins->ipdn_lock);
+	mutex_exit(&ipd_nsl_lock);
+	return (rval);
+}
+
+static int
+ipd_ioctl_remove(ipd_ioc_perturb_t *ipi, cred_t *cr)
+{
+	zoneid_t zid;
+	ipd_netstack_t *ins;
+	int rval = 0;
+
+	/*
+	 * See ipd_ioctl_perturb for the rationale here.
+	 */
+	zid = crgetzoneid(cr);
+	if (zid != GLOBAL_ZONEID)
+		ipi->ipip_zoneid = zid;
+
+	if (zoneid_to_netstackid(ipi->ipip_zoneid) == GLOBAL_NETSTACKID &&
+	    zid != GLOBAL_ZONEID)
+		return (EPERM);
+
+	mutex_enter(&ipd_nsl_lock);
+	for (ins = list_head(&ipd_nsl); ins != NULL;
+	    ins = list_next(&ipd_nsl, ins)) {
+		if (ins->ipdn_zoneid == ipi->ipip_zoneid)
+			break;
+	}
+
+	if (ins == NULL) {
+		mutex_exit(&ipd_nsl_lock);
+		return (EINVAL);
+	}
+
+	mutex_enter(&ins->ipdn_lock);
+
+	/*
+	 * If this is condemned, that means it's very shortly going to be
+	 * torn down. In that case, there's no reason to actually do anything
+	 * here, as it will all be done rather shortly in the destroy
+	 * function. Furthermore, because condemned corresponds with having
+	 * hit shutdown, we know that no more packets can be received by this
+	 * netstack. All this translates to a no-op.
+	 */
+	if (ins->ipdn_status & IPDN_STATUS_CONDEMNED) {
+		rval = 0;
+		goto cleanup;
+	}
+
+	rval = EINVAL;
+	/*
+	 * Go through and disable the requested pieces. We can safely ignore
+	 * the return value of ipd_check_hooks because the removal case
+	 * should never fail; we verify that in the hook teardown case.
+	 */
+	if (ipi->ipip_arg & IPD_CORRUPT) {
+		ins->ipdn_corrupt = 0;
+		(void) ipd_check_hooks(ins, IPD_CORRUPT, B_FALSE);
+		rval = 0;
+	}
+
+	if (ipi->ipip_arg & IPD_DELAY) {
+		ins->ipdn_delay = 0;
+		(void) ipd_check_hooks(ins, IPD_DELAY, B_FALSE);
+		rval = 0;
+	}
+
+	if (ipi->ipip_arg & IPD_DROP) {
+		ins->ipdn_drop = 0;
+		(void) ipd_check_hooks(ins, IPD_DROP, B_FALSE);
+		rval = 0;
+	}
+
+cleanup:
+	mutex_exit(&ins->ipdn_lock);
+	mutex_exit(&ipd_nsl_lock);
+	return (rval);
+}
+
+static int
+ipd_ioctl_info(ipd_ioc_info_t *ipi, cred_t *cr)
+{
+	zoneid_t zid;
+	ipd_netstack_t *ins;
+
+	/*
+	 * See ipd_ioctl_perturb for the rationale here.
+	 */
+	zid = crgetzoneid(cr);
+	if (zid != GLOBAL_ZONEID)
+		ipi->ipii_zoneid = zid;
+
+	if (zoneid_to_netstackid(ipi->ipii_zoneid) == GLOBAL_NETSTACKID &&
+	    zid != GLOBAL_ZONEID)
+		return (EPERM);
+
+	mutex_enter(&ipd_nsl_lock);
+	for (ins = list_head(&ipd_nsl); ins != NULL;
+	    ins = list_next(&ipd_nsl, ins)) {
+		if (ins->ipdn_zoneid == ipi->ipii_zoneid)
+			break;
+	}
+
+	if (ins == NULL) {
+		mutex_exit(&ipd_nsl_lock);
+		return (EINVAL);
+	}
+
+	mutex_enter(&ins->ipdn_lock);
+	ipi->ipii_corrupt = ins->ipdn_corrupt;
+	ipi->ipii_delay = ins->ipdn_delay;
+	ipi->ipii_drop = ins->ipdn_drop;
+	mutex_exit(&ins->ipdn_lock);
+	mutex_exit(&ipd_nsl_lock);
+
+	return (0);
+}
+
+/*
+ * When this function is called, the value of the ipil_nzones argument
+ * controls how this function works.
When called with a value of zero, then we treat that + * as the caller asking us what's a reasonable number of entries for me to + * allocate memory for. If the zone is the global zone, then we tell them how + * many folks are currently active and add a fudge factor. Otherwise the answer + * is always one. + * + * In the non-zero case, we give them that number of zone ids. While this isn't + * quite ideal as it might mean that someone misses something, this generally + * won't be an issue, as it involves a rather tight race condition in the + * current ipdadm implementation. + */ +static int +ipd_ioctl_list(intptr_t arg, cred_t *cr) +{ + zoneid_t zid; + zoneid_t *zoneids; + ipd_netstack_t *ins; + uint_t nzoneids, rzids, cur; + int rval = 0; + STRUCT_DECL(ipd_ioc_list, h); + + STRUCT_INIT(h, get_udatamodel()); + if (ddi_copyin((void *)arg, STRUCT_BUF(h), + STRUCT_SIZE(h), 0) != 0) + return (EFAULT); + + zid = crgetzoneid(cr); + + rzids = STRUCT_FGET(h, ipil_nzones); + if (rzids == 0) { + if (zid == GLOBAL_ZONEID) { + mutex_enter(&ipd_nactive_lock); + rzids = ipd_nactive + ipd_nactive_fudge; + mutex_exit(&ipd_nactive_lock); + } else { + rzids = 1; + } + STRUCT_FSET(h, ipil_nzones, rzids); + if (ddi_copyout(STRUCT_BUF(h), (void *)arg, + STRUCT_SIZE(h), 0) != 0) + return (EFAULT); + + return (0); + } + + mutex_enter(&ipd_nsl_lock); + if (zid == GLOBAL_ZONEID) { + nzoneids = ipd_nactive; + } else { + nzoneids = 1; + } + + zoneids = kmem_alloc(sizeof (zoneid_t) * nzoneids, KM_SLEEP); + cur = 0; + for (ins = list_head(&ipd_nsl); ins != NULL; + ins = list_next(&ipd_nsl, ins)) { + if (ins->ipdn_enabled == 0) + continue; + + if (zid == GLOBAL_ZONEID || zid == ins->ipdn_zoneid) { + zoneids[cur++] = ins->ipdn_zoneid; + } + + if (zid != GLOBAL_ZONEID && zid == ins->ipdn_zoneid) + break; + } + ASSERT(cur == nzoneids); + mutex_exit(&ipd_nsl_lock); + + STRUCT_FSET(h, ipil_nzones, nzoneids); + if (nzoneids < rzids) + rzids = nzoneids; + if (ddi_copyout(zoneids, STRUCT_FGETP(h, ipil_list), + nzoneids * sizeof (zoneid_t), NULL) != 0) + rval = EFAULT; + + kmem_free(zoneids, sizeof (zoneid_t) * nzoneids); + if (ddi_copyout(STRUCT_BUF(h), (void *)arg, STRUCT_SIZE(h), 0) != 0) + return (EFAULT); + + return (rval); +} + +static void * +ipd_nin_create(const netid_t id) +{ + ipd_netstack_t *ins; + ipd_nskstat_t *ink; + + ins = kmem_zalloc(sizeof (ipd_netstack_t), KM_SLEEP); + ins->ipdn_status = IPDN_STATUS_DISABLED; + ins->ipdn_netid = id; + ins->ipdn_zoneid = netstackid_to_zoneid(id); + ins->ipdn_rand = gethrtime(); + mutex_init(&ins->ipdn_lock, NULL, MUTEX_DRIVER, NULL); + + ins->ipdn_kstat = net_kstat_create(id, "ipd", ins->ipdn_zoneid, + "ipd", "net", KSTAT_TYPE_NAMED, + sizeof (ipd_nskstat_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (ins->ipdn_kstat != NULL) { + if (ins->ipdn_zoneid != GLOBAL_ZONEID) + kstat_zone_add(ins->ipdn_kstat, GLOBAL_ZONEID); + + ink = &ins->ipdn_ksdata; + ins->ipdn_kstat->ks_data = ink; + kstat_named_init(&ink->ink_ncorrupts, "corrupts", + KSTAT_DATA_UINT64); + kstat_named_init(&ink->ink_ndrops, "drops", KSTAT_DATA_UINT64); + kstat_named_init(&ink->ink_ndelays, "delays", + KSTAT_DATA_UINT64); + kstat_install(ins->ipdn_kstat); + } + + mutex_enter(&ipd_nsl_lock); + list_insert_tail(&ipd_nsl, ins); + mutex_exit(&ipd_nsl_lock); + + return (ins); +} + +static void +ipd_nin_shutdown(const netid_t id, void *arg) +{ + ipd_netstack_t *ins = arg; + + VERIFY(id == ins->ipdn_netid); + mutex_enter(&ins->ipdn_lock); + ASSERT(ins->ipdn_status == IPDN_STATUS_DISABLED || + 
ins->ipdn_status == IPDN_STATUS_ENABLED); + ins->ipdn_status |= IPDN_STATUS_CONDEMNED; + if (ins->ipdn_kstat != NULL) + net_kstat_delete(id, ins->ipdn_kstat); + mutex_exit(&ins->ipdn_lock); +} + +/*ARGSUSED*/ +static void +ipd_nin_destroy(const netid_t id, void *arg) +{ + ipd_netstack_t *ins = arg; + + /* + * At this point none of the hooks should be able to fire because the + * zone has been shutdown and we are in the process of destroying it. + * Thus it should not be possible for someone else to come in and grab + * our ipd_netstack_t for this zone. Because of that, we know that we + * are the only ones who could be running here. + */ + mutex_enter(&ipd_nsl_lock); + list_remove(&ipd_nsl, ins); + mutex_exit(&ipd_nsl_lock); + + if (ins->ipdn_hooked) + ipd_teardown_hooks(ins); + mutex_destroy(&ins->ipdn_lock); + kmem_free(ins, sizeof (ipd_netstack_t)); +} + +/*ARGSUSED*/ +static int +ipd_open(dev_t *devp, int flag, int otype, cred_t *credp) +{ + if (flag & FEXCL || flag & FNDELAY) + return (EINVAL); + + if (otype != OTYP_CHR) + return (EINVAL); + + if (!(flag & FREAD && flag & FWRITE)) + return (EINVAL); + + if (secpolicy_ip_config(credp, B_FALSE) != 0) + return (EPERM); + + return (0); +} + +/*ARGSUSED*/ +static int +ipd_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv) +{ + int rval; + ipd_ioc_perturb_t ipip; + ipd_ioc_info_t ipii; + + switch (cmd) { + case IPDIOC_CORRUPT: + case IPDIOC_DELAY: + case IPDIOC_DROP: + if (ddi_copyin((void *)arg, &ipip, sizeof (ipd_ioc_perturb_t), + 0) != 0) + return (EFAULT); + rval = ipd_ioctl_perturb(&ipip, cr, cmd); + return (rval); + case IPDIOC_REMOVE: + if (ddi_copyin((void *)arg, &ipip, sizeof (ipd_ioc_perturb_t), + 0) != 0) + return (EFAULT); + rval = ipd_ioctl_remove(&ipip, cr); + return (rval); + case IPDIOC_LIST: + /* + * Because the list ioctl doesn't have a fixed-size struct due + * to needing to pass around a pointer, we instead delegate the + * copyin logic to the list code. + */ + return (ipd_ioctl_list(arg, cr)); + case IPDIOC_INFO: + if (ddi_copyin((void *)arg, &ipii, sizeof (ipd_ioc_info_t), + 0) != 0) + return (EFAULT); + rval = ipd_ioctl_info(&ipii, cr); + if (rval != 0) + return (rval); + if (ddi_copyout(&ipii, (void *)arg, sizeof (ipd_ioc_info_t), + 0) != 0) + return (EFAULT); + return (0); + default: + break; + } + return (ENOTTY); +} + +/*ARGSUSED*/ +static int +ipd_close(dev_t dev, int flag, int otype, cred_t *credp) +{ + return (0); +} + +static int +ipd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + minor_t instance; + + if (cmd != DDI_ATTACH) + return (DDI_FAILURE); + + if (ipd_devi != NULL) + return (DDI_FAILURE); + + instance = ddi_get_instance(dip); + if (ddi_create_minor_node(dip, "ipd", S_IFCHR, instance, + DDI_PSEUDO, 0) == DDI_FAILURE) + return (DDI_FAILURE); + + ipd_neti = net_instance_alloc(NETINFO_VERSION); + if (ipd_neti == NULL) { + ddi_remove_minor_node(dip, NULL); + return (DDI_FAILURE); + } + + /* + * Note that these global structures MUST be initialized before we call + * net_instance_register, as that will instantly cause us to drive into + * the ipd_nin_create callbacks. + */ + list_create(&ipd_nsl, sizeof (ipd_netstack_t), + offsetof(ipd_netstack_t, ipdn_link)); + mutex_init(&ipd_nsl_lock, NULL, MUTEX_DRIVER, NULL); + mutex_init(&ipd_nactive_lock, NULL, MUTEX_DRIVER, NULL); + + /* Note, net_instance_alloc sets the version. 
 */
+	ipd_neti->nin_name = "ipd";
+	ipd_neti->nin_create = ipd_nin_create;
+	ipd_neti->nin_destroy = ipd_nin_destroy;
+	ipd_neti->nin_shutdown = ipd_nin_shutdown;
+	if (net_instance_register(ipd_neti) == DDI_FAILURE) {
+		net_instance_free(ipd_neti);
+		ddi_remove_minor_node(dip, NULL);
+		return (DDI_FAILURE);
+	}
+
+	ddi_report_dev(dip);
+	ipd_devi = dip;
+
+	return (DDI_SUCCESS);
+}
+
+/*ARGSUSED*/
+static int
+ipd_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
+{
+	int error;
+
+	switch (infocmd) {
+	case DDI_INFO_DEVT2DEVINFO:
+		*result = ipd_devi;
+		error = DDI_SUCCESS;
+		break;
+	case DDI_INFO_DEVT2INSTANCE:
+		*result = (void *)(uintptr_t)getminor((dev_t)arg);
+		error = DDI_SUCCESS;
+		break;
+	default:
+		error = DDI_FAILURE;
+		break;
+	}
+
+	return (error);
+}
+
+static int
+ipd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+	if (cmd != DDI_DETACH)
+		return (DDI_FAILURE);
+
+	ASSERT(dip == ipd_devi);
+	ddi_remove_minor_node(dip, NULL);
+	ipd_devi = NULL;
+
+	if (ipd_neti != NULL) {
+		VERIFY(net_instance_unregister(ipd_neti) == 0);
+		net_instance_free(ipd_neti);
+	}
+
+	mutex_destroy(&ipd_nsl_lock);
+	mutex_destroy(&ipd_nactive_lock);
+	list_destroy(&ipd_nsl);
+
+	return (DDI_SUCCESS);
+}
+
+static struct cb_ops ipd_cb_ops = {
+	ipd_open,	/* open */
+	ipd_close,	/* close */
+	nodev,		/* strategy */
+	nodev,		/* print */
+	nodev,		/* dump */
+	nodev,		/* read */
+	nodev,		/* write */
+	ipd_ioctl,	/* ioctl */
+	nodev,		/* devmap */
+	nodev,		/* mmap */
+	nodev,		/* segmap */
+	nochpoll,	/* poll */
+	ddi_prop_op,	/* cb_prop_op */
+	NULL,		/* streamtab */
+	D_NEW | D_MP,	/* Driver compatibility flag */
+	CB_REV,		/* rev */
+	nodev,		/* aread */
+	nodev		/* awrite */
+};
+
+static struct dev_ops ipd_ops = {
+	DEVO_REV,		/* devo_rev */
+	0,			/* refcnt */
+	ipd_getinfo,		/* get_dev_info */
+	nulldev,		/* identify */
+	nulldev,		/* probe */
+	ipd_attach,		/* attach */
+	ipd_detach,		/* detach */
+	nodev,			/* reset */
+	&ipd_cb_ops,		/* driver operations */
+	NULL,			/* bus operations */
+	nodev,			/* dev power */
+	ddi_quiesce_not_needed	/* quiesce */
+};
+
+static struct modldrv modldrv = {
+	&mod_driverops,
+	"Internet packet disturber",
+	&ipd_ops
+};
+
+static struct modlinkage modlinkage = {
+	MODREV_1,
+	{ (void *)&modldrv, NULL }
+};
+
+int
+_init(void)
+{
+	return (mod_install(&modlinkage));
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+	return (mod_info(&modlinkage, modinfop));
+}
+
+int
+_fini(void)
+{
+	return (mod_remove(&modlinkage));
+}
diff --git a/usr/src/uts/common/inet/ipd/ipd.conf b/usr/src/uts/common/inet/ipd/ipd.conf
new file mode 100644
index 0000000000..83b9b685f4
--- /dev/null
+++ b/usr/src/uts/common/inet/ipd/ipd.conf
@@ -0,0 +1,27 @@
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License, Version 1.0 only
+# (the "License"). You may not use this file except in compliance
+# with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+#
+# Copyright (c) 2012 Joyent, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+
+name="ipd" parent="pseudo" instance=0;
diff --git a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c
index 98cda0b7cc..75bac21ae4 100644
--- a/usr/src/uts/common/inet/ipf/ip_fil_solaris.c
+++ b/usr/src/uts/common/inet/ipf/ip_fil_solaris.c
@@ -141,11 +141,13 @@ ipf_stack_t *ifs;
 #define	UNDO_HOOK(_f, _b, _e, _h)					\
 	do {								\
+		int tmp;						\
 		if (ifs->_f != NULL) {					\
 			if (ifs->_b) {					\
-				ifs->_b = (net_hook_unregister(ifs->_f,	\
-					_e, ifs->_h) != 0);		\
-				if (!ifs->_b) {				\
+				tmp = net_hook_unregister(ifs->_f,	\
+					_e, ifs->_h);			\
+				ifs->_b = (tmp != 0 && tmp != ENXIO);	\
+				if (!ifs->_b && ifs->_h != NULL) {	\
 					hook_free(ifs->_h);		\
 					ifs->_h = NULL;			\
 				}					\
diff --git a/usr/src/uts/common/inet/squeue.c b/usr/src/uts/common/inet/squeue.c
index 6d0bf70b2a..2e08dc359b 100644
--- a/usr/src/uts/common/inet/squeue.c
+++ b/usr/src/uts/common/inet/squeue.c
@@ -23,6 +23,10 @@
  */
 
 /*
+ * Copyright 2012 Joyent, Inc. All rights reserved.
+ */
+
+/*
  * Squeues: General purpose serialization mechanism
  * ------------------------------------------------
  *
@@ -120,6 +124,8 @@
 #include <sys/sdt.h>
 #include <sys/ddi.h>
 #include <sys/sunddi.h>
+#include <sys/stack.h>
+#include <sys/archsystm.h>
 
 #include <inet/ipclassifier.h>
 #include <inet/udp_impl.h>
@@ -142,6 +148,9 @@ int squeue_workerwait_ms = 0;
 static int squeue_drain_ns = 0;
 static int squeue_workerwait_tick = 0;
 
+uintptr_t squeue_drain_stack_needed = 10240;
+uint_t squeue_drain_stack_toodeep;
+
 #define	MAX_BYTES_TO_PICKUP	150000
 
 #define	ENQUEUE_CHAIN(sqp, mp, tail, cnt) {			\
@@ -546,6 +555,7 @@ squeue_enter(squeue_t *sqp, mblk_t *mp, mblk_t *tail, uint32_t cnt,
 		ASSERT(MUTEX_HELD(&sqp->sq_lock));
 		ASSERT(sqp->sq_first != NULL);
 		now = gethrtime();
+		sqp->sq_run = curthread;
 		sqp->sq_drain(sqp, SQS_ENTER, now + squeue_drain_ns);
 
 		/*
@@ -711,6 +721,20 @@ squeue_drain(squeue_t *sqp, uint_t proc_type, hrtime_t expire)
 	boolean_t	sq_poll_capable;
 	ip_recv_attr_t	*ira, iras;
 
+	/*
+	 * Before doing any work, check our stack depth; if we're not a
+	 * worker thread for this squeue and we're beginning to get tight on
+	 * stack, kick the worker, bump a counter and return.
+ */ + if (proc_type != SQS_WORKER && STACK_BIAS + (uintptr_t)getfp() - + (uintptr_t)curthread->t_stkbase < squeue_drain_stack_needed) { + ASSERT(mutex_owned(&sqp->sq_lock)); + sqp->sq_awaken = ddi_get_lbolt(); + cv_signal(&sqp->sq_worker_cv); + squeue_drain_stack_toodeep++; + return; + } + sq_poll_capable = (sqp->sq_state & SQS_POLL_CAPAB) != 0; again: ASSERT(mutex_owned(&sqp->sq_lock)); diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c index 1bb87e5c56..f79427e766 100644 --- a/usr/src/uts/common/inet/tcp/tcp.c +++ b/usr/src/uts/common/inet/tcp/tcp.c @@ -3792,7 +3792,8 @@ tcp_stack_init(netstackid_t stackid, netstack_t *ns) ASSERT(error == 0); tcps->tcps_ixa_cleanup_mp = allocb_wait(0, BPRI_MED, STR_NOSIG, NULL); ASSERT(tcps->tcps_ixa_cleanup_mp != NULL); - cv_init(&tcps->tcps_ixa_cleanup_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tcps->tcps_ixa_cleanup_ready_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tcps->tcps_ixa_cleanup_done_cv, NULL, CV_DEFAULT, NULL); mutex_init(&tcps->tcps_ixa_cleanup_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&tcps->tcps_reclaim_lock, NULL, MUTEX_DEFAULT, NULL); @@ -3857,7 +3858,8 @@ tcp_stack_fini(netstackid_t stackid, void *arg) freeb(tcps->tcps_ixa_cleanup_mp); tcps->tcps_ixa_cleanup_mp = NULL; - cv_destroy(&tcps->tcps_ixa_cleanup_cv); + cv_destroy(&tcps->tcps_ixa_cleanup_ready_cv); + cv_destroy(&tcps->tcps_ixa_cleanup_done_cv); mutex_destroy(&tcps->tcps_ixa_cleanup_lock); /* diff --git a/usr/src/uts/common/inet/tcp/tcp_stats.c b/usr/src/uts/common/inet/tcp/tcp_stats.c index 82fc0b227c..3cc5627b27 100644 --- a/usr/src/uts/common/inet/tcp/tcp_stats.c +++ b/usr/src/uts/common/inet/tcp/tcp_stats.c @@ -21,12 +21,14 @@ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, Joyent Inc. All rights reserved. */ #include <sys/types.h> #include <sys/tihdr.h> #include <sys/policy.h> #include <sys/tsol/tnet.h> +#include <sys/kstat.h> #include <inet/common.h> #include <inet/ip.h> @@ -505,7 +507,7 @@ tcp_kstat_init(netstackid_t stackid) { "connTableSize6", KSTAT_DATA_INT32, 0 } }; - ksp = kstat_create_netstack(TCP_MOD_NAME, 0, TCP_MOD_NAME, "mib2", + ksp = kstat_create_netstack(TCP_MOD_NAME, stackid, TCP_MOD_NAME, "mib2", KSTAT_TYPE_NAMED, NUM_OF_FIELDS(tcp_named_kstat_t), 0, stackid); if (ksp == NULL) @@ -518,6 +520,13 @@ tcp_kstat_init(netstackid_t stackid) ksp->ks_update = tcp_kstat_update; ksp->ks_private = (void *)(uintptr_t)stackid; + /* + * If this is an exclusive netstack for a local zone, the global zone + * should still be able to read the kstat. + */ + if (stackid != GLOBAL_NETSTACKID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + kstat_install(ksp); return (ksp); } @@ -733,7 +742,7 @@ tcp_kstat2_init(netstackid_t stackid) #endif }; - ksp = kstat_create_netstack(TCP_MOD_NAME, 0, "tcpstat", "net", + ksp = kstat_create_netstack(TCP_MOD_NAME, stackid, "tcpstat", "net", KSTAT_TYPE_NAMED, sizeof (template) / sizeof (kstat_named_t), 0, stackid); @@ -744,6 +753,13 @@ tcp_kstat2_init(netstackid_t stackid) ksp->ks_private = (void *)(uintptr_t)stackid; ksp->ks_update = tcp_kstat2_update; + /* + * If this is an exclusive netstack for a local zone, the global zone + * should still be able to read the kstat. 
+ */ + if (stackid != GLOBAL_NETSTACKID) + kstat_zone_add(ksp, GLOBAL_ZONEID); + kstat_install(ksp); return (ksp); } diff --git a/usr/src/uts/common/inet/tcp_stack.h b/usr/src/uts/common/inet/tcp_stack.h index 2dccf6b78c..e46ebe08da 100644 --- a/usr/src/uts/common/inet/tcp_stack.h +++ b/usr/src/uts/common/inet/tcp_stack.h @@ -101,7 +101,8 @@ struct tcp_stack { /* Used to synchronize access when reclaiming memory */ mblk_t *tcps_ixa_cleanup_mp; kmutex_t tcps_ixa_cleanup_lock; - kcondvar_t tcps_ixa_cleanup_cv; + kcondvar_t tcps_ixa_cleanup_ready_cv; + kcondvar_t tcps_ixa_cleanup_done_cv; /* Variables for handling kmem reclaim call back. */ kmutex_t tcps_reclaim_lock; |
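
A note on the conn_ixa_cleanup() rework in ip_attr.c above: the old single tcps_ixa_cleanup_cv did double duty, signalling both "the shared mblk came back" and "my cleanup finished", so a waiter could be woken for the wrong event or miss an unplumb-driven cleanup. The change splits it into ready/done CVs and threads the IXATC_IDLE -> IXATC_INPROGRESS -> IXATC_COMPLETE state through ixa_tcpcleanup. Below is a user-space sketch of that handshake, for illustration only: the shared mblk is modeled as an opaque token, pthreads stand in for kmutex/kcondvar, and every identifier here is invented rather than kernel API.

#include <pthread.h>
#include <stddef.h>

enum { IXATC_IDLE, IXATC_INPROGRESS, IXATC_COMPLETE };

struct stack_model {
	pthread_mutex_t	lock;		/* tcps_ixa_cleanup_lock */
	pthread_cond_t	ready_cv;	/* token returned to the stack */
	pthread_cond_t	done_cv;	/* a cleanup reached COMPLETE/IDLE */
	void		*token;		/* tcps_ixa_cleanup_mp stand-in */
};

struct conn_model {
	struct stack_model *stack;
	int		cleanup_state;	/* ixa_tcpcleanup stand-in */
};

/* Like tcp_ixa_cleanup_getmblk(): wait for IDLE, then take the token. */
void *
cleanup_get_token(struct conn_model *c)
{
	struct stack_model *s = c->stack;
	void *tok;
	int retry;

	pthread_mutex_lock(&s->lock);
	do {
		retry = 0;
		while (c->cleanup_state != IXATC_IDLE)
			pthread_cond_wait(&s->done_cv, &s->lock);
		while ((tok = s->token) == NULL) {
			/* We drop the lock here; the IDLE check must rerun. */
			retry = 1;
			pthread_cond_wait(&s->ready_cv, &s->lock);
		}
	} while (retry);
	c->cleanup_state = IXATC_INPROGRESS;
	s->token = NULL;
	pthread_mutex_unlock(&s->lock);
	return (tok);
}

/* Like tcp_ixa_cleanup(): the squeue side returns the token, marks COMPLETE. */
void
cleanup_complete(struct conn_model *c, void *tok)
{
	struct stack_model *s = c->stack;

	pthread_mutex_lock(&s->lock);
	c->cleanup_state = IXATC_COMPLETE;
	s->token = tok;
	pthread_cond_signal(&s->ready_cv);
	/* No per-conn CV, so every waiter must be woken to recheck. */
	pthread_cond_broadcast(&s->done_cv);
	pthread_mutex_unlock(&s->lock);
}

/* Like tcp_ixa_cleanup_wait_and_finish(): reap COMPLETE, return to IDLE. */
void
cleanup_wait_and_finish(struct conn_model *c)
{
	struct stack_model *s = c->stack;

	pthread_mutex_lock(&s->lock);
	while (c->cleanup_state == IXATC_INPROGRESS)
		pthread_cond_wait(&s->done_cv, &s->lock);
	c->cleanup_state = IXATC_IDLE;
	pthread_cond_broadcast(&s->done_cv);
	pthread_mutex_unlock(&s->lock);
}

int
main(void)
{
	struct stack_model s = { PTHREAD_MUTEX_INITIALIZER,
	    PTHREAD_COND_INITIALIZER, PTHREAD_COND_INITIALIZER, &s };
	struct conn_model c = { &s, IXATC_IDLE };
	void *tok = cleanup_get_token(&c);

	/* Single-threaded demo; the squeue's work would happen here. */
	cleanup_complete(&c, tok);
	cleanup_wait_and_finish(&c);
	return (0);
}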
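On the dcb_reclaim() hunk in ip_dce.c: the old code hashed only the dce_t pointer, so back-to-back reclaim passes condemned (and spared) exactly the same entries; mixing a per-pass gethrtime() seed into the hash makes each pass pick a fresh ~1/fraction of the bucket, and the retained counter stops sparing entries once a hard multiple of the threshold is reached. A self-contained sketch of just the thinning decision, with made-up addresses and the same fold-down hash as the kernel's RANDOM_HASH:

#include <stdint.h>
#include <stdio.h>
#include <time.h>

/* Same fold-down mixing as ip_dce.c's RANDOM_HASH() on a uint64_t. */
#define	RANDOM_HASH(p)	((p) ^ ((p) >> 16) ^ ((p) >> 32) ^ ((p) >> 48))

/*
 * One survives-this-pass decision: entries whose seeded hash lands on 0 mod
 * 'fraction' are condemned. 'addr' stands in for the dce_t pointer; the seed
 * is captured once per pass, as dcb_reclaim() does with gethrtime(), and is
 * OR-ed in exactly as the kernel code does.
 */
static int
keep_entry(uint64_t addr, uint64_t seed, unsigned int fraction)
{
	return (RANDOM_HASH(addr | seed) % fraction != 0);
}

int
main(void)
{
	uint64_t seed = (uint64_t)time(NULL) * 1000000007ULL;
	unsigned int kept = 0, i;

	for (i = 0; i < 1000; i++) {
		/* Spread fake entry addresses the way heap pointers spread. */
		uint64_t addr = 0x9e3779b97f4a7c15ULL * (i + 1);

		kept += keep_entry(addr, seed, 4);
	}
	printf("kept %u of 1000 (roughly 750 on average)\n", kept);
	return (0);
}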
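The ip_dce.c change also retires the kmem-cache reclaim callback in favor of a dedicated thread: buckets set ips_dce_reclaim_needed when dcb_cnt crosses ips_ip_dce_reclaim_threshold, and dce_reclaim_worker() sweeps flagged stacks every ip_dce_reclaim_interval seconds. Its create/shutdown protocol (timed wait, shutdown flag, the exiting thread waking its destroyer) is the classic pattern sketched below with pthreads; the CPR callback machinery has no user-space analogue and is simply omitted, and all names here are illustrative.

#include <pthread.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static pthread_mutex_t reclaim_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t reclaim_cv = PTHREAD_COND_INITIALIZER;
static int reclaim_shutdown;
static int reclaim_exited;

static void *
reclaim_worker(void *arg)
{
	struct timespec ts;

	(void) arg;
	pthread_mutex_lock(&reclaim_lock);
	while (!reclaim_shutdown) {
		clock_gettime(CLOCK_REALTIME, &ts);
		ts.tv_sec += 1;	/* stands in for ip_dce_reclaim_interval */
		(void) pthread_cond_timedwait(&reclaim_cv, &reclaim_lock, &ts);
		if (reclaim_shutdown)
			break;

		/* Drop the lock for the pass itself, as the driver does. */
		pthread_mutex_unlock(&reclaim_lock);
		printf("reclaim pass\n");	/* ip_dce_reclaim() here */
		pthread_mutex_lock(&reclaim_lock);
	}
	reclaim_exited = 1;
	pthread_cond_broadcast(&reclaim_cv);	/* wake the destroyer */
	pthread_mutex_unlock(&reclaim_lock);
	return (NULL);
}

int
main(void)
{
	pthread_t tid;

	pthread_create(&tid, NULL, reclaim_worker, NULL);
	sleep(3);	/* let a few passes run */

	/* The dce_g_destroy() side: request shutdown and wait for it. */
	pthread_mutex_lock(&reclaim_lock);
	reclaim_shutdown = 1;
	pthread_cond_signal(&reclaim_cv);
	while (!reclaim_exited)
		pthread_cond_wait(&reclaim_cv, &reclaim_lock);
	pthread_mutex_unlock(&reclaim_lock);
	pthread_join(tid, NULL);
	return (0);
}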
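On ipd_hook()'s decision-making: drop and corrupt fire when the 4.1BSD-style LCG in ipd_nextrand() lands below the configured percentage mod 100. A quick user-space check that a setting of 25 really perturbs about a quarter of "packets"; unsigned arithmetic is used here so the wraparound is well-defined C, whereas the kernel keeps the state in a signed long with the same low 31 bits.

#include <stdio.h>

static unsigned long rand_state = 1;	/* ipdn_rand is seeded from gethrtime() */

/* The same 4.1BSD-style generator as ipd_nextrand(). */
static int
nextrand(void)
{
	rand_state = rand_state * 1103515245UL + 12345;
	return ((int)(rand_state & 0x7fffffff));
}

int
main(void)
{
	int drop = 25;	/* an ipdadm-style percentage */
	int i, hits = 0;

	for (i = 0; i < 100000; i++) {
		if (nextrand() % 100 < drop)
			hits++;
	}
	printf("perturbed %d of 100000 (about %d%%)\n", hits, hits / 1000);
	return (0);
}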
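Lastly, the squeue.c hunk: squeue_drain() now refuses to start when fewer than squeue_drain_stack_needed bytes of kernel stack remain, kicking the worker thread instead, since the worker runs on a fresh stack. On a downward-growing stack the headroom is the current frame address minus the stack base: getfp() + STACK_BIAS versus curthread->t_stkbase in the kernel. A toy model of the same arithmetic, with the base faked because user space has no t_stkbase:

#include <stdint.h>
#include <stdio.h>

#define	STACK_NEEDED	10240	/* mirrors squeue_drain_stack_needed */

static uintptr_t fake_stack_base;	/* curthread->t_stkbase stand-in */

/* Bytes of stack left, assuming the stack grows downward. */
static size_t
stack_left(void)
{
	char marker;

	return ((uintptr_t)&marker - fake_stack_base);
}

static void
drain(int depth)
{
	char frame[512];	/* simulate the footprint of real work */

	frame[0] = (char)depth;
	if (stack_left() < STACK_NEEDED) {
		/* squeue_drain() would cv_signal the worker and return. */
		printf("too deep at depth %d (marker %d)\n", depth, frame[0]);
		return;
	}
	drain(depth + 1);
}

int
main(void)
{
	char top;

	/* Pretend this thread has 64KB of stack below the current frame. */
	fake_stack_base = (uintptr_t)&top - 64 * 1024;
	drain(0);
	return (0);
}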