6558857 ZSD callback locking cause deadlocks

author: nordmark <none@none> 2008-01-22 15:57:26 -0800
committer: nordmark <none@none> 2008-01-22 15:57:26 -0800
commit: bd41d0a82bd89bc81d63ae5dfc2ba4245f74ea6c (patch)
tree: 34d74b100f909c973299a5ded0d0a231ac2d069e /usr/src
parent: c63537d6ab9d03a6ce330b36e829aba258c25d87 (diff)
download: illumos-joyent-bd41d0a82bd89bc81d63ae5dfc2ba4245f74ea6c.tar.gz
5 files changed, 1078 insertions, 460 deletions
diff --git a/usr/src/lib/libzonecfg/common/libzonecfg.c b/usr/src/lib/libzonecfg/common/libzonecfg.c
index 94644741dd..2bce66d3f5 100644
--- a/usr/src/lib/libzonecfg/common/libzonecfg.c
+++ b/usr/src/lib/libzonecfg/common/libzonecfg.c
@@ -5107,6 +5107,8 @@ kernel_state_to_user_state(zoneid_t zoneid, zone_status_t kernel_state)
 	assert(kernel_state <= ZONE_MAX_STATE);
 	switch (kernel_state) {
 		case ZONE_IS_UNINITIALIZED:
+		case ZONE_IS_INITIALIZED:
+			/* The kernel will not return these two states */
 			return (ZONE_STATE_READY);
 		case ZONE_IS_READY:
 			/*
diff --git a/usr/src/uts/common/os/netstack.c b/usr/src/uts/common/os/netstack.c
index 44b147dc48..c1e59fe6c3 100644
--- a/usr/src/uts/common/os/netstack.c
+++ b/usr/src/uts/common/os/netstack.c
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -106,10 +106,6 @@ static void	*netstack_zone_create(zoneid_t zoneid);
 static void	netstack_zone_shutdown(zoneid_t zoneid, void *arg);
 static void	netstack_zone_destroy(zoneid_t zoneid, void *arg);
 
-static void	netstack_do_create(netstack_t *ns, int moduleid);
-static void	netstack_do_shutdown(netstack_t *ns, int moduleid);
-static void	netstack_do_destroy(netstack_t *ns, int moduleid);
-
 static void	netstack_shared_zone_add(zoneid_t zoneid);
 static void	netstack_shared_zone_remove(zoneid_t zoneid);
 static void	netstack_shared_kstat_add(kstat_t *ks);
@@ -117,6 +113,16 @@ static void	netstack_shared_kstat_remove(kstat_t *ks);
 
 typedef boolean_t applyfn_t(kmutex_t *, netstack_t *, int);
 
+static void	apply_all_netstacks(int, applyfn_t *);
+static void	apply_all_modules(netstack_t *, applyfn_t *);
+static void	apply_all_modules_reverse(netstack_t *, applyfn_t *);
+static boolean_t netstack_apply_create(kmutex_t *, netstack_t *, int);
+static boolean_t netstack_apply_shutdown(kmutex_t *, netstack_t *, int);
+static boolean_t netstack_apply_destroy(kmutex_t *, netstack_t *, int);
+static boolean_t wait_for_zone_creator(netstack_t *, kmutex_t *);
+static boolean_t wait_for_nms_inprogress(netstack_t *, nm_state_t *,
+    kmutex_t *);
+
 void
 netstack_init(void)
 {
@@ -156,6 +162,10 @@ netstack_register(int moduleid,
 	ASSERT(moduleid >= 0 && moduleid < NS_MAX);
 	ASSERT(module_create != NULL);
 
+	/*
+	 * Make instances created after this point in time run the create
+	 * callback.
+	 */
 	mutex_enter(&netstack_g_lock);
 	ASSERT(ns_reg[moduleid].nr_create == NULL);
 	ASSERT(ns_reg[moduleid].nr_flags == 0);
@@ -166,15 +176,17 @@ netstack_register(int moduleid,
 
 	/*
 	 * Determine the set of stacks that exist before we drop the lock.
-	 * Set CREATE_NEEDED for each of those.
+	 * Set NSS_CREATE_NEEDED for each of those.
 	 * netstacks which have been deleted will have NSS_CREATE_COMPLETED
 	 * set, but check NSF_CLOSING to be sure.
 	 */
 	for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
+		nm_state_t *nms = &ns->netstack_m_state[moduleid];
+
 		mutex_enter(&ns->netstack_lock);
 		if (!(ns->netstack_flags & NSF_CLOSING) &&
-		    (ns->netstack_m_state[moduleid] & NSS_CREATE_ALL) == 0) {
-			ns->netstack_m_state[moduleid] |= NSS_CREATE_NEEDED;
+		    (nms->nms_flags & NSS_CREATE_ALL) == 0) {
+			nms->nms_flags |= NSS_CREATE_NEEDED;
 			DTRACE_PROBE2(netstack__create__needed,
 			    netstack_t *, ns, int, moduleid);
 		}
@@ -183,12 +195,12 @@ netstack_register(int moduleid,
 	mutex_exit(&netstack_g_lock);
 
 	/*
-	 * Call the create function for each stack that has CREATE_NEEDED
-	 * for this moduleid.
-	 * Set CREATE_INPROGRESS, drop lock, and after done,
-	 * set CREATE_COMPLETE
+	 * At this point in time a new instance can be created or an instance
+	 * can be destroyed, or some other module can register or unregister.
+	 * Make sure we either run all the create functions for this moduleid
+	 * or we wait for any other creators for this moduleid.
 	 */
-	netstack_do_create(NULL, moduleid);
+	apply_all_netstacks(moduleid, netstack_apply_create);
 }
 
 void
@@ -204,41 +216,57 @@ netstack_unregister(int moduleid)
 	mutex_enter(&netstack_g_lock);
 	/*
 	 * Determine the set of stacks that exist before we drop the lock.
-	 * Set SHUTDOWN_NEEDED and DESTROY_NEEDED for each of those.
+	 * Set NSS_SHUTDOWN_NEEDED and NSS_DESTROY_NEEDED for each of those.
+	 * That ensures that when we return all the callbacks for existing
+	 * instances have completed. And since we set NRF_DYING no new
+	 * instances can use this module.
 	 */
 	for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
+		nm_state_t *nms = &ns->netstack_m_state[moduleid];
+
 		mutex_enter(&ns->netstack_lock);
 		if (ns_reg[moduleid].nr_shutdown != NULL &&
-		    (ns->netstack_m_state[moduleid] & NSS_CREATE_COMPLETED) &&
-		    (ns->netstack_m_state[moduleid] & NSS_SHUTDOWN_ALL) == 0) {
-			ns->netstack_m_state[moduleid] |= NSS_SHUTDOWN_NEEDED;
+		    (nms->nms_flags & NSS_CREATE_COMPLETED) &&
+		    (nms->nms_flags & NSS_SHUTDOWN_ALL) == 0) {
+			nms->nms_flags |= NSS_SHUTDOWN_NEEDED;
 			DTRACE_PROBE2(netstack__shutdown__needed,
 			    netstack_t *, ns, int, moduleid);
 		}
 		if ((ns_reg[moduleid].nr_flags & NRF_REGISTERED) &&
 		    ns_reg[moduleid].nr_destroy != NULL &&
-		    (ns->netstack_m_state[moduleid] & NSS_CREATE_COMPLETED) &&
-		    (ns->netstack_m_state[moduleid] & NSS_DESTROY_ALL) == 0) {
-			ns->netstack_m_state[moduleid] |= NSS_DESTROY_NEEDED;
+		    (nms->nms_flags & NSS_CREATE_COMPLETED) &&
+		    (nms->nms_flags & NSS_DESTROY_ALL) == 0) {
+			nms->nms_flags |= NSS_DESTROY_NEEDED;
 			DTRACE_PROBE2(netstack__destroy__needed,
 			    netstack_t *, ns, int, moduleid);
 		}
 		mutex_exit(&ns->netstack_lock);
 	}
+	/*
+	 * Prevent any new netstack from calling the registered create
+	 * function, while keeping the function pointers in place until the
+	 * shutdown and destroy callbacks are complete.
+	 */
+	ns_reg[moduleid].nr_flags |= NRF_DYING;
 	mutex_exit(&netstack_g_lock);
 
-	netstack_do_shutdown(NULL, moduleid);
-	netstack_do_destroy(NULL, moduleid);
+	apply_all_netstacks(moduleid, netstack_apply_shutdown);
+	apply_all_netstacks(moduleid, netstack_apply_destroy);
 
 	/*
-	 * Clear the netstack_m_state so that we can handle this module
+	 * Clear the nms_flags so that we can handle this module
 	 * being loaded again.
+	 * Also remove the registered functions.
 	 */
 	mutex_enter(&netstack_g_lock);
+	ASSERT(ns_reg[moduleid].nr_flags & NRF_REGISTERED);
+	ASSERT(ns_reg[moduleid].nr_flags & NRF_DYING);
 	for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
+		nm_state_t *nms = &ns->netstack_m_state[moduleid];
+
 		mutex_enter(&ns->netstack_lock);
-		if (ns->netstack_m_state[moduleid] & NSS_DESTROY_COMPLETED) {
-			ns->netstack_m_state[moduleid] = 0;
+		if (nms->nms_flags & NSS_DESTROY_COMPLETED) {
+			nms->nms_flags = 0;
 			DTRACE_PROBE2(netstack__destroy__done,
 			    netstack_t *, ns, int, moduleid);
 		}
@@ -304,6 +332,7 @@ netstack_zone_create(zoneid_t zoneid)
 	}
 	/* Not found */
 	mutex_init(&ns->netstack_lock, NULL, MUTEX_DEFAULT, NULL);
+	cv_init(&ns->netstack_cv, NULL, CV_DEFAULT, NULL);
 	ns->netstack_stackid = zoneid;
 	ns->netstack_numzones = 1;
 	ns->netstack_refcnt = 1; /* Decremented by netstack_zone_destroy */
@@ -311,26 +340,44 @@ netstack_zone_create(zoneid_t zoneid)
 	*nsp = ns;
 	zone->zone_netstack = ns;
 
+	mutex_enter(&ns->netstack_lock);
+	/*
+	 * Mark this netstack as having a CREATE running so
+	 * any netstack_register/netstack_unregister waits for
+	 * the existing create callbacks to complete in moduleid order
+	 */
+	ns->netstack_flags |= NSF_ZONE_CREATE;
+
 	/*
 	 * Determine the set of module create functions that need to be
 	 * called before we drop the lock.
+	 * Set NSS_CREATE_NEEDED for each of those.
+	 * Skip any with NRF_DYING set, since those are in the process of
+	 * going away, by checking for flags being exactly NRF_REGISTERED.
 	 */
 	for (i = 0; i < NS_MAX; i++) {
-		mutex_enter(&ns->netstack_lock);
-		if ((ns_reg[i].nr_flags & NRF_REGISTERED) &&
-		    (ns->netstack_m_state[i] & NSS_CREATE_ALL) == 0) {
-			ns->netstack_m_state[i] |= NSS_CREATE_NEEDED;
+		nm_state_t *nms = &ns->netstack_m_state[i];
+
+		cv_init(&nms->nms_cv, NULL, CV_DEFAULT, NULL);
+
+		if ((ns_reg[i].nr_flags == NRF_REGISTERED) &&
+		    (nms->nms_flags & NSS_CREATE_ALL) == 0) {
+			nms->nms_flags |= NSS_CREATE_NEEDED;
 			DTRACE_PROBE2(netstack__create__needed,
 			    netstack_t *, ns, int, i);
 		}
-		mutex_exit(&ns->netstack_lock);
 	}
+	mutex_exit(&ns->netstack_lock);
 	mutex_exit(&netstack_g_lock);
 
-	netstack_do_create(ns, NS_ALL);
+	apply_all_modules(ns, netstack_apply_create);
 
+	/* Tell any waiting netstack_register/netstack_unregister to proceed */
 	mutex_enter(&ns->netstack_lock);
 	ns->netstack_flags &= ~NSF_UNINIT;
+	ASSERT(ns->netstack_flags & NSF_ZONE_CREATE);
+	ns->netstack_flags &= ~NSF_ZONE_CREATE;
+	cv_broadcast(&ns->netstack_cv);
 	mutex_exit(&ns->netstack_lock);
 
 	return (ns);
@@ -356,29 +403,46 @@ netstack_zone_shutdown(zoneid_t zoneid, void *arg)
 	mutex_exit(&ns->netstack_lock);
 
 	mutex_enter(&netstack_g_lock);
+	mutex_enter(&ns->netstack_lock);
+	/*
+	 * Mark this netstack as having a SHUTDOWN running so
+	 * any netstack_register/netstack_unregister waits for
+	 * the existing shutdown callbacks to complete in moduleid order
+	 */
+	ASSERT(!(ns->netstack_flags & NSF_ZONE_INPROGRESS));
+	ns->netstack_flags |= NSF_ZONE_SHUTDOWN;
+
 	/*
 	 * Determine the set of stacks that exist before we drop the lock.
-	 * Set SHUTDOWN_NEEDED for each of those.
+	 * Set NSS_SHUTDOWN_NEEDED for each of those.
 	 */
 	for (i = 0; i < NS_MAX; i++) {
-		mutex_enter(&ns->netstack_lock);
+		nm_state_t *nms = &ns->netstack_m_state[i];
+
 		if ((ns_reg[i].nr_flags & NRF_REGISTERED) &&
 		    ns_reg[i].nr_shutdown != NULL &&
-		    (ns->netstack_m_state[i] & NSS_CREATE_COMPLETED) &&
-		    (ns->netstack_m_state[i] & NSS_SHUTDOWN_ALL) == 0) {
-			ns->netstack_m_state[i] |= NSS_SHUTDOWN_NEEDED;
+		    (nms->nms_flags & NSS_CREATE_COMPLETED) &&
+		    (nms->nms_flags & NSS_SHUTDOWN_ALL) == 0) {
+			nms->nms_flags |= NSS_SHUTDOWN_NEEDED;
 			DTRACE_PROBE2(netstack__shutdown__needed,
 			    netstack_t *, ns, int, i);
 		}
-		mutex_exit(&ns->netstack_lock);
 	}
+	mutex_exit(&ns->netstack_lock);
 	mutex_exit(&netstack_g_lock);
 
 	/*
 	 * Call the shutdown function for all registered modules for this
 	 * netstack.
 	 */
-	netstack_do_shutdown(ns, NS_ALL);
+	apply_all_modules(ns, netstack_apply_shutdown);
+
+	/* Tell any waiting netstack_register/netstack_unregister to proceed */
+	mutex_enter(&ns->netstack_lock);
+	ASSERT(ns->netstack_flags & NSF_ZONE_SHUTDOWN);
+	ns->netstack_flags &= ~NSF_ZONE_SHUTDOWN;
+	cv_broadcast(&ns->netstack_cv);
+	mutex_exit(&ns->netstack_lock);
 }
 
 /*
@@ -429,70 +493,183 @@ netstack_stack_inactive(netstack_t *ns)
 	int i;
 
 	mutex_enter(&netstack_g_lock);
+	mutex_enter(&ns->netstack_lock);
+	/*
+	 * Mark this netstack as having a DESTROY running so
+	 * any netstack_register/netstack_unregister waits for
+	 * the existing destroy callbacks to complete in reverse moduleid order
+	 */
+	ASSERT(!(ns->netstack_flags & NSF_ZONE_INPROGRESS));
+	ns->netstack_flags |= NSF_ZONE_DESTROY;
 	/*
 	 * If the shutdown callback wasn't called earlier (e.g., if this is
-	 * a netstack shared between multiple zones), then we call it now.
+	 * a netstack shared between multiple zones), then we schedule it now.
+	 *
+	 * Determine the set of stacks that exist before we drop the lock.
+	 * Set NSS_DESTROY_NEEDED for each of those. That
+	 * ensures that when we return all the callbacks for existing
+	 * instances have completed.
 	 */
 	for (i = 0; i < NS_MAX; i++) {
-		mutex_enter(&ns->netstack_lock);
+		nm_state_t *nms = &ns->netstack_m_state[i];
+
 		if ((ns_reg[i].nr_flags & NRF_REGISTERED) &&
 		    ns_reg[i].nr_shutdown != NULL &&
-		    (ns->netstack_m_state[i] & NSS_CREATE_COMPLETED) &&
-		    (ns->netstack_m_state[i] & NSS_SHUTDOWN_ALL) == 0) {
-			ns->netstack_m_state[i] |= NSS_SHUTDOWN_NEEDED;
+		    (nms->nms_flags & NSS_CREATE_COMPLETED) &&
+		    (nms->nms_flags & NSS_SHUTDOWN_ALL) == 0) {
+			nms->nms_flags |= NSS_SHUTDOWN_NEEDED;
 			DTRACE_PROBE2(netstack__shutdown__needed,
 			    netstack_t *, ns, int, i);
 		}
-		mutex_exit(&ns->netstack_lock);
-	}
-	/*
-	 * Determine the set of stacks that exist before we drop the lock.
-	 * Set DESTROY_NEEDED for each of those.
-	 */
-	for (i = 0; i < NS_MAX; i++) {
-		mutex_enter(&ns->netstack_lock);
+
 		if ((ns_reg[i].nr_flags & NRF_REGISTERED) &&
 		    ns_reg[i].nr_destroy != NULL &&
-		    (ns->netstack_m_state[i] & NSS_CREATE_COMPLETED) &&
-		    (ns->netstack_m_state[i] & NSS_DESTROY_ALL) == 0) {
-			ns->netstack_m_state[i] |= NSS_DESTROY_NEEDED;
+		    (nms->nms_flags & NSS_CREATE_COMPLETED) &&
+		    (nms->nms_flags & NSS_DESTROY_ALL) == 0) {
+			nms->nms_flags |= NSS_DESTROY_NEEDED;
 			DTRACE_PROBE2(netstack__destroy__needed,
 			    netstack_t *, ns, int, i);
 		}
-		mutex_exit(&ns->netstack_lock);
 	}
+	mutex_exit(&ns->netstack_lock);
 	mutex_exit(&netstack_g_lock);
 
 	/*
 	 * Call the shutdown and destroy functions for all registered modules
 	 * for this netstack.
+	 *
+	 * Since there are some ordering dependencies between the modules we
+	 * tear them down in the reverse order of what was used to create them.
+	 *
+	 * Since a netstack_t is never reused (when a zone is rebooted it gets
+	 * a new zoneid == netstackid i.e. a new netstack_t is allocated) we
+	 * leave nms_flags the way it is i.e. with NSS_DESTROY_COMPLETED set.
+	 * That is different than in the netstack_unregister() case.
 	 */
-	netstack_do_shutdown(ns, NS_ALL);
-	netstack_do_destroy(ns, NS_ALL);
+	apply_all_modules(ns, netstack_apply_shutdown);
+	apply_all_modules_reverse(ns, netstack_apply_destroy);
+
+	/* Tell any waiting netstack_register/netstack_unregister to proceed */
+	mutex_enter(&ns->netstack_lock);
+	ASSERT(ns->netstack_flags & NSF_ZONE_DESTROY);
+	ns->netstack_flags &= ~NSF_ZONE_DESTROY;
+	cv_broadcast(&ns->netstack_cv);
+	mutex_exit(&ns->netstack_lock);
+}
+
+/*
+ * Apply a function to all netstacks for a particular moduleid.
+ *
+ * If there is any zone activity (due to a zone being created, shutdown,
+ * or destroyed) we wait for that to complete before we proceed. This ensures
+ * that the moduleids are processed in order when a zone is created or
+ * destroyed.
+ *
+ * The applyfn has to drop netstack_g_lock if it does some work.
+ * In that case we don't follow netstack_next,
+ * even if it is possible to do so without any hazards. This is
+ * because we want the design to allow for the list of netstacks threaded
+ * by netstack_next to change in any arbitrary way during the time the
+ * lock was dropped.
+ *
+ * It is safe to restart the loop at netstack_head since the applyfn
+ * changes netstack_m_state as it processes things, so a subsequent
+ * pass through will have no effect in applyfn, hence the loop will terminate
+ * in at worst O(N^2).
+ */
+static void
+apply_all_netstacks(int moduleid, applyfn_t *applyfn)
+{
+	netstack_t *ns;
+
+	mutex_enter(&netstack_g_lock);
+	ns = netstack_head;
+	while (ns != NULL) {
+		if (wait_for_zone_creator(ns, &netstack_g_lock)) {
+			/* Lock dropped - restart at head */
+			ns = netstack_head;
+		} else if ((applyfn)(&netstack_g_lock, ns, moduleid)) {
+			/* Lock dropped - restart at head */
+			ns = netstack_head;
+		} else {
+			ns = ns->netstack_next;
+		}
+	}
+	mutex_exit(&netstack_g_lock);
+}
+
+/*
+ * Apply a function to all moduleids for a particular netstack.
+ *
+ * Since the netstack linkage doesn't matter in this case we can
+ * ignore whether the function drops the lock.
+ */
+static void
+apply_all_modules(netstack_t *ns, applyfn_t *applyfn)
+{
+	int i;
+
+	mutex_enter(&netstack_g_lock);
+	for (i = 0; i < NS_MAX; i++) {
+		/*
+		 * We don't care whether the lock was dropped
+		 * since we are not iterating over netstack_head.
+		 */
+		(void) (applyfn)(&netstack_g_lock, ns, i);
+	}
+	mutex_exit(&netstack_g_lock);
+}
+
+/* Like the above but in reverse moduleid order */
+static void
+apply_all_modules_reverse(netstack_t *ns, applyfn_t *applyfn)
+{
+	int i;
+
+	mutex_enter(&netstack_g_lock);
+	for (i = NS_MAX-1; i >= 0; i--) {
+		/*
+		 * We don't care whether the lock was dropped
+		 * since we are not iterating over netstack_head.
+		 */
+		(void) (applyfn)(&netstack_g_lock, ns, i);
+	}
+	mutex_exit(&netstack_g_lock);
 }
 
 /*
  * Call the create function for the ns and moduleid if CREATE_NEEDED
  * is set.
- * When it calls it, it drops the netstack_lock held by the caller,
- * and returns true to tell the caller it needs to re-evalute the
- * state..
+ * If some other thread gets here first and sets *_INPROGRESS, then
+ * we wait for that thread to complete so that we can ensure that
+ * all the callbacks are done when we've looped over all netstacks/moduleids.
+ *
+ * When we call the create function, we temporarily drop the netstack_lock
+ * held by the caller, and return true to tell the caller it needs to
+ * re-evaluate the state.
  */
 static boolean_t
 netstack_apply_create(kmutex_t *lockp, netstack_t *ns, int moduleid)
 {
 	void *result;
 	netstackid_t stackid;
+	nm_state_t *nms = &ns->netstack_m_state[moduleid];
+	boolean_t dropped = B_FALSE;
 
 	ASSERT(MUTEX_HELD(lockp));
 	mutex_enter(&ns->netstack_lock);
-	if (ns->netstack_m_state[moduleid] & NSS_CREATE_NEEDED) {
-		ns->netstack_m_state[moduleid] &= ~NSS_CREATE_NEEDED;
-		ns->netstack_m_state[moduleid] |= NSS_CREATE_INPROGRESS;
+
+	if (wait_for_nms_inprogress(ns, nms, lockp))
+		dropped = B_TRUE;
+
+	if (nms->nms_flags & NSS_CREATE_NEEDED) {
+		nms->nms_flags &= ~NSS_CREATE_NEEDED;
+		nms->nms_flags |= NSS_CREATE_INPROGRESS;
 		DTRACE_PROBE2(netstack__create__inprogress,
 		    netstack_t *, ns, int, moduleid);
 		mutex_exit(&ns->netstack_lock);
 		mutex_exit(lockp);
+		dropped = B_TRUE;
 
 		ASSERT(ns_reg[moduleid].nr_create != NULL);
 		stackid = ns->netstack_stackid;
@@ -504,42 +681,55 @@ netstack_apply_create(kmutex_t *lockp, netstack_t *ns, int moduleid)
 		    void *, result, netstack_t *, ns);
 
 		ASSERT(result != NULL);
+		mutex_enter(lockp);
 		mutex_enter(&ns->netstack_lock);
 		ns->netstack_modules[moduleid] = result;
-		ns->netstack_m_state[moduleid] &= ~NSS_CREATE_INPROGRESS;
-		ns->netstack_m_state[moduleid] |= NSS_CREATE_COMPLETED;
+		nms->nms_flags &= ~NSS_CREATE_INPROGRESS;
+		nms->nms_flags |= NSS_CREATE_COMPLETED;
+		cv_broadcast(&nms->nms_cv);
 		DTRACE_PROBE2(netstack__create__completed,
 		    netstack_t *, ns, int, moduleid);
 		mutex_exit(&ns->netstack_lock);
-		return (B_TRUE);
+		return (dropped);
 	} else {
 		mutex_exit(&ns->netstack_lock);
-		return (B_FALSE);
+		return (dropped);
 	}
 }
 
 /*
  * Call the shutdown function for the ns and moduleid if SHUTDOWN_NEEDED
  * is set.
- * When it calls it, it drops the netstack_lock held by the caller,
- * and returns true to tell the caller it needs to re-evalute the
- * state..
+ * If some other thread gets here first and sets *_INPROGRESS, then
+ * we wait for that thread to complete so that we can ensure that
+ * all the callbacks are done when we've looped over all netstacks/moduleids.
+ *
+ * When we call the shutdown function, we temporarily drop the netstack_lock
+ * held by the caller, and return true to tell the caller it needs to
+ * re-evaluate the state.
  */
 static boolean_t
 netstack_apply_shutdown(kmutex_t *lockp, netstack_t *ns, int moduleid)
 {
 	netstackid_t stackid;
 	void * netstack_module;
+	nm_state_t *nms = &ns->netstack_m_state[moduleid];
+	boolean_t dropped = B_FALSE;
 
 	ASSERT(MUTEX_HELD(lockp));
 	mutex_enter(&ns->netstack_lock);
-	if (ns->netstack_m_state[moduleid] & NSS_SHUTDOWN_NEEDED) {
-		ns->netstack_m_state[moduleid] &= ~NSS_SHUTDOWN_NEEDED;
-		ns->netstack_m_state[moduleid] |= NSS_SHUTDOWN_INPROGRESS;
+
+	if (wait_for_nms_inprogress(ns, nms, lockp))
+		dropped = B_TRUE;
+
+	if (nms->nms_flags & NSS_SHUTDOWN_NEEDED) {
+		nms->nms_flags &= ~NSS_SHUTDOWN_NEEDED;
+		nms->nms_flags |= NSS_SHUTDOWN_INPROGRESS;
 		DTRACE_PROBE2(netstack__shutdown__inprogress,
 		    netstack_t *, ns, int, moduleid);
 		mutex_exit(&ns->netstack_lock);
 		mutex_exit(lockp);
+		dropped = B_TRUE;
 
 		ASSERT(ns_reg[moduleid].nr_shutdown != NULL);
 		stackid = ns->netstack_stackid;
@@ -551,43 +741,55 @@ netstack_apply_shutdown(kmutex_t *lockp, netstack_t *ns, int moduleid)
 		DTRACE_PROBE1(netstack__shutdown__end,
 		    netstack_t *, ns);
 
+		mutex_enter(lockp);
 		mutex_enter(&ns->netstack_lock);
-		ns->netstack_m_state[moduleid] &= ~NSS_SHUTDOWN_INPROGRESS;
-		ns->netstack_m_state[moduleid] |= NSS_SHUTDOWN_COMPLETED;
+		nms->nms_flags &= ~NSS_SHUTDOWN_INPROGRESS;
+		nms->nms_flags |= NSS_SHUTDOWN_COMPLETED;
+		cv_broadcast(&nms->nms_cv);
 		DTRACE_PROBE2(netstack__shutdown__completed,
 		    netstack_t *, ns, int, moduleid);
 		mutex_exit(&ns->netstack_lock);
-		return (B_TRUE);
+		return (dropped);
 	} else {
 		mutex_exit(&ns->netstack_lock);
-		return (B_FALSE);
+		return (dropped);
 	}
 }
 
 /*
  * Call the destroy function for the ns and moduleid if DESTROY_NEEDED
  * is set.
- * When it calls it, it drops the netstack_lock held by the caller,
- * and returns true to tell the caller it needs to re-evalute the
- * state..
+ * If some other thread gets here first and sets *_INPROGRESS, then
+ * we wait for that thread to complete so that we can ensure that
+ * all the callbacks are done when we've looped over all netstacks/moduleids.
+ *
+ * When we call the destroy function, we temporarily drop the netstack_lock
+ * held by the caller, and return true to tell the caller it needs to
+ * re-evaluate the state.
  */
 static boolean_t
 netstack_apply_destroy(kmutex_t *lockp, netstack_t *ns, int moduleid)
 {
 	netstackid_t stackid;
 	void * netstack_module;
+	nm_state_t *nms = &ns->netstack_m_state[moduleid];
+	boolean_t dropped = B_FALSE;
 
 	ASSERT(MUTEX_HELD(lockp));
 	mutex_enter(&ns->netstack_lock);
-	if (ns->netstack_m_state[moduleid] & NSS_DESTROY_NEEDED) {
-		ns->netstack_m_state[moduleid] &= ~NSS_DESTROY_NEEDED;
-		ns->netstack_m_state[moduleid] |= NSS_DESTROY_INPROGRESS;
+
+	if (wait_for_nms_inprogress(ns, nms, lockp))
+		dropped = B_TRUE;
+
+	if (nms->nms_flags & NSS_DESTROY_NEEDED) {
+		nms->nms_flags &= ~NSS_DESTROY_NEEDED;
+		nms->nms_flags |= NSS_DESTROY_INPROGRESS;
 		DTRACE_PROBE2(netstack__destroy__inprogress,
 		    netstack_t *, ns, int, moduleid);
 		mutex_exit(&ns->netstack_lock);
 		mutex_exit(lockp);
+		dropped = B_TRUE;
 
-		/* XXX race against unregister? */
 		ASSERT(ns_reg[moduleid].nr_destroy != NULL);
 		stackid = ns->netstack_stackid;
 		netstack_module = ns->netstack_modules[moduleid];
@@ -598,177 +800,83 @@ netstack_apply_destroy(kmutex_t *lockp, netstack_t *ns, int moduleid)
 		DTRACE_PROBE1(netstack__destroy__end,
 		    netstack_t *, ns);
 
+		mutex_enter(lockp);
 		mutex_enter(&ns->netstack_lock);
 		ns->netstack_modules[moduleid] = NULL;
-		ns->netstack_m_state[moduleid] &= ~NSS_DESTROY_INPROGRESS;
-		ns->netstack_m_state[moduleid] |= NSS_DESTROY_COMPLETED;
+		nms->nms_flags &= ~NSS_DESTROY_INPROGRESS;
+		nms->nms_flags |= NSS_DESTROY_COMPLETED;
+		cv_broadcast(&nms->nms_cv);
 		DTRACE_PROBE2(netstack__destroy__completed,
 		    netstack_t *, ns, int, moduleid);
 		mutex_exit(&ns->netstack_lock);
-		return (B_TRUE);
+		return (dropped);
 	} else {
 		mutex_exit(&ns->netstack_lock);
-		return (B_FALSE);
+		return (dropped);
 	}
 }
 
 /*
- * Apply a function to all netstacks for a particular moduleid.
- *
- * The applyfn has to drop netstack_g_lock if it does some work.
- * In that case we don't follow netstack_next after reacquiring the
- * lock, even if it is possible to do so without any hazards. This is
- * because we want the design to allow for the list of netstacks threaded
- * by netstack_next to change in any arbitrary way during the time the
- * lock was dropped.
- *
- * It is safe to restart the loop at netstack_head since the applyfn
- * changes netstack_m_state as it processes things, so a subsequent
- * pass through will have no effect in applyfn, hence the loop will terminate
- * in at worst O(N^2).
+ * If somebody is creating the netstack (due to a new zone being created)
+ * then we wait for them to complete. This ensures that any additional
+ * netstack_register() doesn't cause the create functions to run out of
+ * order.
+ * Note that we do not need such a global wait in the case of the shutdown
+ * and destroy callbacks, since in that case it is sufficient for both
+ * threads to set NEEDED and wait for INPROGRESS to ensure ordering.
+ * Returns true if lockp was temporarily dropped while waiting.
  */
-static void
-apply_all_netstacks(int moduleid, applyfn_t *applyfn)
+static boolean_t
+wait_for_zone_creator(netstack_t *ns, kmutex_t *lockp)
 {
-	netstack_t *ns;
+	boolean_t dropped = B_FALSE;
 
-	mutex_enter(&netstack_g_lock);
-	ns = netstack_head;
-	while (ns != NULL) {
-		if ((applyfn)(&netstack_g_lock, ns, moduleid)) {
-			/* Lock dropped - restart at head */
-#ifdef NS_DEBUG
-			(void) printf("apply_all_netstacks: "
-			    "LD for %p/%d, %d\n",
-			    (void *)ns, ns->netstack_stackid, moduleid);
-#endif
-			mutex_enter(&netstack_g_lock);
-			ns = netstack_head;
-		} else {
-			ns = ns->netstack_next;
+	mutex_enter(&ns->netstack_lock);
+	while (ns->netstack_flags & NSF_ZONE_CREATE) {
+		DTRACE_PROBE1(netstack__wait__zone__inprogress,
+		    netstack_t *, ns);
+		if (lockp != NULL) {
+			dropped = B_TRUE;
+			mutex_exit(lockp);
+		}
+		cv_wait(&ns->netstack_cv, &ns->netstack_lock);
+		if (lockp != NULL) {
+			/* First drop netstack_lock to preserve order */
+			mutex_exit(&ns->netstack_lock);
+			mutex_enter(lockp);
+			mutex_enter(&ns->netstack_lock);
 		}
 	}
-	mutex_exit(&netstack_g_lock);
+	mutex_exit(&ns->netstack_lock);
+	return (dropped);
 }
 
 /*
- * Apply a function to all moduleids for a particular netstack.
- *
- * Since the netstack linkage doesn't matter in this case we can
- * ignore whether the function drops the lock.
+ * Wait for any INPROGRESS flag to be cleared for the netstack/moduleid
+ * combination.
+ * Returns true if lockp was temporarily dropped while waiting.
  */
-static void
-apply_all_modules(netstack_t *ns, applyfn_t *applyfn)
+static boolean_t
+wait_for_nms_inprogress(netstack_t *ns, nm_state_t *nms, kmutex_t *lockp)
 {
-	int i;
-
-	mutex_enter(&netstack_g_lock);
-	for (i = 0; i < NS_MAX; i++) {
-		if ((applyfn)(&netstack_g_lock, ns, i)) {
-			/*
-			 * Lock dropped but since we are not iterating over
-			 * netstack_head we can just reacquire the lock.
-			 */
-			mutex_enter(&netstack_g_lock);
+	boolean_t dropped = B_FALSE;
+
+	while (nms->nms_flags & NSS_ALL_INPROGRESS) {
+		DTRACE_PROBE2(netstack__wait__nms__inprogress,
+		    netstack_t *, ns, nm_state_t *, nms);
+		if (lockp != NULL) {
+			dropped = B_TRUE;
+			mutex_exit(lockp);
 		}
-	}
-	mutex_exit(&netstack_g_lock);
-}
-
-/* Like the above but in reverse moduleid order */
-static void
-apply_all_modules_reverse(netstack_t *ns, applyfn_t *applyfn)
-{
-	int i;
-
-	mutex_enter(&netstack_g_lock);
-	for (i = NS_MAX-1; i >= 0; i--) {
-		if ((applyfn)(&netstack_g_lock, ns, i)) {
-			/*
-			 * Lock dropped but since we are not iterating over
-			 * netstack_head we can just reacquire the lock.
-			 */
-			mutex_enter(&netstack_g_lock);
+		cv_wait(&nms->nms_cv, &ns->netstack_lock);
+		if (lockp != NULL) {
+			/* First drop netstack_lock to preserve order */
+			mutex_exit(&ns->netstack_lock);
+			mutex_enter(lockp);
+			mutex_enter(&ns->netstack_lock);
 		}
 	}
-	mutex_exit(&netstack_g_lock);
-}
-
-/*
- * Apply a function to a subset of all module/netstack combinations.
- *
- * If ns is non-NULL we restrict it to that particular instance.
- * If moduleid is a particular one (not NS_ALL), then we restrict it
- * to that particular moduleid.
- * When walking the moduleid, the reverse argument specifies that they
- * should be walked in reverse order.
- * The applyfn returns true if it had dropped the locks.
- */
-static void
-netstack_do_apply(netstack_t *ns, int moduleid, boolean_t reverse,
-    applyfn_t *applyfn)
-{
-	if (ns != NULL) {
-		ASSERT(moduleid == NS_ALL);
-		if (reverse)
-			apply_all_modules_reverse(ns, applyfn);
-		else
-			apply_all_modules(ns, applyfn);
-	} else {
-		ASSERT(moduleid != NS_ALL);
-
-		apply_all_netstacks(moduleid, applyfn);
-	}
-}
-
-/*
- * Run the create function for all modules x stack combinations
- * that have NSS_CREATE_NEEDED set.
- *
- * Call the create function for each stack that has CREATE_NEEDED.
- * Set CREATE_INPROGRESS, drop lock, and after done,
- * set CREATE_COMPLETE
- */
-static void
-netstack_do_create(netstack_t *ns, int moduleid)
-{
-	netstack_do_apply(ns, moduleid, B_FALSE, netstack_apply_create);
-}
-
-/*
- * Run the shutdown function for all modules x stack combinations
- * that have NSS_SHUTDOWN_NEEDED set.
- *
- * Call the shutdown function for each stack that has SHUTDOWN_NEEDED.
- * Set SHUTDOWN_INPROGRESS, drop lock, and after done,
- * set SHUTDOWN_COMPLETE
- */
-static void
-netstack_do_shutdown(netstack_t *ns, int moduleid)
-{
-	netstack_do_apply(ns, moduleid, B_FALSE, netstack_apply_shutdown);
-}
-
-/*
- * Run the destroy function for all modules x stack combinations
- * that have NSS_DESTROY_NEEDED set.
- *
- * Call the destroy function for each stack that has DESTROY_NEEDED.
- * Set DESTROY_INPROGRESS, drop lock, and after done,
- * set DESTROY_COMPLETE
- *
- * Since a netstack_t is never reused (when a zone is rebooted it gets
- * a new zoneid == netstackid i.e. a new netstack_t is allocated) we leave
- * netstack_m_state the way it is i.e. with NSS_DESTROY_COMPLETED set.
- */
-static void
-netstack_do_destroy(netstack_t *ns, int moduleid)
-{
-	/*
-	 * Have to walk the moduleids in reverse order since some
-	 * modules make implicit assumptions about the order
-	 */
-	netstack_do_apply(ns, moduleid, B_TRUE, netstack_apply_destroy);
+	return (dropped);
 }
 
 /*
@@ -845,7 +953,10 @@ netstack_find_by_zoneid(zoneid_t zoneid)
 }
 
 /*
- * Find a stack instance given the zoneid.
+ * Find a stack instance given the zoneid. Can only be called from
+ * the create callback. See the comments in zone_find_by_id_nolock why
+ * that limitation exists.
+ *
  * Increases the reference count if found; caller must do a
  * netstack_rele().
  *
@@ -853,8 +964,6 @@ netstack_find_by_zoneid(zoneid_t zoneid)
  * matches.
  *
  * Skip the unitialized ones.
- *
- * NOTE: The caller must hold zonehash_lock.
  */
 netstack_t *
 netstack_find_by_zoneid_nolock(zoneid_t zoneid)
@@ -875,7 +984,7 @@ netstack_find_by_zoneid_nolock(zoneid_t zoneid)
 	else
 		netstack_hold(ns);
 
-	zone_rele(zone);
+	/* zone_find_by_id_nolock does not have a hold on the zone */
 	return (ns);
 }
 
@@ -913,6 +1022,7 @@ netstack_rele(netstack_t *ns)
 	netstack_t **nsp;
 	boolean_t found;
 	int refcnt, numzones;
+	int i;
 
 	mutex_enter(&ns->netstack_lock);
 	ASSERT(ns->netstack_refcnt > 0);
@@ -959,6 +1069,14 @@ netstack_rele(netstack_t *ns)
 		ASSERT(ns->netstack_numzones == 0);
 
 		ASSERT(ns->netstack_flags & NSF_CLOSING);
+
+		for (i = 0; i < NS_MAX; i++) {
+			nm_state_t *nms = &ns->netstack_m_state[i];
+
+			cv_destroy(&nms->nms_cv);
+		}
+		mutex_destroy(&ns->netstack_lock);
+		cv_destroy(&ns->netstack_cv);
 		kmem_free(ns, sizeof (*ns));
 	}
 }
@@ -996,7 +1114,7 @@ kstat_create_netstack(char *ks_module, int ks_instance, char *ks_name,
 		zoneid_t zoneid = ks_netstackid;
 
 		return (kstat_create_zone(ks_module, ks_instance, ks_name,
-			ks_class, ks_type, ks_ndata, ks_flags, zoneid));
+		    ks_class, ks_type, ks_ndata, ks_flags, zoneid));
 	}
 }
 
@@ -1144,7 +1262,9 @@ netstack_find_shared_zoneid(zoneid_t zoneid)
 /*
  * Hide the fact that zoneids and netstackids are allocated from
  * the same space in the current implementation.
- * XXX could add checks that the stackid/zoneids are valid...
+ * We currently do not check that the stackid/zoneids are valid, since there
+ * is no need for that. But this should only be done for ids that are
+ * valid.
  */
 zoneid_t
 netstackid_to_zoneid(netstackid_t stackid)
diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c
index 033bc96ea3..75354330ef 100644
--- a/usr/src/uts/common/os/zone.c
+++ b/usr/src/uts/common/os/zone.c
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -61,6 +61,10 @@
  *   initialized zone is added to the list of active zones on the system but
  *   isn't accessible.
  *
+ *   ZONE_IS_INITIALIZED: Initialization complete except the ZSD callbacks are
+ *   not yet completed. Not possible to enter the zone, but attributes can
+ *   be retrieved.
+ *
  *   ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
  *   ready.  The zone is made visible after the ZSD constructor callbacks are
  *   executed.  A zone remains in this state until it transitions into
@@ -228,6 +232,7 @@
 
 #include <sys/door.h>
 #include <sys/cpuvar.h>
+#include <sys/sdt.h>
 
 #include <sys/uadmin.h>
 #include <sys/session.h>
@@ -313,6 +318,7 @@ evchan_t *zone_event_chan;
  */
 const char  *zone_status_table[] = {
 	ZONE_EVENT_UNINITIALIZED,	/* uninitialized */
+	ZONE_EVENT_INITIALIZED,		/* initialized */
 	ZONE_EVENT_READY,		/* ready */
 	ZONE_EVENT_READY,		/* booting */
 	ZONE_EVENT_RUNNING,		/* running */
@@ -351,6 +357,19 @@ static int zone_remove_datalink(zoneid_t, char *);
 static int zone_check_datalink(zoneid_t *, char *);
 static int zone_list_datalink(zoneid_t, int *, char *);
 
+typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);
+
+static void zsd_apply_all_zones(zsd_applyfn_t *, zone_key_t);
+static void zsd_apply_all_keys(zsd_applyfn_t *, zone_t *);
+static boolean_t zsd_apply_create(kmutex_t *, boolean_t, zone_t *, zone_key_t);
+static boolean_t zsd_apply_shutdown(kmutex_t *, boolean_t, zone_t *,
+    zone_key_t);
+static boolean_t zsd_apply_destroy(kmutex_t *, boolean_t, zone_t *, zone_key_t);
+static boolean_t zsd_wait_for_creator(zone_t *, struct zsd_entry *,
+    kmutex_t *);
+static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
+    kmutex_t *);
+
 /*
  * Bump this number when you alter the zone syscall interfaces; this is
  * because we need to have support for previous API versions in libc
@@ -485,78 +504,54 @@ mount_completed(void)
  * The locking strategy and overall picture is as follows:
  *
  * When someone calls zone_key_create(), a template ZSD entry is added to the
- * global list "zsd_registered_keys", protected by zsd_key_lock.  The
- * constructor callback is called immediately on all existing zones, and a
- * copy of the ZSD entry added to the per-zone zone_zsd list (protected by
- * zone_lock).  As this operation requires the list of zones, the list of
- * registered keys, and the per-zone list of ZSD entries to remain constant
- * throughout the entire operation, it must grab zonehash_lock, zone_lock for
- * all existing zones, and zsd_key_lock, in that order.  Similar locking is
- * needed when zone_key_delete() is called.  It is thus sufficient to hold
- * zsd_key_lock *or* zone_lock to prevent additions to or removals from the
- * per-zone zone_zsd list.
+ * global list "zsd_registered_keys", protected by zsd_key_lock.  While
+ * holding that lock all the existing zones are marked as
+ * ZSD_CREATE_NEEDED and a copy of the ZSD entry added to the per-zone
+ * zone_zsd list (protected by zone_lock). The global list is updated first
+ * (under zsd_key_lock) to make sure that newly created zones use the
+ * most recent list of keys. Then under zonehash_lock we walk the zones
+ * and mark them.  Similar locking is used in zone_key_delete().
  *
- * Note that this implementation does not make a copy of the ZSD entry if a
- * constructor callback is not provided.  A zone_getspecific() on such an
- * uninitialized ZSD entry will return NULL.
+ * The actual create, shutdown, and destroy callbacks are done without
+ * holding any lock. And zsd_flags are used to ensure that the operations
+ * completed so that when zone_key_create (and zone_create) is done, as well as
+ * zone_key_delete (and zone_destroy) is done, all the necessary callbacks
+ * are completed.
  *
  * When new zones are created constructor callbacks for all registered ZSD
- * entries will be called.
+ * entries will be called. That also uses the above two phases of marking
+ * what needs to be done, and then running the callbacks without holding
+ * any locks.
  *
  * The framework does not provide any locking around zone_getspecific() and
  * zone_setspecific() apart from that needed for internal consistency, so
  * callers interested in atomic "test-and-set" semantics will need to provide
  * their own locking.
  */
-void
-zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
-    void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
-{
-	struct zsd_entry *zsdp;
-	struct zsd_entry *t;
-	struct zone *zone;
 
-	zsdp = kmem_alloc(sizeof (*zsdp), KM_SLEEP);
-	zsdp->zsd_data = NULL;
-	zsdp->zsd_create = create;
-	zsdp->zsd_shutdown = shutdown;
-	zsdp->zsd_destroy = destroy;
-
-	mutex_enter(&zonehash_lock);	/* stop the world */
-	for (zone = list_head(&zone_active); zone != NULL;
-	    zone = list_next(&zone_active, zone))
-		mutex_enter(&zone->zone_lock);	/* lock all zones */
-
-	mutex_enter(&zsd_key_lock);
-	*keyp = zsdp->zsd_key = ++zsd_keyval;
-	ASSERT(zsd_keyval != 0);
-	list_insert_tail(&zsd_registered_keys, zsdp);
-	mutex_exit(&zsd_key_lock);
+/*
+ * Helper function to find the zsd_entry associated with the key in the
+ * given list.
+ */
+static struct zsd_entry *
+zsd_find(list_t *l, zone_key_t key)
+{
+	struct zsd_entry *zsd;
 
-	if (create != NULL) {
-		for (zone = list_head(&zone_active); zone != NULL;
-		    zone = list_next(&zone_active, zone)) {
-			t = kmem_alloc(sizeof (*t), KM_SLEEP);
-			t->zsd_key = *keyp;
-			t->zsd_data = (*create)(zone->zone_id);
-			t->zsd_create = create;
-			t->zsd_shutdown = shutdown;
-			t->zsd_destroy = destroy;
-			list_insert_tail(&zone->zone_zsd, t);
+	for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
+		if (zsd->zsd_key == key) {
+			return (zsd);
 		}
 	}
-	for (zone = list_head(&zone_active); zone != NULL;
-	    zone = list_next(&zone_active, zone))
-		mutex_exit(&zone->zone_lock);
-	mutex_exit(&zonehash_lock);
+	return (NULL);
 }
 
 /*
  * Helper function to find the zsd_entry associated with the key in the
- * given list.
+ * given list. Move it to the front of the list.
  */
 static struct zsd_entry *
-zsd_find(list_t *l, zone_key_t key)
+zsd_find_mru(list_t *l, zone_key_t key)
 {
 	struct zsd_entry *zsd;
 
@@ -575,9 +570,88 @@ zsd_find(list_t *l, zone_key_t key)
 	return (NULL);
 }
 
+void
+zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
+    void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
+{
+	struct zsd_entry *zsdp;
+	struct zsd_entry *t;
+	struct zone *zone;
+	zone_key_t  key;
+
+	zsdp = kmem_zalloc(sizeof (*zsdp), KM_SLEEP);
+	zsdp->zsd_data = NULL;
+	zsdp->zsd_create = create;
+	zsdp->zsd_shutdown = shutdown;
+	zsdp->zsd_destroy = destroy;
+
+	/*
+	 * Insert in global list of callbacks. Makes future zone creations
+	 * see it.
+	 */
+	mutex_enter(&zsd_key_lock);
+	*keyp = key = zsdp->zsd_key = ++zsd_keyval;
+	ASSERT(zsd_keyval != 0);
+	list_insert_tail(&zsd_registered_keys, zsdp);
+	mutex_exit(&zsd_key_lock);
+
+	/*
+	 * Insert for all existing zones and mark them as needing
+	 * a create callback.
+	 */
+	mutex_enter(&zonehash_lock);	/* stop the world */
+	for (zone = list_head(&zone_active); zone != NULL;
+	    zone = list_next(&zone_active, zone)) {
+		zone_status_t status;
+
+		mutex_enter(&zone->zone_lock);
+
+		/* Skip zones that are on the way down or not yet up */
+		status = zone_status_get(zone);
+		if (status >= ZONE_IS_DOWN ||
+		    status == ZONE_IS_UNINITIALIZED) {
+			mutex_exit(&zone->zone_lock);
+			continue;
+		}
+
+		t = zsd_find_mru(&zone->zone_zsd, key);
+		if (t != NULL) {
+			/*
+			 * A zone_zsd_configure already inserted it after
+			 * we dropped zsd_key_lock above.
+			 */
+			mutex_exit(&zone->zone_lock);
+			continue;
+		}
+		t = kmem_zalloc(sizeof (*t), KM_SLEEP);
+		t->zsd_key = key;
+		t->zsd_create = create;
+		t->zsd_shutdown = shutdown;
+		t->zsd_destroy = destroy;
+		if (create != NULL) {
+			t->zsd_flags = ZSD_CREATE_NEEDED;
+			DTRACE_PROBE2(zsd__create__needed,
+			    zone_t *, zone, zone_key_t, key);
+		}
+		list_insert_tail(&zone->zone_zsd, t);
+		mutex_exit(&zone->zone_lock);
+	}
+	mutex_exit(&zonehash_lock);
+
+	if (create != NULL) {
+		/* Now call the create callback for this key */
+		zsd_apply_all_zones(zsd_apply_create, key);
+	}
+}
+
 /*
  * Function called when a module is being unloaded, or otherwise wishes
  * to unregister its ZSD key and callbacks.
+ *
+ * Remove from the global list and determine the functions that need to
+ * be called under a global lock. Then call the functions without
+ * holding any locks. Finally free up the zone_zsd entries. (The apply
+ * functions need to access the zone_zsd entries to find zsd_data etc.)
  */
 int
 zone_key_delete(zone_key_t key)
@@ -585,65 +659,88 @@ zone_key_delete(zone_key_t key)
 	struct zsd_entry *zsdp = NULL;
 	zone_t *zone;
 
-	mutex_enter(&zonehash_lock);	/* Zone create/delete waits for us */
-	for (zone = list_head(&zone_active); zone != NULL;
-	    zone = list_next(&zone_active, zone))
-		mutex_enter(&zone->zone_lock);	/* lock all zones */
-
 	mutex_enter(&zsd_key_lock);
-	zsdp = zsd_find(&zsd_registered_keys, key);
-	if (zsdp == NULL)
-		goto notfound;
+	zsdp = zsd_find_mru(&zsd_registered_keys, key);
+	if (zsdp == NULL) {
+		mutex_exit(&zsd_key_lock);
+		return (-1);
+	}
 	list_remove(&zsd_registered_keys, zsdp);
 	mutex_exit(&zsd_key_lock);
 
+	mutex_enter(&zonehash_lock);
 	for (zone = list_head(&zone_active); zone != NULL;
 	    zone = list_next(&zone_active, zone)) {
 		struct zsd_entry *del;
-		void *data;
-
-		if (!(zone->zone_flags & ZF_DESTROYED)) {
-			del = zsd_find(&zone->zone_zsd, key);
-			if (del != NULL) {
-				data = del->zsd_data;
-				ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
-				ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
-				list_remove(&zone->zone_zsd, del);
-				kmem_free(del, sizeof (*del));
-			} else {
-				data = NULL;
-			}
-			if (zsdp->zsd_shutdown)
-				zsdp->zsd_shutdown(zone->zone_id, data);
-			if (zsdp->zsd_destroy)
-				zsdp->zsd_destroy(zone->zone_id, data);
+
+		mutex_enter(&zone->zone_lock);
+		del = zsd_find_mru(&zone->zone_zsd, key);
+		if (del == NULL) {
+			/*
+			 * Somebody else got here first e.g the zone going
+			 * away.
+			 */
+			mutex_exit(&zone->zone_lock);
+			continue;
+		}
+		ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
+		ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
+		if (del->zsd_shutdown != NULL &&
+		    (del->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
+			del->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
+			DTRACE_PROBE2(zsd__shutdown__needed,
+			    zone_t *, zone, zone_key_t, key);
+		}
+		if (del->zsd_destroy != NULL &&
+		    (del->zsd_flags & ZSD_DESTROY_ALL) == 0) {
+			del->zsd_flags |= ZSD_DESTROY_NEEDED;
+			DTRACE_PROBE2(zsd__destroy__needed,
+			    zone_t *, zone, zone_key_t, key);
 		}
 		mutex_exit(&zone->zone_lock);
 	}
 	mutex_exit(&zonehash_lock);
 	kmem_free(zsdp, sizeof (*zsdp));
-	return (0);
 
-notfound:
-	mutex_exit(&zsd_key_lock);
+	/* Now call the shutdown and destroy callback for this key */
+	zsd_apply_all_zones(zsd_apply_shutdown, key);
+	zsd_apply_all_zones(zsd_apply_destroy, key);
+
+	/* Now we can free up the zsdp structures in each zone */
+	mutex_enter(&zonehash_lock);
 	for (zone = list_head(&zone_active); zone != NULL;
-	    zone = list_next(&zone_active, zone))
+	    zone = list_next(&zone_active, zone)) {
+		struct zsd_entry *del;
+
+		mutex_enter(&zone->zone_lock);
+		del = zsd_find(&zone->zone_zsd, key);
+		if (del != NULL) {
+			list_remove(&zone->zone_zsd, del);
+			ASSERT(!(del->zsd_flags & ZSD_ALL_INPROGRESS));
+			kmem_free(del, sizeof (*del));
+		}
 		mutex_exit(&zone->zone_lock);
+	}
 	mutex_exit(&zonehash_lock);
-	return (-1);
+
+	return (0);
 }
 
 /*
  * ZSD counterpart of pthread_setspecific().
+ *
+ * Since all zsd callbacks, including those with no create function,
+ * have an entry in zone_zsd, if the key is registered it is part of
+ * the zone_zsd list.
+ * Return an error if the key wasn't registered.
  */
 int
 zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
 {
 	struct zsd_entry *t;
-	struct zsd_entry *zsdp = NULL;
 
 	mutex_enter(&zone->zone_lock);
-	t = zsd_find(&zone->zone_zsd, key);
+	t = zsd_find_mru(&zone->zone_zsd, key);
 	if (t != NULL) {
 		/*
 		 * Replace old value with new
@@ -652,36 +749,8 @@ zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
 		mutex_exit(&zone->zone_lock);
 		return (0);
 	}
-	/*
-	 * If there was no previous value, go through the list of registered
-	 * keys.
-	 *
-	 * We avoid grabbing zsd_key_lock until we are sure we need it; this is
-	 * necessary for shutdown callbacks to be able to execute without fear
-	 * of deadlock.
-	 */
-	mutex_enter(&zsd_key_lock);
-	zsdp = zsd_find(&zsd_registered_keys, key);
-	if (zsdp == NULL) { 	/* Key was not registered */
-		mutex_exit(&zsd_key_lock);
-		mutex_exit(&zone->zone_lock);
-		return (-1);
-	}
-
-	/*
-	 * Add a zsd_entry to this zone, using the template we just retrieved
-	 * to initialize the constructor and destructor(s).
-	 */
-	t = kmem_alloc(sizeof (*t), KM_SLEEP);
-	t->zsd_key = key;
-	t->zsd_data = (void *)data;
-	t->zsd_create = zsdp->zsd_create;
-	t->zsd_shutdown = zsdp->zsd_shutdown;
-	t->zsd_destroy = zsdp->zsd_destroy;
-	list_insert_tail(&zone->zone_zsd, t);
-	mutex_exit(&zsd_key_lock);
 	mutex_exit(&zone->zone_lock);
-	return (0);
+	return (-1);
 }
 
 /*
@@ -694,7 +763,7 @@ zone_getspecific(zone_key_t key, zone_t *zone)
 	void *data;
 
 	mutex_enter(&zone->zone_lock);
-	t = zsd_find(&zone->zone_zsd, key);
+	t = zsd_find_mru(&zone->zone_zsd, key);
 	data = (t == NULL ? NULL : t->zsd_data);
 	mutex_exit(&zone->zone_lock);
 	return (data);
@@ -703,42 +772,41 @@ zone_getspecific(zone_key_t key, zone_t *zone)
 /*
  * Function used to initialize a zone's list of ZSD callbacks and data
  * when the zone is being created.  The callbacks are initialized from
- * the template list (zsd_registered_keys), and the constructor
- * callback executed (if one exists).
- *
- * This is called before the zone is made publicly available, hence no
- * need to grab zone_lock.
- *
- * Although we grab and release zsd_key_lock, new entries cannot be
- * added to or removed from the zsd_registered_keys list until we
- * release zonehash_lock, so there isn't a window for a
- * zone_key_create() to come in after we've dropped zsd_key_lock but
- * before the zone is added to the zone list, such that the constructor
- * callbacks aren't executed for the new zone.
+ * the template list (zsd_registered_keys). The constructor callback is
+ * executed later (once the zone exists and with locks dropped).
  */
 static void
 zone_zsd_configure(zone_t *zone)
 {
 	struct zsd_entry *zsdp;
 	struct zsd_entry *t;
-	zoneid_t zoneid = zone->zone_id;
 
 	ASSERT(MUTEX_HELD(&zonehash_lock));
 	ASSERT(list_head(&zone->zone_zsd) == NULL);
+	mutex_enter(&zone->zone_lock);
 	mutex_enter(&zsd_key_lock);
 	for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
 	    zsdp = list_next(&zsd_registered_keys, zsdp)) {
+		/*
+		 * Since this zone is ZONE_IS_UNINITIALIZED, zone_key_create
+		 * should not have added anything to it.
+		 */
+		ASSERT(zsd_find(&zone->zone_zsd, zsdp->zsd_key) == NULL);
+
+		t = kmem_zalloc(sizeof (*t), KM_SLEEP);
+		t->zsd_key = zsdp->zsd_key;
+		t->zsd_create = zsdp->zsd_create;
+		t->zsd_shutdown = zsdp->zsd_shutdown;
+		t->zsd_destroy = zsdp->zsd_destroy;
 		if (zsdp->zsd_create != NULL) {
-			t = kmem_alloc(sizeof (*t), KM_SLEEP);
-			t->zsd_key = zsdp->zsd_key;
-			t->zsd_create = zsdp->zsd_create;
-			t->zsd_data = (*t->zsd_create)(zoneid);
-			t->zsd_shutdown = zsdp->zsd_shutdown;
-			t->zsd_destroy = zsdp->zsd_destroy;
-			list_insert_tail(&zone->zone_zsd, t);
+			t->zsd_flags = ZSD_CREATE_NEEDED;
+			DTRACE_PROBE2(zsd__create__needed,
+			    zone_t *, zone, zone_key_t, zsdp->zsd_key);
 		}
+		list_insert_tail(&zone->zone_zsd, t);
 	}
 	mutex_exit(&zsd_key_lock);
+	mutex_exit(&zone->zone_lock);
 }
 
 enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };
@@ -749,70 +817,47 @@ enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };
 static void
 zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
 {
-	struct zsd_entry *zsdp;
 	struct zsd_entry *t;
-	zoneid_t zoneid = zone->zone_id;
 
 	ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
 	ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
 	ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);
 
-	mutex_enter(&zone->zone_lock);
-	if (ct == ZSD_DESTROY) {
-		if (zone->zone_flags & ZF_DESTROYED) {
-			/*
-			 * Make sure destructors are only called once.
-			 */
-			mutex_exit(&zone->zone_lock);
-			return;
-		}
-		zone->zone_flags |= ZF_DESTROYED;
-	}
-	mutex_exit(&zone->zone_lock);
-
 	/*
-	 * Both zsd_key_lock and zone_lock need to be held in order to add or
-	 * remove a ZSD key, (either globally as part of
-	 * zone_key_create()/zone_key_delete(), or on a per-zone basis, as is
-	 * possible through zone_setspecific()), so it's sufficient to hold
-	 * zsd_key_lock here.
-	 *
-	 * This is a good thing, since we don't want to recursively try to grab
-	 * zone_lock if a callback attempts to do something like a crfree() or
-	 * zone_rele().
+	 * Run the callback solely based on what is registered for the zone
+	 * in zone_zsd. The global list can change independently of this
+	 * as keys are registered and unregistered and we don't register new
+	 * callbacks for a zone that is in the process of going away.
 	 */
-	mutex_enter(&zsd_key_lock);
-	for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
-	    zsdp = list_next(&zsd_registered_keys, zsdp)) {
-		zone_key_t key = zsdp->zsd_key;
+	mutex_enter(&zone->zone_lock);
+	for (t = list_head(&zone->zone_zsd); t != NULL;
+	    t = list_next(&zone->zone_zsd, t)) {
+		zone_key_t key = t->zsd_key;
 
 		/* Skip if no callbacks registered */
-		if (ct == ZSD_SHUTDOWN && zsdp->zsd_shutdown == NULL)
-			continue;
-		if (ct == ZSD_DESTROY && zsdp->zsd_destroy == NULL)
-			continue;
-		/*
-		 * Call the callback with the zone-specific data if we can find
-		 * any, otherwise with NULL.
-		 */
-		t = zsd_find(&zone->zone_zsd, key);
-		if (t != NULL) {
-			if (ct == ZSD_SHUTDOWN) {
-				t->zsd_shutdown(zoneid, t->zsd_data);
-			} else {
-				ASSERT(ct == ZSD_DESTROY);
-				t->zsd_destroy(zoneid, t->zsd_data);
+
+		if (ct == ZSD_SHUTDOWN) {
+			if (t->zsd_shutdown != NULL &&
+			    (t->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
+				t->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
+				DTRACE_PROBE2(zsd__shutdown__needed,
+				    zone_t *, zone, zone_key_t, key);
 			}
 		} else {
-			if (ct == ZSD_SHUTDOWN) {
-				zsdp->zsd_shutdown(zoneid, NULL);
-			} else {
-				ASSERT(ct == ZSD_DESTROY);
-				zsdp->zsd_destroy(zoneid, NULL);
+			if (t->zsd_destroy != NULL &&
+			    (t->zsd_flags & ZSD_DESTROY_ALL) == 0) {
+				t->zsd_flags |= ZSD_DESTROY_NEEDED;
+				DTRACE_PROBE2(zsd__destroy__needed,
+				    zone_t *, zone, zone_key_t, key);
 			}
 		}
 	}
-	mutex_exit(&zsd_key_lock);
+	mutex_exit(&zone->zone_lock);
+
+	/* Now call the shutdown and destroy callbacks for this zone */
+	zsd_apply_all_keys(zsd_apply_shutdown, zone);
+	zsd_apply_all_keys(zsd_apply_destroy, zone);
+
 }
 
 /*
@@ -827,12 +872,379 @@ zone_free_zsd(zone_t *zone)
 	/*
 	 * Free all the zsd_entry's we had on this zone.
 	 */
+	mutex_enter(&zone->zone_lock);
 	for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
 		next = list_next(&zone->zone_zsd, t);
 		list_remove(&zone->zone_zsd, t);
+		ASSERT(!(t->zsd_flags & ZSD_ALL_INPROGRESS));
 		kmem_free(t, sizeof (*t));
 	}
 	list_destroy(&zone->zone_zsd);
+	mutex_exit(&zone->zone_lock);
+
+}
+
+/*
+ * Apply a function to all zones for particular key value.
+ *
+ * The applyfn has to drop zonehash_lock if it does some work, and
+ * then reacquire it before it returns.
+ * When the lock is dropped we don't follow list_next even
+ * if it is possible to do so without any hazards. This is
+ * because we want the design to allow for the list of zones
+ * to change in any arbitrary way during the time the
+ * lock was dropped.
+ *
+ * It is safe to restart the loop at list_head since the applyfn
+ * changes the zsd_flags as it does work, so a subsequent
+ * pass through will have no effect in applyfn, hence the loop will terminate
+ * in at worst O(N^2).
+ */
+static void
+zsd_apply_all_zones(zsd_applyfn_t *applyfn, zone_key_t key)
+{
+	zone_t *zone;
+
+	mutex_enter(&zonehash_lock);
+	zone = list_head(&zone_active);
+	while (zone != NULL) {
+		if ((applyfn)(&zonehash_lock, B_FALSE, zone, key)) {
+			/* Lock dropped - restart at head */
+			zone = list_head(&zone_active);
+		} else {
+			zone = list_next(&zone_active, zone);
+		}
+	}
+	mutex_exit(&zonehash_lock);
+}
+
+/*
+ * Apply a function to all keys for a particular zone.
+ *
+ * The applyfn has to drop zone_lock if it does some work, and
+ * then reacquire it before it returns.
+ * When the lock is dropped we don't follow list_next even
+ * if it is possible to do so without any hazards. This is
+ * because we want the design to allow for the list of zsd callbacks
+ * to change in any arbitrary way during the time the
+ * lock was dropped.
+ *
+ * It is safe to restart the loop at list_head since the applyfn
+ * changes the zsd_flags as it does work, so a subsequent
+ * pass through will have no effect in applyfn, hence the loop will terminate
+ * in at worst O(N^2).
+ */
+static void
+zsd_apply_all_keys(zsd_applyfn_t *applyfn, zone_t *zone)
+{
+	struct zsd_entry *t;
+
+	mutex_enter(&zone->zone_lock);
+	t = list_head(&zone->zone_zsd);
+	while (t != NULL) {
+		if ((applyfn)(NULL, B_TRUE, zone, t->zsd_key)) {
+			/* Lock dropped - restart at head */
+			t = list_head(&zone->zone_zsd);
+		} else {
+			t = list_next(&zone->zone_zsd, t);
+		}
+	}
+	mutex_exit(&zone->zone_lock);
+}
+
+/*
+ * Call the create function for the zone and key if CREATE_NEEDED
+ * is set.
+ * If some other thread gets here first and sets CREATE_INPROGRESS, then
+ * we wait for that thread to complete so that we can ensure that
+ * all the callbacks are done when we've looped over all zones/keys.
+ *
+ * When we call the create function, we drop the global lock held by the
+ * caller, and return true to tell the caller it needs to re-evaluate the
+ * state.
+ * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
+ * remains held on exit.
+ */
+static boolean_t
+zsd_apply_create(kmutex_t *lockp, boolean_t zone_lock_held,
+    zone_t *zone, zone_key_t key)
+{
+	void *result;
+	struct zsd_entry *t;
+	boolean_t dropped;
+
+	if (lockp != NULL) {
+		ASSERT(MUTEX_HELD(lockp));
+	}
+	if (zone_lock_held) {
+		ASSERT(MUTEX_HELD(&zone->zone_lock));
+	} else {
+		mutex_enter(&zone->zone_lock);
+	}
+
+	t = zsd_find(&zone->zone_zsd, key);
+	if (t == NULL) {
+		/*
+		 * Somebody else got here first e.g the zone going
+		 * away.
+		 */
+		if (!zone_lock_held)
+			mutex_exit(&zone->zone_lock);
+		return (B_FALSE);
+	}
+	dropped = B_FALSE;
+	if (zsd_wait_for_inprogress(zone, t, lockp))
+		dropped = B_TRUE;
+
+	if (t->zsd_flags & ZSD_CREATE_NEEDED) {
+		t->zsd_flags &= ~ZSD_CREATE_NEEDED;
+		t->zsd_flags |= ZSD_CREATE_INPROGRESS;
+		DTRACE_PROBE2(zsd__create__inprogress,
+		    zone_t *, zone, zone_key_t, key);
+		mutex_exit(&zone->zone_lock);
+		if (lockp != NULL)
+			mutex_exit(lockp);
+
+		dropped = B_TRUE;
+		ASSERT(t->zsd_create != NULL);
+		DTRACE_PROBE2(zsd__create__start,
+		    zone_t *, zone, zone_key_t, key);
+
+		result = (*t->zsd_create)(zone->zone_id);
+
+		DTRACE_PROBE2(zsd__create__end,
+		    zone_t *, zone, voidn *, result);
+
+		ASSERT(result != NULL);
+		if (lockp != NULL)
+			mutex_enter(lockp);
+		mutex_enter(&zone->zone_lock);
+		t->zsd_data = result;
+		t->zsd_flags &= ~ZSD_CREATE_INPROGRESS;
+		t->zsd_flags |= ZSD_CREATE_COMPLETED;
+		cv_broadcast(&t->zsd_cv);
+		DTRACE_PROBE2(zsd__create__completed,
+		    zone_t *, zone, zone_key_t, key);
+	}
+	if (!zone_lock_held)
+		mutex_exit(&zone->zone_lock);
+	return (dropped);
+}
+
+/*
+ * Call the shutdown function for the zone and key if SHUTDOWN_NEEDED
+ * is set.
+ * If some other thread gets here first and sets *_INPROGRESS, then
+ * we wait for that thread to complete so that we can ensure that
+ * all the callbacks are done when we've looped over all zones/keys.
+ *
+ * When we call the shutdown function, we drop the global lock held by the
+ * caller, and return true to tell the caller it needs to re-evaluate the
+ * state.
+ * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
+ * remains held on exit.
+ */
+static boolean_t
+zsd_apply_shutdown(kmutex_t *lockp, boolean_t zone_lock_held,
+    zone_t *zone, zone_key_t key)
+{
+	struct zsd_entry *t;
+	void *data;
+	boolean_t dropped;
+
+	if (lockp != NULL) {
+		ASSERT(MUTEX_HELD(lockp));
+	}
+	if (zone_lock_held) {
+		ASSERT(MUTEX_HELD(&zone->zone_lock));
+	} else {
+		mutex_enter(&zone->zone_lock);
+	}
+
+	t = zsd_find(&zone->zone_zsd, key);
+	if (t == NULL) {
+		/*
+		 * Somebody else got here first e.g the zone going
+		 * away.
+		 */
+		if (!zone_lock_held)
+			mutex_exit(&zone->zone_lock);
+		return (B_FALSE);
+	}
+	dropped = B_FALSE;
+	if (zsd_wait_for_creator(zone, t, lockp))
+		dropped = B_TRUE;
+
+	if (zsd_wait_for_inprogress(zone, t, lockp))
+		dropped = B_TRUE;
+
+	if (t->zsd_flags & ZSD_SHUTDOWN_NEEDED) {
+		t->zsd_flags &= ~ZSD_SHUTDOWN_NEEDED;
+		t->zsd_flags |= ZSD_SHUTDOWN_INPROGRESS;
+		DTRACE_PROBE2(zsd__shutdown__inprogress,
+		    zone_t *, zone, zone_key_t, key);
+		mutex_exit(&zone->zone_lock);
+		if (lockp != NULL)
+			mutex_exit(lockp);
+		dropped = B_TRUE;
+
+		ASSERT(t->zsd_shutdown != NULL);
+		data = t->zsd_data;
+
+		DTRACE_PROBE2(zsd__shutdown__start,
+		    zone_t *, zone, zone_key_t, key);
+
+		(t->zsd_shutdown)(zone->zone_id, data);
+		DTRACE_PROBE2(zsd__shutdown__end,
+		    zone_t *, zone, zone_key_t, key);
+
+		if (lockp != NULL)
+			mutex_enter(lockp);
+		mutex_enter(&zone->zone_lock);
+		t->zsd_flags &= ~ZSD_SHUTDOWN_INPROGRESS;
+		t->zsd_flags |= ZSD_SHUTDOWN_COMPLETED;
+		cv_broadcast(&t->zsd_cv);
+		DTRACE_PROBE2(zsd__shutdown__completed,
+		    zone_t *, zone, zone_key_t, key);
+	}
+	if (!zone_lock_held)
+		mutex_exit(&zone->zone_lock);
+	return (dropped);
+}
+
+/*
+ * Call the destroy function for the zone and key if DESTROY_NEEDED
+ * is set.
+ * If some other thread gets here first and sets *_INPROGRESS, then
+ * we wait for that thread to complete so that we can ensure that
+ * all the callbacks are done when we've looped over all zones/keys.
+ *
+ * When we call the destroy function, we drop the global lock held by the
+ * caller, and return true to tell the caller it needs to re-evaluate the
+ * state.
+ * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
+ * remains held on exit.
+ */
+static boolean_t
+zsd_apply_destroy(kmutex_t *lockp, boolean_t zone_lock_held,
+    zone_t *zone, zone_key_t key)
+{
+	struct zsd_entry *t;
+	void *data;
+	boolean_t dropped;
+
+	if (lockp != NULL) {
+		ASSERT(MUTEX_HELD(lockp));
+	}
+	if (zone_lock_held) {
+		ASSERT(MUTEX_HELD(&zone->zone_lock));
+	} else {
+		mutex_enter(&zone->zone_lock);
+	}
+
+	t = zsd_find(&zone->zone_zsd, key);
+	if (t == NULL) {
+		/*
+		 * Somebody else got here first e.g the zone going
+		 * away.
+		 */
+		if (!zone_lock_held)
+			mutex_exit(&zone->zone_lock);
+		return (B_FALSE);
+	}
+	dropped = B_FALSE;
+	if (zsd_wait_for_creator(zone, t, lockp))
+		dropped = B_TRUE;
+
+	if (zsd_wait_for_inprogress(zone, t, lockp))
+		dropped = B_TRUE;
+
+	if (t->zsd_flags & ZSD_DESTROY_NEEDED) {
+		t->zsd_flags &= ~ZSD_DESTROY_NEEDED;
+		t->zsd_flags |= ZSD_DESTROY_INPROGRESS;
+		DTRACE_PROBE2(zsd__destroy__inprogress,
+		    zone_t *, zone, zone_key_t, key);
+		mutex_exit(&zone->zone_lock);
+		if (lockp != NULL)
+			mutex_exit(lockp);
+		dropped = B_TRUE;
+
+		ASSERT(t->zsd_destroy != NULL);
+		data = t->zsd_data;
+		DTRACE_PROBE2(zsd__destroy__start,
+		    zone_t *, zone, zone_key_t, key);
+
+		(t->zsd_destroy)(zone->zone_id, data);
+		DTRACE_PROBE2(zsd__destroy__end,
+		    zone_t *, zone, zone_key_t, key);
+
+		if (lockp != NULL)
+			mutex_enter(lockp);
+		mutex_enter(&zone->zone_lock);
+		t->zsd_data = NULL;
+		t->zsd_flags &= ~ZSD_DESTROY_INPROGRESS;
+		t->zsd_flags |= ZSD_DESTROY_COMPLETED;
+		cv_broadcast(&t->zsd_cv);
+		DTRACE_PROBE2(zsd__destroy__completed,
+		    zone_t *, zone, zone_key_t, key);
+	}
+	if (!zone_lock_held)
+		mutex_exit(&zone->zone_lock);
+	return (dropped);
+}
+
+/*
+ * Wait for any CREATE_NEEDED flag to be cleared.
+ * Returns true if lockp was temporarily dropped while waiting.
+ */
+static boolean_t
+zsd_wait_for_creator(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
+{
+	boolean_t dropped = B_FALSE;
+
+	while (t->zsd_flags & ZSD_CREATE_NEEDED) {
+		DTRACE_PROBE2(zsd__wait__for__creator,
+		    zone_t *, zone, struct zsd_entry *, t);
+		if (lockp != NULL) {
+			dropped = B_TRUE;
+			mutex_exit(lockp);
+		}
+		cv_wait(&t->zsd_cv, &zone->zone_lock);
+		if (lockp != NULL) {
+			/* First drop zone_lock to preserve order */
+			mutex_exit(&zone->zone_lock);
+			mutex_enter(lockp);
+			mutex_enter(&zone->zone_lock);
+		}
+	}
+	return (dropped);
+}
+
+/*
+ * Wait for any INPROGRESS flag to be cleared.
+ * Returns true if lockp was temporarily dropped while waiting.
+ */
+static boolean_t
+zsd_wait_for_inprogress(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
+{
+	boolean_t dropped = B_FALSE;
+
+	while (t->zsd_flags & ZSD_ALL_INPROGRESS) {
+		DTRACE_PROBE2(zsd__wait__for__inprogress,
+		    zone_t *, zone, struct zsd_entry *, t);
+		if (lockp != NULL) {
+			dropped = B_TRUE;
+			mutex_exit(lockp);
+		}
+		cv_wait(&t->zsd_cv, &zone->zone_lock);
+		if (lockp != NULL) {
+			/* First drop zone_lock to preserve order */
+			mutex_exit(&zone->zone_lock);
+			mutex_enter(lockp);
+			mutex_enter(&zone->zone_lock);
+		}
+	}
+	return (dropped);
 }
 
 /*
@@ -2960,10 +3372,15 @@ zsched(void *arg)
 	/*
 	 * Tell the world that we're done setting up.
 	 *
-	 * At this point we want to set the zone status to ZONE_IS_READY
+	 * At this point we want to set the zone status to ZONE_IS_INITIALIZED
 	 * and atomically set the zone's processor set visibility.  Once
 	 * we drop pool_lock() this zone will automatically get updated
 	 * to reflect any future changes to the pools configuration.
+	 *
+	 * Note that after we drop the locks below (zonehash_lock in
+	 * particular) other operations such as a zone_getattr call can
+	 * now proceed and observe the zone. That is the reason for doing a
+	 * state transition to the INITIALIZED state.
 	 */
 	pool_lock();
 	mutex_enter(&cpu_lock);
@@ -2974,12 +3391,21 @@ zsched(void *arg)
 		zone_pset_set(zone, pool_default->pool_pset->pset_id);
 	mutex_enter(&zone_status_lock);
 	ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
-	zone_status_set(zone, ZONE_IS_READY);
+	zone_status_set(zone, ZONE_IS_INITIALIZED);
 	mutex_exit(&zone_status_lock);
 	mutex_exit(&zonehash_lock);
 	mutex_exit(&cpu_lock);
 	pool_unlock();
 
+	/* Now call the create callbacks for all registered keys */
+	zsd_apply_all_keys(zsd_apply_create, zone);
+
+	/* The callbacks are complete. Mark ZONE_IS_READY */
+	mutex_enter(&zone_status_lock);
+	ASSERT(zone_status_get(zone) == ZONE_IS_INITIALIZED);
+	zone_status_set(zone, ZONE_IS_READY);
+	mutex_exit(&zone_status_lock);
+
 	/*
 	 * Once we see the zone transition to the ZONE_IS_BOOTING state,
 	 * we launch init, and set the state to running.
@@ -4071,7 +4497,7 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
 		return (set_errno(EINVAL));
 	}
 	zone_status = zone_status_get(zone);
-	if (zone_status < ZONE_IS_READY) {
+	if (zone_status < ZONE_IS_INITIALIZED) {
 		mutex_exit(&zonehash_lock);
 		return (set_errno(EINVAL));
 	}
@@ -5698,21 +6124,28 @@ zone_list_datalink(zoneid_t zoneid, int *nump, char *buf)
 
 /*
  * Public interface for looking up a zone by zoneid. It's a customized version
- * for netstack_zone_create(), it:
- * 1. Doesn't acquire the zonehash_lock, since it is called from
- *    zone_key_create() or zone_zsd_configure(), lock already held.
- * 2. Doesn't check the status of the zone.
- * 3. It will be called even before zone_init is called, in that case the
+ * for netstack_zone_create(). It can only be called from the zsd create
+ * callbacks, since it does not take a reference on the zone structure; if
+ * it is called elsewhere the zone could disappear after the zonehash_lock
+ * is dropped.
+ *
+ * Furthermore it
+ * 1. Doesn't check the status of the zone.
+ * 2. It will be called even before zone_init is called, in that case the
  *    address of zone0 is returned directly, and netstack_zone_create()
  *    will only assign a value to zone0.zone_netstack, won't break anything.
+ * 3. Returns without the zone being held.
  */
 zone_t *
 zone_find_by_id_nolock(zoneid_t zoneid)
 {
-	ASSERT(MUTEX_HELD(&zonehash_lock));
+	zone_t *zone;
 
+	mutex_enter(&zonehash_lock);
 	if (zonehashbyid == NULL)
-		return (&zone0);
+		zone = &zone0;
 	else
-		return (zone_find_all_by_id(zoneid));
+		zone = zone_find_all_by_id(zoneid);
+	mutex_exit(&zonehash_lock);
+	return (zone);
 }
diff --git a/usr/src/uts/common/sys/netstack.h b/usr/src/uts/common/sys/netstack.h
index 795cf37eb5..9bd7701693 100644
--- a/usr/src/uts/common/sys/netstack.h
+++ b/usr/src/uts/common/sys/netstack.h
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 #ifndef _SYS_NETSTACK_H
@@ -84,6 +84,46 @@ typedef id_t	netstackid_t;
 #define	NS_MAX		(NS_STR+1)
 
 /*
+ * State maintained for each module which tracks the state of
+ * the create, shutdown and destroy callbacks.
+ *
+ * Keeps track of pending actions to avoid holding locks when
+ * calling into the create/shutdown/destroy functions in the module.
+ */
+#ifdef _KERNEL
+typedef struct {
+	uint16_t 	nms_flags;
+	kcondvar_t	nms_cv;
+} nm_state_t;
+
+/*
+ * nms_flags
+ */
+#define	NSS_CREATE_NEEDED	0x0001
+#define	NSS_CREATE_INPROGRESS	0x0002
+#define	NSS_CREATE_COMPLETED	0x0004
+#define	NSS_SHUTDOWN_NEEDED	0x0010
+#define	NSS_SHUTDOWN_INPROGRESS	0x0020
+#define	NSS_SHUTDOWN_COMPLETED	0x0040
+#define	NSS_DESTROY_NEEDED	0x0100
+#define	NSS_DESTROY_INPROGRESS	0x0200
+#define	NSS_DESTROY_COMPLETED	0x0400
+
+#define	NSS_CREATE_ALL	\
+	(NSS_CREATE_NEEDED|NSS_CREATE_INPROGRESS|NSS_CREATE_COMPLETED)
+#define	NSS_SHUTDOWN_ALL	\
+	(NSS_SHUTDOWN_NEEDED|NSS_SHUTDOWN_INPROGRESS|NSS_SHUTDOWN_COMPLETED)
+#define	NSS_DESTROY_ALL	\
+	(NSS_DESTROY_NEEDED|NSS_DESTROY_INPROGRESS|NSS_DESTROY_COMPLETED)
+
+#define	NSS_ALL_INPROGRESS	\
+	(NSS_CREATE_INPROGRESS|NSS_SHUTDOWN_INPROGRESS|NSS_DESTROY_INPROGRESS)
+#else
+/* User-level compile like IP Filter needs a netstack_t. Dummy */
+typedef uint_t nm_state_t;
+#endif /* _KERNEL */
+
+/*
  * One for every netstack in the system.
  * We use a union so that the compilar and lint can provide type checking -
  * in principle we could have
@@ -136,7 +176,7 @@ struct netstack {
 #define	netstack_ipf		netstack_u.nu_s.nu_ipf
 #define	netstack_str		netstack_u.nu_s.nu_str
 
-	uint16_t	netstack_m_state[NS_MAX]; /* module state */
+	nm_state_t	netstack_m_state[NS_MAX]; /* module state */
 
 	kmutex_t	netstack_lock;
 	struct netstack *netstack_next;
@@ -144,34 +184,23 @@ struct netstack {
 	int		netstack_numzones;	/* Number of zones using this */
 	int		netstack_refcnt;	/* Number of hold-rele */
 	int		netstack_flags;	/* See below */
+
+#ifdef _KERNEL
+	/* Needed to ensure that we run the callback functions in order */
+	kcondvar_t	netstack_cv;
+#endif
 };
 typedef struct netstack netstack_t;
 
 /* netstack_flags values */
-#define	NSF_UNINIT	0x01		/* Not initialized */
-#define	NSF_CLOSING	0x02		/* Going away */
+#define	NSF_UNINIT		0x01		/* Not initialized */
+#define	NSF_CLOSING		0x02		/* Going away */
+#define	NSF_ZONE_CREATE		0x04		/* create callbacks inprog */
+#define	NSF_ZONE_SHUTDOWN	0x08		/* shutdown callbacks inprog */
+#define	NSF_ZONE_DESTROY	0x10		/* destroy callbacks inprog */
 
-/*
- * State for each module for each stack - netstack_m_state[moduleid]
- * Keeps track of pending actions to avoid holding looks when
- * calling into the create/shutdown/destroy functions in the module.
- */
-#define	NSS_CREATE_NEEDED	0x0001
-#define	NSS_CREATE_INPROGRESS	0x0002
-#define	NSS_CREATE_COMPLETED	0x0004
-#define	NSS_SHUTDOWN_NEEDED	0x0010
-#define	NSS_SHUTDOWN_INPROGRESS	0x0020
-#define	NSS_SHUTDOWN_COMPLETED	0x0040
-#define	NSS_DESTROY_NEEDED	0x0100
-#define	NSS_DESTROY_INPROGRESS	0x0200
-#define	NSS_DESTROY_COMPLETED	0x0400
-
-#define	NSS_CREATE_ALL	\
-	(NSS_CREATE_NEEDED|NSS_CREATE_INPROGRESS|NSS_CREATE_COMPLETED)
-#define	NSS_SHUTDOWN_ALL	\
-	(NSS_SHUTDOWN_NEEDED|NSS_SHUTDOWN_INPROGRESS|NSS_SHUTDOWN_COMPLETED)
-#define	NSS_DESTROY_ALL	\
-	(NSS_DESTROY_NEEDED|NSS_DESTROY_INPROGRESS|NSS_DESTROY_COMPLETED)
+#define	NSF_ZONE_INPROGRESS	\
+	(NSF_ZONE_CREATE|NSF_ZONE_SHUTDOWN|NSF_ZONE_DESTROY)
 
 /*
  * One for each of the NS_* values.
@@ -185,6 +214,7 @@ struct netstack_registry {
 
 /* nr_flags values */
 #define	NRF_REGISTERED	0x01
+#define	NRF_DYING	0x02	/* No new creates */
 
 /*
  * To support kstat_create_netstack() using kstat_add_zone we need
diff --git a/usr/src/uts/common/sys/zone.h b/usr/src/uts/common/sys/zone.h
index 7e7dd9e88a..0a93e8651e 100644
--- a/usr/src/uts/common/sys/zone.h
+++ b/usr/src/uts/common/sys/zone.h
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -106,6 +106,7 @@ extern "C" {
 #define	ZONE_EVENT_STATUS_SUBCLASS	"change"
 
 #define	ZONE_EVENT_UNINITIALIZED	"uninitialized"
+#define	ZONE_EVENT_INITIALIZED		"initialized"
 #define	ZONE_EVENT_READY		"ready"
 #define	ZONE_EVENT_RUNNING		"running"
 #define	ZONE_EVENT_SHUTTING_DOWN	"shutting_down"
@@ -201,6 +202,7 @@ typedef struct {
 /* zone_status */
 typedef enum {
 	ZONE_IS_UNINITIALIZED = 0,
+	ZONE_IS_INITIALIZED,
 	ZONE_IS_READY,
 	ZONE_IS_BOOTING,
 	ZONE_IS_RUNNING,
@@ -268,7 +270,6 @@ typedef struct zone_cmd_rval {
 #define	ZONE_DOOR_PATH		ZONES_TMPDIR "/%s.zoneadmd_door"
 
 /* zone_flags */
-#define	ZF_DESTROYED		0x1	/* ZSD destructor callbacks run */
 #define	ZF_HASHED_LABEL		0x2	/* zone has a unique label */
 #define	ZF_IS_SCRATCH		0x4	/* scratch zone */
 #define	ZF_NET_EXCL		0x8	/* Zone has an exclusive IP stack */
@@ -476,6 +477,13 @@ extern int	zone_setspecific(zone_key_t, zone_t *, const void *);
 /*
  * The definition of a zsd_entry is truly private to zone.c and is only
  * placed here so it can be shared with mdb.
+ *
+ * State maintained for each zone times each registered key, which tracks
+ * the state of the create, shutdown and destroy callbacks.
+ *
+ * zsd_flags is used to keep track of pending actions to avoid holding locks
+ * when calling the create/shutdown/destroy callbacks, since doing so
+ * could lead to deadlocks.
  */
 struct zsd_entry {
 	zone_key_t		zsd_key;	/* Key used to lookup value */
@@ -488,9 +496,34 @@ struct zsd_entry {
 	void			(*zsd_shutdown)(zoneid_t, void *);
 	void			(*zsd_destroy)(zoneid_t, void *);
 	list_node_t		zsd_linkage;
+	uint16_t 		zsd_flags;	/* See below */
+	kcondvar_t		zsd_cv;
 };
 
 /*
+ * zsd_flags
+ */
+#define	ZSD_CREATE_NEEDED	0x0001
+#define	ZSD_CREATE_INPROGRESS	0x0002
+#define	ZSD_CREATE_COMPLETED	0x0004
+#define	ZSD_SHUTDOWN_NEEDED	0x0010
+#define	ZSD_SHUTDOWN_INPROGRESS	0x0020
+#define	ZSD_SHUTDOWN_COMPLETED	0x0040
+#define	ZSD_DESTROY_NEEDED	0x0100
+#define	ZSD_DESTROY_INPROGRESS	0x0200
+#define	ZSD_DESTROY_COMPLETED	0x0400
+
+#define	ZSD_CREATE_ALL	\
+	(ZSD_CREATE_NEEDED|ZSD_CREATE_INPROGRESS|ZSD_CREATE_COMPLETED)
+#define	ZSD_SHUTDOWN_ALL	\
+	(ZSD_SHUTDOWN_NEEDED|ZSD_SHUTDOWN_INPROGRESS|ZSD_SHUTDOWN_COMPLETED)
+#define	ZSD_DESTROY_ALL	\
+	(ZSD_DESTROY_NEEDED|ZSD_DESTROY_INPROGRESS|ZSD_DESTROY_COMPLETED)
+
+#define	ZSD_ALL_INPROGRESS \
+	(ZSD_CREATE_INPROGRESS|ZSD_SHUTDOWN_INPROGRESS|ZSD_DESTROY_INPROGRESS)
+
+/*
  * Macros to help with zone visibility restrictions.
  */
author	nordmark <none@none>	2008-01-22 15:57:26 -0800
committer	nordmark <none@none>	2008-01-22 15:57:26 -0800
commit	bd41d0a82bd89bc81d63ae5dfc2ba4245f74ea6c (patch)
tree	34d74b100f909c973299a5ded0d0a231ac2d069e /usr/src
parent	c63537d6ab9d03a6ce330b36e829aba258c25d87 (diff)
download	illumos-joyent-bd41d0a82bd89bc81d63ae5dfc2ba4245f74ea6c.tar.gz