summaryrefslogtreecommitdiff
path: root/usr/src
diff options
context:
space:
mode:
authornordmark <none@none>2008-01-22 15:57:26 -0800
committernordmark <none@none>2008-01-22 15:57:26 -0800
commitbd41d0a82bd89bc81d63ae5dfc2ba4245f74ea6c (patch)
tree34d74b100f909c973299a5ded0d0a231ac2d069e /usr/src
parentc63537d6ab9d03a6ce330b36e829aba258c25d87 (diff)
downloadillumos-joyent-bd41d0a82bd89bc81d63ae5dfc2ba4245f74ea6c.tar.gz
6558857 ZSD callback locking cause deadlocks
Diffstat (limited to 'usr/src')
-rw-r--r--usr/src/lib/libzonecfg/common/libzonecfg.c2
-rw-r--r--usr/src/uts/common/os/netstack.c594
-rw-r--r--usr/src/uts/common/os/zone.c825
-rw-r--r--usr/src/uts/common/sys/netstack.h80
-rw-r--r--usr/src/uts/common/sys/zone.h37
5 files changed, 1078 insertions, 460 deletions
diff --git a/usr/src/lib/libzonecfg/common/libzonecfg.c b/usr/src/lib/libzonecfg/common/libzonecfg.c
index 94644741dd..2bce66d3f5 100644
--- a/usr/src/lib/libzonecfg/common/libzonecfg.c
+++ b/usr/src/lib/libzonecfg/common/libzonecfg.c
@@ -5107,6 +5107,8 @@ kernel_state_to_user_state(zoneid_t zoneid, zone_status_t kernel_state)
assert(kernel_state <= ZONE_MAX_STATE);
switch (kernel_state) {
case ZONE_IS_UNINITIALIZED:
+ case ZONE_IS_INITIALIZED:
+ /* The kernel will not return these two states */
return (ZONE_STATE_READY);
case ZONE_IS_READY:
/*
diff --git a/usr/src/uts/common/os/netstack.c b/usr/src/uts/common/os/netstack.c
index 44b147dc48..c1e59fe6c3 100644
--- a/usr/src/uts/common/os/netstack.c
+++ b/usr/src/uts/common/os/netstack.c
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -106,10 +106,6 @@ static void *netstack_zone_create(zoneid_t zoneid);
static void netstack_zone_shutdown(zoneid_t zoneid, void *arg);
static void netstack_zone_destroy(zoneid_t zoneid, void *arg);
-static void netstack_do_create(netstack_t *ns, int moduleid);
-static void netstack_do_shutdown(netstack_t *ns, int moduleid);
-static void netstack_do_destroy(netstack_t *ns, int moduleid);
-
static void netstack_shared_zone_add(zoneid_t zoneid);
static void netstack_shared_zone_remove(zoneid_t zoneid);
static void netstack_shared_kstat_add(kstat_t *ks);
@@ -117,6 +113,16 @@ static void netstack_shared_kstat_remove(kstat_t *ks);
typedef boolean_t applyfn_t(kmutex_t *, netstack_t *, int);
+static void apply_all_netstacks(int, applyfn_t *);
+static void apply_all_modules(netstack_t *, applyfn_t *);
+static void apply_all_modules_reverse(netstack_t *, applyfn_t *);
+static boolean_t netstack_apply_create(kmutex_t *, netstack_t *, int);
+static boolean_t netstack_apply_shutdown(kmutex_t *, netstack_t *, int);
+static boolean_t netstack_apply_destroy(kmutex_t *, netstack_t *, int);
+static boolean_t wait_for_zone_creator(netstack_t *, kmutex_t *);
+static boolean_t wait_for_nms_inprogress(netstack_t *, nm_state_t *,
+ kmutex_t *);
+
void
netstack_init(void)
{
@@ -156,6 +162,10 @@ netstack_register(int moduleid,
ASSERT(moduleid >= 0 && moduleid < NS_MAX);
ASSERT(module_create != NULL);
+ /*
+ * Make instances created after this point in time run the create
+ * callback.
+ */
mutex_enter(&netstack_g_lock);
ASSERT(ns_reg[moduleid].nr_create == NULL);
ASSERT(ns_reg[moduleid].nr_flags == 0);
@@ -166,15 +176,17 @@ netstack_register(int moduleid,
/*
* Determine the set of stacks that exist before we drop the lock.
- * Set CREATE_NEEDED for each of those.
+ * Set NSS_CREATE_NEEDED for each of those.
* netstacks which have been deleted will have NSS_CREATE_COMPLETED
* set, but check NSF_CLOSING to be sure.
*/
for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
+ nm_state_t *nms = &ns->netstack_m_state[moduleid];
+
mutex_enter(&ns->netstack_lock);
if (!(ns->netstack_flags & NSF_CLOSING) &&
- (ns->netstack_m_state[moduleid] & NSS_CREATE_ALL) == 0) {
- ns->netstack_m_state[moduleid] |= NSS_CREATE_NEEDED;
+ (nms->nms_flags & NSS_CREATE_ALL) == 0) {
+ nms->nms_flags |= NSS_CREATE_NEEDED;
DTRACE_PROBE2(netstack__create__needed,
netstack_t *, ns, int, moduleid);
}
@@ -183,12 +195,12 @@ netstack_register(int moduleid,
mutex_exit(&netstack_g_lock);
/*
- * Call the create function for each stack that has CREATE_NEEDED
- * for this moduleid.
- * Set CREATE_INPROGRESS, drop lock, and after done,
- * set CREATE_COMPLETE
+ * At this point in time a new instance can be created or an instance
+ * can be destroyed, or some other module can register or unregister.
+ * Make sure we either run all the create functions for this moduleid
+ * or we wait for any other creators for this moduleid.
*/
- netstack_do_create(NULL, moduleid);
+ apply_all_netstacks(moduleid, netstack_apply_create);
}
void
@@ -204,41 +216,57 @@ netstack_unregister(int moduleid)
mutex_enter(&netstack_g_lock);
/*
* Determine the set of stacks that exist before we drop the lock.
- * Set SHUTDOWN_NEEDED and DESTROY_NEEDED for each of those.
+ * Set NSS_SHUTDOWN_NEEDED and NSS_DESTROY_NEEDED for each of those.
+ * That ensures that when we return all the callbacks for existing
+ * instances have completed. And since we set NRF_DYING no new
+ * instances can use this module.
*/
for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
+ nm_state_t *nms = &ns->netstack_m_state[moduleid];
+
mutex_enter(&ns->netstack_lock);
if (ns_reg[moduleid].nr_shutdown != NULL &&
- (ns->netstack_m_state[moduleid] & NSS_CREATE_COMPLETED) &&
- (ns->netstack_m_state[moduleid] & NSS_SHUTDOWN_ALL) == 0) {
- ns->netstack_m_state[moduleid] |= NSS_SHUTDOWN_NEEDED;
+ (nms->nms_flags & NSS_CREATE_COMPLETED) &&
+ (nms->nms_flags & NSS_SHUTDOWN_ALL) == 0) {
+ nms->nms_flags |= NSS_SHUTDOWN_NEEDED;
DTRACE_PROBE2(netstack__shutdown__needed,
netstack_t *, ns, int, moduleid);
}
if ((ns_reg[moduleid].nr_flags & NRF_REGISTERED) &&
ns_reg[moduleid].nr_destroy != NULL &&
- (ns->netstack_m_state[moduleid] & NSS_CREATE_COMPLETED) &&
- (ns->netstack_m_state[moduleid] & NSS_DESTROY_ALL) == 0) {
- ns->netstack_m_state[moduleid] |= NSS_DESTROY_NEEDED;
+ (nms->nms_flags & NSS_CREATE_COMPLETED) &&
+ (nms->nms_flags & NSS_DESTROY_ALL) == 0) {
+ nms->nms_flags |= NSS_DESTROY_NEEDED;
DTRACE_PROBE2(netstack__destroy__needed,
netstack_t *, ns, int, moduleid);
}
mutex_exit(&ns->netstack_lock);
}
+ /*
+ * Prevent any new netstack from calling the registered create
+ * function, while keeping the function pointers in place until the
+ * shutdown and destroy callbacks are complete.
+ */
+ ns_reg[moduleid].nr_flags |= NRF_DYING;
mutex_exit(&netstack_g_lock);
- netstack_do_shutdown(NULL, moduleid);
- netstack_do_destroy(NULL, moduleid);
+ apply_all_netstacks(moduleid, netstack_apply_shutdown);
+ apply_all_netstacks(moduleid, netstack_apply_destroy);
/*
- * Clear the netstack_m_state so that we can handle this module
+ * Clear the nms_flags so that we can handle this module
* being loaded again.
+ * Also remove the registered functions.
*/
mutex_enter(&netstack_g_lock);
+ ASSERT(ns_reg[moduleid].nr_flags & NRF_REGISTERED);
+ ASSERT(ns_reg[moduleid].nr_flags & NRF_DYING);
for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
+ nm_state_t *nms = &ns->netstack_m_state[moduleid];
+
mutex_enter(&ns->netstack_lock);
- if (ns->netstack_m_state[moduleid] & NSS_DESTROY_COMPLETED) {
- ns->netstack_m_state[moduleid] = 0;
+ if (nms->nms_flags & NSS_DESTROY_COMPLETED) {
+ nms->nms_flags = 0;
DTRACE_PROBE2(netstack__destroy__done,
netstack_t *, ns, int, moduleid);
}
@@ -304,6 +332,7 @@ netstack_zone_create(zoneid_t zoneid)
}
/* Not found */
mutex_init(&ns->netstack_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&ns->netstack_cv, NULL, CV_DEFAULT, NULL);
ns->netstack_stackid = zoneid;
ns->netstack_numzones = 1;
ns->netstack_refcnt = 1; /* Decremented by netstack_zone_destroy */
@@ -311,26 +340,44 @@ netstack_zone_create(zoneid_t zoneid)
*nsp = ns;
zone->zone_netstack = ns;
+ mutex_enter(&ns->netstack_lock);
+ /*
+ * Mark this netstack as having a CREATE running so
+ * any netstack_register/netstack_unregister waits for
+ * the existing create callbacks to complete in moduleid order
+ */
+ ns->netstack_flags |= NSF_ZONE_CREATE;
+
/*
* Determine the set of module create functions that need to be
* called before we drop the lock.
+ * Set NSS_CREATE_NEEDED for each of those.
+ * Skip any with NRF_DYING set, since those are in the process of
+ * going away, by checking for flags being exactly NRF_REGISTERED.
*/
for (i = 0; i < NS_MAX; i++) {
- mutex_enter(&ns->netstack_lock);
- if ((ns_reg[i].nr_flags & NRF_REGISTERED) &&
- (ns->netstack_m_state[i] & NSS_CREATE_ALL) == 0) {
- ns->netstack_m_state[i] |= NSS_CREATE_NEEDED;
+ nm_state_t *nms = &ns->netstack_m_state[i];
+
+ cv_init(&nms->nms_cv, NULL, CV_DEFAULT, NULL);
+
+ if ((ns_reg[i].nr_flags == NRF_REGISTERED) &&
+ (nms->nms_flags & NSS_CREATE_ALL) == 0) {
+ nms->nms_flags |= NSS_CREATE_NEEDED;
DTRACE_PROBE2(netstack__create__needed,
netstack_t *, ns, int, i);
}
- mutex_exit(&ns->netstack_lock);
}
+ mutex_exit(&ns->netstack_lock);
mutex_exit(&netstack_g_lock);
- netstack_do_create(ns, NS_ALL);
+ apply_all_modules(ns, netstack_apply_create);
+ /* Tell any waiting netstack_register/netstack_unregister to proceed */
mutex_enter(&ns->netstack_lock);
ns->netstack_flags &= ~NSF_UNINIT;
+ ASSERT(ns->netstack_flags & NSF_ZONE_CREATE);
+ ns->netstack_flags &= ~NSF_ZONE_CREATE;
+ cv_broadcast(&ns->netstack_cv);
mutex_exit(&ns->netstack_lock);
return (ns);
@@ -356,29 +403,46 @@ netstack_zone_shutdown(zoneid_t zoneid, void *arg)
mutex_exit(&ns->netstack_lock);
mutex_enter(&netstack_g_lock);
+ mutex_enter(&ns->netstack_lock);
+ /*
+ * Mark this netstack as having a SHUTDOWN running so
+ * any netstack_register/netstack_unregister waits for
+ * the existing create callbacks to complete in moduleid order
+ */
+ ASSERT(!(ns->netstack_flags & NSF_ZONE_INPROGRESS));
+ ns->netstack_flags |= NSF_ZONE_SHUTDOWN;
+
/*
* Determine the set of stacks that exist before we drop the lock.
- * Set SHUTDOWN_NEEDED for each of those.
+ * Set NSS_SHUTDOWN_NEEDED for each of those.
*/
for (i = 0; i < NS_MAX; i++) {
- mutex_enter(&ns->netstack_lock);
+ nm_state_t *nms = &ns->netstack_m_state[i];
+
if ((ns_reg[i].nr_flags & NRF_REGISTERED) &&
ns_reg[i].nr_shutdown != NULL &&
- (ns->netstack_m_state[i] & NSS_CREATE_COMPLETED) &&
- (ns->netstack_m_state[i] & NSS_SHUTDOWN_ALL) == 0) {
- ns->netstack_m_state[i] |= NSS_SHUTDOWN_NEEDED;
+ (nms->nms_flags & NSS_CREATE_COMPLETED) &&
+ (nms->nms_flags & NSS_SHUTDOWN_ALL) == 0) {
+ nms->nms_flags |= NSS_SHUTDOWN_NEEDED;
DTRACE_PROBE2(netstack__shutdown__needed,
netstack_t *, ns, int, i);
}
- mutex_exit(&ns->netstack_lock);
}
+ mutex_exit(&ns->netstack_lock);
mutex_exit(&netstack_g_lock);
/*
* Call the shutdown function for all registered modules for this
* netstack.
*/
- netstack_do_shutdown(ns, NS_ALL);
+ apply_all_modules(ns, netstack_apply_shutdown);
+
+ /* Tell any waiting netstack_register/netstack_unregister to proceed */
+ mutex_enter(&ns->netstack_lock);
+ ASSERT(ns->netstack_flags & NSF_ZONE_SHUTDOWN);
+ ns->netstack_flags &= ~NSF_ZONE_SHUTDOWN;
+ cv_broadcast(&ns->netstack_cv);
+ mutex_exit(&ns->netstack_lock);
}
/*
@@ -429,70 +493,183 @@ netstack_stack_inactive(netstack_t *ns)
int i;
mutex_enter(&netstack_g_lock);
+ mutex_enter(&ns->netstack_lock);
+ /*
+ * Mark this netstack as having a DESTROY running so
+ * any netstack_register/netstack_unregister waits for
+ * the existing destroy callbacks to complete in reverse moduleid order
+ */
+ ASSERT(!(ns->netstack_flags & NSF_ZONE_INPROGRESS));
+ ns->netstack_flags |= NSF_ZONE_DESTROY;
/*
* If the shutdown callback wasn't called earlier (e.g., if this is
- * a netstack shared between multiple zones), then we call it now.
+ * a netstack shared between multiple zones), then we schedule it now.
+ *
+ * Determine the set of stacks that exist before we drop the lock.
+ * Set NSS_DESTROY_NEEDED for each of those. That
+ * ensures that when we return all the callbacks for existing
+ * instances have completed.
*/
for (i = 0; i < NS_MAX; i++) {
- mutex_enter(&ns->netstack_lock);
+ nm_state_t *nms = &ns->netstack_m_state[i];
+
if ((ns_reg[i].nr_flags & NRF_REGISTERED) &&
ns_reg[i].nr_shutdown != NULL &&
- (ns->netstack_m_state[i] & NSS_CREATE_COMPLETED) &&
- (ns->netstack_m_state[i] & NSS_SHUTDOWN_ALL) == 0) {
- ns->netstack_m_state[i] |= NSS_SHUTDOWN_NEEDED;
+ (nms->nms_flags & NSS_CREATE_COMPLETED) &&
+ (nms->nms_flags & NSS_SHUTDOWN_ALL) == 0) {
+ nms->nms_flags |= NSS_SHUTDOWN_NEEDED;
DTRACE_PROBE2(netstack__shutdown__needed,
netstack_t *, ns, int, i);
}
- mutex_exit(&ns->netstack_lock);
- }
- /*
- * Determine the set of stacks that exist before we drop the lock.
- * Set DESTROY_NEEDED for each of those.
- */
- for (i = 0; i < NS_MAX; i++) {
- mutex_enter(&ns->netstack_lock);
+
if ((ns_reg[i].nr_flags & NRF_REGISTERED) &&
ns_reg[i].nr_destroy != NULL &&
- (ns->netstack_m_state[i] & NSS_CREATE_COMPLETED) &&
- (ns->netstack_m_state[i] & NSS_DESTROY_ALL) == 0) {
- ns->netstack_m_state[i] |= NSS_DESTROY_NEEDED;
+ (nms->nms_flags & NSS_CREATE_COMPLETED) &&
+ (nms->nms_flags & NSS_DESTROY_ALL) == 0) {
+ nms->nms_flags |= NSS_DESTROY_NEEDED;
DTRACE_PROBE2(netstack__destroy__needed,
netstack_t *, ns, int, i);
}
- mutex_exit(&ns->netstack_lock);
}
+ mutex_exit(&ns->netstack_lock);
mutex_exit(&netstack_g_lock);
/*
* Call the shutdown and destroy functions for all registered modules
* for this netstack.
+ *
+ * Since there are some ordering dependencies between the modules we
+ * tear them down in the reverse order of what was used to create them.
+ *
+ * Since a netstack_t is never reused (when a zone is rebooted it gets
+ * a new zoneid == netstackid i.e. a new netstack_t is allocated) we
+ * leave nms_flags the way it is i.e. with NSS_DESTROY_COMPLETED set.
+ * That is different than in the netstack_unregister() case.
*/
- netstack_do_shutdown(ns, NS_ALL);
- netstack_do_destroy(ns, NS_ALL);
+ apply_all_modules(ns, netstack_apply_shutdown);
+ apply_all_modules_reverse(ns, netstack_apply_destroy);
+
+ /* Tell any waiting netstack_register/netstack_unregister to proceed */
+ mutex_enter(&ns->netstack_lock);
+ ASSERT(ns->netstack_flags & NSF_ZONE_DESTROY);
+ ns->netstack_flags &= ~NSF_ZONE_DESTROY;
+ cv_broadcast(&ns->netstack_cv);
+ mutex_exit(&ns->netstack_lock);
+}
+
+/*
+ * Apply a function to all netstacks for a particular moduleid.
+ *
+ * If there is any zone activity (due to a zone being created, shutdown,
+ * or destroyed) we wait for that to complete before we proceed. This ensures
+ * that the moduleids are processed in order when a zone is created or
+ * destroyed.
+ *
+ * The applyfn has to drop netstack_g_lock if it does some work.
+ * In that case we don't follow netstack_next,
+ * even if it is possible to do so without any hazards. This is
+ * because we want the design to allow for the list of netstacks threaded
+ * by netstack_next to change in any arbitrary way during the time the
+ * lock was dropped.
+ *
+ * It is safe to restart the loop at netstack_head since the applyfn
+ * changes netstack_m_state as it processes things, so a subsequent
+ * pass through will have no effect in applyfn, hence the loop will terminate
+ * in at worst O(N^2).
+ */
+static void
+apply_all_netstacks(int moduleid, applyfn_t *applyfn)
+{
+ netstack_t *ns;
+
+ mutex_enter(&netstack_g_lock);
+ ns = netstack_head;
+ while (ns != NULL) {
+ if (wait_for_zone_creator(ns, &netstack_g_lock)) {
+ /* Lock dropped - restart at head */
+ ns = netstack_head;
+ } else if ((applyfn)(&netstack_g_lock, ns, moduleid)) {
+ /* Lock dropped - restart at head */
+ ns = netstack_head;
+ } else {
+ ns = ns->netstack_next;
+ }
+ }
+ mutex_exit(&netstack_g_lock);
+}
+
+/*
+ * Apply a function to all moduleids for a particular netstack.
+ *
+ * Since the netstack linkage doesn't matter in this case we can
+ * ignore whether the function drops the lock.
+ */
+static void
+apply_all_modules(netstack_t *ns, applyfn_t *applyfn)
+{
+ int i;
+
+ mutex_enter(&netstack_g_lock);
+ for (i = 0; i < NS_MAX; i++) {
+ /*
+ * We don't care whether the lock was dropped
+ * since we are not iterating over netstack_head.
+ */
+ (void) (applyfn)(&netstack_g_lock, ns, i);
+ }
+ mutex_exit(&netstack_g_lock);
+}
+
+/* Like the above but in reverse moduleid order */
+static void
+apply_all_modules_reverse(netstack_t *ns, applyfn_t *applyfn)
+{
+ int i;
+
+ mutex_enter(&netstack_g_lock);
+ for (i = NS_MAX-1; i >= 0; i--) {
+ /*
+ * We don't care whether the lock was dropped
+ * since we are not iterating over netstack_head.
+ */
+ (void) (applyfn)(&netstack_g_lock, ns, i);
+ }
+ mutex_exit(&netstack_g_lock);
}
/*
* Call the create function for the ns and moduleid if CREATE_NEEDED
* is set.
- * When it calls it, it drops the netstack_lock held by the caller,
- * and returns true to tell the caller it needs to re-evalute the
- * state..
+ * If some other thread gets here first and sets *_INPROGRESS, then
+ * we wait for that thread to complete so that we can ensure that
+ * all the callbacks are done when we've looped over all netstacks/moduleids.
+ *
+ * When we call the create function, we temporarily drop the netstack_lock
+ * held by the caller, and return true to tell the caller it needs to
+ * re-evaluate the state.
*/
static boolean_t
netstack_apply_create(kmutex_t *lockp, netstack_t *ns, int moduleid)
{
void *result;
netstackid_t stackid;
+ nm_state_t *nms = &ns->netstack_m_state[moduleid];
+ boolean_t dropped = B_FALSE;
ASSERT(MUTEX_HELD(lockp));
mutex_enter(&ns->netstack_lock);
- if (ns->netstack_m_state[moduleid] & NSS_CREATE_NEEDED) {
- ns->netstack_m_state[moduleid] &= ~NSS_CREATE_NEEDED;
- ns->netstack_m_state[moduleid] |= NSS_CREATE_INPROGRESS;
+
+ if (wait_for_nms_inprogress(ns, nms, lockp))
+ dropped = B_TRUE;
+
+ if (nms->nms_flags & NSS_CREATE_NEEDED) {
+ nms->nms_flags &= ~NSS_CREATE_NEEDED;
+ nms->nms_flags |= NSS_CREATE_INPROGRESS;
DTRACE_PROBE2(netstack__create__inprogress,
netstack_t *, ns, int, moduleid);
mutex_exit(&ns->netstack_lock);
mutex_exit(lockp);
+ dropped = B_TRUE;
ASSERT(ns_reg[moduleid].nr_create != NULL);
stackid = ns->netstack_stackid;
@@ -504,42 +681,55 @@ netstack_apply_create(kmutex_t *lockp, netstack_t *ns, int moduleid)
void *, result, netstack_t *, ns);
ASSERT(result != NULL);
+ mutex_enter(lockp);
mutex_enter(&ns->netstack_lock);
ns->netstack_modules[moduleid] = result;
- ns->netstack_m_state[moduleid] &= ~NSS_CREATE_INPROGRESS;
- ns->netstack_m_state[moduleid] |= NSS_CREATE_COMPLETED;
+ nms->nms_flags &= ~NSS_CREATE_INPROGRESS;
+ nms->nms_flags |= NSS_CREATE_COMPLETED;
+ cv_broadcast(&nms->nms_cv);
DTRACE_PROBE2(netstack__create__completed,
netstack_t *, ns, int, moduleid);
mutex_exit(&ns->netstack_lock);
- return (B_TRUE);
+ return (dropped);
} else {
mutex_exit(&ns->netstack_lock);
- return (B_FALSE);
+ return (dropped);
}
}
/*
* Call the shutdown function for the ns and moduleid if SHUTDOWN_NEEDED
* is set.
- * When it calls it, it drops the netstack_lock held by the caller,
- * and returns true to tell the caller it needs to re-evalute the
- * state..
+ * If some other thread gets here first and sets *_INPROGRESS, then
+ * we wait for that thread to complete so that we can ensure that
+ * all the callbacks are done when we've looped over all netstacks/moduleids.
+ *
+ * When we call the shutdown function, we temporarily drop the netstack_lock
+ * held by the caller, and return true to tell the caller it needs to
+ * re-evaluate the state.
*/
static boolean_t
netstack_apply_shutdown(kmutex_t *lockp, netstack_t *ns, int moduleid)
{
netstackid_t stackid;
void * netstack_module;
+ nm_state_t *nms = &ns->netstack_m_state[moduleid];
+ boolean_t dropped = B_FALSE;
ASSERT(MUTEX_HELD(lockp));
mutex_enter(&ns->netstack_lock);
- if (ns->netstack_m_state[moduleid] & NSS_SHUTDOWN_NEEDED) {
- ns->netstack_m_state[moduleid] &= ~NSS_SHUTDOWN_NEEDED;
- ns->netstack_m_state[moduleid] |= NSS_SHUTDOWN_INPROGRESS;
+
+ if (wait_for_nms_inprogress(ns, nms, lockp))
+ dropped = B_TRUE;
+
+ if (nms->nms_flags & NSS_SHUTDOWN_NEEDED) {
+ nms->nms_flags &= ~NSS_SHUTDOWN_NEEDED;
+ nms->nms_flags |= NSS_SHUTDOWN_INPROGRESS;
DTRACE_PROBE2(netstack__shutdown__inprogress,
netstack_t *, ns, int, moduleid);
mutex_exit(&ns->netstack_lock);
mutex_exit(lockp);
+ dropped = B_TRUE;
ASSERT(ns_reg[moduleid].nr_shutdown != NULL);
stackid = ns->netstack_stackid;
@@ -551,43 +741,55 @@ netstack_apply_shutdown(kmutex_t *lockp, netstack_t *ns, int moduleid)
DTRACE_PROBE1(netstack__shutdown__end,
netstack_t *, ns);
+ mutex_enter(lockp);
mutex_enter(&ns->netstack_lock);
- ns->netstack_m_state[moduleid] &= ~NSS_SHUTDOWN_INPROGRESS;
- ns->netstack_m_state[moduleid] |= NSS_SHUTDOWN_COMPLETED;
+ nms->nms_flags &= ~NSS_SHUTDOWN_INPROGRESS;
+ nms->nms_flags |= NSS_SHUTDOWN_COMPLETED;
+ cv_broadcast(&nms->nms_cv);
DTRACE_PROBE2(netstack__shutdown__completed,
netstack_t *, ns, int, moduleid);
mutex_exit(&ns->netstack_lock);
- return (B_TRUE);
+ return (dropped);
} else {
mutex_exit(&ns->netstack_lock);
- return (B_FALSE);
+ return (dropped);
}
}
/*
* Call the destroy function for the ns and moduleid if DESTROY_NEEDED
* is set.
- * When it calls it, it drops the netstack_lock held by the caller,
- * and returns true to tell the caller it needs to re-evalute the
- * state..
+ * If some other thread gets here first and sets *_INPROGRESS, then
+ * we wait for that thread to complete so that we can ensure that
+ * all the callbacks are done when we've looped over all netstacks/moduleids.
+ *
+ * When we call the destroy function, we temporarily drop the netstack_lock
+ * held by the caller, and return true to tell the caller it needs to
+ * re-evaluate the state.
*/
static boolean_t
netstack_apply_destroy(kmutex_t *lockp, netstack_t *ns, int moduleid)
{
netstackid_t stackid;
void * netstack_module;
+ nm_state_t *nms = &ns->netstack_m_state[moduleid];
+ boolean_t dropped = B_FALSE;
ASSERT(MUTEX_HELD(lockp));
mutex_enter(&ns->netstack_lock);
- if (ns->netstack_m_state[moduleid] & NSS_DESTROY_NEEDED) {
- ns->netstack_m_state[moduleid] &= ~NSS_DESTROY_NEEDED;
- ns->netstack_m_state[moduleid] |= NSS_DESTROY_INPROGRESS;
+
+ if (wait_for_nms_inprogress(ns, nms, lockp))
+ dropped = B_TRUE;
+
+ if (nms->nms_flags & NSS_DESTROY_NEEDED) {
+ nms->nms_flags &= ~NSS_DESTROY_NEEDED;
+ nms->nms_flags |= NSS_DESTROY_INPROGRESS;
DTRACE_PROBE2(netstack__destroy__inprogress,
netstack_t *, ns, int, moduleid);
mutex_exit(&ns->netstack_lock);
mutex_exit(lockp);
+ dropped = B_TRUE;
- /* XXX race against unregister? */
ASSERT(ns_reg[moduleid].nr_destroy != NULL);
stackid = ns->netstack_stackid;
netstack_module = ns->netstack_modules[moduleid];
@@ -598,177 +800,83 @@ netstack_apply_destroy(kmutex_t *lockp, netstack_t *ns, int moduleid)
DTRACE_PROBE1(netstack__destroy__end,
netstack_t *, ns);
+ mutex_enter(lockp);
mutex_enter(&ns->netstack_lock);
ns->netstack_modules[moduleid] = NULL;
- ns->netstack_m_state[moduleid] &= ~NSS_DESTROY_INPROGRESS;
- ns->netstack_m_state[moduleid] |= NSS_DESTROY_COMPLETED;
+ nms->nms_flags &= ~NSS_DESTROY_INPROGRESS;
+ nms->nms_flags |= NSS_DESTROY_COMPLETED;
+ cv_broadcast(&nms->nms_cv);
DTRACE_PROBE2(netstack__destroy__completed,
netstack_t *, ns, int, moduleid);
mutex_exit(&ns->netstack_lock);
- return (B_TRUE);
+ return (dropped);
} else {
mutex_exit(&ns->netstack_lock);
- return (B_FALSE);
+ return (dropped);
}
}
/*
- * Apply a function to all netstacks for a particular moduleid.
- *
- * The applyfn has to drop netstack_g_lock if it does some work.
- * In that case we don't follow netstack_next after reacquiring the
- * lock, even if it is possible to do so without any hazards. This is
- * because we want the design to allow for the list of netstacks threaded
- * by netstack_next to change in any arbitrary way during the time the
- * lock was dropped.
- *
- * It is safe to restart the loop at netstack_head since the applyfn
- * changes netstack_m_state as it processes things, so a subsequent
- * pass through will have no effect in applyfn, hence the loop will terminate
- * in at worst O(N^2).
+ * If somebody is creating the netstack (due to a new zone being created)
+ * then we wait for them to complete. This ensures that any additional
+ * netstack_register() doesn't cause the create functions to run out of
+ * order.
+ * Note that we do not need such a global wait in the case of the shutdown
+ * and destroy callbacks, since in that case it is sufficient for both
+ * threads to set NEEDED and wait for INPROGRESS to ensure ordering.
+ * Returns true if lockp was temporarily dropped while waiting.
*/
-static void
-apply_all_netstacks(int moduleid, applyfn_t *applyfn)
+static boolean_t
+wait_for_zone_creator(netstack_t *ns, kmutex_t *lockp)
{
- netstack_t *ns;
+ boolean_t dropped = B_FALSE;
- mutex_enter(&netstack_g_lock);
- ns = netstack_head;
- while (ns != NULL) {
- if ((applyfn)(&netstack_g_lock, ns, moduleid)) {
- /* Lock dropped - restart at head */
-#ifdef NS_DEBUG
- (void) printf("apply_all_netstacks: "
- "LD for %p/%d, %d\n",
- (void *)ns, ns->netstack_stackid, moduleid);
-#endif
- mutex_enter(&netstack_g_lock);
- ns = netstack_head;
- } else {
- ns = ns->netstack_next;
+ mutex_enter(&ns->netstack_lock);
+ while (ns->netstack_flags & NSF_ZONE_CREATE) {
+ DTRACE_PROBE1(netstack__wait__zone__inprogress,
+ netstack_t *, ns);
+ if (lockp != NULL) {
+ dropped = B_TRUE;
+ mutex_exit(lockp);
+ }
+ cv_wait(&ns->netstack_cv, &ns->netstack_lock);
+ if (lockp != NULL) {
+ /* First drop netstack_lock to preserve order */
+ mutex_exit(&ns->netstack_lock);
+ mutex_enter(lockp);
+ mutex_enter(&ns->netstack_lock);
}
}
- mutex_exit(&netstack_g_lock);
+ mutex_exit(&ns->netstack_lock);
+ return (dropped);
}
/*
- * Apply a function to all moduleids for a particular netstack.
- *
- * Since the netstack linkage doesn't matter in this case we can
- * ignore whether the function drops the lock.
+ * Wait for any INPROGRESS flag to be cleared for the netstack/moduleid
+ * combination.
+ * Returns true if lockp was temporarily dropped while waiting.
*/
-static void
-apply_all_modules(netstack_t *ns, applyfn_t *applyfn)
+static boolean_t
+wait_for_nms_inprogress(netstack_t *ns, nm_state_t *nms, kmutex_t *lockp)
{
- int i;
-
- mutex_enter(&netstack_g_lock);
- for (i = 0; i < NS_MAX; i++) {
- if ((applyfn)(&netstack_g_lock, ns, i)) {
- /*
- * Lock dropped but since we are not iterating over
- * netstack_head we can just reacquire the lock.
- */
- mutex_enter(&netstack_g_lock);
+ boolean_t dropped = B_FALSE;
+
+ while (nms->nms_flags & NSS_ALL_INPROGRESS) {
+ DTRACE_PROBE2(netstack__wait__nms__inprogress,
+ netstack_t *, ns, nm_state_t *, nms);
+ if (lockp != NULL) {
+ dropped = B_TRUE;
+ mutex_exit(lockp);
}
- }
- mutex_exit(&netstack_g_lock);
-}
-
-/* Like the above but in reverse moduleid order */
-static void
-apply_all_modules_reverse(netstack_t *ns, applyfn_t *applyfn)
-{
- int i;
-
- mutex_enter(&netstack_g_lock);
- for (i = NS_MAX-1; i >= 0; i--) {
- if ((applyfn)(&netstack_g_lock, ns, i)) {
- /*
- * Lock dropped but since we are not iterating over
- * netstack_head we can just reacquire the lock.
- */
- mutex_enter(&netstack_g_lock);
+ cv_wait(&nms->nms_cv, &ns->netstack_lock);
+ if (lockp != NULL) {
+ /* First drop netstack_lock to preserve order */
+ mutex_exit(&ns->netstack_lock);
+ mutex_enter(lockp);
+ mutex_enter(&ns->netstack_lock);
}
}
- mutex_exit(&netstack_g_lock);
-}
-
-/*
- * Apply a function to a subset of all module/netstack combinations.
- *
- * If ns is non-NULL we restrict it to that particular instance.
- * If moduleid is a particular one (not NS_ALL), then we restrict it
- * to that particular moduleid.
- * When walking the moduleid, the reverse argument specifies that they
- * should be walked in reverse order.
- * The applyfn returns true if it had dropped the locks.
- */
-static void
-netstack_do_apply(netstack_t *ns, int moduleid, boolean_t reverse,
- applyfn_t *applyfn)
-{
- if (ns != NULL) {
- ASSERT(moduleid == NS_ALL);
- if (reverse)
- apply_all_modules_reverse(ns, applyfn);
- else
- apply_all_modules(ns, applyfn);
- } else {
- ASSERT(moduleid != NS_ALL);
-
- apply_all_netstacks(moduleid, applyfn);
- }
-}
-
-/*
- * Run the create function for all modules x stack combinations
- * that have NSS_CREATE_NEEDED set.
- *
- * Call the create function for each stack that has CREATE_NEEDED.
- * Set CREATE_INPROGRESS, drop lock, and after done,
- * set CREATE_COMPLETE
- */
-static void
-netstack_do_create(netstack_t *ns, int moduleid)
-{
- netstack_do_apply(ns, moduleid, B_FALSE, netstack_apply_create);
-}
-
-/*
- * Run the shutdown function for all modules x stack combinations
- * that have NSS_SHUTDOWN_NEEDED set.
- *
- * Call the shutdown function for each stack that has SHUTDOWN_NEEDED.
- * Set SHUTDOWN_INPROGRESS, drop lock, and after done,
- * set SHUTDOWN_COMPLETE
- */
-static void
-netstack_do_shutdown(netstack_t *ns, int moduleid)
-{
- netstack_do_apply(ns, moduleid, B_FALSE, netstack_apply_shutdown);
-}
-
-/*
- * Run the destroy function for all modules x stack combinations
- * that have NSS_DESTROY_NEEDED set.
- *
- * Call the destroy function for each stack that has DESTROY_NEEDED.
- * Set DESTROY_INPROGRESS, drop lock, and after done,
- * set DESTROY_COMPLETE
- *
- * Since a netstack_t is never reused (when a zone is rebooted it gets
- * a new zoneid == netstackid i.e. a new netstack_t is allocated) we leave
- * netstack_m_state the way it is i.e. with NSS_DESTROY_COMPLETED set.
- */
-static void
-netstack_do_destroy(netstack_t *ns, int moduleid)
-{
- /*
- * Have to walk the moduleids in reverse order since some
- * modules make implicit assumptions about the order
- */
- netstack_do_apply(ns, moduleid, B_TRUE, netstack_apply_destroy);
+ return (dropped);
}
/*
@@ -845,7 +953,10 @@ netstack_find_by_zoneid(zoneid_t zoneid)
}
/*
- * Find a stack instance given the zoneid.
+ * Find a stack instance given the zoneid. Can only be called from
+ * the create callback. See the comments in zone_find_by_id_nolock why
+ * that limitation exists.
+ *
* Increases the reference count if found; caller must do a
* netstack_rele().
*
@@ -853,8 +964,6 @@ netstack_find_by_zoneid(zoneid_t zoneid)
* matches.
*
* Skip the unitialized ones.
- *
- * NOTE: The caller must hold zonehash_lock.
*/
netstack_t *
netstack_find_by_zoneid_nolock(zoneid_t zoneid)
@@ -875,7 +984,7 @@ netstack_find_by_zoneid_nolock(zoneid_t zoneid)
else
netstack_hold(ns);
- zone_rele(zone);
+ /* zone_find_by_id_nolock does not have a hold on the zone */
return (ns);
}
@@ -913,6 +1022,7 @@ netstack_rele(netstack_t *ns)
netstack_t **nsp;
boolean_t found;
int refcnt, numzones;
+ int i;
mutex_enter(&ns->netstack_lock);
ASSERT(ns->netstack_refcnt > 0);
@@ -959,6 +1069,14 @@ netstack_rele(netstack_t *ns)
ASSERT(ns->netstack_numzones == 0);
ASSERT(ns->netstack_flags & NSF_CLOSING);
+
+ for (i = 0; i < NS_MAX; i++) {
+ nm_state_t *nms = &ns->netstack_m_state[i];
+
+ cv_destroy(&nms->nms_cv);
+ }
+ mutex_destroy(&ns->netstack_lock);
+ cv_destroy(&ns->netstack_cv);
kmem_free(ns, sizeof (*ns));
}
}
@@ -996,7 +1114,7 @@ kstat_create_netstack(char *ks_module, int ks_instance, char *ks_name,
zoneid_t zoneid = ks_netstackid;
return (kstat_create_zone(ks_module, ks_instance, ks_name,
- ks_class, ks_type, ks_ndata, ks_flags, zoneid));
+ ks_class, ks_type, ks_ndata, ks_flags, zoneid));
}
}
@@ -1144,7 +1262,9 @@ netstack_find_shared_zoneid(zoneid_t zoneid)
/*
* Hide the fact that zoneids and netstackids are allocated from
* the same space in the current implementation.
- * XXX could add checks that the stackid/zoneids are valid...
+ * We currently do not check that the stackid/zoneids are valid, since there
+ * is no need for that. But this should only be done for ids that are
+ * valid.
*/
zoneid_t
netstackid_to_zoneid(netstackid_t stackid)
diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c
index 033bc96ea3..75354330ef 100644
--- a/usr/src/uts/common/os/zone.c
+++ b/usr/src/uts/common/os/zone.c
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -61,6 +61,10 @@
* initialized zone is added to the list of active zones on the system but
* isn't accessible.
*
+ * ZONE_IS_INITIALIZED: Initialization complete except the ZSD callbacks are
+ * not yet completed. Not possible to enter the zone, but attributes can
+ * be retrieved.
+ *
* ZONE_IS_READY: zsched (the kernel dummy process for a zone) is
* ready. The zone is made visible after the ZSD constructor callbacks are
* executed. A zone remains in this state until it transitions into
@@ -228,6 +232,7 @@
#include <sys/door.h>
#include <sys/cpuvar.h>
+#include <sys/sdt.h>
#include <sys/uadmin.h>
#include <sys/session.h>
@@ -313,6 +318,7 @@ evchan_t *zone_event_chan;
*/
const char *zone_status_table[] = {
ZONE_EVENT_UNINITIALIZED, /* uninitialized */
+ ZONE_EVENT_INITIALIZED, /* initialized */
ZONE_EVENT_READY, /* ready */
ZONE_EVENT_READY, /* booting */
ZONE_EVENT_RUNNING, /* running */
@@ -351,6 +357,19 @@ static int zone_remove_datalink(zoneid_t, char *);
static int zone_check_datalink(zoneid_t *, char *);
static int zone_list_datalink(zoneid_t, int *, char *);
+typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t);
+
+static void zsd_apply_all_zones(zsd_applyfn_t *, zone_key_t);
+static void zsd_apply_all_keys(zsd_applyfn_t *, zone_t *);
+static boolean_t zsd_apply_create(kmutex_t *, boolean_t, zone_t *, zone_key_t);
+static boolean_t zsd_apply_shutdown(kmutex_t *, boolean_t, zone_t *,
+ zone_key_t);
+static boolean_t zsd_apply_destroy(kmutex_t *, boolean_t, zone_t *, zone_key_t);
+static boolean_t zsd_wait_for_creator(zone_t *, struct zsd_entry *,
+ kmutex_t *);
+static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *,
+ kmutex_t *);
+
/*
* Bump this number when you alter the zone syscall interfaces; this is
* because we need to have support for previous API versions in libc
@@ -485,78 +504,54 @@ mount_completed(void)
* The locking strategy and overall picture is as follows:
*
* When someone calls zone_key_create(), a template ZSD entry is added to the
- * global list "zsd_registered_keys", protected by zsd_key_lock. The
- * constructor callback is called immediately on all existing zones, and a
- * copy of the ZSD entry added to the per-zone zone_zsd list (protected by
- * zone_lock). As this operation requires the list of zones, the list of
- * registered keys, and the per-zone list of ZSD entries to remain constant
- * throughout the entire operation, it must grab zonehash_lock, zone_lock for
- * all existing zones, and zsd_key_lock, in that order. Similar locking is
- * needed when zone_key_delete() is called. It is thus sufficient to hold
- * zsd_key_lock *or* zone_lock to prevent additions to or removals from the
- * per-zone zone_zsd list.
+ * global list "zsd_registered_keys", protected by zsd_key_lock. While
+ * holding that lock all the existing zones are marked as
+ * ZSD_CREATE_NEEDED and a copy of the ZSD entry added to the per-zone
+ * zone_zsd list (protected by zone_lock). The global list is updated first
+ * (under zone_key_lock) to make sure that newly created zones use the
+ * most recent list of keys. Then under zonehash_lock we walk the zones
+ * and mark them. Similar locking is used in zone_key_delete().
*
- * Note that this implementation does not make a copy of the ZSD entry if a
- * constructor callback is not provided. A zone_getspecific() on such an
- * uninitialized ZSD entry will return NULL.
+ * The actual create, shutdown, and destroy callbacks are done without
+ * holding any lock. And zsd_flags are used to ensure that the operations
+ * completed so that when zone_key_create (and zone_create) is done, as well as
+ * zone_key_delete (and zone_destroy) is done, all the necessary callbacks
+ * are completed.
*
* When new zones are created constructor callbacks for all registered ZSD
- * entries will be called.
+ * entries will be called. That also uses the above two phases of marking
+ * what needs to be done, and then running the callbacks without holding
+ * any locks.
*
* The framework does not provide any locking around zone_getspecific() and
* zone_setspecific() apart from that needed for internal consistency, so
* callers interested in atomic "test-and-set" semantics will need to provide
* their own locking.
*/
-void
-zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
- void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
-{
- struct zsd_entry *zsdp;
- struct zsd_entry *t;
- struct zone *zone;
- zsdp = kmem_alloc(sizeof (*zsdp), KM_SLEEP);
- zsdp->zsd_data = NULL;
- zsdp->zsd_create = create;
- zsdp->zsd_shutdown = shutdown;
- zsdp->zsd_destroy = destroy;
-
- mutex_enter(&zonehash_lock); /* stop the world */
- for (zone = list_head(&zone_active); zone != NULL;
- zone = list_next(&zone_active, zone))
- mutex_enter(&zone->zone_lock); /* lock all zones */
-
- mutex_enter(&zsd_key_lock);
- *keyp = zsdp->zsd_key = ++zsd_keyval;
- ASSERT(zsd_keyval != 0);
- list_insert_tail(&zsd_registered_keys, zsdp);
- mutex_exit(&zsd_key_lock);
+/*
+ * Helper function to find the zsd_entry associated with the key in the
+ * given list.
+ */
+static struct zsd_entry *
+zsd_find(list_t *l, zone_key_t key)
+{
+ struct zsd_entry *zsd;
- if (create != NULL) {
- for (zone = list_head(&zone_active); zone != NULL;
- zone = list_next(&zone_active, zone)) {
- t = kmem_alloc(sizeof (*t), KM_SLEEP);
- t->zsd_key = *keyp;
- t->zsd_data = (*create)(zone->zone_id);
- t->zsd_create = create;
- t->zsd_shutdown = shutdown;
- t->zsd_destroy = destroy;
- list_insert_tail(&zone->zone_zsd, t);
+ for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) {
+ if (zsd->zsd_key == key) {
+ return (zsd);
}
}
- for (zone = list_head(&zone_active); zone != NULL;
- zone = list_next(&zone_active, zone))
- mutex_exit(&zone->zone_lock);
- mutex_exit(&zonehash_lock);
+ return (NULL);
}
/*
* Helper function to find the zsd_entry associated with the key in the
- * given list.
+ * given list. Move it to the front of the list.
*/
static struct zsd_entry *
-zsd_find(list_t *l, zone_key_t key)
+zsd_find_mru(list_t *l, zone_key_t key)
{
struct zsd_entry *zsd;
@@ -575,9 +570,88 @@ zsd_find(list_t *l, zone_key_t key)
return (NULL);
}
+void
+zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t),
+ void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *))
+{
+ struct zsd_entry *zsdp;
+ struct zsd_entry *t;
+ struct zone *zone;
+ zone_key_t key;
+
+ zsdp = kmem_zalloc(sizeof (*zsdp), KM_SLEEP);
+ zsdp->zsd_data = NULL;
+ zsdp->zsd_create = create;
+ zsdp->zsd_shutdown = shutdown;
+ zsdp->zsd_destroy = destroy;
+
+ /*
+ * Insert in global list of callbacks. Makes future zone creations
+ * see it.
+ */
+ mutex_enter(&zsd_key_lock);
+ *keyp = key = zsdp->zsd_key = ++zsd_keyval;
+ ASSERT(zsd_keyval != 0);
+ list_insert_tail(&zsd_registered_keys, zsdp);
+ mutex_exit(&zsd_key_lock);
+
+ /*
+ * Insert for all existing zones and mark them as needing
+ * a create callback.
+ */
+ mutex_enter(&zonehash_lock); /* stop the world */
+ for (zone = list_head(&zone_active); zone != NULL;
+ zone = list_next(&zone_active, zone)) {
+ zone_status_t status;
+
+ mutex_enter(&zone->zone_lock);
+
+ /* Skip zones that are on the way down or not yet up */
+ status = zone_status_get(zone);
+ if (status >= ZONE_IS_DOWN ||
+ status == ZONE_IS_UNINITIALIZED) {
+ mutex_exit(&zone->zone_lock);
+ continue;
+ }
+
+ t = zsd_find_mru(&zone->zone_zsd, key);
+ if (t != NULL) {
+ /*
+ * A zsd_configure already inserted it after
+ * we dropped zsd_key_lock above.
+ */
+ mutex_exit(&zone->zone_lock);
+ continue;
+ }
+ t = kmem_zalloc(sizeof (*t), KM_SLEEP);
+ t->zsd_key = key;
+ t->zsd_create = create;
+ t->zsd_shutdown = shutdown;
+ t->zsd_destroy = destroy;
+ if (create != NULL) {
+ t->zsd_flags = ZSD_CREATE_NEEDED;
+ DTRACE_PROBE2(zsd__create__needed,
+ zone_t *, zone, zone_key_t, key);
+ }
+ list_insert_tail(&zone->zone_zsd, t);
+ mutex_exit(&zone->zone_lock);
+ }
+ mutex_exit(&zonehash_lock);
+
+ if (create != NULL) {
+ /* Now call the create callback for this key */
+ zsd_apply_all_zones(zsd_apply_create, key);
+ }
+}
+
/*
* Function called when a module is being unloaded, or otherwise wishes
* to unregister its ZSD key and callbacks.
+ *
+ * Remove from the global list and determine the functions that need to
+ * be called under a global lock. Then call the functions without
+ * holding any locks. Finally free up the zone_zsd entries. (The apply
+ * functions need to access the zone_zsd entries to find zsd_data etc.)
*/
int
zone_key_delete(zone_key_t key)
@@ -585,65 +659,88 @@ zone_key_delete(zone_key_t key)
struct zsd_entry *zsdp = NULL;
zone_t *zone;
- mutex_enter(&zonehash_lock); /* Zone create/delete waits for us */
- for (zone = list_head(&zone_active); zone != NULL;
- zone = list_next(&zone_active, zone))
- mutex_enter(&zone->zone_lock); /* lock all zones */
-
mutex_enter(&zsd_key_lock);
- zsdp = zsd_find(&zsd_registered_keys, key);
- if (zsdp == NULL)
- goto notfound;
+ zsdp = zsd_find_mru(&zsd_registered_keys, key);
+ if (zsdp == NULL) {
+ mutex_exit(&zsd_key_lock);
+ return (-1);
+ }
list_remove(&zsd_registered_keys, zsdp);
mutex_exit(&zsd_key_lock);
+ mutex_enter(&zonehash_lock);
for (zone = list_head(&zone_active); zone != NULL;
zone = list_next(&zone_active, zone)) {
struct zsd_entry *del;
- void *data;
-
- if (!(zone->zone_flags & ZF_DESTROYED)) {
- del = zsd_find(&zone->zone_zsd, key);
- if (del != NULL) {
- data = del->zsd_data;
- ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
- ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
- list_remove(&zone->zone_zsd, del);
- kmem_free(del, sizeof (*del));
- } else {
- data = NULL;
- }
- if (zsdp->zsd_shutdown)
- zsdp->zsd_shutdown(zone->zone_id, data);
- if (zsdp->zsd_destroy)
- zsdp->zsd_destroy(zone->zone_id, data);
+
+ mutex_enter(&zone->zone_lock);
+ del = zsd_find_mru(&zone->zone_zsd, key);
+ if (del == NULL) {
+ /*
+ * Somebody else got here first, e.g. the zone is
+ * going away.
+ */
+ mutex_exit(&zone->zone_lock);
+ continue;
+ }
+ ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown);
+ ASSERT(del->zsd_destroy == zsdp->zsd_destroy);
+ if (del->zsd_shutdown != NULL &&
+ (del->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
+ del->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
+ DTRACE_PROBE2(zsd__shutdown__needed,
+ zone_t *, zone, zone_key_t, key);
+ }
+ if (del->zsd_destroy != NULL &&
+ (del->zsd_flags & ZSD_DESTROY_ALL) == 0) {
+ del->zsd_flags |= ZSD_DESTROY_NEEDED;
+ DTRACE_PROBE2(zsd__destroy__needed,
+ zone_t *, zone, zone_key_t, key);
}
mutex_exit(&zone->zone_lock);
}
mutex_exit(&zonehash_lock);
kmem_free(zsdp, sizeof (*zsdp));
- return (0);
-notfound:
- mutex_exit(&zsd_key_lock);
+ /* Now call the shutdown and destroy callback for this key */
+ zsd_apply_all_zones(zsd_apply_shutdown, key);
+ zsd_apply_all_zones(zsd_apply_destroy, key);
+
+ /* Now we can free up the zsdp structures in each zone */
+ mutex_enter(&zonehash_lock);
for (zone = list_head(&zone_active); zone != NULL;
- zone = list_next(&zone_active, zone))
+ zone = list_next(&zone_active, zone)) {
+ struct zsd_entry *del;
+
+ mutex_enter(&zone->zone_lock);
+ del = zsd_find(&zone->zone_zsd, key);
+ if (del != NULL) {
+ list_remove(&zone->zone_zsd, del);
+ ASSERT(!(del->zsd_flags & ZSD_ALL_INPROGRESS));
+ kmem_free(del, sizeof (*del));
+ }
mutex_exit(&zone->zone_lock);
+ }
mutex_exit(&zonehash_lock);
- return (-1);
+
+ return (0);
}
/*
* ZSD counterpart of pthread_setspecific().
+ *
+ * Since all zsd callbacks, including those with no create function,
+ * have an entry in zone_zsd, if the key is registered it is part of
+ * the zone_zsd list.
+ * Return an error if the key wasn't registered.
*/
int
zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
{
struct zsd_entry *t;
- struct zsd_entry *zsdp = NULL;
mutex_enter(&zone->zone_lock);
- t = zsd_find(&zone->zone_zsd, key);
+ t = zsd_find_mru(&zone->zone_zsd, key);
if (t != NULL) {
/*
* Replace old value with new
@@ -652,36 +749,8 @@ zone_setspecific(zone_key_t key, zone_t *zone, const void *data)
mutex_exit(&zone->zone_lock);
return (0);
}
- /*
- * If there was no previous value, go through the list of registered
- * keys.
- *
- * We avoid grabbing zsd_key_lock until we are sure we need it; this is
- * necessary for shutdown callbacks to be able to execute without fear
- * of deadlock.
- */
- mutex_enter(&zsd_key_lock);
- zsdp = zsd_find(&zsd_registered_keys, key);
- if (zsdp == NULL) { /* Key was not registered */
- mutex_exit(&zsd_key_lock);
- mutex_exit(&zone->zone_lock);
- return (-1);
- }
-
- /*
- * Add a zsd_entry to this zone, using the template we just retrieved
- * to initialize the constructor and destructor(s).
- */
- t = kmem_alloc(sizeof (*t), KM_SLEEP);
- t->zsd_key = key;
- t->zsd_data = (void *)data;
- t->zsd_create = zsdp->zsd_create;
- t->zsd_shutdown = zsdp->zsd_shutdown;
- t->zsd_destroy = zsdp->zsd_destroy;
- list_insert_tail(&zone->zone_zsd, t);
- mutex_exit(&zsd_key_lock);
mutex_exit(&zone->zone_lock);
- return (0);
+ return (-1);
}
/*
@@ -694,7 +763,7 @@ zone_getspecific(zone_key_t key, zone_t *zone)
void *data;
mutex_enter(&zone->zone_lock);
- t = zsd_find(&zone->zone_zsd, key);
+ t = zsd_find_mru(&zone->zone_zsd, key);
data = (t == NULL ? NULL : t->zsd_data);
mutex_exit(&zone->zone_lock);
return (data);
@@ -703,42 +772,41 @@ zone_getspecific(zone_key_t key, zone_t *zone)
/*
* Function used to initialize a zone's list of ZSD callbacks and data
* when the zone is being created. The callbacks are initialized from
- * the template list (zsd_registered_keys), and the constructor
- * callback executed (if one exists).
- *
- * This is called before the zone is made publicly available, hence no
- * need to grab zone_lock.
- *
- * Although we grab and release zsd_key_lock, new entries cannot be
- * added to or removed from the zsd_registered_keys list until we
- * release zonehash_lock, so there isn't a window for a
- * zone_key_create() to come in after we've dropped zsd_key_lock but
- * before the zone is added to the zone list, such that the constructor
- * callbacks aren't executed for the new zone.
+ * the template list (zsd_registered_keys). The constructor callback is
+ * executed later (once the zone exists and with locks dropped).
*/
static void
zone_zsd_configure(zone_t *zone)
{
struct zsd_entry *zsdp;
struct zsd_entry *t;
- zoneid_t zoneid = zone->zone_id;
ASSERT(MUTEX_HELD(&zonehash_lock));
ASSERT(list_head(&zone->zone_zsd) == NULL);
+ mutex_enter(&zone->zone_lock);
mutex_enter(&zsd_key_lock);
for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
zsdp = list_next(&zsd_registered_keys, zsdp)) {
+ /*
+ * Since this zone is ZONE_IS_UNCONFIGURED, zone_key_create
+ * should not have added anything to it.
+ */
+ ASSERT(zsd_find(&zone->zone_zsd, zsdp->zsd_key) == NULL);
+
+ t = kmem_zalloc(sizeof (*t), KM_SLEEP);
+ t->zsd_key = zsdp->zsd_key;
+ t->zsd_create = zsdp->zsd_create;
+ t->zsd_shutdown = zsdp->zsd_shutdown;
+ t->zsd_destroy = zsdp->zsd_destroy;
if (zsdp->zsd_create != NULL) {
- t = kmem_alloc(sizeof (*t), KM_SLEEP);
- t->zsd_key = zsdp->zsd_key;
- t->zsd_create = zsdp->zsd_create;
- t->zsd_data = (*t->zsd_create)(zoneid);
- t->zsd_shutdown = zsdp->zsd_shutdown;
- t->zsd_destroy = zsdp->zsd_destroy;
- list_insert_tail(&zone->zone_zsd, t);
+ t->zsd_flags = ZSD_CREATE_NEEDED;
+ DTRACE_PROBE2(zsd__create__needed,
+ zone_t *, zone, zone_key_t, zsdp->zsd_key);
}
+ list_insert_tail(&zone->zone_zsd, t);
}
mutex_exit(&zsd_key_lock);
+ mutex_exit(&zone->zone_lock);
}
enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };
@@ -749,70 +817,47 @@ enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY };
static void
zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct)
{
- struct zsd_entry *zsdp;
struct zsd_entry *t;
- zoneid_t zoneid = zone->zone_id;
ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY);
ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY);
ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN);
- mutex_enter(&zone->zone_lock);
- if (ct == ZSD_DESTROY) {
- if (zone->zone_flags & ZF_DESTROYED) {
- /*
- * Make sure destructors are only called once.
- */
- mutex_exit(&zone->zone_lock);
- return;
- }
- zone->zone_flags |= ZF_DESTROYED;
- }
- mutex_exit(&zone->zone_lock);
-
/*
- * Both zsd_key_lock and zone_lock need to be held in order to add or
- * remove a ZSD key, (either globally as part of
- * zone_key_create()/zone_key_delete(), or on a per-zone basis, as is
- * possible through zone_setspecific()), so it's sufficient to hold
- * zsd_key_lock here.
- *
- * This is a good thing, since we don't want to recursively try to grab
- * zone_lock if a callback attempts to do something like a crfree() or
- * zone_rele().
+ * Run the callback solely based on what is registered for the zone
+ * in zone_zsd. The global list can change independently of this
+ * as keys are registered and unregistered and we don't register new
+ * callbacks for a zone that is in the process of going away.
*/
- mutex_enter(&zsd_key_lock);
- for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL;
- zsdp = list_next(&zsd_registered_keys, zsdp)) {
- zone_key_t key = zsdp->zsd_key;
+ mutex_enter(&zone->zone_lock);
+ for (t = list_head(&zone->zone_zsd); t != NULL;
+ t = list_next(&zone->zone_zsd, t)) {
+ zone_key_t key = t->zsd_key;
/* Skip if no callbacks registered */
- if (ct == ZSD_SHUTDOWN && zsdp->zsd_shutdown == NULL)
- continue;
- if (ct == ZSD_DESTROY && zsdp->zsd_destroy == NULL)
- continue;
- /*
- * Call the callback with the zone-specific data if we can find
- * any, otherwise with NULL.
- */
- t = zsd_find(&zone->zone_zsd, key);
- if (t != NULL) {
- if (ct == ZSD_SHUTDOWN) {
- t->zsd_shutdown(zoneid, t->zsd_data);
- } else {
- ASSERT(ct == ZSD_DESTROY);
- t->zsd_destroy(zoneid, t->zsd_data);
+
+ if (ct == ZSD_SHUTDOWN) {
+ if (t->zsd_shutdown != NULL &&
+ (t->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) {
+ t->zsd_flags |= ZSD_SHUTDOWN_NEEDED;
+ DTRACE_PROBE2(zsd__shutdown__needed,
+ zone_t *, zone, zone_key_t, key);
}
} else {
- if (ct == ZSD_SHUTDOWN) {
- zsdp->zsd_shutdown(zoneid, NULL);
- } else {
- ASSERT(ct == ZSD_DESTROY);
- zsdp->zsd_destroy(zoneid, NULL);
+ if (t->zsd_destroy != NULL &&
+ (t->zsd_flags & ZSD_DESTROY_ALL) == 0) {
+ t->zsd_flags |= ZSD_DESTROY_NEEDED;
+ DTRACE_PROBE2(zsd__destroy__needed,
+ zone_t *, zone, zone_key_t, key);
}
}
}
- mutex_exit(&zsd_key_lock);
+ mutex_exit(&zone->zone_lock);
+
+ /* Now call the shutdown and destroy callback for this key */
+ zsd_apply_all_keys(zsd_apply_shutdown, zone);
+ zsd_apply_all_keys(zsd_apply_destroy, zone);
+
}
/*
@@ -827,12 +872,379 @@ zone_free_zsd(zone_t *zone)
/*
* Free all the zsd_entry's we had on this zone.
*/
+ mutex_enter(&zone->zone_lock);
for (t = list_head(&zone->zone_zsd); t != NULL; t = next) {
next = list_next(&zone->zone_zsd, t);
list_remove(&zone->zone_zsd, t);
+ ASSERT(!(t->zsd_flags & ZSD_ALL_INPROGRESS));
kmem_free(t, sizeof (*t));
}
list_destroy(&zone->zone_zsd);
+ mutex_exit(&zone->zone_lock);
+
+}
+
+/*
+ * Apply a function to all zones for particular key value.
+ *
+ * The applyfn has to drop zonehash_lock if it does some work, and
+ * then reacquire it before it returns.
+ * When the lock is dropped we don't follow list_next even
+ * if it is possible to do so without any hazards. This is
+ * because we want the design to allow for the list of zones
+ * to change in any arbitrary way during the time the
+ * lock was dropped.
+ *
+ * It is safe to restart the loop at list_head since the applyfn
+ * changes the zsd_flags as it does work, so a subsequent
+ * pass through will have no effect in applyfn, hence the loop will terminate
+ * in at worst O(N^2).
+ */
+static void
+zsd_apply_all_zones(zsd_applyfn_t *applyfn, zone_key_t key)
+{
+ zone_t *zone;
+
+ mutex_enter(&zonehash_lock);
+ zone = list_head(&zone_active);
+ while (zone != NULL) {
+ if ((applyfn)(&zonehash_lock, B_FALSE, zone, key)) {
+ /* Lock dropped - restart at head */
+ zone = list_head(&zone_active);
+ } else {
+ zone = list_next(&zone_active, zone);
+ }
+ }
+ mutex_exit(&zonehash_lock);
+}
+
+/*
+ * Apply a function to all keys for a particular zone.
+ *
+ * The applyfn has to drop zone_lock if it does some work, and
+ * then reacquire it before it returns.
+ * When the lock is dropped we don't follow list_next even
+ * if it is possible to do so without any hazards. This is
+ * because we want the design to allow for the list of zsd callbacks
+ * to change in any arbitrary way during the time the
+ * lock was dropped.
+ *
+ * It is safe to restart the loop at list_head since the applyfn
+ * changes the zsd_flags as it does work, so a subsequent
+ * pass through will have no effect in applyfn, hence the loop will terminate
+ * in at worst O(N^2).
+ */
+static void
+zsd_apply_all_keys(zsd_applyfn_t *applyfn, zone_t *zone)
+{
+ struct zsd_entry *t;
+
+ mutex_enter(&zone->zone_lock);
+ t = list_head(&zone->zone_zsd);
+ while (t != NULL) {
+ if ((applyfn)(NULL, B_TRUE, zone, t->zsd_key)) {
+ /* Lock dropped - restart at head */
+ t = list_head(&zone->zone_zsd);
+ } else {
+ t = list_next(&zone->zone_zsd, t);
+ }
+ }
+ mutex_exit(&zone->zone_lock);
+}
+
+/*
+ * Call the create function for the zone and key if CREATE_NEEDED
+ * is set.
+ * If some other thread gets here first and sets CREATE_INPROGRESS, then
+ * we wait for that thread to complete so that we can ensure that
+ * all the callbacks are done when we've looped over all zones/keys.
+ *
+ * When we call the create function, we drop the global lock held by the
+ * caller, and return true to tell the caller it needs to re-evaluate the
+ * state.
+ * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
+ * remains held on exit.
+ */
+static boolean_t
+zsd_apply_create(kmutex_t *lockp, boolean_t zone_lock_held,
+ zone_t *zone, zone_key_t key)
+{
+ void *result;
+ struct zsd_entry *t;
+ boolean_t dropped;
+
+ if (lockp != NULL) {
+ ASSERT(MUTEX_HELD(lockp));
+ }
+ if (zone_lock_held) {
+ ASSERT(MUTEX_HELD(&zone->zone_lock));
+ } else {
+ mutex_enter(&zone->zone_lock);
+ }
+
+ t = zsd_find(&zone->zone_zsd, key);
+ if (t == NULL) {
+ /*
+ * Somebody else got here first, e.g. the zone is
+ * going away.
+ */
+ if (!zone_lock_held)
+ mutex_exit(&zone->zone_lock);
+ return (B_FALSE);
+ }
+ dropped = B_FALSE;
+ if (zsd_wait_for_inprogress(zone, t, lockp))
+ dropped = B_TRUE;
+
+ if (t->zsd_flags & ZSD_CREATE_NEEDED) {
+ t->zsd_flags &= ~ZSD_CREATE_NEEDED;
+ t->zsd_flags |= ZSD_CREATE_INPROGRESS;
+ DTRACE_PROBE2(zsd__create__inprogress,
+ zone_t *, zone, zone_key_t, key);
+ mutex_exit(&zone->zone_lock);
+ if (lockp != NULL)
+ mutex_exit(lockp);
+
+ dropped = B_TRUE;
+ ASSERT(t->zsd_create != NULL);
+ DTRACE_PROBE2(zsd__create__start,
+ zone_t *, zone, zone_key_t, key);
+
+ result = (*t->zsd_create)(zone->zone_id);
+
+ DTRACE_PROBE2(zsd__create__end,
+ zone_t *, zone, void *, result);
+
+ ASSERT(result != NULL);
+ if (lockp != NULL)
+ mutex_enter(lockp);
+ mutex_enter(&zone->zone_lock);
+ t->zsd_data = result;
+ t->zsd_flags &= ~ZSD_CREATE_INPROGRESS;
+ t->zsd_flags |= ZSD_CREATE_COMPLETED;
+ cv_broadcast(&t->zsd_cv);
+ DTRACE_PROBE2(zsd__create__completed,
+ zone_t *, zone, zone_key_t, key);
+ }
+ if (!zone_lock_held)
+ mutex_exit(&zone->zone_lock);
+ return (dropped);
+}
+
+/*
+ * Call the shutdown function for the zone and key if SHUTDOWN_NEEDED
+ * is set.
+ * If some other thread gets here first and sets *_INPROGRESS, then
+ * we wait for that thread to complete so that we can ensure that
+ * all the callbacks are done when we've looped over all zones/keys.
+ *
+ * When we call the shutdown function, we drop the global lock held by the
+ * caller, and return true to tell the caller it needs to re-evaluate the
+ * state.
+ * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
+ * remains held on exit.
+ */
+static boolean_t
+zsd_apply_shutdown(kmutex_t *lockp, boolean_t zone_lock_held,
+ zone_t *zone, zone_key_t key)
+{
+ struct zsd_entry *t;
+ void *data;
+ boolean_t dropped;
+
+ if (lockp != NULL) {
+ ASSERT(MUTEX_HELD(lockp));
+ }
+ if (zone_lock_held) {
+ ASSERT(MUTEX_HELD(&zone->zone_lock));
+ } else {
+ mutex_enter(&zone->zone_lock);
+ }
+
+ t = zsd_find(&zone->zone_zsd, key);
+ if (t == NULL) {
+ /*
+ * Somebody else got here first, e.g. the zone is
+ * going away.
+ */
+ if (!zone_lock_held)
+ mutex_exit(&zone->zone_lock);
+ return (B_FALSE);
+ }
+ dropped = B_FALSE;
+ if (zsd_wait_for_creator(zone, t, lockp))
+ dropped = B_TRUE;
+
+ if (zsd_wait_for_inprogress(zone, t, lockp))
+ dropped = B_TRUE;
+
+ if (t->zsd_flags & ZSD_SHUTDOWN_NEEDED) {
+ t->zsd_flags &= ~ZSD_SHUTDOWN_NEEDED;
+ t->zsd_flags |= ZSD_SHUTDOWN_INPROGRESS;
+ DTRACE_PROBE2(zsd__shutdown__inprogress,
+ zone_t *, zone, zone_key_t, key);
+ mutex_exit(&zone->zone_lock);
+ if (lockp != NULL)
+ mutex_exit(lockp);
+ dropped = B_TRUE;
+
+ ASSERT(t->zsd_shutdown != NULL);
+ data = t->zsd_data;
+
+ DTRACE_PROBE2(zsd__shutdown__start,
+ zone_t *, zone, zone_key_t, key);
+
+ (t->zsd_shutdown)(zone->zone_id, data);
+ DTRACE_PROBE2(zsd__shutdown__end,
+ zone_t *, zone, zone_key_t, key);
+
+ if (lockp != NULL)
+ mutex_enter(lockp);
+ mutex_enter(&zone->zone_lock);
+ t->zsd_flags &= ~ZSD_SHUTDOWN_INPROGRESS;
+ t->zsd_flags |= ZSD_SHUTDOWN_COMPLETED;
+ cv_broadcast(&t->zsd_cv);
+ DTRACE_PROBE2(zsd__shutdown__completed,
+ zone_t *, zone, zone_key_t, key);
+ }
+ if (!zone_lock_held)
+ mutex_exit(&zone->zone_lock);
+ return (dropped);
+}
+
+/*
+ * Call the destroy function for the zone and key if DESTROY_NEEDED
+ * is set.
+ * If some other thread gets here first and sets *_INPROGRESS, then
+ * we wait for that thread to complete so that we can ensure that
+ * all the callbacks are done when we've looped over all zones/keys.
+ *
+ * When we call the destroy function, we drop the global lock held by the
+ * caller, and return true to tell the caller it needs to re-evaluate the
+ * state.
+ * If the caller holds zone_lock then zone_lock_held is set, and zone_lock
+ * remains held on exit.
+ */
+static boolean_t
+zsd_apply_destroy(kmutex_t *lockp, boolean_t zone_lock_held,
+ zone_t *zone, zone_key_t key)
+{
+ struct zsd_entry *t;
+ void *data;
+ boolean_t dropped;
+
+ if (lockp != NULL) {
+ ASSERT(MUTEX_HELD(lockp));
+ }
+ if (zone_lock_held) {
+ ASSERT(MUTEX_HELD(&zone->zone_lock));
+ } else {
+ mutex_enter(&zone->zone_lock);
+ }
+
+ t = zsd_find(&zone->zone_zsd, key);
+ if (t == NULL) {
+ /*
+ * Somebody else got here first, e.g. the zone is
+ * going away.
+ */
+ if (!zone_lock_held)
+ mutex_exit(&zone->zone_lock);
+ return (B_FALSE);
+ }
+ dropped = B_FALSE;
+ if (zsd_wait_for_creator(zone, t, lockp))
+ dropped = B_TRUE;
+
+ if (zsd_wait_for_inprogress(zone, t, lockp))
+ dropped = B_TRUE;
+
+ if (t->zsd_flags & ZSD_DESTROY_NEEDED) {
+ t->zsd_flags &= ~ZSD_DESTROY_NEEDED;
+ t->zsd_flags |= ZSD_DESTROY_INPROGRESS;
+ DTRACE_PROBE2(zsd__destroy__inprogress,
+ zone_t *, zone, zone_key_t, key);
+ mutex_exit(&zone->zone_lock);
+ if (lockp != NULL)
+ mutex_exit(lockp);
+ dropped = B_TRUE;
+
+ ASSERT(t->zsd_destroy != NULL);
+ data = t->zsd_data;
+ DTRACE_PROBE2(zsd__destroy__start,
+ zone_t *, zone, zone_key_t, key);
+
+ (t->zsd_destroy)(zone->zone_id, data);
+ DTRACE_PROBE2(zsd__destroy__end,
+ zone_t *, zone, zone_key_t, key);
+
+ if (lockp != NULL)
+ mutex_enter(lockp);
+ mutex_enter(&zone->zone_lock);
+ t->zsd_data = NULL;
+ t->zsd_flags &= ~ZSD_DESTROY_INPROGRESS;
+ t->zsd_flags |= ZSD_DESTROY_COMPLETED;
+ cv_broadcast(&t->zsd_cv);
+ DTRACE_PROBE2(zsd__destroy__completed,
+ zone_t *, zone, zone_key_t, key);
+ }
+ if (!zone_lock_held)
+ mutex_exit(&zone->zone_lock);
+ return (dropped);
+}
+
+/*
+ * Wait for any CREATE_NEEDED flag to be cleared.
+ * Returns true if lockp was temporarily dropped while waiting.
+ */
+static boolean_t
+zsd_wait_for_creator(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
+{
+ boolean_t dropped = B_FALSE;
+
+ while (t->zsd_flags & ZSD_CREATE_NEEDED) {
+ DTRACE_PROBE2(zsd__wait__for__creator,
+ zone_t *, zone, struct zsd_entry *, t);
+ if (lockp != NULL) {
+ dropped = B_TRUE;
+ mutex_exit(lockp);
+ }
+ cv_wait(&t->zsd_cv, &zone->zone_lock);
+ if (lockp != NULL) {
+ /* First drop zone_lock to preserve order */
+ mutex_exit(&zone->zone_lock);
+ mutex_enter(lockp);
+ mutex_enter(&zone->zone_lock);
+ }
+ }
+ return (dropped);
+}
+
+/*
+ * Wait for any INPROGRESS flag to be cleared.
+ * Returns true if lockp was temporarily dropped while waiting.
+ */
+static boolean_t
+zsd_wait_for_inprogress(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp)
+{
+ boolean_t dropped = B_FALSE;
+
+ while (t->zsd_flags & ZSD_ALL_INPROGRESS) {
+ DTRACE_PROBE2(zsd__wait__for__inprogress,
+ zone_t *, zone, struct zsd_entry *, t);
+ if (lockp != NULL) {
+ dropped = B_TRUE;
+ mutex_exit(lockp);
+ }
+ cv_wait(&t->zsd_cv, &zone->zone_lock);
+ if (lockp != NULL) {
+ /* First drop zone_lock to preserve order */
+ mutex_exit(&zone->zone_lock);
+ mutex_enter(lockp);
+ mutex_enter(&zone->zone_lock);
+ }
+ }
+ return (dropped);
}
/*
@@ -2960,10 +3372,15 @@ zsched(void *arg)
/*
* Tell the world that we're done setting up.
*
- * At this point we want to set the zone status to ZONE_IS_READY
+ * At this point we want to set the zone status to ZONE_IS_INITIALIZED
* and atomically set the zone's processor set visibility. Once
* we drop pool_lock() this zone will automatically get updated
* to reflect any future changes to the pools configuration.
+ *
+ * Note that after we drop the locks below (zonehash_lock in
+ * particular) other operations such as a zone_getattr call can
+ * now proceed and observe the zone. That is the reason for doing a
+ * state transition to the INITIALIZED state.
*/
pool_lock();
mutex_enter(&cpu_lock);
@@ -2974,12 +3391,21 @@ zsched(void *arg)
zone_pset_set(zone, pool_default->pool_pset->pset_id);
mutex_enter(&zone_status_lock);
ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
- zone_status_set(zone, ZONE_IS_READY);
+ zone_status_set(zone, ZONE_IS_INITIALIZED);
mutex_exit(&zone_status_lock);
mutex_exit(&zonehash_lock);
mutex_exit(&cpu_lock);
pool_unlock();
+ /* Now call the create callback for this key */
+ zsd_apply_all_keys(zsd_apply_create, zone);
+
+ /* The callbacks are complete. Mark ZONE_IS_READY */
+ mutex_enter(&zone_status_lock);
+ ASSERT(zone_status_get(zone) == ZONE_IS_INITIALIZED);
+ zone_status_set(zone, ZONE_IS_READY);
+ mutex_exit(&zone_status_lock);
+
/*
* Once we see the zone transition to the ZONE_IS_BOOTING state,
* we launch init, and set the state to running.
@@ -4071,7 +4497,7 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
return (set_errno(EINVAL));
}
zone_status = zone_status_get(zone);
- if (zone_status < ZONE_IS_READY) {
+ if (zone_status < ZONE_IS_INITIALIZED) {
mutex_exit(&zonehash_lock);
return (set_errno(EINVAL));
}
@@ -5698,21 +6124,28 @@ zone_list_datalink(zoneid_t zoneid, int *nump, char *buf)
/*
* Public interface for looking up a zone by zoneid. It's a customized version
- * for netstack_zone_create(), it:
- * 1. Doesn't acquire the zonehash_lock, since it is called from
- * zone_key_create() or zone_zsd_configure(), lock already held.
- * 2. Doesn't check the status of the zone.
- * 3. It will be called even before zone_init is called, in that case the
+ * for netstack_zone_create(). It can only be called from the zsd create
+ * callbacks, since it doesn't hold a reference on the zone structure; hence if
+ * it is called elsewhere the zone could disappear after the zonehash_lock
+ * is dropped.
+ *
+ * Furthermore it
+ * 1. Doesn't check the status of the zone.
+ * 2. It will be called even before zone_init is called, in that case the
* address of zone0 is returned directly, and netstack_zone_create()
* will only assign a value to zone0.zone_netstack, won't break anything.
+ * 3. Returns without the zone being held.
*/
zone_t *
zone_find_by_id_nolock(zoneid_t zoneid)
{
- ASSERT(MUTEX_HELD(&zonehash_lock));
+ zone_t *zone;
+ mutex_enter(&zonehash_lock);
if (zonehashbyid == NULL)
- return (&zone0);
+ zone = &zone0;
else
- return (zone_find_all_by_id(zoneid));
+ zone = zone_find_all_by_id(zoneid);
+ mutex_exit(&zonehash_lock);
+ return (zone);
}
diff --git a/usr/src/uts/common/sys/netstack.h b/usr/src/uts/common/sys/netstack.h
index 795cf37eb5..9bd7701693 100644
--- a/usr/src/uts/common/sys/netstack.h
+++ b/usr/src/uts/common/sys/netstack.h
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#ifndef _SYS_NETSTACK_H
@@ -84,6 +84,46 @@ typedef id_t netstackid_t;
#define NS_MAX (NS_STR+1)
/*
+ * State maintained for each module which tracks the state of
+ * the create, shutdown and destroy callbacks.
+ *
+ * Keeps track of pending actions to avoid holding locks when
+ * calling into the create/shutdown/destroy functions in the module.
+ */
+#ifdef _KERNEL
+typedef struct {
+ uint16_t nms_flags;
+ kcondvar_t nms_cv;
+} nm_state_t;
+
+/*
+ * nms_flags
+ */
+#define NSS_CREATE_NEEDED 0x0001
+#define NSS_CREATE_INPROGRESS 0x0002
+#define NSS_CREATE_COMPLETED 0x0004
+#define NSS_SHUTDOWN_NEEDED 0x0010
+#define NSS_SHUTDOWN_INPROGRESS 0x0020
+#define NSS_SHUTDOWN_COMPLETED 0x0040
+#define NSS_DESTROY_NEEDED 0x0100
+#define NSS_DESTROY_INPROGRESS 0x0200
+#define NSS_DESTROY_COMPLETED 0x0400
+
+#define NSS_CREATE_ALL \
+ (NSS_CREATE_NEEDED|NSS_CREATE_INPROGRESS|NSS_CREATE_COMPLETED)
+#define NSS_SHUTDOWN_ALL \
+ (NSS_SHUTDOWN_NEEDED|NSS_SHUTDOWN_INPROGRESS|NSS_SHUTDOWN_COMPLETED)
+#define NSS_DESTROY_ALL \
+ (NSS_DESTROY_NEEDED|NSS_DESTROY_INPROGRESS|NSS_DESTROY_COMPLETED)
+
+#define NSS_ALL_INPROGRESS \
+ (NSS_CREATE_INPROGRESS|NSS_SHUTDOWN_INPROGRESS|NSS_DESTROY_INPROGRESS)
+#else
+/* User-level compile like IP Filter needs a netstack_t. Dummy */
+typedef uint_t nm_state_t;
+#endif /* _KERNEL */
+
+/*
* One for every netstack in the system.
 * We use a union so that the compiler and lint can provide type checking -
* in principle we could have
@@ -136,7 +176,7 @@ struct netstack {
#define netstack_ipf netstack_u.nu_s.nu_ipf
#define netstack_str netstack_u.nu_s.nu_str
- uint16_t netstack_m_state[NS_MAX]; /* module state */
+ nm_state_t netstack_m_state[NS_MAX]; /* module state */
kmutex_t netstack_lock;
struct netstack *netstack_next;
@@ -144,34 +184,23 @@ struct netstack {
int netstack_numzones; /* Number of zones using this */
int netstack_refcnt; /* Number of hold-rele */
int netstack_flags; /* See below */
+
+#ifdef _KERNEL
+ /* Needed to ensure that we run the callback functions in order */
+ kcondvar_t netstack_cv;
+#endif
};
typedef struct netstack netstack_t;
/* netstack_flags values */
-#define NSF_UNINIT 0x01 /* Not initialized */
-#define NSF_CLOSING 0x02 /* Going away */
+#define NSF_UNINIT 0x01 /* Not initialized */
+#define NSF_CLOSING 0x02 /* Going away */
+#define NSF_ZONE_CREATE 0x04 /* create callbacks inprog */
+#define NSF_ZONE_SHUTDOWN 0x08 /* shutdown callbacks */
+#define NSF_ZONE_DESTROY 0x10 /* destroy callbacks */
-/*
- * State for each module for each stack - netstack_m_state[moduleid]
- * Keeps track of pending actions to avoid holding looks when
- * calling into the create/shutdown/destroy functions in the module.
- */
-#define NSS_CREATE_NEEDED 0x0001
-#define NSS_CREATE_INPROGRESS 0x0002
-#define NSS_CREATE_COMPLETED 0x0004
-#define NSS_SHUTDOWN_NEEDED 0x0010
-#define NSS_SHUTDOWN_INPROGRESS 0x0020
-#define NSS_SHUTDOWN_COMPLETED 0x0040
-#define NSS_DESTROY_NEEDED 0x0100
-#define NSS_DESTROY_INPROGRESS 0x0200
-#define NSS_DESTROY_COMPLETED 0x0400
-
-#define NSS_CREATE_ALL \
- (NSS_CREATE_NEEDED|NSS_CREATE_INPROGRESS|NSS_CREATE_COMPLETED)
-#define NSS_SHUTDOWN_ALL \
- (NSS_SHUTDOWN_NEEDED|NSS_SHUTDOWN_INPROGRESS|NSS_SHUTDOWN_COMPLETED)
-#define NSS_DESTROY_ALL \
- (NSS_DESTROY_NEEDED|NSS_DESTROY_INPROGRESS|NSS_DESTROY_COMPLETED)
+#define NSF_ZONE_INPROGRESS \
+ (NSF_ZONE_CREATE|NSF_ZONE_SHUTDOWN|NSF_ZONE_DESTROY)
/*
* One for each of the NS_* values.
@@ -185,6 +214,7 @@ struct netstack_registry {
/* nr_flags values */
#define NRF_REGISTERED 0x01
+#define NRF_DYING 0x02 /* No new creates */
/*
* To support kstat_create_netstack() using kstat_add_zone we need
diff --git a/usr/src/uts/common/sys/zone.h b/usr/src/uts/common/sys/zone.h
index 7e7dd9e88a..0a93e8651e 100644
--- a/usr/src/uts/common/sys/zone.h
+++ b/usr/src/uts/common/sys/zone.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -106,6 +106,7 @@ extern "C" {
#define ZONE_EVENT_STATUS_SUBCLASS "change"
#define ZONE_EVENT_UNINITIALIZED "uninitialized"
+#define ZONE_EVENT_INITIALIZED "initialized"
#define ZONE_EVENT_READY "ready"
#define ZONE_EVENT_RUNNING "running"
#define ZONE_EVENT_SHUTTING_DOWN "shutting_down"
@@ -201,6 +202,7 @@ typedef struct {
/* zone_status */
typedef enum {
ZONE_IS_UNINITIALIZED = 0,
+ ZONE_IS_INITIALIZED,
ZONE_IS_READY,
ZONE_IS_BOOTING,
ZONE_IS_RUNNING,
@@ -268,7 +270,6 @@ typedef struct zone_cmd_rval {
#define ZONE_DOOR_PATH ZONES_TMPDIR "/%s.zoneadmd_door"
/* zone_flags */
-#define ZF_DESTROYED 0x1 /* ZSD destructor callbacks run */
#define ZF_HASHED_LABEL 0x2 /* zone has a unique label */
#define ZF_IS_SCRATCH 0x4 /* scratch zone */
#define ZF_NET_EXCL 0x8 /* Zone has an exclusive IP stack */
@@ -476,6 +477,13 @@ extern int zone_setspecific(zone_key_t, zone_t *, const void *);
/*
* The definition of a zsd_entry is truly private to zone.c and is only
* placed here so it can be shared with mdb.
+ *
+ * State maintained for each zone times each registered key, which tracks
+ * the state of the create, shutdown and destroy callbacks.
+ *
+ * zsd_flags is used to keep track of pending actions to avoid holding locks
+ * when calling the create/shutdown/destroy callbacks, since doing so
+ * could lead to deadlocks.
*/
struct zsd_entry {
zone_key_t zsd_key; /* Key used to lookup value */
@@ -488,9 +496,34 @@ struct zsd_entry {
void (*zsd_shutdown)(zoneid_t, void *);
void (*zsd_destroy)(zoneid_t, void *);
list_node_t zsd_linkage;
+ uint16_t zsd_flags; /* See below */
+ kcondvar_t zsd_cv;
};
/*
+ * zsd_flags
+ */
+#define ZSD_CREATE_NEEDED 0x0001
+#define ZSD_CREATE_INPROGRESS 0x0002
+#define ZSD_CREATE_COMPLETED 0x0004
+#define ZSD_SHUTDOWN_NEEDED 0x0010
+#define ZSD_SHUTDOWN_INPROGRESS 0x0020
+#define ZSD_SHUTDOWN_COMPLETED 0x0040
+#define ZSD_DESTROY_NEEDED 0x0100
+#define ZSD_DESTROY_INPROGRESS 0x0200
+#define ZSD_DESTROY_COMPLETED 0x0400
+
+#define ZSD_CREATE_ALL \
+ (ZSD_CREATE_NEEDED|ZSD_CREATE_INPROGRESS|ZSD_CREATE_COMPLETED)
+#define ZSD_SHUTDOWN_ALL \
+ (ZSD_SHUTDOWN_NEEDED|ZSD_SHUTDOWN_INPROGRESS|ZSD_SHUTDOWN_COMPLETED)
+#define ZSD_DESTROY_ALL \
+ (ZSD_DESTROY_NEEDED|ZSD_DESTROY_INPROGRESS|ZSD_DESTROY_COMPLETED)
+
+#define ZSD_ALL_INPROGRESS \
+ (ZSD_CREATE_INPROGRESS|ZSD_SHUTDOWN_INPROGRESS|ZSD_DESTROY_INPROGRESS)
+
+/*
* Macros to help with zone visibility restrictions.
*/