diff options
author | nordmark <none@none> | 2008-01-22 15:57:26 -0800 |
---|---|---|
committer | nordmark <none@none> | 2008-01-22 15:57:26 -0800 |
commit | bd41d0a82bd89bc81d63ae5dfc2ba4245f74ea6c (patch) | |
tree | 34d74b100f909c973299a5ded0d0a231ac2d069e /usr/src | |
parent | c63537d6ab9d03a6ce330b36e829aba258c25d87 (diff) | |
download | illumos-joyent-bd41d0a82bd89bc81d63ae5dfc2ba4245f74ea6c.tar.gz |
6558857 ZSD callback locking cause deadlocks
Diffstat (limited to 'usr/src')
-rw-r--r-- | usr/src/lib/libzonecfg/common/libzonecfg.c | 2 | ||||
-rw-r--r-- | usr/src/uts/common/os/netstack.c | 594 | ||||
-rw-r--r-- | usr/src/uts/common/os/zone.c | 825 | ||||
-rw-r--r-- | usr/src/uts/common/sys/netstack.h | 80 | ||||
-rw-r--r-- | usr/src/uts/common/sys/zone.h | 37 |
5 files changed, 1078 insertions, 460 deletions
diff --git a/usr/src/lib/libzonecfg/common/libzonecfg.c b/usr/src/lib/libzonecfg/common/libzonecfg.c index 94644741dd..2bce66d3f5 100644 --- a/usr/src/lib/libzonecfg/common/libzonecfg.c +++ b/usr/src/lib/libzonecfg/common/libzonecfg.c @@ -5107,6 +5107,8 @@ kernel_state_to_user_state(zoneid_t zoneid, zone_status_t kernel_state) assert(kernel_state <= ZONE_MAX_STATE); switch (kernel_state) { case ZONE_IS_UNINITIALIZED: + case ZONE_IS_INITIALIZED: + /* The kernel will not return these two states */ return (ZONE_STATE_READY); case ZONE_IS_READY: /* diff --git a/usr/src/uts/common/os/netstack.c b/usr/src/uts/common/os/netstack.c index 44b147dc48..c1e59fe6c3 100644 --- a/usr/src/uts/common/os/netstack.c +++ b/usr/src/uts/common/os/netstack.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -106,10 +106,6 @@ static void *netstack_zone_create(zoneid_t zoneid); static void netstack_zone_shutdown(zoneid_t zoneid, void *arg); static void netstack_zone_destroy(zoneid_t zoneid, void *arg); -static void netstack_do_create(netstack_t *ns, int moduleid); -static void netstack_do_shutdown(netstack_t *ns, int moduleid); -static void netstack_do_destroy(netstack_t *ns, int moduleid); - static void netstack_shared_zone_add(zoneid_t zoneid); static void netstack_shared_zone_remove(zoneid_t zoneid); static void netstack_shared_kstat_add(kstat_t *ks); @@ -117,6 +113,16 @@ static void netstack_shared_kstat_remove(kstat_t *ks); typedef boolean_t applyfn_t(kmutex_t *, netstack_t *, int); +static void apply_all_netstacks(int, applyfn_t *); +static void apply_all_modules(netstack_t *, applyfn_t *); +static void apply_all_modules_reverse(netstack_t *, applyfn_t *); +static boolean_t netstack_apply_create(kmutex_t *, netstack_t *, int); +static boolean_t netstack_apply_shutdown(kmutex_t *, netstack_t *, int); +static boolean_t 
netstack_apply_destroy(kmutex_t *, netstack_t *, int); +static boolean_t wait_for_zone_creator(netstack_t *, kmutex_t *); +static boolean_t wait_for_nms_inprogress(netstack_t *, nm_state_t *, + kmutex_t *); + void netstack_init(void) { @@ -156,6 +162,10 @@ netstack_register(int moduleid, ASSERT(moduleid >= 0 && moduleid < NS_MAX); ASSERT(module_create != NULL); + /* + * Make instances created after this point in time run the create + * callback. + */ mutex_enter(&netstack_g_lock); ASSERT(ns_reg[moduleid].nr_create == NULL); ASSERT(ns_reg[moduleid].nr_flags == 0); @@ -166,15 +176,17 @@ netstack_register(int moduleid, /* * Determine the set of stacks that exist before we drop the lock. - * Set CREATE_NEEDED for each of those. + * Set NSS_CREATE_NEEDED for each of those. * netstacks which have been deleted will have NSS_CREATE_COMPLETED * set, but check NSF_CLOSING to be sure. */ for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) { + nm_state_t *nms = &ns->netstack_m_state[moduleid]; + mutex_enter(&ns->netstack_lock); if (!(ns->netstack_flags & NSF_CLOSING) && - (ns->netstack_m_state[moduleid] & NSS_CREATE_ALL) == 0) { - ns->netstack_m_state[moduleid] |= NSS_CREATE_NEEDED; + (nms->nms_flags & NSS_CREATE_ALL) == 0) { + nms->nms_flags |= NSS_CREATE_NEEDED; DTRACE_PROBE2(netstack__create__needed, netstack_t *, ns, int, moduleid); } @@ -183,12 +195,12 @@ netstack_register(int moduleid, mutex_exit(&netstack_g_lock); /* - * Call the create function for each stack that has CREATE_NEEDED - * for this moduleid. - * Set CREATE_INPROGRESS, drop lock, and after done, - * set CREATE_COMPLETE + * At this point in time a new instance can be created or an instance + * can be destroyed, or some other module can register or unregister. + * Make sure we either run all the create functions for this moduleid + * or we wait for any other creators for this moduleid. 
*/ - netstack_do_create(NULL, moduleid); + apply_all_netstacks(moduleid, netstack_apply_create); } void @@ -204,41 +216,57 @@ netstack_unregister(int moduleid) mutex_enter(&netstack_g_lock); /* * Determine the set of stacks that exist before we drop the lock. - * Set SHUTDOWN_NEEDED and DESTROY_NEEDED for each of those. + * Set NSS_SHUTDOWN_NEEDED and NSS_DESTROY_NEEDED for each of those. + * That ensures that when we return all the callbacks for existing + * instances have completed. And since we set NRF_DYING no new + * instances can use this module. */ for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) { + nm_state_t *nms = &ns->netstack_m_state[moduleid]; + mutex_enter(&ns->netstack_lock); if (ns_reg[moduleid].nr_shutdown != NULL && - (ns->netstack_m_state[moduleid] & NSS_CREATE_COMPLETED) && - (ns->netstack_m_state[moduleid] & NSS_SHUTDOWN_ALL) == 0) { - ns->netstack_m_state[moduleid] |= NSS_SHUTDOWN_NEEDED; + (nms->nms_flags & NSS_CREATE_COMPLETED) && + (nms->nms_flags & NSS_SHUTDOWN_ALL) == 0) { + nms->nms_flags |= NSS_SHUTDOWN_NEEDED; DTRACE_PROBE2(netstack__shutdown__needed, netstack_t *, ns, int, moduleid); } if ((ns_reg[moduleid].nr_flags & NRF_REGISTERED) && ns_reg[moduleid].nr_destroy != NULL && - (ns->netstack_m_state[moduleid] & NSS_CREATE_COMPLETED) && - (ns->netstack_m_state[moduleid] & NSS_DESTROY_ALL) == 0) { - ns->netstack_m_state[moduleid] |= NSS_DESTROY_NEEDED; + (nms->nms_flags & NSS_CREATE_COMPLETED) && + (nms->nms_flags & NSS_DESTROY_ALL) == 0) { + nms->nms_flags |= NSS_DESTROY_NEEDED; DTRACE_PROBE2(netstack__destroy__needed, netstack_t *, ns, int, moduleid); } mutex_exit(&ns->netstack_lock); } + /* + * Prevent any new netstack from calling the registered create + * function, while keeping the function pointers in place until the + * shutdown and destroy callbacks are complete. 
+ */ + ns_reg[moduleid].nr_flags |= NRF_DYING; mutex_exit(&netstack_g_lock); - netstack_do_shutdown(NULL, moduleid); - netstack_do_destroy(NULL, moduleid); + apply_all_netstacks(moduleid, netstack_apply_shutdown); + apply_all_netstacks(moduleid, netstack_apply_destroy); /* - * Clear the netstack_m_state so that we can handle this module + * Clear the nms_flags so that we can handle this module * being loaded again. + * Also remove the registered functions. */ mutex_enter(&netstack_g_lock); + ASSERT(ns_reg[moduleid].nr_flags & NRF_REGISTERED); + ASSERT(ns_reg[moduleid].nr_flags & NRF_DYING); for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) { + nm_state_t *nms = &ns->netstack_m_state[moduleid]; + mutex_enter(&ns->netstack_lock); - if (ns->netstack_m_state[moduleid] & NSS_DESTROY_COMPLETED) { - ns->netstack_m_state[moduleid] = 0; + if (nms->nms_flags & NSS_DESTROY_COMPLETED) { + nms->nms_flags = 0; DTRACE_PROBE2(netstack__destroy__done, netstack_t *, ns, int, moduleid); } @@ -304,6 +332,7 @@ netstack_zone_create(zoneid_t zoneid) } /* Not found */ mutex_init(&ns->netstack_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&ns->netstack_cv, NULL, CV_DEFAULT, NULL); ns->netstack_stackid = zoneid; ns->netstack_numzones = 1; ns->netstack_refcnt = 1; /* Decremented by netstack_zone_destroy */ @@ -311,26 +340,44 @@ netstack_zone_create(zoneid_t zoneid) *nsp = ns; zone->zone_netstack = ns; + mutex_enter(&ns->netstack_lock); + /* + * Mark this netstack as having a CREATE running so + * any netstack_register/netstack_unregister waits for + * the existing create callbacks to complete in moduleid order + */ + ns->netstack_flags |= NSF_ZONE_CREATE; + /* * Determine the set of module create functions that need to be * called before we drop the lock. + * Set NSS_CREATE_NEEDED for each of those. + * Skip any with NRF_DYING set, since those are in the process of + * going away, by checking for flags being exactly NRF_REGISTERED. 
*/ for (i = 0; i < NS_MAX; i++) { - mutex_enter(&ns->netstack_lock); - if ((ns_reg[i].nr_flags & NRF_REGISTERED) && - (ns->netstack_m_state[i] & NSS_CREATE_ALL) == 0) { - ns->netstack_m_state[i] |= NSS_CREATE_NEEDED; + nm_state_t *nms = &ns->netstack_m_state[i]; + + cv_init(&nms->nms_cv, NULL, CV_DEFAULT, NULL); + + if ((ns_reg[i].nr_flags == NRF_REGISTERED) && + (nms->nms_flags & NSS_CREATE_ALL) == 0) { + nms->nms_flags |= NSS_CREATE_NEEDED; DTRACE_PROBE2(netstack__create__needed, netstack_t *, ns, int, i); } - mutex_exit(&ns->netstack_lock); } + mutex_exit(&ns->netstack_lock); mutex_exit(&netstack_g_lock); - netstack_do_create(ns, NS_ALL); + apply_all_modules(ns, netstack_apply_create); + /* Tell any waiting netstack_register/netstack_unregister to proceed */ mutex_enter(&ns->netstack_lock); ns->netstack_flags &= ~NSF_UNINIT; + ASSERT(ns->netstack_flags & NSF_ZONE_CREATE); + ns->netstack_flags &= ~NSF_ZONE_CREATE; + cv_broadcast(&ns->netstack_cv); mutex_exit(&ns->netstack_lock); return (ns); @@ -356,29 +403,46 @@ netstack_zone_shutdown(zoneid_t zoneid, void *arg) mutex_exit(&ns->netstack_lock); mutex_enter(&netstack_g_lock); + mutex_enter(&ns->netstack_lock); + /* + * Mark this netstack as having a SHUTDOWN running so + * any netstack_register/netstack_unregister waits for + * the existing create callbacks to complete in moduleid order + */ + ASSERT(!(ns->netstack_flags & NSF_ZONE_INPROGRESS)); + ns->netstack_flags |= NSF_ZONE_SHUTDOWN; + /* * Determine the set of stacks that exist before we drop the lock. - * Set SHUTDOWN_NEEDED for each of those. + * Set NSS_SHUTDOWN_NEEDED for each of those. 
*/ for (i = 0; i < NS_MAX; i++) { - mutex_enter(&ns->netstack_lock); + nm_state_t *nms = &ns->netstack_m_state[i]; + if ((ns_reg[i].nr_flags & NRF_REGISTERED) && ns_reg[i].nr_shutdown != NULL && - (ns->netstack_m_state[i] & NSS_CREATE_COMPLETED) && - (ns->netstack_m_state[i] & NSS_SHUTDOWN_ALL) == 0) { - ns->netstack_m_state[i] |= NSS_SHUTDOWN_NEEDED; + (nms->nms_flags & NSS_CREATE_COMPLETED) && + (nms->nms_flags & NSS_SHUTDOWN_ALL) == 0) { + nms->nms_flags |= NSS_SHUTDOWN_NEEDED; DTRACE_PROBE2(netstack__shutdown__needed, netstack_t *, ns, int, i); } - mutex_exit(&ns->netstack_lock); } + mutex_exit(&ns->netstack_lock); mutex_exit(&netstack_g_lock); /* * Call the shutdown function for all registered modules for this * netstack. */ - netstack_do_shutdown(ns, NS_ALL); + apply_all_modules(ns, netstack_apply_shutdown); + + /* Tell any waiting netstack_register/netstack_unregister to proceed */ + mutex_enter(&ns->netstack_lock); + ASSERT(ns->netstack_flags & NSF_ZONE_SHUTDOWN); + ns->netstack_flags &= ~NSF_ZONE_SHUTDOWN; + cv_broadcast(&ns->netstack_cv); + mutex_exit(&ns->netstack_lock); } /* @@ -429,70 +493,183 @@ netstack_stack_inactive(netstack_t *ns) int i; mutex_enter(&netstack_g_lock); + mutex_enter(&ns->netstack_lock); + /* + * Mark this netstack as having a DESTROY running so + * any netstack_register/netstack_unregister waits for + * the existing destroy callbacks to complete in reverse moduleid order + */ + ASSERT(!(ns->netstack_flags & NSF_ZONE_INPROGRESS)); + ns->netstack_flags |= NSF_ZONE_DESTROY; /* * If the shutdown callback wasn't called earlier (e.g., if this is - * a netstack shared between multiple zones), then we call it now. + * a netstack shared between multiple zones), then we schedule it now. + * + * Determine the set of stacks that exist before we drop the lock. + * Set NSS_DESTROY_NEEDED for each of those. That + * ensures that when we return all the callbacks for existing + * instances have completed. 
*/ for (i = 0; i < NS_MAX; i++) { - mutex_enter(&ns->netstack_lock); + nm_state_t *nms = &ns->netstack_m_state[i]; + if ((ns_reg[i].nr_flags & NRF_REGISTERED) && ns_reg[i].nr_shutdown != NULL && - (ns->netstack_m_state[i] & NSS_CREATE_COMPLETED) && - (ns->netstack_m_state[i] & NSS_SHUTDOWN_ALL) == 0) { - ns->netstack_m_state[i] |= NSS_SHUTDOWN_NEEDED; + (nms->nms_flags & NSS_CREATE_COMPLETED) && + (nms->nms_flags & NSS_SHUTDOWN_ALL) == 0) { + nms->nms_flags |= NSS_SHUTDOWN_NEEDED; DTRACE_PROBE2(netstack__shutdown__needed, netstack_t *, ns, int, i); } - mutex_exit(&ns->netstack_lock); - } - /* - * Determine the set of stacks that exist before we drop the lock. - * Set DESTROY_NEEDED for each of those. - */ - for (i = 0; i < NS_MAX; i++) { - mutex_enter(&ns->netstack_lock); + if ((ns_reg[i].nr_flags & NRF_REGISTERED) && ns_reg[i].nr_destroy != NULL && - (ns->netstack_m_state[i] & NSS_CREATE_COMPLETED) && - (ns->netstack_m_state[i] & NSS_DESTROY_ALL) == 0) { - ns->netstack_m_state[i] |= NSS_DESTROY_NEEDED; + (nms->nms_flags & NSS_CREATE_COMPLETED) && + (nms->nms_flags & NSS_DESTROY_ALL) == 0) { + nms->nms_flags |= NSS_DESTROY_NEEDED; DTRACE_PROBE2(netstack__destroy__needed, netstack_t *, ns, int, i); } - mutex_exit(&ns->netstack_lock); } + mutex_exit(&ns->netstack_lock); mutex_exit(&netstack_g_lock); /* * Call the shutdown and destroy functions for all registered modules * for this netstack. + * + * Since there are some ordering dependencies between the modules we + * tear them down in the reverse order of what was used to create them. + * + * Since a netstack_t is never reused (when a zone is rebooted it gets + * a new zoneid == netstackid i.e. a new netstack_t is allocated) we + * leave nms_flags the way it is i.e. with NSS_DESTROY_COMPLETED set. + * That is different than in the netstack_unregister() case. 
*/ - netstack_do_shutdown(ns, NS_ALL); - netstack_do_destroy(ns, NS_ALL); + apply_all_modules(ns, netstack_apply_shutdown); + apply_all_modules_reverse(ns, netstack_apply_destroy); + + /* Tell any waiting netstack_register/netstack_unregister to proceed */ + mutex_enter(&ns->netstack_lock); + ASSERT(ns->netstack_flags & NSF_ZONE_DESTROY); + ns->netstack_flags &= ~NSF_ZONE_DESTROY; + cv_broadcast(&ns->netstack_cv); + mutex_exit(&ns->netstack_lock); +} + +/* + * Apply a function to all netstacks for a particular moduleid. + * + * If there is any zone activity (due to a zone being created, shutdown, + * or destroyed) we wait for that to complete before we proceed. This ensures + * that the moduleids are processed in order when a zone is created or + * destroyed. + * + * The applyfn has to drop netstack_g_lock if it does some work. + * In that case we don't follow netstack_next, + * even if it is possible to do so without any hazards. This is + * because we want the design to allow for the list of netstacks threaded + * by netstack_next to change in any arbitrary way during the time the + * lock was dropped. + * + * It is safe to restart the loop at netstack_head since the applyfn + * changes netstack_m_state as it processes things, so a subsequent + * pass through will have no effect in applyfn, hence the loop will terminate + * in at worst O(N^2). + */ +static void +apply_all_netstacks(int moduleid, applyfn_t *applyfn) +{ + netstack_t *ns; + + mutex_enter(&netstack_g_lock); + ns = netstack_head; + while (ns != NULL) { + if (wait_for_zone_creator(ns, &netstack_g_lock)) { + /* Lock dropped - restart at head */ + ns = netstack_head; + } else if ((applyfn)(&netstack_g_lock, ns, moduleid)) { + /* Lock dropped - restart at head */ + ns = netstack_head; + } else { + ns = ns->netstack_next; + } + } + mutex_exit(&netstack_g_lock); +} + +/* + * Apply a function to all moduleids for a particular netstack. 
+ * + * Since the netstack linkage doesn't matter in this case we can + * ignore whether the function drops the lock. + */ +static void +apply_all_modules(netstack_t *ns, applyfn_t *applyfn) +{ + int i; + + mutex_enter(&netstack_g_lock); + for (i = 0; i < NS_MAX; i++) { + /* + * We don't care whether the lock was dropped + * since we are not iterating over netstack_head. + */ + (void) (applyfn)(&netstack_g_lock, ns, i); + } + mutex_exit(&netstack_g_lock); +} + +/* Like the above but in reverse moduleid order */ +static void +apply_all_modules_reverse(netstack_t *ns, applyfn_t *applyfn) +{ + int i; + + mutex_enter(&netstack_g_lock); + for (i = NS_MAX-1; i >= 0; i--) { + /* + * We don't care whether the lock was dropped + * since we are not iterating over netstack_head. + */ + (void) (applyfn)(&netstack_g_lock, ns, i); + } + mutex_exit(&netstack_g_lock); } /* * Call the create function for the ns and moduleid if CREATE_NEEDED * is set. - * When it calls it, it drops the netstack_lock held by the caller, - * and returns true to tell the caller it needs to re-evalute the - * state.. + * If some other thread gets here first and sets *_INPROGRESS, then + * we wait for that thread to complete so that we can ensure that + * all the callbacks are done when we've looped over all netstacks/moduleids. + * + * When we call the create function, we temporarily drop the netstack_lock + * held by the caller, and return true to tell the caller it needs to + * re-evalute the state. 
*/ static boolean_t netstack_apply_create(kmutex_t *lockp, netstack_t *ns, int moduleid) { void *result; netstackid_t stackid; + nm_state_t *nms = &ns->netstack_m_state[moduleid]; + boolean_t dropped = B_FALSE; ASSERT(MUTEX_HELD(lockp)); mutex_enter(&ns->netstack_lock); - if (ns->netstack_m_state[moduleid] & NSS_CREATE_NEEDED) { - ns->netstack_m_state[moduleid] &= ~NSS_CREATE_NEEDED; - ns->netstack_m_state[moduleid] |= NSS_CREATE_INPROGRESS; + + if (wait_for_nms_inprogress(ns, nms, lockp)) + dropped = B_TRUE; + + if (nms->nms_flags & NSS_CREATE_NEEDED) { + nms->nms_flags &= ~NSS_CREATE_NEEDED; + nms->nms_flags |= NSS_CREATE_INPROGRESS; DTRACE_PROBE2(netstack__create__inprogress, netstack_t *, ns, int, moduleid); mutex_exit(&ns->netstack_lock); mutex_exit(lockp); + dropped = B_TRUE; ASSERT(ns_reg[moduleid].nr_create != NULL); stackid = ns->netstack_stackid; @@ -504,42 +681,55 @@ netstack_apply_create(kmutex_t *lockp, netstack_t *ns, int moduleid) void *, result, netstack_t *, ns); ASSERT(result != NULL); + mutex_enter(lockp); mutex_enter(&ns->netstack_lock); ns->netstack_modules[moduleid] = result; - ns->netstack_m_state[moduleid] &= ~NSS_CREATE_INPROGRESS; - ns->netstack_m_state[moduleid] |= NSS_CREATE_COMPLETED; + nms->nms_flags &= ~NSS_CREATE_INPROGRESS; + nms->nms_flags |= NSS_CREATE_COMPLETED; + cv_broadcast(&nms->nms_cv); DTRACE_PROBE2(netstack__create__completed, netstack_t *, ns, int, moduleid); mutex_exit(&ns->netstack_lock); - return (B_TRUE); + return (dropped); } else { mutex_exit(&ns->netstack_lock); - return (B_FALSE); + return (dropped); } } /* * Call the shutdown function for the ns and moduleid if SHUTDOWN_NEEDED * is set. - * When it calls it, it drops the netstack_lock held by the caller, - * and returns true to tell the caller it needs to re-evalute the - * state.. 
+ * If some other thread gets here first and sets *_INPROGRESS, then + * we wait for that thread to complete so that we can ensure that + * all the callbacks are done when we've looped over all netstacks/moduleids. + * + * When we call the shutdown function, we temporarily drop the netstack_lock + * held by the caller, and return true to tell the caller it needs to + * re-evalute the state. */ static boolean_t netstack_apply_shutdown(kmutex_t *lockp, netstack_t *ns, int moduleid) { netstackid_t stackid; void * netstack_module; + nm_state_t *nms = &ns->netstack_m_state[moduleid]; + boolean_t dropped = B_FALSE; ASSERT(MUTEX_HELD(lockp)); mutex_enter(&ns->netstack_lock); - if (ns->netstack_m_state[moduleid] & NSS_SHUTDOWN_NEEDED) { - ns->netstack_m_state[moduleid] &= ~NSS_SHUTDOWN_NEEDED; - ns->netstack_m_state[moduleid] |= NSS_SHUTDOWN_INPROGRESS; + + if (wait_for_nms_inprogress(ns, nms, lockp)) + dropped = B_TRUE; + + if (nms->nms_flags & NSS_SHUTDOWN_NEEDED) { + nms->nms_flags &= ~NSS_SHUTDOWN_NEEDED; + nms->nms_flags |= NSS_SHUTDOWN_INPROGRESS; DTRACE_PROBE2(netstack__shutdown__inprogress, netstack_t *, ns, int, moduleid); mutex_exit(&ns->netstack_lock); mutex_exit(lockp); + dropped = B_TRUE; ASSERT(ns_reg[moduleid].nr_shutdown != NULL); stackid = ns->netstack_stackid; @@ -551,43 +741,55 @@ netstack_apply_shutdown(kmutex_t *lockp, netstack_t *ns, int moduleid) DTRACE_PROBE1(netstack__shutdown__end, netstack_t *, ns); + mutex_enter(lockp); mutex_enter(&ns->netstack_lock); - ns->netstack_m_state[moduleid] &= ~NSS_SHUTDOWN_INPROGRESS; - ns->netstack_m_state[moduleid] |= NSS_SHUTDOWN_COMPLETED; + nms->nms_flags &= ~NSS_SHUTDOWN_INPROGRESS; + nms->nms_flags |= NSS_SHUTDOWN_COMPLETED; + cv_broadcast(&nms->nms_cv); DTRACE_PROBE2(netstack__shutdown__completed, netstack_t *, ns, int, moduleid); mutex_exit(&ns->netstack_lock); - return (B_TRUE); + return (dropped); } else { mutex_exit(&ns->netstack_lock); - return (B_FALSE); + return (dropped); } } /* * Call the destroy 
function for the ns and moduleid if DESTROY_NEEDED * is set. - * When it calls it, it drops the netstack_lock held by the caller, - * and returns true to tell the caller it needs to re-evalute the - * state.. + * If some other thread gets here first and sets *_INPROGRESS, then + * we wait for that thread to complete so that we can ensure that + * all the callbacks are done when we've looped over all netstacks/moduleids. + * + * When we call the destroy function, we temporarily drop the netstack_lock + * held by the caller, and return true to tell the caller it needs to + * re-evalute the state. */ static boolean_t netstack_apply_destroy(kmutex_t *lockp, netstack_t *ns, int moduleid) { netstackid_t stackid; void * netstack_module; + nm_state_t *nms = &ns->netstack_m_state[moduleid]; + boolean_t dropped = B_FALSE; ASSERT(MUTEX_HELD(lockp)); mutex_enter(&ns->netstack_lock); - if (ns->netstack_m_state[moduleid] & NSS_DESTROY_NEEDED) { - ns->netstack_m_state[moduleid] &= ~NSS_DESTROY_NEEDED; - ns->netstack_m_state[moduleid] |= NSS_DESTROY_INPROGRESS; + + if (wait_for_nms_inprogress(ns, nms, lockp)) + dropped = B_TRUE; + + if (nms->nms_flags & NSS_DESTROY_NEEDED) { + nms->nms_flags &= ~NSS_DESTROY_NEEDED; + nms->nms_flags |= NSS_DESTROY_INPROGRESS; DTRACE_PROBE2(netstack__destroy__inprogress, netstack_t *, ns, int, moduleid); mutex_exit(&ns->netstack_lock); mutex_exit(lockp); + dropped = B_TRUE; - /* XXX race against unregister? 
*/ ASSERT(ns_reg[moduleid].nr_destroy != NULL); stackid = ns->netstack_stackid; netstack_module = ns->netstack_modules[moduleid]; @@ -598,177 +800,83 @@ netstack_apply_destroy(kmutex_t *lockp, netstack_t *ns, int moduleid) DTRACE_PROBE1(netstack__destroy__end, netstack_t *, ns); + mutex_enter(lockp); mutex_enter(&ns->netstack_lock); ns->netstack_modules[moduleid] = NULL; - ns->netstack_m_state[moduleid] &= ~NSS_DESTROY_INPROGRESS; - ns->netstack_m_state[moduleid] |= NSS_DESTROY_COMPLETED; + nms->nms_flags &= ~NSS_DESTROY_INPROGRESS; + nms->nms_flags |= NSS_DESTROY_COMPLETED; + cv_broadcast(&nms->nms_cv); DTRACE_PROBE2(netstack__destroy__completed, netstack_t *, ns, int, moduleid); mutex_exit(&ns->netstack_lock); - return (B_TRUE); + return (dropped); } else { mutex_exit(&ns->netstack_lock); - return (B_FALSE); + return (dropped); } } /* - * Apply a function to all netstacks for a particular moduleid. - * - * The applyfn has to drop netstack_g_lock if it does some work. - * In that case we don't follow netstack_next after reacquiring the - * lock, even if it is possible to do so without any hazards. This is - * because we want the design to allow for the list of netstacks threaded - * by netstack_next to change in any arbitrary way during the time the - * lock was dropped. - * - * It is safe to restart the loop at netstack_head since the applyfn - * changes netstack_m_state as it processes things, so a subsequent - * pass through will have no effect in applyfn, hence the loop will terminate - * in at worst O(N^2). + * If somebody is creating the netstack (due to a new zone being created) + * then we wait for them to complete. This ensures that any additional + * netstack_register() doesn't cause the create functions to run out of + * order. + * Note that we do not need such a global wait in the case of the shutdown + * and destroy callbacks, since in that case it is sufficient for both + * threads to set NEEDED and wait for INPROGRESS to ensure ordering. 
+ * Returns true if lockp was temporarily dropped while waiting. */ -static void -apply_all_netstacks(int moduleid, applyfn_t *applyfn) +static boolean_t +wait_for_zone_creator(netstack_t *ns, kmutex_t *lockp) { - netstack_t *ns; + boolean_t dropped = B_FALSE; - mutex_enter(&netstack_g_lock); - ns = netstack_head; - while (ns != NULL) { - if ((applyfn)(&netstack_g_lock, ns, moduleid)) { - /* Lock dropped - restart at head */ -#ifdef NS_DEBUG - (void) printf("apply_all_netstacks: " - "LD for %p/%d, %d\n", - (void *)ns, ns->netstack_stackid, moduleid); -#endif - mutex_enter(&netstack_g_lock); - ns = netstack_head; - } else { - ns = ns->netstack_next; + mutex_enter(&ns->netstack_lock); + while (ns->netstack_flags & NSF_ZONE_CREATE) { + DTRACE_PROBE1(netstack__wait__zone__inprogress, + netstack_t *, ns); + if (lockp != NULL) { + dropped = B_TRUE; + mutex_exit(lockp); + } + cv_wait(&ns->netstack_cv, &ns->netstack_lock); + if (lockp != NULL) { + /* First drop netstack_lock to preserve order */ + mutex_exit(&ns->netstack_lock); + mutex_enter(lockp); + mutex_enter(&ns->netstack_lock); } } - mutex_exit(&netstack_g_lock); + mutex_exit(&ns->netstack_lock); + return (dropped); } /* - * Apply a function to all moduleids for a particular netstack. - * - * Since the netstack linkage doesn't matter in this case we can - * ignore whether the function drops the lock. + * Wait for any INPROGRESS flag to be cleared for the netstack/moduleid + * combination. + * Returns true if lockp was temporarily dropped while waiting. */ -static void -apply_all_modules(netstack_t *ns, applyfn_t *applyfn) +static boolean_t +wait_for_nms_inprogress(netstack_t *ns, nm_state_t *nms, kmutex_t *lockp) { - int i; - - mutex_enter(&netstack_g_lock); - for (i = 0; i < NS_MAX; i++) { - if ((applyfn)(&netstack_g_lock, ns, i)) { - /* - * Lock dropped but since we are not iterating over - * netstack_head we can just reacquire the lock. 
- */ - mutex_enter(&netstack_g_lock); + boolean_t dropped = B_FALSE; + + while (nms->nms_flags & NSS_ALL_INPROGRESS) { + DTRACE_PROBE2(netstack__wait__nms__inprogress, + netstack_t *, ns, nm_state_t *, nms); + if (lockp != NULL) { + dropped = B_TRUE; + mutex_exit(lockp); } - } - mutex_exit(&netstack_g_lock); -} - -/* Like the above but in reverse moduleid order */ -static void -apply_all_modules_reverse(netstack_t *ns, applyfn_t *applyfn) -{ - int i; - - mutex_enter(&netstack_g_lock); - for (i = NS_MAX-1; i >= 0; i--) { - if ((applyfn)(&netstack_g_lock, ns, i)) { - /* - * Lock dropped but since we are not iterating over - * netstack_head we can just reacquire the lock. - */ - mutex_enter(&netstack_g_lock); + cv_wait(&nms->nms_cv, &ns->netstack_lock); + if (lockp != NULL) { + /* First drop netstack_lock to preserve order */ + mutex_exit(&ns->netstack_lock); + mutex_enter(lockp); + mutex_enter(&ns->netstack_lock); } } - mutex_exit(&netstack_g_lock); -} - -/* - * Apply a function to a subset of all module/netstack combinations. - * - * If ns is non-NULL we restrict it to that particular instance. - * If moduleid is a particular one (not NS_ALL), then we restrict it - * to that particular moduleid. - * When walking the moduleid, the reverse argument specifies that they - * should be walked in reverse order. - * The applyfn returns true if it had dropped the locks. - */ -static void -netstack_do_apply(netstack_t *ns, int moduleid, boolean_t reverse, - applyfn_t *applyfn) -{ - if (ns != NULL) { - ASSERT(moduleid == NS_ALL); - if (reverse) - apply_all_modules_reverse(ns, applyfn); - else - apply_all_modules(ns, applyfn); - } else { - ASSERT(moduleid != NS_ALL); - - apply_all_netstacks(moduleid, applyfn); - } -} - -/* - * Run the create function for all modules x stack combinations - * that have NSS_CREATE_NEEDED set. - * - * Call the create function for each stack that has CREATE_NEEDED. 
- * Set CREATE_INPROGRESS, drop lock, and after done, - * set CREATE_COMPLETE - */ -static void -netstack_do_create(netstack_t *ns, int moduleid) -{ - netstack_do_apply(ns, moduleid, B_FALSE, netstack_apply_create); -} - -/* - * Run the shutdown function for all modules x stack combinations - * that have NSS_SHUTDOWN_NEEDED set. - * - * Call the shutdown function for each stack that has SHUTDOWN_NEEDED. - * Set SHUTDOWN_INPROGRESS, drop lock, and after done, - * set SHUTDOWN_COMPLETE - */ -static void -netstack_do_shutdown(netstack_t *ns, int moduleid) -{ - netstack_do_apply(ns, moduleid, B_FALSE, netstack_apply_shutdown); -} - -/* - * Run the destroy function for all modules x stack combinations - * that have NSS_DESTROY_NEEDED set. - * - * Call the destroy function for each stack that has DESTROY_NEEDED. - * Set DESTROY_INPROGRESS, drop lock, and after done, - * set DESTROY_COMPLETE - * - * Since a netstack_t is never reused (when a zone is rebooted it gets - * a new zoneid == netstackid i.e. a new netstack_t is allocated) we leave - * netstack_m_state the way it is i.e. with NSS_DESTROY_COMPLETED set. - */ -static void -netstack_do_destroy(netstack_t *ns, int moduleid) -{ - /* - * Have to walk the moduleids in reverse order since some - * modules make implicit assumptions about the order - */ - netstack_do_apply(ns, moduleid, B_TRUE, netstack_apply_destroy); + return (dropped); } /* @@ -845,7 +953,10 @@ netstack_find_by_zoneid(zoneid_t zoneid) } /* - * Find a stack instance given the zoneid. + * Find a stack instance given the zoneid. Can only be called from + * the create callback. See the comments in zone_find_by_id_nolock why + * that limitation exists. + * * Increases the reference count if found; caller must do a * netstack_rele(). * @@ -853,8 +964,6 @@ netstack_find_by_zoneid(zoneid_t zoneid) * matches. * * Skip the unitialized ones. - * - * NOTE: The caller must hold zonehash_lock. 
*/ netstack_t * netstack_find_by_zoneid_nolock(zoneid_t zoneid) @@ -875,7 +984,7 @@ netstack_find_by_zoneid_nolock(zoneid_t zoneid) else netstack_hold(ns); - zone_rele(zone); + /* zone_find_by_id_nolock does not have a hold on the zone */ return (ns); } @@ -913,6 +1022,7 @@ netstack_rele(netstack_t *ns) netstack_t **nsp; boolean_t found; int refcnt, numzones; + int i; mutex_enter(&ns->netstack_lock); ASSERT(ns->netstack_refcnt > 0); @@ -959,6 +1069,14 @@ netstack_rele(netstack_t *ns) ASSERT(ns->netstack_numzones == 0); ASSERT(ns->netstack_flags & NSF_CLOSING); + + for (i = 0; i < NS_MAX; i++) { + nm_state_t *nms = &ns->netstack_m_state[i]; + + cv_destroy(&nms->nms_cv); + } + mutex_destroy(&ns->netstack_lock); + cv_destroy(&ns->netstack_cv); kmem_free(ns, sizeof (*ns)); } } @@ -996,7 +1114,7 @@ kstat_create_netstack(char *ks_module, int ks_instance, char *ks_name, zoneid_t zoneid = ks_netstackid; return (kstat_create_zone(ks_module, ks_instance, ks_name, - ks_class, ks_type, ks_ndata, ks_flags, zoneid)); + ks_class, ks_type, ks_ndata, ks_flags, zoneid)); } } @@ -1144,7 +1262,9 @@ netstack_find_shared_zoneid(zoneid_t zoneid) /* * Hide the fact that zoneids and netstackids are allocated from * the same space in the current implementation. - * XXX could add checks that the stackid/zoneids are valid... + * We currently do not check that the stackid/zoneids are valid, since there + * is no need for that. But this should only be done for ids that are + * valid. */ zoneid_t netstackid_to_zoneid(netstackid_t stackid) diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c index 033bc96ea3..75354330ef 100644 --- a/usr/src/uts/common/os/zone.c +++ b/usr/src/uts/common/os/zone.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -61,6 +61,10 @@ * initialized zone is added to the list of active zones on the system but * isn't accessible. * + * ZONE_IS_INITIALIZED: Initialization complete except the ZSD callbacks are + * not yet completed. Not possible to enter the zone, but attributes can + * be retrieved. + * * ZONE_IS_READY: zsched (the kernel dummy process for a zone) is * ready. The zone is made visible after the ZSD constructor callbacks are * executed. A zone remains in this state until it transitions into @@ -228,6 +232,7 @@ #include <sys/door.h> #include <sys/cpuvar.h> +#include <sys/sdt.h> #include <sys/uadmin.h> #include <sys/session.h> @@ -313,6 +318,7 @@ evchan_t *zone_event_chan; */ const char *zone_status_table[] = { ZONE_EVENT_UNINITIALIZED, /* uninitialized */ + ZONE_EVENT_INITIALIZED, /* initialized */ ZONE_EVENT_READY, /* ready */ ZONE_EVENT_READY, /* booting */ ZONE_EVENT_RUNNING, /* running */ @@ -351,6 +357,19 @@ static int zone_remove_datalink(zoneid_t, char *); static int zone_check_datalink(zoneid_t *, char *); static int zone_list_datalink(zoneid_t, int *, char *); +typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t); + +static void zsd_apply_all_zones(zsd_applyfn_t *, zone_key_t); +static void zsd_apply_all_keys(zsd_applyfn_t *, zone_t *); +static boolean_t zsd_apply_create(kmutex_t *, boolean_t, zone_t *, zone_key_t); +static boolean_t zsd_apply_shutdown(kmutex_t *, boolean_t, zone_t *, + zone_key_t); +static boolean_t zsd_apply_destroy(kmutex_t *, boolean_t, zone_t *, zone_key_t); +static boolean_t zsd_wait_for_creator(zone_t *, struct zsd_entry *, + kmutex_t *); +static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *, + kmutex_t *); + /* * Bump this number when you alter the zone syscall interfaces; this is * because we need to have support for previous API versions in libc @@ -485,78 +504,54 @@ mount_completed(void) * The locking strategy and overall picture is as follows: * * When someone calls zone_key_create(), a 
template ZSD entry is added to the - * global list "zsd_registered_keys", protected by zsd_key_lock. The - * constructor callback is called immediately on all existing zones, and a - * copy of the ZSD entry added to the per-zone zone_zsd list (protected by - * zone_lock). As this operation requires the list of zones, the list of - * registered keys, and the per-zone list of ZSD entries to remain constant - * throughout the entire operation, it must grab zonehash_lock, zone_lock for - * all existing zones, and zsd_key_lock, in that order. Similar locking is - * needed when zone_key_delete() is called. It is thus sufficient to hold - * zsd_key_lock *or* zone_lock to prevent additions to or removals from the - * per-zone zone_zsd list. + * global list "zsd_registered_keys", protected by zsd_key_lock. While + * holding that lock all the existing zones are marked as + * ZSD_CREATE_NEEDED and a copy of the ZSD entry added to the per-zone + * zone_zsd list (protected by zone_lock). The global list is updated first + * (under zone_key_lock) to make sure that newly created zones use the + * most recent list of keys. Then under zonehash_lock we walk the zones + * and mark them. Similar locking is used in zone_key_delete(). * - * Note that this implementation does not make a copy of the ZSD entry if a - * constructor callback is not provided. A zone_getspecific() on such an - * uninitialized ZSD entry will return NULL. + * The actual create, shutdown, and destroy callbacks are done without + * holding any lock. And zsd_flags are used to ensure that the operations + * completed so that when zone_key_create (and zone_create) is done, as well as + * zone_key_delete (and zone_destroy) is done, all the necessary callbacks + * are completed. * * When new zones are created constructor callbacks for all registered ZSD - * entries will be called. + * entries will be called. 
That also uses the above two phases of marking + * what needs to be done, and then running the callbacks without holding + * any locks. * * The framework does not provide any locking around zone_getspecific() and * zone_setspecific() apart from that needed for internal consistency, so * callers interested in atomic "test-and-set" semantics will need to provide * their own locking. */ -void -zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t), - void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *)) -{ - struct zsd_entry *zsdp; - struct zsd_entry *t; - struct zone *zone; - zsdp = kmem_alloc(sizeof (*zsdp), KM_SLEEP); - zsdp->zsd_data = NULL; - zsdp->zsd_create = create; - zsdp->zsd_shutdown = shutdown; - zsdp->zsd_destroy = destroy; - - mutex_enter(&zonehash_lock); /* stop the world */ - for (zone = list_head(&zone_active); zone != NULL; - zone = list_next(&zone_active, zone)) - mutex_enter(&zone->zone_lock); /* lock all zones */ - - mutex_enter(&zsd_key_lock); - *keyp = zsdp->zsd_key = ++zsd_keyval; - ASSERT(zsd_keyval != 0); - list_insert_tail(&zsd_registered_keys, zsdp); - mutex_exit(&zsd_key_lock); +/* + * Helper function to find the zsd_entry associated with the key in the + * given list. 
+ */ +static struct zsd_entry * +zsd_find(list_t *l, zone_key_t key) +{ + struct zsd_entry *zsd; - if (create != NULL) { - for (zone = list_head(&zone_active); zone != NULL; - zone = list_next(&zone_active, zone)) { - t = kmem_alloc(sizeof (*t), KM_SLEEP); - t->zsd_key = *keyp; - t->zsd_data = (*create)(zone->zone_id); - t->zsd_create = create; - t->zsd_shutdown = shutdown; - t->zsd_destroy = destroy; - list_insert_tail(&zone->zone_zsd, t); + for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) { + if (zsd->zsd_key == key) { + return (zsd); } } - for (zone = list_head(&zone_active); zone != NULL; - zone = list_next(&zone_active, zone)) - mutex_exit(&zone->zone_lock); - mutex_exit(&zonehash_lock); + return (NULL); } /* * Helper function to find the zsd_entry associated with the key in the - * given list. + * given list. Move it to the front of the list. */ static struct zsd_entry * -zsd_find(list_t *l, zone_key_t key) +zsd_find_mru(list_t *l, zone_key_t key) { struct zsd_entry *zsd; @@ -575,9 +570,88 @@ zsd_find(list_t *l, zone_key_t key) return (NULL); } +void +zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t), + void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *)) +{ + struct zsd_entry *zsdp; + struct zsd_entry *t; + struct zone *zone; + zone_key_t key; + + zsdp = kmem_zalloc(sizeof (*zsdp), KM_SLEEP); + zsdp->zsd_data = NULL; + zsdp->zsd_create = create; + zsdp->zsd_shutdown = shutdown; + zsdp->zsd_destroy = destroy; + + /* + * Insert in global list of callbacks. Makes future zone creations + * see it. + */ + mutex_enter(&zsd_key_lock); + *keyp = key = zsdp->zsd_key = ++zsd_keyval; + ASSERT(zsd_keyval != 0); + list_insert_tail(&zsd_registered_keys, zsdp); + mutex_exit(&zsd_key_lock); + + /* + * Insert for all existing zones and mark them as needing + * a create callback. 
+ */ + mutex_enter(&zonehash_lock); /* stop the world */ + for (zone = list_head(&zone_active); zone != NULL; + zone = list_next(&zone_active, zone)) { + zone_status_t status; + + mutex_enter(&zone->zone_lock); + + /* Skip zones that are on the way down or not yet up */ + status = zone_status_get(zone); + if (status >= ZONE_IS_DOWN || + status == ZONE_IS_UNINITIALIZED) { + mutex_exit(&zone->zone_lock); + continue; + } + + t = zsd_find_mru(&zone->zone_zsd, key); + if (t != NULL) { + /* + * A zsd_configure already inserted it after + * we dropped zsd_key_lock above. + */ + mutex_exit(&zone->zone_lock); + continue; + } + t = kmem_zalloc(sizeof (*t), KM_SLEEP); + t->zsd_key = key; + t->zsd_create = create; + t->zsd_shutdown = shutdown; + t->zsd_destroy = destroy; + if (create != NULL) { + t->zsd_flags = ZSD_CREATE_NEEDED; + DTRACE_PROBE2(zsd__create__needed, + zone_t *, zone, zone_key_t, key); + } + list_insert_tail(&zone->zone_zsd, t); + mutex_exit(&zone->zone_lock); + } + mutex_exit(&zonehash_lock); + + if (create != NULL) { + /* Now call the create callback for this key */ + zsd_apply_all_zones(zsd_apply_create, key); + } +} + /* * Function called when a module is being unloaded, or otherwise wishes * to unregister its ZSD key and callbacks. + * + * Remove from the global list and determine the functions that need to + * be called under a global lock. Then call the functions without + * holding any locks. Finally free up the zone_zsd entries. (The apply + * functions need to access the zone_zsd entries to find zsd_data etc.) 
*/ int zone_key_delete(zone_key_t key) @@ -585,65 +659,88 @@ zone_key_delete(zone_key_t key) struct zsd_entry *zsdp = NULL; zone_t *zone; - mutex_enter(&zonehash_lock); /* Zone create/delete waits for us */ - for (zone = list_head(&zone_active); zone != NULL; - zone = list_next(&zone_active, zone)) - mutex_enter(&zone->zone_lock); /* lock all zones */ - mutex_enter(&zsd_key_lock); - zsdp = zsd_find(&zsd_registered_keys, key); - if (zsdp == NULL) - goto notfound; + zsdp = zsd_find_mru(&zsd_registered_keys, key); + if (zsdp == NULL) { + mutex_exit(&zsd_key_lock); + return (-1); + } list_remove(&zsd_registered_keys, zsdp); mutex_exit(&zsd_key_lock); + mutex_enter(&zonehash_lock); for (zone = list_head(&zone_active); zone != NULL; zone = list_next(&zone_active, zone)) { struct zsd_entry *del; - void *data; - - if (!(zone->zone_flags & ZF_DESTROYED)) { - del = zsd_find(&zone->zone_zsd, key); - if (del != NULL) { - data = del->zsd_data; - ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown); - ASSERT(del->zsd_destroy == zsdp->zsd_destroy); - list_remove(&zone->zone_zsd, del); - kmem_free(del, sizeof (*del)); - } else { - data = NULL; - } - if (zsdp->zsd_shutdown) - zsdp->zsd_shutdown(zone->zone_id, data); - if (zsdp->zsd_destroy) - zsdp->zsd_destroy(zone->zone_id, data); + + mutex_enter(&zone->zone_lock); + del = zsd_find_mru(&zone->zone_zsd, key); + if (del == NULL) { + /* + * Somebody else got here first e.g the zone going + * away. 
+ */ + mutex_exit(&zone->zone_lock); + continue; + } + ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown); + ASSERT(del->zsd_destroy == zsdp->zsd_destroy); + if (del->zsd_shutdown != NULL && + (del->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) { + del->zsd_flags |= ZSD_SHUTDOWN_NEEDED; + DTRACE_PROBE2(zsd__shutdown__needed, + zone_t *, zone, zone_key_t, key); + } + if (del->zsd_destroy != NULL && + (del->zsd_flags & ZSD_DESTROY_ALL) == 0) { + del->zsd_flags |= ZSD_DESTROY_NEEDED; + DTRACE_PROBE2(zsd__destroy__needed, + zone_t *, zone, zone_key_t, key); } mutex_exit(&zone->zone_lock); } mutex_exit(&zonehash_lock); kmem_free(zsdp, sizeof (*zsdp)); - return (0); -notfound: - mutex_exit(&zsd_key_lock); + /* Now call the shutdown and destroy callbacks for this key */ + zsd_apply_all_zones(zsd_apply_shutdown, key); + zsd_apply_all_zones(zsd_apply_destroy, key); + + /* Now we can free up the zsdp structures in each zone */ + mutex_enter(&zonehash_lock); for (zone = list_head(&zone_active); zone != NULL; - zone = list_next(&zone_active, zone)) + zone = list_next(&zone_active, zone)) { + struct zsd_entry *del; + + mutex_enter(&zone->zone_lock); + del = zsd_find(&zone->zone_zsd, key); + if (del != NULL) { + list_remove(&zone->zone_zsd, del); + ASSERT(!(del->zsd_flags & ZSD_ALL_INPROGRESS)); + kmem_free(del, sizeof (*del)); + } mutex_exit(&zone->zone_lock); + } mutex_exit(&zonehash_lock); - return (-1); + + return (0); } /* * ZSD counterpart of pthread_setspecific(). + * + * Since all zsd callbacks, including those with no create function, + * have an entry in zone_zsd, if the key is registered it is part of + * the zone_zsd list. + * Return an error if the key wasn't registered.
*/ int zone_setspecific(zone_key_t key, zone_t *zone, const void *data) { struct zsd_entry *t; - struct zsd_entry *zsdp = NULL; mutex_enter(&zone->zone_lock); - t = zsd_find(&zone->zone_zsd, key); + t = zsd_find_mru(&zone->zone_zsd, key); if (t != NULL) { /* * Replace old value with new @@ -652,36 +749,8 @@ zone_setspecific(zone_key_t key, zone_t *zone, const void *data) mutex_exit(&zone->zone_lock); return (0); } - /* - * If there was no previous value, go through the list of registered - * keys. - * - * We avoid grabbing zsd_key_lock until we are sure we need it; this is - * necessary for shutdown callbacks to be able to execute without fear - * of deadlock. - */ - mutex_enter(&zsd_key_lock); - zsdp = zsd_find(&zsd_registered_keys, key); - if (zsdp == NULL) { /* Key was not registered */ - mutex_exit(&zsd_key_lock); - mutex_exit(&zone->zone_lock); - return (-1); - } - - /* - * Add a zsd_entry to this zone, using the template we just retrieved - * to initialize the constructor and destructor(s). - */ - t = kmem_alloc(sizeof (*t), KM_SLEEP); - t->zsd_key = key; - t->zsd_data = (void *)data; - t->zsd_create = zsdp->zsd_create; - t->zsd_shutdown = zsdp->zsd_shutdown; - t->zsd_destroy = zsdp->zsd_destroy; - list_insert_tail(&zone->zone_zsd, t); - mutex_exit(&zsd_key_lock); mutex_exit(&zone->zone_lock); - return (0); + return (-1); } /* @@ -694,7 +763,7 @@ zone_getspecific(zone_key_t key, zone_t *zone) void *data; mutex_enter(&zone->zone_lock); - t = zsd_find(&zone->zone_zsd, key); + t = zsd_find_mru(&zone->zone_zsd, key); data = (t == NULL ? NULL : t->zsd_data); mutex_exit(&zone->zone_lock); return (data); @@ -703,42 +772,41 @@ zone_getspecific(zone_key_t key, zone_t *zone) /* * Function used to initialize a zone's list of ZSD callbacks and data * when the zone is being created. The callbacks are initialized from - * the template list (zsd_registered_keys), and the constructor - * callback executed (if one exists). 
- * - * This is called before the zone is made publicly available, hence no - * need to grab zone_lock. - * - * Although we grab and release zsd_key_lock, new entries cannot be - * added to or removed from the zsd_registered_keys list until we - * release zonehash_lock, so there isn't a window for a - * zone_key_create() to come in after we've dropped zsd_key_lock but - * before the zone is added to the zone list, such that the constructor - * callbacks aren't executed for the new zone. + * the template list (zsd_registered_keys). The constructor callback is + * executed later (once the zone exists and with locks dropped). */ static void zone_zsd_configure(zone_t *zone) { struct zsd_entry *zsdp; struct zsd_entry *t; - zoneid_t zoneid = zone->zone_id; ASSERT(MUTEX_HELD(&zonehash_lock)); ASSERT(list_head(&zone->zone_zsd) == NULL); + mutex_enter(&zone->zone_lock); mutex_enter(&zsd_key_lock); for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL; zsdp = list_next(&zsd_registered_keys, zsdp)) { + /* + * Since this zone is ZONE_IS_UNINITIALIZED, zone_key_create + * should not have added anything to it.
+ */ + ASSERT(zsd_find(&zone->zone_zsd, zsdp->zsd_key) == NULL); + + t = kmem_zalloc(sizeof (*t), KM_SLEEP); + t->zsd_key = zsdp->zsd_key; + t->zsd_create = zsdp->zsd_create; + t->zsd_shutdown = zsdp->zsd_shutdown; + t->zsd_destroy = zsdp->zsd_destroy; if (zsdp->zsd_create != NULL) { - t = kmem_alloc(sizeof (*t), KM_SLEEP); - t->zsd_key = zsdp->zsd_key; - t->zsd_create = zsdp->zsd_create; - t->zsd_data = (*t->zsd_create)(zoneid); - t->zsd_shutdown = zsdp->zsd_shutdown; - t->zsd_destroy = zsdp->zsd_destroy; - list_insert_tail(&zone->zone_zsd, t); + t->zsd_flags = ZSD_CREATE_NEEDED; + DTRACE_PROBE2(zsd__create__needed, + zone_t *, zone, zone_key_t, zsdp->zsd_key); } + list_insert_tail(&zone->zone_zsd, t); } mutex_exit(&zsd_key_lock); + mutex_exit(&zone->zone_lock); } enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY }; @@ -749,70 +817,47 @@ enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY }; static void zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct) { - struct zsd_entry *zsdp; struct zsd_entry *t; - zoneid_t zoneid = zone->zone_id; ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY); ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY); ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN); - mutex_enter(&zone->zone_lock); - if (ct == ZSD_DESTROY) { - if (zone->zone_flags & ZF_DESTROYED) { - /* - * Make sure destructors are only called once. - */ - mutex_exit(&zone->zone_lock); - return; - } - zone->zone_flags |= ZF_DESTROYED; - } - mutex_exit(&zone->zone_lock); - /* - * Both zsd_key_lock and zone_lock need to be held in order to add or - * remove a ZSD key, (either globally as part of - * zone_key_create()/zone_key_delete(), or on a per-zone basis, as is - * possible through zone_setspecific()), so it's sufficient to hold - * zsd_key_lock here. 
- * - * This is a good thing, since we don't want to recursively try to grab - * zone_lock if a callback attempts to do something like a crfree() or - * zone_rele(). + * Run the callback solely based on what is registered for the zone + * in zone_zsd. The global list can change independently of this + * as keys are registered and unregistered and we don't register new + * callbacks for a zone that is in the process of going away. */ - mutex_enter(&zsd_key_lock); - for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL; - zsdp = list_next(&zsd_registered_keys, zsdp)) { - zone_key_t key = zsdp->zsd_key; + mutex_enter(&zone->zone_lock); + for (t = list_head(&zone->zone_zsd); t != NULL; + t = list_next(&zone->zone_zsd, t)) { + zone_key_t key = t->zsd_key; /* Skip if no callbacks registered */ - if (ct == ZSD_SHUTDOWN && zsdp->zsd_shutdown == NULL) - continue; - if (ct == ZSD_DESTROY && zsdp->zsd_destroy == NULL) - continue; - /* - * Call the callback with the zone-specific data if we can find - * any, otherwise with NULL. 
- */ - t = zsd_find(&zone->zone_zsd, key); - if (t != NULL) { - if (ct == ZSD_SHUTDOWN) { - t->zsd_shutdown(zoneid, t->zsd_data); - } else { - ASSERT(ct == ZSD_DESTROY); - t->zsd_destroy(zoneid, t->zsd_data); + + if (ct == ZSD_SHUTDOWN) { + if (t->zsd_shutdown != NULL && + (t->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) { + t->zsd_flags |= ZSD_SHUTDOWN_NEEDED; + DTRACE_PROBE2(zsd__shutdown__needed, + zone_t *, zone, zone_key_t, key); } } else { - if (ct == ZSD_SHUTDOWN) { - zsdp->zsd_shutdown(zoneid, NULL); - } else { - ASSERT(ct == ZSD_DESTROY); - zsdp->zsd_destroy(zoneid, NULL); + if (t->zsd_destroy != NULL && + (t->zsd_flags & ZSD_DESTROY_ALL) == 0) { + t->zsd_flags |= ZSD_DESTROY_NEEDED; + DTRACE_PROBE2(zsd__destroy__needed, + zone_t *, zone, zone_key_t, key); } } } - mutex_exit(&zsd_key_lock); + mutex_exit(&zone->zone_lock); + + /* Now call the shutdown and destroy callback for this key */ + zsd_apply_all_keys(zsd_apply_shutdown, zone); + zsd_apply_all_keys(zsd_apply_destroy, zone); + } /* @@ -827,12 +872,379 @@ zone_free_zsd(zone_t *zone) /* * Free all the zsd_entry's we had on this zone. */ + mutex_enter(&zone->zone_lock); for (t = list_head(&zone->zone_zsd); t != NULL; t = next) { next = list_next(&zone->zone_zsd, t); list_remove(&zone->zone_zsd, t); + ASSERT(!(t->zsd_flags & ZSD_ALL_INPROGRESS)); kmem_free(t, sizeof (*t)); } list_destroy(&zone->zone_zsd); + mutex_exit(&zone->zone_lock); + +} + +/* + * Apply a function to all zones for particular key value. + * + * The applyfn has to drop zonehash_lock if it does some work, and + * then reacquire it before it returns. + * When the lock is dropped we don't follow list_next even + * if it is possible to do so without any hazards. This is + * because we want the design to allow for the list of zones + * to change in any arbitrary way during the time the + * lock was dropped. 
+ * + * It is safe to restart the loop at list_head since the applyfn + * changes the zsd_flags as it does work, so a subsequent + * pass through will have no effect in applyfn, hence the loop will terminate + * in at worst O(N^2). + */ +static void +zsd_apply_all_zones(zsd_applyfn_t *applyfn, zone_key_t key) +{ + zone_t *zone; + + mutex_enter(&zonehash_lock); + zone = list_head(&zone_active); + while (zone != NULL) { + if ((applyfn)(&zonehash_lock, B_FALSE, zone, key)) { + /* Lock dropped - restart at head */ + zone = list_head(&zone_active); + } else { + zone = list_next(&zone_active, zone); + } + } + mutex_exit(&zonehash_lock); +} + +/* + * Apply a function to all keys for a particular zone. + * + * The applyfn has to drop zonehash_lock if it does some work, and + * then reacquire it before it returns. + * When the lock is dropped we don't follow list_next even + * if it is possible to do so without any hazards. This is + * because we want the design to allow for the list of zsd callbacks + * to change in any arbitrary way during the time the + * lock was dropped. + * + * It is safe to restart the loop at list_head since the applyfn + * changes the zsd_flags as it does work, so a subsequent + * pass through will have no effect in applyfn, hence the loop will terminate + * in at worst O(N^2). + */ +static void +zsd_apply_all_keys(zsd_applyfn_t *applyfn, zone_t *zone) +{ + struct zsd_entry *t; + + mutex_enter(&zone->zone_lock); + t = list_head(&zone->zone_zsd); + while (t != NULL) { + if ((applyfn)(NULL, B_TRUE, zone, t->zsd_key)) { + /* Lock dropped - restart at head */ + t = list_head(&zone->zone_zsd); + } else { + t = list_next(&zone->zone_zsd, t); + } + } + mutex_exit(&zone->zone_lock); +} + +/* + * Call the create function for the zone and key if CREATE_NEEDED + * is set. 
+ * If some other thread gets here first and sets CREATE_INPROGRESS, then + * we wait for that thread to complete so that we can ensure that + * all the callbacks are done when we've looped over all zones/keys. + * + * When we call the create function, we drop the global lock held by the + * caller, and return true to tell the caller it needs to re-evaluate the + * state. + * If the caller holds zone_lock then zone_lock_held is set, and zone_lock + * remains held on exit. + */ +static boolean_t +zsd_apply_create(kmutex_t *lockp, boolean_t zone_lock_held, + zone_t *zone, zone_key_t key) +{ + void *result; + struct zsd_entry *t; + boolean_t dropped; + + if (lockp != NULL) { + ASSERT(MUTEX_HELD(lockp)); + } + if (zone_lock_held) { + ASSERT(MUTEX_HELD(&zone->zone_lock)); + } else { + mutex_enter(&zone->zone_lock); + } + + t = zsd_find(&zone->zone_zsd, key); + if (t == NULL) { + /* + * Somebody else got here first e.g. the zone going + * away. + */ + if (!zone_lock_held) + mutex_exit(&zone->zone_lock); + return (B_FALSE); + } + dropped = B_FALSE; + if (zsd_wait_for_inprogress(zone, t, lockp)) + dropped = B_TRUE; + + if (t->zsd_flags & ZSD_CREATE_NEEDED) { + t->zsd_flags &= ~ZSD_CREATE_NEEDED; + t->zsd_flags |= ZSD_CREATE_INPROGRESS; + DTRACE_PROBE2(zsd__create__inprogress, + zone_t *, zone, zone_key_t, key); + mutex_exit(&zone->zone_lock); + if (lockp != NULL) + mutex_exit(lockp); + + dropped = B_TRUE; + ASSERT(t->zsd_create != NULL); + DTRACE_PROBE2(zsd__create__start, + zone_t *, zone, zone_key_t, key); + + result = (*t->zsd_create)(zone->zone_id); + + DTRACE_PROBE2(zsd__create__end, + zone_t *, zone, voidn *, result); + + ASSERT(result != NULL); + if (lockp != NULL) + mutex_enter(lockp); + mutex_enter(&zone->zone_lock); + t->zsd_data = result; + t->zsd_flags &= ~ZSD_CREATE_INPROGRESS; + t->zsd_flags |= ZSD_CREATE_COMPLETED; + cv_broadcast(&t->zsd_cv); + DTRACE_PROBE2(zsd__create__completed, + zone_t *, zone, zone_key_t, key); + } + if (!zone_lock_held) +
mutex_exit(&zone->zone_lock); + return (dropped); +} + +/* + * Call the shutdown function for the zone and key if SHUTDOWN_NEEDED + * is set. + * If some other thread gets here first and sets *_INPROGRESS, then + * we wait for that thread to complete so that we can ensure that + * all the callbacks are done when we've looped over all zones/keys. + * + * When we call the shutdown function, we drop the global lock held by the + * caller, and return true to tell the caller it needs to re-evaluate the + * state. + * If the caller holds zone_lock then zone_lock_held is set, and zone_lock + * remains held on exit. + */ +static boolean_t +zsd_apply_shutdown(kmutex_t *lockp, boolean_t zone_lock_held, + zone_t *zone, zone_key_t key) +{ + struct zsd_entry *t; + void *data; + boolean_t dropped; + + if (lockp != NULL) { + ASSERT(MUTEX_HELD(lockp)); + } + if (zone_lock_held) { + ASSERT(MUTEX_HELD(&zone->zone_lock)); + } else { + mutex_enter(&zone->zone_lock); + } + + t = zsd_find(&zone->zone_zsd, key); + if (t == NULL) { + /* + * Somebody else got here first e.g. the zone going + * away.
+ */ + if (!zone_lock_held) + mutex_exit(&zone->zone_lock); + return (B_FALSE); + } + dropped = B_FALSE; + if (zsd_wait_for_creator(zone, t, lockp)) + dropped = B_TRUE; + + if (zsd_wait_for_inprogress(zone, t, lockp)) + dropped = B_TRUE; + + if (t->zsd_flags & ZSD_SHUTDOWN_NEEDED) { + t->zsd_flags &= ~ZSD_SHUTDOWN_NEEDED; + t->zsd_flags |= ZSD_SHUTDOWN_INPROGRESS; + DTRACE_PROBE2(zsd__shutdown__inprogress, + zone_t *, zone, zone_key_t, key); + mutex_exit(&zone->zone_lock); + if (lockp != NULL) + mutex_exit(lockp); + dropped = B_TRUE; + + ASSERT(t->zsd_shutdown != NULL); + data = t->zsd_data; + + DTRACE_PROBE2(zsd__shutdown__start, + zone_t *, zone, zone_key_t, key); + + (t->zsd_shutdown)(zone->zone_id, data); + DTRACE_PROBE2(zsd__shutdown__end, + zone_t *, zone, zone_key_t, key); + + if (lockp != NULL) + mutex_enter(lockp); + mutex_enter(&zone->zone_lock); + t->zsd_flags &= ~ZSD_SHUTDOWN_INPROGRESS; + t->zsd_flags |= ZSD_SHUTDOWN_COMPLETED; + cv_broadcast(&t->zsd_cv); + DTRACE_PROBE2(zsd__shutdown__completed, + zone_t *, zone, zone_key_t, key); + } + if (!zone_lock_held) + mutex_exit(&zone->zone_lock); + return (dropped); +} + +/* + * Call the destroy function for the zone and key if DESTROY_NEEDED + * is set. + * If some other thread gets here first and sets *_INPROGRESS, then + * we wait for that thread to complete so that we can ensure that + * all the callbacks are done when we've looped over all zones/keys. + * + * When we call the destroy function, we drop the global lock held by the + * caller, and return true to tell the caller it needs to re-evaluate the + * state. + * If the caller holds zone_lock then zone_lock_held is set, and zone_lock + * remains held on exit.
+ */ +static boolean_t +zsd_apply_destroy(kmutex_t *lockp, boolean_t zone_lock_held, + zone_t *zone, zone_key_t key) +{ + struct zsd_entry *t; + void *data; + boolean_t dropped; + + if (lockp != NULL) { + ASSERT(MUTEX_HELD(lockp)); + } + if (zone_lock_held) { + ASSERT(MUTEX_HELD(&zone->zone_lock)); + } else { + mutex_enter(&zone->zone_lock); + } + + t = zsd_find(&zone->zone_zsd, key); + if (t == NULL) { + /* + * Somebody else got here first e.g the zone going + * away. + */ + if (!zone_lock_held) + mutex_exit(&zone->zone_lock); + return (B_FALSE); + } + dropped = B_FALSE; + if (zsd_wait_for_creator(zone, t, lockp)) + dropped = B_TRUE; + + if (zsd_wait_for_inprogress(zone, t, lockp)) + dropped = B_TRUE; + + if (t->zsd_flags & ZSD_DESTROY_NEEDED) { + t->zsd_flags &= ~ZSD_DESTROY_NEEDED; + t->zsd_flags |= ZSD_DESTROY_INPROGRESS; + DTRACE_PROBE2(zsd__destroy__inprogress, + zone_t *, zone, zone_key_t, key); + mutex_exit(&zone->zone_lock); + if (lockp != NULL) + mutex_exit(lockp); + dropped = B_TRUE; + + ASSERT(t->zsd_destroy != NULL); + data = t->zsd_data; + DTRACE_PROBE2(zsd__destroy__start, + zone_t *, zone, zone_key_t, key); + + (t->zsd_destroy)(zone->zone_id, data); + DTRACE_PROBE2(zsd__destroy__end, + zone_t *, zone, zone_key_t, key); + + if (lockp != NULL) + mutex_enter(lockp); + mutex_enter(&zone->zone_lock); + t->zsd_data = NULL; + t->zsd_flags &= ~ZSD_DESTROY_INPROGRESS; + t->zsd_flags |= ZSD_DESTROY_COMPLETED; + cv_broadcast(&t->zsd_cv); + DTRACE_PROBE2(zsd__destroy__completed, + zone_t *, zone, zone_key_t, key); + } + if (!zone_lock_held) + mutex_exit(&zone->zone_lock); + return (dropped); +} + +/* + * Wait for any CREATE_NEEDED flag to be cleared. + * Returns true if lockp was temporarily dropped while waiting. 
+ */ +static boolean_t +zsd_wait_for_creator(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp) +{ + boolean_t dropped = B_FALSE; + + while (t->zsd_flags & ZSD_CREATE_NEEDED) { + DTRACE_PROBE2(zsd__wait__for__creator, + zone_t *, zone, struct zsd_entry *, t); + if (lockp != NULL) { + dropped = B_TRUE; + mutex_exit(lockp); + } + cv_wait(&t->zsd_cv, &zone->zone_lock); + if (lockp != NULL) { + /* First drop zone_lock to preserve order */ + mutex_exit(&zone->zone_lock); + mutex_enter(lockp); + mutex_enter(&zone->zone_lock); + } + } + return (dropped); +} + +/* + * Wait for any INPROGRESS flag to be cleared. + * Returns true if lockp was temporarily dropped while waiting. + */ +static boolean_t +zsd_wait_for_inprogress(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp) +{ + boolean_t dropped = B_FALSE; + + while (t->zsd_flags & ZSD_ALL_INPROGRESS) { + DTRACE_PROBE2(zsd__wait__for__inprogress, + zone_t *, zone, struct zsd_entry *, t); + if (lockp != NULL) { + dropped = B_TRUE; + mutex_exit(lockp); + } + cv_wait(&t->zsd_cv, &zone->zone_lock); + if (lockp != NULL) { + /* First drop zone_lock to preserve order */ + mutex_exit(&zone->zone_lock); + mutex_enter(lockp); + mutex_enter(&zone->zone_lock); + } + } + return (dropped); } /* @@ -2960,10 +3372,15 @@ zsched(void *arg) /* * Tell the world that we're done setting up. * - * At this point we want to set the zone status to ZONE_IS_READY + * At this point we want to set the zone status to ZONE_IS_INITIALIZED * and atomically set the zone's processor set visibility. Once * we drop pool_lock() this zone will automatically get updated * to reflect any future changes to the pools configuration. + * + * Note that after we drop the locks below (zonehash_lock in + * particular) other operations such as a zone_getattr call can + * now proceed and observe the zone. That is the reason for doing a + * state transition to the INITIALIZED state. 
*/ pool_lock(); mutex_enter(&cpu_lock); @@ -2974,12 +3391,21 @@ zsched(void *arg) zone_pset_set(zone, pool_default->pool_pset->pset_id); mutex_enter(&zone_status_lock); ASSERT(zone_status_get(zone) == ZONE_IS_UNINITIALIZED); - zone_status_set(zone, ZONE_IS_READY); + zone_status_set(zone, ZONE_IS_INITIALIZED); mutex_exit(&zone_status_lock); mutex_exit(&zonehash_lock); mutex_exit(&cpu_lock); pool_unlock(); + /* Now call the create callback for this key */ + zsd_apply_all_keys(zsd_apply_create, zone); + + /* The callbacks are complete. Mark ZONE_IS_READY */ + mutex_enter(&zone_status_lock); + ASSERT(zone_status_get(zone) == ZONE_IS_INITIALIZED); + zone_status_set(zone, ZONE_IS_READY); + mutex_exit(&zone_status_lock); + /* * Once we see the zone transition to the ZONE_IS_BOOTING state, * we launch init, and set the state to running. @@ -4071,7 +4497,7 @@ zone_getattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) return (set_errno(EINVAL)); } zone_status = zone_status_get(zone); - if (zone_status < ZONE_IS_READY) { + if (zone_status < ZONE_IS_INITIALIZED) { mutex_exit(&zonehash_lock); return (set_errno(EINVAL)); } @@ -5698,21 +6124,28 @@ zone_list_datalink(zoneid_t zoneid, int *nump, char *buf) /* * Public interface for looking up a zone by zoneid. It's a customized version - * for netstack_zone_create(), it: - * 1. Doesn't acquire the zonehash_lock, since it is called from - * zone_key_create() or zone_zsd_configure(), lock already held. - * 2. Doesn't check the status of the zone. - * 3. It will be called even before zone_init is called, in that case the + * for netstack_zone_create(). It can only be called from the zsd create + * callbacks, since it doesn't have reference on the zone structure hence if + * it is called elsewhere the zone could disappear after the zonehash_lock + * is dropped. + * + * Furthermore it + * 1. Doesn't check the status of the zone. + * 2. 
It will be called even before zone_init is called, in that case the * address of zone0 is returned directly, and netstack_zone_create() * will only assign a value to zone0.zone_netstack, won't break anything. + * 3. Returns without the zone being held. */ zone_t * zone_find_by_id_nolock(zoneid_t zoneid) { - ASSERT(MUTEX_HELD(&zonehash_lock)); + zone_t *zone; + mutex_enter(&zonehash_lock); if (zonehashbyid == NULL) - return (&zone0); + zone = &zone0; else - return (zone_find_all_by_id(zoneid)); + zone = zone_find_all_by_id(zoneid); + mutex_exit(&zonehash_lock); + return (zone); } diff --git a/usr/src/uts/common/sys/netstack.h b/usr/src/uts/common/sys/netstack.h index 795cf37eb5..9bd7701693 100644 --- a/usr/src/uts/common/sys/netstack.h +++ b/usr/src/uts/common/sys/netstack.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_NETSTACK_H @@ -84,6 +84,46 @@ typedef id_t netstackid_t; #define NS_MAX (NS_STR+1) /* + * State maintained for each module which tracks the state of + * the create, shutdown and destroy callbacks. + * + * Keeps track of pending actions to avoid holding locks when + * calling into the create/shutdown/destroy functions in the module. 
+ */ +#ifdef _KERNEL +typedef struct { + uint16_t nms_flags; + kcondvar_t nms_cv; +} nm_state_t; + +/* + * nms_flags + */ +#define NSS_CREATE_NEEDED 0x0001 +#define NSS_CREATE_INPROGRESS 0x0002 +#define NSS_CREATE_COMPLETED 0x0004 +#define NSS_SHUTDOWN_NEEDED 0x0010 +#define NSS_SHUTDOWN_INPROGRESS 0x0020 +#define NSS_SHUTDOWN_COMPLETED 0x0040 +#define NSS_DESTROY_NEEDED 0x0100 +#define NSS_DESTROY_INPROGRESS 0x0200 +#define NSS_DESTROY_COMPLETED 0x0400 + +#define NSS_CREATE_ALL \ + (NSS_CREATE_NEEDED|NSS_CREATE_INPROGRESS|NSS_CREATE_COMPLETED) +#define NSS_SHUTDOWN_ALL \ + (NSS_SHUTDOWN_NEEDED|NSS_SHUTDOWN_INPROGRESS|NSS_SHUTDOWN_COMPLETED) +#define NSS_DESTROY_ALL \ + (NSS_DESTROY_NEEDED|NSS_DESTROY_INPROGRESS|NSS_DESTROY_COMPLETED) + +#define NSS_ALL_INPROGRESS \ + (NSS_CREATE_INPROGRESS|NSS_SHUTDOWN_INPROGRESS|NSS_DESTROY_INPROGRESS) +#else +/* User-level compile like IP Filter needs a netstack_t. Dummy */ +typedef uint_t nm_state_t; +#endif /* _KERNEL */ + +/* * One for every netstack in the system. 
* We use a union so that the compilar and lint can provide type checking - * in principle we could have @@ -136,7 +176,7 @@ struct netstack { #define netstack_ipf netstack_u.nu_s.nu_ipf #define netstack_str netstack_u.nu_s.nu_str - uint16_t netstack_m_state[NS_MAX]; /* module state */ + nm_state_t netstack_m_state[NS_MAX]; /* module state */ kmutex_t netstack_lock; struct netstack *netstack_next; @@ -144,34 +184,23 @@ struct netstack { int netstack_numzones; /* Number of zones using this */ int netstack_refcnt; /* Number of hold-rele */ int netstack_flags; /* See below */ + +#ifdef _KERNEL + /* Needed to ensure that we run the callback functions in order */ + kcondvar_t netstack_cv; +#endif }; typedef struct netstack netstack_t; /* netstack_flags values */ -#define NSF_UNINIT 0x01 /* Not initialized */ -#define NSF_CLOSING 0x02 /* Going away */ +#define NSF_UNINIT 0x01 /* Not initialized */ +#define NSF_CLOSING 0x02 /* Going away */ +#define NSF_ZONE_CREATE 0x04 /* create callbacks inprog */ +#define NSF_ZONE_SHUTDOWN 0x08 /* shutdown callbacks */ +#define NSF_ZONE_DESTROY 0x10 /* destroy callbacks */ -/* - * State for each module for each stack - netstack_m_state[moduleid] - * Keeps track of pending actions to avoid holding looks when - * calling into the create/shutdown/destroy functions in the module. 
- */
-#define NSS_CREATE_NEEDED 0x0001
-#define NSS_CREATE_INPROGRESS 0x0002
-#define NSS_CREATE_COMPLETED 0x0004
-#define NSS_SHUTDOWN_NEEDED 0x0010
-#define NSS_SHUTDOWN_INPROGRESS 0x0020
-#define NSS_SHUTDOWN_COMPLETED 0x0040
-#define NSS_DESTROY_NEEDED 0x0100
-#define NSS_DESTROY_INPROGRESS 0x0200
-#define NSS_DESTROY_COMPLETED 0x0400
-
-#define NSS_CREATE_ALL \
-	(NSS_CREATE_NEEDED|NSS_CREATE_INPROGRESS|NSS_CREATE_COMPLETED)
-#define NSS_SHUTDOWN_ALL \
-	(NSS_SHUTDOWN_NEEDED|NSS_SHUTDOWN_INPROGRESS|NSS_SHUTDOWN_COMPLETED)
-#define NSS_DESTROY_ALL \
-	(NSS_DESTROY_NEEDED|NSS_DESTROY_INPROGRESS|NSS_DESTROY_COMPLETED)
+#define NSF_ZONE_INPROGRESS \
+	(NSF_ZONE_CREATE|NSF_ZONE_SHUTDOWN|NSF_ZONE_DESTROY)
 
 /*
  * One for each of the NS_* values.
@@ -185,6 +214,7 @@ struct netstack_registry {
 
 /* nr_flags values */
 #define NRF_REGISTERED 0x01
+#define NRF_DYING 0x02 /* No new creates */
 
 /*
  * To support kstat_create_netstack() using kstat_add_zone we need
diff --git a/usr/src/uts/common/sys/zone.h b/usr/src/uts/common/sys/zone.h
index 7e7dd9e88a..0a93e8651e 100644
--- a/usr/src/uts/common/sys/zone.h
+++ b/usr/src/uts/common/sys/zone.h
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -106,6 +106,7 @@ extern "C" {
 #define ZONE_EVENT_STATUS_SUBCLASS "change"
 
 #define ZONE_EVENT_UNINITIALIZED "uninitialized"
+#define ZONE_EVENT_INITIALIZED "initialized"
 #define ZONE_EVENT_READY "ready"
 #define ZONE_EVENT_RUNNING "running"
 #define ZONE_EVENT_SHUTTING_DOWN "shutting_down"
@@ -201,6 +202,7 @@ typedef struct {
 /* zone_status */
 typedef enum {
 	ZONE_IS_UNINITIALIZED = 0,
+	ZONE_IS_INITIALIZED,
 	ZONE_IS_READY,
 	ZONE_IS_BOOTING,
 	ZONE_IS_RUNNING,
@@ -268,7 +270,6 @@ typedef struct zone_cmd_rval {
 #define ZONE_DOOR_PATH ZONES_TMPDIR "/%s.zoneadmd_door"
 
 /* zone_flags */
-#define ZF_DESTROYED 0x1 /* ZSD destructor callbacks run */
 #define ZF_HASHED_LABEL 0x2 /* zone has a unique label */
 #define ZF_IS_SCRATCH 0x4 /* scratch zone */
 #define ZF_NET_EXCL 0x8 /* Zone has an exclusive IP stack */
@@ -476,6 +477,13 @@ extern int zone_setspecific(zone_key_t, zone_t *, const void *);
 /*
  * The definition of a zsd_entry is truly private to zone.c and is only
  * placed here so it can be shared with mdb.
+ *
+ * State maintained for each zone times each registered key, which tracks
+ * the state of the create, shutdown and destroy callbacks.
+ *
+ * zsd_flags is used to keep track of pending actions to avoid holding locks
+ * when calling the create/shutdown/destroy callbacks, since doing so
+ * could lead to deadlocks.
  */
 struct zsd_entry {
 	zone_key_t zsd_key; /* Key used to lookup value */
@@ -488,9 +496,34 @@ struct zsd_entry {
 	void (*zsd_shutdown)(zoneid_t, void *);
 	void (*zsd_destroy)(zoneid_t, void *);
 	list_node_t zsd_linkage;
+	uint16_t zsd_flags; /* See below */
+	kcondvar_t zsd_cv;
 };
 
 /*
+ * zsd_flags
+ */
+#define ZSD_CREATE_NEEDED 0x0001
+#define ZSD_CREATE_INPROGRESS 0x0002
+#define ZSD_CREATE_COMPLETED 0x0004
+#define ZSD_SHUTDOWN_NEEDED 0x0010
+#define ZSD_SHUTDOWN_INPROGRESS 0x0020
+#define ZSD_SHUTDOWN_COMPLETED 0x0040
+#define ZSD_DESTROY_NEEDED 0x0100
+#define ZSD_DESTROY_INPROGRESS 0x0200
+#define ZSD_DESTROY_COMPLETED 0x0400
+
+#define ZSD_CREATE_ALL \
+	(ZSD_CREATE_NEEDED|ZSD_CREATE_INPROGRESS|ZSD_CREATE_COMPLETED)
+#define ZSD_SHUTDOWN_ALL \
+	(ZSD_SHUTDOWN_NEEDED|ZSD_SHUTDOWN_INPROGRESS|ZSD_SHUTDOWN_COMPLETED)
+#define ZSD_DESTROY_ALL \
+	(ZSD_DESTROY_NEEDED|ZSD_DESTROY_INPROGRESS|ZSD_DESTROY_COMPLETED)
+
+#define ZSD_ALL_INPROGRESS \
+	(ZSD_CREATE_INPROGRESS|ZSD_SHUTDOWN_INPROGRESS|ZSD_DESTROY_INPROGRESS)
+
+/*
  * Macros to help with zone visibility restrictions.
  */
 
|