diff options
Diffstat (limited to 'usr/src/uts/common/os/zone.c')
| -rw-r--r-- | usr/src/uts/common/os/zone.c | 322 |
1 files changed, 311 insertions, 11 deletions
diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c index bbc91e3fa7..79ccd94ae4 100644 --- a/usr/src/uts/common/os/zone.c +++ b/usr/src/uts/common/os/zone.c @@ -223,6 +223,7 @@ #include <sys/pool.h> #include <sys/pool_pset.h> #include <sys/pset.h> +#include <sys/strlog.h> #include <sys/sysmacros.h> #include <sys/callb.h> #include <sys/vmparam.h> @@ -249,6 +250,17 @@ #include <vm/seg.h> #include <sys/mac.h> +/* + * This constant specifies the number of seconds that threads waiting for + * subsystems to release a zone's general-purpose references will wait before + * they log the zone's reference counts. The constant's value shouldn't + * be so small that reference counts are unnecessarily reported for zones + * whose references are slowly released. On the other hand, it shouldn't be so + * large that users reboot their systems out of frustration over hung zones + * before the system logs the zones' reference counts. + */ +#define ZONE_DESTROY_TIMEOUT_SECS 60 + /* List of data link IDs which are accessible from the zone */ typedef struct zone_dl { datalink_id_t zdl_id; @@ -338,6 +350,20 @@ const char *zone_status_table[] = { }; /* + * This array contains the names of the subsystems listed in zone_ref_subsys_t + * (see sys/zone.h). + */ +static char *zone_ref_subsys_names[] = { + "NFS", /* ZONE_REF_NFS */ + "NFSv4", /* ZONE_REF_NFSV4 */ + "SMBFS", /* ZONE_REF_SMBFS */ + "MNTFS", /* ZONE_REF_MNTFS */ + "LOFI", /* ZONE_REF_LOFI */ + "VFS", /* ZONE_REF_VFS */ + "IPC" /* ZONE_REF_IPC */ +}; + +/* * This isn't static so lint doesn't complain. */ rctl_hndl_t rc_zone_cpu_shares; @@ -1881,6 +1907,8 @@ zone_zsd_init(void) zone0.zone_lockedmem_kstat = NULL; zone0.zone_swapresv_kstat = NULL; zone0.zone_nprocs_kstat = NULL; + list_create(&zone0.zone_ref_list, sizeof (zone_ref_t), + offsetof(zone_ref_t, zref_linkage)); list_create(&zone0.zone_zsd, sizeof (struct zsd_entry), offsetof(struct zsd_entry, zsd_linkage)); list_insert_head(&zone_active, &zone0); @@ -2144,6 +2172,7 @@ zone_free(zone_t *zone) ASSERT(zone->zone_kcred == NULL); ASSERT(zone_status_get(zone) == ZONE_IS_DEAD || zone_status_get(zone) == ZONE_IS_UNINITIALIZED); + ASSERT(list_is_empty(&zone->zone_ref_list)); /* * Remove any zone caps. @@ -2160,6 +2189,7 @@ zone_free(zone_t *zone) mutex_exit(&zone_deathrow_lock); } + list_destroy(&zone->zone_ref_list); zone_free_zsd(zone); zone_free_datasets(zone); list_destroy(&zone->zone_dl_list); @@ -2494,6 +2524,11 @@ zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status) * to force halt/reboot to block waiting for the zone_cred_ref to drop * to 0. This can be useful to flush out other sources of cached creds * that may be less innocuous than the driver case. + * + * Zones also provide a tracked reference counting mechanism in which zone + * references are represented by "crumbs" (zone_ref structures). Crumbs help + * debuggers determine the sources of leaked zone references. See + * zone_hold_ref() and zone_rele_ref() below for more information. */ int zone_wait_for_cred = 0; @@ -2506,6 +2541,14 @@ zone_hold_locked(zone_t *z) ASSERT(z->zone_ref != 0); } +/* + * Increment the specified zone's reference count. The zone's zone_t structure + * will not be freed as long as the zone's reference count is nonzero. + * Decrement the zone's reference count via zone_rele(). + * + * NOTE: This function should only be used to hold zones for short periods of + * time. Use zone_hold_ref() if the zone must be held for a long time. + */ void zone_hold(zone_t *z) { @@ -2522,14 +2565,27 @@ zone_hold(zone_t *z) #define ZONE_IS_UNREF(zone) ((zone)->zone_ref == 1 && \ (!zone_wait_for_cred || (zone)->zone_cred_ref == 0)) -void -zone_rele(zone_t *z) +/* + * Common zone reference release function invoked by zone_rele() and + * zone_rele_ref(). If subsys is ZONE_REF_NUM_SUBSYS, then the specified + * zone's subsystem-specific reference counters are not affected by the + * release. If ref is not NULL, then the zone_ref_t to which it refers is + * removed from the specified zone's reference list. ref must be non-NULL iff + * subsys is not ZONE_REF_NUM_SUBSYS. + */ +static void +zone_rele_common(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys) { boolean_t wakeup; mutex_enter(&z->zone_lock); ASSERT(z->zone_ref != 0); z->zone_ref--; + if (subsys != ZONE_REF_NUM_SUBSYS) { + ASSERT(z->zone_subsys_ref[subsys] != 0); + z->zone_subsys_ref[subsys]--; + list_remove(&z->zone_ref_list, ref); + } if (z->zone_ref == 0 && z->zone_cred_ref == 0) { /* no more refs, free the structure */ mutex_exit(&z->zone_lock); @@ -2551,6 +2607,83 @@ zone_rele(zone_t *z) } } +/* + * Decrement the specified zone's reference count. The specified zone will + * cease to exist after this function returns if the reference count drops to + * zero. This function should be paired with zone_hold(). + */ +void +zone_rele(zone_t *z) +{ + zone_rele_common(z, NULL, ZONE_REF_NUM_SUBSYS); +} + +/* + * Initialize a zone reference structure. This function must be invoked for + * a reference structure before the structure is passed to zone_hold_ref(). + */ +void +zone_init_ref(zone_ref_t *ref) +{ + ref->zref_zone = NULL; + list_link_init(&ref->zref_linkage); +} + +/* + * Acquire a reference to zone z. The caller must specify the + * zone_ref_subsys_t constant associated with its subsystem. The specified + * zone_ref_t structure will represent a reference to the specified zone. Use + * zone_rele_ref() to release the reference. + * + * The referenced zone_t structure will not be freed as long as the zone_t's + * zone_status field is not ZONE_IS_DEAD and the zone has outstanding + * references. + * + * NOTE: The zone_ref_t structure must be initialized before it is used. + * See zone_init_ref() above. + */ +void +zone_hold_ref(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys) +{ + ASSERT(subsys >= 0 && subsys < ZONE_REF_NUM_SUBSYS); + + /* + * Prevent consumers from reusing a reference structure before + * releasing it. + */ + VERIFY(ref->zref_zone == NULL); + + ref->zref_zone = z; + mutex_enter(&z->zone_lock); + zone_hold_locked(z); + z->zone_subsys_ref[subsys]++; + ASSERT(z->zone_subsys_ref[subsys] != 0); + list_insert_head(&z->zone_ref_list, ref); + mutex_exit(&z->zone_lock); +} + +/* + * Release the zone reference represented by the specified zone_ref_t. + * The reference is invalid after it's released; however, the zone_ref_t + * structure can be reused without having to invoke zone_init_ref(). + * subsys should be the same value that was passed to zone_hold_ref() + * when the reference was acquired. + */ +void +zone_rele_ref(zone_ref_t *ref, zone_ref_subsys_t subsys) +{ + zone_rele_common(ref->zref_zone, ref, subsys); + + /* + * Set the zone_ref_t's zref_zone field to NULL to generate panics + * when consumers dereference the reference. This helps us catch + * consumers who use released references. Furthermore, this lets + * consumers reuse the zone_ref_t structure without having to + * invoke zone_init_ref(). + */ + ref->zref_zone = NULL; +} + void zone_cred_hold(zone_t *z) { @@ -3763,10 +3896,12 @@ static int zone_set_privset(zone_t *zone, const priv_set_t *zone_privs, size_t zone_privssz) { - priv_set_t *privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP); + priv_set_t *privs; if (zone_privssz < sizeof (priv_set_t)) - return (set_errno(ENOMEM)); + return (ENOMEM); + + privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP); if (copyin(zone_privs, privs, sizeof (priv_set_t))) { kmem_free(privs, sizeof (priv_set_t)); @@ -3982,6 +4117,8 @@ zone_create(const char *zone_name, const char *zone_root, mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL); + list_create(&zone->zone_ref_list, sizeof (zone_ref_t), + offsetof(zone_ref_t, zref_linkage)); list_create(&zone->zone_zsd, sizeof (struct zsd_entry), offsetof(struct zsd_entry, zsd_linkage)); list_create(&zone->zone_datasets, sizeof (zone_dataset_t), @@ -4267,7 +4404,7 @@ errout: * zone_kcred. To free the zone, we call crfree, which will call * zone_cred_rele, which will call zone_free. */ - ASSERT(zone->zone_cred_ref == 1); /* for zone_kcred */ + ASSERT(zone->zone_cred_ref == 1); ASSERT(zone->zone_kcred->cr_ref == 1); ASSERT(zone->zone_ref == 0); zkcr = zone->zone_kcred; @@ -4553,6 +4690,101 @@ zone_shutdown(zoneid_t zoneid) } /* + * Log the specified zone's reference counts. The caller should not be + * holding the zone's zone_lock. + */ +static void +zone_log_refcounts(zone_t *zone) +{ + char *buffer; + char *buffer_position; + uint32_t buffer_size; + uint32_t index; + uint_t ref; + uint_t cred_ref; + + /* + * Construct a string representing the subsystem-specific reference + * counts. The counts are printed in ascending order by index into the + * zone_t::zone_subsys_ref array. The list will be surrounded by + * square brackets [] and will only contain nonzero reference counts. + * + * The buffer will hold two square bracket characters plus ten digits, + * one colon, one space, one comma, and some characters for a + * subsystem name per subsystem-specific reference count. (Unsigned 32- + * bit integers have at most ten decimal digits.) The last + * reference count's comma is replaced by the closing square + * bracket and a NULL character to terminate the string. + * + * NOTE: We have to grab the zone's zone_lock to create a consistent + * snapshot of the zone's reference counters. + * + * First, figure out how much space the string buffer will need. + * The buffer's size is stored in buffer_size. + */ + buffer_size = 2; /* for the square brackets */ + mutex_enter(&zone->zone_lock); + zone->zone_flags |= ZF_REFCOUNTS_LOGGED; + ref = zone->zone_ref; + cred_ref = zone->zone_cred_ref; + for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index) + if (zone->zone_subsys_ref[index] != 0) + buffer_size += strlen(zone_ref_subsys_names[index]) + + 13; + if (buffer_size == 2) { + /* + * No subsystems had nonzero reference counts. Don't bother + * with allocating a buffer; just log the general-purpose and + * credential reference counts. + */ + mutex_exit(&zone->zone_lock); + (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE, + "Zone '%s' (ID: %d) is shutting down, but %u zone " + "references and %u credential references are still extant", + zone->zone_name, zone->zone_id, ref, cred_ref); + return; + } + + /* + * buffer_size contains the exact number of characters that the + * buffer will need. Allocate the buffer and fill it with nonzero + * subsystem-specific reference counts. Surround the results with + * square brackets afterwards. + */ + buffer = kmem_alloc(buffer_size, KM_SLEEP); + buffer_position = &buffer[1]; + for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index) { + /* + * NOTE: The DDI's version of sprintf() returns a pointer to + * the modified buffer rather than the number of bytes written + * (as in snprintf(3C)). This is unfortunate and annoying. + * Therefore, we'll use snprintf() with INT_MAX to get the + * number of bytes written. Using INT_MAX is safe because + * the buffer is perfectly sized for the data: we'll never + * overrun the buffer. + */ + if (zone->zone_subsys_ref[index] != 0) + buffer_position += snprintf(buffer_position, INT_MAX, + "%s: %u,", zone_ref_subsys_names[index], + zone->zone_subsys_ref[index]); + } + mutex_exit(&zone->zone_lock); + buffer[0] = '['; + ASSERT((uintptr_t)(buffer_position - buffer) < buffer_size); + ASSERT(buffer_position[0] == '\0' && buffer_position[-1] == ','); + buffer_position[-1] = ']'; + + /* + * Log the reference counts and free the message buffer. + */ + (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE, + "Zone '%s' (ID: %d) is shutting down, but %u zone references and " + "%u credential references are still extant %s", zone->zone_name, + zone->zone_id, ref, cred_ref, buffer); + kmem_free(buffer, buffer_size); +} + +/* * Systemcall entry point to finalize the zone halt process. The caller * must have already successfully called zone_shutdown(). * @@ -4566,6 +4798,8 @@ zone_destroy(zoneid_t zoneid) uint64_t uniqid; zone_t *zone; zone_status_t status; + clock_t wait_time; + boolean_t log_refcounts; if (secpolicy_zone_config(CRED()) != 0) return (set_errno(EPERM)); @@ -4609,9 +4843,12 @@ zone_destroy(zoneid_t zoneid) zone_rele(zone); zone = NULL; /* potentially free'd */ + log_refcounts = B_FALSE; + wait_time = SEC_TO_TICK(ZONE_DESTROY_TIMEOUT_SECS); mutex_enter(&zonehash_lock); for (; /* ever */; ) { boolean_t unref; + boolean_t refs_have_been_logged; if ((zone = zone_find_all_by_id(zoneid)) == NULL || zone->zone_uniqid != uniqid) { @@ -4624,6 +4861,8 @@ zone_destroy(zoneid_t zoneid) } mutex_enter(&zone->zone_lock); unref = ZONE_IS_UNREF(zone); + refs_have_been_logged = (zone->zone_flags & + ZF_REFCOUNTS_LOGGED); mutex_exit(&zone->zone_lock); if (unref) { /* @@ -4636,12 +4875,69 @@ zone_destroy(zoneid_t zoneid) break; } + /* + * Wait for zone_rele_common() or zone_cred_rele() to signal + * zone_destroy_cv. zone_destroy_cv is signaled only when + * some zone's general-purpose reference count reaches one. + * If ZONE_DESTROY_TIMEOUT_SECS seconds elapse while waiting + * on zone_destroy_cv, then log the zone's reference counts and + * continue to wait for zone_rele() and zone_cred_rele(). + */ + if (!refs_have_been_logged) { + if (!log_refcounts) { + /* + * This thread hasn't timed out waiting on + * zone_destroy_cv yet. Wait wait_time clock + * ticks (initially ZONE_DESTROY_TIMEOUT_SECS + * seconds) for the zone's references to clear. + */ + ASSERT(wait_time > 0); + wait_time = cv_reltimedwait_sig( + &zone_destroy_cv, &zonehash_lock, wait_time, + TR_SEC); + if (wait_time > 0) { + /* + * A thread in zone_rele() or + * zone_cred_rele() signaled + * zone_destroy_cv before this thread's + * wait timed out. The zone might have + * only one reference left; find out! + */ + continue; + } else if (wait_time == 0) { + /* The thread's process was signaled. */ + mutex_exit(&zonehash_lock); + return (set_errno(EINTR)); + } + + /* + * The thread timed out while waiting on + * zone_destroy_cv. Even though the thread + * timed out, it has to check whether another + * thread woke up from zone_destroy_cv and + * destroyed the zone. + * + * If the zone still exists and has more than + * one unreleased general-purpose reference, + * then log the zone's reference counts. + */ + log_refcounts = B_TRUE; + continue; + } + + /* + * The thread already timed out on zone_destroy_cv while + * waiting for subsystems to release the zone's last + * general-purpose references. Log the zone's reference + * counts and wait indefinitely on zone_destroy_cv. + */ + zone_log_refcounts(zone); + } if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) { - /* Signaled */ + /* The thread's process was signaled. */ mutex_exit(&zonehash_lock); return (set_errno(EINTR)); } - } /* @@ -5012,7 +5308,7 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) { zone_t *zone; zone_status_t zone_status; - int err; + int err = -1; zone_net_data_t *zbuf; if (secpolicy_zone_config(CRED()) != 0) @@ -5039,8 +5335,10 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) * non-global zones. */ zone_status = zone_status_get(zone); - if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) + if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) { + err = EINVAL; goto done; + } switch (attr) { case ZONE_ATTR_INITNAME: @@ -5074,12 +5372,13 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) case ZONE_ATTR_NETWORK: if (bufsize > (PIPE_BUF + sizeof (zone_net_data_t))) { err = EINVAL; - goto done; + break; } zbuf = kmem_alloc(bufsize, KM_SLEEP); if (copyin(buf, zbuf, bufsize) != 0) { + kmem_free(zbuf, bufsize); err = EFAULT; - goto done; + break; } err = zone_set_network(zoneid, zbuf); kmem_free(zbuf, bufsize); @@ -5093,6 +5392,7 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize) done: zone_rele(zone); + ASSERT(err != -1); return (err != 0 ? set_errno(err) : 0); } |
