summary refs log tree commit diff
path: root/usr/src/uts/common/os/zone.c
diff options
context:
space:
mode:
Diffstat (limited to 'usr/src/uts/common/os/zone.c')
-rw-r--r--  usr/src/uts/common/os/zone.c | 322
1 files changed, 311 insertions, 11 deletions
diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c
index bbc91e3fa7..79ccd94ae4 100644
--- a/usr/src/uts/common/os/zone.c
+++ b/usr/src/uts/common/os/zone.c
@@ -223,6 +223,7 @@
#include <sys/pool.h>
#include <sys/pool_pset.h>
#include <sys/pset.h>
+#include <sys/strlog.h>
#include <sys/sysmacros.h>
#include <sys/callb.h>
#include <sys/vmparam.h>
@@ -249,6 +250,17 @@
#include <vm/seg.h>
#include <sys/mac.h>
+/*
+ * This constant specifies the number of seconds that threads waiting for
+ * subsystems to release a zone's general-purpose references will wait before
+ * they log the zone's reference counts. The constant's value shouldn't
+ * be so small that reference counts are unnecessarily reported for zones
+ * whose references are slowly released. On the other hand, it shouldn't be so
+ * large that users reboot their systems out of frustration over hung zones
+ * before the system logs the zones' reference counts.
+ */
+#define ZONE_DESTROY_TIMEOUT_SECS 60
+
/* List of data link IDs which are accessible from the zone */
typedef struct zone_dl {
datalink_id_t zdl_id;
@@ -338,6 +350,20 @@ const char *zone_status_table[] = {
};
/*
+ * This array contains the names of the subsystems listed in zone_ref_subsys_t
+ * (see sys/zone.h).
+ */
+static char *zone_ref_subsys_names[] = {
+ "NFS", /* ZONE_REF_NFS */
+ "NFSv4", /* ZONE_REF_NFSV4 */
+ "SMBFS", /* ZONE_REF_SMBFS */
+ "MNTFS", /* ZONE_REF_MNTFS */
+ "LOFI", /* ZONE_REF_LOFI */
+ "VFS", /* ZONE_REF_VFS */
+ "IPC" /* ZONE_REF_IPC */
+};
+
+/*
* This isn't static so lint doesn't complain.
*/
rctl_hndl_t rc_zone_cpu_shares;
@@ -1881,6 +1907,8 @@ zone_zsd_init(void)
zone0.zone_lockedmem_kstat = NULL;
zone0.zone_swapresv_kstat = NULL;
zone0.zone_nprocs_kstat = NULL;
+ list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),
+ offsetof(zone_ref_t, zref_linkage));
list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
offsetof(struct zsd_entry, zsd_linkage));
list_insert_head(&zone_active, &zone0);
@@ -2144,6 +2172,7 @@ zone_free(zone_t *zone)
ASSERT(zone->zone_kcred == NULL);
ASSERT(zone_status_get(zone) == ZONE_IS_DEAD ||
zone_status_get(zone) == ZONE_IS_UNINITIALIZED);
+ ASSERT(list_is_empty(&zone->zone_ref_list));
/*
* Remove any zone caps.
@@ -2160,6 +2189,7 @@ zone_free(zone_t *zone)
mutex_exit(&zone_deathrow_lock);
}
+ list_destroy(&zone->zone_ref_list);
zone_free_zsd(zone);
zone_free_datasets(zone);
list_destroy(&zone->zone_dl_list);
@@ -2494,6 +2524,11 @@ zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status)
* to force halt/reboot to block waiting for the zone_cred_ref to drop
* to 0. This can be useful to flush out other sources of cached creds
* that may be less innocuous than the driver case.
+ *
+ * Zones also provide a tracked reference counting mechanism in which zone
+ * references are represented by "crumbs" (zone_ref structures). Crumbs help
+ * debuggers determine the sources of leaked zone references. See
+ * zone_hold_ref() and zone_rele_ref() below for more information.
*/
int zone_wait_for_cred = 0;
@@ -2506,6 +2541,14 @@ zone_hold_locked(zone_t *z)
ASSERT(z->zone_ref != 0);
}
+/*
+ * Increment the specified zone's reference count. The zone's zone_t structure
+ * will not be freed as long as the zone's reference count is nonzero.
+ * Decrement the zone's reference count via zone_rele().
+ *
+ * NOTE: This function should only be used to hold zones for short periods of
+ * time. Use zone_hold_ref() if the zone must be held for a long time.
+ */
void
zone_hold(zone_t *z)
{
@@ -2522,14 +2565,27 @@ zone_hold(zone_t *z)
#define ZONE_IS_UNREF(zone) ((zone)->zone_ref == 1 && \
(!zone_wait_for_cred || (zone)->zone_cred_ref == 0))
-void
-zone_rele(zone_t *z)
+/*
+ * Common zone reference release function invoked by zone_rele() and
+ * zone_rele_ref(). If subsys is ZONE_REF_NUM_SUBSYS, then the specified
+ * zone's subsystem-specific reference counters are not affected by the
+ * release. If ref is not NULL, then the zone_ref_t to which it refers is
+ * removed from the specified zone's reference list. ref must be non-NULL iff
+ * subsys is not ZONE_REF_NUM_SUBSYS.
+ */
+static void
+zone_rele_common(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
{
boolean_t wakeup;
mutex_enter(&z->zone_lock);
ASSERT(z->zone_ref != 0);
z->zone_ref--;
+ if (subsys != ZONE_REF_NUM_SUBSYS) {
+ ASSERT(z->zone_subsys_ref[subsys] != 0);
+ z->zone_subsys_ref[subsys]--;
+ list_remove(&z->zone_ref_list, ref);
+ }
if (z->zone_ref == 0 && z->zone_cred_ref == 0) {
/* no more refs, free the structure */
mutex_exit(&z->zone_lock);
@@ -2551,6 +2607,83 @@ zone_rele(zone_t *z)
}
}
+/*
+ * Decrement the specified zone's reference count. The specified zone will
+ * cease to exist after this function returns if the reference count drops to
+ * zero. This function should be paired with zone_hold().
+ */
+void
+zone_rele(zone_t *z)
+{
+ zone_rele_common(z, NULL, ZONE_REF_NUM_SUBSYS);
+}
+
+/*
+ * Initialize a zone reference structure. This function must be invoked for
+ * a reference structure before the structure is passed to zone_hold_ref().
+ */
+void
+zone_init_ref(zone_ref_t *ref)
+{
+ ref->zref_zone = NULL;
+ list_link_init(&ref->zref_linkage);
+}
+
+/*
+ * Acquire a reference to zone z. The caller must specify the
+ * zone_ref_subsys_t constant associated with its subsystem. The specified
+ * zone_ref_t structure will represent a reference to the specified zone. Use
+ * zone_rele_ref() to release the reference.
+ *
+ * The referenced zone_t structure will not be freed as long as the zone_t's
+ * zone_status field is not ZONE_IS_DEAD and the zone has outstanding
+ * references.
+ *
+ * NOTE: The zone_ref_t structure must be initialized before it is used.
+ * See zone_init_ref() above.
+ */
+void
+zone_hold_ref(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys)
+{
+ ASSERT(subsys >= 0 && subsys < ZONE_REF_NUM_SUBSYS);
+
+ /*
+ * Prevent consumers from reusing a reference structure before
+ * releasing it.
+ */
+ VERIFY(ref->zref_zone == NULL);
+
+ ref->zref_zone = z;
+ mutex_enter(&z->zone_lock);
+ zone_hold_locked(z);
+ z->zone_subsys_ref[subsys]++;
+ ASSERT(z->zone_subsys_ref[subsys] != 0);
+ list_insert_head(&z->zone_ref_list, ref);
+ mutex_exit(&z->zone_lock);
+}
+
+/*
+ * Release the zone reference represented by the specified zone_ref_t.
+ * The reference is invalid after it's released; however, the zone_ref_t
+ * structure can be reused without having to invoke zone_init_ref().
+ * subsys should be the same value that was passed to zone_hold_ref()
+ * when the reference was acquired.
+ */
+void
+zone_rele_ref(zone_ref_t *ref, zone_ref_subsys_t subsys)
+{
+ zone_rele_common(ref->zref_zone, ref, subsys);
+
+ /*
+ * Set the zone_ref_t's zref_zone field to NULL to generate panics
+ * when consumers dereference the reference. This helps us catch
+ * consumers who use released references. Furthermore, this lets
+ * consumers reuse the zone_ref_t structure without having to
+ * invoke zone_init_ref().
+ */
+ ref->zref_zone = NULL;
+}
+
void
zone_cred_hold(zone_t *z)
{
@@ -3763,10 +3896,12 @@ static int
zone_set_privset(zone_t *zone, const priv_set_t *zone_privs,
size_t zone_privssz)
{
- priv_set_t *privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
+ priv_set_t *privs;
if (zone_privssz < sizeof (priv_set_t))
- return (set_errno(ENOMEM));
+ return (ENOMEM);
+
+ privs = kmem_alloc(sizeof (priv_set_t), KM_SLEEP);
if (copyin(zone_privs, privs, sizeof (priv_set_t))) {
kmem_free(privs, sizeof (priv_set_t));
@@ -3982,6 +4117,8 @@ zone_create(const char *zone_name, const char *zone_root,
mutex_init(&zone->zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&zone->zone_mem_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&zone->zone_cv, NULL, CV_DEFAULT, NULL);
+ list_create(&zone->zone_ref_list, sizeof (zone_ref_t),
+ offsetof(zone_ref_t, zref_linkage));
list_create(&zone->zone_zsd, sizeof (struct zsd_entry),
offsetof(struct zsd_entry, zsd_linkage));
list_create(&zone->zone_datasets, sizeof (zone_dataset_t),
@@ -4267,7 +4404,7 @@ errout:
* zone_kcred. To free the zone, we call crfree, which will call
* zone_cred_rele, which will call zone_free.
*/
- ASSERT(zone->zone_cred_ref == 1); /* for zone_kcred */
+ ASSERT(zone->zone_cred_ref == 1);
ASSERT(zone->zone_kcred->cr_ref == 1);
ASSERT(zone->zone_ref == 0);
zkcr = zone->zone_kcred;
@@ -4553,6 +4690,101 @@ zone_shutdown(zoneid_t zoneid)
}
/*
+ * Log the specified zone's reference counts. The caller should not be
+ * holding the zone's zone_lock.
+ */
+static void
+zone_log_refcounts(zone_t *zone)
+{
+ char *buffer;
+ char *buffer_position;
+ uint32_t buffer_size;
+ uint32_t index;
+ uint_t ref;
+ uint_t cred_ref;
+
+ /*
+ * Construct a string representing the subsystem-specific reference
+ * counts. The counts are printed in ascending order by index into the
+ * zone_t::zone_subsys_ref array. The list will be surrounded by
+ * square brackets [] and will only contain nonzero reference counts.
+ *
+ * The buffer will hold two square bracket characters plus ten digits,
+ * one colon, one space, one comma, and some characters for a
+ * subsystem name per subsystem-specific reference count. (Unsigned 32-
+ * bit integers have at most ten decimal digits.) The last
+ * reference count's comma is replaced by the closing square
+ * bracket and a NULL character to terminate the string.
+ *
+ * NOTE: We have to grab the zone's zone_lock to create a consistent
+ * snapshot of the zone's reference counters.
+ *
+ * First, figure out how much space the string buffer will need.
+ * The buffer's size is stored in buffer_size.
+ */
+ buffer_size = 2; /* for the square brackets */
+ mutex_enter(&zone->zone_lock);
+ zone->zone_flags |= ZF_REFCOUNTS_LOGGED;
+ ref = zone->zone_ref;
+ cred_ref = zone->zone_cred_ref;
+ for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index)
+ if (zone->zone_subsys_ref[index] != 0)
+ buffer_size += strlen(zone_ref_subsys_names[index]) +
+ 13;
+ if (buffer_size == 2) {
+ /*
+ * No subsystems had nonzero reference counts. Don't bother
+ * with allocating a buffer; just log the general-purpose and
+ * credential reference counts.
+ */
+ mutex_exit(&zone->zone_lock);
+ (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
+ "Zone '%s' (ID: %d) is shutting down, but %u zone "
+ "references and %u credential references are still extant",
+ zone->zone_name, zone->zone_id, ref, cred_ref);
+ return;
+ }
+
+ /*
+ * buffer_size contains the exact number of characters that the
+ * buffer will need. Allocate the buffer and fill it with nonzero
+ * subsystem-specific reference counts. Surround the results with
+ * square brackets afterwards.
+ */
+ buffer = kmem_alloc(buffer_size, KM_SLEEP);
+ buffer_position = &buffer[1];
+ for (index = 0; index < ZONE_REF_NUM_SUBSYS; ++index) {
+ /*
+ * NOTE: The DDI's version of sprintf() returns a pointer to
+ * the modified buffer rather than the number of bytes written
+ * (as in snprintf(3C)). This is unfortunate and annoying.
+ * Therefore, we'll use snprintf() with INT_MAX to get the
+ * number of bytes written. Using INT_MAX is safe because
+ * the buffer is perfectly sized for the data: we'll never
+ * overrun the buffer.
+ */
+ if (zone->zone_subsys_ref[index] != 0)
+ buffer_position += snprintf(buffer_position, INT_MAX,
+ "%s: %u,", zone_ref_subsys_names[index],
+ zone->zone_subsys_ref[index]);
+ }
+ mutex_exit(&zone->zone_lock);
+ buffer[0] = '[';
+ ASSERT((uintptr_t)(buffer_position - buffer) < buffer_size);
+ ASSERT(buffer_position[0] == '\0' && buffer_position[-1] == ',');
+ buffer_position[-1] = ']';
+
+ /*
+ * Log the reference counts and free the message buffer.
+ */
+ (void) strlog(0, 0, 1, SL_CONSOLE | SL_NOTE,
+ "Zone '%s' (ID: %d) is shutting down, but %u zone references and "
+ "%u credential references are still extant %s", zone->zone_name,
+ zone->zone_id, ref, cred_ref, buffer);
+ kmem_free(buffer, buffer_size);
+}
+
+/*
* Systemcall entry point to finalize the zone halt process. The caller
* must have already successfully called zone_shutdown().
*
@@ -4566,6 +4798,8 @@ zone_destroy(zoneid_t zoneid)
uint64_t uniqid;
zone_t *zone;
zone_status_t status;
+ clock_t wait_time;
+ boolean_t log_refcounts;
if (secpolicy_zone_config(CRED()) != 0)
return (set_errno(EPERM));
@@ -4609,9 +4843,12 @@ zone_destroy(zoneid_t zoneid)
zone_rele(zone);
zone = NULL; /* potentially free'd */
+ log_refcounts = B_FALSE;
+ wait_time = SEC_TO_TICK(ZONE_DESTROY_TIMEOUT_SECS);
mutex_enter(&zonehash_lock);
for (; /* ever */; ) {
boolean_t unref;
+ boolean_t refs_have_been_logged;
if ((zone = zone_find_all_by_id(zoneid)) == NULL ||
zone->zone_uniqid != uniqid) {
@@ -4624,6 +4861,8 @@ zone_destroy(zoneid_t zoneid)
}
mutex_enter(&zone->zone_lock);
unref = ZONE_IS_UNREF(zone);
+ refs_have_been_logged = (zone->zone_flags &
+ ZF_REFCOUNTS_LOGGED);
mutex_exit(&zone->zone_lock);
if (unref) {
/*
@@ -4636,12 +4875,69 @@ zone_destroy(zoneid_t zoneid)
break;
}
+ /*
+ * Wait for zone_rele_common() or zone_cred_rele() to signal
+ * zone_destroy_cv. zone_destroy_cv is signaled only when
+ * some zone's general-purpose reference count reaches one.
+ * If ZONE_DESTROY_TIMEOUT_SECS seconds elapse while waiting
+ * on zone_destroy_cv, then log the zone's reference counts and
+ * continue to wait for zone_rele() and zone_cred_rele().
+ */
+ if (!refs_have_been_logged) {
+ if (!log_refcounts) {
+ /*
+ * This thread hasn't timed out waiting on
+ * zone_destroy_cv yet. Wait wait_time clock
+ * ticks (initially ZONE_DESTROY_TIMEOUT_SECS
+ * seconds) for the zone's references to clear.
+ */
+ ASSERT(wait_time > 0);
+ wait_time = cv_reltimedwait_sig(
+ &zone_destroy_cv, &zonehash_lock, wait_time,
+ TR_SEC);
+ if (wait_time > 0) {
+ /*
+ * A thread in zone_rele() or
+ * zone_cred_rele() signaled
+ * zone_destroy_cv before this thread's
+ * wait timed out. The zone might have
+ * only one reference left; find out!
+ */
+ continue;
+ } else if (wait_time == 0) {
+ /* The thread's process was signaled. */
+ mutex_exit(&zonehash_lock);
+ return (set_errno(EINTR));
+ }
+
+ /*
+ * The thread timed out while waiting on
+ * zone_destroy_cv. Even though the thread
+ * timed out, it has to check whether another
+ * thread woke up from zone_destroy_cv and
+ * destroyed the zone.
+ *
+ * If the zone still exists and has more than
+ * one unreleased general-purpose reference,
+ * then log the zone's reference counts.
+ */
+ log_refcounts = B_TRUE;
+ continue;
+ }
+
+ /*
+ * The thread already timed out on zone_destroy_cv while
+ * waiting for subsystems to release the zone's last
+ * general-purpose references. Log the zone's reference
+ * counts and wait indefinitely on zone_destroy_cv.
+ */
+ zone_log_refcounts(zone);
+ }
if (cv_wait_sig(&zone_destroy_cv, &zonehash_lock) == 0) {
- /* Signaled */
+ /* The thread's process was signaled. */
mutex_exit(&zonehash_lock);
return (set_errno(EINTR));
}
-
}
/*
@@ -5012,7 +5308,7 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
{
zone_t *zone;
zone_status_t zone_status;
- int err;
+ int err = -1;
zone_net_data_t *zbuf;
if (secpolicy_zone_config(CRED()) != 0)
@@ -5039,8 +5335,10 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
* non-global zones.
*/
zone_status = zone_status_get(zone);
- if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY)
+ if (attr != ZONE_ATTR_PHYS_MCAP && zone_status > ZONE_IS_READY) {
+ err = EINVAL;
goto done;
+ }
switch (attr) {
case ZONE_ATTR_INITNAME:
@@ -5074,12 +5372,13 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
case ZONE_ATTR_NETWORK:
if (bufsize > (PIPE_BUF + sizeof (zone_net_data_t))) {
err = EINVAL;
- goto done;
+ break;
}
zbuf = kmem_alloc(bufsize, KM_SLEEP);
if (copyin(buf, zbuf, bufsize) != 0) {
+ kmem_free(zbuf, bufsize);
err = EFAULT;
- goto done;
+ break;
}
err = zone_set_network(zoneid, zbuf);
kmem_free(zbuf, bufsize);
@@ -5093,6 +5392,7 @@ zone_setattr(zoneid_t zoneid, int attr, void *buf, size_t bufsize)
done:
zone_rele(zone);
+ ASSERT(err != -1);
return (err != 0 ? set_errno(err) : 0);
}