summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authordstaff <none@none>2005-12-21 19:34:44 -0800
committerdstaff <none@none>2005-12-21 19:34:44 -0800
commitcf8f45c7690afabe63bdb8066b11db58d708ad09 (patch)
tree05649165cccd7b495884695d11680184230e2051
parent454be575b1b84146968106421f6fbe80ffffbe59 (diff)
downloadillumos-joyent-cf8f45c7690afabe63bdb8066b11db58d708ad09.tar.gz
PSARC/2005/607 Zone Events for Sun Cluster
5052723 RFE: need notifications of zone state transition
-rw-r--r--usr/src/lib/libzonecfg/Makefile.com2
-rw-r--r--usr/src/lib/libzonecfg/common/libzonecfg.c337
-rw-r--r--usr/src/lib/libzonecfg/spec/libzonecfg.spec30
-rw-r--r--usr/src/uts/common/os/zone.c54
-rw-r--r--usr/src/uts/common/sys/zone.h15
5 files changed, 437 insertions, 1 deletions
diff --git a/usr/src/lib/libzonecfg/Makefile.com b/usr/src/lib/libzonecfg/Makefile.com
index d38aa2bdc0..5b3dcaaa25 100644
--- a/usr/src/lib/libzonecfg/Makefile.com
+++ b/usr/src/lib/libzonecfg/Makefile.com
@@ -34,7 +34,7 @@ include ../../Makefile.lib
LIBS = $(DYNLIB) $(LINTLIB)
LDLIBS += -lc -lsocket -lnsl -luuid
-$(DYNLIB) := LDLIBS += -lxml2
+$(DYNLIB) := LDLIBS += -lxml2 -lnvpair -lsysevent
CPPFLAGS += -I/usr/include/libxml2 -I$(SRCDIR) -D_REENTRANT
$(LINTLIB) := SRCS= $(SRCDIR)/$(LINTSRC)
diff --git a/usr/src/lib/libzonecfg/common/libzonecfg.c b/usr/src/lib/libzonecfg/common/libzonecfg.c
index 416be740e3..2f549d228f 100644
--- a/usr/src/lib/libzonecfg/common/libzonecfg.c
+++ b/usr/src/lib/libzonecfg/common/libzonecfg.c
@@ -26,6 +26,9 @@
#pragma ident "%Z%%M% %I% %E% SMI"
+#include <libsysevent.h>
+#include <pthread.h>
+#include <stdlib.h>
#include <errno.h>
#include <fnmatch.h>
#include <strings.h>
@@ -39,6 +42,7 @@
#include <sys/mntio.h>
#include <sys/mnttab.h>
#include <sys/types.h>
+#include <sys/nvpair.h>
#include <arpa/inet.h>
#include <netdb.h>
@@ -54,7 +58,11 @@
#include <libzonecfg.h>
#include "zonecfg_impl.h"
+
#define _PATH_TMPFILE "/zonecfg.XXXXXX"
+#define ZONE_CB_RETRY_COUNT 10
+#define ZONE_EVENT_PING_SUBCLASS "ping"
+#define ZONE_EVENT_PING_PUBLISHER "solaris"
/* Hard-code the DTD element/attribute/entity names just once, here. */
#define DTD_ELEM_ATTR (const xmlChar *) "attr"
@@ -107,6 +115,21 @@ struct zone_dochandle {
char zone_dh_delete_name[ZONENAME_MAX];
};
+/*
+ * Per-subscription state for the zone state change notification API.
+ * One of these is allocated by zonecfg_notify_bind() and handed back to
+ * the caller as an opaque handle.
+ */
+struct znotify {
+ /* caller-supplied cookie, passed back as the last callback argument */
+ void * zn_private;
+ /* event channel bound by zonecfg_notify_bind() */
+ evchan_t *zn_eventchan;
+ /* caller-supplied function invoked for each zone status change event */
+ int (*zn_callback)(const char *zonename, zoneid_t zid,
+ const char *newstate, const char *oldstate, hrtime_t when, void *p);
+ /* held by zn_cb and by critical_exit while they examine zn_state */
+ pthread_mutex_t zn_mutex;
+ /* signalled by zn_cb when the ping critical_exit is waiting for arrives */
+ pthread_cond_t zn_cond;
+ /* taken in critical_enter; dropped by critical_abort or a clean exit */
+ pthread_mutex_t zn_bigmutex;
+ /* where the handle is in the critical section / ping handshake */
+ volatile enum {ZN_UNLOCKED, ZN_LOCKED, ZN_PING_INFLIGHT,
+ ZN_PING_RECEIVED} zn_state;
+ /* subscriber id generated in zonecfg_notify_bind ("zone_<pid>_<n>") */
+ char zn_subscriber_id[MAX_SUBID_LEN];
+ /* set by zn_cb when a non-ping event arrives inside a critical section */
+ volatile boolean_t zn_failed;
+ /* consecutive ENOMEM failures seen by do_callback; reset on success */
+ int zn_failure_count;
+};
+
char *zonecfg_root = "";
/*
@@ -3521,6 +3544,320 @@ zonecfg_valid_rctl(const char *name, const rctlblk_t *rctlblk)
return (B_TRUE);
}
+/*
+ * There is always a race condition between reading the initial copy of
+ * a zone's state and its state changing. We address this by providing the
+ * zonecfg_notify_critical_enter and zonecfg_notify_critical_exit functions.
+ * When zonecfg_notify_critical_enter is called, it acquires biglock and
+ * sets the state field to LOCKED. Biglock protects against other threads
+ * executing critical_enter and the state field protects against state
+ * changes during the critical period.
+ *
+ * If any state changes occur, zn_cb will set the failed field of the znotify
+ * structure. This will cause the critical_exit function to re-lock the
+ * channel and return an error. Since events may be delayed, the critical_exit
+ * function "flushes" the queue by putting an event on the queue and waiting for
+ * zn_cb to notify critical_exit that it received the ping event.
+ */
+/*
+ * Skip past the first `num' occurrences of `delim' in the string `in'.
+ *
+ * Returns a pointer to the character following the num-th delimiter, or
+ * NULL if the string ends before that many delimiters have been seen.
+ * When `num' is zero or negative the input pointer is returned unchanged.
+ */
+static const char *
+string_get_tok(const char *in, char delim, int num)
+{
+	int seen = 0;
+
+	while (seen < num) {
+		const char c = *in;
+
+		if (c == delim)
+			seen++;
+		if (c == '\0')
+			return (NULL);
+		in++;
+	}
+	return (in);
+}
+
+/*
+ * Return B_TRUE when the event's subclass name is the ping subclass
+ * (ZONE_EVENT_PING_SUBCLASS), B_FALSE otherwise.
+ */
+static boolean_t
+is_ping(sysevent_t *ev)
+{
+	const char *subclass = sysevent_get_subclass_name(ev);
+
+	return (strcmp(subclass, ZONE_EVENT_PING_SUBCLASS) == 0 ?
+	    B_TRUE : B_FALSE);
+}
+
+/*
+ * Return B_TRUE when the event was published by this process.
+ *
+ * The part of the publisher string that follows its third ':' is compared
+ * against our own pid rendered in decimal; a publisher string with fewer
+ * than three ':' characters never matches.
+ */
+static boolean_t
+is_my_ping(sysevent_t *ev)
+{
+	char pidstr[sizeof (pid_t) * 3 + 1];
+	const char *origin;
+
+	(void) snprintf(pidstr, sizeof (pidstr), "%i", getpid());
+	origin = string_get_tok(sysevent_get_pub(ev), ':', 3);
+	if (origin != NULL && strcmp(origin, pidstr) == 0)
+		return (B_TRUE);
+	return (B_FALSE);
+}
+
+/*
+ * Decode a zone status change event and hand it to the user's callback.
+ *
+ * Events whose subclass is not ZONE_EVENT_STATUS_SUBCLASS are ignored.
+ * If the attribute list cannot be fetched because of ENOMEM, the failure
+ * counter is bumped and EAGAIN is returned; any other fetch failure is
+ * dropped and 0 is returned. The callback is only invoked when all five
+ * attributes (name, new state, old state, timestamp, zone id) are present;
+ * its return value is passed through, otherwise 0 is returned. Every
+ * outcome except the ENOMEM and dropped-fetch paths resets the counter.
+ */
+static int
+do_callback(struct znotify *zevtchan, sysevent_t *ev)
+{
+	nvlist_t *attrs;
+	char *name;
+	char *state_new;
+	char *state_old;
+	hrtime_t stamp;
+	int id;
+	int rc = 0;
+
+	/* An event in a subclass we do not know about: ignore it. */
+	if (strcmp(sysevent_get_subclass_name(ev),
+	    ZONE_EVENT_STATUS_SUBCLASS) != 0) {
+		zevtchan->zn_failure_count = 0;
+		return (0);
+	}
+
+	if (sysevent_get_attr_list(ev, &attrs) != 0) {
+		if (errno != ENOMEM)
+			return (0);
+		zevtchan->zn_failure_count++;
+		return (EAGAIN);
+	}
+
+	if (nvlist_lookup_string(attrs, ZONE_CB_NAME, &name) == 0 &&
+	    nvlist_lookup_string(attrs, ZONE_CB_NEWSTATE, &state_new) == 0 &&
+	    nvlist_lookup_string(attrs, ZONE_CB_OLDSTATE, &state_old) == 0 &&
+	    nvlist_lookup_uint64(attrs, ZONE_CB_TIMESTAMP,
+	    (uint64_t *)&stamp) == 0 &&
+	    nvlist_lookup_int32(attrs, ZONE_CB_ZONEID, &id) == 0) {
+		rc = zevtchan->zn_callback(name, id, state_new, state_old,
+		    stamp, zevtchan->zn_private);
+	}
+
+	zevtchan->zn_failure_count = 0;
+	nvlist_free(attrs);
+	return (rc);
+}
+
+/*
+ * Event channel callback registered by zonecfg_notify_bind(); `p' is the
+ * struct znotify handle for the subscription.
+ *
+ * What happens to an event depends on zn_state, which is examined with
+ * zn_mutex held:
+ *   - pings that were not published by this process are always dropped;
+ *   - ZN_LOCKED: the event is recorded as a failure of the critical section;
+ *   - ZN_PING_INFLIGHT: our ping wakes up critical_exit, anything else is
+ *     recorded as a failure;
+ *   - ZN_UNLOCKED: the event is delivered to the user via do_callback;
+ *   - ZN_PING_RECEIVED: the event is dropped.
+ * Returns 0, or whatever do_callback returned in the ZN_UNLOCKED case.
+ */
+static int
+zn_cb(sysevent_t *ev, void *p)
+{
+ struct znotify *zevtchan = p;
+ int error;
+
+ (void) pthread_mutex_lock(&(zevtchan->zn_mutex));
+
+ /* Somebody else's ping: not our business. */
+ if (is_ping(ev) && !is_my_ping(ev)) {
+ (void) pthread_mutex_unlock((&zevtchan->zn_mutex));
+ return (0);
+ }
+
+ /* A state change arrived inside the critical section. */
+ if (zevtchan->zn_state == ZN_LOCKED) {
+ assert(!is_ping(ev));
+ zevtchan->zn_failed = B_TRUE;
+ (void) pthread_mutex_unlock(&(zevtchan->zn_mutex));
+ return (0);
+ }
+
+ if (zevtchan->zn_state == ZN_PING_INFLIGHT) {
+ if (is_ping(ev)) {
+ /* The queue is flushed: wake up critical_exit. */
+ zevtchan->zn_state = ZN_PING_RECEIVED;
+ (void) pthread_cond_signal(&(zevtchan->zn_cond));
+ (void) pthread_mutex_unlock(&(zevtchan->zn_mutex));
+ return (0);
+ } else {
+ /* A state change was queued ahead of our ping. */
+ zevtchan->zn_failed = B_TRUE;
+ (void) pthread_mutex_unlock(&(zevtchan->zn_mutex));
+ return (0);
+ }
+ }
+
+ if (zevtchan->zn_state == ZN_UNLOCKED) {
+
+ error = do_callback(zevtchan, ev);
+ (void) pthread_mutex_unlock(&(zevtchan->zn_mutex));
+ /*
+ * Every ENOMEM failure causes do_callback to increment
+ * zn_failure_count and every success causes it to
+ * set zn_failure_count to zero. If we got EAGAIN,
+ * we will sleep for zn_failure_count seconds and return
+ * EAGAIN to gpec to try again.
+ *
+ * After 55 seconds, or 10 tries, we give up and drop the
+ * event.
+ *
+ * NOTE(review): zn_failure_count is read here after zn_mutex
+ * has already been released.
+ */
+ if (error == EAGAIN) {
+ if (zevtchan->zn_failure_count > ZONE_CB_RETRY_COUNT) {
+ return (0);
+ }
+ (void) sleep(zevtchan->zn_failure_count);
+ }
+ return (error);
+ }
+
+ if (zevtchan->zn_state == ZN_PING_RECEIVED) {
+ (void) pthread_mutex_unlock(&(zevtchan->zn_mutex));
+ return (0);
+ }
+
+ /* zn_state holds a value outside the enumeration. */
+ abort();
+ return (0);
+}
+
+/*
+ * Enter the critical section of the notification handle `h'.
+ *
+ * Takes zn_bigmutex (blocking until any other thread's critical section
+ * on this handle has ended) and then marks the handle ZN_LOCKED. The
+ * mutex remains held on return; it is dropped by
+ * zonecfg_notify_critical_abort() or a successful
+ * zonecfg_notify_critical_exit().
+ */
+void
+zonecfg_notify_critical_enter(void *h)
+{
+	struct znotify *zn = h;
+
+	(void) pthread_mutex_lock(&zn->zn_bigmutex);
+	zn->zn_state = ZN_LOCKED;
+}
+
+/*
+ * Try to leave the critical section of the notification handle `h'.
+ *
+ * A ping event is published on the channel and we wait until zn_cb reports
+ * that it has been delivered, which guarantees that every event queued
+ * before the ping has been seen.
+ *
+ * Returns 0 when no zone state change was observed during the critical
+ * section (or the handle was not locked at all); the handle is then
+ * unlocked and zn_bigmutex is released.
+ *
+ * Returns 1 when a state change was observed, or when the ping could not
+ * be published so the queue could not be flushed. In that case the handle
+ * is put back into the ZN_LOCKED state with zn_bigmutex still held, and the
+ * caller must either redo its work and call this function again or call
+ * zonecfg_notify_critical_abort().
+ */
+int
+zonecfg_notify_critical_exit(void *h)
+{
+	struct znotify *zevtchan = h;
+	boolean_t ping_sent;
+
+	if (zevtchan->zn_state == ZN_UNLOCKED)
+		return (0);
+
+	(void) pthread_mutex_lock(&(zevtchan->zn_mutex));
+	zevtchan->zn_state = ZN_PING_INFLIGHT;
+
+	ping_sent = (sysevent_evc_publish(zevtchan->zn_eventchan,
+	    ZONE_EVENT_STATUS_CLASS, ZONE_EVENT_PING_SUBCLASS,
+	    ZONE_EVENT_PING_PUBLISHER, zevtchan->zn_subscriber_id, NULL,
+	    EVCH_SLEEP) == 0) ? B_TRUE : B_FALSE;
+
+	/*
+	 * Only wait when the ping really is on the queue; if the publish
+	 * failed nobody will ever move us to ZN_PING_RECEIVED and waiting
+	 * would block forever with zn_mutex held.
+	 */
+	while (ping_sent == B_TRUE &&
+	    zevtchan->zn_state != ZN_PING_RECEIVED) {
+		(void) pthread_cond_wait(&(zevtchan->zn_cond),
+		    &(zevtchan->zn_mutex));
+	}
+
+	if (ping_sent == B_FALSE || zevtchan->zn_failed == B_TRUE) {
+		/* Stay inside the critical section; bigmutex stays held. */
+		zevtchan->zn_state = ZN_LOCKED;
+		zevtchan->zn_failed = B_FALSE;
+		(void) pthread_mutex_unlock(&(zevtchan->zn_mutex));
+		return (1);
+	}
+
+	zevtchan->zn_state = ZN_UNLOCKED;
+	(void) pthread_mutex_unlock(&(zevtchan->zn_mutex));
+	(void) pthread_mutex_unlock(&(zevtchan->zn_bigmutex));
+	return (0);
+}
+
+/*
+ * Abandon the critical section of the notification handle `h' without
+ * flushing the event queue: the handle goes back to ZN_UNLOCKED, any
+ * recorded failure is forgotten and zn_bigmutex is released.
+ */
+void
+zonecfg_notify_critical_abort(void *h)
+{
+	struct znotify *zn = h;
+
+	zn->zn_state = ZN_UNLOCKED;
+	zn->zn_failed = B_FALSE;
+	/*
+	 * zn_mutex is deliberately left alone. If it is held, it could only
+	 * be held by zn_cb and it will be unlocked soon.
+	 */
+	(void) pthread_mutex_unlock(&zn->zn_bigmutex);
+}
+
+/*
+ * Subscribe to zone state change notifications.
+ *
+ * `func' is called for every zone status change event with the zone name,
+ * zone id, new and old state names, the event timestamp and the caller's
+ * cookie `p'.
+ *
+ * Returns an opaque handle to be passed to the other zonecfg_notify_*
+ * functions and finally to zonecfg_notify_unbind(), or NULL if memory
+ * could not be allocated, a lock could not be initialised, the channel
+ * could not be bound, or no subscriber id could be registered.
+ */
+void *
+zonecfg_notify_bind(int(*func)(const char *zonename, zoneid_t zid,
+    const char *newstate, const char *oldstate, hrtime_t when, void *p),
+    void *p)
+{
+	struct znotify *zevtchan;
+	int i;
+
+	zevtchan = malloc(sizeof (*zevtchan));
+	if (zevtchan == NULL)
+		return (NULL);
+
+	zevtchan->zn_private = p;
+	zevtchan->zn_callback = func;
+	zevtchan->zn_state = ZN_UNLOCKED;
+	zevtchan->zn_failed = B_FALSE;
+	/* do_callback increments this, so it must not start out as garbage */
+	zevtchan->zn_failure_count = 0;
+
+	if (pthread_mutex_init(&(zevtchan->zn_mutex), NULL) != 0)
+		goto out_free;
+	if (pthread_cond_init(&(zevtchan->zn_cond), NULL) != 0)
+		goto out_mutex;
+	if (pthread_mutex_init(&(zevtchan->zn_bigmutex), NULL) != 0)
+		goto out_cond;
+
+	if (sysevent_evc_bind(ZONE_EVENT_CHANNEL, &(zevtchan->zn_eventchan),
+	    0) != 0)
+		goto out_bigmutex;
+
+	/*
+	 * Try subscriber ids "zone_<pid>_1" .. "zone_<pid>_999" until one
+	 * is accepted. At 4 digits the subscriber ID gets too long and we
+	 * have no chance of successfully registering.
+	 */
+	for (i = 1; i <= 999; i++) {
+		(void) snprintf(zevtchan->zn_subscriber_id,
+		    sizeof (zevtchan->zn_subscriber_id), "zone_%li_%i",
+		    getpid() % 999999l, i);
+
+		if (sysevent_evc_subscribe(zevtchan->zn_eventchan,
+		    zevtchan->zn_subscriber_id, ZONE_EVENT_STATUS_CLASS,
+		    zn_cb, zevtchan, 0) == 0)
+			return (zevtchan);
+	}
+
+	/* Unwind in the reverse order of acquisition. */
+	sysevent_evc_unbind(zevtchan->zn_eventchan);
+out_bigmutex:
+	(void) pthread_mutex_destroy(&(zevtchan->zn_bigmutex));
+out_cond:
+	(void) pthread_cond_destroy(&(zevtchan->zn_cond));
+out_mutex:
+	(void) pthread_mutex_destroy(&(zevtchan->zn_mutex));
+out_free:
+	free(zevtchan);
+
+	return (NULL);
+}
+
+/*
+ * Tear down a subscription created by zonecfg_notify_bind(): unbind the
+ * event channel, destroy the locks and condition variable, and free the
+ * handle. The process is aborted if zn_mutex is still held after the
+ * channel has been unbound.
+ */
+void
+zonecfg_notify_unbind(void *handle)
+{
+	struct znotify *zn = handle;
+
+	sysevent_evc_unbind(zn->zn_eventchan);
+
+	/*
+	 * Check that all evc threads have gone away. This should be
+	 * enforced by sysevent_evc_unbind.
+	 */
+	if (pthread_mutex_trylock(&zn->zn_mutex) != 0)
+		abort();
+
+	(void) pthread_mutex_unlock(&zn->zn_mutex);
+	(void) pthread_mutex_destroy(&zn->zn_mutex);
+	(void) pthread_cond_destroy(&zn->zn_cond);
+	(void) pthread_mutex_destroy(&zn->zn_bigmutex);
+
+	free(handle);
+}
+
static int
zonecfg_add_ds_core(zone_dochandle_t handle, struct zone_dstab *tabptr)
{
diff --git a/usr/src/lib/libzonecfg/spec/libzonecfg.spec b/usr/src/lib/libzonecfg/spec/libzonecfg.spec
index a0cdb3db98..30b22a3526 100644
--- a/usr/src/lib/libzonecfg/spec/libzonecfg.spec
+++ b/usr/src/lib/libzonecfg/spec/libzonecfg.spec
@@ -664,6 +664,36 @@ declaration int zonecfg_construct_rctlblk(const struct zone_rctlvaltab *, rctlbl
version SUNWprivate_1.1
end
+function zonecfg_notify_bind
+include <libzonecfg.h>
+declaration void * zonecfg_notify_bind(int(*)(const char *, zoneid_t zid, const char *, const char *, hrtime_t, void *), void *)
+version SUNWprivate_1.1
+end
+
+function zonecfg_notify_unbind
+include <libzonecfg.h>
+declaration void zonecfg_notify_unbind(void *)
+version SUNWprivate_1.1
+end
+
+function zonecfg_notify_critical_enter
+include <libzonecfg.h>
+declaration void zonecfg_notify_critical_enter(void *);
+version SUNWprivate_1.1
+end
+
+function zonecfg_notify_critical_abort
+include <libzonecfg.h>
+declaration void zonecfg_notify_critical_abort(void *);
+version SUNWprivate_1.1
+end
+
+function zonecfg_notify_critical_exit
+include <libzonecfg.h>
+declaration int zonecfg_notify_critical_exit(void *);
+version SUNWprivate_1.1
+end
+
function zonecfg_open_scratch
include <libzonecfg.h>
declaration FILE *zonecfg_open_scratch(const char *, boolean_t)
diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c
index fd3de9852f..9236430b40 100644
--- a/usr/src/uts/common/os/zone.c
+++ b/usr/src/uts/common/os/zone.c
@@ -194,6 +194,7 @@
#include <sys/pathname.h>
#include <sys/proc.h>
#include <sys/project.h>
+#include <sys/sysevent.h>
#include <sys/task.h>
#include <sys/systm.h>
#include <sys/types.h>
@@ -287,6 +288,27 @@ static kmutex_t zone_deathrow_lock;
/* number of zones is limited by virtual interface limit in IP */
uint_t maxzones = 8192;
+/* Event channel used to send zone state change notifications */
+evchan_t *zone_event_chan;
+
+/*
+ * This table holds the mapping from kernel zone states to
+ * states visible in the state notification API.
+ * The idea is that we only expose "obvious" states and
+ * do not expose states which are just implementation details.
+ *
+ * zone_status_set() indexes this table directly with zone_status_t
+ * values, so the entries presumably have to stay in the order of that
+ * enumeration (TODO: confirm against the definition in sys/zone.h).
+ */
+const char *zone_status_table[] = {
+ ZONE_EVENT_UNINITIALIZED, /* uninitialized */
+ ZONE_EVENT_READY, /* ready */
+ ZONE_EVENT_READY, /* booting */
+ ZONE_EVENT_RUNNING, /* running */
+ ZONE_EVENT_SHUTTING_DOWN, /* shutting_down */
+ ZONE_EVENT_SHUTTING_DOWN, /* empty */
+ ZONE_EVENT_SHUTTING_DOWN, /* down */
+ ZONE_EVENT_SHUTTING_DOWN, /* dying */
+ ZONE_EVENT_UNINITIALIZED, /* dead */
+};
+
/*
* This isn't static so lint doesn't complain.
*/
@@ -986,6 +1008,7 @@ zone_init(void)
rctl_set_t *set;
rctl_alloc_gp_t *gp;
rctl_entity_p_t e;
+ int res;
ASSERT(curproc == &p0);
@@ -1064,6 +1087,15 @@ zone_init(void)
* will be set when the root filesystem is mounted).
*/
global_zone = &zone0;
+
+ /*
+ * Setup an event channel to send zone status change notifications on
+ */
+ res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan,
+ EVCH_CREAT);
+
+ if (res)
+ panic("Sysevent_evc_bind failed during zone setup.\n");
}
static void
@@ -1120,10 +1152,32 @@ zone_free(zone_t *zone)
+/*
+ * Move `zone' to `status', publish a status change event describing the
+ * transition on zone_event_chan, and wake up everybody waiting on the
+ * zone's condition variable. The caller must hold zone_status_lock.
+ */
static void
zone_status_set(zone_t *zone, zone_status_t status)
{
+
+ nvlist_t *nvl = NULL;
ASSERT(MUTEX_HELD(&zone_status_lock));
ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE &&
status >= zone_status_get(zone));
+
+ /*
+ * Build the event payload (zone name, new and old state names, zone
+ * id, timestamp) and publish it. The chain stops at the first step
+ * that fails; a failure is only reported on DEBUG kernels and never
+ * prevents the state change below.
+ */
+ if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) ||
+ nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) ||
+ nvlist_add_string(nvl, ZONE_CB_NEWSTATE,
+ zone_status_table[status]) ||
+ nvlist_add_string(nvl, ZONE_CB_OLDSTATE,
+ zone_status_table[zone->zone_status]) ||
+ nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) ||
+ nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) ||
+ sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS,
+ ZONE_EVENT_STATUS_SUBCLASS,
+ "sun.com", "kernel", nvl, EVCH_SLEEP)) {
+#ifdef DEBUG
+ (void) printf(
+ "Failed to allocate and send zone state change event.\n");
+#endif
+ }
+ nvlist_free(nvl);
+
zone->zone_status = status;
+
cv_broadcast(&zone->zone_cv);
}
diff --git a/usr/src/uts/common/sys/zone.h b/usr/src/uts/common/sys/zone.h
index cc2031ec2f..441ea8cb3e 100644
--- a/usr/src/uts/common/sys/zone.h
+++ b/usr/src/uts/common/sys/zone.h
@@ -77,6 +77,21 @@ extern "C" {
#define ZONE_ATTR_POOLID 6
#define ZONE_ATTR_INITPID 7
+#define ZONE_EVENT_CHANNEL "com.sun:zones:status"
+#define ZONE_EVENT_STATUS_CLASS "status"
+#define ZONE_EVENT_STATUS_SUBCLASS "change"
+
+#define ZONE_EVENT_UNINITIALIZED "uninitialized"
+#define ZONE_EVENT_READY "ready"
+#define ZONE_EVENT_RUNNING "running"
+#define ZONE_EVENT_SHUTTING_DOWN "shutting_down"
+
+#define ZONE_CB_NAME "zonename"
+#define ZONE_CB_NEWSTATE "newstate"
+#define ZONE_CB_OLDSTATE "oldstate"
+#define ZONE_CB_TIMESTAMP "when"
+#define ZONE_CB_ZONEID "zoneid"
+
#ifdef _SYSCALL32
typedef struct {
caddr32_t zone_name;