diff options
author | dstaff <none@none> | 2005-12-21 19:34:44 -0800 |
---|---|---|
committer | dstaff <none@none> | 2005-12-21 19:34:44 -0800 |
commit | cf8f45c7690afabe63bdb8066b11db58d708ad09 (patch) | |
tree | 05649165cccd7b495884695d11680184230e2051 | |
parent | 454be575b1b84146968106421f6fbe80ffffbe59 (diff) | |
download | illumos-joyent-cf8f45c7690afabe63bdb8066b11db58d708ad09.tar.gz |
PSARC/2005/607 Zone Events for Sun Cluster
5052723 RFE: need notifications of zone state transition
-rw-r--r-- | usr/src/lib/libzonecfg/Makefile.com | 2 | ||||
-rw-r--r-- | usr/src/lib/libzonecfg/common/libzonecfg.c | 337 | ||||
-rw-r--r-- | usr/src/lib/libzonecfg/spec/libzonecfg.spec | 30 | ||||
-rw-r--r-- | usr/src/uts/common/os/zone.c | 54 | ||||
-rw-r--r-- | usr/src/uts/common/sys/zone.h | 15 |
5 files changed, 437 insertions, 1 deletions
diff --git a/usr/src/lib/libzonecfg/Makefile.com b/usr/src/lib/libzonecfg/Makefile.com index d38aa2bdc0..5b3dcaaa25 100644 --- a/usr/src/lib/libzonecfg/Makefile.com +++ b/usr/src/lib/libzonecfg/Makefile.com @@ -34,7 +34,7 @@ include ../../Makefile.lib LIBS = $(DYNLIB) $(LINTLIB) LDLIBS += -lc -lsocket -lnsl -luuid -$(DYNLIB) := LDLIBS += -lxml2 +$(DYNLIB) := LDLIBS += -lxml2 -lnvpair -lsysevent CPPFLAGS += -I/usr/include/libxml2 -I$(SRCDIR) -D_REENTRANT $(LINTLIB) := SRCS= $(SRCDIR)/$(LINTSRC) diff --git a/usr/src/lib/libzonecfg/common/libzonecfg.c b/usr/src/lib/libzonecfg/common/libzonecfg.c index 416be740e3..2f549d228f 100644 --- a/usr/src/lib/libzonecfg/common/libzonecfg.c +++ b/usr/src/lib/libzonecfg/common/libzonecfg.c @@ -26,6 +26,9 @@ #pragma ident "%Z%%M% %I% %E% SMI" +#include <libsysevent.h> +#include <pthread.h> +#include <stdlib.h> #include <errno.h> #include <fnmatch.h> #include <strings.h> @@ -39,6 +42,7 @@ #include <sys/mntio.h> #include <sys/mnttab.h> #include <sys/types.h> +#include <sys/nvpair.h> #include <arpa/inet.h> #include <netdb.h> @@ -54,7 +58,11 @@ #include <libzonecfg.h> #include "zonecfg_impl.h" + #define _PATH_TMPFILE "/zonecfg.XXXXXX" +#define ZONE_CB_RETRY_COUNT 10 +#define ZONE_EVENT_PING_SUBCLASS "ping" +#define ZONE_EVENT_PING_PUBLISHER "solaris" /* Hard-code the DTD element/attribute/entity names just once, here. 
*/ #define DTD_ELEM_ATTR (const xmlChar *) "attr" @@ -107,6 +115,21 @@ struct zone_dochandle { char zone_dh_delete_name[ZONENAME_MAX]; }; +struct znotify { + void * zn_private; + evchan_t *zn_eventchan; + int (*zn_callback)(const char *zonename, zoneid_t zid, + const char *newstate, const char *oldstate, hrtime_t when, void *p); + pthread_mutex_t zn_mutex; + pthread_cond_t zn_cond; + pthread_mutex_t zn_bigmutex; + volatile enum {ZN_UNLOCKED, ZN_LOCKED, ZN_PING_INFLIGHT, + ZN_PING_RECEIVED} zn_state; + char zn_subscriber_id[MAX_SUBID_LEN]; + volatile boolean_t zn_failed; + int zn_failure_count; +}; + char *zonecfg_root = ""; /* @@ -3521,6 +3544,320 @@ zonecfg_valid_rctl(const char *name, const rctlblk_t *rctlblk) return (B_TRUE); } +/* + * There is always a race condition between reading the initial copy of + * a zone's state and its state changing. We address this by providing + * zonecfg_notify_critical_enter and zonecfg_notify_critical_exit functions. + * When zonecfg_notify_critical_enter is called, it acquires biglock and sets + * the state field to LOCKED. Biglock protects against other threads executing + * critical_enter and the state field protects against state changes during + * the critical period. + * + * If any state changes occur, zn_cb will set the failed field of the znotify + * structure. This will cause the critical_exit function to re-lock the + * channel and return an error. Since events may be delayed, the critical_exit + * function "flushes" the queue by putting an event on the queue and waiting for + * zn_cb to notify critical_exit that it received the ping event. 
+ */ +static const char * +string_get_tok(const char *in, char delim, int num) +{ + int i = 0; + + for (; i < num; in++) { + if (*in == delim) + i++; + if (*in == 0) + return (NULL); + } + return (in); +} + +static boolean_t +is_ping(sysevent_t *ev) +{ + if (strcmp(sysevent_get_subclass_name(ev), + ZONE_EVENT_PING_SUBCLASS) == 0) { + return (B_TRUE); + } else { + return (B_FALSE); + } +} + +static boolean_t +is_my_ping(sysevent_t *ev) +{ + const char *sender; + char mypid[sizeof (pid_t) * 3 + 1]; + + (void) snprintf(mypid, sizeof (mypid), "%i", getpid()); + sender = string_get_tok(sysevent_get_pub(ev), ':', 3); + if (sender == NULL) + return (B_FALSE); + if (strcmp(sender, mypid) != 0) + return (B_FALSE); + return (B_TRUE); +} + +static int +do_callback(struct znotify *zevtchan, sysevent_t *ev) +{ + nvlist_t *l; + int zid; + char *zonename; + char *newstate; + char *oldstate; + int ret; + hrtime_t when; + + if (strcmp(sysevent_get_subclass_name(ev), + ZONE_EVENT_STATUS_SUBCLASS) == 0) { + + if (sysevent_get_attr_list(ev, &l) != 0) { + if (errno == ENOMEM) { + zevtchan->zn_failure_count++; + return (EAGAIN); + } + return (0); + } + ret = 0; + + if ((nvlist_lookup_string(l, ZONE_CB_NAME, &zonename) == 0) && + (nvlist_lookup_string(l, ZONE_CB_NEWSTATE, &newstate) + == 0) && + (nvlist_lookup_string(l, ZONE_CB_OLDSTATE, &oldstate) + == 0) && + (nvlist_lookup_uint64(l, ZONE_CB_TIMESTAMP, + (uint64_t *)&when) == 0) && + (nvlist_lookup_int32(l, ZONE_CB_ZONEID, &zid) == 0)) { + ret = zevtchan->zn_callback(zonename, zid, newstate, + oldstate, when, zevtchan->zn_private); + } + + zevtchan->zn_failure_count = 0; + nvlist_free(l); + return (ret); + } else { + /* + * We have received an event in an unknown subclass. Ignore. 
+ */ + zevtchan->zn_failure_count = 0; + return (0); + } +} + +static int +zn_cb(sysevent_t *ev, void *p) +{ + struct znotify *zevtchan = p; + int error; + + (void) pthread_mutex_lock(&(zevtchan->zn_mutex)); + + if (is_ping(ev) && !is_my_ping(ev)) { + (void) pthread_mutex_unlock((&zevtchan->zn_mutex)); + return (0); + } + + if (zevtchan->zn_state == ZN_LOCKED) { + assert(!is_ping(ev)); + zevtchan->zn_failed = B_TRUE; + (void) pthread_mutex_unlock(&(zevtchan->zn_mutex)); + return (0); + } + + if (zevtchan->zn_state == ZN_PING_INFLIGHT) { + if (is_ping(ev)) { + zevtchan->zn_state = ZN_PING_RECEIVED; + (void) pthread_cond_signal(&(zevtchan->zn_cond)); + (void) pthread_mutex_unlock(&(zevtchan->zn_mutex)); + return (0); + } else { + zevtchan->zn_failed = B_TRUE; + (void) pthread_mutex_unlock(&(zevtchan->zn_mutex)); + return (0); + } + } + + if (zevtchan->zn_state == ZN_UNLOCKED) { + + error = do_callback(zevtchan, ev); + (void) pthread_mutex_unlock(&(zevtchan->zn_mutex)); + /* + * Every ENOMEM failure causes do_callback to increment + * zn_failure_count and every success causes it to + * set zn_failure_count to zero. If we got EAGAIN, + * we will sleep for zn_failure_count seconds and return + * EAGAIN to gpec to try again. + * + * After 55 seconds, or 10 tries, we give up and drop the + * event. 
+ */ + if (error == EAGAIN) { + if (zevtchan->zn_failure_count > ZONE_CB_RETRY_COUNT) { + return (0); + } + (void) sleep(zevtchan->zn_failure_count); + } + return (error); + } + + if (zevtchan->zn_state == ZN_PING_RECEIVED) { + (void) pthread_mutex_unlock(&(zevtchan->zn_mutex)); + return (0); + } + + abort(); + return (0); +} + +void +zonecfg_notify_critical_enter(void *h) +{ + struct znotify *zevtchan = h; + + (void) pthread_mutex_lock(&(zevtchan->zn_bigmutex)); + zevtchan->zn_state = ZN_LOCKED; +} + +int +zonecfg_notify_critical_exit(void * h) +{ + + struct znotify *zevtchan = h; + + if (zevtchan->zn_state == ZN_UNLOCKED) + return (0); + + (void) pthread_mutex_lock(&(zevtchan->zn_mutex)); + zevtchan->zn_state = ZN_PING_INFLIGHT; + + sysevent_evc_publish(zevtchan->zn_eventchan, ZONE_EVENT_STATUS_CLASS, + ZONE_EVENT_PING_SUBCLASS, ZONE_EVENT_PING_PUBLISHER, + zevtchan->zn_subscriber_id, NULL, EVCH_SLEEP); + + while (zevtchan->zn_state != ZN_PING_RECEIVED) { + (void) pthread_cond_wait(&(zevtchan->zn_cond), + &(zevtchan->zn_mutex)); + } + + if (zevtchan->zn_failed == B_TRUE) { + zevtchan->zn_state = ZN_LOCKED; + zevtchan->zn_failed = B_FALSE; + (void) pthread_mutex_unlock(&(zevtchan->zn_mutex)); + return (1); + } + + zevtchan->zn_state = ZN_UNLOCKED; + (void) pthread_mutex_unlock(&(zevtchan->zn_mutex)); + (void) pthread_mutex_unlock(&(zevtchan->zn_bigmutex)); + return (0); +} + +void +zonecfg_notify_critical_abort(void *h) +{ + struct znotify *zevtchan = h; + + zevtchan->zn_state = ZN_UNLOCKED; + zevtchan->zn_failed = B_FALSE; + /* + * Don't do anything about zn_mutex. If it is held, it could only be + * held by zn_cb and it will be unlocked soon. 
+ */ + (void) pthread_mutex_unlock(&(zevtchan->zn_bigmutex)); +} + +void * +zonecfg_notify_bind(int(*func)(const char *zonename, zoneid_t zid, + const char *newstate, const char *oldstate, hrtime_t when, void *p), + void *p) +{ + struct znotify *zevtchan; + int i = 1; + int r; + + zevtchan = malloc(sizeof (struct znotify)); + + if (zevtchan == NULL) + return (NULL); + + zevtchan->zn_private = p; + zevtchan->zn_callback = func; + zevtchan->zn_state = ZN_UNLOCKED; + zevtchan->zn_failed = B_FALSE; + + if (pthread_mutex_init(&(zevtchan->zn_mutex), NULL)) + goto out2; + if (pthread_cond_init(&(zevtchan->zn_cond), NULL)) { + (void) pthread_mutex_destroy(&(zevtchan->zn_mutex)); + goto out2; + } + if (pthread_mutex_init(&(zevtchan->zn_bigmutex), NULL)) { + (void) pthread_mutex_destroy(&(zevtchan->zn_mutex)); + (void) pthread_cond_destroy(&(zevtchan->zn_cond)); + goto out2; + } + + if (sysevent_evc_bind(ZONE_EVENT_CHANNEL, &(zevtchan->zn_eventchan), + 0) != 0) + goto out2; + + do { + /* + * At 4 digits the subscriber ID gets too long and we have + * no chance of successfully registering. + */ + if (i > 999) + goto out; + + (void) sprintf(zevtchan->zn_subscriber_id, "zone_%li_%i", + getpid() % 999999l, i); + + r = sysevent_evc_subscribe(zevtchan->zn_eventchan, + zevtchan->zn_subscriber_id, ZONE_EVENT_STATUS_CLASS, zn_cb, + zevtchan, 0); + + i++; + + } while (r); + + return (zevtchan); +out: + sysevent_evc_unbind(zevtchan->zn_eventchan); + (void) pthread_mutex_destroy(&zevtchan->zn_mutex); + (void) pthread_cond_destroy(&zevtchan->zn_cond); + (void) pthread_mutex_destroy(&(zevtchan->zn_bigmutex)); +out2: + free(zevtchan); + + return (NULL); +} + +void +zonecfg_notify_unbind(void *handle) +{ + + int ret; + + sysevent_evc_unbind(((struct znotify *)handle)->zn_eventchan); + /* + * Check that all evc threads have gone away. This should be + * enforced by sysevent_evc_unbind. 
+ */ + ret = pthread_mutex_trylock(&((struct znotify *)handle)->zn_mutex); + + if (ret) + abort(); + + (void) pthread_mutex_unlock(&((struct znotify *)handle)->zn_mutex); + (void) pthread_mutex_destroy(&((struct znotify *)handle)->zn_mutex); + (void) pthread_cond_destroy(&((struct znotify *)handle)->zn_cond); + (void) pthread_mutex_destroy(&((struct znotify *)handle)->zn_bigmutex); + + free(handle); +} + static int zonecfg_add_ds_core(zone_dochandle_t handle, struct zone_dstab *tabptr) { diff --git a/usr/src/lib/libzonecfg/spec/libzonecfg.spec b/usr/src/lib/libzonecfg/spec/libzonecfg.spec index a0cdb3db98..30b22a3526 100644 --- a/usr/src/lib/libzonecfg/spec/libzonecfg.spec +++ b/usr/src/lib/libzonecfg/spec/libzonecfg.spec @@ -664,6 +664,36 @@ declaration int zonecfg_construct_rctlblk(const struct zone_rctlvaltab *, rctlbl version SUNWprivate_1.1 end +function zonecfg_notify_bind +include <libzonecfg.h> +declaration void * zonecfg_notify_bind(int(*)(const char *, zoneid_t zid, const char *, const char *, void *), void *) +version SUNWprivate_1.1 +end + +function zonecfg_notify_unbind +include <libzonecfg.h> +delcaration void zonecfg_notify_unbind(void *) +version SUNWprivate_1.1 +end + +function zonecfg_notify_critical_enter +include <libzonecfg.h> +declaration void zonecfg_notify_critical_enter(void *); +version SUNWprivate_1.1 +end + +function zonecfg_notify_critical_abort +include <libzonecfg.h> +declaration void zonecfg_notify_critical_abort(void *); +version SUNWprivate_1.1 +end + +function zonecfg_notify_critical_exit +include <libzonecfg.h> +declaration int zonecfg_notify_critical_exit(void *); +version SUNWprivate_1.1 +end + function zonecfg_open_scratch include <libzonecfg.h> declaration FILE *zonecfg_open_scratch(const char *, boolean_t) diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c index fd3de9852f..9236430b40 100644 --- a/usr/src/uts/common/os/zone.c +++ b/usr/src/uts/common/os/zone.c @@ -194,6 +194,7 @@ #include 
<sys/pathname.h> #include <sys/proc.h> #include <sys/project.h> +#include <sys/sysevent.h> #include <sys/task.h> #include <sys/systm.h> #include <sys/types.h> @@ -287,6 +288,27 @@ static kmutex_t zone_deathrow_lock; /* number of zones is limited by virtual interface limit in IP */ uint_t maxzones = 8192; +/* Event channel to send zone state change notifications */ +evchan_t *zone_event_chan; + +/* + * This table holds the mapping from kernel zone states to + * states visible in the state notification API. + * The idea is that we only expose "obvious" states and + * do not expose states which are just implementation details. + */ +const char *zone_status_table[] = { + ZONE_EVENT_UNINITIALIZED, /* uninitialized */ + ZONE_EVENT_READY, /* ready */ + ZONE_EVENT_READY, /* booting */ + ZONE_EVENT_RUNNING, /* running */ + ZONE_EVENT_SHUTTING_DOWN, /* shutting_down */ + ZONE_EVENT_SHUTTING_DOWN, /* empty */ + ZONE_EVENT_SHUTTING_DOWN, /* down */ + ZONE_EVENT_SHUTTING_DOWN, /* dying */ + ZONE_EVENT_UNINITIALIZED, /* dead */ +}; + /* * This isn't static so lint doesn't complain. */ @@ -986,6 +1008,7 @@ zone_init(void) rctl_set_t *set; rctl_alloc_gp_t *gp; rctl_entity_p_t e; + int res; ASSERT(curproc == &p0); @@ -1064,6 +1087,15 @@ zone_init(void) * will be set when the root filesystem is mounted). 
*/ global_zone = &zone0; + + /* + * Setup an event channel to send zone status change notifications on + */ + res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan, + EVCH_CREAT); + + if (res) + panic("Sysevent_evc_bind failed during zone setup.\n"); } static void @@ -1120,10 +1152,32 @@ zone_free(zone_t *zone) static void zone_status_set(zone_t *zone, zone_status_t status) { + + nvlist_t *nvl = NULL; ASSERT(MUTEX_HELD(&zone_status_lock)); ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE && status >= zone_status_get(zone)); + + if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) || + nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) || + nvlist_add_string(nvl, ZONE_CB_NEWSTATE, + zone_status_table[status]) || + nvlist_add_string(nvl, ZONE_CB_OLDSTATE, + zone_status_table[zone->zone_status]) || + nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) || + nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) || + sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS, + ZONE_EVENT_STATUS_SUBCLASS, + "sun.com", "kernel", nvl, EVCH_SLEEP)) { +#ifdef DEBUG + (void) printf( + "Failed to allocate and send zone state change event.\n"); +#endif + } + nvlist_free(nvl); + zone->zone_status = status; + cv_broadcast(&zone->zone_cv); } diff --git a/usr/src/uts/common/sys/zone.h b/usr/src/uts/common/sys/zone.h index cc2031ec2f..441ea8cb3e 100644 --- a/usr/src/uts/common/sys/zone.h +++ b/usr/src/uts/common/sys/zone.h @@ -77,6 +77,21 @@ extern "C" { #define ZONE_ATTR_POOLID 6 #define ZONE_ATTR_INITPID 7 +#define ZONE_EVENT_CHANNEL "com.sun:zones:status" +#define ZONE_EVENT_STATUS_CLASS "status" +#define ZONE_EVENT_STATUS_SUBCLASS "change" + +#define ZONE_EVENT_UNINITIALIZED "uninitialized" +#define ZONE_EVENT_READY "ready" +#define ZONE_EVENT_RUNNING "running" +#define ZONE_EVENT_SHUTTING_DOWN "shutting_down" + +#define ZONE_CB_NAME "zonename" +#define ZONE_CB_NEWSTATE "newstate" +#define ZONE_CB_OLDSTATE "oldstate" +#define ZONE_CB_TIMESTAMP 
"when" +#define ZONE_CB_ZONEID "zoneid" + #ifdef _SYSCALL32 typedef struct { caddr32_t zone_name; |