diff options
Diffstat (limited to 'usr/src')
-rw-r--r-- | usr/src/cmd/svc/startd/startd.c | 133 | ||||
-rw-r--r-- | usr/src/cmd/svc/svccfg/svccfg_libscf.c | 231 |
2 files changed, 363 insertions, 1 deletions
diff --git a/usr/src/cmd/svc/startd/startd.c b/usr/src/cmd/svc/startd/startd.c index 6e3ea9876b..c5307879e2 100644 --- a/usr/src/cmd/svc/startd/startd.c +++ b/usr/src/cmd/svc/startd/startd.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright 2015, Joyent, Inc. */ /* @@ -42,6 +42,137 @@ * engine commands by executing methods, updating the repository, and sending * feedback (mostly state updates) to the graph engine. * + * Overview of the SMF Architecture + * + * There are a few different components that make up SMF and are responsible + * for different pieces of functionality that are used: + * + * svc.startd(1M): A daemon that is in charge of starting, stopping, and + * restarting services and instances. + * svc.configd(1M): A daemon that manages the repository that stores + * information, property groups, and state of the different services and + * instances. + * libscf(3LIB): A C library that provides the glue for communicating, + * accessing, and updating information about services and instances. + * svccfg(1M): A utility to add and remove services as well as change the + * properties associated with different services and instances. + * svcadm(1M): A utility to control the different instance of a service. You + * can use this to enable and disable them among some other useful things. + * svcs(1): A utility that reports on the status of various services on the + * system. + * + * The following block diagram explains how these components communicate: + * + * The SMF Block Diagram + * Repository + * This attempts to show +---------+ +--------+ + * the relations between | | SQL | | + * the different pieces | configd |<----------->| SQLite | + * that make SMF work and | | Transaction | | + * users/administrators +---------+ +--------+ + * call into. 
^ ^ + * | | + * door_call(3C)| | door_call(3C) + * | | + * v v + * +----------+ +--------+ +--------+ +----------+ + * | | | | | | | svccfg | + * | startd |<--->| libscf | | libscf |<---->| svcadm | + * | | | (3LIB) | | (3LIB) | | svcs | + * +----------+ +--------+ +--------+ +----------+ + * ^ ^ + * | | fork(2)/exec(2) + * | | libcontract(3LIB) + * v v Various System/User services + * +-------------------------------------------------------------------+ + * | system/filesystem/local:default system/coreadm:default | + * | network/loopback:default system/zones:default | + * | milestone/multi-user:default system/cron:default | + * | system/console-login:default network/ssh:default | + * | system/pfexec:default system/svc/restarter:default | + * +-------------------------------------------------------------------+ + * + * Chatting with Configd and Sharing Repository Information + * + * As you run commands with svcs, svccfg, and svcadm, they are all creating a + * libscf handle to communicate with configd. As calls are made via libscf they + * ultimately go and talk to configd to get information. However, how we + * actually are talking to configd is not as straightforward as it appears. + * + * When configd starts up it creates a door located at + * /etc/svc/volatile/repository_door. This door runs the routine called + * main_switcher() from usr/src/cmd/svc/configd/maindoor.c. When you first + * invoke svc(cfg|s|adm), one of the first things that occurs is creating a + * scf_handle_t and binding it to configd by calling scf_handle_bind(). This + * function makes a door call to configd and gets returned a new file + * descriptor. This file descriptor is itself another door which calls into + * configd's client_switcher(). This is the door that is actually used when + * getting and fetching properties, and many other useful things. + * + * svc.startd needs a way to notice the changes that occur to the repository. 
+ * For example, if you enabled a service that was not previously running, it's + * up to startd to notice that this has happened, check dependencies, and + * eventually start up the service. The way it gets these notifications is via + * a thread whose sole purpose in life is to call _scf_notify_wait(). This + * function acts like poll(2) but for changes that occur in the repository. + * Once this thread gets the event, it dispatches the event appropriately. + * + * The Events of svc.startd + * + * svc.startd has to handle a lot of complexity. Understanding how you go from + * getting the notification that a service was enabled to actually enabling it + * is not obvious from a cursory glance. The first thing to keep in mind is + * that startd maintains a graph of all the related services and instances so + * it can keep track of what is enabled, what dependencies exist, etc. all so + * that it can answer the question of what is affected by a change. Internally + * there are a lot of different queues for events, threads to process these + * queues, and different paths to have events enter these queues. What follows + * is a diagram that attempts to explain some of those paths, though it's + * important to note that for some of these pieces, such as the graph and + * vertex events, there are many additional ways and code paths these threads + * and functions can take. And yes, restarter_event_enqueue() is not the same + * thing as restarter_queue_event().
+ * + * Threads/Functions Queues Threads/Functions + * + * called by various + * +----------------+ +-------+ +-------------+ + * --->| graph_protocol | graph_event | graph | graph_event_ | graph_event | + * --->| _send_event() |------------>| event |----------------->| _thread | + * +----------------+ _enqueue() | queue | dequeue() +-------------+ + * +-------+ | + * _scf_notify_wait() vertex_send_event()| + * | v + * | +------------------+ +--------------------+ + * +->| repository_event | vertex_send_event() | restarter_protocol | + * | _thread |----------------------------->| _send_event() | + * +------------------+ +--------------------+ + * | | out to other + * restarter_ restarter_ | | restarters + * event_dequeue() +-----------+ event_ | | not startd + * +----------------| restarter |<------------+ +-------------> + * v | event | enqueue() + * +-----------------+ | queue | +------------------> + * | restarter_event | +-----------+ |+-----------------> + * | _thread | ||+----------------> + * +-----------------+ ||| start/stop inst + * | +--------------+ +--------------------+ + * | | instance | | restarter_process_ | + * +-------------->| event |------>| events | + * restarter_ | queue | | per-instance lwp | + * queue_event() +--------------+ +--------------------+ + * ||| various funcs + * ||| controlling + * ||| instance state + * ||+---------------> + * |+----------------> + * +-----------------> + * + * What's important to take away is that there is a queue for each instance on + * the system that handles events related to dealing directly with that + * instance and that events can be added to it because of changes to properties + * that are made to configd and acted upon asynchronously by startd. 
+ * * Error handling * * In general, when svc.startd runs out of memory it reattempts a few times, diff --git a/usr/src/cmd/svc/svccfg/svccfg_libscf.c b/usr/src/cmd/svc/svccfg/svccfg_libscf.c index 5a96e5eac4..df7f7af209 100644 --- a/usr/src/cmd/svc/svccfg/svccfg_libscf.c +++ b/usr/src/cmd/svc/svccfg/svccfg_libscf.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, Joyent, Inc. All rights reserved. * Copyright 2012 Milan Jurik. All rights reserved. */ @@ -44,6 +45,7 @@ #include <stdarg.h> #include <string.h> #include <strings.h> +#include <time.h> #include <unistd.h> #include <wait.h> #include <poll.h> @@ -241,6 +243,9 @@ static const char *emsg_dpt_no_dep; static int li_only = 0; static int no_refresh = 0; +/* how long in ns we should wait between checks for a pg */ +static uint64_t pg_timeout = 100 * (NANOSEC / MILLISEC); + /* import globals, to minimize allocations */ static scf_scope_t *imp_scope = NULL; static scf_service_t *imp_svc = NULL, *imp_tsvc = NULL; @@ -6751,6 +6756,203 @@ connaborted: } /* + * When an instance is imported we end up telling configd about it. Once we tell + * configd about these changes, startd eventually notices. If this is a new + * instance, the manifest may not specify the SCF_PG_RESTARTER (restarter) + * property group. However, many of the other tools expect that this property + * group exists and has certain values. + * + * These values are added asynchronously by startd. We should not return from + * this routine until we can verify that the property group we need is there. + * + * Before we go ahead and verify this, we have to ask ourselves an important + * question: Is the early manifest service currently running? Because if it is + * running and it has invoked us, then the service will never get a restarter + * property because svc.startd is blocked on EMI finishing before it lets itself + * fully connect to svc.configd. 
Of course, this means that this race condition + * is in fact impossible to 100% eliminate. + * + * svc.startd makes sure that EMI only runs once and has succeeded by checking + * the state of the EMI instance. If it is online it bails out and makes sure + * that it doesn't run again. In this case, we're going to do something similar, + * only if the state is online, then we're going to actually verify. EMI always + * has to be present, but it can be explicitly disabled to reduce the amount of + * damage it can cause. If EMI has been disabled then we no longer have to worry + * about the implicit race condition and can go ahead and check things. If EMI + * is in some state that isn't online or disabled and isn't running, then we + * assume that things are rather bad and we're not going to get in your way, + * even if the rest of SMF does. + * + * Returns 0 on success or returns an errno. + */ +#ifndef NATIVE_BUILD +static int +lscf_instance_verify(scf_scope_t *scope, entity_t *svc, entity_t *inst) +{ + int ret, err; + struct timespec ts; + char *emi_state; + + /* + * smf_get_state does not distinguish between its different failure + * modes: memory allocation failures and SMF internal failures. + */ + if ((emi_state = smf_get_state(SCF_INSTANCE_EMI)) == NULL) + return (EAGAIN); + + /* + * As per the block comment for this function check the state of EMI + */ + if (strcmp(emi_state, SCF_STATE_STRING_ONLINE) != 0 && + strcmp(emi_state, SCF_STATE_STRING_DISABLED) != 0) { + warn(gettext("Not validating instance %s:%s because EMI's " + "state is %s\n"), svc->sc_name, inst->sc_name, emi_state); + free(emi_state); + return (0); + } + + free(emi_state); + + /* + * First we have to look up the service. + */ + if ((ret = scf_scope_get_service(scope, svc->sc_name, imp_svc)) != 0) { + ret = scf_error(); + warn(gettext("Failed to look up service: %s\n"), svc->sc_name); + return (ret); + } + + /* + * We should always be able to get the instance.
It should already + * exist because we just created it or got it. There probably is a + * slim chance that someone may have come in and deleted it though from + * under us. + */ + if ((ret = scf_service_get_instance(imp_svc, inst->sc_name, imp_inst)) + != 0) { + ret = scf_error(); + warn(gettext("Failed to verify instance: %s\n"), inst->sc_name); + switch (ret) { + case SCF_ERROR_DELETED: + err = ENODEV; + break; + case SCF_ERROR_CONNECTION_BROKEN: + warn(gettext("Lost repository connection\n")); + err = ECONNABORTED; + break; + case SCF_ERROR_NOT_FOUND: + warn(gettext("Instance \"%s\" disappeared out from " + "under us.\n"), inst->sc_name); + err = ENOENT; + break; + default: + bad_error("scf_service_get_instance", ret); + } + + return (err); + } + + /* + * An astute observer may want to use _scf_wait_pg which would notify us + * of a property group change, unfortunately that does not work if the + * property group in question does not exist. So instead we have to + * manually poll and ask smf the best way to get to it. + */ + while ((ret = scf_instance_get_pg(imp_inst, SCF_PG_RESTARTER, imp_pg)) + != SCF_SUCCESS) { + ret = scf_error(); + if (ret != SCF_ERROR_NOT_FOUND) { + warn(gettext("Failed to get restarter property " + "group for instance: %s\n"), inst->sc_name); + switch (ret) { + case SCF_ERROR_DELETED: + err = ENODEV; + break; + case SCF_ERROR_CONNECTION_BROKEN: + warn(gettext("Lost repository connection\n")); + err = ECONNABORTED; + break; + default: + bad_error("scf_instance_get_pg", ret); + } + + return (err); + } + + ts.tv_sec = pg_timeout / NANOSEC; + ts.tv_nsec = pg_timeout % NANOSEC; + + (void) nanosleep(&ts, NULL); + } + + /* + * svcadm also expects that the SCF_PROPERTY_STATE property is present. + * So in addition to the property group being present, we need to wait + * for the property to be there in some form. + * + * Note that a property group is a frozen snapshot in time.
To properly + * get beyond this, you have to refresh the property group each time. + */ + while ((ret = scf_pg_get_property(imp_pg, SCF_PROPERTY_STATE, + imp_prop)) != 0) { + + ret = scf_error(); + if (ret != SCF_ERROR_NOT_FOUND) { + warn(gettext("Failed to get property %s from the " + "restarter property group of instance %s\n"), + SCF_PROPERTY_STATE, inst->sc_name); + switch (ret) { + case SCF_ERROR_CONNECTION_BROKEN: + warn(gettext("Lost repository connection\n")); + err = ECONNABORTED; + break; + case SCF_ERROR_DELETED: + err = ENODEV; + break; + default: + bad_error("scf_pg_get_property", ret); + } + + return (err); + } + + ts.tv_sec = pg_timeout / NANOSEC; + ts.tv_nsec = pg_timeout % NANOSEC; + + (void) nanosleep(&ts, NULL); + + ret = scf_instance_get_pg(imp_inst, SCF_PG_RESTARTER, imp_pg); + if (ret != SCF_SUCCESS) { + /* + * scf_instance_get_pg() only reports failure as -1; + * the actual SCF error code has to be fetched from + * scf_error() before it can be matched below. + */ + ret = scf_error(); + warn(gettext("Failed to get restarter property " + "group for instance: %s\n"), inst->sc_name); + switch (ret) { + case SCF_ERROR_DELETED: + err = ENODEV; + break; + case SCF_ERROR_CONNECTION_BROKEN: + warn(gettext("Lost repository connection\n")); + err = ECONNABORTED; + break; + default: + bad_error("scf_instance_get_pg", ret); + } + + return (err); + } + } + + /* + * We don't have to free the property groups or other values that we got + * because we stored them in global variables that are allocated and + * freed by the routines that call into these functions. Unless of + * course the rest of the code here that we are basing this on is + * mistaken. + */ + return (0); +} +#endif + +/* * If the service is missing, create it, import its properties, and import the * instances. Since the service is brand new, it should be empty, and if we * run into any existing entities (SCF_ERROR_EXISTS), abort. @@ -8122,7 +8324,36 @@ lscf_bundle_import(bundle_t *bndl, const char *filename, uint_t flags) goto progress; result = 0; + + /* + * This snippet of code assumes that we are running svccfg as we + * normally do -- with svc.startd running.
Of course, that is + * not actually the case all the time because we also use a + * variant of svc.configd and svccfg which are only meant to + * run during the build process. During this time we have no + * svc.startd, so this check would hang the build process. + */ +#ifndef NATIVE_BUILD + /* + * Verify that the restarter group is present + */ + for (svc = uu_list_first(bndl->sc_bundle_services); + svc != NULL; + svc = uu_list_next(bndl->sc_bundle_services, svc)) { + + insts = svc->sc_u.sc_service.sc_service_instances; + + for (inst = uu_list_first(insts); + inst != NULL; + inst = uu_list_next(insts, inst)) { + if (lscf_instance_verify(imp_scope, svc, + inst) != 0) + goto progress; + } + } +#endif goto out; + } if (uu_error() != UU_ERROR_CALLBACK_FAILED) |