summaryrefslogtreecommitdiff
path: root/usr/src
diff options
context:
space:
mode:
Diffstat (limited to 'usr/src')
-rw-r--r--usr/src/cmd/svc/startd/startd.c133
-rw-r--r--usr/src/cmd/svc/svccfg/svccfg_libscf.c231
2 files changed, 363 insertions, 1 deletions
diff --git a/usr/src/cmd/svc/startd/startd.c b/usr/src/cmd/svc/startd/startd.c
index 6e3ea9876b..c5307879e2 100644
--- a/usr/src/cmd/svc/startd/startd.c
+++ b/usr/src/cmd/svc/startd/startd.c
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright 2015, Joyent, Inc.
*/
/*
@@ -42,6 +42,137 @@
* engine commands by executing methods, updating the repository, and sending
* feedback (mostly state updates) to the graph engine.
*
+ * Overview of the SMF Architecture
+ *
+ * There are a few different components that make up SMF and are responsible
+ * for different pieces of functionality that are used:
+ *
+ * svc.startd(1M): A daemon that is in charge of starting, stopping, and
+ * restarting services and instances.
+ * svc.configd(1M): A daemon that manages the repository that stores
+ * information, property groups, and state of the different services and
+ * instances.
+ * libscf(3LIB): A C library that provides the glue for communicating,
+ * accessing, and updating information about services and instances.
+ * svccfg(1M): A utility to add and remove services as well as change the
+ * properties associated with different services and instances.
+ * svcadm(1M): A utility to control the different instances of a service. You
+ * can use this to enable and disable them among some other useful things.
+ * svcs(1): A utility that reports on the status of various services on the
+ * system.
+ *
+ * The following block diagram explains how these components communicate:
+ *
+ * The SMF Block Diagram
+ * Repository
+ * This attempts to show +---------+ +--------+
+ * the relations between | | SQL | |
+ * the different pieces | configd |<----------->| SQLite |
+ * that make SMF work and | | Transaction | |
+ * users/administrators +---------+ +--------+
+ * call into. ^ ^
+ * | |
+ * door_call(3C)| | door_call(3C)
+ * | |
+ * v v
+ * +----------+ +--------+ +--------+ +----------+
+ * | | | | | | | svccfg |
+ * | startd |<--->| libscf | | libscf |<---->| svcadm |
+ * | | | (3LIB) | | (3LIB) | | svcs |
+ * +----------+ +--------+ +--------+ +----------+
+ * ^ ^
+ * | | fork(2)/exec(2)
+ * | | libcontract(3LIB)
+ * v v Various System/User services
+ * +-------------------------------------------------------------------+
+ * | system/filesystem/local:default system/coreadm:default |
+ * | network/loopback:default system/zones:default |
+ * | milestone/multi-user:default system/cron:default |
+ * | system/console-login:default network/ssh:default |
+ * | system/pfexec:default system/svc/restarter:default |
+ * +-------------------------------------------------------------------+
+ *
+ * Chatting with Configd and Sharing Repository Information
+ *
+ * As you run commands with svcs, svccfg, and svcadm, they are all creating a
+ * libscf handle to communicate with configd. As calls are made via libscf they
+ * ultimately go and talk to configd to get information. However, how we
+ * actually are talking to configd is not as straightforward as it appears.
+ *
+ * When configd starts up it creates a door located at
+ * /etc/svc/volatile/repository_door. This door runs the routine called
+ * main_switcher() from usr/src/cmd/svc/configd/maindoor.c. When you first
+ * invoke svc(cfg|s|adm), one of the first things that occurs is creating a
+ * scf_handle_t and binding it to configd by calling scf_handle_bind(). This
+ * function makes a door call to configd and gets returned a new file
+ * descriptor. This file descriptor is itself another door which calls into
+ * configd's client_switcher(). This is the door that is actually used when
+ * getting and fetching properties, and many other useful things.
+ *
+ * svc.startd needs a way to notice the changes that occur to the repository.
+ * For example, if you enabled a service that was not previously running, it's
+ * up to startd to notice that this has happened, check dependencies, and
+ * eventually start up the service. The way it gets these notifications is via
+ * a thread whose sole purpose in life is to call _scf_notify_wait(). This
+ * function acts like poll(2) but for changes that occur in the repository.
+ * Once this thread gets the event, it dispatches the event appropriately.
+ *
+ * The Events of svc.startd
+ *
+ * svc.startd has to handle a lot of complexity. Understanding how you go from
+ * getting the notification that a service was enabled to actually enabling it
+ * is not obvious from a cursory glance. The first thing to keep in mind is
+ * that startd maintains a graph of all the related services and instances so
+ * it can keep track of what is enabled, what dependencies exist, etc. all so
+ * that it can answer the question of what is affected by a change. Internally
+ * there are a lot of different queues for events, threads to process these
+ * queues, and different paths to have events enter these queues. What follows
+ * is a diagram that attempts to explain some of those paths, though it's
+ * important to note that for some of these pieces, such as the graph and
+ * vertex events, there are many additional ways and code paths these threads
+ * and functions can take. And yes, restarter_event_enqueue() is not the same
+ * thing as restarter_queue_event().
+ *
+ * Threads/Functions Queues Threads/Functions
+ *
+ * called by various
+ * +----------------+ +-------+ +-------------+
+ * --->| graph_protocol | graph_event | graph | graph_event_ | graph_event |
+ * --->| _send_event() |------------>| event |----------------->| _thread |
+ * +----------------+ _enqueue() | queue | dequeue() +-------------+
+ * +-------+ |
+ * _scf_notify_wait() vertex_send_event()|
+ * | v
+ * | +------------------+ +--------------------+
+ * +->| repository_event | vertex_send_event() | restarter_protocol |
+ * | _thread |----------------------------->| _send_event() |
+ * +------------------+ +--------------------+
+ * | | out to other
+ * restarter_ restarter_ | | restarters
+ * event_dequeue() +-----------+ event_ | | not startd
+ * +----------------| restarter |<------------+ +------------->
+ * v | event | enqueue()
+ * +-----------------+ | queue | +------------------>
+ * | restarter_event | +-----------+ |+----------------->
+ * | _thread | ||+---------------->
+ * +-----------------+ ||| start/stop inst
+ * | +--------------+ +--------------------+
+ * | | instance | | restarter_process_ |
+ * +-------------->| event |------>| events |
+ * restarter_ | queue | | per-instance lwp |
+ * queue_event() +--------------+ +--------------------+
+ * ||| various funcs
+ * ||| controlling
+ * ||| instance state
+ * ||+--------------->
+ * |+---------------->
+ * +----------------->
+ *
+ * What's important to take away is that there is a queue for each instance on
+ * the system that handles events related to dealing directly with that
+ * instance and that events can be added to it because of changes to properties
+ * that are made to configd and acted upon asynchronously by startd.
+ *
* Error handling
*
* In general, when svc.startd runs out of memory it reattempts a few times,
diff --git a/usr/src/cmd/svc/svccfg/svccfg_libscf.c b/usr/src/cmd/svc/svccfg/svccfg_libscf.c
index 5a96e5eac4..df7f7af209 100644
--- a/usr/src/cmd/svc/svccfg/svccfg_libscf.c
+++ b/usr/src/cmd/svc/svccfg/svccfg_libscf.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, Joyent, Inc. All rights reserved.
* Copyright 2012 Milan Jurik. All rights reserved.
*/
@@ -44,6 +45,7 @@
#include <stdarg.h>
#include <string.h>
#include <strings.h>
+#include <time.h>
#include <unistd.h>
#include <wait.h>
#include <poll.h>
@@ -241,6 +243,9 @@ static const char *emsg_dpt_no_dep;
static int li_only = 0;
static int no_refresh = 0;
+/*
+ * Interval, in nanoseconds, that lscf_instance_verify() sleeps between
+ * successive checks for a property group (or property) that has not shown up
+ * in the repository yet.  Presumably 100 ms, assuming NANOSEC and MILLISEC
+ * are the usual per-second unit counts -- TODO confirm against <sys/time.h>.
+ */
+static uint64_t pg_timeout = 100 * (NANOSEC / MILLISEC);
+
/* import globals, to minimize allocations */
static scf_scope_t *imp_scope = NULL;
static scf_service_t *imp_svc = NULL, *imp_tsvc = NULL;
@@ -6751,6 +6756,203 @@ connaborted:
}
/*
+ * When an instance is imported we end up telling configd about it. Once we tell
+ * configd about these changes, startd eventually notices. If this is a new
+ * instance, the manifest may not specify the SCF_PG_RESTARTER (restarter)
+ * property group. However, many of the other tools expect that this property
+ * group exists and has certain values.
+ *
+ * These values are added asynchronously by startd. We should not return from
+ * this routine until we can verify that the property group we need is there.
+ *
+ * Before we go ahead and verify this, we have to ask ourselves an important
+ * question: Is the early manifest service currently running? Because if it is
+ * running and it has invoked us, then the service will never get a restarter
+ * property because svc.startd is blocked on EMI finishing before it lets itself
+ * fully connect to svc.configd. Of course, this means that this race condition
+ * is in fact impossible to 100% eliminate.
+ *
+ * svc.startd makes sure that EMI only runs once and has succeeded by checking
+ * the state of the EMI instance. If it is online it bails out and makes sure
+ * that it doesn't run again. In this case, we're going to do something similar,
+ * only if the state is online, then we're going to actually verify. EMI always
+ * has to be present, but it can be explicitly disabled to reduce the amount of
+ * damage it can cause. If EMI has been disabled then we no longer have to worry
+ * about the implicit race condition and can go ahead and check things. If EMI
+ * is in some state that isn't online or disabled and isn't running, then we
+ * assume that things are rather bad and we're not going to get in your way,
+ * even if the rest of SMF does.
+ *
+ * Returns 0 on success or returns an errno.
+ */
+#ifndef NATIVE_BUILD
+/*
+ * Helper for lscf_instance_verify(): translate a libscf error code (as
+ * returned by scf_error()) that was hit while looking up the instance, its
+ * restarter property group, or the state property into an errno.
+ *
+ *   func	name of the libscf routine that failed, used only in the
+ *		diagnostic emitted by bad_error().
+ *   scf_err	the value obtained from scf_error() right after the failure.
+ *
+ * A deleted entity maps to ENODEV and a broken repository connection maps to
+ * ECONNABORTED (after warning the user).  Anything else is unexpected and is
+ * handed to bad_error().
+ */
+static int
+lscf_verify_scf_errno(const char *func, int scf_err)
+{
+	int err = 0;
+
+	switch (scf_err) {
+	case SCF_ERROR_DELETED:
+		err = ENODEV;
+		break;
+	case SCF_ERROR_CONNECTION_BROKEN:
+		warn(gettext("Lost repository connection\n"));
+		err = ECONNABORTED;
+		break;
+	default:
+		bad_error(func, scf_err);
+	}
+
+	return (err);
+}
+
+/*
+ * Wait until the restarter property group, and the state property inside it,
+ * exist for the instance `inst' of service `svc' in `scope'.
+ *
+ * Verification is skipped (returning 0 after a warning) when the EMI
+ * instance is in any state other than online or disabled.
+ *
+ * Returns 0 on success or an errno:
+ *   EAGAIN		smf_get_state() could not report EMI's state
+ *   ENOENT		the service or instance could not be found
+ *   ENODEV		an entity was deleted while we were looking at it
+ *   ECONNABORTED	the repository connection was lost
+ *   EINVAL		any other failure looking up the service
+ *
+ * The lookups use the imp_svc, imp_inst, imp_pg and imp_prop globals, so
+ * nothing is allocated or freed here apart from the EMI state string.
+ *
+ * NOTE(review): the polling loops have no upper bound; if the property group
+ * never appears this routine never returns.
+ */
+static int
+lscf_instance_verify(scf_scope_t *scope, entity_t *svc, entity_t *inst)
+{
+	int ret;
+	struct timespec ts;
+	char *emi_state;
+
+	/*
+	 * smf_get_state does not distinguish between its different failure
+	 * modes: memory allocation failures and SMF internal failures.
+	 */
+	if ((emi_state = smf_get_state(SCF_INSTANCE_EMI)) == NULL)
+		return (EAGAIN);
+
+	/*
+	 * Only verify when EMI is online or disabled; in any other state we
+	 * stay out of the way.
+	 */
+	if (strcmp(emi_state, SCF_STATE_STRING_ONLINE) != 0 &&
+	    strcmp(emi_state, SCF_STATE_STRING_DISABLED) != 0) {
+		warn(gettext("Not validating instance %s:%s because EMI's "
+		    "state is %s\n"), svc->sc_name, inst->sc_name, emi_state);
+		free(emi_state);
+		return (0);
+	}
+
+	free(emi_state);
+
+	/*
+	 * First we have to look up the service.  Failures here are reported
+	 * as an errno rather than a raw scf_error_t so that the return value
+	 * matches this function's contract; unexpected errors fall back to
+	 * EINVAL instead of aborting.
+	 */
+	if (scf_scope_get_service(scope, svc->sc_name, imp_svc) != 0) {
+		ret = scf_error();
+		warn(gettext("Failed to look up service: %s\n"), svc->sc_name);
+		switch (ret) {
+		case SCF_ERROR_NOT_FOUND:
+			return (ENOENT);
+		case SCF_ERROR_DELETED:
+			return (ENODEV);
+		case SCF_ERROR_CONNECTION_BROKEN:
+			return (ECONNABORTED);
+		default:
+			return (EINVAL);
+		}
+	}
+
+	/*
+	 * We should always be able to get the instance. It should already
+	 * exist because we just created it or got it. There probably is a
+	 * slim chance that someone may have come in and deleted it though from
+	 * under us.
+	 */
+	if (scf_service_get_instance(imp_svc, inst->sc_name, imp_inst) != 0) {
+		ret = scf_error();
+		warn(gettext("Failed to verify instance: %s\n"), inst->sc_name);
+		if (ret == SCF_ERROR_NOT_FOUND) {
+			warn(gettext("Instance \"%s\" disappeared out from "
+			    "under us.\n"), inst->sc_name);
+			return (ENOENT);
+		}
+
+		return (lscf_verify_scf_errno("scf_service_get_instance",
+		    ret));
+	}
+
+	/* The polling interval never changes, so compute it only once. */
+	ts.tv_sec = pg_timeout / NANOSEC;
+	ts.tv_nsec = pg_timeout % NANOSEC;
+
+	/*
+	 * An astute observer may want to use _scf_wait_pg which would notify us
+	 * of a property group change, unfortunately that does not work if the
+	 * property group in question does not exist. So instead we have to
+	 * poll manually until the property group shows up.
+	 */
+	while (scf_instance_get_pg(imp_inst, SCF_PG_RESTARTER, imp_pg) !=
+	    SCF_SUCCESS) {
+		ret = scf_error();
+		if (ret != SCF_ERROR_NOT_FOUND) {
+			warn(gettext("Failed to get restarter property "
+			    "group for instance: %s\n"), inst->sc_name);
+			return (lscf_verify_scf_errno("scf_instance_get_pg",
+			    ret));
+		}
+
+		(void) nanosleep(&ts, NULL);
+	}
+
+	/*
+	 * svcadm also expects that the SCF_PROPERTY_STATE property is present.
+	 * So in addition to the property group being present, we need to wait
+	 * for the property to be there in some form.
+	 *
+	 * Note that a property group is a frozen snapshot in time. To properly
+	 * get beyond this, you have to refresh the property group each time.
+	 */
+	while (scf_pg_get_property(imp_pg, SCF_PROPERTY_STATE, imp_prop) != 0) {
+		ret = scf_error();
+		if (ret != SCF_ERROR_NOT_FOUND) {
+			warn(gettext("Failed to get property %s from the "
+			    "restarter property group of instance %s\n"),
+			    SCF_PROPERTY_STATE, inst->sc_name);
+			return (lscf_verify_scf_errno("scf_pg_get_property",
+			    ret));
+		}
+
+		(void) nanosleep(&ts, NULL);
+
+		/*
+		 * Re-fetch the property group so the next lookup sees a fresh
+		 * snapshot.  The failure reason has to come from scf_error();
+		 * the call's own return value only says that it failed.
+		 */
+		if (scf_instance_get_pg(imp_inst, SCF_PG_RESTARTER, imp_pg) !=
+		    SCF_SUCCESS) {
+			ret = scf_error();
+			warn(gettext("Failed to get restarter property "
+			    "group for instance: %s\n"), inst->sc_name);
+			return (lscf_verify_scf_errno("scf_instance_get_pg",
+			    ret));
+		}
+	}
+
+	/*
+	 * We don't have to free the property groups or other values that we got
+	 * because we stored them in global variables that are allocated and
+	 * freed by the routines that call into these functions. Unless of
+	 * course the rest of the code here that we are basing this on is
+	 * mistaken.
+	 */
+	return (0);
+}
+#endif
+
+/*
* If the service is missing, create it, import its properties, and import the
* instances. Since the service is brand new, it should be empty, and if we
* run into any existing entities (SCF_ERROR_EXISTS), abort.
@@ -8122,7 +8324,36 @@ lscf_bundle_import(bundle_t *bndl, const char *filename, uint_t flags)
goto progress;
result = 0;
+
+ /*
+ * This snippet of code assumes that we are running svccfg as we
+ * normally do -- with svc.startd running. Of course, that is
+ * not actually the case all the time because we also use a
+ * variant of svc.configd and svccfg which are only meant to
+ * run during the build process. During this time we have no
+ * svc.startd, so this check would hang the build process.
+ */
+#ifndef NATIVE_BUILD
+ /*
+ * Verify that the restarter group is present
+ */
+ for (svc = uu_list_first(bndl->sc_bundle_services);
+ svc != NULL;
+ svc = uu_list_next(bndl->sc_bundle_services, svc)) {
+
+ insts = svc->sc_u.sc_service.sc_service_instances;
+
+ for (inst = uu_list_first(insts);
+ inst != NULL;
+ inst = uu_list_next(insts, inst)) {
+ if (lscf_instance_verify(imp_scope, svc,
+ inst) != 0)
+ goto progress;
+ }
+ }
+#endif
goto out;
+
}
if (uu_error() != UU_ERROR_CALLBACK_FAILED)