diff options
Diffstat (limited to 'usr/src/cmd/svc/startd')
-rw-r--r-- | usr/src/cmd/svc/startd/graph.c | 26 | ||||
-rw-r--r-- | usr/src/cmd/svc/startd/method.c | 96 | ||||
-rw-r--r-- | usr/src/cmd/svc/startd/startd.h | 19 |
3 files changed, 94 insertions, 47 deletions
diff --git a/usr/src/cmd/svc/startd/graph.c b/usr/src/cmd/svc/startd/graph.c index c831c99301..5a4e933220 100644 --- a/usr/src/cmd/svc/startd/graph.c +++ b/usr/src/cmd/svc/startd/graph.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2015, Syneto S.R.L. All rights reserved. */ @@ -142,6 +143,8 @@ #include <assert.h> #include <errno.h> #include <fcntl.h> +#include <sys/types.h> +#include <sys/stat.h> #include <fm/libfmevent.h> #include <libscf.h> #include <libscf_priv.h> @@ -4876,6 +4879,20 @@ vertex_subgraph_dependencies_shutdown(scf_handle_t *h, graph_vertex_t *v, was_up = up_state(old_state); now_up = up_state(v->gv_state); + if (halting != -1 && old_state == RESTARTER_STATE_DISABLED && + v->gv_state != RESTARTER_STATE_DISABLED) { + /* + * We're halting and we have a svc which is transitioning to + * offline in parallel. This leads to a race condition where + * gt_enter_offline might re-enable the svc after we disabled + * it. Since we're halting, we want to ensure no svc ever + * transitions out of the disabled state. In this case, modify + * the flags to keep us on the halting path. + */ + was_up = 0; + now_up = 0; + } + if (!was_up && now_up) { ++non_subgraph_svcs; } else if (was_up && !now_up) { @@ -6828,6 +6845,7 @@ repository_event_thread(void *unused) char *fmri = startd_alloc(max_scf_fmri_size); char *pg_name = startd_alloc(max_scf_value_size); int r; + int fd; h = libscf_handle_create_bound_loop(); @@ -6850,6 +6868,14 @@ retry: goto retry; } + if ((fd = open("/etc/svc/volatile/startd.ready", O_RDONLY | O_CREAT, + S_IRUSR)) < 0) { + log_error(LOG_WARNING, "Couldn't create startd.ready file\n", + SCF_GROUP_FRAMEWORK, scf_strerror(scf_error())); + } else { + (void) close(fd); + } + /*CONSTCOND*/ while (1) { ssize_t res; diff --git a/usr/src/cmd/svc/startd/method.c b/usr/src/cmd/svc/startd/method.c index cc9ce6768c..c3cd0144c1 100644 --- a/usr/src/cmd/svc/startd/method.c +++ b/usr/src/cmd/svc/startd/method.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright 2011 Joyent Inc. + * Copyright 2012 Joyent, Inc. All rights reserved. */ /* @@ -100,34 +100,18 @@ static uint_t method_events[] = { * method_record_start(restarter_inst_t *) * Record a service start for rate limiting. Place the current time * in the circular array of instance starts. + * + * Save the critical_failure_period and critical_failure_allowed with either + * the defaults or the svc properties startd/critical_failure_count and + * startd/critical_failure_period. + * ri_crit_fail_allowed is capped at RINST_START_TIMES. */ static void method_record_start(restarter_inst_t *inst) { - int index = inst->ri_start_index++ % RINST_START_TIMES; - - inst->ri_start_time[index] = gethrtime(); -} - -/* - * method_rate_critical(restarter_inst_t *) - * Return true if the average start interval is less than the permitted - * interval. The implicit interval defaults to RINST_FAILURE_RATE_NS and - * RINST_START_TIMES but may be overridden with the svc properties - * startd/critical_failure_count and startd/critical_failure_period - * which represent the number of failures to consider and the amount of - * time in seconds in which that number may occur, respectively. Note that - * this time is measured as of the transition to 'enabled' rather than wall - * clock time. - * Implicit success if insufficient measurements for an average exist. - */ -int -method_rate_critical(restarter_inst_t *inst) -{ + int index; + uint_t critical_failure_allowed = RINST_START_TIMES; hrtime_t critical_failure_period; - uint_t critical_failure_count = RINST_START_TIMES; - uint_t n = inst->ri_start_index; - hrtime_t avg_ns = 0; uint64_t scf_fr, scf_st; scf_propvec_t *prop = NULL; scf_propvec_t restart_critical[] = { @@ -151,17 +135,48 @@ method_rate_critical(restarter_inst_t *inst) * in seconds but tracked in ns */ critical_failure_period = (hrtime_t)scf_fr * NANOSEC; - critical_failure_count = (uint_t)scf_st; + critical_failure_allowed = (uint_t)scf_st; + + if (critical_failure_allowed > RINST_START_TIMES) + critical_failure_allowed = RINST_START_TIMES; + if (critical_failure_allowed < 1) + critical_failure_allowed = 1; + } - if (inst->ri_start_index < critical_failure_count) + + inst->ri_crit_fail_allowed = critical_failure_allowed; + inst->ri_crit_fail_period = critical_failure_period; + + index = inst->ri_start_index++ % critical_failure_allowed; + inst->ri_start_time[index] = gethrtime(); +} + +/* + * method_rate_critical(restarter_inst_t *) + * Return true if the number of failures within the interval + * ri_crit_fail_period exceeds ri_crit_fail_allowed. The allowed failure + * count defaults to RINST_START_TIMES and the implicit interval defaults + * to RINST_FAILURE_RATE_NS but may be overridden with the svc properties + * startd/critical_failure_count and startd/critical_failure_period which + * represent the acceptable number of failures and the amount of time in + * seconds in which that number may occur, respectively. Note that this time + * is measured as of the transition to 'enabled' rather than wall clock + * time. Implicitly not critical if insufficient failures have occured. + */ +int +method_rate_critical(restarter_inst_t *inst) +{ + uint_t n = inst->ri_start_index; + uint_t fail_allowed = inst->ri_crit_fail_allowed; + hrtime_t diff_ns; + + if (n < fail_allowed) return (0); - avg_ns = - (inst->ri_start_time[(n - 1) % critical_failure_count] - - inst->ri_start_time[n % critical_failure_count]) / - (critical_failure_count - 1); + diff_ns = inst->ri_start_time[(n - 1) % fail_allowed] - + inst->ri_start_time[n % fail_allowed]; - return (avg_ns < critical_failure_period); + return (diff_ns < inst->ri_crit_fail_period); } /* @@ -989,7 +1004,8 @@ method_run(restarter_inst_t **instp, int type, int *exit_code) goto contract_out; } - if (!WIFEXITED(ret_status)) { + if (!WIFEXITED(ret_status) && + WEXITSTATUS(ret_status) != SMF_EXIT_NODAEMON) { /* * If method didn't exit itself (it was killed by an * external entity, etc.), consider the entire @@ -1018,7 +1034,7 @@ method_run(restarter_inst_t **instp, int type, int *exit_code) } *exit_code = WEXITSTATUS(ret_status); - if (*exit_code != 0) { + if (*exit_code != 0 && *exit_code != SMF_EXIT_NODAEMON) { log_error(LOG_WARNING, "%s: Method \"%s\" failed with exit status %d.\n", inst->ri_i.i_fmri, method, WEXITSTATUS(ret_status)); @@ -1027,6 +1043,7 @@ method_run(restarter_inst_t **instp, int type, int *exit_code) log_instance(inst, B_TRUE, "Method \"%s\" exited with status " "%d.", mname, *exit_code); + /* Note: we will take this path for SMF_EXIT_NODAEMON */ if (*exit_code != 0) goto contract_out; @@ -1073,7 +1090,10 @@ assured_kill: } contract_out: - /* Abandon contracts for transient methods & methods that fail. */ + /* + * Abandon contracts for transient methods, methods that exit with + * SMF_EXIT_NODAEMON & methods that fail. + */ transient = method_is_transient(inst, type); if ((transient || *exit_code != 0 || result != 0) && (restarter_is_kill_method(method) < 0)) @@ -1169,7 +1189,7 @@ retry: r = method_run(&inst, info->sf_method_type, &exit_code); - if (r == 0 && exit_code == 0) { + if (r == 0 && (exit_code == 0 || exit_code == SMF_EXIT_NODAEMON)) { /* Success! */ assert(inst->ri_i.i_next_state != RESTARTER_STATE_NONE); @@ -1187,6 +1207,12 @@ retry: else method_remove_contract(inst, B_TRUE, B_TRUE); } + + /* + * For methods that exit with SMF_EXIT_NODAEMON, we already + * called method_remove_contract in method_run. + */ + /* * We don't care whether the handle was rebound because this is * the last thing we do with it. diff --git a/usr/src/cmd/svc/startd/startd.h b/usr/src/cmd/svc/startd/startd.h index c1062e45e0..df3e98a27b 100644 --- a/usr/src/cmd/svc/startd/startd.h +++ b/usr/src/cmd/svc/startd/startd.h @@ -21,7 +21,7 @@ /* * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2015, Joyent, Inc. All rights reserved. */ #ifndef _STARTD_H @@ -82,16 +82,9 @@ extern "C" { #endif -#ifndef NDEBUG -#define bad_error(func, err) { \ - (void) fprintf(stderr, "%s:%d: %s() failed with unexpected " \ - "error %d. Aborting.\n", __FILE__, __LINE__, (func), (err)); \ - abort(); \ -} -#else -#define bad_error(func, err) abort() -#endif - +#define bad_error(func, err) \ + uu_panic("%s:%d: %s() failed with unexpected " \ + "error %d. Aborting.\n", __FILE__, __LINE__, (func), (err)); #define min(a, b) (((a) < (b)) ? (a) : (b)) @@ -405,7 +398,7 @@ typedef enum { #define RINST_RETAKE_MASK 0x0f000000 -#define RINST_START_TIMES 5 /* failures to consider */ +#define RINST_START_TIMES 10 /* up to 10 fails to consider */ #define RINST_FAILURE_RATE_NS 600000000000LL /* 1 failure/10 minutes */ #define RINST_WT_SVC_FAILURE_RATE_NS NANOSEC /* 1 failure/second */ @@ -427,6 +420,8 @@ typedef struct restarter_inst { hrtime_t ri_start_time[RINST_START_TIMES]; uint_t ri_start_index; /* times started */ + uint_t ri_crit_fail_allowed; + hrtime_t ri_crit_fail_period; uu_list_node_t ri_link; pthread_mutex_t ri_lock; |