diff options
author | Jerry Jelinek <jerry.jelinek@joyent.com> | 2012-08-24 20:29:03 +0000 |
---|---|---|
committer | Jerry Jelinek <jerry.jelinek@joyent.com> | 2012-08-24 21:41:13 +0000 |
commit | a1f8197be6fcb2714673440bd5c7984434e457ca (patch) | |
tree | b4fe21d1f75a2588370edd1707dd83b520619c0b | |
parent | 716dfede464fc06bd0987fbc27f6d34209ac97e3 (diff) | |
download | illumos-joyent-a1f8197be6fcb2714673440bd5c7984434e457ca.tar.gz |
OS-1491 SYSOPS-4940 svc.startd incorrectly placing services into maintenance mode
code reviewed by Robert Mustacchi
-rw-r--r-- | usr/src/cmd/svc/startd/method.c | 75 | ||||
-rw-r--r-- | usr/src/cmd/svc/startd/startd.h | 2 |
2 files changed, 47 insertions, 30 deletions
diff --git a/usr/src/cmd/svc/startd/method.c b/usr/src/cmd/svc/startd/method.c index 326150376c..4b7481f76a 100644 --- a/usr/src/cmd/svc/startd/method.c +++ b/usr/src/cmd/svc/startd/method.c @@ -100,34 +100,18 @@ static uint_t method_events[] = { * method_record_start(restarter_inst_t *) * Record a service start for rate limiting. Place the current time * in the circular array of instance starts. + * + * Save the critical_failure_period and critical_failure_allowed with either + * the defaults or the svc properties startd/critical_failure_count and + * startd/critical_failure_period. + * ri_crit_fail_allowed is capped at RINST_START_TIMES. */ static void method_record_start(restarter_inst_t *inst) { - int index = inst->ri_start_index++ % RINST_START_TIMES; - - inst->ri_start_time[index] = gethrtime(); -} - -/* - * method_rate_critical(restarter_inst_t *) - * Return true if the average start interval is less than the permitted - * interval. The implicit interval defaults to RINST_FAILURE_RATE_NS and - * RINST_START_TIMES but may be overridden with the svc properties - * startd/critical_failure_count and startd/critical_failure_period - * which represent the number of failures to consider and the amount of - * time in seconds in which that number may occur, respectively. Note that - * this time is measured as of the transition to 'enabled' rather than wall - * clock time. - * Implicit success if insufficient measurements for an average exist. - */ -int -method_rate_critical(restarter_inst_t *inst) -{ + int index; + uint_t critical_failure_allowed = RINST_START_TIMES; hrtime_t critical_failure_period; - uint_t critical_failure_count = RINST_START_TIMES; - uint_t n = inst->ri_start_index; - hrtime_t avg_ns = 0; uint64_t scf_fr, scf_st; scf_propvec_t *prop = NULL; scf_propvec_t restart_critical[] = { @@ -151,17 +135,48 @@ method_rate_critical(restarter_inst_t *inst) * in seconds but tracked in ns */ critical_failure_period = (hrtime_t)scf_fr * NANOSEC; - critical_failure_count = (uint_t)scf_st; + critical_failure_allowed = (uint_t)scf_st; + + if (critical_failure_allowed > RINST_START_TIMES) + critical_failure_allowed = RINST_START_TIMES; + if (critical_failure_allowed < 1) + critical_failure_allowed = 1; + } - if (inst->ri_start_index < critical_failure_count) + + inst->ri_crit_fail_allowed = critical_failure_allowed; + inst->ri_crit_fail_period = critical_failure_period; + + index = inst->ri_start_index++ % critical_failure_allowed; + inst->ri_start_time[index] = gethrtime(); +} + +/* + * method_rate_critical(restarter_inst_t *) + * Return true if the number of failures within the interval + * ri_crit_fail_period exceeds ri_crit_fail_allowed. The allowed failure + * count defaults to RINST_START_TIMES and the implicit interval defaults + * to RINST_FAILURE_RATE_NS but may be overridden with the svc properties + * startd/critical_failure_count and startd/critical_failure_period which + * represent the acceptable number of failures and the amount of time in + * seconds in which that number may occur, respectively. Note that this time + * is measured as of the transition to 'enabled' rather than wall clock + * time. Implicitly not critical if insufficient failures have occured. + */ +int +method_rate_critical(restarter_inst_t *inst) +{ + uint_t n = inst->ri_start_index; + uint_t fail_allowed = inst->ri_crit_fail_allowed; + hrtime_t diff_ns; + + if (n < fail_allowed) return (0); - avg_ns = - (inst->ri_start_time[(n - 1) % critical_failure_count] - - inst->ri_start_time[n % critical_failure_count]) / - (critical_failure_count - 1); + diff_ns = inst->ri_start_time[(n - 1) % fail_allowed] - + inst->ri_start_time[n % fail_allowed]; - return (avg_ns < critical_failure_period); + return (diff_ns < inst->ri_crit_fail_period); } /* diff --git a/usr/src/cmd/svc/startd/startd.h b/usr/src/cmd/svc/startd/startd.h index a84e1742bd..6753fb77d9 100644 --- a/usr/src/cmd/svc/startd/startd.h +++ b/usr/src/cmd/svc/startd/startd.h @@ -427,6 +427,8 @@ typedef struct restarter_inst { hrtime_t ri_start_time[RINST_START_TIMES]; uint_t ri_start_index; /* times started */ + uint_t ri_crit_fail_allowed; + hrtime_t ri_crit_fail_period; uu_list_node_t ri_link; pthread_mutex_t ri_lock; |