summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJerry Jelinek <jerry.jelinek@joyent.com>2012-08-24 20:29:03 +0000
committerJerry Jelinek <jerry.jelinek@joyent.com>2012-08-24 21:41:13 +0000
commita1f8197be6fcb2714673440bd5c7984434e457ca (patch)
treeb4fe21d1f75a2588370edd1707dd83b520619c0b
parent716dfede464fc06bd0987fbc27f6d34209ac97e3 (diff)
downloadillumos-joyent-a1f8197be6fcb2714673440bd5c7984434e457ca.tar.gz
OS-1491 SYSOPS-4940 svc.startd incorrectly placing services into maintenance mode
code reviewed by Robert Mustacchi
-rw-r--r--usr/src/cmd/svc/startd/method.c75
-rw-r--r--usr/src/cmd/svc/startd/startd.h2
2 files changed, 47 insertions, 30 deletions
diff --git a/usr/src/cmd/svc/startd/method.c b/usr/src/cmd/svc/startd/method.c
index 326150376c..4b7481f76a 100644
--- a/usr/src/cmd/svc/startd/method.c
+++ b/usr/src/cmd/svc/startd/method.c
@@ -100,34 +100,18 @@ static uint_t method_events[] = {
* method_record_start(restarter_inst_t *)
* Record a service start for rate limiting. Place the current time
* in the circular array of instance starts.
+ *
+ * Save the critical_failure_period and critical_failure_allowed with either
+ * the defaults or the svc properties startd/critical_failure_count and
+ * startd/critical_failure_period.
+ * ri_crit_fail_allowed is capped at RINST_START_TIMES.
*/
static void
method_record_start(restarter_inst_t *inst)
{
- int index = inst->ri_start_index++ % RINST_START_TIMES;
-
- inst->ri_start_time[index] = gethrtime();
-}
-
-/*
- * method_rate_critical(restarter_inst_t *)
- * Return true if the average start interval is less than the permitted
- * interval. The implicit interval defaults to RINST_FAILURE_RATE_NS and
- * RINST_START_TIMES but may be overridden with the svc properties
- * startd/critical_failure_count and startd/critical_failure_period
- * which represent the number of failures to consider and the amount of
- * time in seconds in which that number may occur, respectively. Note that
- * this time is measured as of the transition to 'enabled' rather than wall
- * clock time.
- * Implicit success if insufficient measurements for an average exist.
- */
-int
-method_rate_critical(restarter_inst_t *inst)
-{
+ int index;
+ uint_t critical_failure_allowed = RINST_START_TIMES;
hrtime_t critical_failure_period;
- uint_t critical_failure_count = RINST_START_TIMES;
- uint_t n = inst->ri_start_index;
- hrtime_t avg_ns = 0;
uint64_t scf_fr, scf_st;
scf_propvec_t *prop = NULL;
scf_propvec_t restart_critical[] = {
@@ -151,17 +135,48 @@ method_rate_critical(restarter_inst_t *inst)
* in seconds but tracked in ns
*/
critical_failure_period = (hrtime_t)scf_fr * NANOSEC;
- critical_failure_count = (uint_t)scf_st;
+ critical_failure_allowed = (uint_t)scf_st;
+
+ if (critical_failure_allowed > RINST_START_TIMES)
+ critical_failure_allowed = RINST_START_TIMES;
+ if (critical_failure_allowed < 1)
+ critical_failure_allowed = 1;
+
}
- if (inst->ri_start_index < critical_failure_count)
+
+ inst->ri_crit_fail_allowed = critical_failure_allowed;
+ inst->ri_crit_fail_period = critical_failure_period;
+
+ index = inst->ri_start_index++ % critical_failure_allowed;
+ inst->ri_start_time[index] = gethrtime();
+}
+
+/*
+ * method_rate_critical(restarter_inst_t *)
+ * Return true if the number of failures within the interval
+ * ri_crit_fail_period exceeds ri_crit_fail_allowed. The allowed failure
+ * count defaults to RINST_START_TIMES and the implicit interval defaults
+ * to RINST_FAILURE_RATE_NS but may be overridden with the svc properties
+ * startd/critical_failure_count and startd/critical_failure_period which
+ * represent the acceptable number of failures and the amount of time in
+ * seconds in which that number may occur, respectively. Note that this time
+ * is measured as of the transition to 'enabled' rather than wall clock
+ * time. Implicitly not critical if insufficient failures have occured.
+ */
+int
+method_rate_critical(restarter_inst_t *inst)
+{
+ uint_t n = inst->ri_start_index;
+ uint_t fail_allowed = inst->ri_crit_fail_allowed;
+ hrtime_t diff_ns;
+
+ if (n < fail_allowed)
return (0);
- avg_ns =
- (inst->ri_start_time[(n - 1) % critical_failure_count] -
- inst->ri_start_time[n % critical_failure_count]) /
- (critical_failure_count - 1);
+ diff_ns = inst->ri_start_time[(n - 1) % fail_allowed] -
+ inst->ri_start_time[n % fail_allowed];
- return (avg_ns < critical_failure_period);
+ return (diff_ns < inst->ri_crit_fail_period);
}
/*
diff --git a/usr/src/cmd/svc/startd/startd.h b/usr/src/cmd/svc/startd/startd.h
index a84e1742bd..6753fb77d9 100644
--- a/usr/src/cmd/svc/startd/startd.h
+++ b/usr/src/cmd/svc/startd/startd.h
@@ -427,6 +427,8 @@ typedef struct restarter_inst {
hrtime_t ri_start_time[RINST_START_TIMES];
uint_t ri_start_index; /* times started */
+ uint_t ri_crit_fail_allowed;
+ hrtime_t ri_crit_fail_period;
uu_list_node_t ri_link;
pthread_mutex_t ri_lock;