diff options
author | Jerry Jelinek <jerry.jelinek@joyent.com> | 2012-07-05 15:52:15 +0000 |
---|---|---|
committer | Robert Mustacchi <rm@joyent.com> | 2013-08-11 10:18:28 -0700 |
commit | 2a17138d7a5102bc6e0bf0444224cd0c416d98f0 (patch) | |
tree | 9a52cf97c5dface1def11adb7a64a2e5a9494810 | |
parent | 0d421f668cdfd7a53019f57234af254738038aa0 (diff) | |
download | illumos-joyent-2a17138d7a5102bc6e0bf0444224cd0c416d98f0.tar.gz |
3989 svc.startd gets stuck in a loop when HOME dir doesn't exist
3990 a misconfigured smf service can cause svc.configd to leak memory and eventually hang
3991 startd, configd spinning due to bad sac start method
Reviewed by: Robert Mustacchi <rm@joyent.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
-rw-r--r-- | usr/src/cmd/svc/startd/method.c | 38 | ||||
-rw-r--r-- | usr/src/cmd/svc/startd/restarter.c | 61 | ||||
-rw-r--r-- | usr/src/cmd/svc/startd/startd.h | 6 | ||||
-rw-r--r-- | usr/src/cmd/svc/startd/wait.c | 10 | ||||
-rw-r--r-- | usr/src/man/man1m/svc.startd.1m | 6 |
5 files changed, 106 insertions, 15 deletions
diff --git a/usr/src/cmd/svc/startd/method.c b/usr/src/cmd/svc/startd/method.c index 6fc05ed49a..eed6c680ad 100644 --- a/usr/src/cmd/svc/startd/method.c +++ b/usr/src/cmd/svc/startd/method.c @@ -121,10 +121,10 @@ method_record_start(restarter_inst_t *inst) * clock time. * Implicit success if insufficient measurements for an average exist. */ -static int +int method_rate_critical(restarter_inst_t *inst) { - hrtime_t critical_failure_period = RINST_FAILURE_RATE_NS; + hrtime_t critical_failure_period; uint_t critical_failure_count = RINST_START_TIMES; uint_t n = inst->ri_start_index; hrtime_t avg_ns = 0; @@ -136,6 +136,11 @@ method_rate_critical(restarter_inst_t *inst) { NULL } }; + if (instance_is_wait_style(inst)) + critical_failure_period = RINST_WT_SVC_FAILURE_RATE_NS; + else + critical_failure_period = RINST_FAILURE_RATE_NS; + restart_critical[0].pv_ptr = &scf_fr; restart_critical[1].pv_ptr = &scf_st; @@ -832,12 +837,39 @@ method_run(restarter_inst_t **instp, int type, int *exit_code) "to root-accessible libraries\n", inst->ri_i.i_fmri); /* + * For wait-style svc, sanity check that method exists to prevent an + * infinite loop. + */ + if (instance_is_wait_style(inst) && type == METHOD_START) { + char *pend; + struct stat64 sbuf; + + /* + * We need to handle start method strings that have arguments, + * such as '/lib/svc/method/console-login %i'. + */ + if ((pend = strchr(method, ' ')) != NULL) + *pend = '\0'; + + if (stat64(method, &sbuf) == -1 && errno == ENOENT) { + log_instance(inst, B_TRUE, "Missing start method (%s), " + "changing state to maintenance.", method); + restarter_free_method_context(mcp); + result = ENOENT; + goto out; + } + if (pend != NULL) + *pend = ' '; + } + + /* * If the service is restarting too quickly, send it to * maintenance. */ if (type == METHOD_START) { method_record_start(inst); - if (method_rate_critical(inst)) { + if (method_rate_critical(inst) && + !instance_is_wait_style(inst)) { log_instance(inst, B_TRUE, "Restarting too quickly, " "changing state to maintenance."); result = ELOOP; diff --git a/usr/src/cmd/svc/startd/restarter.c b/usr/src/cmd/svc/startd/restarter.c index 203d6eba05..6adb323289 100644 --- a/usr/src/cmd/svc/startd/restarter.c +++ b/usr/src/cmd/svc/startd/restarter.c @@ -140,6 +140,8 @@ static restarter_instance_list_t instance_list; static uu_list_pool_t *restarter_queue_pool; +#define WT_SVC_ERR_THROTTLE 1 /* 1 sec delay for erroring wait svc */ + /* * Function used to reset the restart times for an instance, when * an administrative task comes along and essentially makes the times @@ -1030,6 +1032,7 @@ stop_instance(scf_handle_t *local_handle, restarter_inst_t *inst, int err; restarter_error_t re; restarter_str_t reason; + restarter_instance_state_t new_state; assert(MUTEX_HELD(&inst->ri_lock)); assert(inst->ri_method_thread == 0); @@ -1040,6 +1043,16 @@ stop_instance(scf_handle_t *local_handle, restarter_inst_t *inst, reason = restarter_str_ct_ev_exit; cp = "all processes in service exited"; break; + case RSTOP_ERR_CFG: + re = RERR_FAULT; + reason = restarter_str_method_failed; + cp = "service exited with a configuration error"; + break; + case RSTOP_ERR_EXIT: + re = RERR_RESTART; + reason = restarter_str_ct_ev_exit; + cp = "service exited with an error"; + break; case RSTOP_CORE: re = RERR_FAULT; reason = restarter_str_ct_ev_core; @@ -1107,7 +1120,10 @@ stop_instance(scf_handle_t *local_handle, restarter_inst_t *inst, log_framework(re == RERR_FAULT ? LOG_INFO : LOG_DEBUG, "%s: Instance stopping because %s.\n", inst->ri_i.i_fmri, cp); - if (instance_is_wait_style(inst) && cause == RSTOP_EXIT) { + if (instance_is_wait_style(inst) && + (cause == RSTOP_EXIT || + cause == RSTOP_ERR_CFG || + cause == RSTOP_ERR_EXIT)) { /* * No need to stop instance, as child has exited; remove * contract and move the instance to the offline state. @@ -1123,8 +1139,26 @@ stop_instance(scf_handle_t *local_handle, restarter_inst_t *inst, bad_error("restarter_instance_update_states", err); } - (void) update_fault_count(inst, FAULT_COUNT_RESET); - reset_start_times(inst); + if (cause == RSTOP_ERR_EXIT) { + /* + * The RSTOP_ERR_EXIT cause is set via the + * wait_thread -> wait_remove code path when we have + * a "wait" style svc that exited with an error. If + * the svc is failing too quickly, we throttle it so + * that we don't restart it more than once/second. + * Since we know we're running in the wait thread its + * ok to throttle it right here. + */ + (void) update_fault_count(inst, FAULT_COUNT_INCR); + if (method_rate_critical(inst)) { + log_instance(inst, B_TRUE, "Failing too " + "quickly, throttling."); + (void) sleep(WT_SVC_ERR_THROTTLE); + } + } else { + (void) update_fault_count(inst, FAULT_COUNT_RESET); + reset_start_times(inst); + } if (inst->ri_i.i_primary_ctid != 0) { inst->ri_m_inst = @@ -1149,7 +1183,8 @@ stop_instance(scf_handle_t *local_handle, restarter_inst_t *inst, bad_error("restarter_instance_update_states", err); } - return (0); + if (cause != RSTOP_ERR_CFG) + return (0); } else if (instance_is_wait_style(inst) && re == RERR_RESTART) { /* * Stopping a wait service through means other than the pid @@ -1161,9 +1196,23 @@ stop_instance(scf_handle_t *local_handle, restarter_inst_t *inst, wait_ignore_by_fmri(inst->ri_i.i_fmri); } + /* + * There are some configuration errors which we cannot detect until we + * try to run the method. For example, see exec_method() where the + * restarter_set_method_context() call can return SMF_EXIT_ERR_CONFIG + * in several cases. If this happens for a "wait-style" svc, + * wait_remove() sets the cause as RSTOP_ERR_CFG so that we can detect + * the configuration error and go into maintenance, even though it is + * a "wait-style" svc. + */ + if (cause == RSTOP_ERR_CFG) + new_state = RESTARTER_STATE_MAINT; + else + new_state = inst->ri_i.i_enabled ? + RESTARTER_STATE_OFFLINE : RESTARTER_STATE_DISABLED; + switch (err = restarter_instance_update_states(local_handle, inst, - inst->ri_i.i_state, inst->ri_i.i_enabled ? RESTARTER_STATE_OFFLINE : - RESTARTER_STATE_DISABLED, RERR_NONE, reason)) { + inst->ri_i.i_state, new_state, RERR_NONE, reason)) { case 0: case ECONNRESET: break; diff --git a/usr/src/cmd/svc/startd/startd.h b/usr/src/cmd/svc/startd/startd.h index 456cf9d9cb..c1062e45e0 100644 --- a/usr/src/cmd/svc/startd/startd.h +++ b/usr/src/cmd/svc/startd/startd.h @@ -379,7 +379,9 @@ typedef enum { RSTOP_HWERR, /* uncorrectable hardware error */ RSTOP_DEPENDENCY, /* dependency activity caused stop */ RSTOP_DISABLE, /* disabled */ - RSTOP_RESTART /* restart requested */ + RSTOP_RESTART, /* restart requested */ + RSTOP_ERR_CFG, /* wait svc exited with a config. error */ + RSTOP_ERR_EXIT /* wait svc exited with an error */ } stop_cause_t; /* @@ -405,6 +407,7 @@ typedef enum { #define RINST_START_TIMES 5 /* failures to consider */ #define RINST_FAILURE_RATE_NS 600000000000LL /* 1 failure/10 minutes */ +#define RINST_WT_SVC_FAILURE_RATE_NS NANOSEC /* 1 failure/second */ /* Number of events in the queue when we start dropping ADMIN events. */ #define RINST_QUEUE_THRESHOLD 100 @@ -717,6 +720,7 @@ void log_instance_fmri(const char *, const char *, boolean_t, /* method.c */ void *method_thread(void *); void method_remove_contract(restarter_inst_t *, boolean_t, boolean_t); +int method_rate_critical(restarter_inst_t *); /* misc.c */ void startd_close(int); diff --git a/usr/src/cmd/svc/startd/wait.c b/usr/src/cmd/svc/startd/wait.c index 823f9241a0..ebd83be10e 100644 --- a/usr/src/cmd/svc/startd/wait.c +++ b/usr/src/cmd/svc/startd/wait.c @@ -21,10 +21,9 @@ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. + * Copyright 2012, Joyent, Inc. All rights reserved. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * wait.c - asynchronous monitoring of "wait registered" start methods * @@ -90,6 +89,7 @@ static void wait_remove(wait_info_t *wi, int direct) { int status; + stop_cause_t cause = RSTOP_EXIT; if (waitpid(wi->wi_pid, &status, 0) == -1) { if (wi->wi_parent) @@ -101,6 +101,10 @@ wait_remove(wait_info_t *wi, int direct) log_framework(LOG_NOTICE, "instance %s exited with status %d\n", wi->wi_fmri, WEXITSTATUS(status)); + if (WEXITSTATUS(status) == SMF_EXIT_ERR_CONFIG) + cause = RSTOP_ERR_CFG; + else + cause = RSTOP_ERR_EXIT; } } @@ -137,7 +141,7 @@ wait_remove(wait_info_t *wi, int direct) log_framework(LOG_DEBUG, "wait_remove requesting stop of %s\n", wi->wi_fmri); - (void) stop_instance_fmri(wait_hndl, wi->wi_fmri, RSTOP_EXIT); + (void) stop_instance_fmri(wait_hndl, wi->wi_fmri, cause); } uu_list_node_fini(wi, &wi->wi_link, wait_info_pool); diff --git a/usr/src/man/man1m/svc.startd.1m b/usr/src/man/man1m/svc.startd.1m index 726e556de1..7c80c35e23 100644 --- a/usr/src/man/man1m/svc.startd.1m +++ b/usr/src/man/man1m/svc.startd.1m @@ -1,6 +1,6 @@ '\" te .\" Copyright (c) 2008, Sun Microsystems, Inc. All Rights Reserved. -.\" Copyright 2011, Joyent Inc +.\" Copyright 2012, Joyent, Inc. All Rights Reserved. .\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. See the License for the specific language governing permissions and limitations under the License. .\" When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner] @@ -488,7 +488,9 @@ conditions occurs. "\fBWait\fR" model services are restarted whenever the child process associated with the service exits. A child process that exits is not considered an error for "\fBwait\fR" model services, and repeated failures do not lead to a -transition to maintenance state. +transition to maintenance state. However, a wait service which is repeatedly +exiting with an error that exceeds the default rate (5 failures/second) will be +throttled back so that the service only restarts once per second. .SS "LEGACY SERVICES" .sp .LP |