diff options
Diffstat (limited to 'src/pmie/pmie_check.sh')
-rw-r--r-- | src/pmie/pmie_check.sh | 691 |
1 files changed, 691 insertions, 0 deletions
#! /bin/sh
#
# Copyright (c) 2013-2014 Red Hat.
# Copyright (c) 1998-2000,2003 Silicon Graphics, Inc.  All Rights Reserved.
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the
# Free Software Foundation; either version 2 of the License, or (at your
# option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
# or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
# for more details.
#
# Administrative script to check pmie processes are alive, and restart
# them as required.
#
# Each non-comment record of the control file has the fields
#     host  socks  logfile  args...
# and lines starting with '$' are in-line variable assignments.
#
# NOTE: backticks and expr(1) are used deliberately throughout; this
# script runs under plain /bin/sh on a range of platforms.
#

# Get standard environment
. $PCP_DIR/etc/pcp.env
. $PCP_SHARE_DIR/lib/rc-proc.sh

PMIE=pmie
PMIECONF="$PCP_BIN_DIR/pmieconf"

# error messages should go to stderr, not the GUI notifiers
unset PCP_STDERR

# added to handle problem when /var/log/pcp is a symlink, as first
# reported by Micah_Altman@harvard.edu in Nov 2001
#
# Usage: _unsymlink_path path
# Prints path with its directory part replaced by the directory's
# physical location (as reported by $PWDCMND).  If the directory cannot
# be entered, path is printed unchanged.  Prints nothing for an empty
# argument.
#
_unsymlink_path()
{
    [ -z "$1" ] && return
    __d=`dirname "$1"`
    # $PWDCMND may carry an option (-P), so it must stay unquoted
    __real_d=`cd "$__d" 2>/dev/null && $PWDCMND`
    if [ -z "$__real_d" ]
    then
        echo "$1"
    else
        echo "$__real_d/`basename "$1"`"
    fi
}

# constant setup
#
tmp=`mktemp -d /tmp/pcp.XXXXXXXXX` || exit 1
status=0
# $tmp/lock holds the name of the lock file currently held (if any) so
# that the exit trap can remove it along with the temporary directory
echo >$tmp/lock
trap "rm -rf \`[ -f $tmp/lock ] && cat $tmp/lock\` $tmp; exit \$status" 0 1 2 3 15
prog=`basename $0`

# control file for pmie administration ... edit the entries in this
# file to reflect your local configuration
#
CONTROL=$PCP_PMIECONTROL_PATH

# NB: FQDN cleanup; don't guess a 'real name for localhost', and
# definitely don't truncate it a la `hostname -s`.  Instead now
# we use such a string only for the default log subdirectory, ie.
# for substituting LOCALHOSTNAME in the third column of $CONTROL.

# determine path for pwd command to override shell built-in
PWDCMND=`which pwd 2>/dev/null | $PCP_AWK_PROG '
BEGIN           { i = 0 }
/ not in /      { i = 1 }
/ aliased to /  { i = 1 }
                { if ( i == 0 ) print }
'`
[ -z "$PWDCMND" ] && PWDCMND=/bin/pwd
eval $PWDCMND -P >/dev/null 2>&1
[ $? -eq 0 ] && PWDCMND="$PWDCMND -P"
here=`$PWDCMND`

# determine whether we can automatically enable any events sinks
CONFARGS="-cF"
if which esplogger >/dev/null 2>&1
then
    CONFARGS='m global syslog_prefix $esp_prefix$'
fi

# option parsing
#
SHOWME=false
MV=mv
RM=rm
CP=cp
KILL=pmsignal
TERSE=false
VERBOSE=false
VERY_VERBOSE=false
CHECK_RUNLEVEL=false
START_PMIE=true

echo > $tmp/usage
cat >> $tmp/usage << EOF
Options:
  -c=FILE,--control=FILE  configuration of pmie instances to manage
  -C                      query system service runlevel information
  -N,--showme             perform a dry run, showing what would be done
  -s,--stop               stop pmie processes instead of starting them
  -T,--terse              produce a terser form of output
  -V,--verbose            increase diagnostic verbosity
  --help
EOF

ARGS=`pmgetopt --progname=$prog --config=$tmp/usage -- "$@"`
[ $? != 0 ] && exit 1

eval set -- "$ARGS"
while [ $# -gt 0 ]
do
    case "$1"
    in
        -c) CONTROL="$2"
            shift
            ;;
        -C) CHECK_RUNLEVEL=true
            ;;
        -N) SHOWME=true
            MV="echo + mv"
            RM="echo + rm"
            CP="echo + cp"
            KILL="echo + kill"
            ;;
        -s) START_PMIE=false
            ;;
        -T) TERSE=true
            ;;
        -V) # a second -V raises the level to "very verbose"
            if $VERBOSE
            then
                VERY_VERBOSE=true
            else
                VERBOSE=true
            fi
            ;;
        --) shift
            break
            ;;
        -\?) pmgetopt --usage --progname=$prog --config=$tmp/usage
            status=1
            exit
            ;;
    esac
    shift
done

if [ $# -ne 0 ]
then
    pmgetopt --usage --progname=$prog --config=$tmp/usage
    status=1
    exit
fi

# Usage: _error message
# Report an error against the current control file line ($line, $host)
# and create $tmp/err, which makes the script exit with status 1.
#
_error()
{
    echo "$prog: [$CONTROL:$line]"
    echo "Error: $1"
    echo "... automated performance reasoning for host \"$host\" unchanged"
    touch $tmp/err
}

# Usage: _warning message
# Report a warning against the current control file line; does not
# affect the exit status.
#
_warning()
{
    echo "$prog [$CONTROL:$line]"
    echo "Warning: $1"
}

# Usage: _message restart
# Emit the (newline-less) progress message for the named event.
#
_message()
{
    case $1
    in
        'restart')
            $PCP_ECHO_PROG $PCP_ECHO_N "Restarting pmie for host \"$host\" ...""$PCP_ECHO_C"
            ;;
    esac
}

# Usage: _lock
# Acquire $logfile.lock with pmlock, retrying every 0.1 seconds for up
# to 200 attempts.  A lock file older than 30 minutes is removed.
# On success the lock name is recorded in $tmp/lock (for the exit trap).
# Returns 0 when the lock is held, 1 when it could not be acquired.
#
# The caller is responsible for skipping the record on failure;
# "continue" is not used here because its effect on a loop outside the
# function body is not portable between shells.
#
_lock()
{
    # demand mutual exclusion
    #
    rm -f $tmp/stamp
    delay=200   # tenths of a second
    while [ $delay -ne 0 ]
    do
        if pmlock -v "$logfile.lock" >$tmp/out
        then
            echo "$logfile.lock" >$tmp/lock
            break
        else
            if [ ! -f $tmp/stamp ]
            then
                touch -t `pmdate -30M %Y%m%d%H%M` $tmp/stamp
            fi
            if [ -n "`find "$logfile.lock" ! -newer $tmp/stamp -print 2>/dev/null`" ]
            then
                _warning "removing lock file older than 30 minutes"
                ls -l "$logfile.lock"
                rm -f "$logfile.lock"
            fi
        fi
        pmsleep 0.1
        delay=`expr $delay - 1`
    done

    if [ $delay -eq 0 ]
    then
        # failed to gain mutex lock
        #
        if [ -f "$logfile.lock" ]
        then
            _warning "is another PCP cron job running concurrently?"
            ls -l "$logfile.lock"
        else
            echo "$prog: `cat $tmp/out`"
        fi
        _warning "failed to acquire exclusive lock ($logfile.lock) ..."
        return 1
    fi
    return 0
}

# Usage: _unlock
# Release $logfile.lock and clear the record of it in $tmp/lock.
#
_unlock()
{
    rm -f "$logfile.lock"
    echo >$tmp/lock
}

# Usage: _check_logfile
# Diagnostic helper: dump the pmie output file $logfile, or when it is
# missing report that and (unless terse) list its directory.
#
_check_logfile()
{
    if [ ! -f "$logfile" ]
    then
        echo "$prog: Error: cannot find pmie output file at \"$logfile\""
        if $TERSE
        then
            :
        else
            logdir=`dirname "$logfile"`
            echo "Directory (`cd "$logdir"; $PWDCMND`) contents:"
            LC_TIME=POSIX ls -la "$logdir"
        fi
    else
        echo "Contents of pmie output file \"$logfile\" ..."
        cat "$logfile"
    fi
}

# Usage: _check_pmie pid
# Wait for the freshly started pmie process to create its status file
# $PCP_TMP_DIR/pmie/<pid>.  Returns 0 once the status file is seen,
# 1 if the process exits first or the wait times out (diagnostics are
# printed in both failure cases).
#
_check_pmie()
{
    $VERBOSE && $PCP_ECHO_PROG $PCP_ECHO_N " [process $1] ""$PCP_ECHO_C"

    # wait until pmie process starts, or exits
    #
    delay=5
    [ ! -z "$PMCD_CONNECT_TIMEOUT" ] && delay=$PMCD_CONNECT_TIMEOUT
    x=5
    [ ! -z "$PMCD_REQUEST_TIMEOUT" ] && x=$PMCD_REQUEST_TIMEOUT

    # wait for maximum time of a connection and 20 requests
    #
    delay=`expr \( $delay + 20 \* $x \) \* 10`  # tenths of a second
    while [ $delay -ne 0 ]
    do
        if [ -f "$logfile" ]
        then
            # $logfile was previously removed, if it has appeared again then
            # we know pmie has started ... if not just sleep and try again
            #
            if ls "$PCP_TMP_DIR/pmie/$1" >$tmp/out 2>&1
            then
                if grep "No such file or directory" $tmp/out >/dev/null
                then
                    :
                else
                    $VERBOSE && echo " done"
                    return 0
                fi
            fi

            _plist=`_get_pids_by_name pmie`
            _found=false
            for _p in `echo $_plist`
            do
                [ $_p -eq $1 ] && _found=true
            done

            if $_found
            then
                # process still here, just hasn't created its status file
                # yet, try again
                :
            else
                $VERBOSE || _message restart
                echo " process exited!"
                if $TERSE
                then
                    :
                else
                    echo "$prog: Error: failed to restart pmie"
                    echo "Current pmie processes:"
                    $PCP_PS_PROG $PCP_PS_ALL_FLAGS | tee $tmp/tmp | sed -n -e 1p
                    for _p in `echo $_plist`
                    do
                        sed -n -e "/^[ ]*[^ ]* [ ]*$_p /p" < $tmp/tmp
                    done
                    echo
                fi
                _check_logfile
                return 1
            fi
        fi
        pmsleep 0.1
        delay=`expr $delay - 1`
        # one progress dot per second when verbose
        $VERBOSE && [ `expr $delay % 10` -eq 0 ] && \
            $PCP_ECHO_PROG $PCP_ECHO_N ".""$PCP_ECHO_C"
    done
    $VERBOSE || _message restart
    echo " timed out waiting!"
    if $TERSE
    then
        :
    else
        sed -e 's/^/ /' $tmp/out
    fi
    _check_logfile
    return 1
}

# Usage: _get_configfile pmie-args...
# Print the value of the -c option found in the argument list (either
# "-c file" or "-cfile"); prints nothing when there is no -c option.
#
_get_configfile()
{
    # extract the pmie configuration file (-c) from a list of arguments
    #
    echo $@ | sed -n \
        -e 's/^/ /' \
        -e 's/[ ][ ]*/ /g' \
        -e 's/-c /-c/' \
        -e 's/.* -c\([^ ]*\).*/\1/p'
}

# Usage: _configure_pmie configfile
# If configfile exists and was generated by pmieconf, regenerate it and
# install the new version when it differs.  If configfile does not exist
# at all, generate it with pmieconf.  Any other existing file is left
# untouched.
#
_configure_pmie()
{
    # update a pmie configuration file if it should be created/modified
    #
    configfile="$1"

    if [ -f "$configfile" ]
    then
        # look for "magic" string at start of file, and ensure we created it
        sed 1q "$configfile" | grep '^// pmieconf-pmie [0-9]' >/dev/null
        magic=$?
        grep '^// Auto-generated by pmieconf' "$configfile" >/dev/null
        owned=$?
        if [ $magic -eq 0 -a $owned -eq 0 ]
        then
            # pmieconf file, see if re-generation is needed
            cp "$configfile" $tmp/pmie
            if $PMIECONF -f $tmp/pmie $CONFARGS >$tmp/diag 2>&1
            then
                # ignore the "generated by pmieconf" lines when comparing
                grep -v "generated by pmieconf" "$configfile" >$tmp/old
                grep -v "generated by pmieconf" $tmp/pmie >$tmp/new
                if ! diff $tmp/old $tmp/new >/dev/null
                then
                    if [ -w "$configfile" ]
                    then
                        $VERBOSE && echo "Reconfigured: \"$configfile\" (pmieconf)"
                        eval $CP $tmp/pmie "$configfile"
                    else
                        _warning "no write access to pmieconf file \"$configfile\", skip reconfiguration"
                        ls -l "$configfile"
                    fi
                fi
            else
                _warning "pmieconf failed to reconfigure \"$configfile\""
                # show the diagnostics with the scratch copy's name mapped
                # back to the real configuration file name
                sed -e "s;$tmp/pmie;$configfile;g" $tmp/diag
                echo "=== start pmieconf file ==="
                cat $tmp/pmie
                echo "=== end pmieconf file ==="
            fi
        fi
    elif [ ! -e "$configfile" ]
    then
        # file does not exist, generate it, if possible
        if $SHOWME
        then
            echo "+ $PMIECONF -f $configfile $CONFARGS"
        elif ! $PMIECONF -f "$configfile" $CONFARGS >$tmp/diag 2>&1
        then
            _warning "pmieconf failed to generate \"$configfile\""
            cat $tmp/diag
            echo "=== start pmieconf file ==="
            cat "$configfile"
            echo "=== end pmieconf file ==="
        else
            chown $PCP_USER:$PCP_GROUP "$configfile" >/dev/null 2>&1
        fi
    fi
}

QUIETLY=false
if [ $CHECK_RUNLEVEL = true ]
then
    # determine whether to start/stop based on runlevel settings - we
    # need to do this when running unilaterally from cron, else we'll
    # always start pmie up (even when we shouldn't).
    #
    QUIETLY=true
    if is_chkconfig_on pmie
    then
        START_PMIE=true
    else
        START_PMIE=false
    fi
fi

if [ $START_PMIE = false ]
then
    # if pmie has never been started, there's no work to do to stop it
    [ ! -d "$PCP_TMP_DIR/pmie" ] && exit
    $QUIETLY || $PCP_BINADM_DIR/pmpost "stop pmie from $prog"
fi

if [ ! -f "$CONTROL" ]
then
    echo "$prog: Error: cannot find control file ($CONTROL)"
    status=1
    exit
fi

# 1.0 is the first release, and the version is set in the control file
# with a $version=x.y line
#
version=1.0
eval `grep '^version=' "$CONTROL" | sort -rn`
if [ "$version" != "1.0" ]
then
    _error "unsupported version (got $version, expected 1.0)"
    status=1
    exit
fi

echo >$tmp/dir
rm -f $tmp/err $tmp/pmies

# NB: the loop below runs in a pipeline (a subshell in most shells), so
# failures are signalled to the parent through $tmp/err and the list of
# signalled pmies through $tmp/pmies, not through shell variables.
#
line=0
cat "$CONTROL" \
    | sed -e "s;PCP_LOG_DIR;$PCP_LOG_DIR;g" \
    | while read host socks logfile args
do
    # start in one place for each iteration (beware relative paths)
    cd "$here"
    line=`expr $line + 1`

    [ "x$host" = "xLOCALHOSTNAME" ] && host=local:

    case "$host"
    in
        \#*|'') # comment or empty
            continue
            ;;
        \$*)    # in-line variable assignment
            $SHOWME && echo "# $host $socks $logfile $args"
            cmd=`echo "$host $socks $logfile $args" \
                | sed -n \
                    -e "/='/s/\(='[^']*'\).*/\1/" \
                    -e '/="/s/\(="[^"]*"\).*/\1/' \
                    -e '/=[^"'"'"']/s/[;&<>|].*$//' \
                    -e '/^\\$[A-Za-z][A-Za-z0-9_]*=/{
s/^\\$//
s/^\([A-Za-z][A-Za-z0-9_]*\)=/export \1; \1=/p
}'`
            if [ -z "$cmd" ]
            then
                # in-line command, not a variable assignment
                _warning "in-line command is not a variable assignment, line ignored"
            else
                case "$cmd"
                in
                    'export PATH;'*)
                        _warning "cannot change \$PATH, line ignored"
                        ;;
                    'export IFS;'*)
                        _warning "cannot change \$IFS, line ignored"
                        ;;
                    *)
                        $SHOWME && echo "+ $cmd"
                        eval $cmd
                        ;;
                esac
            fi
            continue
            ;;
    esac

    # NB: FQDN cleanup: substitute the LOCALHOSTNAME marker in the config line
    # differently for the directory and the pcp -h HOST arguments.
    #
    # This is done only for real pmie records (after the case above) so
    # the words of an in-line variable assignment are never rewritten
    # as if they were a log file path.
    logfile_hostname=`hostname || echo localhost`
    logfile=`echo $logfile | sed -e "s;LOCALHOSTNAME;$logfile_hostname;"`
    logfile=`_unsymlink_path "$logfile"`

    if [ -z "$socks" -o -z "$logfile" -o -z "$args" ]
    then
        _error "insufficient fields in control file record"
        continue
    fi

    $VERY_VERBOSE && echo "Check pmie -h $host -l $logfile ..."

    # make sure output directory exists
    #
    dir=`dirname "$logfile"`
    if [ ! -d "$dir" ]
    then
        # mkdir output must not go to $tmp/err: the mere existence of
        # that file marks the whole run as failed
        mkdir -p -m 755 "$dir" >$tmp/mkdir 2>&1
        if [ ! -d "$dir" ]
        then
            cat $tmp/mkdir
            _error "cannot create directory ($dir) for pmie log file"
            continue
        fi
        chown $PCP_USER:$PCP_GROUP "$dir" >/dev/null 2>&1
    fi

    cd "$dir"
    dir=`$PWDCMND`
    $SHOWME && echo "+ cd $dir"

    # ensure pcp user will be able to write there
    #
    chown -R $PCP_USER:$PCP_GROUP "$dir" >/dev/null 2>&1
    if [ ! -w "$dir" ]
    then
        _warning "no write access in $dir, skip lock file processing"
        ls -ld "$dir"
    else
        # could not get the lock, leave this pmie alone
        _lock || continue
    fi

    # match $logfile from control file to running pmies
    pid=""
    for file in $PCP_TMP_DIR/pmie/[0-9]*
    do
        # unmatched glob is left as the literal pattern
        [ "$file" = "$PCP_TMP_DIR/pmie/[0-9]*" ] && continue
        $VERY_VERBOSE && $PCP_ECHO_PROG $PCP_ECHO_N "... try $file: ""$PCP_ECHO_C"

        p_id=`echo $file | sed -e 's,.*/,,'`
        p_logfile=""
        p_pmcd_host=""

        # throw away stderr in case $file has been removed by now
        eval `$PCP_BINADM_DIR/pmiestatus $file 2>/dev/null | $PCP_AWK_PROG '
NR == 2 { printf "p_logfile=\"%s\"\n", $0; next }
NR == 3 { printf "p_pmcd_host=\"%s\"\n", $0; next }
        { next }'`

        p_logfile=`_unsymlink_path "$p_logfile"`
        if [ "$p_logfile" != "$logfile" ]
        then
            $VERY_VERBOSE && echo "different logfile, skip"
            $VERY_VERBOSE && echo " $p_logfile differs to $logfile"
        elif _get_pids_by_name pmie | grep "^$p_id\$" >/dev/null
        then
            $VERY_VERBOSE && echo "pmie process $p_id identified, OK"
            pid=$p_id
            break
        else
            $VERY_VERBOSE && echo "pmie process $p_id not running, skip"
            $VERY_VERBOSE && _get_pids_by_name pmie
        fi
    done

    if $VERY_VERBOSE
    then
        if [ -z "$pid" ]
        then
            echo "No current pmie process exists for:"
        else
            echo "Found pmie process $pid monitoring:"
        fi
        echo " host = $host"
        echo " log file = $logfile"
    fi

    if [ -z "$pid" -a $START_PMIE = true ]
    then
        configfile=`_get_configfile $args`
        if [ ! -z "$configfile" ]
        then
            # if this is a relative path and not relative to cwd,
            # substitute in the default pmie search location.
            #
            if [ ! -f "$configfile" -a "`basename "$configfile"`" = "$configfile" ]
            then
                configfile="$PCP_SYSCONF_DIR/pmie/$configfile"
            fi

            # check configuration file exists and is up to date
            _configure_pmie "$configfile" "$host"
        fi

        args="-h $host -l $logfile $args"

        $VERBOSE && _message restart

        sock_me=''
        if [ "$socks" = y ]
        then
            # only check for pmsocks if it's specified in the control file
            have_pmsocks=false
            if which pmsocks >/dev/null 2>&1
            then
                # check if pmsocks has been set up correctly
                if pmsocks ls >/dev/null 2>&1
                then
                    have_pmsocks=true
                fi
            fi

            if $have_pmsocks
            then
                sock_me="pmsocks "
            else
                echo "$prog: Warning: no pmsocks available, would run without"
                sock_me=""
            fi
        fi

        [ -f "$logfile" ] && eval $MV -f "$logfile" "$logfile.prior"

        if $SHOWME
        then
            $VERBOSE && echo
            echo "+ ${sock_me}$PMIE -b $args"
            _unlock
            continue
        else
            # since this is launched as a sort of daemon, any output should
            # go on pmie's stderr, i.e. $logfile ... use -b for this
            #
            $VERY_VERBOSE && ( echo; $PCP_ECHO_PROG $PCP_ECHO_N "+ ${sock_me}$PMIE -b $args""$PCP_ECHO_C"; echo "..." )
            $PCP_BINADM_DIR/pmpost "start pmie from $prog for host $host"
            ${sock_me}$PMIE -b $args &
            pid=$!
        fi

        # wait for pmie to get started, and check on its health
        _check_pmie $pid

    elif [ ! -z "$pid" -a $START_PMIE = false ]
    then
        # Send pmie a SIGTERM, which is noted as a pending shutdown.
        # Add pid to list of pmies sent SIGTERM - may need SIGKILL later.
        #
        $VERY_VERBOSE && echo "+ $KILL -s TERM $pid"
        eval $KILL -s TERM $pid
        $PCP_ECHO_PROG $PCP_ECHO_N "$pid ""$PCP_ECHO_C" >> $tmp/pmies
    fi

    _unlock
done

# check all the SIGTERM'd pmies really died - if not, use a bigger hammer.
#
if $SHOWME
then
    :
elif [ $START_PMIE = false -a -s $tmp/pmies ]
then
    pmielist=`cat $tmp/pmies`
    if ps -p "$pmielist" >/dev/null 2>&1
    then
        $VERY_VERBOSE && ( echo; $PCP_ECHO_PROG $PCP_ECHO_N "+ $KILL -KILL `cat $tmp/pmies` ...""$PCP_ECHO_C" )
        eval $KILL -s KILL $pmielist >/dev/null 2>&1
        delay=30        # tenths of a second
        while ps -f -p "$pmielist" >$tmp/alive 2>&1
        do
            if [ $delay -gt 0 ]
            then
                pmsleep 0.1
                delay=`expr $delay - 1`
                continue
            fi
            echo "$prog: Error: pmie process(es) will not die"
            cat $tmp/alive
            status=1
            break
        done
    fi
fi

# $tmp/err is created by _error (possibly inside the loop's subshell)
[ -f $tmp/err ] && status=1
exit