#! /bin/sh
#
# Control script for running PCP QA tests
#
# Copyright (c) 1997-2002 Silicon Graphics, Inc.  All Rights Reserved.
#

# Run-time state shared by the functions and the main loop below.
mypid=$$		# pid of this check process; owner tag in lock/pid files
status=0		# exit status handed to "exit" by the exit trap
needwrap=true		# true until _wrapup has written the end-of-run summary
try=0			# count of tests that were attempted
n_bad=0			# count of failed tests
bad=""			# names of failed tests, each preceded by a space
notrun=""		# names of tests not run, each preceded by a space
interrupt=true		# set to false only after the main loop completes
# $0 is quoted so an invocation path containing white space is not split
myname=`basename "$0"`
iam=$myname  #  a synonym

# status and log files
#  If you change CHECKSTS or CHECKPID, hangcheck.pcpqa will need to
#  change, too.
CHECKLOCK=/tmp/check-LOCK
CHECKSTS=/tmp/check.sts
CHECKPID=/tmp/check.pid
#  A check.log already exists for another reason.
CHECKSLOG=/var/tmp/check-start.log


# Print the current time of day (as reported by date) as a single
# number: seconds since midnight.
# Uses $PCP_AWK_PROG as the awk interpreter.
_wallclock()
{
    date "+%H %M %S" \
    | $PCP_AWK_PROG '{ secs = ($1 * 3600) + ($2 * 60) + $3; print secs }'
}

# Write " [<date>-<time>]" for the current moment, using the date
# format "%D-%T".  Output goes through $PCP_ECHO_PROG with
# $PCP_ECHO_N / $PCP_ECHO_C (presumably the platform's way of
# suppressing the trailing newline -- defined outside this file).
# Side effect: leaves the formatted time in the global $now.
_timestamp()
{
    now=`date "+%D-%T"`
    $PCP_ECHO_PROG $PCP_ECHO_N " [${now}]${PCP_ECHO_C}"
}

# Remove the lock file $CHECKLOCK, but only when it records our own
# pid ($mypid).  A missing or unreadable lock file is not an error.
# Side effect: sets the global $LOCKOWNER to the pid read from the file.
# Always returns 0.
_release_lock()
{
    # no lock file, nothing to release
    [ -f "$CHECKLOCK" ] || return 0

    LOCKOWNER=`cat "$CHECKLOCK" 2>/dev/null` || return 0
    if [ "$LOCKOWNER" = "$mypid" ]
    then
        rm -f "$CHECKLOCK"
    fi

    return 0
}

# End-of-run cleanup and summary, invoked from the exit/signal trap.
#
# Reads the globals HANGCHECK, USER, CHECKPID, CHECKSTS, mypid, tmp,
# showme, needwrap, list, interrupt, seq, notrun, n_bad, bad, try and
# color.  Steps, in order:
#   - when running under hangcheck as user pcpqa, remove the check.sts
#     and check.pid files, but only if check.pid records our own pid
#   - if $tmp was never set, stop there
#   - release the lock and remove the $tmp directory
#   - once only (guarded by $needwrap), and not in $showme mode: merge
#     the new timings in $tmp.time into check.time, append a summary to
#     check.log, and print the not-run / failed / passed summary
#   - remove every $tmp.* file
#
# All expansions are quoted so that an empty $color or a $tmp containing
# white space cannot produce "[" syntax errors or split file names.
_wrapup()
{
    # for hangcheck ...
    # remove files that were used by hangcheck
    #
    if [ "$HANGCHECK" = true -a "$USER" = pcpqa ]
    then
        # a missing pid file simply means there is nothing of ours to remove
        checkpid=`cat "$CHECKPID" 2>/dev/null`
        [ "$checkpid" = "$mypid" -a -f "$CHECKSTS" ] && rm -f "$CHECKSTS"
        [ "$checkpid" = "$mypid" -a -f "$CHECKPID" ] && rm -f "$CHECKPID"
    fi

    if [ -z "$tmp" ]
    then
        # did not get very far into the initialization!
        :
    else
        # release the lock and remove backup files
        _release_lock
        [ -d "$tmp" ] && ( rm -rf "$tmp/checksums" ; rmdir "$tmp" )

        if $showme
        then
            :
        elif $needwrap
        then
            if [ -f check.time -a -f "$tmp.time" ]
            then
                # later lines win, so timings from this run replace the
                # older entries for the same test
                cat check.time "$tmp.time" \
                | $PCP_AWK_PROG '
	{ t[$1] = $2 }
END	{ if (NR > 0) {
	    for (i in t) print i " " t[i]
	  }
	}' \
                | sort -n >"$tmp.out"
                mv "$tmp.out" check.time
            fi

            echo "" >>check.log
            date >>check.log
            # $list is deliberately unquoted: one word per test
            echo $list | fmt | sed -e 's/^/    /' >>check.log
            $interrupt && echo "Interrupted! [running $seq]" >>check.log

            if [ ! -z "$notrun" ]
            then
                [ "$color" = true ] && tput bold && tput setaf 4 # blue
                echo "Not run:$notrun"
                [ "$color" = true ] && tput sgr0 # reset
                echo "Not run:$notrun" | fmt >>check.log
            fi
            if [ ! -z "$n_bad" -a "$n_bad" != 0 ]
            then
                [ "$color" = true ] && tput bold && tput setaf 1 # red
                echo "Failures:$bad"
                echo "Failed $n_bad of $try tests"
                [ "$color" = true ] && tput sgr0 # reset
                echo "Failures:$bad" | fmt >>check.log
                echo "Failed $n_bad of $try tests" >>check.log
            else
                if [ "$try" != 0 ]
                then
                    [ "$color" = true ] && tput bold && tput setaf 2 # green
                    echo "Passed all $try tests"
                    [ "$color" = true ] && tput sgr0 # reset
                    echo "Passed all $try tests" >>check.log
                fi
            fi
            # the trap can fire more than once; summarize only the first time
            needwrap=false
        fi

        rm -f "$tmp".*
    fi
}

# Append file names to a list file, skipping names already listed.
#
# Usage: _addfiles listfile name ...
#   $1      the list file; created if it does not exist
#   $2...   names to add, one line per name
# Returns 1 if no list file is given, otherwise 0.
# Side effect: sets the globals $af and $fn.
#
# A name counts as "already listed" only when a whole line equals it
# (grep -F -x).  Substring matching would wrongly skip e.g. /etc/pcp
# when /etc/pcp.conf is already in the list.
_addfiles ()
{
    af=$1
    [ "$af" = "" ] && return 1
    [ ! -f "$af" ] && touch "$af"
    shift

    for fn in "$@"
    do
        # -e protects names that begin with "-"; -s hides file errors
        grep -F -x -s -e "$fn" "$af" >/dev/null
        # status 1 means "no match"; 2 (error) must not trigger an append
        [ $? = 1 ] && echo "$fn" >>"$af"
    done

    return 0
}

# Remove a stale lock.
#
# If $CHECKLOCK exists, read the owner pid from it and look in the ps
# listing for a process with exactly that pid whose ps line contains
# "check".  If there is none (or the lock file is empty) the lock is
# removed with "$sudo rm -f".
# Side effect: sets the globals $PID and $CCNT.  Always returns 0.
#
# The pid is compared against the first ps column only.  A plain
# grep for the pid also matches longer pids and arguments containing
# the number, which can yield several lines and so delete the lock of
# a live owner.
_check_lock() {
    #  Check that a check process of that process ID found in
    #  $CHECKLOCK exists, and if not, release the lock.

    [ ! -f "$CHECKLOCK" ] && return 0
    PID=`cat "$CHECKLOCK" 2>/dev/null` || return 0

    CCNT=""
    if [ -n "$PID" ]
    then
        CCNT=`ps -e -o "pid args" | grep -v grep | \
          $PCP_AWK_PROG -v pid="$PID" '$1 == pid && /check/ { print $1 }'`
    fi
    if [ -z "$PID" ] || [ "$PID" != "$CCNT" ]
    then
        #  We can remove the lock; no check process found with that ID
        $sudo rm -f "$CHECKLOCK"
    fi

    return 0
}

# Acquire the check lock ($CHECKLOCK), waiting for another owner to
# release it.
#
# Returns 0 when the lock file records our pid ($mypid), 1 when the
# lock could not be obtained.  Prints a "waiting for lock" message the
# first time it has to wait and "got it; proceeding: $seq" when a wait
# ended.  Side effect: sets the globals $LOCKOWNER, $echomessage and
# $sleeptime.
#
# The lock file is created with noclobber (set -C) so that two check
# processes which both see "no lock" cannot both create it.
_get_lock()
{
    #  Does someone else have a lock on check at this time?  If so, we
    #  can't run a test until the lock is removed.
    #
    #  NOTE: the use of check-LOCK rather than check.pid was done so that
    #  people running check manually (rather than run.pcpqa running check)
    #  can have tests running between themselves.  This is better than
    #  having people waiting on one long series of tests passed to check
    #  and having spent 10 minutes waiting for nothing.

    #  Check that an instance of check who claims to have the lock actually
    #  exists!
    _check_lock

    #  Get (make) a lock
    echomessage=true
    _gotlock=false
    for sleeptime in \
      1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 \
      1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 \
      1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 \
      1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 \
      2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 \
      2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 \
      2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 \
      5 5 5 5 5 5 5 5 5 5 5 5 \
      5 5 5 5 5 5 5 5 5 5 5 5 \
      5 5 5 5 5 5 5 5 5 5 5 5 \
      5 5 5 5 5 5 5 5 5 5 5 5 \
      5 5 5 5 5 5 5 5 5 5 5 5 0  #  10 minutes waiting time per test...
    do
        if [ -f "$CHECKLOCK" ]
        then
            LOCKOWNER=`cat "$CHECKLOCK" 2>/dev/null` || continue

            if [ "$LOCKOWNER" = "$mypid" ]
            then
                #  already have lock
                _gotlock=true
                break
            fi

            #  wait until lock disappears...
            if [ "$sleeptime" = 0 ]
            then
                #  We should leave... something's terribly wrong.
                echo ""
                return 1
            fi
            $echomessage && \
              $PCP_ECHO_PROG $PCP_ECHO_N " waiting for lock [owner pid=$LOCKOWNER]... ""$PCP_ECHO_C" && \
              echomessage=false
            sleep $sleeptime
        else
            #  make lock; with noclobber the redirection fails if another
            #  process created the file since the test above
            if ( set -C; echo "$mypid" >"$CHECKLOCK" ) 2>/dev/null
            then
                chmod a+r "$CHECKLOCK"
                _gotlock=true
                break
            fi
            #  lost the race (or cannot write): go around again
        fi
    done

    if $_gotlock
    then
        :
    else
        #  ran out of attempts without ever owning the lock
        $echomessage || echo ""
        return 1
    fi
    $echomessage || echo "got it; proceeding: $seq"

    return 0
}

# Create $tmp.checkfiles, the list of configuration and control files
# that _checksums reads, one path per line.  Does nothing if the list
# file already exists.
#
# PCP_PMCDOPTIONS_PATH, PCP_PMLOGGERCONTROL_PATH and
# PCP_PMIECONTROL_PATH are given defaults below $PCP_SYSCONF_DIR when
# they are empty.  Side effect: sets the global $_checkfiles.
#
# The list is written through _addfiles; without that step
# $tmp.checkfiles is never created and _checksums has nothing to read.
# The pmie control file takes the slot where the pmlogger control file
# used to be listed a second time.
_make_checkfiles()
{
    if [ ! -f "$tmp.checkfiles" ]
    then
        [ -z "$PCP_PMCDOPTIONS_PATH" ] && \
            PCP_PMCDOPTIONS_PATH="$PCP_SYSCONF_DIR/pmcd/pmcd.options"
        [ -z "$PCP_PMLOGGERCONTROL_PATH" ] && \
            PCP_PMLOGGERCONTROL_PATH="$PCP_SYSCONF_DIR/pmlogger/control"
        [ -z "$PCP_PMIECONTROL_PATH" ] && \
            PCP_PMIECONTROL_PATH="$PCP_SYSCONF_DIR/pmie/control"
        _checkfiles="$PCP_PMCDCONF_PATH \
            $PCP_PMLOGGERCONTROL_PATH \
            $PCP_SYSCONF_DIR/pmlogger/config.default \
            $PCP_PMIECONTROL_PATH \
            $PCP_PMCDOPTIONS_PATH \
            $PCP_DIR/etc/init.d/pcp \
            $PCP_DIR/etc/pcp.conf $PCP_DIR/etc/pcp.env \
            $PCP_PMDAS_DIR/sample/dsohelp.dir \
            $PCP_PMDAS_DIR/sample/dsohelp.pag \
            $PCP_PMDAS_DIR/sample/help.dir \
            $PCP_PMDAS_DIR/sample/help.pag \
            $PCP_PMDAS_DIR/simple/simple.conf"
        # $_checkfiles is deliberately unquoted: one argument per path
        _addfiles "$tmp.checkfiles" $_checkfiles
    fi
}

# Record, and later verify, the state of the files listed in
# $tmp.checkfiles (created on demand by _make_checkfiles).
#
# Usage:
#   _checksums get
#       For every listed file that exists, print a line
#       "<sum output> <path>" on stdout and keep a backup copy in
#       $tmp/checksums (made with "$sudo cp", first time only).
#   _checksums check sumsfile
#       Compare each listed file against the lines in sumsfile (output
#       of an earlier "get").  A file recorded there that no longer
#       exists is reported as "Missing"; a file whose line is not found
#       is reported as "Changed" and saved as <path>.$seq.O.  In both
#       cases the backup copy, if any, is put back.
# Returns 0 for get/check and 1 for any other command.
# Side effect: sets the globals $cmd, $f, $buf and $_cs.
#
# The path is added to each line here rather than relying on sum to
# print it, because sum implementations differ on whether the name is
# shown for a single file.  Files are listed one per word, so paths
# containing white space are not supported.
_checksums()
{
    cmd="$1"

    _make_checkfiles

    case "$cmd"
    in
        get)
            mkdir -p "$tmp/checksums"
            chmod a+w "$tmp/checksums"
            for f in `cat "$tmp.checkfiles"`
            do
                # backup name: the path with every "/" turned into "+"
                buf=`echo "$f" | sed -e 's;/;+;g'`
                buf="$tmp/checksums/$buf"
                if [ -f "$f" ]
                then
                    _cs=`sum <"$f"`
                    echo "$_cs $f"
                    [ -f "$buf" ] || $sudo cp "$f" "$buf"
                fi
            done
            ;;

        check)
            for f in `cat "$tmp.checkfiles"`
            do
                buf=`echo "$f" | sed -e 's;/;+;g'`
                buf="$tmp/checksums/$buf"
                if [ ! -f "$f" ]
                then
                    # only a file that was present at "get" time is missing
                    if grep -F -e "$f" "$2" >/dev/null 2>&1
                    then
                        echo "    Missing: \"$f\""
                        [ -f "$buf" ] && $sudo cp -f "$buf" "$f"
                    fi
                else
                    _cs=`sum <"$f"`
                    _cs="$_cs $f"
                    if grep -F -e "$_cs" "$2" >/dev/null 2>&1
                    then
                        # unchanged: drop any stale saved copy for this test
                        $sudo rm -f "$f.$seq.O"
                    else
                        echo "    Changed: \"$f\""
                        $sudo cp -f "$f" "$f.$seq.O"
                        [ -f "$buf" ] && $sudo cp -f "$buf" "$f"
                    fi
                fi

            done
            ;;

        *)
            echo "$myname: _checksums: unknown command \"$cmd\" (expected get or check)" >&2
            return 1
            ;;
    esac
    return 0
}

# On exit (0) and on signals 1, 2, 3 and 15 run _wrapup, then exit with
# whatever $status holds at that moment (single quotes keep $status
# unexpanded until the trap fires).
trap '_wrapup; exit $status' 0 1 2 3 15

# timestamps around each test are off by default
timestamp=false

# extra stuff for tracing QA runs - switched off/on via $qatrace
qatrace=false
qadepot=mazur.melbourne		# host passed to pmprobe/pmtrace with -h
qasrc=`hostname`		# first component of each trace tag

# constants - meaningful as state transitions in qavis
qanotyet=1	# test not yet started
qarunning=2	# test still going
qafailed=3	# test failed
qapassed=4	# test passed

PCP_TRACE_TIMEOUT=15
export PCP_TRACE_TIMEOUT

# generic initialization... this may take a while to run, because (unless
# $quick is true) make is run.
. ./common

# we have to cheat a bit... but we need to create a check.[pid|sts] file
# to tell hangcheck that we are alive, but not ready to run yet.
#
# The owner of an existing file is taken from the third column of
# "ls -l".  A file owned by someone other than pcpqa is removed; one
# owned by pcpqa means another QA run may be active, so we refuse to
# start.  $status is set before exiting because the exit trap ends with
# "exit $status" and would otherwise report success.
if [ "$HANGCHECK" = true -a "$USER" = pcpqa ]
then
    # for hangcheck ...
    # Save pid of check in a well known place, so that hangcheck can be sure it
    # has the right pid (getting the pid from ps output is not reliable enough).
    #
    if [ -f "$CHECKPID" ]
    then
        checkpidowner=`ls -l "$CHECKPID" | $PCP_AWK_PROG '{ print $3 }'`
        if [ "$checkpidowner" != pcpqa ]
        then
            $sudo rm -f "$CHECKPID"
        else
            #  There should be a BIG FAT WARNING here if QA is trying to
            #  run tests twice!
            echo "$myname: a check.pid file already exists... are you already running tests?!" >&2
            status=1
            exit 1
        fi
    fi
    [ ! -f "$CHECKPID" ] && echo "$mypid" >"$CHECKPID"

    # for hangcheck ...
    # Save the status of check in a well known place, so that hangcheck can be
    # sure to know where check is up to (getting test number from ps output is
    # not reliable enough since the trace stuff has been introduced).
    #
    if [ -f "$CHECKSTS" ]
    then
        checkpidowner=`ls -l "$CHECKSTS" | $PCP_AWK_PROG '{ print $3 }'`
        if [ "$checkpidowner" != pcpqa ]
        then
            $sudo rm -f "$CHECKSTS"
        else
            echo "$myname: a check.sts file already exists... are you already running tests?!" >&2
            status=1
            exit 1
        fi
    fi
    [ ! -f "$CHECKSTS" ] && echo "preamble" >"$CHECKSTS"
fi

# the main loop reads check.time, so make sure it exists
if [ ! -f check.time ]
then
    touch check.time
fi

# turn pmcd on if _get_config does not already report it as "on"
if [ "`_get_config pmcd`" != on ]
then
    _change_config pmcd on
fi

# no tracing when only listing the tests
if $showme
then
    qatrace=false
fi

if $qatrace
then
    # if tracing turned on, make sure trace agent running ok
    switchon=`pmprobe -h $qadepot trace.control.reset 2>&1 | $PCP_AWK_PROG '{ print $2 }'`
    if [ "$switchon" != "1" ]
    then
	qatrace=false
    else
	# mark every test in the list as "not yet started"
	for seq in $list
	do
	    if $verbose
	    then
		printf "Preparing pmtrace tags: %-.16s:%s\r" "$qasrc" "$seq"
	    fi
	    pmtrace -qh $qadepot -v $qanotyet "$qasrc:$seq" 2>/dev/null
	done
	# blank out the progress line
	if $verbose
	then
	    printf "%68s\r" " "
	fi
    fi
fi

# number of tests requested, and how many have been started so far
haverun=0
torun=`echo $list | wc -w | sed -e 's/ //g'`

for seq in $list
do
    err=false
    if $showme
    then
	echo $seq
	continue
    fi
    if [ $torun -gt 9 ]
    then
	pct=`expr 100 \* $haverun / $torun`
	haverun=`expr $haverun + 1`
	$PCP_ECHO_PROG $PCP_ECHO_N "[$pct%] ""$PCP_ECHO_C"
    fi
    $PCP_ECHO_PROG $PCP_ECHO_N "$seq""$PCP_ECHO_C"
    if [ ! -f $seq ]
    then
	echo " [not run, missing]"
	notrun="$notrun $seq"
	continue
    else
	# really going to try and run this one
	#
	rm -f $seq.out.bad
	lasttime=`sed -n -e "/^$seq /s/.* //p" <check.time`
	[ "X$lasttime" != X ] && $PCP_ECHO_PROG $PCP_ECHO_N " ${lasttime}s ...""$PCP_ECHO_C"
	rm -f core $seq.notrun

    	# acquire lock
    	_get_lock
    	if [ $? != 0 ]
    	then echo "$myname: could not acquire lock; exiting" 2>&1
    	fi

	if $check_config
	then
	    # save checksums for critical conf and control files
	    [ ! -f $tmp.checksums ] && _checksums get >$tmp.checksums
	fi

	start=`_wallclock`
	$timestamp && _timestamp

	# for hangcheck ...
	[ "$HANGCHECK" = true -a "$USER" = pcpqa ] && echo "$seq" >"$CHECKSTS"

	if $qatrace
	then
	    pmtrace -qh $qadepot -v $qarunning "$qasrc:$seq" 2>/dev/null
	    pmtrace -qh $qadepot -e "sh $seq" "$qasrc:$seq" >$tmp.out.1 2>&1
	    sts=$?
	    # check for trace errors on first line of test & blow them away
	    $PCP_AWK_PROG '/pmtrace: / {if (NR != 1) print $0; next} {print $0}' $tmp.out.1 > $tmp.out
	else
	    sh $seq >$tmp.out 2>&1
	    sts=$?
	fi
	$timestamp && _timestamp
	stop=`_wallclock`

	# for hangcheck ...
	[ "$HANGCHECK" = true -a "$USER" = pcpqa ] && echo "working" >"$CHECKSTS"

	if $check_config
	then
	    # check the saved checksums
	    _checksums check $tmp.checksums >$tmp.check
	    if [ -s $tmp.check ]
	    then
		echo "$myname: $seq: ERROR: test failed to restore the following config files:" >>$tmp.out
		cat $tmp.check >>$tmp.out
		$PCP_ECHO_PROG $PCP_ECHO_N " [config not restored]""$PCP_ECHO_C"
	    fi
	fi

    	# remove the lock
    	_release_lock

	if [ -f core ]
	then
	    $PCP_ECHO_PROG $PCP_ECHO_N " [dumped core]""$PCP_ECHO_C"
	    mv core $seq.core
	    err=true
	fi

	if [ -f $seq.notrun ]
	then
	    [ $color = true ] && tput bold && tput setaf 4 # blue
	    echo " [not run] `cat $seq.notrun`"
	    [ $color = true ] && tput sgr0 # reset
	    notrun="$notrun $seq"
	else
	    if [ $sts -ne 0 ]
	    then
		$PCP_ECHO_PROG $PCP_ECHO_N " [failed, exit status $sts]""$PCP_ECHO_C"
		err=true
	    fi
	    if [ ! -f $seq.out ]
	    then
		echo " - no qualified output"
		mv $tmp.out $seq.out.bad
		err=true
	    else
		if diff $seq.out $tmp.out >/dev/null 2>&1
		then
		    echo ""
		    if $err
		    then
			:
		    else
			echo "$seq `expr $stop - $start`" >>$tmp.time
		    fi
		else
		    [ $color = true ] && tput bold && tput setaf 1 # red
		    echo " - output mismatch (see $seq.out.bad)"
		    [ $color = true ] && tput sgr0 # reset
		    mv $tmp.out $seq.out.bad
		    $diff $seq.out $seq.out.bad
		    err=true
		fi
	    fi

	    # really tried to run the test, update the state
	    #
	    if $qatrace
	    then
		if $err
		then
		    pmtrace -qh $qadepot -v $qafailed "$qasrc:$seq:$qaown" 2>/dev/null
		else
		    pmtrace -qh $qadepot -v $qapassed "$qasrc:$seq:$qaown" 2>/dev/null
		fi
	    fi
	fi
    fi

    # come here for each test, except when $showme is true
    #
    if $err
    then
	bad="$bad $seq"
	n_bad=`expr $n_bad + 1`
	quick=false
	[ $diff = true ] || echo "Check local PMCD is still alive ..."
	$OPTION_AGENTS && _haveagents
	$OPTION_LOGGER && _havelogger
    fi
    [ -f $seq.notrun ] || try=`expr $try + 1`
    rm -f $seq.notrun
done

# Normal completion: the loop was not interrupted, and the exit status
# reports the number of failed tests.
interrupt=false
status=$n_bad
# An exit status is reported modulo 256, so e.g. 256 failures would
# look like success; clamp to the largest value that can be reported.
if [ "$n_bad" -gt 255 ] 2>/dev/null
then
    status=255
fi
exit $status