diff options
Diffstat (limited to 'src/pmie/examples')
50 files changed, 740 insertions, 0 deletions
diff --git a/src/pmie/examples/GNUmakefile b/src/pmie/examples/GNUmakefile new file mode 100644 index 0000000..f4db24f --- /dev/null +++ b/src/pmie/examples/GNUmakefile @@ -0,0 +1,91 @@ +# +# Copyright (c) 2000-2001 Silicon Graphics, Inc. All Rights Reserved. +# +# This program is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by the +# Free Software Foundation; either version 2 of the License, or (at your +# option) any later version. +# +# This program is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +# or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +# for more details. +# + +TOPDIR = ../../.. +include $(TOPDIR)/src/include/builddefs + +UAG_SOURCE = uag.head $(sort $(wildcard uag.[0-9][0-9])) +UPM_SOURCE = upm.head $(sort $(wildcard upm.[0-9][0-9])) +CPU_SOURCE = cpu.head $(sort $(wildcard cpu.[0-9][0-9])) +DISK_SOURCE = disk.head $(sort $(wildcard disk.[0-9][0-9])) +FILESYS_SOURCE = filesys.head $(sort $(wildcard filesys.[0-9][0-9])) +RAS_SOURCE = ras.head $(sort $(wildcard ras.[0-9][0-9])) +SWAP_SOURCE = swap.head $(sort $(wildcard swap.[0-9][0-9])) +NETWORK_SOURCE = network.head $(sort $(wildcard network.[0-9][0-9])) +ENVIRON_SOURCE = environ.head $(sort $(wildcard environ.[0-9][0-9])) +WEBREPORT_SOURCE= webreport.head $(sort $(wildcard webreport.[0-9][0-9])) + +TARGETS = UAG UPM CPU DISK FILESYS RAS SWAP NETWORK ENVIRON \ + WEBREPORT + +LDIRT = $(TARGETS) + +LSRCFILES = README $(UAG_SOURCE) $(UPM_SOURCE) $(CPU_SOURCE) \ + $(DISK_SOURCE) $(FILESYS_SOURCE) $(RAS_SOURCE) $(SWAP_SOURCE) \ + $(NETWORK_SOURCE) $(ENVIRON_SOURCE) $(WEBREPORT_SOURCE) +EX_DIR = $(PCP_SHARE_DIR)/examples/pmie + +default: $(TARGETS) README + +install: default + $(INSTALL) -m 755 -d $(EX_DIR) + $(INSTALL) -m 644 $(TARGETS) README $(EX_DIR) + +UAG: $(UAG_SOURCE) + rm -f UAG + for file in $(UAG_SOURCE); do cat 
$$file >>UAG; echo >>UAG; done + +UPM: $(UPM_SOURCE) + rm -f UPM + for file in $(UPM_SOURCE); do cat $$file >>UPM; echo >>UPM; done + +CPU: $(CPU_SOURCE) + rm -f CPU + for file in $(CPU_SOURCE); do cat $$file >>CPU; echo >>CPU; done + +DISK: $(DISK_SOURCE) + rm -f DISK DISK.in + for file in $(DISK_SOURCE); do cat $$file >>DISK.in; echo >>DISK.in; done + sed -e "s@/usr/pcp/bin/pmpost@$(PCP_BINADM_DIR)/pmpost@" <DISK.in >DISK + rm -f DISK.in + +FILESYS: $(FILESYS_SOURCE) + rm -f FILESYS + for file in $(FILESYS_SOURCE); do cat $$file >>FILESYS; echo >>FILESYS; done + +RAS: $(RAS_SOURCE) + rm -f RAS + for file in $(RAS_SOURCE); do cat $$file >>RAS; echo >>RAS; done + +SWAP: $(SWAP_SOURCE) + rm -f SWAP + for file in $(SWAP_SOURCE); do cat $$file >>SWAP; echo >>SWAP; done + +NETWORK: $(NETWORK_SOURCE) + rm -f NETWORK + for file in $(NETWORK_SOURCE); do cat $$file >>NETWORK; echo >>NETWORK; done + +ENVIRON: $(ENVIRON_SOURCE) + rm -f ENVIRON + for file in $(ENVIRON_SOURCE); do cat $$file >>ENVIRON; echo >>ENVIRON; done + +WEBREPORT: $(WEBREPORT_SOURCE) + rm -f WEBREPORT + for file in $(WEBREPORT_SOURCE); do cat $$file >>WEBREPORT; echo >>WEBREPORT; done + +include $(BUILDRULES) + +default_pcp : default + +install_pcp : install diff --git a/src/pmie/examples/README b/src/pmie/examples/README new file mode 100644 index 0000000..1bdf4a6 --- /dev/null +++ b/src/pmie/examples/README @@ -0,0 +1,27 @@ +Example Expressions and Rules for pmie(1) + +The files in this directory contain a number of sample rules +for the Performance Co-Pilot inference engine pmie(1). + +In some cases the rules could be used directly. In others the +rules would require customization (host names, thresholds, rule +evaluation frequency, choose a suitable alarm/action, etc) before +they could be used at a particular site. 
+ +Each file contains a set of related rules as follows: + +CPU general CPU utilization and saturation monitoring + +DISK general disk utilization and saturation monitoring + +FILESYS general file system monitoring + +RAS simple reliability and availability monitoring + (see also the shping PMDA, pmdashping(1)) + +UAG examples from the Performance Co-Pilot User's and + Administrator's Guide + +UPM examples from the Performance Co-Pilot Programmer's + Guide + diff --git a/src/pmie/examples/cpu.00 b/src/pmie/examples/cpu.00 new file mode 100644 index 0000000..403dfa4 --- /dev/null +++ b/src/pmie/examples/cpu.00 @@ -0,0 +1,9 @@ +// +// Unusual usr-sys split when some CPU is more than 20% in usr mode +// and sys mode is at least 1.5 times usr mode +// +cpu_usr_sys = + some_inst ( + $percpu.cpu.sys > $percpu.cpu.user * 1.5 && $percpu.cpu.user > 0.2 + ) + -> alarm "Unusual sys time: " "%i "; diff --git a/src/pmie/examples/cpu.01 b/src/pmie/examples/cpu.01 new file mode 100644 index 0000000..799aa99 --- /dev/null +++ b/src/pmie/examples/cpu.01 @@ -0,0 +1,15 @@ +// +// Over all CPUs, syscall_rate > 1000 * no_of_cpus +// +cpu_syscall = + $all.syscall > 1000 count/sec * hinv.ncpu + -> print "high aggregate syscalls: %v"; + +// Sustained high syscall rate on a single CPU +// +delta = 30 sec; +percpu_syscall = + some_inst ( + $percpu.syscall > 2000 count/sec + ) + -> syslog "Sustained syscalls per second? " "[%i] %v "; diff --git a/src/pmie/examples/cpu.02 b/src/pmie/examples/cpu.02 new file mode 100644 index 0000000..d5b1fbb --- /dev/null +++ b/src/pmie/examples/cpu.02 @@ -0,0 +1,10 @@ +// +// the 1 minute load average exceeds 5 * number of CPUs on any host +// + +hosts = ":gonzo :moomba"; // change as required +delta = 1 minute; // no need to evaluate more often than this +high_load = + some_host ( + $all.load $hosts #'1 minute' > 5 * hinv.ncpu $hosts + ) -> alarm "High Load Average? 
" "%h: %v "; diff --git a/src/pmie/examples/cpu.head b/src/pmie/examples/cpu.head new file mode 100644 index 0000000..52a493f --- /dev/null +++ b/src/pmie/examples/cpu.head @@ -0,0 +1,12 @@ +// +// Some Common Performance Monitoring Scenarios +// +// The CPU Group +// + +delta = 2 sec; // more often for demonstration purposes + +// common prefixes +// +percpu = "kernel.percpu"; +all = "kernel.all"; diff --git a/src/pmie/examples/disk.00 b/src/pmie/examples/disk.00 new file mode 100644 index 0000000..a69a8d7 --- /dev/null +++ b/src/pmie/examples/disk.00 @@ -0,0 +1,28 @@ +// +// Any disk performing more than 40 I/Os per second, sustained over +// at least 30 seconds is probably busy +// +delta = 30 seconds; +disk_busy = + some_inst ( + $disk.dev.total > 40 count/sec + ) + -> shell 15 mins "Mail -s 'Heavy sustained disk traffic' sysadm </dev/null"; + +// Try and catch bursts of activity ... more than 60 I/Os per second +// for at least 25% of 8 consecutive 3 second samples +// +delta = 3 sec; +disk_burst = + some_inst ( + 25%_sample ( + $disk.dev.total @0..7 > 60 count/sec + ) + ) + -> alarm 5 mins "Disk Burst? 
" "%i "; + +// any SCSI disk controller performing more than 3 Mbytes per sec is busy +// +some_inst $disk.ctl.blktotal * 0.5 > 3 Mbyte/sec + -> alarm "Busy Disk Controller: " "%i "; + diff --git a/src/pmie/examples/disk.10 b/src/pmie/examples/disk.10 new file mode 100644 index 0000000..9cbf50d --- /dev/null +++ b/src/pmie/examples/disk.10 @@ -0,0 +1,23 @@ +// +// A subset of the disks on a particular host are either busy +// (more than 10 I/Os per second averaged over these disks) or one +// disk is busy (more than 50 I/Os per second) with write-dominated +// (more than 75%) activity + +delta = 10 sec; + +myhost = "moomba"; // the host of interest +mydisks = "#dks1d1 #dks1d2 #dks3d2"; // the disks of interest on this host + +metric = "disk.dev"; + +disk_group_busy = + ( + avg_inst ( $metric.total :$myhost $mydisks ) > 10 count/sec || + some_inst ( + $metric.total :$myhost $mydisks > 50 count/sec && + $metric.write :$myhost $mydisks > + 3 * $metric.read :$myhost $mydisks + ) + ) + -> alarm "Busy disks: $mydisks (on host: $myhost)"; diff --git a/src/pmie/examples/disk.20 b/src/pmie/examples/disk.20 new file mode 100644 index 0000000..ec086e5 --- /dev/null +++ b/src/pmie/examples/disk.20 @@ -0,0 +1,13 @@ +// +// Assume the / and /usr file systems are on different partitions +// of the same disk (dks0d1 in the example below). +// Add an entry to the file $PCP_LOG_DIR/NOTICES when this disk is +// busy and either of the file systems is more than 90% full. 
+// +// Suggestion from: Steve Daniels (steve@houdini.denver.sgi.com) + +delta = 60; + +( filesys.full #'/dev/root' > 90 || filesys.full #'/dev/usr' > 90 ) +&& disk.dev.total #'dks0d1' > 40 count/sec + -> shell 15min "/usr/pcp/bin/pmpost 'dks0d1 busy when / or /usr nearly full'"; diff --git a/src/pmie/examples/disk.head b/src/pmie/examples/disk.head new file mode 100644 index 0000000..20f133e --- /dev/null +++ b/src/pmie/examples/disk.head @@ -0,0 +1,11 @@ +// +// Some Common Performance Monitoring Scenarios +// +// The Disk Group +// + +delta = 15 sec; // often enough for disks? + +// common prefixes +// +disk = "disk"; diff --git a/src/pmie/examples/environ.00 b/src/pmie/examples/environ.00 new file mode 100644 index 0000000..85e5597 --- /dev/null +++ b/src/pmie/examples/environ.00 @@ -0,0 +1,18 @@ +// +// Absolute temperature ceiling. +// +// Rules donated by Kevin Wang at Silicon Graphics +// + +some_host ( environ.temp $HOSTS > 33 ) +-> print 10 min "absolute temperature alarm! " "%h: %v degrees "; + +// +// Watch the machine room temperature. If it rises more than 2 degrees +// every $delta, danger! +// This is different from the absolute rule above ... this one +// gives early warning of sustained temperature increases. 
+// +some_host ( + environ.temp $HOSTS @0 - environ.temp $HOSTS @1 > 2 +) -> print "temperature rise alarm: " "%h: %v degree rise in $DELTA_STR "; diff --git a/src/pmie/examples/environ.head b/src/pmie/examples/environ.head new file mode 100644 index 0000000..78936b9 --- /dev/null +++ b/src/pmie/examples/environ.head @@ -0,0 +1,17 @@ +// +// Some Common Performance Monitoring Scenarios +// +// The Environ Group +// +// Note: need environ PMDA installed on a Challenge L or XL for +// required metrics to be available + +// replace with your hosts +HOSTS = ":localhost :foo"; + +// replace this with your e-mail address +MINDER = "root@localhost"; + +// 1 minute rulesets in this group +delta = 1 min; // numbers are diff than strings +DELTA_STR = "1 minute"; // strings are diff than numbers diff --git a/src/pmie/examples/filesys.00 b/src/pmie/examples/filesys.00 new file mode 100644 index 0000000..937e123 --- /dev/null +++ b/src/pmie/examples/filesys.00 @@ -0,0 +1,14 @@ +// +// Either the /tmp or the /usr filesystem being +// more than 95% full +// + +delta = 5 mins; // often enough for file system fullness? 
+ +tmp_full = + $fsys.free #'/dev/root' / $fsys.capacity #'/dev/root' < 0.05 + -> syslog "/dev/root filesystem (almost) full"; + +usr_full = + $fsys.free #'/dev/usr' / $fsys.capacity #'/dev/usr' < 0.05 + -> syslog "/dev/usr filesystem (almost) full"; diff --git a/src/pmie/examples/filesys.10 b/src/pmie/examples/filesys.10 new file mode 100644 index 0000000..2cf9488 --- /dev/null +++ b/src/pmie/examples/filesys.10 @@ -0,0 +1,14 @@ +// +// Some read activity through the buffer cache and the cache read +// hit ratio is less than 80% +// (lots of file system reads causing physical I/O) +// + +delta = 1 min; // check every minute + +blkio = "kernel.all.io"; +poor_read_hits = + (($blkio.lread - $blkio.bread) / $blkio.lread) < 0.8 && $blkio.lread > 100 + -> alarm 20 min "poor buffer cache read hit ratio (%v)"; + // Note: %v in alarm string is bound to the left most + // expression in the predicate diff --git a/src/pmie/examples/filesys.20 b/src/pmie/examples/filesys.20 new file mode 100644 index 0000000..856d8ed --- /dev/null +++ b/src/pmie/examples/filesys.20 @@ -0,0 +1,14 @@ +// +// at least $threshold full and at the current rate of growth will fill +// the file system in less than $lead_time +// ie. 
used + $lead_time * growth-rate > capacity + +delta = 1 min; // check every minute +threshold = 40; // must be at least this full now (percentage) +lead_time = "15min"; // lead time before the filesystem will be full + +some_inst ( + 100 * filesys.used / filesys.capacity > $threshold && + filesys.used + $lead_time * ( rate filesys.used ) > + filesys.capacity +) -> print "filesystem will be full within $lead_time:" " %i"; diff --git a/src/pmie/examples/filesys.head b/src/pmie/examples/filesys.head new file mode 100644 index 0000000..de4c703 --- /dev/null +++ b/src/pmie/examples/filesys.head @@ -0,0 +1,10 @@ +// +// Some Common Performance Monitoring Scenarios +// +// The File System Group +// + +// common prefixes +// +fsys = "filesys"; + diff --git a/src/pmie/examples/network.00 b/src/pmie/examples/network.00 new file mode 100644 index 0000000..44c2183 --- /dev/null +++ b/src/pmie/examples/network.00 @@ -0,0 +1,10 @@ +// +// Report when some interface has seen more than 15 errors per second +// on at least 3 of the last 4 observations +// +// Rule donated by Kevin Wang at Silicon Graphics +// + +some_host some_inst 75%_sample ( + network.interface.total.errors $HOSTS @0..3 > 15 +) -> print 5 min "high network interface errors" "%h[%i] %v errors/sec "; diff --git a/src/pmie/examples/network.head b/src/pmie/examples/network.head new file mode 100644 index 0000000..909e773 --- /dev/null +++ b/src/pmie/examples/network.head @@ -0,0 +1,15 @@ +// +// Some Common Performance Monitoring Scenarios +// +// The Network Group +// + +// replace with your hosts +HOSTS = ":localhost :foo"; + +// replace this with your e-mail address +MINDER = "root@localhost"; + +// 10 second rulesets in this group +delta = 10 sec; // numbers are diff than strings +DELTA_STR = "10 seconds"; // strings are diff than numbers diff --git a/src/pmie/examples/ras.00 b/src/pmie/examples/ras.00 new file mode 100644 index 0000000..930b964 --- /dev/null +++ b/src/pmie/examples/ras.00 @@ -0,0 +1,11 @@ +// 
+// For Origin systems, sequence number errors are not indicative of +// a problem, but persistent checkbit and/or retry errors may indicate +// a CrayLink interconnect problem. +// +some_inst ( all_sample ( + hw.router.perport.cb_errors @0..2 > 0 || + hw.router.perport.retry_errors @0..2 > 0 +) ) + -> alarm 30mins "CrayLink Checkbit and/or Retry errors: " "%i "; + diff --git a/src/pmie/examples/ras.head b/src/pmie/examples/ras.head new file mode 100644 index 0000000..d571913 --- /dev/null +++ b/src/pmie/examples/ras.head @@ -0,0 +1,5 @@ +// +// Some System Reliability, Availability and Serviceability (RAS) Checks +// + +delta = 20 sec; diff --git a/src/pmie/examples/swap.00 b/src/pmie/examples/swap.00 new file mode 100644 index 0000000..2e3f309 --- /dev/null +++ b/src/pmie/examples/swap.00 @@ -0,0 +1,23 @@ +// +// report when swap > 50-75% full and when swap > 75% full +// +// Rules donated by Kevin Wang at Silicon Graphics +// +// note: the sort hack '9999999' to keep the header first; later +// removed by sed +// note: -o option to ps(1) requires IRIX 6.2 or later ... 
for IRIX 5.3 +// this would have to be re-written using ps -el + +SWAP="swap"; +some_host ( + ($SWAP.free $HOSTS / $SWAP.length $HOSTS) * 100 < 50 && + ($SWAP.free $HOSTS / $SWAP.length $HOSTS) * 100 >= 25 +) -> print 10 min "swap more than half-full: " "%h: %v% free " & + shell 10 min "rsh -n guest@%h /sbin/ps -eo 'ruser=UID,pid=PID,ppid=PPID,pcpu=%CPU,sz=9999999SZ,rss=RSS,stime=STIME,time=TIME,args=CMD' | sort +4 -nr | sed -e 's/9999999SZ / SZ:/' | /usr/sbin/Mail -s '%h swap more than half-full (%v% free)' $MINDER &"; + +some_host ( + ($SWAP.free $HOSTS / $SWAP.length $HOSTS) * 100 < 25 +) -> print 10 min "swap almost full: " "%h: %v% free " & + shell 10 min "rsh -n guest@%h /sbin/ps -eo 'ruser=UID,pid=PID,ppid=PPID,pcpu=%CPU,sz=9999999SZ,rss=RSS,stime=STIME,time=TIME,args=CMD' | sort +4 -nr | sed -e 's/9999999SZ / SZ:/' | /usr/sbin/Mail -s '%h swap almost full (%v% free)' $MINDER &"; + + diff --git a/src/pmie/examples/swap.head b/src/pmie/examples/swap.head new file mode 100644 index 0000000..254cf4a --- /dev/null +++ b/src/pmie/examples/swap.head @@ -0,0 +1,13 @@ +// +// The Swap Group +// + +// replace with your hosts +HOSTS = ":localhost :foo"; + +// replace this with your e-mail address +MINDER = "root@localhost"; + +// 20 second rulesets in this group +delta = 20 sec; // numbers are diff than strings +DELTA_STR = "20 seconds"; // strings are diff than numbers diff --git a/src/pmie/examples/uag.00 b/src/pmie/examples/uag.00 new file mode 100644 index 0000000..6cb9638 --- /dev/null +++ b/src/pmie/examples/uag.00 @@ -0,0 +1,5 @@ +// +// a simple expression, with multiple values +// +iops = disk.dev.total; + diff --git a/src/pmie/examples/uag.01 b/src/pmie/examples/uag.01 new file mode 100644 index 0000000..7e6d691 --- /dev/null +++ b/src/pmie/examples/uag.01 @@ -0,0 +1,4 @@ +// +// total disk write percentage +// +wrt_pct = (disk.all.write / disk.all.total) * 100; diff --git a/src/pmie/examples/uag.02 b/src/pmie/examples/uag.02 new file mode 100644 index 
0000000..6127209 --- /dev/null +++ b/src/pmie/examples/uag.02 @@ -0,0 +1,8 @@ +// +// some varied expressions +// +pct_wrt = (disk.all.write / disk.all.total) * 100; +busy_wrt = disk.dev.total > 10 && + disk.dev.write > disk.dev.read; +busy = some_inst disk.dev.total > 60 -> print "[%i] high disk i/o "; + diff --git a/src/pmie/examples/uag.03 b/src/pmie/examples/uag.03 new file mode 100644 index 0000000..01e4d7c --- /dev/null +++ b/src/pmie/examples/uag.03 @@ -0,0 +1,7 @@ +// +// simple use of a macro +// +disk = "disk.all"; +pct_wrt = ($disk.write / $disk.total) * 100; + + diff --git a/src/pmie/examples/uag.04 b/src/pmie/examples/uag.04 new file mode 100644 index 0000000..e13a5ef --- /dev/null +++ b/src/pmie/examples/uag.04 @@ -0,0 +1,52 @@ +// +// perverse example to show all possible choices of units for numeric +// constants +// +mem.freemem > 1 byte; +mem.freemem > 1 Kbyte; +mem.freemem > 1 Mbyte; +mem.freemem > 1 Gbyte; +mem.freemem > 1 Tbyte; + +disk.dev.blktotal > 1 Mbyte / nsec; +disk.dev.blktotal > 1 Mbyte / nanosecond; +disk.dev.blktotal > 1 Mbyte / usec; +disk.dev.blktotal > 1 Mbyte / microsecond; +disk.dev.blktotal > 1 Mbyte / msec; +disk.dev.blktotal > 1 Mbyte / millisecond; +disk.dev.blktotal > 1 Mbyte / sec; +disk.dev.blktotal > 1 Mbyte / second; +disk.dev.blktotal > 1 Mbyte / min; +disk.dev.blktotal > 1 Mbyte / minute; +disk.dev.blktotal > 1 Mbyte / hour; + +hinv.ncpu > 1 count; +hinv.ncpu > 1 Kcount; +hinv.ncpu > 1 Mcount; +hinv.ncpu > 1 Gcount; +hinv.ncpu > 1 Tcount; + +mem.freemem > 1 bytes; +mem.freemem > 1 Kbytes; +mem.freemem > 1 Mbytes; +mem.freemem > 1 Gbytes; +mem.freemem > 1 Tbytes; + +disk.dev.blktotal > 1 Mbyte / nsecs; +disk.dev.blktotal > 1 Mbyte / nanoseconds; +disk.dev.blktotal > 1 Mbyte / usecs; +disk.dev.blktotal > 1 Mbyte / microseconds; +disk.dev.blktotal > 1 Mbyte / msecs; +disk.dev.blktotal > 1 Mbyte / milliseconds; +disk.dev.blktotal > 1 Mbyte / secs; +disk.dev.blktotal > 1 Mbyte / seconds; +disk.dev.blktotal > 1 Mbyte / mins; 
+disk.dev.blktotal > 1 Mbyte / minutes; +disk.dev.blktotal > 1 Mbyte / hours; + +hinv.ncpu > 1 counts; +hinv.ncpu > 1 Kcounts; +hinv.ncpu > 1 Mcounts; +hinv.ncpu > 1 Gcounts; +hinv.ncpu > 1 Tcounts; + diff --git a/src/pmie/examples/uag.10 b/src/pmie/examples/uag.10 new file mode 100644 index 0000000..7135f6b --- /dev/null +++ b/src/pmie/examples/uag.10 @@ -0,0 +1,32 @@ +// +// metric expressions + +// all instances +// +enet = network.interface.total.packets; + +// restricted instance (loopback interface only) +// +enet_r = network.interface.total.packets #lo0; + +// restricted instances with weird instance names ... +// note instance names are "identifiers" in the grammar, so single +// quotes required for tricky characters, like /, spaces, etc, _not_ +// double quotes +// +root_n_usr = filesys.free #'/dev/root' #'/dev/usr'; + +// multiple hosts +// +num_cpu = hinv.ncpu :babylon.engr.sgi.com :gonzo :sandpit; + +// multiple sample times +// +mem_trend = mem.freemem @0..3; + +// multi-dimension variations +// + +// missing instance for non-singular instance domain, plus multiple hosts +// +net_view = network.interface.total.packets :gonzo :moomba; diff --git a/src/pmie/examples/uag.11 b/src/pmie/examples/uag.11 new file mode 100644 index 0000000..7d4b86b --- /dev/null +++ b/src/pmie/examples/uag.11 @@ -0,0 +1,7 @@ +// +// relational (logical) expressions +// +hosts = ":gonzo"; +intfs = "#ec0 #ec2"; +all_intf = network.interface.in.packets + $hosts $intfs @0..2 > 300 count/sec; diff --git a/src/pmie/examples/uag.12 b/src/pmie/examples/uag.12 new file mode 100644 index 0000000..3850168 --- /dev/null +++ b/src/pmie/examples/uag.12 @@ -0,0 +1,12 @@ +// +// quantification examples +// + +// some_instance +all_intf = network.interface.in.packets + #ec0 #ec2 @0..2 > 300 count/sec; +any_sample = some_sample + network.interface.in.packets + #ec0 #ec2 @0..2 > 300 count/sec; + + diff --git a/src/pmie/examples/uag.13 b/src/pmie/examples/uag.13 new file mode 100644 index 
0000000..9990407 --- /dev/null +++ b/src/pmie/examples/uag.13 @@ -0,0 +1,33 @@ +// +// nested quantification +// + +Servers = ":moomba :gonzo"; // change as desired + +// read and write rate per disk per host +// +rd = disk.dev.read $Servers; +wr = disk.dev.write $Servers; + +// one value per host, true if 20% or more of the disks are doing +// significant reading or writing +// +rd_20 = 20%_inst disk.dev.read $Servers > 40; +wr_20 = 20%_inst disk.dev.write $Servers > 40; + +// single truth value: more than 20% of the disks busy reading or writing +// on all hosts? +// +summary = all_host ( + 20%_inst disk.dev.read $Servers > 40 || + 20%_inst disk.dev.write $Servers > 40 + ); + +// alternate form +// +summary2 = all_host ( + 20%_inst ( + disk.dev.read $Servers > 40 || + disk.dev.write $Servers > 40 + ) + ); diff --git a/src/pmie/examples/uag.20 b/src/pmie/examples/uag.20 new file mode 100644 index 0000000..475017c --- /dev/null +++ b/src/pmie/examples/uag.20 @@ -0,0 +1,7 @@ +// +// a rule expression with multiple actions and %-binding in the +// arguments for the action methods +// +some_inst ( disk.dev.total > 60 ) + -> syslog 10 mins "[%i] busy, %v IOPS " & + shell 1 hour "echo 'Disk %i is REALLY busy. Running at %v I/Os per second' | Mail -s 'pmie alarm' sysadm"; diff --git a/src/pmie/examples/uag.21 b/src/pmie/examples/uag.21 new file mode 100644 index 0000000..c841ff2 --- /dev/null +++ b/src/pmie/examples/uag.21 @@ -0,0 +1,9 @@ +// +// a rule expression with multiple actions and %-binding in the +// arguments for the action methods ... 
use some creative string +// composition for the final message +// +some_inst ( disk.dev.total > 50 ) + -> syslog 10 mins "Busy disks: " "%i @ %v IOPS " & + shell 1 hour "echo 'REALLY busy disks: " "%i @ %v I/Os per second " "' | Mail -s 'pmie alarm' sysadm"; + diff --git a/src/pmie/examples/uag.30 b/src/pmie/examples/uag.30 new file mode 100644 index 0000000..b15a781 --- /dev/null +++ b/src/pmie/examples/uag.30 @@ -0,0 +1,17 @@ +// +// intrinsic operators +// + +m = mem.freemem; +rate_m = rate mem.freemem; + +// At least 2 CPUs doing some reasonable amount of work +// +poke = ":moomba :'mac-larry' :bitbucket"; // note '' to escape - in host name +u = kernel.percpu.cpu.user $poke; +s = kernel.percpu.cpu.sys $poke; +some_host ( + count_inst ( kernel.percpu.cpu.user $poke + + kernel.percpu.cpu.sys $poke > 0.7 ) >= 2 + ) + -> alarm "2 or more busy CPUs"; diff --git a/src/pmie/examples/uag.head b/src/pmie/examples/uag.head new file mode 100644 index 0000000..9ba4f40 --- /dev/null +++ b/src/pmie/examples/uag.head @@ -0,0 +1,3 @@ +// +// Examples from the Performance Co-Pilot User's and Administrator's Guide +// diff --git a/src/pmie/examples/upm.00 b/src/pmie/examples/upm.00 new file mode 100644 index 0000000..1b9e04c --- /dev/null +++ b/src/pmie/examples/upm.00 @@ -0,0 +1,6 @@ +// +// If the total context switch rate exceeds 10000 per second per CPU +// then display an alarm notifier +// +kernel.all.pswitch / hinv.ncpu > 10000 count/sec + -> alarm "high context switch rate %v"; diff --git a/src/pmie/examples/upm.01 b/src/pmie/examples/upm.01 new file mode 100644 index 0000000..d83bd79 --- /dev/null +++ b/src/pmie/examples/upm.01 @@ -0,0 +1,4 @@ +all_sample ( + kernel.all.pswitch @0..9 > 10 Kcount/sec * hinv.ncpu +) -> shell 5 min "xwsh -e 'top'"; + diff --git a/src/pmie/examples/upm.02 b/src/pmie/examples/upm.02 new file mode 100644 index 0000000..eaa0c42 --- /dev/null +++ b/src/pmie/examples/upm.02 @@ -0,0 +1,19 @@ +delta = 5 sec; // force evaluation once every 5 
seconds from here on + +// If for any disk, for all 4 samples (20 seconds), the disk is performing +// more than 40 I/Os per second, then print a message to standard output and +// then launch dkvis(1) +// +some_inst all_sample + disk.dev.total @0..3 > 40 count/sec + -> print "disks busy for 20 sec:" " %i" & + shell 5 min "dkvis"; + +// If any disk is performing more than 60 I/Os per second, then +// print a message identifying the busy disk to standard output and +// launch dkvis(1) +some_inst ( + disk.dev.total > 60 count/sec +) -> print "busy disks:" " %i" & + shell 5 min "dkvis"; + diff --git a/src/pmie/examples/upm.03 b/src/pmie/examples/upm.03 new file mode 100644 index 0000000..ff7501e --- /dev/null +++ b/src/pmie/examples/upm.03 @@ -0,0 +1,8 @@ +// +// Refine the preceding rule to apply only between the hours of 9am and 5pm, +// and to require that just 3 of the four samples exceed the threshold +// +$hour >= 9 && $hour <= 17 && some_inst 75 %_sample + disk.dev.total @0..3 > 40 count/sec + -> print "disk busy for 20 sec" & + shell 5 min "dkvis"; diff --git a/src/pmie/examples/upm.04 b/src/pmie/examples/upm.04 new file mode 100644 index 0000000..36c9699 --- /dev/null +++ b/src/pmie/examples/upm.04 @@ -0,0 +1,10 @@ +// +// Refine the preceding rule further to print the host name and disk name +// for which the threshold is exceeded +// +$hour >= 9 && $hour <= 17 && +some_inst ( + 75 %_sample ( + disk.dev.total @0..3 > 40 count/sec + ) +) -> print "disks busy for 20 sec:" " [%h]%i"; diff --git a/src/pmie/examples/upm.05 b/src/pmie/examples/upm.05 new file mode 100644 index 0000000..702747e --- /dev/null +++ b/src/pmie/examples/upm.05 @@ -0,0 +1,9 @@ +// +// Macro for use ... 
+// +bc = "buffer_cache"; +// Using the above macro; If the buffer cache is in use (more than 50 read +// requests) with hit ratio less than 90%, then popup an alarm +// +$bc.getblks > 50 && $bc.getfound / $bc.getblks < 0.9 + -> alarm "poor buffer cache hit rate"; diff --git a/src/pmie/examples/upm.06 b/src/pmie/examples/upm.06 new file mode 100644 index 0000000..dbf9da9 --- /dev/null +++ b/src/pmie/examples/upm.06 @@ -0,0 +1,10 @@ +delta = 10 mins; // force evaluation once every 10 minutes from here on + +// If either the / or the /usr filesystem is more than 95% full, display +// an alarm popup, but not if it has already been displayed during the last +// 24 hours +// +filesys.free #'/dev/root' / filesys.capacity #'/dev/root' < 0.05 + -> alarm 24 hour "root filesystem (almost) full"; +filesys.free #'/dev/usr' / filesys.capacity #'/dev/usr' < 0.05 + -> alarm 24 hour "/usr filesystem (almost) full"; diff --git a/src/pmie/examples/upm.07 b/src/pmie/examples/upm.07 new file mode 100644 index 0000000..f325ee6 --- /dev/null +++ b/src/pmie/examples/upm.07 @@ -0,0 +1,8 @@ +// +// The following rule requires a machine that supports the PCP environment +// metrics. If the machine environment temperature rises more than 2 +// degrees over a 10 minute interval, raise an alarm and write a syslog entry +// +environ.temp @0 - environ.temp @1 > 2 + -> alarm "temperature rising fast" & + syslog "machine room temperature rise alarm"; diff --git a/src/pmie/examples/upm.08 b/src/pmie/examples/upm.08 new file mode 100644 index 0000000..8ab0e49 --- /dev/null +++ b/src/pmie/examples/upm.08 @@ -0,0 +1,13 @@ +// +// Something interesting if you have performance problems with +// your Oracle data base ... 
+// +db = "oracle.ptg1"; +host = ":moomba.melbourne.sgi.com"; +lru = "#'cache buffers lru chain'"; +gets = "$db.latch.gets $host $lru"; +total = "$db.latch.gets $host $lru + $db.latch.misses $host $lru + + $db.latch.immisses $host $lru"; + +$total > 100 && $gets / $total < 0.2 + -> alarm "high LRU latch contention"; diff --git a/src/pmie/examples/upm.09 b/src/pmie/examples/upm.09 new file mode 100644 index 0000000..e2f12cf --- /dev/null +++ b/src/pmie/examples/upm.09 @@ -0,0 +1,11 @@ +// Busy disk? + +delta = 20 sec; // force evaluation once every 20 seconds from here on + +// If any disk is performing more than 60 I/Os per second, then +// print a message to standard output and launch dkvis(1) +// +some_inst + disk.dev.total > 60 count/sec + -> print "disk busy for 20 sec" "%v IOPS %i@%h" & + shell 5 min "dkvis"; diff --git a/src/pmie/examples/upm.10 b/src/pmie/examples/upm.10 new file mode 100644 index 0000000..ce40c1a --- /dev/null +++ b/src/pmie/examples/upm.10 @@ -0,0 +1,11 @@ +delta = 1 minute; +ruleset + kernel.all.load #'1 minute' > 10 * hinv.ncpu -> + print "extreme load average %v" +else kernel.all.load #'1 minute' > 2 * hinv.ncpu -> + print "moderate load average %v" +unknown -> + print "load average unavailable" +otherwise -> + print "load average OK" +; diff --git a/src/pmie/examples/upm.head b/src/pmie/examples/upm.head new file mode 100644 index 0000000..3a286e9 --- /dev/null +++ b/src/pmie/examples/upm.head @@ -0,0 +1,5 @@ +// +// Examples from the pmie(1) man page +// + +delta = 1 sec; // force evaluation once per second diff --git a/src/pmie/examples/webreport.00 b/src/pmie/examples/webreport.00 new file mode 100644 index 0000000..70e04d8 --- /dev/null +++ b/src/pmie/examples/webreport.00 @@ -0,0 +1,8 @@ +// +// Request rate throughput (requests per second) summaries +// + +// you may replace the metric below with any of the other +// web.allservers.requests metrics + +request_rate = web.allservers.requests.total; diff --git 
a/src/pmie/examples/webreport.01 b/src/pmie/examples/webreport.01 new file mode 100644 index 0000000..cab9272 --- /dev/null +++ b/src/pmie/examples/webreport.01 @@ -0,0 +1,8 @@ +// +// Data throughput (Kbytes per minute) summaries +// + +// you may replace the metric below with any of the other +// web.allservers.bytes metrics + +data_rate = web.allservers.bytes.total * 60 / 1024; diff --git a/src/pmie/examples/webreport.head b/src/pmie/examples/webreport.head new file mode 100644 index 0000000..91deb82 --- /dev/null +++ b/src/pmie/examples/webreport.head @@ -0,0 +1,11 @@ +// +// Some Common Performance Monitoring Scenarios +// +// The WEBREPORT Group +// +// Intended to be used with archive logs to produce summaries, e.g. +// pmie -v -A 1hour -S @10:00am -T @2:00pm -a somearchive WEBREPORT +// produces hourly summaries for the hours 10am to 2pm +// + +delta = 1 hour; // change to suit, else use -t from the command line |