Listing 1: The "Sitter" script

  1:#! /bin/bash
  2:#
  3:# sitter:
  4:# @(#) - "babysit" the network and daemons, complain to console. jf 3/1996
  5:# 
  6:# revision history:
  7:# 3/96 - first version, basic functions.
  8:# 4/96 - add logging to LOGFILE when events occur, added date 
  9:# 10/96 - FAKEPING flag added, SERVICES cleanup.
 10:#
 11:# 1.. Check IP address on localnet -- is network / network card up?
 12:# 2. Check IP address on intranets -- is routing / ISDN up?
 13:# 3. Check IP address on Internet -- is the Internet router up?
 14:# 4. Check DNS on all DNS boxes -- has one of the named's died?
 15:# 5. Check Services (s) -- has our machine locked up?
 16:# 6. The End.  P.S. This is "network management," but it does not cost $20000
 17:
 18:### configuration:
 19:OKFILE=/tmp/$$OK
 20:LOG="/usr/adm/sitter"
 21:TIMEOUT=3
 22:FAKEPING="domain"  # if unset, use real ping!
 23:# Test:
 24:# LOCALNET="167.195.160.5 167.195.160.5"
 25:LOCALNET="167.195.160.6 167.195.160.128"
 26:# Test: REMOTENET="167.195.166.9 167.195.166.9"
 27:REMOTENET="167.195.166.1 167.195.166.2"
 28:# Test: INET="198.41.0.9 198.41.0.9"
 29:INET="128.9.0.107 198.41.0.4"
 30:# Test: DNS="wpo moria lorien"
 31:##
 32:DNS="rivendell moria"
 33:# Test: SERVICES="lorien:http wpo:http lorien:domain"
 34:SERVICES="wpo:smtp lorien:http"
 35:
 36:## initialization
 37:
 38:stty onlcr 0<&1 
 39:
 40:typeset -i COUNT
 41:
 42:pingok() {
 43:## REAL ping:
 44:## if I see [space]0% it means zero percent packet loss... need the space
 45:## to differentiate between 100%, 80%, and so forth...
 46:## exit silently if everything is ok, return output if not.
 47:## FAKE ping:
 48:## I'll use the FAKEPING TCP socket to throw out a connection to something
 49:## that SHOULD be up.
 50:
 51:[ -z "$FAKEPING" ] && {
 52:  ping $1 -c1 | grep -q " 0\%" && return 0
 53:  echo "$1"
 54:  return 1 ## oh no, not 0% packet loss, so there is a problem!
 55:} || ## If I don't do a real ping, do a FAKE ping:
 56:{
 57:## Since telnet has too large a timeout for me, I spawn it
 58:## and kill it when I'm tired of waiting for it, assuming it failed.
 59:
 60:  >$OKFILE
 61:  ## I have to use a file (OKFILE) to tell the rest of the script when
 62:  ## the telnet connects OK, because in order to spurn the timeout,
 63:  ## it's necessary to spawn, which means no backwards communication.
 64:  echo -e "TEST J close" | telnet -e J $1 $FAKEPING >/dev/null 2>&1 && 
 65:    echo "OK">$OKFILE & 
 66:  sleep $TIMEOUT
 67:  [ -s $OKFILE ] && return 0 # OK exists!
 68:  ## Telnet can be annoying and NOT DIE when its parent dies (under Linux,
 69:  ## anyway), so kill it explicitly
 70:  ps -l | awk '/'$!'/ && /telnet/ {print $3}' | xargs kill >/dev/null 2>&1
 71:
 72:  echo $1 
 73:  return 1  ## bad news!
 74:
 75:} ## end of fake ping
 76:} ## end of pingok()
 77:
 78:## main program:
 79:## check LOCALNET
 80:## 1. Check IP address on localnet -- is network / network card up?
 81:
 82:## begin output subshell
 83:PROBLEM=`
 84:LSTAT=""
 85:COUNT=0
 86:for i in $LOCALNET ; do LSTAT=$LSTAT"$(pingok $i)" 
 87:  let $(( COUNT = COUNT + $? ))
 88:done
 89:
 90:## if LSTAT is non-empty, then we have a problem.  Handle depending upon # of
 91:## failures.  
 92:[ -n "$LSTAT" ] && { 
 93:  [ $COUNT -gt 1 ] && echo "Can NOT PING more than one station on the network, check physical network." ||
 94:    echo "I can PING one host on the physical network, but not the other ($LSTAT). Please check:"
 95:    awk '/'$LSTAT'/ {print $2}' < /etc/hosts
 96:    }
 97:
 98:## 2. Check IP address on intranets -- is routing / ISDN up?
 99:COUNT=0
100:LSTAT=""
101:## an example of temporarily turning off FAKEPING by enclosing whole thing
102:## in parens:
103:( FAKEPING=""
104:for i in $REMOTENET ; do LSTAT=$LSTAT"$(pingok $i)"
105:  let $(( COUNT = COUNT + $? ))
106:done
107:
108:[ -n "$LSTAT" ] &&  {
109:  [ $COUNT -gt 1 ] && echo "Can not PING more than one station on the remote network, PPP is probably down." ||
110:    echo "I can PING one host on the physical network, but not the other (' $LSTAT'). Please check:"
111:    awk '/'$LSTAT'/ {print $2}' < /etc/hosts
112:  }
113:) ## back to our previous environment. (FAKEPING)
114:
115:## 3. Check IP address on Internet -- is the Internet router up?
116:
117:COUNT=0
118:LSTAT=""
119:for i in $INET ; do LSTAT=$LSTAT"$(pingok $i)"
120:  let $(( COUNT = COUNT + $? ))
121:done
122:
123:[ -n "$LSTAT" ] && 
124:  [ $COUNT -gt 1 ] && echo "Can not PING more than one root server on the Internet, check DOAS router/network." 
125:## we don't care if only one of the root servers on the inet is down.  
126:
127:## 4. Check DNS on all DNS boxes -- has one of the named's died?
128:
129:for i in $DNS ; do
130:  nslookup localhost $i >/dev/null 2>&1
131:  [ $? -ne 0 ] && echo "Problem with nameserver on $i -- please restart it!"
132:done
133:
134:## 5. Check Services (s) -- has our machine locked up?
135:## Any of these that are down are worthy of notice!  This will be "cleaner" 
136:## once I rewrite this in perl, in my copious spare time ;-P
137:(
138:  for i in $SERVICES; do
139:    set -- $(IFS=: ; echo $i)   ### split entry into fields
140:    FAKEPING=$2
141:    pingok $1 || echo "Can't reach $1 with the $2 service, please check!"
142:  done
143:)
144:
145:## end output subshell:
146:` 
147:
148:[ -n "$PROBLEM" ] && echo -e "\rSitter: $(date '+%x %T') $PROBLEM" | tee -a $LOG
149:
150:## cleanup
151:rm $OKFILE

