Cover V06, I03
Article
Figure 1
Listing 1

mar97.tar


The "Sitter" script

#! /bin/bash
#
# sitter:
# @(#) - "babysit" the network and daemons, complain to console. jf 3/1996
#
# revision history:
# 3/96 - first version, basic functions.
# 4/96 - add logging to LOGFILE when events occur, added date
# 10/96 - FAKEPING flag added, SERVICES cleanup.
#
# 1. Check IP address on localnet -- is network / network card up?
# 2. Check IP address on intranets -- is routing / ISDN up?
# 3. Check IP address on Internet -- is the Internet router up?
# 4. Check DNS on all DNS boxes -- has one of the named's died?
# 5. Check Services (s) -- has our machine locked up?
# 6. The End.  P.S. This is "network management," but it does not cost $20000

### configuration:
## Scratch file through which a backgrounded telnet probe reports success.
## Created with mktemp so the name cannot be predicted by other users of
## /tmp; removed again on every exit path by the trap below.
OKFILE=$(mktemp "${TMPDIR:-/tmp}/sitter.XXXXXX") || {
  echo "sitter: cannot create scratch file" >&2
  exit 1
}
trap 'rm -f -- "$OKFILE"' EXIT
LOG="/usr/adm/sitter"   # problem reports are appended here
TIMEOUT=3               # whole seconds to wait for a telnet probe
FAKEPING="domain"  # if unset, use real ping!
# Test:
# LOCALNET="167.195.160.5 167.195.160.5"
LOCALNET="167.195.160.6 167.195.160.128"
# Test: REMOTENET="167.195.166.9 167.195.166.9"
REMOTENET="167.195.166.1 167.195.166.2"
# Test: INET="198.41.0.9 198.41.0.9"
INET="128.9.0.107 198.41.0.4"
# Test: DNS="wpo moria lorien"

DNS="rivendell moria"
# Test: SERVICES="lorien:http wpo:http lorien:domain"
SERVICES="wpo:smtp lorien:http"   # host:service pairs

## initialization

## Turn on NL -> CR-NL translation for the terminal on stdout.  stty works
## on its stdin, hence the 0<&1.  Skipped when stdout is not a terminal
## (pipe, file), where stty would only print an error.
if [ -t 1 ]; then
  stty onlcr 0<&1
fi

typeset -i COUNT   # failure counter, integer-only
41:
#######################################
# pingok HOST -- check whether HOST is reachable.
#
# REAL ping (FAKEPING empty):
#   Send one echo request and look for zero percent packet loss in the
#   summary.  The leading whitespace in the pattern keeps "100%", "80%"
#   and so forth from matching; "0.0%" is accepted as well as "0%".
# FAKE ping (FAKEPING set):
#   Use telnet to throw a TCP connection at the FAKEPING service on
#   something that SHOULD be up.  Telnet's own timeout is far too long,
#   so it is run in the background and killed after TIMEOUT seconds.
#
# Globals:   FAKEPING (read), TIMEOUT (read, whole seconds),
#            OKFILE (path read, file overwritten)
# Arguments: $1 - host name or address to probe
# Outputs:   nothing if the host answers, the host on stdout if it does not
# Returns:   0 if reachable, 1 otherwise
#######################################
pingok() {
  local host=$1
  local job kids waited

  if [[ -z "$FAKEPING" ]]; then
    # Options go before the host so that BSD-style ping accepts them too.
    if ping -c 1 "$host" | grep -Eq '[[:space:]]0(\.0+)?%'; then
      return 0
    fi
    echo "$host"
    return 1 ## oh no, not 0% packet loss, so there is a problem!
  fi

  ## A background job cannot hand a value back to us, so it leaves "OK"
  ## in OKFILE when telnet connected and exited cleanly.  Start empty.
  : >"$OKFILE"
  {
    echo -e "TEST J close" | telnet -e J "$host" "$FAKEPING" &&
      echo "OK" >"$OKFILE"
  } >/dev/null 2>&1 &
  job=$!

  ## Poll once a second instead of always sleeping the full TIMEOUT, so
  ## a host that answers at once does not cost TIMEOUT seconds.
  waited=0
  while (( waited < TIMEOUT )); do
    [[ -s "$OKFILE" ]] && return 0
    sleep 1
    waited=$(( waited + 1 ))
  done
  [[ -s "$OKFILE" ]] && return 0 # OK exists!

  ## Telnet can be annoying and NOT DIE when its parent dies (under
  ## Linux, anyway), so kill it explicitly.  Children are looked up by
  ## exact parent PID before the parent is killed, and the job is reaped
  ## so that a late "OK" can never land in a later probe's OKFILE.
  kids=$(ps -A -o pid= -o ppid= -o comm= 2>/dev/null |
    awk -v parent="$job" '$2 == parent && $3 ~ /telnet/ { print $1 }')
  # shellcheck disable=SC2086 -- kids is a whitespace-separated PID list
  kill "$job" $kids >/dev/null 2>&1
  wait "$job" 2>/dev/null

  echo "$host"
  return 1  ## bad news!
} ## end of pingok()
77:
## main program:
## Run checks 1 through 5, gather everything they print into PROBLEM,
## and report it once (terminal and LOG) if anything was printed.

#######################################
# Probe each address with pingok.
# Arguments: $@ - addresses to probe
# Globals:   COUNT (written) - number of probes that returned non-zero
#            LSTAT (written) - space-separated output of the failed probes
#######################################
probe_group() {
  local addr reply
  COUNT=0
  LSTAT=""
  for addr in "$@"; do
    reply=$(pingok "$addr") || COUNT=$(( COUNT + 1 ))
    if [[ -n "$reply" ]]; then
      LSTAT="${LSTAT:+$LSTAT }$reply"
    fi
  done
  return 0
}

#######################################
# Print the second field of every /etc/hosts line whose first field is
# exactly the given address.  The address is passed with -v rather than
# spliced into the awk program, so its dots are not regex wildcards.
# Arguments: $1 - address to look up
#######################################
name_in_hosts() {
  awk -v ip="$1" '$1 == ip { print $2 }' /etc/hosts
}

#######################################
# Run all checks; every line printed is one finding for the operator.
# Globals: LOCALNET REMOTENET INET DNS SERVICES (read),
#          COUNT LSTAT (written via probe_group)
#######################################
run_checks() {
  local server entry host service

  ## 1. Check IP address on localnet -- is network / network card up?
  # shellcheck disable=SC2086 -- the address lists are space-separated
  probe_group $LOCALNET
  ## if LSTAT is non-empty, then we have a problem.  Handle
  ## depending upon # of failures.
  if [[ -n "$LSTAT" ]]; then
    if (( COUNT > 1 )); then
      echo "Can NOT PING more than one station on the network, check physical network."
    else
      echo "I can PING one host on the physical network, but not the other ($LSTAT). Please check:"
      name_in_hosts "$LSTAT"
    fi
  fi

  ## 2. Check IP address on intranets -- is routing / ISDN up?
  ## an example of temporarily turning off FAKEPING by enclosing
  ## whole thing in parens:
  (
    FAKEPING=""
    # shellcheck disable=SC2086
    probe_group $REMOTENET
    if [[ -n "$LSTAT" ]]; then
      if (( COUNT > 1 )); then
        echo "Can not PING more than one station on the remote network, PPP is probably down."
      else
        echo "I can PING one host on the remote network, but not the other ($LSTAT). Please check:"
        name_in_hosts "$LSTAT"
      fi
    fi
  ) ## back to our previous environment. (FAKEPING)

  ## 3. Check IP address on Internet -- is the Internet router up?
  # shellcheck disable=SC2086
  probe_group $INET
  ## we do not care if only one of the root servers on the inet is down.
  if [[ -n "$LSTAT" ]] && (( COUNT > 1 )); then
    echo "Can not PING more than one root server on the Internet, check DOAS router/network."
  fi

  ## 4. Check DNS on all DNS boxes -- has one of the nameservers died?
  for server in $DNS; do
    if ! nslookup localhost "$server" >/dev/null 2>&1; then
      echo "Problem with nameserver on $server -- please restart it!"
    fi
  done

  ## 5. Check Services (s) -- has our machine locked up?
  ## Any of these that are down are worthy of notice!
  ## Runs in a subshell so the per-entry FAKEPING does not leak out.
  (
    for entry in $SERVICES; do
      ## split host:service into its two fields
      IFS=: read -r host service _ <<<"$entry"
      FAKEPING=$service
      ## pingok prints the bare host on failure; drop that here, the
      ## message below already names it.
      pingok "$host" >/dev/null ||
        echo "Can't reach $host with the $service service, please check!"
    done
  )

  return 0
}

PROBLEM=$(run_checks)

if [[ -n "$PROBLEM" ]]; then
  printf '\rSitter: %s %s\n' "$(date '+%x %T')" "$PROBLEM" | tee -a "$LOG"
fi

## cleanup
rm -f -- "$OKFILE"
## End of File