Cover V04, I06
Article
Figure 1
Figure 2
Figure 3
Listing 1
Listing 10
Listing 2
Listing 3
Listing 4
Listing 5
Listing 6
Listing 7
Listing 8
Listing 9

nov95.tar


Listing 2: check_alive.ksh--Connection Monitor script

#!/bin/ksh
#=====================================================
# "@(#)check_alive.ksh"
#
# Check if a particular machine is alive by pinging it.
# If the machine does not respond we try once more.
# Log an error if the system does not respond. If the
# system responds, log an okay message.
# The ping command can be set to a particular command
# depending on the OS.
# "newping" is used for UNIX machines and is the
# default.
#
# Author :
# 	Ravindra Nemlekar
#
# Parameters : (Optional)
#    Systems which are to be monitored.
#    If no parameters are given, the monitor list from the
#    condition-specific configuration directory is used if
#    that directory exists, otherwise the generic monitor list.
#
# Calling mechanism :
#   Can be called from cron after every unit of
#   time equal to the reporting time. For example
#   if you want servers to be checked every
#   15 minutes, then the cron entry should be
# 0,15,30,45 * * * * /usr/local/bin/check_alive.ksh
#   Can also be called from other scripts to check
#   if the server is alive before executing any
#   command on it.
#
# Return value :
#  0 - Host is alive and rsh'able
#  1 - Cannot connect to host
#  2 - Host connected but not responding
#  3 - Connection refused by host
#  4 - Network problem
#  5 - Host unreachable
# 254  Host alive but not rsh'able
#		  (i.e. non-UNIX host).
# 255  Unknown Error
#====================================================

# Installation layout of the monitoring system.
BINDIR=/usr/local/admin/sysmon
BASEDIR=/usr/local/admin/sysmon
CONDITION_NAME=Condition.NO_CONNECT
HOST_DIR=$BASEDIR/hosts
PATH=/usr/local/admin/bin:$PATH	# required for newping
# Path of the temporary host list. Stays empty unless
# hosts are given on the command line (see
# build_host_list below).
tmpfile=
# Wait time (in seconds) between 2 tries
SLEEP_TIME=3
# Avoid permission problems
umask 0

# Pick up the condition specific data
if [ -d "$BASEDIR/config.$CONDITION_NAME" ] ; then
    CONFIGDIR=$BASEDIR/config.$CONDITION_NAME
else
    CONFIGDIR=$BASEDIR
fi

# Select the list of hosts to be checked.
#   Arguments : hosts to check (optional)
#   Sets      : HOST_LIST - file holding the host names
#               tmpfile   - temporary list, only when
#                           arguments were given
# With no arguments the monitor list in CONFIGDIR is
# used. Otherwise the arguments are written, one per
# line, to a temporary file. The file is created with
# mktemp rather than a predictable /tmp name because
# the umask is 0 here: a fixed name would be world
# writable and could be pre-created or symlinked by
# another user.
build_host_list() {
    # File contains list of hosts monitored
    HOST_LIST=$CONFIGDIR/monlist
    if [ $# -eq 0 ] ; then
        return 0
    fi
    tmpfile=$(mktemp /tmp/hostlist.XXXXXX) || {
        echo "check_alive.ksh: cannot create temporary host list" >&2
        exit 255
    }
    # One host per line, so that a '#' in one name
    # cannot filter out the other hosts.
    printf '%s\n' "$@" > "$tmpfile"
    HOST_LIST=$tmpfile
}

# Arguments, if present, are the hosts which are
# to be checked
build_host_list "$@"


#=================================================
# Make sure every listed server has a msg directory.
# An existing directory only gets its timestamp
# refreshed; a missing one is created and opened up
# to everybody.
for host in $(cat $HOST_LIST | grep -v \#) ; do
    if test -d $HOST_DIR/$host ; then
        touch $HOST_DIR/$host
        continue
    fi
    mkdir $HOST_DIR/$host
    chmod 777 $HOST_DIR/$host
done

# Translate a ping exit status into the text that is
# logged for the host.
#   $1 - exit status returned by the ping command
# Prints nothing for status 0 (host is up). Any status
# without a specific meaning is reported as
# "UNKNOWN ERROR", so an unexpected status can never
# reuse the message left over from a previous host.
status_message() {
    case "$1" in
    0)  ;;
    1)  echo "CAN'T CONNECT" ;;
    2)  echo "CONNECTED but NO RESPONSE" ;;
    3)  echo "CONNECTION REFUSED" ;;
    4)  echo "NETWORK PROBLEM" ;;
    5)  echo "HOST UNREACHABLE" ;;
    *)  echo "UNKNOWN ERROR" ;;
    esac
}

# We ping each host. If the host does not respond,
# we wait for some time and then try again. If it
# fails, we log an error. If the host is responding,
# log an okay if there was previously a problem.
# The host names are split on whitespace on purpose:
# the list may hold several names on one line.
for host in $(grep -v '#' "$HOST_LIST")
do
    # Process the host only if it is present in
    # the master list; otherwise ignore this host.
    "$BASEDIR/check_in_master_list.ksh" "$host" \
        "$BASEDIR/MASTER_LIST" || continue

    # Start every host from a clean PING setting so a
    # value set for one host cannot leak into the next.
    unset PING
    # Read the global config file first, then the
    # condition specific global file, then the host's
    # own file for this condition. Later files override
    # earlier ones. Every parameter has a default, so
    # the script still works without any config file.
    if [ -f "$BASEDIR/config.generic" ] ; then
        . "$BASEDIR/config.generic"
    fi
    if [ -f "$BASEDIR/config.$CONDITION_NAME/config.generic" ] ; then
        . "$BASEDIR/config.$CONDITION_NAME/config.generic"
    fi
    if [ -f "$BASEDIR/hosts/$host/config.$CONDITION_NAME" ] ; then
        . "$BASEDIR/hosts/$host/config.$CONDITION_NAME"
    fi

    TIME_NOW=$(date "+%h-%d %H:%M")
    # Default ping is the rpc based ping.
    PING=${PING:-newping}
    # $PING is left unquoted so that a configured
    # command may carry its own options.
    $PING "$host"
    status=$?
    if [ "$status" -ne 0 ] ; then
        # No answer: wait and give the host one more try.
        sleep "$SLEEP_TIME"
        $PING "$host"
        status=$?
    fi

    MSG=$(status_message "$status")
    EXIT_VALUE=$status
    if [ "$status" -ne 0 ] ; then
        echo "$TIME_NOW: $host: $MSG" > \
            "$HOST_DIR/$host/$CONDITION_NAME"
    else
        # Host is up. Log the host as OK if
        # previous error existed
        if [ -f "$HOST_DIR/$host/$CONDITION_NAME" ] ; then
            echo "$TIME_NOW: $host now reachable" > \
                "$HOST_DIR/$host/End.$CONDITION_NAME"
        fi
        # Host alive but cannot be rsh'ed
        if [ "$PING" != "newping" ] ; then
            EXIT_VALUE=254
        fi
    fi
done      # End of Host Check

# The temporary host list is only in use when hosts
# were named on the command line; remove it then.
[ $# -eq 0 ] || /bin/rm -f  $tmpfile

# The status handed back is that of the last server
# processed, which is [more] meaningful when only one
# server was checked.
exit $EXIT_VALUE
# End of File