Listing 2: check_alive.ksh--Connection Monitor script
#!/bin/ksh
#=====================================================
# "@(#)check_alive.ksh"
#
# Check if a particular machine is alive by pinging it.
# If the machine does not respond we try once more.
# Log an error if the system does not respond. If the
# system responds, log an okay message.
# The ping command can be set to a particular command
# depending on the OS.
# "newping" is used for UNIX machines and is the
# default.
#
# Author :
# Ravindra Nemlekar
#
# Parameters : (Optional)
# Systems which are to be monitored.
# If no parameters, then problem specific monitor
# or the generic monitor list is used.
#
# Calling mechanism :
# Can be run from cron at an interval equal to
# the reporting period. For example,
# if you want servers to be checked every
# 15 minutes, then the cron entry should be
# 0,15,30,45 * * * * /usr/local/bin/check_alive.ksh
# Can also be called from other scripts to check
# if the server is alive before executing any
# command on it.
#
# Return value :
# 0 - Host is alive and rsh'able
# 1 - Cannot connect to host
# 2 - Host connected but not responding
# 3 - Connection refused by host
# 4 - Network problem
# 5 - Host unreachable
# 254 - Host alive but not rsh'able
# (i.e. non-UNIX host).
# 255 - Unknown Error
#====================================================
#----------------------------------------------------
# Static configuration. These names are kept as-is
# because the config files sourced later may refer
# to them.
#----------------------------------------------------
BINDIR=/usr/local/admin/sysmon
BASEDIR=/usr/local/admin/sysmon
CONDITION_NAME=Condition.NO_CONNECT
HOST_DIR=$BASEDIR/hosts
PATH=/usr/local/admin/bin:$PATH # required for newping
# Name of the temporary host list; only set when
# hosts are given on the command line.
tmpfile=
# Wait time (in seconds) between 2 tries
SLEEP_TIME=3
# Avoid permission problems
umask 0

#----------------------------------------------------
# save_cli_hosts host...
# Write every argument on its own line of a newly
# created temporary file and print the file's name
# on stdout. The file is created with mktemp, so its
# name is not predictable and it is not left
# world-writable by the "umask 0" above.
# Returns non-zero if the file cannot be created or
# written.
#----------------------------------------------------
function save_cli_hosts {
  typeset list
  list=$(mktemp "${TMPDIR:-/tmp}/hostlist.XXXXXX") || return 1
  # One host per line: a "#" in one argument must not
  # make the comment filter drop the other hosts.
  printf '%s\n' "$@" > "$list" || { rm -f "$list"; return 1; }
  printf '%s\n' "$list"
}

# Pick up the condition specific data
if [ -d "$BASEDIR/config.$CONDITION_NAME" ] ; then
  CONFIGDIR=$BASEDIR/config.$CONDITION_NAME
else
  CONFIGDIR=$BASEDIR
fi
# File contains list of hosts monitored
HOST_LIST=$CONFIGDIR/monlist
# Arguments, if present, are the hosts which are
# to be checked; they replace the monitor list.
if [ $# -ne 0 ] ; then
  tmpfile=$(save_cli_hosts "$@") || {
    echo "check_alive.ksh: cannot create temporary host list" >&2
    exit 255
  }
  # Remove the temporary list on every exit path,
  # including termination by a signal.
  trap 'rm -f "$tmpfile"' EXIT
  trap 'exit 255' HUP INT TERM
  HOST_LIST=$tmpfile
fi
#=================================================
# Create a msg directory for each server if it
# doesn't already exist.

#-------------------------------------------------
# ensure_msg_dir dir
# Create directory "dir" with mode 777 when it is
# missing; otherwise just touch it so that its
# modification time is refreshed.
#-------------------------------------------------
function ensure_msg_dir {
  if [ ! -d "$1" ] ; then
    # -p also creates missing parent directories,
    # e.g. when $HOST_DIR itself does not exist yet.
    mkdir -p "$1" && chmod 777 "$1"
  else
    touch "$1"
  fi
}

# Lines containing "#" are treated as comments. The
# remaining text is deliberately word-split so that
# several hosts may share one line of the list.
for host in $(grep -v '#' "$HOST_LIST")
do
  ensure_msg_dir "$HOST_DIR/$host"
done
# We ping each host. If the host does not respond,
# we wait for some time and then try again. If it
# fails, we log an error. If the host is responding,
# log an okay if there was previously a problem.

#-------------------------------------------------
# ping_status_msg status
# Print the log text for a non-zero exit status of
# the ping command. Any status without a dedicated
# text (255 included) is reported as UNKNOWN ERROR,
# so the log never carries an empty message or one
# left over from a previously checked host.
#-------------------------------------------------
function ping_status_msg {
  case "$1" in
    1) printf '%s\n' "CAN'T CONNECT" ;;
    2) printf '%s\n' "CONNECTED but NO RESPONSE" ;;
    3) printf '%s\n' "CONNECTION REFUSED" ;;
    4) printf '%s\n' "NETWORK PROBLEM" ;;
    5) printf '%s\n' "HOST UNREACHABLE" ;;
    *) printf '%s\n' "UNKNOWN ERROR" ;;
  esac
}

for host in $(grep -v '#' "$HOST_LIST")
do
  # Process the host only if it is present in
  # the master list; a non-zero exit status from
  # the checker means "ignore this host".
  "$BASEDIR/check_in_master_list.ksh" "$host" \
    "$BASEDIR/MASTER_LIST" || continue

  # Forget the ping command chosen for the previous
  # host so the default below applies again unless
  # a config file sets it.
  unset PING

  # Config files are sourced from the most general
  # to the most specific, so later files override
  # parameters set by earlier ones.
  if [ -f "$BASEDIR/config.generic" ] ; then
    # read the global file first
    . "$BASEDIR/config.generic"
  fi
  if [ -f "$BASEDIR/config.$CONDITION_NAME/config.generic" ] ; then
    # read the condition specific global file
    . "$BASEDIR/config.$CONDITION_NAME/config.generic"
  fi
  if [ -f "$BASEDIR/hosts/$host/config.$CONDITION_NAME" ] ; then
    # read the condition specific file for
    # the problem on that host.
    . "$BASEDIR/hosts/$host/config.$CONDITION_NAME"
  fi

  TIME_NOW=$(date "+%h-%d %H:%M")

  # Default ping is the rpc based ping.
  PING=${PING:-newping}
  # NOTE(review): $PING is left unquoted as in the
  # original, presumably so a config file may supply
  # a command together with options - confirm.
  $PING "$host"
  status=$?
  if [ "$status" -ne 0 ] ; then
    # Second and last attempt after a short pause.
    sleep "$SLEEP_TIME"
    $PING "$host"
    status=$?
  fi

  EXIT_VALUE=$status
  if [ "$status" -eq 0 ] ; then
    # Host is up. Log the host as OK if a previous
    # error file exists.
    if [ -f "$HOST_DIR/$host/$CONDITION_NAME" ] ; then
      echo "$TIME_NOW: $host now reachable" > \
        "$HOST_DIR/$host/End.$CONDITION_NAME"
    fi
    # Host alive but cannot be rsh'ed
    if [ "$PING" != "newping" ] ; then
      EXIT_VALUE=254
    fi
  else
    MSG=$(ping_status_msg "$status")
    echo "$TIME_NOW: $host: $MSG" > \
      "$HOST_DIR/$host/$CONDITION_NAME"
  fi
done # End of Host Check
# Hosts were passed from the command line, so the
# host list is a temporary file that must be removed.
if [ $# -ne 0 ] ; then
  /bin/rm -f "$tmpfile"
fi
# Return the exit value of the last server. This
# is [more] meaningful when the check is done on
# one server only.
# EXIT_VALUE is unset when no host was checked at
# all (empty list, or every host skipped); report 0
# explicitly in that case instead of letting the
# status of the preceding command leak through.
exit "${EXIT_VALUE:-0}"
# End of File
|