Cover V04, I05
Article
Figure 1
Figure 2
Figure 3
Figure 4
Figure 5
Figure 6
Listing 1

sep95.tar


Listing 1: procmon source code

#!/usr/local/bin/perl
#
#
# This program requires the PERL ctime(PERL) library for generating date
# strings in the standard ctime format.
#
require "ctime.pl";
#
#
# This program requires the syslog(PERL) library for successful delivery
# of logging information to syslog on the host machine.  What syslog does
# with the information is up to syslog and how the administrator
# has it configured.
#
require "syslog.pl";

#
# Look to see if the configuration file for the process monitor is
# present in /etc.  If so, load it; it is pulled in with require, so it
# is ordinary Perl code that is expected to set $delay_between (seconds
# to sleep between scans) and $ConfigDir (directory holding procmon.cmd).
#
# $loaded_config records where the settings came from so that it can be
# reported in the startup log messages.
#
if ( -e "/etc/procmon.cfg" )
{
    print STDOUT "Found /etc/procmon.cfg ... loading ...\n";
    require "/etc/procmon.cfg";
    $loaded_config = "/etc/procmon.cfg";
}
else
{
    print STDOUT "no config file... using defaults ...\n";
    $loaded_config = "<DEFAULTS>";
}
#
# Apply the built-in defaults to anything that is still unset.  This covers
# the "no config file" case and also a config file that only sets some of
# the values: an undefined delay would otherwise turn the main loop into
# a busy loop, and an undefined directory would yield "/procmon.cmd".
#
$delay_between = 300 unless defined($delay_between);        # 5 minute delay
$ConfigDir = "/usr/local/bin" unless defined($ConfigDir);

#
# Fixed name of this program; it is handed to openlog() as the syslog
# identity.  DO NOT CHANGE THIS.
#
$program = 'procmon';
#
# Full path of the process list and command file, which lives in the
# configured directory.
#
$command_file = join( '/', $ConfigDir, 'procmon.cmd' );

#
# Establish the signal dispositions.
#
# Nearly every signal is ignored so that the monitor keeps running no
# matter what is sent to it.  The exceptions are:
#
#   TERM       - routed to sig_handler so that a shutdown is recorded in
#                syslog before the monitor exits.
#   CLD / CHLD - left at their default.  If child-death signals are
#                ignored, the system reaps children automatically and
#                system() / close() on a pipe can no longer collect the
#                child's status, so the restart return code logged by the
#                main loop would be meaningless.
#   KILL       - cannot be caught or ignored anyway.
#   POLL       - left at its default, as before.
#
# NOTE(review): ignoring ILL, BUS, SEGV and FPE is kept as in the original;
# if such a fault is raised by the hardware the outcome is platform
# dependent - confirm this is really wanted.
#
$SIG{'HUP'} = "IGNORE";         # signal value 1
$SIG{'INT'} = "IGNORE";         # signal value 2
$SIG{'QUIT'} = "IGNORE";        # signal value 3
$SIG{'ILL'} = "IGNORE";         # signal value 4
$SIG{'TRAP'} = "IGNORE";        # signal value 5
$SIG{'IOT'} = "IGNORE";         # signal value 6
$SIG{'ABRT'} = "IGNORE";        # signal value 6, yes this is right!
$SIG{'EMT'} = "IGNORE";         # signal value 7
$SIG{'FPE'} = "IGNORE";         # signal value 8
$SIG{'KILL'} = "DEFAULT";       # signal value 9, can't be caught anyway
$SIG{'BUS'} = "IGNORE";         # signal value 10
$SIG{'SEGV'} = "IGNORE";        # signal value 11
$SIG{'SYS'} = "IGNORE";         # signal value 12
$SIG{"PIPE"} = "IGNORE";        # signal value 13
$SIG{'ALRM'} = "IGNORE";        # signal value 14
$SIG{'TERM'} = "sig_handler";   # signal value 15, log the shutdown and exit
$SIG{'USR1'} = "IGNORE";        # signal value 16
$SIG{'USR2'} = "IGNORE";        # signal value 17
$SIG{'CLD'} = "DEFAULT";        # signal value 18, needed to reap children
$SIG{'CHLD'} = "DEFAULT";       # signal value 18, yes this is right too!
$SIG{'PWR'} = "IGNORE";         # signal value 19
$SIG{'WINCH'} = "IGNORE";       # signal value 20
$SIG{'PHONE'} = "IGNORE";       # signal value 21, AT&T UNIX/PC only!
$SIG{'POLL'} = "DEFAULT";       # signal value 22
$SIG{'STOP'} = "IGNORE";        # signal value 23
$SIG{'TSTP'} = "IGNORE";        # signal value 24
$SIG{'CONT'} = "IGNORE";        # signal value 25
$SIG{'TTIN'} = "IGNORE";        # signal value 26
$SIG{'TTOU'} = "IGNORE";        # signal value 27
$SIG{'VTALRM'} = "IGNORE";      # signal value 28
$SIG{'PROF'} = "IGNORE";        # signal value 29

#
# Detach Standard Input, Standard Output and Standard Error from the
# terminal by pointing them at /dev/null.
#
# They are reopened rather than just closed: if file descriptors 0-2 are
# left free, the next files this program opens (the syslog connection, the
# command file, the ps pipe) are given those numbers, and every command
# started with system() inherits them as its stdin/stdout/stderr.
#
# open() implicitly closes the handle first, so if /dev/null cannot be
# opened the handle simply ends up closed.
#
open( STDIN,  "< /dev/null" );
open( STDOUT, "> /dev/null" );
open( STDERR, "> /dev/null" );

#
# Connect to syslog under this program's name (options "ndelay,pid",
# facility "user") and record the startup of the monitor together with
# the settings in effect, one message per item at priority "info".
#
&openlog( $program, 'ndelay,pid', 'user' );
foreach $startup_note ( "Process Monitor started",
                        "Loaded config file $loaded_config",
                        "Command File: $command_file",
                        "Loop Delay = $delay_between" )
{
    &syslog( 'info', $startup_note );
}
#
# Open the list of processes to be monitored on the LIST filehandle.
#
# Both failure modes are fatal (exit status 2) and are reported to syslog
# at priority "crit": the file being absent, and the file being present
# but impossible to open (for example, no read permission).  Without the
# second check the monitor would carry on with an empty process list.
#
if ( ! -e $command_file )
{
    &syslog( "crit", "CAN'T LOAD COMMAND FILE : %s: does not exist",
             $command_file );
    exit(2);
}
if ( ! open( LIST, "< $command_file" ) )
{
    # "$!" is copied into a string here so that the reason reported is the
    # one from open(), not from anything syslog does internally.
    &syslog( "crit", "CAN'T LOAD COMMAND FILE : %s: %s",
             $command_file, "$!" );
    exit(2);
}
#
# Read the command file from LIST, one monitored process per line.
#
# Each entry has the name of the command as it would appear in a ps -e
# listing, and the command used to start it should it not be running.
# An exclamation point is used between the two fields in the file.
# The start command may be omitted, in which case the process is only
# reported, never restarted.
#
# This fills in:
#   @process_list    names of the processes being monitored, in file order
#   %start_commands  start command, keyed by process name
#   %last_failure    time the process was last found dead ("NEVER" so far)
#   %last_start      time the process was last started ("UNKNOWN" so far)
#
while (<LIST>)
{
    #
    # Remove only the line terminator.  A plain chop would eat the last
    # real character of a final line that has no trailing newline.
    #
    s/\n$//;
    ( $process_name, $start_process ) = split( /!/, $_ );
    #
    # Skip blank lines and entries without a process name; an empty name
    # cannot be looked up in the ps listing.
    #
    next unless defined($process_name) && $process_name ne "";
    &syslog( "info", "Adding %s to stored process list", $process_name );
    push( @process_list, $process_name );
    $start_commands{$process_name} = $start_process;
    $last_failure{$process_name} = "NEVER";
    $last_start{$process_name} = "UNKNOWN";
}
#
# The file is read only once, so release the handle.
#
close( LIST );
$num_processes = @process_list;
&syslog( "info", "Monitoring : %d processes", $num_processes );

#
# Main monitoring loop - runs until the monitor itself is terminated.
#
# On every pass each name in @process_list is looked up in the output of
# "ps -e".  A process that is found is logged at "info".  A process that
# is missing is logged at "crit" together with its previous failure time,
# and, if a start command is known for it, that command is run and its
# result is logged.  The loop then sleeps for $delay_between seconds.
#
while (1)
{
    EACH_PROCESS:
    foreach $process_name (@process_list)
    {
        #
        # This program was originally written for AT&T System V UNIX
        # and derivatives.  (Someday I will port it to BSD versions!)
        #
        if ( ! open( PS, "ps -e | grep $process_name |" ) )
        {
            #
            # Without a process listing nothing can be said about this
            # process, so it is skipped for this pass.  Falling through
            # would report it as dead and start a second copy.
            # ("warning" is the syslog priority name; a bareword warn
            # would call Perl's built-in warn() instead.)
            #
            &syslog( "warning", "can't create PS pipe : %s", "$!" );
            next EACH_PROCESS;
        }
        while (<PS>)
        {
            #
            # The ps output is padded with a varying number of blanks.
            # Splitting on ' ' skips leading blanks and squeezes runs of
            # them, so the fields stay in place even when the PID fills
            # its whole column and the line has no leading blank.
            #
            ( $_pid, $_tty, $_time, $_name ) = split( ' ', $_ );
            if ( defined($_name) && $_name ne "" )
            {
                #
                # A complete ps entry matched, so the process is taken to
                # be running.  Record that and move on to the next one.
                #
                &syslog( "info", "%s running as PID %s",
                         $process_name, $_pid );
                close(PS);
                next EACH_PROCESS;
            }
        }
        close(PS);
        #
        # The process is not running, so record an entry in syslog.
        #
        &syslog( "crit", "%s is NOT running", $process_name );
        #
        # When did the process last fail?  Saving this allows the
        # system administrator to keep tabs on the failure rate of
        # the process.
        #
        &syslog( "crit", "Last Failure of %s, \@ %s",
                 $process_name, $last_failure{$process_name} );
        #
        # Set the last failure to the current time, in ctime(3C) format
        # without its trailing newline.
        #
        chop( $current_time = &ctime(time) );
        $last_failure{$process_name} = $current_time;
        #
        # If we have a command to execute to restart the service,
        # execute the command.
        #
        if ( defined( $start_commands{$process_name} ) )
        {
            $start_command = $start_commands{$process_name};
            #
            # The command text is passed as an argument, not as part of
            # the format, so a '%' in it cannot corrupt the log message.
            #
            &syslog( "crit", "issuing %s to system", $start_command );
            $retcode = system($start_command);
            if ( $retcode == -1 )
            {
                # system() could not run the command or collect its status
                &syslog( "crit", "%s could not be run : %s",
                         $start_command, "$!" );
            }
            else
            {
                #
                # The value from system() is in the standard wait format:
                # the exit status is in the high byte and the number of a
                # terminating signal, if any, in the low seven bits.
                #
                &syslog( "info", "%s returns %d (exit status %d, signal %d)",
                         $start_command, $retcode,
                         $retcode >> 8, $retcode & 127 );
            }
            #
            # Remember when the start was last attempted.
            #
            chop( $current_time = &ctime(time) );
            $last_start{$process_name} = $current_time;
        }
    }
    #
    # From here we have processed each of the commands in the monitoring
    # list.  We will now pause for station identification .....
    #
    sleep($delay_between);
}

#
# sig_handler - record a terminating signal in syslog and exit.
#
# Argument: the name of the signal without its "SIG" prefix, which is what
#           Perl passes to a handler installed through %SIG.
# Returns:  never; the program exits with status 0.
#
sub sig_handler
{
    local ($sig) = @_;
    #
    # Reopen the log with the extra "cons" option for this last message.
    #
    &closelog();
    &openlog( $program, "ndelay,cons,pid", "user" );
    #
    # The message is kept on a single line; a literal that is broken across
    # source lines would put a newline in the middle of the log entry.
    #
    &syslog( "crit", "PROCESS MONITOR: SIGNAL CAUGHT SIG%s - TERMINATING",
             $sig );
    &closelog();
    exit(0);
}