Cover V03, I02
Article
Listing 1
Listing 2
Listing 3
Sidebar 1

mar94.tar


Listing 2: The probe script

#!/opt/bin/perl
#
# Listing #2
#
#                  health_probe
#
$ver = '(03jan94)';
$debug = ''; # Null for production program.
#
# SunOS 4.1.x, Solaris 2.x version, Perl 4.036
# John Lees, Department of Computer Science,
# Michigan State University, A714 Wells Hall,
# East Lansing MI 48824-1027, lees@cps.msu.edu
#
# Available by anonymous FTP from:
#   ftp.cps.msu.edu:/pub/prip/lees/sysadmin/
#
# Probe hosts under a netgroup for interesting things:
#  - Check that xntpd is running and time is good
#  - Disk usage on common filesystems
#  - Load and number of users
#  - Check that important deamons are running
#  - Uptime, in days
#  - Ypwhich
#
$help = 'Usage: probe [-h] [-m] [-n NETGROUP] [-s]
-h  Help.
-m  Mail only, nothing to stdout.
-n  Netgroup to use as top of tree. Default is "cps".
-s  Stdout only, do not mail reports to lab managers.
';
#------------------------------------------------------
# There is a main program and three subprograms:
#
#   do_poll(host)
#      Interrogate a host.
#
#   do_print("s" or "b", string to print)
#      Where "s" means to stdout only, "b" means both
#      stdout and mail to the manager of the current
#      netgroup/lab.
#
#   getngrp(netgroup)
#      Build array of host groups.
#------------------------------------------------------

# Things go crazy with two pipes if stdout is piped.
select((select(STDOUT), $| = 1)[0]);

#
### Global variables.
#
require 'getopts.pl';
do Getopts('hmn:s');

chop($date = `date`);
chop($hostname = `uname -n`);
$current = '';                  # Used by do_print.
@getngrp_stack = ();            # Used by getngrp.
%getngrp_hosts = ();            # Used by getngrp.
# Skip these. Leading and trailing blanks needed.
$getngrp_skipgroup = ' dumb_pc ';
$getngrp_skiphost = ' bogus.cps.msu.edu ';

$lab = '';                      # Current lab.
#          1         2         3         4         5
#012345678901234567890123456789012345678901234567890
$status_head =
'Xntpd    / /usr /var /home Load Usr Daemon  Up Ypwhich'
;
# Returns for newping. Newping was published in Volume
# 2 Number 4 of Sys Admin magazine. As published it was
# not usable. We use a modified version. Original
# version from ftp.uu.net:/published/sysadmin/1993/.
# Our version for Solaris is available from:
# ftp.cps.msu.edu:/pub/prip/lees/sysadmin/
@pmsg = ('okay',
'connection timed out (dead machine)?',
'response timed out (hung or overloaded machine)?',
'connection refused (permissions)?',
'network or hardware problem?',
'host unreachable?',
'confused?');

# ===== BEGIN MAIN PROGRAM =====
&do_print('s',
"Health Probe $ver from $hostname\n$date\n");

if ($opt_h) {   # Help.
print "\n$help";
exit;
}

# Build the list of hosts. See the getngrp routine for
# a description of how our netgroups are organized.
if ($opt_n) {
do getngrp($opt_n);
}
else {
do getngrp('cps');
}

# Foreach subgroup, foreach host.
foreach $key (sort(keys %getngrp_hosts)) {
split(/\s/, $key);

# The highest level subgroup is an NIS domain. We
# will send mail to manager@$lab later.
$lab = $_[1];
$subgroup = join(' ', @_[1 .. $#_]);
&do_print('b', sprintf("\n%s\n%16.16s  %s\n",
$subgroup, ' ', $status_head));

# Go through the hosts for this lab.
foreach $host (split(/\s/, $getngrp_hosts{$key})) {
&do_poll($host);
}
}

exit 0;
# ===== END MAIN PROGRAM =====

#------------------------------------------------------
# do_poll | Subroutine to check things on a host.
#----------
sub do_poll {
local($host) = $_[0];
local(@ntp, @offset);
# We do substrs into this:
$status_line =
'                                               ';
@rsh = ();

print $host "\n";

# First make sure the beastie is alive. Give it 4
# seconds.
if (system("/opt/bin/newping $host 4"
." >/dev/null 2>&1")) {
# Nope, but try again just in case.
if ($nprc = system("/opt/bin/newping $host 3"
." >/dev/null 2>&1")) {
$nprc = $nprc / 256;
$prc = $nprc;
if ($nprc < 0 || $nprc > 5) {
$prc = 6;
}
# Print message corresponding to error from
# newping.
&do_print('b',
sprintf("%16.16s:  NEWPING(%x): %s\n",
$host, $nprc, $pmsg[$prc]));
}
}
else {
# The machine seems to be alive.
# Try to talk to the xntp daemon and analyze
# what is returned. cps_probe_ntp is the ntp
# program from the version 2 xntp package.
@ntp = split(/\s+/,
`/opt/bin/cps_probe_ntp $host 2>&1`);
@offset = split(/:/, $ntp[2]);
if ( $offset[0] ne 'offset') {
if ($ntp[2] eq 'refused') {
substr($status_line, 0, 5) = 'DEAD!';
}
else {
substr($status_line, 0, 5) = 'HUH??';
}
}
else {
# We got a proper report back from the daemon.
$offset =
substr($ntp[2], index($ntp[2], ':')+1);
# Absolute value of offset.
$offset = $offset < 0 ? 0 - $offset : $offset;
# Make offset line up if it is small enough.
if ($offset < 10) {
substr($status_line, 0, 5) =
sprintf("%5.3f", $offset);
}
else {
substr($status_line, 0, 5) =
substr($offset, 0, 5);
}
}
#
# Check a bunch of stuff. Another way to do this
# would be to have a "probe_client" which is run
# on each system, tailored to the architecture
# and OS, perhaps. But then there would be two
# (or more) programs to maintain.
$dfroot = ''; $dfusr = ''; $dfhome = '';
$dfvar = ''; $ostype = ''; $users = '';
$load = ''; $ypwhich = ''; $uptime = '0';

# This mess with fork is an attempt to keep the
# probe from getting stuck on hung machines. It
# is usually successful.
$redo_count = 0;
FORK: {
if ($pid = fork) {
# Parent. Give the child ample time to write
# the /tmp file.
for ($i = 0; $i < 15; $i++) {
# Sleep for one second.
select(undef, undef, undef, 1.0);
if (-s "/tmp/probe-$$") {
last;
}
}
# Sleep for one-half second.
select(undef, undef, undef, 0.5);
# If the child still has not written to the
# file, assume it is hung and try to kill it.
if (-z "/tmp/probe-$$") {
kill(1, $pid);
select(undef, undef, undef, 0.5);
kill(9, $pid);
}

} elsif (defined $pid) {
# We're the child. Write to a /tmp file.
$parent = getppid;
open(TMP, ">/tmp/probe-$parent");
# ostype is a local shell script.
$cmd = "'ostype;"
." df / /usr /var /home; uptime; ypwhich'";
print(TMP
`/usr/ucb/rsh -n $host $cmd 2>&1`);
close(TMP);
exit;
} elsif ($! =~ /No more process/) {
# EAGAIN
if ($redo_count < 5) {
sleep 5;
redo FORK;
} else {
die "Out of processes at host"
." \"$host\"\n";
}
} else {
# Fork error.
die "Argh, cannot fork for host"
." \"$host\": $!\n";
}
} # FORK:

# Back together again. There should be something
# in the /tmp file. The "wait" is necessary to
# reap the child.
wait;
if (open(TMP, "</tmp/probe-$$")) {
@rsh = <TMP>;
close(TMP);
unlink("/tmp/probe-$$");
}
else {
# Nothing from the child process.
$rsh[0] = 'NO RESPONSE!';
}

# It is not safe to make assumptions about how
# many lines are returned by the rsh. The df
# will return additional lines if the mountpoint
# names are long.
chop($ostype = $rsh[0]);
foreach $line (@rsh) {
@line = split(/\s+/, $line);
$last = $line[$#line];
if ($last eq '/') {
$dfroot = $line[$#line - 1];
next;
}
if ($last eq '/usr') {
$dfusr = $line[$#line - 1];
next;
}
if ($last eq '/var') {
$dfvar = $line[$#line - 1];
next;
}
if ($last eq '/home') {
$dfhome = $line[$#line - 1];
next;
}
# Users and load. The number of tokens in the
# uptime line depends on how long the system
# has been up.
if ($line =~ /load average/) {
foreach $token (@line) {
if (substr($token, 0, 3) eq 'day') {
# $users is the previous token.
$uptime = $users;
}
if (substr($token, 0, 4) eq 'user') {
last;
}
$users = $token;
}
$load = $last;
next;
}
} # foreach rsh

# Ypwhich.
$ypwhich = $rsh[$#rsh];
$ypwhich =~ s/^([^\.]+).*\n$/$1/;

# Fill in the status line if we got good stuff.
if ($#rsh > 1) {
substr($status_line,  6, 4) =
sprintf("%4.4s", $dfroot);
substr($status_line, 11, 4) =
sprintf("%4.4s", $dfusr);
substr($status_line, 16, 4) =
sprintf("%4.4s", $dfvar);
substr($status_line, 21, 4) =
sprintf("%4.4s", $dfhome);
substr($status_line, 26, 5) =
sprintf("%5.1f", $load);
substr($status_line, 32, 3) =
sprintf("%3.3s", $users);
substr($status_line, 43, 3) =
sprintf("%3.3s", $uptime);
$status_line .= $ypwhich;

# Check on the popular daemons. This is
# different depending on OS type.
if ("$ostype" eq 'SunOS-4'
|| "$ostype" eq 'NeXT') {
substr($status_line, 36, 6) = 'blmpsx';
$cmd = "ps -ax | grep '?'";
@rsh = split(/\n/,
`/usr/ucb/rsh -n $host $cmd 2>&1`);
foreach $line (@rsh) {
if ($line =~ /screenblank/) {
substr($status_line, 36, 1) = 'B';
next; }
if ($line =~ /\/usr\/lib\/lpd/) {
substr($status_line, 37, 1) = 'L';
next; }
if ($line =~ /sendmail/) {
substr($status_line, 38, 1) = 'M';
next; }
if ($line =~ /yppasswdd/) {
substr($status_line, 39, 1) = 'P';
next; }
if ($line =~ /ypserv/) {
substr($status_line, 40, 1) = 'S';
next; }
if ($line =~ /xntpd/) {
substr($status_line, 41, 1) = 'X';
next; }
} #foreach
}
else {
# SVR4 (Sun Solaris 2.x, DEC OSF/1).
substr($status_line, 36, 6) = 'blmstx';
$cmd = "ps -e | grep '?'";
@rsh = split(/\n/,
`/usr/ucb/rsh -n $host $cmd 2>&1`);
foreach $line (@rsh) {
# Truncated to 8 chars by ps -e
if ($line =~ /screenbl/) {
substr($status_line, 36, 1) = 'B';
next; }
if ($line =~ /lpsched/) {
substr($status_line, 37, 1) = 'L';
next; }
if ($line =~ /sendmail/) {
substr($status_line, 38, 1) = 'M';
next; }
if ($line =~ /ypserv/) {
substr($status_line, 39, 1) = 'S';
next; }
if ($line =~ /ttymon/) {
substr($status_line, 40, 1) = 'T';
next; }
if ($line =~ /xntpd/) {
substr($status_line, 41, 1) = 'X';
next; }
} #foreach
} #else not BSD
}
else {
# Fill in the problem.
if ($#rsh == -1) {
$status_line =
substr($status_line, 0, 7)
. 'NO RESPONSE!';
}
else {
$rsh[0] =~ tr/\n//d;
$status_line =
substr($status_line, 0, 7).$rsh[0];
}
}

# Finally, display the results.
$host =~ s/^([a-z0-9-]*).*$/$1/;
&do_print('b', sprintf("%16.16s: %s\n",
$host, $status_line));
}
} # End of "do_poll" subroutine.

#------------------------------------------------------
# do_print | Subroutine to print to both stdout and to
#----------- the currently open mail pipe. We are
# sloppy with the pipe. The next open on the same
# filehandle closes the previous pipe. Exiting from
# the script closes the final pipe. The overall goal
# is that a complete report goes to stdout, while a
# copy of the output covering each lab goes to the lab
# manager.
#
# Arguments:
# 0 - "s" for stdout, "b" for both.
# 1 - string to print.
#
# Global variables used:
#   $current, $debug, $lab, $getngrp_skip*
# Global variables modified:
#   $current
sub do_print {
if ($_[0] eq 'b' && (! $opt_s)) {
# Print to both.
# If this is the first time the routine is
# called for this lab, open a pipe to the mail
# command for the report to the lab manager.
if ($current ne $lab) {
$current = $lab;
if ($debug eq '') {
open(MAIL,
"|mail -s \"Host Probe for"
." $lab\" manager\@$lab");
}
else {
open(MAIL,
"|mail -s \"DEBUG Host Probe for"
." $lab\" lees");
}
select((select(MAIL), $| = 1)[0]);
chop($date = `date`);
printf MAIL "CPS Host Probe $ver from"
." $hostname\n$date\n\n";
printf MAIL " BSD: screenBlank Lpd    "
." sendMail ypPasswdd ypServe Xntpd\n";
printf MAIL "SVR4: screenBlank Lpsched"
." sendMail ypServe   Ttymon  Xntpd\n";
printf MAIL
"Skipping groups:$getngrp_skipgroup\n";
printf MAIL
"Skipping hosts:$getngrp_skiphost\n";
}
print MAIL $_[1];
}

# Print to stdout.
print $_[1]
unless $opt_m;
} # End of "do_print" subroutine.

#------------------------------------------------------
# getngrp | The single argument is a netgroup entry.
#---------- This is either a subgroup name, or a
# parentheses enclosed tuple for a host. Normally, we
# start things off with a call of the form:
# do getngrp('cps').
#
# Our netgroup database looks like this:
#
# cps           lab1 lab2...
# lab1          lab1_s lab1_c
# lab1_s        (server1.cps.msu.edu,-,)\
#               (server2.cps.msu.edu,-,)
# lab1_c        lab1_next lab1_sparc
# lab1_next     (one.cps.msu.edu,-,)\
#               (two.cps.msu.edu,-,)
# lab1_sparc    lab1_ss1 lab1_ss2 lab1_ss10
# lab1_ss1      (three.cps.msu.edu,-,)\
#               (four.cps.msu.edu,-,)
# lab1_ss2      (five.cps.msu.edu,-,)\
#               (six.cps.msu.edu,-,)
# lab1_ss10     (seven.cps.msu.edu,-,)\
#               (eight.cps.msu.edu,-,)
# ...and so on.
#
# Two global arrays are used. @getngrp_stack is used
# as a temporary stack, %getngrp_lab accumulates the
# result. Its key is a concatenation of the netgroups,
# e.g., "cps prip prip_c", and the data is a
# concatenation of all the hosts for the key.
# Recursive routine.
#
# The global variables getngrp_skip* are checked for
# things to skip.
sub getngrp {
local($arg) = $_[0];
local($entry);

# Either a host tuple or a subgroup.
if (split(/\s+/, $arg) == 1) {
if ($arg =~ /\(/) {
# This is a host tuple. Skip hosts having
# a -gw suffix, because these are additional
# ports on the same box.
$arg =~ s/^\(([^,]*).*$/$1/;
$getngrp_hosts{"@getngrp_stack"} .= "$arg "
unless ($arg =~ /-gw/
|| $getngrp_skiphost =~ / $arg /);
}
else {
# Subgroup. Check the exception list.
if (! ($getngrp_skipgroup =~ / $arg /)) {
push(@getngrp_stack, $arg);
foreach $entry (split(/\s+/,
`ypmatch $arg netgroup`)) {
&getngrp($entry);
}
pop(@getngrp_stack);
}
}
}
} # End of "getngrp" subroutine.
# End of health_probe script.