#!/bin/csh -f
# $Id: check_atm,v 1.11 2001/05/21 09:34:27 jklasek Exp $
# 1/1997,3/1999 J. Klasek
# 1999-10-25 J. E. Klasek jklasek AT zid tuwien ac at
#
# check_atm [-t|--test] [-v|--verbose] [-f|--force]
#
# Test whether a fatal ATM error condition has arisen and restart
# the ILMI daemon on demand to restart the ATM interface(s).
#
# Parameters:
#   -t|--test      Just show actions but don't execute them.
#                  Errors are reported on STDOUT instead of using email.
#   -v|--verbose   Verbose output
#   -f|--force     Force action (without condition testing)
#
# Notes:
#   This script is called from cron.
#   Script is protected against multiple invocation using file system based locking
#   by means of "ln"-atomic test-and-set operation in conjunction with
#   polling.
#   A status file to the given logfile is created, called ${logfile}.status.
#   Permission to create this file must be granted.
#
# Crontab sample:
# 0,5,10,15,20,25,30,35,40,45,50,55 * * * * /pd/komstuff/bin/check_atm
#
# 10/1999 J.E. Klasek
#   Taken from chk_mountd @auto.tuwien.ac.at:/net/einstein/install/node_config/tools/admin
#
# $Log: check_atm,v $
# Revision 1.11 2001/05/21 09:34:27 jklasek
# Changed: Support for changed pathes in SunATM 4.0 (if existent).
#
# Revision 1.10 2001/01/25 13:29:56 jk
# Added: Gather statistic and setup data by means of some atm and lane tools for
# usage in the notification e-mail.
#
# Revision 1.9 2001/01/22 15:03:00 jk
# Changed: send-Alias uses STDIN for mail message; prevent OS specific syslog format
# dependency; retry interval and count reduced.
# Added: Debugging Log (if variable exists).
# Fixed: supressing of PID output on background process creation.
#
# Revision 1.8 1999/12/17 17:17:40 jklasek
# Fixed: Doku im Scriptkommentar korrigiert, aktualisiert.
#
# Revision 1.7 1999/12/17 17:13:00 jklasek
# New: Option --force/-f und --verbose/-v
#
# Revision 1.6 1999/11/10 08:13:41 jklasek
# Fix: korrekte $lold Initialisierung.
# # Revision 1.5 1999/11/09 16:42:36 jklasek # Fix: Restart bereits bei erstmaligem Vorkommen der Fehlermeldung initieren. # Change: Logisch äquivalente Umordnung der Zustandserkennung. # # Revision 1.4 1999/11/02 15:55:05 jklasek # Fix: send: [$myname@$host] # # Revision 1.3 1999/11/02 15:50:14 jklasek # Fix: $myname-Verwendung in send-Kommandos. # # Revision 1.2 1999/10/28 11:53:30 jklasek # Changed: Experimental to official mode. Dokumentation in header and configuration section. # # Revision 1.1 1999/10/27 15:24:52 jklasek # Initial revision # # set verbos set myname="$0" set myname="$myname:t" unalias * set test= while ($#argv > 0) if ("$1" == "-t" || "$1" == "--test") then set ftest else if ("$1" == "-f" || "$1" == "--force") then # force interface reload set fforce else if ("$1" == "-v" || "$1" == "--verbose") then set fverbose else break endif shift end # any parameter -> testmode too if ($#argv > 0) then set ftest endif ####### CONFIG SECTION ################################################# # ### ONLY IF EXPERIMENTAL #set test='echo Expirmental: ' # # mail to set mailto="emergency@noc.tuwien.at.ac" #set mailto="jklasek@noc.tuwien.ac.at" if ($?ftest) set mailto="jklasek@noc.tuwien.at.ac" # send alias send 'mailx -s \!^ "$mailto"' if ($?ftest) alias send 'cat ' # ATM binary dir. if (-d /etc/opt/SUNWatm/bin) then # SunATM 3.0 set atmbin=/etc/opt/SUNWatm/bin else if (-d /etc/opt/SUNWconn/atm/bin) then # SunATM 4.0 # Links are also existent in /etc/opt/SUNWconn/bin/ but # these are not suitable for searching the ps output. set atmbin=/etc/opt/SUNWconn/atm/bin else send "ERROR: no ATM software installed?" ! /tmp/.$USER.new.lock.$$ ln /tmp/.$USER.new.lock.$$ "/tmp/.$USER.$lock.active" >& /dev/null # "Es kann nur Einen geben ..." if ($status) then sleep $intervall @ timeout-- @ waited += $intervall if ($timeout > 0) goto retry_lock if (! -e $pstat) then touch $pstat echo "${myname}@${host}: Lock timeout after $waited seconds\!" 
echo " Previous process still running (pid=`cat /tmp/.$USER.$lock.active`)" echo " Try to resolve this hangup situation manually." endif /bin/rm -f /tmp/.$USER.new.lock.$$ exit 2 endif onintr unlock /bin/rm -f /tmp/.$USER.new.lock.$$ # PID in lock file vermerken echo $$ >! "/tmp/.$USER.$lock.active" ##### END LOCK ### check if action is forced if ($?fforce) then if ($?fverbose) echo "Force action (skipping condition checking) ..." goto restart endif ### check log status # This is a sample we are looking for: # they are coming very fast, approx. 5 per seconds # Solaris 2.6 format: # Oct 11 11:53:29 news unix: WARNING: laner: entry_query does setup with munged ATM address # Oct 11 11:53:29 news unix: WARNING: laner: bldng SETUP with null leading byte in dest atm addr, call type = 4 # Solaris 8 format: # Oct 11 11:53:29 news genunix: [ID 370175 kern.warning] WARNING: laner: entry_query does setup with munged ATM address # Oct 11 11:53:29 news genunix: [ID 370176 kern.warning] WARNING: laner: bldng SETUP with null leading byte in dest atm addr, call type = 4 # take the last 2 lines and extract only one part of the message pair; # some other message may temporarily appear and nothing is done, but # in one of the future intervals a match will occure. if ($?fverbose) then echo "Checking logfile $logfile ..." endif # ignore first part of syslog message to prevent tagging to OS specific format set l="`tail -2 $logfile | grep ' WARNING: laner: bldng SETUP with null leading byte in dest atm addr'`" # currently not in use because an error in the pipe results in an empty $l set s=status # Problem temporary or permanent away if ("$l" == "") goto unlock if (-r $loglast) then set lold="`cat $loglast`" else # $l != "": Message occured, but no previous exists => ATM failure started! echo "$l" >$loglast # force recover action set lold="" endif # no new line? 
-> do nothing if ("$l" == "$lold") goto unlock # make debug entry if enabled (variable exists) if ($?logdebug) then echo "###### "`date`" ###########################################################" >> $logdebug echo "new: $l" >> $logdebug echo "tail ${logfile}:" >> $logdebug tail -6 $logfile >> $logdebug echo "status ($loglast):" >> $logdebug cat $loglast >> $logdebug endif if ($?fverbose) then echo "ATM IFs currently not working - starting reinitializing sequence ..." endif cat < 0 && $count < 5) if ($count > 3) then $test /bin/kill -KILL $pid else $test /bin/kill -TERM $pid endif if ($?fverbose) then echo "Killing process $pid, round $count ..." endif @ count++ sleep 4 set pid=(`getpidlist`) end if ($#pid > 0 && ! $?ftest) then if ($?fverbose) then echo "ERROR: Can't terminate $proc, PID = ($pid)\!" endif cat <$loglast endif # reset error flag /bin/rm -f $pstat if ($?ftest || $?fverbose) then echo "Starting $proc" endif if (! $?ftest) then # enclosed in () to prevent PID output ($proc >& /dev/null &) endif else if (! -e $pstat) cat <