OpenVMS Source Code Demos
WATCHDOG
1000 %TITLE "Terminator_xxx.bas"                                ! title text for the compiler listing
%IDENT "Version_50.1"                                           ! identification text for the compiled module
!
! Program name and version. Both are printed in the start-up banner.
! Keep k_version in step with the %IDENT text above.
!
declare string constant k_program = "Terminator"
declare string constant k_version = "50.1"
!
!0123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890
!1 2 3 4 5 6 7 8 9 0 1 2 3
!=========================================================================================================================
! Title : Terminator_xxx.bas (a.k.a. Watchdog_xxx.bas)
! Model : Cyberdyne Systems Model 101 (T-800 series)
! Author : Neil S. Rieck
! Purpose: 1) log out idle terminals
! 2) kill "run away" processes (tell everyone on the grid "TRON LIVES")
! Target : VMS 4.5 (and up)
!=========================================================================================================================
! History:
!
! Ver Who When What
! --- --- ------ ---------------------------------------------------------------------------------------------------------
! 30 NSR 900926 1. Total rewrite (optimized & shortened)
! 2. the process stats array is now dimensioned at run time by the value of MaxProcessCnt which was
! returned by $GetSYI
! 3. $GetSYI is used to find the processor type so we can adjust the minimum CPU time required when
! considering a process active
! 31 NSR 901019 1. added code to send messages to the MASTER_PID when dealing with a sub-proc
! NSR 910121 2. warning at 15 mins and logout at 20 minutes
! 32 NSR 921117 1. warning at 25 mins and logout at 30 minutes
! 2. changed watchdog messages
! NSR 921120 1. warning at 20 + 40 minutes, logout at 60 minutes
! 33 NSR 921121 1. changed the name of this program to TERMINATOR and made the messages Arnold'esk
! 2. now give warnings at 15, 30, + 45 minutes
! 34 NSR 930424 1. now give warnings at 30 + 60 minutes
! 2. now kick them off at 90 minutes
! NSR 930706 3. now warn at 75 minutes only (kick at 90)
! NSR 931222 4. now don't log out the console (so I can test ARDIS SERVER)
! NSR 940104 5. now warn at 30 mins and kick off at 45 mins
! 35 NSR 940129 1. now can log out the console again
! 2. changed the warning message to something less annoying (will print on the 24th line of a terminal)
! NSR 940301 3. now warn at 45 mins and kick off at 60 mins
! NSR 940308 4. give people in account PCPCSM (P.C. Partners) extra time
! NSR 940322 5. added 'PRINT RC' to logfile before LIB$STOP
! 6. added a test for SS$_SUSPENDED to GetJpi
! 36 NSR 940420 1. now warn at 60 mins and kick off at 75 mins
! 2. added code to detect users that have consumed more than 1 hour of cpu time
! 37 NSR 941117 1. changed warn array from byte to long
! 2. changed the name 'Watchdog' to 'Terminator' in the messages sent to terminals
! NSR 960723 3. modified the logging code that detects when a user has consumed too much CPU time
! 4. added code to support DECWindows devices under VMS 5.5-2
! 38 NSR 960815 1. added code to properly handle DECwindows devices under Motif 1.2-3 (using $GetDvi)
! 960826 2. added code to get the image name so we can deal with certain DECwindows tasks
! 960906 3. added code to leave DECwindows task alone during the day
! 39 NSR 980304 1. added code to change the sleep time from system logical: CSMIS$WATCHDOG_SLEEP_MINS
! 40 AGD 980609 1. Replaced wcsm$src with csmis$inc
! 41 NSR 990207 1. noticed that some users can evade the watchdog by running WorkDB via spawn from TekWar (this blocks
! our nightly RMS file tuning).
! 2. changed MinBIO from 8 to 24 ("$ sho proc/acc" will generate 17 on any size VAX)
! 3. changed MinCPU from 4 {40 mS} to 10 {100 mS} ("$ sho proc/acc" will use ~ 12 mS on a VAX-3800)
! 42 NSR 000208 1. renovated for use with sys$library starlet
! NSR 000510 2. now user "SYSTEM" is never bothered (we could have system problems)
! 3. now device "OPA0:" is never bothered (we could have system problems)
! NSR 000824 4. added some debugging code to find out why certain tasks are being killed by the watch dog
! NSR 000825 5. now DECwindows tasks without a terminal are never stopped (Font Server)
! NSR 010622 6. alpha renovation
! NSR 011001 7. now NEIL, STEVE, and AL are no longer excluded from consideration
! 43 NSR 081103 1. documentation changes
! 44 NSR 120302 1. added code to kill runaway Apache processes (happens once every few weeks)
! 45 NSR 120303 1. mini-cleanup
! 2. now terminate users who consume more than 30 minutes of CPU time (previously was 60 minutes)
! NSR 120305 3. a few tweaks
! 46 NSR 120907 1. added code to kill runaway TCPware processes (happens once every few months)
! 47 NSR 120907 1. renovated to do away with "[.inc]vms_structures.inc"
! 2. introduced some optimizations (why request return lengths then never use them?) bf_47.2
! 3. a few more optimizations
! 48 NSR 130327 1. now need to watchdog some user procs without an attached terminal (like "SSHD 9999A") bf_48.1
! 2. increased maximum limit for Apache child processes bf_48.2
! 3. increased maximum limit for Apache worker processes bf_48.3
! NSR 130328 4. more tweaks to the log file
! 49 NSR 130329 1. more changes to SSHD section after a good night's sleep :-) bf_49.1
! 2. increased maximum limit for Apache child processes (again) bf_49.2
! 3. restored maximum limit for Apache worker processes bf_49.3
! 4. changes to logging logic
! 5. increased MinCPU because idle "SSHD PTD" always seem to consume enough CPU resources
! 6. increased MinBIO
! 50 NSR 130330 1. changes to logging logic
!=========================================================================================================================
! Notes:
!
! 1. I have employed multiple RETURNS in subroutines so that this program will consume the fewest possible resources on
! my over-worked VAX-11/730. It makes the logic a little hard to follow in some places. Sorry.
! 2. DCL commands used to COMPILE & LINK:
! a. BASIC watchdog_47.bas (yields: watchdog_47.obj)
! b. LINK watchdog_47.obj (yields: watchdog_47.exe)
! 3. DCL commands used to RUN:
! a. EDIT WATCHDOG.COM then insert the following:
! $ run directory:WatchDog_47 -
! /process_name = "Watch_Dog" -
! /UIC=[1,4] -
! /noswap -
! /priv = (world,oper) -
! /input = nl: -
! /ERROR = csmis$log:watchdog.err -
! /output = csmis$log:watchdog.out -
! /prior = 2
! $ exit
! b. @WatchDog
! c. place '@ directory:WatchDog.com' in SYS$MANAGER:SYSTARTUP.COM so watch dog is run every boot
!=========================================================================================================================
option type = explicit,                                         ! every variable must be declared &
       size = (real double)                                     ! REAL defaults to double precision
on error goto common_trap                                       ! errors not caught by a WHEN block go to common_trap
!
! OpenVMS system related stuff
!
%include "starlet" %from %library "sys$library:basic$starlet" ! system services
%include "$ssdef" %from %library "sys$library:basic$starlet" ! ss$
%include "$jpidef" %from %library "sys$library:basic$starlet" ! jpi$
%include "$syidef" %from %library "sys$library:basic$starlet" ! syi$
%include "$brkdef" %from %library "sys$library:basic$starlet" ! brk$
%include "lib$routines" %from %library "sys$library:basic$starlet" ! lib$
%include "$dvidef" %from %library "sys$library:basic$starlet" ! dvi$
%include "$iledef" %from %library "sys$library:basic$starlet" ! ile3$ (Item List Entry 3 structures)
!~~~ %include "$iosbdef" %from %library "sys$library:basic$starlet" x iosb$ (iosb structures)
!
! I need the following iosb to get around a limitation found in the BASIC version of starlet
!
! question : How did I know?
! answer : Hacking
! reference: https://neilrieck.net/docs/openvms_notes_hacking_starlet.html
!
record my_iosb                                                  ! 8-byte I/O status block with three overlaid views
    variant
    case                                                        ! view 1: vanilla (status, byte count, device info)
        group one
            word            iosb$w_status
            word            iosb$w_bcnt
            long            iosb$l_dev_depend
        end group one
    case                                                        ! view 2: used in sys$getqui
        group two
            long            iosb$l_getxxi_status
            long            iosb$l_reserved
        end group two
    case                                                        ! view 3: one quadword, the form passed to system calls
        group three
            basic$quadword  iosb$quad                           ! unsigned quad word (satisfies the compiler)
        end group three
    end variant
end record my_iosb
!
!
%include "[.inc]device_controls.inc" ! vt escape sequences
!
external string function WCSM_dt_stamp,                         ! date-time stamp (ccyymmddhhmmss) &
                         WCSM_trnlnm(string, string)            ! translate logical name (called with name, table)
!
!
! work variables shared by the main loop and the subroutines
!
declare long    rc              ,                               ! return code (system service status) &
                MinCPU          ,                               ! minimum CPU threshold (10 mS ticks) to count as active &
                MinBIO          ,                               ! minimum buffered I/O threshold to count as active &
                log_level%      ,                               ! logging level currently in force &
                sleep_minutes%  ,                               ! minutes to sleep between scans &
                temp%           ,                               ! scratch integer &
                web%            ,                               ! CPU limit in seconds for web/SSHD procs (0 = no limit set) &
                day%                                            ! 1 when the hour is 06 thru 22, otherwise 0
declare string  junk$           ,                               ! scratch string &
                prefix$         ,                               ! tag set just before each gosub display_proc_info &
                snap$                                           ! date-time snap shot (ccyymmddhhmmss)
!
declare my_iosb IosbJpi ,                                       ! I/O status block for GetJpiW &
                IosbBrk ,                                       ! I/O status block for BrkThru &
                IosbSyi                                         ! I/O status block for GetSyiW
!
!-----------------------------------------------------------------------
! declare new data type called: SyiRec
!-----------------------------------------------------------------------
record SyiRec                                                   ! item list handed to sys$getsyiw
    ile3 ItemVar(1)                                             ! two entries, subscripts 0 and 1
    long list_term                                              ! longword marking the end of the list
end record SyiRec
!
declare SyiRec SyiBuf                                           ! the item list itself
!
! Longwords that receive system-service results.
! CPU_Type and MaxProcessCnt are filled in by sys$getsyiw. Master_PID is a sys$getjpiw buffer.
!
declare long    CPU_Type        ,                               ! CPU identification number &
                MaxProcessCnt   ,                               ! maximum process count (from SysGen) &
                Master_PID                                      ! master PID of a sub process
!-----------------------------------------------------------------------
! Fill in the SyiBuf entries
!-----------------------------------------------------------------------
!
! item 0: CPU type
!
SyiBuf::ItemVar(0)::ile3$w_code             = SYI$_CPU
SyiBuf::ItemVar(0)::ile3$w_length           = 4                 ! long (4 bytes)
SyiBuf::ItemVar(0)::ile3$ps_bufaddr         = loc(CPU_Type)
SyiBuf::ItemVar(0)::ile3$ps_retlen_addr     = 0                 ! 0 = return length not wanted bf_47.2
!
! item 1: maximum process count
!
SyiBuf::ItemVar(1)::ile3$w_code             = SYI$_MaxProcessCnt
SyiBuf::ItemVar(1)::ile3$w_length           = 4                 ! long (4 bytes)
SyiBuf::ItemVar(1)::ile3$ps_bufaddr         = loc(MaxProcessCnt)
SyiBuf::ItemVar(1)::ile3$ps_retlen_addr     = 0                 ! 0 = return length not wanted bf_47.2
!
SyiBuf::list_term                           = SYI$C_ListEnd     ! mark end of list
!
! /// DVI Setup ///
!
! declare a new data type called: DviRec
!
record DviRec                                                   ! item list handed to sys$getdviw
    ile3 ItemVar(0)                                             ! a single entry, subscript 0
    long list_term                                              ! longword marking the end of the list
end record DviRec
!
declare DviRec DviBuf                                           ! the item list itself (entries are filled in at the point of use)
!
!-----------------------------------------------------------------------
! declare new data type called: JpiRec
!-----------------------------------------------------------------------
record JpiRec                                                   ! item list handed to sys$getjpiw by the main scan
    ile3 ItemVar(12)                                            ! thirteen entries, subscripts 0 thru 12
    long list_term                                              ! longword marking the end of the list
end record JpiRec
!
declare JpiRec JpiBuf                                           ! the item list itself
!
! Buffers filled in by sys$getjpiw
!
! Note: a variable with an '_RtnLn' suffix is the return-length cell for the string buffer declared just
! before it. After a call, VMS places the number of bytes returned in each '_RtnLn' variable.
!
declare long    PID             ,                               ! process PID &
                NewCPU          ,                               ! CPU time used (10 mS ticks) &
                NewBIO          ,                               ! buffered I/O count &
                Subcount        ,                               ! number of subprocesses &
                PrcIdx                                          ! process index in the VMS table
map (Jpi)       string  UserName        = 12    ,               ! user name &
                long    UserName_RtnLn          ,               ! length returned &
                string  ProcName        = 15    ,               ! process name &
                long    ProcName_RtnLn          ,               ! length returned &
                string  TTY_Name        = 10    ,               ! TTY port name (_LTA9999:) &
                long    TTY_Name_RtnLn          ,               ! length returned &
                string  Account         = 12    ,               ! user's account &
                long    Account_RtnLn           ,               ! length returned &
                string  ImagName        = 255   ,               ! image name &
                long    ImagName_RtnLn          ,               ! length returned &
                long    grp_buffer              ,               ! group (decimal) &
                long    mem_buffer                              ! member (decimal)
!
!-----------------------------------------------------------------------
! Fill in the JpiBuf entries
!-----------------------------------------------------------------------
!
! item 0: process id
!
JpiBuf::ItemVar(0)::ile3$w_code             = JPI$_PID
JpiBuf::ItemVar(0)::ile3$w_length           = 4                 ! long (4 bytes)
JpiBuf::ItemVar(0)::ile3$ps_bufaddr         = loc(PID)
JpiBuf::ItemVar(0)::ile3$ps_retlen_addr     = 0                 ! 0 = return length not wanted bf_47.2
!
! item 1: CPU time used (in 10 mS ticks)
!
JpiBuf::ItemVar(1)::ile3$w_code             = JPI$_CpuTim
JpiBuf::ItemVar(1)::ile3$w_length           = 4                 ! long
JpiBuf::ItemVar(1)::ile3$ps_bufaddr         = loc(NewCPU)
JpiBuf::ItemVar(1)::ile3$ps_retlen_addr     = 0                 ! 0 = return length not wanted bf_47.2
!
! item 2: buffered I/O count
!
JpiBuf::ItemVar(2)::ile3$w_code             = JPI$_BufIo
JpiBuf::ItemVar(2)::ile3$w_length           = 4                 ! long
JpiBuf::ItemVar(2)::ile3$ps_bufaddr         = loc(NewBIO)
JpiBuf::ItemVar(2)::ile3$ps_retlen_addr     = 0                 ! 0 = return length not wanted bf_47.2
!
! item 3: user name
!
JpiBuf::ItemVar(3)::ile3$w_code             = JPI$_UserName
JpiBuf::ItemVar(3)::ile3$w_length           = 12                ! size of UserName
JpiBuf::ItemVar(3)::ile3$ps_bufaddr         = loc(UserName)
JpiBuf::ItemVar(3)::ile3$ps_retlen_addr     = loc(UserName_RtnLn)
!
! item 4: terminal name
!
JpiBuf::ItemVar(4)::ile3$w_code             = JPI$_Terminal
JpiBuf::ItemVar(4)::ile3$w_length           = 10                ! size of TTY_Name
JpiBuf::ItemVar(4)::ile3$ps_bufaddr         = loc(TTY_Name)
JpiBuf::ItemVar(4)::ile3$ps_retlen_addr     = loc(TTY_Name_RtnLn)
!
! item 5: process index
!
JpiBuf::ItemVar(5)::ile3$w_code             = JPI$_Proc_Index
JpiBuf::ItemVar(5)::ile3$w_length           = 4                 ! long
JpiBuf::ItemVar(5)::ile3$ps_bufaddr         = loc(PrcIdx)
JpiBuf::ItemVar(5)::ile3$ps_retlen_addr     = 0                 ! 0 = return length not wanted bf_47.2
!
! item 6: number of subprocesses
!
JpiBuf::ItemVar(6)::ile3$w_code             = JPI$_PrcCnt
JpiBuf::ItemVar(6)::ile3$w_length           = 4                 ! long
JpiBuf::ItemVar(6)::ile3$ps_bufaddr         = loc(Subcount)
JpiBuf::ItemVar(6)::ile3$ps_retlen_addr     = 0                 ! 0 = return length not wanted bf_47.2
!
! item 7: master PID
!
JpiBuf::ItemVar(7)::ile3$w_code             = JPI$_Master_PID
JpiBuf::ItemVar(7)::ile3$w_length           = 4                 ! long
JpiBuf::ItemVar(7)::ile3$ps_bufaddr         = loc(Master_PID)
JpiBuf::ItemVar(7)::ile3$ps_retlen_addr     = 0                 ! 0 = return length not wanted bf_47.2
!
! item 8: user's account
!
JpiBuf::ItemVar(8)::ile3$w_code             = JPI$_Account
JpiBuf::ItemVar(8)::ile3$w_length           = 12                ! size of Account
JpiBuf::ItemVar(8)::ile3$ps_bufaddr         = loc(Account)
JpiBuf::ItemVar(8)::ile3$ps_retlen_addr     = loc(Account_RtnLn)
!
! item 9: image name
!
JpiBuf::ItemVar(9)::ile3$w_code             = JPI$_ImagName
JpiBuf::ItemVar(9)::ile3$w_length           = 255               ! size of ImagName
JpiBuf::ItemVar(9)::ile3$ps_bufaddr         = loc(ImagName)
JpiBuf::ItemVar(9)::ile3$ps_retlen_addr     = loc(ImagName_RtnLn)
!
! item 10: group
!
JpiBuf::ItemVar(10)::ile3$w_code            = JPI$_Grp
JpiBuf::ItemVar(10)::ile3$w_length          = 4                 ! long
JpiBuf::ItemVar(10)::ile3$ps_bufaddr        = loc(grp_buffer)
JpiBuf::ItemVar(10)::ile3$ps_retlen_addr    = 0                 ! 0 = return length not wanted bf_47.2
!
! item 11: member
!
JpiBuf::ItemVar(11)::ile3$w_code            = JPI$_Mem
JpiBuf::ItemVar(11)::ile3$w_length          = 4                 ! long
JpiBuf::ItemVar(11)::ile3$ps_bufaddr        = loc(mem_buffer)
JpiBuf::ItemVar(11)::ile3$ps_retlen_addr    = 0                 ! 0 = return length not wanted bf_47.2
!
! item 12: process name
!
JpiBuf::ItemVar(12)::ile3$w_code            = JPI$_PrcNam
JpiBuf::ItemVar(12)::ile3$w_length          = 15                ! size of ProcName
JpiBuf::ItemVar(12)::ile3$ps_bufaddr        = loc(ProcName)
JpiBuf::ItemVar(12)::ile3$ps_retlen_addr    = loc(ProcName_RtnLn)
!
JpiBuf::list_term                           = JPI$C_ListEnd     ! end of list
!-----------------------------------------------------------------------
! Declare a new data type called JpiRec2
! (for use with sub procs)
!-----------------------------------------------------------------------
record JpiRec2                                                  ! one-entry item list for sys$getjpiw
    ile3 ItemVar                                                ! the single entry (not an array)
    long list_term                                              ! longword marking the end of the list
end record JpiRec2
!
declare JpiRec2 JpiBuf2                                         ! the item list itself
!
! The only item asks for the terminal name. It reuses the TTY_Name and TTY_Name_RtnLn buffers, so a call
! made with this list overwrites the terminal name fetched through JpiBuf.
!
JpiBuf2::ItemVar::ile3$w_code               = JPI$_Terminal
JpiBuf2::ItemVar::ile3$w_length             = 10                ! size of TTY_Name
JpiBuf2::ItemVar::ile3$ps_bufaddr           = loc(TTY_Name)
JpiBuf2::ItemVar::ile3$ps_retlen_addr       = loc(TTY_Name_RtnLn)
!
JpiBuf2::list_term                          = JPI$C_ListEnd     ! end of list
!-----------------------------------------------------------------------
! Misc Declarations
!-----------------------------------------------------------------------
declare string  Message$        ,                               ! string for breakthru &
                trailer$                                        ! etc...
declare long    SeedPID         ,                               ! pid cell for the wildcard GETJPI scan (starts at -1) &
                desired_action%                                 ! verdict set by Warn_User, tested by Check_User
!
! desired_action% values:
! 3 = CPU limit exceeded (message sent, process terminated)
! 2 = idle too long (message sent, process terminated)
! 1 = only warn him
! 0 = skip
!=======================================================================
! Initialize
!=======================================================================
init:
2000 margin #0, 132                                             ! wide lines in case logging is turned on
!
! start-up banner: program name and version underlined with '=' characters, then the start time
!
print k_program +"_"+ k_version
print string$(len(k_program +"_"+ k_version), asc("="))
snap$ = wcsm_dt_stamp
print "-i-Starting at: "+ left$(snap$,8) +"."+ right$(snap$,9)  ! ccyymmdd.hhmmss
print "-i-reminder: group + member values are displayed in decimal rather than octal"
!
! Ask GetSYI for the two items described by SyiBuf:
! 1. processor type (CPU_Type)
! 2. MaxProcessCnt (from SYSGEN)
! Anything other than SS$_Normal is logged and then stops the program.
!
rc = sys$getsyiw (,,, SyiBuf by ref, IosbSyi::iosb$quad by ref,,)
if rc <> SS$_Normal then
    print "-e-GetSyi: "+ str$( rc )
    sleep 1
    call lib$Stop (rc by value)                                 ! kill self (seems rather extreme)
end if
!
! Activity thresholds. The per-CPU selection below is disabled, so every system uses the large-system values.
!
!~ select CPU_type x
!~ case pr$_sid_typ730 , x 0.3 MIPS CPUs (VAX730) &
!~ pr$_sid_typ750 , x 0.6 MIPS CPUs (VAX750) &
!~ pr$_sid_typ780 x 1.0 MIPS CPUs (VAX780)
!~ MinCpu = 10 x 10 x 10 = 100 mS
!~ MinBIO = 24 x
!~ case else x Larger MIPS System
MinCpu = 20                                                     ! 20 x 10 = 200 mS
MinBIO = 50
!~ end select x
!
! Per-process tables sized at run time from MaxProcessCnt. Check_User indexes them with PrcIdx.
!
dim long    CpuTim (MaxProcessCnt)  ,                           ! cpu time buffer &
            Bufioc (MaxProcessCnt)  ,                           ! buffered i/o count buffer &
            PID    (MaxProcessCnt)  ,                           ! PID buffer &
            Warn   (MaxProcessCnt)                              ! warning count buffer
!
! mat CpuTim = zer
! mat BufIoc = zer
! mat PID = zer
! mat Warn = zer
!
!
!=======================================================================
! MAIN
!=======================================================================
main:
!
! Both settings start at values the logicals can never produce, so the first pass through the loop
! always writes the effective logging level and sleep interval to the log.
!
log_level% = -1                                                 ! forces a log entry on the first pass
sleep_minutes% = 0                                              ! forces a log entry on the first pass
3000 while 1                                                    ! do this forever (one pass per sleep interval)
snap$ = wcsm_dt_stamp                                           ! snapshot of current time (ccyymmddhhmmss)
!-------------------------------------------------------------------
! <<< support logging >>>
!
! The level is re-read on every pass from logical CSMIS$WATCHDOG_LOG in LNM$SYSTEM_TABLE:
!
! 0 = most logging off (except program errors + warnings)
! note: all process terminations are warnings
! 1 = more info
! 2 = even more info (including: DECwindows)
! 3 = even more info (including: NEIL, STEVE, DAVE)
! 4 = log everything
!
! Values outside 0 thru 4 are clamped to the nearest documented level. The upper clamp must be 4, not 3,
! otherwise level 4 can never be selected and every 'log_level% >= 4' test in this program is dead code.
! If the translation cannot be converted to an integer, an error is logged and level 0 is used.
!-------------------------------------------------------------------
when error in
    junk$ = WCSM_TrnLnm ("CSMIS$WATCHDOG_LOG","LNM$SYSTEM_TABLE")
    temp% = integer(junk$)
use
    print "-e-logical CSMIS$WATCHDOG_LOG not numeric: "+ junk$
    temp% = 0                                                   ! no logging
end when
select temp%                                                    ! clamp to the documented range
case < 0
    temp% = 0
case > 4
    temp% = 4
end select
if log_level% <> temp% then                                     ! if changing (since last pass thru)
    log_level% = temp%
    print "-i-Logging level set to "+ str$(log_level%) +" at "+ left$(snap$,8) +"."+ right$(snap$,9)
end if
!-------------------------------------------------------------------
! <<< support sleep >>>
!
! The interval is re-read on every pass from logical CSMIS$WATCHDOG_SLEEP_MINS in LNM$SYSTEM_TABLE.
! Anything outside 1 thru 5, or anything that is not numeric, falls back to 5 minutes.
!
! caveats:
! 1) do not change from 5 minutes without checking code below. Search for ">= 18"
! 2) run this program interactively then hit <enter> 18 times (once per second) to log everyone out
!-------------------------------------------------------------------
when error in
    junk$ = WCSM_TrnLnm ("CSMIS$WATCHDOG_SLEEP_MINS","LNM$SYSTEM_TABLE")
    temp% = integer(junk$)
    temp% = 5 if (temp% < 1) or (temp% > 5)                     ! out of range: default to 5 minutes
use
    print "-e-logical CSMIS$WATCHDOG_SLEEP_MINS not numeric: "+ junk$
    temp% = 5                                                   ! default to 5 minutes
end when
if sleep_minutes% <> temp% then                                 ! log the interval only when it changes
    sleep_minutes% = temp%
    print "-i-Sleep_Minutes now set to "+ str$(sleep_minutes%) +" at "+ left$(snap$,8) +"."+ right$(snap$,9)
end if
!
! day% = 1 while the hour field of the snapshot is 06 thru 22, otherwise 0 bf_48.1
!
if (mid$(snap$,9,2) >= "06") and (mid$(snap$,9,2) <= "22") then
    day% = 1
else
    day% = 0
end if
!===================================================================
! <<< original watchdog code starts here >>>
!
! Walk the process list with a wildcard sys$getjpiw (items described by JpiBuf), decide for each process
! whether it should be examined, and gosub Check_User for the ones that should. When the list is
! exhausted, sleep for sleep_minutes% minutes and let the outer loop start the next pass.
!===================================================================
SeedPID = -1 ! -1 requests a wildcard GETJPI scan
rc = 0 ! clear previous status so the scan loop is entered
!
while rc <> SS$_NoMoreProc ! -------------------------------------------------
rc = sys$getjpiw( , &
SeedPID by ref,, &
JpiBuf by ref, &
IosbJpi::iosb$quad by ref,,) ! get process info
Select rc !
case SS$_Normal ! the call worked so fall thru
case SS$_suspended,SS$_NoPriv,SS$_NonExpr,SS$_NoMoreProc ! nothing to kill
iterate ! back to the while test (SS$_NoMoreProc ends the scan there)
case else ! any other status is treated as fatal
print "-i-Seed : "+ str$( SeedPID ) ! print seed (for debug)
print "-e-GetJpi: "+ str$( rc ) ! print rc (for debug)
sleep 1 !
call LIB$Stop(rc by value) ! oops, better kill self
end select !
!
! this stub is just for debugging/hacking/testing etc.
! (it only logs; every path rejoins the normal flow at scan_continue)
!
select log_level% !
case >= 4 ! log everything
prefix$ = "-i-debug-L4>" !
gosub display_proc_info !
goto scan_continue !
case >= 3 ! log selective
select edit$(left$(UserName,UserName_RtnLn), 32+2) ! upcase, no white space
case "NEIL","STEVE","DAVE" ! log developers
prefix$ = "-i-debug-L3>" !
gosub display_proc_info !
goto scan_continue !
end select !
end select !
!
if log_level% >=2 then !
if pos(left$(ImagName,ImagName_RtnLn),"DECW$",1)>0 then ! log DECWindows stuff
prefix$ = "-i-debug-L2>" !
gosub display_proc_info !
end if !
end if !
scan_continue: !
!
!===============================================================
! This block of code deals with certain Apache runaway problems (uses too much CPU time)
! 1) I just tuned Apache so that child processes do not live forever even though keepalives are enabled
! KeepAlive On
! MaxKeepAliveRequests 99
! KeepAliveTimeout 120
! MaxRequestsPerChild 999
! This means that APACHE$SWS should be killing them gracefully before watchdog does
! 2) "I think" encrypted connections consume more CPU time than we previously thought
! 3) Apache worker processes only run once then exit
! 4) Apache worker processes running Dave's powernode stuff "may" require more CPU time
! caveat: 60 seconds of CPU time is really a really long time
!
! web% carries the CPU limit (in seconds) to Check_User. A non-zero value there selects the
! CPU-limit test instead of the idle test.
!===============================================================
web% = 0 ! init to not-web-mode
junk$ = edit$(left$(ProcName,ProcName_RtnLn),32+2) ! upcase, no white space
iterate if junk$ = "APACHE$SWS" ! never touch the master Apache process
!
if pos(junk$,"APACHE$SWS",1)>0 then ! eg. APACHE$SWS0009 (child process)
web% = 999 ! maximum limit = 999 CPU seconds bf_49.2
goto check_this_proc !
end if !
if pos(junk$,"APACHE$WW_",1)>0 then ! eg. APACHE$WW_62712 (worker process)
web% = 60 ! maximum limit = 60 CPU seconds bf_49.3
goto check_this_proc !
end if !
!---------------------------------------------------------------
! This is special code which deals with certain TCPware runaway problems.
! No warning will be sent to the offender, we will just terminate it.
!---------------------------------------------------------------
iterate if junk$ = "SSHDMASTER" ! never touch the master SSHD process
!---------------------------------------------------------------
!OpenVMS V8.4 on node KAWC15 27-MAR-2013 11:22:06.82 Uptime 181 13:46:58
! Pid Process Name State Pri I/O CPU Page flts Pages
!00144060 SSHD 0272 LEF 6 14158 0 00:00:01.17 832 798 <<< SYSTEM (for specified USER)
!000B3282 SSHD 0272A PTD LEF 4 3146 0 00:00:00.67 2539 607 <<< USER=AN_DHALLA
!001136B0 SSHD 0272B PTD LEF 8 4922 0 00:00:00.78 1144 798 <<< USER=AN_DHALLA
!0012624F SSHD 0272C PTD LEF 9 6270 0 00:00:01.08 1683 759 <<< USER=AN_DHALLA
!---------------------------------------------------------------
! An SSHD process whose name contains PTD gets no CPU limit here. It falls through to the
! terminal tests below like any other process.
!---------------------------------------------------------------
if pos(junk$,"SSHD",1)>0 then ! eg. SSHD 0123 (client-related process)
if pos(junk$,"PTD",1)=0 then ! if this is the SYSTEM proc (no PTD)
if day% = 1 then ! if day time bf_48.1
web% = 600 ! harder to kill during the day (600 CPU seconds)
else ! else night time
web% = 300 ! allow easier kill at night (300 CPU seconds)
end if !
goto check_this_proc !
end if !
end if !
!===============================================================
! 1. if no tty, this could be a detached system task so skip
! 2. if no tty, this could be a detached DECwindows task (like session manager) so continue
! 3. if no tty, but master_pid <> pid (this is a subprocess), then send a message to the parent task
!===============================================================
if TTY_Name_RtnLn=0 then ! if no terminal ----------------------------------
!
!~~~ iterate if master_PID=0 x if a system proc (VMS 4.5)
!
! ignore SYSTEM [1,4] .. [1,10]
! ignore CUSTODIAN [346,6]
!
select grp_buffer ! test group (value is decimal)
case <= 7 ! if SYSTEM (group: 1->7)
print "-i-skipping tests of SYSTEM process" if log_level% >= 4
iterate !
case ((3 * 64) + (4 * 8) + 6) ! if CUSTODIAN grp: 346 (octal:346 = decimal:230)
if mem_buffer = 6 then ! then ignore member 6
print "-i-skipping tests of CUSTODIAN process" if log_level% >= 4
iterate !
end if !
end select !
!
iterate if pid = master_PID ! if not a sub proc
!
! This is a subprocess so do another GetJPI to find out which terminal the parent process is using
! (JpiBuf2 refills TTY_Name and TTY_Name_RtnLn. The status left in rc is not tested here.)
!
rc = sys$getjpiw ( ,Master_PID by ref,, &
JpiBuf2 by ref, &
IosbJpi::iosb$quad by ref,, )
iterate if TTY_Name_RtnLn=0 ! if still no terminal (then not a user)
end if ! -------------------------------------------------
!
! We've found a user, so let's see if he's busy
!
check_this_proc:
gosub Check_User ! gather stats (etc.)
next ! Go back for next process ------------------------
!
sleep sleep_minutes% * 60 ! Hibernate (minutes converted to seconds)
snap$ = wcsm_dt_stamp ! ccyymmddhhmmss
print "-i-wake: "+ left$(snap$,8) +"."+ right$(snap$,9) if log_level% > 0
next ! Check whole system again ------------------------
!
!=======================================================================
! Check If User Has Been Active Since Last Pass
!=======================================================================
4000 CHECK_USER:
!
! Purpose: decide whether the process just fetched by the main scan was active since the previous pass
! and either record its statistics, count it as idle (and possibly warn or terminate it), or terminate
! it for exceeding a CPU limit.
!
! Reads : PID, PrcIdx, NewCPU, NewBIO, Subcount, UserName, TTY_Name, ImagName, web%, MinCPU, MinBIO
! Updates: PID(), CpuTim(), Bufioc(), Warn() at subscript PrcIdx
! Exits : transcribe_n_exit / transcribe_n_exit2 (record stats, reset Warn) or skip_transcribe_exit
!
! Program Logic Notes:
!
! 1. When this program ran on a small system with DZ11's, we used a small array to record process info for each device.
! The device name was used to index the array so that the index for OPA0: was 0, for TTA0: was 1, for TTB0: was 9,
! for TTC0: was 17, etc.
! 2. This method was unusable when we went to terminal servers because the terminal server numbers are dynamic and keep
! cycling between LTA1: LTA9999: (even if you don't have 10,000 servers!!) and we didn't want 10,000 element arrays.
! 3. Now we use JPI$_Proc_Index (from SYS$GetJpiW) to generate a unique number in the range of 1 -> MaxProcessCnt
! (sysgen parameter).
!
goto transcribe_n_exit2 if PID(PrcIdx) <> PID ! a different process now owns this slot: just record it
goto transcribe_n_exit2 if Subcount > 0 ! process owns sub procs: record it and reset its warning count
!
! note: if Apache is started by user=SYSTEM at boot, then we must test CPU usage here because
! user=SYSTEM escapes termination tests approximately 40 lines below
!
if web% > 0 then ! if a web process (web% = CPU limit in seconds)
select NewCPU ! time in 10 mS ticks
case >= 100 * web% ! at or over the limit (100 ticks per second)
prefix$ = "-w-Web-9>" !
gosub display_proc_info ! display info no matter what the logging level
print "-w-CPU time: "+ str$(NewCPU) +" (10 mS ticks)" ! time in 10 mS ticks
print "-w- : "+ str$(web%) +" (secs)" ! the limit that was exceeded
rc = sys$delprc(PID by ref,) ! then give him the boot
select rc !
case SS$_Normal !
print "-w-web process terminated" !
case SS$_NonExpr ! just logged out ???
case else !
print "-e-DelPrc: "+ str$(rc) ! could not kill proc
sleep 1 ! never go too fast
call LIB$Stop(rc by value) ! kill self (this is rather extreme)
end select !
goto skip_transcribe_exit ! cuz nothing to transcribe
case >= 100 * web% / 2 ! at or over half the limit: log only
if log_level% >= 1 then !
prefix$ = "-i-Web-8>" !
gosub display_proc_info !
print "-i-above half CPU limit" !
end if !
case else !
if log_level% >= 2 then !
prefix$ = "-i-Web-7>" !
gosub display_proc_info !
print "-i-below half CPU limit" !
end if !
end select !
goto transcribe_n_exit2 ! web processes never reach the idle tests below
end if !
!
! make sure certain processes never consume more than 59 minutes of CPU time
!
select edit$(left$(UserName,UserName_RtnLn),32+2) ! upcase, no white space
case "SYSTEM" !
if log_level% >= 2 then !
prefix$ = "-i-sys-1>" !
gosub display_proc_info !
print "-i-skipping further tests on this process (grp)" !
end if !
goto transcribe_n_exit2 ! don't ever kill SYSTEM processes (exit now)
case "NEIL","STEVE","DAVE" ! don't test max CPU time of these people (idle tests still apply)
if log_level% >= 1 then !
prefix$ = "-i-usr-1>" !
gosub display_proc_info !
print "-i-skipping max cpu tests on this process (usr)" !
end if !
case else !
select NewCPU ! let's see how much CPU time was used
case >= 100 * 60 * 59 ! if >= 59 minutes of cpu time
snap$ = wcsm_dt_stamp !
junk$ = edit$(left$(TTY_Name,TTY_Name_RtnLn),32+2) !
junk$ = "NONE" if edit$(junk$,4+2) = "" ! could be web process
print "-e-process: "+ edit$(left$(UserName,UserName_RtnLn),32+2) +&
" on term "+ junk$ +&
" has used too much CPU time: "+ left$(snap$,8) +"."+ right$(snap$,9)
Warn(PrcIdx) = 999 ! put peg count over the limit (Warn_User treats >= 999 as a CPU kill)
goto inactive_user ! blow him away
case else !
if log_level% >= 1 then !
prefix$ = "-i-usr-9>" !
gosub display_proc_info !
print "-i-max cpu tests passed" !
end if !
end select !
end select !
!
! Activity test: the process counts as active if either counter grew by at least its threshold
! since the values last transcribed for this slot.
!
print "-i-bio stats: idx=";PrcIdx;" prev=";Bufioc(PrcIdx);" curr=";NewBIO if log_level% >= 2
goto transcribe_n_exit if ((Bufioc(PrcIdx) + MinBIO) <= NewBIO ) ! if enough BIOs
!
print "-i-cpu stats: idx=";PrcIdx;" prev=";CpuTim(PrcIdx);" curr=";NewCpu if log_level% >= 2
goto transcribe_n_exit if ((Cputim(PrcIdx) + MinCpu) <= NewCPU ) ! if enough CPU
!
select edit$(left$(TTY_Name,TTY_Name_RtnLn),32+2) !
case "OPA0:" ! ignore the console device
print "-i- skipping termination on device OPA0" ! printed at every logging level
goto transcribe_n_exit !
end select !
!
if pos(left$(ImagName,ImagName_RtnLn),"DECW$",1%)>0 then ! if this is a DECWindows task...
select mid$(wcsm_dt_stamp,9,4) ! get current 24 hour time (2359)
case < "0800", > "1700" ! outside business hours: count it as idle
goto inactive_user !
case else ! during business hours: record stats and leave it alone
goto transcribe_n_exit !
end select !
end if !
!-----------------------------------------------------------------------
! This User Is Inactive (since the previous pass), so...
! 1. update the warning count
! 2. warn him if necessary (Warn_User warns at a count of 16)
! 3. blow him away if we've warned him too often (Warn_User asks for termination at a count of 18 or more)
! 4. do not transcribe (maybe he will eventually build up enough to trigger a transcribe)
!-----------------------------------------------------------------------
inactive_user: !
Warn(PrcIdx) = Warn(PrcIdx) + 1 ! increment warning count
print "-i-warning incremented to "+str$(Warn(PrcIdx)) if log_level% >= 2
gosub Warn_User ! warn the user (also sets desired_action%)
if desired_action% >= 2 then ! if desired_action% >= 2 (then terminate him)
prefix$ = "-w-debug-U9>" !
gosub display_proc_info if log_level% >= 1 !
rc = sys$delprc(PID by ref,) ! you are terminated
select rc ! how did it go?
case SS$_Normal !
print "-w-user process terminated" !
case SS$_NonExpr ! just logged out ???
!~~~ case SS$_NoSuchDev x just logged out ???
case else !
print "-e-DelPrc: "+ str$(rc) !
sleep 1 ! never go too fast
call LIB$Stop(rc by value) ! kill watch dog
end select !
end if !
goto skip_transcribe_exit ! do not transcribe
!-----------------------------------------------------------------------
! <<< Transcribe Statistics >>>
!
! user was active (or new) so transcribe stats, reset warn, then exit
! (transcribe_n_exit2 is the same exit without the log line)
!-----------------------------------------------------------------------
transcribe_n_exit: !
print "-i-transcribing stats (resetting warn)" if log_level% >= 2 !
transcribe_n_exit2: !
PID (PrcIdx) = PID ! record PID in case of new user
Cputim(PrcIdx) = NewCPU ! record cpu time
Bufioc(PrcIdx) = NewBIO ! record i/o count
Warn (PrcIdx) = 0 ! RESET
skip_transcribe_exit: !
Return !
!=======================================================================
! Build Warning Message and send to user's terminal
!=======================================================================
5000 Warn_User:
        !
        ! Purpose: decide what should happen to the process whose idle counter is Warn(PrcIdx)
        !          and, when a notice is due, broadcast it to the terminal named in TTY_Name
        !
        ! Reads  : Warn(PrcIdx), sleep_minutes%, TTY_Name, TTY_Name_RtnLn, log_level%
        ! Sets   : desired_action%   0 = skip (nothing is sent)
        !                            1 = warn only
        !                            2 = warn, caller then deletes the process (idle too long)
        !                            3 = warn, caller then deletes the process (cpu time)
        ! Also overwrites: snap$, message$, prefix$, rc, phyDevNam$, phyDevNam_RtnLn
        ! Exits  : through warn_user_exit (RETURN), or LIB$Stop when $BRKTHRU reports an
        !          unexpected status
        !
        ! notes:
        ! 1. since many of our green-screen apps have their own built in watch dogs set for 60 + 75 minutes
        !    (with a 1 minute resolution), we will set our watchdog times for 80 + 90 minutes
        ! 2. since we run every 'sleep_minutes%' minutes, a count of 16 yields (16 * 'sleep_minutes%') minutes
        !    (the 80 and 90 minute figures below assume sleep_minutes% = 5)
        ! 3. a count of 999 or more is presumably forced by the caller when too much cpu time
        !    was used (that code is not in this routine)
        !
        select Warn(PrcIdx)                                             ! first matching case wins
            case >= 999                                                 ! too much cpu time was used
                desired_action% = 3                                     ! flag = warn and stop
            case >= 18                                                  ! 18 times (90 minutes / 5 minutes)
                desired_action% = 2                                     ! flag = warn and stop
            case = 17                                                   ! between the warning and the stop
                desired_action% = 0                                     ! flag = skip
            case = 16                                                   ! 16 times (80 minutes / 5 minutes)
                desired_action% = 1                                     ! flag = warn only
            case else                                                   !
                desired_action% = 0                                     ! flag = skip
        end select                                                      !
        !
        ! nothing to send when the desired_action% is 0
        !
        goto warn_user_exit if desired_action% = 0                      !
        !
        ! build the part of the message common to every notice
        !
        snap$ = wcsm_dt_stamp                                           ! ccyymmddhhmmss
        message$ = vt$SaveCursor + vt$Normal + vt$Message + bel +       &
                "Terminator Time: " +                                   &
                left$( snap$, 8) +                                      ! ccyymmdd &
                "." +                                                   &
                right$( snap$, 9)                                       ! hhmmss
        !
        ! append the reason
        !
        select desired_action%                                          !
            case 1, 2                                                   ! idle too long
                message$ = message$ +                                   &
                        " Note: You have been inactive for " +          &
                        str$(Warn(PrcIdx)*sleep_minutes%) +" minutes"   !
            case 3                                                      ! cpu hog
                message$ = message$ +                                   &
                        " Note: You have used too much CPU time"        !
        end select                                                      !
        !
        ! append the trailer (a warning puts the cursor back, a stop announces the termination)
        !
        select desired_action%                                          !
            case 1                                                      !
                message$ = message$ + VT$RestoreCursor                  !
            case 2, 3                                                   !
                message$ = message$ + cr + lf + "Process Terminated"+ cr + lf + cr + lf
        end select                                                      !
        !
        ! experimental code for GetDvi (possible DECwindows support) Motif 1.2-3
        !
        map(dvi) string phyDevNam$      = 80    ,                       &
                 long   phyDevNam_RtnLn                                 !
        !
        if left$(TTY_Name,TTY_Name_RtnLn) <> "" then                    ! if we have a terminal name
            !
            DviBuf::ItemVar(0)::ile3$w_length       = 80                ! length of phyDevNam$
            DviBuf::ItemVar(0)::ile3$w_code         = Dvi$_TT_phyDevNam ! item
            DviBuf::ItemVar(0)::ile3$ps_bufaddr     = LOC( phyDevNam$)  ! addr of var
            DviBuf::ItemVar(0)::ile3$ps_retlen_addr = LOC( phyDevNam_RtnLn) ! addr of var
            !
            ! the map is static so the length returned for a previous process would still be
            ! here if the service fails; clear it so a stale value is never reported or tested
            !
            phyDevNam_RtnLn = 0                                         !
            !
            rc = sys$getdviw(,,left$(TTY_Name,TTY_Name_RtnLn),DviBuf by ref,,,,)
            print "-e-GetDvi rc: "+ str$(rc) if (rc and 1%) <> 1%       ! low bit clear = failure
            !
            print                                                       !
            print "-i-debug-U1> tty_name> "+ left$(TTY_Name,TTY_Name_RtnLn) +   &
                  " dev_name> "+ left$(phyDevNam$,phyDevNam_RtnLn) +            &
                  " time> "+ left$( snap$, 8) +"."+ right$( snap$, 9)   ! ccyymmdd.hhmmss
            prefix$ = "-i-debug-U2>"                                    !
            gosub display_proc_info                                     ! note: refreshes snap$
            !
            ! an empty physical device name is only meaningful when the service succeeded;
            ! after a failure we fall through and let $BRKTHRU decide
            !
            if ((rc and 1%) = 1%) and (phyDevNam_RtnLn = 0) then        !
                print "-i-disconnected task (not sending warning)"      ! don't send messages to disconnected procs
                goto warn_user_exit                                     ! desired_action% is left as is
            end if                                                      !
        end if                                                          !
        !
        ! classify the device by the first two characters of its name
        !
        select left$( left$(TTY_Name,TTY_Name_RtnLn), 2)                !
            case "TX","TT"                                              ! async controller
            case "LT","FT"                                              ! lat + DECterm
            case "NT","RT"                                              ! tcp/ip + decnet
            case "TW","PY"                                              ! DEC Windows Device (pre VMS 6.0)
                prefix$ = "DECwind>"                                    !
                gosub display_proc_info if log_level% >= 2              ! logged, message still sent
            case "MB"                                                   ! mail box device
                prefix$ = "MailBox>"                                    !
                gosub display_proc_info if log_level% >= 2              !
                goto warn_user_exit                                     ! jump (not a terminal)
            case ""                                                     !
                prefix$ = "None   >"                                    !
                gosub display_proc_info if log_level% >= 2              !
                goto warn_user_exit                                     ! jump (nothing to send to)
            case else                                                   !
                prefix$ = "Unknown>"                                    !
                gosub display_proc_info if log_level% >= 2              ! logged, message still sent
        end select                                                      !
        !
        ! Send Message to Terminal
        ! (only the immediate service status in rc is checked; IosbBrk is not examined here)
        !
        rc = sys$brkthru(       ,                                       &
                message$                                by desc,        &
                left$(TTY_Name,TTY_Name_RtnLn)          by desc,        &
                BRK$C_Device                            by value,       &
                IosbBrk::iosb$quad                      by ref,,,,,,    )
        select rc                                                       !
            case SS$_Normal                                             !
            case SS$_NoSuchDev                                          ! just logged out ???
            case else                                                   !
                print "-e-BrkThru: "+ str$(rc)                          !
                sleep 1                                                 ! never go too fast
                call LIB$Stop(rc by value)                              ! kill watch dog (is this wise?)
        end select                                                      !
        !
        warn_user_exit:                                                 !
        return                                                          !
!=======================================================================
! display process related info (to log file)
!=======================================================================
display_proc_info: !
! Writes a snapshot of the process currently being examined to the log:
! 1. a blank line, then a time stamp taken now (this overwrites snap$)
! 2. prefix$ (set by the caller before the GOSUB), PID, process name, username, account, terminal
! 3. grp_buffer, mem_buffer and the image name
! The process name, username, account, terminal and image name are each trimmed to their
! companion _RtnLn length; prefix$, process name and username are printed through
! left-justified FORMAT$ masks so the columns line up from one call to the next.
print ! blank separator line
snap$ = wcsm_dt_stamp ! fresh time stamp (ccyymmddhhmmss)
print "-i-time: "+ left$(snap$,8) +"."+ right$(snap$,9) ! ccyymmdd.hhmmss
print format$(left$(prefix$,12) ,"'LLLLLLLLLLL" ) +&
" PID>"+ format$(PID ,"########" ) +&
" Prc>"+ format$(left$(ProcName ,ProcName_RtnLn ) ,"'LLLLLLLLLLLLLL" ) +&
" User>"+ format$(left$(username ,username_RtnLn ) ,"'LLLLLLLLLLLLLL" ) +&
" Acnt>"+ left$(Account ,Account_RtnLn ) +&
" Term>"+ left$(tty_name ,tty_name_RtnLn )
print " Grp>"+ format$(grp_buffer ,"###" ) +&
" Mem>"+ format$(mem_buffer ,"###" ) +&
" Img>"+ left$(ImagName ,ImagName_RtnLn ) !
return ! back to the GOSUB caller
!=======================================================================
! Common Trap (BASIC error handler)
!
! this will go to sys$output (sys$error)
!=======================================================================
32000 common_trap:
! Error handler body (it is presumably armed elsewhere in the file; that statement is
! not visible in this section).
! Prints one report holding the failing line number (erl), the error number (err),
! the text for that error (ert$) and a ccyymmdd.hhmmss time stamp, then resumes at
! label fini so the program ends.
snap$ = wcsm_dt_stamp ! time of the failure (ccyymmddhhmmss)
print &
cr + lf + "-i-Line = "+ str$(erl) +&
cr + lf + "-i-Error= "+ str$(err) +&
cr + lf + "-i-Text = "+ ert$(err) +&
cr + lf + "-i-Time = "+ left$(snap$,8) +"."+ right$(snap$,9) !
!
resume fini ! fix stack + exit
!
!=======================================================================
! adios
!=======================================================================
Fini:
! program exit; common_trap resumes here after reporting an error
end !
!########################################################################################################################
!
32100 %include "[.fun]wcsm_dt_stamp.fun"
! function string WCSM_DT_Stamp
!
32110 %include "[.fun]wcsm_trnlnm.fun"
! function string WCSM_Trnlnm
!