--- loncom/cgi/clusterstatus.pl 2003/07/31 15:35:02 1.9 +++ loncom/cgi/clusterstatus.pl 2003/08/05 18:33:08 1.16 @@ -3,10 +3,11 @@ $|=1; # The LearningOnline Network with CAPA # Cluster Status # -# $Id: clusterstatus.pl,v 1.9 2003/07/31 15:35:02 www Exp $ +# $Id: clusterstatus.pl,v 1.16 2003/08/05 18:33:08 www Exp $ use lib '/home/httpd/lib/perl/'; use LONCAPA::Configuration; +use strict; use LWP::UserAgent(); use HTTP::Headers; @@ -19,6 +20,28 @@ my %connectionstatus=(); my %perlvar=(); my $mode; +my $concount=0; +my $fromcache; + +my %domaindescription = (); +my %domain_auth_def = (); +my %domain_auth_arg_def = (); + +my %hostname=(); +my %hostip=(); +my %hostdom=(); +my %hostrole=(); +my %libserv=(); + +my $maxusers=0; +my $maxload=0; +my $totalusers=0; + +my %FORM=(); + +my $stat_total=0; +my $stat_notyet=0; +my $stat_fromcache=0; sub select_form { my ($def,$name,%hash) = @_; @@ -47,13 +70,26 @@ sub hidden { sub request { my ($local,$url,$cachetime)=@_; + $cachetime*=(0.5+rand); my $key=&key($local,$url); my $reply=''; + $stat_total++; +# if fromcache flag is set, only return cached values + if ($fromcache) { + if ($FORM{$key.'_time'}) { + return $FORM{$key}; + $stat_fromcache++; + } else { + return 'not_yet'; + $stat_notyet++; + } + } +# normal mode, refresh when expired or not yet present if ($FORM{$key.'_time'}) { if ((time-$FORM{$key.'_time'})<$cachetime) { $reply=$FORM{$key}; &hidden($key.'_time',$FORM{$key.'_time'}); - &hidden($key.'_fromcache',1); + $stat_fromcache++; } } unless ($reply) { @@ -61,7 +97,7 @@ sub request { $reply='local_unknown'; } else { - my $ua=new LWP::UserAgent(timeout => 20); + my $ua=new LWP::UserAgent(timeout => 10); my $request=new HTTP::Request('GET', "http://".$hostname{$local}.$url); @@ -91,10 +127,17 @@ sub connected { unless ($hostname{$remote}) { return 'remote_unknown'; } my $url='/cgi-bin/ping.pl?'.$remote; # -# Slowly phase this in: if not cached, only do 10 percent of the cases +# Slowly phase this in: if not cached, only do 5 percent of the cases, +# but always do the first five. # unless ($FORM{&key($local,$url)}) { - unless (rand>0.9) { return 'not_yet'; } + unless (($concount<=5) || (rand>0.95)) { + $stat_total++; + $stat_notyet++; + return 'not_yet'; + } else { + $concount++; + } } # # Actually do the query @@ -169,20 +212,40 @@ sub server { print &otherwindow($local,'/server-status','Server Status'); } +# ========================================================= Produce a green bar +sub bar { + my $parm=shift; + my $number=int($parm+0.5); + print "
"; + for (my $i=0;$i<$number;$i++) { + print "+"; + } + print "
"; +} + # ========================================================== Show server status sub serverstatus { - my $local=shift; + my ($local,$trouble)=@_; print (< "; + if ($trouble) { + print (""); + } + print "
$local $hostdom{$local} ($hostname{$local}; $hostrole{$local})
$domaindescription{$hostdom{$local}} -
+
ENDHEADER &login($local);&server($local);&users($local);&versions($local); &loncron($local);&lond($local);&lonc($local);&runloncron($local); - print "
"; + print "
$trouble
"; +# version + if ($host{$local.'_version'}) { + print "
Version: ".$host{$local.'_version'} + } # load if (($host{$local.'_load_doomed'}>0.5) || ($mode eq 'load_doomed')) { print "
Load: ".$host{$local.'_load'} @@ -200,6 +263,19 @@ ENDHEADER if ($host{$local.'_mysql'}) { print "
MySQL Database: ".$host{$local.'_mysql'} } +# connections + if ($host{$local.'_notconnected'}) { + print "
Not connected: "; + foreach (split(/ /,$host{$local.'_notconnected'})) { + if ($_) { + print " $_"; + } + } + } +# errors + if ($host{$local.'_errors'}) { + print "
loncron errors: ".$host{$local.'_errors'}; + } print "

"; } @@ -220,6 +296,260 @@ sub doomedness { return sort { $alldoomed{$b} <=> $alldoomed{$a} } @allhosts; } +sub resetvars { + $maxusers=0; + $maxload=0; + $totalusers=0; + $stat_total=0; + $stat_notyet=0; + $stat_fromcache=0; + + undef %host; + %host=(); +} + +sub mainloop { + &resetvars(); +# ==================================================== Main Loop over all Hosts + +foreach my $local (sort keys %hostname) { + $host{$local.'_unresponsive_doomed'}=0; +# -- Check general status + &statuslist($local,'General'); + my %loncron=&replyhash($local,'/lon-status/loncron_simple.txt',1200); + if (defined($loncron{'local_error'})) { + $host{$local.'_loncron'}='Could not determine.'; + $host{$local.'_unresponsive_doomed'}++; + } else { + if ((time-$loncron{'time'})>$oneday) { + $host{$local.'_loncron'}='Stale.'; + $host{$local.'_unresponsive_doomed'}++; + } else { + $host{$local.'_loncron_doomed'}=$loncron{'notices'} + +4*$loncron{'warnings'} + +100*$loncron{'errors'}; + $host{$local.'_errors'}=$loncron{'errors'}; + } + } +# -- Check version + &statuslist($local,'Version'); + my $version=&request($local,'/lon-status/version.txt',7200); + if ($version eq 'local_error') { + $host{$local.'_version'}='Could not determine.'; + $host{$local.'_unresponsive_doomed'}++; + } else { + $host{$local.'_version'}=$version; + } +# -- Check user status + &statuslist($local,'Users'); + my %userstatus=&replyhash($local,'/cgi-bin/userstatus.pl?simple',600); + if (defined($userstatus{'local_error'})) { + $host{$local.'_userstatus'}='Could not determine.'; + $host{$local.'_unresponsive_doomed'}++; + } else { + $host{$local.'_users_doomed'}=$userstatus{'Active'}; + $host{$local.'_users'}=$userstatus{'Active'}; + unless ($host{$local.'_users'}) { $host{$local.'_users'}=0; } + if ($host{$local.'_users'}>$maxusers) { + $maxusers=$host{$local.'_users'}; + } + $totalusers+=$host{$local.'_users'}; + my ($sload,$mload,$lload)=split(/ /,$userstatus{'loadavg'}); + $host{$local.'_load_doomed'}=$mload; + if ($mload>$maxload) { + $maxload=$mload; + } + $host{$local.'_load'}=$userstatus{'loadavg'}; + } +# -- Check mysql status + &statuslist($local,'Database'); + my %mysql=&replyhash($local,'/lon-status/mysql.txt',3600); + if (defined($mysql{'local_error'})) { + $host{$local.'_mysql'}='Could not determine.'; + $host{$local.'_unresponsive_doomed'}++; + } else { + if ((time-$mysql{'time'})>(7*$oneday)) { + if ($hostrole{$local} eq 'library') { + $host{$local.'_mysql'}='Stale.'; + $host{$local.'_mysql_doomed'}=1; + } + if ($mysql{'mysql'} eq 'defunct') { + $host{$local.'_mysql'}='Defunct (maybe stale).'; + $host{$local.'_mysql_doomed'}=2; + } + } elsif ($mysql{'mysql'} eq 'defunct') { + $host{$local.'_mysql'}='Defunct.'; + $host{$local.'_mysql_doomed'}=3; + } + } +# -- Check rpm status + &statuslist($local,'RPMs'); + my %checkrpms=&replyhash($local,'/lon-status/checkrpms.txt',7200); + if (defined($checkrpms{'local_error'})) { + $host{$local.'_checkrpms'}='Could not determine.'; + $host{$local.'_unresponsive_doomed'}++; + } else { + if ((time-$checkrpms{'time'})>(4*$oneday)) { + $host{$local.'_checkrpms'}='Stale.'; + $host{$local.'_checkrpms_doomed'}=50; + $host{$local.'_unresponsive_doomed'}++; + } elsif ($checkrpms{'status'} eq 'fail') { + $host{$local.'_checkrpms'}='Could not checked RPMs.'; + $host{$local.'_checkrpms_doomed'}=100; + } elsif ($checkrpms{'rpmcount'}) { + $host{$local.'_checkrpms'}='Outdated RPMs: '. + $checkrpms{'rpmcount'}; + $host{$local.'_checkrpms_doomed'}=$checkrpms{'rpmcount'}; + } + } +# -- Check connections + &statuslist($local,'Connections'); + $host{$local.'_notconnected'}=''; + $host{$local.'_notconnected_doomed'}=0; + foreach my $remote (sort keys %hostname) { + my $status=&connected($local,$remote); + $connectionstatus{$local.'_TO_'.$remote}=$status; + unless (($status eq 'ok') || ($status eq 'not_yet')) { + $host{$local.'_notconnected'}.=' '.$remote; + $host{$local.'_notconnected_doomed'}++; + } + } +# =============================================================== End Main Loop +} + +} + +sub reports { +# ====================================================================== Output + if ($mode=~/\_doomed$/) { +# Output by doomedness + foreach (&doomedness($mode)) { + &serverstatus($_); + } + } elsif ($mode eq 'connections') { + print + "". + ""; + foreach my $remote (sort keys %hostname) { + print ''; + } + print "\n"; +# connection matrix + foreach my $local (sort keys %hostname) { + print ''; + foreach my $remote (sort keys %hostname) { + if ($connectionstatus{$local.'_TO_'.$remote} eq 'not_yet') { + my $cellcolor='#FFFFFF'; + if ($local eq $remote) { $cellcolor='#DDDDDD'; } + print ''; + } elsif ($connectionstatus{$local.'_TO_'.$remote} eq 'ok') { + my $cellcolor='#BBDDBB'; + if ($local eq $remote) { $cellcolor='#99DD99'; } + print +''; + } else { + my $cellcolor='#DDBBBB'; + if ($connectionstatus{$local.'_TO_'.$remote} eq 'local_error') { + if ($local eq $remote) { + $cellcolor='#DD88AA'; + } else { + $cellcolor='#DDAACC'; + } + } else { + if ($local eq $remote) { $cellcolor='#DD9999'; } + } + print + ''; + } + } + print "\n"; + } + print "
 '.$remote.'
'.$local.'not yet testedok'. + $connectionstatus{$local.'_TO_'.$remote}.'
'; + &lonc($local); &lond($remote); + print '
"; + } elsif ($mode eq 'users') { +# Users + if ($maxusers) { + my $factor=50/$maxusers; + print "

Total active user(s): $totalusers

". + ""; + + foreach my $local (sort keys %hostname) { + if (defined($host{$local.'_users'})) { + print +'\n"; + } + } + print "
'.$local. + '
'. + $domaindescription{$hostdom{$local}}. + '
'; + &users($local); + print + ''. + $host{$local.'_users'}.'
"; + } else { + print "No active users logged in."; + } + } elsif ($mode eq 'load') { +# Load + if ($maxload) { + my $factor=50/$maxload; + print + ""; + foreach my $local (sort keys %hostname) { + if (defined($host{$local.'_load_doomed'})) { + print +'\n"; + } + } + print "
'. + $local. + '
'. + $domaindescription{$hostdom{$local}}. + '
'; + &server($local); + print + ''. + $host{$local.'_load_doomed'}.'
"; + } else { + print "No workload."; + } + } elsif ($mode eq 'trouble') { + my $count=0; + foreach my $local (sort keys %hostname) { + my $trouble=''; + if ($host{$local.'_unresponsive_doomed'}>3) { + $trouble='Does not respond to several queries.
'; + } + if ($host{$local.'_errors'}) { + $trouble='Has loncron errors.
'; + } elsif ($host{$local.'_loncron_doomed'}>600) { + $trouble='High loncron count.
'; + } + if ($host{$local.'_load_doomed'}>5) { + $trouble='High load.
'; + } + if ($host{$local.'_users_doomed'}>200) { + $trouble='High user volume.
'; + } + if ($host{$local.'_mysql_doomed'}>1) { + $trouble='MySQL database apparently offline.
'; + } + if ($host{$local.'_checkrpms_doomed'}>100) { + $trouble='RPMs outdated.
'; + } + if ($trouble) { $count++; &serverstatus($local,$trouble); } + } + unless ($count) { print "No mayor trouble."; } + } +} + # ====================================================================== Status sub statuslist { my ($local,$what)=@_; @@ -227,7 +557,8 @@ sub statuslist { "\n"; } -# +# ============================================================================= +# ============================================================================= # Main program # # ========================================================= Get form parameters @@ -256,7 +587,7 @@ foreach $pair (@pairs) { # ====================================================== Determine refresh rate -my $refresh=(($FORM{'refresh'}=~/^\d+$/)?$FORM{'refresh'}:60); +my $refresh=(($FORM{'refresh'}=~/^\d+$/)?$FORM{'refresh'}:30); if ($refresh<30) { $refresh=30; } my $starttime=time; @@ -271,13 +602,14 @@ my %modes=('trouble' => 'Trouble', 'load_doomed' => 'Doomed: Load', 'unresponsive_doomed' => 'Doomed: Status could not be determined', 'users' => 'User Report', + 'load' => 'Load Report', 'connections' => 'Connections Matrix'); $mode=$FORM{'mode'}; unless ($modes{$mode}) { $mode='trouble'; } # ================================================================ Send Headers print "Content-type: text/html\n\n". - "\n"; + "\n"; # -------------------- Read loncapa.conf (and by default, loncapa_apache.conf). my $perlvarref=LONCAPA::Configuration::read_conf('loncapa.conf'); %perlvar=%{$perlvarref}; @@ -289,7 +621,6 @@ delete $perlvar{'lonSqlAccess'}; # remov { my $config=IO::File->new("$perlvar{'lonTabDir'}/hosts.tab"); - $total=0; while (my $configline=<$config>) { $configline=~s/#.*$//; unless ($configline=~/\w/) { next; } @@ -298,7 +629,6 @@ delete $perlvar{'lonSqlAccess'}; # remov $hostdom{$id}=$domain; $hostrole{$id}=$role; $hostip{$id}=$ip; - $total++; if (($role eq 'library') && ($id ne $perlvar{'lonHostID'})) { $libserv{$id}=$name; } @@ -307,9 +637,6 @@ delete $perlvar{'lonSqlAccess'}; # remov # ------------------------------------------------------------ Read domain file { my $fh=IO::File->new($perlvar{'lonTabDir'}.'/domain.tab'); - %domaindescription = (); - %domain_auth_def = (); - %domain_auth_arg_def = (); if ($fh) { while (<$fh>) { next if (/^(\#|\s*$)/); @@ -323,8 +650,7 @@ delete $perlvar{'lonSqlAccess'}; # remov } } - -print "

Cluster Status ".localtime()."

"; +print "

LON-CAPA Cluster Status ".localtime()."

"; print "
\n". "
". "
\n";; @@ -332,107 +658,37 @@ print "
'; &hidden('refresh',$refresh); -# ==================================================== Main Loop over all Hosts - -foreach $local (sort keys %hostname) { - $host{$local.'_unresponsive_doomed'}=0; -# -- Check general status - &statuslist($local,'General'); - my %loncron=&replyhash($local,'/lon-status/loncron_simple.txt',1200); - if (defined($loncron{'local_error'})) { - $host{$local.'_loncron'}='Could not determine.'; - $host{$local.'_unresponsive_doomed'}++; - } else { - if ((time-$loncron{'time'})>$oneday) { - $host{$local.'_loncron'}='Stale.'; - $host{$local.'_unresponsive_doomed'}++; - } else { - } - } -# -- Check user status - &statuslist($local,'Users'); - my %userstatus=&replyhash($local,'/cgi-bin/userstatus.pl?simple',600); - if (defined($userstatus{'local_error'})) { - $host{$local.'_userstatus'}='Could not determine.'; - $host{$local.'_unresponsive_doomed'}++; - } else { - $host{$local.'_users_doomed'}=$userstatus{'Active'}; - $host{$local.'_users'}=$userstatus{'Active'}; - my ($sload,$mload,$lload)=split(/ /,$userstatus{'loadavg'}); - $host{$local.'_load_doomed'}=$mload; - $host{$local.'_load'}=$userstatus{'loadavg'}; - } -# -- Check mysql status - &statuslist($local,'Database'); - my %mysql=&replyhash($local,'/lon-status/mysql.txt',3600); - if (defined($mysql{'local_error'})) { - $host{$local.'_mysql'}='Could not determine.'; - $host{$local.'_unresponsive_doomed'}++; - } else { - if ((time-$mysql{'time'})>(7*$oneday)) { - if ($hostrole{$local} eq 'library') { - $host{$local.'_mysql'}='Stale.'; - $host{$local.'_mysql_doomed'}=1; - } - if ($mysql{'mysql'} eq 'defunct') { - $host{$local.'_mysql'}='Defunct (maybe stale).'; - $host{$local.'_mysql_doomed'}=2; - } - } elsif ($mysql{'mysql'} eq 'defunct') { - $host{$local.'_mysql'}='Defunct.'; - $host{$local.'_mysql_doomed'}=3; - } - } -# -- Check rpm status - &statuslist($local,'RPMs'); - my %checkrpms=&replyhash($local,'/lon-status/checkrpms.txt',7200); - if (defined($checkrpms{'local_error'})) { - $host{$local.'_checkrpms'}='Could not determine.'; - $host{$local.'_unresponsive_doomed'}++; + if (!$FORM{'runonetime'}) { + print + "

Gathering initial cluster data

This may take some time ...
"; + $fromcache=0; + &mainloop(); + &statuslist('Done initial run.'); + &reports(); } else { - if ((time-$checkrpms{'time'})>(4*$oneday)) { - $host{$local.'_checkrpms'}='Stale.'; - $host{$local.'_checkrpms_doomed'}=50; - $host{$local.'_unresponsive_doomed'}++; - } elsif ($checkrpms{'status'} eq 'fail') { - $host{$local.'_checkrpms'}='Could not checked RPMs.'; - $host{$local.'_checkrpms_doomed'}=100; - } elsif ($checkrpms{'rpmcount'}) { - $host{$local.'_checkrpms'}='Outdated RPMs: '. - $checkrpms{'rpmcount'}; - $host{$local.'_checkrpms_doomed'}=$checkrpms{'rpmcount'}; - } - } -# -- Check connections - &statuslist($local,'Connections'); - $host{$local.'_notconnected'}=''; - $host{$local.'_notconnected_doomed'}=0; - foreach $remote (sort keys %hostname) { - my $status=&connected($local,$remote); - $connectionstatus{$local.'_TO_'.$remote}=$status; - unless (($status eq 'ok') || ($status eq 'not_yet')) { - $host{$local.'_notconnected'}.=' '.$remote; - $host{$local.'_notconnected_doomed'}++; - } - } -# =============================================================== End Mail Loop -} -&statuslist('Done.'); -# ====================================================================== Output - if ($mode=~/\_doomed$/) { -# Output by doomedness - foreach (&doomedness($mode)) { - &serverstatus($_); - } - } + $fromcache=1; + &mainloop(); + &statuslist('Done gathering cached data'); + &reports(); + $fromcache=0; + &mainloop(); + } + &hidden('runonetime',1); +print '
Total number of queries: '.$stat_total. + '
Percent complete: '. + int(($stat_total-$stat_notyet)/$stat_total*100.). + '
Percent from cache: '. + int($stat_fromcache/$stat_total*100.).'
'; + # ============================================================== Close, refresh print "
"; exit 0;