--- loncom/cgi/clusterstatus.pl 2003/07/31 15:35:02 1.9
+++ loncom/cgi/clusterstatus.pl 2003/08/05 18:33:08 1.16
@@ -3,10 +3,11 @@ $|=1;
# The LearningOnline Network with CAPA
# Cluster Status
#
-# $Id: clusterstatus.pl,v 1.9 2003/07/31 15:35:02 www Exp $
+# $Id: clusterstatus.pl,v 1.16 2003/08/05 18:33:08 www Exp $
use lib '/home/httpd/lib/perl/';
use LONCAPA::Configuration;
+use strict;
use LWP::UserAgent();
use HTTP::Headers;
@@ -19,6 +20,28 @@ my %connectionstatus=();
my %perlvar=();
my $mode;
+my $concount=0;
+my $fromcache;
+
+my %domaindescription = ();
+my %domain_auth_def = ();
+my %domain_auth_arg_def = ();
+
+my %hostname=();
+my %hostip=();
+my %hostdom=();
+my %hostrole=();
+my %libserv=();
+
+my $maxusers=0;
+my $maxload=0;
+my $totalusers=0;
+
+my %FORM=();
+
+my $stat_total=0;
+my $stat_notyet=0;
+my $stat_fromcache=0;
sub select_form {
my ($def,$name,%hash) = @_;
@@ -47,13 +70,26 @@ sub hidden {
sub request {
my ($local,$url,$cachetime)=@_;
+ $cachetime*=(0.5+rand);
my $key=&key($local,$url);
my $reply='';
+ $stat_total++;
+# if fromcache flag is set, only return cached values
+# (the statistics counters are bumped before returning; placed after the
+# return they would never be executed and the summary would stay at zero)
+    if ($fromcache) {
+        if ($FORM{$key.'_time'}) {
+            $stat_fromcache++;
+            return $FORM{$key};
+        } else {
+            $stat_notyet++;
+            return 'not_yet';
+        }
+    }
+# normal mode, refresh when expired or not yet present
if ($FORM{$key.'_time'}) {
if ((time-$FORM{$key.'_time'})<$cachetime) {
$reply=$FORM{$key};
&hidden($key.'_time',$FORM{$key.'_time'});
- &hidden($key.'_fromcache',1);
+ $stat_fromcache++;
}
}
unless ($reply) {
@@ -61,7 +97,7 @@ sub request {
$reply='local_unknown';
} else {
- my $ua=new LWP::UserAgent(timeout => 20);
+ my $ua=new LWP::UserAgent(timeout => 10);
my $request=new HTTP::Request('GET',
"http://".$hostname{$local}.$url);
@@ -91,10 +127,17 @@ sub connected {
unless ($hostname{$remote}) { return 'remote_unknown'; }
my $url='/cgi-bin/ping.pl?'.$remote;
#
-# Slowly phase this in: if not cached, only do 10 percent of the cases
+# Slowly phase this in: if not cached, only do 5 percent of the cases,
+# but always do the first five.
#
unless ($FORM{&key($local,$url)}) {
- unless (rand>0.9) { return 'not_yet'; }
+# $concount is bumped once per admitted uncached query, so "<5" admits
+# exactly five of them; after that only about 5 percent are let through
+        if (($concount<5) || (rand>0.95)) {
+            $concount++;
+        } else {
+            $stat_total++;
+            $stat_notyet++;
+            return 'not_yet';
+        }
}
#
# Actually do the query
@@ -169,20 +212,40 @@ sub server {
print &otherwindow($local,'/server-status','Server Status');
}
+# ========================================================= Produce a green bar
+# Takes a bar length, rounds it (add 0.5, then truncate) and prints that many
+# "+" characters between the opening and the closing output fragment.
+sub bar {
+    my ($length)=@_;
+    my $plusses=int($length+0.5);
+    print "
";
+    if ($plusses>0) {
+        print '+' x $plusses;
+    }
+    print " |
";
+}
+
# ========================================================== Show server status
sub serverstatus {
- my $local=shift;
+ my ($local,$trouble)=@_;
print (<
$local $hostdom{$local} ($hostname{$local}; $hostrole{$local})
$domaindescription{$hostdom{$local}}
- |
+ |
ENDHEADER
&login($local);&server($local);&users($local);&versions($local);
&loncron($local);&lond($local);&lonc($local);&runloncron($local);
- print " |
";
+ print " |
";
+ if ($trouble) {
+ print ("$trouble |
");
+ }
+ print "";
+# version
+ if ($host{$local.'_version'}) {
+ print " Version: ".$host{$local.'_version'}
+ }
# load
if (($host{$local.'_load_doomed'}>0.5) || ($mode eq 'load_doomed')) {
print " Load: ".$host{$local.'_load'}
@@ -200,6 +263,19 @@ ENDHEADER
if ($host{$local.'_mysql'}) {
print " MySQL Database: ".$host{$local.'_mysql'}
}
+# connections
+ if ($host{$local.'_notconnected'}) {
+ print " Not connected: ";
+ foreach (split(/ /,$host{$local.'_notconnected'})) {
+ if ($_) {
+ print " $_";
+ }
+ }
+ }
+# errors
+ if ($host{$local.'_errors'}) {
+ print " loncron errors: ".$host{$local.'_errors'};
+ }
print " |
";
}
@@ -220,6 +296,260 @@ sub doomedness {
return sort { $alldoomed{$b} <=> $alldoomed{$a} } @allhosts;
}
+# Reset all per-pass state so that every call of mainloop starts from scratch:
+# user/load maxima and totals, query statistics, the allowance of guaranteed
+# connection tests, and the collected host data.
+sub resetvars {
+    $maxusers=0;
+    $maxload=0;
+    $totalusers=0;
+    $stat_total=0;
+    $stat_notyet=0;
+    $stat_fromcache=0;
+# connected bumps $concount for every admitted uncached query, also during a
+# cache-only pass; without this reset that pass would use up the allowance
+# and the live pass that follows would be left with the random share only
+    $concount=0;
+
+    undef %host;
+    %host=();
+}
+
+# Query every host listed in %hostname and collect the results.
+# For each host the loncron summary, version, user/load status, MySQL status,
+# RPM status and the connection to every other host are requested (through
+# replyhash, request and connected). Results are stored in %host under keys
+# of the form "<hostid>_<item>", connection results in %connectionstatus;
+# $maxusers, $maxload and $totalusers are updated along the way.
+# Keys ending in "_doomed" hold numeric scores, and "_unresponsive_doomed"
+# counts the queries that failed or returned stale data.
+sub mainloop {
+    &resetvars();
+# ==================================================== Main Loop over all Hosts
+
+    foreach my $local (sort keys %hostname) {
+        $host{$local.'_unresponsive_doomed'}=0;
+# -- Check general status
+        &statuslist($local,'General');
+        my %loncron=&replyhash($local,'/lon-status/loncron_simple.txt',1200);
+        if (defined($loncron{'local_error'})) {
+            $host{$local.'_loncron'}='Could not determine.';
+            $host{$local.'_unresponsive_doomed'}++;
+        } else {
+            if ((time-$loncron{'time'})>$oneday) {
+                $host{$local.'_loncron'}='Stale.';
+                $host{$local.'_unresponsive_doomed'}++;
+            } else {
+# weighted score: notices count once, warnings 4 times, errors 100 times
+                $host{$local.'_loncron_doomed'}=$loncron{'notices'}
+                    +4*$loncron{'warnings'}
+                    +100*$loncron{'errors'};
+                $host{$local.'_errors'}=$loncron{'errors'};
+            }
+        }
+# -- Check version
+        &statuslist($local,'Version');
+        my $version=&request($local,'/lon-status/version.txt',7200);
+        if ($version eq 'local_error') {
+            $host{$local.'_version'}='Could not determine.';
+            $host{$local.'_unresponsive_doomed'}++;
+        } else {
+            $host{$local.'_version'}=$version;
+        }
+# -- Check user status
+        &statuslist($local,'Users');
+        my %userstatus=&replyhash($local,'/cgi-bin/userstatus.pl?simple',600);
+        if (defined($userstatus{'local_error'})) {
+            $host{$local.'_userstatus'}='Could not determine.';
+            $host{$local.'_unresponsive_doomed'}++;
+        } else {
+            $host{$local.'_users_doomed'}=$userstatus{'Active'};
+            $host{$local.'_users'}=$userstatus{'Active'};
+            unless ($host{$local.'_users'}) { $host{$local.'_users'}=0; }
+            if ($host{$local.'_users'}>$maxusers) {
+                $maxusers=$host{$local.'_users'};
+            }
+            $totalusers+=$host{$local.'_users'};
+# loadavg is split on blanks; the second field is used as the load score
+            my ($sload,$mload,$lload)=split(/ /,$userstatus{'loadavg'});
+            $host{$local.'_load_doomed'}=$mload;
+            if ($mload>$maxload) {
+                $maxload=$mload;
+            }
+            $host{$local.'_load'}=$userstatus{'loadavg'};
+        }
+# -- Check mysql status
+        &statuslist($local,'Database');
+        my %mysql=&replyhash($local,'/lon-status/mysql.txt',3600);
+        if (defined($mysql{'local_error'})) {
+            $host{$local.'_mysql'}='Could not determine.';
+            $host{$local.'_unresponsive_doomed'}++;
+        } else {
+            if ((time-$mysql{'time'})>(7*$oneday)) {
+# old data: only reported as stale for library servers
+                if ($hostrole{$local} eq 'library') {
+                    $host{$local.'_mysql'}='Stale.';
+                    $host{$local.'_mysql_doomed'}=1;
+                }
+                if ($mysql{'mysql'} eq 'defunct') {
+                    $host{$local.'_mysql'}='Defunct (maybe stale).';
+                    $host{$local.'_mysql_doomed'}=2;
+                }
+            } elsif ($mysql{'mysql'} eq 'defunct') {
+                $host{$local.'_mysql'}='Defunct.';
+                $host{$local.'_mysql_doomed'}=3;
+            }
+        }
+# -- Check rpm status
+        &statuslist($local,'RPMs');
+        my %checkrpms=&replyhash($local,'/lon-status/checkrpms.txt',7200);
+        if (defined($checkrpms{'local_error'})) {
+            $host{$local.'_checkrpms'}='Could not determine.';
+            $host{$local.'_unresponsive_doomed'}++;
+        } else {
+            if ((time-$checkrpms{'time'})>(4*$oneday)) {
+                $host{$local.'_checkrpms'}='Stale.';
+                $host{$local.'_checkrpms_doomed'}=50;
+                $host{$local.'_unresponsive_doomed'}++;
+            } elsif ($checkrpms{'status'} eq 'fail') {
+                $host{$local.'_checkrpms'}='Could not check RPMs.';
+                $host{$local.'_checkrpms_doomed'}=100;
+            } elsif ($checkrpms{'rpmcount'}) {
+                $host{$local.'_checkrpms'}='Outdated RPMs: '.
+                    $checkrpms{'rpmcount'};
+                $host{$local.'_checkrpms_doomed'}=$checkrpms{'rpmcount'};
+            }
+        }
+# -- Check connections
+        &statuslist($local,'Connections');
+        $host{$local.'_notconnected'}='';
+        $host{$local.'_notconnected_doomed'}=0;
+        foreach my $remote (sort keys %hostname) {
+            my $status=&connected($local,$remote);
+            $connectionstatus{$local.'_TO_'.$remote}=$status;
+# everything but "ok" and "not_yet" counts as a missing connection
+            unless (($status eq 'ok') || ($status eq 'not_yet')) {
+                $host{$local.'_notconnected'}.=' '.$remote;
+                $host{$local.'_notconnected_doomed'}++;
+            }
+        }
+# =============================================================== End Main Loop
+    }
+
+}
+
+# Print the report selected by $mode from the data gathered by mainloop:
+# any mode ending in "_doomed" lists all hosts in the order returned by
+# doomedness, "connections" prints the host-to-host matrix from
+# %connectionstatus, "users" and "load" print one row per host with data, and
+# "trouble" calls serverstatus for every host with at least one problem.
+sub reports {
+# ====================================================================== Output
+ if ($mode=~/\_doomed$/) {
+# Output by doomedness
+ foreach (&doomedness($mode)) {
+ &serverstatus($_);
+ }
+ } elsif ($mode eq 'connections') {
+# header row: one column per remote host
+ print
+ "".
+ " | ";
+ foreach my $remote (sort keys %hostname) {
+ print ''.$remote.' | ';
+ }
+ print "
\n";
+# connection matrix
+ foreach my $local (sort keys %hostname) {
+ print ''.$local.' | ';
+ foreach my $remote (sort keys %hostname) {
+ if ($connectionstatus{$local.'_TO_'.$remote} eq 'not_yet') {
+ my $cellcolor='#FFFFFF';
+ if ($local eq $remote) { $cellcolor='#DDDDDD'; }
+ print 'not yet tested | ';
+ } elsif ($connectionstatus{$local.'_TO_'.$remote} eq 'ok') {
+ my $cellcolor='#BBDDBB';
+ if ($local eq $remote) { $cellcolor='#99DD99'; }
+ print
+'ok | ';
+ } else {
+# any other status is a problem; local_error gets its own colors
+ my $cellcolor='#DDBBBB';
+ if ($connectionstatus{$local.'_TO_'.$remote} eq 'local_error') {
+ if ($local eq $remote) {
+ $cellcolor='#DD88AA';
+ } else {
+ $cellcolor='#DDAACC';
+ }
+ } else {
+ if ($local eq $remote) { $cellcolor='#DD9999'; }
+ }
+ print
+ ''.
+ $connectionstatus{$local.'_TO_'.$remote}.' ';
+ &lonc($local); &lond($remote);
+ print ' | ';
+ }
+ }
+ print "
\n";
+ }
+ print "
";
+ } elsif ($mode eq 'users') {
+# Users
+ if ($maxusers) {
+ my $factor=50/$maxusers;
+ print "Total active user(s): $totalusers
".
+ "";
+
+ foreach my $local (sort keys %hostname) {
+ if (defined($host{$local.'_users'})) {
+ print
+''.$local.
+ ' '.
+ $domaindescription{$hostdom{$local}}.
+ ' | ';
+ &users($local);
+ print
+ ' | '.
+ $host{$local.'_users'}.' | |
\n";
+ }
+ }
+ print "
";
+ } else {
+ print "No active users logged in.";
+ }
+ } elsif ($mode eq 'load') {
+# Load
+ if ($maxload) {
+ my $factor=50/$maxload;
+ print
+ "";
+ foreach my $local (sort keys %hostname) {
+ if (defined($host{$local.'_load_doomed'})) {
+ print
+''.
+ $local.
+ ' '.
+ $domaindescription{$hostdom{$local}}.
+ ' | ';
+ &server($local);
+ print
+ ' | '.
+ $host{$local.'_load_doomed'}.' | |
\n";
+ }
+ }
+ print "
";
+ } else {
+ print "No workload.";
+ }
+ } elsif ($mode eq 'trouble') {
+# Trouble: collect every problem found for a host; the messages are
+# appended so that a later finding does not replace an earlier one
+ my $count=0;
+ foreach my $local (sort keys %hostname) {
+ my $trouble='';
+ if ($host{$local.'_unresponsive_doomed'}>3) {
+ $trouble.='Does not respond to several queries.
';
+ }
+ if ($host{$local.'_errors'}) {
+ $trouble.='Has loncron errors.
';
+ } elsif ($host{$local.'_loncron_doomed'}>600) {
+ $trouble.='High loncron count.
';
+ }
+ if ($host{$local.'_load_doomed'}>5) {
+ $trouble.='High load.
';
+ }
+ if ($host{$local.'_users_doomed'}>200) {
+ $trouble.='High user volume.
';
+ }
+ if ($host{$local.'_mysql_doomed'}>1) {
+ $trouble.='MySQL database apparently offline.
';
+ }
+ if ($host{$local.'_checkrpms_doomed'}>100) {
+ $trouble.='RPMs outdated.
';
+ }
+ if ($trouble) { $count++; &serverstatus($local,$trouble); }
+ }
+ unless ($count) { print "No major trouble."; }
+ }
+}
+
# ====================================================================== Status
sub statuslist {
my ($local,$what)=@_;
@@ -227,7 +557,8 @@ sub statuslist {
"\n";
}
-#
+# =============================================================================
+# =============================================================================
# Main program
#
# ========================================================= Get form parameters
@@ -256,7 +587,7 @@ foreach $pair (@pairs) {
# ====================================================== Determine refresh rate
-my $refresh=(($FORM{'refresh'}=~/^\d+$/)?$FORM{'refresh'}:60);
+my $refresh=(($FORM{'refresh'}=~/^\d+$/)?$FORM{'refresh'}:30);
if ($refresh<30) { $refresh=30; }
my $starttime=time;
@@ -271,13 +602,14 @@ my %modes=('trouble' => 'Trouble',
'load_doomed' => 'Doomed: Load',
'unresponsive_doomed' => 'Doomed: Status could not be determined',
'users' => 'User Report',
+ 'load' => 'Load Report',
'connections' => 'Connections Matrix');
$mode=$FORM{'mode'};
unless ($modes{$mode}) { $mode='trouble'; }
# ================================================================ Send Headers
print "Content-type: text/html\n\n".
- "\n";
+ "\n";
# -------------------- Read loncapa.conf (and by default, loncapa_apache.conf).
my $perlvarref=LONCAPA::Configuration::read_conf('loncapa.conf');
%perlvar=%{$perlvarref};
@@ -289,7 +621,6 @@ delete $perlvar{'lonSqlAccess'}; # remov
{
my $config=IO::File->new("$perlvar{'lonTabDir'}/hosts.tab");
- $total=0;
while (my $configline=<$config>) {
$configline=~s/#.*$//;
unless ($configline=~/\w/) { next; }
@@ -298,7 +629,6 @@ delete $perlvar{'lonSqlAccess'}; # remov
$hostdom{$id}=$domain;
$hostrole{$id}=$role;
$hostip{$id}=$ip;
- $total++;
if (($role eq 'library') && ($id ne $perlvar{'lonHostID'})) {
$libserv{$id}=$name;
}
@@ -307,9 +637,6 @@ delete $perlvar{'lonSqlAccess'}; # remov
# ------------------------------------------------------------ Read domain file
{
my $fh=IO::File->new($perlvar{'lonTabDir'}.'/domain.tab');
- %domaindescription = ();
- %domain_auth_def = ();
- %domain_auth_arg_def = ();
if ($fh) {
while (<$fh>) {
next if (/^(\#|\s*$)/);
@@ -323,8 +650,7 @@ delete $perlvar{'lonSqlAccess'}; # remov
}
}
-
-print "Cluster Status ".localtime()."
";
+print "LON-CAPA Cluster Status ".localtime()."
";
print "\n";;
@@ -332,107 +658,37 @@ print "';
&hidden('refresh',$refresh);
-# ==================================================== Main Loop over all Hosts
-
-foreach $local (sort keys %hostname) {
- $host{$local.'_unresponsive_doomed'}=0;
-# -- Check general status
- &statuslist($local,'General');
- my %loncron=&replyhash($local,'/lon-status/loncron_simple.txt',1200);
- if (defined($loncron{'local_error'})) {
- $host{$local.'_loncron'}='Could not determine.';
- $host{$local.'_unresponsive_doomed'}++;
- } else {
- if ((time-$loncron{'time'})>$oneday) {
- $host{$local.'_loncron'}='Stale.';
- $host{$local.'_unresponsive_doomed'}++;
- } else {
- }
- }
-# -- Check user status
- &statuslist($local,'Users');
- my %userstatus=&replyhash($local,'/cgi-bin/userstatus.pl?simple',600);
- if (defined($userstatus{'local_error'})) {
- $host{$local.'_userstatus'}='Could not determine.';
- $host{$local.'_unresponsive_doomed'}++;
- } else {
- $host{$local.'_users_doomed'}=$userstatus{'Active'};
- $host{$local.'_users'}=$userstatus{'Active'};
- my ($sload,$mload,$lload)=split(/ /,$userstatus{'loadavg'});
- $host{$local.'_load_doomed'}=$mload;
- $host{$local.'_load'}=$userstatus{'loadavg'};
- }
-# -- Check mysql status
- &statuslist($local,'Database');
- my %mysql=&replyhash($local,'/lon-status/mysql.txt',3600);
- if (defined($mysql{'local_error'})) {
- $host{$local.'_mysql'}='Could not determine.';
- $host{$local.'_unresponsive_doomed'}++;
- } else {
- if ((time-$mysql{'time'})>(7*$oneday)) {
- if ($hostrole{$local} eq 'library') {
- $host{$local.'_mysql'}='Stale.';
- $host{$local.'_mysql_doomed'}=1;
- }
- if ($mysql{'mysql'} eq 'defunct') {
- $host{$local.'_mysql'}='Defunct (maybe stale).';
- $host{$local.'_mysql_doomed'}=2;
- }
- } elsif ($mysql{'mysql'} eq 'defunct') {
- $host{$local.'_mysql'}='Defunct.';
- $host{$local.'_mysql_doomed'}=3;
- }
- }
-# -- Check rpm status
- &statuslist($local,'RPMs');
- my %checkrpms=&replyhash($local,'/lon-status/checkrpms.txt',7200);
- if (defined($checkrpms{'local_error'})) {
- $host{$local.'_checkrpms'}='Could not determine.';
- $host{$local.'_unresponsive_doomed'}++;
+ if (!$FORM{'runonetime'}) {
+ print
+ "Gathering initial cluster data
This may take some time ...
";
+ $fromcache=0;
+ &mainloop();
+ &statuslist('Done initial run.');
+ &reports();
} else {
- if ((time-$checkrpms{'time'})>(4*$oneday)) {
- $host{$local.'_checkrpms'}='Stale.';
- $host{$local.'_checkrpms_doomed'}=50;
- $host{$local.'_unresponsive_doomed'}++;
- } elsif ($checkrpms{'status'} eq 'fail') {
- $host{$local.'_checkrpms'}='Could not checked RPMs.';
- $host{$local.'_checkrpms_doomed'}=100;
- } elsif ($checkrpms{'rpmcount'}) {
- $host{$local.'_checkrpms'}='Outdated RPMs: '.
- $checkrpms{'rpmcount'};
- $host{$local.'_checkrpms_doomed'}=$checkrpms{'rpmcount'};
- }
- }
-# -- Check connections
- &statuslist($local,'Connections');
- $host{$local.'_notconnected'}='';
- $host{$local.'_notconnected_doomed'}=0;
- foreach $remote (sort keys %hostname) {
- my $status=&connected($local,$remote);
- $connectionstatus{$local.'_TO_'.$remote}=$status;
- unless (($status eq 'ok') || ($status eq 'not_yet')) {
- $host{$local.'_notconnected'}.=' '.$remote;
- $host{$local.'_notconnected_doomed'}++;
- }
- }
-# =============================================================== End Mail Loop
-}
-&statuslist('Done.');
-# ====================================================================== Output
- if ($mode=~/\_doomed$/) {
-# Output by doomedness
- foreach (&doomedness($mode)) {
- &serverstatus($_);
- }
- }
+ $fromcache=1;
+ &mainloop();
+ &statuslist('Done gathering cached data');
+ &reports();
+ $fromcache=0;
+ &mainloop();
+ }
+ &hidden('runonetime',1);
+# Query statistics of the last mainloop pass. If no query was made at all
+# (for example no hosts were read), divide by 1 instead of 0 so that the
+# script reports 0 percent rather than dying on a division by zero.
+my $stat_divisor=($stat_total||1);
+print '
Total number of queries: '.$stat_total.
+ '
Percent complete: '.
+ int(($stat_total-$stat_notyet)/$stat_divisor*100.).
+ '
Percent from cache: '.
+ int($stat_fromcache/$stat_divisor*100.).'';
+
# ============================================================== Close, refresh
print "";
exit 0;