--- loncom/cgi/clusterstatus.pl 2003/07/31 15:35:02 1.9
+++ loncom/cgi/clusterstatus.pl 2003/08/01 19:20:26 1.14
@@ -3,7 +3,7 @@ $|=1;
# The LearningOnline Network with CAPA
# Cluster Status
#
-# $Id: clusterstatus.pl,v 1.9 2003/07/31 15:35:02 www Exp $
+# $Id: clusterstatus.pl,v 1.14 2003/08/01 19:20:26 www Exp $
use lib '/home/httpd/lib/perl/';
use LONCAPA::Configuration;
@@ -19,6 +19,7 @@ my %connectionstatus=();
my %perlvar=();
my $mode;
+my $concount=0;
sub select_form {
my ($def,$name,%hash) = @_;
@@ -47,6 +48,7 @@ sub hidden {
sub request {
my ($local,$url,$cachetime)=@_;
+ $cachetime*=(0.5+rand);
my $key=&key($local,$url);
my $reply='';
if ($FORM{$key.'_time'}) {
@@ -61,7 +63,7 @@ sub request {
$reply='local_unknown';
} else {
- my $ua=new LWP::UserAgent(timeout => 20);
+ my $ua=new LWP::UserAgent(timeout => 10);
my $request=new HTTP::Request('GET',
"http://".$hostname{$local}.$url);
@@ -91,10 +93,15 @@ sub connected {
unless ($hostname{$remote}) { return 'remote_unknown'; }
my $url='/cgi-bin/ping.pl?'.$remote;
#
-# Slowly phase this in: if not cached, only do 10 percent of the cases
+# Slowly phase this in: if not cached, only do 5 percent of the cases,
+# but always do the first five.
#
unless ($FORM{&key($local,$url)}) {
- unless (rand>0.9) { return 'not_yet'; }
+# $concount starts at 0, so "<5" lets exactly the first five uncached probes
+# through unconditionally; after that only about 5 percent are attempted.
+ if (($concount<5) || (rand>0.95)) {
+ $concount++;
+ } else {
+ return 'not_yet';
+ }
}
#
# Actually do the query
@@ -169,20 +176,36 @@ sub server {
print &otherwindow($local,'/server-status','Server Status');
}
+# ========================================================= Produce a green bar
+# Prints the opening string, then one "+" for each unit of the argument after
+# rounding with int(value+0.5), then the closing string. Output only.
+sub bar {
+ my ($value)=@_;
+ my $ticks=int($value+0.5);
+ print "
";
+ print "+" for (1..$ticks);
+ print " |
";
+}
+
# ========================================================== Show server status
sub serverstatus {
- my $local=shift;
+ my ($local,$trouble)=@_;
print (<
$local $hostdom{$local} ($hostname{$local}; $hostrole{$local})
$domaindescription{$hostdom{$local}}
- |
+ |
ENDHEADER
&login($local);&server($local);&users($local);&versions($local);
&loncron($local);&lond($local);&lonc($local);&runloncron($local);
- print " |
";
+ print " |
";
+ if ($trouble) {
+ print ("$trouble |
");
+ }
+ print "";
# load
if (($host{$local.'_load_doomed'}>0.5) || ($mode eq 'load_doomed')) {
print " Load: ".$host{$local.'_load'}
@@ -200,6 +223,19 @@ ENDHEADER
if ($host{$local.'_mysql'}) {
print " MySQL Database: ".$host{$local.'_mysql'}
}
+# connections
+ if ($host{$local.'_notconnected'}) {
+ print " Not connected: ";
+ foreach (split(/ /,$host{$local.'_notconnected'})) {
+ if ($_) {
+ print " $_";
+ }
+ }
+ }
+# errors
+ if ($host{$local.'_errors'}) {
+ print " loncron errors: ".$host{$local.'_errors'};
+ }
print " |
";
}
@@ -256,7 +292,7 @@ foreach $pair (@pairs) {
# ====================================================== Determine refresh rate
-my $refresh=(($FORM{'refresh'}=~/^\d+$/)?$FORM{'refresh'}:60);
+my $refresh=(($FORM{'refresh'}=~/^\d+$/)?$FORM{'refresh'}:120);
if ($refresh<30) { $refresh=30; }
my $starttime=time;
@@ -271,13 +307,14 @@ my %modes=('trouble' => 'Trouble',
'load_doomed' => 'Doomed: Load',
'unresponsive_doomed' => 'Doomed: Status could not be determined',
'users' => 'User Report',
+ 'load' => 'Load Report',
'connections' => 'Connections Matrix');
$mode=$FORM{'mode'};
unless ($modes{$mode}) { $mode='trouble'; }
# ================================================================ Send Headers
print "Content-type: text/html\n\n".
- "\n";
+ "\n";
# -------------------- Read loncapa.conf (and by default, loncapa_apache.conf).
my $perlvarref=LONCAPA::Configuration::read_conf('loncapa.conf');
%perlvar=%{$perlvarref};
@@ -323,8 +360,7 @@ delete $perlvar{'lonSqlAccess'}; # remov
}
}
-
-print "Cluster Status ".localtime()."
";
+print "LON-CAPA Cluster Status ".localtime()."
";
print "\n";;
@@ -334,6 +370,10 @@ print 'Choose next report: '.&select_for
# ==================================================== Main Loop over all Hosts
+my $maxusers=0;
+my $maxload=0;
+my $totalusers=0;
+
foreach $local (sort keys %hostname) {
$host{$local.'_unresponsive_doomed'}=0;
# -- Check general status
@@ -347,6 +387,10 @@ foreach $local (sort keys %hostname) {
$host{$local.'_loncron'}='Stale.';
$host{$local.'_unresponsive_doomed'}++;
} else {
+ $host{$local.'_loncron_doomed'}=$loncron{'notices'}
+ +4*$loncron{'warnings'}
+ +100*$loncron{'errors'};
+ $host{$local.'_errors'}=$loncron{'errors'};
}
}
# -- Check user status
@@ -358,8 +402,16 @@ foreach $local (sort keys %hostname) {
} else {
$host{$local.'_users_doomed'}=$userstatus{'Active'};
$host{$local.'_users'}=$userstatus{'Active'};
+ unless ($host{$local.'_users'}) { $host{$local.'_users'}=0; }
+ if ($host{$local.'_users'}>$maxusers) {
+ $maxusers=$host{$local.'_users'};
+ }
+ $totalusers+=$host{$local.'_users'};
my ($sload,$mload,$lload)=split(/ /,$userstatus{'loadavg'});
$host{$local.'_load_doomed'}=$mload;
+ if ($mload>$maxload) {
+ $maxload=$mload;
+ }
$host{$local.'_load'}=$userstatus{'loadavg'};
}
# -- Check mysql status
@@ -424,15 +476,131 @@ foreach $local (sort keys %hostname) {
foreach (&doomedness($mode)) {
&serverstatus($_);
}
+ } elsif ($mode eq 'connections') {
+ print
+ "".
+ " | ";
+ foreach my $remote (sort keys %hostname) {
+ print ''.$remote.' | ';
+ }
+ print "
\n";
+# connection matrix
+ foreach my $local (sort keys %hostname) {
+ print ''.$local.' | ';
+ foreach my $remote (sort keys %hostname) {
+ if ($connectionstatus{$local.'_TO_'.$remote} eq 'not_yet') {
+ my $cellcolor='#FFFFFF';
+ if ($local eq $remote) { $cellcolor='#DDDDDD'; }
+ print 'not yet tested | ';
+ } elsif ($connectionstatus{$local.'_TO_'.$remote} eq 'ok') {
+ my $cellcolor='#BBDDBB';
+ if ($local eq $remote) { $cellcolor='#99DD99'; }
+ print
+'ok | ';
+ } else {
+ my $cellcolor='#DDBBBB';
+ if ($connectionstatus{$local.'_TO_'.$remote} eq 'local_error') {
+ if ($local eq $remote) {
+ $cellcolor='#DD88AA';
+ } else {
+ $cellcolor='#DDAACC';
+ }
+ } else {
+ if ($local eq $remote) { $cellcolor='#DD9999'; }
+ }
+ print
+ ''.
+ $connectionstatus{$local.'_TO_'.$remote}.' ';
+ &lonc($local); &lond($remote);
+ print ' | ';
+ }
+ }
+ print "
\n";
+ }
+ print "
";
+ } elsif ($mode eq 'users') {
+# Users
+ if ($maxusers) {
+ my $factor=50/$maxusers;
+ print "Total active user(s): $totalusers
".
+ "";
+
+ foreach $local (sort keys %hostname) {
+ if (defined($host{$local.'_users'})) {
+ print
+''.$local.
+ ' | ';
+ &users($local);
+ print
+ ' | '.
+ $host{$local.'_users'}.' | |
\n";
+ }
+ }
+ print "
";
+ } else {
+ print "No active users logged in.";
+ }
+ } elsif ($mode eq 'load') {
+# Load
+ if ($maxload) {
+ my $factor=50/$maxload;
+ print
+ "";
+ foreach $local (sort keys %hostname) {
+ if (defined($host{$local.'_load_doomed'})) {
+ print
+''.
+ $local.
+ ' | ';
+ &server($local);
+ print
+ ' | '.
+ $host{$local.'_load_doomed'}.' | |
\n";
+ }
+ }
+ print "
";
+ } else {
+ print "No workload.";
+ }
+ } elsif ($mode eq 'trouble') {
+# Trouble report: list every host that exceeds at least one threshold.
+# Each message carries its own trailing separator, so messages are appended;
+# a host with several problems then shows all of them, not only the last.
+ my $count=0;
+ foreach $local (sort keys %hostname) {
+ my $trouble='';
+# loncron: explicit errors take precedence over a merely high weighted count
+ if ($host{$local.'_errors'}) {
+ $trouble.='Has loncron errors.
';
+ } elsif ($host{$local.'_loncron_doomed'}>600) {
+ $trouble.='High loncron count.
';
+ }
+ if ($host{$local.'_load_doomed'}>5) {
+ $trouble.='High load.
';
+ }
+ if ($host{$local.'_users_doomed'}>200) {
+ $trouble.='High user volume.
';
+ }
+ if ($host{$local.'_mysql_doomed'}>1) {
+ $trouble.='MySQL database apparently offline.
';
+ }
+ if ($host{$local.'_checkrpms_doomed'}>100) {
+ $trouble.='RPMs outdated.
';
+ }
+# Only hosts with at least one message are shown and counted
+ if ($trouble) { $count++; &serverstatus($local,$trouble); }
+ }
+ unless ($count) { print "No major trouble."; }
}
# ============================================================== Close, refresh
print "";
exit 0;