--- loncom/cgi/clusterstatus.pl 2003/02/03 18:03:52 1.6 +++ loncom/cgi/clusterstatus.pl 2003/07/31 19:18:16 1.11 @@ -2,10 +2,8 @@ $|=1; # The LearningOnline Network with CAPA # Cluster Status -# (Versions -# (Running loncron -# 09/06/01 Gerd Kortemeyer) -# 02/18/02,02/19/02 Gerd Kortemeyer) +# +# $Id: clusterstatus.pl,v 1.11 2003/07/31 19:18:16 www Exp $ use lib '/home/httpd/lib/perl/'; use LONCAPA::Configuration; @@ -13,51 +11,306 @@ use LONCAPA::Configuration; use LWP::UserAgent(); use HTTP::Headers; use IO::File; -use Net::Ping; -sub online { - my $host=shift; - return 1; -# ping is broken - my $p=Net::Ping->new("tcp",10); - my $online=$p->ping("$host"); - $p->close(); - undef ($p); - return $online; +my %host=(); +my $oneday=60*60*24; + +my %connectionstatus=(); +my %perlvar=(); + +my $mode; + +sub select_form { + my ($def,$name,%hash) = @_; + my $selectform = ""; + return $selectform; +} + + +sub key { + my ($local,$url)=@_; + my $key=$local.'_'.$url; + $key=~s/\W/\_/gs; + return $key; +} + +sub hidden { + my ($name,$value)=@_; + print "\n"; +} + +sub request { + my ($local,$url,$cachetime)=@_; + my $key=&key($local,$url); + my $reply=''; + if ($FORM{$key.'_time'}) { + if ((time-$FORM{$key.'_time'})<$cachetime) { + $reply=$FORM{$key}; + &hidden($key.'_time',$FORM{$key.'_time'}); + &hidden($key.'_fromcache',1); + } + } + unless ($reply) { + unless ($hostname{$local}) { + $reply='local_unknown'; + } else { + + my $ua=new LWP::UserAgent(timeout => 20); + + my $request=new HTTP::Request('GET', + "http://".$hostname{$local}.$url); + $request->authorization_basic('lonadm','litelite'); + + my $response=$ua->request($request); + + unless ($response->is_success) { + $reply='local_error'; + } else { + $reply=$response->content; + chomp($reply); + } + } + &hidden($key.'_time',time); + } + &hidden($key,$reply); + return $reply; } +# ============================================= Are local and remote connected? sub connected { my ($local,$remote)=@_; $local=~s/\W//g; $remote=~s/\W//g; - unless ($hostname{$local}) { return 'local_unknown'; } unless ($hostname{$remote}) { return 'remote_unknown'; } + my $url='/cgi-bin/ping.pl?'.$remote; +# +# Slowly phase this in: if not cached, only do 10 percent of the cases +# + unless ($FORM{&key($local,$url)}) { + unless (rand>0.9) { return 'not_yet'; } + } +# +# Actually do the query +# + &statuslist($local,'connecting '.$remote); + my $reply=&request($local,$url,3600); + $reply=(split("\n",$reply))[0]; + $reply=~s/\W//g; + if ($reply ne $remote) { return $reply; } + return 'ok'; +} +# ============================================================ Get a reply hash - unless (&online($hostname{$local})) { return 'local_offline'; } +sub replyhash { + my %returnhash=(); + foreach (split(/\&/,&request(@_))) { + my ($name,$value)=split(/\=/,$_); + if ($name) { + unless ($value) { $value=''; } + $returnhash{$name}=$value; + } + } + return %returnhash; +} - my $ua=new LWP::UserAgent; - - my $request=new HTTP::Request('GET', - "http://".$hostname{$local}.'/cgi-bin/ping.pl?'.$remote); +# ================================================================ Link to host - my $response=$ua->request($request); +sub otherwindow { + my ($local,$url,$label)=@_; + return + " $label "; +} - unless ($response->is_success) { return 'local_error'; } +sub login { + my $local=shift; + print &otherwindow($local,'/adm/login?domain='.$perlvar{'lonDefDomain'}, + 'Login'); +} - my $reply=$response->content; - $reply=(split("\n",$reply))[0]; - $reply=~s/\W//g; - if ($reply ne $remote) { return $reply; } - return 'ok'; +sub runloncron { + my $local=shift; + print &otherwindow($local,'/cgi-bin/loncron.pl','Run loncron'); } - +sub loncron { + my $local=shift; + print &otherwindow($local,'/lon-status','loncron'); +} + +sub lonc { + my $local=shift; + print &otherwindow($local,'/lon-status/loncstatus.txt','lonc'); +} + +sub lond { + my $local=shift; + print &otherwindow($local,'/lon-status/londstatus.txt','lond'); +} + +sub users { + my $local=shift; + print &otherwindow($local,'/cgi-bin/userstatus.pl','Users'); +} + +sub versions { + my $local=shift; + print &otherwindow($local,'/cgi-bin/lonversions.pl','Versions'); +} + +sub server { + my $local=shift; + print &otherwindow($local,'/server-status','Server Status'); +} + +# ========================================================= Produce a green bar +sub bar { + my $parm=shift; + my $number=int($parm+0.5); + print "
"; + for (my $i=0;$i<$number;$i++) { + print "+"; + } + print "
"; +} + +# ========================================================== Show server status + +sub serverstatus { + my ($local,$trouble)=@_; + print (< + +"; + if ($trouble) { + print (""); + } + print "
+$local $hostdom{$local} ($hostname{$local}; $hostrole{$local}) +
$domaindescription{$hostdom{$local}} +
+ENDHEADER + &login($local);&server($local);&users($local);&versions($local); + &loncron($local);&lond($local);&lonc($local);&runloncron($local); + print "
$trouble
"; +# load + if (($host{$local.'_load_doomed'}>0.5) || ($mode eq 'load_doomed')) { + print "
Load: ".$host{$local.'_load'} + } +# users + if (($host{$local.'_users_doomed'}>10) || ($mode eq 'users_doomed')) { + print "
Active Users: ".$host{$local.'_users'} + } + +# checkrpms + if ($host{$local.'_checkrpms'}) { + print "
RPMs: ".$host{$local.'_checkrpms'} + } +# mysql + if ($host{$local.'_mysql'}) { + print "
MySQL Database: ".$host{$local.'_mysql'} + } +# connections + if ($host{$local.'_notconnected'}) { + print "
Not connected: "; + foreach (split(/ /,$host{$local.'_notconnected'})) { + if ($_) { + print " $_"; + } + } + } +# errors + if ($host{$local.'_errors'}) { + print "
loncron errors: ".$host{$local.'_errors'}; + } + print "

"; +} + +# =========================================================== Doomedness sorted + +sub doomedness { + my $crit=shift; + my %alldoomed=(); + my @allhosts=(); + foreach (keys %host) { + if ($_=~/^(\w+)\_$crit$/) { + if ($host{$_}) { + push (@allhosts,$1); + $alldoomed{$1}=$host{$_}; + } + } + } + return sort { $alldoomed{$b} <=> $alldoomed{$a} } @allhosts; +} + +# ====================================================================== Status +sub statuslist { + my ($local,$what)=@_; + print +"\n"; +} + +# +# Main program +# +# ========================================================= Get form parameters +my $buffer; + +read(STDIN, $buffer, $ENV{'CONTENT_LENGTH'}); +my @pairs=split(/&/,$buffer); +my $pair; my $name; my $value; +undef %FORM; +%FORM=(); +foreach $pair (@pairs) { + ($name,$value) = split(/=/,$pair); + $value =~ tr/+/ /; + $value =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C",hex($1))/eg; + $FORM{$name}=$value; +} + +$buffer=$ENV{'QUERY_STRING'}; +@pairs=split(/&/,$buffer); +foreach $pair (@pairs) { + ($name,$value) = split(/=/,$pair); + $value =~ tr/+/ /; + $value =~ s/%([a-fA-F0-9][a-fA-F0-9])/pack("C",hex($1))/eg; + $FORM{$name}=$value; +} + +# ====================================================== Determine refresh rate + +my $refresh=(($FORM{'refresh'}=~/^\d+$/)?$FORM{'refresh'}:120); +if ($refresh<30) { $refresh=30; } +my $starttime=time; + +# ============================================================== Determine mode + +my %modes=('trouble' => 'Trouble', + 'users_doomed' => 'Doomed: Users', + 'loncron_doomed' => 'Doomed: General (loncron)', + 'mysql_doomed' => 'Doomed: Database (mysql)', + 'notconnected_doomed' => 'Doomed: Connections', + 'checkrpms_doomed' => 'Doomed: RPMs', + 'load_doomed' => 'Doomed: Load', + 'unresponsive_doomed' => 'Doomed: Status could not be determined', + 'users' => 'User Report', + 'load' => 'Load Report', + 'connections' => 'Connections Matrix'); + +$mode=$FORM{'mode'}; +unless ($modes{$mode}) { $mode='trouble'; } +# ================================================================ Send Headers print "Content-type: text/html\n\n". - "\n"; + "\n"; # -------------------- Read loncapa.conf (and by default, loncapa_apache.conf). my $perlvarref=LONCAPA::Configuration::read_conf('loncapa.conf'); -my %perlvar=%{$perlvarref}; +%perlvar=%{$perlvarref}; undef $perlvarref; # remove since sensitive and not needed delete $perlvar{'lonReceipt'}; # remove since sensitive and not needed delete $perlvar{'lonSqlAccess'}; # remove since sensitive and not needed @@ -68,6 +321,8 @@ delete $perlvar{'lonSqlAccess'}; # remov $total=0; while (my $configline=<$config>) { + $configline=~s/#.*$//; + unless ($configline=~/\w/) { next; } my ($id,$domain,$role,$name,$ip)=split(/:/,$configline); $hostname{$id}=$name; $hostdom{$id}=$domain; @@ -79,78 +334,251 @@ delete $perlvar{'lonSqlAccess'}; # remov } } } - -print -"

Cluster Status

"; -print "Please be patient while building cluster status ...

\n"; -$table=''; - -$table.="". -""; -foreach $remote (sort keys %hostname) { - $table.="$remote"; +# ------------------------------------------------------------ Read domain file +{ + my $fh=IO::File->new($perlvar{'lonTabDir'}.'/domain.tab'); + %domaindescription = (); + %domain_auth_def = (); + %domain_auth_arg_def = (); + if ($fh) { + while (<$fh>) { + next if (/^(\#|\s*$)/); + chomp; + my ($domain, $domain_description, $def_auth, $def_auth_arg) + = split(/:/,$_,4); + $domain_auth_def{$domain}=$def_auth; + $domain_auth_arg_def{$domain}=$def_auth_arg; + $domaindescription{$domain}=$domain_description; + } + } } -$table.=""; + +print "

LON-CAPA Cluster Status ".localtime()."

"; +print "\n". +"
". +"\n";; +print "\n"; +print 'Choose next report: '.&select_form($mode,'mode',%modes).'
'; +&hidden('refresh',$refresh); + +# ==================================================== Main Loop over all Hosts + +my $maxusers=0; +my $maxload=0; +my $totalusers=0; + foreach $local (sort keys %hostname) { - print "Checking $local "; - $table.="$local
". - $hostrole{$local}.' '.$hostdom{$local}.'
'. - "". - $hostname{$local}."
". - "New Report". - ""; - if (&online($hostname{$local})) { - foreach $remote (sort keys %hostname) { - $status=&connected($local,$remote); - if ($status eq 'ok') { - $table.="
". - "lonc"; - if (&online($hostname{$remote})) { - $table.=" lond". - ""; - } else { - $table.=' offline'; - } - - } - } - } else { - print "offline"; - $table.='
'; - } - $table.="\n"; - print "
\n"; -} -$table.="
From (local)To (remote)
Offline
"; -print $table; -print ""; + $host{$local.'_unresponsive_doomed'}=0; +# -- Check general status + &statuslist($local,'General'); + my %loncron=&replyhash($local,'/lon-status/loncron_simple.txt',1200); + if (defined($loncron{'local_error'})) { + $host{$local.'_loncron'}='Could not determine.'; + $host{$local.'_unresponsive_doomed'}++; + } else { + if ((time-$loncron{'time'})>$oneday) { + $host{$local.'_loncron'}='Stale.'; + $host{$local.'_unresponsive_doomed'}++; + } else { + $host{$local.'_loncron_doomed'}=$loncron{'notices'} + +4*$loncron{'warnings'} + +100*$loncron{'errors'}; + $host{$local.'_errors'}=$loncron{'errors'}; + } + } +# -- Check user status + &statuslist($local,'Users'); + my %userstatus=&replyhash($local,'/cgi-bin/userstatus.pl?simple',600); + if (defined($userstatus{'local_error'})) { + $host{$local.'_userstatus'}='Could not determine.'; + $host{$local.'_unresponsive_doomed'}++; + } else { + $host{$local.'_users_doomed'}=$userstatus{'Active'}; + $host{$local.'_users'}=$userstatus{'Active'}; + unless ($host{$local.'_users'}) { $host{$local.'_users'}=0; } + if ($host{$local.'_users'}>$maxusers) { + $maxusers=$host{$local.'_users'}; + } + $totalusers+=$host{$local.'_users'}; + my ($sload,$mload,$lload)=split(/ /,$userstatus{'loadavg'}); + $host{$local.'_load_doomed'}=$mload; + if ($mload>$maxload) { + $maxload=$mload; + } + $host{$local.'_load'}=$userstatus{'loadavg'}; + } +# -- Check mysql status + &statuslist($local,'Database'); + my %mysql=&replyhash($local,'/lon-status/mysql.txt',3600); + if (defined($mysql{'local_error'})) { + $host{$local.'_mysql'}='Could not determine.'; + $host{$local.'_unresponsive_doomed'}++; + } else { + if ((time-$mysql{'time'})>(7*$oneday)) { + if ($hostrole{$local} eq 'library') { + $host{$local.'_mysql'}='Stale.'; + $host{$local.'_mysql_doomed'}=1; + } + if ($mysql{'mysql'} eq 'defunct') { + $host{$local.'_mysql'}='Defunct (maybe stale).'; + $host{$local.'_mysql_doomed'}=2; + } + } elsif ($mysql{'mysql'} eq 'defunct') { + $host{$local.'_mysql'}='Defunct.'; + $host{$local.'_mysql_doomed'}=3; + } + } +# -- Check rpm status + &statuslist($local,'RPMs'); + my %checkrpms=&replyhash($local,'/lon-status/checkrpms.txt',7200); + if (defined($checkrpms{'local_error'})) { + $host{$local.'_checkrpms'}='Could not determine.'; + $host{$local.'_unresponsive_doomed'}++; + } else { + if ((time-$checkrpms{'time'})>(4*$oneday)) { + $host{$local.'_checkrpms'}='Stale.'; + $host{$local.'_checkrpms_doomed'}=50; + $host{$local.'_unresponsive_doomed'}++; + } elsif ($checkrpms{'status'} eq 'fail') { + $host{$local.'_checkrpms'}='Could not checked RPMs.'; + $host{$local.'_checkrpms_doomed'}=100; + } elsif ($checkrpms{'rpmcount'}) { + $host{$local.'_checkrpms'}='Outdated RPMs: '. + $checkrpms{'rpmcount'}; + $host{$local.'_checkrpms_doomed'}=$checkrpms{'rpmcount'}; + } + } +# -- Check connections + &statuslist($local,'Connections'); + $host{$local.'_notconnected'}=''; + $host{$local.'_notconnected_doomed'}=0; + foreach $remote (sort keys %hostname) { + my $status=&connected($local,$remote); + $connectionstatus{$local.'_TO_'.$remote}=$status; + unless (($status eq 'ok') || ($status eq 'not_yet')) { + $host{$local.'_notconnected'}.=' '.$remote; + $host{$local.'_notconnected_doomed'}++; + } + } +# =============================================================== End Mail Loop +} +&statuslist('Done.'); +# ====================================================================== Output + if ($mode=~/\_doomed$/) { +# Output by doomedness + foreach (&doomedness($mode)) { + &serverstatus($_); + } + } elsif ($mode eq 'connections') { + print + "". + ""; + foreach my $remote (sort keys %hostname) { + print ''; + } + print "\n"; +# connection matrix + foreach my $local (sort keys %hostname) { + print ''; + foreach my $remote (sort keys %hostname) { + if ($connectionstatus{$local.'_TO_'.$remote} eq 'not_yet') { + print ''; + } elsif ($connectionstatus{$local.'_TO_'.$remote} eq 'ok') { + print +''; + } else { + print + ''; + } + } + print "\n"; + } + print "
 '.$remote.'
'.$local.' ok'. + $connectionstatus{$local.'_TO_'.$remote}.'
'; + &lonc($local); &lond($remote); + print '
"; + } elsif ($mode eq 'users') { +# Users + if ($maxusers) { + my $factor=50/$maxusers; + print "

Total active user(s): $totalusers

". + ""; + + foreach $local (sort keys %hostname) { + if (defined($host{$local.'_users'})) { + print +'\n"; + } + } + print "
'.$local. + ''; + &users(); + print + ''. + $host{$local.'_users'}.'
"; + } else { + print "No active users logged in."; + } + } elsif ($mode eq 'load') { +# Load + if ($maxload) { + my $factor=50/$maxload; + print + ""; + foreach $local (sort keys %hostname) { + if (defined($host{$local.'_load_doomed'})) { + print +'\n"; + } + } + print "
'. + $local. + ''; + &server(); + print + ''. + $host{$local.'_load_doomed'}.'
"; + } else { + print "No workload."; + } + } elsif ($mode eq 'trouble') { + my $count=0; + foreach $local (sort keys %hostname) { + my $trouble=''; + if ($host{$local.'_errors'}) { + $trouble='Has loncron errors.
'; + } elsif ($host{$local.'_loncron_doomed'}>600) { + $trouble='High loncron count.
'; + } + if ($host{$local.'_load_doomed'}>5) { + $trouble='High load.
'; + } + if ($host{$local.'_users_doomed'}>200) { + $trouble='High user volume.
'; + } + if ($host{$local.'_mysql_doomed'}>1) { + $trouble='MySQL database apparently offline.
'; + } + if ($host{$local.'_checkrpms_doomed'}>100) { + $trouble='RPMs outdated.
'; + } + if ($trouble) { $count++; &serverstatus($local,$trouble); } + } + unless ($count) { print "No mayor trouble."; } + } +# ============================================================== Close, refresh +print ""; +exit 0;