;
close (LOADAVGH);
-
+
&log($fh,"$loadavg");
-
+
my @parts=split(/\s+/,$loadavg);
- if ($parts[1]>4.0) {
+ if ($parts[1]>$loadtarget{'error'}) {
$errors++;
- } elsif ($parts[1]>2.0) {
+ } elsif ($parts[1]>$loadtarget{'warn'}) {
$warnings++;
- } elsif ($parts[1]>1.0) {
+ } elsif ($parts[1]>$loadtarget{'note'}) {
$notices++;
}
@@ -230,14 +263,14 @@ sub log_machine_info {
&log($fh,"");
open (DFH,"df|");
- while (my $line=) {
- &log($fh,&encode_entities($line,'<>&"'));
+ while (my $line=) {
+ &log($fh,&encode_entities($line,'<>&"'));
@parts=split(/\s+/,$line);
my $usage=$parts[4];
$usage=~s/\W//g;
- if ($usage>90) {
+ if ($usage>90) {
$warnings++;
- $notices++;
+ $notices++;
} elsif ($usage>80) {
$warnings++;
} elsif ($usage>60) {
@@ -254,8 +287,8 @@ sub log_machine_info {
my $psproc=0;
open (PSH,"ps aux --cols 140 |");
- while (my $line=) {
- &log($fh,&encode_entities($line,'<>&"'));
+ while (my $line=) {
+ &log($fh,&encode_entities($line,'<>&"'));
$psproc++;
}
close (PSH);
@@ -266,12 +299,7 @@ sub log_machine_info {
&log($fh,"distprobe
");
&log($fh,"");
- open(DSH,"$perlvar{'lonDaemons'}/distprobe |");
- while (my $line=) {
- &log($fh,&encode_entities($line,'<>&"'));
- $psproc++;
- }
- close(DSH);
+ &log($fh,&encode_entities(&LONCAPA::distro(),'<>&"'));
&log($fh,"
");
&errout($fh);
@@ -282,12 +310,14 @@ sub start_logging {
my %simplestatus=();
my $now=time;
my $date=localtime($now);
-
+
&log($fh,(<
+
+
LON Status Report $perlvar{'lonHostID'}
+
@@ -298,11 +328,11 @@ sub start_logging {
Machine Information
Temporary Files
Session Tokens
+WebDAV Session Tokens
httpd
lonsql
lond
lonc
-lonhttpd
lonnet
Connections
Delayed Messages
@@ -329,15 +359,20 @@ ENDHEADERS
"".$role.
" | ".&Apache::lonnet::hostname($id)." | \n");
}
- &log($fh,"Spare Hosts
");
- foreach my $type (sort(keys(%Apache::lonnet::spareid))) {
- &log($fh,"- $type\n
");
- foreach my $id (@{ $Apache::lonnet::spareid{$type} }) {
- &log($fh,"- $id
\n");
- }
- &log($fh,"
\n \n");
+ &log($fh,"Spare Hosts
");
+ if (keys(%Apache::lonnet::spareid) > 0) {
+ &log($fh,"");
+ foreach my $type (sort(keys(%Apache::lonnet::spareid))) {
+ &log($fh,"- $type\n
");
+ foreach my $id (@{ $Apache::lonnet::spareid{$type} }) {
+ &log($fh,"- $id
\n");
+ }
+ &log($fh,"
\n \n");
+ }
+ &log($fh,"
\n");
+ } else {
+ &log($fh,"No spare hosts specified
\n");
}
- &log($fh,"
\n");
return $fh;
}
@@ -345,33 +380,127 @@ ENDHEADERS
+# Purge stale temporary files below $perlvar{'lonDaemons'}/tmp (the directory
+# walk itself is done by recursive_clean_tmp) and log a summary to $fh,
+# followed by one entry for each class of failure collected in %errors
+# (dir, file, failopen).
sub clean_tmp {
my ($fh)=@_;
&log($fh,'
Temporary Files
');
- my $cleaned=0;
- my $old=0;
- while (my $fname=<$perlvar{'lonDaemons'}/tmp/*>) {
- my ($dev,$ino,$mode,$nlink,
- $uid,$gid,$rdev,$size,
- $atime,$mtime,$ctime,
- $blksize,$blocks)=stat($fname);
- my $now=time;
- my $since=$now-$mtime;
- if ($since>$perlvar{'lonExpire'}) {
- my $line='';
- if (open(PROBE,$fname)) {
- $line=;
- close(PROBE);
- }
- unless ($line=~/^CHECKOUTTOKEN\&/) {
- $cleaned++;
- unlink("$fname");
- } else {
- if ($since>365*$perlvar{'lonExpire'}) {
- $cleaned++;
- unlink("$fname");
- } else { $old++; }
- }
- }
+ my ($cleaned,$old,$removed) = (0,0,0);
+ # One list of offending paths per failure class; filled in by
+ # recursive_clean_tmp through the reference passed below.
+ my %errors = (
+ dir => [],
+ file => [],
+ failopen => [],
+ );
+ my %error_titles = (
+ dir => 'failed to remove empty directory:',
+ file => 'failed to unlink stale file',
+ failopen => 'failed to open file or directory'
+ );
+ ($cleaned,$old,$removed) = &recursive_clean_tmp('',$cleaned,$old,$removed,\%errors);
+ &log($fh,"Cleaned up: ".$cleaned." files; removed: $removed empty directories; (found: $old old checkout tokens)");
+ foreach my $key (sort(keys(%errors))) {
+ if (ref($errors{$key}) eq 'ARRAY') {
+ if (@{$errors{$key}} > 0) {
+ &log($fh,"Error during cleanup ($error_titles{$key}):- ".
+ join('
- ',@{$errors{$key}}).'
');
+ }
+ }
+ }
+}
+
+# Walk $perlvar{'lonDaemons'}/tmp/$subdir ($subdir eq '' means the top level),
+# deleting files whose mtime is more than $perlvar{'lonExpire'} seconds old
+# and removing sub-directories that are left empty.
+#
+# Arguments: $subdir   - path relative to the tmp directory
+#            $cleaned  - running count of files deleted
+#            $old      - running count of checkout tokens kept
+#            $removed  - running count of directories removed
+#            $errors   - hash ref with array refs under dir/file/failopen,
+#                        to which paths that could not be handled are pushed
+# Returns:   the updated ($cleaned,$old,$removed).
+sub recursive_clean_tmp {
+    my ($subdir,$cleaned,$old,$removed,$errors) = @_;
+    my $base = "$perlvar{'lonDaemons'}/tmp";
+    my $path = $base;
+    # There is no enclosing loop in this sub, so hand back the running totals
+    # rather than using 'next' (which would jump into the caller's loop and
+    # lose the counts).
+    return ($cleaned,$old,$removed) if ($subdir =~ m{\.\./});
+    return ($cleaned,$old,$removed) unless (ref($errors) eq 'HASH');
+    unless ($subdir eq '') {
+        $path .= '/'.$subdir;
+    }
+    if (opendir(my $dh,"$path")) {
+        while (defined(my $file = readdir($dh))) {
+            next if ($file =~ /^\.\.?$/);
+            my $fname = "$path/$file";
+            # NOTE(review): -d follows symbolic links, so a link to a
+            # directory is descended into -- confirm that is intended.
+            if (-d $fname) {
+                my $innerdir = ($subdir eq '') ? $file : $subdir.'/'.$file;
+                ($cleaned,$old,$removed) =
+                    &recursive_clean_tmp($innerdir,$cleaned,$old,$removed,$errors);
+                # helprequests, and addcourse* directories whose path does not
+                # end in /<digits>, are kept even when empty.
+                next if (($innerdir eq 'helprequests') ||
+                         (($innerdir =~ /^addcourse/) && ($innerdir !~ m{/\d+$})));
+                if (opendir(my $dirhandle,$fname)) {
+                    # Read the listing once; a second readdir on the same
+                    # handle would return nothing and make every directory
+                    # look empty.
+                    my @contents = grep {!/^\.\.?$/} readdir($dirhandle);
+                    closedir($dirhandle);
+                    if ((@contents == 0) &&
+                        ($fname =~ m{^\Q$perlvar{'lonDaemons'}\E/tmp/})) {
+                        if (rmdir($fname)) {
+                            $removed ++;
+                        } elsif (ref($errors->{dir}) eq 'ARRAY') {
+                            push(@{$errors->{dir}},$fname);
+                        }
+                    }
+                }
+            } else {
+                my $mtime = (stat($fname))[9];
+                my $now=time;
+                my $since=$now-$mtime;
+                next unless ($since>$perlvar{'lonExpire'});
+                my $remove = 1;
+                if (($subdir eq '') && ($fname !~ /\.db$/)) {
+                    # Top-level files other than *.db are probed first: a
+                    # checkout token is kept until it is older than 365 times
+                    # the normal expiry.  Three-argument open, so an odd file
+                    # name cannot be taken as an open mode or a command.
+                    if (open(my $probe,'<',$fname)) {
+                        my $line=<$probe>;
+                        close($probe);
+                        if (defined($line) && ($line=~/^CHECKOUTTOKEN\&/) &&
+                            ($since<=365*$perlvar{'lonExpire'})) {
+                            $old++;
+                            $remove = 0;
+                        }
+                    } else {
+                        if (ref($errors->{failopen}) eq 'ARRAY') {
+                            push(@{$errors->{failopen}},$fname);
+                        }
+                        $remove = 0;
+                    }
+                }
+                if ($remove) {
+                    if (unlink($fname)) {
+                        $cleaned++;
+                    } elsif (ref($errors->{file}) eq 'ARRAY') {
+                        push(@{$errors->{file}},$fname);
+                    }
+                }
+            }
+        }
+        closedir($dh);
+    } elsif (ref($errors->{failopen}) eq 'ARRAY') {
+        push(@{$errors->{failopen}},$path);
}
- &log($fh,"Cleaned up ".$cleaned." files (".$old." old checkout tokens).");
+    return ($cleaned,$old,$removed);
}
# ------------------------------------------------------------ clean out lonIDs
@@ -381,43 +510,123 @@ sub clean_lonIDs {
my $cleaned=0;
my $active=0;
while (my $fname=<$perlvar{'lonIDsDir'}/*>) {
- my ($dev,$ino,$mode,$nlink,
- $uid,$gid,$rdev,$size,
- $atime,$mtime,$ctime,
- $blksize,$blocks)=stat($fname);
- my $now=time;
- my $since=$now-$mtime;
- if ($since>$perlvar{'lonExpire'}) {
- $cleaned++;
- &log($fh,"Unlinking $fname
");
- unlink("$fname");
- } else {
- $active++;
- }
+ my $now=time;
+ if (-l $fname) {
+ my $linkfname = readlink($fname);
+ if (-f $linkfname) {
+ if ($linkfname =~ m{^$perlvar{'lonIDsDir'}/[^/]+\.id$}) {
+ my @data = stat($linkfname);
+ my $mtime = $data[9];
+ my $since=$now-$mtime;
+ if ($since>$perlvar{'lonExpire'}) {
+ if (unlink($linkfname)) {
+ $cleaned++;
+ &log($fh,"Unlinking $linkfname
");
+ unlink($fname);
+ }
+ }
+ }
+ } else {
+ unlink($fname);
+ }
+ } elsif (-f $fname) {
+ my @data = stat($fname);
+ my $mtime = $data[9];
+ my $since=$now-$mtime;
+ if ($since>$perlvar{'lonExpire'}) {
+ if (unlink($fname)) {
+ $cleaned++;
+ &log($fh,"Unlinking $fname
");
+ }
+ } else {
+ $active++;
+ }
+ }
}
&log($fh,"Cleaned up ".$cleaned." stale session token(s).
");
&log($fh,"$active open session(s)
");
}
+# -------------------------------------------------------- clean out balanceIDs
+
+# Delete the *.id files in $perlvar{'lonBalanceDir'} whose mtime is more than
+# $perlvar{'lonExpire'} seconds old, and log the number removed and the number
+# still unexpired to $fh.  Does nothing but log zero counts when the directory
+# does not exist.
+sub clean_balanceIDs {
+    my ($fh)=@_;
+    &log($fh,'
Session Tokens
');
+    my $cleaned=0;
+    my $active=0;
+    if (-d $perlvar{'lonBalanceDir'}) {
+        my $now=time;
+        while (my $fname=<$perlvar{'lonBalanceDir'}/*.id>) {
+            my $mtime=(stat($fname))[9];
+            # stat fails if the file disappeared after the glob; skip it
+            # rather than treating it as infinitely old.
+            next unless (defined($mtime));
+            if (($now-$mtime)>$perlvar{'lonExpire'}) {
+                # Count and report only what was really removed, as
+                # clean_lonIDs does.
+                if (unlink($fname)) {
+                    $cleaned++;
+                    &log($fh,"Unlinking $fname
");
+                }
+            } else {
+                $active++;
+            }
+        }
+    }
+    &log($fh,"Cleaned up ".$cleaned." stale balancer files
");
+    &log($fh,"$active unexpired balancer files
");
+}
+
+# ------------------------------------------------ clean out webDAV Session IDs
+# When $perlvar{'lonRole'} is 'library': delete entries in
+# $perlvar{'lonDAVsessDir'} whose mtime is more than $perlvar{'lonExpire'}
+# seconds old and log the removed/open counts to $fh.  On any other role
+# nothing is done or logged.
+sub clean_webDAV_sessionIDs {
+    my ($fh)=@_;
+    if ($perlvar{'lonRole'} eq 'library') {
+        &log($fh,'
WebDAV Session Tokens
');
+        my $cleaned=0;
+        my $active=0;
+        my $now = time;
+        if (-d $perlvar{'lonDAVsessDir'}) {
+            while (my $fname=<$perlvar{'lonDAVsessDir'}/*>) {
+                my $mtime = (stat($fname))[9];
+                # Entry vanished between glob and stat: nothing to clean.
+                next unless (defined($mtime));
+                if (($now-$mtime)>$perlvar{'lonExpire'}) {
+                    # Only count and report files that unlink really removed.
+                    if (unlink($fname)) {
+                        $cleaned++;
+                        &log($fh,"Unlinking $fname
");
+                    }
+                } else {
+                    $active++;
+                }
+            }
+            &log($fh,"Cleaned up ".$cleaned." stale webDAV session token(s).
");
+            &log($fh,"$active open webDAV session(s)
");
+        }
+    }
+}
+
+# ----------------------------------------------------------- clean out sockets
+# Remove every entry in $perlvar{'lonSockDir'} that is not a directory and
+# whose name does not contain mysqlsock, maximasock, rsock or the socket
+# directory path itself, then log the number removed to $fh.
+sub clean_sockets {
+    my ($fh)=@_;
+    my $cleaned=0;
+    my $sockdir=$perlvar{'lonSockDir'};
+    if (opendir(my $dh,$sockdir)) {
+        while (defined(my $fname=readdir($dh))) {
+            # readdir returns bare names, so both the directory test and the
+            # unlink must be given the full path inside $sockdir.
+            next if (-d "$sockdir/$fname"
+                     || $fname=~/(mysqlsock|maximasock|rsock|\Q$sockdir\E)/);
+            if (unlink("$sockdir/$fname")) {
+                $cleaned++;
+                &log($fh,"Unlinking $fname
");
+            }
+        }
+        closedir($dh);
+    }
+    &log($fh,"Cleaned up ".$cleaned." stale sockets.
");
+}
+
# ----------------------------------------------------------------------- httpd
+# Copy the output of the ./lchttpdlogs helper (path relative to the current
+# directory) into the status report $fh, adding one notice for every line
+# that contains "[error]", then flush the error counters with errout.
sub check_httpd_logs {
my ($fh)=@_;
- &log($fh,'
httpd
Access Log
');
-
- open (DFH,"tail -n25 /etc/httpd/logs/access_log|");
- while (my $line=) { &log($fh,&encode_entities($line,'<>&"')) };
- close (DFH);
-
- &log($fh,"
Error Log
");
-
- open (DFH,"tail -n25 /etc/httpd/logs/error_log|");
- while (my $line=) {
- &log($fh,"$line");
- if ($line=~/\[error\]/) { $notices++; }
+ # Lexical handle and three-argument pipe open; the helper's output is
+ # logged as-is (no entity encoding is applied here).
+ if (open(my $pipe,'-|','./lchttpdlogs')) {
+ while (my $line=<$pipe>) {
+ &log($fh,$line);
+ if ($line=~/\[error\]/) { $notices++; }
+ }
+ close($pipe);
}
- close (DFH);
- &log($fh,"
");
&errout($fh);
}
@@ -426,19 +635,19 @@ sub check_httpd_logs {
sub rotate_lonnet_logs {
my ($fh)=@_;
&log($fh,'
lonnet
Temp Log
');
- print "checking logs\n";
+ print "Checking logs.\n";
if (-e "$perlvar{'lonDaemons'}/logs/lonnet.log"){
open (DFH,"tail -n50 $perlvar{'lonDaemons'}/logs/lonnet.log|");
- while (my $line=) {
+ while (my $line=) {
&log($fh,&encode_entities($line,'<>&"'));
}
close (DFH);
}
&log($fh,"
Perm Log
");
-
+
if (-e "$perlvar{'lonDaemons'}/logs/lonnet.perm.log") {
open(DFH,"tail -n10 $perlvar{'lonDaemons'}/logs/lonnet.perm.log|");
- while (my $line=) {
+ while (my $line=) {
&log($fh,&encode_entities($line,'<>&"'));
}
close (DFH);
@@ -453,19 +662,24 @@ sub rotate_lonnet_logs {
+# Rotate the auxiliary log files in $perlvar{'lonDaemons'}/logs by calling
+# rotate_logfile once per entry in %logs (file base name => report title).
sub rotate_other_logs {
my ($fh) = @_;
- my $fname="$perlvar{'lonDaemons'}/logs/autoenroll.log";
- &rotate_logfile($fname,$fh,'Auto Enroll log');
- $fname="$perlvar{'lonDaemons'}/logs/autocreate.log";
- &rotate_logfile($fname,$fh,'Create Course log');
- $fname="$perlvar{'lonDaemons'}/logs/searchcat.log";
- &rotate_logfile($fname,$fh,'Search Cataloguing log');
+ my %logs = (
+ autoenroll => 'Auto Enroll log',
+ autocreate => 'Create Course log',
+ searchcat => 'Search Cataloguing log',
+ autoupdate => 'Auto Update log',
+ refreshcourseids_db => 'Refresh CourseIDs db log',
+ );
+ # Sorted so the logs are handled in the same order on every run; the
+ # order of a bare keys() list is not stable between runs.
+ foreach my $item (sort(keys(%logs))) {
+ my $fname=$perlvar{'lonDaemons'}.'/logs/'.$item.'.log';
+ &rotate_logfile($fname,$fh,$logs{$item});
+ }
}
# ----------------------------------------------------------------- Connections
sub test_connections {
my ($fh)=@_;
&log($fh,'
Connections
');
- print "testing connections\n";
+ print "Testing connections.\n";
&log($fh,"");
my ($good,$bad)=(0,0);
my %hostname = &Apache::lonnet::all_hostnames();
@@ -497,28 +711,200 @@ sub test_connections {
# ------------------------------------------------------------ Delayed messages
sub check_delayed_msg {
- my ($fh)=@_;
+ my ($fh,$weightsref,$exclusionsref)=@_;
&log($fh,'
Delayed Messages
');
- print "checking buffers\n";
+ print "Checking buffers.\n";
&log($fh,'Scanning Permanent Log
');
my $unsend=0;
+ my $ignored=0;
- my $dfh=IO::File->new("$perlvar{'lonDaemons'}/logs/lonnet.perm.log");
- while (my $line=<$dfh>) {
- my ($time,$sdf,$dserv,$dcmd)=split(/:/,$line);
- if ($sdf eq 'F') {
- my $local=localtime($time);
- &log($fh,"Failed: $time, $dserv, $dcmd
");
- $warnings++;
- }
- if ($sdf eq 'S') { $unsend--; }
- if ($sdf eq 'D') { $unsend++; }
+ my %hostname = &Apache::lonnet::all_hostnames();
+ my $numhosts = scalar(keys(%hostname));
+ my $checkbackwards = 0;
+ my $checkfrom = 0;
+ my $checkexcluded = 0;
+ my (%bymachine,%weights,%exclusions,%serverhomes);
+ if (ref($weightsref) eq 'HASH') {
+ %weights = %{$weightsref};
+ }
+ if (ref($exclusionsref) eq 'HASH') {
+ %exclusions = %{$exclusionsref};
+ if (keys(%exclusions)) {
+ $checkexcluded = 1;
+ %serverhomes = &read_serverhomeIDs();
+ }
}
- &log($fh,"Total unsend messages: $unsend
\n");
- $warnings=$warnings+5*$unsend;
+#
+# For LON-CAPA 1.2.0 to 2.1.3 (release dates: 8/31/2004 and 3/31/2006) any
+# entry logged in lonnet.perm.log for completion of a delayed (critical)
+# transaction lacked the hostID for the remote node to which the command
+# to be completed was sent.
+#
+# Because of this, exclusion of items in lonnet.perm.log for nodes which are
+# no longer part of the cluster from adding to the overall "unsend" count
+# needs additional effort besides the changes made in loncron rev. 1.105.
+#
+# For "S" (completion) events, logging in LON-CAPA 1.2.0 through 2.1.3 included
+# "LondTransaction=HASH(hexadecimal)->getClient() :$cmd", where the hexadecimal
+# is a memory location, and $cmd is the command sent to the remote node.
+#
+# Starting with 2.2.0 (released 8/21/2006) logging for "S" (completion) events
+# had sethost:$host_id:$cmd after LondTransaction=HASH(hexadecimal)->getClient()
+#
+# Starting with 2.4.1 (released 6/13/2007) logging for "S" replaced echoing the
+# getClient() call with the result of the Transaction->getClient() call itself,
+# which is undef for completion of delivery of a delayed message.
+#
+# The net effect of these changes is that lonnet.perm.log is now accessed three
+# times: (a) oldest record is checked, if earlier than release date for 2.5.0
+# then (b) file is read backwards, with timestamp recorded for most recent
+# instance of logged "S" event for "update" command without "sethost:$host_id:"
+# then (c) file is read forward with records ignored which predate the timestamp
+# recorded in (b), if one was found.
+#
+# In (c), when calculating the unsend total, i.e., the difference between delayed
+# transactions ("D") and sent transactions ("S"), transactions are ignored if the
+# target node is no longer in the cluster, and also (for "update" commands), if
+# the target node is in the list of nodes excluded from the count, in the domain
+# configuration for this machine's default domain. The idea here is to remove
+# delayed "update" commands for nodes for which inbound access to port 5663,
+# is blocked, but are still part of the LON-CAPA network, (i.e., they can still
+# replicate content from other nodes).
+#
+
+ my $dfh=IO::File->new("$perlvar{'lonDaemons'}/logs/lonnet.perm.log","r");
+ if (defined($dfh)) {
+ while (my $line=<$dfh>) {
+ my ($time,$sdf,$rest)=split(/:/,$line,3);
+ if ($time < 1541185772) {
+ $checkbackwards = 1;
+ }
+ last;
+ }
+ undef $dfh;
+ }
+
+ if ($checkbackwards) {
+ if (tie *BW, 'File::ReadBackwards', "$perlvar{'lonDaemons'}/logs/lonnet.perm.log") {
+ while(my $line=) {
+ if ($line =~ /\QLondTransaction=HASH\E[^:]+:update:/) {
+ ($checkfrom) = split(/:/,$line,2);
+ last;
+ }
+ }
+ close(BW);
+ }
+ }
+ $dfh=IO::File->new("$perlvar{'lonDaemons'}/logs/lonnet.perm.log","r");
+ if (defined($dfh)) {
+ while (my $line=<$dfh>) {
+ my ($time,$sdf,$rest)=split(/:/,$line,3);
+ next unless (($sdf eq 'F') || ($sdf eq 'S') || ($sdf eq 'D'));
+ next if (($checkfrom) && ($time <= $checkfrom));
+ my ($dserv,$dcmd);
+ if ($sdf eq 'S') {
+ my ($serva,$cmda,$servb,$cmdb) = split(/:/,$rest);
+ if ($cmda eq 'sethost') {
+ chomp($cmdb);
+ $dcmd = $cmdb;
+ } else {
+ $dcmd = $cmda;
+ }
+ if (($serva =~ /^LondTransaction/) || ($serva eq '')) {
+ unless (($servb eq '') || ($servb =~ m{^/})) {
+ $dserv = $servb;
+ }
+ } else {
+ $dserv = $serva;
+ }
+ } else {
+ ($dserv,$dcmd) = split(/:/,$rest);
+ }
+ if ($sdf eq 'F') {
+ my $local=localtime($time);
+ &log($fh,"Failed: $time, $dserv, $dcmd
");
+ $warnings++;
+ }
+ next if ((($dserv eq '') || ($dcmd eq '')) && ($sdf ne 'F'));
+ if ($sdf eq 'S') {
+ if ($dcmd eq 'update') {
+ if ($hostname{$dserv}) {
+ if ($exclusions{$serverhomes{$hostname{$dserv}}}) {
+ $ignored --;
+ } else {
+ $unsend --;
+ }
+ }
+ if (exists($bymachine{$dserv})) {
+ $bymachine{$dserv} --;
+ } else {
+ $bymachine{$dserv} = -1;
+ }
+ } else {
+ if ($hostname{$dserv}) {
+ $unsend --;
+ }
+ }
+ } elsif ($sdf eq 'D') {
+ if ($dcmd eq 'update') {
+ if ($hostname{$dserv}) {
+ if ($exclusions{$serverhomes{$hostname{$dserv}}}) {
+ $ignored ++;
+ } else {
+ $unsend ++;
+ }
+ }
+ if (exists($bymachine{$dserv})) {
+ $bymachine{$dserv} ++;
+ } else {
+ $bymachine{$dserv} = 1;
+ }
+ } else {
+ if ($hostname{$dserv}) {
+ $unsend ++;
+ }
+ }
+ }
+ }
+ undef $dfh;
+ my $nodest = 0;
+ my $retired = 0;
+ my %active;
+ if (keys(%bymachine)) {
+ unless ($checkexcluded) {
+ %serverhomes = &read_serverhomeIDs();
+ }
+ foreach my $key (keys(%bymachine)) {
+ if ($bymachine{$key} > 0) {
+ if ($hostname{$key}) {
+ $active{$serverhomes{$hostname{$key}}} += $bymachine{$key};
+ } else {
+ $retired ++;
+ $nodest += $bymachine{$key};
+ }
+ }
+ }
+ }
+ if (keys(%active)) {
+ &log($fh,"Unsend messages by node, active (undegraded) nodes in cluster
\n");
+ foreach my $key (sort(keys(%active))) {
+ &log($fh,&encode_entities("$key => $active{$key}",'<>&"')."\n");
+ }
+ }
+ &log($fh,"Total unsend messages: $unsend for ".scalar(keys(%active))." active (undegraded) nodes in cluster.
\n");
+ if (keys(%exclusions) > 0) {
+ &log($fh,"Total incomplete updates $ignored for ".scalar(keys(%exclusions))." degraded nodes in cluster.
\n");
+ }
+ if ($retired) {
+ &log($fh,"Total unsent $nodest for $retired nodes no longer in cluster.
\n");
+ }
+ if ($unsend > 0) {
+ $warnings=$warnings+$weights{'U'}*$unsend;
+ }
+ }
if ($unsend) { $simplestatus{'unsend'}=$unsend; }
&log($fh,"Outgoing Buffer
\n");
@@ -534,22 +920,41 @@ sub check_delayed_msg {
close (DFH);
# pong to all servers that have delayed messages
# this will trigger a reverse connection, which should flush the buffers
- foreach my $tryserver (keys %servers) {
- my $answer=&Apache::lonnet::reply("pong",$tryserver);
- &log($fh,"Pong to $tryserver: $answer
");
+ foreach my $tryserver (sort(keys(%servers))) {
+ if ($hostname{$tryserver} || !$numhosts) {
+ my $answer;
+ eval {
+ local $SIG{ ALRM } = sub { die "TIMEOUT" };
+ alarm(20);
+ $answer = &Apache::lonnet::reply("pong",$tryserver);
+ alarm(0);
+ };
+ if ($@ && $@ =~ m/TIMEOUT/) {
+ &log($fh,"Attempted pong to $tryserver timed out
");
+ print "Time out while contacting: $tryserver for pong.\n";
+ } else {
+ &log($fh,"Pong to $tryserver: $answer
");
+ }
+ } else {
+ &log($fh,"$tryserver has delayed messages, but is not part of the cluster -- skipping 'Pong'.
");
+ }
}
}
sub finish_logging {
- my ($fh)=@_;
+ my ($fh,$weightsref)=@_;
+ my %weights;
+ if (ref($weightsref) eq 'HASH') {
+ %weights = %{$weightsref};
+ }
&log($fh,"\n");
- $totalcount=$notices+4*$warnings+100*$errors;
+ $totalcount=($weights{'N'}*$notices)+($weights{'W'}*$warnings)+($weights{'E'}*$errors);
&errout($fh);
&log($fh,"Total Error Count: $totalcount
");
my $now=time;
my $date=localtime($now);
&log($fh,"
$date ($now)\n");
- print "lon-status webpage updated\n";
+ print "lon-status webpage updated.\n";
$fh->close();
if ($errors) { $simplestatus{'errors'}=$errors; }
@@ -569,23 +974,338 @@ sub log_simplestatus {
$sfh->close();
}
+# Ask every host returned by Apache::lonnet::all_hostnames for its LON-CAPA
+# version (at most 10 seconds per host).  If at least one well-formed answer
+# was received, rewrite $perlvar{'lonTabDir'}/loncaparevs.tab with one
+# "id:version" line per host and reload it via load_loncaparevs.
+sub write_loncaparevs {
+    print "Retrieving LON-CAPA version information.\n";
+    my %hostname = &Apache::lonnet::all_hostnames();
+    my $output;
+    foreach my $id (sort(keys(%hostname))) {
+        next if ($id eq '');
+        my $loncaparev;
+        eval {
+            local $SIG{ ALRM } = sub { die "TIMEOUT" };
+            alarm(10);
+            $loncaparev =
+                &Apache::lonnet::get_server_loncaparev('',$id,1,'loncron');
+            alarm(0);
+        };
+        my $evalerr = $@;
+        # If the call died for some reason other than the timer, the alarm is
+        # still armed; cancel it so it cannot fire later with no handler.
+        alarm(0);
+        if ($evalerr && $evalerr =~ m/TIMEOUT/) {
+            print "Time out while contacting lonHost: $id for version.\n";
+        }
+        if (defined($loncaparev) && ($loncaparev =~ /^[\w.\-]+$/)) {
+            $output .= $id.':'.$loncaparev."\n";
+        }
+    }
+    if ($output) {
+        if (open(my $fh,'>',"$perlvar{'lonTabDir'}/loncaparevs.tab")) {
+            print $fh $output;
+            close($fh);
+            &Apache::lonnet::load_loncaparevs();
+        }
+    }
+    return;
+}
+
+# For every server name returned by Apache::lonnet::all_names, ask the server
+# for its home lonHostID (at most 10 seconds each); when no answer is obtained
+# the first host ID listed for that name is used instead.  The resulting
+# "name:id" lines are written to $perlvar{'lonTabDir'}/serverhomeIDs.tab,
+# which is then reloaded via load_serverhomeIDs.
+sub write_serverhomeIDs {
+    print "Retrieving LON-CAPA lonHostID information.\n";
+    my %name_to_host = &Apache::lonnet::all_names();
+    my $output;
+    foreach my $name (sort(keys(%name_to_host))) {
+        next if ($name eq '');
+        next unless (ref($name_to_host{$name}) eq 'ARRAY');
+        my $serverhomeID;
+        eval {
+            local $SIG{ ALRM } = sub { die "TIMEOUT" };
+            alarm(10);
+            $serverhomeID =
+                &Apache::lonnet::get_server_homeID($name,1,'loncron');
+            alarm(0);
+        };
+        my $evalerr = $@;
+        # Make sure no alarm stays pending when the call died for a reason
+        # other than the timeout.
+        alarm(0);
+        if ($evalerr && $evalerr =~ m/TIMEOUT/) {
+            print "Time out while contacting server: $name\n";
+        }
+        if (defined($serverhomeID) && ($serverhomeID ne '')) {
+            $output .= $name.':'.$serverhomeID."\n";
+        } else {
+            $output .= $name.':'.$name_to_host{$name}->[0]."\n";
+        }
+    }
+    if ($output) {
+        if (open(my $fh,'>',"$perlvar{'lonTabDir'}/serverhomeIDs.tab")) {
+            print $fh $output;
+            close($fh);
+            &Apache::lonnet::load_serverhomeIDs();
+        }
+    }
+    return;
+}
+
+# Run LONCAPA::Checksumming::get_checksums for the distro reported by
+# LONCAPA::distro and print how many files were checksummed; when the distro
+# cannot be determined only a "skipped" message is printed.
+sub write_checksums {
+    my $distro = &LONCAPA::distro();
+    unless ($distro) {
+        print "File version retrieval and checksumming skipped - could not determine Linux distro.\n";
+        return;
+    }
+    print "Retrieving file version and checksumming.\n";
+    my ($chksumsref,$versionsref) =
+        &LONCAPA::Checksumming::get_checksums($distro,$perlvar{'lonDaemons'},
+                                              $perlvar{'lonLib'},
+                                              $perlvar{'lonIncludes'},
+                                              $perlvar{'lonTabDir'});
+    # Anything other than a hash ref counts as zero files.
+    my $numchksums = (ref($chksumsref) eq 'HASH') ? scalar(keys(%{$chksumsref}))
+                                                  : 0;
+    print "File version retrieved and checksumming completed for $numchksums files.\n";
+    return;
+}
+
+# Record the current hostname => IP table in currhostips.tab (the previous
+# table is first moved to prevhostips.tab), work out which hostnames changed
+# IP, and -- only on the primary library server of the default domain --
+# append the changes to logs/hostip.log and mail them to the recipients
+# returned by build_recipient_list for 'hostipmail'.
+sub write_hostips {
+    my $lontabdir = $perlvar{'lonTabDir'};
+    my $defdom = $perlvar{'lonDefDomain'};
+    my $lonhost = $perlvar{'lonHostID'};
+    my $newfile = "$lontabdir/currhostips.tab";
+    my $oldfile = "$lontabdir/prevhostips.tab";
+    my (%prevhosts,%currhosts,%ipchange);
+
+    # Previous state: last run's table becomes prevhostips.tab and is read back.
+    if ((-e $newfile) && (-s $newfile)) {
+        move($newfile,$oldfile);
+        chmod(0644,$oldfile);
+        if (open(my $fh,'<',$oldfile)) {
+            while (my $line=<$fh>) {
+                chomp($line);
+                next unless ($line =~ /^([^:]+):([\d.]+)$/);
+                $prevhosts{$1} = $2;
+            }
+            close($fh);
+        }
+    }
+
+    # Current state: taken from the 'iphost' cache, filling the cache if needed.
+    my ($ip_info,$cached) =
+        &Apache::lonnet::is_cached_new('iphost','iphost');
+    unless ($cached) {
+        &Apache::lonnet::get_iphost();
+        ($ip_info,$cached) =
+            &Apache::lonnet::is_cached_new('iphost','iphost');
+    }
+    if (ref($ip_info) eq 'ARRAY') {
+        %currhosts = %{$ip_info->[1]};
+        if (open(my $fh,'>',$newfile)) {
+            foreach my $key (keys(%currhosts)) {
+                print $fh "$key:$currhosts{$key}\n";
+            }
+            close($fh);
+            chmod(0644,$newfile);
+        }
+    }
+
+    # A hostname counts as changed when its IP differs between the two tables
+    # (which includes being present in only one of them).
+    if (keys(%prevhosts) && keys(%currhosts)) {
+        foreach my $key (keys(%prevhosts),keys(%currhosts)) {
+            unless ($currhosts{$key} eq $prevhosts{$key}) {
+                $ipchange{$key} = $prevhosts{$key}.' | '.$currhosts{$key};
+            }
+        }
+    }
+
+    return unless (&Apache::lonnet::domain($defdom,'primary') eq $lonhost);
+    return unless (keys(%ipchange));
+
+    # The same table rows go into both the log file and the mail body.
+    my $rows = '';
+    foreach my $hostname (sort(keys(%ipchange))) {
+        $rows .= "| $hostname | $ipchange{$hostname} |\n";
+    }
+    if (open(my $fh,'>>',$perlvar{'lonDaemons'}.'/logs/hostip.log')) {
+        print $fh "********************\n".localtime(time).' Changes --'."\n".
+                  "| Hostname | Previous IP | New IP |\n".
+                  " --------------------------------- \n";
+        print $fh $rows;
+        print $fh "\n*******************\n\n";
+        close($fh);
+    }
+    my $emailto = &Apache::loncommon::build_recipient_list(undef,
+                                                   'hostipmail',$defdom);
+    return unless ($emailto);
+    my $subject = "LON-CAPA Hostname to IP change ($perlvar{'lonHostID'})";
+    my $chgmail = "To: $emailto\n".
+                  "Subject: $subject\n".
+                  "Content-type: text/plain\; charset=UTF-8\n".
+                  "MIME-Version: 1.0\n\n".
+                  "Host/IP changes\n".
+                  " \n".
+                  "| Hostname | Previous IP | New IP |\n".
+                  " --------------------------------- \n";
+    $chgmail .= $rows;
+    $chgmail .= "\n\n";
+    if (open(my $mailh, "|/usr/lib/sendmail -oi -t -odb")) {
+        print $mailh $chgmail;
+        close($mailh);
+        print "Sending mail notification of hostname/IP changes.\n";
+    }
+    return;
+}
+
+# Return a hash ref holding the configuration of the default domain
+# ($perlvar{'lonDefDomain'}).  On the domain's primary library server the
+# values are read directly from configuration.db; on any other server they
+# are fetched over HTTP(S) from the primary's listdomconfig.pl.  Values
+# prefixed with __FROZEN__ are unescaped and thawed, all others are only
+# unescaped.  The hash is empty when nothing could be read.
+sub get_domain_config {
+    my ($dom,$primlibserv,$isprimary,$url,%confhash);
+    $dom = $perlvar{'lonDefDomain'};
+    $primlibserv = &Apache::lonnet::domain($dom,'primary');
+    if ($primlibserv eq $perlvar{'lonHostID'}) {
+        $isprimary = 1;
+    } elsif ($primlibserv ne '') {
+        my $protocol = $Apache::lonnet::protocol{$primlibserv};
+        my $hostname = &Apache::lonnet::hostname($primlibserv);
+        unless ($protocol eq 'https') {
+            $protocol = 'http';
+        }
+        $url = $protocol.'://'.$hostname.'/cgi-bin/listdomconfig.pl?primary='.$primlibserv.'&format=raw';
+    }
+    if ($isprimary) {
+        my $lonusersdir = $perlvar{'lonUsersDir'};
+        my $fname = $lonusersdir.'/'.$dom.'/configuration.db';
+        if (-e $fname) {
+            my $dbref=&LONCAPA::locking_hash_tie($fname,&GDBM_READER());
+            if (ref($dbref) eq 'HASH') {
+                foreach my $key (sort(keys(%{$dbref}))) {
+                    my $value = $dbref->{$key};
+                    if ($value =~ s/^__FROZEN__//) {
+                        $value = thaw(&LONCAPA::unescape($value));
+                    } else {
+                        $value = &LONCAPA::unescape($value);
+                    }
+                    $confhash{$key} = $value;
+                }
+                &LONCAPA::locking_hash_untie($dbref);
+            }
+        }
+    } elsif (defined($url)) {
+        # $url is only set when a primary library server is known; without
+        # one there is nothing to ask, so no request is made.
+        my $ua=LWP::UserAgent->new;
+        $ua->timeout(5);
+        my $request=HTTP::Request->new('GET',$url);
+        my $response=$ua->request($request);
+        unless ($response->is_error()) {
+            my $content = $response->content;
+            if ($content) {
+                my @pairs=split(/\&/,$content);
+                foreach my $item (@pairs) {
+                    my ($key,$value)=split(/=/,$item,2);
+                    my $what = &LONCAPA::unescape($key);
+                    # NOTE(review): thaw() is applied to data received over
+                    # the network -- this trusts the primary server's reply;
+                    # confirm that is acceptable.
+                    if ($value =~ s/^__FROZEN__//) {
+                        $value = thaw(&LONCAPA::unescape($value));
+                    } else {
+                        $value = &LONCAPA::unescape($value);
+                    }
+                    $confhash{$what}=$value;
+                }
+            }
+        }
+    }
+    return \%confhash;
+}
+
+# Work out the lon-status settings for this run.  Defaults come from
+# Apache::loncommon::lon_status_items and are overridden by whatever is
+# present under {'contacts'}{'lonstatus'} in the domain configuration $domconf.
+# Returns ($threshold,$sysmail,$reportstatus,\%weights,\%exclusions), where
+# %weights is keyed by E, W, N and U and %exclusions maps each excluded item
+# to 1.
+sub get_permcount_settings {
+    my ($domconf) = @_;
+    my ($defaults,$names) = &Apache::loncommon::lon_status_items();
+    my @types = ('E','W','N','U');
+    my (%weights,%exclusions);
+    foreach my $type (@types) {
+        $weights{$type} = $defaults->{$type};
+    }
+    my $threshold = $defaults->{'threshold'};
+    my $sysmail = $defaults->{'sysmail'};
+    my $reportstatus = 1;
+
+    my $contacts;
+    if (ref($domconf) eq 'HASH') {
+        $contacts = $domconf->{'contacts'};
+    }
+    if (ref($contacts) eq 'HASH') {
+        if ($contacts->{'reportstatus'} == 0) {
+            $reportstatus = 0;
+        }
+        my $lonstatus = $contacts->{'lonstatus'};
+        if (ref($lonstatus) eq 'HASH') {
+            my $confweights = $lonstatus->{weights};
+            if (ref($confweights) eq 'HASH') {
+                foreach my $type (@types) {
+                    next unless (exists($confweights->{$type}));
+                    $weights{$type} = $confweights->{$type};
+                }
+            }
+            if (ref($lonstatus->{'excluded'}) eq 'ARRAY') {
+                foreach my $item (@{$lonstatus->{'excluded'}}) {
+                    $exclusions{$item} = 1;
+                }
+            }
+            if (exists($lonstatus->{'threshold'})) {
+                $threshold = $lonstatus->{'threshold'};
+            }
+            if (exists($lonstatus->{'sysmail'})) {
+                $sysmail = $lonstatus->{'sysmail'};
+            }
+        }
+    }
+    return ($threshold,$sysmail,$reportstatus,\%weights,\%exclusions);
+}
+
+# Read $perlvar{'lonTabDir'}/serverhomeIDs.tab ("host:id" per line) and return
+# it as a hash of host => id.  Returns an empty hash when the file cannot be
+# opened; lines with no host or no id are ignored.
+sub read_serverhomeIDs {
+    my %server;
+    if (open(my $fh,'<',"$perlvar{'lonTabDir'}/serverhomeIDs.tab")) {
+        while (my $line = <$fh>) {
+            # Strip the newline before splitting so that a line without a
+            # colon cannot yield a key that ends in "\n".
+            chomp($line);
+            my ($host,$id) = split(/:/,$line);
+            next unless (defined($host) && ($host ne '') && defined($id));
+            $server{$host} = $id;
+        }
+        close($fh);
+    }
+    return %server;
+}
+
+# Mail the generated status page ($statusdir/index.html) as text/html through
+# /usr/lib/sendmail.  Recipients come from build_recipient_list for
+# 'lonstatusmail'; $perlvar{'lonSysEMail'} is added when $totalcount exceeds
+# $sysmail and $reportstatus is true.
sub send_mail {
- print "sending mail\n";
- my $emailto="$perlvar{'lonAdmEMail'}";
- if ($totalcount>2500) {
+ my ($sysmail,$reportstatus) = @_;
+ my $defdom = $perlvar{'lonDefDomain'};
+ my $origmail = $perlvar{'lonAdmEMail'};
+ my $emailto = &Apache::loncommon::build_recipient_list(undef,
+ 'lonstatusmail',$defdom,$origmail);
+ if (($totalcount>$sysmail) && ($reportstatus)) {
$emailto.=",$perlvar{'lonSysEMail'}";
}
- my $subj="LON: $perlvar{'lonHostID'} E:$errors W:$warnings N:$notices";
-
- my $result=system("metasend -b -S 4000000 -t $emailto -s '$subj' -f $statusdir/index.html -m text/html >& /dev/null");
- if ($result != 0) {
- $result=system("mail -s '$subj' $emailto < $statusdir/index.html");
+ my $from;
+ my $hostname=`/bin/hostname`;
+ # chomp removes only a trailing newline; chop would eat a real character
+ # if the output ever came without one.
+ chomp($hostname);
+ # Hyphens are legal in hostnames, so keep them (same filter as in main).
+ $hostname=~s/[^\w\.\-]//g;
+ if ($hostname) {
+ $from = 'www@'.$hostname;
+ }
+ my $subj="LON: $perlvar{'lonHostID'} E:$errors W:$warnings N:$notices";
+ my $loncronmail = "To: $emailto\n";
+ # Leave the From header out altogether when no hostname is available,
+ # rather than sending an empty one.
+ if (defined($from)) {
+ $loncronmail .= "From: $from\n";
+ }
+ $loncronmail .= "Subject: ".$subj."\n".
+ "Content-type: text/html\; charset=UTF-8\n".
+ "MIME-Version: 1.0\n\n";
+ if (open(my $fh,'<',"$statusdir/index.html")) {
+ while (<$fh>) {
+ $loncronmail .= $_;
+ }
+ close($fh);
+ } else {
+ $loncronmail .= "Failed to read from http://$hostname/lon-status/index.html\n";
+ }
+ $loncronmail .= "\n\n";
+ if (open(my $mailh, "|/usr/lib/sendmail -oi -t -odb")) {
+ print $mailh $loncronmail;
+ close($mailh);
+ print "Sending mail.\n";
+ } else {
+ print "Sending mail failed.\n";
}
}
sub usage {
print(< \$help,
"justcheckdaemons" => \$justcheckdaemons,
"noemail" => \$noemail,
"justcheckconnections" => \$justcheckconnections,
- "justreload" => \$justreload
+ "justreload" => \$justreload,
+ "justiptables" => \$justiptables
);
if ($help) { &usage(); return; }
# --------------------------------- Read loncapa_apache.conf and loncapa.conf
@@ -623,36 +1347,74 @@ sub main () {
undef $perlvarref;
delete $perlvar{'lonReceipt'}; # remove since sensitive and not needed
delete $perlvar{'lonSqlAccess'}; # remove since sensitive and not needed
-
+ chdir($perlvar{'lonDaemons'});
# --------------------------------------- Make sure that LON-CAPA is configured
# I only test for one thing here (lonHostID). This is just a safeguard.
if ('{[[[[lonHostID]]]]}' eq $perlvar{'lonHostID'}) {
print("Unconfigured machine.\n");
my $emailto=$perlvar{'lonSysEMail'};
- my $hostname=`/bin/hostname`;
- chop $hostname;
- $hostname=~s/[^\w\.]//g; # make sure is safe to pass through shell
+ my $hostname = Sys::Hostname::FQDN::fqdn();
+ $hostname=~s/\.+/./g;
+ $hostname=~s/\-+/-/g;
+ $hostname=~s/[^\w\.-]//g; # make sure is safe to pass through shell
my $subj="LON: Unconfigured machine $hostname";
- system("echo 'Unconfigured machine $hostname.' |\
- mailto $emailto -s '$subj' > /dev/null");
+ system("echo 'Unconfigured machine $hostname.' |".
+ " mail -s '$subj' $emailto > /dev/null");
exit 1;
}
# ----------------------------- Make sure this process is running from user=www
my $wwwid=getpwnam('www');
if ($wwwid!=$<) {
- print("User ID mismatch. This program must be run as user 'www'\n");
+ print("User ID mismatch. This program must be run as user 'www'.\n");
my $emailto="$perlvar{'lonAdmEMail'},$perlvar{'lonSysEMail'}";
my $subj="LON: $perlvar{'lonHostID'} User ID mismatch";
- system("echo 'User ID mismatch. loncron must be run as user www.' |\
- mailto $emailto -s '$subj' > /dev/null");
+ system("echo 'User ID mismatch. loncron must be run as user www.' |".
+ " mail -s '$subj' $emailto > /dev/null");
exit 1;
}
# -------------------------------------------- Force reload of host information
- &Apache::lonnet::load_hosts_tab(1);
- &Apache::lonnet::load_domain_tab(1);
- &Apache::lonnet::get_iphost(1);
+ my $nomemcache;
+ if ($justcheckdaemons) {
+ $nomemcache=1;
+ my $memcachepidfile="$perlvar{'lonDaemons'}/logs/memcached.pid";
+ my $memcachepid;
+ if (-e $memcachepidfile) {
+ my $memfh=IO::File->new($memcachepidfile);
+ $memcachepid=<$memfh>;
+ chomp($memcachepid);
+ if ($memcachepid =~ /^\d+$/ && kill 0 => $memcachepid) {
+ undef($nomemcache);
+ }
+ }
+ }
+ if (!$justiptables) {
+ &Apache::lonnet::load_hosts_tab(1,$nomemcache);
+ &Apache::lonnet::load_domain_tab(1,$nomemcache);
+ &Apache::lonnet::get_iphost(1,$nomemcache);
+ }
+
+# ----------------------------------------- Force firewall update for lond port
+
+ if ((!$justcheckdaemons) && (!$justreload)) {
+ my $now = time;
+ my $tmpfile = $perlvar{'lonDaemons'}.'/tmp/lciptables_iphost_'.
+ $now.$$.int(rand(10000));
+ if (open(my $fh,">$tmpfile")) {
+ my %iphosts = &Apache::lonnet::get_iphost();
+ foreach my $key (keys(%iphosts)) {
+ print $fh "$key\n";
+ }
+ close($fh);
+ if (&LONCAPA::try_to_lock('/tmp/lock_lciptables')) {
+ my $execpath = $perlvar{'lonDaemons'}.'/lciptables';
+ system("$execpath $tmpfile");
+ unlink('/tmp/lock_lciptables'); # Remove the lock file.
+ }
+ unlink($tmpfile);
+ }
+ }
# ---------------------------------------------------------------- Start report
@@ -660,27 +1422,29 @@ sub main () {
$warnings=0;
$notices=0;
-
+
my $fh;
- if (!$justcheckdaemons && !$justcheckconnections && !$justreload) {
+ if (!$justcheckdaemons && !$justcheckconnections && !$justreload && !$justiptables) {
$fh=&start_logging();
&log_machine_info($fh);
&clean_tmp($fh);
&clean_lonIDs($fh);
+ &clean_balanceIDs($fh);
+ &clean_webDAV_sessionIDs($fh);
&check_httpd_logs($fh);
&rotate_lonnet_logs($fh);
&rotate_other_logs($fh);
}
- if (!$justcheckconnections && !$justreload) {
+ if (!$justcheckconnections && !$justreload && !$justiptables) {
+ &checkon_daemon($fh,'lonmemcached',40000);
&checkon_daemon($fh,'lonsql',200000);
if ( &checkon_daemon($fh,'lond',40000,'USR1') eq 'running') {
&checkon_daemon($fh,'lond',40000,'USR2');
}
&checkon_daemon($fh,'lonc',40000,'USR1');
- &checkon_daemon($fh,'lonhttpd',40000);
- &checkon_daemon($fh,'lonmemcached',40000);
&checkon_daemon($fh,'lonmaxima',40000);
+ &checkon_daemon($fh,'lonr',40000);
}
if ($justreload) {
&checkon_daemon($fh,'lond',40000,'USR2');
@@ -689,22 +1453,21 @@ sub main () {
if ($justcheckconnections) {
&test_connections($fh);
}
- if (!$justcheckdaemons && !$justcheckconnections && !$justreload) {
- &check_delayed_msg($fh);
- &finish_logging($fh);
+ if (!$justcheckdaemons && !$justcheckconnections && !$justreload && !$justiptables) {
+ my $domconf = &get_domain_config();
+ my ($threshold,$sysmail,$reportstatus,$weightsref,$exclusionsref) =
+ &get_permcount_settings($domconf);
+ &check_delayed_msg($fh,$weightsref,$exclusionsref);
+ &finish_logging($fh,$weightsref);
&log_simplestatus();
-
- if ($totalcount>200 && !$noemail) { &send_mail(); }
+ &write_loncaparevs();
+ &write_serverhomeIDs();
+ &write_checksums();
+ &write_hostips();
+ if ($totalcount>$threshold && !$noemail) { &send_mail($sysmail,$reportstatus); }
}
}
&main();
1;
-
-
-
-
-
-
-