--- loncom/loncron 2015/09/20 18:31:21 1.103 +++ loncom/loncron 2021/01/30 22:25:53 1.103.2.9 @@ -2,7 +2,7 @@ # Housekeeping program, started by cron, loncontrol and loncron.pl # -# $Id: loncron,v 1.103 2015/09/20 18:31:21 raeburn Exp $ +# $Id: loncron,v 1.103.2.9 2021/01/30 22:25:53 raeburn Exp $ # # Copyright Michigan State University Board of Trustees # @@ -37,10 +37,16 @@ use LONCAPA; use Apache::lonnet; use Apache::loncommon; +use LWP::UserAgent(); +use HTTP::Request(); use IO::File; use IO::Socket; use HTML::Entities; use Getopt::Long; +use File::ReadBackwards; +use File::Copy; +use Sys::Hostname::FQDN(); + #globals use vars qw (%perlvar %simplestatus $errors $warnings $notices $totalcount); @@ -73,14 +79,14 @@ sub rotate_logfile { rename("$file.2","$file.3"); rename("$file.1","$file.2"); rename("$file","$file.1"); - } + } } sub start_daemon { my ($fh,$daemon,$pidfile,$args) = @_; my $progname=$daemon; if ($daemon eq 'lonc') { - $progname='loncnew'; + $progname='loncnew'; } my $error_fname="$perlvar{'lonDaemons'}/logs/${daemon}_errors"; &rotate_logfile($error_fname,$fh,'error logs'); @@ -111,20 +117,23 @@ sub checkon_daemon { my $result; &log($fh,'
');
printf("%-15s ",$daemon);
- if (-e "$perlvar{'lonDaemons'}/logs/$daemon.log"){
- open (DFH,"tail -n25 $perlvar{'lonDaemons'}/logs/$daemon.log|");
- while (my $line=
Unable to start $daemon
"); } } - - if (-e "$perlvar{'lonDaemons'}/logs/$daemon.log"){ - &log($fh,""); - open (DFH,"tail -n100 $perlvar{'lonDaemons'}/logs/$daemon.log|"); - while (my $line="); - } + if ($fh) { + if (-e "$perlvar{'lonDaemons'}/logs/$daemon.log"){ + &log($fh,") { - &log($fh,"$line"); - if ($line=~/WARNING/) { $notices++; } - if ($line=~/CRITICAL/) { $notices++; } - }; - close (DFH); - &log($fh,"
"); + if (open(DFH,"tail -n100 $perlvar{'lonDaemons'}/logs/$daemon.log|")) { + while (my $line="); + } + } } - + my $fname="$perlvar{'lonDaemons'}/logs/$daemon.log"; &rotate_logfile($fname,$fh,'logs'); @@ -215,13 +226,13 @@ sub log_machine_info { my ($fh)=@_; &log($fh,') { + &log($fh,"$line"); + if ($line=~/WARNING/) { $notices++; } + if ($line=~/CRITICAL/) { $notices++; } + } + close (DFH); + } + &log($fh,"
"); open (DFH,"df|"); - while (my $line=) { - &log($fh,&encode_entities($line,'<>&"')); + while (my $line= ) { + &log($fh,&encode_entities($line,'<>&"')); @parts=split(/\s+/,$line); my $usage=$parts[4]; $usage=~s/\W//g; - if ($usage>90) { + if ($usage>90) { $warnings++; - $notices++; + $notices++; } elsif ($usage>80) { $warnings++; } elsif ($usage>60) { @@ -259,8 +270,8 @@ sub log_machine_info { my $psproc=0; open (PSH,"ps aux --cols 140 |"); - while (my $line= ) { - &log($fh,&encode_entities($line,'<>&"')); + while (my $line= ) { + &log($fh,&encode_entities($line,'<>&"')); $psproc++; } close (PSH); @@ -282,7 +293,7 @@ sub start_logging { my %simplestatus=(); my $now=time; my $date=localtime($now); - + &log($fh,(< @@ -398,12 +409,12 @@ sub recursive_clean_tmp { ($cleaned,$old,$removed) = &recursive_clean_tmp($innerdir,$cleaned,$old,$removed,$errors); my @doms = &Apache::lonnet::current_machine_domains(); - + if (open(my $dirhandle,$fname)) { unless (($innerdir eq 'helprequests') || (($innerdir =~ /^addcourse/) && ($innerdir !~ m{/\d+$}))) { my @contents = grep {!/^\.\.?$/} readdir($dirhandle); - join('&&',@contents)."\n"; + join('&&',@contents)."\n"; if (scalar(grep {!/^\.\.?$/} readdir($dirhandle)) == 0) { closedir($dirhandle); if ($fname =~ m{^\Q$perlvar{'lonDaemons'}\E/tmp/}) { @@ -456,7 +467,7 @@ sub recursive_clean_tmp { } } } elsif (ref($errors->{failopen}) eq 'ARRAY') { - push(@{$errors->{failopen}},$fname); + push(@{$errors->{failopen}},$fname); } } else { if (unlink($fname)) { @@ -482,24 +493,71 @@ sub clean_lonIDs { my $cleaned=0; my $active=0; while (my $fname=<$perlvar{'lonIDsDir'}/*>) { - my ($dev,$ino,$mode,$nlink, - $uid,$gid,$rdev,$size, - $atime,$mtime,$ctime, - $blksize,$blocks)=stat($fname); - my $now=time; - my $since=$now-$mtime; - if ($since>$perlvar{'lonExpire'}) { - $cleaned++; - &log($fh,"Unlinking $fname
"); - unlink("$fname"); - } else { - $active++; - } + my $now=time; + if (-l $fname) { + my $linkfname = readlink($fname); + if (-f $linkfname) { + if ($linkfname =~ m{^$perlvar{'lonIDsDir'}/[^/]+\.id$}) { + my @data = stat($linkfname); + my $mtime = $data[9]; + my $since=$now-$mtime; + if ($since>$perlvar{'lonExpire'}) { + if (unlink($linkfname)) { + $cleaned++; + &log($fh,"Unlinking $linkfname
"); + unlink($fname); + } + } + } + } else { + unlink($fname); + } + } elsif (-f $fname) { + my @data = stat($fname); + my $mtime = $data[9]; + my $since=$now-$mtime; + if ($since>$perlvar{'lonExpire'}) { + if (unlink($fname)) { + $cleaned++; + &log($fh,"Unlinking $fname
"); + } + } else { + $active++; + } + } } &log($fh,"Cleaned up ".$cleaned." stale session token(s).
"); &log($fh,"$active open session(s)
"); } +# -------------------------------------------------------- clean out balanceIDs + +sub clean_balanceIDs { + my ($fh)=@_; + &log($fh,'Session Tokens
'); + my $cleaned=0; + my $active=0; + if (-d $perlvar{'lonBalanceDir'}) { + while (my $fname=<$perlvar{'lonBalanceDir'}/*.id>) { + my ($dev,$ino,$mode,$nlink, + $uid,$gid,$rdev,$size, + $atime,$mtime,$ctime, + $blksize,$blocks)=stat($fname); + my $now=time; + my $since=$now-$mtime; + if ($since>$perlvar{'lonExpire'}) { + $cleaned++; + &log($fh,"Unlinking $fname
"); + unlink("$fname"); + } else { + $active++; + } + } + } + &log($fh,"Cleaned up ".$cleaned." stale balancer files
"); + &log($fh,"$active unexpired balancer files
"); +} + # ------------------------------------------------ clean out webDAV Session IDs sub clean_webDAV_sessionIDs { my ($fh)=@_; @@ -563,16 +621,16 @@ sub rotate_lonnet_logs { print "Checking logs.\n"; if (-e "$perlvar{'lonDaemons'}/logs/lonnet.log"){ open (DFH,"tail -n50 $perlvar{'lonDaemons'}/logs/lonnet.log|"); - while (my $line=) { + while (my $line= ) { &log($fh,&encode_entities($line,'<>&"')); } close (DFH); } &log($fh,"
"); - + if (-e "$perlvar{'lonDaemons'}/logs/lonnet.perm.log") { open(DFH,"tail -n10 $perlvar{'lonDaemons'}/logs/lonnet.perm.log|"); - while (my $line=\n"); close (DFH); - my %hostname = &Apache::lonnet::all_hostnames(); - my $numhosts = scalar(keys(%hostname)); # pong to all servers that have delayed messages # this will trigger a reverse connection, which should flush the buffers foreach my $tryserver (sort(keys(%servers))) { @@ -699,9 +925,13 @@ sub check_delayed_msg { } sub finish_logging { - my ($fh)=@_; + my ($fh,$weightsref)=@_; + my %weights; + if (ref($weightsref) eq 'HASH') { + %weights = %{$weightsref}; + } &log($fh,"\n"); - $totalcount=$notices+4*$warnings+100*$errors; + $totalcount=($weights{'N'}*$notices)+($weights{'W'}*$warnings)+($weights{'E'}*$errors); &errout($fh); &log($fh,") { + while (my $line= ) { &log($fh,&encode_entities($line,'<>&"')); } close (DFH); @@ -636,29 +694,199 @@ sub test_connections { # ------------------------------------------------------------ Delayed messages sub check_delayed_msg { - my ($fh)=@_; + my ($fh,$weightsref,$exclusionsref)=@_; &log($fh,' Delayed Messages
'); print "Checking buffers.\n"; &log($fh,'Scanning Permanent Log
'); my $unsend=0; + my $ignored=0; - my $dfh=IO::File->new("$perlvar{'lonDaemons'}/logs/lonnet.perm.log"); - while (my $line=<$dfh>) { - my ($time,$sdf,$dserv,$dcmd)=split(/:/,$line); - if ($sdf eq 'F') { - my $local=localtime($time); - &log($fh,"Failed: $time, $dserv, $dcmd
"); - $warnings++; - } - if ($sdf eq 'S') { $unsend--; } - if ($sdf eq 'D') { $unsend++; } + my %hostname = &Apache::lonnet::all_hostnames(); + my $numhosts = scalar(keys(%hostname)); + my $checkbackwards = 0; + my $checkfrom = 0; + my $checkexcluded = 0; + my (%bymachine,%weights,%exclusions,%serverhomes); + if (ref($weightsref) eq 'HASH') { + %weights = %{$weightsref}; + } + if (ref($exclusionsref) eq 'HASH') { + %exclusions = %{$exclusionsref}; + if (keys(%exclusions)) { + $checkexcluded = 1; + %serverhomes = &read_serverhomeIDs(); + } } - &log($fh,"Total unsend messages: $unsend
\n"); - if ($unsend > 0) { - $warnings=$warnings+5*$unsend; +# +# For LON-CAPA 1.2.0 to 2.1.3 (release dates: 8/31/2004 and 3/31/2006) any +# entry logged in lonnet.perm.log for completion of a delayed (critical) +# transaction lacked the hostID for the remote node to which the command +# to be completed was sent. +# +# Because of this, exclusion of items in lonnet.perm.log for nodes which are +# no longer part of the cluster from adding to the overall "unsend" count +# needs additional effort besides the changes made in loncron rev. 1.105. +# +# For "S" (completion) events logging in LON-CAPA 1.2.0 through 2.1.3 included +# "LondTransaction=HASH(hexadecimal)->getClient() :$cmd, where the hexadecimal +# is a memory location, and $cmd is the command sent to the remote node. +# +# Starting with 2.2.0 (released 8/21/2006) logging for "S" (completion) events +# had sethost:$host_id:$cmd after LondTransaction=HASH(hexadecimal)->getClient() +# +# Starting with 2.4.1 (released 6/13/2007) logging for "S" replaced echoing the +# getClient() call with the result of the Transaction->getClient() call itself +# undef for completion of delivery of a delayed message. +# +# The net effect of these changes is that lonnet.perm.log is now accessed three +# times: (a) oldest record is checked, if earlier than release date for 2.5.0 +# then (b) file is read backwards, with timestamp recorded for most recent +# instance of logged "S" event for "update" command without "sethost:$host_id:" +# then (c) file is read forward with records ignored which predate the timestamp +# recorded in (b), if one was found. +# +# In (c), when calculating the unsend total, i.e., the difference between delayed +# transactions ("D") and sent transactions ("S"), transactions are ignored if the +# target node is no longer in the cluster, and also (for "update" commands), if +# the target node is in the list of nodes excluded from the count, in the domain +# configuration for this machine's default domain. The idea here is to remove +# delayed "update" commands for nodes for which inbound access to port 5663, +# is blocked, but are still part of the LON-CAPA network, (i.e., they can still +# replicate content from other nodes). +# + + my $dfh=IO::File->new("$perlvar{'lonDaemons'}/logs/lonnet.perm.log","r"); + if (defined($dfh)) { + while (my $line=<$dfh>) { + my ($time,$sdf,$rest)=split(/:/,$line,3); + if ($time < 1541185772) { + $checkbackwards = 1; + } + last; + } + undef $dfh; + } + + if ($checkbackwards) { + if (tie *BW, 'File::ReadBackwards', "$perlvar{'lonDaemons'}/logs/lonnet.perm.log") { + while(my $line=) { + if ($line =~ /\QLondTransaction=HASH\E[^:]+:update:/) { + ($checkfrom) = split(/:/,$line,2); + last; + } + } + close(BW); + } + } + $dfh=IO::File->new("$perlvar{'lonDaemons'}/logs/lonnet.perm.log","r"); + if (defined($dfh)) { + while (my $line=<$dfh>) { + my ($time,$sdf,$rest)=split(/:/,$line,3); + next unless (($sdf eq 'F') || ($sdf eq 'S') || ($sdf eq 'D')); + next if (($checkfrom) && ($time <= $checkfrom)); + my ($dserv,$dcmd); + if ($sdf eq 'S') { + my ($serva,$cmda,$servb,$cmdb) = split(/:/,$rest); + if ($cmda eq 'sethost') { + chomp($cmdb); + $dcmd = $cmdb; + } else { + $dcmd = $cmda; + } + if (($serva =~ /^LondTransaction/) || ($serva eq '')) { + unless (($servb eq '') || ($servb =~ m{^/})) { + $dserv = $servb; + } + } else { + $dserv = $serva; + } + } else { + ($dserv,$dcmd) = split(/:/,$rest); + } + if ($sdf eq 'F') { + my $local=localtime($time); + &log($fh,"Failed: $time, $dserv, $dcmd
"); + $warnings++; + } + next if ((($dserv eq '') || ($dcmd eq '')) && ($sdf ne 'F')); + if ($sdf eq 'S') { + if ($dcmd eq 'update') { + if ($hostname{$dserv}) { + if ($exclusions{$serverhomes{$hostname{$dserv}}}) { + $ignored --; + } else { + $unsend --; + } + } + if (exists($bymachine{$dserv})) { + $bymachine{$dserv} --; + } else { + $bymachine{$dserv} = -1; + } + } else { + if ($hostname{$dserv}) { + $unsend --; + } + } + } elsif ($sdf eq 'D') { + if ($dcmd eq 'update') { + if ($hostname{$dserv}) { + if ($exclusions{$serverhomes{$hostname{$dserv}}}) { + $ignored ++; + } else { + $unsend ++; + } + } + if (exists($bymachine{$dserv})) { + $bymachine{$dserv} ++; + } else { + $bymachine{$dserv} = 1; + } + } else { + if ($hostname{$dserv}) { + $unsend ++; + } + } + } + } + undef $dfh; + my $nodest = 0; + my $retired = 0; + my %active; + if (keys(%bymachine)) { + unless ($checkexcluded) { + %serverhomes = &read_serverhomeIDs(); + } + foreach my $key (keys(%bymachine)) { + if ($bymachine{$key} > 0) { + if ($hostname{$key}) { + $active{$serverhomes{$hostname{$key}}} += $bymachine{$key}; + } else { + $retired ++; + $nodest += $bymachine{$key}; + } + } + } + } + if (keys(%active)) { + &log($fh,"Unsend messages by node, active (undegraded) nodes in cluster
\n"); + foreach my $key (sort(keys(%active))) { + &log($fh,&encode_entities("$key => $active{$key}",'<>&"')."\n"); + } + } + &log($fh,"Total unsend messages: $unsend for ".scalar(keys(%active))." active (undegraded) nodes in cluster.
\n"); + if (keys(%exclusions) > 0) { + &log($fh,"Total incomplete updates $ignored for ".scalar(keys(%exclusions))." degraded nodes in cluster.
\n"); + } + if ($retired) { + &log($fh,"Total unsent $nodest for $retired nodes no longer in cluster.
\n"); + } + if ($unsend > 0) { + $warnings=$warnings+$weights{'U'}*$unsend; + } } if ($unsend) { $simplestatus{'unsend'}=$unsend; } @@ -673,8 +901,6 @@ sub check_delayed_msg { } &log($fh,"