--- loncom/loncron 2018/11/06 15:37:42 1.113 +++ loncom/loncron 2018/11/18 22:50:46 1.114 @@ -2,7 +2,7 @@ # Housekeeping program, started by cron, loncontrol and loncron.pl # -# $Id: loncron,v 1.113 2018/11/06 15:37:42 raeburn Exp $ +# $Id: loncron,v 1.114 2018/11/18 22:50:46 raeburn Exp $ # # Copyright Michigan State University Board of Trustees # @@ -43,6 +43,7 @@ use HTML::Entities; use Getopt::Long; use GDBM_File; use Storable qw(thaw); +use File::ReadBackwards; #globals use vars qw (%perlvar %simplestatus $errors $warnings $notices $totalcount); @@ -646,35 +647,199 @@ sub test_connections { # ------------------------------------------------------------ Delayed messages sub check_delayed_msg { - my ($fh)=@_; + my ($fh,$weightsref,$exclusionsref)=@_; &log($fh,'
Total unsend messages: $unsend
\n"); - if ($unsend > 0) { - $warnings=$warnings+5*$unsend; +# +# For LON-CAPA 1.2.0 to 2.1.3 (release dates: 8/31/2004 and 3/31/2006) any +# entry logged in lonnet.perm.log for completion of a delayed (critical) +# transaction lacked the hostID for the remote node to which the command +# to be completed was sent. +# +# Because of this, exclusion of items in lonnet.perm.log for nodes which are +# no longer part of the cluster from adding to the overall "unsend" count +# needs additional effort besides the changes made in loncron rev. 1.105. +# +# For "S" (completion) events logging in LON-CAPA 1.2.0 through 2.1.3 included +# "LondTransaction=HASH(hexadecimal)->getClient() :$cmd, where the hexadecimal +# is a memory location, and $cmd is the command sent to the remote node. +# +# Starting with 2.2.0 (released 8/21/2006) logging for "S" (completion) events +# had sethost:$host_id:$cmd after LondTransaction=HASH(hexadecimal)->getClient() +# +# Starting with 2.4.1 (released 6/13/2007) logging for "S" replaced echoing the +# getClient() call with the result of the Transaction->getClient() call itself +# undef for completion of delivery of a delayed message. +# +# The net effect of these changes is that lonnet.perm.log is now accessed three +# times: (a) oldest record is checked, if earlier than release date for 2.5.0 +# then (b) file is read backwards, with timestamp recorded for most recent +# instance of logged "S" event for "update" command without "sethost:$host_id:" +# then (c) file is read forward with records ignored which predate the timestamp +# recorded in (b), if one was found. +# +# In (c), when calculating the unsend total, i.e., the difference between delayed +# transactions ("D") and sent transactions ("S"), transactions are ignored if the +# target node is no longer in the cluster, and also (for "update" commands), if +# the target node is in the list of nodes excluded from the count, in the domain +# configuration for this machine's default domain. The idea here is to remove +# delayed "update" commands for nodes for which inbound access to port 5663, +# is blocked, but are still part of the LON-CAPA network, (i.e., they can still +# replicate content from other nodes). +# + + my $dfh=IO::File->new("$perlvar{'lonDaemons'}/logs/lonnet.perm.log","r"); + if (defined($dfh)) { + while (my $line=<$dfh>) { + my ($time,$sdf,$rest)=split(/:/,$line,3); + if ($time < 1541185772) { + $checkbackwards = 1; + } + last; + } + undef $dfh; + } + + if ($checkbackwards) { + if (tie *BW, 'File::ReadBackwards', "$perlvar{'lonDaemons'}/logs/lonnet.perm.log") { + while(my $line=Unsend messages by node, active (undegraded) nodes in cluster
\n"); + foreach my $key (sort(keys(%active))) { + &log($fh,&encode_entities("$key => $active{$key}",'<>&"')."\n"); + } + } + &log($fh,"Total unsend messages: $unsend for ".scalar(keys(%active))." active (undegraded) nodes in cluster.
\n"); + if (keys(%exclusions) > 0) { + &log($fh,"Total incomplete updates $ignored for ".scalar(keys(%exclusions))." degraded nodes in cluster.
\n"); + } + if ($retired) { + &log($fh,"Total unsent $nodest for $retired nodes no longer in cluster.
\n"); + } + if ($unsend > 0) { + $warnings=$warnings+$weights{'U'}*$unsend; + } } if ($unsend) { $simplestatus{'unsend'}=$unsend; } @@ -713,7 +878,11 @@ sub check_delayed_msg { } sub finish_logging { - my ($fh,%weights)=@_; + my ($fh,$weightsref)=@_; + my %weights; + if (ref($weightsref) eq 'HASH') { + %weights = %{$weightsref}; + } &log($fh,"\n"); $totalcount=($weights{'N'}*$notices)+($weights{'W'}*$warnings)+($weights{'E'}*$errors); &errout($fh); @@ -1086,6 +1255,62 @@ sub reset_nosslverify_pids { return; } +sub get_permcount_settings { + my ($domconf) = @_; + my ($defaults,$names) = &Apache::loncommon::lon_status_items(); + my (%weights,$threshold,$sysmail,$reportstatus,%exclusions); + foreach my $type ('E','W','N','U') { + $weights{$type} = $defaults->{$type}; + } + $threshold = $defaults->{'threshold'}; + $sysmail = $defaults->{'sysmail'}; + $reportstatus = 1; + if (ref($domconf) eq 'HASH') { + if (ref($domconf->{'contacts'}) eq 'HASH') { + if ($domconf->{'contacts'}{'reportstatus'} == 0) { + $reportstatus = 0; + } + if (ref($domconf->{'contacts'}{'lonstatus'}) eq 'HASH') { + if (ref($domconf->{'contacts'}{'lonstatus'}{weights}) eq 'HASH') { + foreach my $type ('E','W','N','U') { + if (exists($domconf->{'contacts'}{'lonstatus'}{weights}{$type})) { + $weights{$type} = $domconf->{'contacts'}{'lonstatus'}{weights}{$type}; + } + } + } + if (ref($domconf->{'contacts'}{'lonstatus'}{'excluded'}) eq 'ARRAY') { + my @excluded = @{$domconf->{'contacts'}{'lonstatus'}{'excluded'}}; + if (@excluded) { + map { $exclusions{$_} = 1; } @excluded; + } + } + if (exists($domconf->{'contacts'}{'lonstatus'}{'threshold'})) { + $threshold = $domconf->{'contacts'}{'lonstatus'}{'threshold'}; + } + if (exists($domconf->{'contacts'}{'lonstatus'}{'sysmail'})) { + $sysmail = $domconf->{'contacts'}{'lonstatus'}{'sysmail'}; + } + } + } + } + return ($threshold,$sysmail,$reportstatus,\%weights,\%exclusions); +} + +sub read_serverhomeIDs { + my %server; + if (-e "$perlvar{'lonTabDir'}/serverhomeIDs.tab") { + if (open(my $fh,'<',"$perlvar{'lonTabDir'}/serverhomeIDs.tab")) { + while (<$fh>) { + my($host,$id) = split(/:/); + chomp($id); + $server{$host} = $id; + } + close($fh); + } + } + return %server; +} + sub send_mail { my ($sysmail,$reportstatus) = @_; my $defdom = $perlvar{'lonDefDomain'}; @@ -1273,8 +1498,10 @@ sub main () { &test_connections($fh); } if (!$justcheckdaemons && !$justcheckconnections && !$justreload) { - &check_delayed_msg($fh); - &log_simplestatus(); + my $domconf = &get_domain_config(); + my ($threshold,$sysmail,$reportstatus,$weightsref,$exclusionsref) = + &get_permcount_settings($domconf); + &check_delayed_msg($fh,$weightsref,$exclusionsref); &write_loncaparevs(); &write_serverhomeIDs(); &write_checksums(); @@ -1287,37 +1514,9 @@ sub main () { &checkon_daemon($fh,'lond',40000,'USR2'); &reset_nosslverify_pids($fh,%sslrem); } - my $domconf = &get_domain_config(); - my ($defaults,$names) = &Apache::loncommon::lon_status_items(); - my (%weights,$threshold); - foreach my $type ('E','W','N') { - $weights{$type} = $defaults->{$type}; - } - my $threshold = $defaults->{'threshold'}; - my $sysmail = $defaults->{'sysmail'}; - my $reportstatus = 1; - if (ref($domconf->{'contacts'}) eq 'HASH') { - if ($domconf->{'contacts'}{'reportstatus'} == 0) { - $reportstatus = 0; - } - if (ref($domconf->{'contacts'}{'lonstatus'}) eq 'HASH') { - if (ref($domconf->{'contacts'}{'lonstatus'}{weights}) eq 'HASH') { - foreach my $type ('E','W','N') { - if (exists($domconf->{'contacts'}{'lonstatus'}{weights}{$type})) { - $weights{$type} = $domconf->{'contacts'}{'lonstatus'}{weights}{$type}; - } - } - } - } - if (exists($domconf->{'contacts'}{'lonstatus'}{'threshold'})) { - $threshold = $domconf->{'contacts'}{'lonstatus'}{'threshold'}; - } - if (exists($domconf->{'contacts'}{'lonstatus'}{'sysmail'})) { - $sysmail = $domconf->{'contacts'}{'lonstatus'}{'sysmail'}; - } - } - &finish_logging($fh,%weights); - if ($totalcount>$threshold && !$noemail) { &send_mail($sysmail,$reportstatus); } + &finish_logging($fh,$weightsref); + &log_simplestatus(); + if ($totalcount>$threshold && !$noemail) { &send_mail($sysmail,$reportstatus); } } }