--- loncom/loncron 2018/10/25 02:48:56 1.109
+++ loncom/loncron 2018/11/18 22:50:46 1.114
@@ -2,7 +2,7 @@
# Housekeeping program, started by cron, loncontrol and loncron.pl
#
-# $Id: loncron,v 1.109 2018/10/25 02:48:56 raeburn Exp $
+# $Id: loncron,v 1.114 2018/11/18 22:50:46 raeburn Exp $
#
# Copyright Michigan State University Board of Trustees
#
@@ -43,6 +43,7 @@ use HTML::Entities;
use Getopt::Long;
use GDBM_File;
use Storable qw(thaw);
+use File::ReadBackwards;
#globals
use vars qw (%perlvar %simplestatus $errors $warnings $notices $totalcount);
@@ -113,17 +114,20 @@ sub checkon_daemon {
my $result;
&log($fh,'
'.$daemon.'
Log
');
printf("%-15s ",$daemon);
- if (-e "$perlvar{'lonDaemons'}/logs/$daemon.log"){
- open (DFH,"tail -n25 $perlvar{'lonDaemons'}/logs/$daemon.log|");
- while (my $line=) {
- &log($fh,"$line");
- if ($line=~/INFO/) { $notices++; }
- if ($line=~/WARNING/) { $notices++; }
- if ($line=~/CRITICAL/) { $warnings++; }
- };
- close (DFH);
+ if ($fh) {
+ if (-e "$perlvar{'lonDaemons'}/logs/$daemon.log"){
+ if (open(DFH,"tail -n25 $perlvar{'lonDaemons'}/logs/$daemon.log|")) {
+ while (my $line=) {
+ &log($fh,"$line");
+ if ($line=~/INFO/) { $notices++; }
+ if ($line=~/WARNING/) { $notices++; }
+ if ($line=~/CRITICAL/) { $warnings++; }
+ }
+ close (DFH);
+ }
+ }
+ &log($fh,"
");
}
- &log($fh,"");
my $pidfile="$perlvar{'lonDaemons'}/logs/$daemon.pid";
@@ -169,7 +173,7 @@ sub checkon_daemon {
&clean_lonc_childpids();
}
&log($fh,"$daemon not running, trying to start
");
-
+
if (&start_daemon($fh,$daemon,$pidfile,$args)) {
&log($fh,"$daemon at pid $daemonpid responding
");
$simplestatus{$daemon}='restarted';
@@ -194,17 +198,19 @@ sub checkon_daemon {
&log($fh,"Unable to start $daemon
");
}
}
-
- if (-e "$perlvar{'lonDaemons'}/logs/$daemon.log"){
- &log($fh,"");
- open (DFH,"tail -n100 $perlvar{'lonDaemons'}/logs/$daemon.log|");
- while (my $line=) {
- &log($fh,"$line");
- if ($line=~/WARNING/) { $notices++; }
- if ($line=~/CRITICAL/) { $notices++; }
- };
- close (DFH);
- &log($fh,"
");
+ if ($fh) {
+ if (-e "$perlvar{'lonDaemons'}/logs/$daemon.log"){
+ &log($fh,"");
+ if (open(DFH,"tail -n100 $perlvar{'lonDaemons'}/logs/$daemon.log|")) {
+ while (my $line=) {
+ &log($fh,"$line");
+ if ($line=~/WARNING/) { $notices++; }
+ if ($line=~/CRITICAL/) { $notices++; }
+ }
+ close (DFH);
+ }
+ &log($fh,"
");
+ }
}
}
@@ -641,35 +647,199 @@ sub test_connections {
# ------------------------------------------------------------ Delayed messages
sub check_delayed_msg {
- my ($fh)=@_;
+ my ($fh,$weightsref,$exclusionsref)=@_;
&log($fh,'
Delayed Messages
');
print "Checking buffers.\n";
&log($fh,'Scanning Permanent Log
');
my $unsend=0;
+ my $ignored=0;
my %hostname = &Apache::lonnet::all_hostnames();
my $numhosts = scalar(keys(%hostname));
-
- my $dfh=IO::File->new("$perlvar{'lonDaemons'}/logs/lonnet.perm.log");
- while (my $line=<$dfh>) {
- my ($time,$sdf,$dserv,$dcmd)=split(/:/,$line);
- if ($numhosts) {
- next unless ($hostname{$dserv});
- }
- if ($sdf eq 'F') {
- my $local=localtime($time);
- &log($fh,"Failed: $time, $dserv, $dcmd
");
- $warnings++;
- }
- if ($sdf eq 'S') { $unsend--; }
- if ($sdf eq 'D') { $unsend++; }
+ my $checkbackwards = 0;
+ my $checkfrom = 0;
+ my $checkexcluded = 0;
+ my (%bymachine,%weights,%exclusions,%serverhomes);
+ if (ref($weightsref) eq 'HASH') {
+ %weights = %{$weightsref};
+ }
+ if (ref($exclusionsref) eq 'HASH') {
+ %exclusions = %{$exclusionsref};
+ if (keys(%exclusions)) {
+ $checkexcluded = 1;
+ %serverhomes = &read_serverhomeIDs();
+ }
}
- &log($fh,"Total unsend messages: $unsend
\n");
- if ($unsend > 0) {
- $warnings=$warnings+5*$unsend;
+#
+# For LON-CAPA 1.2.0 to 2.1.3 (release dates: 8/31/2004 and 3/31/2006) any
+# entry logged in lonnet.perm.log for completion of a delayed (critical)
+# transaction lacked the hostID for the remote node to which the command
+# to be completed was sent.
+#
+# Because of this, exclusion of items in lonnet.perm.log for nodes which are
+# no longer part of the cluster from adding to the overall "unsend" count
+# needs additional effort besides the changes made in loncron rev. 1.105.
+#
+# For "S" (completion) events logging in LON-CAPA 1.2.0 through 2.1.3 included
+# "LondTransaction=HASH(hexadecimal)->getClient() :$cmd, where the hexadecimal
+# is a memory location, and $cmd is the command sent to the remote node.
+#
+# Starting with 2.2.0 (released 8/21/2006) logging for "S" (completion) events
+# had sethost:$host_id:$cmd after LondTransaction=HASH(hexadecimal)->getClient()
+#
+# Starting with 2.4.1 (released 6/13/2007) logging for "S" replaced echoing the
+# getClient() call with the result of the Transaction->getClient() call itself
+# undef for completion of delivery of a delayed message.
+#
+# The net effect of these changes is that lonnet.perm.log is now accessed three
+# times: (a) oldest record is checked, if earlier than release date for 2.5.0
+# then (b) file is read backwards, with timestamp recorded for most recent
+# instance of logged "S" event for "update" command without "sethost:$host_id:"
+# then (c) file is read forward with records ignored which predate the timestamp
+# recorded in (b), if one was found.
+#
+# In (c), when calculating the unsend total, i.e., the difference between delayed
+# transactions ("D") and sent transactions ("S"), transactions are ignored if the
+# target node is no longer in the cluster, and also (for "update" commands), if
+# the target node is in the list of nodes excluded from the count, in the domain
+# configuration for this machine's default domain. The idea here is to remove
+# delayed "update" commands for nodes for which inbound access to port 5663,
+# is blocked, but are still part of the LON-CAPA network, (i.e., they can still
+# replicate content from other nodes).
+#
+
+ my $dfh=IO::File->new("$perlvar{'lonDaemons'}/logs/lonnet.perm.log","r");
+ if (defined($dfh)) {
+ while (my $line=<$dfh>) {
+ my ($time,$sdf,$rest)=split(/:/,$line,3);
+ if ($time < 1541185772) {
+ $checkbackwards = 1;
+ }
+ last;
+ }
+ undef $dfh;
+ }
+
+ if ($checkbackwards) {
+ if (tie *BW, 'File::ReadBackwards', "$perlvar{'lonDaemons'}/logs/lonnet.perm.log") {
+ while(my $line=) {
+ if ($line =~ /\QLondTransaction=HASH\E[^:]+:update:/) {
+ ($checkfrom) = split(/:/,$line,2);
+ last;
+ }
+ }
+ close(BW);
+ }
+ }
+ $dfh=IO::File->new("$perlvar{'lonDaemons'}/logs/lonnet.perm.log","r");
+ if (defined($dfh)) {
+ while (my $line=<$dfh>) {
+ my ($time,$sdf,$rest)=split(/:/,$line,3);
+ next unless (($sdf eq 'F') || ($sdf eq 'S') || ($sdf eq 'D'));
+ next if (($checkfrom) && ($time <= $checkfrom));
+ my ($dserv,$dcmd);
+ if ($sdf eq 'S') {
+ my ($serva,$cmda,$servb,$cmdb) = split(/:/,$rest);
+ if ($cmda eq 'sethost') {
+ chomp($cmdb);
+ $dcmd = $cmdb;
+ } else {
+ $dcmd = $cmda;
+ }
+ if (($serva =~ /^LondTransaction/) || ($serva eq '')) {
+ unless (($servb eq '') || ($servb =~ m{^/})) {
+ $dserv = $servb;
+ }
+ } else {
+ $dserv = $serva;
+ }
+ } else {
+ ($dserv,$dcmd) = split(/:/,$rest);
+ }
+ if ($sdf eq 'F') {
+ my $local=localtime($time);
+ &log($fh,"Failed: $time, $dserv, $dcmd
");
+ $warnings++;
+ }
+ next if ((($dserv eq '') || ($dcmd eq '')) && ($sdf ne 'F'));
+ if ($sdf eq 'S') {
+ if ($dcmd eq 'update') {
+ if ($hostname{$dserv}) {
+ if ($exclusions{$serverhomes{$hostname{$dserv}}}) {
+ $ignored --;
+ } else {
+ $unsend --;
+ }
+ }
+ if (exists($bymachine{$dserv})) {
+ $bymachine{$dserv} --;
+ } else {
+ $bymachine{$dserv} = -1;
+ }
+ } else {
+ if ($hostname{$dserv}) {
+ $unsend --;
+ }
+ }
+ } elsif ($sdf eq 'D') {
+ if ($dcmd eq 'update') {
+ if ($hostname{$dserv}) {
+ if ($exclusions{$serverhomes{$hostname{$dserv}}}) {
+ $ignored ++;
+ } else {
+ $unsend ++;
+ }
+ }
+ if (exists($bymachine{$dserv})) {
+ $bymachine{$dserv} ++;
+ } else {
+ $bymachine{$dserv} = 1;
+ }
+ } else {
+ if ($hostname{$dserv}) {
+ $unsend ++;
+ }
+ }
+ }
+ }
+ undef $dfh;
+ my $nodest = 0;
+ my $retired = 0;
+ my %active;
+ if (keys(%bymachine)) {
+ unless ($checkexcluded) {
+ %serverhomes = &read_serverhomeIDs();
+ }
+ foreach my $key (keys(%bymachine)) {
+ if ($bymachine{$key} > 0) {
+ if ($hostname{$key}) {
+ $active{$serverhomes{$hostname{$key}}} += $bymachine{$key};
+ } else {
+ $retired ++;
+ $nodest += $bymachine{$key};
+ }
+ }
+ }
+ }
+ if (keys(%active)) {
+ &log($fh,"Unsend messages by node, active (undegraded) nodes in cluster
\n");
+ foreach my $key (sort(keys(%active))) {
+ &log($fh,&encode_entities("$key => $active{$key}",'<>&"')."\n");
+ }
+ }
+ &log($fh,"Total unsend messages: $unsend for ".scalar(keys(%active))." active (undegraded) nodes in cluster.
\n");
+ if (keys(%exclusions) > 0) {
+ &log($fh,"Total incomplete updates $ignored for ".scalar(keys(%exclusions))." degraded nodes in cluster.
\n");
+ }
+ if ($retired) {
+ &log($fh,"Total unsent $nodest for $retired nodes no longer in cluster.
\n");
+ }
+ if ($unsend > 0) {
+ $warnings=$warnings+$weights{'U'}*$unsend;
+ }
}
if ($unsend) { $simplestatus{'unsend'}=$unsend; }
@@ -708,9 +878,13 @@ sub check_delayed_msg {
}
sub finish_logging {
- my ($fh)=@_;
+ my ($fh,$weightsref)=@_;
+ my %weights;
+ if (ref($weightsref) eq 'HASH') {
+ %weights = %{$weightsref};
+ }
&log($fh,"\n");
- $totalcount=$notices+4*$warnings+100*$errors;
+ $totalcount=($weights{'N'}*$notices)+($weights{'W'}*$warnings)+($weights{'E'}*$errors);
&errout($fh);
&log($fh,"Total Error Count: $totalcount
");
my $now=time;
@@ -856,20 +1030,8 @@ sub clean_lonc_childpids {
}
sub write_connection_config {
- my ($isprimary,$domconf,$url,%connectssl,%changes);
- my $primaryLibServer = &Apache::lonnet::domain($perlvar{'lonDefDomain'},'primary');
- if ($primaryLibServer eq $perlvar{'lonHostID'}) {
- $isprimary = 1;
- } elsif ($primaryLibServer ne '') {
- my $protocol = $Apache::lonnet::protocol{$primaryLibServer};
- my $hostname = &Apache::lonnet::hostname($primaryLibServer);
- unless ($protocol eq 'https') {
- $protocol = 'http';
- }
- $url = $protocol.'://'.$hostname.'/cgi-bin/listdomconfig.pl';
- }
- my $domconf = &get_domain_config($perlvar{'lonDefDomain'},$primaryLibServer,$isprimary,
- $url);
+ my ($domconf,%connectssl,%changes);
+ $domconf = &get_domain_config();
if (ref($domconf) eq 'HASH') {
if (ref($domconf->{'ssl'}) eq 'HASH') {
foreach my $connect ('connto','connfrom') {
@@ -922,8 +1084,19 @@ sub write_connection_config {
}
sub get_domain_config {
- my ($dom,$primlibserv,$isprimary,$url) = @_;
- my %confhash;
+ my ($dom,$primlibserv,$isprimary,$url,%confhash);
+ $dom = $perlvar{'lonDefDomain'};
+ $primlibserv = &Apache::lonnet::domain($dom,'primary');
+ if ($primlibserv eq $perlvar{'lonHostID'}) {
+ $isprimary = 1;
+ } elsif ($primlibserv ne '') {
+ my $protocol = $Apache::lonnet::protocol{$primlibserv};
+ my $hostname = &Apache::lonnet::hostname($primlibserv);
+ unless ($protocol eq 'https') {
+ $protocol = 'http';
+ }
+ $url = $protocol.'://'.$hostname.'/cgi-bin/listdomconfig.pl';
+ }
if ($isprimary) {
my $lonusersdir = $perlvar{'lonUsersDir'};
my $fname = $lonusersdir.'/'.$dom.'/configuration.db';
@@ -1082,12 +1255,69 @@ sub reset_nosslverify_pids {
return;
}
+sub get_permcount_settings {
+ my ($domconf) = @_;
+ my ($defaults,$names) = &Apache::loncommon::lon_status_items();
+ my (%weights,$threshold,$sysmail,$reportstatus,%exclusions);
+ foreach my $type ('E','W','N','U') {
+ $weights{$type} = $defaults->{$type};
+ }
+ $threshold = $defaults->{'threshold'};
+ $sysmail = $defaults->{'sysmail'};
+ $reportstatus = 1;
+ if (ref($domconf) eq 'HASH') {
+ if (ref($domconf->{'contacts'}) eq 'HASH') {
+ if ($domconf->{'contacts'}{'reportstatus'} == 0) {
+ $reportstatus = 0;
+ }
+ if (ref($domconf->{'contacts'}{'lonstatus'}) eq 'HASH') {
+ if (ref($domconf->{'contacts'}{'lonstatus'}{weights}) eq 'HASH') {
+ foreach my $type ('E','W','N','U') {
+ if (exists($domconf->{'contacts'}{'lonstatus'}{weights}{$type})) {
+ $weights{$type} = $domconf->{'contacts'}{'lonstatus'}{weights}{$type};
+ }
+ }
+ }
+ if (ref($domconf->{'contacts'}{'lonstatus'}{'excluded'}) eq 'ARRAY') {
+ my @excluded = @{$domconf->{'contacts'}{'lonstatus'}{'excluded'}};
+ if (@excluded) {
+ map { $exclusions{$_} = 1; } @excluded;
+ }
+ }
+ if (exists($domconf->{'contacts'}{'lonstatus'}{'threshold'})) {
+ $threshold = $domconf->{'contacts'}{'lonstatus'}{'threshold'};
+ }
+ if (exists($domconf->{'contacts'}{'lonstatus'}{'sysmail'})) {
+ $sysmail = $domconf->{'contacts'}{'lonstatus'}{'sysmail'};
+ }
+ }
+ }
+ }
+ return ($threshold,$sysmail,$reportstatus,\%weights,\%exclusions);
+}
+
+sub read_serverhomeIDs {
+ my %server;
+ if (-e "$perlvar{'lonTabDir'}/serverhomeIDs.tab") {
+ if (open(my $fh,'<',"$perlvar{'lonTabDir'}/serverhomeIDs.tab")) {
+ while (<$fh>) {
+ my($host,$id) = split(/:/);
+ chomp($id);
+ $server{$host} = $id;
+ }
+ close($fh);
+ }
+ }
+ return %server;
+}
+
sub send_mail {
+ my ($sysmail,$reportstatus) = @_;
my $defdom = $perlvar{'lonDefDomain'};
my $origmail = $perlvar{'lonAdmEMail'};
my $emailto = &Apache::loncommon::build_recipient_list(undef,
'lonstatusmail',$defdom,$origmail);
- if ($totalcount>2500) {
+ if (($totalcount>$sysmail) && ($reportstatus)) {
$emailto.=",$perlvar{'lonSysEMail'}";
}
my $from;
@@ -1171,8 +1401,8 @@ sub main () {
chop $hostname;
$hostname=~s/[^\w\.]//g; # make sure is safe to pass through shell
my $subj="LON: Unconfigured machine $hostname";
- system("echo 'Unconfigured machine $hostname.' |\
- mailto $emailto -s '$subj' > /dev/null");
+ system("echo 'Unconfigured machine $hostname.' |".
+ " mail -s '$subj' $emailto > /dev/null");
exit 1;
}
@@ -1182,8 +1412,8 @@ sub main () {
print("User ID mismatch. This program must be run as user 'www'.\n");
my $emailto="$perlvar{'lonAdmEMail'},$perlvar{'lonSysEMail'}";
my $subj="LON: $perlvar{'lonHostID'} User ID mismatch";
- system("echo 'User ID mismatch. loncron must be run as user www.' |\
- mailto $emailto -s '$subj' > /dev/null");
+ system("echo 'User ID mismatch. loncron must be run as user www.' |".
+ " mail -s '$subj' $emailto > /dev/null");
exit 1;
}
@@ -1268,8 +1498,10 @@ sub main () {
&test_connections($fh);
}
if (!$justcheckdaemons && !$justcheckconnections && !$justreload) {
- &check_delayed_msg($fh);
- &log_simplestatus();
+ my $domconf = &get_domain_config();
+ my ($threshold,$sysmail,$reportstatus,$weightsref,$exclusionsref) =
+ &get_permcount_settings($domconf);
+ &check_delayed_msg($fh,$weightsref,$exclusionsref);
&write_loncaparevs();
&write_serverhomeIDs();
&write_checksums();
@@ -1282,8 +1514,9 @@ sub main () {
&checkon_daemon($fh,'lond',40000,'USR2');
&reset_nosslverify_pids($fh,%sslrem);
}
- &finish_logging($fh);
- if ($totalcount>200 && !$noemail) { &send_mail(); }
+ &finish_logging($fh,$weightsref);
+ &log_simplestatus();
+ if ($totalcount>$threshold && !$noemail) { &send_mail($sysmail,$reportstatus); }
}
}