--- loncom/loncron 2004/05/11 20:19:46 1.48 +++ loncom/loncron 2007/04/11 23:08:43 1.72 @@ -2,7 +2,7 @@ # Housekeeping program, started by cron, loncontrol and loncron.pl # -# $Id: loncron,v 1.48 2004/05/11 20:19:46 albertel Exp $ +# $Id: loncron,v 1.72 2007/04/11 23:08:43 albertel Exp $ # # Copyright Michigan State University Board of Trustees # @@ -32,31 +32,18 @@ use strict; use lib '/home/httpd/lib/perl/'; use LONCAPA::Configuration; +use Apache::lonnet; use IO::File; use IO::Socket; use HTML::Entities; +use Getopt::Long; #globals use vars qw (%perlvar %simplestatus $errors $warnings $notices $totalcount); my $statusdir="/home/httpd/html/lon-status"; -# -------------------------------------------------- Non-critical communication -sub reply { - my ($cmd,$server)=@_; - my $peerfile="$perlvar{'lonSockDir'}/$server"; - my $client=IO::Socket::UNIX->new(Peer =>"$peerfile", - Type => SOCK_STREAM, - Timeout => 10) - or return "con_lost"; - print $client "$cmd\n"; - my $answer=<$client>; - chomp($answer); - if (!$answer) { $answer="con_lost"; } - return $answer; -} - # --------------------------------------------------------- Output error status sub log { @@ -76,21 +63,27 @@ ENDERROUT } sub start_daemon { - my ($fh,$daemon,$pidfile) = @_; + my ($fh,$daemon,$pidfile,$args) = @_; my $progname=$daemon; - if ($daemon eq 'lonc' && $ARGV[0] eq 'new') { + if ($daemon eq 'lonc') { $progname='loncnew'; - print "new "; } - system("$perlvar{'lonDaemons'}/$progname 2>>$perlvar{'lonDaemons'}/logs/${daemon}_errors"); - sleep 2; + my $error_fname="$perlvar{'lonDaemons'}/logs/${daemon}_errors"; + my $size=(stat($error_fname))[7]; + if ($size>40000) { + &log($fh,"
Rotating error logs ...
"); + rename("$error_fname.2","$error_fname.3"); + rename("$error_fname.1","$error_fname.2"); + rename("$error_fname","$error_fname.1"); + } + system("$perlvar{'lonDaemons'}/$progname 2>$perlvar{'lonDaemons'}/logs/${daemon}_errors"); + sleep 1; if (-e $pidfile) { &log($fh,"Seems like it started ...
"); my $lfh=IO::File->new("$pidfile"); my $daemonpid=<$lfh>; chomp($daemonpid); - sleep 2; - if (kill 0 => $daemonpid) { + if ($daemonpid =~ /^\d+$/ && kill 0 => $daemonpid) { return 1; } else { return 0; @@ -102,10 +95,11 @@ sub start_daemon { } sub checkon_daemon { - my ($fh,$daemon,$maxsize,$sendusr1)=@_; + my ($fh,$daemon,$maxsize,$send,$args)=@_; + my $result; &log($fh,'');
- printf("%-10s ",$daemon);
+ printf("%-15s ",$daemon);
if (-e "$perlvar{'lonDaemons'}/logs/$daemon.log"){
open (DFH,"tail -n25 $perlvar{'lonDaemons'}/logs/$daemon.log|");
while (my $line= Give it one more try ...$daemon at pid $daemonpid responding");
- if ($sendusr1) { &log($fh,", sending USR1"); }
+ if ($send) { &log($fh,", sending $send"); }
&log($fh,"
");
- if ($sendusr1) { kill USR1 => $daemonpid; }
+ if ($send eq 'USR1') { kill USR1 => $daemonpid; }
+ if ($send eq 'USR2') { kill USR2 => $daemonpid; }
$restartflag=0;
- print "running\n";
+ if ($send eq 'USR2') {
+ $result = 'reloaded';
+ print "reloaded\n";
+ } else {
+ $result = 'running';
+ print "running\n";
+ }
} else {
$errors++;
&log($fh,"$daemon at pid $daemonpid not responding
");
@@ -143,28 +144,33 @@ sub checkon_daemon {
if ($restartflag==1) {
$simplestatus{$daemon}='off';
$errors++;
+ my $kadaemon=$daemon;
+ if ($kadaemon eq 'lonmemcached') { $kadaemon='memcached'; }
&log($fh,'
Killall '.$daemon.': '.
- `killall $daemon 2>&1`.' - ');
- sleep 2;
+ `killall $kadaemon 2>&1`.' - ');
+ sleep 1;
&log($fh,unlink($pidfile).' - '.
- `killall -9 $daemon 2>&1`.
+ `killall -9 $kadaemon 2>&1`.
'
');
&log($fh,"$daemon not running, trying to start
");
- if (&start_daemon($fh,$daemon,$pidfile)) {
+ if (&start_daemon($fh,$daemon,$pidfile,$args)) {
&log($fh,"$daemon at pid $daemonpid responding
");
$simplestatus{$daemon}='restarted';
+ $result = 'started';
print "started\n";
} else {
$errors++;
&log($fh,"$daemon at pid $daemonpid not responding
");
&log($fh,"$daemon at pid $daemonpid responding
");
$simplestatus{$daemon}='restarted';
+ $result = 'started';
print "started\n";
} else {
+ $result = 'failed';
print " failed\n";
$simplestatus{$daemon}='failed';
$errors++; $errors++;
@@ -201,6 +207,7 @@ sub checkon_daemon {
}
&errout($fh);
+ return $result;
}
# --------------------------------------------------------------------- Machine
@@ -251,7 +258,7 @@ sub log_machine_info {
&log($fh,"");
my $psproc=0;
- open (PSH,"ps -aux --cols 140 |");
+ open (PSH,"ps aux --cols 140 |");
while (my $line=
distprobe
");
+ &log($fh,"");
+ open(DSH,"$perlvar{'lonDaemons'}/distprobe |");
+ while (my $line=
");
+
&errout($fh);
}
sub start_logging {
- my ($hostdom,$hostrole,$hostname,$spareid)=@_;
my $fh=IO::File->new(">$statusdir/newstatus.html");
my %simplestatus=();
my $now=time;
@@ -309,17 +325,24 @@ ENDHEADERS
&encode_entities($perlvar{$varname},'<>&"')."\n");
}
&log($fh,"Hosts
");
- foreach my $id (sort(keys(%{$hostname}))) {
+ my %hostname = &Apache::lonnet::all_hostnames();
+ foreach my $id (sort(keys(%hostname))) {
+ my $role = (&Apache::lonnet::is_library($id) ? 'library'
+ : 'access');
&log($fh,
- "
\n");
- }
- &log($fh,"$id ".$hostdom->{$id}.
- " ".$hostrole->{$id}.
- " ".$hostname->{$id}." Spare Hosts
");
- foreach my $id (sort(keys(%{$spareid}))) {
- &log($fh,"
\n");
+ &log($fh,"\n");
return $fh;
}
@@ -446,26 +469,34 @@ sub rotate_lonnet_logs {
# ----------------------------------------------------------------- Connections
sub test_connections {
- my ($fh,$hostname)=@_;
+ my ($fh)=@_;
&log($fh,' \n");
+ }
+ &log($fh,"$id ".&Apache::lonnet::host_domain($id).
+ " ".$role.
+ " ".&Apache::lonnet::hostname($id)." Spare Hosts
");
+ foreach my $type (sort(keys(%Apache::lonnet::spareid))) {
+ &log($fh,"
");
+ foreach my $id (@{ $Apache::lonnet::spareid{$type} }) {
+ &log($fh,"
\nConnections
');
print "testing connections\n";
&log($fh,"");
- foreach my $tryserver (sort(keys(%{$hostname}))) {
+ my ($good,$bad)=(0,0);
+ my %hostname = &Apache::lonnet::all_hostnames();
+ foreach my $tryserver (sort(keys(%hostname))) {
print(".");
my $result;
- my $answer=reply("pong",$tryserver);
+ my $answer=&Apache::lonnet::reply("ping",$tryserver);
if ($answer eq "$tryserver:$perlvar{'lonHostID'}") {
$result="ok";
+ $good++;
} else {
$result=$answer;
$warnings++;
- if ($answer eq 'con_lost') { $warnings++; }
+ if ($answer eq 'con_lost') {
+ $bad++;
+ $warnings++;
+ } else {
+ $good++; #self connection
+ }
}
if ($answer =~ /con_lost/) { print(" $tryserver down\n"); }
&log($fh,"
");
-
+ print "\n$good good, $bad bad connections\n";
&errout($fh);
}
@@ -497,13 +528,22 @@ sub check_delayed_msg {
if ($unsend) { $simplestatus{'unsend'}=$unsend; }
&log($fh," \n");
}
&log($fh,"$tryserver $result Outgoing Buffer
\n");
-
+# list directory with delayed messages and remember offline servers
+ my %servers=();
open (DFH,"ls -lF $perlvar{'lonSockDir'}/delayed|");
- while (my $line=
\n");
close (DFH);
+# pong to all servers that have delayed messages
+# this will trigger a reverse connection, which should flush the buffers
+ foreach my $tryserver (keys %servers) {
+ my $answer=&Apache::lonnet::reply("pong",$tryserver);
+ &log($fh,"Pong to $tryserver: $answer
");
+ }
}
sub finish_logging {
@@ -538,15 +578,51 @@ sub log_simplestatus {
sub send_mail {
print "sending mail\n";
my $emailto="$perlvar{'lonAdmEMail'}";
- if ($totalcount>1000) {
+ if ($totalcount>2500) {
$emailto.=",$perlvar{'lonSysEMail'}";
}
my $subj="LON: $perlvar{'lonHostID'} E:$errors W:$warnings N:$notices";
- system("metasend -b -t $emailto -s '$subj' -f $statusdir/index.html -m text/html");
+
+ my $result=system("metasend -b -S 4000000 -t $emailto -s '$subj' -f $statusdir/index.html -m text/html >& /dev/null");
+ if ($result != 0) {
+ $result=system("mail -s '$subj' $emailto < $statusdir/index.html");
+ }
+}
+
+sub usage {
+ print(<