--- loncom/loncron 2004/05/11 20:19:46 1.48 +++ loncom/loncron 2009/02/11 15:24:28 1.79 @@ -2,7 +2,7 @@ # Housekeeping program, started by cron, loncontrol and loncron.pl # -# $Id: loncron,v 1.48 2004/05/11 20:19:46 albertel Exp $ +# $Id: loncron,v 1.79 2009/02/11 15:24:28 raeburn Exp $ # # Copyright Michigan State University Board of Trustees # @@ -32,31 +32,19 @@ use strict; use lib '/home/httpd/lib/perl/'; use LONCAPA::Configuration; +use Apache::lonnet; +use Apache::loncommon; use IO::File; use IO::Socket; use HTML::Entities; +use Getopt::Long; #globals use vars qw (%perlvar %simplestatus $errors $warnings $notices $totalcount); my $statusdir="/home/httpd/html/lon-status"; -# -------------------------------------------------- Non-critical communication -sub reply { - my ($cmd,$server)=@_; - my $peerfile="$perlvar{'lonSockDir'}/$server"; - my $client=IO::Socket::UNIX->new(Peer =>"$peerfile", - Type => SOCK_STREAM, - Timeout => 10) - or return "con_lost"; - print $client "$cmd\n"; - my $answer=<$client>; - chomp($answer); - if (!$answer) { $answer="con_lost"; } - return $answer; -} - # --------------------------------------------------------- Output error status sub log { @@ -75,22 +63,36 @@ sub errout { ENDERROUT } +sub rotate_logfile { + my ($file,$fh,$description) = @_; + my $size=(stat($file))[7]; + if ($size>40000) { + &log($fh,"
Rotating $description ...
"); + rename("$file.2","$file.3"); + rename("$file.1","$file.2"); + rename("$file","$file.1"); + } +} + sub start_daemon { - my ($fh,$daemon,$pidfile) = @_; + my ($fh,$daemon,$pidfile,$args) = @_; my $progname=$daemon; - if ($daemon eq 'lonc' && $ARGV[0] eq 'new') { + if ($daemon eq 'lonc') { $progname='loncnew'; - print "new "; } - system("$perlvar{'lonDaemons'}/$progname 2>>$perlvar{'lonDaemons'}/logs/${daemon}_errors"); - sleep 2; + my $error_fname="$perlvar{'lonDaemons'}/logs/${daemon}_errors"; + &rotate_logfile($error_fname,$fh,'error logs'); + if ($daemon eq 'lonc') { + &clean_sockets($fh); + } + system("$perlvar{'lonDaemons'}/$progname 2>$perlvar{'lonDaemons'}/logs/${daemon}_errors"); + sleep 1; if (-e $pidfile) { &log($fh,"Seems like it started ...
"); my $lfh=IO::File->new("$pidfile"); my $daemonpid=<$lfh>; chomp($daemonpid); - sleep 2; - if (kill 0 => $daemonpid) { + if ($daemonpid =~ /^\d+$/ && kill 0 => $daemonpid) { return 1; } else { return 0; @@ -102,10 +104,11 @@ sub start_daemon { } sub checkon_daemon { - my ($fh,$daemon,$maxsize,$sendusr1)=@_; + my ($fh,$daemon,$maxsize,$send,$args)=@_; + my $result; &log($fh,'');
- printf("%-10s ",$daemon);
+ printf("%-15s ",$daemon);
if (-e "$perlvar{'lonDaemons'}/logs/$daemon.log"){
open (DFH,"tail -n25 $perlvar{'lonDaemons'}/logs/$daemon.log|");
while (my $line= Give it one more try ... Rotating logs ... Cleaned up ".$cleaned." stale sockets. Rotating logs ...$daemon at pid $daemonpid responding");
- if ($sendusr1) { &log($fh,", sending USR1"); }
+ if ($send) { &log($fh,", sending $send"); }
&log($fh,"
");
- if ($sendusr1) { kill USR1 => $daemonpid; }
+ if ($send eq 'USR1') { kill USR1 => $daemonpid; }
+ if ($send eq 'USR2') { kill USR2 => $daemonpid; }
$restartflag=0;
- print "running\n";
+ if ($send eq 'USR2') {
+ $result = 'reloaded';
+ print "reloaded\n";
+ } else {
+ $result = 'running';
+ print "running\n";
+ }
} else {
$errors++;
&log($fh,"$daemon at pid $daemonpid not responding
");
@@ -143,28 +153,33 @@ sub checkon_daemon {
if ($restartflag==1) {
$simplestatus{$daemon}='off';
$errors++;
+ my $kadaemon=$daemon;
+ if ($kadaemon eq 'lonmemcached') { $kadaemon='memcached'; }
&log($fh,'
Killall '.$daemon.': '.
- `killall $daemon 2>&1`.' - ');
- sleep 2;
+ `killall $kadaemon 2>&1`.' - ');
+ sleep 1;
&log($fh,unlink($pidfile).' - '.
- `killall -9 $daemon 2>&1`.
+ `killall -9 $kadaemon 2>&1`.
'
');
&log($fh,"$daemon not running, trying to start
");
- if (&start_daemon($fh,$daemon,$pidfile)) {
+ if (&start_daemon($fh,$daemon,$pidfile,$args)) {
&log($fh,"$daemon at pid $daemonpid responding
");
$simplestatus{$daemon}='restarted';
+ $result = 'started';
print "started\n";
} else {
$errors++;
&log($fh,"$daemon at pid $daemonpid not responding
");
&log($fh,"$daemon at pid $daemonpid responding
");
$simplestatus{$daemon}='restarted';
+ $result = 'started';
print "started\n";
} else {
+ $result = 'failed';
print " failed\n";
$simplestatus{$daemon}='failed';
$errors++; $errors++;
@@ -187,20 +202,10 @@ sub checkon_daemon {
}
my $fname="$perlvar{'lonDaemons'}/logs/$daemon.log";
-
- my ($dev,$ino,$mode,$nlink,
- $uid,$gid,$rdev,$size,
- $atime,$mtime,$ctime,
- $blksize,$blocks)=stat($fname);
-
- if ($size>$maxsize) {
- &log($fh,"");
my $psproc=0;
- open (PSH,"ps -aux --cols 140 |");
+ open (PSH,"ps aux --cols 140 |");
while (my $line=
");
&errout($fh);
}
+# Rotate the auto-enroll, course-creation and search-cataloguing logs.
+# Each log lives in $perlvar{'lonDaemons'}/logs and is handed to
+# &rotate_logfile together with the status-report handle $fh and a
+# description of the log.
+sub rotate_other_logs {
+    my ($fh) = @_;
+    # (file name, description) pairs, processed in this order
+    my @logs = (
+        ['autoenroll.log','Auto Enroll log'],
+        ['autocreate.log','Create Course log'],
+        ['searchcat.log','Search Cataloguing log'],
+    );
+    foreach my $entry (@logs) {
+        my ($name,$description) = @{$entry};
+        &rotate_logfile("$perlvar{'lonDaemons'}/logs/$name",$fh,$description);
+    }
+}
+
# ----------------------------------------------------------------- Connections
sub test_connections {
- my ($fh,$hostname)=@_;
+ my ($fh)=@_;
&log($fh,'distprobe
");
+ &log($fh,"");
+ open(DSH,"$perlvar{'lonDaemons'}/distprobe |");
+ while (my $line=
");
+
&errout($fh);
}
sub start_logging {
- my ($hostdom,$hostrole,$hostname,$spareid)=@_;
my $fh=IO::File->new(">$statusdir/newstatus.html");
my %simplestatus=();
my $now=time;
@@ -291,7 +305,6 @@ sub start_logging {
Hosts
");
- foreach my $id (sort(keys(%{$hostname}))) {
+ my %hostname = &Apache::lonnet::all_hostnames();
+ foreach my $id (sort(keys(%hostname))) {
+ my $role = (&Apache::lonnet::is_library($id) ? 'library'
+ : 'access');
&log($fh,
- "
\n");
- }
- &log($fh,"$id ".$hostdom->{$id}.
- " ".$hostrole->{$id}.
- " ".$hostname->{$id}." Spare Hosts
");
- foreach my $id (sort(keys(%{$spareid}))) {
- &log($fh,"
\n");
+ &log($fh,"\n");
return $fh;
}
@@ -381,6 +401,21 @@ sub clean_lonIDs {
&log($fh," \n");
+ }
+ &log($fh,"$id ".&Apache::lonnet::host_domain($id).
+ " ".$role.
+ " ".&Apache::lonnet::hostname($id)." Spare Hosts
");
+ foreach my $type (sort(keys(%Apache::lonnet::spareid))) {
+ &log($fh,"
");
+ foreach my $id (@{ $Apache::lonnet::spareid{$type} }) {
+ &log($fh,"
\n$active open session(s)
");
}
+# ----------------------------------------------------------- clean out sockets
+sub clean_sockets {
+ my ($fh)=@_;
+ my $cleaned=0;
+ opendir(SOCKETS,$perlvar{'lonSockDir'});
+ while (my $fname=readdir(SOCKETS)) {
+ next if (-d $fname
+ || $fname=~/(mysqlsock|maximasock|\Q$perlvar{'lonSockDir'}\E)/);
+ $cleaned++;
+ &log($fh,"Unlinking $fname
");
+ unlink("/home/httpd/sockets/$fname");
+ }
+ &log($fh,"Connections
');
print "testing connections\n";
&log($fh,"");
- foreach my $tryserver (sort(keys(%{$hostname}))) {
+ my ($good,$bad)=(0,0);
+ my %hostname = &Apache::lonnet::all_hostnames();
+ foreach my $tryserver (sort(keys(%hostname))) {
print(".");
my $result;
- my $answer=reply("pong",$tryserver);
+ my $answer=&Apache::lonnet::reply("ping",$tryserver);
if ($answer eq "$tryserver:$perlvar{'lonHostID'}") {
$result="ok";
+ $good++;
} else {
$result=$answer;
$warnings++;
- if ($answer eq 'con_lost') { $warnings++; }
+ if ($answer eq 'con_lost') {
+ $bad++;
+ $warnings++;
+ } else {
+ $good++; #self connection
+ }
}
if ($answer =~ /con_lost/) { print(" $tryserver down\n"); }
&log($fh,"
");
-
+ print "\n$good good, $bad bad connections\n";
&errout($fh);
}
@@ -497,13 +539,22 @@ sub check_delayed_msg {
if ($unsend) { $simplestatus{'unsend'}=$unsend; }
&log($fh," \n");
}
&log($fh,"$tryserver $result Outgoing Buffer
\n");
-
+# list directory with delayed messages and remember offline servers
+ my %servers=();
open (DFH,"ls -lF $perlvar{'lonSockDir'}/delayed|");
- while (my $line=
\n");
close (DFH);
+# pong to all servers that have delayed messages
+# this will trigger a reverse connection, which should flush the buffers
+ foreach my $tryserver (keys %servers) {
+ my $answer=&Apache::lonnet::reply("pong",$tryserver);
+ &log($fh,"Pong to $tryserver: $answer
");
+ }
}
sub finish_logging {
@@ -525,7 +576,7 @@ sub finish_logging {
}
sub log_simplestatus {
- rename ("$statusdir/newstatus.html","$statusdir/index.html");
+ rename("$statusdir/newstatus.html","$statusdir/index.html");
my $sfh=IO::File->new(">$statusdir/loncron_simple.txt");
foreach (keys %simplestatus) {
@@ -537,23 +588,62 @@ sub log_simplestatus {
+# Mail the HTML status page ($statusdir/index.html) to the administrators.
+# Reads the globals %perlvar, $totalcount, $errors, $warnings and $notices.
+# metasend is tried first; if it exits non-zero the page is sent with mail
+# instead, and a failure of that fallback is reported on stdout.
sub send_mail {
     print "sending mail\n";
-    my $emailto="$perlvar{'lonAdmEMail'}";
-    if ($totalcount>1000) {
+    my $defdom = $perlvar{'lonDefDomain'};
+    my $origmail = $perlvar{'lonAdmEMail'};
+    # NOTE(review): presumably returns a comma-separated recipient list for
+    # the 'lonstatusmail' setting of $defdom - confirm in Apache::loncommon.
+    my $emailto = &Apache::loncommon::build_recipient_list(undef,
+                                   'lonstatusmail',$defdom,$origmail);
+    # Also copy the lonSysEMail address once $totalcount exceeds 2500.
+    if ($totalcount>2500) {
         $emailto.=",$perlvar{'lonSysEMail'}";
     }
     my $subj="LON: $perlvar{'lonHostID'} E:$errors W:$warnings N:$notices";
-    system("metasend -b -t $emailto -s '$subj' -f $statusdir/index.html -m text/html");
+
+    # system() with one string runs the command through /bin/sh, so use the
+    # POSIX redirection form; '>& file' is only understood by bash/csh and
+    # is a syntax error in shells such as dash.
+    my $result=system("metasend -b -S 4000000 -t $emailto -s '$subj' -f $statusdir/index.html -m text/html >/dev/null 2>&1");
+    if ($result != 0) {
+        # metasend missing or failed: fall back to plain mail
+        if (system("mail -s '$subj' $emailto < $statusdir/index.html") != 0) {
+            print "sending mail failed\n";
+        }
+    }
+}
+
+sub usage {
+ print(<