--- loncom/loncron	2005/05/26 20:25:00	1.59
+++ loncom/loncron	2007/02/02 12:59:13	1.70
@@ -2,7 +2,7 @@
 
 # Housekeeping program, started by cron, loncontrol and loncron.pl
 #
-# $Id: loncron,v 1.59 2005/05/26 20:25:00 albertel Exp $
+# $Id: loncron,v 1.70 2007/02/02 12:59:13 raeburn Exp $
 #
 # Copyright Michigan State University Board of Trustees
 #
@@ -45,13 +45,13 @@ my $statusdir="/home/httpd/html/lon-stat
 
 # -------------------------------------------------- Non-critical communication
 sub reply {
-    my ($cmd,$server)=@_;
-    my $peerfile="$perlvar{'lonSockDir'}/$server";
+    my ($cmd,$server,$hostname)=@_;
+    my $peerfile="$perlvar{'lonSockDir'}/".$hostname->{$server};
     my $client=IO::Socket::UNIX->new(Peer    =>"$peerfile",
                                      Type    => SOCK_STREAM,
                                      Timeout => 10)
        or return "con_lost";
-    print $client "$cmd\n";
+    print $client "sethost:$server:$cmd\n";
     my $answer=<$client>;
     chomp($answer);
     if (!$answer) { $answer="con_lost"; }
@@ -92,14 +92,13 @@ sub start_daemon {
 	rename("$error_fname","$error_fname.1");
     }
     system("$perlvar{'lonDaemons'}/$progname 2>$perlvar{'lonDaemons'}/logs/${daemon}_errors");
-    sleep 2;
+    sleep 1;
     if (-e $pidfile) {
 	&log($fh,"<p>Seems like it started ...</p>");
 	my $lfh=IO::File->new("$pidfile");
 	my $daemonpid=<$lfh>;
 	chomp($daemonpid);
-	sleep 2;
-	if (kill 0 => $daemonpid) {
+	if ($daemonpid =~ /^\d+$/ && kill 0 => $daemonpid) {
 	    return 1;
 	} else {
 	    return 0;
@@ -113,6 +112,7 @@ sub start_daemon {
 sub checkon_daemon {
     my ($fh,$daemon,$maxsize,$send,$args)=@_;
 
+    my $result;
     &log($fh,'<hr /><a name="'.$daemon.'" /><h2>'.$daemon.'</h2><h3>Log</h3><p style="white-space: pre;"><tt>');
     printf("%-15s ",$daemon);
     if (-e "$perlvar{'lonDaemons'}/logs/$daemon.log"){
@@ -135,7 +135,7 @@ sub checkon_daemon {
 	my $lfh=IO::File->new("$pidfile");
 	$daemonpid=<$lfh>;
 	chomp($daemonpid);
-	if (kill 0 => $daemonpid) {
+	if ($daemonpid =~ /^\d+$/ && kill 0 => $daemonpid) {
 	    &log($fh,"<h3>$daemon at pid $daemonpid responding");
 	    if ($send) { &log($fh,", sending $send"); }
 	    &log($fh,"</h3>");
@@ -143,8 +143,10 @@ sub checkon_daemon {
 	    if ($send eq 'USR2') { kill USR2 => $daemonpid; }
 	    $restartflag=0;
 	    if ($send eq 'USR2') {
+		$result = 'reloaded';
 		print "reloaded\n";
 	    } else {
+		$result = 'running';
 		print "running\n";
 	    }
 	} else {
@@ -161,7 +163,7 @@ sub checkon_daemon {
 	if ($kadaemon eq 'lonmemcached') { $kadaemon='memcached'; }
 	&log($fh,'<br><font color="red">Killall '.$daemon.': '.
 	    `killall $kadaemon 2>&1`.' - ');
-	sleep 2;
+	sleep 1;
 	&log($fh,unlink($pidfile).' - '.
 	    `killall -9 $kadaemon 2>&1`.
 	    '</font><br>');
@@ -170,6 +172,7 @@ sub checkon_daemon {
 	if (&start_daemon($fh,$daemon,$pidfile,$args)) {
 	    &log($fh,"<h3>$daemon at pid $daemonpid responding</h3>");
 	    $simplestatus{$daemon}='restarted';
+	    $result = 'started';
 	    print "started\n";
 	} else {
 	    $errors++;
@@ -179,8 +182,10 @@ sub checkon_daemon {
 	    if (&start_daemon($fh,$daemon,$pidfile,$args)) {
 		&log($fh,"<h3>$daemon at pid $daemonpid responding</h3>");
 		$simplestatus{$daemon}='restarted';
+		$result = 'started';
 		print "started\n";
 	    } else {
+		$result = 'failed';
 		print " failed\n";
 		$simplestatus{$daemon}='failed';
 		$errors++; $errors++;
@@ -217,6 +222,7 @@ sub checkon_daemon {
     }
 
     &errout($fh);
+    return $result;
 }
 
 # --------------------------------------------------------------------- Machine
@@ -278,6 +284,16 @@ sub log_machine_info {
     if ($psproc>200) { $notices++; }
     if ($psproc>250) { $notices++; }
 
+    &log($fh,"<h3>distprobe</h3>");
+    &log($fh,"<pre>");
+    open(DSH,"$perlvar{'lonDaemons'}/distprobe |");
+    while (my $line=<DSH>) { 
+	&log($fh,&encode_entities($line,'<>&"')); 
+	$psproc++;
+    }
+    close(DSH);
+    &log($fh,"</pre>");
+
     &errout($fh);
 }
 
@@ -470,7 +486,7 @@ sub test_connections {
     foreach my $tryserver (sort(keys(%{$hostname}))) {
 	print(".");
 	my $result;
-	my $answer=reply("ping",$tryserver);
+	my $answer=&reply("ping",$tryserver,$hostname);
 	if ($answer eq "$tryserver:$perlvar{'lonHostID'}") {
 	    $result="<b>ok</b>";
 	    $good++;
@@ -495,7 +511,7 @@ sub test_connections {
 
 # ------------------------------------------------------------ Delayed messages
 sub check_delayed_msg {
-    my ($fh)=@_;
+    my ($fh,$hostname)=@_;
     &log($fh,'<hr /><a name="delayed" /><h2>Delayed Messages</h2>');
     print "checking buffers\n";
     
@@ -520,13 +536,22 @@ sub check_delayed_msg {
 
     if ($unsend) { $simplestatus{'unsend'}=$unsend; }
     &log($fh,"<h3>Outgoing Buffer</h3>\n<pre>");
-
+# list directory with delayed messages and remember offline servers
+    my %servers=();
     open (DFH,"ls -lF $perlvar{'lonSockDir'}/delayed|");
-    while (my $line=<DFH>) { 
+    while (my $line=<DFH>) {
+        my ($server)=($line=~/\.(\w+)$/);
+        if ($server) { $servers{$server}=1; }
 	&log($fh,&encode_entities($line,'<>&"'));
     }
     &log($fh,"</pre>\n");
     close (DFH);
+# pong to all servers that have delayed messages
+# this will trigger a reverse connection, which should flush the buffers
+    foreach my $tryserver (keys %servers) {
+	my $answer=&reply("pong",$tryserver,$hostname);
+	&log($fh,"Pong to $tryserver: $answer<br />");
+    }
 }
 
 sub finish_logging {
@@ -686,12 +711,15 @@ sub main () {
     }
     if (!$justcheckconnections && !$justreload) {
 	&checkon_daemon($fh,'lonsql',200000);
-	&checkon_daemon($fh,'lond',40000,'USR1');
+	if ( &checkon_daemon($fh,'lond',40000,'USR1') eq 'running') {
+	    &checkon_daemon($fh,'lond',40000,'USR2');
+	}
 	my $args='new';
 	if ($oldlonc) { $args = ''; }
 	&checkon_daemon($fh,'lonc',40000,'USR1',$args);
 	&checkon_daemon($fh,'lonhttpd',40000);
 	&checkon_daemon($fh,'lonmemcached',40000);
+        &checkon_daemon($fh,'lonmaxima',40000);
     }
     if ($justreload) {
 	&checkon_daemon($fh,'lond',40000,'USR2');
@@ -699,11 +727,11 @@ sub main () {
 	if ($oldlonc) { $args = ''; }
 	&checkon_daemon($fh,'lonc',40000,'USR2',$args);
     }
-    if (!$justcheckdaemons && !$justreload) {
+    if ($justcheckconnections) {
 	&test_connections($fh,\%hostname);
     }
     if (!$justcheckdaemons && !$justcheckconnections && !$justreload) {
-	&check_delayed_msg($fh);
+	&check_delayed_msg($fh,\%hostname);
 	&finish_logging($fh);
 	&log_simplestatus();