--- loncom/loncron	2004/06/09 13:30:41	1.53
+++ loncom/loncron	2006/10/25 21:33:54	1.68
@@ -2,7 +2,7 @@
 
 # Housekeeping program, started by cron, loncontrol and loncron.pl
 #
-# $Id: loncron,v 1.53 2004/06/09 13:30:41 albertel Exp $
+# $Id: loncron,v 1.68 2006/10/25 21:33:54 www Exp $
 #
 # Copyright Michigan State University Board of Trustees
 #
@@ -45,13 +45,13 @@ my $statusdir="/home/httpd/html/lon-stat
 
 # -------------------------------------------------- Non-critical communication
 sub reply {
-    my ($cmd,$server)=@_;
-    my $peerfile="$perlvar{'lonSockDir'}/$server";
+    my ($cmd,$server,$hostname)=@_;
+    my $peerfile="$perlvar{'lonSockDir'}/".$hostname->{$server};
     my $client=IO::Socket::UNIX->new(Peer    =>"$peerfile",
                                      Type    => SOCK_STREAM,
                                      Timeout => 10)
        or return "con_lost";
-    print $client "$cmd\n";
+    print $client "sethost:$server:$cmd\n";
     my $answer=<$client>;
     chomp($answer);
     if (!$answer) { $answer="con_lost"; }
@@ -92,14 +92,13 @@ sub start_daemon {
 	rename("$error_fname","$error_fname.1");
     }
     system("$perlvar{'lonDaemons'}/$progname 2>$perlvar{'lonDaemons'}/logs/${daemon}_errors");
-    sleep 2;
+    sleep 1;
     if (-e $pidfile) {
 	&log($fh,"<p>Seems like it started ...</p>");
 	my $lfh=IO::File->new("$pidfile");
 	my $daemonpid=<$lfh>;
 	chomp($daemonpid);
-	sleep 2;
-	if (kill 0 => $daemonpid) {
+	if ($daemonpid =~ /^\d+$/ && kill 0 => $daemonpid) {
 	    return 1;
 	} else {
 	    return 0;
@@ -111,10 +110,11 @@ sub start_daemon {
 }
 
 sub checkon_daemon {
-    my ($fh,$daemon,$maxsize,$sendusr1,$args)=@_;
+    my ($fh,$daemon,$maxsize,$send,$args)=@_;
 
+    my $result;
     &log($fh,'<hr /><a name="'.$daemon.'" /><h2>'.$daemon.'</h2><h3>Log</h3><p style="white-space: pre;"><tt>');
-    printf("%-10s ",$daemon);
+    printf("%-15s ",$daemon);
     if (-e "$perlvar{'lonDaemons'}/logs/$daemon.log"){
 	open (DFH,"tail -n25 $perlvar{'lonDaemons'}/logs/$daemon.log|");
 	while (my $line=<DFH>) { 
@@ -135,13 +135,20 @@ sub checkon_daemon {
 	my $lfh=IO::File->new("$pidfile");
 	$daemonpid=<$lfh>;
 	chomp($daemonpid);
-	if (kill 0 => $daemonpid) {
+	if ($daemonpid =~ /^\d+$/ && kill 0 => $daemonpid) {
 	    &log($fh,"<h3>$daemon at pid $daemonpid responding");
-	    if ($sendusr1) { &log($fh,", sending USR1"); }
+	    if ($send) { &log($fh,", sending $send"); }
 	    &log($fh,"</h3>");
-	    if ($sendusr1) { kill USR1 => $daemonpid; }
+	    if ($send eq 'USR1') { kill USR1 => $daemonpid; }
+	    if ($send eq 'USR2') { kill USR2 => $daemonpid; }
 	    $restartflag=0;
-	    print "running\n";
+	    if ($send eq 'USR2') {
+		$result = 'reloaded';
+		print "reloaded\n";
+	    } else {
+		$result = 'running';
+		print "running\n";
+	    }
 	} else {
 	    $errors++;
 	    &log($fh,"<h3>$daemon at pid $daemonpid not responding</h3>");
@@ -152,17 +159,20 @@ sub checkon_daemon {
     if ($restartflag==1) {
 	$simplestatus{$daemon}='off';
 	$errors++;
+	my $kadaemon=$daemon;
+	if ($kadaemon eq 'lonmemcached') { $kadaemon='memcached'; }
 	&log($fh,'<br><font color="red">Killall '.$daemon.': '.
-	    `killall $daemon 2>&1`.' - ');
-	sleep 2;
+	    `killall $kadaemon 2>&1`.' - ');
+	sleep 1;
 	&log($fh,unlink($pidfile).' - '.
-	    `killall -9 $daemon 2>&1`.
+	    `killall -9 $kadaemon 2>&1`.
 	    '</font><br>');
 	&log($fh,"<h3>$daemon not running, trying to start</h3>");
 	
 	if (&start_daemon($fh,$daemon,$pidfile,$args)) {
 	    &log($fh,"<h3>$daemon at pid $daemonpid responding</h3>");
 	    $simplestatus{$daemon}='restarted';
+	    $result = 'started';
 	    print "started\n";
 	} else {
 	    $errors++;
@@ -172,8 +182,10 @@ sub checkon_daemon {
 	    if (&start_daemon($fh,$daemon,$pidfile,$args)) {
 		&log($fh,"<h3>$daemon at pid $daemonpid responding</h3>");
 		$simplestatus{$daemon}='restarted';
+		$result = 'started';
 		print "started\n";
 	    } else {
+		$result = 'failed';
 		print " failed\n";
 		$simplestatus{$daemon}='failed';
 		$errors++; $errors++;
@@ -210,6 +222,7 @@ sub checkon_daemon {
     }
 
     &errout($fh);
+    return $result;
 }
 
 # --------------------------------------------------------------------- Machine
@@ -271,6 +284,16 @@ sub log_machine_info {
     if ($psproc>200) { $notices++; }
     if ($psproc>250) { $notices++; }
 
+    &log($fh,"<h3>distprobe</h3>");
+    &log($fh,"<pre>");
+    open(DSH,"$perlvar{'lonDaemons'}/distprobe |");
+    while (my $line=<DSH>) { 
+	&log($fh,&encode_entities($line,'<>&"')); 
+	$psproc++;
+    }
+    close(DSH);
+    &log($fh,"</pre>");
+
     &errout($fh);
 }
 
@@ -463,7 +486,7 @@ sub test_connections {
     foreach my $tryserver (sort(keys(%{$hostname}))) {
 	print(".");
 	my $result;
-	my $answer=reply("pong",$tryserver);
+	my $answer=&reply("ping",$tryserver,$hostname);
 	if ($answer eq "$tryserver:$perlvar{'lonHostID'}") {
 	    $result="<b>ok</b>";
 	    $good++;
@@ -488,7 +511,7 @@ sub test_connections {
 
 # ------------------------------------------------------------ Delayed messages
 sub check_delayed_msg {
-    my ($fh)=@_;
+    my ($fh,$hostname)=@_;
     &log($fh,'<hr /><a name="delayed" /><h2>Delayed Messages</h2>');
     print "checking buffers\n";
     
@@ -513,13 +536,22 @@ sub check_delayed_msg {
 
     if ($unsend) { $simplestatus{'unsend'}=$unsend; }
     &log($fh,"<h3>Outgoing Buffer</h3>\n<pre>");
-
+# list the delayed-message directory; remember each server (file suffix) with queued messages
+    my %servers=();
     open (DFH,"ls -lF $perlvar{'lonSockDir'}/delayed|");
-    while (my $line=<DFH>) { 
+    while (my $line=<DFH>) {
+        my ($server)=($line=~/\.(\w+)$/);
+        if ($server) { $servers{$server}=1; }
 	&log($fh,&encode_entities($line,'<>&"'));
     }
     &log($fh,"</pre>\n");
     close (DFH);
+# pong to all servers that have delayed messages
+# this will trigger a reverse connection, which should flush the buffers
+    foreach my $tryserver (keys %servers) {
+	my $answer=&reply("pong",$tryserver,$hostname);
+	&log($fh,"Pong to $tryserver: $answer");
+    }
 }
 
 sub finish_logging {
@@ -554,12 +586,12 @@ sub log_simplestatus {
 sub send_mail {
     print "sending mail\n";
     my $emailto="$perlvar{'lonAdmEMail'}";
-    if ($totalcount>1000) {
+    if ($totalcount>2500) {
 	$emailto.=",$perlvar{'lonSysEMail'}";
     }
     my $subj="LON: $perlvar{'lonHostID'} E:$errors W:$warnings N:$notices"; 
 
-    my $result=system("metasend -b -t $emailto -s '$subj' -f $statusdir/index.html -m text/html >& /dev/null");
+    my $result=system("metasend -b -S 4000000 -t $emailto -s '$subj' -f $statusdir/index.html -m text/html >& /dev/null");
     if ($result != 0) {
 	$result=system("mail -s '$subj' $emailto < $statusdir/index.html");
     }
@@ -581,18 +613,24 @@ Options:
                                 running, do not send emails do not
                                 check the lonc/d connections, do not
                                 generate lon-status
+   --justreload            Only tell the daemons to reload the config files,
+				do not send emails do not
+                                check if the daemons are running, do not
+                                generate lon-status
                            
 USAGE
 }
 
 # ================================================================ Main Program
 sub main () {
-    my ($oldlonc,$help,$justcheckdaemons,$noemail,$justcheckconnections);
+    my ($oldlonc,$help,$justcheckdaemons,$noemail,$justcheckconnections,
+	$justreload);
     &GetOptions("help"                 => \$help,
 		"oldlonc"              => \$oldlonc,
 		"justcheckdaemons"     => \$justcheckdaemons,
 		"noemail"              => \$noemail,
-		"justcheckconnections" => \$justcheckconnections
+		"justcheckconnections" => \$justcheckconnections,
+		"justreload"           => \$justreload
 		);
     if ($help) { &usage(); return; }
 # --------------------------------- Read loncapa_apache.conf and loncapa.conf
@@ -633,8 +671,9 @@ sub main () {
     my (%hostname,%hostdom,%hostrole,%spareid);
     while (my $configline=<$config>) {
 	next if ($configline =~ /^(\#|\s*\$)/);
-	my ($id,$domain,$role,$name,$ip,$domdescr)=split(/:/,$configline);
-	if ($id && $domain && $role && $name && $ip) {
+	my ($id,$domain,$role,$name)=split(/:/,$configline);
+	if ($id && $domain && $role && $name) {
+	    $name=~s/\s//g;
 	    $hostname{$id}=$name;
 	    $hostdom{$id}=$domain;
 	    $hostrole{$id}=$role;
@@ -661,7 +700,7 @@ sub main () {
 
 	
     my $fh;
-    if (!$justcheckdaemons && !$justcheckconnections) {
+    if (!$justcheckdaemons && !$justcheckconnections && !$justreload) {
 	$fh=&start_logging(\%hostdom,\%hostrole,\%hostname,\%spareid);
 
 	&log_machine_info($fh);
@@ -670,19 +709,28 @@ sub main () {
 	&check_httpd_logs($fh);
 	&rotate_lonnet_logs($fh);
     }
-    if (!$justcheckconnections) {
+    if (!$justcheckconnections && !$justreload) {
 	&checkon_daemon($fh,'lonsql',200000);
-	&checkon_daemon($fh,'lond',40000,1);
+	if ( &checkon_daemon($fh,'lond',40000,'USR1') eq 'running') {
+	    &checkon_daemon($fh,'lond',40000,'USR2');
+	}
 	my $args='new';
 	if ($oldlonc) { $args = ''; }
-	&checkon_daemon($fh,'lonc',40000,1,$args);
+	&checkon_daemon($fh,'lonc',40000,'USR1',$args);
 	&checkon_daemon($fh,'lonhttpd',40000);
+	&checkon_daemon($fh,'lonmemcached',40000);
+    }
+    if ($justreload) {
+	&checkon_daemon($fh,'lond',40000,'USR2');
+	my $args='new';
+	if ($oldlonc) { $args = ''; }
+	&checkon_daemon($fh,'lonc',40000,'USR2',$args);
     }
-    if (!$justcheckdaemons) {
+    if ($justcheckconnections) {
 	&test_connections($fh,\%hostname);
     }
-    if (!$justcheckdaemons && !$justcheckconnections) {
-	&check_delayed_msg($fh);
+    if (!$justcheckdaemons && !$justcheckconnections && !$justreload) {
+	&check_delayed_msg($fh,\%hostname);
 	&finish_logging($fh);
 	&log_simplestatus();