--- loncom/Attic/lonc	2001/03/13 21:15:40	1.14
+++ loncom/Attic/lonc	2001/11/27 23:11:42	1.20
@@ -16,7 +16,7 @@
 # 12/05 Scott Harrison
 # 12/05 Gerd Kortemeyer
 # 01/10/01 Scott Harrison
-# 03/14/01 Gerd Kortemeyer
+# 03/14/01,03/15,06/12,11/26,11/27 Gerd Kortemeyer
 # 
 # based on nonforker from Perl Cookbook
 # - server who multiplexes without forking
@@ -30,6 +30,9 @@ use Fcntl;
 use Tie::RefHash;
 use Crypt::IDEA;
 
+my $status='';
+my $lastlog='';
+
 # grabs exception and records it to log before exiting
 sub catchexception {
     my ($signal)=@_;
@@ -41,7 +44,7 @@ sub catchexception {
     die($@);
 }
 
-$childmaxattempts=10;
+$childmaxattempts=5;
 
 # -------------------------------- Set signal handlers to record abnormal exits
 
@@ -116,7 +119,13 @@ sub REAPER {                        # ta
 
 sub HUNTSMAN {                      # signal handler for SIGINT
     local($SIG{CHLD}) = 'IGNORE';   # we're going to kill our children
-    kill 'INT' => keys %children;
+    map {
+        $wasserver=$children{$_};
+        &status("Closing $wasserver");
+        &logthis('Closing '.$wasserver.': '.&subreply('exit',$wasserver));
+        &status("Kill PID $_ for $wasserver");
+	kill ('INT',$_);
+    } keys %children;
     my $execdir=$perlvar{'lonDaemons'};
     unlink("$execdir/logs/lonc.pid");
     &logthis("<font color=red>CRITICAL: Shutting down</font>");
@@ -125,13 +134,32 @@ sub HUNTSMAN {                      # si
 
 sub HUPSMAN {                      # signal handler for SIGHUP
     local($SIG{CHLD}) = 'IGNORE';  # we're going to kill our children
-    kill 'INT' => keys %children;
+    foreach my $pid (keys %children) {   # ask each peer to exit, then stop its child
+        my $wasserver=$children{$pid};   # server name recorded for this child PID
+        &status("Closing $wasserver");
+        &logthis('Closing '.$wasserver.': '.&subreply('exit',$wasserver));
+        &status("Kill PID $pid for $wasserver");
+        kill ('INT',$pid);
+    }
     &logthis("<font color=red>CRITICAL: Restarting</font>");
-    unlink("$execdir/logs/lonc.pid");
     my $execdir=$perlvar{'lonDaemons'};
+    unlink("$execdir/logs/lonc.pid");  # needs $execdir, so it must follow the declaration
     exec("$execdir/lonc");         # here we go again
 }
 
+sub checkchildren {   # rebuild the status report, then poll every child with USR1
+    &initnewstatus();                          # truncate the report, write its header
+    &logstatus();                              # first entry: this process
+    &logthis('Going to check on the children');
+    foreach my $pid (sort keys %children) {    # foreach, not map: side effects only
+        sleep 1;                               # pause before each signal
+        unless (kill 'USR1' => $pid) {         # kill returned 0: nothing was signalled
+            &logthis ('Child '.$pid.' is dead');
+            &logstatus();                      # logthis just stored the notice in $lastlog
+        }
+    }
+}
+
 sub USRMAN {
     &logthis("USR1: Trying to establish connections again");
     foreach $thisserver (keys %hostip) {
@@ -141,6 +169,7 @@ sub USRMAN {
         ." >$answer<");
     }
     %childatt=();
+    &checkchildren();
 }
 
 # -------------------------------------------------- Non-critical communication
@@ -153,10 +182,18 @@ sub subreply {
                                       Type    => SOCK_STREAM,
                                       Timeout => 10)
        or return "con_lost";
-    print $sclient "$cmd\n";
-    my $answer=<$sclient>;
-    chomp($answer);
-    if (!$answer) { $answer="con_lost"; }
+    $SIG{ALRM}=sub { die "timeout" };     # alarm expiry becomes a catchable die
+    $SIG{__DIE__}='DEFAULT';              # keep catchexception out of the eval
+    eval {
+     alarm(10);
+     print $sclient "$cmd\n";
+     $answer=<$sclient>;                  # no 'my' here: the reply must outlive the eval block
+     chomp($answer);
+     alarm(0);
+    };
+    if ((!$answer) || ($@=~/timeout/)) { $answer="con_lost"; }
+    $SIG{ALRM}='DEFAULT';
+    $SIG{__DIE__}=\&catchexception;       # restore the normal exception logger
  } else { $answer='self_reply'; }
  return $answer;
 }
@@ -169,6 +206,7 @@ sub logthis {
     my $fh=IO::File->new(">>$execdir/logs/lonc.log");
     my $now=time;
     my $local=localtime($now);
+    $lastlog=$local.': '.$message;
     print $fh "$local ($$): $message\n";
 }
 
@@ -181,6 +219,31 @@ sub logperm {
     my $fh=IO::File->new(">>$execdir/logs/lonnet.perm.log");
     print $fh "$now:$message:$local\n";
 }
+# ------------------------------------------------------------------ Log status
+
+sub logstatus {      # append "pid<TAB>status<TAB>last log line" to the status report
+    my $fh=IO::File->new(">>$perlvar{'lonDocRoot'}/lon-status/loncstatus.txt")
+        or return;   # best effort: printing to an undefined handle would be fatal
+    print $fh $$."\t".$status."\t".$lastlog."\n";
+}
+
+sub initnewstatus {  # start a fresh status report; '>' truncates the old one
+    my $docdir=$perlvar{'lonDocRoot'};
+    my $fh=IO::File->new(">$docdir/lon-status/loncstatus.txt")
+        or return;   # best effort: printing to an undefined handle would be fatal
+    my $local=localtime(time);
+    print $fh "LONC status $local - parent $$\n\n";
+}
+
+# -------------------------------------------------------------- Status setting
+
+sub status {
+    # Record the current activity, time-stamped, in $status for logstatus to report.
+    my ($what)=@_;
+    my $stamp=localtime(time);
+    $status="$stamp: $what";
+}
+
 
 # ---------------------------------------------------- Fork once and dissociate
 
@@ -204,6 +267,8 @@ $SIG{HUP}=$SIG{USR1}='IGNORE';
     
 # Fork off our children, one for every server
 
+&status("Forking ...");
+
 foreach $thisserver (keys %hostip) {
     make_new_child($thisserver);
 }
@@ -218,11 +283,13 @@ $SIG{USR1} = \&USRMAN;
 
 # And maintain the population.
 while (1) {
+    &status("Sleeping");
     sleep;                          # wait for a signal (i.e., child's death)
                                     # See who died and start new one
+    &status("Woke up");
     foreach $thisserver (keys %hostip) {
         if (!$childpid{$thisserver}) {
-	    if ($childatt{$thisserver}<=$childmaxattempts) {
+	    if ($childatt{$thisserver}<$childmaxattempts) {
 	       $childatt{$thisserver}++;
                &logthis(
    "<font color=yellow>INFO: Trying to reconnect for $thisserver "
@@ -257,7 +324,8 @@ sub make_new_child {
     } else {
         # Child can *not* return from this subroutine.
         $SIG{INT} = 'DEFAULT';      # make SIGINT kill us as it did before
-    
+        $SIG{USR1}= \&logstatus;
+   
         # unblock signals
         sigprocmask(SIG_UNBLOCK, $sigset)
             or die "Can't unblock SIGINT for fork: $!\n";
@@ -267,7 +335,11 @@ sub make_new_child {
 $port = "$perlvar{'lonSockDir'}/$conserver";
 
 unlink($port);
+
 # ---------------------------------------------------- Client to network server
+
+&status("Opening TCP: $conserver");
+
 unless (
   $remotesock = IO::Socket::INET->new(PeerAddr => $hostip{$conserver},
                                       PeerPort => $perlvar{'londPort'},
@@ -280,20 +352,47 @@ unless (
        sleep($st);
        exit; 
      };
-# --------------------------------------- Send a ping to make other end do USR1
+# ----------------------------------------------------------------- Init dialog
+
+&status("Init dialogue: $conserver");
+
+     $SIG{ALRM}=sub { die "timeout" };
+     $SIG{__DIE__}='DEFAULT';
+     eval {
+         alarm(60);
 print $remotesock "init\n";
 $answer=<$remotesock>;
 print $remotesock "$answer";
 $answer=<$remotesock>;
 chomp($answer);
+          alarm(0);
+     };
+     $SIG{ALRM}='DEFAULT';
+     $SIG{__DIE__}=\&catchexception;
+ 
+     if ($@=~/timeout/) {
+	 &logthis("Timed out during init: $conserver");
+         exit;
+     }
+
+
 &logthis("Init reply for $conserver: >$answer<");
+if ($answer ne 'ok') {
+       my $st=120+int(rand(240));
+       &logthis(
+"<font color=blue>WARNING: Init failed $conserver ($st secs)</font>");
+       sleep($st);
+       exit; 
+}
 sleep 5;
+&status("Ponging $conserver");
 print $remotesock "pong\n";
 $answer=<$remotesock>;
 chomp($answer);
 &logthis("Pong reply for $conserver: >$answer<");
 # ----------------------------------------------------------- Initialize cipher
 
+&status("Initialize cipher: $conserver");
 print $remotesock "ekey\n";
 my $buildkey=<$remotesock>;
 my $key=$conserver.$perlvar{'lonHostID'};
@@ -315,7 +414,7 @@ if ($cipher=new IDEA $cipherkey) {
 }
 
 # ----------------------------------------- We're online, send delayed messages
-
+    &status("Checking for delayed messages");
     my @allbuffered;
     my $path="$perlvar{'lonSockDir'}/delayed";
     opendir(DIRHANDLE,$path);
@@ -323,6 +422,7 @@ if ($cipher=new IDEA $cipherkey) {
     closedir(DIRHANDLE);
     my $dfname;
     map {
+        &status("Sending delayed $conserver $_");
         $dfname="$path/$_";
         &logthis($dfname);
         my $wcmd;
@@ -345,11 +445,19 @@ if ($cipher=new IDEA $cipherkey) {
             }
             $cmd="enc:$cmdlength:$encrequest\n";
         }
-
+    $SIG{ALRM}=sub { die "timeout" };
+    $SIG{__DIE__}='DEFAULT';
+    eval {
+        alarm(60);
         print $remotesock "$cmd\n";
         $answer=<$remotesock>;
 	chomp($answer);
-        if ($answer ne '') {
+        alarm(0);
+    };
+    $SIG{ALRM}='DEFAULT';
+    $SIG{__DIE__}=\&catchexception;
+
+        if (($answer ne '') && ($@!~/timeout/)) {
 	    unlink("$dfname");
             &logthis("Delayed $cmd to $conserver: >$answer<");
             &logperm("S:$conserver:$bcmd");
@@ -357,6 +465,7 @@ if ($cipher=new IDEA $cipherkey) {
     } @allbuffered;
 
 # ------------------------------------------------------- Listen to UNIX socket
+&status("Opening socket $conserver");
 unless (
   $server = IO::Socket::UNIX->new(Local  => $port,
                                   Type   => SOCK_STREAM,
@@ -394,11 +503,11 @@ while (1) {
     # check for new information on the connections we have
 
     # anything to read or accept?
-    foreach $client ($select->can_read(1)) {
+    foreach $client ($select->can_read(0.1)) {
 
         if ($client == $server) {
             # accept a new connection
-
+            &status("Accept new connection: $conserver");
             $client = $server->accept();
             $select->add($client);
             nonblock($client);
@@ -413,6 +522,7 @@ while (1) {
                 delete $outbuffer{$client};
                 delete $ready{$client};
 
+                &status("Idle $conserver");
                 $select->remove($client);
                 close $client;
                 next;
@@ -443,16 +553,21 @@ while (1) {
         $rv = $client->send($outbuffer{$client}, 0);
         unless (defined $rv) {
             # Whine, but move on.
-            warn "I was told I could write, but I can't.\n";
+            &logthis("I was told I could write, but I can't.\n");
             next;
         }
+        $errno=$!;
         if (($rv == length $outbuffer{$client}) ||
-            ($! == POSIX::EWOULDBLOCK)) {
+            ($errno == POSIX::EWOULDBLOCK) || ($errno == 0)) {
             substr($outbuffer{$client}, 0, $rv) = '';
             delete $outbuffer{$client} unless length $outbuffer{$client};
         } else {
             # Couldn't write all the data, and it wasn't because
             # it would have blocked.  Shutdown and move on.
+
+	    &logthis("Dropping data with ".$errno.": ".
+                     length($outbuffer{$client}).", $rv");
+
             delete $inbuffer{$client};
             delete $outbuffer{$client};
             delete $ready{$client};
@@ -492,8 +607,27 @@ sub handle {
             }
             $request="enc:$cmdlength:$encrequest\n";
         }
+# --------------------------------------------------------------- Main exchange
+    $SIG{ALRM}=sub { die "timeout" };
+    $SIG{__DIE__}='DEFAULT';
+    eval {
+        alarm(300);
+        &status("Sending $conserver: $request");
         print $remotesock "$request";
+        &status("Waiting for reply from $conserver: $request");
         $answer=<$remotesock>;
+        &status("Received reply: $request");
+        alarm(0);
+    };
+    if ($@=~/timeout/) { 
+       $answer='';
+       &logthis(
+        "<font color=red>CRITICAL: Timeout $conserver: $request</font>");
+    }  
+    $SIG{ALRM}='DEFAULT';
+    $SIG{__DIE__}=\&catchexception;
+
+
         if ($answer) {
 	   if ($answer =~ /^enc/) {
                my ($cmd,$cmdlength,$encinput)=split(/:/,$answer);
@@ -515,6 +649,7 @@ sub handle {
 # ===================================================== Done processing request
     }
     delete $ready{$client};
+    &status("Completed $conserver: $request");
 # -------------------------------------------------------------- End non-forker
 }
 # ---------------------------------------------------------- End make_new_child