--- loncom/loncnew	2003/07/02 01:12:35	1.12
+++ loncom/loncnew	2003/08/03 00:44:31	1.17
@@ -2,13 +2,12 @@
 # The LearningOnline Network with CAPA
 # lonc maintains the connections to remote computers
 #
-# $Id: loncnew,v 1.12 2003/07/02 01:12:35 foxr Exp $
+# $Id: loncnew,v 1.17 2003/08/03 00:44:31 foxr Exp $
 #
 # Copyright Michigan State University Board of Trustees
 #
 # This file is part of the LearningOnline Network with CAPA (LON-CAPA).
-#
-# LON-CAPA is free software; you can redistribute it and/or modify
+## LON-CAPA is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation; either version 2 of the License, or
 # (at your option) any later version.
@@ -27,7 +26,7 @@
 # http://www.lon-capa.org/
 #
 #
-# new lonc handles n requestors spread out bver m connections to londs.
+# new lonc handles n requestors spread out over m connections to londs.
 # This module is based on the Event class.
 #   Development iterations:
 #    - Setup basic event loop.   (done)
@@ -46,10 +45,28 @@
 
 # Change log:
 #    $Log: loncnew,v $
-#    Revision 1.12  2003/07/02 01:12:35  foxr
-#    - Add some debugging to killthemall
-#    - Add better error handling to LondReadable
-#    - Remove tick logging in the timer handler.
+#    Revision 1.17  2003/08/03 00:44:31  foxr
+#    1. Correct handling of connection failure: Assume it means the host is
+#       unreachable and fail all of the queued transactions.  Note that the
+#       inflight transactions should fail on their own time due either to timeout
+#       or send/receive failures.
+#    2. Correct handling of logs for forced death signals.  Pull the signal
+#       from the event watcher.
+#
+#    Revision 1.16  2003/07/29 02:33:05  foxr
+#    Add SIGINT processing to child processes to toggle annoying trace mode
+#    on/off.. will try to use this to isolate the compute bound process issue.
+#
+#    Revision 1.15  2003/07/15 02:07:05  foxr
+#    Added code for lonc/lond transaction timeouts.  Who knows if it works right.
+#    The intent is for a timeout to fail any transaction in progress and kill
+#    off the socket that timed out.
+#
+#    Revision 1.14  2003/07/03 02:10:18  foxr
+#    Get all of the signals to work correctly.
+#
+#    Revision 1.13  2003/07/02 01:31:55  foxr
+#    Added kill -HUP logic (restart).
 #
 #    Revision 1.11  2003/06/25 01:54:44  foxr
 #    Fix more problems with transaction failure.
@@ -95,12 +112,12 @@ use LONCAPA::HashIterator;
 #
 #   Disable all signals we might receive from outside for now.
 #
-$SIG{QUIT}  = IGNORE;
-$SIG{HUP}   = IGNORE;
-$SIG{USR1}  = IGNORE;
-$SIG{INT}   = IGNORE;
-$SIG{CHLD}  = IGNORE;
-$SIG{__DIE__}  = IGNORE;
+#$SIG{QUIT}  = IGNORE;
+#$SIG{HUP}   = IGNORE;
+#$SIG{USR1}  = IGNORE;
+#$SIG{INT}   = IGNORE;
+#$SIG{CHLD}  = IGNORE;
+#$SIG{__DIE__}  = IGNORE;
 
 
 # Read the httpd configuration file to get perl variables
@@ -119,6 +136,7 @@ my $MaxConnectionCount = 10;	# Will get
 my $ClientConnection = 0;	# Uniquifier for client events.
 
 my $DebugLevel = 0;
+my $NextDebugLevel= 10;		# So Sigint can toggle this.
 my $IdleTimeout= 3600;		# Wait an hour before pruning connections.
 
 #
@@ -134,6 +152,7 @@ my $WorkQueue       = Queue->new(); # Qu
 my $ConnectionCount = 0;
 my $IdleSeconds     = 0;	# Number of seconds idle.
 my $Status          = "";	# Current status string.
+my $RecentLogEntry  = "";
 my $ConnectionRetries=5;	# Number of connection retries allowed.
 my $ConnectionRetriesLeft=5;	# Number of connection retries remaining.
 
@@ -210,6 +229,7 @@ sub Log {
     my $execdir = $perlvar{'lonDaemons'};
     my $fh      = IO::File->new(">>$execdir/logs/lonc.log");
     my $msg = sprintf($finalformat, $message);
+    $RecentLogEntry = $msg;
     print $fh $msg;
     
     
@@ -253,7 +273,7 @@ sub Debug {
     my $level   = shift;
     my $message = shift;
     if ($level <= $DebugLevel) {
-	print $message." host = ".$RemoteHost."\n";
+	Log("INFO", "-Debug- $message host = $RemoteHost"); # Route debug output to the log, tagged with the host.
     }
 }
 
@@ -283,6 +303,21 @@ sub ShowStatus {
 
 =pod
 
+=head2 SocketTimeout
+
+    Called when an action on the socket times out.  The socket is 
+   destroyed and any active transaction is failed.
+
+
+=cut
+sub SocketTimeout {
+    my ($Socket) = @_;
+    # The whole timeout response is delegated to KillSocket.
+    KillSocket($Socket);
+}
+
+=pod
+
 =head2 Tick
 
 Invoked  each timer tick.
@@ -307,7 +342,13 @@ sub Tick {
     } else {
 	$IdleSeconds = 0;	# Reset idle count if not idle.
     }
-
+    #
+    #  For each inflight transaction, tick down its timeout counter.
+    #
+    foreach $item (keys %ActiveTransactions) {
+	my $Socket = $ActiveTransactions{$item}->getServer();
+	$Socket->Tick();
+    }
     # Do we have work in the queue, but no connections to service them?
     # If so, try to make some new connections to get things going again.
     #
@@ -445,6 +486,12 @@ sub ClientWritable {
 		
 	    } else {		# Partial string sent.
 		$Watcher->data(substr($Data, $result));
+		if($result == 0) {    # client hung up on us!!
+		    Log("INFO", "lonc pipe client hung up on us!");
+		    $Watcher->cancel;
+		    $Socket->shutdown(2);
+		    $Socket->close();
+		}
 	    }
 	    
 	} else {			# Error of some sort...
@@ -562,6 +609,7 @@ Parameters:
 
 sub FailTransaction {
     my $transaction = shift;
+    Log("WARNING", "Failing transaction ".$transaction->getRequest());
     Debug(1, "Failing transaction: ".$transaction->getRequest());
     if (!$transaction->isDeferred()) { # If the transaction is deferred we'll get to it.
 	my $client  = $transaction->getClient();
@@ -624,6 +672,7 @@ nonzero if we are allowed to create a ne
 sub KillSocket {
     my $Socket = shift;
 
+    Log("WARNING", "Shutting down a socket");
     $Socket->Shutdown();
 
     #  If the socket came from the active connection set,
@@ -720,14 +769,16 @@ sub LondReadable {
 
     SocketDump(6, $Socket);
     my $status = $Socket->Readable();
+
     &Debug(2, "Socket->Readable returned: $status");
 
     if($status != 0) {
 	 # bad return from socket read. Currently this means that
 	# The socket has become disconnected. We fail the transaction.
 
+	Log("WARNING",
+	    "Lond connection lost.");
 	if(exists($ActiveTransactions{$Socket})) {
-	    Debug(3,"Lond connection lost failing transaction");
 	    FailTransaction($ActiveTransactions{$Socket});
 	}
 	$Watcher->cancel();
@@ -1132,7 +1183,9 @@ sub QueueTransaction {
 	$WorkQueue->enqueue($requestData);
 	if($ConnectionCount < $MaxConnectionCount) {
 	    Debug(4,"Starting additional lond connection");
-	    MakeLondConnection();
+	    if(MakeLondConnection() == 0) {
+		EmptyQueue();	# Fail transactions, can't make connection.
+	    }
 	}
     } else {			# Can start the request:
 	Debug(8,"Can start...");
@@ -1290,6 +1343,25 @@ sub SetupLoncListener {
 	      fd     => $socket);
 }
 
+=pod 
+
+=head2 ChildStatus
+ 
+Child USR1 signal handler to report the most recent status
+into the status file.
+
+=cut
+sub ChildStatus {
+    my $event = shift;
+    my $watcher = $event->w;	# Its data field names the triggering signal.
+
+    Debug(2, "Reporting child status because : ".$watcher->data);
+    my $docdir = $perlvar{'lonDocRoot'};
+    my $fh = IO::File->new(">>$docdir/lon-status/loncstatus.txt");
+    return Log("WARNING", "Cannot append to loncstatus.txt: $!") if !$fh;
+    print $fh $$."\t".$RemoteHost."\t".$Status."\t".$RecentLogEntry."\n";
+}
+
 =pod
 
 =head2 SignalledToDeath
@@ -1300,17 +1372,35 @@ Called in response to a signal that caus
 
 
 sub SignalledToDeath {
-    Debug(2,"Signalled to death!");
-    my ($signal) = @_;
+    my $event  = shift;		# Event object passed to the signal callback.
+    my $watcher= $event->w;	# Watcher that fired; its data names the signal.
+
+    Debug(2,"Signalled to death! via ".$watcher->data);
+    my ($signal) = $watcher->data;	# Signal name used in the log messages below.
     chomp($signal);
     Log("CRITICAL", "Abnormal exit.  Child $$ for $RemoteHost "
 	."died through "."\"$signal\"");
     LogPerm("F:lonc: $$ on $RemoteHost signalled to death: "
 	    ."\"$signal\"");
-    die("Signal abnormal end");
     exit 0;
 
 }
+
+=head2 ToggleDebug
+
+This sub toggles trace debugging on and off.
+
+=cut
+
+sub ToggleDebug {
+    # Exchange the active level with the parked one; the next call
+    # swaps them back, which is what makes this a toggle.
+    ($DebugLevel, $NextDebugLevel) = ($NextDebugLevel, $DebugLevel);
+
+    Log("SUCCESS", "New debugging level for $RemoteHost now $DebugLevel");
+
+}
+
 =head2 ChildProcess
 
 This sub implements a child process for a single lonc daemon.
@@ -1320,14 +1410,22 @@ This sub implements a child process for
 sub ChildProcess {
 
 
-    # For now turn off signals.
-    
-    $SIG{QUIT}  = \&SignalledToDeath;
-    $SIG{HUP}   = IGNORE;
-    $SIG{USR1}  = IGNORE;
-    $SIG{INT}   = DEFAULT;
-    $SIG{CHLD}  = IGNORE;
-    $SIG{__DIE__}  = \&SignalledToDeath;
+    #
+    #  Signals must be handled by the Event framework...
+#
+
+    Event->signal(signal   => "QUIT",
+		  cb       => \&SignalledToDeath,
+		  data     => "QUIT");
+    Event->signal(signal   => "HUP",
+		  cb       => \&ChildStatus,
+		  data     => "HUP");
+    Event->signal(signal   => "USR1",
+		  cb       => \&ChildStatus,
+		  data     => "USR1");
+    Event->signal(signal   => "INT",
+		  cb       => \&ToggleDebug,
+		  data     => "INT");
 
     SetupTimer();
     
@@ -1339,7 +1437,7 @@ sub ChildProcess {
 
 # Setup the initial server connection:
     
-     # &MakeLondConnection(); // let first work requirest do it.
+     # &MakeLondConnection(); // let first work request do it.
 
 
     Debug(9,"Entering event loop");
@@ -1359,6 +1457,7 @@ sub CreateChild {
     Log("CRITICAL", "Forking server for ".$host);
     $pid          = fork;
     if($pid) {			# Parent
+	$RemoteHost = "Parent";
 	$ChildHash{$pid} = $RemoteHost;
 	sigprocmask(SIG_UNBLOCK, $sigset);
 
@@ -1406,6 +1505,8 @@ open (PIDSAVE, ">$execdir/logs/lonc.pid"
 print PIDSAVE "$$\n";
 close(PIDSAVE);
 
+
+
 if (POSIX::setsid() < 0) {
     print "Could not create new session\n";
     exit -1;
@@ -1432,9 +1533,10 @@ ShowStatus("Parent keeping the flock");
 #   Set up parent signals:
 #
 
-$SIG{INT}  = \&KillThemAll;
-$SIG{TERM} = \&KillThemAll; 
-
+$SIG{INT}  = \&Terminate;
+$SIG{TERM} = \&Terminate; 
+$SIG{HUP}  = \&Restart;
+$SIG{USR1} = \&CheckKids; 
 
 while(1) {
     $deadchild = wait();
@@ -1448,12 +1550,58 @@ while(1) {
     }
 }
 
+
+
+=pod
+
+=head1 CheckKids
+
+  Since kids do not die as easily in this implementation
+as the previous one, there  is no need to restart the
+dead ones (all dead kids get restarted when they die!!)
+The only thing this function does is to pass USR1 to the
+kids so that they report their status.
+
+=cut
+
+sub CheckKids {
+    Debug(2, "Checking status of children");
+    my $fh = IO::File->new(">$perlvar{'lonDocRoot'}/lon-status/loncstatus.txt");
+    if ($fh) {			# Header is best effort; the kids are polled regardless.
+	print $fh "LONC status ".localtime(time)." - parent $$ \n\n";
+	$fh->close();		# Flush now: each child appends to this same file.
+    }
+    foreach my $pid (keys %ChildHash) {
+	Debug(2, "Sending USR1 -> $pid");
+	kill 'USR1' => $pid;	# Tell Child to report status.
+	sleep 1;		# Wait so file doesn't intermix.
+    }
+}
+
+=pod
+
+=head1 Restart
+
+Signal handler for HUP... all children are killed and
+we self restart.  This is an el-cheapo way to re read
+the config file.
+
+=cut
+
+sub Restart {
+    KillThemAll();		# Parens needed: the sub is defined later in the file.
+    Log("CRITICAL", "Restarting");
+    my $execdir = $perlvar{'lonDaemons'};
+    unlink("$execdir/logs/lonc.pid");
+    exec("$execdir/lonc") or Log("CRITICAL", "Cannot exec $execdir/lonc: $!");
+}
+
 =pod
 
 =head1 KillThemAll
 
 Signal handler that kills all children by sending them a 
-SIGINT.  Responds to sigint and sigterm.
+SIGQUIT.  Invoked by Terminate and Restart.
 
 =cut
 
@@ -1465,14 +1613,30 @@ sub KillThemAll {
 	Debug(2, "Killing lonc for $serving pid = $pid");
 	ShowStatus("Killing lonc for $serving pid = $pid");
 	Log("CRITICAL", "Killing lonc for $serving pid = $pid");
-	kill('INT', $pid);
+	kill 'QUIT' => $pid;
+	delete($ChildHash{$pid});
     }
-    Log("CRITICAL", "Killing the master process.");
-    exit
+    my $execdir = $perlvar{'lonDaemons'};
+    unlink("$execdir/logs/lonc.pid");
+
 }
 
 =pod
 
+=head1 Terminate
+ 
+Terminate the system.
+
+=cut
+
+sub Terminate {
+    KillThemAll();		# Stop every child before the master exits.
+    Log("CRITICAL","Master process exiting");
+    exit 0;
+
+}
+=pod
+
 =head1 Theory
 
 The event class is used to build this as a single process with an