--- loncom/loncnew	2003/10/21 14:24:42	1.29
+++ loncom/loncnew	2004/01/13 09:57:18	1.39
@@ -2,7 +2,7 @@
 # The LearningOnline Network with CAPA
 # lonc maintains the connections to remote computers
 #
-# $Id: loncnew,v 1.29 2003/10/21 14:24:42 foxr Exp $
+# $Id: loncnew,v 1.39 2004/01/13 09:57:18 foxr Exp $
 #
 # Copyright Michigan State University Board of Trustees
 #
@@ -35,114 +35,16 @@
 #    - Add ability to create/negotiate lond connections (done).
 #    - Add general logic for dispatching requests and timeouts. (done).
 #    - Add support for the lonc/lond requests.          (done).
-#    - Add logging/status monitoring.
-#    - Add Signal handling - HUP restarts. USR1 status report.
+#    - Add logging/status monitoring.                    (done)
+#    - Add Signal handling - HUP restarts. USR1 status report. (done)
 #    - Add Configuration file I/O                       (done).
-#    - Add management/status request interface.
+#    - Add management/status request interface.         (done)
 #    - Add deferred request capability.                  (done)
-#    - Detect transmission timeouts.
+#    - Detect transmission timeouts.                     (done)
 #
 
-# Change log:
-#    $Log: loncnew,v $
-#    Revision 1.29  2003/10/21 14:24:42  foxr
-#    Fix little typo that may explain growth of connections
-#
-#    Revision 1.28  2003/10/14 15:36:21  albertel
-#    - making it easier to run loncnew,
-#       /etc/init.d/loncontrol startnew
-#       /etc/init.d/loncontrol restartnew
-#      will now start loncnew in place of lonc
-#
-#    Revision 1.27  2003/10/07 11:23:03  foxr
-#    Installed and tested code to process reinit in parent server.
-#
-#    Revision 1.26  2003/09/30 11:11:17  foxr
-#    Add book-keeping hashes to support the re-init procedure.
-#
-#    Revision 1.25  2003/09/23 11:22:14  foxr
-#    Tested ability to receive sigusr2  This is now logged and must be
-#    properly implemented as a re-read of hosts and re-init of appropriate
-#    children.
-#
-#    Revision 1.24  2003/09/16 09:46:42  foxr
-#    Added skeletal infrastructure to support SIGUSR2 update hosts request.
-#
-#    Revision 1.23  2003/09/15 09:24:49  foxr
-#    Add use strict and fix all the fallout from that.
-#
-#    Revision 1.22  2003/09/02 10:34:47  foxr
-#    - Fix errors in host dead detection logic (too many cases where the
-#      retries left were not getting incremented or just not checked).
-#    - Added some additional status to the ps axuww display:
-#      o Remaining retries on a host.
-#      o >>> DEAD <<< indicator if I've given up on a host.
-#    - Tested the SIGHUP will reset the retries remaining count (thanks to
-#      the above status stuff, and get allow the loncnew to re-try again
-#      on the host (thanks to the log).
-#
-#    Revision 1.21  2003/08/26 09:19:51  foxr
-#    How embarrassing... put in the SocketTimeout function in loncnew and forgot
-#    to actually hook it into the LondTransaction.  Added this to MakeLondConnection
-#    where it belongs... hopefully transactions (not just connection attempts) will
-#    timeout more speedily than the socket errors will catch it.
-#
-#    Revision 1.20  2003/08/25 18:48:11  albertel
-#    - fixing a forgotten ;
-#
-#    Revision 1.19  2003/08/19 09:31:46  foxr
-#    Get socket directory from configuration rather than the old hard coded test
-#    way that I forgot to un-hard code.
-#
-#    Revision 1.18  2003/08/06 09:52:29  foxr
-#    Also needed to remember to fail in-flight transactions if their sends fail.
-#
-#    Revision 1.17  2003/08/03 00:44:31  foxr
-#    1. Correct handling of connection failure: Assume it means the host is
-#       unreachable and fail all of the queued transactions.  Note that the
-#       inflight transactions should fail on their own time due either to timeout
-#       or send/receive failures.
-#    2. Correct handling of logs for forced death signals.  Pull the signal
-#       from the event watcher.
-#
-#    Revision 1.16  2003/07/29 02:33:05  foxr
-#    Add SIGINT processing to child processes to toggle annoying trace mode
-#    on/off.. will try to use this to isolate the compute boud process issue.
-#
-#    Revision 1.15  2003/07/15 02:07:05  foxr
-#    Added code for lonc/lond transaction timeouts.  Who knows if it works right.
-#    The intent is for a timeout to fail any transaction in progress and kill
-#    off the sockt that timed out.
-#
-#    Revision 1.14  2003/07/03 02:10:18  foxr
-#    Get all of the signals to work correctly.
-#
-#    Revision 1.13  2003/07/02 01:31:55  foxr
-#    Added kill -HUP logic (restart).
-#
-#    Revision 1.11  2003/06/25 01:54:44  foxr
-#    Fix more problems with transaction failure.
-#
-#    Revision 1.10  2003/06/24 02:46:04  foxr
-#    Put a limit on  the number of times we'll retry a connection.
-#    Start getting the signal stuff put in as well...note that need to get signals
-#    going or else the client will permanently give up on dead servers.
-#
-#    Revision 1.9  2003/06/13 02:38:43  foxr
-#    Add logging in 'expected format'
-#
-#    Revision 1.8  2003/06/11 02:04:35  foxr
-#    Support delayed transactions... this is done uniformly by encapsulating
-#    transactions in an object ... a LondTransaction that is implemented by
-#    LondTransaction.pm
-#
-#    Revision 1.7  2003/06/03 01:59:39  foxr
-#    complete coding to support deferred transactions.
-#
-#
 use strict;
 use lib "/home/httpd/lib/perl/";
-use lib "/home/foxr/newloncapa/types";
 use Event qw(:DEFAULT );
 use POSIX qw(:signal_h);
 use POSIX;
@@ -187,6 +89,8 @@ my $DebugLevel = 0;
 my $NextDebugLevel= 2;		# So Sigint can toggle this.
 my $IdleTimeout= 3600;		# Wait an hour before pruning connections.
 
+my $LogTransactions = 0;	# When True, all transactions/replies get logged.
+
 #
 #  The variables below are only used by the child processes.
 #
@@ -201,8 +105,8 @@ my $ConnectionCount = 0;
 my $IdleSeconds     = 0;	# Number of seconds idle.
 my $Status          = "";	# Current status string.
 my $RecentLogEntry  = "";
-my $ConnectionRetries=5;	# Number of connection retries allowed.
-my $ConnectionRetriesLeft=5;	# Number of connection retries remaining.
+my $ConnectionRetries=2;	# Number of connection retries allowed.
+my $ConnectionRetriesLeft=2;	# Number of connection retries remaining.
 
 #
 #   The hash below gives the HTML format for log messages
@@ -308,7 +212,6 @@ sub GetPeername {
 	return $peerfile;
     }
 }
-#----------------------------- Timer management ------------------------
 =pod
 
 =head2 Debug
@@ -360,11 +263,14 @@ sub ShowStatus {
 =cut
 sub SocketTimeout {
     my $Socket = shift;
-    
+    Log("WARNING", "A socket timeout was detected");
+    Debug(0, " SocketTimeout called: ");
+    $Socket->Dump();
     KillSocket($Socket);	# A transaction timeout also counts as
                                 # a connection failure:
     $ConnectionRetriesLeft--;
 }
+#----------------------------- Timer management ------------------------
 
 =pod
 
@@ -399,9 +305,13 @@ sub Tick {
     #
     #  For each inflight transaction, tick down its timeout counter.
     #
-    foreach my $item (keys %ActiveTransactions) {
-	my $Socket = $ActiveTransactions{$item}->getServer();
-	$Socket->Tick();
+
+    foreach my $item (keys %ActiveConnections) {
+	my $State = $ActiveConnections{$item}->data->GetState();
+	if ($State ne 'Idle') {
+	    Debug(5,"Ticking Socket $State $item");
+	    $ActiveConnections{$item}->data->Tick();
+	}
     }
     # Do we have work in the queue, but no connections to service them?
     # If so, try to make some new connections to get things going again.
@@ -448,7 +358,7 @@ Trigger disconnections of idle sockets.
 
 sub SetupTimer {
     Debug(6, "SetupTimer");
-    Event->timer(interval => 1, debug => 1, cb => \&Tick );
+    Event->timer(interval => 1, cb => \&Tick );
 }
 
 =pod
@@ -605,6 +515,9 @@ sub CompleteTransaction {
 
     if (!$Transaction->isDeferred()) { # Normal transaction
 	my $data   = $Socket->GetReply(); # Data to send.
+	if($LogTransactions) {
+	    Log("SUCCESS", "Reply from lond: '$data'");
+	}
 	StartClientReply($Transaction, $data);
     } else {			# Delete deferred transaction file.
 	Log("SUCCESS", "A delayed transaction was completed");
@@ -669,10 +582,10 @@ Parameters:
 sub FailTransaction {
     my $transaction = shift;
     Log("WARNING", "Failing transaction ".$transaction->getRequest());
-    Debug(5, "Failing transaction: ".$transaction->getRequest());
+    Debug(1, "Failing transaction: ".$transaction->getRequest());
     if (!$transaction->isDeferred()) { # If the transaction is deferred we'll get to it.
 	my $client  = $transaction->getClient();
-	Debug(5," Replying con_lost to ".$transaction->getRequest());
+	Debug(1," Replying con_lost to ".$transaction->getRequest());
 	StartClientReply($transaction, "con_lost\n");
     }
     if($ConnectionRetriesLeft <= 0) {
@@ -749,9 +662,9 @@ sub KillSocket {
     }
     if(exists($ActiveConnections{$Socket})) {
 	delete($ActiveConnections{$Socket});
+	$ConnectionCount--;
+	if ($ConnectionCount < 0) { $ConnectionCount = 0; }
     }
-    $ConnectionCount--;
-
     #  If the connection count has gone to zero and there is work in the
     #  work queue, the work all gets failed with con_lost.
     #
@@ -1133,7 +1046,7 @@ sub MakeLondConnection {
     my $Connection = LondConnection->new(&GetServerHost(),
 					 &GetServerPort());
 
-    if($Connection == undef) {	# Needs to be more robust later.
+    if(!defined($Connection)) {	# Needs to be more robust later.
 	Log("CRITICAL","Failed to make a connection with lond.");
 	$ConnectionRetriesLeft--;
 	return 0;		# Failure.
@@ -1145,7 +1058,7 @@ sub MakeLondConnection {
 	# exchange underway.
 	#
 	my $Socket = $Connection->GetSocket();
-	if($Socket == undef) {
+	if(!defined($Socket)) {
 	    die "did not get a socket from the connection";
 	} else {
 	    &Debug(9,"MakeLondConnection got socket: ".$Socket);
@@ -1309,6 +1222,9 @@ sub ClientRequest {
 	    exit;
 	}
 	Debug(8, "Complete transaction received: ".$data);
+	if($LogTransactions) {
+	    Log("SUCCESS", "Transaction: '$data'"); # Transaction has \n.
+	}
 	my $Transaction = LondTransaction->new($data);
 	$Transaction->SetClient($socket);
 	QueueTransaction($Transaction);
@@ -1417,6 +1333,24 @@ sub SetupLoncListener {
 	      fd     => $socket);
 }
 
+#
+#  ToggleTransactionLogging - flip the transaction logging switch.
+#  Installed in this file as the callback for SIGUSR2.
+#  Implicit inputs:
+#     $LogTransactions - current setting (true => log transactions).
+#  Implicit outputs:
+#     $LogTransactions - set to 1 if it was false, otherwise to 0.
+sub ToggleTransactionLogging {
+    # Announce the request on stderr before the switch is flipped.
+    print STDERR "Toggle transaction logging...\n";
+
+    # Normalize to exactly 0 or 1 so the log line below is unambiguous.
+    $LogTransactions = $LogTransactions ? 0 : 1;
+
+    # Record the new setting in the log.
+    Log("SUCCESS", "Toggled transaction logging: $LogTransactions \n");
+}
+
 =pod 
 
 =head2 ChildStatus
@@ -1436,6 +1370,20 @@ sub ChildStatus {
     my $fh = IO::File->new(">>$docdir/lon-status/loncstatus.txt");
     print $fh $$."\t".$RemoteHost."\t".$Status."\t".
 	$RecentLogEntry."\n";
+    #
+    #  Write out information about each of the connections:
+    #
+    print $fh "Active connection statuses: \n";
+    my $i = 1;
+    print STDERR  "================================= Socket Status Dump:\n";
+    foreach my $item (keys %ActiveConnections) {
+	my $Socket = $ActiveConnections{$item}->data;
+	my $state  = $Socket->GetState();
+	print $fh "Connection $i State: $state\n";
+	print STDERR "---------------------- Connection $i \n";
+	$Socket->Dump();
+	$i++;	
+    }
     $ConnectionRetriesLeft = $ConnectionRetries;
 }
 
@@ -1500,6 +1448,8 @@ sub ChildProcess {
     Event->signal(signal   => "USR1",
 		  cb       => \&ChildStatus,
 		  data     => "USR1");
+    Event->signal(signal   => "USR2",
+		  cb       => \&ToggleTransactionLogging);
     Event->signal(signal   => "INT",
 		  cb       => \&ToggleDebug,
 		  data     => "INT");
@@ -1594,6 +1544,7 @@ ShowStatus("Forking node servers");
 
 Log("CRITICAL", "--------------- Starting children ---------------");
 
+LondConnection::ReadConfig;               # Read standard config files.
 my $HostIterator = LondConnection::GetHostIterator;
 while (! $HostIterator->end()) {