--- loncom/loncnew	2007/05/01 01:04:23	1.85
+++ loncom/loncnew	2010/12/21 11:17:03	1.92
@@ -2,7 +2,7 @@
 # The LearningOnline Network with CAPA
 # lonc maintains the connections to remote computers
 #
-# $Id: loncnew,v 1.85 2007/05/01 01:04:23 albertel Exp $
+# $Id: loncnew,v 1.92 2010/12/21 11:17:03 foxr Exp $
 #
 # Copyright Michigan State University Board of Trustees
 #
@@ -85,7 +85,7 @@ my $ClientConnection = 0;	# Uniquifier f
 
 my $DebugLevel = 0;
 my $NextDebugLevel= 2;		# So Sigint can toggle this.
-my $IdleTimeout= 600;		# Wait 10 minutes before pruning connections.
+my $IdleTimeout= 7200;		# Wait two hours before pruning connections.
 
 my $LogTransactions = 0;	# When True, all transactions/replies get logged.
 my $executable      = $0;	# Get the full path to me.
@@ -158,6 +158,7 @@ sub LogPerm {
     my $now=time;
     my $local=localtime($now);
     my $fh=IO::File->new(">>$execdir/logs/lonnet.perm.log");
+    chomp($message);
     print $fh "$now:$message:$local\n";
 }
 
@@ -440,7 +441,8 @@ Trigger disconnections of idle sockets.
 
 sub SetupTimer {
     Debug(6, "SetupTimer");
-    Event->timer(interval => 1, cb => \&Tick );
+    Event->timer(interval => 1, cb => \&Tick,
+	hard => 1);
 }
 
 =pod
@@ -608,8 +610,8 @@ sub CompleteTransaction {
 	StartClientReply($Transaction, $data);
     } else {			# Delete deferred transaction file.
 	Log("SUCCESS", "A delayed transaction was completed");
-	LogPerm("S:$Transaction->getClient() :".$Transaction->getRequest());
-	unlink $Transaction->getFile();
+	LogPerm("S:".$Transaction->getClient().":".$Transaction->getRequest());
+	unlink($Transaction->getFile());
     }
 }
 
@@ -760,6 +762,7 @@ sub KillSocket {
 	delete ($ActiveTransactions{$Socket});
     }
     if(exists($ActiveConnections{$Socket})) {
+	$ActiveConnections{$Socket}->cancel;
 	delete($ActiveConnections{$Socket});
 	$ConnectionCount--;
 	if ($ConnectionCount < 0) { $ConnectionCount = 0; }
@@ -771,6 +774,7 @@ sub KillSocket {
 	EmptyQueue();
 	CloseAllLondConnections; # Should all already be closed but...
     }
+    UpdateStatus();
 }
 
 =pod
@@ -1205,7 +1209,7 @@ sub MakeLondConnection {
 					 &GetServerPort(),
 					 &GetHostId());
 
-    if($Connection eq undef) {	# Needs to be more robust later.
+    if($Connection eq undef) {	
 	Log("CRITICAL","Failed to make a connection with lond.");
 	$ConnectionRetriesLeft--;
 	return 0;		# Failure.
@@ -1382,14 +1386,23 @@ sub ClientRequest {
     $data = $data.$thisread;	# Append new data.
     $watcher->data($data);
     if($data =~ /\n$/) {	# Request entirely read.
-	if($data eq "close_connection_exit\n") {
+	if ($data eq "close_connection_exit\n") {
 	    Log("CRITICAL",
 		"Request Close Connection ... exiting");
 	    CloseAllLondConnections();
 	    exit;
+	} elsif ($data eq "reset_retries\n") {
+	    Log("INFO", "Resetting Connection Retries.");
+	    $ConnectionRetriesLeft = $ConnectionRetries;
+	    &UpdateStatus();
+	    my $Transaction = LondTransaction->new($data);
+	    $Transaction->SetClient($socket);
+	    StartClientReply($Transaction, "ok\n");
+	    $watcher->cancel();
+	    return;
 	}
 	Debug(8, "Complete transaction received: ".$data);
-	if($LogTransactions) {
+	if ($LogTransactions) {
 	    Log("SUCCESS", "Transaction: '$data'"); # Transaction has \n.
 	}
 	my $Transaction = LondTransaction->new($data);
@@ -1763,6 +1776,7 @@ sub CreateChild {
     my $sigset = POSIX::SigSet->new(SIGINT);
     sigprocmask(SIG_BLOCK, $sigset);
     $RemoteHost = $host;
+    ShowStatus('Parent keeping the flock'); # Update time in status message.
     Log("CRITICAL", "Forking server for ".$host);
     my $pid          = fork;
     if($pid) {			# Parent
@@ -1897,7 +1911,7 @@ sub parent_listen {
 
 sub parent_clean_up {
     my ($loncapa_host) = @_;
-    Debug(-1, "parent_clean_up: $loncapa_host");
+    Debug(1, "parent_clean_up: $loncapa_host");
 
     my $socket_file = &GetLoncSocketPath($loncapa_host);
     unlink($socket_file);	# No problem if it doesn't exist yet [startup e.g.]
@@ -2255,3 +2269,183 @@ A hash of lond connections that have no
 can be closed if they are idle for a long enough time.
 
 =cut
+
+=pod
+
+=head1 Log messages
+
+The following is a list of log messages that can appear in the 
+lonc.log file.  Each log file has a severity and a message.
+
+=over 2
+
+=item Warning  A socket timeout was detected
+
+If there are pending transactions in the socket's queue,
+they are failed (saved if critical).  If the connection
+retry count gets exceeded by this, the
+remote host is marked as dead.
+Called when timeouts occured during the connection and
+connection dialog with a remote host.
+
+=item Critical Host makred DEAD <hostname>   
+
+The numer of retry counts for contacting a host was
+exceeded. The host is marked dead an no 
+further attempts will be made by that child.
+
+=item Info lonc pipe client hung up on us     
+
+Write to the client pipe indicated no data transferred
+Socket to remote host is shut down.  Reply to the client 
+is discarded.  Note: This is commented out in &ClientWriteable
+
+=item Success  Reply from lond: <data>   
+
+Can be enabled for debugging by setting LogTransactions to nonzero.
+Indicates a successful transaction with lond, <data> is the data received
+from the remote lond.
+
+=item Success A delayed transaction was completed  
+
+A transaction that must be reliable was executed and completed
+as lonc restarted.  This is followed by a mesage of the form
+
+  S: client-name : request
+
+=item WARNING  Failing transaction <cmd>:<subcmd>  
+
+Transaction failed on a socket, but the failure retry count for the remote
+node has not yet been exhausted (the node is not yet marked dead).
+cmd is the command, subcmd is the subcommand.  This results from a con_lost
+when communicating with lond.
+
+=item WARNING Shutting down a socket     
+
+Called when a socket is being closed to lond.  This is emitted both when 
+idle pruning is being done and when the socket has been disconnected by the remote.
+
+=item WARNING Lond connection lost.
+
+Called when a read from lond's socket failed indicating lond has closed the 
+connection or died.  This should be followed by one or more
+
+ "WARNING Failing transaction..." msgs for each in-flight or queued transaction.
+
+=item INFO Connected to lond version:  <version> 
+
+When connection negotiation is complete, the lond version is requested and logged here.
+
+=item SUCCESS Connection n to host now ready for action
+
+Emitted when connection has been completed with lond. n is then number of 
+concurrent connections and host, the host to which the connection has just
+been established.
+
+=item WARNING Connection to host has been disconnected
+
+Write to a lond resulted in failure status.  Connection to lond is dropped.
+
+=item SUCCESS Created connection n to host host 
+
+Initial connection request to host..(before negotiation).
+
+=item CRITICAL Request Close Connection ... exiting
+
+Client has sent "close_connection_exit"   The loncnew server is exiting.
+
+=item INFO Resetting Connection Retries 
+
+Client has sent "reset_retries" The lond connection retries are reset to zero for the
+corresponding lond.
+
+=item SUCCESS Transaction <data>
+
+Only emitted if the global variable $LogTransactions was set to true.
+A client has requested a lond transaction <data> is the contents of the request.
+
+=item SUCCESS Toggled transaction logging <LogTransactions>
+                                    
+The state of the $LogTransactions global has been toggled, and its current value
+(after being toggled) is displayed.  When non zero additional logging of transactions
+is enabled for debugging purposes.  Transaction logging is toggled on receipt of a USR2
+signal.
+
+=item CRITICAL Abnormal exit. Child <pid> for <host> died thorugh signal.
+
+QUIT signal received.  lonc child process is exiting.
+
+=item SUCCESS New debugging level for <RemoteHost> now <DebugLevel>
+                                    
+Debugging toggled for the host loncnew is talking with.
+Currently debugging is a level based scheme with higher number 
+conveying more information.  The daemon starts out at
+DebugLevel 0 and can toggle back and forth between that and
+DebugLevel 2  These are controlled by
+the global variables $DebugLevel and $NextDebugLevel
+The debug level can go up to 9.
+SIGINT toggles the debug level.  The higher the debug level the 
+more debugging information is spewed.  See the Debug
+sub in loncnew.
+
+=item CRITICAL Forking server for host  
+
+A child is being created to service requests for the specified host.
+
+
+=item WARNING Request for a second child on hostname
+                                    
+Somehow loncnew was asked to start a second child on a host that already had a child
+servicing it.  This request is not honored, but themessage is emitted.  This could happen
+due to a race condition.  When a client attempts to contact loncnew for a new host, a child
+is forked off to handle the requests for that server.  The parent then backs off the Unix
+domain socket leaving it for the child to service all requests.  If in the time between
+creating the child, and backing off, a new connection request comes in to the unix domain
+socket, this could trigger (unlikely but remotely possible),.
+
+=item CRITICAL ------ Starting Children ----
+
+This message should probably be changed to "Entering event loop"  as the loncnew only starts
+children as needed.  This message is emitted as new events are established and
+the event processing loop is entered.
+
+=item INFO Updating connections via SIGUSR2
+                                    
+SIGUSR2 received. The original code would kill all clients, re-read the host file,
+then restart children for each host.  Now that childrean aree started on demand, this
+just kills all child processes and lets requests start them as needed again.
+
+
+=item CRITICAL Restarting
+
+SigHUP received.  all the children are killed and the script exec's itself to start again.
+
+=item CRITICAL Nicely killing lonc for host pid = <pid>
+
+Attempting to kill the child that is serving the specified host (pid given) cleanly via
+SIGQUIT  The child should handle that, clean up nicely and exit.
+
+=item CRITICAL Nastily killing lonc for host pid = <pid>
+
+The child specified did not die when requested via SIGQUIT.  Therefore it is killed
+via SIGKILL.
+
+=item CRITICAL Asked to kill children.. first be nice..
+
+In the parent's INT handler.  INT kills the child processes.  This inidicate loncnew
+is about to attempt to kill all known children via SIGQUIT.  This message should be followed 
+by one "Nicely killing" message for each extant child.
+
+=item CRITICAL Now kill children nasty 
+
+In the parent's INT handler. remaining children are about to be killed via
+SIGKILL. Should be followed by a Nastily killing... for each lonc child that 
+refused to die.
+
+=item CRITICAL Master process exiting
+
+In the parent's INT handler. just prior to the exit 0 call.
+
+=back
+
+=cut