--- loncom/loncnew	2007/03/28 20:28:29	1.81
+++ loncom/loncnew	2011/01/24 11:02:32	1.95
@@ -2,7 +2,7 @@
 # The LearningOnline Network with CAPA
 # lonc maintains the connections to remote computers
 #
-# $Id: loncnew,v 1.81 2007/03/28 20:28:29 albertel Exp $
+# $Id: loncnew,v 1.95 2011/01/24 11:02:32 foxr Exp $
 #
 # Copyright Michigan State University Board of Trustees
 #
@@ -72,7 +72,8 @@ my %perlvar    = %{$perlvarref};
 #
 #  parent and shared variables.
 
-my %ChildHash;			# by pid -> host.
+my %ChildPid;			# by pid -> host.
+my %ChildHost;			# by host.
 my %listening_to;		# Socket->host table for who the parent
                                 # is listening to.
 my %parent_dispatchers;         # host-> listener watcher events. 
@@ -84,7 +85,7 @@ my $ClientConnection = 0;	# Uniquifier f
 
 my $DebugLevel = 0;
 my $NextDebugLevel= 2;		# So Sigint can toggle this.
-my $IdleTimeout= 600;		# Wait 10 minutes before pruning connections.
+my $IdleTimeout= 5*60;		# Seconds to wait prior to pruning connections.
 
 my $LogTransactions = 0;	# When True, all transactions/replies get logged.
 my $executable      = $0;	# Get the full path to me.
@@ -94,6 +95,7 @@ my $executable      = $0;	# Get the full
 #
 my $RemoteHost;			# Name of host child is talking to.
 my $RemoteHostId;		# default lonid of host child is talking to.
+my @all_host_ids;
 my $UnixSocketDir= $perlvar{'lonSockDir'};
 my $IdleConnections = Stack->new(); # Set of idle connections
 my %ActiveConnections;		# Connections to the remote lond.
@@ -156,6 +158,7 @@ sub LogPerm {
     my $now=time;
     my $local=localtime($now);
     my $fh=IO::File->new(">>$execdir/logs/lonnet.perm.log");
+    chomp($message);
     print $fh "$now:$message:$local\n";
 }
 
@@ -438,7 +441,8 @@ Trigger disconnections of idle sockets.
 
 sub SetupTimer {
     Debug(6, "SetupTimer");
-    Event->timer(interval => 1, cb => \&Tick );
+    Event->timer(interval => 1, cb => \&Tick,
+	hard => 1);
 }
 
 =pod
@@ -500,6 +504,9 @@ the data and Event->w->fd is the socket
 sub ClientWritable {
     my $Event    = shift;
     my $Watcher  = $Event->w;
+    if (!defined($Watcher)) {
+	&child_exit(-1,'No watcher for event in ClientWritable');
+    }
     my $Data     = $Watcher->data;
     my $Socket   = $Watcher->fd;
 
@@ -563,6 +570,7 @@ sub ClientWritable {
 	}
     } else {
 	$Watcher->cancel();	# A delayed request...just cancel.
+	return;
     }
 }
 
@@ -602,8 +610,8 @@ sub CompleteTransaction {
 	StartClientReply($Transaction, $data);
     } else {			# Delete deferred transaction file.
 	Log("SUCCESS", "A delayed transaction was completed");
-	LogPerm("S:$Transaction->getClient() :".$Transaction->getRequest());
-	unlink $Transaction->getFile();
+	LogPerm("S:".$Transaction->getClient().":".$Transaction->getRequest());
+	unlink($Transaction->getFile());
     }
 }
 
@@ -754,6 +762,7 @@ sub KillSocket {
 	delete ($ActiveTransactions{$Socket});
     }
     if(exists($ActiveConnections{$Socket})) {
+	$ActiveConnections{$Socket}->cancel;
 	delete($ActiveConnections{$Socket});
 	$ConnectionCount--;
 	if ($ConnectionCount < 0) { $ConnectionCount = 0; }
@@ -765,6 +774,7 @@ sub KillSocket {
 	EmptyQueue();
 	CloseAllLondConnections; # Should all already be closed but...
     }
+    UpdateStatus();
 }
 
 =pod
@@ -936,7 +946,8 @@ sub LondReadable {
 	    CompleteTransaction($Socket, 
 				$ActiveTransactions{$Socket});
 	} else {
-	    Log("SUCCESS", "Connection ".$ConnectionCount." to "
+	    my $count = $Socket->GetClientData();
+	    Log("SUCCESS", "Connection ".$count." to "
 		.$RemoteHost." now ready for action");
 	}
 	ServerToIdle($Socket);	# Next work unit or idle.
@@ -1161,10 +1172,7 @@ sub QueueDelayed {
     Debug(4, "Delayed path: ".$path);
     opendir(DIRHANDLE, $path);
 
-    use Apache::lonnet;
-    my @all_host_ids = &Apache::lonnet::machine_ids($RemoteHost);
-
-    my $host_id_re = '(?:'.join('|',@all_host_ids).')';
+    my $host_id_re = '(?:'.join('|',map {quotemeta($_)} (@all_host_ids)).')';
     my @alldelayed = grep(/\.$host_id_re$/, readdir(DIRHANDLE));
     closedir(DIRHANDLE);
     foreach my $dfname (sort(@alldelayed)) {
@@ -1202,12 +1210,13 @@ sub MakeLondConnection {
 					 &GetServerPort(),
 					 &GetHostId());
 
-    if($Connection eq undef) {	# Needs to be more robust later.
+    if($Connection eq undef) {	
 	Log("CRITICAL","Failed to make a connection with lond.");
 	$ConnectionRetriesLeft--;
 	return 0;		# Failure.
     }  else {
 
+	$LondConnecting = 1;	# Connection in progress.
 	# The connection needs to have writability 
 	# monitored in order to send the init sequence
 	# that starts the whole authentication/key
@@ -1232,13 +1241,13 @@ sub MakeLondConnection {
 	    &SetupTimer;	# Need to handle timeouts with connections...
 	}
 	$ConnectionCount++;
+	$Connection->SetClientData($ConnectionCount);
 	Debug(4, "Connection count = ".$ConnectionCount);
 	if($ConnectionCount == 1) { # First Connection:
 	    QueueDelayed;
 	}
 	Log("SUCESS", "Created connection ".$ConnectionCount
 	    ." to host ".GetServerHost());
-	$LondConnecting = 1;	# Connection in progress.
 	return 1;		# Return success.
     }
     
@@ -1344,8 +1353,7 @@ sub QueueTransaction {
     }
 }
 
-#-------------------------- Lonc UNIX socket handling ---------------------
-
+#-------------------------- Lonc UNIX socket handling -------------------
 =pod
 
 =head2 ClientRequest
@@ -1379,14 +1387,23 @@ sub ClientRequest {
     $data = $data.$thisread;	# Append new data.
     $watcher->data($data);
     if($data =~ /\n$/) {	# Request entirely read.
-	if($data eq "close_connection_exit\n") {
+	if ($data eq "close_connection_exit\n") {
 	    Log("CRITICAL",
 		"Request Close Connection ... exiting");
 	    CloseAllLondConnections();
 	    exit;
+	} elsif ($data eq "reset_retries\n") {
+	    Log("INFO", "Resetting Connection Retries.");
+	    $ConnectionRetriesLeft = $ConnectionRetries;
+	    &UpdateStatus();
+	    my $Transaction = LondTransaction->new($data);
+	    $Transaction->SetClient($socket);
+	    StartClientReply($Transaction, "ok\n");
+	    $watcher->cancel();
+	    return;
 	}
 	Debug(8, "Complete transaction received: ".$data);
-	if($LogTransactions) {
+	if ($LogTransactions) {
 	    Log("SUCCESS", "Transaction: '$data'"); # Transaction has \n.
 	}
 	my $Transaction = LondTransaction->new($data);
@@ -1717,6 +1734,13 @@ sub ChildProcess {
 		  cb       => \&ToggleDebug,
 		  data     => "INT");
 
+    # Block the pipe signal we'll get when the socket disconnects.  We detect 
+    # socket disconnection via send/receive failures. On disconnect, the
+    # socket becomes readable .. which will force the disconnect detection.
+
+    my $set = POSIX::SigSet->new(SIGPIPE);
+    sigprocmask(SIG_BLOCK, $set);
+
     #  Figure out if we got passed a socket or need to open one to listen for
     #  client requests.
 
@@ -1760,13 +1784,14 @@ sub CreateChild {
     my $sigset = POSIX::SigSet->new(SIGINT);
     sigprocmask(SIG_BLOCK, $sigset);
     $RemoteHost = $host;
+    ShowStatus('Parent keeping the flock'); # Update time in status message.
     Log("CRITICAL", "Forking server for ".$host);
     my $pid          = fork;
     if($pid) {			# Parent
 	$RemoteHost = "Parent";
-	$ChildHash{$pid} = $host;
+	$ChildPid{$pid} = $host;
 	sigprocmask(SIG_UNBLOCK, $sigset);
-
+	undef(@all_host_ids);
     } else {			# child.
 	$RemoteHostId = $hostid;
 	ShowStatus("Connected to ".$RemoteHost);
@@ -1809,37 +1834,41 @@ sub parent_client_connection {
 }
 
 sub get_remote_hostname {
-	my ($event)   = @_;
-	my $watcher   = $event->w;
-	my $socket    = $watcher->fd;
-	
-	my $thisread;
-	my $rv = $socket->recv($thisread, POSIX::BUFSIZ, 0);
-	Debug(8, "rcv:  data length = ".length($thisread)." read =".$thisread);
-	if (!defined($rv) || length($thisread) == 0) {
-	    # Likely eof on socket.
-	    Debug(5,"Client Socket closed on lonc for p_c_c");
-	    close($socket);
-	    $watcher->cancel();
-	    return;
-	}
+    my ($event)   = @_;
+    my $watcher   = $event->w;
+    my $socket    = $watcher->fd;
 
-	my $data    = $watcher->data().$thisread;
-	$watcher->data($data);
-	if($data =~ /\n$/) {	# Request entirely read.
-	    chomp($data);
-	} else {
-	    return;
-	}
-	
-	&Debug(5,"Creating child for $data (parent_client_connection)");
-	my ($hostname,$lonid) = split(':',$data,2);
+    my $thisread;
+    my $rv = $socket->recv($thisread, POSIX::BUFSIZ, 0);
+    Debug(8, "rcv:  data length = ".length($thisread)." read =".$thisread);
+    if (!defined($rv) || length($thisread) == 0) {
+	# Likely eof on socket.
+	Debug(5,"Client Socket closed on lonc for p_c_c");
+	close($socket);
+	$watcher->cancel();
+	return;
+    }
+
+    my $data    = $watcher->data().$thisread;
+    $watcher->data($data);
+    if($data =~ /\n$/) {	# Request entirely read.
+	chomp($data);
+    } else {
+	return;
+    }
+
+    &Debug(5,"Creating child for $data (parent_client_connection)");
+    (my $hostname,my $lonid,@all_host_ids) = split(':',$data);
+    $ChildHost{$hostname}++;
+    if ($ChildHost{$hostname} == 1) {
 	&CreateChild($hostname,$lonid);
-	
-	# Clean up the listen since now the child takes over until it exits.
-	$watcher->cancel();		# Nolonger listening to this event
-	$socket->send("done\n");
-	$socket->close();
+    } else {
+	&Log('WARNING',"Request for a second child on $hostname");
+    }
+    # Clean up the listen since now the child takes over until it exits.
+    $watcher->cancel();		# Nolonger listening to this event
+    $socket->send("done\n");
+    $socket->close();
 }
 
 # parent_listen:
@@ -1890,7 +1919,7 @@ sub parent_listen {
 
 sub parent_clean_up {
     my ($loncapa_host) = @_;
-    Debug(5, "parent_clean_up: $loncapa_host");
+    Debug(1, "parent_clean_up: $loncapa_host");
 
     my $socket_file = &GetLoncSocketPath($loncapa_host);
     unlink($socket_file);	# No problem if it doesn't exist yet [startup e.g.]
@@ -1899,14 +1928,12 @@ sub parent_clean_up {
 }
 
 
-# listen_on_all_unix_sockets:
-#    This sub initiates a listen on all unix domain lonc client sockets.
-#    This will be called in the case where we are trimming idle processes.
-#    When idle processes are trimmed, loncnew starts up with no children,
-#    and only spawns off children when a connection request occurs on the
-#    client unix socket.  The spawned child continues to run until it has
-#    been idle a while at which point it eventually exits and once more
-#    the parent picks up the listen.
+
+#    This sub initiates a listen on the common unix domain lonc client socket.
+#    loncnew starts up with no children, and only spawns off children when a
+#    connection request occurs on the common client unix socket.  The spawned
+#    child continues to run until it has been idle a while at which point it
+#    eventually exits and once more the parent picks up the listen.
 #
 #  Parameters:
 #      NONE
@@ -1915,18 +1942,6 @@ sub parent_clean_up {
 #  Returns:
 #     NONE
 #
-sub listen_on_all_unix_sockets {
-    Debug(5, "listen_on_all_unix_sockets");
-    my $host_iterator      =   &LondConnection::GetHostIterator();
-    while (!$host_iterator->end()) {
-	my $host_entry_ref =   $host_iterator->get();
-	my $host_name      = $host_entry_ref->[3];
-	Debug(9, "Listen for $host_name");
-	&parent_listen($host_name);
-	$host_iterator->next();
-    }
-}
-
 sub listen_on_common_socket {
     Debug(5, "listen_on_common_socket");
     &parent_listen();
@@ -1951,10 +1966,11 @@ sub server_died {
 	}
 	# need the host to restart:
 
-	my $host = $ChildHash{$pid};
+	my $host = $ChildPid{$pid};
 	if($host) {		# It's for real...
 	    &Debug(9, "Caught sigchild for $host");
-	    delete($ChildHash{$pid});
+	    delete($ChildPid{$pid});
+	    delete($ChildHost{$host});
 	    &parent_clean_up($host);
 
 	} else {
@@ -2078,7 +2094,7 @@ sub CheckKids {
     foreach my $host (keys %parent_dispatchers) {
 	print $fh "LONC Parent process listening for $host\n";
     }
-    foreach my $pid (keys %ChildHash) {
+    foreach my $pid (keys %ChildPid) {
 	Debug(2, "Sending USR1 -> $pid");
 	kill 'USR1' => $pid;	# Tell Child to report status.
     }
@@ -2155,13 +2171,19 @@ SIGHUP.  Responds to sigint and sigterm.
 
 sub KillThemAll {
     Debug(2, "Kill them all!!");
-    local($SIG{CHLD}) = 'IGNORE';      # Our children >will< die.
-    foreach my $pid (keys %ChildHash) {
-	my $serving = $ChildHash{$pid};
+    
+    #local($SIG{CHLD}) = 'IGNORE';
+    # Our children >will< die.
+    # but we need to catch their death and cleanup after them in case this is 
+    # a restart set of kills
+    my @allpids = keys(%ChildPid);
+    foreach my $pid (@allpids) {
+	my $serving = $ChildPid{$pid};
 	ShowStatus("Nicely Killing lonc for $serving pid = $pid");
 	Log("CRITICAL", "Nicely Killing lonc for $serving pid = $pid");
 	kill 'QUIT' => $pid;
     }
+    ShowStatus("Finished killing child processes off.");
 }
 
 
@@ -2173,12 +2195,12 @@ sub really_kill_them_all_dammit
 {
     Debug(2, "Kill them all Dammit");
     local($SIG{CHLD} = 'IGNORE'); # In case some purist reenabled them.
-    foreach my $pid (keys %ChildHash) {
-	my $serving = $ChildHash{$pid};
+    foreach my $pid (keys %ChildPid) {
+	my $serving = $ChildPid{$pid};
 	&ShowStatus("Nastily killing lonc for $serving pid = $pid");
 	Log("CRITICAL", "Nastily killing lonc for $serving pid = $pid");
 	kill 'KILL' => $pid;
-	delete($ChildHash{$pid});
+	delete($ChildPid{$pid});
 	my $execdir = $perlvar{'lonDaemons'};
 	unlink("$execdir/logs/lonc.pid");
     }
@@ -2255,3 +2277,183 @@ A hash of lond connections that have no
 can be closed if they are idle for a long enough time.
 
 =cut
+
+=pod
+
+=head1 Log messages
+
+The following is a list of log messages that can appear in the 
+lonc.log file.  Each log file has a severity and a message.
+
+=over 2
+
+=item Warning  A socket timeout was detected
+
+If there are pending transactions in the socket's queue,
+they are failed (saved if critical).  If the connection
+retry count gets exceeded by this, the
+remote host is marked as dead.
+Called when timeouts occured during the connection and
+connection dialog with a remote host.
+
+=item Critical Host makred DEAD <hostname>   
+
+The numer of retry counts for contacting a host was
+exceeded. The host is marked dead an no 
+further attempts will be made by that child.
+
+=item Info lonc pipe client hung up on us     
+
+Write to the client pipe indicated no data transferred
+Socket to remote host is shut down.  Reply to the client 
+is discarded.  Note: This is commented out in &ClientWriteable
+
+=item Success  Reply from lond: <data>   
+
+Can be enabled for debugging by setting LogTransactions to nonzero.
+Indicates a successful transaction with lond, <data> is the data received
+from the remote lond.
+
+=item Success A delayed transaction was completed  
+
+A transaction that must be reliable was executed and completed
+as lonc restarted.  This is followed by a mesage of the form
+
+  S: client-name : request
+
+=item WARNING  Failing transaction <cmd>:<subcmd>  
+
+Transaction failed on a socket, but the failure retry count for the remote
+node has not yet been exhausted (the node is not yet marked dead).
+cmd is the command, subcmd is the subcommand.  This results from a con_lost
+when communicating with lond.
+
+=item WARNING Shutting down a socket     
+
+Called when a socket is being closed to lond.  This is emitted both when 
+idle pruning is being done and when the socket has been disconnected by the remote.
+
+=item WARNING Lond connection lost.
+
+Called when a read from lond's socket failed indicating lond has closed the 
+connection or died.  This should be followed by one or more
+
+ "WARNING Failing transaction..." msgs for each in-flight or queued transaction.
+
+=item INFO Connected to lond version:  <version> 
+
+When connection negotiation is complete, the lond version is requested and logged here.
+
+=item SUCCESS Connection n to host now ready for action
+
+Emitted when connection has been completed with lond. n is then number of 
+concurrent connections and host, the host to which the connection has just
+been established.
+
+=item WARNING Connection to host has been disconnected
+
+Write to a lond resulted in failure status.  Connection to lond is dropped.
+
+=item SUCCESS Created connection n to host host 
+
+Initial connection request to host..(before negotiation).
+
+=item CRITICAL Request Close Connection ... exiting
+
+Client has sent "close_connection_exit"   The loncnew server is exiting.
+
+=item INFO Resetting Connection Retries 
+
+Client has sent "reset_retries" The lond connection retries are reset to zero for the
+corresponding lond.
+
+=item SUCCESS Transaction <data>
+
+Only emitted if the global variable $LogTransactions was set to true.
+A client has requested a lond transaction <data> is the contents of the request.
+
+=item SUCCESS Toggled transaction logging <LogTransactions>
+                                    
+The state of the $LogTransactions global has been toggled, and its current value
+(after being toggled) is displayed.  When non zero additional logging of transactions
+is enabled for debugging purposes.  Transaction logging is toggled on receipt of a USR2
+signal.
+
+=item CRITICAL Abnormal exit. Child <pid> for <host> died thorugh signal.
+
+QUIT signal received.  lonc child process is exiting.
+
+=item SUCCESS New debugging level for <RemoteHost> now <DebugLevel>
+                                    
+Debugging toggled for the host loncnew is talking with.
+Currently debugging is a level based scheme with higher number 
+conveying more information.  The daemon starts out at
+DebugLevel 0 and can toggle back and forth between that and
+DebugLevel 2  These are controlled by
+the global variables $DebugLevel and $NextDebugLevel
+The debug level can go up to 9.
+SIGINT toggles the debug level.  The higher the debug level the 
+more debugging information is spewed.  See the Debug
+sub in loncnew.
+
+=item CRITICAL Forking server for host  
+
+A child is being created to service requests for the specified host.
+
+
+=item WARNING Request for a second child on hostname
+                                    
+Somehow loncnew was asked to start a second child on a host that already had a child
+servicing it.  This request is not honored, but themessage is emitted.  This could happen
+due to a race condition.  When a client attempts to contact loncnew for a new host, a child
+is forked off to handle the requests for that server.  The parent then backs off the Unix
+domain socket leaving it for the child to service all requests.  If in the time between
+creating the child, and backing off, a new connection request comes in to the unix domain
+socket, this could trigger (unlikely but remotely possible),.
+
+=item CRITICAL ------ Starting Children ----
+
+This message should probably be changed to "Entering event loop"  as the loncnew only starts
+children as needed.  This message is emitted as new events are established and
+the event processing loop is entered.
+
+=item INFO Updating connections via SIGUSR2
+                                    
+SIGUSR2 received. The original code would kill all clients, re-read the host file,
+then restart children for each host.  Now that childrean aree started on demand, this
+just kills all child processes and lets requests start them as needed again.
+
+
+=item CRITICAL Restarting
+
+SigHUP received.  all the children are killed and the script exec's itself to start again.
+
+=item CRITICAL Nicely killing lonc for host pid = <pid>
+
+Attempting to kill the child that is serving the specified host (pid given) cleanly via
+SIGQUIT  The child should handle that, clean up nicely and exit.
+
+=item CRITICAL Nastily killing lonc for host pid = <pid>
+
+The child specified did not die when requested via SIGQUIT.  Therefore it is killed
+via SIGKILL.
+
+=item CRITICAL Asked to kill children.. first be nice..
+
+In the parent's INT handler.  INT kills the child processes.  This inidicate loncnew
+is about to attempt to kill all known children via SIGQUIT.  This message should be followed 
+by one "Nicely killing" message for each extant child.
+
+=item CRITICAL Now kill children nasty 
+
+In the parent's INT handler. remaining children are about to be killed via
+SIGKILL. Should be followed by a Nastily killing... for each lonc child that 
+refused to die.
+
+=item CRITICAL Master process exiting
+
+In the parent's INT handler. just prior to the exit 0 call.
+
+=back
+
+=cut