--- loncom/loncnew	2003/08/19 09:31:46	1.19
+++ loncom/loncnew	2003/10/07 11:23:03	1.27
@@ -2,7 +2,7 @@
 # The LearningOnline Network with CAPA
 # lonc maintains the connections to remote computers
 #
-# $Id: loncnew,v 1.19 2003/08/19 09:31:46 foxr Exp $
+# $Id: loncnew,v 1.27 2003/10/07 11:23:03 foxr Exp $
 #
 # Copyright Michigan State University Board of Trustees
 #
@@ -45,6 +45,42 @@
 
 # Change log:
 #    $Log: loncnew,v $
+#    Revision 1.27  2003/10/07 11:23:03  foxr
+#    Installed and tested code to process reinit in parent server.
+#
+#    Revision 1.26  2003/09/30 11:11:17  foxr
+#    Add book-keeping hashes to support the re-init procedure.
+#
+#    Revision 1.25  2003/09/23 11:22:14  foxr
+#    Tested ability to receive sigusr2  This is now logged and must be
+#    properly implemented as a re-read of hosts and re-init of appropriate
+#    children.
+#
+#    Revision 1.24  2003/09/16 09:46:42  foxr
+#    Added skeletal infrastructure to support SIGUSR2 update hosts request.
+#
+#    Revision 1.23  2003/09/15 09:24:49  foxr
+#    Add use strict and fix all the fallout from that.
+#
+#    Revision 1.22  2003/09/02 10:34:47  foxr
+#    - Fix errors in host dead detection logic (too many cases where the
+#      retries left were not getting incremented or just not checked).
+#    - Added some additional status to the ps axuww display:
+#      o Remaining retries on a host.
+#      o >>> DEAD <<< indicator if I've given up on a host.
+#    - Tested the SIGHUP will reset the retries remaining count (thanks to
+#      the above status stuff, and get allow the loncnew to re-try again
+#      on the host (thanks to the log).
+#
+#    Revision 1.21  2003/08/26 09:19:51  foxr
+#    How embarrassing... put in the SocketTimeout function in loncnew and forgot
+#    to actually hook it into the LondTransaction.  Added this to MakeLondConnection
+#    where it belongs... hopefully transactions (not just connection attempts) will
+#    timeout more speedily than the socket errors will catch it.
+#
+#    Revision 1.20  2003/08/25 18:48:11  albertel
+#    - fixing a forgotten ;
+#
 #    Revision 1.19  2003/08/19 09:31:46  foxr
 #    Get socket directory from configuration rather than the old hard coded test
 #    way that I forgot to un-hard code.
@@ -81,7 +117,7 @@
 #    Revision 1.10  2003/06/24 02:46:04  foxr
 #    Put a limit on  the number of times we'll retry a connection.
 #    Start getting the signal stuff put in as well...note that need to get signals
-#    going or else 6the client will permanently give up on dead servers.
+#    going or else the client will permanently give up on dead servers.
 #
 #    Revision 1.9  2003/06/13 02:38:43  foxr
 #    Add logging in 'expected format'
@@ -95,7 +131,7 @@
 #    complete coding to support deferred transactions.
 #
 #
-
+use strict;
 use lib "/home/httpd/lib/perl/";
 use lib "/home/foxr/newloncapa/types";
 use Event qw(:DEFAULT );
@@ -119,12 +155,6 @@ use LONCAPA::HashIterator;
 #
 #   Disable all signals we might receive from outside for now.
 #
-#$SIG{QUIT}  = IGNORE;
-#$SIG{HUP}   = IGNORE;
-#$SIG{USR1}  = IGNORE;
-#$SIG{INT}   = IGNORE;
-#$SIG{CHLD}  = IGNORE;
-#$SIG{__DIE__}  = IGNORE;
 
 
 # Read the httpd configuration file to get perl variables
@@ -137,6 +167,8 @@ my %perlvar    = %{$perlvarref};
 #  parent and shared variables.
 
 my %ChildHash;			# by pid -> host.
+my %HostToPid;			# By host -> pid.
+my %HostHash;			# by loncapaname -> IP.
 
 
 my $MaxConnectionCount = 10;	# Will get from config later.
@@ -150,7 +182,7 @@ my $IdleTimeout= 3600;		# Wait an hour b
 #  The variables below are only used by the child processes.
 #
 my $RemoteHost;			# Name of host child is talking to.
-my $UnixSocketDir= $perlvar{'lonSockDir'}
+my $UnixSocketDir= $perlvar{'lonSockDir'};
 my $IdleConnections = Stack->new(); # Set of idle connections
 my %ActiveConnections;		# Connections to the remote lond.
 my %ActiveTransactions;		# LondTransactions in flight.
@@ -259,7 +291,7 @@ sub GetPeername {
     my $peerip;
     if($AdrFamily == AF_INET) {
 	($peerport, $peerip) = sockaddr_in($peer);
-	my $peername    = gethostbyaddr($iaddr, $AdrFamily);
+	my $peername    = gethostbyaddr($peerip, $AdrFamily);
 	return $peername;
     } elsif ($AdrFamily == AF_UNIX) {
 	my $peerfile;
@@ -280,7 +312,7 @@ sub Debug {
     my $level   = shift;
     my $message = shift;
     if ($level <= $DebugLevel) {
-	Log("INFO", "-Debug- $message host = $RemotHost");
+	Log("INFO", "-Debug- $message host = $RemoteHost");
     }
 }
 
@@ -320,7 +352,9 @@ sub ShowStatus {
 sub SocketTimeout {
     my $Socket = shift;
     
-    KillSocket($Socket);
+    KillSocket($Socket);	# A transaction timeout also counts as
+                                # a connection failure:
+    $ConnectionRetriesLeft--;
 }
 
 =pod
@@ -334,8 +368,12 @@ Invoked  each timer tick.
 
 sub Tick {
     my $client;
-    ShowStatus(GetServerHost()." Connection count: ".$ConnectionCount);
-
+    if($ConnectionRetriesLeft > 0) {
+	ShowStatus(GetServerHost()." Connection count: ".$ConnectionCount
+		   ." Retries remaining: ".$ConnectionRetriesLeft);
+    } else {
+	ShowStatus(GetServerHost()." >> DEAD <<");
+    }
     # Is it time to prune connection count:
 
 
@@ -343,7 +381,7 @@ sub Tick {
        ($WorkQueue->Count() == 0)) { # Idle connections and nothing to do?
 	$IdleSeconds++;
 	if($IdleSeconds > $IdleTimeout) { # Prune a connection...
-	    $Socket = $IdleConnections->pop();
+	    my $Socket = $IdleConnections->pop();
 	    KillSocket($Socket);
 	}
     } else {
@@ -352,7 +390,7 @@ sub Tick {
     #
     #  For each inflight transaction, tick down its timeout counter.
     #
-    foreach $item (keys %ActiveTransactions) {
+    foreach my $item (keys %ActiveTransactions) {
 	my $Socket = $ActiveTransactions{$item}->getServer();
 	$Socket->Tick();
     }
@@ -366,10 +404,16 @@ sub Tick {
 	    my $Connections = ($Requests <= $MaxConnectionCount) ?
 		$Requests : $MaxConnectionCount;
 	    Debug(1,"Work but no connections, start ".$Connections." of them");
-	    for ($i =0; $i < $Connections; $i++) {
-		MakeLondConnection();
+	    my $successCount = 0;
+	    for (my $i =0; $i < $Connections; $i++) {
+		$successCount += MakeLondConnection();
+	    }
+	    if($successCount == 0) { # All connections failed:
+		Debug(1,"Work in queue failed to make any connectiouns\n");
+		EmptyQueue();	# Fail pending transactions with con_lost.
 	    }
 	} else {
+	    ShowStatus(GetServerHost()." >>> DEAD!!! <<<");
 	    Debug(1,"Work in queue, but gave up on connections..flushing\n");
 	    EmptyQueue();	# Connections can't be established.
 	}
@@ -421,7 +465,7 @@ sub ServerToIdle {
 
     #  If there's work to do, start the transaction:
 
-    $reqdata = $WorkQueue->dequeue(); # This is a LondTransaction
+    my $reqdata = $WorkQueue->dequeue(); # This is a LondTransaction
     unless($reqdata eq undef)  {
 	Debug(9, "Queue gave request data: ".$reqdata->getRequest());
 	&StartRequest($Socket,  $reqdata);
@@ -555,7 +599,7 @@ sub CompleteTransaction {
 	StartClientReply($Transaction, $data);
     } else {			# Delete deferred transaction file.
 	Log("SUCCESS", "A delayed transaction was completed");
-	LogPerm("S:$Client:".$Transaction->getRequest());
+	LogPerm("S:$Transaction->getClient() :".$Transaction->getRequest());
 	unlink $Transaction->getFile();
     }
 }
@@ -623,6 +667,9 @@ sub FailTransaction {
 	Debug(1," Replying con_lost to ".$transaction->getRequest());
 	StartClientReply($transaction, "con_lost\n");
     }
+    if($ConnectionRetriesLeft <= 0) {
+	Log("CRITICAL", "Host marked dead: ".GetServerHost());
+    }
 
 }
 
@@ -634,6 +681,7 @@ sub FailTransaction {
 
 =cut
 sub EmptyQueue {
+    $ConnectionRetriesLeft--;	# Counts as connection failure too.
     while($WorkQueue->Count()) {
 	my $request = $WorkQueue->dequeue(); # This is a transaction
 	FailTransaction($request);
@@ -648,7 +696,7 @@ Close all connections open on lond prior
 
 =cut
 sub CloseAllLondConnections {
-    foreach $Socket (keys %ActiveConnections) {
+    foreach my $Socket (keys %ActiveConnections) {
 	KillSocket($Socket);
     }
 }
@@ -700,7 +748,7 @@ sub KillSocket {
     #  work queue, the work all gets failed with con_lost.
     #
     if($ConnectionCount == 0) {
-	EmptyQueue;
+	EmptyQueue();
     }
 }
 
@@ -769,7 +817,7 @@ sub LondReadable {
     my $Socket     = $Watcher->data;
     my $client     = undef;
 
-    &Debug(6,"LondReadable called state = ".$State);
+    &Debug(6,"LondReadable called state = ".$Socket->GetState());
 
 
     my $State = $Socket->GetState(); # All action depends on the state.
@@ -790,6 +838,7 @@ sub LondReadable {
 	}
 	$Watcher->cancel();
 	KillSocket($Socket);
+	$ConnectionRetriesLeft--;       # Counts as connection failure
 	return;
     }
     SocketDump(6,$Socket);
@@ -823,6 +872,10 @@ sub LondReadable {
     } elsif ($State eq "Idle") {
 	# If necessary, complete a transaction and then go into the
 	# idle queue.
+	#  Note that a transition to idle indicates a live lond
+	# on the other end so reset the connection retries.
+	#
+	$ConnectionRetriesLeft = $ConnectionRetries; # success resets the count
 	$Watcher->cancel();
 	if(exists($ActiveTransactions{$Socket})) {
 	    Debug(8,"Completing transaction!!");
@@ -1035,8 +1088,7 @@ sub QueueDelayed {
     Debug(4, "Delayed path: ".$path);
     opendir(DIRHANDLE, $path);
     
-    @alldelayed = grep /\.$RemoteHost$/, readdir DIRHANDLE;
-    Debug(4, "Got ".$alldelayed." delayed files");
+    my @alldelayed = grep /\.$RemoteHost$/, readdir DIRHANDLE;
     closedir(DIRHANDLE);
     my $dfname;
     my $reqfile;
@@ -1078,7 +1130,7 @@ sub MakeLondConnection {
 	$ConnectionRetriesLeft--;
 	return 0;		# Failure.
     }  else {
-	$ConnectionRetriesLeft = $ConnectionRetries; # success resets the count
+
 	# The connection needs to have writability 
 	# monitored in order to send the init sequence
 	# that starts the whole authentication/key
@@ -1091,8 +1143,9 @@ sub MakeLondConnection {
 	    &Debug(9,"MakeLondConnection got socket: ".$Socket);
 	}
 	
-	
-	$event = Event->io(fd       => $Socket,
+	$Connection->SetTimeoutCallback(\&SocketTimeout);
+
+	my $event = Event->io(fd       => $Socket,
 			   poll     => 'w',
 			   cb       => \&LondWritable,
 			   data     => $Connection,
@@ -1148,7 +1201,7 @@ sub StartRequest {
     $ActiveTransactions{$Lond} = $Request;
 
     $Lond->InitiateTransaction($Request->getRequest());
-    $event = Event->io(fd      => $Socket,
+    my $event = Event->io(fd      => $Socket,
 		       poll    => "w",
 		       cb      => \&LondWritable,
 		       data    => $Lond,
@@ -1190,9 +1243,14 @@ sub QueueTransaction {
 	Debug(8,"Must queue...");
 	$WorkQueue->enqueue($requestData);
 	if($ConnectionCount < $MaxConnectionCount) {
-	    Debug(4,"Starting additional lond connection");
-	    if(MakeLondConnection() == 0) {
-		EmptyQueue();	# Fail transactions, can't make connection.
+	    if($ConnectionRetriesLeft > 0) {
+		Debug(4,"Starting additional lond connection");
+		if(MakeLondConnection() == 0) {
+		    EmptyQueue();	# Fail transactions, can't make connection.
+		}
+	    } else {
+		ShowStatus(GetServerHost()." >>> DEAD !!!! <<<");
+		EmptyQueue();	# It's worse than that ... he's dead Jim.
 	    }
 	}
     } else {			# Can start the request:
@@ -1358,6 +1416,8 @@ sub SetupLoncListener {
 Child USR1 signal handler to report the most recent status
 into the status file.
 
+We also use this to reset the retries count in order to allow the
+client to retry connections with a previously dead server.
 =cut
 sub ChildStatus {
     my $event = shift;
@@ -1368,6 +1428,7 @@ sub ChildStatus {
     my $fh = IO::File->new(">>$docdir/lon-status/loncstatus.txt");
     print $fh $$."\t".$RemoteHost."\t".$Status."\t".
 	$RecentLogEntry."\n";
+    $ConnectionRetriesLeft = $ConnectionRetries;
 }
 
 =pod
@@ -1463,15 +1524,16 @@ sub CreateChild {
     my $host = shift;
     $RemoteHost = $host;
     Log("CRITICAL", "Forking server for ".$host);
-    $pid          = fork;
+    my $pid          = fork;
     if($pid) {			# Parent
 	$RemoteHost = "Parent";
-	$ChildHash{$pid} = $RemoteHost;
+	$ChildHash{$pid} = $host;
+	$HostToPid{$host}= $pid;
 	sigprocmask(SIG_UNBLOCK, $sigset);
 
     } else {			# child.
 	ShowStatus("Connected to ".$RemoteHost);
-	$SIG{INT} = DEFAULT;
+	$SIG{INT} = 'DEFAULT';
 	sigprocmask(SIG_UNBLOCK, $sigset);
 	ChildProcess;		# Does not return.
     }
@@ -1508,7 +1570,7 @@ if ($childpid != 0) {
 #
 
 ShowStatus("Parent writing pid file:");
-$execdir = $perlvar{'lonDaemons'};
+my $execdir = $perlvar{'lonDaemons'};
 open (PIDSAVE, ">$execdir/logs/lonc.pid");
 print PIDSAVE "$$\n";
 close(PIDSAVE);
@@ -1527,8 +1589,9 @@ Log("CRITICAL", "--------------- Startin
 my $HostIterator = LondConnection::GetHostIterator;
 while (! $HostIterator->end()) {
 
-    $hostentryref = $HostIterator->get();
+    my $hostentryref = $HostIterator->get();
     CreateChild($hostentryref->[0]);
+    $HostHash{$hostentryref->[0]} = $hostentryref->[4];
     $HostIterator->next();
 }
 $RemoteHost = "Parent Server";
@@ -1545,11 +1608,13 @@ $SIG{INT}  = \&Terminate;
 $SIG{TERM} = \&Terminate; 
 $SIG{HUP}  = \&Restart;
 $SIG{USR1} = \&CheckKids; 
+$SIG{USR2} = \&UpdateKids;	# LonManage update request.
 
 while(1) {
-    $deadchild = wait();
+    my $deadchild = wait();
     if(exists $ChildHash{$deadchild}) {	# need to restart.
-	$deadhost = $ChildHash{$deadchild};
+	my $deadhost = $ChildHash{$deadchild};
+	delete($HostToPid{$deadhost});
 	delete($ChildHash{$deadchild});
 	Log("WARNING","Lost child pid= ".$deadchild.
 	      "Connected to host ".$deadhost);
@@ -1579,7 +1644,7 @@ sub CheckKids {
     my $now=time;
     my $local=localtime($now);
     print $fh "LONC status $local - parent $$ \n\n";
-    foreach $pid (keys %ChildHash) {
+    foreach my $pid (keys %ChildHash) {
 	Debug(2, "Sending USR1 -> $pid");
 	kill 'USR1' => $pid;	# Tell Child to report status.
 	sleep 1;		# Wait so file doesn't intermix.
@@ -1588,6 +1653,114 @@ sub CheckKids {
 
 =pod
 
+=head1  UpdateKids
+
+parent's SIGUSR2 handler.  This handler:
+
+=item
+
+Rereads the hosts file.
+
+=item
+ 
+Kills off (via SIGQUIT) children for hosts that have disappeared.
+
+=item
+
+Sends SIGHUP to children for hosts that already exist (this is meant to force a
+status display and reset the connection retry count for that host).
+
+=item
+
+Starts new children for hosts that have been added to the hosts.tab file since
+the start of the master program and maintains them.
+
+=cut
+
+sub UpdateKids {
+    #  Parent's SIGUSR2 handler: re-reads the host table and reconciles
+    #  the running children with it (see the three loops below).
+
+    Log("INFO", "Updating connections via SIGUSR2");
+
+    #  Just in case we need to kill our own lonc, we wait a few seconds to
+    #  give it a chance to receive and relay lond's response to the
+    #  re-init command.
+
+    sleep(2);			# Wait a couple of seconds.
+
+    my %hosts;                   # Indexed by loncapa hostname, value=ip.
+
+    # Need to re-read the host table:
+
+    LondConnection::ReadConfig();
+    my $iterator = LondConnection::GetHostIterator;
+    while (! $iterator->end()) {
+	my $item = $iterator->get();
+	$hosts{$item->[0]} = $item->[4];
+	$iterator->next();
+    }
+
+    #  The logic below is written for clarity not for efficiency.
+    #  Since I anticipate that this function is only rarely called, that's
+    #  appropriate.  There are certainly ways to combine the loops below,
+    #  and anyone wishing to obscure the logic is welcome to go for it.
+    #  Note that we don't re-direct sigchild.  Instead we do what's needed
+    #  to the data structures that keep track of children to ensure that
+    #  when sigchild is honored, no new child is born.
+
+    #  For each existing child; if its host doesn't exist, kill the child.
+    #  The host is also dropped from %HostHash; otherwise a host that is
+    #  later put back in the table would never be treated as new again.
+
+    foreach my $child (keys %ChildHash) {
+	my $oldhost = $ChildHash{$child};
+	if (!(exists $hosts{$oldhost})) {
+	    Log("CRITICAL", "Killing child for $oldhost  host no longer exists");
+	    delete $ChildHash{$child};
+	    delete $HostToPid{$oldhost};
+	    delete $HostHash{$oldhost};
+	    kill 'QUIT' => $child;
+	}
+    }
+    # For each remaining existing child; if its host's ip has changed,
+    # Restart the child on the new IP.
+
+    foreach my $child (keys %ChildHash) {
+	my $oldhost = $ChildHash{$child};
+	my $oldip   = $HostHash{$oldhost};
+	if ($hosts{$oldhost} ne $oldip) {
+	    # kill the old child.
+	    Log("CRITICAL", "Killing child for $oldhost host ip has changed...");
+	    delete $ChildHash{$child};
+	    delete $HostToPid{$oldhost};
+	    kill 'QUIT' => $child;
+
+	    # Do the book-keeping needed to start a new child on the
+	    # new ip.
+
+	    $HostHash{$oldhost} = $hosts{$oldhost};
+	    CreateChild($oldhost);
+	}
+    }
+    # Finally, for each new host (one not in the host hash), enter the
+    # host and create a new child.  Signal (HUP) any existing process,
+    # but only when a pid is recorded for it (never signal an undef pid).
+
+    foreach my $host (keys %hosts) {
+	if(!(exists $HostHash{$host})) {
+	    Log("INFO", "New host $host discovered in hosts.tab...");
+	    $HostHash{$host} = $hosts{$host};
+	    CreateChild($host);
+	} elsif (defined $HostToPid{$host}) {
+	    kill 'HUP' => $HostToPid{$host};    # status display.
+	}
+    }
+}
+
+
+=pod
+
 =head1 Restart
 
 Signal handler for HUP... all children are killed and
@@ -1597,7 +1770,7 @@ the config file.
 =cut
 
 sub Restart {
-    KillThemAll;		# First kill all the children.
+    &KillThemAll;		# First kill all the children.
     Log("CRITICAL", "Restarting");
     my $execdir = $perlvar{'lonDaemons'};
     unlink("$execdir/logs/lonc.pid");
@@ -1616,7 +1789,7 @@ SIGHUP.  Responds to sigint and sigterm.
 sub KillThemAll {
     Debug(2, "Kill them all!!");
     local($SIG{CHLD}) = 'IGNORE';      # Our children >will< die.
-    foreach $pid (keys %ChildHash) {
+    foreach my $pid (keys %ChildHash) {
 	my $serving = $ChildHash{$pid};
 	Debug(2, "Killing lonc for $serving pid = $pid");
 	ShowStatus("Killing lonc for $serving pid = $pid");