--- loncom/loncnew	2003/08/26 09:19:51	1.21
+++ loncom/loncnew	2003/10/21 14:24:42	1.29
@@ -2,7 +2,7 @@
 # The LearningOnline Network with CAPA
 # lonc maintains the connections to remote computers
 #
-# $Id: loncnew,v 1.21 2003/08/26 09:19:51 foxr Exp $
+# $Id: loncnew,v 1.29 2003/10/21 14:24:42 foxr Exp $
 #
 # Copyright Michigan State University Board of Trustees
 #
@@ -45,6 +45,42 @@
 
 # Change log:
 #    $Log: loncnew,v $
+#    Revision 1.29  2003/10/21 14:24:42  foxr
+#    Fix little typo that may explain growth of connections
+#
+#    Revision 1.28  2003/10/14 15:36:21  albertel
+#    - making it easier to run loncnew,
+#       /etc/init.d/loncontrol startnew
+#       /etc/init.d/loncontrol restartnew
+#      will now start loncnew in place of lonc
+#
+#    Revision 1.27  2003/10/07 11:23:03  foxr
+#    Installed and tested code to process reinit in parent server.
+#
+#    Revision 1.26  2003/09/30 11:11:17  foxr
+#    Add book-keeping hashes to support the re-init procedure.
+#
+#    Revision 1.25  2003/09/23 11:22:14  foxr
+#    Tested ability to receive sigusr2  This is now logged and must be
+#    properly implemented as a re-read of hosts and re-init of appropriate
+#    children.
+#
+#    Revision 1.24  2003/09/16 09:46:42  foxr
+#    Added skeletal infrastructure to support SIGUSR2 update hosts request.
+#
+#    Revision 1.23  2003/09/15 09:24:49  foxr
+#    Add use strict and fix all the fallout from that.
+#
+#    Revision 1.22  2003/09/02 10:34:47  foxr
+#    - Fix errors in host dead detection logic (too many cases where the
+#      retries left were not getting incremented or just not checked).
+#    - Added some additional status to the ps axuww display:
+#      o Remaining retries on a host.
+#      o >>> DEAD <<< indicator if I've given up on a host.
+#    - Tested that SIGHUP will reset the retries remaining count (thanks to
+#      the above status stuff), and allow loncnew to re-try again
+#      on the host (thanks to the log).
+#
 #    Revision 1.21  2003/08/26 09:19:51  foxr
 #    How embarrassing... put in the SocketTimeout function in loncnew and forgot
 #    to actually hook it into the LondTransaction.  Added this to MakeLondConnection
@@ -90,7 +126,7 @@
 #    Revision 1.10  2003/06/24 02:46:04  foxr
 #    Put a limit on  the number of times we'll retry a connection.
 #    Start getting the signal stuff put in as well...note that need to get signals
-#    going or else 6the client will permanently give up on dead servers.
+#    going or else the client will permanently give up on dead servers.
 #
 #    Revision 1.9  2003/06/13 02:38:43  foxr
 #    Add logging in 'expected format'
@@ -104,7 +140,7 @@
 #    complete coding to support deferred transactions.
 #
 #
-
+use strict;
 use lib "/home/httpd/lib/perl/";
 use lib "/home/foxr/newloncapa/types";
 use Event qw(:DEFAULT );
@@ -128,12 +164,6 @@ use LONCAPA::HashIterator;
 #
 #   Disable all signals we might receive from outside for now.
 #
-#$SIG{QUIT}  = IGNORE;
-#$SIG{HUP}   = IGNORE;
-#$SIG{USR1}  = IGNORE;
-#$SIG{INT}   = IGNORE;
-#$SIG{CHLD}  = IGNORE;
-#$SIG{__DIE__}  = IGNORE;
 
 
 # Read the httpd configuration file to get perl variables
@@ -146,13 +176,15 @@ my %perlvar    = %{$perlvarref};
 #  parent and shared variables.
 
 my %ChildHash;			# by pid -> host.
+my %HostToPid;			# By host -> pid.
+my %HostHash;			# by loncapaname -> IP.
 
 
 my $MaxConnectionCount = 10;	# Will get from config later.
 my $ClientConnection = 0;	# Uniquifier for client events.
 
 my $DebugLevel = 0;
-my $NextDebugLevel= 10;		# So Sigint can toggle this.
+my $NextDebugLevel= 2;		# So Sigint can toggle this.
 my $IdleTimeout= 3600;		# Wait an hour before pruning connections.
 
 #
@@ -268,7 +300,7 @@ sub GetPeername {
     my $peerip;
     if($AdrFamily == AF_INET) {
 	($peerport, $peerip) = sockaddr_in($peer);
-	my $peername    = gethostbyaddr($iaddr, $AdrFamily);
+	my $peername    = gethostbyaddr($peerip, $AdrFamily);
 	return $peername;
     } elsif ($AdrFamily == AF_UNIX) {
 	my $peerfile;
@@ -289,7 +321,7 @@ sub Debug {
     my $level   = shift;
     my $message = shift;
     if ($level <= $DebugLevel) {
-	Log("INFO", "-Debug- $message host = $RemotHost");
+	Log("INFO", "-Debug- $message host = $RemoteHost");
     }
 }
 
@@ -329,7 +361,9 @@ sub ShowStatus {
 sub SocketTimeout {
     my $Socket = shift;
     
-    KillSocket($Socket);
+    KillSocket($Socket);	# A transaction timeout also counts as
+                                # a connection failure:
+    $ConnectionRetriesLeft--;
 }
 
 =pod
@@ -343,8 +377,12 @@ Invoked  each timer tick.
 
 sub Tick {
     my $client;
-    ShowStatus(GetServerHost()." Connection count: ".$ConnectionCount);
-
+    if($ConnectionRetriesLeft > 0) {
+	ShowStatus(GetServerHost()." Connection count: ".$ConnectionCount
+		   ." Retries remaining: ".$ConnectionRetriesLeft);
+    } else {
+	ShowStatus(GetServerHost()." >> DEAD <<");
+    }
     # Is it time to prune connection count:
 
 
@@ -352,7 +390,7 @@ sub Tick {
        ($WorkQueue->Count() == 0)) { # Idle connections and nothing to do?
 	$IdleSeconds++;
 	if($IdleSeconds > $IdleTimeout) { # Prune a connection...
-	    $Socket = $IdleConnections->pop();
+	    my $Socket = $IdleConnections->pop();
 	    KillSocket($Socket);
 	}
     } else {
@@ -361,7 +399,7 @@ sub Tick {
     #
     #  For each inflight transaction, tick down its timeout counter.
     #
-    foreach $item (keys %ActiveTransactions) {
+    foreach my $item (keys %ActiveTransactions) {
 	my $Socket = $ActiveTransactions{$item}->getServer();
 	$Socket->Tick();
     }
@@ -374,12 +412,18 @@ sub Tick {
 	if ($ConnectionRetriesLeft > 0) {
 	    my $Connections = ($Requests <= $MaxConnectionCount) ?
 		$Requests : $MaxConnectionCount;
-	    Debug(1,"Work but no connections, start ".$Connections." of them");
-	    for ($i =0; $i < $Connections; $i++) {
-		MakeLondConnection();
+	    Debug(5,"Work but no connections, start ".$Connections." of them");
+	    my $successCount = 0;
+	    for (my $i =0; $i < $Connections; $i++) {
+		$successCount += MakeLondConnection();
+	    }
+	    if($successCount == 0) { # All connections failed:
+		Debug(5,"Work in queue failed to make any connectiouns\n");
+		EmptyQueue();	# Fail pending transactions with con_lost.
 	    }
 	} else {
-	    Debug(1,"Work in queue, but gave up on connections..flushing\n");
+	    ShowStatus(GetServerHost()." >>> DEAD!!! <<<");
+	    Debug(5,"Work in queue, but gave up on connections..flushing\n");
 	    EmptyQueue();	# Connections can't be established.
 	}
        
@@ -426,19 +470,19 @@ sub ServerToIdle {
     my $Socket   = shift;	# Get the socket.
     delete($ActiveTransactions{$Socket}); # Server has no transaction
 
-    &Debug(6, "Server to idle");
+    &Debug(5, "Server to idle");
 
     #  If there's work to do, start the transaction:
 
-    $reqdata = $WorkQueue->dequeue(); # This is a LondTransaction
-    unless($reqdata eq undef)  {
-	Debug(9, "Queue gave request data: ".$reqdata->getRequest());
+    my $reqdata = $WorkQueue->dequeue(); # This is a LondTransaction
+    if (defined($reqdata))  {	# Test definedness; "ne undef" is only a string compare with "".
+	Debug(5, "Queue gave request data: ".$reqdata->getRequest());
 	&StartRequest($Socket,  $reqdata);
 
     } else {
 	
     #  There's no work waiting, so push the server to idle list.
-	&Debug(8, "No new work requests, server connection going idle");
+	&Debug(5, "No new work requests, server connection going idle");
 	$IdleConnections->push($Socket);
     }
 }
@@ -484,7 +528,7 @@ sub ClientWritable {
 	# request.
 	
 	&Debug(9,"Send result is ".$result." Defined: ".defined($result));
-	if(defined($result)) {
+	if(defined($result)) {	# Test definedness; "ne undef" is only a string compare with "".
 	    &Debug(9, "send result was defined");
 	    if($result == length($Data)) { # Entire string sent.
 		&Debug(9, "ClientWritable data all written");
@@ -555,7 +599,7 @@ The transaction that is being completed.
 =cut
 
 sub CompleteTransaction {
-    &Debug(6,"Complete transaction");
+    &Debug(5,"Complete transaction");
     my $Socket = shift;
     my $Transaction = shift;
 
@@ -564,7 +608,7 @@ sub CompleteTransaction {
 	StartClientReply($Transaction, $data);
     } else {			# Delete deferred transaction file.
 	Log("SUCCESS", "A delayed transaction was completed");
-	LogPerm("S:$Client:".$Transaction->getRequest());
+	LogPerm("S:".$Transaction->getClient().":".$Transaction->getRequest()); # Method calls do not interpolate inside strings.
 	unlink $Transaction->getFile();
     }
 }
@@ -594,7 +638,6 @@ sub StartClientReply {
     &Debug(8," Reply was: ".$data);
     my $Serial         = $ActiveClients{$Client};
     my $desc           = sprintf("Connection to lonc client %d",
-
 				 $Serial);
     Event->io(fd       => $Client,
 	      poll     => "w",
@@ -626,12 +669,15 @@ Parameters:
 sub FailTransaction {
     my $transaction = shift;
     Log("WARNING", "Failing transaction ".$transaction->getRequest());
-    Debug(1, "Failing transaction: ".$transaction->getRequest());
+    Debug(5, "Failing transaction: ".$transaction->getRequest());
     if (!$transaction->isDeferred()) { # If the transaction is deferred we'll get to it.
 	my $client  = $transaction->getClient();
-	Debug(1," Replying con_lost to ".$transaction->getRequest());
+	Debug(5," Replying con_lost to ".$transaction->getRequest());
 	StartClientReply($transaction, "con_lost\n");
     }
+    if($ConnectionRetriesLeft <= 0) {
+	Log("CRITICAL", "Host marked dead: ".GetServerHost());
+    }
 
 }
 
@@ -643,6 +689,7 @@ sub FailTransaction {
 
 =cut
 sub EmptyQueue {
+    $ConnectionRetriesLeft--;	# Counts as connection failure too.
     while($WorkQueue->Count()) {
 	my $request = $WorkQueue->dequeue(); # This is a transaction
 	FailTransaction($request);
@@ -657,7 +704,7 @@ Close all connections open on lond prior
 
 =cut
 sub CloseAllLondConnections {
-    foreach $Socket (keys %ActiveConnections) {
+    foreach my $Socket (keys %ActiveConnections) {
 	KillSocket($Socket);
     }
 }
@@ -709,7 +756,7 @@ sub KillSocket {
     #  work queue, the work all gets failed with con_lost.
     #
     if($ConnectionCount == 0) {
-	EmptyQueue;
+	EmptyQueue();
     }
 }
 
@@ -778,7 +825,7 @@ sub LondReadable {
     my $Socket     = $Watcher->data;
     my $client     = undef;
 
-    &Debug(6,"LondReadable called state = ".$State);
+    &Debug(6,"LondReadable called state = ".$Socket->GetState());
 
 
     my $State = $Socket->GetState(); # All action depends on the state.
@@ -799,6 +846,7 @@ sub LondReadable {
 	}
 	$Watcher->cancel();
 	KillSocket($Socket);
+	$ConnectionRetriesLeft--;       # Counts as connection failure
 	return;
     }
     SocketDump(6,$Socket);
@@ -832,9 +880,13 @@ sub LondReadable {
     } elsif ($State eq "Idle") {
 	# If necessary, complete a transaction and then go into the
 	# idle queue.
+	#  Note that a transition to idle indicates a live lond
+	# on the other end so reset the connection retries.
+	#
+	$ConnectionRetriesLeft = $ConnectionRetries; # success resets the count
 	$Watcher->cancel();
 	if(exists($ActiveTransactions{$Socket})) {
-	    Debug(8,"Completing transaction!!");
+	    Debug(5,"Completing transaction!!");
 	    CompleteTransaction($Socket, 
 				$ActiveTransactions{$Socket});
 	} else {
@@ -1044,8 +1096,7 @@ sub QueueDelayed {
     Debug(4, "Delayed path: ".$path);
     opendir(DIRHANDLE, $path);
     
-    @alldelayed = grep /\.$RemoteHost$/, readdir DIRHANDLE;
-    Debug(4, "Got ".$alldelayed." delayed files");
+    my @alldelayed = grep /\.$RemoteHost$/, readdir DIRHANDLE;
     closedir(DIRHANDLE);
     my $dfname;
     my $reqfile;
@@ -1087,7 +1138,7 @@ sub MakeLondConnection {
 	$ConnectionRetriesLeft--;
 	return 0;		# Failure.
     }  else {
-	$ConnectionRetriesLeft = $ConnectionRetries; # success resets the count
+
 	# The connection needs to have writability 
 	# monitored in order to send the init sequence
 	# that starts the whole authentication/key
@@ -1102,7 +1153,7 @@ sub MakeLondConnection {
 	
 	$Connection->SetTimeoutCallback(\&SocketTimeout);
 
-	$event = Event->io(fd       => $Socket,
+	my $event = Event->io(fd       => $Socket,
 			   poll     => 'w',
 			   cb       => \&LondWritable,
 			   data     => $Connection,
@@ -1158,7 +1209,7 @@ sub StartRequest {
     $ActiveTransactions{$Lond} = $Request;
 
     $Lond->InitiateTransaction($Request->getRequest());
-    $event = Event->io(fd      => $Socket,
+    my $event = Event->io(fd      => $Socket,
 		       poll    => "w",
 		       cb      => \&LondWritable,
 		       data    => $Lond,
@@ -1197,12 +1248,17 @@ sub QueueTransaction {
 
     my $LondSocket    = $IdleConnections->pop();
     if(!defined $LondSocket) {	# Need to queue request.
-	Debug(8,"Must queue...");
+	Debug(5,"Must queue...");
 	$WorkQueue->enqueue($requestData);
 	if($ConnectionCount < $MaxConnectionCount) {
-	    Debug(4,"Starting additional lond connection");
-	    if(MakeLondConnection() == 0) {
-		EmptyQueue();	# Fail transactions, can't make connection.
+	    if($ConnectionRetriesLeft > 0) {
+		Debug(5,"Starting additional lond connection");
+		if(MakeLondConnection() == 0) {
+		    EmptyQueue();	# Fail transactions, can't make connection.
+		}
+	    } else {
+		ShowStatus(GetServerHost()." >>> DEAD !!!! <<<");
+		EmptyQueue();	# It's worse than that ... he's dead Jim.
 	    }
 	}
     } else {			# Can start the request:
@@ -1234,7 +1290,7 @@ sub ClientRequest {
     my $rv = $socket->recv($thisread, POSIX::BUFSIZ, 0);
     Debug(8, "rcv:  data length = ".length($thisread)
 	  ." read =".$thisread);
-    unless (defined $rv && length($thisread)) {
+    unless (defined $rv  && length($thisread)) {
 	 # Likely eof on socket.
 	Debug(5,"Client Socket closed on lonc for ".$RemoteHost);
 	close($socket);
@@ -1368,6 +1424,8 @@ sub SetupLoncListener {
 Child USR1 signal handler to report the most recent status
 into the status file.
 
+We also use this to reset the retries count in order to allow the
+client to retry connections with a previously dead server.
 =cut
 sub ChildStatus {
     my $event = shift;
@@ -1378,6 +1436,7 @@ sub ChildStatus {
     my $fh = IO::File->new(">>$docdir/lon-status/loncstatus.txt");
     print $fh $$."\t".$RemoteHost."\t".$Status."\t".
 	$RecentLogEntry."\n";
+    $ConnectionRetriesLeft = $ConnectionRetries;
 }
 
 =pod
@@ -1473,15 +1532,16 @@ sub CreateChild {
     my $host = shift;
     $RemoteHost = $host;
     Log("CRITICAL", "Forking server for ".$host);
-    $pid          = fork;
+    my $pid          = fork;
     if($pid) {			# Parent
 	$RemoteHost = "Parent";
-	$ChildHash{$pid} = $RemoteHost;
+	$ChildHash{$pid} = $host;
+	$HostToPid{$host}= $pid;
 	sigprocmask(SIG_UNBLOCK, $sigset);
 
     } else {			# child.
 	ShowStatus("Connected to ".$RemoteHost);
-	$SIG{INT} = DEFAULT;
+	$SIG{INT} = 'DEFAULT';
 	sigprocmask(SIG_UNBLOCK, $sigset);
 	ChildProcess;		# Does not return.
     }
@@ -1518,7 +1578,7 @@ if ($childpid != 0) {
 #
 
 ShowStatus("Parent writing pid file:");
-$execdir = $perlvar{'lonDaemons'};
+my $execdir = $perlvar{'lonDaemons'};
 open (PIDSAVE, ">$execdir/logs/lonc.pid");
 print PIDSAVE "$$\n";
 close(PIDSAVE);
@@ -1537,8 +1597,9 @@ Log("CRITICAL", "--------------- Startin
 my $HostIterator = LondConnection::GetHostIterator;
 while (! $HostIterator->end()) {
 
-    $hostentryref = $HostIterator->get();
+    my $hostentryref = $HostIterator->get();
     CreateChild($hostentryref->[0]);
+    $HostHash{$hostentryref->[0]} = $hostentryref->[4];
     $HostIterator->next();
 }
 $RemoteHost = "Parent Server";
@@ -1555,11 +1616,13 @@ $SIG{INT}  = \&Terminate;
 $SIG{TERM} = \&Terminate; 
 $SIG{HUP}  = \&Restart;
 $SIG{USR1} = \&CheckKids; 
+$SIG{USR2} = \&UpdateKids;	# LonManage update request.
 
 while(1) {
-    $deadchild = wait();
+    my $deadchild = wait();
     if(exists $ChildHash{$deadchild}) {	# need to restart.
-	$deadhost = $ChildHash{$deadchild};
+	my $deadhost = $ChildHash{$deadchild};
+	delete($HostToPid{$deadhost});
 	delete($ChildHash{$deadchild});
 	Log("WARNING","Lost child pid= ".$deadchild.
 	      "Connected to host ".$deadhost);
@@ -1589,7 +1652,7 @@ sub CheckKids {
     my $now=time;
     my $local=localtime($now);
     print $fh "LONC status $local - parent $$ \n\n";
-    foreach $pid (keys %ChildHash) {
+    foreach my $pid (keys %ChildHash) {
 	Debug(2, "Sending USR1 -> $pid");
 	kill 'USR1' => $pid;	# Tell Child to report status.
 	sleep 1;		# Wait so file doesn't intermix.
@@ -1598,6 +1661,114 @@ sub CheckKids {
 
 =pod
 
+=head1  UpdateKids
+
+parent's SIGUSR2 handler.  This handler:
+
+=item
+
+Rereads the hosts file.
+
+=item
+ 
+Kills off (via SIGQUIT) children for hosts that have disappeared.
+
+=item
+
+HUPs children for hosts that already exist (this just forces a status display
+and resets the connection retry count for that host).
+
+=item
+
+Starts new children for hosts that have been added to the hosts.tab file since
+the start of the master program and maintains them.
+
+=cut
+
+sub UpdateKids {
+    #  Re-read the hosts table and reconcile the child processes with it.
+    Log("INFO", "Updating connections via SIGUSR2");
+
+    #  Just in case we need to kill our own lonc, we wait a few seconds to
+    #  give it a chance to receive and relay lond's response to the
+    #  re-init command.
+    #
+
+    sleep(2);			# Wait a couple of seconds.
+
+    my %hosts;                   # Indexed by loncapa hostname, value=ip.
+
+    # Need to re-read the host table:
+
+    LondConnection::ReadConfig();
+    my $iterator = LondConnection::GetHostIterator;
+    while (! $iterator->end()) {
+	my $item = $iterator->get();
+	$hosts{$item->[0]} = $item->[4];
+	$iterator->next();
+    }
+
+    #  The logic below is written for clarity not for efficiency.
+    #  Since I anticipate that this function is only rarely called, that's
+    #  appropriate.  There are certainly ways to combine the loops below,
+    #  and anyone wishing to obscure the logic is welcome to go for it.
+    #  Note that we don't re-direct sigchild.  Instead we do what's needed
+    #  to the data structures that keep track of children to ensure that
+    #  when sigchild is honored, no new child is born.
+    #
+
+    #  For each existing child; if its host doesn't exist, kill the child
+    #  and forget the host so that it counts as new if it ever comes back.
+    foreach my $child (keys %ChildHash) {
+	my $oldhost = $ChildHash{$child};
+	if (!(exists $hosts{$oldhost})) {
+	    Log("CRITICAL", "Killing child for $oldhost  host no longer exists");
+	    delete $ChildHash{$child};
+	    delete $HostToPid{$oldhost};
+	    delete $HostHash{$oldhost};
+	    kill 'QUIT' => $child;
+	}
+    }
+    # For each remaining existing child; if its host's ip has changed,
+    # Restart the child on the new IP.
+
+    foreach my $child (keys %ChildHash) {
+	my $oldhost = $ChildHash{$child};
+	my $oldip   = $HostHash{$oldhost};
+	if ($hosts{$oldhost} ne $oldip) {
+
+	    # kill the old child.
+
+	    Log("CRITICAL", "Killing child for $oldhost host ip has changed...");
+	    delete $ChildHash{$child};
+	    delete $HostToPid{$oldhost};
+	    kill 'QUIT' => $child;
+
+	    # Do the book-keeping needed to start a new child on the
+	    # new ip.
+
+	    $HostHash{$oldhost} = $hosts{$oldhost};
+	    CreateChild($oldhost);
+	}
+    }
+    # Finally, for each host not yet in the host hash, enter the host and
+    # create a new child.  Every other host's child is sent HUP, but only
+    # when a pid is known: an undefined pid is not a valid signal target.
+    # NOTE(review): children restarted just above also receive this HUP - confirm intended.
+    foreach my $host (keys %hosts) {
+	if(!(exists $HostHash{$host})) {
+	    Log("INFO", "New host $host discovered in hosts.tab...");
+	    $HostHash{$host} = $hosts{$host};
+	    CreateChild($host);
+	} elsif (defined $HostToPid{$host}) {
+	    kill 'HUP' => $HostToPid{$host};    # status display.
+	}
+    }
+}
+
+
+=pod
+
 =head1 Restart
 
 Signal handler for HUP... all children are killed and
@@ -1607,11 +1778,11 @@ the config file.
 =cut
 
 sub Restart {
-    KillThemAll;		# First kill all the children.
+    &KillThemAll;		# First kill all the children.
     Log("CRITICAL", "Restarting");
     my $execdir = $perlvar{'lonDaemons'};
     unlink("$execdir/logs/lonc.pid");
-    exec("$execdir/lonc");
+    exec("$execdir/loncnew");
 }
 
 =pod
@@ -1626,7 +1797,7 @@ SIGHUP.  Responds to sigint and sigterm.
 sub KillThemAll {
     Debug(2, "Kill them all!!");
     local($SIG{CHLD}) = 'IGNORE';      # Our children >will< die.
-    foreach $pid (keys %ChildHash) {
+    foreach my $pid (keys %ChildHash) {
 	my $serving = $ChildHash{$pid};
 	Debug(2, "Killing lonc for $serving pid = $pid");
 	ShowStatus("Killing lonc for $serving pid = $pid");