--- loncom/loncnew	2003/06/13 02:38:43	1.9
+++ loncom/loncnew	2003/07/29 02:33:05	1.16
@@ -2,7 +2,7 @@
 # The LearningOnline Network with CAPA
 # lonc maintains the connections to remote computers
 #
-# $Id: loncnew,v 1.9 2003/06/13 02:38:43 foxr Exp $
+# $Id: loncnew,v 1.16 2003/07/29 02:33:05 foxr Exp $
 #
 # Copyright Michigan State University Board of Trustees
 #
@@ -27,7 +27,7 @@
 # http://www.lon-capa.org/
 #
 #
-# new lonc handles n requestors spread out bver m connections to londs.
+# new lonc handles n request out bver m connections to londs.
 # This module is based on the Event class.
 #   Development iterations:
 #    - Setup basic event loop.   (done)
@@ -46,6 +46,29 @@
 
 # Change log:
 #    $Log: loncnew,v $
+#    Revision 1.16  2003/07/29 02:33:05  foxr
+#    Add SIGINT processing to child processes to toggle annoying trace mode
+#    on/off.. will try to use this to isolate the compute boud process issue.
+#
+#    Revision 1.15  2003/07/15 02:07:05  foxr
+#    Added code for lonc/lond transaction timeouts.  Who knows if it works right.
+#    The intent is for a timeout to fail any transaction in progress and kill
+#    off the sockt that timed out.
+#
+#    Revision 1.14  2003/07/03 02:10:18  foxr
+#    Get all of the signals to work correctly.
+#
+#    Revision 1.13  2003/07/02 01:31:55  foxr
+#    Added kill -HUP logic (restart).
+#
+#    Revision 1.11  2003/06/25 01:54:44  foxr
+#    Fix more problems with transaction failure.
+#
+#    Revision 1.10  2003/06/24 02:46:04  foxr
+#    Put a limit on  the number of times we'll retry a connection.
+#    Start getting the signal stuff put in as well...note that need to get signals
+#    going or else 6the client will permanently give up on dead servers.
+#
 #    Revision 1.9  2003/06/13 02:38:43  foxr
 #    Add logging in 'expected format'
 #
@@ -63,6 +86,7 @@ use lib "/home/httpd/lib/perl/";
 use lib "/home/foxr/newloncapa/types";
 use Event qw(:DEFAULT );
 use POSIX qw(:signal_h);
+use POSIX;
 use IO::Socket;
 use IO::Socket::INET;
 use IO::Socket::UNIX;
@@ -81,12 +105,12 @@ use LONCAPA::HashIterator;
 #
 #   Disable all signals we might receive from outside for now.
 #
-$SIG{QUIT}  = IGNORE;
-$SIG{HUP}   = IGNORE;
-$SIG{USR1}  = IGNORE;
-$SIG{INT}   = IGNORE;
-$SIG{CHLD}  = IGNORE;
-$SIG{__DIE__}  = IGNORE;
+#$SIG{QUIT}  = IGNORE;
+#$SIG{HUP}   = IGNORE;
+#$SIG{USR1}  = IGNORE;
+#$SIG{INT}   = IGNORE;
+#$SIG{CHLD}  = IGNORE;
+#$SIG{__DIE__}  = IGNORE;
 
 
 # Read the httpd configuration file to get perl variables
@@ -105,6 +129,7 @@ my $MaxConnectionCount = 10;	# Will get
 my $ClientConnection = 0;	# Uniquifier for client events.
 
 my $DebugLevel = 0;
+my $NextDebugLevel= 10;		# So Sigint can toggle this.
 my $IdleTimeout= 3600;		# Wait an hour before pruning connections.
 
 #
@@ -120,7 +145,9 @@ my $WorkQueue       = Queue->new(); # Qu
 my $ConnectionCount = 0;
 my $IdleSeconds     = 0;	# Number of seconds idle.
 my $Status          = "";	# Current status string.
-
+my $RecentLogEntry  = "";
+my $ConnectionRetries=5;	# Number of connection retries allowed.
+my $ConnectionRetriesLeft=5;	# Number of connection retries remaining.
 
 #
 #   The hash below gives the HTML format for log messages
@@ -134,7 +161,23 @@ $LogFormats{"INFO"}     = "<font color=y
 $LogFormats{"WARNING"}  = "<font color=blue>WARNING: %s</font>";
 $LogFormats{"DEFAULT"}  = " %s ";
 
-my $lastlog = '';		# Used for status reporting.
+
+
+=pod
+
+=head2 LogPerm
+
+Makes an entry into the permanent log file.
+
+=cut
+sub LogPerm {
+    my $message=shift;
+    my $execdir=$perlvar{'lonDaemons'};
+    my $now=time;
+    my $local=localtime($now);
+    my $fh=IO::File->new(">>$execdir/logs/lonnet.perm.log");
+    print $fh "$now:$message:$local\n";
+}
 
 =pod
 
@@ -179,8 +222,10 @@ sub Log {
     my $execdir = $perlvar{'lonDaemons'};
     my $fh      = IO::File->new(">>$execdir/logs/lonc.log");
     my $msg = sprintf($finalformat, $message);
+    $RecentLogEntry = $msg;
     print $fh $msg;
     
+    
 }
 
 
@@ -221,7 +266,7 @@ sub Debug {
     my $level   = shift;
     my $message = shift;
     if ($level <= $DebugLevel) {
-	print $message." host = ".$RemoteHost."\n";
+	Log("INFO", "-Debug- $message host = $RemotHost");
     }
 }
 
@@ -238,13 +283,30 @@ sub SocketDump {
 =head2 ShowStatus
 
  Place some text as our pid status.
+ and as what we return in a SIGUSR1
 
 =cut
 sub ShowStatus {
-    my $status = shift;
-    $0 =  "lonc: ".$status;
-    $Status  = $status;		# Make available for logging.
+    my $state = shift;
+    my $now = time;
+    my $local = localtime($now);
+    $Status   = $local.": ".$state;
+    $0='lonc: '.$state.' '.$local;
+}
+
+=pod
+
+=head 2 SocketTimeout
+
+    Called when an action on the socket times out.  The socket is 
+   destroyed and any active transaction is failed.
+
 
+=cut
+sub SocketTimeout {
+    my $Socket = shift;
+    
+    KillSocket($Socket);
 }
 
 =pod
@@ -259,11 +321,7 @@ Invoked  each timer tick.
 sub Tick {
     my $client;
     ShowStatus(GetServerHost()." Connection count: ".$ConnectionCount);
-    Debug(6, "Tick");
-    Debug(6, "    Current connection count: ".$ConnectionCount);
-    foreach $client (keys %ActiveClients) {
-	Debug(7, "    Have client:  with id: ".$ActiveClients{$client});
-    }
+
     # Is it time to prune connection count:
 
 
@@ -277,18 +335,29 @@ sub Tick {
     } else {
 	$IdleSeconds = 0;	# Reset idle count if not idle.
     }
-
+    #
+    #  For each inflight transaction, tick down its timeout counter.
+    #
+    foreach $item (keys %ActiveTransactions) {
+	my $Socket = $ActiveTransactions{$item}->getServer();
+	$Socket->Tick();
+    }
     # Do we have work in the queue, but no connections to service them?
     # If so, try to make some new connections to get things going again.
     #
     
     my $Requests = $WorkQueue->Count();
-    if (($ConnectionCount == 0)  && ($Requests > 0)) {
-	my $Connections = ($Requests <= $MaxConnectionCount) ?
-	                           $Requests : $MaxConnectionCount;
-	Debug(1,"Work but no connections, starting ".$Connections." of them");
-	for ($i =0; $i < $Connections; $i++) {
-	    MakeLondConnection();
+    if (($ConnectionCount == 0)  && ($Requests > 0)) { 
+	if ($ConnectionRetriesLeft > 0) {
+	    my $Connections = ($Requests <= $MaxConnectionCount) ?
+		$Requests : $MaxConnectionCount;
+	    Debug(1,"Work but no connections, start ".$Connections." of them");
+	    for ($i =0; $i < $Connections; $i++) {
+		MakeLondConnection();
+	    }
+	} else {
+	    Debug(1,"Work in queue, but gave up on connections..flushing\n");
+	    EmptyQueue();	# Connections can't be established.
 	}
        
     }
@@ -410,6 +479,12 @@ sub ClientWritable {
 		
 	    } else {		# Partial string sent.
 		$Watcher->data(substr($Data, $result));
+		if($result == 0) {    # client hung up on us!!
+		    Log("INFO", "lonc pipe client hung up on us!");
+		    $Watcher->cancel;
+		    $Socket->shutdown(2);
+		    $Socket->close();
+		}
 	    }
 	    
 	} else {			# Error of some sort...
@@ -466,6 +541,7 @@ sub CompleteTransaction {
 	StartClientReply($Transaction, $data);
     } else {			# Delete deferred transaction file.
 	Log("SUCCESS", "A delayed transaction was completed");
+	LogPerm("S:$Client:".$Transaction->getRequest());
 	unlink $Transaction->getFile();
     }
 }
@@ -489,6 +565,7 @@ sub StartClientReply {
     my $Transaction   = shift;
     my $data     = shift;
 
+
     my $Client   = $Transaction->getClient();
 
     &Debug(8," Reply was: ".$data);
@@ -525,13 +602,12 @@ Parameters:
 
 sub FailTransaction {
     my $transaction = shift;
-    my $Lond        = $transaction->getServer();
-    if (!$client->isDeferred()) { # If the transaction is deferred we'll get to it.
-	my $client  = $transcation->getClient();
-	StartClientReply($client, "con_lost");
+    Debug(1, "Failing transaction: ".$transaction->getRequest());
+    if (!$transaction->isDeferred()) { # If the transaction is deferred we'll get to it.
+	my $client  = $transaction->getClient();
+	Debug(1," Replying con_lost to ".$transaction->getRequest());
+	StartClientReply($transaction, "con_lost\n");
     }
-# not needed, done elsewhere if active.
-#    delete $ActiveTransactions{$Lond};
 
 }
 
@@ -544,7 +620,7 @@ sub FailTransaction {
 =cut
 sub EmptyQueue {
     while($WorkQueue->Count()) {
-	my $request = $Workqueue->dequeue(); # This is a transaction
+	my $request = $WorkQueue->dequeue(); # This is a transaction
 	FailTransaction($request);
     }
 }
@@ -683,8 +759,10 @@ sub LondReadable {
     my $State = $Socket->GetState(); # All action depends on the state.
 
     SocketDump(6, $Socket);
+    my $status = $Socket->Readable();
+    &Debug(2, "Socket->Readable returned: $status");
 
-    if($Socket->Readable() != 0) {
+    if($status != 0) {
 	 # bad return from socket read. Currently this means that
 	# The socket has become disconnected. We fail the transaction.
 
@@ -949,7 +1027,7 @@ sub QueueDelayed {
 	my $Handle = IO::File->new($reqfile);
 	my $cmd    = <$Handle>;
 	chomp $cmd;		# There may or may not be a newline...
-	$cmd = $cmd."\ny";	# now for sure there's exactly one newline.
+	$cmd = $cmd."\n";	# now for sure there's exactly one newline.
 	my $Transaction = LondTransaction->new($cmd);
 	$Transaction->SetDeferred($reqfile);
 	QueueTransaction($Transaction);
@@ -978,7 +1056,10 @@ sub MakeLondConnection {
 
     if($Connection == undef) {	# Needs to be more robust later.
 	Log("CRITICAL","Failed to make a connection with lond.");
+	$ConnectionRetriesLeft--;
+	return 0;		# Failure.
     }  else {
+	$ConnectionRetriesLeft = $ConnectionRetries; # success resets the count
 	# The connection needs to have writability 
 	# monitored in order to send the init sequence
 	# that starts the whole authentication/key
@@ -1006,6 +1087,7 @@ sub MakeLondConnection {
 	}
 	Log("SUCESS", "Created connection ".$ConnectionCount
 	    ." to host ".GetServerHost());
+	return 1;		# Return success.
     }
     
 }
@@ -1103,7 +1185,6 @@ sub QueueTransaction {
 =pod
 
 =head2 ClientRequest
-
 Callback that is called when data can be read from the UNIX domain
 socket connecting us with an apache server process.
 
@@ -1128,12 +1209,13 @@ sub ClientRequest {
 	close($socket);
 	$watcher->cancel();
 	delete($ActiveClients{$socket});
+	return;
     }
     Debug(8,"Data: ".$data." this read: ".$thisread);
     $data = $data.$thisread;	# Append new data.
     $watcher->data($data);
     if($data =~ /(.*\n)/) {	# Request entirely read.
-	if($data == "close_connection_exit\n") {
+	if($data eq "close_connection_exit\n") {
 	    Log("CRITICAL",
 		"Request Close Connection ... exiting");
 	    CloseAllLondConnections();
@@ -1248,8 +1330,65 @@ sub SetupLoncListener {
 	      fd     => $socket);
 }
 
+=pod 
+
+=head2 ChildStatus
+ 
+Child USR1 signal handler to report the most recent status
+into the status file.
+
+=cut
+sub ChildStatus {
+    my $event = shift;
+    my $watcher = $event->w;
+
+    Debug(2, "Reporting child status because : ".$watcher->data);
+    my $docdir = $perlvar{'lonDocRoot'};
+    my $fh = IO::File->new(">>$docdir/lon-status/loncstatus.txt");
+    print $fh $$."\t".$RemoteHost."\t".$Status."\t".
+	$RecentLogEntry."\n";
+}
+
 =pod
 
+=head2 SignalledToDeath
+
+Called in response to a signal that causes a chid process to die.
+
+=cut
+
+
+sub SignalledToDeath {
+    my $event  = shift;
+    my $watcher= $event->w;
+
+    Debug(2,"Signalled to death! via ".$watcher->data);
+    my ($signal) = @_;
+    chomp($signal);
+    Log("CRITICAL", "Abnormal exit.  Child $$ for $RemoteHost "
+	."died through "."\"$signal\"");
+    LogPerm("F:lonc: $$ on $RemoteHost signalled to death: "
+	    ."\"$signal\"");
+    die("Signal abnormal end");
+    exit 0;
+
+}
+
+=head2 ToggleDebug
+
+This sub toggles trace debugging on and off.
+
+=cut
+
+sub ToggleDebug {
+    my $Current    = $DebugLevel;
+       $DebugLevel = $NextDebugLevel;
+       $NextDebugLevel = $Current;
+
+    Log("SUCCESS", "New debugging level for $RemoteHost now $DebugLevel");
+
+}
+
 =head2 ChildProcess
 
 This sub implements a child process for a single lonc daemon.
@@ -1259,14 +1398,22 @@ This sub implements a child process for
 sub ChildProcess {
 
 
-    # For now turn off signals.
-    
-    $SIG{QUIT}  = IGNORE;
-    $SIG{HUP}   = IGNORE;
-    $SIG{USR1}  = IGNORE;
-    $SIG{INT}   = IGNORE;
-    $SIG{CHLD}  = IGNORE;
-    $SIG{__DIE__}  = IGNORE;
+    #
+    #  Signals must be handled by the Event framework...
+#
+
+    Event->signal(signal   => "QUIT",
+		  cb       => \&SignalledToDeath,
+		  data     => "QUIT");
+    Event->signal(signal   => "HUP",
+		  cb       => \&ChildStatus,
+		  data     => "HUP");
+    Event->signal(signal   => "USR1",
+		  cb       => \&ChildStatus,
+		  data     => "USR1");
+    Event->signal(signal   => "INT",
+		  cb       => \&ToggleDebug,
+		  data     => "INT");
 
     SetupTimer();
     
@@ -1278,12 +1425,9 @@ sub ChildProcess {
 
 # Setup the initial server connection:
     
-    &MakeLondConnection();
+     # &MakeLondConnection(); // let first work requirest do� it.
+
 
-    if($ConnectionCount == 0) {
-	Debug(1,"Could not make initial connection..\n");
-	Debug(1,"Will retry when there's work to do\n");
-    }
     Debug(9,"Entering event loop");
     my $ret = Event::loop();		#  Start the main event loop.
     
@@ -1294,15 +1438,21 @@ sub ChildProcess {
 #  Create a new child for host passed in:
 
 sub CreateChild {
+    my $sigset = POSIX::SigSet->new(SIGINT);
+    sigprocmask(SIG_BLOCK, $sigset);
     my $host = shift;
     $RemoteHost = $host;
     Log("CRITICAL", "Forking server for ".$host);
     $pid          = fork;
     if($pid) {			# Parent
 	$ChildHash{$pid} = $RemoteHost;
+	sigprocmask(SIG_UNBLOCK, $sigset);
+
     } else {			# child.
 	ShowStatus("Connected to ".$RemoteHost);
-	ChildProcess;
+	$SIG{INT} = DEFAULT;
+	sigprocmask(SIG_UNBLOCK, $sigset);
+	ChildProcess;		# Does not return.
     }
 
 }
@@ -1358,11 +1508,21 @@ while (! $HostIterator->end()) {
     CreateChild($hostentryref->[0]);
     $HostIterator->next();
 }
+$RemoteHost = "Parent Server";
 
 # Maintain the population:
 
 ShowStatus("Parent keeping the flock");
 
+#
+#   Set up parent signals:
+#
+
+$SIG{INT}  = \&Terminate;
+$SIG{TERM} = \&Terminate; 
+$SIG{HUP}  = \&Restart;
+$SIG{USR1} = \&CheckKids; 
+
 while(1) {
     $deadchild = wait();
     if(exists $ChildHash{$deadchild}) {	# need to restart.
@@ -1375,6 +1535,93 @@ while(1) {
     }
 }
 
+
+
+=pod
+
+=head1 CheckKids
+
+  Since kids do not die as easily in this implementation
+as the previous one, there  is no need to restart the
+dead ones (all dead kids get restarted when they die!!)
+The only thing this function does is to pass USR1 to the
+kids so that they report their status.
+
+=cut
+
+sub CheckKids {
+    Debug(2, "Checking status of children");
+    my $docdir = $perlvar{'lonDocRoot'};
+    my $fh = IO::File->new(">$docdir/lon-status/loncstatus.txt");
+    my $now=time;
+    my $local=localtime($now);
+    print $fh "LONC status $local - parent $$ \n\n";
+    foreach $pid (keys %ChildHash) {
+	Debug(2, "Sending USR1 -> $pid");
+	kill 'USR1' => $pid;	# Tell Child to report status.
+	sleep 1;		# Wait so file doesn't intermix.
+    }
+}
+
+=pod
+
+=head1 Restart
+
+Signal handler for HUP... all children are killed and
+we self restart.  This is an el-cheapo way to re read
+the config file.
+
+=cut
+
+sub Restart {
+    KillThemAll;		# First kill all the children.
+    Log("CRITICAL", "Restarting");
+    my $execdir = $perlvar{'lonDaemons'};
+    unlink("$execdir/logs/lonc.pid");
+    exec("$execdir/lonc");
+}
+
+=pod
+
+=head1 KillThemAll
+
+Signal handler that kills all children by sending them a 
+SIGINT.  Responds to sigint and sigterm.
+
+=cut
+
+sub KillThemAll {
+    Debug(2, "Kill them all!!");
+    local($SIG{CHLD}) = 'IGNORE';      # Our children >will< die.
+    foreach $pid (keys %ChildHash) {
+	my $serving = $ChildHash{$pid};
+	Debug(2, "Killing lonc for $serving pid = $pid");
+	ShowStatus("Killing lonc for $serving pid = $pid");
+	Log("CRITICAL", "Killing lonc for $serving pid = $pid");
+	kill('INT', $pid);
+	delete($ChildeHash{$pid});
+    }
+    my $execdir = $perlvar{'lonDaemons'};
+    unlink("$execdir/logs/lonc.pid");
+    ShowStatus("Killing the master process");
+    Log("CRITICAL", "Killing the master process.");
+}
+
+=pod
+
+=head1 Terminate
+ 
+Terminate the system.
+
+=cut
+
+sub Terminate {
+    KillThemAll;
+    exit;
+
+}
+=pod
+
 =head1 Theory
 
 The event class is used to build this as a single process with an