--- loncom/loncnew	2018/01/16 18:13:29	1.103
+++ loncom/loncnew	2018/12/06 13:52:28	1.106
@@ -2,7 +2,7 @@
 # The LearningOnline Network with CAPA
 # lonc maintains the connections to remote computers
 #
-# $Id: loncnew,v 1.103 2018/01/16 18:13:29 raeburn Exp $
+# $Id: loncnew,v 1.106 2018/12/06 13:52:28 raeburn Exp $
 #
 # Copyright Michigan State University Board of Trustees
 #
@@ -74,6 +74,7 @@ my %perlvar    = %{$perlvarref};
 
 my %ChildPid;			# by pid -> host.
 my %ChildHost;			# by host.
+my %ChildKeyMode;               # by pid -> keymode
 my %listening_to;		# Socket->host table for who the parent
                                 # is listening to.
 my %parent_dispatchers;         # host-> listener watcher events. 
@@ -741,7 +742,7 @@ Parameters:
  
   The socket to kill off.
 
-=item Restart
+=item restart
 
 non-zero if we are allowed to create a new connection.
 
@@ -749,6 +750,7 @@ non-zero if we are allowed to create a n
 
 sub KillSocket {
     my $Socket = shift;
+    my $restart = shift;
 
     Log("WARNING", "Shutting down a socket");
     $Socket->Shutdown();
@@ -765,16 +767,24 @@ sub KillSocket {
     if(exists($ActiveConnections{$Socket})) {
 	$ActiveConnections{$Socket}->cancel;
 	delete($ActiveConnections{$Socket});
-	$ConnectionCount--;
+        # Decrement ConnectionCount unless we will immediately
+        # re-connect (i.e., $restart is true), because this was
+        # a connection where the SSL channel for exchange of the
+        # shared key failed, and we may use an insecure channel.
+        unless ($restart) {
+	    $ConnectionCount--;
+        }
 	if ($ConnectionCount < 0) { $ConnectionCount = 0; }
     }
     #  If the connection count has gone to zero and there is work in the
     #  work queue, the work all gets failed with con_lost.
     #
+  
     if($ConnectionCount == 0) {
 	$LondConnecting = 0;	# No connections so also not connecting.
 	EmptyQueue();
-	CloseAllLondConnections; # Should all already be closed but...
+	CloseAllLondConnections(); # Should all already be closed but...
+        &clear_childpid($$);
     }
     UpdateStatus();
 }
@@ -871,17 +881,33 @@ sub LondReadable {
 
 	Log("WARNING",
 	    "Lond connection lost.");
+        my $state_on_exit = $Socket->GetState();
 	if(exists($ActiveTransactions{$Socket})) {
 	    FailTransaction($ActiveTransactions{$Socket});
 	} else {
 	    #  Socket is connecting and failed... need to mark
 	    #  no longer connecting.
-	   
 	    $LondConnecting = 0;
 	}
 	$Watcher->cancel();
-	KillSocket($Socket);
-	$ConnectionRetriesLeft--;       # Counts as connection failure
+        if ($state_on_exit eq 'ReInitNoSSL') {
+            # SSL certificate verification failed, and insecure connection
+            # allowed. Send restart arg to KillSocket(), so EmptyQueue() 
+            # is not called, as we still hope to process queued request.
+
+            KillSocket($Socket,1);
+
+            # Re-initiate creation of Lond Connection for use with queued
+            # request.
+
+            ShowStatus("Connected to ".$RemoteHost);
+            Log("WARNING","No SSL channel (verification failed), will try with insecure channel");
+            &MakeLondConnection(1);
+
+        } else {
+	    KillSocket($Socket);
+	    $ConnectionRetriesLeft--;       # Counts as connection failure         
+        }
 	return;
     }
     SocketDump(6,$Socket);
@@ -892,6 +918,8 @@ sub LondReadable {
     if($State eq "Initialized") {
 
 
+    } elsif ($State eq "ReInitNoSSL") {
+
     } elsif ($State eq "ChallengeReceived") {
 	#  The challenge must be echoed back;  The state machine
 	# in the connection takes care of setting that up.  Just
@@ -928,7 +956,14 @@ sub LondReadable {
     } elsif ($State eq "ReceivingKey") {
 
     } elsif ($State eq "Idle") {
-   
+
+        if ($ConnectionCount == 1) { 
+            # Write child Pid file to keep track of ssl and insecure
+            # connections
+
+            &record_childpid($Socket);
+        }
+
 	# This is as good a spot as any to get the peer version
 	# string:
    
@@ -1093,7 +1128,9 @@ sub LondWritable {
 
 	$Watcher->cb(\&LondReadable);
 	$Watcher->poll("r");
-	
+
+    } elsif ($State eq "ReInitNoSSL") {
+
     } elsif ($State eq "ChallengeReceived") {
 	# We received the challenge, now we 
 	# are echoing it back. This is a no-op,
@@ -1204,7 +1241,8 @@ start off on it.
 
 =cut
 
-sub MakeLondConnection {     
+sub MakeLondConnection {
+    my ($restart) = @_;
     Debug(4,"MakeLondConnection to ".GetServerHost()." on port "
 	  .GetServerPort());
 
@@ -1212,12 +1250,11 @@ sub MakeLondConnection {
 					 &GetServerPort(),
 					 &GetHostId());
 
-    if($Connection eq undef) {	
+    if($Connection eq undef) {
 	Log("CRITICAL","Failed to make a connection with lond.");
 	$ConnectionRetriesLeft--;
 	return 0;		# Failure.
     }  else {
-
 	$LondConnecting = 1;	# Connection in progress.
 	# The connection needs to have writability 
 	# monitored in order to send the init sequence
@@ -1242,7 +1279,9 @@ sub MakeLondConnection {
 	if ($ConnectionCount == 0) {
 	    &SetupTimer;	# Need to handle timeouts with connections...
 	}
-	$ConnectionCount++;
+        unless ($restart) {
+	    $ConnectionCount++;
+        }
 	$Connection->SetClientData($ConnectionCount);
 	Debug(4, "Connection count = ".$ConnectionCount);
 	if($ConnectionCount == 1) { # First Connection:
@@ -1652,6 +1691,7 @@ sub SignalledToDeath {
 	."died through "."\"$signal\"");
     #LogPerm("F:lonc: $$ on $RemoteHost signalled to death: "
 #	    ."\"$signal\"");
+    &clear_childpid($$);
     exit 0;
 
 }
@@ -1972,6 +2012,7 @@ sub server_died {
 	my $host = $ChildPid{$pid};
 	if($host) {		# It's for real...
 	    &Debug(9, "Caught sigchild for $host");
+            &clear_childpid($pid);
 	    delete($ChildPid{$pid});
 	    delete($ChildHost{$host});
 	    &parent_clean_up($host);
@@ -2143,6 +2184,7 @@ sub UpdateKids {
 
     &KillThemAll();
     LondConnection->ResetReadConfig();
+    ShowStatus('Parent keeping the flock');
 }
 
 
@@ -2187,6 +2229,7 @@ sub KillThemAll {
 	ShowStatus("Nicely Killing lonc for $serving pid = $pid");
 	Log("CRITICAL", "Nicely Killing lonc for $serving pid = $pid");
 	kill 'QUIT' => $pid;
+        &clear_childpid($pid);
     }
     ShowStatus("Finished killing child processes off.");
 }
@@ -2206,6 +2249,7 @@ sub really_kill_them_all_dammit
 	Log("CRITICAL", "Nastily killing lonc for $serving pid = $pid");
 	kill 'KILL' => $pid;
 	delete($ChildPid{$pid});
+        delete($ChildKeyMode{$pid});
 	my $execdir = $perlvar{'lonDaemons'};
 	unlink("$execdir/logs/lonc.pid");
     }
@@ -2234,13 +2278,60 @@ sub Terminate {
 
 }
 
+=pod
+
+=cut
+
 sub my_hostname {
-    use Net::Domain();
-    my $name = &Net::Domain::hostfqdn();
+    use Sys::Hostname::FQDN();
+    my $name = Sys::Hostname::FQDN::fqdn();
     &Debug(9,"Name is $name");
     return $name;
 }
 
+sub record_childpid {
+    my ($Socket) = @_;
+    my $docdir = $perlvar{'lonDocRoot'};
+    my $authmode = $Socket->GetKeyMode();
+    my $peer = $Socket->PeerLoncapaHim();
+    if (($authmode eq 'ssl') || ($authmode eq 'insecure')) {
+        my $childpid = $$;
+        if ($childpid) {
+            unless (exists($ChildKeyMode{$childpid})) {
+                $ChildKeyMode{$childpid} = $authmode;
+            }
+            if (-d "$docdir/lon-status/loncchld") {
+                unless (-e "$docdir/lon-status/loncchld/$childpid") {
+                    if (open (my $pidfh,'>',"$docdir/lon-status/loncchld/$childpid")) {
+                        print $pidfh "$peer:$authmode\n";
+                        close($pidfh);
+                    }
+                }
+            }
+        }
+    }
+    return;
+}
+
+sub clear_childpid {
+    my ($childpid) = @_; 
+    my $docdir = $perlvar{'lonDocRoot'};
+    if (-d "$docdir/lon-status/loncchld") {
+        if ($childpid =~ /^\d+$/) {
+            if (($ChildKeyMode{$childpid} eq 'insecure') ||
+                ($ChildKeyMode{$childpid} eq 'ssl')) {
+                if (-e "$docdir/lon-status/loncchld/$childpid") {
+                    unlink("$docdir/lon-status/loncchld/$childpid");
+                }
+            }
+        }
+    }
+    if (exists($ChildKeyMode{$childpid})) {
+        delete($ChildKeyMode{$childpid});
+    }
+    return;
+}
+
 =pod
 
 =head1 Theory
@@ -2345,6 +2436,12 @@ connection or died.  This should be foll
 
  "WARNING Failing transaction..." msgs for each in-flight or queued transaction.
 
+=item WARNING No SSL channel (verification failed), will try with insecure channel.
+
+Called when promotion of a socket to SSL failed because SSL certificate verification failed.
+Domain configuration must also permit insecure channel use for key exchange. Connection
+negotiation will start again from the beginning, but with Authentication Mode not set to ssl.
+
 =item INFO Connected to lond version:  <version> 
 
 When connection negotiation is complete, the lond version is requested and logged here.