Annotation of loncom/lonc, revision 1.56
1.1 albertel 1: #!/usr/bin/perl
2:
3: # The LearningOnline Network
4: # lonc - LON TCP-Client Domain-Socket-Server
5: # provides persistent TCP connections to the other servers in the network
6: # through multiplexed domain sockets
7: #
1.56 ! albertel 8: # $Id: lonc,v 1.55 2003/09/17 19:05:03 albertel Exp $
1.22 www 9: #
10: # Copyright Michigan State University Board of Trustees
11: #
12: # This file is part of the LearningOnline Network with CAPA (LON-CAPA).
13: #
14: # LON-CAPA is free software; you can redistribute it and/or modify
15: # it under the terms of the GNU General Public License as published by
16: # the Free Software Foundation; either version 2 of the License, or
17: # (at your option) any later version.
18: #
19: # LON-CAPA is distributed in the hope that it will be useful,
20: # but WITHOUT ANY WARRANTY; without even the implied warranty of
21: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22: # GNU General Public License for more details.
23: #
24: # You should have received a copy of the GNU General Public License
25: # along with LON-CAPA; if not, write to the Free Software
26: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
27: #
28: # /home/httpd/html/adm/gpl.txt
29: #
30: # http://www.lon-capa.org/
31: #
1.1 albertel 32: # PID in subdir logs/lonc.pid
33: # kill kills
34: # HUP restarts
35: # USR1 tries to open connections again
36:
1.2 www 37: # 6/4/99,6/5,6/7,6/8,6/9,6/10,6/11,6/12,7/14,7/19,
1.5 www 38: # 10/8,10/9,10/15,11/18,12/22,
1.10 www 39: # 2/8,7/25 Gerd Kortemeyer
40: # 12/05 Gerd Kortemeyer
1.23 harris41 41: # YEAR=2001
1.21 www 42: # 03/14/01,03/15,06/12,11/26,11/27,11/28 Gerd Kortemeyer
1.26 www 43: # YEAR=2002
1.29 www 44: # 2/19/02,02/22/02,02/25/02 Gerd Kortemeyer
1.33 foxr 45: # 3/07/02 Ron Fox
1.1 albertel 46: # based on nonforker from Perl Cookbook
47: # - server who multiplexes without forking
1.40 harris41 48:
49: use lib '/home/httpd/lib/perl/';
50: use LONCAPA::Configuration;
1.1 albertel 51:
52: use POSIX;
53: use IO::Socket;
54: use IO::Select;
55: use IO::File;
56: use Socket;
57: use Fcntl;
58: use Tie::RefHash;
59: use Crypt::IDEA;
1.32 foxr 60: #use Net::Ping;
1.26 www 61: use LWP::UserAgent();
1.1 albertel 62:
# ---------------------------------------------------------------- Global state
# These are package globals on purpose: the subroutines in this file read and
# update them directly.
$status='';          # current activity text; logthis() prints it in each line
$lastlog='';         # last message handed to logthis()
$conserver='SHELL';  # tag printed in each log line; replaced by 'PARENT' after
                     # the daemon fork and by the host id in each child
$DEBUG = 0;          # Set to 1 for annoyingly complete logs.
# The keyword must be spelled "Revision" or CVS never expands it.
$VERSION='$Revision$'; #' stupid emacs
$remoteVERSION = undef;   # filled in by openremote() from the peer's reply

# -------------------------------- Set signal handlers to record abnormal exits

&status("Init exception handlers");
$SIG{QUIT}=\&catchexception;
$SIG{__DIE__}=\&catchexception;

# ---------------------------------- Read loncapa_apache.conf and loncapa.conf
&status("Read loncapa.conf and loncapa_apache.conf");
my $perlvarref=LONCAPA::Configuration::read_conf('loncapa.conf');
my %perlvar=%{$perlvarref};
undef $perlvarref;

# ----------------------------- Make sure this process is running from user=www
&status("Check user ID");
my $wwwid=getpwnam('www');
if ($wwwid!=$<) {
    $emailto="$perlvar{'lonAdmEMail'},$perlvar{'lonSysEMail'}";
    $subj="LON: $perlvar{'lonHostID'} User ID mismatch";
    system("echo 'User ID mismatch. lonc must be run as user www.' | mailto $emailto -s '$subj' > /dev/null");
    exit 1;
}

# --------------------------------------------- Check if other instance running

my $pidfile="$perlvar{'lonDaemons'}/logs/lonc.pid";

if (-e $pidfile) {
    my $lfh=IO::File->new("$pidfile");
    my $pide = $lfh ? <$lfh> : undef;
    $pide = '' unless defined $pide;
    chomp($pide);
    # Only probe a real process id.  An empty or garbled pid file would
    # numify to 0, and "kill 0 => 0" signals our own process group, which
    # always succeeds and would wrongly report a running instance.
    if ($pide =~ /^\d+$/ && $pide > 0 && kill 0 => $pide) {
        die "already running";
    }
}

# ------------------------------------------------------------- Read hosts file
# Each line is colon separated: id:domain:role:name:ip.  Only lines that
# carry an ip are recorded, in %hostip and %hostname keyed by id.

open (CONFIG,"$perlvar{'lonTabDir'}/hosts.tab") || die "Can't read host file";

while ($configline=<CONFIG>) {
    my ($id,$domain,$role,$name,$ip)=split(/:/,$configline);
    chomp($ip);
    if ($ip) {
        $hostip{$id}=$ip;
        $hostname{$id}=$name;
    }
}

close(CONFIG);
117:
# -------------------------------------------------------- Routines for forking

%children = ();       # keys are current child process IDs,
                      # values are hosts
%childpid = ();       # the other way around

%childatt = ();       # number of attempts to start server
                      # for ID

$childmaxattempts=15;

# ---------------------------------------------------- Fork once and dissociate
&status("Fork and dissociate");
$fpid=fork;
exit if $fpid;
die "Couldn't fork: $!" unless defined ($fpid);

POSIX::setsid() or die "Can't start new session: $!";

$conserver='PARENT';

# ------------------------------------------------------- Write our PID on disk
&status("Write PID");
$execdir=$perlvar{'lonDaemons'};
if (open (PIDSAVE,">$execdir/logs/lonc.pid")) {
    print PIDSAVE "$$\n";
    close(PIDSAVE);
} else {
    # Keep running, but say so: without the pid file the "other instance
    # running" check at startup cannot find this process.
    &logthis("<font color=red>CRITICAL: Could not write "
             ."$execdir/logs/lonc.pid: $!</font>");
}
&logthis("<font color=red>CRITICAL: ---------- Starting ----------</font>");

# ----------------------------- Ignore signals generated during initial startup
$SIG{HUP}=$SIG{USR1}='IGNORE';
# ------------------------------------------------------- Now we are on our own

# Fork off our children, one for every server

&status("Forking ...");

foreach $thisserver (keys %hostip) {
    #if (&online($hostname{$thisserver})) {
    make_new_child($thisserver);
    #}
}

&logthis("Done starting initial servers");
# ----------------------------------------------------- Install signal handlers


$SIG{INT} = $SIG{TERM} = \&HUNTSMAN;
$SIG{HUP} = \&HUPSMAN;
$SIG{USR1} = \&USRMAN;

# And maintain the population.
while (1) {
    my $deadpid = wait;         # Wait for the next child to die.
                                # See who died and start new one
                                # or a signal (e.g. USR1 for restart).
                                # if a signal, the wait will fail
                                # This is ordinarily detected by
                                # checking for the existence of the
                                # pid index in the children hash since
                                # the return value from a failed wait is -1
                                # which is an impossible PID.
    my $waiterrno = $! + 0;     # save before any other call can change $!
    &status("Woke up");
    my $skipping='';

    if(exists($children{$deadpid})) {

        $thisserver = $children{$deadpid}; # Look name of dead guy's peer.

        delete($children{$deadpid});       # Get rid of dead hash entry.

        if($childatt{$thisserver} < $childmaxattempts) {
            $childatt{$thisserver}++;
            &logthis(
                "<font color=yellow>INFO: Trying to reconnect for $thisserver "
                ."($childatt{$thisserver} of $childmaxattempts attempts)</font>");
            make_new_child($thisserver);

        }
        else {
            $skipping .= $thisserver.' ';
        }
        if($skipping) {
            &logthis("<font color=blue>WARNING: Skipped $skipping</font>");

        }
    }
    elsif ($deadpid == -1 && $waiterrno == ECHILD) {
        # No child processes exist at all (every host used up its
        # attempts), so wait returns at once.  Sleep instead of spinning;
        # an incoming signal such as USR1 still interrupts the sleep.
        sleep 5;
    }

}
207:
208:
1.32 foxr 209:
# make_new_child($hostid)
#   Forks one child for the given host id.
#   Parent: records the child in %children / %childpid and returns.
#   Child:  never returns.  It opens the TCP connection (openremote), sends
#           any delayed messages found for this host, then listens on the
#           UNIX domain socket $perlvar{'lonSockDir'}/<hostid> and
#           multiplexes the local clients over the one remote connection.
sub make_new_child {

    $newserver=shift;
    my $pid;
    my $sigset;
    &logthis("Attempting to start child for server $newserver");
    # block signal for fork
    $sigset = POSIX::SigSet->new(SIGINT);
    sigprocmask(SIG_BLOCK, $sigset)
        or die "Can't block SIGINT for fork: $!\n";

    die "fork: $!" unless defined ($pid = fork);

    if ($pid) {
        # Parent records the child's birth and returns.
        sigprocmask(SIG_UNBLOCK, $sigset)
            or die "Can't unblock SIGINT for fork: $!\n";
        $children{$pid} = $newserver;
        $childpid{$newserver} = $pid;
        return;
    } else {
        $conserver=$newserver;
        # Child can *not* return from this subroutine.
        $SIG{INT} = 'DEFAULT';      # make SIGINT kill us as it did before
        # Children forked at startup have TERM at its default and HUP
        # ignored.  A child re-spawned later would otherwise inherit the
        # parent's HUNTSMAN/HUPSMAN handlers and act on the parent's child
        # table, so give it the same dispositions as the first generation.
        $SIG{TERM} = 'DEFAULT';
        $SIG{HUP}  = 'IGNORE';
        $SIG{USR1}= \&logstatus;

        # unblock signals
        sigprocmask(SIG_UNBLOCK, $sigset)
            or die "Can't unblock SIGINT for fork: $!\n";

# ----------------------------- This is the modified main program of non-forker

        $port = "$perlvar{'lonSockDir'}/$conserver";

        unlink($port);

# -------------------------------------------------------------- Open other end

        &openremote($conserver);
        &logthis("<font color=green> Connection to $conserver open </font>");
# ----------------------------------------- We're online, send delayed messages
        &status("Checking for delayed messages");

        my @allbuffered;
        my $path="$perlvar{'lonSockDir'}/delayed";
        if (opendir(DIRHANDLE,$path)) {
            @allbuffered=grep /\.$conserver$/, readdir DIRHANDLE;
            closedir(DIRHANDLE);
        }
        foreach my $dfile (sort @allbuffered) {
            &status("Sending delayed: $dfile");
            my $dfname="$path/$dfile";
            if($DEBUG) { &logthis('Sending '.$dfname); }
            my $cmd;
            {
                my $dfh=IO::File->new($dfname);
                $cmd=<$dfh> if $dfh;
            }
            unless (defined($cmd)) {
                # Unreadable or empty file: do not send an empty command,
                # and leave the file in place.
                &logthis("<font color=blue>WARNING: ".
                         "Could not read delayed message $dfname</font>");
                next;
            }
            chomp($cmd);
            my $bcmd=$cmd;
            if ($cmd =~ /^encrypt\:/) {
                my $rcmd=$cmd;
                $rcmd =~ s/^encrypt\://;
                chomp($rcmd);
                my $cmdlength=length($rcmd);
                $rcmd.="        ";
                my $encrequest='';
                for (my $encidx=0;$encidx<=$cmdlength;$encidx+=8) {
                    $encrequest.=
                        unpack("H16",$cipher->encrypt(substr($rcmd,$encidx,8)));
                }
                # No trailing newline here: londtransaction appends one,
                # exactly as for the encrypted requests built in handle().
                $cmd="enc:$cmdlength:$encrequest";
            }
            $answer = londtransaction($remotesock, $cmd, 60);
            chomp($answer);

            if (($answer ne '') && ($@!~/timeout/)) {
                unlink("$dfname");
                &logthis("Delayed $cmd: >$answer<");
                &logperm("S:$conserver:$bcmd");
            }
        }
        if($DEBUG) { &logthis("<font color=green> Delayed transactions sent</font>"); }

# ------------------------------------------------------- Listen to UNIX socket
        &status("Opening socket");
        unless (
            $server = IO::Socket::UNIX->new(Local  => $port,
                                            Type   => SOCK_STREAM,
                                            Listen => 10 )
        ) {
            my $st=120+int(rand(240));
            &logthis(
                "<font color=blue>WARNING: ".
                "Can't make server socket ($st secs):  .. exiting</font>");
            sleep($st);
            exit;
        };

# -----------------------------------------------------------------------------

        &logthis("<font color=green>$conserver online</font>");

# -----------------------------------------------------------------------------
# begin with empty buffers
        %inbuffer  = ();
        %outbuffer = ();
        %ready     = ();
        %servers   = ();        # To be compatible with make filevector.  indexed by
                                # File ids, values are sockets.
                                # note that the accept socket is omitted.

        tie %ready, 'Tie::RefHash';

#        nonblock($server);
#        $select = IO::Select->new($server);

# Main loop: check reads/accepts, check writes, check ready to process

        status("Main loop $conserver");
        while (1) {
            my $client;

            my $infdset;        # bit vec of fd's to select on input.

            my $outfdset;       # Bit vec of fd's to select on output.


            $infdset = MakeFileVector(\%servers);
            $outfdset= MakeFileVector(\%outbuffer);
            vec($infdset, $server->fileno, 1) = 1;
            if($DEBUG) {
                &logthis("Adding ".$server->fileno.
                         " to input select vector (listner)".
                         unpack("b*",$infdset)."\n");
            }
            DoSelect(\$infdset, \$outfdset); # Wait for input.
            if($DEBUG) {
                &logthis("Doselect completed!");
                &logthis("ins = ".unpack("b*",$infdset)."\n");
                &logthis("outs= ".unpack("b*",$outfdset)."\n");

            }

            # Checkfor new connections:
            if (vec($infdset, $server->fileno, 1)) {
                if($DEBUG) {
                    &logthis("New connection established");
                }
                # accept a new connection
                &status("Accept new connection: $conserver");
                $client = $server->accept();
                if (!$client) {
                    &logthis("Got stupid nonexisent client on ".$server->fileno." $conserver \n");
                } else {
                    if($DEBUG) {
                        &logthis("New client fd = ".$client->fileno."\n");
                    }
                    $servers{$client->fileno} = $client;
                    nonblock($client);
                    $client->sockopt(SO_KEEPALIVE, 1); # Enable monitoring of
                                                       # connection liveness.
                }
            }
            HandleInput($infdset, \%servers, \%inbuffer, \%outbuffer, \%ready);
            HandleOutput($outfdset, \%servers, \%outbuffer, \%inbuffer,
                         \%ready);
# -------------------------------------------------------- Wow, connection lost

        }

    }
}
1.25 albertel 385:
1.1 albertel 386: # ------------------------------------------------------- End of make_new_child
387:
1.35 foxr 388:
#
# MakeFileVector - build a select() bit vector from the keys of a hash.
#  Parameters:
#     \%fdhash  - reference to a hash whose keys are file descriptor
#                 numbers.  The values are ignored.
#  Returns:
#     A string usable as a select() vector, with one bit set per key.
#     An empty hash yields the empty string.
#
sub MakeFileVector {
    my ($table) = @_;
    my $bits = '';

    for my $fd (keys %{$table}) {
        &logthis("Adding ".$fd.
                 "to select vector. (client)\n") if $DEBUG;
        vec($bits, $fd, 1) = 1;
    }
    return $bits;
}
410:
411:
#
#   HandleOutput:
#      Writes pending output to every client socket that select reported
#      as writable.
#   Parameters:
#      $selvector - Vector of file descriptors which are writable.
#      \%sockets  - Hash of client sockets, keyed by file descriptor number.
#      \%buffers  - Reference to a hash containing output buffers, keyed
#                   like \%sockets.  For each socket whose bit is set in
#                   $selvector the buffer is written to the socket.  The
#                   sockets are assumed to be in non blocking mode, so a
#                   write may be partial; the unwritten tail stays buffered.
#      \%inbufs   - Reference to hash containing input buffers.
#      \%readys   - Reference to hash containing flags for items with complete
#                   requests.
#   A buffer holding exactly "con_lost\n" is the marker handle() uses for a
#   lost lond connection: after sending it the remote connection is shut
#   down and reopened with openremote().
#
sub HandleOutput
{
    my $selvector = shift;
    my $sockets   = shift;
    my $buffers   = shift;
    my $inbufs    = shift;
    my $readys    = shift;

    if($DEBUG) {
        &logthis("HandleOutput entered\n");
    }

    foreach my $sock (keys %$sockets) {
        my $socket = $sockets->{$sock};
        next unless vec($selvector, $sock, 1);  # not writable this pass

        if($DEBUG) {
            &logthis("Sending $buffers->{$sock} \n");
        }
        # Clear errno first so that what we read afterwards belongs to
        # this send and not to some earlier, unrelated failure.
        $! = 0;
        my $rv     = $socket->send($buffers->{$sock}, 0);
        my $errno  = $! + 0;
        my $errstr = "$!";

        if ($buffers->{$sock} eq "con_lost\n") {
            # Kludgy way to mark lond connection lost.
            &logthis(
                "<font color=red>CRITICAL lond connection lost</font>");
            status("Connection lost");
            $remotesock->shutdown(2);
            &logthis("Attempting to open a new connection");
            &openremote($conserver);
            # The marker has been dealt with; leaving it buffered would
            # resend it and reopen the connection again on the next pass.
            delete $buffers->{$sock};
            next;
        }

        unless (defined $rv) {
            # Interrupted or momentarily full: keep the buffer and retry
            # on the next pass.
            if (   $errno == POSIX::EINTR
                || $errno == POSIX::EAGAIN
                || $errno == POSIX::EWOULDBLOCK) {
                next;
            }
            # A real write error.  Drop the client completely, including
            # its entry in the socket table, so the closed descriptor is
            # never selected on again.
            &logthis("Unable to write data with ".$errstr.": ".
                     "Dropping data: ".length($buffers->{$sock}));
            delete $buffers->{$sock};
            delete $inbufs->{$sock};
            delete $readys->{$sock};
            delete $sockets->{$sock};

            close($socket);     # Close the client socket.
            next;
        }

        # send() succeeded, possibly only in part: remove what was written.
        substr($buffers->{$sock}, 0, $rv)="";
        delete $buffers->{$sock} unless length $buffers->{$sock};
    }

}
#
#   HandleInput - Deals with input on client sockets.
#                 Each socket has an associated input buffer.
#                 For each readable socket, the currently available
#                 data is appended to this buffer, which is created if
#                 necessary.  Every complete line (ending in \n) is moved
#                 from the buffer onto that socket's ready list.
#                 A read that returns no data is taken as a disconnect:
#                 the client's buffers and table entry are removed and the
#                 socket is closed.
#   Parameters:
#     $selvec   - Vector of readable sockets.
#     \%sockets - Hash of client sockets keyed by file descriptor number.
#                 Each of these may or may not have its fd bit set
#                 in the $selvec.
#     \%ibufs   - Hash of input buffers, keyed like \%sockets.
#     \%obufs   - Hash of output buffers, keyed like \%sockets.
#     \%ready   - Hash of lists of completed requests, keyed like \%sockets.
#   After reading, handle() is called once for every key of \%ready.
#   NOTE(review): handle() itself works on the global %ready and %outbuffer,
#   so callers are expected to pass references to exactly those hashes.
sub HandleInput
{

    # Marshall the parameters.   Note that the hashes are actually
    # references not values.

    my $selvec  = shift;
    my $sockets = shift;
    my $ibufs   = shift;
    my $obufs   = shift;
    my $ready   = shift;

    if($DEBUG) {
        &logthis("Entered HandleInput\n");
    }
    foreach my $sock (keys %$sockets) {
        my $socket = $sockets->{$sock};
        next unless vec($selvec, $sock, 1);     # not readable this pass

        # Attempt to read the data and do error management.
        my $data = '';
        $! = 0;
        my $rv    = $socket->recv($data, POSIX::BUFSIZ, 0);
        my $errno = $! + 0;     # save before logging can change $!
        if($DEBUG) {
            &logthis("Received $data from socket");
        }
        # A failed read that only means "nothing available right now" or
        # "interrupted" is not a disconnect; try again on the next pass.
        if (!defined($rv)
            && (   $errno == POSIX::EINTR
                || $errno == POSIX::EAGAIN
                || $errno == POSIX::EWOULDBLOCK)) {
            next;
        }
        unless (defined($rv) && length $data) {

            # Read an end of file.. this is a disconnect from the peer.

            delete $sockets->{$sock};
            delete $ibufs->{$sock};
            delete $obufs->{$sock};
            delete $ready->{$sock};

            status("Idle");
            close $socket;
            next;
        }
        #  Append the read data to the input buffer. If the buffer
        # now contains a \n the request is complete and we can
        # mark this in the $ready hash (one request for each \n.)

        $ibufs->{$sock} .= $data;
        while($ibufs->{$sock} =~ s/(.*\n)//) {
            push(@{$ready->{$sock}}, $1);
        }
    }
    # Now handle any requests which are ready:

    foreach my $client (keys %$ready) {
        handle($client);
    }
}
565:
#   DoSelect:  does a select with no timeout.  On signal (errno == EINTR),
#              the select is retried until there are items in the returned
#              vectors.  Any other select failure is fatal.
#
# Parameters:
#   \$readvec   - Reference to a vector of file descriptors to
#                 check for readability.
#   \$writevec  - Reference to a vector of file descriptors to check for
#                 writability.
#  On exit, the referents are modified with vectors indicating which
#  file handles are readable/writable.
#
sub DoSelect {
    my $readvec = shift;
    my $writevec= shift;
    my $outs;
    my $ins;

    while (1) {
        my $nfds  = select( $ins = $$readvec, $outs = $$writevec, undef, undef);
        my $errno = $! + 0;     # save before logging can change $!

        # select returns -1 on failure.  -1 is a true value, so the count
        # has to be tested explicitly; otherwise an interrupted select is
        # mistaken for "descriptors ready".
        if($nfds > 0) {
            if($DEBUG) {
                &logthis("select exited with ".$nfds." fds\n");
                &logthis("ins = ".unpack("b*",$ins).
                         " readvec = ".unpack("b*",$$readvec)."\n");
                &logthis("outs = ".unpack("b*",$outs).
                         " writevec = ".unpack("b*",$$writevec)."\n");
            }
            $$readvec  = $ins;
            $$writevec = $outs;
            return;
        }

        if($DEBUG) {
            &logthis("Select exited with no bits set in mask\n");
        }
        # 0 means nothing was ready: just go round again.  A failure is
        # retried only when a signal caused it.
        if ($nfds < 0) {
            die "Select failed" unless $errno == POSIX::EINTR;
        }
    }
}
605:
# handle($client) deals with all pending requests for $client
#
#   $client is the key under which the requests are stored in the global
#   %ready.  Each request is forwarded to lond over $remotesock and the
#   reply is appended to $outbuffer{$client}.  A request starting with
#   "encrypt:" is encrypted with $cipher before it is sent, and a reply
#   starting with "enc" is decrypted.  If no reply arrives, "con_lost\n" is
#   queued instead.  The request "close_connection_exit" shuts everything
#   down and ends this process.
#
sub handle {
    # requests are in $ready{$client}
    # send output to $outbuffer{$client}
    my $client = shift;
    foreach my $request (@{$ready{$client}}) {
# ============================================================= Process request
        # $request is the text of the request
        # put text of reply into $outbuffer{$client}
# ------------------------------------------------------------ Is this the end?
        chomp($request);
        if($DEBUG) {
            &logthis("<font color=green> Request $request processing starts</font>");
        }
        # The newline has just been removed, so compare without it.
        if ($request eq "close_connection_exit") {
            &status("Request close connection");
            &logthis(
                "<font color=red>CRITICAL: Request Close Connection ... exiting</font>");
            $remotesock->shutdown(2);
            $server->close();
            exit;
        }
# -----------------------------------------------------------------------------
        if ($request =~ /^encrypt\:/) {
            my $cmd=$request;
            $cmd =~ s/^encrypt\://;
            chomp($cmd);
            my $cmdlength=length($cmd);
            $cmd.="        ";   # pad so the last 8-byte block is complete
            my $encrequest='';
            for (my $encidx=0;$encidx<=$cmdlength;$encidx+=8) {
                $encrequest.=
                    unpack("H16",$cipher->encrypt(substr($cmd,$encidx,8)));
            }
            $request="enc:$cmdlength:$encrequest";
        }
# --------------------------------------------------------------- Main exchange
        $answer = londtransaction($remotesock, $request, 60);

        if($DEBUG) {
            &logthis("<font color=green> Request data exchange complete</font>");
        }
        if ($@=~/timeout/) {
            $answer='';
            &logthis(
                "<font color=red>CRITICAL: Timeout: $request</font>");
        }


        if ($answer) {
            if ($answer =~ /^enc/) {
                my ($cmd,$cmdlength,$encinput)=split(/:/,$answer);
                chomp($encinput);
                $answer='';
                for (my $encidx=0;$encidx<length($encinput);$encidx+=16) {
                    $answer.=$cipher->decrypt(
                        pack("H16",substr($encinput,$encidx,16))
                    );
                }
                $answer=substr($answer,0,$cmdlength);
                $answer.="\n";
            }
            if($DEBUG) {
                &logthis("sending $answer to client\n");
            }
            $outbuffer{$client} .= $answer;
        } else {
            $outbuffer{$client} .= "con_lost\n";
        }

        &status("Completed: $request");
        if($DEBUG) {
            &logthis("<font color=green> Request processing complete</font>");
        }
# ===================================================== Done processing request
    }
    delete $ready{$client};
# -------------------------------------------------------------- End non-forker
    if($DEBUG) {
        &logthis("<font color=green> requests for child handled</font>");
    }
}
690: # ------------------------------------------------------------------ End handle
691:
# nonblock($socket) switches the given handle to non-blocking mode by
# adding O_NONBLOCK to its current file status flags.  Dies if the flags
# cannot be read or written.
sub nonblock {
    my ($handle) = @_;

    my $mode = fcntl($handle, F_GETFL, 0);
    die "Can't get flags for socket: $!\n" unless $mode;

    return fcntl($handle, F_SETFL, $mode | O_NONBLOCK)
        || die "Can't make socket nonblocking: $!\n";
}
703:
704:
# openremote($hostid)
#   Opens the TCP connection to the lond of the given host and leaves it in
#   the global $remotesock.  It then runs the start-up dialogue in this
#   order: init (the first reply is sent back as the second request),
#   sethost, version, pong and ekey, and finally builds the IDEA cipher kept
#   in the global $cipher.
#   A failed connect, a failed init or a failed cipher setup makes the
#   process log a warning, sleep a random 120-359 seconds and exit.  Failed
#   sethost, version or pong replies are only logged.
sub openremote {
# ---------------------------------------------------- Client to network server

    my $conserver=shift;

    &status("Opening TCP $conserver");
    my $st=120+int(rand(240)); # Delay before exiting if the connect fails

    unless (
        $remotesock = IO::Socket::INET->new(PeerAddr => $hostname{$conserver},
                                            PeerPort => $perlvar{'londPort'},
                                            Proto    => "tcp",
                                            Type     => SOCK_STREAM)
    ) {

        &logthis(
            "<font color=blue>WARNING: Couldn't connect to $conserver ($st secs): </font>");
        sleep($st);
        exit;
    };
# ----------------------------------------------------------------- Init dialog

    &logthis("<font color=green>INFO Connected to $conserver, initing</font>");
    &status("Init dialogue: $conserver");

    $answer = londtransaction($remotesock, "init", 60);
    chomp($answer);
    $answer = londtransaction($remotesock, $answer, 60);
    chomp($answer);

    if ($@=~/timeout/) {
        &logthis("Timed out during init.. exiting");
        exit;
    }

    if ($answer ne 'ok') {
        &logthis("Init reply: >$answer<");
        my $st=120+int(rand(240));
        &logthis("<font color=blue>WARNING: Init failed ($st secs)</font>");
        sleep($st);
        exit;
    }

    $answer = londtransaction($remotesock,"sethost:$conserver",60);
    chomp($answer);
    if ( $answer ne 'ok') {
        &logthis('<font color="blue">WARNING: unable to specify remote host'.
                 $answer.'</font>');
    }

    $answer = londtransaction($remotesock,"version:$VERSION",60);
    chomp($answer);
    if ($answer =~ /^version:/) {
        $remoteVERSION=(split(/:/,$answer))[1];
    } else {
        &logthis('<font color="blue">WARNING: request remote version failed :'.
                 $answer.': my version is :'.$VERSION.':</font>');
    }

    sleep 5;
    &status("Ponging $conserver");
    $answer= londtransaction($remotesock,"pong",60);
    chomp($answer);
    # \Q..\E: the host id is data, not a pattern.
    if ($answer!~/^\Q$conserver\E/) {
        &logthis("Pong reply: >$answer<");
    }
# ----------------------------------------------------------- Initialize cipher

    &status("Initialize cipher");
    # NOTE(review): $buildkey is used as received and is not chomped here;
    # confirm against the peer's key derivation whether a trailing newline
    # is expected to take part in the key.
    my $buildkey=londtransaction($remotesock,"ekey",60);
    my $key=$conserver.$perlvar{'lonHostID'};
    $key=~tr/a-z/A-Z/;
    $key=~tr/G-P/0-9/;
    $key=~tr/Q-Z/0-9/;
    $key=$key.$buildkey.$key.$buildkey.$key.$buildkey;
    $key=substr($key,0,32);
    my $cipherkey=pack("H32",$key);
    if ($cipher=IDEA->new($cipherkey)) {
        &logthis("Secure connection initialized");
    } else {
        my $st=120+int(rand(240));
        &logthis("<font color=blue>WARNING: ".
                 "Could not establish secure connection ($st secs)!</font>");
        sleep($st);
        exit;
    }
    &logthis("<font color=green> Remote open success </font>");
}
1.30 www 793:
794:
795:
# Handler installed for SIGQUIT and for __DIE__: writes the reason to the
# log and then dies.  Both hooks are put back to their defaults first, so
# the final die below does not come back into this handler.
sub catchexception {
    my $signal = shift;
    @SIG{'QUIT', '__DIE__'} = ('DEFAULT', 'DEFAULT');
    chomp($signal);
    my $report = "<font color=red>CRITICAL: "
        ."ABNORMAL EXIT. Child $$ for server [$wasserver] died through "
        ."\"$signal\" with parameter </font>";
    &logthis($report);
    die("Signal abend");
}
807:
808: # -------------------------------------- Routines to see if other box available
809:
1.32 foxr 810: #sub online {
811: # my $host=shift;
812: # &status("Pinging ".$host);
813: # my $p=Net::Ping->new("tcp",20);
814: # my $online=$p->ping("$host");
815: # $p->close();
816: # undef ($p);
817: # return $online;
818: #}
1.30 www 819:
# connected($local,$remote)
#   Requests /cgi-bin/ping.pl?<remote> from the web server of host $local
#   and looks at the first line of the reply.
#   Returns 'ok' when that line, stripped of non-word characters, equals
#   $remote, and otherwise the stripped line itself.  Returns
#   'local_unknown' or 'remote_unknown' when a host id is not in %hostname,
#   and 'local_error' when the HTTP request fails.
sub connected {
    my ($local,$remote)=@_;
    &status("Checking connection $local to $remote");
    s/\W//g for ($local, $remote);

    return 'local_unknown'  unless $hostname{$local};
    return 'remote_unknown' unless $hostname{$remote};

    #unless (&online($hostname{$local})) { return 'local_offline'; }

    my $agent = LWP::UserAgent->new;
    my $url   = "http://".$hostname{$local}.'/cgi-bin/ping.pl?'.$remote;
    my $response = $agent->request(HTTP::Request->new('GET', $url));

    return 'local_error' unless $response->is_success;

    my $reply = (split("\n", $response->content))[0];
    $reply =~ s/\W//g;
    return ($reply ne $remote) ? $reply : 'ok';
}
846:
847:
848:
# For every child in %children: send 'exit' through that child's local
# socket (via subreply), log the reply, then send the child SIGINT.
# The host being closed is kept in the global $wasserver.
sub hangup {
    foreach my $pid (keys %children) {
        $wasserver = $children{$pid};
        &status("Closing $wasserver");
        my $reply = &subreply('exit', $wasserver);
        &logthis('Closing '.$wasserver.': '.$reply);
        &status("Kill PID $pid for $wasserver");
        kill('INT', $pid);
    }
}
858:
# Signal handler for SIGINT and SIGTERM: stops all children, removes the
# pid file, logs the shutdown and exits.
sub HUNTSMAN {
    local $SIG{CHLD} = 'IGNORE';        # we're going to kill our children
    &hangup();
    my $pidpath = $perlvar{'lonDaemons'}."/logs/lonc.pid";
    unlink($pidpath);
    &logthis("<font color=red>CRITICAL: Shutting down</font>");
    exit;                               # clean up with dignity
}
867:
# Signal handler for SIGHUP: stops all children, logs the restart, removes
# the pid file and replaces this process with a fresh lonc.
sub HUPSMAN {
    local $SIG{CHLD} = 'IGNORE';        # we're going to kill our children
    &hangup();
    &logthis("<font color=red>CRITICAL: Restarting</font>");
    my $home = $perlvar{'lonDaemons'};
    unlink($home."/logs/lonc.pid");
    exec($home."/lonc");                # here we go again
}
876:
# Sends USR1 to every child in %children, one per second in sorted pid
# order, and logs each child that cannot be signalled.
sub checkchildren {
    &initnewstatus();
    &logstatus();
    &logthis('Going to check on the children');
    foreach my $pid (sort keys %children) {
        sleep 1;
        next if kill('USR1', $pid);
        &logthis('<font color=red>CRITICAL: Child '.$pid.' is dead</font>');
        # NOTE(review): this passes our own pid ($$), not the dead child's
        # pid -- confirm what logstatus expects here.
        &logstatus($$.' is dead');
    }
}
889:
# Signal handler for SIGUSR1 in the parent: gives every host a fresh set of
# start attempts and restarts the hosts that had used theirs up.
sub USRMAN {
    &logthis("USR1: Trying to establish connections again");
    #
    #  It is really important not to just clear the childatt hash or we will
    #  lose all memory of the children.  What we really want to do is this:
    #   For each index where childatt is >= $childmaxattempts
    #     Zero the associated counter and do a make_child for the host.
    #  Regardless, the childatt entry is zeroed:
    foreach my $host (keys %childatt) {
        my $exhausted = ($childatt{$host} >= $childmaxattempts);
        $childatt{$host} = 0;
        next unless $exhausted;

        # The counter reaches the maximum when the last allowed child is
        # started, and that child may still be running.  Starting another
        # one would give the host two children on the same local socket.
        # An entry in %children means the main loop has not yet reaped
        # that child.
        my $pid = $childpid{$host};
        if (defined($pid) && exists($children{$pid})
            && $children{$pid} eq $host) {
            next;
        }

        &logthis("<font color=green>INFO: Restarting child for server: "
                 .$host."</font>\n");
        make_new_child($host);
    }
    &checkchildren();           # See if any children are still dead...
}
912:
# -------------------------------------------------- Non-critical communication
# subreply($cmd,$server)
#   Sends $cmd through the local UNIX socket $perlvar{'lonSockDir'}/$server
#   and returns the answer.
#   Returns 'self_reply' without any I/O when $server is this host, and
#   'con_lost' when the socket cannot be opened, no answer comes back or
#   the transaction timed out.
#   Afterwards SIGALRM is set to its default and the __DIE__ hook is set
#   back to catchexception.
sub subreply {
    my ($cmd,$server)=@_;

    return 'self_reply' if $server eq $perlvar{'lonHostID'};

    my $peerfile = "$perlvar{'lonSockDir'}/$server";
    my $sclient  = IO::Socket::UNIX->new(Peer    => "$peerfile",
                                         Type    => SOCK_STREAM,
                                         Timeout => 10);
    return "con_lost" unless $sclient;

    my $answer = londtransaction($sclient, $cmd, 10);
    $answer = "con_lost" if (!$answer) || ($@=~/timeout/);

    $SIG{ALRM}    = 'DEFAULT';
    $SIG{__DIE__} = \&catchexception;
    return $answer;
}
933:
934: # --------------------------------------------------------------------- Logging
935:
# logthis($message)
#   Appends one line to $perlvar{'lonDaemons'}/logs/lonc.log:
#       <local time> (<pid>) [<$conserver>] [<$status>]: <message>
#   and remembers "<local time>: <message>" in the global $lastlog.
#   If the log file cannot be opened the message is dropped; logging must
#   never be the thing that kills the daemon.
sub logthis {
    my $message=shift;
    my $execdir=$perlvar{'lonDaemons'};
    my $now=time;
    my $local=localtime($now);
    $lastlog=$local.': '.$message;
    my $fh=IO::File->new(">>$execdir/logs/lonc.log");
    return unless $fh;   # printing to an undefined handle would be fatal
    print $fh "$local ($$) [$conserver] [$status]: $message\n";
}
945:
#-------------------------------------- londtransaction:
#
#  Performs a transaction with lond with timeout support.
#    result = londtransaction(socket,request,timeout)
#
#  socket  : connected handle; the request is printed to it and one line
#            is read back from it
#  request : request text; a newline is appended before sending
#  timeout : seconds allowed for the send and, separately, for the read
#
#  Returns the line read from the socket (not chomped).  Dies with a
#  "suiciding on send Timeout" / "suiciding on read Timeout" message when
#  the corresponding phase times out, and dies when the signal mask cannot
#  be changed.
#
#  While the transaction runs, SIGQUIT, SIGUSR1, SIGHUP, SIGINT and SIGTERM
#  are blocked so that only SIGALRM can interrupt the send/receive.  The
#  previous signal mask is put back before returning or before dying on a
#  timeout.  On normal return $SIG{ALRM} is 'DEFAULT' and $SIG{__DIE__} is
#  \&catchexception.
#
sub londtransaction {
    my ($socket, $request, $tmo) = @_;

    if($DEBUG) {
        &logthis("londtransaction request: $request");
    }

    # Set the signal handlers: ALRM for timeout and disable the others.

    $SIG{ALRM}    = sub { die "timeout" };
    $SIG{__DIE__} = 'DEFAULT';

    # Block everything but alarm so that only that can interrupt the
    # send / receive.  POSIX::SigSet wants signal *numbers*, hence the
    # POSIX constants rather than bare signal names.
    #
    my $sigset    = POSIX::SigSet->new(POSIX::SIGQUIT(), POSIX::SIGUSR1(),
                                       POSIX::SIGHUP(),  POSIX::SIGINT(),
                                       POSIX::SIGTERM());
    my $priorsigs = POSIX::SigSet->new;
    unless (defined POSIX::sigprocmask(POSIX::SIG_BLOCK(),
                                       $sigset, $priorsigs)) {
        &logthis("<font color=red> CRITICAL -- londtransaction ".
                 "failed to block signals </font>");
        die "could not block signals in londtransaction";
    }

    my $answer    = '';
    my $timed_out = '';         # becomes 'send' or 'read' on a timeout

    #
    #  Send request to lond.
    #
    eval {
        alarm($tmo);
        print $socket "$request\n";
        alarm(0);
    };

    if ($@ =~ /timeout/) {
        $timed_out = 'send';
    } else {
        #  The request didn't time out, try for the response.
        eval {
            alarm($tmo);
            $answer = <$socket>;
            if($DEBUG) {
                &logthis("Received $answer in londtransaction");
            }
            alarm(0);
        };
        $timed_out = 'read' if ($@ =~ /timeout/);
    }
    alarm(0);   # never leave an alarm pending, whatever ended the evals

    #
    # Restore the initial sigmask set.  SIG_SETMASK reinstates exactly the
    # mask saved above; SIG_UNBLOCK with the saved set would leave the
    # signals blocked here blocked forever.
    #
    unless (defined POSIX::sigprocmask(POSIX::SIG_SETMASK(), $priorsigs)) {
        &logthis("<font color=red> CRITICAL -- londtransaction ".
                 "failed to re-enable signal processing. </font>");
        die "londtransaction failed to re-enable signals";
    }

    # A timeout is fatal for this process; the mask is already restored.
    if ($timed_out) {
        &logthis("lonc - $conserver - suiciding on $timed_out Timeout");
        die("lonc - $conserver - suiciding on $timed_out Timeout");
    }

    #
    # go back to the prior handler set.
    #
    $SIG{ALRM}    = 'DEFAULT';
    $SIG{__DIE__} = \&catchexception;

    #    chomp $answer;
    if ($DEBUG) {
        &logthis("Returning $answer in londtransaction");
    }
    return $answer;

}
1.30 www 1023:
# logperm - append one record to logs/lonnet.perm.log below
# $perlvar{'lonDaemons'}.
#
#   $message : text to record
#
# The record has the form "<epoch seconds>:<message>:<local time>".
# If the file cannot be opened the record is dropped rather than printed
# to an undefined handle, which would be a fatal error.
sub logperm {
    my $message=shift;
    my $execdir=$perlvar{'lonDaemons'};
    my $now=time;
    my $local=localtime($now);

    my $fh=IO::File->new("$execdir/logs/lonnet.perm.log",'>>');
    return unless $fh;

    print $fh "$now:$message:$local\n";
}
1032: # ------------------------------------------------------------------ Log status
1033:
# logstatus - append this process's status line to
# lon-status/loncstatus.txt below $perlvar{'lonDocRoot'}.
#
# The line holds, tab separated: process id, $conserver, $status and
# $lastlog.  Any arguments passed in are ignored.
#
# If the status file cannot be opened nothing is written; printing to an
# undefined handle would be a fatal error.
sub logstatus {
    my $docdir=$perlvar{'lonDocRoot'};

    my $fh=IO::File->new("$docdir/lon-status/loncstatus.txt",'>>');
    return unless $fh;

    print $fh $$."\t".$conserver."\t".$status."\t".$lastlog."\n";
}
1039:
# initnewstatus - start a fresh lon-status/loncstatus.txt below
# $perlvar{'lonDocRoot'}.
#
# The file is truncated and a header naming the local time and this
# process's id (reported as the parent) is written, followed by an empty
# line.
#
# If the file cannot be opened nothing is written; printing to an
# undefined handle would be a fatal error.
sub initnewstatus {
    my $docdir=$perlvar{'lonDocRoot'};

    my $fh=IO::File->new("$docdir/lon-status/loncstatus.txt",'>');
    return unless $fh;

    my $now=time;
    my $local=localtime($now);
    print $fh "LONC status $local - parent $$\n\n";
}
1047:
1048: # -------------------------------------------------------------- Status setting
1049:
# status - record what this process is currently doing.
#
#   $what : short description of the current activity
#
# Stores "<local time>: <what>" in $status and rewrites the process name
# ($0) to "lonc: <what> <local time>".
sub status {
    my ($what) = @_;
    my $stamp = localtime(time);

    $status = "$stamp: $what";
    $0 = "lonc: $what $stamp";
}
1057:
1058:
1.1 albertel 1059:
1.23 harris41 1060: # ----------------------------------- POD (plain old documentation, CPAN style)
1061:
1062: =head1 NAME
1063:
lonc - LON TCP-Client Domain-Socket-Server; provides persistent TCP connections to the other servers in the network through multiplexed domain sockets.
1065:
1066: =head1 SYNOPSIS
1067:
1.31 harris41 1068: Usage: B<lonc>
1069:
1.23 harris41 1070: Should only be run as user=www. This is a command-line script which
1.31 harris41 1071: is invoked by B<loncron>. There is no expectation that a typical user
1072: will manually start B<lonc> from the command-line. (In other words,
1073: DO NOT START B<lonc> YOURSELF.)
1.23 harris41 1074:
1.51 bowersj2 1075: =head1 OVERVIEW
1076:
1077: =head2 Physical Overview
1078:
1079: =begin latex
1080:
1081: \begin{figure}
1082: \begin{center}
1083: \includegraphics[width=0.65\paperwidth,keepaspectratio]{LONCAPA_Network_Diagram}
1084: \end{center}
1085: \caption{\label{Overview_Of_Network}Overview of Network}
1086: \end{figure}
1087:
1088: =end latex
1089:
1090: Physically, the Network consists of relatively inexpensive
1091: upper-PC-class server machines which are linked through the commodity
1092: internet in a load-balancing, dynamically content-replicating and
1093: failover-secure way.
1094:
1095: All machines in the Network are connected with each other through
1096: two-way persistent TCP/IP connections. Clients (B<B>, B<F>, B<G> and
1097: B<H> in Fig. Overview of Network) connect to the servers via standard
1098: HTTP. There are two classes of servers, B<Library Servers> (B<A> and
1099: B<E> in Fig. Overview of Network) and B<Access Servers> (B<C>, B<D>,
1100: B<I> and B<J> in Fig. Overview of Network).
1101:
1102: B<Library Servers> X<library server> X<server, library> are used to
1103: store all personal records of a set of users, and are responsible for
1104: their initial authentication when a session is opened on any server in
the Network. For Authors, Library Servers also host their
1106: construction area and the authoritative copy of the current and
1107: previous versions of every resource that was published by that
1108: author. Library servers can be used as backups to host sessions when
1109: all access servers in the Network are overloaded. Otherwise, for
1110: learners, access servers are used to host the sessions. Library
1111: servers need to have strong I/O capabilities.
1112:
1113: B<Access Servers> X<access server> X<server, access> provide LON-CAPA
1114: service to users, using the library servers as their data source. The
1115: network is designed so that the number of concurrent sessions can be
1116: increased over a wide range by simply adding additional access servers
1117: before having to add additional library servers. Preliminary tests
1118: showed that a library server could handle up to 10 access servers
fully parallel. Access servers can generally be cheaper hardware than
library servers require.
1121:
1122: The Network is divided into B<domains> X<domain>, which are logical
1123: boundaries between participating institutions. These domains can be
1124: used to limit the flow of personal user information across the
1125: network, set access privileges and enforce royalty schemes. LON-CAPA
1126: domains bear no relationship to any other domain, including domains
1127: used by the DNS system; LON-CAPA domains may be freely configured in
1128: any manner that suits your use pattern.
1129:
1130: =head2 Example Transactions
1131:
1132: Fig. Overview of Network also depicts examples for several kinds of
1133: transactions conducted across the Network.
1134:
1135: An instructor at client B<B> modifies and publishes a resource on her
1136: Home Server B<A>. Server B<A> has a record of all server machines
1137: currently subscribed to this resource, and replicates it to servers
1138: B<D> and B<I>. However, server B<D> is currently offline, so the
1139: update notification gets buffered on B<A> until B<D> comes online
1140: again. Servers B<C> and B<J> are currently not subscribed to this
1141: resource.
1142:
1143: Learners B<F> and B<G> have open sessions on server B<I>, and the new
1144: resource is immediately available to them.
1145:
1146: Learner B<H> tries to connect to server B<I> for a new session,
1147: however, the machine is not reachable, so he connects to another
1148: Access Server B<J> instead. This server currently does not have all
1149: necessary resources locally present to host learner B<H>, but
1150: subscribes to them and replicates them as they are accessed by B<H>.
1151:
1152: Learner B<H> solves a problem on server B<J>. Library Server B<E> is
1153: B<H>'s Home Server, so this information gets forwarded to B<E>, where
1154: the records of H are updated.
1155:
1.52 bowersj2 1156: =head2 lond, lonc, and lonnet
1.51 bowersj2 1157:
1158: =begin latex
1159:
1160: \begin{figure}
1.52 bowersj2 1161: \includegraphics[width=0.65\paperwidth,keepaspectratio]{LONCAPA_Network_Diagram2}
1.51 bowersj2 1162: \caption{\label{Overview_Of_Network_Communication}Overview of
1163: Network Communication} \end{figure}
1164:
1165: =end latex
1166:
1167: Fig. Overview of Network Communication elaborates on the details of
1168: this network infrastructure. It depicts three servers (B<A>, B<B> and
1169: B<C>) and a client who has a session on server B<C>.
1170:
1171: As B<C> accesses different resources in the system, different
1172: handlers, which are incorporated as modules into the child processes
1173: of the web server software, process these requests.
1174:
1175: Our current implementation uses C<mod_perl> inside of the Apache web
1176: server software. As an example, server B<C> currently has four active
1177: web server software child processes. The chain of handlers dealing
1178: with a certain resource is determined by both the server content
1179: resource area (see below) and the MIME type, which in turn is
1180: determined by the URL extension. For most URL structures, both an
1181: authentication handler and a content handler are registered.
1182:
1183: Handlers use a common library C<lonnet> X<lonnet> to interact with
1184: both locally present temporary session data and data across the server
1185: network. For example, lonnet provides routines for finding the home
1186: server of a user, finding the server with the lowest loadavg, sending
1187: simple command-reply sequences, and sending critical messages such as
1188: a homework completion, etc. For a non-critical message, the routines
1189: reply with a simple "connection lost" if the message could not be
1190: delivered. For critical messages, lonnet tries to re-establish
1191: connections, re-send the command, etc. If no valid reply could be
1192: received, it answers "connection deferred" and stores the message in
1193: buffer space to be sent at a later point in time. Also, failed
1194: critical messages are logged.
1195:
1196: The interface between C<lonnet> and the Network is established by a
1197: multiplexed UNIX domain socket, denoted B<DS> in Fig. Overview of
1198: Network Communication. The rationale behind this rather involved
1199: architecture is that httpd processes (Apache children) dynamically
1200: come and go on the timescale of minutes, based on workload and number
1201: of processed requests. Over the lifetime of an httpd child, however,
1202: it has to establish several hundred connections to several different
1203: servers in the Network.
1204:
1205: On the other hand, establishing a TCP/IP connection is resource
1206: consuming for both ends of the line, and to optimize this connectivity
1207: between different servers, connections in the Network are designed to
1208: be persistent on the timescale of months, until either end is
1209: rebooted. This mechanism will be elaborated on below.
1210:
1211: =begin latex
1212:
1213: \begin{figure}
1214: \begin{lyxcode}
1215: msul1:msu:library:zaphod.lite.msu.edu:35.8.63.51
1216:
1217: msua1:msu:access:agrajag.lite.msu.edu:35.8.63.68
1218:
1219: msul2:msu:library:frootmig.lite.msu.edu:35.8.63.69
1220:
1221: msua2:msu:access:bistromath.lite.msu.edu:35.8.63.67
1222:
1223: hubl14:hub:library:hubs128-pc-14.cl.msu.edu:35.8.116.34
1224:
1225: hubl15:hub:library:hubs128-pc-15.cl.msu.edu:35.8.116.35
1226:
1227: hubl16:hub:library:hubs128-pc-16.cl.msu.edu:35.8.116.36
1228:
1229: huba20:hub:access:hubs128-pc-20.cl.msu.edu:35.8.116.40
1230:
1231: huba21:hub:access:hubs128-pc-21.cl.msu.edu:35.8.116.41
1232:
1233: huba22:hub:access:hubs128-pc-22.cl.msu.edu:35.8.116.42
1234:
1235: huba23:hub:access:hubs128-pc-23.cl.msu.edu:35.8.116.43
1236:
1237: hubl25:other:library:hubs128-pc-25.cl.msu.edu:35.8.116.45
1238:
1239: huba27:other:access:hubs128-pc-27.cl.msu.edu:35.8.116.47
1240: \end{lyxcode}
1241:
1242: \caption{\label{Example_Of_hosts.tab}Example of Hosts Lookup table\texttt{/home/httpd/lonTabs/hosts.tab}}
1243: \end{figure}
1244:
1245: =end latex
1246:
1247: Establishing a connection to a UNIX domain socket is far less resource
1248: consuming than the establishing of a TCP/IP connection. C<lonc>
1249: X<lonc> is a proxy daemon that forks off a child for every server in
1250: the Network. Which servers are members of the Network is determined by
1251: a lookup table, such as the one in Fig. Examples of Hosts. In order,
1252: the entries denote an internal name for the server, the domain of the
1253: server, the type of the server, the host name and the IP address.
1254:
1255: The C<lonc> parent process maintains the population and listens for
1256: signals to restart or shutdown, as well as I<USR1>. Every child
1257: establishes a multiplexed UNIX domain socket for its server and opens
1258: a TCP/IP connection to the lond daemon (discussed below) on the remote
1259: machine, which it keeps alive. If the connection is interrupted, the
1260: child dies, whereupon the parent makes several attempts to fork
1261: another child for that server.
1262:
1263: When starting a new child (a new connection), first an init-sequence
1264: is carried out, which includes receiving the information from the
1265: remote C<lond> which is needed to establish the 128-bit encryption key
1266: - the key is different for every connection. Next, any buffered
1267: (delayed) messages for the server are sent.
1268:
1269: In normal operation, the child listens to the UNIX socket, forwards
1270: requests to the TCP connection, gets the reply from C<lond>, and sends
it back to the UNIX socket. Also, C<lonc> takes care of the encryption
and decryption of messages.
1273:
1274: C<lond> X<lond> is the remote end of the TCP/IP connection and acts as
1275: a remote command processor. It receives commands, executes them, and
1276: sends replies. In normal operation, a C<lonc> child is constantly
1277: connected to a dedicated C<lond> child on the remote server, and the
1278: same is true vice versa (two persistent connections per server
1279: combination).
1280:
1281: lond listens to a TCP/IP port (denoted B<P> in Fig. Overview of
1282: Network Communication) and forks off enough child processes to have
1283: one for each other server in the network plus two spare children. The
1284: parent process maintains the population and listens for signals to
1285: restart or shutdown. Client servers are authenticated by IP.
1286:
1287: When a new client server comes online, C<lond> sends a signal I<USR1>
1288: to lonc, whereupon C<lonc> tries again to reestablish all lost
1289: connections, even if it had given up on them before - a new client
1290: connecting could mean that that machine came online again after an
1291: interruption.
1292:
1293: The gray boxes in Fig. Overview of Network Communication denote the
1294: entities involved in an example transaction of the Network. The Client
1295: is logged into server B<C>, while server B<B> is her Home
1296: Server. Server B<C> can be an access server or a library server, while
1297: server B<B> is a library server. She submits a solution to a homework
1298: problem, which is processed by the appropriate handler for the MIME
1299: type "problem". Through C<lonnet>, the handler writes information
1300: about this transaction to the local session data. To make a permanent
1301: log entry, C<lonnet> establishes a connection to the UNIX domain
1302: socket for server B<B>. C<lonc> receives this command, encrypts it,
1303: and sends it through the persistent TCP/IP connection to the TCP/IP
1304: port of the remote C<lond>. C<lond> decrypts the command, executes it
1305: by writing to the permanent user data files of the client, and sends
1306: back a reply regarding the success of the operation. If the operation
1307: was unsuccessful, or the connection would have broken down, C<lonc>
1308: would write the command into a FIFO buffer stack to be sent again
1309: later. C<lonc> now sends a reply regarding the overall success of the
1310: operation to C<lonnet> via the UNIX domain port, which is eventually
1311: received back by the handler.
1312:
1313: =head2 Dynamic Resource Replication
1314:
1315: Since resources are assembled into higher order resources simply by
1316: reference, in principle it would be sufficient to retrieve them from
1317: the respective Home Servers of the authors. However, there are several
1318: problems with this simple approach: since the resource assembly
1319: mechanism is designed to facilitate content assembly from a large
1320: number of widely distributed sources, individual sessions would depend
1321: on a large number of machines and network connections to be available,
1322: thus be rather fragile. Also, frequently accessed resources could
1323: potentially drive individual machines in the network into overload
1324: situations.
1325:
1326: Finally, since most resources depend on content handlers on the Access
1327: Servers to be served to a client within the session context, the raw
1328: source would first have to be transferred across the Network from the
1329: respective Library Server to the Access Server, processed there, and
1330: then transferred on to the client.
1331:
1332: =begin latex
1333:
1334: \begin{figure}
1335: \includegraphics[width=0.75\paperwidth,keepaspectratio]{Dynamic_Replication_Request}
1336: \caption{\label{Dynamic_Replication}Dynamic Replication}
1337: \end{figure}
1338:
1339: =end latex
1340:
1341: To enable resource assembly in a reliable and scalable way, a dynamic
1342: resource replication scheme was developed. Fig. "Dynamic Replication"
1343: shows the details of this mechanism.
1344:
1345: Anytime a resource out of the resource space is requested, a handler
1346: routine is called which in turn calls the replication routine. As a
first step, this routine determines whether or not the resource is
1348: currently in replication transfer (Step B<D1a>). During replication
1349: transfer, the incoming data is stored in a temporary file, and Step
1350: B<D1a> checks for the presence of that file. If transfer of a resource
1351: is actively going on, the controlling handler receives an error
1352: message, waits for a few seconds, and then calls the replication
1353: routine again. If the resource is still in transfer, the client will
1354: receive the message "Service currently not available".
1355:
1356: In the next step (Step B<D1b>), the replication routine checks if the
1357: URL is locally present. If it is, the replication routine returns OK
1358: to the controlling handler, which in turn passes the request on to the
1359: next handler in the chain.
1360:
1361: If the resource is not locally present, the Home Server of the
1362: resource author (as extracted from the URL) is determined (Step
B<D2>). This is done by contacting all library servers in the author's
1364: domain (as determined from the lookup table, see Fig. 1.1.2B). In Step
1365: B<D2b> a query is sent to the remote server whether or not it is the
1366: Home Server of the author (in our current implementation, an
1367: additional cache is used to store already identified Home Servers (not
1368: shown in the figure)). In Step B<D2c>, the remote server answers the
1369: query with True or False. If the Home Server was found, the routine
1370: continues, otherwise it contacts the next server (Step D2a). If no
1371: server could be found, a "File not Found" error message is issued. In
1372: our current implementation, in this step the Home Server is also
1373: written into a cache for faster access if resources by the same author
1374: are needed again (not shown in the figure).
1375:
1376: =begin latex
1377:
1378: \begin{figure}
1379: \includegraphics[width=0.75\paperwidth,keepaspectratio]{Dynamic_Replication_Change}
1380: \caption{\label{Dynamic_Replication_Change}Dynamic Replication: Change} \end{figure}
1381:
1382: =end latex
1383:
1384: In Step B<D3a>, the routine sends a subscribe command for the URL to
1385: the Home Server of the author. The Home Server first determines if the
1386: resource is present, and if the access privileges allow it to be
1387: copied to the requesting server (B<D3b>). If this is true, the
1388: requesting server is added to the list of subscribed servers for that
1389: resource (Step B<D3c>). The Home Server will reply with either OK or
1390: an error message, which is determined in Step D4. If the remote
1391: resource was not present, the error message "File not Found" will be
1392: passed on to the client, if the access was not allowed, the error
1393: message "Access Denied" is passed on. If the operation succeeded, the
1394: requesting server sends an HTTP request for the resource out of the
1395: C</raw> server content resource area of the Home Server.
1396:
1397: The Home Server will then check if the requesting server is part of
1398: the network, and if it is subscribed to the resource (Step B<D5b>). If
1399: it is, it will send the resource via HTTP to the requesting server
1400: without any content handlers processing it (Step B<D5c>). The
1401: requesting server will store the incoming data in a temporary data
1402: file (Step B<D5a>) - this is the file that Step B<D1a> checks for. If
the transfer could not complete, an appropriate error message is sent
1404: to the client (Step B<D6>). Otherwise, the transferred temporary file
1405: is renamed as the actual resource, and the replication routine returns
1406: OK to the controlling handler (Step B<D7>).
1407:
1408: Fig. "Dynamic Replication: Change" depicts the process of modifying a
1409: resource. When an author publishes a new version of a resource, the
1410: Home Server will contact every server currently subscribed to the
1411: resource (Step B<U1>), as determined from the list of subscribed
1412: servers for the resource generated in Step B<D3c>. The subscribing
1413: servers will receive and acknowledge the update message (Step
1414: B<U1c>). The update mechanism finishes when the last subscribed server
1415: has been contacted (messages to unreachable servers are buffered).
1416:
1417: Each subscribing server will check if the resource in question had
1418: been accessed recently, that is, within a configurable amount of time
1419: (Step B<U2>).
1420:
1421: If the resource had not been accessed recently, the local copy of the
1422: resource is deleted (Step B<U3a>) and an unsubscribe command is sent
1423: to the Home Server (Step B<U3b>). The Home Server will check if the
1424: server had indeed originally subscribed to the resource (Step B<U3c>)
1425: and then delete the server from the list of subscribed servers for the
1426: resource (Step B<U3d>).
1427:
1428: If the resource had been accessed recently, the modified resource will
1429: be copied over using the same mechanism as in Step B<D5a> through
B<D7>, which represents Steps B<U4a> through B<U6> in the
1431: replication figure.
1432:
1.52 bowersj2 1433: =head2 Load Balancing
1.51 bowersj2 1434:
1.52 bowersj2 1435: X<load balancing>C<lond> provides a function to query the server's current loadavg. As
1.51 bowersj2 1436: a configuration parameter, one can determine the value of loadavg,
1437: which is to be considered 100%, for example, 2.00.
1438:
1439: Access servers can have a list of spare access servers,
1440: C</home/httpd/lonTabs/spares.tab>, to offload sessions depending on
own workload. This check is done by the login handler. It
re-directs the login information and session to the least busy spare
server if it is itself overloaded. An additional round-robin IP scheme
is possible. See Fig. "Load Balancing Example" for an example of a
load-balancing scheme.
1446:
1447: =begin latex
1448:
1449: \begin{figure}
1450: \includegraphics[width=0.75\paperwidth,keepaspectratio]{Load_Balancing_Example}
1451: \caption{\label{Load_Balancing_Example}Load Balancing Example} \end{figure}
1452:
1453: =end latex
1454:
1.23 harris41 1455: =head1 DESCRIPTION
1456:
1457: Provides persistent TCP connections to the other servers in the network
1458: through multiplexed domain sockets
1459:
1.31 harris41 1460: B<lonc> forks off children processes that correspond to the other servers
1461: in the network. Management of these processes can be done at the
1462: parent process level or the child process level.
1463:
1.51 bowersj2 1464: After forking off the children, B<lonc> the B<parent> executes a main
1465: loop which simply waits for processes to exit. As a process exits, a
1466: new process managing a link to the same peer as the exiting process is
1467: created.
1.33 foxr 1468:
1.31 harris41 1469: B<logs/lonc.log> is the location of log messages.
1470:
1471: The process management is now explained in terms of linux shell commands,
1472: subroutines internal to this code, and signal assignments:
1473:
1474: =over 4
1475:
1476: =item *
1477:
1478: PID is stored in B<logs/lonc.pid>
1479:
1480: This is the process id number of the parent B<lonc> process.
1481:
1482: =item *
1483:
1484: SIGTERM and SIGINT
1485:
1486: Parent signal assignment:
1487: $SIG{INT} = $SIG{TERM} = \&HUNTSMAN;
1488:
1489: Child signal assignment:
1490: $SIG{INT} = 'DEFAULT'; (and SIGTERM is DEFAULT also)
1491: (The child dies and a SIGALRM is sent to parent, awaking parent from slumber
1492: to restart a new child.)
1493:
1494: Command-line invocations:
1495: B<kill> B<-s> SIGTERM I<PID>
1496: B<kill> B<-s> SIGINT I<PID>
1497:
1498: Subroutine B<HUNTSMAN>:
1499: This is only invoked for the B<lonc> parent I<PID>.
1500: This kills all the children, and then the parent.
1501: The B<lonc.pid> file is cleared.
1502:
1503: =item *
1504:
1505: SIGHUP
1506:
1507: Current bug:
1508: This signal can only be processed the first time
1509: on the parent process. Subsequent SIGHUP signals
1510: have no effect.
1511:
1512: Parent signal assignment:
1513: $SIG{HUP} = \&HUPSMAN;
1514:
1515: Child signal assignment:
1516: none (nothing happens)
1517:
1518: Command-line invocations:
1519: B<kill> B<-s> SIGHUP I<PID>
1520:
1521: Subroutine B<HUPSMAN>:
1522: This is only invoked for the B<lonc> parent I<PID>,
1523: This kills all the children, and then the parent.
1524: The B<lonc.pid> file is cleared.
1525:
1526: =item *
1527:
1528: SIGUSR1
1529:
1530: Parent signal assignment:
1531: $SIG{USR1} = \&USRMAN;
1532:
1533: Child signal assignment:
1534: $SIG{USR1}= \&logstatus;
1535:
1536: Command-line invocations:
1537: B<kill> B<-s> SIGUSR1 I<PID>
1538:
1539: Subroutine B<USRMAN>:
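When invoked for the B<lonc> parent I<PID>, the connection attempt
counters are reset, a new child is started for every server whose
attempt limit had been reached, SIGUSR1 is sent to all the children,
and the status of each connection is logged.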
1543:
1.23 harris41 1544:
1.31 harris41 1545: =back
1.23 harris41 1546:
1547: =cut
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>