--- loncom/build/lpml_parse.pl	2002/01/09 22:18:16	1.35
+++ loncom/build/lpml_parse.pl	2002/04/21 23:36:17	1.46
@@ -1,11 +1,20 @@
 #!/usr/bin/perl
 
+# -------------------------------------------------------- Documentation notice
+# Run "perldoc ./lpml_parse.pl" in order to best view the software
+# documentation internalized in this program.
+
+# --------------------------------------------------------- Distribution notice
+# This script is distributed with the LPML software project available at
+# http://lpml.sourceforge.net
+
+# --------------------------------------------------------- License Information
 # The LearningOnline Network with CAPA
 # lpml_parse.pl - Linux Packaging Markup Language parser
 #
-# $Id: lpml_parse.pl,v 1.35 2002/01/09 22:18:16 harris41 Exp $
+# $Id: lpml_parse.pl,v 1.46 2002/04/21 23:36:17 harris41 Exp $
 #
-# Written by Scott Harrison, harris41@msu.edu
+# Written by Scott Harrison, codeharrison@yahoo.com
 #
 # Copyright Michigan State University Board of Trustees
 #
@@ -37,7 +46,7 @@
 # 11/4,11/5,11/6,11/7,11/16,11/17 - Scott Harrison
 # 12/2,12/3,12/4,12/5,12/6,12/13,12/19,12/29 - Scott Harrison
 # YEAR=2002
-# 1/8,1/9 - Scott Harrison
+# 1/8,1/9,1/29,1/31,2/5,3/21,4/8,4/12 - Scott Harrison
 #
 ###
 
@@ -68,9 +77,12 @@ use HTML::TokeParser;
 my $usage=<<END;
 **** ERROR ERROR ERROR ERROR ****
 Usage is for lpml file to come in through standard input.
-1st argument is the mode of parsing.
-2nd argument is the category permissions to use (runtime or development)
-3rd argument is the distribution (default,redhat6.2,debian2.2,redhat7.1,etc).
+1st argument is the mode of parsing:
+    install,configinstall,build,rpm,dpkg,htmldoc,textdoc,status
+2nd argument is the category permissions to use:
+    typical choices: runtime,development
+3rd argument is the distribution:
+    typical choices: default,redhat6.2,debian2.2,redhat7
 4th argument is to manually specify a sourceroot.
 5th argument is to manually specify a targetroot.
 
@@ -80,11 +92,13 @@ Example:
 
 cat ../../doc/loncapafiles.lpml |\\
 perl lpml_parse.pl html development default /home/sherbert/loncapa /tmp/install
+
+For more information, type "perldoc lpml_parse.pl".
 END
 
 # ------------------------------------------------- Grab command line arguments
 
-my $mode;
+my $mode='';
 if (@ARGV==5) {
     $mode = shift @ARGV;
 }
@@ -95,34 +109,34 @@ else {
     exit -1; # exit with error status
 }
 
-my $categorytype;
+my $categorytype='';
 if (@ARGV) {
     $categorytype = shift @ARGV;
 }
 
-my $dist;
+my $dist='';
 if (@ARGV) {
     $dist = shift @ARGV;
 }
 
-my $targetroot;
-my $sourceroot;
-my $targetrootarg;
-my $sourcerootarg;
+my $targetroot='';
+my $sourceroot='';
+my $targetrootarg='';
+my $sourcerootarg='';
 if (@ARGV) {
     $sourceroot = shift @ARGV;
 }
 if (@ARGV) {
     $targetroot = shift @ARGV;
 }
-$sourceroot=~s/\/$//;
-$targetroot=~s/\/$//;
+$sourceroot=~s/\/$//; # remove trailing directory slash
+$targetroot=~s/\/$//; # remove trailing directory slash
 $sourcerootarg=$sourceroot;
 $targetrootarg=$targetroot;
 
 my $logcmd='| tee -a WARNINGS';
 
-my $invocation;
+my $invocation; # Record how the program was invoked
 # --------------------------------------------------- Record program invocation
 if ($mode eq 'install' or $mode eq 'configinstall' or $mode eq 'build') {
     $invocation=(<<END);
@@ -130,19 +144,18 @@ if ($mode eq 'install' or $mode eq 'conf
 #             1st argument (mode) is: $mode
 #             2nd argument (category type) is: $categorytype
 #             3rd argument (distribution) is: $dist
-#             4th argument (targetroot) is: described below
-#             5th argument (sourceroot) is: described below
+#             4th argument (sourceroot) is: described below
+#             5th argument (targetroot) is: described below
 END
 }
 
-# ---------------------------------------------------- Start first pass through
-my @parsecontents = <>;
-my $parsestring = join('',@parsecontents);
-my $outstring;
+# -------------------------- Start first pass through (just gather information)
+my @parsecontents=<>;
+my $parsestring=join('',@parsecontents);
 
 # Need to make a pass through and figure out what defaults are
-# overrided.  Top-down overriding strategy (leaves don't know
-# about distant leaves).
+# overridden.  Top-down overriding strategy (tree leaves don't know
+# about distant tree leaves).
 
 my @hierarchy;
 $hierarchy[0]=0;
@@ -151,20 +164,32 @@ my $token;
 $parser = HTML::TokeParser->new(\$parsestring) or
     die('can\'t create TokeParser object');
 $parser->xml_mode('1');
-my %hash;
-my $key;
-while ($token = $parser->get_token()) {
+my %setting;
+
+# Values for the %setting hash
+my $defaultset=1; # a default setting exists for a key
+my $distset=2; # a distribution setting exists for a key
+               # (overrides default setting)
+
+my $key=''; # this is a unique key identifier (the token name with its
+            # coordinates inside the hierarchy)
+while ($token = $parser->get_token()) { # navigate through $parsestring
     if ($token->[0] eq 'S') {
 	$hloc++;
 	$hierarchy[$hloc]++;
 	$key=$token->[1].join(',',@hierarchy[0..($hloc-1)]);
 	my $thisdist=' '.$token->[2]{'dist'}.' ';
 	if ($thisdist eq ' default ') {
-	    $hash{$key}=1; # there is a default setting for this key
+	    $setting{$key}=$defaultset;
 	}
-	elsif ($dist && $hash{$key}==1 && $thisdist=~/\s$dist\s/) {
-	    $hash{$key}=2; # disregard default setting for this key if
-	                   # there is a directly requested distribution match
+	elsif (length($dist)>0 &&
+	       $setting{$key}==$defaultset &&
+	       $thisdist=~/\s$dist\s/) {
+	    $setting{$key}=$distset;
+                   # disregard default setting for this key if
+                   # there is a directly requested distribution match
+                   # (in other words, there must first be a default
+	           # setting for a key in order for it to be overridden)
 	}
     }
     if ($token->[0] eq 'E') {
@@ -172,57 +197,79 @@ while ($token = $parser->get_token()) {
     }
 }
 
-# --------------------------------------------------- Start second pass through
-undef $hloc;
-undef @hierarchy;
-undef $parser;
-$hierarchy[0]=0;
+# - Start second pass through (clean up the string to allow for easy rendering)
+
+# The string is cleaned up so that there is no white-space surrounding any
+# XML tag.  White-space inside text 'T' elements is preserved.
+
+# Clear up memory
+undef($hloc);
+undef(@hierarchy);
+undef($parser);
+$hierarchy[0]=0; # initialize hierarchy
 $parser = HTML::TokeParser->new(\$parsestring) or
     die('can\'t create TokeParser object');
 $parser->xml_mode('1');
-my $cleanstring;
-while ($token = $parser->get_token()) {
-    if ($token->[0] eq 'S') {
+my $cleanstring; # contains the output of the second step
+while ($token = $parser->get_token()) { # navigate through $parsestring
+    if ($token->[0] eq 'S') { # a start tag
 	$hloc++;
 	$hierarchy[$hloc]++;
 	$key=$token->[1].join(',',@hierarchy[0..($hloc-1)]);
-	my $thisdist=' '.$token->[2]{'dist'}.' ';
+
+	# Surround tagdist (the dist attribute of an XML tag)
+	# with white-space to allow for uniform searching a few
+	# lines below here.
+	my $tagdist=' '.$token->[2]{'dist'}.' ';
+
 	# This conditional clause is set up to ignore two sets
 	# of invalid conditions before accepting entry into
-	# the cleanstring.
-	if ($hash{$key}==2 and
-	    !($thisdist eq '  ' or $thisdist =~/\s$dist\s/)) {
+	# $cleanstring.
+
+	# Condition #1: Ignore this part of the string if the tag 
+	# has a superior distribution-specific setting and the tag
+	# being evaluated has a dist setting something other than
+	# blank or $dist.
+	if ($setting{$key}==$distset and
+	    !($tagdist eq '  ' or $tagdist =~/\s$dist\s/)) {
 	    if ($token->[4]!~/\/>$/) {
 		$parser->get_tag('/'.$token->[1]);
 		$hloc--;
 	    }
 	}
-	elsif ($thisdist ne '  ' and $thisdist!~/\s$dist\s/ and
-	       !($thisdist eq ' default ' and $hash{$key}!=2)) {
+	# Condition #2: Ignore this part of the string if the tag's dist
+	# attribute is not blank, does not match $dist, and either is not
+	# 'default' or the key already has a $dist-specific setting.
+	# (A 'default' tag is kept only when no $dist-specific setting exists.)
+	elsif ($tagdist ne '  ' and $tagdist!~/\s$dist\s/ and
+	       !($tagdist eq ' default ' and $setting{$key}!=$distset)) {
 	    if ($token->[4]!~/\/>$/) {
 		$parser->get_tag('/'.$token->[1]);
 		$hloc--;
 	    }
 	}
+	# In other words, output to $cleanstring if the tag has no dist
+	# attribute, if its dist attribute matches $dist, or if it is
+	# dist=default and no $dist-specific setting overrides it.
 	else {
 	    $cleanstring.=$token->[4];
 	}
-	if ($token->[4]=~/\/>$/) {
-	    $hloc--;
-	}
     }
-    if ($token->[0] eq 'E') {
+    # Note: this loop DOES work with <tag /> style markup as well as
+    # <tag></tag> style markup since I always check for $token->[4] ending
+    # with "/>".
+    if ($token->[0] eq 'E') { # an end tag
 	$cleanstring.=$token->[2];
 	$hloc--;
     }
-    if ($token->[0] eq 'T') {
+    if ($token->[0] eq 'T') { # text contents inside tags
 	$cleanstring.=$token->[1];
     }
 }
 $cleanstring=&trim($cleanstring);
 $cleanstring=~s/\>\s*\n\s*\</\>\</g;
 
-# ---------------------------------------------------- Start final pass through
+# -------------------------------------------- Start final (third) pass through
 
 # storage variables
 my $lpml;
@@ -281,10 +328,6 @@ my $link_count;
 my $fileglob_count;
 my $fileglobnames_count;
 my %categorycount;
-# START TEMP WAY
-#my %bytecount;  # TEMP WAY TO COUNT INFORMATION
-#my %linecount;  # TEMP WAY TO COUNT INFORMATION
-# END TEMP WAY
 
 my @buildall;
 my @buildinfo;
@@ -370,13 +413,6 @@ exit;
 # ------------------------ Final output at end of markup parsing and formatting
 sub end {
     if ($mode eq 'html') {
-	# START TEMP WAY
-#	my $totallinecount;
-#	my $totalbytecount;
-#	map {$totallinecount+=$linecount{$_};
-#	     $totalbytecount+=$bytecount{$_}}
-# 	  @categorynamelist;
-        # END TEMP WAY
 	return "<br />&nbsp;<br />".
 	    "<a name='summary' /><font size='+2'>Summary of Source Repository".
 	    "</font>".
@@ -414,16 +450,6 @@ sub end {
 	    "</table>".
 	    "</body></html>\n";
 
-# START TEMP WAY
-#	    join("\n",(map {"<tr><td><img src='$fab{$_}.gif' ".
-#		 "alt='$_ icon' /></td>".
-# 	         "<td>$_</td><td>$categorycount{$_}</td><td>$linecount{$_}</td><td>$bytecount{$_}</td></tr>"}
-#		@categorynamelist)).
-#	    "<br />&nbsp;<br />".
-#	    "Total Lines of Code: $totallinecount".
-#	    "<br />&nbsp;<br />".
-#	    "Total Bytes: $totalbytecount".
-# END TEMP WAY
     }
     if ($mode eq 'install') {
 	return '';
@@ -970,7 +996,9 @@ sub format_directory {
 	my ($chmod,$chown)=split(/\s/,$categoryhash{$categoryname});
 	return $directory="\n<tr><td rowspan='2' bgcolor='#ffffff'>".
 	    "$categoryname</td>".
-	    "<td rowspan='2' bgcolor='#ffffff'><!-- POSTEVAL [$categoryname] verify.pl directory /$targetdir $categoryhash{$categoryname} -->&nbsp;</td>".
+	    "<td rowspan='2' bgcolor='#ffffff'><!-- POSTEVAL [$categoryname] ".
+	    "verify.pl directory /$targetdir $categoryhash{$categoryname} -->".
+	    "&nbsp;</td>".
 	    "<td rowspan='2' bgcolor='#ffffff'>$chmod<br />$chown</td>".
 	    "<td bgcolor='#ffffff'>$thtml</td></tr>".
 	    "<tr><td bgcolor='#ffffff' colspan='[{{{{{DPATHLENGTH}}}}}]'>".
@@ -1028,7 +1056,10 @@ sub format_description {
 sub format_files {
     my $text=$parser->get_text('/files');
     $parser->get_tag('/files');
-    if ($mode eq 'html') {
+    if ($mode eq 'MANIFEST') {
+	return $text;
+    }
+    elsif ($mode eq 'html') {
 	return $directories="\n<br />&nbsp;<br />".
 	    "<a name='files' />".
 	    "<font size='+2'>Files</font><br />&nbsp;<br />".
@@ -1144,19 +1175,15 @@ sub format_file {
     my $buildtest;
     $file_count++;
     $categorycount{$categoryname}++;
-    # START TEMP WAY
-#    if (-T "$sourcerootarg/$source") {
-#	$linecount{$categoryname}+=`wc -l $sourcerootarg/$source`;
-#    }
-#    my $bytesize=(-s "$sourcerootarg/$source");
-#    $bytecount{$categoryname}+=$bytesize;
-    # END TEMP WAY
     if ($source) {
 	$parser->get_tag('/file');
-	if ($mode eq 'html') {
+	if ($mode eq 'MANIFEST') {
+	    return $source."\n";
+	}
+	elsif ($mode eq 'html') {
 	    return ($file="\n<!-- FILESORT:$target -->".
 		    "<tr>".
-		    "<td><!-- POSTEVAL [$categoryname] verify.pl file '$sourcerootarg' ".
+          "<td><!-- POSTEVAL [$categoryname] verify.pl file '$sourcerootarg' ".
 		    "'$targetrootarg' ".
 		    "'$source' '$target' ".
 		    "$categoryhash{$categoryname} -->&nbsp;</td><td>".
@@ -1256,7 +1283,8 @@ END
 		$logcmd.' && echo "'.
 		'Configuration source file does not exist '.
 		''.$sourceroot.'/'.$source.'"'.
-		"$logcmd); } && perl verifymodown.pl ${targetroot}/${target} \"$categoryhash{$categoryname}\"$logcmd;\n\n";
+	      "$logcmd); } && perl verifymodown.pl ${targetroot}/${target} \"".
+		"$categoryhash{$categoryname}\"$logcmd;\n\n";
 	}
 	elsif ($mode eq 'build' && $build) {
 	    push @buildall,$sourceroot.'/'.$source;
@@ -1361,23 +1389,14 @@ sub format_fileglob {
     my @semi=($filenames2=~/(\;)/g);
     $fileglobnames_count+=scalar(@semi)+1;
     $categorycount{$categoryname}+=scalar(@semi)+1;
-    # START TEMP WAY
-#    for my $f (split(/\;/,$filenames2)) {
-#	if (-T "$sourcerootarg/$sourcedir/$f") {
-#	    $linecount{$categoryname}+=`wc -l $sourcerootarg/$sourcedir/$f`;
-#	    open OUT,">>/tmp/junk123";
-#	    print OUT "$linecount{$categoryname} $categoryname $sourcerootarg/$sourcedir/$f\n";
-#	    close OUT;
-#	}
-#	my $bytesize=(-s "$sourcerootarg/$sourcedir/$f");
-#	$bytecount{$categoryname}+=$bytesize;
-#    }
-    # END TEMP WAY
     if ($sourcedir) {
 	$parser->get_tag('/fileglob');
-	if ($mode eq 'html') {
+	if ($mode eq 'MANIFEST') {
+         return join("\n",(map {"$sourcedir$_"} split(/\;/,$filenames2)))."\n";
+	}
+	elsif ($mode eq 'html') {
 	    return $fileglob="\n<tr>".
-		"<td><!-- POSTEVAL [$categoryname] verify.pl fileglob '$sourcerootarg' ".
+      "<td><!-- POSTEVAL [$categoryname] verify.pl fileglob '$sourcerootarg' ".
 		"'$targetrootarg' ".
 		"'$glob' '$sourcedir' '$filenames2' '$targetdir' ".
 		"$categoryhash{$categoryname} -->&nbsp;</td>".
@@ -1493,6 +1512,7 @@ sub format_build {
     if ($text) {
 	$parser->get_tag('/build');
 	$build=$sourceroot.'/'.$text.';'.$tokeninfo[2]{'trigger'};
+	$build=~s/([^\\])\\\s+/$1/g; # allow for lines split onto new lines
     }
     return '';
 }
@@ -1551,7 +1571,7 @@ sub format_filenames {
     }
     return '';
 }
-# ------------------------------------------------ Format specialnotice section
+# ----------------------------------------------- Format specialnotices section
 sub format_specialnotices {
     $parser->get_tag('/specialnotices');
     return '';
@@ -1585,14 +1605,17 @@ sub trim {
 
 # ----------------------------------- POD (plain old documentation, CPAN style)
 
+=pod
+
 =head1 NAME
 
 lpml_parse.pl - This is meant to parse files meeting the lpml document type.
-See lpml.dtd.  LPML=Linux Packaging Markup Language.
 
 =head1 SYNOPSIS
 
-Usage is for lpml file to come in through standard input.
+<STDIN> | perl lpml_parse.pl <MODE> <CATEGORY> <DIST> <SOURCE> <TARGET>
+
+Usage is for the lpml file to come in through standard input.
 
 =over 4
 
@@ -1624,19 +1647,57 @@ Only the 1st argument is mandatory for t
 Example:
 
 cat ../../doc/loncapafiles.lpml |\\
-perl lpml_parse.pl html default /home/sherbert/loncapa /tmp/install
+perl lpml_parse.pl html runtime default /home/sherbert/loncapa /tmp/install
 
 =head1 DESCRIPTION
 
-I am using a multiple pass-through approach to parsing
-the lpml file.  This saves memory and makes sure the server
-will never be overloaded.
+The general flow of the script is to get command line arguments, run through
+the XML document three times, and output according to any desired mode:
+install, configinstall, build, rpm, dpkg, htmldoc, textdoc, and status.
+
+A number of coding decisions are made according to the following principle:
+installation software must be stand-alone.  Therefore, for instance, I try
+not to use the Getopt::Long module or any other perl modules.  (I do however
+use HTML::TokeParser.)  I also have tried to keep all the MODES of
+parsing inside this file.  Therefore, format_TAG subroutines are fairly
+lengthy with their conditional logic.  A more "elegant" solution might
+be to dynamically register the parsing mode and subroutines, or maybe even work
+with stylesheets.  However, in order to make this the installation back-bone
+of choice, there are advantages for HAVING EVERYTHING IN ONE FILE.
+This way, the LPML installation software does not have to rely on OTHER
+installation software (a chicken versus the egg problem).  Besides, I would
+suggest the modes of parsing are fairly constant: install, configinstall,
+build, rpm, dpkg, htmldoc, textdoc, and status.
+
+Another coding decision is about using a multiple pass-through approach to
+parsing the lpml file.  This saves memory and makes sure the server will never
+be overloaded.  During the first pass-through, the script gathers the
+information needed to resolve which tags (by 'dist=' attribute) are to be used.
+During the second pass-through, the script cleans up white-space surrounding
+the XML tags, and filters through the tags based on information regarding the
+'dist=' attributes (information gathered in the first pass-through).
+The third and final pass-through involves formatting and rendering the XML
+into whatever XML mode is chosen: install, configinstall, build, rpm, dpkg,
+htmldoc, textdoc, and status.
+
+The hierarchy mandated by the DTD does not always correspond to the hierarchy
+that is sensible for a Makefile.  For instance, in a Makefile it is sensible
+that soft-links are installed after files.  However, in an LPML document, it
+is sensible that files and links be considered together and the writer of the
+LPML document should be free to place things in whatever order makes best
+sense in terms of LOOKING at the information.  The complication that arises
+is that the parser needs to have a memory for passing values from
+leaves on the XML tree to higher-up branches.  Currently, this memory is
+hard-coded (like with the @links array), but it may benefit from a more
+formal approach in the future.
 
 =head1 README
 
-I am using a multiple pass-through approach to parsing
-the lpml file.  This saves memory and makes sure the server
-will never be overloaded.
+This parses an LPML file to generate information useful for
+source to target installation, compilation, filesystem status
+checking, RPM and Debian software packaging, and documentation.
+
+More information on LPML is available at http://lpml.sourceforge.net.
 
 =head1 PREREQUISITES
 
@@ -1650,6 +1711,14 @@ linux
 
 =head1 SCRIPT CATEGORIES
 
-Packaging/Administrative
+UNIX/System_administration
+
+=head1 AUTHOR
+
+ Scott Harrison
+ codeharrison@yahoo.com
+
+Please let me know how/if you are finding this script useful and
+any/all suggestions.  -Scott
 
 =cut