--- loncom/build/lpml_parse.pl 2002/01/09 22:18:16 1.35 +++ loncom/build/lpml_parse.pl 2002/04/21 23:36:17 1.46 @@ -1,11 +1,20 @@ #!/usr/bin/perl +# -------------------------------------------------------- Documentation notice +# Run "perldoc ./lpml_parse.pl" in order to best view the software +# documentation internalized in this program. + +# --------------------------------------------------------- Distribution notice +# This script is distributed with the LPML software project available at +# http://lpml.sourceforge.net + +# --------------------------------------------------------- License Information # The LearningOnline Network with CAPA # lpml_parse.pl - Linux Packaging Markup Language parser # -# $Id: lpml_parse.pl,v 1.35 2002/01/09 22:18:16 harris41 Exp $ +# $Id: lpml_parse.pl,v 1.46 2002/04/21 23:36:17 harris41 Exp $ # -# Written by Scott Harrison, harris41@msu.edu +# Written by Scott Harrison, codeharrison@yahoo.com # # Copyright Michigan State University Board of Trustees # @@ -37,7 +46,7 @@ # 11/4,11/5,11/6,11/7,11/16,11/17 - Scott Harrison # 12/2,12/3,12/4,12/5,12/6,12/13,12/19,12/29 - Scott Harrison # YEAR=2002 -# 1/8,1/9 - Scott Harrison +# 1/8,1/9,1/29,1/31,2/5,3/21,4/8,4/12 - Scott Harrison # ### @@ -68,9 +77,12 @@ use HTML::TokeParser; my $usage=<; -my $parsestring = join('',@parsecontents); -my $outstring; +# -------------------------- Start first pass through (just gather information) +my @parsecontents=<>; +my $parsestring=join('',@parsecontents); # Need to make a pass through and figure out what defaults are -# overrided. Top-down overriding strategy (leaves don't know -# about distant leaves). +# overrided. Top-down overriding strategy (tree leaves don't know +# about distant tree leaves). 
my @hierarchy; $hierarchy[0]=0; @@ -151,20 +164,32 @@ my $token; $parser = HTML::TokeParser->new(\$parsestring) or die('can\'t create TokeParser object'); $parser->xml_mode('1'); -my %hash; -my $key; -while ($token = $parser->get_token()) { +my %setting; + +# Values for the %setting hash +my $defaultset=1; # a default setting exists for a key +my $distset=2; # a distribution setting exists for a key + # (overrides default setting) + +my $key=''; # this is a unique key identifier (the token name with its + # coordinates inside the hierarchy) +while ($token = $parser->get_token()) { # navigate through $parsestring if ($token->[0] eq 'S') { $hloc++; $hierarchy[$hloc]++; $key=$token->[1].join(',',@hierarchy[0..($hloc-1)]); my $thisdist=' '.$token->[2]{'dist'}.' '; if ($thisdist eq ' default ') { - $hash{$key}=1; # there is a default setting for this key + $setting{$key}=$defaultset; } - elsif ($dist && $hash{$key}==1 && $thisdist=~/\s$dist\s/) { - $hash{$key}=2; # disregard default setting for this key if - # there is a directly requested distribution match + elsif (length($dist)>0 && + $setting{$key}==$defaultset && + $thisdist=~/\s$dist\s/) { + $setting{$key}=$distset; + # disregard default setting for this key if + # there is a directly requested distribution match + # (in other words, there must first be a default + # setting for a key in order for it to be overridden) } } if ($token->[0] eq 'E') { @@ -172,57 +197,79 @@ while ($token = $parser->get_token()) { } } -# --------------------------------------------------- Start second pass through -undef $hloc; -undef @hierarchy; -undef $parser; -$hierarchy[0]=0; +# - Start second pass through (clean up the string to allow for easy rendering) + +# The string is cleaned up so that there is no white-space surrounding any +# XML tag. White-space inside text 'T' elements is preserved. 
+ +# Clear up memory +undef($hloc); +undef(@hierarchy); +undef($parser); +$hierarchy[0]=0; # initialize hierarchy $parser = HTML::TokeParser->new(\$parsestring) or die('can\'t create TokeParser object'); $parser->xml_mode('1'); -my $cleanstring; -while ($token = $parser->get_token()) { - if ($token->[0] eq 'S') { +my $cleanstring; # contains the output of the second step +while ($token = $parser->get_token()) { # navigate through $parsestring + if ($token->[0] eq 'S') { # a start tag $hloc++; $hierarchy[$hloc]++; $key=$token->[1].join(',',@hierarchy[0..($hloc-1)]); - my $thisdist=' '.$token->[2]{'dist'}.' '; + + # Surround tagdist (the dist attribute of an XML tag) + # with white-space to allow for uniform searching a few + # lines below here. + my $tagdist=' '.$token->[2]{'dist'}.' '; + # This conditional clause is set up to ignore two sets # of invalid conditions before accepting entry into - # the cleanstring. - if ($hash{$key}==2 and - !($thisdist eq ' ' or $thisdist =~/\s$dist\s/)) { + # $cleanstring. + + # Condition #1: Ignore this part of the string if the tag + # has a superior distribution-specific setting and the tag + # being evaluated has a dist setting that is something other than + # blank or $dist. + if ($setting{$key}==$distset and + !($tagdist eq ' ' or $tagdist =~/\s$dist\s/)) { if ($token->[4]!~/\/>$/) { $parser->get_tag('/'.$token->[1]); $hloc--; } } - elsif ($thisdist ne ' ' and $thisdist!~/\s$dist\s/ and - !($thisdist eq ' default ' and $hash{$key}!=2)) { + # Condition #2: Ignore this part of the string if the tag has + # a dist setting that is not blank and does not match $dist and + # either is not default or the key has a prior $dist-specific + # setting. 
+ elsif ($tagdist ne ' ' and $tagdist!~/\s$dist\s/ and + !($tagdist eq ' default ' and $setting{$key}!=$distset)) { if ($token->[4]!~/\/>$/) { $parser->get_tag('/'.$token->[1]); $hloc--; } } + # In other words, output to $cleanstring if the tag is dist=default + # or if the tag is set to dist=$dist for the first time. And, always + # output when no dist attribute is present. else { $cleanstring.=$token->[4]; } - if ($token->[4]=~/\/>$/) { - $hloc--; - } } - if ($token->[0] eq 'E') { + # Note: this loop DOES work with self-closing (empty-element) style markup as well as + # start-tag/end-tag style markup since I always check for $token->[4] ending + # with "/>". + if ($token->[0] eq 'E') { # an end tag $cleanstring.=$token->[2]; $hloc--; } - if ($token->[0] eq 'T') { + if ($token->[0] eq 'T') { # text contents inside tags $cleanstring.=$token->[1]; } } $cleanstring=&trim($cleanstring); $cleanstring=~s/\>\s*\n\s*\
". "Summary of Source Repository". "". @@ -414,16 +450,6 @@ sub end { "". "\n"; -# START TEMP WAY -# join("\n",(map {"". -# "$_$categorycount{$_}$linecount{$_}$bytecount{$_}"} -# @categorynamelist)). -# "
 
". -# "Total Lines of Code: $totallinecount". -# "
 
". -# "Total Bytes: $totalbytecount". -# END TEMP WAY } if ($mode eq 'install') { return ''; @@ -970,7 +996,9 @@ sub format_directory { my ($chmod,$chown)=split(/\s/,$categoryhash{$categoryname}); return $directory="\n". "$categoryname". - " ". + "". + " ". "$chmod
$chown". "$thtml". "". @@ -1028,7 +1056,10 @@ sub format_description { sub format_files { my $text=$parser->get_text('/files'); $parser->get_tag('/files'); - if ($mode eq 'html') { + if ($mode eq 'MANIFEST') { + return $text; + } + elsif ($mode eq 'html') { return $directories="\n
 
". "
". "Files
 
". @@ -1144,19 +1175,15 @@ sub format_file { my $buildtest; $file_count++; $categorycount{$categoryname}++; - # START TEMP WAY -# if (-T "$sourcerootarg/$source") { -# $linecount{$categoryname}+=`wc -l $sourcerootarg/$source`; -# } -# my $bytesize=(-s "$sourcerootarg/$source"); -# $bytecount{$categoryname}+=$bytesize; - # END TEMP WAY if ($source) { $parser->get_tag('/file'); - if ($mode eq 'html') { + if ($mode eq 'MANIFEST') { + return $source."\n"; + } + elsif ($mode eq 'html') { return ($file="\n". "". - " ". @@ -1256,7 +1283,8 @@ END $logcmd.' && echo "'. 'Configuration source file does not exist '. ''.$sourceroot.'/'.$source.'"'. - "$logcmd); } && perl verifymodown.pl ${targetroot}/${target} \"$categoryhash{$categoryname}\"$logcmd;\n\n"; + "$logcmd); } && perl verifymodown.pl ${targetroot}/${target} \"". + "$categoryhash{$categoryname}\"$logcmd;\n\n"; } elsif ($mode eq 'build' && $build) { push @buildall,$sourceroot.'/'.$source; @@ -1361,23 +1389,14 @@ sub format_fileglob { my @semi=($filenames2=~/(\;)/g); $fileglobnames_count+=scalar(@semi)+1; $categorycount{$categoryname}+=scalar(@semi)+1; - # START TEMP WAY -# for my $f (split(/\;/,$filenames2)) { -# if (-T "$sourcerootarg/$sourcedir/$f") { -# $linecount{$categoryname}+=`wc -l $sourcerootarg/$sourcedir/$f`; -# open OUT,">>/tmp/junk123"; -# print OUT "$linecount{$categoryname} $categoryname $sourcerootarg/$sourcedir/$f\n"; -# close OUT; -# } -# my $bytesize=(-s "$sourcerootarg/$sourcedir/$f"); -# $bytecount{$categoryname}+=$bytesize; -# } - # END TEMP WAY if ($sourcedir) { $parser->get_tag('/fileglob'); - if ($mode eq 'html') { + if ($mode eq 'MANIFEST') { + return join("\n",(map {"$sourcedir$_"} split(/\;/,$filenames2)))."\n"; + } + elsif ($mode eq 'html') { return $fileglob="\n". - " ". 
@@ -1493,6 +1512,7 @@ sub format_build { if ($text) { $parser->get_tag('/build'); $build=$sourceroot.'/'.$text.';'.$tokeninfo[2]{'trigger'}; + $build=~s/([^\\])\\\s+/$1/g; # allow for lines split onto new lines } return ''; } @@ -1551,7 +1571,7 @@ sub format_filenames { } return ''; } -# ------------------------------------------------ Format specialnotice section +# ----------------------------------------------- Format specialnotices section sub format_specialnotices { $parser->get_tag('/specialnotices'); return ''; @@ -1585,14 +1605,17 @@ sub trim { # ----------------------------------- POD (plain old documentation, CPAN style) +=pod + =head1 NAME lpml_parse.pl - This is meant to parse files meeting the lpml document type. -See lpml.dtd. LPML=Linux Packaging Markup Language. =head1 SYNOPSIS -Usage is for lpml file to come in through standard input. + | perl lpml_parse.pl + +Usage is for the lpml file to come in through standard input. =over 4 @@ -1624,19 +1647,57 @@ Only the 1st argument is mandatory for t Example: cat ../../doc/loncapafiles.lpml |\\ -perl lpml_parse.pl html default /home/sherbert/loncapa /tmp/install +perl lpml_parse.pl html runtime default /home/sherbert/loncapa /tmp/install =head1 DESCRIPTION -I am using a multiple pass-through approach to parsing -the lpml file. This saves memory and makes sure the server -will never be overloaded. +The general flow of the script is to get command line arguments, run through +the XML document three times, and output according to any desired mode: +install, configinstall, build, rpm, dpkg, htmldoc, textdoc, and status. + +A number of coding decisions are made according to the following principle: +installation software must be stand-alone. Therefore, for instance, I try +not to use the GetOpt::Long module or any other perl modules. (I do however +use HTML::TokeParser.) I also have tried to keep all the MODES of +parsing inside this file. 
Therefore, format_TAG subroutines are fairly +lengthy with their conditional logic. A more "elegant" solution might +be to dynamically register the parsing mode and subroutines, or maybe even work +with stylesheets. However, in order to make this the installation back-bone +of choice, there are advantages to HAVING EVERYTHING IN ONE FILE. +This way, the LPML installation software does not have to rely on OTHER +installation software (a chicken-and-egg problem). Besides, I would +suggest the modes of parsing are fairly constant: install, configinstall, +build, rpm, dpkg, htmldoc, textdoc, and status. + +Another coding decision is about using a multiple pass-through approach to +parsing the lpml file. This saves memory and makes sure the server will never +be overloaded. During the first pass-through, the script gathers the information +needed to resolve which tags, with which 'dist=' attributes, are to be used. +During the second pass-through, the script cleans up white-space surrounding +the XML tags, and filters through the tags based on information regarding the +'dist=' attributes (information gathered in the first pass-through). +The third and final pass-through involves formatting and rendering the XML +into whatever output mode is chosen: install, configinstall, build, rpm, dpkg, +htmldoc, textdoc, and status. + +The hierarchy mandated by the DTD does not always correspond to the hierarchy +that is sensible for a Makefile. For instance, in a Makefile it is sensible +that soft-links are installed after files. However, in an LPML document, it +is sensible that files and links be considered together and the writer of the +LPML document should be free to place things in whatever order makes best +sense in terms of LOOKING at the information. The complication that arises +is that the parser needs to have a memory for passing values from +leaves on the XML tree to higher-up branches. 
Currently, this memory is +hard-coded (like with the @links array), but it may benefit from a more +formal approach in the future. =head1 README -I am using a multiple pass-through approach to parsing -the lpml file. This saves memory and makes sure the server -will never be overloaded. +This parses an LPML file to generate information useful for +source to target installation, compilation, filesystem status +checking, RPM and Debian software packaging, and documentation. + +More information on LPML is available at http://lpml.sourceforge.net. =head1 PREREQUISITES @@ -1650,6 +1711,14 @@ linux =head1 SCRIPT CATEGORIES -Packaging/Administrative +UNIX/System_administration + +=head1 AUTHOR + + Scott Harrison + codeharrison@yahoo.com + +Please let me know how/if you are finding this script useful and +any/all suggestions. -Scott =cut