--- loncom/build/xfml_parse.pl 2002/01/29 10:42:42 1.1 +++ loncom/build/xfml_parse.pl 2002/04/08 10:52:24 1.4 @@ -1,16 +1,67 @@ #!/usr/bin/perl +# -------------------------------------------------------- Documentation notice +# Run "perldoc ./lpml_parse.pl" in order to best view the software +# documentation internalized in this program. + +# --------------------------------------------------------- License Information +# The LearningOnline Network with CAPA +# piml_parse.pl - Linux Packaging Markup Language parser +# +# $Id: xfml_parse.pl,v 1.4 2002/04/08 10:52:24 harris41 Exp $ +# +# Written by Scott Harrison, codeharrison@yahoo.com +# +# Copyright Michigan State University Board of Trustees +# +# This file is part of the LearningOnline Network with CAPA (LON-CAPA). +# +# LON-CAPA is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# LON-CAPA is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with LON-CAPA; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# +# /home/httpd/html/adm/gpl.txt +# +# http://www.lon-capa.org/ +# # YEAR=2002 -# 1/26,1/27,1/28 - Scott Harrison +# 1/26,1/27,1/28,1/29,1/30,1/31,2/20,4/8 - Scott Harrison +# +### # Read in 2 XML file; first is the filter specification, the second # is the XML file to be filtered +############################################################################### +## ## +## ORGANIZATION OF THIS PERL SCRIPT ## +## 1. Notes ## +## 2. Read in filter file ## +## 3. Initialize and clear conditions ## +## 4. Run through and apply clauses ## +## ## +############################################################################### + +# ----------------------------------------------------------------------- Notes +# +# This is meant to parse files meeting the xfml document type. +# See xfml.dtd. XFML=XML Filtering Markup Language. + use HTML::TokeParser; use strict; unless (@ARGV) { - print <; my $parsestring=join('',@lines); undef @lines; +open(IN,"<$tofilter"); my @lines=; +my $parsestring=join('',@lines); undef @lines; close IN; my $parser = HTML::TokeParser->new(\$parsestring) or die('can\'t create TokeParser object'); $parser->xml_mode('1'); -# Define handling methods for mode-dependent text rendering - +# --------------------------------------------- initialize and clear conditions my %conditions; &cc; +# Define handling methods for mode-dependent text rendering $parser->{textify}={ - xfml => \&format_xfml, + 'xfml' => \&format_xfml, 'when:name' => \&format_when_name, 'when:attribute' => \&format_when_attribute, 'when:cdata' => \&format_when_cdata, - 'choice:include' => \&format_choice_include, 'choice:exclude' => \&format_choice_exclude, + 'clause' => \&format_clause, }; my $text; @@ -44,169 +97,156 @@ my $xfml; my $wloc=0; my %eha; -while (my $token = $parser->get_tag('xfml')) { - &format_xfml(@{$token}); - $text = $parser->get_text('/xfml'); -# print $xfml; -# print $text; - $token = $parser->get_tag('/xfml'); -} - -open IN,"<$tofilter"; -my @lines2=; close IN; my $parsestring2=join('',@lines2); undef @lines2; -$parser = HTML::TokeParser->new(\$parsestring2) or +# ----------------------------------------------- Run through and apply clauses +my @lines2=<>; my $output=join('',@lines2); undef @lines2; +my $lparser = HTML::TokeParser->new(\$output) or die('can\'t create TokeParser object'); -$parser->xml_mode('1'); - -my $token; -my $hloc=0; -my %ts; -my $tr; -my $echild=0; -my $exclude=0; -my $excluden=0; -my $excludea=0; -my $et=0; -my $cdata=''; -while ($token = $parser->get_token()) { -# from HTML::TokeParser documentation: -# ["S", $tag, %$attr, @$attrseq, $text] -# ["E", $tag, $text] -# ["T", $text, $is_data] -# ["C", $text] -# ["D", $text] -# ["PI", $token0, $text] -# push @{$eha{$tokeninfo[2]->{'nodename'}}->{'name'}}, -# @{$conditions{'name'}}; -# push @{$eha{$tokeninfo[2]->{'nodename'}}->{'attribute'}}, -# @{$conditions{'attribute'}}; -# push @{$eha{$tokeninfo[2]->{'nodename'}}->{'value'}}, -# @{$conditions{'value'}}; -# push @{$eha{$tokeninfo[2]->{'nodename'}}->{'cdata'}}, -# @{$conditions{'cdata'}}; - if ($token->[0] eq 'D') { - print $token->[1]; - } - elsif ($token->[0] eq 'C') { - print $token->[1]; - } - elsif ($token->[0] eq 'S') { - $cdata=''; - $hloc++; -# if token can be excluded, then pretend it is until all conditions are -# run (eha); then output during end tag processing -# else, output - -# a token can be excluded when it is an eh key, or a child node of -# an eh key - - if ($eh{$token->[1]}) { - $echild=$token->[1]; -# print "ECHILD=$echild\n"; +$lparser->xml_mode('1'); +my $parsestring2; +while (my $token = $parser->get_tag('clause')) { + $parsestring2=$output; + $lparser = HTML::TokeParser->new(\$parsestring2); + $lparser->xml_mode('1'); + $output=''; + &format_clause(@{$token}); + $text = $parser->get_text('/clause'); + $token = $parser->get_tag('/clause'); + + my $token=''; + my $ttype=''; + my $excludeflag=0; + my $outcache=''; + while ($token = $lparser->get_token()) { + if ($token->[0] eq 'D') { $ttype='D'; $output.=$token->[1]; } + elsif ($token->[0] eq 'C') { $ttype='C'; $output.=$token->[1]; } + elsif ($token->[0] eq 'T') { + if ($ttype eq 'D' or $ttype eq 'C' or $ttype eq 'S' + or $ttype eq 'E') { + $output.=$token->[1]; + } + else { + $outcache.=$token->[1]; + } } - if ($echild) { - # run through names for echild - # then attributes and/or values and/or cdata - my $name=$token->[1]; - my @attributes=@{$token->[3]}; - my %atthash=%{$token->[2]}; - foreach my $namemlist (@{$eha{$echild}->{'name'}}) { - foreach my $namematch (@{$namemlist}) { - my $nm=$namematch; $nm=~s/^.//; $nm=~s/.$//; - if ($name=~/$nm/) { -# print "NMATCH: $nm ($name)\n"; - $excluden++; - foreach my $attributemlist - (@{$eha{$echild}->{'attribute'}}) { - foreach my $attributematch - (@{$attributemlist}) { - my ($an,$am)= - split(/\=/,$attributematch,2); - $am=~s/^.//; - $am=~s/.$//; -# print 'AM:'."($an,$am)\t"; -# print 'ATT:'.join(',',%atthash)."\n"; - if ($atthash{$an}) { - if ($atthash{$an}=~/$am/) { - $excludea++; -# print "AMATCH: $am (". -# join(',', -# @attributes) -# ."\n"; - } - } - } - } - } - } + elsif ($token->[0] eq 'S') { + if ($eh{$token->[1]} or $excludeflag==1) { + $ttype=''; + $excludeflag=1; + $outcache.=$token->[4]; + } + else { + $ttype='S'; + $output.=$token->[4]; + } + if ($excludeflag==1) { + } - $tr.=$token->[4]; } - else { - print $token->[4]; + elsif ($token->[0] eq 'E') { + if ($eh{$token->[1]} and $excludeflag==1) { + $ttype='E'; + $excludeflag=0; + $outcache.=$token->[2]; + my $retval=&evalconditions($outcache); + if (&evalconditions($outcache)) { + $output.=$outcache; + } + else { + $output.=''; + } + $outcache=''; + } + elsif ($excludeflag==1) { + $ttype=''; + $outcache.=$token->[2]; + } + else { + $output.=$token->[2]; + $ttype='E'; + } } } - elsif ($token->[0] eq 'E') { - if ($echild) { - $tr.=$token->[2]; - if ($excluden) { - foreach my $cdatamlist (@{$eha{$echild}->{'cdata'}}) { - foreach my $cdatamatch (@{$cdatamlist}) { -# print "CDATA: $cdatamatch, $cdata\n"; - my $cm=$cdatamatch; - my $not=0; - if ($cm=~/\!/) { - $not=1; - $cm=~s/^.//; + &cc; +} +print $output; + +# -------------------------------------------------------------- evalconditions +sub evalconditions { + my ($parsetext)=@_; + my $eparser = HTML::TokeParser->new(\$parsetext); + unless (@{$conditions{'name'}} or + @{$conditions{'attribute'}}) { + return 0; + } + my $nameflag=0; + my $cdataflag=0; + my $matchflag=0; + my $Ttoken=''; + while (my $token = $eparser->get_token()) { + if ($token->[0] eq 'S') { + foreach my $name (@{$conditions{'name'}}) { + my $flag=0; + my $match=$name; + if ($match=~/^\!/) { + $match=~s/^\!//g; + $flag=1; + } + $match=~s/^\///g; + $match=~s/\/$//g; + if ((!$flag and $token->[1]=~/$match/) or + ($flag and $token->[1]!~/$match/)) { + $nameflag=1; + } + } + $Ttoken=''; + } + elsif ($token->[0] eq 'E') { + foreach my $name (@{$conditions{'name'}}) { + my $flag=0; + my $match=$name; + if ($match=~/^\!/) { + $match=~s/^\!//g; + $flag=1; + } + $match=~s/^\///g; + $match=~s/\/$//g; + if ((!$flag and $token->[1]=~/$match/) or + ($flag and $token->[1]!~/$match/)) { + foreach my $cdata (@{$conditions{'cdata'}}) { + my $flag=0; + my $match=$cdata; + if ($match=~/^\!/) { + $match=~s/^\!//g; + $flag=1; } - $cm=~s/^.//; $cm=~s/.$//; - if ((!$not and $cdata!~/$cm/) - or ($not and $cdata=~/$cm/)) { -# print "CMISMATCH: $cm ($cdata)\n"; + $match=~s/^\///g; + $match=~s/\/$//g; + if ((!$flag and $Ttoken=~/$match/) or + ($flag and $Ttoken!~/$match/)) { + $cdataflag=1; } - elsif (($not and $cdata!~/$cm/) - or (!$not and $cdata=~/$cm/)) { - $exclude++; + } + if (@{$conditions{'cdata'}}) { + if ($cdataflag) { + return 0; + } + } + else { + if ($nameflag) { + return 0; } } + $nameflag=0; } } } - if ($eh{$token->[1]}) { - $echild=0; - if (!$exclude and !$excludea) { - print $tr; -# print $token->[2]; - $tr=''; - } - elsif ($exclude>0 or $excludea>0) { -# print "EXCLUDING $token->[1] $excludea $excluden\n"; - $exclude=0; $excluden=0; $excludea=0; - $tr=''; + elsif ($token->[0] eq 'T') { + if ($nameflag) { + $Ttoken.=$token->[1]; } - $exclude=0; $excluden=0; $excludea=0; - } - else { - if ($echild) { -# $tr.=$token->[2]; - } - else { - print $token->[2]; - $tr=''; - } - } - $hloc--; - } - elsif ($token->[0] eq 'T') { - if ($echild) { - $tr.=$token->[1]; - $cdata=$token->[1]; - } - else { - print $token->[1]; - $tr=''; } } + return 1; } # ------------------------------------------------------------ clear conditions @@ -215,6 +255,7 @@ sub cc { @{$conditions{'attribute'}}=(); pop @{$conditions{'attribute'}}; @{$conditions{'value'}}=(); pop @{$conditions{'value'}}; @{$conditions{'cdata'}}=(); pop @{$conditions{'cdata'}}; + %eh=(1,1); delete $eh{1}; } # --------------------------------------- remove starting and ending whitespace @@ -222,37 +263,31 @@ sub trim { my ($s)=@_; $s=~s/^\s*//; $s=~s/\s*$//; return $s; } + + + # --------------------------------------------------------- Format xfml section sub format_xfml { my (@tokeninfo)=@_; return ''; } +# ------------------------------------------------------- Format clause section +sub format_clause { + my (@tokeninfo)=@_; + return ''; +} + # ---------------------------------------------------- Format when:name section sub format_when_name { my (@tokeninfo)=@_; - $wloc++; +# $wloc++; my $att_match=$tokeninfo[2]->{'match'}; push @{$conditions{'name'}},$att_match; my $text=&trim($parser->get_text('/when:name')); $parser->get_tag('/when:name'); -# print 'Name Matching...'.$att_match; - $wloc--; - &cc unless $wloc; - return ''; -} - -# ----------------------------------------------- Format when:attribute section -sub format_when_attribute { - my (@tokeninfo)=@_; - $wloc++; - my $att_match=$tokeninfo[2]->{'match'}; - push @{$conditions{'attribute'}},$att_match; - my $text=&trim($parser->get_text('/when:attribute')); - $parser->get_tag('/when:attribute'); -# print 'Attribute Matching...'.$att_match; - $wloc--; - &cc unless $wloc; +# $wloc--; +# &cc unless $wloc; return ''; } @@ -261,21 +296,11 @@ sub format_when_cdata { my (@tokeninfo)=@_; $wloc++; my $att_match=$tokeninfo[2]->{'match'}; -# print 'Cdata Matching...'.$att_match; push @{$conditions{'cdata'}},$att_match; my $text=&trim($parser->get_text('/when:cdata')); $parser->get_tag('/when:cdata'); $wloc--; - &cc unless $wloc; - return ''; -} - -# ----------------------------------------------- Format choice:include section -sub format_choice_include { - my (@tokeninfo)=@_; - my $text=&trim($parser->get_text('/choice:include')); - $parser->get_tag('/choice:include'); - $ih{$tokeninfo[2]->{'match'}}++; +# &cc unless $wloc; return ''; } @@ -295,3 +320,69 @@ sub format_choice_exclude { [@{$conditions{'cdata'}}]; return ''; } + +# ----------------------------------- POD (plain old documentation, CPAN style) + +=pod + +=head1 NAME + +xfml_parse.pl - This is meant to parse files meeting the xfml document type. +See xfml.dtd. XFML=XML Filtering Markup Language. + +=head1 SYNOPSIS + +Usage is for lpml file to come in through standard input. + +=over 4 + +=item * + +1st argument is name of xfml file. + +=back + +Example: + + cat loncapafiles.lpml | perl xfml_parse.pl valid_hosts.xfml + +or + + perl xfml_parse.pl valid_hosts.xfml loncapafiles.lpml + +=head1 DESCRIPTION + +I am using a multiple pass-through approach to parsing +the xfml file. This saves memory and makes sure the server +will never be overloaded. + +=head1 README + +I am using a multiple pass-through approach to parsing +the xfml file. This saves memory and makes sure the server +will never be overloaded. + +=head1 PREREQUISITES + +HTML::TokeParser + +=head1 COREQUISITES + +=head1 OSNAMES + +linux + +=head1 SCRIPT CATEGORIES + +Packaging/Administrative + +=head1 AUTHOR + + Scott Harrison + codeharrison@yahoo.com + +Please let me know how/if you are finding this script useful and +any/all suggestions. -Scott + +=cut +