version 1.1, 2002/01/29 10:42:42
|
version 1.3, 2002/02/20 00:21:42
|
Line 1
|
Line 1
|
#!/usr/bin/perl |
#!/usr/bin/perl |
|
|
# YEAR=2002 |
# YEAR=2002 |
# 1/26,1/27,1/28 - Scott Harrison |
# 1/26,1/27,1/28,1/29,1/30,1/31 - Scott Harrison |
|
# |
|
### |
|
|
# Read in 2 XML file; first is the filter specification, the second |
# Read in 2 XML file; first is the filter specification, the second |
# is the XML file to be filtered |
# is the XML file to be filtered |
|
|
|
############################################################################### |
|
## ## |
|
## ORGANIZATION OF THIS PERL SCRIPT ## |
|
## 1. Notes ## |
|
## 2. Read in filter file ## |
|
## 3. Initialize and clear conditions ## |
|
## 4. Run through and apply clauses ## |
|
## ## |
|
############################################################################### |
|
|
|
# ----------------------------------------------------------------------- Notes |
|
# |
|
# This is meant to parse files meeting the xfml document type. |
|
# See xfml.dtd. XFML=XML Filtering Markup Language. |
|
|
use HTML::TokeParser; |
use HTML::TokeParser; |
use strict; |
use strict; |
|
|
Line 19 END
|
Line 36 END
|
} |
} |
|
|
my %eh; |
my %eh; |
my %ih; |
|
|
# ---------------------------------------------- Read in filter file from @ARGV |
my $tofilter=shift @ARGV; |
my $tofilter=shift @ARGV; |
my @lines=<>; my $parsestring=join('',@lines); undef @lines; |
open IN,"<$tofilter"; my @lines=<IN>; |
|
my $parsestring=join('',@lines); undef @lines; close IN; |
my $parser = HTML::TokeParser->new(\$parsestring) or |
my $parser = HTML::TokeParser->new(\$parsestring) or |
die('can\'t create TokeParser object'); |
die('can\'t create TokeParser object'); |
$parser->xml_mode('1'); |
$parser->xml_mode('1'); |
|
|
# Define handling methods for mode-dependent text rendering |
# --------------------------------------------- initialize and clear conditions |
|
|
my %conditions; &cc; |
my %conditions; &cc; |
|
|
|
# Define handling methods for mode-dependent text rendering |
$parser->{textify}={ |
$parser->{textify}={ |
xfml => \&format_xfml, |
'xfml' => \&format_xfml, |
'when:name' => \&format_when_name, |
'when:name' => \&format_when_name, |
'when:attribute' => \&format_when_attribute, |
'when:attribute' => \&format_when_attribute, |
'when:cdata' => \&format_when_cdata, |
'when:cdata' => \&format_when_cdata, |
'choice:include' => \&format_choice_include, |
|
'choice:exclude' => \&format_choice_exclude, |
'choice:exclude' => \&format_choice_exclude, |
|
'clause' => \&format_clause, |
}; |
}; |
|
|
my $text; |
my $text; |
Line 44 my $xfml;
|
Line 63 my $xfml;
|
my $wloc=0; |
my $wloc=0; |
my %eha; |
my %eha; |
|
|
while (my $token = $parser->get_tag('xfml')) { |
# ----------------------------------------------- Run through and apply clauses |
&format_xfml(@{$token}); |
my @lines2=<>; my $output=join('',@lines2); undef @lines2; |
$text = $parser->get_text('/xfml'); |
my $lparser = HTML::TokeParser->new(\$output) or |
# print $xfml; |
|
# print $text; |
|
$token = $parser->get_tag('/xfml'); |
|
} |
|
|
|
open IN,"<$tofilter"; |
|
my @lines2=<IN>; close IN; my $parsestring2=join('',@lines2); undef @lines2; |
|
$parser = HTML::TokeParser->new(\$parsestring2) or |
|
die('can\'t create TokeParser object'); |
die('can\'t create TokeParser object'); |
$parser->xml_mode('1'); |
$lparser->xml_mode('1'); |
|
my $parsestring2; |
my $token; |
while (my $token = $parser->get_tag('clause')) { |
my $hloc=0; |
$parsestring2=$output; |
my %ts; |
$lparser = HTML::TokeParser->new(\$parsestring2); |
my $tr; |
$lparser->xml_mode('1'); |
my $echild=0; |
$output=''; |
my $exclude=0; |
&format_clause(@{$token}); |
my $excluden=0; |
$text = $parser->get_text('/clause'); |
my $excludea=0; |
$token = $parser->get_tag('/clause'); |
my $et=0; |
|
my $cdata=''; |
my $token=''; |
while ($token = $parser->get_token()) { |
my $ttype=''; |
# from HTML::TokeParser documentation: |
my $excludeflag=0; |
# ["S", $tag, %$attr, @$attrseq, $text] |
my $outcache=''; |
# ["E", $tag, $text] |
while ($token = $lparser->get_token()) { |
# ["T", $text, $is_data] |
if ($token->[0] eq 'D') { $ttype='D'; $output.=$token->[1]; } |
# ["C", $text] |
elsif ($token->[0] eq 'C') { $ttype='C'; $output.=$token->[1]; } |
# ["D", $text] |
elsif ($token->[0] eq 'T') { |
# ["PI", $token0, $text] |
if ($ttype eq 'D' or $ttype eq 'C' or $ttype eq 'S' |
# push @{$eha{$tokeninfo[2]->{'nodename'}}->{'name'}}, |
or $ttype eq 'E') { |
# @{$conditions{'name'}}; |
$output.=$token->[1]; |
# push @{$eha{$tokeninfo[2]->{'nodename'}}->{'attribute'}}, |
} |
# @{$conditions{'attribute'}}; |
else { |
# push @{$eha{$tokeninfo[2]->{'nodename'}}->{'value'}}, |
$outcache.=$token->[1]; |
# @{$conditions{'value'}}; |
} |
# push @{$eha{$tokeninfo[2]->{'nodename'}}->{'cdata'}}, |
|
# @{$conditions{'cdata'}}; |
|
if ($token->[0] eq 'D') { |
|
print $token->[1]; |
|
} |
|
elsif ($token->[0] eq 'C') { |
|
print $token->[1]; |
|
} |
|
elsif ($token->[0] eq 'S') { |
|
$cdata=''; |
|
$hloc++; |
|
# if token can be excluded, then pretend it is until all conditions are |
|
# run (eha); then output during end tag processing |
|
# else, output |
|
|
|
# a token can be excluded when it is an eh key, or a child node of |
|
# an eh key |
|
|
|
if ($eh{$token->[1]}) { |
|
$echild=$token->[1]; |
|
# print "ECHILD=$echild\n"; |
|
} |
} |
if ($echild) { |
elsif ($token->[0] eq 'S') { |
# run through names for echild |
if ($eh{$token->[1]} or $excludeflag==1) { |
# then attributes and/or values and/or cdata |
$ttype=''; |
my $name=$token->[1]; |
$excludeflag=1; |
my @attributes=@{$token->[3]}; |
$outcache.=$token->[4]; |
my %atthash=%{$token->[2]}; |
} |
foreach my $namemlist (@{$eha{$echild}->{'name'}}) { |
else { |
foreach my $namematch (@{$namemlist}) { |
$ttype='S'; |
my $nm=$namematch; $nm=~s/^.//; $nm=~s/.$//; |
$output.=$token->[4]; |
if ($name=~/$nm/) { |
} |
# print "NMATCH: $nm ($name)\n"; |
if ($excludeflag==1) { |
$excluden++; |
|
foreach my $attributemlist |
|
(@{$eha{$echild}->{'attribute'}}) { |
|
foreach my $attributematch |
|
(@{$attributemlist}) { |
|
my ($an,$am)= |
|
split(/\=/,$attributematch,2); |
|
$am=~s/^.//; |
|
$am=~s/.$//; |
|
# print 'AM:'."($an,$am)\t"; |
|
# print 'ATT:'.join(',',%atthash)."\n"; |
|
if ($atthash{$an}) { |
|
if ($atthash{$an}=~/$am/) { |
|
$excludea++; |
|
# print "AMATCH: $am (". |
|
# join(',', |
|
# @attributes) |
|
# ."\n"; |
|
} |
|
} |
|
} |
|
} |
|
} |
|
} |
|
} |
} |
$tr.=$token->[4]; |
|
} |
} |
else { |
elsif ($token->[0] eq 'E') { |
print $token->[4]; |
if ($eh{$token->[1]} and $excludeflag==1) { |
|
$ttype='E'; |
|
$excludeflag=0; |
|
$outcache.=$token->[2]; |
|
my $retval=&evalconditions($outcache); |
|
if (&evalconditions($outcache)) { |
|
$output.=$outcache; |
|
} |
|
else { |
|
$output.='<!-- FILTERED OUT -->'; |
|
} |
|
$outcache=''; |
|
} |
|
elsif ($excludeflag==1) { |
|
$ttype=''; |
|
$outcache.=$token->[2]; |
|
} |
|
else { |
|
$output.=$token->[2]; |
|
$ttype='E'; |
|
} |
} |
} |
} |
} |
elsif ($token->[0] eq 'E') { |
&cc; |
if ($echild) { |
} |
$tr.=$token->[2]; |
print $output; |
if ($excluden) { |
|
foreach my $cdatamlist (@{$eha{$echild}->{'cdata'}}) { |
# -------------------------------------------------------------- evalconditions |
foreach my $cdatamatch (@{$cdatamlist}) { |
sub evalconditions { |
# print "CDATA: $cdatamatch, $cdata\n"; |
my ($parsetext)=@_; |
my $cm=$cdatamatch; |
my $eparser = HTML::TokeParser->new(\$parsetext); |
my $not=0; |
unless (@{$conditions{'name'}} or |
if ($cm=~/\!/) { |
@{$conditions{'attribute'}}) { |
$not=1; |
return 0; |
$cm=~s/^.//; |
} |
|
my $nameflag=0; |
|
my $cdataflag=0; |
|
my $matchflag=0; |
|
my $Ttoken=''; |
|
while (my $token = $eparser->get_token()) { |
|
if ($token->[0] eq 'S') { |
|
foreach my $name (@{$conditions{'name'}}) { |
|
my $flag=0; |
|
my $match=$name; |
|
if ($match=~/^\!/) { |
|
$match=~s/^\!//g; |
|
$flag=1; |
|
} |
|
$match=~s/^\///g; |
|
$match=~s/\/$//g; |
|
if ((!$flag and $token->[1]=~/$match/) or |
|
($flag and $token->[1]!~/$match/)) { |
|
$nameflag=1; |
|
} |
|
} |
|
$Ttoken=''; |
|
} |
|
elsif ($token->[0] eq 'E') { |
|
foreach my $name (@{$conditions{'name'}}) { |
|
my $flag=0; |
|
my $match=$name; |
|
if ($match=~/^\!/) { |
|
$match=~s/^\!//g; |
|
$flag=1; |
|
} |
|
$match=~s/^\///g; |
|
$match=~s/\/$//g; |
|
if ((!$flag and $token->[1]=~/$match/) or |
|
($flag and $token->[1]!~/$match/)) { |
|
foreach my $cdata (@{$conditions{'cdata'}}) { |
|
my $flag=0; |
|
my $match=$cdata; |
|
if ($match=~/^\!/) { |
|
$match=~s/^\!//g; |
|
$flag=1; |
|
} |
|
$match=~s/^\///g; |
|
$match=~s/\/$//g; |
|
if ((!$flag and $Ttoken=~/$match/) or |
|
($flag and $Ttoken!~/$match/)) { |
|
$cdataflag=1; |
} |
} |
$cm=~s/^.//; $cm=~s/.$//; |
} |
if ((!$not and $cdata!~/$cm/) |
if (@{$conditions{'cdata'}}) { |
or ($not and $cdata=~/$cm/)) { |
if ($cdataflag) { |
# print "CMISMATCH: $cm ($cdata)\n"; |
return 0; |
} |
} |
elsif (($not and $cdata!~/$cm/) |
} |
or (!$not and $cdata=~/$cm/)) { |
else { |
$exclude++; |
if ($nameflag) { |
|
return 0; |
} |
} |
} |
} |
|
$nameflag=0; |
} |
} |
} |
} |
} |
} |
if ($eh{$token->[1]}) { |
elsif ($token->[0] eq 'T') { |
$echild=0; |
if ($nameflag) { |
if (!$exclude and !$excludea) { |
$Ttoken.=$token->[1]; |
print $tr; |
|
# print $token->[2]; |
|
$tr=''; |
|
} |
|
elsif ($exclude>0 or $excludea>0) { |
|
# print "EXCLUDING $token->[1] $excludea $excluden\n"; |
|
$exclude=0; $excluden=0; $excludea=0; |
|
$tr=''; |
|
} |
|
$exclude=0; $excluden=0; $excludea=0; |
|
} |
|
else { |
|
if ($echild) { |
|
# $tr.=$token->[2]; |
|
} |
|
else { |
|
print $token->[2]; |
|
$tr=''; |
|
} |
} |
} |
} |
$hloc--; |
|
} |
|
elsif ($token->[0] eq 'T') { |
|
if ($echild) { |
|
$tr.=$token->[1]; |
|
$cdata=$token->[1]; |
|
} |
|
else { |
|
print $token->[1]; |
|
$tr=''; |
|
} |
|
} |
} |
|
return 1; |
} |
} |
|
|
# ------------------------------------------------------------ clear conditions |
# ------------------------------------------------------------ clear conditions |
Line 215 sub cc {
|
Line 221 sub cc {
|
@{$conditions{'attribute'}}=(); pop @{$conditions{'attribute'}}; |
@{$conditions{'attribute'}}=(); pop @{$conditions{'attribute'}}; |
@{$conditions{'value'}}=(); pop @{$conditions{'value'}}; |
@{$conditions{'value'}}=(); pop @{$conditions{'value'}}; |
@{$conditions{'cdata'}}=(); pop @{$conditions{'cdata'}}; |
@{$conditions{'cdata'}}=(); pop @{$conditions{'cdata'}}; |
|
%eh=(1,1); delete $eh{1}; |
} |
} |
|
|
# --------------------------------------- remove starting and ending whitespace |
# --------------------------------------- remove starting and ending whitespace |
Line 222 sub trim {
|
Line 229 sub trim {
|
my ($s)=@_; $s=~s/^\s*//; $s=~s/\s*$//; return $s; |
my ($s)=@_; $s=~s/^\s*//; $s=~s/\s*$//; return $s; |
} |
} |
|
|
|
|
|
|
|
|
# --------------------------------------------------------- Format xfml section |
# --------------------------------------------------------- Format xfml section |
sub format_xfml { |
sub format_xfml { |
my (@tokeninfo)=@_; |
my (@tokeninfo)=@_; |
return ''; |
return ''; |
} |
} |
|
|
|
# ------------------------------------------------------- Format clause section |
|
sub format_clause { |
|
my (@tokeninfo)=@_; |
|
return ''; |
|
} |
|
|
# ---------------------------------------------------- Format when:name section |
# ---------------------------------------------------- Format when:name section |
sub format_when_name { |
sub format_when_name { |
my (@tokeninfo)=@_; |
my (@tokeninfo)=@_; |
$wloc++; |
# $wloc++; |
my $att_match=$tokeninfo[2]->{'match'}; |
my $att_match=$tokeninfo[2]->{'match'}; |
push @{$conditions{'name'}},$att_match; |
push @{$conditions{'name'}},$att_match; |
my $text=&trim($parser->get_text('/when:name')); |
my $text=&trim($parser->get_text('/when:name')); |
$parser->get_tag('/when:name'); |
$parser->get_tag('/when:name'); |
# print 'Name Matching...'.$att_match; |
# $wloc--; |
$wloc--; |
# &cc unless $wloc; |
&cc unless $wloc; |
|
return ''; |
|
} |
|
|
|
# ----------------------------------------------- Format when:attribute section |
|
sub format_when_attribute { |
|
my (@tokeninfo)=@_; |
|
$wloc++; |
|
my $att_match=$tokeninfo[2]->{'match'}; |
|
push @{$conditions{'attribute'}},$att_match; |
|
my $text=&trim($parser->get_text('/when:attribute')); |
|
$parser->get_tag('/when:attribute'); |
|
# print 'Attribute Matching...'.$att_match; |
|
$wloc--; |
|
&cc unless $wloc; |
|
return ''; |
return ''; |
} |
} |
|
|
Line 261 sub format_when_cdata {
|
Line 262 sub format_when_cdata {
|
my (@tokeninfo)=@_; |
my (@tokeninfo)=@_; |
$wloc++; |
$wloc++; |
my $att_match=$tokeninfo[2]->{'match'}; |
my $att_match=$tokeninfo[2]->{'match'}; |
# print 'Cdata Matching...'.$att_match; |
|
push @{$conditions{'cdata'}},$att_match; |
push @{$conditions{'cdata'}},$att_match; |
my $text=&trim($parser->get_text('/when:cdata')); |
my $text=&trim($parser->get_text('/when:cdata')); |
$parser->get_tag('/when:cdata'); |
$parser->get_tag('/when:cdata'); |
$wloc--; |
$wloc--; |
&cc unless $wloc; |
# &cc unless $wloc; |
return ''; |
|
} |
|
|
|
# ----------------------------------------------- Format choice:include section |
|
sub format_choice_include { |
|
my (@tokeninfo)=@_; |
|
my $text=&trim($parser->get_text('/choice:include')); |
|
$parser->get_tag('/choice:include'); |
|
$ih{$tokeninfo[2]->{'match'}}++; |
|
return ''; |
return ''; |
} |
} |
|
|