version 1.1, 2002/01/29 10:42:42
|
version 1.2, 2002/02/01 10:56:41
|
Line 1
|
Line 1
|
#!/usr/bin/perl |
#!/usr/bin/perl |
|
|
# YEAR=2002 |
# YEAR=2002 |
# 1/26,1/27,1/28 - Scott Harrison |
# 1/26,1/27,1/28,1/29,1/30,1/31 - Scott Harrison |
|
# |
|
### |
|
|
# Read in 2 XML file; first is the filter specification, the second |
# Read in 2 XML file; first is the filter specification, the second |
# is the XML file to be filtered |
# is the XML file to be filtered |
|
|
|
############################################################################### |
|
## ## |
|
## ORGANIZATION OF THIS PERL SCRIPT ## |
|
## 1. Notes ## |
|
## 2. Get command line arguments ## |
|
## 3. First pass through (grab distribution-specific information) ## |
|
## 4. Second pass through (parse out what is not necessary) ## |
|
## 5. Third pass through (translate markup according to specified mode) ## |
|
## 6. Functions (most all just format contents of different markup tags) ## |
|
## 7. POD (plain old documentation, CPAN style) ## |
|
## ## |
|
############################################################################### |
|
|
|
# ----------------------------------------------------------------------- Notes |
|
# |
|
# I am using a multiple pass-through approach to parsing |
|
# the xfml file. This saves memory and makes sure the server |
|
# will never be overloaded. |
|
# |
|
# This is meant to parse files meeting the piml document type. |
|
# See xfml.dtd. XFML=XML Filtering Markup Language. |
|
|
use HTML::TokeParser; |
use HTML::TokeParser; |
use strict; |
use strict; |
|
|
Line 21 END
|
Line 45 END
|
my %eh; |
my %eh; |
my %ih; |
my %ih; |
my $tofilter=shift @ARGV; |
my $tofilter=shift @ARGV; |
my @lines=<>; my $parsestring=join('',@lines); undef @lines; |
open IN,"<$tofilter"; |
|
my @lines=<IN>; my $parsestring=join('',@lines); undef @lines; |
|
close IN; |
my $parser = HTML::TokeParser->new(\$parsestring) or |
my $parser = HTML::TokeParser->new(\$parsestring) or |
die('can\'t create TokeParser object'); |
die('can\'t create TokeParser object'); |
$parser->xml_mode('1'); |
$parser->xml_mode('1'); |
Line 47 my %eha;
|
Line 73 my %eha;
|
while (my $token = $parser->get_tag('xfml')) { |
while (my $token = $parser->get_tag('xfml')) { |
&format_xfml(@{$token}); |
&format_xfml(@{$token}); |
$text = $parser->get_text('/xfml'); |
$text = $parser->get_text('/xfml'); |
# print $xfml; |
|
# print $text; |
|
$token = $parser->get_tag('/xfml'); |
$token = $parser->get_tag('/xfml'); |
} |
} |
|
|
open IN,"<$tofilter"; |
#open IN,"<$tofilter"; |
my @lines2=<IN>; close IN; my $parsestring2=join('',@lines2); undef @lines2; |
my @lines2=<>; my $parsestring2=join('',@lines2); undef @lines2; |
$parser = HTML::TokeParser->new(\$parsestring2) or |
$parser = HTML::TokeParser->new(\$parsestring2) or |
die('can\'t create TokeParser object'); |
die('can\'t create TokeParser object'); |
$parser->xml_mode('1'); |
$parser->xml_mode('1'); |
Line 68 my $excluden=0;
|
Line 92 my $excluden=0;
|
my $excludea=0; |
my $excludea=0; |
my $et=0; |
my $et=0; |
my $cdata=''; |
my $cdata=''; |
|
my $excludenold=0; |
|
my $ign=0; |
|
|
while ($token = $parser->get_token()) { |
while ($token = $parser->get_token()) { |
# from HTML::TokeParser documentation: |
|
# ["S", $tag, %$attr, @$attrseq, $text] |
|
# ["E", $tag, $text] |
|
# ["T", $text, $is_data] |
|
# ["C", $text] |
|
# ["D", $text] |
|
# ["PI", $token0, $text] |
|
# push @{$eha{$tokeninfo[2]->{'nodename'}}->{'name'}}, |
|
# @{$conditions{'name'}}; |
|
# push @{$eha{$tokeninfo[2]->{'nodename'}}->{'attribute'}}, |
|
# @{$conditions{'attribute'}}; |
|
# push @{$eha{$tokeninfo[2]->{'nodename'}}->{'value'}}, |
|
# @{$conditions{'value'}}; |
|
# push @{$eha{$tokeninfo[2]->{'nodename'}}->{'cdata'}}, |
|
# @{$conditions{'cdata'}}; |
|
if ($token->[0] eq 'D') { |
if ($token->[0] eq 'D') { |
print $token->[1]; |
print $token->[1]; |
} |
} |
Line 102 while ($token = $parser->get_token()) {
|
Line 114 while ($token = $parser->get_token()) {
|
|
|
if ($eh{$token->[1]}) { |
if ($eh{$token->[1]}) { |
$echild=$token->[1]; |
$echild=$token->[1]; |
# print "ECHILD=$echild\n"; |
|
} |
} |
if ($echild) { |
if ($echild) { |
# run through names for echild |
# run through names for echild |
Line 114 while ($token = $parser->get_token()) {
|
Line 125 while ($token = $parser->get_token()) {
|
foreach my $namematch (@{$namemlist}) { |
foreach my $namematch (@{$namemlist}) { |
my $nm=$namematch; $nm=~s/^.//; $nm=~s/.$//; |
my $nm=$namematch; $nm=~s/^.//; $nm=~s/.$//; |
if ($name=~/$nm/) { |
if ($name=~/$nm/) { |
# print "NMATCH: $nm ($name)\n"; |
$excludenold=$excluden; |
$excluden++; |
$excluden++; |
foreach my $attributemlist |
foreach my $attributemlist |
(@{$eha{$echild}->{'attribute'}}) { |
(@{$eha{$echild}->{'attribute'}}) { |
Line 124 while ($token = $parser->get_token()) {
|
Line 135 while ($token = $parser->get_token()) {
|
split(/\=/,$attributematch,2); |
split(/\=/,$attributematch,2); |
$am=~s/^.//; |
$am=~s/^.//; |
$am=~s/.$//; |
$am=~s/.$//; |
# print 'AM:'."($an,$am)\t"; |
|
# print 'ATT:'.join(',',%atthash)."\n"; |
|
if ($atthash{$an}) { |
if ($atthash{$an}) { |
if ($atthash{$an}=~/$am/) { |
if ($atthash{$an}=~/$am/) { |
$excludea++; |
$excludea++; |
# print "AMATCH: $am (". |
|
# join(',', |
|
# @attributes) |
|
# ."\n"; |
|
} |
} |
} |
} |
} |
} |
Line 150 while ($token = $parser->get_token()) {
|
Line 155 while ($token = $parser->get_token()) {
|
if ($echild) { |
if ($echild) { |
$tr.=$token->[2]; |
$tr.=$token->[2]; |
if ($excluden) { |
if ($excluden) { |
|
my $i=0; |
|
CDATALOOP: |
foreach my $cdatamlist (@{$eha{$echild}->{'cdata'}}) { |
foreach my $cdatamlist (@{$eha{$echild}->{'cdata'}}) { |
|
$i++; |
|
my $j; |
foreach my $cdatamatch (@{$cdatamlist}) { |
foreach my $cdatamatch (@{$cdatamlist}) { |
|
$j++; |
# print "CDATA: $cdatamatch, $cdata\n"; |
# print "CDATA: $cdatamatch, $cdata\n"; |
my $cm=$cdatamatch; |
my $cm=$cdatamatch; |
my $not=0; |
my $not=0; |
Line 160 while ($token = $parser->get_token()) {
|
Line 170 while ($token = $parser->get_token()) {
|
$cm=~s/^.//; |
$cm=~s/^.//; |
} |
} |
$cm=~s/^.//; $cm=~s/.$//; |
$cm=~s/^.//; $cm=~s/.$//; |
|
if ($not and $cdata=~/$cm/) { |
|
$ign=1; $exclude=0; |
|
} |
if ((!$not and $cdata!~/$cm/) |
if ((!$not and $cdata!~/$cm/) |
or ($not and $cdata=~/$cm/)) { |
or ($not and $cdata=~/$cm/)) { |
# print "CMISMATCH: $cm ($cdata)\n"; |
# nothing happens |
|
# $exclude=0; |
} |
} |
elsif (($not and $cdata!~/$cm/) |
elsif (($not and $cdata!~/$cm/) |
or (!$not and $cdata=~/$cm/)) { |
or (!$not and $cdata=~/$cm/)) { |
$exclude++; |
$exclude++ unless $ign; |
} |
} |
} |
} |
} |
} |
} |
} |
} |
} |
if ($eh{$token->[1]}) { |
if ($eh{$token->[1]}) { |
|
$ign=0; |
$echild=0; |
$echild=0; |
if (!$exclude and !$excludea) { |
if (!$exclude and !$excludea) { |
print $tr; |
print $tr; |
Line 180 while ($token = $parser->get_token()) {
|
Line 195 while ($token = $parser->get_token()) {
|
$tr=''; |
$tr=''; |
} |
} |
elsif ($exclude>0 or $excludea>0) { |
elsif ($exclude>0 or $excludea>0) { |
# print "EXCLUDING $token->[1] $excludea $excluden\n"; |
# print "EXCLUDING $token->[1] $exclude $excludea $excluden\n"; |
$exclude=0; $excluden=0; $excludea=0; |
$exclude=0; $excluden=0; $excludea=0; |
$tr=''; |
$tr=''; |
} |
} |
Line 222 sub trim {
|
Line 237 sub trim {
|
my ($s)=@_; $s=~s/^\s*//; $s=~s/\s*$//; return $s; |
my ($s)=@_; $s=~s/^\s*//; $s=~s/\s*$//; return $s; |
} |
} |
|
|
|
|
|
|
# --------------------------------------------------------- Format xfml section |
# --------------------------------------------------------- Format xfml section |
sub format_xfml { |
sub format_xfml { |
my (@tokeninfo)=@_; |
my (@tokeninfo)=@_; |
Line 236 sub format_when_name {
|
Line 253 sub format_when_name {
|
push @{$conditions{'name'}},$att_match; |
push @{$conditions{'name'}},$att_match; |
my $text=&trim($parser->get_text('/when:name')); |
my $text=&trim($parser->get_text('/when:name')); |
$parser->get_tag('/when:name'); |
$parser->get_tag('/when:name'); |
# print 'Name Matching...'.$att_match; |
|
$wloc--; |
$wloc--; |
&cc unless $wloc; |
&cc unless $wloc; |
return ''; |
return ''; |
Line 250 sub format_when_attribute {
|
Line 266 sub format_when_attribute {
|
push @{$conditions{'attribute'}},$att_match; |
push @{$conditions{'attribute'}},$att_match; |
my $text=&trim($parser->get_text('/when:attribute')); |
my $text=&trim($parser->get_text('/when:attribute')); |
$parser->get_tag('/when:attribute'); |
$parser->get_tag('/when:attribute'); |
# print 'Attribute Matching...'.$att_match; |
|
$wloc--; |
$wloc--; |
&cc unless $wloc; |
&cc unless $wloc; |
return ''; |
return ''; |
Line 261 sub format_when_cdata {
|
Line 276 sub format_when_cdata {
|
my (@tokeninfo)=@_; |
my (@tokeninfo)=@_; |
$wloc++; |
$wloc++; |
my $att_match=$tokeninfo[2]->{'match'}; |
my $att_match=$tokeninfo[2]->{'match'}; |
# print 'Cdata Matching...'.$att_match; |
|
push @{$conditions{'cdata'}},$att_match; |
push @{$conditions{'cdata'}},$att_match; |
my $text=&trim($parser->get_text('/when:cdata')); |
my $text=&trim($parser->get_text('/when:cdata')); |
$parser->get_tag('/when:cdata'); |
$parser->get_tag('/when:cdata'); |