Annotation of loncom/xml/LCParser.pm, revision 1.2

1.1       albertel    1: package HTML::LCParser;
                      2: 
1.2     ! albertel    3: # $Id: LCParser.pm,v 1.1 2002/03/27 18:37:08 albertel Exp $
1.1       albertel    4: 
                      5: require HTML::PullParser;
                      6: @ISA=qw(HTML::PullParser);
1.2     ! albertel    7: $VERSION = sprintf("%d.%02d", q$Revision: 1.1 $ =~ /(\d+)\.(\d+)/);
1.1       albertel    8: 
                      9: use strict;
                     10: use Carp ();
                     11: use HTML::Entities qw(decode_entities);
                     12: 
                     13: my %ARGS =
                     14: (
                     15:  start       => "'S',tagname,attr,attrseq,text,line",
                     16:  end         => "'E',tagname,text,line",
                     17:  text        => "'T',text,is_cdata,line",
                     18:  process     => "'PI',token0,text,line",
                     19:  comment     => "'C',text,line",
                     20:  declaration => "'D',text,line",
                     21: );
                     22: 
                     23: 
                     24: sub new
                     25: {
                     26:     my $class = shift;
                     27:     my %cnf;
                     28:     if (@_ == 1) {
                     29: 	my $type = (ref($_[0]) eq "SCALAR") ? "doc" : "file";
                     30: 	%cnf = ($type => $_[0]);
                     31:     }
                     32:     else {
                     33: 	%cnf = @_;
                     34:     }
                     35: 
                     36:     my $textify = delete $cnf{textify} || {img => "alt", applet => "alt"};
                     37: 
                     38:     my $self = $class->SUPER::new(%cnf, %ARGS) || return undef;
                     39: 
                     40:     $self->{textify} = $textify;
1.2     ! albertel   41:     $self->attr_encoded(1);
1.1       albertel   42:     $self;
                     43: }
                     44: 
                     45: 
                     46: sub get_tag
                     47: {
                     48:     my $self = shift;
                     49:     my $token;
                     50:     while (1) {
                     51: 	$token = $self->get_token || return undef;
                     52: 	my $type = shift @$token;
                     53: 	next unless $type eq "S" || $type eq "E";
                     54: 	substr($token->[0], 0, 0) = "/" if $type eq "E";
                     55: 	return $token unless @_;
                     56: 	for (@_) {
                     57: 	    return $token if $token->[0] eq $_;
                     58: 	}
                     59:     }
                     60: }
                     61: 
                     62: 
                     63: sub get_text
                     64: {
                     65:     my $self = shift;
                     66:     my $endat = shift;
                     67:     my @text;
                     68:     while (my $token = $self->get_token) {
                     69: 	my $type = $token->[0];
                     70: 	if ($type eq "T") {
                     71: 	    my $text = $token->[1];
                     72: 	    push(@text, $text);
                     73: 	} elsif ($type =~ /^[SE]$/) {
                     74: 	    my $tag = $token->[1];
                     75: 	    if ($type eq "S") {
                     76: 		if (exists $self->{textify}{$tag}) {
                     77: 		    my $alt = $self->{textify}{$tag};
                     78: 		    my $text;
                     79: 		    if (ref($alt)) {
                     80: 			$text = &$alt(@$token);
                     81: 		    } else {
                     82: 			$text = $token->[2]{$alt || "alt"};
                     83: 			$text = "[\U$tag]" unless defined $text;
                     84: 		    }
                     85: 		    push(@text, $text);
                     86: 		    next;
                     87: 		}
                     88: 	    } else {
                     89: 		$tag = "/$tag";
                     90: 	    }
                     91: 	    if (!defined($endat) || $endat eq $tag) {
                     92: 		 $self->unget_token($token);
                     93: 		 last;
                     94: 	    }
                     95: 	}
                     96:     }
                     97:     join("", @text);
                     98: }
                     99: 
                    100: 
                    101: sub get_trimmed_text
                    102: {
                    103:     my $self = shift;
                    104:     my $text = $self->get_text(@_);
                    105:     $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g;
                    106:     $text;
                    107: }
                    108: 
                    109: 1;
                    110: 
                    111: 
                    112: __END__
                    113: 
                    114: =head1 NAME
                    115: 
                    116: HTML::LCParser - Alternative HTML::Parser interface
                    117: 
                    118: =head1 SYNOPSIS
                    119: 
                    120:  require HTML::LCParser;
                    121:  $p = HTML::LCParser->new("index.html") || die "Can't open: $!";
                    122:  while (my $token = $p->get_token) {
                    123:      #...
                    124:  }
                    125: 
                    126: =head1 DESCRIPTION
                    127: 
                    128: The C<HTML::LCParser> is an alternative interface to the
                    129: C<HTML::Parser> class.  It is an C<HTML::PullParser> subclass.
                    130: 
                    131: The following methods are available:
                    132: 
                    133: =over 4
                    134: 
                    135: =item $p = HTML::LCParser->new( $file_or_doc );
                    136: 
                    137: The object constructor argument is either a file name, a file handle
                    138: object, or the complete document to be parsed.
                    139: 
                    140: If the argument is a plain scalar, then it is taken as the name of a
                    141: file to be opened and parsed.  If the file can't be opened for
                    142: reading, then the constructor will return an undefined value and $!
                    143: will tell you why it failed.
                    144: 
                    145: If the argument is a reference to a plain scalar, then this scalar is
                    146: taken to be the literal document to parse.  The value of this
                    147: scalar should not be changed before all tokens have been extracted.
                    148: 
                    149: Otherwise the argument is taken to be some object that the
                    150: C<HTML::LCParser> can read() from when it needs more data.  Typically
                    151: it will be a filehandle of some kind.  The stream will be read() until
                    152: EOF, but not closed.
                    153: 
1.2     ! albertel  154: It also will turn attr_encoded on by default.
1.1       albertel  155: 
                    156: =item $p->get_token
                    157: 
                    158: This method will return the next I<token> found in the HTML document,
                    159: or C<undef> at the end of the document.  The token is returned as an
                    160: array reference.  The first element of the array will be a (mostly)
                    161: single character string denoting the type of this token: "S" for start
                    162: tag, "E" for end tag, "T" for text, "C" for comment, "D" for
                    163: declaration, and "PI" for process instructions.  The rest of the array
                    164: is the same as the arguments passed to the corresponding HTML::Parser
                    165: v2 compatible callbacks (see L<HTML::Parser>).  In summary, returned
                    166: tokens look like this:
                    167: 
                    168:   ["S",  $tag, $attr, $attrseq, $text, $line]
                    169:   ["E",  $tag, $text, $line]
                    170:   ["T",  $text, $is_data, $line]
                    171:   ["C",  $text, $line]
                    172:   ["D",  $text, $line]
                    173:   ["PI", $token0, $text, $line]
                    174: 
                    175: where $attr is a hash reference, $attrseq is an array reference and
                    176: the rest are plain scalars.
                    177: 
                    178: =item $p->unget_token($token,...)
                    179: 
                    180: If you find out you have read too many tokens you can push them back,
                    181: so that they are returned the next time $p->get_token is called.
                    182: 
                    183: =item $p->get_tag( [$tag, ...] )
                    184: 
                    185: This method returns the next start or end tag (skipping any other
                    186: tokens), or C<undef> if there are no more tags in the document.  If
                    187: one or more arguments are given, then we skip tokens until one of the
                    188: specified tag types is found.  For example:
                    189: 
                    190:    $p->get_tag("font", "/font");
                    191: 
                    192: will find the next start or end tag for a font-element.
                    193: 
                    194: The tag information is returned as an array reference in the same form
                    195: as for $p->get_token above, but the type code (first element) is
                    196: missing. A start tag will be returned like this:
                    197: 
                    198:   [$tag, $attr, $attrseq, $text]
                    199: 
                    200: The tagname of end tags are prefixed with "/", i.e. end tag is
                    201: returned like this:
                    202: 
                    203:   ["/$tag", $text]
                    204: 
                    205: =item $p->get_text( [$endtag] )
                    206: 
                    207: This method returns all text found at the current position. It will
                    208: return a zero length string if the next token is not text.  The
                    209: optional $endtag argument specifies that any text occurring before the
                    210: given tag is to be returned. All entities are unmodified.
                    211: 
                    212: The $p->{textify} attribute is a hash that defines how certain tags can
                    213: be treated as text.  If the name of a start tag matches a key in this
                    214: hash then this tag is converted to text.  The hash value is used to
                    215: specify which tag attribute to obtain the text from.  If this tag
                    216: attribute is missing, then the upper case name of the tag enclosed in
                    217: brackets is returned, e.g. "[IMG]".  The hash value can also be a
                    218: subroutine reference.  In this case the routine is called with the
                    219: start tag token content as its argument and the return value is treated
                    220: as the text.
                    221: 
                    222: The default $p->{textify} value is:
                    223: 
                    224:   {img => "alt", applet => "alt"}
                    225: 
                    226: This means that <IMG> and <APPLET> tags are treated as text, and that
                    227: the text to substitute can be found in the ALT attribute.
                    228: 
                    229: =item $p->get_trimmed_text( [$endtag] )
                    230: 
                    231: Same as $p->get_text above, but will collapse any sequences of white
                    232: space to a single space character.  Leading and trailing white space is
                    233: removed.
                    234: 
                    235: =back
                    236: 
                    237: =head1 EXAMPLES
                    238: 
                    239: This example extracts all links from a document.  It will print one
                    240: line for each link, containing the URL and the textual description
                    241: between the <A>...</A> tags:
                    242: 
                    243:   use HTML::LCParser;
                    244:   $p = HTML::LCParser->new(shift||"index.html");
                    245: 
                    246:   while (my $token = $p->get_tag("a")) {
                    247:       my $url = $token->[1]{href} || "-";
                    248:       my $text = $p->get_trimmed_text("/a");
                    249:       print "$url\t$text\n";
                    250:   }
                    251: 
                    252: This example extract the <TITLE> from the document:
                    253: 
                    254:   use HTML::LCParser;
                    255:   $p = HTML::LCParser->new(shift||"index.html");
                    256:   if ($p->get_tag("title")) {
                    257:       my $title = $p->get_trimmed_text;
                    258:       print "Title: $title\n";
                    259:   }
                    260: 
                    261: =head1 SEE ALSO
                    262: 
                    263: L<HTML::PullParser>, L<HTML::Parser>
                    264: 
                    265: =head1 COPYRIGHT
                    266: 
                    267: Copyright 1998-2001 Gisle Aas.
                    268: 
                    269: This library is free software; you can redistribute it and/or
                    270: modify it under the same terms as Perl itself.
                    271: 
                    272: =cut

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>