Annotation of loncom/xml/LCParser.pm, revision 1.1
1.1 ! albertel 1: package HTML::LCParser;
! 2:
! 3: # $Id: TokeParser.pm,v 2.24 2001/03/26 07:32:17 gisle Exp $
! 4:
! 5: require HTML::PullParser;
! 6: @ISA=qw(HTML::PullParser);
! 7: $VERSION = sprintf("%d.%02d", q$Revision: 2.24 $ =~ /(\d+)\.(\d+)/);
! 8:
! 9: use strict;
! 10: use Carp ();
! 11: use HTML::Entities qw(decode_entities);
! 12:
! 13: my %ARGS =
! 14: (
! 15: start => "'S',tagname,attr,attrseq,text,line",
! 16: end => "'E',tagname,text,line",
! 17: text => "'T',text,is_cdata,line",
! 18: process => "'PI',token0,text,line",
! 19: comment => "'C',text,line",
! 20: declaration => "'D',text,line",
! 21: );
! 22:
! 23:
! 24: sub new
! 25: {
! 26: my $class = shift;
! 27: my %cnf;
! 28: if (@_ == 1) {
! 29: my $type = (ref($_[0]) eq "SCALAR") ? "doc" : "file";
! 30: %cnf = ($type => $_[0]);
! 31: }
! 32: else {
! 33: %cnf = @_;
! 34: }
! 35:
! 36: my $textify = delete $cnf{textify} || {img => "alt", applet => "alt"};
! 37:
! 38: my $self = $class->SUPER::new(%cnf, %ARGS) || return undef;
! 39:
! 40: $self->{textify} = $textify;
! 41: $self->encoded_entities(1);
! 42: $self;
! 43: }
! 44:
! 45:
! 46: sub get_tag
! 47: {
! 48: my $self = shift;
! 49: my $token;
! 50: while (1) {
! 51: $token = $self->get_token || return undef;
! 52: my $type = shift @$token;
! 53: next unless $type eq "S" || $type eq "E";
! 54: substr($token->[0], 0, 0) = "/" if $type eq "E";
! 55: return $token unless @_;
! 56: for (@_) {
! 57: return $token if $token->[0] eq $_;
! 58: }
! 59: }
! 60: }
! 61:
! 62:
! 63: sub get_text
! 64: {
! 65: my $self = shift;
! 66: my $endat = shift;
! 67: my @text;
! 68: while (my $token = $self->get_token) {
! 69: my $type = $token->[0];
! 70: if ($type eq "T") {
! 71: my $text = $token->[1];
! 72: push(@text, $text);
! 73: } elsif ($type =~ /^[SE]$/) {
! 74: my $tag = $token->[1];
! 75: if ($type eq "S") {
! 76: if (exists $self->{textify}{$tag}) {
! 77: my $alt = $self->{textify}{$tag};
! 78: my $text;
! 79: if (ref($alt)) {
! 80: $text = &$alt(@$token);
! 81: } else {
! 82: $text = $token->[2]{$alt || "alt"};
! 83: $text = "[\U$tag]" unless defined $text;
! 84: }
! 85: push(@text, $text);
! 86: next;
! 87: }
! 88: } else {
! 89: $tag = "/$tag";
! 90: }
! 91: if (!defined($endat) || $endat eq $tag) {
! 92: $self->unget_token($token);
! 93: last;
! 94: }
! 95: }
! 96: }
! 97: join("", @text);
! 98: }
! 99:
! 100:
! 101: sub get_trimmed_text
! 102: {
! 103: my $self = shift;
! 104: my $text = $self->get_text(@_);
! 105: $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g;
! 106: $text;
! 107: }
! 108:
! 109: 1;
! 110:
! 111:
! 112: __END__
! 113:
! 114: =head1 NAME
! 115:
! 116: HTML::LCParser - Alternative HTML::Parser interface
! 117:
! 118: =head1 SYNOPSIS
! 119:
! 120: require HTML::LCParser;
! 121: $p = HTML::LCParser->new("index.html") || die "Can't open: $!";
! 122: while (my $token = $p->get_token) {
! 123: #...
! 124: }
! 125:
! 126: =head1 DESCRIPTION
! 127:
! 128: The C<HTML::LCParser> is an alternative interface to the
! 129: C<HTML::Parser> class. It is an C<HTML::PullParser> subclass.
! 130:
! 131: The following methods are available:
! 132:
! 133: =over 4
! 134:
! 135: =item $p = HTML::LCParser->new( $file_or_doc );
! 136:
! 137: The object constructor argument is either a file name, a file handle
! 138: object, or the complete document to be parsed.
! 139:
! 140: If the argument is a plain scalar, then it is taken as the name of a
! 141: file to be opened and parsed. If the file can't be opened for
! 142: reading, then the constructor will return an undefined value and $!
! 143: will tell you why it failed.
! 144:
! 145: If the argument is a reference to a plain scalar, then this scalar is
! 146: taken to be the literal document to parse. The value of this
! 147: scalar should not be changed before all tokens have been extracted.
! 148:
! 149: Otherwise the argument is taken to be some object that the
! 150: C<HTML::LCParser> can read() from when it needs more data. Typically
! 151: it will be a filehandle of some kind. The stream will be read() until
! 152: EOF, but not closed.
! 153:
! 154: It also will turn encoded_entities on by default.
! 155:
! 156: =item $p->get_token
! 157:
! 158: This method will return the next I<token> found in the HTML document,
! 159: or C<undef> at the end of the document. The token is returned as an
! 160: array reference. The first element of the array will be a (mostly)
! 161: single character string denoting the type of this token: "S" for start
! 162: tag, "E" for end tag, "T" for text, "C" for comment, "D" for
! 163: declaration, and "PI" for process instructions. The rest of the array
! 164: is the same as the arguments passed to the corresponding HTML::Parser
! 165: v2 compatible callbacks (see L<HTML::Parser>). In summary, returned
! 166: tokens look like this:
! 167:
! 168: ["S", $tag, $attr, $attrseq, $text, $line]
! 169: ["E", $tag, $text, $line]
! 170: ["T", $text, $is_data, $line]
! 171: ["C", $text, $line]
! 172: ["D", $text, $line]
! 173: ["PI", $token0, $text, $line]
! 174:
! 175: where $attr is a hash reference, $attrseq is an array reference and
! 176: the rest are plain scalars.
! 177:
! 178: =item $p->unget_token($token,...)
! 179:
! 180: If you find out you have read too many tokens you can push them back,
! 181: so that they are returned the next time $p->get_token is called.
! 182:
! 183: =item $p->get_tag( [$tag, ...] )
! 184:
! 185: This method returns the next start or end tag (skipping any other
! 186: tokens), or C<undef> if there are no more tags in the document. If
! 187: one or more arguments are given, then we skip tokens until one of the
! 188: specified tag types is found. For example:
! 189:
! 190: $p->get_tag("font", "/font");
! 191:
! 192: will find the next start or end tag for a font-element.
! 193:
! 194: The tag information is returned as an array reference in the same form
! 195: as for $p->get_token above, but the type code (first element) is
! 196: missing. A start tag will be returned like this:
! 197:
! 198: [$tag, $attr, $attrseq, $text]
! 199:
! 200: The tagname of end tags are prefixed with "/", i.e. end tag is
! 201: returned like this:
! 202:
! 203: ["/$tag", $text]
! 204:
! 205: =item $p->get_text( [$endtag] )
! 206:
! 207: This method returns all text found at the current position. It will
! 208: return a zero length string if the next token is not text. The
! 209: optional $endtag argument specifies that any text occurring before the
! 210: given tag is to be returned. All entities are unmodified.
! 211:
! 212: The $p->{textify} attribute is a hash that defines how certain tags can
! 213: be treated as text. If the name of a start tag matches a key in this
! 214: hash then this tag is converted to text. The hash value is used to
! 215: specify which tag attribute to obtain the text from. If this tag
! 216: attribute is missing, then the upper case name of the tag enclosed in
! 217: brackets is returned, e.g. "[IMG]". The hash value can also be a
! 218: subroutine reference. In this case the routine is called with the
! 219: start tag token content as its argument and the return value is treated
! 220: as the text.
! 221:
! 222: The default $p->{textify} value is:
! 223:
! 224: {img => "alt", applet => "alt"}
! 225:
! 226: This means that <IMG> and <APPLET> tags are treated as text, and that
! 227: the text to substitute can be found in the ALT attribute.
! 228:
! 229: =item $p->get_trimmed_text( [$endtag] )
! 230:
! 231: Same as $p->get_text above, but will collapse any sequences of white
! 232: space to a single space character. Leading and trailing white space is
! 233: removed.
! 234:
! 235: =back
! 236:
! 237: =head1 EXAMPLES
! 238:
! 239: This example extracts all links from a document. It will print one
! 240: line for each link, containing the URL and the textual description
! 241: between the <A>...</A> tags:
! 242:
! 243: use HTML::LCParser;
! 244: $p = HTML::LCParser->new(shift||"index.html");
! 245:
! 246: while (my $token = $p->get_tag("a")) {
! 247: my $url = $token->[1]{href} || "-";
! 248: my $text = $p->get_trimmed_text("/a");
! 249: print "$url\t$text\n";
! 250: }
! 251:
! 252: This example extract the <TITLE> from the document:
! 253:
! 254: use HTML::LCParser;
! 255: $p = HTML::LCParser->new(shift||"index.html");
! 256: if ($p->get_tag("title")) {
! 257: my $title = $p->get_trimmed_text;
! 258: print "Title: $title\n";
! 259: }
! 260:
! 261: =head1 SEE ALSO
! 262:
! 263: L<HTML::PullParser>, L<HTML::Parser>
! 264:
! 265: =head1 COPYRIGHT
! 266:
! 267: Copyright 1998-2001 Gisle Aas.
! 268:
! 269: This library is free software; you can redistribute it and/or
! 270: modify it under the same terms as Perl itself.
! 271:
! 272: =cut
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>