Annotation of loncom/xml/LCParser.pm, revision 1.2
1.1 albertel 1: package HTML::LCParser;
2:
1.2 ! albertel 3: # $Id: LCParser.pm,v 1.1 2002/03/27 18:37:08 albertel Exp $
1.1 albertel 4:
5: require HTML::PullParser;
6: @ISA=qw(HTML::PullParser);
1.2 ! albertel 7: $VERSION = sprintf("%d.%02d", q$Revision: 1.1 $ =~ /(\d+)\.(\d+)/);
1.1 albertel 8:
9: use strict;
10: use Carp ();
11: use HTML::Entities qw(decode_entities);
12:
13: my %ARGS =
14: (
15: start => "'S',tagname,attr,attrseq,text,line",
16: end => "'E',tagname,text,line",
17: text => "'T',text,is_cdata,line",
18: process => "'PI',token0,text,line",
19: comment => "'C',text,line",
20: declaration => "'D',text,line",
21: );
22:
23:
24: sub new
25: {
26: my $class = shift;
27: my %cnf;
28: if (@_ == 1) {
29: my $type = (ref($_[0]) eq "SCALAR") ? "doc" : "file";
30: %cnf = ($type => $_[0]);
31: }
32: else {
33: %cnf = @_;
34: }
35:
36: my $textify = delete $cnf{textify} || {img => "alt", applet => "alt"};
37:
38: my $self = $class->SUPER::new(%cnf, %ARGS) || return undef;
39:
40: $self->{textify} = $textify;
1.2 ! albertel 41: $self->attr_encoded(1);
1.1 albertel 42: $self;
43: }
44:
45:
46: sub get_tag
47: {
48: my $self = shift;
49: my $token;
50: while (1) {
51: $token = $self->get_token || return undef;
52: my $type = shift @$token;
53: next unless $type eq "S" || $type eq "E";
54: substr($token->[0], 0, 0) = "/" if $type eq "E";
55: return $token unless @_;
56: for (@_) {
57: return $token if $token->[0] eq $_;
58: }
59: }
60: }
61:
62:
63: sub get_text
64: {
65: my $self = shift;
66: my $endat = shift;
67: my @text;
68: while (my $token = $self->get_token) {
69: my $type = $token->[0];
70: if ($type eq "T") {
71: my $text = $token->[1];
72: push(@text, $text);
73: } elsif ($type =~ /^[SE]$/) {
74: my $tag = $token->[1];
75: if ($type eq "S") {
76: if (exists $self->{textify}{$tag}) {
77: my $alt = $self->{textify}{$tag};
78: my $text;
79: if (ref($alt)) {
80: $text = &$alt(@$token);
81: } else {
82: $text = $token->[2]{$alt || "alt"};
83: $text = "[\U$tag]" unless defined $text;
84: }
85: push(@text, $text);
86: next;
87: }
88: } else {
89: $tag = "/$tag";
90: }
91: if (!defined($endat) || $endat eq $tag) {
92: $self->unget_token($token);
93: last;
94: }
95: }
96: }
97: join("", @text);
98: }
99:
100:
101: sub get_trimmed_text
102: {
103: my $self = shift;
104: my $text = $self->get_text(@_);
105: $text =~ s/^\s+//; $text =~ s/\s+$//; $text =~ s/\s+/ /g;
106: $text;
107: }
108:
109: 1;
110:
111:
112: __END__
113:
114: =head1 NAME
115:
116: HTML::LCParser - Alternative HTML::Parser interface
117:
118: =head1 SYNOPSIS
119:
120: require HTML::LCParser;
121: $p = HTML::LCParser->new("index.html") || die "Can't open: $!";
122: while (my $token = $p->get_token) {
123: #...
124: }
125:
126: =head1 DESCRIPTION
127:
128: The C<HTML::LCParser> is an alternative interface to the
129: C<HTML::Parser> class. It is an C<HTML::PullParser> subclass.
130:
131: The following methods are available:
132:
133: =over 4
134:
135: =item $p = HTML::LCParser->new( $file_or_doc );
136:
137: The object constructor argument is either a file name, a file handle
138: object, or the complete document to be parsed.
139:
140: If the argument is a plain scalar, then it is taken as the name of a
141: file to be opened and parsed. If the file can't be opened for
142: reading, then the constructor will return an undefined value and $!
143: will tell you why it failed.
144:
145: If the argument is a reference to a plain scalar, then this scalar is
146: taken to be the literal document to parse. The value of this
147: scalar should not be changed before all tokens have been extracted.
148:
149: Otherwise the argument is taken to be some object that the
150: C<HTML::LCParser> can read() from when it needs more data. Typically
151: it will be a filehandle of some kind. The stream will be read() until
152: EOF, but not closed.
153:
1.2 ! albertel 154: It also will turn attr_encoded on by default.
1.1 albertel 155:
156: =item $p->get_token
157:
158: This method will return the next I<token> found in the HTML document,
159: or C<undef> at the end of the document. The token is returned as an
160: array reference. The first element of the array will be a (mostly)
161: single character string denoting the type of this token: "S" for start
162: tag, "E" for end tag, "T" for text, "C" for comment, "D" for
163: declaration, and "PI" for process instructions. The rest of the array
164: is the same as the arguments passed to the corresponding HTML::Parser
165: v2 compatible callbacks (see L<HTML::Parser>). In summary, returned
166: tokens look like this:
167:
168: ["S", $tag, $attr, $attrseq, $text, $line]
169: ["E", $tag, $text, $line]
170: ["T", $text, $is_data, $line]
171: ["C", $text, $line]
172: ["D", $text, $line]
173: ["PI", $token0, $text, $line]
174:
175: where $attr is a hash reference, $attrseq is an array reference and
176: the rest are plain scalars.
177:
178: =item $p->unget_token($token,...)
179:
180: If you find out you have read too many tokens you can push them back,
181: so that they are returned the next time $p->get_token is called.
182:
183: =item $p->get_tag( [$tag, ...] )
184:
185: This method returns the next start or end tag (skipping any other
186: tokens), or C<undef> if there are no more tags in the document. If
187: one or more arguments are given, then we skip tokens until one of the
188: specified tag types is found. For example:
189:
190: $p->get_tag("font", "/font");
191:
192: will find the next start or end tag for a font-element.
193:
194: The tag information is returned as an array reference in the same form
195: as for $p->get_token above, but the type code (first element) is
196: missing. A start tag will be returned like this:
197:
198: [$tag, $attr, $attrseq, $text]
199:
200: The tagname of end tags are prefixed with "/", i.e. end tag is
201: returned like this:
202:
203: ["/$tag", $text]
204:
205: =item $p->get_text( [$endtag] )
206:
207: This method returns all text found at the current position. It will
208: return a zero length string if the next token is not text. The
209: optional $endtag argument specifies that any text occurring before the
210: given tag is to be returned. All entities are unmodified.
211:
212: The $p->{textify} attribute is a hash that defines how certain tags can
213: be treated as text. If the name of a start tag matches a key in this
214: hash then this tag is converted to text. The hash value is used to
215: specify which tag attribute to obtain the text from. If this tag
216: attribute is missing, then the upper case name of the tag enclosed in
217: brackets is returned, e.g. "[IMG]". The hash value can also be a
218: subroutine reference. In this case the routine is called with the
219: start tag token content as its argument and the return value is treated
220: as the text.
221:
222: The default $p->{textify} value is:
223:
224: {img => "alt", applet => "alt"}
225:
226: This means that <IMG> and <APPLET> tags are treated as text, and that
227: the text to substitute can be found in the ALT attribute.
228:
229: =item $p->get_trimmed_text( [$endtag] )
230:
231: Same as $p->get_text above, but will collapse any sequences of white
232: space to a single space character. Leading and trailing white space is
233: removed.
234:
235: =back
236:
237: =head1 EXAMPLES
238:
239: This example extracts all links from a document. It will print one
240: line for each link, containing the URL and the textual description
241: between the <A>...</A> tags:
242:
243: use HTML::LCParser;
244: $p = HTML::LCParser->new(shift||"index.html");
245:
246: while (my $token = $p->get_tag("a")) {
247: my $url = $token->[1]{href} || "-";
248: my $text = $p->get_trimmed_text("/a");
249: print "$url\t$text\n";
250: }
251:
252: This example extract the <TITLE> from the document:
253:
254: use HTML::LCParser;
255: $p = HTML::LCParser->new(shift||"index.html");
256: if ($p->get_tag("title")) {
257: my $title = $p->get_trimmed_text;
258: print "Title: $title\n";
259: }
260:
261: =head1 SEE ALSO
262:
263: L<HTML::PullParser>, L<HTML::Parser>
264:
265: =head1 COPYRIGHT
266:
267: Copyright 1998-2001 Gisle Aas.
268:
269: This library is free software; you can redistribute it and/or
270: modify it under the same terms as Perl itself.
271:
272: =cut
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>