Annotation of loncom/homework/cleanxml/html_to_xml.pm, revision 1.2
1.1 damieng 1: # The LearningOnline Network
2: # Second step to clean a file.
3: #
1.2 ! damieng 4: # $Id: html_to_xml.pm,v 1.1 2015/12/03 20:40:31 damieng Exp $
1.1 damieng 5: #
6: # Copyright Michigan State University Board of Trustees
7: #
8: # This file is part of the LearningOnline Network with CAPA (LON-CAPA).
9: #
10: # LON-CAPA is free software; you can redistribute it and/or modify
11: # it under the terms of the GNU General Public License as published by
12: # the Free Software Foundation; either version 2 of the License, or
13: # (at your option) any later version.
14: #
15: # LON-CAPA is distributed in the hope that it will be useful,
16: # but WITHOUT ANY WARRANTY; without even the implied warranty of
17: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18: # GNU General Public License for more details.
19: #
20: # You should have received a copy of the GNU General Public License
21: # along with LON-CAPA; if not, write to the Free Software
22: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23: #
24: # /home/httpd/html/adm/gpl.txt
25: #
26: # http://www.lon-capa.org/
27: #
28: ###
29:
30: #!/usr/bin/perl
31:
32:
33: package Apache::html_to_xml;
34:
35: use strict;
36: use utf8;
37: use warnings;
38: use HTML::Parser ();
39:
40: # always closing, end tags are ignored:
41: my @empty = ('base','br','col','hr','img','input','keygen','link','meta','param','source','track','wbr', 'frame', 'embed','startouttext','endouttext');
42:
43: #my @block_html = ('html','body','h1','h2','h3','h4','h5','h6','div','p','ul','ol','table','tbody','tr','td','th','dl','pre','noscript','blockquote','object','applet','embed','map','form','fieldset','iframe');
44:
45:
46: my $result;
47: my @stack;
48: my $close_warning;
49: my $warnings; # 1 = print warnings
50:
51:
52: # This takes non-well-formed UTF-8 LC+HTML and returns well-formed but non-valid XML LC+XHTML.
53: sub html_to_xml {
1.2 ! damieng 54: my($textref, $warn, $case_sensitive) = @_;
1.1 damieng 55: $warnings = $warn;
1.2 ! damieng 56: if (!defined $case_sensitive) {
! 57: $case_sensitive = 0;
! 58: }
1.1 damieng 59: $result = '';
60: @stack = ();
61: $close_warning = '';
62: my $p = HTML::Parser->new( api_version => 3,
63: start_h => [\&start, "tagname, attr, attrseq"],
64: end_h => [\&end, "tagname"],
65: text_h => [\&text, "dtext"],
66: comment_h => [\&comment, "tokens"],
67: declaration_h => [\&declaration, "tokens"],
68: process_h => [\&process, "token0"],
69: );
70: # NOTE: by default, the HTML parser turns all attribute and elements names to lowercase
1.2 ! damieng 71: # This is a problem with the Task elements, so it is disabled in that case
! 72: if ($case_sensitive) {
! 73: $p->case_sensitive(1);
! 74: }
1.1 damieng 75: $p->empty_element_tags(1);
76: $result .= "<?xml version='1.0' encoding='UTF-8'?>\n";
77: $p->parse($$textref);
78: for (my $i=scalar(@stack)-1; $i>=0; $i--) {
79: if ($close_warning ne '') {
80: $close_warning .= ', ';
81: }
82: $close_warning .= $stack[$i];
83: $result .= '</'.$stack[$i].'>';
84: }
85: if ($warnings && $close_warning ne '') {
86: print "Warning: the parser had to add closing tags to understand the document ($close_warning)\n";
87: }
88: return \$result;
89: }
90:
91: sub start {
92: my($tagname, $attr, $attrseq) = @_;
93:
94: # NOTE: we could do things more like web browsers, but I'm nore sure the result would be better with LON-CAPA files
95: # (in problem files there are not so many missing tags)
96: # See http://www.w3.org/TR/html5/syntax.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser
97:
98: if ($tagname eq 'o:p') {
99: return;
100: }
101:
102: if ($tagname =~ /@.*\.[a-z]{2,3}$/) { # email <name@hostname>
103: $result .= "<$tagname>";
104: return;
105: }
106:
107: #$tagname = lc($tagname); this is done by default by the parser
108: $tagname = fix_tag($tagname);
109: if (scalar(@stack) > 0 && $stack[scalar(@stack)-1] eq 'tr' && $tagname ne 'tr' && $tagname ne 'td' && $tagname ne 'th' &&
110: !string_in_array(['part','block','comment','endouttext','problemtype','standalone','startouttext','tex','translated','web','while','randomlist','font','b','form'], $tagname)) {
111: # NOTE: a 'part' or 'block' element between tr and td will not be valid, but changing tag order would make things worse
112: # font and b will be removed in post_xml, so we can leave it for now, to handle things like <TR><FONT FACE="Palatino"><TD...
113: # form is to avoid an empty form in some cases (it might not work anyway, but it is better to keep this bug the way it is)
114: if ($warnings) {
115: print "Warning: a <td> tag was added because a $tagname element was directly under a tr\n";
116: }
117: start('td');
118: }
119: if ($tagname eq 'p' && scalar(@stack) > 0 && $stack[scalar(@stack)-1] eq 'p') {
120: end('p');
121: } elsif ($tagname eq 'li') {
122: my $ind_li = last_index_of(\@stack, 'li');
123: my $ind_ul = last_index_of(\@stack, 'ul');
124: my $ind_ol = last_index_of(\@stack, 'ol');
125: if ($ind_li != -1 && ($ind_ul == -1 || $ind_ul < $ind_li) && ($ind_ol == -1 || $ind_ol < $ind_li)) {
126: end('li');
127: }
128: } elsif ($tagname eq 'tr') {
129: my $ind_table = last_index_of(\@stack, 'table');
130: my $ind_tr = last_index_of(\@stack, 'tr');
131: if ($ind_tr != -1 && ($ind_table == -1 || $ind_table < $ind_tr)) {
132: end('tr');
133: }
134: } elsif ($tagname eq 'td' || $tagname eq 'th') {
135: my $ind_table = last_index_of(\@stack, 'table');
136: my $ind_td = last_index_of(\@stack, 'td');
137: my $ind_th = last_index_of(\@stack, 'th');
138: my $ind_tr = last_index_of(\@stack, 'tr');
139: if ($ind_tr == -1 || ($ind_table != -1 && $ind_table > $ind_tr)) {
140: start('tr');
141: $ind_tr = last_index_of(\@stack, 'tr');
142: }
143: if ($ind_td != -1 && $ind_tr < $ind_td) {
144: end('td');
145: } elsif ($ind_th != -1 && $ind_tr < $ind_th) {
146: end('th');
147: }
148: } elsif ($tagname eq 'dd' || $tagname eq 'dt') {
149: my $ind_dd = last_index_of(\@stack, 'dd');
150: my $ind_dt = last_index_of(\@stack, 'dt');
151: my $ind_dl = last_index_of(\@stack, 'dl');
152: if ($ind_dl == -1) {
153: start('dl');
154: $ind_dl = last_index_of(\@stack, 'dl');
155: }
156: if ($ind_dd != -1 && ($ind_dl == -1 || $ind_dl < $ind_dd)) {
157: end('dd');
158: } elsif ($ind_dt != -1 && ($ind_dl == -1 || $ind_dl < $ind_dt)) {
159: end('dt');
160: }
161: } elsif ($tagname eq 'option') {
162: my $ind_option = last_index_of(\@stack, 'option');
163: if ($ind_option != -1) {
164: end('option');
165: }
166: } elsif ($tagname eq 'area') {
167: my $ind_area = last_index_of(\@stack, 'area');
168: if ($ind_area != -1) {
169: end('area');
170: }
171: } elsif ($tagname eq 'a') {
172: my $ind_a = last_index_of(\@stack, 'a');
173: if ($ind_a != -1) {
174: end('a');
175: }
176: } elsif ($tagname eq 'num') {
177: my $ind_num = last_index_of(\@stack, 'num');
178: if ($ind_num != -1) {
179: end('num');
180: }
181: }
182:
183: # HTML interpretation of non-closing elements and style is too complex (and error-prone, anyway).
184: # Since LON-CAPA elements are all supposed to be closed, this interpretation is SGML-like instead.
185: # Paragraphs inside paragraphs will be fixed later.
186:
187: # my @styles = ();
188: # if ($tagname eq 'p') {
189: # for (my $i=scalar(@stack)-1; $i>=0; $i--) {
190: # if ($stack[$i] eq 'p') {
191: # # save the styles
192: # for (my $j=$i+1; $j<scalar(@stack); $j++) {
193: # if (index_of(['b','i','em','strong','sub','sup'], $stack[$j]) != -1) {
194: # push(@styles, $stack[$j]);
195: # }
196: # }
197: # # close the p
198: # end('p');
199: # last;
200: # } elsif (index_of(\@block_html, $stack[$i]) != -1) {
201: # # stop looking
202: # last;
203: # }
204: # }
205: # }
206: $result .= '<'.$tagname;
207: my %seen = ();
208: foreach my $att_name (@$attrseq) {
209: my $att_name_modified = $att_name;
210: $att_name_modified =~ s/[^\-a-zA-Z0-9_:.]//g;
211: $att_name_modified =~ s/^[\-.0-9]*//;
212: if ($att_name_modified ne '' && index($att_name_modified, ':') == -1) {
213: if ($seen{$att_name_modified}) {
214: if ($warnings) {
215: print "Warning: Ignoring duplicate attribute: $att_name\n";
216: }
217: next;
218: }
219: $seen{$att_name_modified}++;
220: my $att_value = $attr->{$att_name};
221: $att_value =~ s/^[“”]|[“”]$//g;
222: $att_value =~ s/&/&/g;
223: $att_value =~ s/</</g;
224: $att_value =~ s/>/>/g;
225: $att_value =~ s/"/"/g;
226: if ($tagname eq 'embed' && $att_name_modified eq 'script') {
227: # newlines are encoded to preserve Protein Explorer scripts in embed script attributes:
228: $att_value =~ s/\x0A/
/g;
229: $att_value =~ s/\x0D/
/g;
230: }
231: if ($att_name_modified eq 'xmlns' && ($att_value eq 'http://www.w3.org/1999/xhtml' ||
232: $att_value eq 'http://www.w3.org/TR/REC-html40')) {
233: next;
234: }
235: $result .= ' '.$att_name_modified.'="'.$att_value.'"';
236: }
237: }
238: if (index_of(\@empty, $tagname) != -1) {
239: $result .= '/>';
240: } else {
241: $result .= '>';
242: push(@stack, $tagname);
243: if (scalar(@stack) > 500) {
244: die "This document has a crazy depth - I'm out !";
245: }
246: }
247: # reopen the styles, if any
248: #for (my $j=0; $j<scalar(@styles); $j++) {
249: # start($styles[$j], {}, ());
250: #}
251: }
252:
253: sub end {
254: my($tagname) = @_;
255:
256: if ($tagname eq 'o:p') {
257: return;
258: }
259:
260: $tagname = fix_tag($tagname);
261: if (index_of(\@empty, $tagname) != -1) {
262: return;
263: }
264: if ($tagname eq 'td' && scalar(@stack) > 0 && $stack[scalar(@stack)-1] eq 'th') {
265: # handle <th>text</td> as if it was <th>text</th>
266: $tagname = 'th';
267: } elsif ($tagname eq 'th' && scalar(@stack) > 0 && $stack[scalar(@stack)-1] eq 'td') {
268: # handle <td>text</th> as if it was <td>text</td>
269: $tagname = 'td';
270: }
271: my $found = 0;
272: for (my $i=scalar(@stack)-1; $i>=0; $i--) {
273: if ($stack[$i] eq $tagname) {
274: for (my $j=scalar(@stack)-1; $j>$i; $j--) {
275: if ($close_warning ne '') {
276: $close_warning .= ', ';
277: }
278: $close_warning .= $stack[$j];
279: $result .= '</'.$stack[$j].'>';
280: }
281: splice(@stack, $i, scalar(@stack)-$i);
282: $found = 1;
283: last;
284: } elsif (index_of(\@stack, 'web') != -1) {
285: die "There is a web element with missing end tags inside - it has to be fixed by hand";
286: }
287: }
288: if ($found) {
289: $result .= '</'.$tagname.'>';
290: } elsif ($tagname eq 'p') {
291: $result .= '<p/>';
292: }
293: }
294:
295: sub text {
296: my($dtext) = @_;
297: $dtext =~ s/&/&/g;
298: $dtext =~ s/</</g;
299: $dtext =~ s/>/>/g;
300: $dtext =~ s/"/"/g;
301: $result .= $dtext;
302: }
303:
304: sub comment {
305: my($tokens) = @_;
306: # NOTE: the HTML parser thinks this is a comment: </ br>
307: # and LON-CAPA has sometimes turned that into <![CDATA[</ br>]]>
308: foreach my $comment (@$tokens) {
309: $comment =~ s/--/- /g;
310: $comment =~ s/^-|-$/ /g;
311: $result .= '<!--'.$comment.'-->';
312: }
313: }
314:
315: sub declaration {
316: my($tokens) = @_;
317: # ignore them
318: #$result .= '<!';
319: #$result .= join(' ', @$tokens);
320: #$result .= '>';
321: }
322:
323: sub process {
324: my($token0) = @_;
325: if ($token0 ne '') {
326: $result .= '<?'.$token0.'>';
327: }
328: }
329:
330: sub index_of {
331: my ($array, $value) = @_;
332: for (my $i=0; $i<scalar(@{$array}); $i++) {
333: if ($array->[$i] eq $value) {
334: return $i;
335: }
336: }
337: return -1;
338: }
339:
340: sub last_index_of {
341: my ($array, $value) = @_;
342: for (my $i=scalar(@{$array})-1; $i>=0; $i--) {
343: if ($array->[$i] eq $value) {
344: return $i;
345: }
346: }
347: return -1;
348: }
349:
350: sub fix_tag {
351: my ($tag) = @_;
352: #$tag = lc($tag); this is done by default by the parser
353: if ($tag !~ /^[a-zA-Z_][a-zA-Z0-9_\-\.]*$/) {
354: if ($warnings) {
355: print "Warning: bad start tag:'".$tag."'";
356: }
357: if ($tag =~ /<[a-zA-Z]/) {
358: $tag =~ s/^[^<]*<//; # a<b -> b
359: }
360: if ($tag =~ /[a-zA-Z]=/) {
361: $tag =~ s/=.*$//; # a=b -> a
362: }
363: if ($tag =~ /[a-zA-Z]\//) {
364: $tag =~ s/\/.*$//; # a/b -> a
365: }
366: if ($tag =~ /:/) {
367: # a:b -> b except when : at the end
368: if ($tag =~ /^[^:]*:$/) {
369: $tag =~ s/://;
370: } else {
371: $tag =~ s/^.*://;
372: }
373: }
374: $tag =~ s/^[0-9\-\.]+//;
375: $tag =~ s/[^a-zA-Z0-9_\-\.]//g;
376: if ($warnings) {
377: print " (converted to $tag)\n";
378: }
379: }
380: return($tag);
381: }
382:
383:
384: ##
385: # Tests if a string is in an array (using eq) (to avoid Smartmatch warnings with $value ~~ @array)
386: # @param {Array<string>} array - reference to the array of strings
387: # @param {string} value - the string to look for
388: # @returns 1 if found, 0 otherwise
389: ##
390: sub string_in_array {
391: my ($array, $value) = @_;
392: foreach my $v (@{$array}) {
393: if ($v eq $value) {
394: return 1;
395: }
396: }
397: return 0;
398: }
399:
400:
401: 1;
402: __END__
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>