Annotation of modules/damieng/clean_xml/html_to_xml.pm, revision 1.1
1.1 ! damieng 1: #!/usr/bin/perl
! 2:
! 3:
! 4: package html_to_xml;
! 5:
! 6: use strict;
! 7: use utf8;
! 8: use warnings;
! 9: use HTML::Parser ();
! 10:
! 11: # always closing, end tags are ignored:
! 12: my @empty = ('base','br','col','hr','img','input','keygen','link','meta','param','source','track','wbr', 'frame', 'embed','startouttext','endouttext');
! 13:
! 14: #my @block_html = ('html','body','h1','h2','h3','h4','h5','h6','div','p','ul','ol','table','tbody','tr','td','th','dl','pre','noscript','blockquote','object','applet','embed','map','form','fieldset','iframe');
! 15:
! 16:
! 17: my $result;
! 18: my @stack;
! 19: my $close_warning;
! 20:
! 21:
! 22: # This takes non-well-formed UTF-8 LC+HTML and returns well-formed but non-valid XML LC+XHTML.
! 23: sub html_to_xml {
! 24: my($textref) = @_;
! 25: $result = '';
! 26: @stack = ();
! 27: $close_warning = '';
! 28: my $p = HTML::Parser->new( api_version => 3,
! 29: start_h => [\&start, "tagname, attr, attrseq"],
! 30: end_h => [\&end, "tagname"],
! 31: text_h => [\&text, "dtext"],
! 32: comment_h => [\&comment, "tokens"],
! 33: declaration_h => [\&declaration, "tokens"],
! 34: process_h => [\&process, "token0"],
! 35: );
! 36: # NOTE: by default, the HTML parser turns all attribute and elements names to lowercase
! 37: $p->empty_element_tags(1);
! 38: $result .= "<?xml version='1.0' encoding='UTF-8'?>\n";
! 39: $p->parse($$textref);
! 40: for (my $i=scalar(@stack)-1; $i>=0; $i--) {
! 41: if ($close_warning ne '') {
! 42: $close_warning .= ', ';
! 43: }
! 44: $close_warning .= $stack[$i];
! 45: $result .= '</'.$stack[$i].'>';
! 46: }
! 47: if ($close_warning ne '') {
! 48: print "Warning: the parser had to add closing tags to understand the document ($close_warning)\n";
! 49: }
! 50: return \$result;
! 51: }
! 52:
! 53: sub start {
! 54: my($tagname, $attr, $attrseq) = @_;
! 55:
! 56: # NOTE: we could do things more like web browsers, but I'm nore sure the result would be better with LON-CAPA files
! 57: # (in problem files there are not so many missing tags)
! 58: # See http://www.w3.org/TR/html5/syntax.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser
! 59:
! 60: if ($tagname eq 'o:p') {
! 61: return;
! 62: }
! 63:
! 64: if ($tagname =~ /@.*\.[a-z]{2,3}$/) { # email <name@hostname>
! 65: $result .= "<$tagname>";
! 66: return;
! 67: }
! 68:
! 69: #$tagname = lc($tagname); this is done by default by the parser
! 70: $tagname = fix_tag($tagname);
! 71: if (scalar(@stack) > 0 && $stack[scalar(@stack)-1] eq 'tr' && $tagname ne 'tr' && $tagname ne 'td' && $tagname ne 'th' &&
! 72: !string_in_array(['part','block','comment','endouttext','problemtype','standalone','startouttext','tex','translated','web','while','randomlist','font','b','form'], $tagname)) {
! 73: # NOTE: a 'part' or 'block' element between tr and td will not be valid, but changing tag order would make things worse
! 74: # font and b will be removed in post_xml, so we can leave it for now, to handle things like <TR><FONT FACE="Palatino"><TD...
! 75: # form is to avoid an empty form in some cases (it might not work anyway, but it is better to keep this bug the way it is)
! 76: print "Warning: a <td> tag was added because a $tagname element was directly under a tr\n";
! 77: start('td');
! 78: }
! 79: if ($tagname eq 'p' && scalar(@stack) > 0 && $stack[scalar(@stack)-1] eq 'p') {
! 80: end('p');
! 81: } elsif ($tagname eq 'li') {
! 82: my $ind_li = last_index_of(\@stack, 'li');
! 83: my $ind_ul = last_index_of(\@stack, 'ul');
! 84: my $ind_ol = last_index_of(\@stack, 'ol');
! 85: if ($ind_li != -1 && ($ind_ul == -1 || $ind_ul < $ind_li) && ($ind_ol == -1 || $ind_ol < $ind_li)) {
! 86: end('li');
! 87: }
! 88: } elsif ($tagname eq 'tr') {
! 89: my $ind_table = last_index_of(\@stack, 'table');
! 90: my $ind_tr = last_index_of(\@stack, 'tr');
! 91: if ($ind_tr != -1 && ($ind_table == -1 || $ind_table < $ind_tr)) {
! 92: end('tr');
! 93: }
! 94: } elsif ($tagname eq 'td' || $tagname eq 'th') {
! 95: my $ind_table = last_index_of(\@stack, 'table');
! 96: my $ind_td = last_index_of(\@stack, 'td');
! 97: my $ind_th = last_index_of(\@stack, 'th');
! 98: my $ind_tr = last_index_of(\@stack, 'tr');
! 99: if ($ind_tr == -1 || ($ind_table != -1 && $ind_table > $ind_tr)) {
! 100: start('tr');
! 101: $ind_tr = last_index_of(\@stack, 'tr');
! 102: }
! 103: if ($ind_td != -1 && $ind_tr < $ind_td) {
! 104: end('td');
! 105: } elsif ($ind_th != -1 && $ind_tr < $ind_th) {
! 106: end('th');
! 107: }
! 108: } elsif ($tagname eq 'dd' || $tagname eq 'dt') {
! 109: my $ind_dd = last_index_of(\@stack, 'dd');
! 110: my $ind_dt = last_index_of(\@stack, 'dt');
! 111: my $ind_dl = last_index_of(\@stack, 'dl');
! 112: if ($ind_dl == -1) {
! 113: start('dl');
! 114: $ind_dl = last_index_of(\@stack, 'dl');
! 115: }
! 116: if ($ind_dd != -1 && ($ind_dl == -1 || $ind_dl < $ind_dd)) {
! 117: end('dd');
! 118: } elsif ($ind_dt != -1 && ($ind_dl == -1 || $ind_dl < $ind_dt)) {
! 119: end('dt');
! 120: }
! 121: } elsif ($tagname eq 'option') {
! 122: my $ind_option = last_index_of(\@stack, 'option');
! 123: if ($ind_option != -1) {
! 124: end('option');
! 125: }
! 126: } elsif ($tagname eq 'area') {
! 127: my $ind_area = last_index_of(\@stack, 'area');
! 128: if ($ind_area != -1) {
! 129: end('area');
! 130: }
! 131: } elsif ($tagname eq 'a') {
! 132: my $ind_a = last_index_of(\@stack, 'a');
! 133: if ($ind_a != -1) {
! 134: end('a');
! 135: }
! 136: } elsif ($tagname eq 'num') {
! 137: my $ind_num = last_index_of(\@stack, 'num');
! 138: if ($ind_num != -1) {
! 139: end('num');
! 140: }
! 141: }
! 142:
! 143: # HTML interpretation of non-closing elements and style is too complex (and error-prone, anyway).
! 144: # Since LON-CAPA elements are all supposed to be closed, this interpretation is SGML-like instead.
! 145: # Paragraphs inside paragraphs will be fixed later.
! 146:
! 147: # my @styles = ();
! 148: # if ($tagname eq 'p') {
! 149: # for (my $i=scalar(@stack)-1; $i>=0; $i--) {
! 150: # if ($stack[$i] eq 'p') {
! 151: # # save the styles
! 152: # for (my $j=$i+1; $j<scalar(@stack); $j++) {
! 153: # if (index_of(['b','i','em','strong','sub','sup'], $stack[$j]) != -1) {
! 154: # push(@styles, $stack[$j]);
! 155: # }
! 156: # }
! 157: # # close the p
! 158: # end('p');
! 159: # last;
! 160: # } elsif (index_of(\@block_html, $stack[$i]) != -1) {
! 161: # # stop looking
! 162: # last;
! 163: # }
! 164: # }
! 165: # }
! 166: $result .= '<'.$tagname;
! 167: my %seen = ();
! 168: foreach my $att_name (@$attrseq) {
! 169: my $att_name_modified = $att_name;
! 170: $att_name_modified =~ s/[^\-a-zA-Z0-9_:.]//g;
! 171: $att_name_modified =~ s/^[\-.0-9]*//;
! 172: if ($att_name_modified ne '' && index($att_name_modified, ':') == -1) {
! 173: if ($seen{$att_name_modified}) {
! 174: print "Warning: Ignoring duplicate attribute: $att_name\n";
! 175: next;
! 176: }
! 177: $seen{$att_name_modified}++;
! 178: my $att_value = $attr->{$att_name};
! 179: $att_value =~ s/^[“”]|[“”]$//g;
! 180: $att_value =~ s/&/&/g;
! 181: $att_value =~ s/</</g;
! 182: $att_value =~ s/>/>/g;
! 183: $att_value =~ s/"/"/g;
! 184: if ($tagname eq 'embed' && $att_name_modified eq 'script') {
! 185: # newlines are encoded to preserve Protein Explorer scripts in embed script attributes:
! 186: $att_value =~ s/\x0A/
/g;
! 187: $att_value =~ s/\x0D/
/g;
! 188: }
! 189: if ($att_name_modified eq 'xmlns' && ($att_value eq 'http://www.w3.org/1999/xhtml' ||
! 190: $att_value eq 'http://www.w3.org/TR/REC-html40')) {
! 191: next;
! 192: }
! 193: $result .= ' '.$att_name_modified.'="'.$att_value.'"';
! 194: }
! 195: }
! 196: if (index_of(\@empty, $tagname) != -1) {
! 197: $result .= '/>';
! 198: } else {
! 199: $result .= '>';
! 200: push(@stack, $tagname);
! 201: if (scalar(@stack) > 500) {
! 202: die "This document has a crazy depth - I'm out !";
! 203: }
! 204: }
! 205: # reopen the styles, if any
! 206: #for (my $j=0; $j<scalar(@styles); $j++) {
! 207: # start($styles[$j], {}, ());
! 208: #}
! 209: }
! 210:
! 211: sub end {
! 212: my($tagname) = @_;
! 213:
! 214: if ($tagname eq 'o:p') {
! 215: return;
! 216: }
! 217:
! 218: $tagname = fix_tag($tagname);
! 219: if (index_of(\@empty, $tagname) != -1) {
! 220: return;
! 221: }
! 222: if ($tagname eq 'td' && scalar(@stack) > 0 && $stack[scalar(@stack)-1] eq 'th') {
! 223: # handle <th>text</td> as if it was <th>text</th>
! 224: $tagname = 'th';
! 225: } elsif ($tagname eq 'th' && scalar(@stack) > 0 && $stack[scalar(@stack)-1] eq 'td') {
! 226: # handle <td>text</th> as if it was <td>text</td>
! 227: $tagname = 'td';
! 228: }
! 229: my $found = 0;
! 230: for (my $i=scalar(@stack)-1; $i>=0; $i--) {
! 231: if ($stack[$i] eq $tagname) {
! 232: for (my $j=scalar(@stack)-1; $j>$i; $j--) {
! 233: if ($close_warning ne '') {
! 234: $close_warning .= ', ';
! 235: }
! 236: $close_warning .= $stack[$j];
! 237: $result .= '</'.$stack[$j].'>';
! 238: }
! 239: splice(@stack, $i, scalar(@stack)-$i);
! 240: $found = 1;
! 241: last;
! 242: } elsif (index_of(\@stack, 'web') != -1) {
! 243: die "There is a web element with missing end tags inside - it has to be fixed by hand";
! 244: }
! 245: }
! 246: if ($found) {
! 247: $result .= '</'.$tagname.'>';
! 248: } elsif ($tagname eq 'p') {
! 249: $result .= '<p/>';
! 250: }
! 251: }
! 252:
! 253: sub text {
! 254: my($dtext) = @_;
! 255: $dtext =~ s/&/&/g;
! 256: $dtext =~ s/</</g;
! 257: $dtext =~ s/>/>/g;
! 258: $dtext =~ s/"/"/g;
! 259: $result .= $dtext;
! 260: }
! 261:
! 262: sub comment {
! 263: my($tokens) = @_;
! 264: # NOTE: the HTML parser thinks this is a comment: </ br>
! 265: # and LON-CAPA has sometimes turned that into <![CDATA[</ br>]]>
! 266: foreach my $comment (@$tokens) {
! 267: $comment =~ s/--/- /g;
! 268: $comment =~ s/^-|-$/ /g;
! 269: $result .= '<!--'.$comment.'-->';
! 270: }
! 271: }
! 272:
! 273: sub declaration {
! 274: my($tokens) = @_;
! 275: # ignore them
! 276: #$result .= '<!';
! 277: #$result .= join(' ', @$tokens);
! 278: #$result .= '>';
! 279: }
! 280:
! 281: sub process {
! 282: my($token0) = @_;
! 283: if ($token0 ne '') {
! 284: $result .= '<?'.$token0.'>';
! 285: }
! 286: }
! 287:
! 288: sub index_of {
! 289: my ($array, $value) = @_;
! 290: for (my $i=0; $i<scalar(@{$array}); $i++) {
! 291: if ($array->[$i] eq $value) {
! 292: return $i;
! 293: }
! 294: }
! 295: return -1;
! 296: }
! 297:
! 298: sub last_index_of {
! 299: my ($array, $value) = @_;
! 300: for (my $i=scalar(@{$array})-1; $i>=0; $i--) {
! 301: if ($array->[$i] eq $value) {
! 302: return $i;
! 303: }
! 304: }
! 305: return -1;
! 306: }
! 307:
! 308: sub fix_tag {
! 309: my ($tag) = @_;
! 310: #$tag = lc($tag); this is done by default by the parser
! 311: if ($tag !~ /^[a-zA-Z_][a-zA-Z0-9_\-\.]*$/) {
! 312: print "Warning: bad start tag:'".$tag."'";
! 313: if ($tag =~ /<[a-zA-Z]/) {
! 314: $tag =~ s/^[^<]*<//; # a<b -> b
! 315: }
! 316: if ($tag =~ /[a-zA-Z]=/) {
! 317: $tag =~ s/=.*$//; # a=b -> a
! 318: }
! 319: if ($tag =~ /[a-zA-Z]\//) {
! 320: $tag =~ s/\/.*$//; # a/b -> a
! 321: }
! 322: if ($tag =~ /:/) {
! 323: # a:b -> b except when : at the end
! 324: if ($tag =~ /^[^:]*:$/) {
! 325: $tag =~ s/://;
! 326: } else {
! 327: $tag =~ s/^.*://;
! 328: }
! 329: }
! 330: $tag =~ s/^[0-9\-\.]+//;
! 331: $tag =~ s/[^a-zA-Z0-9_\-\.]//g;
! 332: print " (converted to $tag)\n";
! 333: }
! 334: return($tag);
! 335: }
! 336:
! 337:
! 338: ##
! 339: # Tests if a string is in an array (using eq) (to avoid Smartmatch warnings with $value ~~ @array)
! 340: # @param {Array<string>} array - reference to the array of strings
! 341: # @param {string} value - the string to look for
! 342: # @returns 1 if found, 0 otherwise
! 343: ##
! 344: sub string_in_array {
! 345: my ($array, $value) = @_;
! 346: foreach my $v (@{$array}) {
! 347: if ($v eq $value) {
! 348: return 1;
! 349: }
! 350: }
! 351: return 0;
! 352: }
! 353:
! 354:
! 355: 1;
! 356: __END__
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>