loncom/homework/cleanxml/html_to_xml.pm - annotate

Return to html_to_xml.pm CVS log
Up to [LON-CAPA] / loncom / homework / cleanxml
Annotation of loncom/homework/cleanxml/html_to_xml.pm, revision 1.2

1.1       damieng     1: # The LearningOnline Network
                      2: # Second step to clean a file.
                      3: #
1.2     ! damieng     4: # $Id: html_to_xml.pm,v 1.1 2015/12/03 20:40:31 damieng Exp $
1.1       damieng     5: #
                      6: # Copyright Michigan State University Board of Trustees
                      7: #
                      8: # This file is part of the LearningOnline Network with CAPA (LON-CAPA).
                      9: #
                     10: # LON-CAPA is free software; you can redistribute it and/or modify
                     11: # it under the terms of the GNU General Public License as published by
                     12: # the Free Software Foundation; either version 2 of the License, or
                     13: # (at your option) any later version.
                     14: #
                     15: # LON-CAPA is distributed in the hope that it will be useful,
                     16: # but WITHOUT ANY WARRANTY; without even the implied warranty of
                     17: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
                     18: # GNU General Public License for more details.
                     19: #
                     20: # You should have received a copy of the GNU General Public License
                     21: # along with LON-CAPA; if not, write to the Free Software
                     22: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
                     23: #
                     24: # /home/httpd/html/adm/gpl.txt
                     25: #
                     26: # http://www.lon-capa.org/
                     27: #
                     28: ###
                     29: 
                     30: #!/usr/bin/perl
                     31: 
                     32: 
                     33: package Apache::html_to_xml;
                     34: 
                     35: use strict;
                     36: use utf8;
                     37: use warnings;
                     38: use HTML::Parser ();
                     39: 
                     40: # always closing, end tags are ignored:
                     41: my @empty = ('base','br','col','hr','img','input','keygen','link','meta','param','source','track','wbr', 'frame', 'embed','startouttext','endouttext');
                     42: 
                     43: #my @block_html = ('html','body','h1','h2','h3','h4','h5','h6','div','p','ul','ol','table','tbody','tr','td','th','dl','pre','noscript','blockquote','object','applet','embed','map','form','fieldset','iframe');
                     44: 
                     45: 
                     46: my $result;
                     47: my @stack;
                     48: my $close_warning;
                     49: my $warnings; # 1 = print warnings
                     50: 
                     51: 
                     52: # This takes non-well-formed UTF-8 LC+HTML and returns well-formed but non-valid XML LC+XHTML.
                     53: sub html_to_xml {
1.2     ! damieng    54:   my($textref, $warn, $case_sensitive) = @_;
1.1       damieng    55:   $warnings = $warn;
1.2     ! damieng    56:   if (!defined $case_sensitive) {
        !            57:     $case_sensitive = 0;
        !            58:   }
1.1       damieng    59:   $result = '';
                     60:   @stack = ();
                     61:   $close_warning = '';
                     62:   my $p = HTML::Parser->new( api_version => 3,
                     63:                           start_h => [\&start, "tagname, attr, attrseq"],
                     64:                           end_h   => [\&end,   "tagname"],
                     65:                           text_h  => [\&text, "dtext"],
                     66:                           comment_h  => [\&comment, "tokens"],
                     67:                           declaration_h  => [\&declaration, "tokens"],
                     68:                           process_h  => [\&process, "token0"],
                     69:                         );
                     70:   # NOTE: by default, the HTML parser turns all attribute and elements names to lowercase
1.2     ! damieng    71:   # This is a problem with the Task elements, so it is disabled in that case
        !            72:   if ($case_sensitive) {
        !            73:     $p->case_sensitive(1);
        !            74:   }
1.1       damieng    75:   $p->empty_element_tags(1);
                     76:   $result .= "<?xml version='1.0' encoding='UTF-8'?>\n";
                     77:   $p->parse($$textref);
                     78:   for (my $i=scalar(@stack)-1; $i>=0; $i--) {
                     79:     if ($close_warning ne '') {
                     80:       $close_warning .= ', ';
                     81:     }
                     82:     $close_warning .= $stack[$i];
                     83:     $result .= '</'.$stack[$i].'>';
                     84:   }
                     85:   if ($warnings && $close_warning ne '') {
                     86:     print "Warning: the parser had to add closing tags to understand the document ($close_warning)\n";
                     87:   }
                     88:   return \$result;
                     89: }
                     90: 
                     91: sub start {
                     92:   my($tagname, $attr, $attrseq) = @_;
                     93:   
                     94:   # NOTE: we could do things more like web browsers, but I'm nore sure the result would be better with LON-CAPA files
                     95:   # (in problem files there are not so many missing tags)
                     96:   # See http://www.w3.org/TR/html5/syntax.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser
                     97:   
                     98:   if ($tagname eq 'o:p') {
                     99:     return;
                    100:   }
                    101:   
                    102:   if ($tagname =~ /@.*\.[a-z]{2,3}$/) { # email <name@hostname>
                    103:     $result .= "&lt;$tagname&gt;";
                    104:     return;
                    105:   }
                    106:   
                    107:   #$tagname = lc($tagname); this is done by default by the parser
                    108:   $tagname = fix_tag($tagname);
                    109:   if (scalar(@stack) > 0 && $stack[scalar(@stack)-1] eq 'tr' && $tagname ne 'tr' && $tagname ne 'td' && $tagname ne 'th' &&
                    110:       !string_in_array(['part','block','comment','endouttext','problemtype','standalone','startouttext','tex','translated','web','while','randomlist','font','b','form'], $tagname)) {
                    111:     # NOTE: a 'part' or 'block' element between tr and td will not be valid, but changing tag order would make things worse
                    112:     # font and b will be removed in post_xml, so we can leave it for now, to handle things like <TR><FONT FACE="Palatino"><TD...
                    113:     # form is to avoid an empty form in some cases (it might not work anyway, but it is better to keep this bug the way it is)
                    114:     if ($warnings) {
                    115:       print "Warning: a <td> tag was added because a $tagname element was directly under a tr\n";
                    116:     }
                    117:     start('td');
                    118:   }
                    119:   if ($tagname eq 'p' && scalar(@stack) > 0 && $stack[scalar(@stack)-1] eq 'p') {
                    120:     end('p');
                    121:   } elsif ($tagname eq 'li') {
                    122:     my $ind_li = last_index_of(\@stack, 'li');
                    123:     my $ind_ul = last_index_of(\@stack, 'ul');
                    124:     my $ind_ol = last_index_of(\@stack, 'ol');
                    125:     if ($ind_li != -1 && ($ind_ul == -1 || $ind_ul < $ind_li) && ($ind_ol == -1 || $ind_ol < $ind_li)) {
                    126:       end('li');
                    127:     }
                    128:   } elsif ($tagname eq 'tr') {
                    129:     my $ind_table = last_index_of(\@stack, 'table');
                    130:     my $ind_tr = last_index_of(\@stack, 'tr');
                    131:     if ($ind_tr != -1 && ($ind_table == -1 || $ind_table < $ind_tr)) {
                    132:       end('tr');
                    133:     }
                    134:   } elsif ($tagname eq 'td' || $tagname eq 'th') {
                    135:     my $ind_table = last_index_of(\@stack, 'table');
                    136:     my $ind_td = last_index_of(\@stack, 'td');
                    137:     my $ind_th = last_index_of(\@stack, 'th');
                    138:     my $ind_tr = last_index_of(\@stack, 'tr');
                    139:     if ($ind_tr == -1 || ($ind_table != -1 && $ind_table > $ind_tr)) {
                    140:       start('tr');
                    141:       $ind_tr = last_index_of(\@stack, 'tr');
                    142:     }
                    143:     if ($ind_td != -1 && $ind_tr < $ind_td) {
                    144:       end('td');
                    145:     } elsif ($ind_th != -1 && $ind_tr < $ind_th) {
                    146:       end('th');
                    147:     }
                    148:   } elsif ($tagname eq 'dd' || $tagname eq 'dt') {
                    149:     my $ind_dd = last_index_of(\@stack, 'dd');
                    150:     my $ind_dt = last_index_of(\@stack, 'dt');
                    151:     my $ind_dl = last_index_of(\@stack, 'dl');
                    152:     if ($ind_dl == -1) {
                    153:       start('dl');
                    154:       $ind_dl = last_index_of(\@stack, 'dl');
                    155:     }
                    156:     if ($ind_dd != -1 && ($ind_dl == -1 || $ind_dl < $ind_dd)) {
                    157:       end('dd');
                    158:     } elsif ($ind_dt != -1 && ($ind_dl == -1 || $ind_dl < $ind_dt)) {
                    159:       end('dt');
                    160:     }
                    161:   } elsif ($tagname eq 'option') {
                    162:     my $ind_option = last_index_of(\@stack, 'option');
                    163:     if ($ind_option != -1) {
                    164:       end('option');
                    165:     }
                    166:   } elsif ($tagname eq 'area') {
                    167:     my $ind_area = last_index_of(\@stack, 'area');
                    168:     if ($ind_area != -1) {
                    169:       end('area');
                    170:     }
                    171:   } elsif ($tagname eq 'a') {
                    172:     my $ind_a = last_index_of(\@stack, 'a');
                    173:     if ($ind_a != -1) {
                    174:       end('a');
                    175:     }
                    176:   } elsif ($tagname eq 'num') {
                    177:     my $ind_num = last_index_of(\@stack, 'num');
                    178:     if ($ind_num != -1) {
                    179:       end('num');
                    180:     }
                    181:   }
                    182: 
                    183: # HTML interpretation of non-closing elements and style is too complex (and error-prone, anyway).
                    184: # Since LON-CAPA elements are all supposed to be closed, this interpretation is SGML-like instead.
                    185: # Paragraphs inside paragraphs will be fixed later.
                    186: 
                    187: #   my @styles = ();
                    188: #   if ($tagname eq 'p') {
                    189: #     for (my $i=scalar(@stack)-1; $i>=0; $i--) {
                    190: #       if ($stack[$i] eq 'p') {
                    191: #         # save the styles
                    192: #         for (my $j=$i+1; $j<scalar(@stack); $j++) {
                    193: #           if (index_of(['b','i','em','strong','sub','sup'], $stack[$j]) != -1) {
                    194: #             push(@styles, $stack[$j]);
                    195: #           }
                    196: #         }
                    197: #         # close the p
                    198: #         end('p');
                    199: #         last;
                    200: #       } elsif (index_of(\@block_html, $stack[$i]) != -1) {
                    201: #         # stop looking
                    202: #         last;
                    203: #       }
                    204: #     }
                    205: #   }
                    206:   $result .= '<'.$tagname;
                    207:   my %seen = ();
                    208:   foreach my $att_name (@$attrseq) {
                    209:     my $att_name_modified = $att_name;
                    210:     $att_name_modified =~ s/[^\-a-zA-Z0-9_:.]//g;
                    211:     $att_name_modified =~ s/^[\-.0-9]*//;
                    212:     if ($att_name_modified ne '' && index($att_name_modified, ':') == -1) {
                    213:       if ($seen{$att_name_modified}) {
                    214:         if ($warnings) {
                    215:           print "Warning: Ignoring duplicate attribute: $att_name\n";
                    216:         }
                    217:         next;
                    218:       }
                    219:       $seen{$att_name_modified}++;
                    220:       my $att_value = $attr->{$att_name};
                    221:       $att_value =~ s/^[“”]|[“”]$//g;
                    222:       $att_value =~ s/&/&amp;/g;
                    223:       $att_value =~ s/</&lt;/g;
                    224:       $att_value =~ s/>/&gt;/g;
                    225:       $att_value =~ s/"/&quot;/g;
                    226:       if ($tagname eq 'embed' && $att_name_modified eq 'script') {
                    227:         # newlines are encoded to preserve Protein Explorer scripts in embed script attributes:
                    228:         $att_value =~ s/\x0A/&#xA;/g;
                    229:         $att_value =~ s/\x0D/&#xD;/g;
                    230:       }
                    231:       if ($att_name_modified eq 'xmlns' && ($att_value eq 'http://www.w3.org/1999/xhtml' ||
                    232:           $att_value eq 'http://www.w3.org/TR/REC-html40')) {
                    233:         next;
                    234:       }
                    235:       $result .= ' '.$att_name_modified.'="'.$att_value.'"';
                    236:     }
                    237:   }
                    238:   if (index_of(\@empty, $tagname) != -1) {
                    239:     $result .= '/>';
                    240:   } else {
                    241:     $result .= '>';
                    242:     push(@stack, $tagname);
                    243:     if (scalar(@stack) > 500) {
                    244:       die "This document has a crazy depth - I'm out !";
                    245:     }
                    246:   }
                    247:   # reopen the styles, if any
                    248:   #for (my $j=0; $j<scalar(@styles); $j++) {
                    249:   #  start($styles[$j], {}, ());
                    250:   #}
                    251: }
                    252: 
                    253: sub end {
                    254:   my($tagname) = @_;
                    255:   
                    256:   if ($tagname eq 'o:p') {
                    257:     return;
                    258:   }
                    259:   
                    260:   $tagname = fix_tag($tagname);
                    261:   if (index_of(\@empty, $tagname) != -1) {
                    262:     return;
                    263:   }
                    264:   if ($tagname eq 'td' && scalar(@stack) > 0 && $stack[scalar(@stack)-1] eq 'th') {
                    265:     # handle <th>text</td> as if it was <th>text</th>
                    266:     $tagname = 'th';
                    267:   } elsif ($tagname eq 'th' && scalar(@stack) > 0 && $stack[scalar(@stack)-1] eq 'td') {
                    268:     # handle <td>text</th> as if it was <td>text</td>
                    269:     $tagname = 'td';
                    270:   }
                    271:   my $found = 0;
                    272:   for (my $i=scalar(@stack)-1; $i>=0; $i--) {
                    273:     if ($stack[$i] eq $tagname) {
                    274:       for (my $j=scalar(@stack)-1; $j>$i; $j--) {
                    275:         if ($close_warning ne '') {
                    276:           $close_warning .= ', ';
                    277:         }
                    278:         $close_warning .= $stack[$j];
                    279:         $result .= '</'.$stack[$j].'>';
                    280:       }
                    281:       splice(@stack, $i, scalar(@stack)-$i);
                    282:       $found = 1;
                    283:       last;
                    284:     } elsif (index_of(\@stack, 'web') != -1) {
                    285:       die "There is a web element with missing end tags inside - it has to be fixed by hand";
                    286:     }
                    287:   }
                    288:   if ($found) {
                    289:     $result .= '</'.$tagname.'>';
                    290:   } elsif ($tagname eq 'p') {
                    291:     $result .= '<p/>';
                    292:   }
                    293: }
                    294: 
                    295: sub text {
                    296:   my($dtext) = @_;
                    297:   $dtext =~ s/&/&amp;/g;
                    298:   $dtext =~ s/</&lt;/g;
                    299:   $dtext =~ s/>/&gt;/g;
                    300:   $dtext =~ s/"/&quot;/g;
                    301:   $result .= $dtext;
                    302: }
                    303: 
                    304: sub comment {
                    305:   my($tokens) = @_;
                    306:   # NOTE: the HTML parser thinks this is a comment: </ br>
                    307:   # and LON-CAPA has sometimes turned that into <![CDATA[</ br>]]>
                    308:   foreach my $comment (@$tokens) {
                    309:     $comment =~ s/--/- /g;
                    310:     $comment =~ s/^-|-$/ /g;
                    311:     $result .= '<!--'.$comment.'-->';
                    312:   }
                    313: }
                    314: 
                    315: sub declaration {
                    316:   my($tokens) = @_;
                    317:   # ignore them
                    318:   #$result .= '<!';
                    319:   #$result .= join(' ', @$tokens);
                    320:   #$result .= '>';
                    321: }
                    322: 
                    323: sub process {
                    324:   my($token0) = @_;
                    325:   if ($token0 ne '') {
                    326:     $result .= '<?'.$token0.'>';
                    327:   }
                    328: }
                    329: 
                    330: sub index_of {
                    331:   my ($array, $value) = @_;
                    332:   for (my $i=0; $i<scalar(@{$array}); $i++) {
                    333:     if ($array->[$i] eq $value) {
                    334:       return $i;
                    335:     }
                    336:   }
                    337:   return -1;
                    338: }
                    339: 
                    340: sub last_index_of {
                    341:   my ($array, $value) = @_;
                    342:   for (my $i=scalar(@{$array})-1; $i>=0; $i--) {
                    343:     if ($array->[$i] eq $value) {
                    344:       return $i;
                    345:     }
                    346:   }
                    347:   return -1;
                    348: }
                    349: 
                    350: sub fix_tag {
                    351:   my ($tag) = @_;
                    352:   #$tag = lc($tag); this is done by default by the parser
                    353:   if ($tag !~ /^[a-zA-Z_][a-zA-Z0-9_\-\.]*$/) {
                    354:     if ($warnings) {
                    355:       print "Warning: bad start tag:'".$tag."'";
                    356:     }
                    357:     if ($tag =~ /<[a-zA-Z]/) {
                    358:       $tag =~ s/^[^<]*<//; # a<b -> b
                    359:     }
                    360:     if ($tag =~ /[a-zA-Z]=/) {
                    361:       $tag =~ s/=.*$//; # a=b -> a
                    362:     }
                    363:     if ($tag =~ /[a-zA-Z]\//) {
                    364:       $tag =~ s/\/.*$//; # a/b -> a
                    365:     }
                    366:     if ($tag =~ /:/) {
                    367:       # a:b -> b except when : at the end
                    368:       if ($tag =~ /^[^:]*:$/) {
                    369:         $tag =~ s/://;
                    370:       } else {
                    371:         $tag =~ s/^.*://;
                    372:       }
                    373:     }
                    374:     $tag =~ s/^[0-9\-\.]+//;
                    375:     $tag =~ s/[^a-zA-Z0-9_\-\.]//g;
                    376:     if ($warnings) {
                    377:       print " (converted to $tag)\n";
                    378:     }
                    379:   }
                    380:   return($tag);
                    381: }
                    382: 
                    383: 
                    384: ##
                    385: # Tests if a string is in an array (using eq) (to avoid Smartmatch warnings with $value ~~ @array)
                    386: # @param {Array<string>} array - reference to the array of strings
                    387: # @param {string} value - the string to look for
                    388: # @returns 1 if found, 0 otherwise
                    389: ##
                    390: sub string_in_array {
                    391:   my ($array, $value) = @_;
                    392:   foreach my $v (@{$array}) {
                    393:     if ($v eq $value) {
                    394:       return 1;
                    395:     }
                    396:   }
                    397:   return 0;
                    398: }
                    399: 
                    400: 
                    401: 1;
                    402: __END__
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>