modules/damieng/clean_xml/html_to_xml.pm - view

File: [LON-CAPA] / modules / damieng / clean_xml / html_to_xml.pm
Revision 1.1: download - view: text, annotated - select for diffs
Fri Apr 17 15:35:01 2015 UTC (9 years, 6 months ago) by damieng
Branches: MAIN
CVS tags: HEAD

added clean_xml and graphical_editor

#!/usr/bin/perl package html_to_xml; use strict; use utf8; use warnings; use HTML::Parser (); # always closing, end tags are ignored: my @empty = ('base','br','col','hr','img','input','keygen','link','meta','param','source','track','wbr', 'frame', 'embed','startouttext','endouttext'); #my @block_html = ('html','body','h1','h2','h3','h4','h5','h6','div','p','ul','ol','table','tbody','tr','td','th','dl','pre','noscript','blockquote','object','applet','embed','map','form','fieldset','iframe'); my $result; my @stack; my $close_warning; # This takes non-well-formed UTF-8 LC+HTML and returns well-formed but non-valid XML LC+XHTML. sub html_to_xml { my($textref) = @_; $result = ''; @stack = (); $close_warning = ''; my $p = HTML::Parser->new( api_version => 3, start_h => [\&start, "tagname, attr, attrseq"], end_h => [\&end, "tagname"], text_h => [\&text, "dtext"], comment_h => [\&comment, "tokens"], declaration_h => [\&declaration, "tokens"], process_h => [\&process, "token0"], ); # NOTE: by default, the HTML parser turns all attribute and elements names to lowercase $p->empty_element_tags(1); $result .= "<?xml version='1.0' encoding='UTF-8'?>\n"; $p->parse($$textref); for (my $i=scalar(@stack)-1; $i>=0; $i--) { if ($close_warning ne '') { $close_warning .= ', '; } $close_warning .= $stack[$i]; $result .= '</'.$stack[$i].'>'; } if ($close_warning ne '') { print "Warning: the parser had to add closing tags to understand the document ($close_warning)\n"; } return \$result; } sub start { my($tagname, $attr, $attrseq) = @_; # NOTE: we could do things more like web browsers, but I'm nore sure the result would be better with LON-CAPA files # (in problem files there are not so many missing tags) # See http://www.w3.org/TR/html5/syntax.html#an-introduction-to-error-handling-and-strange-cases-in-the-parser if ($tagname eq 'o:p') { return; } if ($tagname =~ /@.*\.[a-z]{2,3}$/) { # email <name@hostname> $result .= "<$tagname>"; return; } #$tagname = lc($tagname); this is done by default by the parser $tagname = fix_tag($tagname); if (scalar(@stack) > 0 && $stack[scalar(@stack)-1] eq 'tr' && $tagname ne 'tr' && $tagname ne 'td' && $tagname ne 'th' && !string_in_array(['part','block','comment','endouttext','problemtype','standalone','startouttext','tex','translated','web','while','randomlist','font','b','form'], $tagname)) { # NOTE: a 'part' or 'block' element between tr and td will not be valid, but changing tag order would make things worse # font and b will be removed in post_xml, so we can leave it for now, to handle things like <TR><TD... # form is to avoid an empty form in some cases (it might not work anyway, but it is better to keep this bug the way it is) print "Warning: a <td> tag was added because a $tagname element was directly under a tr\n"; start('td'); } if ($tagname eq 'p' && scalar(@stack) > 0 && $stack[scalar(@stack)-1] eq 'p') { end('p'); } elsif ($tagname eq 'li') { my $ind_li = last_index_of(\@stack, 'li'); my $ind_ul = last_index_of(\@stack, 'ul'); my $ind_ol = last_index_of(\@stack, 'ol'); if ($ind_li != -1 && ($ind_ul == -1 || $ind_ul < $ind_li) && ($ind_ol == -1 || $ind_ol < $ind_li)) { end('li'); } } elsif ($tagname eq 'tr') { my $ind_table = last_index_of(\@stack, 'table'); my $ind_tr = last_index_of(\@stack, 'tr'); if ($ind_tr != -1 && ($ind_table == -1 || $ind_table < $ind_tr)) { end('tr'); } } elsif ($tagname eq 'td' || $tagname eq 'th') { my $ind_table = last_index_of(\@stack, 'table'); my $ind_td = last_index_of(\@stack, 'td'); my $ind_th = last_index_of(\@stack, 'th'); my $ind_tr = last_index_of(\@stack, 'tr'); if ($ind_tr == -1 || ($ind_table != -1 && $ind_table > $ind_tr)) { start('tr'); $ind_tr = last_index_of(\@stack, 'tr'); } if ($ind_td != -1 && $ind_tr < $ind_td) { end('td'); } elsif ($ind_th != -1 && $ind_tr < $ind_th) { end('th'); } } elsif ($tagname eq 'dd' || $tagname eq 'dt') { my $ind_dd = last_index_of(\@stack, 'dd'); my $ind_dt = last_index_of(\@stack, 'dt'); my $ind_dl = last_index_of(\@stack, 'dl'); if ($ind_dl == -1) { start('dl'); $ind_dl = last_index_of(\@stack, 'dl'); } if ($ind_dd != -1 && ($ind_dl == -1 || $ind_dl < $ind_dd)) { end('dd'); } elsif ($ind_dt != -1 && ($ind_dl == -1 || $ind_dl < $ind_dt)) { end('dt'); } } elsif ($tagname eq 'option') { my $ind_option = last_index_of(\@stack, 'option'); if ($ind_option != -1) { end('option'); } } elsif ($tagname eq 'area') { my $ind_area = last_index_of(\@stack, 'area'); if ($ind_area != -1) { end('area'); } } elsif ($tagname eq 'a') { my $ind_a = last_index_of(\@stack, 'a'); if ($ind_a != -1) { end('a'); } } elsif ($tagname eq 'num') { my $ind_num = last_index_of(\@stack, 'num'); if ($ind_num != -1) { end('num'); } } # HTML interpretation of non-closing elements and style is too complex (and error-prone, anyway). # Since LON-CAPA elements are all supposed to be closed, this interpretation is SGML-like instead. # Paragraphs inside paragraphs will be fixed later. # my @styles = (); # if ($tagname eq 'p') { # for (my $i=scalar(@stack)-1; $i>=0; $i--) { # if ($stack[$i] eq 'p') { # # save the styles # for (my $j=$i+1; $j<scalar(@stack); $j++) { # if (index_of(['b','i','em','strong','sub','sup'], $stack[$j]) != -1) { # push(@styles, $stack[$j]); # } # } # # close the p # end('p'); # last; # } elsif (index_of(\@block_html, $stack[$i]) != -1) { # # stop looking # last; # } # } # } $result .= '<'.$tagname; my %seen = (); foreach my $att_name (@$attrseq) { my $att_name_modified = $att_name; $att_name_modified =~ s/[^\-a-zA-Z0-9_:.]//g; $att_name_modified =~ s/^[\-.0-9]*//; if ($att_name_modified ne '' && index($att_name_modified, ':') == -1) { if ($seen{$att_name_modified}) { print "Warning: Ignoring duplicate attribute: $att_name\n"; next; } $seen{$att_name_modified}++; my $att_value = $attr->{$att_name}; $att_value =~ s/^[“”]|[“”]$//g; $att_value =~ s/&/&/g; $att_value =~ s/</</g; $att_value =~ s/>/>/g; $att_value =~ s/"/"/g; if ($tagname eq 'embed' && $att_name_modified eq 'script') { # newlines are encoded to preserve Protein Explorer scripts in embed script attributes: $att_value =~ s/\x0A/
/g; $att_value =~ s/\x0D//g; } if ($att_name_modified eq 'xmlns' && ($att_value eq 'http://www.w3.org/1999/xhtml' || $att_value eq 'http://www.w3.org/TR/REC-html40')) { next; } $result .= ' '.$att_name_modified.'="'.$att_value.'"'; } } if (index_of(\@empty, $tagname) != -1) { $result .= '/>'; } else { $result .= '>'; push(@stack, $tagname); if (scalar(@stack) > 500) { die "This document has a crazy depth - I'm out !"; } } # reopen the styles, if any #for (my $j=0; $j<scalar(@styles); $j++) { # start($styles[$j], {}, ()); #} } sub end { my($tagname) = @_; if ($tagname eq 'o:p') { return; } $tagname = fix_tag($tagname); if (index_of(\@empty, $tagname) != -1) { return; } if ($tagname eq 'td' && scalar(@stack) > 0 && $stack[scalar(@stack)-1] eq 'th') { # handle <th>text</td> as if it was <th>text</th> $tagname = 'th'; } elsif ($tagname eq 'th' && scalar(@stack) > 0 && $stack[scalar(@stack)-1] eq 'td') { # handle <td>text</th> as if it was <td>text</td> $tagname = 'td'; } my $found = 0; for (my $i=scalar(@stack)-1; $i>=0; $i--) { if ($stack[$i] eq $tagname) { for (my $j=scalar(@stack)-1; $j>$i; $j--) { if ($close_warning ne '') { $close_warning .= ', '; } $close_warning .= $stack[$j]; $result .= '</'.$stack[$j].'>'; } splice(@stack, $i, scalar(@stack)-$i); $found = 1; last; } elsif (index_of(\@stack, 'web') != -1) { die "There is a web element with missing end tags inside - it has to be fixed by hand"; } } if ($found) { $result .= '</'.$tagname.'>'; } elsif ($tagname eq 'p') { $result .= ''; } } sub text { my($dtext) = @_; $dtext =~ s/&/&/g; $dtext =~ s/</</g; $dtext =~ s/>/>/g; $dtext =~ s/"/"/g; $result .= $dtext; } sub comment { my($tokens) = @_; # NOTE: the HTML parser thinks this is a comment: # and LON-CAPA has sometimes turned that into <![CDATA[]]> foreach my $comment (@$tokens) { $comment =~ s/--/- /g; $comment =~ s/^-|-$/ /g; $result .= ''; } } sub declaration { my($tokens) = @_; # ignore them #$result .= '<!'; #$result .= join(' ', @$tokens); #$result .= '>'; } sub process { my($token0) = @_; if ($token0 ne '') { $result .= '<?'.$token0.'>'; } } sub index_of { my ($array, $value) = @_; for (my $i=0; $i<scalar(@{$array}); $i++) { if ($array->[$i] eq $value) { return $i; } } return -1; } sub last_index_of { my ($array, $value) = @_; for (my $i=scalar(@{$array})-1; $i>=0; $i--) { if ($array->[$i] eq $value) { return $i; } } return -1; } sub fix_tag { my ($tag) = @_; #$tag = lc($tag); this is done by default by the parser if ($tag !~ /^[a-zA-Z_][a-zA-Z0-9_\-\.]*$/) { print "Warning: bad start tag:'".$tag."'"; if ($tag =~ /<[a-zA-Z]/) { $tag =~ s/^[^<]*<//; # a b } if ($tag =~ /[a-zA-Z]=/) { $tag =~ s/=.*$//; # a=b -> a } if ($tag =~ /[a-zA-Z]\//) { $tag =~ s/\/.*$//; # a/b -> a } if ($tag =~ /:/) { # a:b -> b except when : at the end if ($tag =~ /^[^:]*:$/) { $tag =~ s/://; } else { $tag =~ s/^.*://; } } $tag =~ s/^[0-9\-\.]+//; $tag =~ s/[^a-zA-Z0-9_\-\.]//g; print " (converted to $tag)\n"; } return($tag); } ## # Tests if a string is in an array (using eq) (to avoid Smartmatch warnings with $value ~~ @array) # @param {Array<string>} array - reference to the array of strings # @param {string} value - the string to look for # @returns 1 if found, 0 otherwise ## sub string_in_array { my ($array, $value) = @_; foreach my $v (@{$array}) { if ($v eq $value) { return 1; } } return 0; } 1; __END__