modules/damieng/clean_xml/pre_xml.pm - view

File: [LON-CAPA] / modules / damieng / clean_xml / pre_xml.pm
Revision 1.1: download - view: text, annotated - select for diffs
Fri Apr 17 15:35:01 2015 UTC (9 years, 10 months ago) by damieng
Branches: MAIN
CVS tags: HEAD

added clean_xml and graphical_editor

#!/usr/bin/perl package pre_xml; use strict; use utf8; use Encode; use Encode::Byte; use Encode::Guess; # list of elements inside which < and > might not be turned into entities # unfortunately, answer can sometimes contain the elements vector and value... my @cdata_elements = ('answer', 'm', 'display', 'parse'); # not script because the HTML parser will handle it # Reads a LON-CAPA 2 file, guesses the encoding, fixes characters in cdata_elements, fixes HTML entities, # and returns the converted text. sub pre_xml { my ($filepath) = @_; my $lines = guess_encoding_and_read($filepath); remove_control_characters($lines); fix_cdata_elements($lines); fix_html_entities($lines); fix_missing_quotes($lines); fix_empty_li($lines); remove_doctype($lines); add_root($lines, $filepath); return(\join('', @$lines)); } ## # Tries to guess the character encoding, and returns the lines as decoded text. # Requires Encode::Byte. ## sub guess_encoding_and_read { my ($fn) = @_; no warnings "utf8"; local $/ = undef; open(my $fh, "<", $fn) or die "cannot read $fn: $!"; binmode $fh; my $data = <$fh>; # we need to read the whole file to test if font is a block or inline element close $fh; if (index($data, '<') == -1) { die "This file has no markup !"; } # try to get a charset from a meta at the beginning of the file my $beginning = substr($data, 0, 1024); # to avoid a full match; hopefully we won't cut the charset in half if ($beginning =~ /<meta[^>]*charset\s?=\s?([^\n>"';]*)/i) { my $meta_charset = $1; if ($meta_charset ne '') { if ($meta_charset =~ /iso-?8859-?1/i) { # usually a lie $meta_charset = 'cp1252'; } # now try to decode using that encoding my $decoder = guess_encoding($data, ($meta_charset)); if (ref($decoder)) { my $decoded = $decoder->decode($data); my @lines = split(/^/m, $decoded); return \@lines; } else { print "Warning: decoding did not work with the charset defined by the meta ($meta_charset)\n"; } } } my $decoded; if (length($data) > 0) { # NOTE: this list is too ambigous, Encode::Guess refuses to even try a guess #Encode::Guess->set_suspects(qw/ascii UTF-8 iso-8859-1 MacRoman cp1252/); # by default Encode::Guess uses ascii, utf8 and UTF-16/32 with BOM my $decoder = Encode::Guess->guess($data); if (ref($decoder)) { $decoded = $decoder->decode($data); # NOTE: this seems to accept binary files sometimes (conversion will fail later because it is not really UTF-8) } else { print "Warning: encoding is not UTF-8 for $fn"; # let's try iso-2022-jp first $decoder = Encode::Guess->guess($data, 'iso-2022-jp'); if (ref($decoder)) { $decoded = $decoder->decode($data); print "; using iso-2022-jp\n"; } else { # NOTE: cp1252 is identical to iso-8859-1 but with additionnal characters in range 128-159 # instead of control codes. We can assume that these control codes are not used, so there # is no need to test for iso-8859-1. # The main problem here is to distinguish between cp1252 and MacRoman. # see http://www.alanwood.net/demos/charsetdiffs.html#f my $decoded_windows = decode('cp1252', $data); my $decoded_mac = decode('MacRoman', $data); # try to use frequent non-ASCII characters to distinguish the encodings (languages: mostly German, Spanish, Portuguese) # í has been removed because it conflicts with ’ and ’ is more frequent # ± has been removed because it is, suprisingly, the same code in both encodings ! my $score_windows = $decoded_windows =~ tr/ßáàäâãçéèêëñóöôõúüÄÉÑÖÜ¿¡‘’“” °½–—…§//; my $score_mac = $decoded_mac =~ tr/ßáàäâãçéèêëñóöôõúüÄÉÑÖÜ¿¡‘’“” °½–—…§//; # check newlines too (\r on MacOS < X, \r\n on Windows) my $ind_cr = index($data, "\r"); if ($ind_cr != -1) { if (substr($data, $ind_cr + 1, 1) eq "\n") { $score_windows++; } else { $score_mac++; } } if ($score_windows >= $score_mac) { $decoded = $decoded_windows; print "; guess=cp1252 ($score_windows cp1252 >= $score_mac MacRoman)\n"; } else { print "; guess=MacRoman ($score_mac MacRoman > $score_windows cp1252)\n"; $decoded = $decoded_mac; } } } } else { $decoded = ''; } my @lines = split(/^/m, $decoded); return \@lines; } ## # Removes some control characters # @param {Array<string>} lines ## sub remove_control_characters { my ($lines) = @_; foreach my $line (@{$lines}) { $line =~ s/[\x00-\x07\x0B\x0C\x0E-\x1F]//g; $line =~ s/&#[0-7];//g; $line =~ s/&#1[4-9];//g; $line =~ s/&#2[0-9];//g; } } ## # Replaces < and > characters by < and > in cdata elements (listed in @cdata_elements). # EXCEPT for answer when it's inside numericalresponse or formularesponse. # @param {Array<string>} lines ## sub fix_cdata_elements { my ($lines) = @_; my $i = 0; my $j = 0; my $tag = ''; my $type; my $in_numericalresponse = 0; my $in_formularesponse = 0; my $in_script = 0; ($tag, $type, $i, $j) = next_tag($lines, $i, $j); while ($tag ne '') { if ($tag eq 'numericalresponse') { if ($type eq 'start') { $in_numericalresponse = 1; } else { $in_numericalresponse = 0; } } elsif ($tag eq 'formularesponse') { if ($type eq 'start') { $in_formularesponse = 1; } else { $in_formularesponse = 0; } } elsif ($tag eq 'script') { if ($type eq 'start') { $in_script = 1; } else { $in_script = 0; } } if ($type eq 'start' && in_array_ignore_case(\@cdata_elements, $tag) && !$in_script && ($tag ne 'answer' || (!$in_numericalresponse && !$in_formularesponse))) { my $cde = $tag; my $line = $lines->[$i]; $j = index($line, '>', $j+1) + 1; my $stop = 0; while (!$stop && $i < scalar(@{$lines})) { my $indinf = index($line, '<', $j); if ($indinf != -1 && index($line, '<![CDATA[', $indinf) == $indinf) { $i++; $line = $lines->[$i]; $j = 0; last; } my $indsup = index($line, '>', $j); if ($indinf != -1 && $indsup != -1 && $indinf < $indsup) { my $test = substr($line, $indinf + 1, $indsup - ($indinf + 1)); $test =~ s/^\s+|\s+$//g ; if ($test eq '/'.$cde) { $stop = 1; $j = $indsup; # this is commented because of markup like <display>&web(' ','','<p>')</display> #} elsif ($test =~ /^[a-zA-Z\/]$/) { # $j = $indsup + 1; } else { $line = substr($line, 0, $indinf).'<'.substr($line, $indinf+1); $lines->[$i] = $line; } } elsif ($indinf != -1 && $indsup == -1) { $line = substr($line, 0, $indinf).'<'.substr($line, $indinf+1); $lines->[$i] = $line; } elsif ($indsup != -1 && ($indinf == -1 || $indsup < $indinf)) { $line = substr($line, 0, $indsup).'>'.substr($line, $indsup+1); $lines->[$i] = $line; } else { $i++; $line = $lines->[$i]; $j = 0; } } } $j++; ($tag, $type, $i, $j) = next_tag($lines, $i, $j); } } ## # Replaces HTML entities (they are not XML unless a DTD is used, which is no longer recommanded for XHTML). # @param {Array<string>} lines ## sub fix_html_entities { my ($lines) = @_; foreach my $line (@{$lines}) { # html_to_xml is converting named entities before 255 (see HTML parser dtext) # Assuming Windows encoding (Unicode entities are not before 160 and are the same between 160 and 255): $line =~ s/|/€/g; $line =~ s/|/‚/g; $line =~ s/|/„/g; $line =~ s/|/…/g; $line =~ s/|/†/g; $line =~ s/|/‡/g; $line =~ s/|/ˆ/g; $line =~ s/|/‰/g; $line =~ s/|/‹/g; $line =~ s/|/‘/g; $line =~ s/|/’/g; $line =~ s/|/“/g; $line =~ s/|/”/g; $line =~ s/|/•/g; $line =~ s/|/–/g; $line =~ s/|/—/g; $line =~ s/|/˜/g; $line =~ s/|/™/g; $line =~ s/|/›/g; $line =~ s/|/œ/g; } } # Tries to fix things like <font color="#990000" face="Verdana,> # without breaking <a b="c>d"> # This is only fixing tags when there is a single tag in a line (it is impossible to fix in the general case). # Also transforms <a b="c> <d e=" into <a b="c"><d e=" , # and (no markup before)<a b="c> (no quote after) into <a b="c"> . sub fix_missing_quotes { my ($lines) = @_; foreach my $line (@{$lines}) { my $n_inf = $line =~ tr/<//; my $n_sup = $line =~ tr/>//; if ($n_inf == 1 && $n_sup == 1) { my $ind_inf = index($line, '<'); my $ind_sup = index($line, '>'); if ($ind_inf != -1 && $ind_sup != -1 && $ind_inf < $ind_sup) { my $n_quotes = substr($line, $ind_inf, $ind_sup) =~ tr/"//; if ($n_quotes % 2 != 0) { # add a quote before > when there is an odd number of quotes inside <> $line =~ s/>/">/; } } } $line =~ s/(<[a-zA-Z]+ [a-zA-Z]+="[^"<>\s]+)(>\s*<[a-zA-Z]+ [a-zA-Z]+=")/$1"$2/; $line =~ s/^([^"<>]*<[a-zA-Z]+ [a-zA-Z]+="[^"<>\s]+)(>[^"]*)$/$1"$2/; } } # Replaces <li/> by <li> (the end tag will be added in html_to_xml sub fix_empty_li { my ($lines) = @_; foreach my $line (@{$lines}) { $line =~ s/<li\s?\/>/<li>/; } } # remove doctypes, without assuming they are at the beginning sub remove_doctype { my ($lines) = @_; foreach my $line (@{$lines}) { $line =~ s/<!DOCTYPE[^>]*>//; } } # Adds a problem, library or html root element, enclosing things outside of the problem element. # (any extra root element will be removed in post_xml, but this ensures one is added as root if missing). sub add_root { my ($lines, $filepath) = @_; my $root_name; if ($filepath =~ /\.library$/i) { $root_name = 'library'; } elsif ($filepath =~ /\.html?$/i) { $root_name = 'html'; } else { $root_name = 'problem'; } if ($root_name eq 'library') { foreach my $line (@{$lines}) { if ($line =~ /^\s*<[a-z]/) { last; } if ($line !~ /^\s*$/) { die "this library does not start with a tag, it might be a scriptlib"; } } } my $line1 = $lines->[0]; $line1 =~ s/<\?.*\?>//; # remove any PI, it would cause problems later anyway $line1 = "<$root_name>".$line1; $lines->[0] = $line1; $lines->[scalar(@$lines)-1] = $lines->[scalar(@$lines)-1]."</$root_name>"; } ## # Returns information about the next tag, starting at line number and char number. # Assumes the markup is well-formed and there is no CDATA, # which is not always true (like inside script), so results might be wrong sometimes. # It is however useful to avoid unnecessary changes in the document (using a parser to # do read/write for the whole document would mess up non well-formed documents). # @param {Array<string>} lines # @param {int} line_number - line number to start at # @param {int} char_number - char number to start at on the line # @returns (tag, type, line_number, char_number) ## sub next_tag { my ($lines, $i, $j ) = @_; my $i2 = $i; my $j2 = $j; while ($i2 < scalar(@{$lines})) { my $line = $lines->[$i2]; $j2 = index($line, '<', $j2); #TODO: handle comments while ($j2 != -1) { my $ind_slash = index($line, '/', $j2); my $ind_sup = index($line, '>', $j2); my $ind_space = index($line, ' ', $j2); my $type; my $tag; if ($ind_slash == $j2 + 1 && $ind_sup != -1) { $type = 'end'; $tag = substr($line, $j2 + 2, $ind_sup - ($j2 + 2)); } elsif ($ind_slash != -1 && $ind_sup != -1 && $ind_slash == $ind_sup - 1) { $type = 'empty'; if ($ind_space != -1 && $ind_space < $ind_sup) { $tag = substr($line, $j2 + 1, $ind_space - ($j2 + 1)); } else { $tag = substr($line, $j2 + 1, $ind_slash - ($j2 + 1)); } } elsif ($ind_sup != -1) { $type = 'start'; if ($ind_space != -1 && $ind_space < $ind_sup) { $tag = substr($line, $j2 + 1, $ind_space - ($j2 + 1)); } else { $tag = substr($line, $j2 + 1, $ind_sup - ($j2 + 1)); } } else { $tag = '' } if ($tag ne '') { return ($tag, $type, $i2, $j2); } $j2 = index($line, '<', $j2 + 1); } $i2++; $j2 = 0; } return ('', '', 0, 0); } ## # Tests if a string is in an array, ignoring case ## sub in_array_ignore_case { my ($array, $value) = @_; my $lcvalue = lc($value); foreach my $v (@{$array}) { if (lc($v) eq $lcvalue) { return 1; } } return 0; } 1; __END__