Annotation of modules/damieng/clean_xml/pre_xml.pm, revision 1.1
1.1 ! damieng 1: #!/usr/bin/perl
! 2:
! 3: package pre_xml;
! 4:
! 5: use strict;
! 6: use utf8;
! 7:
! 8: use Encode;
! 9: use Encode::Byte;
! 10: use Encode::Guess;
! 11:
! 12: # list of elements inside which < and > might not be turned into entities
! 13: # unfortunately, answer can sometimes contain the elements vector and value...
! 14: my @cdata_elements = ('answer', 'm', 'display', 'parse'); # not script because the HTML parser will handle it
! 15:
! 16:
! 17: # Reads a LON-CAPA 2 file, guesses the encoding, fixes characters in cdata_elements, fixes HTML entities,
! 18: # and returns the converted text.
! 19: sub pre_xml {
! 20: my ($filepath) = @_;
! 21:
! 22: my $lines = guess_encoding_and_read($filepath);
! 23:
! 24: remove_control_characters($lines);
! 25:
! 26: fix_cdata_elements($lines);
! 27:
! 28: fix_html_entities($lines);
! 29:
! 30: fix_missing_quotes($lines);
! 31:
! 32: fix_empty_li($lines);
! 33:
! 34: remove_doctype($lines);
! 35:
! 36: add_root($lines, $filepath);
! 37:
! 38: return(\join('', @$lines));
! 39: }
! 40:
! 41:
! 42: ##
! 43: # Tries to guess the character encoding, and returns the lines as decoded text.
! 44: # Requires Encode::Byte.
! 45: ##
! 46: sub guess_encoding_and_read {
! 47: my ($fn) = @_;
! 48: no warnings "utf8";
! 49: local $/ = undef;
! 50: open(my $fh, "<", $fn) or die "cannot read $fn: $!";
! 51: binmode $fh;
! 52: my $data = <$fh>; # we need to read the whole file to test if font is a block or inline element
! 53: close $fh;
! 54:
! 55: if (index($data, '<') == -1) {
! 56: die "This file has no markup !";
! 57: }
! 58:
! 59: # try to get a charset from a meta at the beginning of the file
! 60: my $beginning = substr($data, 0, 1024); # to avoid a full match; hopefully we won't cut the charset in half
! 61: if ($beginning =~ /<meta[^>]*charset\s?=\s?([^\n>"';]*)/i) {
! 62: my $meta_charset = $1;
! 63: if ($meta_charset ne '') {
! 64: if ($meta_charset =~ /iso-?8859-?1/i) {
! 65: # usually a lie
! 66: $meta_charset = 'cp1252';
! 67: }
! 68: # now try to decode using that encoding
! 69: my $decoder = guess_encoding($data, ($meta_charset));
! 70: if (ref($decoder)) {
! 71: my $decoded = $decoder->decode($data);
! 72: my @lines = split(/^/m, $decoded);
! 73: return \@lines;
! 74: } else {
! 75: print "Warning: decoding did not work with the charset defined by the meta ($meta_charset)\n";
! 76: }
! 77: }
! 78: }
! 79:
! 80: my $decoded;
! 81: if (length($data) > 0) {
! 82: # NOTE: this list is too ambigous, Encode::Guess refuses to even try a guess
! 83: #Encode::Guess->set_suspects(qw/ascii UTF-8 iso-8859-1 MacRoman cp1252/);
! 84: # by default Encode::Guess uses ascii, utf8 and UTF-16/32 with BOM
! 85: my $decoder = Encode::Guess->guess($data);
! 86: if (ref($decoder)) {
! 87: $decoded = $decoder->decode($data);
! 88: # NOTE: this seems to accept binary files sometimes (conversion will fail later because it is not really UTF-8)
! 89: } else {
! 90: print "Warning: encoding is not UTF-8 for $fn";
! 91:
! 92: # let's try iso-2022-jp first
! 93: $decoder = Encode::Guess->guess($data, 'iso-2022-jp');
! 94: if (ref($decoder)) {
! 95: $decoded = $decoder->decode($data);
! 96: print "; using iso-2022-jp\n";
! 97: } else {
! 98: # NOTE: cp1252 is identical to iso-8859-1 but with additionnal characters in range 128-159
! 99: # instead of control codes. We can assume that these control codes are not used, so there
! 100: # is no need to test for iso-8859-1.
! 101: # The main problem here is to distinguish between cp1252 and MacRoman.
! 102: # see http://www.alanwood.net/demos/charsetdiffs.html#f
! 103: my $decoded_windows = decode('cp1252', $data);
! 104: my $decoded_mac = decode('MacRoman', $data);
! 105: # try to use frequent non-ASCII characters to distinguish the encodings (languages: mostly German, Spanish, Portuguese)
! 106: # í has been removed because it conflicts with ’ and ’ is more frequent
! 107: # ± has been removed because it is, suprisingly, the same code in both encodings !
! 108: my $score_windows = $decoded_windows =~ tr/ßáàäâãçéèêëñóöôõúüÄÉÑÖÜ¿¡‘’“” °½–—…§//;
! 109: my $score_mac = $decoded_mac =~ tr/ßáàäâãçéèêëñóöôõúüÄÉÑÖÜ¿¡‘’“” °½–—…§//;
! 110: # check newlines too (\r on MacOS < X, \r\n on Windows)
! 111: my $ind_cr = index($data, "\r");
! 112: if ($ind_cr != -1) {
! 113: if (substr($data, $ind_cr + 1, 1) eq "\n") {
! 114: $score_windows++;
! 115: } else {
! 116: $score_mac++;
! 117: }
! 118: }
! 119: if ($score_windows >= $score_mac) {
! 120: $decoded = $decoded_windows;
! 121: print "; guess=cp1252 ($score_windows cp1252 >= $score_mac MacRoman)\n";
! 122: } else {
! 123: print "; guess=MacRoman ($score_mac MacRoman > $score_windows cp1252)\n";
! 124: $decoded = $decoded_mac;
! 125: }
! 126: }
! 127: }
! 128: } else {
! 129: $decoded = '';
! 130: }
! 131: my @lines = split(/^/m, $decoded);
! 132: return \@lines;
! 133: }
! 134:
! 135:
! 136: ##
! 137: # Removes some control characters
! 138: # @param {Array<string>} lines
! 139: ##
! 140: sub remove_control_characters {
! 141: my ($lines) = @_;
! 142: foreach my $line (@{$lines}) {
! 143: $line =~ s/[\x00-\x07\x0B\x0C\x0E-\x1F]//g;
! 144: $line =~ s/&#[0-7];//g;
! 145: $line =~ s/[4-9];//g;
! 146: $line =~ s/[0-9];//g;
! 147: }
! 148: }
! 149:
! 150: ##
! 151: # Replaces < and > characters by < and > in cdata elements (listed in @cdata_elements).
! 152: # EXCEPT for answer when it's inside numericalresponse or formularesponse.
! 153: # @param {Array<string>} lines
! 154: ##
! 155: sub fix_cdata_elements {
! 156: my ($lines) = @_;
! 157: my $i = 0;
! 158: my $j = 0;
! 159: my $tag = '';
! 160: my $type;
! 161: my $in_numericalresponse = 0;
! 162: my $in_formularesponse = 0;
! 163: my $in_script = 0;
! 164: ($tag, $type, $i, $j) = next_tag($lines, $i, $j);
! 165: while ($tag ne '') {
! 166: if ($tag eq 'numericalresponse') {
! 167: if ($type eq 'start') {
! 168: $in_numericalresponse = 1;
! 169: } else {
! 170: $in_numericalresponse = 0;
! 171: }
! 172: } elsif ($tag eq 'formularesponse') {
! 173: if ($type eq 'start') {
! 174: $in_formularesponse = 1;
! 175: } else {
! 176: $in_formularesponse = 0;
! 177: }
! 178: } elsif ($tag eq 'script') {
! 179: if ($type eq 'start') {
! 180: $in_script = 1;
! 181: } else {
! 182: $in_script = 0;
! 183: }
! 184: }
! 185: if ($type eq 'start' && in_array_ignore_case(\@cdata_elements, $tag) && !$in_script &&
! 186: ($tag ne 'answer' || (!$in_numericalresponse && !$in_formularesponse))) {
! 187: my $cde = $tag;
! 188: my $line = $lines->[$i];
! 189: $j = index($line, '>', $j+1) + 1;
! 190: my $stop = 0;
! 191: while (!$stop && $i < scalar(@{$lines})) {
! 192: my $indinf = index($line, '<', $j);
! 193: if ($indinf != -1 && index($line, '<![CDATA[', $indinf) == $indinf) {
! 194: $i++;
! 195: $line = $lines->[$i];
! 196: $j = 0;
! 197: last;
! 198: }
! 199: my $indsup = index($line, '>', $j);
! 200: if ($indinf != -1 && $indsup != -1 && $indinf < $indsup) {
! 201: my $test = substr($line, $indinf + 1, $indsup - ($indinf + 1));
! 202: $test =~ s/^\s+|\s+$//g ;
! 203: if ($test eq '/'.$cde) {
! 204: $stop = 1;
! 205: $j = $indsup;
! 206: # this is commented because of markup like <display>&web(' ','','<p>')</display>
! 207: #} elsif ($test =~ /^[a-zA-Z\/]$/) {
! 208: # $j = $indsup + 1;
! 209: } else {
! 210: $line = substr($line, 0, $indinf).'<'.substr($line, $indinf+1);
! 211: $lines->[$i] = $line;
! 212: }
! 213: } elsif ($indinf != -1 && $indsup == -1) {
! 214: $line = substr($line, 0, $indinf).'<'.substr($line, $indinf+1);
! 215: $lines->[$i] = $line;
! 216: } elsif ($indsup != -1 && ($indinf == -1 || $indsup < $indinf)) {
! 217: $line = substr($line, 0, $indsup).'>'.substr($line, $indsup+1);
! 218: $lines->[$i] = $line;
! 219: } else {
! 220: $i++;
! 221: $line = $lines->[$i];
! 222: $j = 0;
! 223: }
! 224: }
! 225: }
! 226: $j++;
! 227: ($tag, $type, $i, $j) = next_tag($lines, $i, $j);
! 228: }
! 229: }
! 230:
! 231:
! 232: ##
! 233: # Replaces HTML entities (they are not XML unless a DTD is used, which is no longer recommanded for XHTML).
! 234: # @param {Array<string>} lines
! 235: ##
! 236: sub fix_html_entities {
! 237: my ($lines) = @_;
! 238: foreach my $line (@{$lines}) {
! 239: # html_to_xml is converting named entities before 255 (see HTML parser dtext)
! 240: # Assuming Windows encoding (Unicode entities are not before 160 and are the same between 160 and 255):
! 241: $line =~ s/€|€/€/g;
! 242: $line =~ s/‚|‚/‚/g;
! 243: $line =~ s/„|„/„/g;
! 244: $line =~ s/…|…/…/g;
! 245: $line =~ s/†|†/†/g;
! 246: $line =~ s/‡|‡/‡/g;
! 247: $line =~ s/ˆ|ˆ/ˆ/g;
! 248: $line =~ s/‰|‰/‰/g;
! 249: $line =~ s/‹|‹/‹/g;
! 250: $line =~ s/‘|‘/‘/g;
! 251: $line =~ s/’|’/’/g;
! 252: $line =~ s/“|“/“/g;
! 253: $line =~ s/”|”/”/g;
! 254: $line =~ s/•|•/•/g;
! 255: $line =~ s/–|–/–/g;
! 256: $line =~ s/—|—/—/g;
! 257: $line =~ s/˜|˜/˜/g;
! 258: $line =~ s/™|™/™/g;
! 259: $line =~ s/›|›/›/g;
! 260: $line =~ s/œ|œ/œ/g;
! 261: }
! 262: }
! 263:
! 264:
! 265: # Tries to fix things like <font color="#990000" face="Verdana,>
! 266: # without breaking <a b="c>d">
! 267: # This is only fixing tags when there is a single tag in a line (it is impossible to fix in the general case).
! 268: # Also transforms <a b="c> <d e=" into <a b="c"><d e=" ,
! 269: # and (no markup before)<a b="c> (no quote after) into <a b="c"> .
! 270: sub fix_missing_quotes {
! 271: my ($lines) = @_;
! 272: foreach my $line (@{$lines}) {
! 273: my $n_inf = $line =~ tr/<//;
! 274: my $n_sup = $line =~ tr/>//;
! 275: if ($n_inf == 1 && $n_sup == 1) {
! 276: my $ind_inf = index($line, '<');
! 277: my $ind_sup = index($line, '>');
! 278: if ($ind_inf != -1 && $ind_sup != -1 && $ind_inf < $ind_sup) {
! 279: my $n_quotes = substr($line, $ind_inf, $ind_sup) =~ tr/"//;
! 280: if ($n_quotes % 2 != 0) {
! 281: # add a quote before > when there is an odd number of quotes inside <>
! 282: $line =~ s/>/">/;
! 283: }
! 284: }
! 285: }
! 286: $line =~ s/(<[a-zA-Z]+ [a-zA-Z]+="[^"<>\s]+)(>\s*<[a-zA-Z]+ [a-zA-Z]+=")/$1"$2/;
! 287: $line =~ s/^([^"<>]*<[a-zA-Z]+ [a-zA-Z]+="[^"<>\s]+)(>[^"]*)$/$1"$2/;
! 288: }
! 289: }
! 290:
! 291:
! 292: # Replaces <li/> by <li> (the end tag will be added in html_to_xml
! 293: sub fix_empty_li {
! 294: my ($lines) = @_;
! 295: foreach my $line (@{$lines}) {
! 296: $line =~ s/<li\s?\/>/<li>/;
! 297: }
! 298: }
! 299:
! 300:
! 301: # remove doctypes, without assuming they are at the beginning
! 302: sub remove_doctype {
! 303: my ($lines) = @_;
! 304: foreach my $line (@{$lines}) {
! 305: $line =~ s/<!DOCTYPE[^>]*>//;
! 306: }
! 307: }
! 308:
! 309:
! 310: # Adds a problem, library or html root element, enclosing things outside of the problem element.
! 311: # (any extra root element will be removed in post_xml, but this ensures one is added as root if missing).
! 312: sub add_root {
! 313: my ($lines, $filepath) = @_;
! 314: my $root_name;
! 315: if ($filepath =~ /\.library$/i) {
! 316: $root_name = 'library';
! 317: } elsif ($filepath =~ /\.html?$/i) {
! 318: $root_name = 'html';
! 319: } else {
! 320: $root_name = 'problem';
! 321: }
! 322: if ($root_name eq 'library') {
! 323: foreach my $line (@{$lines}) {
! 324: if ($line =~ /^\s*<[a-z]/) {
! 325: last;
! 326: }
! 327: if ($line !~ /^\s*$/) {
! 328: die "this library does not start with a tag, it might be a scriptlib";
! 329: }
! 330: }
! 331: }
! 332: my $line1 = $lines->[0];
! 333: $line1 =~ s/<\?.*\?>//; # remove any PI, it would cause problems later anyway
! 334: $line1 = "<$root_name>".$line1;
! 335: $lines->[0] = $line1;
! 336: $lines->[scalar(@$lines)-1] = $lines->[scalar(@$lines)-1]."</$root_name>";
! 337: }
! 338:
! 339:
! 340: ##
! 341: # Returns information about the next tag, starting at line number and char number.
! 342: # Assumes the markup is well-formed and there is no CDATA,
! 343: # which is not always true (like inside script), so results might be wrong sometimes.
! 344: # It is however useful to avoid unnecessary changes in the document (using a parser to
! 345: # do read/write for the whole document would mess up non well-formed documents).
! 346: # @param {Array<string>} lines
! 347: # @param {int} line_number - line number to start at
! 348: # @param {int} char_number - char number to start at on the line
! 349: # @returns (tag, type, line_number, char_number)
! 350: ##
! 351: sub next_tag {
! 352: my ($lines, $i, $j ) = @_;
! 353: my $i2 = $i;
! 354: my $j2 = $j;
! 355: while ($i2 < scalar(@{$lines})) {
! 356: my $line = $lines->[$i2];
! 357: $j2 = index($line, '<', $j2);
! 358: #TODO: handle comments
! 359: while ($j2 != -1) {
! 360: my $ind_slash = index($line, '/', $j2);
! 361: my $ind_sup = index($line, '>', $j2);
! 362: my $ind_space = index($line, ' ', $j2);
! 363: my $type;
! 364: my $tag;
! 365: if ($ind_slash == $j2 + 1 && $ind_sup != -1) {
! 366: $type = 'end';
! 367: $tag = substr($line, $j2 + 2, $ind_sup - ($j2 + 2));
! 368: } elsif ($ind_slash != -1 && $ind_sup != -1 && $ind_slash == $ind_sup - 1) {
! 369: $type = 'empty';
! 370: if ($ind_space != -1 && $ind_space < $ind_sup) {
! 371: $tag = substr($line, $j2 + 1, $ind_space - ($j2 + 1));
! 372: } else {
! 373: $tag = substr($line, $j2 + 1, $ind_slash - ($j2 + 1));
! 374: }
! 375: } elsif ($ind_sup != -1) {
! 376: $type = 'start';
! 377: if ($ind_space != -1 && $ind_space < $ind_sup) {
! 378: $tag = substr($line, $j2 + 1, $ind_space - ($j2 + 1));
! 379: } else {
! 380: $tag = substr($line, $j2 + 1, $ind_sup - ($j2 + 1));
! 381: }
! 382: } else {
! 383: $tag = ''
! 384: }
! 385: if ($tag ne '') {
! 386: return ($tag, $type, $i2, $j2);
! 387: }
! 388: $j2 = index($line, '<', $j2 + 1);
! 389: }
! 390: $i2++;
! 391: $j2 = 0;
! 392: }
! 393: return ('', '', 0, 0);
! 394: }
! 395:
! 396: ##
! 397: # Tests if a string is in an array, ignoring case
! 398: ##
! 399: sub in_array_ignore_case {
! 400: my ($array, $value) = @_;
! 401: my $lcvalue = lc($value);
! 402: foreach my $v (@{$array}) {
! 403: if (lc($v) eq $lcvalue) {
! 404: return 1;
! 405: }
! 406: }
! 407: return 0;
! 408: }
! 409:
! 410: 1;
! 411: __END__
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>