Annotation of loncom/homework/cleanxml/pre_xml.pm, revision 1.1
1.1 ! damieng 1: # The LearningOnline Network
! 2: # First step to clean a file.
! 3: #
! 4: # $Id$
! 5: #
! 6: # Copyright Michigan State University Board of Trustees
! 7: #
! 8: # This file is part of the LearningOnline Network with CAPA (LON-CAPA).
! 9: #
! 10: # LON-CAPA is free software; you can redistribute it and/or modify
! 11: # it under the terms of the GNU General Public License as published by
! 12: # the Free Software Foundation; either version 2 of the License, or
! 13: # (at your option) any later version.
! 14: #
! 15: # LON-CAPA is distributed in the hope that it will be useful,
! 16: # but WITHOUT ANY WARRANTY; without even the implied warranty of
! 17: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! 18: # GNU General Public License for more details.
! 19: #
! 20: # You should have received a copy of the GNU General Public License
! 21: # along with LON-CAPA; if not, write to the Free Software
! 22: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
! 23: #
! 24: # /home/httpd/html/adm/gpl.txt
! 25: #
! 26: # http://www.lon-capa.org/
! 27: #
! 28: ###
! 29:
! 30: #!/usr/bin/perl
! 31:
! 32: package Apache::pre_xml;
! 33:
! 34: use strict;
! 35: use utf8;
! 36:
! 37: use Encode;
! 38: use Encode::Byte;
! 39: use Encode::Guess;
! 40:
! 41: # list of elements inside which < and > might not be turned into entities
! 42: # unfortunately, answer can sometimes contain the elements vector and value...
! 43: my @cdata_elements = ('answer', 'm', 'display', 'parse'); # not script because the HTML parser will handle it
! 44:
! 45: my $warnings; # 1 = print warnings
! 46:
! 47: # Reads a LON-CAPA 2 file, guesses the encoding, fixes characters in cdata_elements, fixes HTML entities,
! 48: # and returns the converted text.
! 49: sub pre_xml {
! 50: my ($filepath, $warn) = @_;
! 51: $warnings = $warn;
! 52:
! 53: my $lines = guess_encoding_and_read($filepath);
! 54:
! 55: remove_control_characters($lines);
! 56:
! 57: fix_cdata_elements($lines);
! 58:
! 59: fix_html_entities($lines);
! 60:
! 61: fix_missing_quotes($lines);
! 62:
! 63: fix_empty_li($lines);
! 64:
! 65: remove_doctype($lines);
! 66:
! 67: add_root($lines, $filepath);
! 68:
! 69: return(\join('', @$lines));
! 70: }
! 71:
! 72:
! 73: ##
! 74: # Tries to guess the character encoding, and returns the lines as decoded text.
! 75: # Requires Encode::Byte.
! 76: ##
! 77: sub guess_encoding_and_read {
! 78: my ($fn) = @_;
! 79: no warnings "utf8";
! 80: local $/ = undef;
! 81: open(my $fh, "<", $fn) or die "cannot read $fn: $!";
! 82: binmode $fh;
! 83: my $data = <$fh>; # we need to read the whole file to test if font is a block or inline element
! 84: close $fh;
! 85:
! 86: if (index($data, '<') == -1) {
! 87: die "This file has no markup !";
! 88: }
! 89:
! 90: # try to get a charset from a meta at the beginning of the file
! 91: my $beginning = substr($data, 0, 1024); # to avoid a full match; hopefully we won't cut the charset in half
! 92: if ($beginning =~ /<meta[^>]*charset\s?=\s?([^\n>"';]*)/i) {
! 93: my $meta_charset = $1;
! 94: if ($meta_charset ne '') {
! 95: if ($meta_charset =~ /iso-?8859-?1/i) {
! 96: # usually a lie
! 97: $meta_charset = 'cp1252';
! 98: }
! 99: # now try to decode using that encoding
! 100: my $decoder = guess_encoding($data, ($meta_charset));
! 101: if (ref($decoder)) {
! 102: my $decoded = $decoder->decode($data);
! 103: my @lines = split(/^/m, $decoded);
! 104: return \@lines;
! 105: } else {
! 106: if ($warnings) {
! 107: print "Warning: decoding did not work with the charset defined by the meta ($meta_charset)\n";
! 108: }
! 109: }
! 110: }
! 111: }
! 112:
! 113: my $decoded;
! 114: if (length($data) > 0) {
! 115: # NOTE: this list is too ambigous, Encode::Guess refuses to even try a guess
! 116: #Encode::Guess->set_suspects(qw/ascii UTF-8 iso-8859-1 MacRoman cp1252/);
! 117: # by default Encode::Guess uses ascii, utf8 and UTF-16/32 with BOM
! 118: my $decoder = Encode::Guess->guess($data);
! 119: if (ref($decoder)) {
! 120: $decoded = $decoder->decode($data);
! 121: # NOTE: this seems to accept binary files sometimes (conversion will fail later because it is not really UTF-8)
! 122: } else {
! 123: if ($warnings) {
! 124: print "Warning: encoding is not UTF-8 for $fn";
! 125: }
! 126:
! 127: # let's try iso-2022-jp first
! 128: $decoder = Encode::Guess->guess($data, 'iso-2022-jp');
! 129: if (ref($decoder)) {
! 130: $decoded = $decoder->decode($data);
! 131: if ($warnings) {
! 132: print "; using iso-2022-jp\n";
! 133: }
! 134: } else {
! 135: # NOTE: cp1252 is identical to iso-8859-1 but with additionnal characters in range 128-159
! 136: # instead of control codes. We can assume that these control codes are not used, so there
! 137: # is no need to test for iso-8859-1.
! 138: # The main problem here is to distinguish between cp1252 and MacRoman.
! 139: # see http://www.alanwood.net/demos/charsetdiffs.html#f
! 140: my $decoded_windows = decode('cp1252', $data);
! 141: my $decoded_mac = decode('MacRoman', $data);
! 142: # try to use frequent non-ASCII characters to distinguish the encodings (languages: mostly German, Spanish, Portuguese)
! 143: # í has been removed because it conflicts with ’ and ’ is more frequent
! 144: # ± has been removed because it is, suprisingly, the same code in both encodings !
! 145: my $score_windows = $decoded_windows =~ tr/ßáàäâãçéèêëñóöôõúüÄÉÑÖÜ¿¡‘’“” °½–—…§//;
! 146: my $score_mac = $decoded_mac =~ tr/ßáàäâãçéèêëñóöôõúüÄÉÑÖÜ¿¡‘’“” °½–—…§//;
! 147: # check newlines too (\r on MacOS < X, \r\n on Windows)
! 148: my $ind_cr = index($data, "\r");
! 149: if ($ind_cr != -1) {
! 150: if (substr($data, $ind_cr + 1, 1) eq "\n") {
! 151: $score_windows++;
! 152: } else {
! 153: $score_mac++;
! 154: }
! 155: }
! 156: if ($score_windows >= $score_mac) {
! 157: $decoded = $decoded_windows;
! 158: if ($warnings) {
! 159: print "; guess=cp1252 ($score_windows cp1252 >= $score_mac MacRoman)\n";
! 160: }
! 161: } else {
! 162: if ($warnings) {
! 163: print "; guess=MacRoman ($score_mac MacRoman > $score_windows cp1252)\n";
! 164: }
! 165: $decoded = $decoded_mac;
! 166: }
! 167: }
! 168: }
! 169: } else {
! 170: $decoded = '';
! 171: }
! 172: my @lines = split(/^/m, $decoded);
! 173: return \@lines;
! 174: }
! 175:
! 176:
! 177: ##
! 178: # Removes some control characters
! 179: # @param {Array<string>} lines
! 180: ##
! 181: sub remove_control_characters {
! 182: my ($lines) = @_;
! 183: foreach my $line (@{$lines}) {
! 184: $line =~ s/[\x00-\x07\x0B\x0C\x0E-\x1F]//g;
! 185: $line =~ s/&#[0-7];//g;
! 186: $line =~ s/[4-9];//g;
! 187: $line =~ s/[0-9];//g;
! 188: }
! 189: }
! 190:
! 191: ##
! 192: # Replaces < and > characters by < and > in cdata elements (listed in @cdata_elements).
! 193: # EXCEPT for answer when it's inside numericalresponse or formularesponse.
! 194: # @param {Array<string>} lines
! 195: ##
! 196: sub fix_cdata_elements {
! 197: my ($lines) = @_;
! 198: my $i = 0;
! 199: my $j = 0;
! 200: my $tag = '';
! 201: my $type;
! 202: my $in_numericalresponse = 0;
! 203: my $in_formularesponse = 0;
! 204: my $in_script = 0;
! 205: ($tag, $type, $i, $j) = next_tag($lines, $i, $j);
! 206: while ($tag ne '') {
! 207: if ($tag eq 'numericalresponse') {
! 208: if ($type eq 'start') {
! 209: $in_numericalresponse = 1;
! 210: } else {
! 211: $in_numericalresponse = 0;
! 212: }
! 213: } elsif ($tag eq 'formularesponse') {
! 214: if ($type eq 'start') {
! 215: $in_formularesponse = 1;
! 216: } else {
! 217: $in_formularesponse = 0;
! 218: }
! 219: } elsif ($tag eq 'script') {
! 220: if ($type eq 'start') {
! 221: $in_script = 1;
! 222: } else {
! 223: $in_script = 0;
! 224: }
! 225: }
! 226: if ($type eq 'start' && in_array_ignore_case(\@cdata_elements, $tag) && !$in_script &&
! 227: ($tag ne 'answer' || (!$in_numericalresponse && !$in_formularesponse))) {
! 228: my $cde = $tag;
! 229: my $line = $lines->[$i];
! 230: $j = index($line, '>', $j+1) + 1;
! 231: my $stop = 0;
! 232: while (!$stop && $i < scalar(@{$lines})) {
! 233: my $indinf = index($line, '<', $j);
! 234: if ($indinf != -1 && index($line, '<![CDATA[', $indinf) == $indinf) {
! 235: $i++;
! 236: $line = $lines->[$i];
! 237: $j = 0;
! 238: last;
! 239: }
! 240: my $indsup = index($line, '>', $j);
! 241: if ($indinf != -1 && $indsup != -1 && $indinf < $indsup) {
! 242: my $test = substr($line, $indinf + 1, $indsup - ($indinf + 1));
! 243: $test =~ s/^\s+|\s+$//g ;
! 244: if ($test eq '/'.$cde) {
! 245: $stop = 1;
! 246: $j = $indsup;
! 247: # this is commented because of markup like <display>&web(' ','','<p>')</display>
! 248: #} elsif ($test =~ /^[a-zA-Z\/]$/) {
! 249: # $j = $indsup + 1;
! 250: } else {
! 251: $line = substr($line, 0, $indinf).'<'.substr($line, $indinf+1);
! 252: $lines->[$i] = $line;
! 253: }
! 254: } elsif ($indinf != -1 && $indsup == -1) {
! 255: $line = substr($line, 0, $indinf).'<'.substr($line, $indinf+1);
! 256: $lines->[$i] = $line;
! 257: } elsif ($indsup != -1 && ($indinf == -1 || $indsup < $indinf)) {
! 258: $line = substr($line, 0, $indsup).'>'.substr($line, $indsup+1);
! 259: $lines->[$i] = $line;
! 260: } else {
! 261: $i++;
! 262: $line = $lines->[$i];
! 263: $j = 0;
! 264: }
! 265: }
! 266: }
! 267: $j++;
! 268: ($tag, $type, $i, $j) = next_tag($lines, $i, $j);
! 269: }
! 270: }
! 271:
! 272:
! 273: ##
! 274: # Replaces HTML entities (they are not XML unless a DTD is used, which is no longer recommanded for XHTML).
! 275: # @param {Array<string>} lines
! 276: ##
! 277: sub fix_html_entities {
! 278: my ($lines) = @_;
! 279: foreach my $line (@{$lines}) {
! 280: # html_to_xml is converting named entities before 255 (see HTML parser dtext)
! 281: # Assuming Windows encoding (Unicode entities are not before 160 and are the same between 160 and 255):
! 282: $line =~ s/€|€/€/g;
! 283: $line =~ s/‚|‚/‚/g;
! 284: $line =~ s/„|„/„/g;
! 285: $line =~ s/…|…/…/g;
! 286: $line =~ s/†|†/†/g;
! 287: $line =~ s/‡|‡/‡/g;
! 288: $line =~ s/ˆ|ˆ/ˆ/g;
! 289: $line =~ s/‰|‰/‰/g;
! 290: $line =~ s/‹|‹/‹/g;
! 291: $line =~ s/‘|‘/‘/g;
! 292: $line =~ s/’|’/’/g;
! 293: $line =~ s/“|“/“/g;
! 294: $line =~ s/”|”/”/g;
! 295: $line =~ s/•|•/•/g;
! 296: $line =~ s/–|–/–/g;
! 297: $line =~ s/—|—/—/g;
! 298: $line =~ s/˜|˜/˜/g;
! 299: $line =~ s/™|™/™/g;
! 300: $line =~ s/›|›/›/g;
! 301: $line =~ s/œ|œ/œ/g;
! 302: }
! 303: }
! 304:
! 305:
! 306: # Tries to fix things like <font color="#990000" face="Verdana,>
! 307: # without breaking <a b="c>d">
! 308: # This is only fixing tags when there is a single tag in a line (it is impossible to fix in the general case).
! 309: # Also transforms <a b="c> <d e=" into <a b="c"><d e=" ,
! 310: # and (no markup before)<a b="c> (no quote after) into <a b="c"> .
! 311: sub fix_missing_quotes {
! 312: my ($lines) = @_;
! 313: foreach my $line (@{$lines}) {
! 314: my $n_inf = $line =~ tr/<//;
! 315: my $n_sup = $line =~ tr/>//;
! 316: if ($n_inf == 1 && $n_sup == 1) {
! 317: my $ind_inf = index($line, '<');
! 318: my $ind_sup = index($line, '>');
! 319: if ($ind_inf != -1 && $ind_sup != -1 && $ind_inf < $ind_sup) {
! 320: my $n_quotes = substr($line, $ind_inf, $ind_sup) =~ tr/"//;
! 321: if ($n_quotes % 2 != 0) {
! 322: # add a quote before > when there is an odd number of quotes inside <>
! 323: $line =~ s/>/">/;
! 324: }
! 325: }
! 326: }
! 327: $line =~ s/(<[a-zA-Z]+ [a-zA-Z]+="[^"<>\s]+)(>\s*<[a-zA-Z]+ [a-zA-Z]+=")/$1"$2/;
! 328: $line =~ s/^([^"<>]*<[a-zA-Z]+ [a-zA-Z]+="[^"<>\s]+)(>[^"]*)$/$1"$2/;
! 329: }
! 330: }
! 331:
! 332:
! 333: # Replaces <li/> by <li> (the end tag will be added in html_to_xml
! 334: sub fix_empty_li {
! 335: my ($lines) = @_;
! 336: foreach my $line (@{$lines}) {
! 337: $line =~ s/<li\s?\/>/<li>/;
! 338: }
! 339: }
! 340:
! 341:
! 342: # remove doctypes, without assuming they are at the beginning
! 343: sub remove_doctype {
! 344: my ($lines) = @_;
! 345: foreach my $line (@{$lines}) {
! 346: $line =~ s/<!DOCTYPE[^>]*>//;
! 347: }
! 348: }
! 349:
! 350:
! 351: # Adds a problem, library or html root element, enclosing things outside of the problem element.
! 352: # (any extra root element will be removed in post_xml, but this ensures one is added as root if missing).
! 353: sub add_root {
! 354: my ($lines, $filepath) = @_;
! 355: my $root_name;
! 356: if ($filepath =~ /\.library$/i) {
! 357: $root_name = 'library';
! 358: } elsif ($filepath =~ /\.html?$/i) {
! 359: $root_name = 'html';
! 360: } else {
! 361: $root_name = 'problem';
! 362: }
! 363: if ($root_name eq 'library') {
! 364: foreach my $line (@{$lines}) {
! 365: if ($line =~ /^\s*<[a-z]/) {
! 366: last;
! 367: }
! 368: if ($line !~ /^\s*$/) {
! 369: die "this library does not start with a tag, it might be a scriptlib";
! 370: }
! 371: }
! 372: }
! 373: my $line1 = $lines->[0];
! 374: $line1 =~ s/<\?.*\?>//; # remove any PI, it would cause problems later anyway
! 375: $line1 = "<$root_name>".$line1;
! 376: $lines->[0] = $line1;
! 377: $lines->[scalar(@$lines)-1] = $lines->[scalar(@$lines)-1]."</$root_name>";
! 378: }
! 379:
! 380:
! 381: ##
! 382: # Returns information about the next tag, starting at line number and char number.
! 383: # Assumes the markup is well-formed and there is no CDATA,
! 384: # which is not always true (like inside script), so results might be wrong sometimes.
! 385: # It is however useful to avoid unnecessary changes in the document (using a parser to
! 386: # do read/write for the whole document would mess up non well-formed documents).
! 387: # @param {Array<string>} lines
! 388: # @param {int} line_number - line number to start at
! 389: # @param {int} char_number - char number to start at on the line
! 390: # @returns (tag, type, line_number, char_number)
! 391: ##
! 392: sub next_tag {
! 393: my ($lines, $i, $j ) = @_;
! 394: my $i2 = $i;
! 395: my $j2 = $j;
! 396: while ($i2 < scalar(@{$lines})) {
! 397: my $line = $lines->[$i2];
! 398: $j2 = index($line, '<', $j2);
! 399: #TODO: handle comments
! 400: while ($j2 != -1) {
! 401: my $ind_slash = index($line, '/', $j2);
! 402: my $ind_sup = index($line, '>', $j2);
! 403: my $ind_space = index($line, ' ', $j2);
! 404: my $type;
! 405: my $tag;
! 406: if ($ind_slash == $j2 + 1 && $ind_sup != -1) {
! 407: $type = 'end';
! 408: $tag = substr($line, $j2 + 2, $ind_sup - ($j2 + 2));
! 409: } elsif ($ind_slash != -1 && $ind_sup != -1 && $ind_slash == $ind_sup - 1) {
! 410: $type = 'empty';
! 411: if ($ind_space != -1 && $ind_space < $ind_sup) {
! 412: $tag = substr($line, $j2 + 1, $ind_space - ($j2 + 1));
! 413: } else {
! 414: $tag = substr($line, $j2 + 1, $ind_slash - ($j2 + 1));
! 415: }
! 416: } elsif ($ind_sup != -1) {
! 417: $type = 'start';
! 418: if ($ind_space != -1 && $ind_space < $ind_sup) {
! 419: $tag = substr($line, $j2 + 1, $ind_space - ($j2 + 1));
! 420: } else {
! 421: $tag = substr($line, $j2 + 1, $ind_sup - ($j2 + 1));
! 422: }
! 423: } else {
! 424: $tag = ''
! 425: }
! 426: if ($tag ne '') {
! 427: return ($tag, $type, $i2, $j2);
! 428: }
! 429: $j2 = index($line, '<', $j2 + 1);
! 430: }
! 431: $i2++;
! 432: $j2 = 0;
! 433: }
! 434: return ('', '', 0, 0);
! 435: }
! 436:
! 437: ##
! 438: # Tests if a string is in an array, ignoring case
! 439: ##
! 440: sub in_array_ignore_case {
! 441: my ($array, $value) = @_;
! 442: my $lcvalue = lc($value);
! 443: foreach my $v (@{$array}) {
! 444: if (lc($v) eq $lcvalue) {
! 445: return 1;
! 446: }
! 447: }
! 448: return 0;
! 449: }
! 450:
! 451: 1;
! 452: __END__
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>