modules/damieng/clean_xml/pre_xml.pm - view

File: [LON-CAPA] / modules / damieng / clean_xml / pre_xml.pm
Revision 1.1: download - view: text, annotated - select for diffs
Fri Apr 17 15:35:01 2015 UTC (9 years, 5 months ago) by damieng
Branches: MAIN
CVS tags: HEAD

added clean_xml and graphical_editor

1: #!/usr/bin/perl 2: 3: package pre_xml; 4: 5: use strict; 6: use utf8; 7: 8: use Encode; 9: use Encode::Byte; 10: use Encode::Guess; 11: 12: # list of elements inside which < and > might not be turned into entities 13: # unfortunately, answer can sometimes contain the elements vector and value... 14: my @cdata_elements = ('answer', 'm', 'display', 'parse'); # not script because the HTML parser will handle it 15: 16: 17: # Reads a LON-CAPA 2 file, guesses the encoding, fixes characters in cdata_elements, fixes HTML entities, 18: # and returns the converted text. 19: sub pre_xml { 20: my ($filepath) = @_; 21: 22: my $lines = guess_encoding_and_read($filepath); 23: 24: remove_control_characters($lines); 25: 26: fix_cdata_elements($lines); 27: 28: fix_html_entities($lines); 29: 30: fix_missing_quotes($lines); 31: 32: fix_empty_li($lines); 33: 34: remove_doctype($lines); 35: 36: add_root($lines, $filepath); 37: 38: return(\join('', @$lines)); 39: } 40: 41: 42: ## 43: # Tries to guess the character encoding, and returns the lines as decoded text. 44: # Requires Encode::Byte. 45: ## 46: sub guess_encoding_and_read { 47: my ($fn) = @_; 48: no warnings "utf8"; 49: local $/ = undef; 50: open(my $fh, "<", $fn) or die "cannot read $fn: $!"; 51: binmode $fh; 52: my $data = <$fh>; # we need to read the whole file to test if font is a block or inline element 53: close $fh; 54: 55: if (index($data, '<') == -1) { 56: die "This file has no markup !"; 57: } 58: 59: # try to get a charset from a meta at the beginning of the file 60: my $beginning = substr($data, 0, 1024); # to avoid a full match; hopefully we won't cut the charset in half 61: if ($beginning =~ /<meta[^>]*charset\s?=\s?([^\n>"';]*)/i) { 62: my $meta_charset = $1; 63: if ($meta_charset ne '') { 64: if ($meta_charset =~ /iso-?8859-?1/i) { 65: # usually a lie 66: $meta_charset = 'cp1252'; 67: } 68: # now try to decode using that encoding 69: my $decoder = guess_encoding($data, ($meta_charset)); 70: if (ref($decoder)) { 71: my $decoded = $decoder->decode($data); 72: my @lines = split(/^/m, $decoded); 73: return \@lines; 74: } else { 75: print "Warning: decoding did not work with the charset defined by the meta ($meta_charset)\n"; 76: } 77: } 78: } 79: 80: my $decoded; 81: if (length($data) > 0) { 82: # NOTE: this list is too ambigous, Encode::Guess refuses to even try a guess 83: #Encode::Guess->set_suspects(qw/ascii UTF-8 iso-8859-1 MacRoman cp1252/); 84: # by default Encode::Guess uses ascii, utf8 and UTF-16/32 with BOM 85: my $decoder = Encode::Guess->guess($data); 86: if (ref($decoder)) { 87: $decoded = $decoder->decode($data); 88: # NOTE: this seems to accept binary files sometimes (conversion will fail later because it is not really UTF-8) 89: } else { 90: print "Warning: encoding is not UTF-8 for $fn"; 91: 92: # let's try iso-2022-jp first 93: $decoder = Encode::Guess->guess($data, 'iso-2022-jp'); 94: if (ref($decoder)) { 95: $decoded = $decoder->decode($data); 96: print "; using iso-2022-jp\n"; 97: } else { 98: # NOTE: cp1252 is identical to iso-8859-1 but with additionnal characters in range 128-159 99: # instead of control codes. We can assume that these control codes are not used, so there 100: # is no need to test for iso-8859-1. 101: # The main problem here is to distinguish between cp1252 and MacRoman. 102: # see http://www.alanwood.net/demos/charsetdiffs.html#f 103: my $decoded_windows = decode('cp1252', $data); 104: my $decoded_mac = decode('MacRoman', $data); 105: # try to use frequent non-ASCII characters to distinguish the encodings (languages: mostly German, Spanish, Portuguese) 106: # í has been removed because it conflicts with ’ and ’ is more frequent 107: # ± has been removed because it is, suprisingly, the same code in both encodings ! 108: my $score_windows = $decoded_windows =~ tr/ßáàäâãçéèêëñóöôõúüÄÉÑÖÜ¿¡‘’“” °½–—…§//; 109: my $score_mac = $decoded_mac =~ tr/ßáàäâãçéèêëñóöôõúüÄÉÑÖÜ¿¡‘’“” °½–—…§//; 110: # check newlines too (\r on MacOS < X, \r\n on Windows) 111: my $ind_cr = index($data, "\r"); 112: if ($ind_cr != -1) { 113: if (substr($data, $ind_cr + 1, 1) eq "\n") { 114: $score_windows++; 115: } else { 116: $score_mac++; 117: } 118: } 119: if ($score_windows >= $score_mac) { 120: $decoded = $decoded_windows; 121: print "; guess=cp1252 ($score_windows cp1252 >= $score_mac MacRoman)\n"; 122: } else { 123: print "; guess=MacRoman ($score_mac MacRoman > $score_windows cp1252)\n"; 124: $decoded = $decoded_mac; 125: } 126: } 127: } 128: } else { 129: $decoded = ''; 130: } 131: my @lines = split(/^/m, $decoded); 132: return \@lines; 133: } 134: 135: 136: ## 137: # Removes some control characters 138: # @param {Array<string>} lines 139: ## 140: sub remove_control_characters { 141: my ($lines) = @_; 142: foreach my $line (@{$lines}) { 143: $line =~ s/[\x00-\x07\x0B\x0C\x0E-\x1F]//g; 144: $line =~ s/&#[0-7];//g; 145: $line =~ s/&#1[4-9];//g; 146: $line =~ s/&#2[0-9];//g; 147: } 148: } 149: 150: ## 151: # Replaces < and > characters by < and > in cdata elements (listed in @cdata_elements). 152: # EXCEPT for answer when it's inside numericalresponse or formularesponse. 153: # @param {Array<string>} lines 154: ## 155: sub fix_cdata_elements { 156: my ($lines) = @_; 157: my $i = 0; 158: my $j = 0; 159: my $tag = ''; 160: my $type; 161: my $in_numericalresponse = 0; 162: my $in_formularesponse = 0; 163: my $in_script = 0; 164: ($tag, $type, $i, $j) = next_tag($lines, $i, $j); 165: while ($tag ne '') { 166: if ($tag eq 'numericalresponse') { 167: if ($type eq 'start') { 168: $in_numericalresponse = 1; 169: } else { 170: $in_numericalresponse = 0; 171: } 172: } elsif ($tag eq 'formularesponse') { 173: if ($type eq 'start') { 174: $in_formularesponse = 1; 175: } else { 176: $in_formularesponse = 0; 177: } 178: } elsif ($tag eq 'script') { 179: if ($type eq 'start') { 180: $in_script = 1; 181: } else { 182: $in_script = 0; 183: } 184: } 185: if ($type eq 'start' && in_array_ignore_case(\@cdata_elements, $tag) && !$in_script && 186: ($tag ne 'answer' || (!$in_numericalresponse && !$in_formularesponse))) { 187: my $cde = $tag; 188: my $line = $lines->[$i]; 189: $j = index($line, '>', $j+1) + 1; 190: my $stop = 0; 191: while (!$stop && $i < scalar(@{$lines})) { 192: my $indinf = index($line, '<', $j); 193: if ($indinf != -1 && index($line, '<![CDATA[', $indinf) == $indinf) { 194: $i++; 195: $line = $lines->[$i]; 196: $j = 0; 197: last; 198: } 199: my $indsup = index($line, '>', $j); 200: if ($indinf != -1 && $indsup != -1 && $indinf < $indsup) { 201: my $test = substr($line, $indinf + 1, $indsup - ($indinf + 1)); 202: $test =~ s/^\s+|\s+$//g ; 203: if ($test eq '/'.$cde) { 204: $stop = 1; 205: $j = $indsup; 206: # this is commented because of markup like <display>&web(' ','','<p>')</display> 207: #} elsif ($test =~ /^[a-zA-Z\/]$/) { 208: # $j = $indsup + 1; 209: } else { 210: $line = substr($line, 0, $indinf).'<'.substr($line, $indinf+1); 211: $lines->[$i] = $line; 212: } 213: } elsif ($indinf != -1 && $indsup == -1) { 214: $line = substr($line, 0, $indinf).'<'.substr($line, $indinf+1); 215: $lines->[$i] = $line; 216: } elsif ($indsup != -1 && ($indinf == -1 || $indsup < $indinf)) { 217: $line = substr($line, 0, $indsup).'>'.substr($line, $indsup+1); 218: $lines->[$i] = $line; 219: } else { 220: $i++; 221: $line = $lines->[$i]; 222: $j = 0; 223: } 224: } 225: } 226: $j++; 227: ($tag, $type, $i, $j) = next_tag($lines, $i, $j); 228: } 229: } 230: 231: 232: ## 233: # Replaces HTML entities (they are not XML unless a DTD is used, which is no longer recommanded for XHTML). 234: # @param {Array<string>} lines 235: ## 236: sub fix_html_entities { 237: my ($lines) = @_; 238: foreach my $line (@{$lines}) { 239: # html_to_xml is converting named entities before 255 (see HTML parser dtext) 240: # Assuming Windows encoding (Unicode entities are not before 160 and are the same between 160 and 255): 241: $line =~ s/|/€/g; 242: $line =~ s/|/‚/g; 243: $line =~ s/|/„/g; 244: $line =~ s/|/…/g; 245: $line =~ s/|/†/g; 246: $line =~ s/|/‡/g; 247: $line =~ s/|/ˆ/g; 248: $line =~ s/|/‰/g; 249: $line =~ s/|/‹/g; 250: $line =~ s/|/‘/g; 251: $line =~ s/|/’/g; 252: $line =~ s/|/“/g; 253: $line =~ s/|/”/g; 254: $line =~ s/|/•/g; 255: $line =~ s/|/–/g; 256: $line =~ s/|/—/g; 257: $line =~ s/|/˜/g; 258: $line =~ s/|/™/g; 259: $line =~ s/|/›/g; 260: $line =~ s/|/œ/g; 261: } 262: } 263: 264: 265: # Tries to fix things like <font color="#990000" face="Verdana,> 266: # without breaking <a b="c>d"> 267: # This is only fixing tags when there is a single tag in a line (it is impossible to fix in the general case). 268: # Also transforms <a b="c> <d e=" into <a b="c"><d e=" , 269: # and (no markup before)<a b="c> (no quote after) into <a b="c"> . 270: sub fix_missing_quotes { 271: my ($lines) = @_; 272: foreach my $line (@{$lines}) { 273: my $n_inf = $line =~ tr/<//; 274: my $n_sup = $line =~ tr/>//; 275: if ($n_inf == 1 && $n_sup == 1) { 276: my $ind_inf = index($line, '<'); 277: my $ind_sup = index($line, '>'); 278: if ($ind_inf != -1 && $ind_sup != -1 && $ind_inf < $ind_sup) { 279: my $n_quotes = substr($line, $ind_inf, $ind_sup) =~ tr/"//; 280: if ($n_quotes % 2 != 0) { 281: # add a quote before > when there is an odd number of quotes inside <> 282: $line =~ s/>/">/; 283: } 284: } 285: } 286: $line =~ s/(<[a-zA-Z]+ [a-zA-Z]+="[^"<>\s]+)(>\s*<[a-zA-Z]+ [a-zA-Z]+=")/$1"$2/; 287: $line =~ s/^([^"<>]*<[a-zA-Z]+ [a-zA-Z]+="[^"<>\s]+)(>[^"]*)$/$1"$2/; 288: } 289: } 290: 291: 292: # Replaces <li/> by <li> (the end tag will be added in html_to_xml 293: sub fix_empty_li { 294: my ($lines) = @_; 295: foreach my $line (@{$lines}) { 296: $line =~ s/<li\s?\/>/<li>/; 297: } 298: } 299: 300: 301: # remove doctypes, without assuming they are at the beginning 302: sub remove_doctype { 303: my ($lines) = @_; 304: foreach my $line (@{$lines}) { 305: $line =~ s/<!DOCTYPE[^>]*>//; 306: } 307: } 308: 309: 310: # Adds a problem, library or html root element, enclosing things outside of the problem element. 311: # (any extra root element will be removed in post_xml, but this ensures one is added as root if missing). 312: sub add_root { 313: my ($lines, $filepath) = @_; 314: my $root_name; 315: if ($filepath =~ /\.library$/i) { 316: $root_name = 'library'; 317: } elsif ($filepath =~ /\.html?$/i) { 318: $root_name = 'html'; 319: } else { 320: $root_name = 'problem'; 321: } 322: if ($root_name eq 'library') { 323: foreach my $line (@{$lines}) { 324: if ($line =~ /^\s*<[a-z]/) { 325: last; 326: } 327: if ($line !~ /^\s*$/) { 328: die "this library does not start with a tag, it might be a scriptlib"; 329: } 330: } 331: } 332: my $line1 = $lines->[0]; 333: $line1 =~ s/<\?.*\?>//; # remove any PI, it would cause problems later anyway 334: $line1 = "<$root_name>".$line1; 335: $lines->[0] = $line1; 336: $lines->[scalar(@$lines)-1] = $lines->[scalar(@$lines)-1]."</$root_name>"; 337: } 338: 339: 340: ## 341: # Returns information about the next tag, starting at line number and char number. 342: # Assumes the markup is well-formed and there is no CDATA, 343: # which is not always true (like inside script), so results might be wrong sometimes. 344: # It is however useful to avoid unnecessary changes in the document (using a parser to 345: # do read/write for the whole document would mess up non well-formed documents). 346: # @param {Array<string>} lines 347: # @param {int} line_number - line number to start at 348: # @param {int} char_number - char number to start at on the line 349: # @returns (tag, type, line_number, char_number) 350: ## 351: sub next_tag { 352: my ($lines, $i, $j ) = @_; 353: my $i2 = $i; 354: my $j2 = $j; 355: while ($i2 < scalar(@{$lines})) { 356: my $line = $lines->[$i2]; 357: $j2 = index($line, '<', $j2); 358: #TODO: handle comments 359: while ($j2 != -1) { 360: my $ind_slash = index($line, '/', $j2); 361: my $ind_sup = index($line, '>', $j2); 362: my $ind_space = index($line, ' ', $j2); 363: my $type; 364: my $tag; 365: if ($ind_slash == $j2 + 1 && $ind_sup != -1) { 366: $type = 'end'; 367: $tag = substr($line, $j2 + 2, $ind_sup - ($j2 + 2)); 368: } elsif ($ind_slash != -1 && $ind_sup != -1 && $ind_slash == $ind_sup - 1) { 369: $type = 'empty'; 370: if ($ind_space != -1 && $ind_space < $ind_sup) { 371: $tag = substr($line, $j2 + 1, $ind_space - ($j2 + 1)); 372: } else { 373: $tag = substr($line, $j2 + 1, $ind_slash - ($j2 + 1)); 374: } 375: } elsif ($ind_sup != -1) { 376: $type = 'start'; 377: if ($ind_space != -1 && $ind_space < $ind_sup) { 378: $tag = substr($line, $j2 + 1, $ind_space - ($j2 + 1)); 379: } else { 380: $tag = substr($line, $j2 + 1, $ind_sup - ($j2 + 1)); 381: } 382: } else { 383: $tag = '' 384: } 385: if ($tag ne '') { 386: return ($tag, $type, $i2, $j2); 387: } 388: $j2 = index($line, '<', $j2 + 1); 389: } 390: $i2++; 391: $j2 = 0; 392: } 393: return ('', '', 0, 0); 394: } 395: 396: ## 397: # Tests if a string is in an array, ignoring case 398: ## 399: sub in_array_ignore_case { 400: my ($array, $value) = @_; 401: my $lcvalue = lc($value); 402: foreach my $v (@{$array}) { 403: if (lc($v) eq $lcvalue) { 404: return 1; 405: } 406: } 407: return 0; 408: } 409: 410: 1; 411: __END__