File:  [LON-CAPA] / modules / damieng / clean_xml / pre_xml.pm
Revision 1.1: download - view: text, annotated - select for diffs
Fri Apr 17 15:35:01 2015 UTC (9 years, 5 months ago) by damieng
Branches: MAIN
CVS tags: HEAD
added clean_xml and graphical_editor

    1: #!/usr/bin/perl
    2: 
    3: package pre_xml;
    4: 
    5: use strict;
    6: use utf8;
    7: 
    8: use Encode;
    9: use Encode::Byte;
   10: use Encode::Guess;
   11: 
   12: # list of elements inside which < and > might not be turned into entities
   13: # unfortunately, answer can sometimes contain the elements vector and value...
   14: my @cdata_elements = ('answer', 'm', 'display', 'parse'); # not script because the HTML parser will handle it
   15: 
   16: 
   17: # Reads a LON-CAPA 2 file, guesses the encoding, fixes characters in cdata_elements, fixes HTML entities,
   18: # and returns the converted text.
   19: sub pre_xml {
   20:   my ($filepath) = @_;
   21:   
   22:   my $lines = guess_encoding_and_read($filepath);
   23: 
   24:   remove_control_characters($lines);
   25:   
   26:   fix_cdata_elements($lines);
   27: 
   28:   fix_html_entities($lines);
   29:   
   30:   fix_missing_quotes($lines);
   31:   
   32:   fix_empty_li($lines);
   33:   
   34:   remove_doctype($lines);
   35:   
   36:   add_root($lines, $filepath);
   37:   
   38:   return(\join('', @$lines));
   39: }
   40: 
   41: 
   42: ##
   43: # Tries to guess the character encoding, and returns the lines as decoded text.
   44: # Requires Encode::Byte.
   45: ##
   46: sub guess_encoding_and_read {
   47:   my ($fn) = @_;
   48:   no warnings "utf8";
   49:   local $/ = undef;
   50:   open(my $fh, "<", $fn) or die "cannot read $fn: $!";
   51:   binmode $fh;
   52:   my $data = <$fh>; # we need to read the whole file to test if font is a block or inline element
   53:   close $fh;
   54:   
   55:   if (index($data, '<') == -1) {
   56:     die "This file has no markup !";
   57:   }
   58:   
   59:   # try to get a charset from a meta at the beginning of the file
   60:   my $beginning = substr($data, 0, 1024); # to avoid a full match; hopefully we won't cut the charset in half
   61:   if ($beginning =~ /<meta[^>]*charset\s?=\s?([^\n>"';]*)/i) {
   62:     my $meta_charset = $1;
   63:     if ($meta_charset ne '') {
   64:       if ($meta_charset =~ /iso-?8859-?1/i) {
   65:         # usually a lie
   66:         $meta_charset = 'cp1252';
   67:       }
   68:       # now try to decode using that encoding
   69:       my $decoder = guess_encoding($data, ($meta_charset));
   70:       if (ref($decoder)) {
   71:         my $decoded = $decoder->decode($data);
   72:         my @lines = split(/^/m, $decoded);
   73:         return \@lines;
   74:       } else {
   75:         print "Warning: decoding did not work with the charset defined by the meta ($meta_charset)\n";
   76:       }
   77:     }
   78:   }
   79:   
   80:   my $decoded;
   81:   if (length($data) > 0) {
   82:     # NOTE: this list is too ambigous, Encode::Guess refuses to even try a guess
   83:     #Encode::Guess->set_suspects(qw/ascii UTF-8 iso-8859-1 MacRoman cp1252/);
   84:     # by default Encode::Guess uses ascii, utf8 and UTF-16/32 with BOM
   85:     my $decoder = Encode::Guess->guess($data);
   86:     if (ref($decoder)) {
   87:       $decoded = $decoder->decode($data);
   88:       # NOTE: this seems to accept binary files sometimes (conversion will fail later because it is not really UTF-8)
   89:     } else {
   90:       print "Warning: encoding is not UTF-8 for $fn";
   91:       
   92:       # let's try iso-2022-jp first
   93:       $decoder = Encode::Guess->guess($data, 'iso-2022-jp');
   94:       if (ref($decoder)) {
   95:         $decoded = $decoder->decode($data);
   96:         print "; using iso-2022-jp\n";
   97:       } else {
   98:         # NOTE: cp1252 is identical to iso-8859-1 but with additionnal characters in range 128-159
   99:         # instead of control codes. We can assume that these control codes are not used, so there
  100:         # is no need to test for iso-8859-1.
  101:         # The main problem here is to distinguish between cp1252 and MacRoman.
  102:         # see http://www.alanwood.net/demos/charsetdiffs.html#f
  103:         my $decoded_windows = decode('cp1252', $data);
  104:         my $decoded_mac = decode('MacRoman', $data);
  105:         # try to use frequent non-ASCII characters to distinguish the encodings (languages: mostly German, Spanish, Portuguese)
  106:         # í has been removed because it conflicts with ’ and ’ is more frequent
  107:         # ± has been removed because it is, suprisingly, the same code in both encodings !
  108:         my $score_windows = $decoded_windows =~ tr/ßáàäâãçéèêëñóöôõúüÄÉÑÖÜ¿¡‘’“” °½–—…§//;
  109:         my $score_mac = $decoded_mac =~ tr/ßáàäâãçéèêëñóöôõúüÄÉÑÖÜ¿¡‘’“” °½–—…§//;
  110:         # check newlines too (\r on MacOS < X, \r\n on Windows)
  111:         my $ind_cr = index($data, "\r");
  112:         if ($ind_cr != -1) {
  113:           if (substr($data, $ind_cr + 1, 1) eq "\n") {
  114:             $score_windows++;
  115:           } else {
  116:             $score_mac++;
  117:           }
  118:         }
  119:         if ($score_windows >= $score_mac) {
  120:           $decoded = $decoded_windows;
  121:           print "; guess=cp1252 ($score_windows cp1252 >= $score_mac MacRoman)\n";
  122:         } else {
  123:           print "; guess=MacRoman ($score_mac MacRoman > $score_windows cp1252)\n";
  124:           $decoded = $decoded_mac;
  125:         }
  126:       }
  127:     }
  128:   } else {
  129:     $decoded = '';
  130:   }
  131:   my @lines = split(/^/m, $decoded);
  132:   return \@lines;
  133: }
  134: 
  135: 
  136: ##
  137: # Removes some control characters
  138: # @param {Array<string>} lines
  139: ##
  140: sub remove_control_characters {
  141:   my ($lines) = @_;
  142:   foreach my $line (@{$lines}) {
  143:     $line =~ s/[\x00-\x07\x0B\x0C\x0E-\x1F]//g;
  144:     $line =~ s/&#[0-7];//g;
  145:     $line =~ s/&#1[4-9];//g;
  146:     $line =~ s/&#2[0-9];//g;
  147:   }
  148: }
  149: 
  150: ##
  151: # Replaces < and > characters by &lt; and &gt; in cdata elements (listed in @cdata_elements).
  152: # EXCEPT for answer when it's inside numericalresponse or formularesponse.
  153: # @param {Array<string>} lines
  154: ##
  155: sub fix_cdata_elements {
  156:   my ($lines) = @_;
  157:   my $i = 0;
  158:   my $j = 0;
  159:   my $tag = '';
  160:   my $type;
  161:   my $in_numericalresponse = 0;
  162:   my $in_formularesponse = 0;
  163:   my $in_script = 0;
  164:   ($tag, $type, $i, $j) = next_tag($lines, $i, $j);
  165:   while ($tag ne '') {
  166:     if ($tag eq 'numericalresponse') {
  167:       if ($type eq 'start') {
  168:         $in_numericalresponse = 1;
  169:       } else {
  170:         $in_numericalresponse = 0;
  171:       }
  172:     } elsif ($tag eq 'formularesponse') {
  173:       if ($type eq 'start') {
  174:         $in_formularesponse = 1;
  175:       } else {
  176:         $in_formularesponse = 0;
  177:       }
  178:     } elsif ($tag eq 'script') {
  179:       if ($type eq 'start') {
  180:         $in_script = 1;
  181:       } else {
  182:         $in_script = 0;
  183:       }
  184:     }
  185:     if ($type eq 'start' && in_array_ignore_case(\@cdata_elements, $tag) && !$in_script &&
  186:         ($tag ne 'answer' || (!$in_numericalresponse && !$in_formularesponse))) {
  187:       my $cde = $tag;
  188:       my $line = $lines->[$i];
  189:       $j = index($line, '>', $j+1) + 1;
  190:       my $stop = 0;
  191:       while (!$stop && $i < scalar(@{$lines})) {
  192:         my $indinf = index($line, '<', $j);
  193:         if ($indinf != -1 && index($line, '<![CDATA[', $indinf) == $indinf) {
  194:           $i++;
  195:           $line = $lines->[$i];
  196:           $j = 0;
  197:           last;
  198:         }
  199:         my $indsup = index($line, '>', $j);
  200:         if ($indinf != -1 && $indsup != -1 && $indinf < $indsup) {
  201:           my $test = substr($line, $indinf + 1, $indsup - ($indinf + 1));
  202:           $test =~ s/^\s+|\s+$//g ;
  203:           if ($test eq '/'.$cde) {
  204:             $stop = 1;
  205:             $j = $indsup;
  206:           # this is commented because of markup like <display>&web(' ','','<p>')</display>
  207:           #} elsif ($test =~ /^[a-zA-Z\/]$/) {
  208:           #  $j = $indsup + 1;
  209:           } else {
  210:             $line = substr($line, 0, $indinf).'&lt;'.substr($line, $indinf+1);
  211:             $lines->[$i] = $line;
  212:           }
  213:         } elsif ($indinf != -1 && $indsup == -1) {
  214:           $line = substr($line, 0, $indinf).'&lt;'.substr($line, $indinf+1);
  215:           $lines->[$i] = $line;
  216:         } elsif ($indsup != -1 && ($indinf == -1 || $indsup < $indinf)) {
  217:           $line = substr($line, 0, $indsup).'&gt;'.substr($line, $indsup+1);
  218:           $lines->[$i] = $line;
  219:         } else {
  220:           $i++;
  221:           $line = $lines->[$i];
  222:           $j = 0;
  223:         }
  224:       }
  225:     }
  226:     $j++;
  227:     ($tag, $type, $i, $j) = next_tag($lines, $i, $j);
  228:   }
  229: }
  230: 
  231: 
  232: ##
  233: # Replaces HTML entities (they are not XML unless a DTD is used, which is no longer recommanded for XHTML).
  234: # @param {Array<string>} lines
  235: ##
  236: sub fix_html_entities {
  237:   my ($lines) = @_;
  238:   foreach my $line (@{$lines}) {
  239:     # html_to_xml is converting named entities before 255 (see HTML parser dtext)
  240:     # Assuming Windows encoding (Unicode entities are not before 160 and are the same between 160 and 255):
  241:     $line =~ s/&#128;|&#x80;/€/g;
  242:     $line =~ s/&#130;|&#x82;/‚/g;
  243:     $line =~ s/&#132;|&#x84;/„/g;
  244:     $line =~ s/&#133;|&#x85;/…/g;
  245:     $line =~ s/&#134;|&#x86;/†/g;
  246:     $line =~ s/&#135;|&#x87;/‡/g;
  247:     $line =~ s/&#136;|&#x88;/ˆ/g;
  248:     $line =~ s/&#137;|&#x89;/‰/g;
  249:     $line =~ s/&#139;|&#x8B;/‹/g;
  250:     $line =~ s/&#145;|&#x91;/‘/g;
  251:     $line =~ s/&#146;|&#x92;/’/g;
  252:     $line =~ s/&#147;|&#x93;/“/g;
  253:     $line =~ s/&#148;|&#x94;/”/g;
  254:     $line =~ s/&#149;|&#x95;/•/g;
  255:     $line =~ s/&#150;|&#x96;/–/g;
  256:     $line =~ s/&#151;|&#x97;/—/g;
  257:     $line =~ s/&#152;|&#x98;/˜/g;
  258:     $line =~ s/&#153;|&#x99;/™/g;
  259:     $line =~ s/&#155;|&#x9B;/›/g;
  260:     $line =~ s/&#156;|&#x9C;/œ/g;
  261:   }
  262: }
  263: 
  264: 
  265: # Tries to fix things like <font color="#990000" face="Verdana,>
  266: # without breaking <a b="c>d">
  267: # This is only fixing tags when there is a single tag in a line (it is impossible to fix in the general case).
  268: # Also transforms <a b="c> <d e=" into <a b="c"><d e=" ,
  269: # and (no markup before)<a b="c> (no quote after) into <a b="c"> .
  270: sub fix_missing_quotes {
  271:   my ($lines) = @_;
  272:   foreach my $line (@{$lines}) {
  273:     my $n_inf = $line =~ tr/<//;
  274:     my $n_sup = $line =~ tr/>//;
  275:     if ($n_inf == 1 && $n_sup == 1) {
  276:       my $ind_inf = index($line, '<');
  277:       my $ind_sup = index($line, '>');
  278:       if ($ind_inf != -1 && $ind_sup != -1 && $ind_inf < $ind_sup) {
  279:         my $n_quotes = substr($line, $ind_inf, $ind_sup) =~ tr/"//;
  280:         if ($n_quotes % 2 != 0) {
  281:           # add a quote before > when there is an odd number of quotes inside <>
  282:           $line =~ s/>/">/;
  283:         }
  284:       }
  285:     }
  286:     $line =~ s/(<[a-zA-Z]+ [a-zA-Z]+="[^"<>\s]+)(>\s*<[a-zA-Z]+ [a-zA-Z]+=")/$1"$2/;
  287:     $line =~ s/^([^"<>]*<[a-zA-Z]+ [a-zA-Z]+="[^"<>\s]+)(>[^"]*)$/$1"$2/;
  288:   }
  289: }
  290: 
  291: 
  292: # Replaces <li/> by <li> (the end tag will be added in html_to_xml
  293: sub fix_empty_li {
  294:   my ($lines) = @_;
  295:   foreach my $line (@{$lines}) {
  296:     $line =~ s/<li\s?\/>/<li>/;
  297:   }
  298: }
  299: 
  300: 
  301: # remove doctypes, without assuming they are at the beginning
  302: sub remove_doctype {
  303:   my ($lines) = @_;
  304:   foreach my $line (@{$lines}) {
  305:     $line =~ s/<!DOCTYPE[^>]*>//;
  306:   }
  307: }
  308: 
  309: 
  310: # Adds a problem, library or html root element, enclosing things outside of the problem element.
  311: # (any extra root element will be removed in post_xml, but this ensures one is added as root if missing).
  312: sub add_root {
  313:   my ($lines, $filepath) = @_;
  314:   my $root_name;
  315:   if ($filepath =~ /\.library$/i) {
  316:     $root_name = 'library';
  317:   } elsif ($filepath =~ /\.html?$/i) {
  318:     $root_name = 'html';
  319:   } else {
  320:     $root_name = 'problem';
  321:   }
  322:   if ($root_name eq 'library') {
  323:     foreach my $line (@{$lines}) {
  324:       if ($line =~ /^\s*<[a-z]/) {
  325:         last;
  326:       }
  327:       if ($line !~ /^\s*$/) {
  328:         die "this library does not start with a tag, it might be a scriptlib";
  329:       }
  330:     }
  331:   }
  332:   my $line1 = $lines->[0];
  333:   $line1 =~ s/<\?.*\?>//; # remove any PI, it would cause problems later anyway
  334:   $line1 = "<$root_name>".$line1;
  335:   $lines->[0] = $line1;
  336:   $lines->[scalar(@$lines)-1] = $lines->[scalar(@$lines)-1]."</$root_name>";
  337: }
  338: 
  339: 
  340: ##
  341: # Returns information about the next tag, starting at line number and char number.
  342: # Assumes the markup is well-formed and there is no CDATA,
  343: # which is not always true (like inside script), so results might be wrong sometimes.
  344: # It is however useful to avoid unnecessary changes in the document (using a parser to
  345: # do read/write for the whole document would mess up non well-formed documents).
  346: # @param {Array<string>} lines
  347: # @param {int} line_number - line number to start at
  348: # @param {int} char_number - char number to start at on the line
  349: # @returns (tag, type, line_number, char_number)
  350: ##
  351: sub next_tag {
  352:   my ($lines, $i, $j ) = @_;
  353:   my $i2 = $i;
  354:   my $j2 = $j;
  355:   while ($i2 < scalar(@{$lines})) {
  356:     my $line = $lines->[$i2];
  357:     $j2 = index($line, '<', $j2);
  358:     #TODO: handle comments
  359:     while ($j2 != -1) {
  360:       my $ind_slash = index($line, '/', $j2);
  361:       my $ind_sup = index($line, '>', $j2);
  362:       my $ind_space = index($line, ' ', $j2);
  363:       my $type;
  364:       my $tag;
  365:       if ($ind_slash == $j2 + 1 && $ind_sup != -1) {
  366:         $type = 'end';
  367:         $tag = substr($line, $j2 + 2, $ind_sup - ($j2 + 2));
  368:       } elsif ($ind_slash != -1 && $ind_sup != -1 && $ind_slash == $ind_sup - 1) {
  369:         $type = 'empty';
  370:         if ($ind_space != -1 && $ind_space < $ind_sup) {
  371:           $tag = substr($line, $j2 + 1, $ind_space - ($j2 + 1));
  372:         } else {
  373:           $tag = substr($line, $j2 + 1, $ind_slash - ($j2 + 1));
  374:         }
  375:       } elsif ($ind_sup != -1) {
  376:         $type = 'start';
  377:         if ($ind_space != -1 && $ind_space < $ind_sup) {
  378:           $tag = substr($line, $j2 + 1, $ind_space - ($j2 + 1));
  379:         } else {
  380:           $tag = substr($line, $j2 + 1, $ind_sup - ($j2 + 1));
  381:         }
  382:       } else {
  383:         $tag = ''
  384:       }
  385:       if ($tag ne '') {
  386:         return ($tag, $type, $i2, $j2);
  387:       }
  388:       $j2 = index($line, '<', $j2 + 1);
  389:     }
  390:     $i2++;
  391:     $j2 = 0;
  392:   }
  393:   return ('', '', 0, 0);
  394: }
  395: 
  396: ##
  397: # Tests if a string is in an array, ignoring case
  398: ##
  399: sub in_array_ignore_case {
  400:   my ($array, $value) = @_;
  401:   my $lcvalue = lc($value);
  402:   foreach my $v (@{$array}) {
  403:     if (lc($v) eq $lcvalue) {
  404:       return 1;
  405:     }
  406:   }
  407:   return 0;
  408: }
  409: 
  410: 1;
  411: __END__

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>