loncom/homework/cleanxml/pre_xml.pm - annotate

Return to pre_xml.pm CVS log
Up to [LON-CAPA] / loncom / homework / cleanxml
Annotation of loncom/homework/cleanxml/pre_xml.pm, revision 1.2

1.1       damieng     1: # The LearningOnline Network
                      2: # First step to clean a file.
                      3: #
1.2     ! damieng     4: # $Id: pre_xml.pm,v 1.1 2015/12/03 20:40:31 damieng Exp $
1.1       damieng     5: #
                      6: # Copyright Michigan State University Board of Trustees
                      7: #
                      8: # This file is part of the LearningOnline Network with CAPA (LON-CAPA).
                      9: #
                     10: # LON-CAPA is free software; you can redistribute it and/or modify
                     11: # it under the terms of the GNU General Public License as published by
                     12: # the Free Software Foundation; either version 2 of the License, or
                     13: # (at your option) any later version.
                     14: #
                     15: # LON-CAPA is distributed in the hope that it will be useful,
                     16: # but WITHOUT ANY WARRANTY; without even the implied warranty of
                     17: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
                     18: # GNU General Public License for more details.
                     19: #
                     20: # You should have received a copy of the GNU General Public License
                     21: # along with LON-CAPA; if not, write to the Free Software
                     22: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
                     23: #
                     24: # /home/httpd/html/adm/gpl.txt
                     25: #
                     26: # http://www.lon-capa.org/
                     27: #
                     28: ###
                     29: 
                     30: #!/usr/bin/perl
                     31: 
                     32: package Apache::pre_xml;
                     33: 
                     34: use strict;
                     35: use utf8;
                     36: 
                     37: use Encode;
                     38: use Encode::Byte;
                     39: use Encode::Guess;
                     40: 
                     41: # list of elements inside which < and > might not be turned into entities
                     42: # unfortunately, answer can sometimes contain the elements vector and value...
                     43: my @cdata_elements = ('answer', 'm', 'display', 'parse'); # not script because the HTML parser will handle it
                     44: 
                     45: my $warnings; # 1 = print warnings
                     46: 
                     47: # Reads a LON-CAPA 2 file, guesses the encoding, fixes characters in cdata_elements, fixes HTML entities,
                     48: # and returns the converted text.
                     49: sub pre_xml {
                     50:   my ($filepath, $warn) = @_;
                     51:   $warnings = $warn;
                     52:   
                     53:   my $lines = guess_encoding_and_read($filepath);
                     54: 
                     55:   remove_control_characters($lines);
                     56:   
                     57:   fix_cdata_elements($lines);
                     58: 
                     59:   fix_html_entities($lines);
                     60:   
                     61:   fix_missing_quotes($lines);
                     62:   
                     63:   fix_empty_li($lines);
                     64:   
                     65:   remove_doctype($lines);
                     66:   
                     67:   add_root($lines, $filepath);
                     68:   
                     69:   return(\join('', @$lines));
                     70: }
                     71: 
                     72: 
                     73: ##
                     74: # Tries to guess the character encoding, and returns the lines as decoded text.
                     75: # Requires Encode::Byte.
                     76: ##
                     77: sub guess_encoding_and_read {
                     78:   my ($fn) = @_;
                     79:   no warnings "utf8";
                     80:   local $/ = undef;
                     81:   open(my $fh, "<", $fn) or die "cannot read $fn: $!";
                     82:   binmode $fh;
                     83:   my $data = <$fh>; # we need to read the whole file to test if font is a block or inline element
                     84:   close $fh;
                     85:   
                     86:   if (index($data, '<') == -1) {
                     87:     die "This file has no markup !";
                     88:   }
                     89:   
                     90:   # try to get a charset from a meta at the beginning of the file
                     91:   my $beginning = substr($data, 0, 1024); # to avoid a full match; hopefully we won't cut the charset in half
                     92:   if ($beginning =~ /<meta[^>]*charset\s?=\s?([^\n>"';]*)/i) {
                     93:     my $meta_charset = $1;
                     94:     if ($meta_charset ne '') {
                     95:       if ($meta_charset =~ /iso-?8859-?1/i) {
                     96:         # usually a lie
                     97:         $meta_charset = 'cp1252';
                     98:       }
                     99:       # now try to decode using that encoding
                    100:       my $decoder = guess_encoding($data, ($meta_charset));
                    101:       if (ref($decoder)) {
                    102:         my $decoded = $decoder->decode($data);
                    103:         my @lines = split(/^/m, $decoded);
                    104:         return \@lines;
                    105:       } else {
                    106:         if ($warnings) {
                    107:           print "Warning: decoding did not work with the charset defined by the meta ($meta_charset)\n";
                    108:         }
                    109:       }
                    110:     }
                    111:   }
                    112:   
                    113:   my $decoded;
                    114:   if (length($data) > 0) {
                    115:     # NOTE: this list is too ambigous, Encode::Guess refuses to even try a guess
                    116:     #Encode::Guess->set_suspects(qw/ascii UTF-8 iso-8859-1 MacRoman cp1252/);
                    117:     # by default Encode::Guess uses ascii, utf8 and UTF-16/32 with BOM
                    118:     my $decoder = Encode::Guess->guess($data);
                    119:     if (ref($decoder)) {
                    120:       $decoded = $decoder->decode($data);
                    121:       # NOTE: this seems to accept binary files sometimes (conversion will fail later because it is not really UTF-8)
                    122:     } else {
                    123:       if ($warnings) {
                    124:         print "Warning: encoding is not UTF-8 for $fn";
                    125:       }
                    126:       
                    127:       # let's try iso-2022-jp first
                    128:       $decoder = Encode::Guess->guess($data, 'iso-2022-jp');
                    129:       if (ref($decoder)) {
                    130:         $decoded = $decoder->decode($data);
                    131:         if ($warnings) {
                    132:           print "; using iso-2022-jp\n";
                    133:         }
                    134:       } else {
                    135:         # NOTE: cp1252 is identical to iso-8859-1 but with additionnal characters in range 128-159
                    136:         # instead of control codes. We can assume that these control codes are not used, so there
                    137:         # is no need to test for iso-8859-1.
                    138:         # The main problem here is to distinguish between cp1252 and MacRoman.
                    139:         # see http://www.alanwood.net/demos/charsetdiffs.html#f
                    140:         my $decoded_windows = decode('cp1252', $data);
                    141:         my $decoded_mac = decode('MacRoman', $data);
                    142:         # try to use frequent non-ASCII characters to distinguish the encodings (languages: mostly German, Spanish, Portuguese)
                    143:         # í has been removed because it conflicts with ’ and ’ is more frequent
                    144:         # ± has been removed because it is, suprisingly, the same code in both encodings !
                    145:         my $score_windows = $decoded_windows =~ tr/ßáàäâãçéèêëñóöôõúüÄÉÑÖÜ¿¡‘’“” °½–—…§//;
                    146:         my $score_mac = $decoded_mac =~ tr/ßáàäâãçéèêëñóöôõúüÄÉÑÖÜ¿¡‘’“” °½–—…§//;
                    147:         # check newlines too (\r on MacOS < X, \r\n on Windows)
                    148:         my $ind_cr = index($data, "\r");
                    149:         if ($ind_cr != -1) {
                    150:           if (substr($data, $ind_cr + 1, 1) eq "\n") {
                    151:             $score_windows++;
                    152:           } else {
                    153:             $score_mac++;
                    154:           }
                    155:         }
                    156:         if ($score_windows >= $score_mac) {
                    157:           $decoded = $decoded_windows;
                    158:           if ($warnings) {
                    159:             print "; guess=cp1252 ($score_windows cp1252 >= $score_mac MacRoman)\n";
                    160:           }
                    161:         } else {
                    162:           if ($warnings) {
                    163:             print "; guess=MacRoman ($score_mac MacRoman > $score_windows cp1252)\n";
                    164:           }
                    165:           $decoded = $decoded_mac;
                    166:         }
                    167:       }
                    168:     }
                    169:   } else {
                    170:     $decoded = '';
                    171:   }
                    172:   my @lines = split(/^/m, $decoded);
                    173:   return \@lines;
                    174: }
                    175: 
                    176: 
                    177: ##
                    178: # Removes some control characters
                    179: # @param {Array<string>} lines
                    180: ##
                    181: sub remove_control_characters {
                    182:   my ($lines) = @_;
                    183:   foreach my $line (@{$lines}) {
                    184:     $line =~ s/[\x00-\x07\x0B\x0C\x0E-\x1F]//g;
                    185:     $line =~ s/&#[0-7];//g;
                    186:     $line =~ s/&#1[4-9];//g;
                    187:     $line =~ s/&#2[0-9];//g;
                    188:   }
                    189: }
                    190: 
                    191: ##
                    192: # Replaces < and > characters by &lt; and &gt; in cdata elements (listed in @cdata_elements).
                    193: # EXCEPT for answer when it's inside numericalresponse or formularesponse.
                    194: # @param {Array<string>} lines
                    195: ##
                    196: sub fix_cdata_elements {
                    197:   my ($lines) = @_;
                    198:   my $i = 0;
                    199:   my $j = 0;
                    200:   my $tag = '';
                    201:   my $type;
                    202:   my $in_numericalresponse = 0;
                    203:   my $in_formularesponse = 0;
                    204:   my $in_script = 0;
                    205:   ($tag, $type, $i, $j) = next_tag($lines, $i, $j);
                    206:   while ($tag ne '') {
                    207:     if ($tag eq 'numericalresponse') {
                    208:       if ($type eq 'start') {
                    209:         $in_numericalresponse = 1;
                    210:       } else {
                    211:         $in_numericalresponse = 0;
                    212:       }
                    213:     } elsif ($tag eq 'formularesponse') {
                    214:       if ($type eq 'start') {
                    215:         $in_formularesponse = 1;
                    216:       } else {
                    217:         $in_formularesponse = 0;
                    218:       }
                    219:     } elsif ($tag eq 'script') {
                    220:       if ($type eq 'start') {
                    221:         $in_script = 1;
                    222:       } else {
                    223:         $in_script = 0;
                    224:       }
                    225:     }
                    226:     if ($type eq 'start' && in_array_ignore_case(\@cdata_elements, $tag) && !$in_script &&
                    227:         ($tag ne 'answer' || (!$in_numericalresponse && !$in_formularesponse))) {
                    228:       my $cde = $tag;
                    229:       my $line = $lines->[$i];
                    230:       $j = index($line, '>', $j+1) + 1;
                    231:       my $stop = 0;
                    232:       while (!$stop && $i < scalar(@{$lines})) {
                    233:         my $indinf = index($line, '<', $j);
                    234:         if ($indinf != -1 && index($line, '<![CDATA[', $indinf) == $indinf) {
                    235:           $i++;
                    236:           $line = $lines->[$i];
                    237:           $j = 0;
                    238:           last;
                    239:         }
                    240:         my $indsup = index($line, '>', $j);
                    241:         if ($indinf != -1 && $indsup != -1 && $indinf < $indsup) {
                    242:           my $test = substr($line, $indinf + 1, $indsup - ($indinf + 1));
                    243:           $test =~ s/^\s+|\s+$//g ;
                    244:           if ($test eq '/'.$cde) {
                    245:             $stop = 1;
                    246:             $j = $indsup;
                    247:           # this is commented because of markup like <display>&web(' ','','<p>')</display>
                    248:           #} elsif ($test =~ /^[a-zA-Z\/]$/) {
                    249:           #  $j = $indsup + 1;
                    250:           } else {
                    251:             $line = substr($line, 0, $indinf).'&lt;'.substr($line, $indinf+1);
                    252:             $lines->[$i] = $line;
                    253:           }
                    254:         } elsif ($indinf != -1 && $indsup == -1) {
                    255:           $line = substr($line, 0, $indinf).'&lt;'.substr($line, $indinf+1);
                    256:           $lines->[$i] = $line;
                    257:         } elsif ($indsup != -1 && ($indinf == -1 || $indsup < $indinf)) {
                    258:           $line = substr($line, 0, $indsup).'&gt;'.substr($line, $indsup+1);
                    259:           $lines->[$i] = $line;
                    260:         } else {
                    261:           $i++;
                    262:           $line = $lines->[$i];
                    263:           $j = 0;
                    264:         }
                    265:       }
                    266:     }
                    267:     $j++;
                    268:     ($tag, $type, $i, $j) = next_tag($lines, $i, $j);
                    269:   }
                    270: }
                    271: 
                    272: 
                    273: ##
                    274: # Replaces HTML entities (they are not XML unless a DTD is used, which is no longer recommanded for XHTML).
                    275: # @param {Array<string>} lines
                    276: ##
                    277: sub fix_html_entities {
                    278:   my ($lines) = @_;
                    279:   foreach my $line (@{$lines}) {
                    280:     # html_to_xml is converting named entities before 255 (see HTML parser dtext)
                    281:     # Assuming Windows encoding (Unicode entities are not before 160 and are the same between 160 and 255):
                    282:     $line =~ s/&#128;|&#x80;/€/g;
                    283:     $line =~ s/&#130;|&#x82;/‚/g;
                    284:     $line =~ s/&#132;|&#x84;/„/g;
                    285:     $line =~ s/&#133;|&#x85;/…/g;
                    286:     $line =~ s/&#134;|&#x86;/†/g;
                    287:     $line =~ s/&#135;|&#x87;/‡/g;
                    288:     $line =~ s/&#136;|&#x88;/ˆ/g;
                    289:     $line =~ s/&#137;|&#x89;/‰/g;
                    290:     $line =~ s/&#139;|&#x8B;/‹/g;
                    291:     $line =~ s/&#145;|&#x91;/‘/g;
                    292:     $line =~ s/&#146;|&#x92;/’/g;
                    293:     $line =~ s/&#147;|&#x93;/“/g;
                    294:     $line =~ s/&#148;|&#x94;/”/g;
                    295:     $line =~ s/&#149;|&#x95;/•/g;
                    296:     $line =~ s/&#150;|&#x96;/–/g;
                    297:     $line =~ s/&#151;|&#x97;/—/g;
                    298:     $line =~ s/&#152;|&#x98;/˜/g;
                    299:     $line =~ s/&#153;|&#x99;/™/g;
                    300:     $line =~ s/&#155;|&#x9B;/›/g;
                    301:     $line =~ s/&#156;|&#x9C;/œ/g;
                    302:   }
                    303: }
                    304: 
                    305: 
                    306: # Tries to fix things like <font color="#990000" face="Verdana,>
                    307: # without breaking <a b="c>d">
                    308: # This is only fixing tags when there is a single tag in a line (it is impossible to fix in the general case).
                    309: # Also transforms <a b="c> <d e=" into <a b="c"><d e=" ,
                    310: # and (no markup before)<a b="c> (no quote after) into <a b="c"> .
                    311: sub fix_missing_quotes {
                    312:   my ($lines) = @_;
                    313:   foreach my $line (@{$lines}) {
                    314:     my $n_inf = $line =~ tr/<//;
                    315:     my $n_sup = $line =~ tr/>//;
                    316:     if ($n_inf == 1 && $n_sup == 1) {
                    317:       my $ind_inf = index($line, '<');
                    318:       my $ind_sup = index($line, '>');
                    319:       if ($ind_inf != -1 && $ind_sup != -1 && $ind_inf < $ind_sup) {
                    320:         my $n_quotes = substr($line, $ind_inf, $ind_sup) =~ tr/"//;
                    321:         if ($n_quotes % 2 != 0) {
                    322:           # add a quote before > when there is an odd number of quotes inside <>
                    323:           $line =~ s/>/">/;
                    324:         }
                    325:       }
                    326:     }
                    327:     $line =~ s/(<[a-zA-Z]+ [a-zA-Z]+="[^"<>\s]+)(>\s*<[a-zA-Z]+ [a-zA-Z]+=")/$1"$2/;
                    328:     $line =~ s/^([^"<>]*<[a-zA-Z]+ [a-zA-Z]+="[^"<>\s]+)(>[^"]*)$/$1"$2/;
                    329:   }
                    330: }
                    331: 
                    332: 
                    333: # Replaces <li/> by <li> (the end tag will be added in html_to_xml
                    334: sub fix_empty_li {
                    335:   my ($lines) = @_;
                    336:   foreach my $line (@{$lines}) {
                    337:     $line =~ s/<li\s?\/>/<li>/;
                    338:   }
                    339: }
                    340: 
                    341: 
                    342: # remove doctypes, without assuming they are at the beginning
                    343: sub remove_doctype {
                    344:   my ($lines) = @_;
                    345:   foreach my $line (@{$lines}) {
                    346:     $line =~ s/<!DOCTYPE[^>]*>//;
                    347:   }
                    348: }
                    349: 
                    350: 
                    351: # Adds a problem, library or html root element, enclosing things outside of the problem element.
                    352: # (any extra root element will be removed in post_xml, but this ensures one is added as root if missing).
                    353: sub add_root {
                    354:   my ($lines, $filepath) = @_;
                    355:   my $root_name;
                    356:   if ($filepath =~ /\.library$/i) {
                    357:     $root_name = 'library';
                    358:   } elsif ($filepath =~ /\.html?$/i) {
                    359:     $root_name = 'html';
1.2     ! damieng   360:   } elsif ($filepath =~ /\.task?$/i) {
        !           361:     $root_name = 'Task';
1.1       damieng   362:   } else {
                    363:     $root_name = 'problem';
                    364:   }
                    365:   if ($root_name eq 'library') {
                    366:     foreach my $line (@{$lines}) {
                    367:       if ($line =~ /^\s*<[a-z]/) {
                    368:         last;
                    369:       }
                    370:       if ($line !~ /^\s*$/) {
                    371:         die "this library does not start with a tag, it might be a scriptlib";
                    372:       }
                    373:     }
                    374:   }
                    375:   my $line1 = $lines->[0];
                    376:   $line1 =~ s/<\?.*\?>//; # remove any PI, it would cause problems later anyway
                    377:   $line1 = "<$root_name>".$line1;
                    378:   $lines->[0] = $line1;
                    379:   $lines->[scalar(@$lines)-1] = $lines->[scalar(@$lines)-1]."</$root_name>";
                    380: }
                    381: 
                    382: 
                    383: ##
                    384: # Returns information about the next tag, starting at line number and char number.
                    385: # Assumes the markup is well-formed and there is no CDATA,
                    386: # which is not always true (like inside script), so results might be wrong sometimes.
                    387: # It is however useful to avoid unnecessary changes in the document (using a parser to
                    388: # do read/write for the whole document would mess up non well-formed documents).
                    389: # @param {Array<string>} lines
                    390: # @param {int} line_number - line number to start at
                    391: # @param {int} char_number - char number to start at on the line
                    392: # @returns (tag, type, line_number, char_number)
                    393: ##
                    394: sub next_tag {
                    395:   my ($lines, $i, $j ) = @_;
                    396:   my $i2 = $i;
                    397:   my $j2 = $j;
                    398:   while ($i2 < scalar(@{$lines})) {
                    399:     my $line = $lines->[$i2];
                    400:     $j2 = index($line, '<', $j2);
                    401:     #TODO: handle comments
                    402:     while ($j2 != -1) {
                    403:       my $ind_slash = index($line, '/', $j2);
                    404:       my $ind_sup = index($line, '>', $j2);
                    405:       my $ind_space = index($line, ' ', $j2);
                    406:       my $type;
                    407:       my $tag;
                    408:       if ($ind_slash == $j2 + 1 && $ind_sup != -1) {
                    409:         $type = 'end';
                    410:         $tag = substr($line, $j2 + 2, $ind_sup - ($j2 + 2));
                    411:       } elsif ($ind_slash != -1 && $ind_sup != -1 && $ind_slash == $ind_sup - 1) {
                    412:         $type = 'empty';
                    413:         if ($ind_space != -1 && $ind_space < $ind_sup) {
                    414:           $tag = substr($line, $j2 + 1, $ind_space - ($j2 + 1));
                    415:         } else {
                    416:           $tag = substr($line, $j2 + 1, $ind_slash - ($j2 + 1));
                    417:         }
                    418:       } elsif ($ind_sup != -1) {
                    419:         $type = 'start';
                    420:         if ($ind_space != -1 && $ind_space < $ind_sup) {
                    421:           $tag = substr($line, $j2 + 1, $ind_space - ($j2 + 1));
                    422:         } else {
                    423:           $tag = substr($line, $j2 + 1, $ind_sup - ($j2 + 1));
                    424:         }
                    425:       } else {
                    426:         $tag = ''
                    427:       }
                    428:       if ($tag ne '') {
                    429:         return ($tag, $type, $i2, $j2);
                    430:       }
                    431:       $j2 = index($line, '<', $j2 + 1);
                    432:     }
                    433:     $i2++;
                    434:     $j2 = 0;
                    435:   }
                    436:   return ('', '', 0, 0);
                    437: }
                    438: 
                    439: ##
                    440: # Tests if a string is in an array, ignoring case
                    441: ##
                    442: sub in_array_ignore_case {
                    443:   my ($array, $value) = @_;
                    444:   my $lcvalue = lc($value);
                    445:   foreach my $v (@{$array}) {
                    446:     if (lc($v) eq $lcvalue) {
                    447:       return 1;
                    448:     }
                    449:   }
                    450:   return 0;
                    451: }
                    452: 
                    453: 1;
                    454: __END__
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>