Annotation of loncom/homework/cleanxml/pre_xml.pm, revision 1.2
1.1 damieng 1: # The LearningOnline Network
2: # First step to clean a file.
3: #
1.2 ! damieng 4: # $Id: pre_xml.pm,v 1.1 2015/12/03 20:40:31 damieng Exp $
1.1 damieng 5: #
6: # Copyright Michigan State University Board of Trustees
7: #
8: # This file is part of the LearningOnline Network with CAPA (LON-CAPA).
9: #
10: # LON-CAPA is free software; you can redistribute it and/or modify
11: # it under the terms of the GNU General Public License as published by
12: # the Free Software Foundation; either version 2 of the License, or
13: # (at your option) any later version.
14: #
15: # LON-CAPA is distributed in the hope that it will be useful,
16: # but WITHOUT ANY WARRANTY; without even the implied warranty of
17: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18: # GNU General Public License for more details.
19: #
20: # You should have received a copy of the GNU General Public License
21: # along with LON-CAPA; if not, write to the Free Software
22: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23: #
24: # /home/httpd/html/adm/gpl.txt
25: #
26: # http://www.lon-capa.org/
27: #
28: ###
29:
30: #!/usr/bin/perl
31:
32: package Apache::pre_xml;
33:
34: use strict;
35: use utf8;
36:
37: use Encode;
38: use Encode::Byte;
39: use Encode::Guess;
40:
41: # list of elements inside which < and > might not be turned into entities
42: # unfortunately, answer can sometimes contain the elements vector and value...
43: my @cdata_elements = ('answer', 'm', 'display', 'parse'); # not script because the HTML parser will handle it
44:
45: my $warnings; # 1 = print warnings
46:
47: # Reads a LON-CAPA 2 file, guesses the encoding, fixes characters in cdata_elements, fixes HTML entities,
48: # and returns the converted text.
49: sub pre_xml {
50: my ($filepath, $warn) = @_;
51: $warnings = $warn;
52:
53: my $lines = guess_encoding_and_read($filepath);
54:
55: remove_control_characters($lines);
56:
57: fix_cdata_elements($lines);
58:
59: fix_html_entities($lines);
60:
61: fix_missing_quotes($lines);
62:
63: fix_empty_li($lines);
64:
65: remove_doctype($lines);
66:
67: add_root($lines, $filepath);
68:
69: return(\join('', @$lines));
70: }
71:
72:
73: ##
74: # Tries to guess the character encoding, and returns the lines as decoded text.
75: # Requires Encode::Byte.
76: ##
77: sub guess_encoding_and_read {
78: my ($fn) = @_;
79: no warnings "utf8";
80: local $/ = undef;
81: open(my $fh, "<", $fn) or die "cannot read $fn: $!";
82: binmode $fh;
83: my $data = <$fh>; # we need to read the whole file to test if font is a block or inline element
84: close $fh;
85:
86: if (index($data, '<') == -1) {
87: die "This file has no markup !";
88: }
89:
90: # try to get a charset from a meta at the beginning of the file
91: my $beginning = substr($data, 0, 1024); # to avoid a full match; hopefully we won't cut the charset in half
92: if ($beginning =~ /<meta[^>]*charset\s?=\s?([^\n>"';]*)/i) {
93: my $meta_charset = $1;
94: if ($meta_charset ne '') {
95: if ($meta_charset =~ /iso-?8859-?1/i) {
96: # usually a lie
97: $meta_charset = 'cp1252';
98: }
99: # now try to decode using that encoding
100: my $decoder = guess_encoding($data, ($meta_charset));
101: if (ref($decoder)) {
102: my $decoded = $decoder->decode($data);
103: my @lines = split(/^/m, $decoded);
104: return \@lines;
105: } else {
106: if ($warnings) {
107: print "Warning: decoding did not work with the charset defined by the meta ($meta_charset)\n";
108: }
109: }
110: }
111: }
112:
113: my $decoded;
114: if (length($data) > 0) {
115: # NOTE: this list is too ambigous, Encode::Guess refuses to even try a guess
116: #Encode::Guess->set_suspects(qw/ascii UTF-8 iso-8859-1 MacRoman cp1252/);
117: # by default Encode::Guess uses ascii, utf8 and UTF-16/32 with BOM
118: my $decoder = Encode::Guess->guess($data);
119: if (ref($decoder)) {
120: $decoded = $decoder->decode($data);
121: # NOTE: this seems to accept binary files sometimes (conversion will fail later because it is not really UTF-8)
122: } else {
123: if ($warnings) {
124: print "Warning: encoding is not UTF-8 for $fn";
125: }
126:
127: # let's try iso-2022-jp first
128: $decoder = Encode::Guess->guess($data, 'iso-2022-jp');
129: if (ref($decoder)) {
130: $decoded = $decoder->decode($data);
131: if ($warnings) {
132: print "; using iso-2022-jp\n";
133: }
134: } else {
135: # NOTE: cp1252 is identical to iso-8859-1 but with additionnal characters in range 128-159
136: # instead of control codes. We can assume that these control codes are not used, so there
137: # is no need to test for iso-8859-1.
138: # The main problem here is to distinguish between cp1252 and MacRoman.
139: # see http://www.alanwood.net/demos/charsetdiffs.html#f
140: my $decoded_windows = decode('cp1252', $data);
141: my $decoded_mac = decode('MacRoman', $data);
142: # try to use frequent non-ASCII characters to distinguish the encodings (languages: mostly German, Spanish, Portuguese)
143: # í has been removed because it conflicts with ’ and ’ is more frequent
144: # ± has been removed because it is, suprisingly, the same code in both encodings !
145: my $score_windows = $decoded_windows =~ tr/ßáàäâãçéèêëñóöôõúüÄÉÑÖÜ¿¡‘’“” °½–—…§//;
146: my $score_mac = $decoded_mac =~ tr/ßáàäâãçéèêëñóöôõúüÄÉÑÖÜ¿¡‘’“” °½–—…§//;
147: # check newlines too (\r on MacOS < X, \r\n on Windows)
148: my $ind_cr = index($data, "\r");
149: if ($ind_cr != -1) {
150: if (substr($data, $ind_cr + 1, 1) eq "\n") {
151: $score_windows++;
152: } else {
153: $score_mac++;
154: }
155: }
156: if ($score_windows >= $score_mac) {
157: $decoded = $decoded_windows;
158: if ($warnings) {
159: print "; guess=cp1252 ($score_windows cp1252 >= $score_mac MacRoman)\n";
160: }
161: } else {
162: if ($warnings) {
163: print "; guess=MacRoman ($score_mac MacRoman > $score_windows cp1252)\n";
164: }
165: $decoded = $decoded_mac;
166: }
167: }
168: }
169: } else {
170: $decoded = '';
171: }
172: my @lines = split(/^/m, $decoded);
173: return \@lines;
174: }
175:
176:
177: ##
178: # Removes some control characters
179: # @param {Array<string>} lines
180: ##
181: sub remove_control_characters {
182: my ($lines) = @_;
183: foreach my $line (@{$lines}) {
184: $line =~ s/[\x00-\x07\x0B\x0C\x0E-\x1F]//g;
185: $line =~ s/&#[0-7];//g;
186: $line =~ s/[4-9];//g;
187: $line =~ s/[0-9];//g;
188: }
189: }
190:
191: ##
192: # Replaces < and > characters by < and > in cdata elements (listed in @cdata_elements).
193: # EXCEPT for answer when it's inside numericalresponse or formularesponse.
194: # @param {Array<string>} lines
195: ##
196: sub fix_cdata_elements {
197: my ($lines) = @_;
198: my $i = 0;
199: my $j = 0;
200: my $tag = '';
201: my $type;
202: my $in_numericalresponse = 0;
203: my $in_formularesponse = 0;
204: my $in_script = 0;
205: ($tag, $type, $i, $j) = next_tag($lines, $i, $j);
206: while ($tag ne '') {
207: if ($tag eq 'numericalresponse') {
208: if ($type eq 'start') {
209: $in_numericalresponse = 1;
210: } else {
211: $in_numericalresponse = 0;
212: }
213: } elsif ($tag eq 'formularesponse') {
214: if ($type eq 'start') {
215: $in_formularesponse = 1;
216: } else {
217: $in_formularesponse = 0;
218: }
219: } elsif ($tag eq 'script') {
220: if ($type eq 'start') {
221: $in_script = 1;
222: } else {
223: $in_script = 0;
224: }
225: }
226: if ($type eq 'start' && in_array_ignore_case(\@cdata_elements, $tag) && !$in_script &&
227: ($tag ne 'answer' || (!$in_numericalresponse && !$in_formularesponse))) {
228: my $cde = $tag;
229: my $line = $lines->[$i];
230: $j = index($line, '>', $j+1) + 1;
231: my $stop = 0;
232: while (!$stop && $i < scalar(@{$lines})) {
233: my $indinf = index($line, '<', $j);
234: if ($indinf != -1 && index($line, '<![CDATA[', $indinf) == $indinf) {
235: $i++;
236: $line = $lines->[$i];
237: $j = 0;
238: last;
239: }
240: my $indsup = index($line, '>', $j);
241: if ($indinf != -1 && $indsup != -1 && $indinf < $indsup) {
242: my $test = substr($line, $indinf + 1, $indsup - ($indinf + 1));
243: $test =~ s/^\s+|\s+$//g ;
244: if ($test eq '/'.$cde) {
245: $stop = 1;
246: $j = $indsup;
247: # this is commented because of markup like <display>&web(' ','','<p>')</display>
248: #} elsif ($test =~ /^[a-zA-Z\/]$/) {
249: # $j = $indsup + 1;
250: } else {
251: $line = substr($line, 0, $indinf).'<'.substr($line, $indinf+1);
252: $lines->[$i] = $line;
253: }
254: } elsif ($indinf != -1 && $indsup == -1) {
255: $line = substr($line, 0, $indinf).'<'.substr($line, $indinf+1);
256: $lines->[$i] = $line;
257: } elsif ($indsup != -1 && ($indinf == -1 || $indsup < $indinf)) {
258: $line = substr($line, 0, $indsup).'>'.substr($line, $indsup+1);
259: $lines->[$i] = $line;
260: } else {
261: $i++;
262: $line = $lines->[$i];
263: $j = 0;
264: }
265: }
266: }
267: $j++;
268: ($tag, $type, $i, $j) = next_tag($lines, $i, $j);
269: }
270: }
271:
272:
273: ##
274: # Replaces HTML entities (they are not XML unless a DTD is used, which is no longer recommanded for XHTML).
275: # @param {Array<string>} lines
276: ##
277: sub fix_html_entities {
278: my ($lines) = @_;
279: foreach my $line (@{$lines}) {
280: # html_to_xml is converting named entities before 255 (see HTML parser dtext)
281: # Assuming Windows encoding (Unicode entities are not before 160 and are the same between 160 and 255):
282: $line =~ s/€|€/€/g;
283: $line =~ s/‚|‚/‚/g;
284: $line =~ s/„|„/„/g;
285: $line =~ s/…|…/…/g;
286: $line =~ s/†|†/†/g;
287: $line =~ s/‡|‡/‡/g;
288: $line =~ s/ˆ|ˆ/ˆ/g;
289: $line =~ s/‰|‰/‰/g;
290: $line =~ s/‹|‹/‹/g;
291: $line =~ s/‘|‘/‘/g;
292: $line =~ s/’|’/’/g;
293: $line =~ s/“|“/“/g;
294: $line =~ s/”|”/”/g;
295: $line =~ s/•|•/•/g;
296: $line =~ s/–|–/–/g;
297: $line =~ s/—|—/—/g;
298: $line =~ s/˜|˜/˜/g;
299: $line =~ s/™|™/™/g;
300: $line =~ s/›|›/›/g;
301: $line =~ s/œ|œ/œ/g;
302: }
303: }
304:
305:
306: # Tries to fix things like <font color="#990000" face="Verdana,>
307: # without breaking <a b="c>d">
308: # This is only fixing tags when there is a single tag in a line (it is impossible to fix in the general case).
309: # Also transforms <a b="c> <d e=" into <a b="c"><d e=" ,
310: # and (no markup before)<a b="c> (no quote after) into <a b="c"> .
311: sub fix_missing_quotes {
312: my ($lines) = @_;
313: foreach my $line (@{$lines}) {
314: my $n_inf = $line =~ tr/<//;
315: my $n_sup = $line =~ tr/>//;
316: if ($n_inf == 1 && $n_sup == 1) {
317: my $ind_inf = index($line, '<');
318: my $ind_sup = index($line, '>');
319: if ($ind_inf != -1 && $ind_sup != -1 && $ind_inf < $ind_sup) {
320: my $n_quotes = substr($line, $ind_inf, $ind_sup) =~ tr/"//;
321: if ($n_quotes % 2 != 0) {
322: # add a quote before > when there is an odd number of quotes inside <>
323: $line =~ s/>/">/;
324: }
325: }
326: }
327: $line =~ s/(<[a-zA-Z]+ [a-zA-Z]+="[^"<>\s]+)(>\s*<[a-zA-Z]+ [a-zA-Z]+=")/$1"$2/;
328: $line =~ s/^([^"<>]*<[a-zA-Z]+ [a-zA-Z]+="[^"<>\s]+)(>[^"]*)$/$1"$2/;
329: }
330: }
331:
332:
333: # Replaces <li/> by <li> (the end tag will be added in html_to_xml
334: sub fix_empty_li {
335: my ($lines) = @_;
336: foreach my $line (@{$lines}) {
337: $line =~ s/<li\s?\/>/<li>/;
338: }
339: }
340:
341:
342: # remove doctypes, without assuming they are at the beginning
343: sub remove_doctype {
344: my ($lines) = @_;
345: foreach my $line (@{$lines}) {
346: $line =~ s/<!DOCTYPE[^>]*>//;
347: }
348: }
349:
350:
351: # Adds a problem, library or html root element, enclosing things outside of the problem element.
352: # (any extra root element will be removed in post_xml, but this ensures one is added as root if missing).
353: sub add_root {
354: my ($lines, $filepath) = @_;
355: my $root_name;
356: if ($filepath =~ /\.library$/i) {
357: $root_name = 'library';
358: } elsif ($filepath =~ /\.html?$/i) {
359: $root_name = 'html';
1.2 ! damieng 360: } elsif ($filepath =~ /\.task?$/i) {
! 361: $root_name = 'Task';
1.1 damieng 362: } else {
363: $root_name = 'problem';
364: }
365: if ($root_name eq 'library') {
366: foreach my $line (@{$lines}) {
367: if ($line =~ /^\s*<[a-z]/) {
368: last;
369: }
370: if ($line !~ /^\s*$/) {
371: die "this library does not start with a tag, it might be a scriptlib";
372: }
373: }
374: }
375: my $line1 = $lines->[0];
376: $line1 =~ s/<\?.*\?>//; # remove any PI, it would cause problems later anyway
377: $line1 = "<$root_name>".$line1;
378: $lines->[0] = $line1;
379: $lines->[scalar(@$lines)-1] = $lines->[scalar(@$lines)-1]."</$root_name>";
380: }
381:
382:
383: ##
384: # Returns information about the next tag, starting at line number and char number.
385: # Assumes the markup is well-formed and there is no CDATA,
386: # which is not always true (like inside script), so results might be wrong sometimes.
387: # It is however useful to avoid unnecessary changes in the document (using a parser to
388: # do read/write for the whole document would mess up non well-formed documents).
389: # @param {Array<string>} lines
390: # @param {int} line_number - line number to start at
391: # @param {int} char_number - char number to start at on the line
392: # @returns (tag, type, line_number, char_number)
393: ##
394: sub next_tag {
395: my ($lines, $i, $j ) = @_;
396: my $i2 = $i;
397: my $j2 = $j;
398: while ($i2 < scalar(@{$lines})) {
399: my $line = $lines->[$i2];
400: $j2 = index($line, '<', $j2);
401: #TODO: handle comments
402: while ($j2 != -1) {
403: my $ind_slash = index($line, '/', $j2);
404: my $ind_sup = index($line, '>', $j2);
405: my $ind_space = index($line, ' ', $j2);
406: my $type;
407: my $tag;
408: if ($ind_slash == $j2 + 1 && $ind_sup != -1) {
409: $type = 'end';
410: $tag = substr($line, $j2 + 2, $ind_sup - ($j2 + 2));
411: } elsif ($ind_slash != -1 && $ind_sup != -1 && $ind_slash == $ind_sup - 1) {
412: $type = 'empty';
413: if ($ind_space != -1 && $ind_space < $ind_sup) {
414: $tag = substr($line, $j2 + 1, $ind_space - ($j2 + 1));
415: } else {
416: $tag = substr($line, $j2 + 1, $ind_slash - ($j2 + 1));
417: }
418: } elsif ($ind_sup != -1) {
419: $type = 'start';
420: if ($ind_space != -1 && $ind_space < $ind_sup) {
421: $tag = substr($line, $j2 + 1, $ind_space - ($j2 + 1));
422: } else {
423: $tag = substr($line, $j2 + 1, $ind_sup - ($j2 + 1));
424: }
425: } else {
426: $tag = ''
427: }
428: if ($tag ne '') {
429: return ($tag, $type, $i2, $j2);
430: }
431: $j2 = index($line, '<', $j2 + 1);
432: }
433: $i2++;
434: $j2 = 0;
435: }
436: return ('', '', 0, 0);
437: }
438:
439: ##
440: # Tests if a string is in an array, ignoring case
441: ##
442: sub in_array_ignore_case {
443: my ($array, $value) = @_;
444: my $lcvalue = lc($value);
445: foreach my $v (@{$array}) {
446: if (lc($v) eq $lcvalue) {
447: return 1;
448: }
449: }
450: return 0;
451: }
452:
453: 1;
454: __END__
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>