1: #!/usr/bin/perl
2:
3: package pre_xml;
4:
5: use strict;
6: use utf8;
7:
8: use Encode;
9: use Encode::Byte;
10: use Encode::Guess;
11:
12: # list of elements inside which < and > might not be turned into entities
13: # unfortunately, answer can sometimes contain the elements vector and value...
14: my @cdata_elements = ('answer', 'm', 'display', 'parse'); # not script because the HTML parser will handle it
15:
16:
17: # Reads a LON-CAPA 2 file, guesses the encoding, fixes characters in cdata_elements, fixes HTML entities,
18: # and returns the converted text.
19: sub pre_xml {
20: my ($filepath) = @_;
21:
22: my $lines = guess_encoding_and_read($filepath);
23:
24: remove_control_characters($lines);
25:
26: fix_cdata_elements($lines);
27:
28: fix_html_entities($lines);
29:
30: fix_missing_quotes($lines);
31:
32: fix_empty_li($lines);
33:
34: remove_doctype($lines);
35:
36: add_root($lines, $filepath);
37:
38: return(\join('', @$lines));
39: }
40:
41:
42: ##
43: # Tries to guess the character encoding, and returns the lines as decoded text.
44: # Requires Encode::Byte.
45: ##
46: sub guess_encoding_and_read {
47: my ($fn) = @_;
48: no warnings "utf8";
49: local $/ = undef;
50: open(my $fh, "<", $fn) or die "cannot read $fn: $!";
51: binmode $fh;
52: my $data = <$fh>; # we need to read the whole file to test if font is a block or inline element
53: close $fh;
54:
55: if (index($data, '<') == -1) {
56: die "This file has no markup !";
57: }
58:
59: # try to get a charset from a meta at the beginning of the file
60: my $beginning = substr($data, 0, 1024); # to avoid a full match; hopefully we won't cut the charset in half
61: if ($beginning =~ /<meta[^>]*charset\s?=\s?([^\n>"';]*)/i) {
62: my $meta_charset = $1;
63: if ($meta_charset ne '') {
64: if ($meta_charset =~ /iso-?8859-?1/i) {
65: # usually a lie
66: $meta_charset = 'cp1252';
67: }
68: # now try to decode using that encoding
69: my $decoder = guess_encoding($data, ($meta_charset));
70: if (ref($decoder)) {
71: my $decoded = $decoder->decode($data);
72: my @lines = split(/^/m, $decoded);
73: return \@lines;
74: } else {
75: print "Warning: decoding did not work with the charset defined by the meta ($meta_charset)\n";
76: }
77: }
78: }
79:
80: my $decoded;
81: if (length($data) > 0) {
82: # NOTE: this list is too ambigous, Encode::Guess refuses to even try a guess
83: #Encode::Guess->set_suspects(qw/ascii UTF-8 iso-8859-1 MacRoman cp1252/);
84: # by default Encode::Guess uses ascii, utf8 and UTF-16/32 with BOM
85: my $decoder = Encode::Guess->guess($data);
86: if (ref($decoder)) {
87: $decoded = $decoder->decode($data);
88: # NOTE: this seems to accept binary files sometimes (conversion will fail later because it is not really UTF-8)
89: } else {
90: print "Warning: encoding is not UTF-8 for $fn";
91:
92: # let's try iso-2022-jp first
93: $decoder = Encode::Guess->guess($data, 'iso-2022-jp');
94: if (ref($decoder)) {
95: $decoded = $decoder->decode($data);
96: print "; using iso-2022-jp\n";
97: } else {
98: # NOTE: cp1252 is identical to iso-8859-1 but with additionnal characters in range 128-159
99: # instead of control codes. We can assume that these control codes are not used, so there
100: # is no need to test for iso-8859-1.
101: # The main problem here is to distinguish between cp1252 and MacRoman.
102: # see http://www.alanwood.net/demos/charsetdiffs.html#f
103: my $decoded_windows = decode('cp1252', $data);
104: my $decoded_mac = decode('MacRoman', $data);
105: # try to use frequent non-ASCII characters to distinguish the encodings (languages: mostly German, Spanish, Portuguese)
106: # í has been removed because it conflicts with ’ and ’ is more frequent
107: # ± has been removed because it is, suprisingly, the same code in both encodings !
108: my $score_windows = $decoded_windows =~ tr/ßáàäâãçéèêëñóöôõúüÄÉÑÖÜ¿¡‘’“” °½–—…§//;
109: my $score_mac = $decoded_mac =~ tr/ßáàäâãçéèêëñóöôõúüÄÉÑÖÜ¿¡‘’“” °½–—…§//;
110: # check newlines too (\r on MacOS < X, \r\n on Windows)
111: my $ind_cr = index($data, "\r");
112: if ($ind_cr != -1) {
113: if (substr($data, $ind_cr + 1, 1) eq "\n") {
114: $score_windows++;
115: } else {
116: $score_mac++;
117: }
118: }
119: if ($score_windows >= $score_mac) {
120: $decoded = $decoded_windows;
121: print "; guess=cp1252 ($score_windows cp1252 >= $score_mac MacRoman)\n";
122: } else {
123: print "; guess=MacRoman ($score_mac MacRoman > $score_windows cp1252)\n";
124: $decoded = $decoded_mac;
125: }
126: }
127: }
128: } else {
129: $decoded = '';
130: }
131: my @lines = split(/^/m, $decoded);
132: return \@lines;
133: }
134:
135:
136: ##
137: # Removes some control characters
138: # @param {Array<string>} lines
139: ##
140: sub remove_control_characters {
141: my ($lines) = @_;
142: foreach my $line (@{$lines}) {
143: $line =~ s/[\x00-\x07\x0B\x0C\x0E-\x1F]//g;
144: $line =~ s/&#[0-7];//g;
145: $line =~ s/[4-9];//g;
146: $line =~ s/[0-9];//g;
147: }
148: }
149:
150: ##
151: # Replaces < and > characters by < and > in cdata elements (listed in @cdata_elements).
152: # EXCEPT for answer when it's inside numericalresponse or formularesponse.
153: # @param {Array<string>} lines
154: ##
155: sub fix_cdata_elements {
156: my ($lines) = @_;
157: my $i = 0;
158: my $j = 0;
159: my $tag = '';
160: my $type;
161: my $in_numericalresponse = 0;
162: my $in_formularesponse = 0;
163: my $in_script = 0;
164: ($tag, $type, $i, $j) = next_tag($lines, $i, $j);
165: while ($tag ne '') {
166: if ($tag eq 'numericalresponse') {
167: if ($type eq 'start') {
168: $in_numericalresponse = 1;
169: } else {
170: $in_numericalresponse = 0;
171: }
172: } elsif ($tag eq 'formularesponse') {
173: if ($type eq 'start') {
174: $in_formularesponse = 1;
175: } else {
176: $in_formularesponse = 0;
177: }
178: } elsif ($tag eq 'script') {
179: if ($type eq 'start') {
180: $in_script = 1;
181: } else {
182: $in_script = 0;
183: }
184: }
185: if ($type eq 'start' && in_array_ignore_case(\@cdata_elements, $tag) && !$in_script &&
186: ($tag ne 'answer' || (!$in_numericalresponse && !$in_formularesponse))) {
187: my $cde = $tag;
188: my $line = $lines->[$i];
189: $j = index($line, '>', $j+1) + 1;
190: my $stop = 0;
191: while (!$stop && $i < scalar(@{$lines})) {
192: my $indinf = index($line, '<', $j);
193: if ($indinf != -1 && index($line, '<![CDATA[', $indinf) == $indinf) {
194: $i++;
195: $line = $lines->[$i];
196: $j = 0;
197: last;
198: }
199: my $indsup = index($line, '>', $j);
200: if ($indinf != -1 && $indsup != -1 && $indinf < $indsup) {
201: my $test = substr($line, $indinf + 1, $indsup - ($indinf + 1));
202: $test =~ s/^\s+|\s+$//g ;
203: if ($test eq '/'.$cde) {
204: $stop = 1;
205: $j = $indsup;
206: # this is commented because of markup like <display>&web(' ','','<p>')</display>
207: #} elsif ($test =~ /^[a-zA-Z\/]$/) {
208: # $j = $indsup + 1;
209: } else {
210: $line = substr($line, 0, $indinf).'<'.substr($line, $indinf+1);
211: $lines->[$i] = $line;
212: }
213: } elsif ($indinf != -1 && $indsup == -1) {
214: $line = substr($line, 0, $indinf).'<'.substr($line, $indinf+1);
215: $lines->[$i] = $line;
216: } elsif ($indsup != -1 && ($indinf == -1 || $indsup < $indinf)) {
217: $line = substr($line, 0, $indsup).'>'.substr($line, $indsup+1);
218: $lines->[$i] = $line;
219: } else {
220: $i++;
221: $line = $lines->[$i];
222: $j = 0;
223: }
224: }
225: }
226: $j++;
227: ($tag, $type, $i, $j) = next_tag($lines, $i, $j);
228: }
229: }
230:
231:
232: ##
233: # Replaces HTML entities (they are not XML unless a DTD is used, which is no longer recommanded for XHTML).
234: # @param {Array<string>} lines
235: ##
236: sub fix_html_entities {
237: my ($lines) = @_;
238: foreach my $line (@{$lines}) {
239: # html_to_xml is converting named entities before 255 (see HTML parser dtext)
240: # Assuming Windows encoding (Unicode entities are not before 160 and are the same between 160 and 255):
241: $line =~ s/€|€/€/g;
242: $line =~ s/‚|‚/‚/g;
243: $line =~ s/„|„/„/g;
244: $line =~ s/…|…/…/g;
245: $line =~ s/†|†/†/g;
246: $line =~ s/‡|‡/‡/g;
247: $line =~ s/ˆ|ˆ/ˆ/g;
248: $line =~ s/‰|‰/‰/g;
249: $line =~ s/‹|‹/‹/g;
250: $line =~ s/‘|‘/‘/g;
251: $line =~ s/’|’/’/g;
252: $line =~ s/“|“/“/g;
253: $line =~ s/”|”/”/g;
254: $line =~ s/•|•/•/g;
255: $line =~ s/–|–/–/g;
256: $line =~ s/—|—/—/g;
257: $line =~ s/˜|˜/˜/g;
258: $line =~ s/™|™/™/g;
259: $line =~ s/›|›/›/g;
260: $line =~ s/œ|œ/œ/g;
261: }
262: }
263:
264:
265: # Tries to fix things like <font color="#990000" face="Verdana,>
266: # without breaking <a b="c>d">
267: # This is only fixing tags when there is a single tag in a line (it is impossible to fix in the general case).
268: # Also transforms <a b="c> <d e=" into <a b="c"><d e=" ,
269: # and (no markup before)<a b="c> (no quote after) into <a b="c"> .
270: sub fix_missing_quotes {
271: my ($lines) = @_;
272: foreach my $line (@{$lines}) {
273: my $n_inf = $line =~ tr/<//;
274: my $n_sup = $line =~ tr/>//;
275: if ($n_inf == 1 && $n_sup == 1) {
276: my $ind_inf = index($line, '<');
277: my $ind_sup = index($line, '>');
278: if ($ind_inf != -1 && $ind_sup != -1 && $ind_inf < $ind_sup) {
279: my $n_quotes = substr($line, $ind_inf, $ind_sup) =~ tr/"//;
280: if ($n_quotes % 2 != 0) {
281: # add a quote before > when there is an odd number of quotes inside <>
282: $line =~ s/>/">/;
283: }
284: }
285: }
286: $line =~ s/(<[a-zA-Z]+ [a-zA-Z]+="[^"<>\s]+)(>\s*<[a-zA-Z]+ [a-zA-Z]+=")/$1"$2/;
287: $line =~ s/^([^"<>]*<[a-zA-Z]+ [a-zA-Z]+="[^"<>\s]+)(>[^"]*)$/$1"$2/;
288: }
289: }
290:
291:
292: # Replaces <li/> by <li> (the end tag will be added in html_to_xml
293: sub fix_empty_li {
294: my ($lines) = @_;
295: foreach my $line (@{$lines}) {
296: $line =~ s/<li\s?\/>/<li>/;
297: }
298: }
299:
300:
301: # remove doctypes, without assuming they are at the beginning
302: sub remove_doctype {
303: my ($lines) = @_;
304: foreach my $line (@{$lines}) {
305: $line =~ s/<!DOCTYPE[^>]*>//;
306: }
307: }
308:
309:
310: # Adds a problem, library or html root element, enclosing things outside of the problem element.
311: # (any extra root element will be removed in post_xml, but this ensures one is added as root if missing).
312: sub add_root {
313: my ($lines, $filepath) = @_;
314: my $root_name;
315: if ($filepath =~ /\.library$/i) {
316: $root_name = 'library';
317: } elsif ($filepath =~ /\.html?$/i) {
318: $root_name = 'html';
319: } else {
320: $root_name = 'problem';
321: }
322: if ($root_name eq 'library') {
323: foreach my $line (@{$lines}) {
324: if ($line =~ /^\s*<[a-z]/) {
325: last;
326: }
327: if ($line !~ /^\s*$/) {
328: die "this library does not start with a tag, it might be a scriptlib";
329: }
330: }
331: }
332: my $line1 = $lines->[0];
333: $line1 =~ s/<\?.*\?>//; # remove any PI, it would cause problems later anyway
334: $line1 = "<$root_name>".$line1;
335: $lines->[0] = $line1;
336: $lines->[scalar(@$lines)-1] = $lines->[scalar(@$lines)-1]."</$root_name>";
337: }
338:
339:
340: ##
341: # Returns information about the next tag, starting at line number and char number.
342: # Assumes the markup is well-formed and there is no CDATA,
343: # which is not always true (like inside script), so results might be wrong sometimes.
344: # It is however useful to avoid unnecessary changes in the document (using a parser to
345: # do read/write for the whole document would mess up non well-formed documents).
346: # @param {Array<string>} lines
347: # @param {int} line_number - line number to start at
348: # @param {int} char_number - char number to start at on the line
349: # @returns (tag, type, line_number, char_number)
350: ##
351: sub next_tag {
352: my ($lines, $i, $j ) = @_;
353: my $i2 = $i;
354: my $j2 = $j;
355: while ($i2 < scalar(@{$lines})) {
356: my $line = $lines->[$i2];
357: $j2 = index($line, '<', $j2);
358: #TODO: handle comments
359: while ($j2 != -1) {
360: my $ind_slash = index($line, '/', $j2);
361: my $ind_sup = index($line, '>', $j2);
362: my $ind_space = index($line, ' ', $j2);
363: my $type;
364: my $tag;
365: if ($ind_slash == $j2 + 1 && $ind_sup != -1) {
366: $type = 'end';
367: $tag = substr($line, $j2 + 2, $ind_sup - ($j2 + 2));
368: } elsif ($ind_slash != -1 && $ind_sup != -1 && $ind_slash == $ind_sup - 1) {
369: $type = 'empty';
370: if ($ind_space != -1 && $ind_space < $ind_sup) {
371: $tag = substr($line, $j2 + 1, $ind_space - ($j2 + 1));
372: } else {
373: $tag = substr($line, $j2 + 1, $ind_slash - ($j2 + 1));
374: }
375: } elsif ($ind_sup != -1) {
376: $type = 'start';
377: if ($ind_space != -1 && $ind_space < $ind_sup) {
378: $tag = substr($line, $j2 + 1, $ind_space - ($j2 + 1));
379: } else {
380: $tag = substr($line, $j2 + 1, $ind_sup - ($j2 + 1));
381: }
382: } else {
383: $tag = ''
384: }
385: if ($tag ne '') {
386: return ($tag, $type, $i2, $j2);
387: }
388: $j2 = index($line, '<', $j2 + 1);
389: }
390: $i2++;
391: $j2 = 0;
392: }
393: return ('', '', 0, 0);
394: }
395:
396: ##
397: # Tests if a string is in an array, ignoring case
398: ##
399: sub in_array_ignore_case {
400: my ($array, $value) = @_;
401: my $lcvalue = lc($value);
402: foreach my $v (@{$array}) {
403: if (lc($v) eq $lcvalue) {
404: return 1;
405: }
406: }
407: return 0;
408: }
409:
410: 1;
411: __END__
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>