Annotation of loncom/homework/cleanxml/post_xml.pm, revision 1.5
1.1 damieng 1: # The LearningOnline Network
2: # Third step to clean a file.
3: #
1.5 ! damieng 4: # $Id: post_xml.pm,v 1.4 2015/12/18 22:07:42 damieng Exp $
1.1 damieng 5: #
6: # Copyright Michigan State University Board of Trustees
7: #
8: # This file is part of the LearningOnline Network with CAPA (LON-CAPA).
9: #
10: # LON-CAPA is free software; you can redistribute it and/or modify
11: # it under the terms of the GNU General Public License as published by
12: # the Free Software Foundation; either version 2 of the License, or
13: # (at your option) any later version.
14: #
15: # LON-CAPA is distributed in the hope that it will be useful,
16: # but WITHOUT ANY WARRANTY; without even the implied warranty of
17: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18: # GNU General Public License for more details.
19: #
20: # You should have received a copy of the GNU General Public License
21: # along with LON-CAPA; if not, write to the Free Software
22: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23: #
24: # /home/httpd/html/adm/gpl.txt
25: #
26: # http://www.lon-capa.org/
27: #
28: ###
29:
30: #!/usr/bin/perl
31:
32: package Apache::post_xml;
33:
34: use strict;
35: use utf8;
36: use warnings;
37:
38: use File::Basename;
39: use File::Temp qw/ tempfile /;
40: use Cwd 'abs_path';
41: use XML::LibXML;
42: use HTML::TokeParser; # used to parse sty files
43: use Tie::IxHash; # for ordered hashes
44:
45: no warnings 'recursion'; # yes, fix_paragraph is using heavy recursion, I know
46:
47: # these are constants
1.4 damieng 48: my @block_elements = ('parameter','location','answer','foil','image','polygon','rectangle','text','conceptgroup','itemgroup','item','label','data','function','array','unit','answergroup','functionplotresponse','functionplotruleset','functionplotelements','functionplotcustomrule','essayresponse','hintpart','formulahint','numericalhint','reactionhint','organichint','optionhint','radiobuttonhint','stringhint','customhint','mathhint','formulahintcondition','numericalhintcondition','reactionhintcondition','organichintcondition','optionhintcondition','radiobuttonhintcondition','stringhintcondition','customhintcondition','mathhintcondition','imageresponse','foilgroup','datasubmission','textfield','hiddensubmission','radiobuttonresponse','rankresponse','matchresponse','import','style','script','window','block','library','notsolved','part','postanswerdate','preduedate','problem','problemtype','randomlabel','bgimg','labelgroup','randomlist','solved','while','tex','print','web','gnuplot','curve','Task','IntroParagraph','ClosingParagraph','Question','QuestionText','Setup','Instance','InstanceText','Criteria','CriteriaText','GraderNote','languageblock','instructorcomment','dataresponse','togglebox','standalone','comment','drawimage','allow','displayduedate','displaytitle','responseparam','organicstructure','scriptlib','parserlib','drawoptionlist','spline','backgroundplot','plotobject','plotvector','drawvectorsum','functionplotrule','functionplotvectorrule','functionplotvectorsumrule','axis','key','xtics','ytics','title','xlabel','ylabel','hiddenline','dtm');
49: my @inline_like_block = ('stringresponse','optionresponse','numericalresponse','formularesponse','mathresponse','organicresponse','reactionresponse','customresponse','externalresponse', 'hint', 'hintgroup','translated','lang'); # inline elements treated like blocks for pretty print and some other things
1.1 damieng 50: my @responses = ('stringresponse','optionresponse','numericalresponse','formularesponse','mathresponse','organicresponse','reactionresponse','customresponse','externalresponse','essayresponse','radiobuttonresponse','matchresponse','rankresponse','imageresponse','functionplotresponse');
51: my @block_html = ('html','head','body','section','h1','h2','h3','h4','h5','h6','div','p','ul','ol','li','table','tbody','tr','td','th','dl','dt','dd','pre','noscript','hr','address','blockquote','object','applet','embed','map','form','fieldset','iframe','center','frameset');
52: my @no_newline_inside = ('import','parserlib','scriptlib','data','function','label','xlabel','ylabel','tic','text','rectangle','image','title','h1','h2','h3','h4','h5','h6','li','td','p');
53: my @preserve_elements = ('script','answer','pre');
54: my @accepting_style = ('section','h1','h2','h3','h4','h5','h6','div','p','li','td','th','dt','dd','pre','blockquote');
55: my @latex_math = ('\alpha', '\theta', '\omicron', '\tau', '\beta', '\vartheta', '\pi', '\upsilon', '\gamma', '\gamma', '\varpi', '\phi', '\delta', '\kappa', '\rho', '\varphi', '\epsilon', '\lambda', '\varrho', '\chi', '\varepsilon', '\mu', '\sigma', '\psi', '\zeta', '\nu', '\varsigma', '\omega', '\eta', '\xi',
56: '\Gamma', '\Lambda', '\Sigma', '\Psi', '\Delta', '\Xi', '\Upsilon', '\Omega', '\Theta', '\Pi', '\Phi',
57: '\pm', '\cap', '\diamond', '\oplus', '\mp', '\cup', '\bigtriangleup', '\ominus', '\times', '\uplus', '\bigtriangledown', '\otimes', '\div', '\sqcap', '\triangleleft', '\oslash', '\ast', '\sqcup', '\triangleright', '\odot', '\star', '\vee', '\lhd$', '\bigcirc', '\circ', '\wedge', '\rhd$', '\dagger', '\bullet', '\setminus', '\unlhd$', '\ddagger', '\cdot', '\wr', '\unrhd$', '\amalg', '+', '-',
58: '\leq', '\geq', '\equiv', '\models', '\prec', '\succ', '\sim', '\perp', '\preceq', '\succeq', '\simeq', '\mid', '\ll', '\gg', '\asymp', '\parallel', '\subset', '\supset', '\approx', '\bowtie', '\subseteq', '\supseteq', '\cong', '\Join$', '\sqsubset$', '\sqsupset$', '\neq', '\smile', '\sqsubseteq', '\sqsupseteq', '\doteq', '\frown', '\in', '\ni', '\propto', '\vdash', '\dashv',
59: '\colon', '\ldotp', '\cdotp',
60: '\leftarrow', '\longleftarrow', '\uparrow', '\Leftarrow', '\Longleftarrow', '\Uparrow', '\rightarrow', '\longrightarrow', '\downarrow', '\Rightarrow', '\Longrightarrow', '\Downarrow', '\leftrightarrow', '\longleftrightarrow', '\updownarrow', '\Leftrightarrow', '\Longleftrightarrow', '\Updownarrow', '\mapsto', '\longmapsto', '\nearrow', '\hookleftarrow', '\hookrightarrow', '\searrow', '\leftharpoonup', '\rightharpoonup', '\swarrow', '\leftharpoondown', '\rightharpoondown', '\nwarrow', '\rightleftharpoons', '\leadsto$',
61: '\ldots', '\cdots', '\vdots', '\ddots', '\aleph', '\prime', '\forall', '\infty', '\hbar', '\emptyset', '\exists', '\Box$', '\imath', '\nabla', '\neg', '\Diamond$', '\jmath', '\surd', '\flat', '\triangle', '\ell', '\top', '\natural', '\clubsuit', '\wp', '\bot', '\sharp', '\diamondsuit', '\Re', '\|', '\backslash', '\heartsuit', '\Im', '\angle', '\partial', '\spadesuit', '\mho$',
62: '\sum', '\bigcap', '\bigodot', '\prod', '\bigcup', '\bigotimes', '\coprod', '\bigsqcup', '\bigoplus', '\int', '\bigvee', '\biguplus', '\oint', '\bigwedge',
63: '\arccos', '\cos', '\csc', '\exp', '\ker', '\limsup', '\min', '\sinh', '\arcsin', '\cosh', '\deg', '\gcd', '\lg', '\ln', '\Pr', '\sup', '\arctan', '\cot', '\det', '\hom', '\lim', '\log', '\sec', '\tan', '\arg', '\coth', '\dim', '\inf', '\liminf', '\max', '\sin', '\tanh',
64: '\uparrow', '\Uparrow', '\downarrow', '\Downarrow', '\updownarrow', '\Updownarrow', '\lfloor', '\rfloor', '\lceil', '\rceil', '\langle', '\rangle', '\backslash',
65: '\rmoustache', '\lmoustache', '\rgroup', '\lgroup', '\arrowvert', '\Arrowvert', '\bracevert',
66: '\hat{', '\acute{', '\bar{', '\dot{', '\breve{', '\check{', '\grave{', '\vec{', '\ddot{', '\tilde{',
67: '\widetilde{', '\widehat{', '\overleftarrow{', '\overrightarrow{', '\overline{', '\underline{', '\overbrace{', '\underbrace{', '\sqrt{', '\sqrt[', '\frac{'
68: );
69: # list of elements that can contain style elements:
70: my @containing_styles = ('library','problem',@responses,'foil','item','text','hintgroup','hintpart','label','part','preduedate','postanswerdate','solved','notsolved','block','while','web','standalone','problemtype','languageblock','translated','lang','window','windowlink','togglebox','instructorcomment','body','section','div','p','li','dd','td','th','blockquote','object','applet','video','audio','canvas','fieldset','button',
71: 'span','strong','em','b','i','sup','sub','code','kbd','samp','tt','ins','del','var','small','big','u','font');
72: my @html_styles = ('span', 'strong', 'em' , 'b', 'i', 'sup', 'sub', 'tt', 'var', 'small', 'big', 'u');
73:
74: my $warnings; # 1 = print warnings
75:
76:
77: # Parses the XML document and fixes many things to turn it into a document matching the schema.
1.2 damieng 78: # @param {reference} textref - reference to the text of the document
79: # @param {string} file_path - path of the file being processed (we only extract the directory path from it)
80: # @param {string} res_dir - path of res directory parent (without the / at the end)
81: # @param {boolean} warn - 1 to print warnings, 0 otherwise
82: # @returns the text of the document as a byte string.
1.1 damieng 83: sub post_xml {
1.2 damieng 84: my ($textref, $file_path, $res_dir, $warn) = @_;
1.1 damieng 85: $warnings = $warn;
86:
87: my $dom_doc = XML::LibXML->load_xml(string => $textref);
88:
89: my $root = fix_structure($dom_doc);
90:
91: remove_elements($root, ['startouttext','startoutext','startottext','startouttex','startouttect','atartouttext','starouttext','starttextout','starttext','starttextarea','endouttext','endoutext','endoutttext','endouttxt','endouutext','ednouttext','endouttex','endoouttext','endouttest','endtextout','endtextarea','startpartmarker','endpartmarker','basefont','x-claris-tagview','x-claris-window','x-sas-window']);
92:
93: remove_empty_attributes($root);
94:
95: fix_attribute_case($root);
96:
1.3 damieng 97: replace_m($root);
1.1 damieng 98:
99: my @all_block = (@block_elements, @block_html);
1.2 damieng 100: add_sty_blocks($file_path, $res_dir, $root, \@all_block); # must come before the subs using @all_block
1.1 damieng 101:
102: fix_block_styles($root, \@all_block);
103: $root->normalize();
104:
105: fix_fonts($root, \@all_block);
106:
107: replace_u($root);
108:
109: remove_bad_cdata_sections($root);
110:
111: add_cdata_sections($root);
112:
113: fix_style_element($root);
114:
115: fix_tables($root);
116:
117: fix_lists($root);
118:
119: fix_wrong_name_for_img($root); # should be before replace_deprecated_attributes_by_css
120:
121: replace_deprecated_attributes_by_css($root);
122:
123: replace_center($root, \@all_block); # must come after replace_deprecated_attributes_by_css
124:
125: replace_nobr($root);
126:
127: remove_useless_notsolved($root);
128:
129: fix_paragraphs_inside($root, \@all_block);
130:
131: remove_empty_style($root);
132:
133: fix_empty_lc_elements($root);
134:
135: lowercase_attribute_values($root);
136:
137: replace_numericalresponse_unit_attribute($root);
138:
139: replace_functions_by_elements($root);
140:
141: pretty($root, \@all_block);
142:
143: replace_tm_dtm($root);
144:
145: return $dom_doc->toString(); # byte string !
146: }
147:
148: sub fix_structure {
149: my ($doc) = @_;
150: # the root element has already been added in pre_xml
151: my $root = $doc->documentElement;
152: # inside the root, replace html, problem and library elements by their content
1.5 ! damieng 153: my @toreplace = ('html','problem','library','Task');
1.1 damieng 154: foreach my $name (@toreplace) {
155: my @elements = $root->getElementsByTagName($name);
156: foreach my $element (@elements) {
157: replace_by_children($element);
158: }
159: }
160: # insert all link and style elements inside a new head element
161: my $current_node = undef;
162: my @heads = $doc->getElementsByTagName('head');
163: my @links = $doc->getElementsByTagName('link');
164: my @styles = $doc->getElementsByTagName('style');
165: my @titles = $doc->getElementsByTagName('title');
166: if (scalar(@titles) > 0) {
167: # NOTE: there is a title element in gnuplot, not to be confused with the one inside HTML head
168: for (my $i=0; $i<scalar(@titles); $i++) {
169: my $title = $titles[$i];
170: my $found_gnuplot = 0;
171: my $ancestor = $title->parentNode;
172: while (defined $ancestor) {
173: if ($ancestor->nodeName eq 'gnuplot') {
174: $found_gnuplot = 1;
175: last;
176: }
177: $ancestor = $ancestor->parentNode;
178: }
179: if ($found_gnuplot) {
180: splice(@titles, $i, 1);
181: $i--;
182: }
183: }
184: }
185: if (scalar(@heads) > 0 || scalar(@titles) > 0 || scalar(@links) > 0 || scalar(@styles) > 0) {
186: my $htmlhead = $doc->createElement('head');
187: foreach my $head (@heads) {
188: my $next;
189: for (my $child=$head->firstChild; defined $child; $child=$next) {
190: $next = $child->nextSibling;
191: $head->removeChild($child);
192: if ($child->nodeType != XML_ELEMENT_NODE ||
193: string_in_array(['title','script','style','meta','link','import','base'], $child->nodeName)) {
194: $htmlhead->appendChild($child);
195: } else {
196: # this should not be in head
197: insert_after_or_first($root, $child, $current_node);
198: }
199: }
200: $head->parentNode->removeChild($head);
201: }
202: foreach my $child (@titles, @links, @styles) {
203: $child->parentNode->removeChild($child);
204: $htmlhead->appendChild($child);
205: }
206: insert_after_or_first($root, $htmlhead, $current_node);
207: $current_node = $htmlhead;
208: }
209: # body
210: my $htmlbody = undef;
211: my @bodies = $doc->getElementsByTagName('body');
212: if (scalar(@bodies) > 0) {
213: # TODO: fix content and position of body elements
214: if ($root->nodeName eq 'problem') {
215: foreach my $body (@bodies) {
216: replace_by_children($body);
217: }
218: }
219: }
220: # add all the meta elements afterwards when they are LON-CAPA meta. Remove all HTML meta.
221: my @meta_names = ('abstract','author','authorspace','avetries','avetries_list','clear','comefrom','comefrom_list','copyright','correct','count','course','course_list','courserestricted','creationdate','dependencies','depth','difficulty','difficulty_list','disc','disc_list','domain','end','field','firstname','generation','goto','goto_list','groupname','helpful','highestgradelevel','hostname','id','keynum','keywords','language','lastname','lastrevisiondate','lowestgradelevel','middlename','mime','modifyinguser','notes','owner','permanentemail','scope','sequsage','sequsage_list','standards','start','stdno','stdno_list','subject','technical','title','url','username','value','version');
222: my @metas = $doc->getElementsByTagName('meta');
223: foreach my $meta (@metas) {
224: $meta->parentNode->removeChild($meta);
225: my $name = $meta->getAttribute('name');
226: my $content = $meta->getAttribute('content');
227: if (defined $name && defined $content && string_in_array(\@meta_names, lc($name))) {
228: my $lcmeta = $doc->createElement('meta');
229: $lcmeta->setAttribute('name', lc($name));
230: $lcmeta->setAttribute('content', $content);
231: insert_after_or_first($root, $lcmeta, $current_node);
232: $current_node = $lcmeta;
233: }
234: }
235: return($root);
236: }
237:
238: # insert the new child under parent after the reference child, or as the first child if the reference child is not defined
239: sub insert_after_or_first {
240: my ($parent, $newchild, $refchild) = @_;
241: if (defined $refchild) {
242: $parent->insertAfter($newchild, $refchild);
243: } elsif (defined $parent->firstChild) {
244: $parent->insertBefore($newchild, $parent->firstChild);
245: } else {
246: $parent->appendChild($newchild);
247: }
248: }
249:
250: # removes all elements with given names inside the node, but keep the content
251: sub remove_elements {
252: my ($node, $to_remove) = @_;
253: my $nextChild;
254: for (my $child=$node->firstChild; defined $child; $child=$nextChild) {
255: $nextChild = $child->nextSibling;
256: my $type = $node->nodeType;
257: if ($type == XML_ELEMENT_NODE) {
258: if (string_in_array($to_remove, $child->nodeName)) {
259: my $first_non_white = $child->firstChild;
260: if (defined $first_non_white && $first_non_white->nodeType == XML_TEXT_NODE &&
261: $first_non_white->nodeValue =~ /^\s*$/) {
262: $first_non_white = $first_non_white->nextSibling;
263: }
264: if (defined $first_non_white) {
265: $nextChild = $first_non_white;
266: replace_by_children($child);
267: } else {
268: $node->removeChild($child);
269: }
270: } else {
271: remove_elements($child, $to_remove);
272: }
273: }
274: }
275: }
276:
277: # removes some attributes that have an invalid empty value
278: sub remove_empty_attributes {
279: my ($root) = @_;
280: my $doc = $root->ownerDocument;
281: # this list is based on validation errors in the MSU subset (it could be more complete if it was based on the schema)
282: my @attributes = (
283: ['curve', ['pointsize']],
284: ['foil', ['location']],
285: ['foilgroup', ['checkboxoptions', 'options', 'texoptions']],
286: ['gnuplot', ['pattern', 'texwidth']],
287: ['img', ['height', 'texheight', 'texwidth', 'texwrap', 'width']],
288: ['import', ['importmode']],
289: ['optionresponse', ['max']],
290: ['organicstructure', ['options']],
291: ['radiobuttonresponse', ['max']],
292: ['randomlabel', ['height', 'texwidth', 'width']],
293: ['stringresponse', ['type']],
294: ['textline', ['size']],
295: );
296: foreach my $element_attributes (@attributes) {
297: my $element_name = $element_attributes->[0];
298: my $attribute_names = $element_attributes->[1];
299: my @elements = $doc->getElementsByTagName($element_name);
300: foreach my $element (@elements) {
301: foreach my $attribute_name (@$attribute_names) {
302: my $value = $element->getAttribute($attribute_name);
303: if (defined $value && $value =~ /^\s*$/) {
304: $element->removeAttribute($attribute_name);
305: }
306: }
307: }
308: }
309: }
310:
311: # fixes the case for a few attributes that are not all lowercase
312: # (the HTML parser used in html_to_xml turns everything lowercase, which is a good thing in general)
313: sub fix_attribute_case {
314: my ($root) = @_;
315: my $doc = $root->ownerDocument;
316: my @attributes = (
317: ['labelgroup', ['TeXsize']],
318: ['h1', ['TeXsize']],
319: ['h2', ['TeXsize']],
320: ['h3', ['TeXsize']],
321: ['h4', ['TeXsize']],
322: ['h5', ['TeXsize']],
323: ['h6', ['TeXsize']],
324: # font and basefont have a TeXsize but will be removed
325: ['optionresponse', ['TeXlayout']],
326: ['itemgroup', ['TeXitemgroupwidth']],
327: ['Task', ['OptionalRequired']],
328: ['Question', ['OptionalRequired','Mandatory']],
329: ['Instance', ['OptionalRequired','Disabled']],
330: ['Criteria', ['Mandatory']],
331: ['table', ['TeXwidth','TeXtheme']],
332: ['td', ['TeXwidth']],
333: ['th', ['TeXwidth']],
334: ['img', ['TeXwidth','TeXheight','TeXwrap']],
335: );
336: foreach my $element_attributes (@attributes) {
337: my $element_name = $element_attributes->[0];
338: my $attribute_names = $element_attributes->[1];
339: my @elements = $doc->getElementsByTagName($element_name);
340: foreach my $element (@elements) {
341: foreach my $attribute_name (@$attribute_names) {
342: my $value = $element->getAttribute(lc($attribute_name));
343: if (defined $value) {
344: $element->removeAttribute(lc($attribute_name));
345: $element->setAttribute($attribute_name, $value);
346: }
347: }
348: }
349: }
350: }
351:
352: # Replaces m by HTML, tm and/or dtm (which will be replaced by <m> later, but they are useful
353: # to know if the element is a block element or not).
354: # m might contain non-math LaTeX, while tm and dtm may only contain math.
355: sub replace_m {
356: my ($root) = @_;
357: my $doc = $root->ownerDocument;
358: # search for variable declarations
359: my @variables = ();
360: my @scripts = $root->getElementsByTagName('script');
361: foreach my $script (@scripts) {
362: my $type = $script->getAttribute('type');
363: if (defined $type && $type eq 'loncapa/perl') {
364: if (defined $script->firstChild && $script->firstChild->nodeType == XML_TEXT_NODE) {
365: my $text = $script->firstChild->nodeValue;
366: # NOTE: we are not interested in replacing "@value", only "$value"
367: # this regexp is for " $a = ..." and " $a[...] = ..."
368: while ($text =~ /^[ \t]*\$([a-zA-Z_0-9]+)(?:\[[^\]]+\])?[ \t]*=/gm) {
369: if (!string_in_array(\@variables, $1)) {
370: push(@variables, $1);
371: }
372: }
373: # this regexp is for "...; $a = ..." and "...; $a[...] = ..."
374: while ($text =~ /^[^'"\/;]+;[ \t]*\$([a-zA-Z_0-9]+)(?:\[[^\]]+\])?[ \t]*=/gm) {
375: if (!string_in_array(\@variables, $1)) {
376: push(@variables, $1);
377: }
378: }
379: # this regexp is for " @a = ..."
380: while ($text =~ /^[ \t]*\@([a-zA-Z_0-9]+)[ \t]*=/gm) {
381: if (!string_in_array(\@variables, $1)) {
382: push(@variables, $1);
383: }
384: }
385: # this regexp is for " ($a, $b, $c) = ..."
386: my @matches = ($text =~ /^[ \t]*\([ \t]*\$([a-zA-Z_0-9]+)(?:[ \t]*,[ \t]*\$([a-zA-Z_0-9]+))*[ \t]*\)[ \t]*=/gm);
387: foreach my $match (@matches) {
388: if (!defined $match) {
389: next; # not sure why it happens, but it does
390: }
391: if (!string_in_array(\@variables, $match)) {
392: push(@variables, $match);
393: }
394: }
395: # and this one is for "push @a"
396: while ($text =~ /^[ \t]*push @([a-zA-Z_0-9]+)[ \t,]*/gm) {
397: if (!string_in_array(\@variables, $1)) {
398: push(@variables, $1);
399: }
400: }
401: # use the opportunity to report usage of <m> in Perl scripts
402: if ($text =~ /^[^#].*<m[ >]/m) {
403: if ($warnings) {
404: print "WARNING: <m> is used in a script, it should be converted by hand\n";
405: }
406: }
407: }
408: }
409: }
410: my @ms = $root->getElementsByTagName('m');
411: foreach my $m (@ms) {
412: if (!defined $m->firstChild) {
413: $m->parentNode->removeChild($m);
414: next;
415: }
416: if (defined $m->firstChild->nextSibling || $m->firstChild->nodeType != XML_TEXT_NODE) {
417: if ($warnings) {
418: print "WARNING: m value is not simple text\n";
419: }
420: next;
421: }
422: my $text = $m->firstChild->nodeValue;
423: my $text_before_variable_replacement = $text;
424: my $var_key1 = 'dfhg3df54hg65hg4';
425: my $var_key2 = 'dfhg654d6f5g4h5f';
426: my $eval = defined $m->getAttribute('eval') && $m->getAttribute('eval') eq 'on';
427: if ($eval) {
428: # replace variables
429: foreach my $variable (@variables) {
430: my $replacement = $var_key1.$variable.$var_key2;
431: $text =~ s/\$$variable(?![a-zA-Z])/$replacement/ge;
432: $text =~ s/\$\{$variable\}/$replacement/ge;
433: }
434: }
435: # check if the expression is enclosed in math separators: $ $$ \( \) \[ \]
436: # if so, replace the whole node by dtm or tm
437: my $new_text;
438: my $new_node_name;
439: if ($text =~ /^\s*\$\$([^\$]*)\$\$\s*$/) {
440: $new_node_name = 'dtm';
441: $new_text = $1;
442: } elsif ($text =~ /^\s*\\\[(.*)\\\]\s*$/s) {
443: $new_node_name = 'dtm';
444: $new_text = $1;
445: } elsif ($text =~ /^\s*\$([^\$]*)\$\s*$/) {
446: $new_node_name = 'tm';
447: $new_text = $1;
448: } elsif ($text =~ /^\s*\\\((.*)\\\)\s*$/s) {
449: $new_node_name = 'tm';
450: $new_text = $1;
451: }
452: if (defined $new_node_name) {
453: if ($eval) {
454: foreach my $variable (@variables) {
455: my $replacement = $var_key1.$variable.$var_key2;
456: $new_text =~ s/$replacement([a-zA-Z])/\${$variable}$1/g;
457: $new_text =~ s/$replacement/\$$variable/g;
458: }
459: }
460: my $new_node = $doc->createElement($new_node_name);
461: if ($eval) {
462: $new_node->setAttribute('eval', 'on');
463: }
464: $new_node->appendChild($doc->createTextNode($new_text));
465: $m->parentNode->replaceChild($new_node, $m);
466: next;
467: }
468: if ($text !~ /\$|\\\(|\\\)|\\\[|\\\]/) {
469: # there are no math separators inside
470: # try to guess if this is meant as math
471: my $found_math = 0;
472: foreach my $symbol (@latex_math) {
473: if (index($text, $symbol) != -1) {
474: $found_math = 1;
475: last;
476: }
477: }
478: if ($found_math) {
479: # interpret the whole text as LaTeX inline math
480: my $new_node = $doc->createElement('tm');
481: if ($eval) {
482: $new_node->setAttribute('eval', 'on');
483: }
484: $new_node->appendChild($doc->createTextNode($text_before_variable_replacement));
485: $m->parentNode->replaceChild($new_node, $m);
486: next;
487: }
488: # no math symbol found, we will convert the text with tth
489: }
490:
491: # there are math separators inside, even after hiding variables, or there was no math symbol
492:
493: # hide math parts inside before running tth
494: my $math_key1 = '#ghjgdh5hg45gf';
495: my $math_key2 = '#';
496: my @maths = ();
497: my @separators = (['$$','$$'], ['\\(','\\)'], ['\\[','\\]'], ['$','$']);
498: foreach my $seps (@separators) {
499: my $sep1 = $seps->[0];
500: my $sep2 = $seps->[1];
501: my $pos1 = index($text, $sep1);
502: if ($pos1 == -1) {
503: next;
504: }
505: my $pos2 = index($text, $sep2, $pos1+length($sep1));
506: while ($pos1 != -1 && $pos2 != -1) {
507: my $replace = substr($text, $pos1, $pos2+length($sep2)-$pos1);
508: push(@maths, $replace);
509: my $by = $math_key1.scalar(@maths).$math_key2;
510: $text = substr($text, 0, $pos1).$by.substr($text, $pos2+length($sep2));
511: $pos1 = index($text, $sep1);
512: if ($pos1 != -1) {
513: $pos2 = index($text, $sep2, $pos1+length($sep1));
514: }
515: }
516: }
517: # get HTML as text from tth
518: my $html_text = tth($text);
519: # replace math by replacements
520: for (my $i=0; $i < scalar(@maths); $i++) {
521: my $math = $maths[$i];
522: $math =~ s/&/&/g;
523: $math =~ s/</</g;
524: $math =~ s/>/>/g;
525: if ($math =~ /^\$\$(.*)\$\$$/s) {
526: $math = '<dtm>'.$1.'</dtm>';
527: } elsif ($math =~ /^\\\[(.*)\\\]$/s) {
528: $math = '<dtm>'.$1.'</dtm>';
529: } elsif ($math =~ /^\\\((.*)\\\)$/s) {
530: $math = '<tm>'.$1.'</tm>';
531: } elsif ($math =~ /^\$(.*)\$$/s) {
532: $math = '<tm>'.$1.'</tm>';
533: }
534: my $replace = $math_key1.($i+1).$math_key2;
535: $html_text =~ s/$replace/$math/;
536: }
537: # replace variables if necessary
538: if ($eval) {
539: foreach my $variable (@variables) {
540: my $replacement = $var_key1.$variable.$var_key2;
541: $html_text =~ s/$replacement([a-zA-Z])/\${$variable}$1/g;
542: $html_text =~ s/$replacement/\$$variable/g;
543: }
544: }
545: my $fragment = html_to_dom($html_text);
546: $doc->adoptNode($fragment);
547: $m->parentNode->replaceChild($fragment, $m);
548:
549: }
550: }
551:
552: # Returns the HTML equivalent of LaTeX input, using tth
553: sub tth {
554: my ($text) = @_;
555: my ($fh, $tmp_path) = tempfile();
556: binmode($fh, ':utf8');
557: print $fh $text;
558: close $fh;
559: my $output = `tth -r -w2 -u -y0 < $tmp_path 2>/dev/null`;
560: # hopefully the temp file will not be removed before this point (otherwise we should use unlink_on_destroy 0)
561: $output =~ s/^\s*|\s*$//;
562: $output =~ s/<div class="p"><!----><\/div>/<br\/>/; # why is tth using such ugly markup for \newline ?
563: return $output;
564: }
565:
566: # transform simple HTML into a DOM fragment (which will need to be adopted by the document)
567: sub html_to_dom {
568: my ($text) = @_;
569: $text = '<root>'.$text.'</root>';
570: my $textref = html_to_xml::html_to_xml(\$text);
571: utf8::upgrade($$textref); # otherwise the XML parser fails when the HTML parser turns into a character
572: my $dom_doc = XML::LibXML->load_xml(string => $textref);
573: my $root = $dom_doc->documentElement;
574: remove_empty_style($root);
575: my $fragment = $dom_doc->createDocumentFragment();
576: my $next;
577: for (my $n=$root->firstChild; defined $n; $n=$next) {
578: $next = $n->nextSibling;
579: $root->removeChild($n);
580: $fragment->appendChild($n);
581: }
582: return($fragment);
583: }
584:
585: # Use the linked sty files to guess which newly defined elements should be considered blocks.
586: # Also adds to @containing_styles the sty elements that contain styles.
587: # @param {string} fn - the file path (we only extract the directory path from it)
1.2 damieng 588: # @param {string} res_dir - path of res directory parent (without the / at the end)
589: # @param {Element} root - the root element
1.1 damieng 590: sub add_sty_blocks {
1.2 damieng 591: my ($fn, $res_dir, $root, $all_block) = @_;
1.1 damieng 592: my $doc = $root->ownerDocument;
593: my @parserlibs = $doc->getElementsByTagName('parserlib');
594: my @libs = ();
595: foreach my $parserlib (@parserlibs) {
596: if (defined $parserlib->firstChild && $parserlib->firstChild->nodeType == XML_TEXT_NODE) {
597: my $value = $parserlib->firstChild->nodeValue;
598: $value =~ s/^\s+|\s+$//g;
599: if ($value ne '') {
600: push(@libs, $value);
601: }
602: }
603: }
604: my ($name, $path, $suffix) = fileparse($fn);
605: foreach my $sty (@libs) {
606: if (substr($sty, 0, 1) eq '/') {
1.2 damieng 607: $sty = $res_dir.$sty;
1.1 damieng 608: } else {
609: $sty = $path.$sty;
610: }
611: my $new_elements = parse_sty($sty, $all_block);
612: better_guess($root, $new_elements, $all_block);
613: my $new_blocks = $new_elements->{'block'};
614: my $new_inlines = $new_elements->{'inline'};
615: push(@$all_block, @{$new_blocks});
616: #push(@inlines, @{$new_inlines}); # we are not using a list of inline elements at this point
617: }
618: }
619:
620: ##
621: # Parses a sty file and returns lists of block and inline elements.
622: # @param {string} fn - the file path
623: ##
624: sub parse_sty {
625: my ($fn, $all_block) = @_;
626: my @blocks = ();
627: my @inlines = ();
628: my $p = HTML::TokeParser->new($fn);
629: if (! $p) {
630: die "post_xml.pl: parse_sty: Error reading $fn\n";
631: }
632: $p->empty_element_tags(1);
633: my $in_definetag = 0;
634: my $in_render = 0;
635: my %newtags = ();
636: my $newtag = '';
637: my $is_block = 0;
638: while (my $token = $p->get_token) {
639: if ($token->[0] eq 'S') {
640: my $tag = lc($token->[1]);
641: if ($tag eq 'definetag') {
642: $in_definetag = 1;
643: $is_block = 0;
644: my $attributes = $token->[2];
645: $newtag = $attributes->{'name'};
646: if (substr($newtag, 0, 1) eq '/') {
647: $newtag = substr($newtag, 1);
648: }
649: } elsif ($in_definetag && $tag eq 'render') {
650: $in_render = 1;
651: $is_block = 0;
652: } elsif ($in_render) {
653: if (string_in_array($all_block, $tag)) {
654: $is_block = 1;
655: }
656: }
657: } elsif ($token->[0] eq 'E') {
658: my $tag = lc($token->[1]);
659: if ($tag eq 'definetag') {
660: $in_definetag = 0;
661: if (defined $newtags{$newtag}) {
662: $newtags{$newtag} = $newtags{$newtag} || $is_block;
663: } else {
664: $newtags{$newtag} = $is_block;
665: }
666: } elsif ($in_definetag && $tag eq 'render') {
667: $in_render = 0;
668: }
669: }
670: }
671: foreach $newtag (keys(%newtags)) {
672: if ($newtags{$newtag} == 1) {
673: push(@blocks, $newtag);
674: } else {
675: push(@inlines, $newtag);
676: }
677: }
678: return {'block'=>\@blocks, 'inline'=>\@inlines};
679: }
680:
681: ##
682: # Marks as block the elements that contain block elements in the input file.
683: # Also adds to @containing_styles the sty elements that contain styles.
684: # @param {string} fn - the file path
685: # @param {Hash<string,Array>} new_elements - contains arrays in 'block' and 'inline'
686: ##
687: sub better_guess {
688: my ($root, $new_elements, $all_block) = @_;
689: my $new_blocks = $new_elements->{'block'};
690: my $new_inlines = $new_elements->{'inline'};
691:
692: my @change = (); # change these elements from inline to block
693: foreach my $tag (@{$new_inlines}) {
694: my @nodes = $root->getElementsByTagName($tag);
695: NODE_LOOP: foreach my $node (@nodes) {
696: for (my $child=$node->firstChild; defined $child; $child=$child->nextSibling) {
697: if ($child->nodeType == XML_ELEMENT_NODE) {
698: if (string_in_array($all_block, $child->nodeName) || string_in_array($new_blocks, $child->nodeName)) {
699: push(@change, $tag);
700: last NODE_LOOP;
701: }
702: }
703: }
704: }
705: }
706: foreach my $inline (@change) {
707: my $index = 0;
708: $index++ until $new_inlines->[$index] eq $inline;
709: splice(@{$new_inlines}, $index, 1);
710: push(@{$new_blocks}, $inline);
711: }
712: # add to @containing_styles when a style is used inside
713: # NOTE: some sty elements will be added even though they should not, but if we don't do that
714: # all style will be removed in the sty elements.
715: foreach my $tag ((@{$new_blocks}, @{$new_inlines})) {
716: my @nodes = $root->getElementsByTagName($tag);
717: NODE_LOOP: foreach my $node (@nodes) {
718: for (my $child=$node->firstChild; defined $child; $child=$child->nextSibling) {
719: if ($child->nodeType == XML_ELEMENT_NODE) {
720: if (string_in_array(\@html_styles, $child->nodeName)) {
721: push(@containing_styles, $tag);
722: last NODE_LOOP;
723: }
724: }
725: }
726: }
727: }
728: }
729:
730: # When a style element contains a block, move the style inside the block where it is allowed.
731: # style/block/other -> block/style/other
732: # When a style is used where it is not allowed, move it inside its children or remove it (unless it contains only text)
733: # element_not_containing_styles/style/other -> element_not_containing_styles/other/style (except if other is a style)
734: # The fix is not perfect in the case of element_not_containing_styles/style1/style2/block/text (style1 will be lost):
735: # element_not_containing_styles/style1/style2/block/text -> element_not_containing_styles/block/style2/text
736: # (a solution to this problem would be to merge the styles in a span)
737: # NOTE: .sty defined elements might have been added to @containing_styles by better_guess().
738: sub fix_block_styles {
739: my ($element, $all_block) = @_;
740: my $doc = $element->ownerDocument;
741: if (string_in_array(\@html_styles, $element->nodeName)) {
742: # move spaces out of the style element
743: if (defined $element->firstChild && $element->firstChild->nodeType == XML_TEXT_NODE) {
744: my $child = $element->firstChild;
745: if ($child->nodeValue =~ /^(\s+)(\S.*)$/s) {
746: $element->parentNode->insertBefore($doc->createTextNode($1), $element);
747: $child->setData($2);
748: }
749: }
750: if (defined $element->lastChild && $element->lastChild->nodeType == XML_TEXT_NODE) {
751: my $child = $element->lastChild;
752: if ($child->nodeValue =~ /^(.*\S)(\s+)$/s) {
753: $element->parentNode->insertAfter($doc->createTextNode($2), $element);
754: $child->setData($1);
755: }
756: }
757:
758: my $found_block = 0;
759: for (my $child=$element->firstChild; defined $child; $child=$child->nextSibling) {
760: if ($child->nodeType == XML_ELEMENT_NODE && string_in_array($all_block, $child->nodeName)) {
761: $found_block = 1;
762: last;
763: }
764: }
765: my $no_style_here = !string_in_array(\@containing_styles, $element->parentNode->nodeName);
766: if ($found_block || $no_style_here) {
767: # there is a block or the style is not allowed here,
768: # the style element has to be replaced by its modified children
769: my $s; # a clone of the style
770: my $next;
771: for (my $child=$element->firstChild; defined $child; $child=$next) {
772: $next = $child->nextSibling;
773: if ($child->nodeType == XML_ELEMENT_NODE && (string_in_array($all_block, $child->nodeName) ||
774: $child->nodeName eq 'br' || $no_style_here)) {
775: # avoid inverting a style with a style with $no_style_here (that would cause endless recursion)
776: if (!$no_style_here || (!string_in_array(\@html_styles, $child->nodeName) &&
777: string_in_array(\@containing_styles, $child->nodeName))) {
778: # block node or inline node when the style is not allowed:
779: # move all children inside the style, and make the style the only child
780: $s = $element->cloneNode();
781: my $next2;
782: for (my $child2=$child->firstChild; defined $child2; $child2=$next2) {
783: $next2 = $child2->nextSibling;
784: $child->removeChild($child2);
785: $s->appendChild($child2);
786: }
787: $child->appendChild($s);
788: }
789: $s = undef;
790: } elsif (($child->nodeType == XML_TEXT_NODE && $child->nodeValue !~ /^\s*$/) ||
791: $child->nodeType == XML_ELEMENT_NODE) {
792: # if the style is allowed, move text and inline nodes inside the style
793: if (!$no_style_here) {
794: if (!defined $s) {
795: $s = $element->cloneNode();
796: $element->insertBefore($s, $child);
797: }
798: $element->removeChild($child);
799: $s->appendChild($child);
800: }
801: } else {
802: # do not put other nodes inside the style
803: $s = undef;
804: }
805: }
806: # now replace by children and fix them
807: my $parent = $element->parentNode;
808: for (my $child=$element->firstChild; defined $child; $child=$next) {
809: $next = $child->nextSibling;
810: $element->removeChild($child);
811: $parent->insertBefore($child, $element);
812: if ($child->nodeType == XML_ELEMENT_NODE) {
813: fix_block_styles($child, $all_block);
814: }
815: }
816: $parent->removeChild($element);
817: return;
818: }
819: }
820: # otherwise fix all children
821: my $next;
822: for (my $child=$element->firstChild; defined $child; $child=$next) {
823: $next = $child->nextSibling;
824: if ($child->nodeType == XML_ELEMENT_NODE) {
825: fix_block_styles($child, $all_block);
826: }
827: }
828: }
829:
830: # removes empty font elements and font elements that contain at least one block element
831: # replaces other font elements by equivalent span
832: sub fix_fonts {
833: my ($root, $all_block) = @_;
834: my $doc = $root->ownerDocument;
835: my @fonts = $root->getElementsByTagName('font');
836: @fonts = reverse(@fonts); # to deal with the ancestor last in the case of font/font
837: foreach my $font (@fonts) {
838: my $block = 0;
839: for (my $child=$font->firstChild; defined $child; $child=$child->nextSibling) {
840: if (string_in_array($all_block, $child->nodeName) || string_in_array(\@inline_like_block, $child->nodeName)) {
841: $block = 1;
842: last;
843: }
844: }
845: if (!defined $font->firstChild || $block) {
846: # empty font or font containing block elements
847: # replace this node by its content
848: replace_by_children($font);
849: } else {
850: # replace by equivalent span
851: my $color = get_non_empty_attribute($font, 'color');
852: my $size = get_non_empty_attribute($font, 'size');
853: my $face = get_non_empty_attribute($font, 'face');
854: if (defined $face) {
855: $face =~ s/^,|,$//;
856: }
857: if (!defined $color && !defined $size && !defined $face) {
858: # useless font element: replace this node by its content
859: replace_by_children($font);
860: next;
861: }
862: my $replacement;
863: tie (my %properties, 'Tie::IxHash', ());
864: if (!defined $color && !defined $size && defined $face && lc($face) eq 'symbol') {
865: $replacement = $doc->createDocumentFragment();
866: } else {
867: $replacement = $doc->createElement('span');
868: my $css = '';
869: if (defined $color) {
870: $color =~ s/^x/#/;
871: $properties{'color'} = $color;
872: }
873: if (defined $size) {
874: my %hash = (
875: '1' => 'x-small',
876: '2' => 'small',
877: '3' => 'medium',
878: '4' => 'large',
879: '5' => 'x-large',
880: '6' => 'xx-large',
881: '7' => '300%',
882: '-1' => 'small',
883: '-2' => 'x-small',
884: '+1' => 'large',
885: '+2' => 'x-large',
886: '+3' => 'xx-large',
887: '+4' => '300%',
888: );
889: my $value = $hash{$size};
890: if (!defined $value) {
891: $value = 'medium';
892: }
893: $properties{'font-size'} = $value;
894: }
895: if (defined $face) {
896: if (lc($face) ne 'symbol' && lc($face) ne 'bold') {
897: $properties{'font-family'} = $face;
898: }
899: }
900: set_css_properties($replacement, \%properties);
901: }
902: if (defined $face && lc($face) eq 'symbol') {
903: # convert all content to unicode
904: my $next;
905: for (my $child=$font->firstChild; defined $child; $child=$next) {
906: $next = $child->nextSibling;
907: if ($child->nodeType == XML_TEXT_NODE) {
908: my $value = $child->nodeValue;
909: $value =~ tr/ABGDEZHQIKLMNXOPRSTUFCYWabgdezhqiklmnxoprVstufcywJjv¡«¬®/ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩαβγδεζηθικλμνξοπρςστυφχψωϑϕϖϒ↔←→/;
910: $child->setData($value);
911: }
912: }
913: }
914: # replace the font node
915: if ($replacement->nodeType == XML_ELEMENT_NODE && !defined $font->previousSibling &&
916: !defined $font->nextSibling && string_in_array(\@accepting_style, $font->parentNode->nodeName)) {
917: # use CSS on the parent block and replace font by its children instead of using a new element
918: set_css_properties($font->parentNode, \%properties);
919: replace_by_children($font);
920: } else {
921: # move all font children inside the replacement (span or fragment)
922: my $next;
923: for (my $child=$font->firstChild; defined $child; $child=$next) {
924: $next = $child->nextSibling;
925: $font->removeChild($child);
926: $replacement->appendChild($child);
927: }
928: # replace font
929: $font->parentNode->replaceChild($replacement, $font);
930: }
931: }
932: }
933: $root->normalize();
934: }
935:
936: # replaces u by <span style="text-decoration: underline">
937: sub replace_u {
938: my ($root) = @_;
939: my $doc = $root->ownerDocument;
940: my @us = $root->getElementsByTagName('u');
941: foreach my $u (@us) {
942: my $span = $doc->createElement('span');
943: $span->setAttribute('style', 'text-decoration: underline');
944: my $next;
945: for (my $child=$u->firstChild; defined $child; $child=$next) {
946: $next = $child->nextSibling;
947: $u->removeChild($child);
948: $span->appendChild($child);
949: }
950: $u->parentNode->replaceChild($span, $u);
951: }
952: }
953:
954: # removes CDATA sections tags that have not been parsed correcty by the HTML parser
955: # also removes bad comments in script elements
956: sub remove_bad_cdata_sections {
957: my ($root) = @_;
958: my $doc = $root->ownerDocument;
959: foreach my $name (@preserve_elements) {
960: my @nodes = $root->getElementsByTagName($name);
961: foreach my $node (@nodes) {
962: if (defined $node->firstChild && $node->firstChild->nodeType == XML_TEXT_NODE) {
963: my $value = $node->firstChild->nodeValue;
964: if ($name eq 'script' && (!defined $node->getAttribute('type') || $node->getAttribute('type') ne 'loncapa/perl') &&
965: !defined $node->firstChild->nextSibling && $value =~ /^(\s*)<!--(.*)-->(\s*)$/) {
966: # web browsers interpret that as a real comment when it is on 1 line, but the Perl HTML parser thinks it is the script
967: # -> turning it back into a comment
968: # (this is only true for Javascript script elements, since LON-CAPA does not parse loncapa/perl scripts in the same way)
969: $node->removeChild($node->firstChild);
970: $node->appendChild($doc->createComment($2));
971: next;
972: }
973: # at the beginning:
974: $value =~ s/^(\s*)<!\[CDATA\[/$1/; # <![CDATA[
975: $value =~ s/^(\s*)\/\*\s*<!\[CDATA\[\s*\*\//$1/; # /* <![CDATA[ */
976: $value =~ s/^(\s*)\/\/\s*<!\[CDATA\[/$1/; # // <![CDATA[
977: $value =~ s/^(\s*)(\/\/)?\s*<!--/$1/; # // <!--
978: # at the end:
979: $value =~ s/\/\/\s*\]\]>(\s*)$/$1/; # // ]]>
980: $value =~ s/\]\]>(\s*)$/$1/; # ]]>
981: $value =~ s/(\/\/)?\s*-->(\s*)$/$2/; # // -->
982: $value =~ s/\/\*\s*\]\]>\s*\*\/(\s*)$/$1/; # /* ]]> */
983:
984: $value = "\n".$value."\n";
985: $value =~ s/\s*(\n[ \t]*)/$1/;
986: $value =~ s/\s+$/\n/;
987: $node->firstChild->setData($value);
988: }
989: }
990: }
991: }
992:
993: # adds CDATA sections to scripts
994: sub add_cdata_sections {
995: my ($root) = @_;
996: my $doc = $root->ownerDocument;
997: my @scripts = $root->getElementsByTagName('script');
998: my @answers = $root->getElementsByTagName('answer');
999: foreach my $answer (@answers) {
1000: my $ancestor = $answer->parentNode;
1001: my $found_capa_response = 0;
1002: while (defined $ancestor) {
1003: if ($ancestor->nodeName eq 'numericalresponse' || $ancestor->nodeName eq 'formularesponse') {
1004: $found_capa_response = 1;
1005: last;
1006: }
1007: $ancestor = $ancestor->parentNode;
1008: }
1009: if (!$found_capa_response) {
1010: push(@scripts, $answer);
1011: }
1012: }
1013: foreach my $script (@scripts) {
1014: # use a CDATA section in the normal situation, for any script
1015: my $first = $script->firstChild;
1016: if (defined $first && $first->nodeType == XML_TEXT_NODE && !defined $first->nextSibling) {
1017: my $cdata = $doc->createCDATASection($first->nodeValue);
1018: $script->replaceChild($cdata, $first);
1019: }
1020: }
1021: }
1022:
1023: # removes "<!--" and "-->" at the beginning and end of style elements
1024: sub fix_style_element {
1025: my ($root) = @_;
1026: my @styles = $root->getElementsByTagName('style');
1027: foreach my $style (@styles) {
1028: if (defined $style->firstChild && $style->firstChild->nodeType == XML_TEXT_NODE &&
1029: !defined $style->firstChild->nextSibling) {
1030: my $text = $style->firstChild->nodeValue;
1031: if ($text =~ /^\s*<!--(.*)-->\s*$/s) {
1032: $style->firstChild->setData($1);
1033: }
1034: }
1035: }
1036: }
1037:
1038: # create missing cells at the end of table rows
1039: sub fix_tables {
1040: my ($root) = @_;
1041: my @tables = $root->getElementsByTagName('table');
1042: foreach my $table (@tables) {
1043: fix_cells($table);
1044: foreach my $tbody ($table->getChildrenByTagName('tbody')) {
1045: fix_cells($tbody);
1046: }
1047: foreach my $thead ($table->getChildrenByTagName('thead')) {
1048: fix_cells($thead);
1049: }
1050: foreach my $tfoot ($table->getChildrenByTagName('tfoot')) {
1051: fix_cells($tfoot);
1052: }
1053: }
1054: }
1055:
1056: # create missing cells at the end of table rows
1057: sub fix_cells {
1058: my ($table) = @_; # could actually be table, tbody, thead or tfoot
1059: my $doc = $table->ownerDocument;
1060: my @nb_cells = ();
1061: my $max_nb_cells = 0;
1062: my @rowspans = ();
1063: my @trs = $table->getChildrenByTagName('tr');
1064: foreach my $tr (@trs) {
1065: my $nb_cells;
1066: if (defined $rowspans[0]) {
1067: $nb_cells = shift(@rowspans);
1068: } else {
1069: $nb_cells = 0;
1070: }
1071: for (my $cell=$tr->firstChild; defined $cell; $cell=$cell->nextSibling) {
1072: if ($cell->nodeName eq 'td' || $cell->nodeName eq 'th') {
1073: my $colspan = $cell->getAttribute('colspan');
1074: if (defined $colspan && $colspan =~ /^\s*[0-9]+\s*$/) {
1075: $nb_cells += $colspan;
1076: } else {
1077: $nb_cells++;
1078: }
1079: my $rowspan = $cell->getAttribute('rowspan');
1080: if (defined $rowspan && $rowspan =~ /^\s*[0-9]+\s*$/) {
1081: for (my $i=0; $i < $rowspan-1; $i++) {
1082: if (!defined $rowspans[$i]) {
1083: $rowspans[$i] = 1;
1084: } else {
1085: $rowspans[$i]++;
1086: }
1087: }
1088: }
1089: }
1090: }
1091: push(@nb_cells, $nb_cells);
1092: if ($nb_cells > $max_nb_cells) {
1093: $max_nb_cells = $nb_cells;
1094: }
1095: }
1096: foreach my $tr (@trs) {
1097: my $nb_cells = shift(@nb_cells);
1098: if ($nb_cells < $max_nb_cells) {
1099: for (1..($max_nb_cells - $nb_cells)) {
1100: $tr->appendChild($doc->createElement('td'));
1101: }
1102: }
1103: }
1104: }
1105:
1106: # replaces ul/ul by ul/li/ul and the same for ol (using the previous li if possible)
1107: # also adds a ul element when a li has no ul/ol ancestor
1108: sub fix_lists {
1109: my ($root) = @_;
1110: my $doc = $root->ownerDocument;
1111: my @uls = $root->getElementsByTagName('ul');
1112: my @ols = $root->getElementsByTagName('ol');
1113: my @lists = (@uls, @ols);
1114: foreach my $list (@lists) {
1115: my $next;
1116: for (my $child=$list->firstChild; defined $child; $child=$next) {
1117: $next = $child->nextSibling;
1118: if ($child->nodeType == XML_ELEMENT_NODE && string_in_array(['ul','ol'], $child->nodeName)) {
1119: my $previous = $child->previousNonBlankSibling(); # note: non-DOM method
1120: $list->removeChild($child);
1121: if (defined $previous && $previous->nodeType == XML_ELEMENT_NODE && $previous->nodeName eq 'li') {
1122: $previous->appendChild($child);
1123: } else {
1124: my $li = $doc->createElement('li');
1125: $li->appendChild($child);
1126: if (!defined $next) {
1127: $list->appendChild($li);
1128: } else {
1129: $list->insertBefore($li, $next);
1130: }
1131: }
1132: }
1133: }
1134: }
1135: my @lis = $root->getElementsByTagName('li');
1136: foreach my $li (@lis) {
1137: my $found_list_ancestor = 0;
1138: my $ancestor = $li->parentNode;
1139: while (defined $ancestor) {
1140: if ($ancestor->nodeName eq 'ul' || $ancestor->nodeName eq 'ol') {
1141: $found_list_ancestor = 1;
1142: last;
1143: }
1144: $ancestor = $ancestor->parentNode;
1145: }
1146: if (!$found_list_ancestor) {
1147: # replace li by ul and add li under ul
1148: my $ul = $doc->createElement('ul');
1149: $li->parentNode->insertBefore($ul, $li);
1150: $li->parentNode->removeChild($li);
1151: $ul->appendChild($li);
1152: # add all other li afterwards inside ul (there might be text nodes in-between)
1153: my $next = $ul->nextSibling;
1154: while (defined $next) {
1155: my $next_next = $next->nextSibling;
1156: if ($next->nodeType == XML_TEXT_NODE && $next->nodeValue =~ /^\s*$/ &&
1157: defined $next_next && $next_next->nodeType == XML_ELEMENT_NODE && $next_next->nodeName eq 'li') {
1158: $next->parentNode->removeChild($next);
1159: $ul->appendChild($next);
1160: $next = $next_next;
1161: $next_next = $next_next->nextSibling;
1162: }
1163: if ($next->nodeType == XML_ELEMENT_NODE && $next->nodeName eq 'li') {
1164: $next->parentNode->removeChild($next);
1165: $ul->appendChild($next);
1166: } else {
1167: last;
1168: }
1169: $next = $next_next;
1170: }
1171: }
1172: }
1173: }
1174:
1175: # Some "image" elements are actually img element with a wrong name. This renames them.
1176: # Amazingly enough, "<image src=..." displays an image in some browsers
1177: # ("image" has existed at some point as an experimental HTML element).
1178: sub fix_wrong_name_for_img {
1179: my ($root) = @_;
1180: my @images = $root->getElementsByTagName('image');
1181: foreach my $image (@images) {
1182: if (!defined $image->getAttribute('src')) {
1183: next;
1184: }
1185: my $found_correct_ancestor = 0;
1186: my $ancestor = $image->parentNode;
1187: while (defined $ancestor) {
1188: if ($ancestor->nodeName eq 'drawimage' || $ancestor->nodeName eq 'imageresponse') {
1189: $found_correct_ancestor = 1;
1190: last;
1191: }
1192: $ancestor = $ancestor->parentNode;
1193: }
1194: if ($found_correct_ancestor) {
1195: next;
1196: }
1197: # this really has to be renamed "img"
1198: $image->setNodeName('img');
1199: }
1200: }
1201:
1202: # Replaces many deprecated attributes and replaces them by equivalent CSS when possible
1203: sub replace_deprecated_attributes_by_css {
1204: my ($root) = @_;
1205:
1206: fix_deprecated_in_tables($root);
1207:
1208: fix_deprecated_in_table_rows($root);
1209:
1210: fix_deprecated_in_table_cells($root);
1211:
1212: fix_deprecated_in_lists($root);
1213:
1214: fix_deprecated_in_list_items($root);
1215:
1216: fix_deprecated_in_hr($root);
1217:
1218: fix_deprecated_in_img($root);
1219:
1220: fix_deprecated_in_body($root);
1221:
1222: fix_align_attribute($root);
1223: }
1224:
1225: # Replaces deprecated attributes in tables
1226: sub fix_deprecated_in_tables {
1227: my ($root) = @_;
1228: my @tables = $root->getElementsByTagName('table');
1229: foreach my $table (@tables) {
1230: tie (my %new_properties, 'Tie::IxHash', ());
1231: my $align = $table->getAttribute('align');
1232: if (defined $align) {
1233: $table->removeAttribute('align');
1234: $align = lc(trim($align));
1235: }
1236: if ($table->parentNode->nodeName eq 'center' || (defined $align && $align eq 'center') ||
1237: (defined $table->parentNode->getAttribute('align') && $table->parentNode->getAttribute('align') eq 'center')) {
1238: $new_properties{'margin-left'} = 'auto';
1239: $new_properties{'margin-right'} = 'auto';
1240: }
1241: if (defined $align && ($align eq 'left' || $align eq 'right')) {
1242: $new_properties{'float'} = $align;
1243: }
1244: my $width = $table->getAttribute('width');
1245: if (defined $width) {
1246: $table->removeAttribute('width');
1247: $width = trim($width);
1248: if ($width =~ /^[0-9]+$/) {
1249: $width .= 'px';
1250: }
1251: if ($width ne '') {
1252: $new_properties{'width'} = $width;
1253: }
1254: }
1255: my $height = $table->getAttribute('height');
1256: if (defined $height) {
1257: $table->removeAttribute('height');
1258: # no replacement for table height
1259: }
1260: my $bgcolor = $table->getAttribute('bgcolor');
1261: if (defined $bgcolor) {
1262: $table->removeAttribute('bgcolor');
1263: $bgcolor = trim($bgcolor);
1264: $bgcolor =~ s/^x\s*//;
1265: if ($bgcolor ne '') {
1266: $new_properties{'background-color'} = $bgcolor;
1267: }
1268: }
1269: my $frame = $table->getAttribute('frame');
1270: if (defined $frame) {
1271: $table->removeAttribute('frame');
1272: $frame = lc(trim($frame));
1273: if ($frame eq 'void') {
1274: $new_properties{'border'} = 'none';
1275: } elsif ($frame eq 'above') {
1276: $new_properties{'border-top'} = '1px solid black';
1277: } elsif ($frame eq 'below') {
1278: $new_properties{'border-bottom'} = '1px solid black';
1279: } elsif ($frame eq 'hsides') {
1280: $new_properties{'border-top'} = '1px solid black';
1281: $new_properties{'border-bottom'} = '1px solid black';
1282: } elsif ($frame eq 'vsides') {
1283: $new_properties{'border-left'} = '1px solid black';
1284: $new_properties{'border-right'} = '1px solid black';
1285: } elsif ($frame eq 'lhs') {
1286: $new_properties{'border-left'} = '1px solid black';
1287: } elsif ($frame eq 'rhs') {
1288: $new_properties{'border-right'} = '1px solid black';
1289: } elsif ($frame eq 'box') {
1290: $new_properties{'border'} = '1px solid black';
1291: } elsif ($frame eq 'border') {
1292: $new_properties{'border'} = '1px solid black';
1293: }
1294: }
1295: if (scalar(keys %new_properties) > 0) {
1296: set_css_properties($table, \%new_properties);
1297: }
1298: # we can't replace the border attribute without creating a style block, but we can improve things like border="BORDER"
1299: my $border = $table->getAttribute('border');
1300: if (defined $border) {
1301: $border = trim($border);
1302: if ($border !~ /^\s*[0-9]+\s*(px)?\s*$/) {
1303: $table->setAttribute('border', '1');
1304: }
1305: }
1306: }
1307:
1308: }
1309:
1310: # Replaces deprecated attributes in tr elements
1311: sub fix_deprecated_in_table_rows {
1312: my ($root) = @_;
1313: my @trs = $root->getElementsByTagName('tr');
1314: foreach my $tr (@trs) {
1315: my $old_properties = get_css_properties($tr);
1316: tie (my %new_properties, 'Tie::IxHash', ());
1317: my $bgcolor = $tr->getAttribute('bgcolor');
1318: if (defined $bgcolor) {
1319: $tr->removeAttribute('bgcolor');
1320: if (!defined $old_properties->{'background-color'}) {
1321: $bgcolor = trim($bgcolor);
1322: $bgcolor =~ s/^x\s*//;
1323: if ($bgcolor ne '') {
1324: $new_properties{'background-color'} = $bgcolor;
1325: }
1326: }
1327: }
1328: my $align = $tr->getAttribute('align');
1329: if (defined $align && $align !~ /\s*char\s*/i) {
1330: $tr->removeAttribute('align');
1331: if (!defined $old_properties->{'text-align'}) {
1332: $align = lc(trim($align));
1333: if ($align ne '') {
1334: $new_properties{'text-align'} = $align;
1335: }
1336: }
1337: }
1338: my $valign = $tr->getAttribute('valign');
1339: if (defined $valign) {
1340: $tr->removeAttribute('valign');
1341: if (!defined $old_properties->{'vertical-align'}) {
1342: $valign = lc(trim($valign));
1343: if ($valign ne '') {
1344: $new_properties{'vertical-align'} = $valign;
1345: }
1346: }
1347: }
1348: if (scalar(keys %new_properties) > 0) {
1349: set_css_properties($tr, \%new_properties);
1350: }
1351: }
1352: }
1353:
1354: # Replaces deprecated attributes in table cells (td and th)
1355: sub fix_deprecated_in_table_cells {
1356: my ($root) = @_;
1357: my @tds = $root->getElementsByTagName('td');
1358: my @ths = $root->getElementsByTagName('th');
1359: my @cells = (@tds, @ths);
1360: foreach my $cell (@cells) {
1361: my $old_properties = get_css_properties($cell);
1362: tie (my %new_properties, 'Tie::IxHash', ());
1363: my $width = $cell->getAttribute('width');
1364: if (defined $width) {
1365: $cell->removeAttribute('width');
1366: if (!defined $old_properties->{'width'}) {
1367: $width = trim($width);
1368: if ($width =~ /^[0-9]+$/) {
1369: $width .= 'px';
1370: }
1371: if ($width ne '') {
1372: $new_properties{'width'} = $width;
1373: }
1374: }
1375: }
1376: my $height = $cell->getAttribute('height');
1377: if (defined $height) {
1378: $cell->removeAttribute('height');
1379: if (!defined $old_properties->{'height'}) {
1380: $height = trim($height);
1381: if ($height =~ /^[0-9]+$/) {
1382: $height .= 'px';
1383: }
1384: if ($height ne '') {
1385: $new_properties{'height'} = $height;
1386: }
1387: }
1388: }
1389: my $bgcolor = $cell->getAttribute('bgcolor');
1390: if (defined $bgcolor) {
1391: $cell->removeAttribute('bgcolor');
1392: if (!defined $old_properties->{'background-color'}) {
1393: $bgcolor = trim($bgcolor);
1394: $bgcolor =~ s/^x\s*//;
1395: if ($bgcolor ne '') {
1396: $new_properties{'background-color'} = $bgcolor;
1397: }
1398: }
1399: }
1400: my $align = $cell->getAttribute('align');
1401: if (defined $align && $align !~ /\s*char\s*/i) {
1402: $cell->removeAttribute('align');
1403: if (!defined $old_properties->{'text-align'}) {
1404: $align = lc(trim($align));
1405: if ($align ne '') {
1406: $new_properties{'text-align'} = $align;
1407: }
1408: }
1409: }
1410: my $valign = $cell->getAttribute('valign');
1411: if (defined $valign) {
1412: $cell->removeAttribute('valign');
1413: if (!defined $old_properties->{'vertical-align'}) {
1414: $valign = lc(trim($valign));
1415: if ($valign ne '') {
1416: $new_properties{'vertical-align'} = $valign;
1417: }
1418: }
1419: }
1420: if (scalar(keys %new_properties) > 0) {
1421: set_css_properties($cell, \%new_properties);
1422: }
1423: }
1424: }
1425:
1426: # Replaces deprecated attributes in lists (ul and ol)
1427: sub fix_deprecated_in_lists {
1428: my ($root) = @_;
1429: my @uls = $root->getElementsByTagName('ul');
1430: my @ols = $root->getElementsByTagName('ol');
1431: my @lists = (@uls, @ols);
1432: foreach my $list (@lists) {
1433: my $type = $list->getAttribute('type');
1434: if (defined $type) {
1435: my $lst = list_style_type($type);
1436: if (defined $lst) {
1437: $list->removeAttribute('type');
1438: if (!defined get_css_property($list, 'list-style-type')) {
1439: set_css_property($list, 'list-style-type', $lst);
1440: }
1441: }
1442: }
1443: }
1444: }
1445:
1446: # Replaces deprecated attributes in list items (li)
1447: sub fix_deprecated_in_list_items {
1448: my ($root) = @_;
1449: my @lis = $root->getElementsByTagName('li');
1450: foreach my $li (@lis) {
1451: my $type = $li->getAttribute('type');
1452: if (defined $type) {
1453: my $lst = list_style_type($type);
1454: if (defined $lst) {
1455: $li->removeAttribute('type');
1456: if (!defined get_css_property($li, 'list-style-type')) {
1457: set_css_property($li, 'list-style-type', $lst);
1458: }
1459: }
1460: }
1461: }
1462: }
1463:
1464: # returns the CSS list-style-type value equivalent to the given type attribute for a list or list item
1465: sub list_style_type {
1466: my ($type) = @_;
1467: my $value;
1468: $type = trim($type);
1469: if (lc($type) eq 'circle') {
1470: $value = 'circle';
1471: } elsif (lc($type) eq 'disc') {
1472: $value = 'disc';
1473: } elsif (lc($type) eq 'square') {
1474: $value = 'square';
1475: } elsif ($type eq 'a') {
1476: $value = 'lower-latin';
1477: } elsif ($type eq 'A') {
1478: $value = 'upper-latin';
1479: } elsif ($type eq 'i') {
1480: $value = 'lower-roman';
1481: } elsif ($type eq 'I') {
1482: $value = 'upper-roman';
1483: } elsif ($type eq '1') {
1484: $value = 'decimal';
1485: }
1486: return $value;
1487: }
1488:
1489: # Replaces deprecated attributes in hr
1490: sub fix_deprecated_in_hr {
1491: my ($root) = @_;
1492: my @hrs = $root->getElementsByTagName('hr');
1493: foreach my $hr (@hrs) {
1494: tie (my %new_properties, 'Tie::IxHash', ());
1495: my $align = $hr->getAttribute('align');
1496: if (defined $align) {
1497: $align = lc(trim($align));
1498: if ($align eq 'left') {
1499: $new_properties{'text-align'} = 'left';
1500: $new_properties{'margin-left'} = '0';
1501: } elsif ($align eq 'right') {
1502: $new_properties{'text-align'} = 'right';
1503: $new_properties{'margin-right'} = '0';
1504: }
1505: $hr->removeAttribute('align');
1506: }
1507: my $color = $hr->getAttribute('color');
1508: if (defined $color) {
1509: $color = trim($color);
1510: $color =~ s/^x\s*//;
1511: if ($color ne '') {
1512: $new_properties{'color'} = $color;
1513: $new_properties{'background-color'} = $color;
1514: }
1515: $hr->removeAttribute('color');
1516: }
1517: my $noshade = $hr->getAttribute('noshade');
1518: my $size = $hr->getAttribute('size');
1519: if (defined $noshade) {
1520: $new_properties{'border-width'} = '0';
1521: if (!defined $color) {
1522: $new_properties{'color'} = 'gray';
1523: $new_properties{'background-color'} = 'gray';
1524: }
1525: if (!defined $size) {
1526: $size = '2';
1527: }
1528: $hr->removeAttribute('noshade');
1529: }
1530: if (defined $size) {
1531: $size = trim($size);
1532: if ($size ne '') {
1533: $new_properties{'height'} = $size.'px';
1534: }
1535: if (defined $hr->getAttribute('size')) {
1536: $hr->removeAttribute('size');
1537: }
1538: }
1539: my $width = $hr->getAttribute('width');
1540: if (defined $width) {
1541: $width = trim($width);
1542: if ($width ne '') {
1543: if ($width !~ /\%$/) {
1544: $width .= 'px';
1545: }
1546: $new_properties{'width'} = $width;
1547: }
1548: $hr->removeAttribute('width');
1549: }
1550: if (scalar(keys %new_properties) > 0) {
1551: set_css_properties($hr, \%new_properties);
1552: }
1553: }
1554: }
1555:
1556: # Replaces deprecated attributes in img
1557: sub fix_deprecated_in_img {
1558: my ($root) = @_;
1559: my @imgs = $root->getElementsByTagName('img');
1560: foreach my $img (@imgs) {
1561: my $old_properties = get_css_properties($img);
1562: tie (my %new_properties, 'Tie::IxHash', ());
1563: my $align = $img->getAttribute('align');
1564: if (defined $align) {
1565: $align = lc(trim($align));
1566: if ($align eq 'middle' || $align eq 'top' || $align eq 'bottom') {
1567: $img->removeAttribute('align');
1568: if (!defined $old_properties->{'vertical-align'}) {
1569: $new_properties{'vertical-align'} = $align;
1570: }
1571: } elsif ($align eq 'left' || $align eq 'right') {
1572: $img->removeAttribute('align');
1573: if (!defined $old_properties->{'float'}) {
1574: $new_properties{'float'} = $align;
1575: }
1576: } elsif ($align eq 'center' || $align eq '') {
1577: $img->removeAttribute('align');
1578: }
1579: }
1580: my $border = $img->getAttribute('border');
1581: if (defined $border) {
1582: $border = lc(trim($border));
1583: if ($border =~ /^[0-9]+\s*(px)?$/) {
1584: $img->removeAttribute('border');
1585: if (!defined $old_properties->{'border'}) {
1586: if ($border !~ /px$/) {
1587: $border .= 'px';
1588: }
1589: $new_properties{'border'} = $border.' solid black';
1590: }
1591: }
1592: }
1593: my $hspace = $img->getAttribute('hspace');
1594: if (defined $hspace) {
1595: $hspace = lc(trim($hspace));
1596: if ($hspace =~ /^[0-9]+\s*(px)?$/) {
1597: $img->removeAttribute('hspace');
1598: if (!defined $old_properties->{'margin-left'} || !defined $old_properties->{'margin-right'}) {
1599: if ($hspace !~ /px$/) {
1600: $hspace .= 'px';
1601: }
1602: $new_properties{'margin-left'} = $hspace;
1603: $new_properties{'margin-right'} = $hspace;
1604: }
1605: }
1606: }
1607: if (scalar(keys %new_properties) > 0) {
1608: set_css_properties($img, \%new_properties);
1609: }
1610: }
1611: }
1612:
1613: # Replaces deprecated attributes in htmlbody (the style attribute could be used in a div for output)
1614: sub fix_deprecated_in_body {
1615: my ($root) = @_;
1616: my $doc = $root->ownerDocument;
1617: my @bodies = $root->getElementsByTagName('htmlbody');
1618: foreach my $body (@bodies) {
1619: my $old_properties = get_css_properties($body);
1620: tie (my %new_properties, 'Tie::IxHash', ());
1621: my $bgcolor = $body->getAttribute('bgcolor');
1622: if (defined $bgcolor) {
1623: $body->removeAttribute('bgcolor');
1624: if (!defined $old_properties->{'background-color'}) {
1625: $bgcolor = trim($bgcolor);
1626: $bgcolor =~ s/^x\s*//;
1627: if ($bgcolor ne '') {
1628: $new_properties{'background-color'} = $bgcolor;
1629: }
1630: }
1631: }
1632: my $color = $body->getAttribute('text');
1633: if (defined $color) {
1634: $body->removeAttribute('text');
1635: if (!defined $old_properties->{'color'}) {
1636: $color = trim($color);
1637: $color =~ s/^x\s*//;
1638: if ($color ne '') {
1639: $new_properties{'color'} = $color;
1640: }
1641: }
1642: }
1643: my $background = $body->getAttribute('background');
1644: if (defined $background && ($background =~ /\.jpe?g$|\.gif|\.png/i)) {
1645: $body->removeAttribute('background');
1646: if (!defined $old_properties->{'background-image'}) {
1647: $background = trim($background);
1648: if ($background ne '') {
1649: $new_properties{'background-image'} = 'url('.$background.')';
1650: }
1651: }
1652: }
1653: # NOTE: these attributes have never been standard and are better removed with no replacement
1654: foreach my $bad ('bottommargin', 'leftmargin', 'rightmargin', 'topmargin', 'marginheight', 'marginwidth') {
1655: if ($body->hasAttribute($bad)) {
1656: $body->removeAttribute($bad);
1657: }
1658: }
1659: # NOTE: link alink and vlink require a <style> block to be converted
1660: my $link = $body->getAttribute('link');
1661: my $alink = $body->getAttribute('alink');
1662: my $vlink = $body->getAttribute('vlink');
1663: if (defined $link || defined $alink || defined $vlink) {
1664: my $head;
1665: my @heads = $root->getElementsByTagName('htmlhead');
1666: if (scalar(@heads) > 0) {
1667: $head = $heads[0];
1668: } else {
1669: $head = $doc->createElement('htmlhead');
1670: $root->insertBefore($head, $root->firstChild);
1671: }
1672: my $style = $doc->createElement('style');
1673: $head->appendChild($style);
1674: my $css = "\n";
1675: if (defined $link) {
1676: $body->removeAttribute('link');
1677: $link = trim($link);
1678: $link =~ s/^x\s*//;
1679: $css .= ' a:link { color:'.$link.' }';
1680: $css .= "\n";
1681: }
1682: if (defined $alink) {
1683: $body->removeAttribute('alink');
1684: $alink = trim($alink);
1685: $alink =~ s/^x\s*//;
1686: $css .= ' a:active { color:'.$alink.' }';
1687: $css .= "\n";
1688: }
1689: if (defined $vlink) {
1690: $body->removeAttribute('vlink');
1691: $vlink = trim($vlink);
1692: $vlink =~ s/^x\s*//;
1693: $css .= ' a:visited { color:'.$vlink.' }';
1694: $css .= "\n";
1695: }
1696: $css .= ' ';
1697: $style->appendChild($doc->createTextNode($css));
1698: }
1699: if (scalar(keys %new_properties) > 0) {
1700: set_css_properties($body, \%new_properties);
1701: } elsif (!$body->hasAttributes) {
1702: $body->parentNode->removeChild($body);
1703: }
1704: }
1705: }
1706:
1707: # replaces <div align="center"> by <div style="text-align:center;">
1708: # also for p and h1..h6
1709: sub fix_align_attribute {
1710: my ($root) = @_;
1711: my @nodes = $root->getElementsByTagName('div');
1712: push(@nodes, $root->getElementsByTagName('p'));
1713: for (my $i=1; $i<=6; $i++) {
1714: push(@nodes, $root->getElementsByTagName('h'.$i));
1715: }
1716: foreach my $node (@nodes) {
1717: my $align = $node->getAttribute('align');
1718: if (defined $align) {
1719: $node->removeAttribute('align');
1720: $align = trim($align);
1721: if ($align ne '' && !defined get_css_property($node, 'text-align')) {
1722: set_css_property($node, 'text-align', lc($align));
1723: }
1724: }
1725: }
1726: }
1727:
1728: # replace center by a div or remove it if there is a table inside
1729: sub replace_center {
1730: my ($root, $all_block) = @_;
1731: my $doc = $root->ownerDocument;
1732: my @centers = $root->getElementsByTagName('center');
1733: foreach my $center (@centers) {
1734: if ($center->getChildrenByTagName('table')->size() > 0) { # note: getChildrenByTagName is not DOM (LibXML specific)
1735: replace_by_children($center);
1736: } else {
1737: if ((!defined $center->previousSibling ||
1738: ($center->previousSibling->nodeType == XML_TEXT_NODE && $center->previousSibling->nodeValue =~ /^\s*$/ && !defined $center->previousSibling->previousSibling)) &&
1739: (!defined $center->nextSibling ||
1740: ($center->nextSibling->nodeType == XML_TEXT_NODE && $center->nextSibling->nodeValue =~ /^\s*$/ && !defined $center->nextSibling->nextSibling)) &&
1741: string_in_array(\@accepting_style, $center->parentNode->nodeName)) {
1742: # use CSS on the parent block and replace center by its children
1743: set_css_property($center->parentNode, 'text-align', 'center');
1744: replace_by_children($center);
1745: } else {
1746: # use p or div ? check if there is a block inside
1747: my $found_block = 0;
1748: for (my $child=$center->firstChild; defined $child; $child=$child->nextSibling) {
1749: if ($child->nodeType == XML_ELEMENT_NODE && string_in_array($all_block, $child->nodeName)) {
1750: $found_block = 1;
1751: last;
1752: }
1753: }
1754: my $new_node;
1755: if ($found_block) {
1756: $new_node = $doc->createElement('div');
1757: $new_node->setAttribute('style', 'text-align: center; margin: 0 auto');
1758: } else {
1759: $new_node = $doc->createElement('p');
1760: $new_node->setAttribute('style', 'text-align: center');
1761: }
1762: my $next;
1763: for (my $child=$center->firstChild; defined $child; $child=$next) {
1764: $next = $child->nextSibling;
1765: $center->removeChild($child);
1766: $new_node->appendChild($child);
1767: }
1768: $center->parentNode->replaceChild($new_node, $center);
1769: }
1770: }
1771: }
1772: }
1773:
1774: # replaces <nobr> by <span style="white-space:nowrap">
1775: sub replace_nobr {
1776: my ($root) = @_;
1777: my @nobrs = $root->getElementsByTagName('nobr');
1778: foreach my $nobr (@nobrs) {
1779: if (!defined $nobr->previousSibling && !defined $nobr->nextSibling &&
1780: string_in_array(\@accepting_style, $nobr->parentNode->nodeName)) {
1781: # use CSS on the parent block
1782: set_css_property($nobr->parentNode, 'white-space', 'nowrap');
1783: replace_by_children($nobr);
1784: } else {
1785: $nobr->setNodeName('span');
1786: $nobr->setAttribute('style', 'white-space:nowrap');
1787: }
1788: }
1789: }
1790:
1791: # removes notsolved tags in the case <hintgroup showoncorrect="no"><notsolved>...</notsolved></hintgroup>
1792: # and in the case <notsolved><hintgroup showoncorrect="no">...</hintgroup></notsolved>
1793: sub remove_useless_notsolved {
1794: my ($root) = @_;
1795: my @hintgroups = $root->getElementsByTagName('hintgroup');
1796: foreach my $hintgroup (@hintgroups) {
1797: my $showoncorrect = get_non_empty_attribute($hintgroup, 'showoncorrect');
1798: if (!defined $showoncorrect || $showoncorrect eq 'no') {
1799: my @notsolveds = $hintgroup->getElementsByTagName('notsolved');
1800: foreach my $notsolved (@notsolveds) {
1801: replace_by_children($notsolved);
1802: }
1803: }
1804: my $parent = $hintgroup->parentNode;
1805: if ($parent->nodeName eq 'notsolved' && scalar(@{$parent->nonBlankChildNodes()}) == 1) {
1806: replace_by_children($parent);
1807: }
1808: }
1809: }
1810:
1811: # adds a paragraph inside if needed and calls fix_paragraph for all paragraphs (including new ones)
1812: sub fix_paragraphs_inside {
1813: my ($node, $all_block) = @_;
1814: # blocks in which paragrahs will be added:
1.5 ! damieng 1815: my @blocks_with_p = ('loncapa','library','problem','part','problemtype','window','block','while','postanswerdate','preduedate','solved','notsolved','languageblock','instructorcomment','togglebox','standalone','body','form');
1.1 damieng 1816: my @fix_p_if_br_or_p = (@responses,'foil','item','text','label','hintgroup','hintpart','hint','web','windowlink','div','li','dd','td','th','blockquote');
1817: if ((string_in_array(\@blocks_with_p, $node->nodeName) && paragraph_needed($node)) ||
1818: (string_in_array(\@fix_p_if_br_or_p, $node->nodeName) && paragraph_inside($node))) {
1819: # if non-empty, add paragraphs where needed between all br and remove br
1820: # (it would be easier to just put everything in a p and fix it afterwards, but there are performance issues
1821: # when a paragraph has many blocks directly inside)
1822: my $doc = $node->ownerDocument;
1823: my $p = undef;
1824: my @new_children = ();
1825: my $next;
1826: for (my $child=$node->firstChild; defined $child; $child=$next) {
1827: $next = $child->nextSibling;
1828: $node->removeChild($child);
1829: if ($child->nodeType == XML_ELEMENT_NODE && $child->nodeName eq 'br') {
1830: if (defined $p) {
1831: push(@new_children, $p);
1832: } else {
1833: push(@new_children, $doc->createElement('p'));
1834: }
1835: $p = undef;
1836: } elsif ($child->nodeType == XML_ELEMENT_NODE && string_in_array(\@inline_like_block, $child->nodeName)) {
1837: # inline_like_block: use the paragraph if there is one, otherwise do not create one
1838: if (defined $p) {
1839: $p->appendChild($child);
1840: } else {
1841: push(@new_children, $child);
1842: }
1843: } elsif ($child->nodeType == XML_ELEMENT_NODE && string_in_array($all_block, $child->nodeName)) {
1844: # these children are blocks and should not be in a paragraph
1845: if (defined $p) {
1846: push(@new_children, $p);
1847: $p = undef;
1848: }
1849: push(@new_children, $child);
1850: } elsif ($child->nodeType == XML_TEXT_NODE && $child->nodeValue =~ /^[ \t\f\n\r]*$/) {
1851: # blank text: add to paragraph if there is one and there is a next node, otherwise keep out of the paragraph
1852: if (defined $p) {
1853: if (defined $next) {
1854: $p->appendChild($child);
1855: } else {
1856: push(@new_children, $p);
1857: $p = undef;
1858: push(@new_children, $child);
1859: }
1860: } else {
1861: push(@new_children, $child);
1862: }
1863: } elsif ($child->nodeType == XML_TEXT_NODE ||
1864: $child->nodeType == XML_ELEMENT_NODE || $child->nodeType == XML_CDATA_SECTION_NODE ||
1865: $child->nodeType == XML_ENTITY_NODE || $child->nodeType == XML_ENTITY_REF_NODE) {
1866: # these children require a paragraph
1867: if (!defined $p) {
1868: $p = $doc->createElement('p');
1869: }
1870: $p->appendChild($child);
1871: } else {
1872: # these children do not require a paragraph (XML comments, PI)
1873: # -> do not move them in a new paragraph
1874: if (defined $p) {
1875: push(@new_children, $p);
1876: $p = undef;
1877: }
1878: push(@new_children, $child);
1879: }
1880: }
1881: if (defined $p) {
1882: push(@new_children, $p);
1883: }
1884: foreach my $child (@new_children) {
1885: $node->appendChild($child);
1886: }
1887: }
1888: # now fix the paragraphs everywhere, so that all inline nodes are inside a paragraph, and block nodes are outside
1889: my $next;
1890: for (my $child=$node->firstChild; defined $child; $child=$next) {
1891: $next = $child->nextSibling;
1892: if ($child->nodeType == XML_ELEMENT_NODE && defined $child->firstChild) {
1893: if ($child->nodeName eq 'p') {
1894: fix_paragraph($child, $all_block);
1895: } else {
1896: fix_paragraphs_inside($child, $all_block);
1897: }
1898: }
1899: }
1900: }
1901:
1902: # returns 1 if a paragraph is needed inside this node (assuming the parent can have paragraphs)
1903: sub paragraph_needed {
1904: my ($node) = @_;
1905: for (my $child=$node->firstChild; defined $child; $child=$child->nextSibling) {
1906: if (($child->nodeType == XML_TEXT_NODE && $child->nodeValue !~ /^\s*$/) ||
1907: ($child->nodeType == XML_ELEMENT_NODE && !string_in_array(\@inline_like_block, $child->nodeName)) ||
1908: $child->nodeType == XML_CDATA_SECTION_NODE ||
1909: $child->nodeType == XML_ENTITY_NODE || $child->nodeType == XML_ENTITY_REF_NODE) {
1910: return(1);
1911: }
1912: }
1913: return(0);
1914: }
1915:
1916: # returns 1 if there is a paragraph or br in a child of this node, or inside an inline child
1917: sub paragraph_inside {
1918: my ($node) = @_;
1919: # inline elements that can be split in half if there is a paragraph inside (currently all HTML):
1920: # (also used in first_block below)
1921: my @splitable_inline = ('span', 'a', 'strong', 'em' , 'b', 'i', 'sup', 'sub', 'code', 'kbd', 'samp', 'tt', 'ins', 'del', 'var', 'small', 'big', 'font', 'u');
1922: for (my $child=$node->firstChild; defined $child; $child=$child->nextSibling) {
1923: if ($child->nodeType == XML_ELEMENT_NODE) {
1924: my $name = $child->nodeName;
1925: if ($name eq 'p' || $name eq 'br') {
1926: return(1);
1927: } elsif (string_in_array(\@splitable_inline, $name)) {
1928: if (paragraph_inside($child)) {
1929: return(1);
1930: }
1931: }
1932: }
1933: }
1934: return(0);
1935: }
1936:
1937: # fixes paragraphs inside paragraphs (without a block in-between)
1938: sub fix_paragraph {
1939: my ($p, $all_block) = @_;
1940: my $loop_right = 1; # this loops is to avoid out of memory errors with recurse, see below
1941: while ($loop_right) {
1942: $loop_right = 0;
1943: my $block = find_first_block($p, $all_block);
1944: if (defined $block) {
1945: my $trees = clone_ancestor_around_node($p, $block);
1946: my $doc = $p->ownerDocument;
1947: my $replacement = $doc->createDocumentFragment();
1948: my $left = $trees->{'left'};
1949: my $middle = $trees->{'middle'};
1950: my $right = $trees->{'right'};
1951: my $left_needs_p = 0; # 1 if it needs a paragraph (used to replace br later)
1952:
1953: if (defined $left) {
1954: # fix paragraphs inside, in case one of the descendants can have paragraphs inside (like numericalresponse/hintgroup):
1955: for (my $child=$left->firstChild; defined $child; $child=$child->nextSibling) {
1956: if ($child->nodeType == XML_ELEMENT_NODE) {
1957: fix_paragraphs_inside($child, $all_block);
1958: }
1959: }
1960: if (!paragraph_needed($left)) {
1961: # this was just blank text, comments or inline responses, it should not create a new paragraph
1962: my $next;
1963: for (my $child=$left->firstChild; defined $child; $child=$next) {
1964: $next = $child->nextSibling;
1965: $left->removeChild($child);
1966: $replacement->appendChild($child);
1967: }
1968: } else {
1969: $left_needs_p = 1;
1970: $replacement->appendChild($left);
1971: }
1972: }
1973:
1974: my $n = $middle->firstChild;
1975: while (defined $n) {
1976: if ($n->nodeType == XML_ELEMENT_NODE && (string_in_array($all_block, $n->nodeName) || $n->nodeName eq 'br')) {
1977: if ($n->nodeName eq 'p') {
1978: my $parent = $n->parentNode;
1979: # first apply recursion
1980: fix_paragraph($n, $all_block);
1981: # now the p might have been replaced by several nodes, which should replace the initial p
1982: my $next_block;
1983: for (my $block=$parent->firstChild; defined $block; $block=$next_block) {
1984: $next_block = $block->nextSibling;
1985: if ($block->nodeName eq 'p') {
1986: $parent->removeChild($block);
1987: # for each parent before $middle, clone in-between the p and its children (to preserve the styles)
1988: if (defined $block->firstChild) {
1989: for (my $p=$parent; $p!=$middle; $p=$p->parentNode) {
1990: my $newp = $p->cloneNode(0);
1991: my $next;
1992: for (my $child=$block->firstChild; defined $child; $child=$next) {
1993: $next = $child->nextSibling;
1994: $block->removeChild($child);
1995: $newp->appendChild($child);
1996: }
1997: $block->appendChild($newp);
1998: }
1999: }
2000: }
2001: $replacement->appendChild($block);
2002: }
2003: } else {
2004: # replace the whole p by this block, forgetting about intermediate inline elements
2005: $n->parentNode->removeChild($n);
2006: if ($n->nodeName eq 'br') {
2007: # replace a br by a paragraph if there was nothing before in the paragraph,
2008: # otherwise remove it because it already broke the paragraph in half
2009: if (!defined $left || !$left_needs_p) {
2010: $replacement->appendChild($middle);
2011: }
2012: } else {
2013: fix_paragraphs_inside($n, $all_block);
2014: $replacement->appendChild($n);
2015: }
2016: }
2017: last;
2018: }
2019: $n = $n->firstChild;
2020: if (defined $n && defined $n->nextSibling) {
2021: die "Error in post_xml.fix_paragraph: block not found";
2022: }
2023: }
2024:
2025: if (defined $right) {
2026: if ($block->nodeName eq 'p') {
2027: # remove attributes on the right paragraph
2028: my @attributelist = $right->attributes();
2029: foreach my $att (@attributelist) {
2030: $right->removeAttribute($att->nodeName);
2031: }
2032: }
2033: if ($right->firstChild->nodeType == XML_TEXT_NODE && $right->firstChild->nodeValue =~ /^[ \t\f\n\r]*$/) {
2034: # remove the first text node with whitespace only from the p, it should not trigger the creation of a p
2035: # (but take nbsp into account, so we should not use \s here)
2036: my $first = $right->firstChild;
2037: $right->removeChild($first);
2038: $replacement->appendChild($first);
2039: }
2040: if (defined $right->firstChild) {
2041: if (paragraph_needed($right)) {
2042: $replacement->appendChild($right);
2043: #fix_paragraph($right, $all_block); This is taking way too much memory for blocks with many children
2044: # -> loop instead of recurse
2045: $loop_right = 1;
2046: } else {
2047: # this was just blank text, comments or inline responses, it should not create a new paragraph
2048: my $next;
2049: for (my $child=$right->firstChild; defined $child; $child=$next) {
2050: $next = $child->nextSibling;
2051: $right->removeChild($child);
2052: $replacement->appendChild($child);
2053: # fix paragraphs inside, in case one of the descendants can have paragraphs inside (like numericalresponse/hintgroup):
2054: if ($child->nodeType == XML_ELEMENT_NODE) {
2055: fix_paragraphs_inside($child, $all_block);
2056: }
2057: }
2058: }
2059: }
2060: }
2061:
2062: $p->parentNode->replaceChild($replacement, $p);
2063:
2064: if ($loop_right) {
2065: $p = $right;
2066: }
2067:
2068: } else {
2069: # fix paragraphs inside, in case one of the descendants can have paragraphs inside (like numericalresponse/hintgroup):
2070: my $next;
2071: for (my $child=$p->firstChild; defined $child; $child=$next) {
2072: $next = $child->nextSibling;
2073: if ($child->nodeType == XML_ELEMENT_NODE) {
2074: fix_paragraphs_inside($child, $all_block);
2075: }
2076: }
2077: }
2078: }
2079: }
2080:
2081: sub find_first_block {
2082: my ($node, $all_block) = @_;
2083: # inline elements that can be split in half if there is a paragraph inside (currently all HTML):
2084: my @splitable_inline = ('span', 'a', 'strong', 'em' , 'b', 'i', 'sup', 'sub', 'code', 'kbd', 'samp', 'tt', 'ins', 'del', 'var', 'small', 'big', 'font', 'u');
2085: for (my $child=$node->firstChild; defined $child; $child=$child->nextSibling) {
2086: if ($child->nodeType == XML_ELEMENT_NODE) {
2087: if (string_in_array($all_block, $child->nodeName) || $child->nodeName eq 'br') {
2088: return($child);
2089: }
2090: if (string_in_array(\@splitable_inline, $child->nodeName)) {
2091: my $block = find_first_block($child, $all_block);
2092: if (defined $block) {
2093: return($block);
2094: }
2095: }
2096: }
2097: }
2098: return(undef);
2099: }
2100:
2101: # Creates clones of the ancestor containing the descendants before the node, at the node, and after the node.
2102: # returns a hash with: left, middle, right (left and right can be undef)
2103: sub clone_ancestor_around_node {
2104: my ($ancestor, $node) = @_;
2105: my $middle_node;
2106: my ($left, $middle, $right);
2107: for (my $child=$ancestor->firstChild; defined $child; $child=$child->nextSibling) {
2108: if ($child == $node || is_ancestor_of($child, $node)) {
2109: $middle_node = $child;
2110: last;
2111: }
2112: }
2113: if (!defined $middle_node) {
2114: die "error in split_ancestor_around_node: middle not found";
2115: }
2116: if (defined $middle_node->previousSibling) {
2117: $left = $ancestor->cloneNode(0);
2118: for (my $child=$ancestor->firstChild; $child != $middle_node; $child=$child->nextSibling) {
2119: $left->appendChild($child->cloneNode(1));
2120: }
2121: }
2122: $middle = $ancestor->cloneNode(0);
2123: if ($middle_node == $node) {
2124: $middle->appendChild($middle_node->cloneNode(1));
2125: } else {
2126: my $subres = clone_ancestor_around_node($middle_node, $node);
2127: my $subleft = $subres->{'left'};
2128: if (defined $subleft) {
2129: if (!defined $left) {
2130: $left = $ancestor->cloneNode(0);
2131: }
2132: $left->appendChild($subleft);
2133: }
2134: $middle->appendChild($subres->{'middle'});
2135: my $subright = $subres->{'right'};
2136: if (defined $subright) {
2137: $right = $ancestor->cloneNode(0);
2138: $right->appendChild($subright);
2139: }
2140: }
2141: if (defined $middle_node->nextSibling) {
2142: if (!defined $right) {
2143: $right = $ancestor->cloneNode(0);
2144: }
2145: for (my $child=$middle_node->nextSibling; defined $child; $child=$child->nextSibling) {
2146: $right->appendChild($child->cloneNode(1));
2147: }
2148: }
2149: my %result = ();
2150: $result{'left'} = $left;
2151: $result{'middle'} = $middle;
2152: $result{'right'} = $right;
2153: return(\%result);
2154: }
2155:
2156: sub is_ancestor_of {
2157: my ($n1, $n2) = @_;
2158: my $n = $n2->parentNode;
2159: while (defined $n) {
2160: if ($n == $n1) {
2161: return(1);
2162: }
2163: $n = $n->parentNode;
2164: }
2165: return(0);
2166: }
2167:
2168: # removes empty style elements and replaces the ones with only whitespaces inside by their content
2169: # also remove hints that have become empty after empty style removal.
2170: sub remove_empty_style {
2171: my ($root) = @_;
2172: # actually, preserve some elements like ins when they have whitespace, only remove if they are empty
2173: my @remove_if_empty = ('span', 'strong', 'em' , 'b', 'i', 'sup', 'sub', 'code', 'kbd', 'samp', 'tt', 'ins', 'del', 'var', 'small', 'big', 'font', 'u', 'hint');
2174: my @remove_if_blank = ('span', 'strong', 'em' , 'b', 'i', 'sup', 'sub', 'tt', 'var', 'small', 'big', 'font', 'u', 'hint');
2175: foreach my $name (@remove_if_empty) {
2176: my @nodes = $root->getElementsByTagName($name);
2177: while (scalar(@nodes) > 0) {
2178: my $node = pop(@nodes);
2179: if (!defined $node->firstChild) {
2180: my $parent = $node->parentNode;
2181: if (defined $node->previousSibling && $node->previousSibling->nodeType == XML_TEXT_NODE &&
2182: $node->previousSibling->nodeValue =~ /\$\S*$/) {
2183: # case $a<sup></sup>x
2184: my $value = $node->previousSibling->nodeValue;
2185: $value =~ s/\$(\S*)$/\$\{$1\}/;
2186: $node->previousSibling->setData($value);
2187: }
2188: $parent->removeChild($node);
2189: $parent->normalize();
2190: # now that we removed the node, check if the parent has become an empty style, and so on
2191: while (defined $parent && string_in_array(\@remove_if_empty, $parent->nodeName) && !defined $parent->firstChild) {
2192: my $grandparent = $parent->parentNode;
2193: $grandparent->removeChild($parent);
2194: remove_reference_from_array(\@nodes, $parent);
2195: $parent = $grandparent;
2196: }
2197: }
2198: }
2199: }
2200: foreach my $name (@remove_if_blank) {
2201: my @nodes = $root->getElementsByTagName($name);
2202: while (scalar(@nodes) > 0) {
2203: my $node = pop(@nodes);
2204: if (defined $node->firstChild && !defined $node->firstChild->nextSibling && $node->firstChild->nodeType == XML_TEXT_NODE) {
2205: # NOTE: careful, with UTF-8, \s matches non-breaking spaces and we want to preserve these
2206: if ($node->firstChild->nodeValue =~ /^[\t\n\f\r ]*$/) {
2207: my $parent = $node->parentNode;
2208: replace_by_children($node);
2209: $parent->normalize();
2210: # now that we removed the node, check if the parent has become a style with only whitespace, and so on
2211: while (defined $parent && string_in_array(\@remove_if_blank, $parent->nodeName) &&
2212: (!defined $parent->firstChild ||
2213: (!defined $parent->firstChild->nextSibling && $parent->firstChild->nodeType == XML_TEXT_NODE &&
2214: $parent->firstChild->nodeValue =~ /^^[\t\n\f\r ]*$/))) {
2215: my $grandparent = $parent->parentNode;
2216: replace_by_children($parent);
2217: remove_reference_from_array(\@nodes, $parent);
2218: $parent = $grandparent;
2219: }
2220: }
2221: }
2222: }
2223: }
2224: }
2225:
2226: # remove whitespace inside LON-CAPA elements that have an empty content-model (HTML ones are handled by html_to_xml)
2227: sub fix_empty_lc_elements {
2228: my ($node) = @_;
2229: my @lcempty = ('arc','axis','backgroundplot','drawoptionlist','drawvectorsum','fill','functionplotrule','functionplotvectorrule','functionplotvectorsumrule','hiddenline','hiddensubmission','key','line','location','organicstructure','parameter','plotobject','plotvector','responseparam','spline','textline');
2230: if (string_in_array(\@lcempty, $node->nodeName)) {
2231: if (defined $node->firstChild && !defined $node->firstChild->nextSibling &&
2232: $node->firstChild->nodeType == XML_TEXT_NODE && $node->firstChild->nodeValue =~ /^\s*$/) {
2233: $node->removeChild($node->firstChild);
2234: }
2235: if (defined $node->firstChild) {
2236: if ($warnings) {
2237: print "Warning: a ".$node->nodeName." has something inside\n";
2238: }
2239: }
2240: return;
2241: }
2242: for (my $child=$node->firstChild; defined $child; $child=$child->nextSibling) {
2243: if ($child->nodeType == XML_ELEMENT_NODE) {
2244: fix_empty_lc_elements($child);
2245: }
2246: }
2247: }
2248:
2249: # turn some attribute values into lowercase when they should be
2250: sub lowercase_attribute_values {
2251: my ($root) = @_;
2252: my @with_yesno = (['radiobuttonresponse', ['randomize']],
2253: ['optionresponse', ['randomize']],
2254: ['matchresponse', ['randomize']],
2255: ['itemgroup', ['randomize']],
2256: ['rankresponse', ['randomize']],
2257: ['functionplotresponse', ['xaxisvisible', 'yaxisvisible', 'gridvisible']],
2258: ['backgroundplot', ['fixed']],
2259: ['drawvectorsum', ['showvalue']],
2260: ['textline', ['readonly']],
2261: ['hint', ['showoncorrect']],
2262: ['body', ['dir']],
2263: ['img', ['encrypturl']],
2264: ['form', ['method']],
2265: ['input', ['type']]
2266: );
2267: foreach my $el_attributes (@with_yesno) {
2268: my $el_name = $el_attributes->[0];
2269: my @elements = $root->getElementsByTagName($el_name);
2270: foreach my $element (@elements) {
2271: my $att_list = $el_attributes->[1];
2272: foreach my $att_name (@$att_list) {
2273: my $att_value = $element->getAttribute($att_name);
2274: if (!defined $att_value) {
2275: next;
2276: }
2277: if ($att_value eq 'yes' || $att_value eq 'no') {
2278: next;
2279: }
2280: if ($att_value =~ /\s*yes\s*/i) {
2281: $element->setAttribute($att_name, 'yes');
2282: } elsif ($att_value =~ /\s*no\s*/i) {
2283: $element->setAttribute($att_name, 'no');
2284: }
2285: }
2286: }
2287: }
2288: }
2289:
2290: # fixes spelling mistakes for numericalresponse/@unit
2291: sub replace_numericalresponse_unit_attribute {
2292: my ($root) = @_;
2293: my @numericalresponses = $root->getElementsByTagName('numericalresponse');
2294: foreach my $numericalresponse (@numericalresponses) {
2295: if (defined $numericalresponse->getAttribute('units') && !defined $numericalresponse->getAttribute('unit')) {
2296: $numericalresponse->setAttribute('unit', $numericalresponse->getAttribute('units'));
2297: $numericalresponse->removeAttribute('units');
2298: }
2299: }
2300:
2301: }
2302:
2303: # Replaces &format and &prettyprint by <num> whenever possible.
2304: # Also replaces &chemparse by <chem>.
2305: # If the function call is enclosed in <display>, the <display> element is removed.
2306: sub replace_functions_by_elements {
2307: my ($root) = @_;
2308: my $doc = $root->ownerDocument;
2309: my @preserve = ('script','answer','parse','m','tm','dtm','numericalhintscript'); # display is handled later
2310: my @all = $root->getElementsByTagName('*');
2311: foreach my $element (@all) {
2312: if (string_in_array(\@preserve, $element->nodeName)) {
2313: next;
2314: }
2315: my $changed = 0;
2316: my $next;
2317: for (my $child=$element->firstChild; defined $child; $child=$next) {
2318: $next = $child->nextSibling;
2319: if ($child->nodeType == XML_TEXT_NODE) {
2320: my $value = $child->nodeValue;
2321: if ($value =~ /^(.*)&(?:format|prettyprint)\((\$\{?[a-zA-Z0-9]*\}?(?:\[[^\]]*\])?|[0-9.]+)\s?,\s?(["'][,.\$]?[0-9][eEfFgGsS]["']|\$[a-zA-Z0-9]*)\)(.*)$/s) {
2322: # NOTE: we don't check for &prettyprint's 3rd argument (target), but it has not been seen outside of script elements.
2323: # NOTE: the format options ',' and '$' are not supported by &format in current LON-CAPA since rev 1.81 of default_homework.lcpm,
2324: # but are supported by &prettyprint;
2325: # if we use (like current LON-CAPA) &prettyprint for <num> implementation, it will change a few resulting documents
2326: # (by making them display something they were probably intended to display, but which did not).
2327: # Usage of <num> with &prettyprint instead of &format might also change the display when there is an exponent.
2328: my $before = $1;
2329: my $number = $2;
2330: my $format = $3;
2331: my $after = $4;
2332: $format =~ s/^['"]|['"]$//g;
2333: # do not change this if the parent is <display> and there are other things before or after &format
2334: if ($element->nodeName eq 'display' && (defined $child->previousSibling || defined $next ||
2335: $before !~ /^\s*$/ || $after !~ /^\s*$/)) {
2336: last;
2337: }
2338: my $replacement = $doc->createDocumentFragment();
2339: my $num = $doc->createElement('num');
2340: $num->setAttribute('format', $format);
2341: $num->appendChild($doc->createTextNode($number));
2342: if (length($before) > 0) {
2343: $replacement->appendChild($doc->createTextNode($before));
2344: }
2345: $replacement->appendChild($num);
2346: if (length($after) > 0) {
2347: $replacement->appendChild($doc->createTextNode($after));
2348: }
2349: $element->replaceChild($replacement, $child);
2350: $changed = 1;
2351: $next = $element->firstChild; # start over, there might be another &format in the same text node
2352: } elsif ($value =~ /^(.*)&chemparse\(([^'"()]*|'[^']*'|"[^"]*")\)(.*)$/s) {
2353: my $before = $1;
2354: my $reaction = $2;
2355: my $after = $3;
2356: $reaction =~ s/^'(.*)'$/$1/;
2357: $reaction =~ s/^"(.*)"$/$1/;
2358: if ($element->nodeName eq 'display' && (defined $child->previousSibling || defined $next ||
2359: $before !~ /^\s*$/ || $after !~ /^\s*$/)) {
2360: last;
2361: }
2362: my $replacement = $doc->createDocumentFragment();
2363: my $chem = $doc->createElement('chem');
2364: $chem->appendChild($doc->createTextNode($reaction));
2365: if (length($before) > 0) {
2366: $replacement->appendChild($doc->createTextNode($before));
2367: }
2368: $replacement->appendChild($chem);
2369: if (length($after) > 0) {
2370: $replacement->appendChild($doc->createTextNode($after));
2371: }
2372: $element->replaceChild($replacement, $child);
2373: $changed = 1;
2374: $next = $element->firstChild;
2375: }
2376: }
2377: }
2378: if ($changed && $element->nodeName eq 'display') {
2379: my $first = $element->firstChild;
2380: if ($first->nodeType == XML_ELEMENT_NODE && string_in_array(['num','chem'], $first->nodeName) &&
2381: !defined $first->nextSibling) {
2382: # remove useless display element
2383: replace_by_children($element);
2384: }
2385: }
2386: }
2387: }
2388:
2389: # pretty-print using im-memory DOM tree
2390: sub pretty {
2391: my ($node, $all_block, $indent_level) = @_;
2392: my $doc = $node->ownerDocument;
2393: $indent_level ||= 0;
2394: my $type = $node->nodeType;
2395: if ($type == XML_ELEMENT_NODE) {
2396: my $name = $node->nodeName;
2397: if ((string_in_array($all_block, $name) || string_in_array(\@inline_like_block, $name)) &&
2398: !string_in_array(\@preserve_elements, $name)) {
2399: # make sure there is a newline at the beginning and at the end if there is anything inside
2400: if (defined $node->firstChild && !string_in_array(\@no_newline_inside, $name)) {
2401: my $first = $node->firstChild;
2402: if ($first->nodeType == XML_TEXT_NODE) {
2403: my $text = $first->nodeValue;
2404: if ($text !~ /^ *\n/) {
2405: $first->setData("\n" . $text);
2406: }
2407: } else {
2408: $node->insertBefore($doc->createTextNode("\n"), $first);
2409: }
2410: my $last = $node->lastChild;
2411: if ($last->nodeType == XML_TEXT_NODE) {
2412: my $text = $last->nodeValue;
2413: if ($text !~ /\n *$/) {
2414: $last->setData($text . "\n");
2415: }
2416: } else {
2417: $node->appendChild($doc->createTextNode("\n"));
2418: }
2419: }
2420:
2421: # indent and make sure there is a newline before and after a block element
2422: my $newline_indent = "\n".(' ' x (2*($indent_level + 1)));
2423: my $newline_indent_last = "\n".(' ' x (2*$indent_level));
2424: my $next;
2425: for (my $child=$node->firstChild; defined $child; $child=$next) {
2426: $next = $child->nextSibling;
2427: if ($child->nodeType == XML_ELEMENT_NODE) {
2428: if (string_in_array($all_block, $child->nodeName) || string_in_array(\@inline_like_block, $child->nodeName)) {
2429: # make sure there is a newline before and after a block element
2430: if (defined $child->previousSibling && $child->previousSibling->nodeType == XML_TEXT_NODE) {
2431: my $prev = $child->previousSibling;
2432: my $text = $prev->nodeValue;
2433: if ($text !~ /\n *$/) {
2434: $prev->setData($text . $newline_indent);
2435: }
2436: } else {
2437: $node->insertBefore($doc->createTextNode($newline_indent), $child);
2438: }
2439: if (defined $next && $next->nodeType == XML_TEXT_NODE) {
2440: my $text = $next->nodeValue;
2441: if ($text !~ /^ *\n/) {
2442: $next->setData($newline_indent . $text);
2443: }
2444: } else {
2445: $node->insertAfter($doc->createTextNode($newline_indent), $child);
2446: }
2447: }
2448: pretty($child, $all_block, $indent_level+1);
2449: } elsif ($child->nodeType == XML_TEXT_NODE) {
2450: my $text = $child->nodeValue;
2451: # collapse newlines
2452: $text =~ s/\n([\t ]*\n)+/\n/g;
2453: # indent and remove spaces and tabs before newlines
2454: if (defined $next) {
2455: $text =~ s/[\t ]*\n[\t ]*/$newline_indent/ge;
2456: } else {
2457: $text =~ s/[\t ]*\n[\t ]*/$newline_indent/ge;
2458: $text =~ s/[\t ]*\n[\t ]*$/$newline_indent_last/e;
2459: }
2460: $child->setData($text);
2461: }
2462: }
2463:
2464: # removes whitespace at the beginning and end of p td, th and li (except for nbsp at the beginning)
2465: my @to_trim = ('p','td','th','li');
2466: if (string_in_array(\@to_trim, $name) && defined $node->firstChild && $node->firstChild->nodeType == XML_TEXT_NODE) {
2467: my $text = $node->firstChild->nodeValue;
2468: $text =~ s/^[ \t\f\n\r]*//;
2469: if ($text eq '') {
2470: $node->removeChild($node->firstChild);
2471: } else {
2472: $node->firstChild->setData($text);
2473: }
2474: }
2475: if (string_in_array(\@to_trim, $name) && defined $node->lastChild && $node->lastChild->nodeType == XML_TEXT_NODE) {
2476: my $text = $node->lastChild->nodeValue;
2477: $text =~ s/\s*$//;
2478: if ($text eq '') {
2479: $node->removeChild($node->lastChild);
2480: } else {
2481: $node->lastChild->setData($text);
2482: }
2483: }
2484: } elsif (string_in_array(\@preserve_elements, $name)) {
2485: # collapse newlines at the beginning and the end of scripts
2486: if (defined $node->firstChild && $node->firstChild->nodeType == XML_TEXT_NODE) {
2487: my $text = $node->firstChild->nodeValue;
2488: $text =~ s/^\n( *\n)+/\n/;
2489: if ($text eq '') {
2490: $node->removeChild($node->firstChild);
2491: } else {
2492: $node->firstChild->setData($text);
2493: }
2494: }
2495: if (defined $node->lastChild && $node->lastChild->nodeType == XML_TEXT_NODE) {
2496: my $text = $node->lastChild->nodeValue;
2497: $text =~ s/\n( *\n)+$/\n/;
2498: if ($text eq '') {
2499: $node->removeChild($node->lastChild);
2500: } else {
2501: $node->lastChild->setData($text);
2502: }
2503: }
2504: }
2505: }
2506: }
2507:
2508: sub replace_tm_dtm {
2509: my ($root) = @_;
2510: my $doc = $root->ownerDocument;
2511: my @elements = $root->getElementsByTagName('tm');
2512: push(@elements, $root->getElementsByTagName('dtm'));
2513: foreach my $element (@elements) {
2514: my $first = $element->firstChild;
2515: if (defined $first && $first->nodeType == XML_TEXT_NODE) {
2516: my $text = $first->nodeValue;
2517: if ($element->nodeName eq 'tm') {
2518: $first->setData('$'.$text.'$');
2519: } else {
2520: $first->setData('$$'.$text.'$$');
2521: }
2522: }
2523: $element->setNodeName('m');
2524: }
2525: }
2526:
2527:
2528: ######## utilities ########
2529:
2530: ##
2531: # Trims a string (really, this should be built-in in Perl, this is ridiculous, ugly and slow)
2532: # @param {string} s - the string to trim
2533: # @returns the trimmed string
2534: ##
2535: sub trim {
2536: my ($s) = @_;
2537: $s =~ s/^\s+//;
2538: $s =~ s/\s+$//;
2539: return($s);
2540: }
2541:
2542: ##
2543: # Tests if a string is in an array (using eq) (to avoid Smartmatch warnings with $value ~~ @array)
2544: # @param {Array<string>} array - reference to the array of strings
2545: # @param {string} value - the string to look for
2546: # @returns 1 if found, 0 otherwise
2547: ##
2548: sub string_in_array {
2549: my ($array, $value) = @_;
2550: # NOTE: would it be faster to use "any { $value eq $_ } @{$array}" from List::Util ?
2551: foreach my $v (@{$array}) {
2552: if ($v eq $value) {
2553: return 1;
2554: }
2555: }
2556: return 0;
2557: }
2558:
2559: ##
2560: # Tests if an object is in an array (using ==)
2561: # @param {Array<Object>} array - reference to the array of references
2562: # @param {Object} ref - the reference to look for
2563: # @returns 1 if found, 0 otherwise
2564: ##
2565: sub reference_in_array {
2566: my ($array, $ref) = @_;
2567: foreach my $v (@{$array}) {
2568: if ($v == $ref) {
2569: return 1;
2570: }
2571: }
2572: return 0;
2573: }
2574:
2575: ##
2576: # returns the index of a string in an array
2577: # @param {Array<Object>} array - reference to the array of strings
2578: # @param {string} s - the string to look for (using eq)
2579: # @returns the index if found, -1 otherwise
2580: ##
2581: sub index_of_string {
2582: my ($array, $s) = @_;
2583: for (my $i=0; $i<scalar(@{$array}); $i++) {
2584: if ($array->[$i] eq $s) {
2585: return $i;
2586: }
2587: }
2588: return -1;
2589: }
2590:
2591: ##
2592: # returns the index of a reference in an array
2593: # @param {Array<Object>} array - reference to the array of references
2594: # @param {Object} ref - the reference to look for
2595: # @returns the index if found, -1 otherwise
2596: ##
2597: sub index_of_reference {
2598: my ($array, $ref) = @_;
2599: for (my $i=0; $i<scalar(@{$array}); $i++) {
2600: if ($array->[$i] == $ref) {
2601: return $i;
2602: }
2603: }
2604: return -1;
2605: }
2606:
2607: ##
2608: # if found, removes a string from an array, otherwise do nothing
2609: # @param {Array<string>} array - reference to the array of string
2610: # @param {string} s - the string to look for (using eq)
2611: ##
2612: sub remove_string_from_array {
2613: my ($array, $s) = @_;
2614: my $index = index_of_string($array, $s);
2615: if ($index != -1) {
2616: splice(@$array, $index, 1);
2617: }
2618: }
2619:
2620: ##
2621: # if found, removes a reference from an array, otherwise do nothing
2622: # @param {Array<Object>} array - reference to the array of references
2623: # @param {Object} ref - the reference to look for
2624: ##
2625: sub remove_reference_from_array {
2626: my ($array, $ref) = @_;
2627: my $index = index_of_reference($array, $ref);
2628: if ($index != -1) {
2629: splice(@$array, $index, 1);
2630: }
2631: }
2632:
2633: ##
2634: # replaces a node by its children
2635: # @param {Node} node - the DOM node
2636: ##
2637: sub replace_by_children {
2638: my ($node) = @_;
2639: my $parent = $node->parentNode;
2640: my $next;
2641: my $previous;
2642: for (my $child=$node->firstChild; defined $child; $child=$next) {
2643: $next = $child->nextSibling;
2644: if ((!defined $previous || !defined $next) &&
2645: $child->nodeType == XML_TEXT_NODE && $child->nodeValue =~ /^\s*$/) {
2646: next; # do not keep first and last whitespace nodes
2647: } else {
2648: if (!defined $previous && $child->nodeType == XML_TEXT_NODE) {
2649: # remove whitespace at the beginning
2650: my $value = $child->nodeValue;
2651: $value =~ s/^\s+//;
2652: $child->setData($value);
2653: }
2654: if (!defined $next && $child->nodeType == XML_TEXT_NODE) {
2655: # and at the end
2656: my $value = $child->nodeValue;
2657: $value =~ s/\s+$//;
2658: $child->setData($value);
2659: }
2660: }
2661: $node->removeChild($child);
2662: $parent->insertBefore($child, $node);
2663: $previous = $child;
2664: }
2665: $parent->removeChild($node);
2666: }
2667:
2668: ##
2669: # returns the trimmed attribute value if the attribute exists and is not blank, undef otherwise
2670: # @param {Node} node - the DOM node
2671: # @param {string} attribute_name - the attribute name
2672: ##
2673: sub get_non_empty_attribute {
2674: my ($node, $attribute_name) = @_;
2675: my $value = $node->getAttribute($attribute_name);
2676: if (defined $value && $value !~ /^\s*$/) {
2677: $value = trim($value);
2678: return($value);
2679: }
2680: return(undef);
2681: }
2682:
2683: ##
2684: # Returns a CSS property value from the style attribute of the element, or undef if not defined
2685: # @param {Element} el - the DOM element
2686: # @param {string} property_name - the CSS property name
2687: ##
2688: sub get_css_property {
2689: my ($el, $property_name) = @_;
2690: my $style = $el->getAttribute('style');
2691: if (defined $style) {
2692: $style =~ s/^\s*;\s*//;
2693: $style =~ s/\s*;\s*$//;
2694: } else {
2695: $style = '';
2696: }
2697: my @pairs = split(';', $style);
2698: foreach my $pair (@pairs) {
2699: my @name_value = split(':', $pair);
2700: if (scalar(@name_value) != 2) {
2701: next;
2702: }
2703: my $name = trim($name_value[0]);
2704: my $value = trim($name_value[1]);
2705: if (lc($name) eq $property_name) {
2706: return($value); # return the first one found
2707: }
2708: }
2709: return(undef);
2710: }
2711:
2712: ##
2713: # Returns the reference to a hash CSS property name => value from the style attribute of the element.
2714: # Returns an empty list if the style attribute is not defined,
2715: # @param {Element} el - the DOM element
2716: # @return {Hash<string, string>} reference to the hash property name => property value
2717: ##
2718: sub get_css_properties {
2719: my ($el) = @_;
2720: my $style = $el->getAttribute('style');
2721: if (defined $style) {
2722: $style =~ s/^\s*;\s*//;
2723: $style =~ s/\s*;\s*$//;
2724: } else {
2725: $style = '';
2726: }
2727: my @pairs = split(';', $style);
2728: tie (my %hash, 'Tie::IxHash', ());
2729: foreach my $pair (@pairs) {
2730: my @name_value = split(':', $pair);
2731: if (scalar(@name_value) != 2) {
2732: next;
2733: }
2734: my $name = trim($name_value[0]);
2735: my $value = trim($name_value[1]);
2736: if (defined $hash{$name}) {
2737: # duplicate property in the style attribute: keep only the last one
2738: delete $hash{$name};
2739: }
2740: $hash{$name} = $value;
2741: }
2742: return(\%hash);
2743: }
2744:
2745: ##
2746: # Sets a CSS property in the style attribute of an element
2747: # @param {Element} el - the DOM element
2748: # @param {string} property_name - the CSS property name
2749: # @param {string} property_value - the CSS property value
2750: ##
2751: sub set_css_property {
2752: my ($el, $property_name, $property_value) = @_;
2753: my $hash_ref = { $property_name => $property_value };
2754: set_css_properties($el, $hash_ref);
2755: }
2756:
2757: ##
2758: # Sets several CSS properties in the style attribute of an element
2759: # @param {Element} el - the DOM element
2760: # @param {Hash<string, string>} properties - reference to the hash property name => property value
2761: ##
2762: sub set_css_properties {
2763: my ($el, $properties) = @_;
2764: my $hash = get_css_properties($el);
2765: foreach my $property_name (keys %$properties) {
2766: my $property_value = $properties->{$property_name};
2767: if (defined $hash->{$property_name}) {
2768: delete $hash->{$property_name}; # to add the new one at the end
2769: }
2770: $hash->{$property_name} = $property_value;
2771: }
2772: my $style = '';
2773: foreach my $key (keys %$hash) {
2774: $style .= $key.':'.$hash->{$key}.'; ';
2775: }
2776: $style =~ s/; $//;
2777: $el->setAttribute('style', $style);
2778: }
2779:
2780: 1;
2781: __END__
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>