Annotation of loncom/homework/cleanxml/post_xml.pm, revision 1.9
1.1 damieng 1: # The LearningOnline Network
2: # Third step to clean a file.
3: #
1.9 ! damieng 4: # $Id: post_xml.pm,v 1.8 2016/01/20 00:40:39 damieng Exp $
1.1 damieng 5: #
6: # Copyright Michigan State University Board of Trustees
7: #
8: # This file is part of the LearningOnline Network with CAPA (LON-CAPA).
9: #
10: # LON-CAPA is free software; you can redistribute it and/or modify
11: # it under the terms of the GNU General Public License as published by
12: # the Free Software Foundation; either version 2 of the License, or
13: # (at your option) any later version.
14: #
15: # LON-CAPA is distributed in the hope that it will be useful,
16: # but WITHOUT ANY WARRANTY; without even the implied warranty of
17: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18: # GNU General Public License for more details.
19: #
20: # You should have received a copy of the GNU General Public License
21: # along with LON-CAPA; if not, write to the Free Software
22: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23: #
24: # /home/httpd/html/adm/gpl.txt
25: #
26: # http://www.lon-capa.org/
27: #
28: ###
29:
30: #!/usr/bin/perl
31:
32: package Apache::post_xml;
33:
34: use strict;
35: use utf8;
36: use warnings;
37:
38: use File::Basename;
39: use File::Temp qw/ tempfile /;
40: use Cwd 'abs_path';
41: use XML::LibXML;
42: use HTML::TokeParser; # used to parse sty files
43: use Tie::IxHash; # for ordered hashes
1.7 damieng 44: use tth;
45: use Apache::html_to_xml;
1.1 damieng 46:
47: no warnings 'recursion'; # yes, fix_paragraph is using heavy recursion, I know
48:
49: # these are constants
1.4 damieng 50: my @block_elements = ('parameter','location','answer','foil','image','polygon','rectangle','text','conceptgroup','itemgroup','item','label','data','function','array','unit','answergroup','functionplotresponse','functionplotruleset','functionplotelements','functionplotcustomrule','essayresponse','hintpart','formulahint','numericalhint','reactionhint','organichint','optionhint','radiobuttonhint','stringhint','customhint','mathhint','formulahintcondition','numericalhintcondition','reactionhintcondition','organichintcondition','optionhintcondition','radiobuttonhintcondition','stringhintcondition','customhintcondition','mathhintcondition','imageresponse','foilgroup','datasubmission','textfield','hiddensubmission','radiobuttonresponse','rankresponse','matchresponse','import','style','script','window','block','library','notsolved','part','postanswerdate','preduedate','problem','problemtype','randomlabel','bgimg','labelgroup','randomlist','solved','while','tex','print','web','gnuplot','curve','Task','IntroParagraph','ClosingParagraph','Question','QuestionText','Setup','Instance','InstanceText','Criteria','CriteriaText','GraderNote','languageblock','instructorcomment','dataresponse','togglebox','standalone','comment','drawimage','allow','displayduedate','displaytitle','responseparam','organicstructure','scriptlib','parserlib','drawoptionlist','spline','backgroundplot','plotobject','plotvector','drawvectorsum','functionplotrule','functionplotvectorrule','functionplotvectorsumrule','axis','key','xtics','ytics','title','xlabel','ylabel','hiddenline','dtm');
51: my @inline_like_block = ('stringresponse','optionresponse','numericalresponse','formularesponse','mathresponse','organicresponse','reactionresponse','customresponse','externalresponse', 'hint', 'hintgroup','translated','lang'); # inline elements treated like blocks for pretty print and some other things
1.1 damieng 52: my @responses = ('stringresponse','optionresponse','numericalresponse','formularesponse','mathresponse','organicresponse','reactionresponse','customresponse','externalresponse','essayresponse','radiobuttonresponse','matchresponse','rankresponse','imageresponse','functionplotresponse');
53: my @block_html = ('html','head','body','section','h1','h2','h3','h4','h5','h6','div','p','ul','ol','li','table','tbody','tr','td','th','dl','dt','dd','pre','noscript','hr','address','blockquote','object','applet','embed','map','form','fieldset','iframe','center','frameset');
54: my @no_newline_inside = ('import','parserlib','scriptlib','data','function','label','xlabel','ylabel','tic','text','rectangle','image','title','h1','h2','h3','h4','h5','h6','li','td','p');
1.6 damieng 55: my @preserve_elements = ('script','answer','pre','style');
1.1 damieng 56: my @accepting_style = ('section','h1','h2','h3','h4','h5','h6','div','p','li','td','th','dt','dd','pre','blockquote');
57: my @latex_math = ('\alpha', '\theta', '\omicron', '\tau', '\beta', '\vartheta', '\pi', '\upsilon', '\gamma', '\gamma', '\varpi', '\phi', '\delta', '\kappa', '\rho', '\varphi', '\epsilon', '\lambda', '\varrho', '\chi', '\varepsilon', '\mu', '\sigma', '\psi', '\zeta', '\nu', '\varsigma', '\omega', '\eta', '\xi',
58: '\Gamma', '\Lambda', '\Sigma', '\Psi', '\Delta', '\Xi', '\Upsilon', '\Omega', '\Theta', '\Pi', '\Phi',
59: '\pm', '\cap', '\diamond', '\oplus', '\mp', '\cup', '\bigtriangleup', '\ominus', '\times', '\uplus', '\bigtriangledown', '\otimes', '\div', '\sqcap', '\triangleleft', '\oslash', '\ast', '\sqcup', '\triangleright', '\odot', '\star', '\vee', '\lhd$', '\bigcirc', '\circ', '\wedge', '\rhd$', '\dagger', '\bullet', '\setminus', '\unlhd$', '\ddagger', '\cdot', '\wr', '\unrhd$', '\amalg', '+', '-',
60: '\leq', '\geq', '\equiv', '\models', '\prec', '\succ', '\sim', '\perp', '\preceq', '\succeq', '\simeq', '\mid', '\ll', '\gg', '\asymp', '\parallel', '\subset', '\supset', '\approx', '\bowtie', '\subseteq', '\supseteq', '\cong', '\Join$', '\sqsubset$', '\sqsupset$', '\neq', '\smile', '\sqsubseteq', '\sqsupseteq', '\doteq', '\frown', '\in', '\ni', '\propto', '\vdash', '\dashv',
61: '\colon', '\ldotp', '\cdotp',
62: '\leftarrow', '\longleftarrow', '\uparrow', '\Leftarrow', '\Longleftarrow', '\Uparrow', '\rightarrow', '\longrightarrow', '\downarrow', '\Rightarrow', '\Longrightarrow', '\Downarrow', '\leftrightarrow', '\longleftrightarrow', '\updownarrow', '\Leftrightarrow', '\Longleftrightarrow', '\Updownarrow', '\mapsto', '\longmapsto', '\nearrow', '\hookleftarrow', '\hookrightarrow', '\searrow', '\leftharpoonup', '\rightharpoonup', '\swarrow', '\leftharpoondown', '\rightharpoondown', '\nwarrow', '\rightleftharpoons', '\leadsto$',
63: '\ldots', '\cdots', '\vdots', '\ddots', '\aleph', '\prime', '\forall', '\infty', '\hbar', '\emptyset', '\exists', '\Box$', '\imath', '\nabla', '\neg', '\Diamond$', '\jmath', '\surd', '\flat', '\triangle', '\ell', '\top', '\natural', '\clubsuit', '\wp', '\bot', '\sharp', '\diamondsuit', '\Re', '\|', '\backslash', '\heartsuit', '\Im', '\angle', '\partial', '\spadesuit', '\mho$',
64: '\sum', '\bigcap', '\bigodot', '\prod', '\bigcup', '\bigotimes', '\coprod', '\bigsqcup', '\bigoplus', '\int', '\bigvee', '\biguplus', '\oint', '\bigwedge',
65: '\arccos', '\cos', '\csc', '\exp', '\ker', '\limsup', '\min', '\sinh', '\arcsin', '\cosh', '\deg', '\gcd', '\lg', '\ln', '\Pr', '\sup', '\arctan', '\cot', '\det', '\hom', '\lim', '\log', '\sec', '\tan', '\arg', '\coth', '\dim', '\inf', '\liminf', '\max', '\sin', '\tanh',
66: '\uparrow', '\Uparrow', '\downarrow', '\Downarrow', '\updownarrow', '\Updownarrow', '\lfloor', '\rfloor', '\lceil', '\rceil', '\langle', '\rangle', '\backslash',
67: '\rmoustache', '\lmoustache', '\rgroup', '\lgroup', '\arrowvert', '\Arrowvert', '\bracevert',
68: '\hat{', '\acute{', '\bar{', '\dot{', '\breve{', '\check{', '\grave{', '\vec{', '\ddot{', '\tilde{',
69: '\widetilde{', '\widehat{', '\overleftarrow{', '\overrightarrow{', '\overline{', '\underline{', '\overbrace{', '\underbrace{', '\sqrt{', '\sqrt[', '\frac{'
70: );
71: # list of elements that can contain style elements:
72: my @containing_styles = ('library','problem',@responses,'foil','item','text','hintgroup','hintpart','label','part','preduedate','postanswerdate','solved','notsolved','block','while','web','standalone','problemtype','languageblock','translated','lang','window','windowlink','togglebox','instructorcomment','body','section','div','p','li','dd','td','th','blockquote','object','applet','video','audio','canvas','fieldset','button',
73: 'span','strong','em','b','i','sup','sub','code','kbd','samp','tt','ins','del','var','small','big','u','font');
74: my @html_styles = ('span', 'strong', 'em' , 'b', 'i', 'sup', 'sub', 'tt', 'var', 'small', 'big', 'u');
75:
76: my $warnings; # 1 = print warnings
77:
78:
79: # Parses the XML document and fixes many things to turn it into a document matching the schema.
1.2 damieng 80: # @param {reference} textref - reference to the text of the document
81: # @param {string} file_path - path of the file being processed (we only extract the directory path from it)
82: # @param {string} res_dir - path of res directory parent (without the / at the end)
83: # @param {boolean} warn - 1 to print warnings, 0 otherwise
84: # @returns the text of the document as a byte string.
1.1 damieng 85: sub post_xml {
1.2 damieng 86: my ($textref, $file_path, $res_dir, $warn) = @_;
1.1 damieng 87: $warnings = $warn;
88:
89: my $dom_doc = XML::LibXML->load_xml(string => $textref);
90:
91: my $root = fix_structure($dom_doc);
92:
93: remove_elements($root, ['startouttext','startoutext','startottext','startouttex','startouttect','atartouttext','starouttext','starttextout','starttext','starttextarea','endouttext','endoutext','endoutttext','endouttxt','endouutext','ednouttext','endouttex','endoouttext','endouttest','endtextout','endtextarea','startpartmarker','endpartmarker','basefont','x-claris-tagview','x-claris-window','x-sas-window']);
94:
95: remove_empty_attributes($root);
96:
97: fix_attribute_case($root);
98:
1.3 damieng 99: replace_m($root);
1.1 damieng 100:
101: my @all_block = (@block_elements, @block_html);
1.2 damieng 102: add_sty_blocks($file_path, $res_dir, $root, \@all_block); # must come before the subs using @all_block
1.1 damieng 103:
104: fix_block_styles($root, \@all_block);
105: $root->normalize();
106:
107: fix_fonts($root, \@all_block);
108:
109: replace_u($root);
110:
111: remove_bad_cdata_sections($root);
112:
113: add_cdata_sections($root);
114:
115: fix_style_element($root);
116:
117: fix_tables($root);
118:
119: fix_lists($root);
120:
121: fix_wrong_name_for_img($root); # should be before replace_deprecated_attributes_by_css
122:
123: replace_deprecated_attributes_by_css($root);
124:
125: replace_center($root, \@all_block); # must come after replace_deprecated_attributes_by_css
126:
127: replace_nobr($root);
128:
129: remove_useless_notsolved($root);
130:
1.9 ! damieng 131: fix_comments($root);
! 132:
1.1 damieng 133: fix_paragraphs_inside($root, \@all_block);
134:
135: remove_empty_style($root);
136:
137: fix_empty_lc_elements($root);
138:
139: lowercase_attribute_values($root);
140:
141: replace_numericalresponse_unit_attribute($root);
142:
143: replace_functions_by_elements($root);
144:
145: pretty($root, \@all_block);
146:
147: replace_tm_dtm($root);
148:
149: return $dom_doc->toString(); # byte string !
150: }
151:
152: sub fix_structure {
153: my ($doc) = @_;
154: # the root element has already been added in pre_xml
155: my $root = $doc->documentElement;
156: # inside the root, replace html, problem and library elements by their content
1.5 damieng 157: my @toreplace = ('html','problem','library','Task');
1.1 damieng 158: foreach my $name (@toreplace) {
159: my @elements = $root->getElementsByTagName($name);
160: foreach my $element (@elements) {
161: replace_by_children($element);
162: }
163: }
164: # insert all link and style elements inside a new head element
165: my $current_node = undef;
166: my @heads = $doc->getElementsByTagName('head');
167: my @links = $doc->getElementsByTagName('link');
168: my @styles = $doc->getElementsByTagName('style');
169: my @titles = $doc->getElementsByTagName('title');
170: if (scalar(@titles) > 0) {
171: # NOTE: there is a title element in gnuplot, not to be confused with the one inside HTML head
172: for (my $i=0; $i<scalar(@titles); $i++) {
173: my $title = $titles[$i];
174: my $found_gnuplot = 0;
175: my $ancestor = $title->parentNode;
176: while (defined $ancestor) {
177: if ($ancestor->nodeName eq 'gnuplot') {
178: $found_gnuplot = 1;
179: last;
180: }
181: $ancestor = $ancestor->parentNode;
182: }
183: if ($found_gnuplot) {
184: splice(@titles, $i, 1);
185: $i--;
186: }
187: }
188: }
189: if (scalar(@heads) > 0 || scalar(@titles) > 0 || scalar(@links) > 0 || scalar(@styles) > 0) {
190: my $htmlhead = $doc->createElement('head');
191: foreach my $head (@heads) {
192: my $next;
193: for (my $child=$head->firstChild; defined $child; $child=$next) {
194: $next = $child->nextSibling;
195: $head->removeChild($child);
196: if ($child->nodeType != XML_ELEMENT_NODE ||
197: string_in_array(['title','script','style','meta','link','import','base'], $child->nodeName)) {
198: $htmlhead->appendChild($child);
199: } else {
200: # this should not be in head
201: insert_after_or_first($root, $child, $current_node);
202: }
203: }
204: $head->parentNode->removeChild($head);
205: }
206: foreach my $child (@titles, @links, @styles) {
207: $child->parentNode->removeChild($child);
208: $htmlhead->appendChild($child);
209: }
210: insert_after_or_first($root, $htmlhead, $current_node);
211: $current_node = $htmlhead;
212: }
213: # body
214: my $htmlbody = undef;
215: my @bodies = $doc->getElementsByTagName('body');
216: if (scalar(@bodies) > 0) {
217: # TODO: fix content and position of body elements
218: if ($root->nodeName eq 'problem') {
219: foreach my $body (@bodies) {
220: replace_by_children($body);
221: }
222: }
223: }
224: # add all the meta elements afterwards when they are LON-CAPA meta. Remove all HTML meta.
225: my @meta_names = ('abstract','author','authorspace','avetries','avetries_list','clear','comefrom','comefrom_list','copyright','correct','count','course','course_list','courserestricted','creationdate','dependencies','depth','difficulty','difficulty_list','disc','disc_list','domain','end','field','firstname','generation','goto','goto_list','groupname','helpful','highestgradelevel','hostname','id','keynum','keywords','language','lastname','lastrevisiondate','lowestgradelevel','middlename','mime','modifyinguser','notes','owner','permanentemail','scope','sequsage','sequsage_list','standards','start','stdno','stdno_list','subject','technical','title','url','username','value','version');
226: my @metas = $doc->getElementsByTagName('meta');
227: foreach my $meta (@metas) {
228: $meta->parentNode->removeChild($meta);
229: my $name = $meta->getAttribute('name');
230: my $content = $meta->getAttribute('content');
231: if (defined $name && defined $content && string_in_array(\@meta_names, lc($name))) {
232: my $lcmeta = $doc->createElement('meta');
233: $lcmeta->setAttribute('name', lc($name));
234: $lcmeta->setAttribute('content', $content);
235: insert_after_or_first($root, $lcmeta, $current_node);
236: $current_node = $lcmeta;
237: }
238: }
239: return($root);
240: }
241:
242: # insert the new child under parent after the reference child, or as the first child if the reference child is not defined
243: sub insert_after_or_first {
244: my ($parent, $newchild, $refchild) = @_;
245: if (defined $refchild) {
246: $parent->insertAfter($newchild, $refchild);
247: } elsif (defined $parent->firstChild) {
248: $parent->insertBefore($newchild, $parent->firstChild);
249: } else {
250: $parent->appendChild($newchild);
251: }
252: }
253:
254: # removes all elements with given names inside the node, but keep the content
255: sub remove_elements {
256: my ($node, $to_remove) = @_;
257: my $nextChild;
258: for (my $child=$node->firstChild; defined $child; $child=$nextChild) {
259: $nextChild = $child->nextSibling;
260: my $type = $node->nodeType;
261: if ($type == XML_ELEMENT_NODE) {
262: if (string_in_array($to_remove, $child->nodeName)) {
263: my $first_non_white = $child->firstChild;
264: if (defined $first_non_white && $first_non_white->nodeType == XML_TEXT_NODE &&
265: $first_non_white->nodeValue =~ /^\s*$/) {
266: $first_non_white = $first_non_white->nextSibling;
267: }
268: if (defined $first_non_white) {
269: $nextChild = $first_non_white;
270: replace_by_children($child);
271: } else {
272: $node->removeChild($child);
273: }
274: } else {
275: remove_elements($child, $to_remove);
276: }
277: }
278: }
279: }
280:
281: # removes some attributes that have an invalid empty value
282: sub remove_empty_attributes {
283: my ($root) = @_;
284: my $doc = $root->ownerDocument;
285: # this list is based on validation errors in the MSU subset (it could be more complete if it was based on the schema)
286: my @attributes = (
287: ['curve', ['pointsize']],
288: ['foil', ['location']],
289: ['foilgroup', ['checkboxoptions', 'options', 'texoptions']],
290: ['gnuplot', ['pattern', 'texwidth']],
291: ['img', ['height', 'texheight', 'texwidth', 'texwrap', 'width']],
292: ['import', ['importmode']],
293: ['optionresponse', ['max']],
294: ['organicstructure', ['options']],
295: ['radiobuttonresponse', ['max']],
296: ['randomlabel', ['height', 'texwidth', 'width']],
297: ['stringresponse', ['type']],
298: ['textline', ['size']],
299: );
300: foreach my $element_attributes (@attributes) {
301: my $element_name = $element_attributes->[0];
302: my $attribute_names = $element_attributes->[1];
303: my @elements = $doc->getElementsByTagName($element_name);
304: foreach my $element (@elements) {
305: foreach my $attribute_name (@$attribute_names) {
306: my $value = $element->getAttribute($attribute_name);
307: if (defined $value && $value =~ /^\s*$/) {
308: $element->removeAttribute($attribute_name);
309: }
310: }
311: }
312: }
313: }
314:
315: # fixes the case for a few attributes that are not all lowercase
316: # (the HTML parser used in html_to_xml turns everything lowercase, which is a good thing in general)
317: sub fix_attribute_case {
318: my ($root) = @_;
319: my $doc = $root->ownerDocument;
320: my @attributes = (
321: ['labelgroup', ['TeXsize']],
322: ['h1', ['TeXsize']],
323: ['h2', ['TeXsize']],
324: ['h3', ['TeXsize']],
325: ['h4', ['TeXsize']],
326: ['h5', ['TeXsize']],
327: ['h6', ['TeXsize']],
328: # font and basefont have a TeXsize but will be removed
329: ['optionresponse', ['TeXlayout']],
330: ['itemgroup', ['TeXitemgroupwidth']],
331: ['Task', ['OptionalRequired']],
332: ['Question', ['OptionalRequired','Mandatory']],
333: ['Instance', ['OptionalRequired','Disabled']],
334: ['Criteria', ['Mandatory']],
335: ['table', ['TeXwidth','TeXtheme']],
336: ['td', ['TeXwidth']],
337: ['th', ['TeXwidth']],
338: ['img', ['TeXwidth','TeXheight','TeXwrap']],
339: );
340: foreach my $element_attributes (@attributes) {
341: my $element_name = $element_attributes->[0];
342: my $attribute_names = $element_attributes->[1];
343: my @elements = $doc->getElementsByTagName($element_name);
344: foreach my $element (@elements) {
345: foreach my $attribute_name (@$attribute_names) {
346: my $value = $element->getAttribute(lc($attribute_name));
347: if (defined $value) {
348: $element->removeAttribute(lc($attribute_name));
349: $element->setAttribute($attribute_name, $value);
350: }
351: }
352: }
353: }
354: }
355:
356: # Replaces m by HTML, tm and/or dtm (which will be replaced by <m> later, but they are useful
357: # to know if the element is a block element or not).
358: # m might contain non-math LaTeX, while tm and dtm may only contain math.
359: sub replace_m {
360: my ($root) = @_;
361: my $doc = $root->ownerDocument;
362: # search for variable declarations
363: my @variables = ();
364: my @scripts = $root->getElementsByTagName('script');
365: foreach my $script (@scripts) {
366: my $type = $script->getAttribute('type');
367: if (defined $type && $type eq 'loncapa/perl') {
368: if (defined $script->firstChild && $script->firstChild->nodeType == XML_TEXT_NODE) {
369: my $text = $script->firstChild->nodeValue;
370: # NOTE: we are not interested in replacing "@value", only "$value"
371: # this regexp is for " $a = ..." and " $a[...] = ..."
372: while ($text =~ /^[ \t]*\$([a-zA-Z_0-9]+)(?:\[[^\]]+\])?[ \t]*=/gm) {
373: if (!string_in_array(\@variables, $1)) {
374: push(@variables, $1);
375: }
376: }
377: # this regexp is for "...; $a = ..." and "...; $a[...] = ..."
378: while ($text =~ /^[^'"\/;]+;[ \t]*\$([a-zA-Z_0-9]+)(?:\[[^\]]+\])?[ \t]*=/gm) {
379: if (!string_in_array(\@variables, $1)) {
380: push(@variables, $1);
381: }
382: }
383: # this regexp is for " @a = ..."
384: while ($text =~ /^[ \t]*\@([a-zA-Z_0-9]+)[ \t]*=/gm) {
385: if (!string_in_array(\@variables, $1)) {
386: push(@variables, $1);
387: }
388: }
389: # this regexp is for " ($a, $b, $c) = ..."
390: my @matches = ($text =~ /^[ \t]*\([ \t]*\$([a-zA-Z_0-9]+)(?:[ \t]*,[ \t]*\$([a-zA-Z_0-9]+))*[ \t]*\)[ \t]*=/gm);
391: foreach my $match (@matches) {
392: if (!defined $match) {
393: next; # not sure why it happens, but it does
394: }
395: if (!string_in_array(\@variables, $match)) {
396: push(@variables, $match);
397: }
398: }
399: # and this one is for "push @a"
400: while ($text =~ /^[ \t]*push @([a-zA-Z_0-9]+)[ \t,]*/gm) {
401: if (!string_in_array(\@variables, $1)) {
402: push(@variables, $1);
403: }
404: }
405: # use the opportunity to report usage of <m> in Perl scripts
406: if ($text =~ /^[^#].*<m[ >]/m) {
407: if ($warnings) {
408: print "WARNING: <m> is used in a script, it should be converted by hand\n";
409: }
410: }
411: }
412: }
413: }
414: my @ms = $root->getElementsByTagName('m');
415: foreach my $m (@ms) {
416: if (!defined $m->firstChild) {
417: $m->parentNode->removeChild($m);
418: next;
419: }
420: if (defined $m->firstChild->nextSibling || $m->firstChild->nodeType != XML_TEXT_NODE) {
421: if ($warnings) {
422: print "WARNING: m value is not simple text\n";
423: }
424: next;
425: }
426: my $text = $m->firstChild->nodeValue;
427: my $text_before_variable_replacement = $text;
428: my $var_key1 = 'dfhg3df54hg65hg4';
429: my $var_key2 = 'dfhg654d6f5g4h5f';
430: my $eval = defined $m->getAttribute('eval') && $m->getAttribute('eval') eq 'on';
431: if ($eval) {
432: # replace variables
433: foreach my $variable (@variables) {
434: my $replacement = $var_key1.$variable.$var_key2;
435: $text =~ s/\$$variable(?![a-zA-Z])/$replacement/ge;
436: $text =~ s/\$\{$variable\}/$replacement/ge;
437: }
438: }
439: # check if the expression is enclosed in math separators: $ $$ \( \) \[ \]
440: # if so, replace the whole node by dtm or tm
441: my $new_text;
442: my $new_node_name;
443: if ($text =~ /^\s*\$\$([^\$]*)\$\$\s*$/) {
444: $new_node_name = 'dtm';
445: $new_text = $1;
446: } elsif ($text =~ /^\s*\\\[(.*)\\\]\s*$/s) {
447: $new_node_name = 'dtm';
448: $new_text = $1;
449: } elsif ($text =~ /^\s*\$([^\$]*)\$\s*$/) {
450: $new_node_name = 'tm';
451: $new_text = $1;
452: } elsif ($text =~ /^\s*\\\((.*)\\\)\s*$/s) {
453: $new_node_name = 'tm';
454: $new_text = $1;
455: }
456: if (defined $new_node_name) {
457: if ($eval) {
458: foreach my $variable (@variables) {
459: my $replacement = $var_key1.$variable.$var_key2;
460: $new_text =~ s/$replacement([a-zA-Z])/\${$variable}$1/g;
461: $new_text =~ s/$replacement/\$$variable/g;
462: }
463: }
464: my $new_node = $doc->createElement($new_node_name);
465: if ($eval) {
466: $new_node->setAttribute('eval', 'on');
467: }
468: $new_node->appendChild($doc->createTextNode($new_text));
469: $m->parentNode->replaceChild($new_node, $m);
470: next;
471: }
472: if ($text !~ /\$|\\\(|\\\)|\\\[|\\\]/) {
473: # there are no math separators inside
474: # try to guess if this is meant as math
475: my $found_math = 0;
476: foreach my $symbol (@latex_math) {
477: if (index($text, $symbol) != -1) {
478: $found_math = 1;
479: last;
480: }
481: }
482: if ($found_math) {
483: # interpret the whole text as LaTeX inline math
484: my $new_node = $doc->createElement('tm');
485: if ($eval) {
486: $new_node->setAttribute('eval', 'on');
487: }
488: $new_node->appendChild($doc->createTextNode($text_before_variable_replacement));
489: $m->parentNode->replaceChild($new_node, $m);
490: next;
491: }
492: # no math symbol found, we will convert the text with tth
493: }
494:
495: # there are math separators inside, even after hiding variables, or there was no math symbol
496:
497: # hide math parts inside before running tth
498: my $math_key1 = '#ghjgdh5hg45gf';
499: my $math_key2 = '#';
500: my @maths = ();
501: my @separators = (['$$','$$'], ['\\(','\\)'], ['\\[','\\]'], ['$','$']);
502: foreach my $seps (@separators) {
503: my $sep1 = $seps->[0];
504: my $sep2 = $seps->[1];
505: my $pos1 = index($text, $sep1);
506: if ($pos1 == -1) {
507: next;
508: }
509: my $pos2 = index($text, $sep2, $pos1+length($sep1));
510: while ($pos1 != -1 && $pos2 != -1) {
511: my $replace = substr($text, $pos1, $pos2+length($sep2)-$pos1);
512: push(@maths, $replace);
513: my $by = $math_key1.scalar(@maths).$math_key2;
514: $text = substr($text, 0, $pos1).$by.substr($text, $pos2+length($sep2));
515: $pos1 = index($text, $sep1);
516: if ($pos1 != -1) {
517: $pos2 = index($text, $sep2, $pos1+length($sep1));
518: }
519: }
520: }
521: # get HTML as text from tth
522: my $html_text = tth($text);
523: # replace math by replacements
524: for (my $i=0; $i < scalar(@maths); $i++) {
525: my $math = $maths[$i];
526: $math =~ s/&/&/g;
527: $math =~ s/</</g;
528: $math =~ s/>/>/g;
529: if ($math =~ /^\$\$(.*)\$\$$/s) {
530: $math = '<dtm>'.$1.'</dtm>';
531: } elsif ($math =~ /^\\\[(.*)\\\]$/s) {
532: $math = '<dtm>'.$1.'</dtm>';
533: } elsif ($math =~ /^\\\((.*)\\\)$/s) {
534: $math = '<tm>'.$1.'</tm>';
535: } elsif ($math =~ /^\$(.*)\$$/s) {
536: $math = '<tm>'.$1.'</tm>';
537: }
538: my $replace = $math_key1.($i+1).$math_key2;
539: $html_text =~ s/$replace/$math/;
540: }
541: # replace variables if necessary
542: if ($eval) {
543: foreach my $variable (@variables) {
544: my $replacement = $var_key1.$variable.$var_key2;
545: $html_text =~ s/$replacement([a-zA-Z])/\${$variable}$1/g;
546: $html_text =~ s/$replacement/\$$variable/g;
547: }
548: }
549: my $fragment = html_to_dom($html_text);
550: $doc->adoptNode($fragment);
551: $m->parentNode->replaceChild($fragment, $m);
552:
553: }
554: }
555:
556: # Returns the HTML equivalent of LaTeX input, using tth
557: sub tth {
558: my ($text) = @_;
1.7 damieng 559: my $output = &tth::tth($text);
560: my $errorstring = &tth::ttherror();
561: if ($errorstring) {
562: die $errorstring;
563: }
1.1 damieng 564: # hopefully the temp file will not be removed before this point (otherwise we should use unlink_on_destroy 0)
565: $output =~ s/^\s*|\s*$//;
566: $output =~ s/<div class="p"><!----><\/div>/<br\/>/; # why is tth using such ugly markup for \newline ?
567: return $output;
568: }
569:
570: # transform simple HTML into a DOM fragment (which will need to be adopted by the document)
571: sub html_to_dom {
572: my ($text) = @_;
573: $text = '<root>'.$text.'</root>';
1.7 damieng 574: my $textref = Apache::html_to_xml::html_to_xml(\$text);
1.1 damieng 575: utf8::upgrade($$textref); # otherwise the XML parser fails when the HTML parser turns into a character
576: my $dom_doc = XML::LibXML->load_xml(string => $textref);
577: my $root = $dom_doc->documentElement;
578: remove_empty_style($root);
579: my $fragment = $dom_doc->createDocumentFragment();
580: my $next;
581: for (my $n=$root->firstChild; defined $n; $n=$next) {
582: $next = $n->nextSibling;
583: $root->removeChild($n);
584: $fragment->appendChild($n);
585: }
586: return($fragment);
587: }
588:
589: # Use the linked sty files to guess which newly defined elements should be considered blocks.
590: # Also adds to @containing_styles the sty elements that contain styles.
591: # @param {string} fn - the file path (we only extract the directory path from it)
1.2 damieng 592: # @param {string} res_dir - path of res directory parent (without the / at the end)
593: # @param {Element} root - the root element
1.1 damieng 594: sub add_sty_blocks {
1.2 damieng 595: my ($fn, $res_dir, $root, $all_block) = @_;
1.1 damieng 596: my $doc = $root->ownerDocument;
597: my @parserlibs = $doc->getElementsByTagName('parserlib');
598: my @libs = ();
599: foreach my $parserlib (@parserlibs) {
600: if (defined $parserlib->firstChild && $parserlib->firstChild->nodeType == XML_TEXT_NODE) {
601: my $value = $parserlib->firstChild->nodeValue;
602: $value =~ s/^\s+|\s+$//g;
603: if ($value ne '') {
604: push(@libs, $value);
605: }
606: }
607: }
608: my ($name, $path, $suffix) = fileparse($fn);
609: foreach my $sty (@libs) {
610: if (substr($sty, 0, 1) eq '/') {
1.2 damieng 611: $sty = $res_dir.$sty;
1.1 damieng 612: } else {
613: $sty = $path.$sty;
614: }
615: my $new_elements = parse_sty($sty, $all_block);
616: better_guess($root, $new_elements, $all_block);
617: my $new_blocks = $new_elements->{'block'};
618: my $new_inlines = $new_elements->{'inline'};
619: push(@$all_block, @{$new_blocks});
620: #push(@inlines, @{$new_inlines}); # we are not using a list of inline elements at this point
621: }
622: }
623:
624: ##
625: # Parses a sty file and returns lists of block and inline elements.
626: # @param {string} fn - the file path
627: ##
628: sub parse_sty {
629: my ($fn, $all_block) = @_;
630: my @blocks = ();
631: my @inlines = ();
632: my $p = HTML::TokeParser->new($fn);
633: if (! $p) {
634: die "post_xml.pl: parse_sty: Error reading $fn\n";
635: }
636: $p->empty_element_tags(1);
637: my $in_definetag = 0;
638: my $in_render = 0;
639: my %newtags = ();
640: my $newtag = '';
641: my $is_block = 0;
642: while (my $token = $p->get_token) {
643: if ($token->[0] eq 'S') {
644: my $tag = lc($token->[1]);
645: if ($tag eq 'definetag') {
646: $in_definetag = 1;
647: $is_block = 0;
648: my $attributes = $token->[2];
649: $newtag = $attributes->{'name'};
650: if (substr($newtag, 0, 1) eq '/') {
651: $newtag = substr($newtag, 1);
652: }
653: } elsif ($in_definetag && $tag eq 'render') {
654: $in_render = 1;
655: $is_block = 0;
656: } elsif ($in_render) {
657: if (string_in_array($all_block, $tag)) {
658: $is_block = 1;
659: }
660: }
661: } elsif ($token->[0] eq 'E') {
662: my $tag = lc($token->[1]);
663: if ($tag eq 'definetag') {
664: $in_definetag = 0;
665: if (defined $newtags{$newtag}) {
666: $newtags{$newtag} = $newtags{$newtag} || $is_block;
667: } else {
668: $newtags{$newtag} = $is_block;
669: }
670: } elsif ($in_definetag && $tag eq 'render') {
671: $in_render = 0;
672: }
673: }
674: }
675: foreach $newtag (keys(%newtags)) {
676: if ($newtags{$newtag} == 1) {
677: push(@blocks, $newtag);
678: } else {
679: push(@inlines, $newtag);
680: }
681: }
682: return {'block'=>\@blocks, 'inline'=>\@inlines};
683: }
684:
685: ##
686: # Marks as block the elements that contain block elements in the input file.
687: # Also adds to @containing_styles the sty elements that contain styles.
688: # @param {string} fn - the file path
689: # @param {Hash<string,Array>} new_elements - contains arrays in 'block' and 'inline'
690: ##
691: sub better_guess {
692: my ($root, $new_elements, $all_block) = @_;
693: my $new_blocks = $new_elements->{'block'};
694: my $new_inlines = $new_elements->{'inline'};
695:
696: my @change = (); # change these elements from inline to block
697: foreach my $tag (@{$new_inlines}) {
698: my @nodes = $root->getElementsByTagName($tag);
699: NODE_LOOP: foreach my $node (@nodes) {
700: for (my $child=$node->firstChild; defined $child; $child=$child->nextSibling) {
701: if ($child->nodeType == XML_ELEMENT_NODE) {
702: if (string_in_array($all_block, $child->nodeName) || string_in_array($new_blocks, $child->nodeName)) {
703: push(@change, $tag);
704: last NODE_LOOP;
705: }
706: }
707: }
708: }
709: }
710: foreach my $inline (@change) {
711: my $index = 0;
712: $index++ until $new_inlines->[$index] eq $inline;
713: splice(@{$new_inlines}, $index, 1);
714: push(@{$new_blocks}, $inline);
715: }
716: # add to @containing_styles when a style is used inside
717: # NOTE: some sty elements will be added even though they should not, but if we don't do that
718: # all style will be removed in the sty elements.
719: foreach my $tag ((@{$new_blocks}, @{$new_inlines})) {
720: my @nodes = $root->getElementsByTagName($tag);
721: NODE_LOOP: foreach my $node (@nodes) {
722: for (my $child=$node->firstChild; defined $child; $child=$child->nextSibling) {
723: if ($child->nodeType == XML_ELEMENT_NODE) {
724: if (string_in_array(\@html_styles, $child->nodeName)) {
725: push(@containing_styles, $tag);
726: last NODE_LOOP;
727: }
728: }
729: }
730: }
731: }
732: }
733:
734: # When a style element contains a block, move the style inside the block where it is allowed.
735: # style/block/other -> block/style/other
736: # When a style is used where it is not allowed, move it inside its children or remove it (unless it contains only text)
737: # element_not_containing_styles/style/other -> element_not_containing_styles/other/style (except if other is a style)
738: # The fix is not perfect in the case of element_not_containing_styles/style1/style2/block/text (style1 will be lost):
739: # element_not_containing_styles/style1/style2/block/text -> element_not_containing_styles/block/style2/text
740: # (a solution to this problem would be to merge the styles in a span)
741: # NOTE: .sty defined elements might have been added to @containing_styles by better_guess().
742: sub fix_block_styles {
743: my ($element, $all_block) = @_;
744: my $doc = $element->ownerDocument;
745: if (string_in_array(\@html_styles, $element->nodeName)) {
746: # move spaces out of the style element
747: if (defined $element->firstChild && $element->firstChild->nodeType == XML_TEXT_NODE) {
748: my $child = $element->firstChild;
749: if ($child->nodeValue =~ /^(\s+)(\S.*)$/s) {
750: $element->parentNode->insertBefore($doc->createTextNode($1), $element);
751: $child->setData($2);
752: }
753: }
754: if (defined $element->lastChild && $element->lastChild->nodeType == XML_TEXT_NODE) {
755: my $child = $element->lastChild;
756: if ($child->nodeValue =~ /^(.*\S)(\s+)$/s) {
757: $element->parentNode->insertAfter($doc->createTextNode($2), $element);
758: $child->setData($1);
759: }
760: }
761:
762: my $found_block = 0;
763: for (my $child=$element->firstChild; defined $child; $child=$child->nextSibling) {
764: if ($child->nodeType == XML_ELEMENT_NODE && string_in_array($all_block, $child->nodeName)) {
765: $found_block = 1;
766: last;
767: }
768: }
769: my $no_style_here = !string_in_array(\@containing_styles, $element->parentNode->nodeName);
770: if ($found_block || $no_style_here) {
771: # there is a block or the style is not allowed here,
772: # the style element has to be replaced by its modified children
773: my $s; # a clone of the style
774: my $next;
775: for (my $child=$element->firstChild; defined $child; $child=$next) {
776: $next = $child->nextSibling;
777: if ($child->nodeType == XML_ELEMENT_NODE && (string_in_array($all_block, $child->nodeName) ||
778: $child->nodeName eq 'br' || $no_style_here)) {
779: # avoid inverting a style with a style with $no_style_here (that would cause endless recursion)
780: if (!$no_style_here || (!string_in_array(\@html_styles, $child->nodeName) &&
781: string_in_array(\@containing_styles, $child->nodeName))) {
782: # block node or inline node when the style is not allowed:
783: # move all children inside the style, and make the style the only child
784: $s = $element->cloneNode();
785: my $next2;
786: for (my $child2=$child->firstChild; defined $child2; $child2=$next2) {
787: $next2 = $child2->nextSibling;
788: $child->removeChild($child2);
789: $s->appendChild($child2);
790: }
791: $child->appendChild($s);
792: }
793: $s = undef;
794: } elsif (($child->nodeType == XML_TEXT_NODE && $child->nodeValue !~ /^\s*$/) ||
795: $child->nodeType == XML_ELEMENT_NODE) {
796: # if the style is allowed, move text and inline nodes inside the style
797: if (!$no_style_here) {
798: if (!defined $s) {
799: $s = $element->cloneNode();
800: $element->insertBefore($s, $child);
801: }
802: $element->removeChild($child);
803: $s->appendChild($child);
804: }
805: } else {
806: # do not put other nodes inside the style
807: $s = undef;
808: }
809: }
810: # now replace by children and fix them
811: my $parent = $element->parentNode;
812: for (my $child=$element->firstChild; defined $child; $child=$next) {
813: $next = $child->nextSibling;
814: $element->removeChild($child);
815: $parent->insertBefore($child, $element);
816: if ($child->nodeType == XML_ELEMENT_NODE) {
817: fix_block_styles($child, $all_block);
818: }
819: }
820: $parent->removeChild($element);
821: return;
822: }
823: }
824: # otherwise fix all children
825: my $next;
826: for (my $child=$element->firstChild; defined $child; $child=$next) {
827: $next = $child->nextSibling;
828: if ($child->nodeType == XML_ELEMENT_NODE) {
829: fix_block_styles($child, $all_block);
830: }
831: }
832: }
833:
834: # removes empty font elements and font elements that contain at least one block element
835: # replaces other font elements by equivalent span
836: sub fix_fonts {
837: my ($root, $all_block) = @_;
838: my $doc = $root->ownerDocument;
839: my @fonts = $root->getElementsByTagName('font');
840: @fonts = reverse(@fonts); # to deal with the ancestor last in the case of font/font
841: foreach my $font (@fonts) {
842: my $block = 0;
843: for (my $child=$font->firstChild; defined $child; $child=$child->nextSibling) {
844: if (string_in_array($all_block, $child->nodeName) || string_in_array(\@inline_like_block, $child->nodeName)) {
845: $block = 1;
846: last;
847: }
848: }
849: if (!defined $font->firstChild || $block) {
850: # empty font or font containing block elements
851: # replace this node by its content
852: replace_by_children($font);
853: } else {
854: # replace by equivalent span
855: my $color = get_non_empty_attribute($font, 'color');
856: my $size = get_non_empty_attribute($font, 'size');
857: my $face = get_non_empty_attribute($font, 'face');
858: if (defined $face) {
859: $face =~ s/^,|,$//;
860: }
861: if (!defined $color && !defined $size && !defined $face) {
862: # useless font element: replace this node by its content
863: replace_by_children($font);
864: next;
865: }
866: my $replacement;
867: tie (my %properties, 'Tie::IxHash', ());
868: if (!defined $color && !defined $size && defined $face && lc($face) eq 'symbol') {
869: $replacement = $doc->createDocumentFragment();
870: } else {
871: $replacement = $doc->createElement('span');
872: my $css = '';
873: if (defined $color) {
874: $color =~ s/^x/#/;
875: $properties{'color'} = $color;
876: }
877: if (defined $size) {
878: my %hash = (
879: '1' => 'x-small',
880: '2' => 'small',
881: '3' => 'medium',
882: '4' => 'large',
883: '5' => 'x-large',
884: '6' => 'xx-large',
885: '7' => '300%',
886: '-1' => 'small',
887: '-2' => 'x-small',
888: '+1' => 'large',
889: '+2' => 'x-large',
890: '+3' => 'xx-large',
891: '+4' => '300%',
892: );
893: my $value = $hash{$size};
894: if (!defined $value) {
895: $value = 'medium';
896: }
897: $properties{'font-size'} = $value;
898: }
899: if (defined $face) {
900: if (lc($face) ne 'symbol' && lc($face) ne 'bold') {
901: $properties{'font-family'} = $face;
902: }
903: }
904: set_css_properties($replacement, \%properties);
905: }
906: if (defined $face && lc($face) eq 'symbol') {
907: # convert all content to unicode
908: my $next;
909: for (my $child=$font->firstChild; defined $child; $child=$next) {
910: $next = $child->nextSibling;
911: if ($child->nodeType == XML_TEXT_NODE) {
912: my $value = $child->nodeValue;
913: $value =~ tr/ABGDEZHQIKLMNXOPRSTUFCYWabgdezhqiklmnxoprVstufcywJjv¡«¬®/ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩαβγδεζηθικλμνξοπρςστυφχψωϑϕϖϒ↔←→/;
914: $child->setData($value);
915: }
916: }
917: }
918: # replace the font node
919: if ($replacement->nodeType == XML_ELEMENT_NODE && !defined $font->previousSibling &&
920: !defined $font->nextSibling && string_in_array(\@accepting_style, $font->parentNode->nodeName)) {
921: # use CSS on the parent block and replace font by its children instead of using a new element
922: set_css_properties($font->parentNode, \%properties);
923: replace_by_children($font);
924: } else {
925: # move all font children inside the replacement (span or fragment)
926: my $next;
927: for (my $child=$font->firstChild; defined $child; $child=$next) {
928: $next = $child->nextSibling;
929: $font->removeChild($child);
930: $replacement->appendChild($child);
931: }
932: # replace font
933: $font->parentNode->replaceChild($replacement, $font);
934: }
935: }
936: }
937: $root->normalize();
938: }
939:
940: # replaces u by <span style="text-decoration: underline">
941: sub replace_u {
942: my ($root) = @_;
943: my $doc = $root->ownerDocument;
944: my @us = $root->getElementsByTagName('u');
945: foreach my $u (@us) {
946: my $span = $doc->createElement('span');
947: $span->setAttribute('style', 'text-decoration: underline');
948: my $next;
949: for (my $child=$u->firstChild; defined $child; $child=$next) {
950: $next = $child->nextSibling;
951: $u->removeChild($child);
952: $span->appendChild($child);
953: }
954: $u->parentNode->replaceChild($span, $u);
955: }
956: }
957:
958: # removes CDATA sections tags that have not been parsed correcty by the HTML parser
959: # also removes bad comments in script elements
960: sub remove_bad_cdata_sections {
961: my ($root) = @_;
962: my $doc = $root->ownerDocument;
963: foreach my $name (@preserve_elements) {
964: my @nodes = $root->getElementsByTagName($name);
965: foreach my $node (@nodes) {
966: if (defined $node->firstChild && $node->firstChild->nodeType == XML_TEXT_NODE) {
967: my $value = $node->firstChild->nodeValue;
968: if ($name eq 'script' && (!defined $node->getAttribute('type') || $node->getAttribute('type') ne 'loncapa/perl') &&
969: !defined $node->firstChild->nextSibling && $value =~ /^(\s*)<!--(.*)-->(\s*)$/) {
970: # web browsers interpret that as a real comment when it is on 1 line, but the Perl HTML parser thinks it is the script
971: # -> turning it back into a comment
972: # (this is only true for Javascript script elements, since LON-CAPA does not parse loncapa/perl scripts in the same way)
973: $node->removeChild($node->firstChild);
974: $node->appendChild($doc->createComment($2));
975: next;
976: }
977: # at the beginning:
978: $value =~ s/^(\s*)<!\[CDATA\[/$1/; # <![CDATA[
979: $value =~ s/^(\s*)\/\*\s*<!\[CDATA\[\s*\*\//$1/; # /* <![CDATA[ */
980: $value =~ s/^(\s*)\/\/\s*<!\[CDATA\[/$1/; # // <![CDATA[
981: $value =~ s/^(\s*)(\/\/)?\s*<!--/$1/; # // <!--
982: # at the end:
983: $value =~ s/\/\/\s*\]\]>(\s*)$/$1/; # // ]]>
984: $value =~ s/\]\]>(\s*)$/$1/; # ]]>
985: $value =~ s/(\/\/)?\s*-->(\s*)$/$2/; # // -->
986: $value =~ s/\/\*\s*\]\]>\s*\*\/(\s*)$/$1/; # /* ]]> */
987:
988: $value = "\n".$value."\n";
989: $value =~ s/\s*(\n[ \t]*)/$1/;
990: $value =~ s/\s+$/\n/;
991: $node->firstChild->setData($value);
992: }
993: }
994: }
995: }
996:
997: # adds CDATA sections to scripts
998: sub add_cdata_sections {
999: my ($root) = @_;
1000: my $doc = $root->ownerDocument;
1001: my @scripts = $root->getElementsByTagName('script');
1002: my @answers = $root->getElementsByTagName('answer');
1003: foreach my $answer (@answers) {
1004: my $ancestor = $answer->parentNode;
1005: my $found_capa_response = 0;
1006: while (defined $ancestor) {
1007: if ($ancestor->nodeName eq 'numericalresponse' || $ancestor->nodeName eq 'formularesponse') {
1008: $found_capa_response = 1;
1009: last;
1010: }
1011: $ancestor = $ancestor->parentNode;
1012: }
1013: if (!$found_capa_response) {
1014: push(@scripts, $answer);
1015: }
1016: }
1017: foreach my $script (@scripts) {
1018: # use a CDATA section in the normal situation, for any script
1019: my $first = $script->firstChild;
1020: if (defined $first && $first->nodeType == XML_TEXT_NODE && !defined $first->nextSibling) {
1021: my $cdata = $doc->createCDATASection($first->nodeValue);
1022: $script->replaceChild($cdata, $first);
1023: }
1024: }
1025: }
1026:
1027: # removes "<!--" and "-->" at the beginning and end of style elements
1028: sub fix_style_element {
1029: my ($root) = @_;
1030: my @styles = $root->getElementsByTagName('style');
1031: foreach my $style (@styles) {
1032: if (defined $style->firstChild && $style->firstChild->nodeType == XML_TEXT_NODE &&
1033: !defined $style->firstChild->nextSibling) {
1034: my $text = $style->firstChild->nodeValue;
1035: if ($text =~ /^\s*<!--(.*)-->\s*$/s) {
1036: $style->firstChild->setData($1);
1037: }
1038: }
1039: }
1040: }
1041:
1042: # create missing cells at the end of table rows
1043: sub fix_tables {
1044: my ($root) = @_;
1045: my @tables = $root->getElementsByTagName('table');
1046: foreach my $table (@tables) {
1047: fix_cells($table);
1048: foreach my $tbody ($table->getChildrenByTagName('tbody')) {
1049: fix_cells($tbody);
1050: }
1051: foreach my $thead ($table->getChildrenByTagName('thead')) {
1052: fix_cells($thead);
1053: }
1054: foreach my $tfoot ($table->getChildrenByTagName('tfoot')) {
1055: fix_cells($tfoot);
1056: }
1057: }
1058: }
1059:
1060: # create missing cells at the end of table rows
1061: sub fix_cells {
1062: my ($table) = @_; # could actually be table, tbody, thead or tfoot
1063: my $doc = $table->ownerDocument;
1064: my @nb_cells = ();
1065: my $max_nb_cells = 0;
1066: my @rowspans = ();
1067: my @trs = $table->getChildrenByTagName('tr');
1068: foreach my $tr (@trs) {
1069: my $nb_cells;
1070: if (defined $rowspans[0]) {
1071: $nb_cells = shift(@rowspans);
1072: } else {
1073: $nb_cells = 0;
1074: }
1075: for (my $cell=$tr->firstChild; defined $cell; $cell=$cell->nextSibling) {
1076: if ($cell->nodeName eq 'td' || $cell->nodeName eq 'th') {
1077: my $colspan = $cell->getAttribute('colspan');
1078: if (defined $colspan && $colspan =~ /^\s*[0-9]+\s*$/) {
1079: $nb_cells += $colspan;
1080: } else {
1081: $nb_cells++;
1082: }
1083: my $rowspan = $cell->getAttribute('rowspan');
1084: if (defined $rowspan && $rowspan =~ /^\s*[0-9]+\s*$/) {
1085: for (my $i=0; $i < $rowspan-1; $i++) {
1086: if (!defined $rowspans[$i]) {
1087: $rowspans[$i] = 1;
1088: } else {
1089: $rowspans[$i]++;
1090: }
1091: }
1092: }
1093: }
1094: }
1095: push(@nb_cells, $nb_cells);
1096: if ($nb_cells > $max_nb_cells) {
1097: $max_nb_cells = $nb_cells;
1098: }
1099: }
1100: foreach my $tr (@trs) {
1101: my $nb_cells = shift(@nb_cells);
1102: if ($nb_cells < $max_nb_cells) {
1103: for (1..($max_nb_cells - $nb_cells)) {
1104: $tr->appendChild($doc->createElement('td'));
1105: }
1106: }
1107: }
1108: }
1109:
1110: # replaces ul/ul by ul/li/ul and the same for ol (using the previous li if possible)
1111: # also adds a ul element when a li has no ul/ol ancestor
1112: sub fix_lists {
1113: my ($root) = @_;
1114: my $doc = $root->ownerDocument;
1115: my @uls = $root->getElementsByTagName('ul');
1116: my @ols = $root->getElementsByTagName('ol');
1117: my @lists = (@uls, @ols);
1118: foreach my $list (@lists) {
1119: my $next;
1120: for (my $child=$list->firstChild; defined $child; $child=$next) {
1121: $next = $child->nextSibling;
1122: if ($child->nodeType == XML_ELEMENT_NODE && string_in_array(['ul','ol'], $child->nodeName)) {
1123: my $previous = $child->previousNonBlankSibling(); # note: non-DOM method
1124: $list->removeChild($child);
1125: if (defined $previous && $previous->nodeType == XML_ELEMENT_NODE && $previous->nodeName eq 'li') {
1126: $previous->appendChild($child);
1127: } else {
1128: my $li = $doc->createElement('li');
1129: $li->appendChild($child);
1130: if (!defined $next) {
1131: $list->appendChild($li);
1132: } else {
1133: $list->insertBefore($li, $next);
1134: }
1135: }
1136: }
1137: }
1138: }
1139: my @lis = $root->getElementsByTagName('li');
1140: foreach my $li (@lis) {
1141: my $found_list_ancestor = 0;
1142: my $ancestor = $li->parentNode;
1143: while (defined $ancestor) {
1144: if ($ancestor->nodeName eq 'ul' || $ancestor->nodeName eq 'ol') {
1145: $found_list_ancestor = 1;
1146: last;
1147: }
1148: $ancestor = $ancestor->parentNode;
1149: }
1150: if (!$found_list_ancestor) {
1151: # replace li by ul and add li under ul
1152: my $ul = $doc->createElement('ul');
1153: $li->parentNode->insertBefore($ul, $li);
1154: $li->parentNode->removeChild($li);
1155: $ul->appendChild($li);
1156: # add all other li afterwards inside ul (there might be text nodes in-between)
1157: my $next = $ul->nextSibling;
1158: while (defined $next) {
1159: my $next_next = $next->nextSibling;
1160: if ($next->nodeType == XML_TEXT_NODE && $next->nodeValue =~ /^\s*$/ &&
1161: defined $next_next && $next_next->nodeType == XML_ELEMENT_NODE && $next_next->nodeName eq 'li') {
1162: $next->parentNode->removeChild($next);
1163: $ul->appendChild($next);
1164: $next = $next_next;
1165: $next_next = $next_next->nextSibling;
1166: }
1167: if ($next->nodeType == XML_ELEMENT_NODE && $next->nodeName eq 'li') {
1168: $next->parentNode->removeChild($next);
1169: $ul->appendChild($next);
1170: } else {
1171: last;
1172: }
1173: $next = $next_next;
1174: }
1175: }
1176: }
1177: }
1178:
1179: # Some "image" elements are actually img element with a wrong name. This renames them.
1180: # Amazingly enough, "<image src=..." displays an image in some browsers
1181: # ("image" has existed at some point as an experimental HTML element).
1182: sub fix_wrong_name_for_img {
1183: my ($root) = @_;
1184: my @images = $root->getElementsByTagName('image');
1185: foreach my $image (@images) {
1186: if (!defined $image->getAttribute('src')) {
1187: next;
1188: }
1189: my $found_correct_ancestor = 0;
1190: my $ancestor = $image->parentNode;
1191: while (defined $ancestor) {
1192: if ($ancestor->nodeName eq 'drawimage' || $ancestor->nodeName eq 'imageresponse') {
1193: $found_correct_ancestor = 1;
1194: last;
1195: }
1196: $ancestor = $ancestor->parentNode;
1197: }
1198: if ($found_correct_ancestor) {
1199: next;
1200: }
1201: # this really has to be renamed "img"
1202: $image->setNodeName('img');
1203: }
1204: }
1205:
1206: # Replaces many deprecated attributes and replaces them by equivalent CSS when possible
1207: sub replace_deprecated_attributes_by_css {
1208: my ($root) = @_;
1209:
1210: fix_deprecated_in_tables($root);
1211:
1212: fix_deprecated_in_table_rows($root);
1213:
1214: fix_deprecated_in_table_cells($root);
1215:
1216: fix_deprecated_in_lists($root);
1217:
1218: fix_deprecated_in_list_items($root);
1219:
1220: fix_deprecated_in_hr($root);
1221:
1222: fix_deprecated_in_img($root);
1223:
1224: fix_deprecated_in_body($root);
1225:
1226: fix_align_attribute($root);
1227: }
1228:
1229: # Replaces deprecated attributes in tables
1230: sub fix_deprecated_in_tables {
1231: my ($root) = @_;
1232: my @tables = $root->getElementsByTagName('table');
1233: foreach my $table (@tables) {
1234: tie (my %new_properties, 'Tie::IxHash', ());
1235: my $align = $table->getAttribute('align');
1236: if (defined $align) {
1237: $table->removeAttribute('align');
1238: $align = lc(trim($align));
1239: }
1240: if ($table->parentNode->nodeName eq 'center' || (defined $align && $align eq 'center') ||
1241: (defined $table->parentNode->getAttribute('align') && $table->parentNode->getAttribute('align') eq 'center')) {
1242: $new_properties{'margin-left'} = 'auto';
1243: $new_properties{'margin-right'} = 'auto';
1244: }
1245: if (defined $align && ($align eq 'left' || $align eq 'right')) {
1246: $new_properties{'float'} = $align;
1247: }
1248: my $width = $table->getAttribute('width');
1249: if (defined $width) {
1250: $table->removeAttribute('width');
1251: $width = trim($width);
1252: if ($width =~ /^[0-9]+$/) {
1253: $width .= 'px';
1254: }
1255: if ($width ne '') {
1256: $new_properties{'width'} = $width;
1257: }
1258: }
1259: my $height = $table->getAttribute('height');
1260: if (defined $height) {
1261: $table->removeAttribute('height');
1262: # no replacement for table height
1263: }
1264: my $bgcolor = $table->getAttribute('bgcolor');
1265: if (defined $bgcolor) {
1266: $table->removeAttribute('bgcolor');
1267: $bgcolor = trim($bgcolor);
1268: $bgcolor =~ s/^x\s*//;
1269: if ($bgcolor ne '') {
1270: $new_properties{'background-color'} = $bgcolor;
1271: }
1272: }
1273: my $frame = $table->getAttribute('frame');
1274: if (defined $frame) {
1275: $table->removeAttribute('frame');
1276: $frame = lc(trim($frame));
1277: if ($frame eq 'void') {
1278: $new_properties{'border'} = 'none';
1279: } elsif ($frame eq 'above') {
1280: $new_properties{'border-top'} = '1px solid black';
1281: } elsif ($frame eq 'below') {
1282: $new_properties{'border-bottom'} = '1px solid black';
1283: } elsif ($frame eq 'hsides') {
1284: $new_properties{'border-top'} = '1px solid black';
1285: $new_properties{'border-bottom'} = '1px solid black';
1286: } elsif ($frame eq 'vsides') {
1287: $new_properties{'border-left'} = '1px solid black';
1288: $new_properties{'border-right'} = '1px solid black';
1289: } elsif ($frame eq 'lhs') {
1290: $new_properties{'border-left'} = '1px solid black';
1291: } elsif ($frame eq 'rhs') {
1292: $new_properties{'border-right'} = '1px solid black';
1293: } elsif ($frame eq 'box') {
1294: $new_properties{'border'} = '1px solid black';
1295: } elsif ($frame eq 'border') {
1296: $new_properties{'border'} = '1px solid black';
1297: }
1298: }
1299: if (scalar(keys %new_properties) > 0) {
1300: set_css_properties($table, \%new_properties);
1301: }
1302: # we can't replace the border attribute without creating a style block, but we can improve things like border="BORDER"
1303: my $border = $table->getAttribute('border');
1304: if (defined $border) {
1305: $border = trim($border);
1306: if ($border !~ /^\s*[0-9]+\s*(px)?\s*$/) {
1307: $table->setAttribute('border', '1');
1308: }
1309: }
1310: }
1311:
1312: }
1313:
1314: # Replaces deprecated attributes in tr elements
1315: sub fix_deprecated_in_table_rows {
1316: my ($root) = @_;
1317: my @trs = $root->getElementsByTagName('tr');
1318: foreach my $tr (@trs) {
1319: my $old_properties = get_css_properties($tr);
1320: tie (my %new_properties, 'Tie::IxHash', ());
1321: my $bgcolor = $tr->getAttribute('bgcolor');
1322: if (defined $bgcolor) {
1323: $tr->removeAttribute('bgcolor');
1324: if (!defined $old_properties->{'background-color'}) {
1325: $bgcolor = trim($bgcolor);
1326: $bgcolor =~ s/^x\s*//;
1327: if ($bgcolor ne '') {
1328: $new_properties{'background-color'} = $bgcolor;
1329: }
1330: }
1331: }
1332: my $align = $tr->getAttribute('align');
1333: if (defined $align && $align !~ /\s*char\s*/i) {
1334: $tr->removeAttribute('align');
1335: if (!defined $old_properties->{'text-align'}) {
1336: $align = lc(trim($align));
1337: if ($align ne '') {
1338: $new_properties{'text-align'} = $align;
1339: }
1340: }
1341: }
1342: my $valign = $tr->getAttribute('valign');
1343: if (defined $valign) {
1344: $tr->removeAttribute('valign');
1345: if (!defined $old_properties->{'vertical-align'}) {
1346: $valign = lc(trim($valign));
1347: if ($valign ne '') {
1348: $new_properties{'vertical-align'} = $valign;
1349: }
1350: }
1351: }
1352: if (scalar(keys %new_properties) > 0) {
1353: set_css_properties($tr, \%new_properties);
1354: }
1355: }
1356: }
1357:
1358: # Replaces deprecated attributes in table cells (td and th)
1359: sub fix_deprecated_in_table_cells {
1360: my ($root) = @_;
1361: my @tds = $root->getElementsByTagName('td');
1362: my @ths = $root->getElementsByTagName('th');
1363: my @cells = (@tds, @ths);
1364: foreach my $cell (@cells) {
1365: my $old_properties = get_css_properties($cell);
1366: tie (my %new_properties, 'Tie::IxHash', ());
1367: my $width = $cell->getAttribute('width');
1368: if (defined $width) {
1369: $cell->removeAttribute('width');
1370: if (!defined $old_properties->{'width'}) {
1371: $width = trim($width);
1372: if ($width =~ /^[0-9]+$/) {
1373: $width .= 'px';
1374: }
1375: if ($width ne '') {
1376: $new_properties{'width'} = $width;
1377: }
1378: }
1379: }
1380: my $height = $cell->getAttribute('height');
1381: if (defined $height) {
1382: $cell->removeAttribute('height');
1383: if (!defined $old_properties->{'height'}) {
1384: $height = trim($height);
1385: if ($height =~ /^[0-9]+$/) {
1386: $height .= 'px';
1387: }
1388: if ($height ne '') {
1389: $new_properties{'height'} = $height;
1390: }
1391: }
1392: }
1393: my $bgcolor = $cell->getAttribute('bgcolor');
1394: if (defined $bgcolor) {
1395: $cell->removeAttribute('bgcolor');
1396: if (!defined $old_properties->{'background-color'}) {
1397: $bgcolor = trim($bgcolor);
1398: $bgcolor =~ s/^x\s*//;
1399: if ($bgcolor ne '') {
1400: $new_properties{'background-color'} = $bgcolor;
1401: }
1402: }
1403: }
1404: my $align = $cell->getAttribute('align');
1405: if (defined $align && $align !~ /\s*char\s*/i) {
1406: $cell->removeAttribute('align');
1407: if (!defined $old_properties->{'text-align'}) {
1408: $align = lc(trim($align));
1409: if ($align ne '') {
1410: $new_properties{'text-align'} = $align;
1411: }
1412: }
1413: }
1414: my $valign = $cell->getAttribute('valign');
1415: if (defined $valign) {
1416: $cell->removeAttribute('valign');
1417: if (!defined $old_properties->{'vertical-align'}) {
1418: $valign = lc(trim($valign));
1419: if ($valign ne '') {
1420: $new_properties{'vertical-align'} = $valign;
1421: }
1422: }
1423: }
1424: if (scalar(keys %new_properties) > 0) {
1425: set_css_properties($cell, \%new_properties);
1426: }
1427: }
1428: }
1429:
1430: # Replaces deprecated attributes in lists (ul and ol)
1431: sub fix_deprecated_in_lists {
1432: my ($root) = @_;
1433: my @uls = $root->getElementsByTagName('ul');
1434: my @ols = $root->getElementsByTagName('ol');
1435: my @lists = (@uls, @ols);
1436: foreach my $list (@lists) {
1437: my $type = $list->getAttribute('type');
1438: if (defined $type) {
1439: my $lst = list_style_type($type);
1440: if (defined $lst) {
1441: $list->removeAttribute('type');
1442: if (!defined get_css_property($list, 'list-style-type')) {
1443: set_css_property($list, 'list-style-type', $lst);
1444: }
1445: }
1446: }
1447: }
1448: }
1449:
1450: # Replaces deprecated attributes in list items (li)
1451: sub fix_deprecated_in_list_items {
1452: my ($root) = @_;
1453: my @lis = $root->getElementsByTagName('li');
1454: foreach my $li (@lis) {
1455: my $type = $li->getAttribute('type');
1456: if (defined $type) {
1457: my $lst = list_style_type($type);
1458: if (defined $lst) {
1459: $li->removeAttribute('type');
1460: if (!defined get_css_property($li, 'list-style-type')) {
1461: set_css_property($li, 'list-style-type', $lst);
1462: }
1463: }
1464: }
1465: }
1466: }
1467:
1468: # returns the CSS list-style-type value equivalent to the given type attribute for a list or list item
1469: sub list_style_type {
1470: my ($type) = @_;
1471: my $value;
1472: $type = trim($type);
1473: if (lc($type) eq 'circle') {
1474: $value = 'circle';
1475: } elsif (lc($type) eq 'disc') {
1476: $value = 'disc';
1477: } elsif (lc($type) eq 'square') {
1478: $value = 'square';
1479: } elsif ($type eq 'a') {
1480: $value = 'lower-latin';
1481: } elsif ($type eq 'A') {
1482: $value = 'upper-latin';
1483: } elsif ($type eq 'i') {
1484: $value = 'lower-roman';
1485: } elsif ($type eq 'I') {
1486: $value = 'upper-roman';
1487: } elsif ($type eq '1') {
1488: $value = 'decimal';
1489: }
1490: return $value;
1491: }
1492:
1493: # Replaces deprecated attributes in hr
1494: sub fix_deprecated_in_hr {
1495: my ($root) = @_;
1496: my @hrs = $root->getElementsByTagName('hr');
1497: foreach my $hr (@hrs) {
1498: tie (my %new_properties, 'Tie::IxHash', ());
1499: my $align = $hr->getAttribute('align');
1500: if (defined $align) {
1501: $align = lc(trim($align));
1502: if ($align eq 'left') {
1503: $new_properties{'text-align'} = 'left';
1504: $new_properties{'margin-left'} = '0';
1505: } elsif ($align eq 'right') {
1506: $new_properties{'text-align'} = 'right';
1507: $new_properties{'margin-right'} = '0';
1508: }
1509: $hr->removeAttribute('align');
1510: }
1511: my $color = $hr->getAttribute('color');
1512: if (defined $color) {
1513: $color = trim($color);
1514: $color =~ s/^x\s*//;
1515: if ($color ne '') {
1516: $new_properties{'color'} = $color;
1517: $new_properties{'background-color'} = $color;
1518: }
1519: $hr->removeAttribute('color');
1520: }
1521: my $noshade = $hr->getAttribute('noshade');
1522: my $size = $hr->getAttribute('size');
1523: if (defined $noshade) {
1524: $new_properties{'border-width'} = '0';
1525: if (!defined $color) {
1526: $new_properties{'color'} = 'gray';
1527: $new_properties{'background-color'} = 'gray';
1528: }
1529: if (!defined $size) {
1530: $size = '2';
1531: }
1532: $hr->removeAttribute('noshade');
1533: }
1534: if (defined $size) {
1535: $size = trim($size);
1536: if ($size ne '') {
1537: $new_properties{'height'} = $size.'px';
1538: }
1539: if (defined $hr->getAttribute('size')) {
1540: $hr->removeAttribute('size');
1541: }
1542: }
1543: my $width = $hr->getAttribute('width');
1544: if (defined $width) {
1545: $width = trim($width);
1546: if ($width ne '') {
1547: if ($width !~ /\%$/) {
1548: $width .= 'px';
1549: }
1550: $new_properties{'width'} = $width;
1551: }
1552: $hr->removeAttribute('width');
1553: }
1554: if (scalar(keys %new_properties) > 0) {
1555: set_css_properties($hr, \%new_properties);
1556: }
1557: }
1558: }
1559:
1560: # Replaces deprecated attributes in img
1561: sub fix_deprecated_in_img {
1562: my ($root) = @_;
1563: my @imgs = $root->getElementsByTagName('img');
1564: foreach my $img (@imgs) {
1565: my $old_properties = get_css_properties($img);
1566: tie (my %new_properties, 'Tie::IxHash', ());
1567: my $align = $img->getAttribute('align');
1568: if (defined $align) {
1569: $align = lc(trim($align));
1570: if ($align eq 'middle' || $align eq 'top' || $align eq 'bottom') {
1571: $img->removeAttribute('align');
1572: if (!defined $old_properties->{'vertical-align'}) {
1573: $new_properties{'vertical-align'} = $align;
1574: }
1575: } elsif ($align eq 'left' || $align eq 'right') {
1576: $img->removeAttribute('align');
1577: if (!defined $old_properties->{'float'}) {
1578: $new_properties{'float'} = $align;
1579: }
1580: } elsif ($align eq 'center' || $align eq '') {
1581: $img->removeAttribute('align');
1582: }
1583: }
1584: my $border = $img->getAttribute('border');
1585: if (defined $border) {
1586: $border = lc(trim($border));
1587: if ($border =~ /^[0-9]+\s*(px)?$/) {
1588: $img->removeAttribute('border');
1589: if (!defined $old_properties->{'border'}) {
1590: if ($border !~ /px$/) {
1591: $border .= 'px';
1592: }
1593: $new_properties{'border'} = $border.' solid black';
1594: }
1595: }
1596: }
1597: my $hspace = $img->getAttribute('hspace');
1598: if (defined $hspace) {
1599: $hspace = lc(trim($hspace));
1600: if ($hspace =~ /^[0-9]+\s*(px)?$/) {
1601: $img->removeAttribute('hspace');
1602: if (!defined $old_properties->{'margin-left'} || !defined $old_properties->{'margin-right'}) {
1603: if ($hspace !~ /px$/) {
1604: $hspace .= 'px';
1605: }
1606: $new_properties{'margin-left'} = $hspace;
1607: $new_properties{'margin-right'} = $hspace;
1608: }
1609: }
1610: }
1611: if (scalar(keys %new_properties) > 0) {
1612: set_css_properties($img, \%new_properties);
1613: }
1614: }
1615: }
1616:
1617: # Replaces deprecated attributes in htmlbody (the style attribute could be used in a div for output)
1618: sub fix_deprecated_in_body {
1619: my ($root) = @_;
1620: my $doc = $root->ownerDocument;
1621: my @bodies = $root->getElementsByTagName('htmlbody');
1622: foreach my $body (@bodies) {
1623: my $old_properties = get_css_properties($body);
1624: tie (my %new_properties, 'Tie::IxHash', ());
1625: my $bgcolor = $body->getAttribute('bgcolor');
1626: if (defined $bgcolor) {
1627: $body->removeAttribute('bgcolor');
1628: if (!defined $old_properties->{'background-color'}) {
1629: $bgcolor = trim($bgcolor);
1630: $bgcolor =~ s/^x\s*//;
1631: if ($bgcolor ne '') {
1632: $new_properties{'background-color'} = $bgcolor;
1633: }
1634: }
1635: }
1636: my $color = $body->getAttribute('text');
1637: if (defined $color) {
1638: $body->removeAttribute('text');
1639: if (!defined $old_properties->{'color'}) {
1640: $color = trim($color);
1641: $color =~ s/^x\s*//;
1642: if ($color ne '') {
1643: $new_properties{'color'} = $color;
1644: }
1645: }
1646: }
1647: my $background = $body->getAttribute('background');
1648: if (defined $background && ($background =~ /\.jpe?g$|\.gif|\.png/i)) {
1649: $body->removeAttribute('background');
1650: if (!defined $old_properties->{'background-image'}) {
1651: $background = trim($background);
1652: if ($background ne '') {
1653: $new_properties{'background-image'} = 'url('.$background.')';
1654: }
1655: }
1656: }
1657: # NOTE: these attributes have never been standard and are better removed with no replacement
1658: foreach my $bad ('bottommargin', 'leftmargin', 'rightmargin', 'topmargin', 'marginheight', 'marginwidth') {
1659: if ($body->hasAttribute($bad)) {
1660: $body->removeAttribute($bad);
1661: }
1662: }
1663: # NOTE: link alink and vlink require a <style> block to be converted
1664: my $link = $body->getAttribute('link');
1665: my $alink = $body->getAttribute('alink');
1666: my $vlink = $body->getAttribute('vlink');
1667: if (defined $link || defined $alink || defined $vlink) {
1668: my $head;
1669: my @heads = $root->getElementsByTagName('htmlhead');
1670: if (scalar(@heads) > 0) {
1671: $head = $heads[0];
1672: } else {
1673: $head = $doc->createElement('htmlhead');
1674: $root->insertBefore($head, $root->firstChild);
1675: }
1676: my $style = $doc->createElement('style');
1677: $head->appendChild($style);
1678: my $css = "\n";
1679: if (defined $link) {
1680: $body->removeAttribute('link');
1681: $link = trim($link);
1682: $link =~ s/^x\s*//;
1683: $css .= ' a:link { color:'.$link.' }';
1684: $css .= "\n";
1685: }
1686: if (defined $alink) {
1687: $body->removeAttribute('alink');
1688: $alink = trim($alink);
1689: $alink =~ s/^x\s*//;
1690: $css .= ' a:active { color:'.$alink.' }';
1691: $css .= "\n";
1692: }
1693: if (defined $vlink) {
1694: $body->removeAttribute('vlink');
1695: $vlink = trim($vlink);
1696: $vlink =~ s/^x\s*//;
1697: $css .= ' a:visited { color:'.$vlink.' }';
1698: $css .= "\n";
1699: }
1700: $css .= ' ';
1701: $style->appendChild($doc->createTextNode($css));
1702: }
1703: if (scalar(keys %new_properties) > 0) {
1704: set_css_properties($body, \%new_properties);
1705: } elsif (!$body->hasAttributes) {
1706: $body->parentNode->removeChild($body);
1707: }
1708: }
1709: }
1710:
1711: # replaces <div align="center"> by <div style="text-align:center;">
1712: # also for p and h1..h6
1713: sub fix_align_attribute {
1714: my ($root) = @_;
1715: my @nodes = $root->getElementsByTagName('div');
1716: push(@nodes, $root->getElementsByTagName('p'));
1717: for (my $i=1; $i<=6; $i++) {
1718: push(@nodes, $root->getElementsByTagName('h'.$i));
1719: }
1720: foreach my $node (@nodes) {
1721: my $align = $node->getAttribute('align');
1722: if (defined $align) {
1723: $node->removeAttribute('align');
1724: $align = trim($align);
1725: if ($align ne '' && !defined get_css_property($node, 'text-align')) {
1726: set_css_property($node, 'text-align', lc($align));
1727: }
1728: }
1729: }
1730: }
1731:
1732: # replace center by a div or remove it if there is a table inside
1733: sub replace_center {
1734: my ($root, $all_block) = @_;
1735: my $doc = $root->ownerDocument;
1736: my @centers = $root->getElementsByTagName('center');
1737: foreach my $center (@centers) {
1738: if ($center->getChildrenByTagName('table')->size() > 0) { # note: getChildrenByTagName is not DOM (LibXML specific)
1739: replace_by_children($center);
1740: } else {
1741: if ((!defined $center->previousSibling ||
1742: ($center->previousSibling->nodeType == XML_TEXT_NODE && $center->previousSibling->nodeValue =~ /^\s*$/ && !defined $center->previousSibling->previousSibling)) &&
1743: (!defined $center->nextSibling ||
1744: ($center->nextSibling->nodeType == XML_TEXT_NODE && $center->nextSibling->nodeValue =~ /^\s*$/ && !defined $center->nextSibling->nextSibling)) &&
1745: string_in_array(\@accepting_style, $center->parentNode->nodeName)) {
1746: # use CSS on the parent block and replace center by its children
1747: set_css_property($center->parentNode, 'text-align', 'center');
1748: replace_by_children($center);
1749: } else {
1750: # use p or div ? check if there is a block inside
1751: my $found_block = 0;
1752: for (my $child=$center->firstChild; defined $child; $child=$child->nextSibling) {
1753: if ($child->nodeType == XML_ELEMENT_NODE && string_in_array($all_block, $child->nodeName)) {
1754: $found_block = 1;
1755: last;
1756: }
1757: }
1758: my $new_node;
1759: if ($found_block) {
1760: $new_node = $doc->createElement('div');
1761: $new_node->setAttribute('style', 'text-align: center; margin: 0 auto');
1762: } else {
1763: $new_node = $doc->createElement('p');
1764: $new_node->setAttribute('style', 'text-align: center');
1765: }
1766: my $next;
1767: for (my $child=$center->firstChild; defined $child; $child=$next) {
1768: $next = $child->nextSibling;
1769: $center->removeChild($child);
1770: $new_node->appendChild($child);
1771: }
1772: $center->parentNode->replaceChild($new_node, $center);
1773: }
1774: }
1775: }
1776: }
1777:
1778: # replaces <nobr> by <span style="white-space:nowrap">
1779: sub replace_nobr {
1780: my ($root) = @_;
1781: my @nobrs = $root->getElementsByTagName('nobr');
1782: foreach my $nobr (@nobrs) {
1783: if (!defined $nobr->previousSibling && !defined $nobr->nextSibling &&
1784: string_in_array(\@accepting_style, $nobr->parentNode->nodeName)) {
1785: # use CSS on the parent block
1786: set_css_property($nobr->parentNode, 'white-space', 'nowrap');
1787: replace_by_children($nobr);
1788: } else {
1789: $nobr->setNodeName('span');
1790: $nobr->setAttribute('style', 'white-space:nowrap');
1791: }
1792: }
1793: }
1794:
1795: # removes notsolved tags in the case <hintgroup showoncorrect="no"><notsolved>...</notsolved></hintgroup>
1796: # and in the case <notsolved><hintgroup showoncorrect="no">...</hintgroup></notsolved>
1797: sub remove_useless_notsolved {
1798: my ($root) = @_;
1799: my @hintgroups = $root->getElementsByTagName('hintgroup');
1800: foreach my $hintgroup (@hintgroups) {
1801: my $showoncorrect = get_non_empty_attribute($hintgroup, 'showoncorrect');
1802: if (!defined $showoncorrect || $showoncorrect eq 'no') {
1803: my @notsolveds = $hintgroup->getElementsByTagName('notsolved');
1804: foreach my $notsolved (@notsolveds) {
1805: replace_by_children($notsolved);
1806: }
1807: }
1808: my $parent = $hintgroup->parentNode;
1809: if ($parent->nodeName eq 'notsolved' && scalar(@{$parent->nonBlankChildNodes()}) == 1) {
1810: replace_by_children($parent);
1811: }
1812: }
1813: }
1814:
1.9 ! damieng 1815: # Use <pre> for multi-line comments without elements.
! 1816: sub fix_comments {
! 1817: my ($root) = @_;
! 1818: my $doc = $root->ownerDocument;
! 1819: my @comments = $root->getElementsByTagName('comment');
! 1820: foreach my $comment (@comments) {
! 1821: my $first = $comment->firstChild;
! 1822: if (defined $first) {
! 1823: if ($first->nodeType == XML_TEXT_NODE && $first->nodeValue =~ /\n/ &&
! 1824: !defined $first->nextSibling) {
! 1825: my $pre = $doc->createElement('pre');
! 1826: $comment->removeChild($first);
! 1827: $comment->appendChild($pre);
! 1828: $pre->appendChild($first);
! 1829: }
! 1830: }
! 1831: }
! 1832: }
! 1833:
1.1 damieng 1834: # adds a paragraph inside if needed and calls fix_paragraph for all paragraphs (including new ones)
1835: sub fix_paragraphs_inside {
1836: my ($node, $all_block) = @_;
1837: # blocks in which paragrahs will be added:
1.8 damieng 1838: my @blocks_with_p = ('loncapa','library','problem','part','problemtype','window','block','while','postanswerdate','preduedate','languageblock','instructorcomment','togglebox','standalone','body','form');
1839: my @fix_p_if_br_or_p = (@responses,'foil','item','text','label','hintgroup','hintpart','hint','web','windowlink','div','li','dd','td','th','blockquote','solved','notsolved');
1.1 damieng 1840: if ((string_in_array(\@blocks_with_p, $node->nodeName) && paragraph_needed($node)) ||
1841: (string_in_array(\@fix_p_if_br_or_p, $node->nodeName) && paragraph_inside($node))) {
1842: # if non-empty, add paragraphs where needed between all br and remove br
1843: # (it would be easier to just put everything in a p and fix it afterwards, but there are performance issues
1844: # when a paragraph has many blocks directly inside)
1845: my $doc = $node->ownerDocument;
1846: my $p = undef;
1847: my @new_children = ();
1848: my $next;
1849: for (my $child=$node->firstChild; defined $child; $child=$next) {
1850: $next = $child->nextSibling;
1851: $node->removeChild($child);
1852: if ($child->nodeType == XML_ELEMENT_NODE && $child->nodeName eq 'br') {
1853: if (defined $p) {
1854: push(@new_children, $p);
1855: } else {
1856: push(@new_children, $doc->createElement('p'));
1857: }
1858: $p = undef;
1859: } elsif ($child->nodeType == XML_ELEMENT_NODE && string_in_array(\@inline_like_block, $child->nodeName)) {
1860: # inline_like_block: use the paragraph if there is one, otherwise do not create one
1861: if (defined $p) {
1862: $p->appendChild($child);
1863: } else {
1864: push(@new_children, $child);
1865: }
1866: } elsif ($child->nodeType == XML_ELEMENT_NODE && string_in_array($all_block, $child->nodeName)) {
1867: # these children are blocks and should not be in a paragraph
1868: if (defined $p) {
1869: push(@new_children, $p);
1870: $p = undef;
1871: }
1872: push(@new_children, $child);
1873: } elsif ($child->nodeType == XML_TEXT_NODE && $child->nodeValue =~ /^[ \t\f\n\r]*$/) {
1874: # blank text: add to paragraph if there is one and there is a next node, otherwise keep out of the paragraph
1875: if (defined $p) {
1876: if (defined $next) {
1877: $p->appendChild($child);
1878: } else {
1879: push(@new_children, $p);
1880: $p = undef;
1881: push(@new_children, $child);
1882: }
1883: } else {
1884: push(@new_children, $child);
1885: }
1886: } elsif ($child->nodeType == XML_TEXT_NODE ||
1887: $child->nodeType == XML_ELEMENT_NODE || $child->nodeType == XML_CDATA_SECTION_NODE ||
1888: $child->nodeType == XML_ENTITY_NODE || $child->nodeType == XML_ENTITY_REF_NODE) {
1889: # these children require a paragraph
1890: if (!defined $p) {
1891: $p = $doc->createElement('p');
1892: }
1893: $p->appendChild($child);
1894: } else {
1895: # these children do not require a paragraph (XML comments, PI)
1896: # -> do not move them in a new paragraph
1897: if (defined $p) {
1898: push(@new_children, $p);
1899: $p = undef;
1900: }
1901: push(@new_children, $child);
1902: }
1903: }
1904: if (defined $p) {
1905: push(@new_children, $p);
1906: }
1907: foreach my $child (@new_children) {
1908: $node->appendChild($child);
1909: }
1910: }
1911: # now fix the paragraphs everywhere, so that all inline nodes are inside a paragraph, and block nodes are outside
1912: my $next;
1913: for (my $child=$node->firstChild; defined $child; $child=$next) {
1914: $next = $child->nextSibling;
1915: if ($child->nodeType == XML_ELEMENT_NODE && defined $child->firstChild) {
1916: if ($child->nodeName eq 'p') {
1917: fix_paragraph($child, $all_block);
1918: } else {
1919: fix_paragraphs_inside($child, $all_block);
1920: }
1921: }
1922: }
1923: }
1924:
1925: # returns 1 if a paragraph is needed inside this node (assuming the parent can have paragraphs)
1926: sub paragraph_needed {
1927: my ($node) = @_;
1928: for (my $child=$node->firstChild; defined $child; $child=$child->nextSibling) {
1929: if (($child->nodeType == XML_TEXT_NODE && $child->nodeValue !~ /^\s*$/) ||
1930: ($child->nodeType == XML_ELEMENT_NODE && !string_in_array(\@inline_like_block, $child->nodeName)) ||
1931: $child->nodeType == XML_CDATA_SECTION_NODE ||
1932: $child->nodeType == XML_ENTITY_NODE || $child->nodeType == XML_ENTITY_REF_NODE) {
1933: return(1);
1934: }
1935: }
1936: return(0);
1937: }
1938:
1939: # returns 1 if there is a paragraph or br in a child of this node, or inside an inline child
1940: sub paragraph_inside {
1941: my ($node) = @_;
1942: # inline elements that can be split in half if there is a paragraph inside (currently all HTML):
1943: # (also used in first_block below)
1944: my @splitable_inline = ('span', 'a', 'strong', 'em' , 'b', 'i', 'sup', 'sub', 'code', 'kbd', 'samp', 'tt', 'ins', 'del', 'var', 'small', 'big', 'font', 'u');
1945: for (my $child=$node->firstChild; defined $child; $child=$child->nextSibling) {
1946: if ($child->nodeType == XML_ELEMENT_NODE) {
1947: my $name = $child->nodeName;
1948: if ($name eq 'p' || $name eq 'br') {
1949: return(1);
1950: } elsif (string_in_array(\@splitable_inline, $name)) {
1951: if (paragraph_inside($child)) {
1952: return(1);
1953: }
1954: }
1955: }
1956: }
1957: return(0);
1958: }
1959:
1960: # fixes paragraphs inside paragraphs (without a block in-between)
1961: sub fix_paragraph {
1962: my ($p, $all_block) = @_;
1963: my $loop_right = 1; # this loops is to avoid out of memory errors with recurse, see below
1964: while ($loop_right) {
1965: $loop_right = 0;
1966: my $block = find_first_block($p, $all_block);
1967: if (defined $block) {
1968: my $trees = clone_ancestor_around_node($p, $block);
1969: my $doc = $p->ownerDocument;
1970: my $replacement = $doc->createDocumentFragment();
1971: my $left = $trees->{'left'};
1972: my $middle = $trees->{'middle'};
1973: my $right = $trees->{'right'};
1974: my $left_needs_p = 0; # 1 if it needs a paragraph (used to replace br later)
1975:
1976: if (defined $left) {
1977: # fix paragraphs inside, in case one of the descendants can have paragraphs inside (like numericalresponse/hintgroup):
1978: for (my $child=$left->firstChild; defined $child; $child=$child->nextSibling) {
1979: if ($child->nodeType == XML_ELEMENT_NODE) {
1980: fix_paragraphs_inside($child, $all_block);
1981: }
1982: }
1983: if (!paragraph_needed($left)) {
1984: # this was just blank text, comments or inline responses, it should not create a new paragraph
1985: my $next;
1986: for (my $child=$left->firstChild; defined $child; $child=$next) {
1987: $next = $child->nextSibling;
1988: $left->removeChild($child);
1989: $replacement->appendChild($child);
1990: }
1991: } else {
1992: $left_needs_p = 1;
1993: $replacement->appendChild($left);
1994: }
1995: }
1996:
1997: my $n = $middle->firstChild;
1998: while (defined $n) {
1999: if ($n->nodeType == XML_ELEMENT_NODE && (string_in_array($all_block, $n->nodeName) || $n->nodeName eq 'br')) {
2000: if ($n->nodeName eq 'p') {
2001: my $parent = $n->parentNode;
2002: # first apply recursion
2003: fix_paragraph($n, $all_block);
2004: # now the p might have been replaced by several nodes, which should replace the initial p
2005: my $next_block;
2006: for (my $block=$parent->firstChild; defined $block; $block=$next_block) {
2007: $next_block = $block->nextSibling;
2008: if ($block->nodeName eq 'p') {
2009: $parent->removeChild($block);
2010: # for each parent before $middle, clone in-between the p and its children (to preserve the styles)
2011: if (defined $block->firstChild) {
2012: for (my $p=$parent; $p!=$middle; $p=$p->parentNode) {
2013: my $newp = $p->cloneNode(0);
2014: my $next;
2015: for (my $child=$block->firstChild; defined $child; $child=$next) {
2016: $next = $child->nextSibling;
2017: $block->removeChild($child);
2018: $newp->appendChild($child);
2019: }
2020: $block->appendChild($newp);
2021: }
2022: }
2023: }
2024: $replacement->appendChild($block);
2025: }
2026: } else {
2027: # replace the whole p by this block, forgetting about intermediate inline elements
2028: $n->parentNode->removeChild($n);
2029: if ($n->nodeName eq 'br') {
2030: # replace a br by a paragraph if there was nothing before in the paragraph,
2031: # otherwise remove it because it already broke the paragraph in half
2032: if (!defined $left || !$left_needs_p) {
2033: $replacement->appendChild($middle);
2034: }
2035: } else {
2036: fix_paragraphs_inside($n, $all_block);
2037: $replacement->appendChild($n);
2038: }
2039: }
2040: last;
2041: }
2042: $n = $n->firstChild;
2043: if (defined $n && defined $n->nextSibling) {
2044: die "Error in post_xml.fix_paragraph: block not found";
2045: }
2046: }
2047:
2048: if (defined $right) {
2049: if ($block->nodeName eq 'p') {
2050: # remove attributes on the right paragraph
2051: my @attributelist = $right->attributes();
2052: foreach my $att (@attributelist) {
2053: $right->removeAttribute($att->nodeName);
2054: }
2055: }
2056: if ($right->firstChild->nodeType == XML_TEXT_NODE && $right->firstChild->nodeValue =~ /^[ \t\f\n\r]*$/) {
2057: # remove the first text node with whitespace only from the p, it should not trigger the creation of a p
2058: # (but take nbsp into account, so we should not use \s here)
2059: my $first = $right->firstChild;
2060: $right->removeChild($first);
2061: $replacement->appendChild($first);
2062: }
2063: if (defined $right->firstChild) {
2064: if (paragraph_needed($right)) {
2065: $replacement->appendChild($right);
2066: #fix_paragraph($right, $all_block); This is taking way too much memory for blocks with many children
2067: # -> loop instead of recurse
2068: $loop_right = 1;
2069: } else {
2070: # this was just blank text, comments or inline responses, it should not create a new paragraph
2071: my $next;
2072: for (my $child=$right->firstChild; defined $child; $child=$next) {
2073: $next = $child->nextSibling;
2074: $right->removeChild($child);
2075: $replacement->appendChild($child);
2076: # fix paragraphs inside, in case one of the descendants can have paragraphs inside (like numericalresponse/hintgroup):
2077: if ($child->nodeType == XML_ELEMENT_NODE) {
2078: fix_paragraphs_inside($child, $all_block);
2079: }
2080: }
2081: }
2082: }
2083: }
2084:
2085: $p->parentNode->replaceChild($replacement, $p);
2086:
2087: if ($loop_right) {
2088: $p = $right;
2089: }
2090:
2091: } else {
2092: # fix paragraphs inside, in case one of the descendants can have paragraphs inside (like numericalresponse/hintgroup):
2093: my $next;
2094: for (my $child=$p->firstChild; defined $child; $child=$next) {
2095: $next = $child->nextSibling;
2096: if ($child->nodeType == XML_ELEMENT_NODE) {
2097: fix_paragraphs_inside($child, $all_block);
2098: }
2099: }
2100: }
2101: }
2102: }
2103:
2104: sub find_first_block {
2105: my ($node, $all_block) = @_;
2106: # inline elements that can be split in half if there is a paragraph inside (currently all HTML):
2107: my @splitable_inline = ('span', 'a', 'strong', 'em' , 'b', 'i', 'sup', 'sub', 'code', 'kbd', 'samp', 'tt', 'ins', 'del', 'var', 'small', 'big', 'font', 'u');
2108: for (my $child=$node->firstChild; defined $child; $child=$child->nextSibling) {
2109: if ($child->nodeType == XML_ELEMENT_NODE) {
2110: if (string_in_array($all_block, $child->nodeName) || $child->nodeName eq 'br') {
2111: return($child);
2112: }
2113: if (string_in_array(\@splitable_inline, $child->nodeName)) {
2114: my $block = find_first_block($child, $all_block);
2115: if (defined $block) {
2116: return($block);
2117: }
2118: }
2119: }
2120: }
2121: return(undef);
2122: }
2123:
2124: # Creates clones of the ancestor containing the descendants before the node, at the node, and after the node.
2125: # returns a hash with: left, middle, right (left and right can be undef)
2126: sub clone_ancestor_around_node {
2127: my ($ancestor, $node) = @_;
2128: my $middle_node;
2129: my ($left, $middle, $right);
2130: for (my $child=$ancestor->firstChild; defined $child; $child=$child->nextSibling) {
2131: if ($child == $node || is_ancestor_of($child, $node)) {
2132: $middle_node = $child;
2133: last;
2134: }
2135: }
2136: if (!defined $middle_node) {
2137: die "error in split_ancestor_around_node: middle not found";
2138: }
2139: if (defined $middle_node->previousSibling) {
2140: $left = $ancestor->cloneNode(0);
2141: for (my $child=$ancestor->firstChild; $child != $middle_node; $child=$child->nextSibling) {
2142: $left->appendChild($child->cloneNode(1));
2143: }
2144: }
2145: $middle = $ancestor->cloneNode(0);
2146: if ($middle_node == $node) {
2147: $middle->appendChild($middle_node->cloneNode(1));
2148: } else {
2149: my $subres = clone_ancestor_around_node($middle_node, $node);
2150: my $subleft = $subres->{'left'};
2151: if (defined $subleft) {
2152: if (!defined $left) {
2153: $left = $ancestor->cloneNode(0);
2154: }
2155: $left->appendChild($subleft);
2156: }
2157: $middle->appendChild($subres->{'middle'});
2158: my $subright = $subres->{'right'};
2159: if (defined $subright) {
2160: $right = $ancestor->cloneNode(0);
2161: $right->appendChild($subright);
2162: }
2163: }
2164: if (defined $middle_node->nextSibling) {
2165: if (!defined $right) {
2166: $right = $ancestor->cloneNode(0);
2167: }
2168: for (my $child=$middle_node->nextSibling; defined $child; $child=$child->nextSibling) {
2169: $right->appendChild($child->cloneNode(1));
2170: }
2171: }
2172: my %result = ();
2173: $result{'left'} = $left;
2174: $result{'middle'} = $middle;
2175: $result{'right'} = $right;
2176: return(\%result);
2177: }
2178:
2179: sub is_ancestor_of {
2180: my ($n1, $n2) = @_;
2181: my $n = $n2->parentNode;
2182: while (defined $n) {
2183: if ($n == $n1) {
2184: return(1);
2185: }
2186: $n = $n->parentNode;
2187: }
2188: return(0);
2189: }
2190:
2191: # removes empty style elements and replaces the ones with only whitespaces inside by their content
2192: # also remove hints that have become empty after empty style removal.
2193: sub remove_empty_style {
2194: my ($root) = @_;
2195: # actually, preserve some elements like ins when they have whitespace, only remove if they are empty
2196: my @remove_if_empty = ('span', 'strong', 'em' , 'b', 'i', 'sup', 'sub', 'code', 'kbd', 'samp', 'tt', 'ins', 'del', 'var', 'small', 'big', 'font', 'u', 'hint');
2197: my @remove_if_blank = ('span', 'strong', 'em' , 'b', 'i', 'sup', 'sub', 'tt', 'var', 'small', 'big', 'font', 'u', 'hint');
2198: foreach my $name (@remove_if_empty) {
2199: my @nodes = $root->getElementsByTagName($name);
2200: while (scalar(@nodes) > 0) {
2201: my $node = pop(@nodes);
2202: if (!defined $node->firstChild) {
2203: my $parent = $node->parentNode;
2204: if (defined $node->previousSibling && $node->previousSibling->nodeType == XML_TEXT_NODE &&
2205: $node->previousSibling->nodeValue =~ /\$\S*$/) {
2206: # case $a<sup></sup>x
2207: my $value = $node->previousSibling->nodeValue;
2208: $value =~ s/\$(\S*)$/\$\{$1\}/;
2209: $node->previousSibling->setData($value);
2210: }
2211: $parent->removeChild($node);
2212: $parent->normalize();
2213: # now that we removed the node, check if the parent has become an empty style, and so on
2214: while (defined $parent && string_in_array(\@remove_if_empty, $parent->nodeName) && !defined $parent->firstChild) {
2215: my $grandparent = $parent->parentNode;
2216: $grandparent->removeChild($parent);
2217: remove_reference_from_array(\@nodes, $parent);
2218: $parent = $grandparent;
2219: }
2220: }
2221: }
2222: }
2223: foreach my $name (@remove_if_blank) {
2224: my @nodes = $root->getElementsByTagName($name);
2225: while (scalar(@nodes) > 0) {
2226: my $node = pop(@nodes);
2227: if (defined $node->firstChild && !defined $node->firstChild->nextSibling && $node->firstChild->nodeType == XML_TEXT_NODE) {
2228: # NOTE: careful, with UTF-8, \s matches non-breaking spaces and we want to preserve these
2229: if ($node->firstChild->nodeValue =~ /^[\t\n\f\r ]*$/) {
2230: my $parent = $node->parentNode;
2231: replace_by_children($node);
2232: $parent->normalize();
2233: # now that we removed the node, check if the parent has become a style with only whitespace, and so on
2234: while (defined $parent && string_in_array(\@remove_if_blank, $parent->nodeName) &&
2235: (!defined $parent->firstChild ||
2236: (!defined $parent->firstChild->nextSibling && $parent->firstChild->nodeType == XML_TEXT_NODE &&
2237: $parent->firstChild->nodeValue =~ /^^[\t\n\f\r ]*$/))) {
2238: my $grandparent = $parent->parentNode;
2239: replace_by_children($parent);
2240: remove_reference_from_array(\@nodes, $parent);
2241: $parent = $grandparent;
2242: }
2243: }
2244: }
2245: }
2246: }
2247: }
2248:
2249: # remove whitespace inside LON-CAPA elements that have an empty content-model (HTML ones are handled by html_to_xml)
2250: sub fix_empty_lc_elements {
2251: my ($node) = @_;
2252: my @lcempty = ('arc','axis','backgroundplot','drawoptionlist','drawvectorsum','fill','functionplotrule','functionplotvectorrule','functionplotvectorsumrule','hiddenline','hiddensubmission','key','line','location','organicstructure','parameter','plotobject','plotvector','responseparam','spline','textline');
2253: if (string_in_array(\@lcempty, $node->nodeName)) {
2254: if (defined $node->firstChild && !defined $node->firstChild->nextSibling &&
2255: $node->firstChild->nodeType == XML_TEXT_NODE && $node->firstChild->nodeValue =~ /^\s*$/) {
2256: $node->removeChild($node->firstChild);
2257: }
2258: if (defined $node->firstChild) {
2259: if ($warnings) {
2260: print "Warning: a ".$node->nodeName." has something inside\n";
2261: }
2262: }
2263: return;
2264: }
2265: for (my $child=$node->firstChild; defined $child; $child=$child->nextSibling) {
2266: if ($child->nodeType == XML_ELEMENT_NODE) {
2267: fix_empty_lc_elements($child);
2268: }
2269: }
2270: }
2271:
2272: # turn some attribute values into lowercase when they should be
2273: sub lowercase_attribute_values {
2274: my ($root) = @_;
2275: my @with_yesno = (['radiobuttonresponse', ['randomize']],
2276: ['optionresponse', ['randomize']],
2277: ['matchresponse', ['randomize']],
2278: ['itemgroup', ['randomize']],
2279: ['rankresponse', ['randomize']],
2280: ['functionplotresponse', ['xaxisvisible', 'yaxisvisible', 'gridvisible']],
2281: ['backgroundplot', ['fixed']],
2282: ['drawvectorsum', ['showvalue']],
2283: ['textline', ['readonly']],
2284: ['hint', ['showoncorrect']],
2285: ['body', ['dir']],
2286: ['img', ['encrypturl']],
2287: ['form', ['method']],
2288: ['input', ['type']]
2289: );
2290: foreach my $el_attributes (@with_yesno) {
2291: my $el_name = $el_attributes->[0];
2292: my @elements = $root->getElementsByTagName($el_name);
2293: foreach my $element (@elements) {
2294: my $att_list = $el_attributes->[1];
2295: foreach my $att_name (@$att_list) {
2296: my $att_value = $element->getAttribute($att_name);
2297: if (!defined $att_value) {
2298: next;
2299: }
2300: if ($att_value eq 'yes' || $att_value eq 'no') {
2301: next;
2302: }
2303: if ($att_value =~ /\s*yes\s*/i) {
2304: $element->setAttribute($att_name, 'yes');
2305: } elsif ($att_value =~ /\s*no\s*/i) {
2306: $element->setAttribute($att_name, 'no');
2307: }
2308: }
2309: }
2310: }
2311: }
2312:
2313: # fixes spelling mistakes for numericalresponse/@unit
2314: sub replace_numericalresponse_unit_attribute {
2315: my ($root) = @_;
2316: my @numericalresponses = $root->getElementsByTagName('numericalresponse');
2317: foreach my $numericalresponse (@numericalresponses) {
2318: if (defined $numericalresponse->getAttribute('units') && !defined $numericalresponse->getAttribute('unit')) {
2319: $numericalresponse->setAttribute('unit', $numericalresponse->getAttribute('units'));
2320: $numericalresponse->removeAttribute('units');
2321: }
2322: }
2323:
2324: }
2325:
2326: # Replaces &format and &prettyprint by <num> whenever possible.
2327: # Also replaces &chemparse by <chem>.
2328: # If the function call is enclosed in <display>, the <display> element is removed.
2329: sub replace_functions_by_elements {
2330: my ($root) = @_;
2331: my $doc = $root->ownerDocument;
2332: my @preserve = ('script','answer','parse','m','tm','dtm','numericalhintscript'); # display is handled later
2333: my @all = $root->getElementsByTagName('*');
2334: foreach my $element (@all) {
2335: if (string_in_array(\@preserve, $element->nodeName)) {
2336: next;
2337: }
2338: my $changed = 0;
2339: my $next;
2340: for (my $child=$element->firstChild; defined $child; $child=$next) {
2341: $next = $child->nextSibling;
2342: if ($child->nodeType == XML_TEXT_NODE) {
2343: my $value = $child->nodeValue;
2344: if ($value =~ /^(.*)&(?:format|prettyprint)\((\$\{?[a-zA-Z0-9]*\}?(?:\[[^\]]*\])?|[0-9.]+)\s?,\s?(["'][,.\$]?[0-9][eEfFgGsS]["']|\$[a-zA-Z0-9]*)\)(.*)$/s) {
2345: # NOTE: we don't check for &prettyprint's 3rd argument (target), but it has not been seen outside of script elements.
2346: # NOTE: the format options ',' and '$' are not supported by &format in current LON-CAPA since rev 1.81 of default_homework.lcpm,
2347: # but are supported by &prettyprint;
2348: # if we use (like current LON-CAPA) &prettyprint for <num> implementation, it will change a few resulting documents
2349: # (by making them display something they were probably intended to display, but which did not).
2350: # Usage of <num> with &prettyprint instead of &format might also change the display when there is an exponent.
2351: my $before = $1;
2352: my $number = $2;
2353: my $format = $3;
2354: my $after = $4;
2355: $format =~ s/^['"]|['"]$//g;
2356: # do not change this if the parent is <display> and there are other things before or after &format
2357: if ($element->nodeName eq 'display' && (defined $child->previousSibling || defined $next ||
2358: $before !~ /^\s*$/ || $after !~ /^\s*$/)) {
2359: last;
2360: }
2361: my $replacement = $doc->createDocumentFragment();
2362: my $num = $doc->createElement('num');
2363: $num->setAttribute('format', $format);
2364: $num->appendChild($doc->createTextNode($number));
2365: if (length($before) > 0) {
2366: $replacement->appendChild($doc->createTextNode($before));
2367: }
2368: $replacement->appendChild($num);
2369: if (length($after) > 0) {
2370: $replacement->appendChild($doc->createTextNode($after));
2371: }
2372: $element->replaceChild($replacement, $child);
2373: $changed = 1;
2374: $next = $element->firstChild; # start over, there might be another &format in the same text node
2375: } elsif ($value =~ /^(.*)&chemparse\(([^'"()]*|'[^']*'|"[^"]*")\)(.*)$/s) {
2376: my $before = $1;
2377: my $reaction = $2;
2378: my $after = $3;
2379: $reaction =~ s/^'(.*)'$/$1/;
2380: $reaction =~ s/^"(.*)"$/$1/;
2381: if ($element->nodeName eq 'display' && (defined $child->previousSibling || defined $next ||
2382: $before !~ /^\s*$/ || $after !~ /^\s*$/)) {
2383: last;
2384: }
2385: my $replacement = $doc->createDocumentFragment();
2386: my $chem = $doc->createElement('chem');
2387: $chem->appendChild($doc->createTextNode($reaction));
2388: if (length($before) > 0) {
2389: $replacement->appendChild($doc->createTextNode($before));
2390: }
2391: $replacement->appendChild($chem);
2392: if (length($after) > 0) {
2393: $replacement->appendChild($doc->createTextNode($after));
2394: }
2395: $element->replaceChild($replacement, $child);
2396: $changed = 1;
2397: $next = $element->firstChild;
2398: }
2399: }
2400: }
2401: if ($changed && $element->nodeName eq 'display') {
2402: my $first = $element->firstChild;
2403: if ($first->nodeType == XML_ELEMENT_NODE && string_in_array(['num','chem'], $first->nodeName) &&
2404: !defined $first->nextSibling) {
2405: # remove useless display element
2406: replace_by_children($element);
2407: }
2408: }
2409: }
2410: }
2411:
2412: # pretty-print using im-memory DOM tree
2413: sub pretty {
2414: my ($node, $all_block, $indent_level) = @_;
2415: my $doc = $node->ownerDocument;
2416: $indent_level ||= 0;
2417: my $type = $node->nodeType;
2418: if ($type == XML_ELEMENT_NODE) {
2419: my $name = $node->nodeName;
1.6 damieng 2420: if (string_in_array(\@preserve_elements, $name)) {
2421: # remove newlines at the beginning and the end of preserve elements
2422: if (defined $node->firstChild && ($node->firstChild->nodeType == XML_TEXT_NODE ||
2423: $node->firstChild->nodeType == XML_CDATA_SECTION_NODE)) {
2424: my $text = $node->firstChild->nodeValue;
2425: $text =~ s/^\n+//;
2426: if ($text eq '') {
2427: $node->removeChild($node->firstChild);
2428: } else {
2429: $node->firstChild->setData($text);
2430: }
2431: }
2432: if (defined $node->lastChild && ($node->lastChild->nodeType == XML_TEXT_NODE ||
2433: $node->lastChild->nodeType == XML_CDATA_SECTION_NODE)) {
2434: my $text = $node->lastChild->nodeValue;
2435: $text =~ s/\n+$//;
2436: if ($text eq '') {
2437: $node->removeChild($node->lastChild);
2438: } else {
2439: $node->lastChild->setData($text);
2440: }
2441: }
2442: } elsif (string_in_array($all_block, $name) || string_in_array(\@inline_like_block, $name)) {
1.1 damieng 2443: # make sure there is a newline at the beginning and at the end if there is anything inside
2444: if (defined $node->firstChild && !string_in_array(\@no_newline_inside, $name)) {
2445: my $first = $node->firstChild;
2446: if ($first->nodeType == XML_TEXT_NODE) {
2447: my $text = $first->nodeValue;
2448: if ($text !~ /^ *\n/) {
2449: $first->setData("\n" . $text);
2450: }
2451: } else {
2452: $node->insertBefore($doc->createTextNode("\n"), $first);
2453: }
2454: my $last = $node->lastChild;
2455: if ($last->nodeType == XML_TEXT_NODE) {
2456: my $text = $last->nodeValue;
2457: if ($text !~ /\n *$/) {
2458: $last->setData($text . "\n");
2459: }
2460: } else {
2461: $node->appendChild($doc->createTextNode("\n"));
2462: }
2463: }
2464:
2465: # indent and make sure there is a newline before and after a block element
2466: my $newline_indent = "\n".(' ' x (2*($indent_level + 1)));
2467: my $newline_indent_last = "\n".(' ' x (2*$indent_level));
2468: my $next;
2469: for (my $child=$node->firstChild; defined $child; $child=$next) {
2470: $next = $child->nextSibling;
2471: if ($child->nodeType == XML_ELEMENT_NODE) {
2472: if (string_in_array($all_block, $child->nodeName) || string_in_array(\@inline_like_block, $child->nodeName)) {
2473: # make sure there is a newline before and after a block element
2474: if (defined $child->previousSibling && $child->previousSibling->nodeType == XML_TEXT_NODE) {
2475: my $prev = $child->previousSibling;
2476: my $text = $prev->nodeValue;
2477: if ($text !~ /\n *$/) {
2478: $prev->setData($text . $newline_indent);
2479: }
2480: } else {
2481: $node->insertBefore($doc->createTextNode($newline_indent), $child);
2482: }
2483: if (defined $next && $next->nodeType == XML_TEXT_NODE) {
2484: my $text = $next->nodeValue;
2485: if ($text !~ /^ *\n/) {
2486: $next->setData($newline_indent . $text);
2487: }
2488: } else {
2489: $node->insertAfter($doc->createTextNode($newline_indent), $child);
2490: }
2491: }
2492: pretty($child, $all_block, $indent_level+1);
2493: } elsif ($child->nodeType == XML_TEXT_NODE) {
2494: my $text = $child->nodeValue;
2495: # collapse newlines
2496: $text =~ s/\n([\t ]*\n)+/\n/g;
2497: # indent and remove spaces and tabs before newlines
2498: if (defined $next) {
2499: $text =~ s/[\t ]*\n[\t ]*/$newline_indent/ge;
2500: } else {
2501: $text =~ s/[\t ]*\n[\t ]*/$newline_indent/ge;
2502: $text =~ s/[\t ]*\n[\t ]*$/$newline_indent_last/e;
2503: }
2504: $child->setData($text);
2505: }
2506: }
2507:
2508: # removes whitespace at the beginning and end of p td, th and li (except for nbsp at the beginning)
2509: my @to_trim = ('p','td','th','li');
2510: if (string_in_array(\@to_trim, $name) && defined $node->firstChild && $node->firstChild->nodeType == XML_TEXT_NODE) {
2511: my $text = $node->firstChild->nodeValue;
2512: $text =~ s/^[ \t\f\n\r]*//;
2513: if ($text eq '') {
2514: $node->removeChild($node->firstChild);
2515: } else {
2516: $node->firstChild->setData($text);
2517: }
2518: }
2519: if (string_in_array(\@to_trim, $name) && defined $node->lastChild && $node->lastChild->nodeType == XML_TEXT_NODE) {
2520: my $text = $node->lastChild->nodeValue;
2521: $text =~ s/\s*$//;
2522: if ($text eq '') {
2523: $node->removeChild($node->lastChild);
2524: } else {
2525: $node->lastChild->setData($text);
2526: }
2527: }
2528: }
2529: }
2530: }
2531:
2532: sub replace_tm_dtm {
2533: my ($root) = @_;
2534: my $doc = $root->ownerDocument;
2535: my @elements = $root->getElementsByTagName('tm');
2536: push(@elements, $root->getElementsByTagName('dtm'));
2537: foreach my $element (@elements) {
2538: my $first = $element->firstChild;
2539: if (defined $first && $first->nodeType == XML_TEXT_NODE) {
2540: my $text = $first->nodeValue;
2541: if ($element->nodeName eq 'tm') {
2542: $first->setData('$'.$text.'$');
2543: } else {
2544: $first->setData('$$'.$text.'$$');
2545: }
2546: }
2547: $element->setNodeName('m');
2548: }
2549: }
2550:
2551:
2552: ######## utilities ########
2553:
2554: ##
2555: # Trims a string (really, this should be built-in in Perl, this is ridiculous, ugly and slow)
2556: # @param {string} s - the string to trim
2557: # @returns the trimmed string
2558: ##
2559: sub trim {
2560: my ($s) = @_;
2561: $s =~ s/^\s+//;
2562: $s =~ s/\s+$//;
2563: return($s);
2564: }
2565:
2566: ##
2567: # Tests if a string is in an array (using eq) (to avoid Smartmatch warnings with $value ~~ @array)
2568: # @param {Array<string>} array - reference to the array of strings
2569: # @param {string} value - the string to look for
2570: # @returns 1 if found, 0 otherwise
2571: ##
2572: sub string_in_array {
2573: my ($array, $value) = @_;
2574: # NOTE: would it be faster to use "any { $value eq $_ } @{$array}" from List::Util ?
2575: foreach my $v (@{$array}) {
2576: if ($v eq $value) {
2577: return 1;
2578: }
2579: }
2580: return 0;
2581: }
2582:
2583: ##
2584: # Tests if an object is in an array (using ==)
2585: # @param {Array<Object>} array - reference to the array of references
2586: # @param {Object} ref - the reference to look for
2587: # @returns 1 if found, 0 otherwise
2588: ##
2589: sub reference_in_array {
2590: my ($array, $ref) = @_;
2591: foreach my $v (@{$array}) {
2592: if ($v == $ref) {
2593: return 1;
2594: }
2595: }
2596: return 0;
2597: }
2598:
2599: ##
2600: # returns the index of a string in an array
2601: # @param {Array<Object>} array - reference to the array of strings
2602: # @param {string} s - the string to look for (using eq)
2603: # @returns the index if found, -1 otherwise
2604: ##
2605: sub index_of_string {
2606: my ($array, $s) = @_;
2607: for (my $i=0; $i<scalar(@{$array}); $i++) {
2608: if ($array->[$i] eq $s) {
2609: return $i;
2610: }
2611: }
2612: return -1;
2613: }
2614:
2615: ##
2616: # returns the index of a reference in an array
2617: # @param {Array<Object>} array - reference to the array of references
2618: # @param {Object} ref - the reference to look for
2619: # @returns the index if found, -1 otherwise
2620: ##
2621: sub index_of_reference {
2622: my ($array, $ref) = @_;
2623: for (my $i=0; $i<scalar(@{$array}); $i++) {
2624: if ($array->[$i] == $ref) {
2625: return $i;
2626: }
2627: }
2628: return -1;
2629: }
2630:
2631: ##
2632: # if found, removes a string from an array, otherwise do nothing
2633: # @param {Array<string>} array - reference to the array of string
2634: # @param {string} s - the string to look for (using eq)
2635: ##
2636: sub remove_string_from_array {
2637: my ($array, $s) = @_;
2638: my $index = index_of_string($array, $s);
2639: if ($index != -1) {
2640: splice(@$array, $index, 1);
2641: }
2642: }
2643:
2644: ##
2645: # if found, removes a reference from an array, otherwise do nothing
2646: # @param {Array<Object>} array - reference to the array of references
2647: # @param {Object} ref - the reference to look for
2648: ##
2649: sub remove_reference_from_array {
2650: my ($array, $ref) = @_;
2651: my $index = index_of_reference($array, $ref);
2652: if ($index != -1) {
2653: splice(@$array, $index, 1);
2654: }
2655: }
2656:
2657: ##
2658: # replaces a node by its children
2659: # @param {Node} node - the DOM node
2660: ##
2661: sub replace_by_children {
2662: my ($node) = @_;
2663: my $parent = $node->parentNode;
2664: my $next;
2665: my $previous;
2666: for (my $child=$node->firstChild; defined $child; $child=$next) {
2667: $next = $child->nextSibling;
2668: if ((!defined $previous || !defined $next) &&
2669: $child->nodeType == XML_TEXT_NODE && $child->nodeValue =~ /^\s*$/) {
2670: next; # do not keep first and last whitespace nodes
2671: } else {
2672: if (!defined $previous && $child->nodeType == XML_TEXT_NODE) {
2673: # remove whitespace at the beginning
2674: my $value = $child->nodeValue;
2675: $value =~ s/^\s+//;
2676: $child->setData($value);
2677: }
2678: if (!defined $next && $child->nodeType == XML_TEXT_NODE) {
2679: # and at the end
2680: my $value = $child->nodeValue;
2681: $value =~ s/\s+$//;
2682: $child->setData($value);
2683: }
2684: }
2685: $node->removeChild($child);
2686: $parent->insertBefore($child, $node);
2687: $previous = $child;
2688: }
2689: $parent->removeChild($node);
2690: }
2691:
2692: ##
2693: # returns the trimmed attribute value if the attribute exists and is not blank, undef otherwise
2694: # @param {Node} node - the DOM node
2695: # @param {string} attribute_name - the attribute name
2696: ##
2697: sub get_non_empty_attribute {
2698: my ($node, $attribute_name) = @_;
2699: my $value = $node->getAttribute($attribute_name);
2700: if (defined $value && $value !~ /^\s*$/) {
2701: $value = trim($value);
2702: return($value);
2703: }
2704: return(undef);
2705: }
2706:
2707: ##
2708: # Returns a CSS property value from the style attribute of the element, or undef if not defined
2709: # @param {Element} el - the DOM element
2710: # @param {string} property_name - the CSS property name
2711: ##
2712: sub get_css_property {
2713: my ($el, $property_name) = @_;
2714: my $style = $el->getAttribute('style');
2715: if (defined $style) {
2716: $style =~ s/^\s*;\s*//;
2717: $style =~ s/\s*;\s*$//;
2718: } else {
2719: $style = '';
2720: }
2721: my @pairs = split(';', $style);
2722: foreach my $pair (@pairs) {
2723: my @name_value = split(':', $pair);
2724: if (scalar(@name_value) != 2) {
2725: next;
2726: }
2727: my $name = trim($name_value[0]);
2728: my $value = trim($name_value[1]);
2729: if (lc($name) eq $property_name) {
2730: return($value); # return the first one found
2731: }
2732: }
2733: return(undef);
2734: }
2735:
2736: ##
2737: # Returns the reference to a hash CSS property name => value from the style attribute of the element.
2738: # Returns an empty list if the style attribute is not defined,
2739: # @param {Element} el - the DOM element
2740: # @return {Hash<string, string>} reference to the hash property name => property value
2741: ##
2742: sub get_css_properties {
2743: my ($el) = @_;
2744: my $style = $el->getAttribute('style');
2745: if (defined $style) {
2746: $style =~ s/^\s*;\s*//;
2747: $style =~ s/\s*;\s*$//;
2748: } else {
2749: $style = '';
2750: }
2751: my @pairs = split(';', $style);
2752: tie (my %hash, 'Tie::IxHash', ());
2753: foreach my $pair (@pairs) {
2754: my @name_value = split(':', $pair);
2755: if (scalar(@name_value) != 2) {
2756: next;
2757: }
2758: my $name = trim($name_value[0]);
2759: my $value = trim($name_value[1]);
2760: if (defined $hash{$name}) {
2761: # duplicate property in the style attribute: keep only the last one
2762: delete $hash{$name};
2763: }
2764: $hash{$name} = $value;
2765: }
2766: return(\%hash);
2767: }
2768:
2769: ##
2770: # Sets a CSS property in the style attribute of an element
2771: # @param {Element} el - the DOM element
2772: # @param {string} property_name - the CSS property name
2773: # @param {string} property_value - the CSS property value
2774: ##
2775: sub set_css_property {
2776: my ($el, $property_name, $property_value) = @_;
2777: my $hash_ref = { $property_name => $property_value };
2778: set_css_properties($el, $hash_ref);
2779: }
2780:
2781: ##
2782: # Sets several CSS properties in the style attribute of an element
2783: # @param {Element} el - the DOM element
2784: # @param {Hash<string, string>} properties - reference to the hash property name => property value
2785: ##
2786: sub set_css_properties {
2787: my ($el, $properties) = @_;
2788: my $hash = get_css_properties($el);
2789: foreach my $property_name (keys %$properties) {
2790: my $property_value = $properties->{$property_name};
2791: if (defined $hash->{$property_name}) {
2792: delete $hash->{$property_name}; # to add the new one at the end
2793: }
2794: $hash->{$property_name} = $property_value;
2795: }
2796: my $style = '';
2797: foreach my $key (keys %$hash) {
2798: $style .= $key.':'.$hash->{$key}.'; ';
2799: }
2800: $style =~ s/; $//;
2801: $el->setAttribute('style', $style);
2802: }
2803:
2804: 1;
2805: __END__
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>