Annotation of loncom/homework/cleanxml/post_xml.pm, revision 1.2
1.1 damieng 1: # The LearningOnline Network
2: # Third step to clean a file.
3: #
1.2 ! damieng 4: # $Id: post_xml.pm,v 1.1 2015/12/03 20:40:31 damieng Exp $
1.1 damieng 5: #
6: # Copyright Michigan State University Board of Trustees
7: #
8: # This file is part of the LearningOnline Network with CAPA (LON-CAPA).
9: #
10: # LON-CAPA is free software; you can redistribute it and/or modify
11: # it under the terms of the GNU General Public License as published by
12: # the Free Software Foundation; either version 2 of the License, or
13: # (at your option) any later version.
14: #
15: # LON-CAPA is distributed in the hope that it will be useful,
16: # but WITHOUT ANY WARRANTY; without even the implied warranty of
17: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18: # GNU General Public License for more details.
19: #
20: # You should have received a copy of the GNU General Public License
21: # along with LON-CAPA; if not, write to the Free Software
22: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23: #
24: # /home/httpd/html/adm/gpl.txt
25: #
26: # http://www.lon-capa.org/
27: #
28: ###
29:
30: #!/usr/bin/perl
31:
32: package Apache::post_xml;
33:
34: use strict;
35: use utf8;
36: use warnings;
37:
38: use File::Basename;
39: use File::Temp qw/ tempfile /;
40: use Cwd 'abs_path';
41: use XML::LibXML;
42: use HTML::TokeParser; # used to parse sty files
43: use Tie::IxHash; # for ordered hashes
44:
45: no warnings 'recursion'; # yes, fix_paragraph is using heavy recursion, I know
46:
47: # these are constants
48: my @block_elements = ('parameter','location','answer','foil','image','polygon','rectangle','text','conceptgroup','itemgroup','item','label','data','function','array','unit','answergroup','functionplotresponse','functionplotruleset','functionplotelements','functionplotcustomrule','essayresponse','hintpart','formulahint','numericalhint','reactionhint','organichint','optionhint','radiobuttonhint','stringhint','customhint','mathhint','formulahintcondition','numericalhintcondition','reactionhintcondition','organichintcondition','optionhintcondition','radiobuttonhintcondition','stringhintcondition','customhintcondition','mathhintcondition','imageresponse','foilgroup','datasubmission','textfield','hiddensubmission','radiobuttonresponse','rankresponse','matchresponse','import','style','script','window','block','library','notsolved','part','postanswerdate','preduedate','problem','problemtype','randomlabel','bgimg','labelgroup','randomlist','solved','while','tex','print','web','gnuplot','curve','Task','IntroParagraph','ClosingParagraph','Question','QuestionText','Setup','Instance','InstanceText','Criteria','CriteriaText','GraderNote','languageblock','translated','lang','instructorcomment','dataresponse','togglebox','standalone','comment','drawimage','allow','displayduedate','displaytitle','responseparam','organicstructure','scriptlib','parserlib','drawoptionlist','spline','backgroundplot','plotobject','plotvector','drawvectorsum','functionplotrule','functionplotvectorrule','functionplotvectorsumrule','axis','key','xtics','ytics','title','xlabel','ylabel','hiddenline','dtm');
49: my @inline_like_block = ('stringresponse','optionresponse','numericalresponse','formularesponse','mathresponse','organicresponse','reactionresponse','customresponse','externalresponse', 'hint', 'hintgroup'); # inline elements treated like blocks for pretty print and some other things
50: my @responses = ('stringresponse','optionresponse','numericalresponse','formularesponse','mathresponse','organicresponse','reactionresponse','customresponse','externalresponse','essayresponse','radiobuttonresponse','matchresponse','rankresponse','imageresponse','functionplotresponse');
51: my @block_html = ('html','head','body','section','h1','h2','h3','h4','h5','h6','div','p','ul','ol','li','table','tbody','tr','td','th','dl','dt','dd','pre','noscript','hr','address','blockquote','object','applet','embed','map','form','fieldset','iframe','center','frameset');
52: my @no_newline_inside = ('import','parserlib','scriptlib','data','function','label','xlabel','ylabel','tic','text','rectangle','image','title','h1','h2','h3','h4','h5','h6','li','td','p');
53: my @preserve_elements = ('script','answer','pre');
54: my @accepting_style = ('section','h1','h2','h3','h4','h5','h6','div','p','li','td','th','dt','dd','pre','blockquote');
55: my @latex_math = ('\alpha', '\theta', '\omicron', '\tau', '\beta', '\vartheta', '\pi', '\upsilon', '\gamma', '\gamma', '\varpi', '\phi', '\delta', '\kappa', '\rho', '\varphi', '\epsilon', '\lambda', '\varrho', '\chi', '\varepsilon', '\mu', '\sigma', '\psi', '\zeta', '\nu', '\varsigma', '\omega', '\eta', '\xi',
56: '\Gamma', '\Lambda', '\Sigma', '\Psi', '\Delta', '\Xi', '\Upsilon', '\Omega', '\Theta', '\Pi', '\Phi',
57: '\pm', '\cap', '\diamond', '\oplus', '\mp', '\cup', '\bigtriangleup', '\ominus', '\times', '\uplus', '\bigtriangledown', '\otimes', '\div', '\sqcap', '\triangleleft', '\oslash', '\ast', '\sqcup', '\triangleright', '\odot', '\star', '\vee', '\lhd$', '\bigcirc', '\circ', '\wedge', '\rhd$', '\dagger', '\bullet', '\setminus', '\unlhd$', '\ddagger', '\cdot', '\wr', '\unrhd$', '\amalg', '+', '-',
58: '\leq', '\geq', '\equiv', '\models', '\prec', '\succ', '\sim', '\perp', '\preceq', '\succeq', '\simeq', '\mid', '\ll', '\gg', '\asymp', '\parallel', '\subset', '\supset', '\approx', '\bowtie', '\subseteq', '\supseteq', '\cong', '\Join$', '\sqsubset$', '\sqsupset$', '\neq', '\smile', '\sqsubseteq', '\sqsupseteq', '\doteq', '\frown', '\in', '\ni', '\propto', '\vdash', '\dashv',
59: '\colon', '\ldotp', '\cdotp',
60: '\leftarrow', '\longleftarrow', '\uparrow', '\Leftarrow', '\Longleftarrow', '\Uparrow', '\rightarrow', '\longrightarrow', '\downarrow', '\Rightarrow', '\Longrightarrow', '\Downarrow', '\leftrightarrow', '\longleftrightarrow', '\updownarrow', '\Leftrightarrow', '\Longleftrightarrow', '\Updownarrow', '\mapsto', '\longmapsto', '\nearrow', '\hookleftarrow', '\hookrightarrow', '\searrow', '\leftharpoonup', '\rightharpoonup', '\swarrow', '\leftharpoondown', '\rightharpoondown', '\nwarrow', '\rightleftharpoons', '\leadsto$',
61: '\ldots', '\cdots', '\vdots', '\ddots', '\aleph', '\prime', '\forall', '\infty', '\hbar', '\emptyset', '\exists', '\Box$', '\imath', '\nabla', '\neg', '\Diamond$', '\jmath', '\surd', '\flat', '\triangle', '\ell', '\top', '\natural', '\clubsuit', '\wp', '\bot', '\sharp', '\diamondsuit', '\Re', '\|', '\backslash', '\heartsuit', '\Im', '\angle', '\partial', '\spadesuit', '\mho$',
62: '\sum', '\bigcap', '\bigodot', '\prod', '\bigcup', '\bigotimes', '\coprod', '\bigsqcup', '\bigoplus', '\int', '\bigvee', '\biguplus', '\oint', '\bigwedge',
63: '\arccos', '\cos', '\csc', '\exp', '\ker', '\limsup', '\min', '\sinh', '\arcsin', '\cosh', '\deg', '\gcd', '\lg', '\ln', '\Pr', '\sup', '\arctan', '\cot', '\det', '\hom', '\lim', '\log', '\sec', '\tan', '\arg', '\coth', '\dim', '\inf', '\liminf', '\max', '\sin', '\tanh',
64: '\uparrow', '\Uparrow', '\downarrow', '\Downarrow', '\updownarrow', '\Updownarrow', '\lfloor', '\rfloor', '\lceil', '\rceil', '\langle', '\rangle', '\backslash',
65: '\rmoustache', '\lmoustache', '\rgroup', '\lgroup', '\arrowvert', '\Arrowvert', '\bracevert',
66: '\hat{', '\acute{', '\bar{', '\dot{', '\breve{', '\check{', '\grave{', '\vec{', '\ddot{', '\tilde{',
67: '\widetilde{', '\widehat{', '\overleftarrow{', '\overrightarrow{', '\overline{', '\underline{', '\overbrace{', '\underbrace{', '\sqrt{', '\sqrt[', '\frac{'
68: );
69: # list of elements that can contain style elements:
70: my @containing_styles = ('library','problem',@responses,'foil','item','text','hintgroup','hintpart','label','part','preduedate','postanswerdate','solved','notsolved','block','while','web','standalone','problemtype','languageblock','translated','lang','window','windowlink','togglebox','instructorcomment','body','section','div','p','li','dd','td','th','blockquote','object','applet','video','audio','canvas','fieldset','button',
71: 'span','strong','em','b','i','sup','sub','code','kbd','samp','tt','ins','del','var','small','big','u','font');
72: my @html_styles = ('span', 'strong', 'em' , 'b', 'i', 'sup', 'sub', 'tt', 'var', 'small', 'big', 'u');
73:
74: my $warnings; # 1 = print warnings
75:
76:
77: # Parses the XML document and fixes many things to turn it into a document matching the schema.
1.2 ! damieng 78: # @param {reference} textref - reference to the text of the document
! 79: # @param {string} file_path - path of the file being processed (we only extract the directory path from it)
! 80: # @param {string} res_dir - path of res directory parent (without the / at the end)
! 81: # @param {boolean} warn - 1 to print warnings, 0 otherwise
! 82: # @returns the text of the document as a byte string.
1.1 damieng 83: sub post_xml {
1.2 ! damieng 84: my ($textref, $file_path, $res_dir, $warn) = @_;
1.1 damieng 85: $warnings = $warn;
86:
87: my $dom_doc = XML::LibXML->load_xml(string => $textref);
88:
89: my $root = fix_structure($dom_doc);
90:
91: remove_elements($root, ['startouttext','startoutext','startottext','startouttex','startouttect','atartouttext','starouttext','starttextout','starttext','starttextarea','endouttext','endoutext','endoutttext','endouttxt','endouutext','ednouttext','endouttex','endoouttext','endouttest','endtextout','endtextarea','startpartmarker','endpartmarker','basefont','x-claris-tagview','x-claris-window','x-sas-window']);
92:
93: remove_empty_attributes($root);
94:
95: fix_attribute_case($root);
96:
97: my $fix_by_hand = replace_m($root);
98:
99: my @all_block = (@block_elements, @block_html);
1.2 ! damieng 100: add_sty_blocks($file_path, $res_dir, $root, \@all_block); # must come before the subs using @all_block
1.1 damieng 101:
102: fix_block_styles($root, \@all_block);
103: $root->normalize();
104:
105: fix_fonts($root, \@all_block);
106:
107: replace_u($root);
108:
109: remove_bad_cdata_sections($root);
110:
111: add_cdata_sections($root);
112:
113: fix_style_element($root);
114:
115: fix_tables($root);
116:
117: fix_lists($root);
118:
119: fix_wrong_name_for_img($root); # should be before replace_deprecated_attributes_by_css
120:
121: replace_deprecated_attributes_by_css($root);
122:
123: replace_center($root, \@all_block); # must come after replace_deprecated_attributes_by_css
124:
125: replace_nobr($root);
126:
127: remove_useless_notsolved($root);
128:
129: fix_paragraphs_inside($root, \@all_block);
130:
131: remove_empty_style($root);
132:
133: fix_empty_lc_elements($root);
134:
135: lowercase_attribute_values($root);
136:
137: replace_numericalresponse_unit_attribute($root);
138:
139: replace_functions_by_elements($root);
140:
141: pretty($root, \@all_block);
142:
143: replace_tm_dtm($root);
144:
145: if ($fix_by_hand) {
146: die "The file has been converted but it should be fixed by hand.";
147: }
148: return $dom_doc->toString(); # byte string !
149: }
150:
151: sub fix_structure {
152: my ($doc) = @_;
153: # the root element has already been added in pre_xml
154: my $root = $doc->documentElement;
155: # inside the root, replace html, problem and library elements by their content
156: my @toreplace = ('html','problem','library');
157: foreach my $name (@toreplace) {
158: my @elements = $root->getElementsByTagName($name);
159: foreach my $element (@elements) {
160: replace_by_children($element);
161: }
162: }
163: # insert all link and style elements inside a new head element
164: my $current_node = undef;
165: my @heads = $doc->getElementsByTagName('head');
166: my @links = $doc->getElementsByTagName('link');
167: my @styles = $doc->getElementsByTagName('style');
168: my @titles = $doc->getElementsByTagName('title');
169: if (scalar(@titles) > 0) {
170: # NOTE: there is a title element in gnuplot, not to be confused with the one inside HTML head
171: for (my $i=0; $i<scalar(@titles); $i++) {
172: my $title = $titles[$i];
173: my $found_gnuplot = 0;
174: my $ancestor = $title->parentNode;
175: while (defined $ancestor) {
176: if ($ancestor->nodeName eq 'gnuplot') {
177: $found_gnuplot = 1;
178: last;
179: }
180: $ancestor = $ancestor->parentNode;
181: }
182: if ($found_gnuplot) {
183: splice(@titles, $i, 1);
184: $i--;
185: }
186: }
187: }
188: if (scalar(@heads) > 0 || scalar(@titles) > 0 || scalar(@links) > 0 || scalar(@styles) > 0) {
189: my $htmlhead = $doc->createElement('head');
190: foreach my $head (@heads) {
191: my $next;
192: for (my $child=$head->firstChild; defined $child; $child=$next) {
193: $next = $child->nextSibling;
194: $head->removeChild($child);
195: if ($child->nodeType != XML_ELEMENT_NODE ||
196: string_in_array(['title','script','style','meta','link','import','base'], $child->nodeName)) {
197: $htmlhead->appendChild($child);
198: } else {
199: # this should not be in head
200: insert_after_or_first($root, $child, $current_node);
201: }
202: }
203: $head->parentNode->removeChild($head);
204: }
205: foreach my $child (@titles, @links, @styles) {
206: $child->parentNode->removeChild($child);
207: $htmlhead->appendChild($child);
208: }
209: insert_after_or_first($root, $htmlhead, $current_node);
210: $current_node = $htmlhead;
211: }
212: # body
213: my $htmlbody = undef;
214: my @bodies = $doc->getElementsByTagName('body');
215: if (scalar(@bodies) > 0) {
216: # TODO: fix content and position of body elements
217: if ($root->nodeName eq 'problem') {
218: foreach my $body (@bodies) {
219: replace_by_children($body);
220: }
221: }
222: }
223: # add all the meta elements afterwards when they are LON-CAPA meta. Remove all HTML meta.
224: my @meta_names = ('abstract','author','authorspace','avetries','avetries_list','clear','comefrom','comefrom_list','copyright','correct','count','course','course_list','courserestricted','creationdate','dependencies','depth','difficulty','difficulty_list','disc','disc_list','domain','end','field','firstname','generation','goto','goto_list','groupname','helpful','highestgradelevel','hostname','id','keynum','keywords','language','lastname','lastrevisiondate','lowestgradelevel','middlename','mime','modifyinguser','notes','owner','permanentemail','scope','sequsage','sequsage_list','standards','start','stdno','stdno_list','subject','technical','title','url','username','value','version');
225: my @metas = $doc->getElementsByTagName('meta');
226: foreach my $meta (@metas) {
227: $meta->parentNode->removeChild($meta);
228: my $name = $meta->getAttribute('name');
229: my $content = $meta->getAttribute('content');
230: if (defined $name && defined $content && string_in_array(\@meta_names, lc($name))) {
231: my $lcmeta = $doc->createElement('meta');
232: $lcmeta->setAttribute('name', lc($name));
233: $lcmeta->setAttribute('content', $content);
234: insert_after_or_first($root, $lcmeta, $current_node);
235: $current_node = $lcmeta;
236: }
237: }
238: return($root);
239: }
240:
241: # insert the new child under parent after the reference child, or as the first child if the reference child is not defined
242: sub insert_after_or_first {
243: my ($parent, $newchild, $refchild) = @_;
244: if (defined $refchild) {
245: $parent->insertAfter($newchild, $refchild);
246: } elsif (defined $parent->firstChild) {
247: $parent->insertBefore($newchild, $parent->firstChild);
248: } else {
249: $parent->appendChild($newchild);
250: }
251: }
252:
253: # removes all elements with given names inside the node, but keep the content
254: sub remove_elements {
255: my ($node, $to_remove) = @_;
256: my $nextChild;
257: for (my $child=$node->firstChild; defined $child; $child=$nextChild) {
258: $nextChild = $child->nextSibling;
259: my $type = $node->nodeType;
260: if ($type == XML_ELEMENT_NODE) {
261: if (string_in_array($to_remove, $child->nodeName)) {
262: my $first_non_white = $child->firstChild;
263: if (defined $first_non_white && $first_non_white->nodeType == XML_TEXT_NODE &&
264: $first_non_white->nodeValue =~ /^\s*$/) {
265: $first_non_white = $first_non_white->nextSibling;
266: }
267: if (defined $first_non_white) {
268: $nextChild = $first_non_white;
269: replace_by_children($child);
270: } else {
271: $node->removeChild($child);
272: }
273: } else {
274: remove_elements($child, $to_remove);
275: }
276: }
277: }
278: }
279:
280: # removes some attributes that have an invalid empty value
281: sub remove_empty_attributes {
282: my ($root) = @_;
283: my $doc = $root->ownerDocument;
284: # this list is based on validation errors in the MSU subset (it could be more complete if it was based on the schema)
285: my @attributes = (
286: ['curve', ['pointsize']],
287: ['foil', ['location']],
288: ['foilgroup', ['checkboxoptions', 'options', 'texoptions']],
289: ['gnuplot', ['pattern', 'texwidth']],
290: ['img', ['height', 'texheight', 'texwidth', 'texwrap', 'width']],
291: ['import', ['importmode']],
292: ['optionresponse', ['max']],
293: ['organicstructure', ['options']],
294: ['radiobuttonresponse', ['max']],
295: ['randomlabel', ['height', 'texwidth', 'width']],
296: ['stringresponse', ['type']],
297: ['textline', ['size']],
298: );
299: foreach my $element_attributes (@attributes) {
300: my $element_name = $element_attributes->[0];
301: my $attribute_names = $element_attributes->[1];
302: my @elements = $doc->getElementsByTagName($element_name);
303: foreach my $element (@elements) {
304: foreach my $attribute_name (@$attribute_names) {
305: my $value = $element->getAttribute($attribute_name);
306: if (defined $value && $value =~ /^\s*$/) {
307: $element->removeAttribute($attribute_name);
308: }
309: }
310: }
311: }
312: }
313:
314: # fixes the case for a few attributes that are not all lowercase
315: # (the HTML parser used in html_to_xml turns everything lowercase, which is a good thing in general)
316: sub fix_attribute_case {
317: my ($root) = @_;
318: my $doc = $root->ownerDocument;
319: my @attributes = (
320: ['labelgroup', ['TeXsize']],
321: ['h1', ['TeXsize']],
322: ['h2', ['TeXsize']],
323: ['h3', ['TeXsize']],
324: ['h4', ['TeXsize']],
325: ['h5', ['TeXsize']],
326: ['h6', ['TeXsize']],
327: # font and basefont have a TeXsize but will be removed
328: ['optionresponse', ['TeXlayout']],
329: ['itemgroup', ['TeXitemgroupwidth']],
330: ['Task', ['OptionalRequired']],
331: ['Question', ['OptionalRequired','Mandatory']],
332: ['Instance', ['OptionalRequired','Disabled']],
333: ['Criteria', ['Mandatory']],
334: ['table', ['TeXwidth','TeXtheme']],
335: ['td', ['TeXwidth']],
336: ['th', ['TeXwidth']],
337: ['img', ['TeXwidth','TeXheight','TeXwrap']],
338: );
339: foreach my $element_attributes (@attributes) {
340: my $element_name = $element_attributes->[0];
341: my $attribute_names = $element_attributes->[1];
342: my @elements = $doc->getElementsByTagName($element_name);
343: foreach my $element (@elements) {
344: foreach my $attribute_name (@$attribute_names) {
345: my $value = $element->getAttribute(lc($attribute_name));
346: if (defined $value) {
347: $element->removeAttribute(lc($attribute_name));
348: $element->setAttribute($attribute_name, $value);
349: }
350: }
351: }
352: }
353: }
354:
355: # Replaces m by HTML, tm and/or dtm (which will be replaced by <m> later, but they are useful
356: # to know if the element is a block element or not).
357: # m might contain non-math LaTeX, while tm and dtm may only contain math.
358: # Returns 1 if the file should be fixed by hand, 0 otherwise.
359: sub replace_m {
360: my ($root) = @_;
361: my $doc = $root->ownerDocument;
362: my $fix_by_hand = 0;
363: # search for variable declarations
364: my @variables = ();
365: my @scripts = $root->getElementsByTagName('script');
366: foreach my $script (@scripts) {
367: my $type = $script->getAttribute('type');
368: if (defined $type && $type eq 'loncapa/perl') {
369: if (defined $script->firstChild && $script->firstChild->nodeType == XML_TEXT_NODE) {
370: my $text = $script->firstChild->nodeValue;
371: # NOTE: we are not interested in replacing "@value", only "$value"
372: # this regexp is for " $a = ..." and " $a[...] = ..."
373: while ($text =~ /^[ \t]*\$([a-zA-Z_0-9]+)(?:\[[^\]]+\])?[ \t]*=/gm) {
374: if (!string_in_array(\@variables, $1)) {
375: push(@variables, $1);
376: }
377: }
378: # this regexp is for "...; $a = ..." and "...; $a[...] = ..."
379: while ($text =~ /^[^'"\/;]+;[ \t]*\$([a-zA-Z_0-9]+)(?:\[[^\]]+\])?[ \t]*=/gm) {
380: if (!string_in_array(\@variables, $1)) {
381: push(@variables, $1);
382: }
383: }
384: # this regexp is for " @a = ..."
385: while ($text =~ /^[ \t]*\@([a-zA-Z_0-9]+)[ \t]*=/gm) {
386: if (!string_in_array(\@variables, $1)) {
387: push(@variables, $1);
388: }
389: }
390: # this regexp is for " ($a, $b, $c) = ..."
391: my @matches = ($text =~ /^[ \t]*\([ \t]*\$([a-zA-Z_0-9]+)(?:[ \t]*,[ \t]*\$([a-zA-Z_0-9]+))*[ \t]*\)[ \t]*=/gm);
392: foreach my $match (@matches) {
393: if (!defined $match) {
394: next; # not sure why it happens, but it does
395: }
396: if (!string_in_array(\@variables, $match)) {
397: push(@variables, $match);
398: }
399: }
400: # and this one is for "push @a"
401: while ($text =~ /^[ \t]*push @([a-zA-Z_0-9]+)[ \t,]*/gm) {
402: if (!string_in_array(\@variables, $1)) {
403: push(@variables, $1);
404: }
405: }
406: # use the opportunity to report usage of <m> in Perl scripts
407: if ($text =~ /^[^#].*<m[ >]/m) {
408: if ($warnings) {
409: print "WARNING: <m> is used in a script, it should be converted by hand\n";
410: }
411: $fix_by_hand = 1;
412: }
413: }
414: }
415: }
416: my @ms = $root->getElementsByTagName('m');
417: foreach my $m (@ms) {
418: if (!defined $m->firstChild) {
419: $m->parentNode->removeChild($m);
420: next;
421: }
422: if (defined $m->firstChild->nextSibling || $m->firstChild->nodeType != XML_TEXT_NODE) {
423: if ($warnings) {
424: print "WARNING: m value is not simple text\n";
425: }
426: $fix_by_hand = 1;
427: next;
428: }
429: my $text = $m->firstChild->nodeValue;
430: my $text_before_variable_replacement = $text;
431: my $var_key1 = 'dfhg3df54hg65hg4';
432: my $var_key2 = 'dfhg654d6f5g4h5f';
433: my $eval = defined $m->getAttribute('eval') && $m->getAttribute('eval') eq 'on';
434: if ($eval) {
435: # replace variables
436: foreach my $variable (@variables) {
437: my $replacement = $var_key1.$variable.$var_key2;
438: $text =~ s/\$$variable(?![a-zA-Z])/$replacement/ge;
439: $text =~ s/\$\{$variable\}/$replacement/ge;
440: }
441: }
442: # check if the expression is enclosed in math separators: $ $$ \( \) \[ \]
443: # if so, replace the whole node by dtm or tm
444: my $new_text;
445: my $new_node_name;
446: if ($text =~ /^\s*\$\$([^\$]*)\$\$\s*$/) {
447: $new_node_name = 'dtm';
448: $new_text = $1;
449: } elsif ($text =~ /^\s*\\\[(.*)\\\]\s*$/s) {
450: $new_node_name = 'dtm';
451: $new_text = $1;
452: } elsif ($text =~ /^\s*\$([^\$]*)\$\s*$/) {
453: $new_node_name = 'tm';
454: $new_text = $1;
455: } elsif ($text =~ /^\s*\\\((.*)\\\)\s*$/s) {
456: $new_node_name = 'tm';
457: $new_text = $1;
458: }
459: if (defined $new_node_name) {
460: if ($eval) {
461: foreach my $variable (@variables) {
462: my $replacement = $var_key1.$variable.$var_key2;
463: $new_text =~ s/$replacement([a-zA-Z])/\${$variable}$1/g;
464: $new_text =~ s/$replacement/\$$variable/g;
465: }
466: }
467: my $new_node = $doc->createElement($new_node_name);
468: if ($eval) {
469: $new_node->setAttribute('eval', 'on');
470: }
471: $new_node->appendChild($doc->createTextNode($new_text));
472: $m->parentNode->replaceChild($new_node, $m);
473: next;
474: }
475: if ($text !~ /\$|\\\(|\\\)|\\\[|\\\]/) {
476: # there are no math separators inside
477: # try to guess if this is meant as math
478: my $found_math = 0;
479: foreach my $symbol (@latex_math) {
480: if (index($text, $symbol) != -1) {
481: $found_math = 1;
482: last;
483: }
484: }
485: if ($found_math) {
486: # interpret the whole text as LaTeX inline math
487: my $new_node = $doc->createElement('tm');
488: if ($eval) {
489: $new_node->setAttribute('eval', 'on');
490: }
491: $new_node->appendChild($doc->createTextNode($text_before_variable_replacement));
492: $m->parentNode->replaceChild($new_node, $m);
493: next;
494: }
495: # no math symbol found, we will convert the text with tth
496: }
497:
498: # there are math separators inside, even after hiding variables, or there was no math symbol
499:
500: # hide math parts inside before running tth
501: my $math_key1 = '#ghjgdh5hg45gf';
502: my $math_key2 = '#';
503: my @maths = ();
504: my @separators = (['$$','$$'], ['\\(','\\)'], ['\\[','\\]'], ['$','$']);
505: foreach my $seps (@separators) {
506: my $sep1 = $seps->[0];
507: my $sep2 = $seps->[1];
508: my $pos1 = index($text, $sep1);
509: if ($pos1 == -1) {
510: next;
511: }
512: my $pos2 = index($text, $sep2, $pos1+length($sep1));
513: while ($pos1 != -1 && $pos2 != -1) {
514: my $replace = substr($text, $pos1, $pos2+length($sep2)-$pos1);
515: push(@maths, $replace);
516: my $by = $math_key1.scalar(@maths).$math_key2;
517: $text = substr($text, 0, $pos1).$by.substr($text, $pos2+length($sep2));
518: $pos1 = index($text, $sep1);
519: if ($pos1 != -1) {
520: $pos2 = index($text, $sep2, $pos1+length($sep1));
521: }
522: }
523: }
524: # get HTML as text from tth
525: my $html_text = tth($text);
526: # replace math by replacements
527: for (my $i=0; $i < scalar(@maths); $i++) {
528: my $math = $maths[$i];
529: $math =~ s/&/&/g;
530: $math =~ s/</</g;
531: $math =~ s/>/>/g;
532: if ($math =~ /^\$\$(.*)\$\$$/s) {
533: $math = '<dtm>'.$1.'</dtm>';
534: } elsif ($math =~ /^\\\[(.*)\\\]$/s) {
535: $math = '<dtm>'.$1.'</dtm>';
536: } elsif ($math =~ /^\\\((.*)\\\)$/s) {
537: $math = '<tm>'.$1.'</tm>';
538: } elsif ($math =~ /^\$(.*)\$$/s) {
539: $math = '<tm>'.$1.'</tm>';
540: }
541: my $replace = $math_key1.($i+1).$math_key2;
542: $html_text =~ s/$replace/$math/;
543: }
544: # replace variables if necessary
545: if ($eval) {
546: foreach my $variable (@variables) {
547: my $replacement = $var_key1.$variable.$var_key2;
548: $html_text =~ s/$replacement([a-zA-Z])/\${$variable}$1/g;
549: $html_text =~ s/$replacement/\$$variable/g;
550: }
551: }
552: my $fragment = html_to_dom($html_text);
553: $doc->adoptNode($fragment);
554: $m->parentNode->replaceChild($fragment, $m);
555:
556: }
557: return $fix_by_hand;
558: }
559:
560: # Returns the HTML equivalent of LaTeX input, using tth
561: sub tth {
562: my ($text) = @_;
563: my ($fh, $tmp_path) = tempfile();
564: binmode($fh, ':utf8');
565: print $fh $text;
566: close $fh;
567: my $output = `tth -r -w2 -u -y0 < $tmp_path 2>/dev/null`;
568: # hopefully the temp file will not be removed before this point (otherwise we should use unlink_on_destroy 0)
569: $output =~ s/^\s*|\s*$//;
570: $output =~ s/<div class="p"><!----><\/div>/<br\/>/; # why is tth using such ugly markup for \newline ?
571: return $output;
572: }
573:
574: # transform simple HTML into a DOM fragment (which will need to be adopted by the document)
575: sub html_to_dom {
576: my ($text) = @_;
577: $text = '<root>'.$text.'</root>';
578: my $textref = html_to_xml::html_to_xml(\$text);
579: utf8::upgrade($$textref); # otherwise the XML parser fails when the HTML parser turns into a character
580: my $dom_doc = XML::LibXML->load_xml(string => $textref);
581: my $root = $dom_doc->documentElement;
582: remove_empty_style($root);
583: my $fragment = $dom_doc->createDocumentFragment();
584: my $next;
585: for (my $n=$root->firstChild; defined $n; $n=$next) {
586: $next = $n->nextSibling;
587: $root->removeChild($n);
588: $fragment->appendChild($n);
589: }
590: return($fragment);
591: }
592:
593: # Use the linked sty files to guess which newly defined elements should be considered blocks.
594: # Also adds to @containing_styles the sty elements that contain styles.
595: # @param {string} fn - the file path (we only extract the directory path from it)
1.2 ! damieng 596: # @param {string} res_dir - path of res directory parent (without the / at the end)
! 597: # @param {Element} root - the root element
1.1 damieng 598: sub add_sty_blocks {
1.2 ! damieng 599: my ($fn, $res_dir, $root, $all_block) = @_;
1.1 damieng 600: my $doc = $root->ownerDocument;
601: my @parserlibs = $doc->getElementsByTagName('parserlib');
602: my @libs = ();
603: foreach my $parserlib (@parserlibs) {
604: if (defined $parserlib->firstChild && $parserlib->firstChild->nodeType == XML_TEXT_NODE) {
605: my $value = $parserlib->firstChild->nodeValue;
606: $value =~ s/^\s+|\s+$//g;
607: if ($value ne '') {
608: push(@libs, $value);
609: }
610: }
611: }
612: my ($name, $path, $suffix) = fileparse($fn);
613: foreach my $sty (@libs) {
614: if (substr($sty, 0, 1) eq '/') {
1.2 ! damieng 615: $sty = $res_dir.$sty;
1.1 damieng 616: } else {
617: $sty = $path.$sty;
618: }
619: my $new_elements = parse_sty($sty, $all_block);
620: better_guess($root, $new_elements, $all_block);
621: my $new_blocks = $new_elements->{'block'};
622: my $new_inlines = $new_elements->{'inline'};
623: push(@$all_block, @{$new_blocks});
624: #push(@inlines, @{$new_inlines}); # we are not using a list of inline elements at this point
625: }
626: }
627:
628: ##
629: # Parses a sty file and returns lists of block and inline elements.
630: # @param {string} fn - the file path
631: ##
632: sub parse_sty {
633: my ($fn, $all_block) = @_;
634: my @blocks = ();
635: my @inlines = ();
636: my $p = HTML::TokeParser->new($fn);
637: if (! $p) {
638: die "post_xml.pl: parse_sty: Error reading $fn\n";
639: }
640: $p->empty_element_tags(1);
641: my $in_definetag = 0;
642: my $in_render = 0;
643: my %newtags = ();
644: my $newtag = '';
645: my $is_block = 0;
646: while (my $token = $p->get_token) {
647: if ($token->[0] eq 'S') {
648: my $tag = lc($token->[1]);
649: if ($tag eq 'definetag') {
650: $in_definetag = 1;
651: $is_block = 0;
652: my $attributes = $token->[2];
653: $newtag = $attributes->{'name'};
654: if (substr($newtag, 0, 1) eq '/') {
655: $newtag = substr($newtag, 1);
656: }
657: } elsif ($in_definetag && $tag eq 'render') {
658: $in_render = 1;
659: $is_block = 0;
660: } elsif ($in_render) {
661: if (string_in_array($all_block, $tag)) {
662: $is_block = 1;
663: }
664: }
665: } elsif ($token->[0] eq 'E') {
666: my $tag = lc($token->[1]);
667: if ($tag eq 'definetag') {
668: $in_definetag = 0;
669: if (defined $newtags{$newtag}) {
670: $newtags{$newtag} = $newtags{$newtag} || $is_block;
671: } else {
672: $newtags{$newtag} = $is_block;
673: }
674: } elsif ($in_definetag && $tag eq 'render') {
675: $in_render = 0;
676: }
677: }
678: }
679: foreach $newtag (keys(%newtags)) {
680: if ($newtags{$newtag} == 1) {
681: push(@blocks, $newtag);
682: } else {
683: push(@inlines, $newtag);
684: }
685: }
686: return {'block'=>\@blocks, 'inline'=>\@inlines};
687: }
688:
689: ##
690: # Marks as block the elements that contain block elements in the input file.
691: # Also adds to @containing_styles the sty elements that contain styles.
692: # @param {string} fn - the file path
693: # @param {Hash<string,Array>} new_elements - contains arrays in 'block' and 'inline'
694: ##
695: sub better_guess {
696: my ($root, $new_elements, $all_block) = @_;
697: my $new_blocks = $new_elements->{'block'};
698: my $new_inlines = $new_elements->{'inline'};
699:
700: my @change = (); # change these elements from inline to block
701: foreach my $tag (@{$new_inlines}) {
702: my @nodes = $root->getElementsByTagName($tag);
703: NODE_LOOP: foreach my $node (@nodes) {
704: for (my $child=$node->firstChild; defined $child; $child=$child->nextSibling) {
705: if ($child->nodeType == XML_ELEMENT_NODE) {
706: if (string_in_array($all_block, $child->nodeName) || string_in_array($new_blocks, $child->nodeName)) {
707: push(@change, $tag);
708: last NODE_LOOP;
709: }
710: }
711: }
712: }
713: }
714: foreach my $inline (@change) {
715: my $index = 0;
716: $index++ until $new_inlines->[$index] eq $inline;
717: splice(@{$new_inlines}, $index, 1);
718: push(@{$new_blocks}, $inline);
719: }
720: # add to @containing_styles when a style is used inside
721: # NOTE: some sty elements will be added even though they should not, but if we don't do that
722: # all style will be removed in the sty elements.
723: foreach my $tag ((@{$new_blocks}, @{$new_inlines})) {
724: my @nodes = $root->getElementsByTagName($tag);
725: NODE_LOOP: foreach my $node (@nodes) {
726: for (my $child=$node->firstChild; defined $child; $child=$child->nextSibling) {
727: if ($child->nodeType == XML_ELEMENT_NODE) {
728: if (string_in_array(\@html_styles, $child->nodeName)) {
729: push(@containing_styles, $tag);
730: last NODE_LOOP;
731: }
732: }
733: }
734: }
735: }
736: }
737:
738: # When a style element contains a block, move the style inside the block where it is allowed.
739: # style/block/other -> block/style/other
740: # When a style is used where it is not allowed, move it inside its children or remove it (unless it contains only text)
741: # element_not_containing_styles/style/other -> element_not_containing_styles/other/style (except if other is a style)
742: # The fix is not perfect in the case of element_not_containing_styles/style1/style2/block/text (style1 will be lost):
743: # element_not_containing_styles/style1/style2/block/text -> element_not_containing_styles/block/style2/text
744: # (a solution to this problem would be to merge the styles in a span)
745: # NOTE: .sty defined elements might have been added to @containing_styles by better_guess().
746: sub fix_block_styles {
747: my ($element, $all_block) = @_;
748: my $doc = $element->ownerDocument;
749: if (string_in_array(\@html_styles, $element->nodeName)) {
750: # move spaces out of the style element
751: if (defined $element->firstChild && $element->firstChild->nodeType == XML_TEXT_NODE) {
752: my $child = $element->firstChild;
753: if ($child->nodeValue =~ /^(\s+)(\S.*)$/s) {
754: $element->parentNode->insertBefore($doc->createTextNode($1), $element);
755: $child->setData($2);
756: }
757: }
758: if (defined $element->lastChild && $element->lastChild->nodeType == XML_TEXT_NODE) {
759: my $child = $element->lastChild;
760: if ($child->nodeValue =~ /^(.*\S)(\s+)$/s) {
761: $element->parentNode->insertAfter($doc->createTextNode($2), $element);
762: $child->setData($1);
763: }
764: }
765:
766: my $found_block = 0;
767: for (my $child=$element->firstChild; defined $child; $child=$child->nextSibling) {
768: if ($child->nodeType == XML_ELEMENT_NODE && string_in_array($all_block, $child->nodeName)) {
769: $found_block = 1;
770: last;
771: }
772: }
773: my $no_style_here = !string_in_array(\@containing_styles, $element->parentNode->nodeName);
774: if ($found_block || $no_style_here) {
775: # there is a block or the style is not allowed here,
776: # the style element has to be replaced by its modified children
777: my $s; # a clone of the style
778: my $next;
779: for (my $child=$element->firstChild; defined $child; $child=$next) {
780: $next = $child->nextSibling;
781: if ($child->nodeType == XML_ELEMENT_NODE && (string_in_array($all_block, $child->nodeName) ||
782: $child->nodeName eq 'br' || $no_style_here)) {
783: # avoid inverting a style with a style with $no_style_here (that would cause endless recursion)
784: if (!$no_style_here || (!string_in_array(\@html_styles, $child->nodeName) &&
785: string_in_array(\@containing_styles, $child->nodeName))) {
786: # block node or inline node when the style is not allowed:
787: # move all children inside the style, and make the style the only child
788: $s = $element->cloneNode();
789: my $next2;
790: for (my $child2=$child->firstChild; defined $child2; $child2=$next2) {
791: $next2 = $child2->nextSibling;
792: $child->removeChild($child2);
793: $s->appendChild($child2);
794: }
795: $child->appendChild($s);
796: }
797: $s = undef;
798: } elsif (($child->nodeType == XML_TEXT_NODE && $child->nodeValue !~ /^\s*$/) ||
799: $child->nodeType == XML_ELEMENT_NODE) {
800: # if the style is allowed, move text and inline nodes inside the style
801: if (!$no_style_here) {
802: if (!defined $s) {
803: $s = $element->cloneNode();
804: $element->insertBefore($s, $child);
805: }
806: $element->removeChild($child);
807: $s->appendChild($child);
808: }
809: } else {
810: # do not put other nodes inside the style
811: $s = undef;
812: }
813: }
814: # now replace by children and fix them
815: my $parent = $element->parentNode;
816: for (my $child=$element->firstChild; defined $child; $child=$next) {
817: $next = $child->nextSibling;
818: $element->removeChild($child);
819: $parent->insertBefore($child, $element);
820: if ($child->nodeType == XML_ELEMENT_NODE) {
821: fix_block_styles($child, $all_block);
822: }
823: }
824: $parent->removeChild($element);
825: return;
826: }
827: }
828: # otherwise fix all children
829: my $next;
830: for (my $child=$element->firstChild; defined $child; $child=$next) {
831: $next = $child->nextSibling;
832: if ($child->nodeType == XML_ELEMENT_NODE) {
833: fix_block_styles($child, $all_block);
834: }
835: }
836: }
837:
838: # removes empty font elements and font elements that contain at least one block element
839: # replaces other font elements by equivalent span
840: sub fix_fonts {
841: my ($root, $all_block) = @_;
842: my $doc = $root->ownerDocument;
843: my @fonts = $root->getElementsByTagName('font');
844: @fonts = reverse(@fonts); # to deal with the ancestor last in the case of font/font
845: foreach my $font (@fonts) {
846: my $block = 0;
847: for (my $child=$font->firstChild; defined $child; $child=$child->nextSibling) {
848: if (string_in_array($all_block, $child->nodeName) || string_in_array(\@inline_like_block, $child->nodeName)) {
849: $block = 1;
850: last;
851: }
852: }
853: if (!defined $font->firstChild || $block) {
854: # empty font or font containing block elements
855: # replace this node by its content
856: replace_by_children($font);
857: } else {
858: # replace by equivalent span
859: my $color = get_non_empty_attribute($font, 'color');
860: my $size = get_non_empty_attribute($font, 'size');
861: my $face = get_non_empty_attribute($font, 'face');
862: if (defined $face) {
863: $face =~ s/^,|,$//;
864: }
865: if (!defined $color && !defined $size && !defined $face) {
866: # useless font element: replace this node by its content
867: replace_by_children($font);
868: next;
869: }
870: my $replacement;
871: tie (my %properties, 'Tie::IxHash', ());
872: if (!defined $color && !defined $size && defined $face && lc($face) eq 'symbol') {
873: $replacement = $doc->createDocumentFragment();
874: } else {
875: $replacement = $doc->createElement('span');
876: my $css = '';
877: if (defined $color) {
878: $color =~ s/^x/#/;
879: $properties{'color'} = $color;
880: }
881: if (defined $size) {
882: my %hash = (
883: '1' => 'x-small',
884: '2' => 'small',
885: '3' => 'medium',
886: '4' => 'large',
887: '5' => 'x-large',
888: '6' => 'xx-large',
889: '7' => '300%',
890: '-1' => 'small',
891: '-2' => 'x-small',
892: '+1' => 'large',
893: '+2' => 'x-large',
894: '+3' => 'xx-large',
895: '+4' => '300%',
896: );
897: my $value = $hash{$size};
898: if (!defined $value) {
899: $value = 'medium';
900: }
901: $properties{'font-size'} = $value;
902: }
903: if (defined $face) {
904: if (lc($face) ne 'symbol' && lc($face) ne 'bold') {
905: $properties{'font-family'} = $face;
906: }
907: }
908: set_css_properties($replacement, \%properties);
909: }
910: if (defined $face && lc($face) eq 'symbol') {
911: # convert all content to unicode
912: my $next;
913: for (my $child=$font->firstChild; defined $child; $child=$next) {
914: $next = $child->nextSibling;
915: if ($child->nodeType == XML_TEXT_NODE) {
916: my $value = $child->nodeValue;
917: $value =~ tr/ABGDEZHQIKLMNXOPRSTUFCYWabgdezhqiklmnxoprVstufcywJjv¡«¬®/ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩαβγδεζηθικλμνξοπρςστυφχψωϑϕϖϒ↔←→/;
918: $child->setData($value);
919: }
920: }
921: }
922: # replace the font node
923: if ($replacement->nodeType == XML_ELEMENT_NODE && !defined $font->previousSibling &&
924: !defined $font->nextSibling && string_in_array(\@accepting_style, $font->parentNode->nodeName)) {
925: # use CSS on the parent block and replace font by its children instead of using a new element
926: set_css_properties($font->parentNode, \%properties);
927: replace_by_children($font);
928: } else {
929: # move all font children inside the replacement (span or fragment)
930: my $next;
931: for (my $child=$font->firstChild; defined $child; $child=$next) {
932: $next = $child->nextSibling;
933: $font->removeChild($child);
934: $replacement->appendChild($child);
935: }
936: # replace font
937: $font->parentNode->replaceChild($replacement, $font);
938: }
939: }
940: }
941: $root->normalize();
942: }
943:
944: # replaces u by <span style="text-decoration: underline">
945: sub replace_u {
946: my ($root) = @_;
947: my $doc = $root->ownerDocument;
948: my @us = $root->getElementsByTagName('u');
949: foreach my $u (@us) {
950: my $span = $doc->createElement('span');
951: $span->setAttribute('style', 'text-decoration: underline');
952: my $next;
953: for (my $child=$u->firstChild; defined $child; $child=$next) {
954: $next = $child->nextSibling;
955: $u->removeChild($child);
956: $span->appendChild($child);
957: }
958: $u->parentNode->replaceChild($span, $u);
959: }
960: }
961:
962: # removes CDATA sections tags that have not been parsed correcty by the HTML parser
963: # also removes bad comments in script elements
964: sub remove_bad_cdata_sections {
965: my ($root) = @_;
966: my $doc = $root->ownerDocument;
967: foreach my $name (@preserve_elements) {
968: my @nodes = $root->getElementsByTagName($name);
969: foreach my $node (@nodes) {
970: if (defined $node->firstChild && $node->firstChild->nodeType == XML_TEXT_NODE) {
971: my $value = $node->firstChild->nodeValue;
972: if ($name eq 'script' && (!defined $node->getAttribute('type') || $node->getAttribute('type') ne 'loncapa/perl') &&
973: !defined $node->firstChild->nextSibling && $value =~ /^(\s*)<!--(.*)-->(\s*)$/) {
974: # web browsers interpret that as a real comment when it is on 1 line, but the Perl HTML parser thinks it is the script
975: # -> turning it back into a comment
976: # (this is only true for Javascript script elements, since LON-CAPA does not parse loncapa/perl scripts in the same way)
977: $node->removeChild($node->firstChild);
978: $node->appendChild($doc->createComment($2));
979: next;
980: }
981: # at the beginning:
982: $value =~ s/^(\s*)<!\[CDATA\[/$1/; # <![CDATA[
983: $value =~ s/^(\s*)\/\*\s*<!\[CDATA\[\s*\*\//$1/; # /* <![CDATA[ */
984: $value =~ s/^(\s*)\/\/\s*<!\[CDATA\[/$1/; # // <![CDATA[
985: $value =~ s/^(\s*)(\/\/)?\s*<!--/$1/; # // <!--
986: # at the end:
987: $value =~ s/\/\/\s*\]\]>(\s*)$/$1/; # // ]]>
988: $value =~ s/\]\]>(\s*)$/$1/; # ]]>
989: $value =~ s/(\/\/)?\s*-->(\s*)$/$2/; # // -->
990: $value =~ s/\/\*\s*\]\]>\s*\*\/(\s*)$/$1/; # /* ]]> */
991:
992: $value = "\n".$value."\n";
993: $value =~ s/\s*(\n[ \t]*)/$1/;
994: $value =~ s/\s+$/\n/;
995: $node->firstChild->setData($value);
996: }
997: }
998: }
999: }
1000:
1001: # adds CDATA sections to scripts
1002: sub add_cdata_sections {
1003: my ($root) = @_;
1004: my $doc = $root->ownerDocument;
1005: my @scripts = $root->getElementsByTagName('script');
1006: my @answers = $root->getElementsByTagName('answer');
1007: foreach my $answer (@answers) {
1008: my $ancestor = $answer->parentNode;
1009: my $found_capa_response = 0;
1010: while (defined $ancestor) {
1011: if ($ancestor->nodeName eq 'numericalresponse' || $ancestor->nodeName eq 'formularesponse') {
1012: $found_capa_response = 1;
1013: last;
1014: }
1015: $ancestor = $ancestor->parentNode;
1016: }
1017: if (!$found_capa_response) {
1018: push(@scripts, $answer);
1019: }
1020: }
1021: foreach my $script (@scripts) {
1022: # use a CDATA section in the normal situation, for any script
1023: my $first = $script->firstChild;
1024: if (defined $first && $first->nodeType == XML_TEXT_NODE && !defined $first->nextSibling) {
1025: my $cdata = $doc->createCDATASection($first->nodeValue);
1026: $script->replaceChild($cdata, $first);
1027: }
1028: }
1029: }
1030:
1031: # removes "<!--" and "-->" at the beginning and end of style elements
1032: sub fix_style_element {
1033: my ($root) = @_;
1034: my @styles = $root->getElementsByTagName('style');
1035: foreach my $style (@styles) {
1036: if (defined $style->firstChild && $style->firstChild->nodeType == XML_TEXT_NODE &&
1037: !defined $style->firstChild->nextSibling) {
1038: my $text = $style->firstChild->nodeValue;
1039: if ($text =~ /^\s*<!--(.*)-->\s*$/s) {
1040: $style->firstChild->setData($1);
1041: }
1042: }
1043: }
1044: }
1045:
1046: # create missing cells at the end of table rows
1047: sub fix_tables {
1048: my ($root) = @_;
1049: my @tables = $root->getElementsByTagName('table');
1050: foreach my $table (@tables) {
1051: fix_cells($table);
1052: foreach my $tbody ($table->getChildrenByTagName('tbody')) {
1053: fix_cells($tbody);
1054: }
1055: foreach my $thead ($table->getChildrenByTagName('thead')) {
1056: fix_cells($thead);
1057: }
1058: foreach my $tfoot ($table->getChildrenByTagName('tfoot')) {
1059: fix_cells($tfoot);
1060: }
1061: }
1062: }
1063:
1064: # create missing cells at the end of table rows
1065: sub fix_cells {
1066: my ($table) = @_; # could actually be table, tbody, thead or tfoot
1067: my $doc = $table->ownerDocument;
1068: my @nb_cells = ();
1069: my $max_nb_cells = 0;
1070: my @rowspans = ();
1071: my @trs = $table->getChildrenByTagName('tr');
1072: foreach my $tr (@trs) {
1073: my $nb_cells;
1074: if (defined $rowspans[0]) {
1075: $nb_cells = shift(@rowspans);
1076: } else {
1077: $nb_cells = 0;
1078: }
1079: for (my $cell=$tr->firstChild; defined $cell; $cell=$cell->nextSibling) {
1080: if ($cell->nodeName eq 'td' || $cell->nodeName eq 'th') {
1081: my $colspan = $cell->getAttribute('colspan');
1082: if (defined $colspan && $colspan =~ /^\s*[0-9]+\s*$/) {
1083: $nb_cells += $colspan;
1084: } else {
1085: $nb_cells++;
1086: }
1087: my $rowspan = $cell->getAttribute('rowspan');
1088: if (defined $rowspan && $rowspan =~ /^\s*[0-9]+\s*$/) {
1089: for (my $i=0; $i < $rowspan-1; $i++) {
1090: if (!defined $rowspans[$i]) {
1091: $rowspans[$i] = 1;
1092: } else {
1093: $rowspans[$i]++;
1094: }
1095: }
1096: }
1097: }
1098: }
1099: push(@nb_cells, $nb_cells);
1100: if ($nb_cells > $max_nb_cells) {
1101: $max_nb_cells = $nb_cells;
1102: }
1103: }
1104: foreach my $tr (@trs) {
1105: my $nb_cells = shift(@nb_cells);
1106: if ($nb_cells < $max_nb_cells) {
1107: for (1..($max_nb_cells - $nb_cells)) {
1108: $tr->appendChild($doc->createElement('td'));
1109: }
1110: }
1111: }
1112: }
1113:
1114: # replaces ul/ul by ul/li/ul and the same for ol (using the previous li if possible)
1115: # also adds a ul element when a li has no ul/ol ancestor
1116: sub fix_lists {
1117: my ($root) = @_;
1118: my $doc = $root->ownerDocument;
1119: my @uls = $root->getElementsByTagName('ul');
1120: my @ols = $root->getElementsByTagName('ol');
1121: my @lists = (@uls, @ols);
1122: foreach my $list (@lists) {
1123: my $next;
1124: for (my $child=$list->firstChild; defined $child; $child=$next) {
1125: $next = $child->nextSibling;
1126: if ($child->nodeType == XML_ELEMENT_NODE && string_in_array(['ul','ol'], $child->nodeName)) {
1127: my $previous = $child->previousNonBlankSibling(); # note: non-DOM method
1128: $list->removeChild($child);
1129: if (defined $previous && $previous->nodeType == XML_ELEMENT_NODE && $previous->nodeName eq 'li') {
1130: $previous->appendChild($child);
1131: } else {
1132: my $li = $doc->createElement('li');
1133: $li->appendChild($child);
1134: if (!defined $next) {
1135: $list->appendChild($li);
1136: } else {
1137: $list->insertBefore($li, $next);
1138: }
1139: }
1140: }
1141: }
1142: }
1143: my @lis = $root->getElementsByTagName('li');
1144: foreach my $li (@lis) {
1145: my $found_list_ancestor = 0;
1146: my $ancestor = $li->parentNode;
1147: while (defined $ancestor) {
1148: if ($ancestor->nodeName eq 'ul' || $ancestor->nodeName eq 'ol') {
1149: $found_list_ancestor = 1;
1150: last;
1151: }
1152: $ancestor = $ancestor->parentNode;
1153: }
1154: if (!$found_list_ancestor) {
1155: # replace li by ul and add li under ul
1156: my $ul = $doc->createElement('ul');
1157: $li->parentNode->insertBefore($ul, $li);
1158: $li->parentNode->removeChild($li);
1159: $ul->appendChild($li);
1160: # add all other li afterwards inside ul (there might be text nodes in-between)
1161: my $next = $ul->nextSibling;
1162: while (defined $next) {
1163: my $next_next = $next->nextSibling;
1164: if ($next->nodeType == XML_TEXT_NODE && $next->nodeValue =~ /^\s*$/ &&
1165: defined $next_next && $next_next->nodeType == XML_ELEMENT_NODE && $next_next->nodeName eq 'li') {
1166: $next->parentNode->removeChild($next);
1167: $ul->appendChild($next);
1168: $next = $next_next;
1169: $next_next = $next_next->nextSibling;
1170: }
1171: if ($next->nodeType == XML_ELEMENT_NODE && $next->nodeName eq 'li') {
1172: $next->parentNode->removeChild($next);
1173: $ul->appendChild($next);
1174: } else {
1175: last;
1176: }
1177: $next = $next_next;
1178: }
1179: }
1180: }
1181: }
1182:
1183: # Some "image" elements are actually img element with a wrong name. This renames them.
1184: # Amazingly enough, "<image src=..." displays an image in some browsers
1185: # ("image" has existed at some point as an experimental HTML element).
1186: sub fix_wrong_name_for_img {
1187: my ($root) = @_;
1188: my @images = $root->getElementsByTagName('image');
1189: foreach my $image (@images) {
1190: if (!defined $image->getAttribute('src')) {
1191: next;
1192: }
1193: my $found_correct_ancestor = 0;
1194: my $ancestor = $image->parentNode;
1195: while (defined $ancestor) {
1196: if ($ancestor->nodeName eq 'drawimage' || $ancestor->nodeName eq 'imageresponse') {
1197: $found_correct_ancestor = 1;
1198: last;
1199: }
1200: $ancestor = $ancestor->parentNode;
1201: }
1202: if ($found_correct_ancestor) {
1203: next;
1204: }
1205: # this really has to be renamed "img"
1206: $image->setNodeName('img');
1207: }
1208: }
1209:
1210: # Replaces many deprecated attributes and replaces them by equivalent CSS when possible
1211: sub replace_deprecated_attributes_by_css {
1212: my ($root) = @_;
1213:
1214: fix_deprecated_in_tables($root);
1215:
1216: fix_deprecated_in_table_rows($root);
1217:
1218: fix_deprecated_in_table_cells($root);
1219:
1220: fix_deprecated_in_lists($root);
1221:
1222: fix_deprecated_in_list_items($root);
1223:
1224: fix_deprecated_in_hr($root);
1225:
1226: fix_deprecated_in_img($root);
1227:
1228: fix_deprecated_in_body($root);
1229:
1230: fix_align_attribute($root);
1231: }
1232:
1233: # Replaces deprecated attributes in tables
1234: sub fix_deprecated_in_tables {
1235: my ($root) = @_;
1236: my @tables = $root->getElementsByTagName('table');
1237: foreach my $table (@tables) {
1238: tie (my %new_properties, 'Tie::IxHash', ());
1239: my $align = $table->getAttribute('align');
1240: if (defined $align) {
1241: $table->removeAttribute('align');
1242: $align = lc(trim($align));
1243: }
1244: if ($table->parentNode->nodeName eq 'center' || (defined $align && $align eq 'center') ||
1245: (defined $table->parentNode->getAttribute('align') && $table->parentNode->getAttribute('align') eq 'center')) {
1246: $new_properties{'margin-left'} = 'auto';
1247: $new_properties{'margin-right'} = 'auto';
1248: }
1249: if (defined $align && ($align eq 'left' || $align eq 'right')) {
1250: $new_properties{'float'} = $align;
1251: }
1252: my $width = $table->getAttribute('width');
1253: if (defined $width) {
1254: $table->removeAttribute('width');
1255: $width = trim($width);
1256: if ($width =~ /^[0-9]+$/) {
1257: $width .= 'px';
1258: }
1259: if ($width ne '') {
1260: $new_properties{'width'} = $width;
1261: }
1262: }
1263: my $height = $table->getAttribute('height');
1264: if (defined $height) {
1265: $table->removeAttribute('height');
1266: # no replacement for table height
1267: }
1268: my $bgcolor = $table->getAttribute('bgcolor');
1269: if (defined $bgcolor) {
1270: $table->removeAttribute('bgcolor');
1271: $bgcolor = trim($bgcolor);
1272: $bgcolor =~ s/^x\s*//;
1273: if ($bgcolor ne '') {
1274: $new_properties{'background-color'} = $bgcolor;
1275: }
1276: }
1277: my $frame = $table->getAttribute('frame');
1278: if (defined $frame) {
1279: $table->removeAttribute('frame');
1280: $frame = lc(trim($frame));
1281: if ($frame eq 'void') {
1282: $new_properties{'border'} = 'none';
1283: } elsif ($frame eq 'above') {
1284: $new_properties{'border-top'} = '1px solid black';
1285: } elsif ($frame eq 'below') {
1286: $new_properties{'border-bottom'} = '1px solid black';
1287: } elsif ($frame eq 'hsides') {
1288: $new_properties{'border-top'} = '1px solid black';
1289: $new_properties{'border-bottom'} = '1px solid black';
1290: } elsif ($frame eq 'vsides') {
1291: $new_properties{'border-left'} = '1px solid black';
1292: $new_properties{'border-right'} = '1px solid black';
1293: } elsif ($frame eq 'lhs') {
1294: $new_properties{'border-left'} = '1px solid black';
1295: } elsif ($frame eq 'rhs') {
1296: $new_properties{'border-right'} = '1px solid black';
1297: } elsif ($frame eq 'box') {
1298: $new_properties{'border'} = '1px solid black';
1299: } elsif ($frame eq 'border') {
1300: $new_properties{'border'} = '1px solid black';
1301: }
1302: }
1303: if (scalar(keys %new_properties) > 0) {
1304: set_css_properties($table, \%new_properties);
1305: }
1306: # we can't replace the border attribute without creating a style block, but we can improve things like border="BORDER"
1307: my $border = $table->getAttribute('border');
1308: if (defined $border) {
1309: $border = trim($border);
1310: if ($border !~ /^\s*[0-9]+\s*(px)?\s*$/) {
1311: $table->setAttribute('border', '1');
1312: }
1313: }
1314: }
1315:
1316: }
1317:
1318: # Replaces deprecated attributes in tr elements
1319: sub fix_deprecated_in_table_rows {
1320: my ($root) = @_;
1321: my @trs = $root->getElementsByTagName('tr');
1322: foreach my $tr (@trs) {
1323: my $old_properties = get_css_properties($tr);
1324: tie (my %new_properties, 'Tie::IxHash', ());
1325: my $bgcolor = $tr->getAttribute('bgcolor');
1326: if (defined $bgcolor) {
1327: $tr->removeAttribute('bgcolor');
1328: if (!defined $old_properties->{'background-color'}) {
1329: $bgcolor = trim($bgcolor);
1330: $bgcolor =~ s/^x\s*//;
1331: if ($bgcolor ne '') {
1332: $new_properties{'background-color'} = $bgcolor;
1333: }
1334: }
1335: }
1336: my $align = $tr->getAttribute('align');
1337: if (defined $align && $align !~ /\s*char\s*/i) {
1338: $tr->removeAttribute('align');
1339: if (!defined $old_properties->{'text-align'}) {
1340: $align = lc(trim($align));
1341: if ($align ne '') {
1342: $new_properties{'text-align'} = $align;
1343: }
1344: }
1345: }
1346: my $valign = $tr->getAttribute('valign');
1347: if (defined $valign) {
1348: $tr->removeAttribute('valign');
1349: if (!defined $old_properties->{'vertical-align'}) {
1350: $valign = lc(trim($valign));
1351: if ($valign ne '') {
1352: $new_properties{'vertical-align'} = $valign;
1353: }
1354: }
1355: }
1356: if (scalar(keys %new_properties) > 0) {
1357: set_css_properties($tr, \%new_properties);
1358: }
1359: }
1360: }
1361:
1362: # Replaces deprecated attributes in table cells (td and th)
1363: sub fix_deprecated_in_table_cells {
1364: my ($root) = @_;
1365: my @tds = $root->getElementsByTagName('td');
1366: my @ths = $root->getElementsByTagName('th');
1367: my @cells = (@tds, @ths);
1368: foreach my $cell (@cells) {
1369: my $old_properties = get_css_properties($cell);
1370: tie (my %new_properties, 'Tie::IxHash', ());
1371: my $width = $cell->getAttribute('width');
1372: if (defined $width) {
1373: $cell->removeAttribute('width');
1374: if (!defined $old_properties->{'width'}) {
1375: $width = trim($width);
1376: if ($width =~ /^[0-9]+$/) {
1377: $width .= 'px';
1378: }
1379: if ($width ne '') {
1380: $new_properties{'width'} = $width;
1381: }
1382: }
1383: }
1384: my $height = $cell->getAttribute('height');
1385: if (defined $height) {
1386: $cell->removeAttribute('height');
1387: if (!defined $old_properties->{'height'}) {
1388: $height = trim($height);
1389: if ($height =~ /^[0-9]+$/) {
1390: $height .= 'px';
1391: }
1392: if ($height ne '') {
1393: $new_properties{'height'} = $height;
1394: }
1395: }
1396: }
1397: my $bgcolor = $cell->getAttribute('bgcolor');
1398: if (defined $bgcolor) {
1399: $cell->removeAttribute('bgcolor');
1400: if (!defined $old_properties->{'background-color'}) {
1401: $bgcolor = trim($bgcolor);
1402: $bgcolor =~ s/^x\s*//;
1403: if ($bgcolor ne '') {
1404: $new_properties{'background-color'} = $bgcolor;
1405: }
1406: }
1407: }
1408: my $align = $cell->getAttribute('align');
1409: if (defined $align && $align !~ /\s*char\s*/i) {
1410: $cell->removeAttribute('align');
1411: if (!defined $old_properties->{'text-align'}) {
1412: $align = lc(trim($align));
1413: if ($align ne '') {
1414: $new_properties{'text-align'} = $align;
1415: }
1416: }
1417: }
1418: my $valign = $cell->getAttribute('valign');
1419: if (defined $valign) {
1420: $cell->removeAttribute('valign');
1421: if (!defined $old_properties->{'vertical-align'}) {
1422: $valign = lc(trim($valign));
1423: if ($valign ne '') {
1424: $new_properties{'vertical-align'} = $valign;
1425: }
1426: }
1427: }
1428: if (scalar(keys %new_properties) > 0) {
1429: set_css_properties($cell, \%new_properties);
1430: }
1431: }
1432: }
1433:
1434: # Replaces deprecated attributes in lists (ul and ol)
1435: sub fix_deprecated_in_lists {
1436: my ($root) = @_;
1437: my @uls = $root->getElementsByTagName('ul');
1438: my @ols = $root->getElementsByTagName('ol');
1439: my @lists = (@uls, @ols);
1440: foreach my $list (@lists) {
1441: my $type = $list->getAttribute('type');
1442: if (defined $type) {
1443: my $lst = list_style_type($type);
1444: if (defined $lst) {
1445: $list->removeAttribute('type');
1446: if (!defined get_css_property($list, 'list-style-type')) {
1447: set_css_property($list, 'list-style-type', $lst);
1448: }
1449: }
1450: }
1451: }
1452: }
1453:
1454: # Replaces deprecated attributes in list items (li)
1455: sub fix_deprecated_in_list_items {
1456: my ($root) = @_;
1457: my @lis = $root->getElementsByTagName('li');
1458: foreach my $li (@lis) {
1459: my $type = $li->getAttribute('type');
1460: if (defined $type) {
1461: my $lst = list_style_type($type);
1462: if (defined $lst) {
1463: $li->removeAttribute('type');
1464: if (!defined get_css_property($li, 'list-style-type')) {
1465: set_css_property($li, 'list-style-type', $lst);
1466: }
1467: }
1468: }
1469: }
1470: }
1471:
1472: # returns the CSS list-style-type value equivalent to the given type attribute for a list or list item
1473: sub list_style_type {
1474: my ($type) = @_;
1475: my $value;
1476: $type = trim($type);
1477: if (lc($type) eq 'circle') {
1478: $value = 'circle';
1479: } elsif (lc($type) eq 'disc') {
1480: $value = 'disc';
1481: } elsif (lc($type) eq 'square') {
1482: $value = 'square';
1483: } elsif ($type eq 'a') {
1484: $value = 'lower-latin';
1485: } elsif ($type eq 'A') {
1486: $value = 'upper-latin';
1487: } elsif ($type eq 'i') {
1488: $value = 'lower-roman';
1489: } elsif ($type eq 'I') {
1490: $value = 'upper-roman';
1491: } elsif ($type eq '1') {
1492: $value = 'decimal';
1493: }
1494: return $value;
1495: }
1496:
1497: # Replaces deprecated attributes in hr
1498: sub fix_deprecated_in_hr {
1499: my ($root) = @_;
1500: my @hrs = $root->getElementsByTagName('hr');
1501: foreach my $hr (@hrs) {
1502: tie (my %new_properties, 'Tie::IxHash', ());
1503: my $align = $hr->getAttribute('align');
1504: if (defined $align) {
1505: $align = lc(trim($align));
1506: if ($align eq 'left') {
1507: $new_properties{'text-align'} = 'left';
1508: $new_properties{'margin-left'} = '0';
1509: } elsif ($align eq 'right') {
1510: $new_properties{'text-align'} = 'right';
1511: $new_properties{'margin-right'} = '0';
1512: }
1513: $hr->removeAttribute('align');
1514: }
1515: my $color = $hr->getAttribute('color');
1516: if (defined $color) {
1517: $color = trim($color);
1518: $color =~ s/^x\s*//;
1519: if ($color ne '') {
1520: $new_properties{'color'} = $color;
1521: $new_properties{'background-color'} = $color;
1522: }
1523: $hr->removeAttribute('color');
1524: }
1525: my $noshade = $hr->getAttribute('noshade');
1526: my $size = $hr->getAttribute('size');
1527: if (defined $noshade) {
1528: $new_properties{'border-width'} = '0';
1529: if (!defined $color) {
1530: $new_properties{'color'} = 'gray';
1531: $new_properties{'background-color'} = 'gray';
1532: }
1533: if (!defined $size) {
1534: $size = '2';
1535: }
1536: $hr->removeAttribute('noshade');
1537: }
1538: if (defined $size) {
1539: $size = trim($size);
1540: if ($size ne '') {
1541: $new_properties{'height'} = $size.'px';
1542: }
1543: if (defined $hr->getAttribute('size')) {
1544: $hr->removeAttribute('size');
1545: }
1546: }
1547: my $width = $hr->getAttribute('width');
1548: if (defined $width) {
1549: $width = trim($width);
1550: if ($width ne '') {
1551: if ($width !~ /\%$/) {
1552: $width .= 'px';
1553: }
1554: $new_properties{'width'} = $width;
1555: }
1556: $hr->removeAttribute('width');
1557: }
1558: if (scalar(keys %new_properties) > 0) {
1559: set_css_properties($hr, \%new_properties);
1560: }
1561: }
1562: }
1563:
1564: # Replaces deprecated attributes in img
1565: sub fix_deprecated_in_img {
1566: my ($root) = @_;
1567: my @imgs = $root->getElementsByTagName('img');
1568: foreach my $img (@imgs) {
1569: my $old_properties = get_css_properties($img);
1570: tie (my %new_properties, 'Tie::IxHash', ());
1571: my $align = $img->getAttribute('align');
1572: if (defined $align) {
1573: $align = lc(trim($align));
1574: if ($align eq 'middle' || $align eq 'top' || $align eq 'bottom') {
1575: $img->removeAttribute('align');
1576: if (!defined $old_properties->{'vertical-align'}) {
1577: $new_properties{'vertical-align'} = $align;
1578: }
1579: } elsif ($align eq 'left' || $align eq 'right') {
1580: $img->removeAttribute('align');
1581: if (!defined $old_properties->{'float'}) {
1582: $new_properties{'float'} = $align;
1583: }
1584: } elsif ($align eq 'center' || $align eq '') {
1585: $img->removeAttribute('align');
1586: }
1587: }
1588: my $border = $img->getAttribute('border');
1589: if (defined $border) {
1590: $border = lc(trim($border));
1591: if ($border =~ /^[0-9]+\s*(px)?$/) {
1592: $img->removeAttribute('border');
1593: if (!defined $old_properties->{'border'}) {
1594: if ($border !~ /px$/) {
1595: $border .= 'px';
1596: }
1597: $new_properties{'border'} = $border.' solid black';
1598: }
1599: }
1600: }
1601: my $hspace = $img->getAttribute('hspace');
1602: if (defined $hspace) {
1603: $hspace = lc(trim($hspace));
1604: if ($hspace =~ /^[0-9]+\s*(px)?$/) {
1605: $img->removeAttribute('hspace');
1606: if (!defined $old_properties->{'margin-left'} || !defined $old_properties->{'margin-right'}) {
1607: if ($hspace !~ /px$/) {
1608: $hspace .= 'px';
1609: }
1610: $new_properties{'margin-left'} = $hspace;
1611: $new_properties{'margin-right'} = $hspace;
1612: }
1613: }
1614: }
1615: if (scalar(keys %new_properties) > 0) {
1616: set_css_properties($img, \%new_properties);
1617: }
1618: }
1619: }
1620:
1621: # Replaces deprecated attributes in htmlbody (the style attribute could be used in a div for output)
1622: sub fix_deprecated_in_body {
1623: my ($root) = @_;
1624: my $doc = $root->ownerDocument;
1625: my @bodies = $root->getElementsByTagName('htmlbody');
1626: foreach my $body (@bodies) {
1627: my $old_properties = get_css_properties($body);
1628: tie (my %new_properties, 'Tie::IxHash', ());
1629: my $bgcolor = $body->getAttribute('bgcolor');
1630: if (defined $bgcolor) {
1631: $body->removeAttribute('bgcolor');
1632: if (!defined $old_properties->{'background-color'}) {
1633: $bgcolor = trim($bgcolor);
1634: $bgcolor =~ s/^x\s*//;
1635: if ($bgcolor ne '') {
1636: $new_properties{'background-color'} = $bgcolor;
1637: }
1638: }
1639: }
1640: my $color = $body->getAttribute('text');
1641: if (defined $color) {
1642: $body->removeAttribute('text');
1643: if (!defined $old_properties->{'color'}) {
1644: $color = trim($color);
1645: $color =~ s/^x\s*//;
1646: if ($color ne '') {
1647: $new_properties{'color'} = $color;
1648: }
1649: }
1650: }
1651: my $background = $body->getAttribute('background');
1652: if (defined $background && ($background =~ /\.jpe?g$|\.gif|\.png/i)) {
1653: $body->removeAttribute('background');
1654: if (!defined $old_properties->{'background-image'}) {
1655: $background = trim($background);
1656: if ($background ne '') {
1657: $new_properties{'background-image'} = 'url('.$background.')';
1658: }
1659: }
1660: }
1661: # NOTE: these attributes have never been standard and are better removed with no replacement
1662: foreach my $bad ('bottommargin', 'leftmargin', 'rightmargin', 'topmargin', 'marginheight', 'marginwidth') {
1663: if ($body->hasAttribute($bad)) {
1664: $body->removeAttribute($bad);
1665: }
1666: }
1667: # NOTE: link alink and vlink require a <style> block to be converted
1668: my $link = $body->getAttribute('link');
1669: my $alink = $body->getAttribute('alink');
1670: my $vlink = $body->getAttribute('vlink');
1671: if (defined $link || defined $alink || defined $vlink) {
1672: my $head;
1673: my @heads = $root->getElementsByTagName('htmlhead');
1674: if (scalar(@heads) > 0) {
1675: $head = $heads[0];
1676: } else {
1677: $head = $doc->createElement('htmlhead');
1678: $root->insertBefore($head, $root->firstChild);
1679: }
1680: my $style = $doc->createElement('style');
1681: $head->appendChild($style);
1682: my $css = "\n";
1683: if (defined $link) {
1684: $body->removeAttribute('link');
1685: $link = trim($link);
1686: $link =~ s/^x\s*//;
1687: $css .= ' a:link { color:'.$link.' }';
1688: $css .= "\n";
1689: }
1690: if (defined $alink) {
1691: $body->removeAttribute('alink');
1692: $alink = trim($alink);
1693: $alink =~ s/^x\s*//;
1694: $css .= ' a:active { color:'.$alink.' }';
1695: $css .= "\n";
1696: }
1697: if (defined $vlink) {
1698: $body->removeAttribute('vlink');
1699: $vlink = trim($vlink);
1700: $vlink =~ s/^x\s*//;
1701: $css .= ' a:visited { color:'.$vlink.' }';
1702: $css .= "\n";
1703: }
1704: $css .= ' ';
1705: $style->appendChild($doc->createTextNode($css));
1706: }
1707: if (scalar(keys %new_properties) > 0) {
1708: set_css_properties($body, \%new_properties);
1709: } elsif (!$body->hasAttributes) {
1710: $body->parentNode->removeChild($body);
1711: }
1712: }
1713: }
1714:
1715: # replaces <div align="center"> by <div style="text-align:center;">
1716: # also for p and h1..h6
1717: sub fix_align_attribute {
1718: my ($root) = @_;
1719: my @nodes = $root->getElementsByTagName('div');
1720: push(@nodes, $root->getElementsByTagName('p'));
1721: for (my $i=1; $i<=6; $i++) {
1722: push(@nodes, $root->getElementsByTagName('h'.$i));
1723: }
1724: foreach my $node (@nodes) {
1725: my $align = $node->getAttribute('align');
1726: if (defined $align) {
1727: $node->removeAttribute('align');
1728: $align = trim($align);
1729: if ($align ne '' && !defined get_css_property($node, 'text-align')) {
1730: set_css_property($node, 'text-align', lc($align));
1731: }
1732: }
1733: }
1734: }
1735:
1736: # replace center by a div or remove it if there is a table inside
1737: sub replace_center {
1738: my ($root, $all_block) = @_;
1739: my $doc = $root->ownerDocument;
1740: my @centers = $root->getElementsByTagName('center');
1741: foreach my $center (@centers) {
1742: if ($center->getChildrenByTagName('table')->size() > 0) { # note: getChildrenByTagName is not DOM (LibXML specific)
1743: replace_by_children($center);
1744: } else {
1745: if ((!defined $center->previousSibling ||
1746: ($center->previousSibling->nodeType == XML_TEXT_NODE && $center->previousSibling->nodeValue =~ /^\s*$/ && !defined $center->previousSibling->previousSibling)) &&
1747: (!defined $center->nextSibling ||
1748: ($center->nextSibling->nodeType == XML_TEXT_NODE && $center->nextSibling->nodeValue =~ /^\s*$/ && !defined $center->nextSibling->nextSibling)) &&
1749: string_in_array(\@accepting_style, $center->parentNode->nodeName)) {
1750: # use CSS on the parent block and replace center by its children
1751: set_css_property($center->parentNode, 'text-align', 'center');
1752: replace_by_children($center);
1753: } else {
1754: # use p or div ? check if there is a block inside
1755: my $found_block = 0;
1756: for (my $child=$center->firstChild; defined $child; $child=$child->nextSibling) {
1757: if ($child->nodeType == XML_ELEMENT_NODE && string_in_array($all_block, $child->nodeName)) {
1758: $found_block = 1;
1759: last;
1760: }
1761: }
1762: my $new_node;
1763: if ($found_block) {
1764: $new_node = $doc->createElement('div');
1765: $new_node->setAttribute('style', 'text-align: center; margin: 0 auto');
1766: } else {
1767: $new_node = $doc->createElement('p');
1768: $new_node->setAttribute('style', 'text-align: center');
1769: }
1770: my $next;
1771: for (my $child=$center->firstChild; defined $child; $child=$next) {
1772: $next = $child->nextSibling;
1773: $center->removeChild($child);
1774: $new_node->appendChild($child);
1775: }
1776: $center->parentNode->replaceChild($new_node, $center);
1777: }
1778: }
1779: }
1780: }
1781:
1782: # replaces <nobr> by <span style="white-space:nowrap">
1783: sub replace_nobr {
1784: my ($root) = @_;
1785: my @nobrs = $root->getElementsByTagName('nobr');
1786: foreach my $nobr (@nobrs) {
1787: if (!defined $nobr->previousSibling && !defined $nobr->nextSibling &&
1788: string_in_array(\@accepting_style, $nobr->parentNode->nodeName)) {
1789: # use CSS on the parent block
1790: set_css_property($nobr->parentNode, 'white-space', 'nowrap');
1791: replace_by_children($nobr);
1792: } else {
1793: $nobr->setNodeName('span');
1794: $nobr->setAttribute('style', 'white-space:nowrap');
1795: }
1796: }
1797: }
1798:
1799: # removes notsolved tags in the case <hintgroup showoncorrect="no"><notsolved>...</notsolved></hintgroup>
1800: # and in the case <notsolved><hintgroup showoncorrect="no">...</hintgroup></notsolved>
1801: sub remove_useless_notsolved {
1802: my ($root) = @_;
1803: my @hintgroups = $root->getElementsByTagName('hintgroup');
1804: foreach my $hintgroup (@hintgroups) {
1805: my $showoncorrect = get_non_empty_attribute($hintgroup, 'showoncorrect');
1806: if (!defined $showoncorrect || $showoncorrect eq 'no') {
1807: my @notsolveds = $hintgroup->getElementsByTagName('notsolved');
1808: foreach my $notsolved (@notsolveds) {
1809: replace_by_children($notsolved);
1810: }
1811: }
1812: my $parent = $hintgroup->parentNode;
1813: if ($parent->nodeName eq 'notsolved' && scalar(@{$parent->nonBlankChildNodes()}) == 1) {
1814: replace_by_children($parent);
1815: }
1816: }
1817: }
1818:
1819: # adds a paragraph inside if needed and calls fix_paragraph for all paragraphs (including new ones)
1820: sub fix_paragraphs_inside {
1821: my ($node, $all_block) = @_;
1822: # blocks in which paragrahs will be added:
1823: my @blocks_with_p = ('loncapa','library','problem','part','problemtype','window','block','while','postanswerdate','preduedate','solved','notsolved','languageblock','translated','lang','instructorcomment','togglebox','standalone','form');
1824: my @fix_p_if_br_or_p = (@responses,'foil','item','text','label','hintgroup','hintpart','hint','web','windowlink','div','li','dd','td','th','blockquote');
1825: if ((string_in_array(\@blocks_with_p, $node->nodeName) && paragraph_needed($node)) ||
1826: (string_in_array(\@fix_p_if_br_or_p, $node->nodeName) && paragraph_inside($node))) {
1827: # if non-empty, add paragraphs where needed between all br and remove br
1828: # (it would be easier to just put everything in a p and fix it afterwards, but there are performance issues
1829: # when a paragraph has many blocks directly inside)
1830: my $doc = $node->ownerDocument;
1831: my $p = undef;
1832: my @new_children = ();
1833: my $next;
1834: for (my $child=$node->firstChild; defined $child; $child=$next) {
1835: $next = $child->nextSibling;
1836: $node->removeChild($child);
1837: if ($child->nodeType == XML_ELEMENT_NODE && $child->nodeName eq 'br') {
1838: if (defined $p) {
1839: push(@new_children, $p);
1840: } else {
1841: push(@new_children, $doc->createElement('p'));
1842: }
1843: $p = undef;
1844: } elsif ($child->nodeType == XML_ELEMENT_NODE && string_in_array(\@inline_like_block, $child->nodeName)) {
1845: # inline_like_block: use the paragraph if there is one, otherwise do not create one
1846: if (defined $p) {
1847: $p->appendChild($child);
1848: } else {
1849: push(@new_children, $child);
1850: }
1851: } elsif ($child->nodeType == XML_ELEMENT_NODE && string_in_array($all_block, $child->nodeName)) {
1852: # these children are blocks and should not be in a paragraph
1853: if (defined $p) {
1854: push(@new_children, $p);
1855: $p = undef;
1856: }
1857: push(@new_children, $child);
1858: } elsif ($child->nodeType == XML_TEXT_NODE && $child->nodeValue =~ /^[ \t\f\n\r]*$/) {
1859: # blank text: add to paragraph if there is one and there is a next node, otherwise keep out of the paragraph
1860: if (defined $p) {
1861: if (defined $next) {
1862: $p->appendChild($child);
1863: } else {
1864: push(@new_children, $p);
1865: $p = undef;
1866: push(@new_children, $child);
1867: }
1868: } else {
1869: push(@new_children, $child);
1870: }
1871: } elsif ($child->nodeType == XML_TEXT_NODE ||
1872: $child->nodeType == XML_ELEMENT_NODE || $child->nodeType == XML_CDATA_SECTION_NODE ||
1873: $child->nodeType == XML_ENTITY_NODE || $child->nodeType == XML_ENTITY_REF_NODE) {
1874: # these children require a paragraph
1875: if (!defined $p) {
1876: $p = $doc->createElement('p');
1877: }
1878: $p->appendChild($child);
1879: } else {
1880: # these children do not require a paragraph (XML comments, PI)
1881: # -> do not move them in a new paragraph
1882: if (defined $p) {
1883: push(@new_children, $p);
1884: $p = undef;
1885: }
1886: push(@new_children, $child);
1887: }
1888: }
1889: if (defined $p) {
1890: push(@new_children, $p);
1891: }
1892: foreach my $child (@new_children) {
1893: $node->appendChild($child);
1894: }
1895: }
1896: # now fix the paragraphs everywhere, so that all inline nodes are inside a paragraph, and block nodes are outside
1897: my $next;
1898: for (my $child=$node->firstChild; defined $child; $child=$next) {
1899: $next = $child->nextSibling;
1900: if ($child->nodeType == XML_ELEMENT_NODE && defined $child->firstChild) {
1901: if ($child->nodeName eq 'p') {
1902: fix_paragraph($child, $all_block);
1903: } else {
1904: fix_paragraphs_inside($child, $all_block);
1905: }
1906: }
1907: }
1908: }
1909:
1910: # returns 1 if a paragraph is needed inside this node (assuming the parent can have paragraphs)
1911: sub paragraph_needed {
1912: my ($node) = @_;
1913: for (my $child=$node->firstChild; defined $child; $child=$child->nextSibling) {
1914: if (($child->nodeType == XML_TEXT_NODE && $child->nodeValue !~ /^\s*$/) ||
1915: ($child->nodeType == XML_ELEMENT_NODE && !string_in_array(\@inline_like_block, $child->nodeName)) ||
1916: $child->nodeType == XML_CDATA_SECTION_NODE ||
1917: $child->nodeType == XML_ENTITY_NODE || $child->nodeType == XML_ENTITY_REF_NODE) {
1918: return(1);
1919: }
1920: }
1921: return(0);
1922: }
1923:
1924: # returns 1 if there is a paragraph or br in a child of this node, or inside an inline child
1925: sub paragraph_inside {
1926: my ($node) = @_;
1927: # inline elements that can be split in half if there is a paragraph inside (currently all HTML):
1928: # (also used in first_block below)
1929: my @splitable_inline = ('span', 'a', 'strong', 'em' , 'b', 'i', 'sup', 'sub', 'code', 'kbd', 'samp', 'tt', 'ins', 'del', 'var', 'small', 'big', 'font', 'u');
1930: for (my $child=$node->firstChild; defined $child; $child=$child->nextSibling) {
1931: if ($child->nodeType == XML_ELEMENT_NODE) {
1932: my $name = $child->nodeName;
1933: if ($name eq 'p' || $name eq 'br') {
1934: return(1);
1935: } elsif (string_in_array(\@splitable_inline, $name)) {
1936: if (paragraph_inside($child)) {
1937: return(1);
1938: }
1939: }
1940: }
1941: }
1942: return(0);
1943: }
1944:
1945: # fixes paragraphs inside paragraphs (without a block in-between)
1946: sub fix_paragraph {
1947: my ($p, $all_block) = @_;
1948: my $loop_right = 1; # this loops is to avoid out of memory errors with recurse, see below
1949: while ($loop_right) {
1950: $loop_right = 0;
1951: my $block = find_first_block($p, $all_block);
1952: if (defined $block) {
1953: my $trees = clone_ancestor_around_node($p, $block);
1954: my $doc = $p->ownerDocument;
1955: my $replacement = $doc->createDocumentFragment();
1956: my $left = $trees->{'left'};
1957: my $middle = $trees->{'middle'};
1958: my $right = $trees->{'right'};
1959: my $left_needs_p = 0; # 1 if it needs a paragraph (used to replace br later)
1960:
1961: if (defined $left) {
1962: # fix paragraphs inside, in case one of the descendants can have paragraphs inside (like numericalresponse/hintgroup):
1963: for (my $child=$left->firstChild; defined $child; $child=$child->nextSibling) {
1964: if ($child->nodeType == XML_ELEMENT_NODE) {
1965: fix_paragraphs_inside($child, $all_block);
1966: }
1967: }
1968: if (!paragraph_needed($left)) {
1969: # this was just blank text, comments or inline responses, it should not create a new paragraph
1970: my $next;
1971: for (my $child=$left->firstChild; defined $child; $child=$next) {
1972: $next = $child->nextSibling;
1973: $left->removeChild($child);
1974: $replacement->appendChild($child);
1975: }
1976: } else {
1977: $left_needs_p = 1;
1978: $replacement->appendChild($left);
1979: }
1980: }
1981:
1982: my $n = $middle->firstChild;
1983: while (defined $n) {
1984: if ($n->nodeType == XML_ELEMENT_NODE && (string_in_array($all_block, $n->nodeName) || $n->nodeName eq 'br')) {
1985: if ($n->nodeName eq 'p') {
1986: my $parent = $n->parentNode;
1987: # first apply recursion
1988: fix_paragraph($n, $all_block);
1989: # now the p might have been replaced by several nodes, which should replace the initial p
1990: my $next_block;
1991: for (my $block=$parent->firstChild; defined $block; $block=$next_block) {
1992: $next_block = $block->nextSibling;
1993: if ($block->nodeName eq 'p') {
1994: $parent->removeChild($block);
1995: # for each parent before $middle, clone in-between the p and its children (to preserve the styles)
1996: if (defined $block->firstChild) {
1997: for (my $p=$parent; $p!=$middle; $p=$p->parentNode) {
1998: my $newp = $p->cloneNode(0);
1999: my $next;
2000: for (my $child=$block->firstChild; defined $child; $child=$next) {
2001: $next = $child->nextSibling;
2002: $block->removeChild($child);
2003: $newp->appendChild($child);
2004: }
2005: $block->appendChild($newp);
2006: }
2007: }
2008: }
2009: $replacement->appendChild($block);
2010: }
2011: } else {
2012: # replace the whole p by this block, forgetting about intermediate inline elements
2013: $n->parentNode->removeChild($n);
2014: if ($n->nodeName eq 'br') {
2015: # replace a br by a paragraph if there was nothing before in the paragraph,
2016: # otherwise remove it because it already broke the paragraph in half
2017: if (!defined $left || !$left_needs_p) {
2018: $replacement->appendChild($middle);
2019: }
2020: } else {
2021: fix_paragraphs_inside($n, $all_block);
2022: $replacement->appendChild($n);
2023: }
2024: }
2025: last;
2026: }
2027: $n = $n->firstChild;
2028: if (defined $n && defined $n->nextSibling) {
2029: die "Error in post_xml.fix_paragraph: block not found";
2030: }
2031: }
2032:
2033: if (defined $right) {
2034: if ($block->nodeName eq 'p') {
2035: # remove attributes on the right paragraph
2036: my @attributelist = $right->attributes();
2037: foreach my $att (@attributelist) {
2038: $right->removeAttribute($att->nodeName);
2039: }
2040: }
2041: if ($right->firstChild->nodeType == XML_TEXT_NODE && $right->firstChild->nodeValue =~ /^[ \t\f\n\r]*$/) {
2042: # remove the first text node with whitespace only from the p, it should not trigger the creation of a p
2043: # (but take nbsp into account, so we should not use \s here)
2044: my $first = $right->firstChild;
2045: $right->removeChild($first);
2046: $replacement->appendChild($first);
2047: }
2048: if (defined $right->firstChild) {
2049: if (paragraph_needed($right)) {
2050: $replacement->appendChild($right);
2051: #fix_paragraph($right, $all_block); This is taking way too much memory for blocks with many children
2052: # -> loop instead of recurse
2053: $loop_right = 1;
2054: } else {
2055: # this was just blank text, comments or inline responses, it should not create a new paragraph
2056: my $next;
2057: for (my $child=$right->firstChild; defined $child; $child=$next) {
2058: $next = $child->nextSibling;
2059: $right->removeChild($child);
2060: $replacement->appendChild($child);
2061: # fix paragraphs inside, in case one of the descendants can have paragraphs inside (like numericalresponse/hintgroup):
2062: if ($child->nodeType == XML_ELEMENT_NODE) {
2063: fix_paragraphs_inside($child, $all_block);
2064: }
2065: }
2066: }
2067: }
2068: }
2069:
2070: $p->parentNode->replaceChild($replacement, $p);
2071:
2072: if ($loop_right) {
2073: $p = $right;
2074: }
2075:
2076: } else {
2077: # fix paragraphs inside, in case one of the descendants can have paragraphs inside (like numericalresponse/hintgroup):
2078: my $next;
2079: for (my $child=$p->firstChild; defined $child; $child=$next) {
2080: $next = $child->nextSibling;
2081: if ($child->nodeType == XML_ELEMENT_NODE) {
2082: fix_paragraphs_inside($child, $all_block);
2083: }
2084: }
2085: }
2086: }
2087: }
2088:
2089: sub find_first_block {
2090: my ($node, $all_block) = @_;
2091: # inline elements that can be split in half if there is a paragraph inside (currently all HTML):
2092: my @splitable_inline = ('span', 'a', 'strong', 'em' , 'b', 'i', 'sup', 'sub', 'code', 'kbd', 'samp', 'tt', 'ins', 'del', 'var', 'small', 'big', 'font', 'u');
2093: for (my $child=$node->firstChild; defined $child; $child=$child->nextSibling) {
2094: if ($child->nodeType == XML_ELEMENT_NODE) {
2095: if (string_in_array($all_block, $child->nodeName) || $child->nodeName eq 'br') {
2096: return($child);
2097: }
2098: if (string_in_array(\@splitable_inline, $child->nodeName)) {
2099: my $block = find_first_block($child, $all_block);
2100: if (defined $block) {
2101: return($block);
2102: }
2103: }
2104: }
2105: }
2106: return(undef);
2107: }
2108:
2109: # Creates clones of the ancestor containing the descendants before the node, at the node, and after the node.
2110: # returns a hash with: left, middle, right (left and right can be undef)
2111: sub clone_ancestor_around_node {
2112: my ($ancestor, $node) = @_;
2113: my $middle_node;
2114: my ($left, $middle, $right);
2115: for (my $child=$ancestor->firstChild; defined $child; $child=$child->nextSibling) {
2116: if ($child == $node || is_ancestor_of($child, $node)) {
2117: $middle_node = $child;
2118: last;
2119: }
2120: }
2121: if (!defined $middle_node) {
2122: die "error in split_ancestor_around_node: middle not found";
2123: }
2124: if (defined $middle_node->previousSibling) {
2125: $left = $ancestor->cloneNode(0);
2126: for (my $child=$ancestor->firstChild; $child != $middle_node; $child=$child->nextSibling) {
2127: $left->appendChild($child->cloneNode(1));
2128: }
2129: }
2130: $middle = $ancestor->cloneNode(0);
2131: if ($middle_node == $node) {
2132: $middle->appendChild($middle_node->cloneNode(1));
2133: } else {
2134: my $subres = clone_ancestor_around_node($middle_node, $node);
2135: my $subleft = $subres->{'left'};
2136: if (defined $subleft) {
2137: if (!defined $left) {
2138: $left = $ancestor->cloneNode(0);
2139: }
2140: $left->appendChild($subleft);
2141: }
2142: $middle->appendChild($subres->{'middle'});
2143: my $subright = $subres->{'right'};
2144: if (defined $subright) {
2145: $right = $ancestor->cloneNode(0);
2146: $right->appendChild($subright);
2147: }
2148: }
2149: if (defined $middle_node->nextSibling) {
2150: if (!defined $right) {
2151: $right = $ancestor->cloneNode(0);
2152: }
2153: for (my $child=$middle_node->nextSibling; defined $child; $child=$child->nextSibling) {
2154: $right->appendChild($child->cloneNode(1));
2155: }
2156: }
2157: my %result = ();
2158: $result{'left'} = $left;
2159: $result{'middle'} = $middle;
2160: $result{'right'} = $right;
2161: return(\%result);
2162: }
2163:
2164: sub is_ancestor_of {
2165: my ($n1, $n2) = @_;
2166: my $n = $n2->parentNode;
2167: while (defined $n) {
2168: if ($n == $n1) {
2169: return(1);
2170: }
2171: $n = $n->parentNode;
2172: }
2173: return(0);
2174: }
2175:
2176: # removes empty style elements and replaces the ones with only whitespaces inside by their content
2177: # also remove hints that have become empty after empty style removal.
2178: sub remove_empty_style {
2179: my ($root) = @_;
2180: # actually, preserve some elements like ins when they have whitespace, only remove if they are empty
2181: my @remove_if_empty = ('span', 'strong', 'em' , 'b', 'i', 'sup', 'sub', 'code', 'kbd', 'samp', 'tt', 'ins', 'del', 'var', 'small', 'big', 'font', 'u', 'hint');
2182: my @remove_if_blank = ('span', 'strong', 'em' , 'b', 'i', 'sup', 'sub', 'tt', 'var', 'small', 'big', 'font', 'u', 'hint');
2183: foreach my $name (@remove_if_empty) {
2184: my @nodes = $root->getElementsByTagName($name);
2185: while (scalar(@nodes) > 0) {
2186: my $node = pop(@nodes);
2187: if (!defined $node->firstChild) {
2188: my $parent = $node->parentNode;
2189: if (defined $node->previousSibling && $node->previousSibling->nodeType == XML_TEXT_NODE &&
2190: $node->previousSibling->nodeValue =~ /\$\S*$/) {
2191: # case $a<sup></sup>x
2192: my $value = $node->previousSibling->nodeValue;
2193: $value =~ s/\$(\S*)$/\$\{$1\}/;
2194: $node->previousSibling->setData($value);
2195: }
2196: $parent->removeChild($node);
2197: $parent->normalize();
2198: # now that we removed the node, check if the parent has become an empty style, and so on
2199: while (defined $parent && string_in_array(\@remove_if_empty, $parent->nodeName) && !defined $parent->firstChild) {
2200: my $grandparent = $parent->parentNode;
2201: $grandparent->removeChild($parent);
2202: remove_reference_from_array(\@nodes, $parent);
2203: $parent = $grandparent;
2204: }
2205: }
2206: }
2207: }
2208: foreach my $name (@remove_if_blank) {
2209: my @nodes = $root->getElementsByTagName($name);
2210: while (scalar(@nodes) > 0) {
2211: my $node = pop(@nodes);
2212: if (defined $node->firstChild && !defined $node->firstChild->nextSibling && $node->firstChild->nodeType == XML_TEXT_NODE) {
2213: # NOTE: careful, with UTF-8, \s matches non-breaking spaces and we want to preserve these
2214: if ($node->firstChild->nodeValue =~ /^[\t\n\f\r ]*$/) {
2215: my $parent = $node->parentNode;
2216: replace_by_children($node);
2217: $parent->normalize();
2218: # now that we removed the node, check if the parent has become a style with only whitespace, and so on
2219: while (defined $parent && string_in_array(\@remove_if_blank, $parent->nodeName) &&
2220: (!defined $parent->firstChild ||
2221: (!defined $parent->firstChild->nextSibling && $parent->firstChild->nodeType == XML_TEXT_NODE &&
2222: $parent->firstChild->nodeValue =~ /^^[\t\n\f\r ]*$/))) {
2223: my $grandparent = $parent->parentNode;
2224: replace_by_children($parent);
2225: remove_reference_from_array(\@nodes, $parent);
2226: $parent = $grandparent;
2227: }
2228: }
2229: }
2230: }
2231: }
2232: }
2233:
2234: # remove whitespace inside LON-CAPA elements that have an empty content-model (HTML ones are handled by html_to_xml)
2235: sub fix_empty_lc_elements {
2236: my ($node) = @_;
2237: my @lcempty = ('arc','axis','backgroundplot','drawoptionlist','drawvectorsum','fill','functionplotrule','functionplotvectorrule','functionplotvectorsumrule','hiddenline','hiddensubmission','key','line','location','organicstructure','parameter','plotobject','plotvector','responseparam','spline','textline');
2238: if (string_in_array(\@lcempty, $node->nodeName)) {
2239: if (defined $node->firstChild && !defined $node->firstChild->nextSibling &&
2240: $node->firstChild->nodeType == XML_TEXT_NODE && $node->firstChild->nodeValue =~ /^\s*$/) {
2241: $node->removeChild($node->firstChild);
2242: }
2243: if (defined $node->firstChild) {
2244: if ($warnings) {
2245: print "Warning: a ".$node->nodeName." has something inside\n";
2246: }
2247: }
2248: return;
2249: }
2250: for (my $child=$node->firstChild; defined $child; $child=$child->nextSibling) {
2251: if ($child->nodeType == XML_ELEMENT_NODE) {
2252: fix_empty_lc_elements($child);
2253: }
2254: }
2255: }
2256:
2257: # turn some attribute values into lowercase when they should be
2258: sub lowercase_attribute_values {
2259: my ($root) = @_;
2260: my @with_yesno = (['radiobuttonresponse', ['randomize']],
2261: ['optionresponse', ['randomize']],
2262: ['matchresponse', ['randomize']],
2263: ['itemgroup', ['randomize']],
2264: ['rankresponse', ['randomize']],
2265: ['functionplotresponse', ['xaxisvisible', 'yaxisvisible', 'gridvisible']],
2266: ['backgroundplot', ['fixed']],
2267: ['drawvectorsum', ['showvalue']],
2268: ['textline', ['readonly']],
2269: ['hint', ['showoncorrect']],
2270: ['body', ['dir']],
2271: ['img', ['encrypturl']],
2272: ['form', ['method']],
2273: ['input', ['type']]
2274: );
2275: foreach my $el_attributes (@with_yesno) {
2276: my $el_name = $el_attributes->[0];
2277: my @elements = $root->getElementsByTagName($el_name);
2278: foreach my $element (@elements) {
2279: my $att_list = $el_attributes->[1];
2280: foreach my $att_name (@$att_list) {
2281: my $att_value = $element->getAttribute($att_name);
2282: if (!defined $att_value) {
2283: next;
2284: }
2285: if ($att_value eq 'yes' || $att_value eq 'no') {
2286: next;
2287: }
2288: if ($att_value =~ /\s*yes\s*/i) {
2289: $element->setAttribute($att_name, 'yes');
2290: } elsif ($att_value =~ /\s*no\s*/i) {
2291: $element->setAttribute($att_name, 'no');
2292: }
2293: }
2294: }
2295: }
2296: }
2297:
2298: # fixes spelling mistakes for numericalresponse/@unit
2299: sub replace_numericalresponse_unit_attribute {
2300: my ($root) = @_;
2301: my @numericalresponses = $root->getElementsByTagName('numericalresponse');
2302: foreach my $numericalresponse (@numericalresponses) {
2303: if (defined $numericalresponse->getAttribute('units') && !defined $numericalresponse->getAttribute('unit')) {
2304: $numericalresponse->setAttribute('unit', $numericalresponse->getAttribute('units'));
2305: $numericalresponse->removeAttribute('units');
2306: }
2307: }
2308:
2309: }
2310:
2311: # Replaces &format and &prettyprint by <num> whenever possible.
2312: # Also replaces &chemparse by <chem>.
2313: # If the function call is enclosed in <display>, the <display> element is removed.
2314: sub replace_functions_by_elements {
2315: my ($root) = @_;
2316: my $doc = $root->ownerDocument;
2317: my @preserve = ('script','answer','parse','m','tm','dtm','numericalhintscript'); # display is handled later
2318: my @all = $root->getElementsByTagName('*');
2319: foreach my $element (@all) {
2320: if (string_in_array(\@preserve, $element->nodeName)) {
2321: next;
2322: }
2323: my $changed = 0;
2324: my $next;
2325: for (my $child=$element->firstChild; defined $child; $child=$next) {
2326: $next = $child->nextSibling;
2327: if ($child->nodeType == XML_TEXT_NODE) {
2328: my $value = $child->nodeValue;
2329: if ($value =~ /^(.*)&(?:format|prettyprint)\((\$\{?[a-zA-Z0-9]*\}?(?:\[[^\]]*\])?|[0-9.]+)\s?,\s?(["'][,.\$]?[0-9][eEfFgGsS]["']|\$[a-zA-Z0-9]*)\)(.*)$/s) {
2330: # NOTE: we don't check for &prettyprint's 3rd argument (target), but it has not been seen outside of script elements.
2331: # NOTE: the format options ',' and '$' are not supported by &format in current LON-CAPA since rev 1.81 of default_homework.lcpm,
2332: # but are supported by &prettyprint;
2333: # if we use (like current LON-CAPA) &prettyprint for <num> implementation, it will change a few resulting documents
2334: # (by making them display something they were probably intended to display, but which did not).
2335: # Usage of <num> with &prettyprint instead of &format might also change the display when there is an exponent.
2336: my $before = $1;
2337: my $number = $2;
2338: my $format = $3;
2339: my $after = $4;
2340: $format =~ s/^['"]|['"]$//g;
2341: # do not change this if the parent is <display> and there are other things before or after &format
2342: if ($element->nodeName eq 'display' && (defined $child->previousSibling || defined $next ||
2343: $before !~ /^\s*$/ || $after !~ /^\s*$/)) {
2344: last;
2345: }
2346: my $replacement = $doc->createDocumentFragment();
2347: my $num = $doc->createElement('num');
2348: $num->setAttribute('format', $format);
2349: $num->appendChild($doc->createTextNode($number));
2350: if (length($before) > 0) {
2351: $replacement->appendChild($doc->createTextNode($before));
2352: }
2353: $replacement->appendChild($num);
2354: if (length($after) > 0) {
2355: $replacement->appendChild($doc->createTextNode($after));
2356: }
2357: $element->replaceChild($replacement, $child);
2358: $changed = 1;
2359: $next = $element->firstChild; # start over, there might be another &format in the same text node
2360: } elsif ($value =~ /^(.*)&chemparse\(([^'"()]*|'[^']*'|"[^"]*")\)(.*)$/s) {
2361: my $before = $1;
2362: my $reaction = $2;
2363: my $after = $3;
2364: $reaction =~ s/^'(.*)'$/$1/;
2365: $reaction =~ s/^"(.*)"$/$1/;
2366: if ($element->nodeName eq 'display' && (defined $child->previousSibling || defined $next ||
2367: $before !~ /^\s*$/ || $after !~ /^\s*$/)) {
2368: last;
2369: }
2370: my $replacement = $doc->createDocumentFragment();
2371: my $chem = $doc->createElement('chem');
2372: $chem->appendChild($doc->createTextNode($reaction));
2373: if (length($before) > 0) {
2374: $replacement->appendChild($doc->createTextNode($before));
2375: }
2376: $replacement->appendChild($chem);
2377: if (length($after) > 0) {
2378: $replacement->appendChild($doc->createTextNode($after));
2379: }
2380: $element->replaceChild($replacement, $child);
2381: $changed = 1;
2382: $next = $element->firstChild;
2383: }
2384: }
2385: }
2386: if ($changed && $element->nodeName eq 'display') {
2387: my $first = $element->firstChild;
2388: if ($first->nodeType == XML_ELEMENT_NODE && string_in_array(['num','chem'], $first->nodeName) &&
2389: !defined $first->nextSibling) {
2390: # remove useless display element
2391: replace_by_children($element);
2392: }
2393: }
2394: }
2395: }
2396:
2397: # pretty-print using im-memory DOM tree
2398: sub pretty {
2399: my ($node, $all_block, $indent_level) = @_;
2400: my $doc = $node->ownerDocument;
2401: $indent_level ||= 0;
2402: my $type = $node->nodeType;
2403: if ($type == XML_ELEMENT_NODE) {
2404: my $name = $node->nodeName;
2405: if ((string_in_array($all_block, $name) || string_in_array(\@inline_like_block, $name)) &&
2406: !string_in_array(\@preserve_elements, $name)) {
2407: # make sure there is a newline at the beginning and at the end if there is anything inside
2408: if (defined $node->firstChild && !string_in_array(\@no_newline_inside, $name)) {
2409: my $first = $node->firstChild;
2410: if ($first->nodeType == XML_TEXT_NODE) {
2411: my $text = $first->nodeValue;
2412: if ($text !~ /^ *\n/) {
2413: $first->setData("\n" . $text);
2414: }
2415: } else {
2416: $node->insertBefore($doc->createTextNode("\n"), $first);
2417: }
2418: my $last = $node->lastChild;
2419: if ($last->nodeType == XML_TEXT_NODE) {
2420: my $text = $last->nodeValue;
2421: if ($text !~ /\n *$/) {
2422: $last->setData($text . "\n");
2423: }
2424: } else {
2425: $node->appendChild($doc->createTextNode("\n"));
2426: }
2427: }
2428:
2429: # indent and make sure there is a newline before and after a block element
2430: my $newline_indent = "\n".(' ' x (2*($indent_level + 1)));
2431: my $newline_indent_last = "\n".(' ' x (2*$indent_level));
2432: my $next;
2433: for (my $child=$node->firstChild; defined $child; $child=$next) {
2434: $next = $child->nextSibling;
2435: if ($child->nodeType == XML_ELEMENT_NODE) {
2436: if (string_in_array($all_block, $child->nodeName) || string_in_array(\@inline_like_block, $child->nodeName)) {
2437: # make sure there is a newline before and after a block element
2438: if (defined $child->previousSibling && $child->previousSibling->nodeType == XML_TEXT_NODE) {
2439: my $prev = $child->previousSibling;
2440: my $text = $prev->nodeValue;
2441: if ($text !~ /\n *$/) {
2442: $prev->setData($text . $newline_indent);
2443: }
2444: } else {
2445: $node->insertBefore($doc->createTextNode($newline_indent), $child);
2446: }
2447: if (defined $next && $next->nodeType == XML_TEXT_NODE) {
2448: my $text = $next->nodeValue;
2449: if ($text !~ /^ *\n/) {
2450: $next->setData($newline_indent . $text);
2451: }
2452: } else {
2453: $node->insertAfter($doc->createTextNode($newline_indent), $child);
2454: }
2455: }
2456: pretty($child, $all_block, $indent_level+1);
2457: } elsif ($child->nodeType == XML_TEXT_NODE) {
2458: my $text = $child->nodeValue;
2459: # collapse newlines
2460: $text =~ s/\n([\t ]*\n)+/\n/g;
2461: # indent and remove spaces and tabs before newlines
2462: if (defined $next) {
2463: $text =~ s/[\t ]*\n[\t ]*/$newline_indent/ge;
2464: } else {
2465: $text =~ s/[\t ]*\n[\t ]*/$newline_indent/ge;
2466: $text =~ s/[\t ]*\n[\t ]*$/$newline_indent_last/e;
2467: }
2468: $child->setData($text);
2469: }
2470: }
2471:
2472: # removes whitespace at the beginning and end of p td, th and li (except for nbsp at the beginning)
2473: my @to_trim = ('p','td','th','li');
2474: if (string_in_array(\@to_trim, $name) && defined $node->firstChild && $node->firstChild->nodeType == XML_TEXT_NODE) {
2475: my $text = $node->firstChild->nodeValue;
2476: $text =~ s/^[ \t\f\n\r]*//;
2477: if ($text eq '') {
2478: $node->removeChild($node->firstChild);
2479: } else {
2480: $node->firstChild->setData($text);
2481: }
2482: }
2483: if (string_in_array(\@to_trim, $name) && defined $node->lastChild && $node->lastChild->nodeType == XML_TEXT_NODE) {
2484: my $text = $node->lastChild->nodeValue;
2485: $text =~ s/\s*$//;
2486: if ($text eq '') {
2487: $node->removeChild($node->lastChild);
2488: } else {
2489: $node->lastChild->setData($text);
2490: }
2491: }
2492: } elsif (string_in_array(\@preserve_elements, $name)) {
2493: # collapse newlines at the beginning and the end of scripts
2494: if (defined $node->firstChild && $node->firstChild->nodeType == XML_TEXT_NODE) {
2495: my $text = $node->firstChild->nodeValue;
2496: $text =~ s/^\n( *\n)+/\n/;
2497: if ($text eq '') {
2498: $node->removeChild($node->firstChild);
2499: } else {
2500: $node->firstChild->setData($text);
2501: }
2502: }
2503: if (defined $node->lastChild && $node->lastChild->nodeType == XML_TEXT_NODE) {
2504: my $text = $node->lastChild->nodeValue;
2505: $text =~ s/\n( *\n)+$/\n/;
2506: if ($text eq '') {
2507: $node->removeChild($node->lastChild);
2508: } else {
2509: $node->lastChild->setData($text);
2510: }
2511: }
2512: }
2513: }
2514: }
2515:
2516: sub replace_tm_dtm {
2517: my ($root) = @_;
2518: my $doc = $root->ownerDocument;
2519: my @elements = $root->getElementsByTagName('tm');
2520: push(@elements, $root->getElementsByTagName('dtm'));
2521: foreach my $element (@elements) {
2522: my $first = $element->firstChild;
2523: if (defined $first && $first->nodeType == XML_TEXT_NODE) {
2524: my $text = $first->nodeValue;
2525: if ($element->nodeName eq 'tm') {
2526: $first->setData('$'.$text.'$');
2527: } else {
2528: $first->setData('$$'.$text.'$$');
2529: }
2530: }
2531: $element->setNodeName('m');
2532: }
2533: }
2534:
2535:
2536: ######## utilities ########
2537:
2538: ##
2539: # Trims a string (really, this should be built-in in Perl, this is ridiculous, ugly and slow)
2540: # @param {string} s - the string to trim
2541: # @returns the trimmed string
2542: ##
2543: sub trim {
2544: my ($s) = @_;
2545: $s =~ s/^\s+//;
2546: $s =~ s/\s+$//;
2547: return($s);
2548: }
2549:
2550: ##
2551: # Tests if a string is in an array (using eq) (to avoid Smartmatch warnings with $value ~~ @array)
2552: # @param {Array<string>} array - reference to the array of strings
2553: # @param {string} value - the string to look for
2554: # @returns 1 if found, 0 otherwise
2555: ##
2556: sub string_in_array {
2557: my ($array, $value) = @_;
2558: # NOTE: would it be faster to use "any { $value eq $_ } @{$array}" from List::Util ?
2559: foreach my $v (@{$array}) {
2560: if ($v eq $value) {
2561: return 1;
2562: }
2563: }
2564: return 0;
2565: }
2566:
2567: ##
2568: # Tests if an object is in an array (using ==)
2569: # @param {Array<Object>} array - reference to the array of references
2570: # @param {Object} ref - the reference to look for
2571: # @returns 1 if found, 0 otherwise
2572: ##
2573: sub reference_in_array {
2574: my ($array, $ref) = @_;
2575: foreach my $v (@{$array}) {
2576: if ($v == $ref) {
2577: return 1;
2578: }
2579: }
2580: return 0;
2581: }
2582:
2583: ##
2584: # returns the index of a string in an array
2585: # @param {Array<Object>} array - reference to the array of strings
2586: # @param {string} s - the string to look for (using eq)
2587: # @returns the index if found, -1 otherwise
2588: ##
2589: sub index_of_string {
2590: my ($array, $s) = @_;
2591: for (my $i=0; $i<scalar(@{$array}); $i++) {
2592: if ($array->[$i] eq $s) {
2593: return $i;
2594: }
2595: }
2596: return -1;
2597: }
2598:
2599: ##
2600: # returns the index of a reference in an array
2601: # @param {Array<Object>} array - reference to the array of references
2602: # @param {Object} ref - the reference to look for
2603: # @returns the index if found, -1 otherwise
2604: ##
2605: sub index_of_reference {
2606: my ($array, $ref) = @_;
2607: for (my $i=0; $i<scalar(@{$array}); $i++) {
2608: if ($array->[$i] == $ref) {
2609: return $i;
2610: }
2611: }
2612: return -1;
2613: }
2614:
2615: ##
2616: # if found, removes a string from an array, otherwise do nothing
2617: # @param {Array<string>} array - reference to the array of string
2618: # @param {string} s - the string to look for (using eq)
2619: ##
2620: sub remove_string_from_array {
2621: my ($array, $s) = @_;
2622: my $index = index_of_string($array, $s);
2623: if ($index != -1) {
2624: splice(@$array, $index, 1);
2625: }
2626: }
2627:
2628: ##
2629: # if found, removes a reference from an array, otherwise do nothing
2630: # @param {Array<Object>} array - reference to the array of references
2631: # @param {Object} ref - the reference to look for
2632: ##
2633: sub remove_reference_from_array {
2634: my ($array, $ref) = @_;
2635: my $index = index_of_reference($array, $ref);
2636: if ($index != -1) {
2637: splice(@$array, $index, 1);
2638: }
2639: }
2640:
2641: ##
2642: # replaces a node by its children
2643: # @param {Node} node - the DOM node
2644: ##
2645: sub replace_by_children {
2646: my ($node) = @_;
2647: my $parent = $node->parentNode;
2648: my $next;
2649: my $previous;
2650: for (my $child=$node->firstChild; defined $child; $child=$next) {
2651: $next = $child->nextSibling;
2652: if ((!defined $previous || !defined $next) &&
2653: $child->nodeType == XML_TEXT_NODE && $child->nodeValue =~ /^\s*$/) {
2654: next; # do not keep first and last whitespace nodes
2655: } else {
2656: if (!defined $previous && $child->nodeType == XML_TEXT_NODE) {
2657: # remove whitespace at the beginning
2658: my $value = $child->nodeValue;
2659: $value =~ s/^\s+//;
2660: $child->setData($value);
2661: }
2662: if (!defined $next && $child->nodeType == XML_TEXT_NODE) {
2663: # and at the end
2664: my $value = $child->nodeValue;
2665: $value =~ s/\s+$//;
2666: $child->setData($value);
2667: }
2668: }
2669: $node->removeChild($child);
2670: $parent->insertBefore($child, $node);
2671: $previous = $child;
2672: }
2673: $parent->removeChild($node);
2674: }
2675:
2676: ##
2677: # returns the trimmed attribute value if the attribute exists and is not blank, undef otherwise
2678: # @param {Node} node - the DOM node
2679: # @param {string} attribute_name - the attribute name
2680: ##
2681: sub get_non_empty_attribute {
2682: my ($node, $attribute_name) = @_;
2683: my $value = $node->getAttribute($attribute_name);
2684: if (defined $value && $value !~ /^\s*$/) {
2685: $value = trim($value);
2686: return($value);
2687: }
2688: return(undef);
2689: }
2690:
2691: ##
2692: # Returns a CSS property value from the style attribute of the element, or undef if not defined
2693: # @param {Element} el - the DOM element
2694: # @param {string} property_name - the CSS property name
2695: ##
2696: sub get_css_property {
2697: my ($el, $property_name) = @_;
2698: my $style = $el->getAttribute('style');
2699: if (defined $style) {
2700: $style =~ s/^\s*;\s*//;
2701: $style =~ s/\s*;\s*$//;
2702: } else {
2703: $style = '';
2704: }
2705: my @pairs = split(';', $style);
2706: foreach my $pair (@pairs) {
2707: my @name_value = split(':', $pair);
2708: if (scalar(@name_value) != 2) {
2709: next;
2710: }
2711: my $name = trim($name_value[0]);
2712: my $value = trim($name_value[1]);
2713: if (lc($name) eq $property_name) {
2714: return($value); # return the first one found
2715: }
2716: }
2717: return(undef);
2718: }
2719:
2720: ##
2721: # Returns the reference to a hash CSS property name => value from the style attribute of the element.
2722: # Returns an empty list if the style attribute is not defined,
2723: # @param {Element} el - the DOM element
2724: # @return {Hash<string, string>} reference to the hash property name => property value
2725: ##
2726: sub get_css_properties {
2727: my ($el) = @_;
2728: my $style = $el->getAttribute('style');
2729: if (defined $style) {
2730: $style =~ s/^\s*;\s*//;
2731: $style =~ s/\s*;\s*$//;
2732: } else {
2733: $style = '';
2734: }
2735: my @pairs = split(';', $style);
2736: tie (my %hash, 'Tie::IxHash', ());
2737: foreach my $pair (@pairs) {
2738: my @name_value = split(':', $pair);
2739: if (scalar(@name_value) != 2) {
2740: next;
2741: }
2742: my $name = trim($name_value[0]);
2743: my $value = trim($name_value[1]);
2744: if (defined $hash{$name}) {
2745: # duplicate property in the style attribute: keep only the last one
2746: delete $hash{$name};
2747: }
2748: $hash{$name} = $value;
2749: }
2750: return(\%hash);
2751: }
2752:
2753: ##
2754: # Sets a CSS property in the style attribute of an element
2755: # @param {Element} el - the DOM element
2756: # @param {string} property_name - the CSS property name
2757: # @param {string} property_value - the CSS property value
2758: ##
2759: sub set_css_property {
2760: my ($el, $property_name, $property_value) = @_;
2761: my $hash_ref = { $property_name => $property_value };
2762: set_css_properties($el, $hash_ref);
2763: }
2764:
2765: ##
2766: # Sets several CSS properties in the style attribute of an element
2767: # @param {Element} el - the DOM element
2768: # @param {Hash<string, string>} properties - reference to the hash property name => property value
2769: ##
2770: sub set_css_properties {
2771: my ($el, $properties) = @_;
2772: my $hash = get_css_properties($el);
2773: foreach my $property_name (keys %$properties) {
2774: my $property_value = $properties->{$property_name};
2775: if (defined $hash->{$property_name}) {
2776: delete $hash->{$property_name}; # to add the new one at the end
2777: }
2778: $hash->{$property_name} = $property_value;
2779: }
2780: my $style = '';
2781: foreach my $key (keys %$hash) {
2782: $style .= $key.':'.$hash->{$key}.'; ';
2783: }
2784: $style =~ s/; $//;
2785: $el->setAttribute('style', $style);
2786: }
2787:
2788: 1;
2789: __END__
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>