Annotation of modules/damieng/clean_xml/post_xml.pm, revision 1.4
1.1 damieng 1: #!/usr/bin/perl
2:
3: package post_xml;
4:
5: use strict;
6: use utf8;
7: use warnings;
8:
9: use File::Basename;
10: use File::Temp qw/ tempfile /;
11: use Cwd 'abs_path';
12: use XML::LibXML;
13: use HTML::TokeParser; # used to parse sty files
14: use Tie::IxHash; # for ordered hashes
15:
16: use Env qw(RES_DIR); # path of res directory parent (without the / at the end)
17:
18: no warnings 'recursion'; # yes, fix_paragraph is using heavy recursion, I know
19:
20: # these are constants
21: my @block_elements = ('parameter','location','answer','foil','image','polygon','rectangle','text','conceptgroup','itemgroup','item','label','data','function','array','unit','answergroup','functionplotresponse','functionplotruleset','functionplotelements','functionplotcustomrule','essayresponse','hintpart','formulahint','numericalhint','reactionhint','organichint','optionhint','radiobuttonhint','stringhint','customhint','mathhint','formulahintcondition','numericalhintcondition','reactionhintcondition','organichintcondition','optionhintcondition','radiobuttonhintcondition','stringhintcondition','customhintcondition','mathhintcondition','imageresponse','foilgroup','datasubmission','textfield','hiddensubmission','radiobuttonresponse','rankresponse','matchresponse','import','style','script','window','block','library','notsolved','part','postanswerdate','preduedate','problem','problemtype','randomlabel','bgimg','labelgroup','randomlist','solved','while','tex','print','web','gnuplot','curve','Task','IntroParagraph','ClosingParagraph','Question','QuestionText','Setup','Instance','InstanceText','Criteria','CriteriaText','GraderNote','languageblock','translated','lang','instructorcomment','dataresponse','togglebox','standalone','comment','drawimage','allow','displayduedate','displaytitle','responseparam','organicstructure','scriptlib','parserlib','drawoptionlist','spline','backgroundplot','plotobject','plotvector','drawvectorsum','functionplotrule','functionplotvectorrule','functionplotvectorsumrule','axis','key','xtics','ytics','title','xlabel','ylabel','hiddenline','dtm');
22: my @inline_like_block = ('stringresponse','optionresponse','numericalresponse','formularesponse','mathresponse','organicresponse','reactionresponse','customresponse','externalresponse', 'hint', 'hintgroup'); # inline elements treated like blocks for pretty print and some other things
23: my @responses = ('stringresponse','optionresponse','numericalresponse','formularesponse','mathresponse','organicresponse','reactionresponse','customresponse','externalresponse','essayresponse','radiobuttonresponse','matchresponse','rankresponse','imageresponse','functionplotresponse');
24: my @block_html = ('html','head','body','section','h1','h2','h3','h4','h5','h6','div','p','ul','ol','li','table','tbody','tr','td','th','dl','dt','dd','pre','noscript','hr','address','blockquote','object','applet','embed','map','form','fieldset','iframe','center','frameset');
25: my @no_newline_inside = ('import','parserlib','scriptlib','data','function','label','xlabel','ylabel','tic','text','rectangle','image','title','h1','h2','h3','h4','h5','h6','li','td','p');
26: my @preserve_elements = ('script','answer','pre');
27: my @accepting_style = ('section','h1','h2','h3','h4','h5','h6','div','p','li','td','th','dt','dd','pre','blockquote');
28: my @latex_math = ('\alpha', '\theta', '\omicron', '\tau', '\beta', '\vartheta', '\pi', '\upsilon', '\gamma', '\gamma', '\varpi', '\phi', '\delta', '\kappa', '\rho', '\varphi', '\epsilon', '\lambda', '\varrho', '\chi', '\varepsilon', '\mu', '\sigma', '\psi', '\zeta', '\nu', '\varsigma', '\omega', '\eta', '\xi',
29: '\Gamma', '\Lambda', '\Sigma', '\Psi', '\Delta', '\Xi', '\Upsilon', '\Omega', '\Theta', '\Pi', '\Phi',
30: '\pm', '\cap', '\diamond', '\oplus', '\mp', '\cup', '\bigtriangleup', '\ominus', '\times', '\uplus', '\bigtriangledown', '\otimes', '\div', '\sqcap', '\triangleleft', '\oslash', '\ast', '\sqcup', '\triangleright', '\odot', '\star', '\vee', '\lhd$', '\bigcirc', '\circ', '\wedge', '\rhd$', '\dagger', '\bullet', '\setminus', '\unlhd$', '\ddagger', '\cdot', '\wr', '\unrhd$', '\amalg', '+', '-',
31: '\leq', '\geq', '\equiv', '\models', '\prec', '\succ', '\sim', '\perp', '\preceq', '\succeq', '\simeq', '\mid', '\ll', '\gg', '\asymp', '\parallel', '\subset', '\supset', '\approx', '\bowtie', '\subseteq', '\supseteq', '\cong', '\Join$', '\sqsubset$', '\sqsupset$', '\neq', '\smile', '\sqsubseteq', '\sqsupseteq', '\doteq', '\frown', '\in', '\ni', '\propto', '\vdash', '\dashv',
32: '\colon', '\ldotp', '\cdotp',
33: '\leftarrow', '\longleftarrow', '\uparrow', '\Leftarrow', '\Longleftarrow', '\Uparrow', '\rightarrow', '\longrightarrow', '\downarrow', '\Rightarrow', '\Longrightarrow', '\Downarrow', '\leftrightarrow', '\longleftrightarrow', '\updownarrow', '\Leftrightarrow', '\Longleftrightarrow', '\Updownarrow', '\mapsto', '\longmapsto', '\nearrow', '\hookleftarrow', '\hookrightarrow', '\searrow', '\leftharpoonup', '\rightharpoonup', '\swarrow', '\leftharpoondown', '\rightharpoondown', '\nwarrow', '\rightleftharpoons', '\leadsto$',
34: '\ldots', '\cdots', '\vdots', '\ddots', '\aleph', '\prime', '\forall', '\infty', '\hbar', '\emptyset', '\exists', '\Box$', '\imath', '\nabla', '\neg', '\Diamond$', '\jmath', '\surd', '\flat', '\triangle', '\ell', '\top', '\natural', '\clubsuit', '\wp', '\bot', '\sharp', '\diamondsuit', '\Re', '\|', '\backslash', '\heartsuit', '\Im', '\angle', '\partial', '\spadesuit', '\mho$',
35: '\sum', '\bigcap', '\bigodot', '\prod', '\bigcup', '\bigotimes', '\coprod', '\bigsqcup', '\bigoplus', '\int', '\bigvee', '\biguplus', '\oint', '\bigwedge',
36: '\arccos', '\cos', '\csc', '\exp', '\ker', '\limsup', '\min', '\sinh', '\arcsin', '\cosh', '\deg', '\gcd', '\lg', '\ln', '\Pr', '\sup', '\arctan', '\cot', '\det', '\hom', '\lim', '\log', '\sec', '\tan', '\arg', '\coth', '\dim', '\inf', '\liminf', '\max', '\sin', '\tanh',
37: '\uparrow', '\Uparrow', '\downarrow', '\Downarrow', '\updownarrow', '\Updownarrow', '\lfloor', '\rfloor', '\lceil', '\rceil', '\langle', '\rangle', '\backslash',
38: '\rmoustache', '\lmoustache', '\rgroup', '\lgroup', '\arrowvert', '\Arrowvert', '\bracevert',
39: '\hat{', '\acute{', '\bar{', '\dot{', '\breve{', '\check{', '\grave{', '\vec{', '\ddot{', '\tilde{',
40: '\widetilde{', '\widehat{', '\overleftarrow{', '\overrightarrow{', '\overline{', '\underline{', '\overbrace{', '\underbrace{', '\sqrt{', '\sqrt[', '\frac{'
41: );
42: # list of elements that can contain style elements:
43: my @containing_styles = ('library','problem',@responses,'foil','item','text','hintgroup','hintpart','label','part','preduedate','postanswerdate','solved','notsolved','block','while','web','standalone','problemtype','languageblock','translated','lang','window','windowlink','togglebox','instructorcomment','body','section','div','p','li','dd','td','th','blockquote','object','applet','video','audio','canvas','fieldset','button',
44: 'span','strong','em','b','i','sup','sub','code','kbd','samp','tt','ins','del','var','small','big','u','font');
45: my @html_styles = ('span', 'strong', 'em' , 'b', 'i', 'sup', 'sub', 'tt', 'var', 'small', 'big', 'u');
46:
47:
48: # Parses the XML document and fixes many things to turn it into a LON-CAPA 3 document
49: # Returns the text of the document.
50: sub post_xml {
51: my ($textref, $new_path) = @_;
52:
53: my $dom_doc = XML::LibXML->load_xml(string => $textref);
54:
55: my $root = fix_structure($dom_doc);
56:
57: remove_elements($root, ['startouttext','startoutext','startottext','startouttex','startouttect','atartouttext','starouttext','starttextout','starttext','starttextarea','endouttext','endoutext','endoutttext','endouttxt','endouutext','ednouttext','endouttex','endoouttext','endouttest','endtextout','endtextarea','startpartmarker','endpartmarker','basefont','x-claris-tagview','x-claris-window','x-sas-window']);
58:
59: remove_empty_attributes($root);
60:
61: fix_attribute_case($root);
62:
63: my $fix_by_hand = replace_m($root);
64:
65: my @all_block = (@block_elements, @block_html);
66: add_sty_blocks($new_path, $root, \@all_block); # must come before the subs using @all_block
67:
68: fix_block_styles($root, \@all_block);
69: $root->normalize();
70:
71: fix_fonts($root, \@all_block);
72:
73: replace_u($root);
74:
75: remove_bad_cdata_sections($root);
76:
77: add_cdata_sections($root);
78:
79: fix_style_element($root);
80:
81: fix_tables($root);
82:
83: fix_lists($root);
84:
85: fix_wrong_name_for_img($root); # should be before replace_deprecated_attributes_by_css
86:
87: replace_deprecated_attributes_by_css($root);
88:
89: replace_center($root, \@all_block); # must come after replace_deprecated_attributes_by_css
90:
91: replace_nobr($root);
92:
93: remove_useless_notsolved($root);
94:
95: fix_paragraphs_inside($root, \@all_block);
96:
97: remove_empty_style($root);
98:
99: fix_empty_lc_elements($root);
100:
101: lowercase_attribute_values($root);
102:
103: replace_numericalresponse_unit_attribute($root);
104:
105: replace_functions_by_elements($root);
106:
107: pretty($root, \@all_block);
108:
109: replace_tm_dtm($root);
110:
111: open my $out, '>', $new_path;
112: print $out $dom_doc->toString(); # byte string !
113: close $out;
114:
115: if ($fix_by_hand) {
116: die "The file has been converted but it should be fixed by hand.";
117: }
118: }
119:
120: sub fix_structure {
121: my ($doc) = @_;
122: # the 'loncapa' root element has already been added in pre_xml
123: my $root = $doc->documentElement;
124: # inside the root, replace html, problem and library elements by their content
125: my @toreplace = ('html','problem','library');
126: foreach my $name (@toreplace) {
127: my @elements = $root->getElementsByTagName($name);
128: foreach my $element (@elements) {
129: replace_by_children($element);
130: }
131: }
132: # insert all link and style elements inside a new head element
133: my $current_node = undef;
134: my @heads = $doc->getElementsByTagName('head');
135: my @links = $doc->getElementsByTagName('link');
136: my @styles = $doc->getElementsByTagName('style');
137: my @titles = $doc->getElementsByTagName('title');
138: if (scalar(@titles) > 0) {
139: # NOTE: there is a title element in gnuplot, not to be confused with the one inside HTML head
140: for (my $i=0; $i<scalar(@titles); $i++) {
141: my $title = $titles[$i];
142: my $found_gnuplot = 0;
143: my $ancestor = $title->parentNode;
144: while (defined $ancestor) {
145: if ($ancestor->nodeName eq 'gnuplot') {
146: $found_gnuplot = 1;
147: last;
148: }
149: $ancestor = $ancestor->parentNode;
150: }
151: if ($found_gnuplot) {
152: splice(@titles, $i, 1);
153: $i--;
154: }
155: }
156: }
157: if (scalar(@heads) > 0 || scalar(@titles) > 0 || scalar(@links) > 0 || scalar(@styles) > 0) {
158: my $htmlhead = $doc->createElement('head');
159: foreach my $head (@heads) {
160: my $next;
161: for (my $child=$head->firstChild; defined $child; $child=$next) {
162: $next = $child->nextSibling;
163: $head->removeChild($child);
164: if ($child->nodeType != XML_ELEMENT_NODE ||
165: string_in_array(['title','script','style','meta','link','import','base'], $child->nodeName)) {
166: $htmlhead->appendChild($child);
167: } else {
168: # this should not be in head
169: insert_after_or_first($root, $child, $current_node);
170: }
171: }
172: $head->parentNode->removeChild($head);
173: }
174: foreach my $child (@titles, @links, @styles) {
175: $child->parentNode->removeChild($child);
176: $htmlhead->appendChild($child);
177: }
178: insert_after_or_first($root, $htmlhead, $current_node);
179: $current_node = $htmlhead;
180: }
181: # body
182: my $htmlbody = undef;
183: my @bodies = $doc->getElementsByTagName('body');
184: if (scalar(@bodies) > 0) {
185: # TODO: fix content and position of body elements
186: if ($root->nodeName eq 'problem') {
187: foreach my $body (@bodies) {
188: replace_by_children($body);
189: }
190: }
191: }
192: # add all the meta elements afterwards when they are LON-CAPA meta. Remove all HTML meta.
193: my @meta_names = ('abstract','author','authorspace','avetries','avetries_list','clear','comefrom','comefrom_list','copyright','correct','count','course','course_list','courserestricted','creationdate','dependencies','depth','difficulty','difficulty_list','disc','disc_list','domain','end','field','firstname','generation','goto','goto_list','groupname','helpful','highestgradelevel','hostname','id','keynum','keywords','language','lastname','lastrevisiondate','lowestgradelevel','middlename','mime','modifyinguser','notes','owner','permanentemail','scope','sequsage','sequsage_list','standards','start','stdno','stdno_list','subject','technical','title','url','username','value','version');
194: my @metas = $doc->getElementsByTagName('meta');
195: foreach my $meta (@metas) {
196: $meta->parentNode->removeChild($meta);
197: my $name = $meta->getAttribute('name');
198: my $content = $meta->getAttribute('content');
199: if (defined $name && defined $content && string_in_array(\@meta_names, lc($name))) {
200: my $lcmeta = $doc->createElement('meta');
201: $lcmeta->setAttribute('name', lc($name));
202: $lcmeta->setAttribute('content', $content);
203: insert_after_or_first($root, $lcmeta, $current_node);
204: $current_node = $lcmeta;
205: }
206: }
207: return($root);
208: }
209:
210: # insert the new child under parent after the reference child, or as the first child if the reference child is not defined
211: sub insert_after_or_first {
212: my ($parent, $newchild, $refchild) = @_;
213: if (defined $refchild) {
214: $parent->insertAfter($newchild, $refchild);
215: } elsif (defined $parent->firstChild) {
216: $parent->insertBefore($newchild, $parent->firstChild);
217: } else {
218: $parent->appendChild($newchild);
219: }
220: }
221:
222: # removes all elements with given names inside the node, but keep the content
223: sub remove_elements {
224: my ($node, $to_remove) = @_;
225: my $nextChild;
226: for (my $child=$node->firstChild; defined $child; $child=$nextChild) {
227: $nextChild = $child->nextSibling;
228: my $type = $node->nodeType;
229: if ($type == XML_ELEMENT_NODE) {
230: if (string_in_array($to_remove, $child->nodeName)) {
231: my $first_non_white = $child->firstChild;
232: if (defined $first_non_white && $first_non_white->nodeType == XML_TEXT_NODE &&
233: $first_non_white->nodeValue =~ /^\s*$/) {
234: $first_non_white = $first_non_white->nextSibling;
235: }
236: if (defined $first_non_white) {
237: $nextChild = $first_non_white;
238: replace_by_children($child);
239: } else {
240: $node->removeChild($child);
241: }
242: } else {
243: remove_elements($child, $to_remove);
244: }
245: }
246: }
247: }
248:
249: # removes some attributes that have an invalid empty value
250: sub remove_empty_attributes {
251: my ($root) = @_;
252: my $doc = $root->ownerDocument;
253: # this list is based on validation errors in the MSU subset (it could be more complete if it was based on the schema)
254: my @attributes = (
255: ['curve', ['pointsize']],
256: ['foil', ['location']],
257: ['foilgroup', ['checkboxoptions', 'options', 'texoptions']],
258: ['gnuplot', ['pattern', 'texwidth']],
259: ['img', ['height', 'texheight', 'texwidth', 'texwrap', 'width']],
260: ['import', ['importmode']],
261: ['optionresponse', ['max']],
262: ['organicstructure', ['options']],
263: ['radiobuttonresponse', ['max']],
264: ['randomlabel', ['height', 'texwidth', 'width']],
265: ['stringresponse', ['type']],
266: ['textline', ['size']],
267: );
268: foreach my $element_attributes (@attributes) {
269: my $element_name = $element_attributes->[0];
270: my $attribute_names = $element_attributes->[1];
271: my @elements = $doc->getElementsByTagName($element_name);
272: foreach my $element (@elements) {
273: foreach my $attribute_name (@$attribute_names) {
274: my $value = $element->getAttribute($attribute_name);
275: if (defined $value && $value =~ /^\s*$/) {
276: $element->removeAttribute($attribute_name);
277: }
278: }
279: }
280: }
281: }
282:
283: # fixes the case for a few attributes that are not all lowercase
284: # (the HTML parser used in html_to_xml turns everything lowercase, which is a good thing in general)
285: sub fix_attribute_case {
286: my ($root) = @_;
287: my $doc = $root->ownerDocument;
288: my @attributes = (
289: ['labelgroup', ['TeXsize']],
290: ['h1', ['TeXsize']],
291: ['h2', ['TeXsize']],
292: ['h3', ['TeXsize']],
293: ['h4', ['TeXsize']],
294: ['h5', ['TeXsize']],
295: ['h6', ['TeXsize']],
296: # font and basefont have a TeXsize but will be removed
297: ['optionresponse', ['TeXlayout']],
298: ['itemgroup', ['TeXitemgroupwidth']],
299: ['Task', ['OptionalRequired']],
300: ['Question', ['OptionalRequired','Mandatory']],
301: ['Instance', ['OptionalRequired','Disabled']],
302: ['Criteria', ['Mandatory']],
303: ['table', ['TeXwidth','TeXtheme']],
304: ['td', ['TeXwidth']],
305: ['th', ['TeXwidth']],
306: ['img', ['TeXwidth','TeXheight','TeXwrap']],
307: );
308: foreach my $element_attributes (@attributes) {
309: my $element_name = $element_attributes->[0];
310: my $attribute_names = $element_attributes->[1];
311: my @elements = $doc->getElementsByTagName($element_name);
312: foreach my $element (@elements) {
313: foreach my $attribute_name (@$attribute_names) {
314: my $value = $element->getAttribute(lc($attribute_name));
315: if (defined $value) {
316: $element->removeAttribute(lc($attribute_name));
317: $element->setAttribute($attribute_name, $value);
318: }
319: }
320: }
321: }
322: }
323:
324: # Replaces m by HTML, tm and/or dtm (which will be replaced by <m> later, but they are useful
325: # to know if the element is a block element or not).
326: # m might contain non-math LaTeX, while tm and dtm may only contain math.
327: # Returns 1 if the file should be fixed by hand, 0 otherwise.
328: sub replace_m {
329: my ($root) = @_;
330: my $doc = $root->ownerDocument;
331: my $fix_by_hand = 0;
332: # search for variable declarations
333: my @variables = ();
334: my @scripts = $root->getElementsByTagName('script');
335: foreach my $script (@scripts) {
336: my $type = $script->getAttribute('type');
337: if (defined $type && $type eq 'loncapa/perl') {
338: if (defined $script->firstChild && $script->firstChild->nodeType == XML_TEXT_NODE) {
339: my $text = $script->firstChild->nodeValue;
340: # NOTE: we are not interested in replacing "@value", only "$value"
341: # this regexp is for " $a = ..." and " $a[...] = ..."
342: while ($text =~ /^[ \t]*\$([a-zA-Z_0-9]+)(?:\[[^\]]+\])?[ \t]*=/gm) {
343: if (!string_in_array(\@variables, $1)) {
344: push(@variables, $1);
345: }
346: }
347: # this regexp is for "...; $a = ..." and "...; $a[...] = ..."
348: while ($text =~ /^[^'"\/;]+;[ \t]*\$([a-zA-Z_0-9]+)(?:\[[^\]]+\])?[ \t]*=/gm) {
349: if (!string_in_array(\@variables, $1)) {
350: push(@variables, $1);
351: }
352: }
1.2 damieng 353: # this regexp is for " @a = ..."
354: while ($text =~ /^[ \t]*\@([a-zA-Z_0-9]+)[ \t]*=/gm) {
355: if (!string_in_array(\@variables, $1)) {
356: push(@variables, $1);
357: }
358: }
1.1 damieng 359: # this regexp is for " ($a, $b, $c) = ..."
360: my @matches = ($text =~ /^[ \t]*\([ \t]*\$([a-zA-Z_0-9]+)(?:[ \t]*,[ \t]*\$([a-zA-Z_0-9]+))*[ \t]*\)[ \t]*=/gm);
361: foreach my $match (@matches) {
362: if (!defined $match) {
363: next; # not sure why it happens, but it does
364: }
365: if (!string_in_array(\@variables, $match)) {
366: push(@variables, $match);
367: }
368: }
369: # and this one is for "push @a"
370: while ($text =~ /^[ \t]*push @([a-zA-Z_0-9]+)[ \t,]*/gm) {
371: if (!string_in_array(\@variables, $1)) {
372: push(@variables, $1);
373: }
374: }
375: # use the opportunity to report usage of <m> in Perl scripts
376: if ($text =~ /^[^#].*<m[ >]/m) {
377: print "WARNING: <m> is used in a script, it should be converted by hand\n";
378: $fix_by_hand = 1;
379: }
380: }
381: }
382: }
383: my @ms = $root->getElementsByTagName('m');
384: foreach my $m (@ms) {
385: if (!defined $m->firstChild) {
386: $m->parentNode->removeChild($m);
387: next;
388: }
389: if (defined $m->firstChild->nextSibling || $m->firstChild->nodeType != XML_TEXT_NODE) {
390: print "WARNING: m value is not simple text\n";
391: $fix_by_hand = 1;
392: next;
393: }
394: my $text = $m->firstChild->nodeValue;
395: my $text_before_variable_replacement = $text;
396: my $var_key1 = 'dfhg3df54hg65hg4';
397: my $var_key2 = 'dfhg654d6f5g4h5f';
398: my $eval = defined $m->getAttribute('eval') && $m->getAttribute('eval') eq 'on';
399: if ($eval) {
400: # replace variables
401: foreach my $variable (@variables) {
402: my $replacement = $var_key1.$variable.$var_key2;
403: $text =~ s/\$$variable(?![a-zA-Z])/$replacement/ge;
404: $text =~ s/\$\{$variable\}/$replacement/ge;
405: }
406: }
407: # check if the expression is enclosed in math separators: $ $$ \( \) \[ \]
408: # if so, replace the whole node by dtm or tm
409: my $new_text;
410: my $new_node_name;
411: if ($text =~ /^\s*\$\$([^\$]*)\$\$\s*$/) {
412: $new_node_name = 'dtm';
413: $new_text = $1;
414: } elsif ($text =~ /^\s*\\\[(.*)\\\]\s*$/s) {
415: $new_node_name = 'dtm';
416: $new_text = $1;
417: } elsif ($text =~ /^\s*\$([^\$]*)\$\s*$/) {
418: $new_node_name = 'tm';
419: $new_text = $1;
420: } elsif ($text =~ /^\s*\\\((.*)\\\)\s*$/s) {
421: $new_node_name = 'tm';
422: $new_text = $1;
423: }
424: if (defined $new_node_name) {
425: if ($eval) {
426: foreach my $variable (@variables) {
427: my $replacement = $var_key1.$variable.$var_key2;
428: $new_text =~ s/$replacement([a-zA-Z])/\${$variable}$1/g;
429: $new_text =~ s/$replacement/\$$variable/g;
430: }
431: }
432: my $new_node = $doc->createElement($new_node_name);
433: if ($eval) {
434: $new_node->setAttribute('eval', 'on');
435: }
436: $new_node->appendChild($doc->createTextNode($new_text));
437: $m->parentNode->replaceChild($new_node, $m);
438: next;
439: }
440: if ($text !~ /\$|\\\(|\\\)|\\\[|\\\]/) {
441: # there are no math separators inside
442: # try to guess if this is meant as math
443: my $found_math = 0;
444: foreach my $symbol (@latex_math) {
445: if (index($text, $symbol) != -1) {
446: $found_math = 1;
447: last;
448: }
449: }
450: if ($found_math) {
451: # interpret the whole text as LaTeX inline math
452: my $new_node = $doc->createElement('tm');
453: if ($eval) {
454: $new_node->setAttribute('eval', 'on');
455: }
456: $new_node->appendChild($doc->createTextNode($text_before_variable_replacement));
457: $m->parentNode->replaceChild($new_node, $m);
458: next;
459: }
460: # no math symbol found, we will convert the text with tth
461: }
462:
463: # there are math separators inside, even after hiding variables, or there was no math symbol
464:
465: # hide math parts inside before running tth
466: my $math_key1 = '#ghjgdh5hg45gf';
467: my $math_key2 = '#';
468: my @maths = ();
469: my @separators = (['$$','$$'], ['\\(','\\)'], ['\\[','\\]'], ['$','$']);
470: foreach my $seps (@separators) {
471: my $sep1 = $seps->[0];
472: my $sep2 = $seps->[1];
473: my $pos1 = index($text, $sep1);
474: if ($pos1 == -1) {
475: next;
476: }
477: my $pos2 = index($text, $sep2, $pos1+length($sep1));
478: while ($pos1 != -1 && $pos2 != -1) {
479: my $replace = substr($text, $pos1, $pos2+length($sep2)-$pos1);
480: push(@maths, $replace);
481: my $by = $math_key1.scalar(@maths).$math_key2;
482: $text = substr($text, 0, $pos1).$by.substr($text, $pos2+length($sep2));
483: $pos1 = index($text, $sep1);
484: if ($pos1 != -1) {
485: $pos2 = index($text, $sep2, $pos1+length($sep1));
486: }
487: }
488: }
489: # get HTML as text from tth
490: my $html_text = tth($text);
491: # replace math by replacements
492: for (my $i=0; $i < scalar(@maths); $i++) {
493: my $math = $maths[$i];
494: $math =~ s/&/&/g;
495: $math =~ s/</</g;
496: $math =~ s/>/>/g;
497: if ($math =~ /^\$\$(.*)\$\$$/s) {
498: $math = '<dtm>'.$1.'</dtm>';
499: } elsif ($math =~ /^\\\[(.*)\\\]$/s) {
500: $math = '<dtm>'.$1.'</dtm>';
501: } elsif ($math =~ /^\\\((.*)\\\)$/s) {
502: $math = '<tm>'.$1.'</tm>';
503: } elsif ($math =~ /^\$(.*)\$$/s) {
504: $math = '<tm>'.$1.'</tm>';
505: }
506: my $replace = $math_key1.($i+1).$math_key2;
507: $html_text =~ s/$replace/$math/;
508: }
509: # replace variables if necessary
510: if ($eval) {
511: foreach my $variable (@variables) {
512: my $replacement = $var_key1.$variable.$var_key2;
513: $html_text =~ s/$replacement([a-zA-Z])/\${$variable}$1/g;
514: $html_text =~ s/$replacement/\$$variable/g;
515: }
516: }
517: my $fragment = html_to_dom($html_text);
518: $doc->adoptNode($fragment);
519: $m->parentNode->replaceChild($fragment, $m);
520:
521: }
522: return $fix_by_hand;
523: }
524:
525: # Returns the HTML equivalent of LaTeX input, using tth
526: sub tth {
527: my ($text) = @_;
528: my ($fh, $tmp_path) = tempfile();
529: binmode($fh, ':utf8');
530: print $fh $text;
531: close $fh;
532: my $output = `tth -r -w2 -u -y0 < $tmp_path 2>/dev/null`;
533: # hopefully the temp file will not be removed before this point (otherwise we should use unlink_on_destroy 0)
534: $output =~ s/^\s*|\s*$//;
535: $output =~ s/<div class="p"><!----><\/div>/<br\/>/; # why is tth using such ugly markup for \newline ?
536: return $output;
537: }
538:
539: # transform simple HTML into a DOM fragment (which will need to be adopted by the document)
540: sub html_to_dom {
541: my ($text) = @_;
542: $text = '<root>'.$text.'</root>';
543: my $textref = html_to_xml::html_to_xml(\$text);
544: utf8::upgrade($$textref); # otherwise the XML parser fails when the HTML parser turns into a character
545: my $dom_doc = XML::LibXML->load_xml(string => $textref);
546: my $root = $dom_doc->documentElement;
547: remove_empty_style($root);
548: my $fragment = $dom_doc->createDocumentFragment();
549: my $next;
550: for (my $n=$root->firstChild; defined $n; $n=$next) {
551: $next = $n->nextSibling;
552: $root->removeChild($n);
553: $fragment->appendChild($n);
554: }
555: return($fragment);
556: }
557:
558: # Use the linked sty files to guess which newly defined elements should be considered blocks.
559: # Also adds to @containing_styles the sty elements that contain styles.
560: # @param {string} fn - the .lc file path (we only extract the directory path from it)
561: sub add_sty_blocks {
562: my ($fn, $root, $all_block) = @_;
563: my $doc = $root->ownerDocument;
564: my @parserlibs = $doc->getElementsByTagName('parserlib');
565: my @libs = ();
566: foreach my $parserlib (@parserlibs) {
567: if (defined $parserlib->firstChild && $parserlib->firstChild->nodeType == XML_TEXT_NODE) {
568: my $value = $parserlib->firstChild->nodeValue;
569: $value =~ s/^\s+|\s+$//g;
570: if ($value ne '') {
571: push(@libs, $value);
572: }
573: }
574: }
575: my ($name, $path, $suffix) = fileparse($fn);
576: foreach my $sty (@libs) {
577: if (substr($sty, 0, 1) eq '/') {
578: $sty = $RES_DIR.$sty;
579: } else {
580: $sty = $path.$sty;
581: }
582: my $new_elements = parse_sty($sty, $all_block);
583: better_guess($root, $new_elements, $all_block);
584: my $new_blocks = $new_elements->{'block'};
585: my $new_inlines = $new_elements->{'inline'};
586: push(@$all_block, @{$new_blocks});
587: #push(@inlines, @{$new_inlines}); # we are not using a list of inline elements at this point
588: }
589: }
590:
591: ##
592: # Parses a sty file and returns lists of block and inline elements.
593: # @param {string} fn - the file path
594: ##
595: sub parse_sty {
596: my ($fn, $all_block) = @_;
597: my @blocks = ();
598: my @inlines = ();
599: my $p = HTML::TokeParser->new($fn);
600: if (! $p) {
601: die "post_xml.pl: parse_sty: Error reading $fn\n";
602: }
603: $p->empty_element_tags(1);
604: my $in_definetag = 0;
605: my $in_render = 0;
606: my %newtags = ();
607: my $newtag = '';
608: my $is_block = 0;
609: while (my $token = $p->get_token) {
610: if ($token->[0] eq 'S') {
611: my $tag = lc($token->[1]);
612: if ($tag eq 'definetag') {
613: $in_definetag = 1;
614: $is_block = 0;
615: my $attributes = $token->[2];
616: $newtag = $attributes->{'name'};
617: if (substr($newtag, 0, 1) eq '/') {
618: $newtag = substr($newtag, 1);
619: }
620: } elsif ($in_definetag && $tag eq 'render') {
621: $in_render = 1;
622: $is_block = 0;
623: } elsif ($in_render) {
624: if (string_in_array($all_block, $tag)) {
625: $is_block = 1;
626: }
627: }
628: } elsif ($token->[0] eq 'E') {
629: my $tag = lc($token->[1]);
630: if ($tag eq 'definetag') {
631: $in_definetag = 0;
632: if (defined $newtags{$newtag}) {
633: $newtags{$newtag} = $newtags{$newtag} || $is_block;
634: } else {
635: $newtags{$newtag} = $is_block;
636: }
637: } elsif ($in_definetag && $tag eq 'render') {
638: $in_render = 0;
639: }
640: }
641: }
642: foreach $newtag (keys(%newtags)) {
643: if ($newtags{$newtag} == 1) {
644: push(@blocks, $newtag);
645: } else {
646: push(@inlines, $newtag);
647: }
648: }
649: return {'block'=>\@blocks, 'inline'=>\@inlines};
650: }
651:
652: ##
653: # Marks as block the elements that contain block elements in the input file.
654: # Also adds to @containing_styles the sty elements that contain styles.
655: # @param {string} fn - the file path
656: # @param {Hash<string,Array>} new_elements - contains arrays in 'block' and 'inline'
657: ##
658: sub better_guess {
659: my ($root, $new_elements, $all_block) = @_;
660: my $new_blocks = $new_elements->{'block'};
661: my $new_inlines = $new_elements->{'inline'};
662:
663: my @change = (); # change these elements from inline to block
664: foreach my $tag (@{$new_inlines}) {
665: my @nodes = $root->getElementsByTagName($tag);
666: NODE_LOOP: foreach my $node (@nodes) {
667: for (my $child=$node->firstChild; defined $child; $child=$child->nextSibling) {
668: if ($child->nodeType == XML_ELEMENT_NODE) {
669: if (string_in_array($all_block, $child->nodeName) || string_in_array($new_blocks, $child->nodeName)) {
670: push(@change, $tag);
671: last NODE_LOOP;
672: }
673: }
674: }
675: }
676: }
677: foreach my $inline (@change) {
678: my $index = 0;
679: $index++ until $new_inlines->[$index] eq $inline;
680: splice(@{$new_inlines}, $index, 1);
681: push(@{$new_blocks}, $inline);
682: }
683: # add to @containing_styles when a style is used inside
684: # NOTE: some sty elements will be added even though they should not, but if we don't do that
685: # all style will be removed in the sty elements.
686: foreach my $tag ((@{$new_blocks}, @{$new_inlines})) {
687: my @nodes = $root->getElementsByTagName($tag);
688: NODE_LOOP: foreach my $node (@nodes) {
689: for (my $child=$node->firstChild; defined $child; $child=$child->nextSibling) {
690: if ($child->nodeType == XML_ELEMENT_NODE) {
691: if (string_in_array(\@html_styles, $child->nodeName)) {
692: push(@containing_styles, $tag);
693: last NODE_LOOP;
694: }
695: }
696: }
697: }
698: }
699: }
700:
701: # When a style element contains a block, move the style inside the block where it is allowed.
702: # style/block/other -> block/style/other
703: # When a style is used where it is not allowed, move it inside its children or remove it (unless it contains only text)
704: # element_not_containing_styles/style/other -> element_not_containing_styles/other/style (except if other is a style)
705: # The fix is not perfect in the case of element_not_containing_styles/style1/style2/block/text (style1 will be lost):
706: # element_not_containing_styles/style1/style2/block/text -> element_not_containing_styles/block/style2/text
707: # (a solution to this problem would be to merge the styles in a span)
708: # NOTE: .sty defined elements might have been added to @containing_styles by better_guess().
709: sub fix_block_styles {
710: my ($element, $all_block) = @_;
711: my $doc = $element->ownerDocument;
712: if (string_in_array(\@html_styles, $element->nodeName)) {
713: # move spaces out of the style element
714: if (defined $element->firstChild && $element->firstChild->nodeType == XML_TEXT_NODE) {
715: my $child = $element->firstChild;
716: if ($child->nodeValue =~ /^(\s+)(\S.*)$/s) {
717: $element->parentNode->insertBefore($doc->createTextNode($1), $element);
718: $child->setData($2);
719: }
720: }
721: if (defined $element->lastChild && $element->lastChild->nodeType == XML_TEXT_NODE) {
722: my $child = $element->lastChild;
723: if ($child->nodeValue =~ /^(.*\S)(\s+)$/s) {
724: $element->parentNode->insertAfter($doc->createTextNode($2), $element);
725: $child->setData($1);
726: }
727: }
728:
729: my $found_block = 0;
730: for (my $child=$element->firstChild; defined $child; $child=$child->nextSibling) {
731: if ($child->nodeType == XML_ELEMENT_NODE && string_in_array($all_block, $child->nodeName)) {
732: $found_block = 1;
733: last;
734: }
735: }
736: my $no_style_here = !string_in_array(\@containing_styles, $element->parentNode->nodeName);
737: if ($found_block || $no_style_here) {
738: # there is a block or the style is not allowed here,
739: # the style element has to be replaced by its modified children
740: my $s; # a clone of the style
741: my $next;
742: for (my $child=$element->firstChild; defined $child; $child=$next) {
743: $next = $child->nextSibling;
744: if ($child->nodeType == XML_ELEMENT_NODE && (string_in_array($all_block, $child->nodeName) ||
745: $child->nodeName eq 'br' || $no_style_here)) {
746: # avoid inverting a style with a style with $no_style_here (that would cause endless recursion)
747: if (!$no_style_here || (!string_in_array(\@html_styles, $child->nodeName) &&
748: string_in_array(\@containing_styles, $child->nodeName))) {
749: # block node or inline node when the style is not allowed:
750: # move all children inside the style, and make the style the only child
751: $s = $element->cloneNode();
752: my $next2;
753: for (my $child2=$child->firstChild; defined $child2; $child2=$next2) {
754: $next2 = $child2->nextSibling;
755: $child->removeChild($child2);
756: $s->appendChild($child2);
757: }
758: $child->appendChild($s);
759: }
760: $s = undef;
761: } elsif (($child->nodeType == XML_TEXT_NODE && $child->nodeValue !~ /^\s*$/) ||
762: $child->nodeType == XML_ELEMENT_NODE) {
763: # if the style is allowed, move text and inline nodes inside the style
764: if (!$no_style_here) {
765: if (!defined $s) {
766: $s = $element->cloneNode();
767: $element->insertBefore($s, $child);
768: }
769: $element->removeChild($child);
770: $s->appendChild($child);
771: }
772: } else {
773: # do not put other nodes inside the style
774: $s = undef;
775: }
776: }
777: # now replace by children and fix them
778: my $parent = $element->parentNode;
779: for (my $child=$element->firstChild; defined $child; $child=$next) {
780: $next = $child->nextSibling;
781: $element->removeChild($child);
782: $parent->insertBefore($child, $element);
783: if ($child->nodeType == XML_ELEMENT_NODE) {
784: fix_block_styles($child, $all_block);
785: }
786: }
787: $parent->removeChild($element);
788: return;
789: }
790: }
791: # otherwise fix all children
792: my $next;
793: for (my $child=$element->firstChild; defined $child; $child=$next) {
794: $next = $child->nextSibling;
795: if ($child->nodeType == XML_ELEMENT_NODE) {
796: fix_block_styles($child, $all_block);
797: }
798: }
799: }
800:
801: # removes empty font elements and font elements that contain at least one block element
802: # replaces other font elements by equivalent span
803: sub fix_fonts {
804: my ($root, $all_block) = @_;
805: my $doc = $root->ownerDocument;
806: my @fonts = $root->getElementsByTagName('font');
807: @fonts = reverse(@fonts); # to deal with the ancestor last in the case of font/font
808: foreach my $font (@fonts) {
809: my $block = 0;
810: for (my $child=$font->firstChild; defined $child; $child=$child->nextSibling) {
811: if (string_in_array($all_block, $child->nodeName) || string_in_array(\@inline_like_block, $child->nodeName)) {
812: $block = 1;
813: last;
814: }
815: }
816: if (!defined $font->firstChild || $block) {
817: # empty font or font containing block elements
818: # replace this node by its content
819: replace_by_children($font);
820: } else {
821: # replace by equivalent span
822: my $color = get_non_empty_attribute($font, 'color');
823: my $size = get_non_empty_attribute($font, 'size');
824: my $face = get_non_empty_attribute($font, 'face');
825: if (defined $face) {
826: $face =~ s/^,|,$//;
827: }
828: if (!defined $color && !defined $size && !defined $face) {
829: # useless font element: replace this node by its content
830: replace_by_children($font);
831: next;
832: }
833: my $replacement;
834: tie (my %properties, 'Tie::IxHash', ());
835: if (!defined $color && !defined $size && defined $face && lc($face) eq 'symbol') {
836: $replacement = $doc->createDocumentFragment();
837: } else {
838: $replacement = $doc->createElement('span');
839: my $css = '';
840: if (defined $color) {
1.3 damieng 841: $color =~ s/^x/#/;
1.1 damieng 842: $properties{'color'} = $color;
843: }
844: if (defined $size) {
845: my %hash = (
846: '1' => 'x-small',
847: '2' => 'small',
848: '3' => 'medium',
849: '4' => 'large',
850: '5' => 'x-large',
851: '6' => 'xx-large',
852: '7' => '300%',
853: '-1' => 'small',
854: '-2' => 'x-small',
855: '+1' => 'large',
856: '+2' => 'x-large',
857: '+3' => 'xx-large',
858: '+4' => '300%',
859: );
860: my $value = $hash{$size};
861: if (!defined $value) {
862: $value = 'medium';
863: }
864: $properties{'font-size'} = $value;
865: }
866: if (defined $face) {
867: if (lc($face) ne 'symbol' && lc($face) ne 'bold') {
868: $properties{'font-family'} = $face;
869: }
870: }
871: set_css_properties($replacement, \%properties);
872: }
873: if (defined $face && lc($face) eq 'symbol') {
874: # convert all content to unicode
875: my $next;
876: for (my $child=$font->firstChild; defined $child; $child=$next) {
877: $next = $child->nextSibling;
878: if ($child->nodeType == XML_TEXT_NODE) {
879: my $value = $child->nodeValue;
880: $value =~ tr/ABGDEZHQIKLMNXOPRSTUFCYWabgdezhqiklmnxoprVstufcywJjv¡«¬®/ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩαβγδεζηθικλμνξοπρςστυφχψωϑϕϖϒ↔←→/;
881: $child->setData($value);
882: }
883: }
884: }
885: # replace the font node
886: if ($replacement->nodeType == XML_ELEMENT_NODE && !defined $font->previousSibling &&
887: !defined $font->nextSibling && string_in_array(\@accepting_style, $font->parentNode->nodeName)) {
888: # use CSS on the parent block and replace font by its children instead of using a new element
889: set_css_properties($font->parentNode, \%properties);
890: replace_by_children($font);
891: } else {
892: # move all font children inside the replacement (span or fragment)
893: my $next;
894: for (my $child=$font->firstChild; defined $child; $child=$next) {
895: $next = $child->nextSibling;
896: $font->removeChild($child);
897: $replacement->appendChild($child);
898: }
899: # replace font
900: $font->parentNode->replaceChild($replacement, $font);
901: }
902: }
903: }
904: $root->normalize();
905: }
906:
907: # replaces u by <span style="text-decoration: underline">
908: sub replace_u {
909: my ($root) = @_;
910: my $doc = $root->ownerDocument;
911: my @us = $root->getElementsByTagName('u');
912: foreach my $u (@us) {
913: my $span = $doc->createElement('span');
914: $span->setAttribute('style', 'text-decoration: underline');
915: my $next;
916: for (my $child=$u->firstChild; defined $child; $child=$next) {
917: $next = $child->nextSibling;
918: $u->removeChild($child);
919: $span->appendChild($child);
920: }
921: $u->parentNode->replaceChild($span, $u);
922: }
923: }
924:
925: # removes CDATA sections tags that have not been parsed correcty by the HTML parser
926: # also removes bad comments in script elements
927: sub remove_bad_cdata_sections {
928: my ($root) = @_;
929: my $doc = $root->ownerDocument;
930: foreach my $name (@preserve_elements) {
931: my @nodes = $root->getElementsByTagName($name);
932: foreach my $node (@nodes) {
933: if (defined $node->firstChild && $node->firstChild->nodeType == XML_TEXT_NODE) {
934: my $value = $node->firstChild->nodeValue;
935: if ($name eq 'script' && (!defined $node->getAttribute('type') || $node->getAttribute('type') ne 'loncapa/perl') &&
936: !defined $node->firstChild->nextSibling && $value =~ /^(\s*)<!--(.*)-->(\s*)$/) {
937: # web browsers interpret that as a real comment when it is on 1 line, but the Perl HTML parser thinks it is the script
938: # -> turning it back into a comment
939: # (this is only true for Javascript script elements, since LON-CAPA does not parse loncapa/perl scripts in the same way)
940: $node->removeChild($node->firstChild);
941: $node->appendChild($doc->createComment($2));
942: next;
943: }
944: # at the beginning:
945: $value =~ s/^(\s*)<!\[CDATA\[/$1/; # <![CDATA[
946: $value =~ s/^(\s*)\/\*\s*<!\[CDATA\[\s*\*\//$1/; # /* <![CDATA[ */
947: $value =~ s/^(\s*)\/\/\s*<!\[CDATA\[/$1/; # // <![CDATA[
948: $value =~ s/^(\s*)(\/\/)?\s*<!--/$1/; # // <!--
949: # at the end:
950: $value =~ s/\/\/\s*\]\]>(\s*)$/$1/; # // ]]>
951: $value =~ s/\]\]>(\s*)$/$1/; # ]]>
952: $value =~ s/(\/\/)?\s*-->(\s*)$/$2/; # // -->
953: $value =~ s/\/\*\s*\]\]>\s*\*\/(\s*)$/$1/; # /* ]]> */
954:
955: $value = "\n".$value."\n";
956: $value =~ s/\s*(\n[ \t]*)/$1/;
957: $value =~ s/\s+$/\n/;
958: $node->firstChild->setData($value);
959: }
960: }
961: }
962: }
963:
964: # adds CDATA sections to scripts
965: sub add_cdata_sections {
966: my ($root) = @_;
967: my $doc = $root->ownerDocument;
968: my @scripts = $root->getElementsByTagName('script');
969: my @answers = $root->getElementsByTagName('answer');
970: foreach my $answer (@answers) {
971: my $ancestor = $answer->parentNode;
972: my $found_capa_response = 0;
973: while (defined $ancestor) {
974: if ($ancestor->nodeName eq 'numericalresponse' || $ancestor->nodeName eq 'formularesponse') {
975: $found_capa_response = 1;
976: last;
977: }
978: $ancestor = $ancestor->parentNode;
979: }
980: if (!$found_capa_response) {
981: push(@scripts, $answer);
982: }
983: }
984: foreach my $script (@scripts) {
985: # use a CDATA section in the normal situation, for any script
986: my $first = $script->firstChild;
987: if (defined $first && $first->nodeType == XML_TEXT_NODE && !defined $first->nextSibling) {
988: my $cdata = $doc->createCDATASection($first->nodeValue);
989: $script->replaceChild($cdata, $first);
990: }
991: }
992: }
993:
994: # removes "<!--" and "-->" at the beginning and end of style elements
995: sub fix_style_element {
996: my ($root) = @_;
997: my @styles = $root->getElementsByTagName('style');
998: foreach my $style (@styles) {
999: if (defined $style->firstChild && $style->firstChild->nodeType == XML_TEXT_NODE &&
1000: !defined $style->firstChild->nextSibling) {
1001: my $text = $style->firstChild->nodeValue;
1002: if ($text =~ /^\s*<!--(.*)-->\s*$/s) {
1003: $style->firstChild->setData($1);
1004: }
1005: }
1006: }
1007: }
1008:
1009: # create missing cells at the end of table rows
1010: sub fix_tables {
1011: my ($root) = @_;
1012: my @tables = $root->getElementsByTagName('table');
1013: foreach my $table (@tables) {
1014: fix_cells($table);
1015: foreach my $tbody ($table->getChildrenByTagName('tbody')) {
1016: fix_cells($tbody);
1017: }
1018: foreach my $thead ($table->getChildrenByTagName('thead')) {
1019: fix_cells($thead);
1020: }
1021: foreach my $tfoot ($table->getChildrenByTagName('tfoot')) {
1022: fix_cells($tfoot);
1023: }
1024: }
1025: }
1026:
1027: # create missing cells at the end of table rows
1028: sub fix_cells {
1029: my ($table) = @_; # could actually be table, tbody, thead or tfoot
1030: my $doc = $table->ownerDocument;
1031: my @nb_cells = ();
1032: my $max_nb_cells = 0;
1033: my @rowspans = ();
1034: my @trs = $table->getChildrenByTagName('tr');
1035: foreach my $tr (@trs) {
1036: my $nb_cells;
1037: if (defined $rowspans[0]) {
1038: $nb_cells = shift(@rowspans);
1039: } else {
1040: $nb_cells = 0;
1041: }
1042: for (my $cell=$tr->firstChild; defined $cell; $cell=$cell->nextSibling) {
1043: if ($cell->nodeName eq 'td' || $cell->nodeName eq 'th') {
1044: my $colspan = $cell->getAttribute('colspan');
1045: if (defined $colspan && $colspan =~ /^\s*[0-9]+\s*$/) {
1046: $nb_cells += $colspan;
1047: } else {
1048: $nb_cells++;
1049: }
1050: my $rowspan = $cell->getAttribute('rowspan');
1051: if (defined $rowspan && $rowspan =~ /^\s*[0-9]+\s*$/) {
1052: for (my $i=0; $i < $rowspan-1; $i++) {
1053: if (!defined $rowspans[$i]) {
1054: $rowspans[$i] = 1;
1055: } else {
1056: $rowspans[$i]++;
1057: }
1058: }
1059: }
1060: }
1061: }
1062: push(@nb_cells, $nb_cells);
1063: if ($nb_cells > $max_nb_cells) {
1064: $max_nb_cells = $nb_cells;
1065: }
1066: }
1067: foreach my $tr (@trs) {
1068: my $nb_cells = shift(@nb_cells);
1069: if ($nb_cells < $max_nb_cells) {
1070: for (1..($max_nb_cells - $nb_cells)) {
1071: $tr->appendChild($doc->createElement('td'));
1072: }
1073: }
1074: }
1075: }
1076:
1077: # replaces ul/ul by ul/li/ul and the same for ol (using the previous li if possible)
1078: # also adds a ul element when a li has no ul/ol ancestor
1079: sub fix_lists {
1080: my ($root) = @_;
1081: my $doc = $root->ownerDocument;
1082: my @uls = $root->getElementsByTagName('ul');
1083: my @ols = $root->getElementsByTagName('ol');
1084: my @lists = (@uls, @ols);
1085: foreach my $list (@lists) {
1086: my $next;
1087: for (my $child=$list->firstChild; defined $child; $child=$next) {
1088: $next = $child->nextSibling;
1089: if ($child->nodeType == XML_ELEMENT_NODE && string_in_array(['ul','ol'], $child->nodeName)) {
1090: my $previous = $child->previousNonBlankSibling(); # note: non-DOM method
1091: $list->removeChild($child);
1092: if (defined $previous && $previous->nodeType == XML_ELEMENT_NODE && $previous->nodeName eq 'li') {
1093: $previous->appendChild($child);
1094: } else {
1095: my $li = $doc->createElement('li');
1096: $li->appendChild($child);
1097: if (!defined $next) {
1098: $list->appendChild($li);
1099: } else {
1100: $list->insertBefore($li, $next);
1101: }
1102: }
1103: }
1104: }
1105: }
1106: my @lis = $root->getElementsByTagName('li');
1107: foreach my $li (@lis) {
1108: my $found_list_ancestor = 0;
1109: my $ancestor = $li->parentNode;
1110: while (defined $ancestor) {
1111: if ($ancestor->nodeName eq 'ul' || $ancestor->nodeName eq 'ol') {
1112: $found_list_ancestor = 1;
1113: last;
1114: }
1115: $ancestor = $ancestor->parentNode;
1116: }
1117: if (!$found_list_ancestor) {
1118: # replace li by ul and add li under ul
1119: my $ul = $doc->createElement('ul');
1120: $li->parentNode->insertBefore($ul, $li);
1121: $li->parentNode->removeChild($li);
1122: $ul->appendChild($li);
1123: # add all other li afterwards inside ul (there might be text nodes in-between)
1124: my $next = $ul->nextSibling;
1125: while (defined $next) {
1126: my $next_next = $next->nextSibling;
1127: if ($next->nodeType == XML_TEXT_NODE && $next->nodeValue =~ /^\s*$/ &&
1128: defined $next_next && $next_next->nodeType == XML_ELEMENT_NODE && $next_next->nodeName eq 'li') {
1129: $next->parentNode->removeChild($next);
1130: $ul->appendChild($next);
1131: $next = $next_next;
1132: $next_next = $next_next->nextSibling;
1133: }
1134: if ($next->nodeType == XML_ELEMENT_NODE && $next->nodeName eq 'li') {
1135: $next->parentNode->removeChild($next);
1136: $ul->appendChild($next);
1137: } else {
1138: last;
1139: }
1140: $next = $next_next;
1141: }
1142: }
1143: }
1144: }
1145:
1146: # Some "image" elements are actually img element with a wrong name. This renames them.
1147: # Amazingly enough, "<image src=..." displays an image in some browsers
1148: # ("image" has existed at some point as an experimental HTML element).
1149: sub fix_wrong_name_for_img {
1150: my ($root) = @_;
1151: my @images = $root->getElementsByTagName('image');
1152: foreach my $image (@images) {
1153: if (!defined $image->getAttribute('src')) {
1154: next;
1155: }
1156: my $found_correct_ancestor = 0;
1157: my $ancestor = $image->parentNode;
1158: while (defined $ancestor) {
1159: if ($ancestor->nodeName eq 'drawimage' || $ancestor->nodeName eq 'imageresponse') {
1160: $found_correct_ancestor = 1;
1161: last;
1162: }
1163: $ancestor = $ancestor->parentNode;
1164: }
1165: if ($found_correct_ancestor) {
1166: next;
1167: }
1168: # this really has to be renamed "img"
1169: $image->setNodeName('img');
1170: }
1171: }
1172:
1173: # Replaces many deprecated attributes and replaces them by equivalent CSS when possible
1174: sub replace_deprecated_attributes_by_css {
1175: my ($root) = @_;
1176:
1177: fix_deprecated_in_tables($root);
1178:
1179: fix_deprecated_in_table_rows($root);
1180:
1181: fix_deprecated_in_table_cells($root);
1182:
1183: fix_deprecated_in_lists($root);
1184:
1185: fix_deprecated_in_list_items($root);
1186:
1187: fix_deprecated_in_hr($root);
1188:
1189: fix_deprecated_in_img($root);
1190:
1191: fix_deprecated_in_body($root);
1192:
1193: fix_align_attribute($root);
1194: }
1195:
1196: # Replaces deprecated attributes in tables
1197: sub fix_deprecated_in_tables {
1198: my ($root) = @_;
1199: my @tables = $root->getElementsByTagName('table');
1200: foreach my $table (@tables) {
1201: tie (my %new_properties, 'Tie::IxHash', ());
1202: my $align = $table->getAttribute('align');
1203: if (defined $align) {
1204: $table->removeAttribute('align');
1205: $align = lc(trim($align));
1206: }
1207: if ($table->parentNode->nodeName eq 'center' || (defined $align && $align eq 'center') ||
1208: (defined $table->parentNode->getAttribute('align') && $table->parentNode->getAttribute('align') eq 'center')) {
1209: $new_properties{'margin-left'} = 'auto';
1210: $new_properties{'margin-right'} = 'auto';
1211: }
1212: if (defined $align && ($align eq 'left' || $align eq 'right')) {
1213: $new_properties{'float'} = $align;
1214: }
1215: my $width = $table->getAttribute('width');
1216: if (defined $width) {
1217: $table->removeAttribute('width');
1218: $width = trim($width);
1219: if ($width =~ /^[0-9]+$/) {
1220: $width .= 'px';
1221: }
1222: if ($width ne '') {
1223: $new_properties{'width'} = $width;
1224: }
1225: }
1226: my $height = $table->getAttribute('height');
1227: if (defined $height) {
1228: $table->removeAttribute('height');
1229: # no replacement for table height
1230: }
1231: my $bgcolor = $table->getAttribute('bgcolor');
1232: if (defined $bgcolor) {
1233: $table->removeAttribute('bgcolor');
1234: $bgcolor = trim($bgcolor);
1235: $bgcolor =~ s/^x\s*//;
1236: if ($bgcolor ne '') {
1237: $new_properties{'background-color'} = $bgcolor;
1238: }
1239: }
1240: my $frame = $table->getAttribute('frame');
1241: if (defined $frame) {
1242: $table->removeAttribute('frame');
1243: $frame = lc(trim($frame));
1244: if ($frame eq 'void') {
1245: $new_properties{'border'} = 'none';
1246: } elsif ($frame eq 'above') {
1247: $new_properties{'border-top'} = '1px solid black';
1248: } elsif ($frame eq 'below') {
1249: $new_properties{'border-bottom'} = '1px solid black';
1250: } elsif ($frame eq 'hsides') {
1251: $new_properties{'border-top'} = '1px solid black';
1252: $new_properties{'border-bottom'} = '1px solid black';
1253: } elsif ($frame eq 'vsides') {
1254: $new_properties{'border-left'} = '1px solid black';
1255: $new_properties{'border-right'} = '1px solid black';
1256: } elsif ($frame eq 'lhs') {
1257: $new_properties{'border-left'} = '1px solid black';
1258: } elsif ($frame eq 'rhs') {
1259: $new_properties{'border-right'} = '1px solid black';
1260: } elsif ($frame eq 'box') {
1261: $new_properties{'border'} = '1px solid black';
1262: } elsif ($frame eq 'border') {
1263: $new_properties{'border'} = '1px solid black';
1264: }
1265: }
1266: if (scalar(keys %new_properties) > 0) {
1267: set_css_properties($table, \%new_properties);
1268: }
1269: # we can't replace the border attribute without creating a style block, but we can improve things like border="BORDER"
1270: my $border = $table->getAttribute('border');
1271: if (defined $border) {
1272: $border = trim($border);
1273: if ($border !~ /^\s*[0-9]+\s*(px)?\s*$/) {
1274: $table->setAttribute('border', '1');
1275: }
1276: }
1277: }
1278:
1279: }
1280:
1281: # Replaces deprecated attributes in tr elements
1282: sub fix_deprecated_in_table_rows {
1283: my ($root) = @_;
1284: my @trs = $root->getElementsByTagName('tr');
1285: foreach my $tr (@trs) {
1286: my $old_properties = get_css_properties($tr);
1287: tie (my %new_properties, 'Tie::IxHash', ());
1288: my $bgcolor = $tr->getAttribute('bgcolor');
1289: if (defined $bgcolor) {
1290: $tr->removeAttribute('bgcolor');
1291: if (!defined $old_properties->{'background-color'}) {
1292: $bgcolor = trim($bgcolor);
1293: $bgcolor =~ s/^x\s*//;
1294: if ($bgcolor ne '') {
1295: $new_properties{'background-color'} = $bgcolor;
1296: }
1297: }
1298: }
1299: my $align = $tr->getAttribute('align');
1300: if (defined $align && $align !~ /\s*char\s*/i) {
1301: $tr->removeAttribute('align');
1302: if (!defined $old_properties->{'text-align'}) {
1303: $align = lc(trim($align));
1304: if ($align ne '') {
1305: $new_properties{'text-align'} = $align;
1306: }
1307: }
1308: }
1309: my $valign = $tr->getAttribute('valign');
1310: if (defined $valign) {
1311: $tr->removeAttribute('valign');
1312: if (!defined $old_properties->{'vertical-align'}) {
1313: $valign = lc(trim($valign));
1314: if ($valign ne '') {
1315: $new_properties{'vertical-align'} = $valign;
1316: }
1317: }
1318: }
1319: if (scalar(keys %new_properties) > 0) {
1320: set_css_properties($tr, \%new_properties);
1321: }
1322: }
1323: }
1324:
1325: # Replaces deprecated attributes in table cells (td and th)
1326: sub fix_deprecated_in_table_cells {
1327: my ($root) = @_;
1328: my @tds = $root->getElementsByTagName('td');
1329: my @ths = $root->getElementsByTagName('th');
1330: my @cells = (@tds, @ths);
1331: foreach my $cell (@cells) {
1332: my $old_properties = get_css_properties($cell);
1333: tie (my %new_properties, 'Tie::IxHash', ());
1334: my $width = $cell->getAttribute('width');
1335: if (defined $width) {
1336: $cell->removeAttribute('width');
1337: if (!defined $old_properties->{'width'}) {
1338: $width = trim($width);
1339: if ($width =~ /^[0-9]+$/) {
1340: $width .= 'px';
1341: }
1342: if ($width ne '') {
1343: $new_properties{'width'} = $width;
1344: }
1345: }
1346: }
1347: my $height = $cell->getAttribute('height');
1348: if (defined $height) {
1349: $cell->removeAttribute('height');
1350: if (!defined $old_properties->{'height'}) {
1351: $height = trim($height);
1352: if ($height =~ /^[0-9]+$/) {
1353: $height .= 'px';
1354: }
1355: if ($height ne '') {
1356: $new_properties{'height'} = $height;
1357: }
1358: }
1359: }
1360: my $bgcolor = $cell->getAttribute('bgcolor');
1361: if (defined $bgcolor) {
1362: $cell->removeAttribute('bgcolor');
1363: if (!defined $old_properties->{'background-color'}) {
1364: $bgcolor = trim($bgcolor);
1365: $bgcolor =~ s/^x\s*//;
1366: if ($bgcolor ne '') {
1367: $new_properties{'background-color'} = $bgcolor;
1368: }
1369: }
1370: }
1371: my $align = $cell->getAttribute('align');
1372: if (defined $align && $align !~ /\s*char\s*/i) {
1373: $cell->removeAttribute('align');
1374: if (!defined $old_properties->{'text-align'}) {
1375: $align = lc(trim($align));
1376: if ($align ne '') {
1377: $new_properties{'text-align'} = $align;
1378: }
1379: }
1380: }
1381: my $valign = $cell->getAttribute('valign');
1382: if (defined $valign) {
1383: $cell->removeAttribute('valign');
1384: if (!defined $old_properties->{'vertical-align'}) {
1385: $valign = lc(trim($valign));
1386: if ($valign ne '') {
1387: $new_properties{'vertical-align'} = $valign;
1388: }
1389: }
1390: }
1391: if (scalar(keys %new_properties) > 0) {
1392: set_css_properties($cell, \%new_properties);
1393: }
1394: }
1395: }
1396:
1397: # Replaces deprecated attributes in lists (ul and ol)
1398: sub fix_deprecated_in_lists {
1399: my ($root) = @_;
1400: my @uls = $root->getElementsByTagName('ul');
1401: my @ols = $root->getElementsByTagName('ol');
1402: my @lists = (@uls, @ols);
1403: foreach my $list (@lists) {
1404: my $type = $list->getAttribute('type');
1405: if (defined $type) {
1406: my $lst = list_style_type($type);
1407: if (defined $lst) {
1408: $list->removeAttribute('type');
1409: if (!defined get_css_property($list, 'list-style-type')) {
1410: set_css_property($list, 'list-style-type', $lst);
1411: }
1412: }
1413: }
1414: }
1415: }
1416:
1417: # Replaces deprecated attributes in list items (li)
1418: sub fix_deprecated_in_list_items {
1419: my ($root) = @_;
1420: my @lis = $root->getElementsByTagName('li');
1421: foreach my $li (@lis) {
1422: my $type = $li->getAttribute('type');
1423: if (defined $type) {
1424: my $lst = list_style_type($type);
1425: if (defined $lst) {
1426: $li->removeAttribute('type');
1427: if (!defined get_css_property($li, 'list-style-type')) {
1428: set_css_property($li, 'list-style-type', $lst);
1429: }
1430: }
1431: }
1432: }
1433: }
1434:
1435: # returns the CSS list-style-type value equivalent to the given type attribute for a list or list item
1436: sub list_style_type {
1437: my ($type) = @_;
1438: my $value;
1439: $type = trim($type);
1440: if (lc($type) eq 'circle') {
1441: $value = 'circle';
1442: } elsif (lc($type) eq 'disc') {
1443: $value = 'disc';
1444: } elsif (lc($type) eq 'square') {
1445: $value = 'square';
1446: } elsif ($type eq 'a') {
1447: $value = 'lower-latin';
1448: } elsif ($type eq 'A') {
1449: $value = 'upper-latin';
1450: } elsif ($type eq 'i') {
1451: $value = 'lower-roman';
1452: } elsif ($type eq 'I') {
1453: $value = 'upper-roman';
1454: } elsif ($type eq '1') {
1455: $value = 'decimal';
1456: }
1457: return $value;
1458: }
1459:
1460: # Replaces deprecated attributes in hr
1461: sub fix_deprecated_in_hr {
1462: my ($root) = @_;
1463: my @hrs = $root->getElementsByTagName('hr');
1464: foreach my $hr (@hrs) {
1465: tie (my %new_properties, 'Tie::IxHash', ());
1466: my $align = $hr->getAttribute('align');
1467: if (defined $align) {
1468: $align = lc(trim($align));
1469: if ($align eq 'left') {
1470: $new_properties{'text-align'} = 'left';
1471: $new_properties{'margin-left'} = '0';
1472: } elsif ($align eq 'right') {
1473: $new_properties{'text-align'} = 'right';
1474: $new_properties{'margin-right'} = '0';
1475: }
1476: $hr->removeAttribute('align');
1477: }
1478: my $color = $hr->getAttribute('color');
1479: if (defined $color) {
1480: $color = trim($color);
1481: $color =~ s/^x\s*//;
1482: if ($color ne '') {
1483: $new_properties{'color'} = $color;
1484: $new_properties{'background-color'} = $color;
1485: }
1486: $hr->removeAttribute('color');
1487: }
1488: my $noshade = $hr->getAttribute('noshade');
1489: my $size = $hr->getAttribute('size');
1490: if (defined $noshade) {
1491: $new_properties{'border-width'} = '0';
1492: if (!defined $color) {
1493: $new_properties{'color'} = 'gray';
1494: $new_properties{'background-color'} = 'gray';
1495: }
1496: if (!defined $size) {
1497: $size = '2';
1498: }
1499: $hr->removeAttribute('noshade');
1500: }
1501: if (defined $size) {
1502: $size = trim($size);
1503: if ($size ne '') {
1504: $new_properties{'height'} = $size.'px';
1505: }
1506: if (defined $hr->getAttribute('size')) {
1507: $hr->removeAttribute('size');
1508: }
1509: }
1510: my $width = $hr->getAttribute('width');
1511: if (defined $width) {
1512: $width = trim($width);
1513: if ($width ne '') {
1514: if ($width !~ /\%$/) {
1515: $width .= 'px';
1516: }
1517: $new_properties{'width'} = $width;
1518: }
1519: $hr->removeAttribute('width');
1520: }
1521: if (scalar(keys %new_properties) > 0) {
1522: set_css_properties($hr, \%new_properties);
1523: }
1524: }
1525: }
1526:
1527: # Replaces deprecated attributes in img
1528: sub fix_deprecated_in_img {
1529: my ($root) = @_;
1530: my @imgs = $root->getElementsByTagName('img');
1531: foreach my $img (@imgs) {
1532: my $old_properties = get_css_properties($img);
1533: tie (my %new_properties, 'Tie::IxHash', ());
1534: my $align = $img->getAttribute('align');
1535: if (defined $align) {
1536: $align = lc(trim($align));
1537: if ($align eq 'middle' || $align eq 'top' || $align eq 'bottom') {
1538: $img->removeAttribute('align');
1539: if (!defined $old_properties->{'vertical-align'}) {
1540: $new_properties{'vertical-align'} = $align;
1541: }
1542: } elsif ($align eq 'left' || $align eq 'right') {
1543: $img->removeAttribute('align');
1544: if (!defined $old_properties->{'float'}) {
1545: $new_properties{'float'} = $align;
1546: }
1547: } elsif ($align eq 'center' || $align eq '') {
1548: $img->removeAttribute('align');
1549: }
1550: }
1551: my $border = $img->getAttribute('border');
1552: if (defined $border) {
1553: $border = lc(trim($border));
1554: if ($border =~ /^[0-9]+\s*(px)?$/) {
1555: $img->removeAttribute('border');
1556: if (!defined $old_properties->{'border'}) {
1557: if ($border !~ /px$/) {
1558: $border .= 'px';
1559: }
1560: $new_properties{'border'} = $border.' solid black';
1561: }
1562: }
1563: }
1564: my $hspace = $img->getAttribute('hspace');
1565: if (defined $hspace) {
1566: $hspace = lc(trim($hspace));
1567: if ($hspace =~ /^[0-9]+\s*(px)?$/) {
1568: $img->removeAttribute('hspace');
1569: if (!defined $old_properties->{'margin-left'} || !defined $old_properties->{'margin-right'}) {
1570: if ($hspace !~ /px$/) {
1571: $hspace .= 'px';
1572: }
1573: $new_properties{'margin-left'} = $hspace;
1574: $new_properties{'margin-right'} = $hspace;
1575: }
1576: }
1577: }
1578: if (scalar(keys %new_properties) > 0) {
1579: set_css_properties($img, \%new_properties);
1580: }
1581: }
1582: }
1583:
1584: # Replaces deprecated attributes in htmlbody (the style attribute could be used in a div for output)
1585: sub fix_deprecated_in_body {
1586: my ($root) = @_;
1587: my $doc = $root->ownerDocument;
1588: my @bodies = $root->getElementsByTagName('htmlbody');
1589: foreach my $body (@bodies) {
1590: my $old_properties = get_css_properties($body);
1591: tie (my %new_properties, 'Tie::IxHash', ());
1592: my $bgcolor = $body->getAttribute('bgcolor');
1593: if (defined $bgcolor) {
1594: $body->removeAttribute('bgcolor');
1595: if (!defined $old_properties->{'background-color'}) {
1596: $bgcolor = trim($bgcolor);
1597: $bgcolor =~ s/^x\s*//;
1598: if ($bgcolor ne '') {
1599: $new_properties{'background-color'} = $bgcolor;
1600: }
1601: }
1602: }
1603: my $color = $body->getAttribute('text');
1604: if (defined $color) {
1605: $body->removeAttribute('text');
1606: if (!defined $old_properties->{'color'}) {
1607: $color = trim($color);
1608: $color =~ s/^x\s*//;
1609: if ($color ne '') {
1610: $new_properties{'color'} = $color;
1611: }
1612: }
1613: }
1614: my $background = $body->getAttribute('background');
1615: if (defined $background && ($background =~ /\.jpe?g$|\.gif|\.png/i)) {
1616: $body->removeAttribute('background');
1617: if (!defined $old_properties->{'background-image'}) {
1618: $background = trim($background);
1619: if ($background ne '') {
1620: $new_properties{'background-image'} = 'url('.$background.')';
1621: }
1622: }
1623: }
1624: # NOTE: these attributes have never been standard and are better removed with no replacement
1625: foreach my $bad ('bottommargin', 'leftmargin', 'rightmargin', 'topmargin', 'marginheight', 'marginwidth') {
1626: if ($body->hasAttribute($bad)) {
1627: $body->removeAttribute($bad);
1628: }
1629: }
1630: # NOTE: link alink and vlink require a <style> block to be converted
1631: my $link = $body->getAttribute('link');
1632: my $alink = $body->getAttribute('alink');
1633: my $vlink = $body->getAttribute('vlink');
1634: if (defined $link || defined $alink || defined $vlink) {
1635: my $head;
1636: my @heads = $root->getElementsByTagName('htmlhead');
1637: if (scalar(@heads) > 0) {
1638: $head = $heads[0];
1639: } else {
1640: $head = $doc->createElement('htmlhead');
1641: $root->insertBefore($head, $root->firstChild);
1642: }
1643: my $style = $doc->createElement('style');
1644: $head->appendChild($style);
1645: my $css = "\n";
1646: if (defined $link) {
1647: $body->removeAttribute('link');
1648: $link = trim($link);
1649: $link =~ s/^x\s*//;
1650: $css .= ' a:link { color:'.$link.' }';
1651: $css .= "\n";
1652: }
1653: if (defined $alink) {
1654: $body->removeAttribute('alink');
1655: $alink = trim($alink);
1656: $alink =~ s/^x\s*//;
1657: $css .= ' a:active { color:'.$alink.' }';
1658: $css .= "\n";
1659: }
1660: if (defined $vlink) {
1661: $body->removeAttribute('vlink');
1662: $vlink = trim($vlink);
1663: $vlink =~ s/^x\s*//;
1664: $css .= ' a:visited { color:'.$vlink.' }';
1665: $css .= "\n";
1666: }
1667: $css .= ' ';
1668: $style->appendChild($doc->createTextNode($css));
1669: }
1670: if (scalar(keys %new_properties) > 0) {
1671: set_css_properties($body, \%new_properties);
1672: } elsif (!$body->hasAttributes) {
1673: $body->parentNode->removeChild($body);
1674: }
1675: }
1676: }
1677:
1678: # replaces <div align="center"> by <div style="text-align:center;">
1679: # also for p and h1..h6
1680: sub fix_align_attribute {
1681: my ($root) = @_;
1682: my @nodes = $root->getElementsByTagName('div');
1683: push(@nodes, $root->getElementsByTagName('p'));
1684: for (my $i=1; $i<=6; $i++) {
1685: push(@nodes, $root->getElementsByTagName('h'.$i));
1686: }
1687: foreach my $node (@nodes) {
1688: my $align = $node->getAttribute('align');
1689: if (defined $align) {
1690: $node->removeAttribute('align');
1691: $align = trim($align);
1692: if ($align ne '' && !defined get_css_property($node, 'text-align')) {
1693: set_css_property($node, 'text-align', lc($align));
1694: }
1695: }
1696: }
1697: }
1698:
1699: # replace center by a div or remove it if there is a table inside
1700: sub replace_center {
1701: my ($root, $all_block) = @_;
1702: my $doc = $root->ownerDocument;
1703: my @centers = $root->getElementsByTagName('center');
1704: foreach my $center (@centers) {
1705: if ($center->getChildrenByTagName('table')->size() > 0) { # note: getChildrenByTagName is not DOM (LibXML specific)
1706: replace_by_children($center);
1707: } else {
1708: if ((!defined $center->previousSibling ||
1709: ($center->previousSibling->nodeType == XML_TEXT_NODE && $center->previousSibling->nodeValue =~ /^\s*$/ && !defined $center->previousSibling->previousSibling)) &&
1710: (!defined $center->nextSibling ||
1711: ($center->nextSibling->nodeType == XML_TEXT_NODE && $center->nextSibling->nodeValue =~ /^\s*$/ && !defined $center->nextSibling->nextSibling)) &&
1712: string_in_array(\@accepting_style, $center->parentNode->nodeName)) {
1713: # use CSS on the parent block and replace center by its children
1714: set_css_property($center->parentNode, 'text-align', 'center');
1715: replace_by_children($center);
1716: } else {
1717: # use p or div ? check if there is a block inside
1718: my $found_block = 0;
1719: for (my $child=$center->firstChild; defined $child; $child=$child->nextSibling) {
1720: if ($child->nodeType == XML_ELEMENT_NODE && string_in_array($all_block, $child->nodeName)) {
1721: $found_block = 1;
1722: last;
1723: }
1724: }
1725: my $new_node;
1726: if ($found_block) {
1727: $new_node = $doc->createElement('div');
1728: $new_node->setAttribute('style', 'text-align: center; margin: 0 auto');
1729: } else {
1730: $new_node = $doc->createElement('p');
1731: $new_node->setAttribute('style', 'text-align: center');
1732: }
1733: my $next;
1734: for (my $child=$center->firstChild; defined $child; $child=$next) {
1735: $next = $child->nextSibling;
1736: $center->removeChild($child);
1737: $new_node->appendChild($child);
1738: }
1739: $center->parentNode->replaceChild($new_node, $center);
1740: }
1741: }
1742: }
1743: }
1744:
1745: # replaces <nobr> by <span style="white-space:nowrap">
1746: sub replace_nobr {
1747: my ($root) = @_;
1748: my @nobrs = $root->getElementsByTagName('nobr');
1749: foreach my $nobr (@nobrs) {
1750: if (!defined $nobr->previousSibling && !defined $nobr->nextSibling &&
1751: string_in_array(\@accepting_style, $nobr->parentNode->nodeName)) {
1752: # use CSS on the parent block
1753: set_css_property($nobr->parentNode, 'white-space', 'nowrap');
1754: replace_by_children($nobr);
1755: } else {
1756: $nobr->setNodeName('span');
1757: $nobr->setAttribute('style', 'white-space:nowrap');
1758: }
1759: }
1760: }
1761:
1762: # removes notsolved tags in the case <hintgroup showoncorrect="no"><notsolved>...</notsolved></hintgroup>
1763: # and in the case <notsolved><hintgroup showoncorrect="no">...</hintgroup></notsolved>
1764: sub remove_useless_notsolved {
1765: my ($root) = @_;
1766: my @hintgroups = $root->getElementsByTagName('hintgroup');
1767: foreach my $hintgroup (@hintgroups) {
1768: my $showoncorrect = get_non_empty_attribute($hintgroup, 'showoncorrect');
1769: if (!defined $showoncorrect || $showoncorrect eq 'no') {
1770: my @notsolveds = $hintgroup->getElementsByTagName('notsolved');
1771: foreach my $notsolved (@notsolveds) {
1772: replace_by_children($notsolved);
1773: }
1774: }
1775: my $parent = $hintgroup->parentNode;
1776: if ($parent->nodeName eq 'notsolved' && scalar(@{$parent->nonBlankChildNodes()}) == 1) {
1777: replace_by_children($parent);
1778: }
1779: }
1780: }
1781:
1782: # adds a paragraph inside if needed and calls fix_paragraph for all paragraphs (including new ones)
1783: sub fix_paragraphs_inside {
1784: my ($node, $all_block) = @_;
1785: # blocks in which paragrahs will be added:
1786: my @blocks_with_p = ('loncapa','library','problem','part','problemtype','window','block','while','postanswerdate','preduedate','solved','notsolved','languageblock','translated','lang','instructorcomment','togglebox','standalone','form');
1787: my @fix_p_if_br_or_p = (@responses,'foil','item','text','label','hintgroup','hintpart','hint','web','windowlink','div','li','dd','td','th','blockquote');
1788: if ((string_in_array(\@blocks_with_p, $node->nodeName) && paragraph_needed($node)) ||
1789: (string_in_array(\@fix_p_if_br_or_p, $node->nodeName) && paragraph_inside($node))) {
1790: # if non-empty, add paragraphs where needed between all br and remove br
1791: # (it would be easier to just put everything in a p and fix it afterwards, but there are performance issues
1792: # when a paragraph has many blocks directly inside)
1793: my $doc = $node->ownerDocument;
1794: my $p = undef;
1795: my @new_children = ();
1796: my $next;
1797: for (my $child=$node->firstChild; defined $child; $child=$next) {
1798: $next = $child->nextSibling;
1799: $node->removeChild($child);
1800: if ($child->nodeType == XML_ELEMENT_NODE && $child->nodeName eq 'br') {
1801: if (defined $p) {
1802: push(@new_children, $p);
1803: } else {
1804: push(@new_children, $doc->createElement('p'));
1805: }
1806: $p = undef;
1807: } elsif ($child->nodeType == XML_ELEMENT_NODE && string_in_array(\@inline_like_block, $child->nodeName)) {
1808: # inline_like_block: use the paragraph if there is one, otherwise do not create one
1809: if (defined $p) {
1810: $p->appendChild($child);
1811: } else {
1812: push(@new_children, $child);
1813: }
1814: } elsif ($child->nodeType == XML_ELEMENT_NODE && string_in_array($all_block, $child->nodeName)) {
1815: # these children are blocks and should not be in a paragraph
1816: if (defined $p) {
1817: push(@new_children, $p);
1818: $p = undef;
1819: }
1820: push(@new_children, $child);
1821: } elsif ($child->nodeType == XML_TEXT_NODE && $child->nodeValue =~ /^[ \t\f\n\r]*$/) {
1822: # blank text: add to paragraph if there is one and there is a next node, otherwise keep out of the paragraph
1823: if (defined $p) {
1824: if (defined $next) {
1825: $p->appendChild($child);
1826: } else {
1827: push(@new_children, $p);
1828: $p = undef;
1829: push(@new_children, $child);
1830: }
1831: } else {
1832: push(@new_children, $child);
1833: }
1834: } elsif ($child->nodeType == XML_TEXT_NODE ||
1835: $child->nodeType == XML_ELEMENT_NODE || $child->nodeType == XML_CDATA_SECTION_NODE ||
1836: $child->nodeType == XML_ENTITY_NODE || $child->nodeType == XML_ENTITY_REF_NODE) {
1837: # these children require a paragraph
1838: if (!defined $p) {
1839: $p = $doc->createElement('p');
1840: }
1841: $p->appendChild($child);
1842: } else {
1843: # these children do not require a paragraph (XML comments, PI)
1844: # -> do not move them in a new paragraph
1845: if (defined $p) {
1846: push(@new_children, $p);
1847: $p = undef;
1848: }
1849: push(@new_children, $child);
1850: }
1851: }
1852: if (defined $p) {
1853: push(@new_children, $p);
1854: }
1855: foreach my $child (@new_children) {
1856: $node->appendChild($child);
1857: }
1858: }
1859: # now fix the paragraphs everywhere, so that all inline nodes are inside a paragraph, and block nodes are outside
1860: my $next;
1861: for (my $child=$node->firstChild; defined $child; $child=$next) {
1862: $next = $child->nextSibling;
1863: if ($child->nodeType == XML_ELEMENT_NODE && defined $child->firstChild) {
1864: if ($child->nodeName eq 'p') {
1865: fix_paragraph($child, $all_block);
1866: } else {
1867: fix_paragraphs_inside($child, $all_block);
1868: }
1869: }
1870: }
1871: }
1872:
1873: # returns 1 if a paragraph is needed inside this node (assuming the parent can have paragraphs)
1874: sub paragraph_needed {
1875: my ($node) = @_;
1876: for (my $child=$node->firstChild; defined $child; $child=$child->nextSibling) {
1877: if (($child->nodeType == XML_TEXT_NODE && $child->nodeValue !~ /^\s*$/) ||
1878: ($child->nodeType == XML_ELEMENT_NODE && !string_in_array(\@inline_like_block, $child->nodeName)) ||
1879: $child->nodeType == XML_CDATA_SECTION_NODE ||
1880: $child->nodeType == XML_ENTITY_NODE || $child->nodeType == XML_ENTITY_REF_NODE) {
1881: return(1);
1882: }
1883: }
1884: return(0);
1885: }
1886:
1887: # returns 1 if there is a paragraph or br in a child of this node, or inside an inline child
1888: sub paragraph_inside {
1889: my ($node) = @_;
1890: # inline elements that can be split in half if there is a paragraph inside (currently all HTML):
1891: # (also used in first_block below)
1892: my @splitable_inline = ('span', 'a', 'strong', 'em' , 'b', 'i', 'sup', 'sub', 'code', 'kbd', 'samp', 'tt', 'ins', 'del', 'var', 'small', 'big', 'font', 'u');
1893: for (my $child=$node->firstChild; defined $child; $child=$child->nextSibling) {
1894: if ($child->nodeType == XML_ELEMENT_NODE) {
1895: my $name = $child->nodeName;
1896: if ($name eq 'p' || $name eq 'br') {
1897: return(1);
1898: } elsif (string_in_array(\@splitable_inline, $name)) {
1899: if (paragraph_inside($child)) {
1900: return(1);
1901: }
1902: }
1903: }
1904: }
1905: return(0);
1906: }
1907:
1908: # fixes paragraphs inside paragraphs (without a block in-between)
1909: sub fix_paragraph {
1910: my ($p, $all_block) = @_;
1911: my $loop_right = 1; # this loops is to avoid out of memory errors with recurse, see below
1912: while ($loop_right) {
1913: $loop_right = 0;
1914: my $block = find_first_block($p, $all_block);
1915: if (defined $block) {
1916: my $trees = clone_ancestor_around_node($p, $block);
1917: my $doc = $p->ownerDocument;
1918: my $replacement = $doc->createDocumentFragment();
1919: my $left = $trees->{'left'};
1920: my $middle = $trees->{'middle'};
1921: my $right = $trees->{'right'};
1922: my $left_needs_p = 0; # 1 if it needs a paragraph (used to replace br later)
1923:
1924: if (defined $left) {
1925: # fix paragraphs inside, in case one of the descendants can have paragraphs inside (like numericalresponse/hintgroup):
1926: for (my $child=$left->firstChild; defined $child; $child=$child->nextSibling) {
1927: if ($child->nodeType == XML_ELEMENT_NODE) {
1928: fix_paragraphs_inside($child, $all_block);
1929: }
1930: }
1931: if (!paragraph_needed($left)) {
1932: # this was just blank text, comments or inline responses, it should not create a new paragraph
1933: my $next;
1934: for (my $child=$left->firstChild; defined $child; $child=$next) {
1935: $next = $child->nextSibling;
1936: $left->removeChild($child);
1937: $replacement->appendChild($child);
1938: }
1939: } else {
1940: $left_needs_p = 1;
1941: $replacement->appendChild($left);
1942: }
1943: }
1944:
1945: my $n = $middle->firstChild;
1946: while (defined $n) {
1947: if ($n->nodeType == XML_ELEMENT_NODE && (string_in_array($all_block, $n->nodeName) || $n->nodeName eq 'br')) {
1948: if ($n->nodeName eq 'p') {
1949: my $parent = $n->parentNode;
1950: # first apply recursion
1951: fix_paragraph($n, $all_block);
1952: # now the p might have been replaced by several nodes, which should replace the initial p
1953: my $next_block;
1954: for (my $block=$parent->firstChild; defined $block; $block=$next_block) {
1955: $next_block = $block->nextSibling;
1956: if ($block->nodeName eq 'p') {
1957: $parent->removeChild($block);
1958: # for each parent before $middle, clone in-between the p and its children (to preserve the styles)
1959: if (defined $block->firstChild) {
1960: for (my $p=$parent; $p!=$middle; $p=$p->parentNode) {
1961: my $newp = $p->cloneNode(0);
1962: my $next;
1963: for (my $child=$block->firstChild; defined $child; $child=$next) {
1964: $next = $child->nextSibling;
1965: $block->removeChild($child);
1966: $newp->appendChild($child);
1967: }
1968: $block->appendChild($newp);
1969: }
1970: }
1971: }
1972: $replacement->appendChild($block);
1973: }
1974: } else {
1975: # replace the whole p by this block, forgetting about intermediate inline elements
1976: $n->parentNode->removeChild($n);
1977: if ($n->nodeName eq 'br') {
1978: # replace a br by a paragraph if there was nothing before in the paragraph,
1979: # otherwise remove it because it already broke the paragraph in half
1980: if (!defined $left || !$left_needs_p) {
1981: $replacement->appendChild($middle);
1982: }
1983: } else {
1984: fix_paragraphs_inside($n, $all_block);
1985: $replacement->appendChild($n);
1986: }
1987: }
1988: last;
1989: }
1990: $n = $n->firstChild;
1991: if (defined $n && defined $n->nextSibling) {
1992: die "Error in post_xml.fix_paragraph: block not found";
1993: }
1994: }
1995:
1996: if (defined $right) {
1997: if ($block->nodeName eq 'p') {
1998: # remove attributes on the right paragraph
1999: my @attributelist = $right->attributes();
2000: foreach my $att (@attributelist) {
2001: $right->removeAttribute($att->nodeName);
2002: }
2003: }
2004: if ($right->firstChild->nodeType == XML_TEXT_NODE && $right->firstChild->nodeValue =~ /^[ \t\f\n\r]*$/) {
2005: # remove the first text node with whitespace only from the p, it should not trigger the creation of a p
2006: # (but take nbsp into account, so we should not use \s here)
2007: my $first = $right->firstChild;
2008: $right->removeChild($first);
2009: $replacement->appendChild($first);
2010: }
2011: if (defined $right->firstChild) {
2012: if (paragraph_needed($right)) {
2013: $replacement->appendChild($right);
2014: #fix_paragraph($right, $all_block); This is taking way too much memory for blocks with many children
2015: # -> loop instead of recurse
2016: $loop_right = 1;
2017: } else {
2018: # this was just blank text, comments or inline responses, it should not create a new paragraph
2019: my $next;
2020: for (my $child=$right->firstChild; defined $child; $child=$next) {
2021: $next = $child->nextSibling;
2022: $right->removeChild($child);
2023: $replacement->appendChild($child);
2024: # fix paragraphs inside, in case one of the descendants can have paragraphs inside (like numericalresponse/hintgroup):
2025: if ($child->nodeType == XML_ELEMENT_NODE) {
2026: fix_paragraphs_inside($child, $all_block);
2027: }
2028: }
2029: }
2030: }
2031: }
2032:
2033: $p->parentNode->replaceChild($replacement, $p);
2034:
2035: if ($loop_right) {
2036: $p = $right;
2037: }
2038:
2039: } else {
2040: # fix paragraphs inside, in case one of the descendants can have paragraphs inside (like numericalresponse/hintgroup):
2041: my $next;
2042: for (my $child=$p->firstChild; defined $child; $child=$next) {
2043: $next = $child->nextSibling;
2044: if ($child->nodeType == XML_ELEMENT_NODE) {
2045: fix_paragraphs_inside($child, $all_block);
2046: }
2047: }
2048: }
2049: }
2050: }
2051:
2052: sub find_first_block {
2053: my ($node, $all_block) = @_;
2054: # inline elements that can be split in half if there is a paragraph inside (currently all HTML):
2055: my @splitable_inline = ('span', 'a', 'strong', 'em' , 'b', 'i', 'sup', 'sub', 'code', 'kbd', 'samp', 'tt', 'ins', 'del', 'var', 'small', 'big', 'font', 'u');
2056: for (my $child=$node->firstChild; defined $child; $child=$child->nextSibling) {
2057: if ($child->nodeType == XML_ELEMENT_NODE) {
2058: if (string_in_array($all_block, $child->nodeName) || $child->nodeName eq 'br') {
2059: return($child);
2060: }
2061: if (string_in_array(\@splitable_inline, $child->nodeName)) {
2062: my $block = find_first_block($child, $all_block);
2063: if (defined $block) {
2064: return($block);
2065: }
2066: }
2067: }
2068: }
2069: return(undef);
2070: }
2071:
2072: # Creates clones of the ancestor containing the descendants before the node, at the node, and after the node.
2073: # returns a hash with: left, middle, right (left and right can be undef)
2074: sub clone_ancestor_around_node {
2075: my ($ancestor, $node) = @_;
2076: my $middle_node;
2077: my ($left, $middle, $right);
2078: for (my $child=$ancestor->firstChild; defined $child; $child=$child->nextSibling) {
2079: if ($child == $node || is_ancestor_of($child, $node)) {
2080: $middle_node = $child;
2081: last;
2082: }
2083: }
2084: if (!defined $middle_node) {
2085: die "error in split_ancestor_around_node: middle not found";
2086: }
2087: if (defined $middle_node->previousSibling) {
2088: $left = $ancestor->cloneNode(0);
2089: for (my $child=$ancestor->firstChild; $child != $middle_node; $child=$child->nextSibling) {
2090: $left->appendChild($child->cloneNode(1));
2091: }
2092: }
2093: $middle = $ancestor->cloneNode(0);
2094: if ($middle_node == $node) {
2095: $middle->appendChild($middle_node->cloneNode(1));
2096: } else {
2097: my $subres = clone_ancestor_around_node($middle_node, $node);
2098: my $subleft = $subres->{'left'};
2099: if (defined $subleft) {
2100: if (!defined $left) {
2101: $left = $ancestor->cloneNode(0);
2102: }
2103: $left->appendChild($subleft);
2104: }
2105: $middle->appendChild($subres->{'middle'});
2106: my $subright = $subres->{'right'};
2107: if (defined $subright) {
2108: $right = $ancestor->cloneNode(0);
2109: $right->appendChild($subright);
2110: }
2111: }
2112: if (defined $middle_node->nextSibling) {
2113: if (!defined $right) {
2114: $right = $ancestor->cloneNode(0);
2115: }
2116: for (my $child=$middle_node->nextSibling; defined $child; $child=$child->nextSibling) {
2117: $right->appendChild($child->cloneNode(1));
2118: }
2119: }
2120: my %result = ();
2121: $result{'left'} = $left;
2122: $result{'middle'} = $middle;
2123: $result{'right'} = $right;
2124: return(\%result);
2125: }
2126:
2127: sub is_ancestor_of {
2128: my ($n1, $n2) = @_;
2129: my $n = $n2->parentNode;
2130: while (defined $n) {
2131: if ($n == $n1) {
2132: return(1);
2133: }
2134: $n = $n->parentNode;
2135: }
2136: return(0);
2137: }
2138:
2139: # removes empty style elements and replaces the ones with only whitespaces inside by their content
2140: # also remove hints that have become empty after empty style removal.
2141: sub remove_empty_style {
2142: my ($root) = @_;
2143: # actually, preserve some elements like ins when they have whitespace, only remove if they are empty
2144: my @remove_if_empty = ('span', 'strong', 'em' , 'b', 'i', 'sup', 'sub', 'code', 'kbd', 'samp', 'tt', 'ins', 'del', 'var', 'small', 'big', 'font', 'u', 'hint');
2145: my @remove_if_blank = ('span', 'strong', 'em' , 'b', 'i', 'sup', 'sub', 'tt', 'var', 'small', 'big', 'font', 'u', 'hint');
2146: foreach my $name (@remove_if_empty) {
2147: my @nodes = $root->getElementsByTagName($name);
2148: while (scalar(@nodes) > 0) {
2149: my $node = pop(@nodes);
2150: if (!defined $node->firstChild) {
2151: my $parent = $node->parentNode;
2152: if (defined $node->previousSibling && $node->previousSibling->nodeType == XML_TEXT_NODE &&
2153: $node->previousSibling->nodeValue =~ /\$\S*$/) {
2154: # case $a<sup></sup>x
2155: my $value = $node->previousSibling->nodeValue;
2156: $value =~ s/\$(\S*)$/\$\{$1\}/;
2157: $node->previousSibling->setData($value);
2158: }
2159: $parent->removeChild($node);
2160: $parent->normalize();
2161: # now that we removed the node, check if the parent has become an empty style, and so on
2162: while (defined $parent && string_in_array(\@remove_if_empty, $parent->nodeName) && !defined $parent->firstChild) {
2163: my $grandparent = $parent->parentNode;
2164: $grandparent->removeChild($parent);
2165: remove_reference_from_array(\@nodes, $parent);
2166: $parent = $grandparent;
2167: }
2168: }
2169: }
2170: }
2171: foreach my $name (@remove_if_blank) {
2172: my @nodes = $root->getElementsByTagName($name);
2173: while (scalar(@nodes) > 0) {
2174: my $node = pop(@nodes);
2175: if (defined $node->firstChild && !defined $node->firstChild->nextSibling && $node->firstChild->nodeType == XML_TEXT_NODE) {
2176: # NOTE: careful, with UTF-8, \s matches non-breaking spaces and we want to preserve these
2177: if ($node->firstChild->nodeValue =~ /^[\t\n\f\r ]*$/) {
2178: my $parent = $node->parentNode;
2179: replace_by_children($node);
2180: $parent->normalize();
2181: # now that we removed the node, check if the parent has become a style with only whitespace, and so on
2182: while (defined $parent && string_in_array(\@remove_if_blank, $parent->nodeName) &&
2183: (!defined $parent->firstChild ||
2184: (!defined $parent->firstChild->nextSibling && $parent->firstChild->nodeType == XML_TEXT_NODE &&
2185: $parent->firstChild->nodeValue =~ /^^[\t\n\f\r ]*$/))) {
2186: my $grandparent = $parent->parentNode;
2187: replace_by_children($parent);
2188: remove_reference_from_array(\@nodes, $parent);
2189: $parent = $grandparent;
2190: }
2191: }
2192: }
2193: }
2194: }
2195: }
2196:
2197: # remove whitespace inside LON-CAPA elements that have an empty content-model (HTML ones are handled by html_to_xml)
2198: sub fix_empty_lc_elements {
2199: my ($node) = @_;
2200: my @lcempty = ('arc','axis','backgroundplot','drawoptionlist','drawvectorsum','fill','functionplotrule','functionplotvectorrule','functionplotvectorsumrule','hiddenline','hiddensubmission','key','line','location','organicstructure','parameter','plotobject','plotvector','responseparam','spline','textline');
2201: if (string_in_array(\@lcempty, $node->nodeName)) {
2202: if (defined $node->firstChild && !defined $node->firstChild->nextSibling &&
2203: $node->firstChild->nodeType == XML_TEXT_NODE && $node->firstChild->nodeValue =~ /^\s*$/) {
2204: $node->removeChild($node->firstChild);
2205: }
2206: if (defined $node->firstChild) {
2207: print "Warning: a ".$node->nodeName." has something inside\n";
2208: }
2209: return;
2210: }
2211: for (my $child=$node->firstChild; defined $child; $child=$child->nextSibling) {
2212: if ($child->nodeType == XML_ELEMENT_NODE) {
2213: fix_empty_lc_elements($child);
2214: }
2215: }
2216: }
2217:
2218: # turn some attribute values into lowercase when they should be
2219: sub lowercase_attribute_values {
2220: my ($root) = @_;
2221: my @with_yesno = (['radiobuttonresponse', ['randomize']],
2222: ['optionresponse', ['randomize']],
2223: ['matchresponse', ['randomize']],
2224: ['itemgroup', ['randomize']],
2225: ['rankresponse', ['randomize']],
2226: ['functionplotresponse', ['xaxisvisible', 'yaxisvisible', 'gridvisible']],
2227: ['backgroundplot', ['fixed']],
2228: ['drawvectorsum', ['showvalue']],
2229: ['textline', ['readonly']],
2230: ['hint', ['showoncorrect']],
2231: ['body', ['dir']],
2232: ['img', ['encrypturl']],
2233: ['form', ['method']],
2234: ['input', ['type']]
2235: );
2236: foreach my $el_attributes (@with_yesno) {
2237: my $el_name = $el_attributes->[0];
2238: my @elements = $root->getElementsByTagName($el_name);
2239: foreach my $element (@elements) {
2240: my $att_list = $el_attributes->[1];
2241: foreach my $att_name (@$att_list) {
2242: my $att_value = $element->getAttribute($att_name);
2243: if (!defined $att_value) {
2244: next;
2245: }
2246: if ($att_value eq 'yes' || $att_value eq 'no') {
2247: next;
2248: }
2249: if ($att_value =~ /\s*yes\s*/i) {
2250: $element->setAttribute($att_name, 'yes');
2251: } elsif ($att_value =~ /\s*no\s*/i) {
2252: $element->setAttribute($att_name, 'no');
2253: }
2254: }
2255: }
2256: }
2257: }
2258:
2259: # fixes spelling mistakes for numericalresponse/@unit
2260: sub replace_numericalresponse_unit_attribute {
2261: my ($root) = @_;
2262: my @numericalresponses = $root->getElementsByTagName('numericalresponse');
2263: foreach my $numericalresponse (@numericalresponses) {
2264: if (defined $numericalresponse->getAttribute('units') && !defined $numericalresponse->getAttribute('unit')) {
2265: $numericalresponse->setAttribute('unit', $numericalresponse->getAttribute('units'));
2266: $numericalresponse->removeAttribute('units');
2267: }
2268: }
2269:
2270: }
2271:
2272: # Replaces &format and &prettyprint by <num> whenever possible.
2273: # Also replaces &chemparse by <chem>.
2274: # If the function call is enclosed in <display>, the <display> element is removed.
2275: sub replace_functions_by_elements {
2276: my ($root) = @_;
2277: my $doc = $root->ownerDocument;
2278: my @preserve = ('script','answer','parse','m','tm','dtm','numericalhintscript'); # display is handled later
2279: my @all = $root->getElementsByTagName('*');
2280: foreach my $element (@all) {
2281: if (string_in_array(\@preserve, $element->nodeName)) {
2282: next;
2283: }
2284: my $changed = 0;
2285: my $next;
2286: for (my $child=$element->firstChild; defined $child; $child=$next) {
2287: $next = $child->nextSibling;
2288: if ($child->nodeType == XML_TEXT_NODE) {
2289: my $value = $child->nodeValue;
2290: if ($value =~ /^(.*)&(?:format|prettyprint)\((\$\{?[a-zA-Z0-9]*\}?(?:\[[^\]]*\])?|[0-9.]+)\s?,\s?(["'][,.\$]?[0-9][eEfFgGsS]["']|\$[a-zA-Z0-9]*)\)(.*)$/s) {
2291: # NOTE: we don't check for &prettyprint's 3rd argument (target), but it has not been seen outside of script elements.
2292: # NOTE: the format options ',' and '$' are not supported by &format in current LON-CAPA since rev 1.81 of default_homework.lcpm,
2293: # but are supported by &prettyprint;
2294: # if we use (like current LON-CAPA) &prettyprint for <num> implementation, it will change a few resulting documents
2295: # (by making them display something they were probably intended to display, but which did not).
2296: # Usage of <num> with &prettyprint instead of &format might also change the display when there is an exponent.
2297: my $before = $1;
2298: my $number = $2;
2299: my $format = $3;
2300: my $after = $4;
2301: $format =~ s/^['"]|['"]$//g;
2302: # do not change this if the parent is <display> and there are other things before or after &format
2303: if ($element->nodeName eq 'display' && (defined $child->previousSibling || defined $next ||
2304: $before !~ /^\s*$/ || $after !~ /^\s*$/)) {
2305: last;
2306: }
2307: my $replacement = $doc->createDocumentFragment();
2308: my $num = $doc->createElement('num');
2309: $num->setAttribute('format', $format);
2310: $num->appendChild($doc->createTextNode($number));
2311: if (length($before) > 0) {
2312: $replacement->appendChild($doc->createTextNode($before));
2313: }
2314: $replacement->appendChild($num);
2315: if (length($after) > 0) {
2316: $replacement->appendChild($doc->createTextNode($after));
2317: }
2318: $element->replaceChild($replacement, $child);
2319: $changed = 1;
2320: $next = $element->firstChild; # start over, there might be another &format in the same text node
2321: } elsif ($value =~ /^(.*)&chemparse\(([^'"()]*|'[^']*'|"[^"]*")\)(.*)$/s) {
2322: my $before = $1;
2323: my $reaction = $2;
2324: my $after = $3;
2325: $reaction =~ s/^'(.*)'$/$1/;
2326: $reaction =~ s/^"(.*)"$/$1/;
2327: if ($element->nodeName eq 'display' && (defined $child->previousSibling || defined $next ||
2328: $before !~ /^\s*$/ || $after !~ /^\s*$/)) {
2329: last;
2330: }
2331: my $replacement = $doc->createDocumentFragment();
2332: my $chem = $doc->createElement('chem');
2333: $chem->appendChild($doc->createTextNode($reaction));
2334: if (length($before) > 0) {
2335: $replacement->appendChild($doc->createTextNode($before));
2336: }
2337: $replacement->appendChild($chem);
2338: if (length($after) > 0) {
2339: $replacement->appendChild($doc->createTextNode($after));
2340: }
2341: $element->replaceChild($replacement, $child);
2342: $changed = 1;
2343: $next = $element->firstChild;
2344: }
2345: }
2346: }
2347: if ($changed && $element->nodeName eq 'display') {
2348: my $first = $element->firstChild;
2349: if ($first->nodeType == XML_ELEMENT_NODE && string_in_array(['num','chem'], $first->nodeName) &&
2350: !defined $first->nextSibling) {
2351: # remove useless display element
2352: replace_by_children($element);
2353: }
2354: }
2355: }
2356: }
2357:
2358: # pretty-print using im-memory DOM tree
2359: sub pretty {
2360: my ($node, $all_block, $indent_level) = @_;
2361: my $doc = $node->ownerDocument;
2362: $indent_level ||= 0;
2363: my $type = $node->nodeType;
2364: if ($type == XML_ELEMENT_NODE) {
2365: my $name = $node->nodeName;
2366: if ((string_in_array($all_block, $name) || string_in_array(\@inline_like_block, $name)) &&
2367: !string_in_array(\@preserve_elements, $name)) {
2368: # make sure there is a newline at the beginning and at the end if there is anything inside
2369: if (defined $node->firstChild && !string_in_array(\@no_newline_inside, $name)) {
2370: my $first = $node->firstChild;
2371: if ($first->nodeType == XML_TEXT_NODE) {
2372: my $text = $first->nodeValue;
2373: if ($text !~ /^ *\n/) {
2374: $first->setData("\n" . $text);
2375: }
2376: } else {
2377: $node->insertBefore($doc->createTextNode("\n"), $first);
2378: }
2379: my $last = $node->lastChild;
2380: if ($last->nodeType == XML_TEXT_NODE) {
2381: my $text = $last->nodeValue;
2382: if ($text !~ /\n *$/) {
2383: $last->setData($text . "\n");
2384: }
2385: } else {
2386: $node->appendChild($doc->createTextNode("\n"));
2387: }
2388: }
2389:
2390: # indent and make sure there is a newline before and after a block element
2391: my $newline_indent = "\n".(' ' x (2*($indent_level + 1)));
2392: my $newline_indent_last = "\n".(' ' x (2*$indent_level));
2393: my $next;
2394: for (my $child=$node->firstChild; defined $child; $child=$next) {
2395: $next = $child->nextSibling;
2396: if ($child->nodeType == XML_ELEMENT_NODE) {
2397: if (string_in_array($all_block, $child->nodeName) || string_in_array(\@inline_like_block, $child->nodeName)) {
2398: # make sure there is a newline before and after a block element
2399: if (defined $child->previousSibling && $child->previousSibling->nodeType == XML_TEXT_NODE) {
2400: my $prev = $child->previousSibling;
2401: my $text = $prev->nodeValue;
2402: if ($text !~ /\n *$/) {
2403: $prev->setData($text . $newline_indent);
2404: }
2405: } else {
2406: $node->insertBefore($doc->createTextNode($newline_indent), $child);
2407: }
2408: if (defined $next && $next->nodeType == XML_TEXT_NODE) {
2409: my $text = $next->nodeValue;
2410: if ($text !~ /^ *\n/) {
2411: $next->setData($newline_indent . $text);
2412: }
2413: } else {
2414: $node->insertAfter($doc->createTextNode($newline_indent), $child);
2415: }
2416: }
2417: pretty($child, $all_block, $indent_level+1);
2418: } elsif ($child->nodeType == XML_TEXT_NODE) {
2419: my $text = $child->nodeValue;
2420: # collapse newlines
2421: $text =~ s/\n([\t ]*\n)+/\n/g;
2422: # indent and remove spaces and tabs before newlines
2423: if (defined $next) {
2424: $text =~ s/[\t ]*\n[\t ]*/$newline_indent/ge;
2425: } else {
2426: $text =~ s/[\t ]*\n[\t ]*/$newline_indent/ge;
2427: $text =~ s/[\t ]*\n[\t ]*$/$newline_indent_last/e;
2428: }
2429: $child->setData($text);
2430: }
2431: }
2432:
2433: # removes whitespace at the beginning and end of p td, th and li (except for nbsp at the beginning)
2434: my @to_trim = ('p','td','th','li');
2435: if (string_in_array(\@to_trim, $name) && defined $node->firstChild && $node->firstChild->nodeType == XML_TEXT_NODE) {
2436: my $text = $node->firstChild->nodeValue;
2437: $text =~ s/^[ \t\f\n\r]*//;
2438: if ($text eq '') {
2439: $node->removeChild($node->firstChild);
2440: } else {
2441: $node->firstChild->setData($text);
2442: }
2443: }
2444: if (string_in_array(\@to_trim, $name) && defined $node->lastChild && $node->lastChild->nodeType == XML_TEXT_NODE) {
2445: my $text = $node->lastChild->nodeValue;
2446: $text =~ s/\s*$//;
2447: if ($text eq '') {
2448: $node->removeChild($node->lastChild);
2449: } else {
2450: $node->lastChild->setData($text);
2451: }
2452: }
2453: } elsif (string_in_array(\@preserve_elements, $name)) {
2454: # collapse newlines at the beginning and the end of scripts
2455: if (defined $node->firstChild && $node->firstChild->nodeType == XML_TEXT_NODE) {
2456: my $text = $node->firstChild->nodeValue;
2457: $text =~ s/^\n( *\n)+/\n/;
2458: if ($text eq '') {
2459: $node->removeChild($node->firstChild);
2460: } else {
2461: $node->firstChild->setData($text);
2462: }
2463: }
2464: if (defined $node->lastChild && $node->lastChild->nodeType == XML_TEXT_NODE) {
2465: my $text = $node->lastChild->nodeValue;
2466: $text =~ s/\n( *\n)+$/\n/;
2467: if ($text eq '') {
2468: $node->removeChild($node->lastChild);
2469: } else {
2470: $node->lastChild->setData($text);
2471: }
2472: }
2473: }
2474: }
2475: }
2476:
2477: sub replace_tm_dtm {
2478: my ($root) = @_;
2479: my $doc = $root->ownerDocument;
2480: my @elements = $root->getElementsByTagName('tm');
2481: push(@elements, $root->getElementsByTagName('dtm'));
2482: foreach my $element (@elements) {
2483: my $first = $element->firstChild;
2484: if (defined $first && $first->nodeType == XML_TEXT_NODE) {
2485: my $text = $first->nodeValue;
2486: if ($element->nodeName eq 'tm') {
2487: $first->setData('$'.$text.'$');
2488: } else {
2489: $first->setData('$$'.$text.'$$');
2490: }
2491: }
2492: $element->setNodeName('m');
2493: }
2494: }
2495:
2496:
2497: ######## utilities ########
2498:
2499: ##
2500: # Trims a string (really, this should be built-in in Perl, this is ridiculous, ugly and slow)
2501: # @param {string} s - the string to trim
2502: # @returns the trimmed string
2503: ##
2504: sub trim {
2505: my ($s) = @_;
2506: $s =~ s/^\s+//;
2507: $s =~ s/\s+$//;
2508: return($s);
2509: }
2510:
2511: ##
2512: # Tests if a string is in an array (using eq) (to avoid Smartmatch warnings with $value ~~ @array)
2513: # @param {Array<string>} array - reference to the array of strings
2514: # @param {string} value - the string to look for
2515: # @returns 1 if found, 0 otherwise
2516: ##
2517: sub string_in_array {
2518: my ($array, $value) = @_;
1.4 ! damieng 2519: # NOTE: would it be faster to use "any { $value eq $_ } @{$array}" from List::Util ?
1.1 damieng 2520: foreach my $v (@{$array}) {
2521: if ($v eq $value) {
2522: return 1;
2523: }
2524: }
2525: return 0;
2526: }
2527:
2528: ##
2529: # Tests if an object is in an array (using ==)
2530: # @param {Array<Object>} array - reference to the array of references
2531: # @param {Object} ref - the reference to look for
2532: # @returns 1 if found, 0 otherwise
2533: ##
2534: sub reference_in_array {
2535: my ($array, $ref) = @_;
2536: foreach my $v (@{$array}) {
2537: if ($v == $ref) {
2538: return 1;
2539: }
2540: }
2541: return 0;
2542: }
2543:
2544: ##
2545: # returns the index of a string in an array
2546: # @param {Array<Object>} array - reference to the array of strings
2547: # @param {string} s - the string to look for (using eq)
2548: # @returns the index if found, -1 otherwise
2549: ##
2550: sub index_of_string {
2551: my ($array, $s) = @_;
2552: for (my $i=0; $i<scalar(@{$array}); $i++) {
2553: if ($array->[$i] eq $s) {
2554: return $i;
2555: }
2556: }
2557: return -1;
2558: }
2559:
2560: ##
2561: # returns the index of a reference in an array
2562: # @param {Array<Object>} array - reference to the array of references
2563: # @param {Object} ref - the reference to look for
2564: # @returns the index if found, -1 otherwise
2565: ##
2566: sub index_of_reference {
2567: my ($array, $ref) = @_;
2568: for (my $i=0; $i<scalar(@{$array}); $i++) {
2569: if ($array->[$i] == $ref) {
2570: return $i;
2571: }
2572: }
2573: return -1;
2574: }
2575:
2576: ##
2577: # if found, removes a string from an array, otherwise do nothing
2578: # @param {Array<string>} array - reference to the array of string
2579: # @param {string} s - the string to look for (using eq)
2580: ##
2581: sub remove_string_from_array {
2582: my ($array, $s) = @_;
2583: my $index = index_of_string($array, $s);
2584: if ($index != -1) {
2585: splice(@$array, $index, 1);
2586: }
2587: }
2588:
2589: ##
2590: # if found, removes a reference from an array, otherwise do nothing
2591: # @param {Array<Object>} array - reference to the array of references
2592: # @param {Object} ref - the reference to look for
2593: ##
2594: sub remove_reference_from_array {
2595: my ($array, $ref) = @_;
2596: my $index = index_of_reference($array, $ref);
2597: if ($index != -1) {
2598: splice(@$array, $index, 1);
2599: }
2600: }
2601:
2602: ##
2603: # replaces a node by its children
2604: # @param {Node} node - the DOM node
2605: ##
2606: sub replace_by_children {
2607: my ($node) = @_;
2608: my $parent = $node->parentNode;
2609: my $next;
2610: my $previous;
2611: for (my $child=$node->firstChild; defined $child; $child=$next) {
2612: $next = $child->nextSibling;
2613: if ((!defined $previous || !defined $next) &&
2614: $child->nodeType == XML_TEXT_NODE && $child->nodeValue =~ /^\s*$/) {
2615: next; # do not keep first and last whitespace nodes
2616: } else {
2617: if (!defined $previous && $child->nodeType == XML_TEXT_NODE) {
2618: # remove whitespace at the beginning
2619: my $value = $child->nodeValue;
2620: $value =~ s/^\s+//;
2621: $child->setData($value);
2622: }
2623: if (!defined $next && $child->nodeType == XML_TEXT_NODE) {
2624: # and at the end
2625: my $value = $child->nodeValue;
2626: $value =~ s/\s+$//;
2627: $child->setData($value);
2628: }
2629: }
2630: $node->removeChild($child);
2631: $parent->insertBefore($child, $node);
2632: $previous = $child;
2633: }
2634: $parent->removeChild($node);
2635: }
2636:
2637: ##
2638: # returns the trimmed attribute value if the attribute exists and is not blank, undef otherwise
2639: # @param {Node} node - the DOM node
2640: # @param {string} attribute_name - the attribute name
2641: ##
2642: sub get_non_empty_attribute {
2643: my ($node, $attribute_name) = @_;
2644: my $value = $node->getAttribute($attribute_name);
2645: if (defined $value && $value !~ /^\s*$/) {
2646: $value = trim($value);
2647: return($value);
2648: }
2649: return(undef);
2650: }
2651:
2652: ##
2653: # Returns a CSS property value from the style attribute of the element, or undef if not defined
2654: # @param {Element} el - the DOM element
2655: # @param {string} property_name - the CSS property name
2656: ##
2657: sub get_css_property {
2658: my ($el, $property_name) = @_;
2659: my $style = $el->getAttribute('style');
2660: if (defined $style) {
2661: $style =~ s/^\s*;\s*//;
2662: $style =~ s/\s*;\s*$//;
2663: } else {
2664: $style = '';
2665: }
2666: my @pairs = split(';', $style);
2667: foreach my $pair (@pairs) {
2668: my @name_value = split(':', $pair);
2669: if (scalar(@name_value) != 2) {
2670: next;
2671: }
2672: my $name = trim($name_value[0]);
2673: my $value = trim($name_value[1]);
2674: if (lc($name) eq $property_name) {
2675: return($value); # return the first one found
2676: }
2677: }
2678: return(undef);
2679: }
2680:
2681: ##
2682: # Returns the reference to a hash CSS property name => value from the style attribute of the element.
2683: # Returns an empty list if the style attribute is not defined,
2684: # @param {Element} el - the DOM element
2685: # @return {Hash<string, string>} reference to the hash property name => property value
2686: ##
2687: sub get_css_properties {
2688: my ($el) = @_;
2689: my $style = $el->getAttribute('style');
2690: if (defined $style) {
2691: $style =~ s/^\s*;\s*//;
2692: $style =~ s/\s*;\s*$//;
2693: } else {
2694: $style = '';
2695: }
2696: my @pairs = split(';', $style);
2697: tie (my %hash, 'Tie::IxHash', ());
2698: foreach my $pair (@pairs) {
2699: my @name_value = split(':', $pair);
2700: if (scalar(@name_value) != 2) {
2701: next;
2702: }
2703: my $name = trim($name_value[0]);
2704: my $value = trim($name_value[1]);
2705: if (defined $hash{$name}) {
2706: # duplicate property in the style attribute: keep only the last one
2707: delete $hash{$name};
2708: }
2709: $hash{$name} = $value;
2710: }
2711: return(\%hash);
2712: }
2713:
2714: ##
2715: # Sets a CSS property in the style attribute of an element
2716: # @param {Element} el - the DOM element
2717: # @param {string} property_name - the CSS property name
2718: # @param {string} property_value - the CSS property value
2719: ##
2720: sub set_css_property {
2721: my ($el, $property_name, $property_value) = @_;
2722: my $hash_ref = { $property_name => $property_value };
2723: set_css_properties($el, $hash_ref);
2724: }
2725:
2726: ##
2727: # Sets several CSS properties in the style attribute of an element
2728: # @param {Element} el - the DOM element
2729: # @param {Hash<string, string>} properties - reference to the hash property name => property value
2730: ##
2731: sub set_css_properties {
2732: my ($el, $properties) = @_;
2733: my $hash = get_css_properties($el);
2734: foreach my $property_name (keys %$properties) {
2735: my $property_value = $properties->{$property_name};
2736: if (defined $hash->{$property_name}) {
2737: delete $hash->{$property_name}; # to add the new one at the end
2738: }
2739: $hash->{$property_name} = $property_value;
2740: }
2741: my $style = '';
2742: foreach my $key (keys %$hash) {
2743: $style .= $key.':'.$hash->{$key}.'; ';
2744: }
2745: $style =~ s/; $//;
2746: $el->setAttribute('style', $style);
2747: }
2748:
2749: 1;
2750: __END__
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>