modules/damieng/clean_xml/post_xml.pm - view

File: [LON-CAPA] / modules / damieng / clean_xml / post_xml.pm
Revision 1.5: download - view: text, annotated - select for diffs
Fri Dec 18 22:15:47 2015 UTC (8 years, 10 months ago) by damieng
Branches: MAIN
CVS tags: HEAD

changed paragraph transformation for translated and lang

#!/usr/bin/perl package post_xml; use strict; use utf8; use warnings; use File::Basename; use File::Temp qw/ tempfile /; use Cwd 'abs_path'; use XML::LibXML; use HTML::TokeParser; # used to parse sty files use Tie::IxHash; # for ordered hashes use Env qw(RES_DIR); # path of res directory parent (without the / at the end) no warnings 'recursion'; # yes, fix_paragraph is using heavy recursion, I know # these are constants my @block_elements = ('parameter','location','answer','foil','image','polygon','rectangle','text','conceptgroup','itemgroup','item','label','data','function','array','unit','answergroup','functionplotresponse','functionplotruleset','functionplotelements','functionplotcustomrule','essayresponse','hintpart','formulahint','numericalhint','reactionhint','organichint','optionhint','radiobuttonhint','stringhint','customhint','mathhint','formulahintcondition','numericalhintcondition','reactionhintcondition','organichintcondition','optionhintcondition','radiobuttonhintcondition','stringhintcondition','customhintcondition','mathhintcondition','imageresponse','foilgroup','datasubmission','textfield','hiddensubmission','radiobuttonresponse','rankresponse','matchresponse','import','style','script','window','block','library','notsolved','part','postanswerdate','preduedate','problem','problemtype','randomlabel','bgimg','labelgroup','randomlist','solved','while','tex','print','web','gnuplot','curve','Task','IntroParagraph','ClosingParagraph','Question','QuestionText','Setup','Instance','InstanceText','Criteria','CriteriaText','GraderNote','languageblock','instructorcomment','dataresponse','togglebox','standalone','comment','drawimage','allow','displayduedate','displaytitle','responseparam','organicstructure','scriptlib','parserlib','drawoptionlist','spline','backgroundplot','plotobject','plotvector','drawvectorsum','functionplotrule','functionplotvectorrule','functionplotvectorsumrule','axis','key','xtics','ytics','title','xlabel','ylabel','hiddenline','dtm'); my @inline_like_block = ('stringresponse','optionresponse','numericalresponse','formularesponse','mathresponse','organicresponse','reactionresponse','customresponse','externalresponse', 'hint', 'hintgroup','translated','lang'); # inline elements treated like blocks for pretty print and some other things my @responses = ('stringresponse','optionresponse','numericalresponse','formularesponse','mathresponse','organicresponse','reactionresponse','customresponse','externalresponse','essayresponse','radiobuttonresponse','matchresponse','rankresponse','imageresponse','functionplotresponse'); my @block_html = ('html','head','body','section','h1','h2','h3','h4','h5','h6','div','p','ul','ol','li','table','tbody','tr','td','th','dl','dt','dd','pre','noscript','hr','address','blockquote','object','applet','embed','map','form','fieldset','iframe','center','frameset'); my @no_newline_inside = ('import','parserlib','scriptlib','data','function','label','xlabel','ylabel','tic','text','rectangle','image','title','h1','h2','h3','h4','h5','h6','li','td','p'); my @preserve_elements = ('script','answer','pre'); my @accepting_style = ('section','h1','h2','h3','h4','h5','h6','div','p','li','td','th','dt','dd','pre','blockquote'); my @latex_math = ('\alpha', '\theta', '\omicron', '\tau', '\beta', '\vartheta', '\pi', '\upsilon', '\gamma', '\gamma', '\varpi', '\phi', '\delta', '\kappa', '\rho', '\varphi', '\epsilon', '\lambda', '\varrho', '\chi', '\varepsilon', '\mu', '\sigma', '\psi', '\zeta', '\nu', '\varsigma', '\omega', '\eta', '\xi', '\Gamma', '\Lambda', '\Sigma', '\Psi', '\Delta', '\Xi', '\Upsilon', '\Omega', '\Theta', '\Pi', '\Phi', '\pm', '\cap', '\diamond', '\oplus', '\mp', '\cup', '\bigtriangleup', '\ominus', '\times', '\uplus', '\bigtriangledown', '\otimes', '\div', '\sqcap', '\triangleleft', '\oslash', '\ast', '\sqcup', '\triangleright', '\odot', '\star', '\vee', '\lhd$', '\bigcirc', '\circ', '\wedge', '\rhd$', '\dagger', '\bullet', '\setminus', '\unlhd$', '\ddagger', '\cdot', '\wr', '\unrhd$', '\amalg', '+', '-', '\leq', '\geq', '\equiv', '\models', '\prec', '\succ', '\sim', '\perp', '\preceq', '\succeq', '\simeq', '\mid', '\ll', '\gg', '\asymp', '\parallel', '\subset', '\supset', '\approx', '\bowtie', '\subseteq', '\supseteq', '\cong', '\Join$', '\sqsubset$', '\sqsupset$', '\neq', '\smile', '\sqsubseteq', '\sqsupseteq', '\doteq', '\frown', '\in', '\ni', '\propto', '\vdash', '\dashv', '\colon', '\ldotp', '\cdotp', '\leftarrow', '\longleftarrow', '\uparrow', '\Leftarrow', '\Longleftarrow', '\Uparrow', '\rightarrow', '\longrightarrow', '\downarrow', '\Rightarrow', '\Longrightarrow', '\Downarrow', '\leftrightarrow', '\longleftrightarrow', '\updownarrow', '\Leftrightarrow', '\Longleftrightarrow', '\Updownarrow', '\mapsto', '\longmapsto', '\nearrow', '\hookleftarrow', '\hookrightarrow', '\searrow', '\leftharpoonup', '\rightharpoonup', '\swarrow', '\leftharpoondown', '\rightharpoondown', '\nwarrow', '\rightleftharpoons', '\leadsto$', '\ldots', '\cdots', '\vdots', '\ddots', '\aleph', '\prime', '\forall', '\infty', '\hbar', '\emptyset', '\exists', '\Box$', '\imath', '\nabla', '\neg', '\Diamond$', '\jmath', '\surd', '\flat', '\triangle', '\ell', '\top', '\natural', '\clubsuit', '\wp', '\bot', '\sharp', '\diamondsuit', '\Re', '\|', '\backslash', '\heartsuit', '\Im', '\angle', '\partial', '\spadesuit', '\mho$', '\sum', '\bigcap', '\bigodot', '\prod', '\bigcup', '\bigotimes', '\coprod', '\bigsqcup', '\bigoplus', '\int', '\bigvee', '\biguplus', '\oint', '\bigwedge', '\arccos', '\cos', '\csc', '\exp', '\ker', '\limsup', '\min', '\sinh', '\arcsin', '\cosh', '\deg', '\gcd', '\lg', '\ln', '\Pr', '\sup', '\arctan', '\cot', '\det', '\hom', '\lim', '\log', '\sec', '\tan', '\arg', '\coth', '\dim', '\inf', '\liminf', '\max', '\sin', '\tanh', '\uparrow', '\Uparrow', '\downarrow', '\Downarrow', '\updownarrow', '\Updownarrow', '\lfloor', '\rfloor', '\lceil', '\rceil', '\langle', '\rangle', '\backslash', '\rmoustache', '\lmoustache', '\rgroup', '\lgroup', '\arrowvert', '\Arrowvert', '\bracevert', '\hat{', '\acute{', '\bar{', '\dot{', '\breve{', '\check{', '\grave{', '\vec{', '\ddot{', '\tilde{', '\widetilde{', '\widehat{', '\overleftarrow{', '\overrightarrow{', '\overline{', '\underline{', '\overbrace{', '\underbrace{', '\sqrt{', '\sqrt[', '\frac{' ); # list of elements that can contain style elements: my @containing_styles = ('library','problem',@responses,'foil','item','text','hintgroup','hintpart','label','part','preduedate','postanswerdate','solved','notsolved','block','while','web','standalone','problemtype','languageblock','translated','lang','window','windowlink','togglebox','instructorcomment','body','section','div','p','li','dd','td','th','blockquote','object','applet','video','audio','canvas','fieldset','button', 'span','strong','em','b','i','sup','sub','code','kbd','samp','tt','ins','del','var','small','big','u','font'); my @html_styles = ('span', 'strong', 'em' , 'b', 'i', 'sup', 'sub', 'tt', 'var', 'small', 'big', 'u'); # Parses the XML document and fixes many things to turn it into a LON-CAPA 3 document # Returns the text of the document. sub post_xml { my ($textref, $new_path) = @_; my $dom_doc = XML::LibXML->load_xml(string => $textref); my $root = fix_structure($dom_doc); remove_elements($root, ['startouttext','startoutext','startottext','startouttex','startouttect','atartouttext','starouttext','starttextout','starttext','starttextarea','endouttext','endoutext','endoutttext','endouttxt','endouutext','ednouttext','endouttex','endoouttext','endouttest','endtextout','endtextarea','startpartmarker','endpartmarker','basefont','x-claris-tagview','x-claris-window','x-sas-window']); remove_empty_attributes($root); fix_attribute_case($root); my $fix_by_hand = replace_m($root); my @all_block = (@block_elements, @block_html); add_sty_blocks($new_path, $root, \@all_block); # must come before the subs using @all_block fix_block_styles($root, \@all_block); $root->normalize(); fix_fonts($root, \@all_block); replace_u($root); remove_bad_cdata_sections($root); add_cdata_sections($root); fix_style_element($root); fix_tables($root); fix_lists($root); fix_wrong_name_for_img($root); # should be before replace_deprecated_attributes_by_css replace_deprecated_attributes_by_css($root); replace_center($root, \@all_block); # must come after replace_deprecated_attributes_by_css replace_nobr($root); remove_useless_notsolved($root); fix_paragraphs_inside($root, \@all_block); remove_empty_style($root); fix_empty_lc_elements($root); lowercase_attribute_values($root); replace_numericalresponse_unit_attribute($root); replace_functions_by_elements($root); pretty($root, \@all_block); replace_tm_dtm($root); open my $out, '>', $new_path; print $out $dom_doc->toString(); # byte string ! close $out; if ($fix_by_hand) { die "The file has been converted but it should be fixed by hand."; } } sub fix_structure { my ($doc) = @_; # the 'loncapa' root element has already been added in pre_xml my $root = $doc->documentElement; # inside the root, replace html, problem and library elements by their content my @toreplace = ('html','problem','library'); foreach my $name (@toreplace) { my @elements = $root->getElementsByTagName($name); foreach my $element (@elements) { replace_by_children($element); } } # insert all link and style elements inside a new head element my $current_node = undef; my @heads = $doc->getElementsByTagName('head'); my @links = $doc->getElementsByTagName('link'); my @styles = $doc->getElementsByTagName('style'); my @titles = $doc->getElementsByTagName('title'); if (scalar(@titles) > 0) { # NOTE: there is a title element in gnuplot, not to be confused with the one inside HTML head for (my $i=0; $i<scalar(@titles); $i++) { my $title = $titles[$i]; my $found_gnuplot = 0; my $ancestor = $title->parentNode; while (defined $ancestor) { if ($ancestor->nodeName eq 'gnuplot') { $found_gnuplot = 1; last; } $ancestor = $ancestor->parentNode; } if ($found_gnuplot) { splice(@titles, $i, 1); $i--; } } } if (scalar(@heads) > 0 || scalar(@titles) > 0 || scalar(@links) > 0 || scalar(@styles) > 0) { my $htmlhead = $doc->createElement('head'); foreach my $head (@heads) { my $next; for (my $child=$head->firstChild; defined $child; $child=$next) { $next = $child->nextSibling; $head->removeChild($child); if ($child->nodeType != XML_ELEMENT_NODE || string_in_array(['title','script','style','meta','link','import','base'], $child->nodeName)) { $htmlhead->appendChild($child); } else { # this should not be in head insert_after_or_first($root, $child, $current_node); } } $head->parentNode->removeChild($head); } foreach my $child (@titles, @links, @styles) { $child->parentNode->removeChild($child); $htmlhead->appendChild($child); } insert_after_or_first($root, $htmlhead, $current_node); $current_node = $htmlhead; } # body my $htmlbody = undef; my @bodies = $doc->getElementsByTagName('body'); if (scalar(@bodies) > 0) { # TODO: fix content and position of body elements if ($root->nodeName eq 'problem') { foreach my $body (@bodies) { replace_by_children($body); } } } # add all the meta elements afterwards when they are LON-CAPA meta. Remove all HTML meta. my @meta_names = ('abstract','author','authorspace','avetries','avetries_list','clear','comefrom','comefrom_list','copyright','correct','count','course','course_list','courserestricted','creationdate','dependencies','depth','difficulty','difficulty_list','disc','disc_list','domain','end','field','firstname','generation','goto','goto_list','groupname','helpful','highestgradelevel','hostname','id','keynum','keywords','language','lastname','lastrevisiondate','lowestgradelevel','middlename','mime','modifyinguser','notes','owner','permanentemail','scope','sequsage','sequsage_list','standards','start','stdno','stdno_list','subject','technical','title','url','username','value','version'); my @metas = $doc->getElementsByTagName('meta'); foreach my $meta (@metas) { $meta->parentNode->removeChild($meta); my $name = $meta->getAttribute('name'); my $content = $meta->getAttribute('content'); if (defined $name && defined $content && string_in_array(\@meta_names, lc($name))) { my $lcmeta = $doc->createElement('meta'); $lcmeta->setAttribute('name', lc($name)); $lcmeta->setAttribute('content', $content); insert_after_or_first($root, $lcmeta, $current_node); $current_node = $lcmeta; } } return($root); } # insert the new child under parent after the reference child, or as the first child if the reference child is not defined sub insert_after_or_first { my ($parent, $newchild, $refchild) = @_; if (defined $refchild) { $parent->insertAfter($newchild, $refchild); } elsif (defined $parent->firstChild) { $parent->insertBefore($newchild, $parent->firstChild); } else { $parent->appendChild($newchild); } } # removes all elements with given names inside the node, but keep the content sub remove_elements { my ($node, $to_remove) = @_; my $nextChild; for (my $child=$node->firstChild; defined $child; $child=$nextChild) { $nextChild = $child->nextSibling; my $type = $node->nodeType; if ($type == XML_ELEMENT_NODE) { if (string_in_array($to_remove, $child->nodeName)) { my $first_non_white = $child->firstChild; if (defined $first_non_white && $first_non_white->nodeType == XML_TEXT_NODE && $first_non_white->nodeValue =~ /^\s*$/) { $first_non_white = $first_non_white->nextSibling; } if (defined $first_non_white) { $nextChild = $first_non_white; replace_by_children($child); } else { $node->removeChild($child); } } else { remove_elements($child, $to_remove); } } } } # removes some attributes that have an invalid empty value sub remove_empty_attributes { my ($root) = @_; my $doc = $root->ownerDocument; # this list is based on validation errors in the MSU subset (it could be more complete if it was based on the schema) my @attributes = ( ['curve', ['pointsize']], ['foil', ['location']], ['foilgroup', ['checkboxoptions', 'options', 'texoptions']], ['gnuplot', ['pattern', 'texwidth']], ['img', ['height', 'texheight', 'texwidth', 'texwrap', 'width']], ['import', ['importmode']], ['optionresponse', ['max']], ['organicstructure', ['options']], ['radiobuttonresponse', ['max']], ['randomlabel', ['height', 'texwidth', 'width']], ['stringresponse', ['type']], ['textline', ['size']], ); foreach my $element_attributes (@attributes) { my $element_name = $element_attributes->[0]; my $attribute_names = $element_attributes->[1]; my @elements = $doc->getElementsByTagName($element_name); foreach my $element (@elements) { foreach my $attribute_name (@$attribute_names) { my $value = $element->getAttribute($attribute_name); if (defined $value && $value =~ /^\s*$/) { $element->removeAttribute($attribute_name); } } } } } # fixes the case for a few attributes that are not all lowercase # (the HTML parser used in html_to_xml turns everything lowercase, which is a good thing in general) sub fix_attribute_case { my ($root) = @_; my $doc = $root->ownerDocument; my @attributes = ( ['labelgroup', ['TeXsize']], ['h1', ['TeXsize']], ['h2', ['TeXsize']], ['h3', ['TeXsize']], ['h4', ['TeXsize']], ['h5', ['TeXsize']], ['h6', ['TeXsize']], # font and basefont have a TeXsize but will be removed ['optionresponse', ['TeXlayout']], ['itemgroup', ['TeXitemgroupwidth']], ['Task', ['OptionalRequired']], ['Question', ['OptionalRequired','Mandatory']], ['Instance', ['OptionalRequired','Disabled']], ['Criteria', ['Mandatory']], ['table', ['TeXwidth','TeXtheme']], ['td', ['TeXwidth']], ['th', ['TeXwidth']], ['img', ['TeXwidth','TeXheight','TeXwrap']], ); foreach my $element_attributes (@attributes) { my $element_name = $element_attributes->[0]; my $attribute_names = $element_attributes->[1]; my @elements = $doc->getElementsByTagName($element_name); foreach my $element (@elements) { foreach my $attribute_name (@$attribute_names) { my $value = $element->getAttribute(lc($attribute_name)); if (defined $value) { $element->removeAttribute(lc($attribute_name)); $element->setAttribute($attribute_name, $value); } } } } } # Replaces m by HTML, tm and/or dtm (which will be replaced by <m> later, but they are useful # to know if the element is a block element or not). # m might contain non-math LaTeX, while tm and dtm may only contain math. # Returns 1 if the file should be fixed by hand, 0 otherwise. sub replace_m { my ($root) = @_; my $doc = $root->ownerDocument; my $fix_by_hand = 0; # search for variable declarations my @variables = (); my @scripts = $root->getElementsByTagName('script'); foreach my $script (@scripts) { my $type = $script->getAttribute('type'); if (defined $type && $type eq 'loncapa/perl') { if (defined $script->firstChild && $script->firstChild->nodeType == XML_TEXT_NODE) { my $text = $script->firstChild->nodeValue; # NOTE: we are not interested in replacing "@value", only "$value" # this regexp is for " $a = ..." and " $a[...] = ..." while ($text =~ /^[ \t]*\$([a-zA-Z_0-9]+)(?:\[[^\]]+\])?[ \t]*=/gm) { if (!string_in_array(\@variables, $1)) { push(@variables, $1); } } # this regexp is for "...; $a = ..." and "...; $a[...] = ..." while ($text =~ /^[^'"\/;]+;[ \t]*\$([a-zA-Z_0-9]+)(?:\[[^\]]+\])?[ \t]*=/gm) { if (!string_in_array(\@variables, $1)) { push(@variables, $1); } } # this regexp is for " @a = ..." while ($text =~ /^[ \t]*\@([a-zA-Z_0-9]+)[ \t]*=/gm) { if (!string_in_array(\@variables, $1)) { push(@variables, $1); } } # this regexp is for " ($a, $b, $c) = ..." my @matches = ($text =~ /^[ \t]*$[ \t]*\$([a-zA-Z_0-9]+)(?:[ \t]*,[ \t]*\$([a-zA-Z_0-9]+))*[ \t]*$[ \t]*=/gm); foreach my $match (@matches) { if (!defined $match) { next; # not sure why it happens, but it does } if (!string_in_array(\@variables, $match)) { push(@variables, $match); } } # and this one is for "push @a" while ($text =~ /^[ \t]*push @([a-zA-Z_0-9]+)[ \t,]*/gm) { if (!string_in_array(\@variables, $1)) { push(@variables, $1); } } # use the opportunity to report usage of <m> in Perl scripts if ($text =~ /^[^#].*<m[ >]/m) { print "WARNING: <m> is used in a script, it should be converted by hand\n"; $fix_by_hand = 1; } } } } my @ms = $root->getElementsByTagName('m'); foreach my $m (@ms) { if (!defined $m->firstChild) { $m->parentNode->removeChild($m); next; } if (defined $m->firstChild->nextSibling || $m->firstChild->nodeType != XML_TEXT_NODE) { print "WARNING: m value is not simple text\n"; $fix_by_hand = 1; next; } my $text = $m->firstChild->nodeValue; my $text_before_variable_replacement = $text; my $var_key1 = 'dfhg3df54hg65hg4'; my $var_key2 = 'dfhg654d6f5g4h5f'; my $eval = defined $m->getAttribute('eval') && $m->getAttribute('eval') eq 'on'; if ($eval) { # replace variables foreach my $variable (@variables) { my $replacement = $var_key1.$variable.$var_key2; $text =~ s/\$$variable(?![a-zA-Z])/$replacement/ge; $text =~ s/\$\{$variable\}/$replacement/ge; } } # check if the expression is enclosed in math separators: $ $$  \[ \] # if so, replace the whole node by dtm or tm my $new_text; my $new_node_name; if ($text =~ /^\s*\$\$([^\$]*)\$\$\s*$/) { $new_node_name = 'dtm'; $new_text = $1; } elsif ($text =~ /^\s*\\\[(.*)\\\]\s*$/s) { $new_node_name = 'dtm'; $new_text = $1; } elsif ($text =~ /^\s*\$([^\$]*)\$\s*$/) { $new_node_name = 'tm'; $new_text = $1; } elsif ($text =~ /^\s*\\$(.*)\\$\s*$/s) { $new_node_name = 'tm'; $new_text = $1; } if (defined $new_node_name) { if ($eval) { foreach my $variable (@variables) { my $replacement = $var_key1.$variable.$var_key2; $new_text =~ s/$replacement([a-zA-Z])/\${$variable}$1/g; $new_text =~ s/$replacement/\$$variable/g; } } my $new_node = $doc->createElement($new_node_name); if ($eval) { $new_node->setAttribute('eval', 'on'); } $new_node->appendChild($doc->createTextNode($new_text)); $m->parentNode->replaceChild($new_node, $m); next; } if ($text !~ /\$|\\$|\\$|\\\[|\\\]/) { # there are no math separators inside # try to guess if this is meant as math my $found_math = 0; foreach my $symbol (@latex_math) { if (index($text, $symbol) != -1) { $found_math = 1; last; } } if ($found_math) { # interpret the whole text as LaTeX inline math my $new_node = $doc->createElement('tm'); if ($eval) { $new_node->setAttribute('eval', 'on'); } $new_node->appendChild($doc->createTextNode($text_before_variable_replacement)); $m->parentNode->replaceChild($new_node, $m); next; } # no math symbol found, we will convert the text with tth } # there are math separators inside, even after hiding variables, or there was no math symbol # hide math parts inside before running tth my $math_key1 = '#ghjgdh5hg45gf'; my $math_key2 = '#'; my @maths = (); my @separators = (['$$','$$'], ['\$','\$'], ['\\[','\\]'], ['$','$']); foreach my $seps (@separators) { my $sep1 = $seps->[0]; my $sep2 = $seps->[1]; my $pos1 = index($text, $sep1); if ($pos1 == -1) { next; } my $pos2 = index($text, $sep2, $pos1+length($sep1)); while ($pos1 != -1 && $pos2 != -1) { my $replace = substr($text, $pos1, $pos2+length($sep2)-$pos1); push(@maths, $replace); my $by = $math_key1.scalar(@maths).$math_key2; $text = substr($text, 0, $pos1).$by.substr($text, $pos2+length($sep2)); $pos1 = index($text, $sep1); if ($pos1 != -1) { $pos2 = index($text, $sep2, $pos1+length($sep1)); } } } # get HTML as text from tth my $html_text = tth($text); # replace math by replacements for (my $i=0; $i < scalar(@maths); $i++) { my $math = $maths[$i]; $math =~ s/&/&/g; $math =~ s/</</g; $math =~ s/>/>/g; if ($math =~ /^\$\$(.*)\$\$$/s) { $math = '<dtm>'.$1.'</dtm>'; } elsif ($math =~ /^\\\[(.*)\\\]$/s) { $math = '<dtm>'.$1.'</dtm>'; } elsif ($math =~ /^\\$(.*)\\$$/s) { $math = '<tm>'.$1.'</tm>'; } elsif ($math =~ /^\$(.*)\$$/s) { $math = '<tm>'.$1.'</tm>'; } my $replace = $math_key1.($i+1).$math_key2; $html_text =~ s/$replace/$math/; } # replace variables if necessary if ($eval) { foreach my $variable (@variables) { my $replacement = $var_key1.$variable.$var_key2; $html_text =~ s/$replacement([a-zA-Z])/\${$variable}$1/g; $html_text =~ s/$replacement/\$$variable/g; } } my $fragment = html_to_dom($html_text); $doc->adoptNode($fragment); $m->parentNode->replaceChild($fragment, $m); } return $fix_by_hand; } # Returns the HTML equivalent of LaTeX input, using tth sub tth { my ($text) = @_; my ($fh, $tmp_path) = tempfile(); binmode($fh, ':utf8'); print $fh $text; close $fh; my $output = `tth -r -w2 -u -y0 < $tmp_path 2>/dev/null`; # hopefully the temp file will not be removed before this point (otherwise we should use unlink_on_destroy 0) $output =~ s/^\s*|\s*$//; $output =~ s/<div class="p"><\/div>/<br\/>/; # why is tth using such ugly markup for \newline ? return $output; } # transform simple HTML into a DOM fragment (which will need to be adopted by the document) sub html_to_dom { my ($text) = @_; $text = '<root>'.$text.'</root>'; my $textref = html_to_xml::html_to_xml(\$text); utf8::upgrade($$textref); # otherwise the XML parser fails when the HTML parser turns   into a character my $dom_doc = XML::LibXML->load_xml(string => $textref); my $root = $dom_doc->documentElement; remove_empty_style($root); my $fragment = $dom_doc->createDocumentFragment(); my $next; for (my $n=$root->firstChild; defined $n; $n=$next) { $next = $n->nextSibling; $root->removeChild($n); $fragment->appendChild($n); } return($fragment); } # Use the linked sty files to guess which newly defined elements should be considered blocks. # Also adds to @containing_styles the sty elements that contain styles. # @param {string} fn - the .lc file path (we only extract the directory path from it) sub add_sty_blocks { my ($fn, $root, $all_block) = @_; my $doc = $root->ownerDocument; my @parserlibs = $doc->getElementsByTagName('parserlib'); my @libs = (); foreach my $parserlib (@parserlibs) { if (defined $parserlib->firstChild && $parserlib->firstChild->nodeType == XML_TEXT_NODE) { my $value = $parserlib->firstChild->nodeValue; $value =~ s/^\s+|\s+$//g; if ($value ne '') { push(@libs, $value); } } } my ($name, $path, $suffix) = fileparse($fn); foreach my $sty (@libs) { if (substr($sty, 0, 1) eq '/') { $sty = $RES_DIR.$sty; } else { $sty = $path.$sty; } my $new_elements = parse_sty($sty, $all_block); better_guess($root, $new_elements, $all_block); my $new_blocks = $new_elements->{'block'}; my $new_inlines = $new_elements->{'inline'}; push(@$all_block, @{$new_blocks}); #push(@inlines, @{$new_inlines}); # we are not using a list of inline elements at this point } } ## # Parses a sty file and returns lists of block and inline elements. # @param {string} fn - the file path ## sub parse_sty { my ($fn, $all_block) = @_; my @blocks = (); my @inlines = (); my $p = HTML::TokeParser->new($fn); if (! $p) { die "post_xml.pl: parse_sty: Error reading $fn\n"; } $p->empty_element_tags(1); my $in_definetag = 0; my $in_render = 0; my %newtags = (); my $newtag = ''; my $is_block = 0; while (my $token = $p->get_token) { if ($token->[0] eq 'S') { my $tag = lc($token->[1]); if ($tag eq 'definetag') { $in_definetag = 1; $is_block = 0; my $attributes = $token->[2]; $newtag = $attributes->{'name'}; if (substr($newtag, 0, 1) eq '/') { $newtag = substr($newtag, 1); } } elsif ($in_definetag && $tag eq 'render') { $in_render = 1; $is_block = 0; } elsif ($in_render) { if (string_in_array($all_block, $tag)) { $is_block = 1; } } } elsif ($token->[0] eq 'E') { my $tag = lc($token->[1]); if ($tag eq 'definetag') { $in_definetag = 0; if (defined $newtags{$newtag}) { $newtags{$newtag} = $newtags{$newtag} || $is_block; } else { $newtags{$newtag} = $is_block; } } elsif ($in_definetag && $tag eq 'render') { $in_render = 0; } } } foreach $newtag (keys(%newtags)) { if ($newtags{$newtag} == 1) { push(@blocks, $newtag); } else { push(@inlines, $newtag); } } return {'block'=>\@blocks, 'inline'=>\@inlines}; } ## # Marks as block the elements that contain block elements in the input file. # Also adds to @containing_styles the sty elements that contain styles. # @param {string} fn - the file path # @param {Hash<string,Array>} new_elements - contains arrays in 'block' and 'inline' ## sub better_guess { my ($root, $new_elements, $all_block) = @_; my $new_blocks = $new_elements->{'block'}; my $new_inlines = $new_elements->{'inline'}; my @change = (); # change these elements from inline to block foreach my $tag (@{$new_inlines}) { my @nodes = $root->getElementsByTagName($tag); NODE_LOOP: foreach my $node (@nodes) { for (my $child=$node->firstChild; defined $child; $child=$child->nextSibling) { if ($child->nodeType == XML_ELEMENT_NODE) { if (string_in_array($all_block, $child->nodeName) || string_in_array($new_blocks, $child->nodeName)) { push(@change, $tag); last NODE_LOOP; } } } } } foreach my $inline (@change) { my $index = 0; $index++ until $new_inlines->[$index] eq $inline; splice(@{$new_inlines}, $index, 1); push(@{$new_blocks}, $inline); } # add to @containing_styles when a style is used inside # NOTE: some sty elements will be added even though they should not, but if we don't do that # all style will be removed in the sty elements. foreach my $tag ((@{$new_blocks}, @{$new_inlines})) { my @nodes = $root->getElementsByTagName($tag); NODE_LOOP: foreach my $node (@nodes) { for (my $child=$node->firstChild; defined $child; $child=$child->nextSibling) { if ($child->nodeType == XML_ELEMENT_NODE) { if (string_in_array(\@html_styles, $child->nodeName)) { push(@containing_styles, $tag); last NODE_LOOP; } } } } } } # When a style element contains a block, move the style inside the block where it is allowed. # style/block/other -> block/style/other # When a style is used where it is not allowed, move it inside its children or remove it (unless it contains only text) # element_not_containing_styles/style/other -> element_not_containing_styles/other/style (except if other is a style) # The fix is not perfect in the case of element_not_containing_styles/style1/style2/block/text (style1 will be lost): # element_not_containing_styles/style1/style2/block/text -> element_not_containing_styles/block/style2/text # (a solution to this problem would be to merge the styles in a span) # NOTE: .sty defined elements might have been added to @containing_styles by better_guess(). sub fix_block_styles { my ($element, $all_block) = @_; my $doc = $element->ownerDocument; if (string_in_array(\@html_styles, $element->nodeName)) { # move spaces out of the style element if (defined $element->firstChild && $element->firstChild->nodeType == XML_TEXT_NODE) { my $child = $element->firstChild; if ($child->nodeValue =~ /^(\s+)(\S.*)$/s) { $element->parentNode->insertBefore($doc->createTextNode($1), $element); $child->setData($2); } } if (defined $element->lastChild && $element->lastChild->nodeType == XML_TEXT_NODE) { my $child = $element->lastChild; if ($child->nodeValue =~ /^(.*\S)(\s+)$/s) { $element->parentNode->insertAfter($doc->createTextNode($2), $element); $child->setData($1); } } my $found_block = 0; for (my $child=$element->firstChild; defined $child; $child=$child->nextSibling) { if ($child->nodeType == XML_ELEMENT_NODE && string_in_array($all_block, $child->nodeName)) { $found_block = 1; last; } } my $no_style_here = !string_in_array(\@containing_styles, $element->parentNode->nodeName); if ($found_block || $no_style_here) { # there is a block or the style is not allowed here, # the style element has to be replaced by its modified children my $s; # a clone of the style my $next; for (my $child=$element->firstChild; defined $child; $child=$next) { $next = $child->nextSibling; if ($child->nodeType == XML_ELEMENT_NODE && (string_in_array($all_block, $child->nodeName) || $child->nodeName eq 'br' || $no_style_here)) { # avoid inverting a style with a style with $no_style_here (that would cause endless recursion) if (!$no_style_here || (!string_in_array(\@html_styles, $child->nodeName) && string_in_array(\@containing_styles, $child->nodeName))) { # block node or inline node when the style is not allowed: # move all children inside the style, and make the style the only child $s = $element->cloneNode(); my $next2; for (my $child2=$child->firstChild; defined $child2; $child2=$next2) { $next2 = $child2->nextSibling; $child->removeChild($child2); $s->appendChild($child2); } $child->appendChild($s); } $s = undef; } elsif (($child->nodeType == XML_TEXT_NODE && $child->nodeValue !~ /^\s*$/) || $child->nodeType == XML_ELEMENT_NODE) { # if the style is allowed, move text and inline nodes inside the style if (!$no_style_here) { if (!defined $s) { $s = $element->cloneNode(); $element->insertBefore($s, $child); } $element->removeChild($child); $s->appendChild($child); } } else { # do not put other nodes inside the style $s = undef; } } # now replace by children and fix them my $parent = $element->parentNode; for (my $child=$element->firstChild; defined $child; $child=$next) { $next = $child->nextSibling; $element->removeChild($child); $parent->insertBefore($child, $element); if ($child->nodeType == XML_ELEMENT_NODE) { fix_block_styles($child, $all_block); } } $parent->removeChild($element); return; } } # otherwise fix all children my $next; for (my $child=$element->firstChild; defined $child; $child=$next) { $next = $child->nextSibling; if ($child->nodeType == XML_ELEMENT_NODE) { fix_block_styles($child, $all_block); } } } # removes empty font elements and font elements that contain at least one block element # replaces other font elements by equivalent span sub fix_fonts { my ($root, $all_block) = @_; my $doc = $root->ownerDocument; my @fonts = $root->getElementsByTagName('font'); @fonts = reverse(@fonts); # to deal with the ancestor last in the case of font/font foreach my $font (@fonts) { my $block = 0; for (my $child=$font->firstChild; defined $child; $child=$child->nextSibling) { if (string_in_array($all_block, $child->nodeName) || string_in_array(\@inline_like_block, $child->nodeName)) { $block = 1; last; } } if (!defined $font->firstChild || $block) { # empty font or font containing block elements # replace this node by its content replace_by_children($font); } else { # replace by equivalent span my $color = get_non_empty_attribute($font, 'color'); my $size = get_non_empty_attribute($font, 'size'); my $face = get_non_empty_attribute($font, 'face'); if (defined $face) { $face =~ s/^,|,$//; } if (!defined $color && !defined $size && !defined $face) { # useless font element: replace this node by its content replace_by_children($font); next; } my $replacement; tie (my %properties, 'Tie::IxHash', ()); if (!defined $color && !defined $size && defined $face && lc($face) eq 'symbol') { $replacement = $doc->createDocumentFragment(); } else { $replacement = $doc->createElement('span'); my $css = ''; if (defined $color) { $color =~ s/^x/#/; $properties{'color'} = $color; } if (defined $size) { my %hash = ( '1' => 'x-small', '2' => 'small', '3' => 'medium', '4' => 'large', '5' => 'x-large', '6' => 'xx-large', '7' => '300%', '-1' => 'small', '-2' => 'x-small', '+1' => 'large', '+2' => 'x-large', '+3' => 'xx-large', '+4' => '300%', ); my $value = $hash{$size}; if (!defined $value) { $value = 'medium'; } $properties{'font-size'} = $value; } if (defined $face) { if (lc($face) ne 'symbol' && lc($face) ne 'bold') { $properties{'font-family'} = $face; } } set_css_properties($replacement, \%properties); } if (defined $face && lc($face) eq 'symbol') { # convert all content to unicode my $next; for (my $child=$font->firstChild; defined $child; $child=$next) { $next = $child->nextSibling; if ($child->nodeType == XML_TEXT_NODE) { my $value = $child->nodeValue; $value =~ tr/ABGDEZHQIKLMNXOPRSTUFCYWabgdezhqiklmnxoprVstufcywJjv¡«¬®/ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΤΥΦΧΨΩαβγδεζηθικλμνξοπρςστυφχψωϑϕϖϒ↔←→/; $child->setData($value); } } } # replace the font node if ($replacement->nodeType == XML_ELEMENT_NODE && !defined $font->previousSibling && !defined $font->nextSibling && string_in_array(\@accepting_style, $font->parentNode->nodeName)) { # use CSS on the parent block and replace font by its children instead of using a new element set_css_properties($font->parentNode, \%properties); replace_by_children($font); } else { # move all font children inside the replacement (span or fragment) my $next; for (my $child=$font->firstChild; defined $child; $child=$next) { $next = $child->nextSibling; $font->removeChild($child); $replacement->appendChild($child); } # replace font $font->parentNode->replaceChild($replacement, $font); } } } $root->normalize(); } # replaces u by <span style="text-decoration: underline"> sub replace_u { my ($root) = @_; my $doc = $root->ownerDocument; my @us = $root->getElementsByTagName('u'); foreach my $u (@us) { my $span = $doc->createElement('span'); $span->setAttribute('style', 'text-decoration: underline'); my $next; for (my $child=$u->firstChild; defined $child; $child=$next) { $next = $child->nextSibling; $u->removeChild($child); $span->appendChild($child); } $u->parentNode->replaceChild($span, $u); } } # removes CDATA sections tags that have not been parsed correcty by the HTML parser # also removes bad comments in script elements sub remove_bad_cdata_sections { my ($root) = @_; my $doc = $root->ownerDocument; foreach my $name (@preserve_elements) { my @nodes = $root->getElementsByTagName($name); foreach my $node (@nodes) { if (defined $node->firstChild && $node->firstChild->nodeType == XML_TEXT_NODE) { my $value = $node->firstChild->nodeValue; if ($name eq 'script' && (!defined $node->getAttribute('type') || $node->getAttribute('type') ne 'loncapa/perl') && !defined $node->firstChild->nextSibling && $value =~ /^(\s*)(\s*)$/) { # web browsers interpret that as a real comment when it is on 1 line, but the Perl HTML parser thinks it is the script # -> turning it back into a comment # (this is only true for Javascript script elements, since LON-CAPA does not parse loncapa/perl scripts in the same way) $node->removeChild($node->firstChild); $node->appendChild($doc->createComment($2)); next; } # at the beginning: $value =~ s/^(\s*)<!\[CDATA\[/$1/; # <![CDATA[ $value =~ s/^(\s*)\/\*\s*<!\[CDATA\[\s*\*\//$1/; # /* <![CDATA[ */ $value =~ s/^(\s*)\/\/\s*<!\[CDATA\[/$1/; # // <![CDATA[ $value =~ s/^(\s*)(\/\/)?\s*(\s*)$/$2/; # // --> $value =~ s/\/\*\s*\]\]>\s*\*\/(\s*)$/$1/; # /* ]]> */ $value = "\n".$value."\n"; $value =~ s/\s*(\n[ \t]*)/$1/; $value =~ s/\s+$/\n/; $node->firstChild->setData($value); } } } } # adds CDATA sections to scripts sub add_cdata_sections { my ($root) = @_; my $doc = $root->ownerDocument; my @scripts = $root->getElementsByTagName('script'); my @answers = $root->getElementsByTagName('answer'); foreach my $answer (@answers) { my $ancestor = $answer->parentNode; my $found_capa_response = 0; while (defined $ancestor) { if ($ancestor->nodeName eq 'numericalresponse' || $ancestor->nodeName eq 'formularesponse') { $found_capa_response = 1; last; } $ancestor = $ancestor->parentNode; } if (!$found_capa_response) { push(@scripts, $answer); } } foreach my $script (@scripts) { # use a CDATA section in the normal situation, for any script my $first = $script->firstChild; if (defined $first && $first->nodeType == XML_TEXT_NODE && !defined $first->nextSibling) { my $cdata = $doc->createCDATASection($first->nodeValue); $script->replaceChild($cdata, $first); } } } # removes "" at the beginning and end of style elements sub fix_style_element { my ($root) = @_; my @styles = $root->getElementsByTagName('style'); foreach my $style (@styles) { if (defined $style->firstChild && $style->firstChild->nodeType == XML_TEXT_NODE && !defined $style->firstChild->nextSibling) { my $text = $style->firstChild->nodeValue; if ($text =~ /^\s*\s*$/s) { $style->firstChild->setData($1); } } } } # create missing cells at the end of table rows sub fix_tables { my ($root) = @_; my @tables = $root->getElementsByTagName('table'); foreach my $table (@tables) { fix_cells($table); foreach my $tbody ($table->getChildrenByTagName('tbody')) { fix_cells($tbody); } foreach my $thead ($table->getChildrenByTagName('thead')) { fix_cells($thead); } foreach my $tfoot ($table->getChildrenByTagName('tfoot')) { fix_cells($tfoot); } } } # create missing cells at the end of table rows sub fix_cells { my ($table) = @_; # could actually be table, tbody, thead or tfoot my $doc = $table->ownerDocument; my @nb_cells = (); my $max_nb_cells = 0; my @rowspans = (); my @trs = $table->getChildrenByTagName('tr'); foreach my $tr (@trs) { my $nb_cells; if (defined $rowspans[0]) { $nb_cells = shift(@rowspans); } else { $nb_cells = 0; } for (my $cell=$tr->firstChild; defined $cell; $cell=$cell->nextSibling) { if ($cell->nodeName eq 'td' || $cell->nodeName eq 'th') { my $colspan = $cell->getAttribute('colspan'); if (defined $colspan && $colspan =~ /^\s*[0-9]+\s*$/) { $nb_cells += $colspan; } else { $nb_cells++; } my $rowspan = $cell->getAttribute('rowspan'); if (defined $rowspan && $rowspan =~ /^\s*[0-9]+\s*$/) { for (my $i=0; $i < $rowspan-1; $i++) { if (!defined $rowspans[$i]) { $rowspans[$i] = 1; } else { $rowspans[$i]++; } } } } } push(@nb_cells, $nb_cells); if ($nb_cells > $max_nb_cells) { $max_nb_cells = $nb_cells; } } foreach my $tr (@trs) { my $nb_cells = shift(@nb_cells); if ($nb_cells < $max_nb_cells) { for (1..($max_nb_cells - $nb_cells)) { $tr->appendChild($doc->createElement('td')); } } } } # replaces ul/ul by ul/li/ul and the same for ol (using the previous li if possible) # also adds a ul element when a li has no ul/ol ancestor sub fix_lists { my ($root) = @_; my $doc = $root->ownerDocument; my @uls = $root->getElementsByTagName('ul'); my @ols = $root->getElementsByTagName('ol'); my @lists = (@uls, @ols); foreach my $list (@lists) { my $next; for (my $child=$list->firstChild; defined $child; $child=$next) { $next = $child->nextSibling; if ($child->nodeType == XML_ELEMENT_NODE && string_in_array(['ul','ol'], $child->nodeName)) { my $previous = $child->previousNonBlankSibling(); # note: non-DOM method $list->removeChild($child); if (defined $previous && $previous->nodeType == XML_ELEMENT_NODE && $previous->nodeName eq 'li') { $previous->appendChild($child); } else { my $li = $doc->createElement('li'); $li->appendChild($child); if (!defined $next) { $list->appendChild($li); } else { $list->insertBefore($li, $next); } } } } } my @lis = $root->getElementsByTagName('li'); foreach my $li (@lis) { my $found_list_ancestor = 0; my $ancestor = $li->parentNode; while (defined $ancestor) { if ($ancestor->nodeName eq 'ul' || $ancestor->nodeName eq 'ol') { $found_list_ancestor = 1; last; } $ancestor = $ancestor->parentNode; } if (!$found_list_ancestor) { # replace li by ul and add li under ul my $ul = $doc->createElement('ul'); $li->parentNode->insertBefore($ul, $li); $li->parentNode->removeChild($li); $ul->appendChild($li); # add all other li afterwards inside ul (there might be text nodes in-between) my $next = $ul->nextSibling; while (defined $next) { my $next_next = $next->nextSibling; if ($next->nodeType == XML_TEXT_NODE && $next->nodeValue =~ /^\s*$/ && defined $next_next && $next_next->nodeType == XML_ELEMENT_NODE && $next_next->nodeName eq 'li') { $next->parentNode->removeChild($next); $ul->appendChild($next); $next = $next_next; $next_next = $next_next->nextSibling; } if ($next->nodeType == XML_ELEMENT_NODE && $next->nodeName eq 'li') { $next->parentNode->removeChild($next); $ul->appendChild($next); } else { last; } $next = $next_next; } } } } # Some "image" elements are actually img element with a wrong name. This renames them. # Amazingly enough, "<image src=..." displays an image in some browsers # ("image" has existed at some point as an experimental HTML element). sub fix_wrong_name_for_img { my ($root) = @_; my @images = $root->getElementsByTagName('image'); foreach my $image (@images) { if (!defined $image->getAttribute('src')) { next; } my $found_correct_ancestor = 0; my $ancestor = $image->parentNode; while (defined $ancestor) { if ($ancestor->nodeName eq 'drawimage' || $ancestor->nodeName eq 'imageresponse') { $found_correct_ancestor = 1; last; } $ancestor = $ancestor->parentNode; } if ($found_correct_ancestor) { next; } # this really has to be renamed "img" $image->setNodeName('img'); } } # Replaces many deprecated attributes and replaces them by equivalent CSS when possible sub replace_deprecated_attributes_by_css { my ($root) = @_; fix_deprecated_in_tables($root); fix_deprecated_in_table_rows($root); fix_deprecated_in_table_cells($root); fix_deprecated_in_lists($root); fix_deprecated_in_list_items($root); fix_deprecated_in_hr($root); fix_deprecated_in_img($root); fix_deprecated_in_body($root); fix_align_attribute($root); } # Replaces deprecated attributes in tables sub fix_deprecated_in_tables { my ($root) = @_; my @tables = $root->getElementsByTagName('table'); foreach my $table (@tables) { tie (my %new_properties, 'Tie::IxHash', ()); my $align = $table->getAttribute('align'); if (defined $align) { $table->removeAttribute('align'); $align = lc(trim($align)); } if ($table->parentNode->nodeName eq 'center' || (defined $align && $align eq 'center') || (defined $table->parentNode->getAttribute('align') && $table->parentNode->getAttribute('align') eq 'center')) { $new_properties{'margin-left'} = 'auto'; $new_properties{'margin-right'} = 'auto'; } if (defined $align && ($align eq 'left' || $align eq 'right')) { $new_properties{'float'} = $align; } my $width = $table->getAttribute('width'); if (defined $width) { $table->removeAttribute('width'); $width = trim($width); if ($width =~ /^[0-9]+$/) { $width .= 'px'; } if ($width ne '') { $new_properties{'width'} = $width; } } my $height = $table->getAttribute('height'); if (defined $height) { $table->removeAttribute('height'); # no replacement for table height } my $bgcolor = $table->getAttribute('bgcolor'); if (defined $bgcolor) { $table->removeAttribute('bgcolor'); $bgcolor = trim($bgcolor); $bgcolor =~ s/^x\s*//; if ($bgcolor ne '') { $new_properties{'background-color'} = $bgcolor; } } my $frame = $table->getAttribute('frame'); if (defined $frame) { $table->removeAttribute('frame'); $frame = lc(trim($frame)); if ($frame eq 'void') { $new_properties{'border'} = 'none'; } elsif ($frame eq 'above') { $new_properties{'border-top'} = '1px solid black'; } elsif ($frame eq 'below') { $new_properties{'border-bottom'} = '1px solid black'; } elsif ($frame eq 'hsides') { $new_properties{'border-top'} = '1px solid black'; $new_properties{'border-bottom'} = '1px solid black'; } elsif ($frame eq 'vsides') { $new_properties{'border-left'} = '1px solid black'; $new_properties{'border-right'} = '1px solid black'; } elsif ($frame eq 'lhs') { $new_properties{'border-left'} = '1px solid black'; } elsif ($frame eq 'rhs') { $new_properties{'border-right'} = '1px solid black'; } elsif ($frame eq 'box') { $new_properties{'border'} = '1px solid black'; } elsif ($frame eq 'border') { $new_properties{'border'} = '1px solid black'; } } if (scalar(keys %new_properties) > 0) { set_css_properties($table, \%new_properties); } # we can't replace the border attribute without creating a style block, but we can improve things like border="BORDER" my $border = $table->getAttribute('border'); if (defined $border) { $border = trim($border); if ($border !~ /^\s*[0-9]+\s*(px)?\s*$/) { $table->setAttribute('border', '1'); } } } } # Replaces deprecated attributes in tr elements sub fix_deprecated_in_table_rows { my ($root) = @_; my @trs = $root->getElementsByTagName('tr'); foreach my $tr (@trs) { my $old_properties = get_css_properties($tr); tie (my %new_properties, 'Tie::IxHash', ()); my $bgcolor = $tr->getAttribute('bgcolor'); if (defined $bgcolor) { $tr->removeAttribute('bgcolor'); if (!defined $old_properties->{'background-color'}) { $bgcolor = trim($bgcolor); $bgcolor =~ s/^x\s*//; if ($bgcolor ne '') { $new_properties{'background-color'} = $bgcolor; } } } my $align = $tr->getAttribute('align'); if (defined $align && $align !~ /\s*char\s*/i) { $tr->removeAttribute('align'); if (!defined $old_properties->{'text-align'}) { $align = lc(trim($align)); if ($align ne '') { $new_properties{'text-align'} = $align; } } } my $valign = $tr->getAttribute('valign'); if (defined $valign) { $tr->removeAttribute('valign'); if (!defined $old_properties->{'vertical-align'}) { $valign = lc(trim($valign)); if ($valign ne '') { $new_properties{'vertical-align'} = $valign; } } } if (scalar(keys %new_properties) > 0) { set_css_properties($tr, \%new_properties); } } } # Replaces deprecated attributes in table cells (td and th) sub fix_deprecated_in_table_cells { my ($root) = @_; my @tds = $root->getElementsByTagName('td'); my @ths = $root->getElementsByTagName('th'); my @cells = (@tds, @ths); foreach my $cell (@cells) { my $old_properties = get_css_properties($cell); tie (my %new_properties, 'Tie::IxHash', ()); my $width = $cell->getAttribute('width'); if (defined $width) { $cell->removeAttribute('width'); if (!defined $old_properties->{'width'}) { $width = trim($width); if ($width =~ /^[0-9]+$/) { $width .= 'px'; } if ($width ne '') { $new_properties{'width'} = $width; } } } my $height = $cell->getAttribute('height'); if (defined $height) { $cell->removeAttribute('height'); if (!defined $old_properties->{'height'}) { $height = trim($height); if ($height =~ /^[0-9]+$/) { $height .= 'px'; } if ($height ne '') { $new_properties{'height'} = $height; } } } my $bgcolor = $cell->getAttribute('bgcolor'); if (defined $bgcolor) { $cell->removeAttribute('bgcolor'); if (!defined $old_properties->{'background-color'}) { $bgcolor = trim($bgcolor); $bgcolor =~ s/^x\s*//; if ($bgcolor ne '') { $new_properties{'background-color'} = $bgcolor; } } } my $align = $cell->getAttribute('align'); if (defined $align && $align !~ /\s*char\s*/i) { $cell->removeAttribute('align'); if (!defined $old_properties->{'text-align'}) { $align = lc(trim($align)); if ($align ne '') { $new_properties{'text-align'} = $align; } } } my $valign = $cell->getAttribute('valign'); if (defined $valign) { $cell->removeAttribute('valign'); if (!defined $old_properties->{'vertical-align'}) { $valign = lc(trim($valign)); if ($valign ne '') { $new_properties{'vertical-align'} = $valign; } } } if (scalar(keys %new_properties) > 0) { set_css_properties($cell, \%new_properties); } } } # Replaces deprecated attributes in lists (ul and ol) sub fix_deprecated_in_lists { my ($root) = @_; my @uls = $root->getElementsByTagName('ul'); my @ols = $root->getElementsByTagName('ol'); my @lists = (@uls, @ols); foreach my $list (@lists) { my $type = $list->getAttribute('type'); if (defined $type) { my $lst = list_style_type($type); if (defined $lst) { $list->removeAttribute('type'); if (!defined get_css_property($list, 'list-style-type')) { set_css_property($list, 'list-style-type', $lst); } } } } } # Replaces deprecated attributes in list items (li) sub fix_deprecated_in_list_items { my ($root) = @_; my @lis = $root->getElementsByTagName('li'); foreach my $li (@lis) { my $type = $li->getAttribute('type'); if (defined $type) { my $lst = list_style_type($type); if (defined $lst) { $li->removeAttribute('type'); if (!defined get_css_property($li, 'list-style-type')) { set_css_property($li, 'list-style-type', $lst); } } } } } # returns the CSS list-style-type value equivalent to the given type attribute for a list or list item sub list_style_type { my ($type) = @_; my $value; $type = trim($type); if (lc($type) eq 'circle') { $value = 'circle'; } elsif (lc($type) eq 'disc') { $value = 'disc'; } elsif (lc($type) eq 'square') { $value = 'square'; } elsif ($type eq 'a') { $value = 'lower-latin'; } elsif ($type eq 'A') { $value = 'upper-latin'; } elsif ($type eq 'i') { $value = 'lower-roman'; } elsif ($type eq 'I') { $value = 'upper-roman'; } elsif ($type eq '1') { $value = 'decimal'; } return $value; } # Replaces deprecated attributes in hr sub fix_deprecated_in_hr { my ($root) = @_; my @hrs = $root->getElementsByTagName('hr'); foreach my $hr (@hrs) { tie (my %new_properties, 'Tie::IxHash', ()); my $align = $hr->getAttribute('align'); if (defined $align) { $align = lc(trim($align)); if ($align eq 'left') { $new_properties{'text-align'} = 'left'; $new_properties{'margin-left'} = '0'; } elsif ($align eq 'right') { $new_properties{'text-align'} = 'right'; $new_properties{'margin-right'} = '0'; } $hr->removeAttribute('align'); } my $color = $hr->getAttribute('color'); if (defined $color) { $color = trim($color); $color =~ s/^x\s*//; if ($color ne '') { $new_properties{'color'} = $color; $new_properties{'background-color'} = $color; } $hr->removeAttribute('color'); } my $noshade = $hr->getAttribute('noshade'); my $size = $hr->getAttribute('size'); if (defined $noshade) { $new_properties{'border-width'} = '0'; if (!defined $color) { $new_properties{'color'} = 'gray'; $new_properties{'background-color'} = 'gray'; } if (!defined $size) { $size = '2'; } $hr->removeAttribute('noshade'); } if (defined $size) { $size = trim($size); if ($size ne '') { $new_properties{'height'} = $size.'px'; } if (defined $hr->getAttribute('size')) { $hr->removeAttribute('size'); } } my $width = $hr->getAttribute('width'); if (defined $width) { $width = trim($width); if ($width ne '') { if ($width !~ /\%$/) { $width .= 'px'; } $new_properties{'width'} = $width; } $hr->removeAttribute('width'); } if (scalar(keys %new_properties) > 0) { set_css_properties($hr, \%new_properties); } } } # Replaces deprecated attributes in img sub fix_deprecated_in_img { my ($root) = @_; my @imgs = $root->getElementsByTagName('img'); foreach my $img (@imgs) { my $old_properties = get_css_properties($img); tie (my %new_properties, 'Tie::IxHash', ()); my $align = $img->getAttribute('align'); if (defined $align) { $align = lc(trim($align)); if ($align eq 'middle' || $align eq 'top' || $align eq 'bottom') { $img->removeAttribute('align'); if (!defined $old_properties->{'vertical-align'}) { $new_properties{'vertical-align'} = $align; } } elsif ($align eq 'left' || $align eq 'right') { $img->removeAttribute('align'); if (!defined $old_properties->{'float'}) { $new_properties{'float'} = $align; } } elsif ($align eq 'center' || $align eq '') { $img->removeAttribute('align'); } } my $border = $img->getAttribute('border'); if (defined $border) { $border = lc(trim($border)); if ($border =~ /^[0-9]+\s*(px)?$/) { $img->removeAttribute('border'); if (!defined $old_properties->{'border'}) { if ($border !~ /px$/) { $border .= 'px'; } $new_properties{'border'} = $border.' solid black'; } } } my $hspace = $img->getAttribute('hspace'); if (defined $hspace) { $hspace = lc(trim($hspace)); if ($hspace =~ /^[0-9]+\s*(px)?$/) { $img->removeAttribute('hspace'); if (!defined $old_properties->{'margin-left'} || !defined $old_properties->{'margin-right'}) { if ($hspace !~ /px$/) { $hspace .= 'px'; } $new_properties{'margin-left'} = $hspace; $new_properties{'margin-right'} = $hspace; } } } if (scalar(keys %new_properties) > 0) { set_css_properties($img, \%new_properties); } } } # Replaces deprecated attributes in htmlbody (the style attribute could be used in a div for output) sub fix_deprecated_in_body { my ($root) = @_; my $doc = $root->ownerDocument; my @bodies = $root->getElementsByTagName('htmlbody'); foreach my $body (@bodies) { my $old_properties = get_css_properties($body); tie (my %new_properties, 'Tie::IxHash', ()); my $bgcolor = $body->getAttribute('bgcolor'); if (defined $bgcolor) { $body->removeAttribute('bgcolor'); if (!defined $old_properties->{'background-color'}) { $bgcolor = trim($bgcolor); $bgcolor =~ s/^x\s*//; if ($bgcolor ne '') { $new_properties{'background-color'} = $bgcolor; } } } my $color = $body->getAttribute('text'); if (defined $color) { $body->removeAttribute('text'); if (!defined $old_properties->{'color'}) { $color = trim($color); $color =~ s/^x\s*//; if ($color ne '') { $new_properties{'color'} = $color; } } } my $background = $body->getAttribute('background'); if (defined $background && ($background =~ /\.jpe?g$|\.gif|\.png/i)) { $body->removeAttribute('background'); if (!defined $old_properties->{'background-image'}) { $background = trim($background); if ($background ne '') { $new_properties{'background-image'} = 'url('.$background.')'; } } } # NOTE: these attributes have never been standard and are better removed with no replacement foreach my $bad ('bottommargin', 'leftmargin', 'rightmargin', 'topmargin', 'marginheight', 'marginwidth') { if ($body->hasAttribute($bad)) { $body->removeAttribute($bad); } } # NOTE: link alink and vlink require a <style> block to be converted my $link = $body->getAttribute('link'); my $alink = $body->getAttribute('alink'); my $vlink = $body->getAttribute('vlink'); if (defined $link || defined $alink || defined $vlink) { my $head; my @heads = $root->getElementsByTagName('htmlhead'); if (scalar(@heads) > 0) { $head = $heads[0]; } else { $head = $doc->createElement('htmlhead'); $root->insertBefore($head, $root->firstChild); } my $style = $doc->createElement('style'); $head->appendChild($style); my $css = "\n"; if (defined $link) { $body->removeAttribute('link'); $link = trim($link); $link =~ s/^x\s*//; $css .= ' a:link { color:'.$link.' }'; $css .= "\n"; } if (defined $alink) { $body->removeAttribute('alink'); $alink = trim($alink); $alink =~ s/^x\s*//; $css .= ' a:active { color:'.$alink.' }'; $css .= "\n"; } if (defined $vlink) { $body->removeAttribute('vlink'); $vlink = trim($vlink); $vlink =~ s/^x\s*//; $css .= ' a:visited { color:'.$vlink.' }'; $css .= "\n"; } $css .= ' '; $style->appendChild($doc->createTextNode($css)); } if (scalar(keys %new_properties) > 0) { set_css_properties($body, \%new_properties); } elsif (!$body->hasAttributes) { $body->parentNode->removeChild($body); } } } # replaces <div align="center"> by <div style="text-align:center;"> # also for p and h1..h6 sub fix_align_attribute { my ($root) = @_; my @nodes = $root->getElementsByTagName('div'); push(@nodes, $root->getElementsByTagName('p')); for (my $i=1; $i<=6; $i++) { push(@nodes, $root->getElementsByTagName('h'.$i)); } foreach my $node (@nodes) { my $align = $node->getAttribute('align'); if (defined $align) { $node->removeAttribute('align'); $align = trim($align); if ($align ne '' && !defined get_css_property($node, 'text-align')) { set_css_property($node, 'text-align', lc($align)); } } } } # replace center by a div or remove it if there is a table inside sub replace_center { my ($root, $all_block) = @_; my $doc = $root->ownerDocument; my @centers = $root->getElementsByTagName('center'); foreach my $center (@centers) { if ($center->getChildrenByTagName('table')->size() > 0) { # note: getChildrenByTagName is not DOM (LibXML specific) replace_by_children($center); } else { if ((!defined $center->previousSibling || ($center->previousSibling->nodeType == XML_TEXT_NODE && $center->previousSibling->nodeValue =~ /^\s*$/ && !defined $center->previousSibling->previousSibling)) && (!defined $center->nextSibling || ($center->nextSibling->nodeType == XML_TEXT_NODE && $center->nextSibling->nodeValue =~ /^\s*$/ && !defined $center->nextSibling->nextSibling)) && string_in_array(\@accepting_style, $center->parentNode->nodeName)) { # use CSS on the parent block and replace center by its children set_css_property($center->parentNode, 'text-align', 'center'); replace_by_children($center); } else { # use p or div ? check if there is a block inside my $found_block = 0; for (my $child=$center->firstChild; defined $child; $child=$child->nextSibling) { if ($child->nodeType == XML_ELEMENT_NODE && string_in_array($all_block, $child->nodeName)) { $found_block = 1; last; } } my $new_node; if ($found_block) { $new_node = $doc->createElement('div'); $new_node->setAttribute('style', 'text-align: center; margin: 0 auto'); } else { $new_node = $doc->createElement('p'); $new_node->setAttribute('style', 'text-align: center'); } my $next; for (my $child=$center->firstChild; defined $child; $child=$next) { $next = $child->nextSibling; $center->removeChild($child); $new_node->appendChild($child); } $center->parentNode->replaceChild($new_node, $center); } } } } # replaces <nobr> by <span style="white-space:nowrap"> sub replace_nobr { my ($root) = @_; my @nobrs = $root->getElementsByTagName('nobr'); foreach my $nobr (@nobrs) { if (!defined $nobr->previousSibling && !defined $nobr->nextSibling && string_in_array(\@accepting_style, $nobr->parentNode->nodeName)) { # use CSS on the parent block set_css_property($nobr->parentNode, 'white-space', 'nowrap'); replace_by_children($nobr); } else { $nobr->setNodeName('span'); $nobr->setAttribute('style', 'white-space:nowrap'); } } } # removes notsolved tags in the case <hintgroup showoncorrect="no"><notsolved>...</notsolved></hintgroup> # and in the case <notsolved><hintgroup showoncorrect="no">...</hintgroup></notsolved> sub remove_useless_notsolved { my ($root) = @_; my @hintgroups = $root->getElementsByTagName('hintgroup'); foreach my $hintgroup (@hintgroups) { my $showoncorrect = get_non_empty_attribute($hintgroup, 'showoncorrect'); if (!defined $showoncorrect || $showoncorrect eq 'no') { my @notsolveds = $hintgroup->getElementsByTagName('notsolved'); foreach my $notsolved (@notsolveds) { replace_by_children($notsolved); } } my $parent = $hintgroup->parentNode; if ($parent->nodeName eq 'notsolved' && scalar(@{$parent->nonBlankChildNodes()}) == 1) { replace_by_children($parent); } } } # adds a paragraph inside if needed and calls fix_paragraph for all paragraphs (including new ones) sub fix_paragraphs_inside { my ($node, $all_block) = @_; # blocks in which paragrahs will be added: my @blocks_with_p = ('loncapa','library','problem','part','problemtype','window','block','while','postanswerdate','preduedate','solved','notsolved','languageblock','instructorcomment','togglebox','standalone','form'); my @fix_p_if_br_or_p = (@responses,'foil','item','text','label','hintgroup','hintpart','hint','web','windowlink','div','li','dd','td','th','blockquote'); if ((string_in_array(\@blocks_with_p, $node->nodeName) && paragraph_needed($node)) || (string_in_array(\@fix_p_if_br_or_p, $node->nodeName) && paragraph_inside($node))) { # if non-empty, add paragraphs where needed between all br and remove br # (it would be easier to just put everything in a p and fix it afterwards, but there are performance issues # when a paragraph has many blocks directly inside) my $doc = $node->ownerDocument; my $p = undef; my @new_children = (); my $next; for (my $child=$node->firstChild; defined $child; $child=$next) { $next = $child->nextSibling; $node->removeChild($child); if ($child->nodeType == XML_ELEMENT_NODE && $child->nodeName eq 'br') { if (defined $p) { push(@new_children, $p); } else { push(@new_children, $doc->createElement('p')); } $p = undef; } elsif ($child->nodeType == XML_ELEMENT_NODE && string_in_array(\@inline_like_block, $child->nodeName)) { # inline_like_block: use the paragraph if there is one, otherwise do not create one if (defined $p) { $p->appendChild($child); } else { push(@new_children, $child); } } elsif ($child->nodeType == XML_ELEMENT_NODE && string_in_array($all_block, $child->nodeName)) { # these children are blocks and should not be in a paragraph if (defined $p) { push(@new_children, $p); $p = undef; } push(@new_children, $child); } elsif ($child->nodeType == XML_TEXT_NODE && $child->nodeValue =~ /^[ \t\f\n\r]*$/) { # blank text: add to paragraph if there is one and there is a next node, otherwise keep out of the paragraph if (defined $p) { if (defined $next) { $p->appendChild($child); } else { push(@new_children, $p); $p = undef; push(@new_children, $child); } } else { push(@new_children, $child); } } elsif ($child->nodeType == XML_TEXT_NODE || $child->nodeType == XML_ELEMENT_NODE || $child->nodeType == XML_CDATA_SECTION_NODE || $child->nodeType == XML_ENTITY_NODE || $child->nodeType == XML_ENTITY_REF_NODE) { # these children require a paragraph if (!defined $p) { $p = $doc->createElement('p'); } $p->appendChild($child); } else { # these children do not require a paragraph (XML comments, PI) # -> do not move them in a new paragraph if (defined $p) { push(@new_children, $p); $p = undef; } push(@new_children, $child); } } if (defined $p) { push(@new_children, $p); } foreach my $child (@new_children) { $node->appendChild($child); } } # now fix the paragraphs everywhere, so that all inline nodes are inside a paragraph, and block nodes are outside my $next; for (my $child=$node->firstChild; defined $child; $child=$next) { $next = $child->nextSibling; if ($child->nodeType == XML_ELEMENT_NODE && defined $child->firstChild) { if ($child->nodeName eq 'p') { fix_paragraph($child, $all_block); } else { fix_paragraphs_inside($child, $all_block); } } } } # returns 1 if a paragraph is needed inside this node (assuming the parent can have paragraphs) sub paragraph_needed { my ($node) = @_; for (my $child=$node->firstChild; defined $child; $child=$child->nextSibling) { if (($child->nodeType == XML_TEXT_NODE && $child->nodeValue !~ /^\s*$/) || ($child->nodeType == XML_ELEMENT_NODE && !string_in_array(\@inline_like_block, $child->nodeName)) || $child->nodeType == XML_CDATA_SECTION_NODE || $child->nodeType == XML_ENTITY_NODE || $child->nodeType == XML_ENTITY_REF_NODE) { return(1); } } return(0); } # returns 1 if there is a paragraph or br in a child of this node, or inside an inline child sub paragraph_inside { my ($node) = @_; # inline elements that can be split in half if there is a paragraph inside (currently all HTML): # (also used in first_block below) my @splitable_inline = ('span', 'a', 'strong', 'em' , 'b', 'i', 'sup', 'sub', 'code', 'kbd', 'samp', 'tt', 'ins', 'del', 'var', 'small', 'big', 'font', 'u'); for (my $child=$node->firstChild; defined $child; $child=$child->nextSibling) { if ($child->nodeType == XML_ELEMENT_NODE) { my $name = $child->nodeName; if ($name eq 'p' || $name eq 'br') { return(1); } elsif (string_in_array(\@splitable_inline, $name)) { if (paragraph_inside($child)) { return(1); } } } } return(0); } # fixes paragraphs inside paragraphs (without a block in-between) sub fix_paragraph { my ($p, $all_block) = @_; my $loop_right = 1; # this loops is to avoid out of memory errors with recurse, see below while ($loop_right) { $loop_right = 0; my $block = find_first_block($p, $all_block); if (defined $block) { my $trees = clone_ancestor_around_node($p, $block); my $doc = $p->ownerDocument; my $replacement = $doc->createDocumentFragment(); my $left = $trees->{'left'}; my $middle = $trees->{'middle'}; my $right = $trees->{'right'}; my $left_needs_p = 0; # 1 if it needs a paragraph (used to replace br later) if (defined $left) { # fix paragraphs inside, in case one of the descendants can have paragraphs inside (like numericalresponse/hintgroup): for (my $child=$left->firstChild; defined $child; $child=$child->nextSibling) { if ($child->nodeType == XML_ELEMENT_NODE) { fix_paragraphs_inside($child, $all_block); } } if (!paragraph_needed($left)) { # this was just blank text, comments or inline responses, it should not create a new paragraph my $next; for (my $child=$left->firstChild; defined $child; $child=$next) { $next = $child->nextSibling; $left->removeChild($child); $replacement->appendChild($child); } } else { $left_needs_p = 1; $replacement->appendChild($left); } } my $n = $middle->firstChild; while (defined $n) { if ($n->nodeType == XML_ELEMENT_NODE && (string_in_array($all_block, $n->nodeName) || $n->nodeName eq 'br')) { if ($n->nodeName eq 'p') { my $parent = $n->parentNode; # first apply recursion fix_paragraph($n, $all_block); # now the p might have been replaced by several nodes, which should replace the initial p my $next_block; for (my $block=$parent->firstChild; defined $block; $block=$next_block) { $next_block = $block->nextSibling; if ($block->nodeName eq 'p') { $parent->removeChild($block); # for each parent before $middle, clone in-between the p and its children (to preserve the styles) if (defined $block->firstChild) { for (my $p=$parent; $p!=$middle; $p=$p->parentNode) { my $newp = $p->cloneNode(0); my $next; for (my $child=$block->firstChild; defined $child; $child=$next) { $next = $child->nextSibling; $block->removeChild($child); $newp->appendChild($child); } $block->appendChild($newp); } } } $replacement->appendChild($block); } } else { # replace the whole p by this block, forgetting about intermediate inline elements $n->parentNode->removeChild($n); if ($n->nodeName eq 'br') { # replace a br by a paragraph if there was nothing before in the paragraph, # otherwise remove it because it already broke the paragraph in half if (!defined $left || !$left_needs_p) { $replacement->appendChild($middle); } } else { fix_paragraphs_inside($n, $all_block); $replacement->appendChild($n); } } last; } $n = $n->firstChild; if (defined $n && defined $n->nextSibling) { die "Error in post_xml.fix_paragraph: block not found"; } } if (defined $right) { if ($block->nodeName eq 'p') { # remove attributes on the right paragraph my @attributelist = $right->attributes(); foreach my $att (@attributelist) { $right->removeAttribute($att->nodeName); } } if ($right->firstChild->nodeType == XML_TEXT_NODE && $right->firstChild->nodeValue =~ /^[ \t\f\n\r]*$/) { # remove the first text node with whitespace only from the p, it should not trigger the creation of a p # (but take nbsp into account, so we should not use \s here) my $first = $right->firstChild; $right->removeChild($first); $replacement->appendChild($first); } if (defined $right->firstChild) { if (paragraph_needed($right)) { $replacement->appendChild($right); #fix_paragraph($right, $all_block); This is taking way too much memory for blocks with many children # -> loop instead of recurse $loop_right = 1; } else { # this was just blank text, comments or inline responses, it should not create a new paragraph my $next; for (my $child=$right->firstChild; defined $child; $child=$next) { $next = $child->nextSibling; $right->removeChild($child); $replacement->appendChild($child); # fix paragraphs inside, in case one of the descendants can have paragraphs inside (like numericalresponse/hintgroup): if ($child->nodeType == XML_ELEMENT_NODE) { fix_paragraphs_inside($child, $all_block); } } } } } $p->parentNode->replaceChild($replacement, $p); if ($loop_right) { $p = $right; } } else { # fix paragraphs inside, in case one of the descendants can have paragraphs inside (like numericalresponse/hintgroup): my $next; for (my $child=$p->firstChild; defined $child; $child=$next) { $next = $child->nextSibling; if ($child->nodeType == XML_ELEMENT_NODE) { fix_paragraphs_inside($child, $all_block); } } } } } sub find_first_block { my ($node, $all_block) = @_; # inline elements that can be split in half if there is a paragraph inside (currently all HTML): my @splitable_inline = ('span', 'a', 'strong', 'em' , 'b', 'i', 'sup', 'sub', 'code', 'kbd', 'samp', 'tt', 'ins', 'del', 'var', 'small', 'big', 'font', 'u'); for (my $child=$node->firstChild; defined $child; $child=$child->nextSibling) { if ($child->nodeType == XML_ELEMENT_NODE) { if (string_in_array($all_block, $child->nodeName) || $child->nodeName eq 'br') { return($child); } if (string_in_array(\@splitable_inline, $child->nodeName)) { my $block = find_first_block($child, $all_block); if (defined $block) { return($block); } } } } return(undef); } # Creates clones of the ancestor containing the descendants before the node, at the node, and after the node. # returns a hash with: left, middle, right (left and right can be undef) sub clone_ancestor_around_node { my ($ancestor, $node) = @_; my $middle_node; my ($left, $middle, $right); for (my $child=$ancestor->firstChild; defined $child; $child=$child->nextSibling) { if ($child == $node || is_ancestor_of($child, $node)) { $middle_node = $child; last; } } if (!defined $middle_node) { die "error in split_ancestor_around_node: middle not found"; } if (defined $middle_node->previousSibling) { $left = $ancestor->cloneNode(0); for (my $child=$ancestor->firstChild; $child != $middle_node; $child=$child->nextSibling) { $left->appendChild($child->cloneNode(1)); } } $middle = $ancestor->cloneNode(0); if ($middle_node == $node) { $middle->appendChild($middle_node->cloneNode(1)); } else { my $subres = clone_ancestor_around_node($middle_node, $node); my $subleft = $subres->{'left'}; if (defined $subleft) { if (!defined $left) { $left = $ancestor->cloneNode(0); } $left->appendChild($subleft); } $middle->appendChild($subres->{'middle'}); my $subright = $subres->{'right'}; if (defined $subright) { $right = $ancestor->cloneNode(0); $right->appendChild($subright); } } if (defined $middle_node->nextSibling) { if (!defined $right) { $right = $ancestor->cloneNode(0); } for (my $child=$middle_node->nextSibling; defined $child; $child=$child->nextSibling) { $right->appendChild($child->cloneNode(1)); } } my %result = (); $result{'left'} = $left; $result{'middle'} = $middle; $result{'right'} = $right; return(\%result); } sub is_ancestor_of { my ($n1, $n2) = @_; my $n = $n2->parentNode; while (defined $n) { if ($n == $n1) { return(1); } $n = $n->parentNode; } return(0); } # removes empty style elements and replaces the ones with only whitespaces inside by their content # also remove hints that have become empty after empty style removal. sub remove_empty_style { my ($root) = @_; # actually, preserve some elements like ins when they have whitespace, only remove if they are empty my @remove_if_empty = ('span', 'strong', 'em' , 'b', 'i', 'sup', 'sub', 'code', 'kbd', 'samp', 'tt', 'ins', 'del', 'var', 'small', 'big', 'font', 'u', 'hint'); my @remove_if_blank = ('span', 'strong', 'em' , 'b', 'i', 'sup', 'sub', 'tt', 'var', 'small', 'big', 'font', 'u', 'hint'); foreach my $name (@remove_if_empty) { my @nodes = $root->getElementsByTagName($name); while (scalar(@nodes) > 0) { my $node = pop(@nodes); if (!defined $node->firstChild) { my $parent = $node->parentNode; if (defined $node->previousSibling && $node->previousSibling->nodeType == XML_TEXT_NODE && $node->previousSibling->nodeValue =~ /\$\S*$/) { # case $a<sup></sup>x my $value = $node->previousSibling->nodeValue; $value =~ s/\$(\S*)$/\$\{$1\}/; $node->previousSibling->setData($value); } $parent->removeChild($node); $parent->normalize(); # now that we removed the node, check if the parent has become an empty style, and so on while (defined $parent && string_in_array(\@remove_if_empty, $parent->nodeName) && !defined $parent->firstChild) { my $grandparent = $parent->parentNode; $grandparent->removeChild($parent); remove_reference_from_array(\@nodes, $parent); $parent = $grandparent; } } } } foreach my $name (@remove_if_blank) { my @nodes = $root->getElementsByTagName($name); while (scalar(@nodes) > 0) { my $node = pop(@nodes); if (defined $node->firstChild && !defined $node->firstChild->nextSibling && $node->firstChild->nodeType == XML_TEXT_NODE) { # NOTE: careful, with UTF-8, \s matches non-breaking spaces and we want to preserve these if ($node->firstChild->nodeValue =~ /^[\t\n\f\r ]*$/) { my $parent = $node->parentNode; replace_by_children($node); $parent->normalize(); # now that we removed the node, check if the parent has become a style with only whitespace, and so on while (defined $parent && string_in_array(\@remove_if_blank, $parent->nodeName) && (!defined $parent->firstChild || (!defined $parent->firstChild->nextSibling && $parent->firstChild->nodeType == XML_TEXT_NODE && $parent->firstChild->nodeValue =~ /^^[\t\n\f\r ]*$/))) { my $grandparent = $parent->parentNode; replace_by_children($parent); remove_reference_from_array(\@nodes, $parent); $parent = $grandparent; } } } } } } # remove whitespace inside LON-CAPA elements that have an empty content-model (HTML ones are handled by html_to_xml) sub fix_empty_lc_elements { my ($node) = @_; my @lcempty = ('arc','axis','backgroundplot','drawoptionlist','drawvectorsum','fill','functionplotrule','functionplotvectorrule','functionplotvectorsumrule','hiddenline','hiddensubmission','key','line','location','organicstructure','parameter','plotobject','plotvector','responseparam','spline','textline'); if (string_in_array(\@lcempty, $node->nodeName)) { if (defined $node->firstChild && !defined $node->firstChild->nextSibling && $node->firstChild->nodeType == XML_TEXT_NODE && $node->firstChild->nodeValue =~ /^\s*$/) { $node->removeChild($node->firstChild); } if (defined $node->firstChild) { print "Warning: a ".$node->nodeName." has something inside\n"; } return; } for (my $child=$node->firstChild; defined $child; $child=$child->nextSibling) { if ($child->nodeType == XML_ELEMENT_NODE) { fix_empty_lc_elements($child); } } } # turn some attribute values into lowercase when they should be sub lowercase_attribute_values { my ($root) = @_; my @with_yesno = (['radiobuttonresponse', ['randomize']], ['optionresponse', ['randomize']], ['matchresponse', ['randomize']], ['itemgroup', ['randomize']], ['rankresponse', ['randomize']], ['functionplotresponse', ['xaxisvisible', 'yaxisvisible', 'gridvisible']], ['backgroundplot', ['fixed']], ['drawvectorsum', ['showvalue']], ['textline', ['readonly']], ['hint', ['showoncorrect']], ['body', ['dir']], ['img', ['encrypturl']], ['form', ['method']], ['input', ['type']] ); foreach my $el_attributes (@with_yesno) { my $el_name = $el_attributes->[0]; my @elements = $root->getElementsByTagName($el_name); foreach my $element (@elements) { my $att_list = $el_attributes->[1]; foreach my $att_name (@$att_list) { my $att_value = $element->getAttribute($att_name); if (!defined $att_value) { next; } if ($att_value eq 'yes' || $att_value eq 'no') { next; } if ($att_value =~ /\s*yes\s*/i) { $element->setAttribute($att_name, 'yes'); } elsif ($att_value =~ /\s*no\s*/i) { $element->setAttribute($att_name, 'no'); } } } } } # fixes spelling mistakes for numericalresponse/@unit sub replace_numericalresponse_unit_attribute { my ($root) = @_; my @numericalresponses = $root->getElementsByTagName('numericalresponse'); foreach my $numericalresponse (@numericalresponses) { if (defined $numericalresponse->getAttribute('units') && !defined $numericalresponse->getAttribute('unit')) { $numericalresponse->setAttribute('unit', $numericalresponse->getAttribute('units')); $numericalresponse->removeAttribute('units'); } } } # Replaces &format and &prettyprint by <num> whenever possible. # Also replaces &chemparse by <chem>. # If the function call is enclosed in <display>, the <display> element is removed. sub replace_functions_by_elements { my ($root) = @_; my $doc = $root->ownerDocument; my @preserve = ('script','answer','parse','m','tm','dtm','numericalhintscript'); # display is handled later my @all = $root->getElementsByTagName('*'); foreach my $element (@all) { if (string_in_array(\@preserve, $element->nodeName)) { next; } my $changed = 0; my $next; for (my $child=$element->firstChild; defined $child; $child=$next) { $next = $child->nextSibling; if ($child->nodeType == XML_TEXT_NODE) { my $value = $child->nodeValue; if ($value =~ /^(.*)&(?:format|prettyprint)$(\$\{?[a-zA-Z0-9]*\}?(?:\[[^\]]*\])?|[0-9.]+)\s?,\s?(["'][,.\$]?[0-9][eEfFgGsS]["']|\$[a-zA-Z0-9]*)$(.*)$/s) { # NOTE: we don't check for &prettyprint's 3rd argument (target), but it has not been seen outside of script elements. # NOTE: the format options ',' and '$' are not supported by &format in current LON-CAPA since rev 1.81 of default_homework.lcpm, # but are supported by &prettyprint; # if we use (like current LON-CAPA) &prettyprint for <num> implementation, it will change a few resulting documents # (by making them display something they were probably intended to display, but which did not). # Usage of <num> with &prettyprint instead of &format might also change the display when there is an exponent. my $before = $1; my $number = $2; my $format = $3; my $after = $4; $format =~ s/^['"]|['"]$//g; # do not change this if the parent is <display> and there are other things before or after &format if ($element->nodeName eq 'display' && (defined $child->previousSibling || defined $next || $before !~ /^\s*$/ || $after !~ /^\s*$/)) { last; } my $replacement = $doc->createDocumentFragment(); my $num = $doc->createElement('num'); $num->setAttribute('format', $format); $num->appendChild($doc->createTextNode($number)); if (length($before) > 0) { $replacement->appendChild($doc->createTextNode($before)); } $replacement->appendChild($num); if (length($after) > 0) { $replacement->appendChild($doc->createTextNode($after)); } $element->replaceChild($replacement, $child); $changed = 1; $next = $element->firstChild; # start over, there might be another &format in the same text node } elsif ($value =~ /^(.*)&chemparse$([^'"()]*|'[^']*'|"[^"]*")$(.*)$/s) { my $before = $1; my $reaction = $2; my $after = $3; $reaction =~ s/^'(.*)'$/$1/; $reaction =~ s/^"(.*)"$/$1/; if ($element->nodeName eq 'display' && (defined $child->previousSibling || defined $next || $before !~ /^\s*$/ || $after !~ /^\s*$/)) { last; } my $replacement = $doc->createDocumentFragment(); my $chem = $doc->createElement('chem'); $chem->appendChild($doc->createTextNode($reaction)); if (length($before) > 0) { $replacement->appendChild($doc->createTextNode($before)); } $replacement->appendChild($chem); if (length($after) > 0) { $replacement->appendChild($doc->createTextNode($after)); } $element->replaceChild($replacement, $child); $changed = 1; $next = $element->firstChild; } } } if ($changed && $element->nodeName eq 'display') { my $first = $element->firstChild; if ($first->nodeType == XML_ELEMENT_NODE && string_in_array(['num','chem'], $first->nodeName) && !defined $first->nextSibling) { # remove useless display element replace_by_children($element); } } } } # pretty-print using im-memory DOM tree sub pretty { my ($node, $all_block, $indent_level) = @_; my $doc = $node->ownerDocument; $indent_level ||= 0; my $type = $node->nodeType; if ($type == XML_ELEMENT_NODE) { my $name = $node->nodeName; if ((string_in_array($all_block, $name) || string_in_array(\@inline_like_block, $name)) && !string_in_array(\@preserve_elements, $name)) { # make sure there is a newline at the beginning and at the end if there is anything inside if (defined $node->firstChild && !string_in_array(\@no_newline_inside, $name)) { my $first = $node->firstChild; if ($first->nodeType == XML_TEXT_NODE) { my $text = $first->nodeValue; if ($text !~ /^ *\n/) { $first->setData("\n" . $text); } } else { $node->insertBefore($doc->createTextNode("\n"), $first); } my $last = $node->lastChild; if ($last->nodeType == XML_TEXT_NODE) { my $text = $last->nodeValue; if ($text !~ /\n *$/) { $last->setData($text . "\n"); } } else { $node->appendChild($doc->createTextNode("\n")); } } # indent and make sure there is a newline before and after a block element my $newline_indent = "\n".(' ' x (2*($indent_level + 1))); my $newline_indent_last = "\n".(' ' x (2*$indent_level)); my $next; for (my $child=$node->firstChild; defined $child; $child=$next) { $next = $child->nextSibling; if ($child->nodeType == XML_ELEMENT_NODE) { if (string_in_array($all_block, $child->nodeName) || string_in_array(\@inline_like_block, $child->nodeName)) { # make sure there is a newline before and after a block element if (defined $child->previousSibling && $child->previousSibling->nodeType == XML_TEXT_NODE) { my $prev = $child->previousSibling; my $text = $prev->nodeValue; if ($text !~ /\n *$/) { $prev->setData($text . $newline_indent); } } else { $node->insertBefore($doc->createTextNode($newline_indent), $child); } if (defined $next && $next->nodeType == XML_TEXT_NODE) { my $text = $next->nodeValue; if ($text !~ /^ *\n/) { $next->setData($newline_indent . $text); } } else { $node->insertAfter($doc->createTextNode($newline_indent), $child); } } pretty($child, $all_block, $indent_level+1); } elsif ($child->nodeType == XML_TEXT_NODE) { my $text = $child->nodeValue; # collapse newlines $text =~ s/\n([\t ]*\n)+/\n/g; # indent and remove spaces and tabs before newlines if (defined $next) { $text =~ s/[\t ]*\n[\t ]*/$newline_indent/ge; } else { $text =~ s/[\t ]*\n[\t ]*/$newline_indent/ge; $text =~ s/[\t ]*\n[\t ]*$/$newline_indent_last/e; } $child->setData($text); } } # removes whitespace at the beginning and end of p td, th and li (except for nbsp at the beginning) my @to_trim = ('p','td','th','li'); if (string_in_array(\@to_trim, $name) && defined $node->firstChild && $node->firstChild->nodeType == XML_TEXT_NODE) { my $text = $node->firstChild->nodeValue; $text =~ s/^[ \t\f\n\r]*//; if ($text eq '') { $node->removeChild($node->firstChild); } else { $node->firstChild->setData($text); } } if (string_in_array(\@to_trim, $name) && defined $node->lastChild && $node->lastChild->nodeType == XML_TEXT_NODE) { my $text = $node->lastChild->nodeValue; $text =~ s/\s*$//; if ($text eq '') { $node->removeChild($node->lastChild); } else { $node->lastChild->setData($text); } } } elsif (string_in_array(\@preserve_elements, $name)) { # collapse newlines at the beginning and the end of scripts if (defined $node->firstChild && $node->firstChild->nodeType == XML_TEXT_NODE) { my $text = $node->firstChild->nodeValue; $text =~ s/^\n( *\n)+/\n/; if ($text eq '') { $node->removeChild($node->firstChild); } else { $node->firstChild->setData($text); } } if (defined $node->lastChild && $node->lastChild->nodeType == XML_TEXT_NODE) { my $text = $node->lastChild->nodeValue; $text =~ s/\n( *\n)+$/\n/; if ($text eq '') { $node->removeChild($node->lastChild); } else { $node->lastChild->setData($text); } } } } } sub replace_tm_dtm { my ($root) = @_; my $doc = $root->ownerDocument; my @elements = $root->getElementsByTagName('tm'); push(@elements, $root->getElementsByTagName('dtm')); foreach my $element (@elements) { my $first = $element->firstChild; if (defined $first && $first->nodeType == XML_TEXT_NODE) { my $text = $first->nodeValue; if ($element->nodeName eq 'tm') { $first->setData('$'.$text.'$'); } else { $first->setData('$$'.$text.'$$'); } } $element->setNodeName('m'); } } ######## utilities ######## ## # Trims a string (really, this should be built-in in Perl, this is ridiculous, ugly and slow) # @param {string} s - the string to trim # @returns the trimmed string ## sub trim { my ($s) = @_; $s =~ s/^\s+//; $s =~ s/\s+$//; return($s); } ## # Tests if a string is in an array (using eq) (to avoid Smartmatch warnings with $value ~~ @array) # @param {Array<string>} array - reference to the array of strings # @param {string} value - the string to look for # @returns 1 if found, 0 otherwise ## sub string_in_array { my ($array, $value) = @_; # NOTE: would it be faster to use "any { $value eq $_ } @{$array}" from List::Util ? foreach my $v (@{$array}) { if ($v eq $value) { return 1; } } return 0; } ## # Tests if an object is in an array (using ==) # @param {Array<Object>} array - reference to the array of references # @param {Object} ref - the reference to look for # @returns 1 if found, 0 otherwise ## sub reference_in_array { my ($array, $ref) = @_; foreach my $v (@{$array}) { if ($v == $ref) { return 1; } } return 0; } ## # returns the index of a string in an array # @param {Array<Object>} array - reference to the array of strings # @param {string} s - the string to look for (using eq) # @returns the index if found, -1 otherwise ## sub index_of_string { my ($array, $s) = @_; for (my $i=0; $i<scalar(@{$array}); $i++) { if ($array->[$i] eq $s) { return $i; } } return -1; } ## # returns the index of a reference in an array # @param {Array<Object>} array - reference to the array of references # @param {Object} ref - the reference to look for # @returns the index if found, -1 otherwise ## sub index_of_reference { my ($array, $ref) = @_; for (my $i=0; $i<scalar(@{$array}); $i++) { if ($array->[$i] == $ref) { return $i; } } return -1; } ## # if found, removes a string from an array, otherwise do nothing # @param {Array<string>} array - reference to the array of string # @param {string} s - the string to look for (using eq) ## sub remove_string_from_array { my ($array, $s) = @_; my $index = index_of_string($array, $s); if ($index != -1) { splice(@$array, $index, 1); } } ## # if found, removes a reference from an array, otherwise do nothing # @param {Array<Object>} array - reference to the array of references # @param {Object} ref - the reference to look for ## sub remove_reference_from_array { my ($array, $ref) = @_; my $index = index_of_reference($array, $ref); if ($index != -1) { splice(@$array, $index, 1); } } ## # replaces a node by its children # @param {Node} node - the DOM node ## sub replace_by_children { my ($node) = @_; my $parent = $node->parentNode; my $next; my $previous; for (my $child=$node->firstChild; defined $child; $child=$next) { $next = $child->nextSibling; if ((!defined $previous || !defined $next) && $child->nodeType == XML_TEXT_NODE && $child->nodeValue =~ /^\s*$/) { next; # do not keep first and last whitespace nodes } else { if (!defined $previous && $child->nodeType == XML_TEXT_NODE) { # remove whitespace at the beginning my $value = $child->nodeValue; $value =~ s/^\s+//; $child->setData($value); } if (!defined $next && $child->nodeType == XML_TEXT_NODE) { # and at the end my $value = $child->nodeValue; $value =~ s/\s+$//; $child->setData($value); } } $node->removeChild($child); $parent->insertBefore($child, $node); $previous = $child; } $parent->removeChild($node); } ## # returns the trimmed attribute value if the attribute exists and is not blank, undef otherwise # @param {Node} node - the DOM node # @param {string} attribute_name - the attribute name ## sub get_non_empty_attribute { my ($node, $attribute_name) = @_; my $value = $node->getAttribute($attribute_name); if (defined $value && $value !~ /^\s*$/) { $value = trim($value); return($value); } return(undef); } ## # Returns a CSS property value from the style attribute of the element, or undef if not defined # @param {Element} el - the DOM element # @param {string} property_name - the CSS property name ## sub get_css_property { my ($el, $property_name) = @_; my $style = $el->getAttribute('style'); if (defined $style) { $style =~ s/^\s*;\s*//; $style =~ s/\s*;\s*$//; } else { $style = ''; } my @pairs = split(';', $style); foreach my $pair (@pairs) { my @name_value = split(':', $pair); if (scalar(@name_value) != 2) { next; } my $name = trim($name_value[0]); my $value = trim($name_value[1]); if (lc($name) eq $property_name) { return($value); # return the first one found } } return(undef); } ## # Returns the reference to a hash CSS property name => value from the style attribute of the element. # Returns an empty list if the style attribute is not defined, # @param {Element} el - the DOM element # @return {Hash<string, string>} reference to the hash property name => property value ## sub get_css_properties { my ($el) = @_; my $style = $el->getAttribute('style'); if (defined $style) { $style =~ s/^\s*;\s*//; $style =~ s/\s*;\s*$//; } else { $style = ''; } my @pairs = split(';', $style); tie (my %hash, 'Tie::IxHash', ()); foreach my $pair (@pairs) { my @name_value = split(':', $pair); if (scalar(@name_value) != 2) { next; } my $name = trim($name_value[0]); my $value = trim($name_value[1]); if (defined $hash{$name}) { # duplicate property in the style attribute: keep only the last one delete $hash{$name}; } $hash{$name} = $value; } return(\%hash); } ## # Sets a CSS property in the style attribute of an element # @param {Element} el - the DOM element # @param {string} property_name - the CSS property name # @param {string} property_value - the CSS property value ## sub set_css_property { my ($el, $property_name, $property_value) = @_; my $hash_ref = { $property_name => $property_value }; set_css_properties($el, $hash_ref); } ## # Sets several CSS properties in the style attribute of an element # @param {Element} el - the DOM element # @param {Hash<string, string>} properties - reference to the hash property name => property value ## sub set_css_properties { my ($el, $properties) = @_; my $hash = get_css_properties($el); foreach my $property_name (keys %$properties) { my $property_value = $properties->{$property_name}; if (defined $hash->{$property_name}) { delete $hash->{$property_name}; # to add the new one at the end } $hash->{$property_name} = $property_value; } my $style = ''; foreach my $key (keys %$hash) { $style .= $key.':'.$hash->{$key}.'; '; } $style =~ s/; $//; $el->setAttribute('style', $style); } 1; __END__