Annotation of loncom/homework/cleanxml/xml_to_loncapa.pm, revision 1.10
1.1 damieng 1: # The LearningOnline Network
2: # convert_file takes a well-formed XML file content and converts it to LON-CAPA syntax.
3: #
1.10 ! damieng 4: # $Id: xml_to_loncapa.pm,v 1.9 2016/01/21 22:11:13 damieng Exp $
1.1 damieng 5: #
6: # Copyright Michigan State University Board of Trustees
7: #
8: # This file is part of the LearningOnline Network with CAPA (LON-CAPA).
9: #
10: # LON-CAPA is free software; you can redistribute it and/or modify
11: # it under the terms of the GNU General Public License as published by
12: # the Free Software Foundation; either version 2 of the License, or
13: # (at your option) any later version.
14: #
15: # LON-CAPA is distributed in the hope that it will be useful,
16: # but WITHOUT ANY WARRANTY; without even the implied warranty of
17: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18: # GNU General Public License for more details.
19: #
20: # You should have received a copy of the GNU General Public License
21: # along with LON-CAPA; if not, write to the Free Software
22: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23: #
24: # /home/httpd/html/adm/gpl.txt
25: #
26: # http://www.lon-capa.org/
27: #
28: ###
29:
30: #!/usr/bin/perl
31:
32: package Apache::xml_to_loncapa;
33:
34: use strict;
35: use utf8;
36: use warnings;
37:
38: use XML::LibXML;
39:
40:
1.8 damieng 41: # LON-CAPA block elements that cannot be found within startouttext/endouttext
42: my @loncapa_block = ('parameter','location','answer','foil','image','polygon','rectangle','text','conceptgroup','itemgroup','item','label','data','function','array','unit','answergroup','functionplotresponse','functionplotruleset','functionplotelements','functionplotcustomrule','essayresponse','hintpart','formulahint','numericalhint','reactionhint','organichint','optionhint','radiobuttonhint','stringhint','customhint','mathhint','formulahintcondition','numericalhintcondition','reactionhintcondition','organichintcondition','optionhintcondition','radiobuttonhintcondition','stringhintcondition','customhintcondition','mathhintcondition','imageresponse','foilgroup','datasubmission','textfield','hiddensubmission','radiobuttonresponse','rankresponse','matchresponse','import','style','script','window','block','library','notsolved','part','postanswerdate','preduedate','problem','problemtype','randomlabel','bgimg','labelgroup','randomlist','solved','while','tex','print','web','gnuplot','curve','Task','IntroParagraph','ClosingParagraph','Question','QuestionText','Setup','Instance','InstanceText','Criteria','CriteriaText','GraderNote','languageblock','instructorcomment','dataresponse','togglebox','standalone','comment','drawimage','allow','displayduedate','displaytitle','responseparam','organicstructure','scriptlib','parserlib','drawoptionlist','spline','backgroundplot','plotobject','plotvector','drawvectorsum','functionplotrule','functionplotvectorrule','functionplotvectorsumrule','axis','key','xtics','ytics','title','xlabel','ylabel','hiddenline','dtm','stringresponse','optionresponse','numericalresponse','formularesponse','mathresponse','organicresponse','reactionresponse','customresponse','externalresponse', 'hint', 'hintgroup');
1.1 damieng 43:
1.8 damieng 44: # LON-CAPA elements that can be found within startouttext/endouttext:
45: my @loncapa_in_text = ('display','m','lm','chem','num','parse','algebra','displayweight','displaystudentphoto','translated','lang'); # not textline
1.1 damieng 46:
47: # HTML elements that trigger the addition of startouttext/endouttext
1.9 damieng 48: my @html_trigger = ('header','footer','aside','h1','h2','h3','h4','h5','h6','li','dd','dt','tbody','tr','caption','thead','tfoot','td','th','span','a','em','strong','b','i','sup','sub','pre','code','kbd','samp','cite','q','tt','ins','del','var','small','big','br','hr','address','blockquote','figure','figcaption','object','param','embed','applet','video','source','audio','map','area','canvas','form','input','select','optgroup','option','textarea','fieldset','legend','button','iframe','section','div','p','ul','ol','dl','table');
1.1 damieng 49:
1.9 damieng 50: my @simple_data = ('polygon', 'rectangle', 'vector', 'value', 'answer', 'title', 'data', 'function', 'xlabel', 'ylabel', 'tic', 'parserlib', 'scriptlib', 'import', 'tex', 'text', 'image', 'display', 'm', 'lm', 'num', 'algebra', 'chem', 'parse', 'title', 'style', 'script', 'ins', 'del', 'label', 'option', 'textarea', 'legend','comment');
1.1 damieng 51:
52: my @inline_responses = ('stringresponse','optionresponse','numericalresponse','formularesponse','mathresponse','organicresponse','reactionresponse','customresponse','externalresponse');
53:
1.4 damieng 54: # see http://www.w3.org/TR/html-polyglot/#empty-elements
55: # and http://tiffanybbrown.com/2011/03/23/html5-does-not-allow-self-closing-tags/
56: # HTML elements that do not have an empty content, and must never use a self-closing tag:
57: my @non_empty_html = ('title','style','script','noscript','body','section','header','footer','article','aside','nav','h1','h2','h3','h4','h5','h6','div','p','li','dt','dd','caption','td','th','span','a','em','strong','b','i','sup','sub','pre','code','kbd','samp','cite','q','tt','ins','del','var','small','big','address','blockquote','bdo','ruby','rb','rp','rt','rtc','figure','figcaption','object','applet','video','audio','canvas','label','option','textarea','fieldset','legend','button','iframe');
58:
1.1 damieng 59:
60: # Converts a file and return the modified contents
61: sub convert_file {
62: my ($contents) = @_;
63:
64: my $dom_doc = XML::LibXML->load_xml(string => $contents);
1.5 damieng 65: my $root = $dom_doc->documentElement();
66: if (defined $root && $root->nodeName ne 'html') {
67: add_outtext($dom_doc);
68: }
1.1 damieng 69: return node_to_string($dom_doc);
70: }
71:
72:
73: sub node_to_string {
74: my ($node) = @_;
75:
76: if ($node->nodeType == XML_DOCUMENT_NODE) {
77: my $root = $node->documentElement();
78: return node_to_string($root);
79: } elsif ($node->nodeType == XML_TEXT_NODE || $node->nodeType == XML_CDATA_SECTION_NODE) {
80: my $parent = $node->parentNode;
81: my $parent_name = $parent->nodeName;
82: my $grandparent_name;
83: if (defined $parent->parentNode) {
84: $grandparent_name = $parent->parentNode->nodeName;
85: }
1.6 damieng 86: my @no_escape = ('m', 'script', 'style', 'display', 'parse', 'answer');
1.1 damieng 87: if (string_in_array(\@no_escape, $parent_name) &&
88: ($parent_name ne 'answer' ||
89: (defined $grandparent_name &&
90: $grandparent_name ne 'numericalresponse' &&
91: $grandparent_name ne 'formularesponse'))) {
92: return $node->nodeValue;
93: } else {
94: return $node->toString();
95: }
96: } elsif ($node->nodeType == XML_ELEMENT_NODE) {
97: my $s = '';
98: my $tag = $node->nodeName;
99: $s .= "<$tag";
100: my @attributes = $node->attributes();
101: foreach my $attribute (@attributes) {
102: $s .= ' ';
103: $s .= $attribute->nodeName;
104: $s .= '="';
1.2 damieng 105: $s .= escape_attribute($attribute->nodeValue);
1.1 damieng 106: $s .= '"';
107: }
1.4 damieng 108: if ($node->hasChildNodes() || string_in_array(\@non_empty_html, $tag)) {
1.1 damieng 109: $s .= '>';
110: foreach my $child ($node->childNodes) {
111: $s .= node_to_string($child);
112: }
113: $s .= "</$tag>";
114: } else {
115: $s .= '/>';
116: }
117: return $s;
118: } else {
119: return $node->toString();
120: }
121: }
122:
1.2 damieng 123: # Escapes an attribute value
124: sub escape_attribute {
1.1 damieng 125: my ($s) = @_;
1.2 damieng 126: # normal XML escapes do not work with LON-CAPA, for instance with reactionresponse
127: #$s =~ s/&/&/sg;
128: #$s =~ s/</</sg;
129: #$s =~ s/>/>/sg;
130: $s =~ s/"/"/sg;
1.1 damieng 131: return $s;
132: }
133:
134: # Adds startouttext and endouttext where useful for the colorful editor
135: sub add_outtext {
136: my ($node) = @_;
137:
138: if ($node->nodeType == XML_DOCUMENT_NODE) {
139: my $root = $node->documentElement();
140: add_outtext($root);
141: return;
142: }
143: if ($node->nodeType != XML_ELEMENT_NODE) {
144: return;
145: }
146: if (string_in_array(\@simple_data, $node->nodeName)) {
147: return;
148: }
149: convert_paragraphs($node);
1.8 damieng 150: if ($node->nodeName eq 'hintgroup' && !defined $node->firstChild) {
151: # empty hintgroup: colorful editor needs start/end outtext
152: add_endouttext($node, undef);
153: add_startouttext($node, $node->firstChild);
154: }
1.1 damieng 155: my $next;
156: my $in_outtext = 0;
157: for (my $child=$node->firstChild; defined $child; $child=$next) {
158: $next = $child->nextSibling;
159: if (!$in_outtext && inside_outtext($child)) {
160: add_startouttext($node, $child);
161: $in_outtext = 1;
162: } elsif ($in_outtext && !continue_outtext($child)) {
163: add_endouttext($node, $child);
164: $in_outtext = 0;
165: }
166: if (!$in_outtext) {
167: add_outtext($child);
168: }
169: }
170: if ($in_outtext) {
171: add_endouttext($node);
172: }
173: }
174:
175: # Returns 1 if this node should trigger the addition of startouttext before it
176: sub inside_outtext {
177: my ($node) = @_;
178: if ($node->nodeType == XML_TEXT_NODE && $node->nodeValue !~ /^\s*$/) {
179: return 1;
180: }
181: if ($node->nodeType == XML_ELEMENT_NODE && string_in_array(\@html_trigger, $node->nodeName)) {
182: if (contains_loncapa_block($node)) {
183: return 0;
184: } else {
185: return 1;
186: }
187: }
1.8 damieng 188: if ($node->nodeType == XML_ELEMENT_NODE && string_in_array(\@loncapa_in_text, $node->nodeName)) {
1.1 damieng 189: return 1;
190: }
191: return 0;
192: }
193:
194: # Returns 1 if the outtext environment can continue with this node
195: sub continue_outtext {
196: my ($node) = @_;
197: if (inside_outtext($node)) {
198: return 1;
199: }
200: if ($node->nodeType == XML_TEXT_NODE) {
201: return 1; # continue even if this is just spaces
202: }
203: return 0;
204: }
205:
206: # Returns 1 if the node contains a LON-CAPA block in a descendant.
207: sub contains_loncapa_block {
208: my ($node) = @_;
209: foreach my $child ($node->childNodes) {
210: if ($child->nodeType == XML_ELEMENT_NODE) {
211: if (string_in_array(\@loncapa_block, $child->nodeName)) {
212: return 1;
213: }
214: if (contains_loncapa_block($child)) {
215: return 1;
216: }
217: }
218: }
219: return 0;
220: }
221:
222: sub add_startouttext {
223: my ($parent, $before_node) = @_;
224: my $doc = $parent->ownerDocument;
225: if ($before_node->nodeType == XML_TEXT_NODE) {
226: # split space at the beginning of the node
227: if ($before_node->nodeValue =~ /^(\s+)(.*?)$/s) {
228: my $space_node = $doc->createTextNode($1);
229: $before_node->setData($2);
230: $parent->insertBefore($space_node, $before_node);
231: }
232: }
233: my $startouttext = $doc->createElement('startouttext');
234: $parent->insertBefore($startouttext, $before_node);
235: }
236:
237: sub add_endouttext {
238: my ($parent, $before_node) = @_;
239: my $doc = $parent->ownerDocument;
240: my $endouttext = $doc->createElement('endouttext');
241: my $before_before;
242: if (defined $before_node) {
243: $before_before = $before_node->previousSibling;
244: } else {
245: $before_before = $parent->lastChild;
246: }
247: if (defined $before_before && $before_before->nodeType == XML_TEXT_NODE) {
248: # split space at the end of the node
249: if ($before_before->nodeValue =~ /^(.*?)(\s+)$/s) {
250: $before_before->setData($1);
251: my $space_node = $doc->createTextNode($2);
252: if (defined $before_node) {
253: $parent->insertBefore($space_node, $before_node);
254: } else {
255: $parent->appendChild($space_node);
256: }
257: $before_node = $space_node;
258: }
259: }
260: if (defined $before_node) {
261: $parent->insertBefore($endouttext, $before_node);
262: } else {
263: $parent->appendChild($endouttext);
264: }
1.7 damieng 265: # replace spaces afterwards by a \n + indentation
266: my $next = $endouttext->nextSibling;
267: if (defined $next && $next->nodeType == XML_TEXT_NODE) {
268: my $v = $next->nodeValue;
269: if ($v =~ /^ /) {
270: $v =~ s/^ +//;
271: if ($parent->firstChild->nodeType == XML_TEXT_NODE &&
272: $parent->firstChild->nodeValue =~ /^\n +$/) {
273: $v = $parent->firstChild->nodeValue.$v;
274: } else {
275: $v = "\n".$v;
276: }
277: $next->setData($v);
278: }
279: }
1.1 damieng 280: }
281:
282: # Convert paragraph children when one contains an inline response into content + <br>
283: # (the colorful editor does not support paragraphs containing inline responses)
284: sub convert_paragraphs {
285: my ($parent) = @_;
286: my $p_child_with_inline_response = 0;
287: foreach my $child ($parent->childNodes) {
288: if ($child->nodeType == XML_ELEMENT_NODE && $child->nodeName eq 'p') {
289: foreach my $child2 ($child->childNodes) {
290: if ($child2->nodeType == XML_ELEMENT_NODE) {
291: if (string_in_array(\@inline_responses, $child2->nodeName)) {
292: $p_child_with_inline_response = 1;
293: last;
294: }
295: }
296: }
297: }
298: if ($p_child_with_inline_response) {
299: last;
300: }
301: }
302: if ($p_child_with_inline_response) {
303: my $doc = $parent->ownerDocument;
304: my $next;
305: for (my $child=$parent->firstChild; defined $child; $child=$next) {
306: $next = $child->nextSibling;
307: if ($child->nodeType == XML_ELEMENT_NODE && $child->nodeName eq 'p') {
308: replace_by_children($child);
309: if (defined $next && (defined $next->nextSibling || $next->nodeType != XML_TEXT_NODE ||
310: $next->nodeValue !~ /^\s*$/)) {
311: # we only add a br if there is something after
312: my $br = $doc->createElement('br');
313: $parent->insertBefore($br, $next);
1.10 ! damieng 314: # add another br to make up for the p margin
! 315: $br = $doc->createElement('br');
! 316: $parent->insertBefore($br, $next);
1.1 damieng 317: }
318: }
319: }
320: }
321: }
322:
323: ##
324: # Tests if a string is in an array (using eq) (to avoid Smartmatch warnings with $value ~~ @array)
325: # @param {Array<string>} array - reference to the array of strings
326: # @param {string} value - the string to look for
327: # @returns 1 if found, 0 otherwise
328: ##
329: sub string_in_array {
330: my ($array, $value) = @_;
331: foreach my $v (@{$array}) {
332: if ($v eq $value) {
333: return 1;
334: }
335: }
336: return 0;
337: }
338:
339: ##
340: # replaces a node by its children
341: # @param {Node} node - the DOM node
342: ##
343: sub replace_by_children {
344: my ($node) = @_;
345: my $parent = $node->parentNode;
346: my $next;
347: my $previous;
348: for (my $child=$node->firstChild; defined $child; $child=$next) {
349: $next = $child->nextSibling;
350: if ((!defined $previous || !defined $next) &&
351: $child->nodeType == XML_TEXT_NODE && $child->nodeValue =~ /^\s*$/) {
352: next; # do not keep first and last whitespace nodes
353: } else {
354: if (!defined $previous && $child->nodeType == XML_TEXT_NODE) {
355: # remove whitespace at the beginning
356: my $value = $child->nodeValue;
357: $value =~ s/^\s+//;
358: $child->setData($value);
359: }
360: if (!defined $next && $child->nodeType == XML_TEXT_NODE) {
361: # and at the end
362: my $value = $child->nodeValue;
363: $value =~ s/\s+$//;
364: $child->setData($value);
365: }
366: }
367: $node->removeChild($child);
368: $parent->insertBefore($child, $node);
369: $previous = $child;
370: }
371: $parent->removeChild($node);
372: }
373:
374: 1;
375: __END__
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>