Annotation of modules/damieng/clean_xml/xml_to_loncapa.pl, revision 1.3
1.1 damieng 1: #!/usr/bin/perl
2:
3: # This takes a well-formed XML file as input, and converts it to LON-CAPA syntax.
4:
5: use strict;
6: use utf8;
7: use warnings;
8:
9: use XML::LibXML;
10:
1.2 damieng 11:
12: my @loncapa_block = ('parameter','location','answer','foil','image','polygon','rectangle','text','conceptgroup','itemgroup','item','label','data','function','array','unit','answergroup','functionplotresponse','functionplotruleset','functionplotelements','functionplotcustomrule','essayresponse','hintpart','formulahint','numericalhint','reactionhint','organichint','optionhint','radiobuttonhint','stringhint','customhint','mathhint','formulahintcondition','numericalhintcondition','reactionhintcondition','organichintcondition','optionhintcondition','radiobuttonhintcondition','stringhintcondition','customhintcondition','mathhintcondition','imageresponse','foilgroup','datasubmission','textfield','hiddensubmission','radiobuttonresponse','rankresponse','matchresponse','import','style','script','window','block','library','notsolved','part','postanswerdate','preduedate','problem','problemtype','randomlabel','bgimg','labelgroup','randomlist','solved','while','tex','print','web','gnuplot','curve','Task','IntroParagraph','ClosingParagraph','Question','QuestionText','Setup','Instance','InstanceText','Criteria','CriteriaText','GraderNote','languageblock','translated','lang','instructorcomment','dataresponse','togglebox','standalone','comment','drawimage','allow','displayduedate','displaytitle','responseparam','organicstructure','scriptlib','parserlib','drawoptionlist','spline','backgroundplot','plotobject','plotvector','drawvectorsum','functionplotrule','functionplotvectorrule','functionplotvectorsumrule','axis','key','xtics','ytics','title','xlabel','ylabel','hiddenline','dtm','stringresponse','optionresponse','numericalresponse','formularesponse','mathresponse','organicresponse','reactionresponse','customresponse','externalresponse', 'hint', 'hintgroup');
13:
14: my @loncapa_inline = ('display','m','lm','chem','num','parse','algebra','displayweight','displaystudentphoto'); # not textline
15:
1.3 ! damieng 16: # HTML elements that trigger the addition of startouttext/endouttext
! 17: my @html_trigger = ('header','footer','aside','h1','h2','h3','h4','h5','h6','li','dd','dt','tbody','tr','caption','thead','tfoot','td','th','span','a','em','strong','b','i','sup','sub','pre','code','kbd','samp','cite','q','tt','ins','del','var','small','big','br','hr','address','blockquote','img','figure','figcaption','object','param','embed','applet','video','source','audio','map','area','canvas','form','input','select','optgroup','option','textarea','fieldset','legend','button','iframe','section','div','p','ul','ol','dl','table');
1.2 damieng 18:
1.3 ! damieng 19: my @simple_data = ('polygon', 'rectangle', 'vector', 'value', 'answer', 'title', 'data', 'function', 'xlabel', 'ylabel', 'tic', 'parserlib', 'scriptlib', 'import', 'tex', 'text', 'image', 'display', 'm', 'lm', 'num', 'algebra', 'chem', 'parse', 'title', 'style', 'script', 'ins', 'del', 'label', 'option', 'textarea', 'legend' );
! 20:
! 21: my @inline_responses = ('stringresponse','optionresponse','numericalresponse','formularesponse','mathresponse','organicresponse','reactionresponse','customresponse','externalresponse');
1.2 damieng 22:
23:
1.1 damieng 24: binmode(STDOUT, ':encoding(UTF-8)');
25:
26: if (scalar(@ARGV) != 1) {
27: print STDERR "Usage: perl xml_to_loncapa.pl file.xml\n";
28: exit(1);
29: }
30:
31: # find the command-line argument encoding
32: use I18N::Langinfo qw(langinfo CODESET);
33: my $codeset = langinfo(CODESET);
34: use Encode qw(decode);
35: @ARGV = map { decode $codeset, $_ } @ARGV;
36:
37: my $pathname = "$ARGV[0]";
38: if (-f $pathname) {
39: convert_file($pathname);
40: }
41:
42: # Converts a file, creating a .loncapa file in the same directory.
43: # TODO: use the right extension based on content (or just ouput content)
44: sub convert_file {
45: my ($pathname) = @_;
46:
47: # create a name for the new file
48: my $newpath = $pathname.'.loncapa';
49:
50: print "converting $pathname...\n";
51:
52: my $dom_doc = XML::LibXML->load_xml(location => $pathname);
53:
54: open my $out, '>:encoding(UTF-8)', $newpath;
1.2 damieng 55: add_outtext($dom_doc);
1.1 damieng 56: print $out node_to_string($dom_doc);
57: close $out;
58: }
59:
60: sub node_to_string {
61: my ($node) = @_;
62:
63: if ($node->nodeType == XML_DOCUMENT_NODE) {
64: my $root = $node->documentElement();
65: return node_to_string($root);
66: } elsif ($node->nodeType == XML_TEXT_NODE || $node->nodeType == XML_CDATA_SECTION_NODE) {
67: my $parent = $node->parentNode;
68: my $parent_name = $parent->nodeName;
69: my $grandparent_name;
70: if (defined $parent->parentNode) {
71: $grandparent_name = $parent->parentNode->nodeName;
72: }
73: my @no_escape = ('m', 'script', 'display', 'parse', 'answer');
74: if (string_in_array(\@no_escape, $parent_name) &&
75: ($parent_name ne 'answer' ||
76: (defined $grandparent_name &&
77: $grandparent_name ne 'numericalresponse' &&
78: $grandparent_name ne 'formularesponse'))) {
79: return $node->nodeValue;
80: } else {
81: return $node->toString();
82: }
83: } elsif ($node->nodeType == XML_ELEMENT_NODE) {
84: my $s = '';
85: my $tag = $node->nodeName;
86: $s .= "<$tag";
87: my @attributes = $node->attributes();
88: foreach my $attribute (@attributes) {
89: $s .= ' ';
90: $s .= $attribute->nodeName;
91: $s .= '="';
92: $s .= escape($attribute->nodeValue);
93: $s .= '"';
94: }
95: if ($node->hasChildNodes()) {
96: $s .= '>';
97: foreach my $child ($node->childNodes) {
98: $s .= node_to_string($child);
99: }
100: $s .= "</$tag>";
101: } else {
102: $s .= '/>';
103: }
104: return $s;
105: } else {
106: return $node->toString();
107: }
108: }
109:
110: # Escapes a string for LON-CAPA output (used for text nodes, not attribute values)
111: sub escape {
112: my ($s) = @_;
113: $s =~ s/&/&/sg;
114: $s =~ s/</</sg;
115: $s =~ s/>/>/sg;
116: # quot and apos do not need to be escaped outside attribute values
117: return $s;
118: }
119:
1.3 ! damieng 120: # Adds startouttext and endouttext where useful for the colorful editor
1.2 damieng 121: sub add_outtext {
122: my ($node) = @_;
123:
124: if ($node->nodeType == XML_DOCUMENT_NODE) {
125: my $root = $node->documentElement();
126: add_outtext($root);
127: return;
128: }
129: if ($node->nodeType != XML_ELEMENT_NODE) {
130: return;
131: }
132: if (string_in_array(\@simple_data, $node->nodeName)) {
133: return;
134: }
1.3 ! damieng 135: convert_paragraphs($node);
1.2 damieng 136: my $next;
137: my $in_outtext = 0;
138: for (my $child=$node->firstChild; defined $child; $child=$next) {
139: $next = $child->nextSibling;
140: if (!$in_outtext && inside_outtext($child)) {
1.3 ! damieng 141: add_startouttext($node, $child);
1.2 damieng 142: $in_outtext = 1;
143: } elsif ($in_outtext && !continue_outtext($child)) {
144: add_endouttext($node, $child);
145: $in_outtext = 0;
146: }
147: if (!$in_outtext) {
148: add_outtext($child);
149: }
150: }
151: if ($in_outtext) {
152: add_endouttext($node);
153: }
154: }
155:
156: # Returns 1 if this node should trigger the addition of startouttext before it
157: sub inside_outtext {
158: my ($node) = @_;
159: if ($node->nodeType == XML_TEXT_NODE && $node->nodeValue !~ /^\s*$/) {
160: return 1;
161: }
1.3 ! damieng 162: if ($node->nodeType == XML_ELEMENT_NODE && string_in_array(\@html_trigger, $node->nodeName)) {
1.2 damieng 163: if (contains_loncapa_block($node)) {
164: return 0;
165: } else {
166: return 1;
167: }
168: }
169: if ($node->nodeType == XML_ELEMENT_NODE && string_in_array(\@loncapa_inline, $node->nodeName)) {
170: return 1;
171: }
172: return 0;
173: }
174:
175: # Returns 1 if the outtext environment can continue with this node
176: sub continue_outtext {
177: my ($node) = @_;
178: if (inside_outtext($node)) {
179: return 1;
180: }
181: if ($node->nodeType == XML_TEXT_NODE) {
182: return 1; # continue even if this is just spaces
183: }
184: return 0;
185: }
186:
187: # Returns 1 if the node contains a LON-CAPA block in a descendant.
188: sub contains_loncapa_block {
189: my ($node) = @_;
190: foreach my $child ($node->childNodes) {
191: if ($child->nodeType == XML_ELEMENT_NODE) {
192: if (string_in_array(\@loncapa_block, $child->nodeName)) {
193: return 1;
194: }
195: if (contains_loncapa_block($child)) {
196: return 1;
197: }
198: }
199: }
200: return 0;
201: }
202:
1.3 ! damieng 203: sub add_startouttext {
! 204: my ($parent, $before_node) = @_;
! 205: my $doc = $parent->ownerDocument;
! 206: if ($before_node->nodeType == XML_TEXT_NODE) {
! 207: # split space at the beginning of the node
! 208: if ($before_node->nodeValue =~ /^(\s+)(.*?)$/s) {
! 209: my $space_node = $doc->createTextNode($1);
! 210: $before_node->setData($2);
! 211: $parent->insertBefore($space_node, $before_node);
! 212: }
! 213: }
! 214: my $startouttext = $doc->createElement('startouttext');
! 215: $parent->insertBefore($startouttext, $before_node);
! 216: }
! 217:
1.2 damieng 218: sub add_endouttext {
219: my ($parent, $before_node) = @_;
220: my $doc = $parent->ownerDocument;
221: my $endouttext = $doc->createElement('endouttext');
222: my $before_before;
223: if (defined $before_node) {
224: $before_before = $before_node->previousSibling;
225: } else {
226: $before_before = $parent->lastChild;
227: }
228: if (defined $before_before && $before_before->nodeType == XML_TEXT_NODE) {
1.3 ! damieng 229: # split space at the end of the node
1.2 damieng 230: if ($before_before->nodeValue =~ /^(.*?)(\s+)$/s) {
231: $before_before->setData($1);
232: my $space_node = $doc->createTextNode($2);
233: if (defined $before_node) {
234: $parent->insertBefore($space_node, $before_node);
235: } else {
236: $parent->appendChild($space_node);
237: }
238: $before_node = $space_node;
239: }
240: }
241: if (defined $before_node) {
242: $parent->insertBefore($endouttext, $before_node);
243: } else {
244: $parent->appendChild($endouttext);
245: }
246: }
247:
1.3 ! damieng 248: # Convert paragraph children when one contains an inline response into content + <br>
! 249: # (the colorful editor does not support paragraphs containing inline responses)
! 250: sub convert_paragraphs {
! 251: my ($parent) = @_;
! 252: my $p_child_with_inline_response = 0;
! 253: foreach my $child ($parent->childNodes) {
! 254: if ($child->nodeType == XML_ELEMENT_NODE && $child->nodeName eq 'p') {
! 255: foreach my $child2 ($child->childNodes) {
! 256: if ($child2->nodeType == XML_ELEMENT_NODE) {
! 257: if (string_in_array(\@inline_responses, $child2->nodeName)) {
! 258: $p_child_with_inline_response = 1;
! 259: last;
! 260: }
! 261: }
! 262: }
! 263: }
! 264: if ($p_child_with_inline_response) {
! 265: last;
! 266: }
! 267: }
! 268: if ($p_child_with_inline_response) {
! 269: my $doc = $parent->ownerDocument;
! 270: my $next;
! 271: for (my $child=$parent->firstChild; defined $child; $child=$next) {
! 272: $next = $child->nextSibling;
! 273: if ($child->nodeType == XML_ELEMENT_NODE && $child->nodeName eq 'p') {
! 274: replace_by_children($child);
! 275: if (defined $next && (defined $next->nextSibling || $next->nodeType != XML_TEXT_NODE ||
! 276: $next->nodeValue !~ /^\s*$/)) {
! 277: # we only add a br if there is something after
! 278: my $br = $doc->createElement('br');
! 279: $parent->insertBefore($br, $next);
! 280: }
! 281: }
! 282: }
! 283: }
! 284: }
! 285:
1.1 damieng 286: ##
287: # Tests if a string is in an array (using eq) (to avoid Smartmatch warnings with $value ~~ @array)
288: # @param {Array<string>} array - reference to the array of strings
289: # @param {string} value - the string to look for
290: # @returns 1 if found, 0 otherwise
291: ##
292: sub string_in_array {
293: my ($array, $value) = @_;
294: foreach my $v (@{$array}) {
295: if ($v eq $value) {
296: return 1;
297: }
298: }
299: return 0;
300: }
1.3 ! damieng 301:
! 302: ##
! 303: # replaces a node by its children
! 304: # @param {Node} node - the DOM node
! 305: ##
! 306: sub replace_by_children {
! 307: my ($node) = @_;
! 308: my $parent = $node->parentNode;
! 309: my $next;
! 310: my $previous;
! 311: for (my $child=$node->firstChild; defined $child; $child=$next) {
! 312: $next = $child->nextSibling;
! 313: if ((!defined $previous || !defined $next) &&
! 314: $child->nodeType == XML_TEXT_NODE && $child->nodeValue =~ /^\s*$/) {
! 315: next; # do not keep first and last whitespace nodes
! 316: } else {
! 317: if (!defined $previous && $child->nodeType == XML_TEXT_NODE) {
! 318: # remove whitespace at the beginning
! 319: my $value = $child->nodeValue;
! 320: $value =~ s/^\s+//;
! 321: $child->setData($value);
! 322: }
! 323: if (!defined $next && $child->nodeType == XML_TEXT_NODE) {
! 324: # and at the end
! 325: my $value = $child->nodeValue;
! 326: $value =~ s/\s+$//;
! 327: $child->setData($value);
! 328: }
! 329: }
! 330: $node->removeChild($child);
! 331: $parent->insertBefore($child, $node);
! 332: $previous = $child;
! 333: }
! 334: $parent->removeChild($node);
! 335: }
! 336:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>