Annotation of modules/damieng/clean_xml/xml_to_loncapa.pl, revision 1.2
1.1 damieng 1: #!/usr/bin/perl
2:
3: # This takes a well-formed XML file as input, and converts it to LON-CAPA syntax.
4:
5: use strict;
6: use utf8;
7: use warnings;
8:
9: use XML::LibXML;
10:
1.2 ! damieng 11:
! 12: my @loncapa_block = ('parameter','location','answer','foil','image','polygon','rectangle','text','conceptgroup','itemgroup','item','label','data','function','array','unit','answergroup','functionplotresponse','functionplotruleset','functionplotelements','functionplotcustomrule','essayresponse','hintpart','formulahint','numericalhint','reactionhint','organichint','optionhint','radiobuttonhint','stringhint','customhint','mathhint','formulahintcondition','numericalhintcondition','reactionhintcondition','organichintcondition','optionhintcondition','radiobuttonhintcondition','stringhintcondition','customhintcondition','mathhintcondition','imageresponse','foilgroup','datasubmission','textfield','hiddensubmission','radiobuttonresponse','rankresponse','matchresponse','import','style','script','window','block','library','notsolved','part','postanswerdate','preduedate','problem','problemtype','randomlabel','bgimg','labelgroup','randomlist','solved','while','tex','print','web','gnuplot','curve','Task','IntroParagraph','ClosingParagraph','Question','QuestionText','Setup','Instance','InstanceText','Criteria','CriteriaText','GraderNote','languageblock','translated','lang','instructorcomment','dataresponse','togglebox','standalone','comment','drawimage','allow','displayduedate','displaytitle','responseparam','organicstructure','scriptlib','parserlib','drawoptionlist','spline','backgroundplot','plotobject','plotvector','drawvectorsum','functionplotrule','functionplotvectorrule','functionplotvectorsumrule','axis','key','xtics','ytics','title','xlabel','ylabel','hiddenline','dtm','stringresponse','optionresponse','numericalresponse','formularesponse','mathresponse','organicresponse','reactionresponse','customresponse','externalresponse', 'hint', 'hintgroup');
! 13:
! 14: my @loncapa_inline = ('display','m','lm','chem','num','parse','algebra','displayweight','displaystudentphoto'); # not textline
! 15:
! 16: my @html_elements = ('html','meta','head','title','base','link','style','noscript','body','header','footer','aside','h1','h2','h3','h4','h5','h6','li','dd','dt','tbody','tr','caption','thead','tfoot','td','th','span','a','em','strong','b','i','sup','sub','pre','code','kbd','samp','cite','q','tt','ins','del','var','small','big','br','hr','address','blockquote','img','figure','figcaption','object','param','embed','applet','video','source','audio','map','area','canvas','form','label','input','select','optgroup','option','textarea','fieldset','legend','button','iframe','section','div','p','ul','ol','dl','table'); # without script
! 17:
! 18: my @simple_data = ('polygon', 'rectangle', 'vector', 'value', 'answer', 'title', 'xlabel', 'ylabel', 'tic', 'parserlib', 'scriptlib', 'import', 'tex', 'text', 'image', 'display', 'm', 'lm', 'num', 'algebra', 'chem', 'parse', 'title', 'style', 'script', 'ins', 'del', 'label', 'option', 'textarea', 'legend' );
! 19:
! 20:
1.1 damieng 21: binmode(STDOUT, ':encoding(UTF-8)');
22:
23: if (scalar(@ARGV) != 1) {
24: print STDERR "Usage: perl xml_to_loncapa.pl file.xml\n";
25: exit(1);
26: }
27:
28: # find the command-line argument encoding
29: use I18N::Langinfo qw(langinfo CODESET);
30: my $codeset = langinfo(CODESET);
31: use Encode qw(decode);
32: @ARGV = map { decode $codeset, $_ } @ARGV;
33:
34: my $pathname = "$ARGV[0]";
35: if (-f $pathname) {
36: convert_file($pathname);
37: }
38:
39: # Converts a file, creating a .loncapa file in the same directory.
40: # TODO: use the right extension based on content (or just ouput content)
41: sub convert_file {
42: my ($pathname) = @_;
43:
44: # create a name for the new file
45: my $newpath = $pathname.'.loncapa';
46:
47: print "converting $pathname...\n";
48:
49: my $dom_doc = XML::LibXML->load_xml(location => $pathname);
50:
51: open my $out, '>:encoding(UTF-8)', $newpath;
1.2 ! damieng 52: add_outtext($dom_doc);
1.1 damieng 53: print $out node_to_string($dom_doc);
54: close $out;
55: }
56:
57: sub node_to_string {
58: my ($node) = @_;
59:
60: if ($node->nodeType == XML_DOCUMENT_NODE) {
61: my $root = $node->documentElement();
62: return node_to_string($root);
63: } elsif ($node->nodeType == XML_TEXT_NODE || $node->nodeType == XML_CDATA_SECTION_NODE) {
64: my $parent = $node->parentNode;
65: my $parent_name = $parent->nodeName;
66: my $grandparent_name;
67: if (defined $parent->parentNode) {
68: $grandparent_name = $parent->parentNode->nodeName;
69: }
70: my @no_escape = ('m', 'script', 'display', 'parse', 'answer');
71: if (string_in_array(\@no_escape, $parent_name) &&
72: ($parent_name ne 'answer' ||
73: (defined $grandparent_name &&
74: $grandparent_name ne 'numericalresponse' &&
75: $grandparent_name ne 'formularesponse'))) {
76: return $node->nodeValue;
77: } else {
78: return $node->toString();
79: }
80: } elsif ($node->nodeType == XML_ELEMENT_NODE) {
81: my $s = '';
82: my $tag = $node->nodeName;
83: $s .= "<$tag";
84: my @attributes = $node->attributes();
85: foreach my $attribute (@attributes) {
86: $s .= ' ';
87: $s .= $attribute->nodeName;
88: $s .= '="';
89: $s .= escape($attribute->nodeValue);
90: $s .= '"';
91: }
92: if ($node->hasChildNodes()) {
93: $s .= '>';
94: foreach my $child ($node->childNodes) {
95: $s .= node_to_string($child);
96: }
97: $s .= "</$tag>";
98: } else {
99: $s .= '/>';
100: }
101: return $s;
102: } else {
103: return $node->toString();
104: }
105: }
106:
107: # Escapes a string for LON-CAPA output (used for text nodes, not attribute values)
108: sub escape {
109: my ($s) = @_;
110: $s =~ s/&/&/sg;
111: $s =~ s/</</sg;
112: $s =~ s/>/>/sg;
113: # quot and apos do not need to be escaped outside attribute values
114: return $s;
115: }
116:
1.2 ! damieng 117: # Adds startouttext and endouttext where useful for the colorfull editor
! 118: sub add_outtext {
! 119: my ($node) = @_;
! 120:
! 121: if ($node->nodeType == XML_DOCUMENT_NODE) {
! 122: my $root = $node->documentElement();
! 123: add_outtext($root);
! 124: return;
! 125: }
! 126: if ($node->nodeType != XML_ELEMENT_NODE) {
! 127: return;
! 128: }
! 129: if (string_in_array(\@simple_data, $node->nodeName)) {
! 130: return;
! 131: }
! 132: my $next;
! 133: my $in_outtext = 0;
! 134: for (my $child=$node->firstChild; defined $child; $child=$next) {
! 135: $next = $child->nextSibling;
! 136: if (!$in_outtext && inside_outtext($child)) {
! 137: # Add startouttext
! 138: my $doc = $node->ownerDocument;
! 139: my $startouttext = $doc->createElement('startouttext');
! 140: $node->insertBefore($startouttext, $child);
! 141: $in_outtext = 1;
! 142: } elsif ($in_outtext && !continue_outtext($child)) {
! 143: # Add endouttext
! 144: add_endouttext($node, $child);
! 145: $in_outtext = 0;
! 146: }
! 147: if (!$in_outtext) {
! 148: add_outtext($child);
! 149: }
! 150: }
! 151: if ($in_outtext) {
! 152: # Add endouttext
! 153: add_endouttext($node);
! 154: }
! 155: }
! 156:
! 157: # Returns 1 if this node should trigger the addition of startouttext before it
! 158: sub inside_outtext {
! 159: my ($node) = @_;
! 160: if ($node->nodeType == XML_TEXT_NODE && $node->nodeValue !~ /^\s*$/) {
! 161: return 1;
! 162: }
! 163: if ($node->nodeType == XML_ELEMENT_NODE && string_in_array(\@html_elements, $node->nodeName)) {
! 164: if (contains_loncapa_block($node)) {
! 165: return 0;
! 166: } else {
! 167: return 1;
! 168: }
! 169: }
! 170: if ($node->nodeType == XML_ELEMENT_NODE && string_in_array(\@loncapa_inline, $node->nodeName)) {
! 171: return 1;
! 172: }
! 173: return 0;
! 174: }
! 175:
! 176: # Returns 1 if the outtext environment can continue with this node
! 177: sub continue_outtext {
! 178: my ($node) = @_;
! 179: if (inside_outtext($node)) {
! 180: return 1;
! 181: }
! 182: if ($node->nodeType == XML_TEXT_NODE) {
! 183: return 1; # continue even if this is just spaces
! 184: }
! 185: return 0;
! 186: }
! 187:
! 188: # Returns 1 if the node contains a LON-CAPA block in a descendant.
! 189: sub contains_loncapa_block {
! 190: my ($node) = @_;
! 191: foreach my $child ($node->childNodes) {
! 192: if ($child->nodeType == XML_ELEMENT_NODE) {
! 193: if (string_in_array(\@loncapa_block, $child->nodeName)) {
! 194: return 1;
! 195: }
! 196: if (contains_loncapa_block($child)) {
! 197: return 1;
! 198: }
! 199: }
! 200: }
! 201: return 0;
! 202: }
! 203:
! 204: sub add_endouttext {
! 205: my ($parent, $before_node) = @_;
! 206: my $doc = $parent->ownerDocument;
! 207: my $endouttext = $doc->createElement('endouttext');
! 208: my $before_before;
! 209: if (defined $before_node) {
! 210: $before_before = $before_node->previousSibling;
! 211: } else {
! 212: $before_before = $parent->lastChild;
! 213: }
! 214: if (defined $before_before && $before_before->nodeType == XML_TEXT_NODE) {
! 215: # split space at the end of the node before endouttext
! 216: if ($before_before->nodeValue =~ /^(.*?)(\s+)$/s) {
! 217: $before_before->setData($1);
! 218: my $space_node = $doc->createTextNode($2);
! 219: if (defined $before_node) {
! 220: $parent->insertBefore($space_node, $before_node);
! 221: } else {
! 222: $parent->appendChild($space_node);
! 223: }
! 224: $before_node = $space_node;
! 225: }
! 226: }
! 227: if (defined $before_node) {
! 228: $parent->insertBefore($endouttext, $before_node);
! 229: } else {
! 230: $parent->appendChild($endouttext);
! 231: }
! 232: }
! 233:
1.1 damieng 234: ##
235: # Tests if a string is in an array (using eq) (to avoid Smartmatch warnings with $value ~~ @array)
236: # @param {Array<string>} array - reference to the array of strings
237: # @param {string} value - the string to look for
238: # @returns 1 if found, 0 otherwise
239: ##
240: sub string_in_array {
241: my ($array, $value) = @_;
242: foreach my $v (@{$array}) {
243: if ($v eq $value) {
244: return 1;
245: }
246: }
247: return 0;
248: }
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>