File:  [LON-CAPA] / loncom / homework / cleanxml / xml_to_loncapa.pm
Revision 1.10: download - view: text, annotated - select for diffs
Thu Nov 10 21:53:56 2016 UTC (8 years, 1 month ago) by damieng
Branches: MAIN
CVS tags: version_2_12_X, version_2_11_5_msu, version_2_11_4_msu, HEAD
improved <br><br> conversion

# The LearningOnline Network
# convert_file takes a well-formed XML file content and converts it to LON-CAPA syntax.
#
# $Id: xml_to_loncapa.pm,v 1.10 2016/11/10 21:53:56 damieng Exp $
#
# Copyright Michigan State University Board of Trustees
#
# This file is part of the LearningOnline Network with CAPA (LON-CAPA).
#
# LON-CAPA is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# LON-CAPA is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with LON-CAPA; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
#
# /home/httpd/html/adm/gpl.txt
#
# http://www.lon-capa.org/
#
###

#!/usr/bin/perl

package Apache::xml_to_loncapa;

use strict;
use utf8;
use warnings;

use XML::LibXML;


# LON-CAPA block elements that cannot be found within startouttext/endouttext
my @loncapa_block = ('parameter','location','answer','foil','image','polygon','rectangle','text','conceptgroup','itemgroup','item','label','data','function','array','unit','answergroup','functionplotresponse','functionplotruleset','functionplotelements','functionplotcustomrule','essayresponse','hintpart','formulahint','numericalhint','reactionhint','organichint','optionhint','radiobuttonhint','stringhint','customhint','mathhint','formulahintcondition','numericalhintcondition','reactionhintcondition','organichintcondition','optionhintcondition','radiobuttonhintcondition','stringhintcondition','customhintcondition','mathhintcondition','imageresponse','foilgroup','datasubmission','textfield','hiddensubmission','radiobuttonresponse','rankresponse','matchresponse','import','style','script','window','block','library','notsolved','part','postanswerdate','preduedate','problem','problemtype','randomlabel','bgimg','labelgroup','randomlist','solved','while','tex','print','web','gnuplot','curve','Task','IntroParagraph','ClosingParagraph','Question','QuestionText','Setup','Instance','InstanceText','Criteria','CriteriaText','GraderNote','languageblock','instructorcomment','dataresponse','togglebox','standalone','comment','drawimage','allow','displayduedate','displaytitle','responseparam','organicstructure','scriptlib','parserlib','drawoptionlist','spline','backgroundplot','plotobject','plotvector','drawvectorsum','functionplotrule','functionplotvectorrule','functionplotvectorsumrule','axis','key','xtics','ytics','title','xlabel','ylabel','hiddenline','dtm','stringresponse','optionresponse','numericalresponse','formularesponse','mathresponse','organicresponse','reactionresponse','customresponse','externalresponse', 'hint', 'hintgroup');

# LON-CAPA elements that can be found within startouttext/endouttext:
my @loncapa_in_text = ('display','m','lm','chem','num','parse','algebra','displayweight','displaystudentphoto','translated','lang'); # not textline

# HTML elements that trigger the addition of startouttext/endouttext
my @html_trigger = ('header','footer','aside','h1','h2','h3','h4','h5','h6','li','dd','dt','tbody','tr','caption','thead','tfoot','td','th','span','a','em','strong','b','i','sup','sub','pre','code','kbd','samp','cite','q','tt','ins','del','var','small','big','br','hr','address','blockquote','figure','figcaption','object','param','embed','applet','video','source','audio','map','area','canvas','form','input','select','optgroup','option','textarea','fieldset','legend','button','iframe','section','div','p','ul','ol','dl','table');

my @simple_data = ('polygon', 'rectangle', 'vector', 'value', 'answer', 'title', 'data', 'function', 'xlabel', 'ylabel', 'tic', 'parserlib', 'scriptlib', 'import', 'tex', 'text', 'image', 'display', 'm', 'lm', 'num', 'algebra', 'chem', 'parse', 'title', 'style', 'script', 'ins', 'del', 'label', 'option', 'textarea', 'legend','comment');

my @inline_responses = ('stringresponse','optionresponse','numericalresponse','formularesponse','mathresponse','organicresponse','reactionresponse','customresponse','externalresponse');

# see http://www.w3.org/TR/html-polyglot/#empty-elements
# and http://tiffanybbrown.com/2011/03/23/html5-does-not-allow-self-closing-tags/
# HTML elements that do not have an empty content, and must never use a self-closing tag:
my @non_empty_html = ('title','style','script','noscript','body','section','header','footer','article','aside','nav','h1','h2','h3','h4','h5','h6','div','p','li','dt','dd','caption','td','th','span','a','em','strong','b','i','sup','sub','pre','code','kbd','samp','cite','q','tt','ins','del','var','small','big','address','blockquote','bdo','ruby','rb','rp','rt','rtc','figure','figcaption','object','applet','video','audio','canvas','label','option','textarea','fieldset','legend','button','iframe');


# Converts a file and return the modified contents
sub convert_file {
  my ($contents) = @_;

  my $dom_doc = XML::LibXML->load_xml(string => $contents);
  my $root = $dom_doc->documentElement();
  if (defined $root && $root->nodeName ne 'html') {
    add_outtext($dom_doc);
  }
  return node_to_string($dom_doc);
}


sub node_to_string {
  my ($node) = @_;
  
  if ($node->nodeType == XML_DOCUMENT_NODE) {
    my $root = $node->documentElement();
    return node_to_string($root);
  } elsif ($node->nodeType == XML_TEXT_NODE || $node->nodeType == XML_CDATA_SECTION_NODE) {
    my $parent = $node->parentNode;
    my $parent_name = $parent->nodeName;
    my $grandparent_name;
    if (defined $parent->parentNode) {
      $grandparent_name = $parent->parentNode->nodeName;
    }
    my @no_escape = ('m', 'script', 'style', 'display', 'parse', 'answer');
    if (string_in_array(\@no_escape, $parent_name) &&
        ($parent_name ne 'answer' ||
        (defined $grandparent_name &&
        $grandparent_name ne 'numericalresponse' &&
        $grandparent_name ne 'formularesponse'))) {
      return $node->nodeValue;
    } else {
      return $node->toString();
    }
  } elsif ($node->nodeType == XML_ELEMENT_NODE) {
    my $s = '';
    my $tag = $node->nodeName;
    $s .= "<$tag";
    my @attributes = $node->attributes();
    foreach my $attribute (@attributes) {
      $s .= ' ';
      $s .= $attribute->nodeName;
      $s .= '="';
      $s .= escape_attribute($attribute->nodeValue);
      $s .= '"';
    }
    if ($node->hasChildNodes() || string_in_array(\@non_empty_html, $tag)) {
      $s .= '>';
      foreach my $child ($node->childNodes) {
        $s .= node_to_string($child);
      }
      $s .= "</$tag>";
    } else {
      $s .= '/>';
    }
    return $s;
  } else {
    return $node->toString();
  }
}

# Escapes an attribute value
sub escape_attribute {
  my ($s) = @_;
  # normal XML escapes do not work with LON-CAPA, for instance with reactionresponse
  #$s =~ s/&/&amp;/sg;
  #$s =~ s/</&lt;/sg;
  #$s =~ s/>/&gt;/sg;
  $s =~ s/"/&quot;/sg;
  return $s;
}

# Adds startouttext and endouttext where useful for the colorful editor
sub add_outtext {
  my ($node) = @_;
  
  if ($node->nodeType == XML_DOCUMENT_NODE) {
    my $root = $node->documentElement();
    add_outtext($root);
    return;
  }
  if ($node->nodeType != XML_ELEMENT_NODE) {
    return;
  }
  if (string_in_array(\@simple_data, $node->nodeName)) {
    return;
  }
  convert_paragraphs($node);
  if ($node->nodeName eq 'hintgroup' && !defined $node->firstChild) {
    # empty hintgroup: colorful editor needs start/end outtext
    add_endouttext($node, undef);
    add_startouttext($node, $node->firstChild);
  }
  my $next;
  my $in_outtext = 0;
  for (my $child=$node->firstChild; defined $child; $child=$next) {
    $next = $child->nextSibling;
    if (!$in_outtext && inside_outtext($child)) {
      add_startouttext($node, $child);
      $in_outtext = 1;
    } elsif ($in_outtext && !continue_outtext($child)) {
      add_endouttext($node, $child);
      $in_outtext = 0;
    }
    if (!$in_outtext) {
      add_outtext($child);
    }
  }
  if ($in_outtext) {
    add_endouttext($node);
  }
}

# Returns 1 if this node should trigger the addition of startouttext before it
sub inside_outtext {
  my ($node) = @_;
  if ($node->nodeType == XML_TEXT_NODE && $node->nodeValue !~ /^\s*$/) {
    return 1;
  }
  if ($node->nodeType == XML_ELEMENT_NODE && string_in_array(\@html_trigger, $node->nodeName)) {
    if (contains_loncapa_block($node)) {
      return 0;
    } else {
      return 1;
    }
  }
  if ($node->nodeType == XML_ELEMENT_NODE && string_in_array(\@loncapa_in_text, $node->nodeName)) {
    return 1;
  }
  return 0;
}

# Returns 1 if the outtext environment can continue with this node
sub continue_outtext {
  my ($node) = @_;
  if (inside_outtext($node)) {
    return 1;
  }
  if ($node->nodeType == XML_TEXT_NODE) {
    return 1; # continue even if this is just spaces
  }
  return 0;
}

# Returns 1 if the node contains a LON-CAPA block in a descendant.
sub contains_loncapa_block {
  my ($node) = @_;
  foreach my $child ($node->childNodes) {
    if ($child->nodeType == XML_ELEMENT_NODE) {
      if (string_in_array(\@loncapa_block, $child->nodeName)) {
        return 1;
      }
      if (contains_loncapa_block($child)) {
        return 1;
      }
    }
  }
  return 0;
}

sub add_startouttext {
  my ($parent, $before_node) = @_;
  my $doc = $parent->ownerDocument;
  if ($before_node->nodeType == XML_TEXT_NODE) {
    # split space at the beginning of the node
    if ($before_node->nodeValue =~ /^(\s+)(.*?)$/s) {
      my $space_node = $doc->createTextNode($1);
      $before_node->setData($2);
      $parent->insertBefore($space_node, $before_node);
    }
  }
  my $startouttext = $doc->createElement('startouttext');
  $parent->insertBefore($startouttext, $before_node);
}

sub add_endouttext {
  my ($parent, $before_node) = @_;
  my $doc = $parent->ownerDocument;
  my $endouttext = $doc->createElement('endouttext');
  my $before_before;
  if (defined $before_node) {
    $before_before = $before_node->previousSibling;
  } else {
    $before_before = $parent->lastChild;
  }
  if (defined $before_before && $before_before->nodeType == XML_TEXT_NODE) {
    # split space at the end of the node
    if ($before_before->nodeValue =~ /^(.*?)(\s+)$/s) {
      $before_before->setData($1);
      my $space_node = $doc->createTextNode($2);
      if (defined $before_node) {
        $parent->insertBefore($space_node, $before_node);
      } else {
        $parent->appendChild($space_node);
      }
      $before_node = $space_node;
    }
  }
  if (defined $before_node) {
    $parent->insertBefore($endouttext, $before_node);
  } else {
    $parent->appendChild($endouttext);
  }
  # replace spaces afterwards by a \n + indentation
  my $next = $endouttext->nextSibling;
  if (defined $next && $next->nodeType == XML_TEXT_NODE) {
    my $v = $next->nodeValue;
    if ($v =~ /^ /) {
      $v =~ s/^ +//;
      if ($parent->firstChild->nodeType == XML_TEXT_NODE &&
          $parent->firstChild->nodeValue =~ /^\n +$/) {
        $v = $parent->firstChild->nodeValue.$v;
      } else {
        $v = "\n".$v;
      }
      $next->setData($v);
    }
  }
}

# Convert paragraph children when one contains an inline response into content + <br>
# (the colorful editor does not support paragraphs containing inline responses)
sub convert_paragraphs {
  my ($parent) = @_;
  my $p_child_with_inline_response = 0;
  foreach my $child ($parent->childNodes) {
    if ($child->nodeType == XML_ELEMENT_NODE && $child->nodeName eq 'p') {
      foreach my $child2 ($child->childNodes) {
        if ($child2->nodeType == XML_ELEMENT_NODE) {
          if (string_in_array(\@inline_responses, $child2->nodeName)) {
            $p_child_with_inline_response = 1;
            last;
          }
        }
      }
    }
    if ($p_child_with_inline_response) {
      last;
    }
  }
  if ($p_child_with_inline_response) {
    my $doc = $parent->ownerDocument;
    my $next;
    for (my $child=$parent->firstChild; defined $child; $child=$next) {
      $next = $child->nextSibling;
      if ($child->nodeType == XML_ELEMENT_NODE && $child->nodeName eq 'p') {
        replace_by_children($child);
        if (defined $next && (defined $next->nextSibling || $next->nodeType != XML_TEXT_NODE ||
            $next->nodeValue !~ /^\s*$/)) {
          # we only add a br if there is something after
          my $br = $doc->createElement('br');
          $parent->insertBefore($br, $next);
          # add another br to make up for the p margin
          $br = $doc->createElement('br');
          $parent->insertBefore($br, $next);
        }
      }
    }
  }
}

##
# Tests if a string is in an array (using eq) (to avoid Smartmatch warnings with $value ~~ @array)
# @param {Array<string>} array - reference to the array of strings
# @param {string} value - the string to look for
# @returns 1 if found, 0 otherwise
##
sub string_in_array {
  my ($array, $value) = @_;
  foreach my $v (@{$array}) {
    if ($v eq $value) {
      return 1;
    }
  }
  return 0;
}

##
# replaces a node by its children
# @param {Node} node - the DOM node
##
sub replace_by_children {
  my ($node) = @_;
  my $parent = $node->parentNode;
  my $next;
  my $previous;
  for (my $child=$node->firstChild; defined $child; $child=$next) {
    $next = $child->nextSibling;
    if ((!defined $previous || !defined $next) &&
        $child->nodeType == XML_TEXT_NODE && $child->nodeValue =~ /^\s*$/) {
      next; # do not keep first and last whitespace nodes
    } else {
      if (!defined $previous && $child->nodeType == XML_TEXT_NODE) {
        # remove whitespace at the beginning
        my $value = $child->nodeValue;
        $value =~ s/^\s+//;
        $child->setData($value);
      }
      if (!defined $next && $child->nodeType == XML_TEXT_NODE) {
        # and at the end
        my $value = $child->nodeValue;
        $value =~ s/\s+$//;
        $child->setData($value);
      }
    }
    $node->removeChild($child);
    $parent->insertBefore($child, $node);
    $previous = $child;
  }
  $parent->removeChild($node);
}

1;
__END__

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>