loncom/interface/lonhtmlgateway.pm - annotate

Return to lonhtmlgateway.pm CVS log
Up to [LON-CAPA] / loncom / interface
Annotation of loncom/interface/lonhtmlgateway.pm, revision 1.5

1.1       faziophi    1: # The LearningOnline Network with CAPA
                      2: # gateway for html input/output to be properly parsed and handled
                      3: #
1.5     ! raeburn     4: # $Id: lonhtmlgateway.pm,v 1.4 2010/05/04 19:45:05 faziophi Exp $
1.1       faziophi    5: #
                      6: # Copyright Michigan State University Board of Trustees
                      7: #
                      8: # This file is part of the LearningOnline Network with CAPA (LON-CAPA).
                      9: #
                     10: # LON-CAPA is free software; you can redistribute it and/or modify
                     11: # it under the terms of the GNU General Public License as published by
                     12: # the Free Software Foundation; either version 2 of the License, or
                     13: # (at your option) any later version.
                     14: #
                     15: # LON-CAPA is distributed in the hope that it will be useful,
                     16: # but WITHOUT ANY WARRANTY; without even the implied warranty of
                     17: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
                     18: # GNU General Public License for more details.
                     19: #
                     20: # You should have received a copy of the GNU General Public License
                     21: # along with LON-CAPA; if not, write to the Free Software
                     22: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
                     23: #
                     24: # /home/httpd/html/adm/gpl.txt
                     25: #
                     26: # http://www.lon-capa.org/
                     27: #
                     28: ######################################################################
                     29: ######################################################################
                     30: 
                     31: =pod
                     32: 
                     33: =head1 NAME
                     34: 
                     35: Apache::lonhtmlgateway - properly parse and handle HTML input and output
                     36: 
                     37: =head1 SYNOPSIS
                     38: 
                     39: This is part of the LearningOnline Network with CAPA project
                     40: described at http://www.lon-capa.org.
                     41: 
                     42: =head1 INTRODUCTION
                     43: 
                     44: lonhtmlgateway is an object-oriented module used to parse and correct
                     45: malformed HTML input from the client, as well as to perform processing
                     46: of custom LON-CAPA HTML output before it is sent along to the end-user.
                     47: It replaces a number of subroutines in various modules, and adds new
                     48: code to tidy and process malformed HTML using XML::LibXML. 
                     49: 
                     50: This module is intended to be used for all non-authoring perspectives
                     51: in the system.
                     52: 
                     53: New to LON-CAPA version 3.0.
                     54: 
                     55: =head2 Example Usage
                     56: 
                     57: Below is intended code to be invoked and called for use outside 
                     58: of this module:
                     59: 
                     60:     $gateway = Apache::lonhtmlgateway->new();
                     61:     $gateway = Apache::lonhtmlgateway->new($target);
                     62:     
                     63:     $xhtml = $gateway->process_incoming_html($html);
                     64:     $xhtml = $gateway->process_incoming_html($html, $legacy);
                     65:     
                     66:     $xml = $gateway->process_html_to_xml($html);
                     67:     $xhtml = $gateway->process_xml_to_html($xml); 
                     68:     
                     69:     $bool = Apache::lonhtmlgateway->contains_block_level_tags($input);
                     70:     
                     71: =head1 GLOBAL VARIABLES
                     72: 
                     73: =over 4
                     74: 
                     75: =cut 
                     76: 
                     77: ######################################################################
                     78: ######################################################################
                     79: 
                     80: package Apache::lonhtmlgateway;
                     81: 
                     82: use strict;
                     83: use utf8;
                     84: use Time::Local;
                     85: use Time::HiRes;
                     86: use Apache::lonlocal;
                     87: use Apache::lonnet;
                     88: use Apache::lonhtmlcommon;
                     89: use Apache::lonxml;
                     90: use Apache::lontexconvert;
                     91: use lib '/home/httpd/lib/perl/';
                     92: use LONCAPA;
                     93: use XML::LibXML;
                     94: use Encode;
                     95: use HTML::Entities;
                     96: use HTML::LCParser();
                     97: use Safe();
                     98: 
                     99: local $XML::LibXML::skipXMLDeclaration = 1;
                    100: local $XML::LibXML::skipDTD = 1;
                    101: local $XML::LibXML::setTagCompression = 1;
                         # The three statements above set XML::LibXML package-level
                         # serialization switches to 1.
                         # NOTE(review): they are `local` at file scope, so (per perlsub) the
                         # previous values are restored once this file's scope is left.  The
                         # settings are presumably NOT in effect when the subs below run
                         # later -- confirm, and if they are wanted use plain assignment or
                         # localize them inside the sub that serializes the DOM.
                    102: 
                    103: ##############################################
                    104: ##############################################
                    105: 
                    106: =item %LONCAPA_ALLOWED_STANDARD_TAGS
                    107: 
                    108: This is a hash of all tags, both HTML and custom LON-CAPA tags that
                    109: are allowed in non-authoring spaces.  Examples of this include
                    110: course documents, bulletin boards, discussion posts, templated pages,
                    111: etc.  In addition, in the event of rich text editing, the WYSIWYG
                    112: editor needs to know how to display LON-CAPA custom tags as either
                    113: inline-level (<span>) or block-level (<div>). Therefore, the hash is
                    114: set up with uppercase tag names as keys ("H1"), and the corresponding
                    115: entry an integer constant indicating that tag's role or purpose:
                    116: 
                    117: =over 4
                    118: 
                    119: =item 0 =
                    120: 
                    121: Tag is explicitly not allowed.  Currently not used anywhere in this
                    122: module, but reserved for the future in case certain tags would like
                    123: to be explicitly blacklisted.
                    124: 
                    125: =item 1 =
                    126: 
                    127: Tag is allowed, and in cases where it is unclear, is rendered as an
                    128: inline-level element.  Example: <algebra> should be rendered as an 
                    129: inline element.
                    130: 
                    131: =item 2 =
                    132: 
                    133: Tag is allowed, and in cases where it is unclear, is rendered as a
                    134: block-level element.  Example: <md> should be rendered as a block
                    135: element.
                    136: 
                    137: =back
                    138: 
                    139: =back
                    140: 
                    141: =cut
                    142: 
                    143: ##############################################
                    144: ##############################################
                    145: 
                    146: our %LONCAPA_ALLOWED_STANDARD_TAGS = (
                             # keys are uppercase tag names; value 1 marks an allowed tag that
                             # is treated as inline-level, value 2 an allowed tag treated as
                             # block-level (0 is reserved for "explicitly not allowed" and
                             # does not occur below).  process_outgoing_html entitizes every
                             # tag whose uppercased name has no true value in this hash.
                    147:     # standard html header tags
                    148:     H1 => 2, H2 => 2, H3 => 2, H4 => 2, H5 => 2, H6 => 2,
                    149:     # basic inline formatting and phrases
                    150:     B => 1, I => 1, U => 1, STRONG => 1, EM => 1, STRIKE => 1,
                    151:     BIG => 1, SMALL => 1, INS => 1, DEL => 1, S => 1,
                    152:     Q => 1, DFN => 1, CODE => 1, SAMP => 1, KBD => 1, VAR => 1,
                    153:     SUB => 1, SUP => 1,
                    154:     # linking and embedding
                    155:     A => 1, IMG => 1, 
                    156:     # block level tags
                    157:     P => 2, DIV => 2, OL => 2, UL => 2, LI => 2, ADDRESS => 2,
                    158:     BR => 2, HR => 2, BLOCKQUOTE => 2, PRE => 2, 
                    159:     # table-related tags
                    160:     TABLE => 2, CAPTION => 2, TBODY => 2, TR => 2, TD => 2,
                    161:     TH => 2, 
                    162:     # LON-CAPA custom tags
                    163:     M => 1, MI => 1, MD => 2, ALGEBRA => 1,
                    164:     CHEM => 1
                    165: );
                    166: 
                    167: ##############################################
                    168: ##############################################
                    169: 
                    170: =head1 PARSING LON-CAPA CUSTOM TAGS
                    171: 
                    172: This module maintains a hash %custom_tag_parsers, containing 
                    173: lowercase tag names as keys and function references as entries.
                    174: Convention used here names the actual parsing function whose
                    175: reference is stored here to be of the name &parse_tagname_tag().
                    176: These functions are called during the processing of outgoing 
                    177: HTML output in the &process_outgoing_html() function.
                    178: 
                    179: Each of these functions is passed the following arguments:
                    180: 
                    181: =over 4
                    182: 
                    183: =item self
                    184: 
                    185: Reference to Apache::lonhtmlgateway object calling the function.
                    186: 
                    187: =item input
                    188: 
                    189: Textual context extracted between the <tag> and </tag> tags.
                    190: Note that this text I<could> contain HTML entities.  Thus, for 
                    191: functions that cannot handle entitized input, 
                    192: &HTML::Entities::decode_entities() should be called on this data
                    193: before further handing it off.
                    194: 
                    195: =back
                    196: 
                    197: Example hash entry:
                    198: 
                    199:     mi => \&parse_mi_tag,
                    200: 
                    201: =head2 Currently Supported Custom Tags
                    202: 
                    203: =over 4
                    204: 
                    205: =item <algebra>
                    206: 
                    207: Intended to convert and simplify simple algebraic functions into
                    208: readable output.  Corrects cases such as double negatives or 
                    209: eliminates coefficients of 1 where appropriate.  The actual
                    210: handling of content contained in this tag takes place inside
                    211: L<Apache::lontexconvert>, which in turn uses the AlgParser 
                    212: module to actually process the input.
                    213: 
                    214: Usage:
                    215:     <algebra>2*x+(-5)</algebra>
                    216: 
                    217: =item <chem>
                    218: 
                    219: Formatter for chemical equations, adding superscripts, subscripts,
                    220: and arrow characters as appropriate.  This parser is
                    221: wholly contained inside this module, but is a copy of a routine
                    222: found in homework/default_homework.lcpm.
                    223: 
                    224: Usage:
                    225:     <chem>CH3CO2H + H2O <=> CH3CO2- + H3O+</chem>
                    226: 
                    227: =back
                    228: 
                    229: =head3 Math Mode Tags
                    230: 
                    231: These tags are intended for LaTeX math mode input, in order to
                    232: produce complex mathematical and scientific constructs, which
                    233: normal HTML cannot produce.  The output is later rendered by
                    234: a user-defined TeX engine in web target, or handled directly
                    235: in the case of tex target.  The only difference between the tags
                    236: below is determining the author's intent on how to appropriately
                    237: render the contents within the tag - this intent is
                    238: important in preserving the What You See Is What You Get philosophy
                    239: of the rich text editor.
                    240: 
                    241: =over 4
                    242: 
                    243: =item <mi>
                    244: 
                    245: Inline math mode tag.  Content is surrounded by "$" characters and
                    246: passed to the parser for the <m> tag.
                    247: 
                    248: I<New for LON-CAPA 3.0>.
                    249: 
                    250: =item <md>
                    251: 
                    252: Display block math mode tag.  Content is surrounded by "\[" and 
                    253: "\]" characters and passed to the parser for the <m> tag.
                    254: 
                    255: I<New for LON-CAPA 3.0>.
                    256: 
                    257: =item <m>
                    258: 
                    259: Math mode tag.  Allows author to fully specify the display of their
                    260: TeX input, and contain mixed inline-and-block content within a single
                    261: tag.  
                    262: 
                    263: Due to tools such as the rich text editor needing to know whether a
                    264: custom tag is block-level or inline-level on render, the use of this
                    265: tag is discouraged starting with LON-CAPA 3.0 although it will continue
                    266: to function.  Fully compatible with legacy LON-CAPA 2.x content.
                    267: 
                    268: =back
                    269: 
                    270: =cut
                    271: 
                    272: ##############################################
                    273: ##############################################
                    274: 
                    275: my %custom_tag_parsers = (
                             # dispatch table: lowercase custom tag name => handler code ref.
                             # process_outgoing_html looks up each start tag here and calls the
                             # handler as handler($self, $text, $self->{target}), i.e. with the
                             # output target as a third argument, and appends the return value
                             # to its output.
                    276:     mi => \&parse_mi_tag,
                    277:     md => \&parse_md_tag,
                    278:     m => \&parse_m_tag,
                    279:     algebra => \&parse_algebra_tag,
                    280:     chem => \&parse_chem_tag
                    281: );
                    282: 
                    283: ##############################################
                    284: ##############################################
                    285: 
                    286: =head1 CLASS OBJECT CONSTRUCTOR
                    287: 
                    288: =over 4
                    289: 
                    290: =item new
                    291: 
                    292:     $gateway = Apache::lonhtmlgateway->new();
                    293:     $gateway = Apache::lonhtmlgateway->new($target);
                    294: 
                    295: Constructs and returns a new gateway object.  An optional argument
                    296: allows one to specify the target of the output, defaults to 'web'.
                    297: Behind the scenes, a single XML::LibXML parser object is created
                    298: and stored in the object.  On destroy, this parser object is destroyed
                    299: as well.
                    300: 
                    301: =back
                    302: 
                    303: =cut
                    304: 
                    305: ##############################################
                    306: ##############################################
                    307: 
                    308: sub new {
                             # Constructor.  Builds a hash-based object holding a fresh
                             # XML::LibXML parser and the output target, blesses it and
                             # returns it.
                             #   $invocant - class name, or an existing object whose class is reused
                             #   $target   - optional output target; any false value becomes 'web'
                    309:     my $invocant = shift;
                    310:     my $class = ref($invocant) || $invocant;
                    311:     my $target = shift;
                    312:     # create a new parser instance for libxml
                    313:     my $self = {
                    314:         parser => XML::LibXML->new(),
                    315:         target => ($target) ? $target : 'web'
                    316:     };
                    317:     # options for the libxml parser
                             # (both recovery switches are turned on for the stored parser)
                    318:     $self->{parser}->recover(1);
                    319:     $self->{parser}->recover_silently(1);
                    320:     bless($self, $class);  # bless = pray that it works
                    321:     return $self;
                    322: }
                    323: 
                    324: sub DESTROY {
                             # Destructor hook.  Copies the parser reference out of the object
                             # into a lexical and undefs that copy; the $self->{parser} slot
                             # itself is not modified in this sub.
                             # NOTE(review): presumably `delete $self->{parser}` was intended --
                             # confirm; as written the statement only clears the local copy.
                    325:     my $self = shift;
                    326:     my $parser = $self->{parser};
                    327:     undef $parser;  # destroy the parser instance
                    328: }
                    329: 
                    330: ##############################################
                    331: ##############################################
                    332: 
                    333: =head1 PUBLIC OBJECT METHODS
                    334: 
                    335: =over 4
                    336: 
                    337: =item process_html_to_xml
                    338: 
                    339:     $xml = $gateway->process_html_to_xml($html);
                    340: 
                    341: Takes presumably-malformed HTML, encodes ampersand characters
                    342: and passes the result to the XML::LibXML parser, which creates
                    343: a DOM tree in memory of the content.  This parse is as error-tolerant
                    344: as can be set, and libxml attempts to recover from any errors as much
                    345: as possible. This DOM tree is then taken and serialized,
                    346: eliminating unbalanced and malformed tags along the way. This
                    347: XML code (without any header tags) is then returned to the caller.
                    348: 
                    349: =cut
                    350: 
                    351: ##############################################
                    352: ##############################################
                    353: 
                    354: sub process_html_to_xml {
                             # Tidies (possibly malformed) HTML with the libxml HTML parser and
                             # returns the serialized child nodes of <body> as one string.
                             #   $input - HTML text to tidy
                             # Returns "" for empty input or when the parsed document has no
                             # /html/body node.
                    355:     my $self = shift;
                    356:     my $input = shift;
                    357:     my $parser = $self->{parser};
                    358: 
                    359:     if (length($input) < 1) { return ""; }
                    360:     
                    361:     # only encode ampersands -- brackets may be valid tags
                    362:     my $encoded = &HTML::Entities::encode_entities($input, '&');
                    363:     
                    364:     # for the <chem> tag, we want the strings "<=>", "<-", "->" to be properly
                    365:     # entitized so the parser doesn't destroy it
                             # the leading .* is greedy, so a single pass only rewrites the last
                             # arrow in front of a closing </chem>; each substitution is repeated
                             # until it no longer matches so that equations with several arrows
                             # are fully covered (every pass removes the arrow it matched, so
                             # the loops terminate)
                    366:     1 while ($encoded =~ s/(\<\s*chem\s*>.*)\<\=\>(.*\<\s*\/chem\s*>)/$1\&lt\;\&\#61\;\&gt\;$2/gi);
                    367:     1 while ($encoded =~ s/(\<\s*chem\s*>.*)\-\>(.*\<\s*\/chem\s*>)/$1\-\&gt\;$2/gi);
                    368:     1 while ($encoded =~ s/(\<\s*chem\s*>.*)\<\-(.*\<\s*\/chem\s*>)/$1\&lt\;\-$2/gi);
                    369:     
                    370:     # parse into libXML to tidy tags, we suppress any errors
                    371:     # because otherwise the parser complains about non-HTML
                    372:     # tags to STDERR and the Apache error logs
                    373:     my $dom = $parser->parse_html_string($encoded,
                    374:         {
                    375:             suppress_errors => 1,
                    376:             suppress_warnings => 1,
                    377:             recover => 2
                    378:         }
                    379:     );
                    380:     # the dom returns a full <html> structure, so just get
                    381:     # all the child nodes of the <body> tag and put them together
                    382:     my @body_nodes = $dom->findnodes('/html/body');
                             # guard: if the parser built no <body>, there is nothing to
                             # serialize, and calling ->childNodes on undef would die
                             if (!@body_nodes) { return ""; }
                    383:     my @body_children = $body_nodes[0]->childNodes;
                    384:     my $xml = "";
                    385:     foreach my $child (@body_children) {
                    386:         $xml .= $child->toString();
                    387:     }
                    388:     # entities passed into $input are in the form of '&amp;lt;'
                    389:     # they are double entities
                    390:     return $xml;    
                    391: }
                    392: 
                    393: ##############################################
                    394: ##############################################
                    395: 
                    396: =item process_xml_to_html
                    397: 
                    398:     $xhtml = $gateway->process_xml_to_html($xml);
                    399: 
                    400: Takes XML input, decodes one level of HTML entities,
                    401: and returns the result to the caller.
                    402: 
                    403: =cut
                    404: 
                    405: ##############################################
                    406: ##############################################
                    407: 
                    408: sub process_xml_to_html {
                             # Runs $input through HTML::Entities::decode_entities and returns
                             # the decoded copy; $self is not used.
                             #   $input - XML/XHTML text carrying one extra level of entities
                    409:     my $self = shift;
                    410:     my $input = shift;
                    411:     # decode one level of entities (XML) such that the
                    412:     # output is returned to the original level of entities
                    413:     # $input "&lt;" --> $xml "&amp;lt;" --> "&lt;"
                    414:     my $xhtml = &HTML::Entities::decode_entities($input);
                    415:     # now we have valid XHTML that can be stored and parsed
                    416:     return $xhtml;
                    417: }
                    418: 
                    419: ##############################################
                    420: ##############################################
                    421: 
                    422: =item process_incoming_html
                    423: 
                    424:     $xhtml = $gateway->process_incoming_html($html);
                    425:     $xhtml = $gateway->process_incoming_html($html, $legacy);
                    426: 
                    427: Designed to be called for all raw HTML inputs from the client
                    428: side before storing or rendering data.  Decodes UTF-8 data,
                    429: trims surrounding whitespace and a trailing "<br />" tag.  Processes
                    430: the result through the XML parser, converts this back to
                    431: balanced well-formed XHTML, re-encodes the result as UTF-8,
                    432: and returns the result to the caller.
                    433: 
                    434: =over 4
                    435: 
                    436: =item legacy
                    437: 
                    438:     $legacy = 0;
                    439:     $legacy = 1; 
                    440: 
                    441: I<(optional)> If true, adds additional processing intended
                    442: to emulate LON-CAPA 2.x parsing of the content.
                    443: 
                    444: =back
                    445: 
                    446: =cut
                    447: 
                    448: ##############################################
                    449: ##############################################
                    450: 
                    451: sub process_incoming_html {
                    452:     # this should be called by all HTML inputs before storing
                    453:     # data --> for consistency's sake, call process_html_to_xml
                    454:     # afterwards if you need to embed this in XML later on
                             #   $input  - raw HTML from the client (UTF-8 encoded bytes)
                             #   $legacy - optional; if true, a wrapping <p>...</p> added by the
                             #             parser is stripped again when the input had neither a
                             #             <p> tag nor any block-level tag
                             # returns the tidied XHTML, re-encoded as UTF-8
                    455:     my $self = shift;
                    456:     my $input = shift;
                    457:     my $legacy = shift;
                    458:     
                    459:     # no idea why i have to call this to get unicode characters
                    460:     # working, but i do, so here it is.
                    461:     $input = &Encode::decode_utf8($input);
                    462:     
                    463:     # trim leading and trailing whitespace and HTML breaks
                    464:     chomp($input);
                    465:     $input =~ s/\s+$//s;
                    466:     $input =~ s/^\s+//s;
                    467:     $input =~ s/\<br\s*\/*\>$//s;
                    468:     my $no_p_input = (length($input) > 0 && $input !~ m/.*\<[\s]*p[\s]*\>.*/is);
                    469:     my $xml = $self->process_html_to_xml($input);
                             # contains_block_level_tags shifts its invocant first and the text
                             # second, so $self must be passed explicitly; without it the text
                             # was taken as the invocant and the check always ran on undef
                    470:     if ($legacy && !&contains_block_level_tags($self, $input)) {
                    471:         # the xml returns content inside a <p> tag
                    472:         # if there are no block tags... thus to preserve
                    473:         # old behavior, we strip out that <p></p>
                    474:         if ($no_p_input) {
                    475:             $xml =~ s/^\<p\>(.*)\<\/p\>/$1/si;
                    476:         }
                    477:     }
                    478:     my $xhtml = $self->process_xml_to_html($xml);
                    479:     # see above unicode encoding comment
                    480:     $xhtml = &Encode::encode_utf8($xhtml);
                    481:     return $xhtml;
                    482: }
                    483: 
                    484: ##############################################
                    485: ##############################################
                    486: 
                    487: =item process_outgoing_html
                    488: 
                    489:     $html = $gateway->process_outgoing_html($xhtml);
                    490:     $html = $gateway->process_outgoing_html($xhtml, $legacy);
                    491: 
                    492: Designed to be called for all HTML outputs to the client
                    493: side before rendering data.  This entitizes all non-allowed
                    494: tags, as was previously done in Apache::lonfeedback, and
                    495: processes and converts all LON-CAPA supported custom tags (see
                    496: above) to their respective output HTML.
                    497: 
                    498: =over 4
                    499: 
                    500: =item legacy
                    501: 
                    502:     $legacy = 0;
                    503:     $legacy = 1; 
                    504: 
                    505: I<(optional)> If true, adds additional processing intended
                    506: to emulate LON-CAPA 2.x parsing of the content.  This includes
                    507: behavior to convert "\n" to "<br />" if there are no block-level
                    508: tags detected in the input.  In addition, raw URLs are converted
                    509: automatically to <a> links.
                    510: 
                    511: =back
                    512: 
                    513: =back
                    514: 
                    515: =cut
                    516: 
                    517: ##############################################
                    518: ##############################################
                    519: 
                    520: sub process_outgoing_html {
                    521:     # this should be called on all HTML outputs before displaying
                    522:     # because it will filter out all non-HTML+LONCAPA tags.
                    523:     # tags are not filtered at input stage for greater backwards
                    524:     # compatibility.  note that this disregards course preference.
                             #   $input  - stored XHTML to prepare for output
                             #   $legacy - optional; if true, newlines are converted to breaks
                             #             (only when no block-level tag is found) and raw URLs
                             #             are converted to links before custom tags are handled
                             # returns the processed output string
                    525:     my $self = shift;
                    526:     my $input = shift;
                    527:     my $legacy = shift;
                    528:     
                    529:     my %html = %Apache::lonhtmlgateway::LONCAPA_ALLOWED_STANDARD_TAGS;
                    530:     # entitize all tags that are not explicitly allowed
                             # first pass handles the opening "<", second pass the closing ">":
                             # the bracket is kept only if the uppercased tag name has a true
                             # value in %html and the matched text is shorter than 1000
                             # characters, otherwise it becomes &lt; / &gt;
                    531:     $input =~ s/\<(\/?\s*(\w+)[^\>\<]*)/
                    532:         {($html{uc($2)}&&(length($1)<1000))?"\<$1":"\&lt;$1"}/ge;
                    533:     $input =~ s/(\<?\s*(\w+)[^\<\>]*)\>/
                    534:         {($html{uc($2)}&&(length($1)<1000))?"$1\>":"$1\&gt;"}/ge;
                    535:     if ($legacy) {
                    536:         unless (&contains_block_level_tags($self, $input)) {
                    537:             $input = $self->legacy_newline_to_br($input); 
                    538:         }
                    539:         $input = $self->legacy_raw_href_to_link($input);
                    540:     }
                    541:     # at this point, we need to convert our own custom tags
                    542:     # into the appropriate output
                    543:     # see above for supported tags
                             # NOTE(review): the token type codes tested below presumably follow
                             # the HTML::TokeParser convention (T text, D declaration, C comment,
                             # PI processing instruction, E end tag, S start tag) -- confirm
                             # against HTML::LCParser
                    544:     my $output = "";
                    545:     my $parser = HTML::LCParser->new(\$input);
                    546:     while (my $token = $parser->get_token()) {
                    547:     	if ($token->[0] eq 'T') {
1.3       faziophi  548:             if ($self->{target} ne 'tex') {
                    549:     	        $output .= &Apache::lontexconvert::smiley($token->[1]);
                    550:     	    } else {
                                         # tex target: every character outside the listed set is
                                         # replaced by the result of num_entity()
1.4       faziophi  551:                 my $t = $token->[1];
                    552:                 $t =~ s/([^\n\r\t &<>!\#%\(-;=?-~])/num_entity($1)/ge;
                    553:                 $output .= $t;
1.3       faziophi  554:             }
                    555:         } elsif ($token->[0] eq 'D' || $token->[0] eq 'C') {
1.1       faziophi  556:     	    $output .= $token->[1];
                    557:     	} elsif ($token->[0] eq 'PI' || $token->[0] eq 'E') {
                    558:     	    $output .= $token->[2];
                    559:     	} elsif ($token->[0] eq 'S') {
                    560:     	    my $tag = lc($token->[1]);
                    561:     	    if (exists($custom_tag_parsers{$tag})) {
                                         # custom tag: hand the text returned by get_text() to the
                                         # registered handler and append what it returns
                    562:     	        my $text = $parser->get_text();
                    563:     	        $output .= $custom_tag_parsers{$tag}(
                    564:     	            $self, $text, $self->{target});
                    565:     	    } else {
                    566:     	        $output .= $token->[4];
                    567:     	    }
                    568:     	}
                    569:     }
                    570:     return $output;
                    571: }
                    572: 
                    573: ##############################################
                    574: ##############################################
                    575: 
                    576: =head1 STATIC CLASS METHODS
                    577: 
                    578: The following are static class methods that can be called
                    579: by any object.
                    580: 
                    581: =over 4
                    582: 
                    583: =item contains_block_level_tags
                    584: 
                    585:     $bool = Apache::lonhtmlgateway->contains_block_level_tags($input);
                    586:     
                    587: Uses a regular expression to find, in the input data, any tags 
                    588: described in %LONCAPA_ALLOWED_STANDARD_TAGS as block-level.
                    589: Returns 1 if true, 0 if false. 
                    590: 
                    591: =cut
                    592: 
                    593: ##############################################
                    594: ##############################################
                    595: 
                    596: sub contains_block_level_tags {
                    597:     my $class = shift;
                    598:     my $input = shift;
                    599:     my @block_level_tags = @{&get_block_level_tags($class)};
                    600:     foreach my $tag (@block_level_tags) {
                    601:         if ($input =~ m/\<\/?\s*$tag[^\>\<]*/gi) {
                    602:             # if your input loves this regular expression
                    603:             # as much as i do, then return true.
                    604:             # it searches for either a <tag> or <tag />
                    605:             return 1;
                    606:         }
                    607:     }
                    608:     return 0;
                    609: }
                    610: 
                    611: ##############################################
                    612: ##############################################
                    613: 
                    614: =item get_block_level_tags
                    615: 
                    616:     $tags = Apache::lonhtmlgateway->get_block_level_tags();
                    617:     
                    618: Return a reference to an array with any tags described in 
                    619: %LONCAPA_ALLOWED_STANDARD_TAGS as block-level. Note that these
                    620: tags are returned in no particular order, and the tag names
                    621: are returned in uppercase.
                    622: 
                    623: 
                    624: =cut
                    625: 
                    626: ##############################################
                    627: ##############################################
                    628: 
                    629: sub get_block_level_tags {
                    630:     my $class = shift;
                    631:     my %html = %Apache::lonhtmlgateway::LONCAPA_ALLOWED_STANDARD_TAGS;
                    632:     my @block = [];
                    633:     foreach my $tag (keys(%html)) {
                    634:         if ($html{$tag} == 2) {
                    635:             push(@block, $tag);
                    636:         }
                    637:     }
                    638:     return \@block;
                    639: }
                    640: 
# Build the hexadecimal numeric character reference ("&#xHH;", upper-case
# hex digits) for the first character of the given string.
sub num_entity {
    my ($char) = @_;
    my $code_point = ord($char);
    return sprintf('&#x%X;', $code_point);
}
                    644: 
1.1       faziophi  645: ##############################################
                    646: ##############################################
                    647: 
                    648: =head2 Legacy Functions
                    649: 
                    650: These functions are intended to process input in the same or
                    651: a similar way to how it was processed in LON-CAPA 2.x.
                    652: 
                    653: =item legacy_newline_to_br
                    654: 
                    655: I<(formerly Apache::lonfeedback::newline_to_br)>
                    656: 
                    657:     $converted = Apache::lonhtmlgateway->legacy_newline_to_br($input);
                    658:     
                    659: Parse the input using HTML::LCParser, and in any text nodes
                    660: which contain "\n" characters, replace those characters with
                    661: an HTML "<br />" tag.
                    662: 
                    663: =cut
                    664: 
                    665: ##############################################
                    666: ##############################################
                    667: 
                    668: sub legacy_newline_to_br {
                    669:     my $class = shift;
                    670:     my $input = shift;
                    671:     my $output;
                    672:     my $parser = HTML::LCParser->new(\$input);
                    673:     while (my $token = $parser->get_token()) {
                    674:     	if ($token->[0] eq 'T') {
                    675:     	    my $text = $token->[1];
                    676:     	    $text =~ s/\n/\<br \/\>/g;
                    677:     	    $output .= $text;
                    678:     	} elsif ($token->[0] eq 'D' || $token->[0] eq 'C') {
                    679:     	    $output .= $token->[1];
                    680:     	} elsif ($token->[0] eq 'PI' || $token->[0] eq 'E') {
                    681:     	    $output .= $token->[2];
                    682:     	} elsif ($token->[0] eq 'S') {
                    683:     	    $output .= $token->[4];
                    684:     	}
                    685:     }
                    686:     return $output;
                    687: }
                    688: 
                    689: ##############################################
                    690: ##############################################
                    691: 
                    692: =item legacy_raw_href_to_link
                    693: 
                    694: I<(formerly Apache::lonhtmlcommon::raw_href_to_link)>
                    695:     
                    696:     $converted = Apache::lonhtmlgateway->legacy_raw_href_to_link($input);
                    697:     
                    698: Search for any links/URLs within the input text, and convert them
                    699: to <a> tags whose content is embedded inside a <tt> tag.
                    700: 
                    701: =back
                    702: 
                    703: =cut
                    704: 
                    705: ##############################################
                    706: ##############################################
                    707: 
                    708: sub legacy_raw_href_to_link {
                    709:     my $class = shift;
                    710:     my $input = shift;
                    711:     $input =~ s/(https?\:\/\/[^\s\'\"\<]+)([\s\<]|$)/<a href="$1"><tt>$1<\/tt><\/a>$2/gi;
                    712:     return $input;
                    713: }
                    714: 
                    715: sub parse_algebra_tag {
                    716:     my $self = shift;
                    717:     my $input = shift;
                    718:     # the <algebra> parser does NOT handle entities,
                    719:     # unlike the general <m> parser; thus we run
                    720:     # the content of this tag through HTML::Entities,
1.2       faziophi  721:     # decoding it first. we also just get the tex, and
                    722:     # feed it through as if it were an <mi> tag.
1.1       faziophi  723:     $input = &HTML::Entities::decode($input);
1.5     ! raeburn   724:     my $algebra = 
        !           725:         &Apache::lontexconvert::algebra($input,'tex',undef,undef,undef,'tth');
1.2       faziophi  726:     return &parse_m_tag($self, $algebra);
1.1       faziophi  727: }
                    728: 
                    729: sub parse_mi_tag {
                    730:     my $self = shift;
                    731:     my $input = shift;
1.2       faziophi  732:     return &parse_m_tag($self, '\ensuremath{'.$input.'}');
1.1       faziophi  733: }
                    734: 
                    735: sub parse_md_tag {
                    736:     my $self = shift;
                    737:     my $input = shift;
                    738:     return &parse_m_tag($self, '\['.$input.'\]');
                    739: }
                    740: 
                    741: sub parse_m_tag {
                    742:     my $self = shift;
                    743:     my $input = shift;
1.4       faziophi  744:     if ($self->{target} ne 'tex') {
                    745:         return &Apache::lontexconvert::to_convert($input, $self->{target});
                    746:     } else {
                    747:         return '<m>'.$input.'</m>';
                    748:     }
1.1       faziophi  749: }
                    750: 
                    751: sub parse_chem_tag {
                    752:     my $self = shift;
                    753:     my $input = shift;
                    754:     my $target = $self->{target};
                    755:     # as with the <algebra> tag, some portions of the
                    756:     # <chem> input may be coming in encoded, especially
                    757:     # arrows -- so decode it in HTML::Entities
                    758:     $input = &HTML::Entities::decode($input);
                    759:     my @tokens = split(/(\s\+|\->|<=>|<\-|\.)/,$input);
                    760:     my $formula = '';
                    761:     foreach my $token (@tokens) {
                    762:     	if ($token eq '->' ) {
                    763:     	    if ($target eq 'web') {
                    764:     	        $formula .= '&#8594; ';
                    765:     	    } else {
                    766:     	        $formula .= '<m>\ensuremath{\rightarrow}</m> ';
                    767:     	    }
                    768:     	    next;
                    769:     	}
                    770:     	if ($token eq '<-' ) {
                    771:     	    if ($target eq 'web') {
                    772:     	        $formula .= '&#8592; ';
                    773:     	    } else {
                    774:     	        $formula .= '<m>\ensuremath{\leftarrow}</m> ';
                    775:     	    }
                    776:     	    next;
                    777:     	}  
                    778:     	if ($token eq '<=>') {
                    779:     	    if ($target eq 'web') {
                    780:     		$formula .= '&#8652; ';
                    781:     	    } else {
                    782:     		$formula .= '<m>\ensuremath{\rightleftharpoons}</m> ';
                    783:     	    }
                    784:     	    next;
                    785:     	}
                    786:     	if ($token eq '.') {
                    787:     	  $formula =~ s/(\&nbsp\;| )$//;
                    788:     	  $formula .= '&middot;';
                    789:     	  next;
                    790:     	}
                    791:     	$token =~ /^\s*([\d|\/]*(?:&frac\d\d)?)(.*)/;
                    792:             $formula .= $1 if ($1 ne '1');  # stoichiometric coefficient
                    793:     	my $molecule = $2;
                    794:     	# subscripts
                    795:     	$molecule =~ s|(?<=[a-zA-Z\)\]\s])(\d+)|<sub>$1</sub>|g;
                    796:     	# superscripts
                    797:     	$molecule =~ s|\^(\d*[+\-]*)|<sup>$1</sup>|g;
                    798:     	# strip whitespace
                    799:     	$molecule =~ s/\s*//g;
                    800:     	# forced space
                    801:     	$molecule =~ s/_/ /g;
                    802:     	$molecule =~ s/-/&minus;/g;
                    803:     	$formula .= $molecule.'&nbsp;';
                    804:     }
                    805:     # get rid of trailing space
                    806:     $formula =~ s/(\&nbsp\;| )$//;
                    807:     return $formula;
                    808: }
                    809: 
                    810: ##############################################
                    811: ##############################################
                    812: 
                    813: =head1 AUTHORS
                    814: 
                    815: Phil Fazio
                    816: 
                    817: =head1 VERSION
                    818: 
1.5     ! raeburn   819: $Id: lonhtmlgateway.pm,v 1.4 2010/05/04 19:45:05 faziophi Exp $
1.1       faziophi  820: 
                    821: =cut
                    822: 
                    823: ##############################################
                    824: ##############################################
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>