Annotation of loncom/interface/lonhtmlgateway.pm, revision 1.5
1.1 faziophi 1: # The LearningOnline Network with CAPA
2: # gateway for html input/output to be properly parsed and handled
3: #
1.5 ! raeburn 4: # $Id: lonhtmlgateway.pm,v 1.4 2010/05/04 19:45:05 faziophi Exp $
1.1 faziophi 5: #
6: # Copyright Michigan State University Board of Trustees
7: #
8: # This file is part of the LearningOnline Network with CAPA (LON-CAPA).
9: #
10: # LON-CAPA is free software; you can redistribute it and/or modify
11: # it under the terms of the GNU General Public License as published by
12: # the Free Software Foundation; either version 2 of the License, or
13: # (at your option) any later version.
14: #
15: # LON-CAPA is distributed in the hope that it will be useful,
16: # but WITHOUT ANY WARRANTY; without even the implied warranty of
17: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18: # GNU General Public License for more details.
19: #
20: # You should have received a copy of the GNU General Public License
21: # along with LON-CAPA; if not, write to the Free Software
22: # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
23: #
24: # /home/httpd/html/adm/gpl.txt
25: #
26: # http://www.lon-capa.org/
27: #
28: ######################################################################
29: ######################################################################
30:
31: =pod
32:
33: =head1 NAME
34:
35: Apache::lonhtmlgateway - properly parse and handle HTML input and output
36:
37: =head1 SYNOPSIS
38:
39: This is part of the LearningOnline Network with CAPA project
40: described at http://www.lon-capa.org.
41:
42: =head1 INTRODUCTION
43:
44: lonhtmlgateway is an object-oriented module used to parse and correct
45: malformed HTML input from the client, as well as to perform processing
46: of custom LON-CAPA HTML output before it is sent along to the end-user.
47: It replaces a number of subroutines in various modules, and adds new
48: code to tidy and process malformed HTML using XML::LibXML.
49:
50: This module is intended to be used for all non-authoring perspectives
51: in the system.
52:
53: New to LON-CAPA version 3.0.
54:
55: =head2 Example Usage
56:
57: Below is intended code to be invoked and called for use outside
58: of this module:
59:
60: $gateway = Apache::lonhtmlgateway->new();
61: $gateway = Apache::lonhtmlgateway->new($target);
62:
63: $xhtml = $gateway->process_incoming_html($html);
64: $xhtml = $gateway->process_incoming_html($html, $legacy);
65:
66: $xml = $gateway->process_html_to_xml($html);
67: $xhtml = $gateway->process_xml_to_html($xml);
68:
69: $bool = Apache::lonhtmlgateway->contains_block_level_tags($input);
70:
71: =head1 GLOBAL VARIABLES
72:
73: =over 4
74:
75: =cut
76:
77: ######################################################################
78: ######################################################################
79:
80: package Apache::lonhtmlgateway;
81:
82: use strict;
83: use utf8;
84: use Time::Local;
85: use Time::HiRes;
86: use Apache::lonlocal;
87: use Apache::lonnet;
88: use Apache::lonhtmlcommon;
89: use Apache::lonxml;
90: use Apache::lontexconvert;
91: use lib '/home/httpd/lib/perl/';
92: use LONCAPA;
93: use XML::LibXML;
94: use Encode;
95: use HTML::Entities;
96: use HTML::LCParser();
97: use Safe();
98:
# Serialization flags for XML::LibXML (skip the XML declaration, skip the
# DTD, and control how empty tags are written).
# NOTE(review): these are set with `local` at file scope. Perl restores a
# localized value when the enclosing file finishes executing, so these
# settings are presumably no longer in effect when the gateway methods are
# called later -- confirm whether a plain assignment, or a `local` inside
# the serializing method, was intended.
99: local $XML::LibXML::skipXMLDeclaration = 1;
100: local $XML::LibXML::skipDTD = 1;
101: local $XML::LibXML::setTagCompression = 1;
102:
103: ##############################################
104: ##############################################
105:
106: =item %LONCAPA_ALLOWED_STANDARD_TAGS
107:
108: This is a hash of all tags, both HTML and custom LON-CAPA tags that
109: are allowed in non-authoring spaces. Examples of this include
110: course documents, bulletin boards, discussion posts, templated pages,
111: etc. In addition, in the event of rich text editing, the WYSIWYG
112: editor needs to know how to display LON-CAPA custom tags as either
113: inline-level (<span>) or block-level (<div>). Therefore, the hash is
114: set up with uppercase tag names as keys ("H1"), and the corresponding
115: entry an integer constant indicating that tag's role or purpose:
116:
117: =over 4
118:
119: =item 0 =
120:
121: Tag is explicitly not allowed. Currently not used anywhere in this
122: module, but reserved for the future in case certain tags would like
123: to be explicitly blacklisted.
124:
125: =item 1 =
126:
127: Tag is allowed, and in cases where it is unclear, is rendered as an
128: inline-level element. Example: <algebra> should be rendered as an
129: inline element.
130:
131: =item 2 =
132:
133: Tag is allowed, and in cases where it is unclear, is rendered as a
134: block-level element. Example: <md> should be rendered as a block
135: element.
136:
137: =back
138:
139: =back
140:
141: =cut
142:
143: ##############################################
144: ##############################################
145:
# Whitelist of tags accepted in non-authoring spaces, keyed by uppercase
# tag name.  The value is the tag's rendering role: 1 = inline-level,
# 2 = block-level.  (0 is reserved for "explicitly not allowed" and is
# not used by any entry.)
our %LONCAPA_ALLOWED_STANDARD_TAGS = (
    # inline-level: basic formatting and phrases, linking and embedding,
    # and the inline LON-CAPA custom tags
    ( map { ($_ => 1) } qw(
        B I U STRONG EM STRIKE
        BIG SMALL INS DEL S
        Q DFN CODE SAMP KBD VAR
        SUB SUP
        A IMG
        M MI ALGEBRA CHEM
    ) ),
    # block-level: headers, structural blocks, table-related tags, and
    # the display-math LON-CAPA custom tag
    ( map { ($_ => 2) } qw(
        H1 H2 H3 H4 H5 H6
        P DIV OL UL LI ADDRESS
        BR HR BLOCKQUOTE PRE
        TABLE CAPTION TBODY TR TD TH
        MD
    ) ),
);
166:
167: ##############################################
168: ##############################################
169:
170: =head1 PARSING LON-CAPA CUSTOM TAGS
171:
172: This module maintains a hash %custom_tag_parsers, containing
173: lowercase tag names as keys and function references as entries.
174: Convention used here names the actual parsing function whose
175: reference is stored here to be of the name &parse_tagname_tag().
176: These functions are called during the processing of outgoing
177: HTML output in the &process_outgoing_html() function.
178:
179: Each of these functions is passed the following arguments:
180:
181: =over 4
182:
183: =item self
184:
185: Reference to Apache::lonhtmlgateway object calling the function.
186:
187: =item input
188:
189: Textual context extracted between the <tag> and </tag> tags.
190: Note that this text I<could> contain HTML entities. Thus, for
191: functions that cannot handle entitized input,
192: &HTML::Entities::decode_entities() should be called on this data
193: before further handing it off.
194:
195: =back
196:
197: Example hash entry:
198:
199: mi => \&parse_mi_tag,
200:
201: =head2 Currently Supported Custom Tags
202:
203: =over 4
204:
205: =item <algebra>
206:
207: Intended to convert and simplify simple algebraic functions into
208: readable output. Corrects cases such as double negatives or
209: eliminates coefficients of 1 where appropriate. The actual
210: handling of content contained in this tag takes place inside
211: L<Apache::lontexconvert>, which in turn uses the AlgParser
212: module to actually process the input.
213:
214: Usage:
215: <algebra>2*x+(-5)</algebra>
216:
217: =item <chem>
218:
219: Formatter for chemical equations, adding superscripts, subscripts,
220: and arrow characters as appropriate. This parser is
221: wholly contained inside this module, but is a copy of a routine
222: found in homework/default_homework.lcpm.
223:
224: Usage:
225: <chem>CH3CO2H + H2O <=> CH3CO2- + H3O+</chem>
226:
227: =back
228:
229: =head3 Math Mode Tags
230:
231: These tags are intended for LaTeX math mode input, in order to
232: produce complex mathematical and scientific constructs, which
233: normal HTML cannot produce. The output is later rendered by
234: a user-defined TeX engine in web target, or handled directly
235: in the case of tex target. The only difference between the tags
236: below is determining the author's intent on how to appropriately
237: render the contents within the tag - this intent is
238: important in preserving the What You See Is What You Get philosophy
239: of the rich text editor.
240:
241: =over 4
242:
243: =item <mi>
244:
245: Inline math mode tag. Content is surrounded by "$" characters and
246: passed to the parser for the <m> tag.
247:
248: I<New for LON-CAPA 3.0>.
249:
250: =item <md>
251:
252: Display block math mode tag. Content is surrounded by "\[" and
253: "\]" characters and passed to the parser for the <m> tag.
254:
255: I<New for LON-CAPA 3.0>.
256:
257: =item <m>
258:
259: Math mode tag. Allows author to fully specify the display of their
260: TeX input, and contain mixed inline-and-block content within a single
261: tag.
262:
263: Due to tools such as the rich text editor needing to know whether a
264: custom tag is block-level or inline-level on render, the use of this
265: tag is discouraged starting with LON-CAPA 3.0 although it will continue
266: to function. Fully compatible with legacy LON-CAPA 2.x content.
267:
268: =back
269:
270: =cut
271:
272: ##############################################
273: ##############################################
274:
# Dispatch table for LON-CAPA custom tags: lowercase tag name => reference
# to the routine that renders the text found inside that tag.
my %custom_tag_parsers = (
    'algebra' => \&parse_algebra_tag,
    'chem'    => \&parse_chem_tag,
    'm'       => \&parse_m_tag,
    'md'      => \&parse_md_tag,
    'mi'      => \&parse_mi_tag,
);
282:
283: ##############################################
284: ##############################################
285:
286: =head1 CLASS OBJECT CONSTRUCTOR
287:
288: =over 4
289:
290: =item new
291:
292: $gateway = Apache::lonhtmlgateway->new();
293: $gateway = Apache::lonhtmlgateway->new($target);
294:
295: Constructs and returns a new gateway object. An optional argument
296: specifies the target of the output; it defaults to 'web'.
297: Behind the scenes, a single XML::LibXML parser object is created
298: and stored in the gateway object. When the gateway object is
299: destroyed, this parser object is released as well.
300:
301: =back
302:
303: =cut
304:
305: ##############################################
306: ##############################################
307:
# Constructor.  May be invoked on the class or on an existing instance.
#   $target  (optional) output target; any false value selects 'web'.
# Returns a blessed hash holding the libxml parser ('parser') and the
# chosen target ('target').
sub new {
    my ($invocant, $target) = @_;
    my $class = ref($invocant) || $invocant;

    # one libxml parser per gateway object, configured to recover from
    # malformed markup without reporting the errors it recovers from
    my $libxml = XML::LibXML->new();
    $libxml->recover(1);
    $libxml->recover_silently(1);

    my %fields = (
        parser => $libxml,
        target => $target || 'web',
    );
    return bless(\%fields, $class);
}
323:
# Destructor: drop this object's reference to the libxml parser.
# The earlier code copied the reference into a lexical and undef'ed the
# copy, which left $self->{parser} untouched; deleting the hash slot is
# what actually releases the object's hold on the parser.
sub DESTROY {
    my $self = shift;
    delete($self->{parser});
    return;
}
329:
330: ##############################################
331: ##############################################
332:
333: =head1 PUBLIC OBJECT METHODS
334:
335: =over 4
336:
337: =item process_html_to_xml
338:
339: $xml = $gateway->process_html_to_xml($html);
340:
341: Takes presumably-malformed HTML, encodes ampersand characters
342: and passes the result to the XML::LibXML parser, which creates
343: a DOM tree in memory of the content. This parse is as error-tolerant
344: as can be set, and libxml attempts to recover from any errors as much
345: as possible. This DOM tree is then taken and serialized,
346: eliminating unbalanced and malformed tags along the way. This
347: XML code (without any header tags) is then returned to the caller.
348:
349: =cut
350:
351: ##############################################
352: ##############################################
353:
# Tidy (possibly malformed) HTML into balanced markup.
#   $input  raw HTML string
# Returns the serialized children of the parsed <body> element, or the
# empty string when the input is empty/undefined or the parser produced
# no <body> element.
sub process_html_to_xml {
    my ($self, $input) = @_;
    my $parser = $self->{parser};

    if (!defined($input) || length($input) < 1) { return ""; }

    # only encode ampersands -- brackets may be valid tags
    my $encoded = &HTML::Entities::encode_entities($input, '&');

    # inside <chem>...</chem> the strings "<=>", "<-" and "->" must be
    # entitized, otherwise the HTML parser treats the "<" as the start of
    # a tag and destroys the formula.  Every arrow of every <chem> element
    # is handled, including elements that span several lines.
    $encoded =~ s{(<\s*chem\s*>)(.*?)(<\s*/chem\s*>)}
                 {my ($open, $body, $close) = ($1, $2, $3);
                  $open . _entitize_chem_arrows($body) . $close}gise;

    # parse into libXML to tidy tags, we suppress any errors
    # because otherwise the parser complains about non-HTML
    # tags to STDERR and the Apache error logs
    my $dom = $parser->parse_html_string($encoded,
                                         {
                                             suppress_errors   => 1,
                                             suppress_warnings => 1,
                                             recover           => 2
                                         }
                                        );

    # the dom returns a full <html> structure, so just get
    # all the child nodes of the <body> tag and put them together
    my ($body) = $dom->findnodes('/html/body');
    return "" unless (defined($body));

    # entities passed into $input are in the form of '&amp;lt;'
    # they are double entities
    return join('', map { $_->toString() } $body->childNodes);
}

# Replace the reaction arrows in the text of one <chem> element with
# entity-escaped equivalents so that no raw "<" or ">" remains in them.
sub _entitize_chem_arrows {
    my ($body) = @_;
    $body =~ s/<=>/&lt;&#61;&gt;/g;
    $body =~ s/->/-&gt;/g;
    $body =~ s/<-/&lt;-/g;
    return $body;
}
392:
393: ##############################################
394: ##############################################
395:
396: =item process_xml_to_html
397:
398: $xhtml = $gateway->process_xml_to_html($xml);
399:
400: Takes XML input, decodes one level of HTML entities,
401: and returns the result to the caller.
402:
403: =cut
404:
405: ##############################################
406: ##############################################
407:
# Undo one level of entity encoding so that the text is back at the
# entity level of the original input:
#   $input "<"  -->  $xml "&lt;"  -->  "<"
# Returns the decoded string, which can then be stored and parsed as XHTML.
sub process_xml_to_html {
    my ($self, $input) = @_;
    return scalar(&HTML::Entities::decode_entities($input));
}
418:
419: ##############################################
420: ##############################################
421:
422: =item process_incoming_html
423:
424: $xhtml = $gateway->process_incoming_html($html);
425: $xhtml = $gateway->process_incoming_html($html, $legacy);
426:
427: Designed to be called for all raw HTML inputs from the client
428: side before storing or rendering data. Decodes UTF-8 data,
429: trims leading and trailing whitespace and one trailing "<br />" tag. Processes
430: the result through the XML parser, converts this back to
431: balanced well-formed XHTML, re-encodes the result as UTF-8,
432: and returns the result to the caller.
433:
434: =over 4
435:
436: =item legacy
437:
438: $legacy = 0;
439: $legacy = 1;
440:
441: I<(optional)> If true, adds additional processing intended
442: to emulate LON-CAPA 2.x parsing of the content.
443:
444: =back
445:
446: =cut
447:
448: ##############################################
449: ##############################################
450:
# Normalize raw HTML received from the client before it is stored.
#   $input   UTF-8 encoded HTML octets
#   $legacy  (optional) if true, and the input has no block-level tags and
#            no <p> tag, the <p>...</p> wrapper added by the tidy step is
#            removed again
# Returns balanced XHTML, re-encoded as UTF-8.
# For consistency's sake, call process_html_to_xml afterwards if the
# result needs to be embedded in XML later on.
sub process_incoming_html {
    my ($self, $input, $legacy) = @_;

    # the parser works on characters, not octets, so decode first
    $input = &Encode::decode_utf8($input);
    $input = '' unless (defined($input));

    # trim leading and trailing whitespace (newlines included), then one
    # trailing HTML break
    $input =~ s/\s+$//s;
    $input =~ s/^\s+//s;
    $input =~ s/\<br\s*\/*\>$//s;

    my $no_p_input = (length($input) > 0 && $input !~ m/\<[\s]*p[\s]*\>/is);
    my $xml = $self->process_html_to_xml($input);

    # the xml returns content inside a <p> tag if there are no block
    # tags... thus to preserve old behavior, we strip out that <p></p>.
    # The check is invoked as a method so that the input lands in the
    # input slot; called with the input as the only argument it could
    # never detect a block-level tag.
    if ($legacy && $no_p_input && !$self->contains_block_level_tags($input)) {
        $xml =~ s/^\<p\>(.*)\<\/p\>/$1/si;
    }

    my $xhtml = $self->process_xml_to_html($xml);
    # hand octets back to the caller, mirroring the decode above
    $xhtml = &Encode::encode_utf8($xhtml);
    return $xhtml;
}
483:
484: ##############################################
485: ##############################################
486:
487: =item process_outgoing_html
488:
489: $html = $gateway->process_outgoing_html($xhtml);
490: $html = $gateway->process_outgoing_html($xhtml, $legacy);
491:
492: Designed to be called for all HTML outputs to the client
493: side before rendering data. This entitizes all non-allowed
494: tags, as was previously done in Apache::lonfeedback, and
495: processes and converts all LON-CAPA supported custom tags (see
496: above) to their respective output HTML.
497:
498: =over 4
499:
500: =item legacy
501:
502: $legacy = 0;
503: $legacy = 1;
504:
505: I<(optional)> If true, adds additional processing intended
506: to emulate LON-CAPA 2.x parsing of the content. This includes
507: behavior to convert "\n" to "<br />" if there are no block-level
508: tags detected in the input. In addition, raw URLs are converted
509: automatically to <a> links.
510:
511: =back
512:
513: =back
514:
515: =cut
516:
517: ##############################################
518: ##############################################
519:
# Prepare stored XHTML for display.
#   $input   XHTML string
#   $legacy  (optional) if true, newlines become <br /> when no block-level
#            tag is present, and raw URLs are turned into links
# Every tag that is not whitelisted in %LONCAPA_ALLOWED_STANDARD_TAGS is
# entitized, then the LON-CAPA custom tags are rendered through
# %custom_tag_parsers.  Tags are not filtered at the input stage for
# greater backwards compatibility.  Note that this disregards course
# preference.
sub process_outgoing_html {
    my ($self, $input, $legacy) = @_;
    $input = '' unless (defined($input));

    my %html = %Apache::lonhtmlgateway::LONCAPA_ALLOWED_STANDARD_TAGS;
    # a tag is kept only if it is whitelisted and of sane length
    my $is_allowed = sub {
        my ($name, $chunk) = @_;
        return ($html{uc($name)} && (length($chunk) < 1000));
    };

    # entitize all tags that are not explicitly allowed: first the opening
    # bracket, then the closing bracket.  The rejected case has to emit
    # &lt; / &gt; -- emitting the bracket itself in both cases would let
    # every tag through unfiltered.
    $input =~ s{\<(\/?\s*(\w+)[^\>\<]*)}
               {$is_allowed->($2, $1) ? "<$1" : "&lt;$1"}ge;
    $input =~ s{(\<?\s*(\w+)[^\<\>]*)\>}
               {$is_allowed->($2, $1) ? "$1>" : "$1&gt;"}ge;

    if ($legacy) {
        unless (&contains_block_level_tags($self, $input)) {
            $input = $self->legacy_newline_to_br($input);
        }
        $input = $self->legacy_raw_href_to_link($input);
    }

    # at this point, we need to convert our own custom tags
    # into the appropriate output
    # (token fields are assumed to follow the HTML::TokeParser layout --
    # TODO confirm against HTML::LCParser)
    my $output = "";
    my $parser = HTML::LCParser->new(\$input);
    while (my $token = $parser->get_token()) {
        my $type = $token->[0];
        if ($type eq 'T') {
            if ($self->{target} ne 'tex') {
                $output .= &Apache::lontexconvert::smiley($token->[1]);
            } else {
                # for tex, replace every character outside the listed
                # ASCII set by a numeric entity
                my $t = $token->[1];
                $t =~ s/([^\n\r\t &<>!\#%\(-;=?-~])/num_entity($1)/ge;
                $output .= $t;
            }
        } elsif ($type eq 'D' || $type eq 'C') {
            $output .= $token->[1];
        } elsif ($type eq 'PI' || $type eq 'E') {
            # NOTE(review): the end tag that closes a custom tag is not
            # consumed below, so it presumably is emitted here -- confirm
            $output .= $token->[2];
        } elsif ($type eq 'S') {
            my $tag = lc($token->[1]);
            if (exists($custom_tag_parsers{$tag})) {
                my $text = $parser->get_text();
                $output .= $custom_tag_parsers{$tag}(
                    $self, $text, $self->{target});
            } else {
                $output .= $token->[4];
            }
        }
    }
    return $output;
}
572:
573: ##############################################
574: ##############################################
575:
576: =head1 STATIC CLASS METHODS
577:
578: The following are static class methods that can be called
579: by any object.
580:
581: =over 4
582:
583: =item contains_block_level_tags
584:
585: $bool = Apache::lonhtmlgateway->contains_block_level_tags($input);
586:
587: Uses a regular expression to find, in the input data, any tags
588: described in %LONCAPA_ALLOWED_STANDARD_TAGS as block-level.
589: Returns 1 if true, 0 if false.
590:
591: =cut
592:
593: ##############################################
594: ##############################################
595:
# Report whether $input contains an opening or closing tag whose name is
# listed as block-level.
# Accepts both call styles:
#   Apache::lonhtmlgateway->contains_block_level_tags($input)   (or $obj->)
#   Apache::lonhtmlgateway::contains_block_level_tags($input)
# Returns 1 if such a tag is found, 0 otherwise (also for undefined input).
sub contains_block_level_tags {
    # with a single argument there is no invocant: it is the input itself
    my $class = (@_ > 1) ? shift : undef;
    my $input = shift;
    return 0 unless (defined($input) && length($input));

    foreach my $tag (@{ get_block_level_tags($class) }) {
        # defensive: skip anything that is not a plain tag name
        next if (!defined($tag) || ref($tag));
        # match "<tag" or "</tag" as a whole name, so that e.g. P does not
        # fire on <param>; the name is quoted because it is interpolated
        if ($input =~ m/\<\/?\s*\Q$tag\E(?![\w-])/i) {
            return 1;
        }
    }
    return 0;
}
610:
611: ##############################################
612: ##############################################
613:
614: =item get_block_level_tags
615:
616: $tags = Apache::lonhtmlgateway::get_block_level_tags();
617:
618: Returns a reference to an array with any tags described in
619: %LONCAPA_ALLOWED_STANDARD_TAGS as block-level. Note that these
620: tags are returned in no particular order, and the tag names
621: are returned in uppercase.
622:
623:
624: =cut
625:
626: ##############################################
627: ##############################################
628:
# Collect the names of all tags whose role in
# %LONCAPA_ALLOWED_STANDARD_TAGS is 2 (block-level).
# Returns a reference to an array of uppercase tag names, in no
# particular order.  The list starts out empty: seeding it with "[]"
# would put an array reference in front of the tag names.
sub get_block_level_tags {
    my $class = shift;
    my %html = %Apache::lonhtmlgateway::LONCAPA_ALLOWED_STANDARD_TAGS;
    my @block = grep { $html{$_} == 2 } keys(%html);
    return \@block;
}
640:
# Turn the first character of the argument into a hexadecimal numeric
# character reference, e.g. "&#xE9;".
sub num_entity {
    my $codepoint = ord($_[0]);
    return sprintf('&#x%X;', $codepoint);
}
644:
1.1 faziophi 645: ##############################################
646: ##############################################
647:
648: =head2 Legacy Functions
649:
650: These functions are intended to process input in the same or
651: a similar way to how it was processed in LON-CAPA 2.x.
652:
653: =item legacy_newline_to_br
654:
655: I<(formerly Apache::lonfeedback::newline_to_br)>
656:
657: $converted = Apache::lonhtmlgateway::legacy_newline_to_br($input);
658:
659: Parse the input using HTML::LCParser, and in any text nodes
660: which contain "\n" characters, replace those characters with
661: an HTML "<br />" tag.
662:
663: =cut
664:
665: ##############################################
666: ##############################################
667:
# Replace every "\n" inside text nodes by "<br />"; all other tokens are
# copied through unchanged.
#   $input  HTML string
# Returns the converted string; empty or undefined input yields the empty
# string rather than undef, so callers can keep processing the result.
# (Token fields are assumed to follow the HTML::TokeParser layout --
# TODO confirm against HTML::LCParser.)
sub legacy_newline_to_br {
    my ($class, $input) = @_;
    my $output = '';
    return $output unless (defined($input) && length($input));

    my $parser = HTML::LCParser->new(\$input);
    while (my $token = $parser->get_token()) {
        my $type = $token->[0];
        if ($type eq 'T') {
            my $text = $token->[1];
            $text =~ s/\n/\<br \/\>/g;
            $output .= $text;
        } elsif ($type eq 'D' || $type eq 'C') {
            $output .= $token->[1];
        } elsif ($type eq 'PI' || $type eq 'E') {
            $output .= $token->[2];
        } elsif ($type eq 'S') {
            $output .= $token->[4];
        }
    }
    return $output;
}
688:
689: ##############################################
690: ##############################################
691:
692: =item legacy_raw_href_to_link
693:
694: I<(formerly Apache::lonhtmlcommon::raw_href_to_link)>
695:
696: $converted = Apache::lonhtmlgateway->legacy_raw_href_to_link($input);
697:
698: Search for any links/URLs within the input text, and convert them
699: to <a> tags whose content is embedded inside a <tt> tag.
700:
701: =back
702:
703: =cut
704:
705: ##############################################
706: ##############################################
707:
# Wrap every bare http/https URL that is followed by whitespace, a "<" or
# the end of the string in an <a> link whose label is the URL inside <tt>.
# Returns the converted string.
sub legacy_raw_href_to_link {
    my ($class, $input) = @_;
    my $raw_url = qr/(https?\:\/\/[^\s\'\"\<]+)([\s\<]|$)/i;
    $input =~ s/$raw_url/<a href="$1"><tt>$1<\/tt><\/a>$2/g;
    return $input;
}
714:
# Render the content of an <algebra> tag.
# The algebra converter does NOT handle entities (unlike the general <m>
# parser), so the text is entity-decoded first.  The converter is asked
# for tex, and that tex is then rendered by the <m> tag handler.
sub parse_algebra_tag {
    my ($self, $input) = @_;
    my $decoded = &HTML::Entities::decode($input);
    my $tex = &Apache::lontexconvert::algebra(
        $decoded, 'tex', undef, undef, undef, 'tth');
    return parse_m_tag($self, $tex);
}
728:
# Render the content of an <mi> (inline math) tag: wrap it in
# \ensuremath{...} and hand it to the <m> tag handler.
sub parse_mi_tag {
    my ($self, $input) = @_;
    my $wrapped = '\ensuremath{' . $input . '}';
    return parse_m_tag($self, $wrapped);
}
734:
# Render the content of an <md> (display math) tag: wrap it in \[ ... \]
# and hand it to the <m> tag handler.
sub parse_md_tag {
    my ($self, $input) = @_;
    my $wrapped = '\[' . $input . '\]';
    return parse_m_tag($self, $wrapped);
}
740:
# Render the content of an <m> tag.
# For the 'tex' target the input is passed on re-wrapped in <m>...</m>;
# for every other target it is handed to the TeX converter together with
# the target name.
sub parse_m_tag {
    my ($self, $input) = @_;
    my $target = $self->{target};
    return '<m>'.$input.'</m>' if ($target eq 'tex');
    return &Apache::lontexconvert::to_convert($input, $target);
}
750:
# Render the content of a <chem> tag as a formatted chemical equation.
#   $input  text between <chem> and </chem>; may still contain entities
# Returns HTML with subscripts, superscripts and reaction arrows.  For the
# 'web' target arrows are written as character references, for any other
# target as TeX wrapped in <m>...</m>.
# All special characters are written as entity references (&nbsp;,
# &middot;, &minus;, arrows) so that the output stays plain ASCII and the
# "remove the trailing &nbsp;" patterns match what was appended.
sub parse_chem_tag {
    my ($self, $input) = @_;
    my $target = $self->{target};

    my %web_arrow = (
        '->'  => '&rarr; ',
        '<-'  => '&larr; ',
        '<=>' => '&#8652; ',
    );
    my %tex_arrow = (
        '->'  => '<m>\ensuremath{\rightarrow}</m> ',
        '<-'  => '<m>\ensuremath{\leftarrow}</m> ',
        '<=>' => '<m>\ensuremath{\rightleftharpoons}</m> ',
    );

    # as with the <algebra> tag, some portions of the
    # <chem> input may be coming in encoded, especially
    # arrows -- so decode it in HTML::Entities
    $input = &HTML::Entities::decode($input);

    # split into molecules and separators, keeping the separators
    my @tokens = split(/(\s\+|\->|<=>|<\-|\.)/,$input);
    my $formula = '';
    foreach my $token (@tokens) {
        if (exists($web_arrow{$token})) {
            $formula .= ($target eq 'web') ? $web_arrow{$token}
                                           : $tex_arrow{$token};
            next;
        }
        if ($token eq '.') {
            # the dot binds directly to the previous molecule
            $formula =~ s/(?:\&nbsp\;| )$//;
            $formula .= '&middot;';
            next;
        }
        my ($coefficient, $molecule) =
            ($token =~ /^\s*([\d|\/]*(?:&frac\d\d)?)(.*)/);
        # stoichiometric coefficient (a coefficient of 1 is not shown)
        $formula .= $coefficient if ($coefficient ne '1');
        # subscripts
        $molecule =~ s|(?<=[a-zA-Z\)\]\s])(\d+)|<sub>$1</sub>|g;
        # superscripts
        $molecule =~ s|\^(\d*[+\-]*)|<sup>$1</sup>|g;
        # strip whitespace
        $molecule =~ s/\s*//g;
        # forced space
        $molecule =~ s/_/ /g;
        $molecule =~ s/-/&minus;/g;
        $formula .= $molecule.'&nbsp;';
    }
    # get rid of trailing space
    $formula =~ s/(?:\&nbsp\;| )$//;
    return $formula;
}
809:
810: ##############################################
811: ##############################################
812:
813: =head1 AUTHORS
814:
815: Phil Fazio
816:
817: =head1 VERSION
818:
1.5 ! raeburn 819: $Id: lonhtmlgateway.pm,v 1.4 2010/05/04 19:45:05 faziophi Exp $
1.1 faziophi 820:
821: =cut
822:
823: ##############################################
824: ##############################################
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>