loncom/homework/math_parser/Tokenizer.pm - annotate

Return to Tokenizer.pm CVS log
Up to [LON-CAPA] / loncom / homework / math_parser
Annotation of loncom/homework/math_parser/Tokenizer.pm, revision 1.3

1.1       damieng     1: # The LearningOnline Network with CAPA - LON-CAPA
                      2: # String tokenizer
                      3: #
1.3     ! raeburn     4: # $Id: Tokenizer.pm,v 1.3 2023/03/13 18:30:00 raeburn Exp $
        !             5: #
1.1       damieng     6: # Copyright (C) 2014 Michigan State University Board of Trustees
                      7: #
                      8: # This program is free software: you can redistribute it and/or modify
                      9: # it under the terms of the GNU General Public License as published by
                     10: # the Free Software Foundation, either version 3 of the License, or
                     11: # (at your option) any later version.
                     12: #
                     13: # This program is distributed in the hope that it will be useful,
                     14: # but WITHOUT ANY WARRANTY; without even the implied warranty of
                     15: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
                     16: # GNU General Public License for more details.
                     17: #
                     18: # You should have received a copy of the GNU General Public License
                     19: # along with this program. If not, see <http://www.gnu.org/licenses/>.
                     20: #
                     21: 
                     22: ##
                     23: # String tokenizer. Recognizes only names, numbers, and parser operators.
                     24: ##
                     25: package Apache::math_parser::Tokenizer;
                     26: 
                     27: use strict;
                     28: use warnings;
                     29: use utf8;
                     30: 
                     31: use aliased 'Apache::math_parser::Definitions';
                     32: use aliased 'Apache::math_parser::ParseException';
                     33: use aliased 'Apache::math_parser::Token';
                     34: 
                     35: ##
                     36: # @constructor
                     37: # @param {Definitions} defs - Operator definitions
                     38: # @param {string} text - The text to tokenize
                     39: ##
                     40: sub new {
                     41:     my $class = shift;
                     42:     my $self = {
                     43:         _defs => shift,
                     44:         _text => shift,
                     45:     };
                     46:     bless $self, $class;
                     47:     return $self;
                     48: }
                     49: 
                     50: # Attribute helpers
                     51: 
                     52: ##
                     53: # Operator definitions
                     54: # @returns {Definitions}
                     55: ##
                     56: sub defs {
                     57:     my $self = shift;
                     58:     return $self->{_defs};
                     59: }
                     60: 
                     61: ##
                     62: # The text to tokenize
                     63: # @returns {string}
                     64: ##
                     65: sub text {
                     66:     my $self = shift;
                     67:     return $self->{_text};
                     68: }
                     69: 
                     70: 
                     71: ##
                     72: # Tokenizes the text.
                     73: # Can throw a ParseException.
                     74: # @returns {Token[]}
                     75: ##
                     76: sub tokenize {
                     77:     my( $self ) = @_;
                     78:     my( $text, $c, $i, $from, @tokens, $value );
                     79:     my @operators = @{$self->defs->operators};
                     80:     my $dec1 = Definitions->DECIMAL_SIGN_1;
                     81:     my $dec2 = Definitions->DECIMAL_SIGN_2;
                     82:     
                     83:     $text = $self->text;
                     84:     if (!defined $text) {
                     85:         die "Math Tokenizer: undefined text";
                     86:     }
                     87:     if (!utf8::is_utf8($text)) {
                     88:         utf8::decode($text);
                     89:     }
                     90:     $i = 0;
                     91:     $c = $i < length($text) ? substr($text, $i, 1) : '';
                     92:     @tokens = ();
                     93:     
                     94: main:
                     95:     while ($c ne '') {
                     96:         $from = $i;
                     97:         
                     98:         # ignore whitespace
                     99:         if ($c le ' ') {
                    100:             $i++;
                    101:             $c = $i < length($text) ? substr($text, $i, 1) : '';
                    102:             next;
                    103:         }
                    104:         
                    105:         # check for numbers before operators
                    106:         # (numbers starting with . will not be confused with the . operator)
                    107:         if (($c ge '0' && $c le '9') ||
                    108:                 (($c eq $dec1 || $c eq $dec2) &&
                    109:                 (substr($text, $i+1, 1) ge '0' && substr($text, $i+1, 1) le '9'))) {
                    110:             $value = '';
                    111:             
                    112:             if ($c ne $dec1 && $c ne $dec2) {
                    113:                 $i++;
                    114:                 $value .= $c;
                    115:                 # Look for more digits.
                    116:                 for (;;) {
                    117:                     $c = $i < length($text) ? substr($text, $i, 1) : '';
                    118:                     if ($c lt '0' || $c gt '9') {
                    119:                         last;
                    120:                     }
                    121:                     $i++;
                    122:                     $value .= $c;
                    123:                 }
                    124:             }
                    125:             
                    126:             # Look for a decimal fraction part.
                    127:             if ($c eq $dec1 || $c eq $dec2) {
                    128:                 $i++;
                    129:                 $value .= $c;
                    130:                 for (;;) {
                    131:                     $c = $i < length($text) ? substr($text, $i, 1) : '';
                    132:                     if ($c lt '0' || $c gt '9') {
                    133:                         last;
                    134:                     }
                    135:                     $i++;
                    136:                     $value .= $c;
                    137:                 }
                    138:             }
                    139:             
                    140:             # Look for an exponent part.
                    141:             if ($c eq 'e' || $c eq 'E') {
                    142:                 $i++;
                    143:                 $value .= $c;
                    144:                 $c = $i < length($text) ? substr($text, $i, 1) : '';
                    145:                 if ($c eq '-' || $c eq '+') {
                    146:                     $i++;
                    147:                     $value .= $c;
                    148:                     $c = $i < length($text) ? substr($text, $i, 1) : '';
                    149:                 }
                    150:                 if ($c lt '0' || $c gt '9') {
                    151:                     # syntax error in number exponent
                    152:                     die ParseException->new("Syntax error in number exponent.", $from, $i);
                    153:                 }
                    154:                 do {
                    155:                     $i++;
                    156:                     $value .= $c;
                    157:                     $c = $i < length($text) ? substr($text, $i, 1) : '';
                    158:                 } while ($c ge '0' && $c le '9');
                    159:             }
                    160:             
                    161:             # Convert the string value to a number. If it is finite, then it is a good token.
                    162:             my $n = eval "\$value =~ tr/".$dec1.$dec2."/../";
                    163:             if (!($n == 9**9**9 || $n == -9**9**9 || ! defined( $n <=> 9**9**9 ))) {
                    164:                 push(@tokens, Token->new(Token->NUMBER, $from, $i - 1, $value));
                    165:                 next;
                    166:             } else {
                    167:                 # syntax error in number
                    168:                 die ParseException->new("Syntax error in number.", $from, $i);
                    169:             }
                    170:         }
                    171:         
                    172:         # check for operators before names (they could be confused with
                    173:         # variables if they don't use special characters)
                    174:         for (my $iop = 0; $iop < scalar(@operators); $iop++) {
                    175:             my $op = $operators[$iop];
                    176:             my $opid = $op->id;
                    177:             if (substr($text, $i, length($opid)) eq $opid) {
                    178:                 $i += length($op->id);
                    179:                 $c = $i < length($text) ? substr($text, $i, 1) : '';
                    180:                 push(@tokens, Token->new(Token->OPERATOR, $from, $i - 1, $op->id, $op));
                    181:                 next main;
                    182:             }
                    183:         }
                    184:         
                    185:         # names
                    186:         if (($c ge 'a' && $c le 'z') || ($c ge 'A' && $c le 'Z') ||
1.2       damieng   187:                 ($c ge 'α' && $c le 'ω') || ($c ge 'Α' && $c le 'Ω') || $c eq 'µ' || $c eq '°') {
1.1       damieng   188:             $value = $c;
                    189:             $i++;
                    190:             for (;;) {
                    191:                 $c = $i < length($text) ? substr($text, $i, 1) : '';
                    192:                 if (($c ge 'a' && $c le 'z') || ($c ge 'A' && $c le 'Z') ||
                    193:                         ($c ge 'α' && $c le 'ω') || ($c ge 'Α' && $c le 'Ω') || $c eq 'µ' ||
                    194:                         ($c ge '0' && $c le '9') || $c eq '_') {
                    195:                     $value .= $c;
                    196:                     $i++;
                    197:                 } else {
                    198:                     last;
                    199:                 }
                    200:             }
                    201:             # "i" is turned into a NUMBER token
                    202:             if ($value eq "i") {
                    203:                 push(@tokens, Token->new(Token->NUMBER, $from, $i - 1, $value));
                    204:                 next;
                    205:             }
                    206:             push(@tokens, Token->new(Token->NAME, $from, $i - 1, $value));
                    207:             next;
                    208:         }
                    209:         
                    210:         # unrecognized operator
                    211:         die ParseException->new("Unrecognized operator.", $from, $i);
                    212:     }
                    213:     return @tokens;
                    214: }
                    215: 
                    216: 1;
                    217: __END__
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>