Annotation of loncom/homework/math_parser/Tokenizer.pm, revision 1.1
1.1 ! damieng 1: # The LearningOnline Network with CAPA - LON-CAPA
! 2: # String tokenizer
! 3: #
! 4: # Copyright (C) 2014 Michigan State University Board of Trustees
! 5: #
! 6: # This program is free software: you can redistribute it and/or modify
! 7: # it under the terms of the GNU General Public License as published by
! 8: # the Free Software Foundation, either version 3 of the License, or
! 9: # (at your option) any later version.
! 10: #
! 11: # This program is distributed in the hope that it will be useful,
! 12: # but WITHOUT ANY WARRANTY; without even the implied warranty of
! 13: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
! 14: # GNU General Public License for more details.
! 15: #
! 16: # You should have received a copy of the GNU General Public License
! 17: # along with this program. If not, see <http://www.gnu.org/licenses/>.
! 18: #
! 19:
! 20: ##
! 21: # String tokenizer. Recognizes only names, numbers, and parser operators.
! 22: ##
! 23: package Apache::math_parser::Tokenizer;
! 24:
! 25: use strict;
! 26: use warnings;
! 27: use utf8;
! 28:
! 29: use aliased 'Apache::math_parser::Definitions';
! 30: use aliased 'Apache::math_parser::ParseException';
! 31: use aliased 'Apache::math_parser::Token';
! 32:
##
# @constructor
# Creates a tokenizer holding the operator definitions and the input text.
# Nothing is parsed here; the two arguments are only stored on the object.
# @param {Definitions} defs - Operator definitions
# @param {string} text - The text to tokenize
##
sub new {
    my ($class, $defs, $text) = @_;

    # Missing arguments are stored as undef, so both keys always exist.
    my %fields = (
        _defs => $defs,
        _text => $text,
    );
    return bless \%fields, $class;
}
! 47:
! 48: # Attribute helpers
! 49:
##
# Accessor for the operator definitions given to the constructor.
# @returns {Definitions}
##
sub defs {
    my ($tokenizer) = @_;
    return $tokenizer->{_defs};
}
! 58:
##
# Accessor for the text given to the constructor (the string to tokenize).
# @returns {string}
##
sub text {
    my ($tokenizer) = @_;
    return $tokenizer->{_text};
}
! 67:
! 68:
##
# Tokenizes the text.
#
# The text is scanned left to right. At each position, in this order:
#  - characters comparing less than or equal to a space are skipped;
#  - a number is read (digits, optional decimal part introduced by either
#    decimal sign, optional exponent);
#  - the operators returned by the definitions are tried in their given
#    order, the first whose id matches at the current position wins;
#  - a name is read (Latin or Greek letter or the micro sign, followed by
#    letters, digits or underscores). The name "i" yields a NUMBER token.
#
# In NUMBER tokens both decimal signs are normalized to "." in the token
# value. A numeric literal whose value is not finite (for instance an
# exponent too large to represent) is rejected.
#
# Dies with a plain string if the text is undefined.
# Can throw a ParseException (bad exponent, non-finite number, or a
# character that starts neither a number, an operator nor a name).
# @returns {Token[]}
##
sub tokenize {
    my( $self ) = @_;
    my @operators = @{$self->defs->operators};
    my $dec1 = Definitions->DECIMAL_SIGN_1;
    my $dec2 = Definitions->DECIMAL_SIGN_2;

    my $text = $self->text;
    if (!defined $text) {
        die "Math Tokenizer: undefined text";
    }
    if (!utf8::is_utf8($text)) {
        utf8::decode($text);
    }
    my $length = length($text);

    # Character at a position, or '' once past the end of the text.
    my $char_at = sub {
        my ($pos) = @_;
        return $pos < $length ? substr($text, $pos, 1) : '';
    };
    # True for a single ASCII digit; false for '' (end of text).
    my $is_digit = sub {
        my ($ch) = @_;
        return ($ch ge '0' && $ch le '9');
    };
    # True for a character allowed to start a name.
    my $is_letter = sub {
        my ($ch) = @_;
        return (($ch ge 'a' && $ch le 'z') || ($ch ge 'A' && $ch le 'Z') ||
                ($ch ge 'α' && $ch le 'ω') || ($ch ge 'Α' && $ch le 'Ω') ||
                $ch eq 'µ');
    };

    my @tokens = ();
    my $i = 0;
    my $c = $char_at->($i);

    TOKEN:
    while ($c ne '') {
        my $from = $i;

        # ignore whitespace
        if ($c le ' ') {
            $i++;
            $c = $char_at->($i);
            next TOKEN;
        }

        # check for numbers before operators
        # (numbers starting with . will not be confused with the . operator)
        my $starts_with_sign = ($c eq $dec1 || $c eq $dec2);
        if ($is_digit->($c) ||
                ($starts_with_sign && $is_digit->($char_at->($i + 1)))) {
            my $value = '';

            # Integer part (absent when the number starts with a decimal sign).
            if (!$starts_with_sign) {
                while ($is_digit->($c)) {
                    $value .= $c;
                    $i++;
                    $c = $char_at->($i);
                }
            }

            # Look for a decimal fraction part.
            if ($c eq $dec1 || $c eq $dec2) {
                $value .= $c;
                $i++;
                $c = $char_at->($i);
                while ($is_digit->($c)) {
                    $value .= $c;
                    $i++;
                    $c = $char_at->($i);
                }
            }

            # Look for an exponent part.
            if ($c eq 'e' || $c eq 'E') {
                $value .= $c;
                $i++;
                $c = $char_at->($i);
                if ($c eq '-' || $c eq '+') {
                    $value .= $c;
                    $i++;
                    $c = $char_at->($i);
                }
                if (!$is_digit->($c)) {
                    # syntax error in number exponent
                    die ParseException->new("Syntax error in number exponent.", $from, $i);
                }
                while ($is_digit->($c)) {
                    $value .= $c;
                    $i++;
                    $c = $char_at->($i);
                }
            }

            # Normalize both decimal signs to "." without evaluating code:
            # the signs are quoted so any character is safe in the class.
            $value =~ s/[\Q$dec1$dec2\E]/./g;

            # Convert the string value to a number. If it is finite, then
            # it is a good token. The numeric value itself is tested here;
            # the result of tr/// is a character count and would always
            # look finite.
            my $n = do { no warnings 'numeric'; $value + 0 };
            my $inf = 9**9**9;
            if ($n == $inf || $n == -$inf || $n != $n) {
                # syntax error in number
                die ParseException->new("Syntax error in number.", $from, $i);
            }
            push(@tokens, Token->new(Token->NUMBER, $from, $i - 1, $value));
            next TOKEN;
        }

        # check for operators before names (they could be confused with
        # variables if they don't use special characters)
        for my $op (@operators) {
            my $opid = $op->id;
            if (substr($text, $i, length($opid)) eq $opid) {
                $i += length($opid);
                $c = $char_at->($i);
                push(@tokens, Token->new(Token->OPERATOR, $from, $i - 1, $opid, $op));
                next TOKEN;
            }
        }

        # names
        if ($is_letter->($c)) {
            my $value = '';
            while ($is_letter->($c) || $is_digit->($c) || $c eq '_') {
                $value .= $c;
                $i++;
                $c = $char_at->($i);
            }
            # "i" is turned into a NUMBER token
            my $type = ($value eq "i") ? Token->NUMBER : Token->NAME;
            push(@tokens, Token->new($type, $from, $i - 1, $value));
            next TOKEN;
        }

        # unrecognized operator
        die ParseException->new("Unrecognized operator.", $from, $i);
    }
    return @tokens;
}
! 213:
! 214: 1;
! 215: __END__
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>