Return to Tokenizer.pm CVS log | Up to [LON-CAPA] / loncom / homework / math_parser |
1.1 damieng 1: # The LearningOnline Network with CAPA - LON-CAPA
2: # String tokenizer
3: #
4: # Copyright (C) 2014 Michigan State University Board of Trustees
5: #
6: # This program is free software: you can redistribute it and/or modify
7: # it under the terms of the GNU General Public License as published by
8: # the Free Software Foundation, either version 3 of the License, or
9: # (at your option) any later version.
10: #
11: # This program is distributed in the hope that it will be useful,
12: # but WITHOUT ANY WARRANTY; without even the implied warranty of
13: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14: # GNU General Public License for more details.
15: #
16: # You should have received a copy of the GNU General Public License
17: # along with this program. If not, see <http://www.gnu.org/licenses/>.
18: #
19:
20: ##
21: # String tokenizer. Recognizes only names, numbers, and parser operators.
22: ##
23: package Apache::math_parser::Tokenizer;
24:
25: use strict;
26: use warnings;
27: use utf8;
28:
29: use aliased 'Apache::math_parser::Definitions';
30: use aliased 'Apache::math_parser::ParseException';
31: use aliased 'Apache::math_parser::Token';
32:
##
# Builds a tokenizer bound to a set of operator definitions and an input string.
# @constructor
# @param {Definitions} defs - Operator definitions
# @param {string} text - The text to tokenize
##
sub new {
    my ($class, $defs, $text) = @_;
    # Both values are stored as-is; nothing is validated until tokenize() runs.
    return bless {
        _defs => $defs,
        _text => $text,
    }, $class;
}
47:
48: # Attribute helpers
49:
##
# Accessor for the operator definitions given to the constructor.
# @returns {Definitions}
##
sub defs {
    my ($self) = @_;
    return $self->{_defs};
}
58:
##
# Accessor for the text given to the constructor.
# @returns {string}
##
sub text {
    my ($self) = @_;
    return $self->{_text};
}
67:
68:
##
# Returns the character of a string at a given index.
# @param {string} text - The string to read from
# @param {number} pos - Zero-based character index
# @returns {string} A one-character string, or '' when pos is at or past the end
##
sub _char_at {
    my ($text, $pos) = @_;
    return $pos < length($text) ? substr($text, $pos, 1) : '';
}

##
# Tests whether a single character is an ASCII digit.
# @param {string} c - A one-character string (or '' at end of input)
# @returns {boolean}
##
sub _is_digit {
    my ($c) = @_;
    return ($c ge '0' && $c le '9');
}

##
# Tests whether a single character is a letter accepted inside a name
# (ASCII letters, the Greek ranges α-ω and Α-Ω, and µ).
# @param {string} c - A one-character string (or '' at end of input)
# @returns {boolean}
##
sub _is_name_letter {
    my ($c) = @_;
    return (($c ge 'a' && $c le 'z') || ($c ge 'A' && $c le 'Z') ||
            ($c ge 'α' && $c le 'ω') || ($c ge 'Α' && $c le 'Ω') || $c eq 'µ');
}

##
# Tokenizes the text.
#
# The text is scanned left to right. Whitespace (any character not above
# ' ') is skipped. At each position the tokenizer tries, in order:
#   1. a number (digits, optional decimal part, optional exponent),
#   2. one of the operators from the definitions, in their given order,
#   3. a name (a letter or '°' followed by letters, digits and '_').
# A name that is exactly "i" is emitted as a NUMBER token.
#
# Dies with a plain string if the text is undefined, and with a
# ParseException for a malformed exponent, a number that is not finite,
# or a character that starts none of the above.
# @returns {Token[]}
##
sub tokenize {
    my ($self) = @_;
    my @operators = @{$self->defs->operators};
    my $dec1 = Definitions->DECIMAL_SIGN_1;
    my $dec2 = Definitions->DECIMAL_SIGN_2;
    my $inf = 9**9**9;

    my $text = $self->text;
    if (!defined $text) {
        die "Math Tokenizer: undefined text";
    }
    # Work on characters, not bytes, so the Greek ranges compare correctly.
    if (!utf8::is_utf8($text)) {
        utf8::decode($text);
    }

    my $i = 0;
    my $c = _char_at($text, $i);
    my @tokens;

    MAIN:
    while ($c ne '') {
        my $from = $i;

        # ignore whitespace
        if ($c le ' ') {
            $i++;
            $c = _char_at($text, $i);
            next MAIN;
        }

        # check for numbers before operators
        # (numbers starting with . will not be confused with the . operator)
        if (_is_digit($c) ||
            (($c eq $dec1 || $c eq $dec2) && _is_digit(_char_at($text, $i + 1)))) {
            my $value = '';

            # Integer part (empty when the number starts with a decimal sign).
            while (_is_digit($c)) {
                $value .= $c;
                $i++;
                $c = _char_at($text, $i);
            }

            # Look for a decimal fraction part.
            if ($c eq $dec1 || $c eq $dec2) {
                $value .= $c;
                $i++;
                $c = _char_at($text, $i);
                while (_is_digit($c)) {
                    $value .= $c;
                    $i++;
                    $c = _char_at($text, $i);
                }
            }

            # Look for an exponent part.
            if ($c eq 'e' || $c eq 'E') {
                $value .= $c;
                $i++;
                $c = _char_at($text, $i);
                if ($c eq '-' || $c eq '+') {
                    $value .= $c;
                    $i++;
                    $c = _char_at($text, $i);
                }
                if (!_is_digit($c)) {
                    # syntax error in number exponent
                    die ParseException->new("Syntax error in number exponent.", $from, $i);
                }
                while (_is_digit($c)) {
                    $value .= $c;
                    $i++;
                    $c = _char_at($text, $i);
                }
            }

            # Normalize both decimal signs to '.'; the token keeps this form.
            # \Q...\E keeps this safe whatever characters the signs are.
            $value =~ s/\Q$dec1\E/./g;
            $value =~ s/\Q$dec2\E/./g;

            # Convert the string value to a number. If it is finite, then it
            # is a good token. (An exponent that overflows yields infinity.)
            my $n = do { no warnings 'numeric'; $value + 0 };
            if ($n != $n || $n == $inf || $n == -$inf) {
                # syntax error in number
                die ParseException->new("Syntax error in number.", $from, $i);
            }
            push(@tokens, Token->new(Token->NUMBER, $from, $i - 1, $value));
            next MAIN;
        }

        # check for operators before names (they could be confused with
        # variables if they don't use special characters)
        for my $op (@operators) {
            my $opid = $op->id;
            # An empty id would match everywhere without consuming input.
            next if !defined $opid || $opid eq '';
            if (substr($text, $i, length($opid)) eq $opid) {
                $i += length($opid);
                $c = _char_at($text, $i);
                push(@tokens, Token->new(Token->OPERATOR, $from, $i - 1, $opid, $op));
                next MAIN;
            }
        }

        # names ('°' may start a name but is not accepted inside one)
        if (_is_name_letter($c) || $c eq '°') {
            my $value = $c;
            $i++;
            $c = _char_at($text, $i);
            while (_is_name_letter($c) || _is_digit($c) || $c eq '_') {
                $value .= $c;
                $i++;
                $c = _char_at($text, $i);
            }
            # "i" is turned into a NUMBER token
            my $type = ($value eq "i") ? Token->NUMBER : Token->NAME;
            push(@tokens, Token->new($type, $from, $i - 1, $value));
            next MAIN;
        }

        # unrecognized operator
        die ParseException->new("Unrecognized operator.", $from, $i);
    }
    return @tokens;
}
213:
214: 1;
215: __END__