Annotation of loncom/homework/math_parser/Tokenizer.pm, revision 1.3
1.1 damieng 1: # The LearningOnline Network with CAPA - LON-CAPA
2: # String tokenizer
3: #
1.3 ! raeburn 4: # $Id: Tokenizer.pm,v 1.3 2023/03/13 18:30:00 raeburn Exp $
! 5: #
1.1 damieng 6: # Copyright (C) 2014 Michigan State University Board of Trustees
7: #
8: # This program is free software: you can redistribute it and/or modify
9: # it under the terms of the GNU General Public License as published by
10: # the Free Software Foundation, either version 3 of the License, or
11: # (at your option) any later version.
12: #
13: # This program is distributed in the hope that it will be useful,
14: # but WITHOUT ANY WARRANTY; without even the implied warranty of
15: # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16: # GNU General Public License for more details.
17: #
18: # You should have received a copy of the GNU General Public License
19: # along with this program. If not, see <http://www.gnu.org/licenses/>.
20: #
21:
22: ##
23: # String tokenizer. Recognizes only names, numbers, and parser operators.
24: ##
25: package Apache::math_parser::Tokenizer;
26:
27: use strict;
28: use warnings;
29: use utf8;
30:
31: use aliased 'Apache::math_parser::Definitions';
32: use aliased 'Apache::math_parser::ParseException';
33: use aliased 'Apache::math_parser::Token';
34:
35: ##
36: # @constructor
37: # @param {Definitions} defs - Operator definitions
38: # @param {string} text - The text to tokenize
39: ##
40: sub new {
41: my $class = shift;
42: my $self = {
43: _defs => shift,
44: _text => shift,
45: };
46: bless $self, $class;
47: return $self;
48: }
49:
50: # Attribute helpers
51:
52: ##
53: # Operator definitions
54: # @returns {Definitions}
55: ##
56: sub defs {
57: my $self = shift;
58: return $self->{_defs};
59: }
60:
61: ##
62: # The text to tokenize
63: # @returns {string}
64: ##
65: sub text {
66: my $self = shift;
67: return $self->{_text};
68: }
69:
70:
71: ##
72: # Tokenizes the text.
73: # Can throw a ParseException.
74: # @returns {Token[]}
75: ##
76: sub tokenize {
77: my( $self ) = @_;
78: my( $text, $c, $i, $from, @tokens, $value );
79: my @operators = @{$self->defs->operators};
80: my $dec1 = Definitions->DECIMAL_SIGN_1;
81: my $dec2 = Definitions->DECIMAL_SIGN_2;
82:
83: $text = $self->text;
84: if (!defined $text) {
85: die "Math Tokenizer: undefined text";
86: }
87: if (!utf8::is_utf8($text)) {
88: utf8::decode($text);
89: }
90: $i = 0;
91: $c = $i < length($text) ? substr($text, $i, 1) : '';
92: @tokens = ();
93:
94: main:
95: while ($c ne '') {
96: $from = $i;
97:
98: # ignore whitespace
99: if ($c le ' ') {
100: $i++;
101: $c = $i < length($text) ? substr($text, $i, 1) : '';
102: next;
103: }
104:
105: # check for numbers before operators
106: # (numbers starting with . will not be confused with the . operator)
107: if (($c ge '0' && $c le '9') ||
108: (($c eq $dec1 || $c eq $dec2) &&
109: (substr($text, $i+1, 1) ge '0' && substr($text, $i+1, 1) le '9'))) {
110: $value = '';
111:
112: if ($c ne $dec1 && $c ne $dec2) {
113: $i++;
114: $value .= $c;
115: # Look for more digits.
116: for (;;) {
117: $c = $i < length($text) ? substr($text, $i, 1) : '';
118: if ($c lt '0' || $c gt '9') {
119: last;
120: }
121: $i++;
122: $value .= $c;
123: }
124: }
125:
126: # Look for a decimal fraction part.
127: if ($c eq $dec1 || $c eq $dec2) {
128: $i++;
129: $value .= $c;
130: for (;;) {
131: $c = $i < length($text) ? substr($text, $i, 1) : '';
132: if ($c lt '0' || $c gt '9') {
133: last;
134: }
135: $i++;
136: $value .= $c;
137: }
138: }
139:
140: # Look for an exponent part.
141: if ($c eq 'e' || $c eq 'E') {
142: $i++;
143: $value .= $c;
144: $c = $i < length($text) ? substr($text, $i, 1) : '';
145: if ($c eq '-' || $c eq '+') {
146: $i++;
147: $value .= $c;
148: $c = $i < length($text) ? substr($text, $i, 1) : '';
149: }
150: if ($c lt '0' || $c gt '9') {
151: # syntax error in number exponent
152: die ParseException->new("Syntax error in number exponent.", $from, $i);
153: }
154: do {
155: $i++;
156: $value .= $c;
157: $c = $i < length($text) ? substr($text, $i, 1) : '';
158: } while ($c ge '0' && $c le '9');
159: }
160:
161: # Convert the string value to a number. If it is finite, then it is a good token.
162: my $n = eval "\$value =~ tr/".$dec1.$dec2."/../";
163: if (!($n == 9**9**9 || $n == -9**9**9 || ! defined( $n <=> 9**9**9 ))) {
164: push(@tokens, Token->new(Token->NUMBER, $from, $i - 1, $value));
165: next;
166: } else {
167: # syntax error in number
168: die ParseException->new("Syntax error in number.", $from, $i);
169: }
170: }
171:
172: # check for operators before names (they could be confused with
173: # variables if they don't use special characters)
174: for (my $iop = 0; $iop < scalar(@operators); $iop++) {
175: my $op = $operators[$iop];
176: my $opid = $op->id;
177: if (substr($text, $i, length($opid)) eq $opid) {
178: $i += length($op->id);
179: $c = $i < length($text) ? substr($text, $i, 1) : '';
180: push(@tokens, Token->new(Token->OPERATOR, $from, $i - 1, $op->id, $op));
181: next main;
182: }
183: }
184:
185: # names
186: if (($c ge 'a' && $c le 'z') || ($c ge 'A' && $c le 'Z') ||
1.2 damieng 187: ($c ge 'α' && $c le 'ω') || ($c ge 'Α' && $c le 'Ω') || $c eq 'µ' || $c eq '°') {
1.1 damieng 188: $value = $c;
189: $i++;
190: for (;;) {
191: $c = $i < length($text) ? substr($text, $i, 1) : '';
192: if (($c ge 'a' && $c le 'z') || ($c ge 'A' && $c le 'Z') ||
193: ($c ge 'α' && $c le 'ω') || ($c ge 'Α' && $c le 'Ω') || $c eq 'µ' ||
194: ($c ge '0' && $c le '9') || $c eq '_') {
195: $value .= $c;
196: $i++;
197: } else {
198: last;
199: }
200: }
201: # "i" is turned into a NUMBER token
202: if ($value eq "i") {
203: push(@tokens, Token->new(Token->NUMBER, $from, $i - 1, $value));
204: next;
205: }
206: push(@tokens, Token->new(Token->NAME, $from, $i - 1, $value));
207: next;
208: }
209:
210: # unrecognized operator
211: die ParseException->new("Unrecognized operator.", $from, $i);
212: }
213: return @tokens;
214: }
215:
216: 1;
217: __END__
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>