@@ -14,11 +14,11 @@ def __init__(self, kind, start, end, value=None):
1414 self .value = value
1515
def __repr__(self):
    """Return a debugging representation of this token.

    The text contains the description of ``self.kind`` (as produced by
    ``get_token_kind_desc``), the ``start`` and ``end`` attributes, and
    ``repr()`` of ``self.value``. Using ``repr()`` means a missing value
    is rendered as ``None`` and string values appear quoted and escaped.
    """
    return u'<Token kind={} at {}..{} value={}>'.format(
        get_token_kind_desc(self.kind),
        self.start,
        self.end,
        repr(self.value)
    )
2323
2424 def __eq__ (self , other ):
@@ -65,7 +65,7 @@ class TokenKind(object):
6565
6666def get_token_desc (token ):
6767 if token .value :
68- return '{} "{}"' .format (
68+ return u '{} "{}"' .format (
6969 get_token_kind_desc (token .kind ),
7070 token .value
7171 )
@@ -103,7 +103,8 @@ def get_token_kind_desc(kind):
def char_code_at(s, pos):
    """Return the character code of ``s[pos]``, or ``None`` if out of range.

    ``None`` is returned whenever ``pos`` is not within
    ``0 <= pos < len(s)``, which lets callers distinguish "no character
    here" from a genuine NUL character (code 0).
    """
    if 0 <= pos < len(s):
        return ord(s[pos])

    return None
107108
108109
109110PUNCT_CODE_TO_KIND = {
@@ -122,6 +123,10 @@ def char_code_at(s, pos):
122123}
123124
124125
def print_char_code(code):
    """Format a character code for inclusion in an error message.

    Returns the literal text ``EOF`` when ``code`` is ``None``; otherwise
    returns the JSON-quoted form (``json.dumps``) of the character that
    has that code.
    """
    if code is None:
        return 'EOF'

    return json.dumps(unichr(code))
128+
129+
125130def read_token (source , from_position ):
126131 """Gets the next token from the source starting at the given position.
127132
@@ -132,30 +137,52 @@ def read_token(source, from_position):
132137 body_length = len (body )
133138
134139 position = position_after_whitespace (body , from_position )
135- code = char_code_at (body , position )
136140
137141 if position >= body_length :
138142 return Token (TokenKind .EOF , position , position )
139143
144+ code = char_code_at (body , position )
145+
146+ if code < 0x0020 and code not in (0x0009 , 0x000A , 0x000D ):
147+ raise LanguageError (
148+ source , position ,
149+ u'Invalid character {}.' .format (print_char_code (code ))
150+ )
151+
140152 kind = PUNCT_CODE_TO_KIND .get (code )
141153 if kind is not None :
142154 return Token (kind , position , position + 1 )
143155
144156 if code == 46 : # .
145- if char_code_at (body , position + 1 ) == 46 and \
146- char_code_at (body , position + 2 ) == 46 :
157+ if char_code_at (body , position + 1 ) == char_code_at (body , position + 2 ) == 46 :
147158 return Token (TokenKind .SPREAD , position , position + 3 )
159+
148160 elif 65 <= code <= 90 or code == 95 or 97 <= code <= 122 :
149161 # A-Z, _, a-z
150162 return read_name (source , position )
163+
151164 elif code == 45 or 48 <= code <= 57 : # -, 0-9
152165 return read_number (source , position , code )
166+
153167 elif code == 34 : # "
154168 return read_string (source , position )
155169
156170 raise LanguageError (
157171 source , position ,
158- u'Unexpected character {}' .format (json .dumps (body [position ])))
172+ u'Unexpected character {}.' .format (print_char_code (code )))
173+
# Character codes that are skipped over when scanning for the next token.
ignored_whitespace_characters = frozenset([
    # BOM
    0xFEFF,
    # White Space
    0x0009,  # tab
    0x0020,  # space
    # Line Terminator
    0x000A,  # new line
    0x000D,  # carriage return
    # Comma
    0x002C,
])
159186
160187
161188def position_after_whitespace (body , start_position ):
@@ -166,20 +193,16 @@ def position_after_whitespace(body, start_position):
166193 position = start_position
167194 while position < body_length :
168195 code = char_code_at (body , position )
169- if code in (
170- 32 , # space
171- 44 , # comma
172- 160 , # '\xa0'
173- 0x2028 , # line separator
174- 0x2029 , # paragraph separator
175- ) or (code > 8 and code < 14 ): # whitespace
196+ if code in ignored_whitespace_characters :
176197 position += 1
198+
177199 elif code == 35 : # #, skip comments
178200 position += 1
179201 while position < body_length :
180202 code = char_code_at (body , position )
181- if not code or code in (10 , 13 , 0x2028 , 0x2029 ):
203+ if not ( code is not None and ( code > 0x001F or code == 0x0009 ) and code not in (0x000A , 0x000D ) ):
182204 break
205+
183206 position += 1
184207 else :
185208 break
@@ -191,7 +214,7 @@ def read_number(source, start, first_code):
191214 or an int depending on whether a decimal point appears.
192215
193216 Int: -?(0|[1-9][0-9]*)
194- Float: -?(0|[1-9][0-9]*)\.[0-9]+(e- ?[0-9]+)?"""
217+ Float: -?(0|[1-9][0-9]*)(\.[0-9]+)?((E|e)(+|-)?[0-9]+)?"""
195218 code = first_code
196219 body = source .body
197220 position = start
@@ -204,43 +227,34 @@ def read_number(source, start, first_code):
204227 if code == 48 : # 0
205228 position += 1
206229 code = char_code_at (body , position )
207- elif 49 <= code <= 57 : # 1 - 9
208- position += 1
209- code = char_code_at (body , position )
210- while 48 <= code <= 57 : # 0 - 9
211- position += 1
212- code = char_code_at (body , position )
230+
231+ if code is not None and 48 <= code <= 57 :
232+ raise LanguageError (
233+ source ,
234+ position ,
235+ u'Invalid number, unexpected digit after 0: {}.' .format (print_char_code (code ))
236+ )
213237 else :
214- raise LanguageError (source , position , 'Invalid number' )
238+ position = read_digits (source , position , code )
239+ code = char_code_at (body , position )
215240
216241 if code == 46 : # .
217242 is_float = True
218243
219244 position += 1
220245 code = char_code_at (body , position )
221- if 48 <= code <= 57 : # 0 - 9
222- position += 1
223- code = char_code_at (body , position )
224- while 48 <= code <= 57 : # 0 - 9
225- position += 1
226- code = char_code_at (body , position )
227- else :
228- raise LanguageError (source , position , 'Invalid number' )
246+ position = read_digits (source , position , code )
247+ code = char_code_at (body , position )
229248
230- if code == 101 : # e
249+ if code in (69 , 101 ): # E e
250+ is_float = True
251+ position += 1
252+ code = char_code_at (body , position )
253+ if code in (43 , 45 ): # + -
231254 position += 1
232255 code = char_code_at (body , position )
233- if code == 45 : # -
234- position += 1
235- code = char_code_at (body , position )
236- if 48 <= code <= 57 : # 0 - 9
237- position += 1
238- code = char_code_at (body , position )
239- while 48 <= code <= 57 : # 0 - 9
240- position += 1
241- code = char_code_at (body , position )
242- else :
243- raise LanguageError (source , position , 'Invalid number' )
256+
257+ position = read_digits (source , position , code )
244258
245259 return Token (
246260 TokenKind .FLOAT if is_float else TokenKind .INT ,
@@ -250,6 +264,28 @@ def read_number(source, start, first_code):
250264 )
251265
252266
def read_digits(source, start, first_code):
    """Consume a run of one or more ASCII digits from ``source.body``.

    Args:
        source: Object whose ``body`` attribute holds the text being lexed.
        start: Position in the body at which the digit run should begin.
        first_code: Character code the caller has already read for
            ``start`` (``None`` when there is no character there).

    Returns:
        The position of the first character after the run of digits.

    Raises:
        LanguageError: If ``first_code`` is ``None`` or is not a digit
            (codes 48-57); the error is reported at ``start``.
    """
    body = source.body
    position = start
    code = first_code

    # At least one digit is required; anything else is a malformed number.
    if code is None or not 48 <= code <= 57:
        raise LanguageError(
            source,
            position,
            u'Invalid number, expected digit but got: {}.'.format(print_char_code(code))
        )

    # Step past the current digit, then keep going while digits follow.
    while code is not None and 48 <= code <= 57:  # 0 - 9
        position += 1
        code = char_code_at(body, position)

    return position
287+
288+
253289ESCAPED_CHAR_CODES = {
254290 34 : '"' ,
255291 47 : '/' ,
@@ -268,47 +304,73 @@ def read_string(source, start):
268304 "([^"\\ \u000A \u000D \u2028 \u2029 ]|(\\ (u[0-9a-fA-F]{4}|["\\ /bfnrt])))*"
269305 """
270306 body = source .body
307+ body_length = len (body )
308+
271309 position = start + 1
272310 chunk_start = position
273- code = None
274- value = u''
311+ code = 0
312+ value = []
313+ append = value .append
275314
276- while position < len ( body ) :
315+ while position < body_length :
277316 code = char_code_at (body , position )
278- if not code or code in (34 , 10 , 13 , 0x2028 , 0x2029 ):
317+ if not (
318+ code is not None and
319+ code not in (
320+ # LineTerminator
321+ 0x000A , 0x000D ,
322+ # Quote
323+ 34
324+ )
325+ ):
279326 break
327+
328+ if code < 0x0020 and code != 0x0009 :
329+ raise LanguageError (
330+ source ,
331+ position ,
332+ u'Invalid character within String: {}.' .format (print_char_code (code ))
333+ )
334+
280335 position += 1
281336 if code == 92 : # \
282- value += body [chunk_start :position - 1 ]
337+ append (body [chunk_start :position - 1 ])
338+
283339 code = char_code_at (body , position )
284340 escaped = ESCAPED_CHAR_CODES .get (code )
285341 if escaped is not None :
286- value += escaped
287- elif code == 117 :
342+ append (escaped )
343+
344+ elif code == 117 : # u
288345 char_code = uni_char_code (
289346 char_code_at (body , position + 1 ) or 0 ,
290347 char_code_at (body , position + 2 ) or 0 ,
291348 char_code_at (body , position + 3 ) or 0 ,
292349 char_code_at (body , position + 4 ) or 0 ,
293350 )
351+
294352 if char_code < 0 :
295353 raise LanguageError (
296354 source , position ,
297- 'Bad character escape sequence' )
298- value += unichr (char_code )
355+ u'Invalid character escape sequence: \\ u{}.' .format (body [position + 1 : position + 5 ])
356+ )
357+
358+ append (unichr (char_code ))
299359 position += 4
300360 else :
301361 raise LanguageError (
302362 source , position ,
303- 'Bad character escape sequence' )
363+ u'Invalid character escape sequence: \\ {}.' .format (unichr (code ))
364+ )
365+
304366 position += 1
305367 chunk_start = position
306368
307- if code != 34 :
369+ if code != 34 : # Quote (")
308370 raise LanguageError (source , position , 'Unterminated string' )
309371
310- value += body [chunk_start :position ]
311- return Token (TokenKind .STRING , start , position + 1 , value )
372+ append ( body [chunk_start :position ])
373+ return Token (TokenKind .STRING , start , position + 1 , u'' . join ( value ) )
312374
313375
314376def uni_char_code (a , b , c , d ):
@@ -348,15 +410,17 @@ def read_name(source, position):
348410 body = source .body
349411 body_length = len (body )
350412 end = position + 1
351- code = None
413+
352414 while end != body_length :
353415 code = char_code_at (body , end )
354- if not code or not (
416+ if not ( code is not None and (
355417 code == 95 or # _
356418 48 <= code <= 57 or # 0-9
357419 65 <= code <= 90 or # A-Z
358420 97 <= code <= 122 # a-z
359- ):
421+ )) :
360422 break
423+
361424 end += 1
425+
362426 return Token (TokenKind .NAME , position , end , body [position :end ])
0 commit comments