@@ -67,12 +67,16 @@ def print_code_point_at(self, location: int) -> str:
6767 if location >= len (body ):
6868 return TokenKind .EOF .value
6969 char = body [location ]
70- code = ord (char )
7170 # Printable ASCII
72- if 0x20 <= code <= 0x7E :
71+ if " \x20 " <= char <= " \x7E " :
7372 return "'\" '" if char == '"' else f"'{ char } '"
7473 # Unicode code point
75- return f"U+{ code :04X} "
74+ point = (
75+ decode_surrogate_pair (ord (char ), ord (body [location + 1 ]))
76+ if is_supplementary_code_point (body , location )
77+ else ord (char )
78+ )
79+ return f"U+{ point :04X} "
7680
7781 def create_token (
7882 self , kind : TokenKind , start : int , end : int , value : Optional [str ] = None
@@ -141,7 +145,8 @@ def read_next_token(self, start: int) -> Token:
141145 if char == "'"
142146 else (
143147 f"Unexpected character: { self .print_code_point_at (position )} ."
144- if is_source_character (char )
148+ if is_unicode_scalar_value (char )
149+ or is_supplementary_code_point (body , position )
145150 else f"Invalid character: { self .print_code_point_at (position )} ."
146151 )
147152 )
@@ -158,10 +163,14 @@ def read_comment(self, start: int) -> Token:
158163 position = start + 1
159164 while position < body_length :
160165 char = body [position ]
161-
162- if char in "\r \n " or not is_source_character (char ):
166+ if char in "\r \n " :
163167 break
164- position += 1
168+ if is_unicode_scalar_value (char ):
169+ position += 1
170+ elif is_supplementary_code_point (body , position ):
171+ position += 2
172+ else :
173+ break # pragma: no cover
165174
166175 return self .create_token (
167176 TokenKind .COMMENT ,
@@ -270,7 +279,11 @@ def read_string(self, start: int) -> Token:
270279 if char == "\\ " :
271280 append (body [chunk_start :position ])
272281 escape = (
273- self .read_escaped_unicode (position )
282+ (
283+ self .read_escaped_unicode_variable_width (position )
284+ if body [position + 2 : position + 3 ] == "{"
285+ else self .read_escaped_unicode_fixed_width (position )
286+ )
274287 if body [position + 1 : position + 2 ] == "u"
275288 else self .read_escaped_character (position )
276289 )
@@ -282,8 +295,10 @@ def read_string(self, start: int) -> Token:
282295 if char in "\r \n " :
283296 break
284297
285- if is_source_character (char ):
298+ if is_unicode_scalar_value (char ):
286299 position += 1
300+ elif is_supplementary_code_point (body , position ):
301+ position += 2
287302 else :
288303 raise GraphQLSyntaxError (
289304 self .source ,
@@ -294,11 +309,50 @@ def read_string(self, start: int) -> Token:
294309
295310 raise GraphQLSyntaxError (self .source , position , "Unterminated string." )
296311
297- def read_escaped_unicode (self , position : int ) -> EscapeSequence :
def read_escaped_unicode_variable_width(self, position: int) -> EscapeSequence:
    """Read a braced, variable-width Unicode escape inside a string.

    The escape starts at ``position`` (the backslash) and consists of the
    three-character opener (backslash, ``u``, ``{``), one or more hex digits
    and a closing ``}``.

    Args:
        position: Index in the source body where the escape sequence begins.

    Returns:
        An ``EscapeSequence`` built from the decoded character and the number
        of source characters the escape occupies.

    Raises:
        GraphQLSyntaxError: If the sequence is not closed within 12
            characters, contains no digits, contains a character for which
            ``read_hex_digit`` yields a negative value, or encodes a value
            that is not a Unicode scalar value.
    """
    source = self.source
    text = source.body
    # The longest accepted form is 12 characters long: \u{00000000}.
    limit = min(12, len(text) - position)
    code_point = 0
    # Number of characters consumed so far; the opener \u{ accounts for 3.
    consumed = 3

    for offset in range(3, limit):
        current = text[position + offset]
        consumed = offset + 1
        if current == "}":
            is_scalar_value = (
                0 <= code_point <= 0xD7FF or 0xE000 <= code_point <= 0x10FFFF
            )
            # The shortest valid form is 5 characters long: \u{0}.
            if consumed >= 5 and is_scalar_value:
                return EscapeSequence(chr(code_point), consumed)
            break
        # Shift in the next hex digit; a negative digit value (presumably a
        # non-hex character) turns the whole accumulator negative.
        code_point = (code_point << 4) | read_hex_digit(current)
        if code_point < 0:
            break

    snippet = text[position : position + consumed]
    raise GraphQLSyntaxError(
        source,
        position,
        f"Invalid Unicode escape sequence: '{snippet}'.",
    )
338+
339+ def read_escaped_unicode_fixed_width (self , position : int ) -> EscapeSequence :
298340 body = self .source .body
299341 code = read_16_bit_hex_code (body , position + 2 )
300- if code >= 0 :
342+
343+ if 0 <= code <= 0xD7FF or 0xE000 <= code <= 0x10FFFF :
301344 return EscapeSequence (chr (code ), 6 )
345+
346+ # GraphQL allows JSON-style surrogate pair escape sequences, but only when
347+ # a valid pair is formed.
348+ if 0xD800 <= code <= 0xDBFF :
349+ if body [position + 6 : position + 8 ] == "\\ u" :
350+ trailing_code = read_16_bit_hex_code (body , position + 8 )
351+ if 0xDC00 <= trailing_code <= 0xDFFF :
352+ return EscapeSequence (
353+ chr (decode_surrogate_pair (code , trailing_code )), 12
354+ )
355+
302356 raise GraphQLSyntaxError (
303357 self .source ,
304358 position ,
@@ -351,8 +405,10 @@ def read_block_string(self, start: int) -> Token:
351405 self .line_start = position
352406 continue
353407
354- if is_source_character (char ):
408+ if is_unicode_scalar_value (char ):
355409 position += 1
410+ elif is_supplementary_code_point (body , position ):
411+ position += 2
356412 else :
357413 raise GraphQLSyntaxError (
358414 self .source ,
@@ -477,9 +533,31 @@ def read_hex_digit(char: str) -> int:
477533 return - 1
478534
479535
480- def is_source_character (char : str ) -> bool :
481- """Check whether this is a SourceCharacter"""
482- return char >= " " or char in "\t \r \n "
def is_unicode_scalar_value(char: str) -> bool:
    """Tell whether ``char`` is a Unicode scalar value.

    Scalar values are all Unicode code points other than the surrogate code
    points, i.e. the inclusive ranges U+0000 to U+D7FF and U+E000 to U+10FFFF.
    """
    # Range below the surrogate block.
    if "\x00" <= char <= "\ud7ff":
        return True
    # Range above the surrogate block.
    return "\ue000" <= char <= "\U0010ffff"
544+
545+
def is_supplementary_code_point(body: str, location: int) -> bool:
    """Check whether a surrogate pair starts at the given location.

    Returns True only when ``body[location]`` is a leading (high) surrogate in
    the range U+D800 to U+DBFF and ``body[location + 1]`` is a trailing (low)
    surrogate in the range U+DC00 to U+DFFF, so that the two together encode
    a supplementary code point.

    A leading surrogate at the very end of ``body`` has no partner, and a
    ``location`` past the end has no character at all; both cases yield False
    instead of raising ``IndexError``.
    """
    try:
        leading, trailing = body[location], body[location + 1]
    except IndexError:
        # Nothing at `location`, or nothing following it: no pair possible.
        return False
    return "\ud800" <= leading <= "\udbff" and "\udc00" <= trailing <= "\udfff"
557+
558+
def decode_surrogate_pair(leading: int, trailing: int) -> int:
    """Combine two surrogate code units into a single code point.

    The low ten bits of ``leading`` become the high ten bits of the offset,
    the low ten bits of ``trailing`` become its low ten bits, and the offset
    is added to 0x10000.
    """
    high_bits = (leading & 0x03FF) << 10
    low_bits = trailing & 0x03FF
    return 0x10000 + (high_bits | low_bits)
483561
484562
485563def is_name_start (char : str ) -> bool :
0 commit comments