Skip to content

Commit 2f135a3

Browse files
committed
Lexer: use standard Python methods to handle Unicode
Replicates graphql/graphql-js@cd35c99
1 parent 4fb6cd2 commit 2f135a3

File tree

2 files changed

18 additions and 12 deletions

2 files changed

18 additions and 12 deletions

src/graphql/language/lexer.py

Lines changed: 16 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -72,10 +72,12 @@ def print_code_point_at(self, location: int) -> str:
7272
if "\x20" <= char <= "\x7E":
7373
return "'\"'" if char == '"' else f"'{char}'"
7474
# Unicode code point
75-
point = (
76-
decode_surrogate_pair(ord(char), ord(body[location + 1]))
75+
point = ord(
76+
body[location : location + 2]
77+
.encode("utf-16", "surrogatepass")
78+
.decode("utf-16")
7779
if is_supplementary_code_point(body, location)
78-
else ord(char)
80+
else char
7981
)
8082
return f"U+{point:04X}"
8183

@@ -351,7 +353,10 @@ def read_escaped_unicode_fixed_width(self, position: int) -> EscapeSequence:
351353
trailing_code = read_16_bit_hex_code(body, position + 8)
352354
if 0xDC00 <= trailing_code <= 0xDFFF:
353355
return EscapeSequence(
354-
chr(decode_surrogate_pair(code, trailing_code)), 12
356+
(chr(code) + chr(trailing_code))
357+
.encode("utf-16", "surrogatepass")
358+
.decode("utf-16"),
359+
12,
355360
)
356361

357362
raise GraphQLSyntaxError(
@@ -546,11 +551,10 @@ def is_supplementary_code_point(body: str, location: int) -> bool:
546551
The GraphQL specification defines source text as a sequence of unicode scalar
547552
values (which Unicode defines to exclude surrogate code points).
548553
"""
549-
return (
550-
"\ud800" <= body[location] <= "\udbff"
551-
and "\udc00" <= body[location + 1] <= "\udfff"
552-
)
553-
554-
555-
def decode_surrogate_pair(leading: int, trailing: int) -> int:
556-
return 0x10000 + (((leading & 0x03FF) << 10) | (trailing & 0x03FF))
554+
try:
555+
return (
556+
"\ud800" <= body[location] <= "\udbff"
557+
and "\udc00" <= body[location + 1] <= "\udfff"
558+
)
559+
except IndexError:
560+
return False

tests/language/test_lexer.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -539,6 +539,8 @@ def lex_reports_useful_unknown_character_error():
539539
assert_syntax_error("\uD83D\uDE00", "Unexpected character: U+1F600.", (1, 1))
540540
assert_syntax_error("\uD800\uDC00", "Unexpected character: U+10000.", (1, 1))
541541
assert_syntax_error("\uDBFF\uDFFF", "Unexpected character: U+10FFFF.", (1, 1))
542+
assert_syntax_error("\uD800", "Invalid character: U+D800.", (1, 1))
543+
assert_syntax_error("\uDBFF", "Invalid character: U+DBFF.", (1, 1))
542544
assert_syntax_error("\uDEAD", "Invalid character: U+DEAD.", (1, 1))
543545

544546
# noinspection PyArgumentEqualDefault

0 commit comments

Comments
 (0)