Lexer: use standard Python methods to handle Unicode

Cito · Cito · commit 2f135a3d71ef · 2021-12-26T21:37:25.000+01:00
Replicates graphql/graphql-js@cd35c99
diff --git a/src/graphql/language/lexer.py b/src/graphql/language/lexer.py
@@ -72,10 +72,12 @@ def print_code_point_at(self, location: int) -> str:
         if "\x20" <= char <= "\x7E":
             return "'\"'" if char == '"' else f"'{char}'"
         # Unicode code point
-        point = (
-            decode_surrogate_pair(ord(char), ord(body[location + 1]))
+        point = ord(
+            body[location : location + 2]
+            .encode("utf-16", "surrogatepass")
+            .decode("utf-16")
             if is_supplementary_code_point(body, location)
-            else ord(char)
+            else char
         )
         return f"U+{point:04X}"
 
@@ -351,7 +353,10 @@ def read_escaped_unicode_fixed_width(self, position: int) -> EscapeSequence:
                 trailing_code = read_16_bit_hex_code(body, position + 8)
                 if 0xDC00 <= trailing_code <= 0xDFFF:
                     return EscapeSequence(
-                        chr(decode_surrogate_pair(code, trailing_code)), 12
+                        (chr(code) + chr(trailing_code))
+                        .encode("utf-16", "surrogatepass")
+                        .decode("utf-16"),
+                        12,
                     )
 
         raise GraphQLSyntaxError(
@@ -546,11 +551,10 @@ def is_supplementary_code_point(body: str, location: int) -> bool:
     The GraphQL specification defines source text as a sequence of unicode scalar
     values (which Unicode defines to exclude surrogate code points).
     """
-    return (
-        "\ud800" <= body[location] <= "\udbff"
-        and "\udc00" <= body[location + 1] <= "\udfff"
-    )
-
-
-def decode_surrogate_pair(leading: int, trailing: int) -> int:
-    return 0x10000 + (((leading & 0x03FF) << 10) | (trailing & 0x03FF))
+    try:
+        return (
+            "\ud800" <= body[location] <= "\udbff"
+            and "\udc00" <= body[location + 1] <= "\udfff"
+        )
+    except IndexError:
+        return False
diff --git a/tests/language/test_lexer.py b/tests/language/test_lexer.py
@@ -539,6 +539,8 @@ def lex_reports_useful_unknown_character_error():
         assert_syntax_error("\uD83D\uDE00", "Unexpected character: U+1F600.", (1, 1))
         assert_syntax_error("\uD800\uDC00", "Unexpected character: U+10000.", (1, 1))
         assert_syntax_error("\uDBFF\uDFFF", "Unexpected character: U+10FFFF.", (1, 1))
+        assert_syntax_error("\uD800", "Invalid character: U+D800.", (1, 1))
+        assert_syntax_error("\uDBFF", "Invalid character: U+DBFF.", (1, 1))
         assert_syntax_error("\uDEAD", "Invalid character: U+DEAD.", (1, 1))
 
     # noinspection PyArgumentEqualDefault