Skip to content

Commit e98453d

Browse files
committed
Add strict lexer rules
1 parent a5605a1 commit e98453d

File tree

6 files changed

+147
-22
lines changed

6 files changed

+147
-22
lines changed

jsonpath/env.py

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,20 @@
22

33
from __future__ import annotations
44

5-
import re
5+
try:
6+
import regex # noqa: F401
7+
8+
REGEX_AVAILABLE = True
9+
except ImportError:
10+
REGEX_AVAILABLE = False
11+
12+
try:
13+
import iregexp_check # noqa: F401
14+
15+
IREGEXP_AVAILABLE = True
16+
except ImportError:
17+
IREGEXP_AVAILABLE = False
18+
619
from decimal import Decimal
720
from operator import getitem
821
from typing import TYPE_CHECKING
@@ -90,6 +103,7 @@ class attributes `root_token`, `self_token` and `filter_context_token`.
90103
**New in version 0.10.0**
91104
strict: When `True`, follow RFC 9535 strictly.
92105
**New in version 2.0.0**
106+
93107
## Class attributes
94108
95109
Attributes:
@@ -160,10 +174,20 @@ def __init__(
160174
"""When `True`, follow RFC 9535 strictly.
161175
162176
This includes things like enforcing a leading root identifier and
163-
ensuring there's not leading or trailing whitespace when parsing a
177+
ensuring there's no leading or trailing whitespace when parsing a
164178
JSONPath query.
165179
"""
166180

181+
self.regex_available: bool = REGEX_AVAILABLE
182+
"""When `True`, the third party `regex` package is available."""
183+
184+
self.iregexp_available: bool = IREGEXP_AVAILABLE
185+
"""When `True`, the iregexp_check package is available.
186+
187+
iregexp_check will be used to validate regular expressions against RFC 9485,
188+
if available.
189+
"""
190+
167191
self.lexer: Lexer = self.lexer_class(env=self)
168192
"""The lexer bound to this environment."""
169193

@@ -589,7 +613,8 @@ def compare( # noqa: PLR0911
589613
return left in right
590614
if operator == "contains" and isinstance(left, (Mapping, Sequence)):
591615
return right in left
592-
if operator == "=~" and isinstance(right, re.Pattern) and isinstance(left, str):
616+
if operator == "=~" and hasattr(right, "fullmatch") and isinstance(left, str):
617+
# Right should be a regex.Pattern or an re.Pattern.
593618
return bool(right.fullmatch(left))
594619
return False
595620

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
from typing import List
2+
3+
4+
def map_re(pattern: str) -> str:
    """Rewrite I-Regexp dot semantics (RFC 9485) into *pattern*.

    Every unescaped `.` occurring outside a character class is replaced
    with a group matching any non-surrogate character except CR/LF, or a
    surrogate pair. Escaped characters and the contents of `[...]`
    classes pass through untouched.

    NOTE(review): the replacement uses `\\p{Cs}` / `\\P{Cs}` property
    escapes, which the stdlib `re` module does not support — the mapped
    pattern only compiles under the third-party `regex` package; confirm
    callers handle the resulting `re.error` when `regex` is unavailable.
    """
    out: List[str] = []
    in_class = False
    pending_escape = False

    for char in pattern:
        if pending_escape:
            # Previous char was a backslash: emit this char literally.
            out.append(char)
            pending_escape = False
        elif char == "\\":
            pending_escape = True
            out.append(char)
        elif char == "." and not in_class:
            out.append(r"(?:(?![\r\n])\P{Cs}|\p{Cs}\p{Cs})")
        elif char == "[":
            in_class = True
            out.append(char)
        elif char == "]":
            in_class = False
            out.append(char)
        else:
            out.append(char)

    return "".join(out)

jsonpath/function_extensions/match.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
from jsonpath.function_extensions import ExpressionType
99
from jsonpath.function_extensions import FilterFunction
1010

11+
from ._pattern import map_re
12+
1113

1214
class Match(FilterFunction):
1315
"""A type-aware implementation of the standard `match` function."""
@@ -18,7 +20,8 @@ class Match(FilterFunction):
1820
def __call__(self, string: str, pattern: str) -> bool:
    """Return `True` if _string_ matches _pattern_, or `False` otherwise."""
    # XXX: re.fullmatch caches compiled patterns internally, but `map_re` is not
    # cached.
    try:
        return re.fullmatch(map_re(pattern), string) is not None
    except (TypeError, re.error):
        # Non-string input or an unsupported/invalid pattern: no match.
        return False

jsonpath/function_extensions/search.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
from jsonpath.function_extensions import ExpressionType
99
from jsonpath.function_extensions import FilterFunction
1010

11+
from ._pattern import map_re
12+
1113

1214
class Search(FilterFunction):
1315
"""A type-aware implementation of the standard `search` function."""
@@ -18,7 +20,8 @@ class Search(FilterFunction):
1820
def __call__(self, string: str, pattern: str) -> bool:
    """Return `True` if _string_ contains _pattern_, or `False` otherwise."""
    # XXX: re.search caches compiled patterns internally, but `map_re` is not
    # cached.
    try:
        return re.search(map_re(pattern), string) is not None
    except (TypeError, re.error):
        # Non-string input or an unsupported/invalid pattern: no match.
        return False

jsonpath/lex.py

Lines changed: 57 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ def __init__(self, *, env: JSONPathEnvironment) -> None:
118118
# func(
119119
self.function_pattern = r"(?P<G_FUNC>[a-z][a-z_0-9]+)(?P<G_FUNC_PAREN>\()"
120120

121-
self.rules = self.compile_rules()
121+
self.rules = self.compile_strict_rules() if env.strict else self.compile_rules()
122122

123123
def compile_rules(self) -> Pattern[str]:
124124
"""Prepare regular expression rules."""
@@ -190,6 +190,62 @@ def compile_rules(self) -> Pattern[str]:
190190
re.DOTALL,
191191
)
192192

193+
def compile_strict_rules(self) -> Pattern[str]:
    """Compile the token rules used when the environment is in strict
    (RFC 9535) mode.

    Rule order is significant: the alternatives are tried left to right,
    so longer or more specific patterns must precede their prefixes
    (e.g. `!=` before `!`, `..` before `.`, reserved words before names).
    """
    # Fixed rules that precede the configurable root/self tokens.
    head = [
        (TOKEN_DOUBLE_QUOTE_STRING, self.double_quote_pattern),
        (TOKEN_SINGLE_QUOTE_STRING, self.single_quote_pattern),
        (TOKEN_DOT_KEY_PROPERTY, self.dot_key_pattern),
        (TOKEN_DOT_PROPERTY, self.dot_property_pattern),
        (TOKEN_FLOAT, r"-?\d+\.\d*(?:[eE][+-]?\d+)?"),
        (TOKEN_INT, r"-?\d+(?P<G_EXP>[eE][+\-]?\d+)?\b"),
        (TOKEN_DDOT, r"\.\."),
        (TOKEN_DOT, r"\."),
        (TOKEN_AND, r"&&"),
        (TOKEN_OR, r"\|\|"),
    ]

    # Environment-configurable tokens, longest first so that a
    # multi-character root/self token wins over any prefix of it.
    # Empty tokens are dropped.
    env_rules = [
        (token, re.escape(text))
        for token, text in sorted(
            [
                (TOKEN_ROOT, self.env.root_token),
                (TOKEN_SELF, self.env.self_token),
            ],
            key=lambda pair: len(pair[1]),
            reverse=True,
        )
        if text
    ]

    tail = [
        (TOKEN_WILD, r"\*"),
        (TOKEN_FILTER, r"\?"),
        (TOKEN_TRUE, r"true\b"),
        (TOKEN_FALSE, r"false\b"),
        (TOKEN_NULL, r"null\b"),
        (TOKEN_LBRACKET, r"\["),
        (TOKEN_RBRACKET, r"]"),
        (TOKEN_COMMA, r","),
        (TOKEN_COLON, r":"),
        (TOKEN_EQ, r"=="),
        (TOKEN_NE, r"!="),
        (TOKEN_LG, r"<>"),
        (TOKEN_LE, r"<="),
        (TOKEN_GE, r">="),
        (TOKEN_RE, r"=~"),
        (TOKEN_LT, r"<"),
        (TOKEN_GT, r">"),
        (TOKEN_NOT, r"!"),  # Must go after "!="
        (TOKEN_FUNCTION, self.function_pattern),
        (TOKEN_NAME, self.key_pattern),  # Must go after reserved words
        (TOKEN_LPAREN, r"\("),
        (TOKEN_RPAREN, r"\)"),
        (TOKEN_WHITESPACE, r"[ \n\t\r]+"),
        (TOKEN_ERROR, r"."),  # Catch-all: anything unmatched is an error
    ]

    alternation = "|".join(
        f"(?P<{token}>{rule})" for token, rule in head + env_rules + tail
    )
    return re.compile(alternation, re.DOTALL)
248+
193249
def tokenize(self, path: str) -> Iterator[Token]: # noqa PLR0912
194250
"""Generate a sequence of tokens from a JSONPath string."""
195251
_token = partial(Token, path=path)

tests/test_compliance.py

Lines changed: 21 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -37,17 +37,7 @@ class Case:
3737

3838

3939
SKIP = {
40-
"functions, match, dot matcher on \\u2028": "standard library re policy",
41-
"functions, match, dot matcher on \\u2029": "standard library re policy",
42-
"functions, search, dot matcher on \\u2028": "standard library re policy",
43-
"functions, search, dot matcher on \\u2029": "standard library re policy",
44-
"functions, match, filter, match function, unicode char class, uppercase": "\\p not supported", # noqa: E501
45-
"functions, match, filter, match function, unicode char class negated, uppercase": "\\P not supported", # noqa: E501
46-
"functions, search, filter, search function, unicode char class, uppercase": "\\p not supported", # noqa: E501
47-
"functions, search, filter, search function, unicode char class negated, uppercase": "\\P not supported", # noqa: E501
48-
"filter, equals number, decimal fraction, no fractional digit": "expected behavior policy", # noqa: E501
49-
"filter, equals number, decimal fraction, no int digit": "expected behavior policy",
50-
"filter, equals number, invalid no int digit": "expected behavior policy",
40+
# "filter, equals number, invalid no int digit": "expected behavior policy",
5141
"filter, equals number, invalid 00": "expected behavior policy",
5242
"filter, equals number, invalid leading 0": "expected behavior policy",
5343
"filter, equals number, invalid no fractional digit": "expected behavior policy",
@@ -63,9 +53,9 @@ class Case:
6353
"slice selector, step, minus space": "expected behavior policy",
6454
"slice selector, step, -0": "expected behavior policy",
6555
"slice selector, step, leading -0": "expected behavior policy",
66-
"filter, true, incorrectly capitalized": "flexible literal policy",
67-
"filter, false, incorrectly capitalized": "flexible literal policy",
68-
"filter, null, incorrectly capitalized": "flexible literal policy",
56+
# "filter, true, incorrectly capitalized": "flexible literal policy",
57+
# "filter, false, incorrectly capitalized": "flexible literal policy",
58+
# "filter, null, incorrectly capitalized": "flexible literal policy",
6959
"name selector, double quotes, single high surrogate": "expected behavior policy",
7060
"name selector, double quotes, single low surrogate": "expected behavior policy",
7161
"name selector, double quotes, high high surrogate": "expected behavior policy",
@@ -76,6 +66,17 @@ class Case:
7666
"name selector, double quotes, supplementary surrogate": "expected behavior policy",
7767
}
7868

69+
# CTS tests that will only pass if the third-party `regex` package is installed.
70+
REGEX_ONLY = {
71+
"functions, match, filter, match function, unicode char class, uppercase",
72+
"functions, match, filter, match function, unicode char class negated, uppercase",
73+
"functions, search, filter, search function, unicode char class, uppercase",
74+
"functions, search, filter, search function, unicode char class negated, uppercase",
75+
}
76+
77+
# TODO: Test compliance without strict mode. Assert expected failures.
78+
# TODO: Test runner in `no-regexp` env
79+
7980

8081
def cases() -> List[Case]:
8182
with open("tests/cts/cts.json", encoding="utf8") as fd:
@@ -98,6 +99,9 @@ def env() -> JSONPathEnvironment:
9899

99100
@pytest.mark.parametrize("case", valid_cases(), ids=operator.attrgetter("name"))
100101
def test_compliance(env: JSONPathEnvironment, case: Case) -> None:
102+
if not env.regex_available and case.name in REGEX_ONLY:
103+
pytest.skip(reason="requires regex package")
104+
101105
if case.name in SKIP:
102106
pytest.skip(reason=SKIP[case.name])
103107

@@ -116,6 +120,9 @@ def test_compliance(env: JSONPathEnvironment, case: Case) -> None:
116120

117121
@pytest.mark.parametrize("case", valid_cases(), ids=operator.attrgetter("name"))
118122
def test_compliance_async(env: JSONPathEnvironment, case: Case) -> None:
123+
if not env.regex_available and case.name in REGEX_ONLY:
124+
pytest.skip(reason="requires regex package")
125+
119126
if case.name in SKIP:
120127
pytest.skip(reason=SKIP[case.name])
121128

0 commit comments

Comments
 (0)