Skip to content

Commit e98453d

Browse files
committed
Add strict lexer rules
1 parent a5605a1 commit e98453d

File tree

6 files changed

+147
-22
lines changed

6 files changed

+147
-22
lines changed

jsonpath/env.py

Lines changed: 28 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,20 @@
22

33
from __future__ import annotations
44

5-
import re
5+
try:
6+
import regex # noqa: F401
7+
8+
REGEX_AVAILABLE = True
9+
except ImportError:
10+
REGEX_AVAILABLE = False
11+
12+
try:
13+
import iregexp_check # noqa: F401
14+
15+
IREGEXP_AVAILABLE = True
16+
except ImportError:
17+
IREGEXP_AVAILABLE = False
18+
619
from decimal import Decimal
720
from operator import getitem
821
from typing import TYPE_CHECKING
@@ -90,6 +103,7 @@ class attributes `root_token`, `self_token` and `filter_context_token`.
90103
**New in version 0.10.0**
91104
strict: When `True`, follow RFC 9535 strictly.
92105
**New in version 2.0.0**
106+
93107
## Class attributes
94108
95109
Attributes:
@@ -160,10 +174,20 @@ def __init__(
160174
"""When `True`, follow RFC 9535 strictly.
161175
162176
This includes things like enforcing a leading root identifier and
163-
ensuring there's not leading or trailing whitespace when parsing a
177+
ensuring there's no leading or trailing whitespace when parsing a
164178
JSONPath query.
165179
"""
166180

181+
self.regex_available: bool = REGEX_AVAILABLE
182+
"""When `True`, the third party `regex` package is available."""
183+
184+
self.iregexp_available: bool = IREGEXP_AVAILABLE
185+
"""When `True`, the iregexp_check package is available.
186+
187+
iregexp_check will be used to validate regular expressions against RFC 9485,
188+
if available.
189+
"""
190+
167191
self.lexer: Lexer = self.lexer_class(env=self)
168192
"""The lexer bound to this environment."""
169193

@@ -589,7 +613,8 @@ def compare( # noqa: PLR0911
589613
return left in right
590614
if operator == "contains" and isinstance(left, (Mapping, Sequence)):
591615
return right in left
592-
if operator == "=~" and isinstance(right, re.Pattern) and isinstance(left, str):
616+
if operator == "=~" and hasattr(right, "fullmatch") and isinstance(left, str):
617+
# Right should be a regex.Pattern or an re.Pattern.
593618
return bool(right.fullmatch(left))
594619
return False
595620

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
from typing import List
2+
3+
4+
def map_re(pattern: str) -> str:
    """Rewrite I-Regexp dot semantics (RFC 9485) into *pattern*.

    Every unescaped `.` occurring outside a character class is replaced
    with a group matching any non-surrogate character except CR/LF, or a
    surrogate pair. Escaped characters and the contents of `[...]`
    classes pass through untouched.

    NOTE(review): the replacement uses `\\p{Cs}` / `\\P{Cs}` property
    escapes, which the stdlib `re` module does not support — the mapped
    pattern only compiles under the third-party `regex` package; confirm
    callers handle the resulting `re.error` when `regex` is unavailable.
    """
    out: List[str] = []
    in_class = False
    pending_escape = False

    for char in pattern:
        if pending_escape:
            # Previous char was a backslash: emit this char literally.
            out.append(char)
            pending_escape = False
        elif char == "\\":
            pending_escape = True
            out.append(char)
        elif char == "." and not in_class:
            out.append(r"(?:(?![\r\n])\P{Cs}|\p{Cs}\p{Cs})")
        elif char == "[":
            in_class = True
            out.append(char)
        elif char == "]":
            in_class = False
            out.append(char)
        else:
            out.append(char)

    return "".join(out)

jsonpath/function_extensions/match.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
from jsonpath.function_extensions import ExpressionType
99
from jsonpath.function_extensions import FilterFunction
1010

11+
from ._pattern import map_re
12+
1113

1214
class Match(FilterFunction):
1315
"""A type-aware implementation of the standard `match` function."""
@@ -18,7 +20,8 @@ class Match(FilterFunction):
1820
def __call__(self, string: str, pattern: str) -> bool:
    """Return `True` if _string_ matches _pattern_, or `False` otherwise."""
    # XXX: re.fullmatch caches compiled patterns internally, but `map_re` is not
    # cached.
    try:
        return re.fullmatch(map_re(pattern), string) is not None
    except (TypeError, re.error):
        # Non-string input or an unsupported/invalid pattern: no match.
        return False

jsonpath/function_extensions/search.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
from jsonpath.function_extensions import ExpressionType
99
from jsonpath.function_extensions import FilterFunction
1010

11+
from ._pattern import map_re
12+
1113

1214
class Search(FilterFunction):
1315
"""A type-aware implementation of the standard `search` function."""
@@ -18,7 +20,8 @@ class Search(FilterFunction):
1820
def __call__(self, string: str, pattern: str) -> bool:
    """Return `True` if _string_ contains _pattern_, or `False` otherwise."""
    # XXX: re.search caches compiled patterns internally, but `map_re` is not
    # cached.
    try:
        return re.search(map_re(pattern), string) is not None
    except (TypeError, re.error):
        # Non-string input or an unsupported/invalid pattern: no match.
        return False

jsonpath/lex.py

Lines changed: 57 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ def __init__(self, *, env: JSONPathEnvironment) -> None:
118118
# func(
119119
self.function_pattern = r"(?P<G_FUNC>[a-z][a-z_0-9]+)(?P<G_FUNC_PAREN>\()"
120120

121-
self.rules = self.compile_rules()
121+
self.rules = self.compile_strict_rules() if env.strict else self.compile_rules()
122122

123123
def compile_rules(self) -> Pattern[str]:
124124
"""Prepare regular expression rules."""
@@ -190,6 +190,62 @@ def compile_rules(self) -> Pattern[str]:
190190
re.DOTALL,
191191
)
192192

193+
def compile_strict_rules(self) -> Pattern[str]:
    """Compile the token rules used when the environment is in strict
    (RFC 9535) mode.

    Rule order is significant: the alternatives are tried left to right,
    so longer or more specific patterns must precede their prefixes
    (e.g. `!=` before `!`, `..` before `.`, reserved words before names).
    """
    # Fixed rules that precede the configurable root/self tokens.
    head = [
        (TOKEN_DOUBLE_QUOTE_STRING, self.double_quote_pattern),
        (TOKEN_SINGLE_QUOTE_STRING, self.single_quote_pattern),
        (TOKEN_DOT_KEY_PROPERTY, self.dot_key_pattern),
        (TOKEN_DOT_PROPERTY, self.dot_property_pattern),
        (TOKEN_FLOAT, r"-?\d+\.\d*(?:[eE][+-]?\d+)?"),
        (TOKEN_INT, r"-?\d+(?P<G_EXP>[eE][+\-]?\d+)?\b"),
        (TOKEN_DDOT, r"\.\."),
        (TOKEN_DOT, r"\."),
        (TOKEN_AND, r"&&"),
        (TOKEN_OR, r"\|\|"),
    ]

    # Environment-configurable tokens, longest first so that a
    # multi-character root/self token wins over any prefix of it.
    # Empty tokens are dropped.
    env_rules = [
        (token, re.escape(text))
        for token, text in sorted(
            [
                (TOKEN_ROOT, self.env.root_token),
                (TOKEN_SELF, self.env.self_token),
            ],
            key=lambda pair: len(pair[1]),
            reverse=True,
        )
        if text
    ]

    tail = [
        (TOKEN_WILD, r"\*"),
        (TOKEN_FILTER, r"\?"),
        (TOKEN_TRUE, r"true\b"),
        (TOKEN_FALSE, r"false\b"),
        (TOKEN_NULL, r"null\b"),
        (TOKEN_LBRACKET, r"\["),
        (TOKEN_RBRACKET, r"]"),
        (TOKEN_COMMA, r","),
        (TOKEN_COLON, r":"),
        (TOKEN_EQ, r"=="),
        (TOKEN_NE, r"!="),
        (TOKEN_LG, r"<>"),
        (TOKEN_LE, r"<="),
        (TOKEN_GE, r">="),
        (TOKEN_RE, r"=~"),
        (TOKEN_LT, r"<"),
        (TOKEN_GT, r">"),
        (TOKEN_NOT, r"!"),  # Must go after "!="
        (TOKEN_FUNCTION, self.function_pattern),
        (TOKEN_NAME, self.key_pattern),  # Must go after reserved words
        (TOKEN_LPAREN, r"\("),
        (TOKEN_RPAREN, r"\)"),
        (TOKEN_WHITESPACE, r"[ \n\t\r]+"),
        (TOKEN_ERROR, r"."),  # Catch-all: anything unmatched is an error
    ]

    alternation = "|".join(
        f"(?P<{token}>{rule})" for token, rule in head + env_rules + tail
    )
    return re.compile(alternation, re.DOTALL)
248+
193249
def tokenize(self, path: str) -> Iterator[Token]: # noqa PLR0912
194250
"""Generate a sequence of tokens from a JSONPath string."""
195251
_token = partial(Token, path=path)

tests/test_compliance.py

Lines changed: 21 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -37,17 +37,7 @@ class Case:
3737

3838

3939
SKIP = {
40-
"functions, match, dot matcher on \\u2028": "standard library re policy",
41-
"functions, match, dot matcher on \\u2029": "standard library re policy",
42-
"functions, search, dot matcher on \\u2028": "standard library re policy",
43-
"functions, search, dot matcher on \\u2029": "standard library re policy",
44-
"functions, match, filter, match function, unicode char class, uppercase": "\\p not supported", # noqa: E501
45-
"functions, match, filter, match function, unicode char class negated, uppercase": "\\P not supported", # noqa: E501
46-
"functions, search, filter, search function, unicode char class, uppercase": "\\p not supported", # noqa: E501
47-
"functions, search, filter, search function, unicode char class negated, uppercase": "\\P not supported", # noqa: E501
48-
"filter, equals number, decimal fraction, no fractional digit": "expected behavior policy", # noqa: E501
49-
"filter, equals number, decimal fraction, no int digit": "expected behavior policy",
50-
"filter, equals number, invalid no int digit": "expected behavior policy",
40+
# "filter, equals number, invalid no int digit": "expected behavior policy",
5141
"filter, equals number, invalid 00": "expected behavior policy",
5242
"filter, equals number, invalid leading 0": "expected behavior policy",
5343
"filter, equals number, invalid no fractional digit": "expected behavior policy",
@@ -63,9 +53,9 @@ class Case:
6353
"slice selector, step, minus space": "expected behavior policy",
6454
"slice selector, step, -0": "expected behavior policy",
6555
"slice selector, step, leading -0": "expected behavior policy",
66-
"filter, true, incorrectly capitalized": "flexible literal policy",
67-
"filter, false, incorrectly capitalized": "flexible literal policy",
68-
"filter, null, incorrectly capitalized": "flexible literal policy",
56+
# "filter, true, incorrectly capitalized": "flexible literal policy",
57+
# "filter, false, incorrectly capitalized": "flexible literal policy",
58+
# "filter, null, incorrectly capitalized": "flexible literal policy",
6959
"name selector, double quotes, single high surrogate": "expected behavior policy",
7060
"name selector, double quotes, single low surrogate": "expected behavior policy",
7161
"name selector, double quotes, high high surrogate": "expected behavior policy",
@@ -76,6 +66,17 @@ class Case:
7666
"name selector, double quotes, supplementary surrogate": "expected behavior policy",
7767
}
7868

69+
# CTS tests that will only pass if the third-party `regex` package is installed.
70+
REGEX_ONLY = {
71+
"functions, match, filter, match function, unicode char class, uppercase",
72+
"functions, match, filter, match function, unicode char class negated, uppercase",
73+
"functions, search, filter, search function, unicode char class, uppercase",
74+
"functions, search, filter, search function, unicode char class negated, uppercase",
75+
}
76+
77+
# TODO: Test compliance without strict mode. Assert expected failures.
78+
# TODO: Test runner in `no-regexp` env
79+
7980

8081
def cases() -> List[Case]:
8182
with open("tests/cts/cts.json", encoding="utf8") as fd:
@@ -98,6 +99,9 @@ def env() -> JSONPathEnvironment:
9899

99100
@pytest.mark.parametrize("case", valid_cases(), ids=operator.attrgetter("name"))
100101
def test_compliance(env: JSONPathEnvironment, case: Case) -> None:
102+
if not env.regex_available and case.name in REGEX_ONLY:
103+
pytest.skip(reason="requires regex package")
104+
101105
if case.name in SKIP:
102106
pytest.skip(reason=SKIP[case.name])
103107

@@ -116,6 +120,9 @@ def test_compliance(env: JSONPathEnvironment, case: Case) -> None:
116120

117121
@pytest.mark.parametrize("case", valid_cases(), ids=operator.attrgetter("name"))
118122
def test_compliance_async(env: JSONPathEnvironment, case: Case) -> None:
123+
if not env.regex_available and case.name in REGEX_ONLY:
124+
pytest.skip(reason="requires regex package")
125+
119126
if case.name in SKIP:
120127
pytest.skip(reason=SKIP[case.name])
121128

0 commit comments

Comments
 (0)