Skip to content

Commit 790ee7e

Browse files
committed
Refactor regex error handling and parser structure; support Python 3.9 and above
1 parent 031c6f5 commit 790ee7e

File tree

7 files changed

+354
-17
lines changed

7 files changed

+354
-17
lines changed

.github/workflows/python-package.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ jobs:
1515
strategy:
1616
fail-fast: false
1717
matrix:
18-
python-version: ["3.10", "3.11", "3.12"]
18+
python-version: ["3.9", "3.10", "3.11", "3.12"]
1919

2020
steps:
2121
- uses: actions/checkout@v4

regex_enumerator/regex_parser.py renamed to regex_enumerator/parser/regex_parser.py

Lines changed: 2 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,5 @@
1-
from .regex_tree import Alternative, BackReference, CharClass, RegexTree
2-
3-
4-
class RegexError(Exception):
    """Error raised when a regular expression cannot be parsed.

    Attributes:
        regex: The full regular expression that was being parsed.
        index: Position in ``regex`` the parser had reached when it failed.
        message: Human-readable description of the problem.
    """

    def __init__(self, regex: str, index: int, message: str):
        # Forward the arguments to Exception so that ``args`` is populated;
        # copy/pickle rebuild an exception by calling ``type(*args)``, which
        # fails for a three-argument constructor when ``args`` is empty.
        super().__init__(regex, index, message)
        self.regex = regex
        self.index = index
        self.message = message

    def __str__(self):
        """Return the regex, a caret line pointing at ``index``, and the message."""
        caret_line = ' ' * self.index + '^'
        return f"\n{self.regex}\n{caret_line}\n{self.message}"
1+
from ..regex_tree import Alternative, BackReference, CharClass, RegexTree
2+
from ..regex_error import RegexError
133

144

155
class RegexParser:
Lines changed: 334 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,334 @@
1+
from ..regex_tree import Alternative, BackReference, CharClass, RegexTree
2+
from ..regex_error import RegexError
3+
4+
5+
class RegexParser:
    """Recursive-descent parser that turns a regex string into a ``RegexTree``.

    The parser keeps a cursor (``self.index``) into ``self.regex`` and
    advances it as tokens are consumed.  Every syntax problem is reported by
    raising ``RegexError`` through ``_raise_error``.

    Signature annotations that mention project types or ``X | Y`` unions are
    written as strings so that defining the class never evaluates them
    (``X | Y`` between plain types raises ``TypeError`` before Python 3.10).
    """

    WORDS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_'
    HEX = '0123456789abcdefABCDEF'
    DIGITS = '0123456789'
    SPACES = ' \t\n\r\f\v'

    def __init__(self, regex: str, charset: str, precompute: bool):
        """Store the parsing inputs.

        Args:
            regex: The regular expression to parse.
            charset: Characters used to expand ``.``, negated classes and the
                negated escapes (``\\D``, ``\\W``, ``\\S``).
            precompute: Flag forwarded unchanged to every node that is built.
        """
        self.regex = regex
        self.charset = charset
        self.precompute = precompute
        # Defined up front so the cursor always exists, even before parse().
        self.index = 0

    def parse(self) -> "RegexTree":
        """Parse ``self.regex`` from the beginning and return its tree."""
        self.index = 0
        return self._parseRegex(False)

    def _parseRegex(self, to_close: bool) -> "RegexTree":
        """Parse a sequence of alternatives starting at the cursor.

        Args:
            to_close: True when parsing the inside of a group, i.e. a closing
                ``)`` is expected before the end of the input.

        Returns:
            A ``RegexTree`` holding the parsed alternatives and, for a group,
            the quantifier that followed its closing parenthesis.

        Raises:
            RegexError: On malformed groups, unbalanced parentheses or
                back references to groups that are not known.
        """
        alternatives: list[Alternative] = []
        elements: list[CharClass | RegexTree | BackReference] = []
        named_groups: dict[str, RegexTree] = {}
        ordered_groups: list[RegexTree] = []
        min_len_group, max_len_group = 1, 1

        while self.index < len(self.regex):
            char = self.regex[self.index]
            self.index += 1
            if char == '(':
                if self.index < len(self.regex) and self.regex[self.index] == '?':
                    self.index += 1
                    if self.index >= len(self.regex):
                        self._raise_error("Invalid group")
                    elif self.regex[self.index] == '<':
                        # Named capturing group: (?<name>...)
                        self.index += 1
                        name = ''
                        while self.index < len(self.regex) and self.regex[self.index] != '>':
                            name += self.regex[self.index]
                            self.index += 1
                        if self.index >= len(self.regex) or self.regex[self.index] != '>' or name == '':
                            self._raise_error("Invalid named group")
                        self.index += 1
                        if name in named_groups:
                            self._raise_error("Duplicate named group")
                        subTree = self._parseRegex(True)
                        named_groups[name] = subTree
                        ordered_groups.append(subTree)
                    elif self.regex[self.index] == ':':
                        # Non-capturing group: (?:...) is not referenceable.
                        self.index += 1
                        subTree = self._parseRegex(True)
                    else:
                        self._raise_error("Invalid group")
                else:
                    # Plain capturing group.
                    subTree = self._parseRegex(True)
                    ordered_groups.append(subTree)
                elements.append(subTree)
            elif char == ')':
                if not to_close:
                    self._raise_error("Unmatched closing parenthesis")
                # The quantifier after ')' applies to the whole group.
                min_len_group, max_len_group = self._parseQuantifier()
                to_close = False
                break
            elif char == '|':
                alternatives.append(Alternative(elements))
                elements = []
                # Groups seen so far are not referenceable from the next
                # alternative.
                named_groups = {}
                ordered_groups = []
            elif char == '[':
                chars = self._parseCharClass()
                min_len, max_len = self._parseQuantifier()
                elements.append(
                    CharClass(chars, min_len, max_len, self.precompute))
            elif char == '.':
                chars = list(self.charset)
                min_len, max_len = self._parseQuantifier()
                elements.append(
                    CharClass(chars, min_len, max_len, self.precompute))
            elif char == '\\':
                reference = self._parseBackReferenceLookahead()
                if reference is None:
                    # Not a back reference: an ordinary escape sequence.
                    chars = self._parseEscapeChar()
                    min_len, max_len = self._parseQuantifier()
                    elements.append(
                        CharClass([chars], min_len, max_len, self.precompute))
                    continue
                if isinstance(reference, str):
                    if reference not in named_groups:
                        self._raise_error("Named back reference not found")
                    group = named_groups[reference]
                else:
                    if reference < 1 or reference > len(ordered_groups):
                        self._raise_error(
                            "Positional back reference not found")
                    group = ordered_groups[reference - 1]
                min_len, max_len = self._parseQuantifier()
                reference = BackReference(
                    group, min_len, max_len, self.precompute)
                group.add_reference(reference)
                elements.append(reference)
            else:
                # Literal character.
                min_len, max_len = self._parseQuantifier()
                elements.append(
                    CharClass([char], min_len, max_len, self.precompute))

        if to_close:
            self._raise_error("Unmatched opening parenthesis")

        alternatives.append(Alternative(elements))
        return RegexTree(alternatives, min_len_group, max_len_group, self.precompute)

    def _parseBackReferenceLookahead(self) -> "str | int | None":
        """Try to read a back reference right after a backslash.

        Returns:
            The group name for ``\\k<name>``, the group number for a run of
            ASCII digits, or ``None`` (cursor untouched) when the escape is
            not a back reference.

        Raises:
            RegexError: If the input ends after the backslash or the named
                reference is malformed.
        """
        if len(self.regex) <= self.index:
            self._raise_error("Incomplete escape sequence")

        char = self.regex[self.index]

        if char == 'k':
            self.index += 1
            name = ''
            if len(self.regex) <= self.index or self.regex[self.index] != '<':
                self._raise_error("Invalid named back reference")
            self.index += 1
            while self.index < len(self.regex) and self.regex[self.index] != '>':
                name += self.regex[self.index]
                self.index += 1
            if len(self.regex) <= self.index or self.regex[self.index] != '>' or name == '':
                self._raise_error("Invalid named back reference")
            self.index += 1
            return name
        elif char in self.DIGITS:
            # ASCII digits only: str.isdigit() also accepts characters such
            # as superscript two, which int() cannot convert.
            num = int(char)
            self.index += 1
            while self.index < len(self.regex) and self.regex[self.index] in self.DIGITS:
                num = num * 10 + int(self.regex[self.index])
                self.index += 1
            return num
        return None

    def _parseEscapeChar(self) -> str:
        """Consume the character(s) after a backslash and return their expansion.

        Returns:
            A string with every character the escape stands for: several
            characters for class escapes (``\\d``, ``\\w``, ``\\s`` and their
            negations), exactly one for control, hex, unicode and literal
            escapes.

        Raises:
            RegexError: On truncated or invalid escapes, and for ``\\p`` /
                ``\\P`` which are not supported.
        """

        if len(self.regex) <= self.index:
            self._raise_error("Incomplete escape sequence")

        char = self.regex[self.index]
        self.index += 1

        if char == 'd':
            return self.DIGITS
        elif char == 'D':
            # Complement of DIGITS, mirroring how \W and \S are computed.
            return ''.join([c for c in self.charset if c not in self.DIGITS])
        elif char == 'w':
            return self.WORDS
        elif char == 'W':
            return ''.join([c for c in self.charset if c not in self.WORDS])
        elif char == 's':
            return self.SPACES
        elif char == 'S':
            return ''.join([c for c in self.charset if c not in self.SPACES])
        elif char == 't':
            return '\t'
        elif char == 'r':
            return '\r'
        elif char == 'n':
            return '\n'
        elif char == 'v':
            return '\v'
        elif char == 'f':
            return '\f'
        elif char == 'x':
            # One or two hex digits are accepted.
            if len(self.regex) < self.index + 1 or self.regex[self.index] not in self.HEX:
                self._raise_error('Invalid ASCII escape character')
            if len(self.regex) < self.index + 2 or self.regex[self.index + 1] not in self.HEX:
                num = int(self.regex[self.index], 16)
                self.index += 1
            else:
                num = int(self.regex[self.index: self.index + 2], 16)
                self.index += 2
            # Only code points 32..126 are accepted.
            if num < 32 or num > 126:
                self._raise_error(f"Invalid ASCII escape character {num}")
            return chr(num)
        elif char == 'u':
            # Exactly four hex digits.
            code = []
            for _ in range(4):
                if len(self.regex) <= self.index or self.regex[self.index] not in self.HEX:
                    self._raise_error("Invalid unicode escape character")
                code.append(self.regex[self.index])
                self.index += 1
            num = int(''.join(code), 16)
            return chr(num)
        elif char == 'p' or char == 'P':
            self._raise_error("Unicode property not supported")
        else:
            return char

    def _parseCharClass(self) -> "list[str]":
        """Parse the body of a ``[...]`` class; the cursor is just after ``[``.

        Returns:
            A list of strings whose characters together form the class.  For
            a negated class the list holds the characters of ``self.charset``
            that are not in the class.

        Raises:
            RegexError: If the class is not closed or a range is reversed.
        """
        chars_list: list[str] = []
        # Last single character seen that has not been emitted yet; it may
        # still turn out to be the start of a range.
        first_char = None
        range_divider = False
        negated = False

        if len(self.regex) <= self.index:
            self._raise_error("Unclosed character class")

        if self.regex[self.index] == '^':
            negated = True
            self.index += 1

        len_regex = len(self.regex)

        while self.index < len_regex and self.regex[self.index] != ']':
            char = self.regex[self.index]
            self.index += 1

            if char == '-' and first_char is not None and not range_divider:
                range_divider = True
                continue
            if char == '\\':
                escape_char = self._parseEscapeChar()
                if len(escape_char) != 1 or escape_char == '-':
                    # A class escape (or an escaped '-') cannot be a range
                    # bound: emit it, flush whatever was pending as literals
                    # and reset the range state so later characters are not
                    # paired with an already emitted bound.
                    chars_list.append(escape_char)
                    if range_divider:
                        chars_list.append('-')
                    if first_char is not None:
                        chars_list.append(first_char)
                    first_char = None
                    range_divider = False
                    continue
                char = escape_char

            if first_char is None:
                first_char = char
            elif range_divider:
                if ord(char) < ord(first_char):
                    self._raise_error("Invalid character range")
                chars_list.extend(
                    chr(c) for c in range(ord(first_char), ord(char) + 1))
                first_char = None
                range_divider = False
            else:
                chars_list.append(first_char)
                first_char = char

        if len(self.regex) <= self.index or self.regex[self.index] != ']':
            self._raise_error("Unclosed character class")

        self.index += 1

        # A trailing '-' (as in "[a-]") is a literal dash.
        if range_divider:
            chars_list.append('-')
        if first_char is not None:
            chars_list.append(first_char)

        if negated:
            # Build the lookup once instead of once per charset character.
            excluded = set(''.join(chars_list))
            chars_list = [c for c in self.charset if c not in excluded]

        return chars_list

    def _parseQuantifier(self) -> "tuple[int, int | None]":
        """Read an optional quantifier at the cursor.

        Returns:
            ``(min, max)`` repetitions, ``max`` being ``None`` when unbounded.
            ``(1, 1)`` is returned, without consuming anything, when no
            quantifier is present.
        """

        if len(self.regex) <= self.index:
            return 1, 1

        char = self.regex[self.index]

        if char == '*':
            self.index += 1
            return 0, None
        elif char == '+':
            self.index += 1
            return 1, None
        elif char == '?':
            self.index += 1
            return 0, 1
        elif char == '{':
            self.index += 1
            return self._parseMinMax()
        else:
            return 1, 1

    def _parseMinMax(self) -> "tuple[int, int | None]":
        """Parse ``n}``, ``n,}`` or ``n,m}``; the cursor is just after ``{``.

        Spaces are allowed around the numbers and the comma.

        Returns:
            ``(n, n)``, ``(n, None)`` or ``(n, m)`` respectively.

        Raises:
            RegexError: If the quantifier is malformed or ``m < n``.
        """
        self._skipSpaces()

        min_len = 0
        # ASCII digits only: str.isdigit() accepts characters int() rejects.
        if self.index >= len(self.regex) or self.regex[self.index] not in self.DIGITS:
            self._raise_error("Invalid quantifier")
        while self.index < len(self.regex) and self.regex[self.index] in self.DIGITS:
            min_len = min_len * 10 + int(self.regex[self.index])
            self.index += 1

        self._skipSpaces()

        if self.index >= len(self.regex):
            self._raise_error("Invalid quantifier")

        if self.regex[self.index] == '}':
            self.index += 1
            return min_len, min_len
        if self.regex[self.index] != ',':
            self._raise_error("Invalid quantifier")

        self.index += 1
        self._skipSpaces()

        if self.index >= len(self.regex) or self.regex[self.index] not in '0123456789}':
            self._raise_error("Invalid quantifier")

        if self.regex[self.index] == '}':
            self.index += 1
            return min_len, None

        max_len = 0
        while self.index < len(self.regex) and self.regex[self.index] in self.DIGITS:
            max_len = max_len * 10 + int(self.regex[self.index])
            self.index += 1

        if max_len < min_len:
            self._raise_error(
                "Max length cannot be less than min length in quantifier")

        self._skipSpaces()

        if self.index >= len(self.regex) or self.regex[self.index] != '}':
            self._raise_error("Invalid quantifier")
        self.index += 1

        return min_len, max_len

    def _skipSpaces(self):
        """Advance the cursor past any run of space characters."""
        while self.index < len(self.regex) and self.regex[self.index] == ' ':
            self.index += 1

    def _raise_error(self, message: str):
        """Raise ``RegexError`` for the current cursor position; never returns."""
        raise RegexError(self.regex, self.index, message)

regex_enumerator/regex_enumerator.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
1-
from .regex_parser import RegexParser
1+
import sys
2+
if sys.version_info >= (3, 10):
3+
from .parser.regex_parser import RegexParser
4+
else:
5+
from .parser.regex_parser_legacy import RegexParser
26
from .regex_tree import RegexTree
37

48

regex_enumerator/regex_error.py

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
class RegexError(Exception):
    """Error raised when a regular expression cannot be parsed.

    Attributes:
        regex: The full regular expression that was being parsed.
        index: Position in ``regex`` the parser had reached when it failed.
        message: Human-readable description of the problem.
    """

    def __init__(self, regex: str, index: int, message: str):
        # Forward the arguments to Exception so that ``args`` is populated;
        # copy/pickle rebuild an exception by calling ``type(*args)``, which
        # fails for a three-argument constructor when ``args`` is empty.
        super().__init__(regex, index, message)
        self.regex = regex
        self.index = index
        self.message = message

    def __str__(self):
        """Return the regex, a caret line pointing at ``index``, and the message."""
        caret_line = ' ' * self.index + '^'
        return f"\n{self.regex}\n{caret_line}\n{self.message}"

setup.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,9 @@
1818
'exhaustive matching', 'exhaustive search', 'regex testing', 'regex tools', 'string enumeration', 'data generation'],
1919
long_description=long_description,
2020
long_description_content_type="text/markdown",
21-
python_requires='>=3.10',
21+
python_requires='>=3.9',
2222
classifiers=[
23-
'Programming Language :: Python :: 3.10',
23+
'Programming Language :: Python :: 3.9',
2424
'Operating System :: OS Independent',
2525
'License :: OSI Approved :: MIT License',
2626
],

0 commit comments

Comments
 (0)