Skip to content

Commit 6e91423

Browse files
authored
Merge pull request #2532 from haxtibal/tdmg/tighten_node_parsing
feat(backend/sdoc_source_code): tighten lark grammar for custom source nodes
2 parents 1644d3c + 2a40506 commit 6e91423

File tree

7 files changed

+142
-36
lines changed

7 files changed

+142
-36
lines changed

strictdoc/backend/sdoc_source_code/caching_reader.py

Lines changed: 17 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44

55
from typing import Optional, Union
66

7+
from strictdoc.backend.sdoc.models.grammar_element import GrammarElement
78
from strictdoc.backend.sdoc.pickle_cache import PickleCache
89
from strictdoc.backend.sdoc_source_code.models.source_file_info import (
910
SourceFileTraceabilityInfo,
@@ -26,7 +27,9 @@
2627
class SourceFileTraceabilityCachingReader:
2728
@staticmethod
2829
def read_from_file(
29-
path_to_file: str, project_config: ProjectConfig
30+
path_to_file: str,
31+
project_config: ProjectConfig,
32+
source_node_grammar_element: Optional[GrammarElement],
3033
) -> Optional[SourceFileTraceabilityInfo]:
3134
unpickled_content = PickleCache.read_from_cache(
3235
path_to_file, project_config, "source_file"
@@ -39,7 +42,7 @@ def read_from_file(
3942
return unpickled_content
4043

4144
reader = SourceFileTraceabilityCachingReader._get_reader(
42-
path_to_file, project_config
45+
path_to_file, project_config, source_node_grammar_element
4346
)
4447
try:
4548
traceability_info = reader.read_from_file(path_to_file)
@@ -59,7 +62,9 @@ def read_from_file(
5962

6063
@staticmethod
6164
def _get_reader(
62-
path_to_file: str, project_config: ProjectConfig
65+
path_to_file: str,
66+
project_config: ProjectConfig,
67+
source_node_grammar_element: Optional[GrammarElement],
6368
) -> Union[
6469
SourceFileTraceabilityReader,
6570
SourceFileTraceabilityReader_Python,
@@ -77,8 +82,15 @@ def _get_reader(
7782
or path_to_file.endswith(".hpp")
7883
or path_to_file.endswith(".cpp")
7984
):
80-
parse_nodes = project_config.shall_parse_nodes(path_to_file)
81-
return SourceFileTraceabilityReader_C(parse_nodes=parse_nodes)
85+
custom_tags = (
86+
[
87+
field.title
88+
for field in source_node_grammar_element.fields
89+
]
90+
if source_node_grammar_element is not None
91+
else None
92+
)
93+
return SourceFileTraceabilityReader_C(custom_tags=custom_tags)
8294
if path_to_file.endswith(".robot"):
8395
return SourceFileTraceabilityReader_Robot()
8496
return SourceFileTraceabilityReader()

strictdoc/backend/sdoc_source_code/comment_parser/marker_lexer.py

Lines changed: 24 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
"""
44

55
from string import Template
6+
from typing import Optional
67

78
from lark import Lark, ParseTree, UnexpectedToken
89

@@ -17,7 +18,17 @@ class GrammarTemplate(Template):
1718

1819

1920
RELATION_MARKER_START = r"@relation[\(\{]"
20-
REGEX_NODE_NAME = r"[A-Za-z0-9_\-]+"
21+
22+
NODE_GRAMMAR_EXTENSION = GrammarTemplate("""
23+
node_field: node_name ":" node_multiline_value
24+
node_name: /##CUSTOM_TAGS/
25+
node_multiline_value: (_WS_INLINE | _NL) (NODE_FIRST_STRING_VALUE _NL) (NODE_STRING_VALUE _NL)*
26+
27+
NODE_FIRST_STRING_VALUE.2: /\\s*[^\n\r]+/x
28+
NODE_STRING_VALUE.2: /(?![ ]*##RELATION_MARKER_START)(?!\\s*[A-Z_]+: )[^\n\r]+/x
29+
30+
_NORMAL_STRING_NO_MARKER_NO_NODE: /(?!\\s*##RELATION_MARKER_START)((?!\\s*(##CUSTOM_TAGS): )|(##RESERVED_KEYWORDS)).+/
31+
""")
2132

2233
GRAMMAR = GrammarTemplate("""
2334
start: ##START
@@ -29,14 +40,7 @@ class GrammarTemplate(Template):
2940
relation_scope: /file|class|function|line|range_start|range_end/
3041
relation_role: ALPHANUMERIC_WORD
3142
32-
node_field: node_name ":" node_multiline_value
33-
node_name: /(?!(##RESERVED_KEYWORDS))##REGEX_NODE_NAME/
34-
node_multiline_value: (_WS_INLINE | _NL) (NODE_FIRST_STRING_VALUE _NL) (NODE_STRING_VALUE _NL)*
35-
36-
NODE_FIRST_STRING_VALUE.2: /\\s*[^\n\r]+/x
37-
NODE_STRING_VALUE.2: /(?![ ]*##RELATION_MARKER_START)(?!\\s*[A-Z_]+: )[^\n\r]+/x
38-
39-
_NORMAL_STRING_NO_MARKER_NO_NODE: /(?!\\s*##RELATION_MARKER_START)((?!\\s*##REGEX_NODE_NAME: )|(##RESERVED_KEYWORDS)).+/
43+
##GRAMMAR_EXTENSION
4044
4145
_NORMAL_STRING_NO_MARKER: /(?!\\s*##RELATION_MARKER_START).+/
4246
@@ -62,16 +66,23 @@ class GrammarTemplate(Template):
6266

6367
class MarkerLexer:
6468
@staticmethod
65-
def parse(source_input: str, parse_nodes: bool = False) -> ParseTree:
66-
if parse_nodes:
69+
def parse(
70+
source_input: str, custom_tags: Optional[list[str]] = None
71+
) -> ParseTree:
72+
if custom_tags is not None:
73+
grammar_extension = NODE_GRAMMAR_EXTENSION.substitute(
74+
CUSTOM_TAGS="|".join(f"{tag}(?=:)" for tag in custom_tags),
75+
RESERVED_KEYWORDS=RESERVED_KEYWORDS,
76+
RELATION_MARKER_START=RELATION_MARKER_START,
77+
)
6778
start = "(relation_marker | node_field | _NORMAL_STRING_NO_MARKER_NO_NODE | _WS)*"
6879
else:
80+
grammar_extension = ""
6981
start = "(relation_marker | _NORMAL_STRING_NO_MARKER | _WS)*"
7082

7183
grammar = GRAMMAR.substitute(
72-
REGEX_NODE_NAME=REGEX_NODE_NAME,
84+
GRAMMAR_EXTENSION=grammar_extension,
7385
RELATION_MARKER_START=RELATION_MARKER_START,
74-
RESERVED_KEYWORDS=RESERVED_KEYWORDS,
7586
REGEX_REQ=REGEX_REQ,
7687
START=start,
7788
)

strictdoc/backend/sdoc_source_code/marker_parser.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ def parse(
3232
comment_line_start: int,
3333
entity_name: Optional[str] = None,
3434
col_offset: int = 0,
35-
parse_nodes: bool = False,
35+
custom_tags: Optional[list[str]] = None,
3636
) -> SourceNode:
3737
"""
3838
Parse relation markers from source file comments.
@@ -54,7 +54,7 @@ def parse(
5454
input_string = preprocess_source_code_comment(input_string)
5555

5656
tree: ParseTree = MarkerLexer.parse(
57-
input_string, parse_nodes=parse_nodes
57+
input_string, custom_tags=custom_tags
5858
)
5959

6060
for element_ in tree.children:

strictdoc/backend/sdoc_source_code/reader_c.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,8 @@
4040

4141

4242
class SourceFileTraceabilityReader_C:
43-
def __init__(self, parse_nodes: bool = False) -> None:
44-
self.parse_nodes: bool = parse_nodes
43+
def __init__(self, custom_tags: Optional[list[str]] = None) -> None:
44+
self.custom_tags: Optional[list[str]] = custom_tags
4545

4646
def read(
4747
self,
@@ -93,7 +93,7 @@ def read(
9393
if input_buffer[-1] == 10
9494
else node_.end_point[0] + 1,
9595
node_.start_point[0] + 1,
96-
parse_nodes=self.parse_nodes,
96+
custom_tags=self.custom_tags,
9797
)
9898
for marker_ in source_node.markers:
9999
if not isinstance(marker_, FunctionRangeMarker):
@@ -192,7 +192,7 @@ def read(
192192
function_last_line,
193193
function_comment_node.start_point[0] + 1,
194194
entity_name=function_display_name,
195-
parse_nodes=self.parse_nodes,
195+
custom_tags=self.custom_tags,
196196
)
197197
for marker_ in source_node.markers:
198198
if isinstance(marker_, FunctionRangeMarker) and (
@@ -300,7 +300,7 @@ def read(
300300
function_last_line,
301301
function_comment_node.start_point[0] + 1,
302302
entity_name=function_display_name,
303-
parse_nodes=self.parse_nodes,
303+
custom_tags=self.custom_tags,
304304
)
305305
traceability_info.source_nodes.append(source_node)
306306
for marker_ in source_node.markers:
@@ -356,7 +356,7 @@ def read(
356356
node_.start_point[0] + 1,
357357
node_.end_point[0] + 1,
358358
node_.start_point[0] + 1,
359-
parse_nodes=False,
359+
custom_tags=None,
360360
)
361361

362362
for marker_ in source_node.markers:

strictdoc/core/project_config.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -458,6 +458,26 @@ def shall_parse_nodes(self, path_to_file: str) -> bool:
458458

459459
return False
460460

461+
def parse_nodes_type(self, path_to_file: str) -> Optional[tuple[str, str]]:
    """
    Find the source-node configuration entry that covers a source file.

    Every entry of ``self.source_nodes`` is resolved against
    ``self.source_root_path`` (the result is cached in the entry under the
    ``"full_path"`` key). An entry covers ``path_to_file`` when its resolved
    path is the file itself or one of its parent directories. The comparison
    is textual; no path normalization is performed here.

    :param path_to_file: Path of the source file to look up.
    :return: ``(uid, node_type)`` of the first covering entry, or ``None``
             when no source root is configured or no entry covers the file.
    """
    if self.source_root_path is None:
        return None

    # os.altsep is None on POSIX and "/" on Windows; keep only the real ones.
    separators = tuple(sep_ for sep_ in (os.sep, os.altsep) if sep_)

    for sdoc_source_config_entry_ in self.source_nodes:
        # FIXME: Move the setting of full paths to .finalize() of this config
        # class when it is implemented.
        full_path = sdoc_source_config_entry_.setdefault(
            "full_path",
            os.path.join(
                self.source_root_path, sdoc_source_config_entry_["path"]
            ),
        )
        if not path_to_file.startswith(full_path):
            continue

        # A bare string-prefix match is not enough: "<root>/src" must not
        # claim "<root>/src_other/file.c". Accept only an exact match or a
        # match that ends on a path-component boundary.
        remainder = path_to_file[len(full_path):]
        if (
            not remainder
            or not full_path
            or full_path.endswith(separators)
            or remainder.startswith(separators)
        ):
            return (
                sdoc_source_config_entry_["uid"],
                sdoc_source_config_entry_["node_type"],
            )

    return None
480+
461481

462482
class ProjectConfigLoader:
463483
@staticmethod

strictdoc/core/traceability_index_builder.py

Lines changed: 31 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,10 @@
1515
from strictdoc.backend.sdoc.models.document import SDocDocument
1616
from strictdoc.backend.sdoc.models.document_from_file import DocumentFromFile
1717
from strictdoc.backend.sdoc.models.document_grammar import DocumentGrammar
18-
from strictdoc.backend.sdoc.models.grammar_element import ReferenceType
18+
from strictdoc.backend.sdoc.models.grammar_element import (
19+
GrammarElement,
20+
ReferenceType,
21+
)
1922
from strictdoc.backend.sdoc.models.inline_link import InlineLink
2023
from strictdoc.backend.sdoc.models.model import SDocDocumentFromFileIF
2124
from strictdoc.backend.sdoc.models.node import SDocNode
@@ -139,9 +142,18 @@ def create(
139142
with measure_performance(
140143
f"Reading source: {source_file.in_doctree_source_file_rel_path}"
141144
):
145+
source_node_grammar_element = (
146+
TraceabilityIndexBuilder.source_node_grammar_element(
147+
source_file.full_path,
148+
project_config,
149+
traceability_index,
150+
)
151+
)
142152
traceability_info = (
143153
SourceFileTraceabilityCachingReader.read_from_file(
144-
source_file.full_path, project_config
154+
source_file.full_path,
155+
project_config,
156+
source_node_grammar_element,
145157
)
146158
)
147159

@@ -863,3 +875,20 @@ def _filter_nodes(
863875
raise StrictDocException(
864876
f"Cannot apply a filter query to a node: {attribute_error_}"
865877
) from attribute_error_
878+
879+
@staticmethod
def source_node_grammar_element(
    path_to_file: str,
    project_config: ProjectConfig,
    traceability_index: TraceabilityIndex,
) -> Optional[GrammarElement]:
    """
    Resolve the grammar element configured for nodes of a source file.

    The project config is asked for the ``(uid, node_type)`` pair that
    applies to ``path_to_file``. The UID is looked up in the traceability
    index and the result is expected to be a document; the grammar element
    registered in that document's grammar under ``node_type`` is returned.

    :param path_to_file: Path of the source file being read.
    :param project_config: Config queried via ``parse_nodes_type()``.
    :param traceability_index: Index used to look up the node by UID.
    :return: The matching grammar element, or ``None`` when the config has
             no entry for the file or the grammar has no element of the
             configured type.
    """
    maybe_parse_nodes_type = project_config.parse_nodes_type(path_to_file)
    if maybe_parse_nodes_type is None:
        return None
    parse_nodes_uid, parse_nodes_type = maybe_parse_nodes_type
    # NOTE(review): presumably assert_cast fails if the UID is unknown or
    # does not refer to an SDocDocument - confirm how a misconfigured UID
    # is reported to the user.
    sdoc_document = assert_cast(
        traceability_index.get_node_by_uid_weak2(parse_nodes_uid),
        SDocDocument,
    )
    assert sdoc_document.grammar is not None
    # .get() yields None when the grammar defines no element of this type.
    return sdoc_document.grammar.elements_by_type.get(parse_nodes_type)

tests/unit/strictdoc/backend/sdoc_source_code/test_marker_lexer.py

Lines changed: 42 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -191,7 +191,7 @@ def test_30_relation_and_field():
191191
FOOBAR
192192
"""
193193

194-
tree = MarkerLexer.parse(input_string, parse_nodes=True)
194+
tree = MarkerLexer.parse(input_string, custom_tags=["STATEMENT"])
195195
assert tree.data == "start"
196196

197197
assert len(tree.children) == 5
@@ -256,7 +256,7 @@ def test_31_single_node_field():
256256
STATEMENT: This can likely replace _weak below with no problem.
257257
"""
258258

259-
tree = MarkerLexer.parse(input_string, parse_nodes=True)
259+
tree = MarkerLexer.parse(input_string, custom_tags=["STATEMENT"])
260260
assert tree.data == "start"
261261

262262
assert len(tree.children) == 1
@@ -291,7 +291,7 @@ def test_31B_single_node_field():
291291
292292
""" # noqa: W293
293293

294-
tree = MarkerLexer.parse(input_string, parse_nodes=True)
294+
tree = MarkerLexer.parse(input_string, custom_tags=["INTENTION"])
295295
assert tree.data == "start"
296296

297297
assert len(tree.children) == 1
@@ -326,7 +326,7 @@ def test_31C_single_node_field():
326326
}
327327
""" # noqa: W293
328328

329-
tree = MarkerLexer.parse(input_string, parse_nodes=True)
329+
tree = MarkerLexer.parse(input_string, custom_tags=["INTENTION"])
330330
assert tree.data == "start"
331331

332332
assert len(tree.children) == 2
@@ -351,7 +351,7 @@ def test_32_two_single_line_fields():
351351
STATEMENT: This can likely replace _weak below with no problem.
352352
"""
353353

354-
tree = MarkerLexer.parse(input_string, parse_nodes=True)
354+
tree = MarkerLexer.parse(input_string, custom_tags=["STATEMENT"])
355355
assert tree.data == "start"
356356

357357
assert len(tree.children) == 2
@@ -375,7 +375,9 @@ def test_32B_two_single_line_fields_consecutive():
375375
STATEMENTT: This can likely replace _weak below with no problem.
376376
"""
377377

378-
tree = MarkerLexer.parse(input_string, parse_nodes=True)
378+
tree = MarkerLexer.parse(
379+
input_string, custom_tags=["STATEMENT", "STATEMENTT"]
380+
)
379381

380382
assert tree.data == "start"
381383

@@ -403,7 +405,7 @@ def test_33_multiline_and_multiparagraph_fields():
403405
FOOBAR
404406
"""
405407

406-
tree = MarkerLexer.parse(input_string, parse_nodes=True)
408+
tree = MarkerLexer.parse(input_string, custom_tags=["STATEMENT"])
407409
assert tree.data == "start"
408410

409411
assert len(tree.children) == 1
@@ -434,6 +436,38 @@ def test_60_exclude_reserved_keywords():
434436
assert len(tree.children) == 0
435437

436438

439+
def test_70_exclude_similar_but_not_in_grammar():
    """
    Only tags passed via custom_tags are parsed as node fields.

    The "Note:", "FYI:" and "Hint:" lines look like fields but are not in
    custom_tags, so the tree must contain exactly the two node_field
    children for STATEMENT and TEST.
    """
    input_string = """
Note: This is ordinary comment text.

STATEMENT: This can likely replace _weak below with no problem.
FYI: More ordinary comment text.

TEST: This can likely replace _weak below with no problem.

Hint: Again, ordinary comment text.
"""

    tree = MarkerLexer.parse(input_string, custom_tags=["STATEMENT", "TEST"])
    assert tree.data == "start"
    assert len(tree.children) == 2
    assert tree.children[0].data == "node_field"
    assert tree.children[0].children[0].data == "node_name"
    assert tree.children[0].children[0].children[0].value == "STATEMENT"
    assert (
        tree.children[0].children[1].children[0].value
        == "This can likely replace _weak below with no problem."
    )
    assert tree.children[0].children[1].data == "node_multiline_value"
    assert tree.children[1].children[0].data == "node_name"
    assert tree.children[1].children[0].children[0].value == "TEST"
    assert (
        tree.children[1].children[1].children[0].value
        == "This can likely replace _weak below with no problem."
    )
    assert tree.children[1].children[1].data == "node_multiline_value"
469+
470+
437471
def test_80_linux_spdx_like_identifiers():
438472
input_string = """\
439473
SPDX-ID: REQ-1
@@ -445,7 +479,7 @@ def test_80_linux_spdx_like_identifiers():
445479
And this is the same statement's another paragraph.
446480
"""
447481

448-
tree = MarkerLexer.parse(input_string, parse_nodes=True)
482+
tree = MarkerLexer.parse(input_string, custom_tags=["SPDX-ID", "SPDX-Text"])
449483
assert tree.data == "start"
450484

451485
assert len(tree.children) == 2

0 commit comments

Comments
 (0)