
Commit 2a40506

feat(backend/sdoc_source_code): tighten lark grammar for custom source nodes
With source node parsing enabled, the comment parser treated any word followed by ":" as a StrictDoc-relevant custom tag. This is overly greedy. The allowed tag values are known at startup, so the search is now narrowed to exactly those tags. This prevents parsing errors and unexpected results when a comment happens to contain "someword:" that is not an intentional SDoc annotation.
1 parent 9125120 commit 2a40506
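
Illustration of the narrowing described above (a standalone sketch, not StrictDoc's code; the tag list is hypothetical): the allowed tag names are joined into a regex alternation in which each tag must be immediately followed by ":", so arbitrary "someword:" text no longer matches as a node field.

import re

# Hypothetical tag list; in StrictDoc it is derived from the field titles of the
# source node grammar element.
custom_tags = ["STATEMENT", "INTENTION"]

# Same construction as the CUSTOM_TAGS substitution in MarkerLexer.parse below.
tags_pattern = "|".join(f"{tag}(?=:)" for tag in custom_tags)  # "STATEMENT(?=:)|INTENTION(?=:)"

print(re.match(tags_pattern, "STATEMENT: traced statement"))  # matches "STATEMENT"
print(re.match(tags_pattern, "Note: ordinary comment text"))  # None -> stays plain comment text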

7 files changed: +142 -36 lines changed


strictdoc/backend/sdoc_source_code/caching_reader.py

Lines changed: 17 additions & 5 deletions
@@ -4,6 +4,7 @@
 
 from typing import Optional, Union
 
+from strictdoc.backend.sdoc.models.grammar_element import GrammarElement
 from strictdoc.backend.sdoc.pickle_cache import PickleCache
 from strictdoc.backend.sdoc_source_code.models.source_file_info import (
     SourceFileTraceabilityInfo,
@@ -26,7 +27,9 @@
 class SourceFileTraceabilityCachingReader:
     @staticmethod
     def read_from_file(
-        path_to_file: str, project_config: ProjectConfig
+        path_to_file: str,
+        project_config: ProjectConfig,
+        source_node_grammar_element: Optional[GrammarElement],
     ) -> Optional[SourceFileTraceabilityInfo]:
         unpickled_content = PickleCache.read_from_cache(
             path_to_file, project_config, "source_file"
@@ -39,7 +42,7 @@ def read_from_file(
             return unpickled_content
 
         reader = SourceFileTraceabilityCachingReader._get_reader(
-            path_to_file, project_config
+            path_to_file, project_config, source_node_grammar_element
         )
         try:
             traceability_info = reader.read_from_file(path_to_file)
@@ -59,7 +62,9 @@ def read_from_file(
 
     @staticmethod
     def _get_reader(
-        path_to_file: str, project_config: ProjectConfig
+        path_to_file: str,
+        project_config: ProjectConfig,
+        source_node_grammar_element: Optional[GrammarElement],
     ) -> Union[
         SourceFileTraceabilityReader,
         SourceFileTraceabilityReader_Python,
@@ -77,8 +82,15 @@ def _get_reader(
             or path_to_file.endswith(".hpp")
             or path_to_file.endswith(".cpp")
         ):
-            parse_nodes = project_config.shall_parse_nodes(path_to_file)
-            return SourceFileTraceabilityReader_C(parse_nodes=parse_nodes)
+            custom_tags = (
+                [
+                    field.title
+                    for field in source_node_grammar_element.fields
+                ]
+                if source_node_grammar_element is not None
+                else None
+            )
+            return SourceFileTraceabilityReader_C(custom_tags=custom_tags)
         if path_to_file.endswith(".robot"):
             return SourceFileTraceabilityReader_Robot()
         return SourceFileTraceabilityReader()

strictdoc/backend/sdoc_source_code/comment_parser/marker_lexer.py

Lines changed: 24 additions & 13 deletions
@@ -3,6 +3,7 @@
 """
 
 from string import Template
+from typing import Optional
 
 from lark import Lark, ParseTree, UnexpectedToken
 
@@ -17,7 +18,17 @@ class GrammarTemplate(Template):
 
 
 RELATION_MARKER_START = r"@relation[\(\{]"
-REGEX_NODE_NAME = r"[A-Za-z0-9_\-]+"
+
+NODE_GRAMMAR_EXTENSION = GrammarTemplate("""
+node_field: node_name ":" node_multiline_value
+node_name: /##CUSTOM_TAGS/
+node_multiline_value: (_WS_INLINE | _NL) (NODE_FIRST_STRING_VALUE _NL) (NODE_STRING_VALUE _NL)*
+
+NODE_FIRST_STRING_VALUE.2: /\\s*[^\n\r]+/x
+NODE_STRING_VALUE.2: /(?![ ]*##RELATION_MARKER_START)(?!\\s*[A-Z_]+: )[^\n\r]+/x
+
+_NORMAL_STRING_NO_MARKER_NO_NODE: /(?!\\s*##RELATION_MARKER_START)((?!\\s*(##CUSTOM_TAGS): )|(##RESERVED_KEYWORDS)).+/
+""")
 
 GRAMMAR = GrammarTemplate("""
 start: ##START
@@ -29,14 +40,7 @@ class GrammarTemplate(Template):
 relation_scope: /file|class|function|line|range_start|range_end/
 relation_role: ALPHANUMERIC_WORD
 
-node_field: node_name ":" node_multiline_value
-node_name: /(?!(##RESERVED_KEYWORDS))##REGEX_NODE_NAME/
-node_multiline_value: (_WS_INLINE | _NL) (NODE_FIRST_STRING_VALUE _NL) (NODE_STRING_VALUE _NL)*
-
-NODE_FIRST_STRING_VALUE.2: /\\s*[^\n\r]+/x
-NODE_STRING_VALUE.2: /(?![ ]*##RELATION_MARKER_START)(?!\\s*[A-Z_]+: )[^\n\r]+/x
-
-_NORMAL_STRING_NO_MARKER_NO_NODE: /(?!\\s*##RELATION_MARKER_START)((?!\\s*##REGEX_NODE_NAME: )|(##RESERVED_KEYWORDS)).+/
+##GRAMMAR_EXTENSION
 
 _NORMAL_STRING_NO_MARKER: /(?!\\s*##RELATION_MARKER_START).+/
 
@@ -62,16 +66,23 @@ class GrammarTemplate(Template):
 
 class MarkerLexer:
     @staticmethod
-    def parse(source_input: str, parse_nodes: bool = False) -> ParseTree:
-        if parse_nodes:
+    def parse(
+        source_input: str, custom_tags: Optional[list[str]] = None
+    ) -> ParseTree:
+        if custom_tags is not None:
+            grammar_extension = NODE_GRAMMAR_EXTENSION.substitute(
+                CUSTOM_TAGS="|".join(f"{tag}(?=:)" for tag in custom_tags),
+                RESERVED_KEYWORDS=RESERVED_KEYWORDS,
+                RELATION_MARKER_START=RELATION_MARKER_START,
+            )
             start = "(relation_marker | node_field | _NORMAL_STRING_NO_MARKER_NO_NODE | _WS)*"
         else:
+            grammar_extension = ""
             start = "(relation_marker | _NORMAL_STRING_NO_MARKER | _WS)*"
 
         grammar = GRAMMAR.substitute(
-            REGEX_NODE_NAME=REGEX_NODE_NAME,
+            GRAMMAR_EXTENSION=grammar_extension,
             RELATION_MARKER_START=RELATION_MARKER_START,
-            RESERVED_KEYWORDS=RESERVED_KEYWORDS,
             REGEX_REQ=REGEX_REQ,
             START=start,
         )
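
Resulting call pattern (a sketch mirroring the updated unit tests at the end of this commit; the input text here is made up): callers pass the allowed tag names instead of a boolean flag, and only those tags are recognized as node fields.

tree = MarkerLexer.parse(
    "Note: plain comment text\nSTATEMENT: a traced statement\n",
    custom_tags=["STATEMENT"],
)
# "STATEMENT:" is parsed as a node_field; "Note:" is not in custom_tags and
# therefore remains ordinary comment text.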

strictdoc/backend/sdoc_source_code/marker_parser.py

Lines changed: 2 additions & 2 deletions
@@ -32,7 +32,7 @@ def parse(
         comment_line_start: int,
         entity_name: Optional[str] = None,
         col_offset: int = 0,
-        parse_nodes: bool = False,
+        custom_tags: Optional[list[str]] = None,
     ) -> SourceNode:
         """
         Parse relation markers from source file comments.
@@ -54,7 +54,7 @@ def parse(
         input_string = preprocess_source_code_comment(input_string)
 
         tree: ParseTree = MarkerLexer.parse(
-            input_string, parse_nodes=parse_nodes
+            input_string, custom_tags=custom_tags
         )
 
         for element_ in tree.children:

strictdoc/backend/sdoc_source_code/reader_c.py

Lines changed: 6 additions & 6 deletions
@@ -40,8 +40,8 @@
 
 
 class SourceFileTraceabilityReader_C:
-    def __init__(self, parse_nodes: bool = False) -> None:
-        self.parse_nodes: bool = parse_nodes
+    def __init__(self, custom_tags: Optional[list[str]] = None) -> None:
+        self.custom_tags: Optional[list[str]] = custom_tags
 
     def read(
         self,
@@ -93,7 +93,7 @@ def read(
                 if input_buffer[-1] == 10
                 else node_.end_point[0] + 1,
                 node_.start_point[0] + 1,
-                parse_nodes=self.parse_nodes,
+                custom_tags=self.custom_tags,
             )
             for marker_ in source_node.markers:
                 if not isinstance(marker_, FunctionRangeMarker):
@@ -192,7 +192,7 @@ def read(
                 function_last_line,
                 function_comment_node.start_point[0] + 1,
                 entity_name=function_display_name,
-                parse_nodes=self.parse_nodes,
+                custom_tags=self.custom_tags,
             )
             for marker_ in source_node.markers:
                 if isinstance(marker_, FunctionRangeMarker) and (
@@ -300,7 +300,7 @@ def read(
                 function_last_line,
                 function_comment_node.start_point[0] + 1,
                 entity_name=function_display_name,
-                parse_nodes=self.parse_nodes,
+                custom_tags=self.custom_tags,
             )
             traceability_info.source_nodes.append(source_node)
             for marker_ in source_node.markers:
@@ -356,7 +356,7 @@ def read(
                 node_.start_point[0] + 1,
                 node_.end_point[0] + 1,
                 node_.start_point[0] + 1,
-                parse_nodes=False,
+                custom_tags=None,
            )
 
             for marker_ in source_node.markers:

strictdoc/core/project_config.py

Lines changed: 20 additions & 0 deletions
@@ -458,6 +458,26 @@ def shall_parse_nodes(self, path_to_file: str) -> bool:
 
         return False
 
+    def parse_nodes_type(self, path_to_file: str) -> Optional[tuple[str, str]]:
+        if self.source_root_path is None:
+            return None
+
+        for sdoc_source_config_entry_ in self.source_nodes:
+            # FIXME: Move the setting of full paths to .finalize() of this config
+            # class when it is implemented.
+            full_path = sdoc_source_config_entry_.setdefault(
+                "full_path",
+                os.path.join(
+                    self.source_root_path, sdoc_source_config_entry_["path"]
+                ),
+            )
+            if path_to_file.startswith(full_path):
+                return sdoc_source_config_entry_[
+                    "uid"
+                ], sdoc_source_config_entry_["node_type"]
+
+        return None
+
 
 class ProjectConfigLoader:
     @staticmethod

strictdoc/core/traceability_index_builder.py

Lines changed: 31 additions & 2 deletions
@@ -15,7 +15,10 @@
 from strictdoc.backend.sdoc.models.document import SDocDocument
 from strictdoc.backend.sdoc.models.document_from_file import DocumentFromFile
 from strictdoc.backend.sdoc.models.document_grammar import DocumentGrammar
-from strictdoc.backend.sdoc.models.grammar_element import ReferenceType
+from strictdoc.backend.sdoc.models.grammar_element import (
+    GrammarElement,
+    ReferenceType,
+)
 from strictdoc.backend.sdoc.models.inline_link import InlineLink
 from strictdoc.backend.sdoc.models.model import SDocDocumentFromFileIF
 from strictdoc.backend.sdoc.models.node import SDocNode
@@ -139,9 +142,18 @@ def create(
             with measure_performance(
                 f"Reading source: {source_file.in_doctree_source_file_rel_path}"
             ):
+                source_node_grammar_element = (
+                    TraceabilityIndexBuilder.source_node_grammar_element(
+                        source_file.full_path,
+                        project_config,
+                        traceability_index,
+                    )
+                )
                 traceability_info = (
                     SourceFileTraceabilityCachingReader.read_from_file(
-                        source_file.full_path, project_config
+                        source_file.full_path,
+                        project_config,
+                        source_node_grammar_element,
                     )
                 )
 
@@ -863,3 +875,20 @@ def _filter_nodes(
             raise StrictDocException(
                 f"Cannot apply a filter query to a node: {attribute_error_}"
             ) from attribute_error_
+
+    @staticmethod
+    def source_node_grammar_element(
+        path_to_file: str,
+        project_config: ProjectConfig,
+        traceability_index: TraceabilityIndex,
+    ) -> Optional[GrammarElement]:
+        maybe_parse_nodes_type = project_config.parse_nodes_type(path_to_file)
+        if maybe_parse_nodes_type is None:
+            return None
+        parse_nodes_uid, parse_nodes_type = maybe_parse_nodes_type
+        sdoc_document = assert_cast(
+            traceability_index.get_node_by_uid_weak2(parse_nodes_uid),
+            SDocDocument,
+        )
+        assert sdoc_document.grammar is not None
+        return sdoc_document.grammar.elements_by_type.get(parse_nodes_type)

tests/unit/strictdoc/backend/sdoc_source_code/test_marker_lexer.py

Lines changed: 42 additions & 8 deletions
@@ -191,7 +191,7 @@ def test_30_relation_and_field():
 FOOBAR
 """
 
-    tree = MarkerLexer.parse(input_string, parse_nodes=True)
+    tree = MarkerLexer.parse(input_string, custom_tags=["STATEMENT"])
     assert tree.data == "start"
 
     assert len(tree.children) == 5
@@ -256,7 +256,7 @@ def test_31_single_node_field():
 STATEMENT: This can likely replace _weak below with no problem.
 """
 
-    tree = MarkerLexer.parse(input_string, parse_nodes=True)
+    tree = MarkerLexer.parse(input_string, custom_tags=["STATEMENT"])
     assert tree.data == "start"
 
     assert len(tree.children) == 1
@@ -291,7 +291,7 @@ def test_31B_single_node_field():
 
 """  # noqa: W293
 
-    tree = MarkerLexer.parse(input_string, parse_nodes=True)
+    tree = MarkerLexer.parse(input_string, custom_tags=["INTENTION"])
     assert tree.data == "start"
 
     assert len(tree.children) == 1
@@ -326,7 +326,7 @@ def test_31C_single_node_field():
 }
 """  # noqa: W293
 
-    tree = MarkerLexer.parse(input_string, parse_nodes=True)
+    tree = MarkerLexer.parse(input_string, custom_tags=["INTENTION"])
     assert tree.data == "start"
 
     assert len(tree.children) == 2
@@ -351,7 +351,7 @@ def test_32_two_single_line_fields():
 STATEMENT: This can likely replace _weak below with no problem.
 """
 
-    tree = MarkerLexer.parse(input_string, parse_nodes=True)
+    tree = MarkerLexer.parse(input_string, custom_tags=["STATEMENT"])
     assert tree.data == "start"
 
     assert len(tree.children) == 2
@@ -375,7 +375,9 @@ def test_32B_two_single_line_fields_consecutive():
 STATEMENTT: This can likely replace _weak below with no problem.
 """
 
-    tree = MarkerLexer.parse(input_string, parse_nodes=True)
+    tree = MarkerLexer.parse(
+        input_string, custom_tags=["STATEMENT", "STATEMENTT"]
+    )
 
     assert tree.data == "start"
 
@@ -403,7 +405,7 @@ def test_33_multiline_and_multiparagraph_fields():
 FOOBAR
 """
 
-    tree = MarkerLexer.parse(input_string, parse_nodes=True)
+    tree = MarkerLexer.parse(input_string, custom_tags=["STATEMENT"])
     assert tree.data == "start"
 
     assert len(tree.children) == 1
@@ -434,6 +436,38 @@ def test_60_exclude_reserved_keywords():
     assert len(tree.children) == 0
 
 
+def test_70_exclude_similar_but_not_in_grammar():
+    input_string = """
+Note: This is ordinary comment text.
+
+STATEMENT: This can likely replace _weak below with no problem.
+FYI: More ordinary comment text.
+
+TEST: This can likely replace _weak below with no problem.
+
+Hint: Again, ordinary comment text.
+"""
+
+    tree = MarkerLexer.parse(input_string, custom_tags=["STATEMENT", "TEST"])
+    assert tree.data == "start"
+    assert len(tree.children) == 2
+    assert tree.children[0].data == "node_field"
+    assert tree.children[0].children[0].data == "node_name"
+    assert tree.children[0].children[0].children[0].value == "STATEMENT"
+    assert (
+        tree.children[0].children[1].children[0].value
+        == "This can likely replace _weak below with no problem."
+    )
+    assert tree.children[0].children[1].data == "node_multiline_value"
+    assert tree.children[1].children[0].data == "node_name"
+    assert tree.children[1].children[0].children[0].value == "TEST"
+    assert (
+        tree.children[1].children[1].children[0].value
+        == "This can likely replace _weak below with no problem."
+    )
+    assert tree.children[1].children[1].data == "node_multiline_value"
+
+
 def test_80_linux_spdx_like_identifiers():
     input_string = """\
 SPDX-ID: REQ-1
@@ -445,7 +479,7 @@ def test_80_linux_spdx_like_identifiers():
 And this is the same statement's another paragraph.
 """
 
-    tree = MarkerLexer.parse(input_string, parse_nodes=True)
+    tree = MarkerLexer.parse(input_string, custom_tags=["SPDX-ID", "SPDX-Text"])
     assert tree.data == "start"
 
     assert len(tree.children) == 2
