
Commit 2a40506

feat(backend/sdoc_source_code): tighten lark grammar for custom source nodes
With source node parsing enabled, the comment parser treated any word followed by ":" as a StrictDoc-relevant custom tag. This is overly greedy. The allowed tag values are known at startup, so the search is now narrowed to exactly those tags. This prevents parsing errors and unexpected results when a comment happens to contain "someword:" that is not an intentional SDoc annotation.
1 parent 9125120 commit 2a40506
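
Illustration of the narrowing described above (a standalone sketch, not StrictDoc's code; the tag list is hypothetical): the allowed tag names are joined into a regex alternation in which each tag must be immediately followed by ":", so arbitrary "someword:" text no longer matches as a node field.

import re

# Hypothetical tag list; in StrictDoc it is derived from the field titles of the
# source node grammar element.
custom_tags = ["STATEMENT", "INTENTION"]

# Same construction as the CUSTOM_TAGS substitution in MarkerLexer.parse below.
tags_pattern = "|".join(f"{tag}(?=:)" for tag in custom_tags)  # "STATEMENT(?=:)|INTENTION(?=:)"

print(re.match(tags_pattern, "STATEMENT: traced statement"))  # matches "STATEMENT"
print(re.match(tags_pattern, "Note: ordinary comment text"))  # None -> stays plain comment text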

7 files changed: +142 -36 lines changed


strictdoc/backend/sdoc_source_code/caching_reader.py

Lines changed: 17 additions & 5 deletions
@@ -4,6 +4,7 @@
 
 from typing import Optional, Union
 
+from strictdoc.backend.sdoc.models.grammar_element import GrammarElement
 from strictdoc.backend.sdoc.pickle_cache import PickleCache
 from strictdoc.backend.sdoc_source_code.models.source_file_info import (
     SourceFileTraceabilityInfo,
@@ -26,7 +27,9 @@
 class SourceFileTraceabilityCachingReader:
     @staticmethod
     def read_from_file(
-        path_to_file: str, project_config: ProjectConfig
+        path_to_file: str,
+        project_config: ProjectConfig,
+        source_node_grammar_element: Optional[GrammarElement],
     ) -> Optional[SourceFileTraceabilityInfo]:
         unpickled_content = PickleCache.read_from_cache(
             path_to_file, project_config, "source_file"
@@ -39,7 +42,7 @@ def read_from_file(
             return unpickled_content
 
         reader = SourceFileTraceabilityCachingReader._get_reader(
-            path_to_file, project_config
+            path_to_file, project_config, source_node_grammar_element
         )
         try:
             traceability_info = reader.read_from_file(path_to_file)
@@ -59,7 +62,9 @@ def read_from_file(
 
     @staticmethod
     def _get_reader(
-        path_to_file: str, project_config: ProjectConfig
+        path_to_file: str,
+        project_config: ProjectConfig,
+        source_node_grammar_element: Optional[GrammarElement],
     ) -> Union[
         SourceFileTraceabilityReader,
         SourceFileTraceabilityReader_Python,
@@ -77,8 +82,15 @@ def _get_reader(
             or path_to_file.endswith(".hpp")
             or path_to_file.endswith(".cpp")
         ):
-            parse_nodes = project_config.shall_parse_nodes(path_to_file)
-            return SourceFileTraceabilityReader_C(parse_nodes=parse_nodes)
+            custom_tags = (
+                [
+                    field.title
+                    for field in source_node_grammar_element.fields
+                ]
+                if source_node_grammar_element is not None
+                else None
+            )
+            return SourceFileTraceabilityReader_C(custom_tags=custom_tags)
         if path_to_file.endswith(".robot"):
             return SourceFileTraceabilityReader_Robot()
         return SourceFileTraceabilityReader()

strictdoc/backend/sdoc_source_code/comment_parser/marker_lexer.py

Lines changed: 24 additions & 13 deletions
@@ -3,6 +3,7 @@
 """
 
 from string import Template
+from typing import Optional
 
 from lark import Lark, ParseTree, UnexpectedToken
 
@@ -17,7 +18,17 @@ class GrammarTemplate(Template):
 
 
 RELATION_MARKER_START = r"@relation[\(\{]"
-REGEX_NODE_NAME = r"[A-Za-z0-9_\-]+"
+
+NODE_GRAMMAR_EXTENSION = GrammarTemplate("""
+node_field: node_name ":" node_multiline_value
+node_name: /##CUSTOM_TAGS/
+node_multiline_value: (_WS_INLINE | _NL) (NODE_FIRST_STRING_VALUE _NL) (NODE_STRING_VALUE _NL)*
+
+NODE_FIRST_STRING_VALUE.2: /\\s*[^\n\r]+/x
+NODE_STRING_VALUE.2: /(?![ ]*##RELATION_MARKER_START)(?!\\s*[A-Z_]+: )[^\n\r]+/x
+
+_NORMAL_STRING_NO_MARKER_NO_NODE: /(?!\\s*##RELATION_MARKER_START)((?!\\s*(##CUSTOM_TAGS): )|(##RESERVED_KEYWORDS)).+/
+""")
 
 GRAMMAR = GrammarTemplate("""
 start: ##START
@@ -29,14 +40,7 @@ class GrammarTemplate(Template):
 relation_scope: /file|class|function|line|range_start|range_end/
 relation_role: ALPHANUMERIC_WORD
 
-node_field: node_name ":" node_multiline_value
-node_name: /(?!(##RESERVED_KEYWORDS))##REGEX_NODE_NAME/
-node_multiline_value: (_WS_INLINE | _NL) (NODE_FIRST_STRING_VALUE _NL) (NODE_STRING_VALUE _NL)*
-
-NODE_FIRST_STRING_VALUE.2: /\\s*[^\n\r]+/x
-NODE_STRING_VALUE.2: /(?![ ]*##RELATION_MARKER_START)(?!\\s*[A-Z_]+: )[^\n\r]+/x
-
-_NORMAL_STRING_NO_MARKER_NO_NODE: /(?!\\s*##RELATION_MARKER_START)((?!\\s*##REGEX_NODE_NAME: )|(##RESERVED_KEYWORDS)).+/
+##GRAMMAR_EXTENSION
 
 _NORMAL_STRING_NO_MARKER: /(?!\\s*##RELATION_MARKER_START).+/
 
@@ -62,16 +66,23 @@ class GrammarTemplate(Template):
 
 class MarkerLexer:
     @staticmethod
-    def parse(source_input: str, parse_nodes: bool = False) -> ParseTree:
-        if parse_nodes:
+    def parse(
+        source_input: str, custom_tags: Optional[list[str]] = None
+    ) -> ParseTree:
+        if custom_tags is not None:
+            grammar_extension = NODE_GRAMMAR_EXTENSION.substitute(
+                CUSTOM_TAGS="|".join(f"{tag}(?=:)" for tag in custom_tags),
+                RESERVED_KEYWORDS=RESERVED_KEYWORDS,
+                RELATION_MARKER_START=RELATION_MARKER_START,
+            )
             start = "(relation_marker | node_field | _NORMAL_STRING_NO_MARKER_NO_NODE | _WS)*"
         else:
+            grammar_extension = ""
             start = "(relation_marker | _NORMAL_STRING_NO_MARKER | _WS)*"
 
         grammar = GRAMMAR.substitute(
-            REGEX_NODE_NAME=REGEX_NODE_NAME,
+            GRAMMAR_EXTENSION=grammar_extension,
             RELATION_MARKER_START=RELATION_MARKER_START,
-            RESERVED_KEYWORDS=RESERVED_KEYWORDS,
             REGEX_REQ=REGEX_REQ,
             START=start,
         )
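
Resulting call pattern (a sketch mirroring the updated unit tests at the end of this commit; the input text here is made up): callers pass the allowed tag names instead of a boolean flag, and only those tags are recognized as node fields.

tree = MarkerLexer.parse(
    "Note: plain comment text\nSTATEMENT: a traced statement\n",
    custom_tags=["STATEMENT"],
)
# "STATEMENT:" is parsed as a node_field; "Note:" is not in custom_tags and
# therefore remains ordinary comment text.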

strictdoc/backend/sdoc_source_code/marker_parser.py

Lines changed: 2 additions & 2 deletions
@@ -32,7 +32,7 @@ def parse(
         comment_line_start: int,
         entity_name: Optional[str] = None,
         col_offset: int = 0,
-        parse_nodes: bool = False,
+        custom_tags: Optional[list[str]] = None,
     ) -> SourceNode:
         """
         Parse relation markers from source file comments.
@@ -54,7 +54,7 @@ def parse(
         input_string = preprocess_source_code_comment(input_string)
 
         tree: ParseTree = MarkerLexer.parse(
-            input_string, parse_nodes=parse_nodes
+            input_string, custom_tags=custom_tags
         )
 
         for element_ in tree.children:

strictdoc/backend/sdoc_source_code/reader_c.py

Lines changed: 6 additions & 6 deletions
@@ -40,8 +40,8 @@
 
 
 class SourceFileTraceabilityReader_C:
-    def __init__(self, parse_nodes: bool = False) -> None:
-        self.parse_nodes: bool = parse_nodes
+    def __init__(self, custom_tags: Optional[list[str]] = None) -> None:
+        self.custom_tags: Optional[list[str]] = custom_tags
 
     def read(
         self,
@@ -93,7 +93,7 @@ def read(
                 if input_buffer[-1] == 10
                 else node_.end_point[0] + 1,
                 node_.start_point[0] + 1,
-                parse_nodes=self.parse_nodes,
+                custom_tags=self.custom_tags,
             )
             for marker_ in source_node.markers:
                 if not isinstance(marker_, FunctionRangeMarker):
@@ -192,7 +192,7 @@ def read(
                 function_last_line,
                 function_comment_node.start_point[0] + 1,
                 entity_name=function_display_name,
-                parse_nodes=self.parse_nodes,
+                custom_tags=self.custom_tags,
             )
             for marker_ in source_node.markers:
                 if isinstance(marker_, FunctionRangeMarker) and (
@@ -300,7 +300,7 @@ def read(
                 function_last_line,
                 function_comment_node.start_point[0] + 1,
                 entity_name=function_display_name,
-                parse_nodes=self.parse_nodes,
+                custom_tags=self.custom_tags,
             )
             traceability_info.source_nodes.append(source_node)
             for marker_ in source_node.markers:
@@ -356,7 +356,7 @@ def read(
                 node_.start_point[0] + 1,
                 node_.end_point[0] + 1,
                 node_.start_point[0] + 1,
-                parse_nodes=False,
+                custom_tags=None,
            )
 
             for marker_ in source_node.markers:

strictdoc/core/project_config.py

Lines changed: 20 additions & 0 deletions
@@ -458,6 +458,26 @@ def shall_parse_nodes(self, path_to_file: str) -> bool:
 
         return False
 
+    def parse_nodes_type(self, path_to_file: str) -> Optional[tuple[str, str]]:
+        if self.source_root_path is None:
+            return None
+
+        for sdoc_source_config_entry_ in self.source_nodes:
+            # FIXME: Move the setting of full paths to .finalize() of this config
+            # class when it is implemented.
+            full_path = sdoc_source_config_entry_.setdefault(
+                "full_path",
+                os.path.join(
+                    self.source_root_path, sdoc_source_config_entry_["path"]
+                ),
+            )
+            if path_to_file.startswith(full_path):
+                return sdoc_source_config_entry_[
+                    "uid"
+                ], sdoc_source_config_entry_["node_type"]
+
+        return None
+
 
 class ProjectConfigLoader:
     @staticmethod

strictdoc/core/traceability_index_builder.py

Lines changed: 31 additions & 2 deletions
@@ -15,7 +15,10 @@
 from strictdoc.backend.sdoc.models.document import SDocDocument
 from strictdoc.backend.sdoc.models.document_from_file import DocumentFromFile
 from strictdoc.backend.sdoc.models.document_grammar import DocumentGrammar
-from strictdoc.backend.sdoc.models.grammar_element import ReferenceType
+from strictdoc.backend.sdoc.models.grammar_element import (
+    GrammarElement,
+    ReferenceType,
+)
 from strictdoc.backend.sdoc.models.inline_link import InlineLink
 from strictdoc.backend.sdoc.models.model import SDocDocumentFromFileIF
 from strictdoc.backend.sdoc.models.node import SDocNode
@@ -139,9 +142,18 @@ def create(
             with measure_performance(
                 f"Reading source: {source_file.in_doctree_source_file_rel_path}"
             ):
+                source_node_grammar_element = (
+                    TraceabilityIndexBuilder.source_node_grammar_element(
+                        source_file.full_path,
+                        project_config,
+                        traceability_index,
+                    )
+                )
                 traceability_info = (
                     SourceFileTraceabilityCachingReader.read_from_file(
-                        source_file.full_path, project_config
+                        source_file.full_path,
+                        project_config,
+                        source_node_grammar_element,
                     )
                 )
 
@@ -863,3 +875,20 @@ def _filter_nodes(
             raise StrictDocException(
                 f"Cannot apply a filter query to a node: {attribute_error_}"
             ) from attribute_error_
+
+    @staticmethod
+    def source_node_grammar_element(
+        path_to_file: str,
+        project_config: ProjectConfig,
+        traceability_index: TraceabilityIndex,
+    ) -> Optional[GrammarElement]:
+        maybe_parse_nodes_type = project_config.parse_nodes_type(path_to_file)
+        if maybe_parse_nodes_type is None:
+            return None
+        parse_nodes_uid, parse_nodes_type = maybe_parse_nodes_type
+        sdoc_document = assert_cast(
+            traceability_index.get_node_by_uid_weak2(parse_nodes_uid),
+            SDocDocument,
+        )
+        assert sdoc_document.grammar is not None
+        return sdoc_document.grammar.elements_by_type.get(parse_nodes_type)

tests/unit/strictdoc/backend/sdoc_source_code/test_marker_lexer.py

Lines changed: 42 additions & 8 deletions
@@ -191,7 +191,7 @@ def test_30_relation_and_field():
 FOOBAR
 """
 
-    tree = MarkerLexer.parse(input_string, parse_nodes=True)
+    tree = MarkerLexer.parse(input_string, custom_tags=["STATEMENT"])
     assert tree.data == "start"
 
     assert len(tree.children) == 5
@@ -256,7 +256,7 @@ def test_31_single_node_field():
 STATEMENT: This can likely replace _weak below with no problem.
 """
 
-    tree = MarkerLexer.parse(input_string, parse_nodes=True)
+    tree = MarkerLexer.parse(input_string, custom_tags=["STATEMENT"])
     assert tree.data == "start"
 
     assert len(tree.children) == 1
@@ -291,7 +291,7 @@ def test_31B_single_node_field():
 
 """  # noqa: W293
 
-    tree = MarkerLexer.parse(input_string, parse_nodes=True)
+    tree = MarkerLexer.parse(input_string, custom_tags=["INTENTION"])
     assert tree.data == "start"
 
     assert len(tree.children) == 1
@@ -326,7 +326,7 @@ def test_31C_single_node_field():
 }
 """  # noqa: W293
 
-    tree = MarkerLexer.parse(input_string, parse_nodes=True)
+    tree = MarkerLexer.parse(input_string, custom_tags=["INTENTION"])
     assert tree.data == "start"
 
     assert len(tree.children) == 2
@@ -351,7 +351,7 @@ def test_32_two_single_line_fields():
 STATEMENT: This can likely replace _weak below with no problem.
 """
 
-    tree = MarkerLexer.parse(input_string, parse_nodes=True)
+    tree = MarkerLexer.parse(input_string, custom_tags=["STATEMENT"])
     assert tree.data == "start"
 
     assert len(tree.children) == 2
@@ -375,7 +375,9 @@ def test_32B_two_single_line_fields_consecutive():
 STATEMENTT: This can likely replace _weak below with no problem.
 """
 
-    tree = MarkerLexer.parse(input_string, parse_nodes=True)
+    tree = MarkerLexer.parse(
+        input_string, custom_tags=["STATEMENT", "STATEMENTT"]
+    )
 
     assert tree.data == "start"
 
@@ -403,7 +405,7 @@ def test_33_multiline_and_multiparagraph_fields():
 FOOBAR
 """
 
-    tree = MarkerLexer.parse(input_string, parse_nodes=True)
+    tree = MarkerLexer.parse(input_string, custom_tags=["STATEMENT"])
     assert tree.data == "start"
 
     assert len(tree.children) == 1
@@ -434,6 +436,38 @@ def test_60_exclude_reserved_keywords():
     assert len(tree.children) == 0
 
 
+def test_70_exclude_similar_but_not_in_grammar():
+    input_string = """
+Note: This is ordinary comment text.
+
+STATEMENT: This can likely replace _weak below with no problem.
+FYI: More ordinary comment text.
+
+TEST: This can likely replace _weak below with no problem.
+
+Hint: Again, ordinary comment text.
+"""
+
+    tree = MarkerLexer.parse(input_string, custom_tags=["STATEMENT", "TEST"])
+    assert tree.data == "start"
+    assert len(tree.children) == 2
+    assert tree.children[0].data == "node_field"
+    assert tree.children[0].children[0].data == "node_name"
+    assert tree.children[0].children[0].children[0].value == "STATEMENT"
+    assert (
+        tree.children[0].children[1].children[0].value
+        == "This can likely replace _weak below with no problem."
+    )
+    assert tree.children[0].children[1].data == "node_multiline_value"
+    assert tree.children[1].children[0].data == "node_name"
+    assert tree.children[1].children[0].children[0].value == "TEST"
+    assert (
+        tree.children[1].children[1].children[0].value
+        == "This can likely replace _weak below with no problem."
+    )
+    assert tree.children[1].children[1].data == "node_multiline_value"
+
+
 def test_80_linux_spdx_like_identifiers():
     input_string = """\
 SPDX-ID: REQ-1
@@ -445,7 +479,7 @@ def test_80_linux_spdx_like_identifiers():
 And this is the same statement's another paragraph.
 """
 
-    tree = MarkerLexer.parse(input_string, parse_nodes=True)
+    tree = MarkerLexer.parse(input_string, custom_tags=["SPDX-ID", "SPDX-Text"])
     assert tree.data == "start"
 
     assert len(tree.children) == 2
