✨ NEW: Add SyntaxTreeNode (#129)

hukkinj1 · web-flow · commit 888636c972fa · 2021-02-25T21:29:52.000+01:00
This is a more comprehensive replacement for
`nest_tokens` and `NestedTokens` (which are now deprecated).
It allows for the `Token` stream to be converted to/from
a nested tree structure, with opening/closing tokens collapsed into a single
`SyntaxTreeNode` and the intermediate tokens set as children.
diff --git a/markdown_it/token.py b/markdown_it/token.py
@@ -1,4 +1,5 @@
 from typing import Any, List, Optional, Union
+import warnings
 
 import attr
 
@@ -134,6 +135,12 @@ def nest_tokens(tokens: List[Token]) -> List[Union[Token, NestedTokens]]:
     ``NestedTokens`` contain the open and close tokens and a list of children
     of all tokens in between (recursively nested)
     """
+    warnings.warn(
+        "`markdown_it.token.nest_tokens` and `markdown_it.token.NestedTokens`"
+        " are deprecated. Please migrate to `markdown_it.tree.SyntaxTreeNode`",
+        DeprecationWarning,
+    )
+
     output: List[Union[Token, NestedTokens]] = []
 
     tokens = list(reversed(tokens))
diff --git a/markdown_it/tree.py b/markdown_it/tree.py
@@ -0,0 +1,262 @@
+"""A tree representation of a linear markdown-it token stream.
+
+This module is not part of upstream JavaScript markdown-it.
+"""
+from typing import NamedTuple, Sequence, Tuple, Dict, List, Optional, Any
+
+from .token import Token
+from .utils import _removesuffix
+
+
+class SyntaxTreeNode:
+    """A Markdown syntax tree node.
+
+    A class that can be used to construct a tree representation of a linear
+    `markdown-it-py` token stream. Use `SyntaxTreeNode.from_tokens` to
+    initialize instead of the `__init__` method.
+
+    Each node in the tree represents either:
+      - root of the Markdown document
+      - a single unnested `Token`
+      - a `Token` "_open" and "_close" token pair, and the tokens nested in
+          between
+    """
+
+    class _NesterTokens(NamedTuple):
+        """The opening and closing `Token` of a container node."""
+
+        opening: Token
+        closing: Token
+
+    def __init__(self) -> None:
+        """Initialize a root node with no children.
+
+        You probably need `SyntaxTreeNode.from_tokens` instead.
+        """
+        # Only nodes representing an unnested token have self.token
+        self.token: Optional[Token] = None
+
+        # Only containers have nester tokens
+        self.nester_tokens: Optional[SyntaxTreeNode._NesterTokens] = None
+
+        # Root node does not have self.parent
+        self.parent: Optional["SyntaxTreeNode"] = None
+
+        # Empty list unless a non-empty container, or unnested token that has
+        # children (i.e. inline or img)
+        self.children: List["SyntaxTreeNode"] = []
+
+    @staticmethod
+    def from_tokens(tokens: Sequence[Token]) -> "SyntaxTreeNode":
+        """Instantiate a `SyntaxTreeNode` from a token stream.
+
+        This is the standard method for instantiating `SyntaxTreeNode`.
+        Returns a root node whose children represent `tokens`. Raises
+        `ValueError` if the opening and closing tokens in `tokens` are not
+        balanced.
+        """
+        root = SyntaxTreeNode()
+        root._set_children_from_tokens(tokens)
+        return root
+
+    def to_tokens(self) -> List[Token]:
+        """Recover the linear token stream.
+
+        For a node that wraps a single unnested token only that token is
+        returned: its child nodes were built from `Token.children`, so they
+        are not emitted separately.
+        """
+
+        def recursive_collect_tokens(
+            node: "SyntaxTreeNode", token_list: List[Token]
+        ) -> None:
+            if node.type == "root":
+                for child in node.children:
+                    recursive_collect_tokens(child, token_list)
+            elif node.token:
+                token_list.append(node.token)
+            else:
+                # Neither root nor a single token: must be a container.
+                assert node.nester_tokens
+                token_list.append(node.nester_tokens.opening)
+                for child in node.children:
+                    recursive_collect_tokens(child, token_list)
+                token_list.append(node.nester_tokens.closing)
+
+        tokens: List[Token] = []
+        recursive_collect_tokens(self, tokens)
+        return tokens
+
+    @property
+    def is_nested(self) -> bool:
+        """Is this node nested?
+
+        Returns `True` if the node represents a `Token` pair and tokens in the
+        sequence between them, where `Token.nesting` of the first `Token` in
+        the pair is 1 and nesting of the other `Token` is -1.
+        """
+        return bool(self.nester_tokens)
+
+    @property
+    def siblings(self) -> Sequence["SyntaxTreeNode"]:
+        """Get siblings of the node.
+
+        Gets the whole group of siblings, including self. A node without a
+        parent is its own only sibling.
+        """
+        if not self.parent:
+            return [self]
+        return self.parent.children
+
+    @property
+    def type(self) -> str:
+        """Get a string type of the represented syntax.
+
+        - "root" for root nodes
+        - `Token.type` if the node represents an unnested token
+        - `Token.type` of the opening token, with "_open" suffix stripped, if
+            the node represents a nester token pair
+        """
+        if not self.token and not self.nester_tokens:
+            return "root"
+        if self.token:
+            return self.token.type
+        assert self.nester_tokens
+        return _removesuffix(self.nester_tokens.opening.type, "_open")
+
+    @property
+    def next_sibling(self) -> Optional["SyntaxTreeNode"]:
+        """Get the next node in the sequence of siblings.
+
+        Returns `None` if this is the last sibling.
+        """
+        self_index = self.siblings.index(self)
+        if self_index + 1 < len(self.siblings):
+            return self.siblings[self_index + 1]
+        return None
+
+    @property
+    def previous_sibling(self) -> Optional["SyntaxTreeNode"]:
+        """Get the previous node in the sequence of siblings.
+
+        Returns `None` if this is the first sibling.
+        """
+        self_index = self.siblings.index(self)
+        if self_index - 1 >= 0:
+            return self.siblings[self_index - 1]
+        return None
+
+    def _make_child(
+        self,
+        *,
+        token: Optional[Token] = None,
+        nester_tokens: Optional[_NesterTokens] = None,
+    ) -> "SyntaxTreeNode":
+        """Make and return a child node for `self`.
+
+        Exactly one of `token` and `nester_tokens` must be given, otherwise
+        `ValueError` is raised. The new node is appended to `self.children`.
+        """
+        if token and nester_tokens or not token and not nester_tokens:
+            raise ValueError("must specify either `token` or `nester_tokens`")
+        child = SyntaxTreeNode()
+        if token:
+            child.token = token
+        else:
+            child.nester_tokens = nester_tokens
+        child.parent = self
+        self.children.append(child)
+        return child
+
+    def _set_children_from_tokens(self, tokens: Sequence[Token]) -> None:
+        """Convert the token stream to a tree structure and set the resulting
+        nodes as children of `self`.
+
+        Raises `ValueError` if `tokens` is not balanced, i.e. a token that is
+        neither unnested nor an opening token appears where a new node should
+        start, or an opening token is never closed.
+        """
+        # Reversed so that `pop()` yields the tokens in their original order.
+        reversed_tokens = list(reversed(tokens))
+        while reversed_tokens:
+            token = reversed_tokens.pop()
+
+            if token.nesting == 0:
+                child = self._make_child(token=token)
+                if token.children:
+                    child._set_children_from_tokens(token.children)
+                continue
+
+            # Raise explicitly instead of using `assert`: assertions are
+            # stripped under `python -O`, in which case a stray closing token
+            # would silently be treated as the start of a container.
+            if token.nesting != 1:
+                raise ValueError(f"expected an opening token but got {token}")
+
+            # Collect everything up to and including the matching closing
+            # token, i.e. until the nesting depth is back to zero.
+            nested_tokens = [token]
+            nesting = 1
+            while reversed_tokens and nesting != 0:
+                token = reversed_tokens.pop()
+                nested_tokens.append(token)
+                nesting += token.nesting
+            if nesting != 0:
+                raise ValueError(f"unclosed tokens starting {nested_tokens[0]}")
+
+            child = self._make_child(
+                nester_tokens=SyntaxTreeNode._NesterTokens(
+                    nested_tokens[0], nested_tokens[-1]
+                )
+            )
+            child._set_children_from_tokens(nested_tokens[1:-1])
+
+    # NOTE:
+    # The values of the properties defined below directly map to properties
+    # of the underlying `Token`s. A root node does not translate to a `Token`
+    # object, so calling these property getters on a root node will raise an
+    # `AttributeError`.
+    #
+    # There is no mapping for `Token.nesting` because the `is_nested` property
+    # provides that data, and can be called on any node type, including root.
+
+    def _attribute_token(self) -> Token:
+        """Return the `Token` that is used as the data source for the
+        properties defined below.
+
+        This is the node's own token, or the opening token for a container.
+        Raises `AttributeError` for a root node.
+        """
+        if self.token:
+            return self.token
+        if self.nester_tokens:
+            return self.nester_tokens.opening
+        raise AttributeError("Root node does not have the accessed attribute")
+
+    @property
+    def tag(self) -> str:
+        """HTML tag name, e.g. "p"."""
+        return self._attribute_token().tag
+
+    @property
+    def attrs(self) -> Dict[str, Any]:
+        """HTML attributes, as a new dict (empty if the token has none)."""
+        token_attrs = self._attribute_token().attrs
+        if token_attrs is None:
+            return {}
+        # Type ignore because `Token`s attribute types are not perfect
+        return dict(token_attrs)  # type: ignore
+
+    @property
+    def map(self) -> Optional[Tuple[int, int]]:
+        """Source map info. Format: `Tuple[ line_begin, line_end ]`"""
+        map_ = self._attribute_token().map
+        if map_:
+            # Type ignore because `Token`s attribute types are not perfect
+            return tuple(map_)  # type: ignore
+        return None
+
+    @property
+    def level(self) -> int:
+        """Nesting level, the same as `state.level`."""
+        return self._attribute_token().level
+
+    @property
+    def content(self) -> str:
+        """In a case of self-closing tag (code, html, fence, etc.), it
+        has contents of this tag."""
+        return self._attribute_token().content
+
+    @property
+    def markup(self) -> str:
+        """'*' or '_' for emphasis, fence string for fence, etc."""
+        return self._attribute_token().markup
+
+    @property
+    def info(self) -> str:
+        """Fence infostring."""
+        return self._attribute_token().info
+
+    @property
+    def meta(self) -> dict:
+        """A place for plugins to store arbitrary data."""
+        return self._attribute_token().meta
+
+    @property
+    def block(self) -> bool:
+        """True for block-level tokens, false for inline tokens."""
+        return self._attribute_token().block
+
+    @property
+    def hidden(self) -> bool:
+        """If it's true, ignore this element when rendering.
+        Used for tight lists to hide paragraphs."""
+        return self._attribute_token().hidden
diff --git a/markdown_it/utils.py b/markdown_it/utils.py
@@ -37,3 +37,14 @@ def read_fixture_file(path: Union[str, Path]) -> List[list]:
 
             last_pos = i
     return tests
+
+
+def _removesuffix(string: str, suffix: str) -> str:
+    """Return `string` with a trailing `suffix` stripped.
+
+    `string` is returned unchanged when `suffix` is empty or when `string`
+    does not end with it. Switch to the stdlib `str.removesuffix()` once the
+    minimum supported Python version is 3.9.
+    """
+    if not suffix or not string.endswith(suffix):
+        return string
+    cut = len(string) - len(suffix)
+    return string[:cut]
diff --git a/tests/test_tree.py b/tests/test_tree.py
@@ -0,0 +1,56 @@
+from markdown_it import MarkdownIt
+from markdown_it.tree import SyntaxTreeNode
+
+# Markdown fixture parsed by the tests in this module: a level-2 heading
+# followed by a paragraph with strong emphasis between two runs of text.
+EXAMPLE_MARKDOWN = """
+## Heading here
+
+Some paragraph text and **emphasis here** and more text here.
+"""
+
+
+def test_tree_to_tokens_conversion():
+    """Tokens converted to a tree and back must equal the original tokens."""
+    original_tokens = MarkdownIt().parse(EXAMPLE_MARKDOWN)
+    tree = SyntaxTreeNode.from_tokens(original_tokens)
+    assert original_tokens == tree.to_tokens()
+
+
+def test_property_passthrough():
+    """Properties of a container node must mirror its opening token."""
+    tokens = MarkdownIt().parse(EXAMPLE_MARKDOWN)
+    opening_token = tokens[0]
+    node = SyntaxTreeNode.from_tokens(tokens).children[0]
+    assert opening_token.tag == node.tag
+    # The node exposes `map` as a tuple, so convert before comparing.
+    assert tuple(opening_token.map) == node.map
+    passthrough_names = (
+        "level",
+        "content",
+        "markup",
+        "info",
+        "meta",
+        "block",
+        "hidden",
+    )
+    for name in passthrough_names:
+        assert getattr(opening_token, name) == getattr(node, name)
+
+
+def test_type():
+    """`type` must be reported for root, container and unnested nodes."""
+    root = SyntaxTreeNode.from_tokens(MarkdownIt().parse(EXAMPLE_MARKDOWN))
+    # A root node always reports "root"
+    assert root.type == "root"
+    # A container drops the "_open" suffix of its opening token type
+    heading = root.children[0]
+    assert heading.type == "heading"
+    # An unnested token keeps its token type unchanged
+    inline = heading.children[0]
+    assert inline.type == "inline"
+
+
+def test_sibling_traverse():
+    """Walk the paragraph's inline children via the sibling properties."""
+    root = SyntaxTreeNode.from_tokens(MarkdownIt().parse(EXAMPLE_MARKDOWN))
+    inline = root.children[1].children[0]
+    first_text = inline.children[0]
+    assert first_text.type == "text"
+    strong = first_text.next_sibling
+    assert strong.type == "strong"
+    last_text = strong.next_sibling
+    assert last_text.type == "text"
+    assert last_text.next_sibling is None
+    assert last_text.previous_sibling.previous_sibling == first_text
+    assert first_text.previous_sibling is None