Skip to content

Commit 888636c

Browse files
authored
✨ NEW: Add SyntaxTreeNode (#129)
This is a more comprehensive replacement for `nest_tokens` and `NestedTokens` (which are now deprecated). It allows for the `Token` stream to be converted to/from a nested tree structure, with opening/closing tokens collapsed into a single `SyntaxTreeNode` and the intermediate tokens set as children.
1 parent 11eb374 commit 888636c

File tree

4 files changed

+336
-0
lines changed

4 files changed

+336
-0
lines changed

markdown_it/token.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
from typing import Any, List, Optional, Union
2+
import warnings
23

34
import attr
45

@@ -134,6 +135,12 @@ def nest_tokens(tokens: List[Token]) -> List[Union[Token, NestedTokens]]:
134135
``NestedTokens`` contain the open and close tokens and a list of children
135136
of all tokens in between (recursively nested)
136137
"""
138+
warnings.warn(
139+
"`markdown_it.token.nest_tokens` and `markdown_it.token.NestedTokens`"
140+
" are deprecated. Please migrate to `markdown_it.tree.SyntaxTreeNode`",
141+
DeprecationWarning,
142+
)
143+
137144
output: List[Union[Token, NestedTokens]] = []
138145

139146
tokens = list(reversed(tokens))

markdown_it/tree.py

Lines changed: 262 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,262 @@
1+
"""A tree representation of a linear markdown-it token stream.
2+
3+
This module is not part of upstream JavaScript markdown-it.
4+
"""
5+
from typing import NamedTuple, Sequence, Tuple, Dict, List, Optional, Any
6+
7+
from .token import Token
8+
from .utils import _removesuffix
9+
10+
11+
class SyntaxTreeNode:
    """A Markdown syntax tree node.

    Builds a tree representation of a linear `markdown-it-py` token stream.
    Use `SyntaxTreeNode.from_tokens` to create a tree instead of calling
    `__init__` directly.

    Each node in the tree represents either:
      - the root of the Markdown document
      - a single unnested `Token`
      - a `Token` "_open" and "_close" token pair, and the tokens nested in
        between
    """

    class _NesterTokens(NamedTuple):
        """The opening/closing `Token` pair that delimits a container node."""

        opening: Token
        closing: Token

    def __init__(self) -> None:
        """Initialize a root node with no children.

        You probably need `SyntaxTreeNode.from_tokens` instead.
        """
        # Only nodes representing an unnested token have self.token
        self.token: Optional[Token] = None

        # Only containers have nester tokens
        self.nester_tokens: Optional[SyntaxTreeNode._NesterTokens] = None

        # Root node does not have self.parent
        self.parent: Optional["SyntaxTreeNode"] = None

        # Empty list unless a non-empty container, or unnested token that has
        # children (i.e. inline or img)
        self.children: List["SyntaxTreeNode"] = []

    @staticmethod
    def from_tokens(tokens: Sequence[Token]) -> "SyntaxTreeNode":
        """Instantiate a `SyntaxTreeNode` from a token stream.

        This is the standard method for instantiating `SyntaxTreeNode`.

        :param tokens: a linear token stream
        :return: a new root node whose children represent `tokens`
        :raises ValueError: if opening and closing tokens are unbalanced
        """
        root = SyntaxTreeNode()
        root._set_children_from_tokens(tokens)
        return root

    def to_tokens(self) -> List[Token]:
        """Recover the linear token stream.

        :return: the tokens represented by this node and its descendants,
            in document order
        """

        def recursive_collect_tokens(
            node: "SyntaxTreeNode", token_list: List[Token]
        ) -> None:
            if node.type == "root":
                for child in node.children:
                    recursive_collect_tokens(child, token_list)
            elif node.token:
                # The child nodes of an unnested token were built from
                # `Token.children`, so the token itself already carries them
                # and they must not be emitted a second time.
                token_list.append(node.token)
            else:
                assert node.nester_tokens
                token_list.append(node.nester_tokens.opening)
                for child in node.children:
                    recursive_collect_tokens(child, token_list)
                token_list.append(node.nester_tokens.closing)

        tokens: List[Token] = []
        recursive_collect_tokens(self, tokens)
        return tokens

    @property
    def is_nested(self) -> bool:
        """Is this node nested?

        Returns `True` if the node represents a `Token` pair and tokens in the
        sequence between them, where `Token.nesting` of the first `Token` in
        the pair is 1 and nesting of the other `Token` is -1.
        """
        return bool(self.nester_tokens)

    @property
    def siblings(self) -> Sequence["SyntaxTreeNode"]:
        """Get siblings of the node.

        Gets the whole group of siblings, including self.
        """
        if not self.parent:
            return [self]
        return self.parent.children

    @property
    def type(self) -> str:
        """Get a string type of the represented syntax.

        - "root" for root nodes
        - `Token.type` if the node represents an unnested token
        - `Token.type` of the opening token, with "_open" suffix stripped, if
            the node represents a nester token pair
        """
        if not self.token and not self.nester_tokens:
            return "root"
        if self.token:
            return self.token.type
        assert self.nester_tokens
        return _removesuffix(self.nester_tokens.opening.type, "_open")

    @property
    def next_sibling(self) -> Optional["SyntaxTreeNode"]:
        """Get the next node in the sequence of siblings.

        Returns `None` if this is the last sibling.
        """
        # Evaluate the `siblings` property once instead of on every use.
        siblings = self.siblings
        next_index = siblings.index(self) + 1
        if next_index < len(siblings):
            return siblings[next_index]
        return None

    @property
    def previous_sibling(self) -> Optional["SyntaxTreeNode"]:
        """Get the previous node in the sequence of siblings.

        Returns `None` if this is the first sibling.
        """
        # Evaluate the `siblings` property once instead of on every use.
        siblings = self.siblings
        self_index = siblings.index(self)
        if self_index > 0:
            return siblings[self_index - 1]
        return None

    def _make_child(
        self,
        *,
        token: Optional[Token] = None,
        nester_tokens: Optional[_NesterTokens] = None,
    ) -> "SyntaxTreeNode":
        """Make and return a child node for `self`.

        Exactly one of the keyword arguments must be given.

        :param token: the unnested token the child represents
        :param nester_tokens: the opening/closing pair the child represents
        :return: the new child, already appended to `self.children`
        :raises ValueError: if both or neither of the arguments are given
        """
        if token and nester_tokens or not token and not nester_tokens:
            raise ValueError("must specify either `token` or `nester_tokens`")
        child = SyntaxTreeNode()
        if token:
            child.token = token
        else:
            child.nester_tokens = nester_tokens
        child.parent = self
        self.children.append(child)
        return child

    def _set_children_from_tokens(self, tokens: Sequence[Token]) -> None:
        """Convert the token stream to a tree structure and set the resulting
        nodes as children of `self`.

        :param tokens: a linear token stream
        :raises ValueError: if a closing token has no matching opening token,
            or an opening token is never closed
        """
        # Reverse once so that `pop()` yields tokens in document order.
        reversed_tokens = list(reversed(tokens))
        while reversed_tokens:
            token = reversed_tokens.pop()

            if token.nesting == 0:
                child = self._make_child(token=token)
                if token.children:
                    child._set_children_from_tokens(token.children)
                continue

            # Raise explicitly instead of using `assert`: asserts are stripped
            # under `python -O`, which would let a stray closing token be
            # treated as an opener and silently produce a malformed tree.
            if token.nesting != 1:
                raise ValueError(
                    f"closing token without a matching opening token: {token}"
                )

            # Collect everything up to and including the matching closer by
            # tracking the running nesting depth.
            nested_tokens = [token]
            nesting = 1
            while reversed_tokens and nesting != 0:
                token = reversed_tokens.pop()
                nested_tokens.append(token)
                nesting += token.nesting
            if nesting != 0:
                raise ValueError(f"unclosed tokens starting {nested_tokens[0]}")

            child = self._make_child(
                nester_tokens=SyntaxTreeNode._NesterTokens(
                    nested_tokens[0], nested_tokens[-1]
                )
            )
            child._set_children_from_tokens(nested_tokens[1:-1])

    # NOTE:
    # The values of the properties defined below directly map to properties
    # of the underlying `Token`s. A root node does not translate to a `Token`
    # object, so calling these property getters on a root node will raise an
    # `AttributeError`.
    #
    # There is no mapping for `Token.nesting` because the `is_nested` property
    # provides that data, and can be called on any node type, including root.

    def _attribute_token(self) -> Token:
        """Return the `Token` that is used as the data source for the
        properties defined below.

        This is the node's own token for unnested nodes and the opening token
        for nester token pairs.

        :raises AttributeError: if called on a root node
        """
        if self.token:
            return self.token
        if self.nester_tokens:
            return self.nester_tokens.opening
        raise AttributeError("Root node does not have the accessed attribute")

    @property
    def tag(self) -> str:
        """HTML tag name, e.g. "p"."""
        return self._attribute_token().tag

    @property
    def attrs(self) -> Dict[str, Any]:
        """HTML attributes, as a new dict (empty if the token has none)."""
        token_attrs = self._attribute_token().attrs
        if token_attrs is None:
            return {}
        # Type ignore because `Token`s attribute types are not perfect
        return dict(token_attrs)  # type: ignore

    @property
    def map(self) -> Optional[Tuple[int, int]]:
        """Source map info. Format: `Tuple[ line_begin, line_end ]`"""
        map_ = self._attribute_token().map
        if map_:
            # Type ignore because `Token`s attribute types are not perfect
            return tuple(map_)  # type: ignore
        return None

    @property
    def level(self) -> int:
        """Nesting level, the same as `state.level`."""
        return self._attribute_token().level

    @property
    def content(self) -> str:
        """In the case of a self-closing tag (code, html, fence, etc.), the
        contents of this tag."""
        return self._attribute_token().content

    @property
    def markup(self) -> str:
        """'*' or '_' for emphasis, fence string for fence, etc."""
        return self._attribute_token().markup

    @property
    def info(self) -> str:
        """Fence infostring."""
        return self._attribute_token().info

    @property
    def meta(self) -> dict:
        """A place for plugins to store arbitrary data."""
        return self._attribute_token().meta

    @property
    def block(self) -> bool:
        """True for block-level tokens, false for inline tokens."""
        return self._attribute_token().block

    @property
    def hidden(self) -> bool:
        """If true, ignore this element when rendering.

        Used for tight lists to hide paragraphs.
        """
        return self._attribute_token().hidden

markdown_it/utils.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,3 +37,14 @@ def read_fixture_file(path: Union[str, Path]) -> List[list]:
3737

3838
last_pos = i
3939
return tests
40+
41+
42+
def _removesuffix(string: str, suffix: str) -> str:
    """Return `string` with a trailing `suffix` stripped, if it has one.

    The string is returned unchanged when `suffix` is empty or when the
    string does not end with it.

    Replace this with str.removesuffix() from stdlib when minimum Python
    version is 3.9.
    """
    # Guard clause: nothing to strip. The empty-suffix check matters because
    # every string "ends with" the empty string.
    if not suffix or not string.endswith(suffix):
        return string
    keep = len(string) - len(suffix)
    return string[:keep]

tests/test_tree.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
from markdown_it import MarkdownIt
2+
from markdown_it.tree import SyntaxTreeNode
3+
4+
# Markdown fixture parsed by every test in this module: a level-2 heading
# followed by a paragraph that contains a `**strong**` span.
# NOTE(review): the bare `N+` lines inside the literal look like diff-gutter
# residue from the page this listing was captured from - confirm against the
# repository file before relying on the exact string value.
EXAMPLE_MARKDOWN = """
5+
## Heading here
6+
7+
Some paragraph text and **emphasis here** and more text here.
8+
"""
9+
10+
11+
def test_tree_to_tokens_conversion():
    """A tokens -> tree -> tokens round trip must reproduce the input."""
    original_tokens = MarkdownIt().parse(EXAMPLE_MARKDOWN)
    tree = SyntaxTreeNode.from_tokens(original_tokens)
    assert original_tokens == tree.to_tokens()
15+
16+
17+
def test_property_passthrough():
    """A nested node must expose the attributes of its opening token."""
    tokens = MarkdownIt().parse(EXAMPLE_MARKDOWN)
    opening_token = tokens[0]
    node = SyntaxTreeNode.from_tokens(tokens).children[0]

    assert opening_token.tag == node.tag
    # The node exposes `map` as a tuple, so normalize the token side first.
    assert tuple(opening_token.map) == node.map
    # The remaining properties are passed through unchanged.
    for name in ("level", "content", "markup", "info", "meta", "block", "hidden"):
        assert getattr(opening_token, name) == getattr(node, name)
31+
32+
33+
def test_type():
    """`SyntaxTreeNode.type` for root, nested and unnested nodes."""
    root = SyntaxTreeNode.from_tokens(MarkdownIt().parse(EXAMPLE_MARKDOWN))
    heading = root.children[0]

    # Root type is "root"
    assert root.type == "root"
    # "_open" suffix must be stripped from nested token type
    assert heading.type == "heading"
    # For unnested tokens, node type must remain same as token type
    assert heading.children[0].type == "inline"
42+
43+
44+
def test_sibling_traverse():
    """Walk the paragraph's inline children through the sibling properties."""
    root = SyntaxTreeNode.from_tokens(MarkdownIt().parse(EXAMPLE_MARKDOWN))
    paragraph = root.children[1]
    inline = paragraph.children[0]

    # Forward traversal: text -> strong -> text -> end
    first_text = inline.children[0]
    assert first_text.type == "text"
    strong = first_text.next_sibling
    assert strong.type == "strong"
    last_text = strong.next_sibling
    assert last_text.type == "text"
    assert last_text.next_sibling is None

    # Backward traversal returns to the first node and then runs out
    assert last_text.previous_sibling.previous_sibling == first_text
    assert first_text.previous_sibling is None

0 commit comments

Comments
 (0)