Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions markdown_it/token.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from typing import Any, List, Optional, Union
import warnings

import attr

Expand Down Expand Up @@ -134,6 +135,12 @@ def nest_tokens(tokens: List[Token]) -> List[Union[Token, NestedTokens]]:
``NestedTokens`` contain the open and close tokens and a list of children
of all tokens in between (recursively nested)
"""
warnings.warn(
"`markdown_it.token.nest_tokens` and `markdown_it.token.NestedTokens`"
" are deprecated. Please migrate to `markdown_it.tree.SyntaxTreeNode`",
DeprecationWarning,
)

output: List[Union[Token, NestedTokens]] = []

tokens = list(reversed(tokens))
Expand Down
262 changes: 262 additions & 0 deletions markdown_it/tree.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,262 @@
"""A tree representation of a linear markdown-it token stream.
This module is not part of upstream JavaScript markdown-it.
"""
from typing import NamedTuple, Sequence, Tuple, Dict, List, Optional, Any

from .token import Token
from .utils import _removesuffix


class SyntaxTreeNode:
    """A Markdown syntax tree node.

    A class that can be used to construct a tree representation of a linear
    `markdown-it-py` token stream. Use `SyntaxTreeNode.from_tokens` to
    initialize instead of the `__init__` method.

    Each node in the tree represents either:
      - root of the Markdown document
      - a single unnested `Token`
      - a `Token` "_open" and "_close" token pair, and the tokens nested in
        between
    """

    # The opening/closing token pair that delimits a container node.
    class _NesterTokens(NamedTuple):
        opening: Token
        closing: Token

    def __init__(self) -> None:
        """Initialize a root node with no children.

        You probably need `SyntaxTreeNode.from_tokens` instead.
        """
        # Only nodes representing an unnested token have self.token
        self.token: Optional[Token] = None

        # Only containers have nester tokens
        self.nester_tokens: Optional[SyntaxTreeNode._NesterTokens] = None

        # Root node does not have self.parent
        self.parent: Optional["SyntaxTreeNode"] = None

        # Empty list unless a non-empty container, or unnested token that has
        # children (i.e. inline or img)
        self.children: List["SyntaxTreeNode"] = []

    @staticmethod
    def from_tokens(tokens: Sequence[Token]) -> "SyntaxTreeNode":
        """Instantiate a `SyntaxTreeNode` from a token stream.

        This is the standard method for instantiating `SyntaxTreeNode`.

        :param tokens: linear token stream to convert
        :return: a root node whose children represent ``tokens``
        :raises ValueError: if opening and closing tokens are not balanced
        """
        root = SyntaxTreeNode()
        root._set_children_from_tokens(tokens)
        return root

    def to_tokens(self) -> List[Token]:
        """Recover the linear token stream.

        :return: the tokens represented by this node and its descendants,
            in document order
        """

        def recursive_collect_tokens(
            node: "SyntaxTreeNode", token_list: List[Token]
        ) -> None:
            if node.type == "root":
                for child in node.children:
                    recursive_collect_tokens(child, token_list)
            elif node.token is not None:
                # An unnested token carries its own children in
                # `Token.children`, so the child nodes are not visited here.
                token_list.append(node.token)
            else:
                # Internal invariant: a non-root node without `token` is a
                # container and therefore has nester tokens.
                assert node.nester_tokens is not None
                token_list.append(node.nester_tokens.opening)
                for child in node.children:
                    recursive_collect_tokens(child, token_list)
                token_list.append(node.nester_tokens.closing)

        tokens: List[Token] = []
        recursive_collect_tokens(self, tokens)
        return tokens

    @property
    def is_nested(self) -> bool:
        """Is this node nested?.

        Returns `True` if the node represents a `Token` pair and tokens in the
        sequence between them, where `Token.nesting` of the first `Token` in
        the pair is 1 and nesting of the other `Token` is -1.
        """
        return self.nester_tokens is not None

    @property
    def siblings(self) -> Sequence["SyntaxTreeNode"]:
        """Get siblings of the node.

        Gets the whole group of siblings, including self.
        """
        if self.parent is None:
            return [self]
        return self.parent.children

    @property
    def type(self) -> str:
        """Get a string type of the represented syntax.

        - "root" for root nodes
        - `Token.type` if the node represents an unnested token
        - `Token.type` of the opening token, with "_open" suffix stripped, if
          the node represents a nester token pair
        """
        if self.token is not None:
            return self.token.type
        if self.nester_tokens is not None:
            return _removesuffix(self.nester_tokens.opening.type, "_open")
        return "root"

    @property
    def next_sibling(self) -> Optional["SyntaxTreeNode"]:
        """Get the next node in the sequence of siblings.

        Returns `None` if this is the last sibling.
        """
        siblings = self.siblings
        next_index = siblings.index(self) + 1
        if next_index < len(siblings):
            return siblings[next_index]
        return None

    @property
    def previous_sibling(self) -> Optional["SyntaxTreeNode"]:
        """Get the previous node in the sequence of siblings.

        Returns `None` if this is the first sibling.
        """
        siblings = self.siblings
        self_index = siblings.index(self)
        if self_index > 0:
            return siblings[self_index - 1]
        return None

    def _make_child(
        self,
        *,
        token: Optional[Token] = None,
        nester_tokens: Optional[_NesterTokens] = None,
    ) -> "SyntaxTreeNode":
        """Make and return a child node for `self`.

        Exactly one of `token` and `nester_tokens` must be given.

        :raises ValueError: if both or neither of the arguments are given
        """
        if (token is None) == (nester_tokens is None):
            raise ValueError("must specify either `token` or `nester_tokens`")
        child = SyntaxTreeNode()
        if token is not None:
            child.token = token
        else:
            child.nester_tokens = nester_tokens
        child.parent = self
        self.children.append(child)
        return child

    def _set_children_from_tokens(self, tokens: Sequence[Token]) -> None:
        """Convert the token stream to a tree structure and set the resulting
        nodes as children of `self`.

        :raises ValueError: if a closing token appears without a matching
            opening token, or an opening token is never closed
        """
        # Reverse once so that tokens can be consumed from the front of the
        # stream with cheap `list.pop()` calls.
        reversed_tokens = list(reversed(tokens))
        while reversed_tokens:
            token = reversed_tokens.pop()

            if token.nesting == 0:
                child = self._make_child(token=token)
                if token.children:
                    child._set_children_from_tokens(token.children)
                continue

            # Explicit check instead of `assert`: asserts are stripped under
            # `python -O`, which would let a stray closing token be treated
            # as an opening one.
            if token.nesting != 1:
                raise ValueError(f"invalid nesting for token {token}")

            # Collect everything up to and including the matching closing
            # token by tracking the running nesting depth.
            nested_tokens = [token]
            nesting = 1
            while reversed_tokens and nesting != 0:
                token = reversed_tokens.pop()
                nested_tokens.append(token)
                nesting += token.nesting
            if nesting != 0:
                raise ValueError(f"unclosed tokens starting {nested_tokens[0]}")

            child = self._make_child(
                nester_tokens=SyntaxTreeNode._NesterTokens(
                    nested_tokens[0], nested_tokens[-1]
                )
            )
            child._set_children_from_tokens(nested_tokens[1:-1])

    # NOTE:
    # The values of the properties defined below directly map to properties
    # of the underlying `Token`s. A root node does not translate to a `Token`
    # object, so calling these property getters on a root node will raise an
    # `AttributeError`.
    #
    # There is no mapping for `Token.nesting` because the `is_nested` property
    # provides that data, and can be called on any node type, including root.

    def _attribute_token(self) -> Token:
        """Return the `Token` that is used as the data source for the
        properties defined below.

        :raises AttributeError: if called on a root node
        """
        if self.token is not None:
            return self.token
        if self.nester_tokens is not None:
            return self.nester_tokens.opening
        raise AttributeError("Root node does not have the accessed attribute")

    @property
    def tag(self) -> str:
        """Html tag name, e.g. "p"."""
        return self._attribute_token().tag

    @property
    def attrs(self) -> Dict[str, Any]:
        """Html attributes."""
        token_attrs = self._attribute_token().attrs
        if token_attrs is None:
            return {}
        # Type ignore because `Token`s attribute types are not perfect
        return dict(token_attrs)  # type: ignore

    @property
    def map(self) -> Optional[Tuple[int, int]]:
        """Source map info. Format: `Tuple[ line_begin, line_end ]`"""
        map_ = self._attribute_token().map
        if map_:
            # Type ignore because `Token`s attribute types are not perfect
            return tuple(map_)  # type: ignore
        return None

    @property
    def level(self) -> int:
        """Nesting level, the same as `state.level`."""
        return self._attribute_token().level

    @property
    def content(self) -> str:
        """In a case of self-closing tag (code, html, fence, etc.), it
        has contents of this tag."""
        return self._attribute_token().content

    @property
    def markup(self) -> str:
        """'*' or '_' for emphasis, fence string for fence, etc."""
        return self._attribute_token().markup

    @property
    def info(self) -> str:
        """Fence infostring."""
        return self._attribute_token().info

    @property
    def meta(self) -> dict:
        """A place for plugins to store an arbitrary data."""
        return self._attribute_token().meta

    @property
    def block(self) -> bool:
        """True for block-level tokens, false for inline tokens."""
        return self._attribute_token().block

    @property
    def hidden(self) -> bool:
        """If it's true, ignore this element when rendering.

        Used for tight lists to hide paragraphs.
        """
        return self._attribute_token().hidden
11 changes: 11 additions & 0 deletions markdown_it/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,14 @@ def read_fixture_file(path: Union[str, Path]) -> List[list]:

last_pos = i
return tests


def _removesuffix(string: str, suffix: str) -> str:
    """Return `string` without a trailing `suffix`, if it has one.

    The string is returned unchanged when `suffix` is empty or when the
    string does not end with it.

    Replace this with str.removesuffix() from stdlib when minimum Python
    version is 3.9.
    """
    # Guard clause: nothing to strip for an empty or non-matching suffix.
    if not suffix or not string.endswith(suffix):
        return string
    kept_length = len(string) - len(suffix)
    return string[:kept_length]
56 changes: 56 additions & 0 deletions tests/test_tree.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
from markdown_it import MarkdownIt
from markdown_it.tree import SyntaxTreeNode

# Shared fixture document for the tests below: a level-2 heading followed by
# a paragraph whose inline content is text, a strong-emphasis span, and more
# text.
EXAMPLE_MARKDOWN = """
## Heading here

Some paragraph text and **emphasis here** and more text here.
"""


def test_tree_to_tokens_conversion():
    """Converting tokens to a tree and back must yield the original stream."""
    original_tokens = MarkdownIt().parse(EXAMPLE_MARKDOWN)
    tree = SyntaxTreeNode.from_tokens(original_tokens)
    assert original_tokens == tree.to_tokens()


def test_property_passthrough():
    """Node properties must mirror those of the node's opening token."""
    tokens = MarkdownIt().parse(EXAMPLE_MARKDOWN)
    opening_token = tokens[0]
    node = SyntaxTreeNode.from_tokens(tokens).children[0]

    # These attributes are passed through unchanged.
    passthrough_names = (
        "tag",
        "level",
        "content",
        "markup",
        "info",
        "meta",
        "block",
        "hidden",
    )
    for name in passthrough_names:
        assert getattr(opening_token, name) == getattr(node, name), name

    # `map` is exposed as a tuple on the node, so convert before comparing.
    assert tuple(opening_token.map) == node.map


def test_type():
    """`type` must be derived correctly for root, nested and unnested nodes."""
    root = SyntaxTreeNode.from_tokens(MarkdownIt().parse(EXAMPLE_MARKDOWN))
    heading = root.children[0]
    heading_inline = heading.children[0]

    # Root type is "root"
    assert root.type == "root"
    # "_open" suffix must be stripped from nested token type
    assert heading.type == "heading"
    # For unnested tokens, node type must remain same as token type
    assert heading_inline.type == "inline"


def test_sibling_traverse():
    """Walk forwards and backwards through the paragraph's inline children."""
    root = SyntaxTreeNode.from_tokens(MarkdownIt().parse(EXAMPLE_MARKDOWN))
    paragraph = root.children[1]
    first_text = paragraph.children[0].children[0]
    assert first_text.type == "text"

    # Forward traversal: text -> strong -> text -> end of siblings.
    strong = first_text.next_sibling
    assert strong.type == "strong"
    last_text = strong.next_sibling
    assert last_text.type == "text"
    assert last_text.next_sibling is None

    # Backward traversal: two steps back from the last node is the first one.
    assert last_text.previous_sibling.previous_sibling == first_text
    assert first_text.previous_sibling is None