- Notifications
You must be signed in to change notification settings - Fork 84
✨ NEW: Add SyntaxTreeNode #129
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
6 commits Select commit Hold shift + click to select a range
e9508d2 Add SyntaxTreeNode
hukkinj1 4f1dd36 Deprecate nest_tokens
hukkinj1 de1a055 More accurate typehint
hukkinj1 55ef65e Add is_nested. Add docstrings
hukkinj1 9887533 Add property docstrings
hukkinj1 c96b2cb Add tests
hukkinj1 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,262 @@ | ||
| """A tree representation of a linear markdown-it token stream. | ||
| This module is not part of upstream JavaScript markdown-it. | ||
| """ | ||
| from typing import NamedTuple, Sequence, Tuple, Dict, List, Optional, Any | ||
| | ||
| from .token import Token | ||
| from .utils import _removesuffix | ||
| | ||
| | ||
class SyntaxTreeNode:
    """A Markdown syntax tree node.

    A class that can be used to construct a tree representation of a linear
    `markdown-it-py` token stream. Use `SyntaxTreeNode.from_tokens` to
    initialize instead of the `__init__` method.

    Each node in the tree represents either:
      - root of the Markdown document
      - a single unnested `Token`
      - a `Token` "_open" and "_close" token pair, and the tokens nested in
        between
    """

    class _NesterTokens(NamedTuple):
        # The "_open" / "_close" token pair that delimits a nested node.
        opening: Token
        closing: Token

    def __init__(self) -> None:
        """Initialize a root node with no children.

        You probably need `SyntaxTreeNode.from_tokens` instead.
        """
        # Only nodes representing an unnested token have self.token
        self.token: Optional[Token] = None

        # Only containers have nester tokens
        self.nester_tokens: Optional[SyntaxTreeNode._NesterTokens] = None

        # Root node does not have self.parent
        self.parent: Optional["SyntaxTreeNode"] = None

        # Empty list unless a non-empty container, or unnested token that has
        # children (i.e. inline or img)
        self.children: List["SyntaxTreeNode"] = []

    @staticmethod
    def from_tokens(tokens: Sequence[Token]) -> "SyntaxTreeNode":
        """Instantiate a `SyntaxTreeNode` from a token stream.

        This is the standard method for instantiating `SyntaxTreeNode`.

        Raises `ValueError` if the opening and closing tokens in `tokens`
        are not balanced.
        """
        root = SyntaxTreeNode()
        root._set_children_from_tokens(tokens)
        return root

    def to_tokens(self) -> List[Token]:
        """Recover the linear token stream."""

        def recursive_collect_tokens(
            node: "SyntaxTreeNode", token_list: List[Token]
        ) -> None:
            # Dispatch on the stored token(s) rather than on the `type`
            # string, so that a token whose type happens to be "root" is
            # still emitted as a token.
            if node.token is not None:
                # The children of an unnested token live in
                # `Token.children`, so they are not part of the linear
                # stream.
                token_list.append(node.token)
            elif node.nester_tokens is not None:
                token_list.append(node.nester_tokens.opening)
                for child in node.children:
                    recursive_collect_tokens(child, token_list)
                token_list.append(node.nester_tokens.closing)
            else:
                # Root node: contributes no token of its own.
                for child in node.children:
                    recursive_collect_tokens(child, token_list)

        tokens: List[Token] = []
        recursive_collect_tokens(self, tokens)
        return tokens

    @property
    def is_nested(self) -> bool:
        """Is this node nested?

        Returns `True` if the node represents a `Token` pair and tokens in the
        sequence between them, where `Token.nesting` of the first `Token` in
        the pair is 1 and nesting of the other `Token` is -1.
        """
        return self.nester_tokens is not None

    @property
    def siblings(self) -> Sequence["SyntaxTreeNode"]:
        """Get siblings of the node.

        Gets the whole group of siblings, including self. For a node without
        a parent the group consists of the node alone.
        """
        if self.parent is None:
            return [self]
        return self.parent.children

    @property
    def type(self) -> str:
        """Get a string type of the represented syntax.

        - "root" for root nodes
        - `Token.type` if the node represents an unnested token
        - `Token.type` of the opening token, with "_open" suffix stripped, if
          the node represents a nester token pair
        """
        if self.token is not None:
            return self.token.type
        if self.nester_tokens is not None:
            return _removesuffix(self.nester_tokens.opening.type, "_open")
        return "root"

    @property
    def next_sibling(self) -> Optional["SyntaxTreeNode"]:
        """Get the next node in the sequence of siblings.

        Returns `None` if this is the last sibling.
        """
        siblings = self.siblings
        next_index = siblings.index(self) + 1
        if next_index < len(siblings):
            return siblings[next_index]
        return None

    @property
    def previous_sibling(self) -> Optional["SyntaxTreeNode"]:
        """Get the previous node in the sequence of siblings.

        Returns `None` if this is the first sibling.
        """
        siblings = self.siblings
        previous_index = siblings.index(self) - 1
        # A negative index would wrap around to the end of the list, so it
        # must be rejected explicitly.
        if previous_index >= 0:
            return siblings[previous_index]
        return None

    def _make_child(
        self,
        *,
        token: Optional[Token] = None,
        nester_tokens: Optional[_NesterTokens] = None,
    ) -> "SyntaxTreeNode":
        """Make and return a child node for `self`.

        Exactly one of `token` and `nester_tokens` must be given; otherwise
        `ValueError` is raised. The new node is appended to `self.children`.
        """
        if (token is None) == (nester_tokens is None):
            raise ValueError("must specify either `token` or `nester_tokens`")
        child = SyntaxTreeNode()
        if token is not None:
            child.token = token
        else:
            child.nester_tokens = nester_tokens
        child.parent = self
        self.children.append(child)
        return child

    def _set_children_from_tokens(self, tokens: Sequence[Token]) -> None:
        """Convert the token stream to a tree structure and set the resulting
        nodes as children of `self`.

        Raises `ValueError` if a closing token appears without a matching
        opening token, or if an opening token is never closed.
        """
        # Reverse once so that tokens can be consumed in order with the
        # cheap `list.pop()`.
        reversed_tokens = list(reversed(tokens))
        while reversed_tokens:
            token = reversed_tokens.pop()

            if token.nesting == 0:
                child = self._make_child(token=token)
                if token.children:
                    child._set_children_from_tokens(token.children)
                continue

            # Validate with an exception instead of `assert`: assertions are
            # stripped under `python -O`, which would let a stray closing
            # token silently produce a wrong tree.
            if token.nesting != 1:
                raise ValueError(
                    f"closing token without a matching opening token: {token}"
                )

            # Collect everything up to and including the matching closing
            # token, tracking the nesting depth along the way.
            nested_tokens = [token]
            nesting = 1
            while reversed_tokens and nesting != 0:
                token = reversed_tokens.pop()
                nested_tokens.append(token)
                nesting += token.nesting
            if nesting != 0:
                raise ValueError(f"unclosed tokens starting {nested_tokens[0]}")

            child = self._make_child(
                nester_tokens=SyntaxTreeNode._NesterTokens(
                    nested_tokens[0], nested_tokens[-1]
                )
            )
            child._set_children_from_tokens(nested_tokens[1:-1])

    # NOTE:
    # The values of the properties defined below directly map to properties
    # of the underlying `Token`s. A root node does not translate to a `Token`
    # object, so calling these property getters on a root node will raise an
    # `AttributeError`.
    #
    # There is no mapping for `Token.nesting` because the `is_nested` property
    # provides that data, and can be called on any node type, including root.

    def _attribute_token(self) -> Token:
        """Return the `Token` that is used as the data source for the
        properties defined below.

        This is the node's own token for unnested nodes and the opening token
        for nested nodes. Raises `AttributeError` for a root node.
        """
        if self.token is not None:
            return self.token
        if self.nester_tokens is not None:
            return self.nester_tokens.opening
        raise AttributeError("Root node does not have the accessed attribute")

    @property
    def tag(self) -> str:
        """HTML tag name, e.g. "p"."""
        return self._attribute_token().tag

    @property
    def attrs(self) -> Dict[str, Any]:
        """HTML attributes, as a new dict (empty if the token has none)."""
        token_attrs = self._attribute_token().attrs
        if token_attrs is None:
            return {}
        # Type ignore because `Token`s attribute types are not perfect
        return dict(token_attrs)  # type: ignore

    @property
    def map(self) -> Optional[Tuple[int, int]]:
        """Source map info. Format: `Tuple[ line_begin, line_end ]`"""
        map_ = self._attribute_token().map
        if map_:
            # Type ignore because `Token`s attribute types are not perfect
            return tuple(map_)  # type: ignore
        return None

    @property
    def level(self) -> int:
        """Nesting level, the same as `state.level`."""
        return self._attribute_token().level

    @property
    def content(self) -> str:
        """In a case of self-closing tag (code, html, fence, etc.), it
        has contents of this tag."""
        return self._attribute_token().content

    @property
    def markup(self) -> str:
        """'*' or '_' for emphasis, fence string for fence, etc."""
        return self._attribute_token().markup

    @property
    def info(self) -> str:
        """Fence infostring."""
        return self._attribute_token().info

    @property
    def meta(self) -> dict:
        """A place for plugins to store arbitrary data."""
        return self._attribute_token().meta

    @property
    def block(self) -> bool:
        """True for block-level tokens, false for inline tokens."""
        return self._attribute_token().block

    @property
    def hidden(self) -> bool:
        """If it's true, ignore this element when rendering.

        Used for tight lists to hide paragraphs.
        """
        return self._attribute_token().hidden
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,56 @@ | ||
| from markdown_it import MarkdownIt | ||
| from markdown_it.tree import SyntaxTreeNode | ||
| | ||
# Markdown fixture shared by the tests below: a level-2 heading followed by a
# paragraph that contains a run of strong emphasis between two plain-text runs.
EXAMPLE_MARKDOWN = """
## Heading here

Some paragraph text and **emphasis here** and more text here.
"""
| | ||
| | ||
def test_tree_to_tokens_conversion():
    """A token stream must survive a round trip through the tree unchanged."""
    tokens = MarkdownIt().parse(EXAMPLE_MARKDOWN)
    tree = SyntaxTreeNode.from_tokens(tokens)
    recovered_tokens = tree.to_tokens()
    assert tokens == recovered_tokens
| | ||
| | ||
def test_property_passthrough():
    """Node properties must mirror those of the node's opening token."""
    tokens = MarkdownIt().parse(EXAMPLE_MARKDOWN)
    opening_token = tokens[0]
    node = SyntaxTreeNode.from_tokens(tokens).children[0]

    assert opening_token.tag == node.tag
    # The node exposes the source map as a tuple, so normalize before comparing.
    assert tuple(opening_token.map) == node.map
    # The remaining properties are passed through without conversion.
    for name in (
        "level",
        "content",
        "markup",
        "info",
        "meta",
        "block",
        "hidden",
    ):
        assert getattr(opening_token, name) == getattr(node, name)
| | ||
| | ||
def test_type():
    """`type` must be derived correctly for root, nested and unnested nodes."""
    root = SyntaxTreeNode.from_tokens(MarkdownIt().parse(EXAMPLE_MARKDOWN))
    # Root type is "root"
    assert root.type == "root"

    heading = root.children[0]
    # "_open" suffix must be stripped from nested token type
    assert heading.type == "heading"

    heading_inline = heading.children[0]
    # For unnested tokens, node type must remain same as token type
    assert heading_inline.type == "inline"
| | ||
| | ||
def test_sibling_traverse():
    """Walk the paragraph's inline children in both directions."""
    root = SyntaxTreeNode.from_tokens(MarkdownIt().parse(EXAMPLE_MARKDOWN))
    inline = root.children[1].children[0]

    # Forwards: text -> strong -> text -> end of the sibling group.
    first = inline.children[0]
    assert first.type == "text"
    second = first.next_sibling
    assert second.type == "strong"
    third = second.next_sibling
    assert third.type == "text"
    assert third.next_sibling is None

    # Backwards: two steps from the last sibling lead to the first one, which
    # has nothing before it.
    assert third.previous_sibling.previous_sibling == first
    assert first.previous_sibling is None
Add this suggestion to a batch that can be applied as a single commit. This suggestion is invalid because no changes were made to the code. Suggestions cannot be applied while the pull request is closed. Suggestions cannot be applied while viewing a subset of changes. Only one suggestion per line can be applied in a batch. Add this suggestion to a batch that can be applied as a single commit. Applying suggestions on deleted lines is not supported. You must change the existing code in this line in order to create a valid suggestion. Outdated suggestions cannot be applied. This suggestion has been applied or marked resolved. Suggestions cannot be applied from pending reviews. Suggestions cannot be applied on multi-line comments. Suggestions cannot be applied while the pull request is queued to merge. Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.