Skip to content
133 changes: 90 additions & 43 deletions markdown/extensions/toc.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,14 @@

from . import Extension
from ..treeprocessors import Treeprocessor
from ..util import code_escape, parseBoolValue, AMP_SUBSTITUTE, HTML_PLACEHOLDER_RE, AtomicString
from ..util import parseBoolValue, AMP_SUBSTITUTE
from ..treeprocessors import UnescapeTreeprocessor
from ..serializers import RE_AMP
import re
import html
import unicodedata
from copy import deepcopy
from html import unescape as html_unescape
import xml.etree.ElementTree as etree
from typing import TYPE_CHECKING, Any, Iterator, MutableSet

Expand All @@ -35,6 +38,8 @@

def slugify(value: str, separator: str, unicode: bool = False) -> str:
""" Slugify a string, to make it URL friendly. """
# First convert HTML entities to Unicode characters
value = html_unescape(value)
if not unicode:
# Replace Extended Latin characters with ASCII, i.e. `žlutý` => `zluty`
value = unicodedata.normalize('NFKD', value)
Expand Down Expand Up @@ -63,41 +68,81 @@ def unique(id: str, ids: MutableSet[str]) -> str:
return id


def get_name(el: etree.Element) -> str:
    """ Return the stripped, concatenated text content of `el`. """
    # `AtomicString` chunks are passed through `html.unescape`;
    # every other text chunk is used exactly as found.
    pieces = [
        html.unescape(chunk) if isinstance(chunk, AtomicString) else chunk
        for chunk in el.itertext()
    ]
    return ''.join(pieces).strip()


def stashedHTML2text(text: str, md: Markdown, strip_entities: bool = True) -> str:
    """ Replace raw HTML placeholders in `text` with the plain text of the stashed HTML.

    Tags are always removed from the stashed HTML. Entities are removed as well
    unless `strip_entities` is false.
    """
    def _plain_text(match: re.Match[str]) -> str:
        """ Return the stashed block for `match` reduced to plain text. """
        try:
            stashed = md.htmlStash.rawHtmlBlocks[int(match.group(1))]
        except (IndexError, TypeError):  # pragma: no cover
            # No such stash entry: leave the placeholder untouched.
            return match.group(0)
        # Drop the tags first, then (optionally) the entities.
        without_tags = re.sub(r'(<[^>]+>)', '', stashed)
        if not strip_entities:
            return without_tags
        return re.sub(r'(&[\#a-zA-Z0-9]+;)', '', without_tags)

    return HTML_PLACEHOLDER_RE.sub(_plain_text, text)


def unescape(text: str) -> str:
""" Unescape escaped text. """
def md_unescape(text: str) -> str:
    """ Return `text` after passing it through `UnescapeTreeprocessor.unescape`. """
    # A fresh processor instance is created for each call.
    return UnescapeTreeprocessor().unescape(text)


def strip_tags(text: str) -> str:
    """ Remove HTML comments and tags from `text` and collapse whitespace.

    HTML entities are left untouched.
    """
    def _remove_spans(source: str, opener: str, closer: str) -> str:
        """ Repeatedly cut the first `opener`...`closer` span until none remains. """
        while True:
            begin = source.find(opener)
            if begin == -1:
                return source
            finish = source.find(closer, begin)
            if finish == -1:
                # Unterminated span: leave the remainder as is.
                return source
            source = source[:begin] + source[finish + len(closer):]

    # Comments go first because a comment may itself contain a tag.
    text = _remove_spans(text, '<!--', '-->')
    text = _remove_spans(text, '<', '>')

    # Collapse every run of whitespace into a single space.
    return ' '.join(text.split())


def escape_cdata(text: str) -> str:
    """ Return `text` with `&`, `<` and `>` escaped for use as character data. """
    escaped = text
    if "&" in escaped:
        # `RE_AMP` leaves ampersands that are already part of an entity alone.
        escaped = RE_AMP.sub('&amp;', escaped)
    for raw, entity in (("<", "&lt;"), (">", "&gt;")):
        if raw in escaped:
            escaped = escaped.replace(raw, entity)
    return escaped


def run_postprocessors(text: str, md: Markdown) -> str:
    """ Pass `text` through each of `md.postprocessors` in order and strip the result. """
    processed = text
    for postprocessor in md.postprocessors:
        # Each postprocessor receives the output of the previous one.
        processed = postprocessor.run(processed)
    return processed.strip()


def render_inner_html(el: etree.Element, md: Markdown) -> str:
    """ Return the fully rendered inner HTML of `el` as a string. """
    # The `UnescapeTreeprocessor` runs after TOC, so unescape the serialized element here.
    serialized = md_unescape(md.serializer(el))

    # Keep only what lies between the end of the opening tag
    # and the start of the closing tag.
    opening_end = serialized.index('>') + 1
    closing_start = serialized.rindex('<')
    inner = serialized[opening_end:closing_start].strip()

    return run_postprocessors(inner, md)


def copy_element(el: etree.Element, exclude_fnrefs: bool = True) -> etree.Element:
    """ Return a deep copy of an etree element, optionally with footnote references removed.

    The element passed in is never modified. When `exclude_fnrefs` is true, every
    `sup` element whose `id` starts with `fnref` is removed from the copy at any
    nesting depth, and the tail text that followed it is kept in place.
    """
    el = deepcopy(el)
    if not exclude_fnrefs:
        return el

    # Footnote references look like this: `<sup id="fnref:1">...</sup>`.
    # `.//sup/..` selects every element (the root included) that has a `sup`
    # child, so references nested inside inline markup are found as well.
    for parent in el.findall('.//sup/..'):
        previous = None  # Last sibling kept before the current child.
        for child in list(parent):
            is_fnref = child.tag == 'sup' and child.get('id', '').startswith('fnref')
            if not is_fnref:
                previous = child
                continue
            if child.tail:
                # Preserve the tail text by attaching it to whatever precedes the reference.
                if previous is None:
                    parent.text = f'{parent.text or ""}{child.tail}'
                else:
                    previous.tail = f'{previous.tail or ""}{child.tail}'
            parent.remove(child)
    return el


def nest_toc_tokens(toc_list):
"""Given an unsorted list with errors and skips, return a nested one.

Expand Down Expand Up @@ -300,27 +345,29 @@ def run(self, doc: etree.Element) -> None:
for el in doc.iter():
if isinstance(el.tag, str) and self.header_rgx.match(el.tag):
self.set_level(el)
text = get_name(el)
html = render_inner_html(copy_element(el), self.md)
text = strip_tags(html)

# Do not override pre-existing ids
if "id" not in el.attrib:
innertext = unescape(stashedHTML2text(text, self.md))
el.attrib["id"] = unique(self.slugify(innertext, self.sep), used_ids)
el.attrib["id"] = unique(self.slugify(text, self.sep), used_ids)

if 'data-toc-label' in el.attrib:
text = md_unescape(el.attrib['data-toc-label'])
text = run_postprocessors(text, self.md)
text = strip_tags(text)
text = escape_cdata(text)
# Remove the data-toc-label attribute as it is no longer needed
del el.attrib['data-toc-label']

if int(el.tag[-1]) >= self.toc_top and int(el.tag[-1]) <= self.toc_bottom:
toc_tokens.append({
'level': int(el.tag[-1]),
'id': el.attrib["id"],
'name': unescape(stashedHTML2text(
code_escape(el.attrib.get('data-toc-label', text)),
self.md, strip_entities=False
))
'name': text,
'html': html
})

# Remove the data-toc-label attribute as it is no longer needed
if 'data-toc-label' in el.attrib:
del el.attrib['data-toc-label']

if self.use_anchors:
self.add_anchor(el, el.attrib["id"])
if self.use_permalinks not in [False, None]:
Expand Down
39 changes: 22 additions & 17 deletions tests/test_extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -420,9 +420,9 @@ def testUniqueIds(self):
'</div>\n'
)
self.assertEqual(self.md.toc_tokens, [
{'level': 1, 'id': 'header', 'name': 'Header', 'children': []},
{'level': 1, 'id': 'header_1', 'name': 'Header', 'children': []},
{'level': 1, 'id': 'header_2', 'name': 'Header', 'children': []},
{'level': 1, 'id': 'header', 'name': 'Header', 'html': 'Header', 'children': []},
{'level': 1, 'id': 'header_1', 'name': 'Header', 'html': 'Header', 'children': []},
{'level': 1, 'id': 'header_2', 'name': 'Header', 'html': 'Header', 'children': []},
])

def testHtmlEntities(self):
Expand All @@ -441,7 +441,7 @@ def testHtmlEntities(self):
'</div>\n'
)
self.assertEqual(self.md.toc_tokens, [
{'level': 1, 'id': 'foo-bar', 'name': 'Foo &amp; bar', 'children': []},
{'level': 1, 'id': 'foo-bar', 'name': 'Foo &amp; bar', 'html': 'Foo &amp; bar', 'children': []},
])

def testHtmlSpecialChars(self):
Expand All @@ -460,7 +460,7 @@ def testHtmlSpecialChars(self):
'</div>\n'
)
self.assertEqual(self.md.toc_tokens, [
{'level': 1, 'id': 'foo-bar', 'name': 'Foo &gt; &amp; bar', 'children': []},
{'level': 1, 'id': 'foo-bar', 'name': 'Foo &gt; &amp; bar', 'html': 'Foo &gt; &amp; bar', 'children': []},
])

def testRawHtml(self):
Expand All @@ -479,7 +479,7 @@ def testRawHtml(self):
'</div>\n'
)
self.assertEqual(self.md.toc_tokens, [
{'level': 1, 'id': 'foo-bar-baz', 'name': 'Foo Bar Baz.', 'children': []},
{'level': 1, 'id': 'foo-bar-baz', 'name': 'Foo Bar Baz.', 'html': 'Foo <b>Bar</b> Baz.', 'children': []},
])

def testBaseLevel(self):
Expand Down Expand Up @@ -508,9 +508,9 @@ def testBaseLevel(self):
'</div>\n'
)
self.assertEqual(md.toc_tokens, [
{'level': 5, 'id': 'some-header', 'name': 'Some Header', 'children': [
{'level': 6, 'id': 'next-level', 'name': 'Next Level', 'children': []},
{'level': 6, 'id': 'too-high', 'name': 'Too High', 'children': []},
{'level': 5, 'id': 'some-header', 'name': 'Some Header', 'html': 'Some Header', 'children': [
{'level': 6, 'id': 'next-level', 'name': 'Next Level', 'html': 'Next Level', 'children': []},
{'level': 6, 'id': 'too-high', 'name': 'Too High', 'html': 'Too High', 'children': []},
]},
])

Expand All @@ -532,9 +532,13 @@ def testHeaderInlineMarkup(self):
'</ul>\n' # noqa
'</div>\n'
)
self.assertEqual(self.md.toc_tokens, [
{'level': 1, 'id': 'some-header-with-markup', 'name': 'Some Header with markup.', 'children': []},
])
self.assertEqual(self.md.toc_tokens, [{
'level': 1,
'id': 'some-header-with-markup',
'name': 'Some Header with markup.',
'html': 'Some <em>Header</em> with <a href="http://example.com">markup</a>.',
'children': []
}])

def testTitle(self):
""" Test TOC Title. """
Expand All @@ -549,6 +553,7 @@ def testTitle(self):

def testWithAttrList(self):
""" Test TOC with `attr_list` Extension. """
self.maxDiff = None
md = markdown.Markdown(extensions=['toc', 'attr_list'])
text = ('# Header 1\n\n'
'## Header 2 { #foo }\n\n'
Expand Down Expand Up @@ -580,12 +585,12 @@ def testWithAttrList(self):
'</div>\n'
)
self.assertEqual(md.toc_tokens, [
{'level': 1, 'id': 'header-1', 'name': 'Header 1', 'children': [
{'level': 2, 'id': 'foo', 'name': 'Header 2', 'children': []},
{'level': 2, 'id': 'header-3', 'name': 'Foo Bar', 'children': []}
{'level': 1, 'id': 'header-1', 'name': 'Header 1', 'html': 'Header 1', 'children': [
{'level': 2, 'id': 'foo', 'name': 'Header 2', 'html': 'Header 2', 'children': []},
{'level': 2, 'id': 'header-3', 'name': 'Foo Bar', 'html': 'Header 3', 'children': []}
]},
{'level': 1, 'id': 'header-4', 'name': 'Foo &gt; Baz', 'children': []},
{'level': 1, 'id': 'header-5', 'name': 'Foo Quux', 'children': []},
{'level': 1, 'id': 'header-4', 'name': 'Foo &gt; Baz', 'html': 'Header 4', 'children': []},
{'level': 1, 'id': 'header-5', 'name': 'Foo Quux', 'html': 'Header 5', 'children': []},
])

def testUniqueFunc(self):
Expand Down
1 change: 1 addition & 0 deletions tests/test_syntax/extensions/test_smarty.py
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,7 @@ def test_smarty_and_toc(self):
'level': 1,
'id': 'foo-bar',
'name': 'Foo &mdash; bar',
'html': '<em>Foo</em> &mdash; <code>bar</code>',
'children': [],
},
],
Expand Down
Loading