Skip to content

Commit de76d24

Browse files
committed
Initial attempt at an inline refactor.
This is very much incomplete and not fully tested. Likely still a long way to go. At least it works with some simple stuff. However, tails are currently processed twice and various now-unneeded pieces have not yet been removed. Added a new treeprocessor which runs inlinepatterns without the need to use placeholders or alter patterns. Note that any pattern that made use of groups will need to be updated (reduce each group by 1) as a result.
1 parent b73967d commit de76d24

File tree

2 files changed

+142
-33
lines changed

2 files changed

+142
-33
lines changed

markdown/inlinepatterns.py

Lines changed: 31 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -95,32 +95,32 @@ def build_inlinepatterns(md, **kwargs):
9595
NOIMG = r'(?<!\!)'
9696

9797
# `e=f()` or ``e=f("`")``
98-
BACKTICK_RE = r'(?<!\\)(`+)(.+?)(?<!`)\2(?!`)'
98+
BACKTICK_RE = r'(?<!\\)(`+)(.+?)(?<!`)\1(?!`)'
9999

100100
# \<
101101
ESCAPE_RE = r'\\(.)'
102102

103103
# *emphasis*
104-
EMPHASIS_RE = r'(\*)([^\*]+)\2'
104+
EMPHASIS_RE = r'(\*)([^\*]+)\1'
105105

106106
# **strong**
107-
STRONG_RE = r'(\*{2})(.+?)\2'
107+
STRONG_RE = r'(\*{2})(.+?)\1'
108108

109109
# __smart__strong__
110-
SMART_STRONG_RE = r'(?<!\w)(_{2})(?!_)(.+?)(?<!_)\2(?!\w)'
110+
SMART_STRONG_RE = r'(?<!\w)(_{2})(?!_)(.+?)(?<!_)\1(?!\w)'
111111

112112
# _smart_emphasis_
113-
SMART_EMPHASIS_RE = r'(?<!\w)(_)(?!_)(.+?)(?<!_)\2(?!\w)'
113+
SMART_EMPHASIS_RE = r'(?<!\w)(_)(?!_)(.+?)(?<!_)\1(?!\w)'
114114

115115
# ***strongem*** or ***em*strong**
116-
EM_STRONG_RE = r'(\*|_)\2{2}(.+?)\2(.*?)\2{2}'
116+
EM_STRONG_RE = r'(\*|_)\1{2}(.+?)\1(.*?)\1{2}'
117117

118118
# ***strong**em*
119-
STRONG_EM_RE = r'(\*|_)\2{2}(.+?)\2{2}(.*?)\2'
119+
STRONG_EM_RE = r'(\*|_)\1{2}(.+?)\1{2}(.*?)\1'
120120

121121
# [text](url) or [text](<url>) or [text](url "title")
122122
LINK_RE = NOIMG + BRK + \
123-
r'''\(\s*(<.*?>|((?:(?:\(.*?\))|[^\(\)]))*?)\s*((['"])(.*?)\12\s*)?\)'''
123+
r'''\(\s*(<.*?>|((?:(?:\(.*?\))|[^\(\)]))*?)\s*((['"])(.*?)\11\s*)?\)'''  # \11 = opening quote of the title (was \12 before every group shifted down by one)
124124

125125
# ![alttxt](http://x.com/) or ![alttxt](<http://x.com/>)
126126
IMAGE_LINK_RE = r'\!' + BRK + r'\s*\((<.*?>|([^")]+"[^"]*"|[^\)]*))\)'
@@ -181,8 +181,7 @@ def __init__(self, pattern, md=None):
181181
182182
"""
183183
self.pattern = pattern
184-
self.compiled_re = re.compile("^(.*?)%s(.*?)$" % pattern,
185-
re.DOTALL | re.UNICODE)
184+
self.compiled_re = re.compile(pattern, re.DOTALL | re.UNICODE)
186185

187186
if md:
188187
self.md = md
@@ -215,7 +214,7 @@ def unescape(self, text):
215214
return text
216215

217216
def get_stash(m):
218-
id = m.group(1)
217+
id = m.group(0)
219218
if id in stash:
220219
value = stash.get(id)
221220
if isinstance(value, util.string_type):
@@ -227,16 +226,16 @@ def get_stash(m):
227226

228227

229228
class SimpleTextPattern(Pattern):
230-
""" Return a simple text of group(2) of a Pattern. """
229+
""" Return a simple text of group(1) of a Pattern. """
231230
def handleMatch(self, m):
232-
return m.group(2)
231+
return m.group(1)
233232

234233

235234
class EscapePattern(Pattern):
236235
""" Return an escaped character. """
237236

238237
def handleMatch(self, m):
239-
char = m.group(2)
238+
char = m.group(1)
240239
if char in self.md.ESCAPED_CHARS:
241240
return '%s%s%s' % (util.STX, ord(char), util.ETX)
242241
else:
@@ -245,7 +244,7 @@ def handleMatch(self, m):
245244

246245
class SimpleTagPattern(Pattern):
247246
"""
248-
Return element of type `tag` with a text attribute of group(3)
247+
Return element of type `tag` with a text attribute of group(2)
249248
of a Pattern.
250249
251250
"""
@@ -255,7 +254,7 @@ def __init__(self, pattern, tag):
255254

256255
def handleMatch(self, m):
257256
el = util.etree.Element(self.tag)
258-
el.text = m.group(3)
257+
el.text = m.group(2)
259258
return el
260259

261260

@@ -273,7 +272,7 @@ def __init__(self, pattern):
273272

274273
def handleMatch(self, m):
275274
el = util.etree.Element(self.tag)
276-
el.text = util.AtomicString(m.group(3).strip())
275+
el.text = util.AtomicString(m.group(2).strip())
277276
return el
278277

279278

@@ -287,16 +286,16 @@ def handleMatch(self, m):
287286
tag1, tag2 = self.tag.split(",")
288287
el1 = util.etree.Element(tag1)
289288
el2 = util.etree.SubElement(el1, tag2)
290-
el2.text = m.group(3)
291-
if len(m.groups()) == 5:
292-
el2.tail = m.group(4)
289+
el2.text = m.group(2)
290+
if len(m.groups()) == 3: # TODO: confirm this is right. maybe 4?
291+
el2.tail = m.group(3)
293292
return el1
294293

295294

296295
class HtmlPattern(Pattern):
297296
""" Store raw inline html and return a placeholder. """
298297
def handleMatch(self, m):
299-
rawhtml = self.unescape(m.group(2))
298+
rawhtml = self.unescape(m.group(1))
300299
place_holder = self.md.htmlStash.store(rawhtml)
301300
return place_holder
302301

@@ -308,7 +307,7 @@ def unescape(self, text):
308307
return text
309308

310309
def get_stash(m):
311-
id = m.group(1)
310+
id = m.group(0)
312311
value = stash.get(id)
313312
if value is not None:
314313
try:
@@ -323,9 +322,9 @@ class LinkPattern(Pattern):
323322
""" Return a link element from the given match. """
324323
def handleMatch(self, m):
325324
el = util.etree.Element("a")
326-
el.text = m.group(2)
327-
title = m.group(13)
328-
href = m.group(9)
325+
el.text = m.group(1)
326+
title = m.group(12)
327+
href = m.group(8)
329328

330329
if href:
331330
if href[0] == "<":
@@ -344,7 +343,7 @@ class ImagePattern(LinkPattern):
344343
""" Return a img element from the given match. """
345344
def handleMatch(self, m):
346345
el = util.etree.Element("img")
347-
src_parts = m.group(9).split()
346+
src_parts = m.group(8).split()
348347
if src_parts:
349348
src = src_parts[0]
350349
if src[0] == "<" and src[-1] == ">":
@@ -365,21 +364,21 @@ class ReferencePattern(LinkPattern):
365364

366365
def handleMatch(self, m):
367366
try:
368-
id = m.group(9).lower()
367+
id = m.group(8).lower()
369368
except IndexError:
370369
id = None
371370
if not id:
372371
# if we got something like "[Google][]" or "[Goggle]"
373372
# we'll use "google" as the id
374-
id = m.group(2).lower()
373+
id = m.group(1).lower()
375374

376375
# Clean up linebreaks in id
377376
id = self.NEWLINE_CLEANUP_RE.sub(' ', id)
378377
if id not in self.md.references: # ignore undefined refs
379378
return None
380379
href, title = self.md.references[id]
381380

382-
text = m.group(2)
381+
text = m.group(1)
383382
return self.makeTag(href, title, text)
384383

385384
def makeTag(self, href, title, text):
@@ -408,8 +407,8 @@ class AutolinkPattern(Pattern):
408407
""" Return a link Element given an autolink (`<http://example/com>`). """
409408
def handleMatch(self, m):
410409
el = util.etree.Element("a")
411-
el.set('href', self.unescape(m.group(2)))
412-
el.text = util.AtomicString(m.group(2))
410+
el.set('href', self.unescape(m.group(1)))
411+
el.text = util.AtomicString(m.group(1))
413412
return el
414413

415414

@@ -419,7 +418,7 @@ class AutomailPattern(Pattern):
419418
"""
420419
def handleMatch(self, m):
421420
el = util.etree.Element('a')
422-
email = self.unescape(m.group(2))
421+
email = self.unescape(m.group(1))
423422
if email.startswith("mailto:"):
424423
email = email[len("mailto:"):]
425424

markdown/treeprocessors.py

Lines changed: 111 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from __future__ import unicode_literals
22
from __future__ import absolute_import
3+
import re
34
from . import util
45
from . import odict
56

@@ -44,6 +45,115 @@ class InlineProcessor(Treeprocessor):
4445
"""
4546
A Treeprocessor that traverses a tree, applying inline patterns.
4647
"""
48+
49+
def __init__(self, md):
    """
    Set up the processor and compile the token-scanning regular expression.

    Arguments:

    * md: Markdown instance. Its `ESCAPED_CHARS` supplies the characters
      at which `handle_inline` attempts to apply the inline patterns.

    """
    super(InlineProcessor, self).__init__(md)
    # re.escape quotes every character safely; a hand-written backslash
    # prefix would turn a letter or digit into a regex escape sequence
    # (e.g. "\d") instead of matching the literal character.
    self.TOKEN_RE = re.compile('|'.join(re.escape(x) for x in md.ESCAPED_CHARS))
52+
53+
def apply_patterns(self, text):
    """
    Match patterns at the beginning of the given text.

    Each registered inline pattern is tried in order, anchored at the
    start of `text`. The first pattern that matches a non-empty span and
    whose `handleMatch` returns a result wins.

    Arguments:

    * text: the string to match against.

    Returns: a 3-tuple `(pre_text, node, remaining)`:

    * `pre_text`: plain text to emit (the replacement string when the
      pattern produced a string, or the single skipped character when
      nothing matched).
    * `node`: the object returned by `handleMatch` when it is not a
      string, otherwise `None`.
    * `remaining`: the unconsumed rest of `text`.

    """
    if not text:
        # Nothing to match and nothing to step over.
        return '', None, ''

    for pattern in self.md.inlinePatterns.values():
        match = pattern.getCompiledRegExp().match(text)
        if not match or match.end() == 0:
            # A zero-width match consumes nothing; accepting it would
            # leave the caller looping on the same text forever.
            continue

        node = pattern.handleMatch(match)
        if node is None:
            # The pattern declined this match. Do not consume the text;
            # give the remaining patterns a chance instead.
            continue

        remaining = text[match.end():]
        if isString(node):
            return node, None, remaining
        return '', node, remaining

    # No pattern applied: step forward one character.
    return text[0], None, text[1:]
75+
76+
def handle_inline(self, text):
    """
    Apply inline patterns to the given text.

    The text is scanned for the next character matched by `TOKEN_RE`;
    everything before it is plain text, and `apply_patterns` is asked to
    handle the text starting at that character.

    Arguments:

    * text: the string to process.

    Returns: a 2-tuple `(elem_text, children)` where `elem_text` is the
    plain text preceding the first created node and `children` is the
    list of created nodes. Plain text following a node is stored in that
    node's `tail`.

    """
    leading = []   # plain-text fragments seen before the first child
    children = []

    def append_text(fragment):
        # Plain text belongs to the tail of the most recent child, or to
        # the element text while no child exists yet. Empty fragments
        # are ignored so a missing tail is not replaced by ''.
        if not fragment:
            return
        if children:
            last = children[-1]
            last.tail = (last.tail or '') + fragment
        else:
            leading.append(fragment)

    while text:
        m = self.TOKEN_RE.search(text)
        if m is None:
            # No more candidate positions; the rest is plain text.
            append_text(text)
            break

        append_text(text[:m.start()])
        pre_text, node, text = self.apply_patterns(text[m.start():])
        append_text(pre_text)
        if node is not None:
            children.append(node)

    return ''.join(leading), children
109+
110+
def handle_elem(self, elem, parent, pos):
    """
    Apply patterns to an element and its children recursively.

    Arguments:

    * elem: the element whose `text` and `tail` are processed.
    * parent: the element containing `elem`; it receives the nodes
      created from `elem.tail`.
    * pos: index of `elem` within `parent`.

    Returns: None. The tree is modified in place.

    """
    if elem.text and elem.text.strip() and not isinstance(elem.text, util.AtomicString):
        text = elem.text
        elem.text = None
        elem.text, children = self.handle_inline(text)
        # An element's text precedes all of its children, so nodes
        # created from that text go in front of the existing children.
        for index, child in enumerate(children):
            elem.insert(index, child)

    if elem.tail and elem.tail.strip() and not isinstance(elem.tail, util.AtomicString):
        tail = elem.tail
        elem.tail = None
        elem.tail, siblings = self.handle_inline(tail)
        # A tail directly follows its element, so nodes created from it
        # are placed right after `elem` rather than at the end of parent.
        for offset, sibling in enumerate(siblings, 1):
            parent.insert(pos + offset, sibling)

    # Recursively step through children.
    # NOTE(review): this iterates `elem` while recursive calls may insert
    # siblings into it; the inserts land after the current index, so
    # `cpos` is presumed to stay accurate -- confirm against ElementTree.
    for cpos, child in enumerate(elem):
        self.handle_elem(child, elem, cpos)
130+
131+
def run(self, tree):
    """
    Apply inline patterns to a parsed Markdown tree.

    Iterate over ElementTree, find elements with inline tag, apply inline
    patterns and append newly created Elements to tree. If you don't
    want to process your data with inline patterns, instead of normal
    string, use subclass AtomicString:

        node.text = markdown.util.AtomicString("This will not be processed.")

    Arguments:

    * tree: ElementTree object, representing Markdown tree.

    Returns: None.

    """
    # Visit each direct child of `tree`; `handle_elem` recurses into
    # the descendants itself.
    for pos, child in enumerate(tree):
        self.handle_elem(child, tree, pos)
152+
153+
class _InlineProcessor(Treeprocessor):
154+
"""
155+
A Treeprocessor that traverses a tree, applying inline patterns.
156+
"""
47157

48158
def __init__(self, md):
49159
self.__placeholder_prefix = util.INLINE_PLACEHOLDER_PREFIX
@@ -268,7 +378,7 @@ def run(self, tree):
268378
want to process your data with inline paterns, instead of normal
269379
string, use subclass AtomicString:
270380
271-
node.text = markdown.AtomicString("This will not be processed.")
381+
node.text = markdown.util.AtomicString("This will not be processed.")
272382
273383
Arguments:
274384

0 commit comments

Comments
 (0)