Skip to content
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
Speed up the `line_offset` property
* Replace the dynamically built regex with a string `find` operation.
* Add a cache of where each line starts, so that identifying line numbers no longer has quadratic behavior when importing large chunks of HTML.
  • Loading branch information
eanorige authored Oct 20, 2023
commit 4a2a7dcadabef8b6a92891f794c7e688e762b9a9
14 changes: 14 additions & 0 deletions markdown/htmlparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,9 @@ def __init__(self, md, *args, **kwargs):
# Block tags that should contain no content (self closing)
self.empty_tags = set(['hr'])

self.lineno_start_cache = [0]


# This calls self.reset
super().__init__(*args, **kwargs)
self.md = md
Expand All @@ -94,6 +97,8 @@ def reset(self):
self.stack = [] # When `inraw==True`, stack contains a list of tags
self._cache = []
self.cleandoc = []
self.lineno_start_cache = [0]

super().reset()

def close(self):
Expand All @@ -114,6 +119,15 @@ def close(self):
@property
def line_offset(self) -> int:
"""Returns char index in `self.rawdata` for the start of the current line. """
for ii in range(len(self.lineno_start_cache)-1, self.lineno-1):
last_line_start_pos = self.lineno_start_cache[ii]
lf_pos = self.rawdata.find('\n', last_line_start_pos)
if lf_pos == -1:
# No more newlines found. Use end of rawdata.
lf_pos = len(self.rawdata)
self.lineno_start_cache.append(lf_pos+1)

return self.lineno_start_cache[self.lineno-1]
if self.lineno > 1 and '\n' in self.rawdata:
m = re.match(r'([^\n]*\n){{{}}}'.format(self.lineno-1), self.rawdata)
if m:
Expand Down