Skip to content

Commit 57f9a52

Browse files
Improved the performance of the regular languages compiler: generate fewer and better regexes.
1 parent cdaa3e1 commit 57f9a52

File tree

2 files changed

+130
-43
lines changed

2 files changed

+130
-43
lines changed

prompt_toolkit/contrib/regular_languages/__init__.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -54,17 +54,16 @@
5454
5555
There is one tricky bit:
5656
57-
Ofter we operate on incomplete input (this is by definition the case for
57+
Often we operate on incomplete input (this is by definition the case for
5858
autocompletion) and we have to decide for the cursor position in which
5959
possible state the grammar could be in, and in which way variables could be
6060
matched up to that point.
6161
6262
To solve this problem, the compiler takes the original regular expression and
63-
translates it into a set of other regular expressions which each match prefixes
64-
of strings that would match the first expression. (We translate it into
65-
multiple expression, because we want to have each possible state the regex
66-
could be in -- in case there are several or-clauses with each different
67-
completers.)
63+
translates it into a set of other regular expressions which each match certain
64+
prefixes of the original regular expression. We generate one prefix regular
65+
expression for every named variable (with this variable being the end of that
66+
expression).
6867
6968
7069
TODO: some examples of:

prompt_toolkit/contrib/regular_languages/compiler.py

Lines changed: 125 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -175,10 +175,20 @@ def transform(node: Node) -> str:
175175

176176
# `Repeat`.
177177
elif isinstance(node, Repeat):
178-
return "(?:%s){%i,%s}%s" % (
178+
if node.max_repeat is None:
179+
if node.min_repeat == 0:
180+
repeat_sign = "*"
181+
elif node.min_repeat == 1:
182+
repeat_sign = "+"
183+
else:
184+
repeat_sign = "{%i,%s}" % (
185+
node.min_repeat,
186+
("" if node.max_repeat is None else str(node.max_repeat)),
187+
)
188+
189+
return "(?:%s)%s%s" % (
179190
transform(node.childnode),
180-
node.min_repeat,
181-
("" if node.max_repeat is None else str(node.max_repeat)),
191+
repeat_sign,
182192
("" if node.greedy else "?"),
183193
)
184194
else:
@@ -194,39 +204,113 @@ def _transform_prefix(
194204
Yield all the regular expressions matching a prefix of the grammar
195205
defined by the `Node` instance.
196206
197-
This can yield multiple expressions, because in the case of on OR
198-
operation in the grammar, we can have another outcome depending on
199-
which clause would appear first. E.g. "(A|B)C" is not the same as
200-
"(B|A)C" because the regex engine is lazy and takes the first match.
201-
However, because we the current input is actually a prefix of the
202-
grammar which might not yet contain the data for "C", we need to know
203-
both intermediate states, in order to call the appropriate
204-
autocompletion for both cases.
207+
For each `Variable`, one regex pattern will be generated, with this
208+
named group at the end. This is required because a regex engine will
209+
terminate once a match is found. For autocompletion however, we need
210+
the matches for all possible paths, so that we can provide completions
211+
for each `Variable`.
212+
213+
- So, in the case of an `Any` (`A|B|C`), we generate a pattern for each
214+
clause. This is one for `A`, one for `B` and one for `C`. Unless some
215+
groups don't contain a `Variable`, then these can be merged together.
216+
- In the case of a `NodeSequence` (`ABC`), we generate a pattern for
217+
each prefix that ends with a variable, and one pattern for the whole
218+
sequence. So, that's one for `A`, one for `AB` and one for `ABC`.
205219
206220
:param root_node: The :class:`Node` instance for which we generate the grammar.
207221
:param create_group_func: A callable which takes a `Node` and returns the next
208222
free name for this node.
209223
"""
210224

225+
def contains_variable(node: Node) -> bool:
226+
if isinstance(node, Regex):
227+
return False
228+
elif isinstance(node, Variable):
229+
return True
230+
elif isinstance(node, (Lookahead, Repeat)):
231+
return contains_variable(node.childnode)
232+
elif isinstance(node, (NodeSequence, AnyNode)):
233+
return any(contains_variable(child) for child in node.children)
234+
235+
return False
236+
211237
def transform(node: Node) -> Iterable[str]:
212-
# Generate regexes for all permutations of this OR. Each node
213-
# should be in front once.
238+
# Generate separate pattern for all terms that contain variables
239+
# within this OR. Terms that don't contain a variable can be merged
240+
# together in one pattern.
214241
if isinstance(node, AnyNode):
242+
# If we have a definition like:
243+
# (?P<name> .*) | (?P<city> .*)
244+
# Then we want to be able to generate completions for both the
245+
# name as well as the city. We do this by yielding two
246+
# different regular expressions, because the engine won't
247+
# follow multiple paths, if multiple are possible.
248+
children_with_variable = []
249+
children_without_variable = []
215250
for c in node.children:
216-
for r in transform(c):
217-
yield "(?:%s)?" % r
251+
if contains_variable(c):
252+
children_with_variable.append(c)
253+
else:
254+
children_without_variable.append(c)
255+
256+
for c in children_with_variable:
257+
yield from transform(c)
218258

219-
# For a sequence. We can either have a match for the sequence
220-
# of all the children, or for an exact match of the first X
221-
# children, followed by a partial match of the next children.
259+
# Merge options without variable together.
260+
if children_without_variable:
261+
yield "|".join(
262+
r for c in children_without_variable for r in transform(c)
263+
)
264+
265+
# For a sequence, generate a pattern for each prefix that ends with
266+
# a variable + one pattern of the complete sequence.
267+
# (This is because, for autocompletion, we match the text before
268+
# the cursor, and completions are given for the variable that we
269+
# match right before the cursor.)
222270
elif isinstance(node, NodeSequence):
271+
# For all components in the sequence, compute prefix patterns,
272+
# as well as full patterns.
273+
complete = [cls._transform(c, create_group_func) for c in node.children]
274+
prefixes = [list(transform(c)) for c in node.children]
275+
variable_nodes = [contains_variable(c) for c in node.children]
276+
277+
# If any child contains a variable, we should yield a
278+
# pattern up to that point, so that we are sure this will be
279+
# matched.
223280
for i in range(len(node.children)):
224-
a = [
225-
cls._transform(c, create_group_func) for c in node.children[:i]
226-
]
227-
228-
for c_str in transform(node.children[i]):
229-
yield "(?:%s)" % ("".join(a) + c_str)
281+
if variable_nodes[i]:
282+
for c_str in prefixes[i]:
283+
yield "".join(complete[:i]) + c_str
284+
285+
# If there are non-variable nodes, merge all the prefixes into
286+
# one pattern. If the input is: "[part1] [part2] [part3]", then
287+
# this gets compiled into:
288+
# (complete1 + (complete2 + (complete3 | partial3) | partial2) | partial1 )
289+
# For nodes that contain a variable, we skip the "|partial"
290+
# part here, because these are matched with the previous
291+
# patterns.
292+
if not all(variable_nodes):
293+
result = []
294+
295+
# Start with complete patterns.
296+
for i in range(len(node.children)):
297+
result.append("(?:")
298+
result.append(complete[i])
299+
300+
# Add prefix patterns.
301+
for i in range(len(node.children) - 1, -1, -1):
302+
if variable_nodes[i]:
303+
# No need to yield a prefix for this one, we did
304+
# the variable prefixes earlier.
305+
result.append(")")
306+
else:
307+
result.append("|(?:")
308+
# If this yields multiple, we should yield all combinations.
309+
assert len(prefixes[i]) == 1
310+
result.append(prefixes[i][0])
311+
result.append("))")
312+
313+
yield "".join(result)
230314

231315
elif isinstance(node, Regex):
232316
yield "(?:%s)?" % node.regex
@@ -251,23 +335,26 @@ def transform(node: Node) -> Iterable[str]:
251335
# match, followed by a partial match.
252336
prefix = cls._transform(node.childnode, create_group_func)
253337

254-
for c_str in transform(node.childnode):
255-
if node.max_repeat:
256-
repeat_sign = "{,%i}" % (node.max_repeat - 1)
257-
else:
258-
repeat_sign = "*"
259-
yield "(?:%s)%s%s(?:%s)?" % (
260-
prefix,
261-
repeat_sign,
262-
("" if node.greedy else "?"),
263-
c_str,
264-
)
338+
if node.max_repeat == 1:
339+
yield from transform(node.childnode)
340+
else:
341+
for c_str in transform(node.childnode):
342+
if node.max_repeat:
343+
repeat_sign = "{,%i}" % (node.max_repeat - 1)
344+
else:
345+
repeat_sign = "*"
346+
yield "(?:%s)%s%s%s" % (
347+
prefix,
348+
repeat_sign,
349+
("" if node.greedy else "?"),
350+
c_str,
351+
)
265352

266353
else:
267354
raise TypeError("Got %r" % node)
268355

269356
for r in transform(root_node):
270-
yield "^%s$" % r
357+
yield "^(?:%s)$" % r
271358

272359
def match(self, string: str) -> Optional["Match"]:
273360
"""
@@ -303,6 +390,7 @@ def match_prefix(self, string: str) -> Optional["Match"]:
303390
return Match(
304391
string, matches2, self._group_names_to_nodes, self.unescape_funcs
305392
)
393+
306394
return None
307395

308396

@@ -343,7 +431,7 @@ def get_tuples() -> Iterable[Tuple[str, Tuple[int, int]]]:
343431

344432
def _nodes_to_values(self) -> List[Tuple[str, str, Tuple[int, int]]]:
345433
"""
346-
Returns list of list of (Node, string_value) tuples.
434+
Returns list of (Node, string_value) tuples.
347435
"""
348436

349437
def is_none(sl: Tuple[int, int]) -> bool:

0 commit comments

Comments
 (0)