Skip to content

Commit 1ffa8cb

Browse files
committed
Implement lookahead and negative lookahead assertions
1 parent e038243 commit 1ffa8cb

File tree

3 files changed

+149
-47
lines changed

3 files changed

+149
-47
lines changed

Sources/_StringProcessing/Compiler.swift

Lines changed: 64 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -79,9 +79,17 @@ class Compiler {
7979
case .trivia, .empty:
8080
break
8181

82-
// FIXME: This can't be right...
8382
case .group(let g):
84-
try emit(g.child)
83+
switch g.kind.value {
84+
85+
case .lookahead, .negativeLookahead,
86+
.lookbehind, .negativeLookbehind:
87+
try compileLookaround(g.kind.value, g.child)
88+
89+
default:
90+
// FIXME: This can't be right...
91+
try emit(g.child)
92+
}
8593

8694
case .quantification(let quant):
8795
try emitQuantification(quant)
@@ -101,6 +109,60 @@ class Compiler {
101109
}
102110
}
103111

112+
/// Emits the instruction sequence for a lookahead assertion over `child`.
///
/// Only `.lookahead` and `.negativeLookahead` are compiled. Both lookbehind
/// kinds throw an "unsupported" error, and any other group kind is treated
/// as a programmer error.
///
/// Emitted layout:
///
///       save(restoringAt: success)
///       save(restoringAt: intercept)
///       <sub-pattern>    // failure restores at intercept
///       clearSavePoint   // remove intercept
///       <if negative>:
///         clearSavePoint // remove success
///       fail             // positive->success, negative propagates
///     intercept:
///       <if positive>:
///         clearSavePoint // remove success
///       fail             // positive propagates, negative->success
///     success:
///       ...
///
/// - Parameters:
///   - kind: The kind of the group being compiled; expected to be one of
///     the four lookaround kinds.
///   - child: The sub-pattern being asserted.
/// - Throws: The error built by `unsupported(_:)` for lookbehind kinds, or
///   anything thrown while emitting `child`.
func compileLookaround(
  _ kind: AST.Group.Kind,
  _ child: AST
) throws {
  // Classify the kind once; every path that does not assign `positive`
  // leaves the function.
  let positive: Bool
  switch kind {
  case .lookahead:
    positive = true

  case .negativeLookahead:
    positive = false

  case .lookbehind, .negativeLookbehind:
    throw unsupported("\(kind) assertions")

  default:
    fatalError("unreachable")
  }

  let intercept = builder.makeAddress()
  let success = builder.makeAddress()

  builder.buildSave(success)
  builder.buildSave(intercept)
  try emit(child)

  // Sub-pattern matched: drop the intercept save point.
  // NOTE(review): this presumably removes only the most recent save point.
  // If `child` leaves save points of its own behind on success (e.g. a
  // quantifier), the one removed here may not be `intercept` -- confirm.
  builder.buildClear()
  if !positive {
    // Negative assertion: also drop `success` so the failure propagates.
    builder.buildClear()
  }
  builder.buildFail()

  // Sub-pattern failed.
  builder.label(intercept)
  if positive {
    // Positive assertion: drop `success` so the failure propagates.
    builder.buildClear()
  }
  builder.buildFail()

  builder.label(success)
}
164+
165+
104166
func compileQuantification(
105167
low: Int,
106168
high: Int?,

Tests/RegexTests/CompileTests.swift

Lines changed: 38 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,28 @@ import XCTest
66

77
extension RegexTests {
88

9+
/// Compiles every regex in `equivs` and records a test failure for each one
/// whose instruction sequence differs from that of the first entry.
///
/// - Parameter equivs: Regex strings expected to compile to the same
///   instructions. The first entry is the reference; the list must not be
///   empty.
/// - Throws: Any error thrown while compiling, or the `XCTUnwrap` failure
///   raised when `equivs` is empty.
private func testCompilationEquivalence(
  _ equivs: [String]
) throws {
  let progs = try equivs.map {
    try _compileRegex($0).engine.program
  }

  // An empty list is a mistake in the calling test: report it as a test
  // failure instead of trapping on a force-unwrap.
  let ref = try XCTUnwrap(
    progs.first,
    "testCompilationEquivalence requires at least one regex")

  for prog in progs.dropFirst()
  where !ref.instructions.elementsEqual(prog.instructions) {
    XCTFail("""
      Reference:
      \(ref)
      Current:
      \(prog)
      """)
  }
}
30+
931
func testCompileQuantification() throws {
1032

1133
// NOTE: While we might change how we compile
@@ -26,15 +48,22 @@ extension RegexTests {
2648
]
2749

2850
for row in equivalents {
29-
let progs = try row.map {
30-
try _compileRegex($0).engine.program
31-
}
32-
let ref = progs.first!
33-
for prog in progs.dropFirst() {
34-
XCTAssert(ref.instructions.elementsEqual(
35-
prog.instructions))
36-
37-
}
51+
try testCompilationEquivalence(row)
52+
}
53+
}
54+
55+
/// Checks that the alternative spellings of positive and negative
/// lookahead groups are treated as equivalent by
/// `testCompilationEquivalence`.
func testCompileGroups() throws {
  let positiveLookaheads = [
    "(?= assert)",
    "(*pla: assert)",
    "(*positive_lookahead: assert)",
  ]
  let negativeLookaheads = [
    "(?! assert)",
    "(*nla: assert)",
    "(*negative_lookahead: assert)",
  ]

  // Each row lists spellings that must compile identically.
  let rows: [[String]] = [positiveLookaheads, negativeLookaheads]
  try rows.forEach { spellings in
    try testCompilationEquivalence(spellings)
  }
}
4069
}

Tests/RegexTests/MatchTests.swift

Lines changed: 47 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -589,59 +589,40 @@ extension RegexTests {
589589
input: "\u{7}\u{1b}\u{a}\n\r\t abc", match: "a")
590590
}
591591

592-
func testMatchGroups() {
593-
// MARK: Groups
594-
595-
// Named captures
596-
matchTest(
597-
#"a(?<label>b)c"#, input: "123abcxyz", match: "abc")
598-
matchTest(
599-
#"a(?'label'b)c"#, input: "123abcxyz", match: "abc")
600-
matchTest(
601-
#"a(?P<label>b)c"#, input: "123abcxyz", match: "abc")
602-
603-
// Other groups
604-
matchTest(
605-
#"a(?:b)c"#, input: "123abcxyz", match: "abc")
606-
matchTest(
607-
"(?|(a)|(b)|(c))", input: "123abcxyz", match: "a")
608-
609-
matchTest(
610-
#"(?:a|.b)c"#, input: "123abcacxyz", match: "abc")
611-
matchTest(
612-
#"(?>a|.b)c"#, input: "123abcacxyz", match: "ac", xfail: true)
613-
matchTest(
614-
"(*atomic:a|.b)c", input: "123abcacxyz", match: "ac", xfail: true)
615-
matchTest(
616-
#"(?:a+)[a-z]c"#, input: "123aacacxyz", match: "aac")
617-
matchTest(
618-
#"(?>a+)[a-z]c"#, input: "123aacacxyz", match: "ac", xfail: true)
619-
592+
func testAssertions() {
620593
matchTest(
621594
#"\d+(?= dollars)"#,
622-
input: "Price: 100 dollars", match: "100", xfail: true)
595+
input: "Price: 100 dollars", match: "100")
596+
matchTest(
597+
#"\d+(?= pesos)"#,
598+
input: "Price: 100 dollars", match: nil)
623599
matchTest(
624600
#"(?=\d+ dollars)\d+"#,
625-
input: "Price: 100 dollars", match: "100", xfail: true)
601+
input: "Price: 100 dollars", match: "100",
602+
xfail: true) // TODO
603+
626604
matchTest(
627605
#"\d+(*pla: dollars)"#,
628-
input: "Price: 100 dollars", match: "100", xfail: true)
606+
input: "Price: 100 dollars", match: "100")
629607
matchTest(
630608
#"\d+(*positive_lookahead: dollars)"#,
631-
input: "Price: 100 dollars", match: "100", xfail: true)
609+
input: "Price: 100 dollars", match: "100")
632610

633611
matchTest(
634612
#"\d+(?! dollars)"#,
635-
input: "Price: 100 pesos", match: "100", xfail: true)
613+
input: "Price: 100 pesos", match: "100")
614+
matchTest(
615+
#"\d+(?! dollars)"#,
616+
input: "Price: 100 dollars", match: "10")
636617
matchTest(
637618
#"(?!\d+ dollars)\d+"#,
638-
input: "Price: 100 pesos", match: "100", xfail: true)
619+
input: "Price: 100 pesos", match: "100")
639620
matchTest(
640621
#"\d+(*nla: dollars)"#,
641-
input: "Price: 100 pesos", match: "100", xfail: true)
622+
input: "Price: 100 pesos", match: "100")
642623
matchTest(
643624
#"\d+(*negative_lookahead: dollars)"#,
644-
input: "Price: 100 pesos", match: "100", xfail: true)
625+
input: "Price: 100 pesos", match: "100")
645626

646627
matchTest(
647628
#"(?<=USD)\d+"#, input: "Price: USD100", match: "100", xfail: true)
@@ -664,6 +645,36 @@ extension RegexTests {
664645
// engines generally enforce that lookbehinds are fixed width
665646
matchTest(
666647
#"\d{3}(?<!USD\d{3})"#, input: "Price: JYP100", match: "100", xfail: true)
648+
}
649+
650+
func testMatchGroups() {
651+
// MARK: Groups
652+
653+
// Named captures
654+
matchTest(
655+
#"a(?<label>b)c"#, input: "123abcxyz", match: "abc")
656+
matchTest(
657+
#"a(?'label'b)c"#, input: "123abcxyz", match: "abc")
658+
matchTest(
659+
#"a(?P<label>b)c"#, input: "123abcxyz", match: "abc")
660+
661+
// Other groups
662+
matchTest(
663+
#"a(?:b)c"#, input: "123abcxyz", match: "abc")
664+
matchTest(
665+
"(?|(a)|(b)|(c))", input: "123abcxyz", match: "a")
666+
667+
matchTest(
668+
#"(?:a|.b)c"#, input: "123abcacxyz", match: "abc")
669+
matchTest(
670+
#"(?>a|.b)c"#, input: "123abcacxyz", match: "ac", xfail: true)
671+
matchTest(
672+
"(*atomic:a|.b)c", input: "123abcacxyz", match: "ac", xfail: true)
673+
matchTest(
674+
#"(?:a+)[a-z]c"#, input: "123aacacxyz", match: "aac")
675+
matchTest(
676+
#"(?>a+)[a-z]c"#, input: "123aacacxyz", match: "ac", xfail: true)
677+
667678

668679
// TODO: Test example where non-atomic is significant
669680
matchTest(

0 commit comments

Comments
 (0)