Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 30 additions & 0 deletions Documentation/ProgrammersManual.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Programmer's Manual

## Programming patterns

### Engine quick checks and fast paths

In the engine nomenclature, a quick-check results in a yes/no/maybe while a thorough check always results in a definite answer.

The nature of quick checks and fast paths is that they bifurcate testing coverage: a given input exercises either the fast path or the thorough path, but not both. One easy way to mitigate this in simple cases is to assert that a definite quick result matches the thorough result.

One example of this pattern is matching against a builtin character class. The engine has a `_matchBuiltinCC` function that combines a quick check with a thorough check:

```swift
func _matchBuiltinCC(...) -> Input.Index? {
// Calls _quickMatchBuiltinCC. If that gives a definite result, asserts
// that it is the same as the result of _thoroughMatchBuiltinCC and
// returns it. Otherwise, returns the result of _thoroughMatchBuiltinCC.
}

@inline(__always)
func _quickMatchBuiltinCC(...) -> QuickResult<Input.Index?>

@inline(never)
func _thoroughMatchBuiltinCC(...) -> Input.Index?
```

The thorough check is never inlined, as it is a lot of cold code. Note that quick and thorough functions should be pure; that is, they shouldn't update processor state.


267 changes: 168 additions & 99 deletions Sources/_StringProcessing/Engine/MEBuiltins.swift
Original file line number Diff line number Diff line change
Expand Up @@ -9,114 +9,27 @@ extension Character {
}

extension Processor {
// Tries to match the built-in character class `cc` at `currentPosition`.
// On success, moves `currentPosition` to the returned index and returns
// true; on failure, calls `signalFailure()` and returns false.
//
// NOTE(review): this span is a rendered diff, so the removed lines
// (`matchBuiltin`, `_doMatchBuiltin`, unlabeled arguments) and the added
// lines (`matchBuiltinCC`, `_matchBuiltinCC`, labeled arguments) appear
// back to back without +/- markers.
mutating func matchBuiltin(
mutating func matchBuiltinCC(
_ cc: _CharacterClassModel.Representation,
_ isInverted: Bool,
_ isStrictASCII: Bool,
_ isScalarSemantics: Bool
isInverted: Bool,
isStrictASCII: Bool,
isScalarSemantics: Bool
) -> Bool {
guard let next = _doMatchBuiltin(
guard let next = _matchBuiltinCC(
cc,
isInverted,
isStrictASCII,
isScalarSemantics
in: input,
at: currentPosition,
isInverted: isInverted,
isStrictASCII: isStrictASCII,
isScalarSemantics: isScalarSemantics
) else {
// No match: report failure to the processor and leave the position as is.
signalFailure()
return false
}
// Match: consume the matched element.
currentPosition = next
return true
}

// Matches the built-in character class `cc` at `currentPosition` without
// changing processor state. Returns the index just past the matched element,
// or nil when `load()` / `loadScalar()` yield nothing or when the class
// (after applying `isInverted`) does not match.
func _doMatchBuiltin(
  _ cc: _CharacterClassModel.Representation,
  _ isInverted: Bool,
  _ isStrictASCII: Bool,
  _ isScalarSemantics: Bool
) -> Input.Index? {
  guard let character = load(), let unicodeScalar = loadScalar() else {
    return nil
  }

  // Strict-ASCII mode only lets ASCII elements satisfy a class.
  let passesASCIICheck: Bool
  if isStrictASCII {
    passesASCIICheck =
      (unicodeScalar.isASCII && isScalarSemantics) || character.isASCII
  } else {
    passesASCIICheck = true
  }

  // `.anyGrapheme` and `.anyScalar` fix their own step size; every other
  // class steps according to the requested semantic level.
  let stepsByScalar: Bool
  switch cc {
  case .anyGrapheme:
    stepsByScalar = false
  case .anyScalar:
    stepsByScalar = true
  case .any, .digit, .horizontalWhitespace, .verticalWhitespace,
       .newlineSequence, .whitespace, .word:
    stepsByScalar = isScalarSemantics
  }
  var next: Input.Index = stepsByScalar
    ? input.unicodeScalars.index(after: currentPosition)
    : input.index(after: currentPosition)

  let classMatches: Bool
  switch cc {
  case .any, .anyGrapheme:
    classMatches = true
  case .anyScalar:
    // Under grapheme semantics a single scalar only matches when stepping
    // over it lands on a grapheme cluster boundary.
    classMatches = isScalarSemantics || input.isOnGraphemeClusterBoundary(next)
  case .digit:
    classMatches = (isScalarSemantics
      ? unicodeScalar.properties.numericType != nil
      : character.isNumber) && passesASCIICheck
  case .horizontalWhitespace:
    classMatches = (isScalarSemantics
      ? unicodeScalar.isHorizontalWhitespace
      : character._isHorizontalWhitespace) && passesASCIICheck
  case .verticalWhitespace:
    classMatches = (isScalarSemantics
      ? unicodeScalar.isNewline
      : character._isNewline) && passesASCIICheck
  case .newlineSequence:
    if isScalarSemantics {
      classMatches = unicodeScalar.isNewline && passesASCIICheck
      if classMatches,
         unicodeScalar == "\r",
         next != input.endIndex,
         input.unicodeScalars[next] == "\n" {
        // Match a full CR-LF sequence even in scalar semantics
        next = input.unicodeScalars.index(after: next)
      }
    } else {
      classMatches = character._isNewline && passesASCIICheck
    }
  case .whitespace:
    classMatches = (isScalarSemantics
      ? unicodeScalar.properties.isWhitespace
      : character.isWhitespace) && passesASCIICheck
  case .word:
    classMatches = (isScalarSemantics
      ? unicodeScalar.properties.isAlphabetic
      : character.isWordCharacter) && passesASCIICheck
  }

  // Inversion flips the verdict; only a positive verdict yields an index.
  return classMatches != isInverted ? next : nil
}

func isAtStartOfLine(_ payload: AssertionPayload) -> Bool {
if currentPosition == subjectBounds.lowerBound { return true }
switch payload.semanticLevel {
Expand All @@ -126,7 +39,7 @@ extension Processor {
return input.unicodeScalars[input.unicodeScalars.index(before: currentPosition)].isNewline
}
}

func isAtEndOfLine(_ payload: AssertionPayload) -> Bool {
if currentPosition == subjectBounds.upperBound { return true }
switch payload.semanticLevel {
Expand Down Expand Up @@ -169,7 +82,7 @@ extension Processor {
return isAtStartOfLine(payload)
case .endOfLine:
return isAtEndOfLine(payload)

case .caretAnchor:
if payload.anchorsMatchNewlines {
return isAtStartOfLine(payload)
Expand Down Expand Up @@ -202,3 +115,159 @@ extension Processor {
}
}
}

// MARK: Built-in character class matching

// Mentioned in ProgrammersManual.md, update docs if redesigned
//
// Matches the built-in character class `cc` in `input` at `currentPosition`.
// Returns the index produced by the quick or thorough matcher, or nil when
// `currentPosition` is at or past `input.endIndex` or the matcher reports
// no match.
//
// NOTE(review): the lines between the final `return` and the closing brace
// are review-thread text captured from the rendered page, not Swift.
@_effects(releasenone)
func _matchBuiltinCC(
_ cc: _CharacterClassModel.Representation,
in input: String,
at currentPosition: String.Index,
isInverted: Bool,
isStrictASCII: Bool,
isScalarSemantics: Bool
) -> String.Index? {
// Both helpers assert an in-bounds position, so bail out first.
guard currentPosition < input.endIndex else {
return nil
}
// Fast path: use the quick check's answer when it is definite.
if case .definite(let result) = _quickMatchBuiltinCC(
cc,
in: input,
at: currentPosition,
isInverted: isInverted,
isStrictASCII: isStrictASCII,
isScalarSemantics: isScalarSemantics
) {
// Cross-check the quick answer against the thorough one (assert only).
assert(result == _thoroughMatchBuiltinCC(
cc,
in: input,
at: currentPosition,
isInverted: isInverted,
isStrictASCII: isStrictASCII,
isScalarSemantics: isScalarSemantics))
return result
}
// The quick check was inconclusive; fall back to the thorough check.
return _thoroughMatchBuiltinCC(
cc,
in: input,
at: currentPosition,
isInverted: isInverted,
isStrictASCII: isStrictASCII,
isScalarSemantics: isScalarSemantics)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[Future] It would be great if we had a macro to expand into this pattern, since we have to essentially repeat the same function call three times. Something like:

return #quickMatch( _thoroughMatchBuiltinCC( cc, in: input, at: currentPosition, isInverted: isInverted, isStrictASCII: isStrictASCII, isScalarSemantics: isScalarSemantics))
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, that would be nice. When I did the string re-gutting, I had some (always-inline) functions that took two different closures and applied this kind of pattern, but it turned too unwieldy and for the UTF-8 switch it ended up being better to just stamp the code out manually. A macro would be ideal here.

}

// Mentioned in ProgrammersManual.md, update docs if redesigned
//
// Fast-path match of the built-in character class `cc` at `currentPosition`.
// Yields `.unknown` when `input._quickMatch` has no answer. Otherwise yields
// `.definite` carrying the index after the match, or nil when the (possibly
// inverted) class does not match.
// Requires `currentPosition < input.endIndex` (asserted).
//
// NOTE(review): `isStrictASCII` is accepted but never read here —
// presumably the quick path only answers in cases where strictness cannot
// change the outcome; confirm against `_quickMatch`.
@_effects(releasenone)
@inline(__always)
func _quickMatchBuiltinCC(
  _ cc: _CharacterClassModel.Representation,
  in input: String,
  at currentPosition: String.Index,
  isInverted: Bool,
  isStrictASCII: Bool,
  isScalarSemantics: Bool
) -> QuickResult<String.Index?> {
  assert(currentPosition < input.endIndex)

  guard let quick = input._quickMatch(
    cc, at: currentPosition, isScalarSemantics: isScalarSemantics
  ) else {
    return .unknown
  }

  let (end, classMatches) = quick

  // A verdict equal to `isInverted` means "no match" once inversion is applied.
  let outcome: String.Index?
  if classMatches == isInverted {
    outcome = nil
  } else {
    outcome = end
  }
  return .definite(outcome)
}

// Mentioned in ProgrammersManual.md, update docs if redesigned
//
// Thorough match of the built-in character class `cc` in `input` at
// `currentPosition`. Returns the index just past the matched element, or nil
// when the class (after applying `isInverted`) does not match.
// Requires `currentPosition < input.endIndex` (asserted).
@_effects(releasenone)
@inline(never)
func _thoroughMatchBuiltinCC(
  _ cc: _CharacterClassModel.Representation,
  in input: String,
  at currentPosition: String.Index,
  isInverted: Bool,
  isStrictASCII: Bool,
  isScalarSemantics: Bool
) -> String.Index? {
  assert(currentPosition < input.endIndex)
  let character = input[currentPosition]
  let unicodeScalar = input.unicodeScalars[currentPosition]

  // Strict-ASCII mode only lets ASCII elements satisfy a class.
  let passesASCIICheck: Bool
  if isStrictASCII {
    passesASCIICheck =
      (unicodeScalar.isASCII && isScalarSemantics) || character.isASCII
  } else {
    passesASCIICheck = true
  }

  // `.anyGrapheme` and `.anyScalar` fix their own step size; every other
  // class steps according to the requested semantic level.
  let stepsByScalar: Bool
  switch cc {
  case .anyGrapheme:
    stepsByScalar = false
  case .anyScalar:
    stepsByScalar = true
  case .any, .digit, .horizontalWhitespace, .verticalWhitespace,
       .newlineSequence, .whitespace, .word:
    stepsByScalar = isScalarSemantics
  }
  var next: String.Index = stepsByScalar
    ? input.unicodeScalars.index(after: currentPosition)
    : input.index(after: currentPosition)

  let classMatches: Bool
  switch cc {
  case .any, .anyGrapheme:
    classMatches = true
  case .anyScalar:
    // Under grapheme semantics a single scalar only matches when stepping
    // over it lands on a grapheme cluster boundary.
    classMatches = isScalarSemantics || input.isOnGraphemeClusterBoundary(next)
  case .digit:
    classMatches = (isScalarSemantics
      ? unicodeScalar.properties.numericType != nil
      : character.isNumber) && passesASCIICheck
  case .horizontalWhitespace:
    classMatches = (isScalarSemantics
      ? unicodeScalar.isHorizontalWhitespace
      : character._isHorizontalWhitespace) && passesASCIICheck
  case .verticalWhitespace:
    classMatches = (isScalarSemantics
      ? unicodeScalar.isNewline
      : character._isNewline) && passesASCIICheck
  case .newlineSequence:
    if isScalarSemantics {
      classMatches = unicodeScalar.isNewline && passesASCIICheck
      if classMatches,
         unicodeScalar == "\r",
         next != input.endIndex,
         input.unicodeScalars[next] == "\n" {
        // Match a full CR-LF sequence even in scalar semantics
        next = input.unicodeScalars.index(after: next)
      }
    } else {
      classMatches = character._isNewline && passesASCIICheck
    }
  case .whitespace:
    classMatches = (isScalarSemantics
      ? unicodeScalar.properties.isWhitespace
      : character.isWhitespace) && passesASCIICheck
  case .word:
    classMatches = (isScalarSemantics
      ? unicodeScalar.properties.isAlphabetic
      : character.isWordCharacter) && passesASCIICheck
  }

  // Inversion flips the verdict; only a positive verdict yields an index.
  return classMatches != isInverted ? next : nil
}

10 changes: 6 additions & 4 deletions Sources/_StringProcessing/Engine/MEQuantify.swift
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,13 @@ extension Processor {
UnicodeScalar.init(_value: UInt32(payload.asciiChar)), true)
case .builtin:
// We only emit .quantify if it consumes a single character
next = _doMatchBuiltin(
next = _matchBuiltinCC(
payload.builtin,
payload.builtinIsInverted,
payload.builtinIsStrict,
false)
in: input,
at: currentPosition,
isInverted: payload.builtinIsInverted,
isStrictASCII: payload.builtinIsStrict,
isScalarSemantics: false)
case .any:
let matched = currentPosition != input.endIndex
&& (!input[currentPosition].isNewline || payload.anyMatchesNewline)
Expand Down
8 changes: 4 additions & 4 deletions Sources/_StringProcessing/Engine/Processor.swift
Original file line number Diff line number Diff line change
Expand Up @@ -583,11 +583,11 @@ extension Processor {

case .matchBuiltin:
let payload = payload.characterClassPayload
if matchBuiltin(
if matchBuiltinCC(
payload.cc,
payload.isInverted,
payload.isStrictASCII,
payload.isScalarSemantics
isInverted: payload.isInverted,
isStrictASCII: payload.isStrictASCII,
isScalarSemantics: payload.isScalarSemantics
) {
controller.step()
}
Expand Down
Loading