Skip to content

Commit

Permalink
Allow Regex<AnyRegexOutput> to be used in the DSL.
Browse files Browse the repository at this point in the history
Apply a workaround to allow `Regex<AnyRegexOutput>` to be used in the DSL. This workaround emits each nested `Regex<AnyRegexOutput>` as a custom matcher so that it's essentially treated as a separate compilation unit.

A proper fix for this is to introduce scoped type erasure in the matching engine so that all type erasure (including the top-level one) goes through this model. I left some stubs in (`beginTypeErase`, `endTypeErase`) which I'll implement in a follow-up.

Since this implementation is using `Executor.match(...) -> Regex.Match` in the regex compiler, we need to add availability annotations to the `Executor` and `Compiler` types.

Resolves rdar://94320030.
  • Loading branch information
rxwei committed Jun 21, 2022
1 parent cc9efb9 commit 63124f5
Show file tree
Hide file tree
Showing 11 changed files with 163 additions and 6 deletions.
31 changes: 31 additions & 0 deletions Sources/_StringProcessing/ByteCodeGen.swift
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
@_implementationOnly import _RegexParser

@available(SwiftStdlib 5.7, *)
extension Compiler {
struct ByteCodeGen {
var options: MatchingOptions
Expand All @@ -15,8 +16,15 @@ extension Compiler {
}
}

@available(SwiftStdlib 5.7, *)
extension Compiler.ByteCodeGen {
mutating func emitRoot(_ root: DSLTree.Node) throws -> Program {
// FIXME: Remove once output type erasure is represented in the matching
// engine. This workaround is to prevent a top-level `Regex<AnyRegexOutput>`
// from being emitted as a matcher, which would be an infinite recursion.
if case let .typeErase(child) = root {
return try emitRoot(child)
}
// The whole match (`.0` element of output) is equivalent to an implicit
// capture over the entire regex.
try emitNode(.capture(name: nil, reference: nil, root))
Expand All @@ -25,6 +33,7 @@ extension Compiler.ByteCodeGen {
}
}

@available(SwiftStdlib 5.7, *)
fileprivate extension Compiler.ByteCodeGen {
mutating func emitAtom(_ a: DSLTree.Atom) throws {
defer {
Expand Down Expand Up @@ -765,6 +774,28 @@ fileprivate extension Compiler.ByteCodeGen {
case .characterPredicate:
throw Unsupported("character predicates")

case .typeErase(let child):
// FIXME: This is a workaround for `Regex<AnyRegexOutput>` not working in
// the DSL. This separates any `Regex<AnyRegexOutput>` into its own
// compilation unit, but is less efficient. We should instead represent
// output type erasure in the matching engine (`beginTypeErase`,
// `endTypeErase`).
//
// Long-term design:
// beginTypeErase
// <code for child>
// endTypeErase
let program = try Compiler(tree: DSLTree(child)).emit()
let executor = Executor(program: program)
return emitMatcher { input, startIndex, range in
guard let match: Regex<AnyRegexOutput>.Match = try executor.match(
input, in: startIndex..<range.upperBound, .partialFromFront
) else {
return nil
}
return (match.range.upperBound, match.output)
}

case .trivia, .empty:
return nil
}
Expand Down
2 changes: 2 additions & 0 deletions Sources/_StringProcessing/Compiler.swift
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

@_implementationOnly import _RegexParser

@available(SwiftStdlib 5.7, *)
class Compiler {
let tree: DSLTree

Expand All @@ -34,6 +35,7 @@ class Compiler {
}
}

@available(SwiftStdlib 5.7, *)
func _compileRegex(
_ regex: String, _ syntax: SyntaxOptions = .traditional
) throws -> Executor {
Expand Down
2 changes: 1 addition & 1 deletion Sources/_StringProcessing/ConsumerInterface.swift
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ extension DSLTree.Node {
case .orderedChoice, .conditional, .concatenation,
.capture, .nonCapturingGroup,
.quantification, .trivia, .empty,
.absentFunction: return nil
.absentFunction, .typeErase: return nil

case .consumer:
fatalError("FIXME: Is this where we handle them?")
Expand Down
9 changes: 9 additions & 0 deletions Sources/_StringProcessing/Engine/Instruction.swift
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,15 @@ extension Instruction {
///
case backreference

/// Push a new type erasure scope into the capture stack.
case beginTypeErase

/// Pop the last type erasure scope, create a `AnyRegexOutput` from that
/// scope, and store it in a value register.
///
/// endTypeErase(_: ValReg)
case endTypeErase

// MARK: Matching: State transitions

// TODO: State transitions need more work. We want
Expand Down
8 changes: 8 additions & 0 deletions Sources/_StringProcessing/Engine/MEBuilder.swift
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,14 @@ extension MEProgram.Builder {
.init(capture: cap, transform: trans)))
}

mutating func buildBeginTypeErase() {
instructions.append(.init(.beginTypeErase))
}

mutating func buildEndTypeErase() {
instructions.append(.init(.endTypeErase))
}

mutating func buildMatcher(
_ fun: MatcherRegister, into reg: ValueRegister
) {
Expand Down
6 changes: 6 additions & 0 deletions Sources/_StringProcessing/Engine/Processor.swift
Original file line number Diff line number Diff line change
Expand Up @@ -527,6 +527,12 @@ extension Processor {
storedCaptures[capNum].registerValue(
value, overwriteInitial: sp)
controller.step()

case .beginTypeErase:
fatalError("Unimplemented")

case .endTypeErase:
fatalError("Unimplemented")
}

}
Expand Down
1 change: 1 addition & 0 deletions Sources/_StringProcessing/Executor.swift
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

@_implementationOnly import _RegexParser

@available(SwiftStdlib 5.7, *)
struct Executor {
// TODO: consider let, for now lets us toggle tracing
var engine: Engine<String>
Expand Down
3 changes: 3 additions & 0 deletions Sources/_StringProcessing/PrintAsPattern.swift
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,9 @@ extension PrettyPrinter {

case .absentFunction:
print("/* TODO: absent function */")

case .typeErase:
print("/* TODO: type erasure */")
}
}

Expand Down
8 changes: 7 additions & 1 deletion Sources/_StringProcessing/Regex/Core.swift
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,13 @@ public struct Regex<Output>: RegexComponent {
}

public var regex: Regex<Output> {
self
if Output.self == AnyRegexOutput.self {
if case .typeErase = root {
return self
}
return .init(node: .typeErase(root))
}
return self
}
}

Expand Down
27 changes: 23 additions & 4 deletions Sources/_StringProcessing/Regex/DSLTree.swift
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,10 @@ extension DSLTree {

case matcher(Any.Type, _MatcherInterface)

// MARK: - Type erasure

case typeErase(Node)

// TODO: Would this just boil down to a consumer?
case characterPredicate(_CharacterPredicateInterface)
}
Expand Down Expand Up @@ -265,6 +269,7 @@ extension DSLTree.Node {
case let .capture(_, _, n, _): return [n]
case let .nonCapturingGroup(_, n): return [n]
case let .quantification(_, _, n): return [n]
case let .typeErase(n): return [n]

case let .conditional(_, t, f): return [t,f]

Expand Down Expand Up @@ -486,6 +491,7 @@ public struct CaptureTransform: Hashable, CustomStringConvertible {
// These wrapper types are required because even @_spi-marked public APIs can't
// include symbols from implementation-only dependencies.

@available(SwiftStdlib 5.7, *)
extension DSLTree.Node {
func _addCaptures(
to list: inout CaptureList,
Expand Down Expand Up @@ -551,7 +557,7 @@ extension DSLTree.Node {
break

case .customCharacterClass, .atom, .trivia, .empty,
.quotedLiteral, .consumer, .characterPredicate:
.quotedLiteral, .consumer, .characterPredicate, .typeErase:
break
}
}
Expand All @@ -566,7 +572,7 @@ extension DSLTree.Node {
.conditional, .quantification, .customCharacterClass, .atom,
.trivia, .empty, .quotedLiteral, .regexLiteral, .absentFunction,
.convertedRegexLiteral, .consumer,
.characterPredicate, .matcher:
.characterPredicate, .matcher, .typeErase:
return false
}
}
Expand All @@ -583,16 +589,28 @@ extension DSLTree.Node {

/// Returns the type of the whole match, i.e. `.0` element type of the output.
var wholeMatchType: Any.Type {
if case .matcher(let type, _) = outputDefiningNode {
switch outputDefiningNode {
case .matcher(let type, _):
return type
case .typeErase:
return AnyRegexOutput.self
default:
return Substring.self
}
return Substring.self
}
}

extension DSLTree {
@available(SwiftStdlib 5.7, *)
var captureList: CaptureList {
var list = CaptureList()
// FIXME: This is peering through any top-level `.typeErase`. Once type
// erasure was handled in the engine, this can be simplified to using `root`
// directly.
var root = root
while case let .typeErase(child) = root {
root = child
}
list.append(.init(type: root.wholeMatchType, optionalDepth: 0, .fake))
root._addCaptures(to: &list, optionalNesting: 0)
return list
Expand Down Expand Up @@ -620,6 +638,7 @@ extension DSLTree {
case let .capture(_, _, n, _): return [_Tree(n)]
case let .nonCapturingGroup(_, n): return [_Tree(n)]
case let .quantification(_, _, n): return [_Tree(n)]
case let .typeErase(n): return [_Tree(n)]

case let .conditional(_, t, f): return [_Tree(t), _Tree(f)]

Expand Down
72 changes: 72 additions & 0 deletions Tests/RegexBuilderTests/RegexDSLTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -1071,6 +1071,78 @@ class RegexDSLTests: XCTestCase {
}
}
}

func testTypeErasedRegexInDSL() throws {
do {
let input = "johnappleseed: 12."
let numberRegex = try! Regex(#"(\d+)\.?"#)
let regex = Regex {
Capture {
OneOrMore(.word)
}
ZeroOrMore(.whitespace)
":"
ZeroOrMore(.whitespace)
numberRegex
}
let match = try XCTUnwrap(input.wholeMatch(of: regex))
XCTAssertEqual(match.0, input[...])
XCTAssertEqual(match.1, "johnappleseed")
}
do {
let input = "johnappleseed: 12."
let numberRegex = try! Regex(#"(\d+)\.?"#)
let regex = Regex {
Capture {
OneOrMore(.word)
}
ZeroOrMore(.whitespace)
":"
ZeroOrMore(.whitespace)
Capture { numberRegex }
}
let match = try XCTUnwrap(input.wholeMatch(of: regex))
XCTAssertEqual(match.0, input[...])
XCTAssertEqual(match.1, "johnappleseed")
XCTAssertEqual(match.2[0].value as? Substring, "12.")
XCTAssertEqual(match.2[1].value as? Substring, "12")
}
do {
let input = "johnappleseed: 12."
// Anchors should be with respect to the entire input.
let numberRegex = try! Regex(#"^(\d+)\.?"#)
let regex = Regex {
Capture {
OneOrMore(.word)
}
ZeroOrMore(.whitespace)
":"
ZeroOrMore(.whitespace)
Capture { numberRegex }
}
XCTAssertNil(input.wholeMatch(of: regex))
}
do {
let input = "johnappleseed: 12.[12]"
// Backreferences in a type-erased regex are scoped to the type-erased
// regex itself. `\1` here should refer to "12", not "johnappleseed"
let numberRegex = try! Regex(#"(\d+)\.?\[\1\]"#)
let regex = Regex {
Capture {
OneOrMore(.word)
}
ZeroOrMore(.whitespace)
":"
ZeroOrMore(.whitespace)
Capture { numberRegex }
}
let match = try XCTUnwrap(input.wholeMatch(of: regex))
XCTAssertEqual(match.0, input[...])
XCTAssertEqual(match.1, "johnappleseed")
XCTAssertEqual(match.2[0].value as? Substring, "12.[12]")
XCTAssertEqual(match.2[1].value as? Substring, "12")
}
}
}

extension Unicode.Scalar {
Expand Down

0 comments on commit 63124f5

Please sign in to comment.