diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift index bcfc8a2c2..724a49d8f 100644 --- a/Sources/_StringProcessing/ByteCodeGen.swift +++ b/Sources/_StringProcessing/ByteCodeGen.swift @@ -1,5 +1,6 @@ @_implementationOnly import _RegexParser +@available(SwiftStdlib 5.7, *) extension Compiler { struct ByteCodeGen { var options: MatchingOptions @@ -15,8 +16,15 @@ extension Compiler { } } +@available(SwiftStdlib 5.7, *) extension Compiler.ByteCodeGen { mutating func emitRoot(_ root: DSLTree.Node) throws -> Program { + // FIXME: Remove once output type erasure is represented in the matching + // engine. This workaround is to prevent a top-level `Regex` + // from being emitted as a matcher, which would be an infinite recursion. + if case let .typeErase(child) = root { + return try emitRoot(child) + } // The whole match (`.0` element of output) is equivalent to an implicit // capture over the entire regex. try emitNode(.capture(name: nil, reference: nil, root)) @@ -25,6 +33,7 @@ extension Compiler.ByteCodeGen { } } +@available(SwiftStdlib 5.7, *) fileprivate extension Compiler.ByteCodeGen { mutating func emitAtom(_ a: DSLTree.Atom) throws { defer { @@ -765,6 +774,28 @@ fileprivate extension Compiler.ByteCodeGen { case .characterPredicate: throw Unsupported("character predicates") + case .typeErase(let child): + // FIXME: This is a workaround for `Regex` not working in + // the DSL. This separates any `Regex` into its own + // compilation unit, but is less efficient. We should instead represent + // output type erasure in the matching engine (`beginTypeErase`, + // `endTypeErase`). + // + // Long-term design: + // beginTypeErase + // + // endTypeErase + let program = try Compiler(tree: DSLTree(child)).emit() + let executor = Executor(program: program) + return emitMatcher { input, startIndex, range in + guard let match: Regex.Match = try executor.match( + input, in: startIndex.. 
Executor { diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift index a912fd136..ccd8a01d0 100644 --- a/Sources/_StringProcessing/ConsumerInterface.swift +++ b/Sources/_StringProcessing/ConsumerInterface.swift @@ -39,7 +39,7 @@ extension DSLTree.Node { case .orderedChoice, .conditional, .concatenation, .capture, .nonCapturingGroup, .quantification, .trivia, .empty, - .absentFunction: return nil + .absentFunction, .typeErase: return nil case .consumer: fatalError("FIXME: Is this where we handle them?") diff --git a/Sources/_StringProcessing/Engine/Instruction.swift b/Sources/_StringProcessing/Engine/Instruction.swift index 9144c031f..a705b6bee 100644 --- a/Sources/_StringProcessing/Engine/Instruction.swift +++ b/Sources/_StringProcessing/Engine/Instruction.swift @@ -278,6 +278,15 @@ extension Instruction { /// case backreference + /// Push a new type erasure scope into the capture stack. + case beginTypeErase + + /// Pop the last type erasure scope, create an `AnyRegexOutput` from that + /// scope, and store it in a value register. + /// + /// endTypeErase(_: ValReg) + case endTypeErase + // MARK: Matching: State transitions // TODO: State transitions need more work. 
We want diff --git a/Sources/_StringProcessing/Engine/MEBuilder.swift b/Sources/_StringProcessing/Engine/MEBuilder.swift index 13b2d3798..b853cee4c 100644 --- a/Sources/_StringProcessing/Engine/MEBuilder.swift +++ b/Sources/_StringProcessing/Engine/MEBuilder.swift @@ -255,6 +255,14 @@ extension MEProgram.Builder { .init(capture: cap, transform: trans))) } + mutating func buildBeginTypeErase() { + instructions.append(.init(.beginTypeErase)) + } + + mutating func buildEndTypeErase() { + instructions.append(.init(.endTypeErase)) + } + mutating func buildMatcher( _ fun: MatcherRegister, into reg: ValueRegister ) { diff --git a/Sources/_StringProcessing/Engine/Processor.swift b/Sources/_StringProcessing/Engine/Processor.swift index a81d2ce06..14ad9fc5e 100644 --- a/Sources/_StringProcessing/Engine/Processor.swift +++ b/Sources/_StringProcessing/Engine/Processor.swift @@ -527,6 +527,12 @@ extension Processor { storedCaptures[capNum].registerValue( value, overwriteInitial: sp) controller.step() + + case .beginTypeErase: + fatalError("Unimplemented") + + case .endTypeErase: + fatalError("Unimplemented") } } diff --git a/Sources/_StringProcessing/Executor.swift b/Sources/_StringProcessing/Executor.swift index 295a732de..61b98f866 100644 --- a/Sources/_StringProcessing/Executor.swift +++ b/Sources/_StringProcessing/Executor.swift @@ -11,6 +11,7 @@ @_implementationOnly import _RegexParser +@available(SwiftStdlib 5.7, *) struct Executor { // TODO: consider let, for now lets us toggle tracing var engine: Engine diff --git a/Sources/_StringProcessing/PrintAsPattern.swift b/Sources/_StringProcessing/PrintAsPattern.swift index 03fca2e1f..07fe0773b 100644 --- a/Sources/_StringProcessing/PrintAsPattern.swift +++ b/Sources/_StringProcessing/PrintAsPattern.swift @@ -234,6 +234,9 @@ extension PrettyPrinter { case .absentFunction: print("/* TODO: absent function */") + + case .typeErase: + print("/* TODO: type erasure */") } } diff --git a/Sources/_StringProcessing/Regex/Core.swift 
b/Sources/_StringProcessing/Regex/Core.swift index 882d9069d..be62035af 100644 --- a/Sources/_StringProcessing/Regex/Core.swift +++ b/Sources/_StringProcessing/Regex/Core.swift @@ -56,7 +56,13 @@ public struct Regex: RegexComponent { } public var regex: Regex { - self + if Output.self == AnyRegexOutput.self { + if case .typeErase = root { + return self + } + return .init(node: .typeErase(root)) + } + return self } } diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index 72c5f1526..e45d8120f 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -93,6 +93,10 @@ extension DSLTree { case matcher(Any.Type, _MatcherInterface) + // MARK: - Type erasure + + case typeErase(Node) + // TODO: Would this just boil down to a consumer? case characterPredicate(_CharacterPredicateInterface) } @@ -265,6 +269,7 @@ extension DSLTree.Node { case let .capture(_, _, n, _): return [n] case let .nonCapturingGroup(_, n): return [n] case let .quantification(_, _, n): return [n] + case let .typeErase(n): return [n] case let .conditional(_, t, f): return [t,f] @@ -486,6 +491,7 @@ public struct CaptureTransform: Hashable, CustomStringConvertible { // These wrapper types are required because even @_spi-marked public APIs can't // include symbols from implementation-only dependencies. 
+@available(SwiftStdlib 5.7, *) extension DSLTree.Node { func _addCaptures( to list: inout CaptureList, @@ -551,7 +557,7 @@ extension DSLTree.Node { break case .customCharacterClass, .atom, .trivia, .empty, - .quotedLiteral, .consumer, .characterPredicate: + .quotedLiteral, .consumer, .characterPredicate, .typeErase: break } } @@ -566,7 +572,7 @@ extension DSLTree.Node { .conditional, .quantification, .customCharacterClass, .atom, .trivia, .empty, .quotedLiteral, .regexLiteral, .absentFunction, .convertedRegexLiteral, .consumer, - .characterPredicate, .matcher: + .characterPredicate, .matcher, .typeErase: return false } } @@ -583,16 +589,28 @@ extension DSLTree.Node { /// Returns the type of the whole match, i.e. `.0` element type of the output. var wholeMatchType: Any.Type { - if case .matcher(let type, _) = outputDefiningNode { + switch outputDefiningNode { + case .matcher(let type, _): return type + case .typeErase: + return AnyRegexOutput.self + default: + return Substring.self } - return Substring.self } } extension DSLTree { + @available(SwiftStdlib 5.7, *) var captureList: CaptureList { var list = CaptureList() + // FIXME: This is peering through any top-level `.typeErase`. Once type + // erasure is handled in the engine, this can be simplified to using `root` + // directly. 
+ var root = root + while case let .typeErase(child) = root { + root = child + } list.append(.init(type: root.wholeMatchType, optionalDepth: 0, .fake)) root._addCaptures(to: &list, optionalNesting: 0) return list @@ -620,6 +638,7 @@ extension DSLTree { case let .capture(_, _, n, _): return [_Tree(n)] case let .nonCapturingGroup(_, n): return [_Tree(n)] case let .quantification(_, _, n): return [_Tree(n)] + case let .typeErase(n): return [_Tree(n)] case let .conditional(_, t, f): return [_Tree(t), _Tree(f)] diff --git a/Tests/RegexBuilderTests/RegexDSLTests.swift b/Tests/RegexBuilderTests/RegexDSLTests.swift index fc31e575f..c90e04a4e 100644 --- a/Tests/RegexBuilderTests/RegexDSLTests.swift +++ b/Tests/RegexBuilderTests/RegexDSLTests.swift @@ -1071,6 +1071,78 @@ class RegexDSLTests: XCTestCase { } } } + + func testTypeErasedRegexInDSL() throws { + do { + let input = "johnappleseed: 12." + let numberRegex = try! Regex(#"(\d+)\.?"#) + let regex = Regex { + Capture { + OneOrMore(.word) + } + ZeroOrMore(.whitespace) + ":" + ZeroOrMore(.whitespace) + numberRegex + } + let match = try XCTUnwrap(input.wholeMatch(of: regex)) + XCTAssertEqual(match.0, input[...]) + XCTAssertEqual(match.1, "johnappleseed") + } + do { + let input = "johnappleseed: 12." + let numberRegex = try! Regex(#"(\d+)\.?"#) + let regex = Regex { + Capture { + OneOrMore(.word) + } + ZeroOrMore(.whitespace) + ":" + ZeroOrMore(.whitespace) + Capture { numberRegex } + } + let match = try XCTUnwrap(input.wholeMatch(of: regex)) + XCTAssertEqual(match.0, input[...]) + XCTAssertEqual(match.1, "johnappleseed") + XCTAssertEqual(match.2[0].value as? Substring, "12.") + XCTAssertEqual(match.2[1].value as? Substring, "12") + } + do { + let input = "johnappleseed: 12." + // Anchors should be with respect to the entire input. + let numberRegex = try! 
Regex(#"^(\d+)\.?"#) + let regex = Regex { + Capture { + OneOrMore(.word) + } + ZeroOrMore(.whitespace) + ":" + ZeroOrMore(.whitespace) + Capture { numberRegex } + } + XCTAssertNil(input.wholeMatch(of: regex)) + } + do { + let input = "johnappleseed: 12.[12]" + // Backreferences in a type-erased regex are scoped to the type-erased + // regex itself. `\1` here should refer to "12", not "johnappleseed" + let numberRegex = try! Regex(#"(\d+)\.?\[\1\]"#) + let regex = Regex { + Capture { + OneOrMore(.word) + } + ZeroOrMore(.whitespace) + ":" + ZeroOrMore(.whitespace) + Capture { numberRegex } + } + let match = try XCTUnwrap(input.wholeMatch(of: regex)) + XCTAssertEqual(match.0, input[...]) + XCTAssertEqual(match.1, "johnappleseed") + XCTAssertEqual(match.2[0].value as? Substring, "12.[12]") + XCTAssertEqual(match.2[1].value as? Substring, "12") + } + } } extension Unicode.Scalar {