swiftlang
diff --git a/Sources/RegexBuilder/CharacterClass.swift b/Sources/RegexBuilder/CharacterClass.swift
Lines changed: 23 additions & 11 deletions
diff --git a/Sources/_StringProcessing/ByteCodeGen.swift b/Sources/_StringProcessing/ByteCodeGen.swift
Lines changed: 18 additions & 138 deletions
diff --git a/Sources/_StringProcessing/ConsumerInterface.swift b/Sources/_StringProcessing/ConsumerInterface.swift
Lines changed: 15 additions & 11 deletions
@@ -15,27 +15,39 @@
 @available(SwiftStdlib 5.7, *)
 public struct CharacterClass {
   internal var ccc: DSLTree.CustomCharacterClass
+  /// The builtin character class, if this CharacterClass is representable by one
+  internal var builtin: DSLTree.Atom.CharacterClass? // nil when the value was created from a custom character class
 
   init(_ ccc: DSLTree.CustomCharacterClass) {
     self.ccc = ccc
+    self.builtin = nil // no builtin counterpart is recorded for a custom class
   }
 
-  init(unconverted atom: DSLTree._AST.Atom) {
-    self.ccc = .init(members: [.atom(.unconverted(atom))])
+  init(builtin: DSLTree.Atom.CharacterClass) {
+    self.ccc = .init(members: [.atom(.characterClass(builtin))]) // keep `ccc` populated: a one-member custom class wrapping the builtin
+    self.builtin = builtin // lets `regex` and `inverted` take the builtin path
   }
 }
 
 @available(SwiftStdlib 5.7, *)
 extension CharacterClass: RegexComponent {
   public var regex: Regex<Substring> {
-    _RegexFactory().customCharacterClass(ccc)
+    if let cc = builtin { // builtin-backed: build the regex from the builtin class directly
+      return _RegexFactory().characterClass(cc)
+    } else { // no builtin recorded: use the custom character class, as before this change
+      return _RegexFactory().customCharacterClass(ccc)
+    }
   }
 }
 
 @available(SwiftStdlib 5.7, *)
 extension CharacterClass {
   public var inverted: CharacterClass {
-    CharacterClass(ccc.inverted)
+    if let inv = builtin?.inverted { // nil when there is no builtin (presumably also if the builtin's `inverted` can itself be nil; confirm)
+      return CharacterClass(builtin: inv) // result stays builtin-backed
+    } else {
+      return CharacterClass(ccc.inverted) // previous behavior: invert the custom class
+    }
   }
 }
 
@@ -50,15 +62,15 @@ extension RegexComponent where Self == CharacterClass {
   }
 
   public static var anyGraphemeCluster: CharacterClass {
-    .init(unconverted: ._anyGrapheme)
+    .init(builtin: .anyGrapheme) // builtin-backed, replacing the unconverted AST atom
   }
 
   public static var whitespace: CharacterClass {
-    .init(unconverted: ._whitespace)
+    .init(builtin: .whitespace) // builtin-backed, replacing the unconverted AST atom
   }
 
   public static var digit: CharacterClass {
-    .init(unconverted: ._digit)
+    .init(builtin: .digit) // builtin-backed, replacing the unconverted AST atom
   }
 
   public static var hexDigit: CharacterClass {
@@ -70,19 +82,19 @@ extension RegexComponent where Self == CharacterClass {
   }
 
   public static var horizontalWhitespace: CharacterClass {
-    .init(unconverted: ._horizontalWhitespace)
+    .init(builtin: .horizontalWhitespace) // builtin-backed, replacing the unconverted AST atom
   }
 
   public static var newlineSequence: CharacterClass {
-    .init(unconverted: ._newlineSequence)
+    .init(builtin: .newlineSequence) // builtin-backed, replacing the unconverted AST atom
   }
 
   public static var verticalWhitespace: CharacterClass {
-    .init(unconverted: ._verticalWhitespace)
+    .init(builtin: .verticalWhitespace) // builtin-backed, replacing the unconverted AST atom
   }
 
   public static var word: CharacterClass {
-    .init(unconverted: ._word)
+    .init(builtin: .word) // builtin-backed, replacing the unconverted AST atom
   }
 }
 
 
@@ -74,6 +74,9 @@ fileprivate extension Compiler.ByteCodeGen {
         emitMatchScalar(s)
       }
 
+    case let .characterClass(cc):
+      emitCharacterClass(cc)
+
     case let .assertion(kind):
       try emitAssertion(kind)
 
@@ -148,147 +151,24 @@ fileprivate extension Compiler.ByteCodeGen {
     }
   }
 
-  mutating func emitStartOfLine() {
-    builder.buildAssert { [semanticLevel = options.semanticLevel]
-        (_, _, input, pos, subjectBounds) in
-      if pos == subjectBounds.lowerBound { return true }
-      switch semanticLevel {
-      case .graphemeCluster:
-        return input[input.index(before: pos)].isNewline
-      case .unicodeScalar:
-        return input.unicodeScalars[input.unicodeScalars.index(before: pos)].isNewline
-      }
-    }
-  }
-
-  mutating func emitEndOfLine() {
-    builder.buildAssert { [semanticLevel = options.semanticLevel]
-      (_, _, input, pos, subjectBounds) in
-      if pos == subjectBounds.upperBound { return true }
-      switch semanticLevel {
-      case .graphemeCluster:
-        return input[pos].isNewline
-      case .unicodeScalar:
-        return input.unicodeScalars[pos].isNewline
-      }
-    }
-  }
-
   mutating func emitAssertion(
     _ kind: DSLTree.Atom.Assertion
   ) throws {
-    // FIXME: Depends on API model we have... We may want to
-    // think through some of these with API interactions in mind
-    //
-    // This might break how we use `bounds` for both slicing
-    // and things like `firstIndex`, that is `firstIndex` may
-    // need to supply both a slice bounds and a per-search bounds.
-    switch kind {
-    case .startOfSubject:
-      builder.buildAssert { (_, _, input, pos, subjectBounds) in
-        pos == subjectBounds.lowerBound
-      }
-
-    case .endOfSubjectBeforeNewline:
-      builder.buildAssert { [semanticLevel = options.semanticLevel]
-          (_, _, input, pos, subjectBounds) in
-        if pos == subjectBounds.upperBound { return true }
-        switch semanticLevel {
-        case .graphemeCluster:
-          return input.index(after: pos) == subjectBounds.upperBound
-           && input[pos].isNewline
-        case .unicodeScalar:
-          return input.unicodeScalars.index(after: pos) == subjectBounds.upperBound
-           && input.unicodeScalars[pos].isNewline
-        }
-      }
-
-    case .endOfSubject:
-      builder.buildAssert { (_, _, input, pos, subjectBounds) in
-        pos == subjectBounds.upperBound
-      }
-
-    case .resetStartOfMatch:
-      // FIXME: Figure out how to communicate this out
+    if kind == .resetStartOfMatch { // still rejected at compile time; every other kind is handed to the builder below
       throw Unsupported(#"\K (reset/keep assertion)"#)
-
-    case .firstMatchingPositionInSubject:
-      // TODO: We can probably build a nice model with API here
-      
-      // FIXME: This needs to be based on `searchBounds`,
-      // not the `subjectBounds` given as an argument here
-      builder.buildAssert { (_, _, input, pos, subjectBounds) in false }
-
-    case .textSegment:
-      builder.buildAssert { (_, _, input, pos, _) in
-        // FIXME: Grapheme or word based on options
-        input.isOnGraphemeClusterBoundary(pos)
-      }
-
-    case .notTextSegment:
-      builder.buildAssert { (_, _, input, pos, _) in
-        // FIXME: Grapheme or word based on options
-        !input.isOnGraphemeClusterBoundary(pos)
-      }
-
-    case .startOfLine:
-      emitStartOfLine()
-
-    case .endOfLine:
-      emitEndOfLine()
-
-    case .caretAnchor:
-      if options.anchorsMatchNewlines {
-        emitStartOfLine()
-      } else {
-        builder.buildAssert { (_, _, input, pos, subjectBounds) in
-          pos == subjectBounds.lowerBound
-        }
-      }
-
-    case .dollarAnchor:
-      if options.anchorsMatchNewlines {
-        emitEndOfLine()
-      } else {
-        builder.buildAssert { (_, _, input, pos, subjectBounds) in
-          pos == subjectBounds.upperBound
-        }
-      }
-
-    case .wordBoundary:
-      builder.buildAssert { [options]
-          (cache, maxIndex, input, pos, subjectBounds) in
-        if options.usesSimpleUnicodeBoundaries {
-          // TODO: How should we handle bounds?
-          return _CharacterClassModel.word.isBoundary(
-            input,
-            at: pos,
-            bounds: subjectBounds,
-            with: options
-          )
-        } else {
-          return input.isOnWordBoundary(at: pos, using: &cache, &maxIndex)
-        }
-      }
-
-    case .notWordBoundary:
-      builder.buildAssert { [options]
-          (cache, maxIndex, input, pos, subjectBounds) in
-        if options.usesSimpleUnicodeBoundaries {
-          // TODO: How should we handle bounds?
-          return !_CharacterClassModel.word.isBoundary(
-            input,
-            at: pos,
-            bounds: subjectBounds,
-            with: options
-          )
-        } else {
-          return !input.isOnWordBoundary(at: pos, using: &cache, &maxIndex)
-        }
-      }
     }
+    builder.buildAssert( // one call replaces the per-kind closures removed above; the options it needs are passed explicitly
+      by: kind,
+      options.anchorsMatchNewlines,
+      options.usesSimpleUnicodeBoundaries,
+      options.usesASCIIWord,
+      options.semanticLevel)
   }
-  
+
+  mutating func emitCharacterClass(_ cc: DSLTree.Atom.CharacterClass) { // emits a builtin-match instruction for `cc`
+    builder.buildMatchBuiltin(model: cc.asRuntimeModel(options)) // the runtime model is derived from the current matching options
+  }
+
   mutating func emitMatchScalar(_ s: UnicodeScalar) {
     assert(options.semanticLevel == .unicodeScalar)
     if options.isCaseInsensitive && s.properties.isCased {
@@ -907,10 +787,10 @@ fileprivate extension Compiler.ByteCodeGen {
       } else {
         builder.buildMatchAsciiBitset(asciiBitset)
       }
-    } else {
-      let consumer = try ccc.generateConsumer(options)
-      builder.buildConsume(by: consumer)
+      return
     }
+    let consumer = try ccc.generateConsumer(options)
+    builder.buildConsume(by: consumer)
   }
 
   mutating func emitConcatenation(_ children: [DSLTree.Node]) throws {
 
@@ -162,6 +162,8 @@ extension DSLTree.Atom {
     case .assertion:
       // TODO: We could handle, should this be total?
       return nil
+    case .characterClass(let cc):
+      return cc.generateConsumer(opts)
 
     case .backreference:
       // TODO: Should we handle?
@@ -182,6 +184,15 @@ extension DSLTree.Atom {
   }
 }
 
+extension DSLTree.Atom.CharacterClass {
+  func generateConsumer(_ opts: MatchingOptions) -> MEProgram.ConsumeFunction { // builds a consume closure for this builtin class under `opts`
+    let model = asRuntimeModel(opts) // computed once here and captured by the closure below
+    return { input, bounds in
+      model.matches(in: input, at: bounds.lowerBound) // NOTE(review): only `bounds.lowerBound` is passed; the removed AST.Atom path carried a FIXME about out-of-bounds, so confirm `matches` cannot run past the end of `bounds`
+    }
+  }
+}
+
 extension String {
   /// Compares this string to `other` using the loose matching rule UAX44-LM2,
   /// which ignores case, whitespace, underscores, and nearly all medial
@@ -269,16 +280,6 @@ extension AST.Atom {
   func generateConsumer(
     _ opts: MatchingOptions
   ) throws -> MEProgram.ConsumeFunction? {
-    // TODO: Wean ourselves off of this type...
-    if let cc = self.characterClass?.withMatchLevel(
-      opts.matchLevel
-    ) {
-      return { input, bounds in
-        // FIXME: should we worry about out of bounds?
-        cc.matches(in: input, at: bounds.lowerBound, with: opts)
-      }
-    }
-
     switch kind {
     case let .scalar(s):
       assertionFailure(
@@ -312,8 +313,11 @@ extension AST.Atom {
     case .caretAnchor, .dollarAnchor:
       // handled in emitAssertion
       return nil
+    case .escaped:
+      // handled in emitAssertion and emitCharacterClass
+      return nil
 
-    case .scalarSequence, .escaped, .keyboardControl, .keyboardMeta,
+    case .scalarSequence, .keyboardControl, .keyboardMeta,
         .keyboardMetaControl, .backreference, .subpattern, .callout,
         .backtrackingDirective, .changeMatchingOptions, .invalid:
       // FIXME: implement
Original file line number	Diff line number	Diff line change
`@@ -15,27 +15,39 @@`
`15`	`15`	`@available(SwiftStdlib 5.7, *)`
`16`	`16`	`public struct CharacterClass {`
`17`	`17`	`internal var ccc: DSLTree.CustomCharacterClass`
	`18`	`+ /// The builtin character class, if this CharacterClass is representable by one`
	`19`	`+ internal var builtin: DSLTree.Atom.CharacterClass?`
`18`	`20`
`19`	`21`	`init(_ ccc: DSLTree.CustomCharacterClass) {`
`20`	`22`	`self.ccc = ccc`
	`23`	`+ self.builtin = nil`
`21`	`24`	`}`
`22`	`25`
`23`		`- init(unconverted atom: DSLTree._AST.Atom) {`
`24`		`- self.ccc = .init(members: [.atom(.unconverted(atom))])`
	`26`	`+ init(builtin: DSLTree.Atom.CharacterClass) {`
	`27`	`+ self.ccc = .init(members: [.atom(.characterClass(builtin))])`
	`28`	`+ self.builtin = builtin`
`25`	`29`	`}`
`26`	`30`	`}`
`27`	`31`
`28`	`32`	`@available(SwiftStdlib 5.7, *)`
`29`	`33`	`extension CharacterClass: RegexComponent {`
`30`	`34`	`public var regex: Regex<Substring> {`
`31`		`- _RegexFactory().customCharacterClass(ccc)`
	`35`	`+ if let cc = builtin {`
	`36`	`+ return _RegexFactory().characterClass(cc)`
	`37`	`+ } else {`
	`38`	`+ return _RegexFactory().customCharacterClass(ccc)`
	`39`	`+ }`
`32`	`40`	`}`
`33`	`41`	`}`
`34`	`42`
`35`	`43`	`@available(SwiftStdlib 5.7, *)`
`36`	`44`	`extension CharacterClass {`
`37`	`45`	`public var inverted: CharacterClass {`
`38`		`- CharacterClass(ccc.inverted)`
	`46`	`+ if let inv = builtin?.inverted {`
	`47`	`+ return CharacterClass(builtin: inv)`
	`48`	`+ } else {`
	`49`	`+ return CharacterClass(ccc.inverted)`
	`50`	`+ }`
`39`	`51`	`}`
`40`	`52`	`}`
`41`	`53`
`@@ -50,15 +62,15 @@ extension RegexComponent where Self == CharacterClass {`
`50`	`62`	`}`
`51`	`63`
`52`	`64`	`public static var anyGraphemeCluster: CharacterClass {`
`53`		`- .init(unconverted: ._anyGrapheme)`
	`65`	`+ .init(builtin: .anyGrapheme)`
`54`	`66`	`}`
`55`	`67`
`56`	`68`	`public static var whitespace: CharacterClass {`
`57`		`- .init(unconverted: ._whitespace)`
	`69`	`+ .init(builtin: .whitespace)`
`58`	`70`	`}`
`59`	`71`
`60`	`72`	`public static var digit: CharacterClass {`
`61`		`- .init(unconverted: ._digit)`
	`73`	`+ .init(builtin: .digit)`
`62`	`74`	`}`
`63`	`75`
`64`	`76`	`public static var hexDigit: CharacterClass {`
`@@ -70,19 +82,19 @@ extension RegexComponent where Self == CharacterClass {`
`70`	`82`	`}`
`71`	`83`
`72`	`84`	`public static var horizontalWhitespace: CharacterClass {`
`73`		`- .init(unconverted: ._horizontalWhitespace)`
	`85`	`+ .init(builtin: .horizontalWhitespace)`
`74`	`86`	`}`
`75`	`87`
`76`	`88`	`public static var newlineSequence: CharacterClass {`
`77`		`- .init(unconverted: ._newlineSequence)`
	`89`	`+ .init(builtin: .newlineSequence)`
`78`	`90`	`}`
`79`	`91`
`80`	`92`	`public static var verticalWhitespace: CharacterClass {`
`81`		`- .init(unconverted: ._verticalWhitespace)`
	`93`	`+ .init(builtin: .verticalWhitespace)`
`82`	`94`	`}`
`83`	`95`
`84`	`96`	`public static var word: CharacterClass {`
`85`		`- .init(unconverted: ._word)`
	`97`	`+ .init(builtin: .word)`
`86`	`98`	`}`
`87`	`99`	`}`
`88`	`100`