Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions Sources/_StringProcessing/ByteCodeGen.swift
Original file line number Diff line number Diff line change
Expand Up @@ -180,9 +180,7 @@ extension Compiler.ByteCodeGen {
if options.isCaseInsensitive && c.isCased {
// TODO: buildCaseInsensitiveMatch(c) or buildMatch(c, caseInsensitive: true)
builder.buildConsume { input, bounds in
let inputChar = input[bounds.lowerBound].lowercased()
let matchChar = c.lowercased()
return inputChar == matchChar
return input[bounds.lowerBound].caseFoldedEquals(c)
? input.index(after: bounds.lowerBound)
: nil
}
Expand Down Expand Up @@ -637,11 +635,13 @@ extension Compiler.ByteCodeGen {
if options.isCaseInsensitive {
// TODO: buildCaseInsensitiveMatchSequence(c) or alternative
builder.buildConsume { input, bounds in
// FIXME: This needs to iterate over the case-folded strings, not
// iterate and then case-fold as we go.
var iterator = s.makeIterator()
var currentIndex = bounds.lowerBound
while let ch = iterator.next() {
guard currentIndex < bounds.upperBound,
ch.lowercased() == input[currentIndex].lowercased()
ch.caseFoldedEquals(input[currentIndex])
else { return nil }
input.formIndex(after: &currentIndex)
}
Expand Down
2 changes: 1 addition & 1 deletion Sources/_StringProcessing/ConsumerInterface.swift
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ extension DSLTree.Atom {
return { input, bounds in
let low = bounds.lowerBound
if isCaseInsensitive && c.isCased {
return input[low].lowercased() == c.lowercased()
return input[low].caseFoldedEquals(c)
? input.index(after: low)
: nil
} else {
Expand Down
18 changes: 18 additions & 0 deletions Sources/_StringProcessing/Unicode/CaseConversion.swift
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,24 @@
//
//===----------------------------------------------------------------------===//

@_spi(_Unicode) import Swift

// TODO

extension Character {
/// Whether this character and `c` are equal when case folded.
func caseFoldedEquals(_ c: Character) -> Bool {
guard #available(SwiftStdlib 5.7, *) else { fatalError() }
let foldedSelf = unicodeScalars.map(\.properties._caseFolded).joined()
let foldedOther = c.unicodeScalars.map(\.properties._caseFolded).joined()
return foldedSelf == foldedOther
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This makes a lot of intermediary arrays, which is unfortunate because nearly all Characters fit in the small-string form, and those that don't are usually case invariant. Does `.lazy.map(...).elementsEqual(...)` work?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think what we really want is a stdlib API that can do the case-folded comparison — it's wasteful even to UTF-8 encode these when we should be able to convert, canonicalize, and compare character-by-character.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We can write the code here or in the stdlib (I agree stdlib SPI makes sense). But in the meantime, can we write the code we want the stdlib to have, or at least approximate it by making the mapping lazy?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Otherwise, it looks like case-insensitive matching could do multiple array allocations for every single character in the input.

}
}

extension UnicodeScalar {
  /// Whether this Unicode scalar and `s` are equal when case folded.
  ///
  /// The comparison is performed on the `_caseFolded` value of each scalar's
  /// `properties`, so two scalars compare equal exactly when those values
  /// are equal.
  ///
  /// - Parameter s: The scalar to compare against.
  /// - Returns: `true` if both scalars have the same `_caseFolded` value.
  /// - Precondition: Must run where `SwiftStdlib 5.7` is available; otherwise
  ///   this function traps.
  func caseFoldedEquals(_ s: UnicodeScalar) -> Bool {
    // `_caseFolded` is only reachable behind this availability check, and
    // there is no fallback implementation, so fail loudly with a reason
    // instead of an anonymous trap.
    guard #available(SwiftStdlib 5.7, *) else {
      fatalError("caseFoldedEquals(_:) requires SwiftStdlib 5.7 or later")
    }
    return properties._caseFolded == s.properties._caseFolded
  }
}
2 changes: 1 addition & 1 deletion Sources/_StringProcessing/_CharacterClassModel.swift
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@ public struct _CharacterClassModel: Hashable {
switch self {
case .character(let c):
if options.isCaseInsensitive {
return c.lowercased() == character.lowercased()
return c.caseFoldedEquals(character)
} else {
return c == character
}
Expand Down
33 changes: 22 additions & 11 deletions Tests/RegexTests/UTS18Tests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -230,20 +230,31 @@ extension UTS18Tests {
expectFirstMatch("Dåb", regex(#"Dåb"#).ignoresCase(), "Dåb")
expectFirstMatch("dÅB", regex(#"Dåb"#).ignoresCase(), "dÅB")
expectFirstMatch("D\u{212B}B", regex(#"Dåb"#).ignoresCase(), "D\u{212B}B")
}

let sigmas = "σΣς"
expectFirstMatch(sigmas, regex(#"^σ+$"#).ignoresCase(), sigmas[...])
expectFirstMatch(sigmas, regex(#"^Σ+$"#).ignoresCase(), sigmas[...])
expectFirstMatch(sigmas, regex(#"^ς+$"#).ignoresCase(), sigmas[...])

func testSimpleLooseMatches_XFail() {
XCTExpectFailure("Need case folding support") {
let sigmas = "σΣς"
expectFirstMatch(sigmas, regex(#"σ+"#).ignoresCase(), sigmas[...])
expectFirstMatch(sigmas, regex(#"Σ+"#).ignoresCase(), sigmas[...])
expectFirstMatch(sigmas, regex(#"ς+"#).ignoresCase(), sigmas[...])

// TODO: Test German sharp S
// TODO: Test char classes, e.g. [\p{Block=Phonetic_Extensions} [A-E]]
// Custom character classes
for regexCh in sigmas {
for inputCh in sigmas {
expectFirstMatch(String(inputCh), regex("[\(regexCh)]").ignoresCase(), String(inputCh)[...])
if regexCh != inputCh {
XCTAssertFalse(String(inputCh).contains(regex("[\(regexCh)]")))
}
}
}

expectFirstMatch("Strauß", regex("ß").ignoresCase(), "ß")
XCTExpectFailure {
expectFirstMatch("Strauss", regex("ß").ignoresCase(), "ss")
}

// TODO: Test char classes, e.g. [\p{Block=Phonetic_Extensions} [A-E]]
// TODO: Document when full case folding applies
}

// RL1.6 Line Boundaries
//
// To meet this requirement, if an implementation provides for line-boundary
Expand Down