mirror of
https://github.com/apple/swift.git
synced 2025-12-14 20:36:38 +01:00
Carefully overhaul our word breaking implementation to follow the recommendations of Unicode Annex #29. Start exposing the core primitives (as well as `String`-level interfaces), so that folks can prototype proper API for these concepts. - Fix `_wordIndex(after:)` to always advance forward. It now requires its input index to be on a word boundary. Remove the `@_spi` attribute, exposing it as a (hidden, but) public entry point. - The old SPIs `_wordIndex(before:)` and `_nearestWordIndex(atOrBelow:)` were irredemably broken; follow the Unicode recommendation for implementing random-access text segmentation and replace them both with a new public `_wordIndex(somewhereAtOrBefore:)` entry pont. - Expose handcrafted low-level state machines for detecting word boundaries (_WordRecognizer`, `_RandomAccessWordRecognizer`), following the design of `_CharacterRecognizer`. - Add tests to reliably validate that the two state machine flavors always produce consistent results. rdar://155482680
312 lines
9.4 KiB
Swift
312 lines
9.4 KiB
Swift
// RUN: %empty-directory(%t)
|
|
// RUN: %target-run-stdlib-swift
|
|
|
|
// REQUIRES: executable_test
|
|
// REQUIRES: objc_interop
|
|
// REQUIRES: optimized_stdlib
|
|
|
|
// Validate that the various forms of word breaking all lead to consistent
|
|
// results by exhaustively enumerating all possible state machine inputs up to
|
|
// an adequately high length.
|
|
//
|
|
// The word breaking algorithm only cares about word break properties, not
|
|
// specific scalar values. This lets us only use a single representative sample
|
|
// in each class, drastically cutting down the input space to iterate through.
|
|
// This makes it practical to do this up to limits that give us practically
|
|
// useful results.
|
|
|
|
import StdlibUnittest
|
|
|
|
let suite = TestSuite("UnicodeWordRecognizer")
|
|
defer { runAllTests() }
|
|
|
|
// One representative sample from each character class that's relevant to word breaking
|
|
let samples: [Unicode.Scalar] = [
|
|
"\u{000D}", // CR
|
|
"\u{000A}", // LF
|
|
"\u{2028}", // Newline (LINE SEPARATOR)
|
|
"\u{0041}", // ALetter (LATIN CAPITAL LETTER A)
|
|
"\u{0022}", // Double_Quote (QUOTATION MARK)
|
|
"\u{0027}", // Single_Quote (APOSTROPHE)
|
|
"\u{200D}", // ZWJ (ZERO WIDTH JOINER)
|
|
"\u{1F1E6}", // RI (REGIONAL INDICATOR SYMBOL LETTER A)
|
|
"\u{05D0}", // Hebrew_Letter (HEBREW LETTER ALEF)
|
|
"\u{0300}", // Extend (COMBINING GRAVE ACCENT)
|
|
"\u{00AD}", // Format (SOFT HYPHEN)
|
|
"\u{3031}", // Katakana (VERTICAL KANA REPEAT MARK)
|
|
"\u{003A}", // MidLetter (COLON)
|
|
"\u{002C}", // MidNum (COMMA)
|
|
"\u{002E}", // MidNumLet (FULL STOP)
|
|
"\u{0030}", // Numeric (DIGIT ZERO)
|
|
"\u{005F}", // ExtendNumLet (LOW LINE)
|
|
"\u{0020}", // WSegSpace (SPACE)
|
|
"\u{00A9}", // \p{Extended_Pictographic} (COPYRIGHT)
|
|
"\u{0021}", // Any (EXCLAMATION MARK)
|
|
]
|
|
|
|
/// Call `body` with every array of the specified count consisting of
|
|
/// integer elements in the given range. This is calculating the
|
|
/// Cartesian `n`-ary power of the `range` argument.
|
|
///
|
|
/// withEveryArray(of: 1 ..< 3, count: 3) { print($0) }
|
|
/// // [1, 1, 1]
|
|
/// // [2, 1, 1]
|
|
/// // [1, 2, 1]
|
|
/// // [2, 2, 1]
|
|
/// // [1, 1, 2]
|
|
/// // [2, 1, 2]
|
|
/// // [1, 2, 2]
|
|
/// // [2, 2, 2]
|
|
func withEveryArray<E: Error>(
|
|
of range: Range<Int>,
|
|
count n: Int,
|
|
_ body: ([Int]) throws(E) -> Void
|
|
) throws(E) {
|
|
var vector: [Int] = .init(repeating: range.lowerBound, count: n)
|
|
guard n > 0 else {
|
|
try body(vector)
|
|
return
|
|
}
|
|
while true {
|
|
try body(vector)
|
|
var i = 0
|
|
while true {
|
|
vector[i] += 1
|
|
if vector[i] < range.upperBound {
|
|
break
|
|
}
|
|
vector[i] = range.lowerBound
|
|
i += 1
|
|
if i == n {
|
|
return // done
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
func string(for vector: [Int]) -> String {
|
|
var s = ""
|
|
for digit in vector {
|
|
s.unicodeScalars.append(samples[digit])
|
|
}
|
|
return s
|
|
}
|
|
|
|
extension Unicode.Scalar {
|
|
var unicodeNotation: String {
|
|
let v = String(self.value, radix: 16, uppercase: true)
|
|
return "U+\(String(repeating: "0", count: max(0, 4 - v.count)))\(v)"
|
|
}
|
|
}
|
|
|
|
extension String {
|
|
var scalarDescriptions: String {
|
|
return self.unicodeScalars
|
|
.lazy.map { $0.unicodeNotation }
|
|
.joined(separator: " ")
|
|
}
|
|
}
|
|
|
|
extension Collection {
|
|
/// Return a sorted array of all valid indices in the collection, including
|
|
/// the end index.
|
|
func allIndices() -> [Index] {
|
|
var result: [Index] = []
|
|
result.reserveCapacity(count + 1)
|
|
result.append(contentsOf: indices)
|
|
result.append(endIndex)
|
|
return result
|
|
}
|
|
}
|
|
|
|
extension Sequence where Element: Equatable {
|
|
/// Returns true if the elements of `self` form a sub-sequence of `other`,
|
|
/// where both inputs are monotonic. `self` is not allowed to contain
|
|
/// more than a single copy of any item in `other`.
|
|
func isMonotonicSubsequence(of other: some Sequence<Element>) -> Bool {
|
|
var i = makeIterator()
|
|
var j = other.makeIterator()
|
|
var b = j.next()
|
|
while let a = i.next() {
|
|
while true {
|
|
if b == nil { return false }
|
|
if a == b {
|
|
b = j.next()
|
|
break
|
|
}
|
|
b = j.next()
|
|
}
|
|
}
|
|
return true
|
|
}
|
|
|
|
/// Returns true if the elements of `self` form a sub-sequence of `other`,
|
|
/// where both inputs are monotonic. `self` is allowed to contain duplicate
|
|
/// elements.
|
|
func isMonotonicRepeatingSubsequence(of other: some Sequence<Element>) -> Bool {
|
|
var i = makeIterator()
|
|
var j = other.makeIterator()
|
|
var b = j.next()
|
|
while let a = i.next() {
|
|
while true {
|
|
if b == nil { return false }
|
|
if a == b { break }
|
|
b = j.next()
|
|
}
|
|
}
|
|
return true
|
|
}
|
|
|
|
}
|
|
|
|
extension String {
|
|
/// Returns all word boundaries within the string, using a single word
|
|
/// recognizer instance. This is the most efficient way to find word
|
|
/// boundaries, as it processes each scalar exactly once.
|
|
@available(StdlibDeploymentTarget 6.3, *)
|
|
func fastWordBreaks() -> [String.Index] {
|
|
var result: [String.Index] = []
|
|
var i = self.startIndex
|
|
var recognizer = Unicode._WordRecognizer()
|
|
var candidate = i
|
|
while i < self.endIndex {
|
|
let (setCandidate, breakAtCandidate, breakHere) =
|
|
recognizer.hasBreak(before: self.unicodeScalars[i])
|
|
if setCandidate {
|
|
candidate = i
|
|
}
|
|
if breakAtCandidate {
|
|
result.append(candidate)
|
|
}
|
|
if breakHere {
|
|
result.append(i)
|
|
}
|
|
self.unicodeScalars.formIndex(after: &i)
|
|
}
|
|
if recognizer.hasCandidateBreakAtEnd() {
|
|
result.append(candidate)
|
|
}
|
|
result.append(i)
|
|
return result
|
|
}
|
|
|
|
/// Returns all word breaks without keeping persistent state, using
|
|
/// `_wordIndex(after:)`. This forgets lookahead information after each word
|
|
/// boundary, so it needs to process some scalars twice, resulting in a
|
|
/// performance regression vs `fastWordBreaks()`. However, both variants are
|
|
/// supposed to have the same results.
|
|
@available(StdlibDeploymentTarget 5.7, *)
|
|
func slowWordBreaks() -> [String.Index] {
|
|
var result: [String.Index] = []
|
|
var i = self.startIndex
|
|
while i < self.endIndex {
|
|
result.append(i)
|
|
i = self._wordIndex(after: i)
|
|
}
|
|
result.append(i)
|
|
return result
|
|
}
|
|
|
|
/// Return all "safe" word breaks in this string by using the backwards word
|
|
/// recognizer state machine, starting from the end, feeding it every Unicode
|
|
/// scalar in the string, and collecting all word boundaries detected.
|
|
///
|
|
/// This is expected to sometimes skip over word boundaries that are detected
|
|
/// when going forward. However, it must never report a word boundary at a
|
|
/// position that isn't also detected by the forward recognizer.
|
|
@available(StdlibDeploymentTarget 6.3, *)
|
|
func safeWordBreaks() -> [String.Index] {
|
|
var result: [String.Index] = []
|
|
guard !self.isEmpty else { return result }
|
|
result.append(self.endIndex) // There is always an implicit wordbreak at the end.
|
|
var i = self.unicodeScalars.index(before: self.endIndex)
|
|
var recognizer = Unicode._RandomAccessWordRecognizer(before: self.unicodeScalars[i])
|
|
var candidate = i
|
|
while i > self.startIndex {
|
|
let j = self.unicodeScalars.index(before: i)
|
|
let r = recognizer.hasGuaranteedBreak(after: self.unicodeScalars[j])
|
|
if r.setCandidate {
|
|
candidate = i
|
|
}
|
|
if r.breakAtCandidate {
|
|
result.append(candidate)
|
|
}
|
|
if r.breakHere {
|
|
result.append(i)
|
|
}
|
|
i = j
|
|
}
|
|
result.reverse()
|
|
return result
|
|
}
|
|
|
|
/// Return an array of "safe" word boundaries detected by the backwards word
|
|
/// recognizer state machine, invoked through
|
|
/// `_wordIndex(somewhereAtOrBefore:)`, one result per each scalar position in
|
|
/// the string (including its end index).
|
|
///
|
|
/// This is expected to be some monotonically increasing subsequence of word
|
|
/// boundaries detected in the forward direction, allowing some repeated
|
|
/// items.
|
|
@available(StdlibDeploymentTarget 6.3, *)
|
|
func randomAccessWordBreaks() -> [String.Index] {
|
|
unicodeScalars.allIndices().map { self._wordIndex(somewhereAtOrBefore: $0) }
|
|
}
|
|
}
|
|
|
|
@available(StdlibDeploymentTarget 6.3, *)
|
|
func check(length: Int) {
|
|
withEveryArray(of: 0 ..< samples.count, count: length) { vector in
|
|
let str = string(for: vector)
|
|
|
|
let fastBreaks = str.fastWordBreaks()
|
|
let slowBreaks = str.slowWordBreaks()
|
|
expectEqual(
|
|
fastBreaks, slowBreaks,
|
|
"""
|
|
Inconsistent word boundaries in stateful vs stateless iteration:
|
|
input: \(str.debugDescription) (\(str.scalarDescriptions))
|
|
""")
|
|
|
|
let safeBreaks = str.safeWordBreaks()
|
|
expectTrue(
|
|
safeBreaks.isMonotonicSubsequence(of: fastBreaks),
|
|
"""
|
|
Inconsistent safe word boundaries:
|
|
input: \(str.debugDescription) (\(str.scalarDescriptions))")
|
|
""")
|
|
|
|
let randomAccessBreaks = str.randomAccessWordBreaks()
|
|
expectTrue(
|
|
randomAccessBreaks.isMonotonicRepeatingSubsequence(of: fastBreaks),
|
|
"""
|
|
Inconsistent random-access word boundaries:
|
|
input: \(str.debugDescription) (\(str.scalarDescriptions))
|
|
breaks: \(fastBreaks)
|
|
random-access breaks: \(randomAccessBreaks)
|
|
""")
|
|
}
|
|
}
|
|
|
|
if #available(StdlibDeploymentTarget 6.3, *) {
|
|
suite.test("Exhaustive consistency checks, length 1") {
|
|
check(length: 1)
|
|
}
|
|
|
|
suite.test("Exhaustive consistency checks, length 2") {
|
|
check(length: 2)
|
|
}
|
|
|
|
suite.test("Exhaustive consistency checks, length 3") {
|
|
check(length: 3)
|
|
}
|
|
|
|
suite.test("Exhaustive consistency checks, length 4") {
|
|
check(length: 4)
|
|
}
|
|
|
|
suite.test("Exhaustive consistency checks, length 5") {
|
|
check(length: 5)
|
|
}
|
|
}
|