// RUN: %empty-directory(%t) // RUN: %target-run-stdlib-swift // REQUIRES: executable_test // REQUIRES: objc_interop // REQUIRES: optimized_stdlib // Validate that the various forms of word breaking all lead to consistent // results by exhaustively enumerating all possible state machine inputs up to // an adequately high length. // // The word breaking algorithm only cares about word break properties, not // specific scalar values. This lets us only use a single representative sample // in each class, drastically cutting down the input space to iterate through. // This makes it practical to do this up to limits that give us practically // useful results. import StdlibUnittest let suite = TestSuite("UnicodeWordRecognizer") defer { runAllTests() } // One representative sample from each character class that's relevant to word breaking let samples: [Unicode.Scalar] = [ "\u{000D}", // CR "\u{000A}", // LF "\u{2028}", // Newline (LINE SEPARATOR) "\u{0041}", // ALetter (LATIN CAPITAL LETTER A) "\u{0022}", // Double_Quote (QUOTATION MARK) "\u{0027}", // Single_Quote (APOSTROPHE) "\u{200D}", // ZWJ (ZERO WIDTH JOINER) "\u{1F1E6}", // RI (REGIONAL INDICATOR SYMBOL LETTER A) "\u{05D0}", // Hebrew_Letter (HEBREW LETTER ALEF) "\u{0300}", // Extend (COMBINING GRAVE ACCENT) "\u{00AD}", // Format (SOFT HYPHEN) "\u{3031}", // Katakana (VERTICAL KANA REPEAT MARK) "\u{003A}", // MidLetter (COLON) "\u{002C}", // MidNum (COMMA) "\u{002E}", // MidNumLet (FULL STOP) "\u{0030}", // Numeric (DIGIT ZERO) "\u{005F}", // ExtendNumLet (LOW LINE) "\u{0020}", // WSegSpace (SPACE) "\u{00A9}", // \p{Extended_Pictographic} (COPYRIGHT) "\u{0021}", // Any (EXCLAMATION MARK) ] /// Call `body` with every array of the specified count consisting of /// integer elements in the given range. This is calculating the /// Cartesian `n`-ary power of the `range` argument. /// /// withEveryArray(of: 1 ..< 3, count: 3) { print($0) } /// // [1, 1, 1] /// // [2, 1, 1] /// // [1, 2, 1] /// // [2, 2, 1] /// // [1, 1, 2] /// // [2, 1, 2] /// // [1, 2, 2] /// // [2, 2, 2] func withEveryArray( of range: Range, count n: Int, _ body: ([Int]) throws(E) -> Void ) throws(E) { var vector: [Int] = .init(repeating: range.lowerBound, count: n) guard n > 0 else { try body(vector) return } while true { try body(vector) var i = 0 while true { vector[i] += 1 if vector[i] < range.upperBound { break } vector[i] = range.lowerBound i += 1 if i == n { return // done } } } } func string(for vector: [Int]) -> String { var s = "" for digit in vector { s.unicodeScalars.append(samples[digit]) } return s } extension Unicode.Scalar { var unicodeNotation: String { let v = String(self.value, radix: 16, uppercase: true) return "U+\(String(repeating: "0", count: max(0, 4 - v.count)))\(v)" } } extension String { var scalarDescriptions: String { return self.unicodeScalars .lazy.map { $0.unicodeNotation } .joined(separator: " ") } } extension Collection { /// Return a sorted array of all valid indices in the collection, including /// the end index. func allIndices() -> [Index] { var result: [Index] = [] result.reserveCapacity(count + 1) result.append(contentsOf: indices) result.append(endIndex) return result } } extension Sequence where Element: Equatable { /// Returns true if the elements of `self` form a sub-sequence of `other`, /// where both inputs are monotonic. `self` is not allowed to contain /// more than a single copy of any item in `other`. func isMonotonicSubsequence(of other: some Sequence) -> Bool { var i = makeIterator() var j = other.makeIterator() var b = j.next() while let a = i.next() { while true { if b == nil { return false } if a == b { b = j.next() break } b = j.next() } } return true } /// Returns true if the elements of `self` form a sub-sequence of `other`, /// where both inputs are monotonic. `self` is allowed to contain duplicate /// elements. func isMonotonicRepeatingSubsequence(of other: some Sequence) -> Bool { var i = makeIterator() var j = other.makeIterator() var b = j.next() while let a = i.next() { while true { if b == nil { return false } if a == b { break } b = j.next() } } return true } } extension String { /// Returns all word boundaries within the string, using a single word /// recognizer instance. This is the most efficient way to find word /// boundaries, as it processes each scalar exactly once. @available(StdlibDeploymentTarget 6.3, *) func fastWordBreaks() -> [String.Index] { var result: [String.Index] = [] var i = self.startIndex var recognizer = Unicode._WordRecognizer() var candidate = i while i < self.endIndex { let (setCandidate, breakAtCandidate, breakHere) = recognizer.hasBreak(before: self.unicodeScalars[i]) if setCandidate { candidate = i } if breakAtCandidate { result.append(candidate) } if breakHere { result.append(i) } self.unicodeScalars.formIndex(after: &i) } if recognizer.hasCandidateBreakAtEnd() { result.append(candidate) } result.append(i) return result } /// Returns all word breaks without keeping persistent state, using /// `_wordIndex(after:)`. This forgets lookahead information after each word /// boundary, so it needs to process some scalars twice, resulting in a /// performance regression vs `fastWordBreaks()`. However, both variants are /// supposed to have the same results. @available(StdlibDeploymentTarget 5.7, *) func slowWordBreaks() -> [String.Index] { var result: [String.Index] = [] var i = self.startIndex while i < self.endIndex { result.append(i) i = self._wordIndex(after: i) } result.append(i) return result } /// Return all "safe" word breaks in this string by using the backwards word /// recognizer state machine, starting from the end, feeding it every Unicode /// scalar in the string, and collecting all word boundaries detected. /// /// This is expected to sometimes skip over word boundaries that are detected /// when going forward. However, it must never report a word boundary at a /// position that isn't also detected by the forward recognizer. @available(StdlibDeploymentTarget 6.3, *) func safeWordBreaks() -> [String.Index] { var result: [String.Index] = [] guard !self.isEmpty else { return result } result.append(self.endIndex) // There is always an implicit wordbreak at the end. var i = self.unicodeScalars.index(before: self.endIndex) var recognizer = Unicode._RandomAccessWordRecognizer(before: self.unicodeScalars[i]) var candidate = i while i > self.startIndex { let j = self.unicodeScalars.index(before: i) let r = recognizer.hasGuaranteedBreak(after: self.unicodeScalars[j]) if r.setCandidate { candidate = i } if r.breakAtCandidate { result.append(candidate) } if r.breakHere { result.append(i) } i = j } result.reverse() return result } /// Return an array of "safe" word boundaries detected by the backwards word /// recognizer state machine, invoked through /// `_wordIndex(somewhereAtOrBefore:)`, one result per each scalar position in /// the string (including its end index). /// /// This is expected to be some monotonically increasing subsequence of word /// boundaries detected in the forward direction, allowing some repeated /// items. @available(StdlibDeploymentTarget 6.3, *) func randomAccessWordBreaks() -> [String.Index] { unicodeScalars.allIndices().map { self._wordIndex(somewhereAtOrBefore: $0) } } } @available(StdlibDeploymentTarget 6.3, *) func check(length: Int) { withEveryArray(of: 0 ..< samples.count, count: length) { vector in let str = string(for: vector) let fastBreaks = str.fastWordBreaks() let slowBreaks = str.slowWordBreaks() expectEqual( fastBreaks, slowBreaks, """ Inconsistent word boundaries in stateful vs stateless iteration: input: \(str.debugDescription) (\(str.scalarDescriptions)) """) let safeBreaks = str.safeWordBreaks() expectTrue( safeBreaks.isMonotonicSubsequence(of: fastBreaks), """ Inconsistent safe word boundaries: input: \(str.debugDescription) (\(str.scalarDescriptions))") """) let randomAccessBreaks = str.randomAccessWordBreaks() expectTrue( randomAccessBreaks.isMonotonicRepeatingSubsequence(of: fastBreaks), """ Inconsistent random-access word boundaries: input: \(str.debugDescription) (\(str.scalarDescriptions)) breaks: \(fastBreaks) random-access breaks: \(randomAccessBreaks) """) } } if #available(StdlibDeploymentTarget 6.3, *) { suite.test("Exhaustive consistency checks, length 1") { check(length: 1) } suite.test("Exhaustive consistency checks, length 2") { check(length: 2) } suite.test("Exhaustive consistency checks, length 3") { check(length: 3) } suite.test("Exhaustive consistency checks, length 4") { check(length: 4) } suite.test("Exhaustive consistency checks, length 5") { check(length: 5) } }