[stdlib] Fix implementation of Unicode text segmentation for word boundaries

Carefully overhaul our word breaking implementation to follow the recommendations of Unicode Annex #29. Start exposing the core primitives (as well as `String`-level interfaces), so that folks can prototype proper API for these concepts. - Fix `_wordIndex(after:)` to always advance forward. It now requires its input index to be on a word boundary. Remove the `@_spi` attribute, exposing it as a (hidden, but) public entry point. - The old SPIs `_wordIndex(before:)` and `_nearestWordIndex(atOrBelow:)` were irredeemably broken; follow the Unicode recommendation for implementing random-access text segmentation and replace them both with a new public `_wordIndex(somewhereAtOrBefore:)` entry point. - Expose handcrafted low-level state machines for detecting word boundaries (`_WordRecognizer`, `_RandomAccessWordRecognizer`), following the design of `_CharacterRecognizer`. - Add tests to reliably validate that the two state machine flavors always produce consistent results. rdar://155482680
2025-12-21 12:14:44 +01:00 · 2025-07-24 17:25:56 -07:00
parent 22b1205cf2
commit 3e18a07187
7 changed files with 1165 additions and 702 deletions
--- a/validation-test/stdlib/StringWordBreaking.swift
+++ b/validation-test/stdlib/StringWordBreaking.swift
@@ -2,35 +2,116 @@
 // RUN: %target-run-stdlib-swift %S/Inputs/

 // REQUIRES: executable_test
-// REQUIRES: objc_interop
 // REQUIRES: optimized_stdlib

-@_spi(_Unicode)
-import Swift
-
 import StdlibUnittest
 import StdlibUnicodeUnittest
 import Foundation

 let StringWordBreaking = TestSuite("StringWordBreaking")
-
-// FIXME: Reenable once we figure out what to do with WordView
-// @available(SwiftStdlib 5.7, *)
-// extension String._WordView {
-//   var backwardsCount: Int {
-//     var c = 0
-//     var index = endIndex
-//     while index != startIndex {
-//       c += 1
-//       formIndex(before: &index)
-//     }
-//     return c
-//   }
-// }
+defer { runAllTests() }

 extension String {
+  /// Returns all word boundaries within the string, using a single word
+  /// recognizer instance. This is the most efficient way to find word
+  /// boundaries, as it processes each scalar exactly once.
+  @available(StdlibDeploymentTarget 6.3, *)
+  func fastWordBreaks() -> [String.Index] {
+    var result: [String.Index] = []
+    var i = self.startIndex
+    var recognizer = Unicode._WordRecognizer()
+    var candidate = i
+    while i < self.endIndex {
+      let (setCandidate, breakAtCandidate, breakHere) =
+        recognizer.hasBreak(before: self.unicodeScalars[i])
+      if setCandidate {
+        candidate = i
+      }
+      if breakAtCandidate {
+        result.append(candidate)
+      }
+      if breakHere {
+        result.append(i)
+      }
+      self.unicodeScalars.formIndex(after: &i)
+    }
+    if recognizer.hasCandidateBreakAtEnd() {
+      result.append(candidate)
+    }
+    result.append(i)
+    return result
+  }
+
+  /// Return the word boundary position preceding a known boundary within this
+  /// string.
+  ///
+  /// This implements the word boundary specification of [Unicode Annex
+  /// #29](https://unicode.org/reports/tr29/#Default_Word_Boundaries). The
+  /// algorithm is not stable, and it allows implementers to tailor it to their
+  /// needs; accordingly, the result of this operation may vary between Unicode
+  /// implementations and system configurations, including versions of the Swift
+  /// Standard Library.
+  ///
+  /// - Note: If the input index is not on a word boundary, then it is first
+  /// rounded down to the nearest boundary before starting this operation.
+  ///
+  /// - Warning: Using this method to iterate over the word breaks in a string
+  ///    backward has worst-case complexity that is proportional to the _square_
+  ///    of the length of the string. It is usually a better idea to keep a
+  ///    cache of known word boundaries, calculated by iterating _forwards_ from
+  ///    the start index, or a position returned by
+  ///    `_wordIndex(somewhereAtOrBefore:)`.
+  ///
+  /// - Parameter i: A valid index addressing a word boundary within this
+  ///    string.
+  /// - Returns: The last word break strictly preceding `i` in the string.
+  @available(StdlibDeploymentTarget 6.3, *)
+  public func _wordIndex(before i: String.Index) -> String.Index {
+    let i = self.unicodeScalars._index(roundingDown: i)
+    var j = _wordIndex(somewhereAtOrBefore: unicodeScalars.index(before: i))
+
+    // We know there is a stable break at `j`, however, the backward search may
+    // have skipped over some conditional breaks that it could not fully
+    // evaluate. Find the closest actual break that precedes `i` by iterating
+    // forward until we reach or jump over it.
+    precondition(j < i)
+    var recognizer = Unicode._WordRecognizer()
+    var bestBreak = j
+    var candidate = j
+    while j < self.endIndex {
+      let r = recognizer.hasBreak(before: self.unicodeScalars[j])
+      if r.setCandidate { candidate = j }
+      if r.breakAtCandidate {
+        guard candidate < i else { break }
+        bestBreak = candidate
+      }
+      if r.breakHere {
+        guard j < i else { break }
+        bestBreak = j
+      }
+      self.unicodeScalars.formIndex(after: &j)
+    }
+    if j == self.endIndex, candidate < i, recognizer.hasCandidateBreakAtEnd() {
+      bestBreak = candidate
+    }
+    precondition(bestBreak < i)
+    return bestBreak
+  }
+}
+
+extension String {
+  @available(SwiftStdlib 6.3, *)
+  var statefulWords: [String] {
+    let breaks = fastWordBreaks()
+    var prev = breaks[0]
+    return breaks.dropFirst().map { next in
+      defer { prev = next }
+      return String(self[prev ..< next])
+    }
+  }
+
  @available(SwiftStdlib 5.9, *)
-  var _words: [String] {
+  var statelessWords: [String] {
    var result: [String] = []

    var i = startIndex
@@ -48,8 +129,8 @@ extension String {
    return result
  }

-  @available(SwiftStdlib 5.9, *)
-  var _wordsBackwards: [String] {
+  @available(SwiftStdlib 6.3, *)
+  var backwardWords: [String] {
    var result: [String] = []

    var i = endIndex
@@ -68,54 +149,22 @@ extension String {
  }
 }

-if #available(SwiftStdlib 6.1, *) {
-  StringWordBreaking.test("word breaking") {
-    for wordBreakTest in wordBreakTests {
-      expectEqual(
-        wordBreakTest.1,
-        wordBreakTest.0._words,
-        "string: \(String(reflecting: wordBreakTest.0))")
-      expectEqual(
-        wordBreakTest.1.reversed(),
-        wordBreakTest.0._wordsBackwards,
-        "string: \(String(reflecting: wordBreakTest.0))")
-    }
+extension Unicode.Scalar {
+  var unicodeNotation: String {
+      let v = String(self.value, radix: 16, uppercase: true)
+      return "U+\(String(repeating: "0", count: max(0, 4 - v.count)))\(v)"
  }
 }

-// rdar://116652595
-//
-// We were accidentally hanging when rounding word indices for some concoctions of
-// strings. In particular, where we had a pair of scalars create a constraint
-// for the preceding pair, but the preceding extend rules were not taking the
-// constraint into consideration.
-if #available(SwiftStdlib 5.10, *) {
-  StringWordBreaking.test("word breaking backward extend constraints") {
-    let strs = ["日\u{FE0F}:X ", "👨‍👨‍👧‍👦\u{FE0F}:X ", "⛔️:X ", "⛔️·X ", "⛔️：X "]
-    let strWords = [
-      ["日\u{FE0F}", ":", "X", " "],
-      ["👨‍👨‍👧‍👦\u{FE0F}", ":", "X", " "],
-      ["⛔️", ":", "X", " "],
-      ["⛔️", "·", "X", " "],
-      ["⛔️", "：", "X", " "]
-    ]
-
-    for (str, words) in zip(strs, strWords) {
-      expectEqual(
-        words,
-        str._words,
-        "string: \(String(reflecting: str))"
-      )
-
-      expectEqual(
-        words.reversed(),
-        str._wordsBackwards,
-        "string: \(String(reflecting: str))"
-      )
-    }
+extension String {
+  var scalarDescriptions: String {
+    return self.unicodeScalars
+      .lazy.map { $0.unicodeNotation }
+      .joined(separator: " ")
  }
 }

+#if _runtime(_ObjC)
 // The most simple subclass of NSString that CoreFoundation does not know
 // about.
 class NonContiguousNSString : NSString {
@@ -123,16 +172,17 @@ class NonContiguousNSString : NSString {
    fatalError("don't call this initializer")
  }
  required init(itemProviderData data: Data, typeIdentifier: String) throws {
-    fatalError("don't call this initializer")    
+    fatalError("don't call this initializer")
  }

-  override init() { 
+  override init() {
    _value = []
-    super.init() 
+    super.init()
  }

-  init(_ value: [UInt16]) {
-    _value = value
+  @inline(never)
+  init(_ value: some Sequence<UInt16>) {
+    _value = Array(value)
    super.init()
  }

@@ -157,36 +207,79 @@ extension _StringGuts {
  @_silgen_name("$ss11_StringGutsV9isForeignSbvg")
  func _isForeign() -> Bool
 }
+#endif

-func getUTF16Array(from string: String) -> [UInt16] {
-  var result: [UInt16] = []
-
-  for cp in string.utf16 {
-    result.append(cp)
+func testCases() -> [(String, [String])] {
+  var tests = StdlibUnicodeUnittest.wordBreakTests
+  if #available(SwiftStdlib 5.10, *) {
+    // rdar://116652595
+    //
+    // We were accidentally hanging when rounding word indices for some
+    // concoctions of strings. In particular, where we had a pair of scalars
+    // create a constraint for the preceding pair, but the preceding extend
+    // rules were not taking the constraint into consideration.
+    tests += [
+      ("日\u{FE0F}:X ", ["日\u{FE0F}", ":", "X", " "]),
+      ("👨‍👨‍👧‍👦\u{FE0F}:X ", ["👨‍👨‍👧‍👦\u{FE0F}", ":", "X", " "]),
+      ("⛔️:X ", ["⛔️", ":", "X", " "]),
+      ("⛔️·X ", ["⛔️", "·", "X", " "]),
+      ("⛔️：X ", ["⛔️", "：", "X", " "]),
+    ]
  }
-
-  return result
+  if #available(SwiftStdlib 6.3, *) {
+    tests += [
+      // https://github.com/swiftlang/swift-experimental-string-processing/issues/818
+      // rdar://154902007
+      ("\u{2060}\u{2018}\u{2060}\u{2060}example.com\u{2060}\u{2060}\u{2019}",
+       ["\u{2060}", "\u{2018}\u{2060}\u{2060}", "example.com\u{2060}\u{2060}", "\u{2019}"]),
+    ]
+  }
+  return tests
 }

 if #available(SwiftStdlib 6.1, *) {
-  StringWordBreaking.test("word breaking foreign") {
-    for wordBreakTest in wordBreakTests {
-      let foreignTest = NonContiguousNSString(
-        getUTF16Array(from: wordBreakTest.0)
-      )
-      let test = foreignTest as String
-
-      expectTrue(test._guts._isForeign())
+  StringWordBreaking.test("word breaking") {
+    for (input, expectedWords) in testCases() {
      expectEqual(
-        wordBreakTest.1,
-        test._words,
-        "string: \(String(reflecting: wordBreakTest.0))")
-      expectEqual(
-        wordBreakTest.1.reversed(),
-        test._wordsBackwards,
-        "string: \(String(reflecting: wordBreakTest.0))")
+        input.statelessWords,
+        expectedWords,
+        "input: \(input.debugDescription) \(input.scalarDescriptions)")
+      if #available(SwiftStdlib 6.3, *) {
+        expectEqual(
+          input.statefulWords,
+          expectedWords,
+          "input: \(input.debugDescription) \(input.scalarDescriptions)")
+        expectEqual(
+          input.backwardWords,
+          expectedWords.reversed(),
+          "input: \(input.debugDescription) \(input.scalarDescriptions)")
+      }
    }
  }
 }

-runAllTests()
+#if _runtime(_ObjC)
+if #available(SwiftStdlib 6.1, *) {
+  StringWordBreaking.test("word breaking foreign") {
+    for (nativeString, expectedWords) in testCases() {
+      let input = NonContiguousNSString(nativeString.utf16) as String
+
+      expectTrue(input._guts._isForeign())
+      expectEqual(
+        input.statelessWords,
+        expectedWords,
+        "input: \(nativeString.debugDescription) \(nativeString.scalarDescriptions)")
+      if #available(SwiftStdlib 6.3, *) {
+        expectEqual(
+          input.statefulWords,
+          expectedWords,
+          "input: \(nativeString.debugDescription) \(nativeString.scalarDescriptions)")
+        expectEqual(
+          input.backwardWords,
+          expectedWords.reversed(),
+          "input: \(nativeString.debugDescription) \(nativeString.scalarDescriptions)")
+      }
+    }
+  }
+}
+#endif