Files
swift-mirror/validation-test/stdlib/StringWordBreaking.swift
2025-08-08 14:01:09 -07:00

287 lines
8.5 KiB
Swift
Raw Permalink Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// RUN: %empty-directory(%t)
// RUN: %target-run-stdlib-swift %S/Inputs/
// REQUIRES: executable_test
// REQUIRES: optimized_stdlib
// REQUIRES: objc_interop
// FIXME: Text segmentation test cases are only available when we have Foundation
import StdlibUnittest
import StdlibUnicodeUnittest
import Foundation
// The suite that every `StringWordBreaking.test(...)` call in this file
// registers its test with.
let StringWordBreaking = TestSuite("StringWordBreaking")
// Top-level `defer`: `runAllTests()` is invoked once the remaining top-level
// code of this file (which registers the tests) has finished executing.
defer { runAllTests() }
extension String {
/// Returns all word boundaries within the string, using a single word
/// recognizer instance. This is the most efficient way to find word
/// boundaries, as it processes each scalar exactly once.
///
/// - Returns: The boundary positions in the order they are reported while
///   scanning forward; the last element is always `endIndex`.
@available(StdlibDeploymentTarget 6.3, *)
func fastWordBreaks() -> [String.Index] {
var result: [String.Index] = []
var i = self.startIndex
var recognizer = Unicode._WordRecognizer()
// Position the recognizer most recently asked us to remember; it only becomes
// a boundary if a later step (or the end-of-input check) confirms it.
var candidate = i
while i < self.endIndex {
// Feed one scalar and get back three flags describing what it implies.
let (setCandidate, breakAtCandidate, breakHere) =
recognizer.hasBreak(before: self.unicodeScalars[i])
if setCandidate {
// Remember the current position as the pending candidate.
candidate = i
}
if breakAtCandidate {
// The remembered position is reported as a boundary.
result.append(candidate)
}
if breakHere {
// There is a boundary directly before the current scalar.
result.append(i)
}
self.unicodeScalars.formIndex(after: &i)
}
// Input is exhausted: ask whether the pending candidate counts as a boundary.
if recognizer.hasCandidateBreakAtEnd() {
result.append(candidate)
}
// `i` has reached `endIndex` here; it is always recorded as the final boundary.
result.append(i)
return result
}
/// Return the word boundary position preceding a known boundary within this
/// string.
///
/// This implements the word boundary specification of [Unicode Annex
/// #29](https://unicode.org/reports/tr29/#Default_Word_Boundaries). The
/// algorithm is not stable, and it allows implementers to tailor it to their
/// needs; accordingly, the result of this operation may vary between Unicode
/// implementations and system configurations, including versions of the Swift
/// Standard Library.
///
/// - Note: If the input index is not on a word boundary, then it is first
/// rounded down to the nearest boundary before starting this operation.
///
/// - Warning: Using this method to iterate over the word breaks in a string
/// backward has worst-case complexity that is proportional to the _square_
/// of the length of the string. It is usually a better idea to keep a
/// cache of known word boundaries, calculated by iterating _forwards_ from
/// the start index, or a position returned by
/// `_wordIndex(somewhereAtOrBefore:)`.
///
/// - Parameter i: A valid index addressing a word boundary within this
/// string.
/// - Returns: The closest word break strictly preceding `i` in the string.
@available(StdlibDeploymentTarget 6.3, *)
public func _wordIndex(before i: String.Index) -> String.Index {
// NOTE(review): the only rounding performed here is
// `unicodeScalars._index(roundingDown:)` (presumably to a scalar boundary);
// the search below then returns the closest break strictly below that
// position. Confirm this matches the rounding described in the doc note.
let i = self.unicodeScalars._index(roundingDown: i)
// Start from some break at or before the scalar preceding `i`.
// NOTE(review): this steps back one scalar from `i`, so `i` presumably must
// be greater than `startIndex` — confirm against callers.
var j = _wordIndex(somewhereAtOrBefore: unicodeScalars.index(before: i))
// We know there is a stable break at `j`, however, the backward search may
// have skipped over some conditional breaks that it could not fully
// evaluate. Find the closest actual break that precedes `i` by iterating
// forward until we reach or jump over it.
precondition(j < i)
var recognizer = Unicode._WordRecognizer()
// Latest confirmed break that is still strictly below `i`.
var bestBreak = j
// Pending position remembered on the recognizer's request.
var candidate = j
while j < self.endIndex {
let r = recognizer.hasBreak(before: self.unicodeScalars[j])
if r.setCandidate { candidate = j }
if r.breakAtCandidate {
// A confirmed candidate at or beyond `i` ends the search (`break` leaves
// the `while` loop).
guard candidate < i else { break }
bestBreak = candidate
}
if r.breakHere {
// Likewise, a break at or beyond `i` ends the search.
guard j < i else { break }
bestBreak = j
}
self.unicodeScalars.formIndex(after: &j)
}
// Only when the scan ran to the end of the string (rather than leaving the
// loop early) can a still-pending candidate below `i` be confirmed.
if j == self.endIndex, candidate < i, recognizer.hasCandidateBreakAtEnd() {
bestBreak = candidate
}
precondition(bestBreak < i)
return bestBreak
}
}
extension String {
  /// The words of this string, sliced at consecutive positions returned by
  /// `fastWordBreaks()`.
  @available(SwiftStdlib 6.3, *)
  var statefulWords: [String] {
    let boundaries = fastWordBreaks()
    var words: [String] = []
    var lower = boundaries[0]
    for upper in boundaries.dropFirst() {
      words.append(String(self[lower ..< upper]))
      lower = upper
    }
    return words
  }

  /// The words of this string, found by repeatedly asking
  /// `_wordIndex(after:)` for the next boundary, starting at `startIndex`.
  @available(SwiftStdlib 5.9, *)
  var statelessWords: [String] {
    var words: [String] = []
    var lower = startIndex
    while lower < endIndex {
      let upper = _wordIndex(after: lower)
      words.append(String(self[lower ..< upper]))
      lower = upper
    }
    return words
  }

  /// The words of this string in reverse order, found by repeatedly asking
  /// `_wordIndex(before:)` for the previous boundary, starting at `endIndex`.
  @available(SwiftStdlib 6.3, *)
  var backwardWords: [String] {
    var words: [String] = []
    var upper = endIndex
    while upper > startIndex {
      let lower = _wordIndex(before: upper)
      words.append(String(self[lower ..< upper]))
      upper = lower
    }
    return words
  }
}
extension Unicode.Scalar {
  /// The scalar in `U+XXXX` form: its value in uppercase hexadecimal,
  /// left-padded with zeros to at least four digits.
  var unicodeNotation: String {
    let hex = String(value, radix: 16, uppercase: true)
    let missingDigits = max(0, 4 - hex.count)
    let padding = String(repeating: "0", count: missingDigits)
    return "U+" + padding + hex
  }
}
extension String {
  /// The `unicodeNotation` of every Unicode scalar in this string, separated
  /// by single spaces.
  var scalarDescriptions: String {
    let notations = unicodeScalars.map(\.unicodeNotation)
    return notations.joined(separator: " ")
  }
}
#if _runtime(_ObjC)
// The most simple subclass of NSString that CoreFoundation does not know
// about.
//
// The string's contents are held as UTF-16 code units in the Swift array
// `_value`; `length` and `character(at:)` are overridden to read from it.
class NonContiguousNSString : NSString {
// Required by the superclass; not supported here and traps if called.
required init(coder aDecoder: NSCoder) {
fatalError("don't call this initializer")
}
// Required by the superclass; not supported here and traps if called.
required init(itemProviderData data: Data, typeIdentifier: String) throws {
fatalError("don't call this initializer")
}
// Creates an empty string (no code units).
override init() {
_value = []
super.init()
}
// Creates a string whose contents are a copy of the given UTF-16 code units.
@inline(never)
init(_ value: some Sequence<UInt16>) {
_value = Array(value)
super.init()
}
@objc(copyWithZone:) override func copy(with zone: NSZone?) -> Any {
// Ensure that copying this string produces a class that CoreFoundation
// does not know about.
return self
}
// Number of UTF-16 code units stored in `_value`.
@objc override var length: Int {
return _value.count
}
// The UTF-16 code unit stored at `index` in `_value`.
@objc override func character(at index: Int) -> unichar {
return _value[index]
}
// Backing storage: the string's UTF-16 code units.
var _value: [UInt16]
}
extension _StringGuts {
// Declaration without a body, bound by `@_silgen_name` to an existing symbol.
// NOTE(review): the mangled name looks like the getter of
// `_StringGuts.isForeign` returning `Bool` — confirm by demangling.
// Used by the "word breaking foreign" test to check that the bridged string
// really is foreign.
@_silgen_name("$ss11_StringGutsV9isForeignSbvg")
func _isForeign() -> Bool
}
#endif
/// Builds the `(input, expected words)` pairs exercised by the tests in this
/// file.
///
/// The list starts with `StdlibUnicodeUnittest.wordBreakTests` and then adds
/// regression cases, each group gated on the stdlib version named in its
/// `#available` check.
///
/// Every pair must satisfy `expectedWords.joined() == input`: the tests compare
/// the expected words against slices of the input, so an entry that violates
/// this can never pass.
///
/// - Returns: The inputs to segment, each paired with the words it is expected
///   to be split into, in forward order.
func testCases() -> [(String, [String])] {
  var tests = StdlibUnicodeUnittest.wordBreakTests

  if #available(SwiftStdlib 5.10, *) {
    // rdar://116652595
    //
    // We were accidentally hanging when rounding word indices for some
    // concoctions of strings. In particular, where we had a pair of scalars
    // create a constraint for the preceding pair, but the preceding extend
    // rules were not taking the constraint into consideration.
    tests += [
      ("\u{FE0F}:X ", ["\u{FE0F}", ":", "X", " "]),
      ("👨‍👨‍👧‍👦\u{FE0F}:X ", ["👨‍👨‍👧‍👦\u{FE0F}", ":", "X", " "]),
      ("⛔️:X ", ["⛔️", ":", "X", " "]),
      // Same shape as the case above, with MIDDLE DOT (U+00B7) and FULLWIDTH
      // COLON (U+FF1A) in place of ":". These are written with escapes so that
      // the emoji's variation selector and the confusable punctuation cannot
      // be silently dropped or altered by tools that handle such characters
      // specially.
      ("\u{26D4}\u{FE0F}\u{B7}X ",
       ["\u{26D4}\u{FE0F}", "\u{B7}", "X", " "]),
      ("\u{26D4}\u{FE0F}\u{FF1A}X ",
       ["\u{26D4}\u{FE0F}", "\u{FF1A}", "X", " "]),
    ]
  }

  if #available(SwiftStdlib 6.3, *) {
    tests += [
      // https://github.com/swiftlang/swift-experimental-string-processing/issues/818
      // rdar://154902007
      ("\u{2060}\u{2018}\u{2060}\u{2060}example.com\u{2060}\u{2060}\u{2019}",
       ["\u{2060}", "\u{2018}\u{2060}\u{2060}", "example.com\u{2060}\u{2060}", "\u{2019}"]),
    ]
  }

  return tests
}
if #available(SwiftStdlib 6.1, *) {
  // Segments every input from `testCases()` as a native string and compares
  // the result with the expected words: forward via `statelessWords`, and,
  // where available, forward via `statefulWords` and backward via
  // `backwardWords`.
  StringWordBreaking.test("word breaking") {
    for (input, expectedWords) in testCases() {
      // Failure message identifying the input, both as a literal and as a
      // list of scalar values.
      func context() -> String {
        "input: \(input.debugDescription) \(input.scalarDescriptions)"
      }

      // `expectEqual` takes the expected value first and the actual value
      // second; keeping that order makes failure output label them correctly.
      expectEqual(expectedWords, input.statelessWords, context())

      if #available(SwiftStdlib 6.3, *) {
        expectEqual(expectedWords, input.statefulWords, context())
        // `backwardWords` yields the words last-to-first.
        expectEqual(
          Array(expectedWords.reversed()),
          input.backwardWords,
          context())
      }
    }
  }
}
// `NonContiguousNSString` and `_StringGuts._isForeign()` are only declared
// under `_runtime(_ObjC)`, so the test that uses them is compiled under the
// same condition.
#if _runtime(_ObjC)
if #available(SwiftStdlib 6.1, *) {
  // Same checks as the "word breaking" test, but on strings bridged from an
  // `NSString` subclass so that the foreign (non-native) string code paths
  // are exercised.
  StringWordBreaking.test("word breaking foreign") {
    for (nativeString, expectedWords) in testCases() {
      let input = NonContiguousNSString(nativeString.utf16) as String
      // Make sure bridging really produced a foreign string; otherwise this
      // test would only repeat the native one.
      expectTrue(input._guts._isForeign())

      // Failure message identifying the input, described via the native
      // original.
      func context() -> String {
        "input: \(nativeString.debugDescription) \(nativeString.scalarDescriptions)"
      }

      // `expectEqual` takes the expected value first and the actual value
      // second; keeping that order makes failure output label them correctly.
      expectEqual(expectedWords, input.statelessWords, context())

      if #available(SwiftStdlib 6.3, *) {
        expectEqual(expectedWords, input.statefulWords, context())
        // `backwardWords` yields the words last-to-first.
        expectEqual(
          Array(expectedWords.reversed()),
          input.backwardWords,
          context())
      }
    }
  }
}
#endif