Files
swift-mirror/stdlib/public/core/UnicodeSPI.swift
Karoy Lorentey 3e18a07187 [stdlib] Fix implementation of Unicode text segmentation for word boundaries
Carefully overhaul our word breaking implementation to follow the recommendations of Unicode Annex #29. Start exposing the core primitives (as well as `String`-level interfaces), so that folks can prototype proper API for these concepts.

- Fix `_wordIndex(after:)` to always advance forward. It now requires its input index to be on a word boundary. Remove the `@_spi` attribute, exposing it as a (hidden, but) public entry point.
- The old SPIs `_wordIndex(before:)` and `_nearestWordIndex(atOrBelow:)` were irredemably broken; follow the Unicode recommendation for implementing random-access text segmentation and replace them both with a new public `_wordIndex(somewhereAtOrBefore:)` entry pont.
- Expose handcrafted low-level state machines for detecting word boundaries (_WordRecognizer`, `_RandomAccessWordRecognizer`), following the design of `_CharacterRecognizer`.
- Add tests to reliably validate that the two state machine flavors always produce consistent results.

rdar://155482680
2025-08-05 20:04:46 -07:00

210 lines
5.3 KiB
Swift

//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2022 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
import SwiftShims
//===----------------------------------------------------------------------===//
// Unicode.NFD
//===----------------------------------------------------------------------===//
extension Unicode {
@_spi(_Unicode)
@available(SwiftStdlib 5.7, *)
public struct _NFD {
let base: Substring.UnicodeScalarView
}
}
@available(SwiftStdlib 5.7, *)
extension Unicode._NFD: Sendable {}
@available(SwiftStdlib 5.7, *)
extension Unicode._NFD {
@_spi(_Unicode)
@available(SwiftStdlib 5.7, *)
public struct Iterator {
var base: Unicode._InternalNFD<Substring.UnicodeScalarView>.Iterator
}
}
@available(SwiftStdlib 5.7, *)
extension Unicode._NFD.Iterator: Sendable {}
@available(SwiftStdlib 5.7, *)
extension Unicode._NFD.Iterator: IteratorProtocol {
@_spi(_Unicode)
@available(SwiftStdlib 5.7, *)
public mutating func next() -> Unicode.Scalar? {
base.next()
}
}
@available(SwiftStdlib 5.7, *)
extension Unicode._NFD: Sequence {
@_spi(_Unicode)
@available(SwiftStdlib 5.7, *)
public func makeIterator() -> Iterator {
Iterator(base: base._internalNFD.makeIterator())
}
}
extension String {
@_spi(_Unicode)
@available(SwiftStdlib 5.7, *)
public var _nfd: Unicode._NFD {
Unicode._NFD(base: self[...].unicodeScalars)
}
}
extension Substring {
@_spi(_Unicode)
@available(SwiftStdlib 5.7, *)
public var _nfd: Unicode._NFD {
Unicode._NFD(base: unicodeScalars)
}
}
//===----------------------------------------------------------------------===//
// Unicode.NFC
//===----------------------------------------------------------------------===//
extension Unicode {
@_spi(_Unicode)
@available(SwiftStdlib 5.7, *)
public struct _NFC {
let base: Substring.UnicodeScalarView
}
}
@available(SwiftStdlib 5.7, *)
extension Unicode._NFC: Sendable {}
@available(SwiftStdlib 5.7, *)
extension Unicode._NFC {
@_spi(_Unicode)
@available(SwiftStdlib 5.7, *)
public struct Iterator {
var base: Unicode._InternalNFC<Substring.UnicodeScalarView>.Iterator
}
}
@available(SwiftStdlib 5.7, *)
extension Unicode._NFC.Iterator: Sendable {}
@available(SwiftStdlib 5.7, *)
extension Unicode._NFC.Iterator: IteratorProtocol {
@_spi(_Unicode)
@available(SwiftStdlib 5.7, *)
public mutating func next() -> Unicode.Scalar? {
base.next()
}
}
@available(SwiftStdlib 5.7, *)
extension Unicode._NFC: Sequence {
@_spi(_Unicode)
@available(SwiftStdlib 5.7, *)
public func makeIterator() -> Iterator {
Iterator(base: base._internalNFC.makeIterator())
}
}
extension String {
@_spi(_Unicode)
@available(SwiftStdlib 5.7, *)
public var _nfc: Unicode._NFC {
Unicode._NFC(base: self[...].unicodeScalars)
}
}
extension Substring {
@_spi(_Unicode)
@available(SwiftStdlib 5.7, *)
public var _nfc: Unicode._NFC {
Unicode._NFC(base: unicodeScalars)
}
}
//===----------------------------------------------------------------------===//
// Unicode.Script
//===----------------------------------------------------------------------===//
extension Unicode.Scalar.Properties {
@_spi(_Unicode)
@available(SwiftStdlib 5.7, *)
public var _script: UInt8 {
let rawValue = _swift_stdlib_getScript(_scalar.value)
_internalInvariant(rawValue != .max, "Unknown script rawValue")
return rawValue
}
@_spi(_Unicode)
@available(SwiftStdlib 5.7, *)
public var _scriptExtensions: [UInt8] {
var count: UInt8 = 0
let pointer = unsafe _swift_stdlib_getScriptExtensions(_scalar.value, &count)
guard let pointer = unsafe pointer else {
return [_script]
}
var result: [UInt8] = []
result.reserveCapacity(Int(count))
for i in 0 ..< count {
let rawValue = unsafe pointer[Int(i)]
_internalInvariant(rawValue != .max, "Unknown script rawValue")
result.append(rawValue)
}
return result
}
}
//===----------------------------------------------------------------------===//
// Case folding
//===----------------------------------------------------------------------===//
extension Unicode.Scalar.Properties {
@_spi(_Unicode)
@available(SwiftStdlib 5.7, *)
public var _caseFolded: String {
var buffer: (UInt32, UInt32, UInt32) = (.max, .max, .max)
unsafe withUnsafeMutableBytes(of: &buffer) {
// This is safe because the memory is already UInt32
let ptr = unsafe $0.baseAddress!.assumingMemoryBound(to: UInt32.self)
unsafe _swift_stdlib_getCaseMapping(_scalar.value, ptr)
}
var result = ""
// Max mapping is 3 scalars and the max UTF8 bytes of a scalar is 4.
result.reserveCapacity(12)
unsafe withUnsafeBytes(of: &buffer) {
for unsafe scalar in unsafe $0.bindMemory(to: UInt32.self) {
guard scalar != .max else {
break
}
result.unicodeScalars.append(Unicode.Scalar(scalar)!)
}
}
return result
}
}