mirror of
https://github.com/apple/swift.git
synced 2025-12-21 12:14:44 +01:00
[stdlib] Fix implementation of Unicode text segmentation for word boundaries
Carefully overhaul our word breaking implementation to follow the recommendations of Unicode Annex #29. Start exposing the core primitives (as well as `String`-level interfaces), so that folks can prototype proper API for these concepts. - Fix `_wordIndex(after:)` to always advance forward. It now requires its input index to be on a word boundary. Remove the `@_spi` attribute, exposing it as a (hidden, but) public entry point. - The old SPIs `_wordIndex(before:)` and `_nearestWordIndex(atOrBelow:)` were irredeemably broken; follow the Unicode recommendation for implementing random-access text segmentation and replace them both with a new public `_wordIndex(somewhereAtOrBefore:)` entry point. - Expose handcrafted low-level state machines for detecting word boundaries (`_WordRecognizer`, `_RandomAccessWordRecognizer`), following the design of `_CharacterRecognizer`. - Add tests to reliably validate that the two state machine flavors always produce consistent results. rdar://155482680
This commit is contained in:
@@ -13,7 +13,7 @@
|
||||
import SwiftShims
|
||||
|
||||
extension Unicode {
|
||||
internal enum _GraphemeBreakProperty {
|
||||
internal enum _GraphemeBreakProperty: Sendable {
|
||||
case any
|
||||
case control
|
||||
case extend
|
||||
@@ -86,7 +86,7 @@ extension Unicode {
|
||||
}
|
||||
|
||||
extension Unicode {
|
||||
internal enum _WordBreakProperty {
|
||||
internal enum _WordBreakProperty: UInt8, Sendable {
|
||||
case aLetter
|
||||
case any
|
||||
case doubleQuote
|
||||
@@ -105,8 +105,8 @@ extension Unicode {
|
||||
case singleQuote
|
||||
case wSegSpace
|
||||
case zwj
|
||||
|
||||
init(from scalar: Unicode.Scalar) {
|
||||
|
||||
internal init(from scalar: Unicode.Scalar) {
|
||||
switch scalar.value {
|
||||
case 0xA ... 0xD,
|
||||
0x85,
|
||||
@@ -122,7 +122,7 @@ extension Unicode {
|
||||
self = .regionalIndicator
|
||||
default:
|
||||
let rawValue = _swift_stdlib_getWordBreakProperty(scalar.value)
|
||||
|
||||
|
||||
switch rawValue {
|
||||
case 0:
|
||||
self = .extend
|
||||
|
||||
Reference in New Issue
Block a user