Files
swift-mirror/stdlib/public/core/UnicodeBreakProperty.swift
Karoy Lorentey 3e18a07187 [stdlib] Fix implementation of Unicode text segmentation for word boundaries
Carefully overhaul our word breaking implementation to follow the recommendations of Unicode Annex #29. Start exposing the core primitives (as well as `String`-level interfaces), so that folks can prototype proper API for these concepts.

- Fix `_wordIndex(after:)` to always advance forward. It now requires its input index to be on a word boundary. Remove the `@_spi` attribute, exposing it as a (hidden, but) public entry point.
- The old SPIs `_wordIndex(before:)` and `_nearestWordIndex(atOrBelow:)` were irredeemably broken; follow the Unicode recommendation for implementing random-access text segmentation and replace them both with a new public `_wordIndex(somewhereAtOrBefore:)` entry point.
- Expose handcrafted low-level state machines for detecting word boundaries (`_WordRecognizer`, `_RandomAccessWordRecognizer`), following the design of `_CharacterRecognizer`.
- Add tests to reliably validate that the two state machine flavors always produce consistent results.

rdar://155482680
2025-08-05 20:04:46 -07:00

158 lines
3.6 KiB
Swift

//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2021 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
import SwiftShims
extension Unicode {
  /// Classification of a Unicode scalar for grapheme cluster breaking.
  ///
  /// A handful of scalar ranges are classified directly by `init(from:)`;
  /// every other scalar is classified by asking
  /// `_swift_stdlib_getGraphemeBreakProperty` and translating the code it
  /// returns.
  internal enum _GraphemeBreakProperty: Sendable {
    case any, control, extend, extendedPictographic
    case l, lv, lvt
    case prepend, regionalIndicator, spacingMark
    case t, v, zwj

    /// Determines the grapheme break property of `scalar`.
    ///
    /// - Parameter scalar: The Unicode scalar to classify.
    @inline(__always)
    init(from scalar: Unicode.Scalar) {
      let codePoint = scalar.value

      // The ranges listed explicitly below do not overlap, so each scalar
      // matches at most one of them; anything left over is resolved through
      // the data array lookup in the `default` branch.
      switch codePoint {
      case 0x0 ... 0x1F, 0xE01F0 ... 0xE0FFF:
        self = .control
      case 0x20 ... 0x7E:
        // The rest of the ASCII fast path (0x7F is left to the lookup).
        self = .any
      case 0x200D:
        self = .zwj
      case 0x1F1E6 ... 0x1F1FF:
        self = .regionalIndicator
      case 0x1FC00 ... 0x1FFFD:
        self = .extendedPictographic
      case 0x1100 ... 0x115F, 0xA960 ... 0xA97C:
        self = .l
      case 0x1160 ... 0x11A7, 0xD7B0 ... 0xD7C6:
        self = .v
      case 0x11A8 ... 0x11FF, 0xD7CB ... 0xD7FB:
        self = .t
      case 0xAC00 ... 0xD7A3:
        // 0xAC00 % 28 == 16, so this test selects every 28th scalar counting
        // from the start of the range as `.lv`; all others are `.lvt`.
        self = codePoint % 28 == 16 ? .lv : .lvt
      default:
        // Otherwise, default to binary searching the data array.
        switch _swift_stdlib_getGraphemeBreakProperty(codePoint) {
        case 0: self = .control
        case 1: self = .extend
        case 2: self = .prepend
        case 3: self = .spacingMark
        // Extended pictographic uses 2 values for its representation.
        case 4, 5: self = .extendedPictographic
        // Any other code means no specific property applies.
        default: self = .any
        }
      }
    }
  }
}
extension Unicode {
  /// Classification of a Unicode scalar for word breaking.
  ///
  /// The `UInt8` raw values are assigned implicitly, following the order in
  /// which the cases are declared.
  internal enum _WordBreakProperty: UInt8, Sendable {
    case aLetter, any
    case doubleQuote
    case extend, extendedPictographic, extendNumLet
    case format
    case hebrewLetter
    case katakana
    case midLetter, midNum, midNumLet
    case newlineCRLF, numeric
    case regionalIndicator
    case singleQuote
    case wSegSpace
    case zwj

    /// Determines the word break property of `scalar`.
    ///
    /// A few scalars are classified directly; all others are classified by
    /// asking `_swift_stdlib_getWordBreakProperty` and translating the code
    /// it returns.
    ///
    /// - Parameter scalar: The Unicode scalar to classify.
    internal init(from scalar: Unicode.Scalar) {
      let codePoint = scalar.value

      switch codePoint {
      // Scalars handled here never reach the data lookup. The listed values
      // do not overlap, so the order of these cases is immaterial.
      case 0x22:
        self = .doubleQuote
      case 0x27:
        self = .singleQuote
      case 0xA ... 0xD, 0x85, 0x2028 ... 0x2029:
        self = .newlineCRLF
      case 0x200D:
        self = .zwj
      case 0x1F1E6 ... 0x1F1FF:
        self = .regionalIndicator
      default:
        // Translate the numeric code reported by the lookup into a case.
        let tableCode = _swift_stdlib_getWordBreakProperty(codePoint)

        switch tableCode {
        case 0: self = .extend
        case 1: self = .format
        case 2: self = .katakana
        case 3: self = .hebrewLetter
        case 4: self = .aLetter
        case 5: self = .midNumLet
        case 6: self = .midLetter
        case 7: self = .midNum
        case 8: self = .numeric
        case 9: self = .extendNumLet
        case 10: self = .wSegSpace
        case 11: self = .extendedPictographic
        // Any other code means no specific property applies.
        default: self = .any
        }
      }
    }
  }
}