mirror of
https://github.com/apple/swift.git
synced 2025-12-14 20:36:38 +01:00
Carefully overhaul our word breaking implementation to follow the recommendations of Unicode Annex #29. Start exposing the core primitives (as well as `String`-level interfaces), so that folks can prototype proper API for these concepts. - Fix `_wordIndex(after:)` to always advance forward. It now requires its input index to be on a word boundary. Remove the `@_spi` attribute, exposing it as a (hidden, but) public entry point. - The old SPIs `_wordIndex(before:)` and `_nearestWordIndex(atOrBelow:)` were irredemably broken; follow the Unicode recommendation for implementing random-access text segmentation and replace them both with a new public `_wordIndex(somewhereAtOrBefore:)` entry pont. - Expose handcrafted low-level state machines for detecting word boundaries (_WordRecognizer`, `_RandomAccessWordRecognizer`), following the design of `_CharacterRecognizer`. - Add tests to reliably validate that the two state machine flavors always produce consistent results. rdar://155482680
774 lines
30 KiB
Swift
774 lines
30 KiB
Swift
//===----------------------------------------------------------------------===//
|
||
//
|
||
//
|
||
// This source file is part of the Swift.org open source project
|
||
//
|
||
// Copyright (c) 2022 - 2025 Apple Inc. and the Swift project authors
|
||
// Licensed under Apache License v2.0 with Runtime Library Exception
|
||
//
|
||
// See https://swift.org/LICENSE.txt for license information
|
||
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
|
||
//
|
||
//===----------------------------------------------------------------------===//
|
||
|
||
extension Unicode {
|
||
/// A state machine for recognizing word boundaries in an arbitrary series of
|
||
/// Unicode scalars, based on the specification in [Unicode Annex
|
||
/// #29](https://unicode.org/reports/tr29/#Word_Boundary_Rules).
|
||
///
|
||
/// The text segmentation algorithm is not stable, and it allows implementers
|
||
/// to tailor it to their needs. Accordingly, reported word boundaries may
|
||
/// vary in arbitrary ways between Unicode implementations and system
|
||
/// configurations, including between versions of the Swift Standard Library.
|
||
///
|
||
/// To implement the rules as specified, this low-level construct has built-in
|
||
/// support to defer making a decision on whether there is a word boundary
|
||
/// between two Unicode scalars until more scalars are fed to the state
|
||
/// machine; that is to say, it implements limited lookahead. The API surface
|
||
/// only allows one such candidate position to exist at any given time -- this
|
||
/// corresponds to allowing looking ahead up to the next word boundary. (In
|
||
/// the unlikely case the rules evolve to require looking ahead even further,
|
||
/// then this interface will need to be modified or replaced accordingly.)
|
||
///
|
||
/// To detect word breaks in a sequence of Unicode scalars, feed each of them
|
||
/// to the recognizer by calling its `hasBreak(before:)` method. The method
|
||
/// indicates if there is a word break preceding the given scalar, or at a
|
||
/// previously reported candidate position. When every scalar in the text has
|
||
/// been fed to the recognizer, the `hasCandidateBreakAtEnd()` method should
|
||
/// be called to determine if there is a break at the last reported candidate
|
||
/// position. There is also an (implicit) word break at the end of text
|
||
/// position.
|
||
///
|
||
/// Note that `_WordRecognizer` does not take or return actual text positions
|
||
/// (such as a string index); it is entirely independent of the underlying
|
||
/// text representation, and it is able to work with any container model. (For
|
||
/// example, it can be used to incrementally recognize word breaks in UTF-16
|
||
/// data streamed from a network connection, or iterate over word boundaries
|
||
/// in piecewise contiguous UTF-8 buffers stored in a rope data structure.) Of
|
||
/// course, it is also possible to use it to detect word breaks in a standard
|
||
/// `String` value, such as done by this example function:
|
||
///
|
||
/// func collectWordBreaks(in string: String) -> [String.Index] {
|
||
/// var result: [String.Index] = []
|
||
/// var recognizer = Unicode._WordRecognizer()
|
||
/// var candidate = string.startIndex
|
||
/// for i in string.unicodeScalars.indices {
|
||
/// let r = recognizer.hasBreak(before: string.unicodeScalars[i])
|
||
/// if r.setCandidate { candidate = i }
|
||
/// if r.breakAtCandidate { result.append(candidate) }
|
||
/// if r.breakHere { result.append(i) }
|
||
/// }
|
||
/// if recognizer.hasCandidateBreakAtEnd() {
|
||
/// result.append(candidate)
|
||
/// }
|
||
/// result.append(string.endIndex)
|
||
/// return result
|
||
/// }
|
||
///
|
||
/// When used this way, the state machine is able to efficiently iterate over
|
||
/// all breaks within the string by visiting each scalar exactly once, without
|
||
/// any backtracking.
|
||
///
|
||
/// It is also possible to discard the recognizer after each detected
|
||
/// boundary, reinitializing it from scratch for each iteration step:
|
||
///
|
||
/// func wordBreak(
|
||
/// after knownBreak: String.Index, in string: String
|
||
/// ) -> String.Index {
|
||
/// var recognizer = Unicode._WordRecognizer(after: string.unicodeScalars[knownBreak])
|
||
/// var i = string.unicodeScalars.index(after: knownBreak)
|
||
/// var candidate = i
|
||
/// while i < string.endIndex {
|
||
/// let r = recognizer.hasBreak(before: string.unicodeScalars[i])
|
||
/// if r.setCandidate { candidate = i }
|
||
/// if r.breakAtCandidate { return candidate }
|
||
/// if r.breakHere { return i }
|
||
/// string.uncodeScalars.formIndex(after: &i)
|
||
/// }
|
||
/// if recognizer.hasCandidateBreakAtEnd() {
|
||
/// return candidate
|
||
/// }
|
||
/// return i
|
||
/// }
|
||
///
|
||
/// However, note that iterating this way is less efficient, because it
|
||
/// discards lookahead information -- some scalars will be processed multiple
|
||
/// times. The rules are carefully constructed so that the algorithm reports
|
||
/// the same word boundaries whether or not recognizer state is preserved.
|
||
@available(StdlibDeploymentTarget 6.3, *)
|
||
public // Core primitive
|
||
struct _WordRecognizer: Sendable {
|
||
// FIXME: We also need proper public API for this
|
||
|
||
/// The last scalar that was fed to `hasBreak(before:)`.
|
||
var _prevScalar: Unicode.Scalar
|
||
/// The cached word break property of `_prevScalar`.
|
||
var _prevCategory: _WordBreakProperty
|
||
/// The word break property of the last preceding scalar that wasn't ignored by rule WB4.
|
||
var _baseCategory: _WordBreakProperty
|
||
/// The current state of the recognizer.
|
||
var _state: _State
|
||
|
||
/// Initialize a new word recognizer at the _start of text_ (sot)
|
||
/// position.
|
||
///
|
||
/// The resulting state machine will report a word break before the first
|
||
/// scalar that is fed to it.
|
||
public init() {
|
||
// To avoid having to handle the empty case specially, we use LF as the
|
||
// placeholder before the first scalar. Per WB3a, we always produce a break
|
||
// following a line feed.
|
||
_baseCategory = .newlineCRLF
|
||
_prevScalar = Unicode.Scalar(0x0A as UInt8)
|
||
_prevCategory = .newlineCRLF
|
||
_state = .ordinary
|
||
}
|
||
|
||
/// Initialize a new word recognizer with a state after a previously
|
||
/// recognized word boundary.
|
||
///
|
||
/// This enables clients to iterate over word boundaries without maintaining
|
||
/// a persistent recognizer state. However, iterating this way may result in
|
||
/// an arbitrarily large amount of duplicate work. (This is because the word
|
||
/// segmentation algorithm requires looking ahead by as much as a full
|
||
/// word's worth of Unicode scalars, and when the old recognizer is
|
||
/// discarded, the information thus collected has to be recreated from
|
||
/// scratch.) Whenever possible, it is therefore preferable to iterate over
|
||
/// multiple word boundaries using a single recognizer instance.
|
||
///
|
||
/// - Parameter scalar: The Unicode scalar immediately following a known
|
||
/// word boundary position.
|
||
public init(after scalar: Unicode.Scalar) {
|
||
// We assume that the state machine provides stable results even if the
|
||
// start position was a retroactive candidate.
|
||
_prevScalar = scalar
|
||
_prevCategory = Unicode._WordBreakProperty(from: scalar)
|
||
_baseCategory = _prevCategory
|
||
_state = .ordinary
|
||
}
|
||
}
|
||
}
|
||
|
||
@available(StdlibDeploymentTarget 6.3, *)
|
||
extension Unicode._WordRecognizer {
|
||
/// The parts of the word recognizer state that are in addition to saved
|
||
/// information about previous scalars.
|
||
///
|
||
/// This is used to implement stateful lookahead when a break at a particular
|
||
/// candidate position may need to be suppressed or activated based on a
|
||
/// subsequent scalar value. It is also used to keep track of the number of
|
||
/// contiguous regional indicator scalars we have seen so far.
|
||
internal enum _State: Int, Sendable {
|
||
case ordinary
|
||
case afterWB6 // AHLetter × (MidLetter | MidNumLetQ) AHLetter
|
||
case afterWB7b // Hebrew_Letter × Double_Quote Hebrew_Letter
|
||
case afterWB12 // Numeric × (MidNum | MidNumLetQ) Numeric
|
||
case afterMidFlag // [^RI] (RI RI)* RI × RI
|
||
|
||
var hasPendingCandidate: Bool {
|
||
switch self {
|
||
case .afterWB6, .afterWB7b, .afterWB12: return true
|
||
default: return false
|
||
}
|
||
}
|
||
}
|
||
|
||
/// Reject a break at the current position, triggering a break at the current
|
||
/// candidate (if any).
|
||
internal mutating func _reject(
|
||
) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) {
|
||
let breakAtCandidate = _state.hasPendingCandidate
|
||
_state = .ordinary
|
||
return (
|
||
setCandidate: false,
|
||
breakAtCandidate: breakAtCandidate,
|
||
breakHere: false)
|
||
}
|
||
|
||
/// Skip this position, ignoring the current Unicode scalar.
|
||
internal mutating func _ignore(
|
||
) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) {
|
||
// Note: not updating _baseCategory
|
||
return (setCandidate: false, breakAtCandidate: false, breakHere: false)
|
||
}
|
||
|
||
/// Signal a break at the current position, also triggering a break at the
|
||
/// current candidate (if any).
|
||
internal mutating func _accept(
|
||
) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) {
|
||
// If we have a pending candidate, put a break at it
|
||
let breakAtCandidate = _state.hasPendingCandidate
|
||
_state = .ordinary
|
||
return (
|
||
setCandidate: false,
|
||
breakAtCandidate: breakAtCandidate,
|
||
breakHere: true)
|
||
}
|
||
|
||
/// Set the current position as the active break candidate, and transition into the
|
||
/// specified state.
|
||
internal mutating func _transition(
|
||
into state: _State
|
||
) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) {
|
||
_internalInvariant(_state == .ordinary)
|
||
_state = state
|
||
return (setCandidate: true, breakAtCandidate: false, breakHere: false)
|
||
}
|
||
|
||
/// If the current state matches the given expectation, suppress a break at
|
||
/// this position and discard the active candidate; otherwise, report a break
|
||
/// at both positions.
|
||
internal mutating func _expect(
|
||
_ expectedState: _State
|
||
) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) {
|
||
let breakHere = (_state != expectedState)
|
||
let breakAtCandidate = breakHere && _state.hasPendingCandidate
|
||
_state = .ordinary
|
||
return (
|
||
setCandidate: false,
|
||
breakAtCandidate: breakAtCandidate,
|
||
breakHere: breakHere)
|
||
}
|
||
|
||
/// Feeds the next scalar to the state machine, reporting if there is a word
|
||
/// boundary at the current position or a previously reported candidate.
|
||
///
|
||
/// To decide whether there is a word break at the current position, the
|
||
/// segmentation algorithm sometimes needs to look ahead by visiting
|
||
/// additional scalars following the break, up to the next word boundary. To
|
||
/// allow this, the state machine can report that the current position is a
|
||
/// provisional break "candidate". Clients are expected to remember the
|
||
/// position of the last reported candidate, so that it can be retroactively
|
||
/// promoted to a full break as needed.
|
||
///
|
||
/// - Parameter nextScalar: The scalar at the current position in the text.
|
||
/// - Returns: A triple of Boolean values `setCandidate`, `breakAtCandidate`,
|
||
/// `breakHere`. If `setCandidate` is true, then the caller is expected to
|
||
/// save the current text position as a potential word boundary. If
|
||
/// `breakAtCandidate` is true, then there is a word boundary at the last
|
||
/// candidate position. If `breakHere` is true, then there is a word
|
||
/// boundary at the current position. The caller is expected to process
|
||
/// these three components in this specific order.
|
||
public mutating func hasBreak(
|
||
before nextScalar: Unicode.Scalar
|
||
) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) {
|
||
let nextCategory = Unicode._WordBreakProperty(from: nextScalar)
|
||
var nextBase = nextCategory
|
||
|
||
// FIXME: Implement a proper state machine here, dispatching on
|
||
// (state, nextProperty), ideally through a static look-up table
|
||
|
||
defer {
|
||
_prevScalar = nextScalar
|
||
_prevCategory = nextCategory
|
||
_baseCategory = nextBase
|
||
}
|
||
|
||
switch (_prevCategory, nextCategory) {
|
||
case (.any, .any): // WB999
|
||
// Fast path: If we know our scalars have no properties then the decision
|
||
// is trivial and we don't need to crawl to the default statement.
|
||
return _accept()
|
||
|
||
case (.newlineCRLF, _), // WB3a
|
||
(_, .newlineCRLF): // WB3b
|
||
if _prevScalar.value == 0xD, nextScalar.value == 0xA { // WB3
|
||
_internalInvariant(_prevCategory == .newlineCRLF)
|
||
return _reject()
|
||
}
|
||
return _accept()
|
||
|
||
case (.zwj, .extendedPictographic), // WB3c
|
||
(.wSegSpace, .wSegSpace): // WB3d
|
||
return _reject()
|
||
|
||
case (_, .format), // WB4
|
||
(_, .extend),
|
||
(_, .zwj):
|
||
nextBase = _baseCategory // Cancel _baseCategory update
|
||
return _ignore()
|
||
|
||
default:
|
||
break
|
||
}
|
||
|
||
switch (_baseCategory, nextCategory) {
|
||
case (.aLetter, .aLetter), // WB5
|
||
(.aLetter, .hebrewLetter),
|
||
(.hebrewLetter, .aLetter),
|
||
(.hebrewLetter, .hebrewLetter):
|
||
return _reject()
|
||
|
||
case (.aLetter, .midLetter), // WB6
|
||
(.hebrewLetter, .midLetter),
|
||
(.aLetter, .midNumLet),
|
||
(.hebrewLetter, .midNumLet),
|
||
(.aLetter, .singleQuote):
|
||
return _transition(into: .afterWB6)
|
||
|
||
case (.midLetter, .aLetter), // WB7
|
||
(.midLetter, .hebrewLetter),
|
||
(.midNumLet, .aLetter),
|
||
(.midNumLet, .hebrewLetter),
|
||
(.singleQuote, .aLetter),
|
||
(.singleQuote, .hebrewLetter):
|
||
return _expect(.afterWB6)
|
||
|
||
case (.hebrewLetter, .singleQuote): // WB7a
|
||
return _reject()
|
||
|
||
case (.hebrewLetter, .doubleQuote): // WB7b
|
||
return _transition(into: .afterWB7b)
|
||
|
||
case (.doubleQuote, .hebrewLetter): // WB7c
|
||
return _expect(.afterWB7b)
|
||
|
||
case (.numeric, .numeric), // WB8
|
||
(.aLetter, .numeric), // WB9
|
||
(.hebrewLetter, .numeric), // WB9
|
||
(.numeric, .aLetter), // WB10
|
||
(.numeric, .hebrewLetter): // WB10
|
||
return _reject()
|
||
|
||
case (.midNum, .numeric), // WB11
|
||
(.midNumLet, .numeric),
|
||
(.singleQuote, .numeric):
|
||
return _expect(.afterWB12)
|
||
|
||
case (.numeric, .midNum), // WB12
|
||
(.numeric, .midNumLet),
|
||
(.numeric, .singleQuote):
|
||
return _transition(into: .afterWB12)
|
||
|
||
case (.katakana, .katakana), // WB13
|
||
(.aLetter, .extendNumLet), // WB13a
|
||
(.hebrewLetter, .extendNumLet),
|
||
(.numeric, .extendNumLet),
|
||
(.katakana, .extendNumLet),
|
||
(.extendNumLet, .extendNumLet),
|
||
(.extendNumLet, .aLetter), // WB13b
|
||
(.extendNumLet, .hebrewLetter),
|
||
(.extendNumLet, .numeric),
|
||
(.extendNumLet, .katakana):
|
||
return _reject()
|
||
|
||
case (.regionalIndicator, .regionalIndicator): // WB15/WB16
|
||
let breakHere: Bool
|
||
if _state == .afterMidFlag {
|
||
_state = .ordinary
|
||
breakHere = true
|
||
} else {
|
||
_state = .afterMidFlag
|
||
breakHere = false
|
||
}
|
||
return (setCandidate: false, breakAtCandidate: false, breakHere: breakHere)
|
||
|
||
default: // WB999
|
||
return _accept()
|
||
}
|
||
}
|
||
|
||
/// Returns true if the previously reported word boundary candidate needs to
|
||
/// be promoted to a full break if there are no more scalars in the input text.
|
||
///
|
||
/// There is always an (implicit) word boundary at position at the end of
|
||
/// text, following the last scalar; however, the end may also trigger a
|
||
/// pending unreported break at the last candidate previously set by
|
||
/// `hasBreak`. This method returns true in that case, allowing clients to
|
||
/// reliably detect such boundaries.
|
||
public func hasCandidateBreakAtEnd() -> Bool {
|
||
_state.hasPendingCandidate
|
||
}
|
||
}
|
||
|
||
extension Unicode {
|
||
/// A state machine for recognizing safe word boundaries in a backward
|
||
/// sequence of Unicode scalars, based on the specification in [Unicode Annex
|
||
/// #29](https://unicode.org/reports/tr29/#Word_Boundary_Rules).
|
||
///
|
||
/// The text segmentation algorithm is not stable, and it allows implementers
|
||
/// to tailor it to their needs. Accordingly, reported word boundaries may
|
||
/// vary in arbitrary ways between Unicode implementations and system
|
||
/// configurations, including between versions of the Swift Standard Library.
|
||
///
|
||
/// This is intended to help implement searching for word boundaries near an
|
||
/// arbitrary position in a middle of a larger text, as described in [section
|
||
/// 6.4 of Annex #29](https://unicode.org/reports/tr29/#Random_Access). The
|
||
/// start position may be at at an arbitrary scalar anywhere in the input text
|
||
/// -- there is no expectation that the first scalar addresses a known word
|
||
/// boundary. The state machine scans backwards from that position until it
|
||
/// detects a reliable word boundary.
|
||
///
|
||
/// To detect a word break near a particular position in a series of Unicode
|
||
/// scalars, start iterating scalars backward from the start position, feeding
|
||
/// each of them to the `hasGuaranteedBreak(after:)` method. The method
|
||
/// indicates if there is a word break preceding the given scalar, or at a
|
||
/// previously reported candidate position. There is always a word break at
|
||
/// the start of the text; so if we run out of scalars, the start position is
|
||
/// going to be a suitable safe word boundary.
|
||
///
|
||
/// Note that this construct may skip over an arbitrary number of word
|
||
/// boundaries while it is searching for a safe break position. Once a safe
|
||
/// boundary is found, callers are usually expected to use it as the start
|
||
/// position to iterate forward using the standard segmentation algorithm
|
||
/// (implemented by `Unicode._WordRecognizer`), for example until they find
|
||
/// the break nearest to their original start position. In such cases, it is
|
||
/// usually a good idea to incrementally memoize word boundaries as they are
|
||
/// detected, to avoid repeating this process on the same positions later.
|
||
@available(StdlibDeploymentTarget 6.3, *)
|
||
public // Core primitive
|
||
struct _RandomAccessWordRecognizer: Sendable {
|
||
// FIXME: We also need proper public API for this
|
||
|
||
/// The last scalar that was fed to `hasGuaranteedBreak`; i.e., the scalar
|
||
/// immediately following the current one in the text.
|
||
var _nextScalar: Unicode.Scalar
|
||
/// The cached word break property of `_nextScalar`.
|
||
var _nextCategory: _WordBreakProperty
|
||
/// The word break property of the most recently seen scalar that wasn't
|
||
/// ignored by rule WB4.
|
||
var _baseCategory: _WordBreakProperty
|
||
/// Additional recognizer state.
|
||
var _state: _State
|
||
var _hasPendingCandidate: Bool
|
||
|
||
/// Initialize a new word recognizer at an arbitrary text position preceding
|
||
/// the given scalar.
|
||
public init(before scalar: Unicode.Scalar) {
|
||
_nextScalar = scalar
|
||
_nextCategory = Unicode._WordBreakProperty(from: scalar)
|
||
_baseCategory = _nextCategory
|
||
_state = .initial
|
||
_hasPendingCandidate = false
|
||
}
|
||
}
|
||
}
|
||
|
||
|
||
@available(StdlibDeploymentTarget 6.3, *)
|
||
extension Unicode._RandomAccessWordRecognizer {
|
||
internal enum _State: Int, Sendable {
|
||
case initial
|
||
case ordinary
|
||
case beforeWB7
|
||
case beforeWB7c
|
||
case beforeWB11
|
||
|
||
var hasPendingCandidate: Bool {
|
||
switch self {
|
||
case .beforeWB7, .beforeWB7c, .beforeWB11: return true
|
||
default: return false
|
||
}
|
||
}
|
||
}
|
||
|
||
internal mutating func _reject(
|
||
) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) {
|
||
_hasPendingCandidate = false
|
||
return (setCandidate: false, breakAtCandidate: false, breakHere: false)
|
||
}
|
||
|
||
internal mutating func _ignore(
|
||
) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) {
|
||
return (setCandidate: false, breakAtCandidate: false, breakHere: false)
|
||
}
|
||
|
||
internal mutating func _accept(
|
||
) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) {
|
||
if _hasPendingCandidate {
|
||
_hasPendingCandidate = false
|
||
return (setCandidate: false, breakAtCandidate: true, breakHere: false)
|
||
}
|
||
return (setCandidate: false, breakAtCandidate: false, breakHere: true)
|
||
}
|
||
|
||
internal mutating func _placeCandidate(
|
||
) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) {
|
||
if _state == .initial {
|
||
return _reject()
|
||
}
|
||
_hasPendingCandidate = true
|
||
return (setCandidate: true, breakAtCandidate: false, breakHere: false)
|
||
}
|
||
|
||
internal mutating func _placeSoftCandidate(
|
||
) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) {
|
||
if _hasPendingCandidate || _state == .initial {
|
||
return (false, false, false)
|
||
}
|
||
return _placeCandidate()
|
||
}
|
||
|
||
public mutating func hasGuaranteedBreak(
|
||
after previousScalar: Unicode.Scalar
|
||
) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) {
|
||
let prevCategory = Unicode._WordBreakProperty(from: previousScalar)
|
||
var newState: _State = .ordinary
|
||
var newBase = prevCategory
|
||
defer {
|
||
_nextCategory = prevCategory
|
||
_nextScalar = previousScalar
|
||
_baseCategory = newBase
|
||
_state = newState
|
||
}
|
||
|
||
switch (prevCategory, _nextCategory) {
|
||
case (.any, .any): // WB999 shortcut
|
||
return _accept()
|
||
|
||
case (.newlineCRLF, _), // WB3a
|
||
(_, .newlineCRLF): // WB3b
|
||
if previousScalar.value == 0xD, _nextScalar.value == 0xA { // WB3
|
||
return _reject()
|
||
}
|
||
return _accept()
|
||
|
||
case (.zwj, .extendedPictographic), // WB3c
|
||
(.wSegSpace, .wSegSpace): // WB3d
|
||
newBase = _baseCategory
|
||
newState = _state
|
||
return _reject()
|
||
|
||
case (.format, _), // WB4
|
||
(.extend, _),
|
||
(.zwj, _):
|
||
newBase = _baseCategory
|
||
newState = _state
|
||
if _state == .initial || _nextCategory == .format || _nextCategory == .extend || _nextCategory == .zwj {
|
||
return _ignore()
|
||
}
|
||
return _placeSoftCandidate()
|
||
|
||
default:
|
||
break
|
||
}
|
||
|
||
switch (prevCategory, _baseCategory) {
|
||
case (.aLetter, .aLetter), // WB5
|
||
(.aLetter, .hebrewLetter),
|
||
(.hebrewLetter, .aLetter),
|
||
(.hebrewLetter, .hebrewLetter):
|
||
return _reject()
|
||
|
||
case (.midLetter, .aLetter), // WB7
|
||
(.midLetter, .hebrewLetter),
|
||
(.midNumLet, .aLetter),
|
||
(.midNumLet, .hebrewLetter),
|
||
(.singleQuote, .aLetter),
|
||
(.singleQuote, .hebrewLetter):
|
||
newState = .beforeWB7
|
||
return _placeSoftCandidate()
|
||
|
||
case (.aLetter, .midLetter), // WB6
|
||
(.hebrewLetter, .midLetter),
|
||
(.aLetter, .midNumLet),
|
||
(.hebrewLetter, .midNumLet),
|
||
(.aLetter, .singleQuote):
|
||
if _state == .beforeWB7 || _state == .initial {
|
||
return _reject()
|
||
}
|
||
return _accept()
|
||
|
||
case (.hebrewLetter, .singleQuote): // WB7a
|
||
return _reject()
|
||
|
||
case (.doubleQuote, .hebrewLetter): // WB7c
|
||
newState = .beforeWB7c
|
||
return _placeSoftCandidate()
|
||
|
||
case (.hebrewLetter, .doubleQuote): // WB7b
|
||
if _state == .beforeWB7c || _state == .initial {
|
||
return _reject()
|
||
}
|
||
return _accept()
|
||
|
||
case (.numeric, .numeric), // WB8
|
||
(.aLetter, .numeric), // WB9
|
||
(.hebrewLetter, .numeric),
|
||
(.numeric, .aLetter), // WB10
|
||
(.numeric, .hebrewLetter):
|
||
return _reject()
|
||
|
||
case (.midNum, .numeric), // WB11
|
||
(.midNumLet, .numeric),
|
||
(.singleQuote, .numeric):
|
||
newState = .beforeWB11
|
||
return _placeSoftCandidate()
|
||
|
||
case (.numeric, .midNum), // WB12
|
||
(.numeric, .midNumLet),
|
||
(.numeric, .singleQuote):
|
||
if _state == .beforeWB11 || _state == .initial {
|
||
return _reject()
|
||
}
|
||
return _accept()
|
||
|
||
case (.katakana, .katakana), // WB13
|
||
(.aLetter, .extendNumLet), // WB13a
|
||
(.hebrewLetter, .extendNumLet),
|
||
(.numeric, .extendNumLet),
|
||
(.katakana, .extendNumLet),
|
||
(.extendNumLet, .extendNumLet),
|
||
(.extendNumLet, .aLetter), // WB13b
|
||
(.extendNumLet, .hebrewLetter),
|
||
(.extendNumLet, .numeric),
|
||
(.extendNumLet, .katakana):
|
||
return _reject()
|
||
|
||
case (.regionalIndicator, .regionalIndicator): // WB15/WB16
|
||
return _reject()
|
||
|
||
case (_, .format),
|
||
(_, .extend),
|
||
(_, .zwj):
|
||
_internalInvariant(!_hasPendingCandidate)
|
||
newState = .initial
|
||
return _reject()
|
||
|
||
default:
|
||
if
|
||
!_hasPendingCandidate,
|
||
_nextCategory == .format || _nextCategory == .extend || _nextCategory == .zwj
|
||
{
|
||
return _ignore()
|
||
}
|
||
return _accept()
|
||
}
|
||
}
|
||
}
|
||
|
||
extension String {
|
||
/// Find and return a word boundary position at or before an arbitrary index
|
||
/// within this string. The result may not be the closest word break to the
|
||
/// start position.
|
||
///
|
||
/// This implements the core algorithm for finding a "safe" starting point for
|
||
/// random access to word breaks, following [section 6.4 of Unicode Annex
|
||
/// #29](https://unicode.org/reports/tr29/#Random_Access). Unicode defines
|
||
/// word boundaries using a forward-only state machine; this algorithm does
|
||
/// its best to run the state machine backwards until it finds a guaranteed
|
||
/// break position.
|
||
///
|
||
/// This process is inherently an approximation: the algorithm may need to
|
||
/// skip over an arbitrary number of actual word boundaries before finding one
|
||
/// that it can judge with confidence. This makes it relatively expensive to
|
||
/// iterate over word boundaries backwards; in the worst case, a naive
|
||
/// implementation may have quadratic complexity. The recommended way to
|
||
/// mitigate this is to maintain a cache of word breaks already traversed,
|
||
/// only calling this method to extend the range of known breaks backwards as
|
||
/// needed.
|
||
///
|
||
/// - Parameter i: An arbitrary index within the string, not necessarily
|
||
/// addressing a word boundary.
|
||
/// - Returns: A valid index less than equal to the input that is guaranteed to
|
||
/// identify some word boundary at or before `i`.
|
||
@available(SwiftStdlib 6.3, *)
|
||
public func _wordIndex(somewhereAtOrBefore i: Index) -> Index {
|
||
var j = _guts.validateInclusiveScalarIndex(i)
|
||
if j == endIndex {
|
||
return j
|
||
}
|
||
var recognizer = Unicode._RandomAccessWordRecognizer(
|
||
before: self.unicodeScalars[j])
|
||
var candidate = j
|
||
while j > self.startIndex {
|
||
let p = self.unicodeScalars.index(before: j)
|
||
let b = recognizer.hasGuaranteedBreak(after: self.unicodeScalars[p])
|
||
if b.setCandidate { candidate = j }
|
||
if b.breakAtCandidate { return candidate }
|
||
if b.breakHere { return j }
|
||
j = p
|
||
}
|
||
return j
|
||
}
|
||
|
||
/// Return the word boundary position following a known word boundary within
|
||
/// this string.
|
||
///
|
||
/// This implements the word boundary specification of [Unicode Annex
|
||
/// #29](https://unicode.org/reports/tr29/#Default_Word_Boundaries). The
|
||
/// algorithm is not stable, and it allows implementers to tailor it to their
|
||
/// needs; accordingly, the result of this operation may vary between Unicode
|
||
/// implementations and system configurations, including versions of the Swift
|
||
/// Standard Library.
|
||
///
|
||
/// Note: The input index must be on a known word boundary, otherwise the
|
||
/// result of this operation is unspecified. The start and end indices are
|
||
/// always known word boundaries, in every string.
|
||
///
|
||
/// - Parameter i: A valid index addressing a word boundary within this
|
||
/// string.
|
||
/// - Returns: The first word break strictly following `i` in the string.
|
||
@available(StdlibDeploymentTarget 5.7, *)
|
||
public func _wordIndex(after i: String.Index) -> String.Index {
|
||
guard #available(StdlibDeploymentTarget 6.3, *) else {
|
||
fatalError("Unreachable")
|
||
}
|
||
let i = _guts.validateScalarIndex(i)
|
||
if _slowPath(_guts.isForeign) {
|
||
return _guts._nextForeignWordIndex(after: i)
|
||
}
|
||
return _guts._nextUTF8WordIndex(after: i)
|
||
}
|
||
}
|
||
|
||
extension _StringGuts {
|
||
@available(StdlibDeploymentTarget 6.3, *)
|
||
@inline(never)
|
||
@_effects(releasenone)
|
||
internal func _nextUTF8WordIndex(after index: Index) -> Index {
|
||
_internalInvariant(self.isFastUTF8)
|
||
let result = unsafe self.withFastUTF8 { utf8 in
|
||
var offset = index._encodedOffset
|
||
let first = unsafe _decodeScalar(utf8, startingAt: offset)
|
||
offset &+= first.scalarLength
|
||
var recognizer = Unicode._WordRecognizer(after: first.0)
|
||
var candidate = offset
|
||
while offset < utf8.count {
|
||
let (scalar, len) = unsafe _decodeScalar(utf8, startingAt: offset)
|
||
let r = recognizer.hasBreak(before: scalar)
|
||
if r.setCandidate { candidate = offset }
|
||
if r.breakAtCandidate { return candidate }
|
||
if r.breakHere { return offset }
|
||
offset &+= len
|
||
}
|
||
if recognizer.hasCandidateBreakAtEnd() {
|
||
return candidate
|
||
}
|
||
return offset
|
||
}
|
||
// Note: We only signal that the result is scalar aligned, not
|
||
// character-aligned. Unicode does attempt to ensure that word breaks are
|
||
// always character-aligned, but this is not a strict guarantee, especially
|
||
// not if either segmentation algorithm has a tailored implementation. (As
|
||
// of 6.3, we do not tailor our implementations, but we used to do so and we
|
||
// may choose to do it again in the future.)
|
||
return Index(_encodedOffset: result)._scalarAligned._knownUTF8
|
||
}
|
||
|
||
@available(StdlibDeploymentTarget 6.3, *)
|
||
@inline(never)
|
||
internal func _nextForeignWordIndex(after index: Index) -> Index {
|
||
#if _runtime(_ObjC)
|
||
_internalInvariant(self.isForeign)
|
||
let scalars = String.UnicodeScalarView(self)
|
||
var recognizer = Unicode._WordRecognizer(after: scalars[index])
|
||
var i = scalars.index(after: index)
|
||
var candidate = i
|
||
while i < scalars.endIndex {
|
||
let r = recognizer.hasBreak(before: scalars[i])
|
||
if r.setCandidate { candidate = i }
|
||
if r.breakAtCandidate { return candidate }
|
||
if r.breakHere { return i }
|
||
scalars.formIndex(after: &i)
|
||
}
|
||
if recognizer.hasCandidateBreakAtEnd() {
|
||
return candidate
|
||
}
|
||
return i
|
||
#else
|
||
fatalError("Foreign strings are unsupported on this platform")
|
||
#endif
|
||
}
|
||
}
|