Files
swift-mirror/stdlib/public/core/StringWordBreaking.swift
Karoy Lorentey 3e18a07187 [stdlib] Fix implementation of Unicode text segmentation for word boundaries
Carefully overhaul our word breaking implementation to follow the recommendations of Unicode Annex #29. Start exposing the core primitives (as well as `String`-level interfaces), so that folks can prototype proper API for these concepts.

- Fix `_wordIndex(after:)` to always advance forward. It now requires its input index to be on a word boundary. Remove the `@_spi` attribute, exposing it as a (hidden, but) public entry point.
- The old SPIs `_wordIndex(before:)` and `_nearestWordIndex(atOrBelow:)` were irredemably broken; follow the Unicode recommendation for implementing random-access text segmentation and replace them both with a new public `_wordIndex(somewhereAtOrBefore:)` entry pont.
- Expose handcrafted low-level state machines for detecting word boundaries (_WordRecognizer`, `_RandomAccessWordRecognizer`), following the design of `_CharacterRecognizer`.
- Add tests to reliably validate that the two state machine flavors always produce consistent results.

rdar://155482680
2025-08-05 20:04:46 -07:00

774 lines
30 KiB
Swift
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
//===----------------------------------------------------------------------===//
//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2022 - 2025 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
extension Unicode {
/// A state machine for recognizing word boundaries in an arbitrary series of
/// Unicode scalars, based on the specification in [Unicode Annex
/// #29](https://unicode.org/reports/tr29/#Word_Boundary_Rules).
///
/// The text segmentation algorithm is not stable, and it allows implementers
/// to tailor it to their needs. Accordingly, reported word boundaries may
/// vary in arbitrary ways between Unicode implementations and system
/// configurations, including between versions of the Swift Standard Library.
///
/// To implement the rules as specified, this low-level construct has built-in
/// support to defer making a decision on whether there is a word boundary
/// between two Unicode scalars until more scalars are fed to the state
/// machine; that is to say, it implements limited lookahead. The API surface
/// only allows one such candidate position to exist at any given time -- this
/// corresponds to allowing looking ahead up to the next word boundary. (In
/// the unlikely case the rules evolve to require looking ahead even further,
/// then this interface will need to be modified or replaced accordingly.)
///
/// To detect word breaks in a sequence of Unicode scalars, feed each of them
/// to the recognizer by calling its `hasBreak(before:)` method. The method
/// indicates if there is a word break preceding the given scalar, or at a
/// previously reported candidate position. When every scalar in the text has
/// been fed to the recognizer, the `hasCandidateBreakAtEnd()` method should
/// be called to determine if there is a break at the last reported candidate
/// position. There is also an (implicit) word break at the end of text
/// position.
///
/// Note that `_WordRecognizer` does not take or return actual text positions
/// (such as a string index); it is entirely independent of the underlying
/// text representation, and it is able to work with any container model. (For
/// example, it can be used to incrementally recognize word breaks in UTF-16
/// data streamed from a network connection, or iterate over word boundaries
/// in piecewise contiguous UTF-8 buffers stored in a rope data structure.) Of
/// course, it is also possible to use it to detect word breaks in a standard
/// `String` value, such as done by this example function:
///
/// func collectWordBreaks(in string: String) -> [String.Index] {
/// var result: [String.Index] = []
/// var recognizer = Unicode._WordRecognizer()
/// var candidate = string.startIndex
/// for i in string.unicodeScalars.indices {
/// let r = recognizer.hasBreak(before: string.unicodeScalars[i])
/// if r.setCandidate { candidate = i }
/// if r.breakAtCandidate { result.append(candidate) }
/// if r.breakHere { result.append(i) }
/// }
/// if recognizer.hasCandidateBreakAtEnd() {
/// result.append(candidate)
/// }
/// result.append(string.endIndex)
/// return result
/// }
///
/// When used this way, the state machine is able to efficiently iterate over
/// all breaks within the string by visiting each scalar exactly once, without
/// any backtracking.
///
/// It is also possible to discard the recognizer after each detected
/// boundary, reinitializing it from scratch for each iteration step:
///
/// func wordBreak(
/// after knownBreak: String.Index, in string: String
/// ) -> String.Index {
/// var recognizer = Unicode._WordRecognizer(after: string.unicodeScalars[knownBreak])
/// var i = string.unicodeScalars.index(after: knownBreak)
/// var candidate = i
/// while i < string.endIndex {
/// let r = recognizer.hasBreak(before: string.unicodeScalars[i])
/// if r.setCandidate { candidate = i }
/// if r.breakAtCandidate { return candidate }
/// if r.breakHere { return i }
/// string.uncodeScalars.formIndex(after: &i)
/// }
/// if recognizer.hasCandidateBreakAtEnd() {
/// return candidate
/// }
/// return i
/// }
///
/// However, note that iterating this way is less efficient, because it
/// discards lookahead information -- some scalars will be processed multiple
/// times. The rules are carefully constructed so that the algorithm reports
/// the same word boundaries whether or not recognizer state is preserved.
@available(StdlibDeploymentTarget 6.3, *)
public // Core primitive
struct _WordRecognizer: Sendable {
// FIXME: We also need proper public API for this
/// The last scalar that was fed to `hasBreak(before:)`.
var _prevScalar: Unicode.Scalar
/// The cached word break property of `_prevScalar`.
var _prevCategory: _WordBreakProperty
/// The word break property of the last preceding scalar that wasn't ignored by rule WB4.
var _baseCategory: _WordBreakProperty
/// The current state of the recognizer.
var _state: _State
/// Initialize a new word recognizer at the _start of text_ (sot)
/// position.
///
/// The resulting state machine will report a word break before the first
/// scalar that is fed to it.
public init() {
// To avoid having to handle the empty case specially, we use LF as the
// placeholder before the first scalar. Per WB3a, we always produce a break
// following a line feed.
_baseCategory = .newlineCRLF
_prevScalar = Unicode.Scalar(0x0A as UInt8)
_prevCategory = .newlineCRLF
_state = .ordinary
}
/// Initialize a new word recognizer with a state after a previously
/// recognized word boundary.
///
/// This enables clients to iterate over word boundaries without maintaining
/// a persistent recognizer state. However, iterating this way may result in
/// an arbitrarily large amount of duplicate work. (This is because the word
/// segmentation algorithm requires looking ahead by as much as a full
/// word's worth of Unicode scalars, and when the old recognizer is
/// discarded, the information thus collected has to be recreated from
/// scratch.) Whenever possible, it is therefore preferable to iterate over
/// multiple word boundaries using a single recognizer instance.
///
/// - Parameter scalar: The Unicode scalar immediately following a known
/// word boundary position.
public init(after scalar: Unicode.Scalar) {
// We assume that the state machine provides stable results even if the
// start position was a retroactive candidate.
_prevScalar = scalar
_prevCategory = Unicode._WordBreakProperty(from: scalar)
_baseCategory = _prevCategory
_state = .ordinary
}
}
}
@available(StdlibDeploymentTarget 6.3, *)
extension Unicode._WordRecognizer {
/// The parts of the word recognizer state that are in addition to saved
/// information about previous scalars.
///
/// This is used to implement stateful lookahead when a break at a particular
/// candidate position may need to be suppressed or activated based on a
/// subsequent scalar value. It is also used to keep track of the number of
/// contiguous regional indicator scalars we have seen so far.
internal enum _State: Int, Sendable {
case ordinary
case afterWB6 // AHLetter × (MidLetter | MidNumLetQ) AHLetter
case afterWB7b // Hebrew_Letter × Double_Quote Hebrew_Letter
case afterWB12 // Numeric × (MidNum | MidNumLetQ) Numeric
case afterMidFlag // [^RI] (RI RI)* RI × RI
var hasPendingCandidate: Bool {
switch self {
case .afterWB6, .afterWB7b, .afterWB12: return true
default: return false
}
}
}
/// Reject a break at the current position, triggering a break at the current
/// candidate (if any).
internal mutating func _reject(
) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) {
let breakAtCandidate = _state.hasPendingCandidate
_state = .ordinary
return (
setCandidate: false,
breakAtCandidate: breakAtCandidate,
breakHere: false)
}
/// Skip this position, ignoring the current Unicode scalar.
internal mutating func _ignore(
) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) {
// Note: not updating _baseCategory
return (setCandidate: false, breakAtCandidate: false, breakHere: false)
}
/// Signal a break at the current position, also triggering a break at the
/// current candidate (if any).
internal mutating func _accept(
) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) {
// If we have a pending candidate, put a break at it
let breakAtCandidate = _state.hasPendingCandidate
_state = .ordinary
return (
setCandidate: false,
breakAtCandidate: breakAtCandidate,
breakHere: true)
}
/// Set the current position as the active break candidate, and transition into the
/// specified state.
internal mutating func _transition(
into state: _State
) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) {
_internalInvariant(_state == .ordinary)
_state = state
return (setCandidate: true, breakAtCandidate: false, breakHere: false)
}
/// If the current state matches the given expectation, suppress a break at
/// this position and discard the active candidate; otherwise, report a break
/// at both positions.
internal mutating func _expect(
_ expectedState: _State
) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) {
let breakHere = (_state != expectedState)
let breakAtCandidate = breakHere && _state.hasPendingCandidate
_state = .ordinary
return (
setCandidate: false,
breakAtCandidate: breakAtCandidate,
breakHere: breakHere)
}
/// Feeds the next scalar to the state machine, reporting if there is a word
/// boundary at the current position or a previously reported candidate.
///
/// To decide whether there is a word break at the current position, the
/// segmentation algorithm sometimes needs to look ahead by visiting
/// additional scalars following the break, up to the next word boundary. To
/// allow this, the state machine can report that the current position is a
/// provisional break "candidate". Clients are expected to remember the
/// position of the last reported candidate, so that it can be retroactively
/// promoted to a full break as needed.
///
/// - Parameter nextScalar: The scalar at the current position in the text.
/// - Returns: A triple of Boolean values `setCandidate`, `breakAtCandidate`,
/// `breakHere`. If `setCandidate` is true, then the caller is expected to
/// save the current text position as a potential word boundary. If
/// `breakAtCandidate` is true, then there is a word boundary at the last
/// candidate position. If `breakHere` is true, then there is a word
/// boundary at the current position. The caller is expected to process
/// these three components in this specific order.
public mutating func hasBreak(
before nextScalar: Unicode.Scalar
) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) {
let nextCategory = Unicode._WordBreakProperty(from: nextScalar)
var nextBase = nextCategory
// FIXME: Implement a proper state machine here, dispatching on
// (state, nextProperty), ideally through a static look-up table
defer {
_prevScalar = nextScalar
_prevCategory = nextCategory
_baseCategory = nextBase
}
switch (_prevCategory, nextCategory) {
case (.any, .any): // WB999
// Fast path: If we know our scalars have no properties then the decision
// is trivial and we don't need to crawl to the default statement.
return _accept()
case (.newlineCRLF, _), // WB3a
(_, .newlineCRLF): // WB3b
if _prevScalar.value == 0xD, nextScalar.value == 0xA { // WB3
_internalInvariant(_prevCategory == .newlineCRLF)
return _reject()
}
return _accept()
case (.zwj, .extendedPictographic), // WB3c
(.wSegSpace, .wSegSpace): // WB3d
return _reject()
case (_, .format), // WB4
(_, .extend),
(_, .zwj):
nextBase = _baseCategory // Cancel _baseCategory update
return _ignore()
default:
break
}
switch (_baseCategory, nextCategory) {
case (.aLetter, .aLetter), // WB5
(.aLetter, .hebrewLetter),
(.hebrewLetter, .aLetter),
(.hebrewLetter, .hebrewLetter):
return _reject()
case (.aLetter, .midLetter), // WB6
(.hebrewLetter, .midLetter),
(.aLetter, .midNumLet),
(.hebrewLetter, .midNumLet),
(.aLetter, .singleQuote):
return _transition(into: .afterWB6)
case (.midLetter, .aLetter), // WB7
(.midLetter, .hebrewLetter),
(.midNumLet, .aLetter),
(.midNumLet, .hebrewLetter),
(.singleQuote, .aLetter),
(.singleQuote, .hebrewLetter):
return _expect(.afterWB6)
case (.hebrewLetter, .singleQuote): // WB7a
return _reject()
case (.hebrewLetter, .doubleQuote): // WB7b
return _transition(into: .afterWB7b)
case (.doubleQuote, .hebrewLetter): // WB7c
return _expect(.afterWB7b)
case (.numeric, .numeric), // WB8
(.aLetter, .numeric), // WB9
(.hebrewLetter, .numeric), // WB9
(.numeric, .aLetter), // WB10
(.numeric, .hebrewLetter): // WB10
return _reject()
case (.midNum, .numeric), // WB11
(.midNumLet, .numeric),
(.singleQuote, .numeric):
return _expect(.afterWB12)
case (.numeric, .midNum), // WB12
(.numeric, .midNumLet),
(.numeric, .singleQuote):
return _transition(into: .afterWB12)
case (.katakana, .katakana), // WB13
(.aLetter, .extendNumLet), // WB13a
(.hebrewLetter, .extendNumLet),
(.numeric, .extendNumLet),
(.katakana, .extendNumLet),
(.extendNumLet, .extendNumLet),
(.extendNumLet, .aLetter), // WB13b
(.extendNumLet, .hebrewLetter),
(.extendNumLet, .numeric),
(.extendNumLet, .katakana):
return _reject()
case (.regionalIndicator, .regionalIndicator): // WB15/WB16
let breakHere: Bool
if _state == .afterMidFlag {
_state = .ordinary
breakHere = true
} else {
_state = .afterMidFlag
breakHere = false
}
return (setCandidate: false, breakAtCandidate: false, breakHere: breakHere)
default: // WB999
return _accept()
}
}
/// Returns true if the previously reported word boundary candidate needs to
/// be promoted to a full break if there are no more scalars in the input text.
///
/// There is always an (implicit) word boundary at position at the end of
/// text, following the last scalar; however, the end may also trigger a
/// pending unreported break at the last candidate previously set by
/// `hasBreak`. This method returns true in that case, allowing clients to
/// reliably detect such boundaries.
public func hasCandidateBreakAtEnd() -> Bool {
_state.hasPendingCandidate
}
}
extension Unicode {
/// A state machine for recognizing safe word boundaries in a backward
/// sequence of Unicode scalars, based on the specification in [Unicode Annex
/// #29](https://unicode.org/reports/tr29/#Word_Boundary_Rules).
///
/// The text segmentation algorithm is not stable, and it allows implementers
/// to tailor it to their needs. Accordingly, reported word boundaries may
/// vary in arbitrary ways between Unicode implementations and system
/// configurations, including between versions of the Swift Standard Library.
///
/// This is intended to help implement searching for word boundaries near an
/// arbitrary position in a middle of a larger text, as described in [section
/// 6.4 of Annex #29](https://unicode.org/reports/tr29/#Random_Access). The
/// start position may be at at an arbitrary scalar anywhere in the input text
/// -- there is no expectation that the first scalar addresses a known word
/// boundary. The state machine scans backwards from that position until it
/// detects a reliable word boundary.
///
/// To detect a word break near a particular position in a series of Unicode
/// scalars, start iterating scalars backward from the start position, feeding
/// each of them to the `hasGuaranteedBreak(after:)` method. The method
/// indicates if there is a word break preceding the given scalar, or at a
/// previously reported candidate position. There is always a word break at
/// the start of the text; so if we run out of scalars, the start position is
/// going to be a suitable safe word boundary.
///
/// Note that this construct may skip over an arbitrary number of word
/// boundaries while it is searching for a safe break position. Once a safe
/// boundary is found, callers are usually expected to use it as the start
/// position to iterate forward using the standard segmentation algorithm
/// (implemented by `Unicode._WordRecognizer`), for example until they find
/// the break nearest to their original start position. In such cases, it is
/// usually a good idea to incrementally memoize word boundaries as they are
/// detected, to avoid repeating this process on the same positions later.
@available(StdlibDeploymentTarget 6.3, *)
public // Core primitive
struct _RandomAccessWordRecognizer: Sendable {
// FIXME: We also need proper public API for this
/// The last scalar that was fed to `hasGuaranteedBreak`; i.e., the scalar
/// immediately following the current one in the text.
var _nextScalar: Unicode.Scalar
/// The cached word break property of `_nextScalar`.
var _nextCategory: _WordBreakProperty
/// The word break property of the most recently seen scalar that wasn't
/// ignored by rule WB4.
var _baseCategory: _WordBreakProperty
/// Additional recognizer state.
var _state: _State
var _hasPendingCandidate: Bool
/// Initialize a new word recognizer at an arbitrary text position preceding
/// the given scalar.
public init(before scalar: Unicode.Scalar) {
_nextScalar = scalar
_nextCategory = Unicode._WordBreakProperty(from: scalar)
_baseCategory = _nextCategory
_state = .initial
_hasPendingCandidate = false
}
}
}
@available(StdlibDeploymentTarget 6.3, *)
extension Unicode._RandomAccessWordRecognizer {
internal enum _State: Int, Sendable {
case initial
case ordinary
case beforeWB7
case beforeWB7c
case beforeWB11
var hasPendingCandidate: Bool {
switch self {
case .beforeWB7, .beforeWB7c, .beforeWB11: return true
default: return false
}
}
}
internal mutating func _reject(
) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) {
_hasPendingCandidate = false
return (setCandidate: false, breakAtCandidate: false, breakHere: false)
}
internal mutating func _ignore(
) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) {
return (setCandidate: false, breakAtCandidate: false, breakHere: false)
}
internal mutating func _accept(
) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) {
if _hasPendingCandidate {
_hasPendingCandidate = false
return (setCandidate: false, breakAtCandidate: true, breakHere: false)
}
return (setCandidate: false, breakAtCandidate: false, breakHere: true)
}
internal mutating func _placeCandidate(
) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) {
if _state == .initial {
return _reject()
}
_hasPendingCandidate = true
return (setCandidate: true, breakAtCandidate: false, breakHere: false)
}
internal mutating func _placeSoftCandidate(
) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) {
if _hasPendingCandidate || _state == .initial {
return (false, false, false)
}
return _placeCandidate()
}
public mutating func hasGuaranteedBreak(
after previousScalar: Unicode.Scalar
) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) {
let prevCategory = Unicode._WordBreakProperty(from: previousScalar)
var newState: _State = .ordinary
var newBase = prevCategory
defer {
_nextCategory = prevCategory
_nextScalar = previousScalar
_baseCategory = newBase
_state = newState
}
switch (prevCategory, _nextCategory) {
case (.any, .any): // WB999 shortcut
return _accept()
case (.newlineCRLF, _), // WB3a
(_, .newlineCRLF): // WB3b
if previousScalar.value == 0xD, _nextScalar.value == 0xA { // WB3
return _reject()
}
return _accept()
case (.zwj, .extendedPictographic), // WB3c
(.wSegSpace, .wSegSpace): // WB3d
newBase = _baseCategory
newState = _state
return _reject()
case (.format, _), // WB4
(.extend, _),
(.zwj, _):
newBase = _baseCategory
newState = _state
if _state == .initial || _nextCategory == .format || _nextCategory == .extend || _nextCategory == .zwj {
return _ignore()
}
return _placeSoftCandidate()
default:
break
}
switch (prevCategory, _baseCategory) {
case (.aLetter, .aLetter), // WB5
(.aLetter, .hebrewLetter),
(.hebrewLetter, .aLetter),
(.hebrewLetter, .hebrewLetter):
return _reject()
case (.midLetter, .aLetter), // WB7
(.midLetter, .hebrewLetter),
(.midNumLet, .aLetter),
(.midNumLet, .hebrewLetter),
(.singleQuote, .aLetter),
(.singleQuote, .hebrewLetter):
newState = .beforeWB7
return _placeSoftCandidate()
case (.aLetter, .midLetter), // WB6
(.hebrewLetter, .midLetter),
(.aLetter, .midNumLet),
(.hebrewLetter, .midNumLet),
(.aLetter, .singleQuote):
if _state == .beforeWB7 || _state == .initial {
return _reject()
}
return _accept()
case (.hebrewLetter, .singleQuote): // WB7a
return _reject()
case (.doubleQuote, .hebrewLetter): // WB7c
newState = .beforeWB7c
return _placeSoftCandidate()
case (.hebrewLetter, .doubleQuote): // WB7b
if _state == .beforeWB7c || _state == .initial {
return _reject()
}
return _accept()
case (.numeric, .numeric), // WB8
(.aLetter, .numeric), // WB9
(.hebrewLetter, .numeric),
(.numeric, .aLetter), // WB10
(.numeric, .hebrewLetter):
return _reject()
case (.midNum, .numeric), // WB11
(.midNumLet, .numeric),
(.singleQuote, .numeric):
newState = .beforeWB11
return _placeSoftCandidate()
case (.numeric, .midNum), // WB12
(.numeric, .midNumLet),
(.numeric, .singleQuote):
if _state == .beforeWB11 || _state == .initial {
return _reject()
}
return _accept()
case (.katakana, .katakana), // WB13
(.aLetter, .extendNumLet), // WB13a
(.hebrewLetter, .extendNumLet),
(.numeric, .extendNumLet),
(.katakana, .extendNumLet),
(.extendNumLet, .extendNumLet),
(.extendNumLet, .aLetter), // WB13b
(.extendNumLet, .hebrewLetter),
(.extendNumLet, .numeric),
(.extendNumLet, .katakana):
return _reject()
case (.regionalIndicator, .regionalIndicator): // WB15/WB16
return _reject()
case (_, .format),
(_, .extend),
(_, .zwj):
_internalInvariant(!_hasPendingCandidate)
newState = .initial
return _reject()
default:
if
!_hasPendingCandidate,
_nextCategory == .format || _nextCategory == .extend || _nextCategory == .zwj
{
return _ignore()
}
return _accept()
}
}
}
extension String {
/// Find and return a word boundary position at or before an arbitrary index
/// within this string. The result may not be the closest word break to the
/// start position.
///
/// This implements the core algorithm for finding a "safe" starting point for
/// random access to word breaks, following [section 6.4 of Unicode Annex
/// #29](https://unicode.org/reports/tr29/#Random_Access). Unicode defines
/// word boundaries using a forward-only state machine; this algorithm does
/// its best to run the state machine backwards until it finds a guaranteed
/// break position.
///
/// This process is inherently an approximation: the algorithm may need to
/// skip over an arbitrary number of actual word boundaries before finding one
/// that it can judge with confidence. This makes it relatively expensive to
/// iterate over word boundaries backwards; in the worst case, a naive
/// implementation may have quadratic complexity. The recommended way to
/// mitigate this is to maintain a cache of word breaks already traversed,
/// only calling this method to extend the range of known breaks backwards as
/// needed.
///
/// - Parameter i: An arbitrary index within the string, not necessarily
/// addressing a word boundary.
/// - Returns: A valid index less than equal to the input that is guaranteed to
/// identify some word boundary at or before `i`.
@available(SwiftStdlib 6.3, *)
public func _wordIndex(somewhereAtOrBefore i: Index) -> Index {
var j = _guts.validateInclusiveScalarIndex(i)
if j == endIndex {
return j
}
var recognizer = Unicode._RandomAccessWordRecognizer(
before: self.unicodeScalars[j])
var candidate = j
while j > self.startIndex {
let p = self.unicodeScalars.index(before: j)
let b = recognizer.hasGuaranteedBreak(after: self.unicodeScalars[p])
if b.setCandidate { candidate = j }
if b.breakAtCandidate { return candidate }
if b.breakHere { return j }
j = p
}
return j
}
/// Return the word boundary position following a known word boundary within
/// this string.
///
/// This implements the word boundary specification of [Unicode Annex
/// #29](https://unicode.org/reports/tr29/#Default_Word_Boundaries). The
/// algorithm is not stable, and it allows implementers to tailor it to their
/// needs; accordingly, the result of this operation may vary between Unicode
/// implementations and system configurations, including versions of the Swift
/// Standard Library.
///
/// Note: The input index must be on a known word boundary, otherwise the
/// result of this operation is unspecified. The start and end indices are
/// always known word boundaries, in every string.
///
/// - Parameter i: A valid index addressing a word boundary within this
/// string.
/// - Returns: The first word break strictly following `i` in the string.
@available(StdlibDeploymentTarget 5.7, *)
public func _wordIndex(after i: String.Index) -> String.Index {
guard #available(StdlibDeploymentTarget 6.3, *) else {
fatalError("Unreachable")
}
let i = _guts.validateScalarIndex(i)
if _slowPath(_guts.isForeign) {
return _guts._nextForeignWordIndex(after: i)
}
return _guts._nextUTF8WordIndex(after: i)
}
}
extension _StringGuts {
@available(StdlibDeploymentTarget 6.3, *)
@inline(never)
@_effects(releasenone)
internal func _nextUTF8WordIndex(after index: Index) -> Index {
_internalInvariant(self.isFastUTF8)
let result = unsafe self.withFastUTF8 { utf8 in
var offset = index._encodedOffset
let first = unsafe _decodeScalar(utf8, startingAt: offset)
offset &+= first.scalarLength
var recognizer = Unicode._WordRecognizer(after: first.0)
var candidate = offset
while offset < utf8.count {
let (scalar, len) = unsafe _decodeScalar(utf8, startingAt: offset)
let r = recognizer.hasBreak(before: scalar)
if r.setCandidate { candidate = offset }
if r.breakAtCandidate { return candidate }
if r.breakHere { return offset }
offset &+= len
}
if recognizer.hasCandidateBreakAtEnd() {
return candidate
}
return offset
}
// Note: We only signal that the result is scalar aligned, not
// character-aligned. Unicode does attempt to ensure that word breaks are
// always character-aligned, but this is not a strict guarantee, especially
// not if either segmentation algorithm has a tailored implementation. (As
// of 6.3, we do not tailor our implementations, but we used to do so and we
// may choose to do it again in the future.)
return Index(_encodedOffset: result)._scalarAligned._knownUTF8
}
@available(StdlibDeploymentTarget 6.3, *)
@inline(never)
internal func _nextForeignWordIndex(after index: Index) -> Index {
#if _runtime(_ObjC)
_internalInvariant(self.isForeign)
let scalars = String.UnicodeScalarView(self)
var recognizer = Unicode._WordRecognizer(after: scalars[index])
var i = scalars.index(after: index)
var candidate = i
while i < scalars.endIndex {
let r = recognizer.hasBreak(before: scalars[i])
if r.setCandidate { candidate = i }
if r.breakAtCandidate { return candidate }
if r.breakHere { return i }
scalars.formIndex(after: &i)
}
if recognizer.hasCandidateBreakAtEnd() {
return candidate
}
return i
#else
fatalError("Foreign strings are unsupported on this platform")
#endif
}
}