//===----------------------------------------------------------------------===// // // // This source file is part of the Swift.org open source project // // Copyright (c) 2022 - 2025 Apple Inc. and the Swift project authors // Licensed under Apache License v2.0 with Runtime Library Exception // // See https://swift.org/LICENSE.txt for license information // See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors // //===----------------------------------------------------------------------===// extension Unicode { /// A state machine for recognizing word boundaries in an arbitrary series of /// Unicode scalars, based on the specification in [Unicode Annex /// #29](https://unicode.org/reports/tr29/#Word_Boundary_Rules). /// /// The text segmentation algorithm is not stable, and it allows implementers /// to tailor it to their needs. Accordingly, reported word boundaries may /// vary in arbitrary ways between Unicode implementations and system /// configurations, including between versions of the Swift Standard Library. /// /// To implement the rules as specified, this low-level construct has built-in /// support to defer making a decision on whether there is a word boundary /// between two Unicode scalars until more scalars are fed to the state /// machine; that is to say, it implements limited lookahead. The API surface /// only allows one such candidate position to exist at any given time -- this /// corresponds to allowing looking ahead up to the next word boundary. (In /// the unlikely case the rules evolve to require looking ahead even further, /// then this interface will need to be modified or replaced accordingly.) /// /// To detect word breaks in a sequence of Unicode scalars, feed each of them /// to the recognizer by calling its `hasBreak(before:)` method. The method /// indicates if there is a word break preceding the given scalar, or at a /// previously reported candidate position. When every scalar in the text has /// been fed to the recognizer, the `hasCandidateBreakAtEnd()` method should /// be called to determine if there is a break at the last reported candidate /// position. There is also an (implicit) word break at the end of text /// position. /// /// Note that `_WordRecognizer` does not take or return actual text positions /// (such as a string index); it is entirely independent of the underlying /// text representation, and it is able to work with any container model. (For /// example, it can be used to incrementally recognize word breaks in UTF-16 /// data streamed from a network connection, or iterate over word boundaries /// in piecewise contiguous UTF-8 buffers stored in a rope data structure.) Of /// course, it is also possible to use it to detect word breaks in a standard /// `String` value, such as done by this example function: /// /// func collectWordBreaks(in string: String) -> [String.Index] { /// var result: [String.Index] = [] /// var recognizer = Unicode._WordRecognizer() /// var candidate = string.startIndex /// for i in string.unicodeScalars.indices { /// let r = recognizer.hasBreak(before: string.unicodeScalars[i]) /// if r.setCandidate { candidate = i } /// if r.breakAtCandidate { result.append(candidate) } /// if r.breakHere { result.append(i) } /// } /// if recognizer.hasCandidateBreakAtEnd() { /// result.append(candidate) /// } /// result.append(string.endIndex) /// return result /// } /// /// When used this way, the state machine is able to efficiently iterate over /// all breaks within the string by visiting each scalar exactly once, without /// any backtracking. /// /// It is also possible to discard the recognizer after each detected /// boundary, reinitializing it from scratch for each iteration step: /// /// func wordBreak( /// after knownBreak: String.Index, in string: String /// ) -> String.Index { /// var recognizer = Unicode._WordRecognizer(after: string.unicodeScalars[knownBreak]) /// var i = string.unicodeScalars.index(after: knownBreak) /// var candidate = i /// while i < string.endIndex { /// let r = recognizer.hasBreak(before: string.unicodeScalars[i]) /// if r.setCandidate { candidate = i } /// if r.breakAtCandidate { return candidate } /// if r.breakHere { return i } /// string.uncodeScalars.formIndex(after: &i) /// } /// if recognizer.hasCandidateBreakAtEnd() { /// return candidate /// } /// return i /// } /// /// However, note that iterating this way is less efficient, because it /// discards lookahead information -- some scalars will be processed multiple /// times. The rules are carefully constructed so that the algorithm reports /// the same word boundaries whether or not recognizer state is preserved. @available(StdlibDeploymentTarget 6.3, *) public // Core primitive struct _WordRecognizer: Sendable { // FIXME: We also need proper public API for this /// The last scalar that was fed to `hasBreak(before:)`. var _prevScalar: Unicode.Scalar /// The cached word break property of `_prevScalar`. var _prevCategory: _WordBreakProperty /// The word break property of the last preceding scalar that wasn't ignored by rule WB4. var _baseCategory: _WordBreakProperty /// The current state of the recognizer. var _state: _State /// Initialize a new word recognizer at the _start of text_ (sot) /// position. /// /// The resulting state machine will report a word break before the first /// scalar that is fed to it. public init() { // To avoid having to handle the empty case specially, we use LF as the // placeholder before the first scalar. Per WB3a, we always produce a break // following a line feed. _baseCategory = .newlineCRLF _prevScalar = Unicode.Scalar(0x0A as UInt8) _prevCategory = .newlineCRLF _state = .ordinary } /// Initialize a new word recognizer with a state after a previously /// recognized word boundary. /// /// This enables clients to iterate over word boundaries without maintaining /// a persistent recognizer state. However, iterating this way may result in /// an arbitrarily large amount of duplicate work. (This is because the word /// segmentation algorithm requires looking ahead by as much as a full /// word's worth of Unicode scalars, and when the old recognizer is /// discarded, the information thus collected has to be recreated from /// scratch.) Whenever possible, it is therefore preferable to iterate over /// multiple word boundaries using a single recognizer instance. /// /// - Parameter scalar: The Unicode scalar immediately following a known /// word boundary position. public init(after scalar: Unicode.Scalar) { // We assume that the state machine provides stable results even if the // start position was a retroactive candidate. _prevScalar = scalar _prevCategory = Unicode._WordBreakProperty(from: scalar) _baseCategory = _prevCategory _state = .ordinary } } } @available(StdlibDeploymentTarget 6.3, *) extension Unicode._WordRecognizer { /// The parts of the word recognizer state that are in addition to saved /// information about previous scalars. /// /// This is used to implement stateful lookahead when a break at a particular /// candidate position may need to be suppressed or activated based on a /// subsequent scalar value. It is also used to keep track of the number of /// contiguous regional indicator scalars we have seen so far. internal enum _State: Int, Sendable { case ordinary case afterWB6 // AHLetter × (MidLetter | MidNumLetQ) AHLetter case afterWB7b // Hebrew_Letter × Double_Quote Hebrew_Letter case afterWB12 // Numeric × (MidNum | MidNumLetQ) Numeric case afterMidFlag // [^RI] (RI RI)* RI × RI var hasPendingCandidate: Bool { switch self { case .afterWB6, .afterWB7b, .afterWB12: return true default: return false } } } /// Reject a break at the current position, triggering a break at the current /// candidate (if any). internal mutating func _reject( ) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) { let breakAtCandidate = _state.hasPendingCandidate _state = .ordinary return ( setCandidate: false, breakAtCandidate: breakAtCandidate, breakHere: false) } /// Skip this position, ignoring the current Unicode scalar. internal mutating func _ignore( ) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) { // Note: not updating _baseCategory return (setCandidate: false, breakAtCandidate: false, breakHere: false) } /// Signal a break at the current position, also triggering a break at the /// current candidate (if any). internal mutating func _accept( ) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) { // If we have a pending candidate, put a break at it let breakAtCandidate = _state.hasPendingCandidate _state = .ordinary return ( setCandidate: false, breakAtCandidate: breakAtCandidate, breakHere: true) } /// Set the current position as the active break candidate, and transition into the /// specified state. internal mutating func _transition( into state: _State ) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) { _internalInvariant(_state == .ordinary) _state = state return (setCandidate: true, breakAtCandidate: false, breakHere: false) } /// If the current state matches the given expectation, suppress a break at /// this position and discard the active candidate; otherwise, report a break /// at both positions. internal mutating func _expect( _ expectedState: _State ) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) { let breakHere = (_state != expectedState) let breakAtCandidate = breakHere && _state.hasPendingCandidate _state = .ordinary return ( setCandidate: false, breakAtCandidate: breakAtCandidate, breakHere: breakHere) } /// Feeds the next scalar to the state machine, reporting if there is a word /// boundary at the current position or a previously reported candidate. /// /// To decide whether there is a word break at the current position, the /// segmentation algorithm sometimes needs to look ahead by visiting /// additional scalars following the break, up to the next word boundary. To /// allow this, the state machine can report that the current position is a /// provisional break "candidate". Clients are expected to remember the /// position of the last reported candidate, so that it can be retroactively /// promoted to a full break as needed. /// /// - Parameter nextScalar: The scalar at the current position in the text. /// - Returns: A triple of Boolean values `setCandidate`, `breakAtCandidate`, /// `breakHere`. If `setCandidate` is true, then the caller is expected to /// save the current text position as a potential word boundary. If /// `breakAtCandidate` is true, then there is a word boundary at the last /// candidate position. If `breakHere` is true, then there is a word /// boundary at the current position. The caller is expected to process /// these three components in this specific order. public mutating func hasBreak( before nextScalar: Unicode.Scalar ) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) { let nextCategory = Unicode._WordBreakProperty(from: nextScalar) var nextBase = nextCategory // FIXME: Implement a proper state machine here, dispatching on // (state, nextProperty), ideally through a static look-up table defer { _prevScalar = nextScalar _prevCategory = nextCategory _baseCategory = nextBase } switch (_prevCategory, nextCategory) { case (.any, .any): // WB999 // Fast path: If we know our scalars have no properties then the decision // is trivial and we don't need to crawl to the default statement. return _accept() case (.newlineCRLF, _), // WB3a (_, .newlineCRLF): // WB3b if _prevScalar.value == 0xD, nextScalar.value == 0xA { // WB3 _internalInvariant(_prevCategory == .newlineCRLF) return _reject() } return _accept() case (.zwj, .extendedPictographic), // WB3c (.wSegSpace, .wSegSpace): // WB3d return _reject() case (_, .format), // WB4 (_, .extend), (_, .zwj): nextBase = _baseCategory // Cancel _baseCategory update return _ignore() default: break } switch (_baseCategory, nextCategory) { case (.aLetter, .aLetter), // WB5 (.aLetter, .hebrewLetter), (.hebrewLetter, .aLetter), (.hebrewLetter, .hebrewLetter): return _reject() case (.aLetter, .midLetter), // WB6 (.hebrewLetter, .midLetter), (.aLetter, .midNumLet), (.hebrewLetter, .midNumLet), (.aLetter, .singleQuote): return _transition(into: .afterWB6) case (.midLetter, .aLetter), // WB7 (.midLetter, .hebrewLetter), (.midNumLet, .aLetter), (.midNumLet, .hebrewLetter), (.singleQuote, .aLetter), (.singleQuote, .hebrewLetter): return _expect(.afterWB6) case (.hebrewLetter, .singleQuote): // WB7a return _reject() case (.hebrewLetter, .doubleQuote): // WB7b return _transition(into: .afterWB7b) case (.doubleQuote, .hebrewLetter): // WB7c return _expect(.afterWB7b) case (.numeric, .numeric), // WB8 (.aLetter, .numeric), // WB9 (.hebrewLetter, .numeric), // WB9 (.numeric, .aLetter), // WB10 (.numeric, .hebrewLetter): // WB10 return _reject() case (.midNum, .numeric), // WB11 (.midNumLet, .numeric), (.singleQuote, .numeric): return _expect(.afterWB12) case (.numeric, .midNum), // WB12 (.numeric, .midNumLet), (.numeric, .singleQuote): return _transition(into: .afterWB12) case (.katakana, .katakana), // WB13 (.aLetter, .extendNumLet), // WB13a (.hebrewLetter, .extendNumLet), (.numeric, .extendNumLet), (.katakana, .extendNumLet), (.extendNumLet, .extendNumLet), (.extendNumLet, .aLetter), // WB13b (.extendNumLet, .hebrewLetter), (.extendNumLet, .numeric), (.extendNumLet, .katakana): return _reject() case (.regionalIndicator, .regionalIndicator): // WB15/WB16 let breakHere: Bool if _state == .afterMidFlag { _state = .ordinary breakHere = true } else { _state = .afterMidFlag breakHere = false } return (setCandidate: false, breakAtCandidate: false, breakHere: breakHere) default: // WB999 return _accept() } } /// Returns true if the previously reported word boundary candidate needs to /// be promoted to a full break if there are no more scalars in the input text. /// /// There is always an (implicit) word boundary at position at the end of /// text, following the last scalar; however, the end may also trigger a /// pending unreported break at the last candidate previously set by /// `hasBreak`. This method returns true in that case, allowing clients to /// reliably detect such boundaries. public func hasCandidateBreakAtEnd() -> Bool { _state.hasPendingCandidate } } extension Unicode { /// A state machine for recognizing safe word boundaries in a backward /// sequence of Unicode scalars, based on the specification in [Unicode Annex /// #29](https://unicode.org/reports/tr29/#Word_Boundary_Rules). /// /// The text segmentation algorithm is not stable, and it allows implementers /// to tailor it to their needs. Accordingly, reported word boundaries may /// vary in arbitrary ways between Unicode implementations and system /// configurations, including between versions of the Swift Standard Library. /// /// This is intended to help implement searching for word boundaries near an /// arbitrary position in a middle of a larger text, as described in [section /// 6.4 of Annex #29](https://unicode.org/reports/tr29/#Random_Access). The /// start position may be at at an arbitrary scalar anywhere in the input text /// -- there is no expectation that the first scalar addresses a known word /// boundary. The state machine scans backwards from that position until it /// detects a reliable word boundary. /// /// To detect a word break near a particular position in a series of Unicode /// scalars, start iterating scalars backward from the start position, feeding /// each of them to the `hasGuaranteedBreak(after:)` method. The method /// indicates if there is a word break preceding the given scalar, or at a /// previously reported candidate position. There is always a word break at /// the start of the text; so if we run out of scalars, the start position is /// going to be a suitable safe word boundary. /// /// Note that this construct may skip over an arbitrary number of word /// boundaries while it is searching for a safe break position. Once a safe /// boundary is found, callers are usually expected to use it as the start /// position to iterate forward using the standard segmentation algorithm /// (implemented by `Unicode._WordRecognizer`), for example until they find /// the break nearest to their original start position. In such cases, it is /// usually a good idea to incrementally memoize word boundaries as they are /// detected, to avoid repeating this process on the same positions later. @available(StdlibDeploymentTarget 6.3, *) public // Core primitive struct _RandomAccessWordRecognizer: Sendable { // FIXME: We also need proper public API for this /// The last scalar that was fed to `hasGuaranteedBreak`; i.e., the scalar /// immediately following the current one in the text. var _nextScalar: Unicode.Scalar /// The cached word break property of `_nextScalar`. var _nextCategory: _WordBreakProperty /// The word break property of the most recently seen scalar that wasn't /// ignored by rule WB4. var _baseCategory: _WordBreakProperty /// Additional recognizer state. var _state: _State var _hasPendingCandidate: Bool /// Initialize a new word recognizer at an arbitrary text position preceding /// the given scalar. public init(before scalar: Unicode.Scalar) { _nextScalar = scalar _nextCategory = Unicode._WordBreakProperty(from: scalar) _baseCategory = _nextCategory _state = .initial _hasPendingCandidate = false } } } @available(StdlibDeploymentTarget 6.3, *) extension Unicode._RandomAccessWordRecognizer { internal enum _State: Int, Sendable { case initial case ordinary case beforeWB7 case beforeWB7c case beforeWB11 var hasPendingCandidate: Bool { switch self { case .beforeWB7, .beforeWB7c, .beforeWB11: return true default: return false } } } internal mutating func _reject( ) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) { _hasPendingCandidate = false return (setCandidate: false, breakAtCandidate: false, breakHere: false) } internal mutating func _ignore( ) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) { return (setCandidate: false, breakAtCandidate: false, breakHere: false) } internal mutating func _accept( ) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) { if _hasPendingCandidate { _hasPendingCandidate = false return (setCandidate: false, breakAtCandidate: true, breakHere: false) } return (setCandidate: false, breakAtCandidate: false, breakHere: true) } internal mutating func _placeCandidate( ) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) { if _state == .initial { return _reject() } _hasPendingCandidate = true return (setCandidate: true, breakAtCandidate: false, breakHere: false) } internal mutating func _placeSoftCandidate( ) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) { if _hasPendingCandidate || _state == .initial { return (false, false, false) } return _placeCandidate() } public mutating func hasGuaranteedBreak( after previousScalar: Unicode.Scalar ) -> (setCandidate: Bool, breakAtCandidate: Bool, breakHere: Bool) { let prevCategory = Unicode._WordBreakProperty(from: previousScalar) var newState: _State = .ordinary var newBase = prevCategory defer { _nextCategory = prevCategory _nextScalar = previousScalar _baseCategory = newBase _state = newState } switch (prevCategory, _nextCategory) { case (.any, .any): // WB999 shortcut return _accept() case (.newlineCRLF, _), // WB3a (_, .newlineCRLF): // WB3b if previousScalar.value == 0xD, _nextScalar.value == 0xA { // WB3 return _reject() } return _accept() case (.zwj, .extendedPictographic), // WB3c (.wSegSpace, .wSegSpace): // WB3d newBase = _baseCategory newState = _state return _reject() case (.format, _), // WB4 (.extend, _), (.zwj, _): newBase = _baseCategory newState = _state if _state == .initial || _nextCategory == .format || _nextCategory == .extend || _nextCategory == .zwj { return _ignore() } return _placeSoftCandidate() default: break } switch (prevCategory, _baseCategory) { case (.aLetter, .aLetter), // WB5 (.aLetter, .hebrewLetter), (.hebrewLetter, .aLetter), (.hebrewLetter, .hebrewLetter): return _reject() case (.midLetter, .aLetter), // WB7 (.midLetter, .hebrewLetter), (.midNumLet, .aLetter), (.midNumLet, .hebrewLetter), (.singleQuote, .aLetter), (.singleQuote, .hebrewLetter): newState = .beforeWB7 return _placeSoftCandidate() case (.aLetter, .midLetter), // WB6 (.hebrewLetter, .midLetter), (.aLetter, .midNumLet), (.hebrewLetter, .midNumLet), (.aLetter, .singleQuote): if _state == .beforeWB7 || _state == .initial { return _reject() } return _accept() case (.hebrewLetter, .singleQuote): // WB7a return _reject() case (.doubleQuote, .hebrewLetter): // WB7c newState = .beforeWB7c return _placeSoftCandidate() case (.hebrewLetter, .doubleQuote): // WB7b if _state == .beforeWB7c || _state == .initial { return _reject() } return _accept() case (.numeric, .numeric), // WB8 (.aLetter, .numeric), // WB9 (.hebrewLetter, .numeric), (.numeric, .aLetter), // WB10 (.numeric, .hebrewLetter): return _reject() case (.midNum, .numeric), // WB11 (.midNumLet, .numeric), (.singleQuote, .numeric): newState = .beforeWB11 return _placeSoftCandidate() case (.numeric, .midNum), // WB12 (.numeric, .midNumLet), (.numeric, .singleQuote): if _state == .beforeWB11 || _state == .initial { return _reject() } return _accept() case (.katakana, .katakana), // WB13 (.aLetter, .extendNumLet), // WB13a (.hebrewLetter, .extendNumLet), (.numeric, .extendNumLet), (.katakana, .extendNumLet), (.extendNumLet, .extendNumLet), (.extendNumLet, .aLetter), // WB13b (.extendNumLet, .hebrewLetter), (.extendNumLet, .numeric), (.extendNumLet, .katakana): return _reject() case (.regionalIndicator, .regionalIndicator): // WB15/WB16 return _reject() case (_, .format), (_, .extend), (_, .zwj): _internalInvariant(!_hasPendingCandidate) newState = .initial return _reject() default: if !_hasPendingCandidate, _nextCategory == .format || _nextCategory == .extend || _nextCategory == .zwj { return _ignore() } return _accept() } } } extension String { /// Find and return a word boundary position at or before an arbitrary index /// within this string. The result may not be the closest word break to the /// start position. /// /// This implements the core algorithm for finding a "safe" starting point for /// random access to word breaks, following [section 6.4 of Unicode Annex /// #29](https://unicode.org/reports/tr29/#Random_Access). Unicode defines /// word boundaries using a forward-only state machine; this algorithm does /// its best to run the state machine backwards until it finds a guaranteed /// break position. /// /// This process is inherently an approximation: the algorithm may need to /// skip over an arbitrary number of actual word boundaries before finding one /// that it can judge with confidence. This makes it relatively expensive to /// iterate over word boundaries backwards; in the worst case, a naive /// implementation may have quadratic complexity. The recommended way to /// mitigate this is to maintain a cache of word breaks already traversed, /// only calling this method to extend the range of known breaks backwards as /// needed. /// /// - Parameter i: An arbitrary index within the string, not necessarily /// addressing a word boundary. /// - Returns: A valid index less than equal to the input that is guaranteed to /// identify some word boundary at or before `i`. @available(SwiftStdlib 6.3, *) public func _wordIndex(somewhereAtOrBefore i: Index) -> Index { var j = _guts.validateInclusiveScalarIndex(i) if j == endIndex { return j } var recognizer = Unicode._RandomAccessWordRecognizer( before: self.unicodeScalars[j]) var candidate = j while j > self.startIndex { let p = self.unicodeScalars.index(before: j) let b = recognizer.hasGuaranteedBreak(after: self.unicodeScalars[p]) if b.setCandidate { candidate = j } if b.breakAtCandidate { return candidate } if b.breakHere { return j } j = p } return j } /// Return the word boundary position following a known word boundary within /// this string. /// /// This implements the word boundary specification of [Unicode Annex /// #29](https://unicode.org/reports/tr29/#Default_Word_Boundaries). The /// algorithm is not stable, and it allows implementers to tailor it to their /// needs; accordingly, the result of this operation may vary between Unicode /// implementations and system configurations, including versions of the Swift /// Standard Library. /// /// Note: The input index must be on a known word boundary, otherwise the /// result of this operation is unspecified. The start and end indices are /// always known word boundaries, in every string. /// /// - Parameter i: A valid index addressing a word boundary within this /// string. /// - Returns: The first word break strictly following `i` in the string. @available(StdlibDeploymentTarget 5.7, *) public func _wordIndex(after i: String.Index) -> String.Index { guard #available(StdlibDeploymentTarget 6.3, *) else { fatalError("Unreachable") } let i = _guts.validateScalarIndex(i) if _slowPath(_guts.isForeign) { return _guts._nextForeignWordIndex(after: i) } return _guts._nextUTF8WordIndex(after: i) } } extension _StringGuts { @available(StdlibDeploymentTarget 6.3, *) @inline(never) @_effects(releasenone) internal func _nextUTF8WordIndex(after index: Index) -> Index { _internalInvariant(self.isFastUTF8) let result = unsafe self.withFastUTF8 { utf8 in var offset = index._encodedOffset let first = unsafe _decodeScalar(utf8, startingAt: offset) offset &+= first.scalarLength var recognizer = Unicode._WordRecognizer(after: first.0) var candidate = offset while offset < utf8.count { let (scalar, len) = unsafe _decodeScalar(utf8, startingAt: offset) let r = recognizer.hasBreak(before: scalar) if r.setCandidate { candidate = offset } if r.breakAtCandidate { return candidate } if r.breakHere { return offset } offset &+= len } if recognizer.hasCandidateBreakAtEnd() { return candidate } return offset } // Note: We only signal that the result is scalar aligned, not // character-aligned. Unicode does attempt to ensure that word breaks are // always character-aligned, but this is not a strict guarantee, especially // not if either segmentation algorithm has a tailored implementation. (As // of 6.3, we do not tailor our implementations, but we used to do so and we // may choose to do it again in the future.) return Index(_encodedOffset: result)._scalarAligned._knownUTF8 } @available(StdlibDeploymentTarget 6.3, *) @inline(never) internal func _nextForeignWordIndex(after index: Index) -> Index { #if _runtime(_ObjC) _internalInvariant(self.isForeign) let scalars = String.UnicodeScalarView(self) var recognizer = Unicode._WordRecognizer(after: scalars[index]) var i = scalars.index(after: index) var candidate = i while i < scalars.endIndex { let r = recognizer.hasBreak(before: scalars[i]) if r.setCandidate { candidate = i } if r.breakAtCandidate { return candidate } if r.breakHere { return i } scalars.formIndex(after: &i) } if recognizer.hasCandidateBreakAtEnd() { return candidate } return i #else fatalError("Foreign strings are unsupported on this platform") #endif } }