//===----------------------------------------------------------------------===// // // This source file is part of the Swift.org open source project // // Copyright (c) 2014 - 2023 Apple Inc. and the Swift project authors // Licensed under Apache License v2.0 with Runtime Library Exception // // See https://swift.org/LICENSE.txt for license information // See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors // //===----------------------------------------------------------------------===// import SwiftShims /// CR and LF are common special cases in grapheme breaking logic private var _CR: UInt8 { return 0x0D } private var _LF: UInt8 { return 0x0A } /// Perform a quick-check to determine if there's a grapheme-break between two /// scalars, without consulting the data tables. Returns true if there /// definitely is a break, false if there definitely is none, and nil if a /// break couldn't be determined internal func _quickHasGraphemeBreakBetween( _ lhs: Unicode.Scalar, _ rhs: Unicode.Scalar ) -> Bool? { // GB3: // CR-LF is a special case: no break between these if lhs == Unicode.Scalar(_CR) && rhs == Unicode.Scalar(_LF) { return false } // Whether the given scalar, when it appears paired with another scalar // satisfying this property, has a grapheme break between it and the other // scalar. func hasBreakWhenPaired(_ x: Unicode.Scalar) -> Bool { // TODO: This doesn't generate optimal code, tune/re-write at a lower // level. // // NOTE: Order of case ranges affects codegen, and thus performance. All // things being equal, keep existing order below. switch x.value { // Unified CJK Han ideographs, common and some supplemental, amongst // others: // U+3400 ~ U+A4CF case 0x3400...0xa4cf: return true // Repeat sub-300 check, this is beneficial for common cases of Latin // characters embedded within non-Latin script (e.g. newlines, spaces, // proper nouns and/or jargon, punctuation). // // NOTE: CR-LF special case has already been checked. case 0x0000...0x02ff: return true // Non-combining kana: // U+3041 ~ U+3096 // U+30A1 ~ U+30FC case 0x3041...0x3096: return true case 0x30a1...0x30fc: return true // Non-combining modern (and some archaic) Cyrillic: // U+0400 ~ U+0482 (first half of Cyrillic block) case 0x0400...0x0482: return true // Modern Arabic, excluding extenders and prependers: // U+061D ~ U+064A case 0x061d...0x064a: return true // Precomposed Hangul syllables: // U+AC00 ~ U+D7AF case 0xac00...0xd7af: return true // Common general use punctuation, excluding extenders: // U+2010 ~ U+2029 case 0x2010...0x2029: return true // CJK punctuation characters, excluding extenders: // U+3000 ~ U+3029 case 0x3000...0x3029: return true // Full-width forms: // U+FF01 ~ U+FF9D case 0xFF01...0xFF9D: return true default: return false } } if hasBreakWhenPaired(lhs) && hasBreakWhenPaired(rhs) { return true } return nil } extension _StringGuts { @inline(__always) internal func roundDownToNearestCharacter( _ i: String.Index ) -> String.Index { _internalInvariant(i._isScalarAligned) _internalInvariant(hasMatchingEncoding(i)) _internalInvariant(i._encodedOffset <= count) let offset = i._encodedOffset if _fastPath(i._isCharacterAligned) { return i } if offset == 0 || offset == count { return i._characterAligned } return _slowRoundDownToNearestCharacter(i) } @inline(never) internal func _slowRoundDownToNearestCharacter( _ i: String.Index ) -> String.Index { let offset = i._encodedOffset let start = offset - _opaqueCharacterStride(endingAt: offset) let stride = _opaqueCharacterStride(startingAt: start) _internalInvariant(offset <= start + stride, "Grapheme breaking inconsistency") if offset >= start + stride { // Already aligned, or grapheme breaking returned an unexpected result. return i._characterAligned } let r = String.Index(encodedOffset: start, characterStride: stride) return markEncoding(r._characterAligned) } @inline(__always) internal func roundDownToNearestCharacter( _ i: String.Index, in bounds: Range ) -> String.Index { _internalInvariant( bounds.lowerBound._isScalarAligned && bounds.upperBound._isScalarAligned) _internalInvariant( hasMatchingEncoding(bounds.lowerBound) && hasMatchingEncoding(bounds.upperBound)) _internalInvariant(bounds.upperBound <= endIndex) _internalInvariant(i._isScalarAligned) _internalInvariant(hasMatchingEncoding(i)) _internalInvariant(i >= bounds.lowerBound && i <= bounds.upperBound) // We can only use the `_isCharacterAligned` bit if the start index is also // character-aligned. if _fastPath( bounds.lowerBound._isCharacterAligned && i._isCharacterAligned ) { return i } if i == bounds.lowerBound || i == bounds.upperBound { return i } return _slowRoundDownToNearestCharacter(i, in: bounds) } @inline(never) internal func _slowRoundDownToNearestCharacter( _ i: String.Index, in bounds: Range ) -> String.Index { let offset = i._encodedOffset let offsetBounds = bounds._encodedOffsetRange let prior = offset - _opaqueCharacterStride(endingAt: offset, in: offsetBounds) let stride = _opaqueCharacterStride(startingAt: prior) _internalInvariant(offset <= prior + stride, "Grapheme breaking inconsistency") if offset >= prior + stride { // Already aligned, or grapheme breaking returned an unexpected result. return i } var r = String.Index(encodedOffset: prior, characterStride: stride) if bounds.lowerBound._isCharacterAligned { r = r._characterAligned } else { r = r._scalarAligned } return markEncoding(r) } } extension _StringGuts { @usableFromInline @inline(never) @_effects(releasenone) internal func isOnGraphemeClusterBoundary(_ i: String.Index) -> Bool { if i._isCharacterAligned { return true } guard i.transcodedOffset == 0 else { return false } let offset = i._encodedOffset if offset == 0 || offset == self.count { return true } guard isOnUnicodeScalarBoundary(i) else { return false } let nearest = roundDownToNearestCharacter(i._scalarAligned) return i == nearest } } extension _StringGuts { /// Return the length of the extended grapheme cluster starting at offset `i`, /// assuming it falls on a grapheme cluster boundary. /// /// Note: This does not look behind at data preceding `i`, so if `i` is not on /// a grapheme cluster boundary, then it may return results that are /// inconsistent with `_opaqueCharacterStride(endingAt:)`. On the other hand, /// this behavior makes this suitable for use in substrings whose start index /// itself does not fall on a cluster boundary. @usableFromInline @inline(__always) @_effects(releasenone) internal func _opaqueCharacterStride(startingAt i: Int) -> Int { _internalInvariant(i < endIndex._encodedOffset) if isFastUTF8 { let fast = unsafe withFastUTF8 { utf8 in if i &+ 1 == utf8.count { return true } let pair = unsafe UnsafeRawPointer( utf8.baseAddress.unsafelyUnwrapped ).loadUnaligned(fromByteOffset: i, as: UInt16.self) //& 0x8080 == 0 is "both not ASCII", != 0x0A0D is "not CRLF" return pair & 0x8080 == 0 && pair != 0x0A0D } if _fastPath(fast) { _internalInvariant(_opaqueComplexCharacterStride(startingAt: i) == 1) return 1 } } return _opaqueComplexCharacterStride(startingAt: i) } @_effects(releasenone) @inline(never) internal func _opaqueComplexCharacterStride(startingAt i: Int) -> Int { if _slowPath(isForeign) { return _foreignOpaqueCharacterStride(startingAt: i) } let nextIdx = unsafe withFastUTF8 { utf8 in nextBoundary(startingAt: i) { j in _internalInvariant(j >= 0) guard j < utf8.count else { return nil } let (scalar, len) = unsafe _decodeScalar(utf8, startingAt: j) return (scalar, j &+ len) } } return nextIdx &- i } /// Return the length of the extended grapheme cluster ending at offset `i`, /// or if `i` happens to be in the middle of a grapheme cluster, find and /// return the distance to its start. /// /// Note: unlike `_opaqueCharacterStride(startingAt:)`, this method always /// finds a correct grapheme cluster boundary. @usableFromInline @inline(__always) @_effects(releasenone) internal func _opaqueCharacterStride(endingAt i: Int) -> Int { if i <= 1 { return i } if isFastUTF8 { let fast = unsafe withFastUTF8 { utf8 in let pair = unsafe UnsafeRawPointer( utf8.baseAddress.unsafelyUnwrapped ).loadUnaligned(fromByteOffset: i &- 2, as: UInt16.self) //& 0x8080 == 0 is "both not ASCII", != 0x0A0D is "not CRLF" return pair & 0x8080 == 0 && pair != 0x0A0D } if _fastPath(fast) { _internalInvariant(_opaqueComplexCharacterStride(endingAt: i) == 1) return 1 } } return _opaqueComplexCharacterStride(endingAt: i) } @_effects(releasenone) @inline(never) internal func _opaqueComplexCharacterStride(endingAt i: Int) -> Int { if _slowPath(isForeign) { return _foreignOpaqueCharacterStride(endingAt: i) } let previousIdx = unsafe withFastUTF8 { utf8 in previousBoundary(endingAt: i) { j in _internalInvariant(j <= utf8.count) guard j > 0 else { return nil } let (scalar, len) = unsafe _decodeScalar(utf8, endingAt: j) return (scalar, j &- len) } } return i &- previousIdx } /// Return the length of the extended grapheme cluster ending at offset `i` in /// bounds, or if `i` happens to be in the middle of a grapheme cluster, find /// and return the distance to its start. /// /// Note: unlike `_opaqueCharacterStride(startingAt:)`, this method always /// finds a correct grapheme cluster boundary within the substring defined by /// the specified bounds. @_effects(releasenone) internal func _opaqueCharacterStride( endingAt i: Int, in bounds: Range ) -> Int { _internalInvariant(i > bounds.lowerBound && i <= bounds.upperBound) if _slowPath(isForeign) { return _foreignOpaqueCharacterStride(endingAt: i, in: bounds) } let previousIdx = unsafe withFastUTF8 { utf8 in previousBoundary(endingAt: i) { j in _internalInvariant(j <= bounds.upperBound) guard j > bounds.lowerBound else { return nil } let (scalar, len) = unsafe _decodeScalar(utf8, endingAt: j) return (scalar, j &- len) } } _internalInvariant(bounds.contains(previousIdx)) return i &- previousIdx } @inline(never) @_effects(releasenone) private func _foreignOpaqueCharacterStride(startingAt i: Int) -> Int { #if _runtime(_ObjC) _internalInvariant(isForeign) let nextIdx = nextBoundary(startingAt: i) { j in _internalInvariant(j >= 0) guard j < count else { return nil } let scalars = String.UnicodeScalarView(self) let idx = String.Index(_encodedOffset: j) let scalar = scalars[idx] let nextIdx = scalars.index(after: idx) return (scalar, nextIdx._encodedOffset) } return nextIdx &- i #else fatalError("No foreign strings on Linux in this version of Swift") #endif } @inline(never) @_effects(releasenone) private func _foreignOpaqueCharacterStride( startingAt i: Int, in bounds: Range ) -> Int { #if _runtime(_ObjC) _internalInvariant(isForeign) _internalInvariant(bounds.contains(i)) let nextIdx = nextBoundary(startingAt: i) { j in _internalInvariant(j >= bounds.lowerBound) guard j < bounds.upperBound else { return nil } let scalars = String.UnicodeScalarView(self) let idx = String.Index(_encodedOffset: j) let scalar = scalars[idx] let nextIdx = scalars.index(after: idx) return (scalar, nextIdx._encodedOffset) } return nextIdx &- i #else fatalError("No foreign strings on Linux in this version of Swift") #endif } @inline(never) @_effects(releasenone) private func _foreignOpaqueCharacterStride(endingAt i: Int) -> Int { #if _runtime(_ObjC) _internalInvariant(isForeign) let previousIdx = previousBoundary(endingAt: i) { j in _internalInvariant(j <= self.count) guard j > 0 else { return nil } let scalars = String.UnicodeScalarView(self) let idx = String.Index(_encodedOffset: j) let previousIdx = scalars.index(before: idx) let scalar = scalars[previousIdx] return (scalar, previousIdx._encodedOffset) } return i &- previousIdx #else fatalError("No foreign strings on Linux in this version of Swift") #endif } @inline(never) @_effects(releasenone) private func _foreignOpaqueCharacterStride( endingAt i: Int, in bounds: Range ) -> Int { #if _runtime(_ObjC) _internalInvariant(isForeign) _internalInvariant(i > bounds.lowerBound && i <= bounds.upperBound) let previousIdx = previousBoundary(endingAt: i) { j in _internalInvariant(j <= bounds.upperBound) guard j > bounds.lowerBound else { return nil } let scalars = String.UnicodeScalarView(self) let idx = String.Index(_encodedOffset: j) let previousIdx = scalars.index(before: idx) let scalar = scalars[previousIdx] return (scalar, previousIdx._encodedOffset) } return i &- previousIdx #else fatalError("No foreign strings on Linux in this version of Swift") #endif } } extension Unicode.Scalar { fileprivate var _isInCBConsonant: Bool { _swift_stdlib_isInCB_Consonant(value) } fileprivate var _isInCBExtend: Bool { // Assuming that we're already an Extend or ZWJ... !(_isInCBConsonant || _isInCBLinker || value == 0x200C) } fileprivate var _isInCBLinker: Bool { switch value { // Devanagari case 0x94D: return true // Bengali case 0x9CD: return true // Gujarati case 0xACD: return true // Oriya case 0xB4D: return true // Telugu case 0xC4D: return true // Malayalam case 0xD4D: return true default: return false } } } internal struct _GraphemeBreakingState: Sendable, Equatable { // When we're looking through an indic sequence, one of the requirements is // that there is at LEAST 1 InCB=Linker present between two InCB=Consonant. // This value helps ensure that when we ultimately need to decide whether or // not to break that we've at least seen 1 when walking. var hasSeenInCBLinker = false // When walking forwards in a string, we need to know whether or not we've // entered an emoji sequence to be able to eventually break after all of the // emoji's various extenders and zero width joiners. This bit allows us to // keep track of whether or not we're still in an emoji sequence when deciding // to break. var isInEmojiSequence = false // Similar to emoji sequences, we need to know not to break an Indic grapheme // sequence. This sequence is (potentially) composed of many scalars and isn't // as trivial as comparing two grapheme properties. var isInIndicSequence = false // When walking forward in a string, we need to not break on emoji flag // sequences. Emoji flag sequences are composed of 2 regional indicators, so // when we see our first (.regionalIndicator, .regionalIndicator) decision, // we need to know to return false in this case. However, if the next scalar // is another regional indicator, we reach the same decision rule, but in this // case we actually need to break there's a boundary between emoji flag // sequences. var shouldBreakRI = false } extension _GraphemeBreakingState: CustomStringConvertible { var description: String { var r = "[" if hasSeenInCBLinker { r += "L" } if isInEmojiSequence { r += "E" } if isInIndicSequence { r += "I" } if shouldBreakRI { r += "R" } r += "]" return r } } extension Unicode { /// A state machine for recognizing character (i.e., extended grapheme /// cluster) boundaries in an arbitrary series of Unicode scalars. /// /// To detect grapheme breaks in a sequence of Unicode scalars, feed each of /// them to the `hasBreak(before:)` method. The method returns true if the /// sequence has a grapheme break preceding the given value. /// /// The results produced by this state machine are guaranteed to match the way /// `String` splits its contents into `Character` values. @available(SwiftStdlib 5.8, *) public // SPI(Foundation) FIXME: We need API for this struct _CharacterRecognizer: Sendable { internal var _previous: Unicode.Scalar internal var _state: _GraphemeBreakingState /// Refactoring TODO: should we use a quick check result? /// /// Returns a non-nil value if it can be determined whether there is a /// grapheme break between `scalar1` and `scalar2` without knowing anything /// about the scalars that precede `scalar1`. This can optionally be used as /// a fast (but incomplete) test before spinning up a full state machine /// session. @_effects(releasenone) public static func quickBreak( between scalar1: Unicode.Scalar, and scalar2: Unicode.Scalar ) -> Bool? { _quickHasGraphemeBreakBetween(scalar1, scalar2) } /// Initialize a new character recognizer at the _start of text_ (sot) /// position. /// /// The resulting state machine will report a grapheme break on the /// first scalar that is fed to it. public init() { _state = _GraphemeBreakingState() // To avoid having to handle the empty case specially, we use NUL as the // placeholder before the first scalar. NUL is a control character, so per // rule GB5, it will induce an unconditional grapheme break before the // first actual scalar, emulating GB1. _previous = Unicode.Scalar(0 as UInt8) } /// Feeds the next scalar to the state machine, returning a Boolean value /// indicating whether it starts a new extended grapheme cluster. /// /// This method will always report a break the first time it is called /// on a newly initialized recognizer. /// /// The state machine does not carry information across character /// boundaries. I.e., if this method returns true, then `self` after the /// call is equivalent to feeding the same scalar to a newly initialized /// recognizer instance. @_effects(releasenone) public mutating func hasBreak( before next: Unicode.Scalar ) -> Bool { let r = _state.shouldBreak(between: _previous, and: next) if r { _state = _GraphemeBreakingState() } _previous = next return r } /// Decode the scalars in the given UTF-8 buffer and feed them to the /// recognizer up to and including the scalar following the first grapheme /// break. If the buffer contains a grapheme break, then this function /// returns the index range of the scalar that follows the first one; /// otherwise it returns `nil`. /// /// On return, the state of the recognizer is updated to reflect the scalars /// up to and including the returned one. You can detect additional grapheme /// breaks by feeding the recognizer subsequent data. /// /// - Parameter buffer: A buffer containing valid UTF-8 data, starting and /// ending on Unicode scalar boundaries. /// /// - Parameter start: A valid index into `buffer`, addressing the first /// code unit of a UTF-8 scalar in the buffer, or the end. /// /// - Returns: The index range of the scalar that follows the first grapheme /// break in the buffer, if there is one. If the buffer contains no /// grapheme breaks, then this function returns `nil`. /// /// - Warning: This function does not validate that the buffer contains /// valid UTF-8 data; its behavior is undefined if given invalid input. @_effects(releasenone) public mutating func _firstBreak( inUncheckedUnsafeUTF8Buffer buffer: UnsafeBufferPointer, startingAt start: Int = 0 ) -> Range? { var i = start while i < buffer.endIndex { let (next, n) = unsafe _decodeScalar(buffer, startingAt: i) if hasBreak(before: next) { return unsafe Range(_uncheckedBounds: (i, i &+ n)) } i &+= n } return nil } } } @available(SwiftStdlib 5.9, *) extension Unicode._CharacterRecognizer: Equatable { public static func ==(left: Self, right: Self) -> Bool { left._previous == right._previous && left._state == right._state } } @available(SwiftStdlib 5.9, *) extension Unicode._CharacterRecognizer: CustomStringConvertible { public var description: String { return "\(_state)U+\(String(_previous.value, radix: 16, uppercase: true))" } } extension _StringGuts { // Returns the stride of the grapheme cluster starting at offset `index`, // assuming it is on a grapheme cluster boundary. // // This method never looks at data below `index`. If `index` isn't on a // grapheme cluster boundary, then the result may not be consistent with the // actual breaks in the string. `Substring` relies on this to generate the // right breaks if its start index isn't aligned on one -- in this case, the // substring's breaks may not match the ones in its base string. internal func nextBoundary( startingAt index: Int, nextScalar: (Int) -> (scalar: Unicode.Scalar, end: Int)? ) -> Int { _internalInvariant(index < endIndex._encodedOffset) return _nextGraphemeClusterBoundary(startingAt: index, nextScalar: nextScalar) } } internal func _nextGraphemeClusterBoundary( startingAt index: Int, nextScalar: (Int) -> (scalar: Unicode.Scalar, end: Int)? ) -> Int { // Note: If `index` isn't already on a boundary, then starting with an empty // state here sometimes leads to this method returning results that diverge // from the true breaks in the string. var state = _GraphemeBreakingState() var (scalar, index) = nextScalar(index)! while true { guard let (scalar2, nextIndex) = nextScalar(index) else { break } if state.shouldBreak(between: scalar, and: scalar2) { break } index = nextIndex scalar = scalar2 } return index } extension _StringGuts { fileprivate func previousBoundary( endingAt index: Int, previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)? ) -> Int { _previousGraphemeClusterBoundary(endingAt: index, previousScalar: previousScalar) } } // Returns the stride of the grapheme cluster ending at offset `index`. // // This method uses `previousScalar` to looks back in the string as far as // necessary to find a correct grapheme cluster boundary, whether or not // `index` happens to be on a boundary itself. internal func _previousGraphemeClusterBoundary( endingAt index: Int, previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)? ) -> Int { // FIXME: This requires potentially arbitrary lookback in each iteration, // leading to quadratic behavior in some edge cases. Ideally lookback should // only be done once per cluster (or in the case of RI sequences, once per // flag sequence). One way to avoid most quadratic behavior is to replace // this implementation with a scheme that first searches backwards for a // safe point then iterates forward using the regular `shouldBreak` until we // reach `index`, as recommended in section 6.4 of TR#29. // // https://www.unicode.org/reports/tr29/#Random_Access var (scalar2, index) = previousScalar(index)! while true { guard let (scalar1, previousIndex) = previousScalar(index) else { break } if _shouldBreakWithLookback( between: scalar1, and: scalar2, at: index, with: previousScalar ) { break } index = previousIndex scalar2 = scalar1 } return index } extension _GraphemeBreakingState { // Return true if there is an extended grapheme cluster boundary between two // scalars, based on state information previously collected about preceding // scalars. // // This method never looks at scalars other than the two that are explicitly // passed to it. The `state` parameter is assumed to hold all contextual // information necessary to make a correct decision; it gets updated with more // data as needed. // // This is based on the Unicode Annex #29 for [Grapheme Cluster Boundary // Rules](https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules). internal mutating func shouldBreak( between scalar1: Unicode.Scalar, and scalar2: Unicode.Scalar ) -> Bool { if let result = _quickHasGraphemeBreakBetween(scalar1, scalar2) { return result } let x = Unicode._GraphemeBreakProperty(from: scalar1) // GB4 handled here because we don't need to know `y` for this case if x == .control { return true } // This variable and the defer statement help toggle the isInEmojiSequence // state variable to false after every decision of 'shouldBreak'. If we // happen to see a rhs .extend or .zwj, then it's a signal that we should // continue treating the current grapheme cluster as an emoji sequence. var enterEmojiSequence = false // Very similar to emoji sequences, but for Indic grapheme sequences. var enterIndicSequence = false defer { isInEmojiSequence = enterEmojiSequence isInIndicSequence = enterIndicSequence } let y = Unicode._GraphemeBreakProperty(from: scalar2) switch (x, y) { // Fast path: If we know our scalars have no properties the decision is // trivial and we don't need to crawl to the default statement. case (.any, .any): return true // (GB4 is handled above) // GB5 case (_, .control): return true // GB6 case (.l, .l), (.l, .v), (.l, .lv), (.l, .lvt): return false // GB7 case (.lv, .v), (.v, .v), (.lv, .t), (.v, .t): return false // GB8 case (.lvt, .t), (.t, .t): return false // GB9 (partial GB9c and partial GB11) case (_, .extend), (_, .zwj): // Prepare for recognizing GB11, by remembering if we're in an emoji // sequence. // // GB11: Extended_Pictographic Extend* ZWJ × Extended_Pictographic // // If our left-side scalar is a pictograph, then it starts a new emoji // sequence; the sequence continues through subsequent extend/extend and // extend/zwj pairs. if ( x == .extendedPictographic || (isInEmojiSequence && x == .extend) ) { enterEmojiSequence = true } // GB9c: InCB=Consonant [InCB=Extend InCB=Linker]* InCB=Linker [InCB=Extend InCB=Linker]* × InCB=Consonant // // If our lhs is an InCB=Consonant and our rhs is either an InCB=Extend or // an InCB=Linker, then enter into an indic sequence and mark if scalar 2 // is a linker and that we've seen a linker. // // If the lhs is not an InCB=Consonant, then check if we're currently in // an indic sequence to properly propagate that back to the state. // Otherwise, we're not in an indic sequence, but our rhs is still an // extension scalar so don't break regardless right here. If we are in an // indic sequence, tell the state that we've seen a linker if our rhs is // one. switch (scalar1._isInCBConsonant, scalar2._isInCBExtend, scalar2._isInCBLinker) { // (InCB=Consonant, InCB=Extend) case (true, true, false): enterIndicSequence = true // (InCB=Consonant, InCB=Linker) case (true, false, true): enterIndicSequence = true hasSeenInCBLinker = true // (_, InCB=Extend) case (false, true, false): guard isInIndicSequence else { break } enterIndicSequence = true // (_, InCB=Linker) case (false, false, true): guard isInIndicSequence else { break } enterIndicSequence = true hasSeenInCBLinker = true default: break } return false // GB9a case (_, .spacingMark): return false // GB9b case (.prepend, _): return false // GB11 case (.zwj, .extendedPictographic): return !isInEmojiSequence // GB12 & GB13 case (.regionalIndicator, .regionalIndicator): defer { shouldBreakRI.toggle() } return shouldBreakRI // GB999 default: // GB9c if isInIndicSequence, hasSeenInCBLinker, scalar2._isInCBConsonant { hasSeenInCBLinker = false return false } return true } } } // Return true if there is an extended grapheme cluster boundary between two // scalars, with no previous knowledge about preceding scalars. // // This method looks back as far as it needs to determine the correct // placement of boundaries. // // This is based off of the Unicode Annex #29 for [Grapheme Cluster Boundary // Rules](https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules). fileprivate func _shouldBreakWithLookback( between scalar1: Unicode.Scalar, and scalar2: Unicode.Scalar, at index: Int, with previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)? ) -> Bool { if let result = _quickHasGraphemeBreakBetween(scalar1, scalar2) { return result } let x = Unicode._GraphemeBreakProperty(from: scalar1) let y = Unicode._GraphemeBreakProperty(from: scalar2) switch (x, y) { // Fast path: If we know our scalars have no properties the decision is // trivial and we don't need to crawl to the default statement. case (.any, .any): return true // GB4 case (.control, _): return true // GB5 case (_, .control): return true // GB6 case (.l, .l), (.l, .v), (.l, .lv), (.l, .lvt): return false // GB7 case (.lv, .v), (.v, .v), (.lv, .t), (.v, .t): return false // GB8 case (.lvt, .t), (.t, .t): return false // GB9 case (_, .extend), (_, .zwj): return false // GB9a case (_, .spacingMark): return false // GB9b case (.prepend, _): return false // GB11 case (.zwj, .extendedPictographic): return !_checkIfInEmojiSequence(at: index, with: previousScalar) // GB12 & GB13 case (.regionalIndicator, .regionalIndicator): return _countRIs(at: index, with: previousScalar) // GB999 default: // GB9c // // Check if our rhs is an InCB=Consonant first because we can more easily // exit out of this branch in most cases. Otherwise, this is a consonant. // Check that the lhs is an InCB=Extend or InCB=Linker (we have to check // if it's an .extend or .zwj first because _isInCBExtend assumes that it // is true). if scalar2._isInCBConsonant, (x == .extend || x == .zwj), (scalar1._isInCBExtend || scalar1._isInCBLinker) { return !_checkIfInIndicSequence(at: index, with: previousScalar) } return true } } // When walking backwards, it's impossible to know whether we were in an emoji // sequence without walking further backwards. This walks the string backwards // enough until we figure out whether or not to break our // (.zwj, .extendedPictographic) question. For example: // // Scalar view #1: // // [.control, .zwj, .extendedPictographic] // ^ // | = To determine whether or not we break here, we need // to see the previous scalar's grapheme property. // ^ // | = This is neither .extendedPictographic nor .extend, thus we // were never in an emoji sequence, so break between the .zwj // and .extendedPictographic. // // Scalar view #2: // // [.extendedPictographic, .zwj, .extendedPictographic] // ^ // | = Same as above, move backwards one to // view the previous scalar's property. // ^ // | = This is an .extendedPictographic, so this indicates that // we are in an emoji sequence, so we should NOT break // between the .zwj and .extendedPictographic. // // Scalar view #3: // // [.extendedPictographic, .extend, .extend, .zwj, .extendedPictographic] // ^ // | = Same as above // ^ // | = This is an .extend which means // there is a potential emoji // sequence, walk further backwards // to find an .extendedPictographic. // // <-- = Another extend, go backwards more. // ^ // | = We found our starting .extendedPictographic letting us // know that we are in an emoji sequence so our initial // break question is answered as NO. fileprivate func _checkIfInEmojiSequence( at index: Int, with previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)? ) -> Bool { guard var i = previousScalar(index)?.start else { return false } while let prev = previousScalar(i) { i = prev.start let gbp = Unicode._GraphemeBreakProperty(from: prev.scalar) switch gbp { case .extend: continue case .extendedPictographic: return true default: return false } } return false } // When walking backwards, it's impossible to know whether we break when we // see our first (InCB=Extend, InCB=Consonant) or (InCB=Linker, InCB=Consonant) // without walking further backwards. This walks the string backwards enough // until we figure out whether or not to break this indic sequence. For example: // // Scalar view #1: // // [InCB=Linker, InCB=Extend, InCB=Consonant] // ^ // | = To be able to know whether or not to // break these two, we need to walk // backwards to determine if this is a // legitimate indic sequence. // ^ // | = The scalar sequence ends without a starting InCB=Consonant, // so this is in fact not an indic sequence, so we can break the two. // // Scalar view #2: // // [InCB=Consonant, InCB=Linker, InCB=Extend, InCB=Consonant] // ^ // | = Same as above // ^ // | = This is a Linker, so we at least have seen // 1 to be able to return true if we see a // consonant later. // ^ // | = Is a consonant and we've seen a linker, so this is a // legitimate indic sequence, so do NOT break the initial question. fileprivate func _checkIfInIndicSequence( at index: Int, with previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)? ) -> Bool { guard let p = previousScalar(index) else { return false } var hasSeenInCBLinker = p.scalar._isInCBLinker var i = p.start while let (scalar, prev) = previousScalar(i) { i = prev if scalar._isInCBConsonant { return hasSeenInCBLinker } let gbp = Unicode._GraphemeBreakProperty(from: scalar) guard gbp == .extend || gbp == .zwj else { return false } switch (scalar._isInCBExtend, scalar._isInCBLinker) { case (false, false): return false case (false, true): hasSeenInCBLinker = true case (true, false): continue case (true, true): // This case should never happen, but if it does then just be cautious // and say this is invalid. return false } } return false } // When walking backwards, it's impossible to know whether we break when we // see our first (.regionalIndicator, .regionalIndicator) without walking // further backwards. This walks the string backwards enough until we figure // out whether or not to break these RIs. For example: // // Scalar view #1: // // [.control, .regionalIndicator, .regionalIndicator] // ^ // | = To be able to know whether or not to // break these two, we need to walk // backwards to determine if there were // any previous .regionalIndicators in // a row. // ^ // | = Not a .regionalIndicator, so our total riCount is 0 and 0 is // even thus we do not break. // // Scalar view #2: // // [.control, .regionalIndicator, .regionalIndicator, .regionalIndicator] // ^ // | = Same as above // ^ // | = This is a .regionalIndicator, so continue // walking backwards for more of them. riCount is // now equal to 1. // ^ // | = Not a .regionalIndicator. riCount = 1 which is odd, so break // the last two .regionalIndicators. fileprivate func _countRIs( at index: Int, with previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)? ) -> Bool { guard let p = previousScalar(index) else { return false } var i = p.start var riCount = 0 while let p = previousScalar(i) { i = p.start let gbp = Unicode._GraphemeBreakProperty(from: p.scalar) guard gbp == .regionalIndicator else { break } riCount += 1 } return riCount & 1 != 0 }