mirror of
https://github.com/apple/swift.git
synced 2025-12-14 20:36:38 +01:00
1165 lines
38 KiB
Swift
1165 lines
38 KiB
Swift
//===----------------------------------------------------------------------===//
|
||
//
|
||
// This source file is part of the Swift.org open source project
|
||
//
|
||
// Copyright (c) 2014 - 2023 Apple Inc. and the Swift project authors
|
||
// Licensed under Apache License v2.0 with Runtime Library Exception
|
||
//
|
||
// See https://swift.org/LICENSE.txt for license information
|
||
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
|
||
//
|
||
//===----------------------------------------------------------------------===//
|
||
|
||
import SwiftShims
|
||
|
||
/// CR and LF are common special cases in grapheme breaking logic
|
||
private var _CR: UInt8 { return 0x0D }
|
||
private var _LF: UInt8 { return 0x0A }
|
||
|
||
/// Perform a quick-check to determine if there's a grapheme-break between two
|
||
/// scalars, without consulting the data tables. Returns true if there
|
||
/// definitely is a break, false if there definitely is none, and nil if a
|
||
/// break couldn't be determined
|
||
internal func _quickHasGraphemeBreakBetween(
|
||
_ lhs: Unicode.Scalar, _ rhs: Unicode.Scalar
|
||
) -> Bool? {
|
||
// GB3:
|
||
// CR-LF is a special case: no break between these
|
||
if lhs == Unicode.Scalar(_CR) && rhs == Unicode.Scalar(_LF) {
|
||
return false
|
||
}
|
||
|
||
// Whether the given scalar, when it appears paired with another scalar
|
||
// satisfying this property, has a grapheme break between it and the other
|
||
// scalar.
|
||
func hasBreakWhenPaired(_ x: Unicode.Scalar) -> Bool {
|
||
// TODO: This doesn't generate optimal code, tune/re-write at a lower
|
||
// level.
|
||
//
|
||
// NOTE: Order of case ranges affects codegen, and thus performance. All
|
||
// things being equal, keep existing order below.
|
||
switch x.value {
|
||
// Unified CJK Han ideographs, common and some supplemental, amongst
|
||
// others:
|
||
// U+3400 ~ U+A4CF
|
||
case 0x3400...0xa4cf: return true
|
||
|
||
// Repeat sub-300 check, this is beneficial for common cases of Latin
|
||
// characters embedded within non-Latin script (e.g. newlines, spaces,
|
||
// proper nouns and/or jargon, punctuation).
|
||
//
|
||
// NOTE: CR-LF special case has already been checked.
|
||
case 0x0000...0x02ff: return true
|
||
|
||
// Non-combining kana:
|
||
// U+3041 ~ U+3096
|
||
// U+30A1 ~ U+30FC
|
||
case 0x3041...0x3096: return true
|
||
case 0x30a1...0x30fc: return true
|
||
|
||
// Non-combining modern (and some archaic) Cyrillic:
|
||
// U+0400 ~ U+0482 (first half of Cyrillic block)
|
||
case 0x0400...0x0482: return true
|
||
|
||
// Modern Arabic, excluding extenders and prependers:
|
||
// U+061D ~ U+064A
|
||
case 0x061d...0x064a: return true
|
||
|
||
// Precomposed Hangul syllables:
|
||
// U+AC00 ~ U+D7AF
|
||
case 0xac00...0xd7af: return true
|
||
|
||
// Common general use punctuation, excluding extenders:
|
||
// U+2010 ~ U+2029
|
||
case 0x2010...0x2029: return true
|
||
|
||
// CJK punctuation characters, excluding extenders:
|
||
// U+3000 ~ U+3029
|
||
case 0x3000...0x3029: return true
|
||
|
||
// Full-width forms:
|
||
// U+FF01 ~ U+FF9D
|
||
case 0xFF01...0xFF9D: return true
|
||
|
||
default: return false
|
||
}
|
||
}
|
||
if hasBreakWhenPaired(lhs) && hasBreakWhenPaired(rhs) {
|
||
return true
|
||
}
|
||
return nil
|
||
}
|
||
|
||
extension _StringGuts {
|
||
@inline(__always)
|
||
internal func roundDownToNearestCharacter(
|
||
_ i: String.Index
|
||
) -> String.Index {
|
||
_internalInvariant(i._isScalarAligned)
|
||
_internalInvariant(hasMatchingEncoding(i))
|
||
_internalInvariant(i._encodedOffset <= count)
|
||
|
||
let offset = i._encodedOffset
|
||
if _fastPath(i._isCharacterAligned) { return i }
|
||
if offset == 0 || offset == count { return i._characterAligned }
|
||
return _slowRoundDownToNearestCharacter(i)
|
||
}
|
||
|
||
@inline(never)
|
||
internal func _slowRoundDownToNearestCharacter(
|
||
_ i: String.Index
|
||
) -> String.Index {
|
||
let offset = i._encodedOffset
|
||
let start = offset - _opaqueCharacterStride(endingAt: offset)
|
||
let stride = _opaqueCharacterStride(startingAt: start)
|
||
_internalInvariant(offset <= start + stride,
|
||
"Grapheme breaking inconsistency")
|
||
if offset >= start + stride {
|
||
// Already aligned, or grapheme breaking returned an unexpected result.
|
||
return i._characterAligned
|
||
}
|
||
let r = String.Index(encodedOffset: start, characterStride: stride)
|
||
return markEncoding(r._characterAligned)
|
||
}
|
||
|
||
@inline(__always)
|
||
internal func roundDownToNearestCharacter(
|
||
_ i: String.Index,
|
||
in bounds: Range<String.Index>
|
||
) -> String.Index {
|
||
_internalInvariant(
|
||
bounds.lowerBound._isScalarAligned && bounds.upperBound._isScalarAligned)
|
||
_internalInvariant(
|
||
hasMatchingEncoding(bounds.lowerBound)
|
||
&& hasMatchingEncoding(bounds.upperBound))
|
||
_internalInvariant(bounds.upperBound <= endIndex)
|
||
|
||
_internalInvariant(i._isScalarAligned)
|
||
_internalInvariant(hasMatchingEncoding(i))
|
||
_internalInvariant(i >= bounds.lowerBound && i <= bounds.upperBound)
|
||
|
||
// We can only use the `_isCharacterAligned` bit if the start index is also
|
||
// character-aligned.
|
||
if _fastPath(
|
||
bounds.lowerBound._isCharacterAligned && i._isCharacterAligned
|
||
) {
|
||
return i
|
||
}
|
||
if i == bounds.lowerBound || i == bounds.upperBound { return i }
|
||
return _slowRoundDownToNearestCharacter(i, in: bounds)
|
||
}
|
||
|
||
@inline(never)
|
||
internal func _slowRoundDownToNearestCharacter(
|
||
_ i: String.Index,
|
||
in bounds: Range<String.Index>
|
||
) -> String.Index {
|
||
let offset = i._encodedOffset
|
||
|
||
let offsetBounds = bounds._encodedOffsetRange
|
||
let prior =
|
||
offset - _opaqueCharacterStride(endingAt: offset, in: offsetBounds)
|
||
let stride = _opaqueCharacterStride(startingAt: prior)
|
||
_internalInvariant(offset <= prior + stride,
|
||
"Grapheme breaking inconsistency")
|
||
if offset >= prior + stride {
|
||
// Already aligned, or grapheme breaking returned an unexpected result.
|
||
return i
|
||
}
|
||
var r = String.Index(encodedOffset: prior, characterStride: stride)
|
||
if bounds.lowerBound._isCharacterAligned {
|
||
r = r._characterAligned
|
||
} else {
|
||
r = r._scalarAligned
|
||
}
|
||
return markEncoding(r)
|
||
}
|
||
}
|
||
|
||
extension _StringGuts {
|
||
@usableFromInline @inline(never)
|
||
@_effects(releasenone)
|
||
internal func isOnGraphemeClusterBoundary(_ i: String.Index) -> Bool {
|
||
if i._isCharacterAligned { return true }
|
||
|
||
guard i.transcodedOffset == 0 else { return false }
|
||
|
||
let offset = i._encodedOffset
|
||
if offset == 0 || offset == self.count { return true }
|
||
|
||
guard isOnUnicodeScalarBoundary(i) else { return false }
|
||
|
||
let nearest = roundDownToNearestCharacter(i._scalarAligned)
|
||
return i == nearest
|
||
}
|
||
}
|
||
|
||
extension _StringGuts {
|
||
/// Return the length of the extended grapheme cluster starting at offset `i`,
|
||
/// assuming it falls on a grapheme cluster boundary.
|
||
///
|
||
/// Note: This does not look behind at data preceding `i`, so if `i` is not on
|
||
/// a grapheme cluster boundary, then it may return results that are
|
||
/// inconsistent with `_opaqueCharacterStride(endingAt:)`. On the other hand,
|
||
/// this behavior makes this suitable for use in substrings whose start index
|
||
/// itself does not fall on a cluster boundary.
|
||
@usableFromInline @inline(__always)
|
||
@_effects(releasenone)
|
||
internal func _opaqueCharacterStride(startingAt i: Int) -> Int {
|
||
_internalInvariant(i < endIndex._encodedOffset)
|
||
if isFastUTF8 {
|
||
let fast = unsafe withFastUTF8 { utf8 in
|
||
if i &+ 1 == utf8.count { return true }
|
||
let pair = unsafe UnsafeRawPointer(
|
||
utf8.baseAddress.unsafelyUnwrapped
|
||
).loadUnaligned(fromByteOffset: i, as: UInt16.self)
|
||
//& 0x8080 == 0 is "both not ASCII", != 0x0A0D is "not CRLF"
|
||
return pair & 0x8080 == 0 && pair != 0x0A0D
|
||
}
|
||
if _fastPath(fast) {
|
||
_internalInvariant(_opaqueComplexCharacterStride(startingAt: i) == 1)
|
||
return 1
|
||
}
|
||
}
|
||
|
||
return _opaqueComplexCharacterStride(startingAt: i)
|
||
}
|
||
|
||
@_effects(releasenone) @inline(never)
|
||
internal func _opaqueComplexCharacterStride(startingAt i: Int) -> Int {
|
||
if _slowPath(isForeign) {
|
||
return _foreignOpaqueCharacterStride(startingAt: i)
|
||
}
|
||
|
||
let nextIdx = unsafe withFastUTF8 { utf8 in
|
||
nextBoundary(startingAt: i) { j in
|
||
_internalInvariant(j >= 0)
|
||
guard j < utf8.count else { return nil }
|
||
let (scalar, len) = unsafe _decodeScalar(utf8, startingAt: j)
|
||
return (scalar, j &+ len)
|
||
}
|
||
}
|
||
|
||
return nextIdx &- i
|
||
}
|
||
|
||
/// Return the length of the extended grapheme cluster ending at offset `i`,
|
||
/// or if `i` happens to be in the middle of a grapheme cluster, find and
|
||
/// return the distance to its start.
|
||
///
|
||
/// Note: unlike `_opaqueCharacterStride(startingAt:)`, this method always
|
||
/// finds a correct grapheme cluster boundary.
|
||
|
||
@usableFromInline @inline(__always)
|
||
@_effects(releasenone)
|
||
internal func _opaqueCharacterStride(endingAt i: Int) -> Int {
|
||
if i <= 1 {
|
||
return i
|
||
}
|
||
if isFastUTF8 {
|
||
let fast = unsafe withFastUTF8 { utf8 in
|
||
let pair = unsafe UnsafeRawPointer(
|
||
utf8.baseAddress.unsafelyUnwrapped
|
||
).loadUnaligned(fromByteOffset: i &- 2, as: UInt16.self)
|
||
//& 0x8080 == 0 is "both not ASCII", != 0x0A0D is "not CRLF"
|
||
return pair & 0x8080 == 0 && pair != 0x0A0D
|
||
}
|
||
if _fastPath(fast) {
|
||
_internalInvariant(_opaqueComplexCharacterStride(endingAt: i) == 1)
|
||
return 1
|
||
}
|
||
}
|
||
|
||
return _opaqueComplexCharacterStride(endingAt: i)
|
||
}
|
||
|
||
@_effects(releasenone) @inline(never)
|
||
internal func _opaqueComplexCharacterStride(endingAt i: Int) -> Int {
|
||
if _slowPath(isForeign) {
|
||
return _foreignOpaqueCharacterStride(endingAt: i)
|
||
}
|
||
|
||
let previousIdx = unsafe withFastUTF8 { utf8 in
|
||
previousBoundary(endingAt: i) { j in
|
||
_internalInvariant(j <= utf8.count)
|
||
guard j > 0 else { return nil }
|
||
let (scalar, len) = unsafe _decodeScalar(utf8, endingAt: j)
|
||
return (scalar, j &- len)
|
||
}
|
||
}
|
||
|
||
return i &- previousIdx
|
||
}
|
||
|
||
/// Return the length of the extended grapheme cluster ending at offset `i` in
|
||
/// bounds, or if `i` happens to be in the middle of a grapheme cluster, find
|
||
/// and return the distance to its start.
|
||
///
|
||
/// Note: unlike `_opaqueCharacterStride(startingAt:)`, this method always
|
||
/// finds a correct grapheme cluster boundary within the substring defined by
|
||
/// the specified bounds.
|
||
@_effects(releasenone)
|
||
internal func _opaqueCharacterStride(
|
||
endingAt i: Int,
|
||
in bounds: Range<Int>
|
||
) -> Int {
|
||
_internalInvariant(i > bounds.lowerBound && i <= bounds.upperBound)
|
||
if _slowPath(isForeign) {
|
||
return _foreignOpaqueCharacterStride(endingAt: i, in: bounds)
|
||
}
|
||
|
||
let previousIdx = unsafe withFastUTF8 { utf8 in
|
||
previousBoundary(endingAt: i) { j in
|
||
_internalInvariant(j <= bounds.upperBound)
|
||
guard j > bounds.lowerBound else { return nil }
|
||
let (scalar, len) = unsafe _decodeScalar(utf8, endingAt: j)
|
||
return (scalar, j &- len)
|
||
}
|
||
}
|
||
|
||
_internalInvariant(bounds.contains(previousIdx))
|
||
return i &- previousIdx
|
||
}
|
||
|
||
@inline(never)
|
||
@_effects(releasenone)
|
||
private func _foreignOpaqueCharacterStride(startingAt i: Int) -> Int {
|
||
#if _runtime(_ObjC)
|
||
_internalInvariant(isForeign)
|
||
|
||
let nextIdx = nextBoundary(startingAt: i) { j in
|
||
_internalInvariant(j >= 0)
|
||
guard j < count else { return nil }
|
||
let scalars = String.UnicodeScalarView(self)
|
||
let idx = String.Index(_encodedOffset: j)
|
||
|
||
let scalar = scalars[idx]
|
||
let nextIdx = scalars.index(after: idx)
|
||
|
||
return (scalar, nextIdx._encodedOffset)
|
||
}
|
||
|
||
return nextIdx &- i
|
||
#else
|
||
fatalError("No foreign strings on Linux in this version of Swift")
|
||
#endif
|
||
}
|
||
|
||
@inline(never)
|
||
@_effects(releasenone)
|
||
private func _foreignOpaqueCharacterStride(
|
||
startingAt i: Int,
|
||
in bounds: Range<Int>
|
||
) -> Int {
|
||
#if _runtime(_ObjC)
|
||
_internalInvariant(isForeign)
|
||
_internalInvariant(bounds.contains(i))
|
||
|
||
let nextIdx = nextBoundary(startingAt: i) { j in
|
||
_internalInvariant(j >= bounds.lowerBound)
|
||
guard j < bounds.upperBound else { return nil }
|
||
let scalars = String.UnicodeScalarView(self)
|
||
let idx = String.Index(_encodedOffset: j)
|
||
|
||
let scalar = scalars[idx]
|
||
let nextIdx = scalars.index(after: idx)
|
||
|
||
return (scalar, nextIdx._encodedOffset)
|
||
}
|
||
|
||
return nextIdx &- i
|
||
#else
|
||
fatalError("No foreign strings on Linux in this version of Swift")
|
||
#endif
|
||
}
|
||
|
||
@inline(never)
|
||
@_effects(releasenone)
|
||
private func _foreignOpaqueCharacterStride(endingAt i: Int) -> Int {
|
||
#if _runtime(_ObjC)
|
||
_internalInvariant(isForeign)
|
||
|
||
let previousIdx = previousBoundary(endingAt: i) { j in
|
||
_internalInvariant(j <= self.count)
|
||
guard j > 0 else { return nil }
|
||
let scalars = String.UnicodeScalarView(self)
|
||
let idx = String.Index(_encodedOffset: j)
|
||
|
||
let previousIdx = scalars.index(before: idx)
|
||
let scalar = scalars[previousIdx]
|
||
|
||
return (scalar, previousIdx._encodedOffset)
|
||
}
|
||
|
||
return i &- previousIdx
|
||
#else
|
||
fatalError("No foreign strings on Linux in this version of Swift")
|
||
#endif
|
||
}
|
||
|
||
@inline(never)
|
||
@_effects(releasenone)
|
||
private func _foreignOpaqueCharacterStride(
|
||
endingAt i: Int,
|
||
in bounds: Range<Int>
|
||
) -> Int {
|
||
#if _runtime(_ObjC)
|
||
_internalInvariant(isForeign)
|
||
_internalInvariant(i > bounds.lowerBound && i <= bounds.upperBound)
|
||
|
||
let previousIdx = previousBoundary(endingAt: i) { j in
|
||
_internalInvariant(j <= bounds.upperBound)
|
||
guard j > bounds.lowerBound else { return nil }
|
||
let scalars = String.UnicodeScalarView(self)
|
||
let idx = String.Index(_encodedOffset: j)
|
||
|
||
let previousIdx = scalars.index(before: idx)
|
||
|
||
let scalar = scalars[previousIdx]
|
||
return (scalar, previousIdx._encodedOffset)
|
||
}
|
||
|
||
return i &- previousIdx
|
||
#else
|
||
fatalError("No foreign strings on Linux in this version of Swift")
|
||
#endif
|
||
}
|
||
}
|
||
|
||
extension Unicode.Scalar {
|
||
fileprivate var _isInCBConsonant: Bool {
|
||
_swift_stdlib_isInCB_Consonant(value)
|
||
}
|
||
|
||
fileprivate var _isInCBExtend: Bool {
|
||
// Assuming that we're already an Extend or ZWJ...
|
||
!(_isInCBConsonant || _isInCBLinker || value == 0x200C)
|
||
}
|
||
|
||
fileprivate var _isInCBLinker: Bool {
|
||
switch value {
|
||
// Devanagari
|
||
case 0x94D:
|
||
return true
|
||
// Bengali
|
||
case 0x9CD:
|
||
return true
|
||
// Gujarati
|
||
case 0xACD:
|
||
return true
|
||
// Oriya
|
||
case 0xB4D:
|
||
return true
|
||
// Telugu
|
||
case 0xC4D:
|
||
return true
|
||
// Malayalam
|
||
case 0xD4D:
|
||
return true
|
||
|
||
default:
|
||
return false
|
||
}
|
||
}
|
||
}
|
||
|
||
internal struct _GraphemeBreakingState: Sendable, Equatable {
|
||
// When we're looking through an indic sequence, one of the requirements is
|
||
// that there is at LEAST 1 InCB=Linker present between two InCB=Consonant.
|
||
// This value helps ensure that when we ultimately need to decide whether or
|
||
// not to break that we've at least seen 1 when walking.
|
||
var hasSeenInCBLinker = false
|
||
|
||
// When walking forwards in a string, we need to know whether or not we've
|
||
// entered an emoji sequence to be able to eventually break after all of the
|
||
// emoji's various extenders and zero width joiners. This bit allows us to
|
||
// keep track of whether or not we're still in an emoji sequence when deciding
|
||
// to break.
|
||
var isInEmojiSequence = false
|
||
|
||
// Similar to emoji sequences, we need to know not to break an Indic grapheme
|
||
// sequence. This sequence is (potentially) composed of many scalars and isn't
|
||
// as trivial as comparing two grapheme properties.
|
||
var isInIndicSequence = false
|
||
|
||
// When walking forward in a string, we need to not break on emoji flag
|
||
// sequences. Emoji flag sequences are composed of 2 regional indicators, so
|
||
// when we see our first (.regionalIndicator, .regionalIndicator) decision,
|
||
// we need to know to return false in this case. However, if the next scalar
|
||
// is another regional indicator, we reach the same decision rule, but in this
|
||
// case we actually need to break there's a boundary between emoji flag
|
||
// sequences.
|
||
var shouldBreakRI = false
|
||
}
|
||
|
||
extension _GraphemeBreakingState: CustomStringConvertible {
|
||
var description: String {
|
||
var r = "["
|
||
if hasSeenInCBLinker { r += "L" }
|
||
if isInEmojiSequence { r += "E" }
|
||
if isInIndicSequence { r += "I" }
|
||
if shouldBreakRI { r += "R" }
|
||
r += "]"
|
||
return r
|
||
}
|
||
}
|
||
|
||
extension Unicode {
|
||
/// A state machine for recognizing character (i.e., extended grapheme
|
||
/// cluster) boundaries in an arbitrary series of Unicode scalars.
|
||
///
|
||
/// To detect grapheme breaks in a sequence of Unicode scalars, feed each of
|
||
/// them to the `hasBreak(before:)` method. The method returns true if the
|
||
/// sequence has a grapheme break preceding the given value.
|
||
///
|
||
/// The results produced by this state machine are guaranteed to match the way
|
||
/// `String` splits its contents into `Character` values.
|
||
@available(SwiftStdlib 5.8, *)
|
||
public // SPI(Foundation) FIXME: We need API for this
|
||
struct _CharacterRecognizer: Sendable {
|
||
internal var _previous: Unicode.Scalar
|
||
internal var _state: _GraphemeBreakingState
|
||
|
||
/// Refactoring TODO: should we use a quick check result?
|
||
///
|
||
/// Returns a non-nil value if it can be determined whether there is a
|
||
/// grapheme break between `scalar1` and `scalar2` without knowing anything
|
||
/// about the scalars that precede `scalar1`. This can optionally be used as
|
||
/// a fast (but incomplete) test before spinning up a full state machine
|
||
/// session.
|
||
@_effects(releasenone)
|
||
public static func quickBreak(
|
||
between scalar1: Unicode.Scalar,
|
||
and scalar2: Unicode.Scalar
|
||
) -> Bool? {
|
||
_quickHasGraphemeBreakBetween(scalar1, scalar2)
|
||
}
|
||
|
||
/// Initialize a new character recognizer at the _start of text_ (sot)
|
||
/// position.
|
||
///
|
||
/// The resulting state machine will report a grapheme break on the
|
||
/// first scalar that is fed to it.
|
||
public init() {
|
||
_state = _GraphemeBreakingState()
|
||
// To avoid having to handle the empty case specially, we use NUL as the
|
||
// placeholder before the first scalar. NUL is a control character, so per
|
||
// rule GB5, it will induce an unconditional grapheme break before the
|
||
// first actual scalar, emulating GB1.
|
||
_previous = Unicode.Scalar(0 as UInt8)
|
||
}
|
||
|
||
/// Feeds the next scalar to the state machine, returning a Boolean value
|
||
/// indicating whether it starts a new extended grapheme cluster.
|
||
///
|
||
/// This method will always report a break the first time it is called
|
||
/// on a newly initialized recognizer.
|
||
///
|
||
/// The state machine does not carry information across character
|
||
/// boundaries. I.e., if this method returns true, then `self` after the
|
||
/// call is equivalent to feeding the same scalar to a newly initialized
|
||
/// recognizer instance.
|
||
@_effects(releasenone)
|
||
public mutating func hasBreak(
|
||
before next: Unicode.Scalar
|
||
) -> Bool {
|
||
let r = _state.shouldBreak(between: _previous, and: next)
|
||
if r {
|
||
_state = _GraphemeBreakingState()
|
||
}
|
||
_previous = next
|
||
return r
|
||
}
|
||
|
||
/// Decode the scalars in the given UTF-8 buffer and feed them to the
|
||
/// recognizer up to and including the scalar following the first grapheme
|
||
/// break. If the buffer contains a grapheme break, then this function
|
||
/// returns the index range of the scalar that follows the first one;
|
||
/// otherwise it returns `nil`.
|
||
///
|
||
/// On return, the state of the recognizer is updated to reflect the scalars
|
||
/// up to and including the returned one. You can detect additional grapheme
|
||
/// breaks by feeding the recognizer subsequent data.
|
||
///
|
||
/// - Parameter buffer: A buffer containing valid UTF-8 data, starting and
|
||
/// ending on Unicode scalar boundaries.
|
||
///
|
||
/// - Parameter start: A valid index into `buffer`, addressing the first
|
||
/// code unit of a UTF-8 scalar in the buffer, or the end.
|
||
///
|
||
/// - Returns: The index range of the scalar that follows the first grapheme
|
||
/// break in the buffer, if there is one. If the buffer contains no
|
||
/// grapheme breaks, then this function returns `nil`.
|
||
///
|
||
/// - Warning: This function does not validate that the buffer contains
|
||
/// valid UTF-8 data; its behavior is undefined if given invalid input.
|
||
@_effects(releasenone)
|
||
public mutating func _firstBreak(
|
||
inUncheckedUnsafeUTF8Buffer buffer: UnsafeBufferPointer<UInt8>,
|
||
startingAt start: Int = 0
|
||
) -> Range<Int>? {
|
||
var i = start
|
||
while i < buffer.endIndex {
|
||
let (next, n) = unsafe _decodeScalar(buffer, startingAt: i)
|
||
if hasBreak(before: next) {
|
||
return unsafe Range(_uncheckedBounds: (i, i &+ n))
|
||
}
|
||
i &+= n
|
||
}
|
||
return nil
|
||
}
|
||
}
|
||
}
|
||
|
||
@available(SwiftStdlib 5.9, *)
|
||
extension Unicode._CharacterRecognizer: Equatable {
|
||
public static func ==(left: Self, right: Self) -> Bool {
|
||
left._previous == right._previous && left._state == right._state
|
||
}
|
||
}
|
||
|
||
@available(SwiftStdlib 5.9, *)
|
||
extension Unicode._CharacterRecognizer: CustomStringConvertible {
|
||
public var description: String {
|
||
return "\(_state)U+\(String(_previous.value, radix: 16, uppercase: true))"
|
||
}
|
||
}
|
||
|
||
|
||
extension _StringGuts {
|
||
// Returns the stride of the grapheme cluster starting at offset `index`,
|
||
// assuming it is on a grapheme cluster boundary.
|
||
//
|
||
// This method never looks at data below `index`. If `index` isn't on a
|
||
// grapheme cluster boundary, then the result may not be consistent with the
|
||
// actual breaks in the string. `Substring` relies on this to generate the
|
||
// right breaks if its start index isn't aligned on one -- in this case, the
|
||
// substring's breaks may not match the ones in its base string.
|
||
internal func nextBoundary(
|
||
startingAt index: Int,
|
||
nextScalar: (Int) -> (scalar: Unicode.Scalar, end: Int)?
|
||
) -> Int {
|
||
_internalInvariant(index < endIndex._encodedOffset)
|
||
return _nextGraphemeClusterBoundary(startingAt: index, nextScalar: nextScalar)
|
||
}
|
||
}
|
||
|
||
internal func _nextGraphemeClusterBoundary(
|
||
startingAt index: Int,
|
||
nextScalar: (Int) -> (scalar: Unicode.Scalar, end: Int)?
|
||
) -> Int {
|
||
|
||
// Note: If `index` isn't already on a boundary, then starting with an empty
|
||
// state here sometimes leads to this method returning results that diverge
|
||
// from the true breaks in the string.
|
||
var state = _GraphemeBreakingState()
|
||
var (scalar, index) = nextScalar(index)!
|
||
|
||
while true {
|
||
guard let (scalar2, nextIndex) = nextScalar(index) else { break }
|
||
if state.shouldBreak(between: scalar, and: scalar2) {
|
||
break
|
||
}
|
||
index = nextIndex
|
||
scalar = scalar2
|
||
}
|
||
|
||
return index
|
||
}
|
||
|
||
extension _StringGuts {
|
||
fileprivate func previousBoundary(
|
||
endingAt index: Int,
|
||
previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)?
|
||
) -> Int {
|
||
_previousGraphemeClusterBoundary(endingAt: index, previousScalar: previousScalar)
|
||
}
|
||
|
||
}
|
||
|
||
// Returns the stride of the grapheme cluster ending at offset `index`.
|
||
//
|
||
// This method uses `previousScalar` to looks back in the string as far as
|
||
// necessary to find a correct grapheme cluster boundary, whether or not
|
||
// `index` happens to be on a boundary itself.
|
||
internal func _previousGraphemeClusterBoundary(
|
||
endingAt index: Int,
|
||
previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)?
|
||
) -> Int {
|
||
// FIXME: This requires potentially arbitrary lookback in each iteration,
|
||
// leading to quadratic behavior in some edge cases. Ideally lookback should
|
||
// only be done once per cluster (or in the case of RI sequences, once per
|
||
// flag sequence). One way to avoid most quadratic behavior is to replace
|
||
// this implementation with a scheme that first searches backwards for a
|
||
// safe point then iterates forward using the regular `shouldBreak` until we
|
||
// reach `index`, as recommended in section 6.4 of TR#29.
|
||
//
|
||
// https://www.unicode.org/reports/tr29/#Random_Access
|
||
|
||
var (scalar2, index) = previousScalar(index)!
|
||
|
||
while true {
|
||
guard let (scalar1, previousIndex) = previousScalar(index) else { break }
|
||
if _shouldBreakWithLookback(
|
||
between: scalar1, and: scalar2, at: index, with: previousScalar
|
||
) {
|
||
break
|
||
}
|
||
index = previousIndex
|
||
scalar2 = scalar1
|
||
}
|
||
|
||
return index
|
||
}
|
||
|
||
extension _GraphemeBreakingState {
|
||
// Return true if there is an extended grapheme cluster boundary between two
|
||
// scalars, based on state information previously collected about preceding
|
||
// scalars.
|
||
//
|
||
// This method never looks at scalars other than the two that are explicitly
|
||
// passed to it. The `state` parameter is assumed to hold all contextual
|
||
// information necessary to make a correct decision; it gets updated with more
|
||
// data as needed.
|
||
//
|
||
// This is based on the Unicode Annex #29 for [Grapheme Cluster Boundary
|
||
// Rules](https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules).
|
||
internal mutating func shouldBreak(
|
||
between scalar1: Unicode.Scalar,
|
||
and scalar2: Unicode.Scalar
|
||
) -> Bool {
|
||
if let result = _quickHasGraphemeBreakBetween(scalar1, scalar2) {
|
||
return result
|
||
}
|
||
|
||
let x = Unicode._GraphemeBreakProperty(from: scalar1)
|
||
|
||
// GB4 handled here because we don't need to know `y` for this case
|
||
if x == .control {
|
||
return true
|
||
}
|
||
|
||
// This variable and the defer statement help toggle the isInEmojiSequence
|
||
// state variable to false after every decision of 'shouldBreak'. If we
|
||
// happen to see a rhs .extend or .zwj, then it's a signal that we should
|
||
// continue treating the current grapheme cluster as an emoji sequence.
|
||
var enterEmojiSequence = false
|
||
|
||
// Very similar to emoji sequences, but for Indic grapheme sequences.
|
||
var enterIndicSequence = false
|
||
|
||
defer {
|
||
isInEmojiSequence = enterEmojiSequence
|
||
isInIndicSequence = enterIndicSequence
|
||
}
|
||
|
||
let y = Unicode._GraphemeBreakProperty(from: scalar2)
|
||
|
||
switch (x, y) {
|
||
|
||
// Fast path: If we know our scalars have no properties the decision is
|
||
// trivial and we don't need to crawl to the default statement.
|
||
case (.any, .any):
|
||
return true
|
||
|
||
// (GB4 is handled above)
|
||
|
||
// GB5
|
||
case (_, .control):
|
||
return true
|
||
|
||
// GB6
|
||
case (.l, .l),
|
||
(.l, .v),
|
||
(.l, .lv),
|
||
(.l, .lvt):
|
||
return false
|
||
|
||
// GB7
|
||
case (.lv, .v),
|
||
(.v, .v),
|
||
(.lv, .t),
|
||
(.v, .t):
|
||
return false
|
||
|
||
// GB8
|
||
case (.lvt, .t),
|
||
(.t, .t):
|
||
return false
|
||
|
||
// GB9 (partial GB9c and partial GB11)
|
||
case (_, .extend),
|
||
(_, .zwj):
|
||
|
||
// Prepare for recognizing GB11, by remembering if we're in an emoji
|
||
// sequence.
|
||
//
|
||
// GB11: Extended_Pictographic Extend* ZWJ × Extended_Pictographic
|
||
//
|
||
// If our left-side scalar is a pictograph, then it starts a new emoji
|
||
// sequence; the sequence continues through subsequent extend/extend and
|
||
// extend/zwj pairs.
|
||
if (
|
||
x == .extendedPictographic || (isInEmojiSequence && x == .extend)
|
||
) {
|
||
enterEmojiSequence = true
|
||
}
|
||
|
||
// GB9c: InCB=Consonant [InCB=Extend InCB=Linker]* InCB=Linker [InCB=Extend InCB=Linker]* × InCB=Consonant
|
||
//
|
||
// If our lhs is an InCB=Consonant and our rhs is either an InCB=Extend or
|
||
// an InCB=Linker, then enter into an indic sequence and mark if scalar 2
|
||
// is a linker and that we've seen a linker.
|
||
//
|
||
// If the lhs is not an InCB=Consonant, then check if we're currently in
|
||
// an indic sequence to properly propagate that back to the state.
|
||
// Otherwise, we're not in an indic sequence, but our rhs is still an
|
||
// extension scalar so don't break regardless right here. If we are in an
|
||
// indic sequence, tell the state that we've seen a linker if our rhs is
|
||
// one.
|
||
switch (scalar1._isInCBConsonant, scalar2._isInCBExtend, scalar2._isInCBLinker) {
|
||
// (InCB=Consonant, InCB=Extend)
|
||
case (true, true, false):
|
||
enterIndicSequence = true
|
||
|
||
// (InCB=Consonant, InCB=Linker)
|
||
case (true, false, true):
|
||
enterIndicSequence = true
|
||
hasSeenInCBLinker = true
|
||
|
||
// (_, InCB=Extend)
|
||
case (false, true, false):
|
||
guard isInIndicSequence else {
|
||
break
|
||
}
|
||
|
||
enterIndicSequence = true
|
||
|
||
// (_, InCB=Linker)
|
||
case (false, false, true):
|
||
guard isInIndicSequence else {
|
||
break
|
||
}
|
||
|
||
enterIndicSequence = true
|
||
hasSeenInCBLinker = true
|
||
|
||
default:
|
||
break
|
||
}
|
||
|
||
return false
|
||
|
||
// GB9a
|
||
case (_, .spacingMark):
|
||
return false
|
||
|
||
// GB9b
|
||
case (.prepend, _):
|
||
return false
|
||
|
||
// GB11
|
||
case (.zwj, .extendedPictographic):
|
||
return !isInEmojiSequence
|
||
|
||
// GB12 & GB13
|
||
case (.regionalIndicator, .regionalIndicator):
|
||
defer {
|
||
shouldBreakRI.toggle()
|
||
}
|
||
|
||
return shouldBreakRI
|
||
|
||
// GB999
|
||
default:
|
||
// GB9c
|
||
if isInIndicSequence, hasSeenInCBLinker, scalar2._isInCBConsonant {
|
||
hasSeenInCBLinker = false
|
||
return false
|
||
}
|
||
|
||
return true
|
||
}
|
||
}
|
||
}
|
||
|
||
// Return true if there is an extended grapheme cluster boundary between two
|
||
// scalars, with no previous knowledge about preceding scalars.
|
||
//
|
||
// This method looks back as far as it needs to determine the correct
|
||
// placement of boundaries.
|
||
//
|
||
// This is based off of the Unicode Annex #29 for [Grapheme Cluster Boundary
|
||
// Rules](https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules).
|
||
fileprivate func _shouldBreakWithLookback(
|
||
between scalar1: Unicode.Scalar,
|
||
and scalar2: Unicode.Scalar,
|
||
at index: Int,
|
||
with previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)?
|
||
) -> Bool {
|
||
if let result = _quickHasGraphemeBreakBetween(scalar1, scalar2) {
|
||
return result
|
||
}
|
||
|
||
let x = Unicode._GraphemeBreakProperty(from: scalar1)
|
||
let y = Unicode._GraphemeBreakProperty(from: scalar2)
|
||
|
||
switch (x, y) {
|
||
|
||
// Fast path: If we know our scalars have no properties the decision is
|
||
// trivial and we don't need to crawl to the default statement.
|
||
case (.any, .any):
|
||
return true
|
||
|
||
// GB4
|
||
case (.control, _):
|
||
return true
|
||
|
||
// GB5
|
||
case (_, .control):
|
||
return true
|
||
|
||
// GB6
|
||
case (.l, .l),
|
||
(.l, .v),
|
||
(.l, .lv),
|
||
(.l, .lvt):
|
||
return false
|
||
|
||
// GB7
|
||
case (.lv, .v),
|
||
(.v, .v),
|
||
(.lv, .t),
|
||
(.v, .t):
|
||
return false
|
||
|
||
// GB8
|
||
case (.lvt, .t),
|
||
(.t, .t):
|
||
return false
|
||
|
||
// GB9
|
||
case (_, .extend),
|
||
(_, .zwj):
|
||
return false
|
||
|
||
// GB9a
|
||
case (_, .spacingMark):
|
||
return false
|
||
|
||
// GB9b
|
||
case (.prepend, _):
|
||
return false
|
||
|
||
// GB11
|
||
case (.zwj, .extendedPictographic):
|
||
return !_checkIfInEmojiSequence(at: index, with: previousScalar)
|
||
|
||
// GB12 & GB13
|
||
case (.regionalIndicator, .regionalIndicator):
|
||
return _countRIs(at: index, with: previousScalar)
|
||
|
||
// GB999
|
||
default:
|
||
// GB9c
|
||
//
|
||
// Check if our rhs is an InCB=Consonant first because we can more easily
|
||
// exit out of this branch in most cases. Otherwise, this is a consonant.
|
||
// Check that the lhs is an InCB=Extend or InCB=Linker (we have to check
|
||
// if it's an .extend or .zwj first because _isInCBExtend assumes that it
|
||
// is true).
|
||
if scalar2._isInCBConsonant,
|
||
(x == .extend || x == .zwj),
|
||
(scalar1._isInCBExtend || scalar1._isInCBLinker) {
|
||
return !_checkIfInIndicSequence(at: index, with: previousScalar)
|
||
}
|
||
|
||
return true
|
||
}
|
||
}
|
||
|
||
// When walking backwards, it's impossible to know whether we were in an emoji
|
||
// sequence without walking further backwards. This walks the string backwards
|
||
// enough until we figure out whether or not to break our
|
||
// (.zwj, .extendedPictographic) question. For example:
|
||
//
|
||
// Scalar view #1:
|
||
//
|
||
// [.control, .zwj, .extendedPictographic]
|
||
// ^
|
||
// | = To determine whether or not we break here, we need
|
||
// to see the previous scalar's grapheme property.
|
||
// ^
|
||
// | = This is neither .extendedPictographic nor .extend, thus we
|
||
// were never in an emoji sequence, so break between the .zwj
|
||
// and .extendedPictographic.
|
||
//
|
||
// Scalar view #2:
|
||
//
|
||
// [.extendedPictographic, .zwj, .extendedPictographic]
|
||
// ^
|
||
// | = Same as above, move backwards one to
|
||
// view the previous scalar's property.
|
||
// ^
|
||
// | = This is an .extendedPictographic, so this indicates that
|
||
// we are in an emoji sequence, so we should NOT break
|
||
// between the .zwj and .extendedPictographic.
|
||
//
|
||
// Scalar view #3:
|
||
//
|
||
// [.extendedPictographic, .extend, .extend, .zwj, .extendedPictographic]
|
||
// ^
|
||
// | = Same as above
|
||
// ^
|
||
// | = This is an .extend which means
|
||
// there is a potential emoji
|
||
// sequence, walk further backwards
|
||
// to find an .extendedPictographic.
|
||
//
|
||
// <-- = Another extend, go backwards more.
|
||
// ^
|
||
// | = We found our starting .extendedPictographic letting us
|
||
// know that we are in an emoji sequence so our initial
|
||
// break question is answered as NO.
|
||
fileprivate func _checkIfInEmojiSequence(
|
||
at index: Int,
|
||
with previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)?
|
||
) -> Bool {
|
||
guard var i = previousScalar(index)?.start else { return false }
|
||
while let prev = previousScalar(i) {
|
||
i = prev.start
|
||
let gbp = Unicode._GraphemeBreakProperty(from: prev.scalar)
|
||
|
||
switch gbp {
|
||
case .extend:
|
||
continue
|
||
case .extendedPictographic:
|
||
return true
|
||
default:
|
||
return false
|
||
}
|
||
}
|
||
return false
|
||
}
|
||
|
||
// When walking backwards, it's impossible to know whether we break when we
|
||
// see our first (InCB=Extend, InCB=Consonant) or (InCB=Linker, InCB=Consonant)
|
||
// without walking further backwards. This walks the string backwards enough
|
||
// until we figure out whether or not to break this indic sequence. For example:
|
||
//
|
||
// Scalar view #1:
|
||
//
|
||
// [InCB=Linker, InCB=Extend, InCB=Consonant]
|
||
// ^
|
||
// | = To be able to know whether or not to
|
||
// break these two, we need to walk
|
||
// backwards to determine if this is a
|
||
// legitimate indic sequence.
|
||
// ^
|
||
// | = The scalar sequence ends without a starting InCB=Consonant,
|
||
// so this is in fact not an indic sequence, so we can break the two.
|
||
//
|
||
// Scalar view #2:
|
||
//
|
||
// [InCB=Consonant, InCB=Linker, InCB=Extend, InCB=Consonant]
|
||
// ^
|
||
// | = Same as above
|
||
// ^
|
||
// | = This is a Linker, so we at least have seen
|
||
// 1 to be able to return true if we see a
|
||
// consonant later.
|
||
// ^
|
||
// | = Is a consonant and we've seen a linker, so this is a
|
||
// legitimate indic sequence, so do NOT break the initial question.
|
||
fileprivate func _checkIfInIndicSequence(
|
||
at index: Int,
|
||
with previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)?
|
||
) -> Bool {
|
||
guard let p = previousScalar(index) else { return false }
|
||
|
||
var hasSeenInCBLinker = p.scalar._isInCBLinker
|
||
var i = p.start
|
||
|
||
while let (scalar, prev) = previousScalar(i) {
|
||
i = prev
|
||
|
||
if scalar._isInCBConsonant {
|
||
return hasSeenInCBLinker
|
||
}
|
||
|
||
let gbp = Unicode._GraphemeBreakProperty(from: scalar)
|
||
|
||
guard gbp == .extend || gbp == .zwj else {
|
||
return false
|
||
}
|
||
|
||
switch (scalar._isInCBExtend, scalar._isInCBLinker) {
|
||
case (false, false):
|
||
return false
|
||
|
||
case (false, true):
|
||
hasSeenInCBLinker = true
|
||
|
||
case (true, false):
|
||
continue
|
||
|
||
case (true, true):
|
||
// This case should never happen, but if it does then just be cautious
|
||
// and say this is invalid.
|
||
return false
|
||
}
|
||
}
|
||
|
||
return false
|
||
}
|
||
|
||
// When walking backwards, it's impossible to know whether we break when we
|
||
// see our first (.regionalIndicator, .regionalIndicator) without walking
|
||
// further backwards. This walks the string backwards enough until we figure
|
||
// out whether or not to break these RIs. For example:
|
||
//
|
||
// Scalar view #1:
|
||
//
|
||
// [.control, .regionalIndicator, .regionalIndicator]
|
||
// ^
|
||
// | = To be able to know whether or not to
|
||
// break these two, we need to walk
|
||
// backwards to determine if there were
|
||
// any previous .regionalIndicators in
|
||
// a row.
|
||
// ^
|
||
// | = Not a .regionalIndicator, so our total riCount is 0 and 0 is
|
||
// even thus we do not break.
|
||
//
|
||
// Scalar view #2:
|
||
//
|
||
// [.control, .regionalIndicator, .regionalIndicator, .regionalIndicator]
|
||
// ^
|
||
// | = Same as above
|
||
// ^
|
||
// | = This is a .regionalIndicator, so continue
|
||
// walking backwards for more of them. riCount is
|
||
// now equal to 1.
|
||
// ^
|
||
// | = Not a .regionalIndicator. riCount = 1 which is odd, so break
|
||
// the last two .regionalIndicators.
|
||
fileprivate func _countRIs(
|
||
at index: Int,
|
||
with previousScalar: (Int) -> (scalar: Unicode.Scalar, start: Int)?
|
||
) -> Bool {
|
||
guard let p = previousScalar(index) else { return false }
|
||
var i = p.start
|
||
var riCount = 0
|
||
while let p = previousScalar(i) {
|
||
i = p.start
|
||
|
||
let gbp = Unicode._GraphemeBreakProperty(from: p.scalar)
|
||
guard gbp == .regionalIndicator else {
|
||
break
|
||
}
|
||
|
||
riCount += 1
|
||
}
|
||
return riCount & 1 != 0
|
||
}
|