mirror of
https://github.com/apple/swift.git
synced 2025-12-14 20:36:38 +01:00
688 lines
20 KiB
Swift
688 lines
20 KiB
Swift
//===----------------------------------------------------------------------===//
|
|
//
|
|
// This source file is part of the Swift.org open source project
|
|
//
|
|
// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
|
|
// Licensed under Apache License v2.0 with Runtime Library Exception
|
|
//
|
|
// See https://swift.org/LICENSE.txt for license information
|
|
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
import SwiftShims
|
|
|
|
/// CR and LF are common special cases in grapheme breaking logic.
private var _CR: UInt8 { 0x0d }
private var _LF: UInt8 { 0x0a }

/// Fast-path test for a grapheme break between two adjacent scalars.
///
/// Returns `true` only when *both* scalars lie in one of the value ranges
/// listed in the nested helper below. The CR-LF pair is special-cased and
/// always yields `false`. For every other pair a `false` result is not a
/// verdict; `shouldBreak` falls through to the property-based rules when this
/// function returns `false`.
///
/// - Parameters:
///   - lhs: The scalar on the left of the candidate boundary.
///   - rhs: The scalar on the right of the candidate boundary.
/// - Returns: `true` if both scalars are in the fast-path ranges and the pair
///   is not CR-LF; otherwise `false`.
internal func _hasGraphemeBreakBetween(
  _ lhs: Unicode.Scalar, _ rhs: Unicode.Scalar
) -> Bool {
  // CR followed by LF is never split.
  guard !(lhs == Unicode.Scalar(_CR) && rhs == Unicode.Scalar(_LF)) else {
    return false
  }

  // Whether `scalar` is in the set of scalars for which a break exists
  // whenever its neighbor is also in the set.
  func isInFastPathSet(_ scalar: Unicode.Scalar) -> Bool {
    // TODO: This doesn't generate optimal code, tune/re-write at a lower
    // level.
    //
    // NOTE: The order of the ranges affects codegen, and thus performance.
    // All things being equal, keep the existing order below.
    switch scalar.value {
    case
      // Unified CJK Han ideographs, common and some supplemental, amongst
      // others: U+3400 ~ U+A4CF
      0x3400...0xa4cf,

      // Repeat sub-300 check; this is beneficial for common cases of Latin
      // characters embedded within non-Latin script (e.g. newlines, spaces,
      // proper nouns and/or jargon, punctuation).
      //
      // NOTE: The CR-LF special case has already been handled above.
      0x0000...0x02ff,

      // Non-combining kana: U+3041 ~ U+3096 and U+30A1 ~ U+30FC
      0x3041...0x3096,
      0x30a1...0x30fc,

      // Non-combining modern (and some archaic) Cyrillic:
      // U+0400 ~ U+0482 (first half of Cyrillic block)
      0x0400...0x0482,

      // Modern Arabic, excluding extenders and prependers: U+061D ~ U+064A
      0x061d...0x064a,

      // Precomposed Hangul syllables: U+AC00 ~ U+D7AF
      0xac00...0xd7af,

      // Common general use punctuation, excluding extenders:
      // U+2010 ~ U+2029
      0x2010...0x2029,

      // CJK punctuation characters, excluding extenders: U+3000 ~ U+3029
      0x3000...0x3029,

      // Full-width forms: U+FF01 ~ U+FF9D
      0xFF01...0xFF9D:
      return true

    default:
      return false
    }
  }

  guard isInFastPathSet(lhs) else { return false }
  return isInFastPathSet(rhs)
}
|
|
|
|
extension _StringGuts {
  /// Reports whether `i` lies on a grapheme cluster boundary.
  ///
  /// An index with a nonzero transcoded offset is never a boundary; offset 0
  /// and offset `count` always are. Any other index must first be on a
  /// Unicode scalar boundary and must then survive a round trip of stepping
  /// one character forward and one character back.
  @usableFromInline @inline(never)
  @_effects(releasenone)
  internal func isOnGraphemeClusterBoundary(_ i: String.Index) -> Bool {
    if i.transcodedOffset != 0 { return false }

    let utf8Offset = i._encodedOffset
    guard utf8Offset != 0, utf8Offset != count else { return true }

    if !isOnUnicodeScalarBoundary(i) { return false }

    // If `i` were in the middle of a character, stepping forward and then
    // back would land somewhere other than `i`.
    let string = String(self)
    let following = string.index(after: i)
    return string.index(before: following) == i
  }

  /// Returns the distance, in encoded units, from offset `i` to the next
  /// grapheme boundary found by `nextBoundary(startingAt:nextScalar:)`.
  ///
  /// Foreign strings are forwarded to the foreign implementation; otherwise
  /// scalars are decoded directly from the fast UTF-8 buffer.
  @usableFromInline @inline(never)
  @_effects(releasenone)
  internal func _opaqueCharacterStride(startingAt i: Int) -> Int {
    if _slowPath(isForeign) {
      return _foreignOpaqueCharacterStride(startingAt: i)
    }

    let boundary = withFastUTF8 { bytes in
      nextBoundary(startingAt: i) { offset in
        let (decoded, width) = _decodeScalar(bytes, startingAt: offset)
        return (decoded, offset &+ width)
      }
    }

    return boundary &- i
  }

  /// Returns the distance, in encoded units, from the previous grapheme
  /// boundary found by `previousBoundary(endingAt:previousScalar:)` up to
  /// offset `i`.
  ///
  /// Foreign strings are forwarded to the foreign implementation; otherwise
  /// scalars are decoded directly from the fast UTF-8 buffer.
  @usableFromInline @inline(never)
  @_effects(releasenone)
  internal func _opaqueCharacterStride(endingAt i: Int) -> Int {
    if _slowPath(isForeign) {
      return _foreignOpaqueCharacterStride(endingAt: i)
    }

    let boundary = withFastUTF8 { bytes in
      previousBoundary(endingAt: i) { offset in
        let (decoded, width) = _decodeScalar(bytes, endingAt: offset)
        return (decoded, offset &- width)
      }
    }

    return i &- boundary
  }

  /// Foreign-string counterpart of `_opaqueCharacterStride(startingAt:)`.
  ///
  /// Scalars are read through `String.UnicodeScalarView` rather than from a
  /// UTF-8 buffer. Without the ObjC runtime this traps unconditionally.
  @inline(never)
  @_effects(releasenone)
  private func _foreignOpaqueCharacterStride(startingAt i: Int) -> Int {
#if _runtime(_ObjC)
    _internalInvariant(isForeign)

    let boundary = nextBoundary(startingAt: i) { offset in
      let view = String.UnicodeScalarView(self)
      let position = String.Index(_encodedOffset: offset)

      // Read the scalar first, then find where the following scalar begins.
      return (view[position], view.index(after: position)._encodedOffset)
    }

    return boundary &- i
#else
    fatalError("No foreign strings on Linux in this version of Swift")
#endif
  }

  /// Foreign-string counterpart of `_opaqueCharacterStride(endingAt:)`.
  ///
  /// Scalars are read through `String.UnicodeScalarView` rather than from a
  /// UTF-8 buffer. Without the ObjC runtime this traps unconditionally.
  @inline(never)
  @_effects(releasenone)
  private func _foreignOpaqueCharacterStride(endingAt i: Int) -> Int {
#if _runtime(_ObjC)
    _internalInvariant(isForeign)

    let boundary = previousBoundary(endingAt: i) { offset in
      let view = String.UnicodeScalarView(self)

      // Step back to the start of the scalar that ends at `offset`.
      let start = view.index(before: String.Index(_encodedOffset: offset))
      return (view[start], start._encodedOffset)
    }

    return i &- boundary
#else
    fatalError("No foreign strings on Linux in this version of Swift")
#endif
  }
}
|
|
|
|
extension Unicode.Scalar {
  /// Whether the runtime's `_swift_stdlib_isLinkingConsonant` lookup reports
  /// this scalar as a linking consonant.
  fileprivate var _isLinkingConsonant: Bool {
    return _swift_stdlib_isLinkingConsonant(value)
  }

  /// Whether this scalar is one of the virama code points recognized by the
  /// Indic conjunct handling in this file.
  fileprivate var _isVirama: Bool {
    switch value {
    case 0x94D, // Devanagari
         0x9CD, // Bengali
         0xACD, // Gujarati
         0xB4D, // Oriya
         0xC4D, // Telugu
         0xD4D: // Malayalam
      return true
    default:
      return false
    }
  }
}
|
|
|
|
/// Flags carried from one scalar pair to the next while walking a string to
/// find a grapheme boundary.
internal struct _GraphemeBreakingState {
  /// An Indic sequence requires at LEAST one Virama between two linking
  /// consonants. This records that one has been seen during the walk, so
  /// that the eventual break decision can rely on it.
  var hasSeenVirama: Bool = false

  /// When walking forwards we need to know whether we have entered an emoji
  /// sequence, so that the break happens only after all of the emoji's
  /// extenders and zero width joiners. This is `true` while we are still
  /// inside such a sequence.
  var isInEmojiSequence: Bool = false

  /// Like emoji sequences, an Indic grapheme sequence must not be broken. It
  /// is (potentially) composed of many scalars, so it cannot be decided by
  /// comparing just two grapheme properties.
  var isInIndicSequence: Bool = false

  /// When walking forwards we must not break inside an emoji flag sequence,
  /// which is composed of 2 regional indicators. The first
  /// (.regionalIndicator, .regionalIndicator) decision must therefore be "no
  /// break". If the next scalar is yet another regional indicator, the same
  /// rule is reached again, but this time we do need to break, because that
  /// is the boundary between two emoji flag sequences.
  var shouldBreakRI: Bool = false
}
|
|
|
|
extension _StringGuts {
  /// Walks forward from `index` and returns the encoded offset of the next
  /// grapheme boundary.
  ///
  /// Note that the result is an offset, not a stride; callers subtract the
  /// starting offset themselves.
  ///
  /// - Parameters:
  ///   - index: Encoded offset to start from. Must not be the end offset.
  ///   - nextScalar: Given an offset, returns the scalar starting there and
  ///     the offset just past that scalar.
  /// - Returns: The offset at which `shouldBreak` first reports a break, or
  ///   the end offset if the walk reaches the end first.
  internal func nextBoundary(
    startingAt index: Int,
    nextScalar: (Int) -> (Unicode.Scalar, end: Int)
  ) -> Int {
    _internalInvariant(index != endIndex._encodedOffset)

    var breakingState = _GraphemeBreakingState()
    var cursor = index

    while true {
      let current = nextScalar(cursor)
      cursor = current.end

      // Reaching the end of the string is always a boundary.
      if cursor == endIndex._encodedOffset {
        return cursor
      }

      let upcoming = nextScalar(cursor).0

      if shouldBreak(current.0, between: upcoming, &breakingState, cursor) {
        return cursor
      }
    }
  }

  /// Walks backward from `index` and returns the encoded offset of the
  /// previous grapheme boundary.
  ///
  /// Note that the result is an offset, not a stride; callers subtract it
  /// from the ending offset themselves.
  ///
  /// - Parameters:
  ///   - index: Encoded offset to start from. Must not be the start offset.
  ///   - previousScalar: Given an offset, returns the scalar ending there and
  ///     the offset at which that scalar starts.
  /// - Returns: The offset at which `shouldBreak` first reports a break, or
  ///   the start offset if the walk reaches the start first.
  internal func previousBoundary(
    endingAt index: Int,
    previousScalar: (Int) -> (Unicode.Scalar, start: Int)
  ) -> Int {
    _internalInvariant(index != startIndex._encodedOffset)

    var breakingState = _GraphemeBreakingState()
    var cursor = index

    while true {
      let trailing = previousScalar(cursor)
      cursor = trailing.start

      // Reaching the start of the string is always a boundary.
      if cursor == startIndex._encodedOffset {
        return cursor
      }

      let leading = previousScalar(cursor).0

      if shouldBreak(
        leading,
        between: trailing.0,
        &breakingState,
        cursor,
        isBackwards: true
      ) {
        return cursor
      }
    }
  }
}
|
|
|
|
extension _StringGuts {
  /// The "algorithm" that determines whether or not we should break between
  /// two adjacent scalars.
  ///
  /// This is based off of the Unicode Annex #29 [Grapheme Cluster Boundary
  /// Rules](https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules).
  ///
  /// - Parameters:
  ///   - scalar1: The scalar on the left of the candidate boundary.
  ///   - scalar2: The scalar on the right of the candidate boundary.
  ///   - state: Flags carried between successive calls of a walk. They are
  ///     only consulted for decisions when walking forwards.
  ///   - index: Encoded offset of the start of `scalar2`. Only used by the
  ///     backward lookback helpers.
  ///   - isBackwards: `true` when the caller is walking towards the start of
  ///     the string. Context-dependent rules (GB9c, GB11, GB12/13) are then
  ///     resolved by looking further back in the string instead of relying on
  ///     `state`.
  /// - Returns: `true` if there is a grapheme boundary between `scalar1` and
  ///   `scalar2`.
  internal func shouldBreak(
    _ scalar1: Unicode.Scalar,
    between scalar2: Unicode.Scalar,
    _ state: inout _GraphemeBreakingState,
    _ index: Int,
    isBackwards: Bool = false
  ) -> Bool {
    // GB3
    if scalar1.value == 0xD, scalar2.value == 0xA {
      return false
    }

    if _hasGraphemeBreakBetween(scalar1, scalar2) {
      return true
    }

    let x = Unicode._GraphemeBreakProperty(from: scalar1)
    let y = Unicode._GraphemeBreakProperty(from: scalar2)

    // This variable and the defer statement help toggle the isInEmojiSequence
    // state variable to false after every decision of 'shouldBreak'. If we
    // happen to see a rhs .extend or .zwj, then it's a signal that we should
    // continue treating the current grapheme cluster as an emoji sequence.
    var enterEmojiSequence = false

    // Very similar to emoji sequences, but for Indic grapheme sequences.
    var enterIndicSequence = false

    defer {
      state.isInEmojiSequence = enterEmojiSequence
      state.isInIndicSequence = enterIndicSequence
    }

    switch (x, y) {

    // Fast path: If we know our scalars have no properties the decision is
    // trivial and we don't need to crawl to the default statement.
    case (.any, .any):
      return true

    // GB4
    case (.control, _):
      return true

    // GB5
    case (_, .control):
      return true

    // GB6
    case (.l, .l),
         (.l, .v),
         (.l, .lv),
         (.l, .lvt):
      return false

    // GB7
    case (.lv, .v),
         (.v, .v),
         (.lv, .t),
         (.v, .t):
      return false

    // GB8
    case (.lvt, .t),
         (.t, .t):
      return false

    // GB9 (partial GB11)
    case (_, .extend),
         (_, .zwj):

      // If we're currently in an emoji sequence, then extends and ZWJ help
      // continue the grapheme cluster by combining more scalars later. If
      // we're not currently in an emoji sequence, but our lhs scalar is a
      // pictograph, then that's a signal that it's the start of an emoji
      // sequence.
      if state.isInEmojiSequence || x == .extendedPictographic {
        enterEmojiSequence = true
      }

      // If we're currently in an indic sequence (or if our lhs is a linking
      // consonant), then this check and everything underneath ensures that
      // we continue being in one and may check if this extend is a Virama.
      if state.isInIndicSequence || scalar1._isLinkingConsonant {
        if y == .extend {
          let extendNormData = Unicode._NormData(scalar2, fastUpperbound: 0x300)

          // If our extend's CCC is 0, then this rule does not apply.
          guard extendNormData.ccc != 0 else {
            return false
          }
        }

        enterIndicSequence = true

        if scalar2._isVirama {
          state.hasSeenVirama = true
        }
      }

      return false

    // GB9a
    case (_, .spacingMark):
      return false

    // GB9b
    case (.prepend, _):
      return false

    // GB11
    case (.zwj, .extendedPictographic):
      if isBackwards {
        return !checkIfInEmojiSequence(index)
      }

      return !state.isInEmojiSequence

    // GB12 & GB13
    case (.regionalIndicator, .regionalIndicator):
      if isBackwards {
        return countRIs(index)
      }

      defer {
        state.shouldBreakRI.toggle()
      }

      return state.shouldBreakRI

    // GB999
    default:
      guard isBackwards else {
        // GB9c, walking forwards: `state` describes the scalars that precede
        // `scalar2`, so it can answer the question directly.
        if state.isInIndicSequence,
           state.hasSeenVirama,
           scalar2._isLinkingConsonant {
          state.hasSeenVirama = false
          return false
        }

        return true
      }

      // GB9c, walking backwards: `state` was populated from pairs that come
      // *after* `scalar2` in the string, so it says nothing about whether a
      // linking consonant and virama precede `scalar2`. Consulting it here
      // would make backward results disagree with forward ones (for example
      // on [virama, consonant, virama]), so always look back instead.
      guard scalar2._isLinkingConsonant else {
        return true
      }

      switch x {
      case .extend:
        let extendNormData = Unicode._NormData(scalar1, fastUpperbound: 0x300)

        // An extend whose CCC is 0 cannot be part of the conjunct.
        guard extendNormData.ccc != 0 else {
          return true
        }

        return !checkIfInIndicSequence(index)

      case .zwj:
        return !checkIfInIndicSequence(index)

      default:
        return true
      }
    }
  }

  /// Backward lookback for the (.zwj, .extendedPictographic) question.
  ///
  /// When walking backwards, it's impossible to know whether we were in an
  /// emoji sequence without walking further backwards. `index` is the offset
  /// of the .extendedPictographic on the right of the candidate boundary; the
  /// scalar before it is the .zwj. Starting from the scalar before that .zwj,
  /// this skips over any .extend scalars and reports what it finds:
  ///
  /// - `[.control, .zwj, .extendedPictographic]`: the scalar before the .zwj
  ///   is neither .extend nor .extendedPictographic, so we were never in an
  ///   emoji sequence. Returns `false` (the caller breaks).
  /// - `[.extendedPictographic, .zwj, .extendedPictographic]`: the scalar
  ///   before the .zwj is an .extendedPictographic, so we are in an emoji
  ///   sequence. Returns `true` (the caller does NOT break).
  /// - `[.extendedPictographic, .extend, .extend, .zwj, .extendedPictographic]`:
  ///   each .extend means there is a potential emoji sequence, so keep
  ///   walking; the starting .extendedPictographic is eventually found.
  ///   Returns `true`.
  ///
  /// - Returns: `true` if an .extendedPictographic, optionally followed by
  ///   .extend scalars, immediately precedes the .zwj; otherwise `false`.
  internal func checkIfInEmojiSequence(_ index: Int) -> Bool {
    var emojiIdx = String.Index(_encodedOffset: index)

    guard emojiIdx != startIndex else {
      return false
    }

    let scalars = String.UnicodeScalarView(self)

    // Step onto the scalar on the left of the candidate boundary.
    scalars.formIndex(before: &emojiIdx)

    while emojiIdx != startIndex {
      scalars.formIndex(before: &emojiIdx)
      let scalar = scalars[emojiIdx]

      let gbp = Unicode._GraphemeBreakProperty(from: scalar)

      switch gbp {
      case .extend:
        continue
      case .extendedPictographic:
        return true
      default:
        return false
      }
    }

    return false
  }

  /// Backward lookback for the ((.extend|.zwj), linking consonant) question.
  ///
  /// When walking backwards, it's impossible to know whether we break when we
  /// see our first ((.extend|.zwj), linking consonant) pair without walking
  /// further backwards. `index` is the offset of the linking consonant on the
  /// right of the candidate boundary. This walks backwards over the .extend
  /// and .zwj scalars before it, remembering whether any of them is a virama,
  /// until it can decide:
  ///
  /// - `[virama, .extend, linkingConsonant]`: the scalars run out without a
  ///   starting linking consonant, so this is not an indic sequence. Returns
  ///   `false` (the caller breaks).
  /// - `[linkingConsonant, virama, .extend, linkingConsonant]`: a virama is
  ///   seen and then a linking consonant is reached, so this is a legitimate
  ///   indic sequence. Returns `true` (the caller does NOT break).
  ///
  /// - Returns: `true` only if a linking consonant is reached after at least
  ///   one virama has been seen and every intervening scalar was a .zwj or an
  ///   .extend with nonzero CCC.
  internal func checkIfInIndicSequence(_ index: Int) -> Bool {
    var indicIdx = String.Index(_encodedOffset: index)

    guard indicIdx != startIndex else {
      return false
    }

    let scalars = String.UnicodeScalarView(self)

    // Step onto the scalar on the left of the candidate boundary.
    scalars.formIndex(before: &indicIdx)

    var hasSeenVirama = false

    // Check if the first extend was the Virama.
    let scalar = scalars[indicIdx]

    if scalar._isVirama {
      hasSeenVirama = true
    }

    while indicIdx != startIndex {
      scalars.formIndex(before: &indicIdx)
      let scalar = scalars[indicIdx]

      let gbp = Unicode._GraphemeBreakProperty(from: scalar)

      switch (gbp, scalar._isLinkingConsonant) {
      case (.extend, false):
        let extendNormData = Unicode._NormData(scalar, fastUpperbound: 0x300)

        // An extend whose CCC is 0 ends the candidate sequence.
        guard extendNormData.ccc != 0 else {
          return false
        }

        if scalar._isVirama {
          hasSeenVirama = true
        }

      case (.zwj, false):
        continue

      // LinkingConsonant
      case (_, true):
        guard hasSeenVirama else {
          return false
        }

        return true

      default:
        return false
      }
    }

    return false
  }

  /// Backward lookback for the (.regionalIndicator, .regionalIndicator)
  /// question.
  ///
  /// When walking backwards, it's impossible to know whether we break when we
  /// see our first (.regionalIndicator, .regionalIndicator) pair without
  /// walking further backwards. `index` is the offset of the right-hand
  /// regional indicator. This counts the consecutive regional indicators that
  /// precede the left-hand one:
  ///
  /// - `[.control, .regionalIndicator, .regionalIndicator]`: the scalar before
  ///   the pair is not a .regionalIndicator, so the count is 0. 0 is even,
  ///   thus we do not break. Returns `false`.
  /// - `[.control, .regionalIndicator, .regionalIndicator, .regionalIndicator]`
  ///   (asking about the last two): one .regionalIndicator precedes the pair,
  ///   so the count is 1. 1 is odd, so break the last two. Returns `true`.
  ///
  /// - Returns: `true` (break) if an odd number of regional indicators
  ///   precedes the pair; `false` (do not break) if the number is even.
  internal func countRIs(
    _ index: Int
  ) -> Bool {
    var riIdx = String.Index(_encodedOffset: index)

    guard riIdx != startIndex else {
      return false
    }

    var riCount = 0

    let scalars = String.UnicodeScalarView(self)

    // Step onto the left-hand regional indicator of the pair.
    scalars.formIndex(before: &riIdx)

    while riIdx != startIndex {
      scalars.formIndex(before: &riIdx)
      let scalar = scalars[riIdx]

      let gbp = Unicode._GraphemeBreakProperty(from: scalar)

      guard gbp == .regionalIndicator else {
        break
      }

      riCount += 1
    }

    return riCount & 1 != 0
  }
}
|