Files
swift-mirror/stdlib/public/core/StringGraphemeBreaking.swift
2022-01-05 16:18:54 -08:00

688 lines
20 KiB
Swift

//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
import SwiftShims
/// Carriage return and line feed. A CR immediately followed by an LF is the
/// one pair of low code points that must never be split.
private var _CR: UInt8 { 0x0d }
private var _LF: UInt8 { 0x0a }

/// Quick range-based test for an unconditional grapheme break between two
/// adjacent scalars.
///
/// Returns `true` only when the pair is not CR-LF and *both* scalars fall in
/// one of the code point ranges listed below. A `false` result is returned
/// for CR-LF and for every pair where at least one scalar lies outside those
/// ranges.
internal func _hasGraphemeBreakBetween(
  _ lhs: Unicode.Scalar, _ rhs: Unicode.Scalar
) -> Bool {
  // CR followed by LF stays together, even though both are below U+0300.
  guard !(lhs == Unicode.Scalar(_CR) && rhs == Unicode.Scalar(_LF)) else {
    return false
  }

  // A scalar qualifies when its value lies in one of these ranges; a break is
  // reported only when the scalars on both sides qualify.
  func liesInBreakingRange(_ scalar: Unicode.Scalar) -> Bool {
    // TODO: This doesn't generate optimal code, tune/re-write at a lower
    // level.
    //
    // NOTE: The order of the ranges affects codegen, and thus performance.
    // All things being equal, keep the order below.
    switch scalar.value {
    case
      // Unified CJK Han ideographs, common and some supplemental, amongst
      // others: U+3400 ~ U+A4CF
      0x3400...0xa4cf,
      // Everything below U+0300. Worth checking early because Latin text
      // (newlines, spaces, proper nouns, jargon, punctuation) is commonly
      // embedded in non-Latin script. CR-LF was already ruled out above.
      0x0000...0x02ff,
      // Non-combining kana: U+3041 ~ U+3096 and U+30A1 ~ U+30FC
      0x3041...0x3096,
      0x30a1...0x30fc,
      // Non-combining modern (and some archaic) Cyrillic, i.e. the first
      // half of the Cyrillic block: U+0400 ~ U+0482
      0x0400...0x0482,
      // Modern Arabic, excluding extenders and prependers: U+061D ~ U+064A
      0x061d...0x064a,
      // Precomposed Hangul syllables: U+AC00 ~ U+D7AF
      0xac00...0xd7af,
      // Common general use punctuation, excluding extenders:
      // U+2010 ~ U+2029
      0x2010...0x2029,
      // CJK punctuation characters, excluding extenders: U+3000 ~ U+3029
      0x3000...0x3029,
      // Full-width forms: U+FF01 ~ U+FF9D
      0xFF01...0xFF9D:
      return true
    default:
      return false
    }
  }

  let lhsQualifies = liesInBreakingRange(lhs)
  return lhsQualifies && liesInBreakingRange(rhs)
}
extension _StringGuts {
  /// Reports whether `i` lies on a grapheme cluster boundary.
  ///
  /// An index with a non-zero transcoded offset, or one that is not on a
  /// Unicode scalar boundary, is never a boundary. Encoded offset zero and the
  /// offset equal to `count` always are. Every other index is tested by moving
  /// one character forward and then one character back in a `String` built
  /// from these guts and checking that the round trip lands on `i` again.
  @usableFromInline @inline(never)
  @_effects(releasenone)
  internal func isOnGraphemeClusterBoundary(_ i: String.Index) -> Bool {
    if i.transcodedOffset != 0 { return false }

    let encodedOffset = i._encodedOffset
    guard encodedOffset != 0, encodedOffset != self.count else { return true }

    if !isOnUnicodeScalarBoundary(i) { return false }

    let string = String(self)
    let successor = string.index(after: i)
    return i == string.index(before: successor)
  }

  /// Distance, in encoded units, from offset `i` to the boundary reported by
  /// `nextBoundary(startingAt:)`.
  ///
  /// Foreign guts are forwarded to the foreign implementation; otherwise
  /// scalars are decoded directly from the fast UTF-8 buffer.
  @usableFromInline @inline(never)
  @_effects(releasenone)
  internal func _opaqueCharacterStride(startingAt i: Int) -> Int {
    guard !_slowPath(isForeign) else {
      return _foreignOpaqueCharacterStride(startingAt: i)
    }

    let boundary = withFastUTF8 { bytes in
      nextBoundary(startingAt: i) { offset in
        let (decoded, length) = _decodeScalar(bytes, startingAt: offset)
        return (decoded, offset &+ length)
      }
    }
    return boundary &- i
  }

  /// Distance, in encoded units, from the boundary reported by
  /// `previousBoundary(endingAt:)` up to offset `i`.
  ///
  /// Foreign guts are forwarded to the foreign implementation; otherwise
  /// scalars are decoded directly from the fast UTF-8 buffer.
  @usableFromInline @inline(never)
  @_effects(releasenone)
  internal func _opaqueCharacterStride(endingAt i: Int) -> Int {
    guard !_slowPath(isForeign) else {
      return _foreignOpaqueCharacterStride(endingAt: i)
    }

    let boundary = withFastUTF8 { bytes in
      previousBoundary(endingAt: i) { offset in
        let (decoded, length) = _decodeScalar(bytes, endingAt: offset)
        return (decoded, offset &- length)
      }
    }
    return i &- boundary
  }

  /// Forward stride for foreign guts: scalars are read through a
  /// `String.UnicodeScalarView` over these guts instead of a UTF-8 buffer.
  /// Only compiled with a body when the Objective-C runtime is available;
  /// otherwise reaching this function is a fatal error.
  @inline(never)
  @_effects(releasenone)
  private func _foreignOpaqueCharacterStride(startingAt i: Int) -> Int {
#if _runtime(_ObjC)
    _internalInvariant(isForeign)

    let boundary = nextBoundary(startingAt: i) { offset in
      let view = String.UnicodeScalarView(self)
      let position = String.Index(_encodedOffset: offset)
      let current = view[position]
      let following = view.index(after: position)
      return (current, following._encodedOffset)
    }
    return boundary &- i
#else
    fatalError("No foreign strings on Linux in this version of Swift")
#endif
  }

  /// Backward stride for foreign guts: scalars are read through a
  /// `String.UnicodeScalarView` over these guts instead of a UTF-8 buffer.
  /// Only compiled with a body when the Objective-C runtime is available;
  /// otherwise reaching this function is a fatal error.
  @inline(never)
  @_effects(releasenone)
  private func _foreignOpaqueCharacterStride(endingAt i: Int) -> Int {
#if _runtime(_ObjC)
    _internalInvariant(isForeign)

    let boundary = previousBoundary(endingAt: i) { offset in
      let view = String.UnicodeScalarView(self)
      let position = String.Index(_encodedOffset: offset)
      let preceding = view.index(before: position)
      let current = view[preceding]
      return (current, preceding._encodedOffset)
    }
    return i &- boundary
#else
    fatalError("No foreign strings on Linux in this version of Swift")
#endif
  }
}
extension Unicode.Scalar {
  /// Whether the `_swift_stdlib_isLinkingConsonant` shim reports this scalar's
  /// value as a linking consonant.
  fileprivate var _isLinkingConsonant: Bool {
    return _swift_stdlib_isLinkingConsonant(value)
  }

  /// Whether this scalar is one of the virama code points recognized by the
  /// Indic sequence handling in this file.
  fileprivate var _isVirama: Bool {
    switch value {
    case 0x94D, // Devanagari
         0x9CD, // Bengali
         0xACD, // Gujarati
         0xB4D, // Oriya
         0xC4D, // Telugu
         0xD4D: // Malayalam
      return true
    default:
      return false
    }
  }
}
/// Mutable bookkeeping threaded through successive `shouldBreak` decisions
/// during a single boundary walk. Every flag starts out `false`.
internal struct _GraphemeBreakingState {
  /// Set once a virama has been encountered while inside an Indic sequence.
  /// An Indic sequence needs at LEAST one virama between two linking
  /// consonants, so this is consulted when deciding whether to break before
  /// the second consonant.
  var hasSeenVirama: Bool = false

  /// Whether the forward walk is currently inside an emoji sequence. Needed
  /// so that the break is only taken after all of the emoji's extenders and
  /// zero width joiners have been consumed.
  var isInEmojiSequence: Bool = false

  /// Whether the walk is currently inside an Indic grapheme sequence. Such a
  /// sequence may span many scalars, so it cannot be decided by comparing the
  /// grapheme properties of just two of them.
  var isInIndicSequence: Bool = false

  /// Tracks emoji flag pairing during the forward walk. A flag is made of two
  /// regional indicators, so the first (.regionalIndicator,
  /// .regionalIndicator) decision must not break. If yet another regional
  /// indicator follows, the same rule is reached again, but this time a break
  /// is required because there is a boundary between two flag sequences.
  var shouldBreakRI: Bool = false
}
extension _StringGuts {
  /// Walks forwards from `index` and returns the encoded offset of the first
  /// grapheme cluster boundary found after it.
  ///
  /// Note that the result is an offset, not a stride; subtract `index` from
  /// it to obtain the stride.
  ///
  /// - Parameters:
  ///   - index: Encoded offset to start from. Must not be the end offset.
  ///   - nextScalar: Given an encoded offset, returns the scalar that starts
  ///     there together with the offset just past it. It is expected to
  ///     return the same result every time it is asked about the same offset.
  /// - Returns: The offset at which `shouldBreak` first reports a break, or
  ///   the end offset if the walk runs off the end first.
  internal func nextBoundary(
    startingAt index: Int,
    nextScalar: (Int) -> (Unicode.Scalar, end: Int)
  ) -> Int {
    _internalInvariant(index != endIndex._encodedOffset)

    // The end offset does not change during the walk; read it once.
    let endOffset = endIndex._encodedOffset
    var state = _GraphemeBreakingState()

    // Decode the first scalar up front. From then on every scalar is decoded
    // exactly once, as the right-hand side of a pair, and is carried over as
    // the next left-hand side instead of being decoded a second time.
    var (lhs, boundary) = nextScalar(index)

    while boundary != endOffset {
      let (rhs, rhsEnd) = nextScalar(boundary)

      if shouldBreak(lhs, between: rhs, &state, boundary) {
        break
      }

      lhs = rhs
      boundary = rhsEnd
    }

    return boundary
  }

  /// Walks backwards from `index` and returns the encoded offset of the first
  /// grapheme cluster boundary found before it.
  ///
  /// Note that the result is an offset, not a stride; subtract it from
  /// `index` to obtain the stride.
  ///
  /// - Parameters:
  ///   - index: Encoded offset to start from. Must not be the start offset.
  ///   - previousScalar: Given an encoded offset, returns the scalar that
  ///     ends there together with the offset at which that scalar starts. It
  ///     is expected to return the same result every time it is asked about
  ///     the same offset.
  /// - Returns: The offset at which `shouldBreak` first reports a break, or
  ///   the start offset if the walk reaches the start first.
  internal func previousBoundary(
    endingAt index: Int,
    previousScalar: (Int) -> (Unicode.Scalar, start: Int)
  ) -> Int {
    _internalInvariant(index != startIndex._encodedOffset)

    // The start offset does not change during the walk; read it once.
    let startOffset = startIndex._encodedOffset
    var state = _GraphemeBreakingState()

    // Decode the last scalar up front. From then on every scalar is decoded
    // exactly once, as the left-hand side of a pair, and is carried over as
    // the next right-hand side instead of being decoded a second time.
    var (rhs, boundary) = previousScalar(index)

    while boundary != startOffset {
      let (lhs, lhsStart) = previousScalar(boundary)

      if shouldBreak(
        lhs,
        between: rhs,
        &state,
        boundary,
        isBackwards: true
      ) {
        break
      }

      rhs = lhs
      boundary = lhsStart
    }

    return boundary
  }
}
extension _StringGuts {
// The "algorithm" that determines whether or not we should break between
// certain grapheme break properties.
//
// This is based off of the Unicode Annex #29 for [Grapheme Cluster Boundary
// Rules](https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules).
//
// - Parameters:
//   - scalar1: The left-hand scalar of the pair being examined.
//   - scalar2: The right-hand scalar of the pair being examined.
//   - state: Bookkeeping carried from one decision to the next during a walk.
//     `isInEmojiSequence` and `isInIndicSequence` are overwritten on every
//     exit that happens after the property lookup below; `hasSeenVirama` and
//     `shouldBreakRI` are only touched by the rules that need them.
//   - index: Encoded offset handed to the look-behind helpers
//     (`checkIfInEmojiSequence`, `checkIfInIndicSequence`, `countRIs`). It is
//     only read when `isBackwards` is true.
//   - isBackwards: Whether the caller is walking towards the start of the
//     string. In that case the rules that need earlier context re-scan the
//     string instead of consulting `state`.
// - Returns: `true` if there is a grapheme break between `scalar1` and
//   `scalar2`, `false` if they belong to the same cluster.
internal func shouldBreak(
_ scalar1: Unicode.Scalar,
between scalar2: Unicode.Scalar,
_ state: inout _GraphemeBreakingState,
_ index: Int,
isBackwards: Bool = false
) -> Bool {
// GB3
if scalar1.value == 0xD, scalar2.value == 0xA {
return false
}
// Fast path: when both scalars fall in the ranges checked by
// `_hasGraphemeBreakBetween` there is a break and no property lookup is
// needed. Returning here happens before the defer below is registered, so
// `state` is left untouched.
if _hasGraphemeBreakBetween(scalar1, scalar2) {
return true
}
let x = Unicode._GraphemeBreakProperty(from: scalar1)
let y = Unicode._GraphemeBreakProperty(from: scalar2)
// This variable and the defer statement help toggle the isInEmojiSequence
// state variable to false after every decision of 'shouldBreak'. If we
// happen to see a rhs .extend or .zwj, then it's a signal that we should
// continue treating the current grapheme cluster as an emoji sequence.
var enterEmojiSequence = false
// Very similar to emoji sequences, but for Indic grapheme sequences.
var enterIndicSequence = false
// Runs on every return below, so both flags fall back to false unless the
// GB9 case explicitly set the corresponding local to true.
defer {
state.isInEmojiSequence = enterEmojiSequence
state.isInIndicSequence = enterIndicSequence
}
switch (x, y) {
// Fast path: If we know our scalars have no properties the decision is
// trivial and we don't need to crawl to the default statement.
case (.any, .any):
return true
// GB4
case (.control, _):
return true
// GB5
case (_, .control):
return true
// GB6
case (.l, .l),
(.l, .v),
(.l, .lv),
(.l, .lvt):
return false
// GB7
case (.lv, .v),
(.v, .v),
(.lv, .t),
(.v, .t):
return false
// GB8
case (.lvt, .t),
(.t, .t):
return false
// GB9 (partial GB11)
case (_, .extend),
(_, .zwj):
// If we're currently in an emoji sequence, then extends and ZWJ help
// continue the grapheme cluster by combining more scalars later. If we're
// not currently in an emoji sequence, but our lhs scalar is a pictograph,
// then that's a signal that it's the start of an emoji sequence.
if state.isInEmojiSequence || x == .extendedPictographic {
enterEmojiSequence = true
}
// If we're currently in an indic sequence (or if our lhs is a linking
// consonant), then this check and everything underneath ensures that
// we continue being in one and may check if this extend is a Virama.
if state.isInIndicSequence || scalar1._isLinkingConsonant {
if y == .extend {
let extendNormData = Unicode._NormData(scalar2, fastUpperbound: 0x300)
// If our extend's CCC is 0, then this rule does not apply.
// We still do not break (GB9), but enterIndicSequence stays false, so
// the indic sequence ends here.
guard extendNormData.ccc != 0 else {
return false
}
}
enterIndicSequence = true
if scalar2._isVirama {
state.hasSeenVirama = true
}
}
return false
// GB9a
case (_, .spacingMark):
return false
// GB9b
case (.prepend, _):
return false
// GB11
case (.zwj, .extendedPictographic):
// Walking backwards there is no accumulated state to rely on, so the
// scalars before the ZWJ are re-scanned instead.
if isBackwards {
return !checkIfInEmojiSequence(index)
}
return !state.isInEmojiSequence
// GB12 & GB13
case (.regionalIndicator, .regionalIndicator):
// Walking backwards the decision depends on how many regional indicators
// precede this pair, which `countRIs` determines by re-scanning.
if isBackwards {
return countRIs(index)
}
// Walking forwards: shouldBreakRI starts out false, so the first RI pair
// is kept together. The flag is flipped after each decision, so consecutive
// RI pairs alternate between "no break" and "break".
defer {
state.shouldBreakRI.toggle()
}
return state.shouldBreakRI
// GB999
default:
// GB9c
if state.isInIndicSequence, state.hasSeenVirama, scalar2._isLinkingConsonant {
state.hasSeenVirama = false
return false
}
// Handle GB9c when walking backwards.
if isBackwards {
switch (x, scalar2._isLinkingConsonant) {
case (.extend, true):
let extendNormData = Unicode._NormData(scalar1, fastUpperbound: 0x300)
// An extend whose CCC is 0 cannot be part of an indic sequence, so break.
guard extendNormData.ccc != 0 else {
return true
}
return !checkIfInIndicSequence(index)
case (.zwj, true):
return !checkIfInIndicSequence(index)
default:
return true
}
}
return true
}
}
// Backwards look-behind for the (.zwj, .extendedPictographic) decision.
//
// When walking backwards there is no way to know whether the ZWJ belongs to
// an emoji sequence without looking at what precedes it. `index` is the
// encoded offset of the right-hand .extendedPictographic scalar. The scalar
// right before it (the ZWJ) is stepped over, and then the string is scanned
// towards the start:
//
//   - [.control, .zwj, .extendedPictographic]
//     The scalar before the ZWJ is neither .extend nor .extendedPictographic,
//     so there never was an emoji sequence: returns false (the caller
//     breaks).
//
//   - [.extendedPictographic, .zwj, .extendedPictographic]
//     The scalar before the ZWJ is an .extendedPictographic, so this is an
//     emoji sequence: returns true (the caller does NOT break).
//
//   - [.extendedPictographic, .extend, .extend, .zwj, .extendedPictographic]
//     Each .extend is a potential continuation of an emoji sequence, so the
//     scan keeps going until it reaches the starting .extendedPictographic:
//     returns true.
//
// Reaching the start of the string without finding an .extendedPictographic
// returns false.
internal func checkIfInEmojiSequence(_ index: Int) -> Bool {
  var cursor = String.Index(_encodedOffset: index)

  if cursor == startIndex {
    return false
  }

  let scalars = String.UnicodeScalarView(self)

  // Step over the scalar directly before `index`; it is not inspected.
  scalars.formIndex(before: &cursor)

  while cursor != startIndex {
    scalars.formIndex(before: &cursor)
    let property = Unicode._GraphemeBreakProperty(from: scalars[cursor])

    // Extends keep the question open; anything else settles it.
    if property == .extend {
      continue
    }

    return property == .extendedPictographic
  }

  return false
}
// Backwards look-behind for the ((.extend|.zwj), .linkingConsonant) decision.
//
// When walking backwards there is no way to know whether the pair is part of
// an indic sequence without looking further back. `index` is the encoded
// offset of the right-hand linking consonant. The scalar right before it is
// only checked for being a virama, and then the string is scanned towards the
// start:
//
//   - [.virama, .extend, .linkingConsonant]
//     The start of the string is reached without finding a leading linking
//     consonant, so this is not an indic sequence: returns false (the caller
//     breaks).
//
//   - [.linkingConsonant, .virama, .extend, .linkingConsonant]
//     A virama is seen on the way back and then a linking consonant is found,
//     so this is a legitimate indic sequence: returns true (the caller does
//     NOT break).
//
// The scan gives up with false on an .extend whose CCC is 0, on any scalar
// that is neither .extend, .zwj, nor a linking consonant, and on a linking
// consonant reached before any virama was seen.
internal func checkIfInIndicSequence(_ index: Int) -> Bool {
  var cursor = String.Index(_encodedOffset: index)

  if cursor == startIndex {
    return false
  }

  let scalars = String.UnicodeScalarView(self)
  scalars.formIndex(before: &cursor)

  // The scalar directly before `index` may itself be the virama.
  var sawVirama = scalars[cursor]._isVirama

  while cursor != startIndex {
    scalars.formIndex(before: &cursor)
    let current = scalars[cursor]
    let property = Unicode._GraphemeBreakProperty(from: current)

    // A linking consonant ends the scan: the sequence is legitimate only if a
    // virama was seen between it and the consonant we started from.
    if current._isLinkingConsonant {
      return sawVirama
    }

    switch property {
    case .extend:
      let normData = Unicode._NormData(current, fastUpperbound: 0x300)

      if normData.ccc == 0 {
        return false
      }

      if current._isVirama {
        sawVirama = true
      }

    case .zwj:
      continue

    default:
      return false
    }
  }

  return false
}
// Backwards look-behind for the (.regionalIndicator, .regionalIndicator)
// decision.
//
// When walking backwards there is no way to know whether two regional
// indicators form a flag without knowing how many regional indicators come
// before them in a row. `index` is the encoded offset of the right-hand
// regional indicator. The scalar right before it (the left-hand regional
// indicator) is stepped over, and the regional indicators preceding it are
// then counted towards the start:
//
//   - [.control, .regionalIndicator, .regionalIndicator]
//     Nothing but a .control precedes the pair, so the count is 0, which is
//     even: returns false (the caller does not break).
//
//   - [.control, .regionalIndicator, .regionalIndicator, .regionalIndicator]
//     One regional indicator precedes the last pair, so the count is 1, which
//     is odd: returns true (the caller breaks between the last two).
//
// Only the parity of the count matters, so that is all that is tracked here.
internal func countRIs(
  _ index: Int
) -> Bool {
  var cursor = String.Index(_encodedOffset: index)

  if cursor == startIndex {
    return false
  }

  let scalars = String.UnicodeScalarView(self)

  // Step over the scalar directly before `index`; it is not counted.
  scalars.formIndex(before: &cursor)

  var precedingCountIsOdd = false

  while cursor != startIndex {
    scalars.formIndex(before: &cursor)
    let property = Unicode._GraphemeBreakProperty(from: scalars[cursor])

    if property != .regionalIndicator {
      break
    }

    precedingCountIsOdd.toggle()
  }

  return precedingCountIsOdd
}
}