swift-mirror/stdlib/public/core/StringGraphemeBreaking.swift

//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

import SwiftShims

/// Carriage return (0x0D) as a `UInt8`; the first half of the CR-LF pair that
/// grapheme breaking special-cases.
private var _CR: UInt8 { 0x0d }

/// Line feed (0x0A) as a `UInt8`; the second half of the CR-LF pair.
private var _LF: UInt8 { 0x0a }

/// Range-based shortcut that reports a grapheme break between two scalars.
///
/// The answer is `true` only when the pair is not CR-LF and *both* scalars
/// fall inside one of the hard-coded code point ranges listed below. A `false`
/// result therefore means either "this is CR-LF" or "the shortcut does not
/// apply to this pair".
///
/// - Parameters:
///   - lhs: The scalar on the left of the candidate boundary.
///   - rhs: The scalar on the right of the candidate boundary.
/// - Returns: `true` if both scalars are in the listed ranges and the pair is
///   not CR followed by LF; otherwise `false`.
internal func _hasGraphemeBreakBetween(
  _ lhs: Unicode.Scalar, _ rhs: Unicode.Scalar
) -> Bool {

  // CR immediately followed by LF is never split.
  if lhs == Unicode.Scalar(_CR), rhs == Unicode.Scalar(_LF) {
    return false
  }

  // A scalar "qualifies" when, paired with another qualifying scalar, there
  // is a grapheme break between the two.
  func qualifies(_ scalar: Unicode.Scalar) -> Bool {
    // TODO: This doesn't generate optimal code, tune/re-write at a lower
    // level.
    //
    // NOTE: The order of the case ranges affects codegen, and thus
    // performance. Do not reorder them without a reason.
    switch scalar.value {
    // U+3400 ~ U+A4CF: unified CJK Han ideographs (common and some
    // supplemental), amongst others.
    case 0x3400...0xa4cf: return true

    // U+0000 ~ U+02FF: repeated sub-300 check. Helps the common case of Latin
    // characters embedded in non-Latin script (newlines, spaces, proper nouns
    // and/or jargon, punctuation). CR-LF was already ruled out above.
    case 0x0000...0x02ff: return true

    // U+3041 ~ U+3096 and U+30A1 ~ U+30FC: non-combining kana.
    case 0x3041...0x3096: return true
    case 0x30a1...0x30fc: return true

    // U+0400 ~ U+0482: non-combining modern (and some archaic) Cyrillic,
    // i.e. the first half of the Cyrillic block.
    case 0x0400...0x0482: return true

    // U+061D ~ U+064A: modern Arabic, excluding extenders and prependers.
    case 0x061d...0x064a: return true

    // U+AC00 ~ U+D7AF: precomposed Hangul syllables.
    case 0xac00...0xd7af: return true

    // U+2010 ~ U+2029: common general use punctuation, excluding extenders.
    case 0x2010...0x2029: return true

    // U+3000 ~ U+3029: CJK punctuation characters, excluding extenders.
    case 0x3000...0x3029: return true

    // U+FF01 ~ U+FF9D: full-width forms.
    case 0xFF01...0xFF9D: return true

    default: return false
    }
  }

  // Both sides must qualify; the right side is only inspected if the left
  // side did.
  guard qualifies(lhs) else { return false }
  return qualifies(rhs)
}

extension _StringGuts {
  // Reports whether `i` lies on a grapheme cluster boundary of this string.
  //
  // An index with a non-zero transcoded offset is never a boundary, offset 0
  // and offset `count` always are, and anything else must first be on a
  // Unicode scalar boundary and then survive a round trip through
  // `String.index(after:)` / `String.index(before:)`.
  @usableFromInline @inline(never)
  @_effects(releasenone)
  internal func isOnGraphemeClusterBoundary(_ i: String.Index) -> Bool {
    if i.transcodedOffset != 0 { return false }

    // The very start and the very end are boundaries by definition.
    let encodedOffset = i._encodedOffset
    guard encodedOffset != 0, encodedOffset != self.count else { return true }

    if !isOnUnicodeScalarBoundary(i) { return false }

    // Step one character forward and one back; landing on `i` again means
    // `i` was already the start of a character.
    let string = String(self)
    let successor = string.index(after: i)
    return i == string.index(before: successor)
  }

  // Returns the length, in encoded code units, of the grapheme cluster that
  // starts at offset `i`.
  @usableFromInline @inline(never)
  @_effects(releasenone)
  internal func _opaqueCharacterStride(startingAt i: Int) -> Int {
    // Foreign strings take their own out-of-line path.
    guard !_slowPath(isForeign) else {
      return _foreignOpaqueCharacterStride(startingAt: i)
    }

    // Walk forward over the UTF-8 buffer, handing `nextBoundary` each decoded
    // scalar together with the offset just past it.
    let clusterEnd = withFastUTF8 { bytes in
      nextBoundary(startingAt: i) { offset in
        let (scalar, length) = _decodeScalar(bytes, startingAt: offset)
        return (scalar, offset &+ length)
      }
    }

    return clusterEnd &- i
  }

  // Returns the length, in encoded code units, of the grapheme cluster that
  // ends at offset `i`.
  @usableFromInline @inline(never)
  @_effects(releasenone)
  internal func _opaqueCharacterStride(endingAt i: Int) -> Int {
    // Foreign strings take their own out-of-line path.
    guard !_slowPath(isForeign) else {
      return _foreignOpaqueCharacterStride(endingAt: i)
    }

    // Walk backward over the UTF-8 buffer, handing `previousBoundary` each
    // decoded scalar together with the offset at which it starts.
    let clusterStart = withFastUTF8 { bytes in
      previousBoundary(endingAt: i) { offset in
        let (scalar, length) = _decodeScalar(bytes, endingAt: offset)
        return (scalar, offset &- length)
      }
    }

    return i &- clusterStart
  }

  // Foreign-string counterpart of `_opaqueCharacterStride(startingAt:)`:
  // scalars are read through `String.UnicodeScalarView` rather than from a
  // UTF-8 buffer. Only available when the Objective-C runtime is present;
  // otherwise this traps.
  @inline(never)
  @_effects(releasenone)
  private func _foreignOpaqueCharacterStride(startingAt i: Int) -> Int {
#if _runtime(_ObjC)
    _internalInvariant(isForeign)

    let clusterEnd = nextBoundary(startingAt: i) { offset in
      let view = String.UnicodeScalarView(self)
      let position = String.Index(_encodedOffset: offset)

      // The scalar at `offset`, paired with the offset of the scalar after it.
      return (view[position], view.index(after: position)._encodedOffset)
    }

    return clusterEnd &- i
#else
    fatalError("No foreign strings on Linux in this version of Swift")
#endif
  }

  // Foreign-string counterpart of `_opaqueCharacterStride(endingAt:)`:
  // scalars are read through `String.UnicodeScalarView` rather than from a
  // UTF-8 buffer. Only available when the Objective-C runtime is present;
  // otherwise this traps.
  @inline(never)
  @_effects(releasenone)
  private func _foreignOpaqueCharacterStride(endingAt i: Int) -> Int {
#if _runtime(_ObjC)
    _internalInvariant(isForeign)

    let clusterStart = previousBoundary(endingAt: i) { offset in
      let view = String.UnicodeScalarView(self)
      let position = String.Index(_encodedOffset: offset)

      // The scalar that ends at `offset`, paired with the offset where it
      // starts.
      let scalarStart = view.index(before: position)
      return (view[scalarStart], scalarStart._encodedOffset)
    }

    return i &- clusterStart
#else
    fatalError("No foreign strings on Linux in this version of Swift")
#endif
  }
}

extension Unicode.Scalar {
  // Whether `_swift_stdlib_isLinkingConsonant` classifies this scalar as a
  // linking consonant.
  fileprivate var _isLinkingConsonant: Bool {
    return _swift_stdlib_isLinkingConsonant(value)
  }

  // Whether this scalar is one of the six virama code points recognised by
  // the Indic sequence handling in this file.
  fileprivate var _isVirama: Bool {
    switch value {
    case 0x94D, // Devanagari
         0x9CD, // Bengali
         0xACD, // Gujarati
         0xB4D, // Oriya
         0xC4D, // Telugu
         0xD4D: // Malayalam
      return true

    default:
      return false
    }
  }
}

// Bookkeeping that is carried from one break decision to the next while a
// single grapheme cluster is being scanned. Every flag starts out `false`.
internal struct _GraphemeBreakingState {
  // An Indic sequence needs at LEAST one Virama between its two linking
  // consonants. This records that one has been encountered during the walk,
  // so the eventual break decision can rely on it.
  var hasSeenVirama: Bool = false

  // While walking forwards we must remember that an emoji sequence has been
  // entered, so that the break can be placed after all of the emoji's
  // extenders and zero width joiners rather than in the middle of them.
  var isInEmojiSequence: Bool = false

  // The Indic analogue of `isInEmojiSequence`. An Indic grapheme sequence may
  // span many scalars, so it cannot be decided by comparing just two grapheme
  // properties.
  var isInIndicSequence: Bool = false

  // Emoji flag sequences consist of exactly 2 regional indicators. Walking
  // forwards, the first (.regionalIndicator, .regionalIndicator) decision
  // must not break. If yet another regional indicator follows, the very same
  // rule is reached again, but this time we do need to break, because there
  // is a boundary between consecutive emoji flag sequences. This flag tells
  // the two situations apart.
  var shouldBreakRI: Bool = false
}

extension _StringGuts {
  // Returns the encoded offset of the first grapheme cluster boundary after
  // `index`, i.e. the end of the cluster that starts at `index`. Note that
  // this is an offset, not a stride; callers subtract `index` themselves.
  //
  // - Parameters:
  //   - index: Encoded offset at which the cluster starts. Must not be the
  //     end of the string.
  //   - nextScalar: Given an encoded offset, returns the scalar starting
  //     there together with the offset just past that scalar.
  // - Returns: The encoded offset of the boundary that ends the cluster.
  internal func nextBoundary(
    startingAt index: Int,
    nextScalar: (Int) -> (Unicode.Scalar, end: Int)
  ) -> Int {
    let endOffset = endIndex._encodedOffset
    _internalInvariant(index != endOffset)
    var state = _GraphemeBreakingState()

    // `lhs` is the scalar just before the candidate boundary and `boundary`
    // is the offset right after it.
    var (lhs, boundary) = nextScalar(index)

    while boundary != endOffset {
      let (rhs, rhsEnd) = nextScalar(boundary)

      if shouldBreak(lhs, between: rhs, &state, boundary) {
        break
      }

      // No break here: the right-hand scalar becomes the left-hand scalar of
      // the next candidate boundary, so it never has to be decoded twice.
      lhs = rhs
      boundary = rhsEnd
    }

    return boundary
  }

  // Returns the encoded offset of the last grapheme cluster boundary before
  // `index`, i.e. the start of the cluster that ends at `index`. Note that
  // this is an offset, not a stride; callers subtract it from `index`
  // themselves.
  //
  // - Parameters:
  //   - index: Encoded offset at which the cluster ends. Must not be the
  //     start of the string.
  //   - previousScalar: Given an encoded offset, returns the scalar ending
  //     there together with the offset at which that scalar starts.
  // - Returns: The encoded offset of the boundary that starts the cluster.
  internal func previousBoundary(
    endingAt index: Int,
    previousScalar: (Int) -> (Unicode.Scalar, start: Int)
  ) -> Int {
    let startOffset = startIndex._encodedOffset
    _internalInvariant(index != startOffset)
    var state = _GraphemeBreakingState()

    // `rhs` is the scalar just after the candidate boundary and `boundary`
    // is the offset at which it starts.
    var (rhs, boundary) = previousScalar(index)

    while boundary != startOffset {
      let (lhs, lhsStart) = previousScalar(boundary)

      if shouldBreak(
        lhs,
        between: rhs,
        &state,
        boundary,
        isBackwards: true
      ) {
        break
      }

      // No break here: the left-hand scalar becomes the right-hand scalar of
      // the next candidate boundary, so it never has to be decoded twice.
      rhs = lhs
      boundary = lhsStart
    }

    return boundary
  }
}

extension _StringGuts {
  // The "algorithm" that determines whether or not we should break between
  // certain grapheme break properties.
  //
  // This is based off of the Unicode Annex #29 for [Grapheme Cluster Boundary
  // Rules](https://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules).
  //
  // - Parameters:
  //   - scalar1: The scalar immediately before the candidate boundary.
  //   - scalar2: The scalar immediately after the candidate boundary.
  //   - state: Flags carried over from earlier decisions in the same walk.
  //     They are read and updated here.
  //   - index: The encoded offset of the candidate boundary (where `scalar2`
  //     starts). It is only read on the `isBackwards` paths, where it is
  //     handed to the look-behind helpers.
  //   - isBackwards: Whether the caller is walking the string backwards. In
  //     that direction the emoji, regional indicator and Indic rules cannot
  //     rely on `state` and instead re-scan the preceding scalars.
  // - Returns: `true` if there is a grapheme break between `scalar1` and
  //   `scalar2`, `false` if they belong to the same grapheme cluster.
  internal func shouldBreak(
    _ scalar1: Unicode.Scalar,
    between scalar2: Unicode.Scalar,
    _ state: inout _GraphemeBreakingState,
    _ index: Int,
    isBackwards: Bool = false
  ) -> Bool {
    // NOTE: The two early returns below happen before the `defer` further
    // down is registered, so they leave `state` untouched.

    // GB3
    if scalar1.value == 0xD, scalar2.value == 0xA {
      return false
    }

    // Cheap range-based shortcut; when it applies we can skip the grapheme
    // break property lookups entirely.
    if _hasGraphemeBreakBetween(scalar1, scalar2) {
      return true
    }

    let x = Unicode._GraphemeBreakProperty(from: scalar1)
    let y = Unicode._GraphemeBreakProperty(from: scalar2)

    // This variable and the defer statement help toggle the isInEmojiSequence
    // state variable to false after every decision of 'shouldBreak'. If we
    // happen to see a rhs .extend or .zwj, then it's a signal that we should
    // continue treating the current grapheme cluster as an emoji sequence.
    var enterEmojiSequence = false

    // Very similar to emoji sequences, but for Indic grapheme sequences.
    var enterIndicSequence = false

    // Runs on every exit from the switch below: both sequence flags are
    // overwritten, so they stay set only if this call explicitly asked for it.
    defer {
      state.isInEmojiSequence = enterEmojiSequence
      state.isInIndicSequence = enterIndicSequence
    }

    switch (x, y) {

    // Fast path: If we know our scalars have no properties the decision is
    //            trivial and we don't need to crawl to the default statement.
    case (.any, .any):
      return true

    // GB4
    case (.control, _):
      return true

    // GB5
    case (_, .control):
      return true

    // GB6
    case (.l, .l),
         (.l, .v),
         (.l, .lv),
         (.l, .lvt):
      return false

    // GB7
    case (.lv, .v),
         (.v, .v),
         (.lv, .t),
         (.v, .t):
      return false

    // GB8
    case (.lvt, .t),
         (.t, .t):
      return false

    // GB9 (partial GB11)
    case (_, .extend),
         (_, .zwj):

      // If we're currently in an emoji sequence, then extends and ZWJ help
      // continue the grapheme cluster by combining more scalars later. If we're
      // not currently in an emoji sequence, but our lhs scalar is a pictograph,
      // then that's a signal that it's the start of an emoji sequence.
      if state.isInEmojiSequence || x == .extendedPictographic {
        enterEmojiSequence = true
      }

      // If we're currently in an indic sequence (or if our lhs is a linking
      // consonant), then this check and everything underneath ensures that
      // we continue being in one and may check if this extend is a Virama.
      if state.isInIndicSequence || scalar1._isLinkingConsonant {
        if y == .extend {
          let extendNormData = Unicode._NormData(scalar2, fastUpperbound: 0x300)

          // If our extend's CCC is 0, then this rule does not apply. We still
          // do not break (GB9), but because enterIndicSequence is left false
          // the deferred block ends the Indic sequence.
          guard extendNormData.ccc != 0 else {
            return false
          }
        }

        enterIndicSequence = true

        if scalar2._isVirama {
          state.hasSeenVirama = true
        }
      }

      return false

    // GB9a
    case (_, .spacingMark):
      return false

    // GB9b
    case (.prepend, _):
      return false

    // GB11
    case (.zwj, .extendedPictographic):
      // Walking backwards, `state` has no record of what came before the ZWJ,
      // so look behind in the string instead.
      if isBackwards {
        return !checkIfInEmojiSequence(index)
      }

      return !state.isInEmojiSequence

    // GB12 & GB13
    case (.regionalIndicator, .regionalIndicator):
      // Walking backwards, the decision depends on how many regional
      // indicators precede this pair, so count them in the string.
      if isBackwards {
        return countRIs(index)
      }

      // Walking forwards: the first pair returns the initial `false` (no
      // break); the toggle makes the following pair return `true`.
      defer {
        state.shouldBreakRI.toggle()
      }

      return state.shouldBreakRI

    // GB999
    default:
      // GB9c
      if state.isInIndicSequence, state.hasSeenVirama, scalar2._isLinkingConsonant {
        state.hasSeenVirama = false
        return false
      }

      // Handle GB9c when walking backwards.
      if isBackwards {
        switch (x, scalar2._isLinkingConsonant) {
        case (.extend, true):
          let extendNormData = Unicode._NormData(scalar1, fastUpperbound: 0x300)

          // An extend with CCC 0 cannot be part of the Indic sequence, so
          // break without looking further back.
          guard extendNormData.ccc != 0 else {
            return true
          }

          return !checkIfInIndicSequence(index)

        case (.zwj, true):
          return !checkIfInIndicSequence(index)

        default:
          return true
        }
      }

      return true
    }
  }

  // Backward look-behind for the (.zwj, .extendedPictographic) decision.
  //
  // When walking backwards we cannot know whether the .zwj belongs to an
  // emoji sequence without inspecting what precedes it. `index` is the offset
  // where the trailing .extendedPictographic starts. Starting from the scalar
  // in front of the .zwj, this skips over any run of .extend scalars and
  // reports whether the first other scalar found is an .extendedPictographic.
  //
  // Examples (the question is always whether the final .zwj and
  // .extendedPictographic are part of one emoji sequence):
  //
  //   [.control, .zwj, .extendedPictographic]
  //     The scalar before the .zwj is neither .extendedPictographic nor
  //     .extend, so we were never in an emoji sequence: returns false and
  //     the caller breaks between the .zwj and the .extendedPictographic.
  //
  //   [.extendedPictographic, .zwj, .extendedPictographic]
  //     The scalar before the .zwj is an .extendedPictographic, so this is
  //     an emoji sequence: returns true and the caller does NOT break.
  //
  //   [.extendedPictographic, .extend, .extend, .zwj, .extendedPictographic]
  //     The scalars before the .zwj are .extend, which may still belong to an
  //     emoji sequence, so keep walking backwards. The walk ends on the
  //     leading .extendedPictographic: returns true, do NOT break.
  //
  // Returns false when nothing precedes the .zwj or when the start of the
  // string is reached while still skipping .extend scalars.
  internal func checkIfInEmojiSequence(_ index: Int) -> Bool {
    var cursor = String.Index(_encodedOffset: index)

    if cursor == startIndex {
      return false
    }

    let scalars = String.UnicodeScalarView(self)

    // Step from the trailing pictograph onto the scalar in front of it (the
    // .zwj); the loop below starts inspecting one scalar further back.
    scalars.formIndex(before: &cursor)

    while cursor != startIndex {
      scalars.formIndex(before: &cursor)

      let property = Unicode._GraphemeBreakProperty(from: scalars[cursor])

      // Extends are transparent; anything else settles the question.
      if property == .extend {
        continue
      }

      return property == .extendedPictographic
    }

    return false
  }

  // Backward look-behind for the ((.extend|.zwj), linking consonant)
  // decision.
  //
  // When walking backwards we cannot know whether to break in front of a
  // linking consonant without inspecting what precedes it. `index` is the
  // offset where that trailing linking consonant starts. This walks the
  // string backwards until it can tell whether the consonant closes a
  // legitimate Indic sequence: an earlier linking consonant, followed only by
  // .zwj scalars and .extend scalars with a non-zero CCC, at least one of
  // which is a virama.
  //
  // Examples (the question is always whether the final linking consonant is
  // attached to what precedes it):
  //
  //   [virama, .extend, linking consonant]
  //     Walking backwards, the string ends without ever reaching a starting
  //     linking consonant, so this is not an Indic sequence: returns false
  //     and the caller breaks.
  //
  //   [linking consonant, virama, .extend, linking consonant]
  //     Walking backwards we pass the .extend, then a virama (remembered),
  //     and then reach a linking consonant. A virama has been seen, so this
  //     is a legitimate Indic sequence: returns true, do NOT break.
  internal func checkIfInIndicSequence(_ index: Int) -> Bool {
    var cursor = String.Index(_encodedOffset: index)

    if cursor == startIndex {
      return false
    }

    let scalars = String.UnicodeScalarView(self)

    // Step onto the scalar directly in front of the trailing consonant. The
    // caller has already classified it, so only note whether it is the
    // virama.
    scalars.formIndex(before: &cursor)
    var sawVirama = scalars[cursor]._isVirama

    while cursor != startIndex {
      scalars.formIndex(before: &cursor)
      let candidate = scalars[cursor]

      // A linking consonant ends the search. The sequence is legitimate only
      // if a virama was seen on the way here.
      if candidate._isLinkingConsonant {
        return sawVirama
      }

      switch Unicode._GraphemeBreakProperty(from: candidate) {
      case .extend:
        // An extend whose CCC is 0 cannot be part of an Indic sequence.
        let normData = Unicode._NormData(candidate, fastUpperbound: 0x300)

        if normData.ccc == 0 {
          return false
        }

        if candidate._isVirama {
          sawVirama = true
        }

      case .zwj:
        continue

      default:
        return false
      }
    }

    return false
  }

  // Backward look-behind for the (.regionalIndicator, .regionalIndicator)
  // decision.
  //
  // When walking backwards we cannot know whether to break between two
  // regional indicators without knowing how many more of them precede the
  // pair. `index` is the offset where the right-hand regional indicator
  // starts. This counts the consecutive .regionalIndicator scalars in front
  // of the left-hand one and returns true (break) when that count is odd.
  //
  // Examples (the question is always whether to break between the last two
  // regional indicators):
  //
  //   [.control, .regionalIndicator, .regionalIndicator]
  //     The scalar before the pair is not a .regionalIndicator, so the count
  //     is 0. 0 is even: returns false, do not break.
  //
  //   [.control, .regionalIndicator, .regionalIndicator, .regionalIndicator]
  //     One .regionalIndicator precedes the pair, then a scalar that is not
  //     one. The count is 1, which is odd: returns true, break between the
  //     last two regional indicators.
  internal func countRIs(
    _ index: Int
  ) -> Bool {
    var cursor = String.Index(_encodedOffset: index)

    if cursor == startIndex {
      return false
    }

    let scalars = String.UnicodeScalarView(self)

    // Step onto the left-hand regional indicator of the pair; counting
    // starts with the scalar in front of it.
    scalars.formIndex(before: &cursor)

    // Only the parity of the run length matters, so track just that.
    var precedingCountIsOdd = false

    while cursor != startIndex {
      scalars.formIndex(before: &cursor)

      let property = Unicode._GraphemeBreakProperty(from: scalars[cursor])

      if property != .regionalIndicator {
        break
      }

      precedingCountIsOdd.toggle()
    }

    return precedingCountIsOdd
  }
}