Implement the Indic grapheme breaking rules

2025-12-21 12:14:44 +01:00 · 2022-01-05 16:18:54 -08:00
parent a0693c4649
commit 4a451829f8
7 changed files with 495 additions and 98 deletions
--- a/stdlib/public/SwiftShims/UnicodeData.h
+++ b/stdlib/public/SwiftShims/UnicodeData.h
@@ -62,6 +62,9 @@ __swift_uint32_t _swift_stdlib_getComposition(__swift_uint32_t x,
 SWIFT_RUNTIME_STDLIB_INTERNAL
 __swift_uint8_t _swift_stdlib_getGraphemeBreakProperty(__swift_uint32_t scalar);

+SWIFT_RUNTIME_STDLIB_INTERNAL
+__swift_bool _swift_stdlib_isLinkingConsonant(__swift_uint32_t scalar);
+
 //===----------------------------------------------------------------------===//
 // Unicode.Scalar.Properties
 //===----------------------------------------------------------------------===//
--- a/stdlib/public/core/StringGraphemeBreaking.swift
+++ b/stdlib/public/core/StringGraphemeBreaking.swift
@@ -10,6 +10,8 @@
 //
 //===----------------------------------------------------------------------===//

+import SwiftShims
+
 /// CR and LF are common special cases in grapheme breaking logic
 private var _CR: UInt8 { return 0x0d }
 private var _LF: UInt8 { return 0x0a }
@@ -175,13 +177,56 @@ extension _StringGuts {
  }
 }

+// Scalar-level queries that the Indic grapheme breaking checks in this file
+// rely on. Both are fileprivate: they exist only to serve the grapheme
+// breaking logic.
+extension Unicode.Scalar {
+  // Whether this scalar is a linking consonant.
+  //
+  // The answer comes straight from the `_swift_stdlib_isLinkingConsonant`
+  // shim, which is handed the scalar's numeric value.
+  fileprivate var _isLinkingConsonant: Bool {
+    return _swift_stdlib_isLinkingConsonant(value)
+  }
+
+  // Whether this scalar is a virama recognized by the Indic checks.
+  //
+  // Only the six code points listed in the switch qualify, one per script;
+  // every other scalar yields `false`.
+  fileprivate var _isVirama: Bool {
+    switch value {
+    case 0x94D, // Devanagari
+         0x9CD, // Bengali
+         0xACD, // Gujarati
+         0xB4D, // Oriya
+         0xC4D, // Telugu
+         0xD4D: // Malayalam
+      return true
+
+    default:
+      return false
+    }
+  }
+}
+
 internal struct _GraphemeBreakingState {
+  // When we're looking through an Indic sequence, one of the requirements is
+  // that there is at LEAST 1 Virama present between two linking consonants.
+  // This value records whether we've seen one while walking, so that we can
+  // take it into account when we ultimately decide whether or not to break.
+  var hasSeenVirama = false
+
  // When walking forwards in a string, we need to know whether or not we've
  // entered an emoji sequence to be able to eventually break after all of the
  // emoji's various extenders and zero width joiners. This bit allows us to
  // keep track of whether or not we're still in an emoji sequence when deciding
  // to break.
-  var isInEmojiSequence: Bool = false
+  var isInEmojiSequence = false
+
+  // Similar to emoji sequences, we need to know not to break an Indic grapheme
+  // sequence. This sequence is (potentially) composed of many scalars and isn't
+  // as trivial as comparing two grapheme properties.
+  var isInIndicSequence = false

  // When walking forward in a string, we need to not break on emoji flag
  // sequences. Emoji flag sequences are composed of 2 regional indicators, so
@@ -190,7 +235,7 @@ internal struct _GraphemeBreakingState {
  // is another regional indicator, we reach the same decision rule, but in this
  // case we actually need to break there's a boundary between emoji flag
  // sequences.
-  var shouldBreakRI: Bool = false
+  var shouldBreakRI = false
 }

 extension _StringGuts {
@@ -288,8 +333,12 @@ extension _StringGuts {
    // continue treating the current grapheme cluster as an emoji sequence.
    var enterEmojiSequence = false

+    // Very similar to emoji sequences, but for Indic grapheme sequences.
+    var enterIndicSequence = false
+
    defer {
      state.isInEmojiSequence = enterEmojiSequence
+      state.isInIndicSequence = enterIndicSequence
    }

    switch (x, y) {
@@ -338,6 +387,26 @@ extension _StringGuts {
        enterEmojiSequence = true
      }

+      // If we're currently in an indic sequence (or if our lhs is a linking
+      // consonant), then this check and everything underneath ensures that
+      // we continue being in one and may check if this extend is a Virama.
+      if state.isInIndicSequence || scalar1._isLinkingConsonant {
+        if y == .extend {
+          let extendNormData = Unicode._NormData(scalar2, fastUpperbound: 0x300)
+
+          // If our extend's CCC is 0, then this rule does not apply.
+          guard extendNormData.ccc != 0 else {
+            return false
+          }
+        }
+
+        enterIndicSequence = true
+
+        if scalar2._isVirama {
+          state.hasSeenVirama = true
+        }
+      }
+
      return false

    // GB9a
@@ -370,6 +439,32 @@ extension _StringGuts {

    // GB999
    default:
+      // GB9c
+      if state.isInIndicSequence, state.hasSeenVirama, scalar2._isLinkingConsonant {
+        state.hasSeenVirama = false
+        return false
+      }
+
+      // Handle GB9c when walking backwards.
+      if isBackwards {
+        switch (x, scalar2._isLinkingConsonant) {
+        case (.extend, true):
+          let extendNormData = Unicode._NormData(scalar1, fastUpperbound: 0x300)
+
+          guard extendNormData.ccc != 0 else {
+            return true
+          }
+
+          return !checkIfInIndicSequence(index)
+
+        case (.zwj, true):
+          return !checkIfInIndicSequence(index)
+
+        default:
+          return true
+        }
+      }
+
      return true
    }
  }
@@ -417,9 +512,7 @@ extension _StringGuts {
  //                | = We found our starting .extendedPictographic letting us
  //                    know that we are in an emoji sequence so our initial
  //                    break question is answered as NO.
-  internal func checkIfInEmojiSequence(
-    _ index: Int
-  ) -> Bool {
+  internal func checkIfInEmojiSequence(_ index: Int) -> Bool {
    var emojiIdx = String.Index(_encodedOffset: index)

    guard emojiIdx != startIndex else {
@@ -448,6 +541,90 @@ extension _StringGuts {
    return false
  }

+  // When walking backwards, it's impossible to know whether we break when we
+  // see our first ((.extend|.zwj), .linkingConsonant) without walking
+  // further backwards. This walks the string backwards enough until we figure
+  // out whether or not to break this indic sequence. For example:
+  //
+  // Scalar view #1:
+  //
+  //     [.virama, .extend, .linkingConsonant]
+  //                       ^
+  //                       | = To be able to know whether or not to break these
+  //                           two, we need to walk backwards to determine if
+  //                           this is a legitimate indic sequence.
+  //      ^
+  //      | = The scalar sequence ends without a starting linking consonant,
+  //          so this is in fact not an indic sequence, so we can break the two.
+  //
+  // Scalar view #2:
+  //
+  //     [.linkingConsonant, .virama, .extend, .linkingConsonant]
+  //                                          ^
+  //                                          | = Same as above
+  //                            ^
+  //                            | = This is a virama, so we at least have seen
+  //                                1 to be able to return true if we see a
+  //                                linking consonant later.
+  //         ^
+  //         | = Is a linking consonant and we've seen a virama, so this is a
+  //             legitimate indic sequence, so do NOT break the initial question.
+  internal func checkIfInIndicSequence(_ index: Int) -> Bool {
+    var cursor = String.Index(_encodedOffset: index)
+
+    // With nothing in front of `index` there is no sequence to be part of.
+    if cursor == startIndex {
+      return false
+    }
+
+    let scalarView = String.UnicodeScalarView(self)
+    scalarView.formIndex(before: &cursor)
+
+    // The first scalar we land on is the extend (or zwj) that triggered this
+    // check; remember right away whether it is a virama, since it counts
+    // towards the "at least one virama" requirement as well.
+    var sawVirama = scalarView[cursor]._isVirama
+
+    // Keep stepping backwards until a linking consonant settles the question
+    // or we meet a scalar that cannot be part of an Indic sequence.
+    while cursor != startIndex {
+      scalarView.formIndex(before: &cursor)
+      let current = scalarView[cursor]
+
+      // LinkingConsonant: the sequence is legitimate only if a virama was
+      // seen on the way here.
+      if current._isLinkingConsonant {
+        return sawVirama
+      }
+
+      switch Unicode._GraphemeBreakProperty(from: current) {
+      case .extend:
+        // An extend with a CCC of 0 disqualifies the sequence.
+        let normData = Unicode._NormData(current, fastUpperbound: 0x300)
+
+        if normData.ccc == 0 {
+          return false
+        }
+
+        if current._isVirama {
+          sawVirama = true
+        }
+
+      case .zwj:
+        // Joiners are simply skipped over.
+        continue
+
+      default:
+        // Any other scalar means this was never an Indic sequence.
+        return false
+      }
+    }
+
+    // We reached the start of the string without finding a linking
+    // consonant.
+    return false
+  }
+
  // When walking backwards, it's impossible to know whether we break when we
  // see our first (.regionalIndicator, .regionalIndicator) without walking
  // further backwards. This walks the string backwards enough until we figure
--- a/stdlib/public/stubs/Unicode/Common/GraphemeData.h
+++ b/stdlib/public/stubs/Unicode/Common/GraphemeData.h
@@ -20,7 +20,7 @@

 #define GRAPHEME_BREAK_DATA_COUNT 621

-static __swift_uint32_t _swift_stdlib_graphemeBreakProperties[621] = {
+static const __swift_uint32_t _swift_stdlib_graphemeBreakProperties[621] = {
  0x3E00000, 0x400007F, 0x800000A9, 0xAD, 0x800000AE, 0x2DE00300, 0x20C00483, 0x25800591,
  0x200005BF, 0x202005C1, 0x202005C4, 0x200005C7, 0x40A00600, 0x21400610, 0x61C, 0x2280064B,
  0x20000670, 0x20C006D6, 0x400006DD, 0x20A006DF, 0x202006E7, 0x206006EA, 0x4000070F, 0x20000711,
@@ -101,4 +101,41 @@ static __swift_uint32_t _swift_stdlib_graphemeBreakProperties[621] = {
  0xB701F947, 0x3EE0000, 0x2BEE0020, 0xFEE0080, 0x3DEE0100,
 };

+static const __swift_uint16_t _swift_stdlib_linkingConsonant_ranks[165] = {
+  0x0, 0xE, 0xE, 0x12, 0x13, 0x0, 0x0, 0x0, 0x25, 0x35, 0x0, 0x20, 0x25, 0x46, 0x4B, 0x0, 0x17,
+  0x23, 0x3D, 0x44, 0x0, 0xA, 0x24, 0x31, 0x4B, 0x0, 0x1, 0x27, 0x27, 0x49, 0x0, 0xF, 0x2E, 0x3A,
+  0x57, 0x0, 0x0, 0x1F, 0x2C, 0x2C, 0x0, 0x21, 0x2D, 0x3C, 0x3C, 0x0, 0x0, 0x0, 0xD, 0x2C, 0x0,
+  0x2D, 0x30, 0x30, 0x30, 0x0, 0x0, 0x0, 0x1E, 0x31, 0x0, 0x2C, 0x2C, 0x63, 0x72, 0x0, 0x0, 0x0,
+  0x29, 0x2F, 0x0, 0x26, 0x4A, 0x51, 0x51, 0x0, 0x18, 0x39, 0x54, 0x68, 0x0, 0x18, 0x2F, 0x53, 0x64,
+  0x0, 0x23, 0x39, 0x69, 0x72, 0x0, 0x0, 0x0, 0xE, 0x18, 0x0, 0x0, 0xF, 0x25, 0x25, 0x0, 0x25, 0x26,
+  0x49, 0x49, 0x0, 0x19, 0x37, 0x59, 0x61, 0x0, 0xC, 0x24, 0x52, 0x5D, 0x0, 0x8, 0x8, 0x8, 0x2A,
+  0x0, 0x0, 0x21, 0x21, 0x21, 0x0, 0x2, 0x21, 0x23, 0x43, 0x0, 0x16, 0x22, 0x3D, 0x44, 0x0, 0x0,
+  0x0, 0x22, 0x22, 0x0, 0x0, 0x0, 0x22, 0x22, 0x0, 0x22, 0x28, 0x4B, 0x73, 0x0, 0x0, 0x21, 0x21,
+  0x3F, 0x0, 0x0, 0x25, 0x39, 0x43, 0x0, 0x12, 0x12, 0x12, 0x12,
+};
+
+static const __swift_uint64_t _swift_stdlib_linkingConsonant[166] = {
+  0x5, 0x7E0FF00, 0x0, 0x3C0000000, 0x400000000000000, 0x5BFF, 0x0, 0x0, 0x3FFFFFFFFE00000,
+  0xFF000000FF000000, 0x0, 0x3C5FDFFFFE0, 0x30000B000, 0x36DFDFFFFE0, 0x5E00, 0xFFE0, 0x3EDFDFF,
+  0xFFE0000002000000, 0xB000000003EDFDFF, 0xD620000000020000, 0xC718, 0x3FF, 0xFDFFFFE000000000,
+  0x700000003FF, 0xFDFFFFE000000000, 0x3EF, 0x40000000, 0x7FFFFFFFFE00000, 0x0, 0x2FFBFFFFFC000000,
+  0x7F, 0xFFFE000000000000, 0x7FFFFFFF, 0xF7D6000000000000, 0x7FAFFFFF, 0xF000, 0x0,
+  0xFFFFFEFF00000000, 0x1FFF, 0x0, 0x0, 0x1FFFFFFFF0000, 0xC0623C0300008000, 0x4003FFE1, 0x0, 0x0,
+  0x0, 0x0, 0xFFF8000000000000, 0xFFF80003FFF88003, 0x3, 0xFFFFFFFF0001DFF8, 0x7, 0x0, 0x0, 0x0,
+  0x0, 0x0, 0x7FFFFFFE0000, 0x7FFFF00000000, 0x0, 0xFFFFFFFFFFF, 0x0, 0xFFFFFFFF007FFFFF, 0x181FFF,
+  0x0, 0x0, 0x0, 0x1FE0000FFFFFFFF8, 0xFC00000000000000, 0xFFFF, 0xFFFFFFFF3800C001,
+  0xFFFFFFFF0000000F, 0xE0000000000F, 0x0, 0x0, 0xFFFFF78000000000, 0x3FFFFFFF00000007,
+  0xFFFC00000005FE3C, 0xFFFFF, 0x0, 0x3FFFFFC000000, 0x7FFFFF, 0xFFFFFFFF8E000000,
+  0xFF9F000000000007, 0x7C00, 0x1FFFFFFFFC0, 0xC40EFFFF00000000, 0xFFFFFFFFFFFF, 0x7FC00000000, 0x0,
+  0x0, 0x0, 0x3FFF000000000000, 0x7FD, 0x0, 0x0, 0xFEEF000100000000, 0x3FFFFF, 0x0, 0x0,
+  0xFFFFFFFFF80000, 0x20000000000000, 0xFFFFFFFFE000, 0x0, 0xFF80, 0x900000007FFFFF, 0x7FFFFFFE0,
+  0x7FFFFFFFE, 0xFF00000000000000, 0xFFFB, 0xFFF, 0xBFFFBD7000000000, 0x7FFFFFFFFC0001FF,
+  0xFFE0000000000000, 0xFDFF, 0x3ED, 0x0, 0x0, 0xFFFFFFFFC0000000, 0x1F, 0x0, 0xFFFFFFFF8000, 0x0,
+  0x0, 0x0, 0xC000000000000000, 0x7FFFFFFF, 0xC000000000000000, 0xFFFFFFFF, 0x0, 0xFFFFFC0000000000,
+  0x10007FF, 0x7FFFFFF00000000, 0x7F00000000, 0x0, 0x0, 0x0, 0xFFFFFFFFC000000, 0x0, 0x0, 0x0, 0x0,
+  0xFFFFFF6FF000, 0x0, 0x0, 0xFFFFFFFFC0000000, 0xF800000000000001, 0x7FFFFFFFF, 0xFFFFFFFFFF000,
+  0x0, 0x0, 0x7FFFFFFFC0000000, 0x0, 0xFFFFFFFC, 0x0, 0x0, 0x1FFFFFFFFF000, 0xFFFFF00000000000,
+  0x3FF, 0x0, 0x3FFFF, 0x0, 0x0, 0x0, 0x0,
+};
+
 #endif // #ifndef GRAPHEME_DATA_H
--- a/stdlib/public/stubs/Unicode/UnicodeGrapheme.cpp
+++ b/stdlib/public/stubs/Unicode/UnicodeGrapheme.cpp
@@ -12,6 +12,7 @@

 #include "Common/GraphemeData.h"
 #include "../SwiftShims/UnicodeData.h"
+#include <limits>

 SWIFT_RUNTIME_STDLIB_INTERNAL
 __swift_uint8_t _swift_stdlib_getGraphemeBreakProperty(__swift_uint32_t scalar) {
@@ -57,3 +58,16 @@ __swift_uint8_t _swift_stdlib_getGraphemeBreakProperty(__swift_uint32_t scalar)
  // property). Return the max value here to indicate .any.
  return 0xFF;
 }
+
+SWIFT_RUNTIME_STDLIB_INTERNAL
+__swift_bool _swift_stdlib_isLinkingConsonant(__swift_uint32_t scalar) {
+  // The bit array lookup signals a miss with the largest intptr value; any
+  // other index means the scalar is part of the linking consonant set.
+  const auto notFound = std::numeric_limits<__swift_intptr_t>::max();
+
+  const auto idx = _swift_stdlib_getScalarBitArrayIdx(
+      scalar, _swift_stdlib_linkingConsonant,
+      _swift_stdlib_linkingConsonant_ranks);
+
+  return idx != notFound;
+}
--- a/utils/gen-unicode-data/Sources/GenGraphemeBreakProperty/IndicRules.swift
+++ b/utils/gen-unicode-data/Sources/GenGraphemeBreakProperty/IndicRules.swift
@@ -0,0 +1,179 @@
+//===----------------------------------------------------------------------===//
+//
+// This source file is part of the Swift.org open source project
+//
+// Copyright (c) 2022 Apple Inc. and the Swift project authors
+// Licensed under Apache License v2.0 with Runtime Library Exception
+//
+// See https://swift.org/LICENSE.txt for license information
+// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
+//
+//===----------------------------------------------------------------------===//
+
+import GenUtils
+
+// Collects the linking consonants for the Indic grapheme breaking rule from
+// the contents of IndicSyllabicCategory.txt.
+// - Parameter data: The text of the data file; entries are expected to look
+//   like `XXXX[..YYYY] ; Property # comment`.
+// - Returns: The ranges of all 'Consonant' scalars whose line names one of
+//   the supported scripts, passed through `flatten`.
+func getLinkingConsonant(
+  from data: String
+) -> [ClosedRange<UInt32>] {
+  // This rule only applies to the following scripts; their uppercased names
+  // are searched for in every line that describes a consonant.
+  let scripts = [
+    "Bengali", "Devanagari", "Gujarati", "Oriya", "Telugu", "Malayalam",
+  ].map { $0.uppercased() }
+
+  var unflattened: [(ClosedRange<UInt32>, String)] = []
+
+  for line in data.split(separator: "\n") {
+    // Skip comments
+    guard !line.hasPrefix("#") else {
+      continue
+    }
+
+    // Skip lines that aren't a `scalars ; property` pair instead of trapping
+    // on an out of bounds index.
+    let components = line.split(separator: ";")
+    guard components.count >= 2,
+          let property = components[1].split(separator: "#").first else {
+      continue
+    }
+
+    // We only care about scalars that are defined as 'Consonant' and that
+    // come from one of the supported scripts.
+    guard property.filter({ !$0.isWhitespace }) == "Consonant",
+          scripts.contains(where: { line.contains($0) }) else {
+      continue
+    }
+
+    // `XXXX..YYYY` is a range, while a lone `XXXX` is a single scalar; in
+    // both cases the first and last pieces are the bounds.
+    let filteredScalars = components[0].filter { !$0.isWhitespace }
+    let bounds = filteredScalars.split(separator: ".")
+    guard let first = bounds.first, let last = bounds.last,
+          let lower = UInt32(first, radix: 16),
+          let upper = UInt32(last, radix: 16) else {
+      fatalError("Malformed scalar range: \(filteredScalars)")
+    }
+
+    unflattened.append((lower ... upper, "Consonant"))
+  }
+
+  return flatten(unflattened).map { $0.0 }
+}
+
+// Emits the two C arrays that back the linking consonant lookup: the bit
+// array `_swift_stdlib_linkingConsonant` and its companion rank table
+// `_swift_stdlib_linkingConsonant_ranks`.
+//
+// The scalar space is cut into 64 * 64 equally sized chunks. A "quick look"
+// section keeps one bit per chunk, set when the chunk holds at least one
+// scalar from `data`. Every such chunk then contributes its own bit array
+// with one bit per scalar. A rank is the number of set bits in the words
+// that precede it within the same section.
+//
+// - Parameters:
+//   - data: The scalar ranges to record.
+//   - result: The generated source text, handed on to `emitCollection`.
+func emitLinkingConsonant(
+  _ data: [ClosedRange<UInt32>],
+  into result: inout String
+) {
+  // Whether any of the given ranges contains `scalar`.
+  func isRecorded(_ scalar: Int) -> Bool {
+    data.contains(where: { $0.contains(UInt32(scalar)) })
+  }
+
+  // The rank of every word: the number of set bits in all the words before
+  // it. The first word always has a rank of 0.
+  func precedingBitCounts(_ words: [UInt64]) -> [UInt16] {
+    var counts: [UInt16] = []
+    var running: UInt16 = 0
+
+    for position in words.indices {
+      if position != 0 {
+        running += UInt16(words[position - 1].nonzeroBitCount)
+      }
+
+      counts.append(running)
+    }
+
+    return counts
+  }
+
+  let chunkSize = 0x110000 / 64 / 64
+
+  // 64 bit arrays * 8 bytes = .512 KB
+  var bitArrays = [BitArray](repeating: BitArray(size: 64), count: 64)
+
+  // The chunks that hold at least one recorded scalar, in ascending order.
+  var chunks: [Int] = []
+
+  for chunk in 0 ..< 64 * 64 {
+    let start = chunk * chunkSize
+
+    guard (start ..< start + chunkSize).contains(where: isRecorded) else {
+      continue
+    }
+
+    chunks.append(chunk)
+    bitArrays[chunk / 64][chunk % 64] = true
+  }
+
+  // Remove the trailing 0s. Currently this reduces quick look size down to
+  // 96 bytes from 512 bytes.
+  while let last = bitArrays.last, last.words == [0x0] {
+    bitArrays.removeLast()
+  }
+
+  // Keep a record of every rank for all the bit arrays, starting with the
+  // quick look ranks.
+  var ranks = precedingBitCounts(bitArrays.map { $0.words[0] })
+
+  // Insert our quick look size at the beginning.
+  var quickLookSize = BitArray(size: 64)
+  quickLookSize.words = [UInt64(bitArrays.count)]
+  bitArrays.insert(quickLookSize, at: 0)
+
+  // Follow up with the bits and ranks of every populated chunk.
+  for chunk in chunks {
+    let start = chunk * chunkSize
+    var chunkBits = BitArray(size: chunkSize)
+
+    for scalar in start ..< start + chunkSize where isRecorded(scalar) {
+      chunkBits[scalar % chunkSize] = true
+    }
+
+    // Append our chunk bit array's ranks.
+    ranks += precedingBitCounts(chunkBits.words)
+
+    // Append the chunk's bits, one 64 bit word per emitted element.
+    for word in chunkBits.words {
+      var wordBits = BitArray(size: 64)
+      wordBits.words = [word]
+      bitArrays.append(wordBits)
+    }
+  }
+
+  // The rank table is emitted first, followed by the bit array itself.
+  emitCollection(
+    ranks,
+    name: "_swift_stdlib_linkingConsonant_ranks",
+    into: &result
+  )
+
+  emitCollection(
+    bitArrays,
+    name: "_swift_stdlib_linkingConsonant",
+    type: "__swift_uint64_t",
+    into: &result
+  ) {
+    // Every element is expected to be a single 64 bit word.
+    assert($0.words.count == 1)
+    return "0x\(String($0.words[0], radix: 16, uppercase: true))"
+  }
+}
--- a/utils/gen-unicode-data/Sources/GenGraphemeBreakProperty/main.swift
+++ b/utils/gen-unicode-data/Sources/GenGraphemeBreakProperty/main.swift
@@ -48,37 +48,6 @@ extension Unicode {
  }
 }

-// Takes an unflattened array of scalar ranges and grapheme break properties and
-// attempts to merge ranges who share the same break property. E.g:
-//
-//     0x0 ... 0xA  = .control
-//     0xB ... 0xB  = .control
-//     0xC ... 0x1F = .control
-//
-//    into:
-//
-//    0x0 ... 0x1F = .control
-func flatten(
-  _ unflattened: [(ClosedRange<UInt32>, Unicode.GraphemeBreakProperty)]
-) -> [(ClosedRange<UInt32>, Unicode.GraphemeBreakProperty)] {
-  var result: [(ClosedRange<UInt32>, Unicode.GraphemeBreakProperty)] = []
-
-  for elt in unflattened.sorted(by: { $0.0.lowerBound < $1.0.lowerBound }) {
-    guard !result.isEmpty, result.last!.1 == elt.1 else {
-      result.append(elt)
-      continue
-    }
-    
-    if elt.0.lowerBound == result.last!.0.upperBound + 1 {
-      result[result.count - 1].0 = result.last!.0.lowerBound ... elt.0.upperBound
-    } else {
-      result.append(elt)
-    }
-  }
-
-  return result
-}
-
 // Given a path to one of the Unicode data files, reads it and returns the
 // unflattened list of scalar & grapheme break property.
 //
@@ -150,7 +119,9 @@ func emit(
  into result: inout String
 ) {
  result += """
-  static __swift_uint32_t _swift_stdlib_graphemeBreakProperties[\(data.count)] = {
+  #define GRAPHEME_BREAK_DATA_COUNT \(data.count)
+  
+  static const __swift_uint32_t _swift_stdlib_graphemeBreakProperties[\(data.count)] = {

  """

@@ -181,69 +152,20 @@ func emit(
      value |= 1 << 31
    }

-    return "0x\(String(value, radix: 16))"
+    return "0x\(String(value, radix: 16, uppercase: true))"
  }

-  result += "\n};\n\n"
-}
-
-// Writes the stdlib internal routine for binary searching the grapheme array.
-func emitAccessor(
-  _ dataCount: Int,
-  into result: inout String
-) {
  result += """
-  SWIFT_RUNTIME_STDLIB_INTERNAL
-  __swift_uint8_t _swift_stdlib_getGraphemeBreakProperty(__swift_uint32_t scalar) {
-    auto low = 0;
-    auto high = \(dataCount) - 1;
  
-    while (high >= low) {
-      auto idx = low + (high - low) / 2;
+  };
  
-      auto entry = _swift_stdlib_graphemeBreakProperties[idx];
-
-      // Shift the enum and range count out of the value.
-      auto lower = (entry << 11) >> 11;
-
-      // Shift the enum out first, then shift out the scalar value.
-      auto upper = lower + ((entry << 3) >> 24);
-
-      // Shift everything out.
-      auto enumValue = (__swift_uint8_t)(entry >> 29);
-
-      // Special case: extendedPictographic who used an extra bit for the range.
-      if (enumValue == 5) {
-        upper = lower + ((entry << 2) >> 23);
-      }
-
-      if (scalar >= lower && scalar <= upper) {
-        return enumValue;
-      }
-
-      if (scalar > upper) {
-        low = idx + 1;
-        continue;
-      }
-
-      if (scalar < lower) {
-        high = idx - 1;
-        continue;
-      }
-    }
-
-    // If we made it out here, then our scalar was not found in the grapheme
-    // array (this occurs when a scalar doesn't map to any grapheme break
-    // property). Return the max value here to indicate .any.
-    return 0xFF;
-  }
  
  """
 }

 // Main entry point into the grapheme break property generator.
 func generateGraphemeBreakProperty() {
-  var result = readFile("Input/UnicodeGrapheme.cpp")
+  var result = readFile("Input/GraphemeData.h")

  let baseData = getGraphemeBreakPropertyData(
    for: "Data/GraphemeBreakProperty.txt"
@@ -268,9 +190,20 @@ func generateGraphemeBreakProperty() {

  emit(data, into: &result)
  
-  emitAccessor(data.count, into: &result)
+  // Handle the CLDR grapheme breaking rules:
  
-  write(result, to: "Output/UnicodeGrapheme.cpp")
+  let indicSyllabicCategory = readFile("Data/IndicSyllabicCategory.txt")
+  
+  let consonants = getLinkingConsonant(from: indicSyllabicCategory)
+  
+  emitLinkingConsonant(consonants, into: &result)
+  
+  result += """
+  #endif // #ifndef GRAPHEME_DATA_H
+  
+  """
+  
+  write(result, to: "Output/Common/GraphemeData.h")
 }

 generateGraphemeBreakProperty()
--- a/validation-test/stdlib/String.swift
+++ b/validation-test/stdlib/String.swift
@@ -2280,4 +2280,58 @@ StringTests.test("NormalizationCheck/Opaque")
 #endif
 }

+// Walks `string` backwards from `endIndex`, counting one step per
+// `formIndex(before:)`, and expects that walk to take `count` steps.
+func expectBidirectionalCount(_ count: Int, _ string: String) {
+  var reverseCount = 0
+  var index = string.endIndex
+  while index != string.startIndex {
+    reverseCount += 1
+    string.formIndex(before: &index)
+  }
+  expectEqual(count, reverseCount) // expected first, like the call sites
+}
+
+StringTests.test("GraphemeBreaking.Indic Sequences") {
+  let test1 = "\u{0915}\u{0924}" // 2: no virama between the consonants
+  expectEqual(2, test1.count)
+  expectBidirectionalCount(2, test1)
+
+  let test2 = "\u{0915}\u{094D}\u{0924}" // 1: virama between consonants
+  expectEqual(1, test2.count)
+  expectBidirectionalCount(1, test2)
+
+  let test3 = "\u{0915}\u{094D}\u{094D}\u{0924}" // 1: two viramas in a row
+  expectEqual(1, test3.count)
+  expectBidirectionalCount(1, test3)
+
+  let test4 = "\u{0915}\u{094D}\u{200D}\u{0924}" // 1: virama, then ZWJ
+  expectEqual(1, test4.count)
+  expectBidirectionalCount(1, test4)
+
+  let test5 = "\u{0915}\u{093C}\u{200D}\u{094D}\u{0924}" // 1: ZWJ, virama
+  expectEqual(1, test5.count)
+  expectBidirectionalCount(1, test5)
+
+  let test6 = "\u{0915}\u{093C}\u{094D}\u{200D}\u{0924}" // 1: virama, ZWJ
+  expectEqual(1, test6.count)
+  expectBidirectionalCount(1, test6)
+
+  let test7 = "\u{0915}\u{094D}\u{0924}\u{094D}\u{092F}" // 1: chained twice
+  expectEqual(1, test7.count)
+  expectBidirectionalCount(1, test7)
+
+  let test8 = "\u{0915}\u{094D}\u{0061}" // 2: 'a' is not a linking consonant
+  expectEqual(2, test8.count)
+  expectBidirectionalCount(2, test8)
+
+  let test9 = "\u{0061}\u{094D}\u{0924}" // 2: 'a' is not a linking consonant
+  expectEqual(2, test9.count)
+  expectBidirectionalCount(2, test9)
+
+  let test10 = "\u{003F}\u{094D}\u{0924}" // 2: '?' is not a linking consonant
+  expectEqual(2, test10.count)
+  expectBidirectionalCount(2, test10)
+}
+
 runAllTests()