mirror of
https://github.com/apple/swift.git
synced 2025-12-21 12:14:44 +01:00
Implement the Indic grapheme breaking rules
This commit is contained in:
@@ -62,6 +62,9 @@ __swift_uint32_t _swift_stdlib_getComposition(__swift_uint32_t x,
|
||||
SWIFT_RUNTIME_STDLIB_INTERNAL
|
||||
__swift_uint8_t _swift_stdlib_getGraphemeBreakProperty(__swift_uint32_t scalar);
|
||||
|
||||
SWIFT_RUNTIME_STDLIB_INTERNAL
|
||||
__swift_bool _swift_stdlib_isLinkingConsonant(__swift_uint32_t scalar);
|
||||
|
||||
//===----------------------------------------------------------------------===//
|
||||
// Unicode.Scalar.Properties
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
@@ -10,6 +10,8 @@
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
import SwiftShims
|
||||
|
||||
/// CR and LF are common special cases in grapheme breaking logic
|
||||
private var _CR: UInt8 { return 0x0d }
|
||||
private var _LF: UInt8 { return 0x0a }
|
||||
@@ -175,13 +177,56 @@ extension _StringGuts {
|
||||
}
|
||||
}
|
||||
|
||||
extension Unicode.Scalar {
  /// Whether the runtime's linking-consonant lookup
  /// (`_swift_stdlib_isLinkingConsonant`) reports this scalar as a linking
  /// consonant.
  fileprivate var _isLinkingConsonant: Bool {
    return _swift_stdlib_isLinkingConsonant(value)
  }

  /// Whether this scalar is the virama of one of the six scripts covered by
  /// the Indic grapheme breaking rule.
  fileprivate var _isVirama: Bool {
    switch value {
    case 0x94D, // Devanagari
         0x9CD, // Bengali
         0xACD, // Gujarati
         0xB4D, // Oriya
         0xC4D, // Telugu
         0xD4D: // Malayalam
      return true

    default:
      return false
    }
  }
}
|
||||
|
||||
internal struct _GraphemeBreakingState {
|
||||
// When we're looking through an indic sequence, one of the requirements is
|
||||
// that there is at LEAST 1 Virama present between two linking consonants.
|
||||
// This value helps ensure that when we ultimately need to decide whether or
|
||||
// not to break that we've at least seen 1 when walking.
|
||||
var hasSeenVirama = false
|
||||
|
||||
// When walking forwards in a string, we need to know whether or not we've
|
||||
// entered an emoji sequence to be able to eventually break after all of the
|
||||
// emoji's various extenders and zero width joiners. This bit allows us to
|
||||
// keep track of whether or not we're still in an emoji sequence when deciding
|
||||
// to break.
|
||||
var isInEmojiSequence: Bool = false
|
||||
var isInEmojiSequence = false
|
||||
|
||||
// Similar to emoji sequences, we need to know not to break an Indic grapheme
|
||||
// sequence. This sequence is (potentially) composed of many scalars and isn't
|
||||
// as trivial as comparing two grapheme properties.
|
||||
var isInIndicSequence = false
|
||||
|
||||
// When walking forward in a string, we need to not break on emoji flag
|
||||
// sequences. Emoji flag sequences are composed of 2 regional indicators, so
|
||||
@@ -190,7 +235,7 @@ internal struct _GraphemeBreakingState {
|
||||
// is another regional indicator, we reach the same decision rule, but in this
|
||||
// case we actually need to break because there's a boundary between emoji flag
|
||||
// sequences.
|
||||
var shouldBreakRI: Bool = false
|
||||
var shouldBreakRI = false
|
||||
}
|
||||
|
||||
extension _StringGuts {
|
||||
@@ -288,8 +333,12 @@ extension _StringGuts {
|
||||
// continue treating the current grapheme cluster as an emoji sequence.
|
||||
var enterEmojiSequence = false
|
||||
|
||||
// Very similar to emoji sequences, but for Indic grapheme sequences.
|
||||
var enterIndicSequence = false
|
||||
|
||||
defer {
|
||||
state.isInEmojiSequence = enterEmojiSequence
|
||||
state.isInIndicSequence = enterIndicSequence
|
||||
}
|
||||
|
||||
switch (x, y) {
|
||||
@@ -338,6 +387,26 @@ extension _StringGuts {
|
||||
enterEmojiSequence = true
|
||||
}
|
||||
|
||||
// If we're currently in an indic sequence (or if our lhs is a linking
|
||||
// consonant), then this check and everything underneath ensures that
|
||||
// we continue being in one and may check if this extend is a Virama.
|
||||
if state.isInIndicSequence || scalar1._isLinkingConsonant {
|
||||
if y == .extend {
|
||||
let extendNormData = Unicode._NormData(scalar2, fastUpperbound: 0x300)
|
||||
|
||||
// If our extend's CCC is 0, then this rule does not apply.
|
||||
guard extendNormData.ccc != 0 else {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
enterIndicSequence = true
|
||||
|
||||
if scalar2._isVirama {
|
||||
state.hasSeenVirama = true
|
||||
}
|
||||
}
|
||||
|
||||
return false
|
||||
|
||||
// GB9a
|
||||
@@ -370,6 +439,32 @@ extension _StringGuts {
|
||||
|
||||
// GB999
|
||||
default:
|
||||
// GB9c
|
||||
if state.isInIndicSequence, state.hasSeenVirama, scalar2._isLinkingConsonant {
|
||||
state.hasSeenVirama = false
|
||||
return false
|
||||
}
|
||||
|
||||
// Handle GB9c when walking backwards.
|
||||
if isBackwards {
|
||||
switch (x, scalar2._isLinkingConsonant) {
|
||||
case (.extend, true):
|
||||
let extendNormData = Unicode._NormData(scalar1, fastUpperbound: 0x300)
|
||||
|
||||
guard extendNormData.ccc != 0 else {
|
||||
return true
|
||||
}
|
||||
|
||||
return !checkIfInIndicSequence(index)
|
||||
|
||||
case (.zwj, true):
|
||||
return !checkIfInIndicSequence(index)
|
||||
|
||||
default:
|
||||
return true
|
||||
}
|
||||
}
|
||||
|
||||
return true
|
||||
}
|
||||
}
|
||||
@@ -417,9 +512,7 @@ extension _StringGuts {
|
||||
// | = We found our starting .extendedPictographic letting us
|
||||
// know that we are in an emoji sequence so our initial
|
||||
// break question is answered as NO.
|
||||
internal func checkIfInEmojiSequence(
|
||||
_ index: Int
|
||||
) -> Bool {
|
||||
internal func checkIfInEmojiSequence(_ index: Int) -> Bool {
|
||||
var emojiIdx = String.Index(_encodedOffset: index)
|
||||
|
||||
guard emojiIdx != startIndex else {
|
||||
@@ -448,6 +541,90 @@ extension _StringGuts {
|
||||
return false
|
||||
}
|
||||
|
||||
// When walking backwards, it's impossible to know whether we break when we
// see our first ((.extend|.zwj), .linkingConsonant) without walking
// further backwards. This walks the string backwards enough until we figure
// out whether or not to break this indic sequence. For example:
//
// Scalar view #1:
//
//   [.virama, .extend, .linkingConsonant]
//                    ^
//                    | = To be able to know whether or not to break these
//                        two, we need to walk backwards to determine if
//                        this is a legitimate indic sequence.
//    ^
//    | = The scalar sequence ends without a starting linking consonant,
//        so this is in fact not an indic sequence, so we can break the two.
//
// Scalar view #2:
//
//   [.linkingConsonant, .virama, .extend, .linkingConsonant]
//                                       ^
//                                       | = Same as above
//                       ^
//                       | = This is a virama, so we at least have seen
//                           1 to be able to return true if we see a
//                           linking consonant later.
//    ^
//    | = Is a linking consonant and we've seen a virama, so this is a
//        legitimate indic sequence, so do NOT break the initial question.
//
// Returns true only when the backwards walk reaches a linking consonant after
// having passed at least one virama; every other outcome returns false.
internal func checkIfInIndicSequence(_ index: Int) -> Bool {
  var cursor = String.Index(_encodedOffset: index)

  // Nothing precedes the start of the string, so there is no sequence.
  if cursor == startIndex {
    return false
  }

  let scalarView = String.UnicodeScalarView(self)
  scalarView.formIndex(before: &cursor)

  // The scalar directly before `index` may itself be the virama.
  var sawVirama = scalarView[cursor]._isVirama

  while cursor != startIndex {
    scalarView.formIndex(before: &cursor)
    let current = scalarView[cursor]

    // A linking consonant ends the walk: the sequence is legitimate only if
    // a virama was passed on the way here.
    if current._isLinkingConsonant {
      return sawVirama
    }

    switch Unicode._GraphemeBreakProperty(from: current) {
    case .extend:
      let normData = Unicode._NormData(current, fastUpperbound: 0x300)

      // An extend whose CCC is 0 cannot be part of the sequence.
      if normData.ccc == 0 {
        return false
      }

      if current._isVirama {
        sawVirama = true
      }

    case .zwj:
      continue

    default:
      return false
    }
  }

  // Reached the start of the string without finding a linking consonant.
  return false
}
|
||||
|
||||
// When walking backwards, it's impossible to know whether we break when we
|
||||
// see our first (.regionalIndicator, .regionalIndicator) without walking
|
||||
// further backwards. This walks the string backwards enough until we figure
|
||||
|
||||
@@ -20,7 +20,7 @@
|
||||
|
||||
#define GRAPHEME_BREAK_DATA_COUNT 621
|
||||
|
||||
static __swift_uint32_t _swift_stdlib_graphemeBreakProperties[621] = {
|
||||
static const __swift_uint32_t _swift_stdlib_graphemeBreakProperties[621] = {
|
||||
0x3E00000, 0x400007F, 0x800000A9, 0xAD, 0x800000AE, 0x2DE00300, 0x20C00483, 0x25800591,
|
||||
0x200005BF, 0x202005C1, 0x202005C4, 0x200005C7, 0x40A00600, 0x21400610, 0x61C, 0x2280064B,
|
||||
0x20000670, 0x20C006D6, 0x400006DD, 0x20A006DF, 0x202006E7, 0x206006EA, 0x4000070F, 0x20000711,
|
||||
@@ -101,4 +101,41 @@ static __swift_uint32_t _swift_stdlib_graphemeBreakProperties[621] = {
|
||||
0xB701F947, 0x3EE0000, 0x2BEE0020, 0xFEE0080, 0x3DEE0100,
|
||||
};
|
||||
|
||||
static const __swift_uint16_t _swift_stdlib_linkingConsonant_ranks[165] = {
|
||||
0x0, 0xE, 0xE, 0x12, 0x13, 0x0, 0x0, 0x0, 0x25, 0x35, 0x0, 0x20, 0x25, 0x46, 0x4B, 0x0, 0x17,
|
||||
0x23, 0x3D, 0x44, 0x0, 0xA, 0x24, 0x31, 0x4B, 0x0, 0x1, 0x27, 0x27, 0x49, 0x0, 0xF, 0x2E, 0x3A,
|
||||
0x57, 0x0, 0x0, 0x1F, 0x2C, 0x2C, 0x0, 0x21, 0x2D, 0x3C, 0x3C, 0x0, 0x0, 0x0, 0xD, 0x2C, 0x0,
|
||||
0x2D, 0x30, 0x30, 0x30, 0x0, 0x0, 0x0, 0x1E, 0x31, 0x0, 0x2C, 0x2C, 0x63, 0x72, 0x0, 0x0, 0x0,
|
||||
0x29, 0x2F, 0x0, 0x26, 0x4A, 0x51, 0x51, 0x0, 0x18, 0x39, 0x54, 0x68, 0x0, 0x18, 0x2F, 0x53, 0x64,
|
||||
0x0, 0x23, 0x39, 0x69, 0x72, 0x0, 0x0, 0x0, 0xE, 0x18, 0x0, 0x0, 0xF, 0x25, 0x25, 0x0, 0x25, 0x26,
|
||||
0x49, 0x49, 0x0, 0x19, 0x37, 0x59, 0x61, 0x0, 0xC, 0x24, 0x52, 0x5D, 0x0, 0x8, 0x8, 0x8, 0x2A,
|
||||
0x0, 0x0, 0x21, 0x21, 0x21, 0x0, 0x2, 0x21, 0x23, 0x43, 0x0, 0x16, 0x22, 0x3D, 0x44, 0x0, 0x0,
|
||||
0x0, 0x22, 0x22, 0x0, 0x0, 0x0, 0x22, 0x22, 0x0, 0x22, 0x28, 0x4B, 0x73, 0x0, 0x0, 0x21, 0x21,
|
||||
0x3F, 0x0, 0x0, 0x25, 0x39, 0x43, 0x0, 0x12, 0x12, 0x12, 0x12,
|
||||
};
|
||||
|
||||
static const __swift_uint64_t _swift_stdlib_linkingConsonant[166] = {
|
||||
0x5, 0x7E0FF00, 0x0, 0x3C0000000, 0x400000000000000, 0x5BFF, 0x0, 0x0, 0x3FFFFFFFFE00000,
|
||||
0xFF000000FF000000, 0x0, 0x3C5FDFFFFE0, 0x30000B000, 0x36DFDFFFFE0, 0x5E00, 0xFFE0, 0x3EDFDFF,
|
||||
0xFFE0000002000000, 0xB000000003EDFDFF, 0xD620000000020000, 0xC718, 0x3FF, 0xFDFFFFE000000000,
|
||||
0x700000003FF, 0xFDFFFFE000000000, 0x3EF, 0x40000000, 0x7FFFFFFFFE00000, 0x0, 0x2FFBFFFFFC000000,
|
||||
0x7F, 0xFFFE000000000000, 0x7FFFFFFF, 0xF7D6000000000000, 0x7FAFFFFF, 0xF000, 0x0,
|
||||
0xFFFFFEFF00000000, 0x1FFF, 0x0, 0x0, 0x1FFFFFFFF0000, 0xC0623C0300008000, 0x4003FFE1, 0x0, 0x0,
|
||||
0x0, 0x0, 0xFFF8000000000000, 0xFFF80003FFF88003, 0x3, 0xFFFFFFFF0001DFF8, 0x7, 0x0, 0x0, 0x0,
|
||||
0x0, 0x0, 0x7FFFFFFE0000, 0x7FFFF00000000, 0x0, 0xFFFFFFFFFFF, 0x0, 0xFFFFFFFF007FFFFF, 0x181FFF,
|
||||
0x0, 0x0, 0x0, 0x1FE0000FFFFFFFF8, 0xFC00000000000000, 0xFFFF, 0xFFFFFFFF3800C001,
|
||||
0xFFFFFFFF0000000F, 0xE0000000000F, 0x0, 0x0, 0xFFFFF78000000000, 0x3FFFFFFF00000007,
|
||||
0xFFFC00000005FE3C, 0xFFFFF, 0x0, 0x3FFFFFC000000, 0x7FFFFF, 0xFFFFFFFF8E000000,
|
||||
0xFF9F000000000007, 0x7C00, 0x1FFFFFFFFC0, 0xC40EFFFF00000000, 0xFFFFFFFFFFFF, 0x7FC00000000, 0x0,
|
||||
0x0, 0x0, 0x3FFF000000000000, 0x7FD, 0x0, 0x0, 0xFEEF000100000000, 0x3FFFFF, 0x0, 0x0,
|
||||
0xFFFFFFFFF80000, 0x20000000000000, 0xFFFFFFFFE000, 0x0, 0xFF80, 0x900000007FFFFF, 0x7FFFFFFE0,
|
||||
0x7FFFFFFFE, 0xFF00000000000000, 0xFFFB, 0xFFF, 0xBFFFBD7000000000, 0x7FFFFFFFFC0001FF,
|
||||
0xFFE0000000000000, 0xFDFF, 0x3ED, 0x0, 0x0, 0xFFFFFFFFC0000000, 0x1F, 0x0, 0xFFFFFFFF8000, 0x0,
|
||||
0x0, 0x0, 0xC000000000000000, 0x7FFFFFFF, 0xC000000000000000, 0xFFFFFFFF, 0x0, 0xFFFFFC0000000000,
|
||||
0x10007FF, 0x7FFFFFF00000000, 0x7F00000000, 0x0, 0x0, 0x0, 0xFFFFFFFFC000000, 0x0, 0x0, 0x0, 0x0,
|
||||
0xFFFFFF6FF000, 0x0, 0x0, 0xFFFFFFFFC0000000, 0xF800000000000001, 0x7FFFFFFFF, 0xFFFFFFFFFF000,
|
||||
0x0, 0x0, 0x7FFFFFFFC0000000, 0x0, 0xFFFFFFFC, 0x0, 0x0, 0x1FFFFFFFFF000, 0xFFFFF00000000000,
|
||||
0x3FF, 0x0, 0x3FFFF, 0x0, 0x0, 0x0, 0x0,
|
||||
};
|
||||
|
||||
#endif // #ifndef GRAPHEME_DATA_H
|
||||
|
||||
@@ -12,6 +12,7 @@
|
||||
|
||||
#include "Common/GraphemeData.h"
|
||||
#include "../SwiftShims/UnicodeData.h"
|
||||
#include <limits>
|
||||
|
||||
SWIFT_RUNTIME_STDLIB_INTERNAL
|
||||
__swift_uint8_t _swift_stdlib_getGraphemeBreakProperty(__swift_uint32_t scalar) {
|
||||
@@ -57,3 +58,16 @@ __swift_uint8_t _swift_stdlib_getGraphemeBreakProperty(__swift_uint32_t scalar)
|
||||
// property). Return the max value here to indicate .any.
|
||||
return 0xFF;
|
||||
}
|
||||
|
||||
// Reports whether `scalar` has an entry in the generated linking-consonant bit
// array. The lookup helper's result is compared against the maximum
// `__swift_intptr_t` value, which is treated as "not present".
SWIFT_RUNTIME_STDLIB_INTERNAL
__swift_bool _swift_stdlib_isLinkingConsonant(__swift_uint32_t scalar) {
  const auto notFound = std::numeric_limits<__swift_intptr_t>::max();

  const auto idx = _swift_stdlib_getScalarBitArrayIdx(
      scalar, _swift_stdlib_linkingConsonant,
      _swift_stdlib_linkingConsonant_ranks);

  return idx != notFound;
}
|
||||
|
||||
@@ -0,0 +1,179 @@
|
||||
//===----------------------------------------------------------------------===//
|
||||
//
|
||||
// This source file is part of the Swift.org open source project
|
||||
//
|
||||
// Copyright (c) 2022 Apple Inc. and the Swift project authors
|
||||
// Licensed under Apache License v2.0 with Runtime Library Exception
|
||||
//
|
||||
// See https://swift.org/LICENSE.txt for license information
|
||||
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
|
||||
//
|
||||
//===----------------------------------------------------------------------===//
|
||||
|
||||
import GenUtils
|
||||
|
||||
/// Extracts the linking-consonant scalar ranges from the contents of
/// `IndicSyllabicCategory.txt`.
///
/// A line contributes a range only when its property is `Consonant` and the
/// line mentions one of the scripts the Indic grapheme breaking rule applies
/// to (Bengali, Devanagari, Gujarati, Oriya, Telugu, Malayalam). The script is
/// detected by looking for its upper-cased name anywhere in the line.
///
/// - Parameter data: The full text of the data file.
/// - Returns: The matching scalar ranges after being passed through `flatten`.
func getLinkingConsonant(
  from data: String
) -> [ClosedRange<UInt32>] {
  // Upper-cased once, because that is the spelling searched for in each line.
  let scripts = [
    "Bengali", "Devanagari", "Gujarati", "Oriya", "Telugu", "Malayalam",
  ].map { $0.uppercased() }

  var unflattened: [(ClosedRange<UInt32>, String)] = []

  for line in data.split(separator: "\n") {
    // Skip comments
    guard !line.hasPrefix("#") else {
      continue
    }

    let components = line.split(separator: ";")

    // A line without a `;` has no property field, so there is nothing to
    // inspect.
    guard components.count >= 2 else {
      continue
    }

    // Get the property first because it may be one we don't care about.
    guard let rawProperty = components[1].split(separator: "#").first else {
      continue
    }

    let filteredProperty = rawProperty.filter { !$0.isWhitespace }

    // We only care about linking consonants, which are defined as 'Consonant'.
    guard filteredProperty == "Consonant" else {
      continue
    }

    // This rule only applies to the scripts listed above, so drop the line
    // unless it names one of them. (The check has to skip the *line*; a
    // `continue`/`break` inside a loop over the scripts would only affect
    // that inner loop and let every consonant through.)
    guard scripts.contains(where: { line.contains($0) }) else {
      continue
    }

    let scalars: ClosedRange<UInt32>

    let filteredScalars = components[0].filter { !$0.isWhitespace }

    // If we have . appear, it means we have a legitimate range. Otherwise,
    // it's a singular scalar.
    if filteredScalars.contains(".") {
      let range = filteredScalars.split(separator: ".")

      scalars = UInt32(range[0], radix: 16)! ... UInt32(range[1], radix: 16)!
    } else {
      let scalar = UInt32(filteredScalars, radix: 16)!

      scalars = scalar ... scalar
    }

    unflattened.append((scalars, "Consonant"))
  }

  return flatten(unflattened).map { $0.0 }
}
|
||||
|
||||
/// Appends the linking-consonant lookup tables to `result` as two C arrays.
///
/// The scalar space is cut into 64 * 64 equally sized chunks. The emitted
/// `_swift_stdlib_linkingConsonant` array is laid out as:
///
///   [quick look word count] [quick look words] [words of each populated chunk]
///
/// where the quick look holds one bit per chunk that contains at least one
/// scalar from `data`. `_swift_stdlib_linkingConsonant_ranks` holds, for the
/// quick look words and then for each populated chunk's words, the running
/// count of set bits in the words preceding that position.
///
/// - Parameters:
///   - data: The scalar ranges to encode.
///   - result: The generated source text to append to.
func emitLinkingConsonant(
  _ data: [ClosedRange<UInt32>],
  into result: inout String
) {
  let chunkSize = 0x110000 / 64 / 64

  // True when any of the supplied ranges covers `scalar`.
  let isLinkingConsonant: (Int) -> Bool = { scalar in
    data.contains { $0.contains(UInt32(scalar)) }
  }

  // 64 bit arrays * 8 bytes = .512 KB
  var table = [BitArray](repeating: BitArray(size: 64), count: 64)

  // Chunks that contain at least one scalar of interest, in ascending order.
  var populatedChunks: [Int] = []

  for chunk in 0 ..< 64 * 64 {
    let base = chunk * chunkSize

    guard (base ..< base + chunkSize).contains(where: isLinkingConsonant) else {
      continue
    }

    populatedChunks.append(chunk)
    table[chunk / 64][chunk % 64] = true
  }

  // Remove the trailing 0s to shrink the quick look.
  while let last = table.last, last.words == [0x0] {
    table.removeLast()
  }

  // Keep a record of every rank for all the bitarrays.
  var ranks: [UInt16] = []

  // Record our quick look ranks: each entry is the number of set bits in all
  // quick look words before it.
  var quickLookRank: UInt16 = 0

  for position in table.indices {
    if position > 0 {
      quickLookRank += UInt16(table[position - 1].words[0].nonzeroBitCount)
    }

    ranks.append(quickLookRank)
  }

  // Insert our quick look size at the beginning.
  var sizeWord = BitArray(size: 64)
  sizeWord.words = [UInt64(table.count)]
  table.insert(sizeWord, at: 0)

  for chunk in populatedChunks {
    var chunkBits = BitArray(size: chunkSize)
    let base = chunk * chunkSize

    for offset in 0 ..< chunkSize where isLinkingConsonant(base + offset) {
      chunkBits[offset] = true
    }

    // Append our chunk bit array's rank, restarting the count for each chunk.
    var chunkRank: UInt16 = 0

    for position in chunkBits.words.indices {
      if position > 0 {
        chunkRank += UInt16(chunkBits.words[position - 1].nonzeroBitCount)
      }

      ranks.append(chunkRank)
    }

    // Store each word of the chunk as its own 64 bit entry.
    for word in chunkBits.words {
      var wordBits = BitArray(size: 64)
      wordBits.words = [word]
      table.append(wordBits)
    }
  }

  emitCollection(
    ranks,
    name: "_swift_stdlib_linkingConsonant_ranks",
    into: &result
  )

  emitCollection(
    table,
    name: "_swift_stdlib_linkingConsonant",
    type: "__swift_uint64_t",
    into: &result
  ) {
    assert($0.words.count == 1)
    return "0x\(String($0.words[0], radix: 16, uppercase: true))"
  }
}
|
||||
@@ -48,37 +48,6 @@ extension Unicode {
|
||||
}
|
||||
}
|
||||
|
||||
// Merges neighbouring scalar ranges that carry the same grapheme break
// property. The input is first ordered by each range's lower bound; a range is
// folded into the previous result entry only when both share a property and
// the range starts exactly one past the previous upper bound. E.g:
//
// 0x0 ... 0xA = .control
// 0xB ... 0xB = .control
// 0xC ... 0x1F = .control
//
// becomes:
//
// 0x0 ... 0x1F = .control
func flatten(
  _ unflattened: [(ClosedRange<UInt32>, Unicode.GraphemeBreakProperty)]
) -> [(ClosedRange<UInt32>, Unicode.GraphemeBreakProperty)] {
  let ordered = unflattened.sorted { $0.0.lowerBound < $1.0.lowerBound }

  var merged: [(ClosedRange<UInt32>, Unicode.GraphemeBreakProperty)] = []

  for (range, property) in ordered {
    if let previous = merged.last,
       previous.1 == property,
       range.lowerBound == previous.0.upperBound + 1 {
      // Directly adjacent with the same property: extend the previous entry.
      merged[merged.count - 1].0 = previous.0.lowerBound ... range.upperBound
    } else {
      merged.append((range, property))
    }
  }

  return merged
}
|
||||
|
||||
// Given a path to one of the Unicode data files, reads it and returns the
|
||||
// unflattened list of scalar & grapheme break property.
|
||||
//
|
||||
@@ -150,7 +119,9 @@ func emit(
|
||||
into result: inout String
|
||||
) {
|
||||
result += """
|
||||
static __swift_uint32_t _swift_stdlib_graphemeBreakProperties[\(data.count)] = {
|
||||
#define GRAPHEME_BREAK_DATA_COUNT \(data.count)
|
||||
|
||||
static const __swift_uint32_t _swift_stdlib_graphemeBreakProperties[\(data.count)] = {
|
||||
|
||||
"""
|
||||
|
||||
@@ -181,69 +152,20 @@ func emit(
|
||||
value |= 1 << 31
|
||||
}
|
||||
|
||||
return "0x\(String(value, radix: 16))"
|
||||
return "0x\(String(value, radix: 16, uppercase: true))"
|
||||
}
|
||||
|
||||
result += "\n};\n\n"
|
||||
}
|
||||
|
||||
// Writes the stdlib internal routine for binary searching the grapheme array.
//
// Appends the C++ definition of `_swift_stdlib_getGraphemeBreakProperty` to
// `result`. `dataCount` is interpolated as the number of entries in
// `_swift_stdlib_graphemeBreakProperties`, fixing the upper bound of the
// search.
func emitAccessor(
  _ dataCount: Int,
  into result: inout String
) {
  result += """
  SWIFT_RUNTIME_STDLIB_INTERNAL
  __swift_uint8_t _swift_stdlib_getGraphemeBreakProperty(__swift_uint32_t scalar) {
    auto low = 0;
    auto high = \(dataCount) - 1;

    while (high >= low) {
      auto idx = low + (high - low) / 2;

      auto entry = _swift_stdlib_graphemeBreakProperties[idx];

      // Shift the enum and range count out of the value.
      auto lower = (entry << 11) >> 11;

      // Shift the enum out first, then shift out the scalar value.
      auto upper = lower + ((entry << 3) >> 24);

      // Shift everything out.
      auto enumValue = (__swift_uint8_t)(entry >> 29);

      // Special case: extendedPictographic who used an extra bit for the range.
      if (enumValue == 5) {
        upper = lower + ((entry << 2) >> 23);
      }

      if (scalar >= lower && scalar <= upper) {
        return enumValue;
      }

      if (scalar > upper) {
        low = idx + 1;
        continue;
      }

      if (scalar < lower) {
        high = idx - 1;
        continue;
      }
    }

    // If we made it out here, then our scalar was not found in the grapheme
    // array (this occurs when a scalar doesn't map to any grapheme break
    // property). Return the max value here to indicate .any.
    return 0xFF;
  }

  """
}
|
||||
|
||||
// Main entry point into the grapheme break property generator.
|
||||
func generateGraphemeBreakProperty() {
|
||||
var result = readFile("Input/UnicodeGrapheme.cpp")
|
||||
var result = readFile("Input/GraphemeData.h")
|
||||
|
||||
let baseData = getGraphemeBreakPropertyData(
|
||||
for: "Data/GraphemeBreakProperty.txt"
|
||||
@@ -268,9 +190,20 @@ func generateGraphemeBreakProperty() {
|
||||
|
||||
emit(data, into: &result)
|
||||
|
||||
emitAccessor(data.count, into: &result)
|
||||
// Handle the CLDR grapheme breaking rules:
|
||||
|
||||
write(result, to: "Output/UnicodeGrapheme.cpp")
|
||||
let indicSyllabicCategory = readFile("Data/IndicSyllabicCategory.txt")
|
||||
|
||||
let consonants = getLinkingConsonant(from: indicSyllabicCategory)
|
||||
|
||||
emitLinkingConsonant(consonants, into: &result)
|
||||
|
||||
result += """
|
||||
#endif // #ifndef GRAPHEME_DATA_H
|
||||
|
||||
"""
|
||||
|
||||
write(result, to: "Output/Common/GraphemeData.h")
|
||||
}
|
||||
|
||||
generateGraphemeBreakProperty()
|
||||
|
||||
@@ -2280,4 +2280,58 @@ StringTests.test("NormalizationCheck/Opaque")
|
||||
#endif
|
||||
}
|
||||
|
||||
/// Walks `string` backwards from `endIndex` to `startIndex`, one `Character`
/// at a time, and expects the number of steps taken to equal `count`.
///
/// - Parameters:
///   - count: The expected number of characters.
///   - string: The string to walk.
func expectBidirectionalCount(_ count: Int, _ string: String) {
  var stepsTaken = 0
  var index = string.endIndex

  while index != string.startIndex {
    stepsTaken += 1
    string.formIndex(before: &index)
  }

  // Expected value first, then the measured one, so that a failure report
  // labels the two numbers correctly.
  expectEqual(count, stepsTaken)
}
|
||||
|
||||
// Checks the character count of short Devanagari sequences, both through
// `count` and by walking the string backwards. Scalars used below: U+0915,
// U+0924 and U+092F are consonants, U+094D is the virama, U+093C is a
// combining sign, U+200D is the zero width joiner, U+0061 and U+003F are ASCII.
StringTests.test("GraphemeBreaking.Indic Sequences") {
  // Two consonants with nothing linking them stay separate.
  let twoConsonants = "\u{0915}\u{0924}" // 2
  expectEqual(2, twoConsonants.count)
  expectBidirectionalCount(2, twoConsonants)

  // A virama between two consonants joins them.
  let joinedByVirama = "\u{0915}\u{094D}\u{0924}" // 1
  expectEqual(1, joinedByVirama.count)
  expectBidirectionalCount(1, joinedByVirama)

  // More than one virama is still a single cluster.
  let doubleVirama = "\u{0915}\u{094D}\u{094D}\u{0924}" // 1
  expectEqual(1, doubleVirama.count)
  expectBidirectionalCount(1, doubleVirama)

  // A zero width joiner after the virama does not split the cluster.
  let viramaThenJoiner = "\u{0915}\u{094D}\u{200D}\u{0924}" // 1
  expectEqual(1, viramaThenJoiner.count)
  expectBidirectionalCount(1, viramaThenJoiner)

  // Combining sign and joiner ahead of the virama.
  let signJoinerVirama = "\u{0915}\u{093C}\u{200D}\u{094D}\u{0924}" // 1
  expectEqual(1, signJoinerVirama.count)
  expectBidirectionalCount(1, signJoinerVirama)

  // Combining sign, virama, then joiner.
  let signViramaJoiner = "\u{0915}\u{093C}\u{094D}\u{200D}\u{0924}" // 1
  expectEqual(1, signViramaJoiner.count)
  expectBidirectionalCount(1, signViramaJoiner)

  // Three consonants chained by two viramas.
  let threeConsonantChain = "\u{0915}\u{094D}\u{0924}\u{094D}\u{092F}" // 1
  expectEqual(1, threeConsonantChain.count)
  expectBidirectionalCount(1, threeConsonantChain)

  // The scalar after the virama is not a linking consonant.
  let latinAfterVirama = "\u{0915}\u{094D}\u{0061}" // 2
  expectEqual(2, latinAfterVirama.count)
  expectBidirectionalCount(2, latinAfterVirama)

  // The scalar before the virama is not a linking consonant.
  let latinBeforeVirama = "\u{0061}\u{094D}\u{0924}" // 2
  expectEqual(2, latinBeforeVirama.count)
  expectBidirectionalCount(2, latinBeforeVirama)

  // Same as above, starting with punctuation instead of a letter.
  let punctuationBeforeVirama = "\u{003F}\u{094D}\u{0924}" // 2
  expectEqual(2, punctuationBeforeVirama.count)
  expectBidirectionalCount(2, punctuationBeforeVirama)
}
|
||||
|
||||
runAllTests()
|
||||
|
||||
Reference in New Issue
Block a user