mirror of
https://github.com/apple/swift.git
synced 2025-12-14 20:36:38 +01:00
182 lines
5.8 KiB
Swift
182 lines
5.8 KiB
Swift
//===----------------------------------------------------------------------===//
|
|
//
|
|
// This source file is part of the Swift.org open source project
|
|
//
|
|
// Copyright (c) 2021 Apple Inc. and the Swift project authors
|
|
// Licensed under Apache License v2.0 with Runtime Library Exception
|
|
//
|
|
// See https://swift.org/LICENSE.txt for license information
|
|
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
extension Unicode {
  /// A thin wrapper that stores a string-like value so it can be traversed
  /// in NFD (presumably Unicode Normalization Form D) order.
  ///
  /// The struct itself holds nothing but `base`. Elsewhere in this file it is
  /// made a `Sequence` whose iterator walks `base.unicodeScalars`.
  internal struct _NFD<S: StringProtocol> {
    /// The wrapped string (or substring) whose scalars are to be visited.
    let base: S
  }
}
|
|
|
|
extension Unicode._NFD {
  /// The mutable state used while stepping through the scalars of the wrapped
  /// string.
  ///
  /// The stored properties are declared in the order `buffer`, `index`,
  /// `unicodeScalars`; the synthesized memberwise initializer depends on
  /// that order.
  internal struct Iterator {
    /// Scalars, paired with their normalization data, that have been
    /// collected but not yet returned to the caller.
    var buffer: Unicode._NormDataBuffer = .init()

    /// Position of the next scalar to read from `unicodeScalars`.
    ///
    /// This index always points at the next starter of a normalization
    /// segment; each call to `next()` moves it up to the following starter.
    var index: S.UnicodeScalarView.Index

    /// The scalar view being traversed. It is never mutated after creation.
    let unicodeScalars: S.UnicodeScalarView
  }
}
|
|
|
|
extension Unicode._NFD.Iterator: IteratorProtocol {
  /// Appends the decomposition of `scalar` to `buffer`.
  ///
  /// `next()` calls this for scalars whose normalization data reports that
  /// they are not NFD quick-check "yes". One of three strategies is chosen
  /// from the scalar's value:
  ///
  /// 1. below U+00C0: the scalar is appended unchanged,
  /// 2. U+AC00 ... U+D7A3: the scalar is split arithmetically by
  ///    `decomposeHangul(_:)`,
  /// 3. anything else: the decomposition table is consulted by
  ///    `decomposeSlow(_:with:)`.
  ///
  /// - Parameters:
  ///   - scalar: The scalar to decompose.
  ///   - normData: The normalization data already looked up for `scalar`.
  internal mutating func decompose(
    _ scalar: Unicode.Scalar,
    with normData: Unicode._NormData
  ) {
    let value = scalar.value

    // Everything below U+00C0 (ASCII and the low half of Latin-1) is emitted
    // as-is, together with the normalization data the caller supplied.
    if _fastPath(value < 0xC0) {
      buffer.append((scalar, normData))
      return
    }

    // Precomposed Hangul syllables occupy one contiguous range:
    //   first = 0xAC00, count = 11172, last = 0xAC00 + 11172 - 1 = 0xD7A3.
    let isPrecomposedHangul = value >= 0xAC00 && value <= 0xD7A3

    guard isPrecomposedHangul else {
      // Not Hangul: look the decomposition up (there may not be one).
      decomposeSlow(scalar, with: normData)
      return
    }

    decomposeHangul(scalar)
  }

  /// Splits a precomposed Hangul syllable into its jamo and appends them to
  /// `buffer`.
  ///
  /// The caller (`decompose(_:with:)`) only passes scalars in
  /// U+AC00 ... U+D7A3. The index arithmetic below uses wrapping operators
  /// and does not re-check that range.
  ///
  /// - Parameter scalar: A precomposed Hangul syllable.
  @inline(never)
  internal mutating func decomposeHangul(_ scalar: Unicode.Scalar) {
    // First leading consonant (L), first vowel (V), and the scalar just
    // before the first tail consonant (T). A tail index of 0 means
    // "no tail".
    let leadingBase: UInt32 = 0x1100
    let vowelBase: UInt32 = 0x1161
    let tailBase: UInt32 = 0x11A7

    // Number of tail slots per vowel, including the "no tail" slot.
    let tailCount: UInt32 = 28
    // Number of precomposed syllables sharing one leading consonant.
    let syllablesPerLeading: UInt32 = 588
    // First precomposed syllable.
    let syllableBase: UInt32 = 0xAC00

    let syllableIndex = scalar.value &- syllableBase
    let leadingIndex = syllableIndex / syllablesPerLeading
    let vowelIndex = (syllableIndex % syllablesPerLeading) / tailCount
    let tailIndex = syllableIndex % tailCount

    // Hangul leading consonants always have normData of 0.
    // CCC = 0, NFC_QC = Yes, NFD_QC = Yes
    let leading = Unicode.Scalar(_value: leadingBase &+ leadingIndex)
    buffer.append((scalar: leading, normData: .init(rawValue: 0)))

    // Hangul vowels always have normData of 4.
    // CCC = 0, NFC_QC = Maybe, NFD_QC = Yes
    let vowel = Unicode.Scalar(_value: vowelBase &+ vowelIndex)
    buffer.append((scalar: vowel, normData: .init(rawValue: 4)))

    // A zero tail index means this is an LV syllable with nothing more to
    // emit.
    guard tailIndex != 0 else {
      return
    }

    // Hangul tail consonants always have normData of 4.
    // CCC = 0, NFC_QC = Maybe, NFD_QC = Yes
    let tail = Unicode.Scalar(_value: tailBase &+ tailIndex)
    buffer.append((scalar: tail, normData: .init(rawValue: 4)))
  }

  /// Looks `scalar` up in the decomposition table and appends the result to
  /// `buffer`.
  ///
  /// If the table has no entry for `scalar`, the scalar itself is appended
  /// with the supplied `normData`.
  ///
  /// - Parameters:
  ///   - scalar: The scalar to decompose.
  ///   - normData: The normalization data already looked up for `scalar`;
  ///     only used when no decomposition exists.
  @inline(never)
  internal mutating func decomposeSlow(
    _ scalar: Unicode.Scalar,
    with normData: Unicode._NormData
  ) {
    // Probe the decomposition perfect hash table.
    let entry = Unicode._DecompositionEntry(scalar)

    // A perfect hash cannot tell whether a key belonged to the set it was
    // built from, so every entry records the scalar it was created for. If
    // that is not the scalar we asked about, we merely collided with someone
    // else's slot: there is no decomposition and the scalar is emitted as-is.
    guard scalar == entry.hashedScalar else {
      buffer.append((scalar, normData))
      return
    }

    // The entry's payload is UTF-8; decode it one scalar at a time, shrinking
    // the window from the front after each step.
    var remaining = entry.utf8

    while !remaining.isEmpty {
      let (decoded, length) = _decodeScalar(remaining, startingAt: 0)
      remaining = UnsafeBufferPointer(rebasing: remaining[length...])

      // Fast path: these scalars go straight into the finished NFD buffer,
      // so NFD_QC no longer matters. That permits the wider "latiny" bound
      // for NFC_QC and CCC (0x300 rather than 0xC0).
      let decodedData = Unicode._NormData(decoded, fastUpperbound: 0x300)

      buffer.append((decoded, decodedData))
    }
  }

  /// Produces the next scalar along with its normalization data.
  ///
  /// Buffered results from an earlier call are returned first. Otherwise
  /// scalars are read from `unicodeScalars` until the next starter
  /// (`ccc == 0`) is reached, collected into `buffer`, sorted, and the first
  /// of them is returned.
  ///
  /// - Returns: The next scalar and its normalization data, or whatever
  ///   `buffer.next()` yields for an empty buffer once the input is exhausted
  ///   (presumably `nil`).
  internal mutating func next() -> ScalarAndNormData? {
    // Hand out anything left over from the previous segment before reading
    // more input.
    if let pending = buffer.next() {
      return pending
    }

    let end = unicodeScalars.endIndex

    while index < end {
      let scalar = unicodeScalars[index]
      let normData = Unicode._NormData(scalar)
      let isStarter = normData.ccc == 0

      // A starter ends the segment collected so far. Leave `index` on it so
      // the following call begins there.
      if isStarter && !buffer.isEmpty {
        break
      }

      unicodeScalars.formIndex(after: &index)

      // Scalars that are not NFD quick-check "yes" must be decomposed, with
      // every resulting scalar appended to the buffer.
      guard normData.isNFDQC else {
        decompose(scalar, with: normData)
        continue
      }

      // Fast path: a quick-check "yes" starter can be returned directly
      // without touching the buffer at all.
      if isStarter {
        return (scalar, normData)
      }

      buffer.append((scalar, normData))
    }

    // Put the collected scalars into order (by canonical combining class,
    // per the original note on this call) before handing out the first one.
    buffer.sort()

    return buffer.next()
  }
}
|
|
|
|
extension Unicode._NFD: Sequence {
  /// Creates an iterator positioned at the first scalar of `base`.
  ///
  /// - Returns: An `Iterator` over `base.unicodeScalars` whose `index` is
  ///   that view's `startIndex`.
  internal func makeIterator() -> Iterator {
    let scalars = base.unicodeScalars

    return Iterator(index: scalars.startIndex, unicodeScalars: scalars)
  }
}
|
|
|
|
extension StringProtocol {
  /// A `Unicode._NFD` wrapper whose `base` is this string.
  internal var _nfd: Unicode._NFD<Self> {
    get {
      return Unicode._NFD(base: self)
    }
  }
}
|