Files
swift-mirror/stdlib/public/core/NFD.swift
Alejandro Alonso 014e822cb2 Address Michael's comments
fix infinite recursion bug

NFC: Remove early ccc check

remember that false is turned on
2021-09-29 14:20:22 -07:00

182 lines
5.8 KiB
Swift

//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2021 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
extension Unicode {
  /// A wrapper around a string-like value that, through the `Sequence`
  /// conformance declared elsewhere in this file, vends the value's scalars
  /// in canonical decomposed form (NFD).
  internal struct _NFD<S> where S: StringProtocol {
    /// The string whose unicode scalars are decomposed.
    let base: S
  }
}
extension Unicode._NFD {
  /// Iteration state for walking the scalars of `base` in decomposed order.
  internal struct Iterator {
    /// Holds the scalars (with their normalization data) of the segment that
    /// is currently being emitted.
    var buffer: Unicode._NormDataBuffer = Unicode._NormDataBuffer()

    /// Position of the next scalar to read from `unicodeScalars`.
    ///
    /// Between calls to `next()` this always points at the next starter of a
    /// normalization segment; each call moves it up to the following starter.
    var index: S.UnicodeScalarView.Index

    /// The scalar view that is being decomposed.
    let unicodeScalars: S.UnicodeScalarView
  }
}
extension Unicode._NFD.Iterator: IteratorProtocol {
  /// Appends the decomposition of `scalar` to `buffer`, choosing between the
  /// pass-through, Hangul and table-lookup strategies.
  ///
  /// - Parameters:
  ///   - scalar: The scalar to decompose.
  ///   - normData: Normalization data of `scalar`; it is stored alongside the
  ///     scalar whenever the scalar is appended unchanged.
  internal mutating func decompose(
    _ scalar: Unicode.Scalar,
    with normData: Unicode._NormData
  ) {
    let value = scalar.value

    // Scalars below U+00C0 (which covers all of ASCII) decompose to
    // themselves, so they are appended as is together with the normData the
    // caller supplied (CCC = 0, NFC_QC = Yes, NFD_QC = Yes for ASCII).
    if _fastPath(value < 0xC0) {
      buffer.append((scalar, normData))
      return
    }

    switch value {
    case 0xAC00 ... 0xD7A3:
      // Precomposed Hangul syllables: 0xAC00 is the syllable base and
      // 0xD7A3 = base + 11172 - 1 is the last syllable. These are decomposed
      // arithmetically instead of through a table.
      decomposeHangul(scalar)

    default:
      // Anything else requires looking up the decomposition (if any).
      decomposeSlow(scalar, with: normData)
    }
  }

  /// Appends the jamo that make up the precomposed Hangul syllable `scalar`
  /// to `buffer`: a leading consonant, a vowel and, if the syllable has one,
  /// a tail consonant.
  ///
  /// - Parameter scalar: A scalar in `0xAC00 ... 0xD7A3`. The arithmetic is
  ///   wrapping and unchecked, so the caller is responsible for the range.
  @inline(never)
  internal mutating func decomposeHangul(_ scalar: Unicode.Scalar) {
    // First precomposed syllable.
    let syllableBase: UInt32 = 0xAC00
    // First leading consonant (L).
    let leadingBase: UInt32 = 0x1100
    // First vowel (V).
    let vowelBase: UInt32 = 0x1161
    // One before the first tail consonant (T); a tail index of 0 means
    // "no tail consonant".
    let tailBase: UInt32 = 0x11A7
    // Number of tail slots per leading/vowel pair.
    let tailCount: UInt32 = 28
    // Number of precomposed syllables that share the same leading consonant
    // (21 vowels * 28 tail slots).
    let syllablesPerLeading: UInt32 = 588

    let syllableIndex = scalar.value &- syllableBase
    let leadingIndex = syllableIndex / syllablesPerLeading
    let vowelIndex = (syllableIndex % syllablesPerLeading) / tailCount
    let tailIndex = syllableIndex % tailCount

    // Leading consonants always have normData of 0.
    // CCC = 0, NFC_QC = Yes, NFD_QC = Yes
    let leading = Unicode.Scalar(_value: leadingBase &+ leadingIndex)
    buffer.append((scalar: leading, normData: .init(rawValue: 0)))

    // Vowels always have normData of 4.
    // CCC = 0, NFC_QC = Maybe, NFD_QC = Yes
    let vowel = Unicode.Scalar(_value: vowelBase &+ vowelIndex)
    buffer.append((scalar: vowel, normData: .init(rawValue: 4)))

    // Syllables without a tail consonant decompose to just two jamo.
    guard tailIndex != 0 else {
      return
    }

    // Tail consonants always have normData of 4.
    // CCC = 0, NFC_QC = Maybe, NFD_QC = Yes
    let tail = Unicode.Scalar(_value: tailBase &+ tailIndex)
    buffer.append((scalar: tail, normData: .init(rawValue: 4)))
  }

  /// Looks `scalar` up in the decomposition table and appends either its
  /// decomposition or, when the table has no entry for it, the scalar itself.
  ///
  /// - Parameters:
  ///   - scalar: The scalar to decompose.
  ///   - normData: Normalization data of `scalar`, used only when the scalar
  ///     is appended unchanged.
  @inline(never)
  internal mutating func decomposeSlow(
    _ scalar: Unicode.Scalar,
    with normData: Unicode._NormData
  ) {
    // Query the decomposition perfect hash table.
    let entry = Unicode._DecompositionEntry(scalar)

    // A perfect hash does not remember which keys it was built from, so every
    // entry records the scalar it belongs to. If that is not the scalar we
    // asked for, two scalars merely hashed to the same slot and there is no
    // decomposition for ours.
    let hasDecomposition = scalar == entry.hashedScalar

    if !hasDecomposition {
      buffer.append((scalar, normData))
      return
    }

    // The entry stores the decomposition as UTF-8; decode it one scalar at a
    // time, dropping the consumed bytes after each step.
    var remaining = entry.utf8

    while !remaining.isEmpty {
      let (decoded, width) = _decodeScalar(remaining, startingAt: 0)
      remaining = UnsafeBufferPointer(rebasing: remaining[width...])

      // Fast path: these scalars go straight into the finished NFD buffer, so
      // NFD_QC no longer matters. That permits the larger "latiny" bound for
      // NFC_QC and CCC (0x300 instead of 0xC0).
      let decodedData = Unicode._NormData(decoded, fastUpperbound: 0x300)

      buffer.append((decoded, decodedData))
    }
  }

  /// Returns the next scalar of the decomposed sequence together with its
  /// normalization data, or `nil` once the source scalars and the buffer are
  /// both exhausted.
  internal mutating func next() -> ScalarAndNormData? {
    // Drain what is left of the previous segment before reading any further
    // source scalars.
    if let pending = buffer.next() {
      return pending
    }

    while index < unicodeScalars.endIndex {
      let current = unicodeScalars[index]
      let currentData = Unicode._NormData(current)
      let isStarter = currentData.ccc == 0

      // A starter ends the segment collected so far; leave `index` on it so
      // that the following call begins there.
      //
      // NOTE(review): this test uses the ccc of the *undecomposed* scalar. If
      // a scalar with ccc 0 can decompose into scalars with non-zero ccc
      // (U+0F73 looks like such a case), its pieces start a new segment and
      // are not sorted together with preceding non-starters. Confirm against
      // the UAX #15 conformance tests.
      if isStarter && !buffer.isEmpty {
        break
      }

      index = unicodeScalars.index(after: index)

      // Scalars that fail the NFD quick check have to be decomposed, which
      // appends every resulting scalar to the buffer.
      guard currentData.isNFDQC else {
        decompose(current, with: currentData)
        continue
      }

      // Fast path: a starter that passes the quick check can only get here
      // with an empty buffer, so it is returned directly without buffering.
      if isStarter {
        return (current, currentData)
      }

      buffer.append((current, currentData))
    }

    // Order the collected segment by canonical combining class, then start
    // handing it out.
    buffer.sort()

    return buffer.next()
  }
}
extension Unicode._NFD: Sequence {
  /// Creates an iterator positioned at the first unicode scalar of `base`.
  internal func makeIterator() -> Iterator {
    let scalars = base.unicodeScalars
    return Iterator(index: scalars.startIndex, unicodeScalars: scalars)
  }
}
extension StringProtocol {
  /// This string wrapped in a `Unicode._NFD` value.
  internal var _nfd: Unicode._NFD<Self> {
    get {
      return .init(base: self)
    }
  }
}