Files
swift-mirror/stdlib/public/core/UnicodeData.swift

194 lines
6.7 KiB
Swift

//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2021 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
import SwiftShims
// A scalar paired with the normalization data that was looked up for it.
internal typealias ScalarAndNormData =
  (scalar: Unicode.Scalar, normData: Unicode._NormData)
extension Unicode {
  // Wraps the 16 bit normalization data value we get back when looking up a
  // scalar's normalization information. Counting from the least significant
  // bit, the value is laid out as follows:
  //
  //   Bit  0       NFD_QC
  //   Bits 1 - 2   NFC_QC
  //   Bits 3 - 10  CCC
  //   Bits 11 - 15 Unused
  //
  // NFD_QC: A simple Yes/No NFD quick check answer, tied to whether the scalar
  //         has a canonical decomposition. Note: Yes is stored as 0 rather
  //         than 1, so a set bit means NFD_QC = No.
  //
  // NFC_QC: The NFC quick check answer, which is one of Yes/No/Maybe. Yes,
  //         represented as 0, means the scalar can NEVER compose with another
  //         scalar previous to it. No, represented as 1, means the scalar can
  //         NEVER appear within a well formed NFC string. Maybe, represented
  //         as 2, means the scalar could appear within an NFC string, but
  //         further information is required to determine if that is the
  //         case. At the moment, we really only care about Yes/No.
  //
  // CCC: The canonical combining class property of a scalar, used when sorting
  //      the scalars of a normalization segment after NFD computation. A
  //      scalar with a CCC value of 128 can NEVER appear before a scalar with
  //      a CCC value of 100, unless there are normalization boundaries
  //      between them.
  //
  internal struct _NormData {
    // The packed 16 bit value described above.
    var rawValue: UInt16

    // Wraps an already packed normalization data value as is.
    init(rawValue: UInt16) {
      self.rawValue = rawValue
    }

    // Looks up the normalization data for 'scalar'. Scalars whose value is
    // below 'fastUpperbound' skip the lookup entirely.
    init(_ scalar: Unicode.Scalar, fastUpperbound: UInt32 = 0xC0) {
      let value = scalar.value

      // Scalars below the fast upper bound are treated as CCC = 0,
      // NFC_QC = Yes and NFD_QC = Yes without consulting the data tables.
      if _fastPath(value < fastUpperbound) {
        rawValue = 0
        return
      }

      var bits: UInt16 = _swift_stdlib_getNormData(value)

      // Precomposed hangul isn't stored in our NFD_QC data, so the lookup
      // reports these scalars as NFD_QC = Yes when in fact they are not.
      // Force the NFD_QC bit on (NFD_QC = false) for that range.
      if value >= 0xAC00 && value <= 0xD7A3 {
        bits |= 0b1
      }

      rawValue = bits
    }

    // The canonical combining class, stored in the 8 bits above the two quick
    // check fields.
    var ccc: UInt8 {
      return UInt8(truncatingIfNeeded: rawValue >> 3)
    }

    // The same combining class value as 'ccc', wrapped in its dedicated type.
    var canonicalCombiningClass: Unicode.CanonicalCombiningClass {
      return .init(rawValue: ccc)
    }

    // Whether NFC_QC is Yes, i.e. both NFC_QC bits are clear.
    var isNFCQC: Bool {
      return (rawValue & 0b110) == 0
    }

    // Whether NFD_QC is Yes, i.e. the NFD_QC bit is clear.
    var isNFDQC: Bool {
      return (rawValue & 0b001) == 0
    }
  }
}
extension Unicode {
  // The buffer type used for normalization buffers in the NFC and NFD
  // iterators. Keeping the removal and sorting logic here keeps that buffer
  // bookkeeping out of the iterators themselves.
  internal struct _NormDataBuffer {
    // The buffered scalars. Once draining has begun this is held in reverse
    // order (see 'isReversed').
    var storage: [ScalarAndNormData] = []

    // A marker denoting that the storage has been built up and everything in
    // it now needs to be emitted. To emit, the storage is reversed and
    // elements are popped off the back as a way to remove them.
    var isReversed = false

    var isEmpty: Bool {
      return storage.isEmpty
    }

    var last: ScalarAndNormData? {
      return storage.last
    }

    // Adds an element to the end of the buffer. This must not be called while
    // the buffer is being drained via 'next()'.
    mutating func append(_ scalarAndNormData: ScalarAndNormData) {
      _internalInvariant(!isReversed)
      storage.append(scalarAndNormData)
    }

    // Removes and returns the first element of the buffer, or 'nil' when the
    // buffer is empty. Note: it is not safe to append to the buffer once this
    // has been called. The storage is reversed internally so that everything
    // can be emitted, which means an append would land at the wrong location.
    // Keep calling this function until it returns 'nil' before appending
    // again; only that 'nil' return clears the reversed marker.
    mutating func next() -> ScalarAndNormData? {
      if storage.isEmpty {
        isReversed = false
        return nil
      }

      // Flip the storage on the first removal so that the logical front of
      // the buffer sits at the back of the array.
      if !isReversed {
        storage.reverse()
        isReversed = true
      }

      return storage.removeLast()
    }

    // Sorts the whole buffer by canonical combining class.
    mutating func sort() {
      storage._insertionSort(within: storage.indices) { lhs, rhs in
        lhs.normData.ccc < rhs.normData.ccc
      }
    }
  }
}
extension Unicode {
  // Wraps the 32 bit decomposition entry value we get back when looking up a
  // scalar's canonical decomposition. The value is laid out as follows:
  //
  //   Top 14 bits     Index
  //   Bottom 18 bits  Hashed Scalar
  //
  // Index: The direct index into '_swift_stdlib_nfd_decompositions'. It
  //        points at a size byte giving the overall size of the UTF-8
  //        decomposition string, and the string itself follows that byte.
  //
  // Hashed Scalar: Because perfect hashing doesn't know the original set of
  //                keys it was hashed with, the original scalar is stored in
  //                the decomposition entry so that we can guard against
  //                scalars who happen to hash to the same index.
  //
  internal struct _DecompositionEntry {
    // The packed 32 bit entry described above.
    let rawValue: UInt32

    // Fetches the decomposition entry for 'scalar'.
    init(_ scalar: Unicode.Scalar) {
      rawValue = _swift_stdlib_getDecompositionEntry(scalar.value)
    }

    // The original scalar, stored in the low 18 bits of this entry.
    var hashedScalar: Unicode.Scalar {
      // 0x3FFFF masks off everything but the low 18 bits.
      return Unicode.Scalar(_value: rawValue & 0x3FFFF)
    }

    // The index into the decomposition array, stored in the top 14 bits.
    var index: Int {
      return Int(truncatingIfNeeded: rawValue >> 18)
    }

    // A buffer pointer to the UTF8 decomposition string.
    var utf8: UnsafeBufferPointer<UInt8> {
      let table = unsafe _swift_stdlib_nfd_decompositions._unsafelyUnwrappedUnchecked
      let sizeOffset = index

      // The byte at the index is the utf8 length of the decomposition.
      let length = unsafe Int(truncatingIfNeeded: table[sizeOffset])

      // Step over the size byte to reach the first byte of the string.
      let start = unsafe table + sizeOffset + 1

      return unsafe UnsafeBufferPointer(start: start, count: length)
    }
  }
}