mirror of
https://github.com/apple/swift.git
synced 2025-12-14 20:36:38 +01:00
384 lines
12 KiB
Swift
384 lines
12 KiB
Swift
//===----------------------------------------------------------------------===//
|
|
//
|
|
// This source file is part of the Swift.org open source project
|
|
//
|
|
// Copyright (c) 2021 Apple Inc. and the Swift project authors
|
|
// Licensed under Apache License v2.0 with Runtime Library Exception
|
|
//
|
|
// See https://swift.org/LICENSE.txt for license information
|
|
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
extension Sequence where Element == Unicode.Scalar {

  /// A lazy view of this scalar sequence in Unicode Normalization Form D
  /// (canonical decomposition).
  internal var _internalNFD: Unicode._InternalNFD<Self> {
    return Unicode._InternalNFD(self)
  }
}
|
|
|
|
extension Unicode {

  /// The contents of the source sequence, in Normalization Form D.
  ///
  /// Normalization to NFD preserves canonical equivalence.
  internal struct _InternalNFD<Source: Sequence<Unicode.Scalar>> {

    /// The scalar sequence whose normalized contents this view presents.
    internal let source: Source

    /// Creates an NFD view over the given scalar sequence.
    internal init(_ source: Source) {
      self.source = source
    }
  }
}
|
|
|
|
extension Unicode._InternalNFD: Sequence {

  /// An iterator that lazily produces the source's scalars in NFD,
  /// pairing the source's iterator with a stateful normalizer.
  internal struct Iterator: IteratorProtocol {

    internal var source: Source.Iterator
    internal var normalizer: Unicode._NFDNormalizer

    internal init(source: Source.Iterator) {
      self.source = source
      self.normalizer = Unicode._NFDNormalizer()
    }

    internal mutating func next() -> Unicode.Scalar? {
      // Draw normalized scalars while the source lasts; once the source
      // is exhausted (`resume` returns nil), drain the normalizer's
      // remaining buffered content.
      return normalizer.resume(consuming: &source) ?? normalizer.flush()
    }
  }

  internal consuming func makeIterator() -> Iterator {
    return Iterator(source: source.makeIterator())
  }
}
|
|
|
|
/// `_InternalNFD` stores only the source sequence, so it is `Sendable`
/// exactly when the source is.
extension Unicode._InternalNFD: Sendable where Source: Sendable {}

/// The iterator stores the source's iterator and a `Unicode._NFDNormalizer`
/// (itself unconditionally `Sendable`), so sendability is conditional only
/// on the source's iterator.
extension Unicode._InternalNFD.Iterator: Sendable where Source.Iterator: Sendable {}
|
|
|
|
extension Unicode {

  /// A stateful normalizer, producing a single logical stream
  /// of normalized text from chunked inputs.
  ///
  /// To use the normalizer, first create an instance.
  /// Next, feed it a chunk of a text stream using the `resume(consuming:)`
  /// function. The normalizer will consume from the stream and buffer
  /// it as needed, so continue feeding the same source until
  /// it returns `nil`, indicating that the source was exhausted.
  ///
  /// ```swift
  /// var normalizer = Unicode.NFDNormalizer()
  ///
  /// var input: some IteratorProtocol<Unicode.Scalar> = ...
  /// while let scalar = normalizer.resume(consuming: &input) {
  ///   print(scalar)
  /// }
  ///
  /// // assert(input.next() == nil)
  /// ```
  ///
  /// You may continue consuming sources until you reach the end
  /// of the logical text stream. Once you reach the end,
  /// call `flush()` to drain any remaining content
  /// from the normalizer's buffers.
  ///
  /// ```swift
  /// while let scalar = normalizer.flush() {
  ///   print(scalar)
  /// }
  /// ```
  ///
  /// The chunks of input text do not need to be aligned on any normalization
  /// boundary. The normalizer state has value semantics, so it can be
  /// copied and stored, and is inherently thread-safe.
  ///
  internal struct _NFDNormalizer: Sendable {

    /// The phase of the normalizer's internal state machine.
    internal enum State {
      /// Draining a fully-decomposed, canonically-sorted segment
      /// from `buffer`.
      case emittingSegment
      /// Consuming scalars from the source and decomposing them.
      case consuming
      /// `flush()` has been called; the logical stream has ended.
      case terminated
    }

    internal var state = State.consuming

    /// Decomposed scalars (with their normalization data) belonging to the
    /// segment currently being built or emitted.
    internal var buffer = Unicode._NormDataBuffer()

    /// A starter scalar that begins the *next* segment, stashed while the
    /// buffered segment is sorted and emitted.
    internal var pendingStarter = Optional<ScalarAndNormData>.none

    /// Whether `buffer` has been sorted by canonical combining class.
    internal var bufferIsSorted = false

    /// Creates a new normalizer.
    ///
    internal init() { }

    /// Resume normalizing the text stream.
    ///
    /// Each call to `resume` returns the next scalar in the normalized output,
    /// consuming elements from the given source as necessary.
    ///
    /// If the normalizer returns `nil`, the source was exhausted.
    /// Once a source is exhausted, you may:
    ///
    /// - Call `resume` again some time later with a different source
    ///   to continue processing the same logical text stream, or
    ///
    /// - Call `flush` in order to mark the end of the stream
    ///   and consume data remaining in the normalizer's internal buffers.
    ///
    /// Typical usage looks like the following:
    ///
    /// ```swift
    /// var normalizer = Unicode.NFDNormalizer()
    ///
    /// var input: some IteratorProtocol<Unicode.Scalar> = ...
    /// while let scalar = normalizer.resume(consuming: &input) {
    ///   print(scalar)
    /// }
    ///
    /// // We could resume again, consuming from another input here.
    /// // Finally, when we are done consuming inputs:
    ///
    /// while let scalar = normalizer.flush() {
    ///   print(scalar)
    /// }
    /// ```
    ///
    /// The normalizer consumes data from the source as needed,
    /// meaning even if a call to `resume` returns a value,
    /// that value may have come from the normalizer's internal buffers
    /// without consuming the input source at all.
    ///
    /// Be careful to ensure each input source has been fully consumed
    /// before moving on to the next source (marked by `resume` returning `nil`).
    ///
    internal mutating func resume(
      consuming source: inout some IteratorProtocol<Unicode.Scalar>
    ) -> Unicode.Scalar? {
      resume(consuming: { source.next() })
    }

    // Intended ABI barrier for
    // resume(consuming: inout some IteratorProtocol<Unicode.Scalar>)
    // when it becomes public.
    internal mutating func resume(
      consuming nextFromSource: () -> Unicode.Scalar?
    ) -> Unicode.Scalar? {
      _resume(consuming: nextFromSource)?.scalar
    }

    /// Marks the end of the text stream and
    /// returns the next scalar from the normalizer's internal buffer.
    ///
    /// Once you have finished feeding input data to the normalizer,
    /// call `flush` until it returns `nil`.
    ///
    /// ```swift
    /// while let scalar = normalizer.flush() {
    ///   print(scalar)
    /// }
    /// ```
    ///
    /// After calling `flush`, all future calls to `resume`
    /// will immediately return `nil` without consuming from its source.
    /// This allows optional chaining to be used to
    /// fully normalize a stream:
    ///
    /// ```swift
    /// // Normalize the concatenation of inputA and inputB
    ///
    /// while let scalar =
    ///   normalizer.resume(consuming: &inputA) ??
    ///   normalizer.resume(consuming: &inputB) ??
    ///   normalizer.flush()
    /// {
    ///   print(scalar)
    /// }
    /// ```
    internal mutating func flush() -> Unicode.Scalar? {
      _flush()?.scalar
    }
  }
}
|
|
|
|
extension Unicode._NFDNormalizer {

  /// The core state machine behind `resume(consuming:)`.
  ///
  /// Returns the next normalized scalar paired with its normalization data,
  /// or `nil` if the source is exhausted (or the normalizer was flushed).
  @inline(never)
  internal mutating func _resume(
    consuming nextFromSource: () -> Unicode.Scalar?
  ) -> ScalarAndNormData? {

    switch state {
    case .emittingSegment:
      // In this state a starter for the *next* segment has already been
      // found and stashed, and the buffered segment has been sorted.
      _internalInvariant(
        pendingStarter != nil,
        "Must find next segment starter before emitting buffered segment"
      )
      _internalInvariant(
        bufferIsSorted,
        "Buffered segment must be sorted before being emitted"
      )

      if let buffered = buffer.next() {
        return buffered
      }
      // Buffered segment fully emitted; go back to consuming (the stashed
      // starter is picked up by takePendingOrConsume below).
      bufferIsSorted = false
      state = .consuming
      fallthrough

    case .consuming:

      while let (scalar, normData) = takePendingOrConsume(nextFromSource) {

        // If this scalar is a starter, stash it and emit the decomposed segment
        // we have in the buffer. The buffer must be sorted first.

        if normData.canonicalCombiningClass == .notReordered, !buffer.isEmpty {
          pendingStarter = (scalar, normData)
          buffer.sort()
          bufferIsSorted = true
          state = .emittingSegment
          return buffer.next()
        }

        // If this scalar is NFD_QC, it does not need to be decomposed.

        if normData.isNFDQC {

          // If the scalar is a starter its CCC is 0,
          // so it does not need to be sorted and can be emitted directly.

          if normData.canonicalCombiningClass == .notReordered {
            return (scalar, normData)
          }
          // A non-starter must wait in the buffer to be canonically sorted.
          buffer.append((scalar, normData))
          continue
        }

        // Otherwise, append the scalar's decomposition to the buffer.

        decomposeNonNFDQC((scalar, normData))
      }

      // Source is exhausted.
      return nil

    case .terminated:
      // Flushed: never consume from any future source.
      return nil
    }
  }

  /// Terminates the stream and drains the normalizer's internal state:
  /// first the (sorted) buffered segment, then any stashed starter.
  internal mutating func _flush() -> ScalarAndNormData? {

    state = .terminated

    if !bufferIsSorted {
      buffer.sort()
      bufferIsSorted = true
    }

    // The buffer contains the decomposed segment *prior to*
    // any pending starter we might have.

    return buffer.next() ?? pendingStarter.take()
  }

  /// Returns the stashed pending starter if there is one; otherwise pulls
  /// the next scalar from the source and looks up its normalization data.
  /// Returns `nil` only when both are exhausted.
  @inline(__always)
  private mutating func takePendingOrConsume(
    _ nextFromSource: () -> Unicode.Scalar?
  ) -> ScalarAndNormData? {

    if let pendingStarter = pendingStarter.take() {
      return pendingStarter
    } else if let nextScalar = nextFromSource() {
      return (nextScalar, Unicode._NormData(nextScalar))
    } else {
      return nil
    }
  }

  /// Appends the canonical decomposition of a scalar that is not NFD_QC
  /// to the buffer, dispatching between the algorithmic Hangul path and
  /// the table-lookup path.
  private mutating func decomposeNonNFDQC(
    _ scalarInfo: ScalarAndNormData
  ) {
    // Handle Hangul decomposition algorithmically.
    // S.base = 0xAC00
    // S.count = 11172
    // S.base + S.count - 1 = 0xD7A3
    if (0xAC00 ... 0xD7A3).contains(scalarInfo.scalar.value) {
      decomposeHangul(scalarInfo.scalar)
      return
    }

    // Otherwise, we need to lookup the decomposition (if there is one).
    decomposeSlow(scalarInfo)
  }

  /// Decomposes a precomposed Hangul syllable into its L, V, and (optional) T
  /// jamo using arithmetic on the scalar value, appending them to the buffer.
  @inline(never)
  private mutating func decomposeHangul(_ scalar: Unicode.Scalar) {
    // L = Hangul leading consonants
    let L: (base: UInt32, count: UInt32) = (base: 0x1100, count: 19)
    // V = Hangul vowels
    let V: (base: UInt32, count: UInt32) = (base: 0x1161, count: 21)
    // T = Hangul tail consonants
    let T: (base: UInt32, count: UInt32) = (base: 0x11A7, count: 28)
    // N = Number of precomposed Hangul syllables that start with the same
    //     leading consonant. (There is no base for N).
    let N: (base: UInt32, count: UInt32) = (base: 0x0, count: 588)
    // S = Hangul precomposed syllables
    let S: (base: UInt32, count: UInt32) = (base: 0xAC00, count: 11172)

    // Index of the syllable within the precomposed block; the caller
    // guarantees scalar.value >= S.base, so &- cannot wrap.
    let sIdx = scalar.value &- S.base

    let lIdx = sIdx / N.count
    let l = Unicode.Scalar(_value: L.base &+ lIdx)
    // Hangul leading consonants, L, always have normData of 0.
    // CCC = 0, NFC_QC = Yes, NFD_QC = Yes
    buffer.append((scalar: l, normData: .init(rawValue: 0)))

    let vIdx = (sIdx % N.count) / T.count
    let v = Unicode.Scalar(_value: V.base &+ vIdx)
    // Hangul vowels, V, always have normData of 4.
    // CCC = 0, NFC_QC = Maybe, NFD_QC = Yes
    buffer.append((scalar: v, normData: .init(rawValue: 4)))

    // tIdx == 0 means an LV syllable with no trailing consonant.
    let tIdx = sIdx % T.count
    if tIdx != 0 {
      let t = Unicode.Scalar(_value: T.base &+ tIdx)
      // Hangul tail consonants, T, always have normData of 4.
      // CCC = 0, NFC_QC = Maybe, NFD_QC = Yes
      buffer.append((scalar: t, normData: .init(rawValue: 4)))
    }
  }

  /// Looks up a scalar's canonical decomposition in the perfect hash table
  /// and appends the decomposed scalars (or the original scalar, if it has
  /// no decomposition entry) to the buffer.
  @inline(never)
  private mutating func decomposeSlow(
    _ original: ScalarAndNormData
  ) {
    // Look into the decomposition perfect hash table.
    let decompEntry = Unicode._DecompositionEntry(original.scalar)

    // If this is not our original scalar, then we have no decomposition for this
    // scalar, so just emit itself. This is required because perfect hashing
    // does not know the original set of keys that it used to create itself, so
    // we store the original scalar in our decomposition entry to ensure that
    // scalars that hash to the same index don't succeed.
    guard original.scalar == decompEntry.hashedScalar else {
      buffer.append(original)
      return
    }

    // The entry's decomposition is stored as UTF-8; decode it scalar by
    // scalar, rebasing the buffer past each decoded scalar.
    var utf8 = unsafe decompEntry.utf8

    while utf8.count > 0 {
      let (scalar, len) = unsafe _decodeScalar(utf8, startingAt: 0)
      unsafe utf8 = unsafe UnsafeBufferPointer(rebasing: utf8[len...])

      // Fast path: Because this will be emitted into the completed NFD buffer,
      // we don't need to look at NFD_QC anymore, which lets us use a larger
      // Latin fast-path bound when computing NFC_QC and CCC (0xC0 vs. 0x300).
      let normData = Unicode._NormData(scalar, fastUpperbound: 0x300)

      buffer.append((scalar, normData))
    }
  }
}
|