//===----------------------------------------------------------------------===// // // This source file is part of the Swift.org open source project // // Copyright (c) 2021 Apple Inc. and the Swift project authors // Licensed under Apache License v2.0 with Runtime Library Exception // // See https://swift.org/LICENSE.txt for license information // See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors // //===----------------------------------------------------------------------===// import SwiftShims extension Sequence where Element == Unicode.Scalar { internal var _internalNFC: Unicode._InternalNFC { Unicode._InternalNFC(self) } } extension Unicode { /// The contents of the source sequence, in Normalization Form C. /// /// Normalization to NFC preserves canonical equivalence. /// internal struct _InternalNFC where Source: Sequence { internal let source: Source internal init(_ source: Source) { self.source = source } } } extension Unicode._InternalNFC: Sequence { internal consuming func makeIterator() -> Iterator { Iterator(source: source.makeIterator()) } internal struct Iterator: IteratorProtocol { internal var source: Source.Iterator internal var normalizer: Unicode._NFCNormalizer internal init(source: Source.Iterator) { self.source = source if let strIter = source as? String.UnicodeScalarView.Iterator { self.normalizer = Unicode._NFCNormalizer(sourceString: strIter._guts) } else if let substrIter = source as? Substring.UnicodeScalarView.Iterator { self.normalizer = Unicode._NFCNormalizer(sourceString: substrIter._elements._wholeGuts) } else { self.normalizer = Unicode._NFCNormalizer() } } internal mutating func next() -> Unicode.Scalar? { normalizer.resume { source.next() } ?? normalizer.flush() } } } extension Unicode._InternalNFC: Sendable where Source: Sendable {} extension Unicode._InternalNFC.Iterator: Sendable where Source.Iterator: Sendable {} extension Unicode { /// A stateful normalizer, producing a single logical stream /// of normalized text from chunked inputs. /// /// To use the normalizer, first create an instance. /// Next, feed it a chunk of a text stream using the `resume(consuming:)` /// function. The normalizer will consume from the stream and buffer /// it as needed, so continue feeding the same source until /// it returns `nil`, indicating that the source was exhausted. /// /// ```swift /// var normalizer = Unicode.NFCNormalizer() /// /// var input: some IteratorProtocol = ... /// while let scalar = normalizer.resume(consuming: &input) { /// print(scalar) /// } /// /// // assert(input.next() == nil) /// ``` /// /// You may continue consuming sources until you reach the end /// of the logical text stream. Once you reach the end, /// call `flush()` to drain any remaining content /// from the normalizer's buffers. /// /// ```swift /// while let scalar = normalizer.flush() { /// print(scalar) /// } /// ``` /// /// The chunks of input text do not need to be aligned on any normalization /// boundary. The normalizer state has value semantics, so it is possible /// to copy and store and is inherently thread-safe. /// internal struct _NFCNormalizer: Sendable { internal enum State { case emittingSegment case consuming } internal var state = State.consuming internal var isTerminated = false internal var sourceIsAlreadyNFC = false internal var nfd = Unicode._NFDNormalizer() internal var buffer = Unicode._NormDataBuffer() // This is our starter that is currently being composed with other scalars // into new scalars. For example, "e\u{301}", here our first scalar is 'e', // which is a starter, thus we assign composee to this 'e' and move to the // next scalar. We attempt to compose our composee, 'e', with '\u{301}' and // find that there is a composition. Thus our new composee is now 'é' and // we continue to try and compose following scalars with this composee. internal var composee = Optional.none internal init(sourceString: borrowing _StringGuts) { sourceIsAlreadyNFC = sourceString.isNFC } /// Creates a new normalizer. /// internal init() { } /// Resume normalizing the text stream. /// /// Each call to `resume` returns the next scalar in the normalized output, /// consuming elements from the given source as necessary. /// /// If the normalizer returns `nil`, the source was exhausted. /// Once a source is exhausted, you may: /// /// - Call `resume` again some time later with a different source /// to continue processing the same logical text stream, or /// /// - Call `flush` in order to mark the end of the stream /// and consume data remaining in the normalizer's internal buffers. /// /// Typical usage looks like the following: /// /// ```swift /// var normalizer = Unicode.NFCNormalizer() /// /// var input: some IteratorProtocol = ... /// while let scalar = normalizer.resume(consuming: &input) { /// print(scalar) /// } /// /// // We could resume again, consuming from another input here. /// // Finally, when we are done consuming inputs: /// /// while let scalar = normalizer.flush() { /// print(scalar) /// } /// ``` /// /// The normalizer consumes data from the source as needed, /// meaning even if a call to `resume` returns a value, /// that value may have come from the normalizer's internal buffers /// without consuming the input source at all. /// /// Be careful to ensure each input source has been fully consumed /// before moving on to the next source (marked by `resume` returning `nil`). /// internal mutating func resume( consuming source: inout some IteratorProtocol ) -> Unicode.Scalar? { resume(consuming: { source.next() }) } // Intended ABI barrier for resume(consuming: inout some IteratorProtocol). // when it becomes public. internal mutating func resume( consuming nextFromSource: () -> Unicode.Scalar? ) -> Unicode.Scalar? { guard !isTerminated else { return nil } guard !sourceIsAlreadyNFC else { return nextFromSource() } return _resume(consumingNFD: { $0.nfd._resume(consuming: nextFromSource) }) } /// Marks the end of the text stream and /// returns the next scalar from the normalizer's internal buffer. /// /// Once you have finished feeding input data to the normalizer, /// call `flush` until it returns `nil`. /// /// ```swift /// while let scalar = normalizer.flush() { /// print(scalar) /// } /// ``` /// /// After calling `flush`, all future calls to `resume` /// will immediately return `nil` without consuming from its source. /// This allows optional chaining to be used to /// fully normalize a stream: /// /// ```swift /// // Normalize the concatenation of inputA and inputB /// /// while let scalar = /// normalizer.resume(consuming: &inputA) ?? /// normalizer.resume(consuming: &inputB) ?? /// normalizer.flush() /// { /// print(scalar) /// } /// ``` /// internal mutating func flush() -> Unicode.Scalar? { isTerminated = true guard !sourceIsAlreadyNFC else { return nil } // Process anything remaining from the NFD normalizer. if let next = _resume(consumingNFD: { $0.nfd._flush() }) { return next } // If we have a leftover composee, make sure to return it. // We may still have things in the buffer which are not complete segments. return composee.take() ?? buffer.next()?.scalar } } } extension Unicode._NFCNormalizer { @inline(never) internal mutating func _resume( consumingNFD nextNFD: (inout Self) -> ScalarAndNormData? ) -> Unicode.Scalar? { switch state { case .emittingSegment: if let buffered = buffer.next() { return buffered.scalar } state = .consuming fallthrough case .consuming: while let current = nextNFD(&self) { // The first starter in the sequence is our initial 'composee'. // Any scalars preceding the first starter have nothing to compose with // and are just emitted directly. guard let currentComposee = composee else { guard current.normData.canonicalCombiningClass == .notReordered else { return current.scalar } composee = current.scalar continue } guard let lastBufferedNormData = buffer.last?.normData else { // The buffer is empty so we have a simple Non-Blocked Pair, // . Look for an equivalent Primary Composite. // If 'current' is NFC_QC, we already know there won't be a composite. guard !current.normData.isNFCQC, let composed = compose(currentComposee, andNonNFCQC: current.scalar) else { // No Primary Composite found. // If 'current' is a starter, yield 'composee', // and begin a new segment with 'current' as the new 'composee'. // Otherwise, 'current' is a non-composing mark // that we need to buffer until we are finished composing. if current.normData.canonicalCombiningClass == .notReordered { composee = current.scalar return currentComposee } buffer.append(current) continue } // Primary Composite found. // It becomes our new 'composee' and 'current' is discarded. composee = composed continue } // We have the sequence . // Check whether 'current' may compose with 'composee', // or whether it is blocked by the buffer contents. // // Blocking refers to the presence of a scalar X in the buffer // where CCC(X) == 0 or CCC(X) >= CCC(current). // // Example: // // - "a\u{0305}\u{0300}b" (a̅̀b) => NFC "a\u{0305}\u{0300}b" (a̅̀b) // - "a\u{0300}\u{0305}b" (à̅b) => NFC "\u{00E0}\u{0305}b" (à̅b) // ^^^ ^^^ // // These strings contain two combining marks with the same combining // class: U+0305 COMBINING OVERLINE and U+0300 COMBINING GRAVE ACCENT. // Because these marks have the same class, they cannot be reordered // (their existing order is important). In one ordering, the accent // appears above the overline, and in the other the order is reversed. // // It turns out, there is no composite for <"a", overline>, // but there is one for <"a", grave accent>: the // U+00E0 LATIN SMALL LETTER A WITH GRAVE we see in the second example. // // Despite the overline not composing, it would be wrong // if the grave accent could squeeze ahead of it // via composition with the "a". // So the presence of the overline must block the composition. _internalInvariant( lastBufferedNormData.canonicalCombiningClass != .notReordered, "We never buffer starters" ) // Since we consume an NFD stream // the buffer contents are already in canonical order, // and 'lastBufferedNormData' has the highest CCC in the buffer. _internalInvariant( lastBufferedNormData.canonicalCombiningClass <= current.normData.canonicalCombiningClass || current.normData.canonicalCombiningClass == .notReordered, "NFD stream not in canonical order" ) guard lastBufferedNormData.canonicalCombiningClass < current.normData.canonicalCombiningClass else { // 'current' is blocked from composing with 'composee'. // // If 'current' is a starter, yield 'composee', // emit the segment that we have in the buffer, // and begin a new segment with 'current' as the new 'composee'. // Otherwise, 'current' is a non-composing mark // that we need to buffer until we are finished composing. if current.normData.canonicalCombiningClass == .notReordered { composee = current.scalar state = .emittingSegment return currentComposee } buffer.append(current) continue } _internalInvariant(current.normData.canonicalCombiningClass != .notReordered) // Look for a Primary Composite equivalent to . // If 'current' is NFC_QC, we already know there won't be any composite. guard !current.normData.isNFCQC, let composed = compose(currentComposee, andNonNFCQC: current.scalar) else { // No Primary Composite found. // We know 'current' is not a starter, so it is a non-composing mark // that we need to buffer until we are finished composing. buffer.append(current) continue } // Primary Composite found. // It becomes our new 'composee', and 'current' is discarded. composee = composed } // NFD source is exhausted. return nil } } private func compose( _ x: Unicode.Scalar, andNonNFCQC y: Unicode.Scalar ) -> Unicode.Scalar? { if let hangul = composeHangul(x, and: y) { return hangul } // Otherwise, lookup the composition. let composition = _swift_stdlib_getComposition(x.value, y.value) guard composition != .max else { return nil } return Unicode.Scalar(_value: composition) } @inline(never) private func composeHangul( _ x: Unicode.Scalar, and y: Unicode.Scalar ) -> Unicode.Scalar? { // L = Hangul leading consonants let L: (base: UInt32, count: UInt32) = (base: 0x1100, count: 19) // V = Hangul vowels let V: (base: UInt32, count: UInt32) = (base: 0x1161, count: 21) // T = Hangul tail consonants let T: (base: UInt32, count: UInt32) = (base: 0x11A7, count: 28) // N = Number of precomposed Hangul syllables that start with the same // leading consonant. (There is no base for N). let N: (base: UInt32, count: UInt32) = (base: 0x0, count: 588) // S = Hangul precomposed syllables let S: (base: UInt32, count: UInt32) = (base: 0xAC00, count: 11172) switch (x.value, y.value) { // Check for Hangul (L, V) -> LV compositions. case (L.base ..< L.base &+ L.count, V.base ..< V.base &+ V.count): let lIdx = x.value &- L.base let vIdx = y.value &- V.base let lvIdx = lIdx &* N.count &+ vIdx &* T.count let s = S.base &+ lvIdx return Unicode.Scalar(_value: s) // Check for Hangul (LV, T) -> LVT compositions. case (S.base ..< S.base &+ S.count, T.base &+ 1 ..< T.base &+ T.count): if (x.value &- S.base) % T.count == 0 { return Unicode.Scalar(_value: x.value &+ y.value &- T.base) } else { fallthrough } default: return nil } } }