Files
swift-mirror/stdlib/public/core/StringNormalization.swift
Michael Ilseman e6e4bd6056 UTF8Span (#78531)
Add support for UTF8Span

Also, refactor validation and grapheme breaking
2025-04-11 16:11:11 -06:00

171 lines
4.6 KiB
Swift

//===--- StringNormalization.swift ----------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2018 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
extension Unicode.Scalar {
  // Terminology used by the normalization helpers:
  //
  // Normalization boundary: a position in a string such that the text to its
  // left and the text to its right can be normalized separately, and joining
  // the two results gives the same output as normalizing the whole string.
  //
  // Normalization segment: the run of code units between two adjacent
  // normalization boundaries, with no boundary inside it. Normalizing a
  // segment may grow it, shrink it, or even split it into new sub-segments.

  /// Quick test for whether this scalar starts an NFC segment: its canonical
  /// combining class must be zero and it must pass the NFC quick check.
  internal var _isNFCStarter: Bool {
    // Every scalar below U+0300 is NFC_QC and has a boundary before it, so
    // the lookup is told it may take its fast path for that range.
    let data = Unicode._NormData(self, fastUpperbound: 0x300)
    guard data.ccc == 0 else {
      return false
    }
    return data.isNFCQC
  }
}
extension UnsafeBufferPointer where Element == UInt8 {
  /// Reports whether there is a normalization boundary directly before the
  /// code unit at `offset`.
  ///
  /// Both ends of the buffer (`0` and `count`) are always boundaries. For any
  /// other offset the code unit there must not be a UTF-8 continuation byte;
  /// this is checked with an internal invariant only.
  internal func hasNormalizationBoundary(before offset: Int) -> Bool {
    guard offset != 0 && offset != count else {
      return true
    }

    let leadByte = unsafe self[_unchecked: offset]
    _internalInvariant(!UTF8.isContinuation(leadByte))

    // Latin-ish fast path: a lead byte under 0xCC belongs to a scalar under
    // U+0300, and those always have a boundary in front of them.
    guard leadByte >= 0xCC else {
      return true
    }

    let (scalar, _) = unsafe _decodeScalar(self, startingAt: offset)
    return scalar._isNFCStarter
  }

  /// Reports whether `offset` falls on a Unicode scalar boundary, i.e. it is
  /// the end of the buffer or the code unit there is not a UTF-8 continuation
  /// byte.
  internal func isOnUnicodeScalarBoundary(_ offset: Int) -> Bool {
    if offset >= count {
      // The only offset at or past the end that callers may pass is `count`.
      _internalInvariant(offset == count)
      return true
    }
    let codeUnit = unsafe self[offset]
    return !UTF8.isContinuation(codeUnit)
  }
}
/// Performs one step of the NFC quick check for a single scalar.
///
/// - Parameters:
///   - scalar: The scalar to examine.
///   - prevCCC: The canonical combining class of the preceding scalar. It is
///     overwritten with `scalar`'s class when the check passes and left
///     untouched when it fails.
/// - Returns: `false` if `scalar` has a nonzero combining class lower than
///   `prevCCC`, or if it is not NFC_QC; `true` otherwise.
internal func _isScalarNFCQC(
  _ scalar: Unicode.Scalar,
  _ prevCCC: inout UInt8
) -> Bool {
  let data = Unicode._NormData(scalar, fastUpperbound: 0x300)
  let ccc = data.ccc

  // A nonzero combining class that is lower than its predecessor's means the
  // combining marks are not in canonical order.
  let isOutOfOrder = ccc != 0 && ccc < prevCCC

  guard !isOutOfOrder, data.isNFCQC else {
    return false
  }

  prevCCC = ccc
  return true
}
extension _StringGutsSlice {
  /// Calls `f` once per UTF-8 code unit of this slice's NFC form, in order.
  ///
  /// The bytes are forwarded unchanged when the string is flagged as NFC or
  /// when it passes the NFC quick check; otherwise the scalars produced by
  /// `_internalNFC` are encoded to UTF-8 and forwarded.
  ///
  /// - Parameter f: Receives each code unit. Any error it throws is rethrown.
  internal func _withNFCCodeUnits(_ f: (UInt8) throws -> Void) rethrows {
    let substring = String(_guts)[range]

    // Fast path: the guts already say the contents are NFC (or ASCII), so
    // there is nothing to normalize.
    if _fastPath(_guts.isNFC) {
      try substring.utf8.forEach(f)
      return
    }

    var prevCCC: UInt8 = 0

    if _guts.isFastUTF8 {
      var passesQuickCheck = true
      _fastNFCCheck(&passesQuickCheck, &prevCCC)

      // With fast UTF-8 available, read the bytes straight from it rather
      // than going through the String's UTF8 view.
      if passesQuickCheck {
        try unsafe withFastUTF8 { utf8 in
          for unsafe codeUnit in unsafe utf8 {
            try f(codeUnit)
          }
        }
        return
      }
    } else {
      // No fast UTF-8: quick-check scalar by scalar, stopping at the first
      // scalar that fails.
      let passesQuickCheck = substring.unicodeScalars.allSatisfy {
        _isScalarNFCQC($0, &prevCCC)
      }

      if passesQuickCheck {
        for codeUnit in substring.utf8 {
          try f(codeUnit)
        }
        return
      }
    }

    // Slow path: the quick check failed, so normalize and emit the UTF-8
    // encoding of each resulting scalar.
    for scalar in substring.unicodeScalars._internalNFC {
      try scalar.withUTF8CodeUnits { encoded in
        for unsafe codeUnit in unsafe encoded {
          try f(codeUnit)
        }
      }
    }
  }

  /// Runs `_nfcQuickCheck` over this slice's fast UTF-8 storage.
  ///
  /// - Parameters:
  ///   - isNFCQC: Set to the quick check's result.
  ///   - prevCCC: Combining-class state handed through to `_nfcQuickCheck`.
  internal func _fastNFCCheck(_ isNFCQC: inout Bool, _ prevCCC: inout UInt8) {
    isNFCQC = unsafe withFastUTF8 { utf8 in
      unsafe _nfcQuickCheck(utf8, prevCCC: &prevCCC)
    }
  }
}
/// Runs the Unicode NFC quick check algorithm over a buffer of UTF-8 code
/// units.
///
/// - Parameters:
///   - utf8: The code units to scan. Scanning steps from lead byte to lead
///     byte without validating the encoding.
///   - prevCCC: The canonical combining class carried from one scalar to the
///     next. It is reset to zero after every scalar below U+0300 and updated
///     by `_isScalarNFCQC` for all others.
/// - Returns: `false` as soon as a scalar fails `_isScalarNFCQC`; `true` if
///   every scalar in the buffer passes.
internal func _nfcQuickCheck(
  _ utf8: UnsafeBufferPointer<UInt8>,
  prevCCC: inout UInt8
) -> Bool {
  let end = utf8.count
  var cursor = 0

  while cursor < end {
    let leadByte = unsafe utf8[cursor]

    if leadByte >= 0xCC {
      // U+0300 or above: decode the scalar and run the per-scalar check.
      let (scalar, width) = unsafe _decodeScalar(utf8, startingAt: cursor)
      if !_isScalarNFCQC(scalar, &prevCCC) {
        return false
      }
      cursor &+= width
      continue
    }

    // A lead byte under 0xCC means the scalar is below U+0300, and everything
    // below U+0300 is already NFC. Under 0xC0 it is a single-byte (ASCII)
    // scalar; otherwise it starts a two-byte sequence.
    cursor &+= (leadByte < 0xC0) ? 1 : 2

    // Scalars in this range have a combining class of 0.
    prevCCC = 0
  }

  return true
}