mirror of
https://github.com/apple/swift.git
synced 2025-12-14 20:36:38 +01:00
171 lines
4.6 KiB
Swift
171 lines
4.6 KiB
Swift
//===--- StringNormalization.swift ----------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2018 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
|
|
|
|
extension Unicode.Scalar {
  // Terminology used by the normalization helpers:
  //
  // Normalization boundary - a position in a string such that the text to its
  // left can be normalized independently of the text to its right;
  // concatenating the two results gives the same output as normalizing the
  // entire string in one pass.
  //
  // Normalization segment - the code units lying between two adjacent
  // normalization boundaries (no boundary occurs inside a segment). While
  // being normalized, a segment may expand, contract, or even produce new
  // sub-segments.

  /// Quick check for whether this scalar starts an NFC segment: its `ccc` is
  /// zero and its normalization data reports `isNFCQC`.
  internal var _isNFCStarter: Bool {
    // Fast path: every scalar up to U+300 is NFC_QC and has a boundary before
    // it, which is why 0x300 is passed as the fast upper bound.
    let data = Unicode._NormData(self, fastUpperbound: 0x300)

    // A non-zero combining class can never begin a segment.
    guard data.ccc == 0 else {
      return false
    }

    return data.isNFCQC
  }
}
|
|
|
|
extension UnsafeBufferPointer where Element == UInt8 {
  /// Returns whether there is a normalization boundary immediately before the
  /// code unit at `offset`.
  ///
  /// - Parameter offset: A code-unit offset into this buffer. Offsets other
  ///   than `0` and `count` must not point at a UTF-8 continuation byte; this
  ///   is checked with `_internalInvariant`.
  /// - Returns: `true` at either end of the buffer, for any lead byte below
  ///   `0xCC`, or when the scalar decoded at `offset` is an NFC starter.
  internal func hasNormalizationBoundary(before offset: Int) -> Bool {
    // The very start and the very end of the buffer are always boundaries.
    guard offset != 0, offset != count else {
      return true
    }

    let leadByte = unsafe self[_unchecked: offset]
    _internalInvariant(!UTF8.isContinuation(leadByte))

    // Sub-300 latiny fast-path: a lead byte below 0xCC encodes a scalar below
    // U+0300, so no normalization data lookup is needed.
    guard leadByte >= 0xCC else {
      return true
    }

    // Slow path: decode the scalar and ask it directly.
    let (scalar, _) = unsafe _decodeScalar(self, startingAt: offset)
    return scalar._isNFCStarter
  }

  /// Returns whether `offset` falls on a Unicode scalar boundary.
  ///
  /// - Parameter offset: A code-unit offset no greater than `count`; an
  ///   offset that is not below `count` is required (via `_internalInvariant`)
  ///   to equal `count`.
  /// - Returns: `true` at the end of the buffer, otherwise whether the byte at
  ///   `offset` is not a UTF-8 continuation byte.
  internal func isOnUnicodeScalarBoundary(_ offset: Int) -> Bool {
    if offset >= count {
      // One past the last code unit is the only valid non-interior offset.
      _internalInvariant(offset == count)
      return true
    }

    let byte = unsafe self[offset]
    return !UTF8.isContinuation(byte)
  }
}
|
|
|
|
/// Performs one step of the NFC quick check for `scalar`.
///
/// - Parameters:
///   - scalar: The scalar to examine.
///   - prevCCC: The `ccc` of the previously accepted scalar. It is overwritten
///     with this scalar's `ccc` only when the scalar passes; on failure it is
///     left untouched.
/// - Returns: `false` if the scalar has a non-zero `ccc` smaller than
///   `prevCCC`, or if its normalization data is not `isNFCQC`; `true`
///   otherwise.
internal func _isScalarNFCQC(
  _ scalar: Unicode.Scalar,
  _ prevCCC: inout UInt8
) -> Bool {
  let data = Unicode._NormData(scalar, fastUpperbound: 0x300)
  let ccc = data.ccc

  // A non-zero combining class lower than its predecessor's means the
  // scalars are not in the order the quick check requires.
  if ccc != 0 && prevCCC > ccc {
    return false
  }

  guard data.isNFCQC else {
    return false
  }

  prevCCC = ccc
  return true
}
|
|
|
|
extension _StringGutsSlice {
  /// Feeds the UTF-8 code units of this slice's NFC form to `f`, one at a
  /// time and in order.
  ///
  /// When the slice is already known (or quickly verified) to be NFC, its
  /// existing UTF-8 bytes are forwarded unchanged; otherwise the scalars are
  /// normalized through `_internalNFC` and re-encoded as UTF-8.
  ///
  /// - Parameter f: Invoked once per code unit. Any error it throws is
  ///   propagated to the caller.
  internal func _withNFCCodeUnits(_ f: (UInt8) throws -> Void) rethrows {
    let substring = String(_guts)[range]

    // Fast path: the guts are flagged as NFC (or ASCII), so the bytes can be
    // passed straight through.
    if _fastPath(_guts.isNFC) {
      try substring.utf8.forEach(f)
      return
    }

    var isNFCQC = true
    var prevCCC: UInt8 = 0
    let hasFastUTF8 = _guts.isFastUTF8

    // Step 1: run the NFC quick check, over the raw bytes when they are
    // directly available and over the scalar view otherwise.
    if hasFastUTF8 {
      _fastNFCCheck(&isNFCQC, &prevCCC)
    } else {
      isNFCQC = substring.unicodeScalars.allSatisfy {
        _isScalarNFCQC($0, &prevCCC)
      }
    }

    // Step 2: if the quick check passed, the contents are already NFC and
    // can be emitted as they are.
    if isNFCQC {
      if hasFastUTF8 {
        // Because we have access to the fastUTF8, we can go through that
        // instead of accessing the UTF8 view on String.
        try unsafe withFastUTF8 { utf8 in
          for unsafe byte in unsafe utf8 {
            try f(byte)
          }
        }
      } else {
        try substring.utf8.forEach(f)
      }

      return
    }

    // Step 3: slow path. Normalize scalar by scalar and emit each
    // normalized scalar's UTF-8 encoding.
    for scalar in substring.unicodeScalars._internalNFC {
      try scalar.withUTF8CodeUnits { codeUnits in
        for unsafe byte in unsafe codeUnits {
          try f(byte)
        }
      }
    }
  }

  /// Runs `_nfcQuickCheck` over this slice's fast UTF-8 storage.
  ///
  /// - Parameters:
  ///   - isNFCQC: Set to the result of the quick check.
  ///   - prevCCC: Forwarded to `_nfcQuickCheck`, which updates it while
  ///     scanning.
  internal func _fastNFCCheck(_ isNFCQC: inout Bool, _ prevCCC: inout UInt8) {
    unsafe withFastUTF8 {
      isNFCQC = unsafe _nfcQuickCheck($0, prevCCC: &prevCCC)
    }
  }
}
|
|
|
|
/// Runs the Unicode NFC quick check algorithm over a buffer of UTF-8 code
/// units.
///
/// NOTE(review): the scan advances by lead byte, so it presumes `utf8` is
/// well-formed UTF-8 beginning on a scalar boundary - confirm against callers.
///
/// - Parameters:
///   - utf8: The code units to scan.
///   - prevCCC: The `ccc` of the scalar preceding the scan position. It is
///     reset to zero for every scalar skipped by the fast path and otherwise
///     updated by `_isScalarNFCQC`.
/// - Returns: `true` if every scalar passes the quick check; `false` as soon
///   as one fails.
internal func _nfcQuickCheck(
  _ utf8: UnsafeBufferPointer<UInt8>,
  prevCCC: inout UInt8
) -> Bool {
  var offset = 0

  while offset < utf8.count {
    let leadByte = unsafe utf8[offset]

    // Lead bytes of 0xCC and above may encode scalars at or beyond U+0300,
    // which need a real normalization data lookup.
    if leadByte >= 0xCC {
      let (scalar, width) = unsafe _decodeScalar(utf8, startingAt: offset)

      if !_isScalarNFCQC(scalar, &prevCCC) {
        return false
      }

      offset &+= width
      continue
    }

    // Everything below U+0300 is already NFC. A lead byte below 0xC0 is a
    // single-byte (ASCII) scalar; the remaining lead bytes below 0xCC start a
    // two-byte sequence.
    offset &+= leadByte < 0xC0 ? 1 : 2

    // Scalars skipped by this fast path always have a ccc of 0.
    prevCCC = 0
  }

  return true
}
|
|
|