mirror of
https://github.com/apple/swift.git
synced 2025-12-14 20:36:38 +01:00
452 lines
15 KiB
Swift
452 lines
15 KiB
Swift
//===--- StringNormalization.swift ----------------------------------------===//
|
|
//
|
|
// This source file is part of the Swift.org open source project
|
|
//
|
|
// Copyright (c) 2014 - 2018 Apple Inc. and the Swift project authors
|
|
// Licensed under Apache License v2.0 with Runtime Library Exception
|
|
//
|
|
// See https://swift.org/LICENSE.txt for license information
|
|
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
import SwiftShims
|
|
|
|
// Namespace for the ICU normalizer handle, the expansion-factor constants and
// the segment buffer type used by the normalization routines in this file.
internal enum _Normalization {
  // ICU's NFC unorm2 instance, obtained from ICU the first time this property
  // is accessed.
  //
  // TODO(String performance): Should we cache one on TLS? Is this an expensive
  // call?
  internal static var _nfcNormalizer: OpaquePointer = {
    var status = __swift_stdlib_U_ZERO_ERROR
    let instance = __swift_stdlib_unorm2_getNFCInstance(&status)
    if !status.isSuccess {
      // This shouldn't be possible unless some deep (unrecoverable) system
      // invariants are violated
      fatalError("Unable to talk to ICU")
    }
    return instance
  }()

  // When normalized in NFC, some segments may expand in size (e.g. some non-BMP
  // musical notes). This expansion is capped by the maximum expansion factor of
  // the normal form. For NFC, that is 3x.
  internal static let _maxNFCExpansionFactor = 3

  // Multiplier applied on top of the NFC factor when sizing a UTF-8 output
  // buffer from a UTF-16 code unit count (see `_allocateBuffers`).
  internal static let _maxUTF16toUTF8ExpansionFactor = 3

  // Fixed-size UTF-16 scratch buffer used for a single segment.
  internal typealias _SegmentOutputBuffer = _FixedArray16<UInt16>
}
|
|
|
|
//
|
|
// Pointer casting helpers
|
|
//
|
|
// Reinterprets the memory at `ptr` as `count` mutable elements of type `U`.
//
// The memory is only assumed to be bound to `U`; nothing is rebound, copied or
// checked. `count` is measured in elements of `U`, not of `T`.
@inline(__always)
private func _unsafeMutableBufferPointerCast<T, U>(
  _ ptr: UnsafeMutablePointer<T>,
  _ count: Int,
  to: U.Type = U.self
) -> UnsafeMutableBufferPointer<U> {
  let reinterpreted =
    UnsafeMutableRawPointer(ptr).assumingMemoryBound(to: U.self)
  return UnsafeMutableBufferPointer(start: reinterpreted, count: count)
}
|
|
// Reinterprets the memory at `ptr` as `count` read-only elements of type `U`.
//
// The memory is only assumed to be bound to `U`; nothing is rebound, copied or
// checked. `count` is measured in elements of `U`, not of `T`.
@inline(__always)
private func _unsafeBufferPointerCast<T, U>(
  _ ptr: UnsafePointer<T>,
  _ count: Int,
  to: U.Type = U.self
) -> UnsafeBufferPointer<U> {
  let reinterpreted = UnsafeRawPointer(ptr).assumingMemoryBound(to: U.self)
  return UnsafeBufferPointer(start: reinterpreted, count: count)
}
|
|
// Views a fixed-size UTF-8 scratch array as a mutable buffer of code units.
//
// - ptr: pointer to the `_FixedArray16<UInt8>` to expose.
// - endIdx: number of leading code units to include in the returned buffer;
//   defaults to the array's full capacity.
//
// Returns a buffer over the first `endIdx` code units of the array's storage.
internal func _castOutputBuffer(
  _ ptr: UnsafeMutablePointer<_FixedArray16<UInt8>>,
  endingAt endIdx: Int = _FixedArray16<UInt8>.capacity
) -> UnsafeMutableBufferPointer<UInt8> {
  // Use the array type's own capacity rather than a literal, matching the
  // UInt16 overloads of this function.
  let bufPtr: UnsafeMutableBufferPointer<UInt8> =
    _unsafeMutableBufferPointerCast(ptr, _FixedArray16<UInt8>.capacity)
  return UnsafeMutableBufferPointer<UInt8>(rebasing: bufPtr[..<endIdx])
}
|
|
// Views a UTF-16 segment buffer as a mutable buffer of code units.
//
// - ptr: pointer to the `_SegmentOutputBuffer` to expose.
// - endIdx: number of leading code units to include in the returned buffer;
//   defaults to the segment buffer's full capacity.
internal func _castOutputBuffer(
  _ ptr: UnsafeMutablePointer<_Normalization._SegmentOutputBuffer>,
  endingAt endIdx: Int = _Normalization._SegmentOutputBuffer.capacity
) -> UnsafeMutableBufferPointer<UInt16> {
  let fullCapacity = _Normalization._SegmentOutputBuffer.capacity
  let wholeBuffer: UnsafeMutableBufferPointer<UInt16> =
    _unsafeMutableBufferPointerCast(ptr, fullCapacity)
  let leadingUnits = wholeBuffer[..<endIdx]
  return UnsafeMutableBufferPointer<UInt16>(rebasing: leadingUnits)
}
|
|
// Views a UTF-16 segment buffer as a read-only buffer of code units.
//
// - ptr: pointer to the `_SegmentOutputBuffer` to expose.
// - endIdx: number of leading code units to include in the returned buffer;
//   defaults to the segment buffer's full capacity.
internal func _castOutputBuffer(
  _ ptr: UnsafePointer<_Normalization._SegmentOutputBuffer>,
  endingAt endIdx: Int = _Normalization._SegmentOutputBuffer.capacity
) -> UnsafeBufferPointer<UInt16> {
  let fullCapacity = _Normalization._SegmentOutputBuffer.capacity
  let wholeBuffer: UnsafeBufferPointer<UInt16> =
    _unsafeBufferPointerCast(ptr, fullCapacity)
  let leadingUnits = wholeBuffer[..<endIdx]
  return UnsafeBufferPointer<UInt16>(rebasing: leadingUnits)
}
|
|
|
|
extension _StringGuts {
  // Whether there is a normalization boundary immediately before `index` in a
  // foreign string.
  //
  // The very start and very end of the string always count as boundaries; for
  // any other position the answer comes from the (error-corrected) scalar that
  // starts at `index`.
  internal func foreignHasNormalizationBoundary(
    before index: String.Index
  ) -> Bool {
    let offset = index.encodedOffset
    guard offset != 0 && offset != count else {
      return true
    }

    let (scalar, _) = foreignErrorCorrectedScalar(startingAt: index)
    return scalar._hasNormalizationBoundaryBefore
  }
}
|
|
extension UnsafeBufferPointer where Element == UInt8 {
  // Whether there is a normalization boundary immediately before the UTF-8
  // code unit at `index`.
  //
  // `index` must not point at a continuation byte. Offsets 0 and `count` are
  // always boundaries.
  internal func hasNormalizationBoundary(before index: Int) -> Bool {
    guard index != 0 && index != count else {
      return true
    }

    let leadByte = self[_unchecked: index]
    assert(!_isContinuation(leadByte))

    // Sub-300 latiny fast-path: a lead byte below 0xCC is answered without
    // decoding the scalar.
    guard leadByte >= 0xCC else { return true }

    let (scalar, _) = _decodeScalar(self, startingAt: index)
    return scalar._hasNormalizationBoundaryBefore
  }
}
|
|
|
|
extension Unicode.Scalar {
  // Normalization boundary - a place in a string where everything left of the
  // boundary can be normalized independently from everything right of the
  // boundary. The concatenation of each result is the same as if the entire
  // string had been normalized as a whole.
  //
  // Normalization segment - a sequence of code units between two normalization
  // boundaries (without any boundaries in the middle). Note that normalization
  // segments can, as a process of normalization, expand, contract, and even
  // produce new sub-segments.

  // Whether this scalar value always has a normalization boundary before it.
  @inline(__always) // common fast-path
  internal var _hasNormalizationBoundaryBefore: Bool {
    // Fast-path: All scalars up through U+02FF are NFC and have boundaries
    // before them
    if self.value < 0x300 { return true }

    _internalInvariant(Int32(exactly: self.value) != nil, "top bit shouldn't be set")
    let value = Int32(bitPattern: self.value)
    return 0 != __swift_stdlib_unorm2_hasBoundaryBefore(
      _Normalization._nfcNormalizer, value)
  }

  // Whether ICU reports 1 for this scalar's NFC_Quick_Check property
  // (presumably ICU's "yes" result - confirm against the ICU headers).
  @inline(__always) // common fast-path
  internal var _isNFCQCYes: Bool {
    // Fast-path: All scalars up through U+02FF are NFC, so no property lookup
    // is needed for them
    if self.value < 0x300 { return true }

    // Convert with an explicit bit-pattern initializer, as
    // `_hasNormalizationBoundaryBefore` does, instead of an untyped
    // reinterpret cast.
    return __swift_stdlib_u_getIntPropertyValue(
      Int32(bitPattern: self.value), __swift_stdlib_UCHAR_NFC_QUICK_CHECK
    ) == 1
  }

  // Quick check if a scalar is NFC and a segment starter: it must both have a
  // normalization boundary before it and pass the NFC quick check.
  internal var _isNFCStarter: Bool {
    return self._hasNormalizationBoundaryBefore && self._isNFCQCYes
  }
}
|
|
|
|
extension UnsafeBufferPointer where Element == UInt8 {
  // Whether `index` is the start of a scalar in this UTF-8 buffer, i.e. it is
  // the end of the buffer or the code unit there is not a continuation byte.
  internal func isOnUnicodeScalarBoundary(_ index: Int) -> Bool {
    if index >= count {
      // The only position at or past the end that callers may ask about is
      // the end itself.
      _internalInvariant(index == count)
      return true
    }
    return !_isContinuation(self[index])
  }
}
|
|
|
|
// Normalizes `input` into a fixed-size segment buffer.
//
// Forwards to the buffer-pointer overload of `_tryNormalize` using the full
// capacity of `outputBuffer`. Returns whatever that overload returns; nil
// means the output buffer ran out of space.
internal func _tryNormalize(
  _ input: UnsafeBufferPointer<UInt16>,
  into outputBuffer:
    UnsafeMutablePointer<_Normalization._SegmentOutputBuffer>
) -> Int? {
  let destination: UnsafeMutableBufferPointer<UInt16> =
    _castOutputBuffer(outputBuffer)
  return _tryNormalize(input, into: destination)
}
|
|
|
|
// Asks ICU's NFC normalizer to normalize the UTF-16 code units in `input`,
// writing the result to `outputBuffer`.
//
// Returns the count reported by ICU on success. Any ICU error yields nil,
// which callers treat as "the outputBuffer ran out of space". Both buffers
// must have a non-nil base address.
internal func _tryNormalize(
  _ input: UnsafeBufferPointer<UInt16>,
  into outputBuffer: UnsafeMutableBufferPointer<UInt16>
) -> Int? {
  var status = __swift_stdlib_U_ZERO_ERROR
  let normalizedLength = __swift_stdlib_unorm2_normalize(
    _Normalization._nfcNormalizer,
    input.baseAddress._unsafelyUnwrappedUnchecked,
    numericCast(input.count),
    outputBuffer.baseAddress._unsafelyUnwrappedUnchecked,
    numericCast(outputBuffer.count),
    &status
  )
  if status.isSuccess {
    return numericCast(normalizedLength)
  }
  // The output buffer needs to grow
  return nil
}
|
|
|
|
// Outcome of one `_fastNormalize` / `_foreignNormalize` step.
internal struct NormalizationResult {
  // Number of UTF-8 code units written to the output buffer by this step.
  var amountFilled: Int

  // Position in the source string at which the next step should resume.
  var nextReadPosition: String.Index

  // Whether this step replaced the working buffers with newly allocated ones.
  var allocatedBuffers: Bool
}
|
|
|
|
// Copies the leading run of scalars that need no normalization straight from
// `sourceBuffer` to `outputBuffer`.
//
// A scalar is copied only if it is an NFC starter and is followed by a
// normalization boundary. Copying stops at the first scalar that fails that
// test, at the end of the source, or once fewer than five code units of
// output space remain.
//
// Returns the number of code units read and written, or nil if nothing was
// copied (in which case the caller must take the slow path).
@_effects(releasenone)
private func fastFill(
  _ sourceBuffer: UnsafeBufferPointer<UInt8>,
  _ outputBuffer: UnsafeMutableBufferPointer<UInt8>
) -> (read: Int, written: Int)? {
  // Stop early enough that the unchecked stores below stay in bounds
  // (presumably sized for one scalar of up to four UTF-8 code units).
  let writeLimit = outputBuffer.count - 4
  let sourceCount = sourceBuffer.count

  // TODO: Additional fast-path: All CCC-ascending NFC_QC segments are NFC
  // TODO: Just freakin do normalization and don't bother with ICU
  var consumed = 0
  var produced = 0
  while consumed < sourceCount && produced < writeLimit {
    // TODO: Slightly faster code-unit scan for latiny (<0xCC)

    // Check scalar-based fast-paths
    let (scalar, scalarLength) =
      _decodeScalar(sourceBuffer, startingAt: consumed)
    let nextScalarStart = consumed &+ scalarLength
    _internalInvariant(nextScalarStart <= sourceCount)

    if _slowPath(
      !(sourceBuffer.hasNormalizationBoundary(before: nextScalarStart)
        && scalar._isNFCStarter)
    ) {
      break
    }
    consumed = nextScalarStart

    for codeUnit in UTF8.encode(scalar)._unsafelyUnwrappedUnchecked {
      outputBuffer[_unchecked: produced] = codeUnit
      produced &+= 1
    }

    _internalInvariant(consumed == produced,
      "non-normalizing UTF-8 fast path should be 1-to-1 in code units")
  }

  guard produced > 0 else { return nil }
  return (read: consumed, written: produced)
}
|
|
|
|
// Transcodes a single segment from the scalars provided by the closure to the
// outputBuffer as UTF-16.
//
// - range: source offsets to read from; reading starts at `range.lowerBound`
//   and never continues once `range.upperBound` is reached.
// - outputBuffer: destination for the segment's UTF-16 code units.
// - f: given a source offset, returns the scalar starting there and the number
//   of source code units it occupies.
//
// The segment always contains the first scalar and ends just before the next
// scalar that has a normalization boundary before it.
//
// Returns the number of source code units consumed and the number of UTF-16
// code units written. If this returns nil, it means the outputBuffer ran out
// of space.
private func copyUTF16Segment(
  boundedBy range: Range<Int>,
  into outputBuffer: UnsafeMutableBufferPointer<UInt16>,
  _ f: (Int) -> (Unicode.Scalar, Int)
) -> (read: Int, written: Int)? {
  var readIndex = range.lowerBound
  var outputWriteIndex = 0
  let outputCount = outputBuffer.count

  while readIndex != range.upperBound {
    let (scalar, length) = f(readIndex)

    // Test the cheap offset comparison first: the first scalar is always part
    // of the segment, so its boundary property (potentially an ICU call) never
    // needs to be looked up.
    if readIndex != range.lowerBound && scalar._hasNormalizationBoundaryBefore {
      break
    }

    readIndex += length

    for cu in scalar.utf16 {
      guard outputWriteIndex < outputCount else {
        return nil
      }
      outputBuffer[outputWriteIndex] = cu
      outputWriteIndex += 1
    }
  }
  return (readIndex - range.lowerBound, outputWriteIndex)
}
|
|
|
|
// Transcodes the UTF-16 segment stored in sourceBuffer into the outputBuffer
// as UTF-8.
//
// Returns the number of UTF-8 code units written. If this returns nil, it
// means the outputBuffer ran out of space.
private func transcodeValidUTF16ToUTF8(
  _ sourceBuffer: UnsafeBufferPointer<UInt16>,
  into outputBuffer: UnsafeMutableBufferPointer<UInt8>
) -> Int? {
  let capacity = outputBuffer.count
  let unitCount = sourceBuffer.count
  var nextUnit = 0
  var written = 0

  while nextUnit < unitCount {
    // We don't need to check for normalization boundaries here because we are
    // only transcoding a single segment at this point.
    let (scalar, unitLength) =
      _decodeScalar(sourceBuffer, startingAt: nextUnit)
    nextUnit += unitLength

    for byte in UTF8.encode(scalar)._unsafelyUnwrappedUnchecked {
      guard written < capacity else {
        return nil
      }
      outputBuffer[written] = byte
      written &+= 1
    }
  }
  return written
}
|
|
|
|
// Selects which existing working buffer, if any, `_allocateBuffers` copies
// into its newly allocated replacement.
internal enum _BufferToCopy {
  // Copy nothing.
  case none
  // Copy the UTF-8 output buffer.
  case output
  // Copy the UTF-16 buffer that holds ICU's input.
  case icuInput
  // Copy the UTF-16 buffer that holds ICU's output.
  case icuOutput
}
|
|
|
|
// Replaces all three working buffers with newly allocated ones sized from the
// source length, optionally carrying over the contents of one of them.
//
// - count: number of code units in the source; the new capacities are
//   `count` (ICU input), `count` times the NFC expansion factor (ICU output)
//   and that times the UTF-16-to-UTF-8 expansion factor (output).
// - bufferToCopy: which old buffer's contents to copy into its replacement.
// - outputBuffer, icuInputBuffer, icuOutputBuffer: overwritten with the new
//   buffers. The previous buffers are not deallocated here.
//
// NOTE(review): the invariants below assume the buffer being preserved holds
// exactly 16 elements and fits in its replacement - confirm against callers.
internal func _allocateBuffers(
  sourceCount count: Int,
  preserveDataIn bufferToCopy: _BufferToCopy,
  outputBuffer: inout UnsafeMutableBufferPointer<UInt8>,
  icuInputBuffer: inout UnsafeMutableBufferPointer<UInt16>,
  icuOutputBuffer: inout UnsafeMutableBufferPointer<UInt16>
) {
  let nfcFactor = _Normalization._maxNFCExpansionFactor
  let utf8Factor = _Normalization._maxUTF16toUTF8ExpansionFactor

  let outputCapacity = count * nfcFactor * utf8Factor
  let icuInputCapacity = count
  let icuOutputCapacity = count * nfcFactor

  let replacementOutput =
    UnsafeMutableBufferPointer<UInt8>.allocate(capacity: outputCapacity)
  let replacementICUInput =
    UnsafeMutableBufferPointer<UInt16>.allocate(capacity: icuInputCapacity)
  let replacementICUOutput =
    UnsafeMutableBufferPointer<UInt16>.allocate(capacity: icuOutputCapacity)

  switch bufferToCopy {
  case .none:
    break
  case .output:
    let copied = replacementOutput.initialize(from: outputBuffer).1
    _internalInvariant(copied == 16)
  case .icuInput:
    let copied = replacementICUInput.initialize(from: icuInputBuffer).1
    _internalInvariant(copied == 16)
  case .icuOutput:
    let copied = replacementICUOutput.initialize(from: icuOutputBuffer).1
    _internalInvariant(copied == 16)
  }

  outputBuffer = replacementOutput
  icuInputBuffer = replacementICUInput
  icuOutputBuffer = replacementICUOutput
}
|
|
|
|
// Normalizes the next piece of a UTF-8 source buffer, starting at `readIndex`,
// into `outputBuffer`.
//
// First tries `fastFill`, which copies scalars that need no normalization.
// If that copies nothing, one segment is transcoded to UTF-16, normalized by
// ICU and transcoded back to UTF-8. Whenever one of those three steps reports
// that its destination is too small, all working buffers are replaced via
// `_allocateBuffers` (sized from `sourceBuffer.count`) and the step is retried
// once.
//
// Returns how many UTF-8 code units were written to `outputBuffer`, the index
// at which to resume reading, and whether the buffers were reallocated.
internal func _fastNormalize(
  readIndex: String.Index,
  sourceBuffer: UnsafeBufferPointer<UInt8>,
  outputBuffer: inout UnsafeMutableBufferPointer<UInt8>,
  icuInputBuffer: inout UnsafeMutableBufferPointer<UInt16>,
  icuOutputBuffer: inout UnsafeMutableBufferPointer<UInt16>
) -> NormalizationResult {
  let startOffset = readIndex.encodedOffset
  let remainingSource =
    UnsafeBufferPointer(rebasing: sourceBuffer[startOffset...])

  // Fast path: the leading scalars could be copied verbatim.
  if let fastResult = fastFill(remainingSource, outputBuffer) {
    let resumeIndex = readIndex.encoded(offsetBy: fastResult.read)
    _internalInvariant(
      sourceBuffer.isOnUnicodeScalarBoundary(resumeIndex.encodedOffset))

    return NormalizationResult(
      amountFilled: fastResult.written,
      nextReadPosition: resumeIndex,
      allocatedBuffers: false)
  }

  var didAllocate = false

  // Runs `f`; if it returns nil (destination too small), swaps in larger
  // buffers, preserving the one `f` reads from, and runs `f` again. The
  // second attempt is expected to succeed.
  func performWithAllocationIfNecessary<R>(
    preserving preserveDataIn: _BufferToCopy, _ f: () -> R?
  ) -> R {
    if let firstAttempt = f() {
      return firstAttempt
    }
    _allocateBuffers(
      sourceCount: sourceBuffer.count,
      preserveDataIn: preserveDataIn,
      outputBuffer: &outputBuffer,
      icuInputBuffer: &icuInputBuffer,
      icuOutputBuffer: &icuOutputBuffer)
    _internalInvariant(!didAllocate)
    didAllocate = true
    return f()!
  }

  // Step 1: transcode one segment of the source to UTF-16 for ICU.
  let (consumed, utf16Count) = performWithAllocationIfNecessary(
    preserving: .none
  ) { () -> (Int, Int)? in
    return copyUTF16Segment(
      boundedBy: 0..<remainingSource.count, into: icuInputBuffer
    ) { offset in
      return _decodeScalar(remainingSource, startingAt: offset)
    }
  }

  let resumeIndex = readIndex.encoded(offsetBy: consumed)
  _internalInvariant(
    sourceBuffer.isOnUnicodeScalarBoundary(resumeIndex.encodedOffset))

  // Step 2: normalize the UTF-16 segment with ICU.
  let normalizedCount = performWithAllocationIfNecessary(
    preserving: .icuInput
  ) { () -> Int? in
    let segment =
      UnsafeBufferPointer<UInt16>(rebasing: icuInputBuffer[..<utf16Count])
    return _tryNormalize(segment, into: icuOutputBuffer)
  }

  // Step 3: transcode the normalized segment back to UTF-8.
  let utf8Count = performWithAllocationIfNecessary(
    preserving: .icuOutput
  ) { () -> Int? in
    let normalizedSegment =
      UnsafeBufferPointer<UInt16>(rebasing: icuOutputBuffer[..<normalizedCount])
    return transcodeValidUTF16ToUTF8(normalizedSegment, into: outputBuffer)
  }

  return NormalizationResult(
    amountFilled: utf8Count,
    nextReadPosition: resumeIndex,
    allocatedBuffers: didAllocate)
}
|
|
|
|
// Normalizes the next segment of a foreign string, starting at `readIndex`
// and reading no further than `endIndex`, into `outputBuffer`.
//
// One segment is copied as UTF-16 from the scalars that `guts` provides,
// normalized by ICU and transcoded to UTF-8. Whenever one of those three
// steps reports that its destination is too small, all working buffers are
// replaced via `_allocateBuffers` (sized from `guts.count`) and the step is
// retried once.
//
// Returns how many UTF-8 code units were written to `outputBuffer`, the index
// at which to resume reading, and whether the buffers were reallocated.
internal func _foreignNormalize(
  readIndex: String.Index,
  endIndex: String.Index,
  guts: _StringGuts,
  outputBuffer: inout UnsafeMutableBufferPointer<UInt8>,
  icuInputBuffer: inout UnsafeMutableBufferPointer<UInt16>,
  icuOutputBuffer: inout UnsafeMutableBufferPointer<UInt16>
) -> NormalizationResult {
  var didAllocate = false

  // Runs `f`; if it returns nil (destination too small), swaps in larger
  // buffers, preserving the one `f` reads from, and runs `f` again. The
  // second attempt is expected to succeed.
  func performWithAllocationIfNecessary<R>(
    preserving preserveDataIn: _BufferToCopy, _ f: () -> R?
  ) -> R {
    if let firstAttempt = f() {
      return firstAttempt
    }
    _allocateBuffers(
      sourceCount: guts.count,
      preserveDataIn: preserveDataIn,
      outputBuffer: &outputBuffer,
      icuInputBuffer: &icuInputBuffer,
      icuOutputBuffer: &icuOutputBuffer)
    _internalInvariant(!didAllocate)
    didAllocate = true
    return f()!
  }

  // Step 1: copy one segment of the string's scalars as UTF-16 for ICU.
  let segmentBounds = readIndex.encodedOffset..<endIndex.encodedOffset
  let (consumed, utf16Count) = performWithAllocationIfNecessary(
    preserving: .none
  ) { () -> (Int, Int)? in
    return copyUTF16Segment(
      boundedBy: segmentBounds, into: icuInputBuffer
    ) { gutsOffset in
      return guts.errorCorrectedScalar(startingAt: gutsOffset)
    }
  }

  let resumeIndex = readIndex.encoded(offsetBy: consumed)
  _internalInvariant(guts.isOnUnicodeScalarBoundary(resumeIndex))

  // Step 2: normalize the UTF-16 segment with ICU.
  let normalizedCount = performWithAllocationIfNecessary(
    preserving: .icuInput
  ) { () -> Int? in
    let segment =
      UnsafeBufferPointer<UInt16>(rebasing: icuInputBuffer[..<utf16Count])
    return _tryNormalize(segment, into: icuOutputBuffer)
  }

  // Step 3: transcode the normalized segment to UTF-8.
  let utf8Count = performWithAllocationIfNecessary(
    preserving: .icuOutput
  ) { () -> Int? in
    let normalizedSegment =
      UnsafeBufferPointer<UInt16>(rebasing: icuOutputBuffer[..<normalizedCount])
    return transcodeValidUTF16ToUTF8(normalizedSegment, into: outputBuffer)
  }

  return NormalizationResult(
    amountFilled: utf8Count,
    nextReadPosition: resumeIndex,
    allocatedBuffers: didAllocate)
}
|