Files
swift-mirror/stdlib/public/core/StringNormalization.swift
Michael Ilseman 415cc8fb0c [String.Index] Deprecate encodedOffset var/init
String.Index has an encodedOffset-based initializer and computed
property that exists for serialization purposes. It was documented as
UTF-16 in the SE proposal introducing it, which was String's
underlying encoding at the time, but the dream of String even then was
to abstract away whatever encoding happened to be used.

Serialization needs an explicit encoding for serialized indices to
make sense: the offsets need to align with the view. With String
utilizing UTF-8 encoding for native contents in Swift 5, serialization
isn't necessarily the most efficient in UTF-16.

Furthermore, the majority of usage of encodedOffset in the wild is
buggy and operates under the assumption that a UTF-16 code unit was a
Swift Character, which isn't even valid if the String is known to be
all-ASCII (because CR-LF).

This change introduces a pair of semantics-preserving alternatives to
encodedOffset that explicitly call out the UTF-16 assumption. These
serve as a gentle off-ramp for current mis-uses of encodedOffset.
2019-02-13 18:42:40 -08:00

452 lines
15 KiB
Swift

//===--- StringNormalization.swift ----------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2018 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
import SwiftShims
// Namespace for the shared ICU normalizer handle and the sizing constants used
// by the normalization routines in this file.
internal enum _Normalization {

  // ICU's NFC unorm2 instance
  //
  // TODO(String performance): Should we cache one on TLS? Is this an expensive
  // call?
  internal static var _nfcNormalizer: OpaquePointer = {
    var status = __swift_stdlib_U_ZERO_ERROR
    let instance = __swift_stdlib_unorm2_getNFCInstance(&status)
    if !status.isSuccess {
      // Reaching this point should only be possible when some deep
      // (unrecoverable) system invariant has been violated, so trap.
      fatalError("Unable to talk to ICU")
    }
    return instance
  }()

  // Normalizing to NFC can make a segment larger (e.g. some non-BMP musical
  // notes). That growth is bounded by the normal form's maximum expansion
  // factor, which for NFC is 3x.
  internal static let _maxNFCExpansionFactor = 3

  // Per the name: the largest growth factor, in code units, allowed for when
  // transcoding UTF-16 to UTF-8.
  internal static let _maxUTF16toUTF8ExpansionFactor = 3

  // Fixed-capacity UTF-16 storage used to hold a single segment.
  internal typealias _SegmentOutputBuffer = _FixedArray16<UInt16>
}
//
// Pointer casting helpers
//
// Views the memory starting at `ptr` as a mutable buffer of `count` elements
// of type `U`. The memory is assumed (not checked) to be bound to `U`, and
// `count` is measured in elements of `U`.
@inline(__always)
private func _unsafeMutableBufferPointerCast<T, U>(
  _ ptr: UnsafeMutablePointer<T>,
  _ count: Int,
  to: U.Type = U.self
) -> UnsafeMutableBufferPointer<U> {
  let rawStart = UnsafeMutableRawPointer(ptr)
  let typedStart = rawStart.assumingMemoryBound(to: U.self)
  return UnsafeMutableBufferPointer<U>(start: typedStart, count: count)
}
// Views the memory starting at `ptr` as a read-only buffer of `count` elements
// of type `U`. The memory is assumed (not checked) to be bound to `U`, and
// `count` is measured in elements of `U`.
@inline(__always)
private func _unsafeBufferPointerCast<T, U>(
  _ ptr: UnsafePointer<T>,
  _ count: Int,
  to: U.Type = U.self
) -> UnsafeBufferPointer<U> {
  let rawStart = UnsafeRawPointer(ptr)
  let typedStart = rawStart.assumingMemoryBound(to: U.self)
  return UnsafeBufferPointer<U>(start: typedStart, count: count)
}
// Exposes the leading `endIdx` bytes of a 16-element `UInt8` fixed array as a
// mutable buffer pointer. With the default `endIdx` the whole array is
// exposed.
internal func _castOutputBuffer(
  _ ptr: UnsafeMutablePointer<_FixedArray16<UInt8>>,
  endingAt endIdx: Int = 16
) -> UnsafeMutableBufferPointer<UInt8> {
  let wholeBuffer: UnsafeMutableBufferPointer<UInt8> =
    _unsafeMutableBufferPointerCast(ptr, 16)
  let prefix = wholeBuffer[..<endIdx]
  return UnsafeMutableBufferPointer<UInt8>(rebasing: prefix)
}
// Exposes the leading `endIdx` UTF-16 code units of a segment buffer as a
// mutable buffer pointer. With the default `endIdx` the buffer's full
// capacity is exposed.
internal func _castOutputBuffer(
  _ ptr: UnsafeMutablePointer<_Normalization._SegmentOutputBuffer>,
  endingAt endIdx: Int = _Normalization._SegmentOutputBuffer.capacity
) -> UnsafeMutableBufferPointer<UInt16> {
  let capacity = _Normalization._SegmentOutputBuffer.capacity
  let wholeBuffer: UnsafeMutableBufferPointer<UInt16> =
    _unsafeMutableBufferPointerCast(ptr, capacity)
  let prefix = wholeBuffer[..<endIdx]
  return UnsafeMutableBufferPointer<UInt16>(rebasing: prefix)
}
// Exposes the leading `endIdx` UTF-16 code units of a segment buffer as a
// read-only buffer pointer. With the default `endIdx` the buffer's full
// capacity is exposed.
internal func _castOutputBuffer(
  _ ptr: UnsafePointer<_Normalization._SegmentOutputBuffer>,
  endingAt endIdx: Int = _Normalization._SegmentOutputBuffer.capacity
) -> UnsafeBufferPointer<UInt16> {
  let capacity = _Normalization._SegmentOutputBuffer.capacity
  let wholeBuffer: UnsafeBufferPointer<UInt16> =
    _unsafeBufferPointerCast(ptr, capacity)
  let prefix = wholeBuffer[..<endIdx]
  return UnsafeBufferPointer<UInt16>(rebasing: prefix)
}
extension _StringGuts {
  // Reports whether a normalization boundary precedes `index`.
  //
  // An index whose encoded offset is 0 or equal to `count` is always treated
  // as a boundary. Otherwise the answer comes from the scalar that
  // `foreignErrorCorrectedScalar(startingAt:)` yields at `index`.
  internal func foreignHasNormalizationBoundary(
    before index: String.Index
  ) -> Bool {
    let position = index._encodedOffset
    guard position != 0 && position != count else {
      return true
    }

    let decoded = foreignErrorCorrectedScalar(startingAt: index)
    return decoded.0._hasNormalizationBoundaryBefore
  }
}
extension UnsafeBufferPointer where Element == UInt8 {
  // Reports whether a normalization boundary precedes the UTF-8 code unit at
  // `index`.
  //
  // Offsets 0 and `count` are always boundaries. `index` must not point at a
  // continuation byte (checked by the assertion below).
  internal func hasNormalizationBoundary(before index: Int) -> Bool {
    guard index != 0 && index != count else {
      return true
    }

    let leadByte = self[_unchecked: index]
    assert(!_isContinuation(leadByte))

    // Sub-300 latiny fast-path: a lead byte below 0xCC needs no property
    // lookup.
    guard leadByte >= 0xCC else { return true }

    let (scalar, _) = _decodeScalar(self, startingAt: index)
    return scalar._hasNormalizationBoundaryBefore
  }
}
extension Unicode.Scalar {
  // Terminology used below:
  //
  // Normalization boundary - a position in a string such that the text to its
  // left can be normalized independently of the text to its right;
  // concatenating the two results gives the same answer as normalizing the
  // entire string at once.
  //
  // Normalization segment - the run of code units lying between two adjacent
  // normalization boundaries (with no boundary inside). As part of
  // normalization a segment may expand, contract, and even produce new
  // sub-segments.

  // Whether this scalar value always has a normalization boundary before it.
  @inline(__always) // common fast-path
  internal var _hasNormalizationBoundaryBefore: Bool {
    // Fast-path: every scalar up through U+02FF is NFC and has a boundary
    // before it, so ICU does not need to be consulted.
    guard self.value >= 0x300 else { return true }

    _internalInvariant(Int32(exactly: self.value) != nil, "top bit shouldn't be set")
    let codePoint = Int32(bitPattern: self.value)
    let answer = __swift_stdlib_unorm2_hasBoundaryBefore(
      _Normalization._nfcNormalizer, codePoint)
    return answer != 0
  }

  // Whether the scalar's NFC quick-check property value equals 1.
  @inline(__always) // common fast-path
  internal var _isNFCQCYes: Bool {
    // Fast-path: every scalar up through U+02FF is NFC.
    guard self.value >= 0x300 else { return true }

    let quickCheck = __swift_stdlib_u_getIntPropertyValue(
      Builtin.reinterpretCast(value), __swift_stdlib_UCHAR_NFC_QUICK_CHECK)
    return quickCheck == 1
  }

  // Quick check if a scalar is NFC and a segment starter: it must both have a
  // normalization boundary before it and pass the NFC quick check.
  internal var _isNFCStarter: Bool {
    guard self._hasNormalizationBoundaryBefore else { return false }
    return self._isNFCQCYes
  }
}
extension UnsafeBufferPointer where Element == UInt8 {
  // Reports whether `index` falls on a scalar boundary of this UTF-8 buffer:
  // either it equals `count`, or the code unit stored there is not a
  // continuation byte. Indices beyond `count` violate an internal invariant.
  internal func isOnUnicodeScalarBoundary(_ index: Int) -> Bool {
    if index >= count {
      _internalInvariant(index == count)
      return true
    }
    let codeUnit = self[index]
    return !_isContinuation(codeUnit)
  }
}
// Normalizes the UTF-16 `input` into the fixed-capacity segment buffer that
// `outputBuffer` points at, and returns the value reported by the
// buffer-pointer overload. A nil result is treated by callers as the
// outputBuffer having run out of space.
internal func _tryNormalize(
  _ input: UnsafeBufferPointer<UInt16>,
  into outputBuffer:
    UnsafeMutablePointer<_Normalization._SegmentOutputBuffer>
) -> Int? {
  let destination: UnsafeMutableBufferPointer<UInt16> =
    _castOutputBuffer(outputBuffer)
  return _tryNormalize(input, into: destination)
}
// Asks ICU's NFC normalizer to normalize the UTF-16 `input` into
// `outputBuffer` and returns the count ICU reports.
//
// Returns nil whenever ICU does not report success. Callers interpret that as
// the outputBuffer having run out of space. Both buffers must have a non-nil
// base address, since each is unwrapped without checking.
internal func _tryNormalize(
  _ input: UnsafeBufferPointer<UInt16>,
  into outputBuffer: UnsafeMutableBufferPointer<UInt16>
) -> Int? {
  var status = __swift_stdlib_U_ZERO_ERROR
  let source = input.baseAddress._unsafelyUnwrappedUnchecked
  let destination = outputBuffer.baseAddress._unsafelyUnwrappedUnchecked

  let normalizedCount = __swift_stdlib_unorm2_normalize(
    _Normalization._nfcNormalizer,
    source,
    numericCast(input.count),
    destination,
    numericCast(outputBuffer.count),
    &status
  )

  if status.isSuccess {
    return numericCast(normalizedCount)
  }
  // The output buffer needs to grow
  return nil
}
// The outcome of one normalization step, as produced by `_fastNormalize` and
// `_foreignNormalize`.
internal struct NormalizationResult {
// Number of UTF-8 code units the step left in the output buffer.
var amountFilled: Int
// Source index just past the input consumed by this step (the step's
// `readIndex` advanced by the amount read).
var nextReadPosition: String.Index
// True when the step called `_allocateBuffers`, replacing the buffers that
// were passed in.
var allocatedBuffers: Bool
}
// Copies a leading run of scalars that need no normalization from
// `sourceBuffer` straight into `outputBuffer`, without invoking the ICU
// normalizer.
//
// A scalar is copied only if it is an NFC starter and a normalization boundary
// follows it; the first scalar failing that test ends the run. Returns the
// number of code units read and written (always equal here), or nil if
// nothing at all was copied.
@_effects(releasenone)
private func fastFill(
  _ sourceBuffer: UnsafeBufferPointer<UInt8>,
  _ outputBuffer: UnsafeMutableBufferPointer<UInt8>
) -> (read: Int, written: Int)? {
  // TODO: Additional fast-path: All CCC-ascending NFC_QC segments are NFC
  // TODO: Just freakin do normalization and don't bother with ICU

  // Stop four code units short of the end: that leaves room for the widest
  // UTF-8 scalar, so the unchecked writes below stay inside the buffer.
  let writeLimit = outputBuffer.count - 4
  let readLimit = sourceBuffer.count
  var readOffset = 0
  var writeOffset = 0

  while readOffset < readLimit && writeOffset < writeLimit {
    // TODO: Slightly faster code-unit scan for latiny (<0xCC)

    // Check scalar-based fast-paths
    let (scalar, width) = _decodeScalar(sourceBuffer, startingAt: readOffset)
    let nextOffset = readOffset &+ width
    _internalInvariant(nextOffset <= readLimit)

    let canCopyVerbatim =
      sourceBuffer.hasNormalizationBoundary(before: nextOffset)
      && scalar._isNFCStarter
    guard _fastPath(canCopyVerbatim) else { break }

    readOffset = nextOffset
    for codeUnit in UTF8.encode(scalar)._unsafelyUnwrappedUnchecked {
      outputBuffer[_unchecked: writeOffset] = codeUnit
      writeOffset &+= 1
    }

    _internalInvariant(readOffset == writeOffset,
      "non-normalizing UTF-8 fast path should be 1-to-1 in code units")
  }

  guard writeOffset > 0 else { return nil }
  return (readOffset, writeOffset)
}
// Transcodes a single segment into `outputBuffer` as UTF-16.
//
// Scalars are obtained by calling `f` with a read position inside `range`;
// `f` returns the scalar found there and its length in source code units.
// Copying stops at `range.upperBound`, or at the first scalar after the
// initial one that has a normalization boundary before it.
//
// Returns how many source code units were consumed and how many UTF-16 code
// units were written. If this returns nil, it means the outputBuffer ran out
// of space.
private func copyUTF16Segment(
  boundedBy range: Range<Int>,
  into outputBuffer: UnsafeMutableBufferPointer<UInt16>,
  _ f: (Int) -> (Unicode.Scalar, Int)
) -> (read: Int, written: Int)? {
  let segmentStart = range.lowerBound
  let capacity = outputBuffer.count
  var cursor = segmentStart
  var written = 0

  while cursor != range.upperBound {
    let (scalar, scalarLength) = f(cursor)

    // A boundary in front of any scalar but the first ends the segment.
    if scalar._hasNormalizationBoundaryBefore && cursor != segmentStart {
      break
    }

    cursor += scalarLength
    for codeUnit in scalar.utf16 {
      guard written < capacity else { return nil }
      outputBuffer[written] = codeUnit
      written += 1
    }
  }

  return (cursor - segmentStart, written)
}
// Transcodes the UTF-16 segment stored in `sourceBuffer` into `outputBuffer`
// as UTF-8 and returns the number of UTF-8 code units written.
// If this returns nil, it means the outputBuffer ran out of space.
private func transcodeValidUTF16ToUTF8(
  _ sourceBuffer: UnsafeBufferPointer<UInt16>,
  into outputBuffer: UnsafeMutableBufferPointer<UInt8>
) -> Int? {
  let readLimit = sourceBuffer.count
  let capacity = outputBuffer.count
  var readOffset = 0
  var writeOffset = 0

  while readOffset < readLimit {
    // No normalization-boundary checks are needed here: the input is a single
    // segment that is only being transcoded.
    let (scalar, width) = _decodeScalar(sourceBuffer, startingAt: readOffset)
    readOffset += width

    for codeUnit in UTF8.encode(scalar)._unsafelyUnwrappedUnchecked {
      guard writeOffset < capacity else { return nil }
      outputBuffer[writeOffset] = codeUnit
      writeOffset &+= 1
    }
  }

  return writeOffset
}
// Names the buffer, if any, whose current contents `_allocateBuffers` copies
// into its replacement.
internal enum _BufferToCopy {
  // Copy nothing.
  case none
  // Copy the UTF-8 output buffer.
  case output
  // Copy the UTF-16 buffer used as ICU's input.
  case icuInput
  // Copy the UTF-16 buffer used as ICU's output.
  case icuOutput
}
// Replaces all three working buffers with freshly allocated ones sized from
// `count`, copying the contents of the buffer named by `bufferToCopy` into its
// replacement.
//
// Capacities:
//   - output:    count * max NFC expansion * max UTF-16-to-UTF-8 expansion
//   - icuInput:  count
//   - icuOutput: count * max NFC expansion
//
// The replacement for the preserved buffer is never made smaller than the
// buffer it copies from; otherwise a small `count` would let the copy run past
// the end of the new allocation.
//
// This function does not deallocate the buffers it replaces, and the new
// buffers are handed back through the inout parameters; releasing them is left
// to the caller.
internal func _allocateBuffers(
  sourceCount count: Int,
  preserveDataIn bufferToCopy: _BufferToCopy,
  outputBuffer: inout UnsafeMutableBufferPointer<UInt8>,
  icuInputBuffer: inout UnsafeMutableBufferPointer<UInt16>,
  icuOutputBuffer: inout UnsafeMutableBufferPointer<UInt16>
) {
  let nfcFactor = _Normalization._maxNFCExpansionFactor
  let utf8Factor = _Normalization._maxUTF16toUTF8ExpansionFactor

  var outputCapacity = count * nfcFactor * utf8Factor
  var icuInputCapacity = count
  var icuOutputCapacity = count * nfcFactor

  // Make sure the buffer whose data is carried over has room for all of it.
  switch bufferToCopy {
  case .none:
    break
  case .output:
    outputCapacity = max(outputCapacity, outputBuffer.count)
  case .icuInput:
    icuInputCapacity = max(icuInputCapacity, icuInputBuffer.count)
  case .icuOutput:
    icuOutputCapacity = max(icuOutputCapacity, icuOutputBuffer.count)
  }

  let newOutputBuffer =
    UnsafeMutableBufferPointer<UInt8>.allocate(capacity: outputCapacity)
  let newICUInputBuffer =
    UnsafeMutableBufferPointer<UInt16>.allocate(capacity: icuInputCapacity)
  let newICUOutputBuffer =
    UnsafeMutableBufferPointer<UInt16>.allocate(capacity: icuOutputCapacity)

  // Carry over the requested buffer and confirm that every element of it was
  // copied.
  switch bufferToCopy {
  case .none:
    break
  case .output:
    let (_, written) = newOutputBuffer.initialize(from: outputBuffer)
    _internalInvariant(written == outputBuffer.count)
  case .icuInput:
    let (_, written) = newICUInputBuffer.initialize(from: icuInputBuffer)
    _internalInvariant(written == icuInputBuffer.count)
  case .icuOutput:
    let (_, written) = newICUOutputBuffer.initialize(from: icuOutputBuffer)
    _internalInvariant(written == icuOutputBuffer.count)
  }

  outputBuffer = newOutputBuffer
  icuInputBuffer = newICUInputBuffer
  icuOutputBuffer = newICUOutputBuffer
}
// Normalizes the next piece of the UTF-8 `sourceBuffer`, starting at
// `readIndex`, and leaves the resulting UTF-8 in `outputBuffer`.
//
// The function first attempts `fastFill`. When that copies nothing, it takes
// the slower route: copy one segment into `icuInputBuffer` as UTF-16,
// normalize it into `icuOutputBuffer`, then transcode that back to UTF-8 in
// `outputBuffer`. If any of those steps reports that its destination is too
// small, all three buffers are replaced via `_allocateBuffers` and the step is
// retried once.
//
// The result carries the number of UTF-8 code units produced, the index to
// resume reading from, and whether the buffers were replaced.
internal func _fastNormalize(
readIndex: String.Index,
sourceBuffer: UnsafeBufferPointer<UInt8>,
outputBuffer: inout UnsafeMutableBufferPointer<UInt8>,
icuInputBuffer: inout UnsafeMutableBufferPointer<UInt16>,
icuOutputBuffer: inout UnsafeMutableBufferPointer<UInt16>
) -> NormalizationResult {
// Work on the part of the source that begins at the read position.
let start = readIndex._encodedOffset
let rebasedSourceBuffer = UnsafeBufferPointer(rebasing: sourceBuffer[start...])
// Fast path: a run that could be copied verbatim needs no ICU round trip.
if let (read, filled) = fastFill(rebasedSourceBuffer, outputBuffer) {
let nextIndex = readIndex.encoded(offsetBy: read)
_internalInvariant(sourceBuffer.isOnUnicodeScalarBoundary(nextIndex._encodedOffset))
return NormalizationResult(
amountFilled: filled, nextReadPosition: nextIndex, allocatedBuffers: false)
}
var allocatedBuffers = false
// Runs `f`; if it returns nil, replaces the buffers (copying the contents of
// the one named by `preserveDataIn`) and runs `f` a second time. The second
// result is force-unwrapped, so a second nil traps. The invariant records
// that replacement is expected at most once per call to `_fastNormalize`.
func performWithAllocationIfNecessary<R>(
preserving preserveDataIn: _BufferToCopy, _ f: () -> R?
) -> R {
if let result = f() {
return result
}
_allocateBuffers(
sourceCount: sourceBuffer.count,
preserveDataIn: preserveDataIn,
outputBuffer: &outputBuffer,
icuInputBuffer: &icuInputBuffer,
icuOutputBuffer: &icuOutputBuffer)
_internalInvariant(!allocatedBuffers)
allocatedBuffers = true
return f()!
}
// Step 1: copy one segment into the ICU input buffer as UTF-16. Nothing
// needs preserving yet because the retry re-reads the source.
let (read, filled) = performWithAllocationIfNecessary(preserving: .none) { () -> (Int, Int)? in
return copyUTF16Segment(boundedBy: 0..<rebasedSourceBuffer.count, into: icuInputBuffer) {
return _decodeScalar(rebasedSourceBuffer, startingAt: $0)
}
}
let nextIndex = readIndex.encoded(offsetBy: read)
_internalInvariant(sourceBuffer.isOnUnicodeScalarBoundary(nextIndex._encodedOffset))
// Step 2: normalize the segment. The ICU input is preserved across a
// reallocation because the retry reads it again.
let normalized = performWithAllocationIfNecessary(preserving: .icuInput) { () -> Int? in
return _tryNormalize(
UnsafeBufferPointer(rebasing: icuInputBuffer[..<filled]), into: icuOutputBuffer)
}
// Step 3: transcode the normalized UTF-16 to UTF-8. The ICU output is
// preserved across a reallocation because the retry reads it again.
let transcoded = performWithAllocationIfNecessary(preserving: .icuOutput) { () -> Int? in
return transcodeValidUTF16ToUTF8(
UnsafeBufferPointer<UInt16>(rebasing: icuOutputBuffer[..<normalized]),
into: outputBuffer)
}
return NormalizationResult(
amountFilled: transcoded, nextReadPosition: nextIndex, allocatedBuffers: allocatedBuffers)
}
// Normalizes the next segment of `guts` between `readIndex` and `endIndex`,
// reading scalars through `guts.errorCorrectedScalar(startingAt:)`, and leaves
// the resulting UTF-8 in `outputBuffer`.
// NOTE(review): the name suggests this is the path for strings without
// directly accessible UTF-8 storage; confirm against the caller.
//
// The steps are: copy one segment into `icuInputBuffer` as UTF-16, normalize
// it into `icuOutputBuffer`, then transcode that to UTF-8 in `outputBuffer`.
// If any step reports that its destination is too small, all three buffers
// are replaced via `_allocateBuffers` and the step is retried once.
//
// The result carries the number of UTF-8 code units produced, the index to
// resume reading from, and whether the buffers were replaced.
internal func _foreignNormalize(
readIndex: String.Index,
endIndex: String.Index,
guts: _StringGuts,
outputBuffer: inout UnsafeMutableBufferPointer<UInt8>,
icuInputBuffer: inout UnsafeMutableBufferPointer<UInt16>,
icuOutputBuffer: inout UnsafeMutableBufferPointer<UInt16>
) -> NormalizationResult {
var allocatedBuffers = false
// Runs `f`; if it returns nil, replaces the buffers (copying the contents of
// the one named by `preserveDataIn`) and runs `f` a second time. The second
// result is force-unwrapped, so a second nil traps. The invariant records
// that replacement is expected at most once per call to `_foreignNormalize`.
func performWithAllocationIfNecessary<R>(
preserving preserveDataIn: _BufferToCopy, _ f: () -> R?
) -> R {
if let result = f() {
return result
}
_allocateBuffers(
sourceCount: guts.count,
preserveDataIn: preserveDataIn,
outputBuffer: &outputBuffer,
icuInputBuffer: &icuInputBuffer,
icuOutputBuffer: &icuOutputBuffer)
_internalInvariant(!allocatedBuffers)
allocatedBuffers = true
return f()!
}
// Step 1: copy one segment into the ICU input buffer as UTF-16. Nothing
// needs preserving yet because the retry re-reads the source.
let (read, filled) = performWithAllocationIfNecessary(preserving: .none) { () -> (Int, Int)? in
let start = readIndex._encodedOffset
let end = endIndex._encodedOffset
return copyUTF16Segment(boundedBy: start..<end, into: icuInputBuffer) { gutsOffset in
return guts.errorCorrectedScalar(startingAt: gutsOffset)
}
}
let nextIndex = readIndex.encoded(offsetBy: read)
_internalInvariant(guts.isOnUnicodeScalarBoundary(nextIndex))
// Step 2: normalize the segment. The ICU input is preserved across a
// reallocation because the retry reads it again.
let normalized = performWithAllocationIfNecessary(preserving: .icuInput) { () -> Int? in
return _tryNormalize(
UnsafeBufferPointer(rebasing: icuInputBuffer[..<filled]), into: icuOutputBuffer)
}
// Step 3: transcode the normalized UTF-16 to UTF-8. The ICU output is
// preserved across a reallocation because the retry reads it again.
let transcoded = performWithAllocationIfNecessary(preserving: .icuOutput) { () -> Int? in
return transcodeValidUTF16ToUTF8(
UnsafeBufferPointer<UInt16>(rebasing: icuOutputBuffer[..<normalized]),
into: outputBuffer)
}
return NormalizationResult(
amountFilled: transcoded, nextReadPosition: nextIndex, allocatedBuffers: allocatedBuffers)
}