swift-mirror/stdlib/public/core/StringNormalization.swift

//===--- StringNormalization.swift ----------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2018 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

import SwiftShims

internal enum _Normalization {
  // Handle to ICU's NFC `unorm2` normalizer, obtained once on first use.
  //
  // TODO(String performance): Should we cache one on TLS? Is this an expensive
  // call?
  internal static var _nfcNormalizer: OpaquePointer = {
    var status = __swift_stdlib_U_ZERO_ERROR
    let instance = __swift_stdlib_unorm2_getNFCInstance(&status)
    if !status.isSuccess {
      // Reaching this point should be impossible unless some deep
      // (unrecoverable) system invariant has been violated.
      fatalError("Unable to talk to ICU")
    }
    return instance
  }()

  // Normalizing to NFC can make a segment longer (e.g. some non-BMP musical
  // notes). The growth is bounded by the normal form's maximum expansion
  // factor, which for NFC is 3x.
  internal static let _maxNFCExpansionFactor = 3

  // Multiplier applied on top of the NFC factor when `_allocateBuffers` sizes
  // the UTF-8 output buffer.
  internal static let _maxUTF16toUTF8ExpansionFactor = 3

  // Fixed-capacity scratch storage holding the UTF-16 code units of one
  // segment.
  internal typealias _SegmentOutputBuffer = _FixedArray16<UInt16>
}

//
// Pointer casting helpers
//
// Reinterprets the memory at `ptr` as `count` mutable elements of type `U`.
// The caller is responsible for the memory really being usable as `U`; no
// binding or bounds checks are performed here.
@inline(__always)
private func _unsafeMutableBufferPointerCast<T, U>(
  _ ptr: UnsafeMutablePointer<T>,
  _ count: Int,
  to: U.Type = U.self
) -> UnsafeMutableBufferPointer<U> {
  let untyped = UnsafeMutableRawPointer(ptr)
  let retyped = untyped.assumingMemoryBound(to: U.self)
  return UnsafeMutableBufferPointer<U>(start: retyped, count: count)
}
// Read-only counterpart of the mutable cast helper: reinterprets the memory at
// `ptr` as `count` elements of type `U` without any checking.
@inline(__always)
private func _unsafeBufferPointerCast<T, U>(
  _ ptr: UnsafePointer<T>,
  _ count: Int,
  to: U.Type = U.self
) -> UnsafeBufferPointer<U> {
  let untyped = UnsafeRawPointer(ptr)
  let retyped = untyped.assumingMemoryBound(to: U.self)
  return UnsafeBufferPointer<U>(start: retyped, count: count)
}
// Views a 16-byte fixed array as a mutable buffer of UTF-8 code units,
// truncated to its first `endIdx` elements (all 16 by default).
internal func _castOutputBuffer(
  _ ptr: UnsafeMutablePointer<_FixedArray16<UInt8>>,
  endingAt endIdx: Int = 16
) -> UnsafeMutableBufferPointer<UInt8> {
  let whole = _unsafeMutableBufferPointerCast(ptr, 16, to: UInt8.self)
  let leading = whole[..<endIdx]
  return UnsafeMutableBufferPointer(rebasing: leading)
}
// Views a segment output buffer as a mutable buffer of UTF-16 code units,
// truncated to its first `endIdx` elements (the full capacity by default).
internal func _castOutputBuffer(
  _ ptr: UnsafeMutablePointer<_Normalization._SegmentOutputBuffer>,
  endingAt endIdx: Int = _Normalization._SegmentOutputBuffer.capacity
) -> UnsafeMutableBufferPointer<UInt16> {
  let capacity = _Normalization._SegmentOutputBuffer.capacity
  let whole = _unsafeMutableBufferPointerCast(ptr, capacity, to: UInt16.self)
  let leading = whole[..<endIdx]
  return UnsafeMutableBufferPointer(rebasing: leading)
}
// Views a segment output buffer as a read-only buffer of UTF-16 code units,
// truncated to its first `endIdx` elements (the full capacity by default).
internal func _castOutputBuffer(
  _ ptr: UnsafePointer<_Normalization._SegmentOutputBuffer>,
  endingAt endIdx: Int = _Normalization._SegmentOutputBuffer.capacity
) -> UnsafeBufferPointer<UInt16> {
  let capacity = _Normalization._SegmentOutputBuffer.capacity
  let whole = _unsafeBufferPointerCast(ptr, capacity, to: UInt16.self)
  let leading = whole[..<endIdx]
  return UnsafeBufferPointer(rebasing: leading)
}

extension _StringGuts {
  // Reports whether a normalization boundary lies immediately before `index`
  // in a foreign string. The very start and the very end always count as
  // boundaries; anywhere else the answer comes from the scalar that starts at
  // `index`.
  internal func foreignHasNormalizationBoundary(
    before index: String.Index
  ) -> Bool {
    let position = index._encodedOffset
    guard position != 0 && position != count else {
      return true
    }

    let decoded = foreignErrorCorrectedScalar(startingAt: index)
    return decoded.0._hasNormalizationBoundaryBefore
  }
}
extension UnsafeBufferPointer where Element == UInt8 {
  // Reports whether a normalization boundary lies immediately before the
  // UTF-8 code unit at `index`.
  //
  // `index` must be 0, `count`, or the position of the first code unit of a
  // scalar (never a continuation byte). The start and end of the buffer are
  // always boundaries.
  internal func hasNormalizationBoundary(before index: Int) -> Bool {
    if index == 0 || index == count {
      return true
    }
    let leadByte = self[_unchecked: index]
    // Use the stdlib's internal-checks mechanism, like the rest of this file.
    _internalInvariant(!_isContinuation(leadByte))

    // Sub-300 latiny fast-path: a lead byte below 0xCC begins a scalar below
    // U+0300, and every such scalar has a boundary before it.
    if leadByte < 0xCC { return true }

    let scalar = _decodeScalar(self, startingAt: index).0
    return scalar._hasNormalizationBoundaryBefore
  }
}

extension Unicode.Scalar {
  // Terminology used throughout this file:
  //
  // Normalization boundary - a position in a string such that the text to its
  // left and the text to its right can be normalized independently, with the
  // concatenated results equal to normalizing the whole string at once.
  //
  // Normalization segment - a run of code units lying between two adjacent
  // normalization boundaries (no boundary in its interior). Normalizing a
  // segment may make it longer, shorter, or even split it into new
  // sub-segments.

  // True when a normalization boundary always precedes this scalar.
  @inline(__always) // common fast-path
  internal var _hasNormalizationBoundaryBefore: Bool {
    // Fast-path: every scalar up through U+02FF is NFC and has a boundary
    // before it, so ICU need not be asked.
    guard self.value >= 0x300 else { return true }

    _internalInvariant(Int32(exactly: self.value) != nil, "top bit shouldn't be set")
    let codePoint = Int32(bitPattern: self.value)
    let answer = __swift_stdlib_unorm2_hasBoundaryBefore(
      _Normalization._nfcNormalizer, codePoint)
    return answer != 0
  }

  // True when this scalar's NFC quick-check property value is 1 (taken here
  // to mean "yes").
  @inline(__always) // common fast-path
  internal var _isNFCQCYes: Bool {
    // Fast-path: every scalar up through U+02FF is already NFC.
    guard self.value >= 0x300 else { return true }

    let quickCheck = __swift_stdlib_u_getIntPropertyValue(
      Builtin.reinterpretCast(value), __swift_stdlib_UCHAR_NFC_QUICK_CHECK
    )
    return quickCheck == 1
  }

  // Quick check that the scalar both starts a segment and is already NFC.
  internal var _isNFCStarter: Bool {
    guard self._hasNormalizationBoundaryBefore else { return false }
    return self._isNFCQCYes
  }
}

extension UnsafeBufferPointer where Element == UInt8 {
  // Reports whether `index` sits at the start of a scalar in this UTF-8
  // buffer. The end of the buffer (`index == count`) counts as a boundary;
  // any other position is a boundary unless it holds a continuation byte.
  internal func isOnUnicodeScalarBoundary(_ index: Int) -> Bool {
    if index >= count {
      _internalInvariant(index == count)
      return true
    }
    let byte = self[index]
    return !_isContinuation(byte)
  }

}

// Convenience overload: normalizes `input` into a fixed-size segment buffer by
// viewing it at full capacity and forwarding to the buffer-pointer overload.
// A nil result means the output buffer ran out of space.
internal func _tryNormalize(
  _ input: UnsafeBufferPointer<UInt16>,
  into outputBuffer:
    UnsafeMutablePointer<_Normalization._SegmentOutputBuffer>
) -> Int? {
  let destination: UnsafeMutableBufferPointer<UInt16> =
    _castOutputBuffer(outputBuffer)
  return _tryNormalize(input, into: destination)
}

// Asks ICU to NFC-normalize the UTF-16 code units in `input`, writing the
// result into `outputBuffer`.
//
// Returns the count reported by ICU on success. Any ICU failure yields nil,
// which callers treat as "the output buffer ran out of space".
internal func _tryNormalize(
  _ input: UnsafeBufferPointer<UInt16>,
  into outputBuffer: UnsafeMutableBufferPointer<UInt16>
) -> Int? {
  var status = __swift_stdlib_U_ZERO_ERROR
  let normalizedLength = __swift_stdlib_unorm2_normalize(
    _Normalization._nfcNormalizer,
    input.baseAddress._unsafelyUnwrappedUnchecked,
    numericCast(input.count),
    outputBuffer.baseAddress._unsafelyUnwrappedUnchecked,
    numericCast(outputBuffer.count),
    &status
  )
  if status.isSuccess {
    return numericCast(normalizedLength)
  }
  // The output buffer needs to grow
  return nil
}

// Outcome of one normalization step (see `_fastNormalize` and
// `_foreignNormalize`).
internal struct NormalizationResult {
  // Number of UTF-8 code units written to the output buffer by this step.
  var amountFilled: Int
  // Source position at which the next step should resume reading.
  var nextReadPosition: String.Index
  // True when this step had to replace the caller's buffers with newly
  // allocated ones (via `_allocateBuffers`).
  var allocatedBuffers: Bool
}

// Copies scalars from `sourceBuffer` straight into `outputBuffer` for as long
// as each one is an NFC starter and is followed by a normalization boundary,
// i.e. for as long as no real normalization work is required.
//
// Copying stops at the first scalar that fails that test, at the end of the
// input, or once fewer than four bytes of headroom would remain in the output.
// Returns the number of code units read and written, or nil when nothing
// could be copied at all.
@_effects(releasenone)
private func fastFill(
  _ sourceBuffer: UnsafeBufferPointer<UInt8>,
  _ outputBuffer: UnsafeMutableBufferPointer<UInt8>
) -> (read: Int, written: Int)? {
  // Stop four bytes short of the end so the unchecked stores of one encoded
  // scalar below always stay inside the buffer.
  let writeLimit = outputBuffer.count - 4
  let sourceLength = sourceBuffer.count

  // TODO: Additional fast-path: All CCC-ascending NFC_QC segments are NFC
  // TODO: Just freakin do normalization and don't bother with ICU
  var consumed = 0
  var produced = 0
  while consumed < sourceLength && produced < writeLimit {
    // TODO: Slightly faster code-unit scan for latiny (<0xCC)

    // Check scalar-based fast-paths
    let (scalar, width) = _decodeScalar(sourceBuffer, startingAt: consumed)
    _internalInvariant(consumed &+ width <= sourceLength)

    if _slowPath(
      !(sourceBuffer.hasNormalizationBoundary(before: consumed &+ width)
        && scalar._isNFCStarter)
    ) {
      break
    }
    consumed &+= width

    let encoded = UTF8.encode(scalar)._unsafelyUnwrappedUnchecked
    for byte in encoded {
      outputBuffer[_unchecked: produced] = byte
      produced &+= 1
    }

    _internalInvariant(consumed == produced,
      "non-normalizing UTF-8 fast path should be 1-to-1 in code units")
  }

  guard produced > 0 else { return nil }
  return (read: consumed, written: produced)
}

// Copies exactly one normalization segment into `outputBuffer` as UTF-16.
//
// Scalars are obtained by calling `f` with a read offset; `f` returns the
// scalar at that offset together with its length in source code units. The
// segment starts at `range.lowerBound` and ends just before the next scalar
// that has a normalization boundary before it, or at `range.upperBound`.
//
// Returns the number of source units consumed and UTF-16 units written, or
// nil when `outputBuffer` ran out of space.
private func copyUTF16Segment(
  boundedBy range: Range<Int>,
  into outputBuffer: UnsafeMutableBufferPointer<UInt16>,
  _ f: (Int) -> (Unicode.Scalar, Int)
) -> (read: Int, written: Int)? {
  let segmentStart = range.lowerBound
  let capacity = outputBuffer.count
  var cursor = segmentStart
  var written = 0

  while cursor != range.upperBound {
    let (scalar, length) = f(cursor)
    // The first scalar always belongs to the segment; any later scalar with a
    // boundary before it starts the next segment.
    let startsWithBoundary = scalar._hasNormalizationBoundaryBefore
    if startsWithBoundary && cursor != segmentStart {
      break
    }

    cursor += length

    for unit in scalar.utf16 {
      guard written < capacity else {
        return nil
      }
      outputBuffer[written] = unit
      written += 1
    }
  }
  return (read: cursor - segmentStart, written: written)
}

// Transcodes the UTF-16 segment held in `sourceBuffer` into `outputBuffer` as
// UTF-8.
//
// Returns the number of UTF-8 code units written, or nil when `outputBuffer`
// ran out of space.
private func transcodeValidUTF16ToUTF8(
  _ sourceBuffer: UnsafeBufferPointer<UInt16>,
  into outputBuffer: UnsafeMutableBufferPointer<UInt8>
) -> Int? {
  let sourceLength = sourceBuffer.count
  let capacity = outputBuffer.count
  var cursor = 0
  var written = 0

  while cursor < sourceLength {
    // No normalization-boundary checks are needed here: the input is a single
    // segment and is only being transcoded.
    let (scalar, length) = _decodeScalar(sourceBuffer, startingAt: cursor)
    cursor += length

    let encoded = UTF8.encode(scalar)._unsafelyUnwrappedUnchecked
    for byte in encoded {
      guard written < capacity else {
        return nil
      }
      outputBuffer[written] = byte
      written &+= 1
    }
  }
  return written
}

// Selects which existing buffer, if any, `_allocateBuffers` copies into its
// freshly allocated replacement.
internal enum _BufferToCopy {
  // Copy nothing.
  case none
  // Carry over the UTF-8 output buffer.
  case output
  // Carry over the UTF-16 buffer that feeds ICU.
  case icuInput
  // Carry over the UTF-16 buffer that receives ICU's result.
  case icuOutput
}

// Replaces all three working buffers with newly allocated ones sized from the
// source length, optionally carrying over the contents of one of them.
//
// - sourceCount: number of code units in the source being normalized; the new
//   capacities are derived from it using the expansion factors in
//   `_Normalization`.
// - preserveDataIn: which old buffer's contents to copy into its replacement.
// - outputBuffer / icuInputBuffer / icuOutputBuffer: on return, each refers to
//   its newly allocated replacement.
//
// The previous buffers are not deallocated here, and nothing in this function
// records ownership of the new ones.
// NOTE(review): presumably the caller releases the new buffers, keyed off
// `NormalizationResult.allocatedBuffers` — confirm against the callers.
internal func _allocateBuffers(
  sourceCount count: Int,
  preserveDataIn bufferToCopy: _BufferToCopy,
  outputBuffer: inout UnsafeMutableBufferPointer<UInt8>,
  icuInputBuffer: inout UnsafeMutableBufferPointer<UInt16>,
  icuOutputBuffer: inout UnsafeMutableBufferPointer<UInt16>
) {
  // A replacement is never smaller than the buffer it replaces. For a short
  // source the computed capacity can fall below the old buffer's size, in
  // which case the preserving copy below would not fit.
  let output = Swift.max(
    count * _Normalization._maxNFCExpansionFactor
      * _Normalization._maxUTF16toUTF8ExpansionFactor,
    outputBuffer.count)
  let icuInput = Swift.max(count, icuInputBuffer.count)
  let icuOutput = Swift.max(
    count * _Normalization._maxNFCExpansionFactor,
    icuOutputBuffer.count)
  let newOutputBuffer = UnsafeMutableBufferPointer<UInt8>.allocate(capacity: output)
  let newICUInputBuffer = UnsafeMutableBufferPointer<UInt16>.allocate(capacity: icuInput)
  let newICUOutputBuffer = UnsafeMutableBufferPointer<UInt16>.allocate(capacity: icuOutput)

  // Each invariant checks that the whole old buffer was carried over, rather
  // than assuming a particular fixed size for it.
  switch bufferToCopy {
  case .none:
    break
  case .output:
    let (_, written) = newOutputBuffer.initialize(from: outputBuffer)
    _internalInvariant(written == outputBuffer.count)
  case .icuInput:
    let (_, written) = newICUInputBuffer.initialize(from: icuInputBuffer)
    _internalInvariant(written == icuInputBuffer.count)
  case .icuOutput:
    let (_, written) = newICUOutputBuffer.initialize(from: icuOutputBuffer)
    _internalInvariant(written == icuOutputBuffer.count)
  }

  outputBuffer = newOutputBuffer
  icuInputBuffer = newICUInputBuffer
  icuOutputBuffer = newICUOutputBuffer
}

// Performs one normalization step over a UTF-8 source, starting at
// `readIndex`, and writes the normalized UTF-8 into `outputBuffer`.
//
// The step first tries `fastFill`, which copies scalars that need no
// normalization. If that copies nothing, a single segment is instead
// transcoded to UTF-16, normalized by ICU, and transcoded back to UTF-8.
//
// - readIndex: position in `sourceBuffer` at which to start reading.
// - sourceBuffer: the complete UTF-8 source.
// - outputBuffer / icuInputBuffer / icuOutputBuffer: working buffers. If any
//   of them proves too small, all three are replaced through
//   `_allocateBuffers` and the result's `allocatedBuffers` is true.
//
// Returns the number of UTF-8 code units written, the position to resume
// reading from, and whether the buffers were replaced.
internal func _fastNormalize(
  readIndex: String.Index,
  sourceBuffer: UnsafeBufferPointer<UInt8>,
  outputBuffer: inout UnsafeMutableBufferPointer<UInt8>,
  icuInputBuffer: inout UnsafeMutableBufferPointer<UInt16>,
  icuOutputBuffer: inout UnsafeMutableBufferPointer<UInt16>
) -> NormalizationResult {
  let start = readIndex._encodedOffset
  // View of the source from the read position onward, so offsets used below
  // are relative to `readIndex`.
  let rebasedSourceBuffer = UnsafeBufferPointer(rebasing: sourceBuffer[start...])
  // Fast path: copy already-normalized scalars directly, with no ICU call and
  // no allocation.
  if let (read, filled) = fastFill(rebasedSourceBuffer, outputBuffer) {
    let nextIndex = readIndex.encoded(offsetBy: read)
    _internalInvariant(sourceBuffer.isOnUnicodeScalarBoundary(nextIndex._encodedOffset))

    return NormalizationResult(
      amountFilled: filled, nextReadPosition: nextIndex, allocatedBuffers: false)
  }
  var allocatedBuffers = false
  // Runs `f`; if it returns nil, replaces the working buffers (keeping the
  // contents of `preserveDataIn`) and runs `f` once more. The invariant checks
  // that replacement happens at most once per call, and the force-unwrap
  // assumes the retry succeeds with the new buffers.
  func performWithAllocationIfNecessary<R>(
    preserving preserveDataIn: _BufferToCopy, _ f: () -> R?
  ) -> R {
    if let result = f() {
      return result
    }
    _allocateBuffers(
      sourceCount: sourceBuffer.count,
      preserveDataIn: preserveDataIn,
      outputBuffer: &outputBuffer,
      icuInputBuffer: &icuInputBuffer,
      icuOutputBuffer: &icuOutputBuffer)
    _internalInvariant(!allocatedBuffers)
    allocatedBuffers = true
    return f()!
  }

  // Step 1: copy one segment into the ICU input buffer as UTF-16. Nothing
  // needs preserving, because no working buffer holds useful data yet.
  let (read, filled) = performWithAllocationIfNecessary(preserving: .none) { () -> (Int, Int)? in
    return copyUTF16Segment(boundedBy: 0..<rebasedSourceBuffer.count, into: icuInputBuffer) {
      return _decodeScalar(rebasedSourceBuffer, startingAt: $0)
    }
  }

  let nextIndex = readIndex.encoded(offsetBy: read)
  _internalInvariant(sourceBuffer.isOnUnicodeScalarBoundary(nextIndex._encodedOffset))

  // Step 2: normalize the segment with ICU. If the buffers are replaced, the
  // UTF-16 input written in step 1 has to be carried over.
  let normalized = performWithAllocationIfNecessary(preserving: .icuInput) { () -> Int? in
    return _tryNormalize(
      UnsafeBufferPointer(rebasing: icuInputBuffer[..<filled]), into: icuOutputBuffer)
  }

  // Step 3: transcode ICU's UTF-16 result to UTF-8. If the buffers are
  // replaced, ICU's output from step 2 has to be carried over.
  let transcoded = performWithAllocationIfNecessary(preserving: .icuOutput) { () -> Int? in
    return transcodeValidUTF16ToUTF8(
      UnsafeBufferPointer<UInt16>(rebasing: icuOutputBuffer[..<normalized]),
      into: outputBuffer)
  }
  return NormalizationResult(
    amountFilled: transcoded, nextReadPosition: nextIndex, allocatedBuffers: allocatedBuffers)
}

// Performs one normalization step over a string accessed through `guts`,
// covering the single segment that starts at `readIndex`, and writes the
// normalized UTF-8 into `outputBuffer`.
//
// The segment is copied to UTF-16, normalized by ICU, and transcoded to UTF-8.
//
// - readIndex: position at which the segment starts.
// - endIndex: upper bound for reading; the segment never extends past it.
// - guts: string storage from which scalars are decoded.
// - outputBuffer / icuInputBuffer / icuOutputBuffer: working buffers. If any
//   of them proves too small, all three are replaced through
//   `_allocateBuffers` and the result's `allocatedBuffers` is true.
//
// Returns the number of UTF-8 code units written, the position to resume
// reading from, and whether the buffers were replaced.
internal func _foreignNormalize(
  readIndex: String.Index,
  endIndex: String.Index,
  guts: _StringGuts,
  outputBuffer: inout UnsafeMutableBufferPointer<UInt8>,
  icuInputBuffer: inout UnsafeMutableBufferPointer<UInt16>,
  icuOutputBuffer: inout UnsafeMutableBufferPointer<UInt16>
) -> NormalizationResult {
  var allocatedBuffers = false
  // Runs `f`; if it returns nil, replaces the working buffers (keeping the
  // contents of `preserveDataIn`) and runs `f` once more. The invariant checks
  // that replacement happens at most once per call, and the force-unwrap
  // assumes the retry succeeds with the new buffers.
  func performWithAllocationIfNecessary<R>(
    preserving preserveDataIn: _BufferToCopy, _ f: () -> R?
  ) -> R {
    if let result = f() {
      return result
    }
    _allocateBuffers(
      sourceCount: guts.count,
      preserveDataIn: preserveDataIn,
      outputBuffer: &outputBuffer,
      icuInputBuffer: &icuInputBuffer,
      icuOutputBuffer: &icuOutputBuffer)
    _internalInvariant(!allocatedBuffers)
    allocatedBuffers = true
    return f()!
  }
  // Step 1: copy one segment into the ICU input buffer as UTF-16. Nothing
  // needs preserving, because no working buffer holds useful data yet.
  let (read, filled) = performWithAllocationIfNecessary(preserving: .none) { () -> (Int, Int)? in
    let start = readIndex._encodedOffset
    let end = endIndex._encodedOffset
    return copyUTF16Segment(boundedBy: start..<end, into: icuInputBuffer) { gutsOffset in
      return guts.errorCorrectedScalar(startingAt: gutsOffset)
    }
  }

  let nextIndex = readIndex.encoded(offsetBy: read)
  _internalInvariant(guts.isOnUnicodeScalarBoundary(nextIndex))

  // Step 2: normalize the segment with ICU. If the buffers are replaced, the
  // UTF-16 input written in step 1 has to be carried over.
  let normalized = performWithAllocationIfNecessary(preserving: .icuInput) { () -> Int? in
    return _tryNormalize(
      UnsafeBufferPointer(rebasing: icuInputBuffer[..<filled]), into: icuOutputBuffer)
  }

  // Step 3: transcode ICU's UTF-16 result to UTF-8. If the buffers are
  // replaced, ICU's output from step 2 has to be carried over.
  let transcoded = performWithAllocationIfNecessary(preserving: .icuOutput) { () -> Int? in
    return transcodeValidUTF16ToUTF8(
      UnsafeBufferPointer<UInt16>(rebasing: icuOutputBuffer[..<normalized]),
      into: outputBuffer)
  }
  return NormalizationResult(
    amountFilled: transcoded, nextReadPosition: nextIndex, allocatedBuffers: allocatedBuffers)
}