// swift-mirror/stdlib/public/core/UTF16.swift

//===--- UTF16.swift ------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
extension Unicode {
  /// The UTF-16 encoding, used as a namespace for the static encoding,
  /// decoding, and surrogate-inspection operations declared in the extensions
  /// of this type in this file.
  @frozen
  public enum UTF16: Sendable {
  // The only case of the enum; its payload is a `ForwardParser`.
  // NOTE(review): the name suggests this exists for Swift 3 era
  // compatibility; confirm before relying on or changing it.
  case _swift3Buffer(Unicode.UTF16.ForwardParser)
  }
}

extension Unicode.UTF16 {
  /// Returns the number of code units required to encode the given Unicode
  /// scalar.
  ///
  /// Because a Unicode scalar value can require up to 21 bits to store its
  /// value, some Unicode scalars are represented in UTF-16 by a pair of
  /// 16-bit code units. The first and second code units of the pair,
  /// designated *leading* and *trailing* surrogates, make up a *surrogate
  /// pair*.
  ///
  ///     let anA: Unicode.Scalar = "A"
  ///     print(anA.value)
  ///     // Prints "65"
  ///     print(UTF16.width(anA))
  ///     // Prints "1"
  ///
  ///     let anApple: Unicode.Scalar = "🍎"
  ///     print(anApple.value)
  ///     // Prints "127822"
  ///     print(UTF16.width(anApple))
  ///     // Prints "2"
  ///
  /// - Parameter x: A Unicode scalar value.
  /// - Returns: The width of `x` when encoded in UTF-16, either `1` or `2`.
  @inlinable
  public static func width(_ x: Unicode.Scalar) -> Int {
    // Anything above the largest 16-bit value cannot fit in one code unit and
    // therefore needs a surrogate pair.
    if x.value > UInt16.max {
      return 2
    }
    return 1
  }

  /// Returns the high-surrogate code unit of the surrogate pair representing
  /// the specified Unicode scalar.
  ///
  /// Because a Unicode scalar value can require up to 21 bits to store its
  /// value, some Unicode scalars are represented in UTF-16 by a pair of
  /// 16-bit code units. The first and second code units of the pair,
  /// designated *leading* and *trailing* surrogates, make up a *surrogate
  /// pair*.
  ///
  ///     let apple: Unicode.Scalar = "🍎"
  ///     print(UTF16.leadSurrogate(apple))
  ///     // Prints "55356"
  ///
  /// - Parameter x: A Unicode scalar value. `x` must be represented by a
  ///   surrogate pair when encoded in UTF-16. To check whether `x` is
  ///   represented by a surrogate pair, use `UTF16.width(x) == 2`.
  /// - Returns: The leading surrogate code unit of `x` when encoded in UTF-16.
  @inlinable
  public static func leadSurrogate(_ x: Unicode.Scalar) -> UTF16.CodeUnit {
    _precondition(width(x) == 2)
    // The lead surrogate carries the bits above the low ten of the scalar's
    // offset from U+10000.
    let highBits = (x.value - 0x1_0000) &>> (10 as UInt32)
    return 0xD800 + UTF16.CodeUnit(truncatingIfNeeded: highBits)
  }

  /// Returns the low-surrogate code unit of the surrogate pair representing
  /// the specified Unicode scalar.
  ///
  /// Because a Unicode scalar value can require up to 21 bits to store its
  /// value, some Unicode scalars are represented in UTF-16 by a pair of
  /// 16-bit code units. The first and second code units of the pair,
  /// designated *leading* and *trailing* surrogates, make up a *surrogate
  /// pair*.
  ///
  ///     let apple: Unicode.Scalar = "🍎"
  ///     print(UTF16.trailSurrogate(apple))
  ///     // Prints "57166"
  ///
  /// - Parameter x: A Unicode scalar value. `x` must be represented by a
  ///   surrogate pair when encoded in UTF-16. To check whether `x` is
  ///   represented by a surrogate pair, use `UTF16.width(x) == 2`.
  /// - Returns: The trailing surrogate code unit of `x` when encoded in UTF-16.
  @inlinable
  public static func trailSurrogate(_ x: Unicode.Scalar) -> UTF16.CodeUnit {
    _precondition(width(x) == 2)
    // The trail surrogate carries the low ten bits of the scalar's offset
    // from U+10000.
    let lowTenBitsMask = ((1 as UInt32) &<< 10) - 1
    let lowBits = (x.value - 0x1_0000) & lowTenBitsMask
    return 0xDC00 + UTF16.CodeUnit(truncatingIfNeeded: lowBits)
  }

  /// Returns a Boolean value indicating whether the specified code unit is a
  /// high-surrogate code unit.
  ///
  /// Here's an example of checking whether each code unit in a string's
  /// `utf16` view is a lead surrogate. The `apple` string contains a single
  /// emoji character made up of a surrogate pair when encoded in UTF-16.
  ///
  ///     let apple = "🍎"
  ///     for unit in apple.utf16 {
  ///         print(UTF16.isLeadSurrogate(unit))
  ///     }
  ///     // Prints "true"
  ///     // Prints "false"
  ///
  /// This method does not validate the encoding of a UTF-16 sequence beyond
  /// the specified code unit. Specifically, it does not validate that a
  /// low-surrogate code unit follows `x`.
  ///
  /// - Parameter x: A UTF-16 code unit.
  /// - Returns: `true` if `x` is a high-surrogate code unit; otherwise,
  ///   `false`.
  @inlinable
  public static func isLeadSurrogate(_ x: CodeUnit) -> Bool {
    // All high surrogates share the same top six bits (0b1101_10).
    let topSixBits = x & 0xFC00
    return topSixBits == 0xD800
  }

  /// Returns a Boolean value indicating whether the specified code unit is a
  /// low-surrogate code unit.
  ///
  /// Here's an example of checking whether each code unit in a string's
  /// `utf16` view is a trailing surrogate. The `apple` string contains a
  /// single emoji character made up of a surrogate pair when encoded in
  /// UTF-16.
  ///
  ///     let apple = "🍎"
  ///     for unit in apple.utf16 {
  ///         print(UTF16.isTrailSurrogate(unit))
  ///     }
  ///     // Prints "false"
  ///     // Prints "true"
  ///
  /// This method does not validate the encoding of a UTF-16 sequence beyond
  /// the specified code unit. Specifically, it does not validate that a
  /// high-surrogate code unit precedes `x`.
  ///
  /// - Parameter x: A UTF-16 code unit.
  /// - Returns: `true` if `x` is a low-surrogate code unit; otherwise,
  ///   `false`.
  @inlinable
  public static func isTrailSurrogate(_ x: CodeUnit) -> Bool {
    // All low surrogates share the same top six bits (0b1101_11).
    let topSixBits = x & 0xFC00
    return topSixBits == 0xDC00
  }

  /// Returns a Boolean value indicating whether the specified code unit is
  /// either a high-surrogate or a low-surrogate code unit.
  ///
  /// This method inspects only `x`; it does not check whether `x` is part of
  /// a well-formed surrogate pair.
  ///
  /// - Parameter x: A UTF-16 code unit.
  /// - Returns: `true` if `x` is a high-surrogate or low-surrogate code unit;
  ///   otherwise, `false`.
  @_alwaysEmitIntoClient
  public static func isSurrogate(_ x: CodeUnit) -> Bool {
    if isLeadSurrogate(x) {
      return true
    }
    return isTrailSurrogate(x)
  }

  /// Copies `count` elements from `source` to `destination`, converting each
  /// element through its UTF-16 code unit representation when the two element
  /// types differ in stride.
  ///
  /// - Parameters:
  ///   - source: Pointer to the first element to read.
  ///   - destination: Pointer to the first element to write.
  ///   - count: The number of elements to copy.
  @inlinable
  public // @testable
  static func _copy<T: _StringElement, U: _StringElement>(
    source: UnsafeMutablePointer<T>,
    destination: UnsafeMutablePointer<U>,
    count: Int
  ) {
    guard MemoryLayout<T>.stride != MemoryLayout<U>.stride else {
      // Equal strides: the elements can be moved as raw bytes in one call.
      let byteCount = UInt(count) * UInt(MemoryLayout<U>.stride)
      unsafe _memcpy(
        dest: UnsafeMutablePointer(destination),
        src: UnsafeMutablePointer(source),
        size: byteCount)
      return
    }
    // Different strides: convert element by element via a UTF-16 code unit.
    for offset in 0..<count {
      let unit = unsafe T._toUTF16CodeUnit(source[offset])
      unsafe destination[offset] = U._fromUTF16CodeUnit(unit)
    }
  }

  /// Returns the number of UTF-16 code units required for the given code unit
  /// sequence when transcoded to UTF-16, and a Boolean value indicating
  /// whether the sequence was found to contain only ASCII characters.
  ///
  /// The following example finds the length of the UTF-16 encoding of the
  /// string `"Fermata 𝄐"`, starting with its UTF-8 representation.
  ///
  ///     let fermata = "Fermata 𝄐"
  ///     let bytes = fermata.utf8
  ///     print(Array(bytes))
  ///     // Prints "[70, 101, 114, 109, 97, 116, 97, 32, 240, 157, 132, 144]"
  ///
  ///     let result = UTF16.transcodedLength(of: bytes.makeIterator(),
  ///                                         decodedAs: UTF8.self,
  ///                                         repairingIllFormedSequences: false)
  ///     print(result)
  ///     // Prints "Optional((count: 10, isASCII: false))"
  ///
  /// - Parameters:
  ///   - input: An iterator of code units to be translated, encoded as
  ///     `sourceEncoding`. If `repairingIllFormedSequences` is `true`, the
  ///     entire iterator will be exhausted. Otherwise, iteration will stop if
  ///     an ill-formed sequence is detected.
  ///   - sourceEncoding: The Unicode encoding of `input`.
  ///   - repairingIllFormedSequences: Pass `true` to measure the length of
  ///     `input` even when `input` contains ill-formed sequences. Each
  ///     ill-formed sequence is replaced with a Unicode replacement character
  ///     (`"\u{FFFD}"`) and is measured as such. Pass `false` to immediately
  ///     stop measuring `input` when an ill-formed sequence is encountered.
  /// - Returns: A tuple containing the number of UTF-16 code units required to
  ///   encode `input` and a Boolean value that indicates whether the `input`
  ///   contained only ASCII characters. If `repairingIllFormedSequences` is
  ///   `false` and an ill-formed sequence is detected, this method returns
  ///   `nil`.
  @inlinable
  public static func transcodedLength<
    Input: IteratorProtocol,
    Encoding: Unicode.Encoding
  >(
    of input: Input,
    decodedAs sourceEncoding: Encoding.Type,
    repairingIllFormedSequences: Bool
  ) -> (count: Int, isASCII: Bool)?
    where Encoding.CodeUnit == Input.Element {

    // Running total of UTF-16 code units needed for the output.
    var utf16Count = 0
    // Mutable copy of the iterator; the caller's `input` is not advanced.
    var i = input
    var d = Encoding.ForwardParser()

    // Fast path for ASCII in a UTF8 buffer
    if sourceEncoding == Unicode.UTF8.self {
      // `peek` always holds the most recently read code unit. It stays 0 when
      // the input is empty, which makes the all-ASCII check below succeed.
      var peek: Encoding.CodeUnit = 0
      while let u = i.next() {
        peek = u
        // Stop at the first non-ASCII unit; it has been consumed from `i` but
        // not yet counted.
        guard _fastPath(peek < 0x80) else { break }
        utf16Count = utf16Count + 1
      }
      // The loop ended by exhausting the input rather than by hitting a
      // non-ASCII unit, so every unit was ASCII.
      if _fastPath(peek < 0x80) { return (utf16Count, true) }

      // Hand the already-consumed non-ASCII unit to a UTF-8 parser by placing
      // it in the parser's buffer, then use that parser for the general loop.
      var d1 = UTF8.ForwardParser()
      d1._buffer.append(numericCast(peek))
      d = _identityCast(d1, to: Encoding.ForwardParser.self)
    }

    // Bitwise OR of every UTF-16 code unit counted by the general loop; the
    // final result is ASCII only if this stays below 0x80.
    var utf16BitUnion: CodeUnit = 0
    while true {
      let s = d.parseScalar(from: &i)
      if _fastPath(s._valid != nil), let scalarContent = s._valid {
        // Well-formed scalar: count its UTF-16 encoding.
        let utf16 = transcode(scalarContent, from: sourceEncoding)
          ._unsafelyUnwrappedUnchecked
        utf16Count += utf16.count
        for x in utf16 { utf16BitUnion |= x }
      }
      else if let _ = s._error {
        // Ill-formed sequence: either give up, or measure it as one
        // replacement code unit.
        guard _fastPath(repairingIllFormedSequences) else { return nil }
        utf16Count += 1
        utf16BitUnion |= UTF16._replacementCodeUnit
      }
      else {
        // Neither a scalar nor an error: the input is exhausted.
        return (utf16Count, utf16BitUnion < 0x80)
      }
    }
    // Not reachable: the loop above only exits through a `return`.
    fatalError()
  }
}

extension Unicode.UTF16: _UnicodeEncoding {
  // A UTF-16 code unit is a 16-bit unsigned integer.
  public typealias CodeUnit = UInt16
  // An encoded scalar is held in a `_UIntBuffer` of `UInt16` elements; the
  // code in this file stores either 16 or 32 bits in it.
  public typealias EncodedScalar = _UIntBuffer<UInt16>

  /// The single UTF-16 code unit used to stand in for an ill-formed sequence
  /// (the value `0xFFFD`).
  @inlinable
  internal static var _replacementCodeUnit: CodeUnit {
    @inline(__always) get { return 0xfffd }
  }

  /// The replacement value `0xFFFD` as an encoded scalar occupying a single
  /// 16-bit code unit.
  @inlinable
  public static var encodedReplacementCharacter: EncodedScalar {
    return EncodedScalar(_storage: 0xFFFD, _bitCount: 16)
  }

  /// Returns a Boolean value indicating whether the given code unit is in the
  /// ASCII range.
  ///
  /// - Parameter x: A UTF-16 code unit.
  /// - Returns: `true` if `x` is less than `0x80`; otherwise, `false`.
  @_alwaysEmitIntoClient
  public static func isASCII(_ x: CodeUnit) -> Bool {
    return x < 0x80
  }

  /// Returns a Boolean value indicating whether the given code unit is a
  /// complete scalar on its own, that is, not a surrogate.
  ///
  /// - Parameter x: A UTF-16 code unit.
  /// - Returns: `false` if `x` lies in `0xD800...0xDFFF`; otherwise, `true`.
  @inlinable
  public static func _isScalar(_ x: CodeUnit) -> Bool {
    // Lead and trail surrogates together share their top five bits.
    let topFiveBits = x & 0xf800
    return topFiveBits != 0xd800
  }

  /// Combines a surrogate pair into the Unicode scalar it represents.
  ///
  /// - Parameters:
  ///   - lead: A high-surrogate code unit (checked by an internal invariant).
  ///   - trail: A low-surrogate code unit (checked by an internal invariant).
  /// - Returns: The scalar encoded by `lead` followed by `trail`.
  @inlinable
  @inline(__always)
  internal static func _decodeSurrogates(
    _ lead: CodeUnit,
    _ trail: CodeUnit
  ) -> Unicode.Scalar {
    _internalInvariant(isLeadSurrogate(lead))
    _internalInvariant(isTrailSurrogate(trail))
    // Each surrogate contributes its low ten bits; the lead's go on top.
    let highTen = UInt32(lead & 0x03ff) &<< 10
    let lowTen = UInt32(trail & 0x03ff)
    return Unicode.Scalar(_unchecked: 0x10000 + (highTen | lowTen))
  }

  /// Converts a UTF-16 encoded scalar into a `Unicode.Scalar`.
  ///
  /// - Parameter source: An encoded scalar holding either 16 bits (a single
  ///   code unit) or 32 bits (a surrogate pair with the lead surrogate in the
  ///   low 16 bits).
  /// - Returns: The decoded scalar.
  @inlinable
  public static func decode(_ source: EncodedScalar) -> Unicode.Scalar {
    let storage = source._storage
    if _fastPath(source._bitCount == 16) {
      // A single code unit is the scalar value itself.
      return Unicode.Scalar(_unchecked: storage & 0xffff)
    }
    _internalInvariant(source._bitCount == 32)
    // Trail surrogate sits in the high half, lead surrogate in the low half.
    let trailPayload: UInt32 = (storage >> 16) & 0x03ff
    let leadPayload: UInt32 = (storage & 0x03ff) << 10
    return Unicode.Scalar(_unchecked: 0x10000 + (trailPayload | leadPayload))
  }

  /// Encodes a Unicode scalar as UTF-16.
  ///
  /// - Parameter source: The scalar to encode.
  /// - Returns: An encoded scalar holding one code unit for values below
  ///   `0x10000`, or a surrogate pair (lead surrogate in the low 16 bits,
  ///   trail surrogate in the high 16 bits) otherwise.
  @inlinable
  public static func encode(
    _ source: Unicode.Scalar
  ) -> EncodedScalar? {
    let value = source.value
    let supplementaryStart = (1 as UInt32) << 16
    if _fastPath(value < supplementaryStart) {
      return EncodedScalar(_storage: value, _bitCount: 16)
    }
    let offset = value - supplementaryStart
    let trail = 0xdc00 + (offset & 0x3ff)
    let lead = 0xd800 + (offset &>> 10 & 0x3ff)
    return EncodedScalar(_storage: (trail &<< 16) | lead, _bitCount: 32)
  }

  /// Converts a scalar encoded in `FromEncoding` into its UTF-16 encoding.
  ///
  /// UTF-8 and UTF-16 sources are handled by dedicated branches; any other
  /// encoding is decoded to a scalar and then re-encoded.
  ///
  /// - Parameters:
  ///   - content: The encoded scalar to convert.
  ///   - _: The encoding of `content`.
  /// - Returns: The UTF-16 encoding of the scalar represented by `content`.
  @inlinable
  @inline(__always)
  public static func transcode<FromEncoding: Unicode.Encoding>(
    _ content: FromEncoding.EncodedScalar, from _: FromEncoding.Type
  ) -> EncodedScalar? {
    if _fastPath(FromEncoding.self == UTF8.self) {
      let c = _identityCast(content, to: UTF8.EncodedScalar.self)
      // `b` counts the bytes of the UTF-8 sequence that are still to be
      // folded in; it is decremented once per byte handled.
      var b = c.count
      b = b &- 1
      if _fastPath(b == 0) {
        // One-byte sequence: keep the low seven bits.
        // NOTE(review): the `&- 0x1` presumably removes a +1 bias applied to
        // each stored byte of `_biasedBits`; confirm against the UTF-8
        // encoded-scalar storage type.
        return EncodedScalar(
          _storage: (c._biasedBits &- 0x1) & 0b0__111_1111, _bitCount: 16)
      }
      // `s` holds the not-yet-consumed bytes (lowest byte first); `r`
      // accumulates the scalar value, six payload bits per continuation byte.
      var s = c._biasedBits &- 0x01010101
      var r = s
      r &<<= 6
      s &>>= 8
      r |= s & 0b0__11_1111
      b = b &- 1

      if _fastPath(b == 0) {
        // Two-byte sequence: 11 significant bits.
        return EncodedScalar(_storage: r & 0b0__111_1111_1111, _bitCount: 16)
      }
      r &<<= 6
      s &>>= 8
      r |= s & 0b0__11_1111
      b = b &- 1

      if _fastPath(b == 0) {
        // Three-byte sequence: 16 significant bits, still one code unit.
        return EncodedScalar(_storage: r & 0xFFFF, _bitCount: 16)
      }

      // Four-byte sequence: 21 significant bits; `encode` produces the
      // surrogate pair.
      r &<<= 6
      s &>>= 8
      r |= s & 0b0__11_1111
      r &= (1 &<< 21) - 1
      return encode(Unicode.Scalar(_unchecked: r))
    }
    else if _fastPath(FromEncoding.self == UTF16.self) {
      // Already UTF-16: reinterpret the value without conversion.
      return unsafe unsafeBitCast(content, to: UTF16.EncodedScalar.self)
    }
    // Generic path: go through a decoded scalar.
    return encode(FromEncoding.decode(content))
  }

  /// Parser state for reading UTF-16 in the forward direction.
  ///
  /// The parsing behavior itself is supplied by the `Unicode.Parser` and
  /// `_UTFParser` conformances declared in an extension of this type.
  @frozen
  public struct ForwardParser: Sendable {
    // Storage type for code units that have been read but not yet returned.
    public typealias _Buffer = _UIntBuffer<UInt16>

    // Buffered code units.
    public var _buffer: _Buffer

    /// Creates a parser whose buffer is a freshly created `_Buffer()`.
    @inlinable
    public init() { _buffer = _Buffer() }
  }

  /// Parser state for reading UTF-16 in the reverse direction.
  ///
  /// The parsing behavior itself is supplied by the `Unicode.Parser` and
  /// `_UTFParser` conformances declared in an extension of this type.
  @frozen
  public struct ReverseParser: Sendable {
    // Storage type for code units that have been read but not yet returned.
    public typealias _Buffer = _UIntBuffer<UInt16>

    // Buffered code units.
    public var _buffer: _Buffer

    /// Creates a parser whose buffer is a freshly created `_Buffer()`.
    @inlinable
    public init() { _buffer = _Buffer() }
  }
}

extension UTF16.ReverseParser: Unicode.Parser, _UTFParser {
  public typealias Encoding = Unicode.UTF16

  /// Classifies the buffered code units when the first one is a surrogate.
  ///
  /// - Returns: `(true, 32)` when the buffer holds a trail surrogate in its
  ///   low 16 bits and a lead surrogate in its high 16 bits; otherwise
  ///   `(false, 16)`.
  @inlinable
  public func _parseMultipleCodeUnits() -> (isValid: Bool, bitCount: UInt8) {
    // A lone non-surrogate unit is dealt with before this method is reached.
    _internalInvariant(
      !Encoding._isScalar(UInt16(truncatingIfNeeded: _buffer._storage)))
    let surrogateTags = _buffer._storage & 0xFC00_FC00
    guard _fastPath(surrogateTags == 0xD800_DC00) else {
      return (false, 1*16)
    }
    return (true, 2*16)
  }

  /// Returns the first `bitCount` bits of the buffer as an encoded scalar,
  /// with the two 16-bit halves exchanged.
  ///
  /// - Parameter bitCount: The number of buffered bits that make up the
  ///   scalar.
  @inlinable
  public func _bufferedScalar(bitCount: UInt8) -> Encoding.EncodedScalar {
    // Exchange the halves, then shift the wanted bits down to bit zero.
    let swappedHalves = _buffer._storage &<< 16 | _buffer._storage &>> 16
    return Encoding.EncodedScalar(
      _storage: swappedHalves &>> (32 - bitCount),
      _bitCount: bitCount
    )
  }
}

extension Unicode.UTF16.ForwardParser: Unicode.Parser, _UTFParser {
  public typealias Encoding = Unicode.UTF16

  /// Classifies the buffered code units when the first one is a surrogate.
  ///
  /// - Returns: `(true, 32)` when the buffer holds a lead surrogate in its
  ///   low 16 bits and a trail surrogate in its high 16 bits; otherwise
  ///   `(false, 16)`.
  @inlinable
  public func _parseMultipleCodeUnits() -> (isValid: Bool, bitCount: UInt8) {
    // A lone non-surrogate unit is dealt with before this method is reached.
    _internalInvariant(
      !Encoding._isScalar(UInt16(truncatingIfNeeded: _buffer._storage)))
    let surrogateTags = _buffer._storage & 0xFC00_FC00
    guard _fastPath(surrogateTags == 0xDC00_D800) else {
      return (false, 1*16)
    }
    return (true, 2*16)
  }

  /// Returns a copy of the buffer whose bit count is set to `bitCount`.
  ///
  /// - Parameter bitCount: The number of buffered bits that make up the
  ///   scalar.
  @inlinable
  public func _bufferedScalar(bitCount: UInt8) -> Encoding.EncodedScalar {
    var scalar = _buffer
    scalar._bitCount = bitCount
    return scalar
  }
}

// Outcome of handling one scalar in the scalar (non-vectorized) fallback path
// of the UTF-16 to UTF-8 transcoder.
private enum ScalarFallbackResult: UInt8 {
  // An ill-formed sequence was found and repairing was not requested.
  case invalid
  // An ASCII code unit was copied as a single UTF-8 byte.
  case singleByte
  // A non-ASCII scalar (or a replacement character standing in for an
  // ill-formed sequence) was written as multiple UTF-8 bytes.
  case multiByte
}

// Integer type used to examine several UTF-16 code units at once. arm64_32
// uses 64-bit words explicitly; everywhere else the native `UInt` is used.
#if arch(arm64_32)
private typealias Word = UInt64
#else
private typealias Word = UInt
#endif
// Per-16-bit-lane mask 0xFF80: selects every bit that must be zero for a code
// unit to be ASCII (below 0x80). The 64-bit pattern is truncated when `Word`
// is narrower.
@_transparent private var mask: Word {
  Word(truncatingIfNeeded: 0xFF80FF80_FF80FF80 as UInt64)
}

// Four words loaded together and tested as one block of code units.
private typealias Block = (Word, Word, Word, Word)

// `blockSize` is the number of UTF-16 code units examined per block, and
// `allASCIIBlock(at:)` returns the block narrowed to bytes when every code
// unit in it is ASCII, or `nil` otherwise. Three variants are selected at
// compile time: an 8-unit block, a 16-unit block, and a one-unit fallback
// when vector types are unavailable.
#if SWIFT_STDLIB_ENABLE_VECTOR_TYPES
#if _pointerBitWidth(_32) && !arch(arm64_32)
@_transparent private var blockSize: Int { 8 }
@_transparent
private func allASCIIBlock(at pointer: UnsafePointer<UInt16>) -> SIMD8<UInt8>? {
  // Unaligned load of four words starting at `pointer`.
  let block = unsafe UnsafeRawPointer(pointer).loadUnaligned(as: Block.self)
  // OR the words together so one mask test covers the whole block, then keep
  // the even-indexed bytes of the block.
  // NOTE(review): the even bytes are the low byte of each code unit only on a
  // little-endian layout; confirm that assumption holds for all targets.
  return unsafe ((block.0 | block.1 | block.2 | block.3) & mask == 0)
    ? unsafeBitCast(block, to: SIMD16<UInt8>.self).evenHalf : nil
}
#else
@_transparent private var blockSize: Int { 16 }
@_transparent
private func allASCIIBlock(at pointer: UnsafePointer<UInt16>) -> SIMD16<UInt8>? {
  // Unaligned load of four words starting at `pointer`.
  let block = unsafe UnsafeRawPointer(pointer).loadUnaligned(as: Block.self)
  // OR the words together so one mask test covers the whole block, then keep
  // the even-indexed bytes of the block.
  // NOTE(review): the even bytes are the low byte of each code unit only on a
  // little-endian layout; confirm that assumption holds for all targets.
  return unsafe ((block.0 | block.1 | block.2 | block.3) & mask == 0)
    ? unsafeBitCast(block, to: SIMD32<UInt8>.self).evenHalf : nil
}
#endif
#else
@_transparent private var blockSize: Int { 1 }
@_transparent
private func allASCIIBlock(at pointer: UnsafePointer<UInt16>) -> CollectionOfOne<UInt8>? {
  // Without vector types a "block" is a single code unit.
  let value = unsafe pointer.pointee
  if value & 0xFF80 == 0 {
    return CollectionOfOne(UInt8(truncatingIfNeeded: value))
  }
  return nil
}
#endif

// Largest scalar value that fits in a two-byte UTF-8 sequence (11 bits).
@_transparent private var utf8TwoByteMax: UInt32 { 0x7FF }
// Smallest lead (high) surrogate code unit.
@_transparent private var utf16LeadSurrogateMin: UInt32 { 0xD800 }
// Smallest trail (low) surrogate code unit.
@_transparent private var utf16TrailSurrogateMin: UInt32 { 0xDC00 }
// U+FFFD, written in place of an ill-formed sequence when repairing.
@_transparent private var utf16ReplacementCharacter: UInt32 { 0xFFFD }
// Largest Unicode scalar value.
@_transparent private var utf16ScalarMax: UInt32 { 0x10FFFF }
// Largest scalar value representable by a single UTF-16 code unit.
@_transparent private var utf16BasicMultilingualPlaneMax: UInt32 { 0xFFFF }
// Smallest scalar value that requires a surrogate pair.
@_transparent private var utf16AstralPlaneMin: UInt32 { 0x10000 }

/*
 This is expressible in a more concise way using the other transcoding
 primitives in the stdlib, but at least as of July 2025 doing that makes
 processing runs of non-ASCII several times slower.
 */
// Writes the UTF-8 encoding of the non-ASCII `scalar` at `output` and advances
// `output` past the bytes written (2, 3, or 4). The caller must guarantee
// that `scalar` is in 0x80...0x10FFFF and that `output` has room.
@inline(__always)
private func encodeScalarAsUTF8(
  _ scalar: UInt32,
  output: inout UnsafeMutablePointer<Unicode.UTF8.CodeUnit>
) {
  _debugPrecondition(scalar >= 0x80)
  _debugPrecondition(scalar <= utf16ScalarMax)

  // Every continuation byte is 0b10 followed by six payload bits.
  let continuationTag: UInt8 = 0b1000_0000
  let sixBits: UInt32 = 0b11_1111

  if scalar <= utf8TwoByteMax {
    // 11 bits: 0b110[top 5] 0b10[bottom 6]
    unsafe output[0] = 0b1100_0000 | UInt8((scalar >> 6) & 0b01_1111)
    unsafe output[1] = continuationTag | UInt8(scalar & sixBits)
    unsafe output += 2
    return
  }
  if scalar <= utf16BasicMultilingualPlaneMax {
    // 16 bits: 0b1110[top 4] 0b10[middle 6] 0b10[bottom 6]
    unsafe output[0] = 0b1110_0000 | UInt8((scalar >> 12) & 0b1111)
    unsafe output[1] = continuationTag | UInt8((scalar >> 6) & sixBits)
    unsafe output[2] = continuationTag | UInt8(scalar & sixBits)
    unsafe output += 3
    return
  }
  if scalar <= utf16ScalarMax {
    // 21 bits: 0b11110[top 3] 0b10[upper 6] 0b10[lower 6] 0b10[bottom 6]
    unsafe output[0] = 0b1111_0000 | UInt8((scalar >> 18) & 0b0111)
    unsafe output[1] = continuationTag | UInt8((scalar >> 12) & sixBits)
    unsafe output[2] = continuationTag | UInt8((scalar >> 6) & sixBits)
    unsafe output[3] = continuationTag | UInt8(scalar & sixBits)
    unsafe output += 4
    return
  }
  // Values above the maximum scalar are excluded by the precondition.
  Builtin.unreachable()
}

// Decodes the scalar that starts with the non-ASCII code unit `cu` (already
// read from `input`), writes its UTF-8 form to `output`, and advances both
// pointers. An unpaired surrogate is replaced by U+FFFD when `repairing` is
// true; otherwise `.invalid` is returned, with `input` already moved past the
// offending code unit and nothing written.
@inline(__always)
private func processNonASCIIScalarFallback(
  _ cu: UInt16,
  input: inout UnsafePointer<UInt16>,
  inputEnd: UnsafePointer<UInt16>,
  output: inout UnsafeMutablePointer<Unicode.UTF8.CodeUnit>,
  repairing: Bool
) -> (ScalarFallbackResult, repairsMade: Bool) {
  var scalar: UInt32 = 0
  var isWellFormed = true
  // Number of code units this scalar occupies in the input.
  var consumed = 1

  if _slowPath(UTF16.isLeadSurrogate(cu)) {
    let trailPosition = unsafe input + 1
    if unsafe trailPosition < inputEnd {
      let next = unsafe trailPosition.pointee
      if UTF16.isTrailSurrogate(next) {
        // A supplementary scalar is U+10000 plus a 20-bit offset: the lead
        // surrogate supplies the high ten bits, the trail the low ten.
        let highBits = (UInt32(cu) - utf16LeadSurrogateMin) << 10
        let lowBits = UInt32(next) - utf16TrailSurrogateMin
        scalar = utf16AstralPlaneMin + highBits + lowBits
        consumed = 2
      } else {
        // Lead surrogate followed by something other than a trail surrogate.
        isWellFormed = false
      }
    } else {
      // Lead surrogate is the last code unit of the buffer.
      isWellFormed = false
    }
  } else if _slowPath(UTF16.isTrailSurrogate(cu)) {
    // Trail surrogate with no preceding lead surrogate.
    isWellFormed = false
  } else {
    scalar = UInt32(cu)
  }
  unsafe input += consumed

  if _slowPath(!isWellFormed || scalar > utf16ScalarMax) {
    if !repairing {
      return (.invalid, repairsMade: false)
    }
    unsafe encodeScalarAsUTF8(utf16ReplacementCharacter, output: &output)
    return (.multiByte, repairsMade: true)
  }
  unsafe encodeScalarAsUTF8(scalar, output: &output)
  return (.multiByte, repairsMade: false)
}

// Transcodes the single scalar at `input` to UTF-8 at `output`, advancing
// both pointers. ASCII is copied directly; everything else is delegated to
// `processNonASCIIScalarFallback`.
@inline(__always)
private func processScalarFallback(
  input: inout UnsafePointer<Unicode.UTF16.CodeUnit>,
  inputEnd: UnsafePointer<Unicode.UTF16.CodeUnit>,
  output: inout UnsafeMutablePointer<Unicode.UTF8.CodeUnit>,
  repairing: Bool
) -> (ScalarFallbackResult, repairsMade: Bool) {
  let cu = unsafe input.pointee
  guard Unicode.UTF16.isASCII(cu) else {
    // Non-ASCII: take the general scalar path for this code unit.
    return unsafe processNonASCIIScalarFallback(
      cu,
      input: &input,
      inputEnd: inputEnd,
      output: &output,
      repairing: repairing
    )
  }
  // ASCII: one code unit in, one byte out.
  unsafe output.initialize(to: UInt8(truncatingIfNeeded: cu))
  unsafe input += 1
  unsafe output += 1
  return (.singleByte, repairsMade: false)
}

// Transcodes one block's worth of code units scalar by scalar. Returns
// `false` in the first tuple element as soon as an ill-formed sequence is
// reported as `.invalid`; `repairsMade` reports whether any replacement
// character was written during this call.
private func processNonASCIIChunk(
  input: inout UnsafePointer<UInt16>,
  inputEnd: UnsafePointer<UInt16>,
  output: inout UnsafeMutablePointer<UInt8>,
  repairing: Bool
) -> (Bool, repairsMade: Bool) {
  var anyRepairs = false
  // The loop is limited by position rather than by a fixed number of
  // iterations, because one step may consume two code units (a surrogate
  // pair) and a count-based loop would then run on into the next block.
  let chunkEnd = unsafe input + blockSize
  while unsafe input < chunkEnd {
    let (kind, repairedHere) = unsafe processScalarFallback(
      input: &input,
      inputEnd: inputEnd,
      output: &output,
      repairing: repairing
    )
    switch kind {
    case .invalid:
      return (false, repairsMade: anyRepairs || repairedHere)
    case .multiByte:
      anyRepairs = anyRepairs || repairedHere
    case .singleByte:
      break
    }
  }
  return (true, repairsMade: anyRepairs)
}

/*
 This is only ever called after validating the buffer size with
 utf8Length(of:repairing:), so it does not check for end of buffer. Don't call
 it if you haven't done that first!
 */
// Transcodes `UTF16CodeUnits` into `outputBuffer` and returns the number of
// UTF-8 bytes written together with a flag saying whether any ill-formed
// sequence was replaced. When `repairing` is false, transcoding stops at the
// first ill-formed sequence and the count written so far is returned.
internal func transcodeUTF16ToUTF8(
  UTF16CodeUnits: UnsafeBufferPointer<Unicode.UTF16.CodeUnit>,
  intoKnownSufficientlyLarge outputBuffer: UnsafeMutableBufferPointer<Unicode.UTF8.CodeUnit>,
  repairing: Bool = true
) -> (Int, repairsMade: Bool) {
  if UTF16CodeUnits.isEmpty || outputBuffer.isEmpty {
    return (0, repairsMade: false)
  }
  var input = unsafe UTF16CodeUnits.baseAddress.unsafelyUnwrapped
  let inputEnd = unsafe input + UTF16CodeUnits.count
  let outputStart = unsafe outputBuffer.baseAddress.unsafelyUnwrapped
  var output = unsafe outputStart
  var anyRepairs = false

  // Whole blocks first: an all-ASCII block is narrowed in one step, any other
  // block is handled scalar by scalar.
  while unsafe (inputEnd - input) >= blockSize {
    guard let asciiBlock = unsafe allASCIIBlock(at: input) else {
      let (finishedChunk, repairedInChunk) = unsafe processNonASCIIChunk(
        input: &input,
        inputEnd: inputEnd,
        output: &output,
        repairing: repairing
      )
      anyRepairs = anyRepairs || repairedInChunk
      if !finishedChunk {
        return unsafe (output - outputStart, repairsMade: anyRepairs)
      }
      continue
    }
    _onFastPath()
    for lane in 0 ..< blockSize {
      unsafe (output + lane).initialize(to: asciiBlock[lane])
    }
    unsafe input += blockSize
    unsafe output += blockSize
  }

  // Fewer than `blockSize` code units remain: finish them one scalar at a
  // time.
  while unsafe input < inputEnd {
    let (kind, repairedHere) = unsafe processScalarFallback(
      input: &input,
      inputEnd: inputEnd,
      output: &output,
      repairing: repairing
    )
    if kind == .invalid {
      return unsafe (output - outputStart, repairsMade: anyRepairs || repairedHere)
    }
    anyRepairs = anyRepairs || repairedHere
  }
  return unsafe (output - outputStart, repairsMade: anyRepairs)
}

// Counts the UTF-8 bytes needed for the code units in `input ..< end`,
// advancing `input` as it goes. Returns `nil` (leaving `input` at the
// offending code unit) when an unpaired surrogate is found and `repairing` is
// false.
@inline(__always)
private func utf8Length(
  input: inout UnsafePointer<Unicode.UTF16.CodeUnit>,
  end: UnsafePointer<Unicode.UTF16.CodeUnit>,
  inputEnd: UnsafePointer<Unicode.UTF16.CodeUnit>,
  repairing: Bool
) -> Int? {
  // `end` bounds this call, while `inputEnd` is where the underlying buffer
  // really stops. When a caller walks a large buffer block by block the two
  // differ, and a lead surrogate in the last slot before `end` may be paired
  // with a trail surrogate located at `end`. Such a pair is consumed whole,
  // which leaves `input` one position beyond `end`.
  var byteCount = 0
  while unsafe input < end {
    let unit = unsafe input.pointee
    let isLead = UTF16.isLeadSurrogate(unit)

    if !isLead && !UTF16.isTrailSurrogate(unit) {
      // Non-surrogate code unit: 1, 2, or 3 bytes depending on magnitude.
      byteCount &+= unit < 0x80 ? 1 : (unit < 0x800 ? 2 : 3)
      unsafe input += 1
      continue
    }

    if isLead {
      // The trail surrogate only has to be inside the buffer, not inside
      // this call's range.
      let trailPosition = unsafe input + 1
      if unsafe trailPosition < inputEnd
          && UTF16.isTrailSurrogate(trailPosition.pointee) {
        byteCount &+= 4
        unsafe input += 2
        continue
      }
    }

    // Unpaired lead or trail surrogate.
    guard repairing else { return nil }
    byteCount &+= 3 // U+FFFD replacement character
    unsafe input += 1
  }
  return byteCount
}

// Returns the number of UTF-8 bytes needed to transcode `UTF16CodeUnits`,
// plus whether the input consists solely of ASCII. Returns `nil` when an
// unpaired surrogate is found and `repairing` is false.
internal func utf8Length(
  of UTF16CodeUnits: UnsafeBufferPointer<Unicode.UTF16.CodeUnit>,
  repairing: Bool = true
) -> (Int, isASCII: Bool)? {
  let unitCount = UTF16CodeUnits.count
  if unitCount <= 0 {
    return (0, isASCII: true)
  }
  var cursor = unsafe UTF16CodeUnits.baseAddress.unsafelyUnwrapped
  let bufferEnd = unsafe cursor + unitCount
  var byteCount = 0

  // UTF-8 bytes contributed by each UTF-16 code unit:
  //   U+0000..U+007F  → 1 byte   (ASCII)
  //   U+0080..U+07FF  → 2 bytes
  //   U+0800..U+D7FF  → 3 bytes  (BMP, non-surrogate)
  //   U+D800..U+DBFF  → high surrogate (4 bytes for the whole pair)
  //   U+DC00..U+DFFF  → low surrogate  (counted with its high surrogate)
  //   U+E000..U+FFFF  → 3 bytes  (BMP, non-surrogate)

  // Whole blocks first: an all-ASCII block adds exactly `blockSize` bytes.
  while unsafe (bufferEnd - cursor) >= blockSize {
    if unsafe allASCIIBlock(at: cursor) != nil {
      _onFastPath()
      unsafe cursor += blockSize
      byteCount += blockSize
      continue
    }
    let blockEnd = unsafe Swift.min(cursor + blockSize, bufferEnd)
    guard let blockBytes = unsafe utf8Length(
      input: &cursor,
      end: blockEnd,
      inputEnd: bufferEnd,
      repairing: repairing
    ) else {
      return nil
    }
    byteCount &+= blockBytes
  }

  // Whatever is left is shorter than one block; count it unit by unit.
  guard let tailBytes = unsafe utf8Length(
    input: &cursor,
    end: bufferEnd,
    inputEnd: bufferEnd,
    repairing: repairing
  ) else {
    return nil
  }
  byteCount &+= tailBytes
  // Only ASCII maps one code unit to one byte, so equal counts mean the whole
  // input was ASCII.
  return (byteCount, isASCII: byteCount == unitCount)
}