swift-mirror/stdlib/public/core/UTF16.swift

//===--- UTF16.swift ------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
extension Unicode {
  @frozen
  public enum UTF16: Sendable {
  case _swift3Buffer(Unicode.UTF16.ForwardParser)
  }
}

extension Unicode.UTF16 {
  /// Returns the number of code units required to encode the given Unicode
  /// scalar.
  ///
  /// Because a Unicode scalar value can require up to 21 bits to store its
  /// value, some Unicode scalars are represented in UTF-16 by a pair of
  /// 16-bit code units. The first and second code units of the pair,
  /// designated *leading* and *trailing* surrogates, make up a *surrogate
  /// pair*.
  ///
  ///     let anA: Unicode.Scalar = "A"
  ///     print(anA.value)
  ///     // Prints "65"
  ///     print(UTF16.width(anA))
  ///     // Prints "1"
  ///
  ///     let anApple: Unicode.Scalar = "🍎"
  ///     print(anApple.value)
  ///     // Prints "127822"
  ///     print(UTF16.width(anApple))
  ///     // Prints "2"
  ///
  /// - Parameter x: A Unicode scalar value.
  /// - Returns: The width of `x` when encoded in UTF-16, either `1` or `2`.
  @inlinable
  public static func width(_ x: Unicode.Scalar) -> Int {
    return x.value <= UInt16.max ? 1 : 2
  }

  /// Returns the high-surrogate code unit of the surrogate pair representing
  /// the specified Unicode scalar.
  ///
  /// Because a Unicode scalar value can require up to 21 bits to store its
  /// value, some Unicode scalars are represented in UTF-16 by a pair of
  /// 16-bit code units. The first and second code units of the pair,
  /// designated *leading* and *trailing* surrogates, make up a *surrogate
  /// pair*.
  ///
  ///     let apple: Unicode.Scalar = "🍎"
  ///     print(UTF16.leadSurrogate(apple))
  ///     // Prints "55356"
  ///
  /// - Parameter x: A Unicode scalar value. `x` must be represented by a
  ///   surrogate pair when encoded in UTF-16. To check whether `x` is
  ///   represented by a surrogate pair, use `UTF16.width(x) == 2`.
  /// - Returns: The leading surrogate code unit of `x` when encoded in UTF-16.
  @inlinable
  public static func leadSurrogate(_ x: Unicode.Scalar) -> UTF16.CodeUnit {
    _precondition(width(x) == 2)
    return 0xD800 + UTF16.CodeUnit(truncatingIfNeeded:
      (x.value - 0x1_0000) &>> (10 as UInt32))
  }

  /// Returns the low-surrogate code unit of the surrogate pair representing
  /// the specified Unicode scalar.
  ///
  /// Because a Unicode scalar value can require up to 21 bits to store its
  /// value, some Unicode scalars are represented in UTF-16 by a pair of
  /// 16-bit code units. The first and second code units of the pair,
  /// designated *leading* and *trailing* surrogates, make up a *surrogate
  /// pair*.
  ///
  ///     let apple: Unicode.Scalar = "🍎"
  ///     print(UTF16.trailSurrogate(apple))
  ///     // Prints "57166"
  ///
  /// - Parameter x: A Unicode scalar value. `x` must be represented by a
  ///   surrogate pair when encoded in UTF-16. To check whether `x` is
  ///   represented by a surrogate pair, use `UTF16.width(x) == 2`.
  /// - Returns: The trailing surrogate code unit of `x` when encoded in UTF-16.
  @inlinable
  public static func trailSurrogate(_ x: Unicode.Scalar) -> UTF16.CodeUnit {
    _precondition(width(x) == 2)
    return 0xDC00 + UTF16.CodeUnit(truncatingIfNeeded:
      (x.value - 0x1_0000) & (((1 as UInt32) &<< 10) - 1))
  }

  /// Returns a Boolean value indicating whether the specified code unit is a
  /// high-surrogate code unit.
  ///
  /// Here's an example of checking whether each code unit in a string's
  /// `utf16` view is a lead surrogate. The `apple` string contains a single
  /// emoji character made up of a surrogate pair when encoded in UTF-16.
  ///
  ///     let apple = "🍎"
  ///     for unit in apple.utf16 {
  ///         print(UTF16.isLeadSurrogate(unit))
  ///     }
  ///     // Prints "true"
  ///     // Prints "false"
  ///
  /// This method does not validate the encoding of a UTF-16 sequence beyond
  /// the specified code unit. Specifically, it does not validate that a
  /// low-surrogate code unit follows `x`.
  ///
  /// - Parameter x: A UTF-16 code unit.
  /// - Returns: `true` if `x` is a high-surrogate code unit; otherwise,
  ///   `false`.
  @inlinable
  public static func isLeadSurrogate(_ x: CodeUnit) -> Bool {
    return (x & 0xFC00) == 0xD800
  }

  /// Returns a Boolean value indicating whether the specified code unit is a
  /// low-surrogate code unit.
  ///
  /// Here's an example of checking whether each code unit in a string's
  /// `utf16` view is a trailing surrogate. The `apple` string contains a
  /// single emoji character made up of a surrogate pair when encoded in
  /// UTF-16.
  ///
  ///     let apple = "🍎"
  ///     for unit in apple.utf16 {
  ///         print(UTF16.isTrailSurrogate(unit))
  ///     }
  ///     // Prints "false"
  ///     // Prints "true"
  ///
  /// This method does not validate the encoding of a UTF-16 sequence beyond
  /// the specified code unit. Specifically, it does not validate that a
  /// high-surrogate code unit precedes `x`.
  ///
  /// - Parameter x: A UTF-16 code unit.
  /// - Returns: `true` if `x` is a low-surrogate code unit; otherwise,
  ///   `false`.
  @inlinable
  public static func isTrailSurrogate(_ x: CodeUnit) -> Bool {
    return (x & 0xFC00) == 0xDC00
  }

  /// Returns a Boolean value indicating whether the specified code unit is a
  /// high or low surrogate code unit.
  @_alwaysEmitIntoClient
  public static func isSurrogate(_ x: CodeUnit) -> Bool {
    return isLeadSurrogate(x) || isTrailSurrogate(x)
  }

  @inlinable
  public // @testable
  static func _copy<T: _StringElement, U: _StringElement>(
    source: UnsafeMutablePointer<T>,
    destination: UnsafeMutablePointer<U>,
    count: Int
  ) {
    if MemoryLayout<T>.stride == MemoryLayout<U>.stride {
      _memcpy(
        dest: UnsafeMutablePointer(destination),
        src: UnsafeMutablePointer(source),
        size: UInt(count) * UInt(MemoryLayout<U>.stride))
    }
    else {
      for i in 0..<count {
        let u16 = T._toUTF16CodeUnit((source + i).pointee)
        (destination + i).pointee = U._fromUTF16CodeUnit(u16)
      }
    }
  }

  /// Returns the number of UTF-16 code units required for the given code unit
  /// sequence when transcoded to UTF-16, and a Boolean value indicating
  /// whether the sequence was found to contain only ASCII characters.
  ///
  /// The following example finds the length of the UTF-16 encoding of the
  /// string `"Fermata 𝄐"`, starting with its UTF-8 representation.
  ///
  ///     let fermata = "Fermata 𝄐"
  ///     let bytes = fermata.utf8
  ///     print(Array(bytes))
  ///     // Prints "[70, 101, 114, 109, 97, 116, 97, 32, 240, 157, 132, 144]"
  ///
  ///     let result = UTF16.transcodedLength(of: bytes.makeIterator(),
  ///                                         decodedAs: UTF8.self,
  ///                                         repairingIllFormedSequences: false)
  ///     print(result)
  ///     // Prints "Optional((count: 10, isASCII: false))"
  ///
  /// - Parameters:
  ///   - input: An iterator of code units to be translated, encoded as
  ///     `sourceEncoding`. If `repairingIllFormedSequences` is `true`, the
  ///     entire iterator will be exhausted. Otherwise, iteration will stop if
  ///     an ill-formed sequence is detected.
  ///   - sourceEncoding: The Unicode encoding of `input`.
  ///   - repairingIllFormedSequences: Pass `true` to measure the length of
  ///     `input` even when `input` contains ill-formed sequences. Each
  ///     ill-formed sequence is replaced with a Unicode replacement character
  ///     (`"\u{FFFD}"`) and is measured as such. Pass `false` to immediately
  ///     stop measuring `input` when an ill-formed sequence is encountered.
  /// - Returns: A tuple containing the number of UTF-16 code units required to
  ///   encode `input` and a Boolean value that indicates whether the `input`
  ///   contained only ASCII characters. If `repairingIllFormedSequences` is
  ///   `false` and an ill-formed sequence is detected, this method returns
  ///   `nil`.
  @inlinable
  public static func transcodedLength<
    Input: IteratorProtocol,
    Encoding: Unicode.Encoding
  >(
    of input: Input,
    decodedAs sourceEncoding: Encoding.Type,
    repairingIllFormedSequences: Bool
  ) -> (count: Int, isASCII: Bool)?
    where Encoding.CodeUnit == Input.Element {

    var utf16Count = 0
    var i = input
    var d = Encoding.ForwardParser()

    // Fast path for ASCII in a UTF8 buffer
    if sourceEncoding == Unicode.UTF8.self {
      var peek: Encoding.CodeUnit = 0
      while let u = i.next() {
        peek = u
        guard _fastPath(peek < 0x80) else { break }
        utf16Count = utf16Count + 1
      }
      if _fastPath(peek < 0x80) { return (utf16Count, true) }

      var d1 = UTF8.ForwardParser()
      d1._buffer.append(numericCast(peek))
      d = _identityCast(d1, to: Encoding.ForwardParser.self)
    }

    var utf16BitUnion: CodeUnit = 0
    while true {
      let s = d.parseScalar(from: &i)
      if _fastPath(s._valid != nil), let scalarContent = s._valid {
        let utf16 = transcode(scalarContent, from: sourceEncoding)
          ._unsafelyUnwrappedUnchecked
        utf16Count += utf16.count
        for x in utf16 { utf16BitUnion |= x }
      }
      else if let _ = s._error {
        guard _fastPath(repairingIllFormedSequences) else { return nil }
        utf16Count += 1
        utf16BitUnion |= UTF16._replacementCodeUnit
      }
      else {
        return (utf16Count, utf16BitUnion < 0x80)
      }
    }
  }
}

extension Unicode.UTF16: Unicode.Encoding {
  public typealias CodeUnit = UInt16
  public typealias EncodedScalar = _UIntBuffer<UInt16>

  @inlinable
  internal static var _replacementCodeUnit: CodeUnit {
    @inline(__always) get { return 0xfffd }
  }

  @inlinable
  public static var encodedReplacementCharacter: EncodedScalar {
    return EncodedScalar(_storage: 0xFFFD, _bitCount: 16)
  }

  /// Returns whether the given code unit represents an ASCII scalar
  @_alwaysEmitIntoClient
  public static func isASCII(_ x: CodeUnit) -> Bool {
    return x <= 0x7f
  }

  @inlinable
  public static func _isScalar(_ x: CodeUnit) -> Bool {
    return x & 0xf800 != 0xd800
  }

  @inlinable
  @inline(__always)
  internal static func _decodeSurrogates(
    _ lead: CodeUnit,
    _ trail: CodeUnit
  ) -> Unicode.Scalar {
    _internalInvariant(isLeadSurrogate(lead))
    _internalInvariant(isTrailSurrogate(trail))
    return Unicode.Scalar(
      _unchecked: 0x10000 +
        (UInt32(lead & 0x03ff) &<< 10 | UInt32(trail & 0x03ff)))
  }

  @inlinable
  public static func decode(_ source: EncodedScalar) -> Unicode.Scalar {
    let bits = source._storage
    if _fastPath(source._bitCount == 16) {
      return Unicode.Scalar(_unchecked: bits & 0xffff)
    }
    _internalInvariant(source._bitCount == 32)
    let lower: UInt32 = bits >> 16 & 0x03ff
    let upper: UInt32 = (bits & 0x03ff) << 10
    let value = 0x10000 + (lower | upper)
    return Unicode.Scalar(_unchecked: value)
  }

  @inlinable
  public static func encode(
    _ source: Unicode.Scalar
  ) -> EncodedScalar? {
    let x = source.value
    if _fastPath(x < ((1 as UInt32) << 16)) {
      return EncodedScalar(_storage: x, _bitCount: 16)
    }
    let x1 = x - ((1 as UInt32) << 16)
    var r = (0xdc00 + (x1 & 0x3ff))
    r &<<= 16
    r |= (0xd800 + (x1 &>> 10 & 0x3ff))
    return EncodedScalar(_storage: r, _bitCount: 32)
  }

  @inlinable
  @inline(__always)
  public static func transcode<FromEncoding: Unicode.Encoding>(
    _ content: FromEncoding.EncodedScalar, from _: FromEncoding.Type
  ) -> EncodedScalar? {
    if _fastPath(FromEncoding.self == UTF8.self) {
      let c = _identityCast(content, to: UTF8.EncodedScalar.self)
      var b = c.count
      b = b &- 1
      if _fastPath(b == 0) {
        return EncodedScalar(
          _storage: (c._biasedBits &- 0x1) & 0b0__111_1111, _bitCount: 16)
      }
      var s = c._biasedBits &- 0x01010101
      var r = s
      r &<<= 6
      s &>>= 8
      r |= s & 0b0__11_1111
      b = b &- 1

      if _fastPath(b == 0) {
        return EncodedScalar(_storage: r & 0b0__111_1111_1111, _bitCount: 16)
      }
      r &<<= 6
      s &>>= 8
      r |= s & 0b0__11_1111
      b = b &- 1

      if _fastPath(b == 0) {
        return EncodedScalar(_storage: r & 0xFFFF, _bitCount: 16)
      }

      r &<<= 6
      s &>>= 8
      r |= s & 0b0__11_1111
      r &= (1 &<< 21) - 1
      return encode(Unicode.Scalar(_unchecked: r))
    }
    else if _fastPath(FromEncoding.self == UTF16.self) {
      return unsafeBitCast(content, to: UTF16.EncodedScalar.self)
    }
    return encode(FromEncoding.decode(content))
  }

  @frozen
  public struct ForwardParser: Sendable {
    public typealias _Buffer = _UIntBuffer<UInt16>

    public var _buffer: _Buffer

    @inlinable
    public init() { _buffer = _Buffer() }
  }

  @frozen
  public struct ReverseParser: Sendable {
    public typealias _Buffer = _UIntBuffer<UInt16>

    public var _buffer: _Buffer

    @inlinable
    public init() { _buffer = _Buffer() }
  }
}

extension UTF16.ReverseParser: Unicode.Parser, _UTFParser {
  public typealias Encoding = Unicode.UTF16

  @inlinable
  public func _parseMultipleCodeUnits() -> (isValid: Bool, bitCount: UInt8) {
    _internalInvariant(  // this case handled elsewhere
      !Encoding._isScalar(UInt16(truncatingIfNeeded: _buffer._storage)))
    if _fastPath(_buffer._storage & 0xFC00_FC00 == 0xD800_DC00) {
      return (true, 2*16)
    }
    return (false, 1*16)
  }

  @inlinable
  public func _bufferedScalar(bitCount: UInt8) -> Encoding.EncodedScalar {
    return Encoding.EncodedScalar(
      _storage:
        (_buffer._storage &<< 16 | _buffer._storage &>> 16) &>> (32 - bitCount),
      _bitCount: bitCount
    )
  }
}

extension Unicode.UTF16.ForwardParser: Unicode.Parser, _UTFParser {
  public typealias Encoding = Unicode.UTF16

  @inlinable
  public func _parseMultipleCodeUnits() -> (isValid: Bool, bitCount: UInt8) {
    _internalInvariant(  // this case handled elsewhere
      !Encoding._isScalar(UInt16(truncatingIfNeeded: _buffer._storage)))
    if _fastPath(_buffer._storage & 0xFC00_FC00 == 0xDC00_D800) {
      return (true, 2*16)
    }
    return (false, 1*16)
  }

  @inlinable
  public func _bufferedScalar(bitCount: UInt8) -> Encoding.EncodedScalar {
    var r = _buffer
    r._bitCount = bitCount
    return r
  }
}