swift-mirror/stdlib/public/core/UTF8EncodingError.swift

extension Unicode.UTF8 {
  /**

   The kind and location of a UTF-8 encoding error.

   Valid UTF-8 is represented by this table:

   ```
   ╔════════════════════╦════════╦════════╦════════╦════════╗
   ║    Scalar value    ║ Byte 0 ║ Byte 1 ║ Byte 2 ║ Byte 3 ║
   ╠════════════════════╬════════╬════════╬════════╬════════╣
   ║ U+0000..U+007F     ║ 00..7F ║        ║        ║        ║
   ║ U+0080..U+07FF     ║ C2..DF ║ 80..BF ║        ║        ║
   ║ U+0800..U+0FFF     ║ E0     ║ A0..BF ║ 80..BF ║        ║
   ║ U+1000..U+CFFF     ║ E1..EC ║ 80..BF ║ 80..BF ║        ║
   ║ U+D000..U+D7FF     ║ ED     ║ 80..9F ║ 80..BF ║        ║
   ║ U+E000..U+FFFF     ║ EE..EF ║ 80..BF ║ 80..BF ║        ║
   ║ U+10000..U+3FFFF   ║ F0     ║ 90..BF ║ 80..BF ║ 80..BF ║
   ║ U+40000..U+FFFFF   ║ F1..F3 ║ 80..BF ║ 80..BF ║ 80..BF ║
   ║ U+100000..U+10FFFF ║ F4     ║ 80..8F ║ 80..BF ║ 80..BF ║
   ╚════════════════════╩════════╩════════╩════════╩════════╝
   ```

   ### Classifying errors

   An *unexpected continuation* is when a continuation byte (`10xxxxxx`) occurs
   in a position that should be the start of a new scalar value. Unexpected
   continuations can often occur when the input contains arbitrary data
   instead of textual content. An unexpected continuation at the start of
   input might mean that the input was not correctly sliced along scalar
   boundaries or that it does not contain UTF-8.

   A *truncated scalar* is a multi-byte sequence that is the start of a valid
   multi-byte scalar but is cut off before ending correctly. A truncated
   scalar at the end of the input might mean that only part of the entire
   input was received.

   A *surrogate code point* (`U+D800..U+DFFF`) is invalid UTF-8. Surrogate
   code points are used by UTF-16 to encode scalars in the supplementary
   planes. Their presence may mean the input was encoded in a different 8-bit
   encoding, such as CESU-8, WTF-8, or Java's Modified UTF-8.

   An *invalid non-surrogate code point* is any code point higher than
   `U+10FFFF`. This can often occur when the input is arbitrary data instead
   of textual content.

   An *overlong encoding* occurs when a scalar value that could have been
   encoded using fewer bytes is encoded in a longer byte sequence. Overlong
   encodings are invalid UTF-8 and can lead to security issues if not
   correctly detected:

   - https://nvd.nist.gov/vuln/detail/CVE-2008-2938
   - https://nvd.nist.gov/vuln/detail/CVE-2000-0884

   An overlong encoding of `NUL`, `0xC0 0x80`, is used in Java's Modified
   UTF-8 but is invalid UTF-8. Overlong encoding errors often catch attempts
   to bypass security measures.

   ### Reporting the range of the error

   The range of the error reported follows the *Maximal subpart of an
   ill-formed subsequence* algorithm in which each error is either one byte
   long or ends before the first byte that is disallowed. See "U+FFFD
   Substitution of Maximal Subparts" in the Unicode Standard. Unicode started
   recommending this algorithm in version 6 and is adopted by the W3C.

   The maximal subpart algorithm will produce a single multi-byte range for a
   truncated scalar (a multi-byte sequence that is the start of a valid
   multi-byte scalar but is cut off before ending correctly). For all other
   errors (including overlong encodings, surrogates, and invalid code
   points), it will produce an error per byte.

   // FIXME: without a checkAllErrors, we don't have these classification distinctions, should we drop it, ensure we will do it, or what?

   Since overlong encodings, surrogates, and invalid code points are erroneous
   by the second byte (at the latest), the above definition produces the same
   ranges as defining such a sequence as a truncated scalar error followed by
   unexpected continuation byte errors. The more semantically-rich
   classification is reported.

   For example, a surrogate count point sequence `ED A0 80` will be reported
   as three `.surrogateCodePointByte` errors rather than a `.truncatedScalar`
   followed by two `.unexpectedContinuationByte` errors.

   Other commonly reported error ranges can be constructed from this result.
   For example, PEP 383's error-per-byte can be constructed by mapping over
   the reported range. Similarly, constructing a single error for the longest
   invalid byte range can be constructed by joining adjacent error ranges.

   ```
   ╔═════════════════╦══════╦═════╦═════╦═════╦═════╦═════╦═════╦══════╗
   ║                 ║  61  ║ F1  ║ 80  ║ 80  ║ E1  ║ 80  ║ C2  ║  62  ║
   ╠═════════════════╬══════╬═════╬═════╬═════╬═════╬═════╬═════╬══════╣
   ║ Longest range   ║ U+61 ║ err ║     ║     ║     ║     ║     ║ U+62 ║
   ║ Maximal subpart ║ U+61 ║ err ║     ║     ║ err ║     ║ err ║ U+62 ║
   ║ Error per byte  ║ U+61 ║ err ║ err ║ err ║ err ║ err ║ err ║ U+62 ║
   ╚═════════════════╩══════╩═════╩═════╩═════╩═════╩═════╩═════╩══════╝
   ```

   */
  @available(SwiftStdlib 6.2, *)
  @frozen
  public struct ValidationError: Error, Sendable, Hashable
  {
    /// The kind of encoding error
    public var kind: Unicode.UTF8.ValidationError.Kind

    /// The range of offsets into our input containing the error
    public var byteOffsets: Range<Int>

    @_alwaysEmitIntoClient
    public init(
      _ kind: Unicode.UTF8.ValidationError.Kind,
      _ byteOffsets: Range<Int>
    ) {
      _precondition(byteOffsets.lowerBound >= 0)
      if kind == .truncatedScalar {
        _precondition(!byteOffsets.isEmpty)
        _precondition(byteOffsets.count < 4)
      } else {
        _precondition(byteOffsets.count == 1)
      }

      self.kind = kind
      self.byteOffsets = byteOffsets
    }

    @_alwaysEmitIntoClient
    public init(
      _ kind: Unicode.UTF8.ValidationError.Kind, at byteOffset: Int
    ) {
      self.init(kind, byteOffset..<(byteOffset+1))
    }
  }
}


@available(SwiftStdlib 6.2, *)
extension UTF8.ValidationError {
  /// The kind of encoding error encountered during validation
  @frozen
  public struct Kind: Error, Sendable, Hashable, RawRepresentable
   {
    public var rawValue: UInt8

    @inlinable
    public init?(rawValue: UInt8) {
      guard rawValue <= 4 else { return nil }
      self.rawValue = rawValue
    }

    /// A continuation byte (`10xxxxxx`) outside of a multi-byte sequence
    @_alwaysEmitIntoClient
    public static var unexpectedContinuationByte: Self {
      .init(rawValue: 0)!
    }

    /// A byte in a surrogate code point (`U+D800..U+DFFF`) sequence
    @_alwaysEmitIntoClient
    public static var surrogateCodePointByte: Self {
      .init(rawValue: 1)!
    }

    /// A byte in an invalid, non-surrogate code point (`>U+10FFFF`) sequence
    @_alwaysEmitIntoClient
    public static var invalidNonSurrogateCodePointByte: Self {
      .init(rawValue: 2)!
    }

    /// A byte in an overlong encoding sequence
    @_alwaysEmitIntoClient
    public static var overlongEncodingByte: Self {
      .init(rawValue: 3)!
    }

    /// A multi-byte sequence that is the start of a valid multi-byte scalar
    /// but is cut off before ending correctly
    @_alwaysEmitIntoClient
    public static var truncatedScalar: Self {
      .init(rawValue: 4)!
    }
  }
}

@_unavailableInEmbedded
@available(SwiftStdlib 6.2, *)
extension UTF8.ValidationError.Kind: CustomStringConvertible {
  public var description: String {
    switch self {
    case .invalidNonSurrogateCodePointByte:
      ".invalidNonSurrogateCodePointByte"
    case .overlongEncodingByte:
      ".overlongEncodingByte"
    case .surrogateCodePointByte:
      ".surrogateCodePointByte"
    case .truncatedScalar:
      ".truncatedScalar"
    case .unexpectedContinuationByte:
      ".unexpectedContinuationByte"
    default:
      fatalError("unreachable")
    }
  }
}

@_unavailableInEmbedded
@available(SwiftStdlib 6.2, *)
extension UTF8.ValidationError: CustomStringConvertible {
  public var description: String {
    "UTF8.ValidationError(\(kind), \(byteOffsets))"
  }
}

extension UTF8 {
  @available(SwiftStdlib 6.2, *)
  @usableFromInline // for testing purposes
  internal static func _checkAllErrors(
    _ s: some Sequence<UInt8>
  ) -> Array<UTF8.ValidationError> {
    // TODO: Span fast path
    // TODO: Fixed size buffer for non-contig inputs
    // TODO: Lifetime-dependent result variant
    let cus = Array(s)
    return unsafe cus.withUnsafeBytes {
      var bufPtr = unsafe $0
      var start = 0
      var errors: Array<UTF8.ValidationError> = []

      // Remember the previous error, so that we can
      // apply it to subsequent bytes instead of reporting
      // just `.unexpectedContinuation`.
      var priorError: UTF8.ValidationError? = nil
      while true {
        do throws(UTF8.ValidationError) {
          _ = unsafe try bufPtr.baseAddress!._validateUTF8(limitedBy: bufPtr.count)
          return errors
        } catch {
          let adjustedRange =
            error.byteOffsets.lowerBound + start ..< error.byteOffsets.upperBound + start

          let kind: UTF8.ValidationError.Kind
          if let prior = priorError,
             prior.byteOffsets.upperBound == adjustedRange.lowerBound,
             error.kind == .unexpectedContinuationByte
          {
            kind = prior.kind
          } else {
            kind = error.kind
          }
          let adjustedErr = UTF8.ValidationError(kind, adjustedRange)
          priorError = adjustedErr

          let errEnd = error.byteOffsets.upperBound
          start += errEnd
          unsafe bufPtr = .init(rebasing: bufPtr[errEnd...])
          errors.append(adjustedErr)
        }
      }
    }
  }
}