UTF8Span (#78531) (#80890)

Add support for UTF8Span Also, refactor validation and grapheme breaking
2025-12-21 12:14:44 +01:00 · 2025-04-18 08:01:15 -06:00
parent c363658147
commit 393c24ddd7
22 changed files with 3365 additions and 467 deletions
--- a/stdlib/public/core/UTF8EncodingError.swift
+++ b/stdlib/public/core/UTF8EncodingError.swift
@@ -0,0 +1,261 @@
+extension Unicode.UTF8 {
+  /**
+
+   The kind and location of a UTF-8 encoding error.
+
+   Valid UTF-8 is represented by this table:
+
+   ```
+   ╔════════════════════╦════════╦════════╦════════╦════════╗
+   ║    Scalar value    ║ Byte 0 ║ Byte 1 ║ Byte 2 ║ Byte 3 ║
+   ╠════════════════════╬════════╬════════╬════════╬════════╣
+   ║ U+0000..U+007F     ║ 00..7F ║        ║        ║        ║
+   ║ U+0080..U+07FF     ║ C2..DF ║ 80..BF ║        ║        ║
+   ║ U+0800..U+0FFF     ║ E0     ║ A0..BF ║ 80..BF ║        ║
+   ║ U+1000..U+CFFF     ║ E1..EC ║ 80..BF ║ 80..BF ║        ║
+   ║ U+D000..U+D7FF     ║ ED     ║ 80..9F ║ 80..BF ║        ║
+   ║ U+E000..U+FFFF     ║ EE..EF ║ 80..BF ║ 80..BF ║        ║
+   ║ U+10000..U+3FFFF   ║ F0     ║ 90..BF ║ 80..BF ║ 80..BF ║
+   ║ U+40000..U+FFFFF   ║ F1..F3 ║ 80..BF ║ 80..BF ║ 80..BF ║
+   ║ U+100000..U+10FFFF ║ F4     ║ 80..8F ║ 80..BF ║ 80..BF ║
+   ╚════════════════════╩════════╩════════╩════════╩════════╝
+   ```
+
+   ### Classifying errors
+
+   An *unexpected continuation* is when a continuation byte (`10xxxxxx`) occurs
+   in a position that should be the start of a new scalar value. Unexpected
+   continuations can often occur when the input contains arbitrary data
+   instead of textual content. An unexpected continuation at the start of
+   input might mean that the input was not correctly sliced along scalar
+   boundaries or that it does not contain UTF-8.
+
+   A *truncated scalar* is a multi-byte sequence that is the start of a valid
+   multi-byte scalar but is cut off before ending correctly. A truncated
+   scalar at the end of the input might mean that only part of the entire
+   input was received.
+
+   A *surrogate code point* (`U+D800..U+DFFF`) is invalid UTF-8. Surrogate
+   code points are used by UTF-16 to encode scalars in the supplementary
+   planes. Their presence may mean the input was encoded in a different 8-bit
+   encoding, such as CESU-8, WTF-8, or Java's Modified UTF-8.
+
+   An *invalid non-surrogate code point* is any code point higher than
+   `U+10FFFF`. This can often occur when the input is arbitrary data instead
+   of textual content.
+
+   An *overlong encoding* occurs when a scalar value that could have been
+   encoded using fewer bytes is encoded in a longer byte sequence. Overlong
+   encodings are invalid UTF-8 and can lead to security issues if not
+   correctly detected:
+
+   - https://nvd.nist.gov/vuln/detail/CVE-2008-2938
+   - https://nvd.nist.gov/vuln/detail/CVE-2000-0884
+
+   An overlong encoding of `NUL`, `0xC0 0x80`, is used in Java's Modified
+   UTF-8 but is invalid UTF-8. Overlong encoding errors often catch attempts
+   to bypass security measures.
+
+   ### Reporting the range of the error
+
+   The range of the error reported follows the *Maximal subpart of an
+   ill-formed subsequence* algorithm in which each error is either one byte
+   long or ends before the first byte that is disallowed. See "U+FFFD
+   Substitution of Maximal Subparts" in the Unicode Standard. Unicode started
+   recommending this algorithm in version 6 and is adopted by the W3C.
+
+   The maximal subpart algorithm will produce a single multi-byte range for a
+   truncated scalar (a multi-byte sequence that is the start of a valid
+   multi-byte scalar but is cut off before ending correctly). For all other
+   errors (including overlong encodings, surrogates, and invalid code
+   points), it will produce an error per byte.
+
+   // FIXME: without a checkAllErrors, we don't have these classification distinctions, should we drop it, ensure we will do it, or what?
+
+   Since overlong encodings, surrogates, and invalid code points are erroneous
+   by the second byte (at the latest), the above definition produces the same
+   ranges as defining such a sequence as a truncated scalar error followed by
+   unexpected continuation byte errors. The more semantically-rich
+   classification is reported.
+
+   For example, a surrogate count point sequence `ED A0 80` will be reported
+   as three `.surrogateCodePointByte` errors rather than a `.truncatedScalar`
+   followed by two `.unexpectedContinuationByte` errors.
+
+   Other commonly reported error ranges can be constructed from this result.
+   For example, PEP 383's error-per-byte can be constructed by mapping over
+   the reported range. Similarly, constructing a single error for the longest
+   invalid byte range can be constructed by joining adjacent error ranges.
+
+   ```
+   ╔═════════════════╦══════╦═════╦═════╦═════╦═════╦═════╦═════╦══════╗
+   ║                 ║  61  ║ F1  ║ 80  ║ 80  ║ E1  ║ 80  ║ C2  ║  62  ║
+   ╠═════════════════╬══════╬═════╬═════╬═════╬═════╬═════╬═════╬══════╣
+   ║ Longest range   ║ U+61 ║ err ║     ║     ║     ║     ║     ║ U+62 ║
+   ║ Maximal subpart ║ U+61 ║ err ║     ║     ║ err ║     ║ err ║ U+62 ║
+   ║ Error per byte  ║ U+61 ║ err ║ err ║ err ║ err ║ err ║ err ║ U+62 ║
+   ╚═════════════════╩══════╩═════╩═════╩═════╩═════╩═════╩═════╩══════╝
+   ```
+
+   */
+  @available(SwiftStdlib 6.2, *)
+  @frozen
+  public struct ValidationError: Error, Sendable, Hashable
+  {
+    /// The kind of encoding error
+    public var kind: Unicode.UTF8.ValidationError.Kind
+
+    /// The range of offsets into our input containing the error
+    public var byteOffsets: Range<Int>
+
+    @_alwaysEmitIntoClient
+    public init(
+      _ kind: Unicode.UTF8.ValidationError.Kind,
+      _ byteOffsets: Range<Int>
+    ) {
+      _precondition(byteOffsets.lowerBound >= 0)
+      if kind == .truncatedScalar {
+        _precondition(!byteOffsets.isEmpty)
+        _precondition(byteOffsets.count < 4)
+      } else {
+        _precondition(byteOffsets.count == 1)
+      }
+
+      self.kind = kind
+      self.byteOffsets = byteOffsets
+    }
+
+    @_alwaysEmitIntoClient
+    public init(
+      _ kind: Unicode.UTF8.ValidationError.Kind, at byteOffset: Int
+    ) {
+      self.init(kind, byteOffset..<(byteOffset+1))
+    }
+  }
+}
+
+
+@available(SwiftStdlib 6.2, *)
+extension UTF8.ValidationError {
+  /// The kind of encoding error encountered during validation
+  @frozen
+  public struct Kind: Error, Sendable, Hashable, RawRepresentable
+   {
+    public var rawValue: UInt8
+
+    @inlinable
+    public init?(rawValue: UInt8) {
+      guard rawValue <= 4 else { return nil }
+      self.rawValue = rawValue
+    }
+
+    /// A continuation byte (`10xxxxxx`) outside of a multi-byte sequence
+    @_alwaysEmitIntoClient
+    public static var unexpectedContinuationByte: Self {
+      .init(rawValue: 0)!
+    }
+
+    /// A byte in a surrogate code point (`U+D800..U+DFFF`) sequence
+    @_alwaysEmitIntoClient
+    public static var surrogateCodePointByte: Self {
+      .init(rawValue: 1)!
+    }
+
+    /// A byte in an invalid, non-surrogate code point (`>U+10FFFF`) sequence
+    @_alwaysEmitIntoClient
+    public static var invalidNonSurrogateCodePointByte: Self {
+      .init(rawValue: 2)!
+    }
+
+    /// A byte in an overlong encoding sequence
+    @_alwaysEmitIntoClient
+    public static var overlongEncodingByte: Self {
+      .init(rawValue: 3)!
+    }
+
+    /// A multi-byte sequence that is the start of a valid multi-byte scalar
+    /// but is cut off before ending correctly
+    @_alwaysEmitIntoClient
+    public static var truncatedScalar: Self {
+      .init(rawValue: 4)!
+    }
+  }
+}
+
+@_unavailableInEmbedded
+@available(SwiftStdlib 6.2, *)
+extension UTF8.ValidationError.Kind: CustomStringConvertible {
+  public var description: String {
+    switch self {
+    case .invalidNonSurrogateCodePointByte:
+      ".invalidNonSurrogateCodePointByte"
+    case .overlongEncodingByte:
+      ".overlongEncodingByte"
+    case .surrogateCodePointByte:
+      ".surrogateCodePointByte"
+    case .truncatedScalar:
+      ".truncatedScalar"
+    case .unexpectedContinuationByte:
+      ".unexpectedContinuationByte"
+    default:
+      fatalError("unreachable")
+    }
+  }
+}
+
+@_unavailableInEmbedded
+@available(SwiftStdlib 6.2, *)
+extension UTF8.ValidationError: CustomStringConvertible {
+  public var description: String {
+    "UTF8.ValidationError(\(kind), \(byteOffsets))"
+  }
+}
+
+extension UTF8 {
+  @available(SwiftStdlib 6.2, *)
+  @usableFromInline // for testing purposes
+  internal static func _checkAllErrors(
+    _ s: some Sequence<UInt8>
+  ) -> Array<UTF8.ValidationError> {
+    // TODO: Span fast path
+    // TODO: Fixed size buffer for non-contig inputs
+    // TODO: Lifetime-dependent result variant
+    let cus = Array(s)
+    return unsafe cus.withUnsafeBytes {
+      var bufPtr = unsafe $0
+      var start = 0
+      var errors: Array<UTF8.ValidationError> = []
+
+      // Remember the previous error, so that we can
+      // apply it to subsequent bytes instead of reporting
+      // just `.unexpectedContinuation`.
+      var priorError: UTF8.ValidationError? = nil
+      while true {
+        do throws(UTF8.ValidationError) {
+          _ = unsafe try bufPtr.baseAddress!._validateUTF8(limitedBy: bufPtr.count)
+          return errors
+        } catch {
+          let adjustedRange =
+            error.byteOffsets.lowerBound + start ..< error.byteOffsets.upperBound + start
+
+          let kind: UTF8.ValidationError.Kind
+          if let prior = priorError,
+             prior.byteOffsets.upperBound == adjustedRange.lowerBound,
+             error.kind == .unexpectedContinuationByte
+          {
+            kind = prior.kind
+          } else {
+            kind = error.kind
+          }
+          let adjustedErr = UTF8.ValidationError(kind, adjustedRange)
+          priorError = adjustedErr
+
+          let errEnd = error.byteOffsets.upperBound
+          start += errEnd
+          unsafe bufPtr = .init(rebasing: bufPtr[errEnd...])
+          errors.append(adjustedErr)
+        }
+      }
+    }
+  }
+}