extension Unicode.UTF8 {
  /**

   The kind and location of a UTF-8 encoding error.

   Valid UTF-8 is represented by this table:

   ```
   ╔════════════════════╦════════╦════════╦════════╦════════╗
   ║    Scalar value    ║ Byte 0 ║ Byte 1 ║ Byte 2 ║ Byte 3 ║
   ╠════════════════════╬════════╬════════╬════════╬════════╣
   ║ U+0000..U+007F     ║ 00..7F ║        ║        ║        ║
   ║ U+0080..U+07FF     ║ C2..DF ║ 80..BF ║        ║        ║
   ║ U+0800..U+0FFF     ║ E0     ║ A0..BF ║ 80..BF ║        ║
   ║ U+1000..U+CFFF     ║ E1..EC ║ 80..BF ║ 80..BF ║        ║
   ║ U+D000..U+D7FF     ║ ED     ║ 80..9F ║ 80..BF ║        ║
   ║ U+E000..U+FFFF     ║ EE..EF ║ 80..BF ║ 80..BF ║        ║
   ║ U+10000..U+3FFFF   ║ F0     ║ 90..BF ║ 80..BF ║ 80..BF ║
   ║ U+40000..U+FFFFF   ║ F1..F3 ║ 80..BF ║ 80..BF ║ 80..BF ║
   ║ U+100000..U+10FFFF ║ F4     ║ 80..8F ║ 80..BF ║ 80..BF ║
   ╚════════════════════╩════════╩════════╩════════╩════════╝
   ```

   ### Classifying errors

   An *unexpected continuation* is when a continuation byte (`10xxxxxx`) occurs
   in a position that should be the start of a new scalar value. Unexpected
   continuations can often occur when the input contains arbitrary data
   instead of textual content. An unexpected continuation at the start of
   input might mean that the input was not correctly sliced along scalar
   boundaries or that it does not contain UTF-8.

   A *truncated scalar* is a multi-byte sequence that is the start of a valid
   multi-byte scalar but is cut off before ending correctly. A truncated
   scalar at the end of the input might mean that only part of the entire
   input was received.

   A *surrogate code point* (`U+D800..U+DFFF`) is invalid UTF-8. Surrogate
   code points are used by UTF-16 to encode scalars in the supplementary
   planes. Their presence may mean the input was encoded in a different 8-bit
   encoding, such as CESU-8, WTF-8, or Java's Modified UTF-8.

   An *invalid non-surrogate code point* is any code point higher than
   `U+10FFFF`. This can often occur when the input is arbitrary data instead
   of textual content.

   An *overlong encoding* occurs when a scalar value that could have been
   encoded using fewer bytes is encoded in a longer byte sequence. Overlong
   encodings are invalid UTF-8 and can lead to security issues if not
   correctly detected:

   - https://nvd.nist.gov/vuln/detail/CVE-2008-2938
   - https://nvd.nist.gov/vuln/detail/CVE-2000-0884

   An overlong encoding of `NUL`, `0xC0 0x80`, is used in Java's Modified
   UTF-8 but is invalid UTF-8. Overlong encoding errors often catch attempts
   to bypass security measures.

   ### Reporting the range of the error

   The range of the error reported follows the *Maximal subpart of an
   ill-formed subsequence* algorithm in which each error is either one byte
   long or ends before the first byte that is disallowed. See "U+FFFD
   Substitution of Maximal Subparts" in the Unicode Standard. Unicode started
   recommending this algorithm in version 6, and it has been adopted by the
   W3C.

   The maximal subpart algorithm will produce a single multi-byte range for a
   truncated scalar (a multi-byte sequence that is the start of a valid
   multi-byte scalar but is cut off before ending correctly). For all other
   errors (including overlong encodings, surrogates, and invalid code
   points), it will produce an error per byte.

   // FIXME: without a checkAllErrors, we don't have these classification
   // distinctions, should we drop it, ensure we will do it, or what?

   Since overlong encodings, surrogates, and invalid code points are erroneous
   by the second byte (at the latest), the above definition produces the same
   ranges as defining such a sequence as a truncated scalar error followed by
   unexpected continuation byte errors. The more semantically-rich
   classification is reported.

   For example, a surrogate code point sequence `ED A0 80` will be reported
   as three `.surrogateCodePointByte` errors rather than a `.truncatedScalar`
   followed by two `.unexpectedContinuationByte` errors.

   Other commonly reported error ranges can be constructed from this result.
   For example, PEP 383's error-per-byte can be constructed by mapping over
   the reported range. Similarly, constructing a single error for the longest
   invalid byte range can be constructed by joining adjacent error ranges.

   ```
   ╔═════════════════╦══════╦═════╦═════╦═════╦═════╦═════╦═════╦══════╗
   ║                 ║  61  ║ F1  ║ 80  ║ 80  ║ E1  ║ 80  ║ C2  ║  62  ║
   ╠═════════════════╬══════╬═════╬═════╬═════╬═════╬═════╬═════╬══════╣
   ║ Longest range   ║ U+61 ║ err ║     ║     ║     ║     ║     ║ U+62 ║
   ║ Maximal subpart ║ U+61 ║ err ║     ║     ║ err ║     ║ err ║ U+62 ║
   ║ Error per byte  ║ U+61 ║ err ║ err ║ err ║ err ║ err ║ err ║ U+62 ║
   ╚═════════════════╩══════╩═════╩═════╩═════╩═════╩═════╩═════╩══════╝
   ```

   */
  @available(SwiftStdlib 6.2, *)
  @frozen
  public struct ValidationError: Error, Sendable, Hashable {
    /// The kind of encoding error
    public var kind: Unicode.UTF8.ValidationError.Kind

    /// The range of offsets into our input containing the error
    public var byteOffsets: Range<Int>

    /// Creates an error of the given kind covering the given byte offsets.
    ///
    /// - Parameters:
    ///   - kind: The kind of encoding error.
    ///   - byteOffsets: The offsets of the erroneous bytes in the input.
    ///
    /// - Precondition: `byteOffsets.lowerBound` is non-negative. For
    ///   `.truncatedScalar`, `byteOffsets` contains one to three offsets;
    ///   for every other kind it contains exactly one.
    @_alwaysEmitIntoClient
    public init(
      _ kind: Unicode.UTF8.ValidationError.Kind,
      _ byteOffsets: Range<Int>
    ) {
      _precondition(byteOffsets.lowerBound >= 0)
      if kind == .truncatedScalar {
        // Only a truncated scalar may span several bytes, and it is always
        // shorter than a complete four-byte scalar.
        _precondition(!byteOffsets.isEmpty)
        _precondition(byteOffsets.count < 4)
      } else {
        // Every other kind of error is reported one byte at a time.
        _precondition(byteOffsets.count == 1)
      }

      self.kind = kind
      self.byteOffsets = byteOffsets
    }

    /// Creates a single-byte error of the given kind at the given offset.
    ///
    /// - Parameters:
    ///   - kind: The kind of encoding error.
    ///   - byteOffset: The offset of the erroneous byte in the input.
    @_alwaysEmitIntoClient
    public init(
      _ kind: Unicode.UTF8.ValidationError.Kind, at byteOffset: Int
    ) {
      self.init(kind, byteOffset..<(byteOffset+1))
    }
  }
}

@available(SwiftStdlib 6.2, *)
extension UTF8.ValidationError {
  /// The kind of encoding error encountered during validation
  @frozen
  public struct Kind: Error, Sendable, Hashable, RawRepresentable {
    /// The raw representation of this kind, in the range `0...4`.
    public var rawValue: UInt8

    /// Creates a kind from its raw value, or returns `nil` if `rawValue`
    /// is greater than 4.
    @inlinable
    public init?(rawValue: UInt8) {
      guard rawValue <= 4 else { return nil }
      self.rawValue = rawValue
    }

    /// A continuation byte (`10xxxxxx`) outside of a multi-byte sequence
    @_alwaysEmitIntoClient
    public static var unexpectedContinuationByte: Self {
      .init(rawValue: 0)!
    }

    /// A byte in a surrogate code point (`U+D800..U+DFFF`) sequence
    @_alwaysEmitIntoClient
    public static var surrogateCodePointByte: Self {
      .init(rawValue: 1)!
    }

    /// A byte in an invalid, non-surrogate code point (`>U+10FFFF`) sequence
    @_alwaysEmitIntoClient
    public static var invalidNonSurrogateCodePointByte: Self {
      .init(rawValue: 2)!
    }

    /// A byte in an overlong encoding sequence
    @_alwaysEmitIntoClient
    public static var overlongEncodingByte: Self {
      .init(rawValue: 3)!
    }

    /// A multi-byte sequence that is the start of a valid multi-byte scalar
    /// but is cut off before ending correctly
    @_alwaysEmitIntoClient
    public static var truncatedScalar: Self {
      .init(rawValue: 4)!
    }
  }
}

@_unavailableInEmbedded
@available(SwiftStdlib 6.2, *)
extension UTF8.ValidationError.Kind: CustomStringConvertible {
  /// The name of this kind, spelled like its static member (for example
  /// `".truncatedScalar"`).
  public var description: String {
    switch self {
    case .invalidNonSurrogateCodePointByte:
      ".invalidNonSurrogateCodePointByte"
    case .overlongEncodingByte:
      ".overlongEncodingByte"
    case .surrogateCodePointByte:
      ".surrogateCodePointByte"
    case .truncatedScalar:
      ".truncatedScalar"
    case .unexpectedContinuationByte:
      ".unexpectedContinuationByte"
    default:
      // `Kind` is a struct wrapping a raw value, so the switch cannot be
      // checked for exhaustiveness; the failable initializer rejects every
      // raw value not listed above.
      fatalError("unreachable")
    }
  }
}

@_unavailableInEmbedded
@available(SwiftStdlib 6.2, *)
extension UTF8.ValidationError: CustomStringConvertible {
  /// A textual representation of the error's kind and byte offsets.
  public var description: String {
    "UTF8.ValidationError(\(kind), \(byteOffsets))"
  }
}

extension UTF8 {
  /// Collects every UTF-8 validation error found in `s`.
  ///
  /// The input is copied into an array and validated repeatedly: after each
  /// error, validation resumes at the byte following the reported range.
  ///
  /// - Parameter s: The bytes to validate.
  /// - Returns: The errors found, in order of occurrence, with byte offsets
  ///   relative to the start of `s`. Empty if no error was found.
  @available(SwiftStdlib 6.2, *)
  @usableFromInline // for testing purposes
  internal static func _checkAllErrors(
    _ s: some Sequence<UInt8>
  ) -> [UTF8.ValidationError] {
    // TODO: Span fast path
    // TODO: Fixed size buffer for non-contig inputs
    // TODO: Lifetime-dependent result variant
    let cus = Array(s)
    return unsafe cus.withUnsafeBytes {
      var bufPtr = unsafe $0
      // Offset of `bufPtr`'s first byte from the start of the whole input.
      var start = 0
      var errors: [UTF8.ValidationError] = []

      // Remember the previous error, so that we can
      // apply it to subsequent bytes instead of reporting
      // just `.unexpectedContinuationByte`.
      var priorError: UTF8.ValidationError? = nil
      while true {
        do throws(UTF8.ValidationError) {
          _ = unsafe try bufPtr.baseAddress!._validateUTF8(limitedBy: bufPtr.count)
          // The remainder of the input validated cleanly.
          return errors
        } catch {
          // The thrown offsets are relative to the rebased buffer; translate
          // them back to offsets into the whole input.
          let adjustedRange =
            error.byteOffsets.lowerBound + start ..< error.byteOffsets.upperBound + start

          // A continuation byte directly after the previous error belongs to
          // the same ill-formed sequence, so it inherits that error's kind.
          let kind: UTF8.ValidationError.Kind
          if let prior = priorError,
             prior.byteOffsets.upperBound == adjustedRange.lowerBound,
             error.kind == .unexpectedContinuationByte
          {
            kind = prior.kind
          } else {
            kind = error.kind
          }
          let adjustedErr = UTF8.ValidationError(kind, adjustedRange)
          priorError = adjustedErr

          // Resume validation just past the bytes covered by this error.
          let errEnd = error.byteOffsets.upperBound
          start += errEnd
          unsafe bufPtr = .init(rebasing: bufPtr[errEnd...])
          errors.append(adjustedErr)
        }
      }
    }
  }
}