mirror of
https://github.com/apple/swift.git
synced 2025-12-14 20:36:38 +01:00
677 lines
24 KiB
Swift
677 lines
24 KiB
Swift
//===----------------------------------------------------------------------===//
|
|
//
|
|
// This source file is part of the Swift.org open source project
|
|
//
|
|
// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
|
|
// Licensed under Apache License v2.0 with Runtime Library Exception
|
|
//
|
|
// See https://swift.org/LICENSE.txt for license information
|
|
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
import SwiftShims
|
|
|
|
// Conversions between different Unicode encodings. Note that UTF-16 and
|
|
// UTF-32 decoding are *not* currently resilient to erroneous data.
|
|
|
|
/// The outcome of a single Unicode decoding step.
///
/// A `UnicodeDecodingResult` value is exactly one of three things: a decoded
/// Unicode scalar value, a signal that the input has no more Unicode scalars
/// to offer, or a signal that decoding failed.
@frozen
public enum UnicodeDecodingResult: Equatable {
  /// A Unicode scalar value that was decoded successfully.
  case scalarValue(Unicode.Scalar)

  /// The input is exhausted; no further Unicode scalars are available.
  case emptyInput

  /// The input could not be decoded.
  case error

  /// Returns a Boolean value indicating whether two decoding results are
  /// equal.
  ///
  /// Two results are equal when they are the same case and, for
  /// `.scalarValue`, when they carry equal scalars.
  @inlinable
  public static func == (
    lhs: UnicodeDecodingResult,
    rhs: UnicodeDecodingResult
  ) -> Bool {
    switch lhs {
    case .scalarValue(let expected):
      if case .scalarValue(let actual) = rhs { return expected == actual }
      return false
    case .emptyInput:
      if case .emptyInput = rhs { return true }
      return false
    case .error:
      if case .error = rhs { return true }
      return false
    }
  }
}
|
|
|
|
/// A Unicode encoding form: a two-way translation between Unicode scalar
/// values and the code units of one particular form.
///
/// Conforming types declare how to decode a sequence of code units into
/// Unicode scalar values and how to encode a Unicode scalar value into code
/// units. The standard library provides codecs for the UTF-8, UTF-16, and
/// UTF-32 encoding schemes as the `UTF8`, `UTF16`, and `UTF32` types,
/// respectively. Decoded Unicode scalar values are represented by the
/// `Unicode.Scalar` type.
public protocol UnicodeCodec: Unicode.Encoding {

  /// Creates an instance of the codec.
  init()

  /// Starts or continues decoding a code unit sequence into Unicode scalar
  /// values.
  ///
  /// A complete decode requires calling this method over and over until it
  /// returns `UnicodeDecodingResult.emptyInput`. An exhausted iterator is not
  /// proof that decoding is finished, because the decoder may be holding
  /// data it has already pulled from the iterator.
  ///
  /// That same buffering makes it impossible to map a returned
  /// `Unicode.Scalar`, or an error, back to a position in the iterator.
  ///
  /// The following example decodes the UTF-8 encoded bytes of a string into an
  /// array of `Unicode.Scalar` instances:
  ///
  ///     let str = "✨Unicode✨"
  ///     print(Array(str.utf8))
  ///     // Prints "[226, 156, 168, 85, 110, 105, 99, 111, 100, 101, 226, 156, 168]"
  ///
  ///     var bytesIterator = str.utf8.makeIterator()
  ///     var scalars: [Unicode.Scalar] = []
  ///     var utf8Decoder = UTF8()
  ///     Decode: while true {
  ///         switch utf8Decoder.decode(&bytesIterator) {
  ///         case .scalarValue(let v): scalars.append(v)
  ///         case .emptyInput: break Decode
  ///         case .error:
  ///             print("Decoding error")
  ///             break Decode
  ///         }
  ///     }
  ///     print(scalars)
  ///     // Prints "["\u{2728}", "U", "n", "i", "c", "o", "d", "e", "\u{2728}"]"
  ///
  /// - Parameter input: An iterator of code units to be decoded. Pass the
  ///   same iterator instance on every call to this method, and do not
  ///   advance that iterator, or any copy of it, anywhere else.
  /// - Returns: A `UnicodeDecodingResult` instance that holds the next
  ///   Unicode scalar, reports an error, or reports that the UTF sequence
  ///   has been fully decoded.
  mutating func decode<I: IteratorProtocol>(
    _ input: inout I
  ) -> UnicodeDecodingResult where I.Element == CodeUnit

  /// Encodes a Unicode scalar as a series of code units, handing each code
  /// unit to the given closure.
  ///
  /// For example, the musical fermata symbol ("𝄐") is one Unicode scalar
  /// value (`\u{1D110}`) whose UTF-8 representation takes four code units.
  /// The following code uses the `UTF8` codec to encode a fermata in UTF-8:
  ///
  ///     var bytes: [UTF8.CodeUnit] = []
  ///     UTF8.encode("𝄐", into: { bytes.append($0) })
  ///     print(bytes)
  ///     // Prints "[240, 157, 132, 144]"
  ///
  /// - Parameters:
  ///   - input: The Unicode scalar value to encode.
  ///   - processCodeUnit: A closure that is called with one code unit at a
  ///     time.
  static func encode(
    _ input: Unicode.Scalar,
    into processCodeUnit: (CodeUnit) -> Void
  )

  /// Finds the offset of the first `CodeUnit` that is equal to 0.
  ///
  /// This is the counterpart of `strlen` for C strings.
  ///
  /// - Complexity: O(*n*)
  static func _nullCodeUnitOffset(in input: UnsafePointer<CodeUnit>) -> Int
}
|
|
|
|
/// Makes `Unicode.UTF8` a codec that translates between Unicode scalar values
/// and UTF-8 code units.
extension Unicode.UTF8: UnicodeCodec {
  /// Creates an instance of the UTF-8 codec.
  @inlinable
  public init() {
    self = ._swift3Buffer(ForwardParser())
  }

  /// Starts or continues decoding a UTF-8 sequence.
  ///
  /// A complete decode requires calling this method over and over until it
  /// returns `UnicodeDecodingResult.emptyInput`. An exhausted iterator is not
  /// proof that decoding is finished, because the decoder may be holding
  /// data it has already pulled from the iterator.
  ///
  /// That same buffering makes it impossible to map a returned
  /// `Unicode.Scalar`, or an error, back to a position in the iterator.
  ///
  /// The following example decodes the UTF-8 encoded bytes of a string into an
  /// array of `Unicode.Scalar` instances. It is for demonstration only; when
  /// you need the Unicode scalars of a string, use the string's
  /// `unicodeScalars` view.
  ///
  ///     let str = "✨Unicode✨"
  ///     print(Array(str.utf8))
  ///     // Prints "[226, 156, 168, 85, 110, 105, 99, 111, 100, 101, 226, 156, 168]"
  ///
  ///     var bytesIterator = str.utf8.makeIterator()
  ///     var scalars: [Unicode.Scalar] = []
  ///     var utf8Decoder = UTF8()
  ///     Decode: while true {
  ///         switch utf8Decoder.decode(&bytesIterator) {
  ///         case .scalarValue(let v): scalars.append(v)
  ///         case .emptyInput: break Decode
  ///         case .error:
  ///             print("Decoding error")
  ///             break Decode
  ///         }
  ///     }
  ///     print(scalars)
  ///     // Prints "["\u{2728}", "U", "n", "i", "c", "o", "d", "e", "\u{2728}"]"
  ///
  /// - Parameter input: An iterator of code units to be decoded. Pass the
  ///   same iterator instance on every call to this method, and do not
  ///   advance that iterator, or any copy of it, anywhere else.
  /// - Returns: A `UnicodeDecodingResult` instance that holds the next
  ///   Unicode scalar, reports an error, or reports that the UTF sequence
  ///   has been fully decoded.
  @inlinable
  @inline(__always)
  public mutating func decode<I: IteratorProtocol>(
    _ input: inout I
  ) -> UnicodeDecodingResult where I.Element == CodeUnit {
    guard case ._swift3Buffer(var bufferedParser) = self else {
      Builtin.unreachable()
    }

    let outcome = bufferedParser.parseScalar(from: &input)
    // Put the (possibly mutated) parser back into `self` so the next call
    // continues from the same parser state.
    self = ._swift3Buffer(bufferedParser)

    switch outcome {
    case .emptyInput:
      return .emptyInput
    case .error:
      return .error
    case .valid(let encodedScalar):
      return .scalarValue(UTF8.decode(encodedScalar))
    }
  }

  /// Attempts to decode a single UTF-8 code unit sequence that begins at the
  /// least significant byte of `buffer`.
  ///
  /// - Returns:
  ///   - result: The decoded code point when the code unit sequence is
  ///     well-formed; `nil` otherwise.
  ///   - length: For a well-formed sequence, its length in bytes. Otherwise,
  ///     the length of the *maximal subpart of the ill-formed sequence*
  ///     (Unicode 8.0.0, Ch 3.9, D93b), that is, the number of leading code
  ///     units that were valid, or 1 if none were. Unicode recommends
  ///     skipping these bytes and substituting a single replacement
  ///     character (U+FFFD) for them.
  ///
  /// - Requires: `buffer` contains at least one used byte, and every unused
  ///   byte of `buffer` holds a value that does not have the UTF-8
  ///   continuation byte form (`0b10xxxxxx`).
  @inlinable
  public // @testable
  static func _decodeOne(_ buffer: UInt32) -> (result: UInt32?, length: UInt8) {
    // The buffer is consumed starting from its least significant byte:
    // [ #3 #2 #1 #0 ].

    // A clear high bit in byte #0 means a 1-byte (ASCII) sequence,
    // buffer: [ ... ... ... CU0 ].
    guard buffer & 0x80 != 0 else {
      return (result: buffer & 0xff, length: 1)
    }

    // Seed a parser's buffer directly with the caller's word, declaring all
    // 32 bits occupied, and give it an iterator that yields nothing so that
    // it can only work from the seeded bytes.
    var seededParser = ForwardParser()
    seededParser._buffer._storage = buffer
    seededParser._buffer._bitCount = 32
    var noMoreInput = EmptyCollection<UInt8>().makeIterator()

    switch seededParser.parseScalar(from: &noMoreInput) {
    case .error(let invalidLength):
      return (result: nil, length: UInt8(truncatingIfNeeded: invalidLength))
    case .valid(let encodedScalar):
      return (
        result: UTF8.decode(encodedScalar).value,
        length: UInt8(truncatingIfNeeded: encodedScalar.count))
    case .emptyInput:
      // Treated as impossible here: the parser buffer was seeded above.
      Builtin.unreachable()
    }
  }

  /// Encodes a Unicode scalar as a series of code units, handing each code
  /// unit to the given closure.
  ///
  /// For example, the musical fermata symbol ("𝄐") is one Unicode scalar
  /// value (`\u{1D110}`) whose UTF-8 representation takes four code units.
  /// The following code encodes a fermata in UTF-8:
  ///
  ///     var bytes: [UTF8.CodeUnit] = []
  ///     UTF8.encode("𝄐", into: { bytes.append($0) })
  ///     print(bytes)
  ///     // Prints "[240, 157, 132, 144]"
  ///
  /// - Parameters:
  ///   - input: The Unicode scalar value to encode.
  ///   - processCodeUnit: A closure that is called with one code unit at a
  ///     time.
  @inlinable
  @inline(__always)
  public static func encode(
    _ input: Unicode.Scalar,
    into processCodeUnit: (CodeUnit) -> Void
  ) {
    // Every byte of `_biasedBits` is emitted after subtracting 1 from it;
    // once the remaining bits are all zero there are no more code units.
    // The sequence is deliberately unrolled: one to four code units.
    var pendingBits = encode(input)!._biasedBits
    processCodeUnit(UInt8(truncatingIfNeeded: pendingBits) &- 0x01)

    pendingBits &>>= 8
    guard _slowPath(pendingBits != 0) else { return }
    processCodeUnit(UInt8(truncatingIfNeeded: pendingBits) &- 0x01)

    pendingBits &>>= 8
    guard _slowPath(pendingBits != 0) else { return }
    processCodeUnit(UInt8(truncatingIfNeeded: pendingBits) &- 0x01)

    pendingBits &>>= 8
    guard _slowPath(pendingBits != 0) else { return }
    processCodeUnit(UInt8(truncatingIfNeeded: pendingBits) &- 0x01)
  }

  /// Returns a Boolean value indicating whether the specified code unit is a
  /// UTF-8 continuation byte.
  ///
  /// Continuation bytes have the form `0b10xxxxxx`. For example, a lowercase
  /// "e" with an acute accent above it (`"é"`) takes 2 bytes in UTF-8:
  /// `0b11000011` (195) and `0b10101001` (169). The second byte is a
  /// continuation byte.
  ///
  ///     let eAcute = "é"
  ///     for codeUnit in eAcute.utf8 {
  ///         print(codeUnit, UTF8.isContinuation(codeUnit))
  ///     }
  ///     // Prints "195 false"
  ///     // Prints "169 true"
  ///
  /// - Parameter byte: A UTF-8 code unit.
  /// - Returns: `true` if `byte` is a continuation byte; otherwise, `false`.
  @inlinable
  public static func isContinuation(_ byte: CodeUnit) -> Bool {
    // Keep only the top two bits and compare them with `10`.
    let topTwoBits = byte & 0b1100_0000
    return topTwoBits == 0b1000_0000
  }

  /// Returns the offset of the first zero code unit in `input`, as computed
  /// by `_swift_stdlib_strlen_unsigned`.
  @inlinable
  public static func _nullCodeUnitOffset(
    in input: UnsafePointer<CodeUnit>
  ) -> Int {
    let length = _swift_stdlib_strlen_unsigned(input)
    return Int(length)
  }

  /// Returns the offset of the first zero element in `input`, as computed by
  /// `_swift_stdlib_strlen`.
  ///
  /// This overload supports parsing C strings as if they were UTF-8 strings.
  @inlinable
  public static func _nullCodeUnitOffset(
    in input: UnsafePointer<CChar>
  ) -> Int {
    let length = _swift_stdlib_strlen(input)
    return Int(length)
  }
}
|
|
|
|
/// Makes `Unicode.UTF16` a codec that translates between Unicode scalar
/// values and UTF-16 code units.
extension Unicode.UTF16: UnicodeCodec {
  /// Creates an instance of the UTF-16 codec.
  @inlinable
  public init() {
    self = ._swift3Buffer(ForwardParser())
  }

  /// Starts or continues decoding a UTF-16 sequence.
  ///
  /// A complete decode requires calling this method over and over until it
  /// returns `UnicodeDecodingResult.emptyInput`. An exhausted iterator is not
  /// proof that decoding is finished, because the decoder may be holding
  /// data it has already pulled from the iterator.
  ///
  /// That same buffering makes it impossible to map a returned
  /// `Unicode.Scalar`, or an error, back to a position in the iterator.
  ///
  /// The following example decodes the UTF-16 code units of a string into an
  /// array of `Unicode.Scalar` instances. It is for demonstration only; when
  /// you need the Unicode scalars of a string, use the string's
  /// `unicodeScalars` view.
  ///
  ///     let str = "✨Unicode✨"
  ///     print(Array(str.utf16))
  ///     // Prints "[10024, 85, 110, 105, 99, 111, 100, 101, 10024]"
  ///
  ///     var codeUnitIterator = str.utf16.makeIterator()
  ///     var scalars: [Unicode.Scalar] = []
  ///     var utf16Decoder = UTF16()
  ///     Decode: while true {
  ///         switch utf16Decoder.decode(&codeUnitIterator) {
  ///         case .scalarValue(let v): scalars.append(v)
  ///         case .emptyInput: break Decode
  ///         case .error:
  ///             print("Decoding error")
  ///             break Decode
  ///         }
  ///     }
  ///     print(scalars)
  ///     // Prints "["\u{2728}", "U", "n", "i", "c", "o", "d", "e", "\u{2728}"]"
  ///
  /// - Parameter input: An iterator of code units to be decoded. Pass the
  ///   same iterator instance on every call to this method, and do not
  ///   advance that iterator, or any copy of it, anywhere else.
  /// - Returns: A `UnicodeDecodingResult` instance that holds the next
  ///   Unicode scalar, reports an error, or reports that the UTF sequence
  ///   has been fully decoded.
  @inlinable
  public mutating func decode<I: IteratorProtocol>(
    _ input: inout I
  ) -> UnicodeDecodingResult where I.Element == CodeUnit {
    guard case ._swift3Buffer(var bufferedParser) = self else {
      Builtin.unreachable()
    }

    let outcome = bufferedParser.parseScalar(from: &input)
    // Put the (possibly mutated) parser back into `self` so the next call
    // continues from the same parser state.
    self = ._swift3Buffer(bufferedParser)

    switch outcome {
    case .emptyInput:
      return .emptyInput
    case .error:
      return .error
    case .valid(let encodedScalar):
      return .scalarValue(UTF16.decode(encodedScalar))
    }
  }

  /// Tries to decode one Unicode scalar and reports, alongside the decoding
  /// result, the number of code units that scalar spanned in the input.
  ///
  /// The reported count is `UTF16.width` of the scalar on success, 0 for
  /// empty input, and 1 for an error. This function may consume more code
  /// units from `input` than the scalar itself required.
  @inlinable
  internal mutating func _decodeOne<I: IteratorProtocol>(
    _ input: inout I
  ) -> (UnicodeDecodingResult, Int) where I.Element == CodeUnit {
    let decoded = decode(&input)

    let spannedCodeUnits: Int
    switch decoded {
    case .scalarValue(let scalar):
      spannedCodeUnits = UTF16.width(scalar)
    case .emptyInput:
      spannedCodeUnits = 0
    case .error:
      spannedCodeUnits = 1
    }
    return (decoded, spannedCodeUnits)
  }

  /// Encodes a Unicode scalar as a series of code units, handing each code
  /// unit to the given closure.
  ///
  /// For example, the musical fermata symbol ("𝄐") is one Unicode scalar
  /// value (`\u{1D110}`) whose UTF-16 representation takes two code units.
  /// The following code encodes a fermata in UTF-16:
  ///
  ///     var codeUnits: [UTF16.CodeUnit] = []
  ///     UTF16.encode("𝄐", into: { codeUnits.append($0) })
  ///     print(codeUnits)
  ///     // Prints "[55348, 56592]"
  ///
  /// - Parameters:
  ///   - input: The Unicode scalar value to encode.
  ///   - processCodeUnit: A closure that is called with one code unit at a
  ///     time.
  @inlinable
  public static func encode(
    _ input: Unicode.Scalar,
    into processCodeUnit: (CodeUnit) -> Void
  ) {
    // The low 16 bits of the storage are always emitted; the high 16 bits
    // are emitted only when they are nonzero.
    let packedUnits = encode(input)!._storage
    processCodeUnit(UInt16(truncatingIfNeeded: packedUnits))

    let upperUnit = packedUnits &>> 16
    guard _slowPath(upperUnit != 0) else { return }
    processCodeUnit(UInt16(truncatingIfNeeded: upperUnit))
  }
}
|
|
|
|
/// Makes `Unicode.UTF32` a codec that translates between Unicode scalar
/// values and UTF-32 code units.
extension Unicode.UTF32: UnicodeCodec {
  /// Creates an instance of the UTF-32 codec.
  @inlinable
  public init() {
    self = ._swift3Codec
  }

  /// Starts or continues decoding a UTF-32 sequence.
  ///
  /// As with every `UnicodeCodec`, decode a code unit sequence completely by
  /// calling this method over and over until it returns
  /// `UnicodeDecodingResult.emptyInput`, rather than by checking whether the
  /// iterator is exhausted. The protocol allows a decoder to hold data it has
  /// already pulled from the iterator, which also means a returned
  /// `Unicode.Scalar` or error cannot be mapped back to a position in the
  /// iterator.
  ///
  /// The following example decodes UTF-32 code units into an array of
  /// `Unicode.Scalar` instances. It is for demonstration only; when you need
  /// the Unicode scalars of a string, use the string's `unicodeScalars`
  /// view.
  ///
  ///     // UTF-32 representation of "✨Unicode✨"
  ///     let codeUnits: [UTF32.CodeUnit] =
  ///             [10024, 85, 110, 105, 99, 111, 100, 101, 10024]
  ///
  ///     var codeUnitIterator = codeUnits.makeIterator()
  ///     var scalars: [Unicode.Scalar] = []
  ///     var utf32Decoder = UTF32()
  ///     Decode: while true {
  ///         switch utf32Decoder.decode(&codeUnitIterator) {
  ///         case .scalarValue(let v): scalars.append(v)
  ///         case .emptyInput: break Decode
  ///         case .error:
  ///             print("Decoding error")
  ///             break Decode
  ///         }
  ///     }
  ///     print(scalars)
  ///     // Prints "["\u{2728}", "U", "n", "i", "c", "o", "d", "e", "\u{2728}"]"
  ///
  /// - Parameter input: An iterator of code units to be decoded. Pass the
  ///   same iterator instance on every call to this method, and do not
  ///   advance that iterator, or any copy of it, anywhere else.
  /// - Returns: A `UnicodeDecodingResult` instance that holds the next
  ///   Unicode scalar, reports an error, or reports that the UTF sequence
  ///   has been fully decoded.
  @inlinable
  public mutating func decode<I: IteratorProtocol>(
    _ input: inout I
  ) -> UnicodeDecodingResult where I.Element == CodeUnit {
    // A fresh parser is created for every call; nothing is stored in `self`.
    var freshParser = ForwardParser()
    let outcome = freshParser.parseScalar(from: &input)

    switch outcome {
    case .emptyInput:
      return .emptyInput
    case .error:
      return .error
    case .valid(let encodedScalar):
      return .scalarValue(UTF32.decode(encodedScalar))
    }
  }

  /// Encodes a Unicode scalar as a UTF-32 code unit by passing it to the
  /// given closure.
  ///
  /// For example, like every Unicode scalar, the musical fermata symbol ("𝄐")
  /// fits in a single UTF-32 code unit. The following code encodes a fermata
  /// in UTF-32:
  ///
  ///     var codeUnit: UTF32.CodeUnit = 0
  ///     UTF32.encode("𝄐", into: { codeUnit = $0 })
  ///     print(codeUnit)
  ///     // Prints "119056"
  ///
  /// - Parameters:
  ///   - input: The Unicode scalar value to encode.
  ///   - processCodeUnit: A closure that is called with one code unit at a
  ///     time.
  @inlinable
  public static func encode(
    _ input: Unicode.Scalar,
    into processCodeUnit: (CodeUnit) -> Void
  ) {
    let codeUnit = UInt32(input)
    processCodeUnit(codeUnit)
  }
}
|
|
|
|
/// Translates the given input from one Unicode encoding to another, passing
/// each resulting code unit to the given closure.
///
/// The following example transcodes the UTF-8 representation of the string
/// `"Fermata 𝄐"` into UTF-32.
///
///     let fermata = "Fermata 𝄐"
///     let bytes = fermata.utf8
///     print(Array(bytes))
///     // Prints "[70, 101, 114, 109, 97, 116, 97, 32, 240, 157, 132, 144]"
///
///     var codeUnits: [UTF32.CodeUnit] = []
///     let sink = { codeUnits.append($0) }
///     transcode(bytes.makeIterator(), from: UTF8.self, to: UTF32.self,
///               stoppingOnError: false, into: sink)
///     print(codeUnits)
///     // Prints "[70, 101, 114, 109, 97, 116, 97, 32, 119056]"
///
/// The `sink` closure receives each resulting UTF-32 code unit while the
/// function iterates over its input.
///
/// A scalar that parses correctly but that `OutputEncoding.transcode` cannot
/// convert (it returns `nil`) is replaced by the output encoding's
/// replacement character. That case does not count as an error: it neither
/// changes the return value nor stops translation when `stopOnError` is
/// `true`.
///
/// - Parameters:
///   - input: An iterator of code units to be translated, encoded as
///     `inputEncoding`. If `stopOnError` is `false`, the whole iterator is
///     exhausted. Otherwise, iteration stops as soon as an encoding error
///     is detected.
///   - inputEncoding: The Unicode encoding of `input`.
///   - outputEncoding: The destination Unicode encoding.
///   - stopOnError: Pass `true` to stop translation when an encoding error is
///     detected in `input`. Otherwise, a Unicode replacement character
///     (`"\u{FFFD}"`) is inserted for each detected error.
///   - processCodeUnit: A closure that is called with one `outputEncoding`
///     code unit at a time.
/// - Returns: `true` if the translation detected encoding errors in `input`;
///   otherwise, `false`.
@inlinable
@inline(__always)
public func transcode<
  Input: IteratorProtocol,
  InputEncoding: Unicode.Encoding,
  OutputEncoding: Unicode.Encoding
>(
  _ input: Input,
  from inputEncoding: InputEncoding.Type,
  to outputEncoding: OutputEncoding.Type,
  stoppingOnError stopOnError: Bool,
  into processCodeUnit: (OutputEncoding.CodeUnit) -> Void
) -> Bool
  where InputEncoding.CodeUnit == Input.Element {
  // NB. This routine cannot be reduced to a memcpy when
  // InputEncoding == OutputEncoding, because a memcpy would not substitute
  // U+FFFD replacement characters for ill-formed sequences.

  var remainingInput = input
  var parser = InputEncoding.ForwardParser()
  var sawError = false

  while true {
    switch parser.parseScalar(from: &remainingInput) {
    case .emptyInput:
      return sawError

    case .error:
      if _slowPath(stopOnError) { return true }
      sawError = true

    case .valid(let sourceScalar):
      let converted = OutputEncoding.transcode(sourceScalar, from: inputEncoding)
      if _fastPath(converted != nil), let targetScalar = converted {
        for codeUnit in targetScalar {
          processCodeUnit(codeUnit)
        }
        continue
      }
      // Conversion failed: leave the switch and emit a replacement
      // character below, without recording an error.
    }

    // Reached after an input error (when not stopping) or a failed
    // conversion.
    for codeUnit in OutputEncoding.encodedReplacementCharacter {
      processCodeUnit(codeUnit)
    }
  }
}
|
|
|
|
/// A code unit type whose instances are used in the internal representation
/// of `String`.
///
/// Conforming types supply conversions to and from a UTF-16 code unit.
public // @testable
protocol _StringElement {
  /// Converts an instance of the conforming type to a UTF-16 code unit.
  static func _toUTF16CodeUnit(_: Self) -> UTF16.CodeUnit

  /// Converts a UTF-16 code unit to an instance of the conforming type.
  static func _fromUTF16CodeUnit(_ utf16: UTF16.CodeUnit) -> Self
}
|
|
|
|
/// `_StringElement` conformance for UTF-16 code units; both conversions are
/// the identity.
extension UTF16.CodeUnit: _StringElement {
  /// Returns `codeUnit` unchanged, since it already is a UTF-16 code unit.
  @inlinable
  public // @testable
  static func _toUTF16CodeUnit(
    _ codeUnit: UTF16.CodeUnit
  ) -> UTF16.CodeUnit {
    return codeUnit
  }

  /// Returns `utf16` unchanged, since the target type is a UTF-16 code unit.
  @inlinable
  public // @testable
  static func _fromUTF16CodeUnit(_ utf16: UTF16.CodeUnit) -> UTF16.CodeUnit {
    return utf16
  }
}
|
|
|
|
/// `_StringElement` conformance for UTF-8 code units.
///
/// Both conversions carry an internal invariant that the value is ASCII
/// (at most `0x7f`).
extension UTF8.CodeUnit: _StringElement {
  /// Widens an ASCII UTF-8 code unit to the UTF-16 code unit with the same
  /// numeric value.
  @inlinable
  public // @testable
  static func _toUTF16CodeUnit(_ asciiUnit: UTF8.CodeUnit) -> UTF16.CodeUnit {
    _internalInvariant(
      asciiUnit < 0x80, "should only be doing this with ASCII")
    return UTF16.CodeUnit(truncatingIfNeeded: asciiUnit)
  }

  /// Narrows an ASCII UTF-16 code unit to the UTF-8 code unit with the same
  /// numeric value.
  @inlinable
  public // @testable
  static func _fromUTF16CodeUnit(_ utf16: UTF16.CodeUnit) -> UTF8.CodeUnit {
    _internalInvariant(
      utf16 < 0x80, "should only be doing this with ASCII")
    return UTF8.CodeUnit(truncatingIfNeeded: utf16)
  }
}
|
|
|
|
// An unchecked initializer for hot code paths where the value is already
// known to be a valid Unicode scalar, so that no precondition branches are
// needed.
extension Unicode.Scalar {
  /// Creates an instance whose numeric value is `value`, skipping the
  /// regular precondition checks for code point validity.
  ///
  /// Validity is only expressed through internal invariants: `value` must
  /// lie outside the surrogate range `0xD800...0xDFFF` and must not exceed
  /// `0x10FFFF`.
  @inlinable
  internal init(_unchecked value: UInt32) {
    _internalInvariant(
      !(value >= 0xD800 && value <= 0xDFFF),
      "high- and low-surrogate code points are not valid Unicode scalar values")
    _internalInvariant(
      value <= 0x10FFFF,
      "value is outside of Unicode codespace")

    self._value = value
  }
}
|
|
|
|
extension UnicodeCodec {
  /// Default implementation: scans forward from `input` and returns the
  /// offset of the first code unit that is equal to 0.
  ///
  /// The scan has no upper bound of its own; it only stops at a zero code
  /// unit.
  @inlinable
  public static func _nullCodeUnitOffset(
    in input: UnsafePointer<CodeUnit>
  ) -> Int {
    var offset = 0
    while true {
      if input[offset] == 0 {
        return offset
      }
      offset += 1
    }
  }
}
|
|
|
|
/// Legacy spelling of `transcode`, kept only as an unavailable declaration
/// whose message points to
/// `transcode(_:from:to:stoppingOnError:into:)`.
@available(*, unavailable, message: "use 'transcode(_:from:to:stoppingOnError:into:)'")
public func transcode<Input, InputEncoding, OutputEncoding>(
  _ inputEncoding: InputEncoding.Type,
  _ outputEncoding: OutputEncoding.Type,
  _ input: Input,
  _ output: (OutputEncoding.CodeUnit) -> Void,
  stopOnError: Bool
) -> Bool
  where
  Input: IteratorProtocol,
  InputEncoding: UnicodeCodec,
  OutputEncoding: UnicodeCodec,
  InputEncoding.CodeUnit == Input.Element
{
  // The declaration is unavailable, so this body is not meant to run.
  Builtin.unreachable()
}
|
|
|
|
/// A namespace for Unicode utilities.
///
/// The enumeration has no cases; it exists to group related declarations
/// such as `Unicode.Scalar` and `Unicode.UTF8`.
@frozen
public enum Unicode {
}
|
|
|