//===----------------------------------------------------------------------===// // // This source file is part of the Swift.org open source project // // Copyright (c) 2014 - 2016 Apple Inc. and the Swift project authors // Licensed under Apache License v2.0 with Runtime Library Exception // // See http://swift.org/LICENSE.txt for license information // See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors // //===----------------------------------------------------------------------===// import SwiftShims // Conversions between different Unicode encodings. Note that UTF-16 and // UTF-32 decoding are *not* currently resilient to erroneous data. /// The result of one Unicode decoding step. /// /// Each `UnicodeDecodingResult` instance can represent a Unicode scalar value, /// an indication that no more Unicode scalars are available, or an indication /// of a decoding error. /// /// - SeeAlso: `UnicodeCodec.decode(next:)` public enum UnicodeDecodingResult : Equatable { /// A decoded Unicode scalar value. case scalarValue(UnicodeScalar) /// An indication that no more Unicode scalars are available in the input. case emptyInput /// An indication of a decoding error. case error public static func == ( lhs: UnicodeDecodingResult, rhs: UnicodeDecodingResult ) -> Bool { switch (lhs, rhs) { case (.scalarValue(let lhsScalar), .scalarValue(let rhsScalar)): return lhsScalar == rhsScalar case (.emptyInput, .emptyInput): return true case (.error, .error): return true default: return false } } } /// A Unicode encoding form that translates between Unicode scalar values and /// form-specific code units. /// /// The `UnicodeCodec` protocol declares methods that decode code unit /// sequences into Unicode scalar values and encode Unicode scalar values /// into code unit sequences. The standard library implements codecs for the /// UTF-8, UTF-16, and UTF-32 encoding schemes as the `UTF8`, `UTF16`, and /// `UTF32` types, respectively. Use the `UnicodeScalar` type to work with /// decoded Unicode scalar values. /// /// - SeeAlso: `UTF8`, `UTF16`, `UTF32`, `UnicodeScalar` public protocol UnicodeCodec { /// A type that can hold code unit values for this encoding. associatedtype CodeUnit /// Creates an instance of the codec. init() /// Starts or continues decoding a code unit sequence into Unicode scalar /// values. /// /// To decode a code unit sequence completely, call this method repeatedly /// until it returns `UnicodeDecodingResult.emptyInput`. Checking that the /// iterator was exhausted is not sufficient, because the decoder can store /// buffered data from the input iterator. /// /// Because of buffering, it is impossible to find the corresponding position /// in the iterator for a given returned `UnicodeScalar` or an error. /// /// The following example decodes the UTF-8 encoded bytes of a string into an /// array of `UnicodeScalar` instances: /// /// let str = "✨Unicode✨" /// print(Array(str.utf8)) /// // Prints "[226, 156, 168, 85, 110, 105, 99, 111, 100, 101, 226, 156, 168]" /// /// var bytesIterator = str.utf8.makeIterator() /// var scalars: [UnicodeScalar] = [] /// var utf8Decoder = UTF8() /// Decode: while true { /// switch utf8Decoder.decode(&bytesIterator) { /// case .scalarValue(let v): scalars.append(v) /// case .emptyInput: break Decode /// case .error: /// print("Decoding error") /// break Decode /// } /// } /// print(scalars) /// // Prints "["\u{2728}", "U", "n", "i", "c", "o", "d", "e", "\u{2728}"]" /// /// - Parameter input: An iterator of code units to be decoded. `input` must be /// the same iterator instance in repeated calls to this method. Do not /// advance the iterator or any copies of the iterator outside this /// method. /// - Returns: A `UnicodeDecodingResult` instance, representing the next /// Unicode scalar, an indication of an error, or an indication that the /// UTF sequence has been fully decoded. mutating func decode( _ input: inout I ) -> UnicodeDecodingResult where I.Element == CodeUnit /// Encodes a Unicode scalar as a series of code units by calling the given /// closure on each code unit. /// /// For example, the musical fermata symbol ("𝄐") is a single Unicode scalar /// value (`\u{1D110}`) but requires four code units for its UTF-8 /// representation. The following code uses the `UTF8` codec to encode a /// fermata in UTF-8: /// /// var bytes: [UTF8.CodeUnit] = [] /// UTF8.encode("𝄐", into: { bytes.append($0) }) /// print(bytes) /// // Prints "[240, 157, 132, 144]" /// /// - Parameters: /// - input: The Unicode scalar value to encode. /// - processCodeUnit: A closure that processes one code unit argument at a /// time. static func encode( _ input: UnicodeScalar, into processCodeUnit: @noescape (CodeUnit) -> Void ) /// Searches for the first occurrence of a `CodeUnit` that is equal to 0. /// /// Is an equivalent of `strlen` for C-strings. /// - Complexity: O(n) static func _nullCodeUnitOffset(in input: UnsafePointer) -> Int } /// A codec for translating between Unicode scalar values and UTF-8 code /// units. public struct UTF8 : UnicodeCodec { // See Unicode 8.0.0, Ch 3.9, UTF-8. // http://www.unicode.org/versions/Unicode8.0.0/ch03.pdf /// A type that can hold code unit values for this encoding. public typealias CodeUnit = UInt8 /// Creates an instance of the UTF-8 codec. public init() {} /// Lookahead buffer used for UTF-8 decoding. New bytes are inserted at MSB, /// and bytes are read at LSB. Note that we need to use a buffer, because /// in case of invalid subsequences we sometimes don't know whether we should /// consume a certain byte before looking at it. internal var _decodeBuffer: UInt32 = 0 /// The number of bits in `_decodeBuffer` that are current filled. internal var _bitsInBuffer: UInt8 = 0 /// Starts or continues decoding a UTF-8 sequence. /// /// To decode a code unit sequence completely, call this method repeatedly /// until it returns `UnicodeDecodingResult.emptyInput`. Checking that the /// iterator was exhausted is not sufficient, because the decoder can store /// buffered data from the input iterator. /// /// Because of buffering, it is impossible to find the corresponding position /// in the iterator for a given returned `UnicodeScalar` or an error. /// /// The following example decodes the UTF-8 encoded bytes of a string into an /// array of `UnicodeScalar` instances. This is a demonstration only---if /// you need the Unicode scalar representation of a string, use its /// `unicodeScalars` view. /// /// let str = "✨Unicode✨" /// print(Array(str.utf8)) /// // Prints "[226, 156, 168, 85, 110, 105, 99, 111, 100, 101, 226, 156, 168]" /// /// var bytesIterator = str.utf8.makeIterator() /// var scalars: [UnicodeScalar] = [] /// var utf8Decoder = UTF8() /// Decode: while true { /// switch utf8Decoder.decode(&bytesIterator) { /// case .scalarValue(let v): scalars.append(v) /// case .emptyInput: break Decode /// case .error: /// print("Decoding error") /// break Decode /// } /// } /// print(scalars) /// // Prints "["\u{2728}", "U", "n", "i", "c", "o", "d", "e", "\u{2728}"]" /// /// - Parameter input: An iterator of code units to be decoded. `input` must be /// the same iterator instance in repeated calls to this method. Do not /// advance the iterator or any copies of the iterator outside this /// method. /// - Returns: A `UnicodeDecodingResult` instance, representing the next /// Unicode scalar, an indication of an error, or an indication that the /// UTF sequence has been fully decoded. public mutating func decode( _ input: inout I ) -> UnicodeDecodingResult where I.Element == CodeUnit { // Bufferless ASCII fastpath. if _fastPath(_bitsInBuffer == 0) { guard let codeUnit = input.next() else { return .emptyInput } // ASCII, return immediately. if codeUnit & 0x80 == 0 { return .scalarValue(UnicodeScalar(_unchecked: UInt32(codeUnit))) } // Non-ASCII, proceed to buffering mode. _decodeBuffer = UInt32(codeUnit) _bitsInBuffer = 8 } else if (_decodeBuffer & 0x80 == 0) { // ASCII in buffer. We don't refill the buffer so we can return // to bufferless mode once we've exhausted it. let codeUnit = _decodeBuffer & 0xff _decodeBuffer >>= 8 _bitsInBuffer = _bitsInBuffer &- 8 return .scalarValue(UnicodeScalar(_unchecked: codeUnit)) } // Buffering mode. // Fill buffer back to 4 bytes (or as many as are left in the iterator). _sanityCheck(_bitsInBuffer < 32) repeat { if let codeUnit = input.next() { // We know _bitsInBuffer < 32 so we use `& 0x1f` (31) to make the // compiler omit a bounds check branch for the bitshift. _decodeBuffer |= (UInt32(codeUnit) << UInt32(_bitsInBuffer & 0x1f)) _bitsInBuffer = _bitsInBuffer &+ 8 } else { if _bitsInBuffer == 0 { return .emptyInput } break // We still have some bytes left in our buffer. } } while _bitsInBuffer < 32 // Decode one unicode scalar. // Note our empty bytes are always 0x00, which is required for this call. let (result, length) = UTF8._decodeOne(_decodeBuffer) // Consume the decoded bytes (or maximal subpart of ill-formed sequence). let bitsConsumed = 8 &* length _sanityCheck(1...4 ~= length && bitsConsumed <= _bitsInBuffer) // Swift doesn't allow shifts greater than or equal to the type width. // _decodeBuffer >>= UInt32(bitsConsumed) // >>= 32 crashes. // Mask with 0x3f (63) to let the compiler omit the '>= 64' bounds check. _decodeBuffer = UInt32(truncatingBitPattern: UInt64(_decodeBuffer) >> (UInt64(bitsConsumed) & 0x3f)) _bitsInBuffer = _bitsInBuffer &- bitsConsumed guard _fastPath(result != nil) else { return .error } return .scalarValue(UnicodeScalar(_unchecked: result!)) } /// Attempts to decode a single UTF-8 code unit sequence starting at the LSB /// of `buffer`. /// /// - Returns: /// - result: The decoded code point if the code unit sequence is /// well-formed; `nil` otherwise. /// - length: The length of the code unit sequence in bytes if it is /// well-formed; otherwise the *maximal subpart of the ill-formed /// sequence* (Unicode 8.0.0, Ch 3.9, D93b), i.e. the number of leading /// code units that were valid or 1 in case none were valid. Unicode /// recommends to skip these bytes and replace them by a single /// replacement character (U+FFFD). /// /// - Requires: There is at least one used byte in `buffer`, and the unused /// space in `buffer` is filled with some value not matching the UTF-8 /// continuation byte form (`0b10xxxxxx`). public // @testable static func _decodeOne(_ buffer: UInt32) -> (result: UInt32?, length: UInt8) { // Note the buffer is read least significant byte first: [ #3 #2 #1 #0 ]. if buffer & 0x80 == 0 { // 1-byte sequence (ASCII), buffer: [ … … … CU0 ]. let value = buffer & 0xff return (value, 1) } // Determine sequence length using high 5 bits of 1st byte. We use a // look-up table to branch less. 1-byte sequences are handled above. // // case | pattern | description // ---------------------------- // 00 | 110xx | 2-byte sequence // 01 | 1110x | 3-byte sequence // 10 | 11110 | 4-byte sequence // 11 | other | invalid // // 11xxx 10xxx 01xxx 00xxx let lut0: UInt32 = 0b1011_0000__1111_1111__1111_1111__1111_1111 let lut1: UInt32 = 0b1100_0000__1111_1111__1111_1111__1111_1111 let index = (buffer >> 3) & 0x1f let bit0 = (lut0 >> index) & 1 let bit1 = (lut1 >> index) & 1 switch (bit1, bit0) { case (0, 0): // 2-byte sequence, buffer: [ … … CU1 CU0 ]. // Require 10xx xxxx 110x xxxx. if _slowPath(buffer & 0xc0e0 != 0x80c0) { return (nil, 1) } // Disallow xxxx xxxx xxx0 000x (<= 7 bits case). if _slowPath(buffer & 0x001e == 0x0000) { return (nil, 1) } // Extract data bits. let value = (buffer & 0x3f00) >> 8 | (buffer & 0x001f) << 6 return (value, 2) case (0, 1): // 3-byte sequence, buffer: [ … CU2 CU1 CU0 ]. // Disallow xxxx xxxx xx0x xxxx xxxx 0000 (<= 11 bits case). if _slowPath(buffer & 0x00200f == 0x000000) { return (nil, 1) } // Disallow xxxx xxxx xx1x xxxx xxxx 1101 (surrogate code points). if _slowPath(buffer & 0x00200f == 0x00200d) { return (nil, 1) } // Require 10xx xxxx 10xx xxxx 1110 xxxx. if _slowPath(buffer & 0xc0c0f0 != 0x8080e0) { if buffer & 0x00c000 != 0x008000 { return (nil, 1) } return (nil, 2) // All checks on CU0 & CU1 passed. } // Extract data bits. let value = (buffer & 0x3f0000) >> 16 | (buffer & 0x003f00) >> 2 | (buffer & 0x00000f) << 12 return (value, 3) case (1, 0): // 4-byte sequence, buffer: [ CU3 CU2 CU1 CU0 ]. // Disallow xxxx xxxx xxxx xxxx xx00 xxxx xxxx x000 (<= 16 bits case). if _slowPath(buffer & 0x00003007 == 0x00000000) { return (nil, 1) } // If xxxx xxxx xxxx xxxx xxxx xxxx xxxx x1xx. if buffer & 0x00000004 == 0x00000004 { // Require xxxx xxxx xxxx xxxx xx00 xxxx xxxx xx00 (<= 0x10FFFF). if _slowPath(buffer & 0x00003003 != 0x00000000) { return (nil, 1) } } // Require 10xx xxxx 10xx xxxx 10xx xxxx 1111 0xxx. if _slowPath(buffer & 0xc0c0c0f8 != 0x808080f0) { if buffer & 0x0000c000 != 0x00008000 { return (nil, 1) } // All other checks on CU0, CU1 & CU2 passed. if buffer & 0x00c00000 != 0x00800000 { return (nil, 2) } return (nil, 3) } // Extract data bits. // FIXME(integers): remove extra type casts let value = (buffer & 0x3f000000) >> (24 as UInt32) | (buffer & 0x003f0000) >> (10 as UInt32) | (buffer & 0x00003f00) << (4 as UInt32) | (buffer & 0x00000007) << (18 as UInt32) return (value, 4) default: // Invalid sequence (CU0 invalid). return (nil, 1) } } /// Encodes a Unicode scalar as a series of code units by calling the given /// closure on each code unit. /// /// For example, the musical fermata symbol ("𝄐") is a single Unicode scalar /// value (`\u{1D110}`) but requires four code units for its UTF-8 /// representation. The following code encodes a fermata in UTF-8: /// /// var bytes: [UTF8.CodeUnit] = [] /// UTF8.encode("𝄐", into: { bytes.append($0) }) /// print(bytes) /// // Prints "[240, 157, 132, 144]" /// /// - Parameters: /// - input: The Unicode scalar value to encode. /// - processCodeUnit: A closure that processes one code unit argument at a /// time. public static func encode( _ input: UnicodeScalar, into processCodeUnit: @noescape (CodeUnit) -> Void ) { var c = UInt32(input) var buf3 = UInt8(c & 0xFF) if c >= UInt32(1<<7) { c >>= 6 buf3 = (buf3 & 0x3F) | 0x80 // 10xxxxxx var buf2 = UInt8(c & 0xFF) if c < UInt32(1<<5) { buf2 |= 0xC0 // 110xxxxx } else { c >>= 6 buf2 = (buf2 & 0x3F) | 0x80 // 10xxxxxx var buf1 = UInt8(c & 0xFF) if c < UInt32(1<<4) { buf1 |= 0xE0 // 1110xxxx } else { c >>= 6 buf1 = (buf1 & 0x3F) | 0x80 // 10xxxxxx processCodeUnit(UInt8(c | 0xF0)) // 11110xxx } processCodeUnit(buf1) } processCodeUnit(buf2) } processCodeUnit(buf3) } /// Returns a Boolean value indicating whether the specified code unit is a /// UTF-8 continuation byte. /// /// Continuation bytes take the form `0b10xxxxxx`. For example, a lowercase /// "e" with an acute accent above it (`"Γ©"`) uses 2 bytes for its UTF-8 /// representation: `0b11000011` (195) and `0b10101001` (169). The second /// byte is a continuation byte. /// /// let eAcute = "Γ©" /// for codePoint in eAcute.utf8 { /// print(codePoint, UTF8.isContinuation(codePoint)) /// } /// // Prints "195 false" /// // Prints "169 true" /// /// - Parameter byte: A UTF-8 code unit. /// - Returns: `true` if `byte` is a continuation byte; otherwise, `false`. public static func isContinuation(_ byte: CodeUnit) -> Bool { return byte & 0b11_00__0000 == 0b10_00__0000 } public static func _nullCodeUnitOffset(in input: UnsafePointer) -> Int { // Relying on a permissive memory model in C. let cstr = unsafeBitCast(input, to: UnsafePointer.self) return Int(_swift_stdlib_strlen(cstr)) } // Support parsing C strings as-if they are UTF8 strings. public static func _nullCodeUnitOffset(in input: UnsafePointer) -> Int { return Int(_swift_stdlib_strlen(input)) } } /// A codec for translating between Unicode scalar values and UTF-16 code /// units. public struct UTF16 : UnicodeCodec { /// A type that can hold code unit values for this encoding. public typealias CodeUnit = UInt16 /// Creates an instance of the UTF-16 codec. public init() {} /// A lookahead buffer for one UTF-16 code unit. internal var _decodeLookahead: UInt16? /// Starts or continues decoding a UTF-16 sequence. /// /// To decode a code unit sequence completely, call this method repeatedly /// until it returns `UnicodeDecodingResult.emptyInput`. Checking that the /// iterator was exhausted is not sufficient, because the decoder can store /// buffered data from the input iterator. /// /// Because of buffering, it is impossible to find the corresponding position /// in the iterator for a given returned `UnicodeScalar` or an error. /// /// The following example decodes the UTF-16 encoded bytes of a string into an /// array of `UnicodeScalar` instances. This is a demonstration only---if /// you need the Unicode scalar representation of a string, use its /// `unicodeScalars` view. /// /// let str = "✨Unicode✨" /// print(Array(str.utf16)) /// // Prints "[10024, 85, 110, 105, 99, 111, 100, 101, 10024]" /// /// var codeUnitIterator = str.utf16.makeIterator() /// var scalars: [UnicodeScalar] = [] /// var utf16Decoder = UTF16() /// Decode: while true { /// switch utf16Decoder.decode(&codeUnitIterator) { /// case .scalarValue(let v): scalars.append(v) /// case .emptyInput: break Decode /// case .error: /// print("Decoding error") /// break Decode /// } /// } /// print(scalars) /// // Prints "["\u{2728}", "U", "n", "i", "c", "o", "d", "e", "\u{2728}"]" /// /// - Parameter input: An iterator of code units to be decoded. `input` must be /// the same iterator instance in repeated calls to this method. Do not /// advance the iterator or any copies of the iterator outside this /// method. /// - Returns: A `UnicodeDecodingResult` instance, representing the next /// Unicode scalar, an indication of an error, or an indication that the /// UTF sequence has been fully decoded. public mutating func decode( _ input: inout I ) -> UnicodeDecodingResult where I.Element == CodeUnit { // Note: maximal subpart of ill-formed sequence for UTF-16 can only have // length 1. Length 0 does not make sense. Neither does length 2 -- in // that case the sequence is valid. let unit0: UInt16 if _fastPath(_decodeLookahead == nil) { guard let next = input.next() else { return .emptyInput } unit0 = next } else { // Consume lookahead first. unit0 = _decodeLookahead! _decodeLookahead = nil } // A well-formed pair of surrogates looks like this: // high-surrogate low-surrogate // [1101 10xx xxxx xxxx] [1101 11xx xxxx xxxx] // Common case first, non-surrogate -- just a sequence of 1 code unit. if _fastPath((unit0 >> 11) != 0b1101_1) { return .scalarValue(UnicodeScalar(_unchecked: UInt32(unit0))) } // Ensure `unit0` is a high-surrogate. guard _fastPath((unit0 >> 10) == 0b1101_10) else { return .error } // We already have a high-surrogate, so there should be a next code unit. guard let unit1 = input.next() else { return .error } // `unit0` is a high-surrogate, so `unit1` should be a low-surrogate. guard _fastPath((unit1 >> 10) == 0b1101_11) else { // Invalid sequence, discard `unit0` and store `unit1` for the next call. _decodeLookahead = unit1 return .error } // We have a well-formed surrogate pair, decode it. let result = 0x10000 + ((UInt32(unit0 & 0x03ff) << 10) | UInt32(unit1 & 0x03ff)) return .scalarValue(UnicodeScalar(_unchecked: result)) } /// Try to decode one Unicode scalar, and return the actual number of code /// units it spanned in the input. This function may consume more code /// units than required for this scalar. @_versioned internal mutating func _decodeOne( _ input: inout I ) -> (UnicodeDecodingResult, Int) where I.Element == CodeUnit { let result = decode(&input) switch result { case .scalarValue(let us): return (result, UTF16.width(us)) case .emptyInput: return (result, 0) case .error: return (result, 1) } } /// Encodes a Unicode scalar as a series of code units by calling the given /// closure on each code unit. /// /// For example, the musical fermata symbol ("𝄐") is a single Unicode scalar /// value (`\u{1D110}`) but requires two code units for its UTF-16 /// representation. The following code encodes a fermata in UTF-16: /// /// var codeUnits: [UTF16.CodeUnit] = [] /// UTF16.encode("𝄐", into: { codeUnits.append($0) }) /// print(codeUnits) /// // Prints "[55348, 56592]" /// /// - Parameters: /// - input: The Unicode scalar value to encode. /// - processCodeUnit: A closure that processes one code unit argument at a /// time. public static func encode( _ input: UnicodeScalar, into processCodeUnit: @noescape (CodeUnit) -> Void ) { let scalarValue: UInt32 = UInt32(input) if scalarValue <= UInt32(UInt16.max) { processCodeUnit(UInt16(scalarValue)) } else { let lead_offset = UInt32(0xd800) - UInt32(0x10000 >> 10) processCodeUnit(UInt16(lead_offset + (scalarValue >> 10))) processCodeUnit(UInt16(0xdc00 + (scalarValue & 0x3ff))) } } } /// A codec for translating between Unicode scalar values and UTF-32 code /// units. public struct UTF32 : UnicodeCodec { /// A type that can hold code unit values for this encoding. public typealias CodeUnit = UInt32 /// Creates an instance of the UTF-32 codec. public init() {} /// Starts or continues decoding a UTF-32 sequence. /// /// To decode a code unit sequence completely, call this method repeatedly /// until it returns `UnicodeDecodingResult.emptyInput`. Checking that the /// iterator was exhausted is not sufficient, because the decoder can store /// buffered data from the input iterator. /// /// Because of buffering, it is impossible to find the corresponding position /// in the iterator for a given returned `UnicodeScalar` or an error. /// /// The following example decodes the UTF-16 encoded bytes of a string /// into an array of `UnicodeScalar` instances. This is a demonstration /// only---if you need the Unicode scalar representation of a string, use /// its `unicodeScalars` view. /// /// // UTF-32 representation of "✨Unicode✨" /// let codeUnits: [UTF32.CodeUnit] = /// [10024, 85, 110, 105, 99, 111, 100, 101, 10024] /// /// var codeUnitIterator = codeUnits.makeIterator() /// var scalars: [UnicodeScalar] = [] /// var utf32Decoder = UTF32() /// Decode: while true { /// switch utf32Decoder.decode(&codeUnitIterator) { /// case .scalarValue(let v): scalars.append(v) /// case .emptyInput: break Decode /// case .error: /// print("Decoding error") /// break Decode /// } /// } /// print(scalars) /// // Prints "["\u{2728}", "U", "n", "i", "c", "o", "d", "e", "\u{2728}"]" /// /// - Parameter input: An iterator of code units to be decoded. `input` must be /// the same iterator instance in repeated calls to this method. Do not /// advance the iterator or any copies of the iterator outside this /// method. /// - Returns: A `UnicodeDecodingResult` instance, representing the next /// Unicode scalar, an indication of an error, or an indication that the /// UTF sequence has been fully decoded. public mutating func decode( _ input: inout I ) -> UnicodeDecodingResult where I.Element == CodeUnit { return UTF32._decode(&input) } internal static func _decode( _ input: inout I ) -> UnicodeDecodingResult where I.Element == CodeUnit { guard let x = input.next() else { return .emptyInput } // Check code unit is valid: not surrogate-reserved and within range. guard _fastPath((x >> 11) != 0b1101_1 && x <= 0x10ffff) else { return .error } // x is a valid scalar. return .scalarValue(UnicodeScalar(_unchecked: x)) } /// Encodes a Unicode scalar as a UTF-32 code unit by calling the given /// closure. /// /// For example, like every Unicode scalar, the musical fermata symbol ("𝄐") /// can be represented in UTF-32 as a single code unit. The following code /// encodes a fermata in UTF-32: /// /// var codeUnit: UTF32.CodeUnit = 0 /// UTF32.encode("𝄐", into: { codeUnit = $0 }) /// print(codeUnit) /// // Prints "119056" /// /// - Parameters: /// - input: The Unicode scalar value to encode. /// - processCodeUnit: A closure that processes one code unit argument at a /// time. public static func encode( _ input: UnicodeScalar, into processCodeUnit: @noescape (CodeUnit) -> Void ) { processCodeUnit(UInt32(input)) } } /// Translates the given input from one Unicode encoding to another by calling /// the given closure. /// /// The following example transcodes the UTF-8 representation of the string /// `"Fermata 𝄐"` into UTF-32. /// /// let fermata = "Fermata 𝄐" /// let bytes = fermata.utf8 /// print(Array(bytes)) /// // Prints "[70, 101, 114, 109, 97, 116, 97, 32, 240, 157, 132, 144]" /// /// var codeUnits: [UTF32.CodeUnit] = [] /// let sink = { codeUnits.append($0) } /// transcode(bytes.makeIterator(), from: UTF8.self, to: UTF32.self, /// stoppingOnError: false, into: sink) /// print(codeUnits) /// // Prints "[70, 101, 114, 109, 97, 116, 97, 32, 119056]" /// /// The `sink` closure is called with each resulting UTF-32 code unit as the /// function iterates over its input. /// /// - Parameters: /// - input: An iterator of code units to be translated, encoded as /// `inputEncoding`. If `stopOnError` is `false`, the entire iterator will /// be exhausted. Otherwise, iteration will stop if an encoding error is /// detected. /// - inputEncoding: The Unicode encoding of `input`. /// - outputEncoding: The destination Unicode encoding. /// - stopOnError: Pass `true` to stop translation when an encoding error is /// detected in `input`. Otherwise, a Unicode replacement character /// (`"\u{FFFD}"`) is inserted for each detected error. /// - processCodeUnit: A closure that processes one `outputEncoding` code /// unit at a time. /// - Returns: `true` if the translation detected encoding errors in `input`; /// otherwise, `false`. public func transcode( _ input: Input, from inputEncoding: InputEncoding.Type, to outputEncoding: OutputEncoding.Type, stoppingOnError stopOnError: Bool, into processCodeUnit: @noescape (OutputEncoding.CodeUnit) -> Void ) -> Bool where Input : IteratorProtocol, InputEncoding : UnicodeCodec, OutputEncoding : UnicodeCodec, InputEncoding.CodeUnit == Input.Element { var input = input // NB. It is not possible to optimize this routine to a memcpy if // InputEncoding == OutputEncoding. The reason is that memcpy will not // substitute U+FFFD replacement characters for ill-formed sequences. var inputDecoder = inputEncoding.init() var hadError = false loop: while true { switch inputDecoder.decode(&input) { case .scalarValue(let us): OutputEncoding.encode(us, into: processCodeUnit) case .emptyInput: break loop case .error: hadError = true if stopOnError { break loop } OutputEncoding.encode("\u{fffd}", into: processCodeUnit) } } return hadError } /// Transcode UTF-16 to UTF-8, replacing ill-formed sequences with U+FFFD. /// /// Returns the index of the first unhandled code unit and the UTF-8 data /// that was encoded. internal func _transcodeSomeUTF16AsUTF8( _ input: Input, _ startIndex: Input.Index ) -> (Input.Index, _StringCore._UTF8Chunk) where Input : Collection, Input.Iterator.Element == UInt16 { typealias _UTF8Chunk = _StringCore._UTF8Chunk let endIndex = input.endIndex let utf8Max = sizeof(_UTF8Chunk.self) var result: _UTF8Chunk = 0 var utf8Count = 0 var nextIndex = startIndex while nextIndex != input.endIndex && utf8Count != utf8Max { let u = UInt(input[nextIndex]) let shift = _UTF8Chunk(utf8Count * 8) var utf16Length: Input.IndexDistance = 1 if _fastPath(u <= 0x7f) { result |= _UTF8Chunk(u) << shift utf8Count += 1 } else { var scalarUtf8Length: Int var r: UInt if _fastPath((u >> 11) != 0b1101_1) { // Neither high-surrogate, nor low-surrogate -- well-formed sequence // of 1 code unit, decoding is trivial. if u < 0x800 { r = 0b10__00_0000__110__0_0000 r |= u >> 6 r |= (u & 0b11_1111) << 8 scalarUtf8Length = 2 } else { r = 0b10__00_0000__10__00_0000__1110__0000 r |= u >> 12 r |= ((u >> 6) & 0b11_1111) << 8 r |= (u & 0b11_1111) << 16 scalarUtf8Length = 3 } } else { let unit0 = u if _slowPath((unit0 >> 10) == 0b1101_11) { // `unit0` is a low-surrogate. We have an ill-formed sequence. // Replace it with U+FFFD. r = 0xbdbfef scalarUtf8Length = 3 } else if _slowPath(input.index(nextIndex, offsetBy: 1) == endIndex) { // We have seen a high-surrogate and EOF, so we have an ill-formed // sequence. Replace it with U+FFFD. r = 0xbdbfef scalarUtf8Length = 3 } else { let unit1 = UInt(input[input.index(nextIndex, offsetBy: 1)]) if _fastPath((unit1 >> 10) == 0b1101_11) { // `unit1` is a low-surrogate. We have a well-formed surrogate // pair. let v = 0x10000 + (((unit0 & 0x03ff) << 10) | (unit1 & 0x03ff)) r = 0b10__00_0000__10__00_0000__10__00_0000__1111_0__000 r |= v >> 18 r |= ((v >> 12) & 0b11_1111) << 8 r |= ((v >> 6) & 0b11_1111) << 16 r |= (v & 0b11_1111) << 24 scalarUtf8Length = 4 utf16Length = 2 } else { // Otherwise, we have an ill-formed sequence. Replace it with // U+FFFD. r = 0xbdbfef scalarUtf8Length = 3 } } } // Don't overrun the buffer if utf8Count + scalarUtf8Length > utf8Max { break } result |= numericCast(r) << shift utf8Count += scalarUtf8Length } nextIndex = input.index(nextIndex, offsetBy: utf16Length) } // FIXME: Annoying check, courtesy of if utf8Count < sizeofValue(result) { result |= ~0 << numericCast(utf8Count * 8) } return (nextIndex, result) } /// Instances of conforming types are used in internal `String` /// representation. public // @testable protocol _StringElement { static func _toUTF16CodeUnit(_: Self) -> UTF16.CodeUnit static func _fromUTF16CodeUnit(_ utf16: UTF16.CodeUnit) -> Self } extension UTF16.CodeUnit : _StringElement { public // @testable static func _toUTF16CodeUnit(_ x: UTF16.CodeUnit) -> UTF16.CodeUnit { return x } public // @testable static func _fromUTF16CodeUnit( _ utf16: UTF16.CodeUnit ) -> UTF16.CodeUnit { return utf16 } } extension UTF8.CodeUnit : _StringElement { public // @testable static func _toUTF16CodeUnit(_ x: UTF8.CodeUnit) -> UTF16.CodeUnit { _sanityCheck(x <= 0x7f, "should only be doing this with ASCII") return UTF16.CodeUnit(x) } public // @testable static func _fromUTF16CodeUnit( _ utf16: UTF16.CodeUnit ) -> UTF8.CodeUnit { _sanityCheck(utf16 <= 0x7f, "should only be doing this with ASCII") return UTF8.CodeUnit(utf16) } } extension UTF16 { /// Returns the number of code units required to encode the given Unicode /// scalar. /// /// Because a Unicode scalar value can require up to 21 bits to store its /// value, some Unicode scalars are represented in UTF-16 by a pair of /// 16-bit code units. The first and second code units of the pair, /// designated *leading* and *trailing* surrogates, make up a *surrogate /// pair*. /// /// let anA: UnicodeScalar = "A" /// print(anA.value) /// // Prints "65" /// print(UTF16.width(anA)) /// // Prints "1" /// /// let anApple: UnicodeScalar = "🍎" /// print(anApple.value) /// // Prints "127822" /// print(UTF16.width(anApple)) /// // Prints "2" /// /// - Parameter x: A Unicode scalar value. /// - Returns: The width of `x` when encoded in UTF-16, either `1` or `2`. public static func width(_ x: UnicodeScalar) -> Int { return x.value <= 0xFFFF ? 1 : 2 } /// Returns the high-surrogate code unit of the surrogate pair representing /// the specified Unicode scalar. /// /// Because a Unicode scalar value can require up to 21 bits to store its /// value, some Unicode scalars are represented in UTF-16 by a pair of /// 16-bit code units. The first and second code units of the pair, /// designated *leading* and *trailing* surrogates, make up a *surrogate /// pair*. /// /// let apple: UnicodeScalar = "🍎" /// print(UTF16.leadSurrogate(apple) /// // Prints "55356" /// /// - Parameter x: A Unicode scalar value. `x` must be represented by a /// surrogate pair when encoded in UTF-16. To check whether `x` is /// represented by a surrogate pair, use `UTF16.width(x) == 2`. /// - Returns: The leading surrogate code unit of `x` when encoded in UTF-16. /// /// - SeeAlso: `UTF16.width(_:)`, `UTF16.trailSurrogate(_:)` public static func leadSurrogate(_ x: UnicodeScalar) -> UTF16.CodeUnit { _precondition(width(x) == 2) return UTF16.CodeUnit((x.value - 0x1_0000) >> (10 as UInt32)) + 0xD800 } /// Returns the low-surrogate code unit of the surrogate pair representing /// the specified Unicode scalar. /// /// Because a Unicode scalar value can require up to 21 bits to store its /// value, some Unicode scalars are represented in UTF-16 by a pair of /// 16-bit code units. The first and second code units of the pair, /// designated *leading* and *trailing* surrogates, make up a *surrogate /// pair*. /// /// let apple: UnicodeScalar = "🍎" /// print(UTF16.trailSurrogate(apple) /// // Prints "57166" /// /// - Parameter x: A Unicode scalar value. `x` must be represented by a /// surrogate pair when encoded in UTF-16. To check whether `x` is /// represented by a surrogate pair, use `UTF16.width(x) == 2`. /// - Returns: The trailing surrogate code unit of `x` when encoded in UTF-16. /// /// - SeeAlso: `UTF16.width(_:)`, `UTF16.leadSurrogate(_:)` public static func trailSurrogate(_ x: UnicodeScalar) -> UTF16.CodeUnit { _precondition(width(x) == 2) return UTF16.CodeUnit( (x.value - 0x1_0000) & (((1 as UInt32) << 10) - 1) ) + 0xDC00 } /// Returns a Boolean value indicating whether the specified code unit is a /// high-surrogate code unit. /// /// Here's an example of checking whether each code unit in a string's /// `utf16` view is a lead surrogate. The `apple` string contains a single /// emoji character made up of a surrogate pair when encoded in UTF-16. /// /// let apple = "🍎" /// for unit in apple.utf16 { /// print(UTF16.isLeadSurrogate(unit)) /// } /// // Prints "true" /// // Prints "false" /// /// This method does not validate the encoding of a UTF-16 sequence beyond /// the specified code unit. Specifically, it does not validate that a /// low-surrogate code unit follows `x`. /// /// - Parameter x: A UTF-16 code unit. /// - Returns: `true` if `x` is a high-surrogate code unit; otherwise, /// `false`. /// /// - SeeAlso: `UTF16.width(_:)`, `UTF16.leadSurrogate(_:)` public static func isLeadSurrogate(_ x: CodeUnit) -> Bool { return 0xD800...0xDBFF ~= x } /// Returns a Boolean value indicating whether the specified code unit is a /// low-surrogate code unit. /// /// Here's an example of checking whether each code unit in a string's /// `utf16` view is a trailing surrogate. The `apple` string contains a /// single emoji character made up of a surrogate pair when encoded in /// UTF-16. /// /// let apple = "🍎" /// for unit in apple.utf16 { /// print(UTF16.isTrailSurrogate(unit)) /// } /// // Prints "false" /// // Prints "true" /// /// This method does not validate the encoding of a UTF-16 sequence beyond /// the specified code unit. Specifically, it does not validate that a /// high-surrogate code unit precedes `x`. /// /// - Parameter x: A UTF-16 code unit. /// - Returns: `true` if `x` is a low-surrogate code unit; otherwise, /// `false`. /// /// - SeeAlso: `UTF16.width(_:)`, `UTF16.leadSurrogate(_:)` public static func isTrailSurrogate(_ x: CodeUnit) -> Bool { return 0xDC00...0xDFFF ~= x } public // @testable static func _copy( source: UnsafeMutablePointer, destination: UnsafeMutablePointer, count: Int ) { if strideof(T.self) == strideof(U.self) { _memcpy( dest: UnsafeMutablePointer(destination), src: UnsafeMutablePointer(source), size: UInt(count) * UInt(strideof(U.self))) } else { for i in 0..( of input: Input, decodedAs sourceEncoding: Encoding.Type, repairingIllFormedSequences: Bool ) -> (count: Int, isASCII: Bool)? where Input : IteratorProtocol, Encoding : UnicodeCodec, Encoding.CodeUnit == Input.Element { var input = input var count = 0 var isAscii = true var inputDecoder = Encoding() loop: while true { switch inputDecoder.decode(&input) { case .scalarValue(let us): if us.value > 0x7f { isAscii = false } count += width(us) case .emptyInput: break loop case .error: if !repairingIllFormedSequences { return nil } isAscii = false count += width(UnicodeScalar(0xfffd)!) } } return (count, isAscii) } } // Unchecked init to avoid precondition branches in hot code paths where we // already know the value is a valid unicode scalar. extension UnicodeScalar { /// Create an instance with numeric value `value`, bypassing the regular /// precondition checks for code point validity. internal init(_unchecked value: UInt32) { _sanityCheck(value < 0xD800 || value > 0xDFFF, "high- and low-surrogate code points are not valid Unicode scalar values") _sanityCheck(value <= 0x10FFFF, "value is outside of Unicode codespace") self._value = value } } extension UnicodeCodec where CodeUnit : UnsignedInteger { public static func _nullCodeUnitOffset(in input: UnsafePointer) -> Int { var length = 0 while input[length] != 0 { length += 1 } return length } } extension UnicodeCodec { public static func _nullCodeUnitOffset(in input: UnsafePointer) -> Int { fatalError("_nullCodeUnitOffset(in:) implementation should be provided") } } @available(*, unavailable, renamed: "UnicodeCodec") public typealias UnicodeCodecType = UnicodeCodec @available(*, unavailable, message: "use 'transcode(_:from:to:stoppingOnError:into:)'") public func transcode( _ inputEncoding: InputEncoding.Type, _ outputEncoding: OutputEncoding.Type, _ input: Input, _ output: (OutputEncoding.CodeUnit) -> Void, stopOnError: Bool ) -> Bool where Input : IteratorProtocol, InputEncoding : UnicodeCodec, OutputEncoding : UnicodeCodec, InputEncoding.CodeUnit == Input.Element { Builtin.unreachable() } extension UTF16 { @available(*, unavailable, message: "use 'transcodedLength(of:decodedAs:repairingIllFormedSequences:)'") public static func measure( _: Encoding.Type, input: Input, repairIllFormedSequences: Bool ) -> (Int, Bool)? where Encoding : UnicodeCodec, Input : IteratorProtocol, Encoding.CodeUnit == Input.Element { Builtin.unreachable() } }