//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2016 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See http://swift.org/LICENSE.txt for license information
// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

// Conversions between different Unicode encodings.  Each decoder below
// reports ill-formed input by returning `UnicodeDecodingResult.error`.

/// The result of one Unicode decoding step.
///
/// A unicode scalar value, an indication that no more unicode scalars
/// are available, or an indication of a decoding error.
public enum UnicodeDecodingResult : Equatable {
  case scalarValue(UnicodeScalar)
  case emptyInput
  case error
}

/// Returns `true` if both results are the same case and, for
/// `.scalarValue`, carry equal scalars.
public func == (
  lhs: UnicodeDecodingResult,
  rhs: UnicodeDecodingResult
) -> Bool {
  switch (lhs, rhs) {
  case (.scalarValue(let lhsScalar), .scalarValue(let rhsScalar)):
    return lhsScalar == rhsScalar
  case (.emptyInput, .emptyInput):
    return true
  case (.error, .error):
    return true
  default:
    return false
  }
}

/// A Unicode [encoding scheme](http://www.unicode.org/glossary/#character_encoding_scheme).
///
/// Consists of an underlying [code unit](http://www.unicode.org/glossary/#code_unit) and functions to
/// translate between sequences of these code units and [unicode scalar values](http://www.unicode.org/glossary/#unicode_scalar_value).
public protocol UnicodeCodec {

  /// A type that can hold [code unit](http://www.unicode.org/glossary/#code_unit) values for this
  /// encoding.
  associatedtype CodeUnit

  init()

  /// Start or continue decoding a UTF sequence.
  ///
  /// In order to decode a code unit sequence completely, this function should
  /// be called repeatedly until it returns `UnicodeDecodingResult.emptyInput`.
  /// Checking that the iterator was exhausted is not sufficient.  The decoder
  /// can have an internal buffer that is pre-filled with data from the input
  /// iterator.
  ///
  /// Because of buffering, it is impossible to find the corresponding position
  /// in the iterator for a given returned `UnicodeScalar` or an error.
  ///
  /// - parameter next: An iterator over the code units to be decoded.
  mutating func decode<
    I : IteratorProtocol where I.Element == CodeUnit
  >(next: inout I) -> UnicodeDecodingResult

  /// Encode a `UnicodeScalar` as a series of `CodeUnit`s by
  /// calling `processCodeUnit` on each `CodeUnit`.
  static func encode(
    input: UnicodeScalar,
    sendingOutputTo processCodeUnit: (CodeUnit) -> Void
  )
}

/// A codec for [UTF-8](http://www.unicode.org/glossary/#UTF_8).
public struct UTF8 : UnicodeCodec {
  /// A type that can hold [code unit](http://www.unicode.org/glossary/#code_unit) values for this
  /// encoding.
  public typealias CodeUnit = UInt8

  public init() {}

  /// Lookahead buffer used for UTF-8 decoding.  New bytes are inserted at LSB,
  /// and bytes are read at MSB.
  internal var _decodeLookahead: UInt32 = 0

  /// Flags with layout: `0bxxxx_yyyy`.
  ///
  /// `xxxx` is the EOF flag.  It means that the input iterator has signaled
  /// end of sequence.  Out of the four bits, only one bit can be set.  The bit
  /// position specifies how many bytes have been consumed from the lookahead
  /// buffer already.  A value of `1000` means that there are `yyyy` bytes in
  /// the buffer, `0100` means that there are `yyyy - 1` bytes, `0010` --
  /// `yyyy - 2`, `0001` -- `yyyy - 3`.
  ///
  /// `yyyy` specifies how many bytes are valid in the lookahead buffer.  Value
  /// is expressed in unary code.  Valid values: `1111` (4), `0111` (3),
  /// `0011` (2), `0001` (1), `0000` (0).
  ///
  /// This representation is crafted to allow one to consume a byte from a
  /// buffer with a shift, and update flags with a single-bit right shift.
  internal var _lookaheadFlags: UInt8 = 0

  /// Returns `true` if the LSB bytes in `buffer` are a well-formed UTF-8 code
  /// unit sequence.  The lowest byte is considered the first code unit.
  ///
  /// - Requires: There is at least one used byte in `buffer`, and the unused
  ///   space in `buffer` is filled with some value not matching the UTF-8
  ///   continuation byte form (`0b10xxxxxx`).
  @warn_unused_result
  public // @testable
  static func _isValidUTF8(buffer: UInt32) -> Bool {
    if _fastPath(buffer & 0x80 == 0) {
      return true // 0x00 -- 0x7f: 1-byte sequences (ASCII).
    }

    // Determine sequence length using high 5 bits of 1st byte.  We use a
    // look-up table to branch less.  1-byte sequences are handled above.
    //
    //  case | pattern | description
    // ----------------------------
    //   00  |  110xx  | 2-byte sequence
    //   01  |  1110x  | 3-byte sequence
    //   10  |  11110  | 4-byte sequence
    //   11  |  other  | invalid
    //
    // The "case" column is the pair (bit1, bit0) looked up below: `lut1`
    // supplies the high bit and `lut0` the low bit for each 5-bit index.
    //
    //                     11xxx      10xxx      01xxx      00xxx
    let lut0: UInt32 = 0b1011_0000__1111_1111__1111_1111__1111_1111
    let lut1: UInt32 = 0b1100_0000__1111_1111__1111_1111__1111_1111

    // `index` is the top five bits of the first code unit (the lowest byte).
    let index = (buffer >> 3) & 0x1f
    let bit0 = (lut0 >> index) & 1
    let bit1 = (lut1 >> index) & 1

    switch (bit1, bit0) {
    case (0, 0):  // 2-byte sequence.
      // Require 10xx xxxx  110x xxxx.
      if buffer & 0xc0e0 != 0x80c0 { return false }
      // Disallow xxxx xxxx  xxx0 000x (<= 7 bits case).
      if buffer & 0x001e == 0x0000 { return false }
      return true
    case (0, 1):  // 3-byte sequence.
      // Require 10xx xxxx  10xx xxxx  1110 xxxx.
      if buffer & 0xc0c0f0 != 0x8080e0 { return false }
      // Disallow xxxx xxxx  xx0x xxxx  xxxx 0000 (<= 11 bits case).
      if buffer & 0x00200f == 0x000000 { return false }
      // Disallow xxxx xxxx  xx1x xxxx  xxxx 1101 (surrogate code points).
      if buffer & 0x00200f == 0x00200d { return false }
      return true
    case (1, 0):  // 4-byte sequence.
      // Require 10xx xxxx  10xx xxxx  10xx xxxx  1111 0xxx.
      if buffer & 0xc0c0c0f8 != 0x808080f0 { return false }
      // Disallow xxxx xxxx  xxxx xxxx  xx00 xxxx  xxxx x000 (<= 16 bits case).
      if buffer & 0x00003007 == 0x00000000 { return false }
      // Case xxxx xxxx  xxxx xxxx  xxxx xxxx  xxxx x1xx.
      if buffer & 0x00000004 == 0x00000004 {
        // Require xxxx xxxx  xxxx xxxx  xx00 xxxx  xxxx xx00 (<= 0x10FFFF).
        if buffer & 0x00003003 != 0x00000000 { return false }
      }
      return true
    default:  // Invalid sequence.
      return false
    }
  }

  /// Given an ill-formed sequence, find the length of its maximal subpart.
  ///
  /// - parameter buffer: The code units to examine; the lowest byte is the
  ///   first code unit.
  /// - parameter validBytes: Lookahead flags in the `_lookaheadFlags` layout;
  ///   only the low four (unary byte count) bits are used.
  /// - returns: The number of code units (1, 2 or 3) in the maximal subpart.
  @inline(never)
  @warn_unused_result
  internal static func _findMaximalSubpartOfIllFormedUTF8Sequence(
    buffer: UInt32, validBytes: UInt8) -> UInt8 {
    var buffer = buffer
    var validBytes = validBytes
    // This function is '@inline(never)' because it is used only in the error
    // handling path.

    // Clear EOF flag, we don't care about it.
    validBytes &= 0b0000_1111

    _sanityCheck(validBytes != 0,
        "input buffer should not be empty")
    _sanityCheck(!UTF8._isValidUTF8(buffer),
        "input sequence should be ill-formed UTF-8")

    // Unicode 6.3.0, D93b:
    //
    //     Maximal subpart of an ill-formed subsequence: The longest code unit
    //     subsequence starting at an unconvertible offset that is either:
    //     a. the initial subsequence of a well-formed code unit sequence, or
    //     b. a subsequence of length one.

    // Perform case analysis.  See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8
    // Byte Sequences.

    // Consume the first code unit; each consumed byte drops one unary bit
    // from `validBytes`.
    let cu0 = UInt8(buffer & 0xff)
    buffer >>= 8
    validBytes >>= 1
    if (cu0 >= 0xc2 && cu0 <= 0xdf) {
      // First byte is valid, but we know that this code unit sequence is
      // invalid, so the maximal subpart has to end after the first byte.
      return 1
    }

    // Only one byte was available, so the subpart cannot be longer than that.
    if validBytes == 0 {
      return 1
    }

    let cu1 = UInt8(buffer & 0xff)
    buffer >>= 8
    validBytes >>= 1

    // 3-byte lead bytes: the subpart has length 2 only if the second byte is
    // in the range permitted after this particular lead byte.
    if (cu0 == 0xe0) {
      return (cu1 >= 0xa0 && cu1 <= 0xbf) ? 2 : 1
    }
    if (cu0 >= 0xe1 && cu0 <= 0xec) {
      return (cu1 >= 0x80 && cu1 <= 0xbf) ? 2 : 1
    }
    if (cu0 == 0xed) {
      return (cu1 >= 0x80 && cu1 <= 0x9f) ? 2 : 1
    }
    if (cu0 >= 0xee && cu0 <= 0xef) {
      return (cu1 >= 0x80 && cu1 <= 0xbf) ? 2 : 1
    }
    // 4-byte lead bytes: additionally inspect the third byte when one is
    // available.
    if (cu0 == 0xf0) {
      if (cu1 >= 0x90 && cu1 <= 0xbf) {
        if validBytes == 0 {
          return 2
        }

        let cu2 = UInt8(buffer & 0xff)
        return (cu2 >= 0x80 && cu2 <= 0xbf) ? 3 : 2
      }
      return 1
    }
    if (cu0 >= 0xf1 && cu0 <= 0xf3) {
      if (cu1 >= 0x80 && cu1 <= 0xbf) {
        if validBytes == 0 {
          return 2
        }

        let cu2 = UInt8(buffer & 0xff)
        return (cu2 >= 0x80 && cu2 <= 0xbf) ? 3 : 2
      }
      return 1
    }
    if (cu0 == 0xf4) {
      if (cu1 >= 0x80 && cu1 <= 0x8f) {
        if validBytes == 0 {
          return 2
        }

        let cu2 = UInt8(buffer & 0xff)
        return (cu2 >= 0x80 && cu2 <= 0xbf) ? 3 : 2
      }
      return 1
    }

    _sanityCheck((cu0 >= 0x80 && cu0 <= 0xc1) || cu0 >= 0xf5,
        "case analysis above should have handled all valid first bytes")

    // There are no well-formed sequences that start with these bytes.  Maximal
    // subpart is defined to have length 1 in these cases.
    return 1
  }

  /// Start or continue decoding a UTF sequence.
  ///
  /// In order to decode a code unit sequence completely, this function should
  /// be called repeatedly until it returns `UnicodeDecodingResult.emptyInput`.
  /// Checking that the iterator was exhausted is not sufficient.  The decoder
  /// can have an internal buffer that is pre-filled with data from the input
  /// iterator.
  ///
  /// Because of buffering, it is impossible to find the corresponding position
  /// in the iterator for a given returned `UnicodeScalar` or an error.
  ///
  /// - parameter next: An iterator over the code units to be decoded.
  public mutating func decode<
    I : IteratorProtocol where I.Element == CodeUnit
  >(next: inout I) -> UnicodeDecodingResult {
    // If the EOF flag is not set, fill the lookahead buffer from the input
    // iterator.
    if _lookaheadFlags & 0b1111_0000 == 0 {
      // Add more bytes into the buffer until we have 4.
      while _lookaheadFlags != 0b0000_1111 {
        if let codeUnit = next.next() {
          _decodeLookahead = (_decodeLookahead << 8) | UInt32(codeUnit)
          _lookaheadFlags = (_lookaheadFlags << 1) | 1
        } else {
          // Set the EOF flag.  The bit chosen records how many of the four
          // buffer slots are still unused (see `_lookaheadFlags`).
          switch _lookaheadFlags & 0b0000_1111 {
          case 0b1111:
            _sanityCheckFailure("should have not entered buffer refill loop")
          case 0b0111:
            _lookaheadFlags |= 0b0100_0000
          case 0b0011:
            _lookaheadFlags |= 0b0010_0000
          case 0b0001:
            _lookaheadFlags |= 0b0001_0000
          case 0b0000:
            _lookaheadFlags |= 0b1000_0000
            return .emptyInput
          default:
            _sanityCheckFailure("bad value in _lookaheadFlags")
          }
          // Leave the refill loop: the iterator is exhausted.
          break
        }
      }
    }

    if _slowPath(_lookaheadFlags & 0b0000_1111 == 0) {
      return .emptyInput
    }

    if _slowPath(_lookaheadFlags & 0b1111_0000 != 0) {
      // Reached EOF.  Restore the invariant: first unread byte is always at
      // MSB.
      switch _lookaheadFlags & 0b1111_0000 {
      case 0b1000_0000:
        break
      case 0b0100_0000:
        _decodeLookahead <<= 1 * 8
      case 0b0010_0000:
        _decodeLookahead <<= 2 * 8
      case 0b0001_0000:
        _decodeLookahead <<= 3 * 8
      default:
        _sanityCheckFailure("bad value in _lookaheadFlags")
      }
      // Normalize the EOF flag now that the buffer has been realigned.
      _lookaheadFlags = (_lookaheadFlags & 0b0000_1111) | 0b1000_0000
    }

    // The first byte to read is located at MSB of `_decodeLookahead`.  Get a
    // representation of the buffer where we can read bytes starting from LSB.
    var buffer = _decodeLookahead.byteSwapped
    if _slowPath(!UTF8._isValidUTF8(buffer)) {
      // The code unit sequence is ill-formed.  According to Unicode
      // recommendation, replace the maximal subpart of ill-formed sequence
      // with one replacement character.
      _lookaheadFlags >>=
        UTF8._findMaximalSubpartOfIllFormedUTF8Sequence(buffer,
            validBytes: _lookaheadFlags)
      return .error
    }

    // At this point we know that `buffer` starts with a well-formed code unit
    // sequence.  Decode it.
    //
    // When consuming bytes from the `buffer`, we just need to update
    // `_lookaheadFlags`.  The stored buffer in `_decodeLookahead` will be
    // shifted at the beginning of the next decoding cycle.
    let cu0 = UInt8(buffer & 0xff)
    buffer >>= 8
    _lookaheadFlags >>= 1

    if cu0 < 0x80 {
      // 1-byte sequences.
      return .scalarValue(UnicodeScalar(UInt32(cu0)))
    }

    // Start with octet 1 (we'll mask off high bits later).
    var result = UInt32(cu0)

    let cu1 = UInt8(buffer & 0xff)
    buffer >>= 8
    _lookaheadFlags >>= 1
    result = (result << 6) | UInt32(cu1 & 0x3f)
    if cu0 < 0xe0 {
      // 2-byte sequences.
      return .scalarValue(UnicodeScalar(result & 0x000007ff)) // 11 bits
    }

    let cu2 = UInt8(buffer & 0xff)
    buffer >>= 8
    _lookaheadFlags >>= 1
    result = (result << 6) | UInt32(cu2 & 0x3f)
    if cu0 < 0xf0 {
      // 3-byte sequences.
      return .scalarValue(UnicodeScalar(result & 0x0000ffff)) // 16 bits
    }

    // 4-byte sequences.
    let cu3 = UInt8(buffer & 0xff)
    _lookaheadFlags >>= 1
    result = (result << 6) | UInt32(cu3 & 0x3f)
    return .scalarValue(UnicodeScalar(result & 0x001fffff)) // 21 bits
  }

  /// Encode a `UnicodeScalar` as a series of `CodeUnit`s by
  /// calling `processCodeUnit` on each `CodeUnit`.
  ///
  /// The trailing bytes are computed first (`buf3`, then `buf2`, `buf1`), but
  /// `processCodeUnit` is invoked in encoding order, lead byte first.
  public static func encode(
    input: UnicodeScalar,
    sendingOutputTo processCodeUnit: (CodeUnit) -> Void
  ) {
    var c = UInt32(input)
    var buf3 = UInt8(c & 0xFF)

    if c >= UInt32(1<<7) {
      c >>= 6
      buf3 = (buf3 & 0x3F) | 0x80 // 10xxxxxx
      var buf2 = UInt8(c & 0xFF)
      if c < UInt32(1<<5) {
        buf2 |= 0xC0              // 110xxxxx
      }
      else {
        c >>= 6
        buf2 = (buf2 & 0x3F) | 0x80 // 10xxxxxx
        var buf1 = UInt8(c & 0xFF)
        if c < UInt32(1<<4) {
          buf1 |= 0xE0              // 1110xxxx
        }
        else {
          c >>= 6
          buf1 = (buf1 & 0x3F) | 0x80 // 10xxxxxx
          processCodeUnit(UInt8(c | 0xF0)) // 11110xxx
        }
        processCodeUnit(buf1)
      }
      processCodeUnit(buf2)
    }
    processCodeUnit(buf3)
  }

  /// Returns `true` if `byte` is a continuation byte of the form
  /// `0b10xxxxxx`.
  @warn_unused_result
  public static func isContinuation(byte: CodeUnit) -> Bool {
    return byte & 0b11_00__0000 == 0b10_00__0000
  }
}

/// A codec for [UTF-16](http://www.unicode.org/glossary/#UTF_16).
public struct UTF16 : UnicodeCodec {
  /// A type that can hold [code unit](http://www.unicode.org/glossary/#code_unit) values for this
  /// encoding.
  public typealias CodeUnit = UInt16

  public init() {}

  /// A lookahead buffer for one UTF-16 code unit.
  var _decodeLookahead: UInt32 = 0

  /// Flags with layout: `0b0000_00xy`.
  ///
  /// `y` is the EOF flag.
  ///
  /// `x` is set when `_decodeLookahead` contains a code unit.
  var _lookaheadFlags: UInt8 = 0

  /// Start or continue decoding a UTF sequence.
  ///
  /// In order to decode a code unit sequence completely, this function should
  /// be called repeatedly until it returns `UnicodeDecodingResult.emptyInput`.
  /// Checking that the iterator was exhausted is not sufficient.  The decoder
  /// can have an internal buffer that is pre-filled with data from the input
  /// iterator.
  ///
  /// Because of buffering, it is impossible to find the corresponding position
  /// in the iterator for a given returned `UnicodeScalar` or an error.
  ///
  /// - parameter input: An *iterator* over the code units to be decoded.
  public mutating func decode<
    I : IteratorProtocol where I.Element == CodeUnit
  >(input: inout I) -> UnicodeDecodingResult {
    // Once the EOF flag is set, every further call reports empty input.
    if _lookaheadFlags & 0b01 != 0 {
      return .emptyInput
    }

    // Note: maximal subpart of ill-formed sequence for UTF-16 can only have
    // length 1.  Length 0 does not make sense.  Neither does length 2 -- in
    // that case the sequence is valid.

    var unit0: UInt32
    if _fastPath(_lookaheadFlags & 0b10 == 0) {
      if let first = input.next() {
        unit0 = UInt32(first)
      } else {
        // Set EOF flag.
        _lookaheadFlags |= 0b01
        return .emptyInput
      }
    } else {
      // Fetch code unit from the lookahead buffer and note this fact in flags.
      unit0 = _decodeLookahead
      _lookaheadFlags &= 0b01
    }

    // A well-formed pair of surrogates looks like this:
    // [1101 10ww wwxx xxxx] [1101 11xx xxxx xxxx]

    if _fastPath((unit0 >> 11) != 0b1101_1) {
      // Neither high-surrogate, nor low-surrogate -- sequence of 1 code unit,
      // decoding is trivial.
      return .scalarValue(UnicodeScalar(unit0))
    }

    if _slowPath((unit0 >> 10) == 0b1101_11) {
      // `unit0` is a low-surrogate.  We have an ill-formed sequence.
      return .error
    }

    // At this point we know that `unit0` is a high-surrogate.

    var unit1: UInt32
    if let second = input.next() {
      unit1 = UInt32(second)
    } else {
      // EOF reached.  Set EOF flag.
      _lookaheadFlags |= 0b01

      // We have seen a high-surrogate and EOF, so we have an ill-formed
      // sequence.
      return .error
    }

    if _fastPath((unit1 >> 10) == 0b1101_11) {
      // `unit1` is a low-surrogate.  We have a well-formed surrogate pair.

      let result = 0x10000 + (((unit0 & 0x03ff) << 10) | (unit1 & 0x03ff))
      return .scalarValue(UnicodeScalar(result))
    }

    // Otherwise, we have an ill-formed sequence.  These are the possible
    // cases:
    //
    // * `unit1` is a high-surrogate, so we have a pair of two high-surrogates.
    //
    // * `unit1` is not a surrogate.  We have an ill-formed sequence:
    //   high-surrogate followed by a non-surrogate.

    // Save the second code unit in the lookahead buffer.
    _decodeLookahead = unit1
    _lookaheadFlags |= 0b10
    return .error
  }

  /// Try to decode one Unicode scalar, and return the actual number of code
  /// units it spanned in the input.  This function may consume more code
  /// units than required for this scalar.
  ///
  /// The reported width is `UTF16.width` of the decoded scalar, 0 for empty
  /// input, and 1 for an error.
  mutating func _decodeOne<
    I : IteratorProtocol where I.Element == CodeUnit
  >(input: inout I) -> (UnicodeDecodingResult, Int) {
    let result = decode(&input)
    switch result {
    case .scalarValue(let us):
      return (result, UTF16.width(us))

    case .emptyInput:
      return (result, 0)

    case .error:
      return (result, 1)
    }
  }

  /// Encode a `UnicodeScalar` as a series of `CodeUnit`s by
  /// calling `processCodeUnit` on each `CodeUnit`.
  ///
  /// Scalars that fit in 16 bits are emitted as a single code unit; all
  /// others are emitted as two code units, lead first.
  public static func encode(
    input: UnicodeScalar,
    sendingOutputTo processCodeUnit: (CodeUnit) -> Void
  ) {
    let scalarValue: UInt32 = UInt32(input)

    if scalarValue <= UInt32(UInt16.max) {
      processCodeUnit(UInt16(scalarValue))
    }
    else {
      // Folding the 0x10000 bias into the lead-surrogate base lets the lead
      // unit be computed with one shift and one addition.
      let lead_offset = UInt32(0xd800) - UInt32(0x10000 >> 10)
      processCodeUnit(UInt16(lead_offset + (scalarValue >> 10)))
      processCodeUnit(UInt16(0xdc00 + (scalarValue & 0x3ff)))
    }
  }
}

/// A codec for [UTF-32](http://www.unicode.org/glossary/#UTF_32).
public struct UTF32 : UnicodeCodec {
  /// A type that can hold [code unit](http://www.unicode.org/glossary/#code_unit) values for this
  /// encoding.
  public typealias CodeUnit = UInt32

  public init() {}

  /// Start or continue decoding a UTF sequence.
  ///
  /// In order to decode a code unit sequence completely, this function should
  /// be called repeatedly until it returns `UnicodeDecodingResult.emptyInput`.
  /// Checking that the iterator was exhausted is not sufficient.  The decoder
  /// can have an internal buffer that is pre-filled with data from the input
  /// iterator.
  ///
  /// Because of buffering, it is impossible to find the corresponding position
  /// in the iterator for a given returned `UnicodeScalar` or an error.
  ///
  /// - parameter input: An iterator over the code units to be decoded.
  public mutating func decode<
    I : IteratorProtocol where I.Element == CodeUnit
  >(input: inout I) -> UnicodeDecodingResult {
    return UTF32._decode(&input)
  }

  /// Decodes one code unit from `input` without using any decoder state.
  ///
  /// Returns `.emptyInput` when the iterator is exhausted, `.error` when the
  /// code unit is in the surrogate range or greater than `0x10ffff`, and
  /// `.scalarValue` otherwise.
  static func _decode<
    I : IteratorProtocol where I.Element == CodeUnit
  >(input: inout I) -> UnicodeDecodingResult {
    guard let x = input.next() else { return .emptyInput }
    if _fastPath((x >> 11) != 0b1101_1 && x <= 0x10ffff) {
      return .scalarValue(UnicodeScalar(x))
    } else {
      return .error
    }
  }

  /// Encode a `UnicodeScalar` as a series of `CodeUnit`s by
  /// calling `processCodeUnit` on each `CodeUnit`.
  public static func encode(
    input: UnicodeScalar,
    sendingOutputTo processCodeUnit: (CodeUnit) -> Void
  ) {
    processCodeUnit(UInt32(input))
  }
}

/// Translate `input`, in the given `InputEncoding`, into `output`, in
/// the given `OutputEncoding`.
///
/// - parameter stopOnError: Causes encoding to stop when an encoding
///   error is detected in `input`, if `true`.  Otherwise, U+FFFD
///   replacement characters are inserted for each detected error.
///
/// - returns: `true` if at least one decoding error was detected in `input`,
///   `false` otherwise.
public func transcode<
  Input : IteratorProtocol,
  InputEncoding : UnicodeCodec,
  OutputEncoding : UnicodeCodec
  where InputEncoding.CodeUnit == Input.Element
>(
  input: Input,
  from inputEncoding: InputEncoding.Type,
  to outputEncoding: OutputEncoding.Type,
  stoppingOnError stopOnError: Bool,
  sendingOutputTo processCodeUnit: (OutputEncoding.CodeUnit) -> Void
) -> Bool {
  var input = input

  // NB.  It is not possible to optimize this routine to a memcpy if
  // InputEncoding == OutputEncoding.  The reason is that memcpy will not
  // substitute U+FFFD replacement characters for ill-formed sequences.

  var inputDecoder = inputEncoding.init()
  var hadError = false
  loop:
  while true {
    switch inputDecoder.decode(&input) {
    case .scalarValue(let us):
      OutputEncoding.encode(us, sendingOutputTo: processCodeUnit)
    case .emptyInput:
      break loop
    case .error:
      hadError = true
      if stopOnError {
        break loop
      }
      // Not stopping: emit a replacement character and keep decoding.
      OutputEncoding.encode("\u{fffd}", sendingOutputTo: processCodeUnit)
    }
  }
  return hadError
}

/// Transcode UTF-16 to UTF-8, replacing ill-formed sequences with U+FFFD.
///
/// Returns the index of the first unhandled code unit and the UTF-8 data
/// that was encoded.
///
/// The UTF-8 bytes are packed into the returned chunk starting at its least
/// significant byte; any bytes of the chunk that were not filled are set to
/// `0xff`.  Transcoding stops when `input` is exhausted, when the chunk is
/// full, or when the next scalar's UTF-8 encoding would not fit.
@warn_unused_result
internal func _transcodeSomeUTF16AsUTF8<
  Input : Collection
  where Input.Iterator.Element == UInt16>(
  input: Input, _ startIndex: Input.Index
) -> (Input.Index, _StringCore._UTF8Chunk) {
  typealias _UTF8Chunk = _StringCore._UTF8Chunk

  let endIndex = input.endIndex
  let utf8Max = sizeof(_UTF8Chunk.self)
  var result: _UTF8Chunk = 0
  var utf8Count = 0
  var nextIndex = startIndex
  while nextIndex != input.endIndex && utf8Count != utf8Max {
    let u = UInt(input[nextIndex])
    // Bit offset in `result` at which the next UTF-8 byte is stored.
    let shift = _UTF8Chunk(utf8Count * 8)
    // Number of UTF-16 code units consumed by this iteration.
    var utf16Length: Input.Index.Distance = 1

    if _fastPath(u <= 0x7f) {
      // ASCII: one code unit maps to one UTF-8 byte.
      result |= _UTF8Chunk(u) << shift
      utf8Count += 1
    } else {
      var scalarUtf8Length: Int
      // `r` holds the scalar's UTF-8 bytes with the first byte in the lowest
      // 8 bits.
      var r: UInt
      if _fastPath((u >> 11) != 0b1101_1) {
        // Neither high-surrogate, nor low-surrogate -- well-formed sequence
        // of 1 code unit, decoding is trivial.
        if u < 0x800 {
          r = 0b10__00_0000__110__0_0000
          r |= u >> 6
          r |= (u & 0b11_1111) << 8
          scalarUtf8Length = 2
        }
        else {
          r = 0b10__00_0000__10__00_0000__1110__0000
          r |= u >> 12
          r |= ((u >> 6) & 0b11_1111) << 8
          r |= (u        & 0b11_1111) << 16
          scalarUtf8Length = 3
        }
      } else {
        let unit0 = u
        if _slowPath((unit0 >> 10) == 0b1101_11) {
          // `unit0` is a low-surrogate.  We have an ill-formed sequence.
          // Replace it with U+FFFD.
          r = 0xbdbfef
          scalarUtf8Length = 3
        } else if _slowPath(nextIndex.advanced(by: 1) == endIndex) {
          // We have seen a high-surrogate and EOF, so we have an ill-formed
          // sequence.  Replace it with U+FFFD.
          r = 0xbdbfef
          scalarUtf8Length = 3
        } else {
          let unit1 = UInt(input[nextIndex.advanced(by: 1)])
          if _fastPath((unit1 >> 10) == 0b1101_11) {
            // `unit1` is a low-surrogate.  We have a well-formed surrogate
            // pair.
            let v = 0x10000 + (((unit0 & 0x03ff) << 10) | (unit1 & 0x03ff))

            r = 0b10__00_0000__10__00_0000__10__00_0000__1111_0__000
            r |= v >> 18
            r |= ((v >> 12) & 0b11_1111) << 8
            r |= ((v >> 6) & 0b11_1111) << 16
            r |= (v        & 0b11_1111) << 24
            scalarUtf8Length = 4
            utf16Length = 2
          } else {
            // Otherwise, we have an ill-formed sequence.  Replace it with
            // U+FFFD.
            r = 0xbdbfef
            scalarUtf8Length = 3
          }
        }
      }
      // Don't overrun the buffer
      if utf8Count + scalarUtf8Length > utf8Max {
        break
      }
      result |= numericCast(r) << shift
      utf8Count += scalarUtf8Length
    }
    nextIndex = nextIndex.advanced(by: utf16Length)
  }
  // FIXME: Annoying check.  (The reference that originally followed
  // "courtesy of" is missing from this comment.)
  //
  // Set every bit above the encoded bytes to 1.  The guard skips this when
  // the chunk is completely full, in which case the shift amount would equal
  // the full bit width of `result`.
  if utf8Count < sizeofValue(result) {
    result |= ~0 << numericCast(utf8Count * 8)
  }
  return (nextIndex, result)
}

/// Instances of conforming types are used in internal `String`
/// representation.
public // @testable
protocol _StringElement {
  /// Converts an element of the conforming type to a UTF-16 code unit.
  @warn_unused_result
  static func _toUTF16CodeUnit(_: Self) -> UTF16.CodeUnit

  /// Converts a UTF-16 code unit to an element of the conforming type.
  @warn_unused_result
  static func _fromUTF16CodeUnit(utf16: UTF16.CodeUnit) -> Self
}

extension UTF16.CodeUnit : _StringElement {
  /// Identity conversion: the element already is a UTF-16 code unit.
  public // @testable
  static func _toUTF16CodeUnit(x: UTF16.CodeUnit) -> UTF16.CodeUnit {
    return x
  }

  /// Identity conversion: the element already is a UTF-16 code unit.
  public // @testable
  static func _fromUTF16CodeUnit(
    utf16: UTF16.CodeUnit
  ) -> UTF16.CodeUnit {
    return utf16
  }
}

extension UTF8.CodeUnit : _StringElement {
  /// Widens an ASCII UTF-8 code unit to a UTF-16 code unit.
  public // @testable
  static func _toUTF16CodeUnit(x: UTF8.CodeUnit) -> UTF16.CodeUnit {
    _sanityCheck(x <= 0x7f, "should only be doing this with ASCII")
    return UTF16.CodeUnit(x)
  }

  /// Narrows an ASCII UTF-16 code unit to a UTF-8 code unit.
  public // @testable
  static func _fromUTF16CodeUnit(
    utf16: UTF16.CodeUnit
  ) -> UTF8.CodeUnit {
    _sanityCheck(utf16 <= 0x7f, "should only be doing this with ASCII")
    return UTF8.CodeUnit(utf16)
  }
}

extension UTF16 {
  /// Returns the number of code units required to encode `x`.
  @warn_unused_result
  public static func width(x: UnicodeScalar) -> Int {
    return x.value <= 0xFFFF ? 1 : 2
  }

  /// Returns the high surrogate code unit of a [surrogate pair](http://www.unicode.org/glossary/#surrogate_pair) representing
  /// `x`.
  ///
  /// - Precondition: `width(x) == 2`.
  @warn_unused_result
  public static func leadSurrogate(x: UnicodeScalar) -> UTF16.CodeUnit {
    _precondition(width(x) == 2)
    return UTF16.CodeUnit((x.value - 0x1_0000) >> (10 as UInt32)) + 0xD800
  }

  /// Returns the low surrogate code unit of a [surrogate pair](http://www.unicode.org/glossary/#surrogate_pair) representing
  /// `x`.
  ///
  /// - Precondition: `width(x) == 2`.
  @warn_unused_result
  public static func trailSurrogate(x: UnicodeScalar) -> UTF16.CodeUnit {
    _precondition(width(x) == 2)
    return UTF16.CodeUnit(
      (x.value - 0x1_0000) & (((1 as UInt32) << 10) - 1)
    ) + 0xDC00
  }

  /// Returns `true` if `x` is in the range `0xD800...0xDBFF`.
  @warn_unused_result
  public static func isLeadSurrogate(x: CodeUnit) -> Bool {
    return 0xD800...0xDBFF ~= x
  }

  /// Returns `true` if `x` is in the range `0xDC00...0xDFFF`.
  @warn_unused_result
  public static func isTrailSurrogate(x: CodeUnit) -> Bool {
    return 0xDC00...0xDFFF ~= x
  }

  /// Copies `count` elements from `source` to `destination`, converting each
  /// element through UTF-16 when the two element types differ in stride.
  ///
  /// NOTE(review): the generic parameter list, the pointee types and the
  /// element-wise loop were missing from the text this was restored from.
  /// They are reconstructed from the `_StringElement` requirements and the
  /// uses of `T` and `U` in the body; confirm against the canonical source.
  ///
  /// - parameter source: Start of the elements to read.
  /// - parameter destination: Start of the memory to write.
  /// - parameter count: Number of elements to copy.
  public // @testable
  static func _copy<T : _StringElement, U : _StringElement>(
    source source: UnsafeMutablePointer<T>,
    destination: UnsafeMutablePointer<U>,
    count: Int
  ) {
    if strideof(T.self) == strideof(U.self) {
      // Same stride: copy the raw bytes in one call.
      _memcpy(
        dest: UnsafeMutablePointer(destination),
        src: UnsafeMutablePointer(source),
        size: UInt(count) * UInt(strideof(U.self)))
    }
    else {
      // Different stride: convert element by element via UTF-16.
      for i in 0..<count {
        let u16 = T._toUTF16CodeUnit((source + i).pointee)
        (destination + i).pointee = U._fromUTF16CodeUnit(u16)
      }
    }
  }

  /// Returns the number of UTF-16 code units required to represent `input`
  /// when transcoded to UTF-16, together with a flag telling whether every
  /// decoded scalar was ASCII.
  ///
  /// NOTE(review): the generic header and attributes of this function were
  /// missing from the text this was restored from.  They are reconstructed to
  /// mirror `measure` and the other members of this extension; confirm
  /// against the canonical source.
  ///
  /// - parameter input: An iterator over the code units to measure.
  /// - parameter sourceEncoding: The encoding of `input`.
  /// - parameter repairingIllFormedSequences: If `true`, each decoding error
  ///   is counted as one U+FFFD and the result is never `nil`.  If `false`,
  ///   `nil` is returned at the first decoding error.
  @warn_unused_result
  public static func transcodedLength<
    Encoding : UnicodeCodec, Input : IteratorProtocol
    where Encoding.CodeUnit == Input.Element
  >(
    of input: Input,
    decodedAs sourceEncoding: Encoding.Type,
    repairingIllFormedSequences: Bool
  ) -> (count: Int, isASCII: Bool)? {
    var input = input
    var count = 0
    var isAscii = true

    var inputDecoder = Encoding()
    loop:
    while true {
      switch inputDecoder.decode(&input) {
      case .scalarValue(let us):
        if us.value > 0x7f {
          isAscii = false
        }
        count += width(us)
      case .emptyInput:
        break loop
      case .error:
        if !repairingIllFormedSequences {
          return nil
        }
        // A repaired error becomes U+FFFD, which is not ASCII.
        isAscii = false
        count += width(UnicodeScalar(0xfffd))
      }
    }
    return (count, isAscii)
  }
}

@available(*, unavailable, renamed="UnicodeCodec")
public typealias UnicodeCodecType = UnicodeCodec

@available(*, unavailable, message="use 'transcode(_:from:to:stoppingOnError:sendingOutputTo:)'")
public func transcode<
  Input : IteratorProtocol,
  InputEncoding : UnicodeCodec,
  OutputEncoding : UnicodeCodec
  where InputEncoding.CodeUnit == Input.Element
>(
  inputEncoding: InputEncoding.Type, _ outputEncoding: OutputEncoding.Type,
  _ input: Input, _ output: (OutputEncoding.CodeUnit) -> Void,
  stoppingOnError stopOnError: Bool
) -> Bool {
  fatalError("unavailable function can't be called")
}

extension UTF16 {
  @available(*, unavailable, message="use 'transcodedLength(of:decodedAs:repairingIllFormedSequences:)'")
  public static func measure<
    Encoding : UnicodeCodec, Input : IteratorProtocol
    where Encoding.CodeUnit == Input.Element
  >(
    _: Encoding.Type, input: Input, repairIllFormedSequences: Bool
  ) -> (Int, Bool)? {
    fatalError("unavailable function can't be called")
  }
}