mirror of
https://github.com/apple/swift.git
synced 2025-12-14 20:36:38 +01:00
M-x findr-query-replace \<advance(\([^:]+?\),\([ ]+\)by: *\([^(),]*\|[^(),]+([^()]*)[^(),]*\)\(,\(?:[ ]+\)limit: *\(?:[^()]*\|[^()]+([^()]*)[^()]*\)\)?) index(\3,\2stepsFrom: \1\4)
936 lines
30 KiB
Swift
936 lines
30 KiB
Swift
//===----------------------------------------------------------------------===//
|
|
//
|
|
// This source file is part of the Swift.org open source project
|
|
//
|
|
// Copyright (c) 2014 - 2016 Apple Inc. and the Swift project authors
|
|
// Licensed under Apache License v2.0 with Runtime Library Exception
|
|
//
|
|
// See http://swift.org/LICENSE.txt for license information
|
|
// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
|
// Conversions between different Unicode encodings.  The UTF-8, UTF-16 and
// UTF-32 decoders below all report ill-formed input by returning
// `UnicodeDecodingResult.error`.
|
|
|
|
/// The outcome of a single Unicode decoding step.
///
/// Every step yields exactly one of three things: a decoded unicode scalar
/// value, a signal that no more unicode scalars are available, or a signal
/// that a decoding error occurred.
public enum UnicodeDecodingResult : Equatable {
  /// A unicode scalar was decoded; it is carried as the payload.
  case scalarValue(UnicodeScalar)

  /// No more unicode scalars are available.
  case emptyInput

  /// A decoding error occurred.
  case error
}
|
|
|
|
/// Returns `true` if both results are the same case and, for
/// `.scalarValue`, carry equal unicode scalars.
public func == (
  lhs: UnicodeDecodingResult,
  rhs: UnicodeDecodingResult
) -> Bool {
  switch (lhs, rhs) {
  case (.scalarValue(let left), .scalarValue(let right)):
    return left == right
  case (.emptyInput, .emptyInput), (.error, .error):
    // Payload-free cases are equal whenever the cases themselves match.
    return true
  default:
    return false
  }
}
|
|
|
|
/// A Unicode [encoding scheme](http://www.unicode.org/glossary/#character_encoding_scheme).
///
/// A codec combines an underlying [code unit](http://www.unicode.org/glossary/#code_unit)
/// type with the functions that translate between sequences of those code
/// units and [unicode scalar values](http://www.unicode.org/glossary/#unicode_scalar_value).
public protocol UnicodeCodec {

  /// The type that stores one [code unit](http://www.unicode.org/glossary/#code_unit)
  /// value of this encoding.
  associatedtype CodeUnit

  /// Creates an instance of the codec.
  init()

  /// Starts or continues decoding a UTF sequence.
  ///
  /// To decode a code unit sequence completely, call this function
  /// repeatedly until it returns `UnicodeDecodingResult.emptyInput`.
  /// Observing that the iterator is exhausted is not enough: the decoder may
  /// hold an internal buffer that was pre-filled from the input iterator.
  ///
  /// Because of that buffering, the iterator position that corresponds to a
  /// returned `UnicodeScalar` or error cannot be determined.
  ///
  /// - parameter next: An iterator over the code units to be decoded.
  mutating func decode<
    I : IteratorProtocol where I.Element == CodeUnit
  >(next: inout I) -> UnicodeDecodingResult

  /// Encodes a `UnicodeScalar` as a series of `CodeUnit`s, passing each
  /// `CodeUnit` to `processCodeUnit`.
  static func encode(
    input: UnicodeScalar,
    @noescape sendingOutputTo processCodeUnit: (CodeUnit) -> Void
  )
}
|
|
|
|
/// A codec for [UTF-8](http://www.unicode.org/glossary/#UTF_8).
public struct UTF8 : UnicodeCodec {

  /// The type that stores one [code unit](http://www.unicode.org/glossary/#code_unit)
  /// value of this encoding.
  public typealias CodeUnit = UInt8

  /// Creates a decoder whose lookahead buffer and flags are zero.
  public init() {}

  /// Lookahead buffer for UTF-8 decoding.  Incoming bytes are shifted in at
  /// the least significant byte; bytes are read from the most significant
  /// byte.
  internal var _decodeLookahead: UInt32 = 0

  /// State bits laid out as `0bxxxx_yyyy`.
  ///
  /// `yyyy` records, in unary, how many bytes of the lookahead buffer are
  /// valid: `1111` (4), `0111` (3), `0011` (2), `0001` (1), `0000` (0).
  ///
  /// `xxxx` is the EOF flag: it is non-zero once the input iterator has
  /// signaled end of sequence.  At most one of the four bits is set, and
  /// its position tells how many bytes have already been consumed from the
  /// lookahead buffer: `1000` means the buffer holds `yyyy` bytes, `0100`
  /// means `yyyy - 1` bytes, `0010` means `yyyy - 2`, and `0001` means
  /// `yyyy - 3`.
  ///
  /// The layout is chosen so that consuming one byte from the buffer is a
  /// shift, and the matching flag update is a single-bit right shift.
  internal var _lookaheadFlags: UInt8 = 0

  /// Returns `true` if the low-order bytes of `buffer` start with a
  /// well-formed UTF-8 code unit sequence.  The lowest byte is treated as
  /// the first code unit.
  ///
  /// - Requires: `buffer` contains at least one used byte, and its unused
  ///   space is filled with a value that does not have the UTF-8
  ///   continuation byte form (`0b10xxxxxx`).
  @warn_unused_result
  public // @testable
  static func _isValidUTF8(buffer: UInt32) -> Bool {

    // 0x00 -- 0x7f: 1-byte sequences (ASCII).
    if _fastPath(buffer & 0x80 == 0) {
      return true
    }

    // The high 5 bits of the first byte select the sequence length.  Two
    // 32-entry bit tables are consulted instead of branching; 1-byte
    // sequences never get here.
    //
    //  (high, low) | pattern | description
    //  -----------------------------------
    //     (0, 0)   |  110xx  | 2-byte sequence
    //     (0, 1)   |  1110x  | 3-byte sequence
    //     (1, 0)   |  11110  | 4-byte sequence
    //     (1, 1)   |  other  | invalid
    //
    //                            11xxx      10xxx      01xxx      00xxx
    let lowBitTable: UInt32  = 0b1011_0000__1111_1111__1111_1111__1111_1111
    let highBitTable: UInt32 = 0b1100_0000__1111_1111__1111_1111__1111_1111

    let leadBits = (buffer >> 3) & 0x1f
    let lowBit = (lowBitTable >> leadBits) & 1
    let highBit = (highBitTable >> leadBits) & 1

    switch (highBit, lowBit) {
    case (0, 0): // 2-byte sequence.
      // Need 10xx xxxx 110x xxxx, and must not be xxxx xxxx xxx0 000x
      // (that would fit in <= 7 bits).
      return buffer & 0xc0e0 == 0x80c0 && buffer & 0x001e != 0x0000

    case (0, 1): // 3-byte sequence.
      // Need 10xx xxxx 10xx xxxx 1110 xxxx.
      guard buffer & 0xc0c0f0 == 0x8080e0 else { return false }
      let rangeBits = buffer & 0x00200f
      // Reject xxxx xxxx xx0x xxxx xxxx 0000 (fits in <= 11 bits) and
      // xxxx xxxx xx1x xxxx xxxx 1101 (surrogate code points).
      return rangeBits != 0x000000 && rangeBits != 0x00200d

    case (1, 0): // 4-byte sequence.
      // Need 10xx xxxx 10xx xxxx 10xx xxxx 1111 0xxx.
      guard buffer & 0xc0c0c0f8 == 0x808080f0 else { return false }
      // Reject xxxx xxxx xxxx xxxx xx00 xxxx xxxx x000 (fits in <= 16 bits).
      guard buffer & 0x00003007 != 0x00000000 else { return false }
      // When the first byte is xxxx x1xx, the value stays <= 0x10FFFF only
      // for xxxx xxxx xxxx xxxx xx00 xxxx xxxx xx00.
      if buffer & 0x00000004 == 0x00000004 {
        return buffer & 0x00003003 == 0x00000000
      }
      return true

    default: // Invalid sequence.
      return false
    }
  }

  /// Given an ill-formed sequence, returns the length of its maximal
  /// subpart.
  ///
  /// `validBytes` uses the `_lookaheadFlags` layout; its EOF bits are
  /// ignored.
  @inline(never)
  @warn_unused_result
  internal static func _findMaximalSubpartOfIllFormedUTF8Sequence(
    buffer: UInt32, validBytes: UInt8) -> UInt8 {
    // Marked '@inline(never)' because only the error handling path uses it.
    var buffer = buffer
    // Drop the EOF flag; only the unary byte count is of interest here.
    var validBytes = validBytes & 0b0000_1111

    _sanityCheck(validBytes != 0,
      "input buffer should not be empty")
    _sanityCheck(!UTF8._isValidUTF8(buffer),
      "input sequence should be ill-formed UTF-8")

    // Unicode 6.3.0, D93b:
    //
    //     Maximal subpart of an ill-formed subsequence: The longest code unit
    //     subsequence starting at an unconvertible offset that is either:
    //     a. the initial subsequence of a well-formed code unit sequence, or
    //     b. a subsequence of length one.

    // The case analysis follows Unicode 6.3.0, Table 3-7. Well-Formed UTF-8
    // Byte Sequences.

    let cu0 = UInt8(buffer & 0xff)
    buffer >>= 8
    validBytes >>= 1

    if 0xc2...0xdf ~= cu0 {
      // The first byte is a valid 2-byte lead, yet the sequence is known to
      // be invalid, so the maximal subpart must stop after this byte.
      return 1
    }

    // Nothing follows the first byte.
    if validBytes == 0 {
      return 1
    }

    let cu1 = UInt8(buffer & 0xff)
    buffer >>= 8
    validBytes >>= 1

    // 3-byte leads are decided by the second byte alone; 4-byte leads only
    // record whether the second byte is acceptable and are finished below.
    let secondByteIsAcceptable: Bool
    switch cu0 {
    case 0xe0:
      return (0xa0...0xbf ~= cu1) ? 2 : 1
    case 0xe1...0xec, 0xee...0xef:
      return (0x80...0xbf ~= cu1) ? 2 : 1
    case 0xed:
      return (0x80...0x9f ~= cu1) ? 2 : 1
    case 0xf0:
      secondByteIsAcceptable = 0x90...0xbf ~= cu1
    case 0xf1...0xf3:
      secondByteIsAcceptable = 0x80...0xbf ~= cu1
    case 0xf4:
      secondByteIsAcceptable = 0x80...0x8f ~= cu1
    default:
      _sanityCheck((cu0 >= 0x80 && cu0 <= 0xc1) || cu0 >= 0xf5,
        "case analysis above should have handled all valid first bytes")

      // No well-formed sequence starts with these bytes, so the maximal
      // subpart is defined to have length 1.
      return 1
    }

    // Only 4-byte lead bytes (0xf0 ... 0xf4) reach this point.
    if !secondByteIsAcceptable {
      return 1
    }
    if validBytes == 0 {
      return 2
    }
    let cu2 = UInt8(buffer & 0xff)
    return (0x80...0xbf ~= cu2) ? 3 : 2
  }

  /// Starts or continues decoding a UTF sequence.
  ///
  /// To decode a code unit sequence completely, call this function
  /// repeatedly until it returns `UnicodeDecodingResult.emptyInput`.
  /// Observing that the iterator is exhausted is not enough: the decoder
  /// keeps an internal buffer that is pre-filled from the input iterator.
  ///
  /// Because of that buffering, the iterator position that corresponds to a
  /// returned `UnicodeScalar` or error cannot be determined.
  ///
  /// - parameter next: An iterator over the code units to be decoded.
  public mutating func decode<
    I : IteratorProtocol where I.Element == CodeUnit
  >(next: inout I) -> UnicodeDecodingResult {
    // Refill the lookahead buffer from the iterator unless EOF was already
    // observed.
    if _lookaheadFlags & 0b1111_0000 == 0 {
      // Pull bytes until the buffer holds four of them.
      while _lookaheadFlags != 0b0000_1111 {
        guard let codeUnit = next.next() else {
          // The iterator is exhausted: set the EOF flag that matches the
          // current fill level.
          switch _lookaheadFlags & 0b0000_1111 {
          case 0b1111:
            _sanityCheckFailure("should have not entered buffer refill loop")
          case 0b0111:
            _lookaheadFlags |= 0b0100_0000
          case 0b0011:
            _lookaheadFlags |= 0b0010_0000
          case 0b0001:
            _lookaheadFlags |= 0b0001_0000
          case 0b0000:
            _lookaheadFlags |= 0b1000_0000
            return .emptyInput
          default:
            _sanityCheckFailure("bad value in _lookaheadFlags")
          }
          break
        }
        _decodeLookahead = (_decodeLookahead << 8) | UInt32(codeUnit)
        _lookaheadFlags = (_lookaheadFlags << 1) | 1
      }
    }

    // No buffered bytes remain.
    if _slowPath(_lookaheadFlags & 0b0000_1111 == 0) {
      return .emptyInput
    }

    if _slowPath(_lookaheadFlags & 0b1111_0000 != 0) {
      // EOF was reached.  Re-establish the invariant that the first unread
      // byte sits at the most significant byte.
      switch _lookaheadFlags & 0b1111_0000 {
      case 0b1000_0000:
        break
      case 0b0100_0000:
        _decodeLookahead <<= 8
      case 0b0010_0000:
        _decodeLookahead <<= 16
      case 0b0001_0000:
        _decodeLookahead <<= 24
      default:
        _sanityCheckFailure("bad value in _lookaheadFlags")
      }
      _lookaheadFlags = (_lookaheadFlags & 0b0000_1111) | 0b1000_0000
    }

    // The next byte to read is the most significant byte of
    // `_decodeLookahead`; byte-swap so bytes can be read starting from the
    // least significant one.
    let buffer = _decodeLookahead.byteSwapped
    if _slowPath(!UTF8._isValidUTF8(buffer)) {
      // Ill-formed code unit sequence.  Following the Unicode
      // recommendation, the maximal subpart of the ill-formed sequence is
      // consumed and reported as a single error.
      let consumed = UTF8._findMaximalSubpartOfIllFormedUTF8Sequence(
        buffer, validBytes: _lookaheadFlags)
      _lookaheadFlags >>= consumed
      return .error
    }

    // `buffer` now starts with a well-formed code unit sequence.
    //
    // Consuming a byte only needs an update to `_lookaheadFlags`; the bytes
    // stored in `_decodeLookahead` get shifted at the start of the next
    // decoding cycle.
    let cu0 = UInt8(buffer & 0xff)
    _lookaheadFlags >>= 1

    if cu0 < 0x80 {
      // 1-byte sequences.
      return .scalarValue(UnicodeScalar(UInt32(cu0)))
    }

    // Accumulate starting with the first octet; its high bits are masked
    // off once the sequence length is known.
    let cu1 = UInt8((buffer >> 8) & 0xff)
    _lookaheadFlags >>= 1
    var result = (UInt32(cu0) << 6) | UInt32(cu1 & 0x3f)
    if cu0 < 0xe0 {
      // 2-byte sequences.
      return .scalarValue(UnicodeScalar(result & 0x000007ff)) // 11 bits
    }

    let cu2 = UInt8((buffer >> 16) & 0xff)
    _lookaheadFlags >>= 1
    result = (result << 6) | UInt32(cu2 & 0x3f)
    if cu0 < 0xf0 {
      // 3-byte sequences.
      return .scalarValue(UnicodeScalar(result & 0x0000ffff)) // 16 bits
    }

    // 4-byte sequences.
    let cu3 = UInt8((buffer >> 24) & 0xff)
    _lookaheadFlags >>= 1
    result = (result << 6) | UInt32(cu3 & 0x3f)
    return .scalarValue(UnicodeScalar(result & 0x001fffff)) // 21 bits
  }

  /// Encodes a `UnicodeScalar` as a series of `CodeUnit`s, passing each
  /// `CodeUnit` to `processCodeUnit` in order.
  public static func encode(
    input: UnicodeScalar,
    @noescape sendingOutputTo processCodeUnit: (CodeUnit) -> Void
  ) {
    let value = UInt32(input)

    if value < 0x80 {
      // 1 byte: 0xxxxxxx
      processCodeUnit(UInt8(value))
      return
    }

    // Every longer form ends with the low 6 bits as a continuation byte.
    let last = UInt8(value & 0x3F) | 0x80 // 10xxxxxx

    if value < 0x800 {
      // 2 bytes: 110xxxxx 10xxxxxx
      processCodeUnit(UInt8(value >> 6) | 0xC0)
      processCodeUnit(last)
      return
    }

    let secondToLast = UInt8((value >> 6) & 0x3F) | 0x80 // 10xxxxxx

    if value < 0x1_0000 {
      // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
      processCodeUnit(UInt8(value >> 12) | 0xE0)
      processCodeUnit(secondToLast)
      processCodeUnit(last)
      return
    }

    // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    let thirdToLast = UInt8((value >> 12) & 0x3F) | 0x80 // 10xxxxxx
    processCodeUnit(UInt8((value >> 18) | 0xF0))
    processCodeUnit(thirdToLast)
    processCodeUnit(secondToLast)
    processCodeUnit(last)
  }

  /// Returns `true` if `byte` is a continuation byte, i.e. has the form
  /// `0b10xxxxxx`.
  @warn_unused_result
  public static func isContinuation(byte: CodeUnit) -> Bool {
    return byte & 0xC0 == 0x80
  }
}
|
|
|
|
/// A codec for [UTF-16](http://www.unicode.org/glossary/#UTF_16).
public struct UTF16 : UnicodeCodec {
  /// The type that stores one [code unit](http://www.unicode.org/glossary/#code_unit)
  /// value of this encoding.
  public typealias CodeUnit = UInt16

  /// Creates a decoder whose lookahead buffer and flags are zero.
  public init() {}

  /// Lookahead buffer holding at most one UTF-16 code unit.
  var _decodeLookahead: UInt32 = 0

  /// State bits laid out as `0b0000_00xy`.
  ///
  /// `y` is the EOF flag.
  ///
  /// `x` is set while `_decodeLookahead` holds a code unit.
  var _lookaheadFlags: UInt8 = 0

  /// Starts or continues decoding a UTF sequence.
  ///
  /// To decode a code unit sequence completely, call this function
  /// repeatedly until it returns `UnicodeDecodingResult.emptyInput`.
  /// Observing that the iterator is exhausted is not enough: the decoder may
  /// hold one code unit in its lookahead buffer.
  ///
  /// Because of that buffering, the iterator position that corresponds to a
  /// returned `UnicodeScalar` or error cannot be determined.
  ///
  /// - parameter input: An *iterator* over the code units to be decoded.
  public mutating func decode<
    I : IteratorProtocol where I.Element == CodeUnit
  >(input: inout I) -> UnicodeDecodingResult {
    // EOF was already observed.
    guard _lookaheadFlags & 0b01 == 0 else {
      return .emptyInput
    }

    // Note: for UTF-16 the maximal subpart of an ill-formed sequence always
    // has length 1.  Length 0 makes no sense, and with length 2 the
    // sequence would be valid.

    let unit0: UInt32
    if _fastPath(_lookaheadFlags & 0b10 == 0) {
      guard let first = input.next() else {
        // Set EOF flag.
        _lookaheadFlags |= 0b01
        return .emptyInput
      }
      unit0 = UInt32(first)
    } else {
      // Take the code unit from the lookahead buffer and mark the buffer
      // as empty, keeping only the EOF bit.
      unit0 = _decodeLookahead
      _lookaheadFlags &= 0b01
    }

    // A well-formed pair of surrogates looks like this:
    //     [1101 10ww wwxx xxxx] [1101 11xx xxxx xxxx]

    if _fastPath((unit0 >> 11) != 0b1101_1) {
      // Not a surrogate of either kind: a sequence of one code unit that
      // decodes to itself.
      return .scalarValue(UnicodeScalar(unit0))
    }

    if _slowPath((unit0 >> 10) == 0b1101_11) {
      // `unit0` is a low-surrogate with no preceding high-surrogate: the
      // sequence is ill-formed.
      return .error
    }

    // `unit0` is known to be a high-surrogate from here on.

    guard let second = input.next() else {
      // EOF reached. Set EOF flag.
      _lookaheadFlags |= 0b01

      // A high-surrogate followed by EOF is an ill-formed sequence.
      return .error
    }
    let unit1 = UInt32(second)

    if _fastPath((unit1 >> 10) == 0b1101_11) {
      // `unit1` is a low-surrogate, completing a well-formed surrogate
      // pair.
      let result = 0x10000 + (((unit0 & 0x03ff) << 10) | (unit1 & 0x03ff))
      return .scalarValue(UnicodeScalar(result))
    }

    // Otherwise the sequence is ill-formed.  Either:
    //
    // * `unit1` is another high-surrogate (two high-surrogates in a row), or
    //
    // * `unit1` is not a surrogate (high-surrogate followed by a
    //   non-surrogate).

    // Keep the second code unit in the lookahead buffer for the next call.
    _decodeLookahead = unit1
    _lookaheadFlags |= 0b10
    return .error
  }

  /// Tries to decode one Unicode scalar, and returns the result together
  /// with the number of code units it spanned in the input: the UTF-16
  /// width of a decoded scalar, 0 for empty input, 1 for an error.  This
  /// function may consume more code units than required for this scalar.
  mutating func _decodeOne<
    I : IteratorProtocol where I.Element == CodeUnit
  >(input: inout I) -> (UnicodeDecodingResult, Int) {
    let result = decode(&input)
    let spannedUnits: Int
    switch result {
    case .scalarValue(let us):
      spannedUnits = UTF16.width(us)
    case .emptyInput:
      spannedUnits = 0
    case .error:
      spannedUnits = 1
    }
    return (result, spannedUnits)
  }

  /// Encodes a `UnicodeScalar` as a series of `CodeUnit`s, passing each
  /// `CodeUnit` to `processCodeUnit` in order.
  public static func encode(
    input: UnicodeScalar,
    @noescape sendingOutputTo processCodeUnit: (CodeUnit) -> Void
  ) {
    let scalarValue: UInt32 = UInt32(input)

    guard scalarValue > UInt32(UInt16.max) else {
      // Fits in a single code unit.
      processCodeUnit(UInt16(scalarValue))
      return
    }

    // Surrogate pair: lead unit first, then trail unit.
    let leadOffset = UInt32(0xd800) - UInt32(0x10000 >> 10)
    processCodeUnit(UInt16(leadOffset + (scalarValue >> 10)))
    processCodeUnit(UInt16(0xdc00 + (scalarValue & 0x3ff)))
  }
}
|
|
|
|
/// A codec for [UTF-32](http://www.unicode.org/glossary/#UTF_32).
public struct UTF32 : UnicodeCodec {
  /// The type that stores one [code unit](http://www.unicode.org/glossary/#code_unit)
  /// value of this encoding.
  public typealias CodeUnit = UInt32

  /// Creates a decoder.
  public init() {}

  /// Starts or continues decoding a UTF sequence.
  ///
  /// To decode a code unit sequence completely, call this function
  /// repeatedly until it returns `UnicodeDecodingResult.emptyInput`.
  ///
  /// - parameter input: An iterator over the code units to be decoded.
  public mutating func decode<
    I : IteratorProtocol where I.Element == CodeUnit
  >(input: inout I) -> UnicodeDecodingResult {
    return UTF32._decode(&input)
  }

  /// Decodes the next code unit of `input`: `.emptyInput` when the iterator
  /// is exhausted, `.error` for a surrogate code point or a value above
  /// `0x10ffff`, and the corresponding scalar otherwise.
  static func _decode<
    I : IteratorProtocol where I.Element == CodeUnit
  >(input: inout I) -> UnicodeDecodingResult {
    guard let x = input.next() else { return .emptyInput }
    guard _fastPath((x >> 11) != 0b1101_1 && x <= 0x10ffff) else {
      return .error
    }
    return .scalarValue(UnicodeScalar(x))
  }

  /// Encodes a `UnicodeScalar` as a single `CodeUnit`, passing it to
  /// `processCodeUnit`.
  public static func encode(
    input: UnicodeScalar,
    @noescape sendingOutputTo processCodeUnit: (CodeUnit) -> Void
  ) {
    processCodeUnit(UInt32(input))
  }
}
|
|
|
|
/// Translates `input`, in the given `InputEncoding`, into `output`, in
/// the given `OutputEncoding`, passing every produced code unit to
/// `processCodeUnit`.
///
/// - parameter stopOnError: If `true`, transcoding stops as soon as an
///   encoding error is detected in `input`.  Otherwise, a U+FFFD
///   replacement character is emitted for each detected error.
/// - returns: `true` if at least one encoding error was detected in
///   `input`, `false` otherwise.
public func transcode<
  Input : IteratorProtocol,
  InputEncoding : UnicodeCodec,
  OutputEncoding : UnicodeCodec
  where InputEncoding.CodeUnit == Input.Element
>(
  input: Input,
  from inputEncoding: InputEncoding.Type,
  to outputEncoding: OutputEncoding.Type,
  stoppingOnError stopOnError: Bool,
  @noescape sendingOutputTo processCodeUnit: (OutputEncoding.CodeUnit) -> Void
) -> Bool {
  var input = input

  // NB. This routine cannot be reduced to a memcpy when
  // InputEncoding == OutputEncoding, because memcpy would not substitute
  // U+FFFD replacement characters for ill-formed sequences.

  var inputDecoder = inputEncoding.init()
  var hadError = false
  while true {
    switch inputDecoder.decode(&input) {
    case .scalarValue(let us):
      OutputEncoding.encode(us, sendingOutputTo: processCodeUnit)
    case .emptyInput:
      return hadError
    case .error:
      hadError = true
      if stopOnError {
        return hadError
      }
      OutputEncoding.encode("\u{fffd}", sendingOutputTo: processCodeUnit)
    }
  }
}
|
|
|
|
/// Transcodes UTF-16 to UTF-8, replacing ill-formed sequences with U+FFFD.
///
/// Starting at `startIndex`, scalars are encoded into a single
/// `_StringCore._UTF8Chunk`, first code unit in the lowest byte, until the
/// input ends or the next scalar no longer fits in the chunk.  Bytes of the
/// chunk that were not filled are set to all ones.
///
/// Returns the index of the first unhandled code unit and the UTF-8 data
/// that was encoded.
@warn_unused_result
internal func _transcodeSomeUTF16AsUTF8<
  Input : Collection
  where
  Input.Iterator.Element == UInt16>(
  input: Input, _ startIndex: Input.Index
) -> (Input.Index, _StringCore._UTF8Chunk) {
  typealias _UTF8Chunk = _StringCore._UTF8Chunk

  // UTF-8 encoding of U+FFFD, first code unit in the lowest byte.
  let replacementCharacterUTF8: UInt = 0xbdbfef

  let endIndex = input.endIndex
  let utf8Max = sizeof(_UTF8Chunk.self)
  var result: _UTF8Chunk = 0
  var utf8Count = 0
  var nextIndex = startIndex
  while nextIndex != endIndex && utf8Count != utf8Max {
    let u = UInt(input[nextIndex])
    let shift = _UTF8Chunk(utf8Count * 8)
    var utf16Length: Input.IndexDistance = 1

    if _fastPath(u <= 0x7f) {
      // ASCII: one UTF-8 code unit.
      result |= _UTF8Chunk(u) << shift
      utf8Count += 1
    } else {
      var scalarUtf8Length: Int
      var r: UInt
      if _fastPath((u >> 11) != 0b1101_1) {
        // Neither high-surrogate, nor low-surrogate -- well-formed sequence
        // of 1 code unit, decoding is trivial.
        if u < 0x800 {
          r = 0x80c0 // 10xxxxxx 110xxxxx
          r |= u >> 6
          r |= (u & 0b11_1111) << 8
          scalarUtf8Length = 2
        } else {
          r = 0x8080e0 // 10xxxxxx 10xxxxxx 1110xxxx
          r |= u >> 12
          r |= ((u >> 6) & 0b11_1111) << 8
          r |= (u & 0b11_1111) << 16
          scalarUtf8Length = 3
        }
      } else {
        // `u` is a surrogate.  Assume an ill-formed sequence (lone
        // low-surrogate, high-surrogate at the end of input, or
        // high-surrogate not followed by a low-surrogate) and overwrite
        // only when a well-formed pair is found.
        r = replacementCharacterUTF8
        scalarUtf8Length = 3

        let unit0 = u
        if (unit0 >> 10) != 0b1101_11 {
          // `unit0` is a high-surrogate; look at the following code unit.
          let followingIndex = input.index(1, stepsFrom: nextIndex)
          if followingIndex != endIndex {
            let unit1 = UInt(input[followingIndex])
            if _fastPath((unit1 >> 10) == 0b1101_11) {
              // `unit1` is a low-surrogate.  We have a well-formed surrogate
              // pair.
              let v = 0x10000 + (((unit0 & 0x03ff) << 10) | (unit1 & 0x03ff))

              r = 0x808080f0 // 10xxxxxx 10xxxxxx 10xxxxxx 11110xxx
              r |= v >> 18
              r |= ((v >> 12) & 0b11_1111) << 8
              r |= ((v >> 6) & 0b11_1111) << 16
              r |= (v & 0b11_1111) << 24
              scalarUtf8Length = 4
              utf16Length = 2
            }
          }
        }
      }
      // Don't overrun the buffer
      if utf8Count + scalarUtf8Length > utf8Max {
        break
      }
      result |= numericCast(r) << shift
      utf8Count += scalarUtf8Length
    }
    nextIndex = input.index(utf16Length, stepsFrom: nextIndex)
  }
  // FIXME: Annoying check, courtesy of <rdar://problem/16740169>
  if utf8Count < utf8Max {
    // Fill the unused high bytes of the chunk with ones.
    result |= ~0 << numericCast(utf8Count * 8)
  }
  return (nextIndex, result)
}
|
|
|
|
/// A type whose instances are used in the internal `String`
/// representation.
///
/// Conforming types convert to and from UTF-16 code units.
public // @testable
protocol _StringElement {
  /// Returns the given element as a UTF-16 code unit.
  @warn_unused_result
  static func _toUTF16CodeUnit(_: Self) -> UTF16.CodeUnit

  /// Returns the element that corresponds to the UTF-16 code unit `utf16`.
  @warn_unused_result
  static func _fromUTF16CodeUnit(utf16: UTF16.CodeUnit) -> Self
}
|
|
|
|
// UTF-16 code units are their own string elements: both conversions return
// their argument unchanged.
extension UTF16.CodeUnit : _StringElement {
  /// Returns `x` unchanged.
  public // @testable
  static func _toUTF16CodeUnit(x: UTF16.CodeUnit) -> UTF16.CodeUnit {
    return x
  }

  /// Returns `utf16` unchanged.
  public // @testable
  static func _fromUTF16CodeUnit(
    utf16: UTF16.CodeUnit
  ) -> UTF16.CodeUnit {
    return utf16
  }
}
|
|
|
|
// UTF-8 code units act as string elements for ASCII only; both conversions
// sanity-check that the value does not exceed 0x7f.
extension UTF8.CodeUnit : _StringElement {
  /// Widens the ASCII code unit `x` to a UTF-16 code unit.
  public // @testable
  static func _toUTF16CodeUnit(x: UTF8.CodeUnit) -> UTF16.CodeUnit {
    _sanityCheck(x <= 0x7f, "should only be doing this with ASCII")
    return UTF16.CodeUnit(x)
  }

  /// Narrows the ASCII UTF-16 code unit `utf16` to a UTF-8 code unit.
  public // @testable
  static func _fromUTF16CodeUnit(
    utf16: UTF16.CodeUnit
  ) -> UTF8.CodeUnit {
    _sanityCheck(utf16 <= 0x7f, "should only be doing this with ASCII")
    return UTF8.CodeUnit(utf16)
  }
}
|
|
|
|
extension UTF16 {
  /// Returns the number of code units required to encode `x`: 1 for scalars
  /// up to `0xFFFF`, 2 otherwise.
  @warn_unused_result
  public static func width(x: UnicodeScalar) -> Int {
    return x.value > 0xFFFF ? 2 : 1
  }

  /// Returns the high surrogate code unit of a [surrogate pair](http://www.unicode.org/glossary/#surrogate_pair) representing
  /// `x`.
  ///
  /// - Precondition: `width(x) == 2`.
  @warn_unused_result
  public static func leadSurrogate(x: UnicodeScalar) -> UTF16.CodeUnit {
    _precondition(width(x) == 2)
    let offset = x.value - 0x1_0000
    // The upper ten bits of the offset select the lead surrogate.
    return UTF16.CodeUnit(offset >> 10) + 0xD800
  }

  /// Returns the low surrogate code unit of a [surrogate pair](http://www.unicode.org/glossary/#surrogate_pair) representing
  /// `x`.
  ///
  /// - Precondition: `width(x) == 2`.
  @warn_unused_result
  public static func trailSurrogate(x: UnicodeScalar) -> UTF16.CodeUnit {
    _precondition(width(x) == 2)
    let offset = x.value - 0x1_0000
    // The lower ten bits of the offset select the trail surrogate.
    return UTF16.CodeUnit(offset & 0x3FF) + 0xDC00
  }

  /// Returns `true` if `x` lies in the lead (high) surrogate range
  /// `0xD800...0xDBFF`.
  @warn_unused_result
  public static func isLeadSurrogate(x: CodeUnit) -> Bool {
    return x >= 0xD800 && x <= 0xDBFF
  }

  /// Returns `true` if `x` lies in the trail (low) surrogate range
  /// `0xDC00...0xDFFF`.
  @warn_unused_result
  public static func isTrailSurrogate(x: CodeUnit) -> Bool {
    return x >= 0xDC00 && x <= 0xDFFF
  }

  /// Copies `count` elements from `source` to `destination`.
  ///
  /// When both element types have the same stride the bytes are copied
  /// directly; otherwise every element is converted through a UTF-16 code
  /// unit.
  public // @testable
  static func _copy<T : _StringElement, U : _StringElement>(
    source source: UnsafeMutablePointer<T>,
    destination: UnsafeMutablePointer<U>,
    count: Int
  ) {
    guard strideof(T.self) != strideof(U.self) else {
      // Same stride: a raw memory copy suffices.
      _memcpy(
        dest: UnsafeMutablePointer(destination),
        src: UnsafeMutablePointer(source),
        size: UInt(count) * UInt(strideof(U.self)))
      return
    }
    for offset in 0..<count {
      let unit = T._toUTF16CodeUnit((source + offset).pointee)
      (destination + offset).pointee = U._fromUTF16CodeUnit(unit)
    }
  }

  /// Returns the number of UTF-16 code units required for the given code unit
  /// sequence when transcoded to UTF-16, and a bit describing if the sequence
  /// was found to contain only ASCII characters.
  ///
  /// If `repairingIllFormedSequences` is `true`, the function always
  /// succeeds; every ill-formed sequence is counted as one U+FFFD and makes
  /// the result non-ASCII.  If it is `false`, `nil` is returned as soon as an
  /// ill-formed code unit sequence is found in `input`.
  @warn_unused_result
  public static func transcodedLength<
    Encoding : UnicodeCodec, Input : IteratorProtocol
    where Encoding.CodeUnit == Input.Element
  >(
    of input: Input,
    decodedAs sourceEncoding: Encoding.Type,
    repairingIllFormedSequences: Bool
  ) -> (count: Int, isASCII: Bool)? {
    var input = input
    var inputDecoder = Encoding()
    var utf16Count = 0
    var sawOnlyASCII = true

    while true {
      switch inputDecoder.decode(&input) {
      case .scalarValue(let us):
        sawOnlyASCII = sawOnlyASCII && us.value <= 0x7f
        utf16Count += width(us)
      case .emptyInput:
        return (utf16Count, sawOnlyASCII)
      case .error:
        guard repairingIllFormedSequences else {
          return nil
        }
        sawOnlyASCII = false
        utf16Count += width(UnicodeScalar(0xfffd))
      }
    }
  }
}
|
|
|
|
/// The former name of `UnicodeCodec`, kept as an unavailable alias so that
/// uses of the old name are diagnosed as renamed.
@available(*, unavailable, renamed: "UnicodeCodec")
public typealias UnicodeCodecType = UnicodeCodec
|
|
|
|
/// The former signature of `transcode`, kept as an unavailable declaration
/// whose message names the replacement.  It cannot be called.
@available(*, unavailable, message: "use 'transcode(_:from:to:stoppingOnError:sendingOutputTo:)'")
public func transcode<
  Input : IteratorProtocol,
  InputEncoding : UnicodeCodec,
  OutputEncoding : UnicodeCodec
  where InputEncoding.CodeUnit == Input.Element
>(
  inputEncoding: InputEncoding.Type, _ outputEncoding: OutputEncoding.Type,
  _ input: Input, _ output: (OutputEncoding.CodeUnit) -> Void,
  stoppingOnError stopOnError: Bool
) -> Bool {
  fatalError("unavailable function can't be called")
}
|
|
|
|
extension UTF16 {
  /// The former name of `transcodedLength`, kept as an unavailable
  /// declaration whose message names the replacement.  It cannot be called.
  @available(*, unavailable, message: "use 'transcodedLength(of:decodedAs:repairingIllFormedSequences:)'")
  public static func measure<
    Encoding : UnicodeCodec, Input : IteratorProtocol
    where Encoding.CodeUnit == Input.Element
  >(
    _: Encoding.Type, input: Input, repairIllFormedSequences: Bool
  ) -> (Int, Bool)? {
    fatalError("unavailable function can't be called")
  }
}
|