mirror of
https://github.com/apple/swift.git
synced 2026-06-20 15:42:51 +02:00
f08afd00c3
Fixes rdar://141789595
793 lines
26 KiB
Swift
793 lines
26 KiB
Swift
//===--- UTF16.swift ------------------------------------------------------===//
|
|
//
|
|
// This source file is part of the Swift.org open source project
|
|
//
|
|
// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
|
|
// Licensed under Apache License v2.0 with Runtime Library Exception
|
|
//
|
|
// See https://swift.org/LICENSE.txt for license information
|
|
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
extension Unicode {
  /// The UTF-16 encoding, exposed as `Unicode.UTF16`.
  ///
  /// NOTE(review): the enum's only case carries a `ForwardParser`; the
  /// `_swift3` prefix suggests a legacy-compatibility payload, but nothing in
  /// this file uses it — confirm before relying on it.
  @frozen public enum UTF16: Sendable {
    case _swift3Buffer(Unicode.UTF16.ForwardParser)
  }
}
|
|
|
|
extension Unicode.UTF16 {
|
|
/// Reports how many UTF-16 code units are needed to encode `x`.
///
/// A Unicode scalar value can need up to 21 bits, so a scalar that does not
/// fit in one 16-bit code unit is written as a *surrogate pair*: a *leading*
/// surrogate followed by a *trailing* surrogate.
///
///     let anA: Unicode.Scalar = "A"
///     print(anA.value)
///     // Prints "65"
///     print(UTF16.width(anA))
///     // Prints "1"
///
///     let anApple: Unicode.Scalar = "🍎"
///     print(anApple.value)
///     // Prints "127822"
///     print(UTF16.width(anApple))
///     // Prints "2"
///
/// - Parameter x: A Unicode scalar value.
/// - Returns: `1` when the value of `x` is at most `UInt16.max`; otherwise
///   `2`.
@inlinable
public static func width(_ x: Unicode.Scalar) -> Int {
  // Values above `UInt16.max` cannot be held by a single code unit.
  let fitsInOneCodeUnit = x.value <= UInt16.max
  return fitsInOneCodeUnit ? 1 : 2
}
|
|
|
|
/// Computes the leading (high) surrogate of the surrogate pair that encodes
/// `x` in UTF-16.
///
/// A Unicode scalar value can need up to 21 bits, so a scalar that does not
/// fit in one 16-bit code unit is written as a *surrogate pair*: a *leading*
/// surrogate followed by a *trailing* surrogate.
///
///     let apple: Unicode.Scalar = "🍎"
///     print(UTF16.leadSurrogate(apple))
///     // Prints "55356"
///
/// - Parameter x: A Unicode scalar value that requires a surrogate pair in
///   UTF-16. Check with `UTF16.width(x) == 2` before calling; the function
///   enforces this with a precondition.
/// - Returns: The leading surrogate code unit of `x` when encoded in UTF-16.
@inlinable
public static func leadSurrogate(_ x: Unicode.Scalar) -> UTF16.CodeUnit {
  _precondition(width(x) == 2)
  // Distance of the scalar from U+10000; bits 10 and above select the lead
  // surrogate.
  let offset = x.value - 0x1_0000
  let upperBits = offset &>> (10 as UInt32)
  return 0xD800 + UTF16.CodeUnit(truncatingIfNeeded: upperBits)
}
|
|
|
|
/// Computes the trailing (low) surrogate of the surrogate pair that encodes
/// `x` in UTF-16.
///
/// A Unicode scalar value can need up to 21 bits, so a scalar that does not
/// fit in one 16-bit code unit is written as a *surrogate pair*: a *leading*
/// surrogate followed by a *trailing* surrogate.
///
///     let apple: Unicode.Scalar = "🍎"
///     print(UTF16.trailSurrogate(apple))
///     // Prints "57166"
///
/// - Parameter x: A Unicode scalar value that requires a surrogate pair in
///   UTF-16. Check with `UTF16.width(x) == 2` before calling; the function
///   enforces this with a precondition.
/// - Returns: The trailing surrogate code unit of `x` when encoded in UTF-16.
@inlinable
public static func trailSurrogate(_ x: Unicode.Scalar) -> UTF16.CodeUnit {
  _precondition(width(x) == 2)
  // Distance of the scalar from U+10000; its low ten bits select the trail
  // surrogate.
  let offset = x.value - 0x1_0000
  let lowTenBitsMask = ((1 as UInt32) &<< 10) - 1
  return 0xDC00 + UTF16.CodeUnit(truncatingIfNeeded: offset & lowTenBitsMask)
}
|
|
|
|
/// Tells whether `x` is a high-surrogate (leading) code unit.
///
/// The example below checks each code unit of a string's `utf16` view. The
/// `apple` string holds one emoji character, which UTF-16 encodes as a
/// surrogate pair.
///
///     let apple = "🍎"
///     for unit in apple.utf16 {
///         print(UTF16.isLeadSurrogate(unit))
///     }
///     // Prints "true"
///     // Prints "false"
///
/// Only `x` itself is inspected. In particular, this method does not check
/// that a low-surrogate code unit follows `x`.
///
/// - Parameter x: A UTF-16 code unit.
/// - Returns: `true` if `x` is a high-surrogate code unit; otherwise,
///   `false`.
@inlinable
public static func isLeadSurrogate(_ x: CodeUnit) -> Bool {
  // The top six bits identify the surrogate kind.
  let kindBits = x & 0xFC00
  return kindBits == 0xD800
}
|
|
|
|
/// Tells whether `x` is a low-surrogate (trailing) code unit.
///
/// The example below checks each code unit of a string's `utf16` view. The
/// `apple` string holds one emoji character, which UTF-16 encodes as a
/// surrogate pair.
///
///     let apple = "🍎"
///     for unit in apple.utf16 {
///         print(UTF16.isTrailSurrogate(unit))
///     }
///     // Prints "false"
///     // Prints "true"
///
/// Only `x` itself is inspected. In particular, this method does not check
/// that a high-surrogate code unit precedes `x`.
///
/// - Parameter x: A UTF-16 code unit.
/// - Returns: `true` if `x` is a low-surrogate code unit; otherwise,
///   `false`.
@inlinable
public static func isTrailSurrogate(_ x: CodeUnit) -> Bool {
  // The top six bits identify the surrogate kind.
  let kindBits = x & 0xFC00
  return kindBits == 0xDC00
}
|
|
|
|
/// Tells whether `x` is a surrogate code unit of either kind (high or low).
///
/// - Parameter x: A UTF-16 code unit.
/// - Returns: `true` if `isLeadSurrogate(x)` or `isTrailSurrogate(x)` holds;
///   otherwise, `false`.
@_alwaysEmitIntoClient
public static func isSurrogate(_ x: CodeUnit) -> Bool {
  if isLeadSurrogate(x) { return true }
  return isTrailSurrogate(x)
}
|
|
|
|
/// Copies `count` string elements from `source` to `destination`.
///
/// When `T` and `U` have the same stride the elements are copied with a
/// single `_memcpy`. Otherwise each element is converted individually by
/// way of a UTF-16 code unit (`T._toUTF16CodeUnit` followed by
/// `U._fromUTF16CodeUnit`).
///
/// - Parameters:
///   - source: Pointer to the first element to read.
///   - destination: Pointer to the first element to write.
///   - count: The number of elements to copy.
@inlinable
public // @testable
static func _copy<T: _StringElement, U: _StringElement>(
  source: UnsafeMutablePointer<T>,
  destination: UnsafeMutablePointer<U>,
  count: Int
) {
  guard MemoryLayout<T>.stride == MemoryLayout<U>.stride else {
    // Different element widths: convert one element at a time.
    for offset in 0..<count {
      let unit = unsafe T._toUTF16CodeUnit((source + offset).pointee)
      unsafe (destination + offset).pointee = U._fromUTF16CodeUnit(unit)
    }
    return
  }

  // Same element width: a raw byte copy is sufficient.
  let byteCount = UInt(count) * UInt(MemoryLayout<U>.stride)
  unsafe _memcpy(
    dest: UnsafeMutablePointer(destination),
    src: UnsafeMutablePointer(source),
    size: byteCount)
}
|
|
|
|
/// Measures how many UTF-16 code units the given code unit sequence needs
/// once transcoded to UTF-16, and reports whether the sequence consisted
/// solely of ASCII.
///
/// The example below measures the UTF-16 length of the string
/// `"Fermata 𝄐"`, starting from its UTF-8 representation.
///
///     let fermata = "Fermata 𝄐"
///     let bytes = fermata.utf8
///     print(Array(bytes))
///     // Prints "[70, 101, 114, 109, 97, 116, 97, 32, 240, 157, 132, 144]"
///
///     let result = UTF16.transcodedLength(of: bytes.makeIterator(),
///                                         decodedAs: UTF8.self,
///                                         repairingIllFormedSequences: false)
///     print(result)
///     // Prints "Optional((count: 10, isASCII: false))"
///
/// - Parameters:
///   - input: An iterator of code units to measure, encoded as
///     `sourceEncoding`. When `repairingIllFormedSequences` is `true` the
///     iterator is consumed completely; otherwise iteration stops at the
///     first ill-formed sequence.
///   - sourceEncoding: The Unicode encoding of `input`.
///   - repairingIllFormedSequences: Pass `true` to keep measuring when
///     `input` contains ill-formed sequences; each one is counted as a
///     Unicode replacement character (`"\u{FFFD}"`). Pass `false` to stop
///     at the first ill-formed sequence.
/// - Returns: A tuple of the number of UTF-16 code units needed to encode
///   `input` and a Boolean that is `true` when `input` contained only ASCII.
///   Returns `nil` if `repairingIllFormedSequences` is `false` and an
///   ill-formed sequence is found.
@inlinable
public static func transcodedLength<
  Input: IteratorProtocol,
  Encoding: Unicode.Encoding
>(
  of input: Input,
  decodedAs sourceEncoding: Encoding.Type,
  repairingIllFormedSequences: Bool
) -> (count: Int, isASCII: Bool)?
  where Encoding.CodeUnit == Input.Element {

  var codeUnitCount = 0
  var iterator = input
  var parser = Encoding.ForwardParser()

  // Fast path for ASCII in a UTF8 buffer
  if sourceEncoding == Unicode.UTF8.self {
    var lastUnit: Encoding.CodeUnit = 0
    while let unit = iterator.next() {
      lastUnit = unit
      guard _fastPath(lastUnit < 0x80) else { break }
      codeUnitCount += 1
    }
    // The iterator ran dry without meeting a non-ASCII unit.
    if _fastPath(lastUnit < 0x80) { return (codeUnitCount, true) }

    // The non-ASCII unit has already been pulled from the iterator, so hand
    // it to the parser through its buffer before continuing generically.
    var primedParser = UTF8.ForwardParser()
    primedParser._buffer.append(numericCast(lastUnit))
    parser = _identityCast(primedParser, to: Encoding.ForwardParser.self)
  }

  // OR of every UTF-16 code unit produced; stays below 0x80 only if all of
  // them were ASCII.
  var codeUnitBits: CodeUnit = 0
  while true {
    let parsed = parser.parseScalar(from: &iterator)
    if _fastPath(parsed._valid != nil), let scalarContent = parsed._valid {
      let encoded = transcode(scalarContent, from: sourceEncoding)
        ._unsafelyUnwrappedUnchecked
      codeUnitCount += encoded.count
      for unit in encoded { codeUnitBits |= unit }
    }
    else if let _ = parsed._error {
      guard _fastPath(repairingIllFormedSequences) else { return nil }
      // Each ill-formed sequence is measured as one replacement code unit.
      codeUnitCount += 1
      codeUnitBits |= UTF16._replacementCodeUnit
    }
    else {
      // Neither a scalar nor an error: the input is exhausted.
      return (codeUnitCount, codeUnitBits < 0x80)
    }
  }
  fatalError()
}
|
|
}
|
|
|
|
extension Unicode.UTF16: _UnicodeEncoding {
|
|
/// A UTF-16 code unit: an unsigned 16-bit integer.
public typealias CodeUnit = UInt16

/// The storage for one scalar encoded in UTF-16: a small buffer of 16-bit
/// code units.
public typealias EncodedScalar = _UIntBuffer<UInt16>
|
|
|
|
/// The single code unit `0xFFFD`, used when an ill-formed sequence is
/// replaced.
@inlinable
internal static var _replacementCodeUnit: CodeUnit {
  @inline(__always) get {
    return 0xFFFD
  }
}
|
|
|
|
/// The replacement character `0xFFFD` as an encoded scalar occupying one
/// 16-bit code unit.
@inlinable
public static var encodedReplacementCharacter: EncodedScalar {
  let replacement = EncodedScalar(_storage: 0xFFFD, _bitCount: 16)
  return replacement
}
|
|
|
|
/// Tells whether the given code unit represents an ASCII scalar.
///
/// - Parameter x: A UTF-16 code unit.
/// - Returns: `true` if `x` is at most `0x7F`; otherwise, `false`.
@_alwaysEmitIntoClient
public static func isASCII(_ x: CodeUnit) -> Bool {
  let asciiMax: CodeUnit = 0x7F
  return x <= asciiMax
}
|
|
|
|
/// Tells whether `x` stands for a complete scalar on its own, that is,
/// whether it lies outside the surrogate range `0xD800...0xDFFF`.
///
/// - Parameter x: A UTF-16 code unit.
/// - Returns: `false` if the top five bits of `x` mark it as a surrogate;
///   otherwise, `true`.
@inlinable
public static func _isScalar(_ x: CodeUnit) -> Bool {
  let surrogateBits = x & 0xF800
  return surrogateBits != 0xD800
}
|
|
|
|
/// Combines a surrogate pair into the scalar it encodes.
///
/// - Parameters:
///   - lead: A high-surrogate code unit (checked by an internal invariant).
///   - trail: A low-surrogate code unit (checked by an internal invariant).
/// - Returns: The scalar `0x10000 + (low ten bits of lead << 10 | low ten
///   bits of trail)`, created without validation.
@inlinable
@inline(__always)
internal static func _decodeSurrogates(
  _ lead: CodeUnit,
  _ trail: CodeUnit
) -> Unicode.Scalar {
  _internalInvariant(isLeadSurrogate(lead))
  _internalInvariant(isTrailSurrogate(trail))
  let upperTen = UInt32(lead & 0x03ff) &<< 10
  let lowerTen = UInt32(trail & 0x03ff)
  return Unicode.Scalar(_unchecked: 0x10000 + (upperTen | lowerTen))
}
|
|
|
|
/// Converts a UTF-16 encoded scalar back into a `Unicode.Scalar`.
///
/// A 16-bit buffer holds the scalar value directly. A 32-bit buffer holds a
/// surrogate pair, with the lead surrogate in the low 16 bits of the
/// storage and the trail surrogate in the high 16 bits.
///
/// - Parameter source: The encoded scalar; its bit count is expected to be
///   16 or 32 (the latter checked by an internal invariant).
/// - Returns: The decoded scalar, created without validation.
@inlinable
public static func decode(_ source: EncodedScalar) -> Unicode.Scalar {
  let storage = source._storage
  if _fastPath(source._bitCount == 16) {
    return Unicode.Scalar(_unchecked: storage & 0xffff)
  }
  _internalInvariant(source._bitCount == 32)
  // Low ten bits of the trail surrogate (upper half of the storage).
  let trailBits: UInt32 = (storage >> 16) & 0x03ff
  // Low ten bits of the lead surrogate, moved to bits 10...19.
  let leadBits: UInt32 = (storage & 0x03ff) << 10
  return Unicode.Scalar(_unchecked: 0x10000 + (trailBits | leadBits))
}
|
|
|
|
/// Encodes a scalar as UTF-16.
///
/// Scalars below `1 << 16` are stored as one 16-bit code unit. Any other
/// scalar is stored as a surrogate pair in a 32-bit buffer, with the lead
/// surrogate in the low 16 bits and the trail surrogate in the high 16 bits.
///
/// - Parameter source: The scalar to encode.
/// - Returns: The encoded scalar. Every path in this implementation returns
///   a non-`nil` value.
@inlinable
public static func encode(
  _ source: Unicode.Scalar
) -> EncodedScalar? {
  let value = source.value
  let firstSupplementary = (1 as UInt32) << 16
  if _fastPath(value < firstSupplementary) {
    return EncodedScalar(_storage: value, _bitCount: 16)
  }
  let offset = value - firstSupplementary
  let trail = 0xdc00 + (offset & 0x3ff)
  let lead = 0xd800 + ((offset &>> 10) & 0x3ff)
  return EncodedScalar(_storage: (trail &<< 16) | lead, _bitCount: 32)
}
|
|
|
|
/// Converts one scalar encoded in `FromEncoding` into its UTF-16 encoding.
///
/// Three routes are taken depending on the source encoding:
/// - UTF-8: the payload bits are extracted from the encoded bytes directly,
///   handling 1-, 2-, 3- and 4-byte forms in turn.
/// - UTF-16: the content is already in the right form and is reinterpreted.
/// - Anything else: the content is decoded to a scalar and re-encoded.
///
/// - Parameters:
///   - content: The scalar, encoded in `FromEncoding`.
///   - _: The source encoding type.
/// - Returns: The scalar encoded as UTF-16.
@inlinable
@inline(__always)
public static func transcode<FromEncoding: Unicode.Encoding>(
  _ content: FromEncoding.EncodedScalar, from _: FromEncoding.Type
) -> EncodedScalar? {
  if _fastPath(FromEncoding.self == UTF8.self) {
    let utf8 = _identityCast(content, to: UTF8.EncodedScalar.self)
    // Bytes still to be folded in after the one handled so far.
    var remaining = utf8.count
    remaining = remaining &- 1
    if _fastPath(remaining == 0) {
      // Single byte: seven payload bits.
      return EncodedScalar(
        _storage: (utf8._biasedBits &- 0x1) & 0b0__111_1111, _bitCount: 16)
    }
    // NOTE(review): `_biasedBits` appears to hold each byte plus one; the
    // subtraction below removes that from all four byte lanes — confirm
    // against the UTF-8 encoded-scalar definition.
    var pending = utf8._biasedBits &- 0x01010101
    var bits = pending
    bits &<<= 6
    pending &>>= 8
    bits |= pending & 0b0__11_1111
    remaining = remaining &- 1

    if _fastPath(remaining == 0) {
      // Two bytes: eleven payload bits.
      return EncodedScalar(_storage: bits & 0b0__111_1111_1111, _bitCount: 16)
    }
    bits &<<= 6
    pending &>>= 8
    bits |= pending & 0b0__11_1111
    remaining = remaining &- 1

    if _fastPath(remaining == 0) {
      // Three bytes: sixteen payload bits.
      return EncodedScalar(_storage: bits & 0xFFFF, _bitCount: 16)
    }

    // Four bytes: twenty-one payload bits, which may need a surrogate pair.
    bits &<<= 6
    pending &>>= 8
    bits |= pending & 0b0__11_1111
    bits &= (1 &<< 21) - 1
    return encode(Unicode.Scalar(_unchecked: bits))
  }
  else if _fastPath(FromEncoding.self == UTF16.self) {
    return unsafe unsafeBitCast(content, to: UTF16.EncodedScalar.self)
  }
  return encode(FromEncoding.decode(content))
}
|
|
|
|
/// State of the forward UTF-16 parser: a buffer of 16-bit code units.
@frozen
public struct ForwardParser: Sendable {
  public typealias _Buffer = _UIntBuffer<UInt16>

  /// Code units held by the parser.
  public var _buffer: _Buffer

  /// Creates a parser whose buffer is empty.
  @inlinable
  public init() {
    _buffer = _Buffer()
  }
}
|
|
|
|
/// State of the reverse UTF-16 parser: a buffer of 16-bit code units.
@frozen
public struct ReverseParser: Sendable {
  public typealias _Buffer = _UIntBuffer<UInt16>

  /// Code units held by the parser.
  public var _buffer: _Buffer

  /// Creates a parser whose buffer is empty.
  @inlinable
  public init() {
    _buffer = _Buffer()
  }
}
|
|
}
|
|
|
|
extension UTF16.ReverseParser: Unicode.Parser, _UTFParser {
  public typealias Encoding = Unicode.UTF16

  /// Classifies the buffered code units when the first one is a surrogate.
  ///
  /// - Returns: `(true, 32)` when the low 32 bits of the buffer hold a trail
  ///   surrogate in the low half and a lead surrogate in the high half;
  ///   otherwise `(false, 16)`, i.e. a single ill-formed code unit.
  @inlinable
  public func _parseMultipleCodeUnits() -> (isValid: Bool, bitCount: UInt8) {
    _internalInvariant( // this case handled elsewhere
      !Encoding._isScalar(UInt16(truncatingIfNeeded: _buffer._storage)))
    let holdsSurrogatePair = _buffer._storage & 0xFC00_FC00 == 0xD800_DC00
    if _fastPath(holdsSurrogatePair) {
      return (true, 32)
    }
    return (false, 16)
  }

  /// Extracts the first `bitCount` bits of the buffer as an encoded scalar.
  ///
  /// The two 16-bit halves of the storage are exchanged and the result is
  /// shifted right by `32 - bitCount`.
  ///
  /// - Parameter bitCount: The width of the scalar in bits.
  /// - Returns: An encoded scalar with the given bit count.
  @inlinable
  public func _bufferedScalar(bitCount: UInt8) -> Encoding.EncodedScalar {
    let halvesSwapped = _buffer._storage &<< 16 | _buffer._storage &>> 16
    return Encoding.EncodedScalar(
      _storage: halvesSwapped &>> (32 - bitCount),
      _bitCount: bitCount
    )
  }
}
|
|
|
|
extension Unicode.UTF16.ForwardParser: Unicode.Parser, _UTFParser {
  public typealias Encoding = Unicode.UTF16

  /// Classifies the buffered code units when the first one is a surrogate.
  ///
  /// - Returns: `(true, 32)` when the low 32 bits of the buffer hold a lead
  ///   surrogate in the low half and a trail surrogate in the high half;
  ///   otherwise `(false, 16)`, i.e. a single ill-formed code unit.
  @inlinable
  public func _parseMultipleCodeUnits() -> (isValid: Bool, bitCount: UInt8) {
    _internalInvariant( // this case handled elsewhere
      !Encoding._isScalar(UInt16(truncatingIfNeeded: _buffer._storage)))
    let holdsSurrogatePair = _buffer._storage & 0xFC00_FC00 == 0xDC00_D800
    if _fastPath(holdsSurrogatePair) {
      return (true, 32)
    }
    return (false, 16)
  }

  /// Extracts the first `bitCount` bits of the buffer as an encoded scalar.
  ///
  /// - Parameter bitCount: The width of the scalar in bits.
  /// - Returns: A copy of the buffer whose bit count is set to `bitCount`.
  @inlinable
  public func _bufferedScalar(bitCount: UInt8) -> Encoding.EncodedScalar {
    var scalar = _buffer
    scalar._bitCount = bitCount
    return scalar
  }
}
|
|
|
|
/// Outcome of transcoding one scalar on the non-vectorized path.
private enum ScalarFallbackResult: UInt8 {
  /// An ill-formed sequence was met and repairing was not requested.
  case invalid
  /// An ASCII code unit was written as a single UTF-8 byte.
  case singleByte
  /// A non-ASCII scalar, or a replacement character, was written as several
  /// UTF-8 bytes.
  case multiByte
}
|
|
|
|
// The integer type used to scan several UTF-16 code units at once: 64 bits
// wide on arm64_32, pointer-sized elsewhere.
#if arch(arm64_32)
private typealias Word = UInt64
#else
private typealias Word = UInt
#endif

/// Bits that must all be zero in a `Word` for every 16-bit lane in it to be
/// ASCII (each lane masked with `0xFF80`).
@_transparent
private var mask: Word {
  return Word(truncatingIfNeeded: 0xFF80FF80_FF80FF80 as UInt64)
}

/// Four words loaded and tested together by `allASCIIBlock(at:)`.
private typealias Block = (Word, Word, Word, Word)
|
|
|
|
// `blockSize` is the number of UTF-16 code units examined per step, and
// `allASCIIBlock(at:)` returns those code units narrowed to bytes when all of
// them are ASCII, or `nil` otherwise. Three variants exist: 8 code units on
// 32-bit pointers (other than arm64_32), 16 code units otherwise, and a
// one-code-unit version when vector types are unavailable.
#if SWIFT_STDLIB_ENABLE_VECTOR_TYPES
#if _pointerBitWidth(_32) && !arch(arm64_32)
@_transparent
private var blockSize: Int {
  return 8
}

@_transparent
private func allASCIIBlock(at pointer: UnsafePointer<UInt16>) -> SIMD8<UInt8>? {
  let block = unsafe UnsafeRawPointer(pointer).loadUnaligned(as: Block.self)
  // OR the words together so one mask test covers every code unit.
  let combined = block.0 | block.1 | block.2 | block.3
  guard combined & mask == 0 else { return nil }
  // Reinterpret as bytes and keep the even-indexed half.
  return unsafe unsafeBitCast(block, to: SIMD16<UInt8>.self).evenHalf
}
#else
@_transparent
private var blockSize: Int {
  return 16
}

@_transparent
private func allASCIIBlock(at pointer: UnsafePointer<UInt16>) -> SIMD16<UInt8>? {
  let block = unsafe UnsafeRawPointer(pointer).loadUnaligned(as: Block.self)
  // OR the words together so one mask test covers every code unit.
  let combined = block.0 | block.1 | block.2 | block.3
  guard combined & mask == 0 else { return nil }
  // Reinterpret as bytes and keep the even-indexed half.
  return unsafe unsafeBitCast(block, to: SIMD32<UInt8>.self).evenHalf
}
#endif
#else
@_transparent
private var blockSize: Int {
  return 1
}

@_transparent
private func allASCIIBlock(at pointer: UnsafePointer<UInt16>) -> CollectionOfOne<UInt8>? {
  let value = unsafe pointer.pointee
  guard value & 0xFF80 == 0 else { return nil }
  return CollectionOfOne(UInt8(truncatingIfNeeded: value))
}
#endif
|
|
|
|
/// Largest scalar value that fits in a two-byte UTF-8 sequence.
@_transparent private var utf8TwoByteMax: UInt32 { return 0x7FF }

/// First code unit value of the lead-surrogate range.
@_transparent private var utf16LeadSurrogateMin: UInt32 { return 0xD800 }

/// First code unit value of the trail-surrogate range.
@_transparent private var utf16TrailSurrogateMin: UInt32 { return 0xDC00 }

/// The replacement character written in place of ill-formed input.
@_transparent private var utf16ReplacementCharacter: UInt32 { return 0xFFFD }

/// Largest scalar value accepted by `encodeScalarAsUTF8`.
@_transparent private var utf16ScalarMax: UInt32 { return 0x10FFFF }

/// Largest scalar value that fits in a single UTF-16 code unit.
@_transparent private var utf16BasicMultilingualPlaneMax: UInt32 { return 0xFFFF }

/// Smallest scalar value that needs a surrogate pair.
@_transparent private var utf16AstralPlaneMin: UInt32 { return 0x10000 }
|
|
|
|
/*
 Writes a non-ASCII scalar to `output` as 2, 3 or 4 UTF-8 bytes and advances
 `output` past the bytes written.

 This could be expressed more concisely with the other transcoding
 primitives in the stdlib, but at least as of July 2025 doing that makes
 processing runs of non-ASCII several times slower.

 - scalar: the value to encode; must be in 0x80...0x10FFFF (checked by debug
   preconditions only).
 - output: the write position; no bounds check is performed here.
 */
@inline(__always)
private func encodeScalarAsUTF8(
  _ scalar: UInt32,
  output: inout UnsafeMutablePointer<Unicode.UTF8.CodeUnit>
) {
  _debugPrecondition(scalar >= 0x80)
  _debugPrecondition(scalar <= utf16ScalarMax)
  if scalar <= utf8TwoByteMax {
    // Scalar fits in 11 bits
    // 2 byte UTF8 is 0b110[top 5 bits] 0b10[bottom 6 bits]
    let first = 0b1100_0000 | UInt8((scalar >> 6) & 0b01_1111)
    let second = 0b1000_0000 | UInt8(scalar & 0b11_1111)
    unsafe output[0] = first
    unsafe output[1] = second
    unsafe output += 2
  } else if scalar <= utf16BasicMultilingualPlaneMax {
    // Scalar fits in 16 bits
    // 3 byte UTF8 is 0b1110[top 4 bits] 0b10[middle 6 bits] 0b10[bottom 6 bits]
    let first = 0b1110_0000 | UInt8((scalar >> 12) & 0b1111)
    let second = 0b1000_0000 | UInt8((scalar >> 6) & 0b11_1111)
    let third = 0b1000_0000 | UInt8(scalar & 0b11_1111)
    unsafe output[0] = first
    unsafe output[1] = second
    unsafe output[2] = third
    unsafe output += 3
  } else if scalar <= utf16ScalarMax {
    // Scalar fits in 21 bits.
    // 0b11110[top 3] 0b10[upper middle 6] 0b10[lower middle 6] 0b10[bottom 6]
    let first = 0b1111_0000 | UInt8((scalar >> 18) & 0b0111)
    let second = 0b1000_0000 | UInt8((scalar >> 12) & 0b11_1111)
    let third = 0b1000_0000 | UInt8((scalar >> 6) & 0b11_1111)
    let fourth = 0b1000_0000 | UInt8(scalar & 0b11_1111)
    unsafe output[0] = first
    unsafe output[1] = second
    unsafe output[2] = third
    unsafe output[3] = fourth
    unsafe output += 4
  } else {
    // Excluded by the precondition on `scalar`.
    Builtin.unreachable()
  }
}
|
|
|
|
/// Transcodes the scalar that starts with the non-ASCII code unit `cu` to
/// UTF-8.
///
/// `input` is advanced past the code unit(s) consumed: two for a well-formed
/// surrogate pair, one in every other case. This includes the ill-formed
/// cases, where `input` has already moved even when `.invalid` is returned.
///
/// - Parameters:
///   - cu: The code unit at `input`.
///   - input: The read position; advanced by this call.
///   - inputEnd: One past the last readable code unit, used to decide
///     whether a trail surrogate can follow a lead surrogate.
///   - output: The write position; advanced past the bytes written.
///   - repairing: Whether ill-formed input is replaced by U+FFFD instead of
///     being reported.
/// - Returns: `.invalid` with `repairsMade: false` when the input is
///   ill-formed and `repairing` is `false` (nothing is written); otherwise
///   `.multiByte`, with `repairsMade` set when a replacement was written.
@inline(__always)
private func processNonASCIIScalarFallback(
  _ cu: UInt16,
  input: inout UnsafePointer<UInt16>,
  inputEnd: UnsafePointer<UInt16>,
  output: inout UnsafeMutablePointer<Unicode.UTF8.CodeUnit>,
  repairing: Bool
) -> (ScalarFallbackResult, repairsMade: Bool) {
  var scalar: UInt32 = 0
  var isIllFormed = false
  if _slowPath(UTF16.isLeadSurrogate(cu)) {
    let trailPosition = unsafe input + 1
    let hasTrail = unsafe trailPosition < inputEnd
      && UTF16.isTrailSurrogate(trailPosition.pointee)
    if hasTrail {
      /*
       Code points outside the BMP are encoded as:
       value -= smallest non-BMP code point
       lead = smallest leading surrogate + high 10 bits of value
       trail = smallest trailing surrogate + low 10 bits of value
       */
      let trail = unsafe trailPosition.pointee
      let highBits = (UInt32(cu) - utf16LeadSurrogateMin) << 10
      let lowBits = UInt32(trail) - utf16TrailSurrogateMin
      scalar = utf16AstralPlaneMin + highBits + lowBits
      unsafe input += 2
    } else {
      // Leading surrogate with no room for a trailing one, or followed by
      // something that is not a trailing surrogate.
      isIllFormed = true
      unsafe input += 1
    }
  } else if _slowPath(UTF16.isTrailSurrogate(cu)) {
    // Trailing surrogate with no leading one before it.
    isIllFormed = true
    unsafe input += 1
  } else {
    scalar = UInt32(cu)
    unsafe input += 1
  }

  if _slowPath(isIllFormed || scalar > utf16ScalarMax) {
    guard repairing else { return (.invalid, repairsMade: false) }
    unsafe encodeScalarAsUTF8(utf16ReplacementCharacter, output: &output)
    return (.multiByte, repairsMade: true)
  }
  unsafe encodeScalarAsUTF8(scalar, output: &output)
  return (.multiByte, repairsMade: false)
}
|
|
|
|
/// Transcodes the scalar at `input` to UTF-8, one scalar at a time.
///
/// An ASCII code unit is written directly as one byte; anything else is
/// delegated to `processNonASCIIScalarFallback`.
///
/// - Parameters:
///   - input: The read position; advanced past the code unit(s) consumed.
///   - inputEnd: One past the last readable code unit.
///   - output: The write position; advanced past the bytes written.
///   - repairing: Whether ill-formed input is replaced instead of reported.
/// - Returns: `.singleByte` with `repairsMade: false` for ASCII; otherwise
///   whatever `processNonASCIIScalarFallback` returns.
@inline(__always)
private func processScalarFallback(
  input: inout UnsafePointer<Unicode.UTF16.CodeUnit>,
  inputEnd: UnsafePointer<Unicode.UTF16.CodeUnit>,
  output: inout UnsafeMutablePointer<Unicode.UTF8.CodeUnit>,
  repairing: Bool
) -> (ScalarFallbackResult, repairsMade: Bool) {
  let cu = unsafe input.pointee
  guard Unicode.UTF16.isASCII(cu) else {
    // Scalar fallback for this code unit
    return unsafe processNonASCIIScalarFallback(
      cu,
      input: &input,
      inputEnd: inputEnd,
      output: &output,
      repairing: repairing
    )
  }
  unsafe output.initialize(to: UInt8(truncatingIfNeeded: cu))
  unsafe input += 1
  unsafe output += 1
  return (.singleByte, repairsMade: false)
}
|
|
|
|
/// Transcodes one block of `blockSize` code units scalar by scalar; used
/// when the block contains something other than ASCII.
///
/// - Parameters:
///   - input: The read position; advanced past everything consumed, which
///     can be one code unit beyond the block when a surrogate pair straddles
///     its end.
///   - inputEnd: One past the last readable code unit.
///   - output: The write position; advanced past the bytes written.
///   - repairing: Whether ill-formed input is replaced instead of reported.
/// - Returns: `false` as the first element if an ill-formed sequence was
///   reported, `true` otherwise; `repairsMade` tells whether any replacement
///   character was written during this call.
private func processNonASCIIChunk(
  input: inout UnsafePointer<UInt16>,
  inputEnd: UnsafePointer<UInt16>,
  output: inout UnsafeMutablePointer<UInt8>,
  repairing: Bool
) -> (Bool, repairsMade: Bool) {
  var anyRepairs = false
  // Bound by position, not iteration count: a single call can consume a
  // surrogate pair (2 code units), so a fixed `blockSize` iteration count
  // would overrun into the next block. Matches utf8Length's scalar loop.
  let chunkEnd = unsafe input + blockSize
  while unsafe input < chunkEnd {
    let (outcome, repairedHere) = unsafe processScalarFallback(
      input: &input,
      inputEnd: inputEnd,
      output: &output,
      repairing: repairing
    )
    switch outcome {
    case .invalid:
      return (false, repairsMade: anyRepairs || repairedHere)
    case .multiByte:
      anyRepairs = anyRepairs || repairedHere
    case .singleByte:
      break
    }
  }
  return (true, repairsMade: anyRepairs)
}
|
|
|
|
/*
 Transcodes a buffer of UTF-16 code units to UTF-8.

 This is only ever called after validating the buffer size with
 utf8Length(of:repairing:), so it does not check for end of the output
 buffer. Don't call it if you haven't done that first!

 Returns the number of UTF-8 bytes written and whether any ill-formed
 sequence was replaced. When `repairing` is false, transcoding stops at the
 first ill-formed sequence and the count covers only what was written before
 it. Returns (0, false) if either buffer is empty.
 */
internal func transcodeUTF16ToUTF8(
  UTF16CodeUnits: UnsafeBufferPointer<Unicode.UTF16.CodeUnit>,
  intoKnownSufficientlyLarge outputBuffer: UnsafeMutableBufferPointer<Unicode.UTF8.CodeUnit>,
  repairing: Bool = true
) -> (Int, repairsMade: Bool) {
  let inCount = UTF16CodeUnits.count
  guard inCount > 0, outputBuffer.count > 0 else {
    return (0, repairsMade: false)
  }
  var input = unsafe UTF16CodeUnits.baseAddress.unsafelyUnwrapped
  let inputEnd = unsafe input + inCount
  let outputStart = unsafe outputBuffer.baseAddress.unsafelyUnwrapped
  var output = unsafe outputStart
  var anyRepairs = false

  // Whole blocks: copy all-ASCII blocks directly, otherwise go scalar by
  // scalar for that block.
  while unsafe (inputEnd - input) >= blockSize {
    if let asciiBlock = unsafe allASCIIBlock(at: input) {
      _onFastPath()
      // All ASCII: transcode directly
      for index in 0 ..< blockSize {
        unsafe (output + index).initialize(to: asciiBlock[index])
      }
      unsafe input += blockSize
      unsafe output += blockSize
    } else {
      let (succeeded, repairedHere) = unsafe processNonASCIIChunk(
        input: &input,
        inputEnd: inputEnd,
        output: &output,
        repairing: repairing
      )
      anyRepairs = anyRepairs || repairedHere
      guard succeeded else {
        return unsafe (output - outputStart, repairsMade: anyRepairs)
      }
    }
  }

  // Finish any remaining code units using fallback scalar loop
  while unsafe input < inputEnd {
    let (outcome, repairedHere) = unsafe processScalarFallback(
      input: &input,
      inputEnd: inputEnd,
      output: &output,
      repairing: repairing
    )
    anyRepairs = anyRepairs || repairedHere
    if case .invalid = outcome {
      return unsafe (output - outputStart, repairsMade: anyRepairs)
    }
  }
  return unsafe (output - outputStart, repairsMade: anyRepairs)
}
|
|
|
|
/// Counts the UTF-8 bytes needed for the UTF-16 code units in
/// `input..<end`, one scalar at a time.
///
/// `end` is where this call stops; `inputEnd` is the real end of the buffer.
/// The two differ when the caller walks a larger buffer block by block. A
/// lead surrogate at `end - 1` may pair with a trail surrogate at `end`
/// (past the block but still inside the buffer); the pair is then consumed
/// and `input` finishes one position past `end`.
///
/// - Parameters:
///   - input: The read position; advanced past everything counted.
///   - end: The position at which this call stops starting new scalars.
///   - inputEnd: One past the last readable code unit.
///   - repairing: Whether an unpaired surrogate is counted as U+FFFD
///     (3 bytes) instead of aborting the count.
/// - Returns: The number of UTF-8 bytes, or `nil` if an unpaired surrogate
///   is found while `repairing` is `false`. In that case `input` is left
///   pointing at the offending code unit.
@inline(__always)
private func utf8Length(
  input: inout UnsafePointer<Unicode.UTF16.CodeUnit>,
  end: UnsafePointer<Unicode.UTF16.CodeUnit>,
  inputEnd: UnsafePointer<Unicode.UTF16.CodeUnit>,
  repairing: Bool
) -> Int? {
  var total = 0
  while unsafe input < end {
    let cu = unsafe input.pointee
    // UTF-8 bytes produced and UTF-16 code units consumed by this scalar.
    let step: (bytes: Int, units: Int)
    if cu < 0x80 {
      step = (1, 1)
    } else if cu < 0x800 {
      step = (2, 1)
    } else if UTF16.isLeadSurrogate(cu) {
      // Check for a valid surrogate pair, allowing the trail to sit just past
      // `end` as long as it's still within the buffer.
      let next = unsafe input + 1
      if unsafe next < inputEnd && UTF16.isTrailSurrogate(next.pointee) {
        step = (4, 2)
      } else if repairing {
        step = (3, 1) // U+FFFD replacement character
      } else {
        return nil
      }
    } else if UTF16.isTrailSurrogate(cu) {
      // Unpaired low surrogate
      guard repairing else { return nil }
      step = (3, 1) // U+FFFD replacement character
    } else {
      step = (3, 1) // BMP non-surrogate
    }
    total &+= step.bytes
    unsafe input += step.units
  }
  return total
}
|
|
|
|
/// Computes the number of UTF-8 bytes needed to transcode a buffer of
/// UTF-16 code units, and whether the buffer is entirely ASCII.
///
/// - Parameters:
///   - UTF16CodeUnits: The code units to measure.
///   - repairing: Whether an unpaired surrogate is counted as U+FFFD instead
///     of aborting the measurement. Defaults to `true`.
/// - Returns: The byte count and an `isASCII` flag, or `nil` if an unpaired
///   surrogate is found while `repairing` is `false`. An empty buffer yields
///   `(0, isASCII: true)`.
internal func utf8Length(
  of UTF16CodeUnits: UnsafeBufferPointer<Unicode.UTF16.CodeUnit>,
  repairing: Bool = true
) -> (Int, isASCII: Bool)? {
  let inCount = UTF16CodeUnits.count
  guard inCount > 0 else { return (0, isASCII: true) }
  var input = unsafe UTF16CodeUnits.baseAddress.unsafelyUnwrapped
  let inputEnd = unsafe input + inCount
  var byteCount = 0

  // For each UTF-16 code unit:
  //   U+0000..U+007F → 1 UTF-8 byte (ASCII)
  //   U+0080..U+07FF → 2 UTF-8 bytes
  //   U+0800..U+D7FF → 3 UTF-8 bytes (BMP, non-surrogate)
  //   U+D800..U+DBFF → high surrogate (4 UTF-8 bytes for the pair)
  //   U+DC00..U+DFFF → low surrogate (consumed by high surrogate)
  //   U+E000..U+FFFF → 3 UTF-8 bytes (BMP, non-surrogate)

  while unsafe (inputEnd - input) >= blockSize {
    if unsafe allASCIIBlock(at: input) != nil {
      _onFastPath()
      // Every code unit in the block is one byte.
      unsafe input += blockSize
      byteCount += blockSize
    } else {
      let blockEnd = unsafe Swift.min(input + blockSize, inputEnd)
      let blockBytes = unsafe utf8Length(
        input: &input,
        end: blockEnd,
        inputEnd: inputEnd,
        repairing: repairing
      )
      guard let blockBytes = blockBytes else { return nil }
      byteCount &+= blockBytes
    }
  }

  // Finish any remaining code units that didn't fill a full block
  let tailBytes = unsafe utf8Length(
    input: &input,
    end: inputEnd,
    inputEnd: inputEnd,
    repairing: repairing
  )
  guard let tailBytes = tailBytes else { return nil }
  byteCount &+= tailBytes

  // Only ASCII maps one code unit to exactly one byte, so equal counts mean
  // the whole buffer was ASCII.
  return (byteCount, isASCII: byteCount == inCount)
}
|