mirror of
https://github.com/apple/swift.git
synced 2025-12-14 20:36:38 +01:00
* Simplify conforming to ExpressibleByStringLiteral with default implementations * attributes on default implementations * ExpressibleByUnicodeScalarLiteral validation test * more generic default implementations * clean up test * remove unneeded implementations * remove test verification * indent * revert @effects and affected methods * fix test generics with _ protocols * Add semantic tests * clean up tests * Fix redundant conformance requirements
447 lines
16 KiB
Swift
447 lines
16 KiB
Swift
//===----------------------------------------------------------------------===//
|
|
//
|
|
// This source file is part of the Swift.org open source project
|
|
//
|
|
// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
|
|
// Licensed under Apache License v2.0 with Runtime Library Exception
|
|
//
|
|
// See https://swift.org/LICENSE.txt for license information
|
|
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
/// A single extended grapheme cluster, which approximates a user-perceived
|
|
/// character.
|
|
///
|
|
/// The `Character` type represents a character made up of one or more Unicode
|
|
/// scalar values, grouped by a Unicode boundary algorithm. Generally, a
|
|
/// `Character` instance matches what the reader of a string will perceive as
|
|
/// a single character. The number of visible characters is generally the most
|
|
/// natural way to count the length of a string.
|
|
///
|
|
/// let greeting = "Hello! 🐥"
|
|
/// print("Character count: \(greeting.characters.count)")
|
|
/// // Prints "Character count: 8"
|
|
///
|
|
/// Because each character in a string can be made up of one or more Unicode
|
|
/// code points, the number of characters in a string may not match the length
|
|
/// of the Unicode code point representation or the length of the string in a
|
|
/// particular binary representation.
|
|
///
|
|
/// print("Unicode code point count: \(greeting.unicodeScalars.count)")
|
|
/// // Prints "Unicode code point count: 8"
|
|
///
|
|
/// print("UTF-8 representation count: \(greeting.utf8.count)")
|
|
/// // Prints "UTF-8 representation count: 11"
|
|
///
|
|
/// Every `Character` instance is composed of one or more Unicode code points
|
|
/// that are grouped together as an *extended grapheme cluster*. The way these
|
|
/// code points are grouped is defined by a canonical, localized, or otherwise
|
|
/// tailored Unicode segmentation algorithm.
|
|
///
|
|
/// For example, a country's Unicode flag character is made up of two regional
|
|
/// indicator code points that correspond to that country's ISO 3166-1 alpha-2
|
|
/// code. The alpha-2 code for The United States is "US", so its flag
|
|
/// character is made up of the Unicode code points `"\u{1F1FA}"` (REGIONAL
|
|
/// INDICATOR SYMBOL LETTER U) and `"\u{1F1F8}"` (REGIONAL INDICATOR SYMBOL
|
|
/// LETTER S). When placed next to each other in a Swift string literal, these
|
|
/// two code points are combined into a single grapheme cluster, represented
|
|
/// by a `Character` instance in Swift.
|
|
///
|
|
/// let usFlag: Character = "\u{1F1FA}\u{1F1F8}"
|
|
/// print(usFlag)
|
|
/// // Prints "🇺🇸"
|
|
///
|
|
/// For more information about the Unicode terms used in this discussion, see
|
|
/// the [Unicode.org glossary][glossary]. In particular, this discussion
|
|
/// mentions [extended grapheme clusters][clusters] and [Unicode scalar
|
|
/// values][scalars].
|
|
///
|
|
/// [glossary]: http://www.unicode.org/glossary/
|
|
/// [clusters]: http://www.unicode.org/glossary/#extended_grapheme_cluster
|
|
/// [scalars]: http://www.unicode.org/glossary/#unicode_scalar_value
|
|
public struct Character :
  _ExpressibleByBuiltinExtendedGraphemeClusterLiteral,
  ExpressibleByExtendedGraphemeClusterLiteral, Hashable {

  // Storage strategy: conceptually a `Character` is just a `String`, but the
  // common case -- a UTF-8 encoding that fits in 63 bits -- is stored inline.
  // The one bit left over in a 64-bit word tells the small and large
  // representations apart. Unused bytes of the small form hold 0xFF.
  //
  // A grapheme cluster that can be represented as `.small` should be
  // represented as `.small`.
  @_versioned
  internal enum Representation {
    // Storage of a _StringBuffer whose first grapheme cluster is this
    // character.
    // NOTE: the buffer may be more than 1 Character long.
    case large(_StringBuffer._Storage)
    // UTF-8 code units packed into a word (first code unit in the lowest
    // byte) and truncated to 63 bits.
    case small(Builtin.Int63)
  }

  /// Creates a character containing the given Unicode scalar value.
  ///
  /// - Parameter scalar: The Unicode scalar value to convert into a character.
  public init(_ scalar: UnicodeScalar) {
    var packed: UInt64 = 0
    var bitOffset: UInt64 = 0

    // Each emitted code unit lands one byte higher than the previous one.
    UTF8.encode(scalar, into: { codeUnit in
      packed |= UInt64(codeUnit) &<< bitOffset
      bitOffset += 8
    })

    // Mark every byte above the encoded code units as unused (0xFF).
    packed |= (~0) &<< bitOffset
    _representation = .small(Builtin.trunc_Int64_Int63(packed._value))
  }

  /// Creates a character from a builtin Unicode scalar literal by building a
  /// one-scalar string from the UTF-32 value and converting that string.
  @effects(readonly)
  public init(_builtinUnicodeScalarLiteral value: Builtin.Int32) {
    let utf32 = CollectionOfOne(UInt32(value))
    let scalarString = String._fromWellFormedCodeUnitSequence(
      UTF32.self, input: utf32)
    self = Character(scalarString)
  }

  /// Creates a character from a builtin extended-grapheme-cluster literal.
  ///
  /// - Parameters:
  ///   - start: Pointer to the literal's UTF-8 code units.
  ///   - utf8CodeUnitCount: Number of UTF-8 code units at `start`.
  ///   - isASCII: Forwarded unchanged to the `String` initializer when the
  ///     large representation is needed.
  @effects(readonly)
  public init(
    _builtinExtendedGraphemeClusterLiteral start: Builtin.RawPointer,
    utf8CodeUnitCount: Builtin.Word,
    isASCII: Builtin.Int1
  ) {
    // Most character literals are at most eight UTF-8 code units long; for
    // those, try to build the small representation directly.
    let wordByteCount = MemoryLayout<UInt64>.size
    if _fastPath(Int(utf8CodeUnitCount) <= wordByteCount) {
      // Start from all-ones so bytes the copy doesn't touch stay 0xFF.
      var scratch: UInt64 = ~0
      _memcpy(
        dest: UnsafeMutableRawPointer(Builtin.addressof(&scratch)),
        src: UnsafeMutableRawPointer(start),
        size: UInt(utf8CodeUnitCount))
      // The byte-wise copy laid the code units out in little-endian order;
      // convert the word to host endianness.
      let packed = UInt64(littleEndian: scratch)
      let topBitIndex = wordByteCount &* 8 &- 1
      // Truncation to 63 bits discards the top bit, and `_smallValue` puts a
      // 1 back when the value is read. The small form is therefore only
      // usable when the top bit is already set.
      let topBitIsSet: Bool = packed & (1 &<< numericCast(topBitIndex)) != 0
      if _fastPath(topBitIsSet) {
        _representation = .small(Builtin.trunc_Int64_Int63(packed._value))
        return
      }
    }
    // Anything that doesn't fit in 63 bits gets the large representation.
    let literalString = String(
      _builtinExtendedGraphemeClusterLiteral: start,
      utf8CodeUnitCount: utf8CodeUnitCount,
      isASCII: isASCII)
    self = Character(_largeRepresentationString: literalString)
  }

  /// Creates a character with the specified value.
  ///
  /// Do not call this initializer directly. It is used by the compiler when
  /// you use a string literal to initialize a `Character` instance. For
  /// example:
  ///
  ///     let oBreve: Character = "o\u{306}"
  ///     print(oBreve)
  ///     // Prints "ŏ"
  ///
  /// The assignment to the `oBreve` constant calls this initializer behind the
  /// scenes.
  public init(extendedGraphemeClusterLiteral value: Character) {
    self = value
  }

  /// Creates a character from a single-character string.
  ///
  /// The following example creates a new character from the uppercase version
  /// of a string that only holds one character.
  ///
  ///     let a = "a"
  ///     let capitalA = Character(a.uppercased())
  ///
  /// - Parameter s: The single-character string to convert to a `Character`
  ///   instance. `s` must contain exactly one extended grapheme cluster.
  public init(_ s: String) {
    // The small representation holds up to 8 code units provided the last
    // one is a continuation byte. The high bit of the last byte doubles as
    // the enum's discriminator and has to be reconstructed on the way out,
    // so 0x7f can't live in the final byte: it would be indistinguishable
    // from an unused 0xFF byte. Rather than special-casing other one-byte
    // code points there, decoding is kept simple by never letting a code
    // point start in the last byte and assuming its high bit is 1.
    _precondition(
      s._core.count != 0, "Can't form a Character from an empty String")
    _precondition(
      s.index(after: s.startIndex) == s.endIndex,
      "Can't form a Character from a String containing more than one extended grapheme cluster")

    let (encodedCount, packedUTF8) = s._core._encodeSomeUTF8(from: 0)
    // The size of the packed value is a small non-zero number, so
    // multiplying it by 8 can't overflow.
    let topBitIndex = MemoryLayout.size(ofValue: packedUTF8) &* 8 &- 1
    let encodedWholeString = encodedCount == s._core.count
    let topBitIsSet = (packedUTF8 & (1 &<< numericCast(topBitIndex))) != 0

    if _fastPath(encodedWholeString && topBitIsSet) {
      _representation = .small(Builtin.trunc_Int64_Int63(packedUTF8._value))
    } else {
      self = Character(_largeRepresentationString: s)
    }
  }

  /// Creates a Character from a String that is already known to require the
  /// large representation.
  internal init(_largeRepresentationString s: String) {
    // Reuse the string's own native buffer when the string starts at the
    // beginning of that buffer.
    if let existing = s._core.nativeBuffer,
      existing.start == s._core._baseAddress! {
      _representation = .large(existing._storage)
      return
    }
    // Otherwise append the contents to a fresh string and adopt its buffer.
    var copy = ""
    copy.append(s)
    _representation = .large(copy._core.nativeBuffer!._storage)
  }

  /// Returns the index of the lowest byte that is 0xFF, or 8 if
  /// there is none.
  static func _smallSize(_ value: UInt64) -> Int {
    for byteIndex in 0..<8 {
      let byte = (value &>> UInt64(byteIndex &* 8)) & 0xFF
      if byte == 0xFF {
        return byteIndex
      }
    }
    return 8
  }

  /// Widens a stored 63-bit small payload to 64 bits and sets the top bit,
  /// which the truncation to 63 bits dropped.
  static func _smallValue(_ value: Builtin.Int63) -> UInt64 {
    let widened = UInt64(Builtin.zext_Int63_Int64(value))
    let topBit: UInt64 = 1 &<< 63
    return widened | topBit
  }

  /// The UTF-8 code units of a small character, viewed as a collection over
  /// the packed 64-bit word.
  internal struct _SmallUTF8 : RandomAccessCollection {
    typealias Indices = CountableRange<Int>

    var indices: CountableRange<Int> {
      return startIndex..<endIndex
    }

    /// Wraps a packed word; the code unit count is the position of the first
    /// 0xFF byte.
    init(_ u8: UInt64) {
      let codeUnitCount = Character._smallSize(u8)
      _sanityCheck(codeUnitCount <= 8, "Character with more than 8 UTF-8 code units")
      self.count = UInt16(codeUnitCount)
      self.data = u8
    }

    /// The position of the first element in a non-empty collection.
    ///
    /// In an empty collection, `startIndex == endIndex`.
    var startIndex: Int {
      return 0
    }

    /// The collection's "past the end" position.
    ///
    /// `endIndex` is not a valid argument to `subscript`, and is always
    /// reachable from `startIndex` by zero or more applications of
    /// `index(after:)`.
    var endIndex: Int {
      return Int(count)
    }

    /// Access the code unit at `position`.
    ///
    /// - Precondition: `position` is a valid position in `self` and
    ///   `position != endIndex`.
    subscript(position: Int) -> UTF8.CodeUnit {
      _sanityCheck(position >= 0)
      _sanityCheck(position < Int(count))
      // Unchecked arithmetic is fine: given the sanity checks above, the
      // shift amount cannot overflow.
      let shift = UInt64(position) &* 8
      return UTF8.CodeUnit(extendingOrTruncating: data &>> shift)
    }

    /// Yields code units from the low byte upwards until it meets 0xFF.
    internal struct Iterator : IteratorProtocol {
      init(_ data: UInt64) {
        self._data = data
      }

      internal mutating func next() -> UInt8? {
        let codeUnit = UInt8(extendingOrTruncating: _data)
        guard codeUnit != 0xFF else {
          return nil
        }
        // Drop the consumed byte and refill the top byte with 0xFF.
        _data = (_data &>> 8) | 0xFF00_0000_0000_0000
        return codeUnit
      }

      internal var _data: UInt64
    }

    internal func makeIterator() -> Iterator {
      return Iterator(data)
    }

    var count: UInt16
    var data: UInt64
  }

  /// The UTF-16 code units of a small character, transcoded from the packed
  /// UTF-8 word and stored in a 64-bit word.
  struct _SmallUTF16 : RandomAccessCollection {
    typealias Indices = CountableRange<Int>

    init(_ u8: UInt64) {
      let utf8 = _SmallUTF8(u8)

      let utf16Count = UTF16.transcodedLength(
        of: utf8.makeIterator(),
        decodedAs: UTF8.self,
        repairingIllFormedSequences: true)!.0
      _sanityCheck(utf16Count <= 4, "Character with more than 4 UTF-16 code units")
      self.count = UInt16(utf16Count)

      // Each new code unit pushes the earlier ones 16 bits higher, so the
      // first code unit ends up in the highest occupied 16-bit lane.
      var packed: UInt64 = 0
      _ = transcode(
        utf8.makeIterator(),
        from: UTF8.self, to: UTF16.self,
        stoppingOnError: false,
        into: { codeUnit in
          packed = packed &<< 16
          packed = packed | UInt64(extendingOrTruncating: codeUnit)
        })
      self.data = packed
    }

    /// The position of the first element in a non-empty collection.
    ///
    /// In an empty collection, `startIndex == endIndex`.
    var startIndex: Int {
      return 0
    }

    /// The collection's "past the end" position.
    ///
    /// `endIndex` is not a valid argument to `subscript`, and is always
    /// reachable from `startIndex` by zero or more applications of
    /// `index(after:)`.
    var endIndex: Int {
      return Int(count)
    }

    /// Access the code unit at `position`.
    ///
    /// - Precondition: `position` is a valid position in `self` and
    ///   `position != endIndex`.
    subscript(position: Int) -> UTF16.CodeUnit {
      _sanityCheck(position >= 0)
      _sanityCheck(position < Int(count))
      // Unchecked arithmetic is fine: given the sanity checks above, the
      // lane computation cannot overflow.
      let shift = (UInt64(count) &- UInt64(position) &- 1) &* 16
      return UTF16.CodeUnit(extendingOrTruncating: data &>> shift)
    }

    var count: UInt16
    var data: UInt64
  }

  /// The character's hash value.
  ///
  /// Hash values are not guaranteed to be equal across different executions of
  /// your program. Do not save hash values to use during a future execution.
  public var hashValue: Int {
    // FIXME(performance): constructing a temporary string is extremely
    // wasteful and inefficient.
    let asString = String(self)
    return asString.hashValue
  }

  typealias UTF16View = String.UTF16View

  /// The UTF-16 view of a temporary string built from this character.
  var utf16: UTF16View {
    let asString = String(self)
    return asString.utf16
  }

  @_versioned
  internal var _representation: Representation
}
|
|
|
|
extension Character : CustomStringConvertible {
  /// A textual representation of the character: the string produced by
  /// `String.init(_: Character)`.
  public var description: String {
    // Convert directly. `String(describing:)` chooses its output by looking
    // at the value's conformances at run time, `CustomStringConvertible`
    // among them. Routing `description` through it adds a dynamic lookup and
    // only avoids re-entering this getter if some other conformance is
    // consulted first.
    return String(self)
  }
}
|
|
|
|
// The conformance is declared with an empty body, so its requirements are
// satisfied by members declared elsewhere (presumably `init(_ s: String)` and
// `description` -- confirm against the protocol definition).
extension Character : LosslessStringConvertible {}
|
|
|
|
extension Character : CustomDebugStringConvertible {
  /// A textual representation of the character, suitable for debugging.
  ///
  /// This is the debug description of the string built from the character.
  public var debugDescription: String {
    let asString = String(self)
    return asString.debugDescription
  }
}
|
|
|
|
extension String {
  /// Creates a string containing the given character.
  ///
  /// - Parameter c: The character to convert to a string.
  public init(_ c: Character) {
    switch c._representation {
    case .small(let payload):
      // Restore the full 64-bit word, then decode its UTF-8 code units.
      let utf8 = Character._SmallUTF8(Character._smallValue(payload))
      self = String._fromWellFormedCodeUnitSequence(UTF8.self, input: utf8)
    case .large(let storage):
      // The buffer may hold more than one character; keep only the first
      // grapheme cluster.
      let backing = String(_StringCore(_StringBuffer(storage)))
      let first = backing.startIndex
      self = backing[first..<backing.index(after: first)]
    }
  }
}
|
|
|
|
/// The smallest `.small` payload that denotes an ASCII character.
///
/// A `.small` character keeps its UTF-8 code units in an Int63, with every
/// unused byte set to 0xFF. An ASCII character therefore has its value in the
/// lowest byte and 0xFF in all the others. Because 0x7FFFFFFFFFFFFF80 and
/// above are invalid UTF-8 sequences, a payload is ASCII exactly when it is
/// greater than or equal to 0x7FFFFFFFFFFFFF00.
internal var _minASCIICharReprBuiltin: Builtin.Int63 {
  @inline(__always) get {
    let threshold: Int64 = 0x7FFFFFFFFFFFFF00
    return Builtin.truncOrBitCast_Int64_Int63(threshold._value)
  }
}
|
|
|
|
extension Character : Equatable {
  /// Returns whether two characters are equal.
  ///
  /// Two small ASCII characters are compared by their packed bits; every
  /// other pairing is compared through temporary strings.
  public static func == (lhs: Character, rhs: Character) -> Bool {
    if case let (.small(leftBits), .small(rightBits)) =
        (lhs._representation, rhs._representation),
      Bool(Builtin.cmp_uge_Int63(leftBits, _minASCIICharReprBuiltin)),
      Bool(Builtin.cmp_uge_Int63(rightBits, _minASCIICharReprBuiltin)) {
      return Bool(Builtin.cmp_eq_Int63(leftBits, rightBits))
    }
    // FIXME(performance): constructing two temporary strings is extremely
    // wasteful and inefficient.
    return String(lhs) == String(rhs)
  }
}
|
|
|
|
extension Character : Comparable {
  /// Returns whether `lhs` orders before `rhs`.
  ///
  /// Two small ASCII characters are ordered by an unsigned comparison of
  /// their packed bits; every other pairing is ordered through temporary
  /// strings.
  public static func < (lhs: Character, rhs: Character) -> Bool {
    // Note: the ASCII fast path is consistent with Foundation but unicode
    // incorrect. See String._compareASCII.
    if case let (.small(leftBits), .small(rightBits)) =
        (lhs._representation, rhs._representation),
      Bool(Builtin.cmp_uge_Int63(leftBits, _minASCIICharReprBuiltin)),
      Bool(Builtin.cmp_uge_Int63(rightBits, _minASCIICharReprBuiltin)) {
      return Bool(Builtin.cmp_ult_Int63(leftBits, rightBits))
    }
    // FIXME(performance): constructing two temporary strings is extremely
    // wasteful and inefficient.
    return String(lhs) < String(rhs)
  }
}
|
|
|