mirror of
https://github.com/apple/swift.git
synced 2025-12-21 12:14:44 +01:00
Whenever possible, prefer the small string format. Updates creation logic. Adds assertion on large strings that they're not intended to be small.
504 lines
18 KiB
Swift
504 lines
18 KiB
Swift
//===----------------------------------------------------------------------===//
|
|
//
|
|
// This source file is part of the Swift.org open source project
|
|
//
|
|
// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
|
|
// Licensed under Apache License v2.0 with Runtime Library Exception
|
|
//
|
|
// See https://swift.org/LICENSE.txt for license information
|
|
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
/// A single extended grapheme cluster that approximates a user-perceived
|
|
/// character.
|
|
///
|
|
/// The `Character` type represents a character made up of one or more Unicode
|
|
/// scalar values, grouped by a Unicode boundary algorithm. Generally, a
|
|
/// `Character` instance matches what the reader of a string will perceive as
|
|
/// a single character. Strings are collections of `Character` instances, so
|
|
/// the number of visible characters is generally the most natural way to
|
|
/// count the length of a string.
|
|
///
|
|
/// let greeting = "Hello! 🐥"
|
|
/// print("Length: \(greeting.count)")
|
|
/// // Prints "Length: 8"
|
|
///
|
|
/// Because each character in a string can be made up of one or more Unicode
|
|
/// scalar values, the number of characters in a string may not match the
|
|
/// length of the Unicode scalar value representation or the length of the
|
|
/// string in a particular binary representation.
|
|
///
|
|
/// print("Unicode scalar value count: \(greeting.unicodeScalars.count)")
|
|
/// // Prints "Unicode scalar value count: 15"
|
|
///
|
|
/// print("UTF-8 representation count: \(greeting.utf8.count)")
|
|
/// // Prints "UTF-8 representation count: 18"
|
|
///
|
|
/// Every `Character` instance is composed of one or more Unicode scalar values
|
|
/// that are grouped together as an *extended grapheme cluster*. The way these
|
|
/// scalar values are grouped is defined by a canonical, localized, or
|
|
/// otherwise tailored Unicode segmentation algorithm.
|
|
///
|
|
/// For example, a country's Unicode flag character is made up of two regional
|
|
/// indicator scalar values that correspond to that country's ISO 3166-1
|
|
/// alpha-2 code. The alpha-2 code for The United States is "US", so its flag
|
|
/// character is made up of the Unicode scalar values `"\u{1F1FA}"` (REGIONAL
|
|
/// INDICATOR SYMBOL LETTER U) and `"\u{1F1F8}"` (REGIONAL INDICATOR SYMBOL
|
|
/// LETTER S). When placed next to each other in a string literal, these two
|
|
/// scalar values are combined into a single grapheme cluster, represented by
|
|
/// a `Character` instance in Swift.
|
|
///
|
|
/// let usFlag: Character = "\u{1F1FA}\u{1F1F8}"
|
|
/// print(usFlag)
|
|
/// // Prints "🇺🇸"
|
|
///
|
|
/// For more information about the Unicode terms used in this discussion, see
|
|
/// the [Unicode.org glossary][glossary]. In particular, this discussion
|
|
/// mentions [extended grapheme clusters][clusters] and [Unicode scalar
|
|
/// values][scalars].
|
|
///
|
|
/// [glossary]: http://www.unicode.org/glossary/
|
|
/// [clusters]: http://www.unicode.org/glossary/#extended_grapheme_cluster
|
|
/// [scalars]: http://www.unicode.org/glossary/#unicode_scalar_value
|
|
@_fixed_layout
|
|
public struct Character {
|
|
// Fundamentally, it is just a String, but it is optimized for the common case
|
|
// where the UTF-16 representation fits in 63 bits. The remaining bit is used
|
|
// to discriminate between small and large representations. Since a grapheme
|
|
// cluster cannot have U+0000 anywhere but in its first scalar, we can store
|
|
// zero in empty code units above the first one.
|
|
@_frozen // FIXME(sil-serialize-all)
|
|
@_versioned
|
|
internal enum Representation {
|
|
case smallUTF16(Builtin.Int63)
|
|
case large(_UTF16StringStorage)
|
|
}
|
|
|
|
@_versioned
|
|
internal var _representation: Representation
|
|
|
|
// FIXME(sil-serialize-all): Should be @_inlineable @_versioned
|
|
// <rdar://problem/34557187>
|
|
internal static func _smallValue(_ value: Builtin.Int63) -> UInt64 {
|
|
return UInt64(Builtin.zext_Int63_Int64(value))
|
|
}
|
|
|
|
typealias UTF16View = String.UTF16View
|
|
@_inlineable // FIXME(sil-serialize-all)
|
|
@_versioned // FIXME(sil-serialize-all)
|
|
internal var utf16: UTF16View {
|
|
return String(self).utf16
|
|
}
|
|
|
|
@_inlineable // FIXME(sil-serialize-all)
|
|
@_versioned
|
|
internal init(_smallRepresentation b: _SmallUTF16) {
|
|
_sanityCheck(Int64(b._storage) >= 0)
|
|
_representation = .smallUTF16(
|
|
Builtin.trunc_Int64_Int63(b._storage._value))
|
|
}
|
|
|
|
@_inlineable // FIXME(sil-serialize-all)
|
|
@_versioned
|
|
internal init(_largeRepresentation storage: _UTF16StringStorage) {
|
|
_representation = .large(storage)
|
|
}
|
|
|
|
/// Creates a Character from a String that is already known to require the
|
|
/// large representation.
|
|
///
|
|
/// - Note: `s` should contain only a single grapheme, but we can't require
|
|
/// that formally because of grapheme cluster literals and the shifting
|
|
/// sands of Unicode. https://bugs.swift.org/browse/SR-4955
|
|
@_inlineable // FIXME(sil-serialize-all)
|
|
@_versioned
|
|
internal init(_largeRepresentationString s: String) {
|
|
let storage = s._guts._extractNativeStorage(of: UTF16.CodeUnit.self)
|
|
self.init(_largeRepresentation: storage)
|
|
}
|
|
}
|
|
|
|
extension Character
|
|
: _ExpressibleByBuiltinUTF16ExtendedGraphemeClusterLiteral,
|
|
ExpressibleByExtendedGraphemeClusterLiteral
|
|
{
|
|
/// Creates a character containing the given Unicode scalar value.
|
|
///
|
|
/// - Parameter content: The Unicode scalar value to convert into a character.
|
|
@_inlineable // FIXME(sil-serialize-all)
|
|
public init(_ content: Unicode.Scalar) {
|
|
let content16 = UTF16.encode(content)._unsafelyUnwrappedUnchecked
|
|
_representation = .smallUTF16(
|
|
Builtin.zext_Int32_Int63(content16._storage._value))
|
|
}
|
|
|
|
@_inlineable // FIXME(sil-serialize-all)
|
|
@effects(readonly)
|
|
public init(_builtinUnicodeScalarLiteral value: Builtin.Int32) {
|
|
self.init(Unicode.Scalar(_builtinUnicodeScalarLiteral: value))
|
|
}
|
|
|
|
// Inlining ensures that the whole constructor can be folded away to a single
|
|
// integer constant in case of small character literals.
|
|
@_inlineable // FIXME(sil-serialize-all)
|
|
@inline(__always)
|
|
@effects(readonly)
|
|
public init(
|
|
_builtinExtendedGraphemeClusterLiteral start: Builtin.RawPointer,
|
|
utf8CodeUnitCount: Builtin.Word,
|
|
isASCII: Builtin.Int1
|
|
) {
|
|
let utf8 = UnsafeBufferPointer(
|
|
start: UnsafePointer<Unicode.UTF8.CodeUnit>(start),
|
|
count: Int(utf8CodeUnitCount))
|
|
|
|
if utf8.count == 1 {
|
|
_representation = .smallUTF16(
|
|
Builtin.zext_Int8_Int63(utf8.first._unsafelyUnwrappedUnchecked._value))
|
|
return
|
|
}
|
|
|
|
FastPath:
|
|
repeat {
|
|
var shift = 0
|
|
let maxShift = 64 - 16
|
|
var bits: UInt64 = 0
|
|
|
|
for s8 in Unicode._ParsingIterator(
|
|
codeUnits: utf8.makeIterator(), parser: UTF8.ForwardParser()) {
|
|
|
|
let s16
|
|
= UTF16.transcode(s8, from: UTF8.self)._unsafelyUnwrappedUnchecked
|
|
|
|
for u16 in s16 {
|
|
guard _fastPath(shift <= maxShift) else { break FastPath }
|
|
bits |= UInt64(u16) &<< shift
|
|
shift += 16
|
|
}
|
|
}
|
|
guard _fastPath(Int64(truncatingIfNeeded: bits) >= 0) else {
|
|
break FastPath
|
|
}
|
|
_representation = .smallUTF16(Builtin.trunc_Int64_Int63(bits._value))
|
|
return
|
|
}
|
|
while false
|
|
|
|
// For anything that doesn't fit in 63 bits, build the large
|
|
// representation.
|
|
self = Character(_largeRepresentationString:
|
|
String(
|
|
_builtinExtendedGraphemeClusterLiteral: start,
|
|
utf8CodeUnitCount: utf8CodeUnitCount,
|
|
isASCII: isASCII))
|
|
}
|
|
|
|
// Inlining ensures that the whole constructor can be folded away to a single
|
|
// integer constant in case of small character literals.
|
|
@_inlineable // FIXME(sil-serialize-all)
|
|
@inline(__always)
|
|
@effects(readonly)
|
|
public init(
|
|
_builtinExtendedGraphemeClusterLiteral start: Builtin.RawPointer,
|
|
utf16CodeUnitCount: Builtin.Word
|
|
) {
|
|
let utf16 = _UnmanagedString<UTF16.CodeUnit>(
|
|
start: UnsafePointer(start),
|
|
count: Int(utf16CodeUnitCount))
|
|
|
|
switch utf16.count {
|
|
case 1:
|
|
_representation = .smallUTF16(Builtin.zext_Int16_Int63(utf16[0]._value))
|
|
case 2:
|
|
let bits = UInt32(utf16[0]) | UInt32(utf16[1]) &<< 16
|
|
_representation = .smallUTF16(Builtin.zext_Int32_Int63(bits._value))
|
|
case 3:
|
|
let bits = UInt64(utf16[0])
|
|
| UInt64(utf16[1]) &<< 16
|
|
| UInt64(utf16[2]) &<< 32
|
|
_representation = .smallUTF16(Builtin.trunc_Int64_Int63(bits._value))
|
|
case 4 where utf16[3] < 0x8000:
|
|
let bits = UInt64(utf16[0])
|
|
| UInt64(utf16[1]) &<< 16
|
|
| UInt64(utf16[2]) &<< 32
|
|
| UInt64(utf16[3]) &<< 48
|
|
_representation = .smallUTF16(Builtin.trunc_Int64_Int63(bits._value))
|
|
default:
|
|
// TODO(SSO): small check
|
|
_representation = .large(
|
|
_StringGuts(_large: utf16)._extractNativeStorage())
|
|
}
|
|
}
|
|
|
|
/// Creates a character with the specified value.
|
|
///
|
|
/// Do not call this initalizer directly. It is used by the compiler when
|
|
/// you use a string literal to initialize a `Character` instance. For
|
|
/// example:
|
|
///
|
|
/// let oBreve: Character = "o\u{306}"
|
|
/// print(oBreve)
|
|
/// // Prints "ŏ"
|
|
///
|
|
/// The assignment to the `oBreve` constant calls this initializer behind the
|
|
/// scenes.
|
|
@_inlineable // FIXME(sil-serialize-all)
|
|
public init(extendedGraphemeClusterLiteral value: Character) {
|
|
self = value
|
|
}
|
|
|
|
/// Creates a character from a single-character string.
|
|
///
|
|
/// The following example creates a new character from the uppercase version
|
|
/// of a string that only holds one character.
|
|
///
|
|
/// let a = "a"
|
|
/// let capitalA = Character(a.uppercased())
|
|
///
|
|
/// - Parameter s: The single-character string to convert to a `Character`
|
|
/// instance. `s` must contain exactly one extended grapheme cluster.
|
|
@_inlineable // FIXME(sil-serialize-all)
|
|
public init(_ s: String) {
|
|
let count = s._guts.count
|
|
_precondition(count != 0,
|
|
"Can't form a Character from an empty String")
|
|
_debugPrecondition(s.index(after: s.startIndex) == s.endIndex,
|
|
"Can't form a Character from a String containing more than one extended grapheme cluster")
|
|
|
|
self.init(_unverified: s._guts)
|
|
}
|
|
|
|
/// Construct a Character from a _StringGuts, assuming it consists of exactly
|
|
/// one extended grapheme cluster.
|
|
@_inlineable // FIXME(sil-serialize-all)
|
|
@_versioned // FIXME(sil-serialize-all)
|
|
internal init(_unverified guts: _StringGuts) {
|
|
self = _visitGuts(guts,
|
|
ascii: { ascii in
|
|
if _fastPath(ascii.count == 1) {
|
|
return Character(_singleCodeUnit: ascii[0])
|
|
}
|
|
// The only multi-scalar ASCII grapheme cluster is CR/LF.
|
|
_sanityCheck(ascii.count == 2)
|
|
_sanityCheck(ascii.start[0] == _CR)
|
|
_sanityCheck(ascii.start[1] == _LF)
|
|
return Character(_codeUnitPair: UInt16(_CR), UInt16(_LF))
|
|
},
|
|
utf16: { utf16 in return Character(_unverified: utf16) },
|
|
opaque: { opaque in return Character(_unverified: opaque) })
|
|
}
|
|
|
|
/// Construct a Character from a slice of a _StringGuts, assuming
|
|
/// the specified range covers exactly one extended grapheme cluster.
|
|
@_inlineable // FIXME(sil-serialize-all)
|
|
@_versioned // FIXME(sil-serialize-all)
|
|
internal init(_unverified guts: _StringGuts, range: Range<Int>) {
|
|
self = _visitGuts(
|
|
guts, range: (range, performBoundsCheck: true),
|
|
ascii: { ascii in
|
|
if _fastPath(ascii.count == 1) {
|
|
return Character(_singleCodeUnit: ascii[0])
|
|
}
|
|
// The only multi-scalar ASCII grapheme cluster is CR/LF.
|
|
_sanityCheck(ascii.count == 2)
|
|
_sanityCheck(ascii.start[0] == _CR)
|
|
_sanityCheck(ascii.start[1] == _LF)
|
|
return Character(_codeUnitPair: UInt16(_CR), UInt16(_LF))
|
|
},
|
|
utf16: { utf16 in return Character(_unverified: utf16) },
|
|
opaque: { opaque in return Character(_unverified: opaque) })
|
|
}
|
|
|
|
@_inlineable
|
|
@_versioned
|
|
internal
|
|
init(_singleCodeUnit cu: UInt16) {
|
|
_sanityCheck(UTF16._isScalar(cu))
|
|
_representation = .smallUTF16(
|
|
Builtin.zext_Int16_Int63(Builtin.reinterpretCast(cu)))
|
|
}
|
|
|
|
@_inlineable
|
|
@_versioned
|
|
internal
|
|
init(_codeUnitPair first: UInt16, _ second: UInt16) {
|
|
_sanityCheck(
|
|
(UTF16._isScalar(first) && UTF16._isScalar(second)) ||
|
|
(UTF16.isLeadSurrogate(first) && UTF16.isTrailSurrogate(second)))
|
|
_representation = .smallUTF16(
|
|
Builtin.zext_Int32_Int63(
|
|
Builtin.reinterpretCast(
|
|
UInt32(first) | UInt32(second) &<< 16)))
|
|
}
|
|
|
|
@_inlineable
|
|
@_versioned
|
|
internal
|
|
init(_unverified storage: _SwiftStringStorage<Unicode.UTF16.CodeUnit>) {
|
|
if _fastPath(storage.count <= 4) {
|
|
_sanityCheck(storage.count > 0)
|
|
let b = _SmallUTF16(storage.unmanagedView)
|
|
if _fastPath(Int64(bitPattern: b._storage) >= 0) {
|
|
self.init(_smallRepresentation: b)
|
|
_fixLifetime(storage)
|
|
return
|
|
}
|
|
}
|
|
// FIXME: We may want to make a copy if storage.unusedCapacity > 0
|
|
self.init(_largeRepresentation: storage)
|
|
}
|
|
|
|
@_inlineable
|
|
@_versioned
|
|
internal
|
|
init<V: _StringVariant>(_unverified variant: V) {
|
|
if _fastPath(variant.count <= 4) {
|
|
_sanityCheck(variant.count > 0)
|
|
let b = _SmallUTF16(variant)
|
|
if _fastPath(Int64(bitPattern: b._storage) >= 0) {
|
|
self.init(_smallRepresentation: b)
|
|
return
|
|
}
|
|
}
|
|
self.init(_largeRepresentation: variant._copyToNativeStorage())
|
|
}
|
|
}
|
|
|
|
extension Character : CustomStringConvertible {
|
|
@_inlineable // FIXME(sil-serialize-all)
|
|
public var description: String {
|
|
return String(describing: self)
|
|
}
|
|
}
|
|
|
|
extension Character : LosslessStringConvertible { }
|
|
|
|
extension Character : CustomDebugStringConvertible {
|
|
/// A textual representation of the character, suitable for debugging.
|
|
@_inlineable // FIXME(sil-serialize-all)
|
|
public var debugDescription: String {
|
|
return String(self).debugDescription
|
|
}
|
|
}
|
|
|
|
extension Character {
|
|
internal typealias _SmallUTF16 = _UIntBuffer<UInt64, Unicode.UTF16.CodeUnit>
|
|
|
|
@_inlineable // FIXME(sil-serialize-all)
|
|
@_versioned
|
|
internal var _smallUTF16 : _SmallUTF16? {
|
|
guard case .smallUTF16(let _63bits) = _representation else { return nil }
|
|
_onFastPath()
|
|
let bits = UInt64(Builtin.zext_Int63_Int64(_63bits))
|
|
let minBitWidth = type(of: bits).bitWidth - bits.leadingZeroBitCount
|
|
return _SmallUTF16(
|
|
_storage: bits,
|
|
_bitCount: UInt8(
|
|
truncatingIfNeeded: 16 * Swift.max(1, (minBitWidth + 15) / 16))
|
|
)
|
|
}
|
|
|
|
@_inlineable // FIXME(sil-serialize-all)
|
|
@_versioned
|
|
internal var _largeUTF16 : _UTF16StringStorage? {
|
|
guard case .large(let storage) = _representation else { return nil }
|
|
return storage
|
|
}
|
|
}
|
|
|
|
extension Character {
|
|
@_inlineable // FIXME(sil-serialize-all)
|
|
@_versioned
|
|
internal var _count : Int {
|
|
if let small = _smallUTF16 { return small.count }
|
|
return _largeUTF16._unsafelyUnwrappedUnchecked.count
|
|
}
|
|
}
|
|
|
|
extension String {
|
|
/// Creates a string containing the given character.
|
|
///
|
|
/// - Parameter c: The character to convert to a string.
|
|
@_inlineable // FIXME(sil-serialize-all)
|
|
public init(_ c: Character) {
|
|
if let utf16 = c._smallUTF16 {
|
|
if let small = _SmallUTF8String(utf16) {
|
|
self = String(_StringGuts(small))
|
|
} else {
|
|
// FIXME: Remove when we support UTF-8 in small string
|
|
self = String(decoding: utf16, as: Unicode.UTF16.self)
|
|
}
|
|
}
|
|
else {
|
|
// TODO(SSO): small check. For now, since we only do ASCII, this won't hit
|
|
self = String(_StringGuts(_large: c._largeUTF16!))
|
|
}
|
|
}
|
|
}
|
|
|
|
/// `.small` characters are stored in an Int63 with their UTF-8 representation,
|
|
/// with any unused bytes set to 0xFF. ASCII characters will have all bytes set
|
|
/// to 0xFF except for the lowest byte, which will store the ASCII value. Since
|
|
/// 0x7FFFFFFFFFFFFF80 or greater is an invalid UTF-8 sequence, we know if a
|
|
/// value is ASCII by checking if it is greater than or equal to
|
|
/// 0x7FFFFFFFFFFFFF00.
|
|
// FIXME(sil-serialize-all): Should be @_inlineable @_versioned
|
|
// <rdar://problem/34557187>
|
|
internal var _minASCIICharReprBuiltin: Builtin.Int63 {
|
|
@inline(__always) get {
|
|
let x: Int64 = 0x7FFFFFFFFFFFFF00
|
|
return Builtin.truncOrBitCast_Int64_Int63(x._value)
|
|
}
|
|
}
|
|
|
|
extension Character : Equatable {
|
|
@_inlineable
|
|
@inline(__always)
|
|
public static func == (lhs: Character, rhs: Character) -> Bool {
|
|
let l0 = lhs._smallUTF16
|
|
if _fastPath(l0 != nil), let l = l0?._storage {
|
|
let r0 = rhs._smallUTF16
|
|
if _fastPath(r0 != nil), let r = r0?._storage {
|
|
if (l | r) < 0x300 { return l == r }
|
|
if l == r { return true }
|
|
}
|
|
}
|
|
|
|
// FIXME(performance): constructing two temporary strings is extremely
|
|
// wasteful and inefficient.
|
|
return String(lhs) == String(rhs)
|
|
}
|
|
}
|
|
|
|
extension Character : Comparable {
|
|
@_inlineable
|
|
@inline(__always)
|
|
public static func < (lhs: Character, rhs: Character) -> Bool {
|
|
let l0 = lhs._smallUTF16
|
|
if _fastPath(l0 != nil), let l = l0?._storage {
|
|
let r0 = rhs._smallUTF16
|
|
if _fastPath(r0 != nil), let r = r0?._storage {
|
|
if (l | r) < 0x80 { return l < r }
|
|
if l == r { return false }
|
|
}
|
|
}
|
|
// FIXME(performance): constructing two temporary strings is extremely
|
|
// wasteful and inefficient.
|
|
return String(lhs) < String(rhs)
|
|
}
|
|
}
|
|
|
|
extension Character: Hashable {
|
|
/// The character's hash value.
|
|
///
|
|
/// Hash values are not guaranteed to be equal across different executions of
|
|
/// your program. Do not save hash values to use during a future execution.
|
|
@_inlineable // FIXME(sil-serialize-all)
|
|
public var hashValue: Int {
|
|
// FIXME(performance): constructing a temporary string is extremely
|
|
// wasteful and inefficient.
|
|
return String(self).hashValue
|
|
}
|
|
}
|