Files
swift-mirror/stdlib/public/core/StringObject.swift
Michael Ilseman 7aea40680d [String] NFC iterator fast-paths
Refactor and rename _StringGutsSlice, apply NFC-aware fast paths to a
new buffered iterator.

Also, fix bug in _typeName which used to assume ASCIIness and better
SIL optimizations on StringObject.
2018-11-04 10:42:41 -08:00

877 lines
26 KiB
Swift

//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
//
// NOTE: This is a prototype, it does not have e.g. 32-bit support yet.
//
//
// StringObject abstracts the bit-level interpretation and creation of the
// String struct.
//
// TODO: Word-level diagram
@_fixed_layout @usableFromInline
internal struct _StringObject {
// Abstract the count and performance-flags containing word
@_fixed_layout @usableFromInline
struct CountAndFlags {
@usableFromInline
var _storage: UInt
@inlinable @inline(__always)
internal init(zero: ()) { self._storage = 0 }
}
//
// Laid out as (_countAndFlags, _object), which allows small string contents to
// naturally start on vector-alignment.
//
@usableFromInline
internal var _countAndFlags: CountAndFlags
@usableFromInline
internal var _object: Builtin.BridgeObject
@inlinable @inline(__always)
internal init(zero: ()) {
self._countAndFlags = CountAndFlags(zero:())
self._object = Builtin.reinterpretCast(0)
}
// Namespace to hold magic numbers
@usableFromInline @_frozen
enum Nibbles {}
}
// Raw
extension _StringObject {
@usableFromInline
internal typealias RawBitPattern = (UInt, UInt)
@inlinable
internal var rawBits: RawBitPattern {
@inline(__always) get { return (_countAndFlags.rawBits, objectRawBits) }
}
@inlinable @inline(__always)
internal init(
bridgeObject: Builtin.BridgeObject, countAndFlags: CountAndFlags
) {
self._object = bridgeObject
self._countAndFlags = countAndFlags
_invariantCheck()
}
@inlinable @inline(__always)
internal init(
object: AnyObject, discriminator: UInt, countAndFlags: CountAndFlags
) {
let builtinRawObject: Builtin.Int64 = Builtin.reinterpretCast(object)
let builtinDiscrim: Builtin.Int64 = Builtin.reinterpretCast(discriminator)
self.init(
bridgeObject: Builtin.reinterpretCast(
Builtin.stringObjectOr_Int64(builtinRawObject, builtinDiscrim)),
countAndFlags: countAndFlags)
}
// Initializer to use for tagged (unmanaged) values
@inlinable @inline(__always)
internal init(
valueBits: UInt, discriminator: UInt, countAndFlags: CountAndFlags
) {
let builtinValueBits: Builtin.Int64 = Builtin.reinterpretCast(valueBits)
let builtinDiscrim: Builtin.Int64 = Builtin.reinterpretCast(discriminator)
self.init(
bridgeObject: Builtin.valueToBridgeObject(Builtin.stringObjectOr_Int64(
builtinValueBits, builtinDiscrim)),
countAndFlags: countAndFlags)
}
@inlinable @inline(__always)
internal init(raw bits: RawBitPattern) {
self.init(zero:())
self._countAndFlags = Builtin.reinterpretCast(bits.0)
self._object = Builtin.reinterpretCast(bits.1)
_sanityCheck(self.rawBits == bits)
_invariantCheck()
}
@inlinable @_transparent
internal var objectRawBits: UInt {
@inline(__always) get { return Builtin.reinterpretCast(_object) }
}
@inlinable @_transparent
internal var undiscriminatedObjectRawBits: UInt {
@inline(__always) get {
return (objectRawBits & Nibbles.largeAddressMask)
}
}
}
extension _StringObject.CountAndFlags {
@usableFromInline
internal typealias RawBitPattern = UInt
@inlinable
internal var rawBits: RawBitPattern {
@inline(__always) get { return _storage }
}
@inlinable @inline(__always)
internal init(raw bits: RawBitPattern) {
self._storage = bits
_invariantCheck()
}
}
/*
Encoding is optimized for common fast creation. The canonical empty string,
ASCII small strings, as well as most literals, have all consecutive 1s in their
high nibble mask, and thus can all be encoded as a logical immediate operand
on arm64.
The high nibble of the bridge object has the following encoding:
Form b63 b62 b61 b60
Small 1 ASCII 1 0
Large Immortal ObjC 0 Slow
Immortal (b63): Should the Swift runtime skip ARC
- Small strings are just values, always immortal
- Large strings can sometimes be immortal, e.g. literals
ObjC (b62): Should the runtime perform ObjC ARC rather than Swift ARC
- Lazily-bridged NSStrings set this bit, are managed by ObjC ARC
- Note: Small strings repurpose this bit to denote ASCII-only contents
isSmall (b61): Dedicated bit to denote small strings
Slow (b60): If set, string is unable to provide access to contiguous UTF-8
- Small strings can spill to the stack
- Native strings are UTF-8
- Lazily bridged Cocoa strings or other foreign strings may be able to
The canonical empty string is the zero-sized small string. It has a leading
nibble of 1110, and all other bits are 0.
*/
extension _StringObject.Nibbles {
// The canonical empty sting is an empty small string
@inlinable
internal static var emptyString: UInt {
@inline(__always) get { return _StringObject.Nibbles.small(isASCII: true) }
}
}
/*
Large strings can either be "native", "shared", or "foreign".
Native strings have tail-allocated storage, which begins at an offset of
`nativeBias` from the storage object's address. String literals, which reside
in the constant section, are encoded as their start address minus `nativeBias`,
unifying code paths for both literals ("immortal native") and native strings.
Native Strings are always managed by the Swift runtime.
Shared strings do not have tail-allocated storage, but can provide access
upon query to contiguous UTF-8 code units. Lazily-bridged NSStrings capable of
providing access to contiguous ASCII/UTF-8 set the ObjC bit. Accessing shared
string's pointer should always be behind a resilience barrier, permitting
future evolution.
Foreign strings, currently, only encompass lazily-bridged NSStrings that
cannot be treated as "shared". Such strings may provide access to contiguous
UTF-16, or may be discontiguous in storage. Accessing foreign strings should
remain behind a resilience barrier for future evolution.
Shared and foreign strings are always created and accessed behind a resilience
barrier, providing flexibility for the future.
nativeBias
32
The rest of the "object" bit representation for large strings:
Form b63:b60 b59 b58 b57 b56 b55:b0
Native x 0 0 0 0 TBD TBD TBD objectAddr
Shared x x 0 0 1 TBD TBD TBD objectAddr
Foreign x 0 0 1 1 TBD TBD TBD objectAddr
b59: Set if not a typical tail-allocated native Swift string
- Native Swift strings are tail-allocated contiguous UTF-8
- Shared strings can provide access to contiguous UTF-8
- Foreign string cannot
TODO(UTF8): Allocate the rest of the bits appropriately
TODO(UTF8): For Foreign strings, consider allocating a bit for whether they can
provide contiguous UTF-16 code units, which would allow us to avoid doing the
full call for non-contiguous NSString.
objectAddr: The address of the beginning of the potentially-managed object.
Other foreign forms are reserved for the future.
*/
extension _StringObject.Nibbles {
// Mask for address bits, i.e. non-discriminator and non-extra high bits
@inlinable
static internal var largeAddressMask: UInt {
@inline(__always) get {
#if arch(i386) || arch(arm)
unimplemented_utf8_32bit()
#else
return 0x00FF_FFFF_FFFF_FFFF
#endif
}
}
// Discriminator for small strings
@inlinable @inline(__always)
internal static func small(isASCII: Bool) -> UInt {
#if arch(i386) || arch(arm)
unimplemented_utf8_32bit()
#else
return isASCII ? 0xE000_0000_0000_0000 : 0xA000_0000_0000_0000
#endif
}
// Discriminator for large, immortal, swift-native strings
@inlinable @inline(__always)
internal static func largeImmortal() -> UInt {
#if arch(i386) || arch(arm)
unimplemented_utf8_32bit()
#else
return 0x8000_0000_0000_0000
#endif
}
// Discriminator for large, mortal (i.e. managed), swift-native strings
@inlinable @inline(__always)
internal static func largeMortal() -> UInt {
#if arch(i386) || arch(arm)
unimplemented_utf8_32bit()
#else
return 0x0000_0000_0000_0000
#endif
}
// Discriminator for large, shared, mortal (i.e. managed), swift-native
// strings
@inlinable @inline(__always)
internal static func largeSharedMortal() -> UInt {
#if arch(i386) || arch(arm)
unimplemented_utf8_32bit()
#else
return 0x0800_0000_0000_0000
#endif
}
internal static func largeCocoa(providesFastUTF8: Bool) -> UInt {
#if arch(i386) || arch(arm)
unimplemented_utf8_32bit()
#else
return providesFastUTF8 ? 0x4800_0000_0000_0000 : 0x5800_0000_0000_0000
#endif
}
}
extension _StringObject {
@inlinable
internal static var nativeBias: UInt { @inline(__always) get { return 32 } }
@inlinable
internal var isImmortal: Bool {
@inline(__always) get {
#if arch(i386) || arch(arm)
unimplemented_utf8_32bit()
#else
return (objectRawBits & 0x8000_0000_0000_0000) != 0
#endif
}
}
@inlinable
internal var isMortal: Bool {
@inline(__always) get { return !isImmortal }
}
@inlinable
internal var isSmall: Bool {
@inline(__always) get {
#if arch(i386) || arch(arm)
unimplemented_utf8_32bit()
#else
return (objectRawBits & 0x2000_0000_0000_0000) != 0
#endif
}
}
@inlinable
internal var isLarge: Bool { @inline(__always) get { return !isSmall } }
// Whether this string can provide access to contiguous UTF-8 code units:
// - Small strings can by spilling to the stack
// - Large native strings can through an offset
// - Shared strings can:
// - Cocoa strings which respond to e.g. CFStringGetCStringPtr()
// - TODO(UTF8): non-Cocoa shared strings
@inlinable
internal var providesFastUTF8: Bool {
@inline(__always) get {
#if arch(i386) || arch(arm)
unimplemented_utf8_32bit()
#else
return (objectRawBits & 0x1000_0000_0000_0000) == 0
#endif
}
}
@inlinable
internal var isForeign: Bool {
@inline(__always) get { return !providesFastUTF8 }
}
// Whether we are a mortal, native string
@inlinable
internal var hasNativeStorage: Bool {
@inline(__always) get {
#if arch(i386) || arch(arm)
unimplemented_utf8_32bit()
#else
return (objectRawBits & 0xF800_0000_0000_0000) == 0
#endif
}
}
// Whether we are a mortal, shared string (managed by Swift runtime)
internal var hasSharedStorage: Bool {
@inline(__always) get {
#if arch(i386) || arch(arm)
unimplemented_utf8_32bit()
#else
return (objectRawBits & 0xF800_0000_0000_0000)
== Nibbles.largeSharedMortal()
#endif
}
}
}
// Queries conditional on being in a large or fast form.
extension _StringObject {
// Whether this string is native, presupposing it is both large and fast
@inlinable
internal var largeFastIsNative: Bool {
@inline(__always) get {
#if arch(i386) || arch(arm)
unimplemented_utf8_32bit()
#else
_sanityCheck(isLarge && providesFastUTF8)
return (objectRawBits & 0x0800_0000_0000_0000) == 0
#endif
}
}
// Whether this string is shared, presupposing it is both large and fast
@inlinable
internal var largeFastIsShared: Bool {
@inline(__always) get { return !largeFastIsNative }
}
// Whether this string is a lazily-bridged NSString, presupposing it is large
@inlinable
internal var largeIsCocoa: Bool {
@inline(__always) get {
#if arch(i386) || arch(arm)
unimplemented_utf8_32bit()
#else
_sanityCheck(isLarge)
return (objectRawBits & 0x4000_0000_0000_0000) != 0
#endif
}
}
}
/*
Small strings have the following per-byte layout. When stored in memory
(little-endian), their first character ('a') is in the lowest address and their
top-nibble and count is in the highest address.
7 6 5 4 3 2 1 0 15 14 13 12 11 10 9 8
h g f e d c b a p o n l k j i 1x0x count
*/
extension _StringObject {
@inlinable
internal var smallCount: Int {
@inline(__always)
get {
#if arch(i386) || arch(arm)
unimplemented_utf8_32bit()
#else
_sanityCheck(isSmall)
return Int(bitPattern: (objectRawBits & 0x0F00_0000_0000_0000) &>> 56)
#endif
}
}
@inlinable @inline(__always)
mutating internal func setSmallCount(_ count: Int, isASCII: Bool) {
// TODO(UTF8 codegen): Ensure this is just a couple simple ops
_sanityCheck(isSmall && count <= _SmallString.capacity)
let valueBits = self.undiscriminatedObjectRawBits
let discrim = Nibbles.small(isASCII: isASCII)
| (UInt(bitPattern: count) &<< 56)
self = _StringObject(
valueBits: valueBits,
discriminator: discrim,
countAndFlags: self._countAndFlags)
}
@inlinable
internal var smallIsASCII: Bool {
@inline(__always)
get {
#if arch(i386) || arch(arm)
unimplemented_utf8_32bit()
#else
_sanityCheck(isSmall)
return objectRawBits & 0x4000_0000_0000_0000 != 0
#endif
}
}
@inlinable
internal var asSmallString: _SmallString {
@inline(__always)
get {
_sanityCheck(isSmall)
return _SmallString(raw:rawBits)
}
}
@inlinable @inline(__always)
internal init(_ small: _SmallString) {
self.init(raw: small.rawBits)
}
// Canonical empty pattern: small zero-length string
@inlinable @inline(__always)
internal init(empty:()) {
self._countAndFlags = CountAndFlags(zero:())
self._object = Builtin.reinterpretCast(Nibbles.emptyString)
_sanityCheck(self.smallCount == 0)
_invariantCheck()
}
}
/*
All non-small forms share the same structure for the other half of the bits
(i.e. non-object bits) as a word containing code unit count and various
performance flags. The top 16 bits are for performance flags, which are not
semantically relevant but communicate that some operations can be done more
efficiently on this particular string, and the lower 48 are the code unit
count (aka endIndex).
b63 b62 b61:48 b47:0
isASCII isNFC TBD count
isASCII: set when all code units are known to be ASCII, enabling:
- Trivial Unicode scalars, they're just the code units
- Trivial UTF-16 transcoding (just bit-extend)
- Also, isASCII always implies isNFC
isNFC: set when the contents are in normal form C, enable:
- Trivial lexicographical comparisons: just memcmp
Allocation of more performance flags is TBD, un-used bits will be reserved for
future use. Count stores the number of code units: corresponds to `endIndex`.
*/
extension _StringObject.CountAndFlags {
@inlinable @inline(__always)
internal init(count: Int) {
self.init(zero:())
self.count = count
_invariantCheck()
}
@inlinable @inline(__always)
internal init(count: Int, isASCII: Bool) {
self.init(zero:())
self.count = count
if isASCII {
// ASCII also entails NFC
self._storage |= 0xC000_0000_0000_0000
}
_invariantCheck()
}
@inlinable
internal var countMask: UInt {
@inline(__always) get {
#if arch(i386) || arch(arm)
unimplemented_utf8_32bit()
#else
return 0x0000_FFFF_FFFF_FFFF
#endif
}
}
@inlinable
internal var flagsMask: UInt { @inline(__always) get { return ~countMask} }
@inlinable
internal var count: Int {
@inline(__always) get { return Int(bitPattern: _storage & countMask) }
@inline(__always) set {
_sanityCheck(newValue <= countMask, "too large")
_storage = (_storage & flagsMask) | UInt(bitPattern: newValue)
}
}
@inlinable
internal var isASCII: Bool {
#if arch(i386) || arch(arm)
unimplemented_utf8_32bit()
#else
return 0 != _storage & 0x8000_0000_0000_0000
#endif
}
@inlinable
internal var isNFC: Bool {
#if arch(i386) || arch(arm)
unimplemented_utf8_32bit()
#else
return 0 != _storage & 0x4000_0000_0000_0000
#endif
}
#if !INTERNAL_CHECKS_ENABLED
@inlinable @inline(__always) internal func _invariantCheck() {}
#else
@usableFromInline @inline(never) @_effects(releasenone)
internal func _invariantCheck() {
if isASCII {
_sanityCheck(isNFC)
}
}
#endif // INTERNAL_CHECKS_ENABLED
}
// Extract
extension _StringObject {
@inlinable
internal var largeCount: Int {
@inline(__always) get {
_sanityCheck(isLarge)
return _countAndFlags.count
}
@inline(__always) set {
_countAndFlags.count = newValue
_sanityCheck(newValue == largeCount)
_invariantCheck()
}
}
@inlinable
internal var largeAddressBits: UInt {
@inline(__always) get {
_sanityCheck(isLarge)
return undiscriminatedObjectRawBits
}
}
@inlinable
internal var nativeUTF8Start: UnsafePointer<UInt8> {
@inline(__always) get {
_sanityCheck(largeFastIsNative)
let largeAddressBits = objectRawBits & Nibbles.largeAddressMask
return UnsafePointer(
bitPattern: largeAddressBits &+ _StringObject.nativeBias
)._unsafelyUnwrappedUnchecked
}
}
@inlinable
internal var nativeUTF8: UnsafeBufferPointer<UInt8> {
@inline(__always) get {
_sanityCheck(largeFastIsNative)
return UnsafeBufferPointer(start: nativeUTF8Start, count: largeCount)
}
}
// Resilient way to fetch a pointer
@usableFromInline @inline(never)
@_effects(releasenone)
internal func getSharedUTF8Start() -> UnsafePointer<UInt8> {
_sanityCheck(largeFastIsShared)
if largeIsCocoa {
return _cocoaUTF8Pointer(cocoaObject)._unsafelyUnwrappedUnchecked
}
return sharedStorage.start
}
@inlinable
internal var sharedUTF8: UnsafeBufferPointer<UInt8> {
@inline(__always) get {
_sanityCheck(largeFastIsShared)
let start = self.getSharedUTF8Start()
return UnsafeBufferPointer(start: start, count: largeCount)
}
}
@inlinable
internal var nativeStorage: _StringStorage {
@inline(__always) get {
_sanityCheck(hasNativeStorage)
return Builtin.reinterpretCast(largeAddressBits)
}
}
internal var sharedStorage: _SharedStringStorage {
@inline(__always) get {
_sanityCheck(largeFastIsShared && !largeIsCocoa)
_sanityCheck(hasSharedStorage)
return Builtin.reinterpretCast(largeAddressBits)
}
}
internal var cocoaObject: AnyObject {
@inline(__always) get {
_sanityCheck(largeIsCocoa && !isImmortal)
return Builtin.reinterpretCast(largeAddressBits)
}
}
}
// Aggregate queries / abstractions
extension _StringObject {
// The number of code units stored
//
// TODO(UTF8 compiles): Check generated code
@inlinable
internal var count: Int {
@inline(__always) get { return isSmall ? smallCount : largeCount }
}
//
// Whether the string is all ASCII
//
@inlinable
internal var isASCII: Bool {
@inline(__always) get {
if isSmall { return smallIsASCII }
return _countAndFlags.isASCII
}
}
@inlinable
internal var isNFC: Bool {
@inline(__always) get {
if isSmall {
// TODO(UTF8 perf): Worth implementing more sophisiticated check, or
// else performing normalization on-construction. For now, approximate
// it with isASCII
return smallIsASCII
}
return _countAndFlags.isNFC
}
}
// Get access to fast UTF-8 contents for large strings which provide it.
@inlinable
internal var fastUTF8: UnsafeBufferPointer<UInt8> {
@inline(__always) get {
_sanityCheck(self.isLarge && self.providesFastUTF8)
let start: UnsafePointer<UInt8>
let count = self.largeCount
if _slowPath(self.largeFastIsShared) {
start = getSharedUTF8Start()
} else {
start = self.nativeUTF8Start
}
return UnsafeBufferPointer(start: start, count: count)
}
}
// Whether the object stored can be bridged directly as a NSString
@usableFromInline // @opaque
internal var hasObjCBridgeableObject: Bool {
// Currently, all mortal objects can zero-cost bridge
return !self.isImmortal
}
// Fetch the stored subclass of NSString for bridging
@inlinable
internal var objCBridgeableObject: AnyObject {
@inline(__always) get {
_sanityCheck(hasObjCBridgeableObject)
return Builtin.reinterpretCast(largeAddressBits)
}
}
// Whether the object provides fast UTF-8 contents that are nul-terminated
@inlinable
internal var isFastZeroTerminated: Bool {
if _slowPath(!providesFastUTF8) { return false }
// Small strings nul-terminate when spilling for contiguous access
if isSmall { return true }
// TODO(UTF8 perf): Use performance flag, which could be more inclusive. For
// now, we only know native strings and small strings (when accessed) are.
// We could also know about some shared strings.
return largeFastIsNative
}
}
// Object creation
extension _StringObject {
@inlinable @inline(__always)
internal init(immortal bufPtr: UnsafeBufferPointer<UInt8>, isASCII: Bool) {
// We bias to align code paths for mortal and immortal strings
let biasedAddress = UInt(
bitPattern: bufPtr.baseAddress._unsafelyUnwrappedUnchecked
) &- _StringObject.nativeBias
let countAndFlags = CountAndFlags(count: bufPtr.count, isASCII: isASCII)
self.init(
valueBits: biasedAddress,
discriminator: Nibbles.largeImmortal(),
countAndFlags: countAndFlags)
}
@inlinable @inline(__always)
internal init(_ storage: _StringStorage) {
self.init(
object: storage,
discriminator: Nibbles.largeMortal(),
countAndFlags: storage._countAndFlags)
}
internal init(_ storage: _SharedStringStorage, isASCII: Bool) {
self.init(
object: storage,
discriminator: Nibbles.largeSharedMortal(),
countAndFlags: storage._countAndFlags)
}
internal init(
cocoa: AnyObject, providesFastUTF8: Bool, isASCII: Bool, length: Int
) {
let countAndFlags = CountAndFlags(count: length, isASCII: isASCII)
let discriminator = Nibbles.largeCocoa(providesFastUTF8: providesFastUTF8)
self.init(
object: cocoa, discriminator: discriminator, countAndFlags: countAndFlags)
_sanityCheck(self.largeAddressBits == Builtin.reinterpretCast(cocoa))
_sanityCheck(self.providesFastUTF8 == providesFastUTF8)
_sanityCheck(self.largeCount == length)
}
}
// Internal invariants
extension _StringObject {
#if !INTERNAL_CHECKS_ENABLED
@inlinable @inline(__always) internal func _invariantCheck() {}
#else
@usableFromInline @inline(never) @_effects(releasenone)
internal func _invariantCheck() {
_sanityCheck(MemoryLayout<_StringObject>.size == 16)
if isForeign {
_sanityCheck(largeIsCocoa, "No other foreign forms yet")
}
if isSmall {
_sanityCheck(isImmortal)
_sanityCheck(smallCount <= 15)
_sanityCheck(smallCount == count)
_sanityCheck(!hasObjCBridgeableObject)
} else {
_sanityCheck(isLarge)
_sanityCheck(largeCount == count)
_countAndFlags._invariantCheck()
if providesFastUTF8 && largeFastIsNative {
_sanityCheck(!isSmall)
_sanityCheck(!largeIsCocoa)
if isImmortal {
_sanityCheck(!hasNativeStorage)
_sanityCheck(!hasObjCBridgeableObject)
} else {
_sanityCheck(hasNativeStorage)
_sanityCheck(hasObjCBridgeableObject)
_sanityCheck(nativeStorage.count == self.count)
nativeStorage._invariantCheck()
}
}
if largeIsCocoa {
_sanityCheck(hasObjCBridgeableObject)
_sanityCheck(!isSmall)
if isForeign {
} else {
_sanityCheck(largeFastIsShared)
}
}
}
}
#endif // INTERNAL_CHECKS_ENABLED
@inline(never)
internal func _dump() {
// TODO(UTF8): Print out any useful internal information
#if INTERNAL_CHECKS_ENABLED
if isSmall {
asSmallString._dump()
return
}
if providesFastUTF8 && largeFastIsNative {
let repr = _StringGuts(self)._classify()
print("""
Native(\
owner: \(repr._objectIdentifier!), \
count: \(repr._count), \
capacity: \(repr._capacity))
""")
} else if largeIsCocoa {
print("Cocoa")
} else {
print("TODO...")
}
#endif // INTERNAL_CHECKS_ENABLED
}
}