Files
swift-mirror/stdlib/public/core/StringCreate.swift
Michael Ilseman 0ca42e9ef7 [string] Shrink storage class sizes.
* Don't allocate breadrumbs pointer if under threshold
* Increase breadrumbs threshold
* Linear 16-byte bucketing until 128 bytes, malloc_size after
* Allow cap less than _SmallString.capacity (bridging non-ASCII)

This change decreases the amount of heap usage for moderate-length
strings (< 64 UTF-8 code units in length) and increases the amount of
spare code unit capacity available (less growth needed).

Average improvements for moderate-length strings:

* 64-bit: on average, 8 bytes saved and 4 bytes of extra capacity
* 32-bit: on average, 4 bytes saved and 6 bytes of extra capacity

Additionally, on 32-bit, large-length strings also gain an average of
6 bytes of extra spare capacity.

Details:

On 64-bit, half of moderate-length allocations will save 16 bytes
while the other half get an extra 8 bytes of spare capacity.

On 32-bit, a quarter of moderate-length allocations will save 16
bytes, and the rest get an extra 4 bytes of spare
capacity. Additionally, 32-bit string's storage class now claims its
full allocation, which is its birthright. Prior to this change, we'd
have on average 1.5 bytes of spare capacity, and now we have 7.5 bytes
of spare capacity.

Breadcrumbs threshold is increased from the super-conservative 32 to
the pretty-conservative 64. Some speed improvements are incorporated
in this change, but more are in flight. Even without those eventual
improvements, this is a worthwhile change (ASCII is still fast-pathed
and irrelevant to breadcrumbing).

For a complex real-world workload, this amounts to around a 5%
improvement to transient heap usage due to all strings and a 4%
improvement to peak heap usage due to all strings. For moderate-length
strings specifically, this gives around 11% improvement to both.
2020-03-05 16:10:23 -08:00

289 lines
8.6 KiB
Swift

//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2018 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
// String Creation Helpers
//===----------------------------------------------------------------------===//
internal func _allASCII(_ input: UnsafeBufferPointer<UInt8>) -> Bool {
if input.isEmpty { return true }
// NOTE: Avoiding for-in syntax to avoid bounds checks
//
// TODO(String performance): SIMD-ize
//
let ptr = input.baseAddress._unsafelyUnwrappedUnchecked
var i = 0
let count = input.count
let stride = MemoryLayout<UInt>.stride
let address = Int(bitPattern: ptr)
let wordASCIIMask = UInt(truncatingIfNeeded: 0x8080_8080_8080_8080 as UInt64)
let byteASCIIMask = UInt8(truncatingIfNeeded: wordASCIIMask)
while (address &+ i) % stride != 0 && i < count {
guard ptr[i] & byteASCIIMask == 0 else { return false }
i &+= 1
}
while (i &+ stride) <= count {
let word: UInt = UnsafePointer(
bitPattern: address &+ i
)._unsafelyUnwrappedUnchecked.pointee
guard word & wordASCIIMask == 0 else { return false }
i &+= stride
}
while i < count {
guard ptr[i] & byteASCIIMask == 0 else { return false }
i &+= 1
}
return true
}
extension String {
internal static func _uncheckedFromASCII(
_ input: UnsafeBufferPointer<UInt8>
) -> String {
if let smol = _SmallString(input) {
return String(_StringGuts(smol))
}
let storage = __StringStorage.create(initializingFrom: input, isASCII: true)
return storage.asString
}
@usableFromInline
internal static func _fromASCII(
_ input: UnsafeBufferPointer<UInt8>
) -> String {
_internalInvariant(_allASCII(input), "not actually ASCII")
return _uncheckedFromASCII(input)
}
internal static func _fromASCIIValidating(
_ input: UnsafeBufferPointer<UInt8>
) -> String? {
if _fastPath(_allASCII(input)) {
return _uncheckedFromASCII(input)
}
return nil
}
public // SPI(Foundation)
static func _tryFromUTF8(_ input: UnsafeBufferPointer<UInt8>) -> String? {
guard case .success(let extraInfo) = validateUTF8(input) else {
return nil
}
return String._uncheckedFromUTF8(input, isASCII: extraInfo.isASCII)
}
@usableFromInline
internal static func _fromUTF8Repairing(
_ input: UnsafeBufferPointer<UInt8>
) -> (result: String, repairsMade: Bool) {
switch validateUTF8(input) {
case .success(let extraInfo):
return (String._uncheckedFromUTF8(
input, asciiPreScanResult: extraInfo.isASCII
), false)
case .error(let initialRange):
return (repairUTF8(input, firstKnownBrokenRange: initialRange), true)
}
}
internal static func _fromLargeUTF8Repairing(
uninitializedCapacity capacity: Int,
initializingWith initializer: (
_ buffer: UnsafeMutableBufferPointer<UInt8>
) throws -> Int
) rethrows -> String {
let result = try __StringStorage.create(
uninitializedCodeUnitCapacity: capacity,
initializingUncheckedUTF8With: initializer)
switch validateUTF8(result.codeUnits) {
case .success(let info):
result._updateCountAndFlags(
newCount: result.count,
newIsASCII: info.isASCII
)
return result.asString
case .error(let initialRange):
//This could be optimized to use excess tail capacity
return repairUTF8(result.codeUnits, firstKnownBrokenRange: initialRange)
}
}
@usableFromInline
internal static func _uncheckedFromUTF8(
_ input: UnsafeBufferPointer<UInt8>
) -> String {
return _uncheckedFromUTF8(input, isASCII: _allASCII(input))
}
@usableFromInline
internal static func _uncheckedFromUTF8(
_ input: UnsafeBufferPointer<UInt8>,
isASCII: Bool
) -> String {
if let smol = _SmallString(input) {
return String(_StringGuts(smol))
}
let storage = __StringStorage.create(
initializingFrom: input, isASCII: isASCII)
return storage.asString
}
// If we've already pre-scanned for ASCII, just supply the result
@usableFromInline
internal static func _uncheckedFromUTF8(
_ input: UnsafeBufferPointer<UInt8>, asciiPreScanResult: Bool
) -> String {
if let smol = _SmallString(input) {
return String(_StringGuts(smol))
}
let isASCII = asciiPreScanResult
let storage = __StringStorage.create(
initializingFrom: input, isASCII: isASCII)
return storage.asString
}
@usableFromInline
internal static func _uncheckedFromUTF16(
_ input: UnsafeBufferPointer<UInt16>
) -> String {
// TODO(String Performance): Attempt to form smol strings
// TODO(String performance): Skip intermediary array, transcode directly
// into a StringStorage space.
var contents: [UInt8] = []
contents.reserveCapacity(input.count)
let repaired = transcode(
input.makeIterator(),
from: UTF16.self,
to: UTF8.self,
stoppingOnError: false,
into: { contents.append($0) })
_internalInvariant(!repaired, "Error present")
return contents.withUnsafeBufferPointer { String._uncheckedFromUTF8($0) }
}
@inline(never) // slow path
private static func _slowFromCodeUnits<
Input: Collection,
Encoding: Unicode.Encoding
>(
_ input: Input,
encoding: Encoding.Type,
repair: Bool
) -> (String, repairsMade: Bool)?
where Input.Element == Encoding.CodeUnit {
// TODO(String Performance): Attempt to form smol strings
// TODO(String performance): Skip intermediary array, transcode directly
// into a StringStorage space.
var contents: [UInt8] = []
contents.reserveCapacity(input.underestimatedCount)
let repaired = transcode(
input.makeIterator(),
from: Encoding.self,
to: UTF8.self,
stoppingOnError: false,
into: { contents.append($0) })
guard repair || !repaired else { return nil }
let str = contents.withUnsafeBufferPointer { String._uncheckedFromUTF8($0) }
return (str, repaired)
}
@usableFromInline @inline(never) // can't be inlined w/out breaking ABI
@_specialize(
where Input == UnsafeBufferPointer<UInt8>, Encoding == Unicode.ASCII)
@_specialize(
where Input == Array<UInt8>, Encoding == Unicode.ASCII)
internal static func _fromCodeUnits<
Input: Collection,
Encoding: Unicode.Encoding
>(
_ input: Input,
encoding: Encoding.Type,
repair: Bool
) -> (String, repairsMade: Bool)?
where Input.Element == Encoding.CodeUnit {
guard _fastPath(encoding == Unicode.ASCII.self) else {
return _slowFromCodeUnits(input, encoding: encoding, repair: repair)
}
var result:String? = nil
if let contigBytes = input as? _HasContiguousBytes,
contigBytes._providesContiguousBytesNoCopy {
result = contigBytes.withUnsafeBytes { rawBufPtr in
let buffer = UnsafeBufferPointer(
start: rawBufPtr.baseAddress?.assumingMemoryBound(to: UInt8.self),
count: rawBufPtr.count)
return String._fromASCIIValidating(buffer)
}
} else {
result = Array(input).withUnsafeBufferPointer {
let buffer = UnsafeRawBufferPointer($0).bindMemory(to: UInt8.self)
return String._fromASCIIValidating(buffer)
}
}
return result != nil ?
(result!, repairsMade: false) :
_slowFromCodeUnits(input, encoding: encoding, repair: repair)
}
public // @testable
static func _fromInvalidUTF16(
_ utf16: UnsafeBufferPointer<UInt16>
) -> String {
return String._fromCodeUnits(utf16, encoding: UTF16.self, repair: true)!.0
}
@usableFromInline
internal static func _fromSubstring(
_ substring: __shared Substring
) -> String {
if substring._offsetRange == substring.base._offsetRange {
return substring.base
}
return String._copying(substring)
}
@_alwaysEmitIntoClient
@inline(never) // slow-path
internal static func _copying(_ str: String) -> String {
return String._copying(str[...])
}
@_alwaysEmitIntoClient
@inline(never) // slow-path
internal static func _copying(_ str: Substring) -> String {
if _fastPath(str._wholeGuts.isFastUTF8) {
return str._wholeGuts.withFastUTF8(range: str._offsetRange) {
String._uncheckedFromUTF8($0)
}
}
return Array(str.utf8).withUnsafeBufferPointer {
String._uncheckedFromUTF8($0)
}
}
}