mirror of
https://github.com/apple/swift.git
synced 2025-12-14 20:36:38 +01:00
484 lines
16 KiB
Swift
484 lines
16 KiB
Swift
//===----------------------------------------------------------------------===//
|
|
//
|
|
// This source file is part of the Swift.org open source project
|
|
//
|
|
// Copyright (c) 2014 - 2015 Apple Inc. and the Swift project authors
|
|
// Licensed under Apache License v2.0 with Runtime Library Exception
|
|
//
|
|
// See http://swift.org/LICENSE.txt for license information
|
|
// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
|
|
//
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
/// The core implementation of a highly-optimizable String that
|
|
/// can store both ASCII and UTF-16, and can wrap native Swift
|
|
/// _StringBuffer or NSString instances.
|
|
///
|
|
/// Usage note: when elements are 8 bits wide, this code may
|
|
/// dereference one past the end of the byte array that it owns, so
|
|
/// make sure that storage is allocated! You want a null terminator
|
|
/// anyway, so it shouldn't be a burden.
|
|
//
|
|
// Implementation note: We try hard to avoid branches in this code, so
|
|
// for example we use integer math to avoid switching on the element
|
|
// size with the ternary operator. This is also the cause of the
|
|
// extra element requirement for 8 bit elements. See the
|
|
// implementation of subscript(Int) -> UTF16.CodeUnit below for details.
|
|
struct _StringCore {
|
|
//===--------------------------------------------------------------------===//
|
|
// Internals
|
|
var _baseAddress: COpaquePointer
|
|
var _countAndFlags: UWord
|
|
var _owner: AnyObject?
|
|
|
|
/// (private) create the implementation of a string from its component parts.
|
|
init(
|
|
baseAddress: COpaquePointer,
|
|
_countAndFlags: UWord,
|
|
owner: AnyObject?
|
|
) {
|
|
self._baseAddress = baseAddress
|
|
self._countAndFlags = _countAndFlags
|
|
self._owner = owner
|
|
_invariantCheck()
|
|
}
|
|
|
|
func _invariantCheck() {
|
|
assert(count >= 0)
|
|
|
|
if _baseAddress == .null() {
|
|
assert(cocoaBuffer,
|
|
"Only opaque cocoa strings may have a null base pointer")
|
|
assert(elementWidth == 2,
|
|
"Opaque cocoa strings should have an elementWidth of 2")
|
|
}
|
|
else if _baseAddress == _emptyStringBase {
|
|
assert(count == 0, "Empty string storage with non-zero length")
|
|
assert(!_owner, "String pointing at empty storage has owner")
|
|
}
|
|
else if let buffer = nativeBuffer {
|
|
assert(elementWidth == buffer.elementWidth,
|
|
"_StringCore elementWidth doesn't match its buffer's")
|
|
assert(UnsafePointer(_baseAddress) >= buffer.start)
|
|
assert(UnsafePointer(_baseAddress) <= buffer.usedEnd)
|
|
assert(UnsafePointer(_pointerToNth(count)) <= buffer.usedEnd)
|
|
}
|
|
}
|
|
|
|
/// Bitmask for the count part of _countAndFlags
|
|
var _countMask: UWord {
|
|
return UWord.max >> 2
|
|
}
|
|
|
|
/// Bitmask for the flags part of _countAndFlags
|
|
var _flagMask: UWord {
|
|
return ~_countMask
|
|
}
|
|
|
|
/// Value by which to multiply a 2nd byte fetched in order to
|
|
/// assemble a UTF16 code unit from our contiguous storage. If we
|
|
/// store ASCII, this will be zero. Otherwise, it will be 0x100
|
|
var _highByteMultiplier: UTF16.CodeUnit {
|
|
return UTF16.CodeUnit(elementShift) << 8
|
|
}
|
|
|
|
/// Return a pointer to the Nth element of contiguous
|
|
/// storage. Caveats: The string must have contiguous storage; the
|
|
/// element may be 1 or 2 bytes wide, depending on elementWidth; the
|
|
/// result may be null if the string is empty.
|
|
func _pointerToNth(n: Int) -> COpaquePointer {
|
|
assert(hasContiguousStorage && n >= 0 && n <= count)
|
|
return COpaquePointer(
|
|
UnsafePointer<RawByte>(_baseAddress) + (n << elementShift))
|
|
}
|
|
|
|
static func _copyElements(
|
|
srcStart: COpaquePointer, srcElementWidth: Int,
|
|
dstStart: COpaquePointer, dstElementWidth: Int,
|
|
count: Int
|
|
) {
|
|
// Copy the old stuff into the new storage
|
|
if _fastPath(srcElementWidth == dstElementWidth) {
|
|
// No change in storage width; we can use memcpy
|
|
c_memcpy(
|
|
dest: UnsafePointer(dstStart),
|
|
src: UnsafePointer(srcStart),
|
|
size: UInt(count << (srcElementWidth - 1)))
|
|
}
|
|
else if (srcElementWidth < dstElementWidth) {
|
|
// Widening ASCII to UTF16; we need to copy the bytes manually
|
|
var dest = UnsafePointer<UTF16.CodeUnit>(dstStart)
|
|
var src = UnsafePointer<UTF8.CodeUnit>(srcStart)
|
|
let srcEnd = src + count
|
|
while (src != srcEnd) {
|
|
dest++.set(UTF16.CodeUnit(src++.get()))
|
|
}
|
|
}
|
|
else {
|
|
// Narrowing UTF16 to ASCII; we need to copy the bytes manually
|
|
var dest = UnsafePointer<UTF8.CodeUnit>(dstStart)
|
|
var src = UnsafePointer<UTF16.CodeUnit>(srcStart)
|
|
let srcEnd = src + count
|
|
while (src != srcEnd) {
|
|
dest++.set(UTF8.CodeUnit(src++.get()))
|
|
}
|
|
}
|
|
}
|
|
|
|
//===--------------------------------------------------------------------===//
|
|
// Initialization
|
|
init(
|
|
baseAddress: COpaquePointer,
|
|
count: Int,
|
|
elementShift: Int,
|
|
hasCocoaBuffer: Bool,
|
|
owner: AnyObject?
|
|
) {
|
|
assert(elementShift == 0 || elementShift == 1)
|
|
self._baseAddress = baseAddress
|
|
|
|
self._countAndFlags
|
|
= (UWord(elementShift) << UWord(sizeof(UWord.self) * 8 - 1))
|
|
| ((hasCocoaBuffer ? 1 : 0) << UWord(sizeof(UWord.self) * 8 - 2))
|
|
| UWord(count)
|
|
|
|
self._owner = owner
|
|
assert(UWord(count) & _flagMask == 0, "String too long to represent")
|
|
_invariantCheck()
|
|
}
|
|
|
|
/// Create a _StringCore that covers the entire length of the _StringBuffer.
|
|
init(_ buffer: _StringBuffer) {
|
|
self = _StringCore(
|
|
baseAddress: COpaquePointer(buffer.start),
|
|
count: buffer.usedCount,
|
|
elementShift: buffer.elementShift,
|
|
hasCocoaBuffer: false,
|
|
owner: buffer
|
|
)
|
|
}
|
|
|
|
/// Create the implementation of an empty string.
|
|
/// NOTE: there is no null terminator in an empty string!
|
|
init() {
|
|
self._baseAddress = _emptyStringBase
|
|
self._countAndFlags = 0
|
|
self._owner = .None
|
|
_invariantCheck()
|
|
}
|
|
|
|
//===--------------------------------------------------------------------===//
|
|
// Properties
|
|
|
|
/// The number of elements stored
|
|
var count: Int {
|
|
get {
|
|
return Int(_countAndFlags & _countMask)
|
|
}
|
|
set(newValue) {
|
|
assert(UWord(newValue) & _flagMask == 0)
|
|
_countAndFlags = (_countAndFlags & _flagMask) | UWord(newValue)
|
|
}
|
|
}
|
|
|
|
/// left shift amount to apply to an offset N so that when
|
|
/// added to a UnsafePointer<RawByte>, it traverses N elements
|
|
var elementShift: Int {
|
|
return Int(_countAndFlags >> UWord(sizeof(UWord.self) * 8 - 1))
|
|
}
|
|
|
|
/// the number of bytes per element
|
|
var elementWidth: Int {
|
|
return elementShift + 1
|
|
}
|
|
|
|
var hasContiguousStorage: Bool {
|
|
return _fastPath(_baseAddress != .null())
|
|
}
|
|
|
|
/// are we using an NSString for storage?
|
|
var hasCocoaBuffer: Bool {
|
|
return Word((_countAndFlags << 1).value) < 0
|
|
}
|
|
|
|
var startASCII: UnsafePointer<UTF8.CodeUnit> {
|
|
assert(elementWidth == 1, "String does not contain contiguous ASCII")
|
|
return UnsafePointer(_baseAddress)
|
|
}
|
|
|
|
var startUTF16: UnsafePointer<UTF16.CodeUnit> {
|
|
assert(
|
|
count == 0 || elementWidth == 2,
|
|
"String does not contain contiguous UTF16")
|
|
return UnsafePointer(_baseAddress)
|
|
}
|
|
|
|
/// the native _StringBuffer, if any, or .None.
|
|
var nativeBuffer: _StringBuffer? {
|
|
if !hasCocoaBuffer {
|
|
return _owner.map {
|
|
reinterpretCast($0) as _StringBuffer
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
/// the Cocoa String buffer, if any, or .None.
|
|
var cocoaBuffer: _CocoaString? {
|
|
if hasCocoaBuffer {
|
|
return _owner.map {
|
|
reinterpretCast($0) as _CocoaString
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
//===--------------------------------------------------------------------===//
|
|
// slicing
|
|
|
|
/// Return the given sub-_StringCore
|
|
subscript(subRange: Range<Int>) -> _StringCore {
|
|
|
|
assert(subRange.startIndex >= 0)
|
|
assert(subRange.endIndex <= count)
|
|
|
|
let newCount = subRange.endIndex - subRange.startIndex
|
|
assert(UWord(newCount) & _flagMask == 0)
|
|
|
|
if hasContiguousStorage {
|
|
return _StringCore(
|
|
baseAddress: _pointerToNth(subRange.startIndex),
|
|
_countAndFlags: (_countAndFlags & _flagMask) | UWord(newCount),
|
|
owner: _owner)
|
|
}
|
|
return _cocoaStringSlice(target: self, subRange: subRange)
|
|
}
|
|
|
|
/// Get the Nth UTF16 Code Unit stored
|
|
subscript(position: Int) -> UTF16.CodeUnit {
|
|
assert(position >= 0)
|
|
assert(position <= count)
|
|
|
|
if (_baseAddress != .null()) {
|
|
let p = UnsafePointer<UInt8>(_pointerToNth(position).value)
|
|
// Always dereference two bytes, but when elements are 8 bits we
|
|
// multiply the high byte by 0.
|
|
return UTF16.CodeUnit(p.get())
|
|
+ UTF16.CodeUnit((p + 1).get()) * _highByteMultiplier
|
|
}
|
|
|
|
return _cocoaStringSubscript(target: self, position: position)
|
|
}
|
|
|
|
/// Write the string, in the given encoding, to output.
|
|
func encode<
|
|
Encoding: UnicodeCodec,
|
|
Output: Sink
|
|
where Encoding.CodeUnit == Output.Element
|
|
>(encoding: Encoding.Type, output: Output)
|
|
{
|
|
if _fastPath(_baseAddress != .null()) {
|
|
if _fastPath(elementWidth == 1) {
|
|
var out = output
|
|
for x in UnsafeArray(
|
|
start: UnsafePointer<UTF8.CodeUnit>(_baseAddress), length: count
|
|
) {
|
|
Encoding.encode(UnicodeScalar(UInt32(x)), output: &out)
|
|
}
|
|
}
|
|
else {
|
|
transcode(UTF16.self, encoding,
|
|
UnsafeArray(
|
|
start: UnsafePointer<UTF16.CodeUnit>(_baseAddress),
|
|
length: count
|
|
),
|
|
output
|
|
)
|
|
}
|
|
}
|
|
else if (hasCocoaBuffer) {
|
|
_StringCore(
|
|
_cocoaStringToContiguous(source: cocoaBuffer!, range: 0...count,
|
|
minimumCapacity: 0)
|
|
).encode(encoding, output: output)
|
|
}
|
|
}
|
|
|
|
/// Attempt to claim unused capacity in the String's existing
|
|
/// native buffer, if any. Return zero and a pointer to the claimed
|
|
/// storage if successful. Otherwise, returns a suggested new
|
|
/// capacity and a null pointer.
|
|
///
|
|
/// Note: If successful, effectively appends garbage to the String
|
|
/// until it has newSize UTF16 code units; you must immediately copy
|
|
/// valid UTF16 into that storage.
|
|
///
|
|
/// Note: if unsuccessful because of insufficient space in an
|
|
/// existing buffer, the suggested new capacity will at least double
|
|
/// the existing buffer's storage
|
|
mutating func _claimCapacity(newSize: Int,
|
|
minElementWidth: Int) -> (Int, COpaquePointer) {
|
|
if _fastPath(nativeBuffer && elementWidth >= minElementWidth) {
|
|
var buffer = nativeBuffer!
|
|
|
|
// The buffer's "used" field must match this in order to be
|
|
// grown. Otherwise, some other String is using parts of
|
|
// the buffer beyond our last byte.
|
|
let matchUsed = _pointerToNth(count)
|
|
|
|
// Attempt to claim unused capacity in the buffer
|
|
if _fastPath(buffer.grow(
|
|
UnsafePointer<RawByte>(matchUsed.value), newUsedCount: newSize)) {
|
|
count = newSize
|
|
return (0, matchUsed)
|
|
}
|
|
else if newSize > buffer.capacity {
|
|
// Growth failed because of insufficient storage; double the size
|
|
return (max(buffer.capacity * 2, newSize), .null())
|
|
}
|
|
}
|
|
return (newSize, .null())
|
|
}
|
|
|
|
/// Ensure that this String references a _StringBuffer having
|
|
/// a capacity of at least newSize elements of at least the given width.
|
|
/// Effectively appends garbage to the String until it has newSize
|
|
/// UTF16 code units. Returns a pointer to the garbage code units;
|
|
/// you must immediately copy valid data into that storage.
|
|
mutating func _growBuffer(
|
|
newSize: Int, minElementWidth: Int
|
|
) -> COpaquePointer {
|
|
let (newCapacity, existingStorage)
|
|
= _claimCapacity(newSize, minElementWidth: minElementWidth)
|
|
|
|
if _fastPath(!existingStorage.isNull()) {
|
|
return existingStorage
|
|
}
|
|
|
|
// Allocate storage.
|
|
let newElementWidth =
|
|
minElementWidth >= elementWidth
|
|
? minElementWidth
|
|
: representableAsASCII() ? 1 : 2
|
|
|
|
var newStorage = _StringBuffer(capacity: newCapacity, initialSize: newSize,
|
|
elementWidth: newElementWidth)
|
|
|
|
var oldCount = count
|
|
if hasContiguousStorage {
|
|
_StringCore._copyElements(
|
|
_baseAddress, srcElementWidth: elementWidth,
|
|
dstStart: COpaquePointer(newStorage.start),
|
|
dstElementWidth: newElementWidth, count: oldCount)
|
|
}
|
|
else {
|
|
// Opaque cocoa buffers might not store ASCII, so assert that
|
|
// we've allocated for 2-byte elements.
|
|
// FIXME: can we get Cocoa to tell us quickly that an opaque
|
|
// string is ASCII? Do we care much about that edge case?
|
|
assert(newStorage.elementShift == 1)
|
|
_cocoaStringReadAll(source: cocoaBuffer!,
|
|
destination: UnsafePointer(newStorage.start))
|
|
}
|
|
|
|
self = _StringCore(newStorage)
|
|
return _pointerToNth(oldCount)
|
|
}
|
|
|
|
mutating func append(c: UnicodeScalar) {
|
|
_invariantCheck()
|
|
// How many bytes does it take to encode each UTF16 code unit of
|
|
// c if ASCII storage is available?
|
|
let minBytesPerCodeUnit = c.value <= 0x7f ? 1 : 2
|
|
// How many UTF16 code units does it take to encode c?
|
|
let utf16Width = c.value <= 0xFFFF ? 1 : 2
|
|
|
|
let destination = _growBuffer(count + utf16Width,
|
|
minElementWidth: minBytesPerCodeUnit)
|
|
|
|
if _fastPath(elementWidth == 1) {
|
|
assert(
|
|
_pointerToNth(count)
|
|
== COpaquePointer(UnsafePointer<RawByte>(destination) + 1))
|
|
|
|
UnsafePointer<UTF8.CodeUnit>(destination).set(UTF8.CodeUnit(c.value))
|
|
}
|
|
else {
|
|
let destination16 = UnsafePointer<UTF16.CodeUnit>(destination.value)
|
|
if _fastPath(utf16Width == 1) {
|
|
assert(_pointerToNth(count) == COpaquePointer(destination16 + 1))
|
|
destination16.set(UTF16.CodeUnit(c.value))
|
|
}
|
|
else {
|
|
assert(_pointerToNth(count) == COpaquePointer(destination16 + 2))
|
|
destination16.set(UTF16.leadSurrogate(c))
|
|
(destination16 + 1).set(UTF16.trailSurrogate(c))
|
|
}
|
|
}
|
|
_invariantCheck()
|
|
}
|
|
|
|
mutating func append(rhs: _StringCore) {
|
|
_invariantCheck()
|
|
let minElementWidth
|
|
= elementWidth >= rhs.elementWidth
|
|
? elementWidth
|
|
: rhs.representableAsASCII() ? 1 : 2
|
|
|
|
let destination = _growBuffer(count + rhs.count,
|
|
minElementWidth: minElementWidth)
|
|
if _fastPath(rhs.hasContiguousStorage) {
|
|
_StringCore._copyElements(
|
|
rhs._baseAddress, srcElementWidth: rhs.elementWidth,
|
|
dstStart: destination, dstElementWidth:elementWidth, count: rhs.count)
|
|
}
|
|
else {
|
|
assert(elementWidth == 2)
|
|
_cocoaStringReadAll(source: rhs.cocoaBuffer!,
|
|
destination: UnsafePointer(destination))
|
|
}
|
|
_invariantCheck()
|
|
}
|
|
|
|
/// Return true iff the contents of this string can be
|
|
/// represented as pure ASCII. O(N) in the worst case
|
|
func representableAsASCII() -> Bool {
|
|
if _slowPath(!hasContiguousStorage) {
|
|
return false
|
|
}
|
|
if _fastPath(elementWidth == 1) {
|
|
return true
|
|
}
|
|
return !contains(
|
|
UnsafeArray(start: UnsafePointer<UTF16.CodeUnit>(_baseAddress),
|
|
length: count)
|
|
) { $0 > 0x7f }
|
|
}
|
|
}
|
|
|
|
extension _StringCore : Collection {
|
|
var startIndex: Int {
|
|
return 0
|
|
}
|
|
|
|
var endIndex: Int {
|
|
return count
|
|
}
|
|
|
|
func generate() -> IndexingGenerator<_StringCore> {
|
|
return IndexingGenerator(self)
|
|
}
|
|
}
|
|
|
|
extension _StringCore : Sliceable {}
|
|
|
|
// Used to support a tighter invariant: all strings with contiguous
|
|
// storage have a non-NULL base address.
|
|
var _emptyStringStorage: UInt32 = 0
|
|
|
|
var _emptyStringBase: COpaquePointer {
|
|
return COpaquePointer(
|
|
UnsafePointer<UInt16>(Builtin.addressof(&_emptyStringStorage)))
|
|
}
|
|
|