Files
swift-mirror/stdlib/public/core/StringUTF8.swift
Jordan Rose bc83940301 Make pointer nullability explicit using Optional.
Implements SE-0055: https://github.com/apple/swift-evolution/blob/master/proposals/0055-optional-unsafe-pointers.md

- Add NULL as an extra inhabitant of Builtin.RawPointer (currently
  hardcoded to 0 rather than being target-dependent).
- Import non-object pointers as Optional/IUO when nullable/null_unspecified
  (like everything else).
- Change the type checker's *-to-pointer conversions to handle a layer of
  optional.
- Use 'AutoreleasingUnsafeMutablePointer<NSError?>?' as the type of error
  parameters exported to Objective-C.
- Drop NilLiteralConvertible conformance for all pointer types.
- Update the standard library and then all the tests.

I've decided to leave this commit only updating existing tests; any new
tests will come in the following commits. (That may mean some additional
implementation work to follow.)

The other major piece that's missing here is migration. I'm hoping we get
a lot of that with Swift 1.1's work for optional object references, but
I still need to investigate.
2016-04-11 20:06:38 -07:00

426 lines
14 KiB
Swift

//===--- StringUTF8.swift - A UTF8 view of _StringCore --------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2016 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See http://swift.org/LICENSE.txt for license information
// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
//
// _StringCore currently has three representations: Native ASCII,
// Native UTF-16, and Opaque Cocoa. Expose each of these as UTF-8 in a
// way that will hopefully be efficient to traverse
//
//===----------------------------------------------------------------------===//
extension _StringCore {
/// An integral type that holds a sequence of UTF-8 code units, starting in
/// its low byte.
///
/// Being a `UInt64`, one chunk has room for up to eight code units.
internal typealias _UTF8Chunk = UInt64
/// Encode text starting at `i` as UTF-8. Returns a pair whose first
/// element is the index of the text following whatever got encoded,
/// and the second element contains the encoded UTF-8 starting in its
/// low byte. Any unused high bytes in the result will be set to
/// 0xFF.
///
/// The work is dispatched on the storage representation: single-byte
/// elements are copied directly, contiguous UTF-16 is handed to
/// `_encodeSomeContiguousUTF16AsUTF8`, and anything else goes to
/// `_encodeSomeNonContiguousUTF16AsUTF8`, which is only compiled in when
/// the Objective-C runtime is available.
///
/// - Parameter i: The offset in the core at which to start encoding;
///   must not exceed `count`.
@warn_unused_result
func _encodeSomeUTF8(from i: Int) -> (Int, _UTF8Chunk) {
_sanityCheck(i <= count)
if _fastPath(elementWidth == 1) {
// How many UTF-16 code units might we use before we've filled up
// our _UTF8Chunk with UTF-8 code units?
let utf16Count = Swift.min(sizeof(_UTF8Chunk.self), count - i)
var result: _UTF8Chunk = ~0 // Start with all bits set
// Copy `utf16Count` bytes of the single-byte storage over the start of
// `result`; the bytes that are not overwritten keep their 0xFF fill.
_memcpy(
dest: UnsafeMutablePointer(Builtin.addressof(&result)),
src: UnsafeMutablePointer(startASCII + i),
size: numericCast(utf16Count))
return (i + utf16Count, result)
} else if _fastPath(_baseAddress != nil) {
// Two-byte elements backed by a contiguous buffer.
return _encodeSomeContiguousUTF16AsUTF8(from: i)
} else {
// No base address: the storage is not contiguous.
#if _runtime(_ObjC)
return _encodeSomeNonContiguousUTF16AsUTF8(from: i)
#else
_sanityCheckFailure("_encodeSomeUTF8: Unexpected cocoa string")
#endif
}
}
/// Transcodes a chunk of UTF-8 starting at `i` out of contiguous UTF-16
/// storage. This is the contiguous-UTF-16 branch of `_encodeSomeUTF8`.
@warn_unused_result
func _encodeSomeContiguousUTF16AsUTF8(from i: Int) -> (Int, _UTF8Chunk) {
  _sanityCheck(elementWidth == 2)
  _sanityCheck(_baseAddress != nil)
  // Wrap the whole UTF-16 buffer and let the shared transcoder pick up
  // at offset `i`.
  let utf16Buffer = UnsafeBufferPointer(start: startUTF16, count: count)
  return _transcodeSomeUTF16AsUTF8(utf16Buffer, i)
}
#if _runtime(_ObjC)
/// Transcodes a chunk of UTF-8 starting at `i` when the UTF-16 storage is
/// not contiguous. This is the fallback branch of `_encodeSomeUTF8`.
@warn_unused_result
func _encodeSomeNonContiguousUTF16AsUTF8(from i: Int) -> (Int, _UTF8Chunk) {
  _sanityCheck(elementWidth == 2)
  _sanityCheck(_baseAddress == nil)
  // There is no buffer to point at, so present the code units as a
  // collection that fetches each one through `_cocoaStringSubscript`.
  let codeUnits = _CollectionOf<Int, UInt16>(
    startIndex: 0, endIndex: count
  ) { (offset: Int) -> UInt16 in
    _cocoaStringSubscript(self, offset)
  }
  return _transcodeSomeUTF16AsUTF8(codeUnits, i)
}
#endif
}
extension String {
/// A collection of UTF-8 code units that encodes a `String` value.
public struct UTF8View : Collection, CustomStringConvertible, CustomDebugStringConvertible {
/// The string storage this view presents as UTF-8.
internal let _core: _StringCore
/// The position returned by `startIndex`.
internal let _startIndex: Index
/// The position returned by `endIndex`.
internal let _endIndex: Index
/// Creates a view over the whole of `_core`.
init(_ _core: _StringCore) {
  let end = Index(_core, _core.endIndex, Index._emptyBuffer)
  self._core = _core
  self._endIndex = end
  if _fastPath(_core.count != 0) {
    // Pre-fill the first index's buffer with the leading code units.
    self._startIndex = Index(_core, 0, _core._encodeSomeUTF8(from: 0).1)
  } else {
    // An empty view starts where it ends.
    self._startIndex = end
  }
}
/// Creates a view over `_core` bounded by the already-formed indices
/// `s` (start) and `e` (end).
init(_ _core: _StringCore, _ s: Index, _ e: Index) {
  self._startIndex = s
  self._endIndex = e
  self._core = _core
}
/// A position in a `String.UTF8View`.
public struct Index : ForwardIndex {
/// The integer type in which an index carries its buffered UTF-8 code
/// units.
internal typealias Buffer = _StringCore._UTF8Chunk
/// Creates an index at UTF-16 offset `_coreIndex` of `_core` whose
/// buffered UTF-8 code units are `_buffer`.
init(_ _core: _StringCore, _ _coreIndex: Int, _ _buffer: Buffer) {
  // The offset must lie within the core; its end position is allowed.
  _sanityCheck(_coreIndex >= 0)
  _sanityCheck(_coreIndex <= _core.count)
  self._buffer = _buffer
  self._coreIndex = _coreIndex
  self._core = _core
}
/// Returns the next consecutive value after `self`.
///
/// Advancing consumes one byte of the buffer. The UTF-16 offset is moved
/// according to the code unit being left behind, and a fresh buffer is
/// encoded only once the current one has been used up.
///
/// - Precondition: The next value is representable.
@warn_unused_result
public func successor() -> Index {
let currentUnit = UTF8.CodeUnit(truncatingBitPattern: _buffer)
let hiNibble = currentUnit >> 4
// Map the high nibble of the current code unit into the
// amount by which to increment the utf16 index. Only when
// the high nibble is 1111 do we have a surrogate pair.
//
// Reading the table below: nibbles 0000-0111 advance by 1, nibbles
// 1000-1011 advance by 0, nibbles 1100-1110 advance by 1 and nibble
// 1111 advances by 2.
let u16Increments = Int(bitPattern:
// 1111 1110 1101 1100 1011 1010 1001 1000 0111 0110 0101 0100 0011 0010 0001 0000
0b10___01___01___01___00___00___00___00___01___01___01___01___01___01___01___01)
// Every nibble owns a 2-bit field, hence the shift by twice the nibble.
let increment = (u16Increments >> numericCast(hiNibble << 1)) & 0x3
let nextCoreIndex = _coreIndex &+ increment
let nextBuffer = Index._nextBuffer(after: _buffer)
// if the nextBuffer is non-empty, we have all we need
if _fastPath(nextBuffer != Index._emptyBuffer) {
return Index(_core, nextCoreIndex, nextBuffer)
}
// If the underlying UTF16 isn't exhausted, fill a new buffer
else if _fastPath(nextCoreIndex < _core.endIndex) {
let (_, freshBuffer) = _core._encodeSomeUTF8(from: nextCoreIndex)
return Index(_core, nextCoreIndex, freshBuffer)
}
else {
// Produce the endIndex
// When `self` is already the end index its buffer is all ones, so the
// table yields an increment of 2 and the check below fails.
_precondition(
nextCoreIndex == _core.endIndex,
"Can't increment past endIndex of String.UTF8View")
return Index(_core, nextCoreIndex, nextBuffer)
}
}
/// True iff a Unicode scalar can be decoded starting at the current
/// byte, or the index is at the end of its view.
internal var _isOnUnicodeScalarBoundary : Bool {
  // Only the low four bytes of the buffer are handed to the decoder.
  let lowBytes = UInt32(truncatingBitPattern: _buffer)
  let decoded = UTF8._decodeOne(lowBytes)
  if decoded.0 != nil {
    return true
  }
  return _isAtEnd
}
/// True iff the index is at the end of its view: the buffer is empty
/// and the UTF-16 offset equals the core's `endIndex`.
internal var _isAtEnd : Bool {
  guard _buffer == Index._emptyBuffer else { return false }
  return _coreIndex == _core.endIndex
}
/// The value of the buffer when it holds no code units: every bit set,
/// so every byte reads as 0xFF.
internal static var _emptyBuffer: Buffer {
  return Buffer.max
}
/// A Buffer value whose highest byte is 0xFF and whose other bytes are
/// zero.
internal static var _bufferHiByte: Buffer {
  // Number of bits between the lowest and the highest byte.
  let topByteShift = (sizeof(Buffer.self) &- 1) &* 8
  return 0xFF << numericCast(topByteShift)
}
/// Consumes one byte of `thisBuffer`: the low byte is shifted out and
/// 0xFF is placed in the vacated high byte.
internal static func _nextBuffer(after thisBuffer: Buffer) -> Buffer {
  let remaining = thisBuffer >> 8
  return remaining | _bufferHiByte
}
/// The underlying buffer we're presenting as UTF8
internal let _core: _StringCore
/// The position of `self`, rounded up to the nearest unicode
/// scalar boundary, in the underlying UTF16.
internal let _coreIndex: Int
/// If `self` is at the end of its `_core`, has the value `_emptyBuffer`.
/// Otherwise, the low byte contains the value of the UTF-8 code unit at
/// this position.
internal let _buffer: Buffer
}
/// The position of the first code unit when the `String` is non-empty;
/// otherwise the same position as `endIndex`.
public var startIndex: Index {
  return _startIndex
}
/// The "past the end" position.
///
/// `endIndex` must not be passed to `subscript`. It can always be
/// reached from `startIndex` by applying `successor()` zero or more
/// times.
public var endIndex: Index {
  return _endIndex
}
/// Access the element at `position`.
///
/// - Precondition: `position` is a valid position in `self` and
///   `position != endIndex`.
public subscript(position: Index) -> UTF8.CodeUnit {
  // The code unit at an index is the low byte of its buffer; 0xFF there
  // means the buffer is empty.
  let codeUnit = UTF8.CodeUnit(truncatingBitPattern: position._buffer)
  _precondition(codeUnit != 0xFF, "cannot subscript using endIndex")
  return codeUnit
}
/// Access the contiguous subrange of elements enclosed by `bounds`.
///
/// The result shares this view's core and adopts the ends of `bounds`
/// as its own start and end.
///
/// - Complexity: O(1) unless bridging from Objective-C requires an
///   O(N) conversion.
public subscript(bounds: Range<Index>) -> UTF8View {
  let lower = bounds.startIndex
  let upper = bounds.endIndex
  return UTF8View(_core, lower, upper)
}
/// A textual representation of the view, built by passing its code
/// units to `String._fromCodeUnitSequenceWithRepair`.
public var description: String {
  let (text, _) = String._fromCodeUnitSequenceWithRepair(
    UTF8.self, input: self)
  return text
}
/// A debugging representation of the view: the debug description of
/// `description`, wrapped in `UTF8View(...)`.
public var debugDescription: String {
  let quoted = description.debugDescription
  return "UTF8View(\(quoted))"
}
}
/// A UTF-8 encoding of `self`.
///
/// Reading creates a new view over the string's core. Assigning replaces
/// the string with one built from the assigned view.
public var utf8: UTF8View {
get {
return UTF8View(self._core)
}
set {
// NOTE(review): `String.init?(_: UTF8View)` is failable, so for this
// assignment to type-check `String(newValue)` presumably resolves to a
// different, non-failable initializer -- confirm which one is chosen.
self = String(newValue)
}
}
/// The start of the string's storage when it is held as single-byte
/// elements; `nil` otherwise.
public var _contiguousUTF8: UnsafeMutablePointer<UTF8.CodeUnit>? {
  guard _core.elementWidth == 1 else {
    return nil
  }
  return _core.startASCII
}
/// The UTF-8 code units of `self` followed by a single zero byte, in
/// contiguous storage.
///
/// To access the underlying memory, invoke
/// `withUnsafeBufferPointer` on the `ContiguousArray`.
public var nulTerminatedUTF8: ContiguousArray<UTF8.CodeUnit> {
  let codeUnits = utf8
  var buffer = ContiguousArray<UTF8.CodeUnit>()
  // One extra slot for the terminator.
  buffer.reserveCapacity(codeUnits.count + 1)
  buffer += codeUnits
  buffer.append(0)
  return buffer
}
/// Construct the `String` corresponding to the given sequence of
/// UTF-8 code units.
///
/// The result is `nil` if either end of `utf8` has no corresponding
/// position in the string built from the view's core.
public init?(_ utf8: UTF8View) {
  let wholeString = String(utf8._core)
  guard let start = utf8.startIndex.samePosition(in: wholeString),
    let end = utf8.endIndex.samePosition(in: wholeString) else {
    return nil
  }
  self = wholeString[start..<end]
}
/// The index type for subscripting a `String`'s `.utf8` view.
public typealias UTF8Index = UTF8View.Index
}
/// Returns `true` iff `lhs` and `rhs` denote the same position: they
/// share a UTF-16 offset and their buffers agree on every continuation
/// byte up to the next non-continuation byte.
@warn_unused_result
public func == (
  lhs: String.UTF8View.Index,
  rhs: String.UTF8View.Index
) -> Bool {
  // Indices at different UTF-16 offsets can never be equal.
  guard lhs._coreIndex == rhs._coreIndex else {
    return false
  }
  // Walk both buffers in lock step, one byte at a time.
  var left = lhs._buffer
  var right = rhs._buffer
  repeat {
    let leftUnit = UTF8.CodeUnit(truncatingBitPattern: left)
    let rightUnit = UTF8.CodeUnit(truncatingBitPattern: right)
    if !UTF8.isContinuation(leftUnit) {
      // Unit equality is deliberately not checked here, because one of
      // the units might be an 0xFF read from the end of the buffer.
      return !UTF8.isContinuation(rightUnit)
    }
    // Continuation bytes have to be identical.
    if leftUnit != rightUnit {
      return false
    }
    // Step both buffers forward by one byte.
    left = String.UTF8Index._nextBuffer(after: left)
    right = String.UTF8Index._nextBuffer(after: right)
  }
  while true
}
// Index conversions
extension String.UTF8View.Index {
/// Creates the index at `_utf16Offset` in `core`, with its buffer filled
/// by encoding UTF-8 from that offset.
internal init(_ core: _StringCore, _utf16Offset: Int) {
  let encoded = core._encodeSomeUTF8(from: _utf16Offset)
  self.init(core, _utf16Offset, encoded.1)
}
/// Construct the position in `utf8` that corresponds exactly to
/// `utf16Index`. If no such position exists, the result is `nil`.
///
/// - Precondition: `utf16Index` is a valid position (`endIndex`
///   included) in the UTF-16 view of the same string as `utf8`.
public init?(_ utf16Index: String.UTF16Index, within utf8: String.UTF8View) {
let utf16 = String.UTF16View(utf8._core)
// Positions at either end of the UTF-16 view skip the checks below.
// Because the start is excluded, taking `predecessor()` further down
// never steps before the start of the view.
if utf16Index != utf16.startIndex
&& utf16Index != utf16.endIndex {
_precondition(
utf16Index >= utf16.startIndex
&& utf16Index <= utf16.endIndex,
"Invalid String.UTF16Index for this UTF-8 view")
// Detect positions that have no corresponding index. Note that
// we have to check before and after, because an unpaired
// surrogate will be decoded as a single replacement character,
// thus making the corresponding position valid in UTF8.
if UTF16.isTrailSurrogate(utf16[utf16Index])
&& UTF16.isLeadSurrogate(utf16[utf16Index.predecessor()]) {
return nil
}
}
self.init(utf8._core, _utf16Offset: utf16Index._offset)
}
/// Construct the position in `utf8` that corresponds exactly to
/// `unicodeScalarIndex`.
///
/// The new index sits at the UTF-16 offset stored in
/// `unicodeScalarIndex`.
///
/// - Precondition: `unicodeScalarIndex` is an element of
///   `String(utf8)!.unicodeScalars.indices`.
public init(
  _ unicodeScalarIndex: String.UnicodeScalarIndex,
  within utf8: String.UTF8View
) {
  let utf16Offset = unicodeScalarIndex._position
  self.init(utf8._core, _utf16Offset: utf16Offset)
}
/// Construct the position in `utf8` that corresponds exactly to
/// `characterIndex`.
///
/// The new index sits at the UTF-16 offset stored in the base of
/// `characterIndex`.
///
/// - Precondition: `characterIndex` is an element of
///   `String(utf8)!.indices`.
public init(_ characterIndex: String.Index, within utf8: String.UTF8View) {
  let utf16Offset = characterIndex._base._position
  self.init(utf8._core, _utf16Offset: utf16Offset)
}
/// Returns the position in `utf16` that corresponds exactly
/// to `self`, or `nil` if there is no such position.
///
/// - Precondition: `self` is an element of `String(utf16)!.utf8.indices`.
@warn_unused_result
public func samePosition(
  in utf16: String.UTF16View
) -> String.UTF16View.Index? {
  // The conversion itself is done by the UTF-16 index's initializer.
  let converted: String.UTF16View.Index? =
    String.UTF16View.Index(self, within: utf16)
  return converted
}
/// Returns the position in `unicodeScalars` that corresponds exactly
/// to `self`, or `nil` if there is no such position.
///
/// - Precondition: `self` is an element of
///   `String(unicodeScalars).utf8.indices`.
@warn_unused_result
public func samePosition(
  in unicodeScalars: String.UnicodeScalarView
) -> String.UnicodeScalarIndex? {
  // The conversion itself is done by the scalar index's initializer.
  let converted: String.UnicodeScalarIndex? =
    String.UnicodeScalarIndex(self, within: unicodeScalars)
  return converted
}
/// Returns the position in `characters` that corresponds exactly
/// to `self`, or `nil` if there is no such position.
///
/// - Precondition: `self` is an element of `characters.utf8.indices`.
@warn_unused_result
public func samePosition(
  in characters: String
) -> String.Index? {
  // The conversion itself is done by the character index's initializer.
  let converted: String.Index? = String.Index(self, within: characters)
  return converted
}
}
// Reflection
extension String.UTF8View : CustomReflectable {
  /// Returns a mirror that reflects `self`, passing the view itself as
  /// the collection of unlabeled children.
  public var customMirror: Mirror {
    let codeUnits = self
    return Mirror(self, unlabeledChildren: codeUnits)
  }
}
extension String.UTF8View : CustomPlaygroundQuickLookable {
  /// A text Quick Look carrying the view's `description`.
  public var customPlaygroundQuickLook: PlaygroundQuickLook {
    let text = description
    return PlaygroundQuickLook.text(text)
  }
}