Files
swift-mirror/stdlib/core/StringUTF8.swift
Dave Abrahams 386146364b [stdlib] Refactor String.UTF8Index for conversion
The old design did not strictly keep track of the index in underlying
UTF16, which would have made converting between the different index
types too difficult.  It also made equality comparison between indices
broken, because

  UTF8Index(s.utf16.startIndex+1, within: s.utf8)

and

  UTF8Index(s.utf16.startIndex, within: s.utf8).successor()

would often have completely different UTF8 buffers and offsets within
the underlying UTF16.

For some reason this disturbed SILPasses/devirt_default_case.swift,
which is now XFAIL'd.  <rdar://problem/19298212>
SILPasses/devirt_default_case.swift is XFAIL'd

Swift SVN r24012
2014-12-18 20:47:37 +00:00

317 lines
11 KiB
Swift

//===--- StringUTF8.swift - A UTF8 view of _StringCore ---------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2015 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See http://swift.org/LICENSE.txt for license information
// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
//
// _StringCore currently has three representations: Native ASCII,
// Native UTF-16, and Opaque Cocoa. Expose each of these as UTF-8 in a
// way that will hopefully be efficient to traverse.
//
//===----------------------------------------------------------------------===//
extension _StringCore {
  /// An integer wide enough to carry several UTF-8 code units at once.
  /// The first code unit occupies the low byte.
  public typealias UTF8Chunk = UInt64

  /// Encode text beginning at position `i` as UTF-8.
  ///
  /// Returns a pair: the first element is the index of the text that
  /// follows whatever was encoded; the second element holds the encoded
  /// UTF-8 code units, starting in its low byte.  High bytes that were
  /// not needed are left as 0xFF.
  func _encodeSomeUTF8(i: Int) -> (Int, UTF8Chunk) {
    _sanityCheck(i <= count)

    if _fastPath(elementWidth == 1) {
      // One-byte storage: each stored element is copied into the chunk
      // as-is, so take as many as fit in the chunk (or as many as remain).
      let unitsToCopy = min(sizeof(UTF8Chunk.self), count - i)
      // Begin with every bit set so that untouched bytes read as 0xFF.
      var chunk: UTF8Chunk = ~0
      _memcpy(
        dest: UnsafeMutablePointer(Builtin.addressof(&chunk)),
        src: UnsafeMutablePointer(startASCII + i),
        size: numericCast(unitsToCopy))
      return (i + unitsToCopy, chunk)
    }

    if _fastPath(!_baseAddress._isNull) {
      // Two-byte storage with a base address: contiguous UTF-16.
      return _encodeSomeContiguousUTF16AsUTF8(i)
    }

    // No base address: only expected when the ObjC runtime is present.
#if _runtime(_ObjC)
    return _encodeSomeNonContiguousUTF16AsUTF8(i)
#else
    _sanityCheckFailure("_encodeSomeUTF8: Unexpected cocoa string")
#endif
  }

  /// Worker for `_encodeSomeUTF8` covering contiguous UTF-16 storage.
  ///
  /// Wraps the UTF-16 storage in a buffer pointer and hands it, together
  /// with the starting position `i`, to `_transcodeSomeUTF16AsUTF8`.
  func _encodeSomeContiguousUTF16AsUTF8(i: Int) -> (Int, UTF8Chunk) {
    _sanityCheck(elementWidth == 2)
    _sanityCheck(!_baseAddress._isNull)

    let utf16Units = UnsafeBufferPointer(start: startUTF16, count: count)
    return _transcodeSomeUTF16AsUTF8(utf16Units, i)
  }

#if _runtime(_ObjC)
  /// Worker for `_encodeSomeUTF8` covering non-contiguous UTF-16 storage.
  ///
  /// Presents the string as a collection whose elements are fetched one
  /// at a time through `_cocoaStringSubscript`, then hands it, together
  /// with the starting position `i`, to `_transcodeSomeUTF16AsUTF8`.
  func _encodeSomeNonContiguousUTF16AsUTF8(i: Int) -> (Int, UTF8Chunk) {
    _sanityCheck(elementWidth == 2)
    _sanityCheck(_baseAddress._isNull)

    let utf16Units = _CollectionOf<Int, UInt16>(
      startIndex: 0, endIndex: count) {
      (offset: Int) -> UInt16 in
      return _cocoaStringSubscript(target: self, position: offset)
    }
    return _transcodeSomeUTF16AsUTF8(utf16Units, i)
  }
#endif
}
extension String {
  /// A view of a `String`'s contents as a collection of UTF-8 code units.
  public struct UTF8View : CollectionType, Reflectable {
    /// The string storage this view presents as UTF-8.
    internal let _core: _StringCore

    init(_ _core: _StringCore) {
      self._core = _core
    }

    /// A position in a `String.UTF8View`.
    public struct Index : ForwardIndexType {
      internal typealias Buffer = _StringCore.UTF8Chunk

      /// The underlying storage being presented as UTF-8.
      internal let _core: _StringCore

      /// The position of `self` in the underlying UTF-16, rounded up to
      /// the nearest unicode scalar boundary.
      internal let _coreIndex: Int

      /// When `self` is at the end of its `_core`, this is
      /// `_emptyBuffer`.  Otherwise the low byte holds the UTF-8 code
      /// unit at this position, and any code units already encoded after
      /// it sit in the following bytes; unused high bytes are 0xFF.
      internal let _buffer: Buffer

      init(_ _core: _StringCore, _ _coreIndex: Int,
        _ _buffer: Buffer) {
        self._core = _core
        self._coreIndex = _coreIndex
        self._buffer = _buffer
        _sanityCheck(_coreIndex >= 0)
        _sanityCheck(_coreIndex <= _core.count)
      }

      /// Returns the next consecutive value after `self`.
      ///
      /// Requires: the next value is representable.
      public func successor() -> Index {
        let leadingUnit = UTF8.CodeUnit(truncatingBitPattern: _buffer)
        let topNibble = leadingUnit >> 4

        // A table of 2-bit entries, one per possible high nibble of the
        // current code unit, giving how far the UTF-16 position moves
        // when that code unit is consumed.  Only nibble 1111 yields 2
        // (a surrogate pair); nibbles 1000-1011 yield 0.
        let utf16StepTable =
        // 1111 1110 1101 1100 1011 1010 1001 1000 0111 0110 0101 0100 0011 0010 0001 0000
           0b10___01___01___01___00___00___00___00___01___01___01___01___01___01___01___01
        let tableShift = topNibble << 1
        let utf16Step = (utf16StepTable >> numericCast(tableShift)) & 0x3

        let advancedCoreIndex = _coreIndex &+ utf16Step
        let shiftedBuffer = Index._nextBuffer(_buffer)

        // Code units are still buffered: nothing further to compute.
        if _fastPath(shiftedBuffer != Index._emptyBuffer) {
          return Index(_core, advancedCoreIndex, shiftedBuffer)
        }

        // Buffer drained but the underlying UTF-16 is not: refill.
        if _fastPath(advancedCoreIndex < _core.endIndex) {
          let refilledBuffer = _core._encodeSomeUTF8(advancedCoreIndex).1
          return Index(_core, advancedCoreIndex, refilledBuffer)
        }

        // Otherwise the only legal result is the endIndex.
        _precondition(
          advancedCoreIndex == _core.endIndex,
          "Can't increment past endIndex of String.UTF8View")
        return Index(_core, advancedCoreIndex, shiftedBuffer)
      }

      /// True iff the index is at the end of its view or if the next
      /// byte begins a new UnicodeScalar.
      internal var _isOnUnicodeScalarBoundary : Bool {
        if _isAtEnd {
          return true
        }
        let upcomingUnit = UTF8.CodeUnit(truncatingBitPattern: _buffer)
        return UTF8._numTrailingBytes(upcomingUnit) != 4
      }

      /// True iff the index is at the end of its view.
      internal var _isAtEnd : Bool {
        return _core.endIndex == _coreIndex
      }

      /// The value a buffer has when it holds no code units: every bit
      /// set.
      internal static var _emptyBuffer: Buffer {
        return ~0
      }

      /// A `Buffer` value in which only the highest byte is set (to
      /// 0xFF).
      internal static var _bufferHiByte: Buffer {
        let topByteShift = (sizeof(Buffer.self) &- 1) &* 8
        return 0xFF << numericCast(topByteShift)
      }

      /// Consume one byte of `thisBuffer`: the low byte is shifted out
      /// and 0xFF is placed in the high byte.
      internal static func _nextBuffer(thisBuffer: Buffer) -> Buffer {
        let lowByteDropped = thisBuffer >> 8
        return lowByteDropped | _bufferHiByte
      }
    }

    /// The position of the first code unit if the `String` is
    /// non-empty; identical to `endIndex` otherwise.
    public var startIndex: Index {
      if _fastPath(_core.count != 0) {
        return Index(_core, 0, _core._encodeSomeUTF8(0).1)
      }
      return endIndex
    }

    /// The "past the end" position.
    ///
    /// `endIndex` is not a valid argument to `subscript`, and is always
    /// reachable from `startIndex` by zero or more applications of
    /// `successor()`.
    public var endIndex: Index {
      return Index(_core, _core.endIndex, Index._emptyBuffer)
    }

    /// Access the element at `position`.
    ///
    /// Requires: `position` is a valid position in `self` and
    /// `position != endIndex`.
    public subscript(position: Index) -> UTF8.CodeUnit {
      // The element is whatever sits in the low byte of the buffer; a
      // low byte of 0xFF is rejected as the endIndex.
      let codeUnit = UTF8.CodeUnit(truncatingBitPattern: position._buffer)
      _precondition(codeUnit != 0xFF, "can not subscript using endIndex")
      return codeUnit
    }

    /// Return a *generator* over the code units that comprise this
    /// *sequence*.
    ///
    /// Complexity: O(1)
    public func generate() -> IndexingGenerator<UTF8View> {
      return IndexingGenerator(self)
    }

    /// Returns a mirror that reflects `self`.
    public func getMirror() -> MirrorType {
      return _UTF8ViewMirror(self)
    }
  }

  /// A UTF-8 encoding of `self`.
  public var utf8: UTF8View {
    return UTF8View(self._core)
  }

  /// A pointer to the start of the string's one-byte-per-element
  /// storage when its element width is 1; `nil` otherwise.
  public var _contiguousUTF8: UnsafeMutablePointer<UTF8.CodeUnit> {
    if _core.elementWidth == 1 {
      return _core.startASCII
    }
    return nil
  }

  /// A contiguously-stored nul-terminated UTF-8 representation of
  /// `self`.
  ///
  /// To access the underlying memory, invoke
  /// `withUnsafeBufferPointer` on the `ContiguousArray`.
  public var nulTerminatedUTF8: ContiguousArray<UTF8.CodeUnit> {
    let codeUnits = utf8
    var bytes = ContiguousArray<UTF8.CodeUnit>()
    // Room for every code unit plus the trailing nul.
    bytes.reserveCapacity(count(codeUnits) + 1)
    bytes += codeUnits
    bytes.append(0)
    return bytes
  }

  public typealias UTF8Index = UTF8View.Index
}
/// Returns true iff `lhs` and `rhs` denote the same position.
///
/// Two indices are equal when their underlying UTF-16 positions agree
/// and their buffers agree on every leading continuation byte up to the
/// first non-continuation byte.
public
func == (lhs: String.UTF8View.Index, rhs: String.UTF8View.Index) -> Bool {
  // Different positions in the underlying UTF-16 can never be equal.
  if lhs._coreIndex != rhs._coreIndex {
    return false
  }

  // Walk both buffers one byte at a time.
  var lhsBuffer = lhs._buffer
  var rhsBuffer = rhs._buffer
  do {
    let lhsUnit = UTF8.CodeUnit(truncatingBitPattern: lhsBuffer)
    let rhsUnit = UTF8.CodeUnit(truncatingBitPattern: rhsBuffer)

    if !UTF8.isContinuation(lhsUnit) {
      // The units themselves are deliberately not compared here: either
      // one may be an 0xFF read from the end of its buffer.
      return !UTF8.isContinuation(rhsUnit)
    }

    // Both sides must carry the very same continuation byte.
    if lhsUnit != rhsUnit {
      return false
    }

    // Advance both buffers by one byte.
    lhsBuffer = String.UTF8Index._nextBuffer(lhsBuffer)
    rhsBuffer = String.UTF8Index._nextBuffer(rhsBuffer)
  }
  while true
}
extension String.UTF8Index {
  /// Creates the index at `_utf16Offset` in `core`, with a buffer
  /// freshly encoded from that offset.
  internal init(_ core: _StringCore, _utf16Offset: Int) {
    self.init(core, _utf16Offset, core._encodeSomeUTF8(_utf16Offset).1)
  }

  /// Creates the position in `utf8` that corresponds to `sourceIndex`,
  /// or fails with `nil` when `sourceIndex` lies between the two halves
  /// of a surrogate pair.
  ///
  /// Requires: `sourceIndex` is within the bounds of the UTF-16 view of
  /// the same string.
  public init?(_ sourceIndex: String.UTF16Index, within utf8: String.UTF8View) {
    let utf16View = String.UTF16View(utf8._core)

    let isAtStart = sourceIndex == utf16View.startIndex
    let isAtEnd = sourceIndex == utf16View.endIndex
    if !(isAtStart || isAtEnd) {
      _precondition(
        sourceIndex >= utf16View.startIndex
        && sourceIndex <= utf16View.endIndex,
        "Invalid String.UTF16Index for this UTF-8 view")

      // Reject positions that have no UTF-8 counterpart.  Both the unit
      // at the position and the one before it are examined, because an
      // unpaired surrogate is decoded as a single replacement character
      // and its position therefore *is* valid in UTF-8.
      if UTF16.isTrailSurrogate(utf16View[sourceIndex]) {
        if UTF16.isLeadSurrogate(utf16View[sourceIndex.predecessor()]) {
          return nil
        }
      }
    }
    self.init(utf8._core, _utf16Offset: sourceIndex._offset)
  }

  /// Creates the position in `utf8` that corresponds to the unicode
  /// scalar position `sourceIndex`.
  public init(
    _ sourceIndex: String.UnicodeScalarIndex, within utf8: String.UTF8View) {
    let utf16Offset = sourceIndex._position
    self.init(utf8._core, _utf16Offset: utf16Offset)
  }

  /// Creates the position in `utf8` that corresponds to the `String`
  /// position `sourceIndex`.
  public init(_ sourceIndex: String.Index, within utf8: String.UTF8View) {
    let utf16Offset = sourceIndex._base._position
    self.init(utf8._core, _utf16Offset: utf16Offset)
  }
}