//===--- StringUTF8.swift - A UTF8 view of _StringCore --------------------===// // // This source file is part of the Swift.org open source project // // Copyright (c) 2014 - 2016 Apple Inc. and the Swift project authors // Licensed under Apache License v2.0 with Runtime Library Exception // // See http://swift.org/LICENSE.txt for license information // See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors // //===----------------------------------------------------------------------===// // // _StringCore currently has three representations: Native ASCII, // Native UTF-16, and Opaque Cocoa. Expose each of these as UTF-8 in a // way that will hopefully be efficient to traverse // //===----------------------------------------------------------------------===// extension _StringCore { /// An integral type that holds a sequence of UTF-8 code units, starting in /// its low byte. internal typealias _UTF8Chunk = UInt64 /// Encode text starting at `i` as UTF-8. Returns a pair whose first /// element is the index of the text following whatever got encoded, /// and the second element contains the encoded UTF-8 starting in its /// low byte. Any unused high bytes in the result will be set to /// 0xFF. @warn_unused_result func _encodeSomeUTF8(from i: Int) -> (Int, _UTF8Chunk) { _sanityCheck(i <= count) if _fastPath(elementWidth == 1) { // How many UTF-16 code units might we use before we've filled up // our _UTF8Chunk with UTF-8 code units? let utf16Count = Swift.min(sizeof(_UTF8Chunk.self), count - i) var result: _UTF8Chunk = ~0 // Start with all bits set _memcpy( dest: UnsafeMutablePointer(Builtin.addressof(&result)), src: UnsafeMutablePointer(startASCII + i), size: numericCast(utf16Count)) return (i + utf16Count, result) } else if _fastPath(!_baseAddress._isNull) { return _encodeSomeContiguousUTF16AsUTF8(from: i) } else { #if _runtime(_ObjC) return _encodeSomeNonContiguousUTF16AsUTF8(from: i) #else _sanityCheckFailure("_encodeSomeUTF8: Unexpected cocoa string") #endif } } /// Helper for `_encodeSomeUTF8`, above. Handles the case where the /// storage is contiguous UTF-16. @warn_unused_result func _encodeSomeContiguousUTF16AsUTF8(from i: Int) -> (Int, _UTF8Chunk) { _sanityCheck(elementWidth == 2) _sanityCheck(!_baseAddress._isNull) let storage = UnsafeBufferPointer(start: startUTF16, count: self.count) return _transcodeSomeUTF16AsUTF8(storage, i) } #if _runtime(_ObjC) /// Helper for `_encodeSomeUTF8`, above. Handles the case where the /// storage is non-contiguous UTF-16. @warn_unused_result func _encodeSomeNonContiguousUTF16AsUTF8(from i: Int) -> (Int, _UTF8Chunk) { _sanityCheck(elementWidth == 2) _sanityCheck(_baseAddress._isNull) let storage = _CollectionOf( startIndex: 0, endIndex: self.count) { (i: Int) -> UInt16 in return _cocoaStringSubscript(self, i) } return _transcodeSomeUTF16AsUTF8(storage, i) } #endif } extension String { /// A collection of UTF-8 code units that encodes a `String` value. public struct UTF8View : Collection, CustomStringConvertible, CustomDebugStringConvertible { internal let _core: _StringCore internal let _startIndex: Index internal let _endIndex: Index init(_ _core: _StringCore) { self._core = _core self._endIndex = Index(_core, _core.endIndex, Index._emptyBuffer) if _fastPath(_core.count != 0) { let (_, buffer) = _core._encodeSomeUTF8(from: 0) self._startIndex = Index(_core, 0, buffer) } else { self._startIndex = self._endIndex } } init(_ _core: _StringCore, _ s: Index, _ e: Index) { self._core = _core self._startIndex = s self._endIndex = e } /// A position in a `String.UTF8View`. public struct Index : ForwardIndex { internal typealias Buffer = _StringCore._UTF8Chunk init(_ _core: _StringCore, _ _coreIndex: Int, _ _buffer: Buffer) { self._core = _core self._coreIndex = _coreIndex self._buffer = _buffer _sanityCheck(_coreIndex >= 0) _sanityCheck(_coreIndex <= _core.count) } /// Returns the next consecutive value after `self`. /// /// - Precondition: The next value is representable. @warn_unused_result public func successor() -> Index { let currentUnit = UTF8.CodeUnit(truncatingBitPattern: _buffer) let hiNibble = currentUnit >> 4 // Map the high nibble of the current code unit into the // amount by which to increment the utf16 index. Only when // the high nibble is 1111 do we have a surrogate pair. let u16Increments = Int(bitPattern: // 1111 1110 1101 1100 1011 1010 1001 1000 0111 0110 0101 0100 0011 0010 0001 0000 0b10___01___01___01___00___00___00___00___01___01___01___01___01___01___01___01) let increment = (u16Increments >> numericCast(hiNibble << 1)) & 0x3 let nextCoreIndex = _coreIndex &+ increment let nextBuffer = Index._nextBuffer(after: _buffer) // if the nextBuffer is non-empty, we have all we need if _fastPath(nextBuffer != Index._emptyBuffer) { return Index(_core, nextCoreIndex, nextBuffer) } // If the underlying UTF16 isn't exhausted, fill a new buffer else if _fastPath(nextCoreIndex < _core.endIndex) { let (_, freshBuffer) = _core._encodeSomeUTF8(from: nextCoreIndex) return Index(_core, nextCoreIndex, freshBuffer) } else { // Produce the endIndex _precondition( nextCoreIndex == _core.endIndex, "Can't increment past endIndex of String.UTF8View") return Index(_core, nextCoreIndex, nextBuffer) } } /// True iff the index is at the end of its view or if the next /// byte begins a new UnicodeScalar. internal var _isOnUnicodeScalarBoundary : Bool { return UTF8._isValidUTF8(UInt32(truncatingBitPattern: _buffer)) || _isAtEnd } /// True iff the index is at the end of its view internal var _isAtEnd : Bool { return _buffer == Index._emptyBuffer && _coreIndex == _core.endIndex } /// The value of the buffer when it is empty internal static var _emptyBuffer: Buffer { return ~0 } /// A Buffer value with the high byte set internal static var _bufferHiByte: Buffer { return 0xFF << numericCast((sizeof(Buffer.self) &- 1) &* 8) } /// Consume a byte of the given buffer: shift out the low byte /// and put FF in the high byte internal static func _nextBuffer(after thisBuffer: Buffer) -> Buffer { return (thisBuffer >> 8) | _bufferHiByte } /// The underlying buffer we're presenting as UTF8 internal let _core: _StringCore /// The position of `self`, rounded up to the nearest unicode /// scalar boundary, in the underlying UTF16. internal let _coreIndex: Int /// If `self` is at the end of its `_core`, has the value `_endBuffer`. /// Otherwise, the low byte contains the value of internal let _buffer: Buffer } /// The position of the first code unit if the `String` is /// non-empty; identical to `endIndex` otherwise. public var startIndex: Index { return self._startIndex } /// The "past the end" position. /// /// `endIndex` is not a valid argument to `subscript`, and is always /// reachable from `startIndex` by zero or more applications of /// `successor()`. public var endIndex: Index { return self._endIndex } /// Access the element at `position`. /// /// - Precondition: `position` is a valid position in `self` and /// `position != endIndex`. public subscript(position: Index) -> UTF8.CodeUnit { let result: UTF8.CodeUnit = numericCast(position._buffer & 0xFF) _precondition(result != 0xFF, "cannot subscript using endIndex") return result } /// Access the contiguous subrange of elements enclosed by `bounds`. /// /// - Complexity: O(1) unless bridging from Objective-C requires an /// O(N) conversion. public subscript(bounds: Range) -> UTF8View { return UTF8View(_core, bounds.startIndex, bounds.endIndex) } public var description: String { return String._fromCodeUnitSequenceWithRepair(UTF8.self, input: self).0 } public var debugDescription: String { return "UTF8View(\(self.description.debugDescription))" } } /// A UTF-8 encoding of `self`. public var utf8: UTF8View { get { return UTF8View(self._core) } set { self = String(newValue) } } public var _contiguousUTF8: UnsafeMutablePointer { return _core.elementWidth == 1 ? _core.startASCII : nil } /// A contiguously-stored nul-terminated UTF-8 representation of /// `self`. /// /// To access the underlying memory, invoke /// `withUnsafeBufferPointer` on the `ContiguousArray`. public var nulTerminatedUTF8: ContiguousArray { var result = ContiguousArray() result.reserveCapacity(utf8.count + 1) result += utf8 result.append(0) return result } /// Construct the `String` corresponding to the given sequence of /// UTF-8 code units. If `utf8` contains unpaired surrogates, the /// result is `nil`. public init?(_ utf8: UTF8View) { let wholeString = String(utf8._core) if let start = utf8.startIndex.samePosition(in: wholeString), let end = utf8.endIndex.samePosition(in: wholeString) { self = wholeString[start.. Bool { // If the underlying UTF16 index differs, they're unequal if lhs._coreIndex != rhs._coreIndex { return false } // Match up bytes in the buffer var buffer = (lhs._buffer, rhs._buffer) var isContinuation: Bool repeat { let unit = ( UTF8.CodeUnit(truncatingBitPattern: buffer.0), UTF8.CodeUnit(truncatingBitPattern: buffer.1)) isContinuation = UTF8.isContinuation(unit.0) if !isContinuation { // We don't check for unit equality in this case because one of // the units might be an 0xFF read from the end of the buffer. return !UTF8.isContinuation(unit.1) } // Continuation bytes must match exactly else if unit.0 != unit.1 { return false } // Move the buffers along. buffer = ( String.UTF8Index._nextBuffer(after: buffer.0), String.UTF8Index._nextBuffer(after: buffer.1)) } while true } // Index conversions extension String.UTF8View.Index { internal init(_ core: _StringCore, _utf16Offset: Int) { let (_, buffer) = core._encodeSomeUTF8(from: _utf16Offset) self.init(core, _utf16Offset, buffer) } /// Construct the position in `utf8` that corresponds exactly to /// `utf16Index`. If no such position exists, the result is `nil`. /// /// - Precondition: `utf8Index` is an element of /// `String(utf16)!.utf8.indices`. public init?(_ utf16Index: String.UTF16Index, within utf8: String.UTF8View) { let utf16 = String.UTF16View(utf8._core) if utf16Index != utf16.startIndex && utf16Index != utf16.endIndex { _precondition( utf16Index >= utf16.startIndex && utf16Index <= utf16.endIndex, "Invalid String.UTF16Index for this UTF-8 view") // Detect positions that have no corresponding index. Note that // we have to check before and after, because an unpaired // surrogate will be decoded as a single replacement character, // thus making the corresponding position valid in UTF8. if UTF16.isTrailSurrogate(utf16[utf16Index]) && UTF16.isLeadSurrogate(utf16[utf16Index.predecessor()]) { return nil } } self.init(utf8._core, _utf16Offset: utf16Index._offset) } /// Construct the position in `utf8` that corresponds exactly to /// `unicodeScalarIndex`. /// /// - Precondition: `unicodeScalarIndex` is an element of /// `String(utf8)!.unicodeScalars.indices`. public init( _ unicodeScalarIndex: String.UnicodeScalarIndex, within utf8: String.UTF8View ) { self.init(utf8._core, _utf16Offset: unicodeScalarIndex._position) } /// Construct the position in `utf8` that corresponds exactly to /// `characterIndex`. /// /// - Precondition: `characterIndex` is an element of /// `String(utf8)!.indices`. public init(_ characterIndex: String.Index, within utf8: String.UTF8View) { self.init(utf8._core, _utf16Offset: characterIndex._base._position) } /// Returns the position in `utf16` that corresponds exactly /// to `self`, or if no such position exists, `nil`. /// /// - Precondition: `self` is an element of `String(utf16)!.utf8.indices`. @warn_unused_result public func samePosition( in utf16: String.UTF16View ) -> String.UTF16View.Index? { return String.UTF16View.Index(self, within: utf16) } /// Returns the position in `unicodeScalars` that corresponds exactly /// to `self`, or if no such position exists, `nil`. /// /// - Precondition: `self` is an element of /// `String(unicodeScalars).utf8.indices`. @warn_unused_result public func samePosition( in unicodeScalars: String.UnicodeScalarView ) -> String.UnicodeScalarIndex? { return String.UnicodeScalarIndex(self, within: unicodeScalars) } /// Returns the position in `characters` that corresponds exactly /// to `self`, or if no such position exists, `nil`. /// /// - Precondition: `self` is an element of `characters.utf8.indices`. @warn_unused_result public func samePosition( in characters: String ) -> String.Index? { return String.Index(self, within: characters) } } // Reflection extension String.UTF8View : CustomReflectable { /// Returns a mirror that reflects `self`. public var customMirror: Mirror { return Mirror(self, unlabeledChildren: self) } } extension String.UTF8View : CustomPlaygroundQuickLookable { public var customPlaygroundQuickLook: PlaygroundQuickLook { return .text(description) } }