//===--- StringUTF8.swift - A UTF8 view of _StringCore --------------------===// // // This source file is part of the Swift.org open source project // // Copyright (c) 2014 - 2017 Apple Inc. and the Swift project authors // Licensed under Apache License v2.0 with Runtime Library Exception // // See https://swift.org/LICENSE.txt for license information // See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors // //===----------------------------------------------------------------------===// // // _StringCore currently has three representations: Native ASCII, // Native UTF-16, and Opaque Cocoa. Expose each of these as UTF-8 in a // way that will hopefully be efficient to traverse // //===----------------------------------------------------------------------===// extension String { /// A view of a string's contents as a collection of UTF-8 code units. /// /// You can access a string's view of UTF-8 code units by using its `utf8` /// property. A string's UTF-8 view encodes the string's Unicode scalar /// values as 8-bit integers. /// /// let flowers = "Flowers 💐" /// for v in flowers.utf8 { /// print(v) /// } /// // 70 /// // 108 /// // 111 /// // 119 /// // 101 /// // 114 /// // 115 /// // 32 /// // 240 /// // 159 /// // 146 /// // 144 /// /// A string's Unicode scalar values can be up to 21 bits in length. To /// represent those scalar values using 8-bit integers, more than one UTF-8 /// code unit is often required. /// /// let flowermoji = "💐" /// for v in flowermoji.unicodeScalars { /// print(v, v.value) /// } /// // 💐 128144 /// /// for v in flowermoji.utf8 { /// print(v) /// } /// // 240 /// // 159 /// // 146 /// // 144 /// /// In the encoded representation of a Unicode scalar value, each UTF-8 code /// unit after the first is called a *continuation byte*. 
/// /// UTF8View Elements Match Encoded C Strings /// ========================================= /// /// Swift streamlines interoperation with C string APIs by letting you pass a /// `String` instance to a function as an `Int8` or `UInt8` pointer. When you /// call a C function using a `String`, Swift automatically creates a buffer /// of UTF-8 code units and passes a pointer to that buffer. The code units /// of that buffer match the code units in the string's `utf8` view. /// /// The following example uses the C `strncmp` function to compare the /// beginning of two Swift strings. The `strncmp` function takes two /// `const char*` pointers and an integer specifying the number of characters /// to compare. Because the strings are identical up to the 14th character, /// comparing only those characters results in a return value of `0`. /// /// let s1 = "They call me 'Bell'" /// let s2 = "They call me 'Stacey'" /// /// print(strncmp(s1, s2, 14)) /// // Prints "0" /// print(String(s1.utf8.prefix(14))) /// // Prints "They call me '" /// /// Extending the compared character count to 15 includes the differing /// characters, so a nonzero result is returned. /// /// print(strncmp(s1, s2, 15)) /// // Prints "-17" /// print(String(s1.utf8.prefix(15))) /// // Prints "They call me 'B" @_fixed_layout // FIXME(sil-serialize-all) public struct UTF8View : BidirectionalCollection, CustomStringConvertible, CustomDebugStringConvertible { /// Underlying UTF-16-compatible representation @_versioned internal let _core: _StringCore /// Distances to `(startIndex, endIndex)` from the endpoints of _core, /// measured in UTF-8 code units. 
/// /// Note: this is *only* here to support legacy Swift3-style slicing where /// `s.utf8[i..= -3 && _legacyOffsets.end <= 0, "out of bounds legacy end") var r = Index(encodedOffset: _core.endIndex) if _fastPath(_legacyOffsets.end == 0) { return r } switch _legacyOffsets.end { case -3: r = index(before: r); fallthrough case -2: r = index(before: r); fallthrough case -1: return index(before: r) default: Builtin.unreachable() } } @_inlineable // FIXME(sil-serialize-all) @_versioned internal func _index(atEncodedOffset n: Int) -> Index { if _fastPath(_core.isASCII) { return Index(encodedOffset: n) } if n == _core.endIndex { return endIndex } var p = UTF16.ForwardParser() var i = _core[n...].makeIterator() var buffer = Index._UTF8Buffer() Loop: while true { switch p.parseScalar(from: &i) { case .valid(let u16): let u8 = Unicode.UTF8.transcode(u16, from: Unicode.UTF16.self) ._unsafelyUnwrappedUnchecked if buffer.count + u8.count > buffer.capacity { break Loop } buffer.append(contentsOf: u8) case .error: let u8 = Unicode.UTF8.encodedReplacementCharacter if buffer.count + u8.count > buffer.capacity { break Loop } buffer.append(contentsOf: u8) case .emptyInput: break Loop } } return Index(encodedOffset: n, .utf8(buffer: buffer)) } /// Returns the next consecutive position after `i`. /// /// - Precondition: The next position is representable. 
@_inlineable // FIXME(sil-serialize-all) @inline(__always) public func index(after i: Index) -> Index { if _fastPath(_core.isASCII) { precondition(i.encodedOffset < _core.count) return Index(encodedOffset: i.encodedOffset + 1) } var j = i // Ensure j's cache is utf8 if _slowPath(j._cache.utf8 == nil) { j = _index(atEncodedOffset: j.encodedOffset) precondition(j != endIndex, "Index out of bounds") } let buffer = j._cache.utf8._unsafelyUnwrappedUnchecked var scalarLength16 = 1 let b0 = buffer.first._unsafelyUnwrappedUnchecked var nextBuffer = buffer let leading1s = (~b0).leadingZeroBitCount if _fastPath(leading1s == 0) { // ASCII in buffer; just consume it nextBuffer.removeFirst() } else { // Number of bytes consumed in this scalar let n8 = j._transcodedOffset + 1 // If we haven't reached a scalar boundary... if _fastPath(n8 < leading1s) { // Advance to the next position in this scalar return Index( encodedOffset: j.encodedOffset, transcodedOffset: n8, .utf8(buffer: buffer)) } // We reached a scalar boundary; compute the underlying utf16's width // based on the number of utf8 code units scalarLength16 = n8 >> 2 + 1 nextBuffer.removeFirst(n8) } if _fastPath(!nextBuffer.isEmpty) { return Index( encodedOffset: j.encodedOffset + scalarLength16, .utf8(buffer: nextBuffer)) } // If nothing left in the buffer, refill it. return _index(atEncodedOffset: j.encodedOffset + scalarLength16) } @_inlineable // FIXME(sil-serialize-all) public func index(before i: Index) -> Index { if _fastPath(_core.isASCII) { precondition(i.encodedOffset > 0) return Index(encodedOffset: i.encodedOffset - 1) } if i._transcodedOffset != 0 { _sanityCheck(i._cache.utf8 != nil) var r = i r._compoundOffset = r._compoundOffset &- 1 return r } // Handle the scalar boundary the same way as the not-a-utf8-index case. // Parse a single scalar var p = Unicode.UTF16.ReverseParser() var s = _core[.. Int { if _fastPath(_core.isASCII) { return j.encodedOffset - i.encodedOffset } return j >= i ? 
_forwardDistance(from: i, to: j) : -_forwardDistance(from: j, to: i) } @_inlineable // FIXME(sil-serialize-all) @_versioned @inline(__always) internal func _forwardDistance(from i: Index, to j: Index) -> Int { var r = j._transcodedOffset - i._transcodedOffset UTF8._transcode( _core[i.encodedOffset.. UTF8.CodeUnit { @inline(__always) get { if _fastPath(_core.asciiBuffer != nil), let ascii = _core.asciiBuffer { _precondition(position < endIndex, "Index out of bounds") return ascii[position.encodedOffset] } var j = position while true { if case .utf8(let buffer) = j._cache { _onFastPath() return buffer[ buffer.index(buffer.startIndex, offsetBy: j._transcodedOffset)] } j = _index(atEncodedOffset: j.encodedOffset) precondition(j < endIndex, "Index out of bounds") } } } @_inlineable // FIXME(sil-serialize-all) public var description: String { return String(_core) } @_inlineable // FIXME(sil-serialize-all) public var debugDescription: String { return "UTF8View(\(self.description.debugDescription))" } } /// A UTF-8 encoding of `self`. @_inlineable // FIXME(sil-serialize-all) public var utf8: UTF8View { get { return UTF8View(self._core) } set { self = String(describing: newValue) } } /// A contiguously stored null-terminated UTF-8 representation of the string. /// /// To access the underlying memory, invoke `withUnsafeBufferPointer` on the /// array. /// /// let s = "Hello!" 
/// let bytes = s.utf8CString /// print(bytes) /// // Prints "[72, 101, 108, 108, 111, 33, 0]" /// /// bytes.withUnsafeBufferPointer { ptr in /// print(strlen(ptr.baseAddress!)) /// } /// // Prints "6" @_inlineable // FIXME(sil-serialize-all) public var utf8CString: ContiguousArray { var result = ContiguousArray() result.reserveCapacity(utf8.count + 1) for c in utf8 { result.append(CChar(bitPattern: c)) } result.append(0) return result } @_inlineable // FIXME(sil-serialize-all) @_versioned // FIXME(sil-serialize-all) internal func _withUnsafeBufferPointerToUTF8( _ body: (UnsafeBufferPointer) throws -> R ) rethrows -> R { if let asciiBuffer = self._core.asciiBuffer { return try body(UnsafeBufferPointer( start: asciiBuffer.baseAddress, count: asciiBuffer.count)) } var nullTerminatedUTF8 = ContiguousArray() nullTerminatedUTF8.reserveCapacity(utf8.count + 1) nullTerminatedUTF8 += utf8 nullTerminatedUTF8.append(0) return try nullTerminatedUTF8.withUnsafeBufferPointer(body) } /// Creates a string corresponding to the given sequence of UTF-8 code units. /// /// If `utf8` is an ill-formed UTF-8 code sequence, the result is `nil`. /// /// You can use this initializer to create a new string from a slice of /// another string's `utf8` view. /// /// let picnicGuest = "Deserving porcupine" /// if let i = picnicGuest.utf8.index(of: 32) { /// let adjective = String(picnicGuest.utf8[..> utf8._core.elementShift if Index( encodedOffset: utf8.startIndex.encodedOffset + offset ).samePosition(in: wholeString) == nil || Index( encodedOffset: utf8.endIndex.encodedOffset + offset ).samePosition(in: wholeString) == nil { return nil } } self = String(utf8._core) } /// Creates a string corresponding to the given sequence of UTF-8 code units. 
@_inlineable // FIXME(sil-serialize-all)
  // Non-failable overload, introduced in Swift 4.0. The availability message
  // directs Swift 3.2-mode callers to the failable
  // `String.init?(_:UTF8View)` instead.
  @available(swift, introduced: 4.0, message:
    "Please use failable String.init?(_:UTF8View) when in Swift 3.2 mode")
  public init(_ utf8: UTF8View) {
    // Performs no checks of its own: forwards the view's `_core` to the
    // `String` initializer that takes a core.
    self = String(utf8._core)
  }

  /// The index type for subscripting a string.
  public typealias UTF8Index = UTF8View.Index
}

extension String.UTF8View : _SwiftStringView {
  /// A `String` built from this view's `_core`.
  @_inlineable // FIXME(sil-serialize-all)
  @_versioned // FIXME(sil-serialize-all)
  internal var _persistentContent : String { return String(self._core) }
}

extension String.UTF8View {
  /// An iterator over the UTF-8 code units of a `String.UTF8View`.
  ///
  /// Code units that have been produced but not yet returned are staged in
  /// `_buffer`, one per byte, lowest byte first. Each staged byte holds
  /// `codeUnit + 1`, so a `_buffer` value of zero means nothing is staged.
  @_fixed_layout // FIXME(sil-serialize-all)
  public struct Iterator {
    /// Storage for staged code units: 64 bits, i.e. room for eight bytes.
    internal typealias _OutputBuffer = UInt64
    /// The string core whose contents are being traversed.
    @_versioned // FIXME(sil-serialize-all)
    internal let _source: _StringCore
    /// Offset into `_source` of the next element that has not yet been
    /// returned or staged in `_buffer`.
    @_versioned // FIXME(sil-serialize-all)
    internal var _sourceIndex: Int
    /// Staged code units, each stored as `codeUnit + 1`; zero when empty.
    @_versioned // FIXME(sil-serialize-all)
    internal var _buffer: _OutputBuffer
  }

  /// Returns an iterator over this view's `_core`, starting at offset zero.
  public func makeIterator() -> Iterator {
    return Iterator(_core)
  }
}

extension String.UTF8View.Iterator : IteratorProtocol {
  /// Creates an iterator over `source`, positioned at offset zero with an
  /// empty staging buffer.
  @_inlineable // FIXME(sil-serialize-all)
  @_versioned // FIXME(sil-serialize-all)
  internal init(_ source: _StringCore) {
    _source = source
    _sourceIndex = 0
    _buffer = 0
  }

  /// Returns the next UTF-8 code unit, or `nil` once nothing is staged and
  /// `_sourceIndex` has reached `_source.count`.
  @_inlineable // FIXME(sil-serialize-all)
  public mutating func next() -> Unicode.UTF8.CodeUnit?
  {
    // Something is staged: pop the lowest byte and undo the +1 bias.
    if _fastPath(_buffer != 0) {
      let r = UInt8(truncatingIfNeeded: _buffer) &- 1
      _buffer >>= 8
      return r
    }
    // Nothing staged and the source is fully consumed.
    if _slowPath(_sourceIndex == _source.count) { return nil }

    // NOTE(review): presumably keeps `_source`'s storage alive while the
    // unmanaged buffers obtained below are being read - confirm.
    defer { _fixLifetime(_source) }

    // ASCII storage: every element is already a UTF-8 code unit.
    if _fastPath(_source._unmanagedASCII != nil),
    let ascii = _source._unmanagedASCII {
      // Return the current element directly...
      let result = ascii[_sourceIndex]
      _sourceIndex += 1
      // ...and stage up to `bitWidth >> 3` (eight) following elements, one
      // per byte of `_buffer`, each biased by +1.
      for i in 0 ..< _OutputBuffer.bitWidth>>3 {
        if _sourceIndex == _source.count { break }
        _buffer |= _OutputBuffer(ascii[_sourceIndex] &+ 1) &<< (i << 3)
        _sourceIndex += 1
      }
      return result
    }

    // UTF-16 storage: refill from the unmanaged UTF-16 buffer when there is
    // one, otherwise from `_source` itself.
    if _fastPath(_source._unmanagedUTF16 != nil),
    let utf16 = _source._unmanagedUTF16 {
      return _next(refillingFrom: utf16)
    }
    return _next(refillingFrom: _source)
  }

  /// Refills the empty `_buffer` by transcoding UTF-16 code units to UTF-8,
  /// starting at `_sourceIndex`, then returns the first staged code unit.
  ///
  /// Returns `nil` if nothing could be staged.
  ///
  /// - Parameter source: A collection of UTF-16 code units indexed by `Int`;
  ///   it is read at the same offsets as `_source`.
  //
  // NOTE(review): `Source` is used as a generic parameter here, but no generic
  // parameter list is present on `_next` in this copy of the file; it looks
  // like it was lost in extraction. Restore it from the upstream source.
  @_inlineable // FIXME(sil-serialize-all)
  @_versioned // FIXME(sil-serialize-all)
  internal mutating func _next(
    refillingFrom source: Source
  ) -> Unicode.UTF8.CodeUnit?
  where Source.Element == Unicode.UTF16.CodeUnit,
  Source.Index == Int
  {
    // Callers only refill once the staging buffer has been drained.
    _sanityCheck(_buffer == 0)
    // Bit offset in `_buffer` at which the next code unit will be staged.
    var shift = 0

    // ASCII fastpath
    // Code units below 0x80 are identical in UTF-8, so stage them directly
    // (biased by +1) until a non-ASCII unit, the end of input, or a full
    // buffer is reached.
    // NOTE(review): this loop reads `_source`, while the parsing loop below
    // reads the `source` parameter - confirm that is intended.
    while _sourceIndex != _source.endIndex && shift < _OutputBuffer.bitWidth {
      let u = _source[_sourceIndex]
      if u >= 0x80 { break }
      _buffer |= _OutputBuffer(UInt8(truncatingIfNeeded: u &+ 1)) &<< shift
      _sourceIndex += 1
      shift = shift &+ 8
    }

    // General path: parse one scalar at a time from `source`, beginning at
    // the current offset, and transcode each to UTF-8.
    var i = IndexingIterator(_elements: source, _position: _sourceIndex)
    var parser = Unicode.UTF16.ForwardParser()
    Loop:
    while true {
      let u8: UTF8.EncodedScalar
      switch parser.parseScalar(from: &i) {
      case .valid(let s):
        u8 = UTF8.transcode(s, from: UTF16.self)._unsafelyUnwrappedUnchecked
      case .error(_):
        // A parse error is emitted as the encoded replacement character.
        u8 = UTF8.encodedReplacementCharacter
      case .emptyInput:
        break Loop
      }
      // Stage the scalar's bytes into a scratch copy first, so that a scalar
      // that does not fit entirely is not partially committed.
      var newBuffer = _buffer
      for x in u8 {
        newBuffer |= _OutputBuffer(x &+ 1) &<< shift
        shift = shift &+ 8
      }
      // The scalar overflowed the buffer: drop the scratch copy. `_buffer`
      // and `_sourceIndex` are left untouched, so the scalar is parsed again
      // on the next refill.
      guard _fastPath(shift <= _OutputBuffer.bitWidth) else { break Loop }
      _buffer = newBuffer
      // NOTE(review): presumably `parser._buffer.count` is the number of code
      // units the parser has pulled from `i` but not yet consumed, so this
      // rewinds to the first unparsed unit - confirm against the parser.
      _sourceIndex = i._position &- parser._buffer.count
    }
    // Nothing was staged: report exhaustion.
    guard _fastPath(_buffer != 0) else { return nil }
    // Pop the lowest staged byte and undo the +1 bias.
    let result = UInt8(truncatingIfNeeded: _buffer) &- 1
    _buffer >>= 8
    return result
  }
}

extension String.UTF8View {
@_inlineable // FIXME(sil-serialize-all) public var count: Int { if _fastPath(_core.isASCII) { return _core.count } let b = _core._unmanagedUTF16 if _fastPath(b != nil) { defer { _fixLifetime(_core) } return _count(fromUTF16: b!) } return _count(fromUTF16: self._core) } @_inlineable // FIXME(sil-serialize-all) @_versioned // FIXME(sil-serialize-all) internal func _count(fromUTF16 source: Source) -> Int where Source.Element == Unicode.UTF16.CodeUnit { var result = 0 var prev: Unicode.UTF16.CodeUnit = 0 for u in source { switch u { case 0..<0x80: result += 1 case 0x80..<0x800: result += 2 case 0x800..<0xDC00: result += 3 case 0xDC00..<0xE000: result += UTF16.isLeadSurrogate(prev) ? 1 : 3 default: result += 3 } prev = u } return result } } // Index conversions extension String.UTF8View.Index { /// Creates an index in the given UTF-8 view that corresponds exactly to the /// specified `UTF16View` position. /// /// The following example finds the position of a space in a string's `utf16` /// view and then converts that position to an index in the string's /// `utf8` view. /// /// let cafe = "Café 🍵" /// /// let utf16Index = cafe.utf16.index(of: 32)! /// let utf8Index = String.UTF8View.Index(utf16Index, within: cafe.utf8)! /// /// print(Array(cafe.utf8[.. Index { return index(after: i!) } @_inlineable // FIXME(sil-serialize-all) @available( swift, obsoleted: 4.0, message: "Any String view index conversion can fail in Swift 4; please unwrap the optional index") public func index(_ i: Index?, offsetBy n: Int) -> Index { return index(i!, offsetBy: n) } @_inlineable // FIXME(sil-serialize-all) @available( swift, obsoleted: 4.0, message: "Any String view index conversion can fail in Swift 4; please unwrap the optional indices") public func distance( from i: Index?, to j: Index?) -> Int { return distance(from: i!, to: j!) 
} @_inlineable // FIXME(sil-serialize-all) @available( swift, obsoleted: 4.0, message: "Any String view index conversion can fail in Swift 4; please unwrap the optional index") public subscript(i: Index?) -> Unicode.UTF8.CodeUnit { return self[i!] } } //===--- Slicing Support --------------------------------------------------===// /// In Swift 3.2, in the absence of type context, /// /// someString.utf8[someString.utf8.startIndex..) -> String.UTF8View.SubSequence { return String.UTF8View.SubSequence(self, _bounds: r) } @_inlineable // FIXME(sil-serialize-all) @available(swift, obsoleted: 4) public subscript(r: Range) -> String.UTF8View { if r.upperBound._transcodedOffset == 0 { return String.UTF8View( _core[r.lowerBound.encodedOffset..) -> String.UTF8View { return self[bounds.relative(to: self)] } }