//===--- StringUTF8.swift - A UTF8 view of _StringCore --------------------===// // // This source file is part of the Swift.org open source project // // Copyright (c) 2014 - 2016 Apple Inc. and the Swift project authors // Licensed under Apache License v2.0 with Runtime Library Exception // // See https://swift.org/LICENSE.txt for license information // See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors // //===----------------------------------------------------------------------===// // // _StringCore currently has three representations: Native ASCII, // Native UTF-16, and Opaque Cocoa. Expose each of these as UTF-8 in a // way that will hopefully be efficient to traverse // //===----------------------------------------------------------------------===// // FIXME(ABI)#72 : The UTF-8 string view should conform to // `BidirectionalCollection`. // FIXME(ABI)#73 : The UTF-8 string view should have a custom iterator type to // allow performance optimizations of linear traversals. extension _StringCore { /// An integral type that holds a sequence of UTF-8 code units, starting in /// its low byte. internal typealias _UTF8Chunk = UInt64 /// Encode text starting at `i` as UTF-8. Returns a pair whose first /// element is the index of the text following whatever got encoded, /// and the second element contains the encoded UTF-8 starting in its /// low byte. Any unused high bytes in the result will be set to /// 0xFF. func _encodeSomeUTF8(from i: Int) -> (Int, _UTF8Chunk) { _sanityCheck(i <= count) if let asciiBuffer = self.asciiBuffer { // How many UTF-16 code units might we use before we've filled up // our _UTF8Chunk with UTF-8 code units? let utf16Count = Swift.min(MemoryLayout<_UTF8Chunk>.size, asciiBuffer.count - i) var result: _UTF8Chunk = ~0 // Start with all bits set _memcpy( dest: UnsafeMutableRawPointer(Builtin.addressof(&result)), src: asciiBuffer.baseAddress! + i, size: numericCast(utf16Count)) // Convert the _UTF8Chunk into host endianness. return (i + utf16Count, _UTF8Chunk(littleEndian: result)) } else if _fastPath(_baseAddress != nil) { // Transcoding should return a _UTF8Chunk in host endianness. return _encodeSomeContiguousUTF16AsUTF8(from: i) } else { #if _runtime(_ObjC) return _encodeSomeNonContiguousUTF16AsUTF8(from: i) #else _sanityCheckFailure("_encodeSomeUTF8: Unexpected cocoa string") #endif } } /// Helper for `_encodeSomeUTF8`, above. Handles the case where the /// storage is contiguous UTF-16. func _encodeSomeContiguousUTF16AsUTF8(from i: Int) -> (Int, _UTF8Chunk) { _sanityCheck(elementWidth == 2) _sanityCheck(_baseAddress != nil) let storage = UnsafeBufferPointer(start: startUTF16, count: self.count) return _transcodeSomeUTF16AsUTF8(storage, i) } #if _runtime(_ObjC) /// Helper for `_encodeSomeUTF8`, above. Handles the case where the /// storage is non-contiguous UTF-16. func _encodeSomeNonContiguousUTF16AsUTF8(from i: Int) -> (Int, _UTF8Chunk) { _sanityCheck(elementWidth == 2) _sanityCheck(_baseAddress == nil) let storage = _CollectionOf( _startIndex: 0, endIndex: self.count ) { (i: Int) -> UInt16 in return _cocoaStringSubscript(self, i) } return _transcodeSomeUTF16AsUTF8(storage, i) } #endif } extension String { /// A view of a string's contents as a collection of UTF-8 code units. /// /// You can access a string's view of UTF-8 code units by using its `utf8` /// property. A string's UTF-8 view encodes the string's Unicode scalar /// values as 8-bit integers. /// /// let flowers = "Flowers 💐" /// for v in flowers.utf8 { /// print(v) /// } /// // 70 /// // 108 /// // 111 /// // 119 /// // 101 /// // 114 /// // 115 /// // 32 /// // 240 /// // 159 /// // 146 /// // 144 /// /// A string's Unicode scalar values can be up to 21 bits in length. To /// represent those scalar values using 8-bit integers, more than one UTF-8 /// code unit is often required. /// /// let flowermoji = "💐" /// for v in flowermoji.unicodeScalars { /// print(v, v.value) /// } /// // 💐 128144 /// /// for v in flowermoji.utf8 { /// print(v) /// } /// // 240 /// // 159 /// // 146 /// // 144 /// /// In the encoded representation of a Unicode scalar value, each UTF-8 code /// unit after the first is called a *continuation byte*. /// /// UTF8View Elements Match Encoded C Strings /// ========================================= /// /// Swift streamlines interoperation with C string APIs by letting you pass a /// `String` instance to a function as an `Int8` or `UInt8` pointer. When you /// call a C function using a `String`, Swift automatically creates a buffer /// of UTF-8 code units and passes a pointer to that buffer. The code units /// of that buffer match the code units in the string's `utf8` view. /// /// The following example uses the C `strncmp` function to compare the /// beginning of two Swift strings. The `strncmp` function takes two /// `const char*` pointers and an integer specifying the number of characters /// to compare. Because the strings are identical up to the 14th character, /// comparing only those characters results in a return value of `0`. /// /// let s1 = "They call me 'Bell'" /// let s2 = "They call me 'Stacey'" /// /// print(strncmp(s1, s2, 14)) /// // Prints "0" /// print(String(s1.utf8.prefix(14))) /// // Prints "They call me '" /// /// Extending the compared character count to 15 includes the differing /// characters, so a nonzero result is returned. /// /// print(strncmp(s1, s2, 15)) /// // Prints "-17" /// print(String(s1.utf8.prefix(15))) /// // Prints "They call me 'B" public struct UTF8View : Collection, CustomStringConvertible, CustomDebugStringConvertible { internal let _core: _StringCore internal let _startIndex: Index internal let _endIndex: Index init(_ _core: _StringCore) { self._core = _core self._endIndex = Index(_coreIndex: _core.endIndex, Index._emptyBuffer) if _fastPath(_core.count != 0) { let (_, buffer) = _core._encodeSomeUTF8(from: 0) self._startIndex = Index(_coreIndex: 0, buffer) } else { self._startIndex = self._endIndex } } init(_ _core: _StringCore, _ s: Index, _ e: Index) { self._core = _core self._startIndex = s self._endIndex = e } /// A position in a string's `UTF8View` instance. /// /// You can convert between indices of the different string views by using /// conversion initializers and the `samePosition(in:)` method overloads. /// For example, the following code sample finds the index of the first /// space in the string's character view and then converts that to the same /// position in the UTF-8 view. /// /// let hearts = "Hearts <3 ♥︎ 💘" /// if let i = hearts.characters.index(of: " ") { /// let j = i.samePosition(in: hearts.utf8) /// print(Array(hearts.utf8.prefix(upTo: j))) /// print(hearts.utf8.prefix(upTo: j)) /// } /// // Prints "[72, 101, 97, 114, 116, 115]" /// // Prints "Hearts" public struct Index { internal typealias Buffer = _StringCore._UTF8Chunk init(_coreIndex: Int, _ _buffer: Buffer) { self._coreIndex = _coreIndex self._buffer = _buffer } /// True iff the index is at the end of its view or if the next /// byte begins a new UnicodeScalar. internal func _isOnUnicodeScalarBoundary(in core: _StringCore) -> Bool { let buffer = UInt32(truncatingBitPattern: _buffer) let (codePoint, _) = UTF8._decodeOne(buffer) return codePoint != nil || _isEndIndex(of: core) } /// True iff the index is at the end of its view internal func _isEndIndex(of core: _StringCore) -> Bool { return _buffer == Index._emptyBuffer && _coreIndex == core.endIndex } /// The number of UTF-8 code units remaining in the buffer before the /// next unicode scalar value is reached. This simulates calling /// `index(after: i)` until `i._coreIndex` is incremented, but doesn't /// need a `_core` reference. internal var _utf8ContinuationBytesUntilNextUnicodeScalar: Int { var buffer = _buffer var count = 0 while true { let currentUnit = UTF8.CodeUnit(truncatingBitPattern: buffer) if currentUnit & 0b1100_0000 != 0b1000_0000 { break } count += 1 buffer = Index._nextBuffer(after: buffer) } return count } /// The value of the buffer when it is empty internal static var _emptyBuffer: Buffer { return ~0 } /// A Buffer value with the high byte set internal static var _bufferHiByte: Buffer { return 0xFF << numericCast((MemoryLayout.size &- 1) &* 8) } /// Consume a byte of the given buffer: shift out the low byte /// and put FF in the high byte internal static func _nextBuffer(after thisBuffer: Buffer) -> Buffer { return (thisBuffer >> 8) | _bufferHiByte } /// The position of `self`, rounded up to the nearest unicode /// scalar boundary, in the underlying UTF-16. internal let _coreIndex: Int /// If `self` is at the end of its `_core`, has the value `_emptyBuffer`. /// Otherwise, the low byte contains the value of the UTF-8 code unit /// at this position. internal let _buffer: Buffer } public typealias IndexDistance = Int /// The position of the first code unit if the UTF-8 view is /// nonempty. /// /// If the UTF-8 view is empty, `startIndex` is equal to `endIndex`. public var startIndex: Index { return self._startIndex } /// The "past the end" position---that is, the position one /// greater than the last valid subscript argument. /// /// In an empty UTF-8 view, `endIndex` is equal to `startIndex`. public var endIndex: Index { return self._endIndex } /// Returns the next consecutive position after `i`. /// /// - Precondition: The next position is representable. public func index(after i: Index) -> Index { // FIXME: swift-3-indexing-model: range check i? let currentUnit = UTF8.CodeUnit(truncatingBitPattern: i._buffer) let hiNibble = currentUnit >> 4 // Amounts to increment the UTF-16 index based on the high nibble of a // UTF-8 code unit. If the high nibble is: // // - 0b0000-0b0111: U+0000...U+007F: increment the UTF-16 pointer by 1 // - 0b1000-0b1011: UTF-8 continuation byte, do not increment // the UTF-16 pointer // - 0b1100-0b1110: U+0080...U+FFFF: increment the UTF-16 pointer by 1 // - 0b1111: U+10000...U+1FFFFF: increment the UTF-16 pointer by 2 let u16Increments = Int(bitPattern: // 1111 1110 1101 1100 1011 1010 1001 1000 0111 0110 0101 0100 0011 0010 0001 0000 0b10___01___01___01___00___00___00___00___01___01___01___01___01___01___01___01) // Map the high nibble of the current code unit into the // amount by which to increment the UTF-16 index. let increment = (u16Increments >> numericCast(hiNibble << 1)) & 0x3 let nextCoreIndex = i._coreIndex &+ increment let nextBuffer = Index._nextBuffer(after: i._buffer) // If the nextBuffer is nonempty, we have all we need if _fastPath(nextBuffer != Index._emptyBuffer) { return Index(_coreIndex: nextCoreIndex, nextBuffer) } // If the underlying UTF16 isn't exhausted, fill a new buffer else if _fastPath(nextCoreIndex < _core.endIndex) { let (_, freshBuffer) = _core._encodeSomeUTF8(from: nextCoreIndex) return Index(_coreIndex: nextCoreIndex, freshBuffer) } else { // Produce the endIndex _precondition( nextCoreIndex == _core.endIndex, "Can't increment past endIndex of String.UTF8View") return Index(_coreIndex: nextCoreIndex, nextBuffer) } } /// Accesses the code unit at the given position. /// /// The following example uses the subscript to print the value of a /// string's first UTF-8 code unit. /// /// let greeting = "Hello, friend!" /// let i = greeting.utf8.startIndex /// print("First character's UTF-8 code unit: \(greeting.utf8[i])") /// // Prints "First character's UTF-8 code unit: 72" /// /// - Parameter position: A valid index of the view. `position` /// must be less than the view's end index. public subscript(position: Index) -> UTF8.CodeUnit { let result = UTF8.CodeUnit(truncatingBitPattern: position._buffer & 0xFF) _precondition(result != 0xFF, "cannot subscript using endIndex") return result } /// Accesses the contiguous subrange of elements enclosed by the specified /// range. /// /// - Complexity: O(*n*) if the underlying string is bridged from /// Objective-C, where *n* is the length of the string; otherwise, O(1). public subscript(bounds: Range) -> UTF8View { return UTF8View(_core, bounds.lowerBound, bounds.upperBound) } public var description: String { return String._fromCodeUnitSequenceWithRepair(UTF8.self, input: self).0 } public var debugDescription: String { return "UTF8View(\(self.description.debugDescription))" } } /// A UTF-8 encoding of `self`. public var utf8: UTF8View { get { return UTF8View(self._core) } set { self = String(describing: newValue) } } /// A contiguously stored null-terminated UTF-8 representation of the string. /// /// To access the underlying memory, invoke `withUnsafeBufferPointer` on the /// array. /// /// let s = "Hello!" /// let bytes = s.utf8CString /// print(bytes) /// // Prints "[72, 101, 108, 108, 111, 33, 0]" /// /// bytes.withUnsafeBufferPointer { ptr in /// print(strlen(ptr.baseAddress!)) /// } /// // Prints "6" public var utf8CString: ContiguousArray { var result = ContiguousArray() result.reserveCapacity(utf8.count + 1) for c in utf8 { result.append(CChar(bitPattern: c)) } result.append(0) return result } internal func _withUnsafeBufferPointerToUTF8( _ body: (UnsafeBufferPointer) throws -> R ) rethrows -> R { if let asciiBuffer = self._core.asciiBuffer { return try body(UnsafeBufferPointer( start: asciiBuffer.baseAddress, count: asciiBuffer.count)) } var nullTerminatedUTF8 = ContiguousArray() nullTerminatedUTF8.reserveCapacity(utf8.count + 1) nullTerminatedUTF8 += utf8 nullTerminatedUTF8.append(0) return try nullTerminatedUTF8.withUnsafeBufferPointer(body) } /// Creates a string corresponding to the given sequence of UTF-8 code units. /// /// If `utf8` is an ill-formed UTF-8 code sequence, the result is `nil`. /// /// You can use this initializer to create a new string from a slice of /// another string's `utf8` view. /// /// let picnicGuest = "Deserving porcupine" /// if let i = picnicGuest.utf8.index(of: 32) { /// let adjective = String(picnicGuest.utf8.prefix(upTo: i)) /// print(adjective) /// } /// // Prints "Optional(Deserving)" /// /// The `adjective` constant is created by calling this initializer with a /// slice of the `picnicGuest.utf8` view. /// /// - Parameter utf8: A UTF-8 code sequence. public init?(_ utf8: UTF8View) { let wholeString = String(utf8._core) if let start = utf8.startIndex.samePosition(in: wholeString), let end = utf8.endIndex.samePosition(in: wholeString) { self = wholeString[start.. Bool { // If the underlying UTF16 index differs, they're unequal if lhs._coreIndex != rhs._coreIndex { return false } // Match up bytes in the buffer var buffer = (lhs._buffer, rhs._buffer) var isContinuation: Bool while true { let unit = ( UTF8.CodeUnit(truncatingBitPattern: buffer.0), UTF8.CodeUnit(truncatingBitPattern: buffer.1)) isContinuation = UTF8.isContinuation(unit.0) if !isContinuation { // We don't check for unit equality in this case because one of // the units might be an 0xFF read from the end of the buffer. return !UTF8.isContinuation(unit.1) } // Continuation bytes must match exactly else if unit.0 != unit.1 { return false } // Move the buffers along. buffer = ( String.UTF8Index._nextBuffer(after: buffer.0), String.UTF8Index._nextBuffer(after: buffer.1)) } } public static func < ( lhs: String.UTF8View.Index, rhs: String.UTF8View.Index ) -> Bool { if lhs._coreIndex == rhs._coreIndex && lhs._buffer != rhs._buffer { // The index with more continuation bytes remaining before the next return lhs._utf8ContinuationBytesUntilNextUnicodeScalar > rhs._utf8ContinuationBytesUntilNextUnicodeScalar } return lhs._coreIndex < rhs._coreIndex } } // Index conversions extension String.UTF8View.Index { internal init(_ core: _StringCore, _utf16Offset: Int) { let (_, buffer) = core._encodeSomeUTF8(from: _utf16Offset) self.init(_coreIndex: _utf16Offset, buffer) } /// Creates an index in the given UTF-8 view that corresponds exactly to the /// specified `UTF16View` position. /// /// The following example finds the position of a space in a string's `utf16` /// view and then converts that position to an index in the string's /// `utf8` view. /// /// let cafe = "Café 🍵" /// /// let utf16Index = cafe.utf16.index(of: 32)! /// let utf8Index = String.UTF8View.Index(utf16Index, within: cafe.utf8)! /// /// print(Array(cafe.utf8.prefix(upTo: utf8Index))) /// // Prints "[67, 97, 102, 195, 169]" /// /// If the position passed in `utf16Index` doesn't have an exact /// corresponding position in `utf8`, the result of the initializer is /// `nil`. For example, because UTF-8 and UTF-16 represent high Unicode code /// points differently, an attempt to convert the position of the trailing /// surrogate of a UTF-16 surrogate pair fails. /// /// The next example attempts to convert the indices of the two UTF-16 code /// points that represent the teacup emoji (`"🍵"`). The index of the lead /// surrogate is successfully converted to a position in `utf8`, but the /// index of the trailing surrogate is not. /// /// let emojiHigh = cafe.utf16.index(after: utf16Index) /// print(String.UTF8View.Index(emojiHigh, within: cafe.utf8)) /// // Prints "Optional(String.Index(...))" /// /// let emojiLow = cafe.utf16.index(after: emojiHigh) /// print(String.UTF8View.Index(emojiLow, within: cafe.utf8)) /// // Prints "nil" /// /// - Parameters: /// - utf16Index: A position in a `UTF16View` instance. `utf16Index` must /// be an element in `String(utf8).utf16.indices`. /// - utf8: The `UTF8View` in which to find the new position. public init?(_ utf16Index: String.UTF16Index, within utf8: String.UTF8View) { let utf16 = String.UTF16View(utf8._core) if utf16Index != utf16.startIndex && utf16Index != utf16.endIndex { _precondition( utf16Index >= utf16.startIndex && utf16Index <= utf16.endIndex, "Invalid String.UTF16Index for this UTF-8 view") // Detect positions that have no corresponding index. Note that // we have to check before and after, because an unpaired // surrogate will be decoded as a single replacement character, // thus making the corresponding position valid in UTF8. if UTF16.isTrailSurrogate(utf16[utf16Index]) && UTF16.isLeadSurrogate(utf16[utf16.index(before: utf16Index)]) { return nil } } self.init(utf8._core, _utf16Offset: utf16Index._offset) } /// Creates an index in the given UTF-8 view that corresponds exactly to the /// specified `UnicodeScalarView` position. /// /// The following example converts the position of the Unicode scalar `"e"` /// into its corresponding position in the string's `utf8` view. /// /// let cafe = "Cafe\u{0301}" /// let scalarsIndex = cafe.unicodeScalars.index(of: "e")! /// let utf8Index = String.UTF8View.Index(scalarsIndex, within: cafe.utf8) /// /// print(Array(cafe.utf8.prefix(through: utf8Index))) /// // Prints "[67, 97, 102, 101]" /// /// - Parameters: /// - unicodeScalarIndex: A position in a `UnicodeScalarView` instance. /// `unicodeScalarIndex` must be an element of /// `String(utf8).unicodeScalars.indices`. /// - utf8: The `UTF8View` in which to find the new position. public init( _ unicodeScalarIndex: String.UnicodeScalarIndex, within utf8: String.UTF8View ) { self.init(utf8._core, _utf16Offset: unicodeScalarIndex._position) } /// Creates an index in the given UTF-8 view that corresponds exactly to the /// specified string position. /// /// The following example converts the position of the teacup emoji (`"🍵"`) /// into its corresponding position in the string's `utf8` view. /// /// let cafe = "Café 🍵" /// let characterIndex = cafe.characters.index(of: "🍵")! /// let utf8Index = String.UTF8View.Index(characterIndex, within: cafe.utf8) /// /// print(Array(cafe.utf8.suffix(from: utf8Index))) /// // Prints "[240, 159, 141, 181]" /// /// - Parameters: /// - characterIndex: A position in a `CharacterView` instance. /// `characterIndex` must be an element of /// `String(utf8).characters.indices`. /// - utf8: The `UTF8View` in which to find the new position. public init(_ characterIndex: String.Index, within utf8: String.UTF8View) { self.init(utf8._core, _utf16Offset: characterIndex._base._position) } /// Returns the position in the given UTF-16 view that corresponds exactly to /// this index. /// /// The index must be a valid index of `String(utf16).utf8`. /// /// This example first finds the position of a space (UTF-8 code point `32`) /// in a string's `utf8` view and then uses this method to find the same /// position in the string's `utf16` view. /// /// let cafe = "Café 🍵" /// let i = cafe.utf8.index(of: 32)! /// let j = i.samePosition(in: cafe.utf16)! /// print(cafe.utf16.prefix(upTo: j)) /// // Prints "Café" /// /// - Parameter utf16: The view to use for the index conversion. /// - Returns: The position in `utf16` that corresponds exactly to this /// index. If this index does not have an exact corresponding position in /// `utf16`, this method returns `nil`. For example, an attempt to convert /// the position of a UTF-8 continuation byte returns `nil`. public func samePosition( in utf16: String.UTF16View ) -> String.UTF16View.Index? { return String.UTF16View.Index(self, within: utf16) } /// Returns the position in the given view of Unicode scalars that /// corresponds exactly to this index. /// /// This index must be a valid index of `String(unicodeScalars).utf8`. /// /// This example first finds the position of a space (UTF-8 code point `32`) /// in a string's `utf8` view and then uses this method to find the same position /// in the string's `unicodeScalars` view. /// /// let cafe = "Café 🍵" /// let i = cafe.utf8.index(of: 32)! /// let j = i.samePosition(in: cafe.unicodeScalars)! /// print(cafe.unicodeScalars.prefix(upTo: j)) /// // Prints "Café" /// /// - Parameter unicodeScalars: The view to use for the index conversion. /// - Returns: The position in `unicodeScalars` that corresponds exactly to /// this index. If this index does not have an exact corresponding /// position in `unicodeScalars`, this method returns `nil`. For example, /// an attempt to convert the position of a UTF-8 continuation byte /// returns `nil`. public func samePosition( in unicodeScalars: String.UnicodeScalarView ) -> String.UnicodeScalarIndex? { return String.UnicodeScalarIndex(self, within: unicodeScalars) } /// Returns the position in the given string that corresponds exactly to this /// index. /// /// This index must be a valid index of `characters.utf8`. /// /// This example first finds the position of a space (UTF-8 code point `32`) /// in a string's `utf8` view and then uses this method find the same position /// in the string. /// /// let cafe = "Café 🍵" /// let i = cafe.utf8.index(of: 32)! /// let j = i.samePosition(in: cafe)! /// print(cafe[cafe.startIndex ..< j]) /// // Prints "Café" /// /// - Parameter characters: The string to use for the index conversion. /// - Returns: The position in `characters` that corresponds exactly to /// this index. If this index does not have an exact corresponding /// position in `characters`, this method returns `nil`. For example, /// an attempt to convert the position of a UTF-8 continuation byte /// returns `nil`. public func samePosition( in characters: String ) -> String.Index? { return String.Index(self, within: characters) } } // Reflection extension String.UTF8View : CustomReflectable { /// Returns a mirror that reflects the UTF-8 view of a string. public var customMirror: Mirror { return Mirror(self, unlabeledChildren: self) } } extension String.UTF8View : CustomPlaygroundQuickLookable { public var customPlaygroundQuickLook: PlaygroundQuickLook { return .text(description) } } extension String { @available(*, unavailable, message: "Please use String.utf8CString instead.") public var nulTerminatedUTF8: ContiguousArray { Builtin.unreachable() } }