//===--- StringUTF8.swift - A UTF8 view of String -------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2025 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

// FIXME(ABI)#71 : The UTF-16 string view should have a custom iterator type to
// allow performance optimizations of linear traversals.

extension String {
  /// A collection view exposing a string's contents as UTF-8 code units.
  ///
  /// Use a string's `utf8` property to obtain this view. Each Unicode scalar
  /// value in the string is encoded in the view as one or more 8-bit
  /// integers.
  ///
  ///     let flowers = "Flowers 💐"
  ///     for v in flowers.utf8 {
  ///         print(v)
  ///     }
  ///     // 70
  ///     // 108
  ///     // 111
  ///     // 119
  ///     // 101
  ///     // 114
  ///     // 115
  ///     // 32
  ///     // 240
  ///     // 159
  ///     // 146
  ///     // 144
  ///
  /// Unicode scalar values can be as long as 21 bits, so representing one
  /// with 8-bit integers frequently takes more than a single UTF-8 code
  /// unit.
  ///
  ///     let flowermoji = "💐"
  ///     for v in flowermoji.unicodeScalars {
  ///         print(v, v.value)
  ///     }
  ///     // 💐 128144
  ///
  ///     for v in flowermoji.utf8 {
  ///         print(v)
  ///     }
  ///     // 240
  ///     // 159
  ///     // 146
  ///     // 144
  ///
  /// Within the encoding of a single Unicode scalar value, every UTF-8 code
  /// unit that follows the first is known as a *continuation byte*.
  ///
  /// UTF8View Elements Match Encoded C Strings
  /// =========================================
  ///
  /// To make working with C string APIs easier, Swift lets you pass a
  /// `String` instance to a function that expects an `Int8` or `UInt8`
  /// pointer. For such a call, Swift automatically creates a buffer of UTF-8
  /// code units and hands the function a pointer to it. That buffer holds
  /// the same code units as the string's `utf8` view.
  ///
  /// The example below compares the beginnings of two Swift strings with the
  /// C `strncmp` function, which accepts two `const char*` pointers and an
  /// integer giving the number of characters to compare. The strings are
  /// identical up to the 14th character, so comparing only that many
  /// characters yields `0`.
  ///
  ///     let s1 = "They call me 'Bell'"
  ///     let s2 = "They call me 'Stacey'"
  ///
  ///     print(strncmp(s1, s2, 14))
  ///     // Prints "0"
  ///     print(String(s1.utf8.prefix(14))!)
  ///     // Prints "They call me '"
  ///
  /// Raising the compared character count to 15 brings in the characters
  /// that differ, so the result is nonzero.
  ///
  ///     print(strncmp(s1, s2, 15))
  ///     // Prints "-17"
  ///     print(String(s1.utf8.prefix(15))!)
  ///     // Prints "They call me 'B"
  @frozen
  public struct UTF8View: Sendable {
    // The string storage that this view reads from.
    @usableFromInline
    internal var _guts: _StringGuts

    // Wraps `guts` in a view, then runs the (possibly empty) invariant check.
    @inlinable @inline(__always)
    internal init(_ guts: _StringGuts) {
      _guts = guts
      _invariantCheck()
    }
  }
}

extension String.UTF8View {
  // `_invariantCheck()` has an empty, inlinable body unless
  // INTERNAL_CHECKS_ENABLED is set; the checked variant has no checks yet.
#if INTERNAL_CHECKS_ENABLED
  @usableFromInline @inline(never) @_effects(releasenone)
  internal func _invariantCheck() {
    // TODO: Ensure index alignment
  }
#else
  @inlinable @inline(__always)
  internal func _invariantCheck() {}
#endif // INTERNAL_CHECKS_ENABLED
}

extension String.UTF8View: BidirectionalCollection {
  public typealias Element = UTF8.CodeUnit
  public typealias Index = String.Index

  /// The position of the first code unit if the UTF-8 view is
  /// nonempty.
  ///
  /// If the UTF-8 view is empty, `startIndex` is equal to `endIndex`.
@inlinable @inline(__always)
  public var startIndex: Index { _guts.startIndex }

  /// The view's "past the end" position: the position one greater than the
  /// last valid subscript argument.
  ///
  /// For an empty UTF-8 view, `endIndex` equals `startIndex`.
  @inlinable @inline(__always)
  public var endIndex: Index { _guts.endIndex }

  /// Returns the position immediately following `i`.
  ///
  /// - Precondition: The next position is representable.
  @inlinable @inline(__always)
  public func index(after i: Index) -> Index {
    let position = _guts.ensureMatchingEncoding(i)
    guard _fastPath(_guts.isFastUTF8) else {
      _precondition(position._encodedOffset < _guts.count,
        "String index is out of bounds")
      return _foreignIndex(after: position)
    }
    // Note: deferred bounds check
    return position.strippingTranscoding.nextEncoded._knownUTF8
  }

  /// Returns the position immediately preceding `i`.
  ///
  /// Traps with "String index is out of bounds" when `i` is the zero
  /// position.
  @inlinable @inline(__always)
  public func index(before i: Index) -> Index {
    let position = _guts.ensureMatchingEncoding(i)
    _precondition(!position.isZeroPosition, "String index is out of bounds")
    guard _fastPath(_guts.isFastUTF8) else {
      _precondition(position._encodedOffset <= _guts.count,
        "String index is out of bounds")
      return _foreignIndex(before: position)
    }
    return position.strippingTranscoding.priorEncoded._knownUTF8
  }

  /// Returns the position `n` code units away from `i`; a negative `n` moves
  /// backward.
  ///
  /// On the fast UTF-8 path the result is bounds-checked against
  /// `0..._guts.count`; other strings are handled by `_foreignIndex`.
  @inlinable @inline(__always)
  public func index(_ i: Index, offsetBy n: Int) -> Index {
    let origin = _guts.ensureMatchingEncoding(i)
    guard _fastPath(_guts.isFastUTF8) else {
      return _foreignIndex(origin, offsetBy: n)
    }
    let target = n + origin._encodedOffset
    _precondition(target >= 0 && target <= _guts.count,
      "String index is out of bounds")
    return Index(_encodedOffset: target)._knownUTF8
  }

  /// Returns the position `n` code units away from `i`, or `nil` if getting
  /// there would move past `limit`.
  ///
  /// `limit` has no effect when it lies on the opposite side of `i` from the
  /// direction of travel.
  @inlinable @inline(__always)
  public func index(
    _ i: Index, offsetBy n: Int, limitedBy limit: Index
  ) -> Index? {
    let origin = _guts.ensureMatchingEncoding(i)
    guard _fastPath(_guts.isFastUTF8) else {
      return _foreignIndex(origin, offsetBy: n, limitedBy: limit)
    }

    let start = origin._encodedOffset
    let target = start + n
    // NOTE(review): `limit` is used as-is here, without the
    // `ensureMatchingEncoding` adjustment applied to `i` — confirm intended.
    let bound = limit._encodedOffset

    // The limit only counts when it sits at or beyond `i` in the direction of
    // travel; in that case the result must not overshoot it.
    let overshootsLimit: Bool
    if n >= 0 {
      overshootsLimit = bound >= start && target > bound
    } else {
      overshootsLimit = bound <= start && target < bound
    }
    if overshootsLimit { return nil }

    _precondition(target >= 0 && target <= _guts.count,
      "String index is out of bounds")
    return Index(_encodedOffset: target)._knownUTF8
  }

  /// Returns the number of code units between `i` and `j`; the result is
  /// negative when `j` precedes `i`.
  @inlinable @inline(__always)
  public func distance(from i: Index, to j: Index) -> Int {
    let start = _guts.ensureMatchingEncoding(i)
    let end = _guts.ensureMatchingEncoding(j)
    guard _fastPath(_guts.isFastUTF8) else {
      _precondition(
        start._encodedOffset <= _guts.count
          && end._encodedOffset <= _guts.count,
        "String index is out of bounds")
      return _foreignDistance(from: start, to: end)
    }
    // Fast UTF-8: the distance is simply the difference of encoded offsets.
    return end._encodedOffset &- start._encodedOffset
  }

  /// Accesses the code unit at the given position.
  ///
  /// The following example uses the subscript to print the value of a
  /// string's first UTF-8 code unit.
  ///
  ///     let greeting = "Hello, friend!"
  ///     let i = greeting.utf8.startIndex
  ///     print("First character's UTF-8 code unit: \(greeting.utf8[i])")
  ///     // Prints "First character's UTF-8 code unit: 72"
  ///
  /// - Parameter i: A valid index of the view. `i` must be less than the
  ///   view's end index.
@inlinable @inline(__always)
  public subscript(i: Index) -> UTF8.CodeUnit {
    let i = _guts.ensureMatchingEncoding(i)
    // Bounds-check here so the unchecked subscript below can skip it.
    _precondition(i._encodedOffset < _guts.count,
      "String index is out of bounds")
    return self[_unchecked: i]
  }

  /// Accesses the code unit at `i` without the bounds precondition that the
  /// checked subscript performs.
  ///
  /// Fast UTF-8 strings are read directly at `i`'s encoded offset; any other
  /// string is routed through `_foreignSubscript(position:)`.
  @_alwaysEmitIntoClient @inline(__always)
  internal subscript(_unchecked i: Index) -> UTF8.CodeUnit {
    if _fastPath(_guts.isFastUTF8) {
      return unsafe _guts.withFastUTF8 { utf8 in
        unsafe utf8[_unchecked: i._encodedOffset]
      }
    }

    return _foreignSubscript(position: i)
  }
}

extension String.UTF8View: CustomStringConvertible {
  /// The string built from the same storage as this view.
  @inlinable @inline(__always)
  public var description: String { return String(_guts) }
}

extension String.UTF8View: CustomDebugStringConvertible {
  /// The debug description of `description`, wrapped in `UTF8View(...)`.
  public var debugDescription: String {
    return "UTF8View(\(self.description.debugDescription))"
  }
}

extension String {
  /// A UTF-8 encoding of `self`.
  ///
  /// Assigning a new view replaces `self` with a string built from that
  /// view's storage.
  @inlinable
  public var utf8: UTF8View {
    @inline(__always) get { return UTF8View(self._guts) }
    set { self = String(newValue._guts) }
  }

  /// A contiguously stored null-terminated UTF-8 representation of the string.
  ///
  /// To access the underlying memory, invoke `withUnsafeBufferPointer` on the
  /// array.
  ///
  ///     let s = "Hello!"
  ///     let bytes = s.utf8CString
  ///     print(bytes)
  ///     // Prints "[72, 101, 108, 108, 111, 33, 0]"
  ///
  ///     bytes.withUnsafeBufferPointer { ptr in
  ///         print(strlen(ptr.baseAddress!))
  ///     }
  ///     // Prints "6"
  public var utf8CString: ContiguousArray<CChar> {
    @_effects(readonly) @_semantics("string.getUTF8CString")
    get {
      if _fastPath(_guts.isFastUTF8) {
        // Copy the code units in one shot, then add the terminator.
        var result = unsafe _guts.withFastCChar { unsafe ContiguousArray($0) }
        result.append(0)
        return result
      }

      return _slowUTF8CString()
    }
  }

  /// Builds the null-terminated representation one code unit at a time by
  /// iterating `utf8`; used when the string is not fast UTF-8.
  @usableFromInline @inline(never) // slow-path
  internal func _slowUTF8CString() -> ContiguousArray<CChar> {
    // The element type has to be spelled out: it cannot be inferred from the
    // `append` calls that follow.
    var result = ContiguousArray<CChar>()
    result.reserveCapacity(self._guts.count + 1)
    for c in self.utf8 {
      result.append(CChar(bitPattern: c))
    }
    result.append(0)
    return result
  }

  /// Creates a string corresponding to the given sequence of UTF-8 code units.
@available(swift, introduced: 4.0, message: "Please use failable String.init?(_:UTF8View) when in Swift 3.2 mode")
  @inlinable @inline(__always)
  public init(_ utf8: UTF8View) {
    self = String(utf8._guts)
  }
}

extension String.UTF8View {
  /// The number of UTF-8 code units in the view.
  ///
  /// Fast UTF-8 strings return the stored count directly; any other string
  /// goes through `_foreignCount()`.
  @inlinable @inline(__always)
  public var count: Int {
    if _fastPath(_guts.isFastUTF8) {
      return _guts.count
    }
    return _foreignCount()
  }
}

@available(SwiftStdlib 6.2, *)
extension String.UTF8View {

  /// Returns a span over the view's UTF-8 code units whose lifetime is tied
  /// to a borrow of `self`.
  ///
  /// Three storage cases are handled, in this order:
  /// - (Objective-C runtimes only) a string that is not fast UTF-8 but has a
  ///   bridgeable Objective-C object: the span covers the buffer returned by
  ///   `_getOrAllocateAssociatedStorage()`.
  /// - a small string: the span starts at the address of the borrowed `self`.
  /// - otherwise the string must be fast UTF-8 (this is a precondition) and
  ///   the span covers `_guts._object.fastUTF8`.
  @lifetime(borrow self)
  internal borrowing func _underlyingSpan() -> Span<UTF8.CodeUnit> {
#if _runtime(_ObjC)
    // handle non-UTF8 Objective-C bridging cases here
    if !_guts.isFastUTF8, _guts._object.hasObjCBridgeableObject {
      let storage = _guts._getOrAllocateAssociatedStorage()
      let (start, count) = unsafe (storage.start, storage.count)
      let span = unsafe Span(_unsafeStart: start, count: count)
      return unsafe _overrideLifetime(span, borrowing: self)
    }
#endif // _runtime(_ObjC)
    let count = _guts.count
    if _guts.isSmall {
      // `addressOfBorrow` yields an untyped builtin pointer, so the pointee
      // type must be given explicitly.
      let a = Builtin.addressOfBorrow(self)
      let address = unsafe UnsafePointer<UTF8.CodeUnit>(a)
      let span = unsafe Span(_unsafeStart: address, count: count)
      return unsafe _overrideLifetime(span, borrowing: self)
    }
    let isFastUTF8 = _guts.isFastUTF8
    _precondition(isFastUTF8, "String must be contiguous UTF8")
    let buffer = unsafe _guts._object.fastUTF8
    let span = unsafe Span(_unsafeElements: buffer)
    return unsafe _overrideLifetime(span, borrowing: self)
  }

#if !(os(watchOS) && _pointerBitWidth(_32))
  /// A span over the UTF8 code units that make up this string.
  ///
  /// - Note: In the case of bridged UTF16 String instances (on Apple
  ///   platforms,) this property transcodes the code units the first time
  ///   it is called. The transcoded buffer is cached, and subsequent calls
  ///   to `span` can reuse the buffer.
  ///
  /// - Returns: a `Span` over the UTF8 code units of this String.
  ///
  /// - Complexity: O(1) for native UTF8 Strings,
  ///   amortized O(1) for bridged UTF16 Strings.
@available(SwiftStdlib 6.2, *)
  public var span: Span<UTF8.CodeUnit> {
    @lifetime(borrow self)
    borrowing get {
      _underlyingSpan()
    }
  }

  /// A span over the UTF8 code units that make up this string, wrapped in an
  /// optional.
  ///
  /// In this configuration the getter forwards to `span`, so the result is
  /// never `nil`.
  ///
  /// - Note: In the case of bridged UTF16 String instances (on Apple
  ///   platforms,) this property transcodes the code units the first time
  ///   it is called. The transcoded buffer is cached, and subsequent calls
  ///   to `span` can reuse the buffer.
  ///
  /// - Returns: a `Span` over the UTF8 code units of this String.
  ///
  /// - Complexity: O(1) for native UTF8 Strings,
  ///   amortized O(1) for bridged UTF16 Strings.
  @available(SwiftStdlib 6.2, *)
  public var _span: Span<UTF8.CodeUnit>? {
    @_alwaysEmitIntoClient @inline(__always)
    @lifetime(borrow self)
    borrowing get {
      span
    }
  }
#else // !(os(watchOS) && _pointerBitWidth(_32))
  /// Unavailable on 32-bit watchOS; the getter unconditionally stops
  /// execution with `fatalError`. Use `_span` instead.
  @available(watchOS, unavailable)
  public var span: Span<UTF8.CodeUnit> {
    @lifetime(borrow self)
    borrowing get {
      fatalError("\(#function) unavailable on 32-bit watchOS")
    }
  }

  /// A span over the UTF8 code units that make up this string.
  ///
  /// - Note: In the case of bridged UTF16 String instances (on Apple
  ///   platforms,) this property transcodes the code units the first time
  ///   it is called. The transcoded buffer is cached, and subsequent calls
  ///   to `span` can reuse the buffer.
  ///
  /// - Returns: a `Span` over the UTF8 code units of this String, or `nil`
  ///   if the String does not have a contiguous representation.
  ///
  /// - Complexity: O(1) for native UTF8 Strings,
  ///   amortized O(1) for bridged UTF16 Strings.
  @available(SwiftStdlib 6.2, *)
  public var _span: Span<UTF8.CodeUnit>? {
    @lifetime(borrow self)
    borrowing get {
      // A small string with more code units than the contiguous capacity has
      // no contiguous representation to hand out.
      if _guts.isSmall,
         _guts.count > _SmallString.contiguousCapacity() {
        return nil
      }
      return _underlyingSpan()
    }
  }
#endif // !(os(watchOS) && _pointerBitWidth(_32))
}

// Index conversions
extension String.UTF8View.Index {
  /// Creates an index in the given UTF-8 view that corresponds exactly to the
  /// specified `UTF16View` position.
/// /// The following example finds the position of a space in a string's `utf16` /// view and then converts that position to an index in the string's /// `utf8` view. /// /// let cafe = "Café 🍵" /// /// let utf16Index = cafe.utf16.firstIndex(of: 32)! /// let utf8Index = String.UTF8View.Index(utf16Index, within: cafe.utf8)! /// /// print(Array(cafe.utf8[..) -> String.UTF8View.SubSequence { let r = _guts.validateSubscalarRange(r) return Substring.UTF8View(self, _bounds: r) } } extension String.UTF8View { /// Copies `self` into the supplied buffer. /// /// - Precondition: The memory in `self` is uninitialized. The buffer must /// contain sufficient uninitialized memory to accommodate /// `source.underestimatedCount`. /// /// - Postcondition: The `Pointee`s at `buffer[startIndex.. ) -> (Iterator, UnsafeMutableBufferPointer.Index) { guard unsafe buffer.baseAddress != nil else { _preconditionFailure( "Attempt to copy string contents into nil buffer pointer") } guard let written = unsafe _guts.copyUTF8(into: buffer) else { _preconditionFailure( "Insufficient space allocated to copy string contents") } let it = String().utf8.makeIterator() return (it, unsafe buffer.index(buffer.startIndex, offsetBy: written)) } } // Foreign string support extension String.UTF8View { // Align a foreign UTF-16 index to a valid UTF-8 position. If there is a // transcoded offset already, this is already a valid UTF-8 position // (referring to a continuation byte) and returns `idx`. Otherwise, this will // scalar-align the index. This is needed because we may be passed a // non-scalar-aligned foreign index from the UTF16View. 
@_alwaysEmitIntoClient @inline(__always)
  internal func _utf8AlignForeignIndex(_ idx: String.Index) -> String.Index {
    _internalInvariant(_guts.isForeign)
    // A nonzero transcoded offset already names a specific UTF-8 code unit.
    guard idx.transcodedOffset == 0 else { return idx }
    return _guts.scalarAlign(idx)
  }

  /// Returns the UTF-8 position after `idx` in a foreign string.
  ///
  /// The index steps through the UTF-8 code units of the scalar at `idx` by
  /// bumping the transcoded offset, and moves on to the next scalar once the
  /// last code unit of the current one has been passed.
  @usableFromInline @inline(never)
  @_effects(releasenone)
  internal func _foreignIndex(after idx: Index) -> Index {
    _internalInvariant(_guts.isForeign)
    _internalInvariant(idx._encodedOffset < _guts.count)

    let idx = _utf8AlignForeignIndex(idx)

    let (scalar, scalarLen) = _guts.foreignErrorCorrectedScalar(
      startingAt: idx.strippingTranscoding)
    let utf8Len = UTF8.width(scalar)

    if utf8Len == 1 {
      _internalInvariant(idx.transcodedOffset == 0)
      return idx.nextEncoded._scalarAligned._knownUTF16
    }

    // Check if we're still transcoding sub-scalar
    if idx.transcodedOffset < utf8Len - 1 {
      return idx.nextTranscoded._knownUTF16
    }

    // Skip to the next scalar
    _internalInvariant(idx.transcodedOffset == utf8Len - 1)
    return idx.encoded(offsetBy: scalarLen)._scalarAligned._knownUTF16
  }

  /// Returns the UTF-8 position before `idx` in a foreign string.
  ///
  /// Inside a scalar this decrements the transcoded offset; at a scalar
  /// boundary it moves to the last UTF-8 code unit of the preceding scalar.
  @usableFromInline @inline(never)
  @_effects(releasenone)
  internal func _foreignIndex(before idx: Index) -> Index {
    _internalInvariant(_guts.isForeign)
    _internalInvariant(idx._encodedOffset <= _guts.count)

    let idx = _utf8AlignForeignIndex(idx)

    if idx.transcodedOffset != 0 {
      _internalInvariant((1...3) ~= idx.transcodedOffset)
      return idx.priorTranscoded._knownUTF16
    }

    let (scalar, scalarLen) = _guts.foreignErrorCorrectedScalar(
      endingAt: idx.strippingTranscoding)
    let utf8Len = UTF8.width(scalar)
    return idx.encoded(
      offsetBy: -scalarLen
    ).transcoded(withOffset: utf8Len &- 1)._knownUTF16
  }

  /// Returns the UTF-8 code unit at `idx` in a foreign string.
  ///
  /// The scalar starting at `idx` is decoded with error correction and
  /// re-encoded as UTF-8; `idx.transcodedOffset` selects the code unit within
  /// that encoding.
  @usableFromInline @inline(never)
  @_effects(releasenone)
  internal func _foreignSubscript(position idx: Index) -> UTF8.CodeUnit {
    _internalInvariant(_guts.isForeign)

    let idx = _utf8AlignForeignIndex(idx)

    let scalar = _guts.foreignErrorCorrectedScalar(
      startingAt: idx.strippingTranscoding).0
    let encoded = Unicode.UTF8.encode(scalar)._unsafelyUnwrappedUnchecked

    // The offset must name an existing code unit. An offset equal to
    // `encoded.count` would subscript the buffer at its end index.
    _internalInvariant(idx.transcodedOffset < encoded.count)
    return encoded[
      encoded.index(encoded.startIndex, offsetBy: idx.transcodedOffset)]
  }

  /// Offsets `i` by `n` in a foreign string by forwarding to `_index`.
  @usableFromInline @inline(never)
  @_effects(releasenone)
  internal func _foreignIndex(_ i: Index, offsetBy n: Int) -> Index {
    _internalInvariant(_guts.isForeign)
    return _index(i, offsetBy: n)
  }

  /// Offsets `i` by `n`, limited by `limit`, in a foreign string by
  /// forwarding to `_index`.
  @usableFromInline @inline(never)
  @_effects(releasenone)
  internal func _foreignIndex(
    _ i: Index, offsetBy n: Int, limitedBy limit: Index
  ) -> Index? {
    _internalInvariant(_guts.isForeign)
    return _index(i, offsetBy: n, limitedBy: limit)
  }

  /// Returns the number of UTF-8 code units between `i` and `j` in a foreign
  /// string.
  ///
  /// On Objective-C runtimes the count is first requested from
  /// `_cocoaStringUTF8Count`; if that yields no value, or on other runtimes,
  /// the result comes from `_distance(from:to:)`.
  @usableFromInline @inline(never)
  @_effects(releasenone)
  internal func _foreignDistance(from i: Index, to j: Index) -> Int {
    _internalInvariant(_guts.isForeign)

    let i = _utf8AlignForeignIndex(i)
    let j = _utf8AlignForeignIndex(j)

#if _runtime(_ObjC)
    // Currently, foreign means NSString
    let count = _guts._object.withCocoaObject {
      _cocoaStringUTF8Count($0, range: i._encodedOffset ..< j._encodedOffset)
    }
    if let count {
      // _cocoaStringUTF8Count gave us the scalar aligned count, but we still
      // need to compensate for sub-scalar indexing, e.g. if `i` is in the
      // middle of a two-byte UTF8 scalar.
      let refinedCount = (count - i.transcodedOffset) + j.transcodedOffset
      _internalInvariant(refinedCount == _distance(from: i, to: j))
      return refinedCount
    }
#endif

    return _distance(from: i, to: j)
  }

  /// Returns the number of UTF-8 code units in a foreign string, computed as
  /// the distance from `startIndex` to `endIndex`.
  @usableFromInline @inline(never)
  @_effects(releasenone)
  internal func _foreignCount() -> Int {
    _internalInvariant(_guts.isForeign)
    return _foreignDistance(from: startIndex, to: endIndex)
  }
}

extension String.Index {
  /// Returns whether this index is already a valid UTF-8 position in the
  /// foreign view `target`, i.e. whether aligning it leaves it unchanged.
  @usableFromInline @inline(never) // opaque slow-path
  @_effects(releasenone)
  internal func _foreignIsWithin(_ target: String.UTF8View) -> Bool {
    _internalInvariant(target._guts.isForeign)
    return self == target._utf8AlignForeignIndex(self)
  }
}

extension String.UTF8View {
  /// Calls `body` with a buffer over the view's contiguous UTF-8 storage, if
  /// the string has any.
  ///
  /// - Parameter body: A closure that receives the code units as an
  ///   `UnsafeBufferPointer`.
  /// - Returns: The value returned by `body`, or `nil` when the string is not
  ///   fast UTF-8, in which case `body` is not called.
  /// - Throws: Rethrows whatever `body` throws.
  @inlinable
  public func withContiguousStorageIfAvailable<R>(
    _ body: (UnsafeBufferPointer<Element>) throws -> R
  ) rethrows -> R? {
    guard _guts.isFastUTF8 else { return nil }
    return unsafe try _guts.withFastUTF8(body)
  }
}