//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2023 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//

// String Creation Helpers
//===----------------------------------------------------------------------===//

/// Returns `true` if every byte of `input` has its most significant bit clear
/// (i.e. is less than 0x80); an empty buffer yields `true`.
///
/// Three strategies are used depending on `input.count`:
/// - fewer bytes than one `Word`: byte-by-byte check;
/// - at least one `Word` but fewer bytes than one `Block`: word-by-word, with
///   a possibly overlapping final word;
/// - otherwise: block-by-block, with a possibly overlapping final block.
internal func _allASCII(_ input: UnsafeBufferPointer<UInt8>) -> Bool {
  //--------------- Implementation building blocks ---------------------------//
#if arch(arm64_32)
  typealias Word = UInt64
#else
  typealias Word = UInt
#endif
  // High bit of every byte in a word; truncated if `Word` is narrower than
  // 64 bits.
  let mask = Word(truncatingIfNeeded: 0x80808080_80808080 as UInt64)

#if (arch(i386) || arch(x86_64)) && SWIFT_STDLIB_ENABLE_VECTOR_TYPES
  // TODO: Should consider AVX2 / AVX512 / AVX10 path here
  typealias Block = (SIMD16<UInt8>, SIMD16<UInt8>)
  // Collects the result of a signed "less than zero" comparison of each lane
  // into a 16-bit mask, so the result is zero iff no lane has its high bit set.
  @_transparent func pmovmskb(_ vec: SIMD16<UInt8>) -> UInt16 {
    UInt16(Builtin.bitcast_Vec16xInt1_Int16(
      Builtin.cmp_slt_Vec16xInt8(vec._storage._value, Builtin.zeroInitializer())
    ))
  }
#elseif (arch(arm64) || arch(arm64_32)) && SWIFT_STDLIB_ENABLE_VECTOR_TYPES
  typealias Block = (SIMD16<UInt8>, SIMD16<UInt8>)
  // Unsigned maximum across all 16 lanes.
  @_transparent func umaxv(_ vec: SIMD16<UInt8>) -> UInt8 {
    UInt8(Builtin.int_vector_reduce_umax_Vec16xInt8(vec._storage._value))
  }
#else
  typealias Block = (Word, Word, Word, Word)
#endif

  // Checks the `Word`-sized group of bytes starting at `pointer`.
  @_transparent
  func allASCII(wordAt pointer: UnsafePointer<UInt8>) -> Bool {
    let word = unsafe UnsafeRawPointer(pointer).loadUnaligned(as: Word.self)
    return word & mask == 0
  }

  // Checks the `Block`-sized group of bytes starting at `pointer`.
  @_transparent
  func allASCII(blockAt pointer: UnsafePointer<UInt8>) -> Bool {
    let block = unsafe UnsafeRawPointer(pointer).loadUnaligned(as: Block.self)
#if (arch(i386) || arch(x86_64)) && SWIFT_STDLIB_ENABLE_VECTOR_TYPES
    return pmovmskb(block.0 | block.1) == 0
#elseif (arch(arm64) || arch(arm64_32)) && SWIFT_STDLIB_ENABLE_VECTOR_TYPES
    return umaxv(block.0 | block.1) < 0x80
#else
    return (block.0 | block.1 | block.2 | block.3) & mask == 0
#endif
  }

  //----------------------- Implementation proper ----------------------------//
  guard input.count >= MemoryLayout<Word>.size else {
    // They gave us a region of memory
    // whose size is as modest as it can be.
    // We'll check every byte
    // for the bit of most height
    // and return if we happen on any
    //
    // I'm sorry, I'm sorry, I'm trying to delete it. (This chunk of code, not
    // the Limerick. I would wager that--at least for Strings--we could
    // unconditionally load 16B here,¹ because of the small string encoding,
    // and check them all at once, which would be much more efficient. That
    // probably has to happen by lifting this check into the SmallString
    // initializer directly, though.)
    //
    // ¹ well, most of the time, which makes it a rather conditional
    //   "unconditionally".
    return unsafe input.allSatisfy { $0 < 0x80 }
  }

  // input.count is non-zero, so we can unconditionally unwrap baseAddress.
  let base = unsafe input.baseAddress._unsafelyUnwrappedUnchecked
  let n = input.count
  var i = 0

  guard n >= MemoryLayout<Block>.size else {
    // The size isn't yet to a block
    // word-by-word we are forced to walk.
    // So as to not leave a gap
    // the last word may lap
    // the word that we already chalked.
    //
    //    0      k      2k     3k    ?k  n-k    n-1
    //    |      |      |      |     |   |      |
    //    +------+------+------+     +------+   |
    //    | word | word | word | ... | word |   |
    //    +------+------+------+     +------+   v
    //                                   +------+
    // possibly overlapping final word > | word |
    //                                   +------+
    //
    // This means that we check any bytes in the overlap region twice, but
    // that's much preferable to using smaller accesses to avoid rechecking,
    // because the entire last word is about as expensive as checking just
    // one byte would be, and on average there's more than one byte remaining.
    //
    // Note that we don't bother trying to align any of these accesses, because
    // there is minimal benefit to doing so on "modern" OoO cores, which can
    // handle cacheline-crossing loads at full speed. If the string happens to
    // be aligned, they'll be aligned, if not, they won't be. It will likely
    // make sense to add a path that does align everything for more limited
    // embedded CPUs, though.
    let k = MemoryLayout<Word>.size
    let last = n &- k
    while i < last {
      guard unsafe allASCII(wordAt: base + i) else { return false }
      i &+= k
    }
    return unsafe allASCII(wordAt: base + last)
  }

  // check block-by-block, with a possibly overlapping last block to avoid
  // sub-block cleanup. We should be able to avoid manual index arithmetic
  // and write this loop and the one above something like the following:
  //
  //   return stride(from: 0, to: last, by: k).allSatisfy {
  //     allASCII(blockAt: base + $0)
  //   } && allASCII(blockAt: base + last)
  //
  // but LLVM leaves one unnecessary conditional operation in the loop
  // when we do that, so we write them out as while loops instead for now.
  let k = MemoryLayout<Block>.size
  let last = n &- k
  while i < last {
    guard unsafe allASCII(blockAt: base + i) else { return false }
    i &+= k
  }
  return unsafe allASCII(blockAt: base + last)
}

extension String {

  /// Creates a string from `input` without checking that the bytes are ASCII.
  ///
  /// A small-string representation is used when `_SmallString(input)`
  /// succeeds; otherwise new `__StringStorage` is created from the bytes with
  /// the ASCII flag set.
  internal static func _uncheckedFromASCII(
    _ input: UnsafeBufferPointer<UInt8>
  ) -> String {
    if let smol = unsafe _SmallString(input) {
      return String(_StringGuts(smol))
    }

    let storage = unsafe __StringStorage.create(initializingFrom: input, isASCII: true)
    return storage.asString
  }

  /// Creates a string from bytes the caller asserts are ASCII.
  ///
  /// The assertion is checked only through `_internalInvariant`.
  @usableFromInline
  internal static func _fromASCII(
    _ input: UnsafeBufferPointer<UInt8>
  ) -> String {
    unsafe _internalInvariant(_allASCII(input), "not actually ASCII")
    return unsafe _uncheckedFromASCII(input)
  }

  /// Creates a string from `input` if every byte is ASCII; otherwise returns
  /// `nil`.
  internal static func _fromASCIIValidating(
    _ input: UnsafeBufferPointer<UInt8>
  ) -> String? {
    if unsafe _fastPath(_allASCII(input)) {
      return unsafe _uncheckedFromASCII(input)
    }
    return nil
  }

  /// Creates a string from `input` if `validateUTF8` reports success;
  /// otherwise returns `nil`.
  public // SPI(Foundation)
  static func _tryFromUTF8(_ input: UnsafeBufferPointer<UInt8>) -> String? {
    guard case .success(let extraInfo) = unsafe validateUTF8(input) else {
      return nil
    }

    return unsafe String._uncheckedFromUTF8(input, isASCII: extraInfo.isASCII)
  }

  /// Creates a string from `input`, calling `repairUTF8` if validation fails.
  ///
  /// - Returns: The string, and `true` for `repairsMade` iff validation
  ///   reported an error and the repair path was taken.
  @usableFromInline
  internal static func _fromUTF8Repairing(
    _ input: UnsafeBufferPointer<UInt8>
  ) -> (result: String, repairsMade: Bool) {
    switch unsafe validateUTF8(input) {
    case .success(let extraInfo):
      return unsafe (String._uncheckedFromUTF8(
        input, asciiPreScanResult: extraInfo.isASCII
      ), false)
    case .error(_, let initialRange):
      return unsafe (repairUTF8(input, firstKnownBrokenRange: initialRange), true)
    }
  }

  /// Creates string storage with the given code unit capacity, lets
  /// `initializer` fill it, then validates the written code units as UTF-8.
  ///
  /// On successful validation the storage itself backs the returned string
  /// (with its ASCII flag updated); on a validation error a repaired string is
  /// produced from the storage's code units instead.
  ///
  /// - Parameters:
  ///   - capacity: The uninitialized code unit capacity to request.
  ///   - initializer: A closure that writes into the buffer and returns an
  ///     `Int`; it is forwarded unchanged to `__StringStorage.create`.
  internal static func _fromLargeUTF8Repairing(
    uninitializedCapacity capacity: Int,
    initializingWith initializer: (
      _ buffer: UnsafeMutableBufferPointer<UInt8>
    ) throws -> Int
  ) rethrows -> String {
    let result = try unsafe __StringStorage.create(
      uninitializedCodeUnitCapacity: capacity,
      initializingUncheckedUTF8With: initializer)

    switch unsafe validateUTF8(result.codeUnits) {
    case .success(let info):
      result._updateCountAndFlags(
        newCount: result.count,
        newIsASCII: info.isASCII
      )
      return result.asString
    case .error(_, let initialRange):
      // Keep `result` alive while its code units are being read.
      defer { _fixLifetime(result) }
      //This could be optimized to use excess tail capacity
      return unsafe repairUTF8(result.codeUnits, firstKnownBrokenRange: initialRange)
    }
  }

  /// Creates a string from `input` without validating it, scanning the bytes
  /// with `_allASCII` to determine the ASCII flag.
  @usableFromInline
  internal static func _uncheckedFromUTF8(
    _ input: UnsafeBufferPointer<UInt8>
  ) -> String {
    return unsafe _uncheckedFromUTF8(input, isASCII: _allASCII(input))
  }

  /// Creates a string from `input` without validating it, using the supplied
  /// `isASCII` flag when large storage has to be created.
  ///
  /// A small-string representation is used when `_SmallString(input)`
  /// succeeds.
  @usableFromInline
  internal static func _uncheckedFromUTF8(
    _ input: UnsafeBufferPointer<UInt8>,
    isASCII: Bool
  ) -> String {
    if let smol = unsafe _SmallString(input) {
      return String(_StringGuts(smol))
    }

    let storage = unsafe __StringStorage.create(
      initializingFrom: input, isASCII: isASCII)
    return storage.asString
  }

  // If we've already pre-scanned for ASCII, just supply the result
  //
  // This entry point is kept as a separate symbol; it simply forwards to the
  // `isASCII:` variant so the two cannot drift apart.
  @usableFromInline
  internal static func _uncheckedFromUTF8(
    _ input: UnsafeBufferPointer<UInt8>, asciiPreScanResult: Bool
  ) -> String {
    return unsafe _uncheckedFromUTF8(input, isASCII: asciiPreScanResult)
  }

  /// Creates a string by transcoding UTF-16 `input` to UTF-8 through an
  /// intermediate array.
  ///
  /// The transcoder is run with `stoppingOnError: false`; that no repair was
  /// needed is checked only through `_internalInvariant`.
  @usableFromInline
  internal static func _uncheckedFromUTF16(
    _ input: UnsafeBufferPointer<UInt16>
  ) -> String {
    // TODO(String Performance): Attempt to form smol strings

    // TODO(String performance): Skip intermediary array, transcode directly
    // into a StringStorage space.
    var contents: [UInt8] = []
    contents.reserveCapacity(input.count)
    let repaired = unsafe transcode(
      input.makeIterator(),
      from: UTF16.self,
      to: UTF8.self,
      stoppingOnError: false,
      into: { contents.append($0) })
    _internalInvariant(!repaired, "Error present")

    return unsafe contents.withUnsafeBufferPointer {
      unsafe String._uncheckedFromUTF8($0)
    }
  }

  /// Generic fallback: transcodes `input` from `encoding` to UTF-8 through an
  /// intermediate array and builds a string from the result.
  ///
  /// - Returns: `nil` when `repair` is `false` and the transcoder reported
  ///   that repairs were made; otherwise the string and the repair flag.
  @inline(never) // slow path
  private static func _slowFromCodeUnits<
    Input: Collection,
    Encoding: Unicode.Encoding
  >(
    _ input: Input,
    encoding: Encoding.Type,
    repair: Bool
  ) -> (String, repairsMade: Bool)?
  where Input.Element == Encoding.CodeUnit {
    // TODO(String Performance): Attempt to form smol strings

    // TODO(String performance): Skip intermediary array, transcode directly
    // into a StringStorage space.
    var contents: [UInt8] = []
    contents.reserveCapacity(input.underestimatedCount)
    let repaired = transcode(
      input.makeIterator(),
      from: Encoding.self,
      to: UTF8.self,
      stoppingOnError: false,
      into: { contents.append($0) })
    guard repair || !repaired else { return nil }

    let str = unsafe contents.withUnsafeBufferPointer {
      unsafe String._uncheckedFromUTF8($0)
    }
    return (str, repaired)
  }

  /// Creates a string from code units in the given encoding.
  ///
  /// For `Unicode.ASCII` the input is checked with `_fromASCIIValidating`
  /// over contiguous storage (borrowed when available, otherwise via an
  /// `Array` copy); if that check fails, or for any other encoding, the work
  /// is delegated to `_slowFromCodeUnits`.
  ///
  /// - Returns: `nil` only when the slow path returns `nil`.
  @usableFromInline @inline(never) // can't be inlined w/out breaking ABI
  @_specialize(
    where Input == UnsafeBufferPointer<UInt8>, Encoding == Unicode.ASCII)
  @_specialize(
    where Input == Array<UInt8>, Encoding == Unicode.ASCII)
  internal static func _fromCodeUnits<
    Input: Collection,
    Encoding: Unicode.Encoding
  >(
    _ input: Input,
    encoding: Encoding.Type,
    repair: Bool
  ) -> (String, repairsMade: Bool)?
  where Input.Element == Encoding.CodeUnit {
    guard _fastPath(encoding == Unicode.ASCII.self) else {
      return _slowFromCodeUnits(input, encoding: encoding, repair: repair)
    }

    // Helper to simplify early returns
    func resultOrSlow(_ resultOpt: String?) -> (String, repairsMade: Bool)? {
      guard let result = resultOpt else {
        return _slowFromCodeUnits(input, encoding: encoding, repair: repair)
      }
      return (result, repairsMade: false)
    }

#if !$Embedded
    // Fast path for untyped raw storage and known stdlib types
    if let contigBytes = input as? _HasContiguousBytes,
      contigBytes._providesContiguousBytesNoCopy {
      return resultOrSlow(contigBytes.withUnsafeBytes { rawBufPtr in
        let buffer = unsafe UnsafeBufferPointer(
          start: rawBufPtr.baseAddress?.assumingMemoryBound(to: UInt8.self),
          count: rawBufPtr.count)
        return unsafe String._fromASCIIValidating(buffer)
      })
    }
#endif

    // Fast path for user-defined Collections
    if let strOpt = input.withContiguousStorageIfAvailable({
      (buffer: UnsafeBufferPointer<Input.Element>) -> String? in
      return unsafe String._fromASCIIValidating(
        UnsafeRawBufferPointer(buffer).bindMemory(to: UInt8.self))
    }) {
      return resultOrSlow(strOpt)
    }

    return unsafe resultOrSlow(Array(input).withUnsafeBufferPointer {
      let buffer = unsafe UnsafeRawBufferPointer($0).bindMemory(to: UInt8.self)
      return unsafe String._fromASCIIValidating(buffer)
    })
  }

  /// Creates a string from possibly invalid UTF-16, repairing as needed.
  public // @testable
  static func _fromInvalidUTF16(
    _ utf16: UnsafeBufferPointer<UInt16>
  ) -> String {
    // With `repair: true` the non-ASCII path (`_slowFromCodeUnits`) never
    // returns nil, so the force-unwrap cannot fail here.
    return unsafe String._fromCodeUnits(utf16, encoding: UTF16.self, repair: true)!.0
  }

  /// Returns the substring's base string unchanged when the substring covers
  /// the base's entire offset range; otherwise returns a copy of the
  /// substring's contents.
  @usableFromInline
  internal static func _fromSubstring(
    _ substring: __shared Substring
  ) -> String {
    if substring._offsetRange == substring.base._offsetRange {
      return substring.base
    }

    return String._copying(substring)
  }

  /// Returns a copy of `str`, made by copying its full-range substring.
  @_alwaysEmitIntoClient
  @inline(never) // slow-path
  internal static func _copying(_ str: String) -> String {
    return String._copying(str[...])
  }

  /// Returns a new string built from the UTF-8 contents of `str`.
  ///
  /// Fast-UTF-8 guts are read in place; otherwise the UTF-8 view is first
  /// copied into an `Array`.
  @_alwaysEmitIntoClient
  @inline(never) // slow-path
  internal static func _copying(_ str: Substring) -> String {
    if _fastPath(str._wholeGuts.isFastUTF8) {
      var new = unsafe str._wholeGuts.withFastUTF8(range: str._offsetRange) {
        unsafe String._uncheckedFromUTF8($0)
      }
#if os(watchOS) && _pointerBitWidth(_32)
      // Required for compatibility with some small strings that
      // may be encoded in the 32-bit slice of watchOS binaries.
      if str._wholeGuts.isSmall,
         str._wholeGuts.count > _SmallString.contiguousCapacity() {
        new.reserveCapacity(_SmallString.capacity + 1)
        return new
      }
#endif
      return new
    }
    return unsafe Array(str.utf8).withUnsafeBufferPointer {
      unsafe String._uncheckedFromUTF8($0)
    }
  }

  /// Creates a string from `input` interpreted as `encoding`, returning `nil`
  /// if the input fails validation or a decoded scalar cannot be encoded as
  /// UTF-8.
  ///
  /// Byte-sized UTF-8 and ASCII inputs are validated directly. Every other
  /// encoding is transcoded scalar-by-scalar into a manually allocated buffer
  /// whose ownership is handed to `__SharedStringStorage` on success and
  /// which is deallocated on failure.
  @usableFromInline
  @available(SwiftStdlib 6.0, *)
  internal static func _validate<Encoding: Unicode.Encoding>(
    _ input: UnsafeBufferPointer<Encoding.CodeUnit>,
    as encoding: Encoding.Type
  ) -> String? {
    if encoding.CodeUnit.self == UInt8.self {
      let bytes = unsafe _identityCast(input, to: UnsafeBufferPointer<UInt8>.self)
      if encoding.self == UTF8.self {
        guard case .success(let info) = unsafe validateUTF8(bytes) else { return nil }
        return unsafe String._uncheckedFromUTF8(bytes, asciiPreScanResult: info.isASCII)
      } else if encoding.self == Unicode.ASCII.self {
        guard unsafe _allASCII(bytes) else { return nil }
        return unsafe String._uncheckedFromASCII(bytes)
      }
    }

    // slow-path
    var isASCII = true
    var buffer: UnsafeMutableBufferPointer<UInt8>
    unsafe buffer = UnsafeMutableBufferPointer.allocate(capacity: input.count*3)
    // Index one past the last initialized byte of `buffer`.
    var written = buffer.startIndex

    var parser = Encoding.ForwardParser()
    var input = unsafe input.makeIterator()

    transcodingLoop:
    while true {
      switch unsafe parser.parseScalar(from: &input) {
      case .valid(let s):
        let scalar = Encoding.decode(s)
        guard let utf8 = Unicode.UTF8.encode(scalar) else {
          // transcoding error: clean up and return nil
          fallthrough
        }
        if buffer.count < written + utf8.count {
          // Not enough room for this scalar: grow by half and move the
          // already-written prefix into the new allocation.
          let newCapacity = buffer.count + (buffer.count >> 1)
          let copy: UnsafeMutableBufferPointer<UInt8>
          unsafe copy = UnsafeMutableBufferPointer.allocate(capacity: newCapacity)
          let copied = unsafe copy.moveInitialize(
            fromContentsOf: buffer.prefix(upTo: written)
          )
          unsafe buffer.deallocate()
          unsafe buffer = unsafe copy
          written = copied
        }
        // Any scalar needing more than one UTF-8 code unit is non-ASCII.
        if isASCII && utf8.count > 1 {
          isASCII = false
        }
        written = unsafe buffer.suffix(from: written).initialize(fromContentsOf: utf8)
        break
      case .error:
        // validation error: clean up and return nil
        unsafe buffer.prefix(upTo: written).deinitialize()
        unsafe buffer.deallocate()
        return nil
      case .emptyInput:
        break transcodingLoop
      }
    }

    let storage = unsafe buffer.baseAddress.map {
      unsafe __SharedStringStorage(
        _mortal: $0,
        countAndFlags: _StringObject.CountAndFlags(
          count: buffer.startIndex.distance(to: written),
          isASCII: isASCII,
          isNFC: isASCII,
          isNativelyStored: false,
          isTailAllocated: false
        )
      )
    }
    return storage?.asString
  }
}