Optimization pass over String and UTF8Span's allASCII helper (#82540)

This ranges between parity (for very small strings) and 5x faster (for
32-63B strings) in benchmarking on an M1 MBP. For large-ish strings it
delivers a roughly 2x speedup; a further increase in block size nets a
small win in microbenchmarks, but I do not expect that to translate to
real-world usage, given the code-size impact and the fact that most
strings are smallish.
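
For anyone who wants to reproduce this kind of measurement, a minimal harness
along the following lines suffices. This is a hypothetical sketch, not the
benchmark behind the numbers above; the top-level `allASCII` helper here is a
stand-in for the internal `_allASCII` entry point, which isn't directly
callable from user code.

```swift
// Hypothetical timing harness: measures an ASCII scan over a few buffer
// sizes, using allSatisfy as a stand-in for the internal _allASCII.
func allASCII(_ bytes: [UInt8]) -> Bool {
  bytes.allSatisfy { $0 < 0x80 }
}

let clock = ContinuousClock()
for size in [15, 32, 63, 256, 4096] {
  let buf = [UInt8](repeating: UInt8(ascii: "a"), count: size)
  var sink = true
  let elapsed = clock.measure {
    for _ in 0..<100_000 { sink = sink && allASCII(buf) }
  }
  precondition(sink)  // keep the scan from being optimized away
  print("\(size)B: \(elapsed / 100_000) per call")
}
```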

There's some opportunity for further work here; in particular, if people
start building Swift for a baseline of AVX2 or AVX512, we should have
paths for that (and we should also implement them if/when we get better
multiversioning dispatch machinery in the language). Span adoption would
be interesting. It's likely we should have a dedicated "small core"
implementation that uses only aligned accesses. Still, this is a
significant improvement as-is, and we should land it.
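
For illustration only, here is a rough sketch of what a wider-block variant
might look like using the portable SIMD types; `WideBlock` and
`allASCII(wideBlockAt:)` are hypothetical names, and a real stdlib path would
presumably use dedicated builtins the way the 16-byte paths in the diff below
do. With an AVX2 baseline, LLVM can lower the OR and the horizontal reduction
to 256-bit operations.

```swift
// Sketch of a hypothetical 64-byte block check built from SIMD32 lanes.
typealias WideBlock = (SIMD32<UInt8>, SIMD32<UInt8>)

func allASCII(wideBlockAt pointer: UnsafeRawPointer) -> Bool {
  let block = pointer.loadUnaligned(as: WideBlock.self)
  // Every byte is ASCII iff the largest byte value in the block is < 0x80.
  return (block.0 | block.1).max() < 0x80
}
```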


![allASCII](https://github.com/user-attachments/assets/ebbc45ba-5ba8-42dd-bf63-31ca77844fca)
Stephen Canon, 2025-07-11 12:08:17 -04:00 (committed by GitHub)
parent 3739956b4c, commit d10b3f82fc


@@ -13,60 +13,126 @@
//===----------------------------------------------------------------------===//
internal func _allASCII(_ input: UnsafeBufferPointer<UInt8>) -> Bool {
  if input.isEmpty { return true }

  //--------------- Implementation building blocks ---------------------------//
#if arch(arm64_32)
  typealias Word = UInt64
#else
  typealias Word = UInt
#endif
  let mask = Word(truncatingIfNeeded: 0x80808080_80808080 as UInt64)

#if (arch(i386) || arch(x86_64)) && SWIFT_STDLIB_ENABLE_VECTOR_TYPES
  // TODO: Should consider AVX2 / AVX512 / AVX10 path here
  typealias Block = (SIMD16<UInt8>, SIMD16<UInt8>)
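  // Packs the sign bit of each of the 16 bytes into a 16-bit mask, mirroring
  // the x86 PMOVMSKB instruction; the mask is nonzero exactly when some byte
  // has its high bit set, i.e. is non-ASCII.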
  @_transparent func pmovmskb(_ vec: SIMD16<UInt8>) -> UInt16 {
    UInt16(Builtin.bitcast_Vec16xInt1_Int16(
      Builtin.cmp_slt_Vec16xInt8(vec._storage._value, Builtin.zeroInitializer())
    ))
  }
#elseif (arch(arm64) || arch(arm64_32)) && SWIFT_STDLIB_ENABLE_VECTOR_TYPES
  typealias Block = (SIMD16<UInt8>, SIMD16<UInt8>)
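  // Horizontal unsigned-maximum reduction across the 16 lanes (AArch64's
  // UMAXV instruction); the result is >= 0x80 exactly when some byte is
  // non-ASCII.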
  @_transparent func umaxv(_ vec: SIMD16<UInt8>) -> UInt8 {
    UInt8(Builtin.int_vector_reduce_umax_Vec16xInt8(vec._storage._value))
  }
#else
  typealias Block = (Word, Word, Word, Word)
#endif
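
  // Loads a Word from `pointer` (which need not be aligned) and tests whether
  // any byte in it has its high bit set; ASCII bytes are exactly those < 0x80.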
  @_transparent
  func allASCII(wordAt pointer: UnsafePointer<UInt8>) -> Bool {
    let word = unsafe UnsafeRawPointer(pointer).loadUnaligned(as: Word.self)
    return word & mask == 0
  }
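
  // The same test over a full Block, using the cheapest whole-register
  // reduction the target offers.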
  @_transparent
  func allASCII(blockAt pointer: UnsafePointer<UInt8>) -> Bool {
    let block = unsafe UnsafeRawPointer(pointer).loadUnaligned(as: Block.self)
#if (arch(i386) || arch(x86_64)) && SWIFT_STDLIB_ENABLE_VECTOR_TYPES
    return pmovmskb(block.0 | block.1) == 0
#elseif (arch(arm64) || arch(arm64_32)) && SWIFT_STDLIB_ENABLE_VECTOR_TYPES
    return umaxv(block.0 | block.1) < 0x80
#else
    return (block.0 | block.1 | block.2 | block.3) & mask == 0
#endif
  }

  //----------------------- Implementation proper ----------------------------//
  guard input.count >= MemoryLayout<Word>.size else {
    // They gave us a region of memory
    // whose size is as modest as it can be.
    // We'll check every byte
    // for the bit of most height
    // and return if we happen on any
    //
    // I'm sorry, I'm sorry, I'm trying to delete it. (This chunk of code, not
    // the Limerick. I would wager that--at least for Strings--we could
    // unconditionally load 16B here,¹ because of the small string encoding,
    // and check them all at once, which would be much more efficient. That
    // probably has to happen by lifting this check into the SmallString
    // initializer directly, though.)
    //
    // ¹ well, most of the time, which makes it a rather conditional
    //   "unconditionally".
    return unsafe input.allSatisfy { $0 < 0x80 }
  }
  // bytes.count is non-zero, so we can unconditionally unwrap baseAddress.
  let base = unsafe input.baseAddress._unsafelyUnwrappedUnchecked
  let n = input.count
  var i = 0

  guard n >= MemoryLayout<Block>.size else {
    // The size isn't yet to a block
    // word-by-word we are forced to walk.
    // So as to not leave a gap
    // the last word may lap
    // the word that we already chalked.
    //
    //   0      k     2k     3k    ?k     n-k    n-1
    //   |      |      |      |     |      |      |
    //   +------+------+------+     +------+      |
    //   | word | word | word | ... | word |      |
    //   +------+------+------+     +------+      v
    //                                     +------+
    //   possibly overlapping final word > | word |
    //                                     +------+
    //
    // This means that we check any bytes in the overlap region twice, but
    // that's much preferable to using smaller accesses to avoid rechecking,
    // because the entire last word is about as expensive as checking just
    // one byte would be, and on average there's more than one byte remaining.
    //
    // Note that we don't bother trying to align any of these accesses, because
    // there is minimal benefit to doing so on "modern" OoO cores, which can
    // handle cacheline-crossing loads at full speed. If the string happens to
    // be aligned, they'll be aligned; if not, they won't be. It will likely
    // make sense to add a path that does align everything for more limited
    // embedded CPUs, though.
    let k = MemoryLayout<Word>.size
    let last = n &- k
    while i < last {
      guard unsafe allASCII(wordAt: base + i) else { return false }
      i &+= k
    }
    return unsafe allASCII(wordAt: base + last)
  }

  // check block-by-block, with a possibly overlapping last block to avoid
  // sub-block cleanup. We should be able to avoid manual index arithmetic
  // and write this loop and the one above something like the following:
  //
  //    return stride(from: 0, to: last, by: k).allSatisfy {
  //      allASCII(blockAt: base + $0)
  //    } && allASCII(blockAt: base + last)
  //
  // but LLVM leaves one unnecessary conditional operation in the loop
  // when we do that, so we write them out as while loops instead for now.
  let k = MemoryLayout<Block>.size
  let last = n &- k
  while i < last {
    guard unsafe allASCII(blockAt: base + i) else { return false }
    i &+= k
  }
  return unsafe allASCII(blockAt: base + last)
}
extension String {
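
For experimentation outside the stdlib, the overlapping-tail structure is easy
to reproduce with public API. The following is a self-contained sketch under my
own naming (`isAllASCII` is hypothetical); it implements only the word-sized
loop, without the SIMD block paths or the `Builtin` reductions above.

```swift
// Word-at-a-time ASCII check with a possibly overlapping final word,
// mirroring the structure of the stdlib helper above.
func isAllASCII(_ bytes: [UInt8]) -> Bool {
  let mask = UInt(truncatingIfNeeded: 0x8080_8080_8080_8080 as UInt64)
  let k = MemoryLayout<UInt>.size
  return bytes.withUnsafeBufferPointer { buf in
    guard buf.count >= k else {
      // Too small for even one word; fall back to a byte-wise check.
      return buf.allSatisfy { $0 < 0x80 }
    }
    let base = UnsafeRawPointer(buf.baseAddress!)
    var i = 0
    let last = buf.count - k
    while i < last {
      guard base.loadUnaligned(fromByteOffset: i, as: UInt.self) & mask == 0
      else { return false }
      i += k
    }
    // The final word may re-check bytes covered by the loop; that's cheaper
    // than a byte-by-byte tail.
    return base.loadUnaligned(fromByteOffset: last, as: UInt.self) & mask == 0
  }
}

print(isAllASCII(Array("hello, world".utf8)))  // true
print(isAllASCII(Array("héllo".utf8)))         // false
```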