swift-mirror/stdlib/public/core/Hasher.swift
Karoy Lorentey ebf290a598 [stdlib] Optimize ASCII String hashing
- String hashing is not inlinable, so it can use _Hasher._core operations directly. Remove custom buffering.
- Speed up ASCII hashing by as much as 5.5x by feeding the storage buffer directly into the hasher in a single go.
- For other strings, just feed the UTF-8 encoding of the normalized string to the hasher; don't switch to UTF-16 at the first non-ASCII scalar. (Doing that would make the hash encoding of some string sequences ambiguous, leading to artificial collisions.)
- Add a single unconditional terminator byte, 0xFF. It's not a valid UTF-8 code unit, so it won't ever occur within a normalized string encoding.
2018-04-19 11:49:51 +01:00
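
The scheme, in a minimal sketch (illustrative only: `_hashUTF8` is a
hypothetical name, and the real entry points live in String's hashing code,
not in this file):

  // Feed the normalized UTF-8 bytes in one go, then append a single 0xFF
  // terminator. 0xFF is never a valid UTF-8 code unit, so the terminator
  // cannot be confused with string content.
  func _hashUTF8(_ utf8: UnsafeRawBufferPointer, into hasher: inout _Hasher) {
    hasher.combine(bytes: utf8)     // bulk byte input, defined below
    hasher._combine(0xFF as UInt8)  // unconditional terminator byte
  }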


//===----------------------------------------------------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2018 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See https://swift.org/LICENSE.txt for license information
// See https://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
//
// Defines the Hasher struct, representing Swift's standard hash function.
//
//===----------------------------------------------------------------------===//
import SwiftShims

internal protocol _HasherCore {
  init(seed: (UInt64, UInt64))
  mutating func compress(_ value: UInt64)
  mutating func finalize(tailAndByteCount: UInt64) -> UInt64
}

// Loads a group of bytes (from 0 up to 7) located at `p` into the least
// significant bits of a UInt64, in little-endian byte order.
@inline(__always)
internal func _loadPartialUnalignedUInt64LE(
  _ p: UnsafeRawPointer,
  byteCount: Int
) -> UInt64 {
  var result: UInt64 = 0
  switch byteCount {
  case 7:
    result |= UInt64(p.load(fromByteOffset: 6, as: UInt8.self)) &<< 48
    fallthrough
  case 6:
    result |= UInt64(p.load(fromByteOffset: 5, as: UInt8.self)) &<< 40
    fallthrough
  case 5:
    result |= UInt64(p.load(fromByteOffset: 4, as: UInt8.self)) &<< 32
    fallthrough
  case 4:
    result |= UInt64(p.load(fromByteOffset: 3, as: UInt8.self)) &<< 24
    fallthrough
  case 3:
    result |= UInt64(p.load(fromByteOffset: 2, as: UInt8.self)) &<< 16
    fallthrough
  case 2:
    result |= UInt64(p.load(fromByteOffset: 1, as: UInt8.self)) &<< 8
    fallthrough
  case 1:
    result |= UInt64(p.load(fromByteOffset: 0, as: UInt8.self))
    fallthrough
  case 0:
    return result
  default:
    _sanityCheckFailure()
  }
}
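
// For example, loading the 3-byte sequence [0x01, 0x02, 0x03] with the
// function above yields 0x030201: bytes fill the result from its least
// significant end, matching a little-endian partial load, and the unused
// high bits stay zero.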

/// This is a buffer for segmenting arbitrary data into 8-byte chunks. Buffer
/// storage is represented by a single 64-bit value in the format used by the
/// finalization step of SipHash. (The least significant 56 bits hold the
/// trailing bytes, while the most significant 8 bits hold the count of bytes
/// appended so far, modulo 256. The count of bytes currently stored in the
/// buffer is in the lower three bits of the byte count.)
internal struct _HasherTailBuffer {
  // msb                                                             lsb
  // +---------+-------+-------+-------+-------+-------+-------+-------+
  // |byteCount|                   tail (<= 56 bits)                   |
  // +---------+-------+-------+-------+-------+-------+-------+-------+
  internal var value: UInt64

  @inline(__always)
  internal init() {
    self.value = 0
  }

  @inline(__always)
  internal init(tail: UInt64, byteCount: UInt64) {
    // byteCount can be any value, but we only keep the lower 8 bits. (The
    // lower three bits specify the count of bytes stored in this buffer.)
    _sanityCheck(tail & ~(1 << ((byteCount & 7) << 3) - 1) == 0)
    self.value = (byteCount &<< 56 | tail)
  }

  internal var tail: UInt64 {
    @inline(__always)
    get { return value & ~(0xFF &<< 56) }
  }

  internal var byteCount: UInt64 {
    @inline(__always)
    get { return value &>> 56 }
  }

  internal var isFinalized: Bool {
    @inline(__always)
    get { return value == 1 }
  }

  @inline(__always)
  internal mutating func finalize() {
    // A byteCount of 0 with a nonzero tail never occurs during normal use.
    value = 1
  }

  @inline(__always)
  internal mutating func append(_ bytes: UInt64) -> UInt64 {
    let c = byteCount & 7
    if c == 0 {
      value = value &+ (8 &<< 56)
      return bytes
    }
    let shift = c &<< 3
    let chunk = tail | (bytes &<< shift)
    value = (((value &>> 56) &+ 8) &<< 56) | (bytes &>> (64 - shift))
    return chunk
  }

  @inline(__always)
  internal mutating func append(_ bytes: UInt64, count: UInt64) -> UInt64? {
    _sanityCheck(count >= 0 && count < 8)
    _sanityCheck(bytes & ~((1 &<< (count &<< 3)) &- 1) == 0)
    let c = byteCount & 7
    let shift = c &<< 3
    if c + count < 8 {
      value = (value | (bytes &<< shift)) &+ (count &<< 56)
      return nil
    }
    let chunk = tail | (bytes &<< shift)
    value = ((value &>> 56) &+ count) &<< 56
    if c + count > 8 {
      value |= bytes &>> (64 - shift)
    }
    return chunk
  }
}
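
// Worked example of the encoding above: starting from an empty buffer,
// appending the single byte 0x41 gives value == 0x0100_0000_0000_0041
// (byteCount 1, tail 0x41); appending 0x42 next gives
// 0x0200_0000_0000_4241. Once an append completes an 8-byte chunk, the
// chunk is returned for compression and only the overflow stays in the tail.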

internal struct _BufferingHasher<Core: _HasherCore> {
  private var _buffer: _HasherTailBuffer
  private var _core: Core

  @inline(__always)
  internal init(seed: (UInt64, UInt64)) {
    self._buffer = _HasherTailBuffer()
    self._core = Core(seed: seed)
  }

  @inline(__always)
  internal mutating func combine(_ value: UInt) {
#if arch(i386) || arch(arm)
    combine(UInt32(truncatingIfNeeded: value))
#else
    combine(UInt64(truncatingIfNeeded: value))
#endif
  }

  @inline(__always)
  internal mutating func combine(_ value: UInt64) {
    precondition(!_buffer.isFinalized)
    _core.compress(_buffer.append(value))
  }

  @inline(__always)
  internal mutating func combine(_ value: UInt32) {
    precondition(!_buffer.isFinalized)
    let value = UInt64(truncatingIfNeeded: value)
    if let chunk = _buffer.append(value, count: 4) {
      _core.compress(chunk)
    }
  }

  @inline(__always)
  internal mutating func combine(_ value: UInt16) {
    precondition(!_buffer.isFinalized)
    let value = UInt64(truncatingIfNeeded: value)
    if let chunk = _buffer.append(value, count: 2) {
      _core.compress(chunk)
    }
  }

  @inline(__always)
  internal mutating func combine(_ value: UInt8) {
    precondition(!_buffer.isFinalized)
    let value = UInt64(truncatingIfNeeded: value)
    if let chunk = _buffer.append(value, count: 1) {
      _core.compress(chunk)
    }
  }

  @inline(__always)
  internal mutating func combine(bytes: UInt64, count: Int) {
    precondition(!_buffer.isFinalized)
    _sanityCheck(count <= 8)
    let count = UInt64(truncatingIfNeeded: count)
    if let chunk = _buffer.append(bytes, count: count) {
      _core.compress(chunk)
    }
  }

  @inline(__always)
  internal mutating func combine(bytes: UnsafeRawBufferPointer) {
    precondition(!_buffer.isFinalized)
    var remaining = bytes.count
    guard remaining > 0 else { return }
    var data = bytes.baseAddress!
    // Load first unaligned partial word of data
    do {
      let start = UInt(bitPattern: data)
      let end = _roundUp(start, toAlignment: MemoryLayout<UInt64>.alignment)
      let c = min(remaining, Int(end - start))
      if c > 0 {
        let chunk = _loadPartialUnalignedUInt64LE(data, byteCount: c)
        combine(bytes: chunk, count: c)
        data += c
        remaining -= c
      }
    }
    _sanityCheck(
      remaining == 0 ||
      Int(bitPattern: data) & (MemoryLayout<UInt64>.alignment - 1) == 0)
    // Load as many aligned words as there are in the input buffer
    while remaining >= MemoryLayout<UInt64>.size {
      combine(UInt64(littleEndian: data.load(as: UInt64.self)))
      data += MemoryLayout<UInt64>.size
      remaining -= MemoryLayout<UInt64>.size
    }
    // Load last partial word of data
    _sanityCheck(remaining >= 0 && remaining < 8)
    if remaining > 0 {
      let chunk = _loadPartialUnalignedUInt64LE(data, byteCount: remaining)
      combine(bytes: chunk, count: remaining)
    }
  }

  @inline(__always)
  internal mutating func finalize() -> UInt64 {
    precondition(!_buffer.isFinalized)
    let hash = _core.finalize(tailAndByteCount: _buffer.value)
    _buffer.finalize()
    return hash
  }
}
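
// Buffering example: combining a UInt32 followed by a UInt64 triggers
// exactly one compression. The four UInt32 bytes wait in the tail buffer
// until the low four bytes of the UInt64 complete an 8-byte chunk; the
// UInt64's high four bytes remain buffered for the next append or for
// finalization.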

@_fixed_layout // FIXME: Should be resilient (rdar://problem/38549901)
public struct _Hasher {
  internal typealias Core = _BufferingHasher<_SipHash13Core>

  internal var _core: Core

  @effects(releasenone)
  public init() {
    self._core = Core(seed: _Hasher._seed)
  }

  @usableFromInline
  @effects(releasenone)
  internal init(_seed seed: (UInt64, UInt64)) {
    self._core = Core(seed: seed)
  }

  /// Indicates whether we're running in an environment where hashing needs to
  /// be deterministic. If this is true, the hash seed is not random, and hash
  /// tables do not apply per-instance perturbation that is not repeatable.
  /// This is not recommended for production use, but it is useful in certain
  /// test environments where randomization may lead to unwanted nondeterminism
  /// of test results.
  public // SPI
  static var _isDeterministic: Bool {
    @inlinable
    @inline(__always)
    get {
      return _swift_stdlib_Hashing_parameters.deterministic
    }
  }

  /// The 128-bit hash seed used to initialize the hasher state. Initialized
  /// once during process startup.
  public // SPI
  static var _seed: (UInt64, UInt64) {
    @inlinable
    @inline(__always)
    get {
      // The seed itself is defined in C++ code so that it is initialized
      // during static construction. Almost every Swift program uses hash
      // tables, so initializing the seed during startup seems to be the
      // right trade-off.
      return (
        _swift_stdlib_Hashing_parameters.seed0,
        _swift_stdlib_Hashing_parameters.seed1)
    }
  }

  @inlinable
  @inline(__always)
  public mutating func combine<H: Hashable>(_ value: H) {
    value._hash(into: &self)
  }

  // FIXME: Convert to @usableFromInline internal once integers hash correctly.
  @effects(releasenone)
  public mutating func _combine(_ value: UInt) {
    _core.combine(value)
  }

  // FIXME: Convert to @usableFromInline internal once integers hash correctly.
  @effects(releasenone)
  public mutating func _combine(_ value: UInt64) {
    _core.combine(value)
  }

  // FIXME: Convert to @usableFromInline internal once integers hash correctly.
  @effects(releasenone)
  public mutating func _combine(_ value: UInt32) {
    _core.combine(value)
  }

  // FIXME: Convert to @usableFromInline internal once integers hash correctly.
  @effects(releasenone)
  public mutating func _combine(_ value: UInt16) {
    _core.combine(value)
  }

  // FIXME: Convert to @usableFromInline internal once integers hash correctly.
  @effects(releasenone)
  public mutating func _combine(_ value: UInt8) {
    _core.combine(value)
  }

  @effects(releasenone)
  public mutating func _combine(bytes value: UInt64, count: Int) {
    _core.combine(bytes: value, count: count)
  }

  @effects(releasenone)
  public mutating func combine(bytes: UnsafeRawBufferPointer) {
    _core.combine(bytes: bytes)
  }

  @effects(releasenone)
  public mutating func finalize() -> Int {
    return Int(truncatingIfNeeded: _core.finalize())
  }
}
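
A usage sketch of the API above (illustrative; `_Hasher` is stdlib-internal
SPI, and ordinary code would go through `Hashable` instead):

  var hasher = _Hasher()
  hasher.combine(42)  // dispatches through Hashable's _hash(into:)
  let bytes: [UInt8] = Array("hello".utf8)
  bytes.withUnsafeBytes { hasher.combine(bytes: $0) }
  let hash = hasher.finalize()  // traps if the hasher is used again afterward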