|  | // Copyright 2019 The Go Authors. All rights reserved. | 
|  | // Use of this source code is governed by a BSD-style | 
|  | // license that can be found in the LICENSE file. | 
|  |  | 
|  | package maphash | 
|  |  | 
|  | import ( | 
|  | "fmt" | 
|  | "math" | 
|  | "math/rand" | 
|  | "runtime" | 
|  | "strings" | 
|  | "testing" | 
|  | "unsafe" | 
|  | ) | 
|  |  | 
|  | // Smhasher is a torture test for hash functions. | 
|  | // https://code.google.com/p/smhasher/ | 
|  | // This code is a port of some of the Smhasher tests to Go. | 
|  |  | 
|  | var fixedSeed = MakeSeed() | 
|  |  | 
|  | // Sanity checks. | 
|  | // hash should not depend on values outside key. | 
|  | // hash should not depend on alignment. | 
|  | func TestSmhasherSanity(t *testing.T) { | 
|  | r := rand.New(rand.NewSource(1234)) | 
|  | const REP = 10 | 
|  | const KEYMAX = 128 | 
|  | const PAD = 16 | 
|  | const OFFMAX = 16 | 
|  | for k := 0; k < REP; k++ { | 
|  | for n := 0; n < KEYMAX; n++ { | 
|  | for i := 0; i < OFFMAX; i++ { | 
|  | var b [KEYMAX + OFFMAX + 2*PAD]byte | 
|  | var c [KEYMAX + OFFMAX + 2*PAD]byte | 
|  | randBytes(r, b[:]) | 
|  | randBytes(r, c[:]) | 
|  | copy(c[PAD+i:PAD+i+n], b[PAD:PAD+n]) | 
|  | if bytesHash(b[PAD:PAD+n]) != bytesHash(c[PAD+i:PAD+i+n]) { | 
|  | t.Errorf("hash depends on bytes outside key") | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | func bytesHash(b []byte) uint64 { | 
|  | var h Hash | 
|  | h.SetSeed(fixedSeed) | 
|  | h.Write(b) | 
|  | return h.Sum64() | 
|  | } | 
|  | func stringHash(s string) uint64 { | 
|  | var h Hash | 
|  | h.SetSeed(fixedSeed) | 
|  | h.WriteString(s) | 
|  | return h.Sum64() | 
|  | } | 
|  |  | 
|  | const hashSize = 64 | 
|  |  | 
|  | func randBytes(r *rand.Rand, b []byte) { | 
|  | r.Read(b) // can't fail | 
|  | } | 
|  |  | 
|  | // A hashSet measures the frequency of hash collisions. | 
|  | type hashSet struct { | 
|  | m map[uint64]struct{} // set of hashes added | 
|  | n int                 // number of hashes added | 
|  | } | 
|  |  | 
|  | func newHashSet() *hashSet { | 
|  | return &hashSet{make(map[uint64]struct{}), 0} | 
|  | } | 
|  | func (s *hashSet) add(h uint64) { | 
|  | s.m[h] = struct{}{} | 
|  | s.n++ | 
|  | } | 
|  | func (s *hashSet) addS(x string) { | 
|  | s.add(stringHash(x)) | 
|  | } | 
|  | func (s *hashSet) addB(x []byte) { | 
|  | s.add(bytesHash(x)) | 
|  | } | 
|  | func (s *hashSet) addS_seed(x string, seed Seed) { | 
|  | var h Hash | 
|  | h.SetSeed(seed) | 
|  | h.WriteString(x) | 
|  | s.add(h.Sum64()) | 
|  | } | 
|  | func (s *hashSet) check(t *testing.T) { | 
|  | const SLOP = 10.0 | 
|  | collisions := s.n - len(s.m) | 
|  | pairs := int64(s.n) * int64(s.n-1) / 2 | 
|  | expected := float64(pairs) / math.Pow(2.0, float64(hashSize)) | 
|  | stddev := math.Sqrt(expected) | 
|  | if float64(collisions) > expected+SLOP*(3*stddev+1) { | 
|  | t.Errorf("unexpected number of collisions: got=%d mean=%f stddev=%f", collisions, expected, stddev) | 
|  | } | 
|  | } | 
|  |  | 
|  | // a string plus adding zeros must make distinct hashes | 
|  | func TestSmhasherAppendedZeros(t *testing.T) { | 
|  | s := "hello" + strings.Repeat("\x00", 256) | 
|  | h := newHashSet() | 
|  | for i := 0; i <= len(s); i++ { | 
|  | h.addS(s[:i]) | 
|  | } | 
|  | h.check(t) | 
|  | } | 
|  |  | 
|  | // All 0-3 byte strings have distinct hashes. | 
|  | func TestSmhasherSmallKeys(t *testing.T) { | 
|  | h := newHashSet() | 
|  | var b [3]byte | 
|  | for i := 0; i < 256; i++ { | 
|  | b[0] = byte(i) | 
|  | h.addB(b[:1]) | 
|  | for j := 0; j < 256; j++ { | 
|  | b[1] = byte(j) | 
|  | h.addB(b[:2]) | 
|  | if !testing.Short() { | 
|  | for k := 0; k < 256; k++ { | 
|  | b[2] = byte(k) | 
|  | h.addB(b[:3]) | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  | h.check(t) | 
|  | } | 
|  |  | 
|  | // Different length strings of all zeros have distinct hashes. | 
|  | func TestSmhasherZeros(t *testing.T) { | 
|  | N := 256 * 1024 | 
|  | if testing.Short() { | 
|  | N = 1024 | 
|  | } | 
|  | h := newHashSet() | 
|  | b := make([]byte, N) | 
|  | for i := 0; i <= N; i++ { | 
|  | h.addB(b[:i]) | 
|  | } | 
|  | h.check(t) | 
|  | } | 
|  |  | 
|  | // Strings with up to two nonzero bytes all have distinct hashes. | 
|  | func TestSmhasherTwoNonzero(t *testing.T) { | 
|  | if runtime.GOARCH == "wasm" { | 
|  | t.Skip("Too slow on wasm") | 
|  | } | 
|  | if testing.Short() { | 
|  | t.Skip("Skipping in short mode") | 
|  | } | 
|  | h := newHashSet() | 
|  | for n := 2; n <= 16; n++ { | 
|  | twoNonZero(h, n) | 
|  | } | 
|  | h.check(t) | 
|  | } | 
|  | func twoNonZero(h *hashSet, n int) { | 
|  | b := make([]byte, n) | 
|  |  | 
|  | // all zero | 
|  | h.addB(b) | 
|  |  | 
|  | // one non-zero byte | 
|  | for i := 0; i < n; i++ { | 
|  | for x := 1; x < 256; x++ { | 
|  | b[i] = byte(x) | 
|  | h.addB(b) | 
|  | b[i] = 0 | 
|  | } | 
|  | } | 
|  |  | 
|  | // two non-zero bytes | 
|  | for i := 0; i < n; i++ { | 
|  | for x := 1; x < 256; x++ { | 
|  | b[i] = byte(x) | 
|  | for j := i + 1; j < n; j++ { | 
|  | for y := 1; y < 256; y++ { | 
|  | b[j] = byte(y) | 
|  | h.addB(b) | 
|  | b[j] = 0 | 
|  | } | 
|  | } | 
|  | b[i] = 0 | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | // Test strings with repeats, like "abcdabcdabcdabcd..." | 
|  | func TestSmhasherCyclic(t *testing.T) { | 
|  | if testing.Short() { | 
|  | t.Skip("Skipping in short mode") | 
|  | } | 
|  | r := rand.New(rand.NewSource(1234)) | 
|  | const REPEAT = 8 | 
|  | const N = 1000000 | 
|  | for n := 4; n <= 12; n++ { | 
|  | h := newHashSet() | 
|  | b := make([]byte, REPEAT*n) | 
|  | for i := 0; i < N; i++ { | 
|  | b[0] = byte(i * 79 % 97) | 
|  | b[1] = byte(i * 43 % 137) | 
|  | b[2] = byte(i * 151 % 197) | 
|  | b[3] = byte(i * 199 % 251) | 
|  | randBytes(r, b[4:n]) | 
|  | for j := n; j < n*REPEAT; j++ { | 
|  | b[j] = b[j-n] | 
|  | } | 
|  | h.addB(b) | 
|  | } | 
|  | h.check(t) | 
|  | } | 
|  | } | 
|  |  | 
|  | // Test strings with only a few bits set | 
|  | func TestSmhasherSparse(t *testing.T) { | 
|  | if runtime.GOARCH == "wasm" { | 
|  | t.Skip("Too slow on wasm") | 
|  | } | 
|  | if testing.Short() { | 
|  | t.Skip("Skipping in short mode") | 
|  | } | 
|  | sparse(t, 32, 6) | 
|  | sparse(t, 40, 6) | 
|  | sparse(t, 48, 5) | 
|  | sparse(t, 56, 5) | 
|  | sparse(t, 64, 5) | 
|  | sparse(t, 96, 4) | 
|  | sparse(t, 256, 3) | 
|  | sparse(t, 2048, 2) | 
|  | } | 
|  | func sparse(t *testing.T, n int, k int) { | 
|  | b := make([]byte, n/8) | 
|  | h := newHashSet() | 
|  | setbits(h, b, 0, k) | 
|  | h.check(t) | 
|  | } | 
|  |  | 
|  | // set up to k bits at index i and greater | 
|  | func setbits(h *hashSet, b []byte, i int, k int) { | 
|  | h.addB(b) | 
|  | if k == 0 { | 
|  | return | 
|  | } | 
|  | for j := i; j < len(b)*8; j++ { | 
|  | b[j/8] |= byte(1 << uint(j&7)) | 
|  | setbits(h, b, j+1, k-1) | 
|  | b[j/8] &= byte(^(1 << uint(j&7))) | 
|  | } | 
|  | } | 
|  |  | 
|  | // Test all possible combinations of n blocks from the set s. | 
|  | // "permutation" is a bad name here, but it is what Smhasher uses. | 
|  | func TestSmhasherPermutation(t *testing.T) { | 
|  | if runtime.GOARCH == "wasm" { | 
|  | t.Skip("Too slow on wasm") | 
|  | } | 
|  | if testing.Short() { | 
|  | t.Skip("Skipping in short mode") | 
|  | } | 
|  | permutation(t, []uint32{0, 1, 2, 3, 4, 5, 6, 7}, 8) | 
|  | permutation(t, []uint32{0, 1 << 29, 2 << 29, 3 << 29, 4 << 29, 5 << 29, 6 << 29, 7 << 29}, 8) | 
|  | permutation(t, []uint32{0, 1}, 20) | 
|  | permutation(t, []uint32{0, 1 << 31}, 20) | 
|  | permutation(t, []uint32{0, 1, 2, 3, 4, 5, 6, 7, 1 << 29, 2 << 29, 3 << 29, 4 << 29, 5 << 29, 6 << 29, 7 << 29}, 6) | 
|  | } | 
|  | func permutation(t *testing.T, s []uint32, n int) { | 
|  | b := make([]byte, n*4) | 
|  | h := newHashSet() | 
|  | genPerm(h, b, s, 0) | 
|  | h.check(t) | 
|  | } | 
|  | func genPerm(h *hashSet, b []byte, s []uint32, n int) { | 
|  | h.addB(b[:n]) | 
|  | if n == len(b) { | 
|  | return | 
|  | } | 
|  | for _, v := range s { | 
|  | b[n] = byte(v) | 
|  | b[n+1] = byte(v >> 8) | 
|  | b[n+2] = byte(v >> 16) | 
|  | b[n+3] = byte(v >> 24) | 
|  | genPerm(h, b, s, n+4) | 
|  | } | 
|  | } | 
|  |  | 
|  | type key interface { | 
|  | clear()              // set bits all to 0 | 
|  | random(r *rand.Rand) // set key to something random | 
|  | bits() int           // how many bits key has | 
|  | flipBit(i int)       // flip bit i of the key | 
|  | hash() uint64        // hash the key | 
|  | name() string        // for error reporting | 
|  | } | 
|  |  | 
|  | type bytesKey struct { | 
|  | b []byte | 
|  | } | 
|  |  | 
|  | func (k *bytesKey) clear() { | 
|  | for i := range k.b { | 
|  | k.b[i] = 0 | 
|  | } | 
|  | } | 
|  | func (k *bytesKey) random(r *rand.Rand) { | 
|  | randBytes(r, k.b) | 
|  | } | 
|  | func (k *bytesKey) bits() int { | 
|  | return len(k.b) * 8 | 
|  | } | 
|  | func (k *bytesKey) flipBit(i int) { | 
|  | k.b[i>>3] ^= byte(1 << uint(i&7)) | 
|  | } | 
|  | func (k *bytesKey) hash() uint64 { | 
|  | return bytesHash(k.b) | 
|  | } | 
|  | func (k *bytesKey) name() string { | 
|  | return fmt.Sprintf("bytes%d", len(k.b)) | 
|  | } | 
|  |  | 
|  | // Flipping a single bit of a key should flip each output bit with 50% probability. | 
|  | func TestSmhasherAvalanche(t *testing.T) { | 
|  | if runtime.GOARCH == "wasm" { | 
|  | t.Skip("Too slow on wasm") | 
|  | } | 
|  | if testing.Short() { | 
|  | t.Skip("Skipping in short mode") | 
|  | } | 
|  | avalancheTest1(t, &bytesKey{make([]byte, 2)}) | 
|  | avalancheTest1(t, &bytesKey{make([]byte, 4)}) | 
|  | avalancheTest1(t, &bytesKey{make([]byte, 8)}) | 
|  | avalancheTest1(t, &bytesKey{make([]byte, 16)}) | 
|  | avalancheTest1(t, &bytesKey{make([]byte, 32)}) | 
|  | avalancheTest1(t, &bytesKey{make([]byte, 200)}) | 
|  | } | 
|  | func avalancheTest1(t *testing.T, k key) { | 
|  | const REP = 100000 | 
|  | r := rand.New(rand.NewSource(1234)) | 
|  | n := k.bits() | 
|  |  | 
|  | // grid[i][j] is a count of whether flipping | 
|  | // input bit i affects output bit j. | 
|  | grid := make([][hashSize]int, n) | 
|  |  | 
|  | for z := 0; z < REP; z++ { | 
|  | // pick a random key, hash it | 
|  | k.random(r) | 
|  | h := k.hash() | 
|  |  | 
|  | // flip each bit, hash & compare the results | 
|  | for i := 0; i < n; i++ { | 
|  | k.flipBit(i) | 
|  | d := h ^ k.hash() | 
|  | k.flipBit(i) | 
|  |  | 
|  | // record the effects of that bit flip | 
|  | g := &grid[i] | 
|  | for j := 0; j < hashSize; j++ { | 
|  | g[j] += int(d & 1) | 
|  | d >>= 1 | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | // Each entry in the grid should be about REP/2. | 
|  | // More precisely, we did N = k.bits() * hashSize experiments where | 
|  | // each is the sum of REP coin flips. We want to find bounds on the | 
|  | // sum of coin flips such that a truly random experiment would have | 
|  | // all sums inside those bounds with 99% probability. | 
|  | N := n * hashSize | 
|  | var c float64 | 
|  | // find c such that Prob(mean-c*stddev < x < mean+c*stddev)^N > .9999 | 
|  | for c = 0.0; math.Pow(math.Erf(c/math.Sqrt(2)), float64(N)) < .9999; c += .1 { | 
|  | } | 
|  | c *= 4.0 // allowed slack - we don't need to be perfectly random | 
|  | mean := .5 * REP | 
|  | stddev := .5 * math.Sqrt(REP) | 
|  | low := int(mean - c*stddev) | 
|  | high := int(mean + c*stddev) | 
|  | for i := 0; i < n; i++ { | 
|  | for j := 0; j < hashSize; j++ { | 
|  | x := grid[i][j] | 
|  | if x < low || x > high { | 
|  | t.Errorf("bad bias for %s bit %d -> bit %d: %d/%d\n", k.name(), i, j, x, REP) | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | // All bit rotations of a set of distinct keys | 
|  | func TestSmhasherWindowed(t *testing.T) { | 
|  | windowed(t, &bytesKey{make([]byte, 128)}) | 
|  | } | 
|  | func windowed(t *testing.T, k key) { | 
|  | if runtime.GOARCH == "wasm" { | 
|  | t.Skip("Too slow on wasm") | 
|  | } | 
|  | if testing.Short() { | 
|  | t.Skip("Skipping in short mode") | 
|  | } | 
|  | const BITS = 16 | 
|  |  | 
|  | for r := 0; r < k.bits(); r++ { | 
|  | h := newHashSet() | 
|  | for i := 0; i < 1<<BITS; i++ { | 
|  | k.clear() | 
|  | for j := 0; j < BITS; j++ { | 
|  | if i>>uint(j)&1 != 0 { | 
|  | k.flipBit((j + r) % k.bits()) | 
|  | } | 
|  | } | 
|  | h.add(k.hash()) | 
|  | } | 
|  | h.check(t) | 
|  | } | 
|  | } | 
|  |  | 
|  | // All keys of the form prefix + [A-Za-z0-9]*N + suffix. | 
|  | func TestSmhasherText(t *testing.T) { | 
|  | if testing.Short() { | 
|  | t.Skip("Skipping in short mode") | 
|  | } | 
|  | text(t, "Foo", "Bar") | 
|  | text(t, "FooBar", "") | 
|  | text(t, "", "FooBar") | 
|  | } | 
|  | func text(t *testing.T, prefix, suffix string) { | 
|  | const N = 4 | 
|  | const S = "ABCDEFGHIJKLMNOPQRSTabcdefghijklmnopqrst0123456789" | 
|  | const L = len(S) | 
|  | b := make([]byte, len(prefix)+N+len(suffix)) | 
|  | copy(b, prefix) | 
|  | copy(b[len(prefix)+N:], suffix) | 
|  | h := newHashSet() | 
|  | c := b[len(prefix):] | 
|  | for i := 0; i < L; i++ { | 
|  | c[0] = S[i] | 
|  | for j := 0; j < L; j++ { | 
|  | c[1] = S[j] | 
|  | for k := 0; k < L; k++ { | 
|  | c[2] = S[k] | 
|  | for x := 0; x < L; x++ { | 
|  | c[3] = S[x] | 
|  | h.addB(b) | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  | h.check(t) | 
|  | } | 
|  |  | 
|  | // Make sure different seed values generate different hashes. | 
|  | func TestSmhasherSeed(t *testing.T) { | 
|  | if unsafe.Sizeof(uintptr(0)) == 4 { | 
|  | t.Skip("32-bit platforms don't have ideal seed-input distributions (see issue 33988)") | 
|  | } | 
|  | h := newHashSet() | 
|  | const N = 100000 | 
|  | s := "hello" | 
|  | for i := 0; i < N; i++ { | 
|  | h.addS_seed(s, Seed{s: uint64(i + 1)}) | 
|  | h.addS_seed(s, Seed{s: uint64(i+1) << 32}) // make sure high bits are used | 
|  | } | 
|  | h.check(t) | 
|  | } |