smhasher_test.go

     1  // Copyright 2019 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:build !race
     6  
     7  package maphash
     8  
     9  import (
    10  	"fmt"
    11  	"math"
    12  	"math/rand"
    13  	"runtime"
    14  	"strings"
    15  	"testing"
    16  	"unsafe"
    17  )
    18  
    19  // Smhasher is a torture test for hash functions.
    20  // https://code.google.com/p/smhasher/
    21  // This code is a port of some of the Smhasher tests to Go.
    22  
    23  // Note: due to the long running time of these tests, they are
    24  // currently disabled in -race mode.
    25  
    26  var fixedSeed = MakeSeed()
    27  
    28  // Sanity checks.
    29  // hash should not depend on values outside key.
    30  // hash should not depend on alignment.
    31  func TestSmhasherSanity(t *testing.T) {
    32  	r := rand.New(rand.NewSource(1234))
    33  	const REP = 10
    34  	const KEYMAX = 128
    35  	const PAD = 16
    36  	const OFFMAX = 16
    37  	for k := 0; k < REP; k++ {
    38  		for n := 0; n < KEYMAX; n++ {
    39  			for i := 0; i < OFFMAX; i++ {
    40  				var b [KEYMAX + OFFMAX + 2*PAD]byte
    41  				var c [KEYMAX + OFFMAX + 2*PAD]byte
    42  				randBytes(r, b[:])
    43  				randBytes(r, c[:])
    44  				copy(c[PAD+i:PAD+i+n], b[PAD:PAD+n])
    45  				if bytesHash(b[PAD:PAD+n]) != bytesHash(c[PAD+i:PAD+i+n]) {
    46  					t.Errorf("hash depends on bytes outside key")
    47  				}
    48  			}
    49  		}
    50  	}
    51  }
    52  
    53  func bytesHash(b []byte) uint64 {
    54  	var h Hash
    55  	h.SetSeed(fixedSeed)
    56  	h.Write(b)
    57  	return h.Sum64()
    58  }
    59  func stringHash(s string) uint64 {
    60  	var h Hash
    61  	h.SetSeed(fixedSeed)
    62  	h.WriteString(s)
    63  	return h.Sum64()
    64  }
    65  
    66  const hashSize = 64
    67  
    68  func randBytes(r *rand.Rand, b []byte) {
    69  	r.Read(b) // can't fail
    70  }
    71  
    72  // A hashSet measures the frequency of hash collisions.
    73  type hashSet struct {
    74  	m map[uint64]struct{} // set of hashes added
    75  	n int                 // number of hashes added
    76  }
    77  
    78  func newHashSet() *hashSet {
    79  	return &hashSet{make(map[uint64]struct{}), 0}
    80  }
    81  func (s *hashSet) add(h uint64) {
    82  	s.m[h] = struct{}{}
    83  	s.n++
    84  }
    85  func (s *hashSet) addS(x string) {
    86  	s.add(stringHash(x))
    87  }
    88  func (s *hashSet) addB(x []byte) {
    89  	s.add(bytesHash(x))
    90  }
    91  func (s *hashSet) addS_seed(x string, seed Seed) {
    92  	var h Hash
    93  	h.SetSeed(seed)
    94  	h.WriteString(x)
    95  	s.add(h.Sum64())
    96  }
    97  func (s *hashSet) check(t *testing.T) {
    98  	const SLOP = 10.0
    99  	collisions := s.n - len(s.m)
   100  	pairs := int64(s.n) * int64(s.n-1) / 2
   101  	expected := float64(pairs) / math.Pow(2.0, float64(hashSize))
   102  	stddev := math.Sqrt(expected)
   103  	if float64(collisions) > expected+SLOP*(3*stddev+1) {
   104  		t.Errorf("unexpected number of collisions: got=%d mean=%f stddev=%f", collisions, expected, stddev)
   105  	}
   106  }
   107  
   108  // a string plus adding zeros must make distinct hashes
   109  func TestSmhasherAppendedZeros(t *testing.T) {
   110  	s := "hello" + strings.Repeat("\x00", 256)
   111  	h := newHashSet()
   112  	for i := 0; i <= len(s); i++ {
   113  		h.addS(s[:i])
   114  	}
   115  	h.check(t)
   116  }
   117  
   118  // All 0-3 byte strings have distinct hashes.
   119  func TestSmhasherSmallKeys(t *testing.T) {
   120  	h := newHashSet()
   121  	var b [3]byte
   122  	for i := 0; i < 256; i++ {
   123  		b[0] = byte(i)
   124  		h.addB(b[:1])
   125  		for j := 0; j < 256; j++ {
   126  			b[1] = byte(j)
   127  			h.addB(b[:2])
   128  			if !testing.Short() {
   129  				for k := 0; k < 256; k++ {
   130  					b[2] = byte(k)
   131  					h.addB(b[:3])
   132  				}
   133  			}
   134  		}
   135  	}
   136  	h.check(t)
   137  }
   138  
   139  // Different length strings of all zeros have distinct hashes.
   140  func TestSmhasherZeros(t *testing.T) {
   141  	N := 256 * 1024
   142  	if testing.Short() {
   143  		N = 1024
   144  	}
   145  	h := newHashSet()
   146  	b := make([]byte, N)
   147  	for i := 0; i <= N; i++ {
   148  		h.addB(b[:i])
   149  	}
   150  	h.check(t)
   151  }
   152  
   153  // Strings with up to two nonzero bytes all have distinct hashes.
   154  func TestSmhasherTwoNonzero(t *testing.T) {
   155  	if runtime.GOARCH == "wasm" {
   156  		t.Skip("Too slow on wasm")
   157  	}
   158  	if testing.Short() {
   159  		t.Skip("Skipping in short mode")
   160  	}
   161  	h := newHashSet()
   162  	for n := 2; n <= 16; n++ {
   163  		twoNonZero(h, n)
   164  	}
   165  	h.check(t)
   166  }
   167  func twoNonZero(h *hashSet, n int) {
   168  	b := make([]byte, n)
   169  
   170  	// all zero
   171  	h.addB(b)
   172  
   173  	// one non-zero byte
   174  	for i := 0; i < n; i++ {
   175  		for x := 1; x < 256; x++ {
   176  			b[i] = byte(x)
   177  			h.addB(b)
   178  			b[i] = 0
   179  		}
   180  	}
   181  
   182  	// two non-zero bytes
   183  	for i := 0; i < n; i++ {
   184  		for x := 1; x < 256; x++ {
   185  			b[i] = byte(x)
   186  			for j := i + 1; j < n; j++ {
   187  				for y := 1; y < 256; y++ {
   188  					b[j] = byte(y)
   189  					h.addB(b)
   190  					b[j] = 0
   191  				}
   192  			}
   193  			b[i] = 0
   194  		}
   195  	}
   196  }
   197  
   198  // Test strings with repeats, like "abcdabcdabcdabcd..."
   199  func TestSmhasherCyclic(t *testing.T) {
   200  	if testing.Short() {
   201  		t.Skip("Skipping in short mode")
   202  	}
   203  	r := rand.New(rand.NewSource(1234))
   204  	const REPEAT = 8
   205  	const N = 1000000
   206  	for n := 4; n <= 12; n++ {
   207  		h := newHashSet()
   208  		b := make([]byte, REPEAT*n)
   209  		for i := 0; i < N; i++ {
   210  			b[0] = byte(i * 79 % 97)
   211  			b[1] = byte(i * 43 % 137)
   212  			b[2] = byte(i * 151 % 197)
   213  			b[3] = byte(i * 199 % 251)
   214  			randBytes(r, b[4:n])
   215  			for j := n; j < n*REPEAT; j++ {
   216  				b[j] = b[j-n]
   217  			}
   218  			h.addB(b)
   219  		}
   220  		h.check(t)
   221  	}
   222  }
   223  
   224  // Test strings with only a few bits set
   225  func TestSmhasherSparse(t *testing.T) {
   226  	if runtime.GOARCH == "wasm" {
   227  		t.Skip("Too slow on wasm")
   228  	}
   229  	if testing.Short() {
   230  		t.Skip("Skipping in short mode")
   231  	}
   232  	sparse(t, 32, 6)
   233  	sparse(t, 40, 6)
   234  	sparse(t, 48, 5)
   235  	sparse(t, 56, 5)
   236  	sparse(t, 64, 5)
   237  	sparse(t, 96, 4)
   238  	sparse(t, 256, 3)
   239  	sparse(t, 2048, 2)
   240  }
   241  func sparse(t *testing.T, n int, k int) {
   242  	b := make([]byte, n/8)
   243  	h := newHashSet()
   244  	setbits(h, b, 0, k)
   245  	h.check(t)
   246  }
   247  
   248  // set up to k bits at index i and greater
   249  func setbits(h *hashSet, b []byte, i int, k int) {
   250  	h.addB(b)
   251  	if k == 0 {
   252  		return
   253  	}
   254  	for j := i; j < len(b)*8; j++ {
   255  		b[j/8] |= byte(1 << uint(j&7))
   256  		setbits(h, b, j+1, k-1)
   257  		b[j/8] &= byte(^(1 << uint(j&7)))
   258  	}
   259  }
   260  
   261  // Test all possible combinations of n blocks from the set s.
   262  // "permutation" is a bad name here, but it is what Smhasher uses.
   263  func TestSmhasherPermutation(t *testing.T) {
   264  	if runtime.GOARCH == "wasm" {
   265  		t.Skip("Too slow on wasm")
   266  	}
   267  	if testing.Short() {
   268  		t.Skip("Skipping in short mode")
   269  	}
   270  	permutation(t, []uint32{0, 1, 2, 3, 4, 5, 6, 7}, 8)
   271  	permutation(t, []uint32{0, 1 << 29, 2 << 29, 3 << 29, 4 << 29, 5 << 29, 6 << 29, 7 << 29}, 8)
   272  	permutation(t, []uint32{0, 1}, 20)
   273  	permutation(t, []uint32{0, 1 << 31}, 20)
   274  	permutation(t, []uint32{0, 1, 2, 3, 4, 5, 6, 7, 1 << 29, 2 << 29, 3 << 29, 4 << 29, 5 << 29, 6 << 29, 7 << 29}, 6)
   275  }
   276  func permutation(t *testing.T, s []uint32, n int) {
   277  	b := make([]byte, n*4)
   278  	h := newHashSet()
   279  	genPerm(h, b, s, 0)
   280  	h.check(t)
   281  }
   282  func genPerm(h *hashSet, b []byte, s []uint32, n int) {
   283  	h.addB(b[:n])
   284  	if n == len(b) {
   285  		return
   286  	}
   287  	for _, v := range s {
   288  		b[n] = byte(v)
   289  		b[n+1] = byte(v >> 8)
   290  		b[n+2] = byte(v >> 16)
   291  		b[n+3] = byte(v >> 24)
   292  		genPerm(h, b, s, n+4)
   293  	}
   294  }
   295  
   296  type key interface {
   297  	clear()              // set bits all to 0
   298  	random(r *rand.Rand) // set key to something random
   299  	bits() int           // how many bits key has
   300  	flipBit(i int)       // flip bit i of the key
   301  	hash() uint64        // hash the key
   302  	name() string        // for error reporting
   303  }
   304  
   305  type bytesKey struct {
   306  	b []byte
   307  }
   308  
   309  func (k *bytesKey) clear() {
   310  	for i := range k.b {
   311  		k.b[i] = 0
   312  	}
   313  }
   314  func (k *bytesKey) random(r *rand.Rand) {
   315  	randBytes(r, k.b)
   316  }
   317  func (k *bytesKey) bits() int {
   318  	return len(k.b) * 8
   319  }
   320  func (k *bytesKey) flipBit(i int) {
   321  	k.b[i>>3] ^= byte(1 << uint(i&7))
   322  }
   323  func (k *bytesKey) hash() uint64 {
   324  	return bytesHash(k.b)
   325  }
   326  func (k *bytesKey) name() string {
   327  	return fmt.Sprintf("bytes%d", len(k.b))
   328  }
   329  
   330  // Flipping a single bit of a key should flip each output bit with 50% probability.
   331  func TestSmhasherAvalanche(t *testing.T) {
   332  	if runtime.GOARCH == "wasm" {
   333  		t.Skip("Too slow on wasm")
   334  	}
   335  	if testing.Short() {
   336  		t.Skip("Skipping in short mode")
   337  	}
   338  	avalancheTest1(t, &bytesKey{make([]byte, 2)})
   339  	avalancheTest1(t, &bytesKey{make([]byte, 4)})
   340  	avalancheTest1(t, &bytesKey{make([]byte, 8)})
   341  	avalancheTest1(t, &bytesKey{make([]byte, 16)})
   342  	avalancheTest1(t, &bytesKey{make([]byte, 32)})
   343  	avalancheTest1(t, &bytesKey{make([]byte, 200)})
   344  }
   345  func avalancheTest1(t *testing.T, k key) {
   346  	const REP = 100000
   347  	r := rand.New(rand.NewSource(1234))
   348  	n := k.bits()
   349  
   350  	// grid[i][j] is a count of whether flipping
   351  	// input bit i affects output bit j.
   352  	grid := make([][hashSize]int, n)
   353  
   354  	for z := 0; z < REP; z++ {
   355  		// pick a random key, hash it
   356  		k.random(r)
   357  		h := k.hash()
   358  
   359  		// flip each bit, hash & compare the results
   360  		for i := 0; i < n; i++ {
   361  			k.flipBit(i)
   362  			d := h ^ k.hash()
   363  			k.flipBit(i)
   364  
   365  			// record the effects of that bit flip
   366  			g := &grid[i]
   367  			for j := 0; j < hashSize; j++ {
   368  				g[j] += int(d & 1)
   369  				d >>= 1
   370  			}
   371  		}
   372  	}
   373  
   374  	// Each entry in the grid should be about REP/2.
   375  	// More precisely, we did N = k.bits() * hashSize experiments where
   376  	// each is the sum of REP coin flips. We want to find bounds on the
   377  	// sum of coin flips such that a truly random experiment would have
   378  	// all sums inside those bounds with 99% probability.
   379  	N := n * hashSize
   380  	var c float64
   381  	// find c such that Prob(mean-c*stddev < x < mean+c*stddev)^N > .9999
   382  	for c = 0.0; math.Pow(math.Erf(c/math.Sqrt(2)), float64(N)) < .9999; c += .1 {
   383  	}
   384  	c *= 11.0 // allowed slack: 40% to 60% - we don't need to be perfectly random
   385  	mean := .5 * REP
   386  	stddev := .5 * math.Sqrt(REP)
   387  	low := int(mean - c*stddev)
   388  	high := int(mean + c*stddev)
   389  	for i := 0; i < n; i++ {
   390  		for j := 0; j < hashSize; j++ {
   391  			x := grid[i][j]
   392  			if x < low || x > high {
   393  				t.Errorf("bad bias for %s bit %d -> bit %d: %d/%d\n", k.name(), i, j, x, REP)
   394  			}
   395  		}
   396  	}
   397  }
   398  
   399  // All bit rotations of a set of distinct keys
   400  func TestSmhasherWindowed(t *testing.T) {
   401  	windowed(t, &bytesKey{make([]byte, 128)})
   402  }
   403  func windowed(t *testing.T, k key) {
   404  	if runtime.GOARCH == "wasm" {
   405  		t.Skip("Too slow on wasm")
   406  	}
   407  	if testing.Short() {
   408  		t.Skip("Skipping in short mode")
   409  	}
   410  	const BITS = 16
   411  
   412  	for r := 0; r < k.bits(); r++ {
   413  		h := newHashSet()
   414  		for i := 0; i < 1<<BITS; i++ {
   415  			k.clear()
   416  			for j := 0; j < BITS; j++ {
   417  				if i>>uint(j)&1 != 0 {
   418  					k.flipBit((j + r) % k.bits())
   419  				}
   420  			}
   421  			h.add(k.hash())
   422  		}
   423  		h.check(t)
   424  	}
   425  }
   426  
   427  // All keys of the form prefix + [A-Za-z0-9]*N + suffix.
   428  func TestSmhasherText(t *testing.T) {
   429  	if testing.Short() {
   430  		t.Skip("Skipping in short mode")
   431  	}
   432  	text(t, "Foo", "Bar")
   433  	text(t, "FooBar", "")
   434  	text(t, "", "FooBar")
   435  }
   436  func text(t *testing.T, prefix, suffix string) {
   437  	const N = 4
   438  	const S = "ABCDEFGHIJKLMNOPQRSTabcdefghijklmnopqrst0123456789"
   439  	const L = len(S)
   440  	b := make([]byte, len(prefix)+N+len(suffix))
   441  	copy(b, prefix)
   442  	copy(b[len(prefix)+N:], suffix)
   443  	h := newHashSet()
   444  	c := b[len(prefix):]
   445  	for i := 0; i < L; i++ {
   446  		c[0] = S[i]
   447  		for j := 0; j < L; j++ {
   448  			c[1] = S[j]
   449  			for k := 0; k < L; k++ {
   450  				c[2] = S[k]
   451  				for x := 0; x < L; x++ {
   452  					c[3] = S[x]
   453  					h.addB(b)
   454  				}
   455  			}
   456  		}
   457  	}
   458  	h.check(t)
   459  }
   460  
   461  // Make sure different seed values generate different hashes.
   462  func TestSmhasherSeed(t *testing.T) {
   463  	if unsafe.Sizeof(uintptr(0)) == 4 {
   464  		t.Skip("32-bit platforms don't have ideal seed-input distributions (see issue 33988)")
   465  	}
   466  	h := newHashSet()
   467  	const N = 100000
   468  	s := "hello"
   469  	for i := 0; i < N; i++ {
   470  		h.addS_seed(s, Seed{s: uint64(i + 1)})
   471  		h.addS_seed(s, Seed{s: uint64(i+1) << 32}) // make sure high bits are used
   472  	}
   473  	h.check(t)
   474  }
   475
View as plain text