diff --git a/filter.go b/filter.go index 9a28742..3738e27 100644 --- a/filter.go +++ b/filter.go @@ -54,8 +54,11 @@ type Filter struct { count int64 // Estimate number of elements } -// MurmurHash3 function. -var murmur = new(digest) +// MurmurHash3 functions. +var ( + murmur = new(digest) + murmurString = new(digestString) +) // New creates an empty Bloom filter with room for n elements // at a false-positives rate less than 1/p. @@ -73,7 +76,15 @@ func New(n int, p int) *Filter { // AddByte adds b to the filter and tells if b was already a likely member. func (f *Filter) AddByte(b []byte) bool { - h1, h2 := murmur.hash(b) + return f.add(murmur.hash(b)) +} + +// Add adds s to the filter and tells if s was already a likely member. +func (f *Filter) Add(s string) bool { + return f.add(murmurString.hash(s)) +} + +func (f *Filter) add(h1, h2 uint64) bool { trunc := uint64(len(f.data))< 0; i-- { @@ -91,16 +102,17 @@ func (f *Filter) AddByte(b []byte) bool { return member } -// Add adds s to the filter and tells if s was already a likely member. -func (f *Filter) Add(s string) bool { - b := make([]byte, len(s)) - copy(b, s) - return f.AddByte(b) -} - // TestByte tells if b is a likely member of the filter. func (f *Filter) TestByte(b []byte) bool { - h1, h2 := murmur.hash(b) + return f.test(murmur.hash(b)) +} + +// Test tells if s is a likely member of the filter. +func (f *Filter) Test(s string) bool { + return f.test(murmurString.hash(s)) +} + +func (f *Filter) test(h1, h2 uint64) bool { trunc := uint64(len(f.data))< 0; i-- { h1 += h2 @@ -113,13 +125,6 @@ func (f *Filter) TestByte(b []byte) bool { return true } -// Test tells if s is a likely member of the filter. -func (f *Filter) Test(s string) bool { - b := make([]byte, len(s)) - copy(b, s) - return f.TestByte(b) -} - // Count returns an estimate of the number of elements in the filter. func (f *Filter) Count() int64 { return f.count diff --git a/filter_test.go b/filter_test.go index 531af00..135ddce 100644 --- a/filter_test.go +++ b/filter_test.go @@ -126,7 +126,7 @@ func BenchmarkTestByte(b *testing.B) { } } -func BenchmarkTestUnion(b *testing.B) { +func BenchmarkUnion(b *testing.B) { n := 1000 b.StopTimer() f1 := New(n, 200) diff --git a/hash.go b/hash.go index 766e252..5c44e88 100644 --- a/hash.go +++ b/hash.go @@ -1,9 +1,5 @@ package bloom -import ( - "encoding/binary" -) - // MurmurHash3 implementation adapted from Sébastien Paolacci // github.com/spaolacci/murmur3, released under BSD-3-Clause. @@ -20,11 +16,15 @@ const ( ) type digest struct { - clen int // Digested input cumulative length. - buf [16]byte // Expected (but not required) to be 16 large. - tail []byte // 0 to 15 bytes view of buf. - h1 uint64 // Running hash part 1. - h2 uint64 // Running hash part 2. + clen int + tail []byte + h1 uint64 + h2 uint64 +} + +func Uint64(b []byte) uint64 { + return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | + uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 } func (d *digest) bmix(p []byte) (tail []byte) { @@ -32,8 +32,8 @@ func (d *digest) bmix(p []byte) (tail []byte) { nblocks := len(p) / 16 for i := 0; i < nblocks; i++ { j := 16 * i - k1 := binary.LittleEndian.Uint64(p[j : j+8]) - k2 := binary.LittleEndian.Uint64(p[j+8 : j+16]) + k1 := Uint64(p[j : j+8]) + k2 := Uint64(p[j+8 : j+16]) k1 *= c1 k1 = (k1 << 31) | (k1 >> 33) k1 *= c2 diff --git a/hash_string.go b/hash_string.go new file mode 100644 index 0000000..a96cdb2 --- /dev/null +++ b/hash_string.go @@ -0,0 +1,117 @@ +package bloom + +// MurmurHash3 implementation adapted from Sébastien Paolacci +// github.com/spaolacci/murmur3, released under BSD-3-Clause. + +func (d *digestString) hash(data string) (h1 uint64, h2 uint64) { + d.h1, d.h2 = 0, 0 + d.clen = len(data) + d.tail = d.bmixString(data) + return d.sum() +} + +type digestString struct { + clen int + tail string + h1 uint64 + h2 uint64 +} + +func Uint64String(b string) uint64 { + return uint64(b[0]) | uint64(b[1])<<8 | uint64(b[2])<<16 | uint64(b[3])<<24 | + uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56 +} + +func (d *digestString) bmixString(p string) (tail string) { + h1, h2 := d.h1, d.h2 + nblocks := len(p) / 16 + for i := 0; i < nblocks; i++ { + j := 16 * i + k1 := Uint64String(p[j : j+8]) + k2 := Uint64String(p[j+8 : j+16]) + k1 *= c1 + k1 = (k1 << 31) | (k1 >> 33) + k1 *= c2 + h1 ^= k1 + h1 = (h1 << 27) | (h1 >> 37) + h1 += h2 + h1 = h1*5 + 0x52dce729 + k2 *= c2 + k2 = (k2 << 33) | (k2 >> 31) + k2 *= c1 + h2 ^= k2 + h2 = (h2 << 31) | (h2 >> 33) + h2 += h1 + h2 = h2*5 + 0x38495ab5 + } + d.h1, d.h2 = h1, h2 + return p[nblocks*16:] +} + +func (d *digestString) sum() (h1, h2 uint64) { + h1, h2 = d.h1, d.h2 + var k1, k2 uint64 + switch len(d.tail) & 15 { + case 15: + k2 ^= uint64(d.tail[14]) << 48 + fallthrough + case 14: + k2 ^= uint64(d.tail[13]) << 40 + fallthrough + case 13: + k2 ^= uint64(d.tail[12]) << 32 + fallthrough + case 12: + k2 ^= uint64(d.tail[11]) << 24 + fallthrough + case 11: + k2 ^= uint64(d.tail[10]) << 16 + fallthrough + case 10: + k2 ^= uint64(d.tail[9]) << 8 + fallthrough + case 9: + k2 ^= uint64(d.tail[8]) << 0 + k2 *= c2 + k2 = (k2 << 33) | (k2 >> 31) + k2 *= c1 + h2 ^= k2 + fallthrough + case 8: + k1 ^= uint64(d.tail[7]) << 56 + fallthrough + case 7: + k1 ^= uint64(d.tail[6]) << 48 + fallthrough + case 6: + k1 ^= uint64(d.tail[5]) << 40 + fallthrough + case 5: + k1 ^= uint64(d.tail[4]) << 32 + fallthrough + case 4: + k1 ^= uint64(d.tail[3]) << 24 + fallthrough + case 3: + k1 ^= uint64(d.tail[2]) << 16 + fallthrough + case 2: + k1 ^= uint64(d.tail[1]) << 8 + fallthrough + case 1: + k1 ^= uint64(d.tail[0]) << 0 + k1 *= c1 + k1 = (k1 << 31) | (k1 >> 33) + k1 *= c2 + h1 ^= k1 + } + h1 ^= uint64(d.clen) + h2 ^= uint64(d.clen) + h1 += h2 + h2 += h1 + h1 = fmix(h1) + h2 = fmix(h2) + h1 += h2 + h2 += h1 + return h1, h2 +} diff --git a/hash_test.go b/hash_test.go index 9e198ae..785b94a 100644 --- a/hash_test.go +++ b/hash_test.go @@ -26,3 +26,26 @@ func TestHash(t *testing.T) { } } } + +func TestHashString(t *testing.T) { + d := new(digestString) + var data = []struct { + h1, h2 uint64 + s string + }{ + {0x0000000000000000, 0x0000000000000000, ""}, + {0xcbd8a7b341bd9b02, 0x5b1e906a48ae1d19, "hello"}, + {0x342fac623a5ebc8e, 0x4cdcbc079642414d, "hello, world"}, + {0xb89e5988b737affc, 0x664fc2950231b2cb, "19 Jan 2038 at 3:14:07 AM"}, + {0xcd99481f9ee902c9, 0x695da1a38987b6e7, "The quick brown fox jumps over the lazy dog."}, + } + for _, x := range data { + h1, h2 := d.hash(x.s) + if h1 != x.h1 { + t.Errorf("hash(%q).h1 = %d; want %d\n", x.s, h1, x.h1) + } + if h2 != x.h2 { + t.Errorf("hash(%q).h2 = %d; want %d\n", x.s, h2, x.h2) + } + } +}