add bloom filter

This commit is contained in:
tursom 2023-04-16 22:37:25 +08:00
parent 3ba7bfbcff
commit 9e06ed4b5e
6 changed files with 267 additions and 44 deletions

1
go.mod
View File

@ -4,5 +4,6 @@ go 1.20
require (
github.com/petermattis/goid v0.0.0-20220302125637-5f11c28912df
github.com/spaolacci/murmur3 v1.1.0
github.com/timandy/routine v1.1.1
)

View File

@ -1,5 +1,7 @@
package lang
import "unsafe"
type (
Array[T any] []T
@ -21,58 +23,102 @@ func (a Array[T]) Array() []T {
return a
}
func (a Int8Array) SetBit(bit int, up bool) (old bool) {
arrIndex := bit / 8
i := &a[arrIndex]
return SwapBit[int8]((*int8)(i), bit%8, up)
// Bytes returns the array viewed as a []byte without copying the elements:
// the slice value itself is reinterpreted through unsafe.Pointer, so the
// result refers to the same backing storage as a.
// NOTE(review): this is only sound if UInt8Array's element type has the same
// size and layout as byte — its declaration is not visible here; confirm.
func (a UInt8Array) Bytes() []byte {
return *(*[]byte)(unsafe.Pointer(&a))
}
func (a Int16Array) SetBit(bit int, up bool) (old bool) {
arrIndex := bit / 16
i := &a[arrIndex]
return SwapBit[int16]((*int16)(i), bit%16, up)
// BitLength returns the number of bits held by the array: 8 per element.
func (a Int8Array) BitLength() uint {
	const bitsPerElement = 8
	return uint(len(a) * bitsPerElement)
}
func (a Int32Array) SetBit(bit int, up bool) (old bool) {
arrIndex := bit / 32
i := &a[arrIndex]
return SwapBit[int32]((*int32)(i), bit%32, up)
// BitLength returns the number of bits held by the array: 16 per element.
func (a Int16Array) BitLength() uint {
	const bitsPerElement = 16
	return uint(len(a) * bitsPerElement)
}
func (a Int64Array) SetBit(bit int, up bool) (old bool) {
arrIndex := bit / 64
i := &a[arrIndex]
return SwapBit[int64]((*int64)(i), bit%64, up)
// BitLength returns the number of bits held by the array: 32 per element.
func (a Int32Array) BitLength() uint {
	const bitsPerElement = 32
	return uint(len(a) * bitsPerElement)
}
func (a UInt8Array) SetBit(bit int, up bool) (old bool) {
arrIndex := bit / 8
i := &a[arrIndex]
return SwapBit[uint8]((*uint8)(i), bit%8, up)
// BitLength returns the number of bits held by the array: 64 per element.
func (a Int64Array) BitLength() uint {
	const bitsPerElement = 64
	return uint(len(a) * bitsPerElement)
}
func (a UInt16Array) SetBit(bit int, up bool) (old bool) {
arrIndex := bit / 16
i := &a[arrIndex]
return SwapBit[uint16]((*uint16)(i), bit%16, up)
// BitLength returns the number of bits held by the array: 8 per element.
func (a UInt8Array) BitLength() uint {
	const bitsPerElement = 8
	return uint(len(a) * bitsPerElement)
}
func (a UInt32Array) SetBit(bit int, up bool) (old bool) {
arrIndex := bit / 32
i := &a[arrIndex]
return SwapBit[uint32]((*uint32)(i), bit%32, up)
// BitLength returns the number of bits held by the array: 16 per element.
func (a UInt16Array) BitLength() uint {
	const bitsPerElement = 16
	return uint(len(a) * bitsPerElement)
}
func (a UInt64Array) SetBit(bit int, up bool) (old bool) {
arrIndex := bit / 64
i := &a[arrIndex]
return SwapBit[uint64]((*uint64)(i), bit%64, up)
// BitLength returns the number of bits held by the array: 32 per element.
func (a UInt32Array) BitLength() uint {
	const bitsPerElement = 32
	return uint(len(a) * bitsPerElement)
}
// BitLength returns the number of bits held by the array: 64 per element.
func (a UInt64Array) BitLength() uint {
	const bitsPerElement = 64
	return uint(len(a) * bitsPerElement)
}
// GetBit reports whether bit index of the array is set. The bit lives in
// element index/8 at offset index%8; the test itself is done by GetBit.
// Indexing past the end of the array panics.
func (a Int8Array) GetBit(index uint) (ok bool) {
	const width = 8
	word, offset := a[index/width], index%width
	return GetBit(word, offset)
}
// GetBit reports whether bit index of the array is set. The bit lives in
// element index/16 at offset index%16; the test itself is done by GetBit.
// Indexing past the end of the array panics.
func (a Int16Array) GetBit(index uint) (ok bool) {
	const width = 16
	word, offset := a[index/width], index%width
	return GetBit(word, offset)
}
// GetBit reports whether bit index of the array is set. The bit lives in
// element index/32 at offset index%32; the test itself is done by GetBit.
// Indexing past the end of the array panics.
func (a Int32Array) GetBit(index uint) (ok bool) {
	const width = 32
	word, offset := a[index/width], index%width
	return GetBit(word, offset)
}
// GetBit reports whether bit index of the array is set. The bit lives in
// element index/64 at offset index%64; the test itself is done by GetBit.
// Indexing past the end of the array panics.
func (a Int64Array) GetBit(index uint) (ok bool) {
	const width = 64
	word, offset := a[index/width], index%width
	return GetBit(word, offset)
}
// GetBit reports whether bit index of the array is set. The bit lives in
// element index/8 at offset index%8; the test itself is done by GetBit.
// Indexing past the end of the array panics.
func (a UInt8Array) GetBit(index uint) (ok bool) {
	const width = 8
	word, offset := a[index/width], index%width
	return GetBit(word, offset)
}
// GetBit reports whether bit index of the array is set. The bit lives in
// element index/16 at offset index%16; the test itself is done by GetBit.
// Indexing past the end of the array panics.
func (a UInt16Array) GetBit(index uint) (ok bool) {
	const width = 16
	word, offset := a[index/width], index%width
	return GetBit(word, offset)
}
// GetBit reports whether bit index of the array is set. The bit lives in
// element index/32 at offset index%32; the test itself is done by GetBit.
// Indexing past the end of the array panics.
func (a UInt32Array) GetBit(index uint) (ok bool) {
	const width = 32
	word, offset := a[index/width], index%width
	return GetBit(word, offset)
}
// GetBit reports whether bit index of the array is set. The bit lives in
// element index/64 at offset index%64; the test itself is done by GetBit.
// Indexing past the end of the array panics.
func (a UInt64Array) GetBit(index uint) (ok bool) {
	const width = 64
	word, offset := a[index/width], index%width
	return GetBit(word, offset)
}
// SetBit passes element index/8 (by pointer), the bit offset index%8 and the
// requested state up to SwapBit, and returns SwapBit's result as old.
// Indexing past the end of the array panics.
func (a Int8Array) SetBit(index uint, up bool) (old bool) {
	const width = 8
	slot := &a[index/width]
	return SwapBit(slot, index%width, up)
}
// SetBit passes element index/16 (by pointer), the bit offset index%16 and
// the requested state up to SwapBit, and returns SwapBit's result as old.
// Indexing past the end of the array panics.
func (a Int16Array) SetBit(index uint, up bool) (old bool) {
	const width = 16
	slot := &a[index/width]
	return SwapBit(slot, index%width, up)
}
// SetBit passes element index/32 (by pointer), the bit offset index%32 and
// the requested state up to SwapBit, and returns SwapBit's result as old.
// Indexing past the end of the array panics.
func (a Int32Array) SetBit(index uint, up bool) (old bool) {
	const width = 32
	slot := &a[index/width]
	return SwapBit(slot, index%width, up)
}
// SetBit passes element index/64 (by pointer), the bit offset index%64 and
// the requested state up to SwapBit, and returns SwapBit's result as old.
// Indexing past the end of the array panics.
func (a Int64Array) SetBit(index uint, up bool) (old bool) {
	const width = 64
	slot := &a[index/width]
	return SwapBit(slot, index%width, up)
}
// SetBit passes element index/8 (by pointer), the bit offset index%8 and the
// requested state up to SwapBit, and returns SwapBit's result as old.
// Indexing past the end of the array panics.
func (a UInt8Array) SetBit(index uint, up bool) (old bool) {
	const width = 8
	slot := &a[index/width]
	return SwapBit(slot, index%width, up)
}
// SetBit passes element index/16 (by pointer), the bit offset index%16 and
// the requested state up to SwapBit, and returns SwapBit's result as old.
// Indexing past the end of the array panics.
func (a UInt16Array) SetBit(index uint, up bool) (old bool) {
	const width = 16
	slot := &a[index/width]
	return SwapBit(slot, index%width, up)
}
// SetBit passes element index/32 (by pointer), the bit offset index%32 and
// the requested state up to SwapBit, and returns SwapBit's result as old.
// Indexing past the end of the array panics.
func (a UInt32Array) SetBit(index uint, up bool) (old bool) {
	const width = 32
	slot := &a[index/width]
	return SwapBit(slot, index%width, up)
}
// SetBit passes element index/64 (by pointer), the bit offset index%64 and
// the requested state up to SwapBit, and returns SwapBit's result as old.
// Indexing past the end of the array panics.
func (a UInt64Array) SetBit(index uint, up bool) (old bool) {
	const width = 64
	slot := &a[index/width]
	return SwapBit(slot, index%width, up)
}

View File

@ -8,7 +8,8 @@ package lang
type (
BitSet interface {
BitLength() int
SetBit(bit int, up bool) (old bool)
BitLength() uint
SetBit(index uint, up bool) (old bool)
GetBit(index uint) (ok bool)
}
)

View File

@ -13,9 +13,15 @@ type Number interface {
ToFloat64() Float64
}
// GetBit reports whether the bit at position index of p is set.
// Position 0 is the least-significant bit: the value is tested against a
// mask built by shifting 1 left by index.
func GetBit[T int8 | int16 | int32 | int64 | uint8 | uint16 | uint32 | uint64 |
	Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64](p T, index uint) (ok bool) {
	return p&(T(1)<<index) != 0
}
func SwapBit[T int8 | int16 | int32 | int64 | uint8 | uint16 | uint32 | uint64 |
Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64](p *T, bit int, new bool) (old bool) {
location := T(1) << bit
Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64](p *T, index uint, new bool) (old bool) {
location := T(1) << index
oldValue := *p
var newValue T
if new {

87
util/bloom/Bloom.go Normal file
View File

@ -0,0 +1,87 @@
package bloom
import (
"math"
"github.com/spaolacci/murmur3"
"github.com/tursom/GoCollections/lang"
)
// HashFunc is the seeded 32-bit hash used to derive bit positions. Contains
// and Add call it once per hash round, passing the round number as the seed.
// It is bound to murmur3.Sum32WithSeed.
var HashFunc = murmur3.Sum32WithSeed
// Bloom is a Bloom filter whose bit set is stored in a byte array.
type Bloom struct {
	m lang.UInt8Array // bit array written by Add and probed by Contains
	k uint            // number of seeded hash rounds applied to each item
	c uint            // number of Add calls made so far
}
// max returns the larger of i1 and i2.
func max(i1, i2 uint) uint {
	if i2 > i1 {
		return i2
	}
	return i1
}
// numHashFunctions returns the number of hash functions to use for a filter
// of m bits expected to hold n items: m/n*ln(2) rounded to the nearest
// integer, and never less than 1.
//
// When n or m is not strictly positive (zero, negative or NaN) the ratio is
// meaningless — n == 0 would divide by zero and hand Inf/NaN to a uint
// conversion, whose result is implementation-dependent — so 1 is returned.
func numHashFunctions(n, m float64) uint {
	if !(n > 0) || !(m > 0) {
		return 1
	}
	k := math.Floor(0.5 + m/n*math.Ln2)
	if k < 1 {
		return 1
	}
	return uint(k)
}
// NumHashFunctions returns the number of hash functions for a filter of m
// bits expected to hold n items. It converts both arguments to float64 and
// delegates to numHashFunctions.
func NumHashFunctions(n, m uint) uint {
	items, bits := float64(n), float64(m)
	return numHashFunctions(items, bits)
}
// calcBitLength returns the number of bits a filter needs to hold n items
// with false-positive probability p: -n*ln(p)/ln(2)^2, truncated.
//
// Inputs that would push the float-to-uint conversion out of range (where
// Go's result is implementation-dependent) are handled explicitly:
//   - n not strictly positive, or p >= 1: no bits are needed, returns 0;
//   - p zero, negative or NaN: treated as the smallest positive float64;
//   - a result too large for uint: clamped to math.MaxUint.
func calcBitLength(n float64, p float64) uint {
	switch {
	case !(n > 0) || p >= 1:
		return 0
	case !(p > 0):
		p = math.SmallestNonzeroFloat64
	}
	bits := -n * math.Log(p) / (math.Ln2 * math.Ln2)
	if bits >= float64(math.MaxUint) {
		return math.MaxUint
	}
	return uint(bits)
}
// CalcBitLength returns the number of bits a filter needs to hold n items
// with false-positive probability p. It converts n to float64 and delegates
// to calcBitLength.
func CalcBitLength(n uint, p float64) uint {
	items := float64(n)
	return calcBitLength(items, p)
}
// NewBloom creates an empty filter sized for n expected items and a target
// false-positive probability p.
//
// The bit count comes from CalcBitLength and is rounded up to whole bytes;
// the number of hash rounds comes from NumHashFunctions. Degenerate inputs
// (n == 0, or p so large that no bits are required) still yield a usable
// one-byte filter rather than underflowing the size computation or dividing
// by zero.
func NewBloom(n uint, p float64) *Bloom {
	bits := CalcBitLength(n, p)
	if bits == 0 {
		// bits-1 below would wrap around to the maximum uint.
		bits = 1
	}
	items := n
	if items == 0 {
		// Avoid a zero divisor in the m/n ratio used to pick k.
		items = 1
	}
	m := bits - 1 // m/8+1 == ceil(bits/8)
	return &Bloom{
		m: make(lang.UInt8Array, m/8+1),
		k: NumHashFunctions(items, m),
	}
}
// C returns the filter's counter c, which Add increments once per call.
func (b *Bloom) C() uint {
return b.c
}
// K returns the number of hash rounds applied to each item, as chosen by
// NewBloom.
func (b *Bloom) K() uint {
return b.k
}
// M returns the size of the filter's bit array in bits (8 per stored byte).
func (b *Bloom) M() uint {
	const bitsPerByte = 8
	byteCount := uint(len(b.m))
	return byteCount * bitsPerByte
}
// Contains reports whether data may have been added to the filter.
//
// It returns false as soon as one of the k probed bits is clear, and true
// only if all of them are set.
//
// Each bit position is the seeded hash reduced modulo the bit length — the
// same mapping Add uses. (Masking with the bit length instead would probe
// different bits than Add set, and could produce an index one past the end.)
func (b *Bloom) Contains(data []byte) bool {
	bitLength := b.m.BitLength()
	for i := 0; i < int(b.k); i++ {
		hashCode := uint(HashFunc(data, uint32(i)))
		if !b.m.GetBit(hashCode % bitLength) {
			return false
		}
	}
	return true
}
// Add records data in the filter. It increments the call counter and then,
// for each of the k hash rounds, sets the bit at the seeded hash of data
// reduced modulo the bit length.
func (b *Bloom) Add(data []byte) {
	b.c++
	rounds, size := int(b.k), b.m.BitLength()
	for seed := 0; seed < rounds; seed++ {
		position := uint(HashFunc(data, uint32(seed))) % size
		b.m.SetBit(position, true)
	}
}

82
util/bloom/Bloom_test.go Normal file
View File

@ -0,0 +1,82 @@
/*
* Copyright (c) 2023 tursom. All rights reserved.
* Use of this source code is governed by a GPL-3
* license that can be found in the LICENSE file.
*/
package bloom
import (
"bytes"
"compress/gzip"
"fmt"
"testing"
)
// TestBloom_Contains adds one marker string plus 10000 numeric strings to a
// filter sized for 1,000,000 items at p = 0.1, prints the raw and gzipped
// size of the bit array, and checks membership of an added and a non-added
// item.
func TestBloom_Contains(t *testing.T) {
	bloom := NewBloom(100_0000, 0.1)
	bloom.Add([]byte("hello, world!"))
	for i := 0; i < 10000; i++ {
		bloom.Add([]byte(fmt.Sprintf("%d", i)))
	}
	fmt.Println(len(bloom.m))
	fmt.Println(len(gz(bloom.m.Bytes())))
	// t.Failed() only queries the failure state; t.Error actually records one.
	if !bloom.Contains([]byte("hello, world!")) {
		t.Error("Contains returned false for an item that was added")
	}
	// A Bloom filter may report false positives, so this check is
	// probabilistic; the filter here is filled far below its design capacity.
	if bloom.Contains([]byte("hello, not world!")) {
		t.Error("Contains returned true for an item that was never added")
	}
}
// gz returns b compressed as a complete gzip stream.
//
// The writer is closed rather than merely flushed so that the gzip trailer
// (checksum and length) is emitted and the result can be decompressed.
// Writing to an in-memory buffer is not expected to fail; any error is
// treated as a programming error and panics.
func gz(b []byte) []byte {
	var buffer bytes.Buffer
	writer := gzip.NewWriter(&buffer)
	if _, err := writer.Write(b); err != nil {
		panic(err)
	}
	if err := writer.Close(); err != nil {
		panic(err)
	}
	return buffer.Bytes()
}
// TestCalcBitLength prints, for n = 2^1 .. 2^62 at p = 0.1, the number of
// hash functions, the filter size in bytes, n itself, and the bytes-per-item
// ratio. It makes no assertions.
func TestCalcBitLength(t *testing.T) {
	for shift := 1; shift < 63; shift++ {
		n := uint(1) << shift
		numBytes := CalcBitLength(n, 0.1) / 8
		ratio := float64(numBytes) / float64(n)
		fmt.Printf("%d: %d, %s / %s = %f\n",
			shift,
			NumHashFunctions(n, numBytes*8),
			storageFormat(numBytes),
			storageFormat(n),
			ratio)
	}
}
// storageFormat renders size with a 1024-based unit suffix: an integer with
// "b" below 1024, otherwise a float with "kb", "mb", "gb" or "tb".
func storageFormat(size uint) string {
	// Thresholds are computed at run time from a variable so that uint
	// arithmetic wraps exactly as the step-by-step multiplication would.
	kb := uint(1024)
	mb := kb * 1024
	gb := mb * 1024
	tb := gb * 1024
	switch {
	case size < kb:
		return fmt.Sprintf("%db", size)
	case size < mb:
		return fmt.Sprintf("%fkb", float64(size)/float64(kb))
	case size < gb:
		return fmt.Sprintf("%fmb", float64(size)/float64(mb))
	case size < tb:
		return fmt.Sprintf("%fgb", float64(size)/float64(gb))
	default:
		return fmt.Sprintf("%ftb", float64(size)/float64(tb))
	}
}