mirror of
https://github.com/tursom/GoCollections.git
synced 2025-03-13 17:00:18 +08:00
add bloom filter
This commit is contained in:
parent
3ba7bfbcff
commit
9e06ed4b5e
1
go.mod
1
go.mod
@ -4,5 +4,6 @@ go 1.20
|
||||
|
||||
require (
|
||||
github.com/petermattis/goid v0.0.0-20220302125637-5f11c28912df
|
||||
github.com/spaolacci/murmur3 v1.1.0
|
||||
github.com/timandy/routine v1.1.1
|
||||
)
|
||||
|
126
lang/Array.go
126
lang/Array.go
@ -1,5 +1,7 @@
|
||||
package lang
|
||||
|
||||
import "unsafe"
|
||||
|
||||
type (
|
||||
Array[T any] []T
|
||||
|
||||
@ -21,58 +23,102 @@ func (a Array[T]) Array() []T {
|
||||
return a
|
||||
}
|
||||
|
||||
func (a Int8Array) SetBit(bit int, up bool) (old bool) {
|
||||
arrIndex := bit / 8
|
||||
i := &a[arrIndex]
|
||||
|
||||
return SwapBit[int8]((*int8)(i), bit%8, up)
|
||||
func (a UInt8Array) Bytes() []byte {
|
||||
return *(*[]byte)(unsafe.Pointer(&a))
|
||||
}
|
||||
|
||||
func (a Int16Array) SetBit(bit int, up bool) (old bool) {
|
||||
arrIndex := bit / 16
|
||||
i := &a[arrIndex]
|
||||
|
||||
return SwapBit[int16]((*int16)(i), bit%16, up)
|
||||
func (a Int8Array) BitLength() uint {
|
||||
return uint(len(a) * 8)
|
||||
}
|
||||
|
||||
func (a Int32Array) SetBit(bit int, up bool) (old bool) {
|
||||
arrIndex := bit / 32
|
||||
i := &a[arrIndex]
|
||||
|
||||
return SwapBit[int32]((*int32)(i), bit%32, up)
|
||||
func (a Int16Array) BitLength() uint {
|
||||
return uint(len(a) * 16)
|
||||
}
|
||||
|
||||
func (a Int64Array) SetBit(bit int, up bool) (old bool) {
|
||||
arrIndex := bit / 64
|
||||
i := &a[arrIndex]
|
||||
|
||||
return SwapBit[int64]((*int64)(i), bit%64, up)
|
||||
func (a Int32Array) BitLength() uint {
|
||||
return uint(len(a) * 32)
|
||||
}
|
||||
|
||||
func (a UInt8Array) SetBit(bit int, up bool) (old bool) {
|
||||
arrIndex := bit / 8
|
||||
i := &a[arrIndex]
|
||||
|
||||
return SwapBit[uint8]((*uint8)(i), bit%8, up)
|
||||
func (a Int64Array) BitLength() uint {
|
||||
return uint(len(a) * 64)
|
||||
}
|
||||
|
||||
func (a UInt16Array) SetBit(bit int, up bool) (old bool) {
|
||||
arrIndex := bit / 16
|
||||
i := &a[arrIndex]
|
||||
|
||||
return SwapBit[uint16]((*uint16)(i), bit%16, up)
|
||||
func (a UInt8Array) BitLength() uint {
|
||||
return uint(len(a) * 8)
|
||||
}
|
||||
|
||||
func (a UInt32Array) SetBit(bit int, up bool) (old bool) {
|
||||
arrIndex := bit / 32
|
||||
i := &a[arrIndex]
|
||||
|
||||
return SwapBit[uint32]((*uint32)(i), bit%32, up)
|
||||
func (a UInt16Array) BitLength() uint {
|
||||
return uint(len(a) * 16)
|
||||
}
|
||||
|
||||
func (a UInt64Array) SetBit(bit int, up bool) (old bool) {
|
||||
arrIndex := bit / 64
|
||||
i := &a[arrIndex]
|
||||
|
||||
return SwapBit[uint64]((*uint64)(i), bit%64, up)
|
||||
func (a UInt32Array) BitLength() uint {
|
||||
return uint(len(a) * 32)
|
||||
}
|
||||
|
||||
func (a UInt64Array) BitLength() uint {
|
||||
return uint(len(a) * 64)
|
||||
}
|
||||
|
||||
func (a Int8Array) GetBit(index uint) (ok bool) {
|
||||
return GetBit(a[index/8], index%8)
|
||||
}
|
||||
|
||||
func (a Int16Array) GetBit(index uint) (ok bool) {
|
||||
return GetBit(a[index/16], index%16)
|
||||
}
|
||||
|
||||
func (a Int32Array) GetBit(index uint) (ok bool) {
|
||||
return GetBit(a[index/32], index%32)
|
||||
}
|
||||
|
||||
func (a Int64Array) GetBit(index uint) (ok bool) {
|
||||
return GetBit(a[index/64], index%64)
|
||||
}
|
||||
|
||||
func (a UInt8Array) GetBit(index uint) (ok bool) {
|
||||
return GetBit(a[index/8], index%8)
|
||||
}
|
||||
|
||||
func (a UInt16Array) GetBit(index uint) (ok bool) {
|
||||
return GetBit(a[index/16], index%16)
|
||||
}
|
||||
|
||||
func (a UInt32Array) GetBit(index uint) (ok bool) {
|
||||
return GetBit(a[index/32], index%32)
|
||||
}
|
||||
|
||||
func (a UInt64Array) GetBit(index uint) (ok bool) {
|
||||
return GetBit(a[index/64], index%64)
|
||||
}
|
||||
|
||||
func (a Int8Array) SetBit(index uint, up bool) (old bool) {
|
||||
return SwapBit(&a[index/8], index%8, up)
|
||||
}
|
||||
|
||||
func (a Int16Array) SetBit(index uint, up bool) (old bool) {
|
||||
return SwapBit(&a[index/16], index%16, up)
|
||||
}
|
||||
|
||||
func (a Int32Array) SetBit(index uint, up bool) (old bool) {
|
||||
return SwapBit(&a[index/32], index%32, up)
|
||||
}
|
||||
|
||||
func (a Int64Array) SetBit(index uint, up bool) (old bool) {
|
||||
return SwapBit(&a[index/64], index%64, up)
|
||||
}
|
||||
|
||||
func (a UInt8Array) SetBit(index uint, up bool) (old bool) {
|
||||
return SwapBit(&a[index/8], index%8, up)
|
||||
}
|
||||
|
||||
func (a UInt16Array) SetBit(index uint, up bool) (old bool) {
|
||||
return SwapBit(&a[index/16], index%16, up)
|
||||
}
|
||||
|
||||
func (a UInt32Array) SetBit(index uint, up bool) (old bool) {
|
||||
return SwapBit(&a[index/32], index%32, up)
|
||||
}
|
||||
|
||||
func (a UInt64Array) SetBit(index uint, up bool) (old bool) {
|
||||
return SwapBit(&a[index/64], index%64, up)
|
||||
}
|
||||
|
@ -8,7 +8,8 @@ package lang
|
||||
|
||||
type (
|
||||
BitSet interface {
|
||||
BitLength() int
|
||||
SetBit(bit int, up bool) (old bool)
|
||||
BitLength() uint
|
||||
SetBit(index uint, up bool) (old bool)
|
||||
GetBit(index uint) (ok bool)
|
||||
}
|
||||
)
|
||||
|
@ -13,9 +13,15 @@ type Number interface {
|
||||
ToFloat64() Float64
|
||||
}
|
||||
|
||||
func GetBit[T int8 | int16 | int32 | int64 | uint8 | uint16 | uint32 | uint64 |
|
||||
Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64](p T, index uint) (ok bool) {
|
||||
location := T(1) << index
|
||||
return p&location != 0
|
||||
}
|
||||
|
||||
func SwapBit[T int8 | int16 | int32 | int64 | uint8 | uint16 | uint32 | uint64 |
|
||||
Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64](p *T, bit int, new bool) (old bool) {
|
||||
location := T(1) << bit
|
||||
Int8 | Int16 | Int32 | Int64 | UInt8 | UInt16 | UInt32 | UInt64](p *T, index uint, new bool) (old bool) {
|
||||
location := T(1) << index
|
||||
oldValue := *p
|
||||
var newValue T
|
||||
if new {
|
||||
|
87
util/bloom/Bloom.go
Normal file
87
util/bloom/Bloom.go
Normal file
@ -0,0 +1,87 @@
|
||||
package bloom
|
||||
|
||||
import (
|
||||
"math"
|
||||
|
||||
"github.com/spaolacci/murmur3"
|
||||
|
||||
"github.com/tursom/GoCollections/lang"
|
||||
)
|
||||
|
||||
var (
|
||||
HashFunc = murmur3.Sum32WithSeed
|
||||
)
|
||||
|
||||
type (
|
||||
Bloom struct {
|
||||
m lang.UInt8Array
|
||||
k uint
|
||||
c uint
|
||||
}
|
||||
)
|
||||
|
||||
// max returns the larger of i1 and i2.
func max(i1, i2 uint) uint {
	if i1 < i2 {
		return i2
	}
	return i1
}
|
||||
|
||||
// numHashFunctions returns the hash-function count for a filter of m bits
// holding n elements: round(m/n * ln 2), never less than 1.
//
// A non-positive or NaN n returns 1 rather than dividing by zero, and a result
// too large for uint is clamped, because converting an out-of-range float to an
// integer is implementation-defined in Go.
func numHashFunctions(n, m float64) uint {
	if !(n > 0) {
		return 1
	}

	k := math.Floor(0.5 + m/n*math.Ln2)
	switch {
	case !(k >= 1): // also catches NaN
		return 1
	case k >= float64(math.MaxUint):
		return math.MaxUint
	}
	return uint(k)
}
|
||||
|
||||
func NumHashFunctions(n, m uint) uint {
|
||||
return numHashFunctions(float64(n), float64(m))
|
||||
}
|
||||
|
||||
func calcBitLength(n float64, p float64) uint {
|
||||
if p == 0 {
|
||||
p = math.SmallestNonzeroFloat64
|
||||
}
|
||||
return uint(-n * math.Log(p) / (math.Ln2 * math.Ln2))
|
||||
}
|
||||
|
||||
func CalcBitLength(n uint, p float64) uint {
|
||||
return calcBitLength(float64(n), p)
|
||||
}
|
||||
|
||||
func NewBloom(n uint, p float64) *Bloom {
|
||||
m := CalcBitLength(n, p) - 1
|
||||
return &Bloom{
|
||||
m: make(lang.UInt8Array, m/8+1),
|
||||
k: NumHashFunctions(n, m),
|
||||
}
|
||||
}
|
||||
|
||||
func (b *Bloom) C() uint {
|
||||
return b.c
|
||||
}
|
||||
|
||||
func (b *Bloom) K() uint {
|
||||
return b.k
|
||||
}
|
||||
|
||||
func (b *Bloom) M() uint {
|
||||
return uint(len(b.m)) * 8
|
||||
}
|
||||
|
||||
func (b *Bloom) Contains(data []byte) bool {
|
||||
for i := 0; i < int(b.k); i++ {
|
||||
hashCode := uint(HashFunc(data, uint32(i)))
|
||||
if !b.m.GetBit(hashCode & b.m.BitLength()) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
func (b *Bloom) Add(data []byte) {
|
||||
b.c++
|
||||
for i := 0; i < int(b.k); i++ {
|
||||
hashCode := uint(HashFunc(data, uint32(i)))
|
||||
b.m.SetBit(hashCode%b.m.BitLength(), true)
|
||||
}
|
||||
}
|
82
util/bloom/Bloom_test.go
Normal file
82
util/bloom/Bloom_test.go
Normal file
@ -0,0 +1,82 @@
|
||||
/*
|
||||
* Copyright (c) 2023 tursom. All rights reserved.
|
||||
* Use of this source code is governed by a GPL-3
|
||||
* license that can be found in the LICENSE file.
|
||||
*/
|
||||
|
||||
package bloom
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"compress/gzip"
|
||||
"fmt"
|
||||
"testing"
|
||||
)
|
||||
|
||||
func TestBloom_Contains(t *testing.T) {
|
||||
bloom := NewBloom(100_0000, 0.1)
|
||||
bloom.Add([]byte("hello, world!"))
|
||||
for i := 0; i < 10000; i++ {
|
||||
bloom.Add([]byte(fmt.Sprintf("%d", i)))
|
||||
}
|
||||
|
||||
fmt.Println(len(bloom.m))
|
||||
fmt.Println(len(gz(bloom.m.Bytes())))
|
||||
|
||||
if !bloom.Contains([]byte("hello, world!")) {
|
||||
t.Failed()
|
||||
}
|
||||
|
||||
if bloom.Contains([]byte("hello, not world!")) {
|
||||
t.Failed()
|
||||
}
|
||||
}
|
||||
|
||||
// gz returns b compressed as a complete gzip stream.
//
// The writer is closed rather than flushed: Flush emits only the pending
// compressed data, while Close also writes the gzip trailer that a reader
// needs. Errors panic because this is a test helper writing to an in-memory
// buffer, where a failure means a broken invariant.
func gz(b []byte) []byte {
	var buffer bytes.Buffer

	writer := gzip.NewWriter(&buffer)
	if _, err := writer.Write(b); err != nil {
		panic(fmt.Errorf("gzip write: %w", err))
	}
	if err := writer.Close(); err != nil {
		panic(fmt.Errorf("gzip close: %w", err))
	}

	return buffer.Bytes()
}
|
||||
|
||||
func TestCalcBitLength(t *testing.T) {
|
||||
//fmt.Printf("%d\n", CalcBitLength(100_0000, 0.1)/8)
|
||||
for i := 1; i < 63; i++ {
|
||||
var n uint = 1 << i
|
||||
numBytes := CalcBitLength(n, 0.1) / 8
|
||||
fmt.Printf("%d: %d, %s / %s = %f\n",
|
||||
i,
|
||||
NumHashFunctions(n, numBytes*8),
|
||||
storageFormat(numBytes),
|
||||
storageFormat(n),
|
||||
float64(numBytes)/float64(n))
|
||||
}
|
||||
}
|
||||
|
||||
// storageFormat renders size with a binary-scaled unit suffix. Values below
// 1024 print as a whole number followed by "b"; larger values print as a
// float scaled by 1024, 1024^2, 1024^3 or 1024^4 with the suffix "kb", "mb",
// "gb" or "tb". Everything at or above 1024^4 uses "tb".
func storageFormat(size uint) string {
	var unit uint = 1
	if size < 1024*unit {
		return fmt.Sprintf("%db", size/unit)
	}

	// Step up one unit at a time and stop at the first one that fits.
	for _, layout := range [...]string{"%fkb", "%fmb", "%fgb"} {
		unit *= 1024
		if size < 1024*unit {
			return fmt.Sprintf(layout, float64(size)/float64(unit))
		}
	}

	unit *= 1024
	return fmt.Sprintf("%ftb", float64(size)/float64(unit))
}
|
Loading…
Reference in New Issue
Block a user