布隆过滤器（Bloom Filter）

2025-02-26 分类：performance 作者：admin 阅读（31）

布隆过滤器（Bloom Filter）是一种高效的概率数据结构，用于判断一个元素是否属于某个集合。它可能存在误报（即判断元素存在时，实际上可能不存在），但不会漏报（即判断元素不存在时，一定不存在）。

package main

import (
	"fmt"
	"hash/fnv"
	"math"
)

// BloomFilter 布隆过滤器结构
type BloomFilter struct {
	size      uint          // 位数组大小
	hashCount uint          // 哈希函数数量
	bitArray  []bool        // 位数组
	hashFuncs []hashFunc    // 哈希函数列表
}

// hashFunc 哈希函数类型
type hashFunc func([]byte) uint

// NewBloomFilter 创建一个新的布隆过滤器
func NewBloomFilter(size uint, hashCount uint) *BloomFilter {
	bf := &BloomFilter{
		size:      size,
		hashCount: hashCount,
		bitArray:  make([]bool, size),
		hashFuncs: make([]hashFunc, hashCount),
	}

	// 初始化哈希函数
	for i := uint(0); i < hashCount; i++ {
		bf.hashFuncs[i] = createHashFunc(i)
	}

	return bf
}

// createHashFunc 创建哈希函数
func createHashFunc(seed uint) hashFunc {
	return func(data []byte) uint {
		h := fnv.New32a()
		h.Write(data)
		return uint(h.Sum32()) + seed
	}
}

// Add 添加元素到布隆过滤器
func (bf *BloomFilter) Add(item []byte) {
	for _, hashFunc := range bf.hashFuncs {
		index := hashFunc(item) % bf.size
		bf.bitArray[index] = true
	}
}

// Contains 检查元素是否可能存在于布隆过滤器中
func (bf *BloomFilter) Contains(item []byte) bool {
	for _, hashFunc := range bf.hashFuncs {
		index := hashFunc(item) % bf.size
		if !bf.bitArray[index] {
			return false
		}
	}
	return true
}

// OptimalHashCount 计算最优的哈希函数数量
func OptimalHashCount(size uint, expectedElements uint) uint {
	return uint(math.Ceil(float64(size) / float64(expectedElements) * math.Log(2)))
}

// OptimalSize 计算最优的位数组大小
func OptimalSize(expectedElements uint, falsePositiveRate float64) uint {
	return uint(math.Ceil(-float64(expectedElements) * math.Log(falsePositiveRate) / math.Pow(math.Log(2), 2)))
}

func main() {
	// 预期元素数量和误报率
	expectedElements := uint(1000)
	falsePositiveRate := 0.01

	// 计算最优的位数组大小和哈希函数数量
	size := OptimalSize(expectedElements, falsePositiveRate)
	hashCount := OptimalHashCount(size, expectedElements)

	// 创建布隆过滤器
	bf := NewBloomFilter(size, hashCount)

	// 添加元素
	bf.Add([]byte("hello"))
	bf.Add([]byte("world"))

	// 检查元素是否存在
	fmt.Println(bf.Contains([]byte("hello"))) // true
	fmt.Println(bf.Contains([]byte("world"))) // true
	fmt.Println(bf.Contains([]byte("foo")))  // false (可能误报)
}

package main

import (

"fmt"

"hash/fnv"

"math"

)

// BloomFilter 布隆过滤器结构

type BloomFilter struct {

size uint // 位数组大小

hashCount uint // 哈希函数数量

bitArray []bool // 位数组

hashFuncs []hashFunc // 哈希函数列表

}

// hashFunc 哈希函数类型

type hashFunc func([]byte) uint

// NewBloomFilter 创建一个新的布隆过滤器

func NewBloomFilter(size uint, hashCount uint) *BloomFilter {

bf := &BloomFilter{

size: size,

hashCount: hashCount,

bitArray: make([]bool, size),

hashFuncs: make([]hashFunc, hashCount),

}

// 初始化哈希函数

for i := uint(0); i < hashCount; i++ {

bf.hashFuncs[i] = createHashFunc(i)

}

return bf

}

// createHashFunc 创建哈希函数

func createHashFunc(seed uint) hashFunc {

return func(data []byte) uint {

h := fnv.New32a()

h.Write(data)

return uint(h.Sum32()) + seed

}

// Add 添加元素到布隆过滤器

func (bf *BloomFilter) Add(item []byte) {

for _, hashFunc := range bf.hashFuncs {

index := hashFunc(item) % bf.size

bf.bitArray[index] = true

}

// Contains 检查元素是否可能存在于布隆过滤器中

func (bf *BloomFilter) Contains(item []byte) bool {

for _, hashFunc := range bf.hashFuncs {

index := hashFunc(item) % bf.size

if !bf.bitArray[index] {

return false

}

return true

}

// OptimalHashCount 计算最优的哈希函数数量

func OptimalHashCount(size uint, expectedElements uint) uint {

return uint(math.Ceil(float64(size) / float64(expectedElements) * math.Log(2)))

}

// OptimalSize 计算最优的位数组大小

func OptimalSize(expectedElements uint, falsePositiveRate float64) uint {

return uint(math.Ceil(-float64(expectedElements) * math.Log(falsePositiveRate) / math.Pow(math.Log(2), 2)))

}

func main() {

// 预期元素数量和误报率

expectedElements := uint(1000)

falsePositiveRate := 0.01

// 计算最优的位数组大小和哈希函数数量

size := OptimalSize(expectedElements, falsePositiveRate)

hashCount := OptimalHashCount(size, expectedElements)

// 创建布隆过滤器

bf := NewBloomFilter(size, hashCount)

// 添加元素

bf.Add([]byte("hello"))

bf.Add([]byte("world"))

// 检查元素是否存在

fmt.Println(bf.Contains([]byte("hello"))) // true

fmt.Println(bf.Contains([]byte("world"))) // true

fmt.Println(bf.Contains([]byte("foo"))) // false (可能误报)

}

运行结果

添加元素后，检查元素是否存在时，可能会存在误报（即返回 true 但实际上元素不存在）。

注意事项

布隆过滤器适用于需要快速判断元素是否存在的场景，但无法保证 100% 准确。
位数组大小和哈希函数数量需要根据实际需求进行调整，以平衡空间和误报率。

需要在程序启动时将数据加载到布隆过滤器中，可以在初始化布隆过滤器后，遍历数据并将其逐个添加到布隆过滤器中。

性能对比

特性	布隆过滤器	Go 语言 `map`
内存占用	极低（位数组）	较高（存储键值对）
查询速度	O(k)（k 是哈希函数数量）	O(1)（平均情况）
误报率	可能存在误报	无误报
支持删除	不支持（除非使用变种）	支持
适用场景	大规模数据、允许误报的场景	需要精确查询和存储的场景

适用场景

布隆过滤器：
- 适合需要快速判断元素是否存在的场景，且可以容忍一定的误报率。
- 例如：爬虫 URL 去重、缓存穿透保护、垃圾邮件过滤等。
Go 语言 map：
- 适合需要精确查询和存储键值对的场景。
- 例如：缓存、索引、配置存储等。

数据规模参考

小规模数据（< 100 万条）：
- 布隆过滤器的优势不明显，直接使用 map 或数据库索引可能更合适。
- 布隆过滤器的误报率可能较高（除非位数组足够大）。
中等规模数据（100 万条 ~ 1 亿条）：
- 布隆过滤器的优势开始显现，内存占用比 map 低很多。
- 例如：1 亿条数据，误报率为 1% 时，布隆过滤器需要约 114 MB 内存（计算公式见下文）。
大规模数据（> 1 亿条）：
- 布隆过滤器是理想选择，内存占用仍然可控。
- 例如：10 亿条数据，误报率为 1% 时，布隆过滤器需要约 1.14 GB 内存。

「三年博客，如果觉得我的文章对您有用，请帮助本站成长」

赞(0) 打赏

标签：

上一篇：依赖倒置原则（Dependency Inversion Principle DIP）

下一篇：git rebase 变基

你可能感兴趣

一个去重项目

性能指标：QPS、TPS、系统吞吐量理解

记一次PHP并发性能调优实战 -- 性能提升104%

常见性能优化策略的总结（美团）

共有 0 - 布隆过滤器（Bloom Filter）

点击这里取消回复。

博客简介

admin（6年前 (2020-03-09)）
分别用不同厚度的筏板定义，画图后这设置筏板变截面处理。 http://f.fwxgx.co...
评：新文章！
admin（6年前 (2020-03-09)）
分别用不同厚度的筏板定义，画图后这设置筏板变截面处理。 http://f.fwxgx.co...
评：新文章！
admin（6年前 (2020-03-09)）
新增一个框架图！ http://biji.jinli.vip/wp-content/upl...
评：新文章！
一位WordPress评论者（6年前 (2020-02-13)）
嗨，这是一条评论。要开始审核、编辑及删除评论，请访问仪表盘的“评论”页面。评论者头像来自...
评：世界，您好！