Hello! In this article I will describe what a Bloom filter is, explain its purpose, and show scenarios in which it can be used. I will also implement a Bloom filter in Python from scratch to make its internals easier to understand.
Purpose of the Bloom filter
— , — , ( , O , — O(1)
). , , , . , — : 100% , , 100% , , ( ). , Python, , !
. , .
, . , ( , ). uplink 100 /. IP-, . , , 100 /, (O(log(n))
), , IP-, , IP- . , , IP- .
— . , , , : , ; (, ) , .
, , , , .
?
, . , 100 IP-. , IP- , — 100 , — IP. IP- , «1», — «0».
4- IP- , .
IP-?
, 100 IP. IPv4- 32 , , 4 294 967 296 (2^32) ( , , , )! IP- , , . , . IP- . - -.
-
- — , . , - IP-, , . - , , .
- , ( IP), . , .
… - . . , 100 IP-. - 100 2^32 IP- 100 - ? , . . - , IP- , , 4 294 967 296 (2^32) IP-, . , -, — , , . , - 192.168.1.1 192.168.1.2, , , , ( , ).
. IP- , , .
, : 100 IP-. IP- -, - , . , , IP- . — ?
, IP- 178.23.12.63 112.64.90.12 . IP , — . , IP- , , IP- . , ?
, , — , . 0, . , 1, - , . , , , , , IP .
, . — . (, , - , ), . , ( 1, ) (1-e^(-m/n))
, where m is the expected number of elements to be added and n is the size of the filter in bits.
— -. , IP- -, .. 1. k
-, (1-e^(-mk/n))^k
, , - (n/m)*ln(2)
( ).
-. IP- , - , IP 112.64.90.12 , 1.
, Python ! 50 .
BloomFilter
( ). ( ) , . bitarray
, . , - , -, .
import math
from bitarray import bitarray
class BloomFilter(object):
    """Bloom filter over strings, backed by a ``bitarray`` of ``size`` bits.

    The number of hash functions is derived from the filter size and the
    expected number of elements as ``(size / expected) * ln 2``, rounded to
    the nearest integer and never less than one.
    """

    def __init__(self, size, number_expected_elements=100000):
        """Create an empty filter.

        Args:
            size: Number of bits in the filter; must be positive.
            number_expected_elements: How many items the caller expects to
                insert; must be positive. Only used to choose the number of
                hash functions.

        Raises:
            ValueError: If ``size`` or ``number_expected_elements`` is not
                positive.
        """
        # Validate and compute first so no bit array is allocated for
        # arguments that are going to be rejected.
        self.number_hash_functions = self._optimal_hash_count(
            size, number_expected_elements
        )

        self.size = size
        self.number_expected_elements = number_expected_elements

        self.bloom_filter = bitarray(self.size)
        self.bloom_filter.setall(0)

    @staticmethod
    def _optimal_hash_count(size, number_expected_elements):
        """Return ``round((size / expected) * ln 2)`` as an int, at least 1.

        Zero hash functions would make every lookup report "possibly
        present", so the result is clamped to one.
        """
        if size <= 0:
            raise ValueError("size must be a positive number of bits")
        if number_expected_elements <= 0:
            raise ValueError("number_expected_elements must be positive")
        # float() forces true division on interpreters where ``/`` on ints
        # truncates; int() guarantees a value that range() accepts.
        bits_per_element = float(size) / number_expected_elements
        return max(1, int(round(bits_per_element * math.log(2))))

    def _hash_djb2(self, s):
        """Return the djb2 hash of string ``s`` reduced modulo ``self.size``."""
        digest = 5381
        for char in s:
            # (digest << 5) + digest == digest * 33
            digest = ((digest << 5) + digest) + ord(char)
        return digest % self.size
-, K -? . , -, -. -. - , -. , ?
def _hash(self, item, K):
return self._hash_djb2(str(K) + item)
. -, , , 1 ( True) .
def add_to_filter(self, item):
for i in range(self.number_hash_functions):
self.bloom_filter[self._hash(item, i)] = 1
, , . -. - 0, , . , 1, , .
def check_is_not_in_filter(self, item):
for i in range(self.number_hash_functions):
if self.bloom_filter[self._hash(item, i)] == 0:
return True
return False
! . !
, , . 1 , 100 000. «192.168.1.1» IP-.
# Build a one-million-bit filter sized for 100 000 expected elements and
# register a single address in it.
base_ip = "192.168.1."
bloom_filter = BloomFilter(size=1000000, number_expected_elements=100000)
bloom_filter.add_to_filter(base_ip + "1")
, i 1 100 000 , IP 192.168.1.i ( IP-, i>254, 192.168.289, ). , , ; , , .
# Probe every suffix and print only the addresses the filter cannot rule out.
for suffix in range(1, 100000):
    candidate_ip = base_ip + str(suffix)
    if bloom_filter.check_is_not_in_filter(candidate_ip):
        continue
    print(candidate_ip)
192.168.1.1
! , 100 000 IP- , , — IP-. !
:
import math
from bitarray import bitarray
class BloomFilter(object):
    """Bloom filter over strings, backed by a ``bitarray`` of ``size`` bits.

    Items can be added and later tested for absence. A lookup never reports
    an added item as absent, but may fail to rule out an item that was never
    added (a false positive).

    The number of hash functions is derived from the filter size and the
    expected number of elements as ``(size / expected) * ln 2``, rounded to
    the nearest integer and never less than one.
    """

    def __init__(self, size, number_expected_elements=100000):
        """Create an empty filter.

        Args:
            size: Number of bits in the filter; must be positive.
            number_expected_elements: How many items the caller expects to
                insert; must be positive. Only used to choose the number of
                hash functions.

        Raises:
            ValueError: If ``size`` or ``number_expected_elements`` is not
                positive.
        """
        # Validate and compute first so no bit array is allocated for
        # arguments that are going to be rejected.
        self.number_hash_functions = self._optimal_hash_count(
            size, number_expected_elements
        )

        self.size = size
        self.number_expected_elements = number_expected_elements

        self.bloom_filter = bitarray(self.size)
        self.bloom_filter.setall(0)

    @staticmethod
    def _optimal_hash_count(size, number_expected_elements):
        """Return ``round((size / expected) * ln 2)`` as an int, at least 1.

        Zero hash functions would make every lookup report "possibly
        present", so the result is clamped to one.
        """
        if size <= 0:
            raise ValueError("size must be a positive number of bits")
        if number_expected_elements <= 0:
            raise ValueError("number_expected_elements must be positive")
        # float() forces true division on interpreters where ``/`` on ints
        # truncates; int() guarantees a value that range() accepts.
        bits_per_element = float(size) / number_expected_elements
        return max(1, int(round(bits_per_element * math.log(2))))

    def _hash_djb2(self, s):
        """Return the djb2 hash of string ``s`` reduced modulo ``self.size``."""
        digest = 5381
        for char in s:
            # (digest << 5) + digest == digest * 33
            digest = ((digest << 5) + digest) + ord(char)
        return digest % self.size

    def _hash(self, item, K):
        """Return the K-th hash of ``item``.

        The K-th hash function is djb2 applied to ``item`` with the decimal
        text of ``K`` prepended.
        """
        return self._hash_djb2(str(K) + item)

    def add_to_filter(self, item):
        """Insert ``item`` by setting the bit chosen by each hash function."""
        for seed in range(self.number_hash_functions):
            self.bloom_filter[self._hash(item, seed)] = 1

    def check_is_not_in_filter(self, item):
        """Return True if ``item`` is definitely absent from the filter.

        A single unset bit among the item's hash positions proves it was
        never added. False means every bit is set, so the item may be
        present.
        """
        for seed in range(self.number_hash_functions):
            if self.bloom_filter[self._hash(item, seed)] == 0:
                return True
        return False
# Demo: register one address in a one-million-bit filter sized for 100 000
# expected elements, then print every probed address the filter cannot
# rule out.
base_ip = "192.168.1."
bloom_filter = BloomFilter(size=1000000, number_expected_elements=100000)
bloom_filter.add_to_filter(base_ip + "1")

for suffix in range(1, 100000):
    candidate_ip = base_ip + str(suffix)
    if bloom_filter.check_is_not_in_filter(candidate_ip):
        continue
    print(candidate_ip)
, . , , . !
- «ML Spark». ML Spark, production