What is Bloom Filter?





Hello! In this article I will try to describe what the Bloom filter is, explain its purpose and show the scenarios in which it can be used. I am also implementing the Bloom filter in Python from scratch to make it easier to understand its internals.





Purpose of the Bloom filter 

— , — , ( , O , — O(1)



). , , , . , — : 100% , , 100% , , ( ). , Python, , !





. , .





, . , ( , ). uplink 100 /. IP-, . , , 100 /, (O(log(n))



), , IP-, , IP- . , , IP- .





— . , , , : , ​​ ; (, ) , .





, , , , .





.





?

, . , 100 IP-. , IP- , — 100 , — IP. IP- , «1», — «0».





4- IP- , .





IP-?

, 100 IP. IPv4- 32 , , 4 294 967 296 (2^32) ( , , , )! IP- , , . , . IP- . - -.





-

- — , . , - IP-, , . - , , .





- , ( IP), . , .





… - . . , 100 IP-. - 100 2^32 IP- 100 - ? , . . - , IP- , , 4 294 967 296 (2^32) IP-, . , -, — , , . , - 192.168.1.1 192.168.1.2, , , , ( , ).





. IP- , , .





, : 100 IP-. IP- -, - , . , , IP- . — ?





, IP- 178.23.12.63 112.64.90.12 . IP , — . , IP- , , IP- . , ?





, , — , . 0, . , 1, - , . , , , , , IP .





, . — . (, , - , ), . , ( 1, ) (1-e^(m / n))



, m — , , n .





— -. , IP- -, .. 1. k



-, (1-e^(mk/n))^k



, , - (n/m)*ln(2)



( ).





-. IP- , - , IP 112.64.90.12 , 1.





, Python ! 50 .





BloomFilter



( ). ( ) , . bitarray



, . , - , -, .





import math
from bitarray import bitarray

class BloomFilter(object):

    def __init__(self, size, number_expected_elements=100000):
        self.size = size
        self.number_expected_elements = number_expected_elements

        self.bloom_filter = bitarray(self.size)
        self.bloom_filter.setall(0)

        self.number_hash_functions = round((self.size / self.number_expected_elements) * math.log(2))
      
      



- . ( ) DJB2. , .





def _hash_djb2(self, s):
        hash = 5381
        for x in s:
            hash = ((hash << 5) + hash) + ord(x)
        return hash % self.size
      
      



-, K -? . , -, -. -. - , -. , ?





def _hash(self, item, K):
        return self._hash_djb2(str(K) + item)
      
      



. -, , , 1 ( True) .





def add_to_filter(self, item):
        for i in range(self.number_hash_functions):
            self.bloom_filter[self._hash(item, i)] = 1
      
      



, , . -. - 0, , . , 1, , .





 def check_is_not_in_filter(self, item):
        for i in range(self.number_hash_functions):
            if self.bloom_filter[self._hash(item, i)] == 0:
                return True
        return False
      
      



! . !





, , . 1 , 100 000. «192.168.1.1» IP-.





bloom_filter = BloomFilter(1000000, 100000)
base_ip = "192.168.1."
bloom_filter.add_to_filter(base_ip + str(1))
      
      



, i 1 100 000 , IP 192.168.1.i ( IP-, i>254, 192.168.289, ). , , ; , , .





for i in range(1, 100000):
    if not bloom_filter.check_is_not_in_filter(base_ip + str(i)):
        print(base_ip+str(i))
      
      



192.168.1.1





! , 100 000 IP- , , — IP-. !





:





import math
from bitarray import bitarray


class BloomFilter(object):

    def __init__(self, size, number_expected_elements=100000):
        self.size = size
        self.number_expected_elements = number_expected_elements

        self.bloom_filter = bitarray(self.size)
        self.bloom_filter.setall(0)

        self.number_hash_functions = round((self.size / self.number_expected_elements) * math.log(2))


    def _hash_djb2(self, s):
        hash = 5381
        for x in s:
            hash = ((hash << 5) + hash) + ord(x)
        return hash % self.size


    def _hash(self, item, K):
        return self._hash_djb2(str(K) + item)


    def add_to_filter(self, item):
        for i in range(self.number_hash_functions):
            self.bloom_filter[self._hash(item, i)] = 1


    def check_is_not_in_filter(self, item):
        for i in range(self.number_hash_functions):
            if self.bloom_filter[self._hash(item, i)] == 0:
                return True
        return False


bloom_filter = BloomFilter(1000000, 100000)
base_ip = "192.168.1."
bloom_filter.add_to_filter(base_ip + str(1))

for i in range(1, 100000):
    if not bloom_filter.check_is_not_in_filter(base_ip + str(i)):
        print(base_ip+str(i))
      
      



, . , , . !






«Data Engineer».









- «ML Spark». ML Spark, production












All Articles