¿Qué es Bloom Filter?





¡Hola! En este artículo, intentaré describir qué es el filtro Bloom, explicar su propósito y mostrar los escenarios en los que se puede utilizar. También estoy implementando el filtro Bloom en Python desde cero para que sea más fácil comprender sus aspectos internos.





PropĂłsito del filtro Bloom 

— , — , ( , O , — O(1)



). , , , . , — : 100% , , 100% , , ( ). , Python, , !





. , .





, . , ( , ). uplink 100 /. IP-, . , , 100 /, (O(log(n))



), , IP-, , IP- . , , IP- .





— . , , , : , ​​ ; (, ) , .





, , , , .





.





?

, . , 100 IP-. , IP- , — 100 , — IP. IP- , «1», — «0».





4- IP- , .





IP-?

, 100 IP. IPv4- 32 , , 4 294 967 296 (2^32) ( , , , )! IP- , , . , . IP- . - -.





-

- — , . , - IP-, , . - , , .





- , ( IP), . , .





… - . . , 100 IP-. - 100 2^32 IP- 100 - ? , . . - , IP- , , 4 294 967 296 (2^32) IP-, . , -, — , , . , - 192.168.1.1 192.168.1.2, , , , ( , ).





. IP- , , .





, : 100 IP-. IP- -, - , . , , IP- . — ?





, IP- 178.23.12.63 112.64.90.12 . IP , — . , IP- , , IP- . , ?





, , — , . 0, . , 1, - , . , , , , , IP .





, . — . (, , - , ), . , ( 1, ) (1-e^(m / n))



, m — , , n .





— -. , IP- -, .. 1. k



-, (1-e^(mk/n))^k



, , - (n/m)*ln(2)



( ).





-. IP- , - , IP 112.64.90.12 , 1.





, Python ! 50 .





BloomFilter



( ). ( ) , . bitarray



, . , - , -, .





import math
from bitarray import bitarray

class BloomFilter(object):

    def __init__(self, size, number_expected_elements=100000):
        self.size = size
        self.number_expected_elements = number_expected_elements

        self.bloom_filter = bitarray(self.size)
        self.bloom_filter.setall(0)

        self.number_hash_functions = round((self.size / self.number_expected_elements) * math.log(2))
      
      



- . ( ) DJB2. , .





def _hash_djb2(self, s):
        hash = 5381
        for x in s:
            hash = ((hash << 5) + hash) + ord(x)
        return hash % self.size
      
      



-, K -? . , -, -. -. - , -. , ?





def _hash(self, item, K):
        return self._hash_djb2(str(K) + item)
      
      



. -, , , 1 ( True) .





def add_to_filter(self, item):
        for i in range(self.number_hash_functions):
            self.bloom_filter[self._hash(item, i)] = 1
      
      



, , . -. - 0, , . , 1, , .





 def check_is_not_in_filter(self, item):
        for i in range(self.number_hash_functions):
            if self.bloom_filter[self._hash(item, i)] == 0:
                return True
        return False
      
      



! . !





, , . 1 , 100 000. «192.168.1.1» IP-.





bloom_filter = BloomFilter(1000000, 100000)
base_ip = "192.168.1."
bloom_filter.add_to_filter(base_ip + str(1))
      
      



, i 1 100 000 , IP 192.168.1.i ( IP-, i>254, 192.168.289, ). , , ; , , .





for i in range(1, 100000):
    if not bloom_filter.check_is_not_in_filter(base_ip + str(i)):
        print(base_ip+str(i))
      
      



192.168.1.1





! , 100 000 IP- , , — IP-. !





:





import math
from bitarray import bitarray


class BloomFilter(object):

    def __init__(self, size, number_expected_elements=100000):
        self.size = size
        self.number_expected_elements = number_expected_elements

        self.bloom_filter = bitarray(self.size)
        self.bloom_filter.setall(0)

        self.number_hash_functions = round((self.size / self.number_expected_elements) * math.log(2))


    def _hash_djb2(self, s):
        hash = 5381
        for x in s:
            hash = ((hash << 5) + hash) + ord(x)
        return hash % self.size


    def _hash(self, item, K):
        return self._hash_djb2(str(K) + item)


    def add_to_filter(self, item):
        for i in range(self.number_hash_functions):
            self.bloom_filter[self._hash(item, i)] = 1


    def check_is_not_in_filter(self, item):
        for i in range(self.number_hash_functions):
            if self.bloom_filter[self._hash(item, i)] == 0:
                return True
        return False


bloom_filter = BloomFilter(1000000, 100000)
base_ip = "192.168.1."
bloom_filter.add_to_filter(base_ip + str(1))

for i in range(1, 100000):
    if not bloom_filter.check_is_not_in_filter(base_ip + str(i)):
        print(base_ip+str(i))
      
      



, . , , . !






«Data Engineer».









- «ML Spark». ML Spark, production












All Articles