Matheus Schmitz
LinkedIn
Github Portfolio
Create a blackbox that supplies a continuous stream of incoming data randomly chosen from a dataset
import random
import sys
import binascii
class BlackBox:
    def ask(self, file, num):
        lines = open(file,'r').readlines()
        users = [0 for i in range(num)]
        for i in range(num):
            users[i] = lines[random.randint(0, len(lines) - 1)].rstrip("\n")
        return users
# Blackbox
BB = BlackBox()
# Read user inputs
input_filename = "publicdata/users.txt"
stream_size = 50000
num_of_asks = 50
output_filename = "bloom_filter.csv"
# Variables to track performance
seen_users_truth = set()
seen_users_preds = set()
# Before beginning to iterate, write the column headers
with open(output_filename, "w") as f_out:
    f_out.write("Time,FPR")
# Generate parameters for the hash functions
hash_params = [[random.randint(1, 100), random.randint(1, 100)] for _ in range(5)]
# Initialize the bloom filter bit array
filter_bit_array = [0 for _ in range(100000)]
m = len(filter_bit_array)
# Function to hash user into a set of indexes at the bloom filter
def myhashs(user):
    # Encode user to int
    user_int = int(binascii.hexlify(user.encode('utf8')),16)
    # Generate hash values
    result = []
    for f in hash_params:
        result.append((f[0] * user_int + f[1]) % m)
    
    return result
import pandas as pd
# DataFrame to track Bloom Filter performance over time (which always decreases)
performance_df = pd.DataFrame()
# Iterate over the asks
for ask_iteration in range(num_of_asks):
    stream_users = BB.ask(input_filename, stream_size)
    # According to Piazza post 502 FP and TN should be reset between iterations
    FP = 0.0
    TN = 0.0
    # Go over all users for this stream
    for user in stream_users:
        # List to store all indexes generated by all hashes * users for this iteration, so the bit array can be updated when the iteration ends
        stream_hashed_idxs = []
        # If I have actually seen the user before, then I should not test the match, as surely it will match because it should
        # In this case it would be a true positive, which I'm not interested in
        if user in seen_users_truth:
            continue
        # Hash the user into values
        hashed_idxs = myhashs(user)
        # Track how many of the hashed indexes colide with pre-existing 1s on the filter_bit_array
        bit_matches = 0
        for idx in hashed_idxs:
            if filter_bit_array[idx] == 1:
                bit_matches += 1
        # If all bits matches, then we have a false positive, else a true negative
        if bit_matches == len(hashed_idxs):
            # Make a prediction that already seen the users
            seen_users_preds.add(user)
            # Since the truly seen users have been weed out before, I know this is a false positive
            FP += 1
        else:
            TN += 1
        # Add the current user hashed indexes to the list with all indexes from the iteration, so that when the iteration finishes the bit array can be updated		
        stream_hashed_idxs.append(hashed_idxs)
    # Update the Bloom Filter after the iteration is done
    for user_hashed_idxs in stream_hashed_idxs:
        for idx in user_hashed_idxs:
            filter_bit_array[idx] = 1
    # Update the ground truth after the iteration is done
    seen_users_truth.update(stream_users)				
    # Once all users of the current stream have been iterated through, calculate the iteration's FPR
    FPR = float(FP) / (float(FP) + float(TN))
    # Then append the results to the output file
    with open(output_filename, "a") as f_out:
        f_out.write("\n" + str(ask_iteration) + "," + str(FPR))
        
    # And print bloom filter performance over time
    print(f"Batch: {str(ask_iteration)} | FPR: {str(FPR)}")
    
    # Update performance_df
    performance_df.at[ask_iteration, 'FPR'] = FPR
Batch: 0 | FPR: 0.0 Batch: 1 | FPR: 2.0669698222405953e-05 Batch: 2 | FPR: 4.265483705852244e-05 Batch: 3 | FPR: 2.2059958968476317e-05 Batch: 4 | FPR: 4.569339730408956e-05 Batch: 5 | FPR: 4.712979545668772e-05 Batch: 6 | FPR: 4.879000780640125e-05 Batch: 7 | FPR: 0.00012589067653649572 Batch: 8 | FPR: 7.808026651397637e-05 Batch: 9 | FPR: 8.08865162177465e-05 Batch: 10 | FPR: 5.592841163310962e-05 Batch: 11 | FPR: 8.56384345294168e-05 Batch: 12 | FPR: 0.00014953047431066452 Batch: 13 | FPR: 0.00015317688867103732 Batch: 14 | FPR: 3.1771247021445594e-05 Batch: 15 | FPR: 3.2866627226713995e-05 Batch: 16 | FPR: 0.0002032658039162545 Batch: 17 | FPR: 7.01508242721852e-05 Batch: 18 | FPR: 0.00014414414414414415 Batch: 19 | FPR: 7.517383950385266e-05 Batch: 20 | FPR: 0.00011640540121061617 Batch: 21 | FPR: 0.00031925931838135526 Batch: 22 | FPR: 0.00016503011799653437 Batch: 23 | FPR: 8.505932888189512e-05 Batch: 24 | FPR: 8.850340738118418e-05 Batch: 25 | FPR: 0.00031702898550724635 Batch: 26 | FPR: 0.00014029180695847364 Batch: 27 | FPR: 0.00019482733427499878 Batch: 28 | FPR: 0.00020164339365831528 Batch: 29 | FPR: 0.00010450958875476824 Batch: 30 | FPR: 0.00010800302408467438 Batch: 31 | FPR: 0.00011120378092855157 Batch: 32 | FPR: 0.00011480397221743872 Batch: 33 | FPR: 0.00011832919181161993 Batch: 34 | FPR: 0.00018357606168155672 Batch: 35 | FPR: 0.0004430379746835443 Batch: 36 | FPR: 0.00032769694586446454 Batch: 37 | FPR: 0.00026961445133459155 Batch: 38 | FPR: 0.000346860908775581 Batch: 39 | FPR: 0.00043661766846165044 Batch: 40 | FPR: 0.00022153300841825432 Batch: 41 | FPR: 0.00038402457757296467 Batch: 42 | FPR: 0.00016052652700858816 Batch: 43 | FPR: 8.300132802124833e-05 Batch: 44 | FPR: 0.0002530150965674285 Batch: 45 | FPR: 0.0005276116778051355 Batch: 46 | FPR: 0.00046356387910254034 Batch: 47 | FPR: 0.0003779646603042616 Batch: 48 | FPR: 0.0006804704967434626 Batch: 49 | FPR: 0.00039781203381402286
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
performance_df.plot(figsize=(15,5), grid=True, color='salmon')
plt.suptitle('Bloom Filter False-Positive Rate Over Time', size=22)
plt.xlabel('Batch No.')
plt.ylabel('False Positive Rate (FPR)')
plt.show()
Matheus Schmitz
LinkedIn
Github Portfolio