Matheus Schmitz
LinkedIn
Github Portfolio
Create a blackbox that supplies a continuous stream of incoming data randomly chosen from a dataset
import random
import sys
import binascii
class BlackBox:
    """Simulates a continuous data stream by sampling random lines from a file."""

    def ask(self, file, num):
        """Return `num` user ids drawn uniformly, with replacement, from `file`.

        Parameters
        ----------
        file : str
            Path to a text file with one user id per line.
        num : int
            Number of users to sample for this stream batch.

        Returns
        -------
        list[str]
            Sampled user ids with trailing newlines stripped.
        """
        # Use a context manager so the file handle is always closed
        # (the original `open(...).readlines()` leaked the handle).
        with open(file, 'r') as f:
            lines = f.readlines()
        # random.choices samples uniformly with replacement, matching the
        # original per-index random.randint loop.
        return [line.rstrip("\n") for line in random.choices(lines, k=num)]
# Stream source
BB = BlackBox()

# Experiment configuration
input_filename = "publicdata/users.txt"
stream_size = 50000
num_of_asks = 50
output_filename = "bloom_filter.csv"

# Ground truth of users actually seen, and users the filter claimed to have seen
seen_users_truth = set()
seen_users_preds = set()

# Write the CSV column header before streaming begins
with open(output_filename, "w") as f_out:
    f_out.write("Time,FPR")

# Random (a, b) pairs for five linear hash functions h(x) = (a*x + b) % m
hash_params = [[random.randint(1, 100), random.randint(1, 100)] for _ in range(5)]

# Bloom filter bit array; m is its size
filter_bit_array = [0] * 100000
m = len(filter_bit_array)
def myhashs(user):
    """Map a user id string to one Bloom-filter index per hash function.

    Each hash is h(x) = (a*x + b) % m, where the (a, b) pairs come from the
    global `hash_params` and `m` is the global bit-array size.
    """
    # Interpret the UTF-8 bytes of the user id as one big integer
    user_int = int(binascii.hexlify(user.encode('utf8')), 16)
    # One index per (a, b) hash-parameter pair
    return [(a * user_int + b) % m for a, b in hash_params]
import pandas as pd

# DataFrame to track the Bloom Filter's false-positive rate over time
# (the FPR grows as more bits of the filter get set).
performance_df = pd.DataFrame()

# Iterate over the asks: one stream batch per ask
for ask_iteration in range(num_of_asks):
    stream_users = BB.ask(input_filename, stream_size)

    # According to Piazza post 502, FP and TN are reset between iterations
    FP = 0.0
    TN = 0.0

    # Indexes produced by all hashes for all users in this batch, collected so
    # the bit array is only updated after the whole batch has been scored.
    # NOTE(review): this must be initialized once per batch (not per user),
    # otherwise only the last user's bits would ever be written to the filter.
    stream_hashed_idxs = []

    for user in stream_users:
        # A genuinely seen user would be a true positive, which is not
        # interesting for the FPR, so skip it.
        if user in seen_users_truth:
            continue

        # Hash the user into bit-array indexes
        hashed_idxs = myhashs(user)
        stream_hashed_idxs.append(hashed_idxs)

        # Count how many hashed indexes collide with bits already set
        bit_matches = sum(1 for idx in hashed_idxs if filter_bit_array[idx] == 1)

        # All bits set -> the filter claims it has seen this user
        if bit_matches == len(hashed_idxs):
            seen_users_preds.add(user)
            # Truly seen users were skipped above, so this is a false positive
            FP += 1
        else:
            TN += 1

    # Update the Bloom Filter once the batch has been fully scored
    for user_hashed_idxs in stream_hashed_idxs:
        for idx in user_hashed_idxs:
            filter_bit_array[idx] = 1

    # Update the ground truth with every user from this batch
    seen_users_truth.update(stream_users)

    # FPR for this batch; guard against a batch where every user was already
    # seen (FP + TN == 0 would otherwise divide by zero).
    FPR = FP / (FP + TN) if (FP + TN) > 0 else 0.0

    # Append the batch result to the output file
    with open(output_filename, "a") as f_out:
        f_out.write("\n" + str(ask_iteration) + "," + str(FPR))

    # And print bloom filter performance over time
    print(f"Batch: {str(ask_iteration)} | FPR: {str(FPR)}")

    # Update performance_df
    performance_df.at[ask_iteration, 'FPR'] = FPR
Batch: 0 | FPR: 0.0 Batch: 1 | FPR: 2.0669698222405953e-05 Batch: 2 | FPR: 4.265483705852244e-05 Batch: 3 | FPR: 2.2059958968476317e-05 Batch: 4 | FPR: 4.569339730408956e-05 Batch: 5 | FPR: 4.712979545668772e-05 Batch: 6 | FPR: 4.879000780640125e-05 Batch: 7 | FPR: 0.00012589067653649572 Batch: 8 | FPR: 7.808026651397637e-05 Batch: 9 | FPR: 8.08865162177465e-05 Batch: 10 | FPR: 5.592841163310962e-05 Batch: 11 | FPR: 8.56384345294168e-05 Batch: 12 | FPR: 0.00014953047431066452 Batch: 13 | FPR: 0.00015317688867103732 Batch: 14 | FPR: 3.1771247021445594e-05 Batch: 15 | FPR: 3.2866627226713995e-05 Batch: 16 | FPR: 0.0002032658039162545 Batch: 17 | FPR: 7.01508242721852e-05 Batch: 18 | FPR: 0.00014414414414414415 Batch: 19 | FPR: 7.517383950385266e-05 Batch: 20 | FPR: 0.00011640540121061617 Batch: 21 | FPR: 0.00031925931838135526 Batch: 22 | FPR: 0.00016503011799653437 Batch: 23 | FPR: 8.505932888189512e-05 Batch: 24 | FPR: 8.850340738118418e-05 Batch: 25 | FPR: 0.00031702898550724635 Batch: 26 | FPR: 0.00014029180695847364 Batch: 27 | FPR: 0.00019482733427499878 Batch: 28 | FPR: 0.00020164339365831528 Batch: 29 | FPR: 0.00010450958875476824 Batch: 30 | FPR: 0.00010800302408467438 Batch: 31 | FPR: 0.00011120378092855157 Batch: 32 | FPR: 0.00011480397221743872 Batch: 33 | FPR: 0.00011832919181161993 Batch: 34 | FPR: 0.00018357606168155672 Batch: 35 | FPR: 0.0004430379746835443 Batch: 36 | FPR: 0.00032769694586446454 Batch: 37 | FPR: 0.00026961445133459155 Batch: 38 | FPR: 0.000346860908775581 Batch: 39 | FPR: 0.00043661766846165044 Batch: 40 | FPR: 0.00022153300841825432 Batch: 41 | FPR: 0.00038402457757296467 Batch: 42 | FPR: 0.00016052652700858816 Batch: 43 | FPR: 8.300132802124833e-05 Batch: 44 | FPR: 0.0002530150965674285 Batch: 45 | FPR: 0.0005276116778051355 Batch: 46 | FPR: 0.00046356387910254034 Batch: 47 | FPR: 0.0003779646603042616 Batch: 48 | FPR: 0.0006804704967434626 Batch: 49 | FPR: 0.00039781203381402286
import matplotlib.pyplot as plt

# Visualize how the false-positive rate evolves as the filter fills up
plt.style.use('fivethirtyeight')
ax = performance_df.plot(figsize=(15, 5), grid=True, color='salmon')
plt.suptitle('Bloom Filter False-Positive Rate Over Time', size=22)
plt.xlabel('Batch No.')
plt.ylabel('False Positive Rate (FPR)')
plt.show()
Matheus Schmitz
LinkedIn
Github Portfolio