import random
import sys
import binascii

class BlackBox:

    def ask(self, file, num):
        lines = open(file,'r').readlines()
        users = [0 for i in range(num)]
        for i in range(num):
            users[i] = lines[random.randint(0, len(lines) - 1)].rstrip("\n")
        return users


# Blackbox
BB = BlackBox()


# Read user inputs
input_filename = "publicdata/users.txt"
stream_size = 50000
num_of_asks = 50
output_filename = "bloom_filter.csv"


# Variables to track performance
seen_users_truth = set()
seen_users_preds = set()


# Before beginning to iterate, write the column headers
with open(output_filename, "w") as f_out:
    f_out.write("Time,FPR")


# Generate parameters for the hash functions
hash_params = [[random.randint(1, 100), random.randint(1, 100)] for _ in range(5)]


# Initialize the bloom filter bit array
filter_bit_array = [0 for _ in range(100000)]
m = len(filter_bit_array)


# Function to hash user into a set of indexes at the bloom filter
def myhashs(user):

    # Encode user to int
    user_int = int(binascii.hexlify(user.encode('utf8')),16)

    # Generate hash values
    result = []
    for f in hash_params:
        result.append((f[0] * user_int + f[1]) % m)
    
    return result


import pandas as pd

# DataFrame to track Bloom Filter performance over time (which always decreases)
performance_df = pd.DataFrame()


# Iterate over the asks
for ask_iteration in range(num_of_asks):
    stream_users = BB.ask(input_filename, stream_size)

    # According to Piazza post 502 FP and TN should be reset between iterations
    FP = 0.0
    TN = 0.0

    # Go over all users for this stream
    for user in stream_users:

        # List to store all indexes generated by all hashes * users for this iteration, so the bit array can be updated when the iteration ends
        stream_hashed_idxs = []

        # If I have actually seen the user before, then I should not test the match, as surely it will match because it should
        # In this case it would be a true positive, which I'm not interested in
        if user in seen_users_truth:
            continue

        # Hash the user into values
        hashed_idxs = myhashs(user)

        # Track how many of the hashed indexes colide with pre-existing 1s on the filter_bit_array
        bit_matches = 0
        for idx in hashed_idxs:
            if filter_bit_array[idx] == 1:
                bit_matches += 1

        # If all bits matches, then we have a false positive, else a true negative
        if bit_matches == len(hashed_idxs):

            # Make a prediction that already seen the users
            seen_users_preds.add(user)

            # Since the truly seen users have been weed out before, I know this is a false positive
            FP += 1

        else:
            TN += 1

        # Add the current user hashed indexes to the list with all indexes from the iteration, so that when the iteration finishes the bit array can be updated		
        stream_hashed_idxs.append(hashed_idxs)

    # Update the Bloom Filter after the iteration is done
    for user_hashed_idxs in stream_hashed_idxs:
        for idx in user_hashed_idxs:
            filter_bit_array[idx] = 1

    # Update the ground truth after the iteration is done
    seen_users_truth.update(stream_users)				

    # Once all users of the current stream have been iterated through, calculate the iteration's FPR
    FPR = float(FP) / (float(FP) + float(TN))

    # Then append the results to the output file
    with open(output_filename, "a") as f_out:
        f_out.write("\n" + str(ask_iteration) + "," + str(FPR))
        
    # And print bloom filter performance over time
    print(f"Batch: {str(ask_iteration)} | FPR: {str(FPR)}")
    
    # Update performance_df
    performance_df.at[ask_iteration, 'FPR'] = FPR

Batch: 0 | FPR: 0.0
Batch: 1 | FPR: 2.0669698222405953e-05
Batch: 2 | FPR: 4.265483705852244e-05
Batch: 3 | FPR: 2.2059958968476317e-05
Batch: 4 | FPR: 4.569339730408956e-05
Batch: 5 | FPR: 4.712979545668772e-05
Batch: 6 | FPR: 4.879000780640125e-05
Batch: 7 | FPR: 0.00012589067653649572
Batch: 8 | FPR: 7.808026651397637e-05
Batch: 9 | FPR: 8.08865162177465e-05
Batch: 10 | FPR: 5.592841163310962e-05
Batch: 11 | FPR: 8.56384345294168e-05
Batch: 12 | FPR: 0.00014953047431066452
Batch: 13 | FPR: 0.00015317688867103732
Batch: 14 | FPR: 3.1771247021445594e-05
Batch: 15 | FPR: 3.2866627226713995e-05
Batch: 16 | FPR: 0.0002032658039162545
Batch: 17 | FPR: 7.01508242721852e-05
Batch: 18 | FPR: 0.00014414414414414415
Batch: 19 | FPR: 7.517383950385266e-05
Batch: 20 | FPR: 0.00011640540121061617
Batch: 21 | FPR: 0.00031925931838135526
Batch: 22 | FPR: 0.00016503011799653437
Batch: 23 | FPR: 8.505932888189512e-05
Batch: 24 | FPR: 8.850340738118418e-05
Batch: 25 | FPR: 0.00031702898550724635
Batch: 26 | FPR: 0.00014029180695847364
Batch: 27 | FPR: 0.00019482733427499878
Batch: 28 | FPR: 0.00020164339365831528
Batch: 29 | FPR: 0.00010450958875476824
Batch: 30 | FPR: 0.00010800302408467438
Batch: 31 | FPR: 0.00011120378092855157
Batch: 32 | FPR: 0.00011480397221743872
Batch: 33 | FPR: 0.00011832919181161993
Batch: 34 | FPR: 0.00018357606168155672
Batch: 35 | FPR: 0.0004430379746835443
Batch: 36 | FPR: 0.00032769694586446454
Batch: 37 | FPR: 0.00026961445133459155
Batch: 38 | FPR: 0.000346860908775581
Batch: 39 | FPR: 0.00043661766846165044
Batch: 40 | FPR: 0.00022153300841825432
Batch: 41 | FPR: 0.00038402457757296467
Batch: 42 | FPR: 0.00016052652700858816
Batch: 43 | FPR: 8.300132802124833e-05
Batch: 44 | FPR: 0.0002530150965674285
Batch: 45 | FPR: 0.0005276116778051355
Batch: 46 | FPR: 0.00046356387910254034
Batch: 47 | FPR: 0.0003779646603042616
Batch: 48 | FPR: 0.0006804704967434626
Batch: 49 | FPR: 0.00039781203381402286


import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
performance_df.plot(figsize=(15,5), grid=True, color='salmon')
plt.suptitle('Bloom Filter False-Positive Rate Over Time', size=22)
plt.xlabel('Batch No.')
plt.ylabel('False Positive Rate (FPR)')
plt.show()

Bloom Filter¶

Data Stream¶

Load Data¶

Track New Users Through Bloom Filtering¶

Performance Over Time¶

End¶