Matheus Schmitz
LinkedIn
Github Portfolio
Create a blackbox that does continuous random sampling from an input file, generating a synthetic data stream.
import random
import sys
class BlackBox:
    def ask(self, file, num):
        # Read all lines from the input file
        lines = open(file, 'r').readlines()
        # Sample "num" random lines (with replacement), stripping trailing newlines
        users = [0 for i in range(num)]
        for i in range(num):
            users[i] = lines[random.randint(0, len(lines) - 1)].rstrip("\n")
        return users
# Blackbox
BB = BlackBox()
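For illustration, a call like the one below (a hypothetical usage sketch, assuming 'publicdata/users.txt' exists) returns a list of 5 user ids drawn uniformly at random, with replacement, from the file:
# Hypothetical usage sketch of the blackbox
sample_stream = BB.ask('publicdata/users.txt', 5)
print(len(sample_stream))  # 5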
import sys
import binascii
import random
import math
import time
import numpy as np
from tqdm import tqdm
import pandas as pd
# Read user inputs
input_filename = 'publicdata/users.txt'
stream_size = 100
num_of_asks = 50
output_filename = 'stream_sizes.csv'
# Hyperparameters to create hash functions
n_groups = 25
n_rows = 12
n_hash = int(n_groups * n_rows)
m = n_hash * n_groups
# Generate values for the hash functions
hash_params = [[random.randint(1, 100), random.randint(1, 100)] for _ in range(n_hash)]
def myhashs(user):
    # Encode user to int
    user_int = int(binascii.hexlify(user.encode('utf8')), 16)
    # Generate one hash value per hash function: h(x) = (a*x + b) % m
    result = []
    for f in hash_params:
        result.append((f[0] * user_int + f[1]) % m)
    return result
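As a quick sanity check (the user id below is made up for illustration), myhashs returns one bucket index per hash function, each in the range [0, m):
# Illustrative check with a hypothetical user id
example_hashes = myhashs('user_0')
print(len(example_hashes) == n_hash, all(0 <= h < m for h in example_hashes))  # True True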
def count_trailing_zeroes(binary_as_string):
    # Trailing zeroes = length of the string minus its length after stripping trailing '0's
    return len(str(binary_as_string)) - len(str(binary_as_string).rstrip('0'))
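For example (inputs chosen only for illustration), '10100' has 2 trailing zeroes and '101000' has 3; the Flajolet-Martin estimate for a single hash function is 2 raised to the largest such count observed over the stream.
print(count_trailing_zeroes('10100'), count_trailing_zeroes('101000'))  # 2 3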
# Before beginning to iterate, write the column headers
with open(output_filename, "w") as f_out:
    f_out.write("Time,Ground Truth,Estimation")
# Dataframe to keep track of the model's performance
performance_df = pd.DataFrame()
# Iterate over the asks
for ask_iteration in tqdm(range(num_of_asks)):
    # Randomize the stream size of each iteration
    stream_size = np.random.randint(low=100, high=1001)
    stream_users = BB.ask(input_filename, stream_size)
    # Set to store all users seen in this iteration
    seen_users_truth = set()
    # List to store the binary hash representations generated
    hash_bin = []
    # Go over all users for this stream
    for user in stream_users:
        # Add the user to the set of seen users
        seen_users_truth.add(user)
        # Hash the user into values
        hashed_idxs = myhashs(user)
        # Store all binary values for the current user (one value per hash function)
        iter_hash_bin = []
        # For the current user, get each hashed index and its binary representation
        for curr_idx in hashed_idxs:
            user_bin = bin(curr_idx)[2:]
            iter_hash_bin.append(user_bin)
        # Add the hashed values from the current user to the list of all hashes
        hash_bin.append(iter_hash_bin)
    # For each generated binary encoding of a hash value, estimate the cardinality based on the number of trailing zeroes
    estimated_size_per_hash = []
    # Iterate through all hash functions
    for curr_hash in range(n_hash):
        curr_hash_max_zeroes = 0
        # Then, for a given hash function, go over the binary encodings generated for all users
        for curr_user in range(len(hash_bin)):
            # Count the number of trailing zeroes for the current user with the current hash
            curr_user_max_zeroes = count_trailing_zeroes(hash_bin[curr_user][curr_hash])
            # If it is larger than the previous max value for the current hash, update the max value
            if curr_user_max_zeroes > curr_hash_max_zeroes:
                curr_hash_max_zeroes = curr_user_max_zeroes
        # Once the largest number of trailing zeroes for a given hash function has been found, calculate the estimated size and append it to the list of estimates
        estimated_size_per_hash.append(math.pow(2, curr_hash_max_zeroes))
    # Slice the estimated sizes into "n_groups" groups, then for each group calculate the group average
    group_avgs = []
    for group_idx in range(0, n_groups):
        group_sum = 0.0
        # Loop over the rows in the group
        for curr_row in range(0, n_rows):
            # Get the row index to be fetched from "estimated_size_per_hash", which has all estimates
            row_idx = group_idx * n_rows + curr_row
            # Fetch the estimate for the current row and add it to the sum of estimates for the current group
            group_sum += estimated_size_per_hash[row_idx]
        # Calculate the average for the current group and append it to the list of all group averages
        group_avg = group_sum / n_rows
        group_avgs.append(group_avg)
    # Get the median value from the group averages by sorting them and taking the middle number
    group_avgs = sorted(group_avgs)
    distinct_users_prediction = int(group_avgs[int(n_groups / 2)])
    # Then append the results to the output file
    with open(output_filename, "a") as f_out:
        f_out.write("\n" + str(ask_iteration) + "," + str(len(seen_users_truth)) + "," + str(distinct_users_prediction))
    # Update the performance_df
    performance_df = performance_df.append({'Distinct Users Truth': len(seen_users_truth),
                                            'Distinct Users Prediction': distinct_users_prediction},
                                           ignore_index=True)
# Drop any NaN values left over from initializing an empty dataframe
performance_df.dropna(inplace=True)
100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:40<00:00, 1.22it/s]
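To make the combine step concrete, here is a toy sketch of "mean within groups, median across groups" using made-up per-hash estimates (3 groups of 4 hashes, not values from the run above):
# Toy illustration of the group-average / median combine step (hypothetical values)
toy_estimates = [2, 4, 8, 4, 16, 4, 2, 8, 4, 8, 4, 2]
toy_group_avgs = [sum(toy_estimates[g*4:(g+1)*4]) / 4 for g in range(3)]  # [4.5, 7.5, 4.5]
toy_prediction = sorted(toy_group_avgs)[3 // 2]  # median of the group averages -> 4.5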
# Calculate the accuracy attained in each round
performance_df['Accuracy'] = performance_df.apply(lambda row: min(row[0],row[1])/max(row[0],row[1]), axis=1, result_type="expand")
performance_df
 | Distinct Users Prediction | Distinct Users Truth | Accuracy |
---|---|---|---|
0 | 683.0 | 896.0 | 0.762277 |
1 | 512.0 | 624.0 | 0.820513 |
2 | 160.0 | 116.0 | 0.725000 |
3 | 390.0 | 305.0 | 0.782051 |
4 | 683.0 | 724.0 | 0.943370 |
5 | 598.0 | 718.0 | 0.832869 |
6 | 683.0 | 634.0 | 0.928258 |
7 | 598.0 | 971.0 | 0.615860 |
8 | 278.0 | 229.0 | 0.823741 |
9 | 171.0 | 107.0 | 0.625731 |
10 | 512.0 | 659.0 | 0.776935 |
11 | 427.0 | 315.0 | 0.737705 |
12 | 619.0 | 721.0 | 0.858530 |
13 | 395.0 | 485.0 | 0.814433 |
14 | 214.0 | 221.0 | 0.968326 |
15 | 491.0 | 459.0 | 0.934827 |
16 | 598.0 | 768.0 | 0.778646 |
17 | 272.0 | 159.0 | 0.584559 |
18 | 405.0 | 346.0 | 0.854321 |
19 | 555.0 | 703.0 | 0.789474 |
20 | 299.0 | 336.0 | 0.889881 |
21 | 512.0 | 626.0 | 0.817891 |
22 | 598.0 | 824.0 | 0.725728 |
23 | 555.0 | 970.0 | 0.572165 |
24 | 512.0 | 737.0 | 0.694708 |
25 | 491.0 | 493.0 | 0.995943 |
26 | 363.0 | 411.0 | 0.883212 |
27 | 480.0 | 375.0 | 0.781250 |
28 | 512.0 | 583.0 | 0.878216 |
29 | 427.0 | 543.0 | 0.786372 |
30 | 640.0 | 830.0 | 0.771084 |
31 | 598.0 | 787.0 | 0.759848 |
32 | 683.0 | 956.0 | 0.714435 |
33 | 470.0 | 588.0 | 0.799320 |
34 | 192.0 | 102.0 | 0.531250 |
35 | 640.0 | 869.0 | 0.736479 |
36 | 640.0 | 777.0 | 0.823681 |
37 | 299.0 | 266.0 | 0.889632 |
38 | 598.0 | 740.0 | 0.808108 |
39 | 198.0 | 114.0 | 0.575758 |
40 | 214.0 | 183.0 | 0.855140 |
41 | 309.0 | 384.0 | 0.804688 |
42 | 491.0 | 360.0 | 0.733198 |
43 | 459.0 | 447.0 | 0.973856 |
44 | 427.0 | 603.0 | 0.708126 |
45 | 352.0 | 218.0 | 0.619318 |
46 | 683.0 | 947.0 | 0.721225 |
47 | 470.0 | 498.0 | 0.943775 |
48 | 619.0 | 901.0 | 0.687014 |
49 | 176.0 | 145.0 | 0.823864 |
# Find the average model accuracy
model_accuracy = performance_df['Accuracy'].mean() * 100
print(f'Model Accuracy: {model_accuracy:.1f} %')
Model Accuracy: 78.5 %
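For reference, the per-round accuracy is the symmetric ratio min(truth, estimate) / max(truth, estimate); recomputing it for round 0 from the table above:
# Accuracy for round 0, recomputed from the table above
print(min(683, 896) / max(683, 896))  # ~0.7623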