Matheus Schmitz
LinkedIn
Github Portfolio
Create a black box that does continuous random sampling from an input file, generating a synthetic data stream.
import random
import sys

class BlackBox:
    def ask(self, file, num):
        # Sample "num" random lines (users) from the input file
        lines = open(file, 'r').readlines()
        users = [0 for i in range(num)]
        for i in range(num):
            users[i] = lines[random.randint(0, len(lines) - 1)].rstrip("\n")
        return users
# Blackbox
BB = BlackBox()
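As a quick sanity check (a sketch, assuming publicdata/users.txt exists with one user id per line), the black box can be asked for a small sample:
# Hypothetical smoke test: draw 5 random user ids from the input file (repeats are possible)
sample = BB.ask('publicdata/users.txt', 5)
print(sample)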
import sys
import binascii
import random
import math
import time
import numpy as np
from tqdm import tqdm
import pandas as pd
# Read user inputs
input_filename = 'publicdata/users.txt'
stream_size = 100
num_of_asks = 50
output_filename = 'stream_sizes.csv'
# Hyperparameters to create hash functions
n_groups = 25
n_rows = 12
n_hash = int(n_groups * n_rows)
m = n_hash * n_groups
# Generate values for the hash functions
hash_params = [[random.randint(1, 100), random.randint(1, 100)] for _ in range(n_hash)]
def myhashs(user):
    # Encode user to int
    user_int = int(binascii.hexlify(user.encode('utf8')), 16)
    # Generate hash values
    result = []
    for f in hash_params:
        result.append((f[0] * user_int + f[1]) % m)
    return result
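For illustration (the exact values depend on the randomly drawn hash_params), hashing a single made-up user id yields one bucket index in [0, m) per hash function, 300 in total:
# 'user_0001' is a hypothetical id used only for this example
example_hashes = myhashs('user_0001')
print(len(example_hashes), example_hashes[:5])  # 300, followed by the first five bucket indices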
def count_trailing_zeroes(binary_as_string):
    return len(str(binary_as_string)) - len(str(binary_as_string).rstrip('0'))
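A quick check of the helper on an arbitrary value: 40 in binary is '101000', which has three trailing zeroes.
print(count_trailing_zeroes(bin(40)[2:]))  # 3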
# Before beginning to iterate, write the column headers
with open(output_filename, "w") as f_out:
    f_out.write("Time,Ground Truth,Estimation")
# Dataframe to keep track of the model's performance
performance_df = pd.DataFrame()
# Iterate over the asks
for ask_iteration in tqdm(range(num_of_asks)):

    # Randomize the stream size of each iteration
    stream_size = np.random.randint(low=100, high=1001)
    stream_users = BB.ask(input_filename, stream_size)

    # Set to store all users seen in this iteration
    seen_users_truth = set()

    # List to store the hash binary representations generated
    hash_bin = []

    # Go over all users for this stream
    for user in stream_users:

        # Add the user to the set of seen users
        seen_users_truth.add(user)

        # Hash the user into values
        hashed_idxs = myhashs(user)

        # Store all binary values for the current user (one value per hash function)
        iter_hash_bin = []

        # For the current user, get each hashed index and its binary representation
        for curr_idx in hashed_idxs:
            user_bin = bin(curr_idx)[2:]
            iter_hash_bin.append(user_bin)

        # Add the hashed values from the current user to the list of all hashes
        hash_bin.append(iter_hash_bin)

    # For each generated binary encoding of hash values, estimate the count from the number of trailing zeroes
    estimated_size_per_hash = []

    # Iterate through all hash functions
    for curr_hash in range(n_hash):
        curr_hash_max_zeroes = 0

        # Then, for a given hash function, go over the binary encodings generated for all users
        for curr_user in range(len(hash_bin)):

            # Count the number of trailing zeroes for the current user with the current hash
            curr_user_max_zeroes = count_trailing_zeroes(hash_bin[curr_user][curr_hash])

            # If it is larger than the previous max value for the current hash, update the max value
            if curr_user_max_zeroes > curr_hash_max_zeroes:
                curr_hash_max_zeroes = curr_user_max_zeroes

        # Once the largest number of trailing zeroes for a given hash function has been found, calculate the estimated size and append it to the list of estimates
        estimated_size_per_hash.append(math.pow(2, curr_hash_max_zeroes))

    # Slice the estimated sizes into "n_groups" groups, then for each group calculate the group average
    group_avgs = []
    for group_idx in range(0, n_groups):
        group_sum = 0.0

        # Loop over the rows in the group
        for curr_row in range(0, n_rows):

            # Get the row index to be fetched from "estimated_size_per_hash", which holds all estimates
            row_idx = group_idx * n_rows + curr_row

            # Fetch the estimate for the current row and add it to the sum of estimates for the current group
            group_sum += estimated_size_per_hash[row_idx]

        # Calculate the average for the current group and append it to the list of all group averages
        group_avg = group_sum / n_rows
        group_avgs.append(group_avg)

    # Get the median value from the group averages by sorting them and taking the middle number
    group_avgs = sorted(group_avgs)
    distinct_users_prediction = int(group_avgs[int(n_groups / 2)])

    # Then append the results to the output file
    with open(output_filename, "a") as f_out:
        f_out.write("\n" + str(ask_iteration) + "," + str(len(seen_users_truth)) + "," + str(distinct_users_prediction))

    # Update the performance_df
    performance_df = performance_df.append({'Distinct Users Truth': len(seen_users_truth),
                                            'Distinct Users Prediction': distinct_users_prediction},
                                           ignore_index=True)

# Drop any NaN rows left over from the initially empty dataframe
performance_df.dropna(inplace=True)
100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:40<00:00, 1.22it/s]
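To make the combine step concrete, here is a minimal sketch of the same group-average-then-median reduction on made-up per-hash estimates (three groups of two rows, purely for readability):
# Hypothetical per-hash estimates, each being 2^max_trailing_zeroes for one hash function
demo_estimates = [2.0, 4.0, 8.0, 8.0, 16.0, 2.0]
demo_groups, demo_rows = 3, 2
demo_avgs = sorted(sum(demo_estimates[g * demo_rows:(g + 1) * demo_rows]) / demo_rows
                   for g in range(demo_groups))
# Group averages are [3.0, 8.0, 9.0]; taking the middle one damps outlier hash functions
print(int(demo_avgs[int(demo_groups / 2)]))  # 8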
# Calculate the accuracy attained in each round
performance_df['Accuracy'] = performance_df.apply(
    lambda row: min(row['Distinct Users Truth'], row['Distinct Users Prediction'])
                / max(row['Distinct Users Truth'], row['Distinct Users Prediction']),
    axis=1)
performance_df
| | Distinct Users Prediction | Distinct Users Truth | Accuracy |
|---|---|---|---|
| 0 | 683.0 | 896.0 | 0.762277 |
| 1 | 512.0 | 624.0 | 0.820513 |
| 2 | 160.0 | 116.0 | 0.725000 |
| 3 | 390.0 | 305.0 | 0.782051 |
| 4 | 683.0 | 724.0 | 0.943370 |
| 5 | 598.0 | 718.0 | 0.832869 |
| 6 | 683.0 | 634.0 | 0.928258 |
| 7 | 598.0 | 971.0 | 0.615860 |
| 8 | 278.0 | 229.0 | 0.823741 |
| 9 | 171.0 | 107.0 | 0.625731 |
| 10 | 512.0 | 659.0 | 0.776935 |
| 11 | 427.0 | 315.0 | 0.737705 |
| 12 | 619.0 | 721.0 | 0.858530 |
| 13 | 395.0 | 485.0 | 0.814433 |
| 14 | 214.0 | 221.0 | 0.968326 |
| 15 | 491.0 | 459.0 | 0.934827 |
| 16 | 598.0 | 768.0 | 0.778646 |
| 17 | 272.0 | 159.0 | 0.584559 |
| 18 | 405.0 | 346.0 | 0.854321 |
| 19 | 555.0 | 703.0 | 0.789474 |
| 20 | 299.0 | 336.0 | 0.889881 |
| 21 | 512.0 | 626.0 | 0.817891 |
| 22 | 598.0 | 824.0 | 0.725728 |
| 23 | 555.0 | 970.0 | 0.572165 |
| 24 | 512.0 | 737.0 | 0.694708 |
| 25 | 491.0 | 493.0 | 0.995943 |
| 26 | 363.0 | 411.0 | 0.883212 |
| 27 | 480.0 | 375.0 | 0.781250 |
| 28 | 512.0 | 583.0 | 0.878216 |
| 29 | 427.0 | 543.0 | 0.786372 |
| 30 | 640.0 | 830.0 | 0.771084 |
| 31 | 598.0 | 787.0 | 0.759848 |
| 32 | 683.0 | 956.0 | 0.714435 |
| 33 | 470.0 | 588.0 | 0.799320 |
| 34 | 192.0 | 102.0 | 0.531250 |
| 35 | 640.0 | 869.0 | 0.736479 |
| 36 | 640.0 | 777.0 | 0.823681 |
| 37 | 299.0 | 266.0 | 0.889632 |
| 38 | 598.0 | 740.0 | 0.808108 |
| 39 | 198.0 | 114.0 | 0.575758 |
| 40 | 214.0 | 183.0 | 0.855140 |
| 41 | 309.0 | 384.0 | 0.804688 |
| 42 | 491.0 | 360.0 | 0.733198 |
| 43 | 459.0 | 447.0 | 0.973856 |
| 44 | 427.0 | 603.0 | 0.708126 |
| 45 | 352.0 | 218.0 | 0.619318 |
| 46 | 683.0 | 947.0 | 0.721225 |
| 47 | 470.0 | 498.0 | 0.943775 |
| 48 | 619.0 | 901.0 | 0.687014 |
| 49 | 176.0 | 145.0 | 0.823864 |
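As a spot check of the accuracy formula against the first row above:
# min(prediction, truth) / max(prediction, truth) for ask 0
print(min(683.0, 896.0) / max(683.0, 896.0))  # 0.762276...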
# Find the average model accuracy
model_accuracy = performance_df['Accuracy'].mean() * 100
print(f'Model Accuracy: {model_accuracy:.1f} %')
Model Accuracy: 78.5 %
Matheus Schmitz
LinkedIn
Github Portfolio