Matheus Schmitz
LinkedIn
Github Portfolio
Create a blackbox that does continuous random sampling from an input file, generating a synthetic data stream.
import random
import sys
class BlackBox:
    def ask(self, file, num):
        # Read all lines from the input file
        lines = open(file, 'r').readlines()
        # Sample "num" random lines (with replacement), stripping trailing newlines
        users = [0 for i in range(num)]
        for i in range(num):
            users[i] = lines[random.randint(0, len(lines) - 1)].rstrip("\n")
        return users
# Blackbox
BB = BlackBox()
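For illustration, a call like the one below (a hypothetical usage sketch, assuming 'publicdata/users.txt' exists) returns a list of 5 user ids drawn uniformly at random, with replacement, from the file:
# Hypothetical usage sketch of the blackbox
sample_stream = BB.ask('publicdata/users.txt', 5)
print(len(sample_stream))  # 5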
import sys
import binascii
import random
import math
import time
import numpy as np
from tqdm import tqdm
import pandas as pd
# Read user inputs
input_filename = 'publicdata/users.txt'
stream_size = 100
num_of_asks = 50
output_filename = 'stream_sizes.csv'
# Hyperparameters to create hash functions
n_groups = 25
n_rows = 12
n_hash = int(n_groups * n_rows)
m = n_hash * n_groups
# Generate values for the hash functions
hash_params = [[random.randint(1, 100), random.randint(1, 100)] for _ in range(n_hash)]
def myhashs(user):
    # Encode user to int
    user_int = int(binascii.hexlify(user.encode('utf8')), 16)
    # Generate one hash value per hash function: h(x) = (a*x + b) % m
    result = []
    for f in hash_params:
        result.append((f[0] * user_int + f[1]) % m)
    return result
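As a quick sanity check (the user id below is made up for illustration), myhashs returns one bucket index per hash function, each in the range [0, m):
# Illustrative check with a hypothetical user id
example_hashes = myhashs('user_0')
print(len(example_hashes) == n_hash, all(0 <= h < m for h in example_hashes))  # True True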
def count_trailing_zeroes(binary_as_string):
    # Trailing zeroes = length of the string minus its length after stripping trailing '0's
    return len(str(binary_as_string)) - len(str(binary_as_string).rstrip('0'))
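For example (inputs chosen only for illustration), '10100' has 2 trailing zeroes and '101000' has 3; the Flajolet-Martin estimate for a single hash function is 2 raised to the largest such count observed over the stream.
print(count_trailing_zeroes('10100'), count_trailing_zeroes('101000'))  # 2 3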
# Before beginning to iterate, write the column headers
with open(output_filename, "w") as f_out:
    f_out.write("Time,Ground Truth,Estimation")
# Dataframe to keep track of the model's performance
performance_df = pd.DataFrame()
# Iterate over the asks
for ask_iteration in tqdm(range(num_of_asks)):
    # Randomize the stream size of each iteration
    stream_size = np.random.randint(low=100, high=1001)
    stream_users = BB.ask(input_filename, stream_size)
    # Set to store all users seen in this iteration
    seen_users_truth = set()
    # List to store the binary hash representations generated
    hash_bin = []
    # Go over all users for this stream
    for user in stream_users:
        # Add the user to the set of seen users
        seen_users_truth.add(user)
        # Hash the user into values
        hashed_idxs = myhashs(user)
        # Store all binary values for the current user (one value per hash function)
        iter_hash_bin = []
        # For the current user, get each hashed index and its binary representation
        for curr_idx in hashed_idxs:
            user_bin = bin(curr_idx)[2:]
            iter_hash_bin.append(user_bin)
        # Add the hashed values from the current user to the list of all hashes
        hash_bin.append(iter_hash_bin)
    # For each generated binary encoding of a hash value, estimate the cardinality based on the number of trailing zeroes
    estimated_size_per_hash = []
    # Iterate through all hash functions
    for curr_hash in range(n_hash):
        curr_hash_max_zeroes = 0
        # Then, for a given hash function, go over the binary encodings generated for all users
        for curr_user in range(len(hash_bin)):
            # Count the number of trailing zeroes for the current user with the current hash
            curr_user_max_zeroes = count_trailing_zeroes(hash_bin[curr_user][curr_hash])
            # If it is larger than the previous max value for the current hash, update the max value
            if curr_user_max_zeroes > curr_hash_max_zeroes:
                curr_hash_max_zeroes = curr_user_max_zeroes
        # Once the largest number of trailing zeroes for a given hash function has been found, calculate the estimated size and append it to the list of estimates
        estimated_size_per_hash.append(math.pow(2, curr_hash_max_zeroes))
    # Slice the estimated sizes into "n_groups" groups, then for each group calculate the group average
    group_avgs = []
    for group_idx in range(0, n_groups):
        group_sum = 0.0
        # Loop over the rows in the group
        for curr_row in range(0, n_rows):
            # Get the row index to be fetched from "estimated_size_per_hash", which has all estimates
            row_idx = group_idx * n_rows + curr_row
            # Fetch the estimate for the current row and add it to the sum of estimates for the current group
            group_sum += estimated_size_per_hash[row_idx]
        # Calculate the average for the current group and append it to the list of all group averages
        group_avg = group_sum / n_rows
        group_avgs.append(group_avg)
    # Get the median value from the group averages by sorting them and taking the middle number
    group_avgs = sorted(group_avgs)
    distinct_users_prediction = int(group_avgs[int(n_groups / 2)])
    # Then append the results to the output file
    with open(output_filename, "a") as f_out:
        f_out.write("\n" + str(ask_iteration) + "," + str(len(seen_users_truth)) + "," + str(distinct_users_prediction))
    # Update the performance_df
    performance_df = performance_df.append({'Distinct Users Truth': len(seen_users_truth),
                                            'Distinct Users Prediction': distinct_users_prediction},
                                           ignore_index=True)
# Drop any NaN values left over from initializing an empty dataframe
performance_df.dropna(inplace=True)
100%|██████████████████████████████████████████████████████████████████████████████████| 50/50 [00:40<00:00, 1.22it/s]
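To make the combine step concrete, here is a toy sketch of "mean within groups, median across groups" using made-up per-hash estimates (3 groups of 4 hashes, not values from the run above):
# Toy illustration of the group-average / median combine step (hypothetical values)
toy_estimates = [2, 4, 8, 4, 16, 4, 2, 8, 4, 8, 4, 2]
toy_group_avgs = [sum(toy_estimates[g*4:(g+1)*4]) / 4 for g in range(3)]  # [4.5, 7.5, 4.5]
toy_prediction = sorted(toy_group_avgs)[3 // 2]  # median of the group averages -> 4.5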
# Calculate the accuracy attained in each round
performance_df['Accuracy'] = performance_df.apply(lambda row: min(row[0],row[1])/max(row[0],row[1]), axis=1, result_type="expand")
performance_df
 | Distinct Users Prediction | Distinct Users Truth | Accuracy |
---|---|---|---|
0 | 683.0 | 896.0 | 0.762277 |
1 | 512.0 | 624.0 | 0.820513 |
2 | 160.0 | 116.0 | 0.725000 |
3 | 390.0 | 305.0 | 0.782051 |
4 | 683.0 | 724.0 | 0.943370 |
5 | 598.0 | 718.0 | 0.832869 |
6 | 683.0 | 634.0 | 0.928258 |
7 | 598.0 | 971.0 | 0.615860 |
8 | 278.0 | 229.0 | 0.823741 |
9 | 171.0 | 107.0 | 0.625731 |
10 | 512.0 | 659.0 | 0.776935 |
11 | 427.0 | 315.0 | 0.737705 |
12 | 619.0 | 721.0 | 0.858530 |
13 | 395.0 | 485.0 | 0.814433 |
14 | 214.0 | 221.0 | 0.968326 |
15 | 491.0 | 459.0 | 0.934827 |
16 | 598.0 | 768.0 | 0.778646 |
17 | 272.0 | 159.0 | 0.584559 |
18 | 405.0 | 346.0 | 0.854321 |
19 | 555.0 | 703.0 | 0.789474 |
20 | 299.0 | 336.0 | 0.889881 |
21 | 512.0 | 626.0 | 0.817891 |
22 | 598.0 | 824.0 | 0.725728 |
23 | 555.0 | 970.0 | 0.572165 |
24 | 512.0 | 737.0 | 0.694708 |
25 | 491.0 | 493.0 | 0.995943 |
26 | 363.0 | 411.0 | 0.883212 |
27 | 480.0 | 375.0 | 0.781250 |
28 | 512.0 | 583.0 | 0.878216 |
29 | 427.0 | 543.0 | 0.786372 |
30 | 640.0 | 830.0 | 0.771084 |
31 | 598.0 | 787.0 | 0.759848 |
32 | 683.0 | 956.0 | 0.714435 |
33 | 470.0 | 588.0 | 0.799320 |
34 | 192.0 | 102.0 | 0.531250 |
35 | 640.0 | 869.0 | 0.736479 |
36 | 640.0 | 777.0 | 0.823681 |
37 | 299.0 | 266.0 | 0.889632 |
38 | 598.0 | 740.0 | 0.808108 |
39 | 198.0 | 114.0 | 0.575758 |
40 | 214.0 | 183.0 | 0.855140 |
41 | 309.0 | 384.0 | 0.804688 |
42 | 491.0 | 360.0 | 0.733198 |
43 | 459.0 | 447.0 | 0.973856 |
44 | 427.0 | 603.0 | 0.708126 |
45 | 352.0 | 218.0 | 0.619318 |
46 | 683.0 | 947.0 | 0.721225 |
47 | 470.0 | 498.0 | 0.943775 |
48 | 619.0 | 901.0 | 0.687014 |
49 | 176.0 | 145.0 | 0.823864 |
# Find the average model accuracy
model_accuracy = performance_df['Accuracy'].mean() * 100
print(f'Model Accuracy: {model_accuracy:.1f} %')
Model Accuracy: 78.5 %
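For reference, the per-round accuracy is the symmetric ratio min(truth, estimate) / max(truth, estimate); recomputing it for round 0 from the table above:
# Accuracy for round 0, recomputed from the table above
print(min(683, 896) / max(683, 896))  # ~0.7623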