Matheus Schmitz
LinkedIn
Github Portfolio
Create a blackbox to simulate an incoming stream of data
import random
random.seed(553)
import sys
from tqdm import tqdm
class BlackBox:
    """Simulates an incoming data stream by sampling lines from a text file."""

    def ask(self, file, num):
        """Return ``num`` lines drawn uniformly at random, with replacement.

        Args:
            file: Path of a text file; every line is one candidate item.
            num: Number of items to draw.

        Returns:
            A list of ``num`` strings, each with its trailing newline removed.

        Raises:
            ValueError: Propagated from ``random.randint`` when the file has
                no lines and ``num`` is positive.
        """
        # Close the handle deterministically instead of leaving it to the
        # garbage collector.
        with open(file, 'r') as f_in:
            lines = f_in.readlines()
        last_index = len(lines) - 1
        # Exactly one randint call per drawn item, in order, so a seeded run
        # yields the same sequence of users as before.
        return [lines[random.randint(0, last_index)].rstrip("\n") for _ in range(num)]
# --- Run parameters (hard-coded for this run) ---
# File the blackbox samples users from, and the CSV that receives the snapshots
input_filename = "publicdata/users.txt"
output_filename = "reservoir.csv"
# Number of users returned by each ask, and how many asks are made
stream_size = 300
num_of_asks = 30
# Maximum number of users held in the reservoir
reservoir_size = 100

# --- Run state ---
# Stream simulator
BB = BlackBox()
# Users currently held in the reservoir
reservoir = list()
# Global counter: sequence number of the most recent incoming user
sequence_number = 0
Keep track of the users held at reservoir indexes 0, 20, 40, 60 and 80
# Reservoir sampling over the simulated stream.
#
# The first `reservoir_size` users fill the reservoir. After that, the n-th
# user is admitted with probability reservoir_size / n and, if admitted,
# overwrites a uniformly chosen slot. After every 100th user a snapshot row
# (sequence number plus the users at the tracked slots) is written to the CSV.

# Reservoir positions recorded in every snapshot row; defined once so the
# header and the rows cannot drift apart.
tracked_indexes = (0, 20, 40, 60, 80)

# Open the output a single time for the whole run rather than re-opening it in
# append mode for each snapshot. The bytes written are unchanged: the header
# first, then each row preceded by a newline (so there is no trailing newline).
with open(output_filename, "w") as f_out:
    # Column headers: seqnum,0_id,20_id,40_id,60_id,80_id
    f_out.write("seqnum," + ",".join(str(idx) + "_id" for idx in tracked_indexes))

    # Iterate over the asks
    for ask_iteration in tqdm(range(num_of_asks)):
        stream_users = BB.ask(input_filename, stream_size)

        # Go over all users of this stream batch
        for user in stream_users:
            # Sequence number of the current user (1-based, across all asks)
            sequence_number += 1

            if len(reservoir) < reservoir_size:
                # Still filling up: every user is kept
                reservoir.append(user)
            elif random.random() < (float(reservoir_size) / float(sequence_number)):
                # The user was admitted; pick which slot gets overwritten.
                # random.random() is drawn before random.randint(), and only
                # once the reservoir is full, so seeded runs are reproducible.
                swap_idx = random.randint(0, reservoir_size - 1)
                reservoir[swap_idx] = user

            # Every 100 users, record the current state of the tracked slots
            if sequence_number % 100 == 0:
                row = [str(sequence_number)]
                row.extend(reservoir[idx].strip() for idx in tracked_indexes)
                f_out.write("\n" + ",".join(row))
100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:16<00:00, 1.79it/s]
# View the current status of the reservoir.
# This is a bare expression: an interactive session or notebook echoes its
# value, while a plain script run evaluates it and discards the result.
reservoir
['RAnRkCbg2IpKuw9kI52REA', 'cN5tSE-pbeHMpDclFcomkw', 'lOEQm5F6VkBA-hKoHwlJTA', 'swDv8K6G2as45aQiBsTWIw', 'KuouIPezf88cwmdWLx5wpA', 'A138Hn_kaiUpKrQ15RsfMg', 'aHiSoFbecen0MSHOYzMueA', '37jh6BgTy0yD2bRvlDXaqg', 'caTtxvZCjUmHkrCwbPN0DQ', 'D-swfXVezApoUv_GV2Ytig', 'BXlxwq_nrdXG3U2l9LbsXw', 'cF-rZscoEV5zV5lMTO-Icw', '3uQ69HLjnorXVXLq31MwZQ', 'zbSeUkk0sk6CXqcCz5umzA', 'cneJHTyQxoH1OFUgXDwMmw', 'uc1cZVUofrETCQWcWQGGNQ', 'kd_XkGViUDgPOMpCrXWX6A', '69V-c1iSlBv6ndIOO8ydQg', '64Zj7c9_SiDFvP8LXq7nLA', '-o-9Bw-dLc1dVMjSlAEGKg', 'qwBpDbANUmmmP4POrOpt4A', 'j0Y4htBL67yb7gyIo59wxw', 'P4v1zgRowuRO39RZCBMTHg', 'xs1MftEQfjbjMeUIm5Xicg', 'VwliPnWAKsEiKtOXSBHJVA', 'QLpgnKJi-rDTURSWgTy34w', 'rowQ1KZEGVhnwPQQpzEEgA', 'pLCX6K0M1DbYeUTtYVQaGg', 'wZ174oyhyvEu9ax60uPHCw', 'ZVqLpWf8VkkO6Dliv7Bqig', 'zBvxW_C-8AW-HY_lax3pFA', 'rxSKHOP765ro647q6mLUwg', 'j4kzSywHgrYLIe8I-najPg', 'W4mA4jrv2NdGiSnESgSzSg', 'ZU1t9G4LmpJg4UdEE__QgA', 'v9rKMNtQOkWR8dIoj7tuAw', 'I1LE5eCoRJkwmG9j3OrWpg', 'Z7J2VH9qHSH00q8ivAxM_w', '8sY4CB3PzBUvvwHMEPIXIg', 'QwGb_Yh4gkWBcvULzyraKg', 'pxqIvb3q6TLB7HYqCQf8nA', 'RXbkJ7Fw88i0SeTMMV-wZg', 'Z9j54kFixrbxo_gpytQEBQ', 'TZba2UvQfGBfb47AmtSHCQ', 'eLdnsLtaoIOh0wSol9HAOQ', 'fgA5A2k9mar_jmgpJHEY2w', 'R742SmIZuhpZ5HM8BJZ1lQ', 'vPwBMzuIBZWxh9PgJGsVQw', '-jW3kc6MI25lCdjYX43xoA', 'SFPS_oPyuP1jZBH1gUtZ5g', 'dlTXAIqEy5kKIoWRDIdWAg', 'FS5LYszMJm1VN6aUzA8Plg', 'yxM8-6K6MxjqacGIlpliQA', 'wqo5uQeR_FV2y9PzLntk2w', 'X7uvV7tD-n5z5X3zuB9_LA', 'LhYK85zdyADqg_IfKytpNw', 'PhOOyh8VMP6p3V318c136w', 'AO9-DnZ9fDLCy7I9oUeu5g', 'ptLXdIU29GmvOQbO7-1gVg', 'h2iZyeVr9o89v6Fdus83YA', 'KSKgMxI3X0Y47P2DkBqa_g', 'QsaTAVqVmDVHE6TVt_tOiw', 'uuXtyNBB0Iyo1ep4O6_PLw', 'y4avOR1SFUP6jq_ZKVxg7Q', 'N--0JJ4eM3zR4_OYrUukPw', 'Ee2IUa0a0zmnNQcOFSbgBw', 'cLZoKI_5woHUR86IdGnruA', 'ZlpAs_O4nYhIzXTi4u8tzw', 'c0fvMfnZx22IPeriu8pAYA', 'Ld3hEWtpOlWFbhKPDZoGhg', '40uYO5TP0PY-wol8rRA-xw', 'qrxlRngEZs2s7SwRBOr3Qg', 'swM1TW_7mrPk6UJnTa4SNw', 'LNNy7-P3Oy3kvic3PIfpXw', 'TdxJDq4nZUKWHaQykbHGuw', 'FdYMdmKZp5vDDLSOqHA-AA', 
'DySKpJO5I-QlXA7X8mq0ew', '3Wy5NCi5QPLq-45pYKCoiA', 'iOySchLVs7WEsy4OvblB8A', '6nf3HEDCCh4QzjNxxqOHuw', 'wxtqpfi92UACR_T4TwaRNQ', 'mDr1am5sq7k3j3B-jTFtQA', 'W4nxkeLQhwwgNcxaY_BE0Q', 'WiSuzrzM0-C-FT5XXATy6Q', '0WggYoVzoTHgmP8BR00x1w', 'RkUEBSBKJfkoCfAsU8jV9A', 'kJgY21Mv6YobTyr1xMh01g', 'npjrtfGnYD05xOLqzzmbYw', '2tSk90jdfEz8ZTeVTDn8Vw', 'wAF3-WPGH5YfMDHGeE29ug', 'q87CAHhJitc9hsjEVd7C_w', 'Qj2M0LooW7ayosx2fP6D8A', 'pJ-8OzZ2lQp5zTFcxcfGgA', 'ep_o6N7o3RO4b0mAo1HXTg', 'lH29fo_3m9vCHmgIo-_CEA', 'ic3XjmGdP28V8GoAS3Hazw', '5RGyR_eA1aqO_aomBhch1A', 'OHfchsRTYUvv7_LpMN3j8Q', '-uhJxd_sX5wds2yT6m7JsQ', 'TOn4EF7avRSfRDLJF8jNzw']
Matheus Schmitz
LinkedIn
Github Portfolio