Implementation of a batch-based clustering algorithm inspired by the Bradley-Fayyad-Reina (BFR) algorithm, which summarizes points into a Discard Set (DS), a Compression Set (CS), and a Retained Set (RS).
# Imports
import warnings
# Silence library warnings before importing the rest of the dependencies
warnings.filterwarnings("ignore")
import sys
import time
import random
import math
import numpy as np
from sklearn import cluster
# Track the time the algorithm takes
start_time = time.time()
# Define input paths
input_file = "./publicdata/clustering_dataset.txt"
n_cluster = 10
output_file = "predictions.txt"
# Read the input file (without using spark!)
with open(input_file, 'r') as f_in:
dataset = np.array(f_in.readlines())
print(f'Dataset size: {dataset.shape[0]}')
Dataset size: 322312
# Convert the dataset to a dict for easy access to each sample's information by its id
dataset_dict = {}
for row in dataset:
row_data = row.replace("\n","").split(",")
dataset_dict[int(row_data[0])] = {'datapoint_id': int(row_data[0]),
'true_cluster': int(row_data[1]),
'features': [float(feat) for feat in row_data[2:]]}
# View sample
dataset_dict[np.random.choice(list(dataset_dict.keys()))]
{'datapoint_id': 308450, 'true_cluster': 6, 'features': [-15.620078012764438, 30.046543370390356, 51.30861319371247, 59.19104682368959, -45.46259766274144, -46.32738916055233, -38.88754664939637, -29.587487734159385, -40.75516794238417, -34.46822234404671]}
# Get the dataset's dimensionality
dataset_dimensionality = len(dataset_dict[0]['features'])
print(f'Dataset Dimensionality: {dataset_dimensionality}')
# Calculate the Mahalanobis threshold to be used
mahalanobis_threshold = 2 * np.sqrt(dataset_dimensionality)
print(f'Mahalanobis Threshold: {mahalanobis_threshold}')
Dataset Dimensionality: 10
Mahalanobis Threshold: 6.324555320336759
# From the dataset size, compute the sample size for each of the 5 rounds
sample_size = math.ceil(len(dataset)*0.2)
print(f'Sample Size: {sample_size}')
# List all sample_ids which have not yet been clustered
unused_ids = set(dataset_dict.keys())
Sample Size: 64463
# Initialize the objects for the different sets
DS_CLUSTERS = dict()
DISCARD_SET_stats = dict()
COMPRESSION_SET_stats = dict()
RETAINED_SET = list()
# Sample the first round of data to initialize the algorithm
round_1_samples = random.sample(list(unused_ids), sample_size)  # random.sample needs a sequence, not a set
for sample_id in round_1_samples:
unused_ids.remove(sample_id)
# Create the K-Means training data from the round_1_samples
init_sample_features = []
for sample_id in round_1_samples:
init_sample_features.append(dataset_dict[sample_id]['features'])
X_train = np.array(init_sample_features)
# Run K-Means (from sklearn) with a large K (here 10x the number of input clusters) so that outliers land in tiny clusters
kmeans = cluster.KMeans(n_clusters=10*n_cluster)
predicted_clusters = kmeans.fit_predict(X_train)
# Assign points to clusters
init_clusters = dict()
for cluster_id, sampled_id in zip(predicted_clusters, round_1_samples):
# If there is no key for the current cluster id, create one
    if init_clusters.get(cluster_id) is None:
init_clusters[cluster_id] = []
init_clusters[cluster_id].append(sampled_id)
# Move lone points to the RETAINED_SET
for cluster_id, clustered_points in init_clusters.items():
if len(clustered_points) == 1:
RETAINED_SET.append(clustered_points[0])
round_1_samples.remove(clustered_points[0])
# Check which points ended up in the retained set
RETAINED_SET
[87905, 177195, 97437, 293977, 231763, 181374, 62431, 273890, 111817, 14724, 48647, 263667, 251550, 156334, 275282, 93860]
# Cluster the round_1_samples after having moved outliers to the RS
train_sample_features = []
for sample_id in round_1_samples:
train_sample_features.append(dataset_dict[sample_id]['features'])
X_train = np.array(train_sample_features)
kmeans = cluster.KMeans(n_clusters=n_cluster)
predicted_clusters = kmeans.fit_predict(X_train)
# Assign points to DS clusters
for cluster_id, sampled_id in zip(predicted_clusters, round_1_samples):
# If there is no key for the current cluster id, create one
    if DS_CLUSTERS.get(cluster_id) is None:
DS_CLUSTERS[cluster_id] = []
DS_CLUSTERS[cluster_id].append(sampled_id)
# Calculate the cluster statistics for the clusters generated from the initialization round
for key, value in DS_CLUSTERS.items():
DISCARD_SET_stats[key] = {'ids_in_cluster': []}
features_matrix = []
for datapoint in value:
DISCARD_SET_stats[key]['ids_in_cluster'].append(datapoint)
features_matrix.append(dataset_dict[datapoint]['features'])
features_matrix = np.array(features_matrix)
DISCARD_SET_stats[key]['N'] = len(DISCARD_SET_stats[key]['ids_in_cluster'])
DISCARD_SET_stats[key]['sum'] = features_matrix.sum(axis=0)
DISCARD_SET_stats[key]['sumsq'] = np.sum(features_matrix**2, axis=0)
DISCARD_SET_stats[key]['stdev'] = np.sqrt(
(DISCARD_SET_stats[key]['sumsq']/DISCARD_SET_stats[key]['N']) -
(np.square(DISCARD_SET_stats[key]['sum']) / DISCARD_SET_stats[key]['N']**2)
)
DISCARD_SET_stats[key]['centroid'] = DISCARD_SET_stats[key]['sum']/DISCARD_SET_stats[key]['N']
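Note that each DS cluster is kept only through its sufficient statistics N, SUM and SUMSQ, so the per-dimension variance is SUMSQ/N - (SUM/N)^2. The snippet below is just an illustrative sanity check of that identity; the summarize helper is a hypothetical name and is not used by the pipeline.
# Illustrative sanity check (hypothetical helper, not part of the pipeline):
# the (N, SUM, SUMSQ) summary reproduces the per-dimension mean and
# population standard deviation of the raw points.
def summarize(points):
    points = np.asarray(points, dtype=float)
    N = points.shape[0]
    SUM = points.sum(axis=0)
    SUMSQ = np.sum(points**2, axis=0)
    centroid = SUM / N
    stdev = np.sqrt(SUMSQ / N - (SUM / N)**2)
    return centroid, stdev

_check_points = np.random.randn(50, 10)
_centroid, _stdev = summarize(_check_points)
assert np.allclose(_centroid, _check_points.mean(axis=0))
assert np.allclose(_stdev, _check_points.std(axis=0))  # population std (ddof=0)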
# Run KMeans on the RETAINED_SET and generate COMPRESSION_SET clusters
RS_sample_features = []
for sample_id in RETAINED_SET:
RS_sample_features.append(dataset_dict[sample_id]['features'])
X_train = np.array(RS_sample_features)
# If there were any samples in the RS, cluster them
RS_clusters = {}
if X_train.shape[0] > 0:
kmeans = cluster.KMeans(n_clusters=min(7*n_cluster, int(1 + (X_train.shape[0]/2))))
predicted_clusters = kmeans.fit_predict(X_train)
for cluster_id, sampled_id in zip(predicted_clusters, RETAINED_SET):
# If there is no key for the current cluster id, create one
        if RS_clusters.get(cluster_id) is None:
            RS_clusters[cluster_id] = list()
        # Then append the RS sample to its newly found CS cluster
RS_clusters[cluster_id].append(sampled_id)
# Calculate the cluster statistics for the clusters generated from the initialization round
for key, value in RS_clusters.items():
# Only clusters with more than one sample in them go to the CS, those with only 1 sample remain RS
if len(value) > 1:
COMPRESSION_SET_stats[key] = {'ids_in_cluster': []}
features_matrix = []
for datapoint in value:
COMPRESSION_SET_stats[key]['ids_in_cluster'].append(datapoint)
features_matrix.append(dataset_dict[datapoint]['features'])
features_matrix = np.array(features_matrix)
COMPRESSION_SET_stats[key]['N'] = len(COMPRESSION_SET_stats[key]['ids_in_cluster'])
COMPRESSION_SET_stats[key]['sum'] = features_matrix.sum(axis=0)
COMPRESSION_SET_stats[key]['sumsq'] = np.sum(features_matrix**2, axis=0)
COMPRESSION_SET_stats[key]['stdev'] = np.sqrt(
(COMPRESSION_SET_stats[key]['sumsq']/COMPRESSION_SET_stats[key]['N']) -
(np.square(COMPRESSION_SET_stats[key]['sum']) / COMPRESSION_SET_stats[key]['N']**2)
)
COMPRESSION_SET_stats[key]['centroid'] = COMPRESSION_SET_stats[key]['sum']/COMPRESSION_SET_stats[key]['N']
# Clean the samples from the retained set which have been summarized
for datapoint in value:
RETAINED_SET.remove(datapoint)
# Objects to tally values to be outputted
num_DS_points = 0
num_CS_clusters = 0
num_CS_points = 0
num_RS_points = 0
# Tally the values for outputting
for key, value in DISCARD_SET_stats.items():
num_DS_points += value['N']
for key, value in COMPRESSION_SET_stats.items():
num_CS_clusters += 1
num_CS_points += value['N']
num_RS_points = len(RETAINED_SET)
# Write the results for the initialization round (round 1)
f_out = open(output_file, "w")
f_out.write("The intermediate results:")
f_out.write("\n" + "Round 1: " + str(num_DS_points) + "," + str(num_CS_clusters) + "," + str(num_CS_points) + "," + str(num_RS_points))
print(f"Round 1 | DS Points: {str(num_DS_points)} | CS Clusters: {str(num_CS_clusters)} | CS Points: {str(num_CS_points)} | RS Points: {str(num_RS_points)}")
Round 1 | DS Points: 64447 | CS Clusters: 5 | CS Points: 12 | RS Points: 4
def point_cluster_mahalanobis_distance(datapoint_idx, cluster_stats):
stdev = cluster_stats['stdev']
centroid = cluster_stats['centroid']
mahalanobis_distance = 0
for dim in range(dataset_dimensionality):
mahalanobis_distance += ((dataset_dict[datapoint_idx]['features'][dim] - centroid[dim]) / stdev[dim])**2
mahalanobis_distance = np.sqrt(mahalanobis_distance)
return mahalanobis_distance
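For reference, the same normalized distance sqrt(sum_i ((x_i - c_i) / sigma_i)^2) can be computed without an explicit loop over dimensions. This is a hypothetical vectorized equivalent (not the function used by the rounds below), assuming the same dataset_dict and cluster_stats structures.
# Hypothetical vectorized equivalent of point_cluster_mahalanobis_distance
# (same inputs and result, just NumPy broadcasting instead of a Python loop).
def point_cluster_mahalanobis_distance_vec(datapoint_idx, cluster_stats):
    features = np.asarray(dataset_dict[datapoint_idx]['features'])
    normalized_diff = (features - cluster_stats['centroid']) / cluster_stats['stdev']
    return np.sqrt(np.sum(normalized_diff**2))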
def intercluster_mahalanobis_distance(left_cluster_stats, right_cluster_stats): # takes the dict value
stdev1, centroid1 = left_cluster_stats['stdev'], left_cluster_stats['centroid']
stdev2, centroid2 = right_cluster_stats['stdev'], right_cluster_stats['centroid']
# Calculate the mahalanobis distance between the clusters
left_cluster_dist, right_cluster_dist = 0, 0
for dim in range(dataset_dimensionality):
left_cluster_dist += ((centroid1[dim] - centroid2[dim]) / stdev2[dim])**2
right_cluster_dist += ((centroid2[dim] - centroid1[dim]) / stdev1[dim])**2
left_cluster_dist = np.sqrt(left_cluster_dist)
right_cluster_dist = np.sqrt(right_cluster_dist)
return min(left_cluster_dist, right_cluster_dist)
def merge_CS_CS(key1, key2):
COMPRESSION_SET_stats[key1]['ids_in_cluster'].extend(COMPRESSION_SET_stats[key2]['ids_in_cluster'])
COMPRESSION_SET_stats[key1]['N'] += COMPRESSION_SET_stats[key2]['N']
for dim in range(dataset_dimensionality):
COMPRESSION_SET_stats[key1]['sum'][dim] += COMPRESSION_SET_stats[key2]['sum'][dim]
COMPRESSION_SET_stats[key1]['sumsq'][dim] += COMPRESSION_SET_stats[key2]['sumsq'][dim]
COMPRESSION_SET_stats[key1]['stdev'] = np.sqrt(
(COMPRESSION_SET_stats[key1]['sumsq'] / COMPRESSION_SET_stats[key1]['N']) -
((np.square(COMPRESSION_SET_stats[key1]['sum']) / COMPRESSION_SET_stats[key1]['N']**2))
)
COMPRESSION_SET_stats[key1]['centroid'] = COMPRESSION_SET_stats[key1]['sum'] / COMPRESSION_SET_stats[key1]['N']
# Remove the merged cluster
COMPRESSION_SET_stats.pop(key2, None)
def merge_CS_DS(keyCS, keyDS):
DISCARD_SET_stats[keyDS]['ids_in_cluster'].extend(COMPRESSION_SET_stats[keyCS]['ids_in_cluster'])
DISCARD_SET_stats[keyDS]['N'] += COMPRESSION_SET_stats[keyCS]['N']
for dim in range(dataset_dimensionality):
DISCARD_SET_stats[keyDS]['sum'][dim] += COMPRESSION_SET_stats[keyCS]['sum'][dim]
DISCARD_SET_stats[keyDS]['sumsq'][dim] += COMPRESSION_SET_stats[keyCS]['sumsq'][dim]
DISCARD_SET_stats[keyDS]['stdev'] = np.sqrt(
(DISCARD_SET_stats[keyDS]['sumsq'] / DISCARD_SET_stats[keyDS]['N']) -
((np.square(DISCARD_SET_stats[keyDS]['sum']) / DISCARD_SET_stats[keyDS]['N']**2))
)
DISCARD_SET_stats[keyDS]['centroid'] = DISCARD_SET_stats[keyDS]['sum'] / DISCARD_SET_stats[keyDS]['N']
# Remove the merged cluster
COMPRESSION_SET_stats.pop(keyCS, None)
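Both merge functions rely on N, SUM and SUMSQ being additive, so merging two summaries is exact. Below is a small standalone check of that property, illustrative only and built on made-up random points.
# Illustrative check (not part of the pipeline): merging two (N, SUM, SUMSQ)
# summaries by element-wise addition matches summarizing the union of points.
_a = np.random.randn(30, 10)
_b = np.random.randn(20, 10)
_merged_N = _a.shape[0] + _b.shape[0]
_merged_sum = _a.sum(axis=0) + _b.sum(axis=0)
_merged_sumsq = np.sum(_a**2, axis=0) + np.sum(_b**2, axis=0)
_union = np.vstack([_a, _b])
assert np.allclose(_merged_sum, _union.sum(axis=0))
assert np.allclose(_merged_sumsq, np.sum(_union**2, axis=0))
assert np.allclose(np.sqrt(_merged_sumsq / _merged_N - (_merged_sum / _merged_N)**2),
                   _union.std(axis=0))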
### Initialization is finished, run the regular iterations ###
for curr_round in range(2,6):
    # Load the samples for the round about to begin
if curr_round < 5:
        iteration_samples = random.sample(list(unused_ids), sample_size)  # random.sample needs a sequence, not a set
for sample_id in iteration_samples:
unused_ids.remove(sample_id)
# The last iteration might have a slightly different number of samples
else:
iteration_samples = list(unused_ids.copy())
for sample_id in iteration_samples:
unused_ids.remove(sample_id)
##################################
### Assign Samples to DS/CS/RS ###
##################################
    # Store the cluster to which each new sample will be assigned, so a batch update can be done after all samples are assigned
DS_clusters_ids = {key:[] for key in DISCARD_SET_stats.keys()}
CS_clusters_ids = {key:[] for key in COMPRESSION_SET_stats.keys()}
    # Iterate over each sample among those drawn for this round
for sample_id in iteration_samples:
# Default to assuming points are in the RETAINED_SET
assigned_cluster = -1
# Track the lowest distance between the sample and all clusters
lowest_dist = mahalanobis_threshold
# Find the DISCARD_SET cluster closest to the current sample
for cluster_id, cluster_stats in DISCARD_SET_stats.items():
mahalanobis_distance = point_cluster_mahalanobis_distance(sample_id, cluster_stats)
# If the distance is under the mahalanobis_threshold and also the lowest distance yet found, update the point's cluster
if mahalanobis_distance < lowest_dist:
assigned_cluster = cluster_id
lowest_dist = mahalanobis_distance
# Update the statistics of the cluster to which the point is assigned
if assigned_cluster != -1:
DS_clusters_ids[assigned_cluster].append(sample_id)
# If the sample could not be assigned to any cluster in the DS, try assigning it to the CS
else:
# Find the COMPRESSION_SET cluster closest to the current sample
for cluster_id, cluster_stats in COMPRESSION_SET_stats.items():
mahalanobis_distance = point_cluster_mahalanobis_distance(sample_id, cluster_stats)
# If the distance is under the mahalanobis_threshold and also the lowest distance yet found, update the point's cluster
if mahalanobis_distance < lowest_dist:
assigned_cluster = cluster_id
lowest_dist = mahalanobis_distance
# Update the statistics of the cluster to which the point is assigned
if assigned_cluster != -1:
CS_clusters_ids[assigned_cluster].append(sample_id)
# If the algorithm also failed to assign the sample to a CS cluster, send it to the RS
else:
RETAINED_SET.append(sample_id)
###############################
### Update DS & CS Clusters ###
###############################
### Update DS ###
for cluster_idx, new_datapoints in DS_clusters_ids.items():
# If there are no new datapoints for the current cluster, skip
if len(new_datapoints) == 0:
continue
# Append the datapoints to the DS while extracting their features
features_matrix = []
for datapoint in new_datapoints:
DISCARD_SET_stats[cluster_idx]['ids_in_cluster'].append(datapoint)
features_matrix.append(dataset_dict[datapoint]['features'])
features_matrix = np.array(features_matrix)
# Update N
DISCARD_SET_stats[cluster_idx]['N'] += len(new_datapoints)
# Calculate SUM and SUMSQ for the new_datapoints
sum_new_datapoints = features_matrix.sum(axis=0)
sumsq_new_datapoints = np.sum(features_matrix**2, axis=0)
# Update the DISCARD_SET_stats with the new SUM and SUMSQ
for dim in range(dataset_dimensionality):
DISCARD_SET_stats[cluster_idx]['sum'][dim] += sum_new_datapoints[dim]
DISCARD_SET_stats[cluster_idx]['sumsq'][dim] += sumsq_new_datapoints[dim]
# Recalculate the Standard Deviation and Centroids
DISCARD_SET_stats[cluster_idx]['stdev'] = np.sqrt(
(DISCARD_SET_stats[cluster_idx]['sumsq'] / DISCARD_SET_stats[cluster_idx]['N']) -
(np.square(DISCARD_SET_stats[cluster_idx]['sum']) / DISCARD_SET_stats[cluster_idx]['N']**2)
)
DISCARD_SET_stats[cluster_idx]['centroid'] = DISCARD_SET_stats[cluster_idx]['sum'] / DISCARD_SET_stats[cluster_idx]['N']
### Update CS ###
for cluster_idx, new_datapoints in CS_clusters_ids.items():
# If there are no new datapoints for the current cluster, skip
if len(new_datapoints) == 0:
continue
# Append the datapoints to the CS while extracting their features
features_matrix = []
for datapoint in new_datapoints:
COMPRESSION_SET_stats[cluster_idx]['ids_in_cluster'].append(datapoint)
features_matrix.append(dataset_dict[datapoint]['features'])
features_matrix = np.array(features_matrix)
# Update N
COMPRESSION_SET_stats[cluster_idx]['N'] += len(new_datapoints)
# Calculate SUM and SUMSQ for the new_datapoints
sum_new_datapoints = features_matrix.sum(axis=0)
sumsq_new_datapoints = np.sum(features_matrix**2, axis=0)
# Update the COMPRESSION_SET_stats with the new SUM and SUMSQ
for dim in range(dataset_dimensionality):
COMPRESSION_SET_stats[cluster_idx]['sum'][dim] += sum_new_datapoints[dim]
COMPRESSION_SET_stats[cluster_idx]['sumsq'][dim] += sumsq_new_datapoints[dim]
# Recalculate the Standard Deviation and Centroids
COMPRESSION_SET_stats[cluster_idx]['stdev'] = np.sqrt(
(COMPRESSION_SET_stats[cluster_idx]['sumsq'] / COMPRESSION_SET_stats[cluster_idx]['N']) -
(np.square(COMPRESSION_SET_stats[cluster_idx]['sum']) / COMPRESSION_SET_stats[cluster_idx]['N']**2)
)
COMPRESSION_SET_stats[cluster_idx]['centroid'] = COMPRESSION_SET_stats[cluster_idx]['sum'] / COMPRESSION_SET_stats[cluster_idx]['N']
######################################
### Create New CSs from RS Samples ###
######################################
# Run KMeans on the RETAINED_SET and generate COMPRESSION_SET clusters
RS_sample_features = []
for sample_id in RETAINED_SET:
RS_sample_features.append(dataset_dict[sample_id]['features'])
X_train = np.array(RS_sample_features)
RS_clusters = {}
if X_train.shape[0] > 0:
kmeans = cluster.KMeans(n_clusters=min(7*n_cluster, int(1 + (X_train.shape[0]/2))))
predicted_clusters = kmeans.fit_predict(X_train)
for cluster_id, sampled_id in zip(predicted_clusters, RETAINED_SET):
# If there is no key for the current cluster id, create one
            if RS_clusters.get(cluster_id) is None:
                RS_clusters[cluster_id] = list()
            # Then append the RS sample to its newly found CS cluster
RS_clusters[cluster_id].append(sampled_id)
    # Calculate the cluster statistics for the CS clusters generated from the RS in this round
for key, value in RS_clusters.items():
# Only clusters with more than one sample in them go to the CS, those with only 1 sample remain RS
if len(value) > 1:
            # Find the next cluster index to use for the COMPRESSION_SET_stats
try:
CS_stats_next_key = max(COMPRESSION_SET_stats.keys()) + 1
# If there are no keys from which to get the max value, then start at key 0
            except ValueError:
CS_stats_next_key = 0
COMPRESSION_SET_stats[CS_stats_next_key] = {'ids_in_cluster': []}
features_matrix = []
for datapoint in value:
COMPRESSION_SET_stats[CS_stats_next_key]['ids_in_cluster'].append(datapoint)
features_matrix.append(dataset_dict[datapoint]['features'])
features_matrix = np.array(features_matrix)
COMPRESSION_SET_stats[CS_stats_next_key]['N'] = len(COMPRESSION_SET_stats[CS_stats_next_key]['ids_in_cluster'])
COMPRESSION_SET_stats[CS_stats_next_key]['sum'] = features_matrix.sum(axis=0)
COMPRESSION_SET_stats[CS_stats_next_key]['sumsq'] = np.sum(features_matrix**2, axis=0)
COMPRESSION_SET_stats[CS_stats_next_key]['stdev'] = np.sqrt(
(COMPRESSION_SET_stats[CS_stats_next_key]['sumsq'] / COMPRESSION_SET_stats[CS_stats_next_key]['N']) -
(np.square(COMPRESSION_SET_stats[CS_stats_next_key]['sum']) / COMPRESSION_SET_stats[CS_stats_next_key]['N']**2)
)
COMPRESSION_SET_stats[CS_stats_next_key]['centroid'] = COMPRESSION_SET_stats[CS_stats_next_key]['sum'] / COMPRESSION_SET_stats[CS_stats_next_key]['N']
# Clean the samples from the retained set which have been summarized
for datapoint in value:
RETAINED_SET.remove(datapoint)
#################################################
### Merge CSs Below the Mahalanobis Threshold ###
#################################################
close_CSs = dict()
for key1, value1 in COMPRESSION_SET_stats.items():
# Default to assuming there is no other CS cluster close by
assigned_cluster = None
# Track the lowest distance between the sample and all clusters
lowest_dist = mahalanobis_threshold
# Compare each CS cluster to all other CS clusters
for key2, value2 in COMPRESSION_SET_stats.items():
            # Do not compare a cluster to itself
if key1 == key2:
continue
intercluster_dist = intercluster_mahalanobis_distance(value1, value2)
# If the intercluster distance is below the threshold, make the pair a candidate for merging
if intercluster_dist < lowest_dist:
assigned_cluster = key2
lowest_dist = intercluster_dist
# Once all pairwise comparisons were done for a given cluster (key1) store the results
close_CSs[key1] = assigned_cluster
# Once all closest CS clusters were found, merge them and update the COMPRESSION_SET_stats
for CS_cluster1, CS_cluster2 in close_CSs.items():
if CS_cluster1 in COMPRESSION_SET_stats and CS_cluster2 in COMPRESSION_SET_stats and CS_cluster1 != CS_cluster2:
merge_CS_CS(CS_cluster1, CS_cluster2)
######################################################
### At the Final Iteration Merge CSs to Nearby DSs ###
######################################################
# Check if it is the last iteration
if curr_round == 5:
close_DSs = dict()
for CS_key, CS_value in COMPRESSION_SET_stats.items():
# Default to assuming there is no DS cluster close by
assigned_cluster = None
# Track the lowest distance between the sample and all clusters
lowest_dist = mahalanobis_threshold
for DS_key, DS_value in DISCARD_SET_stats.items():
intercluster_dist = intercluster_mahalanobis_distance(CS_value, DS_value)
# If the intercluster distance is below the threshold, make the pair a candidate for merging
if intercluster_dist < lowest_dist:
assigned_cluster = DS_key
lowest_dist = intercluster_dist
# Once all pairwise comparisons were done for a given cluster (CS_key) store the results
close_DSs[CS_key] = assigned_cluster
        # Once the closest DS cluster was found for each CS cluster, merge them and update the DISCARD_SET_stats
for CS_cluster, DS_cluster in close_DSs.items():
if CS_cluster in COMPRESSION_SET_stats and DS_cluster in DISCARD_SET_stats:
merge_CS_DS(CS_cluster, DS_cluster)
###############################
### Output Round Statistics ###
###############################
# Objects to tally values to be outputted
num_DS_points = 0
num_CS_clusters = 0
num_CS_points = 0
num_RS_points = 0
# Tally the values for outputting
for key, value in DISCARD_SET_stats.items():
num_DS_points += value['N']
for key, value in COMPRESSION_SET_stats.items():
num_CS_clusters += 1
num_CS_points += value['N']
num_RS_points = len(RETAINED_SET)
# Add the current round's results to the output file
f_out.write("\n" + "Round " + str(curr_round) + ": " + str(num_DS_points) + "," + str(num_CS_clusters) + "," + str(num_CS_points) + "," + str(num_RS_points))
print(f"Round {curr_round} | DS Points: {str(num_DS_points)} | CS Clusters: {str(num_CS_clusters)} | CS Points: {str(num_CS_points)} | RS Points: {str(num_RS_points)}")
Round 2 | DS Points: 128886 | CS Clusters: 10 | CS Points: 34 | RS Points: 6
Round 3 | DS Points: 193326 | CS Clusters: 12 | CS Points: 59 | RS Points: 4
Round 4 | DS Points: 257770 | CS Clusters: 10 | CS Points: 80 | RS Points: 2
Round 5 | DS Points: 322216 | CS Clusters: 7 | CS Points: 94 | RS Points: 2
# Go over all sets (DS, CS, RS) and extract each datapoint's final cluster
sample_id_to_cluster = dict()
for key, value in DISCARD_SET_stats.items():
for datapoint in value['ids_in_cluster']:
sample_id_to_cluster[datapoint] = key
for key, value in COMPRESSION_SET_stats.items():
for datapoint in value['ids_in_cluster']:
sample_id_to_cluster[datapoint] = -1
for datapoint in RETAINED_SET:
sample_id_to_cluster[datapoint] = -1
# Write the clustering results
f_out.write("\n" + "\n" + "The clustering results:")
for key, value in sorted(sample_id_to_cluster.items()):
f_out.write("\n" + str(key) + "," + str(value))
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Create a dictionary with the true clusters and their samples
truth_clusters = {-1:[], 0:[], 1:[], 2:[], 3:[], 4:[], 5:[], 6:[], 7:[], 8:[], 9:[]}
for key, value in dataset_dict.items():
truth_clusters[value['true_cluster']].append(key)
# Find the matching cluster IDs between the truth and the prediction
# (because the points might be correctly clustered but the cluster number is most likely not the same)
cluster_mapping = {}
for true_cluster, true_members in truth_clusters.items():
# The outliers won't have a matching cluster
if true_cluster == -1:
continue
    # Find the matching predicted cluster by the size of the intersection with the true cluster's members
largest_intersection = 0
for pred_cluster, value in DISCARD_SET_stats.items():
pred_members = value['ids_in_cluster']
# Check the number of points in both clusters (true and pred)
intersection_size = len(set(true_members) & set(pred_members))
# If this is the largest match size thus far (for the true cluster), update the index matching
if intersection_size > largest_intersection:
cluster_mapping[true_cluster] = pred_cluster
largest_intersection = intersection_size
cluster_mapping
{0: 2, 1: 3, 2: 5, 3: 6, 4: 8, 5: 9, 6: 1, 7: 7, 8: 0, 9: 4}
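The greedy size-of-intersection matching above works because the clusters here are nearly disjoint; if a more robust matching were needed, an optimal one-to-one assignment could be computed instead. Below is a sketch using the Hungarian algorithm (it assumes SciPy is installed; hungarian_mapping is a hypothetical name and is not used in the rest of this notebook).
# Optional alternative (assumes scipy is available): optimal one-to-one matching
# between true and predicted clusters, maximizing the total intersection size.
from scipy.optimize import linear_sum_assignment

true_ids = [c for c in truth_clusters if c != -1]
pred_ids = list(DISCARD_SET_stats.keys())
overlap = np.array([[len(set(truth_clusters[t]) & set(DISCARD_SET_stats[p]['ids_in_cluster']))
                     for p in pred_ids]
                    for t in true_ids])
row_ind, col_ind = linear_sum_assignment(overlap, maximize=True)
hungarian_mapping = {true_ids[r]: pred_ids[c] for r, c in zip(row_ind, col_ind)}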
# DataFrame to store metrics for each cluster pair
metrics_df = pd.DataFrame(columns = ['true_cluster', 'pred_cluster', 'true_size', 'pred_size',
'TP', 'FP', 'TN', 'FN',
'Accuracy', 'Precision', 'Recall', 'F1 Score'])
# Calculate TP, FP, TN, FN for each match
total_samples = len(dataset_dict)
for true_cluster, pred_cluster in cluster_mapping.items():
true_members = truth_clusters[true_cluster]
pred_members = DISCARD_SET_stats[pred_cluster]['ids_in_cluster']
TP = len(set(true_members) & set(pred_members))
FP = len(set(pred_members) - set(true_members))
FN = len(set(true_members) - set(pred_members))
all_negative_preds = total_samples - len(pred_members)
TN = all_negative_preds - FN
accuracy = (TP + TN) / (TP + FP + FN + TN)
precision = TP / (TP + FP)
recall = TP / (TP + FN)
F1 = (2 * precision * recall) / (precision + recall)
scores = {'true_cluster':true_cluster,
'pred_cluster':pred_cluster,
'true_size': len(true_members),
'pred_size': len(pred_members),
'TP':TP,
'FP':FP,
'TN':TN,
'FN':FN,
'Accuracy':accuracy,
'Precision':precision,
'Recall':recall,
'F1 Score':F1}
    metrics_df = pd.concat([metrics_df, pd.DataFrame([scores])], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
# Collect the items left in the COMPRESSION_SET and RETAINED_SET as outliers
outliers = list(RETAINED_SET)
for value in COMPRESSION_SET_stats.values():
    outliers.extend(value['ids_in_cluster'])
# Then add to the metrics the scores related to outliers
true_members = truth_clusters[-1]
pred_members = outliers
TP = len(set(true_members) & set(pred_members))
FP = len(set(pred_members) - set(true_members))
FN = len(set(true_members) - set(pred_members))
all_negative_preds = total_samples - len(pred_members)
TN = all_negative_preds - FN
# Need to handle the possibility of there being no outliers, in which case I'll consider a perfect score
try:
    accuracy = (TP + TN) / (TP + FP + FN + TN)
except ZeroDivisionError:
    accuracy = 1.0
try:
    precision = TP / (TP + FP)
except ZeroDivisionError:
    precision = 1.0
try:
    recall = TP / (TP + FN)
except ZeroDivisionError:
    recall = 1.0
F1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
scores = {'true_cluster':-1,
'pred_cluster':-1,
'true_size': len(true_members),
'pred_size': len(pred_members),
'TP':TP,
'FP':FP,
'TN':TN,
'FN':FN,
'Accuracy':accuracy,
'Precision':precision,
'Recall':recall,
'F1 Score':F1}
metrics_df = pd.concat([metrics_df, pd.DataFrame([scores])], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
metrics_df
 | true_cluster | pred_cluster | true_size | pred_size | TP | FP | TN | FN | Accuracy | Precision | Recall | F1 Score |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 2 | 37859 | 37859 | 37859 | 0 | 284453 | 0 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
1 | 1 | 3 | 23786 | 23786 | 23786 | 0 | 298526 | 0 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
2 | 2 | 5 | 37871 | 37871 | 37871 | 0 | 284441 | 0 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
3 | 3 | 6 | 30267 | 30270 | 30267 | 3 | 292042 | 0 | 0.999991 | 0.999901 | 1.000000 | 0.999950 |
4 | 4 | 8 | 39655 | 39654 | 39654 | 0 | 282657 | 1 | 0.999997 | 1.000000 | 0.999975 | 0.999987 |
5 | 5 | 9 | 31210 | 31209 | 31209 | 0 | 291102 | 1 | 0.999997 | 1.000000 | 0.999968 | 0.999984 |
6 | 6 | 1 | 32289 | 32289 | 32289 | 0 | 290023 | 0 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
7 | 7 | 7 | 35167 | 35169 | 35167 | 2 | 287143 | 0 | 0.999994 | 0.999943 | 1.000000 | 0.999972 |
8 | 8 | 0 | 22370 | 22370 | 22370 | 0 | 299942 | 0 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
9 | 9 | 4 | 31738 | 31739 | 31738 | 1 | 290573 | 0 | 0.999997 | 0.999968 | 1.000000 | 0.999984 |
10 | -1 | -1 | 100 | 96 | 96 | 0 | 322212 | 4 | 0.999988 | 1.000000 | 0.960000 | 0.979592 |
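Besides the per-cluster confusion counts above, the overall agreement between the predicted and true labels could also be summarized with standard clustering indices. The sketch below uses scikit-learn; the resulting scores are not reproduced here since they were not computed in this run.
# Optional overall agreement scores (scikit-learn); outliers keep label -1 on
# both the true and the predicted side.
from sklearn import metrics as skl_metrics

sorted_ids = sorted(sample_id_to_cluster)
labels_true = [dataset_dict[i]['true_cluster'] for i in sorted_ids]
labels_pred = [sample_id_to_cluster[i] for i in sorted_ids]
print('Adjusted Rand Index:', skl_metrics.adjusted_rand_score(labels_true, labels_pred))
print('Normalized Mutual Information:', skl_metrics.normalized_mutual_info_score(labels_true, labels_pred))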
# Get model performance (macro-averaged = each class/cluster has the same weight, regardless of samples)
plot_metrics = {}
plot_metrics['Precision'] = metrics_df['Precision'].mean()
plot_metrics['Recall'] = metrics_df['Recall'].mean()
plot_metrics['F1 Score'] = metrics_df['F1 Score'].mean()
plot_metrics['Accuracy'] = metrics_df['Accuracy'].mean()
# plots
plt.style.use('fivethirtyeight')
sns.set(font_scale=1.5)
plt.figure(figsize=(15, 5))
sns.barplot(y=list(plot_metrics.keys()), x=list(plot_metrics.values()))
plt.suptitle('Macro-Averaged Model Metrics')
plt.gca().set_xticklabels(['{:.0f}%'.format(x*100) for x in plt.gca().get_xticks()])
plt.show()
print(f'Duration: {time.time() - start_time:.0f} seconds.')
Duration: 186 seconds.
Matheus Schmitz
LinkedIn
Github Portfolio