Implementation of a batch-based clustering algorithm inspired by the Bradley-Fayyad-Reina (BFR) algorithm, which summarizes points into a Discard Set (DS), a Compression Set (CS), and a Retained Set (RS).
# Imports
import warnings
# Silence library warnings before importing the rest of the dependencies
warnings.filterwarnings("ignore")
import sys
import time
import random
import math
import numpy as np
from sklearn import cluster
# Track the time the algorithm takes
start_time = time.time()
# Define input paths
input_file = "./publicdata/clustering_dataset.txt"
n_cluster = 10
output_file = "predictions.txt"
# Read the input file (without using spark!)
with open(input_file, 'r') as f_in:
dataset = np.array(f_in.readlines())
print(f'Dataset size: {dataset.shape[0]}')
Dataset size: 322312
# Convert the dataset to a dict for easy access to each sample's information by its id
dataset_dict = {}
for row in dataset:
row_data = row.replace("\n","").split(",")
dataset_dict[int(row_data[0])] = {'datapoint_id': int(row_data[0]),
'true_cluster': int(row_data[1]),
'features': [float(feat) for feat in row_data[2:]]}
# View sample
dataset_dict[np.random.choice(list(dataset_dict.keys()))]
{'datapoint_id': 308450, 'true_cluster': 6, 'features': [-15.620078012764438, 30.046543370390356, 51.30861319371247, 59.19104682368959, -45.46259766274144, -46.32738916055233, -38.88754664939637, -29.587487734159385, -40.75516794238417, -34.46822234404671]}
# Get the dataset's dimensionality
dataset_dimensionality = len(dataset_dict[0]['features'])
print(f'Dataset Dimensionality: {dataset_dimensionality}')
# Calculate the Mahalanobis threshold to be used
mahalanobis_threshold = 2 * np.sqrt(dataset_dimensionality)
print(f'Mahalanobis Threshold: {mahalanobis_threshold}')
Dataset Dimensionality: 10
Mahalanobis Threshold: 6.324555320336759
# From the dataset size, compute the sample size for each of the 5 rounds
sample_size = math.ceil(len(dataset)*0.2)
print(f'Sample Size: {sample_size}')
# List all sample_ids which have not yet been clustered
unused_ids = set(dataset_dict.keys())
Sample Size: 64463
# Initialize the objects for the different sets
DS_CLUSTERS = dict()
DISCARD_SET_stats = dict()
COMPRESSION_SET_stats = dict()
RETAINED_SET = list()
# Sample the first round of data to initialize the algorithm
round_1_samples = random.sample(list(unused_ids), sample_size)  # random.sample needs a sequence, not a set
for sample_id in round_1_samples:
unused_ids.remove(sample_id)
# Create the K-Means training data from the round_1_samples
init_sample_features = []
for sample_id in round_1_samples:
init_sample_features.append(dataset_dict[sample_id]['features'])
X_train = np.array(init_sample_features)
# Run K-Means (from sklearn) with a large K (here 10x the number of input clusters) so that outliers land in tiny clusters
kmeans = cluster.KMeans(n_clusters=10*n_cluster)
predicted_clusters = kmeans.fit_predict(X_train)
# Assign points to clusters
init_clusters = dict()
for cluster_id, sampled_id in zip(predicted_clusters, round_1_samples):
# If there is no key for the current cluster id, create one
    if init_clusters.get(cluster_id) is None:
init_clusters[cluster_id] = []
init_clusters[cluster_id].append(sampled_id)
# Move lone points to the RETAINED_SET
for cluster_id, clustered_points in init_clusters.items():
if len(clustered_points) == 1:
RETAINED_SET.append(clustered_points[0])
round_1_samples.remove(clustered_points[0])
# Check which points ended up in the retained set
RETAINED_SET
[87905, 177195, 97437, 293977, 231763, 181374, 62431, 273890, 111817, 14724, 48647, 263667, 251550, 156334, 275282, 93860]
# Cluster the round_1_samples after having moved outliers to the RS
train_sample_features = []
for sample_id in round_1_samples:
train_sample_features.append(dataset_dict[sample_id]['features'])
X_train = np.array(train_sample_features)
kmeans = cluster.KMeans(n_clusters=n_cluster)
predicted_clusters = kmeans.fit_predict(X_train)
# Assign points to DS clusters
for cluster_id, sampled_id in zip(predicted_clusters, round_1_samples):
# If there is no key for the current cluster id, create one
    if DS_CLUSTERS.get(cluster_id) is None:
DS_CLUSTERS[cluster_id] = []
DS_CLUSTERS[cluster_id].append(sampled_id)
# Calculate the cluster statistics for the clusters generated from the initialization round
for key, value in DS_CLUSTERS.items():
DISCARD_SET_stats[key] = {'ids_in_cluster': []}
features_matrix = []
for datapoint in value:
DISCARD_SET_stats[key]['ids_in_cluster'].append(datapoint)
features_matrix.append(dataset_dict[datapoint]['features'])
features_matrix = np.array(features_matrix)
DISCARD_SET_stats[key]['N'] = len(DISCARD_SET_stats[key]['ids_in_cluster'])
DISCARD_SET_stats[key]['sum'] = features_matrix.sum(axis=0)
DISCARD_SET_stats[key]['sumsq'] = np.sum(features_matrix**2, axis=0)
DISCARD_SET_stats[key]['stdev'] = np.sqrt(
(DISCARD_SET_stats[key]['sumsq']/DISCARD_SET_stats[key]['N']) -
(np.square(DISCARD_SET_stats[key]['sum']) / DISCARD_SET_stats[key]['N']**2)
)
DISCARD_SET_stats[key]['centroid'] = DISCARD_SET_stats[key]['sum']/DISCARD_SET_stats[key]['N']
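Note that each DS cluster is kept only through its sufficient statistics N, SUM and SUMSQ, so the per-dimension variance is SUMSQ/N - (SUM/N)^2. The snippet below is just an illustrative sanity check of that identity; the summarize helper is a hypothetical name and is not used by the pipeline.
# Illustrative sanity check (hypothetical helper, not part of the pipeline):
# the (N, SUM, SUMSQ) summary reproduces the per-dimension mean and
# population standard deviation of the raw points.
def summarize(points):
    points = np.asarray(points, dtype=float)
    N = points.shape[0]
    SUM = points.sum(axis=0)
    SUMSQ = np.sum(points**2, axis=0)
    centroid = SUM / N
    stdev = np.sqrt(SUMSQ / N - (SUM / N)**2)
    return centroid, stdev

_check_points = np.random.randn(50, 10)
_centroid, _stdev = summarize(_check_points)
assert np.allclose(_centroid, _check_points.mean(axis=0))
assert np.allclose(_stdev, _check_points.std(axis=0))  # population std (ddof=0)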
# Run KMeans on the RETAINED_SET and generate COMPRESSION_SET clusters
RS_sample_features = []
for sample_id in RETAINED_SET:
RS_sample_features.append(dataset_dict[sample_id]['features'])
X_train = np.array(RS_sample_features)
# If there were any samples in the RS, cluster them
RS_clusters = {}
if X_train.shape[0] > 0:
kmeans = cluster.KMeans(n_clusters=min(7*n_cluster, int(1 + (X_train.shape[0]/2))))
predicted_clusters = kmeans.fit_predict(X_train)
for cluster_id, sampled_id in zip(predicted_clusters, RETAINED_SET):
# If there is no key for the current cluster id, create one
        if RS_clusters.get(cluster_id) is None:
            RS_clusters[cluster_id] = list()
        # Then append the RS sample to its newly found CS cluster
RS_clusters[cluster_id].append(sampled_id)
# Calculate the cluster statistics for the clusters generated from the initialization round
for key, value in RS_clusters.items():
# Only clusters with more than one sample in them go to the CS, those with only 1 sample remain RS
if len(value) > 1:
COMPRESSION_SET_stats[key] = {'ids_in_cluster': []}
features_matrix = []
for datapoint in value:
COMPRESSION_SET_stats[key]['ids_in_cluster'].append(datapoint)
features_matrix.append(dataset_dict[datapoint]['features'])
features_matrix = np.array(features_matrix)
COMPRESSION_SET_stats[key]['N'] = len(COMPRESSION_SET_stats[key]['ids_in_cluster'])
COMPRESSION_SET_stats[key]['sum'] = features_matrix.sum(axis=0)
COMPRESSION_SET_stats[key]['sumsq'] = np.sum(features_matrix**2, axis=0)
COMPRESSION_SET_stats[key]['stdev'] = np.sqrt(
(COMPRESSION_SET_stats[key]['sumsq']/COMPRESSION_SET_stats[key]['N']) -
(np.square(COMPRESSION_SET_stats[key]['sum']) / COMPRESSION_SET_stats[key]['N']**2)
)
COMPRESSION_SET_stats[key]['centroid'] = COMPRESSION_SET_stats[key]['sum']/COMPRESSION_SET_stats[key]['N']
# Clean the samples from the retained set which have been summarized
for datapoint in value:
RETAINED_SET.remove(datapoint)
# Objects to tally values to be outputted
num_DS_points = 0
num_CS_clusters = 0
num_CS_points = 0
num_RS_points = 0
# Tally the values for outputting
for key, value in DISCARD_SET_stats.items():
num_DS_points += value['N']
for key, value in COMPRESSION_SET_stats.items():
num_CS_clusters += 1
num_CS_points += value['N']
num_RS_points = len(RETAINED_SET)
# Write the results for the initialization round (round 1)
f_out = open(output_file, "w")
f_out.write("The intermediate results:")
f_out.write("\n" + "Round 1: " + str(num_DS_points) + "," + str(num_CS_clusters) + "," + str(num_CS_points) + "," + str(num_RS_points))
print(f"Round 1 | DS Points: {str(num_DS_points)} | CS Clusters: {str(num_CS_clusters)} | CS Points: {str(num_CS_points)} | RS Points: {str(num_RS_points)}")
Round 1 | DS Points: 64447 | CS Clusters: 5 | CS Points: 12 | RS Points: 4
def point_cluster_mahalanobis_distance(datapoint_idx, cluster_stats):
stdev = cluster_stats['stdev']
centroid = cluster_stats['centroid']
mahalanobis_distance = 0
for dim in range(dataset_dimensionality):
mahalanobis_distance += ((dataset_dict[datapoint_idx]['features'][dim] - centroid[dim]) / stdev[dim])**2
mahalanobis_distance = np.sqrt(mahalanobis_distance)
return mahalanobis_distance
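For reference, the same normalized distance sqrt(sum_i ((x_i - c_i) / sigma_i)^2) can be computed without an explicit loop over dimensions. This is a hypothetical vectorized equivalent (not the function used by the rounds below), assuming the same dataset_dict and cluster_stats structures.
# Hypothetical vectorized equivalent of point_cluster_mahalanobis_distance
# (same inputs and result, just NumPy broadcasting instead of a Python loop).
def point_cluster_mahalanobis_distance_vec(datapoint_idx, cluster_stats):
    features = np.asarray(dataset_dict[datapoint_idx]['features'])
    normalized_diff = (features - cluster_stats['centroid']) / cluster_stats['stdev']
    return np.sqrt(np.sum(normalized_diff**2))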
def intercluster_mahalanobis_distance(left_cluster_stats, right_cluster_stats): # takes the dict value
stdev1, centroid1 = left_cluster_stats['stdev'], left_cluster_stats['centroid']
stdev2, centroid2 = right_cluster_stats['stdev'], right_cluster_stats['centroid']
# Calculate the mahalanobis distance between the clusters
left_cluster_dist, right_cluster_dist = 0, 0
for dim in range(dataset_dimensionality):
left_cluster_dist += ((centroid1[dim] - centroid2[dim]) / stdev2[dim])**2
right_cluster_dist += ((centroid2[dim] - centroid1[dim]) / stdev1[dim])**2
left_cluster_dist = np.sqrt(left_cluster_dist)
right_cluster_dist = np.sqrt(right_cluster_dist)
return min(left_cluster_dist, right_cluster_dist)
def merge_CS_CS(key1, key2):
COMPRESSION_SET_stats[key1]['ids_in_cluster'].extend(COMPRESSION_SET_stats[key2]['ids_in_cluster'])
COMPRESSION_SET_stats[key1]['N'] += COMPRESSION_SET_stats[key2]['N']
for dim in range(dataset_dimensionality):
COMPRESSION_SET_stats[key1]['sum'][dim] += COMPRESSION_SET_stats[key2]['sum'][dim]
COMPRESSION_SET_stats[key1]['sumsq'][dim] += COMPRESSION_SET_stats[key2]['sumsq'][dim]
COMPRESSION_SET_stats[key1]['stdev'] = np.sqrt(
(COMPRESSION_SET_stats[key1]['sumsq'] / COMPRESSION_SET_stats[key1]['N']) -
((np.square(COMPRESSION_SET_stats[key1]['sum']) / COMPRESSION_SET_stats[key1]['N']**2))
)
COMPRESSION_SET_stats[key1]['centroid'] = COMPRESSION_SET_stats[key1]['sum'] / COMPRESSION_SET_stats[key1]['N']
# Remove the merged cluster
COMPRESSION_SET_stats.pop(key2, None)
def merge_CS_DS(keyCS, keyDS):
DISCARD_SET_stats[keyDS]['ids_in_cluster'].extend(COMPRESSION_SET_stats[keyCS]['ids_in_cluster'])
DISCARD_SET_stats[keyDS]['N'] += COMPRESSION_SET_stats[keyCS]['N']
for dim in range(dataset_dimensionality):
DISCARD_SET_stats[keyDS]['sum'][dim] += COMPRESSION_SET_stats[keyCS]['sum'][dim]
DISCARD_SET_stats[keyDS]['sumsq'][dim] += COMPRESSION_SET_stats[keyCS]['sumsq'][dim]
DISCARD_SET_stats[keyDS]['stdev'] = np.sqrt(
(DISCARD_SET_stats[keyDS]['sumsq'] / DISCARD_SET_stats[keyDS]['N']) -
((np.square(DISCARD_SET_stats[keyDS]['sum']) / DISCARD_SET_stats[keyDS]['N']**2))
)
DISCARD_SET_stats[keyDS]['centroid'] = DISCARD_SET_stats[keyDS]['sum'] / DISCARD_SET_stats[keyDS]['N']
# Remove the merged cluster
COMPRESSION_SET_stats.pop(keyCS, None)
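Both merge functions rely on N, SUM and SUMSQ being additive, so merging two summaries is exact. Below is a small standalone check of that property, illustrative only and built on made-up random points.
# Illustrative check (not part of the pipeline): merging two (N, SUM, SUMSQ)
# summaries by element-wise addition matches summarizing the union of points.
_a = np.random.randn(30, 10)
_b = np.random.randn(20, 10)
_merged_N = _a.shape[0] + _b.shape[0]
_merged_sum = _a.sum(axis=0) + _b.sum(axis=0)
_merged_sumsq = np.sum(_a**2, axis=0) + np.sum(_b**2, axis=0)
_union = np.vstack([_a, _b])
assert np.allclose(_merged_sum, _union.sum(axis=0))
assert np.allclose(_merged_sumsq, np.sum(_union**2, axis=0))
assert np.allclose(np.sqrt(_merged_sumsq / _merged_N - (_merged_sum / _merged_N)**2),
                   _union.std(axis=0))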
### Initialization is finished, run the regular iterations ###
for curr_round in range(2,6):
    # Load the samples for the round about to begin
if curr_round < 5:
        iteration_samples = random.sample(list(unused_ids), sample_size)  # random.sample needs a sequence, not a set
for sample_id in iteration_samples:
unused_ids.remove(sample_id)
# The last iteration might have a slightly different number of samples
else:
iteration_samples = list(unused_ids.copy())
for sample_id in iteration_samples:
unused_ids.remove(sample_id)
##################################
### Assign Samples to DS/CS/RS ###
##################################
    # Store the cluster to which each new sample will be assigned, so a batch update can be done after all samples are assigned
DS_clusters_ids = {key:[] for key in DISCARD_SET_stats.keys()}
CS_clusters_ids = {key:[] for key in COMPRESSION_SET_stats.keys()}
    # Iterate over each sample among those drawn for this round
for sample_id in iteration_samples:
# Default to assuming points are in the RETAINED_SET
assigned_cluster = -1
# Track the lowest distance between the sample and all clusters
lowest_dist = mahalanobis_threshold
# Find the DISCARD_SET cluster closest to the current sample
for cluster_id, cluster_stats in DISCARD_SET_stats.items():
mahalanobis_distance = point_cluster_mahalanobis_distance(sample_id, cluster_stats)
# If the distance is under the mahalanobis_threshold and also the lowest distance yet found, update the point's cluster
if mahalanobis_distance < lowest_dist:
assigned_cluster = cluster_id
lowest_dist = mahalanobis_distance
# Update the statistics of the cluster to which the point is assigned
if assigned_cluster != -1:
DS_clusters_ids[assigned_cluster].append(sample_id)
# If the sample could not be assigned to any cluster in the DS, try assigning it to the CS
else:
# Find the COMPRESSION_SET cluster closest to the current sample
for cluster_id, cluster_stats in COMPRESSION_SET_stats.items():
mahalanobis_distance = point_cluster_mahalanobis_distance(sample_id, cluster_stats)
# If the distance is under the mahalanobis_threshold and also the lowest distance yet found, update the point's cluster
if mahalanobis_distance < lowest_dist:
assigned_cluster = cluster_id
lowest_dist = mahalanobis_distance
# Update the statistics of the cluster to which the point is assigned
if assigned_cluster != -1:
CS_clusters_ids[assigned_cluster].append(sample_id)
# If the algorithm also failed to assign the sample to a CS cluster, send it to the RS
else:
RETAINED_SET.append(sample_id)
###############################
### Update DS & CS Clusters ###
###############################
### Update DS ###
for cluster_idx, new_datapoints in DS_clusters_ids.items():
# If there are no new datapoints for the current cluster, skip
if len(new_datapoints) == 0:
continue
# Append the datapoints to the DS while extracting their features
features_matrix = []
for datapoint in new_datapoints:
DISCARD_SET_stats[cluster_idx]['ids_in_cluster'].append(datapoint)
features_matrix.append(dataset_dict[datapoint]['features'])
features_matrix = np.array(features_matrix)
# Update N
DISCARD_SET_stats[cluster_idx]['N'] += len(new_datapoints)
# Calculate SUM and SUMSQ for the new_datapoints
sum_new_datapoints = features_matrix.sum(axis=0)
sumsq_new_datapoints = np.sum(features_matrix**2, axis=0)
# Update the DISCARD_SET_stats with the new SUM and SUMSQ
for dim in range(dataset_dimensionality):
DISCARD_SET_stats[cluster_idx]['sum'][dim] += sum_new_datapoints[dim]
DISCARD_SET_stats[cluster_idx]['sumsq'][dim] += sumsq_new_datapoints[dim]
# Recalculate the Standard Deviation and Centroids
DISCARD_SET_stats[cluster_idx]['stdev'] = np.sqrt(
(DISCARD_SET_stats[cluster_idx]['sumsq'] / DISCARD_SET_stats[cluster_idx]['N']) -
(np.square(DISCARD_SET_stats[cluster_idx]['sum']) / DISCARD_SET_stats[cluster_idx]['N']**2)
)
DISCARD_SET_stats[cluster_idx]['centroid'] = DISCARD_SET_stats[cluster_idx]['sum'] / DISCARD_SET_stats[cluster_idx]['N']
### Update CS ###
for cluster_idx, new_datapoints in CS_clusters_ids.items():
# If there are no new datapoints for the current cluster, skip
if len(new_datapoints) == 0:
continue
# Append the datapoints to the CS while extracting their features
features_matrix = []
for datapoint in new_datapoints:
COMPRESSION_SET_stats[cluster_idx]['ids_in_cluster'].append(datapoint)
features_matrix.append(dataset_dict[datapoint]['features'])
features_matrix = np.array(features_matrix)
# Update N
COMPRESSION_SET_stats[cluster_idx]['N'] += len(new_datapoints)
# Calculate SUM and SUMSQ for the new_datapoints
sum_new_datapoints = features_matrix.sum(axis=0)
sumsq_new_datapoints = np.sum(features_matrix**2, axis=0)
# Update the COMPRESSION_SET_stats with the new SUM and SUMSQ
for dim in range(dataset_dimensionality):
COMPRESSION_SET_stats[cluster_idx]['sum'][dim] += sum_new_datapoints[dim]
COMPRESSION_SET_stats[cluster_idx]['sumsq'][dim] += sumsq_new_datapoints[dim]
# Recalculate the Standard Deviation and Centroids
COMPRESSION_SET_stats[cluster_idx]['stdev'] = np.sqrt(
(COMPRESSION_SET_stats[cluster_idx]['sumsq'] / COMPRESSION_SET_stats[cluster_idx]['N']) -
(np.square(COMPRESSION_SET_stats[cluster_idx]['sum']) / COMPRESSION_SET_stats[cluster_idx]['N']**2)
)
COMPRESSION_SET_stats[cluster_idx]['centroid'] = COMPRESSION_SET_stats[cluster_idx]['sum'] / COMPRESSION_SET_stats[cluster_idx]['N']
######################################
### Create New CSs from RS Samples ###
######################################
# Run KMeans on the RETAINED_SET and generate COMPRESSION_SET clusters
RS_sample_features = []
for sample_id in RETAINED_SET:
RS_sample_features.append(dataset_dict[sample_id]['features'])
X_train = np.array(RS_sample_features)
RS_clusters = {}
if X_train.shape[0] > 0:
kmeans = cluster.KMeans(n_clusters=min(7*n_cluster, int(1 + (X_train.shape[0]/2))))
predicted_clusters = kmeans.fit_predict(X_train)
for cluster_id, sampled_id in zip(predicted_clusters, RETAINED_SET):
# If there is no key for the current cluster id, create one
            if RS_clusters.get(cluster_id) is None:
                RS_clusters[cluster_id] = list()
            # Then append the RS sample to its newly found CS cluster
RS_clusters[cluster_id].append(sampled_id)
    # Calculate the cluster statistics for the CS clusters generated from the RS in this round
for key, value in RS_clusters.items():
# Only clusters with more than one sample in them go to the CS, those with only 1 sample remain RS
if len(value) > 1:
            # Find the next cluster index to use for the COMPRESSION_SET_stats
try:
CS_stats_next_key = max(COMPRESSION_SET_stats.keys()) + 1
# If there are no keys from which to get the max value, then start at key 0
            except ValueError:
CS_stats_next_key = 0
COMPRESSION_SET_stats[CS_stats_next_key] = {'ids_in_cluster': []}
features_matrix = []
for datapoint in value:
COMPRESSION_SET_stats[CS_stats_next_key]['ids_in_cluster'].append(datapoint)
features_matrix.append(dataset_dict[datapoint]['features'])
features_matrix = np.array(features_matrix)
COMPRESSION_SET_stats[CS_stats_next_key]['N'] = len(COMPRESSION_SET_stats[CS_stats_next_key]['ids_in_cluster'])
COMPRESSION_SET_stats[CS_stats_next_key]['sum'] = features_matrix.sum(axis=0)
COMPRESSION_SET_stats[CS_stats_next_key]['sumsq'] = np.sum(features_matrix**2, axis=0)
COMPRESSION_SET_stats[CS_stats_next_key]['stdev'] = np.sqrt(
(COMPRESSION_SET_stats[CS_stats_next_key]['sumsq'] / COMPRESSION_SET_stats[CS_stats_next_key]['N']) -
(np.square(COMPRESSION_SET_stats[CS_stats_next_key]['sum']) / COMPRESSION_SET_stats[CS_stats_next_key]['N']**2)
)
COMPRESSION_SET_stats[CS_stats_next_key]['centroid'] = COMPRESSION_SET_stats[CS_stats_next_key]['sum'] / COMPRESSION_SET_stats[CS_stats_next_key]['N']
# Clean the samples from the retained set which have been summarized
for datapoint in value:
RETAINED_SET.remove(datapoint)
#################################################
### Merge CSs Below the Mahalanobis Threshold ###
#################################################
close_CSs = dict()
for key1, value1 in COMPRESSION_SET_stats.items():
# Default to assuming there is no other CS cluster close by
assigned_cluster = None
# Track the lowest distance between the sample and all clusters
lowest_dist = mahalanobis_threshold
# Compare each CS cluster to all other CS clusters
for key2, value2 in COMPRESSION_SET_stats.items():
            # Do not compare a cluster to itself
if key1 == key2:
continue
intercluster_dist = intercluster_mahalanobis_distance(value1, value2)
# If the intercluster distance is below the threshold, make the pair a candidate for merging
if intercluster_dist < lowest_dist:
assigned_cluster = key2
lowest_dist = intercluster_dist
# Once all pairwise comparisons were done for a given cluster (key1) store the results
close_CSs[key1] = assigned_cluster
# Once all closest CS clusters were found, merge them and update the COMPRESSION_SET_stats
for CS_cluster1, CS_cluster2 in close_CSs.items():
if CS_cluster1 in COMPRESSION_SET_stats and CS_cluster2 in COMPRESSION_SET_stats and CS_cluster1 != CS_cluster2:
merge_CS_CS(CS_cluster1, CS_cluster2)
######################################################
### At the Final Iteration Merge CSs to Nearby DSs ###
######################################################
# Check if it is the last iteration
if curr_round == 5:
close_DSs = dict()
for CS_key, CS_value in COMPRESSION_SET_stats.items():
# Default to assuming there is no DS cluster close by
assigned_cluster = None
# Track the lowest distance between the sample and all clusters
lowest_dist = mahalanobis_threshold
for DS_key, DS_value in DISCARD_SET_stats.items():
intercluster_dist = intercluster_mahalanobis_distance(CS_value, DS_value)
# If the intercluster distance is below the threshold, make the pair a candidate for merging
if intercluster_dist < lowest_dist:
assigned_cluster = DS_key
lowest_dist = intercluster_dist
# Once all pairwise comparisons were done for a given cluster (CS_key) store the results
close_DSs[CS_key] = assigned_cluster
        # Once the closest DS cluster was found for each CS cluster, merge them and update the DISCARD_SET_stats
for CS_cluster, DS_cluster in close_DSs.items():
if CS_cluster in COMPRESSION_SET_stats and DS_cluster in DISCARD_SET_stats:
merge_CS_DS(CS_cluster, DS_cluster)
###############################
### Output Round Statistics ###
###############################
# Objects to tally values to be outputted
num_DS_points = 0
num_CS_clusters = 0
num_CS_points = 0
num_RS_points = 0
# Tally the values for outputting
for key, value in DISCARD_SET_stats.items():
num_DS_points += value['N']
for key, value in COMPRESSION_SET_stats.items():
num_CS_clusters += 1
num_CS_points += value['N']
num_RS_points = len(RETAINED_SET)
# Add the current round's results to the output file
f_out.write("\n" + "Round " + str(curr_round) + ": " + str(num_DS_points) + "," + str(num_CS_clusters) + "," + str(num_CS_points) + "," + str(num_RS_points))
print(f"Round {curr_round} | DS Points: {str(num_DS_points)} | CS Clusters: {str(num_CS_clusters)} | CS Points: {str(num_CS_points)} | RS Points: {str(num_RS_points)}")
Round 2 | DS Points: 128886 | CS Clusters: 10 | CS Points: 34 | RS Points: 6
Round 3 | DS Points: 193326 | CS Clusters: 12 | CS Points: 59 | RS Points: 4
Round 4 | DS Points: 257770 | CS Clusters: 10 | CS Points: 80 | RS Points: 2
Round 5 | DS Points: 322216 | CS Clusters: 7 | CS Points: 94 | RS Points: 2
# Go over all sets (DS, CS, RS) and extract each datapoint's final cluster
sample_id_to_cluster = dict()
for key, value in DISCARD_SET_stats.items():
for datapoint in value['ids_in_cluster']:
sample_id_to_cluster[datapoint] = key
for key, value in COMPRESSION_SET_stats.items():
for datapoint in value['ids_in_cluster']:
sample_id_to_cluster[datapoint] = -1
for datapoint in RETAINED_SET:
sample_id_to_cluster[datapoint] = -1
# Write the clustering results
f_out.write("\n" + "\n" + "The clustering results:")
for key, value in sorted(sample_id_to_cluster.items()):
f_out.write("\n" + str(key) + "," + str(value))
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Create a dictionary with the true clusters and their samples
truth_clusters = {-1:[], 0:[], 1:[], 2:[], 3:[], 4:[], 5:[], 6:[], 7:[], 8:[], 9:[]}
for key, value in dataset_dict.items():
truth_clusters[value['true_cluster']].append(key)
# Find the matching cluster IDs between the truth and the prediction
# (because the points might be correctly clustered but the cluster number is most likely not the same)
cluster_mapping = {}
for true_cluster, true_members in truth_clusters.items():
# The outliers won't have a matching cluster
if true_cluster == -1:
continue
    # Find the matching predicted cluster by the size of the intersection with the true cluster's members
largest_intersection = 0
for pred_cluster, value in DISCARD_SET_stats.items():
pred_members = value['ids_in_cluster']
# Check the number of points in both clusters (true and pred)
intersection_size = len(set(true_members) & set(pred_members))
# If this is the largest match size thus far (for the true cluster), update the index matching
if intersection_size > largest_intersection:
cluster_mapping[true_cluster] = pred_cluster
largest_intersection = intersection_size
cluster_mapping
{0: 2, 1: 3, 2: 5, 3: 6, 4: 8, 5: 9, 6: 1, 7: 7, 8: 0, 9: 4}
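The greedy size-of-intersection matching above works because the clusters here are nearly disjoint; if a more robust matching were needed, an optimal one-to-one assignment could be computed instead. Below is a sketch using the Hungarian algorithm (it assumes SciPy is installed; hungarian_mapping is a hypothetical name and is not used in the rest of this notebook).
# Optional alternative (assumes scipy is available): optimal one-to-one matching
# between true and predicted clusters, maximizing the total intersection size.
from scipy.optimize import linear_sum_assignment

true_ids = [c for c in truth_clusters if c != -1]
pred_ids = list(DISCARD_SET_stats.keys())
overlap = np.array([[len(set(truth_clusters[t]) & set(DISCARD_SET_stats[p]['ids_in_cluster']))
                     for p in pred_ids]
                    for t in true_ids])
row_ind, col_ind = linear_sum_assignment(overlap, maximize=True)
hungarian_mapping = {true_ids[r]: pred_ids[c] for r, c in zip(row_ind, col_ind)}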
# DataFrame to store metrics for each cluster pair
metrics_df = pd.DataFrame(columns = ['true_cluster', 'pred_cluster', 'true_size', 'pred_size',
'TP', 'FP', 'TN', 'FN',
'Accuracy', 'Precision', 'Recall', 'F1 Score'])
# Calculate TP, FP, TN, FN for each match
total_samples = len(dataset_dict)
for true_cluster, pred_cluster in cluster_mapping.items():
true_members = truth_clusters[true_cluster]
pred_members = DISCARD_SET_stats[pred_cluster]['ids_in_cluster']
TP = len(set(true_members) & set(pred_members))
FP = len(set(pred_members) - set(true_members))
FN = len(set(true_members) - set(pred_members))
all_negative_preds = total_samples - len(pred_members)
TN = all_negative_preds - FN
accuracy = (TP + TN) / (TP + FP + FN + TN)
precision = TP / (TP + FP)
recall = TP / (TP + FN)
F1 = (2 * precision * recall) / (precision + recall)
scores = {'true_cluster':true_cluster,
'pred_cluster':pred_cluster,
'true_size': len(true_members),
'pred_size': len(pred_members),
'TP':TP,
'FP':FP,
'TN':TN,
'FN':FN,
'Accuracy':accuracy,
'Precision':precision,
'Recall':recall,
'F1 Score':F1}
    metrics_df = pd.concat([metrics_df, pd.DataFrame([scores])], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
# Collect the items left in the COMPRESSION_SET and RETAINED_SET as outliers
outliers = list(RETAINED_SET)
for value in COMPRESSION_SET_stats.values():
    outliers.extend(value['ids_in_cluster'])
# Then add to the metrics the scores related to outliers
true_members = truth_clusters[-1]
pred_members = outliers
TP = len(set(true_members) & set(pred_members))
FP = len(set(pred_members) - set(true_members))
FN = len(set(true_members) - set(pred_members))
all_negative_preds = total_samples - len(pred_members)
TN = all_negative_preds - FN
# Need to handle the possibility of there being no outliers, in which case I'll consider a perfect score
try:
    accuracy = (TP + TN) / (TP + FP + FN + TN)
except ZeroDivisionError:
    accuracy = 1.0
try:
    precision = TP / (TP + FP)
except ZeroDivisionError:
    precision = 1.0
try:
    recall = TP / (TP + FN)
except ZeroDivisionError:
    recall = 1.0
F1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
scores = {'true_cluster':-1,
'pred_cluster':-1,
'true_size': len(true_members),
'pred_size': len(pred_members),
'TP':TP,
'FP':FP,
'TN':TN,
'FN':FN,
'Accuracy':accuracy,
'Precision':precision,
'Recall':recall,
'F1 Score':F1}
metrics_df = pd.concat([metrics_df, pd.DataFrame([scores])], ignore_index=True)  # DataFrame.append was removed in pandas 2.0
metrics_df
 | true_cluster | pred_cluster | true_size | pred_size | TP | FP | TN | FN | Accuracy | Precision | Recall | F1 Score |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 2 | 37859 | 37859 | 37859 | 0 | 284453 | 0 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
1 | 1 | 3 | 23786 | 23786 | 23786 | 0 | 298526 | 0 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
2 | 2 | 5 | 37871 | 37871 | 37871 | 0 | 284441 | 0 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
3 | 3 | 6 | 30267 | 30270 | 30267 | 3 | 292042 | 0 | 0.999991 | 0.999901 | 1.000000 | 0.999950 |
4 | 4 | 8 | 39655 | 39654 | 39654 | 0 | 282657 | 1 | 0.999997 | 1.000000 | 0.999975 | 0.999987 |
5 | 5 | 9 | 31210 | 31209 | 31209 | 0 | 291102 | 1 | 0.999997 | 1.000000 | 0.999968 | 0.999984 |
6 | 6 | 1 | 32289 | 32289 | 32289 | 0 | 290023 | 0 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
7 | 7 | 7 | 35167 | 35169 | 35167 | 2 | 287143 | 0 | 0.999994 | 0.999943 | 1.000000 | 0.999972 |
8 | 8 | 0 | 22370 | 22370 | 22370 | 0 | 299942 | 0 | 1.000000 | 1.000000 | 1.000000 | 1.000000 |
9 | 9 | 4 | 31738 | 31739 | 31738 | 1 | 290573 | 0 | 0.999997 | 0.999968 | 1.000000 | 0.999984 |
10 | -1 | -1 | 100 | 96 | 96 | 0 | 322212 | 4 | 0.999988 | 1.000000 | 0.960000 | 0.979592 |
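Besides the per-cluster confusion counts above, the overall agreement between the predicted and true labels could also be summarized with standard clustering indices. The sketch below uses scikit-learn; the resulting scores are not reproduced here since they were not computed in this run.
# Optional overall agreement scores (scikit-learn); outliers keep label -1 on
# both the true and the predicted side.
from sklearn import metrics as skl_metrics

sorted_ids = sorted(sample_id_to_cluster)
labels_true = [dataset_dict[i]['true_cluster'] for i in sorted_ids]
labels_pred = [sample_id_to_cluster[i] for i in sorted_ids]
print('Adjusted Rand Index:', skl_metrics.adjusted_rand_score(labels_true, labels_pred))
print('Normalized Mutual Information:', skl_metrics.normalized_mutual_info_score(labels_true, labels_pred))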
# Get model performance (macro-averaged = each class/cluster has the same weight, regardless of samples)
plot_metrics = {}
plot_metrics['Precision'] = metrics_df['Precision'].mean()
plot_metrics['Recall'] = metrics_df['Recall'].mean()
plot_metrics['F1 Score'] = metrics_df['F1 Score'].mean()
plot_metrics['Accuracy'] = metrics_df['Accuracy'].mean()
# plots
plt.style.use('fivethirtyeight')
sns.set(font_scale=1.5)
plt.figure(figsize=(15, 5))
sns.barplot(y=list(plot_metrics.keys()), x=list(plot_metrics.values()))
plt.suptitle('Macro-Averaged Model Metrics')
plt.gca().set_xticklabels(['{:.0f}%'.format(x*100) for x in plt.gca().get_xticks()])
plt.show()
print(f'Duration: {time.time() - start_time:.0f} seconds.')
Duration: 186 seconds.
Matheus Schmitz
LinkedIn
Github Portfolio