DSCI 552 - Machine Learning for Data Science
Homework 5
Matheus Schmitz
USC ID: 5039286453
# Data Manipulation
import numpy as np
import pandas as pd
# Scikit-Learn
from sklearn.model_selection import train_test_split, KFold
from sklearn.svm import SVC, LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, hamming_loss, make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
# SMOTE
from imblearn.over_sampling import SMOTE
# Warnings
import warnings
warnings.filterwarnings('ignore')
# Control downsampling of data to avoid long processing time during development
# 1 means 100% of data (no downsampling), 0.5 means 50% of data, and so on
DEV_DOWNSAMPLING = 1
# Read csv
df = pd.read_csv('../data/Frogs_MFCCs.csv')
df = df.sample(frac=DEV_DOWNSAMPLING)
print(f'df.shape: {df.shape}')
df.head(3)
# Train-test split
df_train, df_test = train_test_split(df, test_size=0.3)
df_train.reset_index(inplace=True, drop=True)
df_test.reset_index(inplace=True, drop=True)
print(f'df_train.shape: {df_train.shape}')
print(f'df_test.shape: {df_test.shape}')
# Extract labels to be used
labels = [i for i in df.columns[-4:-1]]
labels
# Dataframes for Exact Match Loss
pred_train_labels = pd.DataFrame()
pred_test_labels = pd.DataFrame()
true_train_multilabel = df_train[labels].stack().groupby(level=0).apply(''.join).to_frame('true_train')
true_test_multilabel = df_test[labels].stack().groupby(level=0).apply(''.join).to_frame('true_test')
# Dataframes to store all results for comparison
summary = pd.DataFrame()
summary_multilabel = pd.DataFrame()
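The stack/groupby/join idiom above concatenates each row's labels into one string, so two rows match only when every label matches; this is what makes the Exact Match comparison possible. A minimal sketch of the idiom on made-up toy labels:
# Minimal sketch (toy, made-up labels) of the stack/groupby/join idiom used above
toy = pd.DataFrame({'Family': ['A', 'B'], 'Genus': ['x', 'y'], 'Species': ['1', '2']})
print(toy.stack().groupby(level=0).apply(''.join).to_frame('combined'))
# row 0 -> 'Ax1', row 1 -> 'By2'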
Exact Match Loss only counts a sample as correct when all of its labels are correctly classified. It's a strict metric.
Hamming Loss is the fraction of individual labels that are incorrectly predicted. It's a more lenient metric.
How to create a scorer with sklearn's make_scorer: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html
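To make the difference concrete, a small hand-made example (values are made up) where 1 of 3 samples has a wrong label set but only 2 of 9 individual labels are wrong. Note also that with greater_is_better=False, make_scorer negates the loss, so GridSearchCV's maximization still selects the lowest Hamming loss.
# Toy illustration (made-up values): exact match loss vs hamming loss
y_true_toy = np.array([[0, 1, 1], [1, 0, 0], [1, 1, 0]])
y_pred_toy = np.array([[0, 1, 1], [1, 0, 0], [0, 1, 1]])
print(1 - accuracy_score(y_true_toy, y_pred_toy))  # exact match loss = 1/3
print(hamming_loss(y_true_toy, y_pred_toy))        # hamming loss = 2/9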
# Create a loss function using the hamming_loss metric
hamm_loss = make_scorer(hamming_loss, greater_is_better=False)
# Pipeline to standardize then run SVC
svc = Pipeline([("standardize", StandardScaler()),
                ("svc", SVC(kernel="rbf", decision_function_shape='ovr'))])
# Grid with parameters to be tested via CV
param_grid = {'svc__C': np.logspace(-3, 3, 7),
              'svc__gamma': np.logspace(-3, 3, 7)}
# Instantiate GridSearchCV using hamming_loss as the scorer
gridCV = GridSearchCV(svc, param_grid, cv=10, n_jobs=-1, scoring=hamm_loss)
# Train one model for each label
for label in labels:
    # Get X's and Y's
    x_train = df_train.iloc[:, :-4].copy()
    y_train = df_train[label].copy()
    x_test = df_test.iloc[:, :-4].copy()
    y_test = df_test[label].copy()
    # Fit using grid search to find the best params
    gridCV.fit(x_train, y_train)
    # Predict
    pred_train = gridCV.predict(x_train)
    pred_test = gridCV.predict(x_test)
    pred_train_labels[label] = pred_train
    pred_test_labels[label] = pred_test
    # Store data for later comparison
    summary.at['C', f'SVM_{label}'] = gridCV.best_params_['svc__C']
    summary.at['gamma', f'SVM_{label}'] = gridCV.best_params_['svc__gamma']
    summary.at['strict_train', f'SVM_{label}'] = 1 - accuracy_score(y_true=y_train, y_pred=pred_train)
    summary.at['strict_test', f'SVM_{label}'] = 1 - accuracy_score(y_true=y_test, y_pred=pred_test)
    summary.at['lenient_train', f'SVM_{label}'] = hamming_loss(y_true=y_train, y_pred=pred_train)
    summary.at['lenient_test', f'SVM_{label}'] = hamming_loss(y_true=y_test, y_pred=pred_test)
    # Print model results for current label
    print(f'------------------------------ {label} ------------------------------')
    print('Best C Parameter: ', summary.at['C', f'SVM_{label}'])
    print('Best Gamma Parameter: ', summary.at['gamma', f'SVM_{label}'])
    print()
    print('Exact Match Loss | Training: ', summary.at['strict_train', f'SVM_{label}'])
    print('Exact Match Loss | Testing: ', summary.at['strict_test', f'SVM_{label}'])
    print()
    print('Hamming Loss | Training: ', summary.at['lenient_train', f'SVM_{label}'])
    print('Hamming Loss | Testing: ', summary.at['lenient_test', f'SVM_{label}'])
    print()
    print()
# Model Overall metrics
# Join all predicted label strings to calculate exact match loss
pred_train_multilabel = pred_train_labels.stack().groupby(level=0).apply(''.join).to_frame('pred_train')
pred_test_multilabel = pred_test_labels.stack().groupby(level=0).apply(''.join).to_frame('pred_test')
# Multilabel Multiclass Exact Match
summary_multilabel.at['strict_train', 'SVM'] = 1 - accuracy_score(y_true=true_train_multilabel, y_pred=pred_train_multilabel)
summary_multilabel.at['strict_test', 'SVM'] = 1 - accuracy_score(y_true=true_test_multilabel, y_pred=pred_test_multilabel)
# The overall hamming loss is simply the average across all labels
summary_multilabel.at['lenient_train', 'SVM'] = summary.iloc[-2, -3:].mean()
summary_multilabel.at['lenient_test', 'SVM'] = summary.iloc[-1, -3:].mean()
# Print model results for entire model
print(f'------------------------------ MODEL OVERALL ------------------------------')
print('Exact Match Loss | Training: ', summary_multilabel.at['strict_train', f'SVM'])
print('Exact Match Loss | Testing: ', summary_multilabel.at['strict_test', f'SVM'])
print()
print('Hamming Loss | Training: ', summary_multilabel.at['lenient_train', f'SVM'])
print('Hamming Loss | Testing: ', summary_multilabel.at['lenient_test', f'SVM'])
print()
print()
From Scikit-Learn's documentation on the dual parameter of LinearSVC: https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html
dual: bool, default=True
Select the algorithm to either solve the dual or primal optimization problem. Prefer dual=False when n_samples > n_features.
Since this dataset has far more samples than features (thousands of rows vs. 22 MFCCs), dual=False is used below.
# Pipeline to standardize then run SVC
svc = Pipeline([("standardize", StandardScaler()),
                ("svc", LinearSVC(penalty="l1", multi_class='ovr', dual=False))])
# Grid with parameters to be tested via CV
param_grid = {'svc__C': np.logspace(-3, 3, 7)}
# Instantiate GridSearchCV using hamming_loss as the scorer
gridCV = GridSearchCV(svc, param_grid, cv=10, n_jobs=-1, scoring=hamm_loss)
# Train one model for each label
for label in labels:
    # Get X's and Y's
    x_train = df_train.iloc[:, :-4].copy()
    y_train = df_train[label].copy()
    x_test = df_test.iloc[:, :-4].copy()
    y_test = df_test[label].copy()
    # Fit using grid search to find the best params
    gridCV.fit(x_train, y_train)
    # Predict
    pred_train = gridCV.predict(x_train)
    pred_test = gridCV.predict(x_test)
    pred_train_labels[label] = pred_train
    pred_test_labels[label] = pred_test
    # Store data for later comparison
    summary.at['C', f'L1_{label}'] = gridCV.best_params_['svc__C']
    summary.at['strict_train', f'L1_{label}'] = 1 - accuracy_score(y_true=y_train, y_pred=pred_train)
    summary.at['strict_test', f'L1_{label}'] = 1 - accuracy_score(y_true=y_test, y_pred=pred_test)
    summary.at['lenient_train', f'L1_{label}'] = hamming_loss(y_true=y_train, y_pred=pred_train)
    summary.at['lenient_test', f'L1_{label}'] = hamming_loss(y_true=y_test, y_pred=pred_test)
    # Print model results for current label
    print(f'------------------------------ {label} ------------------------------')
    print('Best C Parameter: ', summary.at['C', f'L1_{label}'])
    print()
    print('Exact Match Loss | Training: ', summary.at['strict_train', f'L1_{label}'])
    print('Exact Match Loss | Testing: ', summary.at['strict_test', f'L1_{label}'])
    print()
    print('Hamming Loss | Training: ', summary.at['lenient_train', f'L1_{label}'])
    print('Hamming Loss | Testing: ', summary.at['lenient_test', f'L1_{label}'])
    print()
    print()
# Model Overall metrics
# Join all predicted label strings to calculate exact match loss
pred_train_multilabel = pred_train_labels.stack().groupby(level=0).apply(''.join).to_frame('pred_train')
pred_test_multilabel = pred_test_labels.stack().groupby(level=0).apply(''.join).to_frame('pred_test')
# Multilabel Multiclass Exact Match
summary_multilabel.at['strict_train', 'L1'] = 1 - accuracy_score(y_true=true_train_multilabel, y_pred=pred_train_multilabel)
summary_multilabel.at['strict_test', 'L1'] = 1 - accuracy_score(y_true=true_test_multilabel, y_pred=pred_test_multilabel)
# The overall hamming loss is simply the average across all labels
summary_multilabel.at['lenient_train', 'L1'] = summary.iloc[-2, -3:].mean()
summary_multilabel.at['lenient_test', 'L1'] = summary.iloc[-1, -3:].mean()
# Print model results for entire model
print(f'------------------------------ MODEL OVERALL ------------------------------')
print('Exact Match Loss | Training: ', summary_multilabel.at['strict_train', f'L1'])
print('Exact Match Loss | Testing: ', summary_multilabel.at['strict_test', f'L1'])
print()
print('Hamming Loss | Training: ', summary_multilabel.at['lenient_train', f'L1'])
print('Hamming Loss | Testing: ', summary_multilabel.at['lenient_test', f'L1'])
print()
print()
Route #1: Applying SMOTE to the Whole Dataset
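Before applying SMOTE per label, a minimal sketch on synthetic data (make_classification is used only for illustration) of what it does: it synthesizes new minority-class samples by interpolating between minority-class neighbors until the classes are balanced.
# Minimal sketch on synthetic data: SMOTE equalizes the class counts
from collections import Counter
from sklearn.datasets import make_classification
X_toy, y_toy = make_classification(n_samples=200, weights=[0.9, 0.1], random_state=0)
print('before:', Counter(y_toy))  # heavily imbalanced, roughly 180 vs 20
X_res, y_res = SMOTE(random_state=0).fit_resample(X_toy, y_toy)
print('after: ', Counter(y_res))  # both classes now at the majority count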
# SMOTE: Data Preparation
smote = SMOTE(n_jobs=-1)
# Dictionaries to store the datasets for each SMOTE round (SMOTE needs to be done once per label)
master_dict_train = {}
master_dict_test = {}
# Create one SMOTE'd dataset per label
for label in labels:
    # Split data (required for SMOTE)
    x_train = df_train.iloc[:, :-4].copy()
    y_train = df_train[label].copy()
    x_test = df_test.iloc[:, :-4].copy()
    y_test = df_test[label].copy()
    # Apply SMOTE to the training set only; oversampling the test set would
    # distort the evaluation, so it is kept as-is
    tuple_train_smote = smote.fit_resample(x_train, y_train)
    tuple_test_smote = (x_test, y_test)
    # Get original column names
    col_names = [i for i in df.columns[:-4]]
    col_names.append(label)
    # Reconstruct the dataframes
    df_train_smote = pd.concat([tuple_train_smote[0], tuple_train_smote[1]], axis=1)
    df_train_smote.columns = col_names
    df_test_smote = pd.concat([tuple_test_smote[0], tuple_test_smote[1]], axis=1)
    df_test_smote.columns = col_names
    # Save dataframes to the dict
    master_dict_train[label] = df_train_smote
    master_dict_test[label] = df_test_smote
# Pipeline to standardize then run SVC
svc = Pipeline([("standardize", StandardScaler()),
                ("svc", LinearSVC(penalty="l1", multi_class='ovr', dual=False))])
# Grid with parameters to be tested via CV
param_grid = {'svc__C': np.logspace(-3, 3, 7)}
# Instantiate GridSearchCV using hamming_loss as the scorer
gridCV = GridSearchCV(svc, param_grid, cv=10, n_jobs=-1, scoring=hamm_loss)
# Train one model for each label
for label in labels:
    # Get X's and Y's
    x_train = master_dict_train[label].iloc[:, :-1].copy()
    y_train = master_dict_train[label][label].copy()
    x_test = master_dict_test[label].iloc[:, :-1].copy()
    y_test = master_dict_test[label][label].copy()
    # Fit using grid search to find the best params
    gridCV.fit(x_train, y_train)
    # Predict
    pred_train = gridCV.predict(x_train)
    pred_test = gridCV.predict(x_test)
    #pred_train_labels[label] = pred_train
    pred_test_labels[label] = pred_test
    # Store data for later comparison
    summary.at['C', f'SMOTE_{label}'] = gridCV.best_params_['svc__C']
    summary.at['strict_train', f'SMOTE_{label}'] = 1 - accuracy_score(y_true=y_train, y_pred=pred_train)
    summary.at['strict_test', f'SMOTE_{label}'] = 1 - accuracy_score(y_true=y_test, y_pred=pred_test)
    summary.at['lenient_train', f'SMOTE_{label}'] = hamming_loss(y_true=y_train, y_pred=pred_train)
    summary.at['lenient_test', f'SMOTE_{label}'] = hamming_loss(y_true=y_test, y_pred=pred_test)
    # Print model results for current label
    print(f'------------------------------ {label} ------------------------------')
    print('Best C Parameter: ', summary.at['C', f'SMOTE_{label}'])
    print()
    print('Exact Match Loss | Training: ', summary.at['strict_train', f'SMOTE_{label}'])
    print('Exact Match Loss | Testing: ', summary.at['strict_test', f'SMOTE_{label}'])
    print()
    print('Hamming Loss | Training: ', summary.at['lenient_train', f'SMOTE_{label}'])
    print('Hamming Loss | Testing: ', summary.at['lenient_test', f'SMOTE_{label}'])
    print()
    print()
# Model Overall metrics
# Join all predicted label strings to calculate exact match loss
#pred_train_multilabel = pred_train_labels.stack().groupby(level=0).apply(''.join).to_frame('pred_train')
pred_test_multilabel = pred_test_labels.stack().groupby(level=0).apply(''.join).to_frame('pred_test')
# Multilabel Multiclass Exact Match
#summary_multilabel.at['strict_train', 'SMOTE'] = 1 - accuracy_score(y_true=true_train_multilabel, y_pred=pred_train_multilabel)
summary_multilabel.at['strict_test', 'SMOTE'] = 1 - accuracy_score(y_true=true_test_multilabel, y_pred=pred_test_multilabel)
# The overall hamming loss is simply the average across all labels
summary_multilabel.at['lenient_train', 'SMOTE'] = summary.iloc[-2, -3:].mean()
summary_multilabel.at['lenient_test', 'SMOTE'] = summary.iloc[-1, -3:].mean()
# Print model results for entire model
print(f'------------------------------ MODEL OVERALL ------------------------------')
#print('Exact Match Loss | Training: ', summary_multilabel.at['strict_train', f'SMOTE'])
print('As each label on training data has its unique SMOTEd dataset, training Exact Match Loss cannot be calculated')
print('Exact Match Loss | Testing: ', summary_multilabel.at['strict_test', f'SMOTE'])
print()
print('Hamming Loss | Training: ', summary_multilabel.at['lenient_train', f'SMOTE'])
print('Hamming Loss | Testing: ', summary_multilabel.at['lenient_test', f'SMOTE'])
print()
print()
Route #2: Applying SMOTE to K-1 Folds
This approach is much slower than the previous one, which applies SMOTE once to the whole dataset and then runs GridSearchCV.
To keep the processing time reasonable, the param_grid was shrunk to the C range found in the summary above, plus and minus one order of magnitude.
Also, since cross-validation is already being performed via the K-fold splitting, the cv parameter inside GridSearchCV was reduced from 10 to 5, which halves the processing time: with 8 CPU cores, cv=10 requires two "cycles" (two folds must wait for a core to become available), while cv=5 completes in one.
Still, the cell below takes about 1 hour to run.
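For reference, a sketch (not used here) of how imblearn's own Pipeline could condense this: it re-applies SMOTE inside each GridSearchCV fold, on the training folds only, so the validation fold never sees synthetic samples. The manual loop below is kept instead so the per-fold metrics stay visible.
# Alternative sketch (not used here): imblearn's Pipeline applies SMOTE only to
# the training portion of each CV fold, avoiding leakage of synthetic samples
from imblearn.pipeline import Pipeline as ImbPipeline
svc_smote = ImbPipeline([("smote", SMOTE()),
                         ("standardize", StandardScaler()),
                         ("svc", LinearSVC(penalty="l1", multi_class='ovr', dual=False))])
gridCV_alt = GridSearchCV(svc_smote, {'svc__C': np.logspace(0, 3, 4)},
                          cv=10, n_jobs=-1, scoring=hamm_loss)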
%%time
# This cell takes close to 1 hour
# Instantiate SMOTE
smote = SMOTE(n_jobs=-1)
# Pipeline to standardize then run SVC
svc = Pipeline([("standardize", StandardScaler()),
                ("svc", LinearSVC(penalty="l1", multi_class='ovr', dual=False))])
# Grid with parameters to be tested via CV
param_grid = {'svc__C': np.logspace(0, 3, 4)}
# Instantiate GridSearchCV using hamming_loss as the scorer
gridCV = GridSearchCV(svc, param_grid, cv=5, n_jobs=-1, scoring=hamm_loss)
# KFold
kf = KFold(n_splits=10, shuffle=True)
# For each label, split the data in 10 folds, using 9 for training and 1 for validation
for label in labels:
    print(f'------------------------------ {label} ------------------------------')
    kfold_intermediate_results = pd.DataFrame()
    for fold_num, (idx_train, idx_valid) in enumerate(kf.split(df_train), 1):
        # Print current label and fold
        print(f'Working on Fold: {fold_num}')
        # Select all folds to be SMOTEd except for the validation fold
        x_train, y_train = smote.fit_resample(df_train.iloc[idx_train, :-4], df_train[label].iloc[idx_train])
        x_valid = df_train.iloc[idx_valid, :-4]
        y_valid = df_train[label].iloc[idx_valid]
        # Fit using grid search to find the best params
        gridCV.fit(x_train, y_train)
        # Predict on the train and validation folds to calculate metrics
        pred_train = gridCV.predict(x_train)
        pred_valid = gridCV.predict(x_valid)
        # Store K-Fold intermediate results
        kfold_intermediate_results.at['C', f'{fold_num}'] = gridCV.best_params_['svc__C']
        kfold_intermediate_results.at['strict_train', f'{fold_num}'] = 1 - accuracy_score(y_true=y_train, y_pred=pred_train)
        kfold_intermediate_results.at['strict_valid', f'{fold_num}'] = 1 - accuracy_score(y_true=y_valid, y_pred=pred_valid)
        kfold_intermediate_results.at['lenient_train', f'{fold_num}'] = hamming_loss(y_true=y_train, y_pred=pred_train)
        kfold_intermediate_results.at['lenient_valid', f'{fold_num}'] = hamming_loss(y_true=y_valid, y_pred=pred_valid)
    # After running all K-Folds get average results for the label
    kfold_intermediate_results['mean'] = kfold_intermediate_results.mean(axis=1)
    print()
    print(f'--- K-Fold Cross-Validation Results ---')
    print(f'Mean C Parameter: {kfold_intermediate_results["mean"]["C"]}')
    print()
    print(f'Mean Exact Match Loss | Training : {kfold_intermediate_results["mean"]["strict_train"]}')
    print(f'Mean Exact Match Loss | Validation : {kfold_intermediate_results["mean"]["strict_valid"]}')
    print()
    print(f'Mean Hamming Loss | Training : {kfold_intermediate_results["mean"]["lenient_train"]}')
    print(f'Mean Hamming Loss | Validation : {kfold_intermediate_results["mean"]["lenient_valid"]}')
    print()
    # Create a classifier using the mean C value
    svc_kfold = LinearSVC(penalty="l1", multi_class='ovr', dual=False,
                          C=kfold_intermediate_results.at['C', 'mean'])
    # Get X's and Y's - this time using the full datasets for training and testing
    x_train, y_train = smote.fit_resample(df_train.iloc[:, :-4], df_train[label])
    x_test = df_test.iloc[:, :-4].copy()
    y_test = df_test[label].copy()
    # Fit using the SVM model created with the mean C from K-Fold cross-validation
    svc_kfold.fit(x_train, y_train)
    # Predict
    pred_train = svc_kfold.predict(x_train)
    pred_test = svc_kfold.predict(x_test)
    #pred_train_labels[label] = pred_train
    pred_test_labels[label] = pred_test
    # Store data for later comparison
    summary.at['C', f'SMOTE_KF_{label}'] = svc_kfold.C
    summary.at['strict_train', f'SMOTE_KF_{label}'] = 1 - accuracy_score(y_true=y_train, y_pred=pred_train)
    summary.at['strict_test', f'SMOTE_KF_{label}'] = 1 - accuracy_score(y_true=y_test, y_pred=pred_test)
    summary.at['lenient_train', f'SMOTE_KF_{label}'] = hamming_loss(y_true=y_train, y_pred=pred_train)
    summary.at['lenient_test', f'SMOTE_KF_{label}'] = hamming_loss(y_true=y_test, y_pred=pred_test)
    # Print model results for current label
    print(f'--- Full Dataset Results ---')
    print('Exact Match Loss | Training: ', summary.at['strict_train', f'SMOTE_KF_{label}'])
    print('Exact Match Loss | Testing: ', summary.at['strict_test', f'SMOTE_KF_{label}'])
    print()
    print('Hamming Loss | Training: ', summary.at['lenient_train', f'SMOTE_KF_{label}'])
    print('Hamming Loss | Testing: ', summary.at['lenient_test', f'SMOTE_KF_{label}'])
    print()
    print()
# Model Overall metrics
# Join all predicted label strings to calculate exact match loss
#pred_train_multilabel = pred_train_labels.stack().groupby(level=0).apply(''.join).to_frame('pred_train')
pred_test_multilabel = pred_test_labels.stack().groupby(level=0).apply(''.join).to_frame('pred_test')
# Multilabel Multiclass Exact Match
#summary_multilabel.at['strict_train', 'SMOTE_KF'] = 1 - accuracy_score(y_true=true_train_multilabel, y_pred=pred_train_multilabel)
summary_multilabel.at['strict_test', 'SMOTE_KF'] = 1 - accuracy_score(y_true=true_test_multilabel, y_pred=pred_test_multilabel)
# The overall hamming loss is simply the average across all labels
summary_multilabel.at['lenient_train', 'SMOTE_KF'] = summary.iloc[-2, -3:].mean()
summary_multilabel.at['lenient_test', 'SMOTE_KF'] = summary.iloc[-1, -3:].mean()
# Print model results for entire model
print(f'------------------------------ MODEL OVERALL ------------------------------')
#print('Exact Match Loss | Training: ', summary_multilabel.at['strict_train', f'SMOTE_KF'])
print('As each label on training data has its unique SMOTEd dataset, training Exact Match Loss cannot be calculated')
print('Exact Match Loss | Testing: ', summary_multilabel.at['strict_test', f'SMOTE_KF'])
print()
print('Hamming Loss | Training: ', summary_multilabel.at['lenient_train', f'SMOTE_KF'])
print('Hamming Loss | Testing: ', summary_multilabel.at['lenient_test', f'SMOTE_KF'])
print()
print()
Quite interestingly, for all labels a lower training and validation error is achieved when the model's parameters are found with this approach than with the approach that uses only GridSearchCV and no K-fold loop.
Yet this does not translate into lower test error, suggesting that either the model is starting to overfit or it has reached the limit of what the dataset supports.
row_names = {'strict_train': 'Exact Match Loss | Train',
             'strict_test': 'Exact Match Loss | Test',
             'lenient_train': 'Hamming Loss | Train',
             'lenient_test': 'Hamming Loss | Test'}
# Summary of single-label classifiers
summary.rename(row_names)
Exact Match Loss is only stricter than Hamming Loss when there is more than one label to predict at a time, so it was expected that the two metrics would match on the single-label problems.
# Summary of multi-label classifiers
# As each label on training data has its unique SMOTEd dataset, training Exact Match Loss cannot be calculated
summary_multilabel.rename(row_names)
The original SVM classifier was the best-performing model. This was expected: an L1-penalized model can at best match an un-penalized one and will usually have a somewhat higher error, so the later models that employed L1 were bound to underperform, trading accuracy for the feature selection the penalty provides.
The more interesting aspect is how SMOTE seems to have worsened (increased) the misclassification rate. One way to dig deeper into this would be to check the class-stratified misclassification rate, to see whether the error on the rare classes was reduced at the expense of a higher error on the majority class.
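A hedged sketch of that follow-up check, assuming the per-label y_test and pred_test variables from the last training loop are still in scope (classification_report's per-class recall gives each class's error as 1 - recall):
# Sketch of the suggested follow-up (assumes y_test / pred_test from the last
# fitted label are still in scope): per-class recall shows whether the rare
# classes improved at the expense of the majority class
from sklearn.metrics import classification_report
print(classification_report(y_test, pred_test))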
DSCI 552 - Machine Learning for Data Science
Homework 5
Matheus Schmitz
USC ID: 5039286453
# tqdm is a progress bar
# Quite useful to know things are running when the processing time is long
!pip install tqdm
# Data Manipulation
import numpy as np
import pandas as pd
# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# K-Means
from sklearn.cluster import KMeans, MiniBatchKMeans
# Metrics
from sklearn.metrics import silhouette_score, hamming_loss
# Label Encoding
from sklearn.preprocessing import LabelEncoder
# Progress Bar
from tqdm.notebook import tqdm
# Warnings
import warnings
warnings.filterwarnings('ignore')
# Read csv
df = pd.read_csv('../data/Frogs_MFCCs.csv')
print(f'df.shape: {df.shape}')
df.head(3)
# Split features and labels
df_features = df.iloc[:, :-4]
df_labels = df.iloc[:, -4:-1].copy()  # copy so prediction columns can be added safely later
# KMeans: Takes about 3 minutes to train on all k values
# MiniBatchKMeans: Takes about 1 minute to train on all k values
# Dictionary to store silhouette score for each k
silhouettes = {}
# Train, predict and score K-Means on each k
for k in tqdm(range(2, 51)):
    kmeans = KMeans(n_clusters=k)
    #kmeans = MiniBatchKMeans(n_clusters=k)
    clusters = kmeans.fit_predict(df_features)
    silhouettes[k] = silhouette_score(df_features, clusters)
# Get the best K value and the associated Silhouette Score
best_k = max(silhouettes, key=lambda key: silhouettes[key])
print(f'Best K: {best_k}')
best_silhouette = silhouettes[best_k]
print(f'Silhouette Score: {best_silhouette:.5f}')
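Since matplotlib is already imported, a quick sketch to visualize the silhouette curve across the tested k values:
# Visualize the silhouette score across all tested k values
plt.figure(figsize=(8, 4))
plt.plot(list(silhouettes.keys()), list(silhouettes.values()), marker='o')
plt.axvline(best_k, color='red', linestyle='--', label=f'best k = {best_k}')
plt.xlabel('k (number of clusters)')
plt.ylabel('Silhouette Score')
plt.legend()
plt.show()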
# Instantiate a K-Means clusterer using the best_k
kmeans = KMeans(n_clusters=best_k)
#kmeans = MiniBatchKMeans(n_clusters=best_k)
# Train the K-Means and predict the clusters
clusters = kmeans.fit_predict(df_features)
# Add the predicted clusters to the dataframe with labels
df_labels['Cluster'] = clusters
# Group the dataframe by cluster
df_clusters = df_labels.groupby('Cluster')
# For each of the labels, check the most frequent class (the mode)
cluster_family = df_clusters['Family'].agg(pd.Series.mode)
cluster_genus = df_clusters['Genus'].agg(pd.Series.mode)
cluster_species = df_clusters['Species'].agg(pd.Series.mode)
# Summarize all on a dataframe
majority_classes = pd.DataFrame(data=[cluster_family, cluster_genus, cluster_species]).T
majority_classes
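One caveat with pd.Series.mode: when a cluster has a tie for the most frequent class, mode returns all tied values as an array instead of a single label, which would break the .map() lookups below. A hedged guard, assuming ties should resolve to the first modal value:
# Hedged guard (assumption: ties resolve to the first modal value) so every
# cell of majority_classes holds a single label rather than an array of ties
majority_classes = majority_classes.applymap(
    lambda m: m[0] if isinstance(m, np.ndarray) else m)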
Hamming Distance - From Scipy: https://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.hamming.html
The Hamming distance between 1-D arrays u and v, is simply the proportion of disagreeing components in u and v.
From this I assume the hamming distance between N-D arrays is the sum of the distances between their "inner" 1-D arrays.
Hamming Loss - From Scikit-Learn: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.hamming_loss.html
The Hamming loss is the fraction of labels that are incorrectly predicted.
Hence Scikit-Learn averages the loss over all labels, so the Hamming Distance can be recovered by multiplying sklearn's hamming_loss by the number of labels in the data.
Hamming Score is the complement of the Hamming Loss (1 - loss).
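A small numeric check of these relationships on made-up encoded labels:
# Numeric check (made-up values): scipy's hamming is the proportion of
# disagreeing positions, which matches sklearn's hamming_loss on flat arrays
from scipy.spatial.distance import hamming as scipy_hamming
u = np.array([0, 1, 2, 2])
v = np.array([0, 1, 2, 1])
print(scipy_hamming(u, v))          # 0.25 -> 1 of 4 positions disagree
print(hamming_loss(u, v))           # 0.25 -> same value
print(hamming_loss(u, v) * len(u))  # 1.0  -> the raw disagreement count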
# Use the majority_classes dataframe to assign predicted classes to each sample
df_labels['pred_Family'] = df_labels['Cluster'].map(majority_classes['Family'])
df_labels['pred_Genus'] = df_labels['Cluster'].map(majority_classes['Genus'])
df_labels['pred_Species'] = df_labels['Cluster'].map(majority_classes['Species'])
# Convert labels from strings to numeric in order to calculate hamming metrics.
# LabelBinarizer and OneHotEncoder cannot be used, as they would double the error:
# [0, 1] and [1, 0] have a hamming distance of 2, while [0] and [3] have a hamming
# distance of 1, which is correct since the classes have no hierarchy
LE = LabelEncoder()
df_labels['true_Family_encoded'] = LE.fit_transform(df_labels['Family'])
df_labels['pred_Family_encoded'] = LE.transform(df_labels['pred_Family'])
df_labels['true_Genus_encoded'] = LE.fit_transform(df_labels['Genus'])
df_labels['pred_Genus_encoded'] = LE.transform(df_labels['pred_Genus'])
df_labels['true_Species_encoded'] = LE.fit_transform(df_labels['Species'])
df_labels['pred_Species_encoded'] = LE.transform(df_labels['pred_Species'])
# Extract the true and predicted labels as arrays so they can be compared
true_labels_encoded = [data[['true_Family_encoded', 'true_Genus_encoded', 'true_Species_encoded']].values for clster, data in df_labels.groupby('Cluster')]
pred_labels_encoded = [data[['pred_Family_encoded', 'pred_Genus_encoded', 'pred_Species_encoded']].values for clster, data in df_labels.groupby('Cluster')]
# Calculate metrics
cluster_hamming_loss = [hamming_loss(np.vstack(true_labels_encoded).flatten(), np.vstack(pred_labels_encoded).flatten())]
cluster_hamming_score = [1-loss for loss in cluster_hamming_loss]
cluster_hamming_dist = [loss*len(majority_classes.columns) for loss in cluster_hamming_loss]
# Print average metrics
print(f'Average Hamming Loss: {np.mean(cluster_hamming_loss):.5f}')
print(f'Average Hamming Score: {np.mean(cluster_hamming_score):.5f}')
print(f'Average Hamming Distance: {np.mean(cluster_hamming_dist):.5f}')
# Read csv
df = pd.read_csv('../data/Frogs_MFCCs.csv')
# List to store the hamming distance in each iteration
hammings = []
# Perform the previous procedure (a + b + c) 50 times:
for iteration in tqdm(range(1, 51), desc='Monte-Carlo Simulation', ncols='90%'):
    # Split features and labels
    df_features = df.iloc[:, :-4]
    df_labels = df.iloc[:, -4:-1].copy()  # copy so prediction columns can be added safely
    #-------------------------------------------------------#
    #                (A) K-MEANS CLUSTERING                  #
    #-------------------------------------------------------#
    # Dictionary to store silhouette score for each k
    silhouettes = {}
    # Train, predict and score K-Means on each k
    # Note: the highest k is reduced to 10 here, based on the previous finding of best_k = 4
    for k in range(2, 11):
        #for k in tqdm(range(2, 51), desc='K-Means K ∈ {2, 3, ..., 50}', ncols='66%'):
        kmeans = KMeans(n_clusters=k, random_state=iteration)
        #kmeans = MiniBatchKMeans(n_clusters=k, random_state=iteration)
        clusters = kmeans.fit_predict(df_features)
        silhouettes[k] = silhouette_score(df_features, clusters)
    # Get the best K value and the associated Silhouette Score
    best_k = max(silhouettes, key=lambda key: silhouettes[key])
    print(f'Iteration {iteration} | Best K: {best_k}')
    best_silhouette = silhouettes[best_k]
    print(f'Iteration {iteration} | Silhouette Score: {best_silhouette:.5f}')
    #-------------------------------------------------------#
    #            (B) MAJORITY LABELS PER CLUSTER             #
    #-------------------------------------------------------#
    # Instantiate a K-Means clusterer using the best_k
    kmeans = KMeans(n_clusters=best_k, random_state=iteration)
    #kmeans = MiniBatchKMeans(n_clusters=best_k, random_state=iteration)
    # Train the K-Means and predict the clusters
    clusters = kmeans.fit_predict(df_features)
    # Add the predicted clusters to the dataframe with labels
    df_labels['Cluster'] = clusters
    # Group the dataframe by cluster
    df_clusters = df_labels.groupby('Cluster')
    # For each of the labels, check the most frequent class (the mode)
    cluster_family = df_clusters['Family'].agg(pd.Series.mode)
    cluster_genus = df_clusters['Genus'].agg(pd.Series.mode)
    cluster_species = df_clusters['Species'].agg(pd.Series.mode)
    # Summarize all on a dataframe
    majority_classes = pd.DataFrame(data=[cluster_family, cluster_genus, cluster_species]).T
    #-------------------------------------------------------#
    #   (C) HAMMING DISTANCE, HAMMING SCORE, HAMMING LOSS    #
    #-------------------------------------------------------#
    # Use the majority_classes dataframe to assign predicted classes to each sample
    df_labels['pred_Family'] = df_labels['Cluster'].map(majority_classes['Family'])
    df_labels['pred_Genus'] = df_labels['Cluster'].map(majority_classes['Genus'])
    df_labels['pred_Species'] = df_labels['Cluster'].map(majority_classes['Species'])
    # Convert labels from strings to numeric in order to calculate hamming metrics.
    # LabelBinarizer and OneHotEncoder cannot be used, as they would double the error:
    # [0, 1] and [1, 0] have a hamming distance of 2, while [0] and [3] have a hamming
    # distance of 1, which is correct since the classes have no hierarchy
    LE = LabelEncoder()
    df_labels['true_Family_encoded'] = LE.fit_transform(df_labels['Family'])
    df_labels['pred_Family_encoded'] = LE.transform(df_labels['pred_Family'])
    df_labels['true_Genus_encoded'] = LE.fit_transform(df_labels['Genus'])
    df_labels['pred_Genus_encoded'] = LE.transform(df_labels['pred_Genus'])
    df_labels['true_Species_encoded'] = LE.fit_transform(df_labels['Species'])
    df_labels['pred_Species_encoded'] = LE.transform(df_labels['pred_Species'])
    # Extract the true and predicted labels as arrays so they can be compared
    true_labels_encoded = [data[['true_Family_encoded', 'true_Genus_encoded', 'true_Species_encoded']].values for clster, data in df_labels.groupby('Cluster')]
    pred_labels_encoded = [data[['pred_Family_encoded', 'pred_Genus_encoded', 'pred_Species_encoded']].values for clster, data in df_labels.groupby('Cluster')]
    # Calculate metrics
    cluster_hamming_loss = [hamming_loss(np.vstack(true_labels_encoded).flatten(), np.vstack(pred_labels_encoded).flatten())]
    cluster_hamming_score = [1 - loss for loss in cluster_hamming_loss]
    cluster_hamming_dist = [loss * len(majority_classes.columns) for loss in cluster_hamming_loss]
    # Print average metrics
    print(f'Iteration {iteration} | Average Hamming Loss: {np.mean(cluster_hamming_loss):.5f}')
    print(f'Iteration {iteration} | Average Hamming Score: {np.mean(cluster_hamming_score):.5f}')
    print(f'Iteration {iteration} | Average Hamming Distance: {np.mean(cluster_hamming_dist):.5f}')
    print()
    #-------------------------------------------------------#
    #                   ITERATION METRICS                    #
    #-------------------------------------------------------#
    mean_hamming_distance = np.mean(cluster_hamming_dist)
    hammings.append(mean_hamming_distance)
print('Monte-Carlo Simulation Results:')
print(f'Hamming Distance | average = {np.mean(hammings):.5f}')
print(f'Hamming Distance | standard deviation = {np.std(hammings):.5f}')