DSCI 552 | Machine Learning for Data Science
Homework 6
Matheus Schmitz
USC ID: 5039286453
# Data Science
import numpy as np
import pandas as pd
# Scikit-Learn
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, auc, f1_score, precision_score, recall_score, roc_curve
from sklearn.cluster import KMeans, SpectralClustering
from sklearn.neighbors import KNeighborsClassifier
# Progress Bar
from tqdm import tqdm
# Ignore Warnings
import warnings
warnings.filterwarnings("ignore")
# Get all column names
col_names = ['ID', 'Diagnosis']
collected_stats = ['Mean', 'SE', 'Worst']
collected_features = ['Radius', 'Texture', 'Perimeter', 'Area', 'Smoothness', 'Compactness', 'Concavity', 'ConcavePoints', 'Symmetry', 'FractalDimension']
for stat in collected_stats:
    for feat in collected_features:
        col_names.append(f'{stat}_{feat}')
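A quick sanity check that the 30 feature names plus ID and Diagnosis were assembled correctly:
print(len(col_names))  # expected 32: ID, Diagnosis, and 10 features x 3 statistics
print(col_names[2:5])  # ['Mean_Radius', 'Mean_Texture', 'Mean_Perimeter']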
df = pd.read_csv('../data/wdbc.data', header=None, names=col_names)
print(f'df.shape: {df.shape}')
df.head(3)
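A quick look at the class balance, which the class-stratified analysis at the end relies on (for the standard WDBC file this should be roughly 357 B and 212 M):
print(df['Diagnosis'].value_counts())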
# Remove 'ID' column
if 'ID' in df.columns:
    df.drop('ID', axis=1, inplace=True)
# Split Benign and Malign samples and shuffle them
df_B = df[df.Diagnosis == 'B'].sample(frac=1)
df_M = df[df.Diagnosis == 'M'].sample(frac=1)
# Create test and train dataframes
df_test = pd.concat([df_B[0:round(len(df_B)*0.2)],
df_M[0:round(len(df_M)*0.2)]])
df_train = pd.concat([df_B[round(len(df_B)*0.2):],
df_M[round(len(df_M)*0.2):]])
# Shapes
print(f'df_test.shape: {df_test.shape}')
print(f'df_train.shape: {df_train.shape}')
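For reference, an equivalent class-stratified 80/20 split could be obtained directly from scikit-learn; this is only a sketch (the *_alt names are illustrative), and the manual per-class slicing above is what the rest of the notebook uses:
df_train_alt, df_test_alt = train_test_split(df, test_size=0.2, stratify=df['Diagnosis'])
print(f'df_test_alt.shape: {df_test_alt.shape}')
print(f'df_train_alt.shape: {df_train_alt.shape}')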
# Get X's and Y's
x_train = df_train.iloc[:, 1:]
x_test = df_test.iloc[:, 1:]
y_train = df_train.iloc[:, 0]
y_test = df_test.iloc[:, 0]
# Normalize X's
SCALER = MinMaxScaler()
x_train = pd.DataFrame(SCALER.fit_transform(x_train), columns=col_names[2:])
x_test = pd.DataFrame(SCALER.transform(x_test), columns=col_names[2:])
# Label Encode Y's
LE = LabelEncoder()
y_train = pd.DataFrame(LE.fit_transform(y_train), columns=[col_names[1]])
y_test = pd.DataFrame(LE.transform(y_test), columns=[col_names[1]])
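Sanity check on the label encoding (LabelEncoder sorts classes alphabetically, so B maps to 0 and M to 1):
print(dict(zip(LE.classes_, LE.transform(LE.classes_))))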
# Dataframe to summarize all results
summary = pd.DataFrame()
# Auxiliary functions to facilitate plotting classification results
# Based on scikit-learn documentation: https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_curve, auc, confusion_matrix, precision_recall_curve
def plot_cm(true_binary, pred_binary, classes=['B', 'M']):
    # Get the confusion matrix and plot it as a heatmap
    conf_mat = confusion_matrix(true_binary, pred_binary)
    sns.heatmap(conf_mat, annot=True, cmap='Blues', xticklabels=classes, yticklabels=classes, square=True, cbar=False, fmt='d')
    plt.title('Confusion Matrix', pad=20, fontweight='bold')
    plt.ylabel('True Class', fontsize=12, labelpad=10)
    plt.xlabel('Predicted Class', fontsize=12, labelpad=10)
def plot_roc_overall(true_onehot, pred_probs, classes=['B', 'M']):
    # Coerce inputs to np.array
    true_onehot = np.asarray(true_onehot)
    pred_probs = np.asarray(pred_probs)
    classes = np.asarray(classes)
    N_CLASSES = len(classes)
    # Compute global (micro-average) ROC curve and ROC area
    fpr, tpr, thresholds = roc_curve(true_onehot.ravel(), pred_probs.ravel())
    roc_auc = auc(fpr, tpr)
    # Plot the model's overall ROC
    plt.axis('square')
    plt.plot(fpr, tpr, label=f'MODEL OVERALL (area = {roc_auc:.2f})', color='deeppink', lw=4)
    plt.plot([0, 1], [0, 1], 'k--', lw=2, alpha=0.3)
    plt.xlim([-0.01, 1.01])
    plt.ylim([-0.01, 1.01])
    plt.xlabel('False Positive Rate', fontsize=12, labelpad=10)
    plt.ylabel('True Positive Rate', fontsize=12, labelpad=10)
    plt.title('Overall ROC Curve', pad=20, fontweight='bold')
    plt.legend(loc="lower right")
def plot_classification_results(true_labels, pred_labels, classes=['Benign', 'Malignant']):
    fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(8, 6))
    fig.sca(axs[0])
    plot_cm(true_labels, pred_labels, classes)
    fig.sca(axs[1])
    plot_roc_overall(true_labels, pred_labels, classes)
    fig.tight_layout(w_pad=5)
    fig.show()
# Grid Search to find best penalty parameter
svc = LinearSVC(penalty='l1', dual=False)
param_grid = {'C': np.logspace(-2, 2, 10)}
grid = GridSearchCV(svc, param_grid=param_grid, cv=5)
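The grid sweeps the L1 penalty strength C over ten log-spaced values between 0.01 and 100 (smaller C means stronger regularization and therefore sparser weights):
print(np.logspace(-2, 2, 10).round(4))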
# Lists to store metrics
acc_train, acc_test = [], []
auc_train, auc_test = [], []
prec_B_train, prec_M_train, prec_B_test, prec_M_test = [], [], [], []
recall_B_train, recall_M_train, recall_B_test, recall_M_test = [], [], [], []
f1_B_train, f1_M_train, f1_B_test, f1_M_test = [], [], [], []
# Run 30 Monte-Carlo Simulations
for M in tqdm(range(30)):
#--------------------------------------#
# (b) TRAIN-TEST SPLIT #
#--------------------------------------#
# Split Benign and Malign samples and shuffle them
df_B = df[df.Diagnosis == 'B'].sample(frac=1)
df_M = df[df.Diagnosis == 'M'].sample(frac=1)
# Create test and train dataframes
df_test = pd.concat([df_B[0:round(len(df_B)*0.2)],
df_M[0:round(len(df_M)*0.2)]])
df_train = pd.concat([df_B[round(len(df_B)*0.2):],
df_M[round(len(df_M)*0.2):]])
# Get X's and Y's
x_train = df_train.iloc[:, 1:]
x_test = df_test.iloc[:, 1:]
y_train = df_train.iloc[:, 0]
y_test = df_test.iloc[:, 0]
# Normalize X's
SCALER = MinMaxScaler()
x_train = pd.DataFrame(SCALER.fit_transform(x_train), columns=col_names[2:])
x_test = pd.DataFrame(SCALER.transform(x_test), columns=col_names[2:])
# Label Encode Y's
LE = LabelEncoder()
y_train = pd.DataFrame(LE.fit_transform(y_train), columns=[col_names[1]])
y_test = pd.DataFrame(LE.transform(y_test), columns=[col_names[1]])
#--------------------------------------#
# (i) SUPERVISED LEARNING #
#--------------------------------------#
# Train
grid.fit(x_train, y_train)
# Predict
pred_train = grid.predict(x_train)
pred_test = grid.predict(x_test)
# Train Metrics
acc_train.append(accuracy_score(y_train, pred_train))
fpr, tpr, threshold = roc_curve(y_train, pred_train)
auc_train.append(auc(fpr, tpr))
prec_B_train.append(precision_score(y_train, pred_train, pos_label=0))
prec_M_train.append(precision_score(y_train, pred_train, pos_label=1))
recall_B_train.append(recall_score(y_train, pred_train, pos_label=0))
recall_M_train.append(recall_score(y_train, pred_train, pos_label=1))
f1_B_train.append(f1_score(y_train, pred_train, pos_label=0))
f1_M_train.append(f1_score(y_train, pred_train, pos_label=1))
# Test Metrics
acc_test.append(accuracy_score(y_test, pred_test))
fpr, tpr, threshold = roc_curve(y_test, pred_test)
auc_test.append(auc(fpr, tpr))
prec_B_test.append(precision_score(y_test, pred_test, pos_label=0))
prec_M_test.append(precision_score(y_test, pred_test, pos_label=1))
recall_B_test.append(recall_score(y_test, pred_test, pos_label=0))
recall_M_test.append(recall_score(y_test, pred_test, pos_label=1))
f1_B_test.append(f1_score(y_test, pred_test, pos_label=0))
f1_M_test.append(f1_score(y_test, pred_test, pos_label=1))
# Average Train Metrics
summary.at['Accuracy', 'Supervised Train'] = np.mean(acc_train)
summary.at['AUC', 'Supervised Train'] = np.mean(auc_train)
summary.at['Precision', 'Supervised Train'] = np.mean(prec_B_train + prec_M_train)
summary.at['Precision_B', 'Supervised Train'] = np.mean(prec_B_train)
summary.at['Precision_M', 'Supervised Train'] = np.mean(prec_M_train)
summary.at['Recall', 'Supervised Train'] = np.mean(recall_B_train + recall_M_train)
summary.at['Recall_B', 'Supervised Train'] = np.mean(recall_B_train)
summary.at['Recall_M', 'Supervised Train'] = np.mean(recall_M_train)
summary.at['F1', 'Supervised Train'] = np.mean(f1_B_train + f1_M_train)
summary.at['F1_B', 'Supervised Train'] = np.mean(f1_B_train)
summary.at['F1_M', 'Supervised Train'] = np.mean(f1_M_train)
# Average Test Metrics
summary.at['Accuracy', 'Supervised Test'] = np.mean(acc_test)
summary.at['AUC', 'Supervised Test'] = np.mean(auc_test)
summary.at['Precision', 'Supervised Test'] = np.mean(prec_B_test + prec_M_test)
summary.at['Precision_B', 'Supervised Test'] = np.mean(prec_B_test)
summary.at['Precision_M', 'Supervised Test'] = np.mean(prec_M_test)
summary.at['Recall', 'Supervised Test'] = np.mean(recall_B_test + recall_M_test)
summary.at['Recall_B', 'Supervised Test'] = np.mean(recall_B_test)
summary.at['Recall_M', 'Supervised Test'] = np.mean(recall_M_test)
summary.at['F1', 'Supervised Test'] = np.mean(f1_B_test + f1_M_test)
summary.at['F1_B', 'Supervised Test'] = np.mean(f1_B_test)
summary.at['F1_M', 'Supervised Test'] = np.mean(f1_M_test)
# Show results
summary.loc[['Accuracy', 'AUC', 'Precision', 'Recall', 'F1']]
# Confusion Matrix and ROC Curve | Train Data
plot_classification_results(y_train, pred_train)
plt.suptitle('Supervised Learning | Train Data', y=0.9, fontsize=15, fontweight='bold')
plt.show()
# Confusion Matrix and ROC Curve | Test Data
plot_classification_results(y_test, pred_test)
plt.suptitle('Supervised Learning | Test Data', y=0.9, fontsize=15, fontweight='bold')
plt.show()
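For reference, the penalty selected in the last Monte Carlo run and the sparsity induced by the L1 penalty can be inspected; this is a sketch that reuses the grid object left over from the final iteration:
best_C = grid.best_params_['C']
coefs = grid.best_estimator_.coef_
print(f'Best C (last run): {best_C:.4f}')
print(f'Non-zero coefficients kept by the L1 penalty: {np.sum(coefs != 0)} of {coefs.size}')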
# Lists to store metrics
acc_train, acc_test = [], []
auc_train, auc_test = [], []
prec_B_train, prec_M_train, prec_B_test, prec_M_test = [], [], [], []
recall_B_train, recall_M_train, recall_B_test, recall_M_test = [], [], [], []
f1_B_train, f1_M_train, f1_B_test, f1_M_test = [], [], [], []
# Run 30 Monte-Carlo Simulations
for M in tqdm(range(30)):
#--------------------------------------#
# (b) TRAIN-TEST SPLIT #
#--------------------------------------#
# Split Benign and Malign samples and shuffle them
df_B = df[df.Diagnosis == 'B'].sample(frac=1)
df_M = df[df.Diagnosis == 'M'].sample(frac=1)
# Create test and train dataframes
df_test = pd.concat([df_B[0:round(len(df_B)*0.2)],
df_M[0:round(len(df_M)*0.2)]])
df_train = pd.concat([df_B[round(len(df_B)*0.2):],
df_M[round(len(df_M)*0.2):]])
# Get X's and Y's
x_train = df_train.iloc[:, 1:]
x_test = df_test.iloc[:, 1:]
y_train = df_train.iloc[:, 0]
y_test = df_test.iloc[:, 0]
# Normalize X's
SCALER = MinMaxScaler()
x_train = pd.DataFrame(SCALER.fit_transform(x_train), columns=col_names[2:])
x_test = pd.DataFrame(SCALER.transform(x_test), columns=col_names[2:])
# Label Encode Y's
LE = LabelEncoder()
y_train = pd.DataFrame(LE.fit_transform(y_train), columns=[col_names[1]])
y_test = pd.DataFrame(LE.transform(y_test), columns=[col_names[1]])
#--------------------------------------#
# (ii) LABELED-UNLABELED SPLIT #
#--------------------------------------#
# Split labeled and unlabeled data
x_labeled, x_unlabeled, y_labeled, y_unlabeled = train_test_split(x_train, y_train, test_size=0.5, stratify=y_train)
#--------------------------------------#
# (A) FIND PENALTY HYPERPARAMETER #
#--------------------------------------#
# Grid Search to find best penalty hyperparameter
svc = LinearSVC(penalty='l1', dual=False)
param_grid = {'C': np.logspace(-2, 2, 10)}
grid = GridSearchCV(svc, param_grid=param_grid, cv=5)
# Fit the grid search on the labeled portion only
grid.fit(x_labeled, y_labeled)
#--------------------------------------#
# (B) SELF-TRAINING #
#--------------------------------------#
# SVC using the best C found in part A
svc = LinearSVC(penalty='l1', dual=False, C=grid.best_params_["C"])
# One-by-one predict an unlabeled sample and move it to the labeled dataset
while len(x_unlabeled) > 0:
# Train the classifier on the labeled samples
svc.fit(x_labeled, y_labeled)
# Find the unlabeled sample farthest from the decision boundary (the one the current model is most confident about)
distances_unlabeled_x = np.absolute(svc.decision_function(x_unlabeled))
idx_farthest = np.argmax(distances_unlabeled_x)
farthest_unlabeled = pd.DataFrame(x_unlabeled.iloc[idx_farthest]).T
# Predict its label
assigned_label = pd.DataFrame(svc.predict(farthest_unlabeled), index=farthest_unlabeled.index, columns=y_labeled.columns)
# Append the sample to the labeled data (DataFrame.append is not in-place, so reassign via pd.concat)
x_labeled = pd.concat([x_labeled, farthest_unlabeled])
y_labeled = pd.concat([y_labeled, assigned_label])
# Remove the sample from the unlabeled data
x_unlabeled.drop(farthest_unlabeled.index, inplace=True)
y_unlabeled.drop(farthest_unlabeled.index, inplace=True)
# Once all unlabeled samples have been labeled, train a final classifier
svc.fit(x_labeled, y_labeled)
# Predict on the train and test datasets
pred_train = svc.predict(x_train)
pred_test = svc.predict(x_test)
# Train Metrics
acc_train.append(accuracy_score(y_train, pred_train))
fpr, tpr, threshold = roc_curve(y_train, pred_train)
auc_train.append(auc(fpr, tpr))
prec_B_train.append(precision_score(y_train, pred_train, pos_label=0))
prec_M_train.append(precision_score(y_train, pred_train, pos_label=1))
recall_B_train.append(recall_score(y_train, pred_train, pos_label=0))
recall_M_train.append(recall_score(y_train, pred_train, pos_label=1))
f1_B_train.append(f1_score(y_train, pred_train, pos_label=0))
f1_M_train.append(f1_score(y_train, pred_train, pos_label=1))
# Test Metrics
acc_test.append(accuracy_score(y_test, pred_test))
fpr, tpr, threshold = roc_curve(y_test, pred_test)
auc_test.append(auc(fpr, tpr))
prec_B_test.append(precision_score(y_test, pred_test, pos_label=0))
prec_M_test.append(precision_score(y_test, pred_test, pos_label=1))
recall_B_test.append(recall_score(y_test, pred_test, pos_label=0))
recall_M_test.append(recall_score(y_test, pred_test, pos_label=1))
f1_B_test.append(f1_score(y_test, pred_test, pos_label=0))
f1_M_test.append(f1_score(y_test, pred_test, pos_label=1))
# Average Train Metrics
summary.at['Accuracy', 'Semi-Supervised Train'] = np.mean(acc_train)
summary.at['AUC', 'Semi-Supervised Train'] = np.mean(auc_train)
summary.at['Precision', 'Semi-Supervised Train'] = np.mean(prec_B_train + prec_M_train)
summary.at['Precision_B', 'Semi-Supervised Train'] = np.mean(prec_B_train)
summary.at['Precision_M', 'Semi-Supervised Train'] = np.mean(prec_M_train)
summary.at['Recall', 'Semi-Supervised Train'] = np.mean(recall_B_train + recall_M_train)
summary.at['Recall_B', 'Semi-Supervised Train'] = np.mean(recall_B_train)
summary.at['Recall_M', 'Semi-Supervised Train'] = np.mean(recall_M_train)
summary.at['F1', 'Semi-Supervised Train'] = np.mean(f1_B_train + f1_M_train)
summary.at['F1_B', 'Semi-Supervised Train'] = np.mean(f1_B_train)
summary.at['F1_M', 'Semi-Supervised Train'] = np.mean(f1_M_train)
# Average Test Metrics
summary.at['Accuracy', 'Semi-Supervised Test'] = np.mean(acc_test)
summary.at['AUC', 'Semi-Supervised Test'] = np.mean(auc_test)
summary.at['Precision', 'Semi-Supervised Test'] = np.mean(prec_B_test + prec_M_test)
summary.at['Precision_B', 'Semi-Supervised Test'] = np.mean(prec_B_test)
summary.at['Precision_M', 'Semi-Supervised Test'] = np.mean(prec_M_test)
summary.at['Recall', 'Semi-Supervised Test'] = np.mean(recall_B_test + recall_M_test)
summary.at['Recall_B', 'Semi-Supervised Test'] = np.mean(recall_B_test)
summary.at['Recall_M', 'Semi-Supervised Test'] = np.mean(recall_M_test)
summary.at['F1', 'Semi-Supervised Test'] = np.mean(f1_B_test + f1_M_test)
summary.at['F1_B', 'Semi-Supervised Test'] = np.mean(f1_B_test)
summary.at['F1_M', 'Semi-Supervised Test'] = np.mean(f1_M_test)
# Show results
summary.loc[['Accuracy', 'AUC', 'Precision', 'Recall', 'F1']]
# Confusion Matrix and ROC Curve | Train Data
plot_classification_results(y_train, pred_train)
plt.suptitle('Semi-Supervised Learning | Train Data', y=0.9, fontsize=15, fontweight='bold')
plt.show()
# Confusion Matrix and ROC Curve | Test Data
plot_classification_results(y_test, pred_test)
plt.suptitle('Semi-Supervised Learning | Test Data', y=0.9, fontsize=15, fontweight='bold')
plt.show()
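The self-training step above relies on LinearSVC.decision_function, which returns the signed distance of each sample to the separating hyperplane: the sign gives the predicted class and the magnitude the confidence, so the "farthest" unlabeled point is the one the current model is most sure about. A minimal sketch, reusing the svc and x_test left from the last run:
distances = svc.decision_function(x_test)
most_confident = np.argmax(np.abs(distances))
print(f'Most confident sample: index {most_confident}, signed distance {distances[most_confident]:.3f}')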
# Lists to store metrics
acc_train, acc_test = [], []
auc_train, auc_test = [], []
prec_B_train, prec_M_train, prec_B_test, prec_M_test = [], [], [], []
recall_B_train, recall_M_train, recall_B_test, recall_M_test = [], [], [], []
f1_B_train, f1_M_train, f1_B_test, f1_M_test = [], [], [], []
# Run 30 Monte-Carlo Simulations
for M in tqdm(range(30)):
#--------------------------------------#
# (b) TRAIN-TEST SPLIT #
#--------------------------------------#
# Split Benign and Malign samples and shuffle them
df_B = df[df.Diagnosis == 'B'].sample(frac=1)
df_M = df[df.Diagnosis == 'M'].sample(frac=1)
# Create test and train dataframes
df_test = pd.concat([df_B[0:round(len(df_B)*0.2)],
df_M[0:round(len(df_M)*0.2)]])
df_train = pd.concat([df_B[round(len(df_B)*0.2):],
df_M[round(len(df_M)*0.2):]])
# Get X's and Y's
x_train = df_train.iloc[:, 1:]
x_test = df_test.iloc[:, 1:]
y_train = df_train.iloc[:, 0]
y_test = df_test.iloc[:, 0]
# Normalize X's
SCALER = MinMaxScaler()
x_train = pd.DataFrame(SCALER.fit_transform(x_train), columns=col_names[2:])
x_test = pd.DataFrame(SCALER.transform(x_test), columns=col_names[2:])
# Label Encode Y's
LE = LabelEncoder()
y_train = pd.DataFrame(LE.fit_transform(y_train), columns=[col_names[1]])
y_test = pd.DataFrame(LE.transform(y_test), columns=[col_names[1]])
#--------------------------------------#
# (iii) 2 CLUSTER K-MEANS #
#--------------------------------------#
N_CLUSTERS = 2
#--------------------------------------#
# (A) AVOIDING LOCAL MINIMUM #
#--------------------------------------#
# To keep the algorithm from getting trapped in a local minimum, it is initialized multiple times,
# with each initialization picking the starting cluster centers at random.
# This is controlled by the "init" and "n_init" hyperparameters.
kmeans = KMeans(n_clusters=N_CLUSTERS, init='random', n_init=100)
#--------------------------------------#
# (B) LABELING CLUSTERS #
#--------------------------------------#
# Compute cluster centers
kmeans.fit(x_train)
# Get the distances between samples and cluster centers
cluster_distances = kmeans.transform(x_train)
# Get the 30 samples closest to each cluster
cluster_0_distances = cluster_distances[:, 0]
cluster_1_distances = cluster_distances[:, 1]
closest_cluster_0 = cluster_0_distances.argsort()[:30]
closest_cluster_1 = cluster_1_distances.argsort()[:30]
# Get the most frequent class in each cluster (based on the 30 closest samples)
cluster_0_class = y_train.iloc[closest_cluster_0].mode(axis='index').values[0][0]
cluster_1_class = y_train.iloc[closest_cluster_1].mode(axis='index').values[0][0]
cluster_classes = {0:cluster_0_class,
1:cluster_1_class}
# Assign clusters to train data
clusters_train = kmeans.predict(x_train)
# Assign a class to each sample based on its cluster
pred_train = np.asarray(list(map(lambda key: cluster_classes[key], clusters_train)))
# Train Metrics
acc_train.append(accuracy_score(y_train, pred_train))
fpr, tpr, threshold = roc_curve(y_train, pred_train)
auc_train.append(auc(fpr, tpr))
prec_B_train.append(precision_score(y_train, pred_train, pos_label=0))
prec_M_train.append(precision_score(y_train, pred_train, pos_label=1))
recall_B_train.append(recall_score(y_train, pred_train, pos_label=0))
recall_M_train.append(recall_score(y_train, pred_train, pos_label=1))
f1_B_train.append(f1_score(y_train, pred_train, pos_label=0))
f1_M_train.append(f1_score(y_train, pred_train, pos_label=1))
#--------------------------------------#
# (C) TEST DATASET #
#--------------------------------------#
# Assign clusters to test data
clusters_test = kmeans.predict(x_test)
# Assign a class to each sample based on its cluster
pred_test = np.asarray(list(map(lambda key: cluster_classes[key], clusters_test)))
# Test Metrics
acc_test.append(accuracy_score(y_test, pred_test))
fpr, tpr, threshold = roc_curve(y_test, pred_test)
auc_test.append(auc(fpr, tpr))
prec_B_test.append(precision_score(y_test, pred_test, pos_label=0))
prec_M_test.append(precision_score(y_test, pred_test, pos_label=1))
recall_B_test.append(recall_score(y_test, pred_test, pos_label=0))
recall_M_test.append(recall_score(y_test, pred_test, pos_label=1))
f1_B_test.append(f1_score(y_test, pred_test, pos_label=0))
f1_M_test.append(f1_score(y_test, pred_test, pos_label=1))
# Average Train Metrics
summary.at['Accuracy', 'Unsupervised Train'] = np.mean(acc_train)
summary.at['AUC', 'Unsupervised Train'] = np.mean(auc_train)
summary.at['Precision', 'Unsupervised Train'] = np.mean(prec_B_train + prec_M_train)
summary.at['Precision_B', 'Unsupervised Train'] = np.mean(prec_B_train)
summary.at['Precision_M', 'Unsupervised Train'] = np.mean(prec_M_train)
summary.at['Recall', 'Unsupervised Train'] = np.mean(recall_B_train + recall_M_train)
summary.at['Recall_B', 'Unsupervised Train'] = np.mean(recall_B_train)
summary.at['Recall_M', 'Unsupervised Train'] = np.mean(recall_M_train)
summary.at['F1', 'Unsupervised Train'] = np.mean(f1_B_train + f1_M_train)
summary.at['F1_B', 'Unsupervised Train'] = np.mean(f1_B_train)
summary.at['F1_M', 'Unsupervised Train'] = np.mean(f1_M_train)
# Average Test Metrics
summary.at['Accuracy', 'Unsupervised Test'] = np.mean(acc_test)
summary.at['AUC', 'Unsupervised Test'] = np.mean(auc_test)
summary.at['Precision', 'Unsupervised Test'] = np.mean(prec_B_test + prec_M_test)
summary.at['Precision_B', 'Unsupervised Test'] = np.mean(prec_B_test)
summary.at['Precision_M', 'Unsupervised Test'] = np.mean(prec_M_test)
summary.at['Recall', 'Unsupervised Test'] = np.mean(recall_B_test + recall_M_test)
summary.at['Recall_B', 'Unsupervised Test'] = np.mean(recall_B_test)
summary.at['Recall_M', 'Unsupervised Test'] = np.mean(recall_M_test)
summary.at['F1', 'Unsupervised Test'] = np.mean(f1_B_test + f1_M_test)
summary.at['F1_B', 'Unsupervised Test'] = np.mean(f1_B_test)
summary.at['F1_M', 'Unsupervised Test'] = np.mean(f1_M_test)
# Show results
summary.loc[['Accuracy', 'AUC', 'Precision', 'Recall', 'F1']]
# Confusion Matrix and ROC Curve | Train Data
plot_classification_results(y_train, pred_train)
plt.suptitle('Unsupervised Learning | Train Data', y=0.9, fontsize=15, fontweight='bold')
plt.show()
# Confusion Matrix and ROC Curve | Test Data
plot_classification_results(y_test, pred_test)
plt.suptitle('Unsupervised Learning | Test Data', y=0.9, fontsize=15, fontweight='bold')
plt.show()
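To illustrate the effect of the multiple random initializations discussed in part (A) above, a small sketch (reusing the x_train left over from the last Monte Carlo iteration) compares the k-means objective of a single initialization against the best of 100; lower inertia is better:
single_init = KMeans(n_clusters=2, init='random', n_init=1, random_state=0).fit(x_train)
multi_init = KMeans(n_clusters=2, init='random', n_init=100, random_state=0).fit(x_train)
print(f'Inertia with n_init=1:   {single_init.inertia_:.4f}')
print(f'Inertia with n_init=100: {multi_init.inertia_:.4f}')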
A Tutorial on Spectral Clustering (von Luxburg, 2007): http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.165.9323&rep=rep1&type=pdf
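The SpectralClustering model used below (affinity='rbf', gamma=1) builds its similarity graph from the pairwise affinities W_ij = exp(-gamma * ||x_i - x_j||^2). A minimal sketch of that matrix, assuming the x_train from the previous section is still in scope:
from sklearn.metrics.pairwise import rbf_kernel
W = rbf_kernel(x_train, x_train, gamma=1)
print(W.shape)           # (n_train, n_train) similarity graph
print(W.diagonal()[:3])  # self-similarity is exp(0) = 1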
# Lists to store metrics
acc_train, acc_test = [], []
auc_train, auc_test = [], []
prec_B_train, prec_M_train, prec_B_test, prec_M_test = [], [], [], []
recall_B_train, recall_M_train, recall_B_test, recall_M_test = [], [], [], []
f1_B_train, f1_M_train, f1_B_test, f1_M_test = [], [], [], []
# Run 30 Monte-Carlo Simulations
for M in tqdm(range(30)):
#--------------------------------------#
# (b) TRAIN-TEST SPLIT #
#--------------------------------------#
# Split Benign and Malign samples and shuffle them
df_B = df[df.Diagnosis == 'B'].sample(frac=1)
df_M = df[df.Diagnosis == 'M'].sample(frac=1)
# Create test and train dataframes
df_test = pd.concat([df_B[0:round(len(df_B)*0.2)],
df_M[0:round(len(df_M)*0.2)]])
df_train = pd.concat([df_B[round(len(df_B)*0.2):],
df_M[round(len(df_M)*0.2):]])
# Get X's and Y's
x_train = df_train.iloc[:, 1:]
x_test = df_test.iloc[:, 1:]
y_train = df_train.iloc[:, 0]
y_test = df_test.iloc[:, 0]
# Normalize X's
SCALER = MinMaxScaler()
x_train = pd.DataFrame(SCALER.fit_transform(x_train), columns=col_names[2:])
x_test = pd.DataFrame(SCALER.transform(x_test), columns=col_names[2:])
# Label Encode Y's
LE = LabelEncoder()
y_train = pd.DataFrame(LE.fit_transform(y_train), columns=[col_names[1]])
y_test = pd.DataFrame(LE.transform(y_test), columns=[col_names[1]])
#--------------------------------------#
#     (iv) 2 CLUSTER SPECTRAL CLUSTERING     #
#--------------------------------------#
N_CLUSTERS = 2
#--------------------------------------#
# (A) AVOIDING LOCAL MINIMUM #
#--------------------------------------#
# Spectral clustering runs k-means on the spectral embedding, so it can also get trapped in a local minimum;
# running that k-means step with many random initializations mitigates this.
# This is controlled by the "n_init" hyperparameter.
spectral_clst = SpectralClustering(n_clusters=N_CLUSTERS, affinity='rbf', gamma=1, n_init=100, n_jobs=-1)
#--------------------------------------#
# (B) LABELING CLUSTERS #
#--------------------------------------#
# Fit spectral clustering and get the cluster assignment of each training sample
assigned_clusters_train = spectral_clst.fit_predict(x_train)
# For each cluster get the indexes of their datapoints
idx_cluster_0 = np.argwhere(assigned_clusters_train == 0).flatten()
idx_cluster_1 = np.argwhere(assigned_clusters_train == 1).flatten()
# Get the most frequent class in each cluster (based on all samples on the cluster)
cluster_0_class = y_train.iloc[idx_cluster_0].mode(axis='index').values[0][0]
cluster_1_class = y_train.iloc[idx_cluster_1].mode(axis='index').values[0][0]
cluster_classes = {0:cluster_0_class,
1:cluster_1_class}
# Assign a class to each sample based on its cluster
pred_train = np.asarray(list(map(lambda key: cluster_classes[key], assigned_clusters_train)))
# Train Metrics
acc_train.append(accuracy_score(y_train, pred_train))
fpr, tpr, threshold = roc_curve(y_train, pred_train)
auc_train.append(auc(fpr, tpr))
prec_B_train.append(precision_score(y_train, pred_train, pos_label=0))
prec_M_train.append(precision_score(y_train, pred_train, pos_label=1))
recall_B_train.append(recall_score(y_train, pred_train, pos_label=0))
recall_M_train.append(recall_score(y_train, pred_train, pos_label=1))
f1_B_train.append(f1_score(y_train, pred_train, pos_label=0))
f1_M_train.append(f1_score(y_train, pred_train, pos_label=1))
#--------------------------------------#
# (C) TEST DATASET #
#--------------------------------------#
'''
# Assign clusters to the classes
assigned_clusters_test = spectral_clst.fit_predict(x_test)
# For each cluster get the indexes of their datapoints
idx_cluster_0 = np.argwhere(assigned_clusters_test == 0).flatten()
idx_cluster_1 = np.argwhere(assigned_clusters_test == 1).flatten()
# Get the most frequent class in each cluster (based on all samples on the cluster)
cluster_0_class = y_test.iloc[idx_cluster_0].mode(axis='index').values[0][0]
cluster_1_class = y_test.iloc[idx_cluster_1].mode(axis='index').values[0][0]
cluster_classes = {0:cluster_0_class,
1:cluster_1_class}
# Assign a class to each sample based on its cluster
pred_test = np.asarray(list(map(lambda key: cluster_classes[key], assigned_clusters_test)))
'''
# https://piazza.com/class/kdfi4ly9oqy72y?cid=547
# Use KNN to assign labels to test samples based on train samples
KNN = KNeighborsClassifier(n_neighbors=1, n_jobs=-1)
KNN.fit(x_train, pred_train)
pred_test = KNN.predict(x_test)
# Test Metrics
acc_test.append(accuracy_score(y_test, pred_test))
fpr, tpr, threshold = roc_curve(y_test, pred_test)
auc_test.append(auc(fpr, tpr))
prec_B_test.append(precision_score(y_test, pred_test, pos_label=0))
prec_M_test.append(precision_score(y_test, pred_test, pos_label=1))
recall_B_test.append(recall_score(y_test, pred_test, pos_label=0))
recall_M_test.append(recall_score(y_test, pred_test, pos_label=1))
f1_B_test.append(f1_score(y_test, pred_test, pos_label=0))
f1_M_test.append(f1_score(y_test, pred_test, pos_label=1))
# Average Train Metrics
summary.at['Accuracy', 'Spectral Train'] = np.mean(acc_train)
summary.at['AUC', 'Spectral Train'] = np.mean(auc_train)
summary.at['Precision', 'Spectral Train'] = np.mean(prec_B_train + prec_M_train)
summary.at['Precision_B', 'Spectral Train'] = np.mean(prec_B_train)
summary.at['Precision_M', 'Spectral Train'] = np.mean(prec_M_train)
summary.at['Recall', 'Spectral Train'] = np.mean(recall_B_train + recall_M_train)
summary.at['Recall_B', 'Spectral Train'] = np.mean(recall_B_train)
summary.at['Recall_M', 'Spectral Train'] = np.mean(recall_M_train)
summary.at['F1', 'Spectral Train'] = np.mean(f1_B_train + f1_M_train)
summary.at['F1_B', 'Spectral Train'] = np.mean(f1_B_train)
summary.at['F1_M', 'Spectral Train'] = np.mean(f1_M_train)
# Average Test Metrics
summary.at['Accuracy', 'Spectral Test'] = np.mean(acc_test)
summary.at['AUC', 'Spectral Test'] = np.mean(auc_test)
summary.at['Precision', 'Spectral Test'] = np.mean(prec_B_test + prec_M_test)
summary.at['Precision_B', 'Spectral Test'] = np.mean(prec_B_test)
summary.at['Precision_M', 'Spectral Test'] = np.mean(prec_M_test)
summary.at['Recall', 'Spectral Test'] = np.mean(recall_B_test + recall_M_test)
summary.at['Recall_B', 'Spectral Test'] = np.mean(recall_B_test)
summary.at['Recall_M', 'Spectral Test'] = np.mean(recall_M_test)
summary.at['F1', 'Spectral Test'] = np.mean(f1_B_test + f1_M_test)
summary.at['F1_B', 'Spectral Test'] = np.mean(f1_B_test)
summary.at['F1_M', 'Spectral Test'] = np.mean(f1_M_test)
# Show results
summary.loc[['Accuracy', 'AUC', 'Precision', 'Recall', 'F1']]
# Confusion Matrix and ROC Curve | Train Data
plot_classification_results(y_train, pred_train)
plt.suptitle('Spectral Clustering | Train Data', y=0.9, fontsize=15, fontweight='bold')
plt.show()
# Confusion Matrix and ROC Curve | Test Data
plot_classification_results(y_test, pred_test)
plt.suptitle('Spectral Clustering | Test Data', y=0.9, fontsize=15, fontweight='bold')
plt.show()
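As a quick diagnostic of how well the spectral clusters line up with the true classes, a contingency table can be built; this sketch reuses assigned_clusters_train and y_train from the final Monte Carlo iteration:
# Rows: spectral cluster id; columns: encoded class (0 = B, 1 = M)
print(pd.crosstab(assigned_clusters_train, y_train.values.ravel(), rownames=['cluster'], colnames=['class']))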
# Comparing results on train data
summary_train = summary.loc[['Accuracy', 'AUC', 'Precision', 'Recall', 'F1'], ['Supervised Train', 'Semi-Supervised Train', 'Unsupervised Train', 'Spectral Train']]
summary_train
# Comparing results on test data
summary_test = summary.loc[['Accuracy', 'AUC', 'Precision', 'Recall', 'F1'], ['Supervised Test', 'Semi-Supervised Test', 'Unsupervised Test', 'Spectral Test']]
summary_test
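A compact way to read the table above is to ask which approach scores highest on each metric for the test data:
print(summary_test.idxmax(axis=1))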
# Plotting all metrics for train and test data
fig, axs = plt.subplots(nrows=5, ncols=2, figsize=(10,20), sharey='row')
custom_palette = sns.color_palette("hls", 4)
sns.set_palette(custom_palette)
# Train Data Accuracy
fig.sca(axs[0][0])
g = sns.barplot(y=summary_train.T['Accuracy'].values, x=summary_train.T['Accuracy'].index)
g.set(ylim=(0, 1), ylabel=None)
g.tick_params(labelrotation=15)
axs[0][0].set_title('Accuracy | Train Data', pad=10, fontdict={'fontsize':14, 'fontweight':'bold'})
# Test Data Accuracy
fig.sca(axs[0][1])
g = sns.barplot(y=summary_test.T['Accuracy'].values, x=summary_test.T['Accuracy'].index)
g.set(ylim=(0, 1), ylabel=None)
g.tick_params(labelrotation=15)
axs[0][1].set_title('Accuracy | Test Data', pad=10, fontdict={'fontsize':14, 'fontweight':'bold'})
# Train Data AUC
fig.sca(axs[1][0])
g = sns.barplot(y=summary_train.T['AUC'].values, x=summary_train.T['AUC'].index)
g.set(ylim=(0, 1), ylabel=None)
g.tick_params(labelrotation=15)
axs[1][0].set_title('AUC | Train Data', pad=10, fontdict={'fontsize':14, 'fontweight':'bold'})
# Test Data AUC
fig.sca(axs[1][1])
g = sns.barplot(y=summary_test.T['AUC'].values, x=summary_test.T['AUC'].index)
g.set(ylim=(0, 1), ylabel=None)
g.tick_params(labelrotation=15)
axs[1][1].set_title('AUC | Test Data', pad=10, fontdict={'fontsize':14, 'fontweight':'bold'})
# Train Data Precision
fig.sca(axs[2][0])
g = sns.barplot(y=summary_train.T['Precision'].values, x=summary_train.T['Precision'].index)
g.set(ylim=(0, 1), ylabel=None)
g.tick_params(labelrotation=15)
axs[2][0].set_title('Precision | Train Data', pad=10, fontdict={'fontsize':14, 'fontweight':'bold'})
# Test Data Precision
fig.sca(axs[2][1])
g = sns.barplot(y=summary_test.T['Precision'].values, x=summary_test.T['Precision'].index)
g.set(ylim=(0, 1), ylabel=None)
g.tick_params(labelrotation=15)
axs[2][1].set_title('Precision | Test Data', pad=10, fontdict={'fontsize':14, 'fontweight':'bold'})
# Train Data Recall
fig.sca(axs[3][0])
g = sns.barplot(y=summary_train.T['Recall'].values, x=summary_train.T['Recall'].index)
g.set(ylim=(0, 1), ylabel=None)
g.tick_params(labelrotation=15)
axs[3][0].set_title('Recall | Train Data', pad=10, fontdict={'fontsize':14, 'fontweight':'bold'})
# Test Data Recall
fig.sca(axs[3][1])
g = sns.barplot(y=summary_test.T['Recall'].values, x=summary_test.T['Recall'].index)
g.set(ylim=(0, 1), ylabel=None)
g.tick_params(labelrotation=15)
axs[3][1].set_title('Recall | Test Data', pad=10, fontdict={'fontsize':14, 'fontweight':'bold'})
# Train Data F1
fig.sca(axs[4][0])
g = sns.barplot(y=summary_train.T['F1'].values, x=summary_train.T['F1'].index)
g.set(ylim=(0, 1), ylabel=None)
g.tick_params(labelrotation=15)
axs[4][0].set_title('F1 | Train Data', pad=10, fontdict={'fontsize':14, 'fontweight':'bold'})
# Test Data F1
fig.sca(axs[4][1])
g = sns.barplot(y=summary_test.T['F1'].values, x=summary_test.T['F1'].index)
g.set(ylim=(0, 1), ylabel=None)
g.tick_params(labelrotation=15)
axs[4][1].set_title('F1 | Test Data', pad=10, fontdict={'fontsize':14, 'fontweight':'bold'})
fig.tight_layout(h_pad=3)
plt.show()
# Analyzing how the models perform on each class separately (class-stratified metrics)
stratified = summary.loc[['Precision_B', 'Precision_M','Recall_B', 'Recall_M', 'F1_B', 'F1_M']]
stratified_train = stratified.loc[['Precision_B', 'Precision_M','Recall_B', 'Recall_M', 'F1_B', 'F1_M'], ['Supervised Train', 'Semi-Supervised Train', 'Unsupervised Train', 'Spectral Train']]
stratified_test = stratified.loc[['Precision_B', 'Precision_M','Recall_B', 'Recall_M', 'F1_B', 'F1_M'], ['Supervised Test', 'Semi-Supervised Test', 'Unsupervised Test', 'Spectral Test']]
stratified
# Plotting all metrics for train and test data
fig, axs = plt.subplots(nrows=6, ncols=2, figsize=(10,24), sharey='row')
custom_palette = sns.color_palette("hls", 4)
sns.set_palette(custom_palette)
# Train Data Precision_B
fig.sca(axs[0][0])
g = sns.barplot(y=stratified_train.T['Precision_B'].values, x=stratified_train.T['Precision_B'].index)
g.set(ylim=(0, 1), ylabel=None)
g.tick_params(labelrotation=15)
axs[0][0].set_title('Precision_B | Train Data', pad=10, fontdict={'fontsize':14, 'fontweight':'bold'})
# Test Data Precision_B
fig.sca(axs[1][0])
g = sns.barplot(y=stratified_test.T['Precision_B'].values, x=stratified_test.T['Precision_B'].index)
g.set(ylim=(0, 1), ylabel=None)
g.tick_params(labelrotation=15)
axs[1][0].set_title('Precision_B | Test Data', pad=10, fontdict={'fontsize':14, 'fontweight':'bold'})
# Train Data Precision_M
fig.sca(axs[0][1])
g = sns.barplot(y=stratified_train.T['Precision_M'].values, x=stratified_train.T['Precision_M'].index)
g.set(ylim=(0, 1), ylabel=None)
g.tick_params(labelrotation=15)
axs[0][1].set_title('Precision_M | Train Data', pad=10, fontdict={'fontsize':14, 'fontweight':'bold'})
# Test Data Precision_M
fig.sca(axs[1][1])
g = sns.barplot(y=stratified_test.T['Precision_M'].values, x=stratified_test.T['Precision_M'].index)
g.set(ylim=(0, 1), ylabel=None)
g.tick_params(labelrotation=15)
axs[1][1].set_title('Precision_M | Test Data', pad=10, fontdict={'fontsize':14, 'fontweight':'bold'})
# Train Data Recall_B
fig.sca(axs[2][0])
g = sns.barplot(y=stratified_train.T['Recall_B'].values, x=stratified_train.T['Recall_B'].index)
g.set(ylim=(0, 1), ylabel=None)
g.tick_params(labelrotation=15)
axs[2][0].set_title('Recall_B | Train Data', pad=10, fontdict={'fontsize':14, 'fontweight':'bold'})
# Test Data Recall_B
fig.sca(axs[3][0])
g = sns.barplot(y=stratified_test.T['Recall_B'].values, x=stratified_test.T['Recall_B'].index)
g.set(ylim=(0, 1), ylabel=None)
g.tick_params(labelrotation=15)
axs[3][0].set_title('Recall_B | Test Data', pad=10, fontdict={'fontsize':14, 'fontweight':'bold'})
# Train Data Recall_M
fig.sca(axs[2][1])
g = sns.barplot(y=stratified_train.T['Recall_M'].values, x=stratified_train.T['Recall_M'].index)
g.set(ylim=(0, 1), ylabel=None)
g.tick_params(labelrotation=15)
axs[2][1].set_title('Recall_M | Train Data', pad=10, fontdict={'fontsize':14, 'fontweight':'bold'})
# Test Data Recall_M
fig.sca(axs[3][1])
g = sns.barplot(y=stratified_test.T['Recall_M'].values, x=stratified_test.T['Recall_M'].index)
g.set(ylim=(0, 1), ylabel=None)
g.tick_params(labelrotation=15)
axs[3][1].set_title('Recall_M | Test Data', pad=10, fontdict={'fontsize':14, 'fontweight':'bold'})
# Train Data F1_B
fig.sca(axs[4][0])
g = sns.barplot(y=stratified_train.T['F1_B'].values, x=stratified_train.T['F1_B'].index)
g.set(ylim=(0, 1), ylabel=None)
g.tick_params(labelrotation=15)
axs[4][0].set_title('F1_B | Train Data', pad=10, fontdict={'fontsize':14, 'fontweight':'bold'})
# Test Data F1_B
fig.sca(axs[5][0])
g = sns.barplot(y=stratified_test.T['F1_B'].values, x=stratified_test.T['F1_B'].index)
g.set(ylim=(0, 1), ylabel=None)
g.tick_params(labelrotation=15)
axs[5][0].set_title('F1_B | Test Data', pad=10, fontdict={'fontsize':14, 'fontweight':'bold'})
# Train Data F1_M
fig.sca(axs[4][1])
g = sns.barplot(y=stratified_train.T['F1_M'].values, x=stratified_train.T['F1_M'].index)
g.set(ylim=(0, 1), ylabel=None)
g.tick_params(labelrotation=15)
axs[4][1].set_title('F1_M | Train Data', pad=10, fontdict={'fontsize':14, 'fontweight':'bold'})
# Test Data F1_M
fig.sca(axs[5][1])
g = sns.barplot(y=stratified_test.T['F1_M'].values, x=stratified_test.T['F1_M'].index)
g.set(ylim=(0, 1), ylabel=None)
g.tick_params(labelrotation=15)
axs[5][1].set_title('F1_M | Test Data', pad=10, fontdict={'fontsize':14, 'fontweight':'bold'})
fig.tight_layout(h_pad=3)
plt.show()
Conclusions Regarding Model Performance
When comparing the Supervised and the Semi-Supervised methods, there is only a small loss in performance across all metrics.
A somewhat larger drop in performance can be observed when moving from Semi-Supervised to Unsupervised methods.
Yet, the most striking loss in performance can be found among the Unsupervised methods themselves, with Spectral Clustering obtaining quite poor results.
Results are also consistent across metrics: no model shows a marked disparity between any two of its metrics.
Conclusions Regarding Class-Stratified Performance
Performance is marginally higher for the more common class (Benign), but the difference is minimal.
Given that the dataset is reasonably balanced, it is unsurprising that both classes achieve similar scores on every class-stratified metric, for both the train and test sets.