DSCI 552 - Machine Learning for Data Science
Homework 3
Matheus Schmitz
USC ID: 5039286453
# tqdm creates a loading bar for loops
# Extremely useful when your loop takes long
!pip install tqdm
# Data Manipulation
import numpy as np
import pandas as pd
# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Progress bar
from tqdm.notebook import tqdm
# Metrics
from sklearn.metrics import roc_auc_score
# OS
import os
import warnings
warnings.filterwarnings('ignore')
# Auxiliary functions to facilitate plotting classification results
# Based on scikit-learn documentation: https://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pylab
from sklearn.metrics import roc_curve, auc, confusion_matrix, precision_recall_curve
def plot_cm(true_onehot, pred_probs, classes):
# Coerce inputs to np.array
true_onehot = np.asarray(true_onehot)
pred_probs = np.asarray(pred_probs)
classes = np.asarray(classes)
N_CLASSES = len(classes)
# Extract true and predicted labels
pred_labels = classes[np.argmax(pred_probs, axis=1)]
true_labels = classes[np.argmax(true_onehot, axis=1)]
# Get Confusion Matrix and plot
conf_mat = confusion_matrix(true_labels, pred_labels, labels=classes)
#plt.axis('equal')
sns.heatmap(conf_mat, annot=True, cmap='Blues', xticklabels=classes, yticklabels=classes, square=True, cbar=False)
plt.title('Confusion Matrix', pad = 20, fontweight='bold')
plt.ylabel('True Emotion', fontsize = 12, labelpad = 10)
plt.xlabel('Predicted Emotion', fontsize = 12, labelpad = 10)
def plot_roc_multiclass(true_onehot, pred_probs, classes):
# Dictionaries to store data
fpr = dict()
tpr = dict()
thresholds = dict()
roc_auc = dict()
# Coerce inputs to np.array
true_onehot = np.asarray(true_onehot)
pred_probs = np.asarray(pred_probs)
classes = np.asarray(classes)
N_CLASSES = len(classes)
# For each class, get the fpr, tpr, thresholds and auc
for i in range(N_CLASSES):
fpr[i], tpr[i], thresholds[i] = roc_curve(true_onehot[:, i], pred_probs[:, i])
roc_auc[i] = auc(fpr[i], tpr[i])
# Generate multilabel colors with pylab
class_colors = []
cm = pylab.get_cmap('nipy_spectral')
for i in range(len(classes)):
color = cm(1.*i/N_CLASSES)
class_colors.append(color)
# Plot the class-stratified ROCs
plt.axis('square')
for i, color in zip(range(N_CLASSES), class_colors):
plt.plot(fpr[i], tpr[i], color=color, lw=3, label=f'{classes[i]} (area = {roc_auc[i]:.2f})', alpha=0.7)
plt.plot([0, 1], [0, 1], 'k--', lw=2, alpha=0.3)
plt.xlim([-0.01, 1.01])
plt.ylim([-0.01, 1.01])
plt.xlabel('False Positive Rate', fontsize = 12, labelpad = 10)
plt.ylabel('True Positive Rate', fontsize = 12, labelpad = 10)
plt.title('Multi-Class ROC Curve', pad = 20, fontweight='bold')
legend = plt.legend(loc="lower right")
legend._legend_box.align = "right"
def plot_roc_overall(true_onehot, pred_probs, classes):
# Coerce inputs to np.array
true_onehot = np.asarray(true_onehot)
pred_probs = np.asarray(pred_probs)
classes = np.asarray(classes)
N_CLASSES = len(classes)
# Compute global (micro-average) ROC curve and ROC area
fpr, tpr, thresholds = roc_curve(true_onehot.ravel(), pred_probs.ravel())
roc_auc = auc(fpr, tpr)
# Plot the model overall ROC
plt.axis('square')
plt.plot(fpr, tpr, label=f'MODEL OVERALL (area = {roc_auc:.2f})', color='deeppink', lw=5)
plt.plot([0, 1], [0, 1], 'k--', lw=2, alpha=0.3)
plt.xlim([-0.01, 1.01])
plt.ylim([-0.01, 1.01])
plt.xlabel('False Positive Rate', fontsize = 12, labelpad = 10)
plt.ylabel('True Positive Rate', fontsize = 12, labelpad = 10)
plt.title('Overall ROC Curve', pad = 20, fontweight='bold')
legend = plt.legend(loc="lower right")
legend._legend_box.align = "right"
def plot_prec_recall_curve(true_onehot, pred_probs, classes):
# Dictionaries to store data
precision = dict()
recall = dict()
thresholds = dict()
auprc = dict()
# Coerce inputs to np.array
true_onehot = np.asarray(true_onehot)
pred_probs = np.asarray(pred_probs)
classes = np.asarray(classes)
N_CLASSES = len(classes)
# For each class, get the precision, recall, thresholds and AUPRC
for i in range(N_CLASSES):
precision[i], recall[i], thresholds[i] = precision_recall_curve(true_onehot[:, i], pred_probs[:, i])
auprc[i] = auc(recall[i], precision[i])
# Generate multilabel colors with pylab
class_colors = []
cm = pylab.get_cmap('nipy_spectral')
for i in range(len(classes)):
color = cm(1.*i/N_CLASSES)
class_colors.append(color)
# Plot the class-stratified precision-recall curves
plt.axis('square')
for i, color in zip(range(N_CLASSES), class_colors):
plt.plot(recall[i], precision[i], color=color, lw=3, label=f'{classes[i]} (area = {auprc[i]:.2f})', alpha=0.7)
plt.plot([1, 0], [0, 1], 'k--', lw=2, alpha=0.3)
plt.xlim([-0.01, 1.01])
plt.ylim([-0.01, 1.01])
plt.xlabel('Recall', fontsize = 12, labelpad = 10)
plt.ylabel('Precision', fontsize = 12, labelpad = 10)
plt.title('Precision-Recall Curve', pad = 20, fontweight='bold')
legend = plt.legend(loc="lower left")
legend._legend_box.align = "left"
def plot_classification_results(true_onehot, pred_probs, classes):
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(16,10))
fig.sca(axs[0][0])
plot_cm(true_onehot, pred_probs, classes)
fig.sca(axs[0][1])
plot_prec_recall_curve(true_onehot, pred_probs, classes)
fig.sca(axs[1][0])
plot_roc_overall(true_onehot, pred_probs, classes)
fig.sca(axs[1][1])
plot_roc_multiclass(true_onehot, pred_probs, classes)
fig.tight_layout(h_pad=3, w_pad=-30)
fig.show()
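As a quick smoke test of these helpers, they can be called with a few rows of made-up two-class data (the values below are illustrative only, not from the assignment):
# Dummy one-hot labels and predicted probabilities for a two-class sanity check
true_demo = [[1, 0], [0, 1], [1, 0], [0, 1]]
probs_demo = [[0.9, 0.1], [0.2, 0.8], [0.6, 0.4], [0.3, 0.7]]
plot_classification_results(true_demo, probs_demo, classes=['neg', 'pos'])
plt.show()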
col_names = ['time', 'avg_rss12', 'var_rss12', 'avg_rss13', 'var_rss13', 'avg_rss23', 'var_rss23']
# Get all csv files
all_files = []
for rootname, dirnames, filenames in os.walk('../data'):
for filename in filenames:
filepath = os.path.join(rootname, filename)
filepath = filepath.replace('\\', '/')
all_files.append(filepath)
len(all_files)
# List train and test datasets
test_datasets = ['../data/bending1/dataset1.csv',
'../data/bending1/dataset2.csv',
'../data/bending2/dataset1.csv',
'../data/bending2/dataset2.csv',
'../data/cycling/dataset1.csv',
'../data/cycling/dataset2.csv',
'../data/cycling/dataset3.csv',
'../data/lying/dataset1.csv',
'../data/lying/dataset2.csv',
'../data/lying/dataset3.csv',
'../data/sitting/dataset1.csv',
'../data/sitting/dataset2.csv',
'../data/sitting/dataset3.csv',
'../data/standing/dataset1.csv',
'../data/standing/dataset2.csv',
'../data/standing/dataset3.csv',
'../data/walking/dataset1.csv',
'../data/walking/dataset2.csv',
'../data/walking/dataset3.csv',]
train_datasets = [fpath for fpath in all_files if fpath not in test_datasets]
print(f'Tally of test datasets: {len(test_datasets)}')
print(f'Tally of train datasets: {len(train_datasets)}')
Some CSVs are comma-separated (",") while others are space-separated (" "), so pandas has to be told to accept both separation criteria. This can be done with sep=r'\s+|,', where \s+ matches one or more whitespace characters and , matches a comma.
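As a sanity check that the regex handles both delimiters, here is a tiny sketch on a made-up two-line snippet (engine='python' is required for regex separators; pandas falls back to it automatically otherwise):
import io
sample = '100,36.0,0.5\n125 36.25 0.43\n'  # hypothetical rows: one comma-separated, one space-separated
pd.read_csv(io.StringIO(sample), header=None, sep=r'\s+|,', engine='python')  # both rows parse into 3 columns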
# DataFrame.append was removed in pandas 2.0, so collect the frames and concatenate once
df_test = pd.concat(
    [pd.read_csv(file, skiprows=5, header=None, sep=r'\s+|,') for file in test_datasets],
    ignore_index=True)
df_test.columns = col_names
df_test.shape
df_test.tail(3)
# Same approach for the training files
df_train = pd.concat(
    [pd.read_csv(file, skiprows=5, header=None, sep=r'\s+|,') for file in train_datasets],
    ignore_index=True)
df_train.columns = col_names
df_train.shape
One of the datasets has one fewer row.
"../data/sitting/dataset8.csv" shape is (479, 7).
The missing row is between lines 59 and 60. The time skips from 13250 ms to 13750 ms, skipping the expected 13500 ms measurement.
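One way to confirm that kind of gap programmatically (a sketch; the 250 ms sampling step comes from the observation above):
# Flag rows whose time step exceeds the expected 250 ms
tmp = pd.read_csv('../data/sitting/dataset8.csv', skiprows=5, header=None, sep=r'\s+|,')
tmp.columns = col_names
print(tmp.loc[tmp['time'].diff() > 250, 'time'])  # prints 13750, right after the skipped 13500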
df_train.tail(3)
The simplest features that can be extracted from the data are those describing its distribution, known as descriptive statistics. Examples are: mean, median, standard deviation, minimum and maximum, quartiles or percentiles, as well as skewness and kurtosis.
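As a minimal illustration, pandas exposes all of these directly (assuming a numeric dataframe tmp shaped like the CSVs loaded above):
# Descriptive statistics per column; skew() and kurt() cover the higher moments
print(tmp.describe())           # count, mean, std, min, quartiles, max
print(tmp.skew(), tmp.kurt())   # skewness and (excess) kurtosis per column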
Slightly more advanced features are those employed by ARIMA models and their extensions, such as SARIMAX, which uses: Seasonality (S); Autoregression (AR); Integration (I); Moving Average (MA); Exogenous Variables (X).
There are three main classes of features for time series analysis.
Wikipedia has a comprehensive listing with further references:
https://en.wikipedia.org/wiki/Time_series#Measures
Another source which balances comprehensiveness and accessibility is the paper that Facebook published together with its Prophet algorithm for time series forecasting.
table_cols = ['min1', 'max1', 'mean1', 'median1', 'std1', 'firstq1', 'thirdq1',
'min2', 'max2', 'mean2', 'median2', 'std2', 'firstq2', 'thirdq2',
'min3', 'max3', 'mean3', 'median3', 'std3', 'firstq3', 'thirdq3',
'min4', 'max4', 'mean4', 'median4', 'std4', 'firstq4', 'thirdq4',
'min5', 'max5', 'mean5', 'median5', 'std5', 'firstq5', 'thirdq5',
'min6', 'max6', 'mean6', 'median6', 'std6', 'firstq6', 'thirdq6']
len(table_cols)
# Create a master list containing all features for all csv's, then convert it to a dataframe
# The list will follow the order of 'table_cols', listing all attributes of the first feature before going to the second
master_list = []
# Loop through all CSVs
for file in all_files:
# Read the CSV and drop the 'time' column
tmp = pd.read_csv(file, skiprows=5, header=None, sep=r'\s+|,')
tmp.columns = col_names
tmp.drop('time', inplace=True, axis=1)
# Extract the features from each variable in the CSV
ft_min = pd.Series(tmp.min().values)
ft_max = pd.Series(tmp.max().values)
ft_mean = pd.Series(tmp.mean().values)
ft_median = pd.Series(tmp.median().values)
ft_std = pd.Series(tmp.std().values)
ft_firstq = pd.Series(tmp.quantile(0.25).values)
ft_thirdq = pd.Series(tmp.quantile(0.75).values)
# Put all the features in a dataframe
# Each of the features (the 'ft_' pd.Series from above) will be a row, the columns will be each of the 6 variables in the CSV
df_tmp = pd.DataFrame(data=[ft_min, ft_max, ft_mean, ft_median, ft_std, ft_firstq, ft_thirdq])
# The reason for converting to a dataframe of this shape is so that the .melt() method can be used,
# this allows the sequence to be converted...
# from: the .min() of all variables, followed by the .max() of all variables, then the .mean() of all and so on - this is how it would have been were I to merely append all 'ft_' pd.Series together
# to: all features from variable 1, then all features from variable 2, then from variable 3 and so on - which is the desired format
# The resulting pd.Series is now a 42-item long list ordered by CSV variable, and not by 'ft_' feature - it will represent one row in the final dataframe
df_tmp = df_tmp.melt()
# Melt creates two columns, one named 'variable' which has in each row the name of the column that was melted
# The other column is 'value' which has the value associated with the previous match (at.[row, col])
# Keeping only the series of values, ordered by CSV variable
df_tmp = pd.Series(df_tmp['value'])
# Then append the data already in the correct order to the master_list which will be used to make the final dataframe
master_list.append(df_tmp)
# Now that the loop is over, create the final dataframe from the master_list
df_features = pd.DataFrame(data=master_list)
df_features.reset_index(inplace=True, drop=True)
df_features.columns = table_cols
df_features.shape
df_features
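To see why .melt() gives the desired ordering, here is a toy version of the reshape (values are illustrative):
# Rows play the role of features (e.g. min, max); columns play the role of variables
toy = pd.DataFrame([[1, 2], [3, 4]])
print(toy.melt()['value'].tolist())  # [1, 3, 2, 4] -> all features of column 0 first, then column 1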
%%time
# Calculate the standard deviation for each feature from the dataset
df_column_stds = pd.DataFrame(df_features.std(), columns=['Standard Deviation'])
# Create a dataframe to store all bootstrapped std samples
bootstrap_df = pd.DataFrame()
# Run 1000 loops bootstrapping data
for i in range(0, 1000):
# Generate a bootstrapped dataset and get its standard deviation
bs_std = df_features.sample(frac=1, replace=True).std()
# Then append it to bootstrap_df as a column
bootstrap_df = pd.concat([bootstrap_df, bs_std], axis=1)
# After finishing the bootstrapping loop, get the 0.05 and 0.95 quantiles (0.95 - 0.05 = 90% CI)
lower_bound = bootstrap_df.quantile(q=0.05, axis=1)
upper_bound = bootstrap_df.quantile(q=0.95, axis=1)
# Then add the upper and lower bounds to the dataframe with the standard deviations
df_std_ci = pd.concat([df_column_stds, lower_bound, upper_bound], axis=1)
df_std_ci
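For reference, the same percentile interval can be computed by stacking the bootstrap draws into a NumPy array instead of growing a DataFrame column-by-column — a sketch assuming df_features as built above (the seeds are an arbitrary choice, not from the original run):
# 1000 bootstrap std vectors stacked row-wise, then per-feature percentiles
boot_stds = np.stack([df_features.sample(frac=1, replace=True, random_state=i).std().values
                      for i in range(1000)])
lower, upper = np.percentile(boot_stds, [5, 95], axis=0)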
Min, max, and mean seem like a good choice for representing the data, so I'll take the advice and go with those, since together they portray a good picture of the data distribution.
top_features = ['min1', 'max1', 'mean1',
'min2', 'max2', 'mean2',
'min6', 'max6', 'mean6']
# This will require extracting the labels from each class
# Use list comprehension to go through all_files (a list with file paths)
# And use the folder in the file path to set the label
labels = [fpath.split('/')[2] for fpath in all_files]
# Then create a new column on the dataframe containing the labels
df_features['label'] = labels
# Create a binary indicator to define if the label is for bending (1) or for something else (0)
labels_bending = [1 if 'bending' in label else 0 for label in labels]
df_features['is_bending'] = labels_bending
labels_binary_unique = ['non-bending', 'bending']
# While we are at it, also attach the file path to each sample, in case it becomes necessary down the line
df_features['fpath'] = all_files
df_features.head(3)
# Train-test split
# Create a new df_test, overwriting the old one from (b)
df_test = df_features[df_features['fpath'].isin(test_datasets)]
print(f'Test dataset shape: {df_test.shape}')
# Create a new df_train, overwriting the old one from (b)
df_train = df_features[~df_features['fpath'].isin(test_datasets)]
print(f'Train dataset shape: {df_train.shape}')
# Create a training dataset with only the top features and the is_bending label
df_train_top_f = df_train[top_features].copy()  # .copy() avoids a SettingWithCopyWarning below
df_train_top_f['is_bending'] = df_train['is_bending']
df_train_top_f.head(3)
sns.pairplot(data=df_train_top_f, hue='is_bending')
# We'll have 12 columns, so need to create appropriate headings
table_cols2 = ['min01', 'max01', 'mean01', 'median01', 'std01', 'firstq01', 'thirdq01',
'min02', 'max02', 'mean02', 'median02', 'std02', 'firstq02', 'thirdq02',
'min03', 'max03', 'mean03', 'median03', 'std03', 'firstq03', 'thirdq03',
'min04', 'max04', 'mean04', 'median04', 'std04', 'firstq04', 'thirdq04',
'min05', 'max05', 'mean05', 'median05', 'std05', 'firstq05', 'thirdq05',
'min06', 'max06', 'mean06', 'median06', 'std06', 'firstq06', 'thirdq06',
'min07', 'max07', 'mean07', 'median07', 'std07', 'firstq07', 'thirdq07',
'min08', 'max08', 'mean08', 'median08', 'std08', 'firstq08', 'thirdq08',
'min09', 'max09', 'mean09', 'median09', 'std09', 'firstq09', 'thirdq09',
'min10', 'max10', 'mean10', 'median10', 'std10', 'firstq10', 'thirdq10',
'min11', 'max11', 'mean11', 'median11', 'std11', 'firstq11', 'thirdq11',
'min12', 'max12', 'mean12', 'median12', 'std12', 'firstq12', 'thirdq12']
len(table_cols2)
# Redoing the previous procedure, now with changes to take the inner (tmp) dataframe from (480, 6) to (240, 12)
# Create a master list containing all features for all csv's, then convert it to a dataframe
# The list already follows the order of 'table_cols', listing all attributes of the first feature before going to the second
master_list2 = []
# Loop through all CSVs
for file in all_files:
# Read the CSV
tmp = pd.read_csv(file, skiprows=5, header=None, sep=r'\s+|,')
# Drop the first column with time data
tmp.drop(0, axis=1, inplace=True)
# Take the halfway point to split the dataframe
cutpoint = len(tmp)//2
# Make one dataframe from the first half, and one from the second half
pt1 = tmp[:cutpoint].reset_index(drop=True)
pt2 = tmp[cutpoint:].reset_index(drop=True)
# Put them together as a dataframe with twice as many columns and half as many rows
tmp = pd.concat(objs=[pt1, pt2], axis=1, ignore_index=True)
# Extract the features from each variable in the dataframe
ft_min = pd.Series(tmp.min().values)
ft_max = pd.Series(tmp.max().values)
ft_mean = pd.Series(tmp.mean().values)
ft_median = pd.Series(tmp.median().values)
ft_std = pd.Series(tmp.std().values)
ft_firstq = pd.Series(tmp.quantile(0.25).values)
ft_thirdq = pd.Series(tmp.quantile(0.75).values)
# Put all the features in a dataframe
# Each of the features (the 'ft_' pd.Series from above) will be a row, the columns will be each of the 6 variables in the CSV
df_tmp = pd.DataFrame(data=[ft_min, ft_max, ft_mean, ft_median, ft_std, ft_firstq, ft_thirdq])
# The reason for converting to a dataframe of this shape is so that the .melt() method can be used,
# this allows the sequence to be converted...
# from: the .min() of all variables, followed by the .max() of all variables, then the .mean() of all and so on - this is how it would have been were I to merely append all 'ft_' pd.Series together
# to: all features from variable 1, then all features from variable 2, then from variable 3 and so on - which is the desired format
# The resulting pd.Series is now a 42-item long list ordered by CSV variable, and not by 'ft_' feature - it will represent one row in the final dataframe
df_tmp = df_tmp.melt()
df_tmp = pd.Series(df_tmp['value'])
# Then append the data already in the correct order to the master_list which will be used to make the final dataframe
master_list2.append(df_tmp)
# Now that the loop is over, create the final dataframe from the master_list
df_features2 = pd.DataFrame(data=master_list2)
df_features2.reset_index(inplace=True, drop=True)
df_features2.columns = table_cols2
df_features2.shape
df_features2['label'] = df_features['label']
df_features2['is_bending'] = df_features['is_bending']
df_features2['fpath'] = df_features['fpath']
df_features2
# Train-test split
# Create a new df_test, overwriting the old one from (b)
df_test2 = df_features2[df_features2['fpath'].isin(test_datasets)]
print(f'Test dataset shape: {df_test2.shape}')
# Create a new df_train, overwriting the old one from (b)
df_train2 = df_features2[~df_features2['fpath'].isin(test_datasets)]
print(f'Train dataset shape: {df_train2.shape}')
top_features2 = ['min01', 'max01', 'mean01',
'min02', 'max02', 'mean02',
'min12', 'max12', 'mean12']
# Create a training dataset with only the top features and the is_bending label
df_train_top_f2 = df_train2[top_features2].copy()  # .copy() avoids a SettingWithCopyWarning below
df_train_top_f2['is_bending'] = df_train2['is_bending']
df_train_top_f2
sns.pairplot(data=df_train_top_f2, hue='is_bending')
The pairplots from (d) i and (d) ii are highly similar, with no considerable differences between them.
This is complex (at least for me!), so I'll break it down into parts:
%%time
# Here we go again!
# dictionary to store ALL my dataframes
master_dict = {}
# Set range of l to loop through
l_range = np.arange(1, 21)
# Will be creating one dataframe per l in [1:20]
for l_split in l_range:
# Create a master list containing all features for all csv's, then convert it to a dataframe
# The list already follows the order of 'table_cols', listing all attributes of the first feature before going to the second
master_list2 = []
# Define the cutpoints
cutpoints = np.arange(0, l_split) + 1
# Get the number of chunks, from which the chunk size (truncation) will be obtained
n_chunks = max(cutpoints)
# Loop through all CSVs (done one time for each dataframe to be created)
for file in all_files:
# Read the CSV
tmp = pd.read_csv(file, skiprows=5, header=None, sep=r'\s+|,')
# Drop the first column with time data
tmp.drop(0, axis=1, inplace=True)
# Set the truncation length for splitting the dataframe parts
truncation = len(tmp)//n_chunks
# Temporary dataframe to store all parts from the csv file as they are being appended
inner_df = pd.DataFrame()
# For each cutpoint take a chunk/truncation of the dataframe and add it as columns to inner_df
for cutpt in cutpoints:
# Make a dataframe from the truncation range
pt1 = tmp[(cutpt-1)*truncation:cutpt*truncation].reset_index(drop=True)
# append it to the temporary inner_df (adding columns)
inner_df = pd.concat(objs=[inner_df, pt1], axis=1, ignore_index=True)
# Once all columns have been created
# Extract the features from each variable (column)
ft_min = pd.Series(inner_df.min().values)
ft_max = pd.Series(inner_df.max().values)
ft_mean = pd.Series(inner_df.mean().values)
ft_median = pd.Series(inner_df.median().values)
ft_std = pd.Series(inner_df.std().values)
ft_firstq = pd.Series(inner_df.quantile(0.25).values)
ft_thirdq = pd.Series(inner_df.quantile(0.75).values)
# Put all the features in a dataframe
# Each of the features (the 'ft_' pd.Series from above) will be a row, the columns will be each of the 6 variables in the CSV
df_tmp = pd.DataFrame(data=[ft_min, ft_max, ft_mean, ft_median, ft_std, ft_firstq, ft_thirdq])
# The reason for converting to a dataframe of this shape is so that the .melt() method can be used,
# this allows the sequence to be converted...
# from: the .min() of all variables, followed by the .max() of all variables, then the .mean() of all and so on - this is how it would have been were I to merely append all 'ft_' pd.Series together
# to: all features from variable 1, then all features from variable 2, then from variable 3 and so on - which is the desired format
# The resulting pd.Series is now a (42 * l_split)-item long list ordered by variable, and not by 'ft_' feature - it has all the engineered features for one file and represents one row in the final dataframe
df_tmp = df_tmp.melt()
df_tmp = pd.Series(df_tmp['value'])
# Then append the data already in the correct order to the master_list which will be used to make the final dataframe
master_list2.append(df_tmp)
# Now that the loop is over (for all files for one given l_split), create a dataframe from the master_list and add it to the master_dict
df_features2 = pd.DataFrame(data=master_list2)
df_features2.reset_index(inplace=True, drop=True)
df_features2['is_bending'] = labels_bending
master_dict[l_split] = df_features2
There are seven features: min, max, mean, median, std, firstq, thirdq.
There are six variables: avg_rss12, var_rss12, avg_rss13, var_rss13, avg_rss23, var_rss23.
The base dataframe has one row per CSV file of sample data, resulting in 88 rows (there are 88 CSVs); this never changes across dataframes. The base dataframe has one column for each feature-variable combination (6 x 7), resulting in 42 columns.
For each increase in l, the number of variables increases by 6, which means the number of combinations, and hence the number of columns, increases by 42.
Hence the first dataframe at master_dict[1] will have 42 columns (42 x 01 = 42)
The last dataframe at master_dict[20] will have 840 columns (42 x 20 = 840)
All dataframes have one more column, which is the 'is_bending' label related to the binary classification problem.
print(f'master_dict[1].shape: {master_dict[1].shape}')
print(f'master_dict[20].shape: {master_dict[20].shape}')
# Create arrays of variable length to properly name the dataframes
# Dictionary to store the names for each dataframes
dict_col_names = {}
# The number of variables increases by 6 for each successive dataframe
variable_count = np.arange(6, 121, 6)
# The features remain 7, but multiply the variables
name_list = ['min_', 'max_', 'mean_', 'median_', 'std_', 'firstq_', 'thirdq_']
for idx, var_num in enumerate(variable_count):
idx += 1 # because master_dict starts at 1
number_of_variables = np.arange(1, var_num+1)
list_col_names = []
for number in number_of_variables:
for col_n in name_list:
# The dataframe's columns grow by 6*7 per iteration (number_of_variables x name_list)
tmp_name = f'{col_n}{number}'
list_col_names.append(tmp_name)
list_col_names.append('is_bending')
dict_col_names[idx] = list_col_names
print(f'len(dict_col_names[1]): {len(dict_col_names[1])}')
print(f'len(dict_col_names[20]): {len(dict_col_names[20])}')
# Now rename all dataframes
for idx in range(1, 21):
master_dict[idx].columns = dict_col_names[idx]
# Check result
master_dict[20].head(3)
# Now each dictionary has to be split in a training and testing parts
master_dict_test = {}
master_dict_train = {}
# Loop through each dictionary
for l_split in l_range:
# First append the file paths to each dataframe, so they can be used for indexing
master_dict[l_split]['fpath'] = all_files
# Then make the train/test split based on the list test_datasets
df_test = master_dict[l_split][master_dict[l_split]['fpath'].isin(test_datasets)].copy()
df_train = master_dict[l_split][~master_dict[l_split]['fpath'].isin(test_datasets)].copy()
# Now drop the file paths from the train and test dataframes, as they aren't features to use during training
df_test.drop(['fpath'], axis=1, inplace=True)
df_train.drop(['fpath'], axis=1, inplace=True)
# And store them in the new dictionaries
master_dict_test[l_split] = df_test
master_dict_train[l_split] = df_train
# Check if shapes are correct
print(f'master_dict_test[1].shape: {master_dict_test[1].shape}')
print(f'master_dict_test[20].shape: {master_dict_test[20].shape}')
print(f'master_dict_train[1].shape: {master_dict_train[1].shape}')
print(f'master_dict_train[20].shape: {master_dict_train[20].shape}')
Part #1 is done. All dataframes are prepared for training the Logistic Regression algorithms!
I'll choose to go the route of recursive feature elimination using cross-validation.
To ensure the CV won't be biased, the parameters will be kept the same, and it will be done using only the training dataset, so as to avoid data leakage from the test dataset, which would compromise the results.
The wrong approach to cross-validation is to first use the entire dataset to select the predictor variables and then afterwards run the CV with the variables already selected. This is problematic because the variables will have been selected with the model having access to all samples, effectively not having a validation dataset.
The right approach to cross-validation is to first split the data into K folds and only then select the predictor variables, so that in each CV iteration the selection uses only the training folds and the validation fold stays untouched. That is, the right way is to first split the folds, then select the predictors based on the training folds, which means the selected predictors may vary between CV iterations.
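A compact way to enforce this ordering in scikit-learn is to nest the selector inside a Pipeline, so every CV split refits the selection on its own training folds only — a sketch over hypothetical arrays X and y (not the assignment's variables):
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
# The whole pipeline is cloned and refit per split, so validation folds never influence selection
pipe = Pipeline([('select', RFE(LogisticRegression(max_iter=1000), n_features_to_select=10)),
                 ('clf', LogisticRegression(max_iter=1000))])
scores = cross_val_score(pipe, X, y, cv=StratifiedKFold(5), scoring='accuracy')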
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
# Instance a Recursive Feature Eliminator which uses Cross-Validation
# Set n_jobs to -1 to use all available cores for parallel processing
rfecv = RFECV(estimator = LogisticRegression(penalty='none'),
min_features_to_select=1,
scoring='accuracy',
cv=StratifiedKFold(5),
n_jobs=-1)
%%time
# Notice, this cell takes ~5 minutes
# Dicts to store the features and accuracies from each l-sized dataframe
dict_features = {}
acc_scores = {}
# Loop through all 'l_splits' in [1:20]
for l_split in tqdm(l_range):
x = master_dict_train[l_split].iloc[:, :-1]
y = master_dict_train[l_split].iloc[:, -1]
rfecv.fit(x, y)
boolean_mask = rfecv.support_ # Boolean array indicating which features were chosen
# Filter the column names of the df using 'boolean_mask', so as to keep only the names of the chosen features (columns)
# Those features will be the ones that resulted in the highest accuracy
# (if there's more than one identical accuracy scores, RFECV picks the one with the fewest features)
# And store those features (column names) in a dictionary indexed by 'l_split'
dict_features[l_split] = x.iloc[:, boolean_mask].columns
# Store all accuracies obtained during recursive feature elimination for the given dataframe
acc_scores[l_split] = rfecv.grid_scores_
From RFECV Documentation: https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html
"gridscores : array of shape [n_subsets_of_features]
The cross-validation scores such that grid_scores_[i] corresponds to the CV score of the i-th subset of features."
Restating it: gridscores at position [0] contains the accuracy using only 1 feature, at position [1] containts the accuracy using 2 features. Implicitly, the 1, 2 or however many features it used to get that accuracy is always the combination of features that results in the best accuracy.
This information can be cross-referenced by looking at dict_features for a given l_split (which says which features were chosen for that l_split), and then checking the accuracy in acc_scores[l_split] at the item position matching the number of features shown in dict_features[l_split].
E.g.:
dict_features[3] has: ['max_2', 'min_5', 'max_5', 'thirdq_5', 'thirdq_7', 'max_18'], which totals 6 features.
acc_scores[3] has a list of all accuracies, but when we look at the 6th item in that list (which will be index 5), we find that acc_scores[3][5] = 0.9857142857142858
This way, for each dataframe in l_splits, the chosen features and their related accuracy can be found.
# 'dict_features' contains the features chosen for each value of 'l'
# For example, when l=3, the chosen features were:
dict_features[3]
# When l=3 and n_features=6 the score is: (filtering with [5] because indexing starts at 0)
acc_scores[3][5]
# The dictionary returns the feature list wrapped in a pandas Index() (see above)
# Use a list comprehension to get a clean list of features per 'l'
# Note that unlike the other dictionaries, which are keyed [1:20], this list is indexed [0:19], since lists always start at index 0
list_features_clean = [list(dict_features.values())[i].values for i in range(20)]
# Summarize the number of features chosen per 'l' value
num_features_chosen = {}
for l_split in l_range:
num_features_chosen[l_split] = len(dict_features[l_split])
# Now extract the accuracy score obtained from using the chosen features
best_acc_scores = {}
for k,v in num_features_chosen.items():
# k marks the 'l' in 'l_range'
# v is the number of features which produced the best score for a given k (i.e. for a given l_split, i.e. for a given dataframe)
# Since v starts at 1 but list indexing starts at 0, the position with the best accuracy will be v-1
# This means that, e.g., with 1 feature the related accuracy is found at position 0
# 'best_acc_scores' will contain the accuracy score associated with using the best v features
best_acc_scores[k] = acc_scores[k][v-1]
# Summarizing it all in a dataframe
summary_df = pd.DataFrame()
summary_df['l_split'] = num_features_chosen.keys()
summary_df['accuracy'] = best_acc_scores.values()
summary_df['num_features_chosen'] = num_features_chosen.values()
summary_df['list_features_chosen'] = list_features_clean
summary_df
I'll choose to use the model with the best training accuracy, and since multiple models had a training accuracy of 1, I'll select the one with the fewest engineered features, which happens to be when l = 1.
The dataset associated with this model can be found in master_dict_train[1].
# Getting the l_split associated with the best accuracy
selected_split_idx = np.argmax(summary_df['accuracy'])
selected_split_num = summary_df['l_split'][selected_split_idx]
selected_split_num
# Showing the model to be used
summary_df[summary_df.l_split==selected_split_num]
# The best set of features for this dataset is:
selected_features = [i for i in summary_df[summary_df.l_split==selected_split_num]['list_features_chosen'].values[0]]
selected_features
# Getting the dataframe associated with the chosen l_split
selected_df_train = master_dict_train[selected_split_num]
# Logistic regression for the selected dataset with the selected features
model_lr = LogisticRegression(penalty='none')
# Split x and y
x = selected_df_train[selected_features]
y = selected_df_train.iloc[:, -1]
# Train
model_lr.fit(x, y)
# Predict (also on train dataset)
pred_binary = model_lr.predict(x)
pred_probs = model_lr.predict_proba(x)
pred_logp = model_lr.predict_log_proba(x)
# One-hot encode y
true_onehot = [[1,0] if i==0 else [0,1] for i in y]
# ROC-AUC
roc_auc = roc_auc_score(true_onehot, pred_probs, average='micro')
print(f'The train dataset accuracy is: {model_lr.score(x, y)}')
print(f'The ROC AUC is {roc_auc}')
classes = ['non-bending', 'bending']
plot_classification_results(true_onehot, pred_probs, labels_binary_unique)
plt.suptitle('Logistic Regression Binary Classification | Train Data', y=1.05, fontsize=20, fontweight='bold')
plt.show()
# To get the regression betas and p-values, we need to use the logistic regression from statsmodels,
# as scikit-learn doesn't implement those retrievals
import statsmodels.api as sm
# Train the statsmodels Logit regression
model_sm = sm.Logit(endog=y, exog=sm.add_constant(x))
# Fit the model
model_sm_fitted = model_sm.fit_regularized()
# Get the parameters names
logit_names = model_sm_fitted.params.index
# Get the params coefficients
logit_params = model_sm_fitted.params.values
# Get the p-values
logit_pvals_ = model_sm_fitted.pvalues.values
# With perfect classification, statsmodels returns np.NaN for p-values that are effectively 0; convert those NaNs to 0
logit_pvalues = np.zeros(len(logit_pvals_))
for idx, p in enumerate(logit_pvals_):
if p > 0:
logit_pvalues[idx] = p
else:
logit_pvalues[idx] = 0
# Make a dataframe summarizing the Logistic Regression parameters
summary_logit = pd.DataFrame()
summary_logit['parameter'] = logit_names
summary_logit['coefficient'] = logit_params
summary_logit['p-value'] = logit_pvalues
summary_logit
The constant (beta0) has a p-value of 1, indicating that it is not significant.
All other parameters (betas) have a p-value small enough that it gets rounded to 0.
When I built all the l-split train datasets, I also built the test datasets, so now they only need to be retrieved.
They are stored in master_dict_test, indexed by [l_split].
print(f'Model was trained on master_dict_train[{selected_split_num}], of shape {master_dict_train[selected_split_num].shape}')
print()
print(f'Test set dataframe is master_dict_test[{selected_split_num}], of shape {master_dict_test[selected_split_num].shape}')
# Select the test dataset
selected_df_test = master_dict_test[selected_split_num]
# For the test dataset:
# Split x and y
x = selected_df_test[selected_features]
y = selected_df_test.iloc[:, -1]
# Predict on the test dataset
pred_binary = model_lr.predict(x)
pred_probs = model_lr.predict_proba(x)
pred_logp = model_lr.predict_log_proba(x)
# One-hot encode y
true_onehot = [[1,0] if i==0 else [0,1] for i in y]
# ROC-AUC
roc_auc = roc_auc_score(true_onehot, pred_probs, average='micro')
print(f'The test dataset accuracy is: {model_lr.score(x, y)}')
print(f'The ROC AUC is {roc_auc}')
classes = ['non-bending', 'bending']
plot_classification_results(true_onehot, pred_probs, labels_binary_unique)
plt.suptitle('Logistic Regression Binary Classification | Test Data', y=1.05, fontsize=20, fontweight='bold')
plt.show()
The accuracy for both the cross-validated train dataset and the test dataset was the same, with both achieving 100% classification accuracy!
Equally, both train and test achieved a ROC-AUC of 1.0.
Yes. The perfect classification accuracy makes it evident that the classes are well separated, which causes instability in estimating the logistic regression parameters.
Yes, the classes are imbalanced. In the training dataset there are 60 samples for non-bending (class 0) and only 9 for bending (class 1); in the test dataset there are 15 samples for non-bending (class 0) and only 4 for bending (class 1).
from sklearn.utils import resample
# Split the selected_df_train into sub-dataframes for each class, so that case-control sampling can be used
selected_df_train_1 = selected_df_train[selected_df_train.is_bending == 1]
selected_df_train_0 = selected_df_train[selected_df_train.is_bending == 0]
# Resample the dataframe for class 1 (bending) to get 60 samples
selected_df_train_1_bootstrapped = resample(selected_df_train_1, replace=True, n_samples=60, random_state=1)
# Make a new train dataset, now with balanced classes
selected_df_train_balanced = pd.concat([selected_df_train_1_bootstrapped, selected_df_train_0])
selected_df_train_balanced.shape
# For the balanced dataset:
# Split x and y
x = selected_df_train_balanced[selected_features]
y = selected_df_train_balanced.iloc[:, -1]
# Train
model_lr.fit(x, y)
# Predict on the train dataset
pred_binary = model_lr.predict(x)
pred_probs = model_lr.predict_proba(x)
pred_logp = model_lr.predict_log_proba(x)
# One-hot encode y
true_onehot = [[1,0] if i==0 else [0,1] for i in y]
# ROC-AUC
roc_auc = roc_auc_score(true_onehot, pred_probs, average='micro')
print(f'The balanced dataset accuracy is: {model_lr.score(x, y)}')
print(f'The ROC AUC is {roc_auc}')
plot_classification_results(true_onehot, pred_probs, labels_binary_unique)
plt.suptitle('Logistic Regression Binary Classification | Case-Controlled Train Data', y=1.05, fontsize=20, fontweight='bold')
plt.show()
from sklearn.linear_model import LogisticRegressionCV
# Instance the Logistic Regression with 5 CVs, L1 Penalty and 'liblinear'
model_logit_l1 = LogisticRegressionCV(cv=StratifiedKFold(5),
penalty="l1",
solver="liblinear",
scoring='accuracy',
n_jobs=-1,
random_state=1)
%%time
# Dicts to store the features and accuracies from each l-sized dataframe
dict_features_l1_list = {}
dict_features_l1_num = {}
l1_C_weights = {}
l1_lambda_weights = {}
l1_accuracies = {}
# Loop through all 'l_splits' in [1:20]
for l_split in tqdm(l_range):
# Split X and Y
x = master_dict_train[l_split].iloc[:, :-1]
y = master_dict_train[l_split].iloc[:, -1]
# Train
model_logit_l1.fit(x, y)
# Convert the coefficient weights to a boolean list that merely indicates whether each coefficient was used
coefs_bool = [coef != 0 for coef in model_logit_l1.coef_[0]]
# Add an extra False at the end, to account for the label column
coefs_bool.append(False)
# Use the boolean list to get the features selected by the model
dict_features_l1_list[l_split] = [i for i in master_dict_train[l_split].columns[coefs_bool]]
# Append the count of chosen features to dict_features_l1_num
dict_features_l1_num[l_split] = len(dict_features_l1_list[l_split])
# Get the optimal value of C found by the model, putting it on the L1_C_weights dictionary
# C = 1/λ
l1_C_weights[l_split] = model_logit_l1.C_[0]
# "model_logit_l1.Cs_" is the array with all C values tested on cross-valition, but no need to extract it here
# And invert C to get the actual value for the penalty (λ) parameter
l1_lambda_weights[l_split] = 1/l1_C_weights[l_split]
# Get the model accuracy
l1_accuracies[l_split] = model_logit_l1.score(x, y)
Notes on the C_ parameter
From scikit-learn documentation: https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html
C_ : ndarray of shape (n_classes,) or (n_classes - 1,)
Array of C that maps to the best scores across every class. If refit is set to False, then for each class, the best C is the average of the C’s that correspond to the best scores for each fold. C_ is of shape(n_classes,) when the problem is binary.
Each of the values in Cs describes the inverse of regularization strength. Like in support vector machines, smaller values specify stronger regularization.
That is: C = 1/λ
From Kaggle: https://www.kaggle.com/joparga3/2-tuning-parameters-for-logistic-regression
Lambda (λ) controls the trade-off between allowing the model to increase its complexity as much as it wants and trying to keep it simple. For example, if λ is very low or 0, the model has enough power to increase its complexity (overfit) by assigning big values to the weights of each parameter. If, on the other hand, we increase the value of λ, the model will tend to underfit, as it becomes too simple.
Parameter C works the other way around: for small values of C, we increase the regularization strength, which creates simple models that underfit the data. For big values of C, we lower the regularization strength, which implies the model is allowed to increase its complexity and therefore overfit the data.
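Concretely, the objective liblinear minimizes for L1-penalized logistic regression (in scikit-learn's parameterization, with labels $y_i \in \{-1, +1\}$) is:

$$\min_{w,\,b}\;\lVert w \rVert_1 + C \sum_{i=1}^{n} \log\left(1 + e^{-y_i (w^\top x_i + b)}\right)$$

so a larger C gives the data-fit term more weight (weaker regularization), consistent with C = 1/λ above.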
# Put the results into a dataframe for visualization
summary_l1 = pd.DataFrame()
summary_l1['l_split'] = l_range
summary_l1['accuracy'] = l1_accuracies.values()
summary_l1['error_rate'] = 1 - summary_l1['accuracy']
summary_l1['l1_C_weights'] = l1_C_weights.values()
summary_l1['l1_lambda_weights'] = l1_lambda_weights.values()
summary_l1['num_features_chosen'] = dict_features_l1_num.values()
summary_l1['list_features_chosen'] = dict_features_l1_list.values()
summary_l1
Interestingly, using L1 penalization allowed all l_splits to achieve 100% accuracy on the training dataset.
# Recalling the results table from using Recursive Feature Elimination
summary_df
L1-penalized variable selection works better than Recursive Feature Elimination (my chosen approach for 1(d)iii): the L1-penalized models achieved 100% training accuracy for all l ∈ {1,2,...,20} splits, which RFE wasn't able to do.
L1-penalized variable selection is also easier to implement, requiring only an extra argument to the LogisticRegression constructor, and it computes faster. Unlike p-value-based variable selection, this approach does not require refitting the model.
# A list containing multiclass labels was already created previously, under the variable ``labels``
# It was generated from the files path
# First, the labels need to be transformed so as to merge 'bending1' and 'bending2' into a single 'bending' label
labels_new = ['bending' if 'bending' in label else label for label in labels]
labels_multiclass_unique = np.unique(labels_new)
len(labels_new)
labels_multiclass_unique
# Now retrofit the master_dict to add multiclass labels, then split it into training and testing dictionaries
multiclass_dict_test = {}
multiclass_dict_train = {}
# First add the multiclass labels to each dataframe in master_dict
for l_split in l_range:
master_dict[l_split]['labels_multiclass'] = labels_new
# Then make the train/test split based on the list test_datasets
df_test = master_dict[l_split][master_dict[l_split]['fpath'].isin(test_datasets)].copy()
df_train = master_dict[l_split][~master_dict[l_split]['fpath'].isin(test_datasets)].copy()
# Now drop the unused tag columns from train/test DFs, as they aren't features to use during training
df_test.drop(['fpath', 'is_bending'], axis=1, inplace=True)
df_train.drop(['fpath', 'is_bending'], axis=1, inplace=True)
# And store them in the new dictionaries
multiclass_dict_test[l_split] = df_test
multiclass_dict_train[l_split] = df_train
# Check if shapes are correct
print(f'multiclass_dict_test[1].shape: {multiclass_dict_test[1].shape}')
print(f'multiclass_dict_test[20].shape: {multiclass_dict_test[20].shape}')
print(f'multiclass_dict_train[1].shape: {multiclass_dict_train[1].shape}')
print(f'multiclass_dict_train[20].shape: {multiclass_dict_train[20].shape}')
from sklearn.multiclass import OneVsRestClassifier
# Use OneVsRest to create one Logistic regression per class
ovr_classifier = OneVsRestClassifier(LogisticRegressionCV(cv=StratifiedKFold(5),
penalty="l1",
solver="liblinear",
scoring='accuracy',
n_jobs=-1,
random_state=1))
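One-vs-rest trains one binary classifier per class, so once the fit further below has run, ovr_classifier holds one LogisticRegressionCV (and hence one optimal C_) per class — a quick check sketch:
# After fitting: one binary sub-model per activity class (6 here)
print(len(ovr_classifier.estimators_))
print([est.C_[0] for est in ovr_classifier.estimators_])  # per-class optimal C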
# One Hot Encoder for y
from sklearn.preprocessing import OneHotEncoder
OHE = OneHotEncoder(sparse=False)
OHE.fit(np.array(labels_new).reshape(-1, 1))
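A quick round-trip check of the encoder (the two label strings are real classes present in labels_new):
demo = OHE.transform(np.array(['bending', 'walking']).reshape(-1, 1))
print(demo)                                   # one-hot rows; columns follow sorted class order
print(OHE.inverse_transform(demo).flatten())  # ['bending' 'walking']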
%%time
# Dicts to store the features and accuracies from each l-sized dataframe
dict_features_l1_list = {}
dict_features_l1_num = {}
l1_C_weights = {}
l1_lambda_weights = {}
l1_accuracies = {}
# Loop through all 'l_splits' in [1:20]
for l_split in tqdm(l_range):
# Split X and Y
x = multiclass_dict_train[l_split].iloc[:, :-1]
y = multiclass_dict_train[l_split].iloc[:, -1]
# One-Hot-Encode Y
y = OHE.transform(y.values.reshape(-1, 1))
# Train
ovr_classifier.fit(x, y)
# Convert the coefficient weights to a boolean list that merely indicates whether each coefficient was used
coefs_bool = [coef != 0 for coef in ovr_classifier.coef_[0]]
# Add an extra False at the end, to account for the label column
coefs_bool.append(False)
# Use the boolean list to get the features selected by the model
dict_features_l1_list[l_split] = [i for i in multiclass_dict_train[l_split].columns[coefs_bool]]
# Append the count of chosen features to dict_features_l1_num
dict_features_l1_num[l_split] = len(dict_features_l1_list[l_split])
# There is one C_ for each class estimator in ovr_classifier.estimators_
# Calculating the average C_ for reporting
avg_c = 0
for estimator in range(len(ovr_classifier.estimators_)):
avg_c += ovr_classifier.estimators_[estimator].C_
avg_c = avg_c/len(ovr_classifier.estimators_)
# Get the optimal value of C found by the model, putting it on the L1_C_weights dictionary
# C = 1/λ
l1_C_weights[l_split] = avg_c[0]
# "model_logit_l1.Cs_" is the array with all C values tested on cross-valition, but no need to extract it here
# And invert C to get the actual value for the penalty (λ) parameter
l1_lambda_weights[l_split] = 1/l1_C_weights[l_split]
# Get the model accuracy
l1_accuracies[l_split] = ovr_classifier.score(x, y)
# Put the results into a dataframe for visualization
summary_multiclass = pd.DataFrame()
summary_multiclass['l_split'] = l_range
summary_multiclass['accuracy'] = l1_accuracies.values()
summary_multiclass['error_rate'] = 1 - summary_multiclass['accuracy']
summary_multiclass['average_C_weights'] = l1_C_weights.values()
summary_multiclass['average_lambda_weights'] = l1_lambda_weights.values()
summary_multiclass['num_features_chosen'] = dict_features_l1_num.values()
summary_multiclass['list_features_chosen'] = dict_features_l1_list.values()
summary_multiclass
# Since multiple models achieved an error_rate of 0, I'll choose the l_split with the fewest engineered features that had an error of 0
idx_best_l = np.argmin(summary_multiclass['error_rate'])
multiclass_chosen_l = summary_multiclass.iloc[idx_best_l]['l_split']
print(f'Will use an l of {multiclass_chosen_l}')
# Now predict with a model using the multiclass_chosen_l
# Split X and Y
x = multiclass_dict_train[multiclass_chosen_l].iloc[:, :-1]
y = multiclass_dict_train[multiclass_chosen_l].iloc[:, -1]
# One-Hot-Encode Y
y = OHE.transform(y.values.reshape(-1, 1))
# Train
ovr_classifier.fit(x, y)
# Predict (also on train dataset)
pred_onehot = ovr_classifier.predict(x)
pred_probs = ovr_classifier.predict_proba(x)
# Set a more obvious name for y
true_onehot = y.copy()
# And finally, retrieve the decoded labels
true_labels = OHE.inverse_transform(true_onehot).flatten()
pred_labels = OHE.inverse_transform(pred_onehot).flatten()
unique_labels = np.unique(true_labels)
# And get train error
train_acc = ovr_classifier.score(x, y)
train_error = 1 - train_acc
print(f'Train error: {train_error}')
plot_classification_results(true_onehot, pred_probs, classes=labels_multiclass_unique)
plt.suptitle('Logistic Regression Multi-Class Classification | Train Data', y=1.05, fontsize=20, fontweight='bold')
plt.show()
Now repeating everything for the test dataset
# Now predict on the test dataset
# Split X and Y
x = multiclass_dict_test[multiclass_chosen_l].iloc[:, :-1]
y = multiclass_dict_test[multiclass_chosen_l].iloc[:, -1]
# One-Hot-Encode Y
y = OHE.transform(y.values.reshape(-1, 1))
# Predict
pred_onehot = ovr_classifier.predict(x)
pred_probs = ovr_classifier.predict_proba(x)
# Set a more obvious name for y
true_onehot = y.copy()
# And finally, retrieve the decoded labels
true_labels = OHE.inverse_transform(true_onehot).flatten()
pred_labels = OHE.inverse_transform(pred_onehot).flatten()
unique_labels = np.unique(true_labels)
# And get test error
test_acc = ovr_classifier.score(x, y)
test_error = 1 - test_acc
print(f'Test error: {test_error}')
plot_classification_results(true_onehot, pred_probs, classes=labels_multiclass_unique)
plt.suptitle('Logistic Regression Multi-Class Classification | Test Data', y=1.05, fontsize=20, fontweight='bold')
plt.show()
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
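For context, Gaussian Naive Bayes assumes each feature is class-conditionally Gaussian, with a mean and variance estimated per class and per feature:

$$P(x_i \mid y) = \frac{1}{\sqrt{2\pi\sigma_{y,i}^2}}\,\exp\!\left(-\frac{(x_i - \mu_{y,i})^2}{2\sigma_{y,i}^2}\right)$$

and classification picks the class maximizing the posterior $P(y)\prod_i P(x_i \mid y)$.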
%%time
# Instantiate the model
model_GNB = GaussianNB()
# Dicts to store the features and accuracies from each l-sized dataframe
GNB_accuracies_train = {}
GNB_accuracies_test = {}
# Loop through all 'l_splits' in [1:20]
for l_split in l_range:
# Split X and Y
x = multiclass_dict_train[l_split].iloc[:, :-1]
y = multiclass_dict_train[l_split].iloc[:, -1]
# Cross-Validate and get training error
GNB_score = cross_val_score(model_GNB, x, y, scoring='accuracy', cv=StratifiedKFold(5), n_jobs=-1)
# Get the train accuracy
GNB_accuracies_train[l_split] = GNB_score.mean()
# To get the test error, we first need to fit the model on the training data,
# since cross_val_score fits internal clones and leaves this instance itself unfitted
model_GNB.fit(x, y)
# Change x and y to the test dataset
x = multiclass_dict_test[l_split].iloc[:, :-1]
y = multiclass_dict_test[l_split].iloc[:, -1]
# Get the test accuracy
GNB_accuracies_test[l_split] = model_GNB.score(x, y)
summary_GNB = pd.DataFrame()
summary_GNB['l_split'] = l_range
summary_GNB['accuracy_train_CV'] = GNB_accuracies_train.values()
summary_GNB['error_rate_train_CV'] = 1 - summary_GNB['accuracy_train_CV']
summary_GNB['accuracy_test'] = GNB_accuracies_test.values()
summary_GNB['error_rate_test'] = 1 - summary_GNB['accuracy_test']
summary_GNB
# Pick the l-split with lowest train error (because using the test dataset would be cheating)
GNB_chosen_l_idx = np.argmin(summary_GNB['error_rate_train_CV'])
GNB_chosen_l = int(summary_GNB.iloc[GNB_chosen_l_idx]['l_split'])
GNB_chosen_l
# With the chosen l, train the model and make predictions to plot ROC-AUC and Confusion Matrix
# Split X and Y
x = multiclass_dict_train[GNB_chosen_l].iloc[:, :-1]
y = multiclass_dict_train[GNB_chosen_l].iloc[:, -1]
# Train
model_GNB.fit(x, y)
# Change x and y to the test dataset
x = multiclass_dict_test[GNB_chosen_l].iloc[:, :-1]
y = multiclass_dict_test[GNB_chosen_l].iloc[:, -1]
# Get the test accuracy
pred_probs = model_GNB.predict_proba(x)
pred_labels = model_GNB.predict(x)
# Convert y to the necessary formats to plot
true_onehot = OHE.transform(y.values.reshape(-1, 1))
true_labels = y.copy()
unique_labels = np.unique(true_labels)
plot_classification_results(true_onehot, pred_probs, classes=labels_multiclass_unique)
plt.suptitle('Gaussian Naive Bayes Multi-Class Classification | Test Data', y=1.05, fontsize=20, fontweight='bold')
plt.show()
print(f'The selected Gaussian Naïve Bayes model during training achieved a test error rate of: {summary_GNB.iloc[GNB_chosen_l_idx]["error_rate_test"]}')
It's important to notice that, when testing all possible l_splits, the lowest test error was actually achieved by models which did not have the lowest training error. That is why the chosen model, selected on training data alone, did not have the best performance on testing data.
Multinomial
from sklearn.naive_bayes import MultinomialNB
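Multinomial Naive Bayes is designed for non-negative, count-like features (which the RSS summary statistics satisfy, as signal strengths are non-negative); its class-conditional likelihood for a feature vector x is

$$P(x \mid y) \propto \prod_{i} \theta_{y,i}^{\,x_i}, \qquad \theta_{y,i} \ge 0,\; \sum_i \theta_{y,i} = 1$$

with the θ estimated from (smoothed) per-class feature totals.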
%%time
# Instantiate the model
model_MNB = MultinomialNB()
# Dicts to store the features and accuracies from each l-sized dataframe
MNB_accuracies_train = {}
MNB_accuracies_test = {}
# Loop through all 'l_splits' in [1:20]
for l_split in l_range:
# Split X and Y
x = multiclass_dict_train[l_split].iloc[:, :-1]
y = multiclass_dict_train[l_split].iloc[:, -1]
# Cross-Validate
MNB_score = cross_val_score(model_MNB, x, y, scoring='accuracy', cv=StratifiedKFold(5), n_jobs=-1).mean()
# Get the train accuracy
MNB_accuracies_train[l_split] = MNB_score
# To get the test error, we first need to fit the model on the training data,
# since cross_val_score fits internal clones and leaves this instance itself unfitted
model_MNB.fit(x, y)
# Change x and y to the test dataset
x = multiclass_dict_test[l_split].iloc[:, :-1]
y = multiclass_dict_test[l_split].iloc[:, -1]
# Get the test accuracy
MNB_accuracies_test[l_split] = model_MNB.score(x, y)
summary_MNB = pd.DataFrame()
summary_MNB['l_split'] = l_range
summary_MNB['accuracy_train_CV'] = MNB_accuracies_train.values()
summary_MNB['error_rate_train_CV'] = 1 - summary_MNB['accuracy_train_CV']
summary_MNB['accuracy_test'] = MNB_accuracies_test.values()
summary_MNB['error_rate_test'] = 1 - summary_MNB['accuracy_test']
summary_MNB
# Pick the l-split with lowest train error (because using the test dataset would be cheating)
MNB_chosen_l_idx = np.argmin(summary_MNB['error_rate_train_CV'])
MNB_chosen_l = int(summary_MNB.iloc[MNB_chosen_l_idx]['l_split'])
MNB_chosen_l
# With the chosen l, train the model and make predictions to plot ROC-AUC and Confusion Matrix
# Split X and Y
x = multiclass_dict_train[MNB_chosen_l].iloc[:, :-1]
y = multiclass_dict_train[MNB_chosen_l].iloc[:, -1]
# Train
model_MNB.fit(x, y)
# Change x and y to the test dataset
x = multiclass_dict_test[MNB_chosen_l].iloc[:, :-1]
y = multiclass_dict_test[MNB_chosen_l].iloc[:, -1]
# Get the test accuracy
pred_probs = model_MNB.predict_proba(x)
pred_labels = model_MNB.predict(x)
# Convert y to the necessary formats to plot
true_onehot = OHE.transform(y.values.reshape(-1, 1))
true_labels = y.copy()
unique_labels = np.unique(true_labels)
plot_classification_results(true_onehot, pred_probs, classes=labels_multiclass_unique)
plt.suptitle('Multinomial Naive Bayes Multi-Class Classification | Test Data', y=1.05, fontsize=20, fontweight='bold')
plt.show()
print(f'The selected Multinomial Naïve Bayes model during training achieved a test error rate of: {summary_MNB.iloc[MNB_chosen_l_idx]["error_rate_test"]}')
It's important to notice that, when testing all possible l_splits, the lowest test error was actually achieved by models which did not have the lowest training error. That is why the chosen model, selected on training data alone, did not have the best performance on testing data.
Both models performed similarly, achieving the same error rate on the test dataset. Yet, when considering the ROC curves and AUC, which account for the assigned probabilities, the Multinomial Naïve Bayes model performed slightly better.