Matheus Schmitz
LinkedIn
Github Portfolio
Source:
http://archive.ics.uci.edu/ml/datasets/Spambase
Creators:
Mark Hopkins, Erik Reeber, George Forman, Jaap Suermondt
Hewlett-Packard Labs, 1501 Page Mill Rd., Palo Alto, CA 94304
Donor:
George Forman (gforman at nospam hpl.hp.com) 650-857-7835
# Py Data Stack
import numpy as np
import pandas as pd
# Visualization
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Classifiers
import tpot
from tpot import TPOTClassifier
import sklearn
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# Sklearn Auxiliary Functions
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
# SMOTE
import imblearn
from imblearn.over_sampling import SMOTE
# Package versions
%reload_ext watermark
%watermark -v -iv
Python implementation: CPython
Python version       : 3.7.9
IPython version      : 7.20.0

tpot      : 0.11.7
sklearn   : 0.24.1
seaborn   : 0.11.1
pandas    : 1.2.2
numpy     : 1.20.1
matplotlib: 3.3.4
imblearn  : 0.7.0
col_names = ['word_freq_make', 'word_freq_address', 'word_freq_all', 'word_freq_3d', 'word_freq_our', 'word_freq_over',
'word_freq_remove', 'word_freq_internet', 'word_freq_order', 'word_freq_mail', 'word_freq_receive', 'word_freq_will',
'word_freq_people', 'word_freq_report', 'word_freq_addresses', 'word_freq_free', 'word_freq_business', 'word_freq_email',
'word_freq_you', 'word_freq_credit', 'word_freq_your', 'word_freq_font', 'word_freq_000', 'word_freq_money', 'word_freq_hp',
'word_freq_hpl', 'word_freq_george', 'word_freq_650', 'word_freq_lab', 'word_freq_labs', 'word_freq_telnet', 'word_freq_857',
'word_freq_data', 'word_freq_415', 'word_freq_85', 'word_freq_technology', 'word_freq_1999', 'word_freq_parts', 'word_freq_pm',
'word_freq_direct', 'word_freq_cs', 'word_freq_meeting', 'word_freq_original', 'word_freq_project', 'word_freq_re', 'word_freq_edu',
'word_freq_table', 'word_freq_conference', 'char_freq_;', 'char_freq_(', 'char_freq_[', 'char_freq_!', 'char_freq_$',
'char_freq_#', 'capital_run_length_average', 'capital_run_length_longest', 'capital_run_length_total', 'LABEL']
len(col_names)
58
# Load the dataset and define the column names
df = pd.read_csv('spambase.data', header=None, names=col_names)
print(f'df.shape: {df.shape}')
df.head()
df.shape: (4601, 58)
word_freq_make | word_freq_address | word_freq_all | word_freq_3d | word_freq_our | word_freq_over | word_freq_remove | word_freq_internet | word_freq_order | word_freq_mail | ... | char_freq_; | char_freq_( | char_freq_[ | char_freq_! | char_freq_$ | char_freq_# | capital_run_length_average | capital_run_length_longest | capital_run_length_total | LABEL | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.00 | 0.64 | 0.64 | 0.0 | 0.32 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ... | 0.00 | 0.000 | 0.0 | 0.778 | 0.000 | 0.000 | 3.756 | 61 | 278 | 1 |
1 | 0.21 | 0.28 | 0.50 | 0.0 | 0.14 | 0.28 | 0.21 | 0.07 | 0.00 | 0.94 | ... | 0.00 | 0.132 | 0.0 | 0.372 | 0.180 | 0.048 | 5.114 | 101 | 1028 | 1 |
2 | 0.06 | 0.00 | 0.71 | 0.0 | 1.23 | 0.19 | 0.19 | 0.12 | 0.64 | 0.25 | ... | 0.01 | 0.143 | 0.0 | 0.276 | 0.184 | 0.010 | 9.821 | 485 | 2259 | 1 |
3 | 0.00 | 0.00 | 0.00 | 0.0 | 0.63 | 0.00 | 0.31 | 0.63 | 0.31 | 0.63 | ... | 0.00 | 0.137 | 0.0 | 0.137 | 0.000 | 0.000 | 3.537 | 40 | 191 | 1 |
4 | 0.00 | 0.00 | 0.00 | 0.0 | 0.63 | 0.00 | 0.31 | 0.63 | 0.31 | 0.63 | ... | 0.00 | 0.135 | 0.0 | 0.135 | 0.000 | 0.000 | 3.537 | 40 | 191 | 1 |
5 rows × 58 columns
# Check if there are any missing values and if all data types were properly loaded
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4601 entries, 0 to 4600
Data columns (total 58 columns):
 #   Column                      Non-Null Count  Dtype
---  ------                      --------------  -----
 0   word_freq_make              4601 non-null   float64
 1   word_freq_address           4601 non-null   float64
 2   word_freq_all               4601 non-null   float64
 3   word_freq_3d                4601 non-null   float64
 4   word_freq_our               4601 non-null   float64
 5   word_freq_over              4601 non-null   float64
 6   word_freq_remove            4601 non-null   float64
 7   word_freq_internet          4601 non-null   float64
 8   word_freq_order             4601 non-null   float64
 9   word_freq_mail              4601 non-null   float64
 10  word_freq_receive           4601 non-null   float64
 11  word_freq_will              4601 non-null   float64
 12  word_freq_people            4601 non-null   float64
 13  word_freq_report            4601 non-null   float64
 14  word_freq_addresses         4601 non-null   float64
 15  word_freq_free              4601 non-null   float64
 16  word_freq_business          4601 non-null   float64
 17  word_freq_email             4601 non-null   float64
 18  word_freq_you               4601 non-null   float64
 19  word_freq_credit            4601 non-null   float64
 20  word_freq_your              4601 non-null   float64
 21  word_freq_font              4601 non-null   float64
 22  word_freq_000               4601 non-null   float64
 23  word_freq_money             4601 non-null   float64
 24  word_freq_hp                4601 non-null   float64
 25  word_freq_hpl               4601 non-null   float64
 26  word_freq_george            4601 non-null   float64
 27  word_freq_650               4601 non-null   float64
 28  word_freq_lab               4601 non-null   float64
 29  word_freq_labs              4601 non-null   float64
 30  word_freq_telnet            4601 non-null   float64
 31  word_freq_857               4601 non-null   float64
 32  word_freq_data              4601 non-null   float64
 33  word_freq_415               4601 non-null   float64
 34  word_freq_85                4601 non-null   float64
 35  word_freq_technology        4601 non-null   float64
 36  word_freq_1999              4601 non-null   float64
 37  word_freq_parts             4601 non-null   float64
 38  word_freq_pm                4601 non-null   float64
 39  word_freq_direct            4601 non-null   float64
 40  word_freq_cs                4601 non-null   float64
 41  word_freq_meeting           4601 non-null   float64
 42  word_freq_original          4601 non-null   float64
 43  word_freq_project           4601 non-null   float64
 44  word_freq_re                4601 non-null   float64
 45  word_freq_edu               4601 non-null   float64
 46  word_freq_table             4601 non-null   float64
 47  word_freq_conference        4601 non-null   float64
 48  char_freq_;                 4601 non-null   float64
 49  char_freq_(                 4601 non-null   float64
 50  char_freq_[                 4601 non-null   float64
 51  char_freq_!                 4601 non-null   float64
 52  char_freq_$                 4601 non-null   float64
 53  char_freq_#                 4601 non-null   float64
 54  capital_run_length_average  4601 non-null   float64
 55  capital_run_length_longest  4601 non-null   int64
 56  capital_run_length_total    4601 non-null   int64
 57  LABEL                       4601 non-null   int64
dtypes: float64(55), int64(3)
memory usage: 2.0 MB
# Split train and test data using class stratification
df_train, df_test = train_test_split(df, test_size=0.2, stratify=df.iloc[:,-1])
print(f'df_train.shape: {df_train.shape}')
print(f'df_test.shape: {df_test.shape}')
df_train.shape: (3680, 58)
df_test.shape: (921, 58)
From here on I'll only look at the training data, so that the testing data remains untouched.
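As a quick sanity check (illustrative, not part of the original pipeline), the stratified split can be verified by comparing the spam ratio in the two splits:
# Sanity check: stratification should keep the spam ratio (LABEL == 1)
# nearly identical in the train and test splits
print(df_train['LABEL'].value_counts(normalize=True))
print(df_test['LABEL'].value_counts(normalize=True))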
# Check the distribution of the variables
df_train.describe()
word_freq_make | word_freq_address | word_freq_all | word_freq_3d | word_freq_our | word_freq_over | word_freq_remove | word_freq_internet | word_freq_order | word_freq_mail | ... | char_freq_; | char_freq_( | char_freq_[ | char_freq_! | char_freq_$ | char_freq_# | capital_run_length_average | capital_run_length_longest | capital_run_length_total | LABEL | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 3680.000000 | 3680.000000 | 3680.000000 | 3680.000000 | 3680.000000 | 3680.000000 | 3680.000000 | 3680.000000 | 3680.000000 | 3680.000000 | ... | 3680.000000 | 3680.000000 | 3680.000000 | 3680.000000 | 3680.000000 | 3680.000000 | 3680.000000 | 3680.000000 | 3680.000000 | 3680.000000 |
mean | 0.104326 | 0.224799 | 0.279769 | 0.054228 | 0.310087 | 0.091864 | 0.113130 | 0.105326 | 0.089731 | 0.244329 | ... | 0.038564 | 0.135266 | 0.016735 | 0.272490 | 0.077965 | 0.043748 | 5.249336 | 49.709511 | 280.558967 | 0.394022 |
std | 0.304140 | 1.338092 | 0.508302 | 1.221217 | 0.656361 | 0.253049 | 0.381496 | 0.407075 | 0.280806 | 0.662715 | ... | 0.248559 | 0.218538 | 0.114857 | 0.864163 | 0.250333 | 0.454713 | 33.133562 | 128.257333 | 555.548311 | 0.488706 |
min | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 |
25% | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.593500 | 6.000000 | 35.000000 | 0.000000 |
50% | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.065000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2.275000 | 15.000000 | 95.000000 | 0.000000 |
75% | 0.000000 | 0.000000 | 0.420000 | 0.000000 | 0.390000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.170000 | ... | 0.000000 | 0.186000 | 0.000000 | 0.325000 | 0.054250 | 0.000000 | 3.734000 | 44.000000 | 266.250000 | 1.000000 |
max | 4.540000 | 14.280000 | 5.100000 | 42.810000 | 8.330000 | 3.570000 | 7.270000 | 11.110000 | 5.260000 | 18.180000 | ... | 4.385000 | 4.271000 | 4.081000 | 32.478000 | 6.003000 | 19.829000 | 1102.500000 | 2204.000000 | 9163.000000 | 1.000000 |
8 rows × 58 columns
# Check the distribution of variables using a boxplot
fig, ax = plt.subplots(figsize=(16,9))
df_train.boxplot()
plt.show()
The skewness in this boxplot indicates the need for feature scaling, which is handled by a scaling step (StandardScaler) in the model training pipelines below.
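As a quick, illustrative way to quantify that skewness (not part of the original analysis), the per-feature skewness of the training data can be printed directly:
# Illustrative check: most features are heavily right-skewed,
# which motivates the scaling step added to every model pipeline below
print(df_train.iloc[:, :-1].skew().sort_values(ascending=False).head(10))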
# Correlation Matrix
fig, ax = plt.subplots(figsize=(40,40))
g = sns.heatmap(df_train.corr(), vmin=-1, vmax=1, cmap='coolwarm_r', square=True)
plt.show()
I don't see strong evidence of correlation among the variables, which would be the main motivation for PCA, SVD, or another dimensionality reduction method, so I'll skip those and instead consider a classifier with some dimensionality reduction built in, such as Linear Discriminant Analysis.
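To put a number on "not much strong correlation", here is a minimal sketch (illustrative only) that lists the strongest absolute pairwise correlations in the training features:
# Illustrative: strongest absolute pairwise correlations
# (upper triangle only, so each feature pair appears once)
corr_abs = df_train.iloc[:, :-1].corr().abs()
upper = corr_abs.where(np.triu(np.ones(corr_abs.shape, dtype=bool), k=1))
print(upper.unstack().dropna().sort_values(ascending=False).head(10))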
# Coefficient of Variation (the variables with the largest std to mean ratio)
var_mean = df_train.mean(axis=0)
var_std = df_train.std(axis=0)
CV = var_std/var_mean
CV.head()
word_freq_make        2.915281
word_freq_address     5.952394
word_freq_all         1.816863
word_freq_3d         22.519930
word_freq_our         2.116698
dtype: float64
# Identify the features with the highest CV
num_picks = int(np.floor(np.sqrt(df.shape[1])))
print(f'Number of features chosen: {num_picks}')
chosen_features = CV.sort_values(ascending=False).head(num_picks).index
chosen_features = [i for i in chosen_features]
print(f'List of features chosen: {chosen_features}')
Number of features chosen: 7
List of features chosen: ['word_freq_3d', 'word_freq_parts', 'word_freq_table', 'char_freq_#', 'word_freq_conference', 'word_freq_cs', 'word_freq_font']
# Scatterplots using target variable as hue
g = sns.pairplot(data=df_train, hue='LABEL', vars=chosen_features, palette='coolwarm')
plt.show()
Some of these features show a nearly linearly separable boundary while others don't, indicating the need to test algorithms with different assumptions about the data.
One thing is clear though: there are many more blue dots than red dots, so SMOTE could be of use here.
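Before committing to SMOTE, a quick illustrative look at the label balance confirms the mild (roughly 60/40) imbalance and shows what SMOTE does to it (x_bal and y_bal are throwaway names, not used elsewhere):
# Illustrative: class balance before and after SMOTE on the training data
print(df_train['LABEL'].value_counts(normalize=True))
x_bal, y_bal = SMOTE().fit_resample(df_train.iloc[:, :-1], df_train.iloc[:, -1])
print(pd.Series(y_bal).value_counts(normalize=True))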
# Boxplots for the highest CV features
fig = plt.figure(figsize=(16,9))
g = sns.boxplot(data=df_train[chosen_features], orient='h')
plt.show()
Those boxplots are further evidence of the need to scale the variables.
Given the skewness seen in the boxplots, I'll include a standardization step (StandardScaler) in the pipeline of every classifier to be tested.
Additionally, before making a call on which classifier to choose as the final one, I'll first optimize all classifiers using a GridSearch approach.
# Pipeline to standardize then run the classifier
svc = Pipeline([("standardize", StandardScaler()),
("svc", SVC(kernel="rbf", decision_function_shape='ovr'))])
# Grid with parameters to be tested via CV
svc_param_grid_ = {'svc__C': np.logspace(-3, 3, 5),
'svc__gamma': np.logspace(-3, 3, 5)}
# Instantiate GridSearchCV using accuracy as the scorer
svc_gridCV = GridSearchCV(svc, svc_param_grid_, cv=5, n_jobs=-1, scoring='accuracy')
# Pipeline to standardize then run the classifier
rfc = Pipeline([("standardize", StandardScaler()),
("rfc", RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=-1))])
# Grid with parameters to be tested via CV
rfc_param_grid_ = {'rfc__min_samples_split': [2,3],
'rfc__min_samples_leaf': [1,2,3]}
# Instantiate GridSearchCV using accuracy as the scorer
rfc_gridCV = GridSearchCV(rfc, rfc_param_grid_, cv=5, n_jobs=-1, scoring='accuracy')
knn = Pipeline([("standardize", StandardScaler()),
("knn", KNeighborsClassifier(metric='minkowski', leaf_size=30, weights='distance', n_jobs=-1))])
# Grid with parameters to be tested via CV
knn_param_grid_ = {'knn__n_neighbors': [3,5,7,9]}
# Instantiate GridSearchCV using accuracy as the scorer
knn_gridCV = GridSearchCV(knn, knn_param_grid_, cv=5, n_jobs=-1, scoring='accuracy')
gpc = Pipeline([("standardize", StandardScaler()),
("gpc", GaussianProcessClassifier(optimizer='fmin_l_bfgs_b', max_iter_predict=100, n_jobs=-1))])
# Grid with parameters to be tested via CV
gpc_param_grid_ = {'gpc__n_restarts_optimizer': [0,5,10]}
# Instantiate GridSearchCV using accuracy as the scorer
gpc_gridCV = GridSearchCV(gpc, gpc_param_grid_, cv=5, n_jobs=-1, scoring='accuracy')
lda = Pipeline([("standardize", StandardScaler()),
("lda", LinearDiscriminantAnalysis(solver='svd'))])
# Grid with parameters to be tested via CV
lda_param_grid_ = {'lda__tol': [1.0e-2, 1.0e-4, 1.0e-6]}
# Instantiate GridSearchCV using accuracy as the scorer
lda_gridCV = GridSearchCV(lda, lda_param_grid_, cv=5, n_jobs=-1, scoring='accuracy')
%%time
# Given the mild imbalance observed in the dataset I'll be adding a SMOTE step to the training data of each K-Fold
smote = SMOTE(n_jobs=-1)
# K-Fold + GridSearch to find the best model hyperparameters
kf = KFold(n_splits=5, shuffle=True)
kfold_intermediate_results = pd.DataFrame()
for fold_num, (idx_train, idx_valid) in enumerate(kf.split(df_train), 1):
# Print current label and fold
print(f'Working on Fold: {fold_num}')
# Apply SMOTE only to the training folds, leaving the validation fold untouched
x_train, y_train = smote.fit_sample(df_train.iloc[idx_train,:-1], df_train.iloc[idx_train,-1])
x_valid = df_train.iloc[idx_valid,:-1]
y_valid = df_train.iloc[idx_valid,-1]
# Fit using grid search to find the best params
svc_gridCV.fit(x_train, y_train)
rfc_gridCV.fit(x_train, y_train)
knn_gridCV.fit(x_train, y_train)
gpc_gridCV.fit(x_train, y_train)
lda_gridCV.fit(x_train, y_train)
# Predict on the train and validation folds to calculate metrics
pred_svc_train = svc_gridCV.predict(x_train)
pred_svc_valid = svc_gridCV.predict(x_valid)
pred_rfc_train = rfc_gridCV.predict(x_train)
pred_rfc_valid = rfc_gridCV.predict(x_valid)
pred_knn_train = knn_gridCV.predict(x_train)
pred_knn_valid = knn_gridCV.predict(x_valid)
pred_gpc_train = gpc_gridCV.predict(x_train)
pred_gpc_valid = gpc_gridCV.predict(x_valid)
pred_lda_train = lda_gridCV.predict(x_train)
pred_lda_valid = lda_gridCV.predict(x_valid)
# Store best params of each classifier for each fold
kfold_intermediate_results.at['SVC_C', f'{fold_num}'] = svc_gridCV.best_params_['svc__C']
kfold_intermediate_results.at['SVC_gamma', f'{fold_num}'] = svc_gridCV.best_params_['svc__gamma']
kfold_intermediate_results.at['RFC_split', f'{fold_num}'] = rfc_gridCV.best_params_['rfc__min_samples_split']
kfold_intermediate_results.at['RFC_leaf', f'{fold_num}'] = rfc_gridCV.best_params_['rfc__min_samples_leaf']
kfold_intermediate_results.at['KNN_N', f'{fold_num}'] = knn_gridCV.best_params_['knn__n_neighbors']
kfold_intermediate_results.at['GPC_restarts', f'{fold_num}'] = gpc_gridCV.best_params_['gpc__n_restarts_optimizer']
kfold_intermediate_results.at['LDA_tol', f'{fold_num}'] = lda_gridCV.best_params_['lda__tol']
# Store errors for each model in each fold
kfold_intermediate_results.at['SVC_train', f'{fold_num}'] = 1-accuracy_score(y_true=y_train, y_pred=pred_svc_train)
kfold_intermediate_results.at['SVC_valid', f'{fold_num}'] = 1-accuracy_score(y_true=y_valid, y_pred=pred_svc_valid)
kfold_intermediate_results.at['RFC_train', f'{fold_num}'] = 1-accuracy_score(y_true=y_train, y_pred=pred_rfc_train)
kfold_intermediate_results.at['RFC_valid', f'{fold_num}'] = 1-accuracy_score(y_true=y_valid, y_pred=pred_rfc_valid)
kfold_intermediate_results.at['KNN_train', f'{fold_num}'] = 1-accuracy_score(y_true=y_train, y_pred=pred_knn_train)
kfold_intermediate_results.at['KNN_valid', f'{fold_num}'] = 1-accuracy_score(y_true=y_valid, y_pred=pred_knn_valid)
kfold_intermediate_results.at['GPC_train', f'{fold_num}'] = 1-accuracy_score(y_true=y_train, y_pred=pred_gpc_train)
kfold_intermediate_results.at['GPC_valid', f'{fold_num}'] = 1-accuracy_score(y_true=y_valid, y_pred=pred_gpc_valid)
kfold_intermediate_results.at['LDA_train', f'{fold_num}'] = 1-accuracy_score(y_true=y_train, y_pred=pred_lda_train)
kfold_intermediate_results.at['LDA_valid', f'{fold_num}'] = 1-accuracy_score(y_true=y_valid, y_pred=pred_lda_valid)
# After running all K-Folds get average results for each classifier
kfold_intermediate_results['mean'] = kfold_intermediate_results.mean(axis=1)
print()
print('--- GridSearch Best Parameters ---')
print(f'Mean SVC C Parameter: {kfold_intermediate_results["mean"]["SVC_C"]}')
print(f'Mean SVC gamma Parameter: {kfold_intermediate_results["mean"]["SVC_gamma"]}')
print(f'Mean RFC min_samples_split Parameter: {kfold_intermediate_results["mean"]["RFC_split"]}')
print(f'Mean RFC min_samples_leaf Parameter: {kfold_intermediate_results["mean"]["RFC_leaf"]}')
print(f'Mean KNN N Parameter: {kfold_intermediate_results["mean"]["KNN_N"]}')
print(f'Mean GPC n_restarts_optimizer Parameter: {kfold_intermediate_results["mean"]["GPC_restarts"]}')
print(f'Mean LDA tol Parameter: {kfold_intermediate_results["mean"]["LDA_tol"]}')
print()
print('--- K-Fold Cross-Validation Results ---')
print(f'SVC Error | Training : {kfold_intermediate_results["mean"]["SVC_train"]}')
print(f'RFC Error | Training : {kfold_intermediate_results["mean"]["RFC_train"]}')
print(f'KNN Error | Training : {kfold_intermediate_results["mean"]["KNN_train"]}')
print(f'GPC Error | Training : {kfold_intermediate_results["mean"]["GPC_train"]}')
print(f'LDA Error | Training : {kfold_intermediate_results["mean"]["LDA_train"]}')
print()
print(f'SVC Error | Validation : {kfold_intermediate_results["mean"]["SVC_valid"]}')
print(f'RFC Error | Validation : {kfold_intermediate_results["mean"]["RFC_valid"]}')
print(f'KNN Error | Validation : {kfold_intermediate_results["mean"]["KNN_valid"]}')
print(f'GPC Error | Validation : {kfold_intermediate_results["mean"]["GPC_valid"]}')
print(f'LDA Error | Validation : {kfold_intermediate_results["mean"]["LDA_valid"]}')
print()
Working on Fold: 1
Working on Fold: 2
Working on Fold: 3
Working on Fold: 4
Working on Fold: 5

--- GridSearch Best Parameters ---
Mean SVC C Parameter: 1000.0
Mean SVC gamma Parameter: 0.001
Mean RFC min_samples_split Parameter: 2.2
Mean RFC min_samples_leaf Parameter: 1.0
Mean KNN N Parameter: 7.8
Mean GPC n_restarts_optimizer Parameter: 0.0
Mean LDA tol Parameter: 0.01

--- K-Fold Cross-Validation Results ---
SVC Error | Training : 0.03290193784820476
RFC Error | Training : 0.0006722528969830943
KNN Error | Training : 0.0005607080001621245
GPC Error | Training : 0.009920530100554359
LDA Error | Training : 0.09024132695878322

SVC Error | Validation : 0.06467391304347825
RFC Error | Validation : 0.04646739130434783
KNN Error | Validation : 0.08478260869565216
GPC Error | Validation : 0.07717391304347827
LDA Error | Validation : 0.09347826086956519

Wall time: 12min 48s
On the validation data the Random Forest Classifier is ahead, but none of the other algorithms is far behind! Let me now use the best hyperparameters from each K-Fold iteration to build a final version of each model and evaluate it on the test dataset, so that I can select the best model.
# Dataframe to store results
summary = pd.DataFrame()
# Create new classifiers using the average of the best parameters in each k-fold
svc_kfold = Pipeline([("standardize", StandardScaler()),
("svc", SVC(kernel="rbf", decision_function_shape='ovr',
C=kfold_intermediate_results.at['SVC_C', 'mean'],
gamma=kfold_intermediate_results.at['SVC_gamma', 'mean']))])
rfc_kfold = Pipeline([("standardize", StandardScaler()),
("rfc", RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=-1,
min_samples_split=round(kfold_intermediate_results.at['RFC_split', 'mean']),
min_samples_leaf=round(kfold_intermediate_results.at['RFC_leaf', 'mean'])))])
knn_kfold = Pipeline([("standardize", StandardScaler()),
("knn", KNeighborsClassifier(metric='minkowski', leaf_size=30, weights='distance', n_jobs=-1,
n_neighbors=round(kfold_intermediate_results.at['KNN_N', 'mean'])))])
gpc_kfold = Pipeline([("standardize", StandardScaler()),
("gpc", GaussianProcessClassifier(optimizer='fmin_l_bfgs_b', max_iter_predict=100, n_jobs=-1,
n_restarts_optimizer=round(kfold_intermediate_results["mean"]["GPC_restarts"])))])
lda_kfold = Pipeline([("standardize", StandardScaler()),
("lda", LinearDiscriminantAnalysis(solver='svd',
tol=kfold_intermediate_results.at['LDA_tol', 'mean']))])
# Get X's and Y's - This time using the full datasets for training and testing
x_train, y_train = smote.fit_sample(df_train.iloc[:,:-1], df_train.iloc[:,-1])
x_test = df_test.iloc[:, :-1].copy()
y_test = df_test.iloc[:, -1].copy()
# Fit using the optimized model created with the mean hyperparameters from K-Fold cross-validation
svc_kfold.fit(x_train, y_train)
rfc_kfold.fit(x_train, y_train)
knn_kfold.fit(x_train, y_train)
gpc_kfold.fit(x_train, y_train)
lda_kfold.fit(x_train, y_train)
# Predict
pred_svc_train = svc_kfold.predict(x_train)
pred_svc_test = svc_kfold.predict(x_test)
pred_rfc_train = rfc_kfold.predict(x_train)
pred_rfc_test = rfc_kfold.predict(x_test)
pred_knn_train = knn_kfold.predict(x_train)
pred_knn_test = knn_kfold.predict(x_test)
pred_gpc_train = gpc_kfold.predict(x_train)
pred_gpc_test = gpc_kfold.predict(x_test)
pred_lda_train = lda_kfold.predict(x_train)
pred_lda_test = lda_kfold.predict(x_test)
# Model Error
summary.at['train', f'SVC'] = 1-accuracy_score(y_true=y_train, y_pred=pred_svc_train)
summary.at['test', f'SVC'] = 1-accuracy_score(y_true=y_test, y_pred=pred_svc_test)
summary.at['train', f'RFC'] = 1-accuracy_score(y_true=y_train, y_pred=pred_rfc_train)
summary.at['test', f'RFC'] = 1-accuracy_score(y_true=y_test, y_pred=pred_rfc_test)
summary.at['train', f'KNN'] = 1-accuracy_score(y_true=y_train, y_pred=pred_knn_train)
summary.at['test', f'KNN'] = 1-accuracy_score(y_true=y_test, y_pred=pred_knn_test)
summary.at['train', f'GPC'] = 1-accuracy_score(y_true=y_train, y_pred=pred_gpc_train)
summary.at['test', f'GPC'] = 1-accuracy_score(y_true=y_test, y_pred=pred_gpc_test)
summary.at['train', f'LDA'] = 1-accuracy_score(y_true=y_train, y_pred=pred_lda_train)
summary.at['test', f'LDA'] = 1-accuracy_score(y_true=y_test, y_pred=pred_lda_test)
# Print model results for optimized classifiers model
print(f'------------------------------ MODEL OVERALL ------------------------------')
print('SVC Error | Training: ', summary.at['train', f'SVC'])
print('RFC Error | Training: ', summary.at['train', f'RFC'])
print('KNN Error | Training: ', summary.at['train', f'KNN'])
print('GPC Error | Training: ', summary.at['train', f'GPC'])
print('LDA Error | Training: ', summary.at['train', f'LDA'])
print()
print('SVC Error | Testing: ', summary.at['test', f'SVC'])
print('RFC Error | Testing: ', summary.at['test', f'RFC'])
print('KNN Error | Testing: ', summary.at['test', f'KNN'])
print('GPC Error | Testing: ', summary.at['test', f'GPC'])
print('LDA Error | Testing: ', summary.at['test', f'LDA'])
------------------------------ MODEL OVERALL ------------------------------
SVC Error | Training:  0.03385650224215242
RFC Error | Training:  0.0006726457399103269
KNN Error | Training:  0.0006726457399103269
GPC Error | Training:  0.010313901345291532
LDA Error | Training:  0.010313901345291532

SVC Error | Testing:  0.06514657980456029
RFC Error | Testing:  0.057546145494028256
KNN Error | Testing:  0.07709011943539634
GPC Error | Testing:  0.07600434310532034
LDA Error | Testing:  0.07600434310532034
# View results in the dataframe
summary
SVC | RFC | KNN | GPC | LDA | |
---|---|---|---|---|---|
train | 0.033857 | 0.000673 | 0.000673 | 0.010314 | 0.010314 |
test | 0.065147 | 0.057546 | 0.077090 | 0.076004 | 0.076004 |
Among the optimized models there were several ties on the training set, but on the testing set the Random Forest Classifier is the clear winner!
But before I move ahead to calculating the performance metrics for our winning algorithm, let's introduce one final challenger: TPOT's AutoML.
Attempting to improve on the performance of the best classifier by using TPOT's AutoML tool to search for an ideal classifier pipeline.
TPOT AutoML Documentation: http://epistasislab.github.io/tpot/
# Split X and Y
x_train = df_train.iloc[:,:-1]
y_train= df_train.iloc[:,-1]
x_test = df_test.iloc[:,:-1]
y_test= df_test.iloc[:,-1]
#Instantiate and run the AutoML classifier
AutoML = TPOTClassifier(generations=5, population_size=20, cv=5, verbosity=2, n_jobs=-1)
AutoML.fit(x_train, y_train)
Generation 1 - Current best internal CV score: 0.9461956521739131
Generation 2 - Current best internal CV score: 0.9461956521739131
Generation 3 - Current best internal CV score: 0.9470108695652174
Generation 4 - Current best internal CV score: 0.9470108695652174
Generation 5 - Current best internal CV score: 0.9475543478260869

Best pipeline: XGBClassifier(BernoulliNB(input_matrix, alpha=1.0, fit_prior=False), learning_rate=0.1, max_depth=9, min_child_weight=12, n_estimators=100, n_jobs=1, subsample=0.6000000000000001, verbosity=0)
TPOTClassifier(generations=5, n_jobs=-1, population_size=20, verbosity=2)
# And the best model is:
AutoML.fitted_pipeline_.steps[-1][1]
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1, importance_type='gain', interaction_constraints='', learning_rate=0.1, max_delta_step=0, max_depth=9, min_child_weight=12, missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=1, num_parallel_tree=1, random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.6000000000000001, tree_method='exact', validate_parameters=1, verbosity=0)
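As a side note, TPOT can also export the winning pipeline as a standalone script for later reuse (the file name below is arbitrary):
# Optional: export the best pipeline found by TPOT as a Python script
AutoML.export('tpot_spambase_pipeline.py')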
# Check performance
print("AutoML Error | Training: ", 1-AutoML.score(x_train, y_train))
print("AutoML Error | Testing: ", 1-AutoML.score(x_test, y_test))
AutoML Error | Training:  0.03505434782608696
AutoML Error | Testing:  0.061889250814332275
Looks like TPOT proved no match for our Random Forest Classifier, so let's move ahead with extracting performance metrics for our champion.
Now that the best model has been identified, rerun the K-Fold cross-validation without SMOTE, so that more representative FPR, FNR, and error values can be obtained.
# Now let's run a K-Fold cross-validation with the winner model and see how it performs over each fold
kf = KFold(n_splits=5, shuffle=True)
scores_train, scores_valid, scores_test = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
for fold_num, (idx_train, idx_valid) in enumerate(kf.split(df_train), 1):
# Print current label and fold
print(f'Working on Fold: {fold_num}')
# Split the training data into train and validation folds (no SMOTE this time)
x_train = df_train.iloc[idx_train,:-1].copy()
y_train = df_train.iloc[idx_train,-1].copy()
x_valid = df_train.iloc[idx_valid,:-1].copy()
y_valid = df_train.iloc[idx_valid,-1].copy()
x_test = df_test.iloc[:, :-1].copy()
y_test = df_test.iloc[:, -1].copy()
# Fit a Random Forest Classifier using the best parameters
winner_model = Pipeline([("standardize", StandardScaler()),
("rfc", RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=-1,
min_samples_split=round(kfold_intermediate_results.at['RFC_split', 'mean']),
min_samples_leaf=round(kfold_intermediate_results.at['RFC_leaf', 'mean'])))])
winner_model.fit(x_train, y_train)
# Predict on the train, validation, and test sets to calculate metrics for the winning model
pred_train = winner_model.predict(x_train)
pred_valid = winner_model.predict(x_valid)
pred_test = winner_model.predict(x_test)
# Train dataset metrics
scores_train.at[f'{fold_num}', 'Error'] = 1-accuracy_score(y_true=y_train, y_pred=pred_train)
cm_train = confusion_matrix(y_train, pred_train)
TN, FP, FN, TP = cm_train.ravel()
scores_train.at[f'{fold_num}', 'FPR'] = FP/(FP+TN)
scores_train.at[f'{fold_num}', 'FNR'] = FN/(TP+FN)
# Valid dataset metrics
scores_valid.at[f'{fold_num}', 'Error'] = 1-accuracy_score(y_true=y_valid, y_pred=pred_valid)
cm_valid = confusion_matrix(y_valid, pred_valid)
TN, FP, FN, TP = cm_valid.ravel()
scores_valid.at[f'{fold_num}', 'FPR'] = FP/(FP+TN)
scores_valid.at[f'{fold_num}', 'FNR'] = FN/(TP+FN)
# Test dataset metrics
scores_test.at[f'{fold_num}', 'Error'] = 1-accuracy_score(y_true=y_test, y_pred=pred_test)
cm_test = confusion_matrix(y_test, pred_test)
TN, FP, FN, TP = cm_test.ravel()
scores_test.at[f'{fold_num}', 'FPR'] = FP/(FP+TN)
scores_test.at[f'{fold_num}', 'FNR'] = FN/(TP+FN)
# Get mean scores over all K-folds
scores_train.at['mean', 'Error'] = scores_train['Error'].mean()
scores_train.at['mean', 'FPR'] = scores_train['FPR'].mean()
scores_train.at['mean', 'FNR'] = scores_train['FNR'].mean()
scores_valid.at['mean', 'Error'] = scores_valid['Error'].mean()
scores_valid.at['mean', 'FPR'] = scores_valid['FPR'].mean()
scores_valid.at['mean', 'FNR'] = scores_valid['FNR'].mean()
scores_test.at['mean', 'Error'] = scores_test['Error'].mean()
scores_test.at['mean', 'FPR'] = scores_test['FPR'].mean()
scores_test.at['mean', 'FNR'] = scores_test['FNR'].mean()
Working on Fold: 1
Working on Fold: 2
Working on Fold: 3
Working on Fold: 4
Working on Fold: 5
scores_train
Error | FPR | FNR | |
---|---|---|---|
1 | 0.000679 | 0.000562 | 0.000859 |
2 | 0.001019 | 0.001115 | 0.000870 |
3 | 0.000679 | 0.000558 | 0.000867 |
4 | 0.000340 | 0.000564 | 0.000000 |
5 | 0.000340 | 0.000561 | 0.000000 |
mean | 0.000611 | 0.000672 | 0.000519 |
scores_valid
Error | FPR | FNR | |
---|---|---|---|
1 | 0.052989 | 0.031111 | 0.087413 |
2 | 0.043478 | 0.011468 | 0.090000 |
3 | 0.061141 | 0.052392 | 0.074074 |
4 | 0.042120 | 0.030702 | 0.060714 |
5 | 0.040761 | 0.031180 | 0.055749 |
mean | 0.048098 | 0.031371 | 0.073590 |
scores_test
Error | FPR | FNR | |
---|---|---|---|
1 | 0.055375 | 0.034050 | 0.088154 |
2 | 0.057546 | 0.028674 | 0.101928 |
3 | 0.058632 | 0.034050 | 0.096419 |
4 | 0.054289 | 0.030466 | 0.090909 |
5 | 0.056460 | 0.030466 | 0.096419 |
mean | 0.056460 | 0.031541 | 0.094766 |
Note: it is possible to obtain all the metrics (error, FPR, FNR) while comparing all optimized models.
I've chosen to first compare all models (2.3) and only later (3.1) re-train the winning model and calculate the metrics for it alone, as I believe this approach makes the step-by-step of the pipeline clearer.
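For reference, here is a minimal sketch of how error, FPR and FNR could be computed for any of the fitted models in a single helper (the function name is illustrative and not used elsewhere in this notebook):
def classification_errors(model, x, y):
    # Return (error, FPR, FNR) for a fitted binary classifier
    preds = model.predict(x)
    tn, fp, fn, tp = confusion_matrix(y, preds).ravel()
    return 1 - accuracy_score(y, preds), fp / (fp + tn), fn / (fn + tp)
# Example usage with one of the models trained above:
# print(classification_errors(rfc_kfold, x_test, y_test))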
# Function to plot confusion matrix
def show_confusion_matrix(confusion_matrix):
hmap = sns.heatmap(confusion_matrix, annot = True, fmt = "d", cmap = "Blues", square=True)
hmap.yaxis.set_ticklabels(hmap.yaxis.get_ticklabels(), rotation = 0, ha = "right")
hmap.xaxis.set_ticklabels(hmap.xaxis.get_ticklabels(), rotation = 0, ha = "right")
plt.title('Confusion Matrix', pad=20, fontweight='bold', fontsize=15)
plt.ylabel('True Class')
plt.xlabel('Predicted Class')
# Function to plot ROC-AUC Curve
def show_roc_curve(roc_curve):
auc_score = auc(roc_curve[0], roc_curve[1])
plt.axis('square')
plt.plot(roc_curve[0], roc_curve[1], color='tomato', lw=3, label='AUC: ' + str(auc_score))
plt.plot([0, 1], [0, 1], 'k--', lw=1.5)
plt.xlim([-0.01, 1.01])
plt.ylim([-0.01, 1.01])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve', pad=20, fontweight='bold', fontsize=15)
plt.legend(loc="lower right")
# Use the trained model to predict on the full train and test datasets (now without SMOTE)
x_train, y_train = df_train.iloc[:, :-1], df_train.iloc[:, -1]
pred_train = rfc_kfold.predict(x_train)
pred_test = rfc_kfold.predict(x_test)
# Confusion Matrix
cm_train = confusion_matrix(y_true=y_train, y_pred=pred_train)
cm_test = confusion_matrix(y_true=y_test, y_pred=pred_test)
# ROC Curve
roc_train = roc_curve(y_true=y_train, y_score=pred_train)
roc_test = roc_curve(y_true=y_test, y_score=pred_test)
# Create Figure
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(10,8))
# Plot Train ROC Curve
plt.subplot(2,2,1)
show_roc_curve(roc_train)
plt.title('ROC Curve | Train Data', fontsize=15, fontweight='bold', pad=15)
# Plot Train Confusion Matrix
plt.subplot(2,2,2)
show_confusion_matrix(cm_train)
plt.title('Confusion Matrix | Train Data', fontsize=15, fontweight='bold', pad=15)
# Plot Test ROC Curve
plt.subplot(2,2,3)
show_roc_curve(roc_test)
plt.title('ROC Curve | Test Data', fontsize=15, fontweight='bold', pad=15)
# Plot Test Confusion Matrix
plt.subplot(2,2,4)
show_confusion_matrix(cm_test)
plt.title('Confusion Matrix | Test Data', fontsize=15, fontweight='bold', pad=15)
fig.tight_layout(h_pad=4, w_pad=-10)
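One caveat: the ROC curves above are built from hard class predictions, which yields only a few points per curve. A smoother curve could be obtained from predicted probabilities; a minimal sketch, using the rfc_kfold pipeline trained above:
# Illustrative: ROC from predicted probabilities instead of hard labels
proba_test = rfc_kfold.predict_proba(x_test)[:, 1]
fpr, tpr, _ = roc_curve(y_true=y_test, y_score=proba_test)
print(f'Probability-based test AUC: {auc(fpr, tpr):.3f}')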
Matheus Schmitz
LinkedIn
Github Portfolio