# Py Data Stack
import numpy as np
import pandas as pd


# Visualization
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


# Classifiers
import tpot
from tpot import TPOTClassifier
import sklearn
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


# Sklearn Auxilarty Functions
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline


# SMOTE
import imblearn
from imblearn.over_sampling import SMOTE


# Package versions
%reload_ext watermark
%watermark -v -iv

Python implementation: CPython
Python version       : 3.7.9
IPython version      : 7.20.0

tpot      : 0.11.7
sklearn   : 0.24.1
seaborn   : 0.11.1
pandas    : 1.2.2
numpy     : 1.20.1
matplotlib: 3.3.4
imblearn  : 0.7.0


col_names = ['word_freq_make', 'word_freq_address', 'word_freq_all', 'word_freq_3d', 'word_freq_our', 'word_freq_over',
             'word_freq_remove', 'word_freq_internet', 'word_freq_order', 'word_freq_mail', 'word_freq_receive', 'word_freq_will',
             'word_freq_people', 'word_freq_report', 'word_freq_addresses', 'word_freq_free', 'word_freq_business', 'word_freq_email',
             'word_freq_you', 'word_freq_credit', 'word_freq_your', 'word_freq_font', 'word_freq_000', 'word_freq_money', 'word_freq_hp',
             'word_freq_hpl', 'word_freq_george', 'word_freq_650', 'word_freq_lab', 'word_freq_labs', 'word_freq_telnet', 'word_freq_857',
             'word_freq_data', 'word_freq_415', 'word_freq_85', 'word_freq_technology', 'word_freq_1999', 'word_freq_parts', 'word_freq_pm',
             'word_freq_direct', 'word_freq_cs', 'word_freq_meeting', 'word_freq_original', 'word_freq_project', 'word_freq_re', 'word_freq_edu',
             'word_freq_table', 'word_freq_conference', 'char_freq_;',  'char_freq_(', 'char_freq_[', 'char_freq_!', 'char_freq_$',
             'char_freq_#', 'capital_run_length_average', 'capital_run_length_longest', 'capital_run_length_total', 'LABEL']
len(col_names)

58


# Load the dataset and define the column names
df = pd.read_csv('spambase.data', header=None, names=col_names)
print(f'df.shape: {df.shape}')
df.head()

df.shape: (4601, 58)


# Check if there are any missing values and if all data types were properly loaded
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4601 entries, 0 to 4600
Data columns (total 58 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   word_freq_make              4601 non-null   float64
 1   word_freq_address           4601 non-null   float64
 2   word_freq_all               4601 non-null   float64
 3   word_freq_3d                4601 non-null   float64
 4   word_freq_our               4601 non-null   float64
 5   word_freq_over              4601 non-null   float64
 6   word_freq_remove            4601 non-null   float64
 7   word_freq_internet          4601 non-null   float64
 8   word_freq_order             4601 non-null   float64
 9   word_freq_mail              4601 non-null   float64
 10  word_freq_receive           4601 non-null   float64
 11  word_freq_will              4601 non-null   float64
 12  word_freq_people            4601 non-null   float64
 13  word_freq_report            4601 non-null   float64
 14  word_freq_addresses         4601 non-null   float64
 15  word_freq_free              4601 non-null   float64
 16  word_freq_business          4601 non-null   float64
 17  word_freq_email             4601 non-null   float64
 18  word_freq_you               4601 non-null   float64
 19  word_freq_credit            4601 non-null   float64
 20  word_freq_your              4601 non-null   float64
 21  word_freq_font              4601 non-null   float64
 22  word_freq_000               4601 non-null   float64
 23  word_freq_money             4601 non-null   float64
 24  word_freq_hp                4601 non-null   float64
 25  word_freq_hpl               4601 non-null   float64
 26  word_freq_george            4601 non-null   float64
 27  word_freq_650               4601 non-null   float64
 28  word_freq_lab               4601 non-null   float64
 29  word_freq_labs              4601 non-null   float64
 30  word_freq_telnet            4601 non-null   float64
 31  word_freq_857               4601 non-null   float64
 32  word_freq_data              4601 non-null   float64
 33  word_freq_415               4601 non-null   float64
 34  word_freq_85                4601 non-null   float64
 35  word_freq_technology        4601 non-null   float64
 36  word_freq_1999              4601 non-null   float64
 37  word_freq_parts             4601 non-null   float64
 38  word_freq_pm                4601 non-null   float64
 39  word_freq_direct            4601 non-null   float64
 40  word_freq_cs                4601 non-null   float64
 41  word_freq_meeting           4601 non-null   float64
 42  word_freq_original          4601 non-null   float64
 43  word_freq_project           4601 non-null   float64
 44  word_freq_re                4601 non-null   float64
 45  word_freq_edu               4601 non-null   float64
 46  word_freq_table             4601 non-null   float64
 47  word_freq_conference        4601 non-null   float64
 48  char_freq_;                 4601 non-null   float64
 49  char_freq_(                 4601 non-null   float64
 50  char_freq_[                 4601 non-null   float64
 51  char_freq_!                 4601 non-null   float64
 52  char_freq_$                 4601 non-null   float64
 53  char_freq_#                 4601 non-null   float64
 54  capital_run_length_average  4601 non-null   float64
 55  capital_run_length_longest  4601 non-null   int64  
 56  capital_run_length_total    4601 non-null   int64  
 57  LABEL                       4601 non-null   int64  
dtypes: float64(55), int64(3)
memory usage: 2.0 MB


# Split train and set data using class stratification
df_train, df_test= train_test_split(df, test_size=0.2, stratify=df.iloc[:,-1])
print(f'df_train.shape: {df_train.shape}')
print(f'df_test.shape: {df_test.shape}')

df_train.shape: (3680, 58)
df_test.shape: (921, 58)


# Check the distribution of the variables
df_train.describe()


# Check the distribution of variables using a boxplot
fix, ax = plt.subplots(figsize=(16,9))
df_train.boxplot()
plt.show()


# Correlation Matrix
fig, ax = plt.subplots(figsize=(40,40))
g = sns.heatmap(df_train.corr(), vmin=-1, vmax=1, cmap='coolwarm_r', square=True)
plt.show()


# Coefficient of Variation (the variables with the largest std to mean ratio)
var_mean = df_train.mean(axis=0)
var_std = df_train.std(axis=0)
CV = var_std/var_mean
CV.head()

word_freq_make        2.915281
word_freq_address     5.952394
word_freq_all         1.816863
word_freq_3d         22.519930
word_freq_our         2.116698
dtype: float64


# Identify the features with the highest CV
num_picks = int(np.floor(np.sqrt(df.shape[1])))
print(f'Number of features chosen: {num_picks}')
chosen_features = CV.sort_values(ascending=False).head(num_picks).index
chosen_features = [i for i in chosen_features]
print(f'List of features chosen: {chosen_features}')

Number of features chosen: 7
List of features chosen: ['word_freq_3d', 'word_freq_parts', 'word_freq_table', 'char_freq_#', 'word_freq_conference', 'word_freq_cs', 'word_freq_font']


# Scatterplots using target variable as hue
g = sns.pairplot(data=df_train, hue='LABEL', vars=chosen_features, palette='coolwarm')
plt.show()


# Boxplots for the highest CV features
fig = plt.figure(figsize=(16,9))
g = sns.boxplot(data=df_train[chosen_features], orient='h')
plt.show()


# Pipeline to standardize then run the classifier
svc =  Pipeline([("standardize", StandardScaler()),
                 ("svc", SVC(kernel="rbf", decision_function_shape='ovr'))])

# Grid with parameters to be tested via CV
svc_param_grid_ = {'svc__C': np.logspace(-3, 3, 5),
                   'svc__gamma': np.logspace(-3, 3, 5)}

# Instantiate GridSearchCV using accuracy as the scorer
svc_gridCV = GridSearchCV(svc, svc_param_grid_, cv=5, n_jobs=-1, scoring='accuracy')


# Pipeline to standardize then run the classifier
rfc = Pipeline([("standardize", StandardScaler()),
                 ("rfc", RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=-1))]) 

# Grid with parameters to be tested via CV
rfc_param_grid_ = {'rfc__min_samples_split': [2,3],
                   'rfc__min_samples_leaf': [1,2,3]}

# Instantiate GridSearchCV using accuracy as the scorer
rfc_gridCV = GridSearchCV(rfc, rfc_param_grid_, cv=5, n_jobs=-1, scoring='accuracy')


knn = Pipeline([("standardize", StandardScaler()),
                ("knn", KNeighborsClassifier(metric='minkowski', leaf_size=30, weights='distance', n_jobs=-1))]) 

# Grid with parameters to be tested via CV
knn_param_grid_ = {'knn__n_neighbors': [3,5,7,9]}

# Instantiate GridSearchCV using accuracy as the scorer
knn_gridCV = GridSearchCV(knn, knn_param_grid_, cv=5, n_jobs=-1, scoring='accuracy')


gpc = Pipeline([("standardize", StandardScaler()),
                ("gpc", GaussianProcessClassifier(optimizer='fmin_l_bfgs_b', max_iter_predict=100, n_jobs=-1))]) 

# Grid with parameters to be tested via CV
gpc_param_grid_ = {'gpc__n_restarts_optimizer': [0,5,10]}

# Instantiate GridSearchCV using accuracy as the scorer
gpc_gridCV = GridSearchCV(gpc, gpc_param_grid_, cv=5, n_jobs=-1, scoring='accuracy')


lda = Pipeline([("standardize", StandardScaler()),
                ("lda", LinearDiscriminantAnalysis(solver='svd'))]) 

# Grid with parameters to be tested via CV
lda_param_grid_ = {'lda__tol': [1.0e-2, 1.0e-4, 1.0e-6]}

# Instantiate GridSearchCV using accuracy as the scorer
lda_gridCV = GridSearchCV(lda, lda_param_grid_, cv=5, n_jobs=-1, scoring='accuracy')


%%time
# Given the mild imbalance observed in the dataset I'll be adding a SMOTE step to the training data of each K-Fold
smote = SMOTE(n_jobs=-1)

# K-Fold + GridSearch to find the best model hyperparameters
kf = KFold(n_splits=5, shuffle=True)
kfold_intermediate_results = pd.DataFrame()
for fold_num, (idx_train, idx_valid) in enumerate(kf.split(df_train), 1):

    # Print current label and fold
    print(f'Working on Fold: {fold_num}')

    # Select all folds to be smoted except for the validation fold
    x_train, y_train = smote.fit_sample(df_train.iloc[idx_train,:-1], df_train.iloc[idx_train,-1])
    x_valid = df_train.iloc[idx_valid,:-1] 
    y_valid = df_train.iloc[idx_valid,-1] 

    # Fit using grid search to find the best params
    svc_gridCV.fit(x_train, y_train)
    rfc_gridCV.fit(x_train, y_train)
    knn_gridCV.fit(x_train, y_train)
    gpc_gridCV.fit(x_train, y_train)
    lda_gridCV.fit(x_train, y_train)

    # Predict on the train and validation folds to calculate metrics
    pred_svc_train = svc_gridCV.predict(x_train)   
    pred_svc_valid = svc_gridCV.predict(x_valid)
    pred_rfc_train = rfc_gridCV.predict(x_train)   
    pred_rfc_valid = rfc_gridCV.predict(x_valid)   
    pred_knn_train = knn_gridCV.predict(x_train)   
    pred_knn_valid = knn_gridCV.predict(x_valid)
    pred_gpc_train = gpc_gridCV.predict(x_train)   
    pred_gpc_valid = gpc_gridCV.predict(x_valid)
    pred_lda_train = lda_gridCV.predict(x_train)   
    pred_lda_valid = lda_gridCV.predict(x_valid)
    
    # Store best params of each classifier for each fold
    kfold_intermediate_results.at['SVC_C', f'{fold_num}'] = svc_gridCV.best_params_['svc__C']
    kfold_intermediate_results.at['SVC_gamma', f'{fold_num}'] = svc_gridCV.best_params_['svc__gamma']
    kfold_intermediate_results.at['RFC_split', f'{fold_num}'] = rfc_gridCV.best_params_['rfc__min_samples_split']
    kfold_intermediate_results.at['RFC_leaf', f'{fold_num}'] = rfc_gridCV.best_params_['rfc__min_samples_leaf']
    kfold_intermediate_results.at['KNN_N', f'{fold_num}'] = knn_gridCV.best_params_['knn__n_neighbors']
    kfold_intermediate_results.at['GPC_restarts', f'{fold_num}'] = gpc_gridCV.best_params_['gpc__n_restarts_optimizer']
    kfold_intermediate_results.at['LDA_tol', f'{fold_num}'] = lda_gridCV.best_params_['lda__tol']
   
    # Store errors for each model in each fold 
    kfold_intermediate_results.at['SVC_train', f'{fold_num}'] = 1-accuracy_score(y_true=y_train, y_pred=pred_svc_train)
    kfold_intermediate_results.at['SVC_valid', f'{fold_num}'] = 1-accuracy_score(y_true=y_valid, y_pred=pred_svc_valid)
    kfold_intermediate_results.at['RFC_train', f'{fold_num}'] = 1-accuracy_score(y_true=y_train, y_pred=pred_rfc_train)
    kfold_intermediate_results.at['RFC_valid', f'{fold_num}'] = 1-accuracy_score(y_true=y_valid, y_pred=pred_rfc_valid)
    kfold_intermediate_results.at['KNN_train', f'{fold_num}'] = 1-accuracy_score(y_true=y_train, y_pred=pred_knn_train)
    kfold_intermediate_results.at['KNN_valid', f'{fold_num}'] = 1-accuracy_score(y_true=y_valid, y_pred=pred_knn_valid)
    kfold_intermediate_results.at['GPC_train', f'{fold_num}'] = 1-accuracy_score(y_true=y_train, y_pred=pred_gpc_train)
    kfold_intermediate_results.at['GPC_valid', f'{fold_num}'] = 1-accuracy_score(y_true=y_valid, y_pred=pred_gpc_valid)    
    kfold_intermediate_results.at['LDA_train', f'{fold_num}'] = 1-accuracy_score(y_true=y_train, y_pred=pred_lda_train)
    kfold_intermediate_results.at['LDA_valid', f'{fold_num}'] = 1-accuracy_score(y_true=y_valid, y_pred=pred_lda_valid)
    
# After running all K-Folds get average results for each classifier
kfold_intermediate_results['mean'] = kfold_intermediate_results.mean(axis=1)

print()
print('--- GridSearch Best Parameters ---')
print(f'Mean SVC C Parameter: {kfold_intermediate_results["mean"]["SVC_C"]}')
print(f'Mean SVC gamma Parameter: {kfold_intermediate_results["mean"]["SVC_gamma"]}')
print(f'Mean RFC min_samples_split Parameter: {kfold_intermediate_results["mean"]["RFC_split"]}')
print(f'Mean RFC min_samples_leaf Parameter: {kfold_intermediate_results["mean"]["RFC_leaf"]}')
print(f'Mean KNN N Parameter: {kfold_intermediate_results["mean"]["KNN_N"]}')
print(f'Mean GPA n_restarts_optimizer Parameter: {kfold_intermediate_results["mean"]["GPC_restarts"]}')
print(f'Mean LDA tol Parameter: {kfold_intermediate_results["mean"]["LDA_tol"]}')
print()
print('--- K-Fold Cross-Validation Results ---')
print(f'SVC Error | Training : {kfold_intermediate_results["mean"]["SVC_train"]}')
print(f'RFC Error | Training : {kfold_intermediate_results["mean"]["RFC_train"]}')
print(f'KNN Error | Training : {kfold_intermediate_results["mean"]["KNN_train"]}')
print(f'GPC Error | Training : {kfold_intermediate_results["mean"]["GPC_train"]}')
print(f'LDA Error | Training : {kfold_intermediate_results["mean"]["LDA_train"]}')
print()
print(f'SVC Error | Validation : {kfold_intermediate_results["mean"]["SVC_valid"]}')
print(f'RFC Error | Validation : {kfold_intermediate_results["mean"]["RFC_valid"]}')
print(f'KNN Error | Validation : {kfold_intermediate_results["mean"]["KNN_valid"]}')
print(f'GPC Error | Validation : {kfold_intermediate_results["mean"]["GPC_valid"]}')
print(f'LDA Error | Validation : {kfold_intermediate_results["mean"]["LDA_valid"]}')
print()

Working on Fold: 1
Working on Fold: 2
Working on Fold: 3
Working on Fold: 4
Working on Fold: 5

--- GridSearch Best Parameters ---
Mean SVC C Parameter: 1000.0
Mean SVC gamma Parameter: 0.001
Mean RFC min_samples_split Parameter: 2.2
Mean RFC min_samples_leaf Parameter: 1.0
Mean KNN N Parameter: 7.8
Mean GPA n_restarts_optimizer Parameter: 0.0
Mean LDA tol Parameter: 0.01

--- K-Fold Cross-Validation Results ---
SVC Error | Training : 0.03290193784820476
RFC Error | Training : 0.0006722528969830943
KNN Error | Training : 0.0005607080001621245
GPC Error | Training : 0.009920530100554359
LDA Error | Training : 0.09024132695878322

SVC Error | Validation : 0.06467391304347825
RFC Error | Validation : 0.04646739130434783
KNN Error | Validation : 0.08478260869565216
GPC Error | Validation : 0.07717391304347827
LDA Error | Validation : 0.09347826086956519

Wall time: 12min 48s


# Dataframe to store results
summary = pd.DataFrame()

# Create new classifiers using the average of the best parameters in each k-fold
svc_kfold = Pipeline([("standardize", StandardScaler()),
                      ("svc", SVC(kernel="rbf", decision_function_shape='ovr',
                                  C=kfold_intermediate_results.at['SVC_C', 'mean'],
                                  gamma=kfold_intermediate_results.at['SVC_gamma', 'mean']))])

rfc_kfold = Pipeline([("standardize", StandardScaler()),
                      ("rfc", RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=-1,
                                                     min_samples_split=round(kfold_intermediate_results.at['RFC_split', 'mean']),
                                                     min_samples_leaf=round(kfold_intermediate_results.at['RFC_leaf', 'mean'])))])

knn_kfold = Pipeline([("standardize", StandardScaler()),
                      ("knn", KNeighborsClassifier(metric='minkowski', leaf_size=30, weights='distance', n_jobs=-1,
                                                   n_neighbors=round(kfold_intermediate_results.at['KNN_N', 'mean'])))])

gpc_kfold = Pipeline([("standardize", StandardScaler()),
                      ("gpc", GaussianProcessClassifier(optimizer='fmin_l_bfgs_b', max_iter_predict=100, n_jobs=-1,
                                                        n_restarts_optimizer=round(kfold_intermediate_results["mean"]["GPC_restarts"])))])
                                       
lda_kfold = Pipeline([("standardize", StandardScaler()),
                      ("lda", LinearDiscriminantAnalysis(solver='svd',
                                                         tol=kfold_intermediate_results.at['LDA_tol', 'mean']))])


# Get X's and Y's - This time using the full datasets for trainin and testing
x_train, y_train = smote.fit_sample(df_train.iloc[:,:-1], df_train.iloc[:,-1])
x_test = df_test.iloc[:, :-1].copy()
y_test = df_test.iloc[:, -1].copy()

# Fit using the optimized model created with the mean hyperparameters from K-Fold cross-validation
svc_kfold.fit(x_train, y_train)
rfc_kfold.fit(x_train, y_train)
knn_kfold.fit(x_train, y_train)
gpc_kfold.fit(x_train, y_train)
lda_kfold.fit(x_train, y_train)                                     
                                       
# Predict
pred_svc_train = svc_kfold.predict(x_train)
pred_svc_test = svc_kfold.predict(x_test)
pred_rfc_train = rfc_kfold.predict(x_train)
pred_rfc_test = rfc_kfold.predict(x_test)
pred_knn_train = knn_kfold.predict(x_train)
pred_knn_test = knn_kfold.predict(x_test)
pred_gpc_train = gpc_kfold.predict(x_train)
pred_gpc_test = gpc_kfold.predict(x_test)
pred_lda_train = gpc_kfold.predict(x_train)
pred_lda_test = gpc_kfold.predict(x_test)
                                       
# Model Error
summary.at['train', f'SVC'] = 1-accuracy_score(y_true=y_train, y_pred=pred_svc_train)
summary.at['test', f'SVC'] = 1-accuracy_score(y_true=y_test, y_pred=pred_svc_test)
summary.at['train', f'RFC'] = 1-accuracy_score(y_true=y_train, y_pred=pred_rfc_train)
summary.at['test', f'RFC'] = 1-accuracy_score(y_true=y_test, y_pred=pred_rfc_test)
summary.at['train', f'KNN'] = 1-accuracy_score(y_true=y_train, y_pred=pred_knn_train)
summary.at['test', f'KNN'] = 1-accuracy_score(y_true=y_test, y_pred=pred_knn_test)
summary.at['train', f'GPC'] = 1-accuracy_score(y_true=y_train, y_pred=pred_gpc_train)
summary.at['test', f'GPC'] = 1-accuracy_score(y_true=y_test, y_pred=pred_gpc_test)
summary.at['train', f'LDA'] = 1-accuracy_score(y_true=y_train, y_pred=pred_lda_train)
summary.at['test', f'LDA'] = 1-accuracy_score(y_true=y_test, y_pred=pred_lda_test)
                                        
# Print model results for optimized classifiers model
print(f'------------------------------ MODEL OVERALL ------------------------------') 
print('SVC Error | Training: ', summary.at['train', f'SVC'])
print('RFC Error | Training: ', summary.at['train', f'RFC'])                                       
print('KNN Error | Training: ', summary.at['train', f'KNN'])
print('GPC Error | Training: ', summary.at['train', f'GPC'])                                       
print('LDA Error | Training: ', summary.at['train', f'LDA'])                                     
print()                                       
print('SVC Error | Testing: ', summary.at['test', f'SVC'])
print('RFC Error | Testing: ', summary.at['test', f'RFC'])
print('KNN Error | Testing: ', summary.at['test', f'KNN'])
print('GPC Error | Testing: ', summary.at['test', f'GPC'])
print('LDA Error | Testing: ', summary.at['test', f'LDA'])

------------------------------ MODEL OVERALL ------------------------------
SVC Error | Training:  0.03385650224215242
RFC Error | Training:  0.0006726457399103269
KNN Error | Training:  0.0006726457399103269
GPC Error | Training:  0.010313901345291532
LDA Error | Training:  0.010313901345291532

SVC Error | Testing:  0.06514657980456029
RFC Error | Testing:  0.057546145494028256
KNN Error | Testing:  0.07709011943539634
GPC Error | Testing:  0.07600434310532034
LDA Error | Testing:  0.07600434310532034


# View results in the dataframe
summary


# Split X and Y
x_train = df_train.iloc[:,:-1]
y_train= df_train.iloc[:,-1]
x_test = df_test.iloc[:,:-1]
y_test= df_test.iloc[:,-1]


#Instantiate and run the AutoML classifier
AutoML = TPOTClassifier(generations=5, population_size=20, cv=5, verbosity=2, n_jobs=-1)
AutoML.fit(x_train, y_train)

Generation 1 - Current best internal CV score: 0.9461956521739131

Generation 2 - Current best internal CV score: 0.9461956521739131

Generation 3 - Current best internal CV score: 0.9470108695652174

Generation 4 - Current best internal CV score: 0.9470108695652174

Generation 5 - Current best internal CV score: 0.9475543478260869

Best pipeline: XGBClassifier(BernoulliNB(input_matrix, alpha=1.0, fit_prior=False), learning_rate=0.1, max_depth=9, min_child_weight=12, n_estimators=100, n_jobs=1, subsample=0.6000000000000001, verbosity=0)

TPOTClassifier(generations=5, n_jobs=-1, population_size=20, verbosity=2)


# And the best model is:
AutoML.fitted_pipeline_.steps[-1][1]

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.1, max_delta_step=0, max_depth=9,
              min_child_weight=12, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=1, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
              subsample=0.6000000000000001, tree_method='exact',
              validate_parameters=1, verbosity=0)


# Check performance
print("AutoML Error | Training: ", 1-AutoML.score(x_train, y_train))
print("AutoML Error | Testing: ", 1-AutoML.score(x_test, y_test))

AutoML Error | Training:  0.03505434782608696
AutoML Error | Testing:  0.061889250814332275


# Now let's run a K-Fold cross-validation with the winner model and see how it performs over each fold
kf = KFold(n_splits=5, shuffle=True)
scores_train, scores_valid, scores_test = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
for fold_num, (idx_train, idx_valid) in enumerate(kf.split(df_train), 1):
    
    # Print current label and fold
    print(f'Working on Fold: {fold_num}')

    # Select all folds to be smoted except for the validation fold
    x_train = df_train.iloc[idx_train,:-1].copy() 
    y_train = df_train.iloc[idx_train,-1].copy() 
    x_valid = df_train.iloc[idx_valid,:-1].copy() 
    y_valid = df_train.iloc[idx_valid,-1].copy() 
    x_test = df_test.iloc[:, :-1].copy()
    y_test = df_test.iloc[:, -1].copy()
    
    # Fit a Random Forest Classifier using the best parameters
    winner_model = Pipeline([("standardize", StandardScaler()),
                              ("rfc", RandomForestClassifier(n_estimators=100, criterion='gini', n_jobs=-1,
                                      min_samples_split=round(kfold_intermediate_results.at['RFC_split', 'mean']),
                                      min_samples_leaf=round(kfold_intermediate_results.at['RFC_leaf', 'mean'])))])
    
    winner_model.fit(x_train, y_train)
    
    # Predict on the train and validation folds to calculate metrics (with the winnier rfc_kfold model)
    pred_train = winner_model.predict(x_train)   
    pred_valid = winner_model.predict(x_valid)
    pred_test = winner_model.predict(x_test)
    
    # Train dataset metrics
    scores_train.at[f'{fold_num}', 'Error'] = 1-accuracy_score(y_true=y_train, y_pred=pred_train)
    cm_train = confusion_matrix(y_train, pred_train)
    TN, FP, FN, TP = cm_train.ravel()
    scores_train.at[f'{fold_num}', 'FPR'] = FP/(FP+TN)
    scores_train.at[f'{fold_num}', 'FNR'] = FN/(TP+FN)
    
    # Valid dataset metrics
    scores_valid.at[f'{fold_num}', 'Error'] = 1-accuracy_score(y_true=y_valid, y_pred=pred_valid)
    cm_valid = confusion_matrix(y_valid, pred_valid)
    TN, FP, FN, TP = cm_valid.ravel()
    scores_valid.at[f'{fold_num}', 'FPR'] = FP/(FP+TN)
    scores_valid.at[f'{fold_num}', 'FNR'] = FN/(TP+FN)
    
    # Test dataset metrics
    scores_test.at[f'{fold_num}', 'Error'] = 1-accuracy_score(y_true=y_test, y_pred=pred_test)
    cm_test = confusion_matrix(y_test, pred_test)
    TN, FP, FN, TP = cm_test.ravel()
    scores_test.at[f'{fold_num}', 'FPR'] = FP/(FP+TN)
    scores_test.at[f'{fold_num}', 'FNR'] = FN/(TP+FN)

# Get mean scores over all K-folds
scores_train.at['mean', 'Error'] = scores_train['Error'].mean()
scores_train.at['mean', 'FPR'] = scores_train['FPR'].mean()
scores_train.at['mean', 'FNR'] = scores_train['FNR'].mean()
scores_valid.at['mean', 'Error'] = scores_valid['Error'].mean()
scores_valid.at['mean', 'FPR'] = scores_valid['FPR'].mean()
scores_valid.at['mean', 'FNR'] = scores_valid['FNR'].mean()
scores_test.at['mean', 'Error'] = scores_test['Error'].mean()
scores_test.at['mean', 'FPR'] = scores_test['FPR'].mean()
scores_test.at['mean', 'FNR'] = scores_test['FNR'].mean()

Working on Fold: 1
Working on Fold: 2
Working on Fold: 3
Working on Fold: 4
Working on Fold: 5


scores_train


scores_valid


scores_test


# Function to plot confusion matrix
def show_confusion_matrix(confusion_matrix):
    hmap = sns.heatmap(confusion_matrix, annot = True, fmt = "d", cmap = "Blues", square=True)
    hmap.yaxis.set_ticklabels(hmap.yaxis.get_ticklabels(), rotation = 0, ha = "right")
    hmap.xaxis.set_ticklabels(hmap.xaxis.get_ticklabels(), rotation = 0, ha = "right")
    plt.title('Confusion Matrix', pad=20, fontweight='bold', fontsize=15)
    plt.ylabel('True Class')
    plt.xlabel('Predicted Class')


# Function to plot ROC-AUC Curve
def show_roc_curve(roc_curve):
    auc_score = auc(roc_curve[0], roc_curve[1])
    plt.axis('square')
    plt.plot(roc_curve[0], roc_curve[1], color='tomato', lw=3, label='AUC: ' + str(auc_score))
    plt.plot([0, 1], [0, 1], 'k--', lw=1.5)
    plt.xlim([-0.01, 1.01])
    plt.ylim([-0.01, 1.01])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve', pad=20, fontweight='bold', fontsize=15)
    legend = plt.legend('AUC Score')
    legend._legend_box.align = "right"
    plt.legend(loc="lower right")


# Use the trained model to predict on both train and test datasets (now without smoting them)
pred_train = rfc_kfold.predict(x_train)
pred_test = rfc_kfold.predict(x_test)

# Confusion Matrix
cm_train = confusion_matrix(y_true=y_train, y_pred=pred_train)
cm_test = confusion_matrix(y_true=y_test, y_pred=pred_test)

# ROC Curve
roc_train = roc_curve(y_true=y_train, y_score=pred_train)
roc_test = roc_curve(y_true=y_test, y_score=pred_test)


# Create Figure
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(10,8))

# Plot Train ROC Curve
plt.subplot(2,2,1)
show_roc_curve(roc_train)
plt.title('ROC Curve | Train Data', fontsize=15, fontweight='bold', pad=15)

# Plot Train Confusion Matrix
plt.subplot(2,2,2)
show_confusion_matrix(cm_train)
plt.title('Confusion Matrix | Train Data', fontsize=15, fontweight='bold', pad=15)

# Plot Test ROC Curve
plt.subplot(2,2,3)
show_roc_curve(roc_test)
plt.title('ROC Curve | Test Data', fontsize=15, fontweight='bold', pad=15)

# Plot Test Confusion Matrix
plt.subplot(2,2,4)
show_confusion_matrix(cm_test)
plt.title('Confusion Matrix | Test Data', fontsize=15, fontweight='bold', pad=15)

fig.tight_layout(h_pad=4, w_pad=-10)

	word_freq_make	word_freq_address	word_freq_all	word_freq_3d	word_freq_our	word_freq_over	word_freq_remove	word_freq_internet	word_freq_order	word_freq_mail	...	char_freq_;	char_freq_(	char_freq_[	char_freq_!	char_freq_$	char_freq_#	capital_run_length_average	capital_run_length_longest	capital_run_length_total	LABEL
count	3680.000000	3680.000000	3680.000000	3680.000000	3680.000000	3680.000000	3680.000000	3680.000000	3680.000000	3680.000000	...	3680.000000	3680.000000	3680.000000	3680.000000	3680.000000	3680.000000	3680.000000	3680.000000	3680.000000	3680.000000
mean	0.104326	0.224799	0.279769	0.054228	0.310087	0.091864	0.113130	0.105326	0.089731	0.244329	...	0.038564	0.135266	0.016735	0.272490	0.077965	0.043748	5.249336	49.709511	280.558967	0.394022
std	0.304140	1.338092	0.508302	1.221217	0.656361	0.253049	0.381496	0.407075	0.280806	0.662715	...	0.248559	0.218538	0.114857	0.864163	0.250333	0.454713	33.133562	128.257333	555.548311	0.488706
min	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	...	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000	1.000000	1.000000	0.000000
25%	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	...	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.593500	6.000000	35.000000	0.000000
50%	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	...	0.000000	0.065000	0.000000	0.000000	0.000000	0.000000	2.275000	15.000000	95.000000	0.000000
75%	0.000000	0.000000	0.420000	0.000000	0.390000	0.000000	0.000000	0.000000	0.000000	0.170000	...	0.000000	0.186000	0.000000	0.325000	0.054250	0.000000	3.734000	44.000000	266.250000	1.000000
max	4.540000	14.280000	5.100000	42.810000	8.330000	3.570000	7.270000	11.110000	5.260000	18.180000	...	4.385000	4.271000	4.081000	32.478000	6.003000	19.829000	1102.500000	2204.000000	9163.000000	1.000000

	Error	FPR	FNR
1	0.000679	0.000562	0.000859
2	0.001019	0.001115	0.000870
3	0.000679	0.000558	0.000867
4	0.000340	0.000564	0.000000
5	0.000340	0.000561	0.000000
mean	0.000611	0.000672	0.000519

	Error	FPR	FNR
1	0.052989	0.031111	0.087413
2	0.043478	0.011468	0.090000
3	0.061141	0.052392	0.074074
4	0.042120	0.030702	0.060714
5	0.040761	0.031180	0.055749
mean	0.048098	0.031371	0.073590

	Error	FPR	FNR
1	0.055375	0.034050	0.088154
2	0.057546	0.028674	0.101928
3	0.058632	0.034050	0.096419
4	0.054289	0.030466	0.090909
5	0.056460	0.030466	0.096419
mean	0.056460	0.031541	0.094766

Algorithm Optimization & Selection for Spam Detection¶

Dataset¶

Imports¶

1. Loading the Dataset¶

1.1. Train-Test Split¶

1.2. Exploratory Analysis¶

2. Building the Classifier¶

2.1 Defining Grid Searches to Optimize the Classifiers¶

Support Vector Classifier¶

Random Forest Classifier¶

KNN Classifier¶

Gaussian Process Classifier¶

Linear Discriminant Analysis¶

2.2. Finding Optimal Hyperparameter with Grid Search & K-Fold Cross-Validation¶

2.3. Comparing Results of the Optimized Classifiers & Selecting the Best¶

Bonus: AutoML¶

3. Evaluating the Results¶

3.1. K-Fold Cross-Validation for the Best Model¶

3.2. Confusion Matrix and ROC-AUC Curve for the Best Model¶

End¶

	word_freq_make	word_freq_address	word_freq_all	word_freq_our	word_freq_over	word_freq_remove	word_freq_internet	word_freq_order	word_freq_mail	...	char_freq_;	char_freq_(	char_freq_!	char_freq_$	char_freq_#	capital_run_length_average	capital_run_length_longest	capital_run_length_total	LABEL
0	0.00	0.64	0.64	0.32	0.00	0.00	0.00	0.00	0.00	...	0.00	0.000	0.778	0.000	0.000	3.756	61	278	1
1	0.21	0.28	0.50	0.14	0.28	0.21	0.07	0.00	0.94	...	0.00	0.132	0.372	0.180	0.048	5.114	101	1028	1
2	0.06	0.00	0.71	1.23	0.19	0.19	0.12	0.64	0.25	...	0.01	0.143	0.276	0.184	0.010	9.821	485	2259	1
3	0.00	0.00	0.00	0.63	0.00	0.31	0.63	0.31	0.63	...	0.00	0.137	0.137	0.000	0.000	3.537	40	191	1
4	0.00	0.00	0.00	0.63	0.00	0.31	0.63	0.31	0.63	...	0.00	0.135	0.135	0.000	0.000	3.537	40	191	1

	SVC	RFC	KNN	GPC	LDA
train	0.033857	0.000673	0.000673	0.010314	0.010314
test	0.065147	0.057546	0.077090	0.076004	0.076004