Exploring the NSL-KDD dataset from the Canadian Institute for Cybersecurity. (https://www.unb.ca/cic/datasets/nsl.html)
Using the older 1999 dataset since it's smaller and allows for faster iterations.
Download source I used: http://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html
Using this dataset to train an Autoencoder for Hack Detection.
References: [1] M. Tavallaee, E. Bagheri, W. Lu, and A. Ghorbani, “A Detailed Analysis of the KDD CUP 99 Data Set,” Submitted to Second IEEE Symposium on Computational Intelligence for Security and Defense Applications (CISDA), 2009.
# Imports
from tqdm import tqdm
import random
import sklearn
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import matplotlib
import matplotlib.pyplot as plt
import datetime
import tensorflow.keras.backend as K
from mpl_toolkits import mplot3d
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler, OneHotEncoder
from tensorflow.keras.layers import Input, Dense, LSTM, TimeDistributed, RepeatVector, AlphaDropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import Model
from scipy.spatial.distance import euclidean
from sklearn.metrics import confusion_matrix, roc_auc_score, roc_curve, f1_score
from sklearn.metrics import precision_score, recall_score, accuracy_score
pd.options.mode.chained_assignment = None
%reload_ext watermark
%watermark -v --iv
# File paths
path_train = 'https://raw.githubusercontent.com/Matheus-Schmitz/Autoencoder_Hack_Detection/master/data/KDDTrain.csv'
path_test = 'https://raw.githubusercontent.com/Matheus-Schmitz/Autoencoder_Hack_Detection/master/data/KDDTest.csv'
path_colnames = 'https://raw.githubusercontent.com/Matheus-Schmitz/Autoencoder_Hack_Detection/master/data/kddcup-columns-names.txt'
# Load train dataset
df_train = pd.read_csv(path_train, header = None)
df_train.head()
# Load column names from txt file
col_names = pd.read_csv(path_colnames, header = None)
col_names = col_names.squeeze()
col_names
# Clean column names
col_names_cleaned = [i.split(':')[0] for i in col_names]
col_names_cleaned
# Add a column for the traffic type (the target variable)
col_names_cleaned.extend(['result'])
# Set the column names on the training dataset
df_train.columns = col_names_cleaned
df_train.head()
# Data types
df_train.dtypes
# Check all possible traffic classifications
# 'normal' is benign traffic, all alternatives are anomalous activity
df_train.result.unique()
# Check all network service classifications
df_train.service.unique()
There are many network service types for many purposes, and those can have confounding attributes when it comes to identifying malicious/anomalous activity.
I'll keep only http services and focus on web activity.
# Extracting only samples with http service
df_train_http = df_train[df_train['service'] == 'http']
df_train_http.head()
# Check amount of normal and anomalous samples
normal_samples = df_train_http[df_train_http['result'] == 'normal'].shape[0]
anomalous_samples = df_train_http[df_train_http['result'] != 'normal'].shape[0]
print(f'Share of Normal Samples: {100*(normal_samples/len(df_train_http)):.2f} %')
print(f'Share of Anomalous Samples: {100*(anomalous_samples/len(df_train_http)):.2f} %')
The dataset is highly unbalanced. One alternative would be using SMOTE to balance it, but here I'll take another route which I was taught can work well in such scenarios:
I'll create a single-class autoencoder, which learns the 'signature' of a class and then classifies a test sample as being or not being part of that class based on its distance (in standard deviations) from the class signature. This is done with SELUs (a 'self-normalizing' variation of ReLUs).
This model will essentially return the probability of a test sample being or not being 'normal' traffic.
# Extracting only the samples with normal activity
df_train_http_normal = df_train_http[df_train_http['result'] == 'normal']
df_train_http_normal.shape
# Describe
df_train_http_normal.describe()
Since the model works based on standard deviations, some columns will have to be dropped, namely the string columns and those with a standard deviation of zero.
Note: This means dropping the target variable (result) too, since this will be unsupervised learning.
# List of all columns to remove
zero_std_cols = []
# Columns with string values
mask = (df_train_http_normal.dtypes == 'object').values
string_cols = df_train_http_normal.columns[mask]
zero_std_cols.extend(string_cols)
zero_std_cols
# Columns with standard deviation of zero
mask = df_train_http_normal.std(axis = 0, numeric_only = True) == 0
no_std_cols = mask.index[mask.values == True]
zero_std_cols.extend(no_std_cols)
zero_std_cols
# Total columns
len(df_train_http_normal.columns)
# Columns to be dropped
len(zero_std_cols)
# Removing columns with a standard deviation of zero
df_train_std = df_train_http_normal.drop(zero_std_cols, axis = 1)
# Describe
df_train_std.describe()
# Boxplot
df_train_std.boxplot(figsize = (35, 10))
The 'dst_bytes' column has a much greater range than the others, and the remaining columns also have widely varying ranges. This clearly calls for scaling the dataset.
Here Standardizing (mean = 0, std = 1) is much preferred over Normalizing to [0, 1], due to the approach being used.
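For reference, a minimal sketch of the two transformations being compared (illustrative only; the actual scaling below uses sklearn's StandardScaler, which standardizes with the population std):
# What the two scalers do, per column (using 'dst_bytes' as an example)
col = df_train_std['dst_bytes']
standardized = (col - col.mean()) / col.std(ddof = 0)       # mean 0, std 1 (StandardScaler)
normalized = (col - col.min()) / (col.max() - col.min())    # range [0, 1] (MinMaxScaler)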
# Standardizing data
scaler = StandardScaler()
df_train_scaled = pd.DataFrame(scaler.fit_transform(df_train_std),
columns = df_train_std.columns)
df_train_scaled.head()
# Correlation plot
plt.figure(figsize = (15, 10))
sns.heatmap(df_train_scaled.corr(), cmap = 'viridis')
plt.show()
Although most features are not correlated, there are a few pairs of features with high correlation. Moreover, the dataset has 28 dimensions, a bit too many to train a model efficiently.
Hence I'll employ PCA to reduce the feature space while simultaneously removing the correlations present in the dataset.
In line with the Pareto principle I'll aim to keep 80% of the variation present in the original feature space (in hopes of needing only ~20% of the features).
# Reduce dataset dimensionality with PCA
# Aiming to keep 80% of the explained variance present in the original dataset
pca = PCA(n_components = 0.8)
pca.fit(df_train_scaled)
# Extracting the Principal Components resultant from PCA
pca_cols = ['PCA_' + str(i) for i in range(pca.n_components_)]
df_train_pca = pd.DataFrame(pca.transform(df_train_scaled), columns = pca_cols)
df_train_pca.head()
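As a quick sanity check on the 80% variance target (optional; uses the pca object fitted above):
# Number of components kept and the total variance they explain
print(f'Components kept: {pca.n_components_}')
print(f'Explained variance kept: {pca.explained_variance_ratio_.sum():.4f}')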
From the original Task description:
"The raw training data was about four gigabytes of compressed binary TCP dump data from seven weeks of network traffic. This was processed into about five million connection records. Similarly, the two weeks of test data yielded around two million connection records.
A connection is a sequence of TCP packets starting and ending at some well defined times, between which data flows to and from a source IP address to a target IP address under some well defined protocol. Each connection is labeled as either normal, or as an attack, with exactly one specific attack type. Each connection record consists of about 100 bytes."
In other words, this is a time series dataset!
Therefore... (drums beating) ... we are going with LSTMs!
Source: https://kdd.ics.uci.edu/databases/kddcup99/task.html
Because LSTMs expect 3D input of shape (samples, timesteps, features), the dataset will need to be transformed. Using a function to extract 2D windows (timesteps x features), which are then stacked into a 3D array.
After some testing, a window_size of 10 seems to be a good value. Also using a stride of 10, so there is no overlap between windows.
# Function to get windows
def get_windows(df, window_size = 10, stride = 10):
windows_arr = []
for i in tqdm(range(0, len(df)-stride, stride)):
windows_arr.append(df.iloc[i:i+window_size, :].to_numpy())
return np.array(windows_arr)
# Get windows
window_size, stride = 10, 10
windows_arr = get_windows(df_train_pca, window_size = window_size, stride = stride)
df_train_pca.shape
# 3804 samples (38049 rows divided by stride of 10, keeping only the integer part)
# Each sample contains 10 timesteps (from window_size)
# Each timestep contains 14 features (from pca.n_components_)
windows_arr.shape
# Shuffle the samples to avoid biasing the model during training
indices = np.arange(windows_arr.shape[0])
np.random.shuffle(indices)
windows_shuffled = windows_arr[indices]
The activation function used will be SELU (Scaled Exponential Linear Unit) instead of the usual ReLU.
From TF: https://www.tensorflow.org/api_docs/python/tf/keras/activations/selu
TF recommends using AlphaDropout along with SELU: https://www.tensorflow.org/api_docs/python/tf/keras/layers/AlphaDropout
The SELU activation auto-normalizes the neural network.
The reasoning behind this choice is that, assuming the network's weights are initialized from a normal distribution, SELU keeps the activations approximately gaussian as they propagate through the network (forward/backward steps), which in turn implies the network's output will also be approximately normally distributed.
This architecture choice supposedly results in better performance for this type of problem (unbalanced classes on an autoencoder).
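For reference, SELU is defined by two fixed constants; a minimal NumPy sketch of the activation as defined in Klambauer et al. (2017) and used by tf.keras.activations.selu:
# SELU: scale * x for x > 0, scale * alpha * (exp(x) - 1) otherwise
selu_alpha = 1.6732632423543772
selu_scale = 1.0507009873554805
def selu_reference(x):
    return selu_scale * np.where(x > 0, x, selu_alpha * (np.exp(x) - 1.0))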
# Clean Keras session
K.clear_session()
# Encoder with Stacked LSTM
encoder = Sequential([LSTM(80, return_sequences = True,
activation = 'selu',
input_shape = (window_size, pca.n_components_)),
AlphaDropout(rate = 0.2),
LSTM(50, activation = 'selu', return_sequences = True),
LSTM(20, activation = 'selu')],
name = 'encoder')
The last LSTM in the encoder does not return sequences, which compresses each window into a single latent vector; hence the decoder needs to start with a RepeatVector of window_size to get the data back to the shape of windows_arr.
TF RepeatVector: https://www.tensorflow.org/api_docs/python/tf/keras/layers/RepeatVector
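To make the encoder/decoder symmetry explicit, here is the expected shape of the data as it flows through the encoder above and the decoder defined just below (batch dimension omitted; 10 and 14 come from window_size and pca.n_components_):
# Input window                                -> (10, 14)
# LSTM(80, return_sequences = True)           -> (10, 80)
# AlphaDropout(0.2)                           -> (10, 80)  shape unchanged
# LSTM(50, return_sequences = True)           -> (10, 50)
# LSTM(20)                                    -> (20,)     latent vector, sequence compressed
# RepeatVector(window_size)                   -> (10, 20)
# LSTM(50, return_sequences = True)           -> (10, 50)
# LSTM(80, return_sequences = True)           -> (10, 80)
# TimeDistributed(Dense(pca.n_components_))   -> (10, 14)  reconstruction, same shape as the input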
# Decoder with output dimension equals to input dimension
decoder = Sequential([RepeatVector(window_size),
LSTM(50, activation = 'selu', return_sequences = True),
LSTM(80, activation = 'selu', return_sequences = True),
TimeDistributed(Dense(pca.n_components_, activation = 'linear'))],
name = 'decoder')
# Sequential Autoencoder
autoencoder = Sequential([encoder, decoder], name = 'autoencoder')
When to use Huber loss:
Huber loss combines properties of MSE and MAE: it is quadratic for small errors and linear for large ones, so outliers are penalized less heavily than with MSE. When outliers should not be given a high weight, Huber is a good choice.
https://medium.com/@gobiviswaml/huber-error-loss-functions-3f2ac015cd45
Wikipedia: https://en.wikipedia.org/wiki/Huber_loss
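A minimal NumPy sketch of the definition, for reference (delta is the point where the loss switches from quadratic to linear; the compile call below passes delta = 100):
def huber_reference(error, delta = 1.0):
    # 0.5 * e^2 for |e| <= delta, delta * (|e| - 0.5 * delta) otherwise
    abs_error = np.abs(error)
    return np.where(abs_error <= delta, 0.5 * abs_error**2, delta * (abs_error - 0.5 * delta))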
# Compile the autoencoder with Huber loss and the Adam optimizer
autoencoder.compile(optimizer = 'adam', loss = tf.keras.losses.Huber(100.))
# Model summary
autoencoder.summary()
# Encoder and Decoder summary
encoder.summary(), decoder.summary()
# Model checkpoint (save model whenever validation loss improves)
now = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M")
check_point = tf.keras.callbacks.ModelCheckpoint(f'models/autoencoder_{now}.h5',
monitor = 'val_loss',
save_best_only = True,
mode = 'min',
verbose = 1)
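Note: the checkpoint path above assumes a local models/ directory. If it doesn't exist yet, creating it first avoids a write error when the callback tries to save (a small precaution, not part of the original pipeline):
import os
os.makedirs('models', exist_ok = True)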
%%time
# Note: test y as windows_shuffled[:, :, ::-1]
# This would be a flipped autoencoder
# Train model
train_hist = autoencoder.fit(windows_shuffled,
windows_shuffled,
batch_size = 64,
validation_split = 0.2,
epochs = 100,
callbacks = [check_point])
# Load best model
autoencoder_loaded = tf.keras.models.load_model(f'models/autoencoder_{now}.h5')
# Load test dataset
df_test = pd.read_csv(path_test, header = None, names = col_names_cleaned)
df_test.head(3)
# Get only samples with service == http
df_test_http = df_test[df_test['service'] == 'http']
A sample is anomalous if its result is != normal. A time window is anomalous if it contains at least one sample whose result is != normal.
# Binary label to represent anomalies
status = pd.Series([0 if i == 'normal' else 1 for i in df_test_http['result']])
test_labels = [1 if np.sum(status[i:i+window_size])>0 else 0 for i in range(0, len(status)-stride, stride)]
print(f'Test samples: {len(status)}')
print(f'Test windows: {len(test_labels)}')
# Removing columns with a standard deviation of zero
df_test_std = df_test_http.drop(zero_std_cols, axis = 1)
# Scale test dataframe
df_test_scaled = pd.DataFrame(scaler.transform(df_test_std),
columns = df_test_std.columns)
# PCA
df_test_pca = pd.DataFrame(pca.transform(df_test_scaled),
columns = pca_cols)
df_test_pca.head(3)
# Create time windows
test_windows = get_windows(df_test_pca, window_size = window_size, stride = stride)
test_windows.shape
# Predict
test_windows_pred = autoencoder_loaded.predict(test_windows)
The reconstruction error is used as a metric to predict the probability of a sample being anomalous.
The reasoning is that the autoencoder was trained only on normal data. Thus, during inference (prediction), if the model receives an anomalous sample, the encoder will compress it to a latent representation similar to that of normal samples (because it was trained to compress only normal samples).
Therefore this latent representation loses the information related to the sample's anomalous characteristics, and the decoder reconstructs it as if it were a normal sample. When the input and the output are compared there will be a large error, as the reconstruction will look nothing like the anomalous input. This large reconstruction error is what signals that a sample is anomalous.
After calculating the reconstruction errors for all samples in the test dataset, they can be scaled to [0, 1], which is known as the Anomaly Score. A threshold can then be defined on this Anomaly Score so that all samples above it are considered anomalous.
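As a compact summary of that decision rule, a hypothetical helper (the function name, the fitted MinMaxScaler and the threshold are assumptions here; the notebook computes the same thing step by step below):
def is_anomalous(window, model, score_scaler, threshold):
    # Reconstruction error: mean euclidean distance between the window and its reconstruction
    reconstruction = model.predict(window[np.newaxis, ...])[0]
    error = np.linalg.norm(window - reconstruction, axis = 1).mean()
    # Rescale the error to [0, 1] with an already-fitted MinMaxScaler and compare to the threshold
    score = score_scaler.transform([[error]])[0, 0]
    return score > threshold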
# Reconstruction error for each sample (window)
# The reconstruction error I'm using is the euclidean distance between the y_true and y_pred tensors
# Euclidean distance = the hypotenuse from the Pythagorean theorem, aka the shortest distance between two points
# Since this involves lots of matrix (tensor) math, it's best done in TensorFlow
# List with errors
recon_errors = []
# Condition to continue the loop:
# While i is less than iters, continue the loop, once i gets to the value of iters, then the loop ends
def cond(y_true, y_pred, i, iters):
return tf.less(i, iters)
def body(y_true, y_pred, i, iters):
# Tensor
# First index is the sample (window index),
# second is the timestep within the window, and third is the PCA components (14 components per timestep)
# Here I'm getting the difference (tf.math.subtract) between two entire windows (tf.slice), namely the y_true and y_pred windows
tensor_for_error = tf.math.subtract(tf.slice(y_true, [i, 0, 0], [1, -1, -1]),
tf.slice(y_pred, [i, 0, 0], [1, -1, -1]))
# Reshape
# Get the error back to the format of the batch used (number of samples in a timewindow x attributes per sample)
tensor_for_error = tf.reshape(tensor_for_error, [window_size, pca.n_components_])
# Reconstruction error
# Measured as the distance between the y_true and y_pred tensors
# The error will be the mean euclidean distance between each of the pca.n_components_ of y_true and y_pred
recon_error = tf.math.reduce_mean(tf.norm(tensor_for_error, ord = 'euclidean', axis = 1))
# Append error to the list
recon_errors.append(recon_error.numpy())
return [y_true, y_pred, tf.add(i, 1), iters]
Note on tf.norm(ord = 'euclidean'):
It treats each row of the tensor as a point in an n-dimensional space (one dimension per feature) and returns the length of the vector from the origin to that point. Kinda like measuring a point's distance from the origin on a 3D graph, except here it's a 14D graph.
From wikipedia: https://en.wikipedia.org/wiki/Norm_(mathematics)#Euclidean_norm
In mathematics, a norm is a function from a vector space over the real or complex numbers to the nonnegative real numbers that satisfies certain properties pertaining to scalability and additivity, and takes the value zero only if the input vector is zero. A pseudonorm or seminorm satisfies the same properties, except that it may have a zero value for some nonzero vectors.
The Euclidean norm or 2-norm is a specific norm on a Euclidean vector space, that is strongly related to the Euclidean distance, and equals the square root of the inner product of a vector with itself.
A vector space on which a norm is defined is called a normed vector space. Similarly, a vector space with a seminorm is called a seminormed vector space.
On the n-dimensional Euclidean space ℝⁿ, the intuitive notion of length of the vector x = (x1, x2, ..., xn) is captured by the formula
$$\lVert \mathbf{x} \rVert_2 = \sqrt{x_1^2 + x_2^2 + \cdots + x_n^2}$$
This is the Euclidean norm, which gives the ordinary distance from the origin to the point x, a consequence of the Pythagorean theorem. This operation may also be referred to as "SRSS", which is an acronym for the square root of the sum of squares.
# Iterations (which is the number of test_windows)
iters = tf.constant(len(test_windows))
# Loop
result = tf.while_loop(cond, body, [tf.constant(test_windows.astype(np.float32)),
tf.constant(test_windows_pred.astype(np.float32)), 0, iters])
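Optionally, the same reconstruction errors can be computed in a single vectorized NumPy step, which serves as a sanity check against the loop above (should match recon_errors up to float precision):
# Mean euclidean norm of the per-timestep difference, one value per window
recon_errors_np = np.linalg.norm(test_windows - test_windows_pred, axis = 2).mean(axis = 1)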
# Minmax scaler
mm_scaler = MinMaxScaler()
# Reshape
recon_errors = np.array(recon_errors).reshape(-1, 1)
# Apply scaler
anomaly_predictions = mm_scaler.fit_transform(recon_errors).flatten()
# Plot
plt.figure(figsize = (20, 10))
plt.plot(test_labels, c = 'blue', label = 'Original')
plt.plot(anomaly_predictions, c = 'red', label = 'Predicted')
plt.yticks(np.arange(0, 1.1, 0.1))
plt.xlabel('Samples')
plt.ylabel('Anomaly Scores')
plt.grid()
plt.legend()
plt.show()
# Putting y_true and y_pred into a dataframe so that it can be ordered
df_compare = pd.DataFrame(data = {'y_true': test_labels,
'y_pred': anomaly_predictions})
# Sort dataframe so the plot is more legible
df_compare = df_compare.sort_values(['y_pred']).reset_index(drop=True)
# Plot
plt.figure(figsize = (20, 10))
plt.plot(df_compare.y_true, c = 'blue', label = 'Original', marker='.', linestyle='None')
plt.plot(df_compare.y_pred, c = 'red', label = 'Predicted')
plt.yticks(np.arange(0, 1.1, 0.1))
plt.xlabel('Samples')
plt.ylabel('Anomaly Scores')
plt.title('Probability of Sample Being Anomalous', fontsize = 24)
plt.grid()
plt.legend()
plt.show()
# ROC Curve
fpr, tpr, thresholds = roc_curve(test_labels, anomaly_predictions)
# AUC Score
auc = roc_auc_score(test_labels, anomaly_predictions)
print(f'AUC Score: {auc}')
# Plot ROC Curve
plt.figure(figsize = (10,5))
plt.plot([0, 1], [0, 1], color = 'black', linestyle = '--')
plt.plot(fpr, tpr, label = f'AUC = {auc}')
plt.grid()
plt.xlabel('FPR')
plt.ylabel('TPR')
plt.legend()
plt.title('ROC')
plt.show()
Need to find the threshold above which a sample will be considered an anomaly. Too low and the system will catch all anomalies at the cost of raising way too many false positives. Too high and the system will let many anomalies pass by undetected. From a visual analysis of the ROC curve it seems that the ideal value is in the range of [0.05, 0.15].
Using the F-1 Score to determine the ideal threshold, as it accounts for both Precision (impact of false positives) and Recall (impact of false negatives).
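For reference, the F-1 Score is the harmonic mean of Precision and Recall, which is what sklearn's f1_score computes:
# F1 = 2 * (precision * recall) / (precision + recall)
def f1_reference(precision, recall):
    return 2 * precision * recall / (precision + recall)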
# List the thresholds suggested by roc_curve and their F-1 scores
thresholds_anomalies = [(anomaly_predictions > i).astype(np.int32) for i in thresholds]
# thresholds_anomalies is a list of arrays, each containing 0 or 1 depending on whether a given threshold predicts a given sample as an anomaly
# Compare thresholds_anomalies to test_labels to calculate the F1 score (via precision and recall) of each proposed threshold value
f1_scores = [f1_score(test_labels, i) for i in thresholds_anomalies]
# Plot
plt.figure(figsize = (10, 5))
plt.plot(thresholds, f1_scores)
plt.grid()
plt.xlabel('Thresholds')
plt.ylabel('F-1 Score')
plt.title('F-1 Score vs Thresholds')
plt.show()
# Get the best threshold
max_f1_score = np.max(f1_scores)
best_threshold = thresholds[f1_scores.index(max_f1_score)]
print(f'Best Threshold = {best_threshold}')
# Create an anomaly indicator (a mask)
anomaly_indicator = (anomaly_predictions > best_threshold).astype(np.int32)
anomaly_indicator
# Confusion Matrix
confusion_matrix(test_labels, anomaly_indicator)
# Adjust labels
anomaly_indicator_final = ['normal' if i == 0 else 'anomaly' for i in anomaly_indicator]
# Plot
plt.figure(figsize = (20,10))
sns.scatterplot(x = np.arange(0, len(anomaly_predictions)), y = anomaly_predictions, hue = anomaly_indicator_final,
palette = ['red', 'blue'], legend = 'full')
plt.axhline(y = best_threshold, linestyle = '--', label = 'threshold')
plt.legend()
plt.grid()
plt.show()
# Metrics
precision = precision_score(test_labels, anomaly_indicator)
recall = recall_score(test_labels, anomaly_indicator)
f1_sc = f1_score(test_labels, anomaly_indicator)
accuracy_sc = accuracy_score(test_labels, anomaly_indicator)
print('Model Performance Metrics:')
print(f'Precision = {precision:.5f}')
print(f'Recall = {recall:.5f}')
print(f'F1 Score = {f1_sc:.5f}')
print(f'Accuracy = {accuracy_sc:.5f}')