Matheus Schmitz
LinkedIn
Github Portfolio
!pip install --quiet tensorflow_addons
|████████████████████████████████| 1.1 MB 5.3 MB/s
# Data Manipulation
import numpy as np
import pandas as pd
import pickle
import os
from scipy.ndimage import interpolation
from sklearn.preprocessing import OneHotEncoder
# Machine Learning
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras import layers
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, CSVLogger, EarlyStopping
from sklearn.metrics import f1_score, accuracy_score, balanced_accuracy_score, confusion_matrix, roc_curve, auc, precision_recall_curve
# Plotting
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
%matplotlib inline
import seaborn as sns
import pylab
# Report the installed library versions so results can be tied to a specific environment
print(f'Tensorflow version: {tf.__version__}')
print(f'Tensorflow_addons version: {tfa.__version__}')
Tensorflow version: 2.8.0 Tensorflow_addons version: 0.16.1
# Unpickle data
# NOTE(review): pickle.load can execute arbitrary code -- only open dataset files from a trusted source
with open('data/cremad_openface_mfcc.pkl', 'rb') as f_in:
    data = pickle.load(f_in)


def _split_to_arrays(split):
    '''Return (frame, mfcc, action units, integer labels) for one dataset split.

    After transposing, column 0 is read as the MFCC sequences, column 1 as the
    action-unit sequences and column 2 as the labels.
    '''
    frame = pd.DataFrame(split).T
    return frame, frame[0].to_numpy(), frame[1].to_numpy(), frame[2].to_numpy(dtype=int)


# Train
df_train, mfcc_train, au_train, labels_train = _split_to_arrays(data['train'])
# Validation
df_valid, mfcc_valid, au_valid, labels_valid = _split_to_arrays(data['val'])
# Test
df_test, mfcc_test, au_test, labels_test = _split_to_arrays(data['test'])
# Emotion vocabulary: the position of each name in the tuple is its integer class index
idx_to_emotion = dict(enumerate(('neutral', 'anger', 'happy', 'fear', 'disgust', 'sad')))
# Reverse lookup: emotion name -> class index
emotion_to_idx = {emotion: idx for idx, emotion in idx_to_emotion.items()}
# Class names as an array, ordered by class index
class_names = np.array(list(emotion_to_idx))
# Sizing parameters
# Feature counts are read from the second axis of the first training sample of each modality
NUM_MFCCS = mfcc_train[0].shape[1]
NUM_AUS = au_train[0].shape[1]
# One network output per emotion class
OUTPUT_SIZE = len(class_names)
# Training hyperparameters shared by every model in this notebook
BATCH_SIZE = 128
EPOCHS = 100
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 1e-3
# Define directories for model checkpoints and training logs
BACKUP_DIR = './models'
LOGS_DIR = './logs'
# makedirs(exist_ok=True) replaces the check-then-create pattern: it is safe to rerun,
# has no race between the existence check and the creation, and creates missing parents
os.makedirs(BACKUP_DIR, exist_ok=True)
os.makedirs(LOGS_DIR, exist_ok=True)
Standardize Timesteps
# Number of timesteps of every training sample, per modality
mfcc_sizes = [len(sample) for sample in mfcc_train]
au_sizes = [len(sample) for sample in au_train]
# 95th quantile of each length distribution, rounded up to a whole number of timesteps
MFCC_TIMESTEPS_95_QUANTILE = int(np.ceil(pd.Series(mfcc_sizes).quantile(q=0.95)))
AU_TIMESTEPS_95_QUANTILE = int(np.ceil(pd.Series(au_sizes).quantile(q=0.95)))
def standardize_timesteps(data, timesteps):
    '''Pad or truncate every sample along its time axis so that all samples have `timesteps` steps.

    `data` is indexed as (samples, timesteps, features), where the number of timesteps varies
    between samples. Shorter samples are right-padded with zeros (zero is the mask token);
    longer samples keep only their first `timesteps` steps. The feature count is taken from
    the first sample. Returns a dense array of shape (samples, timesteps, features).
    '''
    n_features = len(data[0][0])
    padded = np.zeros(shape=(data.shape[0], timesteps, n_features))
    # Each `row` is a view into `padded`, so writing to it fills the output in place
    for row, sample in zip(padded, data):
        kept = min(len(sample), timesteps)
        row[:kept] = sample[:kept]
    return padded
# Standardize length and use 0 as masking value -- MFCC
# Every split is cut/padded to the length derived from the training split
mfcc_train_standardized, mfcc_valid_standardized, mfcc_test_standardized = (
    standardize_timesteps(split, MFCC_TIMESTEPS_95_QUANTILE)
    for split in (mfcc_train, mfcc_valid, mfcc_test)
)
# Standardize length and use 0 as masking value -- Action Units
au_train_standardized, au_valid_standardized, au_test_standardized = (
    standardize_timesteps(split, AU_TIMESTEPS_95_QUANTILE)
    for split in (au_train, au_valid, au_test)
)
One-Hot Encode Labels
# The encoder learns the label set from the training split only and is then applied to every split
# NOTE(review): newer scikit-learn releases rename `sparse` to `sparse_output` -- confirm against the installed version
OHE = OneHotEncoder(sparse=False)
OHE.fit(labels_train.reshape(-1, 1))
labels_train_one_hot, labels_valid_one_hot, labels_test_one_hot = (
    OHE.transform(split_labels.reshape(-1, 1))
    for split_labels in (labels_train, labels_valid, labels_test)
)
# Instead of feeding the NN a full dataset in matrix-style data this will feed a keras Sequence which batches data online and can even randomly sample
class BatchData(tf.keras.utils.Sequence):
    '''Keras Sequence that serves (X, y) in fixed-size batches, optionally reshuffled every epoch.

    Args:
        X: Inputs, indexed by sample along the first axis. Must support indexing with an
            integer array when `shuffle` is True (e.g. a NumPy array).
        y: Targets aligned one-to-one with `X`.
        n_classes: Number of target classes (stored for reference; not used by the batching).
        batch_size: Number of samples per batch; the final batch may be smaller.
        shuffle: If True, permute the samples at construction and after every epoch.

    Raises:
        ValueError: If `X` and `y` differ in length or `batch_size` is smaller than 1.
    '''

    def __init__(self, X, y, n_classes, batch_size=64, shuffle=True):
        super().__init__()
        # Validate regardless of `shuffle`: misaligned inputs would silently yield wrong batches
        if len(X) != len(y):
            raise ValueError(f'X and y must have the same number of samples, got {len(X)} and {len(y)}')
        if batch_size < 1:
            raise ValueError(f'batch_size must be at least 1, got {batch_size}')
        # Hyperparameters
        self.X = X
        self.y = y
        self.n_classes = n_classes
        self.batch_size = batch_size
        self.shuffle = shuffle
        # Shuffle
        if self.shuffle:
            self._shuffle()

    def _shuffle(self):
        ''' Applies a single random permutation to X and y so that pairs stay aligned '''
        p = np.random.permutation(len(self.X))
        self.X, self.y = self.X[p], self.y[p]

    def __len__(self):
        ''' Defines number of calls per epoch '''
        return int(np.ceil(len(self.X) / self.batch_size))

    def __getitem__(self, index):
        ''' Returns the (inputs, targets) pair of batch number `index` '''
        start = index * self.batch_size
        stop = start + self.batch_size
        return self.X[start:stop], self.y[start:stop]

    def on_epoch_end(self):
        ''' Reshuffles the data between epochs when shuffling is enabled '''
        if self.shuffle:
            self._shuffle()
def get_model(name, input_shape, output_size):
    '''Build and compile the recurrent classifier used for every modality.

    Layers, in order: masking of zero-padded timesteps, two stacked bidirectional LSTMs
    (the first returns the full sequence, the second only its final output), a dense head
    with batch normalization and dropout, and a softmax over the classes.

    Args:
        name: Name given to the Keras model.
        input_shape: Shape of one sample, passed to the Masking layer (here (timesteps, features)).
        output_size: Number of units in the softmax output layer.

    Returns:
        The compiled Sequential model. Compilation reads the module-level LEARNING_RATE and
        WEIGHT_DECAY and tracks accuracy and ROC-AUC.
    '''
    model = Sequential(name=name)
    # Zero is the padding value, so those timesteps are masked out
    model.add(layers.Masking(mask_value=0, input_shape=input_shape))
    model.add(layers.Bidirectional(layers.LSTM(512, return_sequences=True, recurrent_dropout=0.0)))
    model.add(layers.TimeDistributed(layers.Dropout(0.8)))
    model.add(layers.Bidirectional(layers.LSTM(256, return_sequences=False, recurrent_dropout=0.0)))
    model.add(layers.Dropout(0.8))
    # The activation is applied separately so that batch normalization sees the pre-activations
    model.add(layers.Dense(512, activation=None))
    model.add(layers.BatchNormalization())
    model.add(layers.Activation('relu'))
    model.add(layers.Dropout(0.8))
    model.add(layers.Dense(512, activation='relu'))
    # 'softmax' is the documented activation identifier; the capitalized spelling relies on
    # version-specific name resolution
    model.add(layers.Dense(output_size, activation='softmax'))
    model.compile(optimizer=tfa.optimizers.AdaBelief(learning_rate=LEARNING_RATE, weight_decay=WEIGHT_DECAY, amsgrad=True, epsilon=1e-7),
                  loss='categorical_crossentropy',
                  metrics=['accuracy', tf.keras.metrics.AUC(curve='ROC', name='roc_auc')])
    return model
def load_trained_weights(LOAD_TRAINED_WEIGHTS, model, path):
    '''
    Optionally restore previously saved weights into `model`.

    If LOAD_TRAINED_WEIGHTS == True, load the model's training using the pre-trained neurons - THIS WILL FAIL (AND BE REPORTED) IF THE WINDOW_SIZE OR NEURONS WAS CHANGED
    If LOAD_TRAINED_WEIGHTS == False, train the neural network from scratch - THIS WILL LOSE ALL PROGRESS AND CAUSE WORSE PREDICTIONS

    Args:
        LOAD_TRAINED_WEIGHTS: Whether loading should be attempted at all.
        model: Model exposing `load_weights(path)` and `compile(...)`.
        path: Location of the saved weights; nothing is loaded when it does not exist.

    Returns:
        Tuple (LOAD_TRAINED_WEIGHTS, SUCCESSFUL_WEIGHT_LOAD, model), where
        SUCCESSFUL_WEIGHT_LOAD is True only if the weights were actually loaded.
    '''
    if not (LOAD_TRAINED_WEIGHTS and os.path.exists(path)):
        return LOAD_TRAINED_WEIGHTS, False, model

    # Try loading weights. Will fail if the model structure changed.
    # Only ordinary errors are caught, so Ctrl-C / interpreter shutdown still propagate.
    try:
        model.load_weights(path)
    except Exception as err:
        print('Could not load weights. Most likely the network architecture changed.')
        print(f'Reason: {err!r}')
        return LOAD_TRAINED_WEIGHTS, False, model

    # Recompile after loading, with a freshly created optimizer
    model.compile(optimizer=tfa.optimizers.AdaBelief(learning_rate=LEARNING_RATE, weight_decay=WEIGHT_DECAY, amsgrad=True, epsilon=1e-7),
                  loss='categorical_crossentropy',
                  metrics=['accuracy', tf.keras.metrics.AUC(curve='ROC', name='roc_auc')])
    print('Best weights loaded successfully!')
    return LOAD_TRAINED_WEIGHTS, True, model
def get_callbacks(backup_file):
    '''Return the callbacks shared by every training run; all of them monitor the validation loss.

    `backup_file` is where the checkpoint callback stores the weights of the best epoch.
    '''
    return [
        # Keep only the weights of the best epoch seen so far
        ModelCheckpoint(backup_file, monitor='val_loss', save_best_only=True, save_weights_only=True, verbose=0),
        # Multiply the learning rate by 0.8 after 3 epochs without improvement, never going below 3e-5
        ReduceLROnPlateau(monitor='val_loss', factor=0.8, patience=3, min_lr=3e-5, verbose=1),
        # Stop after 20 epochs without improvement and roll back to the best weights
        EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True, verbose=1),
    ]
def plot_keras_log(log_path):
    '''Plot the per-epoch training history stored in a CSV training log.

    Draws four panels side by side: loss, accuracy, ROC-AUC (train vs. validation, each with
    a legend) and the learning rate on a logarithmic axis.

    Args:
        log_path: Path to a CSV file with the columns loss, val_loss, accuracy, val_accuracy,
            roc_auc, val_roc_auc and lr, one row per epoch.
    '''
    # Read the log file
    model_log = pd.read_csv(log_path)
    epochs = model_log.index

    # Create figure
    _, axs = plt.subplots(nrows=1, ncols=4, figsize=(20,5))

    # (panel title, log column, fixed y-limits or None) for the train-vs-validation panels
    metric_panels = (
        ('Loss', 'loss', None),
        ('Accuracy', 'accuracy', [0, 1]),
        ('ROC-AUC', 'roc_auc', [0, 1]),
    )
    for ax, (title, column, y_limits) in zip(axs, metric_panels):
        ax.plot(epochs, model_log[column], color='C0', lw=3, alpha=0.7, label='Train')
        ax.plot(epochs, model_log[f'val_{column}'], color='C1', lw=3, alpha=0.7, label='Validation')
        if y_limits is not None:
            ax.set_ylim(y_limits)
        ax.set_title(title)
        ax.set_xlabel('Epoch')
        # Without a legend the two curves could only be told apart by colour
        ax.legend()

    # Learning Rate
    lr_ax = axs[3]
    lr_ax.set_yscale('log')
    lr_ax.plot(epochs, model_log['lr'], lw=3, color='C2')
    lr_ax.set_title('Learning Rate')
    lr_ax.set_xlabel('Epoch')
    plt.show()
Dataloaders
# Acoustic (MFCC) loaders: only the training loader shuffles, so validation and test batches keep the order of their labels
acoustic_train_loader = BatchData(X=mfcc_train_standardized, y=labels_train_one_hot, n_classes=OUTPUT_SIZE, batch_size=BATCH_SIZE, shuffle=True)
acoustic_valid_loader = BatchData(X=mfcc_valid_standardized, y=labels_valid_one_hot, n_classes=OUTPUT_SIZE, batch_size=BATCH_SIZE, shuffle=False)
acoustic_test_loader = BatchData(X=mfcc_test_standardized, y=labels_test_one_hot, n_classes=OUTPUT_SIZE, batch_size=BATCH_SIZE, shuffle=False)
Model
# Acoustic model: one sample is MFCC_TIMESTEPS_95_QUANTILE timesteps of NUM_MFCCS features
acoustic_input_shape = (MFCC_TIMESTEPS_95_QUANTILE, NUM_MFCCS)
model_1 = get_model(name='Acoustic', input_shape=acoustic_input_shape, output_size=OUTPUT_SIZE)
model_1.summary()
Model: "Acoustic" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= masking (Masking) (None, 356, 52) 0 bidirectional (Bidirectiona (None, 356, 1024) 2314240 l) time_distributed (TimeDistr (None, 356, 1024) 0 ibuted) bidirectional_1 (Bidirectio (None, 512) 2623488 nal) dropout_1 (Dropout) (None, 512) 0 dense (Dense) (None, 512) 262656 batch_normalization (BatchN (None, 512) 2048 ormalization) activation (Activation) (None, 512) 0 dropout_2 (Dropout) (None, 512) 0 dense_1 (Dense) (None, 512) 262656 dense_2 (Dense) (None, 6) 3078 ================================================================= Total params: 5,468,166 Trainable params: 5,467,142 Non-trainable params: 1,024 _________________________________________________________________
Training
# Load pre-trained weights
# The two returned flags decide below whether the CSV log is appended to or overwritten
LOAD_TRAINED_WEIGHTS, SUCCESSFUL_WEIGHT_LOAD, model_1 = load_trained_weights(LOAD_TRAINED_WEIGHTS=True, model=model_1, path='./models/acoustic.h5')
# Training is switched off here, so the notebook relies on the weights loaded above; set to True to (re)train
TRAIN_MODEL = False
if TRAIN_MODEL:
    # Define file to store checkpoint
    BACKUP_FILE = os.path.join(BACKUP_DIR, 'acoustic.h5')
    # Callbacks
    model_callbacks = get_callbacks(BACKUP_FILE)
    # Append to the existing log only when training continues from successfully loaded weights
    logCSV = CSVLogger(filename='logs/log_acoustic.csv', separator=',', append=(LOAD_TRAINED_WEIGHTS & SUCCESSFUL_WEIGHT_LOAD))
    model_callbacks.append(logCSV)
    # Train model and save history
    # NOTE(review): the loader already yields batches of BATCH_SIZE; Keras documents that batch_size should
    # not be passed for keras.utils.Sequence inputs, and workers is documented as a worker count --
    # confirm that batch_size and workers = -1 behave as intended on the installed version
    model_1.fit(x = acoustic_train_loader,
                batch_size = BATCH_SIZE,
                epochs = EPOCHS,
                validation_data = acoustic_valid_loader,
                shuffle = True,
                workers = -1,
                use_multiprocessing = True,
                callbacks=model_callbacks)
Best weights loaded successfully!
Plotting
# Plot the training history of the acoustic model; the log file must already exist (e.g. from an earlier training run)
plot_keras_log('logs/log_acoustic.csv')
Dataloaders
# Visual (action unit) loaders: only the training loader shuffles, so validation and test batches keep the order of their labels
visual_train_loader = BatchData(X=au_train_standardized, y=labels_train_one_hot, n_classes=OUTPUT_SIZE, batch_size=BATCH_SIZE, shuffle=True)
visual_valid_loader = BatchData(X=au_valid_standardized, y=labels_valid_one_hot, n_classes=OUTPUT_SIZE, batch_size=BATCH_SIZE, shuffle=False)
visual_test_loader = BatchData(X=au_test_standardized, y=labels_test_one_hot, n_classes=OUTPUT_SIZE, batch_size=BATCH_SIZE, shuffle=False)
Model
# Visual model: one sample is AU_TIMESTEPS_95_QUANTILE timesteps of NUM_AUS features
visual_input_shape = (AU_TIMESTEPS_95_QUANTILE, NUM_AUS)
model_2 = get_model(name='Visual', input_shape=visual_input_shape, output_size=OUTPUT_SIZE)
model_2.summary()
Model: "Visual" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= masking_1 (Masking) (None, 105, 17) 0 bidirectional_2 (Bidirectio (None, 105, 1024) 2170880 nal) time_distributed_1 (TimeDis (None, 105, 1024) 0 tributed) bidirectional_3 (Bidirectio (None, 512) 2623488 nal) dropout_4 (Dropout) (None, 512) 0 dense_3 (Dense) (None, 512) 262656 batch_normalization_1 (Batc (None, 512) 2048 hNormalization) activation_1 (Activation) (None, 512) 0 dropout_5 (Dropout) (None, 512) 0 dense_4 (Dense) (None, 512) 262656 dense_5 (Dense) (None, 6) 3078 ================================================================= Total params: 5,324,806 Trainable params: 5,323,782 Non-trainable params: 1,024 _________________________________________________________________
Training
# Load pre-trained weights
# The two returned flags decide below whether the CSV log is appended to or overwritten
LOAD_TRAINED_WEIGHTS, SUCCESSFUL_WEIGHT_LOAD, model_2 = load_trained_weights(LOAD_TRAINED_WEIGHTS=True, model=model_2, path='./models/visual.h5')
# Training is switched off here, so the notebook relies on the weights loaded above; set to True to (re)train
TRAIN_MODEL = False
if TRAIN_MODEL:
    # Define file to store checkpoint
    BACKUP_FILE = os.path.join(BACKUP_DIR, 'visual.h5')
    # Callbacks
    model_callbacks = get_callbacks(BACKUP_FILE)
    # Append to the existing log only when training continues from successfully loaded weights
    logCSV = CSVLogger(filename='logs/log_visual.csv', separator=',', append=(LOAD_TRAINED_WEIGHTS & SUCCESSFUL_WEIGHT_LOAD))
    model_callbacks.append(logCSV)
    # Train model
    # NOTE(review): the loader already yields batches of BATCH_SIZE; Keras documents that batch_size should
    # not be passed for keras.utils.Sequence inputs, and workers is documented as a worker count --
    # confirm that batch_size and workers = -1 behave as intended on the installed version
    model_2.fit(x = visual_train_loader,
                batch_size = BATCH_SIZE,
                epochs = EPOCHS,
                validation_data = visual_valid_loader,
                shuffle = True,
                workers = -1,
                use_multiprocessing = True,
                callbacks=model_callbacks)
Best weights loaded successfully!
Plotting
# Plot the training history of the visual model; the log file must already exist (e.g. from an earlier training run)
plot_keras_log('logs/log_visual.csv')
class LateFusionModel():
    ''' Late-fusion ensemble: the class probabilities predicted by an acoustic model and by a visual model are averaged into one prediction '''

    def __init__(self, acoustic_model, visual_model):
        super().__init__()
        self.acoustic_model, self.visual_model = acoustic_model, visual_model

    def predict_proba(self, acoustic_loader, visual_loader):
        ''' Returns the element-wise mean of both models' predicted probabilities '''
        # Each model scores its own modality; the element-wise sum requires both outputs
        # to line up sample by sample
        per_modality = (
            self.acoustic_model.predict(acoustic_loader),
            self.visual_model.predict(visual_loader),
        )
        return np.add(*per_modality) / 2

    def predict(self, acoustic_loader, visual_loader):
        ''' Returns, for every sample, the index of the class with the highest averaged probability '''
        averaged = self.predict_proba(acoustic_loader, visual_loader)
        return np.argmax(averaged, axis=1)
# Late fusion wraps the two unimodal networks built above; it only averages their predictions
model_3 = LateFusionModel(model_1, model_2)
Data Preparation
def fuse_data(mfcc_data, au_data, timesteps):
    '''
    Take the unstandardized MFCC and Action Unit data and merge them with the following procedure:
    1 - Enlarge the Action Unit feature vector using interpolation to estimate intermediary states, such that it matches the timesteps of the MFCC feature vector
    2 - Concatenate both feature vectors timestep-wise
    3 - Standardize the number of timesteps across all samples to `timesteps` (truncating longer samples, zero-padding shorter ones)

    Args:
        mfcc_data: Sequence of per-sample MFCC arrays, each indexed (timesteps, features).
        au_data: Sequence of per-sample Action Unit arrays, aligned one-to-one with `mfcc_data`.
        timesteps: Number of timesteps every fused sample is cut or padded to.

    Returns:
        Array of shape (samples, timesteps, mfcc features + action unit features).

    Raises:
        ValueError: If the two inputs do not contain the same number of samples.
    '''
    # Imported from the public namespace: scipy.ndimage.interpolation is deprecated
    from scipy.ndimage import zoom

    if len(mfcc_data) != len(au_data):
        raise ValueError(f'mfcc_data and au_data must have the same number of samples, got {len(mfcc_data)} and {len(au_data)}')

    fused_feature_vector_size = len(mfcc_data[0][0]) + len(au_data[0][0])
    # Zero-initialised so that everything past a sample's own length is already padding
    fused = np.zeros(shape=(len(mfcc_data), timesteps, fused_feature_vector_size))

    for idx, (mfcc_sample, au_sample) in enumerate(zip(mfcc_data, au_data)):
        # Stretch only the time axis (the feature axis keeps factor 1), with linear interpolation
        time_stretch = mfcc_sample.shape[0] / au_sample.shape[0]
        au_interpolated = zoom(au_sample, (time_stretch, 1), order=1)
        concatenated = np.concatenate((mfcc_sample, au_interpolated), axis=-1)

        # Standardize timesteps
        kept = min(len(concatenated), timesteps)
        fused[idx, :kept] = concatenated[:kept]

    return fused
# Apply function and fuse data
# Fused samples follow the MFCC time axis, so the MFCC length is the standardization target
fused_train, fused_valid, fused_test = (
    fuse_data(mfcc_split, au_split, MFCC_TIMESTEPS_95_QUANTILE)
    for mfcc_split, au_split in ((mfcc_train, au_train), (mfcc_valid, au_valid), (mfcc_test, au_test))
)
Dataloaders
# Multimodal (fused MFCC + action unit) loaders: only the training loader shuffles, so validation and test batches keep the order of their labels
multimodal_train_loader = BatchData(X=fused_train, y=labels_train_one_hot, n_classes=OUTPUT_SIZE, batch_size=BATCH_SIZE, shuffle=True)
multimodal_valid_loader = BatchData(X=fused_valid, y=labels_valid_one_hot, n_classes=OUTPUT_SIZE, batch_size=BATCH_SIZE, shuffle=False)
multimodal_test_loader = BatchData(X=fused_test, y=labels_test_one_hot, n_classes=OUTPUT_SIZE, batch_size=BATCH_SIZE, shuffle=False)
Model
# Early-fusion model: each timestep carries the MFCC features followed by the action unit features
multimodal_input_shape = (MFCC_TIMESTEPS_95_QUANTILE, NUM_MFCCS + NUM_AUS)
model_4 = get_model(name='Multimodal', input_shape=multimodal_input_shape, output_size=OUTPUT_SIZE)
model_4.summary()
Model: "Multimodal" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= masking_2 (Masking) (None, 356, 69) 0 bidirectional_4 (Bidirectio (None, 356, 1024) 2383872 nal) time_distributed_2 (TimeDis (None, 356, 1024) 0 tributed) bidirectional_5 (Bidirectio (None, 512) 2623488 nal) dropout_7 (Dropout) (None, 512) 0 dense_6 (Dense) (None, 512) 262656 batch_normalization_2 (Batc (None, 512) 2048 hNormalization) activation_2 (Activation) (None, 512) 0 dropout_8 (Dropout) (None, 512) 0 dense_7 (Dense) (None, 512) 262656 dense_8 (Dense) (None, 6) 3078 ================================================================= Total params: 5,537,798 Trainable params: 5,536,774 Non-trainable params: 1,024 _________________________________________________________________
Training
# Load pre-trained weights
# The two returned flags decide below whether the CSV log is appended to or overwritten
LOAD_TRAINED_WEIGHTS, SUCCESSFUL_WEIGHT_LOAD, model_4 = load_trained_weights(LOAD_TRAINED_WEIGHTS=True, model=model_4, path='./models/multimodal.h5')
# Training is switched off here, so the notebook relies on the weights loaded above; set to True to (re)train
TRAIN_MODEL = False
if TRAIN_MODEL:
    # Define file to store checkpoint
    BACKUP_FILE = os.path.join(BACKUP_DIR, 'multimodal.h5')
    # Callbacks
    model_callbacks = get_callbacks(BACKUP_FILE)
    # Append to the existing log only when training continues from successfully loaded weights
    logCSV = CSVLogger(filename='logs/log_multimodal.csv', separator=',', append=(LOAD_TRAINED_WEIGHTS & SUCCESSFUL_WEIGHT_LOAD))
    model_callbacks.append(logCSV)
    # Train model
    # NOTE(review): the loader already yields batches of BATCH_SIZE; Keras documents that batch_size should
    # not be passed for keras.utils.Sequence inputs, and workers is documented as a worker count --
    # confirm that batch_size and workers = -1 behave as intended on the installed version
    model_4.fit(x = multimodal_train_loader,
                batch_size = BATCH_SIZE,
                epochs = EPOCHS,
                validation_data = multimodal_valid_loader,
                shuffle = True,
                workers = -1,
                use_multiprocessing = True,
                callbacks=model_callbacks)
Best weights loaded successfully!
Plotting
# Plot the training history of the multimodal model; the log file must already exist (e.g. from an earlier training run)
plot_keras_log('logs/log_multimodal.csv')
class Plotter():
    ''' Receives three inputs, all of which must match in the NN's output dimension, which represents an array with length equal to the number of classes:
    1 - True labels as one-hot encodings
    2 - Predicted probabilities for each class
    3 - Class names
    Provides functionality for plotting: Confusion Matrix, Precision-Recall Curve, Micro (Global) ROC, and Class-Stratified ROC.
    Provides the following metrics: F1 Score (Macro), Accuracy, Balanced Accuracy
    Every plot_* method draws on the current matplotlib axes; only plot_classification_results creates its own figure.
    '''
    def __init__(self, true_onehot, pred_probs, class_names):
        self.true_onehot = np.asarray(true_onehot)
        self.pred_probs = np.asarray(pred_probs)
        self.class_names = np.asarray(class_names)
        self.num_classes = len(class_names)
        # Get predictions from the probabilities
        # argmax over the class axis gives a class index, which is then mapped to its name
        self.pred_labels = self.class_names[np.argmax(pred_probs, axis=1)]
        self.true_labels = self.class_names[np.argmax(true_onehot, axis=1)]
        # Metrics
        self.f1_score = f1_score(self.true_labels, self.pred_labels, average='macro')
        self.accuracy = accuracy_score(self.true_labels, self.pred_labels)
        self.balanced_accuracy = balanced_accuracy_score(self.true_labels, self.pred_labels)
        # Dictionaries to store data
        # All of them are keyed by class index
        self.fpr, self.tpr, self.roc_thresholds, self.roc_auc = dict(), dict(), dict(), dict()
        self.precision, self.recall, self.prc_thresholds, self.prc_auc = dict(), dict(), dict(), dict()
        # Iterate through classes and get class-stratified metrics
        # Each class is scored one-vs-rest: its one-hot column against its probability column
        for i, label in enumerate(self.class_names):
            # Get the fpr, tpr, thresholds and auc
            self.fpr[i], self.tpr[i], self.roc_thresholds[i] = roc_curve(self.true_onehot[:, i], self.pred_probs[:, i])
            self.roc_auc[i] = auc(self.fpr[i], self.tpr[i])
            # Get the precision, recall, thresholds and auc
            self.precision[i], self.recall[i], self.prc_thresholds[i] = precision_recall_curve(self.true_onehot[:, i], self.pred_probs[:, i])
            self.prc_auc[i] = auc(self.recall[i], self.precision[i])
        # Generate multilabel colors with pylab
        # One colour per class, sampled at evenly spaced positions of the colormap
        self.class_colors = []
        cm = pylab.get_cmap('nipy_spectral')
        for i, label in enumerate(self.class_names):
            color = cm(1.*i/self.num_classes)
            self.class_colors.append(color)

    def plot_confusion_matrix(self):
        ''' Draws the confusion matrix (rows: true emotion, columns: predicted emotion) as an annotated heatmap '''
        conf_mat = confusion_matrix(self.true_labels, self.pred_labels, labels=self.class_names)
        sns.heatmap(conf_mat, annot=True, cmap='Blues', xticklabels=self.class_names, yticklabels=self.class_names, square=True, cbar=False, fmt='g')
        plt.title('Confusion Matrix', pad = 20, fontweight='bold')
        plt.ylabel('True Emotion', fontsize = 12, labelpad = 10)
        plt.xlabel('Predicted Emotion', fontsize = 12, labelpad = 10)

    def plot_prec_recall_curve(self):
        ''' Draws one precision-recall curve per class, with the area under each curve in the legend '''
        # Plot the class-stratified PRCs
        plt.axis('square')
        for i, color in enumerate(self.class_colors):
            plt.plot(self.recall[i], self.precision[i], color=color, lw=3, label=f'{self.class_names[i]} (area = {self.prc_auc[i]:.2f})', alpha=0.7)
        # Dashed reference diagonal from (1, 0) to (0, 1)
        plt.plot([1, 0], [0, 1], 'k--', lw=2, alpha=0.3)
        plt.xlim([-0.01, 1.01])
        plt.ylim([-0.01, 1.01])
        plt.xlabel('Recall', fontsize = 12, labelpad = 10)
        plt.ylabel('Precision', fontsize = 12, labelpad = 10)
        plt.title('Precision-Recall Curve', pad = 20, fontweight='bold')
        plt.legend(loc="best")

    def plot_roc_overall(self):
        ''' Draws a single ROC curve computed over all (sample, class) pairs at once '''
        # Compute global (micro-average) ROC curve and ROC area -- sensitive to class imbalance
        # Flattening turns every (sample, class) entry into one binary decision
        fpr, tpr, thresholds = roc_curve(self.true_onehot.ravel(), self.pred_probs.ravel())
        roc_auc = auc(fpr, tpr)
        # Plot the model overall ROC
        plt.axis('square')
        plt.plot(fpr, tpr, color='deeppink', lw=5, label=f'MODEL OVERALL (area = {roc_auc:.2f})',)
        # Dashed diagonal: reference line where the true and false positive rates are equal
        plt.plot([0, 1], [0, 1], 'k--', lw=2, alpha=0.3)
        plt.xlim([-0.01, 1.01])
        plt.ylim([-0.01, 1.01])
        plt.xlabel('False Positive Rate', fontsize = 12, labelpad = 10)
        plt.ylabel('True Positive Rate', fontsize = 12, labelpad = 10)
        plt.title('Overall ROC Curve', pad = 20, fontweight='bold')
        plt.legend(loc="lower right")

    def plot_roc_multiclass(self):
        ''' Draws one ROC curve per class, with the area under each curve in the legend '''
        # Plot the class-stratified ROCs
        plt.axis('square')
        for i, color in enumerate(self.class_colors):
            plt.plot(self.fpr[i], self.tpr[i], color=color, lw=3, label=f'{self.class_names[i]} (area = {self.roc_auc[i]:.2f})', alpha=0.7)
        # Dashed diagonal: reference line where the true and false positive rates are equal
        plt.plot([0, 1], [0, 1], 'k--', lw=2, alpha=0.3)
        plt.xlim([-0.01, 1.01])
        plt.ylim([-0.01, 1.01])
        plt.xlabel('False Positive Rate', fontsize = 12, labelpad = 10)
        plt.ylabel('True Positive Rate', fontsize = 12, labelpad = 10)
        plt.title('Multi-Class ROC Curve', pad = 20, fontweight='bold')
        plt.legend(loc="lower right")

    def plot_classification_results(self):
        ''' Shows all four plots in one 2x2 figure '''
        fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(16,10))
        # Each plot_* method draws on the current axes, so select the target panel first
        fig.sca(axs[0][0])
        self.plot_confusion_matrix()
        fig.sca(axs[0][1])
        self.plot_prec_recall_curve()
        fig.sca(axs[1][0])
        self.plot_roc_overall()
        fig.sca(axs[1][1])
        self.plot_roc_multiclass()
        fig.tight_layout(h_pad=3, w_pad=-30)
        plt.show()
# Generate probabilities with each model
test_probas_acoustic = model_1.predict(acoustic_test_loader)
test_probas_visual = model_2.predict(visual_test_loader)
test_probas_late = model_3.predict_proba(acoustic_test_loader, visual_test_loader)
test_probas_early = model_4.predict(multimodal_test_loader)

# Instantiate plotter classes with the predictions from each model
# Every model is scored against the same one-hot test labels
plotter_acoustic, plotter_visual, plotter_late, plotter_early = (
    Plotter(labels_test_one_hot, probas, class_names)
    for probas in (test_probas_acoustic, test_probas_visual, test_probas_late, test_probas_early)
)

# Aggregate
# `modalities` and `plotters` share the same order, so they can be zipped for plotting
modalities = ['Acoustic', 'Visual', 'Late Fusion', 'Early Fusion']
plotters = [plotter_acoustic, plotter_visual, plotter_late, plotter_early]
Macro F1 Score
# Plot
fig, ax = plt.subplots(figsize=(10,5))
# Barplot
# One bar per modality, in the order of `plotters`
scores = [plotter.f1_score for plotter in plotters]
ax.bar(x=modalities, height=scores, color='slategray')
# Formatting
plt.title('F1 Score (Macro)', size=25)
plt.ylim([0, 1])
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
# Scores are fractions of 1, so the axis is labelled in percent
ax.yaxis.set_major_formatter(mtick.PercentFormatter(1, decimals=0))
ax.locator_params(axis ='y', nbins=6)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
# Add values above bars
# The -0.2 / +0.01 offsets shift each label left of the bar's x position and slightly above its top
for x_index, score in enumerate(scores):
    ax.text(x_index-0.2, score+0.01, f'{100*score:.1f} %', color='black', size=20)
# Show
plt.show()
Accuracy
# Plot
fig, ax = plt.subplots(figsize=(10,5))
# Barplot
# One bar per modality, in the order of `plotters`
scores = [plotter.accuracy for plotter in plotters]
ax.bar(x=modalities, height=scores, color='slategray')
# Formatting
plt.title('Accuracy', size=25)
plt.ylim([0, 1])
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
# Scores are fractions of 1, so the axis is labelled in percent
ax.yaxis.set_major_formatter(mtick.PercentFormatter(1, decimals=0))
ax.locator_params(axis ='y', nbins=6)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
# Add values above bars
# The -0.2 / +0.01 offsets shift each label left of the bar's x position and slightly above its top
for x_index, score in enumerate(scores):
    ax.text(x_index-0.2, score+0.01, f'{100*score:.1f} %', color='black', size=20)
# Show
plt.show()
Balanced Accuracy
# Plot
fig, ax = plt.subplots(figsize=(10,5))
# Barplot
# One bar per modality, in the order of `plotters`
scores = [plotter.balanced_accuracy for plotter in plotters]
ax.bar(x=modalities, height=scores, color='slategray')
# Formatting
plt.title('Balanced Accuracy', size=25)
plt.ylim([0, 1])
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
# Scores are fractions of 1, so the axis is labelled in percent
ax.yaxis.set_major_formatter(mtick.PercentFormatter(1, decimals=0))
ax.locator_params(axis ='y', nbins=6)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
# Add values above bars
# The -0.2 / +0.01 offsets shift each label left of the bar's x position and slightly above its top
for x_index, score in enumerate(scores):
    ax.text(x_index-0.2, score+0.01, f'{100*score:.1f} %', color='black', size=20)
# Show
plt.show()
Confusion Matrix
# Confusion Matrices
# One panel per model, side by side
fig, axs = plt.subplots(nrows=1, ncols=4, figsize=(20,10))
for panel, plotter, modality in zip(axs, plotters, modalities):
    # Make the panel current, draw on it, then replace the generic title with the modality name
    fig.sca(panel)
    plotter.plot_confusion_matrix()
    plt.title(modality, size=20, pad=20)
# Show
fig.subplots_adjust(wspace=0.3)
plt.show()
Receiver Operating Characteristic (ROC) - Global
# ROC-AUC Overall
# One panel per model, side by side
fig, axs = plt.subplots(nrows=1, ncols=4, figsize=(20,10))
for panel, plotter, modality in zip(axs, plotters, modalities):
    # Make the panel current, draw on it, then replace the generic title with the modality name
    fig.sca(panel)
    plotter.plot_roc_overall()
    plt.title(modality, size=20, pad=20)
# Show
fig.subplots_adjust(wspace=0.3)
plt.show()
Receiver Operating Characteristic (ROC) - Stratified
# ROC-AUC Class Stratified
# One panel per model, side by side
fig, axs = plt.subplots(nrows=1, ncols=4, figsize=(20,10))
for panel, plotter, modality in zip(axs, plotters, modalities):
    # Make the panel current, draw on it, then replace the generic title with the modality name
    fig.sca(panel)
    plotter.plot_roc_multiclass()
    plt.title(modality, size=20, pad=20)
# Show
fig.subplots_adjust(wspace=0.3)
plt.show()
Precision-Recall Curve (PRC) - Stratified
# Precision-Recall Curve
# One panel per model, side by side
fig, axs = plt.subplots(nrows=1, ncols=4, figsize=(20,10))
for panel, plotter, modality in zip(axs, plotters, modalities):
    # Make the panel current, draw on it, then replace the generic title with the modality name
    fig.sca(panel)
    plotter.plot_prec_recall_curve()
    plt.title(modality, size=20, pad=20)
# Show
fig.subplots_adjust(wspace=0.3)
plt.show()
All metrics agreed on each model's relative performance, ranking the models from best to worst as: Early Fusion, Late Fusion, Visual, Acoustic.
From this ranking the first obvious observation is that multimodal features can improve classification performance, given that both models using multimodal features outperform the two other models built on unimodal features.
Early fusion outperforms late fusion, which is what I would expect, given that early fusion allows the model to learn about co-dependent effects between acoustic and visual features. Interestingly enough, even though there is a significant performance gap between acoustic and visual, all other models which in one way or another consider visual features have only incremental performance gains. That is to say, visual features seem to be the most useful, and they can be made slightly better with the addition of acoustic features, especially under early fusion.
Possibly one reason for such a small gap is my choice to keep the model architecture fixed between all modalities, so as to be fair. Yet, given that it is reasonable to expect more room for learning nuances when working with multimodal data, such a model would potentially benefit from a deeper or more complex architecture. This is to say we might be able to extract better performance from the early fusion model, and thus see its performance margin widen, by optimizing the model architecture.
Matheus Schmitz
LinkedIn
Github Portfolio