Creating a multilabel multitarget dataset from TED Talks transcripts
# Package to store the versions of packages used
!pip install -q watermark
# Imports
# Data manipulation and visualization
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from pylab import rcParams
from matplotlib import rc
from tqdm.notebook import tqdm
import datetime
from time import time
import ast
%matplotlib inline
df_main = pd.read_csv('https://raw.githubusercontent.com/Matheus-Schmitz/TED_Talks_Data_Analysis/master/ted_main.csv')
df_main.head(2)
df_transcript = pd.read_csv('https://raw.githubusercontent.com/Matheus-Schmitz/TED_Talks_Data_Analysis/master/transcripts.csv')
df_transcript.head()
df = pd.merge(left=df_main, right=df_transcript, how='left', left_on='url', right_on='url')
df.dropna(inplace=True)
df.reset_index(inplace=True, drop=True)
df['transcript'] = df['transcript'].fillna('')
df['wc'] = df['transcript'].apply(lambda x: len(x.split()))
# Longest transcript
max(df.wc)
df.head(2)
sns.set_style("whitegrid")
plt.figure(figsize=(25,5))
# distplot is deprecated in recent seaborn releases; histplot/displot is the modern equivalent
sns.distplot(df.wc)
# Checking different percentiles
print(f' 1st percentile: {int(np.percentile(df.wc, 1))} words')
print(f' 5th percentile: {int(np.percentile(df.wc, 5))} words')
print(f'10th percentile: {int(np.percentile(df.wc, 10))} words')
print(f'50th percentile: {int(np.percentile(df.wc, 50))} words')
print(f'90th percentile: {int(np.percentile(df.wc, 90))} words')
print(f'95th percentile: {int(np.percentile(df.wc, 95))} words')
print(f'99th percentile: {int(np.percentile(df.wc, 99))} words')
# Basic text cleaning (could be improved in a later revision)
# The period must be escaped, otherwise '.' acts as a regex wildcard
df.transcript.replace(r'\.', '', regex=True, inplace=True)
df.transcript.replace(',', '', regex=True, inplace=True)
df.transcript.replace('/', '', regex=True, inplace=True)
df.transcript.replace('"', '', regex=True, inplace=True)
df.head(2)
# Pandas reads the 'tags' column as plain strings rather than lists of strings
# Using the ast package to parse that column properly
df['tags'] = df['tags'].apply(lambda x: ast.literal_eval(x))
type(df.tags[0][0])
df.head(2)
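A quick illustration with a made-up tag string (not taken from the dataset) of what ast.literal_eval does here:
raw = "['science', 'technology', 'education']"   # how the list arrives from the CSV
parsed = ast.literal_eval(raw)
print(type(raw), type(parsed), parsed[0])        # <class 'str'> <class 'list'> science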
all_tags = df.apply(lambda x: pd.Series(x['tags']),axis=1).stack().reset_index(level=1, drop=True)
unique_tags = np.unique(all_tags.values)
len(unique_tags)
tag_count = all_tags.value_counts()
tag_count.head(10)
# Number of tags that appear in at least 200 talks
len(tag_count[tag_count >= 200])
tag_filter = tag_count[tag_count >= 200].index
tag_filter
# Create one empty column per frequent tag
for tag_name in tag_filter:
    df[f'{tag_name}'] = pd.Series(dtype='float64')
df.columns
# One-hot encode the tags: mark 1 wherever a talk carries that tag
for index, row in tqdm(df.iterrows()):
    for col_name in df.columns:
        if col_name in row.tags:
            df.at[index, col_name] = int(1)
df.head(3)
# Split each transcript into consecutive 128-word chunks,
# matching the 128-token input length used for BERT below
def get_split(text1):
    l_total = []
    l_parcial = []
    n = (len(text1.split()) // 128) + 1
    for w in range(n):
        l_parcial = text1.split()[w*128 : w*128 + 128]
        l_total.append(" ".join(l_parcial))
    return l_total
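A small sanity check on an artificial text (not a real transcript) to show how the split behaves:
# 300 'words' should produce three chunks of 128, 128 and 44 words
sample_text = " ".join(['word'] * 300)
chunks = get_split(sample_text)
print(len(chunks), [len(c.split()) for c in chunks])   # 3 [128, 128, 44]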
df_input = pd.DataFrame()
df_input['text_split'] = df['transcript'].apply(get_split)
df_splits = df_input.text_split.apply(pd.Series)
df_labels = df.iloc[:, 19:]
df_labels
df_merged = pd.merge(df_splits, df_labels, left_index=True, right_index=True)
df_merged
df_melted = df_merged.melt(id_vars=tag_filter)
df_melted.shape
df_drops = df_melted.dropna(subset=['value'])
df_drops.drop(labels=['variable'], axis=1, inplace=True)
df_drops.reset_index(inplace=True, drop=True)
df_drops.shape
df_drops.tail(20)
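To make the reshape above easier to follow, here is a minimal sketch with a hypothetical two-talk frame (two tags, at most two chunks per talk); the real frame just has more of everything:
toy = pd.DataFrame({0: ['chunk a1', 'chunk b1'],    # first 128-word chunk of each talk
                    1: ['chunk a2', None],          # second chunk (talk b has none)
                    'science': [1, None],
                    'art': [None, 1]})
toy_long = toy.melt(id_vars=['science', 'art'])     # one row per (talk, chunk) pair
toy_long = toy_long.dropna(subset=['value'])        # drop chunks that do not exist
print(toy_long)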
# First 13 columns are labels
# Last column is text (input)
df_final = df_drops.fillna(0)
df_final.head(1)
# Could consider also dropping rows which have no associated theme
# File manipulation imports for Google Colab
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir("/content/drive/My Drive/Colab Notebooks/TED_Talks_Data_Analysis")
df_final.to_csv("/content/drive/My Drive/Colab Notebooks/TED_Talks_Data_Analysis/df_final.csv", index=False)
Using TED Talks transcripts to predict the topic being presented
# File manipulation imports for Google Colab
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir("/content/drive/My Drive/Colab Notebooks/TED_Talks_Data_Analysis")
# Package to store the versions of packages used
!pip install -q watermark
# Package to download the BERT models and process data
!pip install -q transformers
# Imports
# Data manipulation and visualization
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from pylab import rcParams
from matplotlib import rc
from tqdm.notebook import tqdm
import datetime
from time import time
import random
import pylab
# Sklearn
import sklearn
from sklearn.utils import shuffle
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import roc_curve, auc, precision_recall_curve
from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix  # plot_confusion_matrix was removed in recent sklearn and is not used here
# Deep Learning, NLP and metrics
import torch
import transformers
from textwrap import wrap
from torch import nn, optim
from torch.utils import data
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from transformers import BertModel
from transformers import BertTokenizer
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
%matplotlib inline
# Package versions
%reload_ext watermark
%watermark -v -iv
df = pd.read_csv('df_final.csv')
df.shape
df = shuffle(df)
df.tail(1)
# Tokenizer download
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-cased')
# Model Hyperparameters
EPOCHS = 10
BATCH_SIZE = 16
MAX_LENGTH = 128
LEARNING_RATE = 0.00002
CLASSIFICATION_THRESHOLD = 0.2
# 2e-5 is one of the fine-tuning learning rates recommended for BERT; other values caused problems when training the classification head on top
class DataBatcher(data.Dataset):

    # Constructor
    def __init__(self, review, targets, tokenizer, max_len):

        # Shuffle reviews and targets together so each review keeps its labels
        tmp = list(zip(review, targets))
        random.shuffle(tmp)
        review, targets = zip(*tmp)

        # Initialize class attributes
        self.review = review
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.review)

    # Method to obtain each review
    def __getitem__(self, item):

        # Load a review
        review = str(self.review[item])

        # Create the review encoding
        encoding = self.tokenizer.encode_plus(review,
                                              max_length = self.max_len,
                                              truncation = True,
                                              add_special_tokens = True,
                                              pad_to_max_length = True,
                                              return_attention_mask = True,
                                              return_token_type_ids = False,
                                              return_tensors = 'pt')

        # Among the returned fields are the input ids and the attention mask
        return {'review_text': review,
                'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten(),
                'targets': torch.tensor(self.targets[item], dtype = torch.long)}
# This function creates a data loader to convert the dataset to the BERT input format
# (returns a torch.utils.data.dataloader.DataLoader)
def create_data_loader(df, tokenizer, max_len, batch_size):
    ds = DataBatcher(review = df.value.to_numpy(),
                     targets = df.iloc[:, :-1].to_numpy(),
                     tokenizer = tokenizer,
                     max_len = max_len)
    return data.DataLoader(ds, batch_size = batch_size, num_workers = 4)
# Taking only a fraction of the dataset for experimentation purposes
# Using the whole dataset obviously improves performance, but training takes way too long
# Each epoch takes roughly 1 minute for every 3000 rows,
# so total training time in minutes is approximately EPOCHS * (ROWS / 3000)
df = df[0:3000]
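Plugging the numbers into that rule of thumb (a rough estimate from my runs, not a guarantee):
# With 3000 rows and 10 epochs: 10 * (3000 / 3000) = ~10 minutes of training
estimated_minutes = EPOCHS * (len(df) / 3000)
print(f'Estimated training time: ~{estimated_minutes:.0f} minutes')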
# Train test split
df_train, df_test = train_test_split(df, test_size = 0.2)
# Test validation split
df_valid, df_test = train_test_split(df_test, test_size = 0.5)
print(f'df_train.shape: {df_train.shape}')
print(f'df_test.shape: {df_test.shape}')
print(f'df_valid.shape: {df_valid.shape}')
# Total number of binary predictions (rows * label columns), used as the accuracy denominator
total_preds_df_train = df_train.shape[0] * (df_train.shape[1] -1)
total_preds_df_test = df_test.shape[0] * (df_test.shape[1] -1)
total_preds_df_valid = df_valid.shape[0] * (df_valid.shape[1] -1)
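To make the denominator concrete: every row gets one independent yes/no prediction per label column, so with the 13 label columns built in the previous notebook the 2400/300/300 split above works out to roughly 2400 * 13 = 31,200 train predictions and 300 * 13 = 3,900 each for test and validation.
print(total_preds_df_train, total_preds_df_test, total_preds_df_valid)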
# Load the data_loaders
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LENGTH, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LENGTH, BATCH_SIZE)
valid_data_loader = create_data_loader(df_valid, tokenizer, MAX_LENGTH, BATCH_SIZE)
# Visualize a sample on the training data
sample = next(iter(train_data_loader))
print(sample['input_ids'].shape)
print(sample['attention_mask'].shape)
print(sample['targets'].shape)
# Loading the pre-trained BERT model
model_bert = BertModel.from_pretrained('bert-base-cased')
class SentimentClassifier(nn.Module):

    # Constructor
    def __init__(self, n_classes):

        # Initialize attributes
        super(SentimentClassifier, self).__init__()

        # Define the pre-trained BERT model
        self.bert = BertModel.from_pretrained('bert-base-cased')

        # Add a dropout layer
        self.drop1 = nn.Dropout(p=0.25)

        # Add a hidden layer
        self.fc1 = nn.Linear(self.bert.config.hidden_size, 128)

        # Add the output layer (one unit per class)
        self.fc2 = nn.Linear(128, n_classes)

        # Add a dropout layer
        self.drop2 = nn.Dropout(p=0.25)

        # Final classification with sigmoid (independent probability per label)
        self.sigmoid = nn.Sigmoid()

    # Forward method
    def forward(self, input_ids, attention_mask):

        # Get the pooled [CLS] output from BERT
        # (return_dict=False keeps the tuple output on recent transformers versions)
        _, pooled_output = self.bert(input_ids = input_ids,
                                     attention_mask = attention_mask,
                                     return_dict = False)

        # Pass the pooled output through the added layers
        output = self.drop1(pooled_output)
        output = self.fc1(output)
        output = self.fc2(output)
        output = self.drop2(output)
        output = self.sigmoid(output)

        # Return
        return output
# Setting the device to GPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device
class_names = df.columns.drop('value')
class_names
# Create instance of the model
model_sentiment_classifier = SentimentClassifier(len(class_names))
# Send model to the device
model_sentiment_classifier = model_sentiment_classifier.to(device)
# The original BERT implementation uses AdamW: Adam with decoupled weight decay
optimizer = AdamW(model_sentiment_classifier.parameters(), lr = LEARNING_RATE, correct_bias = False)
# Defining the total number of steps
total_step = len(train_data_loader) * EPOCHS
# Adjust the learning rate
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_step)
# Loss function
loss_fn = nn.BCELoss().to(device)
#loss_fn = nn.BCEWithLogitsLoss().to(device)
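For reference, a minimal sketch with made-up logits of why the commented-out alternative exists: the model already applies a sigmoid, so BCELoss on probabilities is used here; BCEWithLogitsLoss would only apply if the sigmoid were removed from forward(), and the two give the same loss value, the logits version just being more numerically stable.
example_logits = torch.tensor([[0.3, -1.2, 2.0]])
example_target = torch.tensor([[1.0, 0.0, 1.0]])
loss_on_probs  = nn.BCELoss()(torch.sigmoid(example_logits), example_target)
loss_on_logits = nn.BCEWithLogitsLoss()(example_logits, example_target)
print(loss_on_probs.item(), loss_on_logits.item())   # same value either way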
# Define the threshold above which a sample is considered to belong to a class
class_threshold = torch.Tensor([CLASSIFICATION_THRESHOLD]).to(device)
# Tensors with predictions
predict_true = torch.Tensor([1]).to(device)
predict_false = torch.Tensor([0]).to(device)
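A tiny illustration with made-up probabilities of how these tensors turn sigmoid outputs into 0/1 multilabel predictions:
example_probs = torch.tensor([[0.05, 0.35, 0.81]]).to(device)
print(torch.where(example_probs > class_threshold, predict_true, predict_false))   # tensor([[0., 1., 1.]])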
# Train function
def train_model(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):

    # Prepare for training
    model = model.train()
    losses = []
    correct_prediction = 0

    # Loop through the data batches, running a complete deep learning cycle on each
    for d in data_loader:
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        targets = d['targets'].to(device)
        outputs = model(input_ids = input_ids, attention_mask = attention_mask)
        preds = torch.where(outputs > class_threshold, predict_true, predict_false)
        loss = loss_fn(outputs, targets.float())
        correct_prediction += torch.sum(preds == targets)
        losses.append(loss.item())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm = 1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return torch.true_divide(correct_prediction, n_examples), np.mean(losses)
# Evaluate function
def evaluate_model(model, data_loader, loss_fn, device, n_examples):
    model.eval()
    losses = []
    correct_prediction = 0
    with torch.no_grad():
        for d in data_loader:
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            targets = d['targets'].to(device)
            outputs = model(input_ids = input_ids, attention_mask = attention_mask)
            preds = torch.where(outputs > class_threshold, predict_true, predict_false)
            loss = loss_fn(outputs, targets.float())
            correct_prediction += torch.sum(preds == targets)
            losses.append(loss.item())
    return torch.true_divide(correct_prediction, n_examples), np.mean(losses)
%%time

# Store the train history
history = defaultdict(list)

# Track the best accuracy and timestamp the checkpoints
now = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
best_accuracy = 0

# Make sure the checkpoint folder exists before saving
os.makedirs('models', exist_ok=True)

# Loop
for epoch in range(EPOCHS):

    start_time = time()

    print(f'Epoch {epoch+1}/{EPOCHS}')
    print('-' * 10)

    train_acc, train_loss = train_model(model_sentiment_classifier,
                                        train_data_loader,
                                        loss_fn,
                                        optimizer,
                                        device,
                                        scheduler,
                                        total_preds_df_train)

    print(f'Train loss: {train_loss:.5f} | Train accuracy: {train_acc:.5f}')

    valid_acc, valid_loss = evaluate_model(model_sentiment_classifier,
                                           valid_data_loader,
                                           loss_fn,
                                           device,
                                           total_preds_df_valid)

    print(f'Valid loss: {valid_loss:.5f} | Valid accuracy: {valid_acc:.5f}')

    end_time = time()
    print(f'Iteration Time: {end_time - start_time:.2f} seconds')
    print()

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['valid_acc'].append(valid_acc)
    history['valid_loss'].append(valid_loss)

    # Save a checkpoint whenever the validation accuracy improves
    if valid_acc > best_accuracy:
        torch.save(model_sentiment_classifier.state_dict(), f'models/model_sentiment_classifier_{now}.bin')
        best_accuracy = valid_acc
fig, ax = plt.subplots(1, 2, figsize=(16,4))
ax[0].plot(history['train_acc'], label='train')
ax[0].plot(history['valid_acc'], label='valid')
ax[0].set_title('Accuracy')
ax[1].plot(history['train_loss'], label='train')
ax[1].plot(history['valid_loss'], label='valid')
ax[1].set_title('Loss')
plt.legend()
plt.show()
# Create a model instance
model = SentimentClassifier(len(class_names))
# Load the model
model.load_state_dict(torch.load(f'models/model_sentiment_classifier_{now}.bin'))
# Send model to device
model = model.to(device)
# Predicting using test data
test_acc, test_loss = evaluate_model(model, test_data_loader, loss_fn, device, total_preds_df_test)
# Model performance
print(f'Test Accuracy: {test_acc}')
print(f'Test Loss: {test_loss}')
# Function to collect test reviews together with predicted and true labels
def get_reviews(model, data_loader):

    model = model.eval()
    review_texts = []
    predictions = []
    prediction_probs = []
    real_values = []

    with torch.no_grad():
        for d in data_loader:
            texts = d['review_text']
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            targets = d['targets'].to(device)
            outputs = model(input_ids = input_ids, attention_mask = attention_mask)
            preds = torch.where(outputs > class_threshold, predict_true, predict_false)
            review_texts.extend(texts)
            predictions.extend(preds)
            prediction_probs.extend(outputs)
            real_values.extend(targets)

    predictions = torch.stack(predictions).cpu()
    prediction_probs = torch.stack(prediction_probs).cpu()
    real_values = torch.stack(real_values).cpu()

    return review_texts, predictions, prediction_probs, real_values
# Gathering real data
y_review_texts, pred_onehot, pred_probs, true_onehot = get_reviews(model, test_data_loader)
pred_bool = pred_onehot.bool()
# Classification report
print(classification_report(true_onehot, pred_onehot, target_names = class_names))
# Sample the predictions tensor
pred_probs[0:3]
# Checking one review
idx = random.randint(0, len(true_onehot) - 1)
review_text = y_review_texts[idx]
true_sentiment = true_onehot[idx]
pred_df = pd.DataFrame({'class_names': class_names,
                        'values': pred_probs[idx]})
print("\n".join(wrap(review_text)))
print()
print(f'Real Topic: {true_sentiment}')
# Prediction plot
fig = plt.gcf()
fig.set_size_inches(25, 5)
sns.barplot(x = 'values', y = 'class_names', data = pred_df, orient = 'h')
plt.title('Probability Assigned to Each Topic', fontsize=18, fontweight="bold", pad=5)
plt.ylabel('Topic', fontsize=12, fontweight="bold", labelpad=20)
plt.xlabel('Probability', fontsize=12, fontweight="bold", labelpad=5)
plt.xlim([0, 1]);
plt.axvline(class_threshold, label='Classification Threshold', linestyle='dashed')
plt.legend()
# Convert tensors to numpy for usage with sklearn
pred_onehot_np = pred_onehot.numpy().astype('int')
true_onehot_np = true_onehot.numpy().astype('int')
pred_probs_np = pred_probs.numpy()
# Set number of classes
N_CLASSES = len(class_names)
N_CLASSES
# Data for the plots
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(N_CLASSES):
    fpr[i], tpr[i], _ = roc_curve(true_onehot_np[:, i], pred_probs_np[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(true_onehot_np.ravel(), pred_probs_np.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
# ROC-AUC Plot: Micro and Macro Scores
# First aggregate all false positive rates
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(N_CLASSES)]))
# Then interpolate all ROC curves at these points
mean_tpr = np.zeros_like(all_fpr)
for i in range(N_CLASSES):
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
# Finally average it and compute AUC
mean_tpr /= N_CLASSES
fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
# Plot all ROC curves
plt.figure(figsize=(10,10))
plt.plot(fpr["micro"], tpr["micro"],
label='micro-average ROC curve (area = {0:0.2f})'
''.format(roc_auc["micro"]),
color='deeppink', linestyle=':', linewidth=4)
plt.plot(fpr["macro"], tpr["macro"],
label='macro-average ROC curve (area = {0:0.2f})'
''.format(roc_auc["macro"]),
color='navy', linestyle=':', linewidth=4)
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Multi Class ROC')
legend = plt.legend(loc="lower right")
legend._legend_box.align = "right"
plt.show()
# ROC-AUC Plot: Class Scores
plt.figure(figsize=(10,10))
colors = []
cm = pylab.get_cmap('nipy_spectral')
for i in range(N_CLASSES):
    color = cm(1. * i / N_CLASSES)
    colors.append(color)
# Use the original column order so each curve matches its label
# (np.unique would re-sort the names and mismatch them with the per-class curves)
for i, color in zip(range(N_CLASSES), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2,
             label='{0} (area = {1:0.2f})'.format(class_names[i], roc_auc[i]))
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Multi Class ROC')
legend = plt.legend(loc="lower right")
legend._legend_box.align = "right"
plt.show()
# Metrics
precision = precision_score(true_onehot_np, pred_onehot_np, average = 'macro')
recall = recall_score(true_onehot_np, pred_onehot_np, average = 'macro')
f1_sc = f1_score(true_onehot_np, pred_onehot_np, average = 'macro')
#accuracy_sc = accuracy_score(true_onehot_np, pred_onehot_np)
roc_auc_macro = roc_auc['macro']
print('Model Performance Metrics:')
print(f'Precision = {precision:.5f}')
print(f'Recall = {recall:.5f}')
print(f'F1 Score = {f1_sc:.5f}')
print(f'Accuracy = {test_acc:.5f}')
print(f'ROC-AUC = {roc_auc_macro:.5f}')