Predicting Disasters by Analyzing Keywords in Texts on Social Media
Twitter has become an important communication channel in times of emergency.
The ubiquity of smartphones allows people to report an emergency they are observing in real time. For this reason, more and more agencies, such as disaster relief organizations and news agencies, are interested in programmatically monitoring Twitter.
But it is not always clear whether a person's words are really announcing a disaster.
This project contains a complete pipeline for the Natural Language Processing task of text classification. Specifically, it tries to classify whether or not a tweet describes a real disaster.
The pipeline consists of: text cleaning and tokenization, feature engineering (capitalization counts, word counts, and sentiment scores), embedding generation with pre-trained GloVe Twitter vectors, training of a bidirectional LSTM in PyTorch, and prediction on the test set.
Problem Definition
This project predicts whether a particular tweet is about a real disaster or not. If it is, the prediction should be 1; otherwise, 0.
Each sample in the training and test set has the following information: a unique id, the text of the tweet, and a keyword from the tweet (which may be blank); training samples additionally include the target label.
Dataset
This project uses a dataset based on the public Multilingual Disaster Response Messages dataset.
The data contains a set of messages related to disaster response, covering several languages, which makes it suitable for text categorization and other natural language processing tasks.
Details about the dataset can be obtained from the address below.
!pip install -q -U watermark
!pip install -q gensim
# Imports
import re
import gc
import nltk
import torch
import sklearn
import gensim
import numpy as np
import pandas as pd
import torch.nn as nn
import matplotlib.pyplot as plt
import seaborn as sns
import gensim.downloader as api
from collections import Counter
from copy import deepcopy
from nltk.tokenize import TweetTokenizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from gensim.models.word2vec import Word2Vec
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
%matplotlib inline
# Google Colab Package Versions
%reload_ext watermark
%watermark -v -iv
# Set the device to run the model
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'Device: {device}')
# Download NLTK POS tagger
nltk.download('averaged_perceptron_tagger')
# Download lexicon
nltk.download('vader_lexicon')
# Loading a model pretrained with twitter data
model_glove_twitter = api.load("glove-twitter-100")
# Model vector size
model_glove_twitter.vector_size
# Create a randomized vector to represent the <UNK> token (unseen word)
random_vec_for_unk = np.random.uniform(-1, 1, size = model_glove_twitter.vector_size).astype('float32')
random_vec_for_unk = random_vec_for_unk.reshape(1, model_glove_twitter.vector_size)
random_vec_for_unk
# Similarity test
model_glove_twitter.most_similar(random_vec_for_unk)
# Add the random vector to the model
model_glove_twitter.add(['<UNK>'], random_vec_for_unk, replace = True)
# Generate normalized vectors and substitute the original ones
model_glove_twitter.init_sims(replace = True)
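Note: the add() and init_sims() calls above use the gensim 3.x API. In case a newer gensim is installed, below is a minimal, hedged sketch of the 4.x equivalents (it is a no-op on gensim 3.x).
# gensim >= 4.0 renamed parts of this API (sketch only, assuming gensim 4.x is installed)
if int(gensim.__version__.split('.')[0]) >= 4:
    # 'add' became 'add_vectors'
    model_glove_twitter.add_vectors(['<UNK>'], random_vec_for_unk, replace = True)
    # normalized vectors are requested on demand instead of calling init_sims(replace = True)
    _ = model_glove_twitter.get_vector('<UNK>', norm = True)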
# Create a tokenizer that lowercases text, shortens repeated-character runs (reduce_len), and keeps user handles ('@user')
tokenizer = TweetTokenizer(preserve_case = False, reduce_len = True, strip_handles = False)
Below are the helper functions used in the text-cleaning process.
# Sample text to test functions
txt = 'ALLCAPS Capitalized 1234 #Hashtag @UserName ">.<* http://t.co/8kscqKfKkF'
def normalize_tweet(text):
# Change hyperlinks to '<url>' tokens
output = re.sub(r'http[s]{0,1}://t.co/[a-zA-Z0-9]+\b', '<url>', text)
# Split the '#' symbols from the ensuing word with a blank space
output = re.sub(r'#(\w+)', r'# \1', output)
return output
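A quick check of normalize_tweet() on the sample text defined above:
# The t.co link should be replaced by '<url>' and '#Hashtag' split into '# Hashtag'
normalize_tweet(txt)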
def tokenize(tokenizer, string):
    # Tokenize sentences while keeping hashtags (#) and user handles (@)
tokenized = tokenizer.tokenize(string)
return tokenized
# Function that returns the tokenized string (list) with numbers substituted by a numeric token
def number_tokens(tokenized_string, num_token = '<number>'):
# Create a list of tuples (word, POS tags)
pos_tagged = nltk.pos_tag(tokenized_string)
# Find all number indexes in the POS tags
num_indexes = [idx for idx in range(len(pos_tagged)) if pos_tagged[idx][1] == 'CD']
    # Replace numbers with the numeric token
for idx in num_indexes:
tokenized_string[idx] = num_token
return tokenized_string
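A small check of number_tokens() with a made-up token list (the tokens below are arbitrary examples):
# '7.1' should be POS-tagged as a cardinal number (CD) and replaced by '<number>'
number_tokens(['magnitude', '7.1', 'earthquake'])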
# Function which runs all text cleaning functions
def preprocess_text(tokenizer, string):
return number_tokens(tokenize(tokenizer, normalize_tweet(string)))
preprocess_text(tokenizer, txt)
# Return the tokenized and cleaned keyword
def preprocess_keyword(keyword):
    # Return None if the keyword is NaN (missing value)
    if isinstance(keyword, float) and np.isnan(keyword):
return
# Replace '%20' with space, lower case and tokenized
output = re.sub(r'%20', ' ', keyword)
output = output.lower()
output = output.split()
return output
preprocess_keyword(txt)
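Keywords in the raw data are URL-encoded (hence the '%20' handling above), so a more representative check uses a keyword-like string (the value below is just an illustration):
# Expected output: ['forest', 'fires']
preprocess_keyword('forest%20fires')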
# Function to tally words written in ALL CAPS
def count_all_caps(text):
return len([word for word in text.split() if word.isupper()])
# Function to tally words with the First Letter Capitalized
def count_capitalized(text):
return len([word for word in text.split() if word.istitle()])
# Function to tally number of words in the tweet
def count_words(text):
return len(text.split())
print(f'ALLCAPS: {count_all_caps(txt)}')
print(f'Capitalized: {count_capitalized(txt)}')
print(f'words: {count_words(txt)}')
# Function that appends 4 sentiment analysis score columns to a DataFrame
def sentiment_analyze_df(df, column):
    # Instantiate the sentiment intensity analyzer
sid = SentimentIntensityAnalyzer()
# Create a matrix and fill with scores from each of the df[column]
output_values = np.zeros((len(df), 4))
for tup in df.itertuples():
output_values[tup.Index, :] = list(sid.polarity_scores(' '.join(getattr(tup, column))).values())
# Append the column to the DataFrame
for idx, col in enumerate(['sent_neg', 'sent_neu', 'sent_pos', 'sent_compound']):
df[col] = output_values[:, idx]
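The order of the four columns above relies on VADER returning its scores as a dict with the keys 'neg', 'neu', 'pos' and 'compound'; a quick check on an arbitrary example sentence:
# polarity_scores() returns {'neg': ..., 'neu': ..., 'pos': ..., 'compound': ...}
sid = SentimentIntensityAnalyzer()
sid.polarity_scores('Forest fire near the town, residents evacuated')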
# Get the embedding vector of the input word
def get_word_vec(embedding_model, use_norm, word):
if word[0] == '@':
return embedding_model.word_vec('<user>', use_norm = use_norm)
elif word == '#':
return embedding_model.word_vec('<hashtag>', use_norm = use_norm)
elif word in embedding_model.vocab:
return embedding_model.word_vec(word, use_norm = use_norm)
else:
return embedding_model.word_vec('<UNK>', use_norm = use_norm)
get_word_vec(model_glove_twitter, True, 'car')
# Get embedding vectors of all words in a tweet
def text_to_vectors(embedding_model, use_norm, tokenized_text):
vectors = [get_word_vec(embedding_model, use_norm, word) for word in tokenized_text]
vectors = np.array(vectors)
return vectors
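A quick shape check with a hand-made token list (the tokens are chosen arbitrarily; unknown tokens fall back to the '<UNK>' vector):
# One 100-dimensional vector per token, so a 3-token input yields shape (3, 100)
text_to_vectors(model_glove_twitter, True, ['forest', 'fire', 'tonight']).shape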
# Return a matrix with the embedding vectors of the texts with dimensions (seq_len, embedding)
def trim_and_pad_vectors(text_vectors, embedding_dimension, seq_len):
    # Initialize the zeros matrix
output = np.zeros((seq_len, embedding_dimension))
# Adjust (cut) the tweets longer than seq_len
trimmed_vectors = text_vectors[:seq_len]
# Calculate the number of zeroes needed to pad the beginning of tweets shorter than seq_len
end_of_padding_index = seq_len - trimmed_vectors.shape[0]
# Alternative: Pad at the end of tweets
#tweet_len = len(trimmed_vectors)
# Output
output[end_of_padding_index:] = trimmed_vectors
return output
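A toy check of the front-padding behavior (the all-ones vectors below are just placeholders):
# A 3-token 'tweet' padded to seq_len = 5: the first two rows stay zero, the last three hold the token vectors
toy_vectors = np.ones((3, model_glove_twitter.vector_size))
trim_and_pad_vectors(toy_vectors, model_glove_twitter.vector_size, 5)[:, 0]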
# Return embedding representations from the tokenized input text
def embedding_preprocess(embedding_model, use_norm, seq_len, tokenized_text):
# Get the matrix with embedding vectors (tweet length, embedding_dimension)
text_vectors = text_to_vectors(embedding_model, use_norm, tokenized_text)
# Output
output = trim_and_pad_vectors(text_vectors, embedding_model.vector_size, seq_len)
return output
embedding_preprocess(model_glove_twitter, True, 30, 'car').shape
# Return embedding vectors from the keywords
def keyword_to_avg_vector(embedding_model, use_norm, tokenized_keyword):
# Return a zeros vector if tokenized_keyword is None
if tokenized_keyword is None:
return np.zeros((1, embedding_model.vector_size))
# If not, calculate the average embedding
vectors = [get_word_vec(embedding_model, use_norm, word) for word in tokenized_keyword]
vectors = np.array(vectors)
avg_vector = np.mean(vectors, axis = 0)
avg_vector = avg_vector.reshape((1, embedding_model.vector_size))
return avg_vector
keyword_to_avg_vector(model_glove_twitter, True, 'car')
# Load training data
data_train = pd.read_csv('https://raw.githubusercontent.com/Matheus-Schmitz/Disaster_Occurance_Twitter/master/dataset_train.csv')
data_train.head()
# Normalize and tokenize text
data_train['tok_norm_text'] = [preprocess_text(tokenizer, text) for text in data_train['text']]
# Normalize and tokenize keyword
data_train['keyword'] = data_train['keyword'].apply(preprocess_keyword)
# Check
data_train.head(3)
# Apply the functions to the data
data_train['num_all_caps'] = data_train['text'].apply(count_all_caps)
data_train['num_caps'] = data_train['text'].apply(count_capitalized)
data_train['num_words'] = data_train['text'].apply(count_words)
# Create a scaler to set all features to the [-1, 1] range
scaler = MinMaxScaler(feature_range=(-1, 1))
# Apply the scaler
columns_to_scale = ['num_all_caps', 'num_caps', 'num_words']
scaler.fit(data_train[columns_to_scale])
data_train[columns_to_scale] = scaler.transform(data_train[columns_to_scale])
# Create sentiment analysis features
sentiment_analyze_df(data_train, 'tok_norm_text')
# Visualize
data_train.head()
# Plot
sns.distplot([len(tok) for tok in data_train['tok_norm_text']])
Most texts have fewer than 30 tokens, so setting the maximum sequence length to 30 is a reasonable trade-off between data loss and computational cost.
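To back that up with a number, a quick check of how many tweets fit within the 30-token cutoff:
# Fraction of tweets with at most 30 tokens
token_counts = np.array([len(tok) for tok in data_train['tok_norm_text']])
print(f'Tweets with <= 30 tokens: {(token_counts <= 30).mean():.1%}')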
# Max sequence length
sequence_max_length = 30
# Generate the text embedding
data_train['text_embedding'] = [embedding_preprocess(embedding_model = model_glove_twitter,
use_norm = True,
seq_len = sequence_max_length,
tokenized_text = text)
for text in data_train['tok_norm_text']]
data_train['keyword_embedding'] = [keyword_to_avg_vector(embedding_model = model_glove_twitter,
use_norm = True,
tokenized_keyword = keyword)
for keyword in data_train['keyword']]
# Visualize
data_train.head()
data_train['text_embedding'][0]
Creating a vector representation that concatenates the text embeddings, the keyword embeddings, and the engineered features.
The most common approach is to simply embed the text, which would make this step unnecessary, but since I want to feed the model more than the GloVe word vectors, merging all features into a single vector is required.
# Function which returns a numpy array with the static single-value features repeated seq_len times
def _single_values_repeat(seq_len, static_single_values):
# Create a sequenced array with one position representing each of the engineered features
output = static_single_values.reshape((1, len(static_single_values)))
    # Repeat that array seq_len times since the engineered features are the same for all words in the same tweet
output = np.repeat(output, seq_len, axis = 0)
return output
static_singles_cols = ['num_all_caps', 'num_caps', 'num_words', 'sent_neg', 'sent_neu', 'sent_pos', 'sent_compound']
data_train[static_singles_cols].shape
data_train[static_singles_cols].head()
_single_values_repeat(30, data_train['num_all_caps'].values).shape
# Demo on the whole column at once: one row per word position (30) and one column per sample; in the pipeline each call receives a single sample's feature values
_single_values_repeat(30, data_train['num_all_caps'].values)
# Vector size used by the twitter glove model
model_glove_twitter.vector_size
# Return a numpy array of stacked embedding vectors
def _static_embedding_repeat(seq_len, static_embedding_values):
# Reshape the keyword embedding by stacking it horizontally
horizontally_stacked = np.hstack(static_embedding_values)
    # Repeat that array seq_len times since the keyword is the same for all words in the same tweet
output = np.repeat(horizontally_stacked, seq_len, axis = 0)
return output
_static_embedding_repeat(30, data_train['keyword_embedding']).shape
# Demo on the whole column at once: each keyword is a 100-dim vector (model_glove_twitter.vector_size), stacked for all samples and repeated for each of the 30 word positions
_static_embedding_repeat(30, data_train['keyword_embedding'])
# Function which returns the embedding representations of all features
def concatenate_embeddings(df,
embedding_model,
seq_len,
sequence_embedding_col,
static_embedding_cols,
static_singles_cols):
# Embedding dimensions
emb_dim = embedding_model.vector_size
# Output matrix
output = np.zeros((len(df), seq_len, len(static_singles_cols) + len(static_embedding_cols) * emb_dim + emb_dim))
# Loop
for idx, row in df.iterrows():
single_vals = _single_values_repeat(seq_len, row[static_singles_cols].values)
static_emb_vals = _static_embedding_repeat(seq_len, row[static_embedding_cols])
seq_emb_vals = row[sequence_embedding_col]
# Stack embeddings and features for each tweet
# AKA putting together the vectors for all text word embeddings + keyword embeddings + feature engineering embeddings + sentiment score embeddings
row_embedding = np.hstack((single_vals, static_emb_vals, seq_emb_vals))
output[idx, :, :] = row_embedding
return output
# Create a final embedding representation of all features selected for training
embedding_matrix = concatenate_embeddings(df = data_train,
embedding_model = model_glove_twitter,
seq_len = sequence_max_length,
sequence_embedding_col = 'text_embedding',
static_embedding_cols = ['keyword_embedding'],
static_singles_cols = ['num_all_caps',
'num_caps',
'num_words',
'sent_neg',
'sent_neu',
'sent_pos',
'sent_compound'])
# Shape
embedding_matrix.shape
The first 7 positions represent the engineered features for that tweet, and the next 100 represent the vector encoding of the keyword associated with that tweet. Both of these are repeated for every word in the tweet. The final 100 positions are the word embeddings for a specific word of the tweet.
That is, the first 107 positions are identical across all word-embedding slots (30 here) of a tweet, while the last 100 differ, since each is the representation of a specific word in that tweet.
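A quick sanity check of that layout on the matrix built above (sample 0 is an arbitrary choice):
# 207 columns = 7 engineered features + 100 keyword dimensions + 100 word-embedding dimensions
print(embedding_matrix.shape)
# The first 107 positions repeat across the 30 word slots of a tweet, the last 100 vary per word
print(bool((embedding_matrix[0, 0, :107] == embedding_matrix[0, -1, :107]).all()))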
PyTorch implementation of a Bidirectional LSTM model
class BiLSTM(nn.Module):
# Constructor Method
def __init__(self, embedding_dim, hidden_dim, num_layers, num_classes, batch_size, dropout, device):
super(BiLSTM, self).__init__()
# Initialize attributes
self.hidden_dim = hidden_dim
self.batch_size = batch_size
self.num_layers = num_layers
# Dropout to reduce overfitting
self.dropout = nn.Dropout(p = dropout)
# LSTM model
self.lstm = nn.LSTM(input_size = embedding_dim,
hidden_size = hidden_dim,
num_layers = num_layers,
batch_first = True,
dropout = dropout,
bidirectional = True)
# Fully connected layer
self.fc = nn.Linear(hidden_dim * 2, num_classes)
# Device
self.device = device
# Lists for evaluations and plots
self.train_loss = []
self.train_acc = []
self.val_loss = []
self.val_acc = []
# Attribute to store the best model weights (used for evaluating)
self.best_weights = deepcopy(self.state_dict())
    # Initialize the hidden and cell states of the LSTM
def _init_hidden(self, current_batch_size):
h0 = torch.zeros(self.num_layers * 2, current_batch_size, self.hidden_dim).to(self.device)
c0 = torch.zeros(self.num_layers * 2, current_batch_size, self.hidden_dim).to(self.device)
return h0, c0
# Forward step
def forward(self, x):
# Forward LSTM
h, c = self._init_hidden(current_batch_size = x.size(0))
out, _ = self.lstm(x, (h, c))
# Dropout
out = self.dropout(out)
# Decode the hidden state for the last time step
out = self.fc(out[:, -1, :])
return out
# Predictions
    def predict(self, x: torch.Tensor):
class_predictions = self(x).data
_, predicted = torch.max(class_predictions, dim = 1)
return predicted
# Training and evaluation with validation data
def _train_evaluate(self, X_train, y_train, X_val, y_val, criterion):
# Change the model to evaluation mode
self.eval()
# Calculate accuracy and loss on training data
epoch_train_acc = (self.predict(X_train) == y_train).sum().item() / y_train.shape[0]
epoch_train_loss = criterion(self(X_train), y_train).item()
self.train_acc.append(epoch_train_acc)
self.train_loss.append(epoch_train_loss)
        # Calculate accuracy and loss on validation data
if X_val is not None and y_val is not None:
epoch_val_acc = (self.predict(X_val) == y_val).sum().item() / y_val.shape[0]
            epoch_val_loss = criterion(self(X_val), y_val).item()
self.val_acc.append(epoch_val_acc)
self.val_loss.append(epoch_val_loss)
# Return the accuracy and loss values
return epoch_train_loss, epoch_train_acc, epoch_val_loss, epoch_val_acc
        # Return accuracy and loss values if there is no validation dataset
return epoch_train_loss, epoch_train_acc, None, None
# Return a dictionary with the best epochs
def best_epoch(self):
best_train_loss_epoch = np.argmin(np.array(self.train_loss)) + 1
best_train_acc_epoch = np.argmax(np.array(self.train_acc)) + 1
output = {'Epoch with lowest training loss': best_train_loss_epoch,
'Epoch with highest training accuracy': best_train_acc_epoch}
if len(self.val_loss) > 0:
            best_val_loss_epoch = np.argmin(np.array(self.val_loss)) + 1
            best_val_acc_epoch = np.argmax(np.array(self.val_acc)) + 1
output.update({'Epoch with lowest validation loss': best_val_loss_epoch,
'Epoch with highest validation accuracy': best_val_acc_epoch})
return output
# Return a dictionary with the total number of parameters
def get_num_parameters(self):
total_params = sum(p.numel() for p in self.parameters())
trainable_params = sum(p.numel() for p in self.parameters() if p.requires_grad)
return {'total_parameters': total_params, 'trainable_parameters': trainable_params}
@staticmethod
def _print_progress(epoch, train_loss, train_acc, val_loss, val_acc, improved, verbose=False):
output = f'Epoch {str(epoch + 1).zfill(3)}:'
output += f'\n\t Training Error: {str(train_loss)[:5]} | Accuracy: {str(train_acc)[:5]}'
if val_loss is not None and val_acc is not None:
            output += f'\n\t Validation Error: {str(val_loss)[:5]} | Accuracy: {str(val_acc)[:5]}'
if improved:
output += f'\n\t The model improved!'
if verbose:
print(output)
# Training method
def fit(self, X_train, y_train, X_val, y_val, epoch_num, criterion, optimizer, verbose = False):
# Variable to determine if the best weights should be updated (and report progress)
best_acc = 0.0
# Divide the dataset in batches
X_train_tensor_batches = torch.split(X_train, self.batch_size)
y_train_tensor_batches = torch.split(y_train, self.batch_size)
# Loop
for epoch in range(epoch_num):
# Set the model to training mode
# At the end of each epoch the _train_evaluate method is called and sets the model to evaluation mode
self.train()
for i, (X_batch, y_batch) in enumerate(zip(X_train_tensor_batches, y_train_tensor_batches)):
# Forward pass
outputs = self(X_batch)
loss = criterion(outputs, y_batch)
# Backward and optimization
optimizer.zero_grad()
loss.backward()
optimizer.step()
# Calculate accuracy and loss for train and validation
train_loss, train_acc, val_loss, val_acc = self._train_evaluate(X_train, y_train, X_val, y_val, criterion)
            # Pick which accuracy to track for progress (validation if available, otherwise training)
if X_val is not None and y_val is not None:
accuracy = val_acc
else:
accuracy = train_acc
# If the accuracy improves on the previous best, print it and update the best accuracy records and the model weights
if accuracy > best_acc:
self._print_progress(epoch,
train_loss,
train_acc,
val_loss,
val_acc,
improved = True,
verbose = verbose)
best_acc = accuracy
self.best_weights = deepcopy(self.state_dict())
            # Else, just print without updating
else:
self._print_progress(epoch,
train_loss,
train_acc,
val_loss,
val_acc,
improved = False,
verbose = verbose)
            # Garbage collector, to clean memory
gc.collect()
def plot_graphs(model):
plt.figure(figsize = (24, 12))
plt.subplot(311)
plt.title('Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.plot(range(1, len(model.train_acc)+1), model.train_acc, label = 'Train')
plt.xticks(np.arange(0, len(model.train_acc)+1, 5))
plt.legend()
plt.subplot(312)
plt.title('Loss')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.plot(range(1, len(model.train_loss)+1), model.train_loss, label = 'Train')
plt.xticks(np.arange(0, len(model.train_acc)+1, 5))
plt.legend()
plt.tight_layout(pad=0.4, w_pad=0.5, h_pad=1.0)
plt.show()
# Hyperparameters
embedding_dim = embedding_matrix.shape[2]
hidden_size = 50
num_layers = 2
num_classes = 2
batch_size = 256
dropout = 0.3
num_epochs = 300
learning_rate = 0.0005
weight_decay = 0.0005
# Load attributes
X_train = torch.from_numpy(embedding_matrix).float().to(device)
# Load label
y_train = torch.from_numpy(data_train['target'].values).long().to(device)
# Create model
model = BiLSTM(embedding_dim, hidden_size, num_layers, num_classes, batch_size, dropout, device).to(device)
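Since the class exposes a parameter-count helper, it can be used as a quick size check right after instantiation:
# Total and trainable parameters of the BiLSTM
model.get_num_parameters()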
# Loss function
criterion = nn.CrossEntropyLoss()
# Optimizer
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate, weight_decay = weight_decay)
%%time
# Train
model.fit(X_train = X_train,
y_train = y_train,
X_val = None,
y_val = None,
epoch_num = num_epochs,
criterion = criterion,
optimizer = optimizer,
verbose = True)
# Plot
plot_graphs(model)
Applying the same preprocessing pipeline to the test dataset
# Load test dataset
data_test = pd.read_csv('https://raw.githubusercontent.com/Matheus-Schmitz/Disaster_Occurance_Twitter/master/dataset_test.csv')
data_test.head()
# Normalize and tokenize text
data_test['tok_norm_text'] = [preprocess_text(tokenizer, text) for text in data_test['text']]
# Preprocess keywords
data_test['keyword'] = data_test['keyword'].apply(preprocess_keyword)
# Extract features
data_test['num_all_caps'] = data_test['text'].apply(count_all_caps)
data_test['num_caps'] = data_test['text'].apply(count_capitalized)
data_test['num_words'] = data_test['text'].apply(count_words)
# Scale
data_test[columns_to_scale] = scaler.transform(data_test[columns_to_scale])
# Sentiment Analyser
sentiment_analyze_df(data_test, 'tok_norm_text')
# Text embedding
data_test['text_embedding'] = [embedding_preprocess(embedding_model = model_glove_twitter,
use_norm = True,
seq_len = sequence_max_length,
tokenized_text = text)
for text in data_test['tok_norm_text']]
# Keyword embedding
data_test['keyword_embedding'] = [keyword_to_avg_vector(embedding_model = model_glove_twitter,
use_norm = True,
tokenized_keyword = keyword)
for keyword in data_test['keyword']]
# Visualize
data_test.head()
# Create a final embedding representation of all features selected for training
test_embedding_matrix = concatenate_embeddings(df = data_test,
embedding_model = model_glove_twitter,
seq_len = sequence_max_length,
sequence_embedding_col = 'text_embedding',
static_embedding_cols = ['keyword_embedding'],
static_singles_cols = ['num_all_caps',
'num_caps',
'num_words',
'sent_neg',
'sent_neu',
'sent_pos',
'sent_compound'])
# Create the tensor with the test set features
X_test = torch.from_numpy(test_embedding_matrix).float().to(device)
# Predictions
preds = model.predict(X_test)
# Concatenate predictions and ids for each test record into a DataFrame
final_preds = preds.cpu().numpy().reshape(-1,1)
ids = data_test['id'].values.reshape(-1,1)
data = np.hstack((ids, final_preds))
# Dataframe
predictions = pd.DataFrame(data = data, columns = ['id', 'target'])
# Visualize
predictions.head()
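If an output file with the predictions is needed, the DataFrame can be saved directly (the filename below is just an assumption):
# Save predictions to CSV (arbitrary filename)
predictions.to_csv('predictions.csv', index = False)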