I love Asana: I'm a PM and they are a product company. I also like data science, so I want to explore Asana and its competitors, and to play with and learn about the BERT model along the way. This will be a curiosity-driven journey; I'm not sure where it will end.
# Package to store the versions of packages used
!pip install -q watermark
# Package to download the BERT models and process data
!pip install -q transformers
# Package for scraping data from the Google Play Store
# https://pypi.org/project/google-play-scraper/
!pip install -q google_play_scraper
# File manipulation imports for Google Colab
# Mount Google Drive so that files under /content/drive are reachable from this runtime
from google.colab import drive
drive.mount('/content/drive')
import os
# Switch the working directory to the project folder on Drive;
# the relative 'data/' and 'models/' paths used later resolve against it
os.chdir("/content/drive/My Drive/Colab Notebooks/BERT_App_Sentiment_Analysis")
# Imports
# Data manipulation and visualization
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from pylab import rcParams
from matplotlib import rc
from tqdm.notebook import tqdm
import datetime
from time import time
# Deep Learning, NLP and metrics
import sklearn
import torch
import transformers
from textwrap import wrap
from torch import nn, optim
from torch.utils import data
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from transformers import BertModel
from transformers import BertTokenizer
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
# Web scraping imports
# https://pypi.org/project/Pygments/
import json
import pygments
import google_play_scraper
from pygments import highlight
from pygments.lexers import JsonLexer
from pygments.formatters import TerminalFormatter
# Random Seed
#RANDOM_SEED = 99
#np.random.seed(RANDOM_SEED)
#torch.manual_seed(RANDOM_SEED)
%matplotlib inline
# Package versions
%reload_ext watermark
%watermark -v -iv
# Apps to gather data on: Asana and its task-management competitors.
# Taken from Asana's comparison page, plus a few other alternatives the app store recommends
# https://asana.com/compare
#   Asana, Airtable, Basecamp, Jira, Microsoft To Do,
#   Monday.com, Smartsheet, Taskade, Trello, Wrike
# The google-play-scraper documentation details how to get the url for each app
# https://github.com/facundoolano/google-play-scraper
apps_list = [
    'com.asana.app',
    'com.formagrid.airtable',
    'com.basecamp.bc3',
    'com.atlassian.android.jira.core',
    'com.microsoft.todos',
    'com.monday.monday',
    'com.smartsheet.android',
    'com.taskade.mobile',
    'com.trello',
    'com.wrike',
]

# Store listing of every app, kept in the same order as apps_list
app_details = []

# Language / country used for every store request
store_locale = {'lang': 'en', 'country': 'us'}

# Query the store once per app and keep whatever it returns
for ap in tqdm(apps_list):
    app_details.append(google_play_scraper.app(ap, **store_locale))
# Function to print a request in JSON format
def print_json(json_object):
    """Print an object as indented, syntax-highlighted JSON.

    The object is serialized with two-space indentation and sorted keys;
    any value json cannot serialize natively is converted with ``str``.
    The resulting text is colorized by pygments for terminal display
    (different colors make the structure easier to read) and printed.
    """
    dump_options = {'indent': 2, 'sort_keys': True, 'default': str}
    rendered = json.dumps(json_object, **dump_options)
    colored = highlight(rendered, JsonLexer(), TerminalFormatter())
    print(colored)
# Check the result in JSON format (first app only)
print_json(app_details[0])
# Put the retrieved information into a dataframe, one row per app
df_app_details = pd.DataFrame(app_details)
# Save the dataframe to disk
# Make sure the output folder exists first: to_csv does not create missing directories
os.makedirs('data', exist_ok=True)
# Retrieve datetime to stamp the file
now = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
# Save with current datetime, without writing the dataframe index
df_app_details.to_csv(f'data/app_details_{now}.csv', header=True, index=False)
df_app_details.head(3)
# List to store app reviews
app_reviews = []
# Sort orders to query, paired with the label stored in each review's 'sortOrder' field
sort_orders = [
    (google_play_scraper.Sort.MOST_RELEVANT, 'most_relevant'),
    (google_play_scraper.Sort.NEWEST, 'newest'),
]
# Loop to retrieve and store app reviews
for ap in tqdm(apps_list):
    # Extract sample reviews for every star rating (1 to 5)
    for star in range(1, 6):
        # 3-star reviews get twice the quota of the other ratings
        review_count = 100 if star == 3 else 50
        # Extract the most relevant and the most recent reviews
        for sort_order, sort_label in sort_orders:
            rvws, _ = google_play_scraper.reviews(ap,
                                                  lang='en',
                                                  country='us',
                                                  sort=sort_order,
                                                  count=review_count,
                                                  filter_score_with=star)
            # Tag every review with where it came from
            for r in rvws:
                r['sortOrder'] = sort_label
                r['appId'] = ap
            # Save reviews
            app_reviews.extend(rvws)
# Create a dataframe with the reviews
df_app_reviews = pd.DataFrame(app_reviews)
# Save the dataframe to disk
# Make sure the output folder exists first: to_csv does not create missing directories
os.makedirs('data', exist_ok=True)
# Retrieve datetime to stamp the file
now = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
# Save with current datetime, without writing the dataframe index
df_app_reviews.to_csv(f'data/app_reviews_{now}.csv', header=True, index=False)
# Loading the csv with app reviews (the file that was just written)
df_reviews = pd.read_csv(f'data/app_reviews_{now}.csv')
df_reviews.head(3)
df_reviews.info()
# Plot stars: number of reviews per star rating
sns.set(style='whitegrid', palette='muted', font_scale=1.5)
rcParams['figure.figsize'] = 15, 9
# The column is passed by keyword: handing it over positionally is deprecated
# in newer seaborn releases, where the first positional slot is `data`
sns.countplot(x=df_reviews.score)
plt.xlabel('Stars')
plt.ylabel('Total')
# Plot appId: number of reviews per app
sns.set(style='whitegrid', palette='muted', font_scale=1)
rcParams['figure.figsize'] = 15, 9
ax = sns.countplot(x=df_reviews.appId)
# App ids are long, so tilt the tick labels
ax.set_xticklabels(ax.get_xticklabels(), rotation=30)
plt.xlabel('App')
plt.ylabel('Number of Samples')
# Creating a pivot table to see which app x star combination didn't retrieve the desired amount of data
# Rows are apps, columns are star ratings, cells are review counts
app_x_stars = df_reviews.groupby(['appId', 'score']).size().unstack()
app_x_stars
# Plotting app x stars as a heatmap
sns.heatmap(app_x_stars, linewidths=1, linecolor='white', cmap='Blues')
# Grouping function
# Collapses the 1-5 star range into negative (0), neutral (1) and positive (2)
# This is why twice as much data was gathered for 3 star reviews
def group_rating(rating):
    """Return the sentiment class for a star rating.

    The rating is first converted with ``int``. A value above 3 yields
    2 (positive), exactly 3 yields 1 (neutral), and anything else yields
    0 (negative).
    """
    stars = int(rating)
    if stars == 3:
        return 1
    return 2 if stars > 3 else 0
# Apply the function to the dataset and create a 'sentiment' column with the output
df_reviews['sentiment'] = df_reviews.score.apply(group_rating)
df_reviews.head(3)
# Shuffling the dataframe to avoid biasing the model later on
# (no random_state is given, so the order differs between runs)
df_reviews = df_reviews.sample(frac=1).reset_index(drop=True)
# List with class names, indexed by the sentiment value (0, 1, 2)
class_names = ['negative', 'neutral', 'positive']
# Share of each class in the dataset
print(f'Negative: {(len(df_reviews[df_reviews.sentiment == 0])/len(df_reviews))}')
print(f'Neutral: {(len(df_reviews[df_reviews.sentiment == 1])/len(df_reviews))}')
print(f'Positive: {(len(df_reviews[df_reviews.sentiment == 2])/len(df_reviews))}')
# Plot class distribution
sns.set(style='whitegrid', palette='muted', font_scale=1.5)
rcParams['figure.figsize'] = 15, 9
# The column is passed by keyword: handing it over positionally is deprecated
# in newer seaborn releases, where the first positional slot is `data`
sns.countplot(x=df_reviews.sentiment)
plt.xlabel('Class')
plt.ylabel('Total')
Downloading the pre-trained BERT model.
List of available models: https://github.com/google-research/bert
# Tokenizer download (vocabulary of the cased BERT base model)
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-cased')
# Test text
test_text = 'Just a test sentence. Test 2.'
test_text
# Tokenize
tokens = tokenizer.tokenize(test_text)
tokens
# Extract the token_ids
token_ids = tokenizer.convert_tokens_to_ids(tokens)
token_ids
# Create the encoding object to format the data for the BERT model
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`, and
# `truncation=True` makes the max_length cap explicit (same options DataBatcher uses)
encoding = tokenizer.encode_plus(test_text,
                                 max_length=32,
                                 truncation=True,
                                 add_special_tokens=True,
                                 padding='max_length',
                                 return_attention_mask=True,
                                 return_token_type_ids=False,
                                 return_tensors='pt')
# Print
encoding
Applying the BERT tokenizer to the dataset
# Drop reviews without text before tokenizing, then renumber the rows
df_reviews = df_reviews.dropna(subset=['content'], how='all').reset_index(drop=True)
df_reviews.shape
# Number of token ids the tokenizer produces for each review
token_length = [len(tokenizer.encode(review_text)) for review_text in df_reviews.content]
# Sample of contents
df_reviews.content.tail(5)
# Plot the distribution of token counts, zoomed to 0-200 tokens
ax = sns.distplot(token_length)
plt.xlim([0, 200])
plt.xlabel('Token Length')
# Model hyperparameters
EPOCHS = 10            # number of passes of the training loop
BATCH_SIZE = 16        # batch size handed to the data loaders
MAX_LENGTH = 150       # token length every review is padded / truncated to
LEARNING_RATE = 2e-5   # i.e. 0.00002, passed to the optimizer
'''
Spent about 7 hours debugging this model to find out that the learning rate
has to be precisely 2e^-5 as anything else was causing the model not to learn at all
'''
class DataBatcher(data.Dataset):
    """Dataset that encodes review texts for BERT on access.

    Args:
        review: indexable collection of review texts.
        targets: indexable collection of class labels, aligned with ``review``.
        tokenizer: object exposing ``encode_plus`` (e.g. a BERT tokenizer).
        max_len: length every encoded review is padded / truncated to.
    """

    # Constructor
    def __init__(self, review, targets, tokenizer, max_len):
        # Initialize class attributes
        self.review = review
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of reviews."""
        return len(self.review)

    # Method to obtain each review
    def __getitem__(self, item):
        """Return the encoded review at position ``item``.

        The result is a dict with the review text (``review_text``), the
        flattened ``input_ids`` and ``attention_mask`` tensors produced by
        the tokenizer, and the label as a ``torch.long`` tensor (``targets``).
        """
        # Load a review; str() guards against non-string entries
        review = str(self.review[item])
        # Encode with the tokenizer given to the constructor (not a module-level
        # global), so the dataset works with whichever tokenizer it was built with.
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
        encoding = self.tokenizer.encode_plus(review,
                                              max_length=self.max_len,
                                              truncation=True,
                                              add_special_tokens=True,
                                              padding='max_length',
                                              return_attention_mask=True,
                                              return_token_type_ids=False,
                                              return_tensors='pt')
        # flatten() removes the batch dimension added by return_tensors='pt'
        return {'review_text': review,
                'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten(),
                'targets': torch.tensor(self.targets[item], dtype=torch.long)}
# This function creates a data loader to convert the dataset to the BERT format
# torch.utils.data.dataloader.DataLoader
def create_data_loader(df, tokenizer, max_len, batch_size, num_workers=4, shuffle=False):
    """Wrap a reviews dataframe in a ``DataLoader`` of BERT-encoded batches.

    Args:
        df: dataframe with a ``content`` column (review text) and a
            ``sentiment`` column (class label).
        tokenizer: tokenizer forwarded to ``DataBatcher``.
        max_len: encoded length forwarded to ``DataBatcher``.
        batch_size: number of reviews per batch.
        num_workers: worker processes used for loading; defaults to 4,
            the value that used to be hard-coded.
        shuffle: whether to reshuffle the data on every pass; defaults to
            False, which matches the previous behaviour.

    Returns:
        A ``torch.utils.data.DataLoader`` over the encoded reviews.
    """
    ds = DataBatcher(review=df.content.to_numpy(),
                     targets=df.sentiment.to_numpy(),
                     tokenizer=tokenizer,
                     max_len=max_len)
    return data.DataLoader(ds,
                           batch_size=batch_size,
                           shuffle=shuffle,
                           num_workers=num_workers)
# Hold out 20% of the reviews, then cut that hold-out in half:
# the result is 80% train, 10% validation and 10% test
df_train, df_holdout = train_test_split(df_reviews, test_size=0.2)  #, random_state = RANDOM_SEED
df_valid, df_test = train_test_split(df_holdout, test_size=0.5)  #, random_state = RANDOM_SEED
print(f'df_train.shape: {df_train.shape}')
print(f'df_test.shape: {df_test.shape}')
print(f'df_valid.shape: {df_valid.shape}')
# One data loader per split, all built with the same tokenizer, length and batch size
train_data_loader, test_data_loader, valid_data_loader = (
    create_data_loader(split_df, tokenizer, MAX_LENGTH, BATCH_SIZE)
    for split_df in (df_train, df_test, df_valid)
)
# Pull the first training batch and show the shape of each tensor in it
sample = next(iter(train_data_loader))
for tensor_key in ('input_ids', 'attention_mask', 'targets'):
    print(sample[tensor_key].shape)
# The whole batch, already in BERT format
print(sample)
# Loading the pre-trained BERT model
model_bert = BertModel.from_pretrained('bert-base-cased')
# Model
model_bert
# Run the test sentence encoded earlier through the bare BERT model
bert_outputs = model_bert(input_ids=encoding['input_ids'], attention_mask=encoding['attention_mask'])
# Read the two outputs by position instead of tuple-unpacking the result:
# positional indexing works both when the model returns a plain tuple and when
# it returns a ModelOutput object (unpacking a ModelOutput would yield its keys)
last_hidden_state, pooled_output = bert_outputs[0], bert_outputs[1]
# Visualize the shape of the last hidden state and of the pooled output
last_hidden_state.shape
pooled_output.shape
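Adding the layers specific to my model on top of BERT.
Note: the optimizer below receives all of the model's parameters, so the BERT weights are fine-tuned together with these new layers.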
class SentimentClassifier(nn.Module):
    """Pre-trained BERT followed by a small classification head.

    The head is dropout -> linear (hidden_size to 100) -> linear (100 to
    n_classes) -> softmax over the class dimension.

    Args:
        n_classes: number of output classes.
    """

    # Constructor
    def __init__(self, n_classes):
        # Initialize attributes
        super().__init__()
        # Define the pre-trained BERT model
        self.bert = BertModel.from_pretrained('bert-base-cased')
        # Add a dropout layer (default drop probability)
        self.drop1 = nn.Dropout()
        # Add a hidden layer
        # NOTE(review): there is no activation between fc1 and fc2 - confirm this is intended
        self.fc1 = nn.Linear(self.bert.config.hidden_size, 100)
        # Add a dense layer
        self.fc2 = nn.Linear(100, n_classes)
        # Final classification with softmax
        self.softmax = nn.Softmax(dim=1)

    # Forward method
    def forward(self, input_ids, attention_mask):
        """Return class probabilities for a batch of encoded reviews.

        Args:
            input_ids: token id tensor produced by the tokenizer.
            attention_mask: attention mask tensor produced by the tokenizer.

        Returns:
            Tensor of softmax probabilities, one row per review.
        """
        bert_outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Take the pooled output by position rather than by tuple-unpacking:
        # indexing works for both a plain tuple and a ModelOutput return value
        pooled_output = bert_outputs[1]
        # Define the outputs from the created layers
        output = self.drop1(pooled_output)
        output = self.fc1(output)
        output = self.fc2(output)
        # NOTE(review): these probabilities are later fed to nn.CrossEntropyLoss,
        # which is documented to expect unnormalized logits - worth revisiting
        return self.softmax(output)
# Use the first GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
device
# Build the classifier (one output per class name) and move it to the device
model_sentiment_classifier = SentimentClassifier(len(class_names)).to(device)
# Move the sample batch's inputs and attention mask to the same device
input_ids = sample['input_ids'].to(device)
attention_mask = sample['attention_mask'].to(device)
# Print
print(input_ids.shape)
print(attention_mask.shape)
# Smoke test: run the sample batch through the model
model_sentiment_classifier(input_ids, attention_mask)
# AdamW optimizer over all model parameters, as used by the original BERT setup
optimizer = AdamW(model_sentiment_classifier.parameters(),
                  lr=LEARNING_RATE,
                  correct_bias=False)
# One scheduler step per batch, for every epoch
total_step = EPOCHS * len(train_data_loader)
# Linear learning-rate schedule without warm-up
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=total_step)
# Loss function
loss_fn = nn.CrossEntropyLoss().to(device)
#loss_fn = nn.NLLLoss().to(device)
#loss_fn = nn.MultiMarginLoss().to(device)
# Train function
def train_model(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: iterable of dict batches with ``input_ids``,
            ``attention_mask`` and ``targets`` tensors.
        loss_fn: callable taking ``(outputs, targets)`` and returning the loss.
        optimizer: optimizer stepped once per batch.
        device: device the batch tensors are moved to.
        scheduler: learning-rate scheduler stepped once per batch.
        n_examples: number of examples used as the accuracy denominator.

    Returns:
        Tuple ``(accuracy, mean_loss)``; accuracy is a float64 tensor. With an
        empty ``data_loader`` the accuracy is 0 and the mean loss is NaN
        (previously this case raised on ``int.double()``).
    """
    # Prepare for training
    model = model.train()
    losses = []
    correct_prediction = 0
    # Loop through the data samples
    # Complete deep learning cycle: forward, loss, backward, update
    for d in data_loader:
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        targets = d['targets'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Predicted class = index of the highest score in each row
        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)
        correct_prediction += torch.sum(preds == targets)
        losses.append(loss.item())
        loss.backward()
        # Clip the gradient norm before the update
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
    # as_tensor keeps an accumulated tensor untouched and wraps the initial
    # int 0 when no batch was seen, so .double() is always available
    accuracy = torch.as_tensor(correct_prediction).double() / n_examples
    mean_loss = np.mean(losses) if losses else float('nan')
    return accuracy, mean_loss
# Evaluate function
def evaluate_model(model, data_loader, loss_fn, device, n_examples):
    """Evaluate ``model`` on ``data_loader`` without updating it.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: iterable of dict batches with ``input_ids``,
            ``attention_mask`` and ``targets`` tensors.
        loss_fn: callable taking ``(outputs, targets)`` and returning the loss.
        device: device the batch tensors are moved to.
        n_examples: number of examples used as the accuracy denominator.

    Returns:
        Tuple ``(accuracy, mean_loss)``; accuracy is a float64 tensor. With an
        empty ``data_loader`` the accuracy is 0 and the mean loss is NaN
        (previously this case raised on ``int.double()``).
    """
    # Evaluation mode
    model.eval()
    losses = []
    correct_prediction = 0
    # No gradients are needed for evaluation
    with torch.no_grad():
        for d in data_loader:
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            targets = d['targets'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Predicted class = index of the highest score in each row
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, targets)
            correct_prediction += torch.sum(preds == targets)
            losses.append(loss.item())
    # as_tensor keeps an accumulated tensor untouched and wraps the initial
    # int 0 when no batch was seen, so .double() is always available
    accuracy = torch.as_tensor(correct_prediction).double() / n_examples
    mean_loss = np.mean(losses) if losses else float('nan')
    return accuracy, mean_loss
%%time
# Store the train history (accuracy and loss per epoch, for both splits)
history = defaultdict(list)
# Timestamp used in the checkpoint file name
now = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
# Control the best accuracy
best_accuracy = 0
# Create the checkpoint folder up front, so a missing directory cannot abort
# the run after a whole epoch has already been trained
os.makedirs('models', exist_ok=True)
# Loop
for epoch in range(EPOCHS):
    start_time = time()
    print(f'Epoch {epoch+1}/{EPOCHS}')
    print('-' * 10)
    # One training pass over the training set
    train_acc, train_loss = train_model(model_sentiment_classifier,
                                        train_data_loader,
                                        loss_fn,
                                        optimizer,
                                        device,
                                        scheduler,
                                        len(df_train))
    print(f'Train error: {train_loss} Train accuracy: {train_acc}')
    # Evaluate on the validation set
    valid_acc, valid_loss = evaluate_model(model_sentiment_classifier,
                                           valid_data_loader,
                                           loss_fn,
                                           device,
                                           len(df_valid))
    print(f'Validation error: {valid_loss} Validation accuracy: {valid_acc}')
    print()
    end_time = time()
    print(f'Iteration Time: {end_time - start_time:.2f} seconds')
    print()
    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['valid_acc'].append(valid_acc)
    history['valid_loss'].append(valid_loss)
    # Keep only the weights of the epoch with the best validation accuracy
    if valid_acc > best_accuracy:
        torch.save(model_sentiment_classifier.state_dict(), f'models/model_sentiment_classifier_{now}.bin')
        best_accuracy = valid_acc
Model trained and saved to disk!
history
# Create a model instance
model = SentimentClassifier(len(class_names))
# Load the best checkpoint saved during training
# map_location remaps the stored tensors onto the current device, so weights
# saved from a GPU run can still be loaded when only a CPU is available
model.load_state_dict(torch.load(f'models/model_sentiment_classifier_{now}.bin', map_location=device))
# Send model to device
model = model.to(device)
# Predicting using test data
test_acc, test_loss = evaluate_model(model, test_data_loader, loss_fn, device, len(df_test))
# Model performance
print(f'Test Accuracy: {test_acc}')
print(f'Test Loss: {test_loss}')
# Function to collect reviews
def get_reviews(model, data_loader, device=None):
    """Run ``model`` over ``data_loader`` and collect its outputs.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: iterable of dict batches with ``review_text``,
            ``input_ids``, ``attention_mask`` and ``targets``.
        device: device the batch tensors are moved to. When omitted, the
            device of the model's first parameter is used (CPU if the model
            has no parameters), so the function no longer depends on a
            module-level ``device`` variable.

    Returns:
        Tuple ``(review_texts, predictions, prediction_probs, real_values)``:
        the list of texts, then CPU tensors with the predicted class, the
        model output row and the true label of every review. With an empty
        ``data_loader`` the list and the three tensors are empty.
    """
    model = model.eval()
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')
    review_texts = []
    predictions = []
    prediction_probs = []
    real_values = []
    with torch.no_grad():
        for d in data_loader:
            texts = d['review_text']
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            targets = d['targets'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            # Predicted class = index of the highest score in each row
            _, preds = torch.max(outputs, dim=1)
            review_texts.extend(texts)
            # Keep whole batches; they are concatenated once at the end
            predictions.append(preds)
            prediction_probs.append(outputs)
            real_values.append(targets)
    if not predictions:
        # Nothing to concatenate: return empty results instead of raising
        empty_labels = torch.empty(0, dtype=torch.long)
        return review_texts, empty_labels, torch.empty(0, 0), empty_labels.clone()
    predictions = torch.cat(predictions).cpu()
    prediction_probs = torch.cat(prediction_probs).cpu()
    real_values = torch.cat(real_values).cpu()
    return review_texts, predictions, prediction_probs, real_values
# Run the loaded model over the test set: texts, predictions, model outputs and true labels
y_review_texts, y_pred, y_pred_probs, y_test = get_reviews(model, test_data_loader)
# Classification report, with rows named after the sentiment classes
report = classification_report(y_test, y_pred, target_names=class_names)
print(report)
y_pred_probs
# Function to plot confusion matrix
def show_confusion_matrix(confusion_matrix):
    """Draw a confusion matrix as an annotated heatmap.

    Cells are annotated with integer counts on a blue color map. Tick labels
    are right-aligned, horizontal on the y axis and rotated 30 degrees on the
    x axis. The y axis is labelled with the real sentiment and the x axis
    with the predicted one.
    """
    hmap = sns.heatmap(confusion_matrix, annot=True, fmt="d", cmap="Blues")
    # Same label treatment on both axes, only the rotation differs
    for axis, angle in ((hmap.yaxis, 0), (hmap.xaxis, 30)):
        axis.set_ticklabels(axis.get_ticklabels(), rotation=angle, ha="right")
    plt.ylabel('Real Sentiment')
    plt.xlabel('BERT Predicted Sentiment')
# Confusion matrix of true vs predicted labels, with class names on both axes
cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame(data=cm, index=class_names, columns=class_names)
# Result
show_confusion_matrix(df_cm)
# Inspect a single review: its text, true label and the model output per class
idx = 0
review_text, true_sentiment = y_review_texts[idx], y_test[idx]
pred_df = pd.DataFrame({'class_names': class_names, 'values': y_pred_probs[idx]})
# Wrap the review text so long reviews stay readable
wrapped_lines = wrap(review_text)
print("\n".join(wrapped_lines))
print()
print(f'Real Sentiment: {class_names[true_sentiment]}')
# Horizontal bar per class with the model output for this review
sns.barplot(data=pred_df, x='values', y='class_names', orient='h')
plt.xlabel('Probability')
plt.ylabel('Sentiment')
plt.xlim([0, 1]);
Testing with new data (a new app review).
test_text = 'I really love this app. It improved my work organization and efficiency'
# Apply the same transformation which was applied to the training data, creating the embedding object
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`, and
# `truncation=True` keeps longer texts within MAX_LENGTH, as DataBatcher does
encoded_eval = tokenizer.encode_plus(test_text,
                                     max_length=MAX_LENGTH,
                                     truncation=True,
                                     add_special_tokens=True,
                                     return_token_type_ids=False,
                                     padding='max_length',
                                     return_attention_mask=True,
                                     return_tensors='pt')
# Extract the inputs and attention_mask to make a prediction
input_ids = encoded_eval['input_ids'].to(device)
attention_mask = encoded_eval['attention_mask'].to(device)
# Inference only: switch dropout off explicitly and skip gradient tracking,
# so this cell does not depend on an earlier call having put the model in eval mode
model.eval()
with torch.no_grad():
    # Output (prediction)
    output = model(input_ids, attention_mask)
# Final prediction: highest score and its class index
probability, prediction = torch.max(output, dim=1)
# Print
print(f'\nApp Review Text: {test_text}')
print(f'\nSentiment: {class_names[prediction.item()]}')
print(f'\nProbability: {probability[0]}')