# General Packages
import numpy as np
import pandas as pd
import os
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Iterators
from collections import Counter
from itertools import islice
from operator import itemgetter
from tqdm import tqdm
# Text
import re
from textblob import TextBlob
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize, wordpunct_tokenize, MWETokenizer
from nltk.stem import porter, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.util import ngrams
# Scikit-Learn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD, NMF
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.neighbors import NearestNeighbors
# Download nltk packages (wordnet is required by WordNetLemmatizer below)
nltk.download('punkt')
nltk.download('brown')
nltk.download('stopwords')
nltk.download('wordnet')
# Path to CSVs
path = 'C:/Portfolio/TED_Talks_Recommender/'
# Load TED Main
ted_main = pd.read_csv(path + 'ted_main.csv')
# Load TED Transcripts
ted_transcripts = pd.read_csv(path + 'transcripts.csv')
# Merge them
ted_transcripts = pd.merge(ted_main, ted_transcripts, on='url')
ted_transcripts.head(3)
# Return first n items of the iterable as a list
def take(n, iterable):
    return list(islice(iterable, n))
# Apply word counter, then sort by frequency, then extract top words
WC = Counter(" ".join(ted_transcripts['transcript']).split())
WC_sorted = {k: v for k, v in sorted(WC.items(), key=lambda item: item[1], reverse=True)}
top_words = take(15, WC_sorted.items())
top_words
As far as common words go, everything seems fine.
# Reading a couple of the transcripts
ted_transcripts['transcript'][2020][0:1000]
There seem to be some non-speech sounds and other cues added to the transcripts. In the example above we can see (Beatboxing) and (Laughter). Luckily, these non-speech cues appear to be demarcated as parentheticals (parentheses () or brackets []), which makes it easy to remove them using regular expressions.
# Checking how many different elements in the transcript have () or []
non_speech_elements = [key for key, value in WC_sorted.items() if '(' in key or '[' in key]
print("Number of non-speech elements:", len(non_speech_elements))
print("Most common non-speech elements:", non_speech_elements[:20])
# Regular expression approach to removing elements inside parentheses
removed_parenthesis = re.sub(r'\([^)]*\)', ' ', ted_transcripts['transcript'][2020])
# Check result
removed_parenthesis[0:1000]
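The same idea extends to square brackets, which the note above mentions but the snippet handles only for parentheses; a minimal sketch of a combined pattern (assuming bracketed cues like [Music] follow the same convention):
# Remove both (...) and [...] cues in a single pass
removed_both = re.sub(r'\([^)]*\)|\[[^\]]*\]', ' ', ted_transcripts['transcript'][2020])
removed_both[0:1000]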
Exploring a couple of methods for tokenizing words:
1) Word tokenization with TextBlob().words
tokens = TextBlob(removed_parenthesis).words
tokens[:150]
OK, he's trying to battle me. → 'OK', 'he', "'s", 'trying', 'to', 'battle', 'me'
2) Word tokenization with NLTK's word_tokenize()
This approach keeps punctuation
tokens = word_tokenize(removed_parenthesis)
print(tokens[:150])
OK, he's trying to battle me. → 'OK', ',', 'he', "'s", 'trying', 'to', 'battle', 'me', '.'
3) Word tokenization with NLTK's wordpunct_tokenize()
This approach keeps punctuation AND splits contractions into three parts (e.g. he's → "he", "'", "s")
tokens = wordpunct_tokenize(removed_parenthesis)
print(tokens[:150])
OK, he's trying to battle me. → 'OK', ',', 'he', "'", 's', 'trying', 'to', 'battle', 'me', '.'
4) Sentence tokenization with TextBlob().sentences
tokens = TextBlob(removed_parenthesis).sentences
tokens[:10]
OK, he's trying to battle me. → OK, he's trying to battle me.
5) Sentence tokenization with NLTK's sent_tokenize()
tokens = sent_tokenize(removed_parenthesis)
print(tokens[:10])
OK, he's trying to battle me. → OK, he's trying to battle me.
6) Noun phrases with TextBlob().noun_phrases
tokens = TextBlob(removed_parenthesis).noun_phrases
tokens[:150]
Well, this one definitely doesn't cut it!
# Using WordNetLemmatizer()
lemmatizer = WordNetLemmatizer()
lemmatized_text = [lemmatizer.lemmatize(w) for w in TextBlob(removed_parenthesis).words]
print(lemmatized_text[:100])
stemmer = nltk.stem.porter.PorterStemmer()
stemmed_text = [stemmer.stem(w) for w in TextBlob(removed_parenthesis).words]
print(stemmed_text[:100])
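To get a concrete feel for the difference between the two, a quick side-by-side on a few sample words (the word list is just an illustrative choice):
# Compare lemmatization vs. stemming on a handful of sample words
for w in ['running', 'studies', 'better', 'technologies']:
    print(f"{w:15s} lemma: {lemmatizer.lemmatize(w):15s} stem: {stemmer.stem(w)}")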
def corpus_cleaner(corpus, stem='lemmatizer'):
    '''
    Take a corpus of documents and apply the best cleaning steps from above.
    1. Remove parentheticals
    2. Tokenize into words with wordpunct_tokenize()
    3. Lowercase and remove stopwords
    4. Lemmatize (or stem)
    5. Lowercase again before adding the word to the document
    Output = A list (corpus) of cleaned documents (strings)
    '''
    # Define stemmer
    if stem == 'lemmatizer':
        lemmatizer = nltk.stem.WordNetLemmatizer()
    elif stem == 'stemmer':
        stemmer = nltk.stem.porter.PorterStemmer()
    else:
        raise ValueError("Invalid stemmer, choose either 'lemmatizer' or 'stemmer'.")
    # Set stopwords
    stop = stopwords.words('english')
    stop += ['.', ',', ':', '...', '!"', '?"', "'", '"', ' - ', ' — ', ',"', '."', '!', ';', '♫♫', '♫',
             '.\'"', '[', ']', '—', ".\'", 'ok', 'okay', 'yeah', 'ya', 'stuff', ' 000 ', ' em ',
             ' oh ', 'thank', 'thanks', 'la', 'was', 'wa', '?', 'like', 'go', ' le ', ' ca ', ' I ', " ? ", "s", " t ", "ve", "re",
             'oh', 'sort', 'maybe', 'guy', 'applause']
    output_corpus = []
    for document in corpus:
        cleaned_doc = []
        # Remove parentheticals
        clean_parens = re.sub(r'\([^)]*\)', ' ', document)
        # Tokenize
        for word in wordpunct_tokenize(clean_parens):
            # Remove stopwords
            if word.lower() not in stop:
                # Lemmatize or stem
                if stem == 'lemmatizer':
                    cleaned_word = lemmatizer.lemmatize(word.lower())
                elif stem == 'stemmer':
                    cleaned_word = stemmer.stem(word.lower())
                else:
                    raise ValueError("Invalid stemmer, choose either 'lemmatizer' or 'stemmer'.")
                # Add to document
                cleaned_doc.append(cleaned_word.lower())
        # After cleaning all words for a document, add it to the corpus
        output_corpus.append(' '.join(cleaned_doc))
    return output_corpus
%%time
# Execute function to clean the whole corpus
cleaned_corpus = corpus_cleaner(ted_transcripts['transcript'])
# Check result
print(cleaned_corpus[2020][0:100])
Checking whether unigrams, bigrams or trigrams make for more useful representations
# Function to extract most common n-grams
def top_n_gram(corpus, n=2, top_ngrams=10):
    counter = Counter()
    for doc in tqdm(corpus):
        words = TextBlob(doc).words
        n_grams = ngrams(words, n)
        counter += Counter(n_grams)
    for n_gram, count in counter.most_common(top_ngrams):
        print('%30s - %i' % (' '.join(n_gram), count))
# 15 Most common Unigrams
top_n_gram(cleaned_corpus, n=1, top_ngrams=15)
# 15 Most common Bigrams
top_n_gram(cleaned_corpus, n=2, top_ngrams=15)
# 15 Most common Trigrams
top_n_gram(cleaned_corpus, n=3, top_ngrams=15)
# The trigrams don't seem to have much useful information that can't be captured by bigrams
# Hence I'll limit my N-gram range to unigrams and bigrams
NGRAM_MIN = 1
NGRAM_MAX = 2
Vectorizing = Turning words into numerical representations
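As a minimal sketch of what the vectorizer produces (the two toy sentences below are made up purely for illustration):
# Tiny demonstration: two toy sentences turned into a document-term matrix
toy_docs = ['the brain learns by making connections', 'making music connects people']
toy_vect = TfidfVectorizer()
toy_matrix = toy_vect.fit_transform(toy_docs)
print(toy_vect.get_feature_names())      # vocabulary learned from the toy corpus
print(toy_matrix.toarray().round(2))     # each row is a document, each column a term weight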
# Instantiate TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range = (NGRAM_MIN, NGRAM_MAX),
stop_words = 'english',
max_df = 0.5,
max_features = len(cleaned_corpus))
# Obtain vectorized data
vect_data = vectorizer.fit_transform(cleaned_corpus)
# Check result
plt.figure(figsize=(8, 8))
plt.spy(vect_data, markersize=0.01)
plt.show()
TF-IDF vectorization doesn't seem to play well with the topic modeling techniques further down the pipeline, so I'll use CountVectorizer instead.
From a look at the LDA paper, this seems to happen because LDA already has a sort of built-in TF-IDF weighting...
Source: http://www.cs.columbia.edu/~blei/papers/BleiLafferty2009.pdf
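A quick sanity check of the difference in inputs (scikit-learn's LatentDirichletAllocation is generally described as working on raw term counts; the snippet below only illustrates the count vs. weight distinction):
# CountVectorizer yields integer term counts, TfidfVectorizer yields real-valued weights
count_sample = CountVectorizer().fit_transform(cleaned_corpus[:5])
tfidf_sample = TfidfVectorizer().fit_transform(cleaned_corpus[:5])
print(count_sample.dtype, count_sample[:1].toarray().max())   # integer counts
print(tfidf_sample.dtype, tfidf_sample[:1].toarray().max())   # fractional weights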
# Instantiate CountVectorizer
vectorizer = CountVectorizer(ngram_range = (NGRAM_MIN, NGRAM_MAX),
stop_words = 'english',
max_df = 0.5,
max_features = len(cleaned_corpus))
# Obtain vectorized data
vect_data = vectorizer.fit_transform(cleaned_corpus)
# Check result
plt.figure(figsize=(8, 8))
plt.spy(vect_data, markersize=0.01)
plt.show()
The word vectors seem to be well distributed across the documents.
NUM_TOPICS = 15
#NUM_TOPICS = np.random.randint(10, 25)
print(f'Modeling with {NUM_TOPICS} topics!')
# Instantiate object
LDA_obj = LatentDirichletAllocation(n_components = NUM_TOPICS,
max_iter = NUM_TOPICS,
batch_size = 32,
learning_method = 'online',
n_jobs = -1)
# Obtain clustered data
LDA_data = LDA_obj.fit_transform(vect_data)
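Each row of LDA_data should be a document's distribution over the NUM_TOPICS topics, so rows should sum to roughly 1; a quick check:
# Inspect the document-topic matrix returned by fit_transform
print(LDA_data.shape)             # (n_documents, NUM_TOPICS)
print(LDA_data[2020].round(3))    # topic mixture for one talk
print(LDA_data[2020].sum())       # should sum to roughly 1 (topic proportions)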
# Create dictionary with the most common words in each topic
LDA_topics_dict = {}
for idx, topic in tqdm(enumerate(LDA_obj.components_), total=len(LDA_obj.components_)):
    LDA_topics_dict[idx] = [vectorizer.get_feature_names()[i] for i in topic.argsort()][:-15:-1]
# Print the topics and their common words
for k, v in LDA_topics_dict.items():
    print(f'Topic {k}:')
    print(" ".join(v))
    print('')
# For each document, classify its theme as the highest scoring topic
document_topics = np.argmax(LDA_data, axis=1)
# Then convert to dataframe
pred_labels = pd.DataFrame(document_topics)
pred_labels.head()
# Convert the numbers to more understandable topic names
topic_names = pred_labels.copy()
for topic_code in range(pred_labels.nunique()[0]):
    topic_names[0][topic_names[0] == topic_code] = ' '.join(LDA_topics_dict[topic_code][:5])
topic_names.head()
# Visualize topic frequency
fig, ax = plt.subplots(figsize=(15,12))
plt.tick_params(labelsize=15)
sns.countplot(y=topic_names[0].values)
plt.show()
# Interactive visualization for topic modeling with pyLDAvis
import pyLDAvis, pyLDAvis.sklearn
from IPython.display import display
# Setup to run in Jupyter notebook
pyLDAvis.enable_notebook()
# Create the visualization
vis = pyLDAvis.sklearn.prepare(LDA_obj, vect_data, vectorizer)
# Export as a standalone HTML web page
pyLDAvis.save_html(vis, 'lda.html')
# Let's view it!
display(vis)
# Option to skip testing other methods since we'll be using LDA
SKIP = False
if not SKIP:
    # Instantiate NMF object
    NMF_obj = NMF(n_components = NUM_TOPICS)
    # Obtain clustered data
    NMF_data = NMF_obj.fit_transform(vect_data)
    # Create dictionary with the most common words in each topic
    NMF_topics_dict = {}
    for idx, topic in tqdm(enumerate(NMF_obj.components_), total=len(NMF_obj.components_)):
        NMF_topics_dict[idx] = [vectorizer.get_feature_names()[i] for i in topic.argsort()][:-15:-1]
    # Print the topics and their common words
    for k, v in NMF_topics_dict.items():
        print(f'Topic {k}:')
        print(" ".join(v))
        print('')
if not SKIP:
    # Instantiate LSA object
    LSA_obj = TruncatedSVD(n_components = NUM_TOPICS)
    # Obtain clustered data
    LSA_data = LSA_obj.fit_transform(vect_data)
    # Create dictionary with the most common words in each topic
    LSA_topics_dict = {}
    for idx, topic in tqdm(enumerate(LSA_obj.components_), total=len(LSA_obj.components_)):
        LSA_topics_dict[idx] = [vectorizer.get_feature_names()[i] for i in topic.argsort()][:-15:-1]
    # Print the topics and their common words
    for k, v in LSA_topics_dict.items():
        print(f'Topic {k}:')
        print(" ".join(v))
        print('')
if not SKIP:
    # Instantiate LSA_norm object
    LSA_norm_obj = TruncatedSVD(n_components = NUM_TOPICS)
    # Normalize the vectorized data
    stdScale = Normalizer()
    vect_data_norm = stdScale.fit_transform(vect_data)
    # Obtain clustered data
    LSA_norm_data = LSA_norm_obj.fit_transform(vect_data_norm)
    # Create dictionary with the most common words in each topic
    LSA_norm_topics_dict = {}
    for idx, topic in tqdm(enumerate(LSA_norm_obj.components_), total=len(LSA_norm_obj.components_)):
        LSA_norm_topics_dict[idx] = [vectorizer.get_feature_names()[i] for i in topic.argsort()][:-15:-1]
    # Print the topics and their common words
    for k, v in LSA_norm_topics_dict.items():
        print(f'Topic {k}:')
        print(" ".join(v))
        print('')
None of the topic modeling techniques seems to be a clear winner here...
Since LDA is considered to be the state-of-the-art technique, I'll employ it in the recommender system.
def get_recommendation(TARGET_ID, NUM_RECOMMENDATIONS=5):
    '''
    Requires the following objects from earlier in this notebook:
    1. Trained vectorizer
    2. Trained LDA_obj
    3. Transformed LDA_data
    4. topic_names (dataframe with the modeled topic for each TED Talk)
    5. ted_transcripts dataframe containing both CSVs already merged
    '''
    # Vectorize the document corresponding to the TARGET_ID
    target_vector = vectorizer.transform([cleaned_corpus[TARGET_ID]])
    # Model the vector with the trained LDA_obj
    target_modeled = LDA_obj.transform(target_vector)
    # Fit a KNN algorithm on the whole dataset modeled with LDA
    NN = NearestNeighbors(n_neighbors=NUM_RECOMMENDATIONS+1, metric='cosine', algorithm='brute', n_jobs=-1)
    NN.fit(LDA_data)
    # Find the nearest neighbors of the LDA vector corresponding to the TARGET_ID
    # (kneighbors returns cosine distances, where lower means more similar)
    results = NN.kneighbors(target_modeled)
    recommend_list = results[1][0]
    distances = results[0][0]
    # Loop to extract relevant information about the recommendations
    titles, modeled_topics, tags, descriptions = [], [], [], []
    for idx in recommend_list:
        titles.append(ted_transcripts.loc[idx, 'title'])
        modeled_topics.append(topic_names.iloc[idx, 0])
        tags.append(ted_transcripts.loc[idx, 'tags'])
        descriptions.append(ted_transcripts.loc[idx, 'description'])
    # Put recommendations in a dataframe for outputting
    output_df = pd.DataFrame({'ID': recommend_list,
                              'Cosine Distance': distances,
                              'Title': titles,
                              'Modeled Topic': modeled_topics,
                              'Tags': tags,
                              'Description': descriptions})
    # Customize index to specify that the first row is the TED Talk from the TARGET_ID
    custom_index = np.arange(1, NUM_RECOMMENDATIONS+1).tolist()
    custom_index.insert(0, 'Base')
    output_df.set_index([custom_index], inplace=True)
    return output_df
get_recommendation(2020, NUM_RECOMMENDATIONS=10)
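To recommend from a talk chosen by title rather than a hard-coded index, one could look the ID up first; a small sketch (assuming the merged dataframe keeps its default integer index, and the search term is only an example):
# Hypothetical lookup: find the row index of a talk whose title contains a substring
matches = ted_transcripts[ted_transcripts['title'].str.contains('creativity', case=False)]
print(matches[['title']].head())
# get_recommendation(matches.index[0], NUM_RECOMMENDATIONS=5)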