TED_Talks_Transcript_Dataset

Introduction

Creating a multilabel, multi-target dataset from TED Talks transcripts

Imports

In [ ]:
# Package to store the versions of packages used
!pip install -q watermark
In [ ]:
# Imports

# Data manipulation and visualization
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from pylab import rcParams
from matplotlib import rc
from tqdm.notebook import tqdm
import datetime
from time import time
import ast

%matplotlib inline
/usr/local/lib/python3.6/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
  import pandas.util.testing as tm

Load Dataset

In [ ]:
df_main = pd.read_csv('https://raw.githubusercontent.com/Matheus-Schmitz/TED_Talks_Data_Analysis/master/ted_main.csv')
df_main.head(2)
Out[ ]:
comments description duration event film_date languages main_speaker name num_speaker published_date ratings related_talks speaker_occupation tags title url views
0 4553 Sir Ken Robinson makes an entertaining and pro... 1164 TED2006 1140825600 60 Ken Robinson Ken Robinson: Do schools kill creativity? 1 1151367060 [{'id': 7, 'name': 'Funny', 'count': 19645}, {... [{'id': 865, 'hero': 'https://pe.tedcdn.com/im... Author/educator ['children', 'creativity', 'culture', 'dance',... Do schools kill creativity? https://www.ted.com/talks/ken_robinson_says_sc... 47227110
1 265 With the same humor and humanity he exuded in ... 977 TED2006 1140825600 43 Al Gore Al Gore: Averting the climate crisis 1 1151367060 [{'id': 7, 'name': 'Funny', 'count': 544}, {'i... [{'id': 243, 'hero': 'https://pe.tedcdn.com/im... Climate advocate ['alternative energy', 'cars', 'climate change... Averting the climate crisis https://www.ted.com/talks/al_gore_on_averting_... 3200520
In [ ]:
df_transcript = pd.read_csv('https://raw.githubusercontent.com/Matheus-Schmitz/TED_Talks_Data_Analysis/master/transcripts.csv')
df_transcript.head()
Out[ ]:
transcript url
0 Good morning. How are you?(Laughter)It's been ... https://www.ted.com/talks/ken_robinson_says_sc...
1 Thank you so much, Chris. And it's truly a gre... https://www.ted.com/talks/al_gore_on_averting_...
2 (Music: "The Sound of Silence," Simon & Garfun... https://www.ted.com/talks/david_pogue_says_sim...
3 If you're here today — and I'm very happy that... https://www.ted.com/talks/majora_carter_s_tale...
4 About 10 years ago, I took on the task to teac... https://www.ted.com/talks/hans_rosling_shows_t...
In [ ]:
df = pd.merge(left=df_main, right=df_transcript, how='left', on='url')
df.dropna(inplace=True)
df.reset_index(inplace=True, drop=True)
In [ ]:
# Safeguard only: a no-op here, since rows with missing transcripts were already dropped above
df['transcript'] = df['transcript'].fillna('')
df['wc'] = df['transcript'].apply(lambda x: len(x.split()))
In [ ]:
# Longest transcript
max(df.wc)
Out[ ]:
9044
In [ ]:
df.head(2)
Out[ ]:
comments description duration event film_date languages main_speaker name num_speaker published_date ratings related_talks speaker_occupation tags title url views transcript wc
0 4553 Sir Ken Robinson makes an entertaining and pro... 1164 TED2006 1140825600 60 Ken Robinson Ken Robinson: Do schools kill creativity? 1 1151367060 [{'id': 7, 'name': 'Funny', 'count': 19645}, {... [{'id': 865, 'hero': 'https://pe.tedcdn.com/im... Author/educator ['children', 'creativity', 'culture', 'dance',... Do schools kill creativity? https://www.ted.com/talks/ken_robinson_says_sc... 47227110 Good morning. How are you?(Laughter)It's been ... 3066
1 265 With the same humor and humanity he exuded in ... 977 TED2006 1140825600 43 Al Gore Al Gore: Averting the climate crisis 1 1151367060 [{'id': 7, 'name': 'Funny', 'count': 544}, {'i... [{'id': 243, 'hero': 'https://pe.tedcdn.com/im... Climate advocate ['alternative energy', 'cars', 'climate change... Averting the climate crisis https://www.ted.com/talks/al_gore_on_averting_... 3200520 Thank you so much, Chris. And it's truly a gre... 2089
In [ ]:
sns.set_style("whitegrid")
plt.figure(figsize=(25,5))
sns.distplot(df.wc)
Out[ ]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fb4c8d4a470>
In [ ]:
# Checking different percentiles
print(f' 1st percentile:  {int(np.percentile(df.wc, 1))} words')
print(f' 5th percentile:  {int(np.percentile(df.wc, 5))} words')
print(f'10th percentile:  {int(np.percentile(df.wc, 10))} words')
print(f'50th percentile: {int(np.percentile(df.wc, 50))} words')
print(f'90th percentile: {int(np.percentile(df.wc, 90))} words')
print(f'95th percentile: {int(np.percentile(df.wc, 95))} words')
print(f'99th percentile: {int(np.percentile(df.wc, 99))} words')
 1st percentile:  218 words
 5th percentile:  562 words
10th percentile:  780 words
50th percentile: 2029 words
90th percentile: 3191 words
95th percentile: 3530 words
99th percentile: 4388 words

Regular Expressions

In [ ]:
# Basic text cleaning
# CONSIDER IMPROVING THIS IN A LATER REVIEW
# Note: without regex=True the first call only matches cells whose entire
# value is '.', so it is a no-op and periods remain in the text
df.transcript.replace('.','', inplace=True)
df.transcript.replace(',','', regex=True, inplace=True)
df.transcript.replace('/','', regex=True, inplace=True)
df.transcript.replace('"','', regex=True, inplace=True)
In [ ]:
df.head(2)
Out[ ]:
comments description duration event film_date languages main_speaker name num_speaker published_date ratings related_talks speaker_occupation tags title url views transcript wc
0 4553 Sir Ken Robinson makes an entertaining and pro... 1164 TED2006 1140825600 60 Ken Robinson Ken Robinson: Do schools kill creativity? 1 1151367060 [{'id': 7, 'name': 'Funny', 'count': 19645}, {... [{'id': 865, 'hero': 'https://pe.tedcdn.com/im... Author/educator ['children', 'creativity', 'culture', 'dance',... Do schools kill creativity? https://www.ted.com/talks/ken_robinson_says_sc... 47227110 Good morning. How are you?(Laughter)It's been ... 3066
1 265 With the same humor and humanity he exuded in ... 977 TED2006 1140825600 43 Al Gore Al Gore: Averting the climate crisis 1 1151367060 [{'id': 7, 'name': 'Funny', 'count': 544}, {'i... [{'id': 243, 'hero': 'https://pe.tedcdn.com/im... Climate advocate ['alternative energy', 'cars', 'climate change... Averting the climate crisis https://www.ted.com/talks/al_gore_on_averting_... 3200520 Thank you so much Chris. And it's truly a grea... 2089

Feature Engineering

In [ ]:
# Pandas reads the 'tags' column as plain strings rather than lists of strings
# Using the ast package to parse each string into an actual Python list
df['tags'] = df['tags'].apply(ast.literal_eval)
In [ ]:
type(df.tags[0][0])
Out[ ]:
str
In [ ]:
df.head(2)
Out[ ]:
comments description duration event film_date languages main_speaker name num_speaker published_date ratings related_talks speaker_occupation tags title url views transcript wc
0 4553 Sir Ken Robinson makes an entertaining and pro... 1164 TED2006 1140825600 60 Ken Robinson Ken Robinson: Do schools kill creativity? 1 1151367060 [{'id': 7, 'name': 'Funny', 'count': 19645}, {... [{'id': 865, 'hero': 'https://pe.tedcdn.com/im... Author/educator [children, creativity, culture, dance, educati... Do schools kill creativity? https://www.ted.com/talks/ken_robinson_says_sc... 47227110 Good morning. How are you?(Laughter)It's been ... 3066
1 265 With the same humor and humanity he exuded in ... 977 TED2006 1140825600 43 Al Gore Al Gore: Averting the climate crisis 1 1151367060 [{'id': 7, 'name': 'Funny', 'count': 544}, {'i... [{'id': 243, 'hero': 'https://pe.tedcdn.com/im... Climate advocate [alternative energy, cars, climate change, cul... Averting the climate crisis https://www.ted.com/talks/al_gore_on_averting_... 3200520 Thank you so much Chris. And it's truly a grea... 2089
In [ ]:
# Explode every talk's tag list into one long Series of individual tags
all_tags = df.apply(lambda x: pd.Series(x['tags']), axis=1).stack().reset_index(level=1, drop=True)
In [ ]:
unique_tags = np.unique(all_tags.values)
len(unique_tags)
Out[ ]:
416
In [ ]:
tag_count = all_tags.value_counts()
tag_count.head(10)
Out[ ]:
technology       708
science          548
global issues    489
culture          476
TEDx             417
design           407
business         337
entertainment    286
health           234
innovation       224
dtype: int64
In [ ]:
# Number of tags that appear in at least 200 talks
len(tag_count[tag_count >= 200])
Out[ ]:
13
In [ ]:
tag_filter = tag_count[tag_count >= 200].index
tag_filter
Out[ ]:
Index(['technology', 'science', 'global issues', 'culture', 'TEDx', 'design',
       'business', 'entertainment', 'health', 'innovation', 'society',
       'social change', 'art'],
      dtype='object')
In [ ]:
# Create an empty float column for each of the 13 most frequent tags
for tag_name in tag_filter:

    df[tag_name] = pd.Series(dtype='float64')
In [ ]:
df.columns
Out[ ]:
Index(['comments', 'description', 'duration', 'event', 'film_date',
       'languages', 'main_speaker', 'name', 'num_speaker', 'published_date',
       'ratings', 'related_talks', 'speaker_occupation', 'tags', 'title',
       'url', 'views', 'transcript', 'wc', 'technology', 'science',
       'global issues', 'culture', 'TEDx', 'design', 'business',
       'entertainment', 'health', 'innovation', 'society', 'social change',
       'art'],
      dtype='object')
In [ ]:
# For each talk, set its one-hot tag columns to 1
for index, row in tqdm(df.iterrows()):

    for col_name in tag_filter:

        if col_name in row.tags:

            df.at[index, col_name] = int(1)

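A vectorized alternative to the iterrows pass, as a sketch (not what was run here), producing the same 1.0/NaN columns:

for tag_name in tag_filter:
    df[tag_name] = df['tags'].apply(lambda tags: 1.0 if tag_name in tags else np.nan)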
In [ ]:
df.head(3)
Out[ ]:
comments description duration event film_date languages main_speaker name num_speaker published_date ratings related_talks speaker_occupation tags title url views transcript wc technology science global issues culture TEDx design business entertainment health innovation society social change art
0 4553 Sir Ken Robinson makes an entertaining and pro... 1164 TED2006 1140825600 60 Ken Robinson Ken Robinson: Do schools kill creativity? 1 1151367060 [{'id': 7, 'name': 'Funny', 'count': 19645}, {... [{'id': 865, 'hero': 'https://pe.tedcdn.com/im... Author/educator [children, creativity, culture, dance, educati... Do schools kill creativity? https://www.ted.com/talks/ken_robinson_says_sc... 47227110 Good morning. How are you?(Laughter)It's been ... 3066 NaN NaN NaN 1.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 265 With the same humor and humanity he exuded in ... 977 TED2006 1140825600 43 Al Gore Al Gore: Averting the climate crisis 1 1151367060 [{'id': 7, 'name': 'Funny', 'count': 544}, {'i... [{'id': 243, 'hero': 'https://pe.tedcdn.com/im... Climate advocate [alternative energy, cars, climate change, cul... Averting the climate crisis https://www.ted.com/talks/al_gore_on_averting_... 3200520 Thank you so much Chris. And it's truly a grea... 2089 1.0 1.0 1.0 1.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 124 New York Times columnist David Pogue takes aim... 1286 TED2006 1140739200 26 David Pogue David Pogue: Simplicity sells 1 1151367060 [{'id': 7, 'name': 'Funny', 'count': 964}, {'i... [{'id': 1725, 'hero': 'https://pe.tedcdn.com/i... Technology columnist [computers, entertainment, interface design, m... Simplicity sells https://www.ted.com/talks/david_pogue_says_sim... 1636292 (Music: The Sound of Silence Simon & Garfunkel... 3253 1.0 NaN NaN NaN NaN NaN NaN 1.0 NaN NaN NaN NaN NaN
In [ ]:
def get_split(text1):
    # Break a transcript into consecutive chunks of up to 128 words
    words = text1.split()
    n = (len(words) // 128) + 1
    l_total = []
    for w in range(n):
        l_parcial = words[w*128 : w*128 + 128]
        l_total.append(" ".join(l_parcial))
    return l_total
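A quick sanity check of the chunking, as a hypothetical snippet (not part of the original run):

sample = " ".join(["word"] * 300)
chunks = get_split(sample)
print(len(chunks), [len(c.split()) for c in chunks])
# 3 [128, 128, 44]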
In [ ]:
df_input = pd.DataFrame()
df_input['text_split'] = df['transcript'].apply(get_split)
In [ ]:
# Expand each list of chunks into one column per chunk position
df_splits = df_input.text_split.apply(pd.Series)
In [ ]:
# Columns from index 19 onward are the one-hot tag columns (equivalent to df[tag_filter])
df_labels = df.iloc[:, 19:]
In [ ]:
df_labels
Out[ ]:
technology science global issues culture TEDx design business entertainment health innovation society social change art
0 NaN NaN NaN 1.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 1.0 1.0 1.0 1.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 1.0 NaN NaN NaN NaN NaN NaN 1.0 NaN NaN NaN NaN NaN
3 NaN NaN NaN NaN NaN NaN 1.0 NaN NaN NaN NaN NaN NaN
4 NaN NaN 1.0 NaN NaN NaN NaN NaN 1.0 NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ...
2456 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 1.0 NaN
2457 NaN 1.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2458 NaN 1.0 NaN NaN NaN NaN NaN NaN NaN 1.0 NaN NaN NaN
2459 NaN NaN NaN NaN 1.0 NaN NaN NaN NaN NaN NaN NaN NaN
2460 NaN NaN NaN NaN NaN 1.0 NaN NaN NaN NaN 1.0 NaN NaN

2461 rows × 13 columns

In [ ]:
df_merged = pd.merge(df_splits, df_labels, left_index=True, right_index=True)
In [ ]:
df_merged
Out[ ]:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 ... 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 technology science global issues culture TEDx design business entertainment health innovation society social change art
0 Good morning. How are you?(Laughter)It's been ... at a dinner party and you say you work in educ... all do. We have a huge vested interest in it p... but I think she's not so to speak exceptional ... the teacher said this girl hardly ever paid at... we were thrilled about. We considered this to ... I bring you myrrh. And the third boy said Fran... you can make. And the result is that we are ed... you? Do you? Because you don't think of Shakes... Sarah. He'd known her for a month.(Laughter)Mi... arts. Art and music are normally given a highe... if you look at the output who really succeeds ... heads. They live up there and slightly to one ... came into being to meet the needs of industria... public education around the world is a protrac... it's because you didn't want one. And I didn't... a human brain as we heard yesterday from a num... at some things but if she's cooking she's deal... a new book at the moment called Epiphany which... school in the '30s wrote to her parents and sa... the end the doctor went and sat next to Gillia... She did. I can't tell you how wonderful it was... to calm down.(Applause)What I think it comes t... all human beings disappeared from the Earth wi... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 Thank you so much Chris. And it's truly a grea... driving from our home in Nashville to a little... then went to the couple in the booth next to u... with you: Tipper and I were driving ourselves ... it could be a bunch of things.(Laughter)But wh... later I got a nice long handwritten letter fro... the slide show every time I give it. I add new... projected to go with the U.S. contribution to ... so we should address that. But it's part of th... profitable. Insulation better design. Buy gree... reduce your carbon dioxide emissions with the ... as hard as you think. Integrate climate soluti... nights ago except it's a lot more entertaining... week to keep it right on the cutting edge. Wor... the United States is out of the world system i... in our modern country the role of logic and re... have more influence than some of us who are De... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 1.0 1.0 1.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 (Music: The Sound of Silence Simon & Garfunkel... gashes. With your fading strength you call 9-1... downside and that is they intended to publish ... design of it to make it easy and enjoyable to ... look like DOS. Over the years it's gotten easi... assurance? Uh-uh. Your call may be recorded so... much time have we got? Another one a guy calle... subject to another primal force: the mandate t... administration.(Laughter)But what's the altern... are you going to stick them? You only have so ... your links automatically. The off switch is in... to be: let's break it down; let's just make it... to break all those rules if they violate the b... intelligent.This one's been touched on before ... talk I met one of the employees. He says Nice ... blank document? You do not. On the opposite si... the code. Every software company is doing Micr... the old 1982 chassis. But there's also a new M... Jobs came back to Apple in 1997 after 12 years... for me Cupertino.(Laughter)The truth is I neve... the meaning of depressed.(Laughter)But it turn... back to your computer? Well you either haul ar... your lives or one of your children. You walk a... getting the echo from the hall and stuff. The ... We'll make this software work right. Right? Be... among the people who create this stuff: Easy i... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 NaN NaN NaN NaN NaN NaN 1.0 NaN NaN NaN NaN NaN
3 If you're here today — and I'm very happy that... facility planned for the East River waterfront... the lovely facilities that I mentioned earlier... first waterfront park that the South Bronx had... might find the good stuff like parks and trees... children has asthma. Our asthma hospitalizatio... the short term and won't destroy us all in the... These things make me different from you. But t... many cities around the country. Red-lining was... were often given less than a month's notice be... that began in the 1960s set the stage for all ... the plan for a waterfront esplanade with dedic... more. We run a project called the Bronx [Envir... utilized for parkland affordable housing and l... for our own green roof installation business b... abused at worst by negligent regulatory agenci... This presentation today only represents some o... agencies on how to deal with the cumulative ef... working closely with Columbia University and o... all responsible for the future that we create.... plazas created one of the most efficient bus m... population is still considered a radical idea ... addressed everywhere. Oh good glad I have a li... you like this. Please don't waste me. By worki... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 NaN NaN NaN NaN NaN NaN
4 About 10 years ago I took on the task to teach... country has the highest child mortality of the... really realized my discovery. I have shown tha... displays it like this: every bubble here is a ... of children per woman: one two three four up t... live here? Or have they got longer lives and l... in the '90s we have the terrible HIV epidemic ... size. And in the '80s now they give up Communi... longer. This is a myth. There's a little hump ... population most in poverty. This is OECD. The ... increases there are hundreds of millions in As... you have sub-Saharan Africa there and we take ... can go here and I can split sub-Saharan Africa... big bubble in the middle. But a huge differenc... Today we don't have to go to Cuba to find a he... the world.But I would like to bring you back t... you are healthy first than if you are wealthy ... you look at the average data of the countries ... yet we tend to discuss on what solutions there... organizations. Because the data is hidden down... Gapminder was appropriate. And we started to w... be searched as others can be searched. We cann... to look at income distributions in completely ... quite easily get any variable you would like t... tendency. It's as if the world is flattening o... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 NaN NaN NaN NaN NaN 1.0 NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2456 So Ma was trying to explain something to me ab... Ma! Ma!(Laughter)My mother explained that The ... citizen is in many ways shaped by newcomers an... volunteer to fight this war?And she told me Be... the people who are left behind: the voters the... Ramon was being deported to Latin America whil... restaurant reviews — restaurant reviews! In th... of their social circle were at risk.I am not s... not born. And in my career in my life I've bee... seeks to answer becomes to me the same one tha... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 1.0 NaN
2457 This is a picture of a sunset on Mars taken by... life in order to understand if we could find l... few fantastic examples he has found on how lif... ocean mist as a source of water and strikingly... fogs or clouds I reported four other sites muc... UV radiation as a source of energy. If confirm... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2458 In my early days as a graduate student I went ... the sheer oneness of it as if it wasn't hundre... or when I think of a flock of starlings that f... for AI then becomes what are those rules of en... to exhibit collective intelligence and that's ... going to take on and set the patterns of our b... two rules are sufficient for the group to be a... the rules are such that we can get the collect... before these insects actually also have patter... these rules we can start to create the robot b... you are completely obsessed with army ants the... and this can enable many different kinds of fu... itself an incredible manifestation of collecti... like? I believe that we can do that. I believe... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 NaN NaN NaN NaN NaN NaN NaN 1.0 NaN NaN NaN
2459 I took a cell phone and accidentally made myse... a survivor myself of police brutality and havi... our nature and gave them Ferraris.(Laughter)Yo... a human. I was an idea an object a caricature.... But I needed to know. Like I wanted to know. A... didn't necessarily agree with and this worked ... YouTube I became Lucius25 white supremacist lu... spend days clicking through my new racist prof... what else led to the momentum of the alt-right... ideas easily debunked. Alt-facts have that qua... so was I. Never in a billion years did I think... they labeled as white genocide that diversity ... race. Join the party. The water's great. Until... Wise and Michelle Alexander Dr. Joy DeGruy Boy... devices however advanced become a blessing and... in mind conversations stop violence conversati... there was a severe lack of trust in the black ... laptops and meet us in person to have real con... have to go through each other to get these thi... when you trick the algorithm of your existence... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 NaN NaN NaN NaN NaN NaN NaN NaN
2460 We humans are becoming an urban species so cit... that I am not alone in this. People love citie... they are all different and I hope you like all... couple of those and this is what he created. S... that you put all of the services in the center... even more into the future. Astergea by Yuttho.... explained this concept he had Speck explain it... kind of a situation. If we build a new road wo... that I have shown you these are the people who... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 NaN NaN NaN NaN 1.0 NaN NaN

2461 rows × 84 columns

In [ ]:
# Melt the chunk columns into rows: one row per (talk, chunk), carrying the 13 labels along
df_melted = df_merged.melt(id_vars=tag_filter)
df_melted.shape
Out[ ]:
(174731, 15)
In [ ]:
# Take an explicit copy to avoid pandas' SettingWithCopyWarning on the in-place ops below
df_drops = df_melted.dropna(subset=['value']).copy()
df_drops.drop(labels=['variable'], axis=1, inplace=True)
df_drops.reset_index(inplace=True, drop=True)
df_drops.shape
In [ ]:
df_drops.tail(20)
Out[ ]:
technology science global issues culture TEDx design business entertainment health innovation society social change art value
40466 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 NaN imagine you're speaking to — I don't know — th...
40467 1.0 NaN 1.0 NaN NaN NaN NaN NaN NaN NaN 1.0 1.0 NaN want to hear more on this because the very wor...
40468 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 NaN of his supporters Let's not be vile anymore to...
40469 1.0 NaN 1.0 NaN NaN NaN NaN NaN NaN NaN 1.0 1.0 NaN to this issue: first of all that we are comple...
40470 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 NaN our categories. All the categories that we tho...
40471 1.0 NaN 1.0 NaN NaN NaN NaN NaN NaN NaN 1.0 1.0 NaN to weave the two together and to understand th...
40472 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 NaN be universal preschool; there have to be chart...
40473 1.0 NaN 1.0 NaN NaN NaN NaN NaN NaN NaN 1.0 1.0 NaN you don't pay attention to what you hear to wh...
40474 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 NaN and figure out whether we can add something he...
40475 1.0 NaN 1.0 NaN NaN NaN NaN NaN NaN NaN 1.0 1.0 NaN go we may all be back in the forest soon. We'r...
40476 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.0 NaN as you already heard is extraordinary and I ca...
40477 1.0 NaN 1.0 NaN NaN NaN NaN NaN NaN NaN 1.0 1.0 NaN is extremely unfair and that we should realize...
40478 1.0 NaN 1.0 NaN NaN NaN NaN NaN NaN NaN 1.0 1.0 NaN if you think about technological disruption so...
40479 1.0 NaN 1.0 NaN NaN NaN NaN NaN NaN NaN 1.0 1.0 NaN should want to want to know the truth to under...
40480 1.0 NaN 1.0 NaN NaN NaN NaN NaN NaN NaN 1.0 1.0 NaN make us satisfied. Let's now try to gain contr...
40481 1.0 NaN 1.0 NaN NaN NaN NaN NaN NaN NaN 1.0 1.0 NaN facing an ecological meltdown. And if we now t...
40482 1.0 NaN 1.0 NaN NaN NaN NaN NaN NaN NaN 1.0 1.0 NaN social critic. I mean the industry focuses mai...
40483 1.0 NaN 1.0 NaN NaN NaN NaN NaN NaN NaN 1.0 1.0 NaN challenge and the best example we have of huma...
40484 1.0 NaN 1.0 NaN NaN NaN NaN NaN NaN NaN 1.0 1.0 NaN are killed in war. So this I think gives us a ...
40485 1.0 NaN 1.0 NaN NaN NaN NaN NaN NaN NaN 1.0 1.0 NaN us do it. Reach out to other people try and ha...
In [ ]:
# First 13 columns are labels, the last column is the text input
# Replace the remaining NaNs with 0 to complete the one-hot label matrix
df_final = df_drops.fillna(0)
In [ ]:
df_final.head(1)
In [ ]:
# Could consider also dropping rows which have no associated theme
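A sketch of that idea, assuming the label columns are every column except 'value' (hypothetical, not applied here):

label_cols = df_final.columns.drop('value')
df_final = df_final[df_final[label_cols].sum(axis=1) > 0]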

Save Dataset

In [ ]:
# File manipulation imports for Google Colab
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir("/content/drive/My Drive/Colab Notebooks/TED_Talks_Data_Analysis")
In [ ]:
df_final.to_csv("/content/drive/My Drive/Colab Notebooks/TED_Talks_Data_Analysis/df_final.csv", index=False)

End

TED_Talks_Topic_Prediction

Introduction

Using TED Talks transcripts to predict the topic being presented

Google Colab

In [1]:
# File manipulation imports for Google Colab
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir("/content/drive/My Drive/Colab Notebooks/TED_Talks_Data_Analysis")
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Imports

In [2]:
# Package to store the versions of packages used
!pip install -q watermark
In [3]:
# Package to download the BERT models and process data
!pip install -q transformers
In [4]:
# Imports

# Data manipulation and visualization
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from pylab import rcParams
from matplotlib import rc
from tqdm.notebook import tqdm
import datetime
from time import time
import random
import pylab

# Sklearn
import sklearn 
from sklearn.utils import shuffle
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import roc_curve, auc, precision_recall_curve
from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix, plot_confusion_matrix

# Deep Learning, NLP and metrics
import torch
import transformers 
from textwrap import wrap
from torch import nn, optim 
from torch.utils import data
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from transformers import BertModel
from transformers import BertTokenizer
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

%matplotlib inline
/usr/local/lib/python3.6/dist-packages/statsmodels/tools/_testing.py:19: FutureWarning: pandas.util.testing is deprecated. Use the functions in the public API at pandas.testing instead.
  import pandas.util.testing as tm
In [5]:
# Package versions
%reload_ext watermark
%watermark -v -iv
transformers 3.0.2
seaborn      0.10.1
sklearn      0.22.2.post1
pandas       1.0.5
matplotlib   3.2.2
numpy        1.18.5
torch        1.6.0+cu101
CPython 3.6.9
IPython 5.5.0

Load Data

In [6]:
df = pd.read_csv('df_final.csv')
df.shape
Out[6]:
(40486, 14)
In [7]:
df = shuffle(df)
In [8]:
df.tail(1)
Out[8]:
technology science global issues culture TEDx design business entertainment health innovation society art social change value
25696 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 all. Clinton just always had that light in him...

Tokenizer

In [9]:
# Model download
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-cased')
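To spot-check what the tokenizer produces, a hypothetical snippet (not part of the original run):

sample_ids = tokenizer.encode("Good morning. How are you?", add_special_tokens=True)
print(tokenizer.convert_ids_to_tokens(sample_ids))
# e.g. ['[CLS]', 'Good', 'morning', '.', 'How', 'are', 'you', '?', '[SEP]']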

Configurations

In [10]:
# Model Hyperparameters
EPOCHS = 10
BATCH_SIZE = 16
MAX_LENGTH = 128
LEARNING_RATE = 0.00002 
CLASSIFICATION_THRESHOLD = 0.2

# BERT is typically fine-tuned with a 2e-5 learning rate; using other LRs on my 'transformer head' gave me problems

Data Batching

In [11]:
class DataBatcher(data.Dataset):

    # Constructor
    def __init__(self, review, targets, tokenizer, max_len):

        # Shuffle reviews and targets together, keeping them aligned
        # (done before assignment, otherwise the attributes would keep the original order)
        tmp = list(zip(review, targets))
        random.shuffle(tmp)
        review, targets = zip(*tmp)

        # Initialize class attributes
        self.review = review
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.review)

    # Method to obtain each review
    def __getitem__(self, item):

        # Load a review
        review = str(self.review[item])

        # Create the review encoding
        encoding = self.tokenizer.encode_plus(review,
                                              max_length = self.max_len,
                                              truncation = True,
                                              add_special_tokens = True,
                                              pad_to_max_length = True,
                                              return_attention_mask = True,
                                              return_token_type_ids = False,
                                              return_tensors = 'pt')

        # Among the returned fields are the input ids and the attention mask
        return {'review_text': review,
                'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten(),
                'targets': torch.tensor(self.targets[item], dtype = torch.long)}
In [12]:
# This function creates a data loader to convert the dataset to the BERT format
# torch.utils.data.dataloader.DataLoader
def create_data_loader(df, tokenizer, max_len, batch_size):
    ds = DataBatcher(review = df.value.to_numpy(),
                     targets = df.iloc[:, :-1].to_numpy(),
                     tokenizer = tokenizer,
                     max_len = max_len)
    
    return data.DataLoader(ds, batch_size = batch_size, num_workers = 4)
In [13]:
# Taking only a fraction of the dataset for experimentation purposes
# Using the whole dataset obviously improved performance, but training takes way too long

# The model epochs take about 1 minute for every 3000 rows,
# so total training time in minutes is roughly EPOCHS * (ROWS / 3000)
# e.g. 10 epochs on 3000 rows ≈ 10 minutes, which matches the wall time below

df = df[0:3000]
In [14]:
# Train test split
df_train, df_test = train_test_split(df, test_size = 0.2)
In [15]:
# Test validation split
df_valid, df_test = train_test_split(df_test, test_size = 0.5) 
In [16]:
print(f'df_train.shape: {df_train.shape}')
print(f'df_test.shape: {df_test.shape}')
print(f'df_valid.shape: {df_valid.shape}')
df_train.shape: (2400, 14)
df_test.shape: (300, 14)
df_valid.shape: (300, 14)
In [17]:
# Total number of individual label predictions (rows × 13 label columns)
total_preds_df_train = df_train.shape[0] * (df_train.shape[1] -1)
total_preds_df_test = df_test.shape[0] * (df_test.shape[1] -1)
total_preds_df_valid = df_valid.shape[0] * (df_valid.shape[1] -1)
In [18]:
# Load the data_loaders
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LENGTH, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LENGTH, BATCH_SIZE)
valid_data_loader = create_data_loader(df_valid, tokenizer, MAX_LENGTH, BATCH_SIZE)
In [19]:
# Visualize a sample on the training data
sample = next(iter(train_data_loader))
print(sample['input_ids'].shape)
print(sample['attention_mask'].shape)
print(sample['targets'].shape)
torch.Size([16, 128])
torch.Size([16, 128])
torch.Size([16, 13])

Model

In [20]:
# Loading the pre-trained BERT model
model_bert = BertModel.from_pretrained('bert-base-cased')
In [21]:
class SentimentClassifier(nn.Module):

    # Constructor
    def __init__ (self, n_classes):

        # Initialize attributes
        super(SentimentClassifier, self).__init__()

        # Define the pre-trained BERT model
        self.bert = BertModel.from_pretrained('bert-base-cased')

        # Add a dropout layer
        self.drop1 = nn.Dropout(p=0.25)

        # Add a hidden layer
        self.fc1 = nn.Linear(self.bert.config.hidden_size, 128)

        # Add the output layer (one unit per class)
        self.fc2 = nn.Linear(128, n_classes)

        # Add a dropout layer
        self.drop2 = nn.Dropout(p=0.25)

        # Final classification with sigmoid
        self.sigmoid = nn.Sigmoid()

    # Forward method
    def forward(self, input_ids, attention_mask):

        # Get BERT's pooled [CLS] output
        _, pooled_output = self.bert(input_ids = input_ids, attention_mask = attention_mask)

        # Define the outputs from the created layers
        output = self.drop1(pooled_output)
        output = self.fc1(output)
        output = self.fc2(output)
        output = self.drop2(output)     
        output = self.sigmoid(output)

        # Return
        return output
In [22]:
# Setting the device to GPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device
Out[22]:
device(type='cuda', index=0)
In [23]:
class_names = df.columns.drop('value')
class_names
Out[23]:
Index(['technology', 'science', 'global issues', 'culture', 'TEDx', 'design',
       'business', 'entertainment', 'health', 'innovation', 'society', 'art',
       'social change'],
      dtype='object')
In [24]:
# Create instance of the model
model_sentiment_classifier = SentimentClassifier(len(class_names))
In [25]:
# Send model to the device
model_sentiment_classifier = model_sentiment_classifier.to(device)
In [26]:
# The original BERT model uses AdamW: algorithm with fixed decay weight
optimizer = AdamW(model_sentiment_classifier.parameters(), lr = LEARNING_RATE, correct_bias = False)
In [27]:
# Defining the total number of steps
total_step = len(train_data_loader) * EPOCHS
In [28]:
# Adjust the learning rate
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_step)
In [29]:
# Loss function: BCELoss expects probabilities, hence the sigmoid at the end of the model
loss_fn = nn.BCELoss().to(device)
#loss_fn = nn.BCEWithLogitsLoss().to(device)
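If the commented-out BCEWithLogitsLoss were used instead, the sigmoid would have to be removed from the model's forward pass, since that loss applies its own sigmoid internally (in a numerically stabler form). A minimal sketch of the equivalence, with made-up tensors:

logits = torch.randn(4, 13)
targets = torch.randint(0, 2, (4, 13)).float()
loss_a = nn.BCEWithLogitsLoss()(logits, targets)
loss_b = nn.BCELoss()(torch.sigmoid(logits), targets)
assert torch.allclose(loss_a, loss_b)  # equal up to floating point error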
In [30]:
# Define the threshold for a sample to be considered part of a class
class_threshold = torch.Tensor([CLASSIFICATION_THRESHOLD]).to(device)
In [31]:
# Tensors with predictions
predict_true = torch.Tensor([1]).to(device)
predict_false = torch.Tensor([0]).to(device)
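As a toy illustration of how these tensors map sigmoid probabilities to multilabel predictions (hypothetical values):

probs = torch.tensor([[0.05, 0.30, 0.85]]).to(device)
print(torch.where(probs > class_threshold, predict_true, predict_false))
# tensor([[0., 1., 1.]], device='cuda:0') with CLASSIFICATION_THRESHOLD = 0.2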
In [32]:
# Train function
def train_model(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):

    # Prepare for training
    model = model.train()
    losses = []
    correct_prediction = 0

    # Loop through the data samples
    # Complete deep learning cycle: forward pass, loss, backprop, optimizer step
    for d in data_loader:
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        targets = d['targets'].to(device)
        outputs = model(input_ids = input_ids, attention_mask = attention_mask)

        preds = torch.where(outputs > class_threshold, predict_true, predict_false)
        loss = loss_fn(outputs, targets.float())

        correct_prediction += torch.sum(preds == targets)
        losses.append(loss.item())

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm = 1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return torch.true_divide(correct_prediction, n_examples), np.mean(losses)
In [33]:
# Evaluate function
def evaluate_model(model, data_loader, loss_fn, device, n_examples):

    model.eval()
    losses = []
    correct_prediction = 0

    with torch.no_grad():
        for d in data_loader:
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            targets = d['targets'].to(device)
            outputs = model(input_ids = input_ids, attention_mask = attention_mask)

            preds = torch.where(outputs > class_threshold, predict_true, predict_false)
            loss = loss_fn(outputs, targets.float())

            correct_prediction += torch.sum(preds == targets)
            losses.append(loss.item())

    return torch.true_divide(correct_prediction, n_examples), np.mean(losses)

Training

In [34]:
%%time

# Store the train history
history = defaultdict(list)

# Control the best accuracy
now = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
best_accuracy = 0

# Loop
for epoch in range(EPOCHS):

    start_time = time()

    print(f'Epoch {epoch+1}/{EPOCHS}')
    print('-' * 10)
    train_acc, train_loss = train_model(model_sentiment_classifier,
                                        train_data_loader,
                                        loss_fn,
                                        optimizer,
                                        device,
                                        scheduler,
                                        total_preds_df_train)
    
    print(f'Train error: {train_loss:.5f} | Train accuracy: {train_acc:.5f}')

    valid_acc, valid_loss = evaluate_model(model_sentiment_classifier,
                                           valid_data_loader,
                                           loss_fn,
                                           device,
                                           total_preds_df_valid)
    
    print(f'Valid error: {valid_loss:.5f} | Valid accuracy: {valid_acc:.5f}')

    end_time = time()
    print(f'Iteration Time: {end_time - start_time:.2f} seconds')
    print()

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)

    history['valid_acc'].append(valid_acc)
    history['valid_loss'].append(valid_loss)

    if valid_acc > best_accuracy:
        torch.save(model_sentiment_classifier.state_dict(), f'models/model_sentiment_classifier_{now}.bin')
        best_accuracy = valid_acc
Epoch 1/10
----------
Train error: 0.49360 | Train accuracy: 0.54455
Valid error: 0.41049 | Valid accuracy: 0.63282
Iteration Time: 62.50 seconds

Epoch 2/10
----------
Train error: 0.46665 | Train accuracy: 0.60058
Valid error: 0.39626 | Valid accuracy: 0.64385
Iteration Time: 61.84 seconds

Epoch 3/10
----------
Train error: 0.44233 | Train accuracy: 0.61801
Valid error: 0.38026 | Valid accuracy: 0.66923
Iteration Time: 62.01 seconds

Epoch 4/10
----------
Train error: 0.41870 | Train accuracy: 0.63516
Valid error: 0.37675 | Valid accuracy: 0.68846
Iteration Time: 62.09 seconds

Epoch 5/10
----------
Train error: 0.39726 | Train accuracy: 0.64756
Valid error: 0.37195 | Valid accuracy: 0.70282
Iteration Time: 62.08 seconds

Epoch 6/10
----------
Train error: 0.38003 | Train accuracy: 0.65955
Valid error: 0.37061 | Valid accuracy: 0.71462
Iteration Time: 62.08 seconds

Epoch 7/10
----------
Train error: 0.36270 | Train accuracy: 0.67436
Valid error: 0.37469 | Valid accuracy: 0.70103
Iteration Time: 62.13 seconds

Epoch 8/10
----------
Train error: 0.35433 | Train accuracy: 0.67869
Valid error: 0.37767 | Valid accuracy: 0.69564
Iteration Time: 61.87 seconds

Epoch 9/10
----------
Train error: 0.34043 | Train accuracy: 0.68981
Valid error: 0.36837 | Valid accuracy: 0.72487
Iteration Time: 62.07 seconds

Epoch 10/10
----------
Train error: 0.33954 | Train accuracy: 0.68750
Valid error: 0.36610 | Valid accuracy: 0.73667
Iteration Time: 62.22 seconds

CPU times: user 6min 18s, sys: 3min 56s, total: 10min 14s
Wall time: 10min 37s
In [35]:
fig, ax = plt.subplots(1, 2, figsize=(16,4))

ax[0].plot(history['train_acc'], label='train')
ax[0].plot(history['valid_acc'], label='valid')
ax[0].set_title('Accuracy')

ax[1].plot(history['train_loss'], label='train')
ax[1].plot(history['valid_loss'], label='valid')
ax[1].set_title('Loss')

plt.legend()
plt.show()

Evaluate Model

In [36]:
# Create a model instance
model = SentimentClassifier(len(class_names))
In [37]:
# Load the model
model.load_state_dict(torch.load(f'models/model_sentiment_classifier_{now}.bin'))
Out[37]:
<All keys matched successfully>
In [38]:
# Send model to device
model = model.to(device)
In [39]:
# Predicting using test data
test_acc, test_loss = evaluate_model(model, test_data_loader, loss_fn, device, total_preds_df_test)
In [40]:
# Model performance
print(f'Test Accuracy:  {test_acc}')
print(f'Test Loss:      {test_loss}')
Test Accuracy:  0.7353845834732056
Test Loss:      0.3489445181269395
In [41]:
# Function to collect review texts, predictions, probabilities and true labels
def get_reviews(model, data_loader):
    model = model.eval()

    review_texts = []
    predictions = []
    prediction_probs = []
    real_values = []

    with torch.no_grad():
        for d in data_loader:
            texts = d['review_text']
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            targets = d['targets'].to(device)
            outputs = model(input_ids = input_ids, attention_mask = attention_mask)

            preds = torch.where(outputs > class_threshold, predict_true, predict_false)

            review_texts.extend(texts)
            predictions.extend(preds)
            prediction_probs.extend(outputs)
            real_values.extend(targets)

    predictions = torch.stack(predictions).cpu()
    prediction_probs = torch.stack(prediction_probs).cpu()
    real_values = torch.stack(real_values).cpu()

    return review_texts, predictions, prediction_probs, real_values
In [42]:
# Gather predictions and ground truth from the test data
y_review_texts, pred_onehot, pred_probs, true_onehot = get_reviews(model, test_data_loader)
In [43]:
pred_bool = pred_onehot.bool()
In [44]:
# Classification report
print(classification_report(true_onehot, pred_onehot, target_names = class_names))
               precision    recall  f1-score   support

   technology       0.39      0.79      0.52        72
      science       0.49      0.84      0.62        74
global issues       0.38      0.77      0.51        62
      culture       0.27      0.80      0.40        54
         TEDx       0.22      0.65      0.32        62
       design       0.30      0.63      0.41        35
     business       0.26      0.62      0.37        39
entertainment       0.31      0.59      0.41        22
       health       0.43      0.54      0.48        41
   innovation       0.25      0.56      0.35        25
      society       0.14      0.38      0.21        24
          art       0.24      0.50      0.32        18
social change       0.18      0.37      0.24        30

    micro avg       0.31      0.67      0.42       558
    macro avg       0.30      0.62      0.40       558
 weighted avg       0.32      0.67      0.43       558
  samples avg       0.32      0.60      0.39       558

/usr/local/lib/python3.6/dist-packages/sklearn/metrics/_classification.py:1272: UndefinedMetricWarning: Recall and F-score are ill-defined and being set to 0.0 in samples with no true labels. Use `zero_division` parameter to control this behavior.
  _warn_prf(average, modifier, msg_start, len(result))
In [45]:
# Sample the predictions tensor
pred_probs[0:3] 
Out[45]:
tensor([[0.6053, 0.8808, 0.0790, 0.0642, 0.1559, 0.1556, 0.0816, 0.0296, 0.0749,
         0.2057, 0.0360, 0.0397, 0.0340],
        [0.8425, 0.8366, 0.0509, 0.0631, 0.1616, 0.3793, 0.1339, 0.0437, 0.0802,
         0.3334, 0.0391, 0.0748, 0.0466],
        [0.0833, 0.0449, 0.3175, 0.4704, 0.2721, 0.0513, 0.2557, 0.0950, 0.0553,
         0.0309, 0.1681, 0.0510, 0.2299]])

Checking One Review

In [46]:
# Checking one review
idx = random.randint(0, len(true_onehot) - 1)  # randint is inclusive on both ends

review_text = y_review_texts[idx]
true_sentiment = true_onehot[idx]

pred_df = pd.DataFrame(
    {
        'class_names': class_names,
        'values': pred_probs[idx]
    }
)
In [47]:
print("\n".join(wrap(review_text)))
print()
print(f'Real Topic: {true_sentiment}')
of understanding the grassroots the root causes of things; they don't
want to know why people hate us. I want to understand it. The reason
you're trying to understand why they hate us is to get them to quit
hating us. The idea when you go through this moral exercise of really
coming to appreciate their humanity and better understand them is part
of an effort to get them to appreciate your humanity in the long run.
I think it's the first step toward that. That's the long-term
goal.There are people who worry about this and in fact I myself
apparently was denounced on national TV a couple of nights ago because
of an op-ed I'd written. It was kind of along these lines and the
allegation was

Real Topic: tensor([0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0])
In [48]:
# Prediction plot
fig = plt.gcf()
fig.set_size_inches(25, 5)
sns.barplot(x = 'values', y = 'class_names', data = pred_df, orient = 'h')
plt.title('Probability Assigned to Each Topic', fontsize=18, fontweight="bold", pad=5)
plt.ylabel('Topic', fontsize=12, fontweight="bold", labelpad=20)
plt.xlabel('Probability', fontsize=12, fontweight="bold", labelpad=5)
plt.xlim([0, 1]);
plt.axvline(class_threshold, label='Classification Threshold', linestyle='dashed')
plt.legend()
Out[48]:
<matplotlib.legend.Legend at 0x7f3d90f5d320>

ROC-AUC

In [49]:
# Convert tensors to numpy for usage with sklearn
pred_onehot_np = pred_onehot.numpy().astype('int')
true_onehot_np = true_onehot.numpy().astype('int')
pred_probs_np = pred_probs.numpy()
In [50]:
# Set number of classes
N_CLASSES = len(class_names)
N_CLASSES
Out[50]:
13
In [51]:
# Data for the plots
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(N_CLASSES):
    fpr[i], tpr[i], _ = roc_curve(true_onehot_np[:, i], pred_probs_np[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(true_onehot_np.ravel(), pred_probs_np.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
In [52]:
# ROC-AUC Plot: Micro and Macro Scores

# First aggregate all false positive rates
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(N_CLASSES)]))

# Then interpolate all ROC curves at these points
mean_tpr = np.zeros_like(all_fpr)
for i in range(N_CLASSES):
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

# Finally average it and compute AUC
mean_tpr /= N_CLASSES

fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

# Plot all ROC curves
plt.figure(figsize=(10,10))
plt.plot(fpr["micro"], tpr["micro"],
         label='micro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["micro"]),
         color='deeppink', linestyle=':', linewidth=4)

plt.plot(fpr["macro"], tpr["macro"],
         label='macro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["macro"]),
         color='navy', linestyle=':', linewidth=4)


plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Multi Class ROC')
legend = plt.legend()
legend._legend_box.align = "right"
plt.legend(loc="lower right")
plt.show()
In [53]:
# ROC-AUC Plot: Class Scores

plt.figure(figsize=(10,10))

colors = []
cm = pylab.get_cmap('nipy_spectral')
for i in range(N_CLASSES):
    color = cm(1.*i/N_CLASSES)
    colors.append(color)
# Keep the original column order so labels line up with fpr[i] and roc_auc[i]
# (np.unique would sort alphabetically and mislabel the curves)
unique_labels = class_names

for i, color in zip(range(N_CLASSES), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2,
             label='{0} (area = {1:0.2f})'.format(unique_labels[i], roc_auc[i]))

plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Multi Class ROC')
legend = plt.legend()
legend._legend_box.align = "right"
plt.legend(loc="lower right")
plt.show()

Metrics

In [54]:
# Metrics
precision = precision_score(true_onehot_np, pred_onehot_np, average = 'macro')
recall = recall_score(true_onehot_np, pred_onehot_np, average = 'macro')
f1_sc = f1_score(true_onehot_np, pred_onehot_np, average = 'macro')
#accuracy_sc = accuracy_score(true_onehot_np, pred_onehot_np)
roc_auc_macro = roc_auc['macro']
In [55]:
print('Model Performance Metrics:')
print(f'Precision  =  {precision:.5f}')
print(f'Recall     =  {recall:.5f}')
print(f'F1 Score   =  {f1_sc:.5f}')
print(f'Accuracy   =  {test_acc:.5f}')
print(f'ROC-AUC    =  {roc_auc_macro:.5f}')
Model Performance Metrics:
Precision  =  0.29646
Recall     =  0.61679
F1 Score   =  0.39594
Accuracy   =  0.73538
ROC-AUC    =  0.75834

End