Creating a multilabel multitarget dataset from TED Talks transcripts
# Package to store the versions of packages used
!pip install -q watermark
# Imports
# Data manipulation and visualization
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from pylab import rcParams
from matplotlib import rc
from tqdm.notebook import tqdm
import datetime
from time import time
import ast
%matplotlib inline
df_main = pd.read_csv('https://raw.githubusercontent.com/Matheus-Schmitz/TED_Talks_Data_Analysis/master/ted_main.csv')
df_main.head(2)
df_transcript = pd.read_csv('https://raw.githubusercontent.com/Matheus-Schmitz/TED_Talks_Data_Analysis/master/transcripts.csv')
df_transcript.head()
df = pd.merge(left=df_main, right=df_transcript, how='left', left_on='url', right_on='url')
df.dropna(inplace=True)
df.reset_index(inplace=True, drop=True)
df['transcript'] = df['transcript'].fillna('')
df['wc'] = df['transcript'].apply(lambda x: len(x.split()))
# Longest transcript
max(df.wc)
df.head(2)
sns.set_style("whitegrid")
plt.figure(figsize=(25,5))
# distplot is deprecated in recent seaborn releases; histplot/displot is the modern equivalent
sns.distplot(df.wc)
# Checking different percentiles
print(f' 1st percentile: {int(np.percentile(df.wc, 1))} words')
print(f' 5th percentile: {int(np.percentile(df.wc, 5))} words')
print(f'10th percentile: {int(np.percentile(df.wc, 10))} words')
print(f'50th percentile: {int(np.percentile(df.wc, 50))} words')
print(f'90th percentile: {int(np.percentile(df.wc, 90))} words')
print(f'95th percentile: {int(np.percentile(df.wc, 95))} words')
print(f'99th percentile: {int(np.percentile(df.wc, 99))} words')
# Basic text cleaning (could be improved in a later revision)
# The period must be escaped, otherwise '.' acts as a regex wildcard
df.transcript.replace(r'\.', '', regex=True, inplace=True)
df.transcript.replace(',', '', regex=True, inplace=True)
df.transcript.replace('/', '', regex=True, inplace=True)
df.transcript.replace('"', '', regex=True, inplace=True)
df.head(2)
# Pandas reads the 'tags' column as plain strings rather than lists of strings
# Using the ast package to parse that column properly
df['tags'] = df['tags'].apply(lambda x: ast.literal_eval(x))
type(df.tags[0][0])
df.head(2)
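A quick illustration with a made-up tag string (not taken from the dataset) of what ast.literal_eval does here:
raw = "['science', 'technology', 'education']"   # how the list arrives from the CSV
parsed = ast.literal_eval(raw)
print(type(raw), type(parsed), parsed[0])        # <class 'str'> <class 'list'> science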
all_tags = df.apply(lambda x: pd.Series(x['tags']),axis=1).stack().reset_index(level=1, drop=True)
unique_tags = np.unique(all_tags.values)
len(unique_tags)
tag_count = all_tags.value_counts()
tag_count.head(10)
# Number of tags that appear in at least 200 talks
len(tag_count[tag_count >= 200])
tag_filter = tag_count[tag_count >= 200].index
tag_filter
# Create one empty column per frequent tag
for tag_name in tag_filter:
    df[f'{tag_name}'] = pd.Series(dtype='float64')
df.columns
# One-hot encode the tags: mark 1 wherever a talk carries that tag
for index, row in tqdm(df.iterrows()):
    for col_name in df.columns:
        if col_name in row.tags:
            df.at[index, col_name] = int(1)
df.head(3)
# Split each transcript into consecutive 128-word chunks,
# matching the 128-token input length used for BERT below
def get_split(text1):
    l_total = []
    l_parcial = []
    n = (len(text1.split()) // 128) + 1
    for w in range(n):
        l_parcial = text1.split()[w*128 : w*128 + 128]
        l_total.append(" ".join(l_parcial))
    return l_total
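A small sanity check on an artificial text (not a real transcript) to show how the split behaves:
# 300 'words' should produce three chunks of 128, 128 and 44 words
sample_text = " ".join(['word'] * 300)
chunks = get_split(sample_text)
print(len(chunks), [len(c.split()) for c in chunks])   # 3 [128, 128, 44]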
df_input = pd.DataFrame()
df_input['text_split'] = df['transcript'].apply(get_split)
df_splits = df_input.text_split.apply(pd.Series)
df_labels = df.iloc[:, 19:]
df_labels
df_merged = pd.merge(df_splits, df_labels, left_index=True, right_index=True)
df_merged
df_melted = df_merged.melt(id_vars=tag_filter)
df_melted.shape
df_drops = df_melted.dropna(subset=['value'])
df_drops.drop(labels=['variable'], axis=1, inplace=True)
df_drops.reset_index(inplace=True, drop=True)
df_drops.shape
df_drops.tail(20)
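To make the reshape above easier to follow, here is a minimal sketch with a hypothetical two-talk frame (two tags, at most two chunks per talk); the real frame just has more of everything:
toy = pd.DataFrame({0: ['chunk a1', 'chunk b1'],    # first 128-word chunk of each talk
                    1: ['chunk a2', None],          # second chunk (talk b has none)
                    'science': [1, None],
                    'art': [None, 1]})
toy_long = toy.melt(id_vars=['science', 'art'])     # one row per (talk, chunk) pair
toy_long = toy_long.dropna(subset=['value'])        # drop chunks that do not exist
print(toy_long)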
# First 13 columns are labels
# Last column is text (input)
df_final = df_drops.fillna(0)
df_final.head(1)
# Could consider also dropping rows which have no associated theme
# File manipulation imports for Google Colab
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir("/content/drive/My Drive/Colab Notebooks/TED_Talks_Data_Analysis")
df_final.to_csv("/content/drive/My Drive/Colab Notebooks/TED_Talks_Data_Analysis/df_final.csv", index=False)
Using TED Talks transcripts to predict the topic being presented
# File manipulation imports for Google Colab
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir("/content/drive/My Drive/Colab Notebooks/TED_Talks_Data_Analysis")
# Package to store the versions of packages used
!pip install -q watermark
# Package to download the BERT models and process data
!pip install -q transformers
# Imports
# Data manipulation and visualization
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from pylab import rcParams
from matplotlib import rc
from tqdm.notebook import tqdm
import datetime
from time import time
import random
import pylab
# Sklearn
import sklearn
from sklearn.utils import shuffle
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.metrics import roc_curve, auc, precision_recall_curve
from sklearn.metrics import confusion_matrix, multilabel_confusion_matrix  # plot_confusion_matrix was removed in recent sklearn and is not used here
# Deep Learning, NLP and metrics
import torch
import transformers
from textwrap import wrap
from torch import nn, optim
from torch.utils import data
from collections import defaultdict
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from transformers import BertModel
from transformers import BertTokenizer
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
%matplotlib inline
# Package versions
%reload_ext watermark
%watermark -v -iv
df = pd.read_csv('df_final.csv')
df.shape
df = shuffle(df)
df.tail(1)
# Tokenizer download
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-cased')
# Model Hyperparameters
EPOCHS = 10
BATCH_SIZE = 16
MAX_LENGTH = 128
LEARNING_RATE = 0.00002
CLASSIFICATION_THRESHOLD = 0.2
# 2e-5 is one of the fine-tuning learning rates recommended for BERT; other values caused problems when training the classification head on top
class DataBatcher(data.Dataset):

    # Constructor
    def __init__(self, review, targets, tokenizer, max_len):

        # Shuffle reviews and targets together so each review keeps its labels
        tmp = list(zip(review, targets))
        random.shuffle(tmp)
        review, targets = zip(*tmp)

        # Initialize class attributes
        self.review = review
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.review)

    # Method to obtain each review
    def __getitem__(self, item):

        # Load a review
        review = str(self.review[item])

        # Create the review encoding
        encoding = self.tokenizer.encode_plus(review,
                                              max_length = self.max_len,
                                              truncation = True,
                                              add_special_tokens = True,
                                              pad_to_max_length = True,
                                              return_attention_mask = True,
                                              return_token_type_ids = False,
                                              return_tensors = 'pt')

        # Among the returned fields are the input ids and the attention mask
        return {'review_text': review,
                'input_ids': encoding['input_ids'].flatten(),
                'attention_mask': encoding['attention_mask'].flatten(),
                'targets': torch.tensor(self.targets[item], dtype = torch.long)}
# This function creates a data loader to convert the dataset to the BERT input format
# (returns a torch.utils.data.dataloader.DataLoader)
def create_data_loader(df, tokenizer, max_len, batch_size):
    ds = DataBatcher(review = df.value.to_numpy(),
                     targets = df.iloc[:, :-1].to_numpy(),
                     tokenizer = tokenizer,
                     max_len = max_len)
    return data.DataLoader(ds, batch_size = batch_size, num_workers = 4)
# Taking only a fraction of the dataset for experimentation purposes
# Using the whole dataset obviously improves performance, but training takes way too long
# Each epoch takes roughly 1 minute for every 3000 rows,
# so total training time in minutes is approximately EPOCHS * (ROWS / 3000)
df = df[0:3000]
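Plugging the numbers into that rule of thumb (a rough estimate from my runs, not a guarantee):
# With 3000 rows and 10 epochs: 10 * (3000 / 3000) = ~10 minutes of training
estimated_minutes = EPOCHS * (len(df) / 3000)
print(f'Estimated training time: ~{estimated_minutes:.0f} minutes')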
# Train test split
df_train, df_test = train_test_split(df, test_size = 0.2)
# Test validation split
df_valid, df_test = train_test_split(df_test, test_size = 0.5)
print(f'df_train.shape: {df_train.shape}')
print(f'df_test.shape: {df_test.shape}')
print(f'df_valid.shape: {df_valid.shape}')
# Total number of binary predictions (rows * label columns), used as the accuracy denominator
total_preds_df_train = df_train.shape[0] * (df_train.shape[1] -1)
total_preds_df_test = df_test.shape[0] * (df_test.shape[1] -1)
total_preds_df_valid = df_valid.shape[0] * (df_valid.shape[1] -1)
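To make the denominator concrete: every row gets one independent yes/no prediction per label column, so with the 13 label columns built in the previous notebook the 2400/300/300 split above works out to roughly 2400 * 13 = 31,200 train predictions and 300 * 13 = 3,900 each for test and validation.
print(total_preds_df_train, total_preds_df_test, total_preds_df_valid)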
# Load the data_loaders
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LENGTH, BATCH_SIZE)
test_data_loader = create_data_loader(df_test, tokenizer, MAX_LENGTH, BATCH_SIZE)
valid_data_loader = create_data_loader(df_valid, tokenizer, MAX_LENGTH, BATCH_SIZE)
# Visualize a sample on the training data
sample = next(iter(train_data_loader))
print(sample['input_ids'].shape)
print(sample['attention_mask'].shape)
print(sample['targets'].shape)
# Loading the pre-trained BERT model
model_bert = BertModel.from_pretrained('bert-base-cased')
class SentimentClassifier(nn.Module):

    # Constructor
    def __init__(self, n_classes):

        # Initialize attributes
        super(SentimentClassifier, self).__init__()

        # Define the pre-trained BERT model
        self.bert = BertModel.from_pretrained('bert-base-cased')

        # Add a dropout layer
        self.drop1 = nn.Dropout(p=0.25)

        # Add a hidden layer
        self.fc1 = nn.Linear(self.bert.config.hidden_size, 128)

        # Add the output layer (one unit per class)
        self.fc2 = nn.Linear(128, n_classes)

        # Add a dropout layer
        self.drop2 = nn.Dropout(p=0.25)

        # Final classification with sigmoid (independent probability per label)
        self.sigmoid = nn.Sigmoid()

    # Forward method
    def forward(self, input_ids, attention_mask):

        # Get the pooled [CLS] output from BERT
        # (return_dict=False keeps the tuple output on recent transformers versions)
        _, pooled_output = self.bert(input_ids = input_ids,
                                     attention_mask = attention_mask,
                                     return_dict = False)

        # Pass the pooled output through the added layers
        output = self.drop1(pooled_output)
        output = self.fc1(output)
        output = self.fc2(output)
        output = self.drop2(output)
        output = self.sigmoid(output)

        # Return
        return output
# Setting the device to GPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device
class_names = df.columns.drop('value')
class_names
# Create instance of the model
model_sentiment_classifier = SentimentClassifier(len(class_names))
# Send model to the device
model_sentiment_classifier = model_sentiment_classifier.to(device)
# The original BERT implementation uses AdamW: Adam with decoupled weight decay
optimizer = AdamW(model_sentiment_classifier.parameters(), lr = LEARNING_RATE, correct_bias = False)
# Defining the total number of steps
total_step = len(train_data_loader) * EPOCHS
# Adjust the learning rate
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = 0, num_training_steps = total_step)
# Loss function
loss_fn = nn.BCELoss().to(device)
#loss_fn = nn.BCEWithLogitsLoss().to(device)
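For reference, a minimal sketch with made-up logits of why the commented-out alternative exists: the model already applies a sigmoid, so BCELoss on probabilities is used here; BCEWithLogitsLoss would only apply if the sigmoid were removed from forward(), and the two give the same loss value, the logits version just being more numerically stable.
example_logits = torch.tensor([[0.3, -1.2, 2.0]])
example_target = torch.tensor([[1.0, 0.0, 1.0]])
loss_on_probs  = nn.BCELoss()(torch.sigmoid(example_logits), example_target)
loss_on_logits = nn.BCEWithLogitsLoss()(example_logits, example_target)
print(loss_on_probs.item(), loss_on_logits.item())   # same value either way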
# Define the threshold above which a sample is considered to belong to a class
class_threshold = torch.Tensor([CLASSIFICATION_THRESHOLD]).to(device)
# Tensors with predictions
predict_true = torch.Tensor([1]).to(device)
predict_false = torch.Tensor([0]).to(device)
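A tiny illustration with made-up probabilities of how these tensors turn sigmoid outputs into 0/1 multilabel predictions:
example_probs = torch.tensor([[0.05, 0.35, 0.81]]).to(device)
print(torch.where(example_probs > class_threshold, predict_true, predict_false))   # tensor([[0., 1., 1.]])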
# Train function
def train_model(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):

    # Prepare for training
    model = model.train()
    losses = []
    correct_prediction = 0

    # Loop through the data batches, running a complete deep learning cycle on each
    for d in data_loader:
        input_ids = d['input_ids'].to(device)
        attention_mask = d['attention_mask'].to(device)
        targets = d['targets'].to(device)
        outputs = model(input_ids = input_ids, attention_mask = attention_mask)
        preds = torch.where(outputs > class_threshold, predict_true, predict_false)
        loss = loss_fn(outputs, targets.float())
        correct_prediction += torch.sum(preds == targets)
        losses.append(loss.item())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm = 1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return torch.true_divide(correct_prediction, n_examples), np.mean(losses)
# Evaluate function
def evaluate_model(model, data_loader, loss_fn, device, n_examples):
    model.eval()
    losses = []
    correct_prediction = 0
    with torch.no_grad():
        for d in data_loader:
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            targets = d['targets'].to(device)
            outputs = model(input_ids = input_ids, attention_mask = attention_mask)
            preds = torch.where(outputs > class_threshold, predict_true, predict_false)
            loss = loss_fn(outputs, targets.float())
            correct_prediction += torch.sum(preds == targets)
            losses.append(loss.item())
    return torch.true_divide(correct_prediction, n_examples), np.mean(losses)
%%time

# Store the train history
history = defaultdict(list)

# Track the best accuracy and timestamp the checkpoints
now = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
best_accuracy = 0

# Make sure the checkpoint folder exists before saving
os.makedirs('models', exist_ok=True)

# Loop
for epoch in range(EPOCHS):

    start_time = time()

    print(f'Epoch {epoch+1}/{EPOCHS}')
    print('-' * 10)

    train_acc, train_loss = train_model(model_sentiment_classifier,
                                        train_data_loader,
                                        loss_fn,
                                        optimizer,
                                        device,
                                        scheduler,
                                        total_preds_df_train)

    print(f'Train loss: {train_loss:.5f} | Train accuracy: {train_acc:.5f}')

    valid_acc, valid_loss = evaluate_model(model_sentiment_classifier,
                                           valid_data_loader,
                                           loss_fn,
                                           device,
                                           total_preds_df_valid)

    print(f'Valid loss: {valid_loss:.5f} | Valid accuracy: {valid_acc:.5f}')

    end_time = time()
    print(f'Iteration Time: {end_time - start_time:.2f} seconds')
    print()

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['valid_acc'].append(valid_acc)
    history['valid_loss'].append(valid_loss)

    # Save a checkpoint whenever the validation accuracy improves
    if valid_acc > best_accuracy:
        torch.save(model_sentiment_classifier.state_dict(), f'models/model_sentiment_classifier_{now}.bin')
        best_accuracy = valid_acc
fig, ax = plt.subplots(1, 2, figsize=(16,4))
ax[0].plot(history['train_acc'], label='train')
ax[0].plot(history['valid_acc'], label='valid')
ax[0].set_title('Accuracy')
ax[1].plot(history['train_loss'], label='train')
ax[1].plot(history['valid_loss'], label='valid')
ax[1].set_title('Loss')
plt.legend()
plt.show()
# Create a model instance
model = SentimentClassifier(len(class_names))
# Load the model
model.load_state_dict(torch.load(f'models/model_sentiment_classifier_{now}.bin'))
# Send model to device
model = model.to(device)
# Predicting using test data
test_acc, test_loss = evaluate_model(model, test_data_loader, loss_fn, device, total_preds_df_test)
# Model performance
print(f'Test Accuracy: {test_acc}')
print(f'Test Loss: {test_loss}')
# Function to collect test reviews together with predicted and true labels
def get_reviews(model, data_loader):

    model = model.eval()
    review_texts = []
    predictions = []
    prediction_probs = []
    real_values = []

    with torch.no_grad():
        for d in data_loader:
            texts = d['review_text']
            input_ids = d['input_ids'].to(device)
            attention_mask = d['attention_mask'].to(device)
            targets = d['targets'].to(device)
            outputs = model(input_ids = input_ids, attention_mask = attention_mask)
            preds = torch.where(outputs > class_threshold, predict_true, predict_false)
            review_texts.extend(texts)
            predictions.extend(preds)
            prediction_probs.extend(outputs)
            real_values.extend(targets)

    predictions = torch.stack(predictions).cpu()
    prediction_probs = torch.stack(prediction_probs).cpu()
    real_values = torch.stack(real_values).cpu()

    return review_texts, predictions, prediction_probs, real_values
# Gathering real data
y_review_texts, pred_onehot, pred_probs, true_onehot = get_reviews(model, test_data_loader)
pred_bool = pred_onehot.bool()
# Classification report
print(classification_report(true_onehot, pred_onehot, target_names = class_names))
# Sample the predictions tensor
pred_probs[0:3]
# Checking one review
idx = random.randint(0, len(true_onehot) - 1)
review_text = y_review_texts[idx]
true_sentiment = true_onehot[idx]
pred_df = pd.DataFrame({'class_names': class_names,
                        'values': pred_probs[idx]})
print("\n".join(wrap(review_text)))
print()
print(f'Real Topic: {true_sentiment}')
# Prediction plot
fig = plt.gcf()
fig.set_size_inches(25, 5)
sns.barplot(x = 'values', y = 'class_names', data = pred_df, orient = 'h')
plt.title('Probability Assigned to Each Topic', fontsize=18, fontweight="bold", pad=5)
plt.ylabel('Topic', fontsize=12, fontweight="bold", labelpad=20)
plt.xlabel('Probability', fontsize=12, fontweight="bold", labelpad=5)
plt.xlim([0, 1]);
plt.axvline(class_threshold, label='Classification Threshold', linestyle='dashed')
plt.legend()
# Convert tensors to numpy for usage with sklearn
pred_onehot_np = pred_onehot.numpy().astype('int')
true_onehot_np = true_onehot.numpy().astype('int')
pred_probs_np = pred_probs.numpy()
# Set number of classes
N_CLASSES = len(class_names)
N_CLASSES
# Data for the plots
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(N_CLASSES):
    fpr[i], tpr[i], _ = roc_curve(true_onehot_np[:, i], pred_probs_np[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(true_onehot_np.ravel(), pred_probs_np.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
# ROC-AUC Plot: Micro and Macro Scores
# First aggregate all false positive rates
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(N_CLASSES)]))
# Then interpolate all ROC curves at these points
mean_tpr = np.zeros_like(all_fpr)
for i in range(N_CLASSES):
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
# Finally average it and compute AUC
mean_tpr /= N_CLASSES
fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
# Plot all ROC curves
plt.figure(figsize=(10,10))
plt.plot(fpr["micro"], tpr["micro"],
label='micro-average ROC curve (area = {0:0.2f})'
''.format(roc_auc["micro"]),
color='deeppink', linestyle=':', linewidth=4)
plt.plot(fpr["macro"], tpr["macro"],
label='macro-average ROC curve (area = {0:0.2f})'
''.format(roc_auc["macro"]),
color='navy', linestyle=':', linewidth=4)
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Multi Class ROC')
legend = plt.legend(loc="lower right")
legend._legend_box.align = "right"
plt.show()
# ROC-AUC Plot: Class Scores
plt.figure(figsize=(10,10))
colors = []
cm = pylab.get_cmap('nipy_spectral')
for i in range(N_CLASSES):
    color = cm(1. * i / N_CLASSES)
    colors.append(color)
# Use the original column order so each curve matches its label
# (np.unique would re-sort the names and mismatch them with the per-class curves)
for i, color in zip(range(N_CLASSES), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=2,
             label='{0} (area = {1:0.2f})'.format(class_names[i], roc_auc[i]))
plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Multi Class ROC')
legend = plt.legend(loc="lower right")
legend._legend_box.align = "right"
plt.show()
# Metrics
precision = precision_score(true_onehot_np, pred_onehot_np, average = 'macro')
recall = recall_score(true_onehot_np, pred_onehot_np, average = 'macro')
f1_sc = f1_score(true_onehot_np, pred_onehot_np, average = 'macro')
#accuracy_sc = accuracy_score(true_onehot_np, pred_onehot_np)
roc_auc_macro = roc_auc['macro']
print('Model Performance Metrics:')
print(f'Precision = {precision:.5f}')
print(f'Recall = {recall:.5f}')
print(f'F1 Score = {f1_sc:.5f}')
print(f'Accuracy = {test_acc:.5f}')
print(f'ROC-AUC = {roc_auc_macro:.5f}')