Studying political discourse with: Web Scraping, Natural Language Processing, Topic Modeling, Sentiment Analysis
Data come from the official websites of the politicians being studied:
New York City Mayor: https://www1.nyc.gov
New York State Governor: https://www.governor.ny.gov
# Imports
# Data Manipulation
import os
import re
import sys
import time
import string
import pickle
import requests
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
# Data Visualization
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
from pprint import pprint
# Natural Language Processing
import spacy
from spacy.symbols import amod
from collections import Counter
# Web Scraping
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
# Topic Modeling
import sklearn
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF
# Sentiment Analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Defining the parameter below to avoid the error:
# RecursionError: maximum recursion depth exceeded
# When saving the results to disk
sys.setrecursionlimit(10000)
%reload_ext watermark
%watermark -v --iv
Web scraping discourses from NYC mayor Bill de Blasio and from NY State governor Andrew Cuomo.
The goal is to recover the transcriptions of their official discourses in 2020. Since their websites are frequently updated with new discourses, this code will produce a different result every time it is run. It might even stop working at some point in the future.
# Define chromedriver
# https://chromedriver.chromium.org/downloads
chromedriver = "./chromedriver"
os.environ["webdriver.chrome.driver"] = chromedriver
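Optionally, Chrome can be run headless so that no browser window opens while scraping. This is only a sketch and assumes the same Selenium 3.x-style API used throughout this notebook; the chrome_options object below is not used anywhere else.
# Optional (not used below): run Chrome headless so no browser window opens
from selenium.webdriver.chrome.options import Options
chrome_options = Options()
chrome_options.add_argument('--headless')
# driver = webdriver.Chrome(chromedriver, options = chrome_options)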
Starting by scraping the URLs from the mayor's main transcriptions page and afterwards proceeding to scrape the content of each of those URLs.
# List to store the URLs
list_urls = []
# Loop to create a list of urls through which to run the web scraping
for i in range(1, 41):
full_url = 'https://www1.nyc.gov/office-of-the-mayor/news.page' + '#page-' + str(i)
list_urls.append(full_url)
# Check list
list_urls[37:]
# Counter (sets how many urls will be scraped)
counter = 25
# Function for web scraping
def scraping_urls(urls):
# Define the driver
driver = webdriver.Chrome("./chromedriver")
# List for the result
soup_list = []
# Counter
count = 0
# Loop through urls
for i in tqdm(urls):
if count < counter:
driver.get(i)
driver.refresh()
time.sleep(5)
soup_list.append(BeautifulSoup(driver.page_source, 'html.parser'))
count += 1
driver.close()
return soup_list
# Scrape the source code from each page
soups = scraping_urls(list_urls)
# Function to extract the links from the source code of the HTML pages. These links point to the discourse texts (transcriptions)
def extract_links_mayor(soup_object):
links_list = []
for s in tqdm(soup_object):
links = s.find_all('a', {'href': re.compile(r'transcript')})
for i in links:
link1 = str(i).replace('"', '')
if re.search('=(.+)>T', link1) is not None:
link = re.search('=(.+)>T', link1).group(1)
else:
continue
full_link = 'https://www1.nyc.gov' + link
links_list.append(full_link)
return links_list
# Extract the links for each page with transcripts
link_list = extract_links_mayor(soups)
# Save the links to disk
with open('data/db_links_mayor.pickle', 'wb') as to_write:
pickle.dump(link_list, to_write)
# Function to extract the text of the political discourse from the source code
def extract_discourse_mayor(urls):
driver = webdriver.Chrome(chromedriver)
doc_source = []
for i in tqdm(urls):
driver.get(i)
time.sleep(5)
doc_source.append(BeautifulSoup(driver.page_source, 'html.parser'))
driver.close()
return doc_source
# Extract transcription from each discourse
discourses = extract_discourse_mayor(link_list)
# Save the discourses extracted
with open('data/db_discourse_mayor.pickle', 'wb') as to_write:
pickle.dump(discourses, to_write)
# Counter (sets how many urls will be scraped)
counter = 10
# URL of the pages with discourses
url = 'https://www.governor.ny.gov/keywords/media'
# Function to scrape the URLs
def scraping_urls(url):
# Load the driver and extract the html code
driver = webdriver.Chrome(chromedriver)
soup_list = []
driver.get(url)
driver.refresh()
time.sleep(5)
soup_list.append(BeautifulSoup(driver.page_source, 'html.parser'))
pages = list(range(2,9)) + ([4] * 12)
count = 0
# Loop through the pages
for i in tqdm(pages):
if count < counter:
path = '//*[@id="DataTables_Table_0_paginate"]/span/a[' + str(i) + ']'
driver.find_element_by_xpath(path).click()
time.sleep(5)
soup_list.append(BeautifulSoup(driver.page_source, 'html.parser'))
count += 1
driver.close()
return soup_list
# Scraping the source code from the pages
sources = scraping_urls(url)
# Function to extract links
def extract_links_gov(soup_object):
links_list = []
for s in tqdm(soup_object):
links = s.find_all('a', {'href': re.compile(r'transcript')})
for i in links:
link1 = str(i).replace('"', '')
if re.search('=(.+)>\n', link1) is not None:
link = re.search('=(.+)>\n', link1).group(1)
else:
continue
full_link = 'https://www.governor.ny.gov' + link
links_list.append(full_link)
return list(set(links_list))
# Extract the links from each source code
link_list = extract_links_gov(sources)
# Save each page's links
with open('data/db_links_governor.pickle', 'wb') as to_write:
pickle.dump(link_list, to_write)
# Function to extract discourses
def extract_discourses_gov(urls):
driver = webdriver.Chrome(chromedriver)
doc_source = []
for i in tqdm(urls):
driver.get(i)
time.sleep(5)
doc_source.append(BeautifulSoup(driver.page_source, 'html.parser'))
driver.close()
return doc_source
# Extract discourses
discourses = extract_discourses_gov(link_list)
# Save discourses
with open('data/db_discourse_governor.pickle', 'wb') as to_write:
pickle.dump(discourses, to_write)
Now that the source of each discourse page has been extracted, the discourse texts need to be retrieved.
# Open the database
with open('data/db_links_mayor.pickle', 'rb') as read_file:
db_links_mayor = pickle.load(read_file)
# Open the database
with open('data/db_discourse_mayor.pickle', 'rb') as read_file:
db_discourse_mayor = pickle.load(read_file)
# Function to extract text
def extract_text(source_object):
text_list = []
for s in tqdm(source_object):
text = s.find_all('p')
text_list.append(text)
return text_list
# List for the discourse lengths
discourse_length = []
# Loop to store the length (number of <p> tags) of each extracted text
for i in extract_text(db_discourse_mayor):
    discourse_length.append(len(i))
# Convert to numpy array
discourse_length_array = np.array(discourse_length)
# Extract discourses
first_discourses = extract_text(db_discourse_mayor)
# These are the indexes which need to be re-extracted with extract_text2
# This is necessary because the pages have differences among them
indexes = list(np.argwhere(discourse_length_array == 1).flatten())
# Function for new text extraction
def extract_text2(source_object):
text_list = []
for s in tqdm(source_object):
text = s.find('p').parent
text_list.append(text)
return text_list
# Loop to extract discourses with clearer text
best_discourses = []
for i in indexes:
best_discourses.append(extract_text2(db_discourse_mayor[i]))
# Replace the corresponding first_discourses entries with the best_discourses ones
for (idx, best) in zip(indexes, best_discourses):
    first_discourses[idx] = best
# Function to remove html tags
def remove_html_tags(text):
clean = re.compile('<.*?>')
return re.sub(clean, '', text)
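A quick check of what the tag-stripping regex does, on a toy string (not from the scraped data):
# Illustrative only: the regex drops anything between '<' and '>'
remove_html_tags('<p>Hello <b>New York</b></p>')   # -> 'Hello New York'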
# Function to clean data
def clean_data(transcript_list, link_list):
date = []
text = []
for i in tqdm(transcript_list):
cleaned = remove_html_tags(str(i))
try:
            if re.search('\[\\n(.+)\\n(.+)\]', cleaned) is not None:
date_clean = re.search('\[\\n(.+)\\n(.+)\]', cleaned).group(1)
date.append(date_clean)
text_clean = re.search('\[\\n(.+)\\n(.+)\]', cleaned).group(2)
text.append(text_clean)
else:
date_clean = re.search('\\n\\n(.+20.{2})\\n(.+)\\n\\ufeff', cleaned).group(1)
date.append(date_clean)
text_clean = re.search('\\n\\n(.+20.{2})\\n(.+)\\n\\ufeff', cleaned).group(2)
text.append(text_clean)
except:
continue
date = pd.to_datetime(date)
df = pd.DataFrame([date, link_list, text]).T
df.columns = ['date', 'link', 'text']
return df
# Clean
data_cleaned = clean_data(first_discourses, db_links_mayor)
# Function to extract the discourse using regular expressions
def extract_text_discourse(transcript):
if len(re.findall('sio:([^:]+)|yor:([^:]+)', str(transcript))) > 0:
return str(re.findall('sio:([^:]+)|yor:([^:]+)', str(transcript)))
else:
return ''
# Pattern to remove
toremove = "'\\,\"\[\(\]\)-–"
# Lambda functions to remove undesired characters and patterns from the data
punc_lower = lambda x: re.sub('[%s]' % re.escape(toremove), '', x.lower())
remove_xa0 = lambda x: x.replace('xa0', '')
remove_space = lambda x: x.replace('  ', ' ')  # collapse double spaces into one
# Apply the cleaning
data_cleaned['monologue'] = data_cleaned['text'].map(extract_text_discourse).map(punc_lower).map(remove_xa0).map(remove_space)
# Include a column indicating that this is a discourse from mayor de Blasio
data_cleaned.insert(0, 'speaker', 'de blasio')
# Save the now-cleaned discourses
with open('data/db_discourse_mayor_clean.pickle', 'wb') as to_write:
pickle.dump(data_cleaned, to_write)
Now repeating everything for governor Cuomo
# Open the database
with open('data/db_links_governor.pickle', 'rb') as read_file:
db_links_governor = pickle.load(read_file)
# Open the database
with open('data/db_discourse_governor.pickle', 'rb') as read_file:
db_discourse_governor = pickle.load(read_file)
# Function to get the date (I'll be using this info later)
def get_date(source_object):
date_list = []
for i in tqdm(source_object):
date = i.find('div', class_="published-date").text
date_clean = re.search('\\n\\n(.+20.{2})', date).group(1).strip()
date_list.append(date_clean)
return date_list
# Extract the date
cuomo_date = get_date(db_discourse_governor)
# Function to extract the text
def get_text(source_object):
text_list = []
for s in tqdm(source_object):
text = s.find('div', class_='field field--name-field-body field--type-text-long field--label-hidden')
text_list.append(text)
return text_list
# Extract the text with a first pass through the data
cuomo_step1 = get_text(db_discourse_governor)
# Function to clean the data
def clean_cuomo(transcript_list):
clean_transcripts = []
for idx, i in tqdm(enumerate(transcript_list)):
cleaned = remove_html_tags(str(i))
if re.search('below:(.+)', cleaned) is not None:
text_clean = re.search('below:(.+)', cleaned).group(1)
clean_transcripts.append(text_clean)
elif re.search('here.(.+)', cleaned) is not None:
text_clean = re.search('here.(.+)', cleaned).group(1)
clean_transcripts.append(text_clean)
return clean_transcripts
# Clean the data
cuomo_step2 = clean_cuomo(cuomo_step1)
# Check the length of each extracted text
test_len = []
for i in cuomo_step2:
test_len.append(len(i))
test_len_array = np.array(test_len)
# Sample an extracted text
cuomo_step2[99]
# Drop the entry with the shortest extracted text (a bad extraction) from all three lists so they stay aligned
delete_index = test_len_array.argsort()[0]
del cuomo_step2[delete_index]
del db_links_governor[delete_index]
del cuomo_date[delete_index]
# Converting the date to datetime type
cuomo_date = pd.to_datetime(cuomo_date)
# Create a dataframe with the clean data
cuomo_clean = pd.DataFrame([cuomo_date, db_links_governor, cuomo_step2]).T
cuomo_clean.columns = ['date', 'links', 'text']
# visualize
cuomo_clean.head()
# Save the discourses
with open('data/db_discourse_governor_clean.pickle', 'wb') as to_write:
pickle.dump(cuomo_clean, to_write)
Using three different functions with different regular expressions to extract only the parts of the discourse in which the governor was speaking, as I want to remove other people's comments. Luckily that info is contained in the page from each discourse.
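As a quick illustration of what the first pattern captures, here is a made-up toy transcript (not real scraped text):
# Illustrative only: the pattern captures Cuomo's lines up to the next colon
toy = 'Governor Cuomo: Good morning. Reporter: Any update? Governor Cuomo: Numbers are down.'
re.findall('Cuomo:([^:]+)', toy)
# -> [' Good morning. Reporter', ' Numbers are down.']
# Note that the first capture runs into the next speaker label, which is why the
# additional patterns below are needed.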
# Function to extract discourse
def extract_discourse_gov1(transcript):
if len(re.findall('Cuomo:([^:]+)', str(transcript))) > 0:
return str(re.findall('Cuomo:([^:]+)', str(transcript)))
else:
return str(transcript)
# Function to extract discourse
def extract_discourse_gov2(transcript):
if len(re.findall('Cuomo:(.*?)[A-Z][a-z]+:', str(transcript))) > 0:
return str(re.findall('Cuomo:(.*?)[A-Z][a-z]+:', str(transcript)))
else:
return str(transcript)
# Function to extract discourse
def extract_discourse_gov3(transcript):
if len(re.findall('^(.*?)[A-Z][a-z]+:', str(transcript))) > 0:
return str(re.findall('^(.*?)[A-Z][a-z]+:', str(transcript)))
else:
return str(transcript)
# Clean data
cuomo_clean['monologue'] = cuomo_clean['text'].map(extract_discourse_gov1)
# Clean data
cuomo_clean['monologue2'] = cuomo_clean['text'].map(extract_discourse_gov2)
cuomo_clean.dropna(inplace = True)
# !! Mixed results with this function !!
def extract_m3(col1, col2):
if (len(col1)-len(col2))/len(col2) > 10:
return extract_discourse_gov3(col1)
else:
return ''
# Clean data
cuomo_clean['monologue3'] = cuomo_clean.apply(lambda x: extract_m3(x.text, x.monologue), axis = 1)
# Function to compare
def compare(col1, col2):
if (len(col1) - len(col2)) / len(col2) > 1:
return col1
else:
return col2
# Final text combining the first two extractions
cuomo_clean['final_text'] = cuomo_clean.apply(lambda x: compare(x.monologue, x.monologue2), axis = 1)
# Final text with the third extraction
cuomo_clean['final_text2'] = cuomo_clean.monologue3 + cuomo_clean.final_text
# Remove undesired characters
remove_sxa0 = lambda x: x.replace('\xa0', '')
# Final cleaning
cuomo_clean['final_clean'] = cuomo_clean['final_text2'].map(punc_lower).map(remove_xa0).map(remove_sxa0).map(remove_space)
# Select and rename the final columns
cuomo_final = cuomo_clean.loc[:, ['date', 'links', 'text', 'final_clean']]
cuomo_final.columns = ['date', 'link', 'text', 'monologue']
# Insert a column indicating this is a discourse from governor Cuomo
cuomo_final.insert(0, 'speaker', 'cuomo')
# View
cuomo_final.head()
# Save
with open('data/db_discourse_governor_clean_final.pickle', 'wb') as to_write:
pickle.dump(cuomo_final, to_write)
Opening the already cleaned databases to concatenate them into a single dataframe.
with open('data/db_discourse_mayor_clean.pickle','rb') as read_file:
deblasio = pickle.load(read_file)
with open('data/db_discourse_governor_clean_final.pickle','rb') as read_file:
cuomo = pickle.load(read_file)
# Concatenate the databases
db_final = pd.concat([cuomo, deblasio], axis = 0).reset_index(drop = True)
# Check the proportion of discourses from each politician
db_final['speaker'].value_counts()
# Save the final database
with open('data/db_final.pickle', 'wb') as to_write:
pickle.dump(db_final, to_write)
# Download the language model from spacy
!python -m spacy download en_core_web_sm
import en_core_web_sm
# Set the language model
#sp = spacy.load("en_core_web_sm")
sp = en_core_web_sm.load()
# Load the database
with open('data/db_final.pickle','rb') as read_file:
db_final = pickle.load(read_file)
# Lambda functions to clean the discourse text
alphanumeric = lambda x: re.sub('\w*\d\w*', '', x)
punc = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x)
remove_space = lambda x: x.replace('  ', ' ')  # collapse double spaces into one
# Apply the cleaning lambdas to the final database
db_final['for_spacy'] = (db_final['monologue']
.map(alphanumeric)
.map(punc)
.map(remove_space))
# Applying SpaCy's language model
db_final['spacy_monologue'] = db_final['for_spacy'].map(lambda x: sp(x))
# Visualize
db_final.head()
# Lemmatize
db_final['lemmatized'] = (db_final['spacy_monologue']
.map(lambda x: [' '.join(word.lemma_
if word.lemma_ != '-PRON-'
else word.text for word in x)][0]))
# Visualize
db_final.head()
# List stop words
list_stop_words = (text
.ENGLISH_STOP_WORDS
.union(['lehrer', 'brian', 'darden', 'moderator', 'alan', 'howard', 'wolf', 'blitzer',
'errol', 'louis', 'alisyn', 'chris', 'camerota', 'dan', 'mannarino','john', 'berman',
'savannah', 'guthrie', 'hoda']))
# Setting max_df = 0.5 and min_df = 2 because those resulted in the best topics
cv1 = CountVectorizer(stop_words = list_stop_words, max_df = 0.5, min_df = 2)
# Create the document-term matrix, which contains the word counts for each document
docterm_matrix = cv1.fit_transform(db_final.loc[:, 'lemmatized'])
# Labels
doc_label = ['Document' + str(t) for t in range(len(db_final.loc[:, 'lemmatized']))]
# Final matrix
pd.DataFrame(docterm_matrix.toarray(), index = doc_label, columns = cv1.get_feature_names()).iloc[:10, 630:650]
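For intuition, here is a toy illustration (not part of the analysis) of what CountVectorizer produces: one row per document, one column per vocabulary term, and the values are word counts.
# Illustrative only: a tiny document-term matrix
toy_cv = CountVectorizer()
toy_matrix = toy_cv.fit_transform(['the virus spread', 'the city reopened'])
print(toy_cv.get_feature_names())   # ['city', 'reopened', 'spread', 'the', 'virus']
print(toy_matrix.toarray())         # [[0 0 1 1 1]
                                    #  [1 1 0 1 0]]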
Using Non-Negative Matrix Factorization (NMF) to fill the matrix with the topics per document.
The code below uses NMF with 12 topics, which produced the clearest and most distinct separation.
# Create and train model
nmf_cv = NMF(12)
nmf_topics1 = nmf_cv.fit_transform(docterm_matrix)
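For context: NMF approximates the document-term matrix V (documents x terms) as the product of two non-negative matrices, W (documents x topics) and H (topics x terms), so that V ≈ W H. fit_transform returns W, while H is stored in components_ and is used below as the topic-term dataframe. A quick shape check (illustrative only):
# W: one row per document, one column per topic
print(nmf_topics1.shape)           # (number of documents, 12)
# H: one row per topic, one column per vocabulary term
print(nmf_cv.components_.shape)    # (12, vocabulary size)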
# Topic-term dataframe
topicword_cv1 = pd.DataFrame(nmf_cv.components_.round(3),
index = ['topic0', 'topic1', 'topic2', 'topic3',
'topic4', 'topic5', 'topic6', 'topic7',
'topic8', 'topic9', 'topic10', 'topic11'],
columns = cv1.get_feature_names())
This creates a dataframe with the 12 topics as rows and the terms as columns. The values describe how strongly each term relates to each topic; higher values mean a stronger relationship.
# Matrix
topicword_cv1.iloc[:, 1790:1820]
# Function to find the most important words per topic
def top_words_per_topic(model, terms, topic_names = None):
    for ix, topic in tqdm(enumerate(model.components_)):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic '", topic_names[ix], "'")
        print(", ".join([terms[i] for i in topic.argsort()[:-10 - 1:-1]]))
# Select the speaker (politician) column into a new dataframe
db_final_docs_topics = db_final.loc[:, ['speaker']]
# Visualize
db_final_docs_topics
# Find the dominant topic for each document
db_final_docs_topics['topics12'] = nmf_topics1.argmax(axis = 1)
# Topics per politician
speaker_topics = pd.DataFrame(db_final_docs_topics.groupby(['topics12']).speaker.value_counts())
speaker_topics.columns = ['count']
speaker_topics.reset_index()
speaker_topics_pivot = speaker_topics.reset_index().pivot_table(index = 'topics12', columns = 'speaker', values = 'count', fill_value = 0).reset_index(drop = True)
speaker_topics_pivot['total'] = speaker_topics_pivot['cuomo'] + speaker_topics_pivot['de blasio']
speaker_topics_pivot.sort_values('total', inplace = True)
# Plot
plt.figure(figsize = (14, 8))
names = list(set(speaker_topics_pivot.index))
plt.title('Most Discussed Topics in Speeches')
plt.xlabel('Total Documents per Topic')
plt.xlim(0, 80)
plt.barh(names, speaker_topics_pivot['total'], color = 'magenta', edgecolor = 'white')
plt.yticks(range(0, 12), ['Neighborhood Impact', 'Covid Mechanics', 'Healthcare', 'Education/School',
'Neighborhood Resilience', 'Homelessness', 'DOH Communication', 'New Yorkers',
'Hate Crime', 'Hospital Needs', 'Hospital Status', 'Reopening Metrics'])
plt.tight_layout()
The topic which appears most frequently refers to reopening metrics. It includes terms such as rate, virus, reopening, infection, business and hospitalization. This makes sense, since New York City is in the process of reopening its economy at the time I'm working on this project.
# Plot
plt.figure(figsize = (14, 8))
names = list(set(speaker_topics_pivot.index))
plt.title('Politicians Covering Different Topics')
plt.xlabel('Total Documents per Topic')
plt.xlim(0, 80)
plt.barh(names, speaker_topics_pivot['cuomo'], color = '#77a9cf', edgecolor = 'white', label = 'Cuomo')
plt.barh(names, speaker_topics_pivot['de blasio'], left = speaker_topics_pivot['cuomo'], color = '#df8a62', edgecolor = 'white', label = 'de Blasio')
plt.yticks(range(0, 12), ['Neighborhood Impact', 'Covid Mechanics', 'Healthcare', 'Education/School',
'Neighborhood Resilience', 'Homelessness', 'DOH Communication', 'New Yorkers',
'Hate Crime', 'Hospital Needs', 'Hospital Status', 'Reopening Metrics'])
plt.legend()
plt.tight_layout()
Reopening is purely a Cuomo topic; Cuomo also began many of his discourses by supplying statistics relevant to the reopening. Mayor de Blasio dominates the topic of hospital needs, while Cuomo discusses hospital status. They describe different things: Cuomo's discourses tend to involve updates about ventilators, beds and available equipment, while de Blasio talks about what New York's hospitals are running out of: personnel, supplies and ventilators. Both cover hospitals, but in slightly different ways.
Mayor de Blasio focuses on universally available public services, while Cuomo focuses on appropriate and intensive treatment. The analysis of these parts of the discourse highlights de Blasio's progressive roots, with his focus on working-class New Yorkers and the need for universal public healthcare.
Using the SentimentIntensityAnalyzer and feeding it the discourses to retrieve sentiment scores.
The VADER package returns a negative, neutral, positive and compound score, which can be plotted over time to understand how the sentiment of the discourses varies.
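For reference, here is a toy call (made-up sentence, not from the data) showing the shape of VADER's output:
# Illustrative only: VADER returns a dict with 'neg', 'neu', 'pos' and 'compound' scores
SentimentIntensityAnalyzer().polarity_scores('New York is doing a great job.')
# -> a dict like {'neg': ..., 'neu': ..., 'pos': ..., 'compound': ...}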
# Load dataset
with open('data/db_final.pickle','rb') as read_file:
db_final = pickle.load(read_file)
# Create the sentiment analyser
analyzer = SentimentIntensityAnalyzer()
# Scores list
scores = []
# Calculate sentiment scores
for i in tqdm(db_final['monologue']):
scores.append(analyzer.polarity_scores(i))
# Concatenate results
db_final_docs_sentiment = pd.concat([db_final.loc[:, ['speaker', 'date']], pd.DataFrame(scores)], axis = 1)
Calculating the mean sentiment score for each politician, looking at the compound and positive scores, using a rolling window (window = 7) over the time series.
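As a reminder of how a rolling mean behaves, here is a tiny example on dummy numbers (not the real scores): with the default settings, the first window - 1 values are NaN.
# Illustrative only: rolling mean with window = 7 on a toy series
pd.Series([1, 2, 3, 4, 5, 6, 7, 8]).rolling(window = 7).mean()
# -> NaN for the first six entries, then 4.0 and 5.0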
# Split the data by politician
cuomo_roll = db_final_docs_sentiment[db_final_docs_sentiment.speaker == 'cuomo'].sort_values(by = 'date')
de_blasio_roll = db_final_docs_sentiment[db_final_docs_sentiment.speaker == 'de blasio'].sort_values(by = 'date')
# Moving averages for Cuomo
cuomo_roll['cp_roll_avg'] = cuomo_roll.compound.rolling(window = 7).mean()
cuomo_roll['pos_roll_avg'] = cuomo_roll.pos.rolling(window = 7).mean()
# Moving averages for de Blasio
de_blasio_roll['cp_roll_avg'] = de_blasio_roll.compound.rolling(window = 7).mean()
de_blasio_roll['pos_roll_avg'] = de_blasio_roll.pos.rolling(window = 7).mean()
# Combine the moving averages in a single dataframe
combined_roll = pd.concat([cuomo_roll, de_blasio_roll]).sort_values('date').reset_index(drop = True)
# Visualize
combined_roll.head(10)
# Plot the Compound Score
fig = plt.figure(figsize = (15, 8))
ax = plt.axes()
sns.lineplot(x = 'date', y = 'cp_roll_avg', hue = 'speaker', data = combined_roll)
plt.title('Compound Score 7-Day Rolling Average')
plt.xlabel('Discourse Date')
plt.ylabel('Compound Score')
plt.ylim((-0.6, 1.1))
plt.axhline(y = 0, color = '#77a9cf', linestyle = ':')
plt.axvline(x = datetime.date(2020, 7, 1), color = 'black', linestyle = 'dashed')
plt.xticks(rotation = 30)
plt.tight_layout()
# Plot Positive Score
fig = plt.figure(figsize = (15, 8))
ax = plt.axes()
sns.lineplot(x = 'date', y = 'pos_roll_avg', hue = 'speaker', data = combined_roll)
plt.title('Positive Score 7-Day Rolling Average')
plt.xlabel('Discourse Date')
plt.ylabel('Positive Score')
plt.axvline(x = datetime.date(2020, 7, 1), color = 'black', linestyle = 'dashed')
plt.xticks(rotation = 30);
plt.tight_layout()