Studying political discourse with: Web Scraping, Natural Language Processing, Topic Modeling, Sentiment Analysis
Data come from the official websites of the politicians being studied:
New York City Mayor: https://www1.nyc.gov
New York State Governor: https://www.governor.ny.gov
# Imports
# Data Manipulation
import os
import re
import sys
import time
import string
import pickle
import requests
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
# Data Visualization
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
from pprint import pprint
# Natural Language Processing
import spacy
from spacy.symbols import amod
from collections import Counter
# Web Scraping
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
# Topic Modeling
import sklearn
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF
# Sentiment Analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# Defining the parameter below to avoid the error:
# RecursionError: maximum recursion depth exceeded
# When saving the results to disk
sys.setrecursionlimit(10000)
%reload_ext watermark
%watermark -v --iv
Web scraping discourses from NYC mayor Bill de Blasio and from NY State governor Andrew Cuomo.
The goal is to recover the transcriptions of their official discourses in 2020. Since their websites are frequently updated with new discourses, this code will produce a different result every time it is run. It might even stop working at some point in the future.
# Define chromedriver
# https://chromedriver.chromium.org/downloads
chromedriver = "./chromedriver"
os.environ["webdriver.chrome.driver"] = chromedriver
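Optionally, Chrome can be run headless so that no browser window opens while scraping. This is only a sketch and assumes the same Selenium 3.x-style API used throughout this notebook; the chrome_options object below is not used anywhere else.
# Optional (not used below): run Chrome headless so no browser window opens
from selenium.webdriver.chrome.options import Options
chrome_options = Options()
chrome_options.add_argument('--headless')
# driver = webdriver.Chrome(chromedriver, options = chrome_options)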
Starting by scraping the URLs from the mayor's main transcriptions page and afterwards proceeding to scrape the content of each of those URLs.
# List to store the URLs
list_urls = []
# Loop to create a list of urls through which to run the web scraping
for i in range(1, 41):
full_url = 'https://www1.nyc.gov/office-of-the-mayor/news.page' + '#page-' + str(i)
list_urls.append(full_url)
# Check list
list_urls[37:]
# Counter (sets how many urls will be scraped)
counter = 25
# Function for web scraping
def scraping_urls(urls):
# Define the driver
driver = webdriver.Chrome("./chromedriver")
# List for the result
soup_list = []
# Counter
count = 0
# Loop through urls
for i in tqdm(urls):
if count < counter:
driver.get(i)
driver.refresh()
time.sleep(5)
soup_list.append(BeautifulSoup(driver.page_source, 'html.parser'))
count += 1
driver.close()
return soup_list
# Scrape the source code from each page
soups = scraping_urls(list_urls)
# Function to extract the links from the source code of the HTML pages. These links point to the discourse texts (transcriptions)
def extract_links_mayor(soup_object):
links_list = []
for s in tqdm(soup_object):
links = s.find_all('a', {'href': re.compile(r'transcript')})
for i in links:
link1 = str(i).replace('"', '')
if re.search('=(.+)>T', link1) is not None:
link = re.search('=(.+)>T', link1).group(1)
else:
continue
full_link = 'https://www1.nyc.gov' + link
links_list.append(full_link)
return links_list
# Extract the links for each page with transcripts
link_list = extract_links_mayor(soups)
# Save the links to disk
with open('data/db_links_mayor.pickle', 'wb') as to_write:
pickle.dump(link_list, to_write)
# Function to extract the text of the political discourse from the source code
def extract_discourse_mayor(urls):
driver = webdriver.Chrome(chromedriver)
doc_source = []
for i in tqdm(urls):
driver.get(i)
time.sleep(5)
doc_source.append(BeautifulSoup(driver.page_source, 'html.parser'))
driver.close()
return doc_source
# Extract transcription from each discourse
discourses = extract_discourse_mayor(link_list)
# Save the discourses extracted
with open('data/db_discourse_mayor.pickle', 'wb') as to_write:
pickle.dump(discourses, to_write)
# Counter (sets how many urls will be scraped)
counter = 10
# URL of the pages with discourses
url = 'https://www.governor.ny.gov/keywords/media'
# Function to scrape the URLs
def scraping_urls(url):
# Load the driver and extract the html code
driver = webdriver.Chrome(chromedriver)
soup_list = []
driver.get(url)
driver.refresh()
time.sleep(5)
soup_list.append(BeautifulSoup(driver.page_source, 'html.parser'))
pages = list(range(2,9)) + ([4] * 12)
count = 0
# Loop through the pages
for i in tqdm(pages):
if count < counter:
path = '//*[@id="DataTables_Table_0_paginate"]/span/a[' + str(i) + ']'
driver.find_element_by_xpath(path).click()
time.sleep(5)
soup_list.append(BeautifulSoup(driver.page_source, 'html.parser'))
count += 1
driver.close()
return soup_list
# Scraping the source code from the pages
sources = scraping_urls(url)
# Function to extract links
def extract_links_gov(soup_object):
links_list = []
for s in tqdm(soup_object):
links = s.find_all('a', {'href': re.compile(r'transcript')})
for i in links:
link1 = str(i).replace('"', '')
if re.search('=(.+)>\n', link1) is not None:
link = re.search('=(.+)>\n', link1).group(1)
else:
continue
full_link = 'https://www.governor.ny.gov' + link
links_list.append(full_link)
return list(set(links_list))
# Extract the links from each source code
link_list = extract_links_gov(sources)
# Save each page's links
with open('data/db_links_governor.pickle', 'wb') as to_write:
pickle.dump(link_list, to_write)
# Function to extract discourses
def extract_discourses_gov(urls):
driver = webdriver.Chrome(chromedriver)
doc_source = []
for i in tqdm(urls):
driver.get(i)
time.sleep(5)
doc_source.append(BeautifulSoup(driver.page_source, 'html.parser'))
driver.close()
return doc_source
# Extract discourses
discourses = extract_discourses_gov(link_list)
# Save discourses
with open('data/db_discourse_governor.pickle', 'wb') as to_write:
pickle.dump(discourses, to_write)
Now that the source of each discourse page has been extracted, the discourse texts need to be retrieved.
# Open the database
with open('data/db_links_mayor.pickle', 'rb') as read_file:
db_links_mayor = pickle.load(read_file)
# Open the database
with open('data/db_discourse_mayor.pickle', 'rb') as read_file:
db_discourse_mayor = pickle.load(read_file)
# Function to extract text
def extract_text(source_object):
text_list = []
for s in tqdm(source_object):
text = s.find_all('p')
text_list.append(text)
return text_list
# List for the discourse lengths
discourse_length = []
# Loop to store the length (number of <p> tags) of each extracted text
for i in extract_text(db_discourse_mayor):
    discourse_length.append(len(i))
# Convert to numpy array
discourse_length_array = np.array(discourse_length)
# Extract discourses
first_discourses = extract_text(db_discourse_mayor)
# These are the indexes which need to be re-extracted with extract_text2
# This is necessary because the pages have differences among them
indexes = list(np.argwhere(discourse_length_array == 1).flatten())
# Function for new text extraction
def extract_text2(source_object):
text_list = []
for s in tqdm(source_object):
text = s.find('p').parent
text_list.append(text)
return text_list
# Loop to extract discourses with clearer text
best_discourses = []
for i in indexes:
best_discourses.append(extract_text2(db_discourse_mayor[i]))
# Replace the corresponding first_discourses entries with the best_discourses ones
for (idx, best) in zip(indexes, best_discourses):
    first_discourses[idx] = best
# Function to remove html tags
def remove_html_tags(text):
clean = re.compile('<.*?>')
return re.sub(clean, '', text)
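A quick check of what the tag-stripping regex does, on a toy string (not from the scraped data):
# Illustrative only: the regex drops anything between '<' and '>'
remove_html_tags('<p>Hello <b>New York</b></p>')   # -> 'Hello New York'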
# Function to clean data
def clean_data(transcript_list, link_list):
date = []
text = []
for i in tqdm(transcript_list):
cleaned = remove_html_tags(str(i))
try:
            if re.search('\[\\n(.+)\\n(.+)\]', cleaned) is not None:
date_clean = re.search('\[\\n(.+)\\n(.+)\]', cleaned).group(1)
date.append(date_clean)
text_clean = re.search('\[\\n(.+)\\n(.+)\]', cleaned).group(2)
text.append(text_clean)
else:
date_clean = re.search('\\n\\n(.+20.{2})\\n(.+)\\n\\ufeff', cleaned).group(1)
date.append(date_clean)
text_clean = re.search('\\n\\n(.+20.{2})\\n(.+)\\n\\ufeff', cleaned).group(2)
text.append(text_clean)
except:
continue
date = pd.to_datetime(date)
df = pd.DataFrame([date, link_list, text]).T
df.columns = ['date', 'link', 'text']
return df
# Clean
data_cleaned = clean_data(first_discourses, db_links_mayor)
# Function to extract the discourse using regular expressions
def extract_text_discourse(transcript):
if len(re.findall('sio:([^:]+)|yor:([^:]+)', str(transcript))) > 0:
return str(re.findall('sio:([^:]+)|yor:([^:]+)', str(transcript)))
else:
return ''
# Pattern to remove
toremove = "'\\,\"\[\(\]\)-–"
# Lambda functions to remove undesired characters and patterns from the data
punc_lower = lambda x: re.sub('[%s]' % re.escape(toremove), '', x.lower())
remove_xa0 = lambda x: x.replace('xa0', '')
remove_space = lambda x: x.replace('  ', ' ')  # collapse double spaces into one
# Apply the cleaning
data_cleaned['monologue'] = data_cleaned['text'].map(extract_text_discourse).map(punc_lower).map(remove_xa0).map(remove_space)
# Include a column indicating that this is a discourse from mayor de Blasio
data_cleaned.insert(0, 'speaker', 'de blasio')
# Save the now-cleaned discourses
with open('data/db_discourse_mayor_clean.pickle', 'wb') as to_write:
pickle.dump(data_cleaned, to_write)
Now repeating everything for governor Cuomo
# Open the database
with open('data/db_links_governor.pickle', 'rb') as read_file:
db_links_governor = pickle.load(read_file)
# Open the database
with open('data/db_discourse_governor.pickle', 'rb') as read_file:
db_discourse_governor = pickle.load(read_file)
# Function to get the date (I'll be using this info later)
def get_date(source_object):
date_list = []
for i in tqdm(source_object):
date = i.find('div', class_="published-date").text
date_clean = re.search('\\n\\n(.+20.{2})', date).group(1).strip()
date_list.append(date_clean)
return date_list
# Extract the date
cuomo_date = get_date(db_discourse_governor)
# Function to extract the text
def get_text(source_object):
text_list = []
for s in tqdm(source_object):
text = s.find('div', class_='field field--name-field-body field--type-text-long field--label-hidden')
text_list.append(text)
return text_list
# Extract the text with a first pass through the data
cuomo_step1 = get_text(db_discourse_governor)
# Function to clean the data
def clean_cuomo(transcript_list):
clean_transcripts = []
for idx, i in tqdm(enumerate(transcript_list)):
cleaned = remove_html_tags(str(i))
if re.search('below:(.+)', cleaned) is not None:
text_clean = re.search('below:(.+)', cleaned).group(1)
clean_transcripts.append(text_clean)
elif re.search('here.(.+)', cleaned) is not None:
text_clean = re.search('here.(.+)', cleaned).group(1)
clean_transcripts.append(text_clean)
return clean_transcripts
# Clean the data
cuomo_step2 = clean_cuomo(cuomo_step1)
# Check the length of each extracted text
test_len = []
for i in cuomo_step2:
test_len.append(len(i))
test_len_array = np.array(test_len)
# Sample an extracted text
cuomo_step2[99]
# Drop the entry with the shortest extracted text (a bad extraction) from all three lists so they stay aligned
delete_index = test_len_array.argsort()[0]
del cuomo_step2[delete_index]
del db_links_governor[delete_index]
del cuomo_date[delete_index]
# Converting the date to datetime type
cuomo_date = pd.to_datetime(cuomo_date)
# Create a dataframe with the clean data
cuomo_clean = pd.DataFrame([cuomo_date, db_links_governor, cuomo_step2]).T
cuomo_clean.columns = ['date', 'links', 'text']
# visualize
cuomo_clean.head()
# Save the discourses
with open('data/db_discourse_governor_clean.pickle', 'wb') as to_write:
pickle.dump(cuomo_clean, to_write)
Using three different functions with different regular expressions to extract only the parts of the discourse in which the governor was speaking, as I want to remove other people's comments. Luckily that info is contained in the page from each discourse.
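As a quick illustration of what the first pattern captures, here is a made-up toy transcript (not real scraped text):
# Illustrative only: the pattern captures Cuomo's lines up to the next colon
toy = 'Governor Cuomo: Good morning. Reporter: Any update? Governor Cuomo: Numbers are down.'
re.findall('Cuomo:([^:]+)', toy)
# -> [' Good morning. Reporter', ' Numbers are down.']
# Note that the first capture runs into the next speaker label, which is why the
# additional patterns below are needed.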
# Function to extract discourse
def extract_discourse_gov1(transcript):
if len(re.findall('Cuomo:([^:]+)', str(transcript))) > 0:
return str(re.findall('Cuomo:([^:]+)', str(transcript)))
else:
return str(transcript)
# Function to extract discourse
def extract_discourse_gov2(transcript):
if len(re.findall('Cuomo:(.*?)[A-Z][a-z]+:', str(transcript))) > 0:
return str(re.findall('Cuomo:(.*?)[A-Z][a-z]+:', str(transcript)))
else:
return str(transcript)
# Function to extract discourse
def extract_discourse_gov3(transcript):
if len(re.findall('^(.*?)[A-Z][a-z]+:', str(transcript))) > 0:
return str(re.findall('^(.*?)[A-Z][a-z]+:', str(transcript)))
else:
return str(transcript)
# Clean data
cuomo_clean['monologue'] = cuomo_clean['text'].map(extract_discourse_gov1)
# Clean data
cuomo_clean['monologue2'] = cuomo_clean['text'].map(extract_discourse_gov2)
cuomo_clean.dropna(inplace = True)
# !! Mixed results with this function !!
def extract_m3(col1, col2):
if (len(col1)-len(col2))/len(col2) > 10:
return extract_discourse_gov3(col1)
else:
return ''
# Clean data
cuomo_clean['monologue3'] = cuomo_clean.apply(lambda x: extract_m3(x.text, x.monologue), axis = 1)
# Function to compare
def compare(col1, col2):
if (len(col1) - len(col2)) / len(col2) > 1:
return col1
else:
return col2
# Final text combining the first two extractions
cuomo_clean['final_text'] = cuomo_clean.apply(lambda x: compare(x.monologue, x.monologue2), axis = 1)
# Final text with the third extraction
cuomo_clean['final_text2'] = cuomo_clean.monologue3 + cuomo_clean.final_text
# Remove undesired characters
remove_sxa0 = lambda x: x.replace('\xa0', '')
# Final cleaning
cuomo_clean['final_clean'] = cuomo_clean['final_text2'].map(punc_lower).map(remove_xa0).map(remove_sxa0).map(remove_space)
# Select and rename the final columns
cuomo_final = cuomo_clean.loc[:, ['date', 'links', 'text', 'final_clean']]
cuomo_final.columns = ['date', 'link', 'text', 'monologue']
# Insert a column indicating this is a discourse from governor Cuomo
cuomo_final.insert(0, 'speaker', 'cuomo')
# View
cuomo_final.head()
# Save
with open('data/db_discourse_governor_clean_final.pickle', 'wb') as to_write:
pickle.dump(cuomo_final, to_write)
Opening the already cleaned databases to concatenate them into a single dataframe.
with open('data/db_discourse_mayor_clean.pickle','rb') as read_file:
deblasio = pickle.load(read_file)
with open('data/db_discourse_governor_clean_final.pickle','rb') as read_file:
cuomo = pickle.load(read_file)
# Concatenate the databases
db_final = pd.concat([cuomo, deblasio], axis = 0).reset_index(drop = True)
# Check the proportion of discourses from each politician
db_final['speaker'].value_counts()
# Save the final database
with open('data/db_final.pickle', 'wb') as to_write:
pickle.dump(db_final, to_write)
# Download the language model from spacy
!python -m spacy download en_core_web_sm
import en_core_web_sm
# Set the language model
#sp = spacy.load("en_core_web_sm")
sp = en_core_web_sm.load()
# Load the database
with open('data/db_final.pickle','rb') as read_file:
db_final = pickle.load(read_file)
# Lambda functions to clean the discourse text
alphanumeric = lambda x: re.sub('\w*\d\w*', '', x)
punc = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x)
remove_space = lambda x: x.replace('  ', ' ')  # collapse double spaces into one
# Apply the cleaning lambdas to the final database
db_final['for_spacy'] = (db_final['monologue']
.map(alphanumeric)
.map(punc)
.map(remove_space))
# Applying SpaCy's language model
db_final['spacy_monologue'] = db_final['for_spacy'].map(lambda x: sp(x))
# Visualize
db_final.head()
# Lemmatize
db_final['lemmatized'] = (db_final['spacy_monologue']
.map(lambda x: [' '.join(word.lemma_
if word.lemma_ != '-PRON-'
else word.text for word in x)][0]))
# Visualize
db_final.head()
# List stop words
list_stop_words = (text
.ENGLISH_STOP_WORDS
.union(['lehrer', 'brian', 'darden', 'moderator', 'alan', 'howard', 'wolf', 'blitzer',
'errol', 'louis', 'alisyn', 'chris', 'camerota', 'dan', 'mannarino','john', 'berman',
'savannah', 'guthrie', 'hoda']))
# Setting max_df = 0.5 and min_df = 2 because those resulted in the best topics
cv1 = CountVectorizer(stop_words = list_stop_words, max_df = 0.5, min_df = 2)
# Create the document-term matrix, which contains the word counts for each document
docterm_matrix = cv1.fit_transform(db_final.loc[:, 'lemmatized'])
# Labels
doc_label = ['Document' + str(t) for t in range(len(db_final.loc[:, 'lemmatized']))]
# Final matrix
pd.DataFrame(docterm_matrix.toarray(), index = doc_label, columns = cv1.get_feature_names()).iloc[:10, 630:650]
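For intuition, here is a toy illustration (not part of the analysis) of what CountVectorizer produces: one row per document, one column per vocabulary term, and the values are word counts.
# Illustrative only: a tiny document-term matrix
toy_cv = CountVectorizer()
toy_matrix = toy_cv.fit_transform(['the virus spread', 'the city reopened'])
print(toy_cv.get_feature_names())   # ['city', 'reopened', 'spread', 'the', 'virus']
print(toy_matrix.toarray())         # [[0 0 1 1 1]
                                    #  [1 1 0 1 0]]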
Using Non-Negative Matrix Factorization (NMF) to fill the matrix with the topics per document.
The code below uses NMF with 12 topics, which produced the clearest and most distinct separation.
# Create and train model
nmf_cv = NMF(12)
nmf_topics1 = nmf_cv.fit_transform(docterm_matrix)
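For context: NMF approximates the document-term matrix V (documents x terms) as the product of two non-negative matrices, W (documents x topics) and H (topics x terms), so that V ≈ W H. fit_transform returns W, while H is stored in components_ and is used below as the topic-term dataframe. A quick shape check (illustrative only):
# W: one row per document, one column per topic
print(nmf_topics1.shape)           # (number of documents, 12)
# H: one row per topic, one column per vocabulary term
print(nmf_cv.components_.shape)    # (12, vocabulary size)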
# Topic-term dataframe
topicword_cv1 = pd.DataFrame(nmf_cv.components_.round(3),
index = ['topic0', 'topic1', 'topic2', 'topic3',
'topic4', 'topic5', 'topic6', 'topic7',
'topic8', 'topic9', 'topic10', 'topic11'],
columns = cv1.get_feature_names())
This creates a dataframe with the 12 topics as rows and the terms as columns. The values describe how strongly each term relates to each topic; higher values mean a stronger relationship.
# Matrix
topicword_cv1.iloc[:, 1790:1820]
# Function to find the most important words per topic
def top_words_per_topic(model, terms, topic_names = None):
    for ix, topic in tqdm(enumerate(model.components_)):
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic '", topic_names[ix], "'")
        print(", ".join([terms[i] for i in topic.argsort()[:-10 - 1:-1]]))
# Select the speaker (politician) column into a new dataframe
db_final_docs_topics = db_final.loc[:, ['speaker']]
# Visualize
db_final_docs_topics
# Find the dominant topic for each document
db_final_docs_topics['topics12'] = nmf_topics1.argmax(axis = 1)
# Topics per politician
speaker_topics = pd.DataFrame(db_final_docs_topics.groupby(['topics12']).speaker.value_counts())
speaker_topics.columns = ['count']
speaker_topics.reset_index()
speaker_topics_pivot = speaker_topics.reset_index().pivot_table(index = 'topics12', columns = 'speaker', values = 'count', fill_value = 0).reset_index(drop = True)
speaker_topics_pivot['total'] = speaker_topics_pivot['cuomo'] + speaker_topics_pivot['de blasio']
speaker_topics_pivot.sort_values('total', inplace = True)
# Plot
plt.figure(figsize = (14, 8))
names = list(set(speaker_topics_pivot.index))
plt.title('Most Discussed Topics in Speeches')
plt.xlabel('Total Documents per Topic')
plt.xlim(0, 80)
plt.barh(names, speaker_topics_pivot['total'], color = 'magenta', edgecolor = 'white')
plt.yticks(range(0, 12), ['Neighborhood Impact', 'Covid Mechanics', 'Healthcare', 'Education/School',
'Neighborhood Resilience', 'Homelessness', 'DOH Communication', 'New Yorkers',
'Hate Crime', 'Hospital Needs', 'Hospital Status', 'Reopening Metrics'])
plt.tight_layout()
The topic which appears most frequently refers to reopening metrics. It includes terms such as rate, virus, reopening, infection, business and hospitalization. This makes sense, since New York City is in the process of reopening its economy at the time I'm working on this project.
# Plot
plt.figure(figsize = (14, 8))
names = list(set(speaker_topics_pivot.index))
plt.title('Politicians Covering Different Topics')
plt.xlabel('Total Documents per Topic')
plt.xlim(0, 80)
plt.barh(names, speaker_topics_pivot['cuomo'], color = '#77a9cf', edgecolor = 'white', label = 'Cuomo')
plt.barh(names, speaker_topics_pivot['de blasio'], left = speaker_topics_pivot['cuomo'], color = '#df8a62', edgecolor = 'white', label = 'de Blasio')
plt.yticks(range(0, 12), ['Neighborhood Impact', 'Covid Mechanics', 'Healthcare', 'Education/School',
'Neighborhood Resilience', 'Homelessness', 'DOH Communication', 'New Yorkers',
'Hate Crime', 'Hospital Needs', 'Hospital Status', 'Reopening Metrics'])
plt.legend()
plt.tight_layout()
Reopening is purely a Cuomo topic; Cuomo also began many of his discourses by supplying statistics relevant to the reopening. Mayor de Blasio dominates the topic of hospital needs, while Cuomo discusses hospital status. They describe different things: Cuomo's discourses tend to involve updates about ventilators, beds and available equipment, while de Blasio talks about what New York's hospitals are running out of: personnel, supplies and ventilators. Both cover hospitals, but in slightly different ways.
Mayor de Blasio focuses on universally available public services, while Cuomo focuses on appropriate and intensive treatment. The analysis of these parts of the discourse highlights de Blasio's progressive roots, with his focus on working-class New Yorkers and the need for universal public healthcare.
Using the SentimentIntensityAnalyzer and feeding it the discourses to retrieve sentiment scores.
The VADER package returns a negative, neutral, positive and compound score, which can be plotted over time to understand how the sentiment of the discourses varies.
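For reference, here is a toy call (made-up sentence, not from the data) showing the shape of VADER's output:
# Illustrative only: VADER returns a dict with 'neg', 'neu', 'pos' and 'compound' scores
SentimentIntensityAnalyzer().polarity_scores('New York is doing a great job.')
# -> a dict like {'neg': ..., 'neu': ..., 'pos': ..., 'compound': ...}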
# Load dataset
with open('data/db_final.pickle','rb') as read_file:
db_final = pickle.load(read_file)
# Create the sentiment analyser
analyzer = SentimentIntensityAnalyzer()
# Scores list
scores = []
# Calculate sentiment scores
for i in tqdm(db_final['monologue']):
scores.append(analyzer.polarity_scores(i))
# Concatenate results
db_final_docs_sentiment = pd.concat([db_final.loc[:, ['speaker', 'date']], pd.DataFrame(scores)], axis = 1)
Calculating the mean sentiment score for each politician, looking at the compound and positive scores, using a rolling window (window = 7) over the time series.
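As a reminder of how a rolling mean behaves, here is a tiny example on dummy numbers (not the real scores): with the default settings, the first window - 1 values are NaN.
# Illustrative only: rolling mean with window = 7 on a toy series
pd.Series([1, 2, 3, 4, 5, 6, 7, 8]).rolling(window = 7).mean()
# -> NaN for the first six entries, then 4.0 and 5.0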
# Split the data by politician
cuomo_roll = db_final_docs_sentiment[db_final_docs_sentiment.speaker == 'cuomo'].sort_values(by = 'date')
de_blasio_roll = db_final_docs_sentiment[db_final_docs_sentiment.speaker == 'de blasio'].sort_values(by = 'date')
# Moving averages for Cuomo
cuomo_roll['cp_roll_avg'] = cuomo_roll.compound.rolling(window = 7).mean()
cuomo_roll['pos_roll_avg'] = cuomo_roll.pos.rolling(window = 7).mean()
# Moving averages for de Blasio
de_blasio_roll['cp_roll_avg'] = de_blasio_roll.compound.rolling(window = 7).mean()
de_blasio_roll['pos_roll_avg'] = de_blasio_roll.pos.rolling(window = 7).mean()
# Combine the moving averages in a single dataframe
combined_roll = pd.concat([cuomo_roll, de_blasio_roll]).sort_values('date').reset_index(drop = True)
# Visualize
combined_roll.head(10)
# Plot the Compound Score
fig = plt.figure(figsize = (15, 8))
ax = plt.axes()
sns.lineplot(x = 'date', y = 'cp_roll_avg', hue = 'speaker', data = combined_roll)
plt.title('Compound Score 7-Day Rolling Average')
plt.xlabel('Discourse Date')
plt.ylabel('Compound Score')
plt.ylim((-0.6, 1.1))
plt.axhline(y = 0, color = '#77a9cf', linestyle = ':')
plt.axvline(x = datetime.date(2020, 7, 1), color = 'black', linestyle = 'dashed')
plt.xticks(rotation = 30)
plt.tight_layout()
# Plot Positive Score
fig = plt.figure(figsize = (15, 8))
ax = plt.axes()
sns.lineplot(x = 'date', y = 'pos_roll_avg', hue = 'speaker', data = combined_roll)
plt.title('Positive Score 7-Day Rolling Average')
plt.xlabel('Discourse Date')
plt.ylabel('Positive Score')
plt.axvline(x = datetime.date(2020, 7, 1), color = 'black', linestyle = 'dashed')
plt.xticks(rotation = 30);
plt.tight_layout()