Matheus Schmitz
LinkedIn
Github Portfolio
import os
import subprocess
import sys

print('Installing spaCy 2.2.4')
# Invoke pip through the interpreter that is running this notebook so the
# package lands in the same environment; a bare `pip` on PATH may belong to a
# different Python installation. The exit code (0 on success) is the value of
# the last expression, as before.
subprocess.call([
    sys.executable, '-m', 'pip', 'install',
    '-q', '-q', '-q', '--force-reinstall', 'spacy==2.2.4',
])
Installing spaCy 2.2.4
0
import sys
import re
import spacy
import en_core_web_sm
from spacy.matcher import Matcher
import csv
import json
from tqdm import tqdm
import pandas as pd
# Load the scraped biographies; the CSV has no header row, so pandas assigns
# integer column labels.
scraped_data = pd.read_csv(
    "Matheus_Schmitz_hw02_bios.csv",
    header=None,
)
print('scraped_data.shape:', scraped_data.shape)
scraped_data.head()
scraped_data.shape: (982, 2)
0 | 1 | |
---|---|---|
0 | https://www.imdb.com/name/nm2604107/bio | Wolf-Guido Grasenick was born on January 30, 1... |
1 | https://www.imdb.com/name/nm0001873/bio | Steven Zaillian was born on January 30, 1953 i... |
2 | https://www.imdb.com/name/nm12027010/bio | Michal Kasprzak was born on January 30, 1985 a... |
3 | https://www.imdb.com/name/nm11017764/bio | Stefan Gemmel was born on January 30, 1970 in ... |
4 | https://www.imdb.com/name/nm0014294/bio | Barbara M. Ahren was born on January 30, 1950 ... |
Check a Sample Text From Which to Extract Information
This data was crawled from the biography pages on IMDb, and thus has a lot of relevant information, but all of it is buried in the middle of free-form text.
# Show the biography text (second column) of one randomly drawn row.
scraped_data.sample(1).iloc[0, 1]
'Devon Greenwood recently made is film debut in Cry Myself Awake, a short film written and directed by Ryan Allen. Prior to film, he appeared in multiple stage plays/musicals. Devon also performed in Peter Pan at the iTheatrics Junior Theater Festival in Atlanta, GA along with his elementary school drama team.'
# Load the small English spaCy pipeline once; it is reused by every matcher call below.
nlp = en_core_web_sm.load()
Birthplace
# Lexical birthplace pattern: "born", any tokens, "in", then tokens and
# punctuation. Regexes are raw strings so that `\s` is not treated as an
# (invalid) string escape sequence.
# NOTE(review): `\s*` can match the empty string, so these REGEX tokens
# probably accept any token text — confirm this is intended.
birthplace_lexical = [
    {'LOWER': 'born'},
    {'OP': '*'},
    {'LOWER': 'in'},
    {'TEXT': {'REGEX': r'\s*'}, 'OP': '*'},
    {'IS_PUNCT': True, 'OP': '*'},
    {'TEXT': {'REGEX': r'\s*'}, 'OP': '+'},
    {'IS_PUNCT': True, 'OP': '*'},
]

# Syntactic birthplace pattern: the verb "born", any tokens, the adposition
# "in", then one or more GPE entity tokens optionally separated and followed
# by punctuation.
birthplace_syntactic = [
    {'POS': 'VERB', 'ORTH': 'born'},
    {'OP': '*'},
    {'LOWER': 'in', 'POS': 'ADP'},
    {'ENT_TYPE': 'GPE', 'OP': '+'},
    {'IS_PUNCT': True, 'OP': '*'},
    {'ENT_TYPE': 'GPE', 'OP': '*'},
    {'IS_PUNCT': True, 'OP': '*'},
]
Education
# Lexical education pattern: one trigger word followed by one or more tokens
# of any kind.
education_lexical = [
    {
        "TEXT": {
            "REGEX": "^(attend|attended|went|studied|University|College|trained|graduated)$"
        }
    },
    {"OP": "+"},
]

# Syntactic education pattern: the same trigger word, but the following tokens
# must all carry the ORG entity type.
education_syntactic = [
    {
        "TEXT": {
            "REGEX": "^(attend|attended|went|studied|University|College|trained|graduated)$"
        }
    },
    {"ENT_TYPE": "ORG", "OP": "+"},
]
Parents
# Lexical parents pattern: a trigger word (born/son/daughter/parents), any
# tokens, a linking word (to/of/are), then two token groups optionally joined
# by "and". Regexes are raw strings so that `\s` is not treated as an
# (invalid) string escape sequence.
# NOTE(review): `\s*` can match the empty string, so these REGEX tokens
# probably accept any token text — confirm this is intended.
parents_lexical = [
    {'LOWER': {'REGEX': '^(born|son|daughter|parents)$'}},
    {'OP': '*'},
    {'LOWER': {'REGEX': '^(to|of|are)$'}},
    {'TEXT': {'REGEX': r'\s*'}, 'OP': '*'},
    {'IS_PUNCT': True, 'OP': '*'},
    {'TEXT': {'REGEX': r'\s*'}, 'OP': '+'},
    {'IS_PUNCT': True, 'OP': '*'},
    {'LOWER': 'and', 'OP': '?'},
    {'TEXT': {'REGEX': r'\s*'}, 'OP': '*'},
    {'IS_PUNCT': True, 'OP': '*'},
    {'TEXT': {'REGEX': r'\s*'}, 'OP': '+'},
    {'IS_PUNCT': True, 'OP': '*'},
]

# Syntactic parents pattern: same shape as the lexical one, but the two
# mandatory name groups must be tagged as PERSON entities.
parents_syntactic = [
    {'LOWER': {'REGEX': '^(born|son|daughter|parents)$'}},
    {'OP': '*'},
    {'LOWER': {'REGEX': '^(to|of|are)$'}},
    {'TEXT': {'REGEX': r'\s*'}, 'OP': '*'},
    {'IS_PUNCT': True, 'OP': '*'},
    {'ENT_TYPE': 'PERSON', 'TEXT': {'REGEX': r'\s*'}, 'OP': '+'},
    {'IS_PUNCT': True, 'OP': '*'},
    {'LOWER': 'and', 'OP': '?'},
    {'TEXT': {'REGEX': r'\s*'}, 'OP': '*'},
    {'IS_PUNCT': True, 'OP': '*'},
    {'ENT_TYPE': 'PERSON', 'TEXT': {'REGEX': r'\s*'}, 'OP': '+'},
    {'IS_PUNCT': True, 'OP': '*'},
]
Awards
# Lexical awards pattern: an award trigger word, a linking word, then one or
# more tokens of any kind.
awards_lexical = [
    {
        "LOWER": {
            "REGEX": "^(winner|recipient|won|awarded|award-winning|nominated|nominee|nomination)$"
        }
    },
    {"LOWER": {"REGEX": "^(of|the|on|for|as)$"}},
    {"OP": "+"},
]

# Syntactic awards pattern: the trigger word surrounded by optional noun and
# adposition tokens, with unconstrained tokens allowed on both sides.
# NOTE(review): IS_TITLE is given the string '+' rather than a boolean;
# confirm that spaCy's Matcher treats this as intended.
awards_syntactic = [
    {"OP": "*"},
    {"POS": "NOUN", "IS_TITLE": "+", "OP": "*"},
    {
        "LOWER": {
            "REGEX": "^(winner|recipient|won|awarded|award-winning|nominated|nominee|nomination)$"
        }
    },
    {"POS": "ADP", "TEXT": {"REGEX": "^(of|the|for)$"}, "OP": "*"},
    {"POS": "NOUN", "IS_TITLE": "+", "OP": "*"},
    {"OP": "*"},
]
Performances
# Lexical performances pattern: a performance trigger word, a linking word,
# then one or more tokens of any kind.
performances_lexical = [
    {
        "LOWER": {
            "REGEX": "^(star|starred|known|appeared|appearing|appearances|debute|album|recorded|play|played|role)$"
        }
    },
    {"LOWER": {"REGEX": "^(in|for|on|for|as)$"}},
    {"OP": "+"},
]

# Syntactic performances pattern: the same trigger word, followed by one or
# more adposition tokens from a fixed list, then one or more tokens.
performances_syntactic = [
    {
        "LOWER": {
            "REGEX": "^(star|starred|known|appeared|appearing|appearances|debute|album|recorded|play|played|role)$"
        }
    },
    {"POS": "ADP", "TEXT": {"REGEX": "^(in|of|the|for|as)$"}, "OP": "+"},
    {"OP": "+"},
]
Colleagues
# Lexical colleagues pattern: optional leading tokens, a collaboration trigger
# word, a linking word (with/alongside/to), then any remaining tokens.
# NOTE(review): IS_TITLE is given the string '+' rather than a boolean;
# confirm that spaCy's Matcher treats this as intended.
colleagues_lexical = [
    {"IS_TITLE": "+", "OP": "*"},
    {
        "LOWER": {
            "REGEX": "^(starring|collaborating|working|worked|partnering|sharing.*screen.*space|appears.*opposite|opposite)$"
        }
    },
    {"LOWER": {"REGEX": "^(with|alongside|to)$"}},
    {"OP": "*"},
    {"OP": "*"},
]

# Syntactic colleagues pattern: as above (trigger list also includes
# "joined"), but a PERSON entity token is required after the linking word.
colleagues_syntactic = [
    {"OP": "*"},
    {
        "LOWER": {
            "REGEX": "^(starring|collaborating|working|worked|partnering|sharing.*screen.*space|appears.*opposite|opposite|joined)$"
        }
    },
    {"LOWER": {"REGEX": "^(with|alongside|to)$"}},
    {"OP": "*"},
    {"ENT_TYPE": "PERSON"},
    {"OP": "*"},
]
def spacy_matcher(doc, pattern):
    """Return the text of the last pattern match found in each sentence of *doc*.

    Each sentence is re-parsed on its own with the module-level ``nlp``
    pipeline and matched against *pattern*. Only the last entry of the match
    list is kept per sentence; sentences without a match contribute nothing.

    Args:
        doc: A parsed spaCy ``Doc`` that exposes sentence boundaries.
        pattern: A single token pattern (list of token-attribute dicts).

    Returns:
        A list of matched text spans, at most one per sentence.
    """
    output = []
    # Instantiate the spaCy Matcher. The call below passes None as the second
    # argument (on-match callback slot), matching the spaCy 2.x release that
    # this notebook installs.
    matcher = Matcher(nlp.vocab)
    matcher.add("matching", None, pattern)
    # Iterate through sentences
    for sent in doc.sents:
        # Match the re-parsed sentence against the pattern
        sent_doc = nlp(sent.text)
        matches = matcher(sent_doc)
        if matches:
            _, start, end = matches[-1]
            # Slice the Doc that produced the offsets. The original sliced the
            # sentence span of the outer document, which only lines up if both
            # parses tokenize the text identically.
            output.append(sent_doc[start:end].text)
    return output
# Whole words that are not place names and are dropped from either end of an
# extracted birthplace.
_BIRTHPLACE_NOISE = frozenset({
    "January", "February", "March", "April", "May", "June", "July",
    "August", "September", "October", "November", "December", "USA",
})


def birthplace_filter(list_with_text):
    """Reduce matched birthplace sentences to a single place string.

    The matches are joined, anything after a standalone word "as" is
    discarded, runs of capitalized words are kept, and month names / "USA" are
    removed from the start and end of the result.

    Args:
        list_with_text: List of matched text spans (possibly empty).

    Returns:
        The extracted place string, or "" when nothing remains.
    """
    if not list_with_text:
        return ""
    joined = " ".join(list_with_text)
    # Split only on the standalone word "as"; a plain substring split would
    # also cut names such as "Las Vegas" or "Texas".
    before_as = re.split(r"\bas\b", joined)[0]
    # Keep only runs of capitalized words and their punctuation
    tokens = " ".join(re.findall(r"\b(?:[A-Z][^\s]*\s?)+", before_as)).split()

    def is_noise(token):
        # Ignore surrounding punctuation when comparing ("USA." -> "USA")
        return token.strip(".,;:!?()[]\"'") in _BIRTHPLACE_NOISE

    # Drop whole noise words from both ends. str.strip() cannot do this: it
    # treats its argument as a set of characters, which is how
    # "San Bernardino" used to lose its leading "San".
    while tokens and is_noise(tokens[0]):
        tokens.pop(0)
    while tokens and is_noise(tokens[-1]):
        tokens.pop()
    return " ".join(tokens)
def education_filter(list_with_text):
    """Extract capitalized name candidates from matched education sentences.

    The matches are joined, split on commas and " and ", and each piece is
    reduced to its runs of capitalized words. Pieces that consist solely of a
    pronoun ("She"/"He") or a month name are discarded.

    Args:
        list_with_text: List of matched text spans (possibly empty).

    Returns:
        A list of candidate strings (they may keep a trailing whitespace
        character captured by the regex); the input itself when it is empty.
    """
    if not list_with_text:
        return list_with_text
    nuisances = {
        "She", "He", "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December",
    }
    # Convert to a single string
    joined = ",".join(list_with_text)
    candidates = []
    # Split the text on punctuation and other delimiters
    for chunk in re.split(",| and ", joined):
        # Keep only title-case words and their punctuation
        candidate = ' '.join(re.findall(r"\b(?:[A-Z][^\s]*\s?)+", chunk))
        # Compare the stripped form: the regex above can capture a trailing
        # space ("May "), which an anchored ^May$ check never matched.
        cleaned = candidate.strip()
        if cleaned and cleaned not in nuisances:
            candidates.append(candidate)
    return candidates
def parents_filter(list_with_text):
    """Extract capitalized name candidates from matched parents sentences.

    The matches are joined, split on commas and " and ", and each piece is
    reduced to its runs of capitalized words. Pieces that consist solely of a
    pronoun ("She"/"He") or a month name are discarded.

    Args:
        list_with_text: List of matched text spans (possibly empty).

    Returns:
        A list of candidate strings (they may keep a trailing whitespace
        character captured by the regex); the input itself when it is empty.
    """
    if not list_with_text:
        return list_with_text
    nuisances = {
        "She", "He", "January", "February", "March", "April", "May", "June",
        "July", "August", "September", "October", "November", "December",
    }
    # Convert to a single string
    joined = ",".join(list_with_text)
    candidates = []
    # Split the text on punctuation and other delimiters
    for chunk in re.split(",| and ", joined):
        # Keep only title-case words and their punctuation
        candidate = ' '.join(re.findall(r"\b(?:[A-Z][^\s]*\s?)+", chunk))
        # Compare the stripped form: the regex above can capture a trailing
        # space ("June "), which an anchored ^June$ check never matched.
        cleaned = candidate.strip()
        if cleaned and cleaned not in nuisances:
            candidates.append(candidate)
    return candidates
def awards_filter(list_with_text):
    """Extract capitalized name candidates from matched award sentences.

    The matches are joined, split on commas, " and " and " for ", and each
    piece is reduced to its runs of capitalized words. Pieces with no
    capitalized words are dropped.

    Args:
        list_with_text: List of matched text spans (possibly empty).

    Returns:
        A list of candidate strings (they may keep a trailing whitespace
        character captured by the regex); the input itself when it is empty.
    """
    if not list_with_text:
        return list_with_text
    # Convert to a single string
    joined = ",".join(list_with_text)
    # Split on punctuation and other delimiters, then keep only title-case
    # words and their punctuation. Iterating the pieces directly avoids
    # rebuilding the whole list of pieces once per piece.
    titlecases = [
        ' '.join(re.findall(r"\b(?:[A-Z][^\s]*\s?)+", chunk))
        for chunk in re.split(",| and | for ", joined)
    ]
    # Remove any empty strings generated
    return [candidate for candidate in titlecases if candidate.strip()]
def performances_filter(list_with_text):
    """Extract capitalized title candidates from matched performance sentences.

    The matches are joined, every "known for" is removed, the text is split on
    commas and " and ", and each piece is reduced to its runs of capitalized
    words. Pieces with no capitalized words are dropped.

    Args:
        list_with_text: List of matched text spans (possibly empty).

    Returns:
        A list of candidate strings (they may keep a trailing whitespace
        character captured by the regex); the input itself when it is empty.
    """
    if not list_with_text:
        return list_with_text
    # Convert to a single string
    joined = ",".join(list_with_text)
    # Remove "known for"
    without_known_for = re.sub("known for", "", joined)
    # Split on punctuation and other delimiters, then keep only title-case
    # words and their punctuation. Iterating the pieces directly avoids
    # rebuilding the whole list of pieces once per piece.
    titlecases = [
        ' '.join(re.findall(r"\b(?:[A-Z][^\s]*\s?)+", chunk))
        for chunk in re.split(",| and ", without_known_for)
    ]
    # Remove any empty strings generated
    return [candidate for candidate in titlecases if candidate.strip()]
def colleagues_filter(list_with_text):
    """Extract capitalized name candidates from matched colleague sentences.

    Every matched span is stripped of surrounding quotation marks, split on
    commas, " and " and " in", and each piece is reduced to its runs of
    capitalized words. Pieces with no capitalized words are dropped.

    Args:
        list_with_text: List of matched text spans (possibly empty).

    Returns:
        A list of candidate strings (they may keep a trailing whitespace
        character captured by the regex); the input itself when it is empty.
    """
    if not list_with_text:
        return list_with_text
    colleagues = []
    # Process every matched span; previously only the first one was used and
    # matches from later sentences were silently discarded.
    for matched_text in list_with_text:
        # Remove single and double quotation marks around the span
        unquoted = matched_text.strip("'\"")
        # Split the text on punctuation and other delimiters
        for chunk in re.split(",| and | in", unquoted):
            # Keep only title-case words and their punctuation
            candidate = ' '.join(re.findall(r"\b(?:[A-Z][^\s]*\s?)+", chunk))
            # Skip pieces that produced no capitalized words
            if candidate.strip():
                colleagues.append(candidate)
    return colleagues
input_csv = "Matheus_Schmitz_hw02_bios.csv"
output_jl = "spacy_extractions.jl"
extractor_number = 1  # 0 for lexical, 1 for syntactic

# Select the pattern set for the requested extractor. Keys are the output
# field names, in the order they are written to each JSON line.
if extractor_number == 0:
    print('Applying lexical extractor.')
    patterns = {
        "birthplace": birthplace_lexical,
        "education": education_lexical,
        "parents": parents_lexical,
        "awards": awards_lexical,
        "performances": performances_lexical,
        "colleagues": colleagues_lexical,
    }
elif extractor_number == 1:
    print('Applying syntactic extractor.')
    patterns = {
        "birthplace": birthplace_syntactic,
        "education": education_syntactic,
        "parents": parents_syntactic,
        "awards": awards_syntactic,
        "performances": performances_syntactic,
        "colleagues": colleagues_syntactic,
    }
else:
    raise ValueError('Input either 0 for lexical patterns or 1 for syntactic patterns.')

# Post-processing filter applied to the raw matches of each field
filters = {
    "birthplace": birthplace_filter,
    "education": education_filter,
    "parents": parents_filter,
    "awards": awards_filter,
    "performances": performances_filter,
    "colleagues": colleagues_filter,
}

# Count the rows up front so tqdm can show a total; the handle is closed as
# soon as the count is known.
with open(input_csv, encoding='utf-8') as bios_file:
    total_rows = sum(1 for _ in csv.reader(bios_file))

# Both files are managed by the with-statement, so no explicit close() is needed.
with open(input_csv, encoding='utf-8') as bios_file, open(output_jl, 'w') as extractions_file:
    # Iterate through the csv with bios
    for url, bio in tqdm(csv.reader(bios_file), total=total_rows):
        # Parse the biography once and reuse the Doc for every pattern
        # (it used to be re-parsed for each of the six fields).
        doc = nlp(bio)
        # Create dict to store the outputs of each row
        output = {"url": url}
        for field, pattern in patterns.items():
            # Match with spaCy's Matcher, then keep only the relevant nouns
            output[field] = filters[field](spacy_matcher(doc, pattern))
        # Write a csv row as a line in jsonlines
        json.dump(output, extractions_file)
        extractions_file.write("\n")
0%| | 0/982 [00:00<?, ?it/s]
Applying syntactic extractor.
100%|████████████████████████████████████████████████████████████████████████████████| 982/982 [26:03<00:00, 1.59s/it]
# Read the JSON-lines extractions back into a DataFrame for inspection.
extractions_df = pd.read_json(
    output_jl,
    lines=True,
)
print('extractions_df.shape:', extractions_df.shape)
extractions_df.head()
extractions_df.shape: (982, 7)
url | birthplace | education | parents | awards | performances | colleagues | |
---|---|---|---|---|---|---|---|
0 | https://www.imdb.com/name/nm2604107/bio | [] | [] | [] | [Schiri Abseits , Der Fahnder , Streit Drei ] | [] | |
1 | https://www.imdb.com/name/nm0001873/bio | Fresno, California, | [] | [] | [] | [Searching Bobby Fischer , Schindler's List ,... | [] |
2 | https://www.imdb.com/name/nm12027010/bio | [] | [] | [] | [M , Dziewczyny Dubaju , Barwy ] | [] | |
3 | https://www.imdb.com/name/nm11017764/bio | Morbach, | [] | [] | [] | [] | [] |
4 | https://www.imdb.com/name/nm0014294/bio | Salzburg, Austria. | [] | [] | [] | [Anatomy , Rote Rosen , Brandnacht ] | [] |
View all extractions for one sample
# Show every extracted field for one hard-coded row (position 180) as a plain dict.
extractions_df.iloc[180].to_dict()
{'url': 'https://www.imdb.com/name/nm0000432/bio', 'birthplace': ' Bernardino, California,', 'education': ['University Illinois'], 'parents': ['San Bernardino', 'California', 'Anna Lyda Elizabeth Gray)', 'Eugene Ezra Hackman'], 'awards': ['Oscar', 'Detective Jimmy Popeye" Doyle The French Connection ', 'Oscar', 'Golden Globe'], 'performances': ['Norman Lilith ', 'Warren Beatty.', "Little Bill Daggett Clint Eastwood's Unforgiven ", 'Hackman ', 'Geronimo:'], 'colleagues': []}
Matheus Schmitz
LinkedIn
Github Portfolio