In 2016, California voters approved Proposition 64, legalizing recreational marijuana use. More information about the proposition can be found at:
California Proposition 64, Marijuana Legalization (2016)
Those opposing the proposition presented five main objections:
The goal of this project is to analyze those claims with time series and geospatial analytical tools, to assess whether the predictions have come true since legalization. The focus will be on marijuana-related arrests and on the general crime rate near dispensaries. Claims 2, 4, and 5 will be assessed.
Data Sources:
All datasets have been downloaded separately and will be loaded from my GitHub repository to ensure forward compatibility.
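For reference, a minimal sketch of reading one of the processed CSVs straight from a raw GitHub URL (the repository path below is a placeholder, not the actual one):
#import pandas as pd
## Placeholder raw GitHub URL; replace with the real repository path
#df_stores = pd.read_csv('https://raw.githubusercontent.com/<user>/<repo>/master/files/df_stores.csv')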
#!pip install -q yelp
#!pip install -q yelpapi
# Suppressing the warning messages generated by newer versions of Pandas and Matplotlib
import sys
import warnings
import matplotlib.cbook
if not sys.warnoptions:
warnings.simplefilter('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=matplotlib.cbook.mplDeprecation)
# Data manipulation imports
import re
import json
import time
import numpy as np
import pandas as pd
from yelp.client import Client
from yelpapi import YelpAPI
# File manipulation imports for Google Colab
#from google.colab import drive
#drive.mount('/content/drive')
#import os
#os.chdir("/content/drive/My Drive/Colab Notebooks/Marijuana_Crime")
!ls
Obtaining data from Yelp about Los Angeles dispensaries
# Yelp API key
# Link to get my API key: https://www.yelp.com/developers/v3/manage_app?app_created=True
my_api = ''
# Connect to Yelp
client_access = Client(my_api)
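Optionally, a small sketch (an assumption about key handling, not part of the original setup) that keeps the key out of the notebook by reading it from an environment variable:
#import os
## Assumes a YELP_API_KEY environment variable was set beforehand
#my_api = os.environ.get('YELP_API_KEY', '')
#client_access = Client(my_api)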
# Function to format the file with the API queries
def format_file(file_path,
logfile = './files/log_file.txt',
file_description = None):
# Applying regular expressions to clean the file name
# Adjusting the file extension
try:
ext = re.search(r'(?<!^)(?<!\.)\.(?!\.)', file_path).start()
except:
raise NameError('File could not be found in this path.')
# Adjusting the timestamp for the file name
try:
stamp = re.search(r'(?<!^)(?<!\.)[a-z]+_[a-z]+(?=\.)', file_path).start()
except:
raise NameError('File could not be found in this path.')
# Format the file name adding the timestamp
formatted_name = f'{file_path[:stamp]}{round(time.time())}_{file_path[stamp:]}'
# In case there is no file description, generate one
if not file_description:
file_description = f'File generated in: {time.asctime(time.gmtime(round(time.time())))}'
# Opening the log file and storing the formatted data file and its description
with open(logfile, 'a+') as f:
f.write(f'{formatted_name}: {file_description}\n')
# Returning the formatted data file and its description
return formatted_name, file_description
# Connecting to the Yelp API and retrieving 1000 samples
# That is, 1000 records of stores classified as 'dispensaries'
def yelp_search(category,
location,
offset_number = 0,
n_samples = 1000):
# API
yelp_api = YelpAPI(my_api)
# Timestamp of the last result
last_result = round(time.time())
# List to store results
results = []
# Size
size = 50
# Initialize the loop count
loops = 0
# Initialize the runs count
run = 1
# Initialize offset
offset_count = offset_number
# Loop to retrieve data:
while loops < n_samples:
print(f'Initializing query {run}')
# Query
posts = yelp_api.search_query(categories = category,
location = location,
offset = offset_count,
limit = size)
# Business related posts
results.extend(posts['businesses'])
# Increment the loop count
loops += size
# Increment offset
offset_count += size
# Wait 3 seconds to run next query
time.sleep(3)
# Increment runs
run += 1
# Once the loop is finished, obtain the formatted file name and description
formatted_name, file_description = format_file(file_path = f'./files/file_{category}.json')
# Opening the formatted file and dumping the query results in json format
with open(formatted_name, 'w+') as f:
json.dump(results, f)
print(f'\nQuery finished. Number of stores found: {len(results)} {category}.')
global timestamp
timestamp = round(time.time())
return print(f'\nThe last timestamp was: {timestamp}.')
# Run query with business category and city
yelp_search('cannabisdispensaries', 'los angeles', n_samples = 1000)
# Open the JSON file, read it, and generate the final list
with open(f'files/{timestamp}_file_cannabisdispensaries.json', 'r') as f:
la_dispensaries = json.load(f)
# Checking
la_dispensaries[5]
# Function to create a dataframe with the dispensaries data
def organize_data(stores_list, df_name = 'df_stores'):
# Convert the list to a dataframe
df_name = pd.DataFrame(stores_list)
# List with the desired columns
col_list = ['name',
'is_closed',
'url',
'rating',
'price',
'review_count']
# Filter the dataframe to have only the desired columns
df_name = df_name[col_list]
return df_name
# Apply the function to create the dataframe
df_stores = organize_data(la_dispensaries)
df_stores.shape
df_stores.head()
# Extracting latitudes
latitude_list = [la_dispensaries[i]['coordinates']['latitude'] for i in range(len(la_dispensaries))]
# Extracting longitudes
longitude_list = [la_dispensaries[i]['coordinates']['longitude'] for i in range(len(la_dispensaries))]
# Add geolocation to dataframe
df_stores['latitude'] = latitude_list
df_stores['longitude'] = longitude_list
df_stores.head()
# Create a location column with a tuple containing latitude and longitude
df_stores['location'] = list(zip(df_stores['latitude'], df_stores['longitude']))
df_stores.head()
df_stores.shape
# Saving the dataset
df_stores.to_csv('files/df_stores.csv')
Obtaining data on Los Angeles crimes between 2010 and 2019. Dataset downloaded from the LA Open Data Portal, following the same steps described for the arrests dataset below.
Data dictionary: https://data.lacity.org/A-Safe-City/Crime-Data-from-2010-to-2019/63jg-8b9z
I've uploaded the CSV to the ./files folder in this project's directory.
# Loading the CSV with crime data
df_crimes = pd.read_csv('files/Crime_Data_from_2010_to_2019.csv')
df_crimes.shape
df_crimes.head(3)
df_crimes.isnull().sum()
# Replacing the spaces in the column names with underscores
# to simplify filtering and indexing
df_crimes.columns = [column.lower().replace(' ', '_') for column in df_crimes.columns]
# Dropping unnecessary columns
df_crimes.drop(labels = ['crm_cd_1',
'crm_cd_2',
'crm_cd_3',
'crm_cd_4',
'premis_cd',
'premis_desc',
'vict_descent',
'vict_sex',
'status',
'dr_no',
'area_',
'date_rptd',
'rpt_dist_no',
'crm_cd',
'part_1-2',
'mocodes',
'cross_street',
'weapon_used_cd',
'status_desc',
'time_occ',
'vict_age'],
axis = 1,
inplace = True)
# Keeping the weapon_desc column and filling NAs with 'unknown'
df_crimes.weapon_desc.fillna('unknown', inplace = True)
df_crimes.isnull().sum()
df_crimes.head(3)
# Shape
df_crimes.shape
# Saving the dataframe to a CSV file
df_crimes.to_csv('files/df_crimes.csv')
Obtaining data on Los Angeles arrests from 2010 onwards. Dataset downloaded from the LA Open Data Portal. Step-by-step to download it:
1) Access: https://data.lacity.org/
2) Search: "Arrests"
3) Choose: "Arrest Data from 2010 to Present"
4) Select: "View Data" -> "Export" -> "Download" -> "CSV"
Data dictionary: https://data.lacity.org/A-Safe-City/Arrest-Data-from-2010-to-Present/yru6-6re4
I've uploaded the CSV to the ./files folder in this project's directory.
# Loading the Arrests CSV
df_arrests = pd.read_csv('files/Arrest_Data_from_2010_to_Present.csv')
df_arrests.shape
df_arrests.head(3)
df_arrests.isnull().sum()
df_arrests.dtypes
# Converting the 'Arrest Date' column from String to Datetime, to facilitate manipulation
df_arrests['Arrest Date'] = pd.to_datetime(df_arrests['Arrest Date'])
# Using regular expressions to clean the 'Location' column and convert it to a list
df_arrests['Location'] = df_arrests['Location'].map(lambda x: re.sub('[(),°]', '', x)).str.split()
df_arrests.head(3)
# Extracting latitude and longitude
df_arrests['latitude'] = df_arrests['Location'].map(lambda x: x[0])
df_arrests['longitude'] = df_arrests['Location'].map(lambda x: x[1])
# Converting the geolocation to float
df_arrests['latitude'] = df_arrests['latitude'].map(lambda x: float(x))
df_arrests['longitude'] = df_arrests['longitude'].map(lambda x: float(x))
# Converting the 'Charge Description' column to string and lower case
df_arrests['Charge Description'] = df_arrests['Charge Description'].map(lambda x: str(x))
df_arrests['Charge Description'] = df_arrests['Charge Description'].map(lambda x: x.lower())
df_arrests.shape
df_arrests.head(3)
# Listing all arrest classifications
arrests_list = list(df_arrests['Charge Description'].value_counts().index.sort_values())
# Setting to lower case to standardize
arrests_list = [x.lower() for x in arrests_list]
# Checking all arrest classifications
arrests_list
# Filtering all marijuana arrests
# Some descriptions were abbreviated to 'marij'
marijuana_arrests = [x for x in arrests_list if 'marij' in x]
len(marijuana_arrests)
marijuana_arrests
# Creating a new df column to identify arrests which were marijuana related
df_arrests['marijuana_related'] = df_arrests['Charge Description'].map(lambda x: x if x in marijuana_arrests else np.NaN)
df_arrests.head(3)
# Counting non-NA values to see how many marijuana arrests there were
len(df_arrests[~df_arrests['marijuana_related'].isnull()])
# Keeping only the marijuana related data
df_arrests = df_arrests[~df_arrests['marijuana_related'].isnull()]
df_arrests.shape
df_arrests.head(3)
# Saving dataframe as csv
df_arrests.to_csv('files/df_arrests.csv')
Obtaining geolocation data on Los Angeles schools.
Data sourced from: http://www.lausd.k12.ca.us/lausd/offices/bulletins/
File used: http://www.lausd.k12.ca.us/lausd/offices/bulletins/lausdk12.tab
# Loading file
df_schools = pd.read_csv('files/lausdk12.tab', sep = '\t')
df_schools.shape
df_schools.head(3)
# Merging Address + City + State + ZIP to create a new 'complete_address' column
df_schools['complete_address'] = df_schools['Address'] + ' ' + df_schools['City'] + ' ' + df_schools['State'] + ' ' + df_schools['Zip Code'].astype(str)
df_schools['complete_address'] = df_schools['complete_address'].astype(str)
# Deleting unnecessary columns
df_schools = df_schools.drop(['Address',
'City',
'State',
'Cost Center Code',
'Legacy Code',
'Telephone',
'Fax',
'Calendar',
'File Build Date'],
axis = 1)
# Checking for duplicates
# This can happen if there are two schools registered at the same address, such as a kindergarten and a middle school
df_schools = df_schools[~df_schools.duplicated(subset = 'complete_address')].sort_values('complete_address')
# Resetting index
df_schools.reset_index(drop = True, inplace = True)
df_schools.shape
df_schools.head(3)
# Saving the schools df
df_schools.to_csv('files/df_schools.csv')
Tasks for part 2:
# Suppressing the warning messages generated by newer versions of Pandas and Matplotlib
import sys
import warnings
import matplotlib.cbook
if not sys.warnoptions:
warnings.simplefilter('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=matplotlib.cbook.mplDeprecation)
# Data manipulation and visualization
import scipy
import numpy as np
import pandas as pd
import matplotlib as m
import matplotlib.pyplot as plt
from scipy import stats
# Graphics formatting
m.rcParams['axes.labelsize'] = 14
m.rcParams['xtick.labelsize'] = 12
m.rcParams['ytick.labelsize'] = 12
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 15,7
plt.style.use('fivethirtyeight')
%matplotlib inline
df_arrests = pd.read_csv('files/df_arrests.csv')
df_arrests.shape
df_arrests.head()
df_arrests.describe()
Checking the range of arrest dates:
# Minimum value
df_arrests['Arrest Date'].min()
# Maximum value
df_arrests['Arrest Date'].max()
Considering that the law came into force in 2017, and that there are therefore three full years of post-legalization data, I'll restrict the analysis to three years before and three years after. In other words, the following years will be considered: 2014, 2015, 2016, 2017, 2018, and 2019.
# Filtering the dataframe to the 2014-2019 range
df_arrests = df_arrests[(df_arrests['Arrest Date'] >= '2014-01-01') & (df_arrests['Arrest Date'] <= '2019-12-31')]
df_arrests.shape
# Saving the updated dataset
df_arrests.to_csv('files/df_arrests_02.csv', index = False)
# Filtering the dataframe per year
df_arrests_2014 = df_arrests[(df_arrests['Arrest Date'] >= '2014-01-01') & (df_arrests['Arrest Date'] <= '2014-12-31')]
df_arrests_2015 = df_arrests[(df_arrests['Arrest Date'] >= '2015-01-01') & (df_arrests['Arrest Date'] <= '2015-12-31')]
df_arrests_2016 = df_arrests[(df_arrests['Arrest Date'] >= '2016-01-01') & (df_arrests['Arrest Date'] <= '2016-12-31')]
df_arrests_2017 = df_arrests[(df_arrests['Arrest Date'] >= '2017-01-01') & (df_arrests['Arrest Date'] <= '2017-12-31')]
df_arrests_2018 = df_arrests[(df_arrests['Arrest Date'] >= '2018-01-01') & (df_arrests['Arrest Date'] <= '2018-12-31')]
df_arrests_2019 = df_arrests[(df_arrests['Arrest Date'] >= '2019-01-01') & (df_arrests['Arrest Date'] <= '2019-12-31')]
# Function to create the arrest geolocation plots
def geo_plot_arrests(dataframe, title):
# Total
print(f'Tally of Marijuana Related Arrests: {len(dataframe)}')
# Plot area
fig, ax = plt.subplots(figsize = (20,10))
# Scatterplot
plt.scatter(dataframe['longitude'], dataframe['latitude'],c = 'red', s = 12, alpha = 0.4, label = 'Arrests')
# Labels and Legend
plt.xlabel('Longitude', fontsize = 20)
plt.ylabel('Latitude', fontsize = 20)
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 15)
plt.legend(fontsize = 18)
plt.title(title, fontsize = 26)
plt.tight_layout()
# Total arrests in 2014
geo_plot_arrests(df_arrests_2014, 'Total Arrests in 2014')
# Total arrests in 2015
geo_plot_arrests(df_arrests_2015, 'Total Arrests in 2015')
# Total arrests in 2016
geo_plot_arrests(df_arrests_2016, 'Total Arrests in 2016')
# Total arrests in 2017
geo_plot_arrests(df_arrests_2017, 'Total Arrests in 2017')
# Total arrests in 2018
geo_plot_arrests(df_arrests_2018, 'Total Arrests in 2018')
# Total arrests in 2019
geo_plot_arrests(df_arrests_2019, 'Total Arrests in 2019')
The data and the plots show that there was a drop in arrests from 2014 to 2019, with a marked fall starting in 2017, the year the new marijuana laws came into force. Although correlation doesn't imply causation, this is nevertheless evidence that the law impacted marijuana arrests.
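To put numbers on that drop, a quick tally (a sketch using the df_arrests dataframe filtered above, not one of the original cells) of marijuana-related arrests per year:
# Yearly tally of marijuana-related arrests in the 2014-2019 window
pd.to_datetime(df_arrests['Arrest Date']).dt.year.value_counts().sort_index()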
# Loading the dispensaries dataset
df_stores = pd.read_csv('files/df_stores.csv')
df_stores.shape
df_stores.head()
Closed dispensaries are not meaningful to the analysis and can skew results. Removing them:
# Checking for closed stores
df_stores['is_closed'].value_counts()
# Checking the closed store's record
df_stores[df_stores['is_closed']]
# Removing the record
df_stores.drop(df_stores.index[234], inplace = True)
# Checking for closed stores
df_stores['is_closed'].value_counts()
For the rest of my analysis, only the name, latitude and longitude variables will be necessary. Dropping the other variables:
# Filtering the dataframe
df_stores = df_stores[['name', 'latitude', 'longitude']]
df_stores.head()
# Statistical summary
df_stores.describe()
# Saving the filtered dataframe
df_stores.to_csv('files/df_stores_02.csv', index = False)
# Function to plot arrest location x store location
def geo_plot_02(dataframe_arrests, year):
# Plot area
fig, ax = plt.subplots(figsize = (25,15))
# Scatterplot of Arrests
plt.scatter(dataframe_arrests['longitude'], dataframe_arrests['latitude'], c = 'red', s = 12, label = 'Arrests')
# Scatterplot of Dispensaries
ax.scatter(df_stores['longitude'], df_stores['latitude'], c = 'b', s = 12, label = 'Dispensaries')
# Labels and Legend
plt.xlabel('Longitude', fontsize = 20)
plt.ylabel('Latitude', fontsize = 20)
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 15)
plt.legend(fontsize = 18)
plt.xlim(None, -118)
plt.title(f'{year}: Arrests x Dispensaries', fontsize = 30)
plt.tight_layout()
geo_plot_02(df_arrests_2014, 2014)
geo_plot_02(df_arrests_2015, 2015)
geo_plot_02(df_arrests_2016, 2016)
geo_plot_02(df_arrests_2017, 2017)
geo_plot_02(df_arrests_2018, 2018)
geo_plot_02(df_arrests_2019, 2019)
The arrests were clearly concentrated in areas close to dispensaries, both before and after legalization. Moreover, few if any stores changed location, and more stores opened over the years. Considering that legalization didn't change the arrest locations even though new stores opened, it seems likely that arrest locations are driven not by dispensary locations but by other attributes of the city landscape (such as certain neighborhoods or busy areas).
df_schools = pd.read_csv('files/df_schools.csv')
df_schools.shape
df_schools.head()
Geolocation data was not included in the schools dataset, so it will need to be obtained by other means. I'll pair each school address with the Google Maps Geocoding API to obtain its latitude and longitude. That data will also allow the identification of duplicate entries (under different names) in the school list.
!pip install -q googlemaps
import googlemaps
# API Key
gmaps_key = googlemaps.Client(key = '')
# Lists to retrieve latitude and longitude
list_latitude = []
list_longitude = []
# Loop through the school addresses list
# This may take a while
for address in df_schools['complete_address']:
# Obtain the address' geocode
g = gmaps_key.geocode(address)
# Extract latitude and longitude
try:
lat = g[0]['geometry']['location']['lat']
lng = g[0]['geometry']['location']['lng']
# In case of error fill with NaN
except:
lat = np.NaN
lng = np.NaN
# Append latitude and longitude to the lists
list_latitude.append(lat)
list_longitude.append(lng)
# Adding the lists as columns on the schools dataframe
df_schools['latitude'] = list_latitude
df_schools['longitude'] = list_longitude
# Checking for missing values
df_schools.isnull().sum()
# Using dropna() to avoid problems in case the API fails to fetch the geolocation of a school
df_schools = df_schools.dropna()
df_schools.shape
# Creating a single column with all geolocation data
# This column can be used to filter duplicate entries
df_schools['coordinates'] = df_schools['latitude'].astype('str') + ', ' + df_schools['longitude'].astype(str)
df_schools.head()
# Using the coordinates to remove duplicate entries
df_schools = df_schools[~df_schools.duplicated(subset = 'coordinates')]
# Resetting the index since rows were removed
df_schools.reset_index(drop = True, inplace = True)
# Shape
df_schools.shape
The row count shrank from 939 to 906, so there were indeed duplicate entries, most likely schools that operate two different programs at the same location under two different names.
# Statistical summary
df_schools.describe()
# The Unnamed: 0 column was the old index which became a column when the csv was imported
# It's not necessary. Removing it
df_schools.drop(labels = 'Unnamed: 0', axis = 1, inplace = True)
df_schools.head()
# Saving the dataframe
df_schools.to_csv('files/df_schools_02.csv', index = False)
# Function to plot arrest location x school location
def geo_plot_03(dataframe_arrests, year):
# Plot area
fig, ax = plt.subplots(figsize = (25,15))
# Scatterplot of Arrests
plt.scatter(dataframe_arrests['longitude'], dataframe_arrests['latitude'], c = 'red', s = 12, label = 'Arrests')
# Scatterplot of Dispensaries
ax.scatter(df_schools['longitude'], df_schools['latitude'], c = 'g', s = 12, label = 'Schools')
# Labels and Legend
plt.xlabel('Longitude', fontsize = 20)
plt.ylabel('Latitude', fontsize = 20)
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 15)
plt.legend(fontsize = 18)
plt.title(f'{year}: Arrests x Schools', fontsize = 30)
plt.tight_layout()
geo_plot_03(df_arrests_2014, 2014)
geo_plot_03(df_arrests_2015, 2015)
geo_plot_03(df_arrests_2016, 2016)
geo_plot_03(df_arrests_2017, 2017)
geo_plot_03(df_arrests_2018, 2018)
geo_plot_03(df_arrests_2019, 2019)
There are schools everywhere, so it's hard to assess visually whether there is a higher incidence of marijuana arrests near schools.
As instructed, my next step will be using the geopy package to calculate the distance between schools and arrests.
Here I'm looking at how many stores were in the area around each arrest
# Creating a dataframe to calculate distances - stores
df_stores_dist = df_stores[['name', 'latitude', 'longitude']]
df_stores_dist.shape
df_stores_dist.head()
# Creating a dataframe to calculate distances - arrests
df_arrests_dist = df_arrests[['Report ID', 'latitude', 'longitude']]
# The index has gaps from the filtering; resetting it
df_arrests_dist.reset_index(drop = True, inplace = True)
df_arrests_dist.shape
df_arrests_dist.head(3)
# Loading the geopy package which can be used to calculate distance between coordinates
# The function which does this is geodesic()
import geopy
from geopy.distance import geodesic
%%time
# Loop through all stores and all arrests and calculate the distance
# This takes a while (more than 30 minutes)
for store in range(len(df_stores_dist)):
# Get the data from that store
store_name = df_stores_dist.iloc[store]['name']
store_lat = df_stores_dist.iloc[store]['latitude']
store_long = df_stores_dist.iloc[store]['longitude']
# create new column on arrest_dist with store name
df_arrests_dist[store_name] = np.NaN
# Create an empty list to be filled with
# The distance between arrest and that given store
distance_list = []
# Loop through all arrests
for arrest in range(len(df_arrests_dist)):
# Get the geolocation from arrest
arrest_lat = df_arrests_dist.iloc[arrest]['latitude']
arrest_long = df_arrests_dist.iloc[arrest]['longitude']
# Calculate distance from arrest to store
distance = geodesic((store_lat, store_long), (arrest_lat, arrest_long)).miles
# Add to the list containing all distances from a given store
distance_list.append(distance)
# Fill the store name column with the distances list
df_arrests_dist[store_name] = distance_list
df_arrests_dist.to_csv('files/distances_dispensaries.csv')
df_arrests_dist
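As an aside, a vectorized haversine sketch (a spherical approximation of geopy's geodesic distance, added here as an assumption rather than part of the original pipeline) can compute the same arrest-by-store distance matrix in seconds via NumPy broadcasting; at city scale the approximation error is typically well under 1%, which is negligible for half-mile and one-mile thresholds:
# Vectorized great-circle distance in miles (haversine formula)
def haversine_miles(lat1, lon1, lat2, lon2):
    lat1, lon1, lat2, lon2 = map(np.radians, (lat1, lon1, lat2, lon2))
    a = np.sin((lat2 - lat1) / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin((lon2 - lon1) / 2) ** 2
    return 2 * 3958.8 * np.arcsin(np.sqrt(a))  # 3958.8 miles = mean Earth radius

# Arrests as rows, stores as columns, computed in one broadcast operation
dist_matrix = haversine_miles(df_arrests_dist['latitude'].to_numpy()[:, None],
                              df_arrests_dist['longitude'].to_numpy()[:, None],
                              df_stores_dist['latitude'].to_numpy()[None, :],
                              df_stores_dist['longitude'].to_numpy()[None, :])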
# Creating a new dataframe with only distance data
df_dist_arrest_store = df_arrests_dist.drop(['Report ID', 'latitude', 'longitude'], axis = 1)
df_dist_arrest_store.head(3)
# For each arrest count how many stores were in a 0.5 mile range
df_arrests_dist['store_0.5_mile'] = (df_dist_arrest_store < 0.5).sum(axis = 1)
# For each arrest count how many stores were in a 1 mile range
df_arrests_dist['store_1_mile'] = (df_dist_arrest_store < 1).sum(axis = 1)
# Problem prevention: Checking df_arrests and df_arrests_dist
# To see if the 'Report ID' columns still match
len(df_arrests['Report ID'].unique()) == len(df_arrests_dist['Report ID'].unique())
# Adding the columns with the calculated distances back to df_arrests
df_arrests = df_arrests.merge(df_arrests_dist[['Report ID', 'store_0.5_mile', 'store_1_mile']],
left_on = 'Report ID',
right_on = 'Report ID')
df_arrests.head(2)
# Dropping the 'Unnamed: 0' column
df_arrests = df_arrests.drop('Unnamed: 0', axis = 1)
# Saving df_arrests to disk
df_arrests.to_csv('files/df_arrests_03.csv', index = False)
Here I'm looking at how many arrests happened near each school
AND at how many schools were near each arrest
# Creating a dataframe to calculate distances - schools
df_schools_dist = df_schools[['School', 'latitude', 'longitude']].dropna()
df_schools_dist.shape
df_schools.isnull().sum()
df_schools_dist
# Dataframe for distance between arrests and schools
df_dist_arrest_school = df_arrests[['Report ID', 'latitude', 'longitude']]
# The index has gaps from the filtering; resetting it
df_dist_arrest_school.reset_index(drop = True, inplace = True)
df_dist_arrest_school
%%time
# Loop through all schools and all arrests and calculate the distance
# This takes a long time! (2 hours or so)
for school in range(len(df_schools_dist)):
# Get the data for that school
school_name = df_schools_dist.iloc[school]['School']
school_lat = df_schools_dist.iloc[school]['latitude']
school_long = df_schools_dist.iloc[school]['longitude']
# Create a new column on df_dist_arrest_school named after the school
df_dist_arrest_school[school_name] = np.NaN
# Create an empty list to be filled with
# the distances between each arrest and that given school
distance_list = []
# Loop through all arrests
for arrest in range(len(df_dist_arrest_school)):
# Get the geolocation of the arrest
arrest_lat = df_dist_arrest_school.iloc[arrest]['latitude']
arrest_long = df_dist_arrest_school.iloc[arrest]['longitude']
# Calculate the distance from the arrest to the school
distance = geodesic((school_lat, school_long), (arrest_lat, arrest_long)).miles
# Add to the list containing all distances to the given school
distance_list.append(distance)
# Fill the school's column with the distance list
df_dist_arrest_school[school_name] = distance_list
df_dist_arrest_school.to_csv('files/distances_schools.csv')
df_dist_arrest_school.shape
df_dist_arrest_school.head(2)
df_dist_arrest_school_2 = df_dist_arrest_school.drop(['Report ID', 'latitude', 'longitude'], axis = 1)
df_dist_arrest_school_2.head(2)
# For each ARREST count how many SCHOOLS were in a 0.5 mile range
df_dist_arrest_school['school_0.5_mile'] = (df_dist_arrest_school_2 < 0.5).sum(axis = 1)
# For each ARREST count how many SCHOOLS were in a 1 mile range
df_dist_arrest_school['school_1_mile'] = (df_dist_arrest_school_2 < 1).sum(axis = 1)
df_dist_arrest_school.head(2)
# Now transposing the distances dataframe in order to count the number of arrests near each school
df_dist_school_arrest = df_dist_arrest_school_2.T
# For each SCHOOL count how many ARRESTS happened in a 0.5 mile range
# (computing both counts before adding them as columns, so the second count isn't distorted by the first)
arrests_half_mile = (df_dist_school_arrest < 0.5).sum(axis = 1)
# For each SCHOOL count how many ARRESTS happened in a 1 mile range
arrests_one_mile = (df_dist_school_arrest < 1).sum(axis = 1)
df_dist_school_arrest['arrest_0.5_mile'] = arrests_half_mile
df_dist_school_arrest['arrest_1_mile'] = arrests_one_mile
df_dist_school_arrest.head(2)
df_dist_school_arrest.reset_index(inplace = True)
df_dist_school_arrest.rename(columns = {'index': 'School'}, inplace = True)
# Problem prevention: checking df_schools and df_dist_school_arrest
# to see if the 'School' columns still match
len(df_schools['School'].unique()) == len(df_dist_school_arrest['School'].unique())
# Adding two columns to the schools dataframe
# With the tally of how many arrests happened nearby
df_schools = df_schools.merge(df_dist_school_arrest[['School', 'arrest_0.5_mile', 'arrest_1_mile']],
left_on = 'School',
right_on = 'School')
df_schools.head(2)
df_arrests.head(2)
# Adding two columns to the arrests dataframe
# With the tally of how many schools were nearby
df_arrests['school_0.5_mile'] = (df_dist_arrest_school_2 < 0.5).sum(axis =1)
df_arrests['school_1_mile'] = (df_dist_arrest_school_2 < 1).sum(axis =1)
df_arrests.head(2)
df_arrests.columns
# Saving to disk all dataframes
df_arrests.to_csv('files/df_arrests_04.csv', index = False)
df_schools.to_csv('files/df_schools_03.csv' ,index = False)
# Range of dates for indexing
index_dates = pd.date_range('2014-01-01', '2019-12-31')
df_arrests
# Dataframe with arrests grouped per day (categorical variables will disappear)
df_arrests_g = df_arrests.groupby('Arrest Date').sum()
# Daily tally of arrests with a school within 0.5 mile
df_arrests_g['school_0.5_mile'].head()
# Daily tally of arrests with a school within 1 mile
df_arrests_g['school_1_mile'].head()
# Making dataframes
df_arrests_half_mile_schools = pd.DataFrame(df_arrests_g['school_0.5_mile'])
df_arrests_one_mile_schools = pd.DataFrame(df_arrests_g['school_1_mile'])
df_arrests_half_mile_schools.head()
df_arrests_one_mile_schools.head()
The dataframe index is of type Index.
To be able to tally monthly totals with resample(), it must be of type DatetimeIndex.
# Checking index type
df_arrests_half_mile_schools.index
# Converting index type
df_arrests_half_mile_schools.index = pd.to_datetime(df_arrests_half_mile_schools.index)
df_arrests_one_mile_schools.index = pd.to_datetime(df_arrests_one_mile_schools.index)
# Using resampling to tally monthly data
df_arrests_half_mile_schools.resample('M').sum().head()
# Number of arrests with a school in a 0.5-mile area
# Figure
plt.figure(figsize = (20,10))
# Plot data
plt.plot(df_arrests_half_mile_schools.resample('M').sum(), color = 'blue', linewidth = 3)
# Labels and Legend
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 15)
plt.title('\nNumber of Arrests with Schools on a 0.5-Mile Range\n', fontsize = 30)
plt.xlabel('\nMonth of Year')
plt.ylabel('\nNumber of Arrests')
plt.tight_layout()
# Number of arrests with a school in a 1-mile area
# Figure
plt.figure(figsize = (20,10))
# Plot data
plt.plot(df_arrests_one_mile_schools.resample('M').sum(), color = 'blue', linewidth = 3)
# Labels and Legend
plt.xticks(fontsize = 15)
plt.yticks(fontsize = 15)
plt.title('\nNumber of Arrests with Schools on a 1-Mile Range\n', fontsize = 30)
plt.xlabel('\nMonth of Year')
plt.ylabel('\nNumber of Arrests')
plt.tight_layout()
Something changed in 2017 that caused arrests near schools to drop. Was it marijuana legalization? It cannot be affirmed with total certainty, but everything thus far points in that direction. One important question remains: were the crimes themselves reduced, or was this simply a change in how they are accounted for (what used to be a crime no longer is)?
# Function to plot the tally of arrests per crime on each year
def plot_total_crime_type(dataframe, year):
# Filtering the dataframe by type of crime
# Counting the number of occurrences per type
# Rearrange in ascending order
# Create a bar plot
ax = dataframe['Charge Description'].value_counts(ascending = True).tail().plot(kind = 'bar',
figsize = (16,10),
color = 'magenta',
alpha = 0.7,
rot = 30)
ax.set_title(f'\nTally of Arrests per Crime Type in {year}\n', fontsize = 30)
ax.set_ylabel('\nCrime Tally\n', fontsize = 18)
ax.set_xlabel('\nCrime Type\n', fontsize = 18)
# Before plotting the data there is an issue to fix in the dataset
# There are two entries for "possess 28.5 grams or less of marijuana"
# With the second one having an extra "**"
# Merging the two entries
df_arrests['Charge Description'] = df_arrests['Charge Description'].map(lambda x: 'possess 28.5 grams or less of marijuana' if x == 'possess 28.5 grams or less of marijuana**' else x)
# Creating new dataframes per year
df_arrests_2014 = df_arrests[(df_arrests['Arrest Date'] <= '2014-12-31') & (df_arrests['Arrest Date'] >= '2014-01-01')]
df_arrests_2015 = df_arrests[(df_arrests['Arrest Date'] <= '2015-12-31') & (df_arrests['Arrest Date'] >= '2015-01-01')]
df_arrests_2016 = df_arrests[(df_arrests['Arrest Date'] <= '2016-12-31') & (df_arrests['Arrest Date'] >= '2016-01-01')]
df_arrests_2017 = df_arrests[(df_arrests['Arrest Date'] <= '2017-12-31') & (df_arrests['Arrest Date'] >= '2017-01-01')]
df_arrests_2018 = df_arrests[(df_arrests['Arrest Date'] <= '2018-12-31') & (df_arrests['Arrest Date'] >= '2018-01-01')]
df_arrests_2019 = df_arrests[(df_arrests['Arrest Date'] <= '2019-12-31') & (df_arrests['Arrest Date'] >= '2019-01-01')]
# Total Arrests per Crime in 2014
plot_total_crime_type(df_arrests_2014, 2014)
# Total Arrests per Crime in 2015
plot_total_crime_type(df_arrests_2015, 2015)
# Total Arrests per Crime in 2016
plot_total_crime_type(df_arrests_2016, 2016)
# Total Arrests per Crime in 2017
plot_total_crime_type(df_arrests_2017, 2017)
# Total Arrests per Crime in 2018
plot_total_crime_type(df_arrests_2018, 2018)
# Total Arrests per Crime in 2019
plot_total_crime_type(df_arrests_2019, 2019)
df_arrests.head()
df_arrests.describe().T
df_arrests.dtypes
df_arrests.nunique()
df_arrests_vars = df_arrests.describe().columns
# Histograms
# Figure
fig = plt.figure(figsize = (32,16))
# Plot a histogram of each variable
for i, col in enumerate(df_arrests_vars):
fig.add_subplot(4,4,1+i)
col_data = df_arrests[col]
plt.hist(col_data, color = 'khaki')
plt.title(col)
Looks like none of the variables follows a normal distribution. This can be checked later with an appropriate test!
# Now moving to the schools dataframe
df_schools.describe()
df_schools_vars = ['arrest_0.5_mile', 'arrest_1_mile']
# Histograms
# Figure
fig = plt.figure(figsize = (16,8))
# Plot a histogram of each variable
for i, col in enumerate(df_schools_vars):
fig.add_subplot(2,4,1+i)
col_data = df_schools[col]
plt.hist(col_data, color = 'green')
plt.title(col)
Again, the variables don't seem to be normally distributed.
Quick Review: The p-value is the probability of obtaining a test statistic at least as extreme as the one observed when the null hypothesis is true. The null hypothesis (H0) affirms that the population is normally distributed, against the alternative hypothesis (H1) that it is not normally distributed.
# Function to test variable normality
def test_normality(dataframe, list_of_cols, sig_level = 0.05):
# List columns on dataframe
total_variables = len(list_of_cols)
# Start the counter of variables which are 'non-normal'
not_normal_variables = 0
# Loop through each column
for col in list_of_cols:
# Run the normality test (D'Agostino-Pearson test from scipy.stats)
p_val = stats.normaltest(dataframe[col])[1]
# Significance level (taken from the function argument)
sig_lvl = sig_level
# Check p-value
if p_val < sig_lvl:
print(f'{col}')
print(f'P-value: {p_val}\nSignificance Level: {sig_lvl}\n')
print("The p-value is lower than the Significance Level, hence the null hypothesis is rejected and we conclude that the variable is not normally distributed.\n")
not_normal_variables += 1
else:
print(f'{col}')
print(f'P-value: {p_val}\nSignificance Level: {sig_lvl}\n')
print("The p-value is greater than the Significance Level, hence we fail to reject the null hypothesis that the variable is normally distributed.\n")
if not_normal_variables > 0:
print(f'{not_normal_variables} variables failed the normality test')
# Test normality for arrests
test_normality(df_arrests, df_arrests_vars)
# Test normality for schools
test_normality(df_schools, df_schools_vars)
Conclusion: None of the variables is normally distributed. This means that much caution is needed when creating the predictive models.
Clearly there seems to be a relationship between the number of arrests and marijuana legalization. The proximity between arrests and dispensaries or schools doesn't seem to change from year to year, which suggests there is no correlation between them. The important question to answer now is whether there is some other factor that could better explain the drop in marijuana-related crimes.
Tasks for part 3:
# Suppressing the warning messages generated by newer versions of Pandas and Matplotlib
import sys
import warnings
import matplotlib.cbook
if not sys.warnoptions:
warnings.simplefilter("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore", category=FutureWarning)
warnings.filterwarnings("ignore", category=matplotlib.cbook.mplDeprecation)
# Data manipulation and visualization
import math
import numpy as np
import pandas as pd
import matplotlib as m
import matplotlib.pyplot as plt
from scipy import stats
# Machine Learning
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
# Time series analysis
import pickle
import calendar
import statsmodels
from statsmodels.tsa.arima_model import ARIMA, ARMA
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import acf, pacf
# Graphics formatting
m.rcParams['axes.labelsize'] = 14
m.rcParams['xtick.labelsize'] = 12
m.rcParams['ytick.labelsize'] = 12
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 15,7
plt.style.use('fivethirtyeight')
%matplotlib inline
# Loading data
df_arrests = pd.read_csv('files/df_arrests_04.csv')
df_arrests.shape
df_arrests.head()
df_arrests.drop(labels = ['store_0.5_mile_y', 'store_1_mile_y'], axis = 1, inplace = True)
df_arrests.rename(axis = 1, mapper = {'store_0.5_mile_x' : 'store_0.5_mile', 'store_1_mile_x' : 'store_1_mile'}, inplace = True)
df_arrests.head(3)
# Listing all charge descriptions
charge_description_list = list(df_arrests['Charge Description'].unique())
# Ordering the list
charge_description_list.sort()
charge_description_list
Creating a target variable, which will be based on whether or not the charge was related to marijuana sale.
# Using map function to identify marijuana sale
target = df_arrests['Charge Description'].map(lambda x: 'sell' if 'sale' in x else x)
# Identifying sale or not
target = target.map(lambda x: 'sell' if 'sell' in x else x)
target = target.map(lambda x: 'not sell' if 'sell' not in x else x)
# Counting the number of examples in each class
target.value_counts(normalize = True)
# Adding the variable to the dataframe
df_arrests['target'] = target
# Encoding the variable
df_arrests['target'] = df_arrests['target'].map(lambda x: 1 if x == 'sell' else 0)
df_arrests.head()
# Checking correlation between the two dispensary distance variables
df_arrests[['store_0.5_mile', 'store_1_mile']].corr()
# Checking correlation between the two school distance variables
df_arrests[['school_0.5_mile', 'school_1_mile']].corr()
Within each pair, the variables are highly correlated, so one of each will be dropped. I'll also drop some other variables which shouldn't be part of the model, such as 'Charge Description', which was used to generate the target, and 'Arrest Date', since the goal is to assess how things might have played out around the law change. I'll keep the latitude and longitude variables and remove most of the other variables which refer to location.
df_arrests.columns
X = df_arrests.drop(['Report ID', 'Arrest Date', 'Area ID', 'Reporting District',
'Charge Group Code', 'Charge Group Description', 'Charge',
'Charge Description', 'Address', 'Cross Street', 'Location',
'marijuana_related', 'store_0.5_mile', 'school_0.5_mile',
'target'], axis = 1)
X
# Creating dummy variables for the categorical variables
X = pd.get_dummies(X)
X.head()
# Target (y)
Y = df_arrests['target']
Y.head()
# Train test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, stratify = Y, test_size = 0.25)
# Create model
model_01 = LogisticRegression()
model_01.fit(X_train, Y_train)
# Evaluate accuracy
acc_train = model_01.score(X_train, Y_train)
print(f'Train accuracy: {round(acc_train * 100, 2)}%')
acc_test = model_01.score(X_test, Y_test)
print(f'Test accuracy: {round(acc_test * 100, 2)}%')
The accuracy is quite good
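As a quick sanity check (a sketch using the model and split above, not one of the original cells), the accuracy can be compared against the majority-class baseline and complemented with the ROC AUC and a confusion matrix, which are less sensitive to class balance:
# Majority-class baseline: accuracy of always predicting the most frequent class
baseline = Y_test.value_counts(normalize = True).max()
print(f'Majority-class baseline accuracy: {round(baseline * 100, 2)}%')
# ROC AUC from the predicted probability of the positive class
probabilities = model_01.predict_proba(X_test)[:, 1]
print(f'Test ROC AUC: {round(roc_auc_score(Y_test, probabilities), 3)}')
# Confusion matrix at the default 0.5 threshold
print(confusion_matrix(Y_test, model_01.predict(X_test)))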
# Extracting the coefficients
beta = model_01.coef_[0]
model_01.coef_[0]
# Attributes
attributes = list(X.columns)
# Creating a dataframe matching attributes to their coefficients
beta_df = pd.DataFrame({'attributes': attributes, 'beta': beta})
beta_df.set_index('attributes', inplace = True)
beta_df.head()
# Exponentiating the coefficients to obtain the odds ratios (the coefficients themselves are the log odds)
beta_df['odds_ratio'] = beta_df['beta'].map(lambda x: np.exp(x))
# Descending order
beta_df = beta_df.sort_values('odds_ratio', ascending = False)
beta_df.head(10)
# Checking the coefficient for the distance variable: schools
beta_df[beta_df.index == 'school_1_mile']
# Checking the coefficient for the distance variable: dispensaries
beta_df[beta_df.index == 'store_1_mile']
# Extracting the probabilities to make a 'confidence plot'
prob_arrest_sale = model_01.predict_proba(X_test)[:,1]
prob_arrest_sale
# Create dataframe with actual values versus predicted probabilities (1 = sale arrest)
pred_df = pd.DataFrame({'true_values': Y_test, 'predicted_values':prob_arrest_sale})
pred_df
# Plotting the probability distribution
# Figure
plt.figure(figsize = (18,10))
# Histogram of real values x predicted values for class 1
plt.hist(pred_df[pred_df['true_values'] == 1]['predicted_values'],
bins = 30,
color = 'r',
alpha = 0.6,
label = 'Marijuana Sale Arrest')
# Histogram of real values x predicted values for class 0
plt.hist(pred_df[pred_df['true_values'] == 0]['predicted_values'],
bins = 30,
color = 'blue',
alpha = 0.6,
label = 'Non-Marijuana Sale Arrest')
# Center line to split the plot
plt.vlines(x = 0.5, ymin = 0, ymax = 200, color = 'green', linestyle = '--')
# Labels and Legend
plt.title('\nProbability Distribution\n', fontsize = 30)
plt.ylabel('\nFrequency\n', fontsize = 30)
plt.xlabel('\nPredicted Probability of Arrest for Marijuana Sale\n', fontsize = 24)
plt.legend(fontsize = 24);
Does having a police department nearby alter the result?
# Extracting the coefficients per area name
beta_df_area = beta_df[beta_df.index.str.contains('Area Name')].head(8)
beta_df_area.head()
The Los Angeles police departments are listed here:
http://www.lapdonline.org/our_communities/content_basic_view/6279
Manually extracting their geolocation coordinates:
# Create dataframe for geolocation data of LAPD
LAPD = pd.DataFrame()
LAPD['latitude'] = [33.9383761,
34.097986,
33.9920067,
34.0443028,
33.7584097,
34.050264,
34.1195162,
34.1842023]
LAPD['longitude'] = [-118.2749244,
-118.331013,
-118.4199295,
-118.4509833,
-118.2880336,
-118.291531,
-118.2497385,
-118.3021552]
# Plot of class 1 (arrests for marijuana sale) versus LAPD office locations
# Figure
fig, ax = plt.subplots(figsize = (18,10))
# Plot
plt.scatter(df_arrests[df_arrests['target'] == 1]['latitude'],
df_arrests[df_arrests['target'] == 1]['longitude'],
s = 15,
alpha = 0.4,
label = 'Arrests for Marijuana Sale')
# Titles, labels, and legend
ax.scatter(LAPD['latitude'], LAPD['longitude'], color = 'red', label = 'Los Angeles Police Departments', s = 100)
plt.title('\nArrests for Marijuana Sale x LAPD Offices\n', fontsize = 24)
plt.xlabel('\nLatitude\n', fontsize = 20)
plt.ylabel('\nLongitude\n', fontsize = 20)
ax.legend(fontsize = 20)
plt.tight_layout()
# Plot of class 0 (arrests for other reasons) versus LAPD office locations
# Figure
fig, ax = plt.subplots(figsize = (18,10))
# Plot
plt.scatter(df_arrests[df_arrests['target'] == 0]['latitude'],
df_arrests[df_arrests['target'] == 0]['longitude'],
s = 15,
alpha = 0.4,
label = 'Arrests for Other Reasons')
# Titles, labels, and legend
ax.scatter(LAPD['latitude'], LAPD['longitude'], color = 'red', label = 'Los Angeles Police Departments', s = 100)
plt.title('\nArrests for Other Reasons x LAPD Offices\n', fontsize = 24)
plt.xlabel('\nLatitude\n', fontsize = 20)
plt.ylabel('\nLongitude\n', fontsize = 20)
ax.legend(fontsize = 20)
plt.tight_layout()
The fear of exposing children to marijuana doesn't seem to be supported. There are two classes, selling and not selling marijuana, and the argument was that marijuana sales expose more children to it. The variable school_1_mile is the count of schools within a 1-mile range of each arrest. Given that the odds ratio of school_1_mile is roughly equal to one, an arrest with a school within a 1-mile range is about as likely to be related to marijuana sale as to other causes.