Matheus Schmitz
LinkedIn
Github Portfolio
My intent for this project is to use Spark (this time in combination with R) to mine association rules using data mining techniques. This of course requires tools suited for Big Data approaches, hence Spark is chosen, as it allows for the manipulation of large datasets distributed across multiple computing nodes.
Big data presents certain hindrances to neural networks and other gradient descent learning approaches, while being friendlier (less unfriendly?) to techniques that are easily parallelized, such as those employed when mining association rules. Hence such techniques are very commonly used to analyse very large datasets, which is my goal here. Among the multiple algorithms available, I'll focus on one widely regarded as being among the best: the Frequent Pattern Growth (FP-Growth) algorithm.
For this project I'll be using crime data available from the UK Police's Open Data Portal, which contains a variety of records on all registered crimes. The data is available from 2014 onwards, although I've chosen to work with two years of data, from January 2019 to December 2020, which allows for a control and a test group for exploring the impacts of COVID-19 on crime patterns.
Data Source: https://data.police.uk/data/
import glob
import pandas as pd
from tqdm import tqdm
# Before starting the work in R, I need to run a python script to concatenate all CSVs
dfs = glob.glob('**/*.csv', recursive=True)
print(f'Number of files found: {len(dfs)}')
Number of files found: 1061
There are 44 or 45 CSV files per month (one per region), so over 24 months the expected number of files was between 1056 and 1080; it seems we got them all!
# Now concatenate all CSVs into a single dataframe
result = pd.concat([pd.read_csv(df) for df in tqdm(dfs)], ignore_index=True)
result.shape
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 1061/1061 [02:18<00:00, 7.66it/s]
(12880086, 12)
# Then save the dataframe to disk
result.to_csv('data/UK_Crime.csv', index=False)
# Since the base dataset turned out to be so massive, I'll also create a downsampled version to ensure I can run it
result_downsampled = result.sample(frac=0.001)
result_downsampled.to_csv('data/UK_Crime_downsampled.csv', index=False)
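One caveat with `sample(frac=0.001)` is that each run draws a different subset. A minimal sketch of a reproducible variant, on synthetic data and with a hypothetical `downsample` helper (`random_state` pins the draw):

```python
import pandas as pd

def downsample(df: pd.DataFrame, frac: float = 0.001, seed: int = 42) -> pd.DataFrame:
    """Return a reproducible random sample of `frac` of the rows."""
    return df.sample(frac=frac, random_state=seed)

# Synthetic stand-in for the 12.8M-row crime dataframe
df = pd.DataFrame({"Crime type": ["Drugs"] * 100_000})
small = downsample(df)
print(len(small))  # 100 rows, i.e. 0.1% of 100,000
```

Pinning the seed means the downsampled CSV can be regenerated identically if the pipeline is ever re-run.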
# Load packages
library(tidyverse)
library(visNetwork)
library(sparklyr)
library(reshape2)
library(leaflet)
library(leaflet.extras)
library(htmlwidgets)
Warning message: "package 'htmlwidgets' was built under R version 3.6.3"
# Install Spark
spark_install(version = '2.4.4')
# Specify Java 1.8 as the java version, as required by sparklyr
Sys.setenv(JAVA_HOME="C:/Java/jdk1.8.0_241")
# Connect to the Spark cluster
sc <- spark_connect(master = "local")
Registered S3 method overwritten by 'openssl':
  method      from
  print.bytes Rcpp
# Load the dataset
setwd("C:/Portfolio/Mining_Association_Rules_Sparklyr")
df <- read_csv('data/UK_Crime_downsampled.csv')
head(df)
Parsed with column specification:
cols(
  `Crime ID` = col_character(),
  Month = col_character(),
  `Reported by` = col_character(),
  `Falls within` = col_character(),
  Longitude = col_double(),
  Latitude = col_double(),
  Location = col_character(),
  `LSOA code` = col_character(),
  `LSOA name` = col_character(),
  `Crime type` = col_character(),
  `Last outcome category` = col_character(),
  Context = col_logical()
)
Crime ID | Month | Reported by | Falls within | Longitude | Latitude | Location | LSOA code | LSOA name | Crime type | Last outcome category | Context |
---|---|---|---|---|---|---|---|---|---|---|---|
NA | 2020-10 | Devon & Cornwall Police | Devon & Cornwall Police | -3.521994 | 50.49251 | On or near Sports/Recreation Area | E01015236 | Torbay 002A | Anti-social behaviour | NA | NA |
c3bea8c306a64b5424dad60f64cb168820eb8b0c750e76a56db75fb9ef8c11a1 | 2020-10 | West Mercia Police | West Mercia Police | -2.492143 | 52.90896 | On or near Tudor Close | E01028875 | Shropshire 005C | Violence and sexual offences | Unable to prosecute suspect | NA |
NA | 2019-10 | West Yorkshire Police | West Yorkshire Police | -1.563360 | 53.87736 | On or near Village Road | E01011554 | Leeds 012A | Anti-social behaviour | NA | NA |
a7e783b015c2ddb4238b7708638dfb675d6101a24e2230a965c21e441b6a4724 | 2019-08 | West Midlands Police | West Midlands Police | -2.174542 | 52.58081 | On or near White Oak Drive | E01010509 | Wolverhampton 019A | Violence and sexual offences | Unable to prosecute suspect | NA |
NA | 2020-04 | South Yorkshire Police | South Yorkshire Police | -1.136918 | 53.52296 | On or near Parking Area | E01007645 | Doncaster 022E | Anti-social behaviour | NA | NA |
8ca3d9c1567c96cbda22a1fae9bd24f343fb59d2c1bb6e8b49acc596a45f88c8 | 2019-09 | Cambridgeshire Constabulary | Cambridgeshire Constabulary | -0.284872 | 52.60068 | On or near Stumpacre | E01015596 | Peterborough 009B | Violence and sexual offences | Investigation complete; no suspect identified | NA |
# Data summary
summary(df)
   Crime ID            Month           Reported by        Falls within
 Length:12880       Length:12880       Length:12880       Length:12880
 Class :character   Class :character   Class :character   Class :character
 Mode  :character   Mode  :character   Mode  :character   Mode  :character

   Longitude          Latitude       Location           LSOA code
 Min.   :-7.7370   Min.   :50.10   Length:12880       Length:12880
 1st Qu.:-2.1328   1st Qu.:51.50   Class :character   Class :character
 Median :-1.2513   Median :52.11   Mode  :character   Mode  :character
 Mean   :-1.3304   Mean   :52.43
 3rd Qu.:-0.1685   3rd Qu.:53.41
 Max.   : 1.7521   Max.   :55.92
 NA's   :277       NA's   :277

  LSOA name          Crime type        Last outcome category  Context
 Length:12880       Length:12880       Length:12880           Mode:logical
 Class :character   Class :character   Class :character       NA's:12880
 Mode  :character   Mode  :character   Mode  :character
Visualizing Crimes with Leaflet!
# Visualizing the dispersion of crimes across the UK
map1 <- df %>% dplyr::filter(!is.na(Longitude)) %>%
filter(!is.na(Latitude)) %>%
select(Longitude, Latitude) %>%
group_by(Longitude, Latitude) %>%
count() %>%
leaflet() %>%
addTiles() %>%
addWebGLHeatmap(lng=~Longitude, lat=~Latitude, intensity = ~n, size=2000, opacity = 0.8)
# View map
map1
# Saving the map as an interactive html.
saveWidget(map1, file="map1.html")
View the interactive map here: Dispersion of Crimes in the UK
Interactive Map with Clustering by Region
map2 <- df %>% dplyr::filter(!is.na(Longitude)) %>%
filter(!is.na(Latitude)) %>%
select(Longitude, Latitude, 'Crime type') %>%
leaflet() %>%
addTiles() %>%
addMarkers(
clusterOptions = markerClusterOptions(),
popup = ~`Crime type`
)
# View map
map2
# Saving the map as an interactive html.
saveWidget(map2, file="map2.html")
Assuming "Longitude" and "Latitude" are longitude and latitude, respectively
View the interactive map here: Crime Clustering by Region
# Inspecting head(df), it seems like the cases without a "Crime ID" also don't have a "Last outcome category". Let's verify that!
null_test <- df %>% select(`Crime ID`, `Last outcome category`)
# Check if any sample has only "Crime ID" or "Last outcome category" but not both
# Note: the columns must be referenced with backticks; quoting them as strings would test a constant
test_1 <- null_test %>% filter(is.na(`Crime ID`) & !is.na(`Last outcome category`)) %>% nrow()
test_2 <- null_test %>% filter(!is.na(`Crime ID`) & is.na(`Last outcome category`)) %>% nrow()
print(c(test_1, test_2))
[1] 0 0
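The same co-occurrence check can be sketched in pandas on toy data (the column names follow the dataset, the rows are made up); the XOR counts rows where exactly one of the two columns is missing:

```python
import pandas as pd

df = pd.DataFrame({
    "Crime ID": [None, "abc123", None],
    "Last outcome category": [None, "Under investigation", None],
})

id_na = df["Crime ID"].isna()
outcome_na = df["Last outcome category"].isna()

# Rows where exactly one of the two columns is missing
mismatch = (id_na ^ outcome_na).sum()
print(mismatch)  # 0 -> the columns are always missing (or present) together
```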
# Column names
colnames(df)
# Number of samples (count() over a quoted string groups by a constant, so this just tallies every row)
df %>% count('Reported by')
"Reported by" | n |
---|---|
Reported by | 12880 |
Focusing on one City
# Let's focus on a single city: Swindon, Wiltshire, England
# See how many crimes we have for that city
swindon_crimes <- df %>% filter(grepl('Swindon', `LSOA name`) | grepl('Wiltshire', `LSOA name`)) %>% count()
paste("Tally of Crimes in Swindon, Wiltshire: ", swindon_crimes$n)
# Check the data
swindon_df <- df %>% filter(grepl('Swindon', `LSOA name`) | grepl('Wiltshire', `LSOA name`))
head(swindon_df)
Crime ID | Month | Reported by | Falls within | Longitude | Latitude | Location | LSOA code | LSOA name | Crime type | Last outcome category | Context |
---|---|---|---|---|---|---|---|---|---|---|---|
b70d8ce9de41bce8ad5476f226dae6b468902a0d4a45d43e4c816324f043838c | 2019-08 | Wiltshire Police | Wiltshire Police | -1.780198 | 51.56940 | On or near Beatrice Street | E01015510 | Swindon 010C | Criminal damage and arson | Investigation complete; no suspect identified | NA |
5e0362fea6f93b044af724f1d720fbef818ec054efe6fb890194cffc9023b152 | 2020-11 | Wiltshire Police | Wiltshire Police | -1.754611 | 51.55432 | On or near Colston Close | E01015540 | Swindon 020C | Violence and sexual offences | Unable to prosecute suspect | NA |
NA | 2020-09 | Wiltshire Police | Wiltshire Police | -2.002448 | 51.32839 | On or near Firs Hill Way | E01031872 | Wiltshire 028D | Anti-social behaviour | NA | NA |
50aeb31628bf8bd2a8092f4276b12db3fa4ca5cac54e4171e88536d8ab79bce4 | 2019-05 | Wiltshire Police | Wiltshire Police | -1.794252 | 51.55621 | On or near William Street | E01015477 | Swindon 015C | Drugs | Court result unavailable | NA |
NA | 2020-12 | Wiltshire Police | Wiltshire Police | -1.746214 | 51.56771 | On or near Sports/Recreation Area | E01015550 | Swindon 009B | Anti-social behaviour | NA | NA |
NA | 2019-12 | Wiltshire Police | Wiltshire Police | -2.129935 | 51.46354 | On or near Chestnut Road | E01031910 | Wiltshire 009B | Anti-social behaviour | NA | NA |
# The Context column seems to have only NA's. Let's check
df %>% filter(!is.na(Context)) %>% nrow()
We have very few samples for a single city, so let's move back to the full dataset.
Current Investigation Status of Crimes
# Let's check what is the current status of crime reports in the dataset
df %>% select(`Last outcome category`) %>%
table(useNA = 'always') %>%
sort(decreasing = T) %>%
head(10)
Investigation complete; no suspect identified          3527
<NA>                                                   3394
Unable to prosecute suspect                            2850
Status update unavailable                              1260
Under investigation                                     525
Court result unavailable                                460
Awaiting court outcome                                  210
Local resolution                                        205
Offender given a caution                                105
Further investigation is not in the public interest     101
# The Swindon data gives the impression that all "Anti-social behaviour" crimes have NA for "Last outcome category". Checking that too!
df %>% filter(is.na(`Last outcome category`)) %>%
select(c(`Location`, `LSOA name`, `Crime type`, `Last outcome category`)) %>%
head(10)
Location | LSOA name | Crime type | Last outcome category |
---|---|---|---|
On or near Sports/Recreation Area | Torbay 002A | Anti-social behaviour | NA |
On or near Village Road | Leeds 012A | Anti-social behaviour | NA |
On or near Parking Area | Doncaster 022E | Anti-social behaviour | NA |
On or near St George'S Road | Sutton 017B | Anti-social behaviour | NA |
On or near Longstaff Avenue | Cannock Chase 006B | Anti-social behaviour | NA |
On or near East Street | Taunton Deane 010B | Anti-social behaviour | NA |
On or near Edward Road | Rushcliffe 004C | Anti-social behaviour | NA |
On or near Waarden Road | Castle Point 011B | Anti-social behaviour | NA |
On or near Clifton Gardens | Mansfield 013A | Anti-social behaviour | NA |
On or near Sherston Close | Bristol 013E | Anti-social behaviour | NA |
Tabulating Crime Types
# Total number of Anti-social behaviour crimes
asb <- df %>% filter(`Crime type`== 'Anti-social behaviour') %>%
count()
paste("Total number of Anti-social behaviour crimes: ", asb$n)
df %>% filter(is.na(`Last outcome category`)) %>%
select(`Crime type`) %>%
table()
Anti-social behaviour            3121
Bicycle theft                       9
Burglary                           11
Criminal damage and arson          41
Drugs                              21
Other crime                         7
Other theft                        32
Possession of weapons               4
Public order                       19
Robbery                             1
Shoplifting                        16
Theft from the person              15
Vehicle crime                       6
Violence and sexual offences       91
Indeed, every Anti-social behaviour record has a missing outcome, so they all go unresolved.
# Unfortunately, given the share of nulls for Anti-social behaviour, I have to drop it from the samples for the Association Rules
# Finish data cleaning by:
# 1 - Keeping only Swindon/Wiltshire,
# 2 - Removing Anti-social behaviour
# 3 - Cleaning the location string to remove "On or near"
df_clean <- df %>% select(-c(`Crime ID`, `Reported by`,
`Falls within`, `LSOA code`, Context)) %>%
filter(grepl('Swindon', `LSOA name`) | grepl('Wiltshire', `LSOA name`)) %>%
filter(`Crime type` != 'Anti-social behaviour') %>%
mutate(Location = trimws(str_replace(Location, 'On or near', ""))) %>%
rowid_to_column("id")
# Check the clean dataset
head(df_clean)
id | Month | Longitude | Latitude | Location | LSOA name | Crime type | Last outcome category |
---|---|---|---|---|---|---|---|
1 | 2019-08 | -1.780198 | 51.56940 | Beatrice Street | Swindon 010C | Criminal damage and arson | Investigation complete; no suspect identified |
2 | 2020-11 | -1.754611 | 51.55432 | Colston Close | Swindon 020C | Violence and sexual offences | Unable to prosecute suspect |
3 | 2019-05 | -1.794252 | 51.55621 | William Street | Swindon 015C | Drugs | Court result unavailable |
4 | 2019-05 | -1.753304 | 51.57725 | Tilley'S Lane | Swindon 009A | Drugs | Status update unavailable |
5 | 2020-02 | -1.783744 | 51.57921 | Pinehurst Road | Swindon 010A | Violence and sexual offences | Unable to prosecute suspect |
6 | 2020-02 | -1.792038 | 51.06850 | Nightclub | Wiltshire 059D | Violence and sexual offences | Investigation complete; no suspect identified |
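The same three cleaning steps can be sketched in pandas (column names follow the dataset; the rows here are made up for illustration):

```python
import pandas as pd

df = pd.DataFrame({
    "LSOA name": ["Swindon 010C", "Leeds 012A", "Wiltshire 028D"],
    "Crime type": ["Drugs", "Burglary", "Anti-social behaviour"],
    "Location": ["On or near Beatrice Street", "On or near Village Road",
                 "On or near Firs Hill Way"],
})

clean = (
    df[df["LSOA name"].str.contains("Swindon|Wiltshire")]       # 1 - keep Swindon/Wiltshire
      .loc[lambda d: d["Crime type"] != "Anti-social behaviour"] # 2 - drop anti-social behaviour
      .assign(Location=lambda d: d["Location"]                   # 3 - strip "On or near"
              .str.replace("On or near", "", regex=False).str.strip())
      .reset_index(drop=True)
)
print(clean["Location"].tolist())  # ['Beatrice Street']
```

The Leeds row fails the region filter and the Wiltshire row is anti-social behaviour, so only the Swindon drugs record survives.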
In order to apply Spark's FP-Growth algorithm (to mine association rules) the data must be in the correct format, which includes converting from wide format to long format, and collecting each element into a list.
# The goal is to search for associations between a crime's location, type and outcome.
df_assoc <- df_clean %>% select(id,
`LSOA name`,
Location,
`Crime type`,
`Last outcome category`) %>%
melt(id.vars = 'id') %>%
select(id, value)
# Check the resulting dataframe
tail(df_assoc)
id | value | |
---|---|---|
299 | 71 | Status update unavailable |
300 | 72 | Investigation complete; no suspect identified |
301 | 73 | Local resolution |
302 | 74 | Investigation complete; no suspect identified |
303 | 75 | Unable to prosecute suspect |
304 | 76 | Investigation complete; no suspect identified |
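The wide-to-long reshape can be sketched in pandas on a two-row toy frame: each crime id ends up with one row per attribute, and those rows later become the items of one basket.

```python
import pandas as pd

wide = pd.DataFrame({
    "id": [1, 2],
    "LSOA name": ["Swindon 010C", "Swindon 020C"],
    "Crime type": ["Drugs", "Burglary"],
})

# melt stacks the non-id columns into (id, variable, value) rows
long = wide.melt(id_vars="id", value_name="value")[["id", "value"]]

# grouping by id and collecting to a list yields one basket per crime
baskets = long.groupby("id")["value"].apply(list)
print(baskets[1])  # ['Swindon 010C', 'Drugs']
```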
# Move the dataframe to spark
df_assoc_tbl <- sparklyr::sdf_copy_to(sc, df_assoc, overwrite = T)
# Inspect result
head(df_assoc_tbl)
# Source: spark<?> [?? x 2]
     id value
  <int> <chr>
1     1 Swindon 010C
2     2 Swindon 020C
3     3 Swindon 015C
4     4 Swindon 009A
5     5 Swindon 010A
6     6 Wiltshire 059D
# To create the baskets to be used in Association Rule Mining each element must be collected to a list by id
df_assoc_collect <- df_assoc_tbl %>%
group_by(id) %>%
summarise(
items = collect_list(value)
)
# Inspect result
head(df_assoc_collect)
# Source: spark<?> [?? x 2]
     id items
  <int> <list>
1    12 <list [4]>
2    13 <list [4]>
3    14 <list [4]>
4    18 <list [4]>
5    38 <list [4]>
6    46 <list [4]>
The 4 elements in each list are [LSOA name, Location, Crime type, Last outcome category].
Using the FP-Growth algorithm, which is an improved version of the famous Apriori algorithm.
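FP-Growth itself compresses the baskets into a prefix tree, but what it ultimately computes is the same as the classic support/confidence counting. A brute-force sketch on toy baskets (not FP-Growth's tree, just the quantities it mines):

```python
from itertools import combinations

baskets = [
    {"Swindon 010C", "Drugs", "Status update unavailable"},
    {"Swindon 010C", "Drugs", "Court result unavailable"},
    {"Swindon 020C", "Burglary", "Under investigation"},
]

def support(itemset):
    """Fraction of baskets containing every item of `itemset`."""
    return sum(itemset <= b for b in baskets) / len(baskets)

# Frequent pairs at min_support = 0.5
items = sorted(set().union(*baskets))
pairs = [frozenset(p) for p in combinations(items, 2) if support(frozenset(p)) >= 0.5]

# Confidence of the rule {Swindon 010C} -> {Drugs}
conf = support({"Swindon 010C", "Drugs"}) / support({"Swindon 010C"})
print(conf)  # 1.0: every Swindon 010C basket also contains Drugs
```

FP-Growth avoids enumerating all candidate pairs the way this sketch (and Apriori) does, which is why it scales to large basket collections.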
# How to use FP-Growth
?ml_fpgrowth
ml_fpgrowth(df_assoc_collect)
FPGrowthModel (Transformer)
<fpgrowth__10b0bf70_b43b_11eb_b731_05f8a276ca00>
 (Parameters -- Column Names)
  items_col: items
  prediction_col: prediction
 (Transformer Info)
  association_rules: <function>
  freq_itemsets: <tbl_spark>
# Defining the model and rules
model <- sparklyr::ml_fpgrowth(df_assoc_collect, min_support = 0.01, min_confidence = 0.7)
rules <- sparklyr::ml_association_rules(model)
# Let's check the rules
rules_df <- as.data.frame(rules)
head(rules_df[order(rules_df$lift, decreasing=TRUE),], 10)
antecedent | consequent | confidence | lift | |
---|---|---|---|---|
1 | Wiltshire 061A , Criminal damage and arson , Investigation complete; no suspect identified | Blind Lane | 1 | 76 |
3 | Swindon 001C | Folly Way | 1 | 76 |
6 | Local resolution | Castle Well Road | 1 | 76 |
8 | Local resolution | Wiltshire 057B | 1 | 76 |
9 | Swindon 009A , Drugs , Status update unavailable | Tilley'S Lane | 1 | 76 |
10 | Sports/Recreation Area , Criminal damage and arson | Wiltshire 008A | 1 | 76 |
16 | Wiltshire 056A , Vehicle crime , Investigation complete; no suspect identified | College Street | 1 | 76 |
19 | Sports/Recreation Area, Public order | Wiltshire 018E | 1 | 76 |
22 | Offender given penalty notice | Swindon 013F | 1 | 76 |
23 | Offender given penalty notice | Charminster Close | 1 | 76 |
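The uniform lift of 76 in the table is no coincidence: the long dataframe above had ids running up to 76, i.e. 76 baskets. For a rule with confidence 1 whose consequent appears in exactly one basket, lift = confidence / support(consequent) works out to exactly 76:

```python
# Lift of a confidence-1 rule whose consequent occurs in 1 of 76 baskets
n_baskets = 76
consequent_count = 1
confidence = 1.0

support_consequent = consequent_count / n_baskets
lift = confidence / support_consequent
print(lift)  # 76.0
```

Such maximal-lift rules on singleton consequents are a telltale sign of a small sample: with only 76 baskets, many item combinations occur exactly once, so these rules should be read cautiously.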
# Prepare the algorithm
uid = sparklyr:::random_string("fpgrowth_")
jobj = invoke_new(sc, "org.apache.spark.ml.fpm.FPGrowth", uid)
# Create the model
FPGmodel <- jobj %>%
invoke("setItemsCol", "items") %>%
invoke("setMinConfidence", 0.03) %>%
invoke("setMinSupport", 0.01) %>%
invoke("fit", spark_dataframe(df_assoc_collect))
# Function to extract the association rules
ml_fpgrowth_extract_rules = function(FPGmodel, nLHS = 2, nRHS = 1)
{
rules = FPGmodel %>% invoke("associationRules")
sdf_register(rules, "rules")
exprs1 <- lapply(
0:(nLHS - 1),
function(i) paste("CAST(antecedent[", i, "] AS string) AS LHSitem", i, sep="")
)
exprs2 <- lapply(
0:(nRHS - 1),
function(i) paste("CAST(consequent[", i, "] AS string) AS RHSitem", i, sep="")
)
splittedLHS = rules %>% invoke("selectExpr", exprs1)
splittedRHS = rules %>% invoke("selectExpr", exprs2)
p1 = sdf_register(splittedLHS, "tmp1")
p2 = sdf_register(splittedRHS, "tmp2")
# Collect the rules for manipulation in R
bind_cols(
sdf_bind_cols(p1, p2) %>% collect(),
rules %>% collect() %>% select(confidence)
)
}
# Extract the association rules generated by the model
rules <- ml_fpgrowth_extract_rules(FPGmodel)
head(rules)
LHSitem0 | LHSitem1 | RHSitem0 | confidence |
---|---|---|---|
Wiltshire 061A | Criminal damage and arson | Blind Lane | 1 |
Wiltshire 023D | Parking Area | Other theft | 1 |
Swindon 001C | NA | Folly Way | 1 |
Swindon 001C | NA | Unable to prosecute suspect | 1 |
Swindon 001C | NA | Violence and sexual offences | 1 |
Local resolution | NA | Castle Well Road | 1 |
# Association map
rules = rules %>% filter(confidence > 0.5)
nds = unique(
c(
rules[,"LHSitem0"][[1]],
rules[,"RHSitem0"][[1]]
)
)
# Graph nodes
nodes = data.frame(id = nds, label = nds, title = nds) %>% arrange(id)
# Graph edges
edges = data.frame(
from = rules[,"LHSitem0"][[1]],
to = rules[,"RHSitem0"][[1]]
)
# Visualize the Association Rules Graph
graph_plot <- visNetwork(nodes, edges, main = "Association Rules of Crimes in Swindon, Wiltshire", size=1) %>%
visOptions(highlightNearest = TRUE, nodesIdSelection = TRUE) %>%
visEdges(arrows = 'from') %>%
visPhysics(
solver = "barnesHut",
forceAtlas2Based = list(gravitationalConstant = -20, maxVelocity = 1)
)
# View map
graph_plot
# Saving the map as an interactive html.
saveWidget(graph_plot, file="graph_plot.html")
View the interactive Association Rule Graph here: Association Rules Graph
(This one takes a bit of time to load)
# Close the Spark connection
spark_disconnect(sc)