# Move into the project directory so that relative paths used later in the
# script resolve. The path is machine-specific, so only switch when it exists;
# elsewhere the current working directory is kept (run the script from the
# project root) and a warning is raised instead of aborting on the first line.
project_dir <- "C:/Portfolio/People_Analytics_Resignations"
if (dir.exists(project_dir)) {
  setwd(project_dir)
} else {
  warning("Project directory not found: ", project_dir,
          "; keeping current working directory ", getwd(),
          call. = FALSE)
}
getwd()


# Imports
library(caret)
library(ggplot2)
library(gridExtra)
library(dplyr)
library(tidyr)
library(data.table)
library(car)
library(caTools)
library(corrplot)
library(rpart)
library(rpart.plot)

# Use a compact default size for plots rendered through repr
options(
  repr.plot.width = 8,
  repr.plot.height = 4
)

# Center every plot title horizontally
centered_title <- element_text(hjust = 0.5)
theme_update(plot.title = centered_title)

Warning message:
"package 'caret' was built under R version 3.6.3"Loading required package: lattice
Loading required package: ggplot2
Registered S3 methods overwritten by 'ggplot2':
  method         from 
  [.quosures     rlang
  c.quosures     rlang
  print.quosures rlang
Warning message:
"package 'gridExtra' was built under R version 3.6.3"Warning message:
"package 'dplyr' was built under R version 3.6.3"
Attaching package: 'dplyr'

The following object is masked from 'package:gridExtra':

    combine

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

Warning message:
"package 'tidyr' was built under R version 3.6.3"
Attaching package: 'data.table'

The following objects are masked from 'package:dplyr':

    between, first, last

Warning message:
"package 'car' was built under R version 3.6.3"Loading required package: carData
Warning message:
"package 'carData' was built under R version 3.6.3"
Attaching package: 'car'

The following object is masked from 'package:dplyr':

    recode

Warning message:
"package 'caTools' was built under R version 3.6.3"Warning message:
"package 'corrplot' was built under R version 3.6.3"corrplot 0.84 loaded
Warning message:
"package 'rpart.plot' was built under R version 3.6.3"


# Read the HR dataset from CSV and report its dimensions
dataset_path <- "data/dataset.csv"
data_hr <- fread(dataset_path)
dim(data_hr)


# Preview the first rows
head(data_hr)


# Inspect the type of every column
str(data_hr)

Classes 'data.table' and 'data.frame':	23058 obs. of  30 variables:
 $ Age                     : int  41 37 41 37 37 37 41 41 41 41 ...
 $ Attrition               : chr  "Voluntary Resignation" "Voluntary Resignation" "Voluntary Resignation" "Voluntary Resignation" ...
 $ BusinessTravel          : chr  "Travel_Rarely" "Travel_Rarely" "Travel_Rarely" "Travel_Rarely" ...
 $ Department              : chr  "Sales" "Human Resources" "Sales" "Human Resources" ...
 $ DistanceFromHome        : int  1 6 1 6 6 6 1 1 1 1 ...
 $ Education               : int  2 4 2 4 4 4 2 2 2 2 ...
 $ EducationField          : chr  "Life Sciences" "Human Resources" "Life Sciences" "Marketing" ...
 $ EnvironmentSatisfaction : int  2 1 2 1 1 1 2 2 2 4 ...
 $ Gender                  : chr  "Female" "Female" "Female" "Female" ...
 $ JobInvolvement          : int  3 3 3 3 3 3 3 3 3 3 ...
 $ JobLevel                : int  2 2 2 2 2 2 2 2 2 4 ...
 $ JobRole                 : chr  "Sales Executive" "Sales Executive" "Sales Executive" "Sales Executive" ...
 $ JobSatisfaction         : int  4 4 4 4 4 4 4 4 4 3 ...
 $ MaritalStatus           : chr  "Single" "Single" "Single" "Single" ...
 $ MonthlyIncome           : int  5993 5993 5993 5993 5993 5993 5993 5993 5993 14756 ...
 $ NumCompaniesWorked      : int  8 8 4 5 8 5 8 4 8 2 ...
 $ OverTime                : chr  "Yes" "Yes" "Yes" "Yes" ...
 $ PercentSalaryHike       : int  11 11 11 11 11 11 11 11 11 14 ...
 $ PerformanceRating       : int  3 4 3 3 3 3 3 3 3 3 ...
 $ RelationshipSatisfaction: int  1 1 1 1 1 1 1 1 1 3 ...
 $ StockOptionLevel        : int  0 0 0 0 0 0 0 0 0 3 ...
 $ TotalWorkingYears       : int  8 8 8 8 8 8 8 8 8 21 ...
 $ TrainingTimesLastYear   : int  0 0 0 0 0 0 0 0 0 2 ...
 $ WorkLifeBalance         : int  1 1 1 1 1 1 1 1 1 3 ...
 $ YearsAtCompany          : int  6 6 6 6 6 6 6 6 6 5 ...
 $ YearsInCurrentRole      : int  4 4 4 4 4 4 4 4 4 0 ...
 $ YearsSinceLastPromotion : int  0 0 0 0 0 0 0 0 0 0 ...
 $ YearsWithCurrManager    : int  5 5 5 5 5 5 5 5 5 2 ...
 $ Employee Source         : chr  "Referral" "Referral" "Referral" "Referral" ...
 $ AgeStartedWorking       : int  33 29 33 29 29 29 33 33 33 20 ...
 - attr(*, ".internal.selfref")=<externalptr>


# Basic statistics on the variables: quantiles and mean for numeric columns,
# length/class/mode for character columns
summary(data_hr)

      Age         Attrition         BusinessTravel      Department       
 Min.   :18.00   Length:23058       Length:23058       Length:23058      
 1st Qu.:30.00   Class :character   Class :character   Class :character  
 Median :36.00   Mode  :character   Mode  :character   Mode  :character  
 Mean   :37.04                                                           
 3rd Qu.:43.00                                                           
 Max.   :60.00                                                           
 DistanceFromHome   Education     EducationField     EnvironmentSatisfaction
 Min.   : 1.000   Min.   :1.000   Length:23058       Min.   :1.00           
 1st Qu.: 2.000   1st Qu.:2.000   Class :character   1st Qu.:2.00           
 Median : 7.000   Median :3.000   Mode  :character   Median :3.00           
 Mean   : 9.215   Mean   :2.915                      Mean   :2.72           
 3rd Qu.:14.000   3rd Qu.:4.000                      3rd Qu.:4.00           
 Max.   :29.000   Max.   :5.000                      Max.   :4.00           
    Gender          JobInvolvement    JobLevel       JobRole         
 Length:23058       Min.   :1.00   Min.   :1.000   Length:23058      
 Class :character   1st Qu.:2.00   1st Qu.:1.000   Class :character  
 Mode  :character   Median :3.00   Median :2.000   Mode  :character  
                    Mean   :2.73   Mean   :2.044                     
                    3rd Qu.:3.00   3rd Qu.:3.000                     
                    Max.   :4.00   Max.   :5.000                     
 JobSatisfaction MaritalStatus      MonthlyIncome   NumCompaniesWorked
 Min.   :1.000   Length:23058       Min.   : 1009   Min.   :0.000     
 1st Qu.:2.000   Class :character   1st Qu.: 2900   1st Qu.:1.000     
 Median :3.000   Mode  :character   Median : 4898   Median :2.000     
 Mean   :2.725                      Mean   : 6416   Mean   :2.691     
 3rd Qu.:4.000                      3rd Qu.: 8120   3rd Qu.:4.000     
 Max.   :4.000                      Max.   :19999   Max.   :9.000     
   OverTime         PercentSalaryHike PerformanceRating
 Length:23058       Min.   :11.00     Min.   :3.000    
 Class :character   1st Qu.:12.00     1st Qu.:3.000    
 Mode  :character   Median :14.00     Median :3.000    
                    Mean   :15.22     Mean   :3.155    
                    3rd Qu.:18.00     3rd Qu.:3.000    
                    Max.   :25.00     Max.   :4.000    
 RelationshipSatisfaction StockOptionLevel TotalWorkingYears
 Min.   :1.000            Min.   :0.0000   Min.   : 0.00    
 1st Qu.:2.000            1st Qu.:0.0000   1st Qu.: 6.00    
 Median :3.000            Median :1.0000   Median :10.00    
 Mean   :2.713            Mean   :0.7944   Mean   :11.07    
 3rd Qu.:4.000            3rd Qu.:1.0000   3rd Qu.:15.00    
 Max.   :4.000            Max.   :3.0000   Max.   :40.00    
 TrainingTimesLastYear WorkLifeBalance YearsAtCompany  YearsInCurrentRole
 Min.   :0.000         Min.   :1.000   Min.   : 0.00   Min.   : 0.000    
 1st Qu.:2.000         1st Qu.:2.000   1st Qu.: 3.00   1st Qu.: 2.000    
 Median :3.000         Median :3.000   Median : 5.00   Median : 3.000    
 Mean   :2.804         Mean   :2.762   Mean   : 6.91   Mean   : 4.201    
 3rd Qu.:3.000         3rd Qu.:3.000   3rd Qu.: 9.00   3rd Qu.: 7.000    
 Max.   :6.000         Max.   :4.000   Max.   :40.00   Max.   :18.000    
 YearsSinceLastPromotion YearsWithCurrManager Employee Source   
 Min.   : 0.000          Min.   : 0.000       Length:23058      
 1st Qu.: 0.000          1st Qu.: 2.000       Class :character  
 Median : 1.000          Median : 3.000       Mode  :character  
 Mean   : 2.164          Mean   : 4.091                         
 3rd Qu.: 3.000          3rd Qu.: 7.000                         
 Max.   :15.000          Max.   :17.000                         
 AgeStartedWorking
 Min.   : 0.00    
 1st Qu.:20.00    
 Median :25.00    
 Mean   :25.96    
 3rd Qu.:31.00    
 Max.   :60.00


# Categorical and rating-scale columns are stored as factors so that plots and
# models treat them as discrete levels rather than text or numbers
factor_columns <- c(
  "Attrition", "BusinessTravel", "Department", "Education", "EducationField",
  "Employee Source", "EnvironmentSatisfaction", "Gender", "JobInvolvement",
  "JobLevel", "JobRole", "JobSatisfaction", "MaritalStatus", "OverTime",
  "PerformanceRating", "RelationshipSatisfaction", "StockOptionLevel",
  "WorkLifeBalance"
)
data_hr[, (factor_columns) := lapply(.SD, as.factor), .SDcols = factor_columns]


# Numeric measures that must be integer-typed
integer_columns <- c("DistanceFromHome", "MonthlyIncome", "PercentSalaryHike")
data_hr[, (integer_columns) := lapply(.SD, as.integer), .SDcols = integer_columns]


# Discard factor levels that no row uses
data_hr <- droplevels(data_hr)


# Confirm the resulting column types
str(data_hr)

Classes 'data.table' and 'data.frame':	23058 obs. of  30 variables:
 $ Age                     : int  41 37 41 37 37 37 41 41 41 41 ...
 $ Attrition               : Factor w/ 3 levels "Current employee",..: 3 3 3 3 3 3 3 3 3 3 ...
 $ BusinessTravel          : Factor w/ 3 levels "Non-Travel","Travel_Frequently",..: 3 3 3 3 3 3 3 3 3 3 ...
 $ Department              : Factor w/ 3 levels "Human Resources",..: 3 1 3 1 1 1 3 3 3 3 ...
 $ DistanceFromHome        : int  1 6 1 6 6 6 1 1 1 1 ...
 $ Education               : Factor w/ 5 levels "1","2","3","4",..: 2 4 2 4 4 4 2 2 2 2 ...
 $ EducationField          : Factor w/ 6 levels "Human Resources",..: 2 1 2 3 1 3 2 2 2 2 ...
 $ EnvironmentSatisfaction : Factor w/ 4 levels "1","2","3","4": 2 1 2 1 1 1 2 2 2 4 ...
 $ Gender                  : Factor w/ 2 levels "Female","Male": 1 1 1 1 1 1 1 1 1 1 ...
 $ JobInvolvement          : Factor w/ 4 levels "1","2","3","4": 3 3 3 3 3 3 3 3 3 3 ...
 $ JobLevel                : Factor w/ 5 levels "1","2","3","4",..: 2 2 2 2 2 2 2 2 2 4 ...
 $ JobRole                 : Factor w/ 9 levels "Healthcare Representative",..: 8 8 8 8 8 8 8 8 8 4 ...
 $ JobSatisfaction         : Factor w/ 4 levels "1","2","3","4": 4 4 4 4 4 4 4 4 4 3 ...
 $ MaritalStatus           : Factor w/ 3 levels "Divorced","Married",..: 3 3 3 3 3 3 3 3 3 1 ...
 $ MonthlyIncome           : int  5993 5993 5993 5993 5993 5993 5993 5993 5993 14756 ...
 $ NumCompaniesWorked      : int  8 8 4 5 8 5 8 4 8 2 ...
 $ OverTime                : Factor w/ 2 levels "No","Yes": 2 2 2 2 2 2 2 2 2 2 ...
 $ PercentSalaryHike       : int  11 11 11 11 11 11 11 11 11 14 ...
 $ PerformanceRating       : Factor w/ 2 levels "3","4": 1 2 1 1 1 1 1 1 1 1 ...
 $ RelationshipSatisfaction: Factor w/ 4 levels "1","2","3","4": 1 1 1 1 1 1 1 1 1 3 ...
 $ StockOptionLevel        : Factor w/ 4 levels "0","1","2","3": 1 1 1 1 1 1 1 1 1 4 ...
 $ TotalWorkingYears       : int  8 8 8 8 8 8 8 8 8 21 ...
 $ TrainingTimesLastYear   : int  0 0 0 0 0 0 0 0 0 2 ...
 $ WorkLifeBalance         : Factor w/ 4 levels "1","2","3","4": 1 1 1 1 1 1 1 1 1 3 ...
 $ YearsAtCompany          : int  6 6 6 6 6 6 6 6 6 5 ...
 $ YearsInCurrentRole      : int  4 4 4 4 4 4 4 4 4 0 ...
 $ YearsSinceLastPromotion : int  0 0 0 0 0 0 0 0 0 0 ...
 $ YearsWithCurrManager    : int  5 5 5 5 5 5 5 5 5 2 ...
 $ Employee Source         : Factor w/ 9 levels "Adzuna","Company Website",..: 8 8 8 8 8 8 8 8 8 2 ...
 $ AgeStartedWorking       : int  33 29 33 29 29 29 33 33 33 20 ...
 - attr(*, ".internal.selfref")=<externalptr>


# Remove the space from the `Employee Source` column name so it can be used
# unquoted in formulas and aesthetics
is_employee_source <- names(data_hr) == "Employee Source"
names(data_hr)[is_employee_source] <- "EmployeeSource"


# Years worked before joining this company: total career length minus tenure here
data_hr$PriorYearsOfExperience <- with(data_hr, TotalWorkingYears - YearsAtCompany)
dim(data_hr)


# Sanity check on the new column
summary(data_hr$PriorYearsOfExperience)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  0.000   0.000   2.000   4.165   5.000  40.000


# Average tenure per previous employer: prior experience divided by the number
# of companies worked for. A zero divisor yields Inf (or NaN for 0 / 0).
data_hr$AverageTenure <- with(data_hr, PriorYearsOfExperience / NumCompaniesWorked)
dim(data_hr)


# Sanity check on the new column
summary(data_hr$AverageTenure)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
      0       0       1     Inf       4     Inf     372


# Replace every non-finite AverageTenure (Inf, NaN, NA) with 0
tenure_not_finite <- !is.finite(data_hr$AverageTenure)
data_hr$AverageTenure[tenure_not_finite] <- 0
summary(data_hr$AverageTenure)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
 0.0000  0.0000  0.3333  1.7725  1.5000 40.0000


# Build a copy of the dataset without terminations.
# This project is less about building ML models and more about which factors
# lead people to resign; termination records do not help with that question and
# could distort the analysis, so they are set apart here.
data_hr_1 <- droplevels(data_hr[Attrition != "Termination"])
dim(data_hr_1)


# Tilt x-axis labels so long category names stay readable
tilted_x_labels <- theme(axis.text.x = element_text(angle = 30, vjust = 0.7))


# Headcount per job role
ggplot(data_hr, aes(x = JobRole)) +
  geom_bar() +
  tilted_x_labels


# Headcount per employee source
ggplot(data_hr, aes(x = EmployeeSource)) +
  geom_bar() +
  tilted_x_labels


# Headcount per education level (1 is the lowest), one panel per education field
ggplot(data_hr, aes(x = Education)) +
  geom_bar() +
  facet_grid(. ~ EducationField)


# Variables with only a few categories fit together in one 2 x 2 grid
p1 <- ggplot(data_hr, aes(x = Gender)) + geom_bar()
p2 <- ggplot(data_hr, aes(x = Attrition)) + geom_bar()
p3 <- ggplot(data_hr, aes(x = Department)) + geom_bar()
p4 <- ggplot(data_hr, aes(x = BusinessTravel)) + geom_bar()

# Lay the four charts out
grid.arrange(p1, p2, p3, p4, nrow = 2, ncol = 2)


# Probability Density Functions (PDFs) for Time-Based Data

# Semi-transparent gray density curve of data_hr for the given aesthetic mapping
gray_density <- function(mapping) {
  ggplot(data_hr) + geom_density(mapping, fill = "gray", alpha = 0.5)
}

p.TotalWorkingYears       <- gray_density(aes(TotalWorkingYears))
p.YearsAtCompany          <- gray_density(aes(YearsAtCompany))
p.YearsSinceLastPromotion <- gray_density(aes(YearsSinceLastPromotion))
p.YearsWithCurrManager    <- gray_density(aes(YearsWithCurrManager))
p.YearsInCurrentRole      <- gray_density(aes(YearsInCurrentRole))
p.PriorYearsOfExperience  <- gray_density(aes(PriorYearsOfExperience))

# Lay the six densities out in a 2 x 3 grid under a shared title
grid.arrange(
  p.TotalWorkingYears,
  p.YearsAtCompany,
  p.YearsSinceLastPromotion,
  p.YearsWithCurrManager,
  p.YearsInCurrentRole,
  p.PriorYearsOfExperience,
  nrow = 2,
  ncol = 3,
  top = "Probability Density Functions (PDFs) for Time-Based Data"
)


# Share of employees with fewer than X prior years of experience
prior_years <- data_hr$PriorYearsOfExperience

# Fraction of all employees whose prior experience is below `years`;
# missing values count in the denominator but never in the numerator
share_below <- function(years) {
  sum(prior_years < years, na.rm = TRUE) / length(prior_years)
}

less1  <- share_below(1)
less3  <- share_below(3)
less5  <- share_below(5)
less7  <- share_below(7)
less10 <- share_below(10)

cat('Share of employees with less than 1 year of experience: ', less1 * 100, '%', '\n')
cat('Share of employees with less than 3 years of experience: ', less3 * 100, '%', '\n')
cat('Share of employees with less than 5 years of experience: ', less5 * 100, '%', '\n')
cat('Share of employees with less than 7 years of experience: ', less7 * 100, '%', '\n')
cat('Share of employees with less than 10 years of experience: ', less10 * 100, '%', '\n')

Share of employees with less than 1 year of experience:  32.46596 % 
Share of employees with less than 3 years of experience:  58.28346 % 
Share of employees with less than 5 years of experience:  70.85177 % 
Share of employees with less than 7 years of experience:  79.52121 % 
Share of employees with less than 10 years of experience:  85.89644 %


# Age distribution of the workforce
ggplot(data_hr, aes(x = Age)) +
  geom_density(fill = "gray", alpha = 0.5) +
  ggtitle("Employee Age :: Probability Density Function (PDF)")


# Fraction of all employees younger than 30
employee_ages <- data_hr$Age
under30 <- sum(employee_ages < 30, na.rm = TRUE) / length(employee_ages)

cat('Share of employees under 30 years old: ', under30 * 100, '%', '\n')

Share of employees under 30 years old:  21.65409 %


# Headcount per education level
summary(data_hr$Education)


# Relative frequency of each education level, shown as percentages
ggplot(data_hr, aes(x = Education)) +
  geom_bar(aes(y = (..count..) / sum(..count..))) +
  scale_y_continuous(labels = scales::percent) +
  labs(
    y = "Relative Frequencies",
    title = "Share of Employees per Education Bracket"
  )


# Monthly income distribution within each job satisfaction level
# (rows without a job satisfaction value are left out)
job_satisfaction_known <- subset(data_hr, !is.na(JobSatisfaction))
ggplot(job_satisfaction_known, aes(JobSatisfaction, MonthlyIncome)) +
  geom_boxplot() +
  ggtitle("Boxplot of Monthly Income per Job Satisfaction Bracket")


# How well do years at the company correlate with other metrics?

# Correlation of YearsAtCompany with another column, using only rows where
# both values are present
cor_with_years_at_company <- function(other) {
  cor(data_hr$YearsAtCompany, other, use = "complete.obs")
}

yc1 <- cor_with_years_at_company(data_hr$TotalWorkingYears)
yc2 <- cor_with_years_at_company(data_hr$YearsInCurrentRole)
yc3 <- cor_with_years_at_company(data_hr$YearsSinceLastPromotion)
yc4 <- cor_with_years_at_company(data_hr$YearsWithCurrManager)
yc5 <- cor_with_years_at_company(data_hr$MonthlyIncome)

cat('Correlation between YearsAtCompany and TotalWorkingYears:       ', yc1, '\n')
cat('Correlation between YearsAtCompany and YearsInCurrentRole:      ', yc2, '\n')
cat('Correlation between YearsAtCompany and YearsSinceLastPromotion: ', yc3, '\n')
cat('Correlation between YearsAtCompany and YearsWithCurrManager:    ', yc4, '\n')
cat('Correlation between YearsAtCompany and MonthlyIncome:           ', yc5, '\n')

Correlation between YearsAtCompany and TotalWorkingYears:        0.624816 
Correlation between YearsAtCompany and YearsInCurrentRole:       0.7670497 
Correlation between YearsAtCompany and YearsSinceLastPromotion:  0.6236737 
Correlation between YearsAtCompany and YearsWithCurrManager:     0.7728072 
Correlation between YearsAtCompany and MonthlyIncome:            0.4981578


# Monthly income against tenure (left) and against total experience (right)
income_by_tenure <- ggplot(data_hr, aes(YearsAtCompany, MonthlyIncome)) +
  geom_point()
income_by_experience <- ggplot(data_hr, aes(TotalWorkingYears, MonthlyIncome)) +
  geom_point()
grid.arrange(income_by_tenure, income_by_experience, nrow = 1, ncol = 2)


# Monthly income distribution within each work-life balance level
work_life_known <- subset(data_hr, !is.na(WorkLifeBalance))
ggplot(work_life_known, aes(WorkLifeBalance, MonthlyIncome)) +
  geom_boxplot() +
  ggtitle("Boxplot of Monthly Income per Work-Life Balance Bracket")


# Monthly income distribution per gender, drawn horizontally without a legend
gender_known <- subset(data_hr, !is.na(Gender))
ggplot(gender_known, aes(Gender, MonthlyIncome, fill = Gender)) +
  geom_boxplot() +
  theme(legend.position = "none") +
  xlab("Gender") +
  ylab("Monthly Income") +
  ggtitle("Monthly Income per Gender") +
  coord_flip()


# Headcount per gender
count(data_hr, Gender, sort = FALSE)


# Stacked headcount of each gender within every job role
role_known <- subset(data_hr, !is.na(JobRole))
tbl <- table(JobRole = role_known$JobRole, Gender = role_known$Gender)
gender_by_role <- as.data.frame(tbl)
ggplot(gender_by_role, aes(factor(JobRole), Freq, fill = Gender)) +
  geom_col(position = 'stack') +
  theme(axis.text.x = element_text(angle = 30, vjust = 0.7)) +
  ggtitle("Gender per Role")


# The same breakdown as a table
count(data_hr, JobRole, Gender, sort = FALSE)


# Rows with a known job role, shared by the three boxplots below
role_known <- subset(data_hr, !is.na(JobRole))

# Tilt x-axis labels so long role names stay readable
tilted_x_labels <- theme(axis.text.x = element_text(angle = 30, vjust = 0.7))


# Monthly income distribution per role
ggplot(role_known, aes(JobRole, MonthlyIncome)) +
  geom_boxplot() +
  tilted_x_labels +
  ggtitle("Monthly Income per Role")


# Age distribution per role
ggplot(role_known, aes(JobRole, Age)) +
  geom_boxplot() +
  tilted_x_labels +
  ggtitle("Age per Role")


# Tenure distribution per role
ggplot(role_known, aes(JobRole, YearsAtCompany)) +
  geom_boxplot() +
  tilted_x_labels +
  ggtitle("Years at Company per Role")


# Education mix within each role, as proportions
# (only rows with no missing value in any column are used)
complete_rows <- na.omit(data_hr)
ggplot(complete_rows, aes(JobRole, fill = Education)) +
  geom_bar(position = "fill") +
  tilted_x_labels +
  labs(title = "Education Level per Role", y = "Proportion")


# Attrition mix per education level (terminations excluded), faceted two ways
attrition_by_education <- ggplot(data_hr_1, aes(x = Education, fill = Attrition)) +
  geom_bar(position = 'fill')


# One panel per department
attrition_by_education +
  facet_grid(. ~ Department) +
  ggtitle("Resignations per Department and Education")


# One panel per job role
attrition_by_education +
  facet_grid(. ~ JobRole) +
  ggtitle("Resignations per Job Role and Education")


# Proportional bar chart of data_hr_1 for the given mapping, titled
# "Resignations per <label>"
resignation_share_plot <- function(mapping, label) {
  ggplot(data_hr_1) +
    geom_bar(mapping, position = 'fill') +
    ggtitle(paste("Resignations per", label))
}


# Variables frequently considered during the hiring process
p1 <- resignation_share_plot(aes(x = Age, fill = Attrition), "Age")
p2 <- resignation_share_plot(aes(x = MaritalStatus, fill = Attrition), "MaritalStatus")
p3 <- resignation_share_plot(aes(x = Education, fill = Attrition), "Education")
p4 <- resignation_share_plot(aes(x = DistanceFromHome, fill = Attrition), "DistanceFromHome")
grid.arrange(p1, p2, p3, p4, nrow = 2, ncol = 2)


# Variables measuring the "status" of a current employee
p1 <- resignation_share_plot(aes(x = BusinessTravel, fill = Attrition), "BusinessTravel")
p2 <- resignation_share_plot(aes(x = TrainingTimesLastYear, fill = Attrition), "TrainingTimesLastYear")
p3 <- resignation_share_plot(aes(x = OverTime, fill = Attrition), "OverTime")
p4 <- resignation_share_plot(aes(x = StockOptionLevel, fill = Attrition), "StockOptionLevel")
grid.arrange(p1, p2, p3, p4, nrow = 2, ncol = 2)


# Co-occurrence of business travel and overtime
count(data_hr, BusinessTravel, OverTime, sort = FALSE)


# Variables measuring job satisfaction
p1 <- resignation_share_plot(aes(x = JobSatisfaction, fill = Attrition), "JobSatisfaction")
p2 <- resignation_share_plot(aes(x = JobInvolvement, fill = Attrition), "JobInvolvement")
p3 <- resignation_share_plot(aes(x = EnvironmentSatisfaction, fill = Attrition), "EnvironmentSatisfaction")
p4 <- resignation_share_plot(aes(x = WorkLifeBalance, fill = Attrition), "WorkLifeBalance")
grid.arrange(p1, p2, p3, p4, nrow = 2, ncol = 2)


# Remembering all variables: show the first three rows of the full dataset
head(data_hr, 3)


# Logistic Regression Model (GLM) using variables available at hiring time.
# Fitted on data_hr_1 (terminations excluded). With a factor response, a
# binomial glm treats the first factor level as failure and the rest as success.
# NOTE(review): that should make "Voluntary Resignation" the modelled outcome
# here - confirm with levels(data_hr_1$Attrition).
model_v1 <- glm(Attrition ~ Age + Department + DistanceFromHome + EmployeeSource + 
                   JobRole + MaritalStatus + AverageTenure + PriorYearsOfExperience + Gender + 
                   Education + EducationField, 
                 family = binomial, 
                 data = data_hr_1)
# Coefficient table with standard errors, z values and p-values
summary(model_v1)

Call:
glm(formula = Attrition ~ Age + Department + DistanceFromHome + 
    EmployeeSource + JobRole + MaritalStatus + AverageTenure + 
    PriorYearsOfExperience + Gender + Education + EducationField, 
    family = binomial, data = data_hr_1)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.4484  -0.6177  -0.4918  -0.3558   2.7300  

Coefficients:
                                  Estimate Std. Error z value Pr(>|z|)    
(Intercept)                      -0.499751   0.199492  -2.505 0.012241 *  
Age                              -0.044889   0.002446 -18.348  < 2e-16 ***
DepartmentResearch & Development -0.427955   0.103053  -4.153 3.28e-05 ***
DepartmentSales                   0.025684   0.106499   0.241 0.809423    
DistanceFromHome                  0.020372   0.002522   8.076 6.69e-16 ***
EmployeeSourceCompany Website     0.183335   0.074868   2.449 0.014334 *  
EmployeeSourceGlassDoor           0.006274   0.089680   0.070 0.944229    
EmployeeSourceIndeed             -0.080908   0.089734  -0.902 0.367244    
EmployeeSourceJora                0.183678   0.084958   2.162 0.030618 *  
EmployeeSourceLinkedIn           -0.079145   0.090405  -0.875 0.381325    
EmployeeSourceRecruit.net        -0.050665   0.089444  -0.566 0.571095    
EmployeeSourceReferral            0.230121   0.147168   1.564 0.117897    
EmployeeSourceSeek               -0.005837   0.079828  -0.073 0.941708    
JobRoleHuman Resources            0.107348   0.125753   0.854 0.393302    
JobRoleLaboratory Technician      0.314968   0.080707   3.903 9.52e-05 ***
JobRoleManager                   -0.402633   0.123788  -3.253 0.001144 ** 
JobRoleManufacturing Director    -0.083426   0.095273  -0.876 0.381221    
JobRoleResearch Director         -0.292195   0.126243  -2.315 0.020637 *  
JobRoleResearch Scientist         0.111877   0.079359   1.410 0.158608    
JobRoleSales Executive           -0.028140   0.079873  -0.352 0.724611    
JobRoleSales Representative       0.478077   0.096067   4.977 6.47e-07 ***
MaritalStatusMarried              0.176289   0.053865   3.273 0.001065 ** 
MaritalStatusSingle               0.747383   0.053896  13.867  < 2e-16 ***
AverageTenure                    -0.021245   0.009467  -2.244 0.024825 *  
PriorYearsOfExperience            0.019787   0.005399   3.665 0.000248 ***
GenderMale                        0.030982   0.038752   0.800 0.424000    
Education2                        0.067584   0.069195   0.977 0.328712    
Education3                        0.092553   0.061236   1.511 0.130684    
Education4                        0.071013   0.066760   1.064 0.287461    
Education5                       -0.233758   0.134267  -1.741 0.081685 .  
EducationFieldLife Sciences      -0.148858   0.143810  -1.035 0.300620    
EducationFieldMarketing          -0.106268   0.152995  -0.695 0.487317    
EducationFieldMedical            -0.202212   0.145203  -1.393 0.163736    
EducationFieldOther              -0.137807   0.161652  -0.852 0.393940    
EducationFieldTechnical Degree    0.180977   0.154552   1.171 0.241608    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 19951  on 22970  degrees of freedom
Residual deviance: 18626  on 22936  degrees of freedom
AIC: 18696

Number of Fisher Scoring iterations: 5


# Removing values with a high p-value (Education and EducationField).
# Same response, family and data as model_v1; only those two predictors are
# dropped from the formula.
model_v2 <- glm(Attrition ~ Age + Department + DistanceFromHome + EmployeeSource + 
                   JobRole + MaritalStatus + AverageTenure + PriorYearsOfExperience + Gender, 
                 family = binomial, 
                 data = data_hr_1)
# Coefficient table for the reduced model
summary(model_v2)

Call:
glm(formula = Attrition ~ Age + Department + DistanceFromHome + 
    EmployeeSource + JobRole + MaritalStatus + AverageTenure + 
    PriorYearsOfExperience + Gender, family = binomial, data = data_hr_1)

Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-1.3428  -0.6201  -0.4941  -0.3619   2.7143  

Coefficients:
                                  Estimate Std. Error z value Pr(>|z|)    
(Intercept)                      -0.594443   0.163302  -3.640 0.000272 ***
Age                              -0.044338   0.002361 -18.781  < 2e-16 ***
DepartmentResearch & Development -0.455831   0.097648  -4.668 3.04e-06 ***
DepartmentSales                   0.006375   0.100798   0.063 0.949567    
DistanceFromHome                  0.023945   0.002219  10.792  < 2e-16 ***
EmployeeSourceCompany Website     0.185836   0.074684   2.488 0.012835 *  
EmployeeSourceGlassDoor           0.004131   0.089469   0.046 0.963174    
EmployeeSourceIndeed             -0.084488   0.089587  -0.943 0.345638    
EmployeeSourceJora                0.182141   0.084629   2.152 0.031378 *  
EmployeeSourceLinkedIn           -0.073833   0.090249  -0.818 0.413300    
EmployeeSourceRecruit.net        -0.058670   0.089241  -0.657 0.510903    
EmployeeSourceReferral            0.237922   0.146800   1.621 0.105078    
EmployeeSourceSeek               -0.006818   0.079571  -0.086 0.931717    
JobRoleHuman Resources            0.099083   0.125594   0.789 0.430163    
JobRoleLaboratory Technician      0.312339   0.080556   3.877 0.000106 ***
JobRoleManager                   -0.418085   0.123665  -3.381 0.000723 ***
JobRoleManufacturing Director    -0.079696   0.095061  -0.838 0.401826    
JobRoleResearch Director         -0.308958   0.126075  -2.451 0.014263 *  
JobRoleResearch Scientist         0.119993   0.079265   1.514 0.130071    
JobRoleSales Executive           -0.023432   0.079774  -0.294 0.768961    
JobRoleSales Representative       0.483836   0.095952   5.042 4.60e-07 ***
MaritalStatusMarried              0.176480   0.053793   3.281 0.001035 ** 
MaritalStatusSingle               0.747665   0.053772  13.904  < 2e-16 ***
AverageTenure                    -0.019906   0.009465  -2.103 0.035453 *  
PriorYearsOfExperience            0.019187   0.005400   3.553 0.000381 ***
GenderMale                        0.033764   0.038690   0.873 0.382838    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 19951  on 22970  degrees of freedom
Residual deviance: 18668  on 22945  degrees of freedom
AIC: 18720

Number of Fisher Scoring iterations: 5


# Packages for the gradient-boosting section.
# library() stops immediately when a package is missing, whereas require()
# only returns FALSE and lets the script fail later with a less obvious error.
library(xgboost)
library(Matrix)
library(data.table)
library(correlationfunnel)

# vcd is installed on demand and then attached. Previously a fresh install
# left the package unattached for the rest of the session.
if (!requireNamespace("vcd", quietly = TRUE)) {
  install.packages("vcd")
}
library(vcd)

Loading required package: xgboost
Warning message:
"package 'xgboost' was built under R version 3.6.3"
Attaching package: 'xgboost'

The following object is masked from 'package:dplyr':

    slice

Loading required package: Matrix

Attaching package: 'Matrix'

The following objects are masked from 'package:tidyr':

    expand, pack, unpack

Warning message:
"package 'correlationfunnel' was built under R version 3.6.3"== Using correlationfunnel? ====================================================
You might also be interested in applied data science training for business.
</> Learn more at - www.business-science.io </>
Loading required package: vcd
Warning message:
"package 'vcd' was built under R version 3.6.3"Loading required package: grid


# Create a binary target variable (required by XGBoost)

# Binarize the Attrition column into 0/1 indicator columns
tmp <- data_hr_1 %>%
  select(Attrition) %>%
  binarize()

# Select the binary mapping where CurrentEmployee==0 and VoluntaryResignation==1.
# `[[` extracts the second indicator as a plain vector (`[` would hand over a
# one-column table), and as.integer() strips the attributes it carries.
# The cast is applied to data_hr_1: the previous code cast data_hr$Resigned,
# a column that does not exist on the unfiltered table.
data_hr_1$Resigned <- as.integer(tmp[[2]])

# Guard against picking the wrong indicator column: the target must be 1
# exactly for the voluntary resignations
stopifnot(identical(
  data_hr_1$Resigned,
  as.integer(data_hr_1$Attrition == "Voluntary Resignation")
))

# Check result
str(data_hr_1$Resigned)

 num [1:22971] 1 1 1 1 1 1 1 1 1 1 ...
 - attr(*, "assign")= int [1:2] 1 1
 - attr(*, "contrasts")=List of 1
  ..$ Attrition: chr "contr.treatment"


# Tally the observations in each target class, in class order
count(data_hr_1, Resigned, sort = FALSE)


# Build the one-hot encoded sparse design matrix needed for XGBoost,
# restricted to the same variables as in model_v1
xgb_formula <- Resigned ~ Age + Department + DistanceFromHome +
  EmployeeSource + JobRole + MaritalStatus + AverageTenure +
  PriorYearsOfExperience + Gender + Education + EducationField

sparse_matrix <- sparse.model.matrix(xgb_formula, data = data_hr_1)
head(sparse_matrix)

   [[ suppressing 35 column names '(Intercept)', 'Age', 'DepartmentResearch & Development' ... ]]

6 x 35 sparse Matrix of class "dgCMatrix"
                                                                           
1 1 41 . 1 1 . . . . . . 1 . . . . . . . 1 . . 1 0.25 2 . 1 . . . 1 . . . .
2 1 37 . . 6 . . . . . . 1 . . . . . . . 1 . . 1 0.25 2 . . . 1 . . . . . .
3 1 41 . 1 1 . . . . . . 1 . . . . . . . 1 . . 1 0.50 2 . 1 . . . 1 . . . .
4 1 37 . . 6 . . . . . . 1 . . . . . . . 1 . . 1 0.40 2 . . . 1 . . 1 . . .
5 1 37 . . 6 . . . . . . 1 . . . . . . . 1 . . 1 0.25 2 . . . 1 . . . . . .
6 1 37 . . 6 . . . . . . 1 . . . . . . . 1 . . 1 0.40 2 . . . 1 . . 1 . . .


# Pull the binarized target column out as a standalone label vector
output_vector <- data_hr_1[["Resigned"]]
head(output_vector)


# Fit the boosted-tree classifier on the sparse design matrix
bst <- xgboost(
  data = sparse_matrix,
  label = output_vector,
  objective = "binary:logistic",
  eval_metric = "logloss",
  nrounds = 100,
  max.depth = 6,
  eta = 0.3,
  nthread = -1,
  verbose = 0
)


# Compute the feature importance table, labelled with the design-matrix
# column names
importance <- xgb.importance(
  model = bst,
  feature_names = colnames(sparse_matrix)
)
head(importance)


# Plot the ten highest-ranked features
xgb.plot.importance(
  importance,
  top_n = 10,
  main = "Top 10 Most Important Features"
)


# Run Pearson's chi-squared test of independence between each highly ranked
# variable and the target, to check that the variable is associated with
# resignation.
# A p-value below 0.05 indicates a statistically significant association.
# The chi-squared approximation is only reliable when the expected cell counts
# are large enough (rule of thumb: at least 5); R emits the warning
# "Chi-squared approximation may be incorrect" when they are not, so treat the
# p-values of variables that trigger it with caution.
chi_sq_vars <- c(
  "Age",
  "DistanceFromHome",
  "PriorYearsOfExperience",
  "AverageTenure",
  "Education",
  "Department",
  "MaritalStatus"
)

for (var_name in chi_sq_vars) {
  c2 <- chisq.test(data_hr_1[[var_name]], output_vector)
  # Label the printed result with the tested variable; without this the
  # header would show the loop expression instead of the column name.
  c2$data.name <- paste0("data_hr_1$", var_name, " and output_vector")
  print(c2)
}

	Pearson's Chi-squared test

data:  data_hr_1$Age and output_vector
X-squared = 1823.6, df = 42, p-value < 2.2e-16


	Pearson's Chi-squared test

data:  data_hr_1$DistanceFromHome and output_vector
X-squared = 564.63, df = 28, p-value < 2.2e-16

Warning message in chisq.test(data_hr_1$PriorYearsOfExperience, output_vector):
"Chi-squared approximation may be incorrect"

	Pearson's Chi-squared test

data:  data_hr_1$PriorYearsOfExperience and output_vector
X-squared = 211.03, df = 35, p-value < 2.2e-16

Warning message in chisq.test(data_hr_1$AverageTenure, output_vector):
"Chi-squared approximation may be incorrect"

	Pearson's Chi-squared test

data:  data_hr_1$AverageTenure and output_vector
X-squared = 565.52, df = 155, p-value < 2.2e-16


	Pearson's Chi-squared test

data:  data_hr_1$Education and output_vector
X-squared = 42.616, df = 4, p-value = 1.243e-08


	Pearson's Chi-squared test

data:  data_hr_1$Department and output_vector
X-squared = 167.9, df = 2, p-value < 2.2e-16


	Pearson's Chi-squared test

data:  data_hr_1$MaritalStatus and output_vector
X-squared = 380.58, df = 2, p-value < 2.2e-16

#	Education	EnvironmentSatisfaction	JobInvolvement	JobSatisfaction	PerformanceRating	RelationshipSatisfaction	WorkLifeBalance
1	Below College	Low	Low	Low	Low	Low	Bad
2	College	Medium	Medium	Medium	Good	Medium	Good
3	Bachelor	High	High	High	Excellent	High	Better
4	Master	Very High	Very High	Very High	Outstanding	Very High	Best
5	Doctor

#	Education	EnvironmentSatisfaction	JobInvolvement	JobSatisfaction	PerformanceRating	RelationshipSatisfaction	WorkLifeBalance
1	Below College	Low	Low	Low	Low	Low	Bad
2	College	Medium	Medium	Medium	Good	Medium	Good
3	Bachelor	High	High	High	Excellent	High	Better
4	Master	Very High	Very High	Very High	Outstanding	Very High	Best
5	Doctor

Feature	Gain	Cover	Frequency
Age	0.26544632	0.32700307	0.22757906
DistanceFromHome	0.22301070	0.17786409	0.19932608
PriorYearsOfExperience	0.09457971	0.11191857	0.10082945
AverageTenure	0.03782742	0.06216012	0.06220840
Education3	0.03614013	0.01336200	0.03421462
DepartmentResearch & Development	0.02853856	0.01305244	0.02980819

People Analytics: Evaluating Factors Predictive of Resignation¶

Problem Statement¶

Define Directory¶

Packages¶

Dataset¶

Data Cleaning¶

Feature Engineering¶

Exploratory Analysis¶

Prior Work Experience & Age¶

Education¶

Monthly Income x Job Satisfaction¶

Years at Company¶

Monthly Income¶

Monthly Income x Work Life Balance¶

Gender Pay Gap¶

Gender Breakdown¶

Job Role¶

Resignations¶

Predictive Modeling¶

Logistic Regression¶

Gradient Boosting with XGBoost¶

Conclusion¶

End¶

Age	Attrition	BusinessTravel	Department	DistanceFromHome	Education	EducationField	EnvironmentSatisfaction	Gender	JobInvolvement	...	TotalWorkingYears	WorkLifeBalance	YearsAtCompany	YearsInCurrentRole	YearsWithCurrManager	Employee Source	AgeStartedWorking
41	Voluntary Resignation	Travel_Rarely	Sales	1	2	Life Sciences	2	Female	3	...	8	1	6	4	5	Referral	33
37	Voluntary Resignation	Travel_Rarely	Human Resources	6	4	Human Resources	1	Female	3	...	8	1	6	4	5	Referral	29
41	Voluntary Resignation	Travel_Rarely	Sales	1	2	Life Sciences	2	Female	3	...	8	1	6	4	5	Referral	33
37	Voluntary Resignation	Travel_Rarely	Human Resources	6	4	Marketing	1	Female	3	...	8	1	6	4	5	Referral	29
37	Voluntary Resignation	Travel_Rarely	Human Resources	6	4	Human Resources	1	Female	3	...	8	1	6	4	5	Referral	29
37	Voluntary Resignation	Travel_Rarely	Human Resources	6	4	Marketing	1	Female	3	...	8	1	6	4	5	Referral	29

JobRole	Gender	n
Healthcare Representative	Female	812
Healthcare Representative	Male	1257
Human Resources	Female	256
Human Resources	Male	577
Laboratory Technician	Female	1355
Laboratory Technician	Male	2757
Manager	Female	696
Manager	Male	825
Manufacturing Director	Female	1167
Manufacturing Director	Male	1179
Research Director	Female	503
Research Director	Male	709
Research Scientist	Female	1780
Research Scientist	Male	2811
Sales Executive	Female	2046
Sales Executive	Male	3021
Sales Representative	Female	590
Sales Representative	Male	717

BusinessTravel	OverTime	n
Non-Travel	No	1796
Non-Travel	Yes	548
Travel_Frequently	No	3054
Travel_Frequently	Yes	1324
Travel_Rarely	No	11674
Travel_Rarely	Yes	4662

Resigned	n
0	19370
1	3601