In this project I'll be building and analysing KPIs from a retail store dataset. The analysis contains the follow eight KPIs:
# Imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import plotly
import matplotlib.pyplot as plt
import plotly.offline as pyoff
import plotly.graph_objs as go
from datetime import datetime, timedelta
%matplotlib inline
pyoff.init_notebook_mode()
Exploratory Analysis
data = pd.read_csv('data/dataset.csv', header=0, encoding = 'unicode_escape')
data.head()
Note: Each row (register) represents one item from a purchase. Thus the PurchaseId column will have repeating values indicating that a single purchase contained multiple items.
# Shape
data.shape
# Data types
data.dtypes
# Describe
data.describe()
# Check for missing values
data.isna().sum()
# Range of dates contained in the dataset
# (This retuns errouneous information because the data type for the date is incorrect)
print('Minimum Date:', data['PurchaseDate'].min())
print('Maximum Date:', data['PurchaseDate'].max())
# Convert the PurchaseDate column to datetime format
data.PurchaseDate = pd.to_datetime(data.PurchaseDate)
data.dtypes
# Range of dates contained in the dataset
print('Minimum Date:', data['PurchaseDate'].min())
print('Maximum Date:', data['PurchaseDate'].max())
There is only partial data for the last month.
# Countries for which there were purchases
data['Country'].sort_values().unique()
Revenue = Quantity * Unit_Price
# Extract the year and month of each purchase
data['YearMonth'] = data['PurchaseDate'].map(lambda date: 100 * date.year + date.month)
# Calculate the revenue per item in each purchase
data['Revenue'] = data['Quantity'] * data['UnitPrice']
# Visualize the data
data.head(1)
# Group the revenue per month/year
df_revenue = data.groupby(['YearMonth']).agg({'Revenue': sum}).reset_index()
df_revenue
Visualizing KPI 1
# Define plot data
plot_data = [go.Bar(x = df_revenue['YearMonth'],
y = df_revenue['Revenue'])]
# Layout
plot_layout = go.Layout(xaxis = {'type': 'category'},
title = 'Monthly Revenue')
# Plot the figure
fig = go.Figure(data = plot_data, layout = plot_layout)
pyoff.iplot(fig)
Growth Rate = Current Month's Revenue / Previous Month's Revenue
# Using the pct_change() method to calculate the monthly growth rate
df_revenue['GrowthRate'] = df_revenue['Revenue'].pct_change()
df_revenue
Visualizing KPI 2
Won't be considering the last month (2011/12) since there is only partial sales data for that month, which results in an incorrect growth rate measument.
# Define plot data
plot_data = [go.Scatter(x = df_revenue.query('YearMonth < 201112')['YearMonth'],
y = df_revenue.query('YearMonth < 201112')['GrowthRate'])]
# Layout
plot_layout = go.Layout(xaxis = {'type': 'category'},
yaxis = {'tickformat': '%'},
title = 'Revenue Growth Rate')
# Plot the figure
fig = go.Figure(data = plot_data, layout = plot_layout)
pyoff.iplot(fig)
Defining active clients as those clients who made at least one purchase in a given month.
# Create a dataframe containing only data from Brazil
data_brazil = data.query("Country=='Brazil'").reset_index(drop=True)
# On each month, consider as active only those users to purchased on the current month
df_active_month = data_brazil.groupby('YearMonth')['CustomerId'].nunique().reset_index()
df_active_month
Visualizing KPI 3
# Define plot data
plot_data = [go.Scatter(x = df_active_month['YearMonth'],
y = df_active_month['CustomerId'])]
# Layout
plot_layout = go.Layout(xaxis = {'type': 'category'},
title = 'Monthly Active Customers in a Country (Brazil)')
# Plot the figure
fig = go.Figure(data = plot_data, layout = plot_layout)
pyoff.iplot(fig)
Note that there is only partial data for the last month, which is why such a large drop is observed on the plot.
# Group the data to tally the number of items sold per month in Brazil
df_items_month = data_brazil.groupby('YearMonth')['Quantity'].sum().reset_index()
df_items_month
Visualizing KPI 4
# Define plot data
plot_data = [go.Bar(x = df_items_month['YearMonth'],
y = df_items_month['Quantity'])]
# Layout
plot_layout = go.Layout(xaxis = {'type': 'category'},
title = 'Monthly Item Sales in a Country (Brazil)')
# Plot the figure
fig = go.Figure(data = plot_data, layout = plot_layout)
pyoff.iplot(fig)
# Calculate the average item sale price per month
df_revenue_avg = data_brazil.groupby('YearMonth')['Revenue'].mean().reset_index()
df_revenue_avg
Visualizing KPI 5
# Define plot data
plot_data = [go.Bar(x = df_revenue_avg['YearMonth'],
y = df_revenue_avg['Revenue'])]
# Layout
plot_layout = go.Layout(xaxis = {'type': 'category'},
title = 'Monthly Average Item Sale Price')
# Plot the figure
fig = go.Figure(data = plot_data, layout = plot_layout)
pyoff.iplot(fig)
To obtain this KPI some creativity is needed, as it's necessary to define what constitutes a new vs old customer...
For each month I'll be considering as new customers those whose first purchase happened in the current month, while those whose first purchase happened in previous months will be considered old customers.
# Finding the earliest purchase date for each customer
df_purchase_earliest = data.groupby('CustomerId')['PurchaseDate'].min().reset_index()
df_purchase_earliest.columns = ['CustomerId', 'EarliestDate']
df_purchase_earliest.head(2)
# Extracting the month of the first purchase
df_purchase_earliest['FirstPurchase'] = df_purchase_earliest['EarliestDate'].map(lambda date: 100 * date.year + date.month)
df_purchase_earliest.head()
# Merge the dataframe containing the month of the first purchase into the base dataset
data_purchase = pd.merge(data, df_purchase_earliest, on = 'CustomerId')
data_purchase.head(3)
# Creating a new column for customer type and filling all values as "new"
data_purchase['CustomerType'] = 'New'
data_purchase['CustomerType'].value_counts()
# For each PurchaseId, if the purchase is happening in an year/month greater than the user's FirstPurchase
# then set CustomerType as "Old"
data_purchase.loc[data_purchase['YearMonth'] > data_purchase['FirstPurchase'], 'CustomerType'] = 'Old'
data_purchase['CustomerType'].value_counts()
# Now calculating the revenue per CustomerType and month
df_revenue_type_month = data_purchase.groupby(['YearMonth', 'CustomerType'])['Revenue'].sum().reset_index()
# Removing 2011/12 since it only has partial sales data
df_revenue_type_month = df_revenue_type_month.query("YearMonth != 201112 and YearMonth != 201012")
df_revenue_type_month
Visualizing KPI 6
# Define plot data
plot_data = [go.Scatter(x = df_revenue_type_month.query("CustomerType == 'Old'")['YearMonth'],
y = df_revenue_type_month.query("CustomerType == 'Old'")['Revenue'],
name = 'Old Customers'),
go.Scatter(x = df_revenue_type_month.query("CustomerType == 'New'")['YearMonth'],
y = df_revenue_type_month.query("CustomerType == 'New'")['Revenue'],
name = 'New Customers')]
# Layout
plot_layout = go.Layout(xaxis = {'type': 'category'},
title = 'Revenue Growth from New vs Old Customers')
# Plot the figure
fig = go.Figure(data = plot_data, layout = plot_layout)
pyoff.iplot(fig)
Since new and old customers have been defined, it's possible to now calculate the share of new customers each month.
# Calculate the rate of new customers in each month
df_new_customer_rate = data_purchase.query("CustomerType == 'New'").groupby('YearMonth')['CustomerId'].nunique() / data_purchase.groupby('YearMonth')['CustomerId'].nunique()
# Fix index and remove missing values
df_new_customer_rate = df_new_customer_rate.reset_index()
df_new_customer_rate = df_new_customer_rate.dropna()
df_new_customer_rate
Visualizing KPI 7
# Define plot data
plot_data = [go.Bar(x = df_new_customer_rate.query("YearMonth < 201112")['YearMonth'],
y = df_new_customer_rate.query("YearMonth < 201112")['CustomerId'])]
# Layout
plot_layout = go.Layout(xaxis = {'type': 'category'},
yaxis = {'tickformat': '%'},
title = 'New Customer Share')
# Plot the figure
fig = go.Figure(data = plot_data, layout = plot_layout)
pyoff.iplot(fig)
Retention Rate = Previous Month's Clients / Total Active Clients
# GroupĂng the data by customer and month and tallying the revenue
data_purchase_customers = data_purchase.groupby(['CustomerId', 'YearMonth'])['Revenue'].sum().reset_index()
data_purchase_customers.head()
# Defining retention with a crosstab
df_ret = pd.crosstab(data_purchase_customers['CustomerId'], data_purchase_customers['YearMonth'])
df_ret.head()
# Extracting all year/month pairs
months = df_ret.columns
months
# Loop to calculate retention over the months
# List to store the result
ret_list = []
# Loop
for i in range(len(months)-1):
retention_data = {}
previous_month = months[i]
current_month = months[i+1]
retention_data['YearMonth'] = int(current_month)
retention_data['CustomerTotal'] = df_ret[current_month].sum()
retention_data['TotalRetained'] = df_ret[(df_ret[current_month] > 0 ) & (df_ret[previous_month] > 0)][current_month].sum()
ret_list.append(retention_data)
ret_list
# Convert to dataframe
df_ret_final = pd.DataFrame(ret_list)
df_ret_final.head(2)
# Calculate the ratio to obtain the KPI
df_ret_final['RetentionRate'] = df_ret_final['TotalRetained'] / df_ret_final['CustomerTotal']
df_ret_final
Visualizing KPI 8
# Define plot data
plot_data = [go.Scatter(x = df_ret_final.query("YearMonth < 201112")['YearMonth'],
y = df_ret_final.query("YearMonth < 201112")['RetentionRate'],
name = 'Rate')]
# Layout
plot_layout = go.Layout(xaxis = {'type': 'category'},
yaxis = {'tickformat': '%'},
title = 'Customer Retention Rate')
# Plot the figure
fig = go.Figure(data = plot_data, layout = plot_layout)
pyoff.iplot(fig)