KPI Analytics¶

Matheus Schmitz

https://www.linkedin.com/in/matheusschmitz/

https://matheus-schmitz.github.io/

In this project I'll be building and analysing KPIs from a retail store dataset. The analysis contains the follow eight KPIs:

KPI 1: Monthly Revenues
KPI 2: Revenue Growth Rate
KPI 3: Monthly Active Customers in a Country
KPI 4: Monthly Item Sales in a Country
KPI 5: Monthly Average Item Sales Price
KPI 6: Revenue Growth from New vs Old Customers
KPI 7: New Customer Share
KPI 8: Customer Retention Rate

# Imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import plotly
import matplotlib.pyplot as plt
import plotly.offline as pyoff
import plotly.graph_objs as go
from datetime import datetime, timedelta
%matplotlib inline
pyoff.init_notebook_mode()

Exploratory Analysis

data = pd.read_csv('data/dataset.csv', header=0, encoding = 'unicode_escape')
data.head()

Note: Each row (register) represents one item from a purchase. Thus the PurchaseId column will have repeating values indicating that a single purchase contained multiple items.

# Shape
data.shape

(541800, 8)

# Data types
data.dtypes

PurchaseId       object
ProductCode      object
ProductName      object
Quantity          int64
PurchaseDate     object
UnitPrice       float64
CustomerId      float64
Country          object
dtype: object

# Describe
data.describe()

# Check for missing values
data.isna().sum()

PurchaseId           0
ProductCode          0
ProductName       1454
Quantity             0
PurchaseDate         0
UnitPrice            0
CustomerId      135075
Country              0
dtype: int64

# Range of dates contained in the dataset
# (This retuns errouneous information because the data type for the date is incorrect)
print('Minimum Date:', data['PurchaseDate'].min())
print('Maximum Date:', data['PurchaseDate'].max())

Minimum Date: 1/10/2011 10:04
Maximum Date: 9/9/2011 9:52

# Convert the PurchaseDate column to datetime format
data.PurchaseDate = pd.to_datetime(data.PurchaseDate)      
data.dtypes

PurchaseId              object
ProductCode             object
ProductName             object
Quantity                 int64
PurchaseDate    datetime64[ns]
UnitPrice              float64
CustomerId             float64
Country                 object
dtype: object

# Range of dates contained in the dataset
print('Minimum Date:', data['PurchaseDate'].min())
print('Maximum Date:', data['PurchaseDate'].max())

Minimum Date: 2010-12-01 08:26:00
Maximum Date: 2011-12-09 12:50:00

There is only partial data for the last month.

# Countries for which there were purchases
data['Country'].sort_values().unique()

array(['Australia', 'Austria', 'Bahrain', 'Belgium', 'Brazil', 'Canada',
       'Channel Islands', 'Chile', 'China', 'Cyprus', 'Czech Republic',
       'Denmark', 'Ecuador', 'England', 'Finland', 'Germany', 'Greece',
       'Hong Kong', 'Iceland', 'Iraq', 'Ireland', 'Israel', 'Italy',
       'Japan', 'Lithuania', 'Malta', 'Netherlands', 'Norway', 'Paraguay',
       'Poland', 'Portugal', 'Saudi Arabia', 'Singapore', 'Spain',
       'Sweden', 'USA', 'Uruguay'], dtype=object)

KPI 1: Monthly Revenues¶

Revenue = Quantity * Unit_Price

# Extract the year and month of each purchase
data['YearMonth'] = data['PurchaseDate'].map(lambda date: 100 * date.year + date.month)

# Calculate the revenue per item in each purchase
data['Revenue'] = data['Quantity'] * data['UnitPrice']

# Visualize the data
data.head(1)

# Group the revenue per month/year
df_revenue = data.groupby(['YearMonth']).agg({'Revenue': sum}).reset_index()
df_revenue

Visualizing KPI 1

# Define plot data
plot_data = [go.Bar(x = df_revenue['YearMonth'],
                    y = df_revenue['Revenue'])]

# Layout
plot_layout = go.Layout(xaxis = {'type': 'category'},
                        title = 'Monthly Revenue')

# Plot the figure
fig = go.Figure(data = plot_data, layout = plot_layout)
pyoff.iplot(fig)

KPI 2: Revenue Growth Rate¶

Growth Rate = Current Month's Revenue / Previous Month's Revenue

# Using the pct_change() method to calculate the monthly growth rate
df_revenue['GrowthRate'] = df_revenue['Revenue'].pct_change()
df_revenue

Visualizing KPI 2

Won't be considering the last month (2011/12) since there is only partial sales data for that month, which results in an incorrect growth rate measument.

# Define plot data
plot_data = [go.Scatter(x = df_revenue.query('YearMonth < 201112')['YearMonth'],
                        y = df_revenue.query('YearMonth < 201112')['GrowthRate'])]

# Layout
plot_layout = go.Layout(xaxis = {'type': 'category'},
                        yaxis = {'tickformat': '%'},
                        title = 'Revenue Growth Rate')

# Plot the figure
fig = go.Figure(data = plot_data, layout = plot_layout)
pyoff.iplot(fig)

KPI 3: Monthly Active Customers in a Country¶

Defining active clients as those clients who made at least one purchase in a given month.

# Create a dataframe containing only data from Brazil
data_brazil = data.query("Country=='Brazil'").reset_index(drop=True)

# On each month, consider as active only those users to purchased on the current month
df_active_month = data_brazil.groupby('YearMonth')['CustomerId'].nunique().reset_index()
df_active_month

Visualizing KPI 3

# Define plot data
plot_data = [go.Scatter(x = df_active_month['YearMonth'],
                        y = df_active_month['CustomerId'])]

# Layout
plot_layout = go.Layout(xaxis = {'type': 'category'},
                        title = 'Monthly Active Customers in a Country (Brazil)')

# Plot the figure
fig = go.Figure(data = plot_data, layout = plot_layout)
pyoff.iplot(fig)

Note that there is only partial data for the last month, which is why such a large drop is observed on the plot.

KPI 4: Monthly Item Sales in a Country¶

# Group the data to tally the number of items sold per month in Brazil
df_items_month = data_brazil.groupby('YearMonth')['Quantity'].sum().reset_index()
df_items_month

Visualizing KPI 4

# Define plot data
plot_data = [go.Bar(x = df_items_month['YearMonth'],
                    y = df_items_month['Quantity'])]

# Layout
plot_layout = go.Layout(xaxis = {'type': 'category'},
                        title = 'Monthly Item Sales in a Country (Brazil)')

# Plot the figure
fig = go.Figure(data = plot_data, layout = plot_layout)
pyoff.iplot(fig)

KPI 5: Monthly Average Item Sale Price¶

# Calculate the average item sale price per month
df_revenue_avg = data_brazil.groupby('YearMonth')['Revenue'].mean().reset_index()
df_revenue_avg

Visualizing KPI 5

# Define plot data
plot_data = [go.Bar(x = df_revenue_avg['YearMonth'],
                    y = df_revenue_avg['Revenue'])]

# Layout
plot_layout = go.Layout(xaxis = {'type': 'category'},
                        title = 'Monthly Average Item Sale Price')

# Plot the figure
fig = go.Figure(data = plot_data, layout = plot_layout)
pyoff.iplot(fig)

KPI 6: Revenue Growth from New vs Old Customers¶

To obtain this KPI some creativity is needed, as it's necessary to define what constitutes a new vs old customer...

For each month I'll be considering as new customers those whose first purchase happened in the current month, while those whose first purchase happened in previous months will be considered old customers.

# Finding the earliest purchase date for each customer
df_purchase_earliest = data.groupby('CustomerId')['PurchaseDate'].min().reset_index()
df_purchase_earliest.columns = ['CustomerId', 'EarliestDate']
df_purchase_earliest.head(2)

# Extracting the month of the first purchase
df_purchase_earliest['FirstPurchase'] = df_purchase_earliest['EarliestDate'].map(lambda date: 100 * date.year + date.month)
df_purchase_earliest.head()

# Merge the dataframe containing the month of the first purchase into the base dataset
data_purchase = pd.merge(data, df_purchase_earliest, on = 'CustomerId')
data_purchase.head(3)

# Creating a new column for customer type and filling all values as "new"
data_purchase['CustomerType'] = 'New'
data_purchase['CustomerType'].value_counts()

New    406725
Name: CustomerType, dtype: int64

# For each PurchaseId, if the purchase is happening in an year/month greater than the user's FirstPurchase
# then set CustomerType as "Old"
data_purchase.loc[data_purchase['YearMonth'] > data_purchase['FirstPurchase'], 'CustomerType'] = 'Old'
data_purchase['CustomerType'].value_counts()

Old    287549
New    119176
Name: CustomerType, dtype: int64

# Now calculating the revenue per CustomerType and month
df_revenue_type_month = data_purchase.groupby(['YearMonth', 'CustomerType'])['Revenue'].sum().reset_index()

# Removing 2011/12 since it only has partial sales data
df_revenue_type_month = df_revenue_type_month.query("YearMonth != 201112 and YearMonth != 201012")
df_revenue_type_month

Visualizing KPI 6

# Define plot data
plot_data = [go.Scatter(x = df_revenue_type_month.query("CustomerType == 'Old'")['YearMonth'],
                        y = df_revenue_type_month.query("CustomerType == 'Old'")['Revenue'],
                        name = 'Old Customers'),
             go.Scatter(x = df_revenue_type_month.query("CustomerType == 'New'")['YearMonth'],
                        y = df_revenue_type_month.query("CustomerType == 'New'")['Revenue'],
                        name = 'New Customers')]
             
# Layout
plot_layout = go.Layout(xaxis = {'type': 'category'},
                        title = 'Revenue Growth from New vs Old Customers')

# Plot the figure
fig = go.Figure(data = plot_data, layout = plot_layout)
pyoff.iplot(fig)

Since new and old customers have been defined, it's possible to now calculate the share of new customers each month.

# Calculate the rate of new customers in each month
df_new_customer_rate = data_purchase.query("CustomerType == 'New'").groupby('YearMonth')['CustomerId'].nunique() / data_purchase.groupby('YearMonth')['CustomerId'].nunique()

# Fix index and remove missing values
df_new_customer_rate = df_new_customer_rate.reset_index()
df_new_customer_rate = df_new_customer_rate.dropna()
df_new_customer_rate

Visualizing KPI 7

# Define plot data
plot_data = [go.Bar(x = df_new_customer_rate.query("YearMonth < 201112")['YearMonth'],
                    y = df_new_customer_rate.query("YearMonth < 201112")['CustomerId'])]
             
# Layout
plot_layout = go.Layout(xaxis = {'type': 'category'},
                        yaxis = {'tickformat': '%'},
                        title = 'New Customer Share')

# Plot the figure
fig = go.Figure(data = plot_data, layout = plot_layout)
pyoff.iplot(fig)

KPI 8: Customer Retention Rate¶

Retention Rate = Previous Month's Clients / Total Active Clients

# Groupíng the data by customer and month and tallying the revenue
data_purchase_customers = data_purchase.groupby(['CustomerId', 'YearMonth'])['Revenue'].sum().reset_index()
data_purchase_customers.head()

# Defining retention with a crosstab
df_ret = pd.crosstab(data_purchase_customers['CustomerId'], data_purchase_customers['YearMonth'])
df_ret.head()

# Extracting all year/month pairs
months = df_ret.columns
months

Int64Index([201012, 201101, 201102, 201103, 201104, 201105, 201106, 201107,
            201108, 201109, 201110, 201111, 201112],
           dtype='int64', name='YearMonth')

# Loop to calculate retention over the months

# List to store the result
ret_list = []

# Loop
for i in range(len(months)-1):
    retention_data = {}
    previous_month = months[i]
    current_month = months[i+1]
    retention_data['YearMonth'] = int(current_month)
    retention_data['CustomerTotal'] = df_ret[current_month].sum()
    retention_data['TotalRetained'] = df_ret[(df_ret[current_month] > 0 ) & (df_ret[previous_month] > 0)][current_month].sum()
    ret_list.append(retention_data)
    
ret_list

[{'YearMonth': 201101, 'CustomerTotal': 783, 'TotalRetained': 362},
 {'YearMonth': 201102, 'CustomerTotal': 798, 'TotalRetained': 299},
 {'YearMonth': 201103, 'CustomerTotal': 1020, 'TotalRetained': 345},
 {'YearMonth': 201104, 'CustomerTotal': 899, 'TotalRetained': 346},
 {'YearMonth': 201105, 'CustomerTotal': 1079, 'TotalRetained': 399},
 {'YearMonth': 201106, 'CustomerTotal': 1051, 'TotalRetained': 464},
 {'YearMonth': 201107, 'CustomerTotal': 993, 'TotalRetained': 415},
 {'YearMonth': 201108, 'CustomerTotal': 980, 'TotalRetained': 433},
 {'YearMonth': 201109, 'CustomerTotal': 1302, 'TotalRetained': 465},
 {'YearMonth': 201110, 'CustomerTotal': 1425, 'TotalRetained': 552},
 {'YearMonth': 201111, 'CustomerTotal': 1711, 'TotalRetained': 690},
 {'YearMonth': 201112, 'CustomerTotal': 683, 'TotalRetained': 440}]

# Convert to dataframe
df_ret_final = pd.DataFrame(ret_list)
df_ret_final.head(2)

# Calculate the ratio to obtain the KPI
df_ret_final['RetentionRate'] = df_ret_final['TotalRetained'] / df_ret_final['CustomerTotal']
df_ret_final

Visualizing KPI 8

# Define plot data
plot_data = [go.Scatter(x = df_ret_final.query("YearMonth < 201112")['YearMonth'],
                        y = df_ret_final.query("YearMonth < 201112")['RetentionRate'],
                        name = 'Rate')]
# Layout
plot_layout = go.Layout(xaxis = {'type': 'category'},
                        yaxis = {'tickformat': '%'},
                        title = 'Customer Retention Rate')

# Plot the figure
fig = go.Figure(data = plot_data, layout = plot_layout)
pyoff.iplot(fig)

End¶