In this project I'll be building and analysing KPIs from a retail store dataset. The analysis contains the follow eight KPIs:

  • KPI 1: Monthly Revenues
  • KPI 2: Revenue Growth Rate
  • KPI 3: Monthly Active Customers in a Country
  • KPI 4: Monthly Item Sales in a Country
  • KPI 5: Monthly Average Item Sales Price
  • KPI 6: Revenue Growth from New vs Old Customers
  • KPI 7: New Customer Share
  • KPI 8: Customer Retention Rate
In [1]:
# Imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import plotly
import matplotlib.pyplot as plt
import plotly.offline as pyoff
import plotly.graph_objs as go
from datetime import datetime, timedelta
%matplotlib inline
pyoff.init_notebook_mode()

Exploratory Analysis

In [2]:
data = pd.read_csv('data/dataset.csv', header=0, encoding = 'unicode_escape')
data.head()
Out[2]:
PurchaseId ProductCode ProductName Quantity PurchaseDate UnitPrice CustomerId Country
0 536365 21730 GLASS STAR FROSTED T-LIGHT HOLDER 6 12/1/2010 8:26 4.25 17850.0 Brazil
1 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6 12/1/2010 8:26 2.55 17850.0 Brazil
2 536365 71053 WHITE METAL LANTERN 6 12/1/2010 8:26 3.39 17850.0 Brazil
3 536365 84406B CREAM CUPID HEARTS COAT HANGER 8 12/1/2010 8:26 2.75 17850.0 Brazil
4 536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6 12/1/2010 8:26 3.39 17850.0 Brazil

Note: Each row (register) represents one item from a purchase. Thus the PurchaseId column will have repeating values indicating that a single purchase contained multiple items.

In [3]:
# Shape
data.shape
Out[3]:
(541800, 8)
In [4]:
# Data types
data.dtypes
Out[4]:
PurchaseId       object
ProductCode      object
ProductName      object
Quantity          int64
PurchaseDate     object
UnitPrice       float64
CustomerId      float64
Country          object
dtype: object
In [5]:
# Describe
data.describe()
Out[5]:
Quantity UnitPrice CustomerId
count 541800.000000 541800.000000 406725.000000
mean 9.551739 4.611581 15287.754038
std 218.103033 96.769576 1713.475925
min -80995.000000 -11062.060000 12346.000000
25% 1.000000 1.250000 13954.000000
50% 3.000000 2.080000 15152.000000
75% 10.000000 4.130000 16791.000000
max 80995.000000 38970.000000 18287.000000
In [6]:
# Check for missing values
data.isna().sum()
Out[6]:
PurchaseId           0
ProductCode          0
ProductName       1454
Quantity             0
PurchaseDate         0
UnitPrice            0
CustomerId      135075
Country              0
dtype: int64
In [7]:
# Range of dates contained in the dataset
# (This retuns errouneous information because the data type for the date is incorrect)
print('Minimum Date:', data['PurchaseDate'].min())
print('Maximum Date:', data['PurchaseDate'].max())
Minimum Date: 1/10/2011 10:04
Maximum Date: 9/9/2011 9:52
In [8]:
# Convert the PurchaseDate column to datetime format
data.PurchaseDate = pd.to_datetime(data.PurchaseDate)      
data.dtypes
Out[8]:
PurchaseId              object
ProductCode             object
ProductName             object
Quantity                 int64
PurchaseDate    datetime64[ns]
UnitPrice              float64
CustomerId             float64
Country                 object
dtype: object
In [9]:
# Range of dates contained in the dataset
print('Minimum Date:', data['PurchaseDate'].min())
print('Maximum Date:', data['PurchaseDate'].max())
Minimum Date: 2010-12-01 08:26:00
Maximum Date: 2011-12-09 12:50:00

There is only partial data for the last month.

In [10]:
# Countries for which there were purchases
data['Country'].sort_values().unique()
Out[10]:
array(['Australia', 'Austria', 'Bahrain', 'Belgium', 'Brazil', 'Canada',
       'Channel Islands', 'Chile', 'China', 'Cyprus', 'Czech Republic',
       'Denmark', 'Ecuador', 'England', 'Finland', 'Germany', 'Greece',
       'Hong Kong', 'Iceland', 'Iraq', 'Ireland', 'Israel', 'Italy',
       'Japan', 'Lithuania', 'Malta', 'Netherlands', 'Norway', 'Paraguay',
       'Poland', 'Portugal', 'Saudi Arabia', 'Singapore', 'Spain',
       'Sweden', 'USA', 'Uruguay'], dtype=object)

KPI 1: Monthly Revenues

Revenue = Quantity * Unit_Price

In [11]:
# Extract the year and month of each purchase
data['YearMonth'] = data['PurchaseDate'].map(lambda date: 100 * date.year + date.month)
In [12]:
# Calculate the revenue per item in each purchase
data['Revenue'] = data['Quantity'] * data['UnitPrice']
In [13]:
# Visualize the data
data.head(1)
Out[13]:
PurchaseId ProductCode ProductName Quantity PurchaseDate UnitPrice CustomerId Country YearMonth Revenue
0 536365 21730 GLASS STAR FROSTED T-LIGHT HOLDER 6 2010-12-01 08:26:00 4.25 17850.0 Brazil 201012 25.5
In [14]:
# Group the revenue per month/year
df_revenue = data.groupby(['YearMonth']).agg({'Revenue': sum}).reset_index()
df_revenue
Out[14]:
YearMonth Revenue
0 201012 748957.020
1 201101 560000.260
2 201102 498062.650
3 201103 683267.080
4 201104 493207.121
5 201105 723333.510
6 201106 691123.120
7 201107 681300.111
8 201108 682680.510
9 201109 1019687.622
10 201110 1070704.670
11 201111 1461756.250
12 201112 431245.000

Visualizing KPI 1

In [15]:
# Define plot data
plot_data = [go.Bar(x = df_revenue['YearMonth'],
                    y = df_revenue['Revenue'])]

# Layout
plot_layout = go.Layout(xaxis = {'type': 'category'},
                        title = 'Monthly Revenue')

# Plot the figure
fig = go.Figure(data = plot_data, layout = plot_layout)
pyoff.iplot(fig)

KPI 2: Revenue Growth Rate

Growth Rate = Current Month's Revenue / Previous Month's Revenue

In [16]:
# Using the pct_change() method to calculate the monthly growth rate
df_revenue['GrowthRate'] = df_revenue['Revenue'].pct_change()
df_revenue
Out[16]:
YearMonth Revenue GrowthRate
0 201012 748957.020 NaN
1 201101 560000.260 -0.252293
2 201102 498062.650 -0.110603
3 201103 683267.080 0.371850
4 201104 493207.121 -0.278163
5 201105 723333.510 0.466592
6 201106 691123.120 -0.044530
7 201107 681300.111 -0.014213
8 201108 682680.510 0.002026
9 201109 1019687.622 0.493653
10 201110 1070704.670 0.050032
11 201111 1461756.250 0.365228
12 201112 431245.000 -0.704982

Visualizing KPI 2

Won't be considering the last month (2011/12) since there is only partial sales data for that month, which results in an incorrect growth rate measument.

In [17]:
# Define plot data
plot_data = [go.Scatter(x = df_revenue.query('YearMonth < 201112')['YearMonth'],
                        y = df_revenue.query('YearMonth < 201112')['GrowthRate'])]

# Layout
plot_layout = go.Layout(xaxis = {'type': 'category'},
                        yaxis = {'tickformat': '%'},
                        title = 'Revenue Growth Rate')

# Plot the figure
fig = go.Figure(data = plot_data, layout = plot_layout)
pyoff.iplot(fig)

KPI 3: Monthly Active Customers in a Country

Defining active clients as those clients who made at least one purchase in a given month.

In [18]:
# Create a dataframe containing only data from Brazil
data_brazil = data.query("Country=='Brazil'").reset_index(drop=True)
In [19]:
# On each month, consider as active only those users to purchased on the current month
df_active_month = data_brazil.groupby('YearMonth')['CustomerId'].nunique().reset_index()
df_active_month
Out[19]:
YearMonth CustomerId
0 201012 871
1 201101 684
2 201102 714
3 201103 923
4 201104 817
5 201105 985
6 201106 943
7 201107 899
8 201108 867
9 201109 1177
10 201110 1285
11 201111 1548
12 201112 614

Visualizing KPI 3

In [20]:
# Define plot data
plot_data = [go.Scatter(x = df_active_month['YearMonth'],
                        y = df_active_month['CustomerId'])]

# Layout
plot_layout = go.Layout(xaxis = {'type': 'category'},
                        title = 'Monthly Active Customers in a Country (Brazil)')

# Plot the figure
fig = go.Figure(data = plot_data, layout = plot_layout)
pyoff.iplot(fig)

Note that there is only partial data for the last month, which is why such a large drop is observed on the plot.

KPI 4: Monthly Item Sales in a Country

In [21]:
# Group the data to tally the number of items sold per month in Brazil
df_items_month = data_brazil.groupby('YearMonth')['Quantity'].sum().reset_index()
df_items_month
Out[21]:
YearMonth Quantity
0 201012 298101
1 201101 237381
2 201102 225641
3 201103 279843
4 201104 257666
5 201105 306452
6 201106 258522
7 201107 324129
8 201108 319804
9 201109 458490
10 201110 455612
11 201111 642281
12 201112 198750

Visualizing KPI 4

In [22]:
# Define plot data
plot_data = [go.Bar(x = df_items_month['YearMonth'],
                    y = df_items_month['Quantity'])]

# Layout
plot_layout = go.Layout(xaxis = {'type': 'category'},
                        title = 'Monthly Item Sales in a Country (Brazil)')

# Plot the figure
fig = go.Figure(data = plot_data, layout = plot_layout)
pyoff.iplot(fig)

KPI 5: Monthly Average Item Sale Price

In [23]:
# Calculate the average item sale price per month
df_revenue_avg = data_brazil.groupby('YearMonth')['Revenue'].mean().reset_index()
df_revenue_avg
Out[23]:
YearMonth Revenue
0 201012 16.865860
1 201101 13.614680
2 201102 16.093027
3 201103 16.716166
4 201104 15.773380
5 201105 17.713823
6 201106 16.714748
7 201107 15.723497
8 201108 17.315899
9 201109 18.931723
10 201110 16.093582
11 201111 16.312383
12 201112 16.223363

Visualizing KPI 5

In [24]:
# Define plot data
plot_data = [go.Bar(x = df_revenue_avg['YearMonth'],
                    y = df_revenue_avg['Revenue'])]

# Layout
plot_layout = go.Layout(xaxis = {'type': 'category'},
                        title = 'Monthly Average Item Sale Price')

# Plot the figure
fig = go.Figure(data = plot_data, layout = plot_layout)
pyoff.iplot(fig)

KPI 6: Revenue Growth from New vs Old Customers

To obtain this KPI some creativity is needed, as it's necessary to define what constitutes a new vs old customer...

For each month I'll be considering as new customers those whose first purchase happened in the current month, while those whose first purchase happened in previous months will be considered old customers.

In [25]:
# Finding the earliest purchase date for each customer
df_purchase_earliest = data.groupby('CustomerId')['PurchaseDate'].min().reset_index()
df_purchase_earliest.columns = ['CustomerId', 'EarliestDate']
df_purchase_earliest.head(2)
Out[25]:
CustomerId EarliestDate
0 12346.0 2011-01-18 10:01:00
1 12347.0 2010-12-07 14:57:00
In [26]:
# Extracting the month of the first purchase
df_purchase_earliest['FirstPurchase'] = df_purchase_earliest['EarliestDate'].map(lambda date: 100 * date.year + date.month)
df_purchase_earliest.head()
Out[26]:
CustomerId EarliestDate FirstPurchase
0 12346.0 2011-01-18 10:01:00 201101
1 12347.0 2010-12-07 14:57:00 201012
2 12348.0 2010-12-16 19:09:00 201012
3 12349.0 2011-11-21 09:51:00 201111
4 12350.0 2011-02-02 16:01:00 201102
In [27]:
# Merge the dataframe containing the month of the first purchase into the base dataset
data_purchase = pd.merge(data, df_purchase_earliest, on = 'CustomerId')
data_purchase.head(3)
Out[27]:
PurchaseId ProductCode ProductName Quantity PurchaseDate UnitPrice CustomerId Country YearMonth Revenue EarliestDate FirstPurchase
0 536365 21730 GLASS STAR FROSTED T-LIGHT HOLDER 6 2010-12-01 08:26:00 4.25 17850.0 Brazil 201012 25.50 2010-12-01 08:26:00 201012
1 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6 2010-12-01 08:26:00 2.55 17850.0 Brazil 201012 15.30 2010-12-01 08:26:00 201012
2 536365 71053 WHITE METAL LANTERN 6 2010-12-01 08:26:00 3.39 17850.0 Brazil 201012 20.34 2010-12-01 08:26:00 201012
In [28]:
# Creating a new column for customer type and filling all values as "new"
data_purchase['CustomerType'] = 'New'
data_purchase['CustomerType'].value_counts()
Out[28]:
New    406725
Name: CustomerType, dtype: int64
In [29]:
# For each PurchaseId, if the purchase is happening in an year/month greater than the user's FirstPurchase
# then set CustomerType as "Old"
data_purchase.loc[data_purchase['YearMonth'] > data_purchase['FirstPurchase'], 'CustomerType'] = 'Old'
data_purchase['CustomerType'].value_counts()
Out[29]:
Old    287549
New    119176
Name: CustomerType, dtype: int64
In [30]:
# Now calculating the revenue per CustomerType and month
df_revenue_type_month = data_purchase.groupby(['YearMonth', 'CustomerType'])['Revenue'].sum().reset_index()

# Removing 2011/12 since it only has partial sales data
df_revenue_type_month = df_revenue_type_month.query("YearMonth != 201112 and YearMonth != 201012")
df_revenue_type_month
Out[30]:
YearMonth CustomerType Revenue
1 201101 New 203457.860
2 201101 Old 271616.520
3 201102 New 149521.380
4 201102 Old 287024.770
5 201103 New 189930.080
6 201103 Old 390034.530
7 201104 New 119764.251
8 201104 Old 306283.600
9 201105 New 115858.740
10 201105 Old 532392.340
11 201106 New 92526.510
12 201106 Old 515486.650
13 201107 New 65882.871
14 201107 Old 508355.610
15 201108 New 77658.230
16 201108 Old 538709.770
17 201109 New 153278.591
18 201109 Old 778161.781
19 201110 New 154930.690
20 201110 Old 819672.900
21 201111 New 134231.380
22 201111 Old 998176.360

Visualizing KPI 6

In [31]:
# Define plot data
plot_data = [go.Scatter(x = df_revenue_type_month.query("CustomerType == 'Old'")['YearMonth'],
                        y = df_revenue_type_month.query("CustomerType == 'Old'")['Revenue'],
                        name = 'Old Customers'),
             go.Scatter(x = df_revenue_type_month.query("CustomerType == 'New'")['YearMonth'],
                        y = df_revenue_type_month.query("CustomerType == 'New'")['Revenue'],
                        name = 'New Customers')]
             
# Layout
plot_layout = go.Layout(xaxis = {'type': 'category'},
                        title = 'Revenue Growth from New vs Old Customers')

# Plot the figure
fig = go.Figure(data = plot_data, layout = plot_layout)
pyoff.iplot(fig)

KPI 7: New Customer Share

Since new and old customers have been defined, it's possible to now calculate the share of new customers each month.

In [32]:
# Calculate the rate of new customers in each month
df_new_customer_rate = data_purchase.query("CustomerType == 'New'").groupby('YearMonth')['CustomerId'].nunique() / data_purchase.groupby('YearMonth')['CustomerId'].nunique()

# Fix index and remove missing values
df_new_customer_rate = df_new_customer_rate.reset_index()
df_new_customer_rate = df_new_customer_rate.dropna()
df_new_customer_rate
Out[32]:
YearMonth CustomerId
0 201012 1.000000
1 201101 0.537676
2 201102 0.476190
3 201103 0.431373
4 201104 0.332592
5 201105 0.258573
6 201106 0.223597
7 201107 0.192346
8 201108 0.170408
9 201109 0.228879
10 201110 0.247018
11 201111 0.187610
12 201112 0.060029

Visualizing KPI 7

In [33]:
# Define plot data
plot_data = [go.Bar(x = df_new_customer_rate.query("YearMonth < 201112")['YearMonth'],
                    y = df_new_customer_rate.query("YearMonth < 201112")['CustomerId'])]
             
# Layout
plot_layout = go.Layout(xaxis = {'type': 'category'},
                        yaxis = {'tickformat': '%'},
                        title = 'New Customer Share')

# Plot the figure
fig = go.Figure(data = plot_data, layout = plot_layout)
pyoff.iplot(fig)

KPI 8: Customer Retention Rate

Retention Rate = Previous Month's Clients / Total Active Clients

In [34]:
# GroupĂ­ng the data by customer and month and tallying the revenue
data_purchase_customers = data_purchase.groupby(['CustomerId', 'YearMonth'])['Revenue'].sum().reset_index()
data_purchase_customers.head()
Out[34]:
CustomerId YearMonth Revenue
0 12346.0 201101 0.00
1 12347.0 201012 711.79
2 12347.0 201101 475.39
3 12347.0 201104 636.25
4 12347.0 201106 382.52
In [35]:
# Defining retention with a crosstab
df_ret = pd.crosstab(data_purchase_customers['CustomerId'], data_purchase_customers['YearMonth'])
df_ret.head()
Out[35]:
YearMonth 201012 201101 201102 201103 201104 201105 201106 201107 201108 201109 201110 201111 201112
CustomerId
12346.0 0 1 0 0 0 0 0 0 0 0 0 0 0
12347.0 1 1 0 0 1 0 1 0 1 0 1 0 1
12348.0 1 1 0 0 1 0 0 0 0 1 0 0 0
12349.0 0 0 0 0 0 0 0 0 0 0 0 1 0
12350.0 0 0 1 0 0 0 0 0 0 0 0 0 0
In [36]:
# Extracting all year/month pairs
months = df_ret.columns
months
Out[36]:
Int64Index([201012, 201101, 201102, 201103, 201104, 201105, 201106, 201107,
            201108, 201109, 201110, 201111, 201112],
           dtype='int64', name='YearMonth')
In [37]:
# Loop to calculate retention over the months

# List to store the result
ret_list = []

# Loop
for i in range(len(months)-1):
    retention_data = {}
    previous_month = months[i]
    current_month = months[i+1]
    retention_data['YearMonth'] = int(current_month)
    retention_data['CustomerTotal'] = df_ret[current_month].sum()
    retention_data['TotalRetained'] = df_ret[(df_ret[current_month] > 0 ) & (df_ret[previous_month] > 0)][current_month].sum()
    ret_list.append(retention_data)
    
ret_list    
Out[37]:
[{'YearMonth': 201101, 'CustomerTotal': 783, 'TotalRetained': 362},
 {'YearMonth': 201102, 'CustomerTotal': 798, 'TotalRetained': 299},
 {'YearMonth': 201103, 'CustomerTotal': 1020, 'TotalRetained': 345},
 {'YearMonth': 201104, 'CustomerTotal': 899, 'TotalRetained': 346},
 {'YearMonth': 201105, 'CustomerTotal': 1079, 'TotalRetained': 399},
 {'YearMonth': 201106, 'CustomerTotal': 1051, 'TotalRetained': 464},
 {'YearMonth': 201107, 'CustomerTotal': 993, 'TotalRetained': 415},
 {'YearMonth': 201108, 'CustomerTotal': 980, 'TotalRetained': 433},
 {'YearMonth': 201109, 'CustomerTotal': 1302, 'TotalRetained': 465},
 {'YearMonth': 201110, 'CustomerTotal': 1425, 'TotalRetained': 552},
 {'YearMonth': 201111, 'CustomerTotal': 1711, 'TotalRetained': 690},
 {'YearMonth': 201112, 'CustomerTotal': 683, 'TotalRetained': 440}]
In [38]:
# Convert to dataframe
df_ret_final = pd.DataFrame(ret_list)
df_ret_final.head(2)
Out[38]:
YearMonth CustomerTotal TotalRetained
0 201101 783 362
1 201102 798 299
In [39]:
# Calculate the ratio to obtain the KPI
df_ret_final['RetentionRate'] = df_ret_final['TotalRetained'] / df_ret_final['CustomerTotal']
df_ret_final
Out[39]:
YearMonth CustomerTotal TotalRetained RetentionRate
0 201101 783 362 0.462324
1 201102 798 299 0.374687
2 201103 1020 345 0.338235
3 201104 899 346 0.384872
4 201105 1079 399 0.369787
5 201106 1051 464 0.441484
6 201107 993 415 0.417925
7 201108 980 433 0.441837
8 201109 1302 465 0.357143
9 201110 1425 552 0.387368
10 201111 1711 690 0.403273
11 201112 683 440 0.644217

Visualizing KPI 8

In [40]:
# Define plot data
plot_data = [go.Scatter(x = df_ret_final.query("YearMonth < 201112")['YearMonth'],
                        y = df_ret_final.query("YearMonth < 201112")['RetentionRate'],
                        name = 'Rate')]
# Layout
plot_layout = go.Layout(xaxis = {'type': 'category'},
                        yaxis = {'tickformat': '%'},
                        title = 'Customer Retention Rate')

# Plot the figure
fig = go.Figure(data = plot_data, layout = plot_layout)
pyoff.iplot(fig)