E-commerce Customer Segmentation

Problem Definition

https://tamebay.com/2020/03/online-retail-is-still-open-and-encouraged.html

Define the Scope

Business Questions

Analysis

import pandas as pd
import matplotlib.pyplot as plt
import calendar
import datetime
df = pd.read_csv('data_ecommerce.csv', encoding = 'ISO-8859-1')
df.head()
print('Missing values:', df.isnull().sum())
df.dropna(axis = 0, subset = ['CustomerID'], inplace = True)
print('Missing values after cleaning:',df.isnull().sum())
#Change 2010-2011 to 2020-2021
df['InvoiceDate']=df['InvoiceDate'].apply(lambda x: x.replace('2010','2019'))
df['InvoiceDate']=df['InvoiceDate'].apply(lambda x: x.replace('2011','2020'))
#Drop Country column and change the year in InvoiceDate
df = df.drop('Country',axis=1)
#Data event time menjadi datetime, kemudian userid, shopid, orderid merupakan string
timefmt = "%m/%d/%Y %H:%M"
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], format = timefmt, errors='coerce')
#Customer ID is an object
df['CustomerID']=df['CustomerID'].apply(lambda x: str(int(x)))
print(df.info())
print(“Duplicated Values: {}”.format(df.duplicated().sum()))
df.drop_duplicates(inplace=True)
df.describe()
# drop quantity <= 0
df = df[df['Quantity']>=0]
df.describe()
# Create TotalPrice column
df['TotalPrice'] = df['Quantity'] * df['UnitPrice']

a) Visualization of Order Patterns

print('First date data recorder: {}'.format(min(df.InvoiceDate)))
print('Last date data recorder: {}'.format(max(df.InvoiceDate)))
df['month_year'] = df['InvoiceDate'].dt.to_period('M')#Bar Plot for Monthly Orders
plt.clf()
plt.subplot(1, 2, 1)
df.groupby('month_year')['InvoiceNo'].count().plot(kind='bar')
plt.title('Monthly Orders')
plt.ylabel('Orders')
#Bar Plot for Monthly Revenue
plt.subplot(1, 2, 2)
df.groupby('month_year')['TotalPrice'].sum().plot(kind='bar')
plt.title('Monthly Revenue')
plt.ylabel('Revenue')
plt.tight_layout()
plt.show()

b) RFM Segmentation

max_InvoiceDate = max(df['InvoiceDate'])
NOW = datetime.datetime(2020,12,10)
RFMtable = df.groupby('CustomerID').agg({
'InvoiceDate': lambda x: (NOW-x.max()).days,
'InvoiceNo': 'count',
'TotalPrice': 'sum'
})
RFMtable.columns=['Recency','Frequency','Revenue']
RFMtable.describe()
#Histogram Recency
plt.clf()
plt.figure(figsize=(10,10))
plt.subplot(3, 1, 1)
plt.hist(RFMtable['Recency'],bins=100)
plt.title('Recency', fontsize=15)
plt.xlabel('Recency(days)', fontsize=12)
plt.ylabel('Number of Customers', fontsize=12)
#Histogram Frequency
plt.subplot(3, 1, 2)
plt.hist(RFMtable['Frequency'],bins=100)
plt.title('Frequency', fontsize=15)
plt.xlabel('Frequency(orders)', fontsize=12)
plt.ylabel('Number of Customers', fontsize=12)
#Histogram Revenue
plt.subplot(3, 1, 3)
plt.hist(RFMtable['Revenue'],bins=100)
plt.title('Revenue', fontsize=15)
plt.xlabel('Revenue(USD)', fontsize=12)
plt.ylabel('Number of Customers', fontsize=12)
plt.tight_layout()
plt.show()
RFM_revenuehigh = RFMtable[RFMtable.Revenue>50000]
#Histogram Frequency
plt.subplot(2, 1, 1)
plt.hist(RFM_revenuehigh['Frequency'],bins=100)
plt.title('Frequency from Customers with >50k USD Revenue', fontsize=15)
plt.xlabel('Frequency(orders)', fontsize=12)
plt.ylabel('Number of Customers', fontsize=12)
#Histogram Revenue
plt.subplot(2, 1, 2)
plt.hist(RFM_revenuehigh['Revenue'],bins=100)
plt.title('Customers with >50k USD Revenue', fontsize=15)
plt.xlabel('Revenue(USD)', fontsize=12)
plt.ylabel('Number of Customers', fontsize=12)
plt.tight_layout()
plt.show()

RFM Segmentation using K-Means Clustering

from sklearn.cluster import KMeans
def elbow_method(dataframe):
sse={}
newdf = RFMtable[['dataframe']]
for k in range(1, 11):
kmeans = KMeans(n_clusters=k, max_iter=1000, random_state=0).fit(newdf)
sse[k] = kmeans.inertia_
plt.figure()
plt.plot(list(sse.keys()), list(sse.values()))
plt.xlabel("Number of cluster")
plt.ylabel("SSE")
return plt.show()
elbow_method(RFMtable[['Recency']])
elbow_method(RFMtable[['Frequency']])
elbow_method(RFMtable[['Revenue']])
# K-Means on Recency feature
kmeans = KMeans(n_clusters=4, random_state=0)
kmeans.fit(RFMtable[['Recency']])
RFMtable['RecencyClust'] = kmeans.predict(RFMtable[['Recency']])
# K-Means on Frequency feature
kmeans.fit(RFMtable[['Frequency']])
RFMtable['FreqClust'] = kmeans.predict(RFMtable[['Frequency']])
# K-Means on Revenue feature
kmeans.fit(RFMtable[['Revenue']])
RFMtable['RevenueClust'] = kmeans.predict(RFMtable[['Revenue']])
def ordered_cluster(df,clustername,targetname,ascending):
df_new = df.groupby(clustername)[targetname].mean().sort_values(ascending=ascending).reset_index()
df_new['index'] = df_new.index+1 #So the cluster would start at cluster 1-4
df_fix = pd.merge(df,df_new[[clustername,'index']], on=clustername).drop(clustername,axis=1)
df_fix.rename(columns={'index':clustername},inplace=True)
return df_fix
RFMtable['CustomerID']=RFMtable.index
RFMtable.reset_index(drop=True)
RFMtable=RFMtable[['CustomerID','Recency','Frequency','Revenue','RecencyClust','FreqClust','RevenueClust']]
RFMtable = ordered_cluster(RFMtable,'RecencyClust','Recency',True)
RFMtable = ordered_cluster(RFMtable,'FreqClust','Frequency',False)
RFMtable = ordered_cluster(RFMtable,'RevenueClust','Revenue',False)
#Create RFMindex column
RFMtable.iloc[:,4:7]=RFMtable.iloc[:,4:7].applymap(str)
RFMtable['RFMindex']=RFMtable['RecencyClust']+RFMtable['FreqClust']+RFMtable['RevenueClust']
RFMtable
RFMtable.groupby('RFMindex').mean()
CustomerSegment = RFMtable[['CustomerID','RFMindex']]
CustomerSegment
def segmentation(index):
if index in ['112', '121', '122']:
return 'Best Customer'
elif index in ['131', '132', '142']:
return 'High-Spending New Customer'
elif index in ['113','123', '124']:
return 'Low-Spending Loyal Customer'
elif index in ['133', '134', '143', '144']:
return 'Low-Spending New Customer'
elif index in ['234', '244']:
return 'About to Sleep Customer'
elif index in ['334', '343', '344', '434', '443', '444']:
return 'Churned Customer'
else:
return 'Not Specified'
CustomerSegment['Segmentation']=CustomerSegment['RFMindex'].apply(lambda x: segmentation(x))
CustomerSegment.drop('RFMindex',axis=1,inplace=True)
print(CustomerSegment.head())
pd.DataFrame(CustomerSegment.groupby('Segmentation')['CustomerID'].count())

Conclussion

Excited to learn more in data field, sometimes read, mostly watch movies.