Online Shopper's Intention Prediction¶
Credit: AITS Cainvas Community
Photo by Karol Cichoń on Dribbble
Predict a customer's behaviour in online shopping websites for KPI and marketing analysis.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from tensorflow.keras import models, optimizers, losses, layers, callbacks
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import random
import warnings
warnings.filterwarnings("ignore")
The dataset¶
- C. Okan Sakar Department of Computer Engineering, Faculty of Engineering and Natural Sciences, Bahcesehir University, 34349 Besiktas, Istanbul, Turkey
- Yomi Kastro Inveon Information Technologies Consultancy and Trade, 34335 Istanbul, Turkey
Sakar, C.O., Polat, S.O., Katircioglu, M. et al. Neural Comput & Applic (2018).
The dataset is a CSV file with 18 attributes (10 numerical and 8 categorical) and 1 target column.
Administrative, Administrative Duration, Informational, Informational Duration, Product Related and Product Related Duration represent the number of times visited and duration of time spent in the respective categories of websites.
The Bounce Rate (the percentage of visitors who enter and leave the site without triggering any request), Exit Rate (percentage of sessions that ended in the page relative to all page views) and Page Value (the average value for a web page that a user visited before completing an e-commerce transaction) features represent the metrics measured by "Google Analytics" for each page in the e-commerce site.
The Special Day feature indicates the closeness of the site visiting time to a specific special day.
Other attributes such as operating system, browser, region, traffic type, visitor type, weekend, and month are also available.
# Load the Online Shoppers Intention dataset (CSV mirrored on Cainvas S3)
# into a DataFrame; the trailing `df` displays it in the notebook.
df = pd.read_csv('https://cainvas-static.s3.amazonaws.com/media/user_data/cainvas-admin/online_shoppers_intention.csv')
df
Looking into the columns in the data frame
# List every column name to see which features are available.
df.columns
Defining the numeric columns for standardization later
# Numeric feature columns; these are standardized later, after the
# train/val/test split (the scaler is fit on the training split only).
numeric_columns = [
    'Administrative', 'Administrative_Duration',
    'Informational', 'Informational_Duration',
    'ProductRelated', 'ProductRelated_Duration',
    'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay',
]
Checking for NaN values
...and dropping them.
# Report the per-column NaN count, then discard any incomplete rows.
nan_counts = df.isnull().sum()
print(nan_counts)
df = df.dropna()
A peek into the class label distribution
# Class distribution of the binary target label 'Revenue'.
df['Revenue'].value_counts()
It's not balanced, but let us see how our model performs on this data.
A peek into the values in the 'Month' column
# Distribution of sessions across months (only 10 of the 12 calendar
# months appear in this dataset).
df['Month'].value_counts()
Only 10 out of 12 months are in the dataframe. The month column needs to be one-hot encoded with all the 12 months in count.
# Cast the two boolean columns to 0/1 integers.
for bool_col in ('Weekend', 'Revenue'):
    df[bool_col] = df[bool_col].astype('int64')

# One-hot encode the nominal categorical features. drop_first=True drops
# one redundant level per feature; all dummy frames are concatenated in
# a single pass, preserving column order.
dummy_columns = ['OperatingSystems','Browser','Region','TrafficType','VisitorType']
encoded_frames = [
    pd.get_dummies(df[col], drop_first = True, prefix = col + "_")
    for col in dummy_columns
]
df = pd.concat([df] + encoded_frames, axis = 1).drop(columns = dummy_columns)

# One-hot encode 'Month' against the full 12-month calendar so months
# absent from the data still get a (zero) column; 'Jan' is skipped,
# which mirrors drop_first=True.
months = ['Jan','Feb','Mar','Apr','May','June','Jul','Aug','Sep','Oct','Nov','Dec']
for month_name in months[1:]:
    df[month_name] = (df['Month'] == month_name).astype('int64')
df = df.drop(columns = ['Month'])
Defining input and output columns
# The model predicts 'Revenue'; every other column is an input feature.
output_columns = ['Revenue']
input_columns = [col for col in df.columns if col != 'Revenue']
Train-val-test split based on 80-10-10 ratio
# Splitting into train, val and test set -- 80-10-10 split.
# random_state makes the split reproducible between runs, and stratifying
# on 'Revenue' keeps the (imbalanced) class ratio consistent across the
# three splits -- the original split was neither reproducible nor
# stratified.
train_df, val_test_df = train_test_split(
    df, test_size = 0.2, random_state = 42, stratify = df['Revenue'])
# Then split the held-out 20% into half: 10% validation, 10% test.
val_df, test_df = train_test_split(
    val_test_df, test_size = 0.5, random_state = 42,
    stratify = val_test_df['Revenue'])

print("Number of samples in...")
print("Training set: ", len(train_df))
print("Validation set: ", len(val_df))
print("Testing set: ", len(test_df))
Standardizing the numeric column values
# Standardize the numeric features: fit the scaler on the training split
# only, then apply that same transform to every split (no leakage from
# validation/test statistics).
ss = StandardScaler()
ss.fit(train_df[numeric_columns])
for split_df in (train_df, val_df, test_df):
    split_df[numeric_columns] = ss.transform(split_df[numeric_columns])
# Splitting into X (input) and y (output) NumPy arrays.
# All three X arrays use float32: the original cast only Xtest to
# float16, which both loses precision and leaves the test inputs with a
# different dtype than the data the model is trained on.
Xtrain, ytrain = np.array(train_df[input_columns], dtype = 'float32'), np.array(train_df[output_columns])
Xval, yval = np.array(val_df[input_columns], dtype = 'float32'), np.array(val_df[output_columns])
Xtest, ytest = np.array(test_df[input_columns], dtype = 'float32'), np.array(test_df[output_columns])
The model¶
# A small fully-connected binary classifier: two ReLU hidden layers
# (16 and 8 units) and a single sigmoid output unit.
model = models.Sequential()
model.add(layers.Dense(16, activation = 'relu', input_shape = Xtrain[0].shape))
model.add(layers.Dense(8, activation = 'relu'))
model.add(layers.Dense(1, activation = 'sigmoid'))

# Stop training once the monitored metric (val_loss by default) stops
# improving for 5 epochs, and roll back to the best weights seen.
cb = callbacks.EarlyStopping(patience = 5, restore_best_weights = True)

model.summary()

model.compile(
    optimizer = optimizers.Adam(0.0001),
    loss = losses.BinaryCrossentropy(),
    metrics = ['accuracy'],
)

history = model.fit(
    Xtrain, ytrain,
    validation_data = (Xval, yval),
    epochs = 256,
    callbacks = cb,
)
# Loss and accuracy on the held-out test set.
model.evaluate(Xtest, ytest)

# Confusion matrix of the thresholded predictions, normalized per true
# class so each row sums to 1. (confusion_matrix already returns an
# integer array, and NumPy true division yields floats, so no explicit
# cast is needed.)
predicted_labels = (model.predict(Xtest) > 0.5).astype('int64')
cm = confusion_matrix(ytest, predicted_labels)
cm = cm / cm.sum(axis=1)[:, np.newaxis]
# Render the normalized confusion matrix as an annotated heatmap.
fig = plt.figure(figsize = (5, 5))
ax = fig.add_subplot(111)

# Annotate every cell with its value; white text on dark (high-value)
# cells for readability.
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        clr = "white" if cm[i, j] > 0.8 else "black"
        ax.text(j, i, format(cm[i, j], '.2f'), horizontalalignment="center", color=clr)

_ = ax.imshow(cm, cmap=plt.cm.Blues)

# sklearn's confusion_matrix orders classes ascending: row/column 0 is
# class 0 (False) and row/column 1 is class 1 (True). The original labels
# were reversed (['True', 'False']), mislabeling every cell.
ax.set_xticks(range(2))
ax.set_yticks(range(2))
ax.set_xticklabels(['False', 'True'], rotation = 90)
ax.set_yticklabels(['False', 'True'])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()
Plotting the metrics¶
def plot(history, variable, variable2):
    """Plot two training-history curves against the epoch index.

    Parameters
    ----------
    history : dict
        The ``History.history`` mapping of metric name -> list of
        per-epoch values.
    variable, variable2 : str
        Keys in ``history`` to plot (typically a metric and its
        ``val_`` counterpart).
    """
    # Start a fresh figure so consecutive calls don't draw on top of
    # each other (the original overlaid every call on one set of axes).
    plt.figure()
    plt.plot(range(len(history[variable])), history[variable])
    plt.plot(range(len(history[variable2])), history[variable2])
    plt.legend([variable, variable2])
    plt.title(variable)
    plt.xlabel('epoch')
    plt.show()
# Visualize the loss and accuracy curves for training vs validation.
for metric, val_metric in (("loss", "val_loss"), ("accuracy", "val_accuracy")):
    plot(history.history, metric, val_metric)
Prediction¶
# Pick one random test sample and inspect the model's prediction on it.
x = random.randint(0, len(Xtest) - 1)
probability = model.predict(Xtest[x].reshape(1, -1))[0][0]
pred = (probability > 0.5).astype('int64')
output = probability

print("Predicted: ", bool(pred), "(", output, "-->", pred, ")")
print("True: ", bool(ytest[x]))
deepC¶
# Save the trained Keras model to HDF5 so deepCC can compile it.
model.save('online_shopper.h5')
!deepCC online_shopper.h5
# Pick a random test sample and write it to a text file, one feature
# value per line, as input for the compiled executable.
x = random.randint(0, len(Xtest) - 1)
np.savetxt('sample.data', Xtest[x]) # xth sample into text file
# run exe with input
!online_shopper_deepC/online_shopper.exe sample.data
# Recompute the Keras model's prediction for the same sample so the two
# outputs can be compared side by side.
output = model.predict(Xtest[x].reshape(1, -1))[0][0]
predm = (output>0.5).astype('int64')
# show predicted output
# Read the output file written by the deepC executable and threshold it
# the same way as the Keras prediction.
nn_out = np.loadtxt('deepSea_result_1.out')
pred = (nn_out>0.5).astype('int64')
print("Predicted (deepC): ", bool(pred), "(", nn_out, "-->", pred, ")")
print("Predicted (model): ", bool(predm), "(", output, "-->", predm, ")")
print("True: ", bool(ytest[x]))