Extracted Notebook Content
```python
#Installing packages
!pip install squarify
!pip install statsmodels
!pip install seaborn
!pip install xgboost
```
## Code:
```python
#importing libraries
import numpy as np
import pandas as pd
import os
from statsmodels import api as sm
import pylab as py
import matplotlib.pyplot as plt
import matplotlib.dates as dates
from datetime import datetime
import matplotlib.ticker as ticker
import matplotlib.cm as cm
import matplotlib as mpl
from matplotlib.gridspec import GridSpec
import seaborn as sns
import squarify
from scipy.stats import kstest, norm
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from xgboost import plot_importance
from sklearn.utils import resample
from sklearn import metrics
from scipy.stats import chi2_contingency
```
## Markdown:
## **Dataset**
## Code:
```python
def reduce_mem_usage(df, verbose=True):
    # Downcast each numeric column to the smallest dtype that fits its value range
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtype
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Memory usage reduced from {:.2f} MB to {:.2f} MB'.format(start_mem, end_mem))
    return df
```
## Code:
```python
df = pd.read_csv('2019-Nov.csv')
```
## Code:
```python
df=reduce_mem_usage(df)
```
## Code:
```python
df.head()
```
## Code:
```python
df.info()
```
## Markdown:
# Rows and columns
The df.info() output above summarizes the number of rows and columns in the dataset.
## Code:
```python
#no of rows with null values
print("category_code ",df['category_code'].isnull().sum())
print("brand ",df['brand'].isnull().sum())
print("Both ",(df['category_code'].isnull() & df['brand'].isnull()).sum())
```
## Markdown:
Since we have ample data, we drop the rows with null values; the dataset is reduced to about 40 million (4 crore) rows.
## Code:
```python
df = df.dropna()
```
## Code:
```python
df.shape
```
## Markdown:
# Number of visitors by date
To analyze the number of visitors by date, we group the dataset by the event_time and user_id columns. The number of unique visitors on each date is extracted and shown in the graph below.
## Code:
```python
# Number of visitors by date
data = df.loc[:, ['event_time', 'user_id']]
# Extract only the date part of the timestamp
data['event_time'] = data['event_time'].apply(lambda s: str(s)[0:10])
visitor_by_date = data.drop_duplicates().groupby(['event_time'])['user_id'].agg(['count']).sort_values(by=['event_time'], ascending=True)
x = pd.Series(visitor_by_date.index.values).apply(lambda s: datetime.strptime(s, '%Y-%m-%d').date())
y = visitor_by_date['count']
plt.rcParams['figure.figsize'] = (20, 8)
plt.plot(x, y)
plt.show()
```
## Markdown:
# Most bought brand
## Code:
```python
print(df['brand'].value_counts())
print(df['event_type'].value_counts())
```
## Code:
```python
title_type = df.groupby('brand').agg('count')
print(title_type)
type_labels = title_type.user_id.sort_values().index
type_counts = title_type.user_id.sort_values()
plt.figure(1, figsize=(20, 10))
the_grid = GridSpec(2, 2)
cmap = plt.get_cmap('Spectral')
colors = [cmap(i) for i in np.linspace(0, 1, 8)]
plt.subplot(the_grid[0, 1], aspect=1, title='Brand titles')
type_show_ids = plt.pie(type_counts, labels=type_labels, autopct='%1.1f%%', shadow=True, colors=colors)
plt.show()
```
## Markdown:
The above *pie chart* shows the popularity of brands in the market, with *Samsung* being the top brand.
## Markdown:
# Popular product categories
A squarify (treemap) plot is used to visualize which product categories have drawn the most demand from customers. Most items carry a two-level category code separated by a period: the first word is the item's main category and the second is its subcategory.
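For example, splitting one such code in Python (an illustrative sketch; the category value shown is a plausible sample, not taken from the output below):
## Code:
```python
# Hypothetical category_code value used purely for illustration
code = "electronics.smartphone"
main_category, sub_category = code.split(".")[:2]
print(main_category)  # electronics
print(sub_category)   # smartphone
```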
## Code:
```python
top_category_n = 30
top_category = df.loc[:, 'category_code'].value_counts()[:top_category_n].sort_values(ascending=False)
squarify.plot(sizes=top_category, label=top_category.index.array,
              color=["red", "cyan", "green", "orange", "blue", "grey"], alpha=.7)
plt.axis('off')
plt.show()
```
## Markdown:
"Smartphones" which comes under electronics goods are more popular.
A huge fraction of items bought are electronics which concludes there have been
major discounts and price deals available on ecommerce platform.
## Code:
```python
labels = ['view', 'cart', 'purchase']
size = df['event_type'].value_counts()
colors = ['yellowgreen', 'lightskyblue', 'lightcoral']
explode = [0, 0.1, 0.1]
plt.rcParams['figure.figsize'] = (8, 8)
plt.pie(size, colors=colors, explode=explode, labels=labels, shadow=True, autopct='%.2f%%')
plt.title('Event_Type', fontsize = 20)
plt.axis('off')
plt.legend()
plt.show()
```
## Markdown:
# Conversion Rates
We have three types of events: view, add to cart, and purchase. Not every user who views a product adds it to the cart and purchases it; most users only look at a product and its price. Conversion rates give us an idea of how many users actually purchased a product as opposed to how many times products were viewed or added to the cart, and how many products were purchased as opposed to the number added to the cart. For example, the view-to-purchase rate is (purchase events / view events) × 100. We compute these rates below.
## Code:
```python
# Index value_counts by label rather than position, which is more robust
event_counts = df['event_type'].value_counts()
view_count = event_counts['view']
cart_count = event_counts['cart']
purchase_count = event_counts['purchase']
print("Rate of conversion between view and purchase events: " + str((purchase_count / view_count) * 100) + '%')
print("Rate of conversion between view and add-to-cart events: " + str((cart_count / view_count) * 100) + '%')
print("Rate of conversion between add-to-cart and purchase events: " + str((purchase_count / cart_count) * 100) + '%')
```
## Markdown:
The rate of conversion from view to purchase is 1.67%, and the rate of conversion to buying an item once it is added to the cart is 31.16%. There are some cases where no cart event is recorded before a purchase, which indicates that some customers buy a product directly without adding it to the cart.
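One way to spot such direct purchases (an illustrative sketch, not part of the original analysis, assuming user_session groups the events of a single visit):
## Code:
```python
# For each session, collect the set of event types and flag sessions that
# contain a purchase without any cart event.
session_events = df.groupby('user_session')['event_type'].agg(set)
direct_purchases = session_events.apply(lambda s: 'purchase' in s and 'cart' not in s)
print("Sessions with a purchase but no cart event:", direct_purchases.sum())
```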
## Markdown:
# Brandwise sales of all event types
## Code:
```python
#Brandwise sales of all event types
df['brand'].value_counts().head(50).plot.bar(figsize = (18,7))
plt.title('Top brand',fontsize = 20)
plt.xlabel('Names of brand')
plt.ylabel('Count')
plt.show()
```
## Markdown:
From the above plot we infer that "Samsung" is the top brand when all event types (view, cart, and purchase) are considered.
## Markdown:
Next we consider only purchase events, which tells us which brand leads the market in actual sales.
## Code:
```python
d = df.loc[df['event_type'].isin(['purchase'])].drop_duplicates()
print(d['brand'].value_counts())
d['brand'].value_counts().head(70).plot.bar(figsize =(18,7))
plt.xlabel('Names of brand')
plt.ylabel('Count')
plt.show()
```
## Markdown:
* As seen in the graph, Samsung is again the market leader, closely followed by Apple.
* A number of brands with only one product sale each, including Cameo, Imetec, and Zapco, occupy the last positions.
## Code:
```python
brand_counts = df['brand'].value_counts()
top_player = brand_counts.iloc[0]
second_player = brand_counts.iloc[1]
last_player = brand_counts.iloc[-1]
# Subtract 1 before scaling so the figure matches the "% more sales" claim
print("Top brand saw " + str(((top_player / second_player) - 1) * 100) + "% more sales than the second player in the market")
print("Top brand saw " + str(((top_player / last_player) - 1) * 100) + "% more sales than the bottom player in the market")
```
## Markdown:
# Purchase path
The standard idea is that most people first view an item, compare it with other items, and add it to the cart before buying a specific item. In practice, not many people follow this path.
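As a rough check (an illustrative sketch, not part of the original notebook), one could count how many purchasing sessions actually contain all three event types:
## Code:
```python
# Among sessions containing a purchase, measure the share that also contain
# both a view and a cart event (the full view -> cart -> purchase path).
events_per_session = df.groupby('user_session')['event_type'].agg(set)
purchase_sessions = events_per_session[events_per_session.apply(lambda s: 'purchase' in s)]
full_path_share = purchase_sessions.apply(lambda s: {'view', 'cart', 'purchase'} <= s).mean()
print("Share of purchase sessions following the full path:", full_path_share)
```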
## Code:
```python
df.loc[df.user_session == "ef3daa59-4936-43e5-a530-32902f64b2f4"].sort_values(by="event_time")
```
## Markdown:
# User's journey
The code below shows a user who purchased an Apple product and afterwards views other products manufactured by the same company, Apple.
## Code:
```python
user_ID = 518267348
df.loc[df['user_id'] == user_ID]
```
## Markdown:
The user below views an Android phone, purchases it, and then goes on to view other Apple products (a clock and a phone), eventually buying the Apple clock. The inference is that Apple customers show brand loyalty: Apple customers tend to view only other Apple products, whereas Android customers also view products from other companies.
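One way to probe this brand-loyalty claim (an illustrative sketch, not from the original notebook; 'apple' and 'samsung' are example brand values) is to compare, for buyers of a given brand, the share of their view events that belong to that same brand:
## Code:
```python
def brand_view_share(df, brand):
    # Users who purchased the given brand
    buyers = df.loc[(df['event_type'] == 'purchase') & (df['brand'] == brand), 'user_id'].unique()
    # Of those users' view events, the fraction that are on the same brand
    views = df.loc[(df['event_type'] == 'view') & (df['user_id'].isin(buyers))]
    return (views['brand'] == brand).mean()

print("apple:", brand_view_share(df, 'apple'))
print("samsung:", brand_view_share(df, 'samsung'))
```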
## Code:
```python
user_ID = 513351129
df.loc[df['user_id'] == user_ID]
```
## Markdown:
# Preparing data
We want to predict whether a product added to the cart is actually purchased by the customer, based on factors such as its category, the weekday of the event, and the user's activity in that session:
* category_code_level1 - category
* category_code_level2 - subcategory
* event_weekday - weekday of the event
* activity_count - number of activities in that session
The training dataset contains every non-duplicated cart transaction with the above-mentioned features. We will use these features, together with the original price and brand, to predict whether the customer will eventually purchase the item in the cart.
## Markdown:
# List of users who have bought or added products to the cart
## Code:
```python
# List of users who have bought or added products to the cart
cart_purchase_users = df.loc[df["event_type"].isin(["cart", "purchase"])].drop_duplicates(subset=['user_id'])
cart_purchase_users.dropna(how='any', inplace=True)
print(cart_purchase_users)
```
## Markdown:
# All activities of the above users, including view events
## Code:
```python
cart_purchase_users_all_activity = df.loc[df['user_id'].isin(cart_purchase_users['user_id'])]
print(cart_purchase_users_all_activity)
```
## Markdown:
# Counting the number of activities in each session
## Code:
```python
activity_in_session = cart_purchase_users_all_activity.groupby(['user_session'])['event_type'].count().reset_index()
activity_in_session = activity_in_session.rename(columns={"event_type": "activity_count"})
print(activity_in_session)
```
## Markdown:
Extract the event date from the event_time column to find on which date each activity occurs.
## Code:
```python
def convert_time_to_date(utc_timestamp):
    # Keep only the date part (YYYY-MM-DD) of the UTC timestamp string
    utc_date = datetime.strptime(utc_timestamp[0:10], '%Y-%m-%d').date()
    return utc_date
```
## Code:
```python
df['event_date'] = df['event_time'].apply(lambda s:convert_time_to_date(s))
```
## Markdown:
The category and subcategory are split apart by string handling on the period separator.
## Code:
```python
df_targets = df.loc[df["event_type"].isin(["cart", "purchase"])].drop_duplicates(subset=['event_type', 'product_id', 'price', 'user_id', 'user_session'])
df_targets["is_purchased"] = np.where(df_targets["event_type"] == "purchase", 1, 0)
df_targets["is_purchased"] = df_targets.groupby(["user_session", "product_id"])["is_purchased"].transform("max")
df_targets = df_targets.loc[df_targets["event_type"] == 'cart'].drop_duplicates(["user_session", "product_id", "is_purchased"])
df_targets['event_weekday'] = df_targets['event_date'].apply(lambda s: s.weekday())
df_targets.dropna(how='any', inplace=True)
# category_code levels are separated by a period (e.g. electronics.smartphone)
df_targets["category_code_level1"] = df_targets["category_code"].str.split(".", expand=True)[0].astype('category')
df_targets["category_code_level2"] = df_targets["category_code"].str.split(".", expand=True)[1].astype('category')
```
## Code:
```python
df_targets = df_targets.merge(activity_in_session,on = 'user_session',how ='left')
df_targets['activity_count'] = df_targets['activity_count'].fillna(0)
df_targets.head()
```
## Code:
```python
df_targets.info()
```
## Code:
```python
# Saving a copy of the preprocessed data (index=False avoids writing an extra index column)
df_targets.to_csv('training_data.csv', index=False)
```
## Code:
```python
df_targets = pd.read_csv('training_data.csv')
```
## Code:
```python
df_targets.head()
```
## Markdown:
# Resampling the data to have an equal number of purchased and not-purchased items
The number of rows where the item was purchased is around 500,000 (5 lakh), while the number of not-purchased rows is around 800,000 (8 lakh).
## Code:
```python
is_purchase_set = df_targets[df_targets['is_purchased'] == 1]
is_purchase_set.shape[0]
```
## Code:
```python
not_purchase_set = df_targets[df_targets['is_purchased'] == 0]
not_purchase_set.shape[0]
```
## Code:
```python
n_samples = 500000
is_purchase_downsampled = resample(is_purchase_set, replace=False, n_samples=n_samples, random_state=27)
not_purchase_set_downsampled = resample(not_purchase_set, replace=False, n_samples=n_samples, random_state=27)
```
## Code:
```python
downsampled = pd.concat([is_purchase_downsampled,not_purchase_set_downsampled])
downsampled['is_purchased'].value_counts()
```
## Code:
```python
features = downsampled[['brand', 'price', 'event_weekday', 'category_code_level1', 'category_code_level2', 'activity_count']]
```
## Markdown:
# Encoding categorical attributes
## Code:
```python
features.loc[:, 'brand'] = LabelEncoder().fit_transform(downsampled.loc[:, 'brand'].copy())
features.loc[:, 'event_weekday'] = LabelEncoder().fit_transform(downsampled.loc[:, 'event_weekday'].copy())
features.loc[:, 'category_code_level1'] = LabelEncoder().fit_transform(downsampled.loc[:, 'category_code_level1'].copy())
features.loc[:, 'category_code_level2'] = LabelEncoder().fit_transform(downsampled.loc[:, 'category_code_level2'].copy())
is_purchased = LabelEncoder().fit_transform(downsampled['is_purchased'])
features.head()
```
## Code:
```python
features.info()
```
## Markdown:
# Hypothesis testing - attribute dependence
## Code:
```python
df.head()
```
## Markdown:
# Chi-square test - association between two attributes
Weekday vs price
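As a reminder (the standard definition, not from the original notebook), for a contingency table with observed counts $O_{ij}$ and expected counts $E_{ij}$ under independence, the test statistic is

$$\chi^2 = \sum_{i,j} \frac{(O_{ij} - E_{ij})^2}{E_{ij}},$$

and a small p-value leads us to reject the null hypothesis H0 that the two attributes are independent.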
## Code:
```python
print()
print("Chi square test")
#Event weekday and price
table = pd.crosstab(features['event_weekday'],features['price'],margins = False)
stat,p,dof,expected = chi2_contigency(table)
alpha = 0.05
print("For weekday of the event and price")
print("p value is " +str(p))
if p >= alpha:
print("No significant association between these attributes -H0 holds true")
else:
print("significant association between these attributes -H0 is rejected")
```
## Markdown:
user_id vs category_id
## Code:
```python
d1 = df[:100000]
table1 = pd.crosstab(d1['category_id'], d1['user_id'], margins=False)
stat, p, dof, expected = chi2_contingency(table1)
alpha = 0.05
print("For category id and user id")
print("p value is " + str(p))
if p >= alpha:
    print("No significant association between these attributes - H0 holds true")
else:
    print("Significant association between these attributes - H0 is rejected")
```
## Markdown:
# Time series analysis
## Code:
```python
# Inspect the data before plotting; the event_date column derived earlier provides the date of each event
df.head()
```
## Code:
```python
# Plot the number of events per day, using the event_date column derived earlier
timeseries_df = df.groupby('event_date')['event_type'].count()
plt.plot(timeseries_df.index, timeseries_df.values)
plt.xlabel('Date')
plt.ylabel('Number of events')
plt.show()
```
## Markdown:
# Covariance matrix
## Code:
```python
matrix = downsampled[['brand', 'price', 'event_weekday', 'category_code_level1', 'category_code_level2', 'activity_count', 'is_purchased']]
matrix.loc[:, 'brand'] = LabelEncoder().fit_transform(downsampled.loc[:, 'brand'].copy())
matrix.loc[:, 'event_weekday'] = LabelEncoder().fit_transform(downsampled.loc[:, 'event_weekday'].copy())
matrix.loc[:, 'category_code_level1'] = LabelEncoder().fit_transform(downsampled.loc[:, 'category_code_level1'].copy())
matrix.loc[:, 'category_code_level2'] = LabelEncoder().fit_transform(downsampled.loc[:, 'category_code_level2'].copy())
matrix.head()
```
## Code:
```python
cov_matrix = matrix.cov()
sns.heatmap(cov_matrix,annot = True)
plt.show()
```
## Markdown:
# Correlation matrix
## Code:
```python
corr_matrix = matrix.corr()
sns.heatmap(corr_matrix,annot = True)
plt.show()
```
## Markdown:
# ML models
## Code:
```python
# The original train/test split code was lost in extraction; this is a minimal
# reconstruction (test_size and random_state are assumed values).
X_train, X_test, y_train, y_test = train_test_split(
    features, is_purchased, test_size=0.3, random_state=42)
```
## Markdown:
# Decision tree classification
## Code:
```python
# The original classifier code was lost in extraction; a standard
# DecisionTreeClassifier sketch is assumed here.
from sklearn.tree import DecisionTreeClassifier

model = DecisionTreeClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
```
## Code:
```python
print("Accuracy",metrics.accuracy_score(y_test,y_pred))
print("Precision",metrics.precision_score(y_test,y_pred))
print("Recall",metrics.recall_score(y_test,y_pred))
print("fbeta",metrics.fbeta_score(y_test,y_pred,average = 'weighted',beta=0.5))
```
## Markdown:
# XGBoost classification
## Code:
```python
# The original XGBoost training code was lost in extraction; a standard
# XGBClassifier sketch is assumed here.
from xgboost import XGBClassifier

model = XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
```
## Code:
```python
print("Accuracy",metrics.accuracy_score(y_test,y_pred))
print("Precision",metrics.precision_score(y_test,y_pred))
print("Recall",metrics.recall_score(y_test,y_pred))
print("fbeta",metrics.fbeta_score(y_test,y_pred,average = 'weighted',beta=0.5))
```
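## Markdown:
Since plot_importance is imported at the top of the notebook, the fitted model's feature importances can also be visualized. A minimal sketch, assuming `model` is the fitted XGBClassifier from the block above:
## Code:
```python
# Plot feature importances of the fitted XGBoost model
plot_importance(model)
plt.show()
```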
## Markdown:
# Logistic regression
## Code:
```python
# The original logistic regression code was lost in extraction; a standard
# scikit-learn sketch is assumed here (scaling with the imported MinMaxScaler
# is an assumption, not confirmed by the source).
from sklearn.linear_model import LogisticRegression

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
model = LogisticRegression(max_iter=1000)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)
```
## Code:
```python
print("Accuracy",metrics.accuracy_score(y_test,y_pred))
print("Precision",metrics.precision_score(y_test,y_pred))
print("Recall",metrics.recall_score(y_test,y_pred))
print("fbeta",metrics.fbeta_score(y_test,y_pred,average = 'weighted',beta=0.5))
```