0% found this document useful (0 votes)
12 views

SalesDataAnalysis__1693296057

Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
12 views

SalesDataAnalysis__1693296057

Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 14

SalesDataAnalysis

August 27, 2023

[1]: import numpy as np


import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

[2]: df = pd.read_csv('sales_data.csv')

[3]: df.head()

[3]: Order Date Order ID Product Product_ean \


0 2019-01-22 21:25:00 141234 iPhone 5.638009e+12
1 2019-01-28 14:15:00 141235 Lightning Charging Cable 5.563320e+12
2 2019-01-17 13:33:00 141236 Wired Headphones 2.113973e+12
3 2019-01-05 20:33:00 141237 27in FHD Monitor 3.069157e+12
4 2019-01-25 11:59:00 141238 Wired Headphones 9.692681e+12

catégorie Purchase Address Quantity Ordered \


0 Vêtements 944 Walnut St, Boston, MA 02215 1
1 Alimentation 185 Maple St, Portland, OR 97035 1
2 Vêtements 538 Adams St, San Francisco, CA 94016 2
3 Sports 738 10th St, Los Angeles, CA 90001 1
4 Électronique 387 10th St, Austin, TX 73301 1

Price Each Cost price turnover margin


0 700.00 231.0000 700.00 469.0000
1 14.95 7.4750 14.95 7.4750
2 11.99 5.9950 23.98 11.9900
3 149.99 97.4935 149.99 52.4965
4 11.99 5.9950 11.99 5.9950

[4]: df.isnull().sum()

[4]: Order Date 0


Order ID 0
Product 0
Product_ean 0
catégorie 0
Purchase Address 0

1
Quantity Ordered 0
Price Each 0
Cost price 0
turnover 0
margin 0
dtype: int64

[5]: df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185950 entries, 0 to 185949
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Order Date 185950 non-null object
1 Order ID 185950 non-null int64
2 Product 185950 non-null object
3 Product_ean 185950 non-null float64
4 catégorie 185950 non-null object
5 Purchase Address 185950 non-null object
6 Quantity Ordered 185950 non-null int64
7 Price Each 185950 non-null float64
8 Cost price 185950 non-null float64
9 turnover 185950 non-null float64
10 margin 185950 non-null float64
dtypes: float64(5), int64(2), object(4)
memory usage: 15.6+ MB

[6]: df['Order Date'].head(1)

[6]: 0 2019-01-22 21:25:00


Name: Order Date, dtype: object

# In [7]-[8]:
# 'Order Date' holds strings such as '2019-01-22 21:25:00'. Keep the part
# before the space, split it on '-' once, and reuse the pieces for both
# derived columns (both remain strings at this point).
date_fields = df['Order Date'].str.split(' ').str[0].str.split('-')
df['Order Year'] = date_fields.str[0]
df['Order Month'] = date_fields.str[1]

[9]: df.head(1)

[9]: Order Date Order ID Product Product_ean catégorie \


0 2019-01-22 21:25:00 141234 iPhone 5.638009e+12 Vêtements

Purchase Address Quantity Ordered Price Each Cost price \


0 944 Walnut St, Boston, MA 02215 1 700.0 231.0

turnover margin Order Year Order Month


0 700.0 469.0 2019 01

2
[10]: df['Purchase Address'].head(1)

[10]: 0 944 Walnut St, Boston, MA 02215


Name: Purchase Address, dtype: object

# In [11]:
# An address looks like '944 Walnut St, Boston, MA 02215'; the city is the
# second comma-separated field. Splitting on ',' alone keeps the blank that
# follows the comma (' Boston'), so strip the surrounding whitespace.
df['Purchase City'] = df['Purchase Address'].str.split(',').str[1].str.strip()

[12]: df.head(1)

[12]: Order Date Order ID Product Product_ean catégorie \


0 2019-01-22 21:25:00 141234 iPhone 5.638009e+12 Vêtements

Purchase Address Quantity Ordered Price Each Cost price \


0 944 Walnut St, Boston, MA 02215 1 700.0 231.0

turnover margin Order Year Order Month Purchase City


0 700.0 469.0 2019 01 Boston

# In [13]:
# Drop the raw date and address columns (their year, month and city parts
# were extracted into separate columns) together with the order-ID and EAN
# columns. `columns=` already names the axis, so no `axis=` argument is
# needed.
df.drop(
    columns=['Order Date', 'Order ID', 'Product_ean', 'Purchase Address'],
    inplace=True,
)

[14]: df.head(1)

[14]: Product catégorie Quantity Ordered Price Each Cost price turnover \
0 iPhone Vêtements 1 700.0 231.0 700.0

margin Order Year Order Month Purchase City


0 469.0 2019 01 Boston

[15]: df['Product'].value_counts()

[15]: USB-C Charging Cable 21903


Lightning Charging Cable 21658
AAA Batteries (4-pack) 20641
AA Batteries (4-pack) 20577
Wired Headphones 18882
Apple Airpods Headphones 15549
Bose SoundSport Headphones 13325
27in FHD Monitor 7507
iPhone 6842
27in 4K Gaming Monitor 6230
34in Ultrawide Monitor 6181
Google Phone 5525
Flatscreen TV 4800
Macbook Pro Laptop 4724
ThinkPad Laptop 4128
20in Monitor 4101
Vareebadd Phone 2065

3
LG Washing Machine 666
LG Dryer 646
Name: Product, dtype: int64

# In [16]:
# Product group -> the exact product names that belong to it.
_PRODUCT_GROUPS = {
    'Charging Cables': ('USB-C Charging Cable', 'Lightning Charging Cable'),
    'Batteries': ('AAA Batteries (4-pack)', 'AA Batteries (4-pack)'),
    'Headphones': ('Wired Headphones', 'Apple Airpods Headphones',
                   'Bose SoundSport Headphones'),
    'Smart Tv': ('27in FHD Monitor', '27in 4K Gaming Monitor',
                 '34in Ultrawide Monitor', 'Flatscreen TV', '20in Monitor'),
    'Smart Phones': ('iPhone', 'Google Phone', 'Vareebadd Phone'),
    'Laptops': ('Macbook Pro Laptop', 'ThinkPad Laptop'),
    'Cleaning Machines': ('LG Washing Machine', 'LG Dryer'),
}


def change(x):
    """Map a product name to its product group.

    Args:
        x: Product name; matching is exact (case- and whitespace-sensitive).

    Returns:
        The group label for ``x``, or ``'Others'`` when ``x`` is not one of
        the listed product names.
    """
    for group, products in _PRODUCT_GROUPS.items():
        if x in products:
            return group
    return 'Others'

[17]: df['Product'] = df['Product'].apply(change)

[18]: df['Product'].value_counts()

[18]: Headphones 47756


Charging Cables 43561
Batteries 41218
Smart Tv 28819
Smart Phones 14432
Laptops 8852
Cleaning Machines 1312
Name: Product, dtype: int64

[19]: df.head(1)

[19]: Product catégorie Quantity Ordered Price Each Cost price \


0 Smart Phones Vêtements 1 700.0 231.0

turnover margin Order Year Order Month Purchase City


0 700.0 469.0 2019 01 Boston

[20]: df.rename(columns={'catégorie':'Category'},inplace=True)

4
[21]: df.head(1)

[21]: Product Category Quantity Ordered Price Each Cost price \


0 Smart Phones Vêtements 1 700.0 231.0

turnover margin Order Year Order Month Purchase City


0 700.0 469.0 2019 01 Boston

[22]: df['Category'].value_counts()

[22]: Sports 46925


Vêtements 46405
Alimentation 46342
Électronique 46278
Name: Category, dtype: int64

# In [23]:
# Translate the French category labels to English. 'Sports' reads the same
# in both languages and therefore needs no entry.
data_mapping = {
    'Vêtements': 'Clothes',
    'Électronique': 'Electronics',
    'Alimentation': 'Food',
}
# map() yields NaN for labels that have no entry in the mapping; fillna()
# puts the original label back for those rows.
df['Category'] = df['Category'].map(data_mapping).fillna(df['Category'])

[24]: df['Category'].value_counts()

[24]: Sports 46925


Clothes 46405
Alimentation 46342
Electronics 46278
Name: Category, dtype: int64

[25]: df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185950 entries, 0 to 185949
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Product 185950 non-null object
1 Category 185950 non-null object
2 Quantity Ordered 185950 non-null int64
3 Price Each 185950 non-null float64
4 Cost price 185950 non-null float64
5 turnover 185950 non-null float64
6 margin 185950 non-null float64
7 Order Year 185950 non-null object
8 Order Month 185950 non-null object
9 Purchase City 185950 non-null object
dtypes: float64(4), int64(1), object(5)

5
memory usage: 14.2+ MB

[26]: df['Order Month'] = df['Order Month'].astype(int)


df['Order Year'] = df['Order Year'].astype(int)

[27]: df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185950 entries, 0 to 185949
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Product 185950 non-null object
1 Category 185950 non-null object
2 Quantity Ordered 185950 non-null int64
3 Price Each 185950 non-null float64
4 Cost price 185950 non-null float64
5 turnover 185950 non-null float64
6 margin 185950 non-null float64
7 Order Year 185950 non-null int32
8 Order Month 185950 non-null int32
9 Purchase City 185950 non-null object
dtypes: float64(4), int32(2), int64(1), object(3)
memory usage: 12.8+ MB

[28]: df.head()

[28]: Product Category Quantity Ordered Price Each Cost price \


0 Smart Phones Clothes 1 700.00 231.0000
1 Charging Cables Alimentation 1 14.95 7.4750
2 Headphones Clothes 2 11.99 5.9950
3 Smart Tv Sports 1 149.99 97.4935
4 Headphones Electronics 1 11.99 5.9950

turnover margin Order Year Order Month Purchase City


0 700.00 469.0000 2019 1 Boston
1 14.95 7.4750 2019 1 Portland
2 23.98 11.9900 2019 1 San Francisco
3 149.99 52.4965 2019 1 Los Angeles
4 11.99 5.9950 2019 1 Austin

# In [29]:
# Count plot for every object-dtype column, one subplot per column.
cat = df.select_dtypes(include='object').columns.tolist()
col = len(cat)

# The grid is `col` rows by 2 columns, i.e. twice as many axes as columns to
# plot; flatten() turns the 2-D array of axes into a 1-D one.
fig, axs = plt.subplots(nrows=col, ncols=2, figsize=(15, 20))
axs = axs.flatten()

for ax, column in zip(axs, cat):
    sns.countplot(y=column, data=df, ax=ax)
    ax.set_title(column)

# Remove the axes that did not receive a plot.
for unused in axs[col:]:
    fig.delaxes(unused)

fig.tight_layout()
plt.show()

# In [30]:
# Histogram (pandas plotting) for every int/float column, one subplot each.
num = df.select_dtypes(include=['int', 'float']).columns.tolist()
col = len(num)

# `col` rows by 2 columns of axes, flattened to a 1-D array.
fig, axs = plt.subplots(nrows=col, ncols=2, figsize=(15, 20))
axs = axs.flatten()

for ax, column in zip(axs, num):
    df[column].plot.hist(ax=ax)
    ax.set_title(column)

# Remove the axes that did not receive a plot.
for unused in axs[col:]:
    fig.delaxes(unused)

fig.tight_layout()
plt.show()

# In [31]:
# Seaborn histogram with a KDE overlay for every int/float column.
num = df.select_dtypes(include=['int', 'float']).columns.tolist()
col = len(num)

# `col` rows by 2 columns of axes, flattened to a 1-D array.
fig, axs = plt.subplots(nrows=col, ncols=2, figsize=(15, 20))
axs = axs.flatten()

for ax, column in zip(axs, num):
    sns.histplot(data=df, x=column, kde=True, ax=ax)
    ax.set_title(column)

# Remove the axes that did not receive a plot.
for unused in axs[col:]:
    fig.delaxes(unused)

fig.tight_layout()
plt.show()

# In [32]:
# Scatter plot of each price column against 'turnover'.
num = ['Price Each', 'Cost price']
col = len(num)

# `col` rows by 2 columns of axes, flattened to a 1-D array.
fig, axs = plt.subplots(nrows=col, ncols=2, figsize=(15, 15))
axs = axs.flatten()

for ax, column in zip(axs, num):
    sns.scatterplot(x=column, y='turnover', data=df, ax=ax)
    ax.set_title(column)

# Remove the axes that did not receive a plot.
for unused in axs[col:]:
    fig.delaxes(unused)

fig.tight_layout()
plt.show()

# In [33]:
# Bar plot of 'Cost price' for each of the three grouping columns.
cat = ['Product', 'Category', 'Purchase City']
col = len(cat)

# `col` rows by 2 columns of axes, flattened to a 1-D array.
fig, axs = plt.subplots(nrows=col, ncols=2, figsize=(15, 15))
axs = axs.flatten()

for ax, column in zip(axs, cat):
    sns.barplot(x='Cost price', y=column, data=df, ax=ax)
    ax.set_title(column)

# Remove the axes that did not receive a plot.
for unused in axs[col:]:
    fig.delaxes(unused)

fig.tight_layout()
plt.show()

10
# In [34]:
# Box plot for each of the listed numeric columns.
num = ['Quantity Ordered', 'Price Each', 'Cost price', 'turnover', 'margin']
col = len(num)

# `col` rows by 2 columns of axes, flattened to a 1-D array.
fig, axs = plt.subplots(nrows=col, ncols=2, figsize=(15, 20))
axs = axs.flatten()

for ax, column in zip(axs, num):
    sns.boxplot(x=column, data=df, ax=ax)
    ax.set_title(column)

# Remove the axes that did not receive a plot.
for unused in axs[col:]:
    fig.delaxes(unused)

fig.tight_layout()
plt.show()

11
# In [35]:
# Print the distinct values of every object-dtype column.
text_columns = df.select_dtypes(include=['object']).columns
for col in text_columns:
    print(f'{col}: {df[col].unique()}')

Product: ['Smart Phones' 'Charging Cables' 'Headphones' 'Smart Tv' 'Batteries'


'Laptops' 'Cleaning Machines']
Category: ['Clothes' 'Alimentation' 'Sports' 'Electronics']
Purchase City: [' Boston' ' Portland' ' San Francisco' ' Los Angeles' ' Austin'
' Atlanta' ' Seattle' ' New York City' ' Dallas']

# In [39]:
from sklearn import preprocessing

# Replace every object-dtype column with integer codes, using a fresh encoder
# per column, and print the codes that occur in each column.
for col in df.select_dtypes(include=['object']).columns:
    label_encoder = preprocessing.LabelEncoder()
    df[col] = label_encoder.fit_transform(df[col])
    print(f'{col}: {df[col].unique()}')

Product: [5 1 3 6 0 4 2]
Category: [1 0 3 2]

12
Purchase City: [2 6 7 4 1 0 8 5 3]

[41]: plt.figure(figsize=(25,20))
sns.heatmap(df.corr(), fmt='.2g', annot=True)
plt.show()

# In [47]:
def correlation(df, threshold):
    """Return the columns that are strongly correlated with an earlier column.

    A column is reported when the absolute value of its correlation with at
    least one column that precedes it in the correlation matrix exceeds
    ``threshold``. The first column of each correlated pair is never reported
    on account of that pair.

    Args:
        df: DataFrame to analyse. Only numeric and boolean columns take part;
            other columns are ignored instead of making ``corr()`` fail.
        threshold: Absolute correlation above which a column is reported.

    Returns:
        A set of column names; empty when no pair exceeds ``threshold``.
    """
    corr_matrix = df.select_dtypes(include=['number', 'bool']).corr()
    # Strictly-lower triangle: entry (i, j) with j < i pairs column i with a
    # column that precedes it, so each pair is looked at once and the
    # diagonal is skipped. NaN correlations compare as False.
    strong = np.tril(corr_matrix.abs().to_numpy() > threshold, k=-1)
    return set(corr_matrix.columns[strong.any(axis=1)])

[48]: correlation(df,0.7)

13
[48]: {'Cost price', 'margin', 'turnover'}

# In [49]:
# Remove the three columns reported by correlation(df, 0.7).
df.drop(columns=['Cost price', 'margin', 'turnover'], inplace=True)

[52]: plt.figure(figsize=(15,10))
sns.heatmap(df.corr(), fmt='.2g', annot=True)
plt.show()

[ ]:

14

You might also like