SalesDataAnalysis__1693296057
SalesDataAnalysis__1693296057
[2]: df = pd.read_csv('sales_data.csv')
[3]: df.head()
[4]: df.isnull().sum()
1
Quantity Ordered 0
Price Each 0
Cost price 0
turnover 0
margin 0
dtype: int64
[5]: df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185950 entries, 0 to 185949
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Order Date 185950 non-null object
1 Order ID 185950 non-null int64
2 Product 185950 non-null object
3 Product_ean 185950 non-null float64
4 catégorie 185950 non-null object
5 Purchase Address 185950 non-null object
6 Quantity Ordered 185950 non-null int64
7 Price Each 185950 non-null float64
8 Cost price 185950 non-null float64
9 turnover 185950 non-null float64
10 margin 185950 non-null float64
dtypes: float64(5), int64(2), object(4)
memory usage: 15.6+ MB
[9]: df.head(1)
2
[10]: df['Purchase Address'].head(1)
[12]: df.head(1)
[14]: df.head(1)
[14]: Product catégorie Quantity Ordered Price Each Cost price turnover \
0 iPhone Vêtements 1 700.0 231.0 700.0
[15]: df['Product'].value_counts()
3
LG Washing Machine 666
LG Dryer 646
Name: Product, dtype: int64
return 'Headphones'
elif x in ['27in FHD Monitor','27in 4K Gaming Monitor','34in Ultrawide Monitor','Flatscreen TV','20in Monitor']:
[18]: df['Product'].value_counts()
[19]: df.head(1)
[20]: df.rename(columns={'catégorie':'Category'},inplace=True)
4
[21]: df.head(1)
[22]: df['Category'].value_counts()
[23]: data_mapping = {
'Vêtements': 'Clothes',
'Électronique': 'Electronics'
}
df['Category'] = df['Category'].map(data_mapping).fillna(df['Category'])
[24]: df['Category'].value_counts()
[25]: df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185950 entries, 0 to 185949
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Product 185950 non-null object
1 Category 185950 non-null object
2 Quantity Ordered 185950 non-null int64
3 Price Each 185950 non-null float64
4 Cost price 185950 non-null float64
5 turnover 185950 non-null float64
6 margin 185950 non-null float64
7 Order Year 185950 non-null object
8 Order Month 185950 non-null object
9 Purchase City 185950 non-null object
dtypes: float64(4), int64(1), object(5)
5
memory usage: 14.2+ MB
[27]: df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 185950 entries, 0 to 185949
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Product 185950 non-null object
1 Category 185950 non-null object
2 Quantity Ordered 185950 non-null int64
3 Price Each 185950 non-null float64
4 Cost price 185950 non-null float64
5 turnover 185950 non-null float64
6 margin 185950 non-null float64
7 Order Year 185950 non-null int32
8 Order Month 185950 non-null int32
9 Purchase City 185950 non-null object
dtypes: float64(4), int32(2), int64(1), object(3)
memory usage: 12.8+ MB
[28]: df.head()
6
sns.countplot(y=var,data=df,ax=axs[i])
axs[i].set_title(var)
fig.tight_layout()
plt.show()
7
for i , var in enumerate (num):
df[var].plot.hist(ax=axs[i])
axs[i].set_title(var)
fig.tight_layout()
plt.show()
8
axs[i].set_title(var)
fig.tight_layout()
plt.show()
9
if col < len(axs):
for i in range(col, len(axs)):
fig.delaxes(axs[i])
fig.tight_layout()
plt.show()
fig.tight_layout()
plt.show()
10
[34]: num = ['Quantity Ordered','Price Each','Cost price','turnover','margin']
col = len(num)
fig.tight_layout()
plt.show()
11
[35]: for col in df.select_dtypes(include=['object']).columns:
print(f'{col}: {df[col].unique()}')
Product: [5 1 3 6 0 4 2]
Category: [1 0 3 2]
12
Purchase City: [2 6 7 4 1 0 8 5 3]
[41]: plt.figure(figsize=(25,20))
sns.heatmap(df.corr(), fmt='.2g', annot=True)
plt.show()
[48]: correlation(df,0.7)
13
[48]: {'Cost price', 'margin', 'turnover'}
[52]: plt.figure(figsize=(15,10))
sns.heatmap(df.corr(), fmt='.2g', annot=True)
plt.show()
[ ]:
14