0% found this document useful (0 votes)
8 views

Final

Uploaded by

asharyg5752
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
8 views

Final

Uploaded by

asharyg5752
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
You are on page 1/ 2

import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

# 1. Load Data
# Each input file is read into its own DataFrame; date columns are
# converted while reading so later steps can work with datetimes directly.

# transactions.csv -- 'order_date' is parsed as a datetime column.
transactions = pd.read_csv(
    'transactions.csv',
    parse_dates=['order_date'],
)

# product_catalog.xlsx -- read with pandas' default Excel settings.
products = pd.read_excel('product_catalog.xlsx')

# customers.json -- 'signup_date' is converted to datetimes.
customers = pd.read_json(
    'customers.json',
    convert_dates=['signup_date'],
)

# 2. Clean Transactions Data

# Handle missing values
# A missing quantity is treated as a single unit.
transactions['quantity'] = transactions['quantity'].fillna(1)

# A missing unit price is replaced by the median unit price observed for
# the same product_id; transform() keeps the result aligned with the
# original rows.
prices_by_product = transactions.groupby('product_id')['unit_price']
transactions['unit_price'] = prices_by_product.transform(
    lambda prices: prices.fillna(prices.median())
)

# Convert data types
transactions['order_id'] = transactions['order_id'].astype('category')
transactions['customer_id'] = transactions['customer_id'].astype('int32')

# 3. Merge Datasets
# Left joins keep every transaction row, even when the product or the
# customer has no match in the lookup tables.
product_columns = products[['product_id', 'category', 'cost_price']]
customer_columns = customers[['customer_id', 'signup_date', 'tier']]

# Step 1: attach product attributes by product_id.
merged_data = transactions.merge(product_columns, on='product_id', how='left')

# Step 2: attach customer attributes by customer_id.
merged_data = merged_data.merge(customer_columns, on='customer_id', how='left')

# 4. Feature Engineering

# Revenue per transaction line: units sold times the unit price.
merged_data['total_sales'] = merged_data['quantity'] * merged_data['unit_price']

# Profit per transaction line: per-unit margin (unit price minus cost
# price) times units sold. Rows whose product had no catalog match carry a
# missing cost_price after the left join, so their profit is NaN.
merged_data['profit'] = (
    (merged_data['unit_price'] - merged_data['cost_price'])
    * merged_data['quantity']
)

# 5. Visualization

# 6. Monthly Sales Analysis
# Bucket the rows into calendar months keyed on order_date, then total
# sales and profit within each month.
# NOTE(review): recent pandas releases deprecate the 'M' alias in favour of
# 'ME' -- confirm against the pandas version this script targets.
by_month = merged_data.resample('M', on='order_date')
monthly_sales = by_month['total_sales'].sum()
monthly_profit = by_month['profit'].sum()

# Draw both monthly series on one figure so the trends can be compared.
plt.figure(figsize=(12, 6))
monthly_sales.plot(label='Sales')
monthly_profit.plot(label='Profit')

plt.title('Monthly Sales and Profit Trends')
plt.ylabel('USD')
plt.legend()
plt.show()

# 7. Product Performance Analysis
# Per product: total sales, total units sold and the mean profit per
# transaction line.
product_summary = merged_data.groupby('product_id').agg(
    {'total_sales': 'sum', 'quantity': 'sum', 'profit': 'mean'}
)

# Keep the ten products with the highest total sales, best first.
product_performance = product_summary.sort_values(
    'total_sales', ascending=False
).head(10)

# 8. Customer Segmentation
# One row per customer: number of distinct orders, total sales, and the
# first signup_date found among that customer's rows.
customer_loyalty = (
    merged_data.groupby('customer_id')
    .agg({'order_id': 'nunique', 'total_sales': 'sum', 'signup_date': 'first'})
    .rename(columns={'order_id': 'purchase_count'})
)

# Cohort = the calendar month in which the customer signed up.
customer_loyalty['cohort'] = customer_loyalty['signup_date'].dt.to_period('M')

# Lifetime = whole months between the signup cohort and the fixed
# reference month 2023-12. Subtracting periods yields offset objects whose
# .n attribute is the month count.
reference_month = pd.Period('2023-12', freq='M')
months_since_signup = reference_month - customer_loyalty['cohort']
customer_loyalty['lifetime_months'] = months_since_signup.apply(
    lambda offset: offset.n
)

# 9. Data Validation

# Check for negative profits
# Rows sold below cost are reported as a warning rather than treated as an
# error, so the script keeps running.
negative_profit = merged_data[merged_data['profit'] < 0]

if not negative_profit.empty:
    print(f"Warning: {len(negative_profit)} transactions with negative profit")

# 10. Verify data completeness
# After the left join, a missing category means the transaction's
# product_id had no usable entry in the product catalog.
# Note: assert statements are skipped when Python runs with -O.
assert merged_data['category'].isna().sum() == 0, "Missing product categories exist"

You might also like