数据技能(SKILLS) :Day 5:电商核心指标计算与RFM分析
课程目标
掌握电商核心指标计算、用户价值RFM分析、销售漏斗分析。
核心技能
- 电商核心KPI计算(GMV、转化率、客单价等)
- RFM用户价值分层
- 销售漏斗分析
课件内容
1. 电商核心指标计算
python# 安装依赖:pip install pandas numpy matplotlib seaborn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
class EcommerceMetrics:
def __init__(self, order_data, user_data=None, product_data=None):
"""
初始化电商指标计算器
参数:
order_data: 订单数据
user_data: 用户数据(可选)
product_data: 商品数据(可选)
"""
self.order_data = order_data.copy()
self.user_data = user_data.copy() if user_data is not None else None
self.product_data = product_data.copy() if product_data is not None else None
# 预处理:确保日期格式
if 'order_date' in self.order_data.columns:
self.order_data['order_date'] = pd.to_datetime(self.order_data['order_date'])
# 计算订单金额
if 'quantity' in self.order_data.columns and 'unit_price' in self.order_data.columns:
self.order_data['order_amount'] = self.order_data['quantity'] * self.order_data['unit_price']
def calculate_basic_metrics(self, start_date=None, end_date=None):
"""计算基础电商指标"""
# 过滤时间范围
if start_date and end_date:
mask = (self.order_data['order_date'] >= pd.to_datetime(start_date)) & \
(self.order_data['order_date'] <= pd.to_datetime(end_date))
filtered_data = self.order_data[mask]
else:
filtered_data = self.order_data
# 确保有订单金额列
if 'order_amount' not in filtered_data.columns:
if 'quantity' in filtered_data.columns and 'unit_price' in filtered_data.columns:
filtered_data['order_amount'] = filtered_data['quantity'] * filtered_data['unit_price']
else:
raise ValueError("订单数据需要包含quantity和unit_price列来计算金额")
metrics = {}
# 1. GMV(总交易额)
metrics['gmv'] = filtered_data['order_amount'].sum()
# 2. 订单数量
metrics['order_count'] = len(filtered_data)
# 3. 用户数量(去重)
if 'user_id' in filtered_data.columns:
metrics['user_count'] = filtered_data['user_id'].nunique()
else:
metrics['user_count'] = None
# 4. 客单价(平均订单金额)
metrics['avg_order_value'] = metrics['gmv'] / metrics['order_count'] if metrics['order_count'] > 0 else 0
# 5. 人均订单数
if metrics['user_count']:
metrics['orders_per_user'] = metrics['order_count'] / metrics['user_count']
else:
metrics['orders_per_user'] = None
# 6. 转化率(如果有访问数据)
# 这里假设有访问数据,实际中需要结合UV数据
# 7. 退货率(如果有退货状态)
if 'order_status' in filtered_data.columns:
returned_orders = filtered_data[filtered_data['order_status'].str.contains('退货|退款', na=False)]
metrics['return_rate'] = len(returned_orders) / metrics['order_count'] if metrics['order_count'] > 0 else 0
else:
metrics['return_rate'] = None
# 8. 热门商品
if 'product_id' in filtered_data.columns:
product_stats = filtered_data.groupby('product_id').agg({
'order_amount': 'sum',
'quantity': 'sum',
'order_id': 'count'
}).rename(columns={
'order_amount': 'sales_amount',
'quantity': 'sales_quantity',
'order_id': 'order_count'
}).sort_values('sales_amount', ascending=False)
metrics['top_products'] = product_stats.head(10)
# 9. 销售趋势
if 'order_date' in filtered_data.columns:
daily_sales = filtered_data.groupby(filtered_data['order_date'].dt.date).agg({
'order_amount': 'sum',
'order_id': 'count'
}).rename(columns={
'order_amount': 'daily_gmv',
'order_id': 'daily_orders'
})
metrics['daily_trend'] = daily_sales
return metrics
def calculate_rfm_metrics(self, analysis_date=None):
"""计算RFM指标"""
if analysis_date is None:
analysis_date = self.order_data['order_date'].max()
else:
analysis_date = pd.to_datetime(analysis_date)
# 确保有user_id和order_amount
required_cols = ['user_id', 'order_date', 'order_amount']
for col in required_cols:
if col not in self.order_data.columns:
raise ValueError(f"订单数据需要包含{col}列来计算RFM")
# 按用户聚合
user_stats = self.order_data.groupby('user_id').agg({
'order_date': lambda x: (analysis_date - x.max()).days, # R: 最近购买天数
'order_id': 'count', # F: 购买频率
'order_amount': 'sum' # M: 购买金额
}).rename(columns={
'order_date': 'recency',
'order_id': 'frequency',
'order_amount': 'monetary'
}).reset_index()
# RFM打分(1-5分,5分最好)
# R值:最近购买天数越小越好
user_stats['R_score'] = pd.qcut(user_stats['recency'], 5, labels=[5, 4, 3, 2, 1])
# F值:购买频率越高越好
user_stats['F_score'] = pd.qcut(user_stats['frequency'], 5, labels=[1, 2, 3, 4, 5])
# M值:购买金额越高越好
user_stats['M_score'] = pd.qcut(user_stats['monetary'], 5, labels=[1, 2, 3, 4, 5])
# 转换为数值型
user_stats['R_score'] = user_stats['R_score'].astype(int)
user_stats['F_score'] = user_stats['F_score'].astype(int)
user_stats['M_score'] = user_stats['M_score'].astype(int)
# 计算RFM总分
user_stats['RFM_score'] = user_stats['R_score'] + user_stats['F_score'] + user_stats['M_score']
# RFM分段
def segment_rfm(row):
if row['R_score'] >= 4 and row['F_score'] >= 4 and row['M_score'] >= 4:
return '重要价值客户'
elif row['R_score'] >= 4 and row['F_score'] < 4 and row['M_score'] >= 4:
return '重要发展客户'
elif row['R_score'] < 4 and row['F_score'] >= 4 and row['M_score'] >= 4:
return '重要保持客户'
elif row['R_score'] < 4 and row['F_score'] < 4 and row['M_score'] >= 4:
return '重要挽留客户'
elif row['R_score'] >= 4 and row['F_score'] >= 4 and row['M_score'] < 4:
return '一般价值客户'
elif row['R_score'] >= 4 and row['F_score'] < 4 and row['M_score'] < 4:
return '一般发展客户'
elif row['R_score'] < 4 and row['F_score'] >= 4 and row['M_score'] < 4:
return '一般保持客户'
else:
return '一般挽留客户'
user_stats['RFM_segment'] = user_stats.apply(segment_rfm, axis=1)
return user_stats
def analyze_sales_funnel(self, funnel_stages=None):
"""分析销售漏斗"""
if funnel_stages is None:
# 默认漏斗阶段
funnel_stages = {
'浏览商品': 10000, # 假设数据
'加入购物车': 5000,
'生成订单': 3000,
'完成支付': 2500,
'确认收货': 2300
}
funnel_data = pd.DataFrame({
'stage': list(funnel_stages.keys()),
'count': list(funnel_stages.values())
})
# 计算转化率
funnel_data['conversion_rate'] = (funnel_data['count'] / funnel_data['count'].iloc[0] * 100).round(2)
funnel_data['stage_conversion'] = (funnel_data['count'].shift(-1) / funnel_data['count'] * 100).round(2)
funnel_data.loc[len(funnel_data)-1, 'stage_conversion'] = 100 # 最后阶段
return funnel_data
def visualize_metrics(self, metrics):
"""可视化指标"""
fig, axes = plt.subplots(2, 2, figsize=(15, 10))
# 1. 销售趋势图
if 'daily_trend' in metrics:
ax1 = axes[0, 0]
metrics['daily_trend']['daily_gmv'].plot(ax=ax1, color='blue', marker='o')
ax1.set_title('每日GMV趋势', fontsize=12, fontweight='bold')
ax1.set_xlabel('日期')
ax1.set_ylabel('GMV')
ax1.grid(True, alpha=0.3)
# 2. 热门商品
if 'top_products' in metrics:
ax2 = axes[0, 1]
metrics['top_products'].head(5)['sales_amount'].plot(
kind='bar', ax=ax2, color='green'
)
ax2.set_title('Top 5商品销售额', fontsize=12, fontweight='bold')
ax2.set_xlabel('商品ID')
ax2.set_ylabel('销售额')
ax2.tick_params(axis='x', rotation=45)
# 3. RFM分布
if hasattr(self, 'rfm_data'):
ax3 = axes[1, 0]
segment_counts = self.rfm_data['RFM_segment'].value_counts()
segment_counts.plot(kind='pie', ax=ax3, autopct='%1.1f%%')
ax3.set_title('RFM用户分群分布', fontsize=12, fontweight='bold')
ax3.set_ylabel('')
# 4. 漏斗图
if hasattr(self, 'funnel_data'):
ax4 = axes[1, 1]
stages = self.funnel_data['stage']
rates = self.funnel_data['conversion_rate']
ax4.barh(stages, rates, color='orange')
ax4.set_title('销售漏斗转化率', fontsize=12, fontweight='bold')
ax4.set_xlabel('转化率 (%)')
# 添加数值标签
for i, v in enumerate(rates):
ax4.text(v + 1, i, f'{v}%', va='center')
plt.tight_layout()
plt.show()
def generate_report(self, start_date=None, end_date=None):
"""生成完整分析报告"""
print("="*60)
print("电商数据分析报告")
print("="*60)
# 计算基础指标
print("\n1. 核心业务指标:")
print("-"*40)
metrics = self.calculate_basic_metrics(start_date, end_date)
print(f"📊 GMV(总交易额): ¥{metrics['gmv']:,.2f}")
print(f"📦 订单数量: {metrics['order_count']:,}")
print(f"👥 用户数量: {metrics['user_count']:,}")
print(f"💰 客单价: ¥{metrics['avg_order_value']:,.2f}")
if metrics['orders_per_user']:
print(f"🛒 人均订单数: {metrics['orders_per_user']:.2f}")
if metrics['return_rate']:
print(f"↩️ 退货率: {metrics['return_rate']:.2%}")
# RFM分析
print("\n2. RFM用户价值分析:")
print("-"*40)
rfm_data = self.calculate_rfm_metrics()
self.rfm_data = rfm_data
segment_dist = rfm_data['RFM_segment'].value_counts()
print("用户分群分布:")
for segment, count in segment_dist.items():
percentage = (count / len(rfm_data) * 100)
print(f" {segment}: {count}人 ({percentage:.1f}%)")
# 销售漏斗
print("\n3. 销售漏斗分析:")
print("-"*40)
funnel_data = self.analyze_sales_funnel()
self.funnel_data = funnel_data
for _, row in funnel_data.iterrows():
print(f" {row['stage']}: {row['count']}人 | "
f"总体转化率: {row['conversion_rate']}% | "
f"阶段转化率: {row['stage_conversion']}%")
# 可视化
print("\n4. 数据可视化生成中...")
self.visualize_metrics(metrics)
return {
'basic_metrics': metrics,
'rfm_data': rfm_data,
'funnel_data': funnel_data
}
# 案例数据脚本
def create_ecommerce_sample_data():
"""生成电商分析示例数据"""
np.random.seed(42)
# 生成更丰富的订单数据
n_orders = 1000
user_ids = np.random.choice(range(1001, 1101), n_orders)
product_ids = np.random.choice(['P001', 'P002', 'P003', 'P004', 'P005',
'P006', 'P007', 'P008', 'P009', 'P010'], n_orders)
order_data = pd.DataFrame({
'order_id': range(10001, 10001 + n_orders),
'user_id': user_ids,
'order_date': pd.date_range('2024-01-01', periods=n_orders, freq='H'),
'product_id': product_ids,
'quantity': np.random.randint(1, 5, n_orders),
'unit_price': np.random.uniform(50, 2000, n_orders).round(2),
'order_status': np.random.choice(['已完成', '已发货', '待付款', '已取消', '已退货'],
n_orders, p=[0.7, 0.15, 0.05, 0.05, 0.05])
})
# 打乱日期
order_data['order_date'] = order_data['order_date'].sample(frac=1).reset_index(drop=True)
# 用户数据
user_data = pd.DataFrame({
'user_id': range(1001, 1101),
'registration_date': pd.date_range('2023-06-01', periods=100),
'user_level': np.random.choice(['青铜', '白银', '黄金', '铂金', '钻石'], 100,
p=[0.3, 0.25, 0.2, 0.15, 0.1]),
'city': np.random.choice(['北京', '上海', '广州', '深圳', '杭州', '成都', '武汉', '南京'], 100)
})
# 商品数据
product_data = pd.DataFrame({
'product_id': [f'P{str(i).zfill(3)}' for i in range(1, 11)],
'product_name': ['智能手机', '笔记本电脑', '无线耳机', '智能手表', '平板电脑',
'数码相机', '游戏主机', '显示器', '键盘鼠标', '移动硬盘'],
'category': ['电子产品', '电子产品', '配件', '电子产品', '电子产品',
'电子产品', '电子产品', '配件', '配件', '配件'],
'brand': ['品牌A', '品牌B', '品牌C', '品牌D', '品牌E',
'品牌F', '品牌G', '品牌H', '品牌I', '品牌J'],
'cost_price': [1999, 5999, 299, 1299, 2999, 3999, 3499, 899, 199, 499],
'market_price': [2499, 6999, 399, 1599, 3499, 4999, 3999, 1299, 299, 699]
})
# 保存数据
order_data.to_csv('ecommerce_orders.csv', index=False, encoding='utf-8')
user_data.to_csv('ecommerce_users.csv', index=False, encoding='utf-8')
product_data.to_csv('ecommerce_products.csv', index=False, encoding='utf-8')
print("✅ 电商示例数据已生成:")
print(f" 订单数据: {order_data.shape}")
print(f" 用户数据: {user_data.shape}")
print(f" 商品数据: {product_data.shape}")
return order_data, user_data, product_data
# 运行示例
if __name__ == "__main__":
print("="*60)
print("Day 5: 电商核心指标计算与RFM分析")
print("="*60)
# 1. 生成示例数据
print("\n步骤1: 生成电商示例数据...")
order_df, user_df, product_df = create_ecommerce_sample_data()
# 2. 初始化分析器
print("\n步骤2: 初始化电商指标分析器...")
analyzer = EcommerceMetrics(order_df, user_df, product_df)
# 3. 生成分析报告
print("\n步骤3: 生成完整分析报告...")
report = analyzer.generate_report(
start_date='2024-01-01',
end_date='2024-03-31'
)
# 4. 保存RFM结果
print("\n步骤4: 保存分析结果...")
report['rfm_data'].to_csv('rfm_analysis_result.csv', index=False, encoding='utf-8')
report['funnel_data'].to_csv('funnel_analysis_result.csv', index=False, encoding='utf-8')
print("\n✅ 分析完成!结果已保存到:")
print(" - rfm_analysis_result.csv")
print(" - funnel_analysis_result.csv")
安装要求
bash# 基础依赖
pip install pandas==1.5.3 numpy==1.24.3 matplotlib==3.7.1 seaborn==0.12.2
# 可选:更美观的图表
pip install plotly==5.15.0