【Python10年经验总结】第四课 电商平台销售数据分析实践分解 – 数据聚合(Data Aggregation)
常见用的数据聚合场景:
按月份统计销售额与订单量
按品类统计平均售价与销量
按地区汇总销售额占比
按用户分层(VIP/普通)统计购买力
按渠道统计转化率与客单价
按小时粒度统计访问/下单高峰
按SKU统计库存周转天数
按促销活动统计参与人数与成交额
按品牌统计市场份额变化
按会员等级统计复购率
我们将展示如何进行这些常见的数据聚合任务。假设我们有一个包含销售订单数据的DataFrame,并逐步应用这些聚合操作。
首先,让我们创建一个示例DataFrame来模拟原始数据:
import pandas as pd
import numpy as np
# 创建示例DataFrame
data = {
'order_id': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
'product_id': ['A001', 'B002', 'D004', 'E005', 'F006', 'G007', 'H008', 'I009', 'J010', 'K011'],
'category_code': ['C1', 'C2', 'C1', 'C3', 'C4', 'C5', 'C6', 'C7', 'C1', 'C2'],
'amount': [100.00, 200.00, 300.00, 400.00, 500.00, 600.00, 700.00, 800.00, 900.00, 1000.00],
'original_price': [120.00, 220.00, 320.00, 420.00, 520.00, 620.00, 720.00, 820.00, 920.00, 1020.00],
'quantity': [1, 2, 1, 1, 2, 1, 1, 2, 1, 2],
'order_date': [
'2025-06-01 10:00:00', '2025-06-01 12:00:00', '2025-06-02 14:00:00',
'2025-06-03 16:00:00', '2025-06-04 18:00:00', '2025-06-05 20:00:00',
'2025-06-06 22:00:00', '2025-06-07 10:00:00', '2025-06-08 12:00:00',
'2025-06-09 14:00:00'
],
'customer_id': [101, 102, 103, 104, 105, 106, 107, 108, 101, 102],
'promotion': [False, True, False, True, False, True, False, True, False, True],
'region': ['Beijing', 'Shanghai', 'Guangzhou', 'Shenzhen', 'Hangzhou', 'Chengdu', 'Nanjing', 'Wuhan', 'Beijing', 'Shanghai'],
'channel': ['Online', 'Offline', 'Online', 'Online', 'Offline', 'Online', 'Offline', 'Online', 'Offline', 'Online'],
'hour': [10, 12, 14, 16, 18, 20, 22, 10, 12, 14],
'brand': ['BrandA', 'BrandB', 'BrandA', 'BrandC', 'BrandB', 'BrandD', 'BrandA', 'BrandC', 'BrandB', 'BrandD'],
'inventory_days': [30, 45, 30, 60, 45, 60, 30, 60, 45, 60],
'membership_level': ['VIP', 'Normal', 'Normal', 'VIP', 'Normal', 'Normal', 'VIP', 'Normal', 'VIP', 'Normal']
}
df = pd.DataFrame(data)
print("原始数据:")
print(df)
接下来,我们将逐个处理上述提到的数据聚合任务。
1. 按月份统计销售额与订单量
# 转换order_date为日期格式
df['order_date'] = pd.to_datetime(df['order_date'])
# 提取年、月
df['year_month'] = df['order_date'].dt.to_period('M')
# 按月份统计销售额与订单量
monthly_sales = df.groupby('year_month').agg({
'amount': 'sum',
'order_id': 'count'
}).reset_index()
monthly_sales.rename(columns={'amount': 'total_sales', 'order_id': 'order_count'}, inplace=True)
print("\n按月份统计销售额与订单量后:")
print(monthly_sales)
2. 按品类统计平均售价与销量
# 按品类统计平均售价与销量
category_stats = df.groupby('category_code').agg({
'amount': 'mean',
'quantity': 'sum'
}).reset_index()
category_stats.rename(columns={'amount': 'avg_sale_price', 'quantity': 'total_quantity_sold'}, inplace=True)
print("\n按品类统计平均售价与销量后:")
print(category_stats)
3. 按地区汇总销售额占比
# 按地区汇总销售额
region_sales = df.groupby('region')['amount'].sum().reset_index()
# 计算销售额占比
region_sales['sales_percentage'] = (region_sales['amount'] / region_sales['amount'].sum()) * 100
print("\n按地区汇总销售额占比后:")
print(region_sales)
4. 按用户分层(VIP/普通)统计购买力
# 按用户分层统计购买力
user_purchase_power = df.groupby('membership_level')['amount'].sum().reset_index()
print("\n按用户分层统计购买力后:")
print(user_purchase_power)
5. 按渠道统计转化率与客单价
假设我们还有一个访问记录的DataFrame,用于计算转化率。
# 创建访问记录的DataFrame
visit_data = {
'customer_id': [101, 102, 103, 104, 105, 106, 107, 108, 101, 102, 103, 104, 105, 106, 107, 108],
'visits': [10, 20, 15, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85],
'channel': ['Online', 'Offline', 'Online', 'Online', 'Offline', 'Online', 'Offline', 'Online', 'Offline', 'Online', 'Offline', 'Online', 'Offline', 'Online', 'Offline', 'Online']
}
df_visits = pd.DataFrame(visit_data)
# 合并订单和访问数据
merged_df = pd.merge(df, df_visits, on=['customer_id', 'channel'], how='right')
# 填充缺失的订单金额为0
merged_df['amount'].fillna(0, inplace=True)
# 按渠道统计转化率与客单价
channel_stats = merged_df.groupby('channel').agg({
'amount': 'sum',
'visits': 'sum',
'order_id': 'count'
}).reset_index()
channel_stats['conversion_rate'] = (channel_stats['order_id'] / channel_stats['visits']) * 100
channel_stats['average_order_value'] = channel_stats['amount'] / channel_stats['order_id']
print("\n按渠道统计转化率与客单价后:")
print(channel_stats)
6. 按小时粒度统计访问/下单高峰
假设我们已经有了合并后的订单和访问数据。
# 按小时粒度统计访问/下单高峰
hourly_peak = merged_df.groupby('hour').agg({
'amount': 'sum',
'visits': 'sum',
'order_id': 'count'
}).reset_index()
hourly_peak.rename(columns={'amount': 'total_sales', 'visits': 'total_visits', 'order_id': 'order_count'}, inplace=True)
print("\n按小时粒度统计访问/下单高峰后:")
print(hourly_peak)
7. 按SKU统计库存周转天数
# 按SKU统计库存周转天数
sku_inventory = df.groupby('product_id')['inventory_days'].mean().reset_index()
print("\n按SKU统计库存周转天数后:")
print(sku_inventory)
8. 按促销活动统计参与人数与成交额
# 按促销活动统计参与人数与成交额
promotion_stats = df.groupby('promotion').agg({
'customer_id': 'nunique',
'amount': 'sum'
}).reset_index()
promotion_stats.rename(columns={'customer_id': 'participant_count', 'amount': 'total_sales'}, inplace=True)
print("\n按促销活动统计参与人数与成交额后:")
print(promotion_stats)
9. 按品牌统计市场份额变化
# 按品牌统计市场份额变化
brand_share = df.groupby('brand')['amount'].sum().reset_index()
# 计算市场份额
brand_share['market_share'] = (brand_share['amount'] / brand_share['amount'].sum()) * 100
print("\n按品牌统计市场份额变化后:")
print(brand_share)
10. 按会员等级统计复购率
# 按会员等级统计复购率
repurchase_rate = df.groupby(['customer_id', 'membership_level']).size().ge(2).groupby('membership_level').mean().reset_index()
repurchase_rate.rename(columns={0: 'repurchase_rate'}, inplace=True)
print("\n按会员等级统计复购率后:")
print(repurchase_rate)
综合以上步骤,最终的聚合结果如下:
这段代码展示了从原始数据到经过全面聚合的数据的过程。你可以根据实际需求调整每一步的操作。
import pandas as pd
import numpy as np
# 创建示例DataFrame
data = {
'order_id': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
'product_id': ['A001', 'B002', 'D004', 'E005', 'F006', 'G007', 'H008', 'I009', 'J010', 'K011'],
'category_code': ['C1', 'C2', 'C1', 'C3', 'C4', 'C5', 'C6', 'C7', 'C1', 'C2'],
'amount': [100.00, 200.00, 300.00, 400.00, 500.00, 600.00, 700.00, 800.00, 900.00, 1000.00],
'original_price': [120.00, 220.00, 320.00, 420.00, 520.00, 620.00, 720.00, 820.00, 920.00, 1020.00],
'quantity': [1, 2, 1, 1, 2, 1, 1, 2, 1, 2],
'order_date': [
'2025-06-01 10:00:00', '2025-06-01 12:00:00', '2025-06-02 14:00:00',
'2025-06-03 16:00:00', '2025-06-04 18:00:00', '2025-06-05 20:00:00',
'2025-06-06 22:00:00', '2025-06-07 10:00:00', '2025-06-08 12:00:00',
'2025-06-09 14:00:00'
],
'customer_id': [101, 102, 103, 104, 105, 106, 107, 108, 101, 102],
'promotion': [False, True, False, True, False, True, False, True, False, True],
'region': ['Beijing', 'Shanghai', 'Guangzhou', 'Shenzhen', 'Hangzhou', 'Chengdu', 'Nanjing', 'Wuhan', 'Beijing', 'Shanghai'],
'channel': ['Online', 'Offline', 'Online', 'Online', 'Offline', 'Online', 'Offline', 'Online', 'Offline', 'Online'],
'hour': [10, 12, 14, 16, 18, 20, 22, 10, 12, 14],
'brand': ['BrandA', 'BrandB', 'BrandA', 'BrandC', 'BrandB', 'BrandD', 'BrandA', 'BrandC', 'BrandB', 'BrandD'],
'inventory_days': [30, 45, 30, 60, 45, 60, 30, 60, 45, 60],
'membership_level': ['VIP', 'Normal', 'Normal', 'VIP', 'Normal', 'Normal', 'VIP', 'Normal', 'VIP', 'Normal']
}
df = pd.DataFrame(data)
# 转换order_date为日期格式
df['order_date'] = pd.to_datetime(df['order_date'])
# 提取年、月
df['year_month'] = df['order_date'].dt.to_period('M')
# 按月份统计销售额与订单量
monthly_sales = df.groupby('year_month').agg({
'amount': 'sum',
'order_id': 'count'
}).reset_index()
monthly_sales.rename(columns={'amount': 'total_sales', 'order_id': 'order_count'}, inplace=True)
# 按品类统计平均售价与销量
category_stats = df.groupby('category_code').agg({
'amount': 'mean',
'quantity': 'sum'
}).reset_index()
category_stats.rename(columns={'amount': 'avg_sale_price', 'quantity': 'total_quantity_sold'}, inplace=True)
# 按地区汇总销售额占比
region_sales = df.groupby('region')['amount'].sum().reset_index()
region_sales['sales_percentage'] = (region_sales['amount'] / region_sales['amount'].sum()) * 100
# 按用户分层统计购买力
user_purchase_power = df.groupby('membership_level')['amount'].sum().reset_index()
# 创建访问记录的DataFrame
visit_data = {
'customer_id': [101, 102, 103, 104, 105, 106, 107, 108, 101, 102, 103, 104, 105, 106, 107, 108],
'visits': [10, 20, 15, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85],
'channel': ['Online', 'Offline', 'Online', 'Online', 'Offline', 'Online', 'Offline', 'Online', 'Offline', 'Online', 'Offline', 'Online', 'Offline', 'Online', 'Offline', 'Online']
}
df_visits = pd.DataFrame(visit_data)
# 合并订单和访问数据
merged_df = pd.merge(df, df_visits, on=['customer_id', 'channel'], how='right')
merged_df['amount'].fillna(0, inplace=True)
# 按渠道统计转化率与客单价
channel_stats = merged_df.groupby('channel').agg({
'amount': 'sum',
'visits': 'sum',
'order_id': 'count'
}).reset_index()
channel_stats['conversion_rate'] = (channel_stats['order_id'] / channel_stats['visits']) * 100
channel_stats['average_order_value'] = channel_stats['amount'] / channel_stats['order_id']
# 按小时粒度统计访问/下单高峰
hourly_peak = merged_df.groupby('hour').agg({
'amount': 'sum',
'visits': 'sum',
'order_id': 'count'
}).reset_index()
hourly_peak.rename(columns={'amount': 'total_sales', 'visits': 'total_visits', 'order_id': 'order_count'}, inplace=True)
# 按SKU统计库存周转天数
sku_inventory = df.groupby('product_id')['inventory_days'].mean().reset_index()
# 按促销活动统计参与人数与成交额
promotion_stats = df.groupby('promotion').agg({
'customer_id': 'nunique',
'amount': 'sum'
}).reset_index()
promotion_stats.rename(columns={'customer_id': 'participant_count', 'amount': 'total_sales'}, inplace=True)
# 按品牌统计市场份额变化
brand_share = df.groupby('brand')['amount'].sum().reset_index()
brand_share['market_share'] = (brand_share['amount'] / brand_share['amount'].sum()) * 100
# 按会员等级统计复购率
repurchase_rate = df.groupby(['customer_id', 'membership_level']).size().ge(2).groupby('membership_level').mean().reset_index()
repurchase_rate.rename(columns={0: 'repurchase_rate'}, inplace=True)
print("按月份统计销售额与订单量后:")
print(monthly_sales)
print("\n按品类统计平均售价与销量后:")
print(category_stats)
print("\n按地区汇总销售额占比后:")
print(region_sales)
print("\n按用户分层统计购买力后:")
print(user_purchase_power)
print("\n按渠道统计转化率与客单价后:")
print(channel_stats)
print("\n按小时粒度统计访问/下单高峰后:")
print(hourly_peak)
print("\n按SKU统计库存周转天数后:")
print(sku_inventory)
print("\n按促销活动统计参与人数与成交额后:")
print(promotion_stats)
print("\n按品牌统计市场份额变化后:")
print(brand_share)
print("\n按会员等级统计复购率后:")
print(repurchase_rate)