常见用的数据聚合场景:

按月份统计销售额与订单量
按品类统计平均售价与销量
按地区汇总销售额占比
按用户分层(VIP/普通)统计购买力
按渠道统计转化率与客单价
按小时粒度统计访问/下单高峰
按SKU统计库存周转天数
按促销活动统计参与人数与成交额
按品牌统计市场份额变化
按会员等级统计复购率

我们将展示如何进行这些常见的数据聚合任务。假设我们有一个包含销售订单数据的DataFrame,并逐步应用这些聚合操作。

首先,让我们创建一个示例DataFrame来模拟原始数据:

import pandas as pd
import numpy as np

# 创建示例DataFrame
data = {
    'order_id': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'product_id': ['A001', 'B002', 'D004', 'E005', 'F006', 'G007', 'H008', 'I009', 'J010', 'K011'],
    'category_code': ['C1', 'C2', 'C1', 'C3', 'C4', 'C5', 'C6', 'C7', 'C1', 'C2'],
    'amount': [100.00, 200.00, 300.00, 400.00, 500.00, 600.00, 700.00, 800.00, 900.00, 1000.00],
    'original_price': [120.00, 220.00, 320.00, 420.00, 520.00, 620.00, 720.00, 820.00, 920.00, 1020.00],
    'quantity': [1, 2, 1, 1, 2, 1, 1, 2, 1, 2],
    'order_date': [
        '2025-06-01 10:00:00', '2025-06-01 12:00:00', '2025-06-02 14:00:00',
        '2025-06-03 16:00:00', '2025-06-04 18:00:00', '2025-06-05 20:00:00',
        '2025-06-06 22:00:00', '2025-06-07 10:00:00', '2025-06-08 12:00:00',
        '2025-06-09 14:00:00'
    ],
    'customer_id': [101, 102, 103, 104, 105, 106, 107, 108, 101, 102],
    'promotion': [False, True, False, True, False, True, False, True, False, True],
    'region': ['Beijing', 'Shanghai', 'Guangzhou', 'Shenzhen', 'Hangzhou', 'Chengdu', 'Nanjing', 'Wuhan', 'Beijing', 'Shanghai'],
    'channel': ['Online', 'Offline', 'Online', 'Online', 'Offline', 'Online', 'Offline', 'Online', 'Offline', 'Online'],
    'hour': [10, 12, 14, 16, 18, 20, 22, 10, 12, 14],
    'brand': ['BrandA', 'BrandB', 'BrandA', 'BrandC', 'BrandB', 'BrandD', 'BrandA', 'BrandC', 'BrandB', 'BrandD'],
    'inventory_days': [30, 45, 30, 60, 45, 60, 30, 60, 45, 60],
    'membership_level': ['VIP', 'Normal', 'Normal', 'VIP', 'Normal', 'Normal', 'VIP', 'Normal', 'VIP', 'Normal']
}

df = pd.DataFrame(data)
print("原始数据:")
print(df)

接下来,我们将逐个处理上述提到的数据聚合任务。

1. 按月份统计销售额与订单量

# 转换order_date为日期格式
df['order_date'] = pd.to_datetime(df['order_date'])

# 提取年、月
df['year_month'] = df['order_date'].dt.to_period('M')

# 按月份统计销售额与订单量
monthly_sales = df.groupby('year_month').agg({
    'amount': 'sum',
    'order_id': 'count'
}).reset_index()

monthly_sales.rename(columns={'amount': 'total_sales', 'order_id': 'order_count'}, inplace=True)

print("\n按月份统计销售额与订单量后:")
print(monthly_sales)

2. 按品类统计平均售价与销量

# 按品类统计平均售价与销量
category_stats = df.groupby('category_code').agg({
    'amount': 'mean',
    'quantity': 'sum'
}).reset_index()

category_stats.rename(columns={'amount': 'avg_sale_price', 'quantity': 'total_quantity_sold'}, inplace=True)

print("\n按品类统计平均售价与销量后:")
print(category_stats)

3. 按地区汇总销售额占比

# 按地区汇总销售额
region_sales = df.groupby('region')['amount'].sum().reset_index()

# 计算销售额占比
region_sales['sales_percentage'] = (region_sales['amount'] / region_sales['amount'].sum()) * 100

print("\n按地区汇总销售额占比后:")
print(region_sales)

4. 按用户分层(VIP/普通)统计购买力

# 按用户分层统计购买力
user_purchase_power = df.groupby('membership_level')['amount'].sum().reset_index()

print("\n按用户分层统计购买力后:")
print(user_purchase_power)

5. 按渠道统计转化率与客单价

假设我们还有一个访问记录的DataFrame,用于计算转化率。

# 创建访问记录的DataFrame
visit_data = {
    'customer_id': [101, 102, 103, 104, 105, 106, 107, 108, 101, 102, 103, 104, 105, 106, 107, 108],
    'visits': [10, 20, 15, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85],
    'channel': ['Online', 'Offline', 'Online', 'Online', 'Offline', 'Online', 'Offline', 'Online', 'Offline', 'Online', 'Offline', 'Online', 'Offline', 'Online', 'Offline', 'Online']
}
df_visits = pd.DataFrame(visit_data)

# 合并订单和访问数据
merged_df = pd.merge(df, df_visits, on=['customer_id', 'channel'], how='right')

# 填充缺失的订单金额为0
merged_df['amount'].fillna(0, inplace=True)

# 按渠道统计转化率与客单价
channel_stats = merged_df.groupby('channel').agg({
    'amount': 'sum',
    'visits': 'sum',
    'order_id': 'count'
}).reset_index()

channel_stats['conversion_rate'] = (channel_stats['order_id'] / channel_stats['visits']) * 100
channel_stats['average_order_value'] = channel_stats['amount'] / channel_stats['order_id']

print("\n按渠道统计转化率与客单价后:")
print(channel_stats)

6. 按小时粒度统计访问/下单高峰

假设我们已经有了合并后的订单和访问数据。

# 按小时粒度统计访问/下单高峰
hourly_peak = merged_df.groupby('hour').agg({
    'amount': 'sum',
    'visits': 'sum',
    'order_id': 'count'
}).reset_index()

hourly_peak.rename(columns={'amount': 'total_sales', 'visits': 'total_visits', 'order_id': 'order_count'}, inplace=True)

print("\n按小时粒度统计访问/下单高峰后:")
print(hourly_peak)

7. 按SKU统计库存周转天数

# 按SKU统计库存周转天数
sku_inventory = df.groupby('product_id')['inventory_days'].mean().reset_index()

print("\n按SKU统计库存周转天数后:")
print(sku_inventory)

8. 按促销活动统计参与人数与成交额

# 按促销活动统计参与人数与成交额
promotion_stats = df.groupby('promotion').agg({
    'customer_id': 'nunique',
    'amount': 'sum'
}).reset_index()

promotion_stats.rename(columns={'customer_id': 'participant_count', 'amount': 'total_sales'}, inplace=True)

print("\n按促销活动统计参与人数与成交额后:")
print(promotion_stats)

9. 按品牌统计市场份额变化

# 按品牌统计市场份额变化
brand_share = df.groupby('brand')['amount'].sum().reset_index()

# 计算市场份额
brand_share['market_share'] = (brand_share['amount'] / brand_share['amount'].sum()) * 100

print("\n按品牌统计市场份额变化后:")
print(brand_share)

10. 按会员等级统计复购率

# 按会员等级统计复购率
repurchase_rate = df.groupby(['customer_id', 'membership_level']).size().ge(2).groupby('membership_level').mean().reset_index()

repurchase_rate.rename(columns={0: 'repurchase_rate'}, inplace=True)

print("\n按会员等级统计复购率后:")
print(repurchase_rate)

综合以上步骤,最终的聚合结果如下:

这段代码展示了从原始数据到经过全面聚合的数据的过程。你可以根据实际需求调整每一步的操作。

import pandas as pd
import numpy as np

# 创建示例DataFrame
data = {
    'order_id': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'product_id': ['A001', 'B002', 'D004', 'E005', 'F006', 'G007', 'H008', 'I009', 'J010', 'K011'],
    'category_code': ['C1', 'C2', 'C1', 'C3', 'C4', 'C5', 'C6', 'C7', 'C1', 'C2'],
    'amount': [100.00, 200.00, 300.00, 400.00, 500.00, 600.00, 700.00, 800.00, 900.00, 1000.00],
    'original_price': [120.00, 220.00, 320.00, 420.00, 520.00, 620.00, 720.00, 820.00, 920.00, 1020.00],
    'quantity': [1, 2, 1, 1, 2, 1, 1, 2, 1, 2],
    'order_date': [
        '2025-06-01 10:00:00', '2025-06-01 12:00:00', '2025-06-02 14:00:00',
        '2025-06-03 16:00:00', '2025-06-04 18:00:00', '2025-06-05 20:00:00',
        '2025-06-06 22:00:00', '2025-06-07 10:00:00', '2025-06-08 12:00:00',
        '2025-06-09 14:00:00'
    ],
    'customer_id': [101, 102, 103, 104, 105, 106, 107, 108, 101, 102],
    'promotion': [False, True, False, True, False, True, False, True, False, True],
    'region': ['Beijing', 'Shanghai', 'Guangzhou', 'Shenzhen', 'Hangzhou', 'Chengdu', 'Nanjing', 'Wuhan', 'Beijing', 'Shanghai'],
    'channel': ['Online', 'Offline', 'Online', 'Online', 'Offline', 'Online', 'Offline', 'Online', 'Offline', 'Online'],
    'hour': [10, 12, 14, 16, 18, 20, 22, 10, 12, 14],
    'brand': ['BrandA', 'BrandB', 'BrandA', 'BrandC', 'BrandB', 'BrandD', 'BrandA', 'BrandC', 'BrandB', 'BrandD'],
    'inventory_days': [30, 45, 30, 60, 45, 60, 30, 60, 45, 60],
    'membership_level': ['VIP', 'Normal', 'Normal', 'VIP', 'Normal', 'Normal', 'VIP', 'Normal', 'VIP', 'Normal']
}

df = pd.DataFrame(data)

# 转换order_date为日期格式
df['order_date'] = pd.to_datetime(df['order_date'])

# 提取年、月
df['year_month'] = df['order_date'].dt.to_period('M')

# 按月份统计销售额与订单量
monthly_sales = df.groupby('year_month').agg({
    'amount': 'sum',
    'order_id': 'count'
}).reset_index()

monthly_sales.rename(columns={'amount': 'total_sales', 'order_id': 'order_count'}, inplace=True)

# 按品类统计平均售价与销量
category_stats = df.groupby('category_code').agg({
    'amount': 'mean',
    'quantity': 'sum'
}).reset_index()

category_stats.rename(columns={'amount': 'avg_sale_price', 'quantity': 'total_quantity_sold'}, inplace=True)

# 按地区汇总销售额占比
region_sales = df.groupby('region')['amount'].sum().reset_index()
region_sales['sales_percentage'] = (region_sales['amount'] / region_sales['amount'].sum()) * 100

# 按用户分层统计购买力
user_purchase_power = df.groupby('membership_level')['amount'].sum().reset_index()

# 创建访问记录的DataFrame
visit_data = {
    'customer_id': [101, 102, 103, 104, 105, 106, 107, 108, 101, 102, 103, 104, 105, 106, 107, 108],
    'visits': [10, 20, 15, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85],
    'channel': ['Online', 'Offline', 'Online', 'Online', 'Offline', 'Online', 'Offline', 'Online', 'Offline', 'Online', 'Offline', 'Online', 'Offline', 'Online', 'Offline', 'Online']
}
df_visits = pd.DataFrame(visit_data)

# 合并订单和访问数据
merged_df = pd.merge(df, df_visits, on=['customer_id', 'channel'], how='right')
merged_df['amount'].fillna(0, inplace=True)

# 按渠道统计转化率与客单价
channel_stats = merged_df.groupby('channel').agg({
    'amount': 'sum',
    'visits': 'sum',
    'order_id': 'count'
}).reset_index()

channel_stats['conversion_rate'] = (channel_stats['order_id'] / channel_stats['visits']) * 100
channel_stats['average_order_value'] = channel_stats['amount'] / channel_stats['order_id']

# 按小时粒度统计访问/下单高峰
hourly_peak = merged_df.groupby('hour').agg({
    'amount': 'sum',
    'visits': 'sum',
    'order_id': 'count'
}).reset_index()

hourly_peak.rename(columns={'amount': 'total_sales', 'visits': 'total_visits', 'order_id': 'order_count'}, inplace=True)

# 按SKU统计库存周转天数
sku_inventory = df.groupby('product_id')['inventory_days'].mean().reset_index()

# 按促销活动统计参与人数与成交额
promotion_stats = df.groupby('promotion').agg({
    'customer_id': 'nunique',
    'amount': 'sum'
}).reset_index()

promotion_stats.rename(columns={'customer_id': 'participant_count', 'amount': 'total_sales'}, inplace=True)

# 按品牌统计市场份额变化
brand_share = df.groupby('brand')['amount'].sum().reset_index()
brand_share['market_share'] = (brand_share['amount'] / brand_share['amount'].sum()) * 100

# 按会员等级统计复购率
repurchase_rate = df.groupby(['customer_id', 'membership_level']).size().ge(2).groupby('membership_level').mean().reset_index()
repurchase_rate.rename(columns={0: 'repurchase_rate'}, inplace=True)

print("按月份统计销售额与订单量后:")
print(monthly_sales)
print("\n按品类统计平均售价与销量后:")
print(category_stats)
print("\n按地区汇总销售额占比后:")
print(region_sales)
print("\n按用户分层统计购买力后:")
print(user_purchase_power)
print("\n按渠道统计转化率与客单价后:")
print(channel_stats)
print("\n按小时粒度统计访问/下单高峰后:")
print(hourly_peak)
print("\n按SKU统计库存周转天数后:")
print(sku_inventory)
print("\n按促销活动统计参与人数与成交额后:")
print(promotion_stats)
print("\n按品牌统计市场份额变化后:")
print(brand_share)
print("\n按会员等级统计复购率后:")
print(repurchase_rate)