【Python实践案例】电商平台数据分析和挖掘 – 时序预测

发表于： 2025年9月13日 2025年9月13日
标签：电商运营数据分析实践
访问量： 2874
我们来创建一个针对电商平台日志数据的时序预测程序。这个程序将模拟生成包含访问量（如PV或UV）的时序日志数据，并使用ARIMA和Prophet两种模型对未来流量进行预测。
重要提示：
Prophet库安装：prophet库的安装可能需要C++编译器和一些额外依赖。如果安装遇到问题，请参考其官方文档。如果无法安装，程序会自动跳过Prophet部分。
pmdarima库安装：为了实现ARIMA模型的自动调参，我们使用pmdarima库。如果未安装，程序会回退到使用固定参数的statsmodels ARIMA。
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings("ignore")

# --- Try to import optional libraries ---
# Both flags start as False and are flipped only when the corresponding
# import succeeds; the forecasting functions below branch on them.
PROPHET_AVAILABLE = False
PMDARIMA_AVAILABLE = False

try:
    from prophet import Prophet
    PROPHET_AVAILABLE = True
    print("Info: Prophet库已导入，将包含Prophet模型预测。")
except ImportError:
    print("Warning: Prophet库未导入。将跳过Prophet模型部分。")

try:
    from pmdarima import auto_arima
    PMDARIMA_AVAILABLE = True
    print("Info: pmdarima库已导入，ARIMA模型将自动选择参数。")
except ImportError:
    print("Warning: pmdarima库未导入。ARIMA模型将使用固定参数(2,1,2)。")
    # Fallback: plain statsmodels ARIMA. Note that the name ARIMA is only
    # bound on this branch, i.e. when pmdarima is NOT installed.
    from statsmodels.tsa.arima.model import ARIMA # basic ARIMA

# --- Configuration ---
# Parameters for the simulated data
NUM_DAYS_HISTORY = 730  # Number of days of history to simulate (e.g. 2 years)
FORECAST_HORIZON = 30   # Number of future days to forecast
# Prefix shared by every file this script writes (CSV, plots, text report).
REPORT_PREFIX = '电商日志时序预测报告'
RANDOM_SEED = 42
# Seed NumPy's global RNG so the simulated series is reproducible.
np.random.seed(RANDOM_SEED)

# --- 1. 数据生成 ---

def generate_sample_traffic_data(n_days):
    """Generate a simulated daily traffic (e.g. PV) series for an e-commerce site.

    The series is the product of a linear trend on top of a base level, a
    weekend uplift, a sinusoidal annual cycle, promotion spikes around
    June 18 ("618") and November 11 ("Double 11"), and multiplicative
    Gaussian noise. Random draws come from NumPy's global RNG.

    Args:
        n_days: Number of consecutive daily observations to generate; the
            last observation is dated today.

    Returns:
        A DataFrame with columns ``date`` (midnight-aligned daily timestamps)
        and ``value`` (non-negative integer traffic counts).

    Side effects:
        Writes the generated data to ``{REPORT_PREFIX}_模拟流量数据.csv``.
    """
    print("--- 正在生成模拟电商日志流量数据 ---")
    # Anchor on midnight: a bare "now" would stamp every daily observation
    # with the current time of day (down to microseconds).
    start_date = pd.Timestamp.now().normalize() - pd.Timedelta(days=n_days - 1)
    dates = pd.date_range(start=start_date, periods=n_days, freq='D')

    # 1. Base level and long-term trend
    base_level = 50000
    trend_slope = np.random.uniform(50, 150)  # average growth of 50-150 PV per day
    trend = np.arange(n_days) * trend_slope

    # 2. Seasonality
    # Weekly: Saturday (5) and Sunday (6) get a 20% uplift.
    weekly_effect = np.where(dates.dayofweek >= 5, 1.2, 1.0)

    # Annual: a sine wave that crosses zero around day 80 (about March 21)
    # and therefore peaks roughly a quarter of a year later.
    day_of_year = dates.dayofyear.to_numpy()
    annual_effect = 1 + 0.1 * np.sin(2 * np.pi * (day_of_year - 80) / 365.25)

    # 3. Promotion effects: (month, day, peak range, day-before range, day-after range)
    promotions = [
        (6, 18, (3.0, 5.0), (1.5, 2.5), (1.2, 1.8)),    # 618
        (11, 11, (5.0, 8.0), (2.0, 4.0), (1.5, 2.5)),   # Double 11
    ]
    event_effect = np.ones(n_days)
    for month, day, peak, warm_up, follow_up in promotions:
        for idx in np.flatnonzero((dates.month == month) & (dates.day == day)):
            # Draw order (peak, day before, day after) is kept fixed so a given
            # seed always produces the same series.
            event_effect[idx] *= np.random.uniform(*peak)
            if idx > 0:
                event_effect[idx - 1] *= np.random.uniform(*warm_up)
            if idx < n_days - 1:
                event_effect[idx + 1] *= np.random.uniform(*follow_up)

    # 4. Random noise (Poisson would suit count data better; a normal
    #    multiplier with 5% standard deviation is used for simplicity).
    noise = np.random.normal(1.0, 0.05, n_days)

    # 5. Combine all components; traffic can never be negative.
    traffic = (base_level + trend) * weekly_effect * annual_effect * event_effect * noise
    traffic = np.maximum(traffic, 0)

    df = pd.DataFrame({
        'date': dates,
        'value': np.round(traffic, 0).astype(int),  # integer metric such as PV/UV
    })

    csv_filename = f'{REPORT_PREFIX}_模拟流量数据.csv'
    df.to_csv(csv_filename, index=False, encoding='utf-8-sig')
    print(f"模拟流量数据已生成并保存至: {csv_filename}")
    return df

# --- 2. 数据预处理与EDA ---

def preprocess_and_eda(df):
    """Prepare the raw traffic table for modelling and save an overview plot.

    Args:
        df: DataFrame with a ``date`` column and a ``value`` column. The
            input is not modified.

    Returns:
        A copy of ``df`` indexed by ``date`` (as datetimes), sorted
        chronologically, with missing ``value`` entries forward-filled.

    Side effects:
        Saves a line chart to ``{REPORT_PREFIX}_流量时间序列图.png``.
    """
    print("\n--- 正在进行数据预处理与EDA ---")
    df_processed = df.copy()
    # Coerce to datetimes so the function also accepts data re-read from CSV,
    # where the date column arrives as strings.
    df_processed['date'] = pd.to_datetime(df_processed['date'])
    df_processed = df_processed.set_index('date').sort_index()

    # Check for missing values
    if df_processed.isnull().values.any():
        print("警告: 发现缺失值，正在进行前向填充...")
        # Assign the result back explicitly: an in-place fillna on a column
        # selection does not reliably update the parent frame, and the
        # ``method=`` argument of fillna is deprecated in favour of ffill().
        df_processed['value'] = df_processed['value'].ffill()

    # Plot the time series
    plt.figure(figsize=(15, 6))
    plt.plot(df_processed.index, df_processed['value'], linewidth=1, label='历史流量')
    plt.title('电商平台日志流量时间序列')
    plt.xlabel('日期')
    plt.ylabel('流量指标 (模拟PV)')
    plt.legend()
    plt.grid(True)
    ts_plot_path = f'{REPORT_PREFIX}_流量时间序列图.png'
    plt.savefig(ts_plot_path, bbox_inches='tight')
    plt.close()
    print(f"流量时间序列图已保存至: {ts_plot_path}")

    return df_processed

# --- 3. ARIMA 模型预测 ---

_FALLBACK_ARIMA_ORDER = (2, 1, 2)  # used when pmdarima is not installed


def _arima_predict(fitted_model, steps):
    """Return a non-negative NumPy array with the next ``steps`` forecasts."""
    if PMDARIMA_AVAILABLE:
        # pmdarima estimators expose predict(n_periods=...), not forecast().
        raw = fitted_model.predict(n_periods=steps)
    else:
        raw = fitted_model.forecast(steps=steps)
    return np.maximum(np.asarray(raw, dtype=float), 0)


def _arima_refit(fitted_model, series):
    """Re-estimate the same model specification on ``series``."""
    if PMDARIMA_AVAILABLE:
        # Refits the order chosen by auto_arima (in place) on the new data.
        return fitted_model.fit(series)
    return ARIMA(series, order=_FALLBACK_ARIMA_ORDER).fit()


def forecast_with_arima(df, forecast_periods):
    """Fit an ARIMA model, evaluate it on a hold-out set and forecast ahead.

    The first 85% of the rows are used for fitting and the rest for
    evaluation. With pmdarima available the (p, d, q) order is chosen by
    ``auto_arima``; otherwise a fixed statsmodels ARIMA(2,1,2) is used.
    After evaluation the model is re-estimated on the full series so that
    the forward forecast really starts after the last observed date.

    Args:
        df: DataFrame indexed by date with a ``value`` column.
        forecast_periods: Number of days to forecast past the end of ``df``.

    Returns:
        dict with keys ``model_name``, ``predictions`` (NumPy array of
        non-negative forecasts), ``future_dates`` (DatetimeIndex aligned with
        ``predictions``), ``metrics`` (MAE / RMSE on the hold-out set, empty
        if there is no hold-out data) and ``plot_path``.

    Side effects:
        Saves a chart to ``{REPORT_PREFIX}_ARIMA预测图.png``.
    """
    print("\n--- 正在使用ARIMA模型进行预测 ---")

    train_size = int(len(df) * 0.85)
    train, test = df.iloc[:train_size], df.iloc[train_size:]
    has_test = len(test) > 0

    # Select the order automatically with auto_arima, or fall back to a fixed one.
    if PMDARIMA_AVAILABLE:
        print("使用auto_arima自动选择ARIMA参数...")
        # seasonal=True would capture more structure but is much slower;
        # kept non-seasonal for simplicity. auto_arima returns a fitted model.
        model_fit = auto_arima(train['value'], start_p=1, max_p=5, start_q=1, max_q=5,
                               d=1, max_d=2, trace=False, error_action='ignore',
                               suppress_warnings=True, stepwise=True, seasonal=False)
        print(f"auto_arima选择的最佳模型: ARIMA{model_fit.order}")
    else:
        print("pmdarima不可用，使用固定参数 ARIMA(2,1,2)...")
        model_fit = ARIMA(train['value'], order=_FALLBACK_ARIMA_ORDER).fit()

    # Evaluate on the hold-out set (only when there is one).
    metrics = {}
    test_forecast = None
    if has_test:
        test_forecast = _arima_predict(model_fit, len(test))
        metrics['MAE'] = mean_absolute_error(test['value'], test_forecast)
        metrics['RMSE'] = np.sqrt(mean_squared_error(test['value'], test_forecast))
        print(f"ARIMA模型在测试集上的表现:")
        for k, v in metrics.items():
            print(f"  - {k}: {v:.2f}")
        # A model fitted on the training split forecasts the test window, not
        # the future. Refit on all observations before forecasting forward.
        model_fit = _arima_refit(model_fit, df['value'])

    # Forecast the future, starting the day after the last observation.
    future_forecast = _arima_predict(model_fit, forecast_periods)
    future_dates = pd.date_range(start=df.index[-1] + timedelta(days=1), periods=forecast_periods, freq='D')

    # Plot history, hold-out forecast and forward forecast.
    plt.figure(figsize=(15, 6))
    plt.plot(df.index, df['value'], label='历史流量', linewidth=1)
    if has_test:
        plt.plot(test.index, test_forecast, label='ARIMA测试集预测', linestyle='--', alpha=0.8)
    plt.plot(future_dates, future_forecast, label=f'ARIMA未来{forecast_periods}天预测', linestyle='-.', marker='o', markersize=4)
    plt.axvline(x=df.index[train_size-1], color='black', linestyle=':', alpha=0.7, label='训练/测试分割点')
    plt.title('ARIMA模型流量预测')
    plt.xlabel('日期')
    plt.ylabel('流量指标 (模拟PV)')
    plt.legend()
    plt.grid(True)
    arima_plot_path = f'{REPORT_PREFIX}_ARIMA预测图.png'
    plt.savefig(arima_plot_path, bbox_inches='tight')
    plt.close()
    print(f"ARIMA预测图已保存至: {arima_plot_path}")

    results = {
        'model_name': 'ARIMA',
        'predictions': future_forecast,
        'future_dates': future_dates,
        'metrics': metrics,
        'plot_path': arima_plot_path
    }
    return results

# --- 4. Prophet 模型预测 ---

def forecast_with_prophet(df, forecast_periods):
    """Fit a Prophet model on the full series and forecast ahead.

    Args:
        df: DataFrame indexed by ``date`` with a ``value`` column.
        forecast_periods: Number of days to forecast past the end of ``df``.

    Returns:
        ``None`` when Prophet is not installed. Otherwise a dict with keys
        ``model_name``, ``forecast_df`` (the last ``forecast_periods`` rows of
        the prediction with columns ``ds``, ``yhat``, ``yhat_lower``,
        ``yhat_upper``, all clipped at zero), ``metrics`` (empty; no hold-out
        evaluation is done here), ``plot_path`` and ``comp_plot_path``.

    Side effects:
        Saves the forecast chart and the component chart as PNG files
        prefixed with ``REPORT_PREFIX``.
    """
    if not PROPHET_AVAILABLE:
        print("Prophet模型跳过，因为库未导入。")
        return None

    print("\n--- 正在使用Prophet模型进行预测 ---")

    # Prophet requires the columns to be named 'ds' (date) and 'y' (target).
    df_prophet = df.reset_index()[['date', 'value']].rename(columns={'date': 'ds', 'value': 'y'})

    # Create and fit the model.
    # NOTE(review): 'auto' lets Prophet decide from the history it is given
    # whether each seasonality is enabled — confirm yearly seasonality is
    # actually switched on for the series passed in.
    model = Prophet(
        daily_seasonality=False,  # data is daily, so no intra-day seasonality
        yearly_seasonality='auto',
        weekly_seasonality='auto',
        seasonality_mode='multiplicative'  # seasonal swings scale with the level
    )
    # Custom holidays could be added here, e.g.:
    # model.add_country_holidays(country_name='CN')

    model.fit(df_prophet)

    # Build the frame of historical + future dates and predict on it.
    future = model.make_future_dataframe(periods=forecast_periods)
    forecast = model.predict(future)

    # Forecast plot. Save through the figure handle rather than relying on
    # whichever figure happens to be "current" in pyplot.
    fig1 = model.plot(forecast, figsize=(15, 6))
    fig1.axes[0].set_title('Prophet模型流量预测')
    prophet_plot_path = f'{REPORT_PREFIX}_Prophet预测图.png'
    fig1.savefig(prophet_plot_path, bbox_inches='tight')
    plt.close(fig1)

    # Component plot (trend, seasonalities, ...).
    fig2 = model.plot_components(forecast, figsize=(12, 10))
    prophet_comp_plot_path = f'{REPORT_PREFIX}_Prophet成分图.png'
    fig2.savefig(prophet_comp_plot_path, bbox_inches='tight')
    plt.close(fig2)
    print(f"Prophet预测图已保存至: {prophet_plot_path}")
    print(f"Prophet成分图已保存至: {prophet_comp_plot_path}")

    # Extract the future rows. Take an explicit copy so the clipping below
    # writes to an independent frame instead of a slice of ``forecast``.
    bound_cols = ['yhat', 'yhat_lower', 'yhat_upper']
    future_forecast_df = forecast[['ds'] + bound_cols].tail(forecast_periods).copy()
    # Traffic cannot be negative. Clip all three columns so the upper bound
    # can never end up below the clipped point forecast / lower bound.
    future_forecast_df[bound_cols] = future_forecast_df[bound_cols].clip(lower=0)

    results = {
        'model_name': 'Prophet',
        'forecast_df': future_forecast_df,
        'metrics': {},  # Prophet has built-in cross-validation; omitted here
        'plot_path': prophet_plot_path,
        'comp_plot_path': prophet_comp_plot_path
    }
    return results

# --- 5. 报告生成 ---

def generate_forecast_report(arima_res, prophet_res, forecast_horizon):
    """Write the final plain-text forecasting report.

    Args:
        arima_res: Result dict from the ARIMA step (keys ``model_name``,
            ``metrics``, ``future_dates``, ``predictions``, ``plot_path``),
            or a falsy value if that step produced nothing.
        prophet_res: Result dict from the Prophet step (keys ``model_name``,
            ``forecast_df``, ``plot_path``, ``comp_plot_path``), or a falsy
            value if that step was skipped.
        forecast_horizon: Number of forecast days, used in the headings.

    Side effects:
        Creates ``{REPORT_PREFIX}_<timestamp>.txt`` (UTF-8) and prints its name.
    """
    print("\n--- 正在生成电商日志时序预测报告 ---")
    from datetime import datetime
    file_stamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    report_filename = f"{REPORT_PREFIX}_{file_stamp}.txt"
    banner = "=" * 60 + "\n"

    with open(report_filename, 'w', encoding='utf-8') as f:
        # Title block
        f.writelines([
            banner,
            "           电商平台日志数据时序预测分析报告\n",
            f"              生成时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n",
            banner,
            "\n",
        ])

        # Section 1: project overview
        f.writelines([
            "--- 1. 项目概述 ---\n",
            "本报告旨在通过对电商平台历史日志数据（模拟流量指标）的分析，预测未来一段时间的流量趋势。\n",
            "此预测可为服务器资源规划、带宽管理、营销活动安排等提供数据支持。\n\n",
        ])

        # Section 2: data overview
        f.writelines([
            "--- 2. 数据概览 ---\n",
            "数据来源: 模拟生成的电商平台日访问量(PV)数据。\n",
            f"数据规模: {NUM_DAYS_HISTORY} 天的历史记录。\n",
            "数据特征: 包含长期增长趋势、周度季节性、年度季节性以及'618'、'双11'等大促事件效应。\n",
            "原始数据已保存为 CSV 文件。\n\n",
        ])

        # Section 3: exploratory data analysis
        f.writelines([
            "--- 3. 探索性数据分析 (EDA) ---\n",
            "对时间序列数据进行了可视化分析，以识别其核心模式：\n",
            "- 趋势: 数据显示总体呈稳定上升趋势。\n",
            "- 季节性: 存在显著的周度（周末高峰）和年度周期性波动。\n",
            "- 事件效应: 可以清晰观察到'618'、'双11'等大促节日带来的流量巨大峰值。\n",
            "流量时间序列图已生成。\n\n",
        ])

        # Section 4: description of the two models
        f.writelines([
            "--- 4. 预测模型 ---\n",
            "采用了两种主流的时间序列预测模型进行建模与对比：\n",
            "1. ARIMA (自回归积分滑动平均模型): \n",
            "   - 一种基于统计学的经典模型。\n",
            "   - 适用于单变量、平稳或可转化为平稳的时间序列。\n",
            "   - 优点是模型可解释性强。\n",
            "2. Prophet (由Facebook开发): \n",
            "   - 一种灵活且强大的模型，特别擅长处理具有强季节性、历史数据缺失和趋势突变的序列。\n",
            "   - 内置对节假日的支持。\n",
            "   - 优点是易于使用且鲁棒性好。\n\n",
        ])

        # Section 5: ARIMA results (or a note that they are missing)
        arima_lines = ["--- 5. ARIMA模型预测结果 ---\n"]
        if not arima_res:
            arima_lines.append("ARIMA模型执行失败或被跳过。\n\n")
        else:
            arima_lines.append(f"模型: {arima_res['model_name']}\n")
            holdout_scores = arima_res['metrics']
            if holdout_scores:
                arima_lines.append("模型评估 (在测试集上):\n")
                arima_lines.extend(
                    f"  - {score_name}: {score:.2f}\n"
                    for score_name, score in holdout_scores.items()
                )
            arima_lines.append(f"未来{forecast_horizon}天预测值预览:\n")
            arima_table = pd.DataFrame({
                'date': arima_res['future_dates'],
                'predicted_value': arima_res['predictions'],
            })
            arima_lines.append(arima_table.round(2).to_string(index=False))
            arima_lines.append(f"\n预测图表: {arima_res['plot_path']}\n\n")
        f.writelines(arima_lines)

        # Section 6: Prophet results (or a note that the model was skipped)
        prophet_lines = ["--- 6. Prophet模型预测结果 ---\n"]
        if not prophet_res:
            prophet_lines.append("Prophet模型因库未导入而被跳过。\n\n")
        else:
            column_labels = {
                'ds': 'date',
                'yhat': 'predicted_value',
                'yhat_lower': 'lower_bound',
                'yhat_upper': 'upper_bound',
            }
            prophet_table = prophet_res['forecast_df'].rename(columns=column_labels)
            prophet_lines.append(f"模型: {prophet_res['model_name']}\n")
            prophet_lines.append(f"未来{forecast_horizon}天预测值预览 (包含不确定性区间):\n")
            prophet_lines.append(prophet_table.round(2).to_string(index=False))
            prophet_lines.append(f"\n预测图表: {prophet_res['plot_path']}\n")
            prophet_lines.append(f"趋势与季节性成分图: {prophet_res['comp_plot_path']}\n\n")
        f.writelines(prophet_lines)

        # Section 7: conclusions, followed by the closing banner
        f.writelines([
            "--- 7. 结论与建议 ---\n",
            "1. 趋势预判: 两种模型均预测未来流量将延续当前的增长趋势。\n",
            "2. 风险管理: 预测区间（尤其是Prophet）提供了不确定性估计，有助于评估风险。\n",
            "3. 资源规划: \n",
            "   - 根据预测的流量峰值（如未来大促期）提前扩容服务器和带宽。\n",
            "   - 在预测流量较低的时期进行系统维护和更新。\n",
            "4. 营销策略: 将营销活动与预测的流量高峰相结合，以最大化效果。\n",
            "5. 模型迭代: 定期使用最新的日志数据重新训练模型，以保持预测的时效性和准确性。\n\n",
            banner,
            "                         报告结束\n",
            banner,
        ])

    print(f"电商日志时序预测报告已生成: {report_filename}")

# --- 主函数 ---

def main():
    """Run the whole pipeline: simulate, preprocess, forecast, report."""
    # Steps 1-2: simulate the raw traffic log, then index and plot it.
    raw_traffic = generate_sample_traffic_data(NUM_DAYS_HISTORY)
    indexed_traffic = preprocess_and_eda(raw_traffic)

    # Steps 3-4: run both forecasters on the same prepared series
    # (ARIMA first, then Prophet).
    forecasts = {
        'arima': forecast_with_arima(indexed_traffic, FORECAST_HORIZON),
        'prophet': forecast_with_prophet(indexed_traffic, FORECAST_HORIZON),
    }

    # Step 5: write the text report from both result sets.
    generate_forecast_report(forecasts['arima'], forecasts['prophet'], FORECAST_HORIZON)

    print("\n电商日志数据时序预测分析流程完成。")


if __name__ == "__main__":
    main()
viplao

487