【Python实践案例】电商平台数据分析和挖掘 – 市场趋势预测

访问量： 2682
开发思路 – 该脚本将包含以下功能：
模拟生成包含时间序列信息的商品销售数据。
进行数据预处理和探索性数据分析（EDA）。
使用ARIMA（自回归积分滑动平均模型）和Prophet两种时间序列预测模型进行销量预测。
评估模型性能并比较预测效果。
生成一份包含分析过程、预测结果和商业洞察的综合报告。
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings("ignore")

# Optional dependency: Prophet. If the import fails, PROPHET_AVAILABLE is set
# to False so the Prophet step can be skipped instead of crashing the script.
try:
    from prophet import Prophet
    PROPHET_AVAILABLE = True
except ImportError:
    print("警告: Prophet库未安装或导入失败。将跳过Prophet模型部分。请运行 'pip install prophet' 安装。")
    PROPHET_AVAILABLE = False

# Optional dependency: pmdarima (auto_arima). If it is missing, fall back to
# the statsmodels ARIMA class. Note that ARIMA is only imported in that
# fallback branch, so the name exists only when PMDARIMA_AVAILABLE is False.
try:
    from pmdarima import auto_arima
    PMDARIMA_AVAILABLE = True
except ImportError:
    print("警告: pmdarima库未安装。ARIMA模型将使用固定参数。请运行 'pip install pmdarima' 安装以获得更好的效果。")
    PMDARIMA_AVAILABLE = False
    from statsmodels.tsa.arima.model import ARIMA # plain statsmodels ARIMA

# --- Configuration ---
NUM_DAYS = 730  # number of daily sales records to simulate (two years)
# Prefix shared by every output file name (CSV, plots, text report).
REPORT_PREFIX = '电商商品趋势预测报告'
# Seed passed to np.random.seed so the simulated data is reproducible.
RANDOM_SEED = 42

# --- 数据生成 ---

def _add_promo_effect(effect, promo_indices, peak_range, warmup_range, tail_range):
    """Add one promotion's uplift to ``effect`` in place.

    For each index in ``promo_indices`` the promotion day is set to a uniform
    draw from ``peak_range``; the day before (warm-up) and the day after
    (follow-up) are increased by draws from ``warmup_range`` / ``tail_range``
    when those days exist inside the array.
    """
    last_index = len(effect) - 1
    for idx in promo_indices:
        effect[idx] = np.random.uniform(*peak_range)
        if idx > 0:
            effect[idx - 1] += np.random.uniform(*warmup_range)
        if idx < last_index:
            effect[idx + 1] += np.random.uniform(*tail_range)


def generate_sample_sales_data(n_days):
    """Generate simulated daily sales with trend, seasonality, promotions and noise.

    The series ends today and covers ``n_days`` consecutive calendar days.
    The result is also written to ``<REPORT_PREFIX>_模拟销售数据.csv``.

    Args:
        n_days: Number of daily records to generate.

    Returns:
        DataFrame with a 'date' column (midnight timestamps) and a 'sales'
        column (non-negative values rounded to whole units).
    """
    print("--- 正在生成模拟商品销售数据 ---")
    np.random.seed(RANDOM_SEED)

    # Anchor on midnight: using the raw current time would stamp every "daily"
    # record with today's wall-clock time (down to microseconds), which then
    # leaks into the CSV, the plots and the report.
    start_date = pd.Timestamp.now().normalize() - timedelta(days=n_days - 1)
    dates = pd.date_range(start=start_date, periods=n_days, freq='D')
    day_index = np.arange(n_days)

    # 1. Long-term linear trend: average growth of 0.5 to 2 units per day.
    trend_slope = np.random.uniform(0.5, 2.0)
    trend = day_index * trend_slope

    # 2. Seasonality: a yearly sine wave plus a fixed weekend uplift.
    annual_seasonality = 50 * np.sin(2 * np.pi * day_index / 365.25)
    weekly_seasonality = np.where(dates.dayofweek.isin([5, 6]), 30, 0)

    # 3. Promotion effects. 618 is processed before 11.11 and each promotion
    #    draws peak -> warm-up -> follow-up, so the random stream is consumed
    #    in a fixed order for a given seed.
    holidays_effect = np.zeros(n_days)
    june_18_indices = np.flatnonzero((dates.month == 6) & (dates.day == 18))
    _add_promo_effect(holidays_effect, june_18_indices,
                      (200, 400), (100, 200), (50, 150))
    nov_11_indices = np.flatnonzero((dates.month == 11) & (dates.day == 11))
    _add_promo_effect(holidays_effect, nov_11_indices,
                      (500, 800), (200, 400), (100, 300))

    # 4. Gaussian noise with mean 0 and standard deviation 10.
    noise = np.random.normal(0, 10, n_days)

    # 5. Combine all components on top of a base level; sales cannot be negative.
    base_level = 200
    sales = base_level + trend + annual_seasonality + weekly_seasonality + holidays_effect + noise
    sales = np.maximum(sales, 0)

    df = pd.DataFrame({
        'date': dates,
        'sales': np.round(sales, 0)
    })

    csv_filename = f'{REPORT_PREFIX}_模拟销售数据.csv'
    # utf-8-sig writes a BOM at the start of the file.
    df.to_csv(csv_filename, index=False, encoding='utf-8-sig')
    print(f"模拟销售数据已生成并保存至: {csv_filename}")
    return df

# --- 数据预处理与EDA ---

def preprocess_and_eda(df):
    """Index the sales data by date, fill gaps and save a time-series plot.

    Args:
        df: DataFrame with a 'date' column and a 'sales' column. It is not
            modified.

    Returns:
        A new DataFrame indexed by 'date' in ascending order, with missing
        'sales' values filled. The plot is written to
        ``<REPORT_PREFIX>_时间序列图.png``.
    """
    print("\n--- 正在进行数据预处理与EDA ---")
    # set_index/sort_index return new objects, so the caller's frame is untouched.
    df_processed = df.set_index('date').sort_index()

    # The check looks at every column, but only 'sales' is filled.
    if df_processed.isnull().sum().sum() > 0:
        print("警告: 发现缺失值，正在进行填充...")
        # Assign the result back instead of filling a column selection in
        # place: the chained in-place form relies on the deprecated
        # fillna(method=...) argument and does not write through to the frame
        # under pandas copy-on-write. bfill covers a gap at the very start of
        # the series, which a forward fill alone cannot reach.
        df_processed['sales'] = df_processed['sales'].ffill().bfill()

    # Plot the daily sales series.
    fig, ax = plt.subplots(figsize=(15, 6))
    ax.plot(df_processed.index, df_processed['sales'], linewidth=1)
    ax.set_title('商品日销量时间序列')
    ax.set_xlabel('日期')
    ax.set_ylabel('销量')
    ax.grid(True)
    ts_plot_path = f'{REPORT_PREFIX}_时间序列图.png'
    fig.savefig(ts_plot_path)
    plt.close(fig)
    print(f"时间序列图已保存至: {ts_plot_path}")

    return df_processed

# --- ARIMA 模型预测 ---

def forecast_with_arima(df, forecast_periods=30):
    """Fit an ARIMA model, score it on a hold-out split and forecast ahead.

    The first 80% of the rows form the training set and the rest the test
    set. The model is fitted on the training set and scored on the test set;
    the same model order is then refitted on the full history to produce the
    forecast for the days following the last observation.

    Args:
        df: DataFrame indexed by date (ascending) with a 'sales' column.
        forecast_periods: Number of days to forecast after the last row.

    Returns:
        dict with keys:
            'model': the string 'ARIMA'.
            'predictions': numpy array of ``forecast_periods`` forecast values.
            'future_dates': DatetimeIndex aligned with 'predictions'.
            'metrics': {'MAE': ..., 'RMSE': ...}, or {} when the test set is empty.
            'plot_path': path of the saved forecast plot.
    """
    print("\n--- 正在使用ARIMA模型进行预测 ---")

    train_size = int(len(df) * 0.8)
    train, test = df.iloc[:train_size], df.iloc[train_size:]
    has_test = len(test) > 0

    if PMDARIMA_AVAILABLE:
        print("使用auto_arima自动选择ARIMA参数...")
        model = auto_arima(train['sales'], seasonal=False, trace=False, error_action='ignore', suppress_warnings=True)
        print(f"auto_arima选择的最佳模型: ARIMA{model.order}")
        # pmdarima models forecast through predict(n_periods=...); they have
        # no forecast() method.
        test_forecast = np.asarray(model.predict(n_periods=len(test))) if has_test else np.array([])
        # Refit the selected specification on the full history so the forecast
        # starts after the last observation instead of after the training split.
        model.fit(df['sales'])
        future_forecast = np.asarray(model.predict(n_periods=forecast_periods))
    else:
        print("pmdarima不可用，使用固定参数 ARIMA(2,1,2)...")
        order = (2, 1, 2)
        # fit() is called exactly once per model: the object it returns is a
        # results object, which cannot be fitted again.
        train_fit = ARIMA(train['sales'], order=order).fit()
        test_forecast = np.asarray(train_fit.forecast(steps=len(test))) if has_test else np.array([])
        full_fit = ARIMA(df['sales'], order=order).fit()
        future_forecast = np.asarray(full_fit.forecast(steps=forecast_periods))

    future_dates = pd.date_range(start=df.index[-1] + timedelta(days=1), periods=forecast_periods, freq='D')

    # Evaluate only when a test split exists.
    if has_test:
        mae = mean_absolute_error(test['sales'], test_forecast)
        rmse = np.sqrt(mean_squared_error(test['sales'], test_forecast))
        print(f"ARIMA模型在测试集上的表现:")
        print(f"  - MAE: {mae:.2f}")
        print(f"  - RMSE: {rmse:.2f}")
        metrics = {'MAE': mae, 'RMSE': rmse}
    else:
        metrics = {}

    # Plot history, hold-out forecast and future forecast on one chart.
    plt.figure(figsize=(15, 6))
    plt.plot(df.index, df['sales'], label='历史销量', linewidth=1)
    if has_test:
        plt.plot(test.index, test_forecast, label='ARIMA测试集预测', linestyle='--', alpha=0.8)
    plt.plot(future_dates, future_forecast, label=f'ARIMA未来{forecast_periods}天预测', linestyle='--', marker='o', markersize=3)
    plt.axvline(x=df.index[train_size-1], color='black', linestyle=':', alpha=0.7, label='训练/测试分割点')
    plt.title('ARIMA模型销量预测')
    plt.xlabel('日期')
    plt.ylabel('销量')
    plt.legend()
    plt.grid(True)
    arima_plot_path = f'{REPORT_PREFIX}_ARIMA预测图.png'
    plt.savefig(arima_plot_path)
    plt.close()
    print(f"ARIMA预测图已保存至: {arima_plot_path}")

    results = {
        'model': 'ARIMA',
        'predictions': future_forecast,
        'future_dates': future_dates,
        'metrics': metrics,
        'plot_path': arima_plot_path
    }
    return results

# --- Prophet 模型预测 ---

def forecast_with_prophet(df, forecast_periods=30):
    """Fit a Prophet model on the full history and forecast ahead.

    Args:
        df: DataFrame indexed by 'date' with a 'sales' column.
        forecast_periods: Number of periods to extend past the history.

    Returns:
        None when Prophet is unavailable; otherwise a dict with keys 'model',
        'forecast_df' (the last ``forecast_periods`` rows of ds / yhat /
        yhat_lower / yhat_upper), 'metrics' (always empty here),
        'plot_path' and 'comp_plot_path'.
    """
    if not PROPHET_AVAILABLE:
        print("Prophet模型跳过，因为库不可用。")
        return None

    print("\n--- 正在使用Prophet模型进行预测 ---")

    # Prophet expects the timestamp column to be 'ds' and the target 'y'.
    history = df.reset_index()[['date', 'sales']]
    history = history.rename(columns={'date': 'ds', 'sales': 'y'})

    # Default configuration; holiday effects could be registered before
    # fitting, e.g. with add_country_holidays(country_name='CN').
    prophet_model = Prophet()
    prophet_model.fit(history)

    # Predict over the history plus the requested horizon.
    horizon_frame = prophet_model.make_future_dataframe(periods=forecast_periods)
    prediction = prophet_model.predict(horizon_frame)

    # Forecast chart.
    forecast_fig = prophet_model.plot(prediction)
    plt.title('Prophet模型销量预测')
    prophet_plot_path = f'{REPORT_PREFIX}_Prophet预测图.png'
    plt.savefig(prophet_plot_path)
    plt.close(forecast_fig)

    # Component chart.
    components_fig = prophet_model.plot_components(prediction)
    prophet_comp_plot_path = f'{REPORT_PREFIX}_Prophet成分图.png'
    plt.savefig(prophet_comp_plot_path)
    plt.close(components_fig)

    for message in (
        f"Prophet预测图已保存至: {prophet_plot_path}",
        f"Prophet成分图已保存至: {prophet_comp_plot_path}",
    ):
        print(message)

    # Keep only the rows that belong to the forecast horizon.
    wanted_columns = ['ds', 'yhat', 'yhat_lower', 'yhat_upper']
    horizon_rows = prediction.tail(forecast_periods)[wanted_columns]

    return {
        'model': 'Prophet',
        'forecast_df': horizon_rows,
        'metrics': {},  # no hold-out evaluation is performed for Prophet
        'plot_path': prophet_plot_path,
        'comp_plot_path': prophet_comp_plot_path,
    }

# --- 报告生成 ---

def _arima_report_section(arima_res):
    """Return the text fragments of the ARIMA results section."""
    parts = ["--- 5. ARIMA模型预测结果 ---\n"]
    if not arima_res:
        parts.append("ARIMA模型执行失败或被跳过。\n\n")
        return parts

    parts.append(f"模型: {arima_res['model']}\n")
    if arima_res['metrics']:
        parts.append("模型评估 (在测试集上):\n")
        parts.extend(
            f"  - {metric}: {value:.2f}\n"
            for metric, value in arima_res['metrics'].items()
        )
    # Report the horizon that was actually forecast, not a fixed "30".
    horizon = len(arima_res['future_dates'])
    parts.append(f"未来{horizon}天预测值预览:\n")
    preview = pd.DataFrame({'date': arima_res['future_dates'], 'predicted_sales': arima_res['predictions']})
    parts.append(preview.head(10).round(2).to_string(index=False))
    parts.append(f"\n预测图表: {arima_res['plot_path']}\n\n")
    return parts


def _prophet_report_section(prophet_res):
    """Return the text fragments of the Prophet results section."""
    parts = ["--- 6. Prophet模型预测结果 ---\n"]
    if not prophet_res:
        parts.append("Prophet模型因库不可用而被跳过。\n\n")
        return parts

    parts.append(f"模型: {prophet_res['model']}\n")
    # Report the horizon that was actually forecast, not a fixed "30".
    horizon = len(prophet_res['forecast_df'])
    parts.append(f"未来{horizon}天预测值预览 (包含置信区间):\n")
    forecast_summary = prophet_res['forecast_df'][['ds', 'yhat', 'yhat_lower', 'yhat_upper']].rename(columns={'ds': 'date', 'yhat': 'predicted_sales', 'yhat_lower': 'lower_bound', 'yhat_upper': 'upper_bound'})
    parts.append(forecast_summary.head(10).round(2).to_string(index=False))
    parts.append(f"\n预测图表: {prophet_res['plot_path']}\n")
    parts.append(f"趋势与季节性成分图: {prophet_res['comp_plot_path']}\n\n")
    return parts


def generate_trend_forecast_report(arima_res, prophet_res):
    """Write the plain-text trend forecast report.

    Args:
        arima_res: Result dict from the ARIMA step ('model', 'predictions',
            'future_dates', 'metrics', 'plot_path'), or a falsy value when
            that step produced nothing.
        prophet_res: Result dict from the Prophet step ('model',
            'forecast_df', 'plot_path', 'comp_plot_path'), or a falsy value
            when Prophet was skipped.

    The report is written as UTF-8 to
    ``<REPORT_PREFIX>_<YYYYmmdd_HHMMSS>.txt``; nothing is returned.
    """
    print("\n--- 正在生成商品市场趋势预测报告 ---")
    # Take the timestamp once so the file name and the header always agree.
    generated_at = datetime.now()
    report_filename = f"{REPORT_PREFIX}_{generated_at.strftime('%Y%m%d_%H%M%S')}.txt"
    rule = "=" * 50 + "\n"

    parts = [
        rule,
        "        电商平台商品市场趋势预测报告\n",
        f"        生成时间: {generated_at.strftime('%Y-%m-%d %H:%M:%S')}\n",
        rule + "\n",

        "--- 1. 项目概述 ---\n",
        "本项目旨在通过对电商平台特定商品的历史销售数据进行分析，预测其未来的市场趋势。\n",
        "这有助于库存管理、营销资源分配和制定销售策略。\n\n",

        "--- 2. 数据概览 ---\n",
        "数据来源: 模拟生成的电商平台商品日销量数据。\n",
        "数据规模: 2年 (730天) 的日销量记录。\n",
        "数据特征: 包含长期增长趋势、年度季节性、周度波动和节假日效应。\n",
        "原始数据已保存为 CSV 文件。\n\n",

        "--- 3. 探索性数据分析 (EDA) ---\n",
        "对时间序列数据进行了可视化分析，以识别其模式和特征。\n",
        "- 趋势: 数据显示总体呈上升趋势。\n",
        "- 季节性: 存在明显的年度和周度季节性模式。\n",
        "- 周期性事件: 可以观察到类似'618'、'双11'等大促节日带来的销量高峰。\n",
        "时间序列图已生成。\n\n",

        "--- 4. 预测模型 ---\n",
        "采用了两种主流的时间序列预测模型进行对比分析：\n",
        "1. ARIMA (自回归积分滑动平均模型): 一种经典的统计模型，适用于单变量时间序列。\n",
        "2. Prophet (由Facebook开发): 一种灵活的模型，能较好地处理缺失数据、趋势变化和节假日效应。\n\n",
    ]
    parts.extend(_arima_report_section(arima_res))
    parts.extend(_prophet_report_section(prophet_res))
    parts.extend([
        "--- 7. 商业洞察与建议 ---\n",
        "1. 库存管理: 根据预测的销量高峰（如节假日）提前备货，避免缺货。\n",
        "2. 营销规划: 在预测销量较低的时期策划促销活动，以提升销量。\n",
        "3. 资源分配: 将更多营销预算投入到预测销量增长的时期。\n",
        "4. 模型迭代: 定期用最新的销售数据重新训练模型，以保持预测的准确性。\n",
        "5. 风险预警: 监控实际销量与预测值的偏差，及时发现市场异常变化。\n\n",

        rule,
        "                    报告结束\n",
        rule,
    ])

    # Build the whole report first, then write it in a single call.
    with open(report_filename, 'w', encoding='utf-8') as f:
        f.write("".join(parts))

    print(f"商品市场趋势预测报告已生成: {report_filename}")

# --- 主函数 ---

def main():
    """Run the pipeline: generate data, preprocess, forecast, write the report."""
    # Step 1: simulate the raw daily sales.
    raw_sales = generate_sample_sales_data(NUM_DAYS)

    # Step 2: index by date, fill gaps and save the overview plot.
    indexed_sales = preprocess_and_eda(raw_sales)

    # Step 3: ARIMA forecast.
    arima_outcome = forecast_with_arima(indexed_sales)

    # Step 4: Prophet forecast, attempted only when the library was imported.
    if PROPHET_AVAILABLE:
        prophet_outcome = forecast_with_prophet(indexed_sales)
    else:
        prophet_outcome = None

    # Step 5: combine both results into the text report.
    generate_trend_forecast_report(arima_outcome, prophet_outcome)

    print("\n商品市场趋势预测分析流程完成。")


if __name__ == "__main__":
    main()
viplao

521