【Python10年经验总结】第五课 电商平台销售数据分析实践分解 – 趋势预估(Trend Forecasting)
常见用的预估场景:
- 使用线性回归预测下月销售额
- 使用ARIMA模型预测未来一周销量
- 构建时间序列预测节假日效应
- 使用指数平滑法预测单品销量
- 构建季节性分解模型(STL)
- 使用XGBoost预测不同品类的增长趋势
- 构建多维时间序列预测(按区域+品类)
- 预测促销期间的订单波动
- 预测库存需求以支持补货决策
- 使用Prophet预测年度趋势变化
我们将展示如何进行这些常见的趋势预估任务。为了演示这些方法,我们需要使用一些常用的时间序列分析库,如 statsmodels
、pmdarima
、xgboost
和 fbprophet
(现在称为 prophet
)。首先,确保你已经安装了这些库:
pip install statsmodels pmdarima xgboost prophet
接下来,我们将创建一个示例DataFrame来模拟原始数据,并逐步应用这些趋势预估任务。
创建示例数据
import pandas as pd
import numpy as np
# 创建示例时间序列数据
dates = pd.date_range(start='2023-01-01', end='2025-06-30', freq='D')
np.random.seed(42)
sales_data = np.cumsum(np.random.normal(loc=100, scale=20, size=len(dates)))
data = {
'order_date': dates,
'amount': sales_data
}
df = pd.DataFrame(data)
# 添加其他字段以支持多维预测
categories = ['C1', 'C2', 'C3', 'C4']
regions = ['Beijing', 'Shanghai', 'Guangzhou', 'Shenzhen']
df['category_code'] = np.random.choice(categories, len(dates))
df['region'] = np.random.choice(regions, len(dates))
print("原始数据:")
print(df.head())
1. 使用线性回归预测下月销售额
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
# 提取年份和月份作为特征
df['year'] = df['order_date'].dt.year
df['month'] = df['order_date'].dt.month
# 准备特征和目标变量
X = df[['year', 'month']]
y = df['amount']
# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
# 训练线性回归模型
model_lr = LinearRegression()
model_lr.fit(X_train, y_train)
# 预测下个月的销售额
next_month = pd.DataFrame({'year': [2025], 'month': [7]})
predicted_sales_lr = model_lr.predict(next_month)
print("\n使用线性回归预测下月销售额:")
print(predicted_sales_lr[0])
2. 使用ARIMA模型预测未来一周销量
import pandas as pd
import numpy as np
from statsmodels.tsa.arima.model import ARIMA
# 按天聚合销售额
daily_sales = df.groupby('order_date')['amount'].sum().reset_index()
# 设置时间为索引
daily_sales.set_index('order_date', inplace=True)
# 拟合ARIMA模型
model_arima = ARIMA(daily_sales, order=(5, 1, 0)) # 这里使用 (5, 1, 0) 作为示例参数
model_arima_fit = model_arima.fit()
# 预测未来一周的销售额
forecast_arima = model_arima_fit.forecast(steps=7)
print("\n使用ARIMA模型预测未来一周销量:")
print(forecast_arima)
3. 构建时间序列预测节假日效应
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
# 添加节假日标志
holidays = pd.to_datetime([
'2023-01-01', '2023-02-22', '2023-04-05', '2023-05-01', '2023-10-01',
'2024-01-01', '2024-02-10', '2024-04-04', '2024-05-01', '2024-10-01',
'2025-01-01', '2025-02-19', '2025-04-04', '2025-05-01', '2025-10-01'
])
df['is_holiday'] = df['order_date'].isin(holidays).astype(int)
# 拟合SARIMA模型
model_sarima = SARIMAX(daily_sales, exog=df['is_holiday'], order=(5, 1, 0), seasonal_order=(1, 1, 1, 12))
model_sarima_fit = model_sarima.fit()
# 预测未来一个月的销售额
future_dates = pd.date_range(start=daily_sales.index[-1] + pd.Timedelta(days=1), periods=30)
future_exog = df.loc[future_dates]['is_holiday'].values
forecast_sarima = model_sarima_fit.get_forecast(steps=30, exog=future_exog)
print("\n构建时间序列预测节假日效应:")
print(forecast_sarima.predicted_mean)
4. 使用指数平滑法预测单品销量
import pandas as pd
import numpy as np
from statsmodels.tsa.holtwinters import ExponentialSmoothing
# 按产品ID聚合销售额
product_sales = df.groupby(['order_date', 'product_id'])['amount'].sum().unstack(fill_value=0)
# 选择一个产品ID进行预测
selected_product = product_sales.columns[0]
product_series = product_sales[selected_product]
# 拟合指数平滑模型
model_es = ExponentialSmoothing(product_series, trend='add', seasonal=None)
model_es_fit = model_es.fit()
# 预测未来一个月的销售额
forecast_es = model_es_fit.forecast(steps=30)
print("\n使用指数平滑法预测单品销量:")
print(forecast_es)
5. 构建季节性分解模型(STL)
import pandas as pd
import numpy as np
from statsmodels.tsa.seasonal import STL
# 按天聚合销售额
daily_sales = df.groupby('order_date')['amount'].sum().reset_index()
# 设置时间为索引
daily_sales.set_index('order_date', inplace=True)
# 季节性分解
stl = STL(daily_sales, period=365)
res = stl.fit()
print("\n构建季节性分解模型(STL):")
print(res.summary())
6. 使用XGBoost预测不同品类的增长趋势
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error
# 按品类聚合销售额
category_sales = df.groupby(['order_date', 'category_code'])['amount'].sum().unstack(fill_value=0)
# 选择一个品类进行预测
selected_category = category_sales.columns[0]
category_series = category_sales[selected_category].reset_index()
# 准备特征和目标变量
category_series['lag_1'] = category_series['amount'].shift(1)
category_series['lag_7'] = category_series['amount'].shift(7)
category_series.dropna(inplace=True)
X = category_series[['lag_1', 'lag_7']]
y = category_series['amount']
# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
# 训练XGBoost模型
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
params = {'objective': 'reg:squarederror'}
bst = xgb.train(params, dtrain, num_boost_round=100)
# 预测未来一个月的销售额
future_lag_1 = category_series['amount'].iloc[-1]
future_lag_7 = category_series['amount'].iloc[-7:]
future_lag_7_avg = future_lag_7.mean()
future_X = pd.DataFrame({'lag_1': [future_lag_1], 'lag_7': [future_lag_7_avg]})
future_dmatrix = xgb.DMatrix(future_X)
predicted_sales_xgb = bst.predict(future_dmatrix)
print("\n使用XGBoost预测不同品类的增长趋势:")
print(predicted_sales_xgb[0])
7. 构建多维时间序列预测(按区域+品类)
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error
# 按区域和品类聚合销售额
region_category_sales = df.groupby(['order_date', 'region', 'category_code'])['amount'].sum().unstack(fill_value=0)
# 选择一个组合进行预测
selected_region = region_category_sales.columns.levels[0][0]
selected_category = region_category_sales.columns.levels[1][0]
region_category_series = region_category_sales[(selected_region, selected_category)].reset_index()
# 准备特征和目标变量
region_category_series['lag_1'] = region_category_series['amount'].shift(1)
region_category_series['lag_7'] = region_category_series['amount'].shift(7)
region_category_series.dropna(inplace=True)
X = region_category_series[['lag_1', 'lag_7']]
y = region_category_series['amount']
# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
# 训练XGBoost模型
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
params = {'objective': 'reg:squarederror'}
bst = xgb.train(params, dtrain, num_boost_round=100)
# 预测未来一个月的销售额
future_lag_1 = region_category_series['amount'].iloc[-1]
future_lag_7 = region_category_series['amount'].iloc[-7:]
future_lag_7_avg = future_lag_7.mean()
future_X = pd.DataFrame({'lag_1': [future_lag_1], 'lag_7': [future_lag_7_avg]})
future_dmatrix = xgb.DMatrix(future_X)
predicted_sales_mc = bst.predict(future_dmatrix)
print("\n构建多维时间序列预测(按区域+品类):")
print(predicted_sales_mc[0])
8. 预测促销期间的订单波动
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error
# 准备特征和目标变量
df['lag_1'] = df['amount'].shift(1)
df['lag_7'] = df['amount'].shift(7)
df.dropna(inplace=True)
X = df[['lag_1', 'lag_7', 'promotion']]
y = df['amount']
# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
# 训练XGBoost模型
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
params = {'objective': 'reg:squarederror'}
bst = xgb.train(params, dtrain, num_boost_round=100)
# 预测未来一个月的销售额
future_lag_1 = df['amount'].iloc[-1]
future_lag_7 = df['amount'].iloc[-7:]
future_lag_7_avg = future_lag_7.mean()
future_promotion = 1 # 假设未来是促销期
future_X = pd.DataFrame({
'lag_1': [future_lag_1],
'lag_7': [future_lag_7_avg],
'promotion': [future_promotion]
})
future_dmatrix = xgb.DMatrix(future_X)
predicted_sales_promo = bst.predict(future_dmatrix)
print("\n预测促销期间的订单波动:")
print(predicted_sales_promo[0])
9. 预测库存需求以支持补货决策
import pandas as pd
import numpy as np
from statsmodels.tsa.holtwinters import ExponentialSmoothing
# 按产品ID聚合销售额
product_sales = df.groupby(['order_date', 'product_id'])['amount'].sum().unstack(fill_value=0)
# 选择一个产品ID进行预测
selected_product = product_sales.columns[0]
product_series = product_sales[selected_product]
# 拟合指数平滑模型
model_es = ExponentialSmoothing(product_series, trend='add', seasonal=None)
model_es_fit = model_es.fit()
# 预测未来一个月的销售额
forecast_es = model_es_fit.forecast(steps=30)
# 假设库存周转率为30天
inventory_turnover_days = 30
average_daily_sales = forecast_es.mean()
predicted_inventory_demand = average_daily_sales * inventory_turnover_days
print("\n预测库存需求以支持补货决策:")
print(predicted_inventory_demand)
10. 使用Prophet预测年度趋势变化
import pandas as pd
import numpy as np
from fbprophet import Prophet
# 按天聚合销售额
daily_sales = df.groupby('order_date')['amount'].sum().reset_index()
# 重命名列以符合Prophet的要求
daily_sales.rename(columns={'order_date': 'ds', 'amount': 'y'}, inplace=True)
# 初始化并拟合Prophet模型
model_prophet = Prophet()
model_prophet.fit(daily_sales)
# 创建未来一年的日期范围
future_dates = model_prophet.make_future_dataframe(periods=365)
# 预测未来一年的销售额
forecast_prophet = model_prophet.predict(future_dates)
print("\n使用Prophet预测年度趋势变化:")
print(forecast_prophet[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail())
综合以上步骤,最终的趋势预估结果如下:
这段代码展示了从原始数据到经过全面趋势预估的数据的过程。你可以根据实际需求调整每一步的操作。
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.seasonal import STL
import xgboost as xgb
from fbprophet import Prophet
# 创建示例时间序列数据
dates = pd.date_range(start='2023-01-01', end='2025-06-30', freq='D')
np.random.seed(42)
sales_data = np.cumsum(np.random.normal(loc=100, scale=20, size=len(dates)))
data = {
'order_date': dates,
'amount': sales_data
}
df = pd.DataFrame(data)
# 添加其他字段以支持多维预测
categories = ['C1', 'C2', 'C3', 'C4']
regions = ['Beijing', 'Shanghai', 'Guangzhou', 'Shenzhen']
df['category_code'] = np.random.choice(categories, len(dates))
df['region'] = np.random.choice(regions, len(dates))
# 提取年份和月份作为特征
df['year'] = df['order_date'].dt.year
df['month'] = df['order_date'].dt.month
# 准备特征和目标变量
X = df[['year', 'month']]
y = df['amount']
# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
# 训练线性回归模型
model_lr = LinearRegression()
model_lr.fit(X_train, y_train)
# 预测下个月的销售额
next_month = pd.DataFrame({'year': [2025], 'month': [7]})
predicted_sales_lr = model_lr.predict(next_month)
print("使用线性回归预测下月销售额:")
print(predicted_sales_lr[0])
# 按天聚合销售额
daily_sales = df.groupby('order_date')['amount'].sum().reset_index()
# 设置时间为索引
daily_sales.set_index('order_date', inplace=True)
# 拟合ARIMA模型
model_arima = ARIMA(daily_sales, order=(5, 1, 0)) # 这里使用 (5, 1, 0) 作为示例参数
model_arima_fit = model_arima.fit()
# 预测未来一周的销售额
forecast_arima = model_arima_fit.forecast(steps=7)
print("\n使用ARIMA模型预测未来一周销量:")
print(forecast_arima)
# 添加节假日标志
holidays = pd.to_datetime([
'2023-01-01', '2023-02-22', '2023-04-05', '2023-05-01', '2023-10-01',
'2024-01-01', '2024-02-10', '2024-04-04', '2024-05-01', '2024-10-01',
'2025-01-01', '2025-02-19', '2025-04-04', '2025-05-01', '2025-10-01'
])
df['is_holiday'] = df['order_date'].isin(holidays).astype(int)
# 拟合SARIMA模型
model_sarima = SARIMAX(daily_sales, exog=df['is_holiday'], order=(5, 1, 0), seasonal_order=(1, 1, 1, 12))
model_sarima_fit = model_sarima.fit()
# 预测未来一个月的销售额
future_dates = pd.date_range(start=daily_sales.index[-1] + pd.Timedelta(days=1), periods=30)
future_exog = df.loc[future_dates]['is_holiday'].values
forecast_sarima = model_sarima_fit.get_forecast(steps=30, exog=future_exog)
print("\n构建时间序列预测节假日效应:")
print(forecast_sarima.predicted_mean)
# 按产品ID聚合销售额
product_sales = df.groupby(['order_date', 'product_id'])['amount'].sum().unstack(fill_value=0)
# 选择一个产品ID进行预测
selected_product = product_sales.columns[0]
product_series = product_sales[selected_product]
# 拟合指数平滑模型
model_es = ExponentialSmoothing(product_series, trend='add', seasonal=None)
model_es_fit = model_es.fit()
# 预测未来一个月的销售额
forecast_es = model_es_fit.forecast(steps=30)
print("\n使用指数平滑法预测单品销量:")
print(forecast_es)
# 季节性分解
stl = STL(daily_sales, period=365)
res = stl.fit()
print("\n构建季节性分解模型(STL):")
print(res.summary())
# 按品类聚合销售额
category_sales = df.groupby(['order_date', 'category_code'])['amount'].sum().unstack(fill_value=0)
# 选择一个品类进行预测
selected_category = category_sales.columns[0]
category_series = category_sales[selected_category].reset_index()
# 准备特征和目标变量
category_series['lag_1'] = category_series['amount'].shift(1)
category_series['lag_7'] = category_series['amount'].shift(7)
category_series.dropna(inplace=True)
X = category_series[['lag_1', 'lag_7']]
y = category_series['amount']
# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
# 训练XGBoost模型
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
params = {'objective': 'reg:squarederror'}
bst = xgb.train(params, dtrain, num_boost_round=100)
# 预测未来一个月的销售额
future_lag_1 = category_series['amount'].iloc[-1]
future_lag_7 = category_series['amount'].iloc[-7:]
future_lag_7_avg = future_lag_7.mean()
future_X = pd.DataFrame({'lag_1': [future_lag_1], 'lag_7': [future_lag_7_avg]})
future_dmatrix = xgb.DMatrix(future_X)
predicted_sales_xgb = bst.predict(future_dmatrix)
print("\n使用XGBoost预测不同品类的增长趋势:")
print(predicted_sales_xgb[0])
# 按区域和品类聚合销售额
region_category_sales = df.groupby(['order_date', 'region', 'category_code'])['amount'].sum().unstack(fill_value=0)
# 选择一个组合进行预测
selected_region = region_category_sales.columns.levels[0][0]
selected_category = region_category_sales.columns.levels[1][0]
region_category_series = region_category_sales[(selected_region, selected_category)].reset_index()
# 准备特征和目标变量
region_category_series['lag_1'] = region_category_series['amount'].shift(1)
region_category_series['lag_7'] = region_category_series['amount'].shift(7)
region_category_series.dropna(inplace=True)
X = region_category_series[['lag_1', 'lag_7']]
y = region_category_series['amount']
# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
# 训练XGBoost模型
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
params = {'objective': 'reg:squarederror'}
bst = xgb.train(params, dtrain, num_boost_round=100)
# 预测未来一个月的销售额
future_lag_1 = region_category_series['amount'].iloc[-1]
future_lag_7 = region_category_series['amount'].iloc[-7:]
future_lag_7_avg = future_lag_7.mean()
future_X = pd.DataFrame({'lag_1': [future_lag_1], 'lag_7': [future_lag_7_avg]})
future_dmatrix = xgb.DMatrix(future_X)
predicted_sales_mc = bst.predict(future_dmatrix)
print("\n构建多维时间序列预测(按区域+品类):")
print(predicted_sales_mc[0])
# 准备特征和目标变量
df['lag_1'] = df['amount'].shift(1)
df['lag_7'] = df['amount'].shift(7)
df.dropna(inplace=True)
X = df[['lag_1', 'lag_7', 'promotion']]
y = df['amount']
# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
# 训练XGBoost模型
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
params = {'objective': 'reg:squarederror'}
bst = xgb.train(params, dtrain, num_boost_round=100)
# 预测未来一个月的销售额
future_lag_1 = df['amount'].iloc[-1]
future_lag_7 = df['amount'].iloc[-7:]
future_lag_7_avg = future_lag_7.mean()
future_promotion = 1 # 假设未来是促销期
future_X = pd.DataFrame({
'lag_1': [future_lag_1],
'lag_7': [future_lag_7_avg],
'promotion': [future_promotion]
})
future_dmatrix = xgb.DMatrix(future_X)
predicted_sales_promo = bst.predict(future_dmatrix)
print("\n预测促销期间的订单波动:")
print(predicted_sales_promo[0])
# 假设库存周转率为30天
inventory_turnover_days = 30
average_daily_sales = forecast_es.mean()
predicted_inventory_demand = average_daily_sales * inventory_turnover_days
print("\n预测库存需求以支持补货决策:")
print(predicted_inventory_demand)
# 重命名列以符合Prophet的要求
daily_sales.rename(columns={'order_date': 'ds', 'amount': 'y'}, inplace=True)
# 初始化并拟合Prophet模型
model_prophet = Prophet()
model_prophet.fit(daily_sales)
# 创建未来一年的日期范围
future_dates = model_prophet.make_future_dataframe(periods=365)
# 预测未来一年的销售额
forecast_prophet = model_prophet.predict(future_dates)
print("\n使用Prophet预测年度趋势变化:")
print(forecast_prophet[['ds', 'yhat', 'yhat_lower', 'yhat_upper']].tail())