【Python10年经验总结】第八课 电商平台销售数据分析实践 -机器学习预测(Machine Learning Forecasting)
工作常用的机器学习预测案例:
- 准备用于预测的特征工程(时间、促销、节假日等)
- 对分类变量进行 One-Hot 编码
- 划分训练集与测试集(按时间切片)
- 使用线性回归预测单品销量
- 使用 XGBoost 预测不同品类的增长趋势
- 使用 LightGBM 构建高维特征的销量预测模型
- 使用随机森林预测新品上市后的表现
- 构建多输出模型同时预测多个商品的销量
- 使用交叉验证评估模型效果(时间序列专用)
- 使用SHAP解释模型预测结果(可解释AI)
下面依次演示这些常见的机器学习预测任务。演示会用到一些常用的数据分析和机器学习库:pandas、numpy、scikit-learn、xgboost、lightgbm 和 shap。首先,让我们创建一个示例 DataFrame 来模拟原始数据,然后逐步完成上述各项预测任务。
创建示例数据
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, TimeSeriesSplit, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
import shap
# Build a synthetic daily sales table: one row per calendar day.
dates = pd.date_range(start='2023-01-01', end='2025-06-30', freq='D')
n_days = len(dates)

# Fixed seed so that every run of the tutorial draws the same numbers.
np.random.seed(42)
# Cumulative sum of normally distributed daily increments (mean 100, std 20).
sales_data = np.random.normal(loc=100, scale=20, size=n_days).cumsum()

# Label pools the random product / category columns are drawn from.
product_pool = [f'P{i}' for i in range(1, 101)]
category_pool = [f'C{i}' for i in range(1, 11)]

# The random columns are drawn in the order they are listed here, and the
# date column becomes the index of the resulting frame.
df = pd.DataFrame({
    'order_date': dates,
    'product_id': np.random.choice(product_pool, n_days),
    'category_code': np.random.choice(category_pool, n_days),
    'amount': sales_data,
    'quantity': np.random.randint(1, 5, size=n_days),
    'promotion': np.random.choice([True, False], n_days),
}).set_index('order_date')

print("原始数据:")
print(df.head())
1. 准备用于预测的特征工程(时间、促销、节假日等)
# Calendar features derived from the DatetimeIndex.
df['year'] = df.index.year
df['month'] = df.index.month
df['day_of_week'] = df.index.dayofweek  # Monday=0 ... Sunday=6

# Holiday flag. Each row of the list holds one year's New Year's Day,
# Spring Festival, Qingming Festival, Labour Day and National Day.
# Spring Festival follows the lunar calendar, so its Gregorian date changes
# every year: 2023-01-22, 2024-02-10, 2025-01-29.
holidays = pd.to_datetime([
    '2023-01-01', '2023-01-22', '2023-04-05', '2023-05-01', '2023-10-01',
    '2024-01-01', '2024-02-10', '2024-04-04', '2024-05-01', '2024-10-01',
    '2025-01-01', '2025-01-29', '2025-04-04', '2025-05-01', '2025-10-01',
])
# 1 when the row's date is one of the listed holidays, otherwise 0.
df['is_holiday'] = df.index.isin(holidays).astype(int)

# Numeric (0/1) copy of the boolean promotion flag.
df['promotion_numeric'] = df['promotion'].astype(int)

print("\n添加特征后的数据:")
print(df.head())
2. 对分类变量进行 One-Hot 编码
# One-hot encode the two categorical columns; every other column of `df`
# is carried over unchanged.
categorical_columns = ['category_code', 'product_id']
df_encoded = pd.get_dummies(df, columns=categorical_columns)

print("\nOne-Hot编码后的数据:")
print(df_encoded.head())
3. 划分训练集与测试集(按时间切片)
# Positional 80/20 split without shuffling: the first 80% of the rows form
# the training set and the remaining rows form the test set.
train_size = int(len(df_encoded) * 0.8)
train = df_encoded.iloc[:train_size]
test = df_encoded.iloc[train_size:]

# `amount` is the regression target; all remaining columns are features.
X_train, y_train = train.drop(columns=['amount']), train['amount']
X_test, y_test = test.drop(columns=['amount']), test['amount']

print("\n训练集和测试集的形状:")
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
4. 使用线性回归预测单品销量
# Baseline model: ordinary least-squares regression fitted on the training set.
lr = LinearRegression().fit(X_train, y_train)

# Predict the held-out test rows and score them with mean squared error.
y_pred_lr = lr.predict(X_test)
mse_lr = mean_squared_error(y_true=y_test, y_pred=y_pred_lr)
print("\n线性回归模型 MSE:", mse_lr)
5. 使用 XGBoost 预测不同品类的增长趋势
# Gradient-boosted trees (XGBoost) with a squared-error objective and a
# fixed seed, fitted on the training set.
xgb_params = {'objective': 'reg:squarederror', 'random_state': 42}
xgb = XGBRegressor(**xgb_params)
xgb.fit(X_train, y_train)

# Predict the held-out test rows and score them with mean squared error.
y_pred_xgb = xgb.predict(X_test)
mse_xgb = mean_squared_error(y_true=y_test, y_pred=y_pred_xgb)
print("\nXGBoost模型 MSE:", mse_xgb)
6. 使用 LightGBM 构建高维特征的销量预测模型
# LightGBM regressor with default hyper-parameters and a fixed seed,
# fitted on the training set.
lgb = LGBMRegressor(random_state=42)
lgb.fit(X=X_train, y=y_train)

# Predict the held-out test rows and score them with mean squared error.
y_pred_lgb = lgb.predict(X_test)
mse_lgb = mean_squared_error(y_true=y_test, y_pred=y_pred_lgb)
print("\nLightGBM模型 MSE:", mse_lgb)
7. 使用随机森林预测新品上市后的表现
# Random forest regressor with default hyper-parameters and a fixed seed,
# fitted on the training set.
rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Predict the held-out test rows and score them with mean squared error.
y_pred_rf = rf.predict(X_test)
mse_rf = mean_squared_error(y_true=y_test, y_pred=y_pred_rf)
print("\n随机森林模型 MSE:", mse_rf)
8. 构建多输出模型同时预测多个商品的销量
这里我们假设每个商品的销量可以作为一个独立的目标变量。为了简化示例,我们将只选择几个商品进行预测。
# Products whose sales are predicted jointly, one target column per product.
target_products = ['P1', 'P2', 'P3']

# Each target column holds the row's `amount` when the row belongs to that
# product and 0 otherwise.  Deriving the columns from `df` row by row keeps
# them the same length and index as the feature table, so no date is
# missing and no NaN reaches the model.  (A pivot_table over only the
# filtered rows cannot be grouped by the full-length df.index and would not
# cover dates without a target product.)
targets = pd.DataFrame({
    product: df['amount'].where(df['product_id'] == product, 0)
    for product in target_products
})

# Feature columns (everything except the single-output target `amount`)
# side by side with the per-product target columns.
df_multi_output = pd.concat([df_encoded.drop(columns=['amount']), targets], axis=1)

# Positional 80/20 split without shuffling, as for the single-output models.
train_size = int(len(df_multi_output) * 0.8)
train, test = df_multi_output.iloc[:train_size], df_multi_output.iloc[train_size:]
X_train_multi = train.drop(columns=target_products)
y_train_multi = train[target_products]
X_test_multi = test.drop(columns=target_products)
y_test_multi = test[target_products]

# One random forest fitted against all target columns at once.
rf_multi = RandomForestRegressor(random_state=42)
rf_multi.fit(X_train_multi, y_train_multi)

# Predict all targets for the test rows and score them with mean squared
# error (a single number across the target columns).
y_pred_rf_multi = rf_multi.predict(X_test_multi)
mse_rf_multi = mean_squared_error(y_test_multi, y_pred_rf_multi)
print("\n多输出随机森林模型 MSE:", mse_rf_multi)
9. 使用交叉验证评估模型效果(时间序列专用)
# Five-split time-series cross-validation on the training set.
tscv = TimeSeriesSplit(n_splits=5)

# Score the XGBoost model on every split.  The scorer returns negated MSE.
cv_scores = cross_val_score(
    xgb,
    X_train,
    y_train,
    cv=tscv,
    scoring='neg_mean_squared_error',
)

# Flip the sign back to a positive MSE and average over the splits.
avg_cv_mse_xgb = -cv_scores.mean()
print("\nXGBoost模型交叉验证 MSE 平均值:", avg_cv_mse_xgb)
10. 使用SHAP解释模型预测结果(可解释AI)
# Explain the fitted XGBoost model with SHAP: build an explainer for the
# model, then compute SHAP values for the test-set features.
explainer = shap.Explainer(xgb)
shap_values = explainer.shap_values(X_test)
# Draw the SHAP summary plot of those values against the test features.
shap.summary_plot(shap_values, X_test)
综合以上步骤,完整的示例代码如下。
这段代码展示了从原始数据出发,经过特征工程、模型训练与评估,再到模型解释的完整机器学习预测流程。你可以根据实际需求调整每一步的操作。
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, TimeSeriesSplit, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
import shap
import matplotlib.pyplot as plt
# ---------------------------------------------------------------------------
# Sample data: one synthetic sales row per calendar day.
# ---------------------------------------------------------------------------
dates = pd.date_range(start='2023-01-01', end='2025-06-30', freq='D')
# Fixed seed so that every run draws the same numbers.
np.random.seed(42)
# Cumulative sum of normally distributed daily increments (mean 100, std 20).
sales_data = np.cumsum(np.random.normal(loc=100, scale=20, size=len(dates)))
data = {
    'order_date': dates,
    'product_id': np.random.choice(['P{}'.format(i) for i in range(1, 101)], len(dates)),
    'category_code': np.random.choice(['C{}'.format(i) for i in range(1, 11)], len(dates)),
    'amount': sales_data,
    'quantity': np.random.randint(1, 5, size=len(dates)),
    'promotion': np.random.choice([True, False], len(dates)),
}
df = pd.DataFrame(data)
# Use the order date as the index.
df.set_index('order_date', inplace=True)

# ---------------------------------------------------------------------------
# Feature engineering: calendar, holiday and promotion features.
# ---------------------------------------------------------------------------
df['year'] = df.index.year
df['month'] = df.index.month
df['day_of_week'] = df.index.dayofweek  # Monday=0 ... Sunday=6

# Each row of the list holds one year's New Year's Day, Spring Festival,
# Qingming Festival, Labour Day and National Day.  Spring Festival follows
# the lunar calendar: 2023-01-22, 2024-02-10, 2025-01-29.
holidays = pd.to_datetime([
    '2023-01-01', '2023-01-22', '2023-04-05', '2023-05-01', '2023-10-01',
    '2024-01-01', '2024-02-10', '2024-04-04', '2024-05-01', '2024-10-01',
    '2025-01-01', '2025-01-29', '2025-04-04', '2025-05-01', '2025-10-01',
])
# 1 when the row's date is one of the listed holidays, otherwise 0.
df['is_holiday'] = df.index.isin(holidays).astype(int)
# Numeric (0/1) copy of the boolean promotion flag.
df['promotion_numeric'] = df['promotion'].astype(int)

# One-hot encode the category code and the product id.
df_encoded = pd.get_dummies(df, columns=['category_code', 'product_id'])

# ---------------------------------------------------------------------------
# Positional 80/20 train/test split without shuffling.
# ---------------------------------------------------------------------------
train_size = int(len(df_encoded) * 0.8)
train, test = df_encoded.iloc[:train_size], df_encoded.iloc[train_size:]
X_train = train.drop(columns=['amount'])
y_train = train['amount']
X_test = test.drop(columns=['amount'])
y_test = test['amount']

# ---------------------------------------------------------------------------
# Single-output models, each scored with MSE on the test set.
# ---------------------------------------------------------------------------
# Linear regression
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
mse_lr = mean_squared_error(y_test, y_pred_lr)
print("\n线性回归模型 MSE:", mse_lr)

# XGBoost
xgb = XGBRegressor(objective='reg:squarederror', random_state=42)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
print("\nXGBoost模型 MSE:", mse_xgb)

# LightGBM
lgb = LGBMRegressor(random_state=42)
lgb.fit(X_train, y_train)
y_pred_lgb = lgb.predict(X_test)
mse_lgb = mean_squared_error(y_test, y_pred_lgb)
print("\nLightGBM模型 MSE:", mse_lgb)

# Random forest
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
print("\n随机森林模型 MSE:", mse_rf)

# ---------------------------------------------------------------------------
# Multi-output model: predict several products' sales at once.
# ---------------------------------------------------------------------------
target_products = ['P1', 'P2', 'P3']
# Each target column holds the row's `amount` when the row belongs to that
# product and 0 otherwise.  Deriving the columns from `df` row by row keeps
# them the same length and index as the feature table, so no date is
# missing and no NaN reaches the model.  (A pivot_table over only the
# filtered rows cannot be grouped by the full-length df.index and would not
# cover dates without a target product.)
targets = pd.DataFrame({
    product: df['amount'].where(df['product_id'] == product, 0)
    for product in target_products
})
# Feature columns side by side with the per-product target columns.
df_multi_output = pd.concat([df_encoded.drop(columns=['amount']), targets], axis=1)

# Positional 80/20 split without shuffling, as above.
train_size = int(len(df_multi_output) * 0.8)
train, test = df_multi_output.iloc[:train_size], df_multi_output.iloc[train_size:]
X_train_multi = train.drop(columns=target_products)
y_train_multi = train[target_products]
X_test_multi = test.drop(columns=target_products)
y_test_multi = test[target_products]

# One random forest fitted against all target columns at once.
rf_multi = RandomForestRegressor(random_state=42)
rf_multi.fit(X_train_multi, y_train_multi)
y_pred_rf_multi = rf_multi.predict(X_test_multi)
mse_rf_multi = mean_squared_error(y_test_multi, y_pred_rf_multi)
print("\n多输出随机森林模型 MSE:", mse_rf_multi)

# ---------------------------------------------------------------------------
# Time-series cross-validation of the XGBoost model on the training set.
# ---------------------------------------------------------------------------
tscv = TimeSeriesSplit(n_splits=5)
# The scorer returns negated MSE, so flip the sign before averaging.
cv_scores = cross_val_score(xgb, X_train, y_train, cv=tscv, scoring='neg_mean_squared_error')
avg_cv_mse_xgb = -np.mean(cv_scores)
print("\nXGBoost模型交叉验证 MSE 平均值:", avg_cv_mse_xgb)

# ---------------------------------------------------------------------------
# Explain the fitted XGBoost model with SHAP.
# ---------------------------------------------------------------------------
explainer = shap.Explainer(xgb)
shap_values = explainer.shap_values(X_test)
# Draw the SHAP summary plot of those values against the test features.
shap.summary_plot(shap_values, X_test)