【Python实践案例】电商平台数据分析和挖掘 – 用户行为分析

发表于： 2025年8月20日 2025年9月13日
标签：电商运营数据分析实践
访问量： 2125
开发思路：该脚本将包含以下功能：
模拟生成包含用户购买、浏览、加购等行为的电商数据。
进行数据预处理和用户行为特征工程。
使用K-Means聚类算法对用户进行细分。
分析并可视化各个用户群体的特征。
生成一份包含分析过程、用户群体画像和运营建议的综合报告。
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings("ignore")

# --- Configuration ---
NUM_USERS = 2000  # number of simulated users passed to the data generator by main()
NUM_PRODUCTS = 500  # number of simulated products passed to the data generator by main()
REPORT_PREFIX = '电商用户行为分析报告'  # filename prefix shared by every CSV / PNG / TXT output
RANDOM_SEED = 42  # seed for NumPy's global RNG and the random_state of KMeans / PCA

# --- 数据生成 ---

def generate_sample_user_behavior_data(n_users, n_products):
    """Simulate an e-commerce action log and derive per-user behavior features.

    Random user profiles (age group, gender, location tier) and 10-100
    view / cart / purchase actions per user are drawn from NumPy's global
    RNG, which is seeded with RANDOM_SEED. The action log is aggregated into
    one row per user, merged with the profile columns, written to
    ``{REPORT_PREFIX}_用户行为特征数据.csv`` and returned.

    Timestamps are offsets from the wall-clock time of the call, so the
    absolute dates differ between runs even though the random draws do not.

    Args:
        n_users: Number of users to simulate; must be at least 1.
        n_products: Number of distinct product ids to draw from; must be at
            least 1.

    Returns:
        pandas.DataFrame with one row per user: the aggregated behavior
        features followed by ``age_group``, ``gender`` and ``location``.

    Raises:
        ValueError: If ``n_users`` or ``n_products`` is smaller than 1.
    """
    if n_users < 1 or n_products < 1:
        raise ValueError("n_users and n_products must both be at least 1")

    print("--- 正在生成模拟用户行为数据 ---")
    np.random.seed(RANDOM_SEED)

    # A single reference time keeps every timestamp and the recency feature
    # consistent with each other instead of re-reading the clock per action.
    now = datetime.now()

    # Simulated user profile attributes.
    user_ids = [f'user_{i}' for i in range(1, n_users + 1)]
    user_info = []
    for user_id in user_ids:
        age_group = np.random.choice(['18-25', '26-35', '36-45', '46-55', '55+'], p=[0.2, 0.3, 0.25, 0.15, 0.1])
        gender = np.random.choice(['Male', 'Female'], p=[0.5, 0.5])
        location = np.random.choice(['Tier_1', 'Tier_2', 'Tier_3'], p=[0.3, 0.4, 0.3])
        user_info.append({'user_id': user_id, 'age_group': age_group, 'gender': gender, 'location': location})
    df_users = pd.DataFrame(user_info)

    # Simulated action log (view / cart / purchase); each user gets between
    # 10 and 100 records.
    data = []
    for user_id in user_ids:
        n_actions = np.random.randint(10, 101)

        for _ in range(n_actions):
            product_id = f'prod_{np.random.randint(1, n_products + 1)}'
            action_type = np.random.choice(['view', 'cart', 'purchase'], p=[0.6, 0.3, 0.1])
            timestamp = now - timedelta(days=np.random.randint(0, 365), seconds=np.random.randint(0, 86400))
            # Simplification: the amount is random per purchase rather than
            # tied to the product; non-purchase actions carry no amount.
            amount = np.random.lognormal(6, 1.2) if action_type == 'purchase' else 0.0

            data.append({
                'user_id': user_id,
                'product_id': product_id,
                'action_type': action_type,
                'timestamp': timestamp,
                'amount': round(amount, 2)
            })

    df_actions = pd.DataFrame(data)

    # --- Per-user feature engineering ---
    print("正在进行用户行为特征工程...")

    # 1. Basic aggregates. A purchase is recognised by a positive amount.
    user_stats = df_actions.groupby('user_id').agg(
        total_actions=('action_type', 'count'),
        total_purchases=('amount', lambda x: (x > 0).sum()),
        total_spent=('amount', 'sum'),
        avg_action_amount=('amount', lambda x: x.sum() / (x > 0).sum() if (x > 0).sum() > 0 else 0),
        first_action_date=('timestamp', 'min'),
        last_action_date=('timestamp', 'max')
    ).reset_index()

    # 2. Derived features. tenure_days is at least 1, so the per-day rates
    #    never divide by zero.
    user_stats['tenure_days'] = (user_stats['last_action_date'] - user_stats['first_action_date']).dt.days + 1
    user_stats['purchase_freq'] = user_stats['total_purchases'] / user_stats['tenure_days']
    user_stats['activity_freq'] = user_stats['total_actions'] / user_stats['tenure_days']
    # Users without purchases divide by 1 instead of 0.
    user_stats['avg_order_value'] = user_stats['total_spent'] / user_stats['total_purchases'].replace(0, 1)

    # Cart counts are indexed by user_id while user_stats has a positional
    # index, so they are mapped through the user_id column; dividing the two
    # Series directly would align on mismatched indexes and yield only NaN.
    cart_counts = df_actions.loc[df_actions['action_type'] == 'cart'].groupby('user_id').size()
    carts_per_user = user_stats['user_id'].map(cart_counts).fillna(0)
    # +1 in the denominator avoids division by zero for users with no carts.
    user_stats['cart_to_purchase_rate'] = user_stats['total_purchases'] / (carts_per_user + 1)

    # 3. Recency: whole days between the last action and the reference time.
    user_stats['recency_days'] = (now - user_stats['last_action_date']).dt.days

    # Attach the profile columns.
    df_final = user_stats.merge(df_users, on='user_id', how='left')

    csv_filename = f'{REPORT_PREFIX}_用户行为特征数据.csv'
    df_final.to_csv(csv_filename, index=False, encoding='utf-8-sig')
    print(f"用户行为特征数据已生成并保存至: {csv_filename}")
    return df_final

# --- 数据预处理 ---

def preprocess_data(df, include_profile_features=False):
    """Build the standardized feature matrix used for clustering.

    Args:
        df: Per-user feature table containing the behavior columns listed in
            ``feature_columns`` below (and ``age_group`` / ``gender`` /
            ``location`` when ``include_profile_features`` is true).
        include_profile_features: When true, label-encode the three profile
            columns and append them to the clustering features. Defaults to
            False, i.e. clustering uses behavior features only.

    Returns:
        Tuple ``(X_scaled_df, scaler, feature_columns)``: the standardized
        features as a DataFrame, the fitted StandardScaler, and the list of
        column names that were used.
    """
    print("\n--- 正在进行数据预处理 ---")
    # Work on a copy so the caller's frame is left untouched.
    df_processed = df.copy()

    # Behavior features that describe user value and activity patterns.
    feature_columns = [
        'total_actions', 'total_purchases', 'total_spent', 'avg_action_amount',
        'tenure_days', 'purchase_freq', 'activity_freq', 'avg_order_value',
        'cart_to_purchase_rate', 'recency_days'
    ]

    # Profile features are opt-in. Previously they were always encoded but
    # the result was discarded, because only the copy held the new columns.
    if include_profile_features:
        for column in ('age_group', 'gender', 'location'):
            encoded_name = f'{column}_encoded'
            df_processed[encoded_name] = LabelEncoder().fit_transform(df_processed[column])
            feature_columns.append(encoded_name)

    # Missing values are replaced by 0 before scaling.
    X = df_processed[feature_columns].fillna(0)

    # Standardize so that no feature dominates the K-Means distance purely
    # because of its scale.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_scaled_df = pd.DataFrame(X_scaled, columns=feature_columns)

    print(f"预处理完成。特征矩阵形状: {X_scaled_df.shape}")
    return X_scaled_df, scaler, feature_columns

# --- 用户聚类分析 ---

def perform_user_segmentation(X_scaled, max_clusters=10):
    """Pick the number of clusters by silhouette score and cluster the users.

    K-Means is fitted for every K from 2 up to ``max_clusters`` (capped at
    one less than the number of samples). Inertia and silhouette score per K
    are printed and plotted to ``{REPORT_PREFIX}_最优K值分析.png``. The K
    with the highest silhouette score wins; on ties the smallest K is kept.

    Args:
        X_scaled: Standardized feature matrix, one row per user.
        max_clusters: Largest K to evaluate.

    Returns:
        Tuple ``(cluster_labels, optimal_k, kmeans_final)``: the label of
        each row, the selected K, and the fitted KMeans model for that K.

    Raises:
        ValueError: If no K >= 2 can be evaluated for the given input.
    """
    print("\n--- 正在执行用户聚类分析 ---")

    # The silhouette score needs at least 2 clusters and fewer clusters than
    # samples, so fail early instead of producing an empty plot first.
    largest_k = min(max_clusters, len(X_scaled) - 1)
    if largest_k < 2:
        raise ValueError(
            "perform_user_segmentation needs max_clusters >= 2 and at least 3 samples"
        )

    # 1. Search for the best K (elbow curve and silhouette score).
    print("寻找最优聚类数 K...")
    inertias = []
    silhouette_scores = []
    K_range = range(2, largest_k + 1)

    best_model = None
    best_score = None
    for k in K_range:
        kmeans = KMeans(n_clusters=k, random_state=RANDOM_SEED, n_init=10)
        kmeans.fit(X_scaled)
        inertias.append(kmeans.inertia_)
        score = silhouette_score(X_scaled, kmeans.labels_)
        silhouette_scores.append(score)
        print(f"  K={k}, Inertia={kmeans.inertia_:.2f}, Silhouette Score={score:.3f}")
        # Strict '>' keeps the first (smallest) K on ties, like np.argmax.
        if best_score is None or score > best_score:
            best_score = score
            best_model = kmeans

    # Plot inertia (left axis) and silhouette score (right axis) against K.
    fig, ax1 = plt.subplots(figsize=(10, 6))

    color = 'tab:blue'
    ax1.set_xlabel('聚类数 K')
    ax1.set_ylabel('簇内平方和 (Inertia)', color=color)
    ax1.plot(K_range, inertias, marker='o', color=color, label='Inertia')
    ax1.tick_params(axis='y', labelcolor=color)

    ax2 = ax1.twinx()
    color = 'tab:red'
    ax2.set_ylabel('轮廓系数 (Silhouette Score)', color=color)
    ax2.plot(K_range, silhouette_scores, marker='s', color=color, label='Silhouette Score')
    ax2.tick_params(axis='y', labelcolor=color)

    fig.tight_layout()
    plt.title('肘部法则 & 轮廓系数 vs 聚类数 K')
    plt.xticks(K_range)
    plt.grid(True)
    # One combined legend for both axes, placed below the chart.
    lines, labels = ax1.get_legend_handles_labels()
    lines2, labels2 = ax2.get_legend_handles_labels()
    ax2.legend(lines + lines2, labels + labels2, loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=2)

    k_plot_path = f'{REPORT_PREFIX}_最优K值分析.png'
    plt.savefig(k_plot_path, bbox_inches='tight')
    plt.close()
    print(f"最优K值分析图表已保存至: {k_plot_path}")

    optimal_k = best_model.n_clusters
    print(f"选择最优聚类数 K = {optimal_k}")

    # 2. Reuse the model already fitted for the winning K. Refitting with the
    #    same data, seed and n_init would only repeat the same computation.
    print(f"使用 K={optimal_k} 进行最终聚类...")
    kmeans_final = best_model
    cluster_labels = kmeans_final.labels_

    return cluster_labels, optimal_k, kmeans_final

# --- 聚类结果分析与可视化 ---

def analyze_and_visualize_clusters(df, X_scaled_df, cluster_labels, feature_columns, optimal_k, kmeans=None):
    """Summarize the clusters and save a PCA scatter plot and radar charts.

    Side effects: adds a ``cluster`` column to ``df`` in place, prints the
    summary, and writes ``{REPORT_PREFIX}_用户群体摘要.csv``,
    ``{REPORT_PREFIX}_用户群体PCA可视化.png`` and
    ``{REPORT_PREFIX}_用户群体雷达图.png``.

    Args:
        df: Per-user feature table; must contain the columns aggregated in
            the summary below.
        X_scaled_df: Standardized feature DataFrame, row-aligned with ``df``.
        cluster_labels: Cluster label of every row.
        feature_columns: Names of the columns of ``X_scaled_df``, used as
            radar axis labels.
        optimal_k: Number of clusters; determines the radar grid size.
        kmeans: Optional fitted KMeans model. When given, its
            ``cluster_centers_`` are drawn as centroids; otherwise the
            per-cluster means of ``X_scaled_df`` are used.

    Returns:
        pandas.DataFrame indexed by cluster with the size, share and mean
        key metrics of each cluster.
    """
    print("\n--- 正在分析和可视化聚类结果 ---")

    df['cluster'] = cluster_labels

    # 1. Basic statistics per cluster.
    cluster_summary = df.groupby('cluster').agg(
        user_count=('user_id', 'count'),
        mean_total_spent=('total_spent', 'mean'),
        mean_total_purchases=('total_purchases', 'mean'),
        mean_activity_freq=('activity_freq', 'mean'),
        mean_recency_days=('recency_days', 'mean'),
        mean_tenure_days=('tenure_days', 'mean')
    ).round(2)
    cluster_summary['pct_of_users'] = (cluster_summary['user_count'] / len(df) * 100).round(2)

    print("各用户群体基本统计摘要:")
    print(cluster_summary.to_string())

    summary_csv_path = f'{REPORT_PREFIX}_用户群体摘要.csv'
    cluster_summary.to_csv(summary_csv_path, encoding='utf-8-sig')
    print(f"用户群体摘要已保存至: {summary_csv_path}")

    # Mean standardized value of every feature per cluster. Used for the
    # radar charts and, when no model is passed, as the cluster centroids.
    cluster_profiles = X_scaled_df.copy()
    cluster_profiles['cluster'] = cluster_labels
    cluster_averages = cluster_profiles.groupby('cluster').mean()

    # 2. Scatter plot of the users projected onto two principal components.
    print("生成PCA降维可视化图...")
    pca = PCA(n_components=2, random_state=RANDOM_SEED)
    X_pca = pca.fit_transform(X_scaled_df)

    plt.figure(figsize=(10, 8))
    scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=cluster_labels, cmap='viridis', alpha=0.6)
    plt.xlabel(f'PC1 ({pca.explained_variance_ratio_[0]:.1%} variance)')
    plt.ylabel(f'PC2 ({pca.explained_variance_ratio_[1]:.1%} variance)')
    plt.title('用户群体聚类 (PCA降维可视化)')
    plt.colorbar(scatter, label='Cluster')
    # Centroids in PCA space. The fitted model lives in the caller, not in
    # this scope, so it is only used when passed in explicitly.
    if kmeans is not None:
        centers = pd.DataFrame(kmeans.cluster_centers_, columns=X_scaled_df.columns)
    else:
        centers = cluster_averages
    centers_pca = pca.transform(centers)
    plt.scatter(centers_pca[:, 0], centers_pca[:, 1], c='red', marker='x', s=200, linewidths=3, label='Centroids')
    plt.legend()
    pca_plot_path = f'{REPORT_PREFIX}_用户群体PCA可视化.png'
    plt.savefig(pca_plot_path)
    plt.close()
    print(f"PCA可视化图表已保存至: {pca_plot_path}")

    # 3. One radar chart per cluster.
    print("生成各用户群体特征雷达图...")
    num_vars = len(feature_columns)
    angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()
    angles += angles[:1]  # repeat the first angle to close the polygon

    # Two rows of charts; squeeze=False keeps `axes` two-dimensional for
    # every K so no special case is needed for a single column.
    n_cols = (optimal_k + 1) // 2
    fig, axes = plt.subplots(2, n_cols, figsize=(5 * n_cols, 10), subplot_kw=dict(polar=True), squeeze=False)
    flat_axes = axes.ravel()  # row-major order: fill the first row, then the second
    fig.suptitle('用户群体特征雷达图', fontsize=16)

    for ax, (cluster_id, profile) in zip(flat_axes, cluster_averages.iterrows()):
        values = profile.tolist()
        values += values[:1]
        ax.plot(angles, values, linewidth=2, label=f'Cluster {cluster_id}')
        ax.fill(angles, values, alpha=0.25)
        ax.set_xticks(angles[:-1])
        ax.set_xticklabels(feature_columns, fontsize=8)
        ax.set_ylim([-3, 3])  # fixed radial range shared by all charts
        ax.set_title(f'群体 {cluster_id}', fontsize=12)
        ax.yaxis.grid(True)

    # Remove grid cells that did not receive a cluster (e.g. odd K).
    used = min(len(flat_axes), len(cluster_averages))
    for ax in flat_axes[used:]:
        fig.delaxes(ax)

    plt.tight_layout(rect=[0, 0.03, 1, 0.95])
    radar_plot_path = f'{REPORT_PREFIX}_用户群体雷达图.png'
    plt.savefig(radar_plot_path)
    plt.close()
    print(f"用户群体雷达图已保存至: {radar_plot_path}")

    return cluster_summary

# --- 报告生成 ---

def generate_user_segmentation_report(cluster_summary, optimal_k, k_plot_path, pca_plot_path, radar_plot_path,
                                      n_users=None, n_products=None):
    """Write the plain-text user segmentation report.

    The report is written as UTF-8 to
    ``{REPORT_PREFIX}_<YYYYmmdd_HHMMSS>.txt`` in the working directory.

    Args:
        cluster_summary: Per-cluster summary table; its ``to_string()``
            output is embedded in the report.
        optimal_k: Number of clusters that was selected.
        k_plot_path: Path of the K-selection chart, listed in the report.
        pca_plot_path: Path of the PCA scatter plot, listed in the report.
        radar_plot_path: Path of the radar charts, listed in the report.
        n_users: Number of users to state in the data overview; defaults to
            NUM_USERS.
        n_products: Number of products to state in the data overview;
            defaults to NUM_PRODUCTS.

    Returns:
        The filename of the report that was written.
    """
    print("\n--- 正在生成用户行为分析与细分报告 ---")

    # Resolve the defaults at call time so the report follows the module
    # configuration instead of repeating its values as literals.
    n_users = NUM_USERS if n_users is None else n_users
    n_products = NUM_PRODUCTS if n_products is None else n_products

    # Read the clock once so the filename and the header always agree.
    generated_at = datetime.now()
    report_filename = f"{REPORT_PREFIX}_{generated_at.strftime('%Y%m%d_%H%M%S')}.txt"

    rule = "=" * 50 + "\n"
    lines = [
        rule,
        "        电商平台用户行为分析与细分报告\n",
        f"        生成时间: {generated_at.strftime('%Y-%m-%d %H:%M:%S')}\n",
        rule + "\n",

        "--- 1. 项目概述 ---\n",
        "本项目旨在通过对电商平台用户行为数据的深入分析，将用户划分为不同的群体。\n",
        "目标是理解不同用户群体的行为特征，为精细化运营、个性化推荐和精准营销提供数据支持。\n\n",

        "--- 2. 数据概览 ---\n",
        "数据来源: 模拟生成的电商平台用户行为数据。\n",
        f"数据规模: {n_users} 名用户，{n_products} 种商品。\n",
        "关键行为: 浏览 (view), 加购 (cart), 购买 (purchase)。\n",
        "关键字段: 用户ID, 商品ID, 行为类型, 时间戳, 金额等。\n",
        "原始特征数据已保存为 CSV 文件。\n\n",

        "--- 3. 用户行为特征工程 ---\n",
        "从原始行为日志中提取了能反映用户价值和行为模式的关键指标：\n",
        "- 总行为次数 (total_actions)\n",
        "- 总购买次数 (total_purchases)\n",
        "- 总消费金额 (total_spent)\n",
        "- 平均行为金额 (avg_action_amount)\n",
        "- 用户生命周期 (tenure_days)\n",
        "- 购买频率 (purchase_freq)\n",
        "- 活动频率 (activity_freq)\n",
        "- 平均订单价值 (avg_order_value)\n",
        "- 加购转化率 (cart_to_purchase_rate)\n",
        "- 最近活跃度 (recency_days)\n\n",

        "--- 4. 用户群体细分 ---\n",
        f"采用 K-Means 聚类算法，通过肘部法则和轮廓系数分析，确定最优聚类数 K = {optimal_k}。\n",
        "分析过程图表已生成，包括：\n",
        f"- 最优K值选择分析图: {k_plot_path}\n",
        f"- PCA降维可视化图: {pca_plot_path}\n",
        f"- 用户群体特征雷达图: {radar_plot_path}\n\n",

        "--- 5. 用户群体画像 ---\n",
        "根据聚类结果，用户被划分为以下群体，各群体特征摘要如下:\n",
        cluster_summary.to_string(),
        "\n\n基于以上数据，可以对每个群体进行命名和描述：\n",
        "(命名需结合雷达图等可视化信息进行人工解读)\n",
        "例如：\n",
        # The two recency descriptions below are stated via recency_days: a
        # small value means recently active, a large value means inactive.
        "- 高价值客户: 高消费金额、高购买频率、近期仍活跃 (recency_days 小)，可能是忠诚老客户。\n",
        "- 潜力客户: 中等活动频率、中等消费、高近期活跃度。\n",
        "- 新兴客户: 短生命周期、低活动频率、但高近期活跃度。\n",
        "- 流失风险客户: 低活动频率、低购买频率、长时间未活跃 (recency_days 大)。\n",
        "- 低价值客户: 各项指标均较低。\n\n",

        "--- 6. 运营策略与建议 ---\n",
        "1. 精准营销:\n",
        "   - 对'高价值客户'提供VIP服务、专属优惠，提高忠诚度。\n",
        "   - 对'潜力客户'推送个性化商品推荐，刺激消费。\n",
        "   - 对'新兴客户'提供新人礼包，引导完成首单。\n",
        "   - 对'流失风险客户'发送召回优惠券或进行用户关怀。\n",
        "2. 个性化推荐: 根据不同群体的偏好调整推荐算法。\n",
        "3. 产品优化: 分析各群体热门商品，优化商品结构。\n",
        "4. 用户体验: 针对不同群体优化App或网站的用户界面和功能。\n",
        "5. 模型迭代: 定期更新用户行为数据和群体划分，以适应市场变化。\n\n",

        rule,
        "                    报告结束\n",
        rule,
    ]

    with open(report_filename, 'w', encoding='utf-8') as f:
        f.writelines(lines)

    print(f"用户行为分析与细分报告已生成: {report_filename}")
    return report_filename

# --- 主函数 ---

def main():
    """Run the pipeline: generate data, preprocess, cluster, visualize, report."""
    # Step 1: simulated per-user feature table.
    features = generate_sample_user_behavior_data(NUM_USERS, NUM_PRODUCTS)

    # Step 2: standardized clustering matrix (the fitted scaler is not needed here).
    scaled, _scaler, columns = preprocess_data(features)

    # Step 3: choose K and assign every user to a cluster.
    labels, best_k, _model = perform_user_segmentation(scaled)

    # Step 4: per-cluster summary plus the PCA and radar charts.
    summary = analyze_and_visualize_clusters(features, scaled, labels, columns, best_k)

    # Step 5: text report referencing the three chart files written above.
    chart_paths = [
        f'{REPORT_PREFIX}_{chart_name}.png'
        for chart_name in ('最优K值分析', '用户群体PCA可视化', '用户群体雷达图')
    ]
    generate_user_segmentation_report(summary, best_k, *chart_paths)

    print("\n用户行为分析与细分流程完成。")


if __name__ == "__main__":
    main()
viplao

487