【Python实践案例】电商平台数据分析和挖掘 – 分析用户评论情感，并生成用户反馈报告

发表于： 2025年9月13日 2025年9月13日
标签：电商运营数据分析实践

访问量： 2931

开发思路

配置 (# --- 配置 ---):
- REVIEWS_FILE_PATH: 指定包含用户评论的文本文件路径。每行应为一条独立评论。
- STOPWORDS_FILE_PATH: (可选) 指定停用词文件路径。停用词是无实际意义或过于常见的词（如“的”、“是”、“在”），在分析中通常会被过滤掉。如果文件不存在，则不进行过滤。
- REPORT_PREFIX: 生成的报告和图表文件名的前缀。
辅助函数:
- load_stopwords: 从文件加载停用词到一个集合中。
- preprocess_text: 对单条评论进行分词 (jieba.lcut) 并移除停用词。
分析函数:
- analyze_sentiment:
  - 遍历所有评论，使用 SnowNLP 计算每条评论的情感得分 (0到1之间，越接近1越积极)。
  - 根据得分将评论分为积极 (>0.6)、中性 (0.4-0.6)、消极 (<0.4) 三类。
  - 打印各类别数量，并使用 matplotlib 绘制情感分布的饼图。
- extract_keywords:
  - 对所有评论进行预处理（分词、去停用词）。
  - 使用 collections.Counter 统计所有词语的出现频率。
  - 提取出现频率最高的N个词作为关键词。
  - 使用 pandas 打印关键词列表，并用 matplotlib 绘制柱状图展示。
- identify_issues_and_needs:
  - 潜在问题: 筛选出消极评论 (sentiments < 0.4)，对这些评论提取关键词。为了更聚焦，会尝试过滤掉一些常见的、不具体的负面词（如“差”、“不好”），以找出更具体的“物流慢”、“质量差”等问题。
  - 用户需求: 筛选出积极评论 (sentiments > 0.6)，同样提取关键词，并过滤常见正面词，以发现用户喜欢的“物流快”、“质量好”、“款式新”等特性。
  - 使用 pandas 打印识别出的问题和需求列表。
报告生成 (generate_report):
- 将所有分析结果（情感统计、关键词、潜在问题、用户需求）以及对应的图表路径汇总到一个 .txt 文件中。
主函数 (main):
- 负责协调整个流程：加载评论数据、加载停用词、调用分析函数、生成报告。
- 如果找不到评论文件，会自动创建一个包含示例评论的文件用于演示。

import jieba
from snownlp import SnowNLP
import matplotlib.pyplot as plt
import pandas as pd
from collections import Counter
import os

# --- Configuration ---
# Path to the review data file; main() reads it as UTF-8, one review per line,
# and creates a sample file at this path when it does not exist.
REVIEWS_FILE_PATH = 'reviews.txt'
# Path to the stopword file (optional). When the file is missing,
# load_stopwords() returns an empty set and no words are filtered.
STOPWORDS_FILE_PATH = 'stopwords.txt'
# Filename prefix shared by the generated chart images and the text report.
REPORT_PREFIX = '用户反馈报告'

# --- 辅助函数 ---

def load_stopwords(filepath):
    """Load a stopword list from a text file containing one word per line.

    Blank and whitespace-only lines are skipped, so the empty string never
    ends up in the result.

    Args:
        filepath: Path to a UTF-8 encoded stopword file (with or without BOM).

    Returns:
        A set of stopwords. The set is empty when the file does not exist,
        in which case callers effectively run without stopword filtering.
    """
    stopwords = set()
    if os.path.exists(filepath):
        # 'utf-8-sig' transparently drops a leading byte-order mark; with plain
        # 'utf-8' the BOM would stick to the first word and it would never
        # match any token.
        with open(filepath, 'r', encoding='utf-8-sig') as f:
            for line in f:
                word = line.strip()
                if word:
                    stopwords.add(word)
        print(f"已加载停用词文件: {filepath}")
    else:
        print(f"未找到停用词文件: {filepath}，将不使用停用词过滤。")
    return stopwords

def preprocess_text(text, stopwords):
    """Tokenize a single review and drop blank tokens and stopwords.

    Args:
        text: The raw review text.
        stopwords: Collection of words to exclude from the result.

    Returns:
        A list of the remaining tokens, in their original order.
    """
    kept_tokens = []
    # jieba.lcut is called with its default arguments and returns a list.
    for token in jieba.lcut(text):
        if not token.strip():
            # Whitespace-only tokens carry no meaning.
            continue
        if token in stopwords:
            continue
        kept_tokens.append(token)
    return kept_tokens

# --- 分析函数 ---

def analyze_sentiment(reviews):
    """Score every review with SnowNLP and chart the sentiment distribution.

    Each review gets the score from SnowNLP's `sentiments` attribute, which
    is bucketed as positive (> 0.6), neutral (0.4 to 0.6 inclusive) or
    negative (< 0.4). The bucket sizes are printed and saved as a pie chart.

    Args:
        reviews: List of review strings.

    Returns:
        A tuple ``(sentiments, chart_path)`` where ``sentiments`` is the list
        of scores in review order and ``chart_path`` is the saved PNG path.
        ``chart_path`` is an empty string when there are no reviews, because
        a pie chart cannot be drawn from all-zero counts.
    """
    print("--- 开始情感分析 ---")
    sentiments = [SnowNLP(review).sentiments for review in reviews]

    # Bucket the scores into the three sentiment classes.
    positive_count = sum(1 for s in sentiments if s > 0.6)
    neutral_count = sum(1 for s in sentiments if 0.4 <= s <= 0.6)
    negative_count = sum(1 for s in sentiments if s < 0.4)

    sentiment_labels = ['积极', '中性', '消极']
    sentiment_counts = [positive_count, neutral_count, negative_count]

    print(f"积极评论: {positive_count}")
    print(f"中性评论: {neutral_count}")
    print(f"消极评论: {negative_count}")

    if not sentiments:
        # All counts are zero: normalising the wedges would divide by zero.
        print("没有评论可用于绘制情感分布图。")
        return sentiments, ""

    # The labels and title are Chinese, which matplotlib's default font cannot
    # render. Put common CJK-capable fonts ahead of the configured ones, and
    # scope the override with rc_context so global settings stay untouched.
    cjk_fonts = [
        'SimHei', 'Microsoft YaHei', 'PingFang SC', 'Heiti SC',
        'Arial Unicode MS', 'Noto Sans CJK SC', 'WenQuanYi Micro Hei',
    ]
    font_overrides = {
        'font.sans-serif': cjk_fonts + list(plt.rcParams['font.sans-serif']),
        'axes.unicode_minus': False,
    }

    sentiment_chart_path = f'{REPORT_PREFIX}_情感分布.png'
    with plt.rc_context(font_overrides):
        plt.figure(figsize=(8, 8))
        colors = ['green', 'gold', 'red']
        plt.pie(sentiment_counts, labels=sentiment_labels, autopct='%1.1f%%', startangle=140, colors=colors)
        plt.title('用户评论情感分布')
        plt.savefig(sentiment_chart_path)
        plt.close()
    print(f"情感分布图表已保存至: {sentiment_chart_path}")

    return sentiments, sentiment_chart_path

def extract_keywords(reviews, stopwords, top_n=20):
    """Find the most frequent words across all reviews and chart them.

    Every review is tokenized and filtered with preprocess_text, word
    frequencies are counted, and the ``top_n`` most common words are printed
    and drawn as a bar chart.

    Args:
        reviews: List of review strings.
        stopwords: Collection of words to exclude.
        top_n: Maximum number of keywords to keep.

    Returns:
        A tuple ``(df_keywords, chart_path)``. ``df_keywords`` is always a
        DataFrame with the columns '关键词' and '频率'; when no keyword is
        found it is empty and ``chart_path`` is an empty string.
    """
    print("\n--- 开始提取关键词 ---")
    columns = ['关键词', '频率']

    # Count word frequencies over all reviews.
    word_freq = Counter(
        word
        for review in reviews
        for word in preprocess_text(review, stopwords)
    )
    most_common_words = word_freq.most_common(top_n)

    if not most_common_words:
        print("未提取到关键词。")
        # Return an empty DataFrame rather than a bare list so callers can
        # rely on DataFrame attributes such as `.empty` on every path.
        return pd.DataFrame(columns=columns), ""

    df_keywords = pd.DataFrame(most_common_words, columns=columns)
    print("高频关键词:")
    print(df_keywords.to_string(index=False))

    # The tick labels and title are Chinese, which matplotlib's default font
    # cannot render. Put common CJK-capable fonts ahead of the configured
    # ones, scoped with rc_context so global settings stay untouched.
    cjk_fonts = [
        'SimHei', 'Microsoft YaHei', 'PingFang SC', 'Heiti SC',
        'Arial Unicode MS', 'Noto Sans CJK SC', 'WenQuanYi Micro Hei',
    ]
    font_overrides = {
        'font.sans-serif': cjk_fonts + list(plt.rcParams['font.sans-serif']),
        'axes.unicode_minus': False,
    }

    keywords_chart_path = f'{REPORT_PREFIX}_高频关键词.png'
    with plt.rc_context(font_overrides):
        # Bar chart of keyword frequencies.
        plt.figure(figsize=(12, 8))
        words, counts = zip(*most_common_words)  # split the (word, count) pairs
        positions = range(len(words))
        bars = plt.bar(positions, counts, color='skyblue')
        plt.xlabel('关键词')
        plt.ylabel('出现频率')
        plt.title(f'TOP {top_n} 高频关键词')
        plt.xticks(positions, words, rotation=45, ha='right')

        # Write each bar's count just above it.
        for bar in bars:
            yval = bar.get_height()
            plt.text(bar.get_x() + bar.get_width() / 2.0, yval, str(int(yval)), ha='center', va='bottom')

        plt.tight_layout()
        plt.savefig(keywords_chart_path)
        plt.close()
    print(f"高频关键词图表已保存至: {keywords_chart_path}")

    return df_keywords, keywords_chart_path

def identify_issues_and_needs(reviews, sentiments, stopwords):
    """Derive issue and preference keywords from negative and positive reviews.

    Reviews scoring below 0.4 are mined for potential issues and reviews
    scoring above 0.6 for user needs/preferences. In both groups the 20 most
    frequent words are taken first, then generic sentiment words and
    single-character words are removed, so fewer than 20 rows may remain.

    Args:
        reviews: List of review strings.
        sentiments: Sentiment scores, index-aligned with ``reviews``.
        stopwords: Collection of words to exclude during tokenization.

    Returns:
        A tuple ``(df_issues, df_needs)`` of DataFrames, each holding a
        keyword column and a '频率' column.
    """

    def keyword_table(selected_reviews, generic_words, keyword_column):
        # Tokenize the selected reviews, count words, and keep only the
        # specific multi-character ones among the 20 most common.
        tokens = []
        for text in selected_reviews:
            tokens.extend(preprocess_text(text, stopwords))
        rows = [
            (token, count)
            for token, count in Counter(tokens).most_common(20)
            if token not in generic_words and len(token) > 1
        ]
        return pd.DataFrame(rows, columns=[keyword_column, '频率'])

    print("\n--- 识别潜在问题与需求 ---")

    # --- Potential issues: keywords from the negative reviews ---
    negative_reviews = [
        reviews[idx] for idx, score in enumerate(sentiments) if score < 0.4
    ]
    print(f"分析 {len(negative_reviews)} 条消极评论以识别潜在问题...")

    # Example set of generic negative words that say nothing specific.
    generic_negative = {'差', '不好', '慢', '贵', '问题', '失望', '垃圾'}
    df_issues = keyword_table(negative_reviews, generic_negative, '潜在问题关键词')
    print("潜在问题关键词 (基于消极评论):")
    print(df_issues.to_string(index=False))

    # --- User needs/preferences: keywords from the positive reviews ---
    positive_reviews = [
        reviews[idx] for idx, score in enumerate(sentiments) if score > 0.6
    ]
    print(f"\n分析 {len(positive_reviews)} 条积极评论以识别用户需求/喜好...")

    # Example set of generic positive words that say nothing specific.
    generic_positive = {'好', '棒', '喜欢', '不错', '推荐', '满意', '快'}
    df_needs = keyword_table(positive_reviews, generic_positive, '用户需求/喜好关键词')
    print("用户需求/喜好关键词 (基于积极评论):")
    print(df_needs.to_string(index=False))

    return df_issues, df_needs

def generate_report(sentiments, sentiment_chart, df_keywords, keywords_chart, df_issues, df_needs):
    """Write the final plain-text feedback report to a timestamped file.

    The report has four sections: overall sentiment statistics, top keywords,
    potential issues and user needs. It is written as UTF-8 to
    ``{REPORT_PREFIX}_{YYYYmmdd_HHMMSS}.txt``.

    Args:
        sentiments: List of sentiment scores; may be empty.
        sentiment_chart: Path of the sentiment pie chart image.
        df_keywords: DataFrame of top keywords.
        keywords_chart: Path of the keyword bar chart image.
        df_issues: DataFrame of issue keywords.
        df_needs: DataFrame of need/preference keywords.

    Any table argument that is None, empty, or not a DataFrame (no ``.empty``
    attribute) is reported as having no data instead of raising.
    """
    from datetime import datetime

    def has_rows(table):
        # Objects without `.empty` (e.g. a plain list) count as "no data".
        return table is not None and not getattr(table, 'empty', True)

    # Take a single timestamp so the filename and the header always agree.
    now = datetime.now()
    report_filename = f"{REPORT_PREFIX}_{now.strftime('%Y%m%d_%H%M%S')}.txt"

    # Compute the statistics before opening the file so a failure here cannot
    # leave a half-written report behind.
    positive_count = sum(1 for s in sentiments if s > 0.6)
    neutral_count = sum(1 for s in sentiments if 0.4 <= s <= 0.6)
    negative_count = sum(1 for s in sentiments if s < 0.4)
    total_count = len(sentiments)

    def percent(count):
        # Avoid ZeroDivisionError when there are no scores at all.
        return count / total_count * 100 if total_count else 0.0

    separator = "=" * 40

    with open(report_filename, 'w', encoding='utf-8') as f:
        f.write(separator + "\n")
        f.write("        电商平台用户反馈分析报告\n")
        f.write(f"        生成时间: {now.strftime('%Y-%m-%d %H:%M:%S')}\n")
        f.write(separator + "\n\n")

        f.write("--- 1. 整体情感分析 ---\n")
        f.write(f"总评论数: {total_count}\n")
        f.write(f"积极评论: {positive_count} ({percent(positive_count):.1f}%)\n")
        f.write(f"中性评论: {neutral_count} ({percent(neutral_count):.1f}%)\n")
        f.write(f"消极评论: {negative_count} ({percent(negative_count):.1f}%)\n")
        f.write(f"图表: {sentiment_chart}\n\n")

        f.write("--- 2. 关键词分析 ---\n")
        if has_rows(df_keywords):
            f.write(df_keywords.to_string(index=False))
            f.write(f"\n图表: {keywords_chart}\n\n")
        else:
            f.write("无关键词数据。\n\n")

        f.write("--- 3. 潜在问题识别 (基于消极评论) ---\n")
        if has_rows(df_issues):
            f.write(df_issues.to_string(index=False))
            f.write("\n建议：关注上述高频负面关键词，深入调查具体原因。\n\n")
        else:
            f.write("未识别出显著的潜在问题关键词。\n\n")

        f.write("--- 4. 用户需求与喜好 (基于积极评论) ---\n")
        if has_rows(df_needs):
            f.write(df_needs.to_string(index=False))
            f.write("\n建议：考虑加强或扩展与上述关键词相关的功能或产品。\n\n")
        else:
            f.write("未识别出显著的用户需求关键词。\n\n")

        f.write(separator + "\n")
        f.write("              报告结束\n")
        f.write(separator + "\n")

    print(f"\n分析报告已生成: {report_filename}")

# --- 主函数 ---

def main():
    """Run the whole pipeline: load reviews, analyse them, write the report.

    When the review file is missing, a sample file with ten demo reviews is
    created first. The function returns early, after printing a message, if
    the file cannot be read or contains no non-blank lines.
    """
    # 1. Load the data, creating a demo file first when none exists.
    if not os.path.exists(REVIEWS_FILE_PATH):
        print(f"错误: 未找到评论文件 {REVIEWS_FILE_PATH}")
        sample_reviews = [
            "这个产品质量很好，我很喜欢，会推荐给朋友。",
            "物流速度非常快，包装也很仔细，五星好评！",
            "价格有点贵，但是东西还不错。",
            "客服态度很好，解决问题很及时。",
            "商品和描述不符，有点失望。",
            "物流太慢了，等了好久才收到。",
            "质量很差，用了一次就坏了，非常不满意。",
            "款式很新颖，穿着很舒服，赞一个！",
            "发货速度一般，希望可以更快一点。",
            "性价比很高，物超所值，下次还会再买。"
        ]
        with open(REVIEWS_FILE_PATH, 'w', encoding='utf-8') as sample_file:
            sample_file.writelines(f"{text}\n" for text in sample_reviews)
        print(f"已创建示例评论文件: {REVIEWS_FILE_PATH}")

    reviews = []
    try:
        with open(REVIEWS_FILE_PATH, 'r', encoding='utf-8') as review_file:
            stripped_lines = (raw_line.strip() for raw_line in review_file)
            # Blank lines are not reviews.
            reviews.extend(text for text in stripped_lines if text)
        print(f"成功加载 {len(reviews)} 条评论。")
    except Exception as e:
        print(f"读取评论文件时出错: {e}")
        return

    if len(reviews) == 0:
        print("评论文件为空或读取失败。")
        return

    # 2. Load the stopwords.
    stopwords = load_stopwords(STOPWORDS_FILE_PATH)

    # 3. Run the three analyses.
    sentiments, sentiment_chart = analyze_sentiment(reviews)
    df_keywords, keywords_chart = extract_keywords(reviews, stopwords)
    df_issues, df_needs = identify_issues_and_needs(reviews, sentiments, stopwords)

    # 4. Write the report.
    generate_report(
        sentiments,
        sentiment_chart,
        df_keywords,
        keywords_chart,
        df_issues,
        df_needs,
    )

    print("\n分析完成。")

# Run the analysis pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

viplao

489