【Python实践案例】电商平台数据分析和挖掘 -全流程数据清洗与聚合销售报表输出
在当今数据驱动的商业环境中,数据分析是企业决策的基石。但如何高效地从海量数据中提取有价值的洞察?
客户需求:已在MySQL数据库中有如下表的数据,进行销售分析,生成分析报告!
核心实现步骤
1. 数据库连接与读取
使用SQLAlchemy建立高效连接,避免直接操作SQL语句的复杂性:
def get_sqlalchemy_engine():
return create_engine('mysql+pymysql://root:root@localhost/qycw?charset=utf8mb4')
def load_financial_data_from_db():
engine = get_sqlalchemy_engine()
query = """
SELECT
report_date, revenue, operating_cost, sales_expense,
admin_expense, financial_expense, current_assets,
current_liabilities, total_assets, total_liabilities, net_profit
FROM financial_data
ORDER BY report_date
"""
df = pd.read_sql(query, engine, parse_dates=['report_date'])
return df
2. 数据清洗与异常检测
财务数据容不得半点误差,我们实现三重清洗机制:
def clean_data(df):
# 缺失值检查
print("缺失值检查:", df.isnull().sum())
# 异常值检测(IQR方法)
def detect_outliers(series):
Q1 = series.quantile(0.25)
Q3 = series.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
return (series < lower_bound) | (series > upper_bound)
# 时间特征工程
df['季度'] = df['日期'].dt.quarter
df['月份'] = df['日期'].dt.month
return df
3. 指标计算引擎
自动计算12项核心财务指标,覆盖盈利能力、偿债能力和运营效率:
def calculate_financial_ratios(df):
# 盈利能力指标
df['毛利率(%)'] = ((df['营业收入'] - df['营业成本']) / df['营业收入'] * 100).round(2)
df['净利率(%)'] = (df['净利润'] / df['营业收入'] * 100).round(2)
# 偿债能力指标
df['流动比率'] = (df['流动资产'] / df['流动负债']).round(2)
df['资产负债率(%)'] = (df['总负债'] / df['总资产'] * 100).round(2)
# 运营效率指标
df['总资产周转率'] = (df['营业收入'] / df['总资产']).round(2)
return df
4. 六维可视化分析
6个专业图表,全方位展示企业状况:
def visualize_data(df):
plt.figure(figsize=(18, 12))
# 1. 收入与利润趋势
plt.subplot(2, 3, 1)
plt.plot(df['日期'], df['营业收入'], marker='o', label='营业收入')
plt.plot(df['日期'], df['净利润'], marker='s', label='净利润')
plt.title('收入与利润趋势分析')
# 2. 成本结构饼图
plt.subplot(2, 3, 2)
cost_data = df[['营业成本', '销售费用', '管理费用', '财务费用']].sum()
cost_data.plot(kind='pie', autopct='%1.1f%%')
# 3. 盈利能力指标
plt.subplot(2, 3, 3)
plt.plot(df['日期'], df['毛利率(%)'], label='毛利率')
plt.plot(df['日期'], df['净利率(%)'], label='净利率')
# 4. 偿债能力指标
plt.subplot(2, 3, 4)
plt.plot(df['日期'], df['流动比率'], label='流动比率')
plt.plot(df['日期'], df['资产负债率(%)'], label='资产负债率')
# 5. 季度对比分析
plt.subplot(2, 3, 5)
quarterly = df.groupby('季度').agg({'营业收入': 'sum', '净利润': 'sum'})
quarterly.plot(kind='bar')
# 6. 资产负债结构
plt.subplot(2, 3, 6)
asset_liability = df[['总资产', '总负债']].iloc[-1]
asset_liability.plot(kind='bar')
plt.tight_layout()
plt.savefig('财务分析报告.png', dpi=300)
5. 智能报告生成
自动生成包含财务摘要、健康评估、趋势分析和风险提示的专业报告:
def generate_report(df):
print("="*50)
print("企业财务分析报告".center(50))
print("="*50)
# 财务数据摘要
print(f"年度总营业收入: {df['营业收入'].sum():.2f}万元")
print(f"年度总净利润: {df['净利润'].sum():.2f}万元")
print(f"平均毛利率: {df['毛利率(%)'].mean():.2f}%")
# 财务健康评估
avg_current_ratio = df['流动比率'].mean()
avg_debt_ratio = df['资产负债率(%)'].mean()
print(f"平均流动比率: {avg_current_ratio:.2f}", end=" - ")
if avg_current_ratio > 2:
print("偿债能力强")
elif avg_current_ratio > 1:
print("偿债能力正常")
else:
print("偿债能力较弱")
# 风险提示
if df['净利润'].min() < 0:
print("⚠️ 存在月度亏损情况,需关注成本控制")
if avg_current_ratio < 1.5:
print("⚠️ 流动比率偏低,短期偿债压力较大")
完整代码参考:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import pymysql
from sqlalchemy import create_engine
# 设置中文显示
plt.rcParams['font.sans-serif'] = ['SimHei'] # 用来正常显示中文标签
plt.rcParams['axes.unicode_minus'] = False# 用来正常显示负号
# 数据库连接配置
def get_conn():
return pymysql.connect(
host='',
user='',
password='',
database='',
charset='utf8',
cursorclass=pymysql.cursors.DictCursor
)
# 使用SQLAlchemy创建引擎(推荐方式)
def get_sqlalchemy_engine():
return create_engine('mysql+pymysql://?charset=utf8')
# 创建财务数据表(如果不存在)
def create_financial_table():
conn = get_conn()
try:
with conn.cursor() as cursor:
# 创建财务数据表
create_table_sql = """
CREATE TABLE IF NOT EXISTS financial_data (
id INT AUTO_INCREMENT PRIMARY KEY,
report_date DATE NOT NULL,
revenue DECIMAL(15,2) COMMENT '营业收入(万元)',
operating_cost DECIMAL(15,2) COMMENT '营业成本(万元)',
sales_expense DECIMAL(15,2) COMMENT '销售费用(万元)',
admin_expense DECIMAL(15,2) COMMENT '管理费用(万元)',
financial_expense DECIMAL(15,2) COMMENT '财务费用(万元)',
current_assets DECIMAL(15,2) COMMENT '流动资产(万元)',
current_liabilities DECIMAL(15,2) COMMENT '流动负债(万元)',
total_assets DECIMAL(15,2) COMMENT '总资产(万元)',
total_liabilities DECIMAL(15,2) COMMENT '总负债(万元)',
net_profit DECIMAL(15,2) COMMENT '净利润(万元)',
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COMMENT='企业财务数据表';
"""
cursor.execute(create_table_sql)
conn.commit()
print("财务数据表创建成功或已存在")
except Exception as e:
print(f"创建表时出错: {e}")
finally:
conn.close()
# 从数据库读取财务数据(使用SQLAlchemy)
def load_financial_data_from_db():
engine = get_sqlalchemy_engine()
try:
# 使用英文字段名避免解析问题
query = """
SELECT
report_date,
revenue,
operating_cost,
sales_expense,
admin_expense,
financial_expense,
current_assets,
current_liabilities,
total_assets,
total_liabilities,
net_profit
FROM financial_data
ORDER BY report_date
"""
# 使用SQLAlchemy引擎和parse_dates参数
df = pd.read_sql(query, engine, parse_dates=['report_date'])
# 重命名列为中文
df.columns = [
'日期', '营业收入(万元)', '营业成本(万元)', '销售费用(万元)',
'管理费用(万元)', '财务费用(万元)', '流动资产(万元)',
'流动负债(万元)', '总资产(万元)', '总负债(万元)', '净利润(万元)'
]
return df
except Exception as e:
print(f"从数据库读取数据时出错: {e}")
return pd.DataFrame()
# 数据清洗与预处理
def clean_data(df):
# 检查缺失值
print("缺失值检查:")
print(df.isnull().sum())
def detect_outliers(series):
Q1 = series.quantile(0.25)
Q3 = series.quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
return (series < lower_bound) | (series > upper_bound)
print("\n异常值检查:")
for col in df.select_dtypes(include=[np.number]).columns:
outliers = detect_outliers(df[col])
if outliers.any():
print(f"{col}: {outliers.sum()}个异常值")
# 添加时间特征
df['季度'] = df['日期'].dt.quarter
df['月份'] = df['日期'].dt.month
return df
# 财务指标计算
def calculate_financial_ratios(df):
# 盈利能力指标
df['毛利率(%)'] = ((df['营业收入(万元)'] - df['营业成本(万元)']) / df['营业收入(万元)'] * 100).round(2)
df['净利率(%)'] = (df['净利润(万元)'] / df['营业收入(万元)'] * 100).round(2)
df['营业利润率(%)'] = ((df['净利润(万元)'] + df['财务费用(万元)']) / df['营业收入(万元)'] * 100).round(2)
# 偿债能力指标
df['流动比率'] = (df['流动资产(万元)'] / df['流动负债(万元)']).round(2)
df['资产负债率(%)'] = (df['总负债(万元)'] / df['总资产(万元)'] * 100).round(2)
# 运营效率指标
df['总资产周转率'] = (df['营业收入(万元)'] / df['总资产(万元)']).round(2)
return df
# 数据可视化分析
def visualize_data(df):
# 创建图形布局
plt.figure(figsize=(18, 12))
# 1. 收入与利润趋势
plt.subplot(2, 3, 1)
plt.plot(df['日期'], df['营业收入(万元)'], marker='o', label='营业收入')
plt.plot(df['日期'], df['净利润(万元)'], marker='s', label='净利润')
plt.title('收入与利润趋势分析')
plt.xlabel('日期')
plt.ylabel('金额(万元)')
plt.legend()
plt.grid(True)
# 2. 成本结构分析
plt.subplot(2, 3, 2)
cost_data = df[['营业成本(万元)', '销售费用(万元)', '管理费用(万元)', '财务费用(万元)']].sum()
cost_data.plot(kind='pie', autopct='%1.1f%%', startangle=90)
plt.title('成本结构分析')
plt.ylabel('')
# 3. 盈利能力指标
plt.subplot(2, 3, 3)
plt.plot(df['日期'], df['毛利率(%)'], marker='o', label='毛利率')
plt.plot(df['日期'], df['净利率(%)'], marker='s', label='净利率')
plt.title('盈利能力指标')
plt.xlabel('日期')
plt.ylabel('百分比(%)')
plt.legend()
plt.grid(True)
# 4. 偿债能力指标
plt.subplot(2, 3, 4)
plt.plot(df['日期'], df['流动比率'], marker='o', label='流动比率')
plt.plot(df['日期'], df['资产负债率(%)'], marker='s', label='资产负债率')
plt.title('偿债能力指标')
plt.xlabel('日期')
plt.ylabel('比率')
plt.legend()
plt.grid(True)
# 5. 季度对比分析
plt.subplot(2, 3, 5)
quarterly = df.groupby('季度').agg({
'营业收入(万元)': 'sum',
'净利润(万元)': 'sum'
})
quarterly.plot(kind='bar')
plt.title('季度收入与利润对比')
plt.xlabel('季度')
plt.ylabel('金额(万元)')
plt.xticks(rotation=0)
# 6. 资产负债结构
plt.subplot(2, 3, 6)
asset_liability = df[['总资产(万元)', '总负债(万元)']].iloc[-1]
asset_liability.plot(kind='bar', color=['#1f77b4', '#ff7f0e'])
plt.title('期末资产负债结构')
plt.ylabel('金额(万元)')
plt.xticks(rotation=0)
plt.tight_layout()
plt.savefig('财务分析报告.png', dpi=300)
plt.show()
# 生成分析报告
def generate_report(df):
print("\n" + "="*50)
print("企业财务分析报告".center(50))
print("="*50)
# 基本财务数据摘要
print("\n【财务数据摘要】")
print(f"年度总营业收入: {df['营业收入(万元)'].sum():.2f}万元")
print(f"年度总净利润: {df['净利润(万元)'].sum():.2f}万元")
print(f"平均毛利率: {df['毛利率(%)'].mean():.2f}%")
print(f"平均净利率: {df['净利率(%)'].mean():.2f}%")
# 财务健康评估
print("\n【财务健康评估】")
avg_current_ratio = df['流动比率'].mean()
avg_debt_ratio = df['资产负债率(%)'].mean()
print(f"平均流动比率: {avg_current_ratio:.2f}", end=" - ")
if avg_current_ratio > 2:
print("偿债能力强")
elif avg_current_ratio > 1:
print("偿债能力正常")
else:
print("偿债能力较弱")
print(f"平均资产负债率: {avg_debt_ratio:.2f}%", end=" - ")
if avg_debt_ratio < 40:
print("负债水平低")
elif avg_debt_ratio < 60:
print("负债水平适中")
else:
print("负债水平较高")
# 趋势分析
print("\n【趋势分析】")
revenue_growth = (df['营业收入(万元)'].iloc[-1] - df['营业收入(万元)'].iloc[0]) / df['营业收入(万元)'].iloc[0] * 100
profit_growth = (df['净利润(万元)'].iloc[-1] - df['净利润(万元)'].iloc[0]) / df['净利润(万元)'].iloc[0] * 100
print(f"收入增长率: {revenue_growth:.2f}%")
print(f"利润增长率: {profit_growth:.2f}%")
# 风险提示
print("\n【风险提示】")
if df['净利润(万元)'].min() < 0:
print("⚠️ 存在月度亏损情况,需关注成本控制")
if avg_current_ratio < 1.5:
print("⚠️ 流动比率偏低,短期偿债压力较大")
if avg_debt_ratio > 70:
print("⚠️ 资产负债率过高,财务风险较大")
print("\n" + "="*50)
print("报告生成时间:", datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
print("="*50)
# 主程序
def main():
print("正在创建财务数据表(如果不存在)...")
create_financial_table()
print("正在从数据库读取财务数据...")
df = load_financial_data_from_db()
if df.empty:
print("未能从数据库加载数据,程序终止")
return
print("正在清洗数据...")
df = clean_data(df)
print("正在计算财务指标...")
df = calculate_financial_ratios(df)
print("正在生成可视化图表...")
visualize_data(df)
print("正在生成分析报告...")
generate_report(df)
# 保存处理后的数据
df.to_excel('财务数据分析结果.xlsx', index=False)
print("\n分析完成 结果已保存至'财务数据分析结果.xlsx'和'财务分析报告.png'")
if __name__ == "__main__":
main()
CREATE TABLE `financial_data` ( `id` int NOT NULL AUTO_INCREMENT, `report_date` date NOT NULL, `revenue` decimal(15,2) DEFAULT NULL COMMENT '营业收入(万元)', `operating_cost` decimal(15,2) DEFAULT NULL COMMENT '营业成本(万元)', `sales_expense` decimal(15,2) DEFAULT NULL COMMENT '销售费用(万元)', `admin_expense` decimal(15,2) DEFAULT NULL COMMENT '管理费用(万元)', `financial_expense` decimal(15,2) DEFAULT NULL COMMENT '财务费用(万元)', `current_assets` decimal(15,2) DEFAULT NULL COMMENT '流动资产(万元)', `current_liabilities` decimal(15,2) DEFAULT NULL COMMENT '流动负债(万元)', `total_assets` decimal(15,2) DEFAULT NULL COMMENT '总资产(万元)', `total_liabilities` decimal(15,2) DEFAULT NULL COMMENT '总负债(万元)', `net_profit` decimal(15,2) DEFAULT NULL COMMENT '净利润(万元)', `created_at` timestamp NULL DEFAULT CURRENT_TIMESTAMP, PRIMARY KEY (`id`)) ENGINE=InnoDB AUTO_INCREMENT=13 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci COMMENT='企业财务数据表';
INSERT INTO `qycw`.`financial_data` (`id`, `report_date`, `revenue`, `operating_cost`, `sales_expense`, `admin_expense`, `financial_expense`, `current_assets`, `current_liabilities`, `total_assets`, `total_liabilities`, `net_profit`, `created_at`) VALUES (1, '2022-01-31', 524.84, 295.85, 56.48, 37.62, 9.30, 776.59, 478.96, 1653.49, 853.05, 125.59, '2025-07-30 13:32:21');INSERT INTO `qycw`.`financial_data` (`id`, `report_date`, `revenue`, `operating_cost`, `sales_expense`, `admin_expense`, `financial_expense`, `current_assets`, `current_liabilities`, `total_assets`, `total_liabilities`, `net_profit`, `created_at`) VALUES (2, '2022-02-28', 527.13, 286.10, 45.34, 31.21, 4.26, 627.51, 371.89, 1297.43, 931.42, 160.22, '2025-07-30 13:32:21');INSERT INTO `qycw`.`financial_data` (`id`, `report_date`, `revenue`, `operating_cost`, `sales_expense`, `admin_expense`, `financial_expense`, `current_assets`, `current_liabilities`, `total_assets`, `total_liabilities`, `net_profit`, `created_at`) VALUES (3, '2022-03-31', 454.60, 257.63, 64.66, 28.87, 10.20, 657.53, 372.78, 1522.18, 784.90, 93.24, '2025-07-30 13:32:21');INSERT INTO `qycw`.`financial_data` (`id`, `report_date`, `revenue`, `operating_cost`, `sales_expense`, `admin_expense`, `financial_expense`, `current_assets`, `current_liabilities`, `total_assets`, `total_liabilities`, `net_profit`, `created_at`) VALUES (4, '2022-04-30', 518.78, 281.98, 47.08, 26.99, 15.56, 798.65, 347.11, 1664.51, 777.92, 147.17, '2025-07-30 13:32:21');INSERT INTO `qycw`.`financial_data` (`id`, `report_date`, `revenue`, `operating_cost`, `sales_expense`, `admin_expense`, `financial_expense`, `current_assets`, `current_liabilities`, `total_assets`, `total_liabilities`, `net_profit`, `created_at`) VALUES (5, '2022-05-31', 510.44, 241.21, 36.72, 30.98, 12.22, 817.14, 394.22, 1439.78, 752.15, 189.31, '2025-07-30 13:32:21');INSERT INTO `qycw`.`financial_data` (`id`, `report_date`, `revenue`, `operating_cost`, `sales_expense`, `admin_expense`, `financial_expense`, `current_assets`, `current_liabilities`, `total_assets`, `total_liabilities`, `net_profit`, `created_at`) VALUES (6, '2022-06-30', 464.01, 286.18, 60.57, 31.72, 4.71, 832.41, 380.75, 1364.62, 961.17, 80.83, '2025-07-30 13:32:21');INSERT INTO `qycw`.`financial_data` (`id`, `report_date`, `revenue`, `operating_cost`, `sales_expense`, `admin_expense`, `financial_expense`, `current_assets`, `current_liabilities`, `total_assets`, `total_liabilities`, `net_profit`, `created_at`) VALUES (7, '2022-07-31', 551.55, 327.94, 41.61, 28.45, 10.99, 897.55, 376.04, 1462.87, 789.37, 142.56, '2025-07-30 13:32:21');INSERT INTO `qycw`.`financial_data` (`id`, `report_date`, `revenue`, `operating_cost`, `sales_expense`, `admin_expense`, `financial_expense`, `current_assets`, `current_liabilities`, `total_assets`, `total_liabilities`, `net_profit`, `created_at`) VALUES (8, '2022-08-31', 440.19, 324.38, 63.56, 29.64, 13.01, 836.16, 367.74, 1572.28, 1053.80, 9.60, '2025-07-30 13:32:21');INSERT INTO `qycw`.`financial_data` (`id`, `report_date`, `revenue`, `operating_cost`, `sales_expense`, `admin_expense`, `financial_expense`, `current_assets`, `current_liabilities`, `total_assets`, `total_liabilities`, `net_profit`, `created_at`) VALUES (9, '2022-09-30', 498.21, 346.94, 23.80, 34.11, 10.26, 770.10, 404.59, 1102.49, 878.03, 83.10, '2025-07-30 13:32:21');INSERT INTO `qycw`.`financial_data` (`id`, `report_date`, `revenue`, `operating_cost`, `sales_expense`, `admin_expense`, `financial_expense`, `current_assets`, `current_liabilities`, `total_assets`, `total_liabilities`, `net_profit`, `created_at`) VALUES (10, '2022-10-31', 517.86, 344.34, 44.82, 25.96, 8.49, 891.54, 416.44, 1394.05, 951.33, 94.25, '2025-07-30 13:32:21');INSERT INTO `qycw`.`financial_data` (`id`, `report_date`, `revenue`, `operating_cost`, `sales_expense`, `admin_expense`, `financial_expense`, `current_assets`, `current_liabilities`, `total_assets`, `total_liabilities`, `net_profit`, `created_at`) VALUES (11, '2022-11-30', 504.85, 329.06, 42.98, 28.36, 8.82, 653.65, 414.81, 1552.21, 900.51, 95.63, '2025-07-30 13:32:21');INSERT INTO `qycw`.`financial_data` (`id`, `report_date`, `revenue`, `operating_cost`, `sales_expense`, `admin_expense`, `financial_expense`, `current_assets`, `current_liabilities`, `total_assets`, `total_liabilities`, `net_profit`, `created_at`) VALUES (12, '2022-12-31', 488.27, 257.54, 45.79, 28.29, 7.59, 783.87, 420.20, 1877.24, 917.46, 149.06, '2025-07-30 13:32:21');