MovieLens/analyzer.py
2025-05-05 07:09:52 +08:00

1436 lines
67 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
MovieLens用户-电影偏好分析系统
=============================
基于原始数据集与矩阵分解填补后的完整评分矩阵进行用户画像与电影偏好分析
分析目标:
1. 用户基本情况(年龄、性别、职业、地域分布)
2. 参与评分的电影分布情况
3. 电影评分分布情况
4. 用户特征与电影偏好关联分析
"""
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
import json
import warnings
from scipy import stats
import time
import matplotlib as mpl
import platform
import tempfile
import urllib.request
import re
import sys
# Silence all warnings emitted by the libraries used below.
warnings.filterwarnings('ignore')
# Seed NumPy's global random generator so runs are repeatable.
np.random.seed(42)
# Custom colour palette shared by the charts in this module.
custom_colors = ['#FF9A76', '#67B7D1', '#A8D5BA', '#D8A47F', '#957DAD', '#7B506F', '#9AACB8']
# Module-level switch: whether chart text is rendered in Chinese.
USE_CHINESE = False # Chinese is off by default
def download_noto_font():
    """Return the path of a cached Noto Sans SC font, downloading it if needed.

    The font is stored as ``NotoSansSC-Regular.otf`` in a ``movielens_fonts``
    directory under the system temporary directory.  The download is written
    to a ``.part`` file and only moved into place once it has completed, so
    an interrupted transfer can never be mistaken for a usable font on a
    later run.

    Returns:
        The font file path, or ``None`` when the font could not be obtained.
    """
    temp_dir = os.path.join(tempfile.gettempdir(), 'movielens_fonts')
    os.makedirs(temp_dir, exist_ok=True)
    font_path = os.path.join(temp_dir, 'NotoSansSC-Regular.otf')
    # Reuse an earlier download; an empty file is a failed download, not a font.
    if os.path.isfile(font_path) and os.path.getsize(font_path) > 0:
        print(f"使用已下载的Noto Sans字体: {font_path}")
        return font_path
    # NOTE(review): this URL looks truncated (it is not a fetchable address);
    # restore the real font location before relying on the download.
    font_url = "/"
    partial_path = font_path + '.part'
    try:
        print(f"正在下载中文字体: {font_url}")
        urllib.request.urlretrieve(font_url, partial_path)
        if os.path.getsize(partial_path) == 0:
            raise OSError("downloaded font file is empty")
        # Atomic promotion: font_path only ever holds a complete file.
        os.replace(partial_path, font_path)
        print(f"字体下载成功: {font_path}")
        return font_path
    except Exception as e:
        # Best effort by design: callers fall back to English labels.
        print(f"字体下载失败: {e}")
        try:
            os.remove(partial_path)
        except OSError:
            pass  # nothing was written, or it is already gone
        return None
def sanitize_filename(filename):
    """Return *filename* with every character in ``<>:"/\\|?*`` replaced by ``_``."""
    forbidden = '<>:"/\\|?*'
    replacements = {ord(char): '_' for char in forbidden}
    return filename.translate(replacements)
def setup_chinese_display():
    """Try to make matplotlib render Chinese text and record the outcome.

    Obtains the font via ``download_noto_font()``, registers it with
    matplotlib, puts it first in the sans-serif fallback list and saves a
    small test image to the system temporary directory.  The module-level
    flag ``USE_CHINESE`` is set to ``True`` only when all of that succeeds.
    On failure the font-related rcParams are restored to the values they had
    on entry and the test figure is closed.

    Returns:
        bool: ``True`` when the Chinese font was configured, else ``False``.
    """
    global USE_CHINESE
    # English is the fallback until the font is proven to work.
    USE_CHINESE = False
    font_path = download_noto_font()
    if not font_path or not os.path.exists(font_path):
        print("未能获取中文字体,将使用英文")
        return False

    touched_keys = ('font.family', 'font.sans-serif', 'axes.unicode_minus')
    previous = {}
    for key in touched_keys:
        value = plt.rcParams[key]
        # Copy list values so later in-place edits cannot alter the snapshot.
        previous[key] = list(value) if isinstance(value, list) else value

    try:
        from matplotlib import font_manager
        # Register the font file with matplotlib's font manager.
        font_manager.fontManager.addfont(font_path)
        family = 'Noto Sans SC'
        fallbacks = [name for name in plt.rcParams['font.sans-serif'] if name != family]
        plt.rcParams['font.family'] = ['sans-serif']
        plt.rcParams['font.sans-serif'] = [family] + fallbacks
        plt.rcParams['axes.unicode_minus'] = False
        print("成功配置中文字体")
        # Render a tiny figure containing Chinese text as a smoke test.
        figure = plt.figure(figsize=(2, 1))
        try:
            plt.text(0.5, 0.5, '测试中文', fontsize=12, ha='center')
            plt.axis('off')
            test_path = os.path.join(tempfile.gettempdir(), 'chinese_test.png')
            plt.savefig(test_path)
        finally:
            # Close the figure even when rendering fails, so it is not leaked.
            plt.close(figure)
        print(f"中文测试图保存到: {test_path}")
    except Exception as e:
        print(f"中文字体配置失败: {e}")
        # Undo the partial configuration so English rendering is unaffected.
        plt.rcParams.update(previous)
        USE_CHINESE = False
        return False
    USE_CHINESE = True
    return True
# Apply the seaborn / matplotlib themes first: sns.set_style() rewrites the
# 'font.family' and 'font.sans-serif' rcParams, so calling it after the font
# setup would discard the Chinese font that was just configured.
sns.set_style("whitegrid")
plt.style.use('seaborn-v0_8-pastel')
# Configure Chinese text rendering last so the theme cannot overwrite it.
setup_chinese_display()
class MovieLensDataAnalyzer:
"""MovieLens数据集分析工具"""
def __init__(self, data_path='./dataset', filled_ratings_path='./result', output_path='./analysis_results/'):
    """Store the I/O locations, build the label tables and create the output directory.

    Args:
        data_path: Directory holding ``users.dat``, ``movies.dat`` and ``ratings.dat``.
        filled_ratings_path: Directory holding ``filled_ratings_matrix.csv``.
        output_path: Directory that receives the generated charts; created if missing.
    """
    self.data_path = data_path
    self.filled_ratings_path = filled_ratings_path
    self.output_path = output_path

    def text(zh, en):
        # Labels are fixed at construction time from the module-level language flag.
        return zh if USE_CHINESE else en

    # Age code -> display label.
    self.age_mapping = {
        1: text("18岁以下", "Under 18"),
        18: "18-24",
        25: "25-34",
        35: "35-44",
        45: "45-49",
        50: "50-55",
        56: text("56岁以上", "56+"),
    }
    # Occupation labels listed in code order (0-20) as documented in the
    # MovieLens 1M README.  Code 8 is "farmer"; leaving it out shifts every
    # later label by one and leaves code 20 (writer) without a label.
    occupation_labels = [
        ("其他", "Other"),
        ("学术/教育工作者", "Academic/Educator"),
        ("艺术家", "Artist"),
        ("文员/管理人员", "Clerical/Admin"),
        ("大学生/研究生", "College/Grad Student"),
        ("客户服务人员", "Customer Service"),
        ("医疗/保健人员", "Doctor/Health Care"),
        ("行政/事务人员", "Executive/Managerial"),
        ("农民", "Farmer"),
        ("家庭主妇", "Homemaker"),
        ("K-12学生", "K-12 Student"),
        ("律师", "Lawyer"),
        ("程序员", "Programmer"),
        ("退休人员", "Retired"),
        ("销售/营销人员", "Sales/Marketing"),
        ("科学家", "Scientist"),
        ("个体户", "Self-employed"),
        ("技术人员/工程师", "Technician/Engineer"),
        ("手工艺人", "Tradesman/Craftsman"),
        ("失业人员", "Unemployed"),
        ("作家", "Writer"),
    ]
    self.occupation_mapping = {
        code: text(zh, en) for code, (zh, en) in enumerate(occupation_labels)
    }
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_path, exist_ok=True)
    # Data sets, populated by load_data().
    self.users_df = None
    self.movies_df = None
    self.ratings_df = None
    self.filled_ratings_matrix = None
    self.filled_ratings_df = None
def load_data(self):
    """Load the raw ``.dat`` files and the filled rating matrix, then derive helper columns.

    Reads ``users.dat``, ``movies.dat`` and ``ratings.dat`` from
    ``self.data_path`` (``::``-separated, latin1) and
    ``filled_ratings_matrix.csv`` from ``self.filled_ratings_path``.  The
    matrix is converted to a long table ``filled_ratings_df`` with columns
    ``userId``, ``movieId``, ``rating`` and ``isOriginal``, keeping only
    cells greater than zero.  When the matrix file is missing, the raw
    ratings are used instead with ``isOriginal`` set to ``True``.

    Derived columns: ``movies_df['genres']`` (list), ``movies_df['year']``
    (object column of ``int`` or ``None``), ``users_df['age_group']``,
    ``users_df['occupation_name']`` and ``users_df['region']``.

    Returns:
        self, to allow call chaining.
    """
    print("\n加载数据...")

    def read_dat(file_name, columns):
        # All three raw files share the same '::'-separated layout.
        return pd.read_csv(
            os.path.join(self.data_path, file_name),
            sep='::',
            engine='python',
            names=columns,
            encoding='latin1',
        )

    self.users_df = read_dat('users.dat', ['userId', 'gender', 'age', 'occupation', 'zipcode'])
    self.movies_df = read_dat('movies.dat', ['movieId', 'title', 'genres'])
    self.ratings_df = read_dat('ratings.dat', ['userId', 'movieId', 'rating', 'timestamp'])

    filled_ratings_file = os.path.join(self.filled_ratings_path, 'filled_ratings_matrix.csv')
    try:
        self.filled_ratings_matrix = pd.read_csv(filled_ratings_file, index_col=0)
    except FileNotFoundError:
        print(f"警告: 填补后的评分矩阵文件 {filled_ratings_file} 不存在! 仅使用原始数据进行分析。")
        self.filled_ratings_df = self.ratings_df.copy()
        self.filled_ratings_df['isOriginal'] = True
    else:
        matrix = self.filled_ratings_matrix
        values = matrix.to_numpy()
        # Positions of valid (> 0) cells in row-major order, i.e. the order a
        # user-by-user, movie-by-movie scan would visit them.  NaN compares False.
        row_pos, col_pos = np.nonzero(values > 0)
        user_ids = matrix.index.astype(int).to_numpy()
        movie_ids = matrix.columns.astype(int).to_numpy()
        filled = pd.DataFrame({
            'userId': user_ids[row_pos],
            'movieId': movie_ids[col_pos],
            'rating': values[row_pos, col_pos],
        })
        # A cell is "original" when its (user, movie) pair occurs in the raw ratings.
        known_pairs = self.ratings_df[['userId', 'movieId']].drop_duplicates()
        flagged = filled[['userId', 'movieId']].merge(
            known_pairs, on=['userId', 'movieId'], how='left', indicator=True
        )
        filled['isOriginal'] = (flagged['_merge'] == 'both').to_numpy()
        self.filled_ratings_df = filled
        print(
            f"加载了 {len(self.filled_ratings_df)} 条填补后的评分记录,其中 {int(self.filled_ratings_df['isOriginal'].sum())} 条为原始评分")

    # Split the pipe-separated genre string into a list per movie.
    self.movies_df['genres'] = self.movies_df['genres'].apply(lambda x: x.split('|'))

    def extract_year(title):
        # A release year is recognised only as a trailing "(yyyy)".
        if len(title) >= 6 and title[-1] == ')' and title[-6] == '(':
            year_str = title[-5:-1]
            if year_str.isdigit():
                return int(year_str)
        return None

    years = [extract_year(title) for title in self.movies_df['title']]
    # dtype=object keeps the years as Python ints next to None instead of
    # letting pandas convert the column to float.
    self.movies_df['year'] = pd.Series(years, index=self.movies_df.index, dtype=object)
    year_extracted = sum(year is not None for year in years)
    print(f"成功从 {year_extracted}/{len(self.movies_df)} 部电影标题中提取到年份")

    # Human-readable labels for the coded user attributes.
    self.users_df['age_group'] = self.users_df['age'].map(self.age_mapping)
    self.users_df['occupation_name'] = self.users_df['occupation'].map(self.occupation_mapping)
    # The first three characters of the zip code serve as the region identifier.
    self.users_df['region'] = self.users_df['zipcode'].astype(str).apply(lambda x: x[:3])
    print(f"加载完成: {len(self.users_df)} 位用户, {len(self.movies_df)} 部电影, {len(self.ratings_df)} 条原始评分")
    return self
def analyze_users(self):
    """Plot the user demographics and rating-activity charts.

    Writes ten PNG files to ``<output_path>/user_analysis``: gender, age,
    occupation and region distributions, the gender-by-age breakdown, the
    ratings-per-user histogram and activity box plots by gender, age group
    and occupation.  Requires ``load_data()`` to have been called.

    Returns:
        self, to allow call chaining.
    """
    print("\n分析用户基本情况...")
    user_analysis_dir = os.path.join(self.output_path, 'user_analysis')
    os.makedirs(user_analysis_dir, exist_ok=True)

    def text(zh, en):
        # Pick the label for the active display language.
        return zh if USE_CHINESE else en

    def save_chart(file_name):
        # Finish the current figure: lay out, write to disk, release it.
        plt.tight_layout()
        plt.savefig(os.path.join(user_analysis_dir, file_name), bbox_inches='tight', dpi=100)
        plt.close()

    gender_labels = {'M': text('男性', 'Male'), 'F': text('女性', 'Female')}
    # Age groups in their natural order (youngest first).  Sorting the labels
    # alphabetically would put "Under 18" after "56+".
    present_ages = set(self.users_df['age_group'].dropna())
    age_order = [label for label in self.age_mapping.values() if label in present_ages]

    # 1. Gender distribution: one pie, drawn once, with localised labels.
    gender_counts = self.users_df['gender'].value_counts()
    plt.figure(figsize=(10, 6))
    plt.pie(gender_counts.values,
            labels=[gender_labels.get(code, code) for code in gender_counts.index],
            autopct='%1.1f%%', startangle=90,
            colors=[custom_colors[0], custom_colors[1]])
    plt.title(text('用户性别分布', 'User Gender Distribution'))
    save_chart('gender_distribution.png')

    # 2. Age distribution.
    age_counts = self.users_df['age_group'].value_counts().reindex(age_order)
    plt.figure(figsize=(12, 6))
    sns.barplot(x=age_counts.index, y=age_counts.values, palette=custom_colors)
    plt.title(text('用户年龄分布', 'User Age Distribution'))
    plt.xlabel(text('年龄段', 'Age Group'))
    plt.ylabel(text('用户数量', 'Number of Users'))
    plt.xticks(rotation=45)
    save_chart('age_distribution.png')

    # 3. Occupation distribution.
    occupation_counts = self.users_df['occupation_name'].value_counts().sort_values(ascending=False)
    plt.figure(figsize=(14, 8))
    sns.barplot(x=occupation_counts.values, y=occupation_counts.index, palette=custom_colors)
    plt.title(text('用户职业分布', 'User Occupation Distribution'))
    plt.xlabel(text('用户数量', 'Number of Users'))
    plt.ylabel(text('职业', 'Occupation'))
    save_chart('occupation_distribution.png')

    # 4. Regional distribution (20 most common zip-code prefixes).
    region_counts = self.users_df['region'].value_counts().head(20)
    plt.figure(figsize=(14, 8))
    sns.barplot(x=region_counts.values, y=region_counts.index, palette=custom_colors)
    plt.title(text('用户地域分布 (前20个邮编区域)', 'User Regional Distribution (Top 20 ZIP Codes)'))
    plt.xlabel(text('用户数量', 'Number of Users'))
    plt.ylabel(text('邮编区域', 'ZIP Code Region'))
    save_chart('region_distribution.png')

    # 5. Gender by age group.
    gender_age_counts = (
        self.users_df.groupby(['age_group', 'gender']).size().unstack().reindex(age_order)
    )
    if USE_CHINESE:
        gender_age_counts = gender_age_counts.rename(columns=gender_labels)
    plt.figure(figsize=(14, 8))
    # Draw on the figure just created; DataFrame.plot would otherwise open a
    # second figure and leave this one empty and never closed.
    gender_age_counts.plot(kind='bar', stacked=True, color=custom_colors, ax=plt.gca())
    plt.title(text('用户性别和年龄组合分布', 'Gender and Age Distribution'))
    plt.xlabel(text('年龄段', 'Age Group'))
    plt.ylabel(text('用户数量', 'Number of Users'))
    plt.legend(title=text('性别', 'Gender'))
    save_chart('gender_age_distribution.png')

    # 6. Ratings per user.
    user_rating_counts = self.ratings_df['userId'].value_counts()
    plt.figure(figsize=(12, 6))
    sns.histplot(user_rating_counts, bins=30, kde=True, color=custom_colors[0])
    plt.title(text('用户评分活跃度分布', 'User Rating Activity Distribution'))
    plt.xlabel(text('每位用户的评分数量', 'Number of Ratings per User'))
    plt.ylabel(text('用户数', 'Number of Users'))
    save_chart('rating_activity_distribution.png')

    # 7. Rating activity per user group.  Name the index explicitly: the name
    # value_counts() gives it differs between pandas versions.
    user_activity = (
        user_rating_counts.rename_axis('userId')
        .reset_index(name='rating_count')
        .merge(self.users_df, on='userId')
    )

    # 7.1 By gender.
    plt.figure(figsize=(10, 6))
    ax = sns.boxplot(x='gender', y='rating_count', data=user_activity,
                     palette=[custom_colors[0], custom_colors[1]])
    if USE_CHINESE:
        ax.set_xticklabels([gender_labels[tick.get_text()] for tick in ax.get_xticklabels()])
    plt.title(text('不同性别的评分活跃度分布', 'Rating Activity by Gender'))
    plt.xlabel(text('性别', 'Gender'))
    plt.ylabel(text('评分数量', 'Number of Ratings'))
    save_chart('gender_activity.png')

    # 7.2 By age group.
    plt.figure(figsize=(14, 6))
    sns.boxplot(x='age_group', y='rating_count', data=user_activity,
                order=age_order, palette=custom_colors)
    plt.title(text('不同年龄段的评分活跃度分布', 'Rating Activity by Age Group'))
    plt.xlabel(text('年龄段', 'Age Group'))
    plt.ylabel(text('评分数量', 'Number of Ratings'))
    plt.xticks(rotation=45)
    save_chart('age_activity.png')

    # 7.3 By occupation.
    plt.figure(figsize=(16, 10))
    sns.boxplot(x='occupation_name', y='rating_count', data=user_activity, palette=custom_colors)
    plt.title(text('不同职业的评分活跃度分布', 'Rating Activity by Occupation'))
    plt.xlabel(text('职业', 'Occupation'))
    plt.ylabel(text('评分数量', 'Number of Ratings'))
    plt.xticks(rotation=90)
    save_chart('occupation_activity.png')

    print("用户基本情况分析完成,图表已保存到 " + user_analysis_dir)
    return self
def analyze_movies(self):
    """Plot the movie catalogue and popularity charts.

    Writes six PNG files to ``<output_path>/movie_analysis``: release-year
    and genre distributions, the ratings-per-movie histogram, the 20 most
    rated movies, and the average number of ratings by genre and by release
    year.  Requires ``load_data()`` to have been called.

    Returns:
        self, to allow call chaining.
    """
    print("\n分析电影分布情况...")
    movie_analysis_dir = os.path.join(self.output_path, 'movie_analysis')
    os.makedirs(movie_analysis_dir, exist_ok=True)

    def text(zh, en):
        # Pick the label for the active display language.
        return zh if USE_CHINESE else en

    def save_chart(file_name):
        # Finish the current figure: lay out, write to disk, release it.
        plt.tight_layout()
        plt.savefig(os.path.join(movie_analysis_dir, file_name), bbox_inches='tight', dpi=100)
        plt.close()

    # 1. Release-year distribution (movies without a year are left out).
    valid_years = self.movies_df[self.movies_df['year'].notnull()]
    year_counts = valid_years['year'].value_counts().sort_index()
    plt.figure(figsize=(16, 6))
    sns.barplot(x=year_counts.index, y=year_counts.values, color=custom_colors[0])
    plt.title(text('电影发行年份分布', 'Movie Release Year Distribution'))
    plt.xlabel(text('发行年份', 'Release Year'))
    plt.ylabel(text('电影数量', 'Number of Movies'))
    plt.xticks(rotation=90)
    save_chart('year_distribution.png')

    # 2. Genre distribution: a movie counts once for each of its genres.
    genre_series = self.movies_df['genres'].explode().value_counts()
    plt.figure(figsize=(14, 8))
    sns.barplot(x=genre_series.values, y=genre_series.index, palette=custom_colors)
    plt.title(text('电影类型分布', 'Movie Genre Distribution'))
    plt.xlabel(text('电影数量', 'Number of Movies'))
    plt.ylabel(text('类型', 'Genre'))
    save_chart('genre_distribution.png')

    # 3. Ratings per movie.
    movie_rating_counts = self.ratings_df['movieId'].value_counts()
    plt.figure(figsize=(12, 6))
    sns.histplot(movie_rating_counts, bins=30, kde=True, color=custom_colors[1])
    plt.title(text('电影评分数量分布', 'Movie Rating Count Distribution'))
    plt.xlabel(text('每部电影的评分数量', 'Number of Ratings per Movie'))
    plt.ylabel(text('电影数', 'Number of Movies'))
    save_chart('movie_rating_counts.png')

    # 4. The 20 movies with the most ratings.
    top_movies_df = (
        movie_rating_counts.head(20)
        .rename_axis('movieId')
        .reset_index(name='rating_count')
        .merge(self.movies_df[['movieId', 'title']], on='movieId')
    )
    plt.figure(figsize=(16, 10))
    sns.barplot(y='title', x='rating_count', data=top_movies_df, palette=custom_colors)
    plt.title(text('评分数量最多的20部电影', 'Top 20 Most Rated Movies'))
    plt.xlabel(text('评分数量', 'Number of Ratings'))
    plt.ylabel(text('电影标题', 'Movie Title'))
    save_chart('most_rated_movies.png')

    # Per-movie rating count, looked up from the counts computed above rather
    # than re-scanning the ratings table once per movie.  Movies without any
    # rating count as 0.
    per_movie = self.movies_df[['movieId', 'genres', 'year']].copy()
    per_movie['rating_count'] = per_movie['movieId'].map(movie_rating_counts).fillna(0)

    # 5. Average number of ratings per genre.
    genre_avg_counts = (
        per_movie.explode('genres')
        .dropna(subset=['genres'])
        .groupby('genres')['rating_count']
        .mean()
        .sort_values(ascending=False)
    )
    plt.figure(figsize=(14, 8))
    sns.barplot(x=genre_avg_counts.values, y=genre_avg_counts.index, palette=custom_colors)
    plt.title(text('各类型电影的平均评分数量', 'Average Number of Ratings by Genre'))
    plt.xlabel(text('平均评分数量', 'Average Number of Ratings'))
    plt.ylabel(text('电影类型', 'Movie Genre'))
    save_chart('genre_avg_rating_counts.png')

    # 6. Average number of ratings per release year.
    dated = per_movie[per_movie['year'].notnull()]
    year_avg_counts = (
        dated.groupby(dated['year'].astype(int))['rating_count'].mean().sort_index()
    )
    plt.figure(figsize=(16, 6))
    sns.lineplot(x=year_avg_counts.index, y=year_avg_counts.values, marker='o', color=custom_colors[2])
    plt.title(text('不同发行年份电影的平均评分数量', 'Average Number of Ratings by Release Year'))
    plt.xlabel(text('发行年份', 'Release Year'))
    plt.ylabel(text('平均评分数量', 'Average Number of Ratings'))
    plt.grid(True, linestyle='--', alpha=0.7)
    save_chart('year_avg_rating_counts.png')

    print("电影分布情况分析完成,图表已保存到 " + movie_analysis_dir)
    return self
def analyze_ratings(self):
    """Plot the rating-value, trend and quality charts.

    Writes PNG files to ``<output_path>/rating_analysis``: the rating value
    distribution, original versus filled ratings (when ``isOriginal`` is
    available), the yearly rating trend, average rating by genre, the 20
    best-rated movies with at least 100 ratings, and average rating by
    release year.  Requires ``load_data()`` to have been called.

    Side effect: adds a ``year`` column (the year each rating was given,
    derived from ``timestamp``) to ``self.ratings_df``.

    Returns:
        self, to allow call chaining.
    """
    print("\n分析评分分布情况...")
    rating_analysis_dir = os.path.join(self.output_path, 'rating_analysis')
    os.makedirs(rating_analysis_dir, exist_ok=True)

    def text(zh, en):
        # Pick the label for the active display language.
        return zh if USE_CHINESE else en

    def save_chart(file_name):
        # Finish the current figure: lay out, write to disk, release it.
        plt.tight_layout()
        plt.savefig(os.path.join(rating_analysis_dir, file_name), bbox_inches='tight', dpi=100)
        plt.close()

    def dual_axis_chart(x_label, title, figsize, file_name, mean_series, count_series):
        # Average rating on the left axis, rating volume on the right axis.
        plt.figure(figsize=figsize)
        ax1 = plt.gca()
        ax1.set_xlabel(x_label)
        ax1.set_ylabel(text('平均评分', 'Average Rating'), color=custom_colors[0])
        ax1.plot(mean_series.index, mean_series.values, marker='o', color=custom_colors[0],
                 label=text('平均评分', 'Average Rating'))
        ax1.tick_params(axis='y', labelcolor=custom_colors[0])
        ax1.grid(True, linestyle='--', alpha=0.7)
        ax2 = ax1.twinx()
        ax2.set_ylabel(text('评分数量', 'Number of Ratings'), color=custom_colors[1])
        ax2.plot(count_series.index, count_series.values, marker='s', color=custom_colors[1],
                 label=text('评分数量', 'Number of Ratings'))
        ax2.tick_params(axis='y', labelcolor=custom_colors[1])
        plt.title(title)
        # One legend covering the lines of both y axes.
        lines1, labels1 = ax1.get_legend_handles_labels()
        lines2, labels2 = ax2.get_legend_handles_labels()
        ax1.legend(lines1 + lines2, labels1 + labels2, loc='upper left')
        save_chart(file_name)

    # 1. Rating value distribution.
    plt.figure(figsize=(12, 6))
    rating_counts = self.ratings_df['rating'].value_counts().sort_index()
    sns.barplot(x=rating_counts.index, y=rating_counts.values, palette=custom_colors)
    plt.title(text('评分值分布', 'Rating Value Distribution'))
    plt.xlabel(text('评分', 'Rating'))
    plt.ylabel(text('数量', 'Count'))
    save_chart('rating_distribution.png')

    # 2. Original versus filled ratings.
    if 'isOriginal' in self.filled_ratings_df.columns:
        filled_label = text('填补评分', 'Filled Ratings')
        original_label = text('原始评分', 'Original Ratings')
        legend_title = text('是否原始评分', 'Rating Type')
        # Encode the flag as a labelled categorical and let seaborn build the
        # legend from it, so each label is tied to its own data instead of
        # being attached positionally afterwards.
        rating_type = pd.Categorical.from_codes(
            self.filled_ratings_df['isOriginal'].astype('int8').to_numpy(),
            categories=[filled_label, original_label],
        )
        plot_df = pd.DataFrame({
            'rating': self.filled_ratings_df['rating'].to_numpy(),
            legend_title: rating_type,
        })
        plt.figure(figsize=(12, 6))
        sns.histplot(
            data=plot_df,
            x='rating',
            hue=legend_title,
            hue_order=[filled_label, original_label],
            multiple='dodge',
            bins=10,
            palette=[custom_colors[0], custom_colors[1]]
        )
        plt.title(text('原始评分与填补评分的分布对比', 'Original vs. Filled Ratings Distribution'))
        plt.xlabel(text('评分', 'Rating'))
        plt.ylabel(text('数量', 'Count'))
        save_chart('original_vs_filled_ratings.png')

    # 3. Rating trend over time.  The 'year' column is stored on ratings_df
    # itself; other code may read it, so the side effect is kept.
    self.ratings_df['year'] = pd.to_datetime(self.ratings_df['timestamp'], unit='s').dt.year
    yearly_ratings = self.ratings_df.groupby('year')['rating'].agg(['mean', 'count'])
    dual_axis_chart(
        text('年份', 'Year'),
        text('随时间变化的评分趋势', 'Rating Trends Over Time'),
        (14, 8),
        'rating_trends_over_time.png',
        yearly_ratings['mean'],
        yearly_ratings['count'],
    )

    # 4. Average rating per genre.  Each rating is attributed to every genre
    # of its movie in one merge instead of filtering the ratings per movie.
    ratings_by_genre = (
        self.ratings_df[['movieId', 'rating']]
        .merge(self.movies_df[['movieId', 'genres']], on='movieId')
        .explode('genres')
        .dropna(subset=['genres'])
    )
    per_genre = ratings_by_genre.groupby('genres')['rating'].agg(['mean', 'count'])
    # Keep genres that have no rating at all (NaN average, zero count).
    all_genres = self.movies_df['genres'].explode().dropna().unique()
    per_genre = per_genre.reindex(all_genres)
    genre_stats = pd.DataFrame({
        'genre': per_genre.index,
        'avg_rating': per_genre['mean'].to_numpy(),
        'rating_count': per_genre['count'].fillna(0).astype(int).to_numpy(),
    }).sort_values('avg_rating', ascending=False)
    plt.figure(figsize=(14, 8))
    sns.barplot(y='genre', x='avg_rating', data=genre_stats, palette=custom_colors)
    plt.title(text('各类型电影的平均评分', 'Average Rating by Genre'))
    plt.xlabel(text('平均评分', 'Average Rating'))
    plt.ylabel(text('电影类型', 'Movie Genre'))
    plt.grid(True, linestyle='--', alpha=0.7)
    save_chart('genre_avg_ratings.png')

    # 5. Best-rated movies among those with at least 100 ratings.
    movie_stats = self.ratings_df.groupby('movieId')['rating'].agg(['mean', 'count'])
    popular_movies = movie_stats[movie_stats['count'] >= 100].sort_values('mean', ascending=False)
    top_rated_movies = popular_movies.head(20).reset_index()
    top_rated_movies = top_rated_movies.merge(self.movies_df[['movieId', 'title']], on='movieId')
    plt.figure(figsize=(16, 10))
    bars = sns.barplot(y='title', x='mean', data=top_rated_movies, palette=custom_colors)
    # Annotate each bar with the number of ratings behind its average.
    bar_data = zip(top_rated_movies['mean'], top_rated_movies['count'])
    for position, (mean_rating, n_ratings) in enumerate(bar_data):
        label_text = f"评分数: {int(n_ratings)}" if USE_CHINESE else f"Ratings: {int(n_ratings)}"
        bars.text(mean_rating + 0.05, position, label_text, va='center')
    plt.title(text('评分最高的20部电影 (至少有100个评分)', 'Top 20 Highest Rated Movies (min. 100 ratings)'))
    plt.xlabel(text('平均评分', 'Average Rating'))
    plt.ylabel(text('电影标题', 'Movie Title'))
    save_chart('top_rated_movies.png')

    # 6. Average rating per release year: the mean of the per-movie averages,
    # together with the total number of ratings of that year's movies.
    dated_movies = self.movies_df.loc[self.movies_df['year'].notnull(), ['movieId', 'year']]
    per_movie = movie_stats.reset_index().merge(dated_movies, on='movieId')
    per_movie['year'] = per_movie['year'].astype(int)
    by_release_year = per_movie.groupby('year')
    dual_axis_chart(
        text('发行年份', 'Release Year'),
        text('不同发行年份电影的平均评分', 'Average Rating by Release Year'),
        (16, 8),
        'year_avg_ratings.png',
        by_release_year['mean'].mean(),
        by_release_year['count'].sum(),
    )

    print("评分分布情况分析完成,图表已保存到 " + rating_analysis_dir)
    return self
def analyze_user_preferences(self):
    """Plot how genre preferences vary with gender, age group and occupation.

    Writes PNG files to ``<output_path>/preference_analysis``: one "favourite
    genres" bar chart per gender, per age group and per selected occupation,
    plus one comparison heat map for each of the three attributes.  It then
    delegates to ``_generate_age_year_heatmap`` (falling back to
    ``_generate_fallback_age_year_heatmap``) and ``_analyze_rating_behavior``.
    A failure in one section is reported and does not stop the others.
    A heat map with no genre common to all of its columns is skipped.

    Returns:
        self, to allow call chaining.
    """
    print("\n分析用户特征与电影偏好的关系...")
    preference_analysis_dir = os.path.join(self.output_path, 'preference_analysis')
    os.makedirs(preference_analysis_dir, exist_ok=True)

    def text(zh, en):
        # Pick the label for the active display language.
        return zh if USE_CHINESE else en

    def favourite_genres_chart(preferences, top_n, title, y_range, figsize, file_name):
        # Bar chart of the top_n best-rated genres of one user group.
        best = preferences.sort_values(ascending=False).head(top_n)
        plt.figure(figsize=figsize)
        try:
            sns.barplot(x=best.index, y=best.values, palette=custom_colors)
            plt.title(title)
            plt.xlabel(text('电影类型', 'Movie Genre'))
            plt.ylabel(text('平均评分', 'Average Rating'))
            plt.ylim(*y_range)  # narrow range so small differences stay visible
            plt.xticks(rotation=45)
            plt.tight_layout()
            plt.savefig(os.path.join(preference_analysis_dir, sanitize_filename(file_name)),
                        bbox_inches='tight', dpi=100)
        finally:
            # Release the figure even when plotting fails.
            plt.close()

    def heatmap_chart(table, title, figsize, file_name):
        # Genre (rows) by user group (columns) heat map of average ratings.
        plt.figure(figsize=figsize)
        try:
            sns.heatmap(table, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
            plt.title(title)
            plt.tight_layout()
            plt.savefig(os.path.join(preference_analysis_dir, file_name), bbox_inches='tight', dpi=100)
        finally:
            plt.close()

    def by_overall_mean(table):
        # Order the genres by their mean rating across all columns.
        return table.loc[table.mean(axis=1).sort_values(ascending=False).index]

    # Ratings joined with the attributes of the user who gave them.
    user_ratings = self.ratings_df.merge(self.users_df, on='userId')

    # 1. Genre preferences by gender.
    try:
        gender_genre_preferences = self._analyze_group_genre_preference(user_ratings, 'gender')
        gender_labels = {'M': text('男性', 'Male'), 'F': text('女性', 'Female')}
        for gender, preferences in gender_genre_preferences.items():
            if USE_CHINESE:
                title = f"{gender_labels[gender]}最喜爱的电影类型"
            else:
                title = f"Most Favorite Movie Genres for {gender_labels[gender]}"
            favourite_genres_chart(preferences, 10, title, (3.0, 4.5), (12, 6),
                                   f'gender_{gender}_preferences.png')
        gender_heatmap_data = pd.DataFrame(gender_genre_preferences)
        # Column order follows first appearance in the data, so put the
        # columns in a known order and rename them by code, never by position.
        known = [code for code in ('M', 'F') if code in gender_heatmap_data.columns]
        others = [code for code in gender_heatmap_data.columns if code not in known]
        # Keep only genres rated by every gender.
        gender_heatmap_data = gender_heatmap_data[known + others].dropna()
        if not gender_heatmap_data.empty:
            gender_heatmap_data = gender_heatmap_data.sort_values(
                gender_heatmap_data.columns[0], ascending=False
            ).rename(columns=gender_labels)
            heatmap_chart(gender_heatmap_data,
                          text('不同性别的电影类型偏好对比', 'Movie Genre Preferences by Gender'),
                          (14, 10), 'gender_genre_heatmap.png')
    except Exception as e:
        print(f"分析性别偏好时出错: {e}")

    # 2. Genre preferences by age group.
    try:
        age_genre_preferences = self._analyze_group_genre_preference(user_ratings, 'age_group')
        for age_group, preferences in age_genre_preferences.items():
            if USE_CHINESE:
                title = f"{age_group}年龄段最喜爱的电影类型"
            else:
                title = f"Most Favorite Movie Genres for Age Group {age_group}"
            favourite_genres_chart(preferences, 5, title, (3.2, 4.2), (10, 5),
                                   f'age_{age_group}_preferences.png')
        age_heatmap_data = pd.DataFrame(age_genre_preferences)
        # Show the age groups youngest first.
        ordered = [label for label in self.age_mapping.values() if label in age_heatmap_data.columns]
        ordered += [label for label in age_heatmap_data.columns if label not in ordered]
        # Keep only genres rated by every age group.
        age_heatmap_data = age_heatmap_data[ordered].dropna(how='any')
        if not age_heatmap_data.empty:
            heatmap_chart(by_overall_mean(age_heatmap_data),
                          text('不同年龄段的电影类型偏好对比', 'Movie Genre Preferences by Age Group'),
                          (16, 12), 'age_genre_heatmap.png')
    except Exception as e:
        print(f"分析年龄段偏好时出错: {e}")

    # 3. Genre preferences for a few representative occupations.
    try:
        occupation_genre_preferences = self._analyze_group_genre_preference(user_ratings, 'occupation_name')
        if USE_CHINESE:
            candidates = ['程序员', '学术/教育工作者', '大学生/研究生', '艺术家', '行政/事务人员',
                          '退休人员', '失业人员']
        else:
            candidates = ['Programmer', 'Academic/Educator', 'College/Grad Student', 'Artist',
                          'Executive/Managerial', 'Retired', 'Unemployed']
        selected_occupations = [occ for occ in candidates if occ in occupation_genre_preferences]
        for occupation in selected_occupations:
            if USE_CHINESE:
                title = f"{occupation}最喜爱的电影类型"
            else:
                title = f"Most Favorite Movie Genres for {occupation}"
            favourite_genres_chart(occupation_genre_preferences[occupation], 5, title,
                                   (3.2, 4.2), (10, 5), f'occupation_{occupation}_preferences.png')
        occ_heatmap_data = pd.DataFrame(
            {occ: occupation_genre_preferences[occ] for occ in selected_occupations}
        ).dropna(how='any')
        if not occ_heatmap_data.empty:
            heatmap_chart(by_overall_mean(occ_heatmap_data),
                          text('不同职业的电影类型偏好对比', 'Movie Genre Preferences by Occupation'),
                          (16, 12), 'occupation_genre_heatmap.png')
    except Exception as e:
        print(f"分析职业偏好时出错: {e}")

    # 4. Age group versus movie decade, with a fallback implementation.
    try:
        print("开始生成年龄-年代偏好热力图")
        age_year_heatmap_generated = self._generate_age_year_heatmap(user_ratings, preference_analysis_dir)
        if not age_year_heatmap_generated:
            print("使用备选方法生成年龄-年代热力图")
            self._generate_fallback_age_year_heatmap(preference_analysis_dir)
    except Exception as e:
        print(f"分析年龄-年代偏好时出错: {e}")
        print("使用备选方法生成年龄-年代热力图")
        self._generate_fallback_age_year_heatmap(preference_analysis_dir)

    # 5. Rating behaviour versus individual factors.
    try:
        self._analyze_rating_behavior(preference_analysis_dir)
    except Exception as e:
        print(f"分析评分行为时出错: {e}")

    print("用户特征与电影偏好分析完成,图表已保存到 " + preference_analysis_dir)
    return self
def _analyze_group_genre_preference(self, user_ratings, group_col):
"""
分析特定分组的电影类型偏好
参数:
user_ratings (DataFrame): 包含用户信息的评分数据
group_col (str): 分组列名
返回:
dict: 每个组的类型偏好
"""
# 合并电影信息
ratings_with_movies = user_ratings.merge(self.movies_df[['movieId', 'genres']], on='movieId')
# 存储每个组对每种类型的评分
group_genre_ratings = defaultdict(lambda: defaultdict(list))
# 收集每种类型的评分
for _, row in ratings_with_movies.iterrows():
group = row[group_col]
for genre in row['genres']:
group_genre_ratings[group][genre].append(row['rating'])
# 计算每个组对每种类型的平均评分
group_genre_avg_ratings = {}
for group, genre_ratings in group_genre_ratings.items():
group_genre_avg_ratings[group] = {genre: np.mean(ratings) for genre, ratings in genre_ratings.items()}
group_genre_avg_ratings[group] = pd.Series(group_genre_avg_ratings[group])
return group_genre_avg_ratings
def _generate_age_year_heatmap(self, user_ratings, output_dir):
    """
    Draw a heatmap of the mean rating per user age group and movie decade.

    The release year is read from ``self.movies_df['year']``. When that
    column is missing or entirely null it is re-derived from titles ending
    in ``(YYYY)`` and stored back on ``self.movies_df``. Ratings are joined
    with the movies' decades, averaged per (age group, decade) and plotted.

    Args:
        user_ratings (DataFrame): Ratings joined with user attributes. Must
            contain ``movieId``, ``rating`` and ``age_group``.
        output_dir (str): Directory that receives ``age_year_heatmap.png``.

    Returns:
        bool: True when the image was written and is larger than 1000
        bytes. False when there is too little data (fewer than 100 dated
        movies or fewer than 1000 joined ratings), when the pivot table is
        empty, or when plotting fails.
    """
    print("使用主方法生成年龄-年代热力图")

    # 1. Make sure release years are available, re-deriving them if needed.
    if 'year' not in self.movies_df.columns or self.movies_df['year'].isnull().all():
        print("电影年份数据不可用,尝试重新提取...")

        def year_from_title(title):
            # Only titles of the exact form "... (YYYY)" carry a year;
            # non-string titles (e.g. NaN) simply yield no year.
            if (isinstance(title, str) and len(title) > 5 and title[-1] == ')'
                    and title[-6] == '(' and title[-5:-1].isdigit()):
                return int(title[-5:-1])
            return None

        self.movies_df['year'] = self.movies_df['title'].apply(year_from_title)

    # 2. Require at least 100 movies with a known year.
    valid_year_count = self.movies_df['year'].notnull().sum()
    if valid_year_count < 100:
        print(f"有效年份数据不足: {valid_year_count}部电影")
        return False

    # 3. Derive the decade label of every dated movie.
    print("准备评分数据...")
    movies_with_year = self.movies_df[self.movies_df['year'].notnull()].copy()
    # The year column is float as soon as one movie lacks a year, so cast to
    # int first; otherwise labels come out as "1990.0s" instead of "1990s".
    decade_start = movies_with_year['year'].astype(int) // 10 * 10
    movies_with_year['decade'] = decade_start.astype(str) + 's'

    # 4. Join ratings (with user attributes) to the movies' decades.
    ratings_with_data = user_ratings.merge(
        movies_with_year[['movieId', 'decade']],
        on='movieId',
        how='inner'
    )
    if len(ratings_with_data) < 1000:  # need at least 1000 joined ratings
        print(f"合并后数据不足: {len(ratings_with_data)}")
        return False

    # 5. Mean rating per (age group, decade).
    print("创建透视表...")
    pivot_data = ratings_with_data.pivot_table(
        index='age_group',
        columns='decade',
        values='rating',
        aggfunc='mean'
    )

    fig = None
    try:
        # 6. Rows follow the order of self.age_mapping, columns are sorted
        #    chronologically (decade labels sort correctly as strings).
        age_order = list(self.age_mapping.values())
        pivot_data = pivot_data.reindex([age for age in age_order if age in pivot_data.index])
        pivot_data = pivot_data.reindex(sorted(pivot_data.columns), axis=1)
        if pivot_data.empty:  # true when either axis has length zero
            print("透视表为空")
            return False

        # 7. Draw the heatmap.
        print("开始绘制热力图...")
        fig = plt.figure(figsize=(16, 10))
        sns.heatmap(pivot_data, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
        plt.title(
            '不同年龄段对不同年代电影的偏好' if USE_CHINESE else 'Preferences for Movies by Decade Across Age Groups')
        plt.xlabel('电影发行年代' if USE_CHINESE else 'Movie Release Decade')
        plt.ylabel('用户年龄段' if USE_CHINESE else 'User Age Group')
        plt.tight_layout()

        # 8. Save the figure.
        output_path = os.path.join(output_dir, 'age_year_heatmap.png')
        plt.savefig(output_path, bbox_inches='tight', dpi=100)

        # 9. Treat a missing or tiny file as a failed render.
        if os.path.exists(output_path) and os.path.getsize(output_path) > 1000:
            print(f"成功生成年龄-年代热力图,保存至: {output_path}")
            return True
        print(f"生成图表失败或文件过小: {output_path}")
        return False
    except Exception as e:
        # Best effort: the caller falls back to another method on False.
        print(f"生成热力图时出错: {e}")
        return False
    finally:
        # Always release the figure, including when plotting raised.
        if fig is not None:
            plt.close(fig)
def _generate_fallback_age_year_heatmap(self, output_dir):
    """
    Write a placeholder age-group/decade heatmap built from simulated data.

    Used when the real heatmap cannot be produced. The values are NOT
    derived from the dataset: they are reproducible pseudo-random ratings
    around 3.0 with small hand-picked bonuses per age group, and the figure
    title says "(Simulated Data)". The image overwrites
    ``age_year_heatmap.png`` in ``output_dir``.

    Args:
        output_dir (str): Directory that receives the PNG file.
    """
    print("使用备选方法生成年龄-年代热力图")

    # 1. Axes of the simulated table.
    age_groups = list(self.age_mapping.values())
    decades = ['1940s', '1950s', '1960s', '1970s', '1980s', '1990s', '2000s']

    # 2. Reproducible noise from a private generator. RandomState(42) yields
    #    the same draws as seeding the global generator with 42, but does
    #    not reset the random state used by the rest of the program.
    rng = np.random.RandomState(42)
    data = 3.0 + rng.randn(len(age_groups), len(decades)) * 0.5
    heatmap_data = pd.DataFrame(data, index=age_groups, columns=decades)

    # 3. Nudge each age group towards "its" decades. Matching is by
    #    substring of the label (presumably labels such as "Under 18",
    #    "18-24", "56+" -- confirm against self.age_mapping). The minors
    #    rule must come first: a label like "Under 18" also contains "18"
    #    and would otherwise be treated as the 18+ group.
    for age in age_groups:
        label = str(age)
        if "Under" in label or "以下" in label:  # minors
            heatmap_data.loc[age, '1990s'] += 0.2
            heatmap_data.loc[age, '2000s'] += 0.4
        elif "56" in label:  # oldest users prefer the oldest movies
            heatmap_data.loc[age, '1940s'] += 0.5
            heatmap_data.loc[age, '1950s'] += 0.4
        elif "45" in label or "50" in label:
            heatmap_data.loc[age, '1960s'] += 0.3
            heatmap_data.loc[age, '1970s'] += 0.3
        elif "35" in label:
            heatmap_data.loc[age, '1980s'] += 0.3
        elif "25" in label:
            heatmap_data.loc[age, '1990s'] += 0.3
        elif "18" in label:
            heatmap_data.loc[age, '2000s'] += 0.3

    # 4. Clamp to the valid rating range only after the bonuses, so that no
    #    cell can end up above 5.
    heatmap_data = heatmap_data.clip(lower=1.0, upper=5.0)

    # 5. Draw and save; the figure is released even if plotting fails.
    output_path = os.path.join(output_dir, 'age_year_heatmap.png')
    fig = plt.figure(figsize=(16, 10))
    try:
        sns.heatmap(heatmap_data, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
        plt.title(
            '不同年龄段对不同年代电影的偏好 (模拟数据)' if USE_CHINESE else 'Preferences for Movies by Decade Across Age Groups (Simulated Data)')
        plt.xlabel('电影发行年代' if USE_CHINESE else 'Movie Release Decade')
        plt.ylabel('用户年龄段' if USE_CHINESE else 'User Age Group')
        plt.tight_layout()
        plt.savefig(output_path, bbox_inches='tight', dpi=100)
    finally:
        plt.close(fig)
    print(f"已生成备选年龄-年代热力图,保存至: {output_path}")
def _analyze_rating_behavior(self, output_dir):
    """
    Chart how per-user rating statistics relate to user attributes.

    For every user the mean, standard deviation and number of ratings are
    computed from ``self.ratings_df`` and joined with ``self.users_df``.
    Six PNG files are then written to ``output_dir``: box plots of the
    average rating by gender, age group and occupation, box plots of the
    rating standard deviation by gender and age group, and a scatter plot
    of rating count against average rating.

    Args:
        output_dir (str): Directory that receives the figures.
    """

    def pick(zh, en):
        # Label in the active display language.
        return zh if USE_CHINESE else en

    def label_axes(title, x_label, y_label):
        plt.title(title)
        plt.xlabel(x_label)
        plt.ylabel(y_label)

    def save_and_close(filename):
        plt.tight_layout()
        plt.savefig(os.path.join(output_dir, filename), bbox_inches='tight', dpi=100)
        plt.close()

    # Per-user aggregates, then attach the user attributes.
    per_user = self.ratings_df.groupby('userId')['rating'].agg(['mean', 'std', 'count'])
    per_user.columns = ['avg_rating', 'rating_std', 'rating_count']
    user_stats = per_user.reset_index().merge(self.users_df, on='userId')

    gender_palette = [custom_colors[0], custom_colors[1]]

    def gender_boxplot(value_col):
        # Box plot of `value_col` split by gender, with localized tick labels.
        plt.figure(figsize=(10, 6))
        axis = sns.boxplot(x='gender', y=value_col, data=user_stats, palette=gender_palette)
        if USE_CHINESE:
            names = {'M': '男性', 'F': '女性'}
            axis.set_xticklabels([names[tick.get_text()] for tick in axis.get_xticklabels()])

    def category_boxplot(category_col, value_col, size):
        # Box plot of `value_col` split by a multi-valued category column.
        plt.figure(figsize=size)
        sns.boxplot(x=category_col, y=value_col, data=user_stats, palette=custom_colors)

    # 1. Average rating by gender, annotated with a two-sample t-test.
    gender_boxplot('avg_rating')
    label_axes(pick('性别与平均评分的关系', 'Average Rating by Gender'),
               pick('性别', 'Gender'),
               pick('平均评分', 'Average Rating'))
    male_ratings = user_stats.loc[user_stats['gender'] == 'M', 'avg_rating']
    female_ratings = user_stats.loc[user_stats['gender'] == 'F', 'avg_rating']
    if len(male_ratings) > 0 and len(female_ratings) > 0:
        _, p_value = stats.ttest_ind(male_ratings, female_ratings)
        annotation = pick(f'T检验: p={p_value:.4f}', f'T-test: p={p_value:.4f}')
        plt.annotate(annotation, xy=(0.5, 0.05), xycoords='axes fraction')
    save_and_close('gender_avg_rating.png')

    # 2. Average rating by age group.
    category_boxplot('age_group', 'avg_rating', (14, 6))
    label_axes(pick('年龄组与平均评分的关系', 'Average Rating by Age Group'),
               pick('年龄组', 'Age Group'),
               pick('平均评分', 'Average Rating'))
    plt.xticks(rotation=45)
    save_and_close('age_avg_rating.png')

    # 3. Rating standard deviation (rating consistency) by gender.
    gender_boxplot('rating_std')
    label_axes(pick('性别与评分标准差的关系', 'Rating Standard Deviation by Gender'),
               pick('性别', 'Gender'),
               pick('评分标准差', 'Rating Standard Deviation'))
    save_and_close('gender_rating_std.png')

    # 4. Rating standard deviation by age group.
    category_boxplot('age_group', 'rating_std', (14, 6))
    label_axes(pick('年龄组与评分标准差的关系', 'Rating Standard Deviation by Age Group'),
               pick('年龄组', 'Age Group'),
               pick('评分标准差', 'Rating Standard Deviation'))
    plt.xticks(rotation=45)
    save_and_close('age_rating_std.png')

    # 5. Average rating by occupation.
    category_boxplot('occupation_name', 'avg_rating', (16, 10))
    label_axes(pick('职业与平均评分的关系', 'Average Rating by Occupation'),
               pick('职业', 'Occupation'),
               pick('平均评分', 'Average Rating'))
    plt.xticks(rotation=90)
    save_and_close('occupation_avg_rating.png')

    # 6. Rating count against average rating, coloured by gender.
    plt.figure(figsize=(12, 6))
    hue_column = 'gender'
    if USE_CHINESE:
        # Localized legend entries need their own column.
        user_stats['gender_label'] = user_stats['gender'].map({'M': '男性', 'F': '女性'})
        hue_column = 'gender_label'
    sns.scatterplot(x='rating_count', y='avg_rating', hue=hue_column, data=user_stats,
                    palette=gender_palette)
    label_axes(pick('评分数量与平均评分的关系', 'Relationship Between Rating Count and Average Rating'),
               pick('评分数量', 'Number of Ratings'),
               pick('平均评分', 'Average Rating'))
    plt.xscale('log')  # rating counts are heavily skewed; log scale spreads them out
    plt.grid(True, linestyle='--', alpha=0.7)
    save_and_close('count_vs_avg_rating.png')
def generate_summary_report(self):
    """
    Write the JSON summary and the HTML report of the analysis.

    Two files are created in ``self.output_path``:

    * ``analysis_summary.json`` -- user, movie and rating counts, the gender
      and age-group distributions, the mean rating and the rating histogram.
    * ``analysis_report.html`` -- a static page that embeds, by relative
      path, the figures expected under ``user_analysis/``,
      ``movie_analysis/``, ``rating_analysis/`` and ``preference_analysis/``.

    Every label follows the module-level ``USE_CHINESE`` switch, including
    image ``alt`` texts and the "filled data not used" placeholder.

    Returns:
        The analyzer itself, to allow call chaining.
    """
    print("\n生成分析摘要报告...")

    def tr(zh, en):
        # Text in the active display language.
        return zh if USE_CHINESE else en

    user_count = len(self.users_df)
    movie_count = len(self.movies_df)
    rating_count = len(self.ratings_df)
    # The filled matrix is optional; treat "attribute missing" and
    # "attribute set to None" alike instead of failing on len(None).
    filled_ratings = getattr(self, 'filled_ratings_df', None)
    if filled_ratings is not None:
        filled_count = len(filled_ratings)
    else:
        filled_count = tr("未使用填补数据", "Filled data not used")

    # Machine-readable summary.
    summary = {
        tr("数据概览", "Data Overview"): {
            tr("用户数量", "Number of Users"): user_count,
            tr("电影数量", "Number of Movies"): movie_count,
            tr("原始评分数量", "Original Ratings Count"): rating_count,
            tr("填补后评分数量", "Filled Ratings Count"): filled_count,
        },
        tr("用户分析", "User Analysis"): {
            tr("性别分布", "Gender Distribution"): self.users_df['gender'].value_counts().to_dict(),
            tr("年龄分布", "Age Distribution"): self.users_df['age_group'].value_counts().to_dict(),
        },
        tr("评分分析", "Rating Analysis"): {
            tr("平均评分", "Average Rating"): round(self.ratings_df['rating'].mean(), 2),
            tr("评分分布", "Rating Distribution"): self.ratings_df[
                'rating'].value_counts().sort_index().to_dict(),
        },
    }
    summary_path = os.path.join(self.output_path, 'analysis_summary.json')
    with open(summary_path, 'w', encoding='utf-8') as f:
        json.dump(summary, f, ensure_ascii=False, indent=4)

    # Report sections: (heading, [(image path, Chinese alt, Chinese caption,
    # English caption), ...]). In English mode the caption doubles as alt.
    sections = [
        (tr("用户基本情况分析", "User Profile Analysis"), [
            ("user_analysis/gender_distribution.png", "用户性别分布",
             "用户性别分布", "User Gender Distribution"),
            ("user_analysis/age_distribution.png", "用户年龄分布",
             "用户年龄分布", "User Age Distribution"),
            ("user_analysis/occupation_distribution.png", "用户职业分布",
             "用户职业分布", "User Occupation Distribution"),
        ]),
        (tr("电影分布情况分析", "Movie Distribution Analysis"), [
            ("movie_analysis/genre_distribution.png", "电影类型分布",
             "电影类型分布", "Movie Genre Distribution"),
            ("movie_analysis/year_distribution.png", "电影发行年份分布",
             "电影发行年份分布", "Movie Release Year Distribution"),
            ("movie_analysis/most_rated_movies.png", "评分数量最多的电影",
             "评分数量最多的20部电影", "Top 20 Most Rated Movies"),
        ]),
        (tr("评分分布情况分析", "Rating Distribution Analysis"), [
            ("rating_analysis/rating_distribution.png", "评分分布",
             "评分分布情况", "Rating Distribution"),
            ("rating_analysis/genre_avg_ratings.png", "各类型电影的平均评分",
             "各类型电影的平均评分", "Average Rating by Movie Genre"),
            ("rating_analysis/top_rated_movies.png", "评分最高的电影",
             "评分最高的20部电影（至少有100个评分）", "Top 20 Highest Rated Movies (min. 100 ratings)"),
        ]),
        (tr("用户特征与电影偏好分析", "User Characteristics and Movie Preferences"), [
            ("preference_analysis/gender_genre_heatmap.png", "不同性别的电影类型偏好",
             "不同性别的电影类型偏好对比", "Movie Genre Preferences by Gender"),
            ("preference_analysis/age_genre_heatmap.png", "不同年龄段的电影类型偏好",
             "不同年龄段的电影类型偏好对比", "Movie Genre Preferences by Age Group"),
            ("preference_analysis/age_year_heatmap.png", "不同年龄段对不同年代电影的偏好",
             "不同年龄段对不同年代电影的偏好", "Preferences for Movies by Decade Across Age Groups"),
            ("preference_analysis/gender_avg_rating.png", "性别与平均评分的关系",
             "性别与平均评分的关系", "Relationship Between Gender and Average Rating"),
        ]),
    ]
    section_parts = []
    for heading, figures in sections:
        section_parts.append(f"<h2>{heading}</h2>")
        for src, alt_zh, caption_zh, caption_en in figures:
            caption = tr(caption_zh, caption_en)
            alt = tr(alt_zh, caption_en)
            section_parts.append(
                '<div class="figure">\n'
                f'<img src="{src}" alt="{alt}">\n'
                f'<p class="caption">{caption}</p>\n'
                '</div>'
            )
    sections_html = "\n".join(section_parts)

    # Overview sentence, with separators and a full stop in each language.
    overview_text = tr(
        f"本分析基于MovieLens数据集，包含 {user_count} 位用户、{movie_count} 部电影和 "
        f"{rating_count} 条原始评分记录。",
        f"This analysis is based on the MovieLens dataset, containing {user_count} users, "
        f"{movie_count} movies and {rating_count} original rating records.",
    )

    findings_intro = tr(
        "通过对MovieLens数据集的深入分析，我们发现了用户特征（如性别、年龄、职业）与电影偏好之间存在显著关联。主要结论包括：",
        "Through in-depth analysis of the MovieLens dataset, we found significant correlations between user characteristics (gender, age, occupation) and movie preferences. Key findings include:",
    )
    findings = [
        tr("不同性别用户在电影类型偏好上存在明显差异",
           "Significant differences in movie genre preferences between genders"),
        tr("年龄因素会影响用户对不同年代电影的评价",
           "Age influences how users rate movies from different decades"),
        tr("职业背景与电影类型偏好具有相关性",
           "Occupational background correlates with genre preferences"),
    ]
    findings_html = "\n".join(f"<li>{item}</li>" for item in findings)
    closing_text = tr(
        "这些发现对于电影推荐系统的设计和电影营销策略制定具有重要参考价值。",
        "These findings provide valuable reference for designing movie recommendation systems and developing movie marketing strategies.",
    )

    # Plain (non-f) string, so CSS braces need no escaping.
    page_style = """
body { font-family: Arial, sans-serif; margin: 0; padding: 20px; color: #333; }
.container { max-width: 1200px; margin: 0 auto; }
h1 { color: #2c3e50; text-align: center; margin-bottom: 30px; }
h2 { color: #3498db; margin-top: 30px; border-bottom: 1px solid #eee; padding-bottom: 10px; }
.summary { background-color: #f9f9f9; padding: 15px; border-radius: 5px; margin-bottom: 20px; }
.gallery { display: flex; flex-wrap: wrap; gap: 20px; justify-content: center; margin-top: 20px; }
.gallery img { max-width: 100%; height: auto; border-radius: 5px; box-shadow: 0 2px 5px rgba(0,0,0,0.1); }
.figure { margin-bottom: 30px; text-align: center; }
.figure img { max-width: 100%; height: auto; border-radius: 5px; box-shadow: 0 2px 5px rgba(0,0,0,0.1); }
.figure .caption { margin-top: 10px; font-style: italic; color: #666; }
table { width: 100%; border-collapse: collapse; margin: 20px 0; }
th, td { padding: 10px; text-align: left; border-bottom: 1px solid #ddd; }
th { background-color: #f2f2f2; }
tr:hover { background-color: #f5f5f5; }
"""

    html_title = tr('MovieLens数据集分析报告', 'MovieLens Dataset Analysis Report')
    page_heading = tr("MovieLens数据集用户-电影偏好分析报告",
                      "MovieLens Dataset User-Movie Preference Analysis Report")
    overview_heading = tr("数据概览", "Data Overview")
    conclusions_heading = tr("结论与洞察", "Conclusions and Insights")

    html_report = f"""<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>{html_title}</title>
<style>{page_style}</style>
</head>
<body>
<div class="container">
<h1>{page_heading}</h1>
<div class="summary">
<h2>{overview_heading}</h2>
<p>{overview_text}</p>
</div>
{sections_html}
<h2>{conclusions_heading}</h2>
<p>{findings_intro}</p>
<ul>
{findings_html}
</ul>
<p>{closing_text}</p>
</div>
</body>
</html>
"""
    report_path = os.path.join(self.output_path, 'analysis_report.html')
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write(html_report)

    print(f"分析摘要报告已保存到: {summary_path}")
    print(f"HTML分析报告已保存到: {report_path}")
    return self
def run_analysis(self):
    """
    Run the whole analysis pipeline and report how long it took.

    The stages run in a fixed order: load the data, analyze users, movies
    and ratings, analyze user preferences, then write the summary report.

    Returns:
        The analyzer itself, to allow call chaining.
    """
    print("开始MovieLens数据集分析...")
    started_at = time.time()

    # Later stages depend on the data loaded by the first one, so the
    # order of this tuple matters.
    pipeline = (
        self.load_data,
        self.analyze_users,
        self.analyze_movies,
        self.analyze_ratings,
        self.analyze_user_preferences,
        self.generate_summary_report,
    )
    for stage in pipeline:
        stage()

    elapsed = time.time() - started_at
    print(f"\n分析完成!总运行时间: {elapsed:.2f}")
    print(f"分析结果已保存到目录: {self.output_path}")
    return self
if __name__ == "__main__":
    # Script entry point: build the analyzer with the default project
    # layout and run the full pipeline.
    analyzer_settings = {
        'data_path': './dataset',  # directory holding the raw data
        'filled_ratings_path': './result',  # directory holding the filled rating matrix
        'output_path': './analysis_results/',  # directory that receives the results
    }
    analyzer = MovieLensDataAnalyzer(**analyzer_settings)
    analyzer.run_analysis()