This commit is contained in:
Cat Tom 2025-05-05 08:18:13 +08:00
parent 3e50e8d1c5
commit 85f7f3ea8f
9 changed files with 6694 additions and 311 deletions

6434
README

File diff suppressed because it is too large Load Diff

Binary file not shown.

View File

@ -32,71 +32,71 @@
<h2>User Profile Analysis</h2>
<div class="figure">
<img src="user_analysis/gender_distribution.png" alt="用户性别分布">
<img src="user_analysis/gender_distribution.png" alt="User Gender Distribution">
<p class="caption">User Gender Distribution</p>
</div>
<div class="figure">
<img src="user_analysis/age_distribution.png" alt="用户年龄分布">
<img src="user_analysis/age_distribution.png" alt="User Age Distribution">
<p class="caption">User Age Distribution</p>
</div>
<div class="figure">
<img src="user_analysis/occupation_distribution.png" alt="用户职业分布">
<img src="user_analysis/occupation_distribution.png" alt="User Occupation Distribution">
<p class="caption">User Occupation Distribution</p>
</div>
<h2>Movie Distribution Analysis</h2>
<div class="figure">
<img src="movie_analysis/genre_distribution.png" alt="电影类型分布">
<img src="movie_analysis/genre_distribution.png" alt="Movie Genre Distribution">
<p class="caption">Movie Genre Distribution</p>
</div>
<div class="figure">
<img src="movie_analysis/year_distribution.png" alt="电影发行年份分布">
<img src="movie_analysis/year_distribution.png" alt="Movie Release Year Distribution">
<p class="caption">Movie Release Year Distribution</p>
</div>
<div class="figure">
<img src="movie_analysis/most_rated_movies.png" alt="评分数量最多的电影">
<img src="movie_analysis/most_rated_movies.png" alt="Top 20 Most Rated Movies">
<p class="caption">Top 20 Most Rated Movies</p>
</div>
<h2>Rating Distribution Analysis</h2>
<div class="figure">
<img src="rating_analysis/rating_distribution.png" alt="评分分布">
<img src="rating_analysis/rating_distribution.png" alt="Rating Distribution">
<p class="caption">Rating Distribution</p>
</div>
<div class="figure">
<img src="rating_analysis/genre_avg_ratings.png" alt="各类型电影的平均评分">
<img src="rating_analysis/genre_avg_ratings.png" alt="Average Rating by Movie Genre">
<p class="caption">Average Rating by Movie Genre</p>
</div>
<div class="figure">
<img src="rating_analysis/top_rated_movies.png" alt="评分最高的电影">
<img src="rating_analysis/top_rated_movies.png" alt="Top 20 Highest Rated Movies">
<p class="caption">Top 20 Highest Rated Movies (min. 100 ratings)</p>
</div>
<h2>User Characteristics and Movie Preferences</h2>
<div class="figure">
<img src="preference_analysis/gender_genre_heatmap.png" alt="不同性别的电影类型偏好">
<img src="preference_analysis/gender_genre_heatmap.png" alt="Movie Genre Preferences by Gender">
<p class="caption">Movie Genre Preferences by Gender</p>
</div>
<div class="figure">
<img src="preference_analysis/age_genre_heatmap.png" alt="不同年龄段的电影类型偏好">
<img src="preference_analysis/age_genre_heatmap.png" alt="Movie Genre Preferences by Age Group">
<p class="caption">Movie Genre Preferences by Age Group</p>
</div>
<div class="figure">
<img src="preference_analysis/age_year_heatmap.png" alt="不同年龄段对不同年代电影的偏好">
<img src="preference_analysis/age_year_heatmap.png" alt="Preferences for Movies by Decade Across Age Groups">
<p class="caption">Preferences for Movies by Decade Across Age Groups</p>
</div>
<div class="figure">
<img src="preference_analysis/gender_avg_rating.png" alt="性别与平均评分的关系">
<p class="caption">Relationship Between Gender and Average Rating</p>
<img src="preference_analysis/gender_avg_rating.png" alt="Average Rating by Gender">
<p class="caption">Average Rating by Gender</p>
</div>
<h2>Conclusions and Insights</h2>

Binary file not shown.

Before

Width:  |  Height:  |  Size: 334 KiB

After

Width:  |  Height:  |  Size: 336 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 24 KiB

After

Width:  |  Height:  |  Size: 25 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 24 KiB

After

Width:  |  Height:  |  Size: 25 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 21 KiB

After

Width:  |  Height:  |  Size: 22 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 23 KiB

After

Width:  |  Height:  |  Size: 24 KiB

View File

@ -25,7 +25,6 @@ import time
import matplotlib as mpl
import platform
import tempfile
import urllib.request
import re
import sys
@ -38,31 +37,40 @@ np.random.seed(42)
# 自定义调色板
custom_colors = ['#FF9A76', '#67B7D1', '#A8D5BA', '#D8A47F', '#957DAD', '#7B506F', '#9AACB8']
# 全局变量控制是否使用中文
USE_CHINESE = False # 默认不使用中文
# 强制使用英文,避免中文显示问题
USE_ENGLISH = True # 设置为True使用英文False使用中文如果支持
def download_noto_font():
"""下载谷歌Noto Sans中文字体"""
temp_dir = os.path.join(tempfile.gettempdir(), 'movielens_fonts')
os.makedirs(temp_dir, exist_ok=True)
font_path = os.path.join(temp_dir, 'NotoSansSC-Regular.otf')
# 配置matplotlib字体和编码
def configure_matplotlib_fonts():
"""配置matplotlib使用合适的字体显示中文"""
# 如果字体已存在,直接返回路径
if os.path.exists(font_path):
print(f"使用已下载的Noto Sans字体: {font_path}")
return font_path
# 根据当前环境使用不同的默认字体
system = platform.system()
# 下载字体
font_url = "/"
try:
print(f"正在下载中文字体: {font_url}")
urllib.request.urlretrieve(font_url, font_path)
print(f"字体下载成功: {font_path}")
return font_path
except Exception as e:
print(f"字体下载失败: {e}")
return None
if system == 'Windows':
# Windows环境
plt.rcParams['font.sans-serif'] = ['SimHei', 'Microsoft YaHei', 'Arial']
elif system == 'Darwin':
# macOS环境
plt.rcParams['font.sans-serif'] = ['Arial Unicode MS', 'PingFang SC', 'Heiti SC']
else:
# Linux环境或其他
plt.rcParams['font.sans-serif'] = ['DejaVu Sans', 'WenQuanYi Micro Hei', 'Noto Sans CJK SC']
# 通用设置
plt.rcParams['axes.unicode_minus'] = False # 正确显示负号
plt.rcParams['font.family'] = 'sans-serif'
print(f"字体配置完成,当前系统: {system}")
# 应用字体配置
configure_matplotlib_fonts()
# 设置图表样式
sns.set_style("whitegrid")
plt.style.use('seaborn-v0_8-pastel')
def sanitize_filename(filename):
@ -71,55 +79,6 @@ def sanitize_filename(filename):
return re.sub(illegal_chars, '_', filename)
def setup_chinese_display():
    """Try to enable Chinese text rendering in matplotlib.

    Obtains a font file path from ``download_noto_font()``, registers it with
    matplotlib's font manager, places 'Noto Sans SC' first in the sans-serif
    font list, and renders a small test image into the system temp directory.

    Side effects:
        Sets the module-level ``USE_CHINESE`` flag and modifies
        ``plt.rcParams`` (font family, sans-serif list, unicode minus).

    Returns:
        bool: True if the font was registered and the test image was saved,
        False otherwise.
    """
    global USE_CHINESE

    # Stay on English labels until the font has been shown to work.
    USE_CHINESE = False

    font_path = download_noto_font()
    if not font_path or not os.path.exists(font_path):
        print("未能获取中文字体,将使用英文")
        return False

    try:
        from matplotlib import font_manager

        # Register the font file with matplotlib.
        font_manager.fontManager.addfont(font_path)

        # Put the font first exactly once, so repeated calls do not keep
        # growing the sans-serif list with duplicate entries.
        other_fonts = [name for name in plt.rcParams['font.sans-serif'] if name != 'Noto Sans SC']
        plt.rcParams['font.family'] = ['sans-serif']
        plt.rcParams['font.sans-serif'] = ['Noto Sans SC'] + other_fonts
        plt.rcParams['axes.unicode_minus'] = False
        print("成功配置中文字体")

        # Render a small test image; always close the figure, even when
        # saving fails, so a failed attempt does not leak an open figure.
        fig = plt.figure(figsize=(2, 1))
        try:
            plt.text(0.5, 0.5, '测试中文', fontsize=12, ha='center')
            plt.axis('off')
            test_path = os.path.join(tempfile.gettempdir(), 'chinese_test.png')
            plt.savefig(test_path)
        finally:
            plt.close(fig)
        print(f"中文测试图保存到: {test_path}")
    except Exception as e:
        # Best-effort setup: any failure simply falls back to English.
        print(f"中文字体配置失败: {e}")
        return False

    USE_CHINESE = True
    return True
# 配置中文显示
setup_chinese_display()
# 设置图表样式
sns.set_style("whitegrid")
plt.style.use('seaborn-v0_8-pastel')
class MovieLensDataAnalyzer:
"""MovieLens数据集分析工具"""
@ -131,36 +90,36 @@ class MovieLensDataAnalyzer:
# 特征映射字典
self.age_mapping = {
1: "18岁以下" if USE_CHINESE else "Under 18",
1: "Under 18" if USE_ENGLISH else "18岁以下",
18: "18-24",
25: "25-34",
35: "35-44",
45: "45-49",
50: "50-55",
56: "56岁以上" if USE_CHINESE else "56+"
56: "56+" if USE_ENGLISH else "56岁以上"
}
self.occupation_mapping = {
0: "其他" if USE_CHINESE else "Other",
1: "学术/教育工作者" if USE_CHINESE else "Academic/Educator",
2: "艺术家" if USE_CHINESE else "Artist",
3: "文员/管理人员" if USE_CHINESE else "Clerical/Admin",
4: "大学生/研究生" if USE_CHINESE else "College/Grad Student",
5: "客户服务人员" if USE_CHINESE else "Customer Service",
6: "医疗/保健人员" if USE_CHINESE else "Doctor/Health Care",
7: "行政/事务人员" if USE_CHINESE else "Executive/Managerial",
8: "家庭主妇" if USE_CHINESE else "Homemaker",
9: "K-12学生" if USE_CHINESE else "K-12 Student",
10: "律师" if USE_CHINESE else "Lawyer",
11: "程序员" if USE_CHINESE else "Programmer",
12: "退休人员" if USE_CHINESE else "Retired",
13: "销售/营销人员" if USE_CHINESE else "Sales/Marketing",
14: "科学家" if USE_CHINESE else "Scientist",
15: "个体户" if USE_CHINESE else "Self-employed",
16: "技术人员/工程师" if USE_CHINESE else "Technician/Engineer",
17: "手工艺人" if USE_CHINESE else "Tradesman/Craftsman",
18: "失业人员" if USE_CHINESE else "Unemployed",
19: "作家" if USE_CHINESE else "Writer"
0: "Other" if USE_ENGLISH else "其他",
1: "Academic/Educator" if USE_ENGLISH else "学术/教育工作者",
2: "Artist" if USE_ENGLISH else "艺术家",
3: "Clerical/Admin" if USE_ENGLISH else "文员/管理人员",
4: "College/Grad Student" if USE_ENGLISH else "大学生/研究生",
5: "Customer Service" if USE_ENGLISH else "客户服务人员",
6: "Doctor/Health Care" if USE_ENGLISH else "医疗/保健人员",
7: "Executive/Managerial" if USE_ENGLISH else "行政/事务人员",
8: "Homemaker" if USE_ENGLISH else "家庭主妇",
9: "K-12 Student" if USE_ENGLISH else "K-12学生",
10: "Lawyer" if USE_ENGLISH else "律师",
11: "Programmer" if USE_ENGLISH else "程序员",
12: "Retired" if USE_ENGLISH else "退休人员",
13: "Sales/Marketing" if USE_ENGLISH else "销售/营销人员",
14: "Scientist" if USE_ENGLISH else "科学家",
15: "Self-employed" if USE_ENGLISH else "个体户",
16: "Technician/Engineer" if USE_ENGLISH else "技术人员/工程师",
17: "Tradesman/Craftsman" if USE_ENGLISH else "手工艺人",
18: "Unemployed" if USE_ENGLISH else "失业人员",
19: "Writer" if USE_ENGLISH else "作家"
}
# 创建输出目录(如果不存在)
@ -296,13 +255,13 @@ class MovieLensDataAnalyzer:
# 1. 用户性别分布
gender_counts = self.users_df['gender'].value_counts()
gender_labels = {'M': '男性' if USE_CHINESE else 'Male',
'F': '女性' if USE_CHINESE else 'Female'}
gender_labels = {'M': 'Male' if USE_ENGLISH else '男性',
'F': 'Female' if USE_ENGLISH else '女性'}
plt.figure(figsize=(10, 6))
ax = gender_counts.plot(kind='pie', autopct='%1.1f%%', startangle=90,
colors=[custom_colors[0], custom_colors[1]])
plt.title('用户性别分布' if USE_CHINESE else 'User Gender Distribution')
plt.title('User Gender Distribution' if USE_ENGLISH else '用户性别分布')
plt.ylabel('')
# 修改饼图标签
@ -318,9 +277,9 @@ class MovieLensDataAnalyzer:
plt.figure(figsize=(12, 6))
sns.barplot(x=age_counts.index, y=age_counts.values, palette=custom_colors)
plt.title('用户年龄分布' if USE_CHINESE else 'User Age Distribution')
plt.xlabel('年龄段' if USE_CHINESE else 'Age Group')
plt.ylabel('用户数量' if USE_CHINESE else 'Number of Users')
plt.title('User Age Distribution' if USE_ENGLISH else '用户年龄分布')
plt.xlabel('Age Group' if USE_ENGLISH else '年龄段')
plt.ylabel('Number of Users' if USE_ENGLISH else '用户数量')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig(os.path.join(user_analysis_dir, 'age_distribution.png'), bbox_inches='tight', dpi=100)
@ -331,9 +290,9 @@ class MovieLensDataAnalyzer:
plt.figure(figsize=(14, 8))
sns.barplot(x=occupation_counts.values, y=occupation_counts.index, palette=custom_colors)
plt.title('用户职业分布' if USE_CHINESE else 'User Occupation Distribution')
plt.xlabel('用户数量' if USE_CHINESE else 'Number of Users')
plt.ylabel('职业' if USE_CHINESE else 'Occupation')
plt.title('User Occupation Distribution' if USE_ENGLISH else '用户职业分布')
plt.xlabel('Number of Users' if USE_ENGLISH else '用户数量')
plt.ylabel('Occupation' if USE_ENGLISH else '职业')
plt.tight_layout()
plt.savefig(os.path.join(user_analysis_dir, 'occupation_distribution.png'), bbox_inches='tight', dpi=100)
plt.close()
@ -343,10 +302,10 @@ class MovieLensDataAnalyzer:
plt.figure(figsize=(14, 8))
sns.barplot(x=region_counts.values, y=region_counts.index, palette=custom_colors)
title_text = '用户地域分布 (前20个邮编区域)' if USE_CHINESE else 'User Regional Distribution (Top 20 ZIP Codes)'
title_text = 'User Regional Distribution (Top 20 ZIP Codes)' if USE_ENGLISH else '用户地域分布 (前20个邮编区域)'
plt.title(title_text)
plt.xlabel('用户数量' if USE_CHINESE else 'Number of Users')
plt.ylabel('邮编区域' if USE_CHINESE else 'ZIP Code Region')
plt.xlabel('Number of Users' if USE_ENGLISH else '用户数量')
plt.ylabel('ZIP Code Region' if USE_ENGLISH else '邮编区域')
plt.tight_layout()
plt.savefig(os.path.join(user_analysis_dir, 'region_distribution.png'), bbox_inches='tight', dpi=100)
plt.close()
@ -356,14 +315,16 @@ class MovieLensDataAnalyzer:
gender_age_counts = self.users_df.groupby(['age_group', 'gender']).size().unstack()
# Rename the gender columns by key rather than by position: unstack() yields
# the gender codes in sorted order ('F', 'M'), so assigning a fixed
# ['Male', 'Female'] list would swap the legend labels.
gender_age_counts.columns = [gender_labels[col] for col in gender_age_counts.columns]
gender_age_counts.plot(kind='bar', stacked=True, color=custom_colors)
plt.title('用户性别和年龄组合分布' if USE_CHINESE else 'Gender and Age Distribution')
plt.xlabel('年龄段' if USE_CHINESE else 'Age Group')
plt.ylabel('用户数量' if USE_CHINESE else 'Number of Users')
plt.legend(title='性别' if USE_CHINESE else 'Gender')
plt.title('Gender and Age Distribution' if USE_ENGLISH else '用户性别和年龄组合分布')
plt.xlabel('Age Group' if USE_ENGLISH else '年龄段')
plt.ylabel('Number of Users' if USE_ENGLISH else '用户数量')
plt.legend(title='Gender' if USE_ENGLISH else '性别')
plt.tight_layout()
plt.savefig(os.path.join(user_analysis_dir, 'gender_age_distribution.png'), bbox_inches='tight', dpi=100)
plt.close()
@ -373,9 +334,9 @@ class MovieLensDataAnalyzer:
plt.figure(figsize=(12, 6))
sns.histplot(user_rating_counts, bins=30, kde=True, color=custom_colors[0])
plt.title('用户评分活跃度分布' if USE_CHINESE else 'User Rating Activity Distribution')
plt.xlabel('每位用户的评分数量' if USE_CHINESE else 'Number of Ratings per User')
plt.ylabel('用户数' if USE_CHINESE else 'Number of Users')
plt.title('User Rating Activity Distribution' if USE_ENGLISH else '用户评分活跃度分布')
plt.xlabel('Number of Ratings per User' if USE_ENGLISH else '每位用户的评分数量')
plt.ylabel('Number of Users' if USE_ENGLISH else '用户数')
plt.tight_layout()
plt.savefig(os.path.join(user_analysis_dir, 'rating_activity_distribution.png'), bbox_inches='tight', dpi=100)
plt.close()
@ -389,12 +350,11 @@ class MovieLensDataAnalyzer:
ax = sns.boxplot(x='gender', y='rating_count', data=user_activity, palette=[custom_colors[0], custom_colors[1]])
# Relabel the x ticks from the gender codes actually drawn on the axis, so the
# (always English) labels stay correct whichever order the categories appear in.
english_gender = {'M': 'Male', 'F': 'Female'}
ax.set_xticklabels([english_gender.get(tick.get_text(), tick.get_text()) for tick in ax.get_xticklabels()])
plt.title('不同性别的评分活跃度分布' if USE_CHINESE else 'Rating Activity by Gender')
plt.xlabel('性别' if USE_CHINESE else 'Gender')
plt.ylabel('评分数量' if USE_CHINESE else 'Number of Ratings')
plt.title('Rating Activity by Gender' if USE_ENGLISH else '不同性别的评分活跃度分布')
plt.xlabel('Gender' if USE_ENGLISH else '性别')
plt.ylabel('Number of Ratings' if USE_ENGLISH else '评分数量')
plt.tight_layout()
plt.savefig(os.path.join(user_analysis_dir, 'gender_activity.png'), bbox_inches='tight', dpi=100)
plt.close()
@ -402,9 +362,9 @@ class MovieLensDataAnalyzer:
# 7.2 年龄段与评分活跃度
plt.figure(figsize=(14, 6))
sns.boxplot(x='age_group', y='rating_count', data=user_activity, palette=custom_colors)
plt.title('不同年龄段的评分活跃度分布' if USE_CHINESE else 'Rating Activity by Age Group')
plt.xlabel('年龄段' if USE_CHINESE else 'Age Group')
plt.ylabel('评分数量' if USE_CHINESE else 'Number of Ratings')
plt.title('Rating Activity by Age Group' if USE_ENGLISH else '不同年龄段的评分活跃度分布')
plt.xlabel('Age Group' if USE_ENGLISH else '年龄段')
plt.ylabel('Number of Ratings' if USE_ENGLISH else '评分数量')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig(os.path.join(user_analysis_dir, 'age_activity.png'), bbox_inches='tight', dpi=100)
@ -413,9 +373,9 @@ class MovieLensDataAnalyzer:
# 7.3 职业与评分活跃度
plt.figure(figsize=(16, 10))
sns.boxplot(x='occupation_name', y='rating_count', data=user_activity, palette=custom_colors)
plt.title('不同职业的评分活跃度分布' if USE_CHINESE else 'Rating Activity by Occupation')
plt.xlabel('职业' if USE_CHINESE else 'Occupation')
plt.ylabel('评分数量' if USE_CHINESE else 'Number of Ratings')
plt.title('Rating Activity by Occupation' if USE_ENGLISH else '不同职业的评分活跃度分布')
plt.xlabel('Occupation' if USE_ENGLISH else '职业')
plt.ylabel('Number of Ratings' if USE_ENGLISH else '评分数量')
plt.xticks(rotation=90)
plt.tight_layout()
plt.savefig(os.path.join(user_analysis_dir, 'occupation_activity.png'), bbox_inches='tight', dpi=100)
@ -441,9 +401,9 @@ class MovieLensDataAnalyzer:
plt.figure(figsize=(16, 6))
sns.barplot(x=year_counts.index, y=year_counts.values, color=custom_colors[0])
plt.title('电影发行年份分布' if USE_CHINESE else 'Movie Release Year Distribution')
plt.xlabel('发行年份' if USE_CHINESE else 'Release Year')
plt.ylabel('电影数量' if USE_CHINESE else 'Number of Movies')
plt.title('Movie Release Year Distribution' if USE_ENGLISH else '电影发行年份分布')
plt.xlabel('Release Year' if USE_ENGLISH else '发行年份')
plt.ylabel('Number of Movies' if USE_ENGLISH else '电影数量')
plt.xticks(rotation=90)
plt.tight_layout()
plt.savefig(os.path.join(movie_analysis_dir, 'year_distribution.png'), bbox_inches='tight', dpi=100)
@ -461,9 +421,9 @@ class MovieLensDataAnalyzer:
plt.figure(figsize=(14, 8))
sns.barplot(x=genre_series.values, y=genre_series.index, palette=custom_colors)
plt.title('电影类型分布' if USE_CHINESE else 'Movie Genre Distribution')
plt.xlabel('电影数量' if USE_CHINESE else 'Number of Movies')
plt.ylabel('类型' if USE_CHINESE else 'Genre')
plt.title('Movie Genre Distribution' if USE_ENGLISH else '电影类型分布')
plt.xlabel('Number of Movies' if USE_ENGLISH else '电影数量')
plt.ylabel('Genre' if USE_ENGLISH else '类型')
plt.tight_layout()
plt.savefig(os.path.join(movie_analysis_dir, 'genre_distribution.png'), bbox_inches='tight', dpi=100)
plt.close()
@ -473,9 +433,9 @@ class MovieLensDataAnalyzer:
plt.figure(figsize=(12, 6))
sns.histplot(movie_rating_counts, bins=30, kde=True, color=custom_colors[1])
plt.title('电影评分数量分布' if USE_CHINESE else 'Movie Rating Count Distribution')
plt.xlabel('每部电影的评分数量' if USE_CHINESE else 'Number of Ratings per Movie')
plt.ylabel('电影数' if USE_CHINESE else 'Number of Movies')
plt.title('Movie Rating Count Distribution' if USE_ENGLISH else '电影评分数量分布')
plt.xlabel('Number of Ratings per Movie' if USE_ENGLISH else '每部电影的评分数量')
plt.ylabel('Number of Movies' if USE_ENGLISH else '电影数')
plt.tight_layout()
plt.savefig(os.path.join(movie_analysis_dir, 'movie_rating_counts.png'), bbox_inches='tight', dpi=100)
plt.close()
@ -490,9 +450,9 @@ class MovieLensDataAnalyzer:
plt.figure(figsize=(16, 10))
sns.barplot(y='title', x='rating_count', data=top_movies_df, palette=custom_colors)
plt.title('评分数量最多的20部电影' if USE_CHINESE else 'Top 20 Most Rated Movies')
plt.xlabel('评分数量' if USE_CHINESE else 'Number of Ratings')
plt.ylabel('电影标题' if USE_CHINESE else 'Movie Title')
plt.title('Top 20 Most Rated Movies' if USE_ENGLISH else '评分数量最多的20部电影')
plt.xlabel('Number of Ratings' if USE_ENGLISH else '评分数量')
plt.ylabel('Movie Title' if USE_ENGLISH else '电影标题')
plt.tight_layout()
plt.savefig(os.path.join(movie_analysis_dir, 'most_rated_movies.png'), bbox_inches='tight', dpi=100)
plt.close()
@ -513,9 +473,9 @@ class MovieLensDataAnalyzer:
plt.figure(figsize=(14, 8))
sns.barplot(x=genre_avg_counts.values, y=genre_avg_counts.index, palette=custom_colors)
plt.title('各类型电影的平均评分数量' if USE_CHINESE else 'Average Number of Ratings by Genre')
plt.xlabel('平均评分数量' if USE_CHINESE else 'Average Number of Ratings')
plt.ylabel('电影类型' if USE_CHINESE else 'Movie Genre')
plt.title('Average Number of Ratings by Genre' if USE_ENGLISH else '各类型电影的平均评分数量')
plt.xlabel('Average Number of Ratings' if USE_ENGLISH else '平均评分数量')
plt.ylabel('Movie Genre' if USE_ENGLISH else '电影类型')
plt.tight_layout()
plt.savefig(os.path.join(movie_analysis_dir, 'genre_avg_rating_counts.png'), bbox_inches='tight', dpi=100)
plt.close()
@ -534,9 +494,9 @@ class MovieLensDataAnalyzer:
plt.figure(figsize=(16, 6))
sns.lineplot(x=year_avg_counts.index, y=year_avg_counts.values, marker='o', color=custom_colors[2])
plt.title('不同发行年份电影的平均评分数量' if USE_CHINESE else 'Average Number of Ratings by Release Year')
plt.xlabel('发行年份' if USE_CHINESE else 'Release Year')
plt.ylabel('平均评分数量' if USE_CHINESE else 'Average Number of Ratings')
plt.title('Average Number of Ratings by Release Year' if USE_ENGLISH else '不同发行年份电影的平均评分数量')
plt.xlabel('Release Year' if USE_ENGLISH else '发行年份')
plt.ylabel('Average Number of Ratings' if USE_ENGLISH else '平均评分数量')
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plt.savefig(os.path.join(movie_analysis_dir, 'year_avg_rating_counts.png'), bbox_inches='tight', dpi=100)
@ -559,9 +519,9 @@ class MovieLensDataAnalyzer:
plt.figure(figsize=(12, 6))
rating_counts = self.ratings_df['rating'].value_counts().sort_index()
sns.barplot(x=rating_counts.index, y=rating_counts.values, palette=custom_colors)
plt.title('评分值分布' if USE_CHINESE else 'Rating Value Distribution')
plt.xlabel('评分' if USE_CHINESE else 'Rating')
plt.ylabel('数量' if USE_CHINESE else 'Count')
plt.title('Rating Value Distribution' if USE_ENGLISH else '评分值分布')
plt.xlabel('Rating' if USE_ENGLISH else '评分')
plt.ylabel('Count' if USE_ENGLISH else '数量')
plt.tight_layout()
plt.savefig(os.path.join(rating_analysis_dir, 'rating_distribution.png'), bbox_inches='tight', dpi=100)
plt.close()
@ -577,17 +537,17 @@ class MovieLensDataAnalyzer:
bins=10,
palette=[custom_colors[0], custom_colors[1]]
)
plt.title('原始评分与填补评分的分布对比' if USE_CHINESE else 'Original vs. Filled Ratings Distribution')
plt.xlabel('评分' if USE_CHINESE else 'Rating')
plt.ylabel('数量' if USE_CHINESE else 'Count')
plt.title('Original vs. Filled Ratings Distribution' if USE_ENGLISH else '原始评分与填补评分的分布对比')
plt.xlabel('Rating' if USE_ENGLISH else '评分')
plt.ylabel('Count' if USE_ENGLISH else '数量')
# 修改图例标签
if USE_CHINESE:
labels = ['填补评分', '原始评分']
plt.legend(title='是否原始评分', labels=labels)
else:
if USE_ENGLISH:
labels = ['Filled Ratings', 'Original Ratings']
plt.legend(title='Rating Type', labels=labels)
else:
labels = ['填补评分', '原始评分']
plt.legend(title='是否原始评分', labels=labels)
plt.tight_layout()
plt.savefig(os.path.join(rating_analysis_dir, 'original_vs_filled_ratings.png'), bbox_inches='tight',
@ -603,20 +563,20 @@ class MovieLensDataAnalyzer:
plt.figure(figsize=(14, 8))
ax1 = plt.gca()
ax1.set_xlabel('年份' if USE_CHINESE else 'Year')
ax1.set_ylabel('平均评分' if USE_CHINESE else 'Average Rating', color=custom_colors[0])
ax1.set_xlabel('Year' if USE_ENGLISH else '年份')
ax1.set_ylabel('Average Rating' if USE_ENGLISH else '平均评分', color=custom_colors[0])
ax1.plot(yearly_ratings['year'], yearly_ratings['mean'], marker='o', color=custom_colors[0],
label='平均评分' if USE_CHINESE else 'Average Rating')
label='Average Rating' if USE_ENGLISH else '平均评分')
ax1.tick_params(axis='y', labelcolor=custom_colors[0])
ax1.grid(True, linestyle='--', alpha=0.7)
ax2 = ax1.twinx()
ax2.set_ylabel('评分数量' if USE_CHINESE else 'Number of Ratings', color=custom_colors[1])
ax2.set_ylabel('Number of Ratings' if USE_ENGLISH else '评分数量', color=custom_colors[1])
ax2.plot(yearly_ratings['year'], yearly_ratings['count'], marker='s', color=custom_colors[1],
label='评分数量' if USE_CHINESE else 'Number of Ratings')
label='Number of Ratings' if USE_ENGLISH else '评分数量')
ax2.tick_params(axis='y', labelcolor=custom_colors[1])
plt.title('随时间变化的评分趋势' if USE_CHINESE else 'Rating Trends Over Time')
plt.title('Rating Trends Over Time' if USE_ENGLISH else '随时间变化的评分趋势')
# 添加两个y轴的图例
lines1, labels1 = ax1.get_legend_handles_labels()
@ -659,9 +619,9 @@ class MovieLensDataAnalyzer:
plt.figure(figsize=(14, 8))
sns.barplot(y='genre', x='avg_rating', data=genre_stats, palette=custom_colors)
plt.title('各类型电影的平均评分' if USE_CHINESE else 'Average Rating by Genre')
plt.xlabel('平均评分' if USE_CHINESE else 'Average Rating')
plt.ylabel('电影类型' if USE_CHINESE else 'Movie Genre')
plt.title('Average Rating by Genre' if USE_ENGLISH else '各类型电影的平均评分')
plt.xlabel('Average Rating' if USE_ENGLISH else '平均评分')
plt.ylabel('Movie Genre' if USE_ENGLISH else '电影类型')
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
plt.savefig(os.path.join(rating_analysis_dir, 'genre_avg_ratings.png'), bbox_inches='tight', dpi=100)
@ -679,13 +639,13 @@ class MovieLensDataAnalyzer:
# 在柱状图上添加评分数量标签
for i, (_, row) in enumerate(top_rated_movies.iterrows()):
label_text = f"评分数: {int(row['count'])}" if USE_CHINESE else f"Ratings: {int(row['count'])}"
label_text = f"Ratings: {int(row['count'])}" if USE_ENGLISH else f"评分数: {int(row['count'])}"
bars.text(row['mean'] + 0.05, i, label_text, va='center')
plt.title(
'评分最高的20部电影 (至少有100个评分)' if USE_CHINESE else 'Top 20 Highest Rated Movies (min. 100 ratings)')
plt.xlabel('平均评分' if USE_CHINESE else 'Average Rating')
plt.ylabel('电影标题' if USE_CHINESE else 'Movie Title')
'Top 20 Highest Rated Movies (min. 100 ratings)' if USE_ENGLISH else '评分最高的20部电影 (至少有100个评分)')
plt.xlabel('Average Rating' if USE_ENGLISH else '平均评分')
plt.ylabel('Movie Title' if USE_ENGLISH else '电影标题')
plt.tight_layout()
plt.savefig(os.path.join(rating_analysis_dir, 'top_rated_movies.png'), bbox_inches='tight', dpi=100)
plt.close()
@ -708,20 +668,20 @@ class MovieLensDataAnalyzer:
plt.figure(figsize=(16, 8))
ax1 = plt.gca()
ax1.set_xlabel('发行年份' if USE_CHINESE else 'Release Year')
ax1.set_ylabel('平均评分' if USE_CHINESE else 'Average Rating', color=custom_colors[0])
ax1.set_xlabel('Release Year' if USE_ENGLISH else '发行年份')
ax1.set_ylabel('Average Rating' if USE_ENGLISH else '平均评分', color=custom_colors[0])
ax1.plot(year_avg_ratings.index, year_avg_ratings.values, marker='o', color=custom_colors[0],
label='平均评分' if USE_CHINESE else 'Average Rating')
label='Average Rating' if USE_ENGLISH else '平均评分')
ax1.tick_params(axis='y', labelcolor=custom_colors[0])
ax1.grid(True, linestyle='--', alpha=0.7)
ax2 = ax1.twinx()
ax2.set_ylabel('评分数量' if USE_CHINESE else 'Number of Ratings', color=custom_colors[1])
ax2.set_ylabel('Number of Ratings' if USE_ENGLISH else '评分数量', color=custom_colors[1])
ax2.plot(year_rating_counts.index, year_rating_counts.values, marker='s', color=custom_colors[1],
label='评分数量' if USE_CHINESE else 'Number of Ratings')
label='Number of Ratings' if USE_ENGLISH else '评分数量')
ax2.tick_params(axis='y', labelcolor=custom_colors[1])
plt.title('不同发行年份电影的平均评分' if USE_CHINESE else 'Average Rating by Release Year')
plt.title('Average Rating by Release Year' if USE_ENGLISH else '不同发行年份电影的平均评分')
# 添加两个y轴的图例
lines1, labels1 = ax1.get_legend_handles_labels()
@ -752,8 +712,8 @@ class MovieLensDataAnalyzer:
try:
gender_genre_preferences = self._analyze_group_genre_preference(user_ratings, 'gender')
# 为每个性别绘制前10个最喜爱的类型
gender_labels = {'M': '男性' if USE_CHINESE else 'Male',
'F': '女性' if USE_CHINESE else 'Female'}
gender_labels = {'M': 'Male' if USE_ENGLISH else '男性',
'F': 'Female' if USE_ENGLISH else '女性'}
for gender in gender_genre_preferences.keys():
gender_preferences = gender_genre_preferences[gender].sort_values(ascending=False).head(10)
@ -762,14 +722,14 @@ class MovieLensDataAnalyzer:
sns.barplot(x=gender_preferences.index, y=gender_preferences.values, palette=custom_colors)
# 设置本地化标题
if USE_CHINESE:
title = f"{gender_labels[gender]}最喜爱的电影类型"
else:
if USE_ENGLISH:
title = f"Most Favorite Movie Genres for {gender_labels[gender]}"
else:
title = f"{gender_labels[gender]}最喜爱的电影类型"
plt.title(title)
plt.xlabel('电影类型' if USE_CHINESE else 'Movie Genre')
plt.ylabel('平均评分' if USE_CHINESE else 'Average Rating')
plt.xlabel('Movie Genre' if USE_ENGLISH else '电影类型')
plt.ylabel('Average Rating' if USE_ENGLISH else '平均评分')
plt.ylim(3.0, 4.5) # 设定Y轴范围使得差异更明显
plt.xticks(rotation=45)
plt.tight_layout()
@ -791,13 +751,10 @@ class MovieLensDataAnalyzer:
gender_heatmap_data = gender_heatmap_data.sort_values('M', ascending=False)
# Rename the gender columns by key so the labels do not depend on column order.
gender_heatmap_data = gender_heatmap_data.rename(columns={
    'M': 'Male' if USE_ENGLISH else '男性',
    'F': 'Female' if USE_ENGLISH else '女性',
})
sns.heatmap(gender_heatmap_data, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('不同性别的电影类型偏好对比' if USE_CHINESE else 'Movie Genre Preferences by Gender')
plt.title('Movie Genre Preferences by Gender' if USE_ENGLISH else '不同性别的电影类型偏好对比')
plt.tight_layout()
plt.savefig(os.path.join(preference_analysis_dir, 'gender_genre_heatmap.png'), bbox_inches='tight', dpi=100)
plt.close()
@ -816,14 +773,14 @@ class MovieLensDataAnalyzer:
sns.barplot(x=age_preferences.index, y=age_preferences.values, palette=custom_colors)
# 设置本地化标题
if USE_CHINESE:
title = f"{age_group}年龄段最喜爱的电影类型"
else:
if USE_ENGLISH:
title = f"Most Favorite Movie Genres for Age Group {age_group}"
else:
title = f"{age_group}年龄段最喜爱的电影类型"
plt.title(title)
plt.xlabel('电影类型' if USE_CHINESE else 'Movie Genre')
plt.ylabel('平均评分' if USE_CHINESE else 'Average Rating')
plt.xlabel('Movie Genre' if USE_ENGLISH else '电影类型')
plt.ylabel('Average Rating' if USE_ENGLISH else '平均评分')
plt.ylim(3.2, 4.2) # 设定Y轴范围使得差异更明显
plt.xticks(rotation=45)
plt.tight_layout()
@ -842,12 +799,12 @@ class MovieLensDataAnalyzer:
age_heatmap_data = age_heatmap_data.loc[common_genres]
# 按总体平均评分降序排列
age_heatmap_data['总体' if USE_CHINESE else 'Overall'] = age_heatmap_data.mean(axis=1)
age_heatmap_data = age_heatmap_data.sort_values('总体' if USE_CHINESE else 'Overall', ascending=False)
age_heatmap_data = age_heatmap_data.drop('总体' if USE_CHINESE else 'Overall', axis=1)
age_heatmap_data['Overall'] = age_heatmap_data.mean(axis=1)
age_heatmap_data = age_heatmap_data.sort_values('Overall', ascending=False)
age_heatmap_data = age_heatmap_data.drop('Overall', axis=1)
sns.heatmap(age_heatmap_data, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('不同年龄段的电影类型偏好对比' if USE_CHINESE else 'Movie Genre Preferences by Age Group')
plt.title('Movie Genre Preferences by Age Group' if USE_ENGLISH else '不同年龄段的电影类型偏好对比')
plt.tight_layout()
plt.savefig(os.path.join(preference_analysis_dir, 'age_genre_heatmap.png'), bbox_inches='tight', dpi=100)
plt.close()
@ -860,12 +817,12 @@ class MovieLensDataAnalyzer:
# 选择几个代表性职业
selected_occupations = []
if USE_CHINESE:
selected_occupations = ['程序员', '学术/教育工作者', '大学生/研究生', '艺术家', '行政/事务人员',
'退休人员', '失业人员']
if USE_ENGLISH:
selected_occupations = ['Programmer', 'Academic/Educator', 'College/Grad Student',
'Artist', 'Executive/Managerial', 'Retired', 'Unemployed']
else:
selected_occupations = ['Programmer', 'Academic/Educator', 'College/Grad Student', 'Artist',
'Executive/Managerial', 'Retired', 'Unemployed']
selected_occupations = ['程序员', '学术/教育工作者', '大学生/研究生',
'艺术家', '行政/事务人员', '退休人员', '失业人员']
selected_occupations = [occ for occ in selected_occupations if occ in occupation_genre_preferences]
@ -878,14 +835,14 @@ class MovieLensDataAnalyzer:
sns.barplot(x=occ_preferences.index, y=occ_preferences.values, palette=custom_colors)
# 设置本地化标题
if USE_CHINESE:
title = f"{occupation}最喜爱的电影类型"
else:
if USE_ENGLISH:
title = f"Most Favorite Movie Genres for {occupation}"
else:
title = f"{occupation}最喜爱的电影类型"
plt.title(title)
plt.xlabel('电影类型' if USE_CHINESE else 'Movie Genre')
plt.ylabel('平均评分' if USE_CHINESE else 'Average Rating')
plt.xlabel('Movie Genre' if USE_ENGLISH else '电影类型')
plt.ylabel('Average Rating' if USE_ENGLISH else '平均评分')
plt.ylim(3.2, 4.2) # 设定Y轴范围使得差异更明显
plt.xticks(rotation=45)
plt.tight_layout()
@ -907,12 +864,12 @@ class MovieLensDataAnalyzer:
occ_heatmap_data = occ_heatmap_data.loc[common_genres]
# 按总体平均评分降序排列
occ_heatmap_data['总体' if USE_CHINESE else 'Overall'] = occ_heatmap_data.mean(axis=1)
occ_heatmap_data = occ_heatmap_data.sort_values('总体' if USE_CHINESE else 'Overall', ascending=False)
occ_heatmap_data = occ_heatmap_data.drop('总体' if USE_CHINESE else 'Overall', axis=1)
occ_heatmap_data['Overall'] = occ_heatmap_data.mean(axis=1)
occ_heatmap_data = occ_heatmap_data.sort_values('Overall', ascending=False)
occ_heatmap_data = occ_heatmap_data.drop('Overall', axis=1)
sns.heatmap(occ_heatmap_data, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('不同职业的电影类型偏好对比' if USE_CHINESE else 'Movie Genre Preferences by Occupation')
plt.title('Movie Genre Preferences by Occupation' if USE_ENGLISH else '不同职业的电影类型偏好对比')
plt.tight_layout()
plt.savefig(os.path.join(preference_analysis_dir, 'occupation_genre_heatmap.png'), bbox_inches='tight',
dpi=100)
@ -993,7 +950,7 @@ class MovieLensDataAnalyzer:
# 重新提取年份
self.movies_df['year'] = self.movies_df['title'].apply(
lambda x: int(x[-5:-1]) if (
len(x) > 5 and x[-1] == ')' and x[-6] == '(' and x[-5:-1].isdigit()) else None
len(x) > 5 and x[-1] == ')' and x[-6] == '(' and x[-5:-1].isdigit()) else None
)
# 2. 检查是否有足够的年份数据
@ -1046,9 +1003,9 @@ class MovieLensDataAnalyzer:
plt.figure(figsize=(16, 10))
sns.heatmap(pivot_data, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title(
'不同年龄段对不同年代电影的偏好' if USE_CHINESE else 'Preferences for Movies by Decade Across Age Groups')
plt.xlabel('电影发行年代' if USE_CHINESE else 'Movie Release Decade')
plt.ylabel('用户年龄段' if USE_CHINESE else 'User Age Group')
'Preferences for Movies by Decade Across Age Groups' if USE_ENGLISH else '不同年龄段对不同年代电影的偏好')
plt.xlabel('Movie Release Decade' if USE_ENGLISH else '电影发行年代')
plt.ylabel('User Age Group' if USE_ENGLISH else '用户年龄段')
plt.tight_layout()
# 8. 保存图表
@ -1105,7 +1062,7 @@ class MovieLensDataAnalyzer:
heatmap_data.loc[age, '1990s'] += 0.3
elif "18" in str(age): # 年轻用户
heatmap_data.loc[age, '2000s'] += 0.3
elif "Under" in str(age) or "以下" in str(age): # 未成年用户
elif "Under" in str(age) or str(age).startswith("18岁"): # 未成年用户
heatmap_data.loc[age, '1990s'] += 0.2
heatmap_data.loc[age, '2000s'] += 0.4
@ -1113,9 +1070,9 @@ class MovieLensDataAnalyzer:
plt.figure(figsize=(16, 10))
sns.heatmap(heatmap_data, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title(
'不同年龄段对不同年代电影的偏好 (模拟数据)' if USE_CHINESE else 'Preferences for Movies by Decade Across Age Groups (Simulated Data)')
plt.xlabel('电影发行年代' if USE_CHINESE else 'Movie Release Decade')
plt.ylabel('用户年龄段' if USE_CHINESE else 'User Age Group')
'Preferences for Movies by Decade Across Age Groups (Simulated Data)' if USE_ENGLISH else '不同年龄段对不同年代电影的偏好 (模拟数据)')
plt.xlabel('Movie Release Decade' if USE_ENGLISH else '电影发行年代')
plt.ylabel('User Age Group' if USE_ENGLISH else '用户年龄段')
plt.tight_layout()
# 6. 保存图表
@ -1147,13 +1104,11 @@ class MovieLensDataAnalyzer:
ax = sns.boxplot(x='gender', y='avg_rating', data=user_stats, palette=[custom_colors[0], custom_colors[1]])
# 修改x轴标签
if USE_CHINESE:
gender_labels = {'M': '男性', 'F': '女性'}
ax.set_xticklabels([gender_labels[tick.get_text()] for tick in ax.get_xticklabels()])
ax.set_xticklabels(['Male', 'Female']) # 强制使用英文标签确保显示
plt.title('性别与平均评分的关系' if USE_CHINESE else 'Average Rating by Gender')
plt.xlabel('性别' if USE_CHINESE else 'Gender')
plt.ylabel('平均评分' if USE_CHINESE else 'Average Rating')
plt.title('Average Rating by Gender' if USE_ENGLISH else '性别与平均评分的关系')
plt.xlabel('Gender' if USE_ENGLISH else '性别')
plt.ylabel('Average Rating' if USE_ENGLISH else '平均评分')
# 添加统计检验结果
male_ratings = user_stats[user_stats['gender'] == 'M']['avg_rating']
@ -1161,7 +1116,7 @@ class MovieLensDataAnalyzer:
if len(male_ratings) > 0 and len(female_ratings) > 0:
t_stat, p_value = stats.ttest_ind(male_ratings, female_ratings)
annotation = f'T检验: p={p_value:.4f}' if USE_CHINESE else f'T-test: p={p_value:.4f}'
annotation = f'T-test: p={p_value:.4f}' if USE_ENGLISH else f'T检验: p={p_value:.4f}'
plt.annotate(annotation, xy=(0.5, 0.05), xycoords='axes fraction')
plt.tight_layout()
@ -1171,9 +1126,9 @@ class MovieLensDataAnalyzer:
# 2. 年龄组与平均评分的关系
plt.figure(figsize=(14, 6))
sns.boxplot(x='age_group', y='avg_rating', data=user_stats, palette=custom_colors)
plt.title('年龄组与平均评分的关系' if USE_CHINESE else 'Average Rating by Age Group')
plt.xlabel('年龄组' if USE_CHINESE else 'Age Group')
plt.ylabel('平均评分' if USE_CHINESE else 'Average Rating')
plt.title('Average Rating by Age Group' if USE_ENGLISH else '年龄组与平均评分的关系')
plt.xlabel('Age Group' if USE_ENGLISH else '年龄组')
plt.ylabel('Average Rating' if USE_ENGLISH else '平均评分')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig(os.path.join(output_dir, 'age_avg_rating.png'), bbox_inches='tight', dpi=100)
@ -1184,13 +1139,11 @@ class MovieLensDataAnalyzer:
ax = sns.boxplot(x='gender', y='rating_std', data=user_stats, palette=[custom_colors[0], custom_colors[1]])
# 修改x轴标签
if USE_CHINESE:
gender_labels = {'M': '男性', 'F': '女性'}
ax.set_xticklabels([gender_labels[tick.get_text()] for tick in ax.get_xticklabels()])
ax.set_xticklabels(['Male', 'Female']) # 强制使用英文标签确保显示
plt.title('性别与评分标准差的关系' if USE_CHINESE else 'Rating Standard Deviation by Gender')
plt.xlabel('性别' if USE_CHINESE else 'Gender')
plt.ylabel('评分标准差' if USE_CHINESE else 'Rating Standard Deviation')
plt.title('Rating Standard Deviation by Gender' if USE_ENGLISH else '性别与评分标准差的关系')
plt.xlabel('Gender' if USE_ENGLISH else '性别')
plt.ylabel('Rating Standard Deviation' if USE_ENGLISH else '评分标准差')
plt.tight_layout()
plt.savefig(os.path.join(output_dir, 'gender_rating_std.png'), bbox_inches='tight', dpi=100)
plt.close()
@ -1198,9 +1151,9 @@ class MovieLensDataAnalyzer:
# 4. 年龄组与评分标准差的关系
plt.figure(figsize=(14, 6))
sns.boxplot(x='age_group', y='rating_std', data=user_stats, palette=custom_colors)
plt.title('年龄组与评分标准差的关系' if USE_CHINESE else 'Rating Standard Deviation by Age Group')
plt.xlabel('年龄组' if USE_CHINESE else 'Age Group')
plt.ylabel('评分标准差' if USE_CHINESE else 'Rating Standard Deviation')
plt.title('Rating Standard Deviation by Age Group' if USE_ENGLISH else '年龄组与评分标准差的关系')
plt.xlabel('Age Group' if USE_ENGLISH else '年龄组')
plt.ylabel('Rating Standard Deviation' if USE_ENGLISH else '评分标准差')
plt.xticks(rotation=45)
plt.tight_layout()
plt.savefig(os.path.join(output_dir, 'age_rating_std.png'), bbox_inches='tight', dpi=100)
@ -1209,9 +1162,9 @@ class MovieLensDataAnalyzer:
# 5. 所有职业的平均评分对比
plt.figure(figsize=(16, 10))
sns.boxplot(x='occupation_name', y='avg_rating', data=user_stats, palette=custom_colors)
plt.title('职业与平均评分的关系' if USE_CHINESE else 'Average Rating by Occupation')
plt.xlabel('职业' if USE_CHINESE else 'Occupation')
plt.ylabel('平均评分' if USE_CHINESE else 'Average Rating')
plt.title('Average Rating by Occupation' if USE_ENGLISH else '职业与平均评分的关系')
plt.xlabel('Occupation' if USE_ENGLISH else '职业')
plt.ylabel('Average Rating' if USE_ENGLISH else '平均评分')
plt.xticks(rotation=90)
plt.tight_layout()
plt.savefig(os.path.join(output_dir, 'occupation_avg_rating.png'), bbox_inches='tight', dpi=100)
@ -1221,18 +1174,14 @@ class MovieLensDataAnalyzer:
plt.figure(figsize=(12, 6))
# 为性别标签添加本地化
if USE_CHINESE:
gender_mapping = {'M': '男性', 'F': '女性'}
user_stats['gender_label'] = user_stats['gender'].map(gender_mapping)
hue_column = 'gender_label'
else:
hue_column = 'gender'
gender_mapping = {'M': 'Male', 'F': 'Female'}
user_stats['gender_label'] = user_stats['gender'].map(gender_mapping)
sns.scatterplot(x='rating_count', y='avg_rating', hue=hue_column, data=user_stats,
sns.scatterplot(x='rating_count', y='avg_rating', hue='gender_label', data=user_stats,
palette=[custom_colors[0], custom_colors[1]])
plt.title('评分数量与平均评分的关系' if USE_CHINESE else 'Relationship Between Rating Count and Average Rating')
plt.xlabel('评分数量' if USE_CHINESE else 'Number of Ratings')
plt.ylabel('平均评分' if USE_CHINESE else 'Average Rating')
plt.title('Relationship Between Rating Count and Average Rating' if USE_ENGLISH else '评分数量与平均评分的关系')
plt.xlabel('Number of Ratings' if USE_ENGLISH else '评分数量')
plt.ylabel('Average Rating' if USE_ENGLISH else '平均评分')
plt.xscale('log') # 使用对数刻度更好地展示分布
plt.grid(True, linestyle='--', alpha=0.7)
plt.tight_layout()
@ -1245,20 +1194,20 @@ class MovieLensDataAnalyzer:
# 创建分析摘要
summary = {
"数据概览" if USE_CHINESE else "Data Overview": {
"用户数量" if USE_CHINESE else "Number of Users": len(self.users_df),
"电影数量" if USE_CHINESE else "Number of Movies": len(self.movies_df),
"原始评分数量" if USE_CHINESE else "Original Ratings Count": len(self.ratings_df),
"填补后评分数量" if USE_CHINESE else "Filled Ratings Count": len(self.filled_ratings_df) if hasattr(
"Data Overview" if USE_ENGLISH else "数据概览": {
"Number of Users" if USE_ENGLISH else "用户数量": len(self.users_df),
"Number of Movies" if USE_ENGLISH else "电影数量": len(self.movies_df),
"Original Ratings Count" if USE_ENGLISH else "原始评分数量": len(self.ratings_df),
"Filled Ratings Count" if USE_ENGLISH else "填补后评分数量": len(self.filled_ratings_df) if hasattr(
self, 'filled_ratings_df') else "未使用填补数据",
},
"用户分析" if USE_CHINESE else "User Analysis": {
"性别分布" if USE_CHINESE else "Gender Distribution": self.users_df['gender'].value_counts().to_dict(),
"年龄分布" if USE_CHINESE else "Age Distribution": self.users_df['age_group'].value_counts().to_dict(),
"User Analysis" if USE_ENGLISH else "用户分析": {
"Gender Distribution" if USE_ENGLISH else "性别分布": self.users_df['gender'].value_counts().to_dict(),
"Age Distribution" if USE_ENGLISH else "年龄分布": self.users_df['age_group'].value_counts().to_dict(),
},
"评分分析" if USE_CHINESE else "Rating Analysis": {
"平均评分" if USE_CHINESE else "Average Rating": round(self.ratings_df['rating'].mean(), 2),
"评分分布" if USE_CHINESE else "Rating Distribution": self.ratings_df[
"Rating Analysis" if USE_ENGLISH else "评分分析": {
"Average Rating" if USE_ENGLISH else "平均评分": round(self.ratings_df['rating'].mean(), 2),
"Rating Distribution" if USE_ENGLISH else "评分分布": self.ratings_df[
'rating'].value_counts().sort_index().to_dict()
}
}
@ -1268,7 +1217,7 @@ class MovieLensDataAnalyzer:
json.dump(summary, f, ensure_ascii=False, indent=4)
# 创建HTML报告
html_title = 'MovieLens数据集分析报告' if USE_CHINESE else 'MovieLens Dataset Analysis Report'
html_title = 'MovieLens Dataset Analysis Report' if USE_ENGLISH else 'MovieLens数据集分析报告'
html_report = f"""
<!DOCTYPE html>
<html>
@ -1294,90 +1243,90 @@ class MovieLensDataAnalyzer:
</head>
<body>
<div class="container">
<h1>{"MovieLens数据集用户-电影偏好分析报告" if USE_CHINESE else "MovieLens Dataset User-Movie Preference Analysis Report"}</h1>
<h1>{"MovieLens Dataset User-Movie Preference Analysis Report" if USE_ENGLISH else "MovieLens数据集用户-电影偏好分析报告"}</h1>
<div class="summary">
<h2>{"数据概览" if USE_CHINESE else "Data Overview"}</h2>
<p>{"本分析基于MovieLens数据集包含" if USE_CHINESE else "This analysis is based on the MovieLens dataset, containing"} {len(self.users_df)} {"位用户" if USE_CHINESE else "users"}{len(self.movies_df)} {"部电影" if USE_CHINESE else "movies"} {"" if USE_CHINESE else "and"} {len(self.ratings_df)} {"条原始评分记录" if USE_CHINESE else "original rating records"}</p>
<h2>{"Data Overview" if USE_ENGLISH else "数据概览"}</h2>
<p>{"This analysis is based on the MovieLens dataset, containing" if USE_ENGLISH else "本分析基于MovieLens数据集包含"} {len(self.users_df)} {"users" if USE_ENGLISH else "位用户"}{len(self.movies_df)} {"movies" if USE_ENGLISH else "部电影"} {"and" if USE_ENGLISH else ""} {len(self.ratings_df)} {"original rating records" if USE_ENGLISH else "条原始评分记录"}</p>
</div>
<h2>{"用户基本情况分析" if USE_CHINESE else "User Profile Analysis"}</h2>
<h2>{"User Profile Analysis" if USE_ENGLISH else "用户基本情况分析"}</h2>
<div class="figure">
<img src="user_analysis/gender_distribution.png" alt="用户性别分布">
<p class="caption">{"用户性别分布" if USE_CHINESE else "User Gender Distribution"}</p>
<img src="user_analysis/gender_distribution.png" alt="User Gender Distribution">
<p class="caption">{"User Gender Distribution" if USE_ENGLISH else "用户性别分布"}</p>
</div>
<div class="figure">
<img src="user_analysis/age_distribution.png" alt="用户年龄分布">
<p class="caption">{"用户年龄分布" if USE_CHINESE else "User Age Distribution"}</p>
<img src="user_analysis/age_distribution.png" alt="User Age Distribution">
<p class="caption">{"User Age Distribution" if USE_ENGLISH else "用户年龄分布"}</p>
</div>
<div class="figure">
<img src="user_analysis/occupation_distribution.png" alt="用户职业分布">
<p class="caption">{"用户职业分布" if USE_CHINESE else "User Occupation Distribution"}</p>
<img src="user_analysis/occupation_distribution.png" alt="User Occupation Distribution">
<p class="caption">{"User Occupation Distribution" if USE_ENGLISH else "用户职业分布"}</p>
</div>
<h2>{"电影分布情况分析" if USE_CHINESE else "Movie Distribution Analysis"}</h2>
<h2>{"Movie Distribution Analysis" if USE_ENGLISH else "电影分布情况分析"}</h2>
<div class="figure">
<img src="movie_analysis/genre_distribution.png" alt="电影类型分布">
<p class="caption">{"电影类型分布" if USE_CHINESE else "Movie Genre Distribution"}</p>
<img src="movie_analysis/genre_distribution.png" alt="Movie Genre Distribution">
<p class="caption">{"Movie Genre Distribution" if USE_ENGLISH else "电影类型分布"}</p>
</div>
<div class="figure">
<img src="movie_analysis/year_distribution.png" alt="电影发行年份分布">
<p class="caption">{"电影发行年份分布" if USE_CHINESE else "Movie Release Year Distribution"}</p>
<img src="movie_analysis/year_distribution.png" alt="Movie Release Year Distribution">
<p class="caption">{"Movie Release Year Distribution" if USE_ENGLISH else "电影发行年份分布"}</p>
</div>
<div class="figure">
<img src="movie_analysis/most_rated_movies.png" alt="评分数量最多的电影">
<p class="caption">{"评分数量最多的20部电影" if USE_CHINESE else "Top 20 Most Rated Movies"}</p>
<img src="movie_analysis/most_rated_movies.png" alt="Top 20 Most Rated Movies">
<p class="caption">{"Top 20 Most Rated Movies" if USE_ENGLISH else "评分数量最多的20部电影"}</p>
</div>
<h2>{"评分分布情况分析" if USE_CHINESE else "Rating Distribution Analysis"}</h2>
<h2>{"Rating Distribution Analysis" if USE_ENGLISH else "评分分布情况分析"}</h2>
<div class="figure">
<img src="rating_analysis/rating_distribution.png" alt="评分分布">
<p class="caption">{"评分分布情况" if USE_CHINESE else "Rating Distribution"}</p>
<img src="rating_analysis/rating_distribution.png" alt="Rating Distribution">
<p class="caption">{"Rating Distribution" if USE_ENGLISH else "评分分布情况"}</p>
</div>
<div class="figure">
<img src="rating_analysis/genre_avg_ratings.png" alt="各类型电影的平均评分">
<p class="caption">{"各类型电影的平均评分" if USE_CHINESE else "Average Rating by Movie Genre"}</p>
<img src="rating_analysis/genre_avg_ratings.png" alt="Average Rating by Movie Genre">
<p class="caption">{"Average Rating by Movie Genre" if USE_ENGLISH else "各类型电影的平均评分"}</p>
</div>
<div class="figure">
<img src="rating_analysis/top_rated_movies.png" alt="评分最高的电影">
<p class="caption">{"评分最高的20部电影至少有100个评分" if USE_CHINESE else "Top 20 Highest Rated Movies (min. 100 ratings)"}</p>
<img src="rating_analysis/top_rated_movies.png" alt="Top 20 Highest Rated Movies">
<p class="caption">{"Top 20 Highest Rated Movies (min. 100 ratings)" if USE_ENGLISH else "评分最高的20部电影至少有100个评分"}</p>
</div>
<h2>{"用户特征与电影偏好分析" if USE_CHINESE else "User Characteristics and Movie Preferences"}</h2>
<h2>{"User Characteristics and Movie Preferences" if USE_ENGLISH else "用户特征与电影偏好分析"}</h2>
<div class="figure">
<img src="preference_analysis/gender_genre_heatmap.png" alt="不同性别的电影类型偏好">
<p class="caption">{"不同性别的电影类型偏好对比" if USE_CHINESE else "Movie Genre Preferences by Gender"}</p>
<img src="preference_analysis/gender_genre_heatmap.png" alt="Movie Genre Preferences by Gender">
<p class="caption">{"Movie Genre Preferences by Gender" if USE_ENGLISH else "不同性别的电影类型偏好对比"}</p>
</div>
<div class="figure">
<img src="preference_analysis/age_genre_heatmap.png" alt="不同年龄段的电影类型偏好">
<p class="caption">{"不同年龄段的电影类型偏好对比" if USE_CHINESE else "Movie Genre Preferences by Age Group"}</p>
<img src="preference_analysis/age_genre_heatmap.png" alt="Movie Genre Preferences by Age Group">
<p class="caption">{"Movie Genre Preferences by Age Group" if USE_ENGLISH else "不同年龄段的电影类型偏好对比"}</p>
</div>
<div class="figure">
<img src="preference_analysis/age_year_heatmap.png" alt="不同年龄段对不同年代电影的偏好">
<p class="caption">{"不同年龄段对不同年代电影的偏好" if USE_CHINESE else "Preferences for Movies by Decade Across Age Groups"}</p>
<img src="preference_analysis/age_year_heatmap.png" alt="Preferences for Movies by Decade Across Age Groups">
<p class="caption">{"Preferences for Movies by Decade Across Age Groups" if USE_ENGLISH else "不同年龄段对不同年代电影的偏好"}</p>
</div>
<div class="figure">
<img src="preference_analysis/gender_avg_rating.png" alt="性别与平均评分的关系">
<p class="caption">{"性别与平均评分的关系" if USE_CHINESE else "Relationship Between Gender and Average Rating"}</p>
<img src="preference_analysis/gender_avg_rating.png" alt="Average Rating by Gender">
<p class="caption">{"Average Rating by Gender" if USE_ENGLISH else "性别与平均评分的关系"}</p>
</div>
<h2>{"结论与洞察" if USE_CHINESE else "Conclusions and Insights"}</h2>
<p>{"通过对MovieLens数据集的深入分析我们发现了用户特征如性别、年龄、职业与电影偏好之间存在显著关联。主要结论包括" if USE_CHINESE else "Through in-depth analysis of the MovieLens dataset, we found significant correlations between user characteristics (gender, age, occupation) and movie preferences. Key findings include:"}</p>
<h2>{"Conclusions and Insights" if USE_ENGLISH else "结论与洞察"}</h2>
<p>{"Through in-depth analysis of the MovieLens dataset, we found significant correlations between user characteristics (gender, age, occupation) and movie preferences. Key findings include:" if USE_ENGLISH else "通过对MovieLens数据集的深入分析我们发现了用户特征如性别、年龄、职业与电影偏好之间存在显著关联。主要结论包括"}</p>
<ul>
<li>{"不同性别用户在电影类型偏好上存在明显差异" if USE_CHINESE else "Significant differences in movie genre preferences between genders"}</li>
<li>{"年龄因素会影响用户对不同年代电影的评价" if USE_CHINESE else "Age influences how users rate movies from different decades"}</li>
<li>{"职业背景与电影类型偏好具有相关性" if USE_CHINESE else "Occupational background correlates with genre preferences"}</li>
<li>{"Significant differences in movie genre preferences between genders" if USE_ENGLISH else "不同性别用户在电影类型偏好上存在明显差异"}</li>
<li>{"Age influences how users rate movies from different decades" if USE_ENGLISH else "年龄因素会影响用户对不同年代电影的评价"}</li>
<li>{"Occupational background correlates with genre preferences" if USE_ENGLISH else "职业背景与电影类型偏好具有相关性"}</li>
</ul>
<p>{"这些发现对于电影推荐系统的设计和电影营销策略制定具有重要参考价值。" if USE_CHINESE else "These findings provide valuable reference for designing movie recommendation systems and developing movie marketing strategies."}</p>
<p>{"These findings provide valuable reference for designing movie recommendation systems and developing movie marketing strategies." if USE_ENGLISH else "这些发现对于电影推荐系统的设计和电影营销策略制定具有重要参考价值。"}</p>
</div>
</body>
</html>