当前位置：首页 > news >正文

【第8章数据分析基础】让AI帮你可视化一个数据集

news 2026/6/10 19:34:56

课本中的源码 http://qr.cmpedu.com/CmpBookResource/download_resource.do?id=179708

Step1. 问AI寻找一个自己想要可视化的数据集，比如北京大学处理的1998年《人民日报》文章；其他数据集可以先下载，再上传到 Bohrium 的新建数据集中；

例子数据集在云盘中：https://cloud.tsinghua.edu.cn/d/91644dfb276045f8a9c0/

Step2. 问AI怎么导入这个数据集；

Step3. 问AI怎么可视化这个数据集；

Step4. 让AI详细解释它写的每一行代码是什么意思，强调你是初学者，并让它告诉你可以个性化修改哪些地方；

Step5. 尝试个性化修改AI写出的代码。

import pandas as pd
import jieba
from collections import Counter
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import os
from matplotlib.font_manager import FontProperties, fontManager# 过滤掉字体相关的警告
import warnings
import matplotlib as mpl# 在导入 matplotlib.pyplot 之前设置警告过滤
# Suppress UserWarning messages issued from matplotlib modules (the script's
# stated intent is to hide font-related warnings).
warnings.filterwarnings("ignore", category=UserWarning, module="matplotlib")

# Location of the TrueType font file used to render Chinese text.
font_path = "/bohr/PeopleDaily1998-ojni/v4/Alibaba-PuHuiTi-Medium.ttf"

# Fail early with an explicit message when the font file is missing.
if not os.path.exists(font_path):
    raise FileNotFoundError(
        f"The font file was not found at the specified path: {font_path}"
    )

# Register the font with matplotlib and look up the family name it exposes.
fontManager.addfont(font_path)
font_prop = FontProperties(fname=font_path)
font_name = font_prop.get_name()
print(f"Registered font name: {font_name}")

# Make the registered font the global default family.
mpl.rcParams['font.family'] = font_name

# Read the file
with open(
    '/bohr/PeopleDaily1998-ojni/v4/199801/199801.txt',
    mode='r',
    encoding='utf-8',
) as file:
    # Load the whole file into memory as a single string.
    text = file.read()

# Define a function to check whether a string contains only Chinese characters
def is_chinese(word):
    """Return True when every character of *word* is in U+4E00..U+9FFF.

    The range checked is the CJK Unified Ideographs block. An empty string
    yields True because there is no character that fails the check.
    """
    for ch in word:
        # Reject as soon as one character falls outside the range.
        if ch < '\u4e00' or ch > '\u9fff':
            return False
    return True


# Tokenize and filter
words = []
# str.split() with no arguments splits on every kind of whitespace, newlines
# included, so a single pass over the text visits each annotated token.
for token in text.split():
    # Tokens are expected to look like "word/tag"; skip anything without a slash.
    surface, slash, _tag = token.partition('/')
    if not slash:
        continue
    # NOTE(review): PKU-style corpora mark compound entities as
    # "[word/tag word/tag]tag". Strip the opening bracket so the first word of
    # such a group is not rejected by is_chinese(). Confirm against the data.
    surface = surface.lstrip('[')
    # Keep only multi-character words made purely of Chinese characters.
    if len(surface) > 1 and is_chinese(surface):
        words.append(surface)

# Count word frequencies
word_counts = Counter(words)

# Build the table directly from the 100 most frequent words. most_common()
# returns (word, count) pairs already ordered by descending count, so no
# separate sort / reset_index / head(100) pass is needed.
df = pd.DataFrame(word_counts.most_common(100), columns=['word', 'count'])
print(df.head())

# Create a 2x2 grid of subplots
fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(20, 20))
# Overall title for the figure, drawn with the Chinese-capable font.
fig.suptitle(
    '1998年人民日报词频分析',
    fontsize=16,
    fontproperties=font_prop,
)

# 1. Word cloud
# Build the word cloud image from the full word-frequency table, rendering
# glyphs with the Chinese-capable font file.
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    font_path=font_path,
    max_font_size=100,
)
wordcloud.generate_from_frequencies(word_counts)

# Top-left panel: show the image without axis decorations.
cloud_ax = axs[0, 0]
cloud_ax.imshow(wordcloud, interpolation='bilinear')
cloud_ax.axis('off')
cloud_ax.set_title('词云图', fontproperties=font_prop)

# 2. Bar chart - show the top 10 most frequent words
bar_ax = axs[0, 1]
top_words = df['word'][:10]
top_counts = df['count'][:10]

bar_ax.bar(top_words, top_counts)
bar_ax.set_title('前10个高频词', fontproperties=font_prop)
bar_ax.set_xlabel('词语', fontproperties=font_prop)
bar_ax.set_ylabel('出现次数', fontproperties=font_prop)
bar_ax.tick_params(axis='x', rotation=45)

# The tick labels are the words themselves, so each gets the Chinese font.
for tick_label in bar_ax.get_xticklabels():
    tick_label.set_fontproperties(font_prop)

# Write each count just above its bar.
for position, count in enumerate(top_counts):
    bar_ax.text(
        position,
        count,
        str(count),
        ha='center',
        va='bottom',
        fontproperties=font_prop,
    )

# 3. Pie chart - show the share of each of the top 5 most frequent words
pie_ax = axs[1, 0]
wedges, texts, autotexts = pie_ax.pie(
    df['count'][:5],
    labels=df['word'][:5],
    autopct='%1.1f%%',
    startangle=90,
)
pie_ax.set_title('前5个高频词的比例', fontproperties=font_prop)

# Apply the Chinese font to both the wedge labels and the percentage labels.
# The loop variable is deliberately not named `text`: that name already holds
# the corpus string at module level, and overwriting it would break a re-run
# of the tokenizing step (e.g. in a notebook).
for pie_label in texts + autotexts:
    pie_label.set_fontproperties(font_prop)

# Equal aspect ratio so the pie is drawn as a circle.
pie_ax.axis('equal')

# 4. Line chart - show the frequency trend of the top 20 most frequent words
line_ax = axs[1, 1]

# Take at most 20 rows and derive the rank axis from the number of rows
# actually available, so a small dataset with fewer than 20 distinct words
# does not cause an x/y or tick/label length mismatch.
top_ranked = df.head(20)
ranks = range(1, len(top_ranked) + 1)

line_ax.plot(ranks, top_ranked['count'], marker='o')
line_ax.set_title('前20个高频词的出现频率趋势', fontproperties=font_prop)
line_ax.set_xlabel('词语排名', fontproperties=font_prop)
line_ax.set_ylabel('出现次数', fontproperties=font_prop)

# One tick per rank, labelled with the word at that rank.
line_ax.set_xticks(ranks)
line_ax.set_xticklabels(
    top_ranked['word'],
    rotation=45,
    ha='right',
    fontproperties=font_prop,
)
line_ax.grid(True, linestyle='--', alpha=0.7)

# Adjust the spacing between subplots
plt.tight_layout()
# Display the figure
plt.show()

查看全文

http://www.zskr.cn/news/56817.html