Python: Lab Session 2

Lab content will be posted to this page as the session goes on and updated continuously; refresh to see the latest version.

Installing the libraries

pip install jieba
pip install wordcloud
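
To confirm the installs worked, a quick sanity check is to import both packages and print the installed versions (a minimal sketch; the version numbers will vary with your environment):

from importlib.metadata import version

import jieba
import wordcloud

# Both imports should succeed without error; then report the installed versions
print("jieba", version("jieba"))
print("wordcloud", version("wordcloud"))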

CET-4 word frequency statistics

Regular version

import re
from collections import Counter

# Input texts and the English stopword list (adjust the paths to your machine)
cet4_files = ['D:\\change\\your\\path\\to\\20240601.txt', 'D:\\change\\your\\path\\to\\20240602.txt', 'D:\\change\\your\\path\\to\\20240603.txt']
stopwords_file = 'D:\\cxdownload\\stopwords-en.txt'

# Load the stopword list (one word per line) into a set
with open(stopwords_file, 'r', encoding='utf-8') as f:
    stopwords = set(line.strip().lower() for line in f if line.strip())

word_counter = Counter()
word_pattern = re.compile(r'\b[a-zA-Z-]+\b')  # runs of letters (hyphens allowed)

# Count the non-stopword tokens across all files
for file_path in cet4_files:
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read().lower()
    words = word_pattern.findall(text)
    word_counter.update(word for word in words if word not in stopwords)

for word, count in word_counter.most_common(100):  # 100 = how many top words to print
    print(f'{word}: {count}')
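
If you also want the counts on disk, a minimal follow-up sketch (the filename cet4_top100.csv is just an example) writes the same most_common() result with the standard csv module, reusing the word_counter built above:

import csv

# Save the top-100 list computed above to a CSV file
with open('cet4_top100.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['word', 'count'])
    writer.writerows(word_counter.most_common(100))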

2024 Government Work Report word cloud

Stopword-removal version (external stopword file)

import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Load the Chinese stopword list (one word per line) into a set
with open("C:\\Users\\berni\\Desktop\\python实验课2\\stopwords-zh.txt", "r", encoding="utf-8") as file:
    stopwords = set(file.read().split())

# Read the report text
with open("C:\\Users\\berni\\Desktop\\python实验课2\\2024年政府工作报告.txt", "r", encoding="utf-8") as file:
    report_text = file.read()

# Segment the Chinese text with jieba
words = jieba.lcut(report_text)

# Drop the stopwords
filtered_words = [word for word in words if word not in stopwords]

# Join the remaining tokens with spaces so WordCloud can split them
text = " ".join(filtered_words)

wordcloud = WordCloud(
    background_color="white",
    font_path='C:/Windows/Fonts/simsun.ttc',
    max_words=200,
).generate(text)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
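
plt.show() only displays the image; to keep a copy, the WordCloud object can also be written straight to disk with its to_file() method (the filename below is just an example):

# Save the rendered word cloud as a PNG
wordcloud.to_file("report_wordcloud.png")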

Regular version

import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Read the report text
with open("D:\\change\\your\\path\\to\\2024年政府工作报告.txt", "r", encoding="utf-8") as file:
    report_text = file.read()

# Segment the Chinese text with jieba and join the tokens with spaces
words = jieba.lcut(report_text)
text = " ".join(words)

wordcloud = WordCloud(
    background_color="white",
    font_path='C:/Windows/Fonts/simsun.ttc',
    max_words=200
).generate(text)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
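
Note that generate() re-counts the space-separated tokens itself and, by default, also merges frequent two-word combinations ("collocations"). For full control over the weights, one alternative sketch is to count the jieba tokens yourself and call generate_from_frequencies() instead, reusing the words list and imports from the code above (dropping single-character tokens here is just one optional way to cut noise); display it with the same plt calls as above:

from collections import Counter

# Build an explicit word -> frequency mapping from the jieba tokens above
freqs = Counter(w for w in words if len(w) > 1)  # optional: skip single characters

wordcloud = WordCloud(
    background_color="white",
    font_path='C:/Windows/Fonts/simsun.ttc',
    max_words=200
).generate_from_frequencies(freqs)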

Custom background version (with optional image mask)

from PIL import Image
import numpy as np
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Read the Government Work Report text
with open("2024年政府工作报告.txt", "r", encoding="utf-8") as file:
    report_text = file.read()

# Segment the Chinese text with jieba
words = jieba.lcut(report_text)

# Join the tokens with spaces to form a single string
text = " ".join(words)

# Load the mask image (uncomment if using one)
# mask_image = Image.open("mask.png")
# mask = np.array(mask_image)
# width, height = mask_image.size

# Generate the word cloud (leave the mask parameters commented out if no mask is used)
wordcloud = WordCloud(
    background_color="white",                 # background color
    # mask=mask,                              # mask array (uncomment when using a mask)
    font_path='C:/Windows/Fonts/simsun.ttc',  # a common Chinese font on Windows
    max_words=200,                            # cap the number of words
    # width=width,                            # match the cloud width to the mask (uncomment when using a mask)
    # height=height                           # match the cloud height to the mask (uncomment when using a mask)
).generate(text)

# Display the word cloud
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")  # hide the axes
plt.show()
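
The mask lines above assume a mask.png on disk. To try the effect without an image file, a mask array can also be built in memory; a minimal sketch that confines the cloud to a circle, reusing the text variable from the code above (WordCloud treats pure white, value 255, as masked out, and everything else as drawable):

import numpy as np

# 400x400 circular mask: 255 = blank border, 0 = drawable circle
h, w = 400, 400
y, x = np.ogrid[:h, :w]
mask = np.full((h, w), 255, dtype=np.uint8)
mask[(y - h // 2) ** 2 + (x - w // 2) ** 2 <= (min(h, w) // 2 - 10) ** 2] = 0

wordcloud = WordCloud(
    background_color="white",
    mask=mask,
    font_path='C:/Windows/Fonts/simsun.ttc',
    max_words=200
).generate(text)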

Stopword-removal version (inline stopword list)

from PIL import Image
import numpy as np
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Common function words, pronouns, and time expressions to exclude
stopwords = set([
    "的", "了", "在", "是", "和", "有", "我", "你", "他", "她", "它", "们",
    "来", "去", "上", "下", "说", "到", "就", "也", "都", "还", "这", "那",
    "对", "给", "把", "用", "以", "为", "比", "从", "被", "让", "没", "不",
    "会", "能", "要", "想", "时", "大", "小", "多", "少", "很", "好", "快",
    "慢", "高", "低", "长", "短", "前", "后", "左", "右", "里", "外", "中",
    "间", "最", "又", "再", "还", "就", "才", "可", "但", "或", "且", "因",
    "果", "然", "虽", "已", "经", "通", "过", "向", "而", "于", "自", "其",
    "之", "所", "以", "及", "因此", "所以", "但是", "然而", "如果", "虽然",
    "尽管", "或者", "并且", "而且", "不仅", "不但", "而且", "还有", "也许",
    "大概", "可能", "似乎", "好像", "确实", "的确", "已经", "曾经", "刚刚",
    "正在", "将要", "马上", "立刻", "现在", "过去", "未来", "今天", "明天",
    "昨天", "上午", "下午", "晚上", "今年", "明年", "去年", "一月", "二月",
    "三月", "四月", "五月", "六月", "七月", "八月", "九月", "十月", "十一月",
    "十二月", "星期一", "星期二", "星期三", "星期四", "星期五", "星期六",
    "星期日"
])

# Read the Government Work Report text
with open("2024年政府工作报告.txt", "r", encoding="utf-8") as file:
    report_text = file.read()

# Segment the Chinese text with jieba
words = jieba.lcut(report_text)

# Filter out the stopwords
filtered_words = [word for word in words if word not in stopwords]

# Join the filtered tokens with spaces to form a single string
text = " ".join(filtered_words)

# Load the mask image (uncomment if using one)
# mask_image = Image.open("mask.png")
# mask = np.array(mask_image)
# width, height = mask_image.size

# Generate the word cloud (leave the mask parameters commented out if no mask is used)
wordcloud = WordCloud(
    background_color="white",                 # background color
    # mask=mask,                              # mask array (uncomment when using a mask)
    font_path='C:/Windows/Fonts/simsun.ttc',  # a common Chinese font on Windows
    max_words=200,                            # cap the number of words
    # width=width,                            # match the cloud width to the mask (uncomment when using a mask)
    # height=height                           # match the cloud height to the mask (uncomment when using a mask)
).generate(text)

# Display the word cloud
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")  # hide the axes
plt.show()
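
Instead of filtering the token list yourself, the same set can also be passed to WordCloud's stopwords parameter, letting generate() do the filtering; a minimal sketch reusing the words and stopwords defined above (results can differ slightly, since generate()'s default tokenization also ignores single-character tokens and merges frequent bigrams):

# Alternative: let WordCloud apply the stopword set itself
wordcloud = WordCloud(
    background_color="white",
    font_path='C:/Windows/Fonts/simsun.ttc',
    max_words=200,
    stopwords=stopwords
).generate(" ".join(words))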