Python: Lab Session 2

Lab content will be posted to this page as the session goes on and updated continuously; refresh to see the latest version.

Installing the libraries

pip install jieba
pip install wordcloud
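
To confirm the installs worked, a quick sanity check is to import both packages and print the installed versions (a minimal sketch; the version numbers will vary with your environment):

from importlib.metadata import version

import jieba
import wordcloud

# Both imports should succeed without error; then report the installed versions
print("jieba", version("jieba"))
print("wordcloud", version("wordcloud"))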

CET-4 word frequency statistics

Regular version

import re
from collections import Counter

# Input texts and the English stopword list (adjust the paths to your machine)
cet4_files = ['D:\\change\\your\\path\\to\\20240601.txt', 'D:\\change\\your\\path\\to\\20240602.txt', 'D:\\change\\your\\path\\to\\20240603.txt']
stopwords_file = 'D:\\cxdownload\\stopwords-en.txt'

# Load the stopword list (one word per line) into a set
with open(stopwords_file, 'r', encoding='utf-8') as f:
    stopwords = set(line.strip().lower() for line in f if line.strip())

word_counter = Counter()
word_pattern = re.compile(r'\b[a-zA-Z-]+\b')  # runs of letters (hyphens allowed)

# Count the non-stopword tokens across all files
for file_path in cet4_files:
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read().lower()
    words = word_pattern.findall(text)
    word_counter.update(word for word in words if word not in stopwords)

for word, count in word_counter.most_common(100):  # 100 = how many top words to print
    print(f'{word}: {count}')
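
If you also want the counts on disk, a minimal follow-up sketch (the filename cet4_top100.csv is just an example) writes the same most_common() result with the standard csv module, reusing the word_counter built above:

import csv

# Save the top-100 list computed above to a CSV file
with open('cet4_top100.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['word', 'count'])
    writer.writerows(word_counter.most_common(100))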

2024 Government Work Report word cloud

Stopword-removal version (external stopword file)

import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Load the Chinese stopword list (one word per line) into a set
with open("C:\\Users\\berni\\Desktop\\python实验课2\\stopwords-zh.txt", "r", encoding="utf-8") as file:
    stopwords = set(file.read().split())

# Read the report text
with open("C:\\Users\\berni\\Desktop\\python实验课2\\2024年政府工作报告.txt", "r", encoding="utf-8") as file:
    report_text = file.read()

# Segment the Chinese text with jieba
words = jieba.lcut(report_text)

# Drop the stopwords
filtered_words = [word for word in words if word not in stopwords]

# Join the remaining tokens with spaces so WordCloud can split them
text = " ".join(filtered_words)

wordcloud = WordCloud(
    background_color="white",
    font_path='C:/Windows/Fonts/simsun.ttc',
    max_words=200,
).generate(text)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
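
plt.show() only displays the image; to keep a copy, the WordCloud object can also be written straight to disk with its to_file() method (the filename below is just an example):

# Save the rendered word cloud as a PNG
wordcloud.to_file("report_wordcloud.png")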

Regular version

import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Read the report text
with open("D:\\change\\your\\path\\to\\2024年政府工作报告.txt", "r", encoding="utf-8") as file:
    report_text = file.read()

# Segment the Chinese text with jieba and join the tokens with spaces
words = jieba.lcut(report_text)
text = " ".join(words)

wordcloud = WordCloud(
    background_color="white",
    font_path='C:/Windows/Fonts/simsun.ttc',
    max_words=200
).generate(text)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()
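
Note that generate() re-counts the space-separated tokens itself and, by default, also merges frequent two-word combinations ("collocations"). For full control over the weights, one alternative sketch is to count the jieba tokens yourself and call generate_from_frequencies() instead, reusing the words list and imports from the code above (dropping single-character tokens here is just one optional way to cut noise); display it with the same plt calls as above:

from collections import Counter

# Build an explicit word -> frequency mapping from the jieba tokens above
freqs = Counter(w for w in words if len(w) > 1)  # optional: skip single characters

wordcloud = WordCloud(
    background_color="white",
    font_path='C:/Windows/Fonts/simsun.ttc',
    max_words=200
).generate_from_frequencies(freqs)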

Custom background version (with optional image mask)

from PIL import Image
import numpy as np
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Read the Government Work Report text
with open("2024年政府工作报告.txt", "r", encoding="utf-8") as file:
    report_text = file.read()

# Segment the Chinese text with jieba
words = jieba.lcut(report_text)

# Join the tokens with spaces to form a single string
text = " ".join(words)

# Load the mask image (uncomment if using one)
# mask_image = Image.open("mask.png")
# mask = np.array(mask_image)
# width, height = mask_image.size

# Generate the word cloud (leave the mask parameters commented out if no mask is used)
wordcloud = WordCloud(
    background_color="white",                 # background color
    # mask=mask,                              # mask array (uncomment when using a mask)
    font_path='C:/Windows/Fonts/simsun.ttc',  # a common Chinese font on Windows
    max_words=200,                            # cap the number of words
    # width=width,                            # match the cloud width to the mask (uncomment when using a mask)
    # height=height                           # match the cloud height to the mask (uncomment when using a mask)
).generate(text)

# Display the word cloud
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")  # hide the axes
plt.show()
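
The mask lines above assume a mask.png on disk. To try the effect without an image file, a mask array can also be built in memory; a minimal sketch that confines the cloud to a circle, reusing the text variable from the code above (WordCloud treats pure white, value 255, as masked out, and everything else as drawable):

import numpy as np

# 400x400 circular mask: 255 = blank border, 0 = drawable circle
h, w = 400, 400
y, x = np.ogrid[:h, :w]
mask = np.full((h, w), 255, dtype=np.uint8)
mask[(y - h // 2) ** 2 + (x - w // 2) ** 2 <= (min(h, w) // 2 - 10) ** 2] = 0

wordcloud = WordCloud(
    background_color="white",
    mask=mask,
    font_path='C:/Windows/Fonts/simsun.ttc',
    max_words=200
).generate(text)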

Stopword-removal version (inline stopword list)

from PIL import Image
import numpy as np
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Common function words, pronouns, and time expressions to exclude
stopwords = set([
    "的", "了", "在", "是", "和", "有", "我", "你", "他", "她", "它", "们",
    "来", "去", "上", "下", "说", "到", "就", "也", "都", "还", "这", "那",
    "对", "给", "把", "用", "以", "为", "比", "从", "被", "让", "没", "不",
    "会", "能", "要", "想", "时", "大", "小", "多", "少", "很", "好", "快",
    "慢", "高", "低", "长", "短", "前", "后", "左", "右", "里", "外", "中",
    "间", "最", "又", "再", "还", "就", "才", "可", "但", "或", "且", "因",
    "果", "然", "虽", "已", "经", "通", "过", "向", "而", "于", "自", "其",
    "之", "所", "以", "及", "因此", "所以", "但是", "然而", "如果", "虽然",
    "尽管", "或者", "并且", "而且", "不仅", "不但", "而且", "还有", "也许",
    "大概", "可能", "似乎", "好像", "确实", "的确", "已经", "曾经", "刚刚",
    "正在", "将要", "马上", "立刻", "现在", "过去", "未来", "今天", "明天",
    "昨天", "上午", "下午", "晚上", "今年", "明年", "去年", "一月", "二月",
    "三月", "四月", "五月", "六月", "七月", "八月", "九月", "十月", "十一月",
    "十二月", "星期一", "星期二", "星期三", "星期四", "星期五", "星期六",
    "星期日"
])

# Read the Government Work Report text
with open("2024年政府工作报告.txt", "r", encoding="utf-8") as file:
    report_text = file.read()

# Segment the Chinese text with jieba
words = jieba.lcut(report_text)

# Filter out the stopwords
filtered_words = [word for word in words if word not in stopwords]

# Join the filtered tokens with spaces to form a single string
text = " ".join(filtered_words)

# Load the mask image (uncomment if using one)
# mask_image = Image.open("mask.png")
# mask = np.array(mask_image)
# width, height = mask_image.size

# Generate the word cloud (leave the mask parameters commented out if no mask is used)
wordcloud = WordCloud(
    background_color="white",                 # background color
    # mask=mask,                              # mask array (uncomment when using a mask)
    font_path='C:/Windows/Fonts/simsun.ttc',  # a common Chinese font on Windows
    max_words=200,                            # cap the number of words
    # width=width,                            # match the cloud width to the mask (uncomment when using a mask)
    # height=height                           # match the cloud height to the mask (uncomment when using a mask)
).generate(text)

# Display the word cloud
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")  # hide the axes
plt.show()
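
Instead of filtering the token list yourself, the same set can also be passed to WordCloud's stopwords parameter, letting generate() do the filtering; a minimal sketch reusing the words and stopwords defined above (results can differ slightly, since generate()'s default tokenization also ignores single-character tokens and merges frequent bigrams):

# Alternative: let WordCloud apply the stopword set itself
wordcloud = WordCloud(
    background_color="white",
    font_path='C:/Windows/Fonts/simsun.ttc',
    max_words=200,
    stopwords=stopwords
).generate(" ".join(words))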