Three Kingdoms Word Cloud

First, import jieba:

# pip install jieba
import jieba

Test jieba on a short sentence:

txt = '我来到北京清华大学'
seg_list = jieba.lcut(txt)
print(seg_list)
# -> ['我', '来到', '北京', '清华大学']
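Incidentally, jieba.lcut is just list(jieba.cut(...)): jieba.cut yields the same tokens lazily as a generator, which saves memory on large texts. A minimal sketch using only the jieba API:

gen = jieba.cut('我来到北京清华大学')  # generator: tokens are produced lazily
print(list(gen))                        # -> ['我', '来到', '北京', '清华大学']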

Load the novel text from 'threekingdom.txt', segment it with jieba, and store the words in a list:

with open('threekingdom.txt', 'r', encoding='utf-8') as f:
    txt = f.read()
# segment the full string into Chinese words
words = jieba.lcut(txt)
# print(words)

Count word frequencies, skipping tokens of length 1 (single characters are rarely names):

counts = {}
for word in words:
    if len(word) == 1:
        continue
    # add or update the entry in the dict,
    # e.g. counts['曹操'] = counts.get('曹操', 0) + 1
    counts[word] = counts.get(word, 0) + 1
# print(counts)
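The counting loop can also be written with collections.Counter from the standard library; a minimal equivalent sketch (Counter is a dict subclass, so the rest of the code keeps working):

from collections import Counter

# keep only tokens longer than one character, then count them
counts = Counter(word for word in words if len(word) > 1)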

Merge words that refer to the same character, adding their counts together:

# merge aliases for the same character; the default 0 in get()
# avoids a TypeError when a key is missing
counts['孔明'] = counts.get('孔明', 0) + counts.get('孔明曰', 0)
counts['玄德'] = counts.get('玄德', 0) + counts.get('玄德曰', 0)
counts['玄德'] = counts.get('玄德', 0) + counts.get('刘备', 0)
counts['关公'] = counts.get('关公', 0) + counts.get('云长', 0)
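The same merging can be written data-driven, which scales better as more aliases are found. A sketch; the alias table below is this example's own construction, not something from the original code:

# canonical name -> aliases to fold into it
aliases = {
    '孔明': ['孔明曰'],
    '玄德': ['玄德曰', '刘备'],
    '关公': ['云长'],
}
for name, alts in aliases.items():
    for alt in alts:
        # pop() folds the alias count in and deletes the alias entry,
        # returning 0 when the alias never appeared
        counts[name] = counts.get(name, 0) + counts.pop(alt, 0)

Because pop() also removes the alias entries, they would no longer need to be listed in excludes below.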

Define a set of high-frequency words that are not character names, and remove them from the counts:

excludes = {"将军", "却说", "丞相", "二人", "不可", "荆州", "不能", "如此", "商议",
            "如何", "主公", "军士", "军马", "左右", "次日", "引兵", "大喜", "天下",
            "东吴", "于是", "今日", "不敢", "魏兵", "陛下", "都督", "人马", "不知",
            "孔明曰", "玄德曰", "云长", "刘备"}

# remove the non-name words; pop() with a default avoids a KeyError
# if a word never made it into counts
for word in excludes:
    counts.pop(word, None)
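Equivalently, the loop can be replaced by a single dict comprehension that keeps everything not in excludes:

counts = {word: n for word, n in counts.items() if word not in excludes}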

Two ways to list the most frequent words (the code below prints the top 10):

# method 1: sort the items by count, descending, and print the top 10
items = list(counts.items())
# print('before sorting:', items)
items.sort(key=lambda x: x[1], reverse=True)
# print('after sorting:', items)
for i in range(10):
    character, count = items[i]
    print(character, count)

# method 2: the same top-10 list via collections.Counter
# roles = Counter(counts)
# role = roles.most_common(10)
# print(role)
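Uncommented and made self-contained, method 2 looks like this; most_common(10) returns (word, count) pairs already sorted in descending order:

from collections import Counter

roles = Counter(counts)  # Counter accepts an existing mapping of counts
for character, count in roles.most_common(10):
    print(character, count)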

Build the word-cloud input string

This step needs the WordCloud class (from wordcloud import WordCloud; pip install wordcloud if needed). Repeat each of the top-10 names once per occurrence, then join them into one comma-separated string:

li = []
for i in range(10):
    character, count = items[i]
    # repeat each name once per occurrence so WordCloud sizes it by frequency
    for _ in range(count):
        li.append(character)
# print(li)
cloud_txt = ",".join(li)

wc = WordCloud(
    background_color='white',
    # a font that covers Chinese glyphs (msyh.ttc is Microsoft YaHei)
    font_path='msyh.ttc',
    # collocations defaults to True and would pair adjacent words
    collocations=False
).generate(cloud_txt)
wc.to_file('三国词云.png')
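Note that msyh.ttc (Microsoft YaHei) ships with Windows; on other systems, point font_path at any font file that covers Chinese glyphs, or the names will render as empty boxes. Also, WordCloud can take the frequency dict directly through generate_from_frequencies, which skips building the long repeated-name string; a sketch under the same top-10 assumption:

from wordcloud import WordCloud

freqs = dict(items[:10])  # top-10 (word, count) pairs as a dict

wc = WordCloud(
    background_color='white',
    font_path='msyh.ttc',  # assumes this font file is reachable
).generate_from_frequencies(freqs)
wc.to_file('三国词云.png')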

Finally, call the function.
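The full listing below simply calls parse() at module level; wrapping the call in a main guard is a common alternative that lets the file be imported without side effects:

if __name__ == '__main__':
    parse()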

The complete code:

# pip install jieba
import jieba
from collections import Counter
from wordcloud import WordCloud

# quick test of jieba:
# txt = '我来到北京清华大学'
# seg_list = jieba.lcut(txt)
# print(seg_list)
def parse():
    """Count character appearances in the Three Kingdoms novel and draw a word cloud."""
    # high-frequency words that are not character names
    excludes = {"将军", "却说", "丞相", "二人", "不可", "荆州", "不能", "如此", "商议",
                "如何", "主公", "军士", "军马", "左右", "次日", "引兵", "大喜", "天下",
                "东吴", "于是", "今日", "不敢", "魏兵", "陛下", "都督", "人马", "不知",
                "孔明曰", "玄德曰", "云长", "刘备"}
    with open('threekingdom.txt', 'r', encoding='utf-8') as f:
        txt = f.read()
    # segment the full string into Chinese words
    words = jieba.lcut(txt)
    # print(words)
    # count frequencies, e.g. counts['曹操'] = 555
    counts = {}
    for word in words:
        if len(word) == 1:
            continue
        # add or update the entry in the dict
        counts[word] = counts.get(word, 0) + 1
    # print(counts)

    # merge aliases for the same character; the default 0 in get()
    # avoids a TypeError when a key is missing
    counts['孔明'] = counts.get('孔明', 0) + counts.get('孔明曰', 0)
    counts['玄德'] = counts.get('玄德', 0) + counts.get('玄德曰', 0)
    counts['玄德'] = counts.get('玄德', 0) + counts.get('刘备', 0)
    counts['关公'] = counts.get('关公', 0) + counts.get('云长', 0)
    # remove the non-name words; pop() with a default avoids a
    # KeyError if a word never made it into counts
    for word in excludes:
        counts.pop(word, None)
    # method 1: sort the items by count and print the top 10
    items = list(counts.items())
    # print('before sorting:', items)
    items.sort(key=lambda x: x[1], reverse=True)
    # print('after sorting:', items)
    for i in range(10):
        character, count = items[i]
        print(character, count)

    # method 2: the same top-10 list via collections.Counter
    # roles = Counter(counts)
    # role = roles.most_common(10)
    # print(role)

    ## generate the word cloud

    # build the input string: repeat each of the top-10 names once
    # per occurrence so WordCloud sizes them by frequency
    li = []
    for i in range(10):
        character, count = items[i]
        for _ in range(count):
            li.append(character)
    # print(li)
    cloud_txt = ",".join(li)

    wc = WordCloud(
        background_color='white',
        # a font that covers Chinese glyphs (msyh.ttc is Microsoft YaHei)
        font_path='msyh.ttc',
        # collocations defaults to True and would pair adjacent words
        collocations=False
    ).generate(cloud_txt)
    wc.to_file('三国词云.png')

parse()