python制作词云图设置停用词,Python生成词云图
代码如下:
from os import path
from wordcloud import WordCloud
import matplotlib
matplotlib.use('TkAgg')
# Resolve the directory that contains this script and read the input text.
# (`__file__` is the path of the running script; resolving it to an absolute
# path keeps the lookup independent of the current working directory.)
d = path.dirname(path.abspath(__file__))
with open(path.join(d, 'haha.txt')) as text_file:
    text = text_file.read()

# Generate a word cloud image from the text using the default settings.
wordcloud = WordCloud().generate(text)

# Display the generated word cloud through PIL.
image = wordcloud.to_image()
image.show()
from os import path
from PIL import Image
import numpy as np
import matplotlib
matplotlib.use('TkAgg')
from wordcloud import WordCloud, STOPWORDS
# Directory that contains this script; input files are resolved against it.
d = path.dirname(path.abspath(__file__))

# Read the whole text.
with open(path.join(d, 'haha.txt')) as text_file:
    text = text_file.read()

# Load the mask image and convert it to a NumPy array for WordCloud.
alice_mask = np.array(Image.open(path.join(d, "heart.png")))

# Start from the library's built-in stop words and add a custom one.
stopwords = set(STOPWORDS)
stopwords.add("HaHa")

# Configure the word cloud: background colour, word limit, mask, stop words.
wc = WordCloud(background_color="black", max_words=2000, mask=alice_mask,
               stopwords=stopwords)

# Generate the word cloud from the text.
wc.generate(text)

# Display the generated image through PIL (usable without matplotlib).
image = wc.to_image()
image.show()
from os import path
from PIL import Image
import numpy as np
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import jieba
# Directory that contains this script; input files are resolved against it.
d = path.dirname(path.abspath(__file__))

# Read the whole text. The file is read as bytes and decoded explicitly as
# UTF-8 so the result does not depend on the platform's default encoding and
# the same code runs on both Python 2 and Python 3.
with open(path.join(d, '中文.txt'), 'rb') as text_file:
    text = text_file.read().decode('utf-8')
print(text)

# Load the mask image and convert it to a NumPy array for WordCloud.
alice_mask = np.array(Image.open(path.join(d, "dog.jpeg")))

stopwords_path = 'stopwords.txt'  # stop-word list file
my_words_list = ['碌碌无为']  # new words to add to the jieba dictionary
# Add our own words to jieba's dictionary before segmenting the text.
def add_word(list):
    """Add every entry of *list* to the jieba dictionary.

    The parameter name shadows the ``list`` builtin; it is kept unchanged so
    that any caller passing it by keyword keeps working.
    """
    for items in list:
        jieba.add_word(items)


add_word(my_words_list)
# Segment the text with jieba (cut_all=False) and join the tokens with
# single spaces so the result can be split on " " afterwards.
wordlist = jieba.cut(text, cut_all=False)
wl = " ".join(wordlist)
print(wl)  # print the segmented text
# Remove stop words from the segmented text.
mywordlist = []

# Read the stop-word file as bytes and decode it explicitly as UTF-8; this
# works on both Python 2 and Python 3 and always closes the file.
with open(stopwords_path, 'rb') as f_stop:
    f_stop_text = f_stop.read().decode('utf-8')
print(f_stop_text)

# One stop word per line.
f_stop_seg_list = f_stop_text.split('\n')

# Build the lookup set once: membership tests are O(1) instead of scanning
# the whole list for every token. Entries are stripped so that a trailing
# '\r' (CRLF files) or stray spaces do not prevent a match.
stop_set = set(line.strip() for line in f_stop_seg_list)

for myword in wl.split(" "):
    candidate = myword.strip()
    # Keep a token only if it is longer than one character and is not a
    # stop word (empty and single-character tokens are always dropped).
    if len(candidate) > 1 and candidate not in stop_set:
        mywordlist.append(myword)

# From here on `mywordlist` is a single space-separated string, which is the
# form passed to WordCloud.generate().
mywordlist = " ".join(mywordlist)
# Stop words handed to WordCloud: the library's built-in set plus "这样".
stopwords = set(STOPWORDS)
stopwords.add("这样")

# Configure the word cloud.
wc = WordCloud(
    background_color="black",  # background colour
    mask=alice_mask,  # mask image
    max_words=2000,  # maximum number of words to display
    # Only one font_path may be passed; "fangsong_GB2312.ttf" is an
    # alternative font file that can be substituted here.
    font_path="华文宋体.ttf",
    max_font_size=50,  # largest font size used
    random_state=30,  # fixed seed for the random generator (reproducible output)
    stopwords=stopwords,  # previously built but never passed in
)
myword = wc.generate(mywordlist)  # generate the word cloud
# Display the word cloud with matplotlib, hiding the axes.
plt.imshow(myword)
plt.axis("off")
plt.show()
词云图模糊怎么解决?
默认的参数图片分辨率较低,设置scale参数,参数越大,分辨率越高。
词云图重复怎么解决?
默认 collocations=True,此时词云会把相邻的两个词当作一个整体(二元词组)一并统计,因此同一个词可能既单独出现,又在词组中重复出现。将该参数设置为 False 即可解决这一问题:collocations=False
其他
在 Python 2 中直接 print 含中文字符串的列表时,输出的是 \uXXXX 形式的转义序列,而不是可读的中文;可以用 json.dumps 并设置 ensure_ascii=False 来输出可读的中文:
import json
# ensure_ascii=False makes json.dumps emit non-ASCII characters as-is rather
# than as \uXXXX escapes. The `encoding` argument is omitted: it is rejected
# by Python 3, and on Python 2 its default is already UTF-8.
print(json.dumps(f_stop_seg_list, ensure_ascii=False))
