Tagging and Classifying Text Content with Python: Article Classification and Automatic Tags -- NLP
The main steps for classifying, tagging, and modeling articles are as follows (a minimal end-to-end sketch of the whole flow appears right after this list):
1) Segment both the already-labeled text collection (training set) and the collection to be labeled (test set), splitting long sentences into individual words;
2) Put the words produced in step 1 into a word bag and expand it into a chained structure, forming the bag-of-words representation;
3) Apply the TF-IDF algorithm to compute the TF-IDF weight matrix over the training-set documents;
4) Train a naive Bayes classifier on the training set, then use the fitted model to classify the test-set documents.
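Before walking through the full project, the sketch below runs the same four steps end to end on a few made-up sentences, using jieba for segmentation plus scikit-learn's TfidfVectorizer and MultinomialNB. It is only an illustration of the flow, not the project code that follows; all texts, labels, and parameter choices here are examples.
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline

# Made-up example articles and labels, just to illustrate the flow.
train_texts = ["这部电影的剧情非常精彩", "球队在昨晚的比赛中获胜", "新款手机的芯片性能大幅提升"]
train_labels = ["entertainment", "sports", "tech"]
test_texts = ["昨天的比赛结果令人意外"]

def segment(texts):
    # jieba.cut returns a generator; join the tokens with spaces so TfidfVectorizer can split on whitespace
    return [" ".join(jieba.cut(t)) for t in texts]

model = make_pipeline(TfidfVectorizer(), MultinomialNB(alpha=0.5))
model.fit(segment(train_texts), train_labels)
print(model.predict(segment(test_texts)))  # likely prints ['sports'], since "比赛" only appears in the sports example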
Raw data source: labeled .txt articles organized into one folder per category, for both the training set and the test set.
-------------------------------------Text segmentation------------------------------------
Word segmentation
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import jieba
import jieba.analyse  # keyword-extraction module

# Segment the training-set and test-set texts, and tag each test-set article with its topic keywords.

# Save content to a file
def save_file(save_path, content):
    with open(save_path, "a", encoding='utf-8', errors='ignore') as fp:
        fp.write(content)

# Read a file
def read_file(file_path):
    with open(file_path, "r", encoding='utf-8', errors='ignore') as fp:
        content = fp.readlines()
    print(content)
    return str(content)

# Extract the topic keywords of an article
def extract_theme(content):
    themes = []
    tags = jieba.analyse.extract_tags(content, topK=3, withWeight=True,
                                      allowPOS=['n', 'ns', 'v', 'vn'], withFlag=True)
    '''sentence: the text to analyze
    topK: how many keywords with the highest TF-IDF weights to return (default 20)
    withWeight: whether to return the keyword weights as well (default False)
    allowPOS: keep only words with the listed parts of speech (default empty, i.e. no filtering)
    withFlag: return the part-of-speech flag together with each word'''
    for i in tags:
        themes.append(i[0].word)  # with withFlag=True each item is ((word, flag) pair, weight)
    return str(themes)

def cast_words(origin_path, save_path, theme_tag):
    '''origin_path: path of the raw texts
    save_path: path for the segmented texts
    theme_tag: path for the topic-keyword files (None for the training set)'''
    file_lists = os.listdir(origin_path)  # os.listdir returns every entry under the source directory
    for dir_1 in file_lists:  # each category folder
        file_path = origin_path + dir_1 + "/"  # raw-file folder
        seg_path = save_path + dir_1 + "/"     # segmented-file folder
        if not os.path.exists(seg_path):
            os.makedirs(seg_path)  # create the output folder
        detail_paths = os.listdir(file_path)
        for detail_path in detail_paths:  # each document in the category folder
            full_path = file_path + detail_path  # path of one raw document
            file_content = read_file(full_path)
            file_content = file_content.strip()                # drop leading/trailing whitespace
            file_content = file_content.replace(r"\u3000", "") # remove full-width spaces
            file_content = file_content.replace("&nbsp", "")   # HTML space placeholder
            file_content = file_content.replace("'", "")
            file_content = file_content.replace(r"\n", "")     # literal "\n" left over from str(readlines())
            content_seg = list(jieba.cut(file_content))  # segment the content (as a list, so it can be joined twice)
            if theme_tag is not None:
                print("File path: {}".format(theme_tag + detail_path))
                theme = extract_theme(" ".join(content_seg))  # topic keywords of this article
                print("Topic keywords: {}".format(theme))
                save_file(theme_tag + detail_path, theme)  # save the test-set article's topic keywords to the tag directory
            save_file(seg_path + detail_path, " ".join(content_seg))  # save the segmented text to the corpus directory

if __name__ == "__main__":
    # Segment the training set
    train_words_path = './train_words/'    # source files
    train_save_path = './train_segments/'  # output files
    cast_words(train_words_path, train_save_path, theme_tag=None)

    # Segment the test set and extract its topic tags
    test_words_path = './test_words/'      # source files
    test_save_path = './test_segments/'    # output files
    theme_tag_path = './theme_tag/'        # directory for the test-set topic tags
    cast_words(test_words_path, test_save_path, theme_tag=theme_tag_path)
After running this script, the raw .txt documents under the training-set and test-set folders are segmented, and the segmented files are written into newly created output folders.
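To make the intermediate output concrete, here is a small demo of what jieba.cut and jieba.analyse.extract_tags return for a single sentence; the sentence is made up, and topK=3 matches the setting used above.
import jieba
import jieba.analyse

sentence = "自然语言处理让计算机能够理解人类语言"  # made-up example sentence

# jieba.cut yields the tokens; the script above joins them with spaces before saving
print(" ".join(jieba.cut(sentence)))

# extract_tags ranks words by TF-IDF against jieba's built-in corpus statistics
for word, weight in jieba.analyse.extract_tags(sentence, topK=3, withWeight=True):
    print(word, round(weight, 3))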
---------------------------------------Data structure processing----------------------------------
To make it easier to build the word-vector space model later, the segmented texts need to be converted into structured objects. sklearn's Bunch class inherits from dict: it is created from keyword arguments and stores data as key/value pairs, with the keys also accessible as attributes.
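As a toy illustration (not part of the project code) of how a Bunch is created and accessed:
from sklearn.utils import Bunch  # sklearn.datasets.base.Bunch in older scikit-learn releases

# A Bunch is a dict whose keys are also available as attributes
bunch = Bunch(label=[], filepath=[], contents=[])
bunch.label.append("sports")                               # hypothetical category
bunch.filepath.append("./train_segments/sports/0001.txt")  # hypothetical path
bunch.contents.append("segmented article text ...")
print(bunch["label"], bunch.contents)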
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import os
import pickle
from sklearn.utils import Bunch  # sklearn.datasets.base.Bunch in older scikit-learn releases

'''label: article category
filepath: article path
contents: segmented article text'''

def read_file(file_path):
    with open(file_path, "r", encoding='utf-8', errors='ignore') as fp:
        content = fp.readlines()
    return str(content)

def word_to_bunch(save_path, bunch_path):
    bunch = Bunch(label=[], filepath=[], contents=[])
    all_labels = os.listdir(save_path)
    for label in all_labels:
        detail_path = save_path + label + '/'
        all_details = os.listdir(detail_path)
        for all_detail in all_details:
            file_detail_path = detail_path + all_detail  # full path of one segmented file
            bunch.label.append(label)                    # article category
            bunch.filepath.append(file_detail_path)      # article path
            contents = read_file(file_detail_path)       # article content
            bunch.contents.append(contents)
    with open(bunch_path, "wb+") as fp:
        pickle.dump(bunch, fp)
    print("Bunch created")

if __name__ == "__main__":
    # Pack the segmented training set
    train_save_path = './train_segments/'
    train_bunch_path = "train_bunch_bag.dat"
    word_to_bunch(train_save_path, train_bunch_path)

    # Pack the segmented test set
    test_save_path = './test_segments/'
    test_bunch_path = "test_bunch_bag.dat"
    word_to_bunch(test_save_path, test_bunch_path)
After running this script, the data files train_bunch_bag.dat and test_bunch_bag.dat are generated.
---------------------------------------TF-IDF weight matrix---------------------------------
Here TF = (number of times the term appears in the document) / (total number of terms in the document), and IDF = ln(total number of documents in the corpus / number of documents containing the term) + 1.
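As a quick sanity check of the formula, the toy numbers below (made up) walk through one term by hand. Note that TfidfVectorizer smooths the IDF by default (smooth_idf=True, i.e. idf = ln((1 + N) / (1 + df)) + 1) and L2-normalizes each document vector, so its values will differ slightly from this hand computation.
import math

# Made-up counts: the term appears 5 times in a 100-term document,
# and 10 of the 1000 documents in the corpus contain it.
tf = 5 / 100                    # term frequency within the document
idf = math.log(1000 / 10) + 1   # the formula above, with a natural logarithm
print(tf * idf)                 # raw TF-IDF weight, about 0.28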
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
from sklearn.utils import Bunch  # sklearn.datasets.base.Bunch in older scikit-learn releases

# Read a bunch object
def read_bunch(path):
    with open(path, "rb") as fp:
        bunch = pickle.load(fp)  # joblib can likewise be used to store model files
    return bunch

# Read a plain text file (e.g. the stop-word list)
def read_file(path):
    with open(path, "r", encoding='utf-8', errors='ignore') as fp:
        content = fp.read()
    return content

# Write a bunch object
def write_bunch(path, bunch):
    with open(path, "wb") as fp:
        pickle.dump(bunch, fp)

# Training set
def train_tfidf_space(stopword_path, train_bunch_path, train_tfidf_data):
    '''stopword_path: path of the stop-word list
    train_bunch_path: path of the training-set corpus
    train_tfidf_data: path of the training-set tfidf data'''
    bunch = read_bunch(train_bunch_path)
    stopwords = read_file(stopword_path).splitlines()  # read the stop words
    tfidf_space = Bunch(label=bunch.label, filepath=bunch.filepath, contents=bunch.contents, tdm=[], space={})
    vectorizer = TfidfVectorizer(stop_words=stopwords, sublinear_tf=True, max_df=0.5)
    # max_df: ignore terms whose document frequency is above the threshold; sublinear_tf: apply sublinear (log) TF scaling
    tfidf_space.tdm = vectorizer.fit_transform(bunch.contents)
    tfidf_space.space = vectorizer.vocabulary_
    write_bunch(train_tfidf_data, tfidf_space)

# Test set
def test_tfidf_space(stopword_path, test_bunch_path, test_tfidf_data, train_tfidf_data):
    '''stopword_path: path of the stop-word list
    test_bunch_path: path of the test-set corpus
    test_tfidf_data: path of the test-set tfidf data
    train_tfidf_data: path of the training-set tfidf data, whose word-vector space is reused for the test set'''
    bunch = read_bunch(test_bunch_path)
    stopwords = read_file(stopword_path).splitlines()  # read the stop words
    tfidf_space = Bunch(label=bunch.label, filepath=bunch.filepath, contents=bunch.contents, tdm=[], space={})
    # tdm is the weight matrix: a 2-D matrix where tdm[i][j] is the TF-IDF value of the j-th term
    # (its index in the vocabulary) in the i-th document
    train_bunch = read_bunch(train_tfidf_data)  # training-set tfidf data
    tfidf_space.space = train_bunch.space       # reuse the training-set word-vector space for the test set
    # Initialize the vector space model with TfidfVectorizer, fixing the vocabulary to the training set's
    vectorizer = TfidfVectorizer(stop_words=stopwords, sublinear_tf=True, max_df=0.5, vocabulary=train_bunch.space)
    # Convert the texts into a TF-IDF matrix over that fixed vocabulary
    tfidf_space.tdm = vectorizer.fit_transform(bunch.contents)
    # Persist the word bag
    write_bunch(test_tfidf_data, tfidf_space)

if __name__ == '__main__':
    # Training-set processing
    stopword_path = "./chinese_stop_words.txt"  # path of the stop-word list
    train_bunch_path = './train_bunch_bag.dat'
    train_tfidf_data = './train_tfidfspace.dat'
    train_tfidf_space(stopword_path, train_bunch_path, train_tfidf_data)

    # Test-set processing
    test_bunch_path = './test_bunch_bag.dat'
    test_tfidf_data = './test_tfidfspace.dat'
    test_tfidf_space(stopword_path, test_bunch_path, test_tfidf_data, train_tfidf_data)
Here space is the word-vector space (the term-to-index vocabulary), and tdm is the TF-IDF weight matrix of the training or test set.
After running this script, the data files train_tfidfspace.dat and test_tfidfspace.dat are generated.
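To check the generated files, a short snippet such as the one below (file name taken from the script above) can reload train_tfidfspace.dat and inspect what it contains:
import pickle

with open('./train_tfidfspace.dat', 'rb') as fp:
    train_space = pickle.load(fp)

# tdm is a sparse matrix: one row per training document, one column per vocabulary term
print(train_space.tdm.shape)
# space maps each term to its column index in tdm
print(len(train_space.space))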
-----------------------------------Naive Bayes classification--------------------------------------
import pickle
import warnings
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
warnings.filterwarnings("ignore")

# Read a bunch object
def read_bunch(path):
    with open(path, "rb") as fp:
        bunch = pickle.load(fp)  # joblib can likewise be used to store model files
    return bunch

# Save the classification results to a file
def save_file(save_path, content):
    with open(save_path, "a", encoding='utf-8', errors='ignore') as fp:
        fp.write(content)

# Naive Bayes classification
def nbayes_classify(train_set, test_set):
    '''train_set: training-set samples
    test_set: test-set samples
    return: predicted labels for the test set'''
    clf = MultinomialNB(alpha=0.5)  # alpha: Laplace smoothing parameter, default 1.0
    clf.fit(train_set.tdm, train_set.label)  # train the model
    predict = clf.predict(test_set.tdm)
    return predict

def classification_result(actual, predict):
    print('Precision: {0:.3f}'.format(metrics.precision_score(actual, predict, average='weighted')))
    print('Recall: {0:.3f}'.format(metrics.recall_score(actual, predict, average='weighted')))
    print('F1-score: {0:.3f}'.format(metrics.f1_score(actual, predict, average='weighted')))

if __name__ == '__main__':
    # Load the training set
    train_path = './train_tfidfspace.dat'
    train_set = read_bunch(train_path)

    # Load the test set
    test_path = "./test_tfidfspace.dat"
    test_set = read_bunch(test_path)

    predict = nbayes_classify(train_set, test_set)
    classification_result(test_set.label, predict)
    print('-' * 100)

    # Path for saving the results
    save_path = './classify_file.txt'
    for label, filename, pred in zip(test_set.label, test_set.filepath, predict):
        print(filename, "\tactual category:", label, "\t--> predicted category:", pred)
        save_content = filename + "\tactual category: " + label + "\t--> predicted category: " + pred + '\n'
        save_file(save_path, save_content)  # write the classification results to a txt file
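The comments above mention that joblib (or pickle) can also be used to store model files; a minimal sketch along those lines is shown below, assuming the .dat files produced earlier and a hypothetical nbayes_model.dat file name, so the classifier can be reused without retraining.
import pickle
from sklearn.naive_bayes import MultinomialNB

def read_bunch(path):
    with open(path, "rb") as fp:
        return pickle.load(fp)

train_set = read_bunch('./train_tfidfspace.dat')
test_set = read_bunch('./test_tfidfspace.dat')

# Fit once and save the classifier for later reuse
clf = MultinomialNB(alpha=0.5).fit(train_set.tdm, train_set.label)
with open('./nbayes_model.dat', 'wb') as fp:  # hypothetical file name
    pickle.dump(clf, fp)

# Later: reload the model and classify any TF-IDF matrix built with the same vocabulary
with open('./nbayes_model.dat', 'rb') as fp:
    clf = pickle.load(fp)
print(clf.predict(test_set.tdm))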
Looking back over the modeling workflow, classifying and tagging the articles involves four main steps:
1) Segment the article text and extract each article's topic tags;
2) Build the bag-of-words representation of the articles;
3) Compute each article's word-space vector with TF-IDF;
4) Classify the test-set articles with the naive Bayes algorithm.
