
Classifying and Auto-Tagging Articles with Python -- NLP


The main steps for classifying, tagging, and modelling the articles are as follows (a compact end-to-end sketch follows the list):

1) Segment both the already-labelled text set (training set) and the to-be-labelled text set (test set), splitting long sentences into individual tokens;

2) Collect the tokens from step 1 into a word bag, forming the bag-of-words representation;

3) Apply the TF-IDF algorithm to compute the TF-IDF weight matrix of every article in the training set;

4) Train a Naive Bayes classifier on the training-set data, then use the fitted model to classify the test-set data.
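Before walking through each step, here is a compact end-to-end sketch of the same idea built from scikit-learn's Pipeline. The toy documents, labels, and the pipeline itself are illustrative assumptions, not the code used below, which keeps every stage separate.

import jieba
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Toy, already-segmented training documents and their categories (made up for illustration)
train_docs = ["央行 发布 最新 货币 政策", "球队 赢得 比赛 冠军"]
train_labels = ["finance", "sports"]

model = Pipeline([
    ("tfidf", TfidfVectorizer()),      # bag of words + TF-IDF weighting (steps 2 and 3)
    ("nb", MultinomialNB(alpha=0.5)),  # Naive Bayes classifier (step 4)
])
model.fit(train_docs, train_labels)

# Step 1: segment a new article with jieba before predicting
new_doc = " ".join(jieba.cut("央行调整利率"))
print(model.predict([new_doc]))  # expected: ['finance']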

Original data source:

------------------------------------- Text Segmentation -------------------------------------

Word segmentation

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import jieba
import jieba.analyse  # keyword extraction

# Segment both the training-set and the test-set texts, and tag the
# test-set articles with their topic keywords.

# Save content to a file
def save_file(save_path, content):
    with open(save_path, "a", encoding='utf-8', errors='ignore') as fp:
        fp.write(content)

# Read a file
def read_file(file_path):
    with open(file_path, "r", encoding='utf-8', errors='ignore') as fp:
        content = fp.readlines()
    print(content)
    return str(content)

# Extract the topic keywords of an article
def extract_theme(content):
    themes = []
    tags = jieba.analyse.extract_tags(content, topK=3, withWeight=True,
                                      allowPOS=['n', 'ns', 'v', 'vn'], withFlag=True)
    # content: the text to analyse
    # topK: how many of the highest TF-IDF-weighted keywords to return (default 20)
    # withWeight: also return the keyword weights (default False)
    # allowPOS: keep only the listed parts of speech (default empty, i.e. no filtering)
    # withFlag: return the part-of-speech flag together with each word
    for i in tags:
        themes.append(i[0].word)
    return str(themes)

def cast_words(origin_path, save_path, theme_tag):
    '''
    origin_path: directory of the raw texts
    save_path: directory for the segmented texts
    theme_tag: directory for the topic-keyword tags (None for the training set)
    '''
    if theme_tag is not None and not os.path.exists(theme_tag):
        os.makedirs(theme_tag)  # make sure the tag directory exists
    file_lists = os.listdir(origin_path)  # os.listdir returns every entry in the raw-text directory
    for dir_1 in file_lists:  # each category folder
        file_path = origin_path + dir_1 + "/"  # raw files of this category
        seg_path = save_path + dir_1 + "/"     # segmented files of this category
        if not os.path.exists(seg_path):
            os.makedirs(seg_path)  # create the output folder
        detail_paths = os.listdir(file_path)
        for detail_path in detail_paths:  # each document in the folder
            full_path = file_path + detail_path  # path of one raw document
            file_content = read_file(full_path)
            file_content = file_content.strip()                  # drop leading/trailing whitespace
            file_content = file_content.replace(r"\u3000", "")   # drop escaped full-width spaces
            file_content = file_content.replace(r"& nbsp", "")   # HTML space placeholder
            file_content = file_content.replace("'", "")
            file_content = file_content.replace(" \ n ", "")
            content_seg = " ".join(jieba.cut(file_content))  # segment the document (joined once so it can be reused below)
            if theme_tag is not None:
                print("File path: {} ".format(theme_tag + detail_path))
                theme = extract_theme(content_seg)  # topic keywords of this article
                print("Topic keywords: {} ".format(theme))
                save_file(theme_tag + detail_path, theme)  # save the topic keywords to the tag directory
            save_file(seg_path + detail_path, content_seg)  # save the segmented text to the output directory

if __name__ == "__main__":
    # Segment the training set
    train_words_path = './train_words/'    # input directory
    train_save_path = './train_segments/'  # output directory
    cast_words(train_words_path, train_save_path, theme_tag=None)

    # Segment the test set and extract topic tags
    train_words_path = './test_words/'     # input directory
    train_save_path = './test_segments/'   # output directory
    theme_tag_path = './theme_tag/'        # directory for the test-set topic tags
    cast_words(train_words_path, train_save_path, theme_tag=theme_tag_path)

After the program runs, the raw txt documents under the training-set and test-set folders are segmented, and the segmented txt files are written into the newly created folders.
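For reference, a minimal sketch of what the two jieba calls used above return; the sample sentence is made up and the exact segmentation depends on jieba's dictionary.

import jieba
import jieba.analyse

text = "自然语言处理是人工智能的一个重要方向"

# jieba.cut returns a generator of tokens
print(" ".join(jieba.cut(text)))

# jieba.analyse.extract_tags returns the top TF-IDF-weighted keywords,
# here restricted to nouns, place nouns, verbs and verbal nouns
tags = jieba.analyse.extract_tags(text, topK=3, withWeight=True, allowPOS=['n', 'ns', 'v', 'vn'])
for word, weight in tags:
    print(word, round(weight, 3))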

--------------------------------------- Data Structuring ---------------------------------------

To make it easy to build the term vector space model later, the segmented texts need to be converted into vectorizable, object-like records. Scikit-learn's Bunch class inherits from dict: an object is created from keyword arguments and stores its data as key/value pairs that can also be accessed as attributes.
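As a quick illustration of how Bunch behaves, a minimal sketch (the sample values are made up); the same key can be reached either as an attribute or as a dict entry.

from sklearn.utils import Bunch  # older scikit-learn: from sklearn.datasets.base import Bunch

bunch = Bunch(label=[], filepath=[], contents=[])
bunch.label.append("finance")                                  # attribute access ...
bunch["filepath"].append("./train_segments/finance/0001.txt")  # ... and dict access hit the same data
bunch.contents.append("央行 发布 最新 货币 政策")
print(bunch.keys())  # dict_keys(['label', 'filepath', 'contents'])
print(bunch.label)   # ['finance']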

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os
import pickle
import time
from sklearn.utils import Bunch  # in older scikit-learn versions: from sklearn.datasets.base import Bunch

# label: article category
# filepath: article path
# contents: segmented article text

def read_file(file_path):
    with open(file_path, "r", encoding='utf-8', errors='ignore') as fp:
        content = fp.readlines()
    return str(content)

def word_to_bunch(train_save_path, train_bunch_path):
    bunch = Bunch(label=[], filepath=[], contents=[])
    all_labels = os.listdir(train_save_path)
    for label in all_labels:  # each category folder
        detail_path = train_save_path + label + '/'
        all_details = os.listdir(detail_path)
        for all_detail in all_details:  # each document in the folder
            file_detail_path = detail_path + all_detail  # full path of one file
            bunch.label.append(label)  # article category
            print(bunch.label)
            bunch.filepath.append(file_detail_path)  # article path
            print(bunch.filepath)
            contents = read_file(file_detail_path)
            print(contents)  # article content
            bunch.contents.append(contents)
            print(bunch.contents)
    with open(train_bunch_path, "wb+") as fp:
        pickle.dump(bunch, fp)
    print("Bunch file created")

if __name__ == "__main__":
    # Build and save the training-set bunch
    train_save_path = './train_segments/'
    train_bunch_path = "train_bunch_bag.dat"
    word_to_bunch(train_save_path, train_bunch_path)

    # Build and save the test-set bunch
    test_save_path = './test_segments/'
    test_bunch_path = "test_bunch_bag.dat"
    word_to_bunch(test_save_path, test_bunch_path)

Running the program produces the train_bunch_bag.dat and test_bunch_bag.dat data files.
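The generated files can be loaded back with pickle for a quick sanity check; a minimal sketch (the printed values depend on your corpus):

import pickle

with open("train_bunch_bag.dat", "rb") as fp:
    bunch = pickle.load(fp)

print(len(bunch.label))  # number of articles
print(set(bunch.label))  # the category names, i.e. the folder names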

--------------------------------------- TF-IDF Weight Matrix ---------------------------------------

Here TF = (number of occurrences of the term in the document / total number of terms in the document), and IDF = ln(total number of documents in the corpus / number of documents containing the term) + 1.
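As a quick check of the formula, a minimal sketch on a made-up three-document corpus; note that scikit-learn's TfidfVectorizer uses the raw term count as TF rather than the proportional form above, while its IDF matches ln(N/df) + 1 when smooth_idf=False.

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

# Three toy documents, already space-separated like the segmented files
docs = ["央行 发布 货币 政策",
        "央行 调整 利率",
        "球队 赢得 比赛"]

vec = TfidfVectorizer(smooth_idf=False, norm=None)
tdm = vec.fit_transform(docs)

# '央行' appears in 2 of the 3 documents, so its IDF should be ln(3/2) + 1
terms = vec.get_feature_names_out()  # use get_feature_names() on older scikit-learn
print(dict(zip(terms, vec.idf_)))
print(np.log(3 / 2) + 1)  # ≈ 1.405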

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import pickle
from sklearn.utils import Bunch  # in older scikit-learn versions: from sklearn.datasets.base import Bunch

# Read a bunch object
def read_bunch(path):
    with open(path, "rb") as fp:
        bunch = pickle.load(fp)  # joblib could also be used to store model files
    return bunch

# Read a plain text file (used for the stop-word list)
def read_file(path):
    with open(path, "r", encoding='utf-8', errors='ignore') as fp:
        content = fp.read()
    return content

# Write a bunch object
def write_bunch(path, bunch):
    with open(path, "wb") as fp:
        pickle.dump(bunch, fp)

# Training set
def train_tfidf_space(stopword_path, train_bunch_path, train_tfidf_data):
    '''
    stopword_path: path of the stop-word list
    train_bunch_path: path of the training-set bunch
    train_tfidf_data: output path of the training-set tfidf data
    '''
    bunch = read_bunch(train_bunch_path)
    stopwords = read_file(stopword_path).splitlines()  # load the stop words
    tfidf_space = Bunch(label=bunch.label, filepath=bunch.filepath, contents=bunch.contents, tdm=[], space={})
    vectorizer = TfidfVectorizer(stop_words=stopwords, sublinear_tf=True, max_df=0.5)
    # max_df: ignore terms whose document frequency is above the given threshold
    # sublinear_tf: apply sublinear (logarithmic) scaling to the term frequency
    tfidf_space.tdm = vectorizer.fit_transform(bunch.contents)
    tfidf_space.space = vectorizer.vocabulary_
    write_bunch(train_tfidf_data, tfidf_space)

# Test set
def test_tfidf_space(stopword_path, test_bunch_path, test_tfidf_data, train_tfidf_data):
    '''
    stopword_path: path of the stop-word list
    test_bunch_path: path of the test-set bunch
    test_tfidf_data: output path of the test-set tfidf data
    train_tfidf_data: path of the training-set tfidf data; the training-set
                      vocabulary (term vector space) is reused for the test set
    '''
    bunch = read_bunch(test_bunch_path)
    stopwords = read_file(stopword_path).splitlines()  # load the stop words
    tfidf_space = Bunch(label=bunch.label, filepath=bunch.filepath, contents=bunch.contents, tdm=[], space={})
    # tdm is the weight matrix: tdm[i][j] is the TF-IDF value of the j-th term
    # (its index in the vocabulary) in the i-th document
    train_bunch = read_bunch(train_tfidf_data)  # training-set tfidf data
    tfidf_space.space = train_bunch.space  # reuse the training-set vocabulary for the test set
    # Initialise the vector space model with TfidfVectorizer
    vectorizer = TfidfVectorizer(stop_words=stopwords, sublinear_tf=True, max_df=0.5,
                                 vocabulary=train_bunch.space)
    # Convert the texts into a TF-IDF matrix over the shared vocabulary
    tfidf_space.tdm = vectorizer.fit_transform(bunch.contents)
    # Persist the bag-of-words object
    write_bunch(test_tfidf_data, tfidf_space)

if __name__ == '__main__':
    # Training-set data
    stopword_path = "./chinese_stop_words.txt"  # path of the stop-word list
    train_bunch_path = './train_bunch_bag.dat'
    train_tfidf_data = './train_tfidfspace.dat'
    train_tfidf_space(stopword_path, train_bunch_path, train_tfidf_data)

    # Test-set data
    test_bunch_path = './test_bunch_bag.dat'
    test_tfidf_data = './test_tfidfspace.dat'
    test_tfidf_space(stopword_path, test_bunch_path, test_tfidf_data, train_tfidf_data)

Here, space holds the term-to-index mapping of the vector space, and tdm holds the TF-IDF weight matrices of the training and test data.

Running the program produces the train_tfidfspace.dat and test_tfidfspace.dat data files.
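To verify the shared vocabulary, the two saved objects can be loaded and their shapes compared; a minimal sketch (the exact numbers depend on the corpus):

import pickle

with open("train_tfidfspace.dat", "rb") as fp:
    train_space = pickle.load(fp)
with open("test_tfidfspace.dat", "rb") as fp:
    test_space = pickle.load(fp)

print(train_space.tdm.shape)   # (number of training documents, vocabulary size)
print(test_space.tdm.shape)    # same number of columns, because the vocabulary is shared
print(len(train_space.space))  # vocabulary size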

----------------------------------- Naive Bayes Classification -----------------------------------

import pickle
from sklearn.naive_bayes import MultinomialNB
import warnings
from sklearn import metrics

warnings.filterwarnings("ignore")

# Read a bunch object
def read_bunch(path):
    with open(path, "rb") as fp:
        bunch = pickle.load(fp)  # joblib could also be used to store model files
    return bunch

# Save the classification results to a file
def save_file(save_path, content):
    with open(save_path, "a", encoding='utf-8', errors='ignore') as fp:
        fp.write(content)

# Naive Bayes classification
def nbayes_classify(train_set, test_set):
    '''
    train_set: training-set samples
    test_set: test-set samples
    return: predicted categories for the test set
    '''
    clf = MultinomialNB(alpha=0.5)  # alpha: Laplace smoothing parameter, default 1.0
    clf.fit(train_set.tdm, train_set.label)  # train the model
    predict = clf.predict(test_set.tdm)
    return predict

def classification_result(actual, predict):
    print('Precision: {0:.3f}'.format(metrics.precision_score(actual, predict, average='weighted')))
    print('Recall: {0:0.3f}'.format(metrics.recall_score(actual, predict, average='weighted')))
    print('F1-score: {0:.3f}'.format(metrics.f1_score(actual, predict, average='weighted')))

if __name__ == '__main__':
    # Load the training set
    train_path = './train_tfidfspace.dat'
    train_set = read_bunch(train_path)

    # Load the test set
    test_path = "./test_tfidfspace.dat"
    test_set = read_bunch(test_path)

    predict = nbayes_classify(train_set, test_set)
    classification_result(test_set.label, predict)
    print('-' * 100)

    # Path for the saved results
    save_path = './classify_file.txt'
    for label, filename, pred in zip(test_set.label, test_set.filepath, predict):
        print(filename, "\tactual category:", label, "\t--> predicted category:", pred)
        save_content = filename + "\tactual category: " + label + "\t--> predicted category: " + pred + '\n'
        save_file(save_path, save_content)  # write the classification results to a txt file
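If a per-category breakdown is wanted in addition to the weighted averages printed above, scikit-learn's classification_report can be called on the same arrays; a minimal sketch:

from sklearn import metrics

# test_set.label and predict come from the script above
print(metrics.classification_report(test_set.label, predict))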

Looking back at the modelling workflow, classifying and tagging articles involves four main steps:

1) Segment the article texts and extract each article's topic tags;

2) Build the bag-of-words representation of the articles;

3) Compute the articles' term-space vectors with TF-IDF;

4) Classify the test-set articles with the Naive Bayes algorithm.
