A Study of Text Sentiment Classification with TextCNN
This project uses a TextCNN model to classify the sentiment of Weibo comments (positive / negative / neutral). It is implemented in the following steps:
Data preprocessing:
- Load the labelled Weibo comment data and clean it.
- Use a regular expression to extract the Chinese characters of each comment, keep only comments with more than 20 Chinese characters, and tokenize at the character level.
- Convert the labels to non-negative class indices (negative -1 → 0, neutral 0 → 1, positive 1 → 2); a minimal sketch of this filter and label shift follows this list.
- Randomly sample 5,000 comments for the experiment and build the vocabulary from them (all characters that occur more than once).
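As a minimal illustration of the cleaning filter and label shift described above (the helper name and example sentence are illustrative; the full data_process() below additionally enforces per-class quotas):

import re

def keep_comment(text, label, min_chinese_chars=20):
    """Return (text, shifted_label) if the comment has enough Chinese characters, else None.

    The label is shifted from {-1, 0, 1} to {0, 1, 2} so that CrossEntropyLoss
    later sees non-negative class indices.
    """
    chinese = re.findall(u'[\u4e00-\u9fa5]', str(text))
    if len(chinese) <= min_chinese_chars:
        return None
    return text, int(label) + 1

print(keep_comment("这条微博的中文内容足够长,可以作为一条训练样本保留下来", 1))  # prints (the original text, 2)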
Vocabulary construction:
- Count character frequencies over the sampled comments and write the vocabulary to vocab.txt (with <unk> and <pad> tokens at the top).
- Assign each entry a unique index and build the reverse mapping (id to character).
Model construction: the TextCNN network is built with PyTorch:
- An embedding layer maps token ids to embedding vectors.
- A convolution layer extracts local context features.
- A max-pooling layer reduces the dimensionality and keeps the strongest response per filter.
- A fully connected layer performs the three-class classification.
Training and evaluation:
- Define the optimizer (Adam), the loss function (CrossEntropyLoss), and the evaluation metric (accuracy).
- Train on the randomly split training portion and evaluate on the held-out test portion after every epoch.
- Plot the training and test accuracy curves to monitor convergence.
Potential improvements:
- The accuracy curves visualize how performance changes across training stages.
- TextCNN is a reasonable architecture for this sentiment-analysis task, and performance can likely be improved further by tuning hyperparameters such as the embedding size, filter configuration, learning rate, and sequence length.
The overall pipeline is clear and well structured, although the preprocessing stage still leaves room for refinement (for example, word-level segmentation with jieba instead of the character-level vocabulary used here).
This post demonstrates TextCNN-based sentiment analysis of Weibo comment text and provides a complete code implementation for reference.

# http://www.hengsblog.com/2021/02/14/TextCnn_base/
import random
import re
import warnings
from collections import Counter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

warnings.filterwarnings('ignore')
plt.rcParams["font.sans-serif"] = ['SimHei']   # allow Chinese characters in plots
plt.rcParams["axes.unicode_minus"] = False
# Data preprocessing: load the labelled corpus, filter comments, and shift the labels
def data_process():
    data = pd.read_csv("nCoV_100k_train.labled.csv")
    data_sorce = data['微博中文内容'].values
    data_label = data['情感倾向'].values
    train_text_data_0, train_text_data_0_label = [], []
    train_text_data_1, train_text_data_1_label = [], []
    train_text_data__1, train_text_data__1_label = [], []
    sum_idx = 0  # counter of collected samples
    for idx, line in enumerate(data_sorce):
        if str(data_label[idx]) == '0':
            if len(train_text_data_0) < 11800:
                line1 = re.findall(u'[\u4e00-\u9fa5]', str(line))
                if len(line1) > 20:
                    sum_idx += 1
                    train_text_data_0.append(line)
                    train_text_data_0_label.append(int(data_label[idx]) + 1)
        if str(data_label[idx]) == '1':
            if len(train_text_data_1) < 11800:
                line1 = re.findall(u'[\u4e00-\u9fa5]', str(line))
                if len(line1) > 20:
                    sum_idx += 1
                    train_text_data_1.append(line)
                    train_text_data_1_label.append(int(data_label[idx]) + 1)
        if str(data_label[idx]) == '-1':
            if len(train_text_data__1) < 11800:
                line1 = re.findall(u'[\u4e00-\u9fa5]', str(line))
                if len(line1) > 20:
                    sum_idx += 1
                    train_text_data__1.append(line)
                    train_text_data__1_label.append(int(data_label[idx]) + 1)  # shift so no negative class index appears
        if sum_idx == 35000:
            break
    train_text_data = train_text_data_0 + train_text_data_1 + train_text_data__1
    train_text_data_label = train_text_data_0_label + train_text_data_1_label + train_text_data__1_label
    # Shuffle before slicing so the 5,000-comment subset mixes all three classes
    combined = list(zip(train_text_data, train_text_data_label))
    random.seed(42)  # fixed seed for reproducibility
    random.shuffle(combined)
    train_text_data, train_text_data_label = zip(*combined)
    print(Counter(train_text_data_label))
    return list(train_text_data[:5000]), list(train_text_data_label[:5000])

train_text_data, train_text_data_label = data_process()
# Build the character-level vocabulary (all characters that occur more than once)
vocab = [ch for ch, freq in Counter(ch for text in train_text_data for ch in text).most_common() if freq > 1]
with open("vocab.txt", 'w', encoding='utf-8') as fout:
    fout.write("<unk>\n")
    fout.write("<pad>\n")
    for ch in vocab:
        fout.write(ch + "\n")

# Initialize the vocabulary lookup tables
with open("vocab.txt", encoding='utf-8') as fin:
    vocab = [line.strip() for line in fin]
char2idx = {ch: index for index, ch in enumerate(vocab)}
idx2char = {index: ch for index, ch in enumerate(vocab)}
vocab_size = len(vocab)
pad_id = char2idx["<pad>"]
unk_id = char2idx["<unk>"]
sequence_length = 385  # fixed input length: shorter comments are padded, longer ones truncated
# Convert each comment to a fixed-length sequence of character ids
def tokenizer():
    inputs = []
    sentence_char = [[ch for ch in text] for text in train_text_data]
    # Pad (or truncate) every sequence to sequence_length
    for index, chars in enumerate(sentence_char):
        temp = [char2idx.get(ch, unk_id) for ch in chars]
        if len(temp) < sequence_length:
            temp.extend([pad_id] * (sequence_length - len(temp)))
        else:
            temp = temp[:sequence_length]
        inputs.append(temp)
    return inputs

data_input = tokenizer()
# Hyperparameters
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
Embedding_size = 100
Batch_Size = 16
Kernel = 3          # unused below; the convolution kernel height is hard-coded to 2
Filter_num = 10
Epoch = 10
Dropout = 0.5
Learning_rate = 1e-3
class TextCNNDataSet(Dataset):
    def __init__(self, data_inputs, data_targets):
        self.inputs = torch.LongTensor(data_inputs)
        self.label = torch.LongTensor(data_targets)

    def __getitem__(self, index):
        return self.inputs[index], self.label[index]

    def __len__(self):
        return len(self.inputs)

dataset = TextCNNDataSet(data_input, list(train_text_data_label))
train_size = int(len(data_input) * 0.8)
test_size = len(data_input) - train_size
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])
TrainDataLoader = DataLoader(train_dataset, batch_size=Batch_Size, shuffle=True)
TestDataLoader = DataLoader(test_dataset, batch_size=Batch_Size, shuffle=True)
# nn.Conv2d(in_channels, out_channels, kernel_size)
num_classes = 3

class TextCNN(nn.Module):
    def __init__(self):
        super(TextCNN, self).__init__()
        self.W = nn.Embedding(vocab_size, embedding_dim=Embedding_size)
        out_channel = Filter_num
        self.conv = nn.Sequential(
            nn.Conv2d(1, out_channel, (2, Embedding_size)),  # kernel size is 2 x Embedding_size
            nn.ReLU(),
            nn.MaxPool2d((sequence_length - 1, 1)),
        )
        self.dropout = nn.Dropout(Dropout)
        self.fc = nn.Linear(out_channel, num_classes)

    def forward(self, X):
        batch_size = X.shape[0]
        embedding_X = self.W(X)                 # [batch_size, sequence_length, embedding_size]
        embedding_X = embedding_X.unsqueeze(1)  # add channel dim: [batch_size, 1, sequence_length, embedding_size]
        conved = self.conv(embedding_X)         # [batch_size, out_channel, 1, 1]
        conved = self.dropout(conved)
        flatten = conved.view(batch_size, -1)   # [batch_size, out_channel]
        output = self.fc(flatten)
        return output
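# Illustrative shape walkthrough for one batch through TextCNN (assuming the
# hyperparameters above: sequence_length = 385, Embedding_size = 100, Filter_num = 10):
#   input ids                      [batch, 385]
#   after embedding + unsqueeze    [batch, 1, 385, 100]
#   after Conv2d(1, 10, (2, 100))  [batch, 10, 384, 1]
#   after MaxPool2d((384, 1))      [batch, 10, 1, 1]
#   after view(batch, -1)          [batch, 10]
#   after fc                       [batch, 3]   (one logit per sentiment class)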
model = TextCNN().to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=Learning_rate)

def binary_acc(pred, y):
    """Compute accuracy (despite the name, this also works for multi-class labels).

    :param pred: predicted class indices
    :param y: ground-truth class indices
    :return: accuracy as a float
    """
    correct = torch.eq(pred, y).float()
    acc = correct.sum() / len(correct)
    return acc.item()
def train():
    """Run one epoch over the training split and return the mean batch accuracy."""
    avg_acc = []
    model.train()
    for index, (batch_x, batch_y) in enumerate(TrainDataLoader):
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        pred = model(batch_x)
        loss = criterion(pred, batch_y)
        acc = binary_acc(torch.max(pred, dim=1)[1], batch_y)
        avg_acc.append(acc)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    avg_acc = np.array(avg_acc).mean()
    return avg_acc

def evaluate():
    """Evaluate the current model on the test split and return its mean accuracy."""
    avg_acc = []
    model.eval()  # switch to evaluation mode
    with torch.no_grad():
        for x_batch, y_batch in TestDataLoader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            pred = model(x_batch)
            acc = binary_acc(torch.max(pred, dim=1)[1], y_batch)
            avg_acc.append(acc)
    return np.array(avg_acc).mean()
# Training cycle
model_train_acc, model_test_acc = [], []
for epoch in range(Epoch):
    train_acc = train()
    test_acc = evaluate()
    print("epoch = {}, train accuracy = {}".format(epoch + 1, train_acc))
    print("epoch = {}, test accuracy = {}".format(epoch + 1, test_acc))
    model_train_acc.append(train_acc)
    model_test_acc.append(test_acc)

plt.plot(model_train_acc)
plt.plot(model_test_acc)
plt.ylim(0.5, 1.01)
plt.title("The accuracy of the TextCNN model")
plt.legend(['train', 'test'])
plt.show()
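With the model trained, a minimal inference sketch looks as follows (assuming the model, vocabulary, and constants defined above; the helper name, example sentence, and label strings are illustrative):

def predict_sentiment(text):
    # Encode the text character by character, exactly as tokenizer() does for training data
    ids = [char2idx.get(ch, unk_id) for ch in text][:sequence_length]
    ids = ids + [pad_id] * (sequence_length - len(ids))
    x = torch.LongTensor([ids]).to(device)
    model.eval()
    with torch.no_grad():
        pred = torch.argmax(model(x), dim=1).item()
    # Undo the label shift used in data_process(): 0 -> negative, 1 -> neutral, 2 -> positive
    return {0: "negative", 1: "neutral", 2: "positive"}[pred]

print(predict_sentiment("今天的天气真好,心情很愉快"))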