
A Study of Text Sentiment Classification with TextCNN


The project applies a TextCNN model to classify the sentiment of Weibo comments (positive / negative / neutral). It is implemented in the following steps:

Data preprocessing (illustrated by the sketch after this overview):

  • Load and clean the Weibo comment data.
  • Extract the Chinese characters with a regular expression and tokenize at the character level.
  • Convert the raw labels (-1: negative, 0: neutral, 1: positive) to non-negative class ids by adding 1.
  • Keep the first 5,000 of the filtered comments as the working set.

Vocabulary construction:

  • Count character frequencies over the training texts and keep every character that occurs more than once.
  • Assign each character a unique index and create the inverse mapping (id to character).

Model construction (traced shape-by-shape in the sketch before the listing):

  • Build the TextCNN network with PyTorch:
  • The embedding layer maps the text to embedding vectors.
  • The convolution layer extracts local context features.
  • The max-pooling layer reduces dimensionality and keeps the strongest feature per filter.
  • The fully connected layer performs the three-way classification.

Training and evaluation:

  • Define the optimizer (Adam), the loss function (CrossEntropyLoss), and the evaluation metric (accuracy).
  • Train on a random 80/20 split and evaluate on the held-out test set after every epoch.
  • Plot the training and test accuracy curves to monitor convergence.

Potential improvements:

  • The accuracy curves show how performance changes across training stages.
  • TextCNN is a suitable architecture for sentiment analysis, and tuning its hyperparameters is likely to improve performance further.

The overall pipeline is clear and well structured, but the data preprocessing stage still has rough edges; for example, slicing the first 5,000 comments from the class-ordered concatenation can leave the working set dominated by a single class.
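
To make the preprocessing and vocabulary steps concrete, here is a minimal self-contained sketch on a two-comment toy corpus. The variable names and toy data are illustrative, not from the project code, but the regex, label shift, and character-level padding mirror the listing below:

import re
from collections import Counter

# Toy stand-ins for the Weibo comments (illustrative only).
comments = ["今天天气真好!", "这个产品太差了..."]
raw_labels = [1, -1]  # 1: positive, 0: neutral, -1: negative

# Keep only Chinese characters, using the same regex as the project.
cleaned = ["".join(re.findall(u'[\u4e00-\u9fa5]', c)) for c in comments]

# Shift labels by +1 so they are non-negative class ids: {-1, 0, 1} -> {0, 1, 2}.
labels = [l + 1 for l in raw_labels]

# Character-level vocabulary with <unk> and <pad> reserved at ids 0 and 1,
# matching the order written to vocab.txt in the listing.
vocab = ["<unk>", "<pad>"] + [ch for ch, _ in Counter("".join(cleaned)).most_common()]
char2idx = {ch: i for i, ch in enumerate(vocab)}

# Encode each comment as character ids, truncated or padded to a fixed length.
seq_len = 10
encoded = [[char2idx.get(ch, 0) for ch in c][:seq_len]
           + [1] * max(0, seq_len - len(c)) for c in cleaned]
print(labels)   # [2, 0]
print(encoded)  # two id lists of length 10, padded with the <pad> id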

The post demonstrates TextCNN-based sentiment analysis of Weibo comment text and provides a complete code implementation for reference.
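
As a sanity check on the architecture sketched above, the following snippet traces tensor shapes through an equivalent forward pass. The layers are freshly initialized stand-ins used purely for shape inspection, with the same hyperparameters as the listing below (batch 16, sequence length 385, embedding size 100, 10 filters, height-2 kernel):

import torch
import torch.nn as nn

x = torch.randint(0, 5000, (16, 385))           # a batch of token ids
emb = nn.Embedding(5000, 100)(x)                # -> [16, 385, 100]
emb = emb.unsqueeze(1)                          # -> [16, 1, 385, 100], add channel dim
conv = nn.Conv2d(1, 10, (2, 100))(emb)          # -> [16, 10, 384, 1], one score per bigram window
pooled = nn.MaxPool2d((385 - 1, 1))(conv)       # -> [16, 10, 1, 1], max over all 384 windows
logits = nn.Linear(10, 3)(pooled.view(16, -1))  # -> [16, 3] class scores
print(conv.shape, pooled.shape, logits.shape)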

# http://www.hengsblog.com/2021/02/14/TextCnn_base/

import re
import warnings
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch import optim
from torch.utils.data import random_split

warnings.filterwarnings('ignore')

# Allow Chinese characters to render in matplotlib figures.
plt.rcParams["font.sans-serif"] = ['SimHei']
plt.rcParams["axes.unicode_minus"] = False

def data_process():
    """Load the labeled Weibo comments and pick a subset for the experiment."""
    data = pd.read_csv("nCoV_100k_train.labled.csv")
    data_source = data['微博中文内容'].values
    data_label = data['情感倾向'].values

    # Collect up to 11800 comments per class; a comment is kept only if it
    # contains more than 20 Chinese characters.
    neutral_texts, neutral_labels = [], []    # raw label 0
    positive_texts, positive_labels = [], []  # raw label 1
    negative_texts, negative_labels = [], []  # raw label -1

    sum_idx = 0  # running count of kept comments
    for idx, line in enumerate(data_source):
        label = str(data_label[idx])
        if label == '0' and len(neutral_texts) < 11800:
            chars = re.findall(u'[\u4e00-\u9fa5]', str(line))
            if len(chars) > 20:
                sum_idx += 1
                neutral_texts.append(line)
                # Shift labels by +1 so no class id is negative.
                neutral_labels.append(int(data_label[idx]) + 1)
        if label == '1' and len(positive_texts) < 11800:
            chars = re.findall(u'[\u4e00-\u9fa5]', str(line))
            if len(chars) > 20:
                sum_idx += 1
                positive_texts.append(line)
                positive_labels.append(int(data_label[idx]) + 1)
        if label == '-1' and len(negative_texts) < 11800:
            chars = re.findall(u'[\u4e00-\u9fa5]', str(line))
            if len(chars) > 20:
                sum_idx += 1
                negative_texts.append(line)
                negative_labels.append(int(data_label[idx]) + 1)
        if sum_idx == 35000:
            break

    train_text_data = neutral_texts + positive_texts + negative_texts
    train_text_data_label = neutral_labels + positive_labels + negative_labels
    print(Counter(train_text_data_label))
    # Note: this takes the first 5000 of the class-ordered concatenation,
    # so the subset is not guaranteed to be balanced.
    return train_text_data[:5000], train_text_data_label[:5000]


train_text_data, train_text_data_label = data_process()

# Build a character-level vocabulary from the training texts, keeping
# characters that occur more than once; <unk> and <pad> come first.
with open("vocab.txt", 'w', encoding='utf-8') as fout:
    fout.write("<unk>\n")
    fout.write("<pad>\n")
    vocab = [char for char, freq in
             Counter(ch for text in train_text_data for ch in text).most_common()
             if freq > 1]
    for char in vocab:
        fout.write(char + "\n")

# Reload the vocabulary and build the char <-> index mappings.
with open("vocab.txt", encoding='utf-8') as fin:
    vocab = [line.strip() for line in fin]
char2idx = {char: index for index, char in enumerate(vocab)}
idx2char = {index: char for index, char in enumerate(vocab)}
vocab_size = len(vocab)
pad_id = char2idx["<pad>"]
unk_id = char2idx["<unk>"]

sequence_length = 385

def tokenizer():
    """Convert each comment into a fixed-length sequence of character ids."""
    inputs = []
    sentence_char = [[ch for ch in text] for text in train_text_data]
    for index, sentence in enumerate(sentence_char):
        temp = [char2idx.get(ch, unk_id) for ch in sentence]
        if len(temp) < sequence_length:
            # Pad short sequences up to sequence_length.
            temp.extend([pad_id] * (sequence_length - len(temp)))
        else:
            # Truncate long sequences.
            temp = temp[:sequence_length]
        inputs.append(temp)
    return inputs

data_input = tokenizer()

import torch.utils.data as Data

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters
Embedding_size = 100
Batch_Size = 16
Kernel = 3        # unused below: the convolution hard-codes a height-2 kernel
Filter_num = 10   # number of convolution filters
Epoch = 10
Dropout = 0.5
Learning_rate = 1e-3

class TextCNNDataSet(Data.Dataset):
    def __init__(self, data_inputs, data_targets):
        self.inputs = torch.LongTensor(data_inputs)
        self.label = torch.LongTensor(data_targets)

    def __getitem__(self, index):
        return self.inputs[index], self.label[index]

    def __len__(self):
        return len(self.inputs)


# Random 80/20 split into training and test sets.
full_dataset = TextCNNDataSet(data_input, list(train_text_data_label))
train_size = int(len(data_input) * 0.8)
test_size = len(data_input) - train_size
train_dataset, test_dataset = random_split(full_dataset, [train_size, test_size])

TrainDataLoader = Data.DataLoader(train_dataset, batch_size=Batch_Size, shuffle=True)
TestDataLoader = Data.DataLoader(test_dataset, batch_size=Batch_Size, shuffle=True)

num_classes = 3

class TextCNN(nn.Module):
    def __init__(self):
        super(TextCNN, self).__init__()
        self.W = nn.Embedding(vocab_size, embedding_dim=Embedding_size)
        out_channel = Filter_num
        self.conv = nn.Sequential(
            # nn.Conv2d(in_channels, out_channels, kernel_size): the kernel
            # spans 2 characters by the full embedding width.
            nn.Conv2d(1, out_channel, (2, Embedding_size)),
            nn.ReLU(),
            # Max-pool over all (sequence_length - 1) convolution windows.
            nn.MaxPool2d((sequence_length - 1, 1)),
        )
        self.dropout = nn.Dropout(Dropout)
        self.fc = nn.Linear(out_channel, num_classes)

    def forward(self, X):
        batch_size = X.shape[0]
        embedding_X = self.W(X)                 # [batch_size, sequence_length, embedding_size]
        embedding_X = embedding_X.unsqueeze(1)  # add channel(=1): [batch_size, 1, sequence_length, embedding_size]
        conved = self.conv(embedding_X)         # [batch_size, output_channel, 1, 1]
        conved = self.dropout(conved)
        flatten = conved.view(batch_size, -1)   # [batch_size, output_channel]
        output = self.fc(flatten)
        return output


model = TextCNN().to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.Adam(model.parameters(), lr=Learning_rate)

def binary_acc(pred, y):
    """
    Compute prediction accuracy (despite the name, works for any number of classes).
    :param pred: predicted class ids
    :param y: ground-truth class ids
    :return: accuracy as a float
    """
    correct = torch.eq(pred, y).float()
    acc = correct.sum() / len(correct)
    return acc.item()


def train():
    """Run one training epoch and return the average batch accuracy."""
    avg_acc = []
    model.train()
    for index, (batch_x, batch_y) in enumerate(TrainDataLoader):
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        pred = model(batch_x)
        loss = criterion(pred, batch_y)
        acc = binary_acc(torch.max(pred, dim=1)[1], batch_y)
        avg_acc.append(acc)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    return np.array(avg_acc).mean()

def evaluate():
    """
    Evaluate the current model on the test set.
    :return: average accuracy over the test batches
    """
    avg_acc = []
    model.eval()  # switch to evaluation mode
    with torch.no_grad():
        for x_batch, y_batch in TestDataLoader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            pred = model(x_batch)
            acc = binary_acc(torch.max(pred, dim=1)[1], y_batch)
            avg_acc.append(acc)
    return np.array(avg_acc).mean()

# Training cycle: train for Epoch epochs, evaluating after each one.
model_train_acc, model_test_acc = [], []
for epoch in range(Epoch):
    train_acc = train()
    test_acc = evaluate()
    print("epoch = {}, train accuracy = {}".format(epoch + 1, train_acc))
    print("epoch = {}, test accuracy = {}".format(epoch + 1, test_acc))
    model_train_acc.append(train_acc)
    model_test_acc.append(test_acc)

# Plot the accuracy curves for both splits.
plt.plot(model_train_acc)
plt.plot(model_test_acc)
plt.ylim(0.5, 1.01)
plt.title("The accuracy of textCNN model")
plt.legend(['train', 'test'])
plt.show()
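
Once training finishes, the same character-level encoding can be reused to classify a new comment. The predict helper below is an illustrative sketch, not part of the original post; it assumes the names defined in the listing above (char2idx, unk_id, pad_id, sequence_length, model, device):

def predict(text):
    # Hypothetical helper: encode exactly as tokenizer() does, then classify.
    ids = [char2idx.get(ch, unk_id) for ch in text][:sequence_length]
    ids += [pad_id] * (sequence_length - len(ids))
    model.eval()
    with torch.no_grad():
        logits = model(torch.LongTensor([ids]).to(device))
    # Subtract 1 to undo the label shift from data_process():
    # 0 -> -1 (negative), 1 -> 0 (neutral), 2 -> 1 (positive).
    return int(torch.argmax(logits, dim=1).item()) - 1

print(predict("今天心情很好,一切顺利"))  # e.g. 1 for a positive comment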