BERT in Practice: Chinese Named Entity Recognition

A 10-label NER task implemented with BERT.

Loading the data:

    # Load the data: each file contains one JSON object per line
    import json

    train_data = []
    dev_data = []
    test_data = []

    for line in open('train.json', 'r', encoding='UTF-8'):
        train_data.append(json.loads(line))

    for line in open('dev.json', 'r', encoding='UTF-8'):
        dev_data.append(json.loads(line))

    for line in open('test.json', 'r', encoding='UTF-8'):
        test_data.append(json.loads(line))

    print(f'Counts: train:{len(train_data)}, dev:{len(dev_data)}, test:{len(test_data)}')
    print(train_data[0])
    print(dev_data[0])
    print(test_data[0])
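
For reference, each line in these files is expected to be a JSON object of roughly this shape (a hypothetical CLUENER-style example; the real records differ):

    {"text": "普京是俄罗斯的总统", "label": {"name": {"普京": [[0, 1]]}, "address": {"俄罗斯": [[3, 5]]}}}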

Processing the labels:

Build a label-to-id dictionary in the following format:

    # Build the label-to-id dictionary for the 10 entity types (plus 'o' for non-entity characters)
    import re

    label_type = {'o': 0, 'address': 1, 'book': 2, 'company': 3, 'game': 4, 'government': 5,
                  'movie': 6, 'name': 7, 'organization': 8, 'position': 9, 'scene': 10}

    def decode_label(d):
        # Parse the annotations of one sample into a per-character list of label ids
        text_len = len(d['text'])
        label = [0] * text_len
        types = d['label'].keys()
        for t in types:
            values = d['label'][t].values()
            si = [v for value in values for v in value]   # all [start, end] spans of this type
            for i in si:
                for j in range(i[0], i[1] + 1):
                    label[j] = label_type[t]
        return label


    def transform_data(data, mode):
        # Replace digits with a placeholder character
        data_texts = [re.sub(r'\d', '&', d['text']) for d in data]

        if mode == 'train':
            data_labels = []
            for d in data:
                data_labels.append(decode_label(d))
            return (data_texts, data_labels)
        else:
            return data_texts

    train_texts, train_labels = transform_data(train_data, 'train')
    dev_texts, dev_labels = transform_data(dev_data, 'train')
    test_texts = transform_data(test_data, 'test')
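
Applying decode_label to a record of the shape shown earlier gives one label id per character (the sample below is hypothetical, not taken from the dataset):

    # a made-up record in the same shape as the dataset
    sample = {'text': '普京是俄罗斯的总统',
              'label': {'name': {'普京': [[0, 1]]}, 'address': {'俄罗斯': [[3, 5]]}}}
    print(decode_label(sample))
    # -> [7, 7, 0, 1, 1, 1, 0, 0, 0]  ('name' on characters 0-1, 'address' on characters 3-5)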

Data processing:

Use BERT's tokenizer to turn the text into sequences of token ids.

    ! pip install transformers
    from transformers import BertTokenizer
    from IPython.display import clear_output

    # Use BERT's tokenizer to map characters to token ids
    PRETRAINED_MODEL_NAME = "bert-base-chinese"  # the Chinese pretrained model
    tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
    clear_output()

    train_ids = []
    dev_ids = []
    test_ids = []
    for train_text in train_texts:
        train_ids.append(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(train_text)))

    for dev_text in dev_texts:
        dev_ids.append(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(dev_text)))

    for test_text in test_texts:
        test_ids.append(tokenizer.convert_tokens_to_ids(tokenizer.tokenize(test_text)))

    print(train_ids[0])
    print(dev_texts[66])
    print(dev_labels[66])
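
One caveat with this step: tokenizer.tokenize() drops whitespace and can split or merge non-Chinese characters, so the token sequence is not guaranteed to line up one-to-one with the per-character labels built above. A character-level sketch that keeps the alignment (an alternative, not what the code above does; train_char_ids is an illustrative name):

    # one token per character keeps the alignment; characters missing from the vocab become [UNK]
    train_char_ids = [tokenizer.convert_tokens_to_ids(list(text)) for text in train_texts]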

Building the DataLoader

Use PyTorch's Dataset and DataLoader utilities to batch the data.

    import torch
    from torch.utils.data import DataLoader, Dataset
    from torch.nn.utils.rnn import pad_sequence

    class NewDataset(Dataset):
        def __init__(self, ids, labels):
            self.ids = ids
            self.labels = labels
            self.len = len(ids)

        def __getitem__(self, item):
            tokens_tensor = torch.tensor(self.ids[item])
            label_tensor = torch.tensor(self.labels[item])
            return (tokens_tensor, label_tensor)

        def __len__(self):
            return self.len

    trainset = NewDataset(train_ids, train_labels)
    devset = NewDataset(dev_ids, dev_labels)
    BATCH_SIZE = 64

    def create_mini_batch(samples):
        tokens_tensors = [s[0] for s in samples]
        label_tensors = [s[1] for s in samples]

        # zero-pad every sequence in the batch to the same length
        tokens_tensors = pad_sequence(tokens_tensors, batch_first=True)
        label_tensors = pad_sequence(label_tensors, batch_first=True, padding_value=0)

        # token ids and labels can end up with different padded lengths when the
        # tokenizer merges or drops characters, so pad the shorter tensor to match
        max_len = max(tokens_tensors.shape[1], label_tensors.shape[1])
        tokens_tensors = torch.nn.functional.pad(tokens_tensors, (0, max_len - tokens_tensors.shape[1]))
        label_tensors = torch.nn.functional.pad(label_tensors, (0, max_len - label_tensors.shape[1]))

        # attention masks: set positions of tokens_tensors that are not zero padding to 1
        masks_tensors = torch.zeros(tokens_tensors.shape, dtype=torch.long)
        masks_tensors = masks_tensors.masked_fill(tokens_tensors != 0, 1)

        return tokens_tensors, masks_tensors, label_tensors


    trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, collate_fn=create_mini_batch, drop_last=False)
    devloader = DataLoader(devset, batch_size=BATCH_SIZE, collate_fn=create_mini_batch, drop_last=False)
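
A quick way to sanity-check the collate function is to pull one batch and compare shapes; all three tensors should share the same (batch_size, seq_len) shape:

    tokens, masks, labels = next(iter(trainloader))
    print(tokens.shape, masks.shape, labels.shape)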

Training the model

Use the BertForTokenClassification model.

    from transformers import BertForTokenClassification

    # token-classification head on top of the Chinese BERT, one output class per entry in label_type
    model = BertForTokenClassification.from_pretrained(PRETRAINED_MODEL_NAME, num_labels=len(label_type))
    model.cuda()

    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
    Epochs = 10
    for epoch in range(Epochs):
        losses = 0.0
        for data in trainloader:
            tokens_tensors, masks_tensors, label_tensors = [t.cuda() for t in data]
            optimizer.zero_grad()
            outputs = model(input_ids=tokens_tensors, attention_mask=masks_tensors, labels=label_tensors)
            loss = outputs[0]
            loss.backward()
            optimizer.step()
            losses += loss.item()
        print(losses)
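
If you want to keep the fine-tuned weights for later use, the standard transformers save calls apply; the directory name below is only an example:

    # save the fine-tuned model and the tokenizer (directory name is illustrative)
    model.save_pretrained('bert-chinese-ner')
    tokenizer.save_pretrained('bert-chinese-ner')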

The model was trained for only 10 epochs, so the results are not great yet.

Evaluating on the dev set:

    import numpy as np

    def flat_accuracy(preds, labels):
        pred_flat = np.argmax(preds, axis=1).flatten()
        labels_flat = labels.flatten()
        return np.sum(pred_flat == labels_flat) / len(labels_flat)

    nb_eval_steps = 0
    model.eval()
    eval_loss, eval_accuracy = 0, 0
    predictions, true_labels = [], []

    for data in devloader:
        tokens_tensors, masks_tensors, label_tensors = [t.cuda() for t in data]
        with torch.no_grad():
            outputs = model(input_ids=tokens_tensors, attention_mask=masks_tensors, labels=label_tensors)
            loss = outputs[0]
            preds = model(input_ids=tokens_tensors, attention_mask=masks_tensors)

        for pred, label_tensor in zip(preds[0], label_tensors):
            # detach() cuts the tensor out of the computation graph so it takes no part in gradient updates
            logit = pred.detach().cpu().numpy()
            label_ids = label_tensor.cpu().numpy()

            predictions.extend(np.argmax(logit, axis=1))
            true_labels.append(label_ids)
            # accumulate accuracy and loss
            tmp_eval_accuracy = flat_accuracy(logit, label_ids)

            eval_loss += loss.mean().item()
            eval_accuracy += tmp_eval_accuracy
            nb_eval_steps += 1

    print("Validation loss: {}".format(eval_loss / nb_eval_steps))
    print("Validation Accuracy: {}".format(eval_accuracy / nb_eval_steps))
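Note that flat_accuracy also scores the zero-padded positions, which pushes the number up. A sketch of a padding-aware accuracy (the attention mask would have to be passed into the inner loop; this is not part of the original code):

    def masked_accuracy(logits, label_ids, mask):
        # only count positions where the attention mask is 1, i.e. real tokens rather than padding
        pred_flat = np.argmax(logits, axis=1)
        keep = mask.astype(bool)
        return np.sum(pred_flat[keep] == label_ids[keep]) / keep.sum()
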
    from sklearn.metrics import f1_score

    # flatten per-sentence predictions and labels into one long tag sequence
    pred_tags = list(np.array(predictions).flatten())
    valid_tags = list(np.concatenate(true_labels))
    print(pred_tags[0:20])
    print(valid_tags[0:20])
    # f1_score expects (y_true, y_pred); the tags are the concrete label ids
    print("F1-Score: {}".format(f1_score(valid_tags, pred_tags, average='weighted')))
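
Because most characters carry the 'o' label, the weighted F1 above is dominated by the non-entity class. Restricting the score to the entity ids gives a stricter number (labels= is a standard sklearn parameter; the ids come from label_type above):

    entity_ids = [i for i in label_type.values() if i != 0]
    print("Entity-only F1: {}".format(f1_score(valid_tags, pred_tags, labels=entity_ids, average='weighted')))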

Testing the model on a single sentence

    text = '普京是俄罗斯的总统'
    test_tokens = tokenizer.tokenize(text)
    test_ids = tokenizer.convert_tokens_to_ids(test_tokens)
    test_tokens_tensor = torch.tensor(test_ids)

    # attention mask: 1 wherever the token id is not zero padding
    test_masks_tensor = torch.zeros(test_tokens_tensor.shape, dtype=torch.long)
    test_masks_tensor = test_masks_tensor.masked_fill(test_tokens_tensor != 0, 1)

    outputs = model(input_ids=test_tokens_tensor.unsqueeze(0).cuda(), attention_mask=test_masks_tensor.unsqueeze(0).cuda())
    logits = outputs[0]
    preds = []
    for logit in logits:
        preds.extend(np.argmax(logit.detach().cpu().numpy(), axis=1))

    # map predicted ids back to label names
    inverse_dict = dict([val, key] for key, val in label_type.items())
    preds = [inverse_dict[i] for i in preds]

    print(test_tokens)
    print(preds)
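
To turn the per-character tags into entity strings, a small helper can group runs of consecutive identical non-'o' tags (a sketch; extract_entities is not part of the original code):

    def extract_entities(tokens, tags):
        # group consecutive identical non-'o' tags into (entity_text, label) pairs
        entities, cur, cur_tag = [], [], None
        for tok, tag in zip(tokens, tags):
            if tag != 'o' and tag == cur_tag:
                cur.append(tok)
            else:
                if cur:
                    entities.append((''.join(cur), cur_tag))
                cur, cur_tag = ([tok], tag) if tag != 'o' else ([], None)
        if cur:
            entities.append((''.join(cur), cur_tag))
        return entities

    print(extract_entities(test_tokens, preds))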

Results:

Overall, BERT is strong enough that even with this little training it reaches a fairly high F1 score.

Still, the trained model has several obvious shortcomings:

  • Label scheme: each entity type was treated as a single class, but a type can instead be split into begin/middle/end parts. The flat scheme causes small boundary problems (for example, 【普京是】 was grouped together in the output); see the sketch after this list.
  • Too many label types: with this many classes the model struggles to tell them apart. In the output, 【普京】 was tagged as a company when it should be a person name, so the model still cannot fully separate the semantics of person names and company names.
  • Insufficient training: the training loss has clearly not finished coming down.
  • The model is too simple; no LSTM or CRF layer was added on top of BERT.
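
For the first point, a minimal sketch of the BIO variant of that idea (the names bio_label_type and decode_bio_label are illustrative, not part of the original code):

    # BIO scheme: each entity type gets a B- id for its first character and an I- id for the rest
    bio_label_type = {'O': 0}
    for t in ['address', 'book', 'company', 'game', 'government',
              'movie', 'name', 'organization', 'position', 'scene']:
        bio_label_type[f'B-{t}'] = len(bio_label_type)
        bio_label_type[f'I-{t}'] = len(bio_label_type)

    def decode_bio_label(d):
        # same span format as decode_label, but entity boundaries are marked explicitly
        label = [0] * len(d['text'])
        for t, entities in d['label'].items():
            for spans in entities.values():
                for start, end in spans:
                    label[start] = bio_label_type[f'B-{t}']
                    for j in range(start + 1, end + 1):
                        label[j] = bio_label_type[f'I-{t}']
        return label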
