
Named Entity Recognition (BiLSTM+CRF)


  • Data source

    • Retrieval
  • Data annotation

    • Result
  • Model building

  • Training code

Data source

The raw text comes from the Baidu Baike entry on the First World War; the scraper below collects its paragraph text (sentences such as "在第一次世界大战期间,该方法被广泛应用于……").

Retrieval

    import requests
    import pandas as pd
    from lxml import etree

    # Baidu Baike entry for the First World War (第一次世界大战)
    url = 'https://baike.baidu.com/item/%E7%AC%AC%E4%B8%80%E6%AC%A1%E4%B8%96%E7%95%8C%E5%A4%A7%E6%88%98/68516?fr=aladdin'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36'
    }
    res = requests.get(url=url, headers=headers)
    tree = etree.HTML(res.text)
    # Extract the paragraph text and keep only paragraphs longer than 20 characters
    p = tree.xpath('//div[@class="para"]/text()')
    p = [i for i in p if len(i) > 20]
    data = pd.DataFrame()
    data['文字'] = p
    data.to_csv('一战信息.txt', index=False)

Data annotation

The scraped text is labeled on an online annotation platform.

Result

[Screenshot: annotation result on the online platform]

Export the annotated data in txt format; a sketch of the expected line format follows.
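
The downstream code assumes each exported line holds one sentence, written as space-separated character/tag pairs. Below is a minimal, hypothetical example line (not taken from the real export) and the way it will be parsed:

    # Hypothetical sample of one exported line; the real lines come from the annotation platform.
    line = '1/t 9/t 1/t 4/t 年/t ,/O 萨/ns 拉/ns 热/ns 窝/ns 事/O 件/O'
    pairs = [p for p in line.split(' ') if p != '']
    chars = [p.split('/')[0] for p in pairs]  # the characters of the sentence
    tags = [p.split('/')[1] for p in pairs]   # one tag per character
    print(chars)
    print(tags)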

Model building

    import numpy as np
    import tensorflow as tf
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.callbacks import TensorBoard
    from tensorflow.keras.optimizers import Adam
    from tensorflow_addons.layers import CRF


    class MyBiLSTMCRF:
        def __init__(self, vocabSize, maxLen, tagIndexDict, tagSum, sequenceLengths=None, vecSize=100, learning_rate=0.01):
            self.vocabSize = vocabSize
            self.vecSize = vecSize
            self.maxLen = maxLen
            self.tagSum = tagSum
            self.sequenceLengths = sequenceLengths
            self.tagIndexDict = tagIndexDict
            self.learning_rate = learning_rate

            self.buildBiLSTMCRF()

        def getTransParam(self, y, tagIndexDict):
            # Estimate the tag transition matrix from the (one-hot) labels
            self.trainY = np.argmax(y, axis=-1)
            yList = self.trainY.tolist()
            transParam = np.zeros([len(tagIndexDict), len(tagIndexDict)])
            for rowI in range(len(yList)):
                for colI in range(len(yList[rowI]) - 1):
                    transParam[yList[rowI][colI]][yList[rowI][colI + 1]] += 1
            for rowI in range(transParam.shape[0]):
                transParam[rowI] = transParam[rowI] / np.sum(transParam[rowI])
            return transParam

        def buildBiLSTMCRF(self):
            # Embedding -> two BiLSTM layers -> CRF output layer
            myModel = Sequential()
            myModel.add(tf.keras.layers.Input(shape=(self.maxLen,)))
            myModel.add(tf.keras.layers.Embedding(self.vocabSize, self.vecSize))
            myModel.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(
                self.tagSum, return_sequences=True, activation="tanh"), merge_mode='sum'))
            myModel.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(
                self.tagSum, return_sequences=True, activation="softmax"), merge_mode='sum'))
            crf = CRF(self.tagSum, name='crf_layer')
            myModel.add(crf)
            myModel.compile(Adam(learning_rate=self.learning_rate), loss={'crf_layer': crf.get_loss})
            self.myBiLSTMCRF = myModel

        def fit(self, X, y, epochs=100, transParam=None):
            if len(y.shape) == 3:
                y = np.argmax(y, axis=-1)
            if self.sequenceLengths is None:
                self.sequenceLengths = [row.shape[0] for row in y]
            log_dir = "logs"
            tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

            history = self.myBiLSTMCRF.fit(X, y, epochs=epochs, callbacks=[tensorboard_callback])
            return history

        def predict(self, X):
            preYArr = self.myBiLSTMCRF.predict(X)
            return preYArr


    if __name__ == "__main__":
        # vocabSize, maxLen, tagIndexDict, tagSum and sequenceLengths are built in the training script below
        myModel = MyBiLSTMCRF(vocabSize, maxLen, tagIndexDict, tagSum, sequenceLengths)

Save the code above as a Python file named BiLSTMCRF (i.e. BiLSTMCRF.py), so it can be imported in the final training step.

Training code

    import numpy as np
    import pandas as pd

    data = pd.read_table('./World war 1.txt')  # annotated file exported from the labeling platform

    word_list = []   # characters of each sentence after splitting
    label_list = []  # corresponding tags
    for i in range(len(data)):
        p = [j for j in data.iloc[i, 0].split(' ') if j != '']
        word = []
        label = []
        for k in p:
            word_label = k.split('/')
            word.append(word_label[0])
            label.append(word_label[1])
        word_list.append(word)
        label_list.append(label)

    np.unique([j for i in label_list for j in i])  # inspect the tag set

array(['O', 'nr', 'ns', 'nsc', 'nsfc', 'nsn', 'nst', 'nt', 't'],
      dtype='<U4')

    # Meaning of each tag. Note: 'nst' was missing from the original mapping; it is added here
    # (assumed to be another location-type tag) so the lookup in the final step cannot raise a KeyError.
    label_dict = {'O': 'non-entity', 'nr': 'person name', 'ns': 'place name', 'nsc': 'city name',
                  'nsfc': 'continent name', 'nsn': 'country/polity name', 'nst': 'place name (other)',
                  'nt': 'organization name', 't': 'time word'}
    # Numeric id of each tag (kept for reference; not used later)
    label_num_dict = {'O': 0, 'nr': 1, 'ns': 2, 'nsc': 3, 'nsfc': 4, 'nsn': 5, 'nst': 6, 'nt': 7, 't': 8}
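
As a quick sanity check (a small sketch using only the variables defined above), it is worth confirming that every tag that actually occurs in the corpus has an entry in label_dict:

    # Every tag seen in the annotated corpus should have a human-readable description.
    seen_tags = {t for tags in label_list for t in tags}
    missing = seen_tags - set(label_dict)
    assert not missing, f"tags without a description: {missing}"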
    word_list1 = []  # each sentence as a space-separated string of characters
    for i in word_list:
        word_list1.append(" ".join(str(j) for j in i))
    wordIndexDict = {"<pad>": 0}  # vocabulary: character -> index
    wi = 1
    for row in word_list1:
        if type(row) == float:  # guard against NaN rows
            print(row)
            break
        for word in row.split(" "):
            if word not in wordIndexDict:
                wordIndexDict[word] = wi
                wi += 1
    vocabSize = wi
    # Note: maxLen and sequenceLengths are measured on the space-joined strings (characters plus spaces)
    maxLen = max(len(row) for row in word_list1)
    sequenceLengths = [len(row) for row in word_list1]

    label_list1 = []  # each tag sequence as a space-separated string
    for i in label_list:
        label_list1.append(' '.join(str(j) for j in i))
    data1 = pd.DataFrame()
    data1['text'] = word_list1
    data1['tag'] = label_list1
    # Map each character to its vocabulary index
    data1['text'] = data1['text'].apply(lambda x: [wordIndexDict[word] for word in x.split()])

    import tensorflow as tf

    # The sentences have different lengths, so pad them all to maxLen
    X = tf.keras.preprocessing.sequence.pad_sequences(data1["text"],
                                                      value=wordIndexDict["<pad>"],
                                                      padding='post',
                                                      maxlen=maxLen)
    X.shape

(77, 353)

Apply the same padding and indexing to the tags.

    import re
    import tqdm

    # Strip any suffix after '-' in the tags
    data1["tag"] = data1["tag"].apply(lambda x: re.sub(r"-\S+", "", x))

    tagIndexDict = {"PAD": 0}  # tag -> index
    ti = 1
    for row in tqdm.tqdm(data1["tag"].values.tolist()):
        for tag in row.split(" "):
            if tag not in tagIndexDict:
                tagIndexDict[tag] = ti
                ti += 1
    tagSum = len(tagIndexDict)
    # Pad each tag sequence to maxLen with "PAD", then map tags to indices
    data1["tag"] = data1["tag"].apply(lambda x: x.split() + ["PAD" for i in range(maxLen - len(x.split()))])
    data1["tag"] = data1["tag"].apply(lambda x: [tagIndexDict[tagItem] for tagItem in x])

    y = np.array(data1["tag"].values.tolist())
    y.shape

(77, 353)

    import BiLSTMCRF  # the model module saved earlier
    # Pass in the parameters prepared above
    myModel = BiLSTMCRF.MyBiLSTMCRF(vocabSize, maxLen, tagIndexDict, tagSum, sequenceLengths)
    myModel.myBiLSTMCRF.summary()
[Screenshot: model summary output]
    history = myModel.fit(X, y, epochs=1500)  # train for 1500 epochs
    import matplotlib.pyplot as plt

    loss = history.history['loss']
    plt.plot(range(1500), loss, "bo", label="loss")
    plt.legend()
    plt.show()
[Figure: training loss curve]
    testI = 22  # index of the sentence used for inspection
    preY = myModel.predict(X)[testI]
    preY
    array([1, 2, 2, 2, 1, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0])
    # Invert the dictionaries so indices can be mapped back to tags and characters
    indexTagDict = dict(zip(tagIndexDict.values(), tagIndexDict.keys()))
    indexWordDict = dict(zip(wordIndexDict.values(), wordIndexDict.keys()))

    # Recover the original sentence and cut off the padding
    sentenceList = [indexWordDict[wordItem] for wordItem in X[testI]]
    sentenceList = sentenceList[:sentenceList.index('<pad>')]

    # Map the predicted indices back to tag names and cut off the padding
    tagList = [indexTagDict[tagItem] for tagItem in preY]
    tagList = tagList[:tagList.index('PAD')]

    for i, j in zip(sentenceList, tagList):
        print(i, 'is tagged', j, 'which means', label_dict[j])
[Screenshot: per-character tagging output]
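
Beyond printing one tag per character, it can be handy to merge consecutive characters that share the same non-'O' tag into whole entities. A minimal sketch, reusing sentenceList and tagList from above (the grouping rule is an assumption, since the tags here are plain per-character labels rather than BIO tags):

    # Group consecutive characters with the same non-'O' tag into entity spans.
    entities = []
    current_text, current_tag = '', None
    for ch, tag in zip(sentenceList, tagList):
        if tag == current_tag and tag != 'O':
            current_text += ch           # extend the current entity
        else:
            if current_tag not in (None, 'O'):
                entities.append((current_text, current_tag, label_dict[current_tag]))
            current_text, current_tag = ch, tag
    if current_tag not in (None, 'O'):   # flush the last span
        entities.append((current_text, current_tag, label_dict[current_tag]))

    for text, tag, meaning in entities:
        print(text, tag, meaning)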
