命名实体识别(BiLSTM+CRF)
发布时间
阅读量:
阅读量
-
数据来源
-
- 获取
-
数据标注
-
- 效果
-
模型搭建
-
代码部分
数据来源
在第一次世界大战期间,该方法被广泛应用于……
获取
import requests
import re
import pandas as pd
from lxml import html
from lxml import etree
代码解释
# Download the Baidu Baike article on World War I and save its longer
# paragraphs to a text file that is annotated by hand afterwards.
url = 'https://baike.baidu.com/item/%E7%AC%AC%E4%B8%80%E6%AC%A1%E4%B8%96%E7%95%8C%E5%A4%A7%E6%88%98/68516?fr=aladdin'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36'
}
# A timeout prevents the script from hanging forever on a stalled connection,
# and raise_for_status() stops an HTTP error page from being parsed as data.
res = requests.get(url=url, headers=headers, timeout=10)
res.raise_for_status()
s = res.text
# NOTE: this rebinds `html`, shadowing the `lxml.html` module imported above.
html = etree.HTML(s)
# Text nodes that are direct children of <div class="para"> elements.
p = html.xpath('//div[@class="para"]/text()')
# Keep only fragments longer than 20 characters.
p = [i for i in p if len(i) > 20]
data = pd.DataFrame()
data['文字'] = p
data.to_csv('一战信息.txt', index=False)
代码解释
数据标注
效果

以txt格式导出
模型搭建
import tensorflow as tf
import tensorflow_addons as tfa
import numpy as np
import tqdm
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.optimizers import Adam
from tensorflow_addons.layers import CRF
class MyBiLSTMCRF:
    """Sequence tagger: Embedding -> two bidirectional LSTM layers -> CRF.

    The Keras model is built by the constructor and stored on
    ``self.myBiLSTMCRF``.
    """

    def __init__(self, vocabSize, maxLen, tagIndexDict, tagSum,
                 sequenceLengths=None, vecSize=100, learning_rate=0.01):
        """Store the hyper-parameters and build the model.

        Args:
            vocabSize: Input dimension of the embedding layer.
            maxLen: Length of every (padded) input sequence.
            tagIndexDict: Mapping from tag name to integer id.
            tagSum: Number of tags; used as the LSTM unit count and the
                CRF output size.
            sequenceLengths: Optional per-sentence lengths; filled in by
                ``fit`` when left as ``None``.
            vecSize: Output dimension of the embedding layer.
            learning_rate: Learning rate passed to the Adam optimizer.
        """
        self.vocabSize = vocabSize
        self.vecSize = vecSize
        self.maxLen = maxLen
        self.tagSum = tagSum
        self.sequenceLengths = sequenceLengths
        self.tagIndexDict = tagIndexDict
        self.learning_rate = learning_rate
        self.buildBiLSTMCRF()

    def getTransParam(self, y, tagIndexDict):
        """Estimate the tag-to-tag transition probabilities from ``y``.

        Args:
            y: Array whose last axis is argmax-ed to obtain tag ids
                (e.g. one-hot labels of shape (sentences, steps, tags)).
            tagIndexDict: Mapping from tag name to integer id; its size
                fixes the shape of the returned matrix.

        Returns:
            A square array where entry ``[a][b]`` is the fraction of
            transitions leaving tag ``a`` that go to tag ``b``. Rows for
            tags that never have a successor are all zeros.

        Side effects:
            Stores the argmax-ed labels on ``self.trainY``.
        """
        self.trainY = np.argmax(y, axis=-1)
        tagCount = len(tagIndexDict)
        transParam = np.zeros([tagCount, tagCount])
        for tagRow in self.trainY.tolist():
            # Count every adjacent (current tag, next tag) pair.
            for currentTag, nextTag in zip(tagRow, tagRow[1:]):
                transParam[currentTag][nextTag] += 1
        rowSums = transParam.sum(axis=1, keepdims=True)
        # Normalise only the rows that have counts: dividing an all-zero row
        # by its zero sum would fill it with NaN.
        np.divide(transParam, rowSums, out=transParam, where=rowSums > 0)
        return transParam

    def buildBiLSTMCRF(self):
        """Build and compile the model, storing it on ``self.myBiLSTMCRF``."""
        myModel = Sequential()
        myModel.add(tf.keras.layers.Input(shape=(self.maxLen,)))
        myModel.add(tf.keras.layers.Embedding(self.vocabSize, self.vecSize))
        # Forward and backward outputs are summed, so each BiLSTM layer
        # emits tagSum features per time step.
        myModel.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(
            self.tagSum, return_sequences=True, activation="tanh"), merge_mode='sum'))
        # NOTE(review): softmax as an LSTM activation is unusual - confirm
        # that it is intended.
        myModel.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(
            self.tagSum, return_sequences=True, activation="softmax"), merge_mode='sum'))
        crf = CRF(self.tagSum, name='crf_layer')
        myModel.add(crf)
        # NOTE(review): relies on the CRF layer exposing `get_loss`; verify
        # against the installed tensorflow_addons version.
        myModel.compile(Adam(learning_rate=self.learning_rate), loss={'crf_layer': crf.get_loss})
        self.myBiLSTMCRF = myModel

    def fit(self, X, y, epochs=100, transParam=None):
        """Train the model and return the Keras training history.

        Args:
            X: Padded input id sequences.
            y: Tag ids; a 3-D array is argmax-ed over its last axis first.
            epochs: Number of training epochs.
            transParam: Accepted for compatibility; not used by this method.

        Returns:
            The object returned by the underlying Keras ``fit`` call.
        """
        if len(y.shape) == 3:
            y = np.argmax(y, axis=-1)
        if self.sequenceLengths is None:
            self.sequenceLengths = [row.shape[0] for row in y]
        # TensorBoard event files are written to the "logs" directory.
        log_dir = "logs"
        tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)
        history = self.myBiLSTMCRF.fit(X, y, epochs=epochs, callbacks=[tensorboard_callback])
        return history

    def predict(self, X):
        """Return the underlying model's predictions for ``X``."""
        preYArr = self.myBiLSTMCRF.predict(X)
        return preYArr
if __name__ == "__main__":
    # Stand-alone smoke test. The original call passed vocabSize, maxLen,
    # tagIndexDict, tagSum and sequenceLengths, none of which are defined in
    # this module, so running the file directly raised NameError. Small
    # placeholder values are used here instead.
    demoTagIndexDict = {"PAD": 0, "O": 1}
    myModel = MyBiLSTMCRF(
        vocabSize=100,
        maxLen=20,
        tagIndexDict=demoTagIndexDict,
        tagSum=len(demoTagIndexDict),
    )
代码解释
将上述代码保存为 Python 文件 BiLSTMCRF.py,并与后面的训练脚本放在同一目录下,以便在最终训练阶段通过 import BiLSTMCRF 导入使用。
代码部分
import re
import numpy as np
import pandas as pd
import tensorflow as tf
import random
#from tensorflow_addons.layers import CRF
# Load the annotated corpus. Each row holds one passage whose tokens are
# separated by spaces and written as "text/label".
data = pd.read_table('./World war 1.txt')
word_list = []   # token texts, one list per row
label_list = []  # labels, one list per row
for i in range(len(data)):
    word = []
    label = []
    for k in data.iloc[i, 0].split(' '):
        if k == '':
            # Consecutive spaces produce empty fragments; skip them.
            continue
        # Split on the LAST slash so that token text which itself contains
        # '/' (for example a date) keeps its full text and its real label.
        text, separator, tag = k.rpartition('/')
        if not separator:
            raise ValueError(f"token {k!r} in row {i} has no '/label' suffix")
        word.append(text)
        label.append(tag)
    word_list.append(word)
    label_list.append(label)
np.unique([j for i in label_list for j in i])  # inspect the distinct labels
代码解释
array(['O', 'nr', 'ns', 'nsc', 'nsfc', 'nsn', 'nst', 'nt', 't'],
dtype='<U4')
# Human-readable meaning of each entity label.
# NOTE(review): 'nst' has an id in label_num_dict but no description here -
# confirm its meaning and add it.
label_dict = {
    'O': '无实体',
    'nr': '人名',
    'ns': '地名',
    'nsc': '城市名',
    'nsfc': '大洲名',
    'nsn': '中文-国名/政体',
    'nt': '机构名',
    't': '时间词',
}
# Integer id of each label, assigned in the order listed.
label_num_dict = {
    tag: index
    for index, tag in enumerate(
        ['O', 'nr', 'ns', 'nsc', 'nsfc', 'nsn', 'nst', 'nt', 't']
    )
}
代码解释
# One string per sentence: its tokens joined by single spaces.
word_list1 = [" ".join(map(str, tokens)) for tokens in word_list]
代码解释
# Vocabulary: token -> integer id, with id 0 reserved for padding.
wordIndexDict = {"<pad>": 0}
wi = 1
for row in word_list1:
    # A float row cannot be tokenised; print it and stop. isinstance also
    # catches float subclasses such as numpy.float64, which the previous
    # `type(row) == float` comparison missed.
    if isinstance(row, float):
        print(row)
        break
    for word in row.split(" "):
        if word not in wordIndexDict:
            wordIndexDict[word] = wi
            wi += 1
vocabSize = wi
# NOTE(review): this measures the space-joined string in characters, so it
# is larger than the longest token count. It is kept unchanged because it is
# a safe upper bound for padding - confirm whether the tighter
# token-count maximum is wanted.
maxLen = max(len(row) for row in word_list1)
# Sequence lengths are counted in tokens. The previous len(row) counted the
# characters of the joined string, separators included.
sequenceLengths = [len(row.split(" ")) for row in word_list1]
# One string per sentence: its labels joined by single spaces.
label_list1 = [' '.join(map(str, labels)) for labels in label_list]
代码解释
# Collect the sentence strings and their tag strings side by side.
data1 = pd.DataFrame()
data1['text'] = word_list1
data1['tag'] = label_list1
# Replace every sentence string by the list of its token ids.
data1['text'] = data1['text'].apply(
    lambda sentence: [wordIndexDict[token] for token in sentence.split()]
)
import tensorflow as tf
# Sentences differ in length, so pad them at the end with the <pad> id
# until each one has maxLen entries.
X = tf.keras.preprocessing.sequence.pad_sequences(
    data1["text"],
    maxlen=maxLen,
    padding='post',
    value=wordIndexDict["<pad>"],
)
X.shape
代码解释
(77, 353)
对tag做同样的处理
import tqdm
import re
# Strip every '-' together with the run of non-whitespace characters that
# follows it. The pattern is unchanged; it is now a raw string so that the
# backslash escapes no longer trigger invalid-escape warnings.
data1["tag"] = data1["tag"].apply(lambda x: re.sub(r"\-\S+", "", x))
# Tag vocabulary: tag -> integer id, with id 0 reserved for padding.
tagIndexDict = {"PAD": 0}
ti = 1
for row in tqdm.tqdm(data1["tag"].values.tolist()):
    # split() matches the tokenisation used for the lookup below, so a
    # doubled space can no longer register an empty-string tag.
    for tag in row.split():
        if tag not in tagIndexDict:
            tagIndexDict[tag] = ti
            ti += 1
tagSum = len(tagIndexDict)
# Pad every tag sequence with "PAD" up to maxLen entries, then map to ids.
data1["tag"] = data1["tag"].apply(
    lambda x: x.split() + ["PAD"] * (maxLen - len(x.split()))
)
data1["tag"] = data1["tag"].apply(lambda x: [tagIndexDict[tagItem] for tagItem in x])
y = np.array(data1["tag"].values.tolist())
y.shape
代码解释
(77, 353)
import BiLSTMCRF  # the model module saved earlier

# Build the tagger from the values computed above.
myModel = BiLSTMCRF.MyBiLSTMCRF(
    vocabSize,
    maxLen,
    tagIndexDict,
    tagSum,
    sequenceLengths,
)
myModel.myBiLSTMCRF.summary()
代码解释

# Train for 1500 epochs; the returned history is kept for plotting the loss.
history = myModel.fit(X, y, epochs=1500)
代码解释
import matplotlib.pyplot as plt
# Plot the training loss per epoch.
loss = history.history['loss']
# Take the x-axis from the recorded losses instead of a hard-coded 1500, so
# the plot still works when the number of training epochs changes.
plt.plot(range(len(loss)), loss, "bo", label="loss")
plt.legend()
plt.show()
代码解释

# Index of the sentence to inspect.
testI = 22
# Predict for all of X and keep the row that belongs to that sentence.
allPredictions = myModel.predict(X)
preY = allPredictions[testI]
preY
代码解释
array([1, 2, 2, 2, 1, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0])
代码解释
# Invert the lookup tables: id -> tag and id -> token.
indexTagDict = {index: tag for tag, index in tagIndexDict.items()}
indexWordDict = {index: word for word, index in wordIndexDict.items()}

sentenceList = [indexWordDict[wordItem] for wordItem in X[testI]]
# Cut at the first padding token. A sentence that fills every position has
# no '<pad>', and list.index would raise ValueError in that case.
if '<pad>' in sentenceList:
    sentenceList = sentenceList[:sentenceList.index('<pad>')]
tagList = [indexTagDict[tagItem] for tagItem in preY]
# Same guard for a prediction that contains no 'PAD' tag.
if 'PAD' in tagList:
    tagList = tagList[:tagList.index('PAD')]
for i, j in zip(sentenceList, tagList):
    # Fall back to the raw tag when label_dict has no description for it,
    # instead of raising KeyError.
    print(i, '的类别是', j, '代表的是', label_dict.get(j, j))
代码解释

全部评论 (0)
还没有任何评论哟~
