【NLP_命名实体识别】Albert+BiLSTM+CRF模型训练、评估与使用
模型訓練
2021/3/10:使用訓練好的Bert/Albert-CRF模型,同時,在此基礎上,加一層BiLSTM網絡,得修改后的Albert-BiLSTM-CRF模型(見下一篇文章),開始訓練。
修改思路:以已有的Albert+CRF模型代碼為基礎,參考網上的Albert+BiLSTM+CRF模型,稍加修改即可。值得注意的,無非是“三種模型”之間的數據傳遞類型,比如,將Albert模型訓練得到的embedding,傳入BiLSTM(參考:ALBERT+BiLSTM+CRF實現序列標注 - 光彩照人 - 博客園)。
調試過程:其間,多次用到命令行,安裝需要的庫、工具包,按部就班去做即可。
# Albert + BiLSTM + CRF named-entity recognition: training and evaluation.
import numpy as np
from bert4keras.backend import keras, K
from bert4keras.models import build_transformer_model
from bert4keras.tokenizers import Tokenizer
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, DataGenerator
# NOTE(review): this `open` shadows the builtin open() for the whole module —
# the source of the "'open' object has no attribute 'readlines'" confusion
# described later in this article.
from bert4keras.snippets import open, ViterbiDecoder
from bert4keras.layers import ConditionalRandomField
from keras.layers import Dense, LSTM, Bidirectional, Dropout, TimeDistributed
from keras.models import Model, Sequential
from tqdm import tqdm
from tensorflow import ConfigProto, InteractiveSession
from numpy import array
# keras_contrib imports kept from the original; note the name `CRF` is
# re-bound below to a bert4keras ConditionalRandomField instance.
from keras_contrib.layers import CRF
from keras_contrib.losses import crf_loss
from keras_contrib.metrics import crf_accuracy, crf_viterbi_accuracy

# Let TensorFlow grab GPU memory on demand instead of all at once.
config = ConfigProto()
# config.gpu_options.per_process_gpu_memory_fraction = 0.2
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)

maxlen = 256
epochs = 1  # 10
batch_size = 16
bert_layers = 12
learing_rate = 1e-5     # the fewer bert_layers, the larger this should be
crf_lr_multiplier = 10  # scale up the CRF layer's learning rate when necessary (e.g. 1000)

# BERT paths kept for reference:
# config_path = './bert_model/chinese_L-12_H-768_A-12/bert_config.json'
# checkpoint_path = './bert_model/chinese_L-12_H-768_A-12/bert_model.ckpt'
# dict_path = './bert_model/chinese_L-12_H-768_A-12/vocab.txt'

# ALBERT configuration.
config_path = './bert_model/albert_large/albert_config.json'
checkpoint_path = './bert_model/albert_large/model.ckpt-best'
dict_path = './bert_model/albert_large/vocab_chinese.txt'


def load_data(filename):
    """Load whitespace-separated BIO data.

    The file has one "char TAG" pair per line and a blank line between
    sentences.  Returns a list of sentences, each a list of [text, label]
    chunks, e.g. [['梁子老寨', 'LOC'], ['黑山', 'LOC'], ...].
    """
    D = []
    with open(filename, encoding='utf-8') as f:
        f = f.read()
        for l in f.split('\n\n'):       # sentences are separated by a blank line
            if not l:
                continue
            d, last_flag = [], ''
            for c in l.split('\n'):
                char, this_flag = c.split(' ')  # raises ValueError on malformed lines
                if this_flag == 'O' and last_flag == 'O':
                    d[-1][0] += char            # extend the running O chunk
                elif this_flag == 'O' and last_flag != 'O':
                    d.append([char, 'O'])       # start a new O chunk
                elif this_flag[:1] == 'B':
                    d.append([char, this_flag[2:]])  # 'B-LOC' -> new 'LOC' entity
                else:
                    d[-1][0] += char            # 'I-...' continues the current entity
                last_flag = this_flag
            D.append(d)
    return D


# Labeled data.
train_data = load_data('./data/example.train')
valid_data = load_data('./data/example.dev')
test_data = load_data('./data/example.test')

# Tokenizer.
tokenizer = Tokenizer(dict_path, do_lower_case=True)

# Category mapping: id 0 is 'O'; each entity class gets a B id (odd) and an I id (even).
labels = ['PER', 'LOC', 'ORG']
id2label = dict(enumerate(labels))
label2id = {j: i for i, j in id2label.items()}
num_labels = len(labels) * 2 + 1


class data_generator(DataGenerator):
    """Batch generator: turns [text, label] chunks into padded
    (token_ids, segment_ids) inputs and BIO label-id sequences."""

    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, batch_labels = [], [], []
        for is_end, item in self.sample(random):
            token_ids, labels = [tokenizer._token_start_id], [0]
            for w, l in item:
                w_token_ids = tokenizer.encode(w)[0][1:-1]  # strip [CLS]/[SEP]
                if len(token_ids) + len(w_token_ids) < maxlen:
                    token_ids += w_token_ids
                    if l == 'O':
                        labels += [0] * len(w_token_ids)
                    else:
                        B = label2id[l] * 2 + 1
                        I = label2id[l] * 2 + 2
                        labels += ([B] + [I] * (len(w_token_ids) - 1))
                else:
                    break  # sentence exceeds maxlen: truncate
            token_ids += [tokenizer._token_end_id]
            labels += [0]
            segment_ids = [0] * len(token_ids)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append(labels)
            if len(batch_token_ids) == self.batch_size or is_end:
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                batch_labels = sequence_padding(batch_labels)
                yield [batch_token_ids, batch_segment_ids], batch_labels
                batch_token_ids, batch_segment_ids, batch_labels = [], [], []


# The original bert4keras example builds a BERT model; for ALBERT pass model='albert'.
model = build_transformer_model(
    config_path,
    checkpoint_path,
    model='albert',
)
output_layer = 'Transformer-FeedForward-Norm'
albert_output = model.get_layer(output_layer).get_output_at(bert_layers - 1)

# BiLSTM head on top of the ALBERT embeddings.
# NOTE(review): the Dropout / TimeDistributed layers below were later found
# (see the 2021/3/11 entry) to drop F1 from ~0.95 to ~0.8 on this dataset.
lstm = Bidirectional(LSTM(units=128, return_sequences=True), name="bi_lstm")(albert_output)
drop = Dropout(0.1, name="dropout")(lstm)
dense = TimeDistributed(Dense(num_labels, activation="softmax"), name="time_distributed")(drop)

output = Dense(num_labels)(dense)
CRF = ConditionalRandomField(lr_multiplier=crf_lr_multiplier)  # re-binds the keras_contrib name
output = CRF(output)

model = Model(model.input, output)
model.summary()

model.compile(
    loss=CRF.sparse_loss,
    optimizer=Adam(learing_rate),
    metrics=[CRF.sparse_accuracy]
)


class NamedEntityRecognizer(ViterbiDecoder):
    """Viterbi-decoding NER tagger built on the trained model."""

    def recognize(self, text):
        tokens = tokenizer.tokenize(text)
        while len(tokens) > 512:
            tokens.pop(-2)  # drop tokens before [SEP] until within BERT's 512 limit
        mapping = tokenizer.rematch(text, tokens)
        token_ids = tokenizer.tokens_to_ids(tokens)
        segment_ids = [0] * len(token_ids)
        nodes = model.predict([[token_ids], [segment_ids]])[0]
        labels = self.decode(nodes)
        entities, starting = [], False
        for i, label in enumerate(labels):
            if label > 0:
                if label % 2 == 1:  # odd label ids are B-tags
                    starting = True
                    entities.append([[i], id2label[(label - 1) // 2]])
                elif starting:
                    entities[-1][0].append(i)
                else:
                    starting = False
            else:
                starting = False
        return [(text[mapping[w[0]][0]:mapping[w[-1]][-1] + 1], l)
                for w, l in entities]


# Shared decoder instance; its transition matrix is refreshed every epoch.
# NOTE(review): this line is missing from the scraped original (it only appears
# in the later inference snippet) — without it evaluate() raises NameError.
NER = NamedEntityRecognizer(trans=K.eval(CRF.trans), starts=[0], ends=[0])


def evaluate(data):
    """Return (f1, precision, recall) of NER.recognize over `data`."""
    X, Y, Z = 1e-10, 1e-10, 1e-10  # epsilons avoid division by zero
    for d in tqdm(data):
        text = ''.join([i[0] for i in d])
        R = set(NER.recognize(text))
        T = set([tuple(i) for i in d if i[1] != 'O'])
        X += len(R & T)
        Y += len(R)
        Z += len(T)
    f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
    return f1, precision, recall


class Evaluate(keras.callbacks.Callback):
    """Per-epoch evaluation; saves the weights with the best validation F1."""

    def __init__(self):
        self.best_val_f1 = 0

    def on_epoch_end(self, epoch, logs=None):
        trans = K.eval(CRF.trans)  # refresh CRF transition matrix for decoding
        NER.trans = trans
        print(NER.trans)
        f1, precision, recall = evaluate(valid_data)
        # Keep the best model.
        if f1 >= self.best_val_f1:
            self.best_val_f1 = f1
            model.save_weights('best_model.weights')
        print('valid: f1: %.5f, precision: %.5f, recall: %.5f, best f1: %.5f\n' %
              (f1, precision, recall, self.best_val_f1))
        f1, precision, recall = evaluate(test_data)
        print('test: f1: %.5f, precision: %.5f, recall: %.5f\n' %
              (f1, precision, recall))


if __name__ == '__main__':
    evaluator = Evaluate()
    train_generator = data_generator(train_data, batch_size)
    model.fit_generator(
        train_generator.forfit(),
        steps_per_epoch=len(train_generator),
        epochs=epochs,
        callbacks=[evaluator]
    )
else:
    model.load_weights('best_model.weights')

# ===== 模型評估 (model evaluation) =====
2021/3/11:今早,查看Albert+BiLSTM+CRF模型運行結果,發現其精度很低,僅為0.8左右。然而,使用同樣的數據,Albert+CRF模型精度在0.95以上。→→→思考其中原因,嘗試調整代碼:①嘗試調整LSTM相關參數(dropout),甚至去除dropout,皆無改善。②嘗試去除dropout與dense層。dropout的作用?防止模型過擬合,但我認為,其使用需要看場景,參考:為什么模型加入dropout層后變得更差了?最后dense層的作用?我認為,可以將其理解為分類輸出層,因此模型中有CRF用于輸出轉換,故可能不需要dense層。參考:LSTM模型后增加Dense(全連接)層的目的是什么?→→→去除下面代碼后兩行后,Albert+BiLSTM+CRF模型精度在0.95以上。至于模型原理,待深究。
# Fixed Albert-BiLSTM-CRF head: feed the ALBERT embeddings straight into the
# BiLSTM.  The Dropout and TimeDistributed(Dense) layers (commented out below)
# were removed because with them F1 was only ~0.8; without them it is >0.95.
lstm = Bidirectional(LSTM(units=128, return_sequences=True), name="bi_lstm")(albert_output) #drop = Dropout(0.2, name="dropout")(lstm) #dense = TimeDistributed(Dense(num_labels, activation="softmax"), name="time_distributed")(drop)讀寫文件
2021/3/12:上午,一直在嘗試Python讀寫文件,如此簡單之事,竟耗費我兩小時之久。原因:總是報錯'open' object has no attribute 'readlines'。解決思路:新建一個py文件,在里面進行讀寫操作,可行。然而,同樣的語句,在Albert+BiLSTM+CRF模型py文件中,不可行。→這說明,語句本身沒錯,可能是Albert+BiLSTM+CRF模型py文件中變量/函數等名稱與讀寫語句沖突。→的確如此,Albert+BiLSTM+CRF模型py文件的開頭,有“from bert4keras.snippets import open, ViterbiDecoder”,此"open"非彼"open"。
# Inference: load the best weights and tag every line of an input file.
model.load_weights('best_model.weights')
NER = NamedEntityRecognizer(trans=K.eval(CRF.trans), starts=[0], ends=[0])

# NOTE(review): `open` here is bert4keras.snippets.open, not the builtin —
# see the 2021/3/12 entry above.  The original opened the result file and
# never closed it, so buffered output could be lost; `with` on both files
# guarantees they are flushed and closed.
with open("D:\Asian elephant\gao A_geography_NER\A_geography_NER\data\\result.txt", 'w') as r, \
        open("D:\Asian elephant\gao A_geography_NER\A_geography_NER\data\\t.txt", 'r', encoding='utf-8') as tt:
    content = tt.readlines()
    for line in content:
        ner = NER.recognize(line)
        print(ner, file=r)

# ===== 模型訓練 (model training) =====
2021/3/14:訓練模型(迭代3次,CRF層學習率倍數crf_lr_multiplier設為1000,其他參數設置如下)。
訓練數據:現有標注數據集+自己標注的數據;測試數據:自己標注的數據;驗證數據:自己標注的數據。
耗時:純CPU,迭代一次大約需要7小時。
結果LOW:epoch 1 →1304/1304:loss: 3.9929 - sparse_accuracy: 0.9648,test: f1: 0.13333, precision: 0.41176, recall: 0.07955,valid: f1: 0.15493, precision: 0.64706, recall: 0.08800, best f1: 0.15493
epoch 2→1304/1304:loss: 0.5454 - sparse_accuracy: 0.9849,test: f1: 0.25455, precision: 0.63636, recall: 0.15909,valid: f1: 0.18919, precision: 0.60870, recall: 0.11200, best f1: 0.18919
epoch 3→test與valid的precision達0.7以上
# Training hyperparameters for the 2021/3/14 run.  In the scraped original
# these were collapsed onto one line, so everything after the first '#' was
# dead comment; restored to one assignment per line as clearly intended.
maxlen = 256              # maximum retained text length
epochs = 3                # number of training epochs
batch_size = 16           # samples fed to the model per training step
bert_layers = 12
learing_rate = 1e-5       # the fewer bert_layers, the larger the learning rate should be
crf_lr_multiplier = 1000  # scale up the CRF layer's learning rate when necessary

# ===== 各種bug及其解決 (assorted bugs and fixes) =====
ValueError: substring not found
bug之“substring not found”2021/3/5:問題:迭代三次的模型已訓練完畢,但將所有數據放入模型時,得到上述bug。解決:解決bug并不難,甚至無需了解其原理,只需進行比對——多試幾次,發現數據中報錯行的規律。本以為是標點符號的問題,但排查過后,了解到,是字母的問題。
ValueError: not enough values to unpack (expected 2, got 1)
2021/3/13:問題:其他設置一致,僅是使用的數據不同,精度結果卻大相徑庭。使用Albert+BiLSTM+CRF模型代碼包自帶的訓練數據用于訓練模型,使用自己標注的少量數據用于測試與驗證,得到較好的結果;但在訓練數據中,加上自己標注的少量數據,一起用于訓練,卻得到很差的結果。解決:仍是,找不同。我標注的數據與原數據有何不同?答:是否有'\n',這“不起眼”的'\n',卻有很重要的作用(如下)。
2021/3/15:新增一些自己標注的數據,而后,程序又報錯。錯誤原因:類似于2021/3/13那次報錯原因,仍是數據里的格式問題(字符/空格/換行符多余或缺失),但本次錯誤更為細致——文件末尾兩個換行符的缺失,而這兩個換行符十分重要(見代碼中的for l in f.split('\n\n'): #查找雙換行符)。解決方案:仍是對比正確數據VS我的報錯數據,①以為是數據中空格的問題(上次是此原因報錯),就一直糾結空格;②對比的所謂“正確數據”并非原始的、真正正確的數據,導致遲遲未能解決。
def load_data(filename):
    """Load the annotated data (train / test / validation sets).

    The file has one "char TAG" pair per line, separated by a single space,
    with a blank line (double newline) between sentences.  Returns a list of
    sentences; each sentence is a list of [text, label] chunks, e.g.
    [['梁子老寨', 'LOC'], ['黑山', 'LOC'], ['勐乃通達', 'LOC']].
    """
    D = []
    with open(filename, encoding='utf-8') as f:
        f = f.read()                         # read the whole file
        for l in f.split('\n\n'):            # sentences separated by a blank line
            if not l:                        # skip empty fragments
                continue                     # continue, not break: keep later sentences
            d, last_flag = [], ''
            for c in l.split('\n'):          # one "char TAG" pair per line
                # Raises ValueError ("not enough values to unpack") on
                # malformed lines — see the 2021/3/13 and 2021/3/15 entries.
                char, this_flag = c.split(' ')
                if this_flag == 'O' and last_flag == 'O':
                    d[-1][0] += char         # extend the running O chunk
                elif this_flag == 'O' and last_flag != 'O':
                    d.append([char, 'O'])    # start a new O chunk
                elif this_flag[:1] == 'B':   # tag starts with 'B': new entity
                    # 'B-LOC' -> label 'LOC'; following I-tags append their
                    # chars, so '梁子老寨' becomes one ('梁子老寨', 'LOC') chunk.
                    d.append([char, this_flag[2:]])
                else:
                    d[-1][0] += char         # 'I-...' continues the current entity
                last_flag = this_flag
            D.append(d)
    return D

# ===== 總結 (summary) =====
以上是生活随笔為你收集整理的【NLP_命名实体识别】Albert+BiLSTM+CRF模型训练、评估与使用的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: linux内核远程漏洞,Linux内核远
- 下一篇: PhotoShop2018安装与破解教程