Sentence-Pair Classification with Pretrained Models (Paddle, PyTorch)
Contents
- 1. Paddle
- 2. PyTorch
- 3. Submitting Results
This post loads a pretrained model in each of the two frameworks and fine-tunes it to classify sentence pairs.
Data download: Qianyan (千言) dataset: text similarity
1. Paddle
With paddlenlp you can load a pretrained model directly, which is convenient.
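For example, the Chinese ERNIE-Gram encoder and its matching tokenizer each take a single `from_pretrained` call (the same calls the full script below makes):

```python
import paddlenlp

# one call each for the tokenizer and the pretrained encoder
tokenizer = paddlenlp.transformers.ErnieGramTokenizer.from_pretrained("ernie-gram-zh")
model = paddlenlp.transformers.ErnieGramModel.from_pretrained("ernie-gram-zh")
```

The full training script: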
```python
# %%
# Competition: https://aistudio.baidu.com/aistudio/competition/detail/45
import os
import time
from functools import partial

import numpy as np
import paddle
import paddle.distributed as dist  # parallel training
import paddle.nn as nn
import paddle.nn.functional as F
import paddlenlp
from paddlenlp.data import Pad, Stack, Tuple
from paddlenlp.datasets import load_dataset
from paddlenlp.transformers import LinearDecayWithWarmup


def read(data, datasetname, predict=False):
    """Turn the raw dataset into an iterator of dicts; lcqmc names its
    fields query/title, the other datasets use sentence1/sentence2."""
    if not predict:
        for d in data:
            label = d["label"]
            if datasetname != "lcqmc":
                text1, text2 = d["sentence1"], d["sentence2"]
            else:
                text1, text2 = d["query"], d["title"]
            yield {"label": label, "text1": text1, "text2": text2}
    else:
        for d in data:
            if datasetname != "lcqmc":
                text1, text2 = d["sentence1"], d["sentence2"]
            else:
                text1, text2 = d["query"], d["title"]
            yield {"text1": text1, "text2": text2}


def convert_data(data, tokenizer, datasetname, max_seq_len=512, is_test=False):
    """Encode one example into model inputs."""
    text1, text2 = data["text1"], data["text2"]
    encoded_inputs = tokenizer(text=text1, text_pair=text2, max_seq_len=max_seq_len)
    input_ids = encoded_inputs["input_ids"]
    token_type_ids = encoded_inputs["token_type_ids"]
    if not is_test:
        label = np.array([data["label"]], dtype="int64")
        return input_ids, token_type_ids, label
    return input_ids, token_type_ids


class PretrainedModel(nn.Layer):
    """Pretrained encoder + a binary classification head on the [CLS] vector."""

    def __init__(self, pretrained_model, dropout=None):
        super().__init__()
        self.ptm = pretrained_model
        self.dropout = nn.Dropout(dropout if dropout is not None else 0.1)
        self.clf = nn.Linear(self.ptm.config["hidden_size"], 2)

    def forward(self, input_ids, token_type_ids=None, position_ids=None, attention_mask=None):
        _, cls_embedding = self.ptm(input_ids, token_type_ids, position_ids, attention_mask)
        cls_embedding = self.dropout(cls_embedding)
        logits = self.clf(cls_embedding)
        probs = F.softmax(logits, axis=1)  # forward already returns probabilities
        return probs


class Recongnizer:
    """Wraps dataset loading, training, evaluation and prediction."""

    def __init__(self, datasetname, state_dict_path=None):  # optional path to saved parameters
        self.seed = 100
        paddle.seed(self.seed)  # original had `paddle.seed = self.seed`, which never seeds
        self.batch_size = 128
        self.epochs = 20
        self.max_seq_len = 512
        self.datasetname = datasetname
        # paddlenlp loads the benchmark splits with one call
        self.train_ds, self.dev_ds, self.test_ds = load_dataset(
            datasetname, splits=["train", "dev", "test"])
        # tokenizer that matches the pretrained model
        self.tokenizer = paddlenlp.transformers.ErnieGramTokenizer.from_pretrained("ernie-gram-zh")
        # model zoo: https://gitee.com/paddlepaddle/PaddleNLP/blob/develop/docs/model_zoo/transformers.rst
        self.pretrained_model = paddlenlp.transformers.ErnieGramModel.from_pretrained("ernie-gram-zh")
        self.model = PretrainedModel(self.pretrained_model)
        if state_dict_path:  # resume from saved parameters if given
            try:
                state_dict = paddle.load(state_dict_path)
                self.model.set_dict(state_dict)
            except Exception:
                print("failed to load model parameters!")
        self.pathname = "checkpoint"
        self.global_step = 0
        if not os.path.exists(self.pathname):
            os.mkdir(self.pathname)
        self.save_dir = ""
        self.save_param_path = ""

    def fit(self):
        # wrap the raw splits with the read() iterator
        train_ds = load_dataset(read, data=self.train_ds, datasetname=self.datasetname, lazy=False)
        dev_ds = load_dataset(read, data=self.dev_ds, datasetname=self.datasetname, lazy=False)
        test_ds = load_dataset(read, data=self.test_ds, datasetname=self.datasetname,
                               predict=True, lazy=False)
        # show a few examples
        for i, example in enumerate(train_ds):
            if i < 5:
                print(example)
        input_ids, token_type_ids, label = convert_data(train_ds[0], self.tokenizer, self.datasetname)
        print(input_ids)
        # [1, 692, 811, 445, 2001, 497, 5, 654, 21, 692, 811, 614, 356, 314, 5, 291, 21, 2,
        #  329, 445, 2001, 497, 5, 654, 21, 692, 811, 614, 356, 314, 5, 291, 21, 2]
        print(token_type_ids)
        # [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        #  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
        print(label)
        # [1]
        # per-example conversion function
        trans_func = partial(convert_data, tokenizer=self.tokenizer,
                             datasetname=self.datasetname, max_seq_len=self.max_seq_len)
        # batch + pad: sequences are padded to the longest one in the batch
        # (not to max_seq_len), which keeps the padding overhead low
        batchify_fn = lambda samples, fn=Tuple(
            Pad(axis=0, pad_val=self.tokenizer.pad_token_id),       # input_ids
            Pad(axis=0, pad_val=self.tokenizer.pad_token_type_id),  # token_type_ids
            Stack(dtype="int64")                                    # labels
        ): [d for d in fn(samples)]
        # sampler + data loaders
        batch_sampler = paddle.io.DistributedBatchSampler(train_ds, batch_size=self.batch_size, shuffle=True)
        train_data_loader = paddle.io.DataLoader(
            dataset=train_ds.map(trans_func),
            batch_sampler=batch_sampler,
            collate_fn=batchify_fn,
            return_list=True)
        batch_sampler = paddle.io.BatchSampler(dev_ds, batch_size=self.batch_size, shuffle=False)
        dev_data_loader = paddle.io.DataLoader(
            dataset=dev_ds.map(trans_func),
            batch_sampler=batch_sampler,
            collate_fn=batchify_fn,
            return_list=True)
        num_training_steps = len(train_data_loader) * self.epochs
        # linearly decayed learning rate (no warmup)
        lr_scheduler = LinearDecayWithWarmup(5e-5, num_training_steps, 0.0)
        # parameters that receive weight decay (everything except bias/norm)
        decay_params = [p.name for n, p in self.model.named_parameters()
                        if not any(nd in n for nd in ["bias", "norm"])]
        # gradient clipping
        clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
        optimizer = paddle.optimizer.AdamW(
            learning_rate=lr_scheduler,
            parameters=self.model.parameters(),
            weight_decay=0.0,
            apply_decay_param_fun=lambda x: x in decay_params,
            grad_clip=clip)
        criterion = paddle.nn.loss.CrossEntropyLoss()
        # note: the "acc" in the logs is actually precision; F1 below is
        # built from this precision plus a separate recall metric
        metric = paddle.metric.Precision()
        t_start = time.time()
        F1 = 0  # best F1 so far
        for epoch in range(1, self.epochs + 1):
            for step, batch in enumerate(train_data_loader, start=1):
                input_ids, token_type_ids, labels = batch
                probs = self.model(input_ids=input_ids, token_type_ids=token_type_ids)
                loss = criterion(probs, labels)
                metric.update(np.argmax(probs, axis=1), labels)
                acc = metric.accumulate()
                self.global_step += 1
                if self.global_step % 10 == 0:
                    print("global step %d, epoch: %d, batch: %d, loss: %.5f, acc: %.5f, speed: %.2f step/s"
                          % (self.global_step, epoch, step, loss, acc, 10 / (time.time() - t_start)))
                    t_start = time.time()
                loss.backward()
                optimizer.step()
                lr_scheduler.step()
                optimizer.clear_grad()
                if self.global_step % 100 == 0:
                    _, F1, improve = self.evaluate(criterion, metric, dev_data_loader, F1, "dev")
                    if improve:
                        print("eval F1: {:.3f}, model saved to: ".format(F1) + self.save_param_path)
                    else:
                        print("best F1: {:.3f}, no improvement this eval".format(F1))
        print("----- training finished -----")
        # reload the best parameters and write the test predictions
        state_dict = paddle.load(self.save_param_path)
        self.model.set_dict(state_dict)
        self.writeToFile(test_ds)

    @paddle.no_grad()
    def evaluate(self, criterion, metric, data_loader, F1, phase="dev"):
        self.model.eval()
        metric.reset()
        recall = paddle.metric.Recall()
        recall.reset()
        losses = []
        prob_list = []
        for batch in data_loader:
            input_ids, token_type_ids, labels = batch
            probs = self.model(input_ids=input_ids, token_type_ids=token_type_ids)
            prob_list.extend(probs)
            loss = criterion(probs, labels)
            losses.append(loss.numpy())
            metric.update(np.argmax(probs, axis=1), labels)
            recall.update(np.argmax(probs, axis=1), labels)
        acc = metric.accumulate()  # precision
        rec = recall.accumulate()
        f1 = 0 if (acc + rec) == 0.0 else 2 * acc * rec / (acc + rec)
        improve = False
        if f1 > F1:  # keep the parameters with the best F1
            F1 = f1
            improve = True
            self.save_dir = os.path.join(self.pathname, "best_model_state")
            os.makedirs(self.save_dir, exist_ok=True)
            self.save_param_path = os.path.join(self.save_dir,
                                                "model_state_pdparams_F1_" + str(round(F1, 4)))
            paddle.save(self.model.state_dict(), self.save_param_path)
            self.tokenizer.save_pretrained(self.save_dir)
        print("eval {} loss: {:.5}, acc: {:.5}, recall: {:.5}".format(phase, np.mean(losses), acc, rec))
        self.model.train()
        metric.reset()
        return prob_list, F1, improve

    def predict(self, text1, text2):
        """Predict a single sentence pair; returns a (1, 2) probability array."""
        encoded_inputs = self.tokenizer(text=text1, text_pair=text2, max_seq_len=self.max_seq_len)
        input_ids = encoded_inputs["input_ids"]
        token_type_ids = encoded_inputs["token_type_ids"]
        predict_data_loader = [(input_ids, token_type_ids)]
        batch_probs = []
        self.model.eval()
        with paddle.no_grad():
            for batch_data in predict_data_loader:
                input_ids, token_type_ids = batch_data
                input_ids = paddle.to_tensor([input_ids])
                token_type_ids = paddle.to_tensor([token_type_ids])
                # forward() already applies softmax; the original applied
                # F.softmax a second time here, which distorts the probabilities
                batch_prob = self.model(input_ids=input_ids, token_type_ids=token_type_ids).numpy()
                batch_probs.append(batch_prob)
            batch_probs = np.concatenate(batch_probs, axis=0)
        return batch_probs

    def writeToFile(self, test_ds):
        """Predict the test split and write a <dataset>.tsv submission file."""
        with open(self.datasetname + ".tsv", "w", encoding="utf-8") as f:
            f.write("index\tprediction\n")
            for i, d in enumerate(test_ds):
                prob = self.predict(d["text1"], d["text2"])
                label = 1 if prob[0][1] >= 0.5 else 0
                f.write(str(i) + "\t" + str(label) + "\n")


if __name__ == "__main__":
    dist.init_parallel_env()  # initialize the parallel environment
    # launch with: python -m paddle.distributed.launch --gpus '0,1' xxx.py &
    # parallel-training references:
    # https://aistudio.baidu.com/aistudio/projectdetail/1222066
    # https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/02_paddle2.0_develop/06_device_cn.html
    datasetnames = ["lcqmc", "bq_corpus", "paws-x"]
    for name in datasetnames:
        model = Recongnizer(name)
        model.fit()
```
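Once a run has saved its best parameters, the `Recongnizer` can be rebuilt for inference via its `state_dict_path` argument. A usage sketch; the checkpoint filename and the sentence pair are hypothetical, only the call pattern comes from the class above:

```python
# hypothetical checkpoint path produced by an earlier fit() run
recognizer = Recongnizer(
    "lcqmc",
    state_dict_path="checkpoint/best_model_state/model_state_pdparams_F1_0.88")
probs = recognizer.predict("世界上什么东西最小", "世界上最小的东西是什么")  # illustrative pair
print(probs)  # shape (1, 2): [P(not similar), P(similar)]
```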
2. PyTorch
Pretrained model download: https://huggingface.co/nghuyong/ernie-1.0
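The post stops at the download link for the PyTorch side, so what follows is only a minimal loading-and-scoring sketch, assuming the HuggingFace `transformers` package. Note that `AutoModelForSequenceClassification` attaches a randomly initialized two-class head, so the model must be fine-tuned (mirroring the Paddle loop above) before its outputs mean anything:

```python
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-1.0")
# num_labels=2 adds a fresh, untrained classification head
model = AutoModelForSequenceClassification.from_pretrained("nghuyong/ernie-1.0", num_labels=2)

# encode a sentence pair the same way the Paddle pipeline does
inputs = tokenizer("世界上什么东西最小", "世界上最小的东西是什么",
                   truncation=True, max_length=512, return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits
probs = torch.softmax(logits, dim=-1)  # (1, 2) class probabilities
```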
3. Submitting Results
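The training script writes one prediction file per task (`lcqmc.tsv`, `bq_corpus.tsv`, `paws-x.tsv`), each with an `index\tprediction` header. Assuming the usual Qianyan submission format (check the competition page linked in the script), the three files are bundled into a single archive and uploaded to AI Studio:

```python
import zipfile

# bundle the three prediction files produced by writeToFile
with zipfile.ZipFile("submit.zip", "w") as zf:
    for name in ["lcqmc.tsv", "bq_corpus.tsv", "paws-x.tsv"]:
        zf.write(name)
```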