PyTorch Notes: Building Skip-gram
For the theory behind skip-gram, see: NLP 筆記:Skip-gram_劉文巾的博客-CSDN博客
1 Import libraries
import numpy as np
import torch
from torch import nn, optim
import random
from collections import Counter
import matplotlib.pyplot as plt

2 Dataset
2.1 Training data
# training data
text = 'I like dog i like cat i like \
animal dog cat animal apple \
cat dog like dog fish milk like dog \
cat eyes like i like apple apple \
i hate apple i movie book music like \
cat dog hate cat dog like he is man she \
is woman king is man queen is woman'

2.2 Parameter settings
# hyperparameters
EMBEDDING_DIM = 2      # dimension of the word vectors
PRINT_EVERY = 1000     # how often to print the loss
EPOCHS = 1000          # number of training epochs
BATCH_SIZE = 5         # size of each training batch
N_SAMPLES = 3          # number of negative samples
WINDOW_SIZE = 5        # maximum context-window size
FREQ = 0               # words appearing no more than this many times are removed
DELETE_WORDS = False   # whether to drop some high-frequency words (subsampling)

2.3 Text preprocessing
2.3.1 Keep only words above a frequency threshold
# text preprocessing
def preprocess(text, FREQ):
    text = text.lower()
    words = text.split()
    word_counts = Counter(words)  # count how many times each word appears
    trimmed_words = []
    for word in words:
        if word_counts[word] > FREQ:
            trimmed_words.append(word)
    return trimmed_words

words = preprocess(text, FREQ)  # keep only words that appear more than FREQ times

2.3.2 Build the word<->id dictionaries
# build the dictionaries word<->id and id<->word
vocab = set(words)
vocab2int = {}
int2vocab = {}
for c, w in enumerate(vocab):
    vocab2int[w] = c
    int2vocab[c] = w

vocab2int, int2vocab

2.3.3 Convert the words to ids
# convert the text to numeric ids
int_words = []
for w in words:
    int_words.append(vocab2int[w])

2.3.4 Compute word frequencies
# count word occurrences
int_word_counts = Counter(int_words)
total_count = len(int_words)
word_freqs = {}
for w, c in int_word_counts.items():
    word_freqs[w] = c / total_count

2.3.5 Subsampling (remove overly frequent words)
# subsampling: drop very frequent words
if DELETE_WORDS:
    t = 1e-5
    prob_drop = {}
    for w in int_word_counts:
        prob_drop[w] = 1 - np.sqrt(t / word_freqs[w])  # drop probability of each word
    train_words = []
    for w in int_words:
        if random.random() < (1 - prob_drop[w]):
            train_words.append(w)
else:
    train_words = int_words
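For reference, this is the subsampling heuristic used in word2vec: with threshold $t$ and relative frequency $f(w)$, each occurrence of word $w$ is dropped with probability

$$P_{\text{drop}}(w) = 1 - \sqrt{\frac{t}{f(w)}},$$

so very frequent words are discarded more aggressively, while words with $f(w) \le t$ are always kept.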
2.3.6 Probability of being chosen as a negative sample

word_freqs = np.array(list(word_freqs.values()))
noise_dist = torch.from_numpy(word_freqs ** 0.75 / np.sum(word_freqs ** 0.75))

noise_dist
'''
tensor([0.0969, 0.1314, 0.1088, 0.0969, 0.0425, 0.0715, 0.0253, 0.0253, 0.0253,
        0.0425, 0.0253, 0.0253, 0.0253, 0.0253, 0.0715, 0.0425, 0.0253, 0.0425,
        0.0253, 0.0253], dtype=torch.float64)
'''
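In other words, the probability of drawing word $w$ as a negative sample is its unigram frequency raised to the 3/4 power and renormalized,

$$P_n(w) = \frac{f(w)^{3/4}}{\sum_{w'} f(w')^{3/4}},$$

which flattens the distribution slightly, so rare words are sampled a bit more often than their raw frequency would suggest.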
2.3.7 Get the context words around a center word

# get the target words (the context words of a center word)
def get_target(words, idx, WINDOW_SIZE):
    target_window = np.random.randint(1, WINDOW_SIZE + 1)
    # randomly sample the size of the current window; a flexible window size tends to train better
    if (idx - target_window) > 0:
        start_point = idx - target_window  # first index of the context window
    else:
        start_point = 0
    if (idx + target_window) < len(words):
        end_point = idx + target_window    # last index of the context window
    else:
        end_point = len(words) - 1
    targets = set(words[start_point:idx] + words[idx + 1:end_point + 1])
    # the distinct words inside the window
    return list(targets)
    # the context words, within the sampled window, of the center word at index idx
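A quick sanity check of get_target on a toy id list (a hypothetical example, not from the original post; the exact output varies because the window size is sampled at random and sets are unordered):

# hypothetical sanity check of get_target
toy_words = [0, 1, 2, 3, 4, 5, 6]
print(get_target(toy_words, 3, 2))
# e.g. [1, 2, 4, 5] if the sampled window size is 2, or [2, 4] if it is 1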
2.3.8 Generate batches one by one

# batch the data
def get_batch(words, BATCH_SIZE, WINDOW_SIZE):
    n_batches = len(words) // BATCH_SIZE
    # how many batches the words are split into
    words = words[:n_batches * BATCH_SIZE]
    # drop the few leftover words at the end
    for idx in range(0, len(words), BATCH_SIZE):
        batch_x, batch_y = [], []
        batch = words[idx:idx + BATCH_SIZE]  # words covered by the current batch
        for i in range(len(batch)):
            x = batch[i]
            y = get_target(batch, i, WINDOW_SIZE)
            # denote the window size returned for position i as k_i
            # (different positions i in a batch may get different window sizes)
            batch_x.extend([x] * len(y))  # adds k_i copies of the center word
            batch_y.extend(y)             # adds the k_i context words
        yield batch_x, batch_y
        # yield the current batch_x and batch_y; the next call resumes here and produces the next batch
        # batch_x: [sigma{k_i}]
        # batch_y: [sigma{k_i}]
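A minimal sketch of consuming the generator (an added illustration; the actual pairs depend on the randomly sampled window sizes):

# peek at the first batch produced by the generator
batch_x, batch_y = next(get_batch(train_words, BATCH_SIZE, WINDOW_SIZE))
print(len(batch_x) == len(batch_y))     # True: one copy of the center word per context word
print(list(zip(batch_x, batch_y))[:5])  # first few (center id, context id) training pairs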
3 Define the model

class SkipGramNeg(nn.Module):
    def __init__(self, n_vocab, n_embed, noise_dist):
        super().__init__()
        self.n_vocab = n_vocab        # vocabulary size (len(vocab2int))
        self.n_embed = n_embed        # dimension of the embeddings (EMBEDDING_DIM)
        self.noise_dist = noise_dist
        # probability of each word being drawn as a negative sample
        # noise_dist: [n_vocab]

        # embedding layers
        self.in_embed = nn.Embedding(n_vocab, n_embed)
        # weight matrix for the center words, maps n_vocab -> n_embed
        self.out_embed = nn.Embedding(n_vocab, n_embed)
        # weight matrix for the context words, maps n_vocab -> n_embed

        # initialize the embedding weights
        self.in_embed.weight.data.uniform_(-1, 1)
        self.out_embed.weight.data.uniform_(-1, 1)
        # restrict the initial parameters to (-1, 1)

    # forward pass of the input (center) words: input -> hidden
    def forward_input(self, input_words):
        # k_i is the window size of position i in a batch
        # input_words: [sigma{k_i}]
        input_vectors = self.in_embed(input_words)
        return input_vectors
        # input_vectors: [sigma{k_i}, n_embed]

    # forward pass of the target words (the context words inside the center word's window)
    def forward_output(self, output_words):
        # output_words: [sigma{k_i}]
        output_vectors = self.out_embed(output_words)
        return output_vectors
        # output_vectors: [sigma{k_i}, n_embed]

    # forward pass of the negative-sample words
    def forward_noise(self, size, N_SAMPLES):
        noise_dist = self.noise_dist
        # draw negative samples from the noise distribution
        noise_words = torch.multinomial(noise_dist,
                                        size * N_SAMPLES,
                                        replacement=True)
        noise_vectors = self.out_embed(noise_words).view(size, N_SAMPLES, self.n_embed)
        return noise_vectors
        # noise_vectors: [sigma{k_i}, N_SAMPLES, n_embed]
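As a quick shape check (a sketch added here, not part of the original post), the three forward methods can be exercised on a dummy batch, using the noise_dist and constants defined above:

# dummy shape check
m = SkipGramNeg(len(vocab2int), EMBEDDING_DIM, noise_dist=noise_dist)
dummy = torch.LongTensor([0, 1, 2, 3])      # pretend sigma{k_i} = 4
print(m.forward_input(dummy).shape)         # torch.Size([4, 2])
print(m.forward_output(dummy).shape)        # torch.Size([4, 2])
print(m.forward_noise(4, N_SAMPLES).shape)  # torch.Size([4, 3, 2])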
4 Define the loss function

For negative sampling, see NLP 筆記:Skip-gram_劉文巾的博客-CSDN博客
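Concretely, for a center word c with vector $v_c$, an observed context word o with vector $u_o$, and $K$ negative samples $w_1,\dots,w_K$ drawn from the noise distribution, the loss implemented below for one (center, context) pair is

$$-\log\sigma(u_o^{\top} v_c) - \sum_{k=1}^{K}\log\sigma(-u_{w_k}^{\top} v_c),$$

and the class averages this quantity over all pairs in the batch.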
class NegativeSamplingLoss(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, input_vectors, output_vectors, noise_vectors):
        # k_i is the window size of position i in a batch
        # input_vectors:  [sigma{k_i}, n_embed]
        # output_vectors: [sigma{k_i}, n_embed]
        # noise_vectors:  [sigma{k_i}, N_SAMPLES, n_embed]
        BATCH_SIZE, embed_size = input_vectors.shape

        # reshape the input and output vectors for batch matrix multiplication
        input_vectors = input_vectors.view(BATCH_SIZE, embed_size, 1)
        # input_vectors: [sigma{k_i}, n_embed, 1]
        output_vectors = output_vectors.view(BATCH_SIZE, 1, embed_size)
        # output_vectors: [sigma{k_i}, 1, n_embed]

        test = torch.bmm(output_vectors, input_vectors)
        # test: [sigma{k_i}, 1, 1]
        # the score of the context word given the center word, i.e. u_o^T v_c

        # loss of the target (positive) words
        out_loss = torch.bmm(output_vectors, input_vectors).sigmoid().log()
        # out_loss: [sigma{k_i}, 1, 1]
        out_loss = out_loss.squeeze()
        # out_loss: [sigma{k_i}]

        # loss of the negative samples
        noise_loss = torch.bmm(noise_vectors.neg(), input_vectors).sigmoid().log()
        # noise_loss: [sigma{k_i}, N_SAMPLES, 1]
        noise_loss = noise_loss.squeeze().sum(1)
        # noise_loss: [sigma{k_i}]

        # combine the two losses:
        # we want the positive-sample probability to be large and the negative-sample probability small,
        # i.e. the sum of the two log terms to be as large as possible.
        # Since the loss is minimized, the sum is negated.
        return -(out_loss + noise_loss).mean()

5 Declare the model, the loss function and the optimizer
model = SkipGramNeg(len(vocab2int), EMBEDDING_DIM, noise_dist=noise_dist)
criterion = NegativeSamplingLoss()
optimizer = optim.Adam(model.parameters(), lr=0.003)

6 Train the model
# training
steps = 0
for e in range(EPOCHS):
    # get the input (center) words and the target (context) words
    for input_words, target_words in get_batch(train_words, BATCH_SIZE, WINDOW_SIZE):
        # input_words:  [sigma{k_i}]
        # target_words: [sigma{k_i}]
        steps += 1
        inputs, targets = torch.LongTensor(input_words), torch.LongTensor(target_words)

        # input, output and negative-sample vectors
        input_vectors = model.forward_input(inputs)
        # k_i is the window size of position i in a batch
        # input_vectors: [sigma{k_i}, n_embed]
        output_vectors = model.forward_output(targets)
        # output_vectors: [sigma{k_i}, n_embed]
        size, _ = input_vectors.shape
        # size: sigma{k_i}
        noise_vectors = model.forward_noise(size, N_SAMPLES)
        # noise_vectors: [sigma{k_i}, N_SAMPLES, n_embed]

        # compute the loss
        loss = criterion(input_vectors, output_vectors, noise_vectors)

        # print the loss
        if steps % PRINT_EVERY == 0:
            print("loss:", loss)

        # back-propagate and update the parameters
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

'''
loss: tensor(2.3455, grad_fn=<NegBackward>)
loss: tensor(1.6729, grad_fn=<NegBackward>)
loss: tensor(1.6398, grad_fn=<NegBackward>)
loss: tensor(1.5920, grad_fn=<NegBackward>)
loss: tensor(1.4348, grad_fn=<NegBackward>)
loss: tensor(1.5463, grad_fn=<NegBackward>)
loss: tensor(1.4360, grad_fn=<NegBackward>)
loss: tensor(1.6348, grad_fn=<NegBackward>)
loss: tensor(1.4676, grad_fn=<NegBackward>)
loss: tensor(1.6141, grad_fn=<NegBackward>)
'''

7 Visualize the results
As noted in item 4, sub-item 3) of NLP 筆記:Skip-gram_劉文巾的博客-CSDN博客, each row of the weight matrix is the encoding of one word as a point in the dense embedding space.
plt.figure(figsize=(20, 10))
for i, w in int2vocab.items():
    vectors = model.state_dict()["in_embed.weight"]
    x, y = float(vectors[i][0]), float(vectors[i][1])
    plt.scatter(x, y)
    plt.annotate(w,
                 xy=(x, y),
                 xytext=(5, 2),
                 textcoords='offset points',
                 ha='right',
                 va='bottom')
plt.show()
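Since each row of in_embed.weight is one word's learned vector, individual embeddings can also be inspected directly. A minimal sketch (assuming the training above has finished; the cosine-similarity check is an added illustration, not part of the original post):

import torch.nn.functional as F

# look up the learned vector of a single word ('king' is in the toy corpus above)
vectors = model.state_dict()["in_embed.weight"]
print(vectors[vocab2int['king']])   # a 2-dimensional tensor, the same point that was plotted

# cosine similarity between two word vectors, e.g. 'king' and 'man'
sim = F.cosine_similarity(vectors[vocab2int['king']].unsqueeze(0),
                          vectors[vocab2int['man']].unsqueeze(0))
print(sim)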