word2vec原理(五):skip-gram和CBOW模型代码实现
生活随笔
收集整理的這篇文章主要介紹了
word2vec原理(五):skip-gram和CBOW模型代码实现
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
目錄
代碼一
代碼二
第一部分代碼對于skip-gram和CBOW模型是通用的,第二部分是實現skip-gram模型的代碼。
代碼一:
import collections
import os
import zipfile

try:
    from six.moves.urllib.request import urlretrieve
except ImportError:
    # six is optional: on Python 3 the stdlib location can be used directly.
    from urllib.request import urlretrieve

# http://mattmahoney.net/dc/textdata.html
dataset_link = 'http://mattmahoney.net/dc/'
zip_file = 'text8.zip'

# (punctuation mark, replacement token) pairs applied in order by
# text_processing(); the tokens are padded with spaces so that they
# become separate words after split().
_PUNCTUATION_TOKENS = (
    ('.', ' <period> '),
    (',', ' <comma> '),
    ('"', ' <quotation> '),
    (';', ' <semicolon> '),
    ('!', ' <exclamation> '),
    ('?', ' <question> '),
    ('(', ' <paren_l> '),
    (')', ' <paren_r> '),
    ('--', ' <hyphen> '),
    (':', ' <colon> '),
)


def cbk(a, b, c):
    """Print download progress; used as the ``reporthook`` of ``urlretrieve``.

    Args:
        a: Number of data blocks transferred so far.
        b: Size of one data block, in bytes.
        c: Total size of the remote file, in bytes. A value <= 0 is treated
            as "size unknown".
    """
    if c <= 0:
        # No usable total size: a percentage would be negative (c == -1) or
        # raise ZeroDivisionError (c == 0), so report the byte count instead.
        print('%d bytes' % (a * b))
        return
    # The last block usually overshoots the file size, hence the clamp.
    per = min(100.0 * a * b / c, 100)
    print('%.2f%%' % per)


def data_download(zip_file):
    """Download the dataset archive unless it already exists locally.

    Args:
        zip_file: Archive file name; it is appended to ``dataset_link`` to
            form the URL and is also used as the local destination path.

    Returns:
        None.
    """
    if not os.path.exists(zip_file):
        # urlretrieve() copies the remote resource straight to a local file
        # and calls cbk() to report progress.
        zip_file, _ = urlretrieve(dataset_link + zip_file, zip_file, cbk)
        print('File downloaded successfully!')
    return None


def extracting(extracted_folder, zip_file):
    """Extract every member of ``zip_file`` into ``extracted_folder``.

    Nothing is done when ``extracted_folder`` already exists as a directory.

    Args:
        extracted_folder: Destination directory for the extracted files.
        zip_file: Path of the zip archive to extract.
    """
    if not os.path.isdir(extracted_folder):
        with zipfile.ZipFile(zip_file) as zf:
            zf.extractall(extracted_folder)


def text_processing(ft8_text):
    """Lower-case the text, tokenize punctuation and split into words.

    Each punctuation mark listed in ``_PUNCTUATION_TOKENS`` is replaced by a
    named token such as ``<period>`` so that it survives whitespace
    splitting as a word of its own.

    Args:
        ft8_text: The raw corpus as a single string.

    Returns:
        list of str: The tokens of the processed text.
    """
    ft8_text = ft8_text.lower()
    for mark, token in _PUNCTUATION_TOKENS:
        ft8_text = ft8_text.replace(mark, token)
    return ft8_text.split()


def remove_lowerfreword(ft_tokens, threshold=7):
    """Drop rare words, keeping only tokens that occur more than ``threshold`` times.

    Args:
        ft_tokens: List of tokens (the corpus in reading order).
        threshold: A token is kept only if its count is strictly greater
            than this value. Defaults to 7.

    Returns:
        list: The retained tokens, in their original order.
    """
    # Counter maps each distinct token to its number of occurrences.
    word_cnt = collections.Counter(ft_tokens)
    shortlisted_words = [w for w in ft_tokens if word_cnt[w] > threshold]
    # Preview: the first 15 retained tokens (corpus order, not by frequency).
    print(shortlisted_words[:15])
    # The article reports 16616688 total / 53721 unique tokens on text8.
    print('Total number of shortlisted_words', len(shortlisted_words))
    print('Unique number of shortlisted_words', len(set(shortlisted_words)))
    return shortlisted_words


def dict_creation(shortlisted_words):
    """Build the vocabulary lookup tables, ordered by descending frequency.

    Args:
        shortlisted_words: List of tokens the vocabulary is built from.

    Returns:
        tuple: ``(dictionary, rev_dictionary)`` where ``dictionary`` maps
        word -> integer id and ``rev_dictionary`` maps integer id -> word.
        Id 0 is the most frequent word.
    """
    counts = collections.Counter(shortlisted_words)
    vocabulary = sorted(counts, key=counts.get, reverse=True)
    rev_dictionary = dict(enumerate(vocabulary))  # integer id -> word
    dictionary = {word: ii for ii, word in rev_dictionary.items()}  # word -> integer id
    return dictionary, rev_dictionary


# ---------------------------------------------------------------------------
# Library notes (from the article):
# 1. six is a Python 2/3 compatibility library. six.moves covers functions
#    whose location changed between Python 2 and 3; importing them through
#    six.moves hides those differences.
# 2. zipfile.ZipFile(zip_file) opens the archive zip_file.
#    ZipFile.extractall([path[, members[, pwd]]]) extracts all files of the
#    archive (into the current directory unless path is given).
#    Parameters:
#      path    - folder in which the extracted files are stored
#      members - names (or ZipInfo objects) of the files to extract
#      pwd     - password used for extraction
#
# Code 2:
import collections
import os
import random
import time

import numpy as np
import tensorflow as tf
from sklearn.manifold import TSNE

from text_processing import *


def subsampling(words_cnt):
    """Randomly drop frequent words (sub-sampling) to thin out stop words.

    Each word w is dropped with probability ``1 - sqrt(thresh / freq(w))``,
    where ``freq(w)`` is its relative frequency in ``words_cnt``. Words whose
    frequency is below ``thresh`` get a negative drop probability and are
    therefore always kept.

    Args:
        words_cnt: The corpus as a list of hashable word ids.

    Returns:
        list: The ids that survived sub-sampling, in their original order.
    """
    thresh = 0.00005
    word_counts = collections.Counter(words_cnt)
    total_count = len(words_cnt)
    freqs = {word: count / total_count for word, count in word_counts.items()}
    p_drop = {word: 1 - np.sqrt(thresh / freqs[word]) for word in word_counts}
    train_words = [word for word in words_cnt if p_drop[word] < random.random()]
    return train_words


def skipG_target_set_generation(batch_, batch_index, word_window):
    """Collect the context words around one centre word for skip-gram.

    Args:
        batch_: List of word ids forming the current batch.
        batch_index: Position of the centre word inside ``batch_``.
        word_window: Maximum window radius; the actual radius is drawn
            uniformly from 1..word_window for every call.

    Returns:
        list: The distinct word ids found within the sampled radius on both
        sides of the centre position (the centre position itself excluded).
    """
    random_num = np.random.randint(1, word_window + 1)
    # Clamp at 0 so a negative start does not wrap around to the list's tail.
    words_start = max(batch_index - random_num, 0)
    words_stop = batch_index + random_num
    window_target = set(batch_[words_start:batch_index]
                        + batch_[batch_index + 1:words_stop + 1])
    return list(window_target)


def skipG_batch_creation(short_words, batch_length, word_window):
    """Yield (centre words, context words) training pairs batch by batch.

    The corpus is truncated to a whole number of batches. For every centre
    word the centre id is repeated once per context word, so both yielded
    lists have the same length and line up element by element.

    Args:
        short_words: The training corpus as a list of word ids.
        batch_length: Number of consecutive corpus words per batch.
        word_window: Maximum context radius, forwarded to
            ``skipG_target_set_generation``.

    Yields:
        tuple: ``(input_words, label_words)`` for one batch.
    """
    batch_cnt = len(short_words) // batch_length
    print('batch_cnt=', batch_cnt)
    short_words = short_words[:batch_cnt * batch_length]
    for word_index in range(0, len(short_words), batch_length):
        input_words, label_words = [], []
        word_batch = short_words[word_index:word_index + batch_length]
        for index_, batch_input in enumerate(word_batch):
            batch_label = skipG_target_set_generation(word_batch, index_, word_window)
            label_words.extend(batch_label)
            input_words.extend([batch_input] * len(batch_label))
        yield input_words, label_words


# The corpus is expected at dataset/text8. To obtain it, run
# data_download(zip_file) and extracting('dataset', zip_file) once
# beforehand (extracting() returns None, it only unpacks the archive).
with open('dataset/text8') as ft_:
    full_text = ft_.read()

ft_tokens = text_processing(full_text)  # list of word tokens
shortlisted_words = remove_lowerfreword(ft_tokens)
dictionary, rev_dictionary = dict_creation(shortlisted_words)
words_cnt = [dictionary[word] for word in shortlisted_words]  # word -> integer id
train_words = subsampling(words_cnt)
print('train_words=', len(train_words))

# 1. Graph inputs: centre-word ids and their context-word labels.
tf_graph = tf.Graph()
with tf_graph.as_default():
    input_ = tf.placeholder(tf.int32, [None], name='input_')
    label_ = tf.placeholder(tf.int32, [None, None], name='label_')

# 2. Embedding matrix and lookup (word id -> 300-dimensional vector).
with tf_graph.as_default():
    word_embed = tf.Variable(tf.random_uniform((len(rev_dictionary), 300), -1, 1))
    embedding = tf.nn.embedding_lookup(word_embed, input_)

# 3. Loss and optimizer.
vocabulary_size = len(rev_dictionary)
with tf_graph.as_default():
    sf_weights = tf.Variable(tf.truncated_normal((vocabulary_size, 300), stddev=0.1))
    sf_bias = tf.Variable(tf.zeros(vocabulary_size))
    # Sampled softmax: the loss is computed against 100 randomly sampled
    # classes per step instead of the whole vocabulary.
    loss_fn = tf.nn.sampled_softmax_loss(weights=sf_weights,
                                         biases=sf_bias,
                                         labels=label_,
                                         inputs=embedding,
                                         num_sampled=100,
                                         num_classes=vocabulary_size)
    cost_fn = tf.reduce_mean(loss_fn)
    optim = tf.train.AdamOptimizer().minimize(cost_fn)

# 4. Validation: pick 8 ids from [0, 100) and 8 ids from [1000, 1100) and
#    compute their cosine similarity to every word in the vocabulary.
with tf_graph.as_default():
    validation_cnt = 16
    validation_dict = 100
    validation_words = np.array(
        random.sample(range(validation_dict), validation_cnt // 2))
    validation_words = np.append(
        validation_words,
        random.sample(range(1000, 1000 + validation_dict), validation_cnt // 2))
    validation_data = tf.constant(validation_words, dtype=tf.int32)
    # Rows scaled to unit length, so the matmul below yields cosine similarity.
    # NOTE(review): `keep_dims` is the older TF1 spelling of `keepdims` -
    # confirm against the installed TensorFlow version.
    normalization_embed = word_embed / (
        tf.sqrt(tf.reduce_sum(tf.square(word_embed), 1, keep_dims=True)))
    validation_embed = tf.nn.embedding_lookup(normalization_embed, validation_data)
    word_similarity = tf.matmul(validation_embed, tf.transpose(normalization_embed))

epochs = 2
batch_length = 1000
word_window = 10

# Checkpoints are stored under model_checkpoint/.
checkpoint_dir = 'model_checkpoint'
if not os.path.isdir(checkpoint_dir):
    # Make sure the target directory exists before saver.save() writes to it.
    os.makedirs(checkpoint_dir)

with tf_graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=tf_graph) as sess:
    iteration = 1
    loss = 0
    sess.run(tf.global_variables_initializer())
    print("Begin training-----------")
    for e in range(1, epochs + 1):
        batches = skipG_batch_creation(train_words, batch_length, word_window)
        start = time.time()
        for x, y in batches:
            # Labels are fed as a column vector: one context word per row.
            train_loss, _ = sess.run([cost_fn, optim],
                                     feed_dict={input_: x,
                                                label_: np.array(y)[:, None]})
            loss += train_loss
            if iteration % 100 == 0:
                end = time.time()
                print('Epoch {}/{}'.format(e, epochs),
                      ', Iteration:{}'.format(iteration),
                      ', Avg.Training loss:{:.4f}'.format(loss / 100),
                      ', Processing:{:.4f} sec/batch'.format((end - start) / 100))
                loss = 0
                start = time.time()
            if iteration % 2000 == 0:
                similarity_ = word_similarity.eval()
                for i in range(validation_cnt):
                    validated_words = rev_dictionary[validation_words[i]]
                    top_k = 8
                    # argsort sorts ascending, so negate to rank by descending
                    # similarity; index 0 (expected to be the word itself) is
                    # skipped.
                    nearest = (-similarity_[i, :]).argsort()[1:top_k + 1]
                    log = 'Nearest to %s:' % validated_words
                    for k in range(top_k):
                        close_word = rev_dictionary[nearest[k]]
                        log = '%s %s,' % (log, close_word)
                    print(log)
            iteration += 1  # one step per processed batch
    save_path = saver.save(sess, "model_checkpoint/skipGram_text8.ckpt")
    embed_mat = sess.run(normalization_embed)

with tf_graph.as_default():
    saver = tf.train.Saver()

with tf.Session(graph=tf_graph) as sess:
    saver.restore(sess, tf.train.latest_checkpoint('model_checkpoint'))
    # Overwrites the normalized matrix above with the raw (unnormalized)
    # embeddings restored from the checkpoint.
    embed_mat = sess.run(word_embed)

# Project the embeddings of the first 250 word ids to 2-D with t-SNE for
# visualization.
word_graph = 250
tsne = TSNE()
word_embedding_tsne = tsne.fit_transform(embed_mat[:word_graph, :])

# Visualization result (figure from the original article):
? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ?
總結
以上是生活随笔為你收集整理的word2vec原理(五):skip-gram和CBOW模型代码实现的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 文本挖掘预处理流程总结(2)— 英文
- 下一篇: 文本挖掘预处理:向量化与Hash Tri