Source Code Analysis: "Topic-to-Essay Generation with Neural Networks"
This post walks through the released implementation of "Topic-to-Essay Generation with Neural Networks" file by file, as a reading reference.
1. Reading the configuration file Config.py
The complete code is as follows:
#coding:utf-8
class Config(object):
    data_dir = 'Data/'
    vec_file = 'Data/vec.txt'
    init_scale = 0.04
    learning_rate = 0.001
    max_grad_norm = 10       # gradient clipping threshold
    num_layers = 2
    num_steps = 101          # one more than the maximum number of words in a sentence
    hidden_size = 20
    word_embedding_size = 10
    max_epoch = 30
    max_max_epoch = 80
    keep_prob = 0.5          # probability that each element is kept by the dropout layer
    lr_decay = 1.0
    batch_size = 16
    vocab_size = 7187
    num_keywords = 5
    save_freq = 10           # how often (in epochs) the model is saved to disk
    model_path = './Model_News'  # path for saving or loading the model
    # parameters for generation
    len_of_generation = 16   # number of tokens to generate
    save_time = 20           # which saved checkpoint to load for generation
    is_sample = True         # True: sample from the distribution; otherwise use argmax
    BeamSize = 2

2. Reading the data preprocessing file Preprocess.py
1) Read_WordVec function: processes the word vectors produced by word2vec and adds four special tokens
wordLS is the word list and vec_ls is the list of corresponding vectors; the four added tokens (PAD, START, END, UNK) each get a zero vector of length word_embedding_size, while the vectors trained by word2vec are kept as-is.
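Judging from the parsing loop (one header line is skipped by fvec.readline(), then each line is a word followed by word_embedding_size floats), vec.txt is expected to be in the usual word2vec text format. A hypothetical two-word file and the vocabulary it would produce (the words and numbers below are made up purely for illustration):

    vec.txt (word_embedding_size = 10):
        7187 10                          <- header line, skipped
        天氣 0.12 -0.03 0.44 ... (10 floats in total)
        公園 0.05 0.21 -0.17 ... (10 floats in total)
        ...

    wordLS / vec_ls after Read_WordVec:
        index 0: PAD   -> [0]*10
        index 1: START -> [0]*10
        index 2: END   -> [0]*10
        index 3: UNK   -> [0]*10
        index 4: 天氣  -> its word2vec vector
        index 5: 公園  -> its word2vec vector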
2) Read_Data function: reads the training data
The trainingdata list stores one tuple per document, of the form (document words, keywords), e.g. (['w1', 'w2', 'w3', ...], ['k1', 'k2', 'k3', 'k4', 'k5']).
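Concretely, Read_Data expects each line of TrainingData.txt to be a whitespace-separated document, the marker </d>, and then exactly five keywords. A made-up line (the words are invented for illustration) would be parsed like this:

    今天 天氣 很 好 我們 去 公園 散步 </d> 天氣 公園 散步 陽光 周末

    doc      = ['今天', '天氣', '很', '好', '我們', '去', '公園', '散步']
    keywords = ['天氣', '公園', '散步', '陽光', '周末']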
3) data_iterator function: converts the data into index form
epoch_size (the number of batches per epoch) = len(trainingdata) (the number of training examples) // batch_size (examples per batch)
raw_data: the list of document word sequences in the batch;
key_words: the list of keyword index sequences, converted to np.int64;
data: the matrix of document index sequences; each document gets START = 1 at the front and END = 2 at the back, with unknown words mapped to UNK = 3, giving rows of the form [1, ..., 3, ..., 2], then converted to np.int64
x: the document from position 0 up to the second-to-last position; y: the document from position 1 to the end, i.e. for "hello": x = "hell", y = "ello"
mask: 1 wherever x is non-zero (a real token) and 0 wherever x is 0 (padding); see the toy example after the code fragment below
key_words = np.array(key_words, dtype=np.int64)
x = data[:, 0:num_steps]
y = data[:, 1:]
mask = np.float32(x != 0)
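A toy illustration of the shift-by-one targets and the padding mask (the word indices are hypothetical and num_steps is shortened to 6 for readability; the real config uses num_steps = 101):

import numpy as np

num_steps = 6
doc_idx = [5, 9, 7]                       # hypothetical indices of a 3-word document

tmp = np.array([1] + doc_idx + [2], dtype=np.int64)   # 1 = START, 2 = END
data = np.zeros((1, num_steps + 1), dtype=np.int64)
data[0, :tmp.shape[0]] = tmp              # [[1 5 9 7 2 0 0]]

x = data[:, 0:num_steps]                  # [[1 5 9 7 2 0]]  input at each step
y = data[:, 1:]                           # [[5 9 7 2 0 0]]  the next word to predict
mask = np.float32(x != 0)                 # [[1. 1. 1. 1. 1. 0.]]  PAD positions are ignored
print(x, y, mask)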
4) Writing the batches iteratively to a TFRecord file

# A TFRecord file stores the samples and labels together; doing this as a
# preprocessing step before the model improves efficiency and saves memory.
writer = tf.python_io.TFRecordWriter("coverage_data")
# serialize the example to a string with the proto object
serialized = example.SerializeToString()
# write the serialized object to disk
writer.write(serialized)

The complete code is as follows:
#coding:utf-8
import numpy as np
import tensorflow as tf
import cPickle, os, collections
import Config

# load the configuration class Config()
config = Config.Config()
# four special tokens are added to the vocabulary
config.vocab_size += 4

# Read_WordVec: add the four special tokens with zero vectors of length
# word_embedding_size and keep the vectors trained by word2vec
def Read_WordVec(config):
    with open(config.vec_file, 'r') as fvec:
        # wordLS: the word list
        wordLS = []
        # vec_ls: the corresponding vector list
        vec_ls = []
        fvec.readline()
        wordLS.append(u'PAD')
        vec_ls.append([0]*config.word_embedding_size)
        wordLS.append(u'START')
        vec_ls.append([0]*config.word_embedding_size)
        wordLS.append(u'END')
        vec_ls.append([0]*config.word_embedding_size)
        wordLS.append(u'UNK')
        vec_ls.append([0]*config.word_embedding_size)
        for line in fvec:
            line = line.split()
            try:
                word = line[0].decode('utf-8')
                vec = [float(i) for i in line[1:]]
                assert len(vec) == config.word_embedding_size
                wordLS.append(word)
                vec_ls.append(vec)
            except:
                print line[0]
        assert len(wordLS) == config.vocab_size
    word_vec = np.array(vec_ls, dtype=np.float32)
    cPickle.dump(word_vec, open('word_vec.pkl', 'w'), protocol=cPickle.HIGHEST_PROTOCOL)
    cPickle.dump(wordLS, open('word_voc.pkl', 'w'), protocol=cPickle.HIGHEST_PROTOCOL)
    return wordLS, word_vec

# Read_Data: the trainingdata list stores tuples of (document words, keywords)
def Read_Data(config):
    trainingdata = []
    with open(os.path.join(config.data_dir, 'TrainingData.txt'), 'r') as ftext:
        for line in ftext:
            line = line.decode('utf-8')
            tmp = line.split()
            idx = tmp.index('</d>')
            doc = tmp[:idx]
            keywords = tmp[idx+1:]
            assert len(keywords) == 5
            trainingdata.append((doc, keywords))
    return trainingdata

print 'loading the trainingdata...'
DATADIR = config.data_dir
vocab, _ = Read_WordVec(config)

data = Read_Data(config)

word_to_idx = { ch:i for i,ch in enumerate(vocab) }
idx_to_word = { i:ch for i,ch in enumerate(vocab) }
data_size, _vocab_size = len(data), len(vocab)
print 'data has %d document, size of word vocabular: %d.' % (data_size, _vocab_size)

# data_iterator: converts the text data to index form, adding the special
# markers to every document
def data_iterator(trainingdata, batch_size, num_steps):
    # epoch_size (batches per epoch) = number of training examples // batch_size
    epoch_size = len(trainingdata) // batch_size
    for i in range(epoch_size):
        batch_data = trainingdata[i*batch_size:(i+1)*batch_size]
        # raw_data: the document word lists of this batch
        raw_data = []
        # key_words: the keyword index lists, converted to np.int64 below
        key_words = []
        for it in batch_data:
            raw_data.append(it[0])
            tmp = []
            for wd in it[1]:
                tmp.append(word_to_idx[wd])
            key_words.append(tmp)
        # data: document index matrix; START = 1, END = 2, UNK = 3,
        # so each row looks like [1, ..., 3, ..., 2, 0, 0, ...]
        data = np.zeros((len(raw_data), num_steps+1), dtype=np.int64)
        for i in range(len(raw_data)):
            doc = raw_data[i]
            tmp = [1]
            for wd in doc:
                if wd in vocab:
                    tmp.append(word_to_idx[wd])
                else:
                    tmp.append(3)
            tmp.append(2)
            tmp = np.array(tmp, dtype=np.int64)
            _size = tmp.shape[0]
            data[i][:_size] = tmp
        key_words = np.array(key_words, dtype=np.int64)
        x = data[:, 0:num_steps]
        y = data[:, 1:]
        mask = np.float32(x != 0)
        yield (x, y, mask, key_words)

train_data = data

# A TFRecord file stores the samples and labels together; preprocessing them
# before training improves efficiency and saves memory.
writer = tf.python_io.TFRecordWriter("coverage_data")

dataLS = []
for step, (x, y, mask, key_words) in enumerate(data_iterator(train_data, config.batch_size, config.num_steps)):
    example = tf.train.Example(
        # Example contains a Features proto object
        features=tf.train.Features(
            # Features contains a map of string to Feature proto objects
            feature={
                # A Feature contains one of either a int64_list,
                # float_list, or bytes_list
                'input_data': tf.train.Feature(int64_list=tf.train.Int64List(value=x.reshape(-1).astype("int64"))),
                'target': tf.train.Feature(int64_list=tf.train.Int64List(value=y.reshape(-1).astype("int64"))),
                'mask': tf.train.Feature(float_list=tf.train.FloatList(value=mask.reshape(-1).astype("float"))),
                'key_words': tf.train.Feature(int64_list=tf.train.Int64List(value=key_words.reshape(-1).astype("int64"))),
            }))
    # serialize the example to a string with the proto object
    serialized = example.SerializeToString()
    # write the serialized object to disk
    writer.write(serialized)
print 'total step: ', step
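Model.py is not reproduced in this post, but the filename='coverage_data' argument and the tf.train.start_queue_runners call in Train.py suggest the records are consumed through the TF1 queue-based input pipeline. The sketch below shows how a record written above could be parsed back; it is an assumption about the shape of the repository's reader code, not a copy of it (shapes follow the writer: each example holds one whole batch):

def read_records(filename, config):
    # queue of input files; tf.train.start_queue_runners must be running in the session
    filename_queue = tf.train.string_input_producer([filename])
    reader = tf.TFRecordReader()
    _, serialized_example = reader.read(filename_queue)
    features = tf.parse_single_example(
        serialized_example,
        features={
            'input_data': tf.FixedLenFeature([config.batch_size * config.num_steps], tf.int64),
            'target':     tf.FixedLenFeature([config.batch_size * config.num_steps], tf.int64),
            'mask':       tf.FixedLenFeature([config.batch_size * config.num_steps], tf.float32),
            'key_words':  tf.FixedLenFeature([config.batch_size * config.num_keywords], tf.int64),
        })
    x = tf.reshape(features['input_data'], [config.batch_size, config.num_steps])
    y = tf.reshape(features['target'], [config.batch_size, config.num_steps])
    mask = tf.reshape(features['mask'], [config.batch_size, config.num_steps])
    key_words = tf.reshape(features['key_words'], [config.batch_size, config.num_keywords])
    return x, y, mask, key_words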
3. Reading the Train.py file

First the command-line flag parameters are loaded, then the main function is executed.
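The post only reproduces main(); it refers to config, config_tf, Model and run_epoch, which live at module level in Train.py but are not shown. A hedged sketch of what that setup might look like (the import path for Model and the GPU options are assumptions, not taken from the repository):

#coding:utf-8
import tensorflow as tf
import Config
from Model import Model          # the model class from the repository (assumed path)
# run_epoch(...) is also defined in Train.py but is not shown in this post

config = Config.Config()
config.vocab_size += 4           # the same four special tokens as in Preprocess.py

# session configuration passed as tf.Session(config=config_tf) in main()
config_tf = tf.ConfigProto()
config_tf.gpu_options.allow_growth = True   # assumed setting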
if __name__ == "__main__":
    tf.app.run()

The main function is as follows:
def main(_):
    # create the graph and the session
    with tf.Graph().as_default(), tf.Session(config=config_tf) as session:
        # random uniform initializer
        initializer = tf.random_uniform_initializer(-config.init_scale,
                                                    config.init_scale)
        # build the model
        with tf.variable_scope("model", reuse=None, initializer=initializer):
            m = Model(is_training=True, config=config, filename='coverage_data')

        tf.global_variables_initializer().run()

        # saver for all variables
        model_saver = tf.train.Saver(tf.global_variables())
        tf.train.start_queue_runners(sess=session)
        #model_saver = tf.train.Saver(tf.all_variables())

        for i in range(config.max_max_epoch):
            lr_decay = config.lr_decay ** max(i - config.max_epoch, 0.0)
            m.assign_lr(session, config.learning_rate * lr_decay)

            print("Epoch: %d Learning rate: %.4f" % (i + 1, session.run(m.lr)))
            train_perplexity = run_epoch(session, m, m.train_op)
            print("Epoch: %d Train Perplexity: %.3f" % (i + 1, train_perplexity))

            if (i + 1) % config.save_freq == 0:
                print 'model saving ...'
                model_saver.save(session, config.model_path + '--%d' % (i + 1))
                print 'Done!'
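One detail worth noting is the learning-rate schedule: lr_decay ** max(i - max_epoch, 0.0) keeps the rate constant while i <= max_epoch and multiplies it by lr_decay for every epoch beyond that. With the values in Config.py (lr_decay = 1.0) the rate never actually decays; the small check below also tries a hypothetical lr_decay = 0.97 to show what a real decay would do:

learning_rate, max_epoch = 0.001, 30

for lr_decay in (1.0, 0.97):            # 1.0 is the configured value, 0.97 is hypothetical
    for i in (0, 29, 30, 31, 40):       # selected epoch indices
        decay = lr_decay ** max(i - max_epoch, 0.0)
        print("lr_decay=%.2f epoch=%d lr=%.6f" % (lr_decay, i + 1, learning_rate * decay))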
Summary
The above is the complete source-code walkthrough of "Topic-to-Essay Generation with Neural Networks"; hopefully it helps with any problems you run into while reading the implementation.