At its core this is an RNN text-encoding model combined with a conditional GAN: an RNN first encodes each text description into a vector, and that text vector is then fed into both the generator (G) and the discriminator (D).
Unlike an ordinary GAN, there is one extra error case here: an image that looks realistic but does not match its description must also be penalized. Without this term, the only information D receives is whether an image came from G, and it loses the ability to judge whether image and description agree.
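In loss form (my schematic restatement of what `get_loss` implements below with sigmoid cross-entropy and label smoothing of 0.1), the discriminator minimizes three terms: the real matching pair should score 1, while the generated image and the mismatched real image should both score 0:

$$
\mathcal{L}_D = -\log D(x, t) \;-\; \log\bigl(1 - D(G(z, t),\, t)\bigr) \;-\; \log\bigl(1 - D(\hat{x}, t)\bigr)
$$

where $x$ is a real image with matching text embedding $t$, $z$ is the noise vector, and $\hat{x}$ is a real image that does not match $t$; the generator minimizes $-\log D(G(z, t),\, t)$.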
Why is a noise input still needed? Because in general a sentence describes the content (what the flower looks like) rather than the style (mainly the background and the pose). In that case we want the noise to supply the style, so that the generated images come out more realistic and more diverse.
Moreover, feature visualization shows that z contributes specific styles, which compensates for the text description saying nothing about style: randomizing z injects different styles and so increases the realism and diversity of the generated samples, as the sketch below illustrates.
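A minimal numpy sketch of this idea (my own illustration; the 128 stands in for the `n_hidden` encoder output size defined below, and the generator call is only indicated in a comment): hold one caption embedding fixed and draw a fresh z per sample, so the content stays constant while the style varies.

```python
import numpy as np

noise_dim = 100
n_styles = 10

# one caption embedding (a random stand-in here), repeated for every style draw
caption_vec = np.random.randn(128).astype(np.float32)
caption_batch = np.tile(caption_vec, (n_styles, 1))   # (10, 128): same content in every row

# a different z per row: same content conditioning, different style
z_batch = np.random.uniform(-1, 1, size=(n_styles, noise_dim)).astype(np.float32)
# feeding (z_batch, caption_batch) to the generator below yields 10 style variants
```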
Three key parts:

1. Text processing: how do we extract the text information that serves as the generator's condition? First vectorize the text, then extract its information.
2. Image processing: add negative training examples, i.e. penalize the discriminator's score when the input text and image do not match. Build the input pairs: a correct image with its matching text, and a mismatched image with that same text (see the sketch after this list).
3. Build the input pipeline so that each text stays aligned with its image.
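A minimal sketch of the mismatched-pair construction (numpy only; the filenames are hypothetical placeholders — the pipeline code below does the same thing with the real filename arrays):

```python
import numpy as np

image_files = np.array(['image_0001.jpg', 'image_0002.jpg', 'image_0003.jpg'])
# captions keep their original order; the images are shuffled against them,
# so pairing captions with wrong_image_files yields mismatched (text, image) pairs
wrong_image_files = image_files[np.random.permutation(len(image_files))]
```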
```python
import tensorflow as tf
from gensim.models import word2vec
from gensim.models import Word2Vec
import pandas as pd
import glob
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline
from IPython import display

os.listdir('../input/gan-text-to-image-102flowers-rieyuguanghua')
```
```python
n_input = 100       # word2vec embedding size per word
n_hidden = 128      # GRU hidden size (also the text-condition vector size)
image_height = 64   # note: unused below; the generator actually emits 256x256 images
image_width = 64
image_depth = 3
noise_dim = 100
maxlength = 250     # recomputed from the corpus below
NUM_EPOCHS = 100
batch_size = 64
```
```python
if not os.path.exists('102flowers'):
    !mkdir 102flowers
    !tar zxvf ../input/102flowersdataset/102flowers.tgz -C ./102flowers/
display.clear_output()
```
```python
all_text_filename = glob.glob('../input/cvpr2016/cvpr2016_flowers/text_c10/class_*/image_*.txt')
all_text_filename.sort(key=lambda x: x.split('/')[-1])
all_image_filename = glob.glob('./102flowers/jpg/*.jpg')
all_image_filename.sort()

all_text_filename = np.array(all_text_filename)
all_image_filename = np.array(all_image_filename)

# shuffle the images against the captions to obtain the mismatched pairs
wrong_image_filename = all_image_filename[np.random.permutation(len(all_image_filename))]
dataset_image = tf.data.Dataset.from_tensor_slices((all_image_filename, wrong_image_filename))
```
```python
if not os.path.exists('../input/gan-text-to-image-102flowers-rieyuguanghua/all_text.txt'):
    with open('all_text.txt', 'at') as f:
        for a_text in all_text_filename:
            f.write(open(a_text).read().replace('\n', '') + '\n')
```
```python
if not os.path.exists('../input/gan-text-to-image-102flowers-rieyuguanghua/word_model'):
    sentences = word2vec.Text8Corpus('all_text.txt')
    model = word2vec.Word2Vec(sentences, size=100)
    model.save('word_model')
else:
    model = Word2Vec.load('../input/gan-text-to-image-102flowers-rieyuguanghua/word_model')
    !cp ../input/gan-text-to-image-102flowers-rieyuguanghua/all_text.txt ./
    !cp ../input/gan-text-to-image-102flowers-rieyuguanghua/word_model ./
```
```python
word_vectors = model.wv
# the RNN is unrolled to the length of the longest caption in the corpus
maxlength = max([len(open(a_text).read().split()) for a_text in all_text_filename])
n_steps = maxlength
```
```python
def pad(x, maxlength=200):
    """Zero-pad a list of word vectors to a fixed (maxlength, 100) matrix."""
    x1 = np.zeros((maxlength, 100))
    x1[:len(x)] = x
    return x1

def text_vec(text_filenames):
    """Turn each caption file into a (maxlength, 100) word-vector matrix."""
    vec = []
    for a_text in text_filenames:
        all_word = open(a_text).read().split()
        all_vec = [word_vectors[w] for w in all_word if w in word_vectors]
        vec.append(all_vec)
    data = pd.Series(vec)
    data = data.apply(pad, maxlength=maxlength)
    data_ = np.concatenate(data).reshape(len(data), maxlength, 100)
    return data_

data_text_emb = text_vec(all_text_filename)
```
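A quick sanity check on `pad` (my own illustration, using random stand-in vectors):

```python
demo = np.random.randn(2, 100)             # word vectors of a 2-word caption
padded = pad(demo, maxlength=maxlength)
print(padded.shape, padded[2:].any())      # -> (maxlength, 100) False: the tail is zeros
```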
```python
def read_image(image_filename):
    image = tf.read_file(image_filename)
    image = tf.image.decode_jpeg(image, channels=3)
    image = tf.image.resize_image_with_crop_or_pad(image, 512, 512)
    image = tf.image.resize_images(image, (256, 256))
    # min-max normalize to [0, 1]
    image = (image - tf.reduce_min(image)) / (tf.reduce_max(image) - tf.reduce_min(image))
    return image

def _pre_func(real_image_name, wrong_image_name):
    wrong_image = read_image(wrong_image_name)
    real_image = read_image(real_image_name)
    return real_image, wrong_image

dataset_image = dataset_image.map(_pre_func)
```
```python
dataset_image = dataset_image.batch(batch_size)
# a reinitializable iterator, so the dataset can be rebuilt each epoch
iterator = tf.data.Iterator.from_structure(dataset_image.output_types,
                                           dataset_image.output_shapes)
real_image_batch, wrong_image_batch = iterator.get_next()
```
```python
input_text = tf.placeholder(tf.float32, [None, n_steps, n_input])
inputs_noise = tf.placeholder(tf.float32, [None, noise_dim], name='inputs_noise')
```
```python
def length(shuru):
    """Count the non-padding timesteps of each sequence in a padded batch."""
    # a timestep is real iff any of its components is non-zero
    return tf.reduce_sum(tf.sign(tf.reduce_max(tf.abs(shuru), reduction_indices=2)),
                         reduction_indices=1)
```
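To see what `length` computes, here is a numpy analogue on a toy padded batch (my own illustration, not from the original post):

```python
import numpy as np

batch = np.zeros((2, 5, 3))   # 2 sequences, 5 timesteps, 3 features each
batch[0, :2] = 1.0            # first sequence: 2 real timesteps
batch[1, :4] = 1.0            # second sequence: 4 real timesteps
lengths = np.sign(np.abs(batch).max(axis=2)).sum(axis=1)
print(lengths)                # -> [2. 4.]
```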
```python
def text_rnn(input_text, batch_size=64, reuse=tf.AUTO_REUSE):
    """Encode a padded word-vector sequence into a single n_hidden vector."""
    cell = tf.contrib.rnn.GRUCell(
        n_hidden,
        kernel_initializer=tf.truncated_normal_initializer(stddev=0.0001),
        bias_initializer=tf.truncated_normal_initializer(stddev=0.0001),
        reuse=reuse)
    output, _ = tf.nn.dynamic_rnn(cell, input_text, dtype=tf.float32,
                                  sequence_length=length(input_text))
    # take each sequence's output at its last *valid* timestep:
    # flatten (batch, n_steps, hidden) to (batch*n_steps, hidden), then gather by index
    index = tf.range(0, batch_size) * n_steps + (tf.cast(length(input_text), tf.int32) - 1)
    flat = tf.reshape(output, [-1, int(output.get_shape()[2])])
    last = tf.gather(flat, index)
    return last
```
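The flatten-and-gather trick at the end of `text_rnn` is easier to see in numpy (again my own illustration): each sequence's output is taken at its last valid timestep rather than at the fixed final step, which would land on padding.

```python
import numpy as np

n_steps, hidden = 5, 4
output = np.arange(2 * n_steps * hidden).reshape(2, n_steps, hidden)  # RNN outputs
lengths = np.array([2, 4])                        # valid timesteps per sequence
index = np.arange(2) * n_steps + (lengths - 1)    # row index into the flattened outputs
last = output.reshape(-1, hidden)[index]          # (2, 4): one vector per sequence
print(last)
```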
```python
def get_generator(noise_img, image_depth, condition_label, is_train=True, alpha=0.2):
    with tf.variable_scope("generator", reuse=tf.AUTO_REUSE):
        # project the noise, apply leaky ReLU, then concatenate the text condition
        noise_img = tf.to_float(noise_img)
        noise_img = tf.layers.dense(noise_img, n_hidden)
        noise_img = tf.maximum(alpha * noise_img, noise_img)
        noise_img_ = tf.concat([noise_img, condition_label], 1)

        # 4x4x512 seed feature map
        layer1 = tf.layers.dense(noise_img_, 4 * 4 * 512)
        layer1 = tf.reshape(layer1, [-1, 4, 4, 512])
        layer1 = tf.layers.batch_normalization(layer1, training=is_train)
        layer1 = tf.nn.relu(layer1)
        layer1 = tf.nn.dropout(layer1, keep_prob=0.8)

        # stride-2 transposed convolutions double the spatial size at each step:
        # 4 -> 8 -> 16 -> 32 -> 64 -> 128 -> 256
        layer2 = tf.layers.conv2d_transpose(layer1, 256, 3, strides=2, padding='same')
        layer2 = tf.layers.batch_normalization(layer2, training=is_train)
        layer2 = tf.nn.relu(layer2)
        layer2 = tf.nn.dropout(layer2, keep_prob=0.8)

        layer3 = tf.layers.conv2d_transpose(layer2, 128, 3, strides=2, padding='same')
        layer3 = tf.layers.batch_normalization(layer3, training=is_train)
        layer3 = tf.nn.relu(layer3)
        layer3 = tf.nn.dropout(layer3, keep_prob=0.8)

        layer4 = tf.layers.conv2d_transpose(layer3, 64, 3, strides=2, padding='same')
        layer4 = tf.layers.batch_normalization(layer4, training=is_train)
        layer4 = tf.nn.relu(layer4)

        layer5 = tf.layers.conv2d_transpose(layer4, 32, 3, strides=2, padding='same')
        layer5 = tf.layers.batch_normalization(layer5, training=is_train)
        layer5 = tf.nn.relu(layer5)

        layer6 = tf.layers.conv2d_transpose(layer5, 16, 3, strides=2, padding='same')
        layer6 = tf.layers.batch_normalization(layer6, training=is_train)
        layer6 = tf.nn.relu(layer6)

        logits = tf.layers.conv2d_transpose(layer6, image_depth, 3, strides=2, padding='same')
        outputs = tf.tanh(logits)
        outputs = (outputs / 2) + 0.5          # map the tanh range [-1, 1] to [0, 1]
        outputs = tf.clip_by_value(outputs, 0.0, 1.0)
        return outputs
```
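A quick check on the output resolution (my own arithmetic): the seed feature map is 4×4 and each of the six stride-2 transposed convolutions doubles it, which is why `plot_images` below reshapes samples to (256, 256, 3) even though the unused `image_height`/`image_width` constants say 64.

```python
size = 4
for _ in range(6):   # layer2..layer6 plus the final logits layer
    size *= 2
print(size)          # -> 256
```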
```python
def get_discriminator(inputs_img, condition_label, reuse=tf.AUTO_REUSE, alpha=0.2):
    with tf.variable_scope("discriminator", reuse=reuse):
        # six stride-2 convolutions downsample 256x256 to 4x4
        layer1 = tf.layers.conv2d(inputs_img, 16, 3, strides=2, padding='same')
        layer1 = tf.maximum(alpha * layer1, layer1)   # leaky ReLU
        layer1 = tf.nn.dropout(layer1, keep_prob=0.8)

        layer2 = tf.layers.conv2d(layer1, 32, 3, strides=2, padding='same')
        layer2 = tf.layers.batch_normalization(layer2, training=True)
        layer2 = tf.maximum(alpha * layer2, layer2)
        layer2 = tf.nn.dropout(layer2, keep_prob=0.8)

        layer3 = tf.layers.conv2d(layer2, 64, 3, strides=2, padding='same')
        layer3 = tf.layers.batch_normalization(layer3, training=True)
        layer3 = tf.maximum(alpha * layer3, layer3)
        layer3 = tf.nn.dropout(layer3, keep_prob=0.8)

        layer4 = tf.layers.conv2d(layer3, 128, 3, strides=2, padding='same')
        layer4 = tf.layers.batch_normalization(layer4, training=True)
        layer4 = tf.maximum(alpha * layer4, layer4)

        layer5 = tf.layers.conv2d(layer4, 256, 3, strides=2, padding='same')
        layer5 = tf.layers.batch_normalization(layer5, training=True)
        layer5 = tf.maximum(alpha * layer5, layer5)

        layer6 = tf.layers.conv2d(layer5, 512, 3, strides=2, padding='same')
        layer6 = tf.layers.batch_normalization(layer6, training=True)
        layer6 = tf.maximum(alpha * layer6, layer6)

        # project the text condition, tile it over the 4x4 feature map,
        # and concatenate along the channel axis
        text_emb = tf.layers.dense(condition_label, 512)
        text_emb = tf.maximum(alpha * text_emb, text_emb)
        text_emb = tf.expand_dims(text_emb, 1)
        text_emb = tf.expand_dims(text_emb, 2)
        text_emb = tf.tile(text_emb, [1, 4, 4, 1])
        layer_concat = tf.concat([layer6, text_emb], 3)

        # a 1x1 convolution fuses the image and text features
        layer7 = tf.layers.conv2d(layer_concat, 512, 1, strides=1, padding='same')
        layer7 = tf.layers.batch_normalization(layer7, training=True)
        layer7 = tf.maximum(alpha * layer7, layer7)

        flatten = tf.reshape(layer7, (-1, 4 * 4 * 512))
        logits = tf.layers.dense(flatten, 1)
        outputs = tf.sigmoid(logits)
        return logits, outputs
```
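The expand-and-tile broadcast of the text embedding is worth isolating; a numpy analogue (my own illustration):

```python
import numpy as np

text_emb = np.random.randn(2, 512)                        # (batch, 512) text features
tiled = np.tile(text_emb[:, None, None, :], (1, 4, 4, 1))
print(tiled.shape)                                        # -> (2, 4, 4, 512)
# concatenated with the (2, 4, 4, 512) image features along the channel axis,
# every spatial location of the 4x4 map sees the full text vector
```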
```python
def get_loss(inputs_image, wrong_image, inputs_noise, condition_label, image_depth, smooth=0.1):
    g_outputs = get_generator(inputs_noise, image_depth, condition_label, is_train=True)
    # the discriminator scores three kinds of input under the same text condition
    d_logits_real, d_outputs_real = get_discriminator(inputs_image, condition_label)
    d_logits_fake, d_outputs_fake = get_discriminator(g_outputs, condition_label,
                                                      reuse=tf.AUTO_REUSE)
    d_logits_wrong, d_outputs_wrong = get_discriminator(wrong_image, condition_label,
                                                        reuse=tf.AUTO_REUSE)
    print(inputs_image.get_shape(), condition_label.get_shape())

    # generator: make fakes look real (with label smoothing)
    g_loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
        logits=d_logits_fake, labels=tf.ones_like(d_outputs_fake) * (1 - smooth)))
    # discriminator: real matching pairs -> ~1; fakes and mismatched pairs -> ~0
    d_loss_real = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
        logits=d_logits_real, labels=tf.ones_like(d_outputs_real) * (1 - smooth)))
    d_loss_fake = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
        logits=d_logits_fake, labels=tf.ones_like(d_outputs_fake) * smooth))
    d_loss_wrong = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(
        logits=d_logits_wrong, labels=tf.ones_like(d_outputs_wrong) * smooth))
    d_loss = d_loss_real + d_loss_fake + d_loss_wrong
    return g_loss, d_loss
```
```python
def get_optimizer(g_loss, d_loss, beta1=0.4, learning_rate=0.001):
    train_vars = tf.trainable_variables()
    g_vars = [var for var in train_vars if var.name.startswith("generator")]
    d_vars = [var for var in train_vars if var.name.startswith("discriminator")]
    # run the batch-norm moving-average updates together with the train ops
    with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
        g_opt = tf.train.AdamOptimizer(learning_rate, beta1=beta1).minimize(g_loss, var_list=g_vars)
        d_opt = tf.train.AdamOptimizer(learning_rate, beta1=beta1).minimize(d_loss, var_list=d_vars)
    return g_opt, d_opt
```
```python
def plot_images(samples):
    fig, axes = plt.subplots(nrows=1, ncols=10, sharex=True, sharey=True, figsize=(20, 2))
    for img, ax in zip(samples, axes):
        ax.imshow(img.reshape((256, 256, 3)))
        ax.get_xaxis().set_visible(False)
        ax.get_yaxis().set_visible(False)
    fig.tight_layout(pad=0)
```
```python
def show_generator_output(sess, n_images, inputs_noise, output_dim, test_text_vec):
    samples = sess.run(get_generator(inputs_noise, output_dim, test_text_vec, is_train=False))
    return samples
```
```python
n_samples = 10
learning_rate = 0.0002
beta1 = 0.5
losses = []
step = 0

last = text_rnn(input_text)
g_loss, d_loss = get_loss(real_image_batch, wrong_image_batch, inputs_noise,
                          last, image_depth, smooth=0.1)
g_train_opt, d_train_opt = get_optimizer(g_loss, d_loss, beta1, learning_rate)
saver = tf.train.Saver()
```
```python
with tf.Session() as sess:
    # resume from the latest checkpoint of a previous run
    model_file = tf.train.latest_checkpoint('../input/gan-text-to-image-102flowers-rieyuguanghua')
    saver.restore(sess, model_file)
    for epoch in range(791, 831):
        # reshuffle captions and images together, then permute the mismatched images
        index = np.random.permutation(len(all_image_filename))
        data_text_emb = data_text_emb[index]
        all_image_filename = all_image_filename[index]
        wrong_image_filename = all_image_filename[np.random.permutation(len(all_image_filename))]

        dataset_image = tf.data.Dataset.from_tensor_slices((all_image_filename, wrong_image_filename))
        dataset_image = dataset_image.map(_pre_func)
        dataset_image = dataset_image.repeat(1)
        dataset_image = dataset_image.batch(batch_size)
        dataset_image_op = iterator.make_initializer(dataset_image)
        sess.run(dataset_image_op)

        i = 0
        while True:
            try:
                batch_noise = np.random.uniform(-1, 1, size=(batch_size, noise_dim))
                text_emb_batch = data_text_emb[i: i + batch_size]
                i = i + batch_size
                _ = sess.run([g_train_opt, d_train_opt],
                             feed_dict={input_text: text_emb_batch, inputs_noise: batch_noise})
                step += 1
            except:
                # the epoch ends here: either the iterator is exhausted, or the final
                # partial batch no longer matches the fixed batch_size of the noise
                print('epoch', epoch, 'step', step)
                break

        if epoch % 10 == 0:
            n_samples = 10
            condition_text = data_text_emb[:n_samples]
            test_noise = np.random.uniform(-1, 1, size=[n_samples, noise_dim])
            last_test = text_rnn(input_text, batch_size=n_samples, reuse=tf.AUTO_REUSE)
            test_text_vec = sess.run(last_test, feed_dict={input_text: condition_text})
            samples = show_generator_output(sess, n_samples, test_noise, 3, test_text_vec)
            plot_images(samples)
            saver.save(sess, "./model11.ckpt")
```
```python
tf.reset_default_graph()
```
```python
vec = []
test_word = """
the petals on this flower are yellow with a red center,the petals on this flower are yellow with a red center
"""
# look up each word's vector (note: split into words first; iterating the raw
# string, as the original did, would iterate over characters)
all_vec = [word_vectors[w] for w in test_word.split() if w in word_vectors]
vec.append(all_vec)

data = pd.Series(vec)
data = data.apply(pad, maxlength=maxlength)
data_ = np.concatenate(data).reshape(len(data), maxlength, 100)
test_text_vec = data_
test_text_vec = test_text_vec.astype(np.float32)

losses = []
step = 0
```
```python
n_samples = 10
# repeat the single caption once per noise draw so the batch size matches the
# index arithmetic inside text_rnn (the original fed a batch of 1 here)
test_text_vec = np.tile(test_text_vec, (n_samples, 1, 1))
test_noise = np.random.uniform(-1, 1, size=[n_samples, noise_dim])
last_test = text_rnn(test_text_vec, batch_size=n_samples, reuse=tf.AUTO_REUSE)
new_image = get_generator(test_noise, image_depth, last_test)
saver = tf.train.Saver()
```
```python
with tf.Session() as sess:
    model_file = tf.train.latest_checkpoint('../input/gan-text-to-image-102flowers-rieyuguanghua')
    saver.restore(sess, model_file)
    samples = show_generator_output(sess, n_samples, test_noise, 3, last_test)
    plot_images(samples)
```
Summary

The above combines an RNN text encoder with a conditional GAN: captions are embedded with word2vec and encoded by a GRU, the resulting vector conditions both the generator and a matching-aware discriminator that also penalizes realistic but mismatched images, and the noise input supplies the style that the captions leave unspecified.