Commit 1aa66d77 authored by Luca Pasa

dev motion vector tf records

parent 582633b8
@@ -116,8 +116,10 @@ if __name__ == '__main__':
x_a_b_len, x_a_b,x_a_len, x_a, x_v_len, x_v, y_len, y = sess.run(get_next)
x_v=video_batch_align(x_a,x_v)
print x_a_b.shape
print x_a.shape
print x_v
# first-order temporal differences of the video features (motion vectors): delta[t] = x_v[t] - x_v[t-1], delta[0] stays zero
delta_features_video = np.zeros_like(x_v)
delta_features_video[1:] = x_v[1:] - x_v[:-1]
print delta_features_video
raw_input()
# res = sess.run(it.get_next())
import glob
import numpy as np
np.set_printoptions(threshold=np.nan)
import tensorflow as tf
def serialize_sequence(base_audio_sequence, audio_sequence, video_sequence, labels):
# The object we return
ex = tf.train.SequenceExample()
# Context (non-sequential) features of our example: the sequence lengths
base_audio_sequence_length = len(base_audio_sequence)
audio_sequence_length = len(audio_sequence)
video_sequence_length = len(video_sequence)
labels_length = len(labels)
ex.context.feature["base_audio_length"].int64_list.value.append(base_audio_sequence_length)
ex.context.feature["audio_length"].int64_list.value.append(audio_sequence_length)
ex.context.feature["video_length"].int64_list.value.append(video_sequence_length)
ex.context.feature["label_length"].int64_list.value.append(labels_length)
# Feature lists for the sequential features of our example
fl_base_audio_feat = ex.feature_lists.feature_list["base_audio_feat"]
fl_audio_feat = ex.feature_lists.feature_list["audio_feat"]
fl_video_feat = ex.feature_lists.feature_list["video_feat"]
fl_labels = ex.feature_lists.feature_list["labels"]
for base_audio_feat in base_audio_sequence:
fl_base_audio_feat.feature.add().float_list.value.extend(base_audio_feat)
for audio_feat in audio_sequence:
fl_audio_feat.feature.add().float_list.value.extend(audio_feat)
for video_feat in video_sequence:
fl_video_feat.feature.add().float_list.value.extend(video_feat)
for label in labels:
fl_labels.feature.add().float_list.value.append(label)
return ex
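# Illustrative sketch, not part of the original script: one way these records could be
# parsed back with the TF 1.x SequenceExample API. The context/feature-list names match
# serialize_sequence above; the per-frame sizes (n_base_audio, n_audio, n_video) are
# assumptions the caller would have to supply.
def parse_sequence_sketch(serialized, n_base_audio, n_audio, n_video):
    context_features = {
        "base_audio_length": tf.FixedLenFeature([], dtype=tf.int64),
        "audio_length": tf.FixedLenFeature([], dtype=tf.int64),
        "video_length": tf.FixedLenFeature([], dtype=tf.int64),
        "label_length": tf.FixedLenFeature([], dtype=tf.int64),
    }
    sequence_features = {
        "base_audio_feat": tf.FixedLenSequenceFeature([n_base_audio], dtype=tf.float32),
        "audio_feat": tf.FixedLenSequenceFeature([n_audio], dtype=tf.float32),
        "video_feat": tf.FixedLenSequenceFeature([n_video], dtype=tf.float32),
        # labels are written with float_list above, so they parse back as float32
        "labels": tf.FixedLenSequenceFeature([], dtype=tf.float32),
    }
    # returns a (context dict, feature-list dict) pair of tensors
    return tf.parse_single_sequence_example(serialized,
                                            context_features=context_features,
                                            sequence_features=sequence_features)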
# load dataset mean and std
dataset_audio_base_mean=np.load('dataset_audio_base_mean.npy')
dataset_audio_base_std=np.load('dataset_audio_base_stdev.npy')
dataset_multi_audio_mean = np.load('dataset_multi_audio_mean.npy')
dataset_multi_audio_std = np.load('dataset_multi_audio_stdev.npy')
dataset_video_mean = np.load('dataset_video_mean.npy')
dataset_video_std = np.load('dataset_video_stdev.npy')
# destination folders
train_dir = '/home/storage/Data/MULTI_GRID_100/multiModalTfRec/TRAIN_CTC_SENTENCES/'
val_dir = '/home/storage/Data/MULTI_GRID_100/multiModalTfRec/VAL_CTC_SENTENCES/'
test_dir = '/home/storage/Data/MULTI_GRID_100/multiModalTfRec/TEST_CTC_SENTENCES/'
f = open('./dictionary.txt', 'r')
dictionary = f.read()
phonemes = dictionary.replace('\n', ' ').split(' ')
phonemes = [ph for ph in sorted(set(phonemes)) if ph != '']
#print('Number of phonemes = ', len(phonemes))
#print(phonemes)
features_file_list_base_audio = sorted(glob.glob('/home/storage/Data/MULTI_GRID_100/s*/base_audio/*.csv'))
features_file_list_audio = sorted(glob.glob('/home/storage/Data/MULTI_GRID_100/s*/multi_audio/*.csv'))
features_file_list_video = sorted(glob.glob('/home/storage/Data/MULTI_GRID_100/s*/video/*.txt'))
assert len(features_file_list_audio) == len(features_file_list_video) == len(features_file_list_base_audio), "#base_audio != #multi_audio != #video"
print('Total number of files = {}'.format(
len(features_file_list_audio))) # it has to be equal to len(features_file_list_video)
# prepare indices for cross validation
indices = np.arange(len(features_file_list_audio))  # same as indices_video = np.arange(len(features_file_list_video))
np.random.seed(3)
np.random.shuffle(indices)
# cross validation split
train_percent = 0.6
val_percent = 0.2
test_percent = 0.2
print(len(features_file_list_audio))
num_sentences_train = int(len(features_file_list_audio) * train_percent)
num_sentences_val = int(len(features_file_list_audio) * val_percent)
num_sentences_test = len(features_file_list_audio) - num_sentences_train - num_sentences_val
print('num sentences train = ', num_sentences_train)
print('num sentences val = ', num_sentences_val)
print('num sentences test = ', num_sentences_test)
train_indices = indices[:num_sentences_train]
val_indices = indices[num_sentences_train:(num_sentences_train + num_sentences_val)]
test_indices = indices[(num_sentences_val + num_sentences_train):]
train_counter = 0
val_counter = 0
test_counter = 0
for file_index, (csv_base_file_audio,csv_file_audio, txt_file_video) in enumerate(zip(features_file_list_base_audio,
features_file_list_audio,
features_file_list_video)):
# print('base audio {:s},multi audio {:s}, video {:s}'.format(csv_base_file_audio, csv_file_audio, txt_file_video))
features_base_audio = np.loadtxt(csv_base_file_audio, delimiter=',')
features_audio = np.loadtxt(csv_file_audio, delimiter=',')
features_video = np.loadtxt(txt_file_video)
# first-order temporal differences (motion vectors) of the video features; the first frame stays zero
delta_features_video = np.zeros_like(features_video)
delta_features_video[1:] = features_video[1:] - features_video[:-1]
# print features_base_audio
#print features_audio.shape
#print features_video.shape
# label path
labels_file = csv_file_audio.replace('/multi_audio/', '/transcription/').replace('.csv', '.transcription')
f = open(labels_file, 'r')
labels = f.read()
labels = labels.replace('\n', '').replace('SP', '').split(',')
labels = [lab for lab in labels if lab != '']
#print('labels : ', labels)
labels = [phonemes.index(ph) for ph in labels]
#print('labels : ', labels)
labels = np.asarray(labels)
#print(labels.shape)
#print('')
# z-score normalization with the precomputed dataset statistics (video uses the motion-vector deltas)
features_base_audio = np.subtract(features_base_audio, dataset_audio_base_mean) / dataset_audio_base_std
features_audio = np.subtract(features_audio, dataset_multi_audio_mean) / dataset_multi_audio_std
features_video = np.subtract(delta_features_video, dataset_video_mean) / dataset_video_std
if file_index in train_indices:
sentence_file = train_dir + 'sequence_full_{:05d}.tfrecords'.format(train_counter)
train_counter += 1
if file_index in val_indices:
sentence_file = val_dir + 'sequence_full_{:05d}.tfrecords'.format(val_counter)
val_counter += 1
if file_index in test_indices:
sentence_file = test_dir + 'sequence_full_{:05d}.tfrecords'.format(test_counter)
test_counter += 1
writer = tf.python_io.TFRecordWriter(sentence_file)
serialized_sentence = serialize_sequence(features_base_audio, features_audio, features_video, labels)
# write to tfrecord
writer.write(serialized_sentence.SerializeToString())
writer.close()
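# Sketch of an optional sanity check (assumption, not in the original script): each written
# file can be read back with tf.python_io.tf_record_iterator and re-parsed, e.g.:
#   for record in tf.python_io.tf_record_iterator(sentence_file):
#       ex = tf.train.SequenceExample()
#       ex.ParseFromString(record)
#       print ex.context.feature["audio_length"].int64_list.value[0]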
import tensorflow as tf
from Utils.decoratorTF import define_scope
class Bi_DEA_Concat_model:
def __init__(self, x_audio_ph, x_video_ph, y_ph, n_in_audio, n_in_video, x_len_audio_ph,
n_hidden, n_hidden_encode, batch_size, learning_rate,
learning_decay, momentum, updating_step, keep_prob=0.9, output_act_fun=None, out_parameter_initializer=None):
# placeholders
self.x_audio_ph = x_audio_ph
self.x_video_ph = x_video_ph
self.y_ph = y_ph
self.x_len_audio_ph = x_len_audio_ph
# model parameters
self.n_in_audio = n_in_audio
self.n_in_video = n_in_video
self.n_hidden = n_hidden
self.n_hidden_encode = n_hidden_encode
self.output_act_fun = output_act_fun
self.out_parameter_initializer = out_parameter_initializer
self.dropout_kp=keep_prob
# training parameters
self.batch_size = batch_size
self.learning_rate = learning_rate
self.updating_step = updating_step
self.learning_decay = learning_decay
self.momentum = momentum
# build the encoder/decoder sub-graphs (lazy properties created by the define_scope decorator)
self.multi_audio_encoding
self.single_audio_decoder
def init_model(self, loss_fun):
# init model methods
self.loss_fun = loss_fun
self.regression
self.optimizer
@define_scope("multi_audio_encoding")
def multi_audio_encoding(self):
#fw_cell:
cells_fw = []
for dim in self.n_hidden:
cells_fw.append(tf.contrib.rnn.LayerNormBasicLSTMCell(dim,dropout_keep_prob=self.dropout_kp))
multi_LSTM_cell_fw = tf.nn.rnn_cell.MultiRNNCell(cells_fw)
initial_state_fw = multi_LSTM_cell_fw.zero_state(self.batch_size, dtype=tf.float32)
#bw_cell:
cells_bw = []
for dim in self.n_hidden:
cells_bw.append(tf.contrib.rnn.LayerNormBasicLSTMCell(dim,dropout_keep_prob=self.dropout_kp))
multi_LSTM_cell_bw = tf.nn.rnn_cell.MultiRNNCell(cells_bw)
initial_state_bw = multi_LSTM_cell_bw.zero_state(self.batch_size, dtype=tf.float32)
audio_visual_input = tf.concat([self.x_audio_ph, self.x_video_ph], 2)  # concatenate audio and video along the feature axis
rnn_outputs, output_state_fw, output_state_bw = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
cells_fw=cells_fw,
cells_bw=cells_bw,
inputs=audio_visual_input,
initial_states_fw=list(initial_state_fw),
initial_states_bw=list(initial_state_bw),
dtype=tf.float32,
sequence_length=self.x_len_audio_ph,
parallel_iterations=None,
scope=None)
rnn_outputs_fw, rnn_outputs_bw = tf.split(rnn_outputs, num_or_size_splits=2, axis=2)
# Define encoding layer weights
output_weights_fw = tf.get_variable('outputs_weights_fw', dtype=tf.float32,
shape=[self.n_hidden[-1], self.n_hidden_encode],
initializer=tf.random_uniform_initializer(minval=-0.1, maxval=0.1))
output_weights_bw = tf.get_variable('outputs_weights_bw', dtype=tf.float32,
shape=[self.n_hidden[-1], self.n_hidden_encode],
initializer=tf.random_uniform_initializer(minval=-0.1, maxval=0.1))
output_biases = tf.get_variable('biases', dtype=tf.float32,
shape=[self.n_hidden_encode],
initializer=tf.random_uniform_initializer(minval=-1.0, maxval=1.0))
rnn_outputs_fw = tf.reshape(rnn_outputs_fw, [-1, self.n_hidden[-1]])
rnn_outputs_bw = tf.reshape(rnn_outputs_bw, [-1, self.n_hidden[-1]])
output = tf.matmul(rnn_outputs_fw, output_weights_fw) + tf.matmul(rnn_outputs_bw,
output_weights_bw) + output_biases
output = tf.reshape(output, [self.batch_size, -1, self.n_hidden_encode])
return output
@define_scope("single_audio_decoder")
def single_audio_decoder(self):
# fw_cell:
cells_fw = []
for dim in reversed(self.n_hidden):
cells_fw.append(tf.contrib.rnn.BasicLSTMCell(dim))
multi_LSTM_cell_fw = tf.nn.rnn_cell.MultiRNNCell(cells_fw)
initial_state_fw = multi_LSTM_cell_fw.zero_state(self.batch_size, dtype=tf.float32)
# bw_cell:
cells_bw = []
for dim in reversed(self.n_hidden):
cells_bw.append(tf.contrib.rnn.BasicLSTMCell(dim))
#cells_bw.append(tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.BasicLSTMCell(dim),
#output_keep_prob=self.keep_prob, input_keep_prob=self.keep_prob))
multi_LSTM_cell_bw = tf.nn.rnn_cell.MultiRNNCell(cells_bw)
initial_state_bw = multi_LSTM_cell_bw.zero_state(self.batch_size, dtype=tf.float32)
rnn_outputs, output_state_fw, output_state_bw = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
cells_fw=cells_fw,
cells_bw=cells_bw,
inputs=self.multi_audio_encoding,
initial_states_fw=list(initial_state_fw),
initial_states_bw=list(initial_state_bw),
dtype=tf.float32,
sequence_length=self.x_len_audio_ph,
parallel_iterations=None,
scope=None)
rnn_outputs_fw, rnn_outputs_bw = tf.split(rnn_outputs, num_or_size_splits=2, axis=2)
# Define fw and bw mixing layer weights
output_weights_fw = tf.get_variable('outputs_weights_fw', dtype=tf.float32,
shape=[self.n_hidden[0], self.n_in_audio],
initializer=tf.random_uniform_initializer(minval=-0.1, maxval=0.1))
output_weights_bw = tf.get_variable('outputs_weights_bw', dtype=tf.float32,
shape=[self.n_hidden[0], self.n_in_audio],
initializer=tf.random_uniform_initializer(minval=-0.1, maxval=0.1))
output_biases = tf.get_variable('biases', dtype=tf.float32,
shape=[self.n_in_audio],
initializer=tf.random_uniform_initializer(minval=-1.0, maxval=1.0))
rnn_outputs_fw = tf.reshape(rnn_outputs_fw, [-1, self.n_hidden[0]])
rnn_outputs_bw = tf.reshape(rnn_outputs_bw, [-1, self.n_hidden[0]])
output = tf.matmul(rnn_outputs_fw, output_weights_fw) + tf.matmul(rnn_outputs_bw,
output_weights_bw) + output_biases
output = tf.reshape(output, [self.batch_size, -1, self.n_in_audio])
return output
@define_scope("regression")
def regression(self):
return self.single_audio_decoder
@define_scope("optimizer")
def optimizer(self):
global_step = tf.Variable(0, trainable=False)
# define cost function
cost = tf.reduce_mean(self.loss_fun)
# define learning rate decay function
learning_rate_decay_fun = tf.train.exponential_decay(self.learning_rate, global_step,
self.updating_step, self.learning_decay, staircase=True)
# define optimizer
# optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate_decay_fun, momentum=self.momentum)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate_decay_fun)
grad, variables = zip(*optimizer.compute_gradients(cost))
# clip gradients by global norm and apply the clipped values
clip_grad, _ = tf.clip_by_global_norm(grad, 5.0)  # TODO: check value to clip
optimizer = optimizer.apply_gradients(zip(clip_grad, variables), global_step=global_step)
return optimizer, learning_rate_decay_fun, cost, global_step
if __name__ == '__main__':
num_epochs = 10
batch_size = 10
nIn_audio = 123
nIn_video = 134
nHidden = [123, 100, 75, 50]
nHidden_encode = 25
learningRate = 0.01
traininglog_dir = "./"
updatingStep = 10
learningDecay = 1
momentum=0.9
x_audio_ph = tf.placeholder("float32", [None, None, nIn_audio], 'x_audio')
x_audio_len_ph = tf.placeholder("int32", shape=[None], name='x_audio_len')
x_video_ph = tf.placeholder("float32", [None, None, nIn_video], 'x_video')
y_ph = tf.placeholder("float32", [None, None, nIn_audio], 'y')
model = Bi_DEA_Concat_model(x_audio_ph=x_audio_ph, x_video_ph=x_video_ph, y_ph=y_ph, n_in_audio=nIn_audio,
n_in_video=nIn_video, x_len_audio_ph=x_audio_len_ph, n_hidden=nHidden,
n_hidden_encode=nHidden_encode, batch_size=batch_size, learning_rate=learningRate,
learning_decay=learningDecay, momentum=momentum,
updating_step=updatingStep)
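# Minimal sketch (assumption, not in the original file) of how the constructed model could be
# wired for training, assuming the define_scope decorator caches and returns each method's
# graph nodes: a squared-error reconstruction loss on the decoder output, then init_model
# builds the optimizer.
loss = tf.squared_difference(model.regression, y_ph)
model.init_model(loss)
train_op, learning_rate_fn, cost, global_step = model.optimizer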
import sys
import os
sys.path.append(os.path.join(os.path.dirname(__file__), '../../../'))
from Concat_DAE_4_speech import DAE_4_speech
import tensorflow as tf
import numpy as np
np.set_printoptions(threshold=np.nan)
if __name__ == '__main__':
num_epochs = 500
batch_size = 18
nIn_audio = 123
nIn_video = 134
nHidden = [350,500]
nHidden_encode = 750
learningRate = 0.001
traininglog_dir = "./"
updating_step = 2250
test_step= 10
learningDecay = 1
momentum = 0.9
graph = tf.Graph()
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
with tf.Session(graph=graph, config=config) as sess:
model = DAE_4_speech(sess=sess,graph=graph, n_in_audio=nIn_audio, n_in_video=nIn_video, n_hidden=nHidden,
n_hidden_encode=nHidden_encode, batch_size=batch_size, learning_rate=learningRate, learning_decay=learningDecay,
momentum=momentum, updating_step=updating_step)
model.restore_model("./RESULT/BaseLine1/Overfitting_TEST_Concat_DAE_4_speech_Test_lr-0.001_batch_size-18_n_hidden_encode-750.ckpt-500")
mse = model.get_model_output(data_set_path="/home/storage/Data/MULTI_GRID/multiModalTfRec/TRAIN_CTC_SENTENCES/")
print mse
raw_input("........................................................................")
mse_test= model.get_model_output(data_set_path="/home/storage/Data/MULTI_GRID/multiModalTfRec/TEST_CTC_SENTENCES/")
print mse_test
raw_input(".-.-.-..")
import sys
import os
sys.path.append(os.path.join(os.path.dirname(__file__), '../../../'))
from Concat_DAE_4_speech import DAE_4_speech
import tensorflow as tf
if __name__ == '__main__':
num_epochs = 500
batch_size = 30
nIn_audio = 257
nIn_video = 134
nHidden = [500,600]
nHidden_encode = 750
learningRate = 0.001
training_log_dir = "./test_log/"
updating_step = 2250
test_step= 10
learningDecay = 1
momentum = 0.9
test_name="MULTI_GRID_100_Spectro_motion_Concat_DAE_4_speech_Test_lr-"+str(learningRate)+"_batch_size-"+str(batch_size)+"_n_hidden_encode-"+str(nHidden_encode)
data_path="/home/storage/Data/MULTI_GRID_100/rawMultiModalTfRec/"
graph = tf.Graph()
with tf.Session(graph=graph) as sess:
model = DAE_4_speech(sess=sess, graph=graph, n_in_audio=nIn_audio, n_in_video=nIn_video, n_hidden=nHidden,
n_hidden_encode=nHidden_encode, batch_size=batch_size, learning_rate=learningRate,
learning_decay=learningDecay, momentum=momentum, updating_step=updating_step)
model.training_model(training_set_path=os.path.join(data_path,"TRAIN_CTC_SENTENCES"),
test_set_path=os.path.join(data_path,"TEST_CTC_SENTENCES"),
validation_set_path=os.path.join(data_path,"VAL_CTC_SENTENCES"),
n_epoch=num_epochs, test_step=test_step, test_name=test_name,
log_dir=training_log_dir)
import sys
import os
sys.path.append(os.path.join(os.path.dirname(__file__), '../../../'))
from Concat_DAE_4_speech import DAE_4_speech
import tensorflow as tf
if __name__ == '__main__':
num_epochs = 120
batch_size = 15
nIn_audio = 123
nIn_video = 134
nHidden = [123,500,600]
nHidden_encode = 750
learningRate = 0.001
traininglog_dir = "./"
updating_step = 2250
test_step= 10
learningDecay = 1
momentum = 0.9
test_name="GRID_100_Concat_DAE_4_speech_Test_lr-"+str(learningRate)+"_batch_size-"+str(batch_size)+"_n_hidden_encode-"+str(nHidden_encode)
#Code for running on CPU
# config = tf.ConfigProto(device_count={'GPU': 0})
#------------#
graph = tf.Graph()
with tf.Session(graph=graph) as sess:
model = DAE_4_speech(sess=sess, graph=graph, n_in_audio=nIn_audio, n_in_video=nIn_video, n_hidden=nHidden,
n_hidden_encode=nHidden_encode, batch_size=batch_size, learning_rate=learningRate,
learning_decay=learningDecay, momentum=momentum, updating_step=updating_step)
model.restore_model("./RESULT/BaseLine1/GRID_100_Concat_DAE_4_speech_Test_lr-0.001_batch_size-15_n_hidden_encode-750.ckpt-80")
model.training_model(training_set_path="/home/storage/Data/MULTI_GRID_100/multiModalTfRec/TRAIN_CTC_SENTENCES/",
test_set_path="/home/storage/Data/MULTI_GRID_100/multiModalTfRec/TEST_CTC_SENTENCES/",
validation_set_path="/home/storage/Data/MULTI_GRID_100/multiModalTfRec/VAL_CTC_SENTENCES/",
n_epoch=num_epochs, test_step=test_step, test_name=test_name,
log_dir="./test_log/")
import sys
import os
import tensorflow as tf
from Bi_DAE_Concat_Model import Bi_DEA_Concat_model as model
from Utils.decoratorTF import define_scope
import datetime
import time
from Data.Data_reader import DatabaseMultiSpeechReader as Data
import numpy as np
import os
from math import floor
class DAE_4_speech:
def __init__(self,sess,graph, n_in_audio, n_in_video, n_hidden, n_hidden_encode, batch_size, learning_rate,
learning_decay, momentum, updating_step):
self.graph = graph
self.sess=sess
with self.graph.as_default():
self.x_audio_ph = tf.placeholder("float32", [None, None, n_in_audio], 'x_audio')
self.x_video_ph = tf.placeholder("float32", [None, None, n_in_video], 'x_video')
self.x_audio_len_ph = tf.placeholder("int32", shape=[None], name='x_audio_len')
self.y_ph = tf.placeholder("float32", [None, None, n_in_audio], 'y')
self.x_video_len_ph = tf.placeholder("int32", shape=[None], name='x_video_len')
self.y_len_ph = tf.placeholder("int32", shape=[None], name='y_len')
self.model = model(x_audio_ph=self.x_audio_ph, x_video_ph=self.x_video_ph,
y_ph=self.y_ph, x_len_audio_ph=self.x_audio_len_ph, n_in_audio=n_in_audio, n_in_video=n_in_video,
n_hidden=n_hidden, n_hidden_encode=n_hidden_encode, batch_size=batch_size,
learning_rate=learning_rate, learning_decay=learning_decay, momentum=momentum,
updating_step=updating_step)
self.batch_size=batch_size
# build the loss, accuracy and output sub-graphs (lazy properties created by define_scope)
self.loss_function
self.model.init_model(self.loss_function)
self.audio_acc
self.compute_output
@define_scope("compute_output")
def compute_output(self):
return self.model.regression
def get_model_output(self, data_set_path):
data_set_dm = Data.dataManager(single_audio_frame_size=self.model.n_in_audio,
single_video_frame_size=self.model.n_in_video)
# read dataset
data_set = data_set_dm.get_dataset(data_set_path)
# get iterator
_, it_data = data_set_dm.get_iterator(data_set)
# init iterator
self.sess.run(it_data.initializer,
feed_dict={data_set_dm.batch_size_ph: self.batch_size, data_set_dm.n_epoch_ph: 1,
data_set_dm.buffer_size_ph: 2})
get_next = it_data.get_next()
output = []
input = []
mse_list=[]
while True:
try:
# get sample
x_len, x_a_val,_,x_ma_val,_, x_v_val, _, _ = self.sess.run(get_next)
x_v_val = Data.video_batch_align(x_a_val, x_v_val)
# compute model output
mse = self.sess.run(self.loss_function,feed_dict={self.x_audio_ph: x_ma_val,
self.x_video_ph: x_v_val,
self.x_audio_len_ph: x_len,
self.y_ph:x_a_val})
# input.append(x_a_val)
# output.append(model_out)
mse_list.append(mse)
except tf.errors.OutOfRangeError: