Commit 6f472281 authored by Luca Pasa's avatar Luca Pasa

Merge branch 'master' of gitlab.iit.it:lpasa/AV_ASR

parents b584ea29 1a84f50e
@@ -116,8 +116,10 @@ if __name__ == '__main__':
x_a_b_len, x_a_b,x_a_len, x_a, x_v_len, x_v, y_len, y = sess.run(get_next)
x_v=video_batch_align(x_a,x_v)
print x_a_b.shape
print x_a.shape
print x_v
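# delta (motion) features: first-order temporal difference of the aligned video features; row 0 stays zero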
delta_features_video = np.zeros_like(x_v)
delta_features_video[1:] = x_v[1:] - x_v[:-1]
print delta_features_video
raw_input()
# res = sess.run(it.get_next())
import glob
import numpy as np
np.set_printoptions(threshold=np.nan)
import tensorflow as tf
def serialize_sequence(base_audio_sequence, audio_sequence, video_sequence, labels):
# The object we return
ex = tf.train.SequenceExample()
# Non-sequential (context) features of our example
base_audio_sequence_length = len(base_audio_sequence)
audio_sequence_length = len(audio_sequence)
video_sequence_length = len(video_sequence)
labels_length = len(labels)
ex.context.feature["base_audio_length"].int64_list.value.append(base_audio_sequence_length)
ex.context.feature["audio_length"].int64_list.value.append(audio_sequence_length)
ex.context.feature["video_length"].int64_list.value.append(video_sequence_length)
ex.context.feature["label_length"].int64_list.value.append(labels_length)
# Feature lists for the four sequential features of our example
fl_base_audio_feat = ex.feature_lists.feature_list["base_audio_feat"]
fl_audio_feat = ex.feature_lists.feature_list["audio_feat"]
fl_video_feat = ex.feature_lists.feature_list["video_feat"]
fl_labels = ex.feature_lists.feature_list["labels"]
for base_audio_feat in base_audio_sequence:
fl_base_audio_feat.feature.add().float_list.value.extend(base_audio_feat)
for audio_feat in audio_sequence:
fl_audio_feat.feature.add().float_list.value.extend(audio_feat)
for video_feat in video_sequence:
fl_video_feat.feature.add().float_list.value.extend(video_feat)
for label in labels:
fl_labels.feature.add().float_list.value.append(label)
return ex
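# A read-back sketch (assumption: TF 1.x parsing API; this helper is not part of
# the original pipeline). It parses one SequenceExample written by
# serialize_sequence; the per-frame feature dimensions are supplied by the caller
# (the base_audio stream is omitted for brevity).
def parse_sequence_example(serialized_example, n_audio_feat, n_video_feat):
    context_features = {
        "audio_length": tf.FixedLenFeature([], dtype=tf.int64),
        "video_length": tf.FixedLenFeature([], dtype=tf.int64),
        "label_length": tf.FixedLenFeature([], dtype=tf.int64)}
    sequence_features = {
        "audio_feat": tf.FixedLenSequenceFeature([n_audio_feat], dtype=tf.float32),
        "video_feat": tf.FixedLenSequenceFeature([n_video_feat], dtype=tf.float32),
        "labels": tf.FixedLenSequenceFeature([], dtype=tf.float32)}
    # context holds the scalar lengths, sequence holds the per-frame features
    context, sequence = tf.parse_single_sequence_example(
        serialized=serialized_example,
        context_features=context_features,
        sequence_features=sequence_features)
    return context, sequence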
# load dataset mean and std
dataset_audio_base_mean=np.load('dataset_audio_base_mean.npy')
dataset_audio_base_std=np.load('dataset_audio_base_stdev.npy')
dataset_multi_audio_mean = np.load('dataset_multi_audio_mean.npy')
dataset_multi_audio_std = np.load('dataset_multi_audio_stdev.npy')
dataset_video_mean = np.load('dataset_video_mean.npy')
dataset_video_std = np.load('dataset_video_stdev.npy')
# destination folders
train_dir = '/home/storage/Data/MULTI_GRID_100/multiMotionModalTfRec/TRAIN_CTC_SENTENCES/'
val_dir = '/home/storage/Data/MULTI_GRID_100/multiMotionModalTfRec/VAL_CTC_SENTENCES/'
test_dir = '/home/storage/Data/MULTI_GRID_100/multiMotionModalTfRec/TEST_CTC_SENTENCES/'
f = open('./dictionary.txt', 'r')
dictionary = f.read()
phonemes = dictionary.replace('\n', ' ').split(' ')
phonemes = [ph for ph in sorted(set(phonemes)) if ph != '']
#print('Number of phonemes = ', len(phonemes))
#print(phonemes)
features_file_list_base_audio = sorted(glob.glob('/home/storage/Data/MULTI_GRID_100/s*/base_audio/*.csv'))
features_file_list_audio = sorted(glob.glob('/home/storage/Data/MULTI_GRID_100/s*/multi_audio/*.csv'))
features_file_list_video = sorted(glob.glob('/home/storage/Data/MULTI_GRID_100/s*/video/*.txt'))
assert len(features_file_list_audio) == len(features_file_list_video) == len(features_file_list_base_audio), "#base_audio != #multi_audio != #video"
print('Total number of files = {}'.format(
len(features_file_list_audio))) # it has to be equal to len(features_file_list_video)
# prepare indices for cross validation
indices = np.arange(len(features_file_list_audio)) # same as indices_video = np.arange(len(features_file_list_video))
np.random.seed(3)
np.random.shuffle(indices)
# cross validation split
train_percent = 0.6
val_percent = 0.2
test_percent = 0.2
print(len(features_file_list_audio))
num_sentences_train = int(len(features_file_list_audio) * train_percent)
num_sentences_val = int(len(features_file_list_audio) * val_percent)
num_sentences_test = len(features_file_list_audio) - num_sentences_train - num_sentences_val
print('num sentences train = ', num_sentences_train)
print('num sentences val = ', num_sentences_val)
print('num sentences test = ', num_sentences_test)
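# the shuffled indices are partitioned so that every file index falls in exactly one of the three splits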
train_indices = indices[:num_sentences_train]
val_indices = indices[num_sentences_train:(num_sentences_train + num_sentences_val)]
test_indices = indices[(num_sentences_val + num_sentences_train):]
train_counter = 0
val_counter = 0
test_counter = 0
for file_index, (csv_base_file_audio,csv_file_audio, txt_file_video) in enumerate(zip(features_file_list_base_audio,
features_file_list_audio,
features_file_list_video)):
# print('base audio {:s},multi audio {:s}, video {:s}'.format(csv_base_file_audio, csv_file_audio, txt_file_video))
features_base_audio = np.loadtxt(csv_base_file_audio, delimiter=',')
features_audio = np.loadtxt(csv_file_audio, delimiter=',')
features_video = np.loadtxt(txt_file_video)
delta_features_video = np.zeros_like(features_video)
delta_features_video[1:] = features_video[1:] - features_video[:-1]
# print features_base_audio
#print features_audio.shape
#print features_video.shape
# label path
labels_file = csv_file_audio.replace('/multi_audio/', '/transcription/').replace('.csv', '.transcription')
f = open(labels_file, 'r')
labels = f.read()
labels = labels.replace('\n', '').replace('SP', '').split(',')
labels = [lab for lab in labels if lab != '']
#print('labels : ', labels)
labels = [phonemes.index(ph) for ph in labels]
#print('labels : ', labels)
labels = np.asarray(labels)
#print(labels.shape)
#print('')
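# z-score normalize each modality with the precomputed per-dimension dataset statistics
# (note: the video branch normalizes the delta/motion features, not the raw positions)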
features_base_audio = np.subtract(features_base_audio, dataset_audio_base_mean) / dataset_audio_base_std
features_audio = np.subtract(features_audio, dataset_multi_audio_mean) / dataset_multi_audio_std
features_video = np.subtract(delta_features_video, dataset_video_mean) / dataset_video_std
if file_index in train_indices:
sentence_file = train_dir + 'sequence_full_{:05d}.tfrecords'.format(train_counter)
train_counter += 1
if file_index in val_indices:
sentence_file = val_dir + 'sequence_full_{:05d}.tfrecords'.format(val_counter)
val_counter += 1
if file_index in test_indices:
sentence_file = test_dir + 'sequence_full_{:05d}.tfrecords'.format(test_counter)
test_counter += 1
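# one single-example TFRecord file is written per sentence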
writer = tf.python_io.TFRecordWriter(sentence_file)
serialized_sentence = serialize_sequence(features_base_audio, features_audio, features_video, labels)
# write to tfrecord
writer.write(serialized_sentence.SerializeToString())
writer.close()
import glob
import numpy as np
from scipy.io.wavfile import read
np.set_printoptions(threshold=np.nan)
import tensorflow as tf
def serialize_sequence(base_audio_sequence, audio_sequence, video_sequence, labels):
# The object we return
ex = tf.train.SequenceExample()
# Non-sequential (context) features of our example
base_audio_sequence_length = len(base_audio_sequence)
audio_sequence_length = len(audio_sequence)
video_sequence_length = len(video_sequence)
labels_length = len(labels)
ex.context.feature["base_audio_length"].int64_list.value.append(base_audio_sequence_length)
ex.context.feature["audio_length"].int64_list.value.append(audio_sequence_length)
ex.context.feature["video_length"].int64_list.value.append(video_sequence_length)
ex.context.feature["label_length"].int64_list.value.append(labels_length)
# Feature lists for the four sequential features of our example
fl_base_audio_feat = ex.feature_lists.feature_list["base_audio_feat"]
fl_audio_feat = ex.feature_lists.feature_list["audio_feat"]
fl_video_feat = ex.feature_lists.feature_list["video_feat"]
fl_labels = ex.feature_lists.feature_list["labels"]
for base_audio_feat in base_audio_sequence:
fl_base_audio_feat.feature.add().float_list.value.extend(base_audio_feat)
for audio_feat in audio_sequence:
fl_audio_feat.feature.add().float_list.value.extend(audio_feat)
for video_feat in video_sequence:
fl_video_feat.feature.add().float_list.value.extend(video_feat)
for label in labels:
fl_labels.feature.add().float_list.value.append(label)
return ex
# load dataset mean and std
dataset_audio_base_mean=np.load('dataset_audio_raw_base_mean.npy')
dataset_audio_base_std=np.load('dataset_audio_raw_base_stdev.npy')
dataset_multi_audio_mean = np.load('dataset_audio_raw_multi_mean.npy')
dataset_multi_audio_std = np.load('dataset_audio_raw_multi_stdev.npy')
dataset_video_mean = np.load('dataset_video_mean.npy')
dataset_video_std = np.load('dataset_video_stdev.npy')
# destination folders
train_dir = '/home/storage/Data/MULTI_GRID_100/rawAudioMotionTfRec/TRAIN_CTC_SENTENCES/'
val_dir = '/home/storage/Data/MULTI_GRID_100/rawAudioMotionTfRec/VAL_CTC_SENTENCES/'
test_dir = '/home/storage/Data/MULTI_GRID_100/rawAudioMotionTfRec/TEST_CTC_SENTENCES/'
f = open('./dictionary.txt', 'r')
dictionary = f.read()
phonemes = dictionary.replace('\n', ' ').split(' ')
phonemes = [ph for ph in sorted(set(phonemes)) if ph != '']
#print('Number of phonemes = ', len(phonemes))
#print(phonemes)
features_file_list_base_audio = sorted(glob.glob('/home/storage/Data/MULTI_GRID_100/s*/base_audio/*.npy'))
features_file_list_audio = sorted(glob.glob('/home/storage/Data/MULTI_GRID_100/s*/multi_audio/*.npy'))
features_file_list_video = sorted(glob.glob('/home/storage/Data/MULTI_GRID_100/s*/video/*.txt'))
#assert len(features_file_list_audio) == len(features_file_list_video) == len(features_file_list_base_audio), "#base_audio != #multi_audio != #video"
print('Total number of files = {}'.format(
len(features_file_list_audio))) # it has to be equal to len(features_file_list_video)
# prepare indices for cross validation
indices = np.arange(len(features_file_list_audio)) # same as indices_video = np.arange(len(features_file_list_video))
np.random.seed(3)
np.random.shuffle(indices)
# cross validation split
train_percent = 0.6
val_percent = 0.2
test_percent = 0.2
print(len(features_file_list_audio))
num_sentences_train = int(len(features_file_list_audio) * train_percent)
num_sentences_val = int(len(features_file_list_audio) * val_percent)
num_sentences_test = len(features_file_list_audio) - num_sentences_train - num_sentences_val
print('num sentences train = ', num_sentences_train)
print('num sentences val = ', num_sentences_val)
print('num sentences test = ', num_sentences_test)
train_indices = indices[:num_sentences_train]
val_indices = indices[num_sentences_train:(num_sentences_train + num_sentences_val)]
test_indices = indices[(num_sentences_val + num_sentences_train):]
train_counter = 0
val_counter = 0
test_counter = 0
for file_index, (wav_base_file_audio,wav_file_audio, txt_file_video) in enumerate(zip(features_file_list_base_audio,
features_file_list_audio,
features_file_list_video)):
# print('base audio {:s},multi audio {:s}, video {:s}'.format(wav_base_file_audio, wav_file_audio, txt_file_video))
features_base_audio = np.load(wav_base_file_audio)  # np.loadtxt(csv_base_file_audio, delimiter=',')
features_audio = np.load(wav_file_audio)  # np.loadtxt(csv_file_audio, delimiter=',')
features_video = np.loadtxt(txt_file_video)
delta_features_video = np.zeros_like(features_video)
delta_features_video[1:] = features_video[1:] - features_video[:-1]
# label path
labels_file = wav_file_audio.replace('/multi_audio/', '/transcription/').replace('.npy', '.transcription')
f = open(labels_file, 'r')
labels = f.read()
labels = labels.replace('\n', '').replace('SP', '').split(',')
labels = [lab for lab in labels if lab != '']
#print('labels : ', labels)
labels = [phonemes.index(ph) for ph in labels]
#print('labels : ', labels)
labels = np.asarray(labels)
#print(labels.shape)
#print('')
features_base_audio = np.subtract(features_base_audio, dataset_audio_base_mean) / dataset_audio_base_std
features_audio = np.subtract(features_audio, dataset_multi_audio_mean) / dataset_multi_audio_std
delta_features_video = np.subtract(delta_features_video, dataset_video_mean) / dataset_video_std
if file_index in train_indices:
sentence_file = train_dir + 'sequence_full_{:05d}.tfrecords'.format(train_counter)
train_counter += 1
if file_index in val_indices:
sentence_file = val_dir + 'sequence_full_{:05d}.tfrecords'.format(val_counter)
val_counter += 1
if file_index in test_indices:
sentence_file = test_dir + 'sequence_full_{:05d}.tfrecords'.format(test_counter)
test_counter += 1
writer = tf.python_io.TFRecordWriter(sentence_file)
serialized_sentence = serialize_sequence(features_base_audio, features_audio, delta_features_video, labels)
# write to tfrecord
writer.write(serialized_sentence.SerializeToString())
writer.close()
import tensorflow as tf
from Utils.decoratorTF import define_scope
class Bi_DEA_Concat_model:
def __init__(self, x_audio_ph, x_video_ph, y_ph, n_in_audio, n_in_video, x_len_audio_ph,
n_hidden, n_hidden_encode, batch_size, learning_rate,
learning_decay, momentum, updating_step, keep_prob=0.9, output_act_fun=None, out_parameter_initializer=None):
# placeholders
self.x_audio_ph = x_audio_ph
self.x_video_ph = x_video_ph
self.y_ph = y_ph
self.x_len_audio_ph = x_len_audio_ph
# model parameters
self.n_in_audio = n_in_audio
self.n_in_video = n_in_video
self.n_hidden = n_hidden
self.n_hidden_encode = n_hidden_encode
self.output_act_fun = output_act_fun
self.out_parameter_initializer = out_parameter_initializer
self.dropout_kp=keep_prob
# training parameters
self.batch_size = batch_size
self.learning_rate = learning_rate
self.updating_step = updating_step
self.learning_decay = learning_decay
self.momentum = momentum
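# touching these @define_scope properties here triggers graph construction for the
# encoder and decoder (lazy-property pattern; assumes the decorator caches the built sub-graph)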
self.multi_audio_encoding
self.single_audio_decoder
def init_model(self, loss_fun):
# init model methods
self.loss_fun = loss_fun
self.regression
self.optimizer
@define_scope("multi_audio_encoding")
def multi_audio_encoding(self):
#fw_cell:
cells_fw = []
for dim in self.n_hidden:
cells_fw.append(tf.contrib.rnn.LayerNormBasicLSTMCell(dim,dropout_keep_prob=self.dropout_kp))
multi_LSTM_cell_fw = tf.nn.rnn_cell.MultiRNNCell(cells_fw)
initial_state_fw = multi_LSTM_cell_fw.zero_state(self.batch_size, dtype=tf.float32)
#bw_cell:
cells_bw = []
for dim in self.n_hidden:
cells_bw.append(tf.contrib.rnn.LayerNormBasicLSTMCell(dim,dropout_keep_prob=self.dropout_kp))
multi_LSTM_cell_bw = tf.nn.rnn_cell.MultiRNNCell(cells_bw)
initial_state_bw = multi_LSTM_cell_bw.zero_state(self.batch_size, dtype=tf.float32)
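# early fusion: audio and video features are concatenated along the feature axis (axis 2)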
audio_visual_input = tf.concat([self.x_audio_ph, self.x_video_ph], 2)
rnn_outputs, output_state_fw, output_state_bw = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
cells_fw=cells_fw,
cells_bw=cells_bw,
inputs=audio_visual_input,
initial_states_fw=list(initial_state_fw),
initial_states_bw=list(initial_state_bw),
dtype=tf.float32,
sequence_length=self.x_len_audio_ph,
parallel_iterations=None,
scope=None)
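# stack_bidirectional_dynamic_rnn returns the fw and bw outputs of the last layer
# concatenated on the feature axis; split them back into the two directions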
rnn_outputs_fw, rnn_outputs_bw = tf.split(rnn_outputs, num_or_size_splits=2, axis=2)
# Define encoding layer weights
output_weights_fw = tf.get_variable('outputs_weights_fw', dtype=tf.float32,
shape=[self.n_hidden[-1], self.n_hidden_encode],
initializer=tf.random_uniform_initializer(minval=-0.1, maxval=0.1))
output_weights_bw = tf.get_variable('outputs_weights_bw', dtype=tf.float32,
shape=[self.n_hidden[-1], self.n_hidden_encode],
initializer=tf.random_uniform_initializer(minval=-0.1, maxval=0.1))
output_biases = tf.get_variable('biases', dtype=tf.float32,
shape=[self.n_hidden_encode],
initializer=tf.random_uniform_initializer(minval=-1.0, maxval=1.0))
rnn_outputs_fw = tf.reshape(rnn_outputs_fw, [-1, self.n_hidden[-1]])
rnn_outputs_bw = tf.reshape(rnn_outputs_bw, [-1, self.n_hidden[-1]])
output = tf.matmul(rnn_outputs_fw, output_weights_fw) + tf.matmul(rnn_outputs_bw,
output_weights_bw) + output_biases
output = tf.reshape(output, [self.batch_size, -1, self.n_hidden_encode])
return output
@define_scope("single_audio_decoder")
def single_audio_decoder(self):
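# the decoder mirrors the encoder (hidden sizes in reverse order) and projects back
# to the n_in_audio feature dimension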
# fw_cell:
cells_fw = []
for dim in reversed(self.n_hidden):
cells_fw.append(tf.contrib.rnn.BasicLSTMCell(dim))
multi_LSTM_cell_fw = tf.nn.rnn_cell.MultiRNNCell(cells_fw)
initial_state_fw = multi_LSTM_cell_fw.zero_state(self.batch_size, dtype=tf.float32)
# bw_cell:
cells_bw = []
for dim in reversed(self.n_hidden):
cells_bw.append(tf.contrib.rnn.BasicLSTMCell(dim))
#cells_bw.append(tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.BasicLSTMCell(dim),
#output_keep_prob=self.keep_prob, input_keep_prob=self.keep_prob))
multi_LSTM_cell_bw = tf.nn.rnn_cell.MultiRNNCell(cells_bw)
initial_state_bw = multi_LSTM_cell_bw.zero_state(self.batch_size, dtype=tf.float32)
rnn_outputs, output_state_fw, output_state_bw = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
cells_fw=cells_fw,
cells_bw=cells_bw,
inputs=self.multi_audio_encoding,
initial_states_fw=list(initial_state_fw),
initial_states_bw=list(initial_state_bw),
dtype=tf.float32,
sequence_length=self.x_len_audio_ph,
parallel_iterations=None,
scope=None)
rnn_outputs_fw, rnn_outputs_bw = tf.split(rnn_outputs, num_or_size_splits=2, axis=2)
# Define fw and bw mixing layer weights
output_weights_fw = tf.get_variable('outputs_weights_fw', dtype=tf.float32,
shape=[self.n_hidden[0], self.n_in_audio],
initializer=tf.random_uniform_initializer(minval=-0.1, maxval=0.1))
output_weights_bw = tf.get_variable('outputs_weights_bw', dtype=tf.float32,
shape=[self.n_hidden[0], self.n_in_audio],
initializer=tf.random_uniform_initializer(minval=-0.1, maxval=0.1))
output_biases = tf.get_variable('biases', dtype=tf.float32,
shape=[self.n_in_audio],
initializer=tf.random_uniform_initializer(minval=-1.0, maxval=1.0))
rnn_outputs_fw = tf.reshape(rnn_outputs_fw, [-1, self.n_hidden[0]])
rnn_outputs_bw = tf.reshape(rnn_outputs_bw, [-1, self.n_hidden[0]])
output = tf.matmul(rnn_outputs_fw, output_weights_fw) + tf.matmul(rnn_outputs_bw,
output_weights_bw) + output_biases
output = tf.reshape(output, [self.batch_size, -1, self.n_in_audio])
return output
@define_scope("regression")
def regression(self):
return self.single_audio_decoder
@define_scope("optimizer")
def optimizer(self):
global_step = tf.Variable(0, trainable=False)
# define cost function
cost = tf.reduce_mean(self.loss_fun)
# define learning rate decay function
learning_rate_decay_fun = tf.train.exponential_decay(self.learning_rate, global_step,
self.updating_step, self.learning_decay, staircase=True)
# define optimizer
# optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate_decay_fun, momentum=self.momentum)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate_decay_fun)
grad, variables = zip(*optimizer.compute_gradients(cost))
clip_grad, _ = tf.clip_by_global_norm(grad, 5.0)  # TODO: check value to clip
optimizer = optimizer.apply_gradients(zip(clip_grad, variables), global_step=global_step)  # apply the clipped gradients
return optimizer, learning_rate_decay_fun, cost, global_step
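# Intended call order (a sketch, assuming @define_scope lazily builds each graph piece
# under its own variable scope; nothing below is executed in this file):
#   model = Bi_DEA_Concat_model(...)
#   loss = tf.squared_difference(model.regression, model.y_ph)   # e.g. an MSE-style reconstruction loss
#   model.init_model(loss)
#   train_op, learning_rate_fn, cost, global_step = model.optimizer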
if __name__ == '__main__':
num_epochs = 10
batch_size = 10
nIn_audio = 123
nIn_video = 134
nHidden = [123, 100, 75, 50]
nHidden_encode = 25
learningRate = 0.01
traininglog_dir = "./"
updatingStep = 10
learningDecay = 1
momentum=0.9
x_audio_ph = tf.placeholder("float32", [None, None, nIn_audio], 'x_audio')
x_audio_len_ph = tf.placeholder("int32", shape=[None], name='x_audio_len')
x_video_ph = tf.placeholder("float32", [None, None, nIn_video], 'x_video')
y_ph = tf.placeholder("float32", [None, None, nIn_audio], 'y')
model = Bi_DEA_Concat_model(x_audio_ph=x_audio_ph, x_video_ph=x_video_ph, y_ph=y_ph, n_in_audio=nIn_audio,
n_in_video=nIn_video, x_len_audio_ph=x_audio_len_ph, n_hidden=nHidden,
n_hidden_encode=nHidden_encode, batch_size=batch_size, learning_rate=learningRate,
learning_decay=learningDecay, momentum=momentum,
updating_step=updatingStep)
\ No newline at end of file
import sys
import os
sys.path.append(os.path.join(os.path.dirname(__file__), '../../../'))
from Concat_DAE_4_speech import DAE_4_speech
import tensorflow as tf
import numpy as np
np.set_printoptions(threshold=np.nan)
if __name__ == '__main__':
num_epochs = 500
batch_size = 18
nIn_audio = 123
nIn_video = 134
nHidden = [350,500]
nHidden_encode = 750
learningRate = 0.001
traininglog_dir = "./"
updating_step = 2250
test_step= 10
learningDecay = 1
momentum = 0.9
graph = tf.Graph()
config = tf.ConfigProto()