Commit 4015a91e authored by Luca Pasa's avatar Luca Pasa
Browse files

Add tf_records creator for spectrogram dataset

parent 18f2ceb5
import tensorflow as tf
import glob
from scipy.io.wavfile import read
import numpy as np
import os
from scipy import signal
# Graph input: a 2-D float tensor of waveforms (the script below feeds a
# single waveform reshaped to [1, num_samples]).
audio_ph = tf.placeholder(name="audio_ph", shape=[None, None], dtype=tf.float32)

# STFT op over the placeholder. For 16 kHz audio the framing corresponds to
# a 25 ms window (400 samples) advanced by 10 ms (160 samples).
stfs = tf.contrib.signal.stft(
    audio_ph,
    frame_length=400,
    frame_step=160,
    fft_length=512,
)
def compute_stsf(audio_seq):
    """Evaluate the module-level STFT op on a batch of waveforms.

    Args:
        audio_seq: 2-D array fed into ``audio_ph``.

    Returns:
        The magnitude of the STFT as a float NumPy array, with the same
        shape as the output of the ``stfs`` op.
    """
    # A fresh session is opened (and closed) for every call.
    with tf.Session() as sess:
        audio_stsf = sess.run(stfs, feed_dict={audio_ph: audio_seq})
    # The STFT op yields complex values. Casting a complex array straight to
    # float silently discards the imaginary part, so take the magnitude
    # first to obtain a proper spectrogram.
    return np.abs(audio_stsf).astype(float)
def downsampling(samples, sample_rate, downsample_rate):
    """Resample ``samples`` from ``sample_rate`` to ``downsample_rate``.

    Args:
        samples: sequence/array of audio samples; its ``len`` is taken as
            the number of samples recorded at ``sample_rate``.
        sample_rate: sampling rate of ``samples``.
        downsample_rate: target sampling rate.

    Returns:
        The output of ``scipy.signal.resample`` with
        ``floor(len(samples) * downsample_rate / sample_rate)`` samples.

    Raises:
        ZeroDivisionError: if ``sample_rate`` is zero.
    """
    # Multiply before dividing and use floor division: with integer rates
    # this is exact on both Python 2 and 3, whereas computing the duration
    # first (len / rate) truncates to whole seconds under Python 2 integer
    # division and can lose a sample to float rounding under Python 3.
    num_samples = int(len(samples) * downsample_rate // sample_rate)
    return signal.resample(samples, num_samples)
if __name__ == '__main__':
    # Convert every base-audio .wav file into an STFT array stored next to
    # it with a .npy extension.
    wav_paths = sorted(glob.glob('/home/storage/Data/MULTI_GRID_100/s*/base_audio/*.wav'))
    for wav_path in wav_paths:
        source_rate, waveform = read(wav_path)
        # Bring the recording to 16 kHz before framing.
        waveform = downsampling(waveform, source_rate, 16000)
        # The STFT placeholder is 2-D, so add a leading axis of size one.
        batch = np.reshape(waveform, (1, waveform.shape[0]))
        spectrum = compute_stsf(batch)[0]
        stem, _ = os.path.splitext(wav_path)
        np.save(stem + ".npy", spectrum)
import glob
import numpy as np
from scipy.io.wavfile import read
# Always print arrays in full. NaN is not a valid threshold (current NumPy
# raises ValueError for it); sys.maxsize is the documented way to disable
# summarization.
import sys
np.set_printoptions(threshold=sys.maxsize)
import tensorflow as tf
def serialize_sequence(base_audio_sequence, audio_sequence, video_sequence, labels):
    """Pack one sentence into a ``tf.train.SequenceExample``.

    Args:
        base_audio_sequence: iterable of per-step base-audio feature vectors.
        audio_sequence: iterable of per-step multi-audio feature vectors.
        video_sequence: iterable of per-step video feature vectors.
        labels: iterable of scalar labels.

    Returns:
        A ``tf.train.SequenceExample`` whose context holds the four sequence
        lengths (int64) and whose feature lists hold the three feature
        sequences and the labels (all stored as float lists).
    """
    example = tf.train.SequenceExample()

    # Context (non-sequential) features: the length of every sequence.
    context = example.context.feature
    for length_key, sequence in (
        ("base_audio_length", base_audio_sequence),
        ("audio_length", audio_sequence),
        ("video_length", video_sequence),
        ("label_length", labels),
    ):
        context[length_key].int64_list.value.append(len(sequence))

    # Sequential features: one Feature per step, holding the whole vector.
    feature_lists = example.feature_lists.feature_list
    for list_key, sequence in (
        ("base_audio_feat", base_audio_sequence),
        ("audio_feat", audio_sequence),
        ("video_feat", video_sequence),
    ):
        steps = feature_lists[list_key]
        for frame in sequence:
            steps.feature.add().float_list.value.extend(frame)

    # Labels are scalars, so each step stores a single value.
    label_steps = feature_lists["labels"]
    for label in labels:
        label_steps.feature.add().float_list.value.append(label)

    return example
# Script body: normalise the per-sentence base-audio, multi-audio and video
# features, convert the phoneme transcription into integer labels and write
# one TFRecord file per sentence into the train / validation / test folders.

# load dataset mean and std
dataset_audio_base_mean = np.load('dataset_audio_raw_base_mean.npy')
dataset_audio_base_std = np.load('dataset_audio_raw_base_stdev.npy')
dataset_multi_audio_mean = np.load('dataset_audio_raw_multi_mean.npy')
dataset_multi_audio_std = np.load('dataset_audio_raw_multi_stdev.npy')
dataset_video_mean = np.load('dataset_video_mean.npy')
dataset_video_std = np.load('dataset_video_stdev.npy')

# destination folders
train_dir = '/home/storage/Data/MULTI_GRID_100/rawMultiModalTfRec/TRAIN_CTC_SENTENCES/'
val_dir = '/home/storage/Data/MULTI_GRID_100/rawMultiModalTfRec/VAL_CTC_SENTENCES/'
test_dir = '/home/storage/Data/MULTI_GRID_100/rawMultiModalTfRec/TEST_CTC_SENTENCES/'

# Phoneme vocabulary: the sorted set of non-empty tokens in the dictionary.
# The position of a phoneme in this list is its integer label.
with open('./dictionary.txt', 'r') as f:
    dictionary = f.read()
phonemes = dictionary.replace('\n', ' ').split(' ')
# Compare by value: identity ("is not") on strings is implementation-defined.
phonemes = [ph for ph in sorted(set(phonemes)) if ph != '']
#print('Number of phonemes = ', len(phonemes))
#print(phonemes)

features_file_list_base_audio = sorted(glob.glob('/home/storage/Data/MULTI_GRID_100/s*/base_audio/*.npy'))
features_file_list_audio = sorted(glob.glob('/home/storage/Data/MULTI_GRID_100/s*/multi_audio/*.npy'))
features_file_list_video = sorted(glob.glob('/home/storage/Data/MULTI_GRID_100/s*/video/*.txt'))
# NOTE(review): this length check is disabled in the original; without it the
# zip() below silently stops at the shortest list and pairs files purely by
# sorted position - confirm that the three lists really line up.
#assert len(features_file_list_audio) == len(features_file_list_video) == len(features_file_list_base_audio), "#base_audop != #multi_audio != #video"
print('Total number of files = {}'.format(
    len(features_file_list_audio)))  # it has to be equal to len(features_file_list_video)

# prepare indices for cross validation (fixed seed => reproducible split)
indices = np.arange(len(features_file_list_audio))  # same of indices_video = np.arange(len(features_file_list_video))
np.random.seed(3)
np.random.shuffle(indices)

# cross validation split
train_percent = 0.6
val_percent = 0.2
test_percent = 0.2
print(len(features_file_list_audio))
num_sentences_train = int(len(features_file_list_audio) * train_percent)
num_sentences_val = int(len(features_file_list_audio) * val_percent)
# The test split takes whatever is left after train and validation.
num_sentences_test = len(features_file_list_audio) - num_sentences_train - num_sentences_val
print('num sentences train = ', num_sentences_train)
print('num sentences val = ', num_sentences_val)
print('num sentences test = ', num_sentences_test)
train_indices = indices[:num_sentences_train]
val_indices = indices[num_sentences_train:(num_sentences_train + num_sentences_val)]
test_indices = indices[(num_sentences_val + num_sentences_train):]

# Sets give constant-time membership tests inside the loop; testing against
# the NumPy arrays scans the whole array for every file.
train_index_set = set(train_indices.tolist())
val_index_set = set(val_indices.tolist())

train_counter = 0
val_counter = 0
test_counter = 0
for file_index, (wav_base_file_audio, wav_file_audio, txt_file_video) in enumerate(
        zip(features_file_list_base_audio,
            features_file_list_audio,
            features_file_list_video)):
    # print('base audio {:s},multi audio {:s}, video {:s}'.format(csv_base_file_audio, csv_file_audio, txt_file_video))
    features_base_audio = np.load(wav_base_file_audio)
    features_audio = np.load(wav_file_audio)
    features_video = np.loadtxt(txt_file_video)
    # Function-call form prints the same text on Python 2 and Python 3.
    print(features_audio.shape)
    print(type(features_audio))

    # The transcription lives next to the multi-audio file, in a sibling
    # folder and with a different extension.
    labels_file = wav_file_audio.replace('/multi_audio/', '/transcription/').replace('.npy', '.transcription')
    with open(labels_file, 'r') as f:
        labels = f.read()
    # Comma-separated phonemes; 'SP' is removed and empty tokens dropped.
    labels = labels.replace('\n', '').replace('SP', '').split(',')
    labels = [lab for lab in labels if lab != '']
    #print('labels : ', labels)
    # Raises ValueError if the transcription contains an unknown phoneme.
    labels = [phonemes.index(ph) for ph in labels]
    #print('labels : ', labels)
    labels = np.asarray(labels)

    # Standardise every modality with the precomputed dataset statistics.
    features_base_audio = np.subtract(features_base_audio, dataset_audio_base_mean) / dataset_audio_base_std
    features_audio = np.subtract(features_audio, dataset_multi_audio_mean) / dataset_multi_audio_std
    features_video = np.subtract(features_video, dataset_video_mean) / dataset_video_std

    # The three index sets partition all indices, so exactly one branch
    # applies; anything that is neither train nor validation is test.
    if file_index in train_index_set:
        sentence_file = train_dir + 'sequence_full_{:05d}.tfrecords'.format(train_counter)
        train_counter += 1
    elif file_index in val_index_set:
        sentence_file = val_dir + 'sequence_full_{:05d}.tfrecords'.format(val_counter)
        val_counter += 1
    else:
        sentence_file = test_dir + 'sequence_full_{:05d}.tfrecords'.format(test_counter)
        test_counter += 1

    serialized_sentence = serialize_sequence(features_base_audio, features_audio, features_video, labels)
    # write to tfrecord; the writer creates the file itself, so no separate
    # open() handle is needed (the original leaked one per sentence), and
    # the context manager closes the writer even if the write fails.
    with tf.python_io.TFRecordWriter(sentence_file) as writer:
        writer.write(serialized_sentence.SerializeToString())
from __future__ import print_function
from __future__ import division
import glob
import numpy as np
# Compute the per-feature mean and standard deviation of the base-audio and
# the multi-audio features over all files, and save each statistic to its
# own .npy file in the current directory.
# Each job: (banner, glob pattern of inputs, mean output, stdev output).
stat_jobs = (
    ("----------compute base audio mean and std----------",
     '/home/storage/Data/MULTI_GRID_100/s*/base_audio/*.npy',
     'dataset_audio_raw_base_mean.npy',
     'dataset_audio_raw_base_stdev.npy'),
    ("----------compute multi audio mean and std----------",
     '/home/storage/Data/MULTI_GRID_100/s*/multi_audio/*.npy',
     'dataset_audio_raw_multi_mean.npy',
     'dataset_audio_raw_multi_stdev.npy'),
)

for banner, pattern, mean_file, std_file in stat_jobs:
    print(banner)
    features_file_list = sorted(glob.glob(pattern))
    print('Total number of files = {}'.format(len(features_file_list)))

    # Load every file, then stack all of them along the first axis.
    features = []
    for file_index, npy_file in enumerate(features_file_list):
        print(file_index, npy_file)
        features.append(np.load(npy_file))
    features = np.concatenate(features)
    print(features.shape)

    # Statistics are taken over the first (stacked) axis.
    dataset_mean = features.mean(axis=0)
    dataset_stdev = features.std(axis=0)
    print(dataset_mean.shape)
    print(dataset_stdev.shape)

    np.save(mean_file, dataset_mean)
    np.save(std_file, dataset_stdev)
\ No newline at end of file
......@@ -90,8 +90,8 @@ class DAE_4_speech:
def loss_function(self):
output = self.model.regression
loss = tf.reduce_mean(tf.losses.mean_squared_error(self.y_ph, output)) + \
tf.losses.cosine_distance(tf.nn.l2_normalize(self.y_ph,0),tf.nn.l2_normalize(output,0),dim=0)
loss = tf.reduce_mean(tf.losses.mean_squared_error(self.y_ph, output)) #+ \
#tf.losses.cosine_distance(tf.nn.l2_normalize(self.y_ph,0),tf.nn.l2_normalize(output,0),dim=0)
return loss
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment