Commit f1a80b60 authored by Luca Pasa

merge

parents fc9a3f89 6cc3e87d
......@@ -2,3 +2,4 @@
*.pyc
datasets/*
Model/DAE/Restore/gd/*
*.npy
......@@ -97,12 +97,12 @@ def video_batch_align(audio_batch, video_batch):
if __name__ == '__main__':
path = '/home/storage/Data/MULTI_GRID/multiModalTfRec/TRAIN_CTC_SENTENCES/'
path = '/home/storage/Data/MULTI_GRID/rawMultiModalTfRec/TRAIN_CTC_SENTENCES/'
n_batch = 4
n_epoch = 5
buffer_size = 2
dm = dataManager()
dm = dataManager(single_audio_frame_size=257)
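# 257 presumably corresponds to fft_length//2 + 1 frequency bins of the 512-point STFT used for the raw features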
ds = dm.get_dataset(path)
......
......@@ -2,6 +2,7 @@ from pydub import AudioSegment
import os
import random
import shutil
import glob
#conf local
#grid_path = "/home/nameless/Project/MultiModalSpeech/Data/GRID/"
......@@ -9,8 +10,14 @@ import shutil
#conf Comago
grid_path = "/DATA_NEW/lpasa/Data/GRID/"
grid_multi_speaker_path = "/DATA_NEW/lpasa/Data/MULTI_GRID/"
# grid_path = "/DATA_NEW/lpasa/Data/GRID/"
# grid_multi_speaker_path = "/DATA_NEW/lpasa/Data/MULTI_GRID/"
#conf local (dell)
grid_path='/home/storage/Data/GRID/'
grid_multi_speaker_path ='/home/storage/Data/MULTI_GRID_100/'
# It would be useful to work on a copy of GRID so that files can be deleted one by one, making it easy to create test and validation sets with the assurance
......@@ -38,6 +45,7 @@ def create_multi_grid_folder(n_speakers=33): # the default value is 33 because
def two_files_audio_sum(file_1_path, file_2_path, file_sum_name,volume_reduction=0):
s1 = AudioSegment.from_file(file_1_path)
s2 = AudioSegment.from_file(file_2_path) - volume_reduction
if s1.duration_seconds >= s2.duration_seconds:
......@@ -46,7 +54,7 @@ def two_files_audio_sum(file_1_path, file_2_path, file_sum_name,volume_reduction
audio_sum = s2.overlay(s1)
audio_sum.export(file_sum_name, format='wav')
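A minimal usage sketch for the mixer above; the file names are hypothetical, and pydub overlays the shorter segment onto the longer one while attenuating the second source by volume_reduction dB:
# Hypothetical GRID utterances: mix speaker 2 over speaker 1, interferer attenuated by 6 dB.
two_files_audio_sum(grid_path + "s1/audio/bbaf2n.wav",
                    grid_path + "s2/audio/lgbs5a.wav",
                    grid_multi_speaker_path + "s1/multi_audio/bbaf2n_mix.wav",
                    volume_reduction=6)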
def random_files_selector(folders,n_file=1):
def random_files_selector(folders,n_file=1, file_extension=".wav"):
'''
:param n_file: number of files that have to be selected by the method
:param folders: list of folders where the file will be randomly selected
......@@ -54,15 +62,17 @@ def random_files_selector(folders,n_file=1):
'''
dir_file_list=[]
for dir in folders:
dir_file_list.append(tuple((dir, os.listdir(dir))))
dir_file_list.append((dir, glob.glob(os.path.join(dir, "*" + file_extension))))
selected_files=[]
for i in range(n_file):
random_folder_files=random.choice(dir_file_list)
is_not_append=True
while is_not_append:
random_file=random.choice(random_folder_files[1])
if random_folder_files not in selected_files:
selected_files.append(random_folder_files[0]+"/"+random_file)
random_file=os.path.split(random_file)[-1]
selected_file=os.path.join(random_folder_files[0],random_file)
if selected_file not in selected_files: # compare the candidate path itself, not the (dir, files) tuple
selected_files.append(selected_file)
is_not_append=False
return selected_files
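A hedged example of calling the selector (mirroring the commented-out call at the bottom of this file); the speaker folders are assumptions:
# Draw 3 distinct .wav files, each from a randomly chosen speaker folder.
folders = [grid_path + "s" + str(s) + "/audio" for s in (1, 2, 3)]
print random_files_selector(folders, n_file=3, file_extension=".wav")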
......@@ -134,7 +144,7 @@ def create_multi_speaker_data(n_speech_4_speaker, n_combinations=3, n_speakers=2
other_speakers=[speaker for speaker in list_of_speakers if speaker != s]
#first we have to select n_speech_4_speaker speeches from the current speaker, which will be used as targets for the combinations
current_speech_list=random_files_selector([grid_path+"s"+str(s)+"/audio"],n_speech_4_speaker)
current_speech_list=random_files_selector([grid_path+"s"+str(s)+"/audio"],n_speech_4_speaker,".wav")
#now, for each speech we have to create n_combinations combinations:
for base_speech in current_speech_list:
......@@ -144,7 +154,7 @@ def create_multi_speaker_data(n_speech_4_speaker, n_combinations=3, n_speakers=2
while condition:
#to create a combination we have to select n_speakers-1 speeches from the other speakers
other_speeches_to_combine=random_files_selector([grid_path+"s"+str(s_other)+"/audio" for s_other in other_speakers], n_speakers-1)
other_speeches_to_combine=random_files_selector([grid_path+"s"+str(s_other)+"/audio" for s_other in other_speakers], n_speakers-1,".wav")
for speech_to_combine in other_speeches_to_combine:
......@@ -177,6 +187,6 @@ def create_multi_speaker_data(n_speech_4_speaker, n_combinations=3, n_speakers=2
if __name__ == '__main__':
create_multi_grid_folder()
create_multi_speaker_data(10,n_of_GRID_speakers=33,n_speakers=2)
create_multi_speaker_data(n_speech_4_speaker=200,n_of_GRID_speakers=33,n_speakers=2)
# print random_files_selector(["/home/nameless/Project/MultiModalSpeech/Data/GRID/s3/audio","/home/nameless/Project/MultiModalSpeech/Data/GRID/s2/audio","/home/nameless/Project/MultiModalSpeech/Data/GRID/s1/audio"],6)
\ No newline at end of file
import tensorflow as tf
import glob
from scipy.io.wavfile import read
import numpy as np
import os
from scipy import signal
audio_ph=tf.placeholder(dtype=tf.float32, shape=[None,None], name="audio_ph")
stfs = tf.contrib.signal.stft(audio_ph, frame_length=400, frame_step=160, #16Khz, 25ms window, 10ms step
fft_length=512)
def compute_stsf(audio_seq):
with tf.Session() as sess:
audio_stsf=sess.run(stfs,feed_dict={audio_ph: audio_seq})
return audio_stsf.astype(float)
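As a worked check of the STFT geometry, a sketch: at 16 kHz, frame_length=400 is a 25 ms window and frame_step=160 a 10 ms hop, so n samples yield 1 + (n - 400)//160 frames of fft_length//2 + 1 = 257 bins (assuming the default pad_end=False):
# One second of silence: 16000 samples -> 98 frames x 257 bins.
one_sec = np.zeros((1, 16000), dtype=np.float32)
assert compute_stsf(one_sec)[0].shape == (1 + (16000 - 400) // 160, 512 // 2 + 1)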
def downsampling(samples, sample_rate, downsample_rate):
secs = len(samples) / float(sample_rate) # float division: this file has no __future__ division import
num_samples = int(downsample_rate * secs)
return signal.resample(samples, num_samples)
if __name__ == '__main__':
print "compute npy base audio"
features_file_list_base_audio = sorted(glob.glob('/home/storage/Data/MULTI_GRID_100/s*/base_audio/*.wav'))
for wav_file in features_file_list_base_audio:
rate, wav_rap = read(wav_file)
wav_rap=downsampling(wav_rap,rate,16000) # downsampling to 16 kHz
wav_rap=np.reshape(wav_rap,(1,wav_rap.shape[0]))
stsf_rap=compute_stsf(wav_rap)[0]
file_name=(os.path.splitext(wav_file)[0])+".npy"
print file_name
np.save(file_name,stsf_rap)
print "compute npy multi audio"
features_file_list_base_audio = sorted(glob.glob('/home/storage/Data/MULTI_GRID_100/s*/multi_audio/*.wav'))
for wav_file in features_file_list_base_audio:
rate, wav_rap = read(wav_file)
wav_rap=downsampling(wav_rap,rate,16000) # downsampling to 16 kHz
wav_rap=np.reshape(wav_rap,(1,wav_rap.shape[0]))
stsf_rap=compute_stsf(wav_rap)[0]
file_name=(os.path.splitext(wav_file)[0])+".npy"
print file_name
np.save(file_name,stsf_rap)
......@@ -3,7 +3,9 @@
HTK=~/htk/HTKTools/
MULTI_GRID_FOLDER=/home/storage/Data/GRID/
# MULTI_GRID_FOLDER=/home/storage/Data/GRID/
MULTI_GRID_FOLDER=/home/storage/Data/MULTI_GRID_100/
for DIR in "$MULTI_GRID_FOLDER"/*
do
......@@ -27,4 +29,4 @@
fi
done
done
\ No newline at end of file
done
from scipy.io import wavfile
import numpy as np
from librosa.feature import melspectrogram
from scipy import signal
import glob
GRID_AUDIO_FILES='/home/storage/Data/MULTI_GRID/s*/base_audio/*.wav'
def mel_specgram(audio=None, sample_rate=16e3, spec=None, n_mels=80, window_size=20, overlap_size=10):
times = None # TODO: add times with spectrogram input
if audio is not None:
nperseg = int(round(window_size / 1e3 * sample_rate))
noverlap = int(round(overlap_size / 1e3 * sample_rate))
freqs, times, spec = signal.spectrogram(audio, fs=sample_rate, window='hann', nperseg=nperseg, noverlap=noverlap, detrend=False)
elif spec is None:
raise ValueError('either audio or spec must be provided') # a bare None return would break the unpacking in log_mel_specgram
mel_spec = melspectrogram(S=spec, sr=sample_rate, n_mels=n_mels)
channels = np.arange(n_mels)
return channels, times, mel_spec.T
def log_mel_specgram(audio=None, sample_rate=16e3, spec=None, n_mels=80, window_size=20, overlap_size=10, eps=1e-10):
channels, times, mel_spec = mel_specgram(audio, sample_rate, spec, n_mels, window_size, overlap_size)
return channels, times, np.log(mel_spec + eps)
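A minimal sketch of the intended call, assuming a mono wav file (the path is an assumption); 25 ms windows with 15 ms overlap match the call in create_dataset below:
# Returns mel channel ids, frame times, and a (frames x 80) log-mel matrix.
sr, wav = wavfile.read('/home/storage/Data/MULTI_GRID/s1/base_audio/example.wav')
_, times, log_mel = log_mel_specgram(wav, sr, window_size=25, overlap_size=15)
print log_mel.shape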
def create_dataset():
features_file_list_base_audio = glob.glob(GRID_AUDIO_FILES)
for audio_sample in features_file_list_base_audio:
sample_rate, sample = wavfile.read(audio_sample)
_, _, spec = log_mel_specgram(sample, sample_rate, window_size=25, overlap_size=15)
print sample
if __name__ == '__main__':
create_dataset()
\ No newline at end of file
......@@ -6,7 +6,7 @@ import os
#comago
# root = grid_multi_speaker_path = "/DATA_NEW/lpasa/Data/MULTI_GRID/"
#root="/home/storage/Data/GRID/"
root="/home/storage/Data/MULTI_GRID/"
root="/home/storage/Data/MULTI_GRID_100/"
dict_path = "./dictionary.txt"
word_path = "./word.txt"
......@@ -72,9 +72,9 @@ if __name__ == '__main__':
word_file = open(word_path, 'r')
dict_file = open(dict_path, 'r')
# CREATE FILE PROCEDURE
# create_copy_files_string()
# for word,transcription in zip(word_file,dict_file):
# replace_in_dataset(word.strip(),transcription.strip())
create_copy_files_string()
for word,transcription in zip(word_file,dict_file):
replace_in_dataset(word.strip(),transcription.strip())
# CREATE LINEARIZED FILE
create_linearized_files()
......
#!/usr/bin/env bash
#MULTI_GRID_FOLDER=/home/storage/Data/MULTI_GRID
MULTI_GRID_FOLDER=/home/storage/Data/GRID
#Comago
MULTI_GRID_FOLDER=/home/lpasa/Data/MULTI_GRID
#MULTI_GRID_FOLDER=/home/lpasa/Data/MULTI_GRID
for DIR in "$MULTI_GRID_FOLDER"/*
do
......
......@@ -50,21 +50,21 @@ dataset_video_mean = np.load('dataset_video_mean.npy')
dataset_video_std = np.load('dataset_video_stdev.npy')
# destination folders
train_dir = '/home/storage/Data/MULTI_GRID/multiModalTfRec/TRAIN_CTC_SENTENCES/'
val_dir = '/home/storage/Data/MULTI_GRID/multiModalTfRec/VAL_CTC_SENTENCES/'
test_dir = '/home/storage/Data/MULTI_GRID/multiModalTfRec/TEST_CTC_SENTENCES/'
train_dir = '/home/storage/Data/MULTI_GRID_100/multiModalTfRec/TRAIN_CTC_SENTENCES/'
val_dir = '/home/storage/Data/MULTI_GRID_100/multiModalTfRec/VAL_CTC_SENTENCES/'
test_dir = '/home/storage/Data/MULTI_GRID_100/multiModalTfRec/TEST_CTC_SENTENCES/'
f = open('./dictionary.txt', 'r')
dictionary = f.read()
phonemes = dictionary.replace('\n', ' ').split(' ')
phonemes = [ph for ph in sorted(set(phonemes)) if ph != '']
print('Number of phonemes = ', len(phonemes))
print(phonemes)
#print('Number of phonemes = ', len(phonemes))
#print(phonemes)
features_file_list_base_audio = sorted(glob.glob('/home/storage/Data/MULTI_GRID/s*/base_audio/*.csv'))
features_file_list_audio = sorted(glob.glob('/home/storage/Data/MULTI_GRID/s*/multi_audio/*.csv'))
features_file_list_video = sorted(glob.glob('/home/storage/Data/MULTI_GRID/s*/video/*.txt'))
features_file_list_base_audio = sorted(glob.glob('/home/storage/Data/MULTI_GRID_100/s*/base_audio/*.csv'))
features_file_list_audio = sorted(glob.glob('/home/storage/Data/MULTI_GRID_100/s*/multi_audio/*.csv'))
features_file_list_video = sorted(glob.glob('/home/storage/Data/MULTI_GRID_100/s*/video/*.txt'))
assert len(features_file_list_audio) == len(features_file_list_video) == len(features_file_list_base_audio), "#base_audio != #multi_audio != #video"
print('Total number of files = {}'.format(
......@@ -103,14 +103,14 @@ for file_index, (csv_base_file_audio,csv_file_audio, txt_file_video) in enumerat
features_file_list_audio,
features_file_list_video)):
print('base audio {:s},multi audio {:s}, video {:s}'.format(csv_base_file_audio, csv_file_audio, txt_file_video))
# print('base audio {:s},multi audio {:s}, video {:s}'.format(csv_base_file_audio, csv_file_audio, txt_file_video))
features_base_audio = np.loadtxt(csv_base_file_audio, delimiter=',')
features_audio = np.loadtxt(csv_file_audio, delimiter=',')
features_video = np.loadtxt(txt_file_video)
print features_base_audio
print features_audio.shape
print features_video.shape
# print features_base_audio
#print features_audio.shape
#print features_video.shape
# label path
labels_file = csv_file_audio.replace('/multi_audio/', '/transcription/').replace('.csv', '.transcription')
......@@ -119,12 +119,12 @@ for file_index, (csv_base_file_audio,csv_file_audio, txt_file_video) in enumerat
labels = f.read()
labels = labels.replace('\n', '').replace('SP', '').split(',')
labels = [lab for lab in labels if lab != '']
print('labels : ', labels)
#print('labels : ', labels)
labels = [phonemes.index(ph) for ph in labels]
print('labels : ', labels)
#print('labels : ', labels)
labels = np.asarray(labels)
print(labels.shape)
print('')
#print(labels.shape)
#print('')
features_base_audio = np.subtract(features_base_audio, dataset_audio_base_mean) / dataset_audio_base_std
features_audio = np.subtract(features_audio, dataset_multi_audio_mean) / dataset_multi_audio_std
features_video = np.subtract(features_video, dataset_video_mean) / dataset_video_std
......
import glob
import numpy as np
from scipy.io.wavfile import read
np.set_printoptions(threshold=np.nan)
import tensorflow as tf
def serialize_sequence(base_audio_sequence, audio_sequence, video_sequence, labels):
# The object we return
ex = tf.train.SequenceExample()
# A non-sequential feature of our example
base_audio_sequence_length = len(base_audio_sequence)
audio_sequence_length = len(audio_sequence)
video_sequence_length = len(video_sequence)
labels_length = len(labels)
ex.context.feature["base_audio_length"].int64_list.value.append(base_audio_sequence_length)
ex.context.feature["audio_length"].int64_list.value.append(audio_sequence_length)
ex.context.feature["video_length"].int64_list.value.append(video_sequence_length)
ex.context.feature["label_length"].int64_list.value.append(labels_length)
# Feature lists for the two sequential features of our example
fl_base_audio_feat = ex.feature_lists.feature_list["base_audio_feat"]
fl_audio_feat = ex.feature_lists.feature_list["audio_feat"]
fl_video_feat = ex.feature_lists.feature_list["video_feat"]
fl_labels = ex.feature_lists.feature_list["labels"]
for base_audio_feat in base_audio_sequence:
fl_base_audio_feat.feature.add().float_list.value.extend(base_audio_feat)
for audio_feat in audio_sequence:
fl_audio_feat.feature.add().float_list.value.extend(audio_feat)
for video_feat in video_sequence:
fl_video_feat.feature.add().float_list.value.extend(video_feat)
for label in labels:
fl_labels.feature.add().float_list.value.append(label)
return ex
# load dataset mean and std
dataset_audio_base_mean=np.load('dataset_audio_raw_base_mean.npy')
dataset_audio_base_std=np.load('dataset_audio_raw_base_stdev.npy')
dataset_multi_audio_mean = np.load('dataset_audio_raw_multi_mean.npy')
dataset_multi_audio_std = np.load('dataset_audio_raw_multi_stdev.npy')
dataset_video_mean = np.load('dataset_video_mean.npy')
dataset_video_std = np.load('dataset_video_stdev.npy')
# destination folders
train_dir = '/home/storage/Data/MULTI_GRID_100/rawMultiModalTfRec/TRAIN_CTC_SENTENCES/'
val_dir = '/home/storage/Data/MULTI_GRID_100/rawMultiModalTfRec/VAL_CTC_SENTENCES/'
test_dir = '/home/storage/Data/MULTI_GRID_100/rawMultiModalTfRec/TEST_CTC_SENTENCES/'
f = open('./dictionary.txt', 'r')
dictionary = f.read()
phonemes = dictionary.replace('\n', ' ').split(' ')
phonemes = [ph for ph in sorted(set(phonemes)) if ph != '']
#print('Number of phonemes = ', len(phonemes))
#print(phonemes)
features_file_list_base_audio = sorted(glob.glob('/home/storage/Data/MULTI_GRID_100/s*/base_audio/*.npy'))
features_file_list_audio = sorted(glob.glob('/home/storage/Data/MULTI_GRID_100/s*/multi_audio/*.npy'))
features_file_list_video = sorted(glob.glob('/home/storage/Data/MULTI_GRID_100/s*/video/*.txt'))
#assert len(features_file_list_audio) == len(features_file_list_video) == len(features_file_list_base_audio), "#base_audio != #multi_audio != #video"
print('Total number of files = {}'.format(
len(features_file_list_audio))) # it has to be equal to len(features_file_list_video)
# prepare indices for cross validation
indices = np.arange(len(features_file_list_audio)) # same as indices_video = np.arange(len(features_file_list_video))
np.random.seed(3)
np.random.shuffle(indices)
# cross validation split
train_percent = 0.6
val_percent = 0.2
test_percent = 0.2
print(len(features_file_list_audio))
num_sentences_train = int(len(features_file_list_audio) * train_percent)
num_sentences_val = int(len(features_file_list_audio) * val_percent)
num_sentences_test = len(features_file_list_audio) - num_sentences_train - num_sentences_val
print('num sentences train = ', num_sentences_train)
print('num sentences val = ', num_sentences_val)
print('num sentences test = ', num_sentences_test)
train_indices = indices[:num_sentences_train]
val_indices = indices[num_sentences_train:(num_sentences_train + num_sentences_val)]
test_indices = indices[(num_sentences_val + num_sentences_train):]
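A quick hedged check that the three slices really partition the shuffled file list:
# The slices are contiguous over a permutation, so together they must cover every file exactly once.
assert set(train_indices) | set(val_indices) | set(test_indices) == set(indices)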
train_counter = 0
val_counter = 0
test_counter = 0
for file_index, (wav_base_file_audio,wav_file_audio, txt_file_video) in enumerate(zip(features_file_list_base_audio,
features_file_list_audio,
features_file_list_video)):
# print('base audio {:s},multi audio {:s}, video {:s}'.format(csv_base_file_audio, csv_file_audio, txt_file_video))
features_base_audio = np.load(wav_base_file_audio)#np.loadtxt(csv_base_file_audio, delimiter=',')
features_audio = np.load(wav_file_audio)#np.loadtxt(csv_file_audio, delimiter=',')
features_video = np.loadtxt(txt_file_video)
print features_audio.shape
print type(features_audio)
# label path
labels_file = wav_file_audio.replace('/multi_audio/', '/transcription/').replace('.npy', '.transcription')
f = open(labels_file, 'r')
labels = f.read()
labels = labels.replace('\n', '').replace('SP', '').split(',')
labels = [lab for lab in labels if lab != '']
#print('labels : ', labels)
labels = [phonemes.index(ph) for ph in labels]
#print('labels : ', labels)
labels = np.asarray(labels)
#print(labels.shape)
#print('')
features_base_audio = np.subtract(features_base_audio, dataset_audio_base_mean) / dataset_audio_base_std
features_audio = np.subtract(features_audio, dataset_multi_audio_mean) / dataset_multi_audio_std
features_video = np.subtract(features_video, dataset_video_mean) / dataset_video_std
if file_index in train_indices:
sentence_file = train_dir + 'sequence_full_{:05d}.tfrecords'.format(train_counter)
train_counter += 1
if file_index in val_indices:
sentence_file = val_dir + 'sequence_full_{:05d}.tfrecords'.format(val_counter)
val_counter += 1
if file_index in test_indices:
sentence_file = test_dir + 'sequence_full_{:05d}.tfrecords'.format(test_counter)
test_counter += 1
writer = tf.python_io.TFRecordWriter(sentence_file) # pass the path directly instead of leaking an open file handle
serialized_sentence = serialize_sequence(features_base_audio, features_audio, features_video, labels)
# write to tfrecord
writer.write(serialized_sentence.SerializeToString())
writer.close()
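For reference, a hedged sketch of reading one record back with the TF 1.x parser; the 257-bin audio width follows from the STFT above, and reading only a subset of the stored features is a choice of this example:
# Sketch: parse the first serialized SequenceExample from a written file.
serialized = next(tf.python_io.tf_record_iterator(sentence_file))
context_features = {"audio_length": tf.FixedLenFeature([], tf.int64),
                    "label_length": tf.FixedLenFeature([], tf.int64)}
sequence_features = {"audio_feat": tf.FixedLenSequenceFeature([257], tf.float32),
                     "labels": tf.FixedLenSequenceFeature([], tf.float32)}  # labels were written as floats
context, sequences = tf.parse_single_sequence_example(serialized,
                                                      context_features=context_features,
                                                      sequence_features=sequence_features)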
......@@ -4,7 +4,7 @@ from __future__ import division
import glob
import numpy as np
features_file_list = sorted(glob.glob('/home/storage/Data/MULTI_GRID/s*/base_audio/*.csv'))
features_file_list = sorted(glob.glob('/home/storage/Data/MULTI_GRID_100/s*/base_audio/*.csv'))
#features_file_list = sorted(glob.glob('/home/storage/Data/MULTI_GRID/s*/base_audio/*.csv'))
print('Total number of files = {}'.format(len(features_file_list)))
......
......@@ -4,7 +4,7 @@ from __future__ import division
import glob
import numpy as np
features_file_list = sorted(glob.glob('/home/storage/Data/MULTI_GRID/s*/multi_audio/*.csv'))
features_file_list = sorted(glob.glob('/home/storage/Data/MULTI_GRID_100/s*/multi_audio/*.csv'))
#features_file_list = sorted(glob.glob('/home/storage/Data/MULTI_GRID/s*/base_audio/*.csv'))
print('Total number of files = {}'.format(len(features_file_list)))
......
from __future__ import print_function
from __future__ import division
import glob
import numpy as np
print("----------compute base audio mean and std----------")
features_file_list = sorted(glob.glob('/home/storage/Data/MULTI_GRID_100/s*/base_audio/*.npy'))
#features_file_list = sorted(glob.glob('/home/storage/Data/MULTI_GRID/s*/base_audio/*.csv'))
print('Total number of files = {}'.format(len(features_file_list)))
features=[]
for file_index, npy_file in enumerate(features_file_list):
print(file_index,npy_file)
data=np.load(npy_file)
features.append(data)
features = np.concatenate(features)
print(features.shape)
dataset_mean = np.mean(features,axis=0)
dataset_stdev = np.std(features,axis=0)
print(dataset_mean.shape)
print(dataset_stdev.shape)
mean_file = 'dataset_audio_raw_base_mean.npy'
std_file = 'dataset_audio_raw_base_stdev.npy'
np.save(mean_file,dataset_mean)
np.save(std_file,dataset_stdev)
print("----------compute multi audio mean and std----------")
features_file_list = sorted(glob.glob('/home/storage/Data/MULTI_GRID_100/s*/multi_audio/*.npy'))
#features_file_list = sorted(glob.glob('/home/storage/Data/MULTI_GRID/s*/base_audio/*.csv'))
print('Total number of files = {}'.format(len(features_file_list)))
features=[]
for file_index, npy_file in enumerate(features_file_list):
print(file_index,npy_file)
data=np.load(npy_file)
features.append(data)
features = np.concatenate(features)
print(features.shape)
dataset_mean = np.mean(features,axis=0)
dataset_stdev = np.std(features,axis=0)
print(dataset_mean.shape)
print(dataset_stdev.shape)
mean_file = 'dataset_audio_raw_multi_mean.npy'
std_file = 'dataset_audio_raw_multi_stdev.npy'
np.save(mean_file,dataset_mean)
np.save(std_file,dataset_stdev)
\ No newline at end of file
from __future__ import print_function
from __future__ import division
import glob
import numpy as np
features_file_list = sorted(glob.glob('/home/storage/Data/MULTI_GRID/s*/video/*.txt'))
features_file_list = sorted(glob.glob('/home/storage/Data/MULTI_GRID_100/s*/video/*.txt'))
#features_file_list = sorted(glob.glob('/home/lpasa/Data/MULTI_GRID/s*/video/*.txt'))
print('Total number of files = {}'.format(len(features_file_list)))
features=[]
f_len=[]
for file_index, txt_file in enumerate(features_file_list):