Trainning in French #48

ZheQU-somfy · 2020-04-15T07:24:25Z

Hi,
I want to train the model in French, i use the data set from website 'common_voice'.
I wrote commoncoive_fr.py like this:
from concurrent.futures import ProcessPoolExecutor
from functools import partial
import numpy as np
import os
from util import audio

def build_from_path(in_dir, out_dir, num_workers=1, tqdm=lambda x: x):
executor = ProcessPoolExecutor(max_workers=num_workers)
futures = []
index = 1
with open(os.path.join(in_dir, 'train.tsv'), encoding='utf-8') as f:
for line in f:
parts = line.strip().split('\t')
wav_path = os.path.join(in_dir,parts[1])
text = parts[2]
futures.append(executor.submit(partial(_process_utterance, out_dir, index, wav_path, text)))
index += 1
return [future.result() for future in tqdm(futures)]

def _process_utterance(out_dir, index, wav_path, text):
'''Preprocesses a single utterance audio/text pair.

This writes the mel and linear scale spectrograms to disk and returns a tuple to write
to the train.txt file.

Args:
out_dir: The directory to write the spectrograms into
index: The numeric index to use in the spectrogram filenames.
wav_path: Path to the audio file containing the speech input
text: The text spoken in the input audio file

Returns:
A (spectrogram_filename, mel_filename, n_frames, text) tuple to write to train.txt
'''

Load the audio to a numpy array:

wav = audio.load_wav(wav_path)

Compute the linear-scale spectrogram from the wav:

spectrogram = audio.spectrogram(wav).astype(np.float32)
n_frames = spectrogram.shape[1]

Compute a mel-scale spectrogram from the wav:

mel_spectrogram = audio.melspectrogram(wav).astype(np.float32)

Write the spectrograms to disk:

spectrogram_filename = 'commonvoice_fr-spec.npy' % index
mel_filename = 'commonvoice_fr-mel.npy' % index
np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
np.save(os.path.join(out_dir, mel_filename), mel_spectrogram.T, allow_pickle=False)

Return a tuple describing this training example:

return (spectrogram_filename, mel_filename, n_frames, text)

And I modified the preprocess.py like this:
import argparse
import os
from multiprocessing import cpu_count
from tqdm import tqdm
from datasets import amy, blizzard, ljspeech, kusal, mailabs,commonvoice_fr
from datasets import mrs
from hparams import hparams, hparams_debug_string
import sys

def preprocess_blizzard(args):
in_dir = os.path.join(args.base_dir, 'Blizzard2012')
out_dir = os.path.join(args.base_dir, args.output)
os.makedirs(out_dir, exist_ok=True)
metadata = blizzard.build_from_path(
in_dir, out_dir, args.num_workers, tqdm=tqdm)
write_metadata(metadata, out_dir)

def preprocess_ljspeech(args):
in_dir = os.path.join(args.base_dir, 'LJSpeech-1.1')
out_dir = os.path.join(args.base_dir, args.output)
os.makedirs(out_dir, exist_ok=True)
metadata = ljspeech.build_from_path(
in_dir, out_dir, args.num_workers, tqdm=tqdm)
write_metadata(metadata, out_dir)

def preprocess_mrs(args):
in_dir = args.mrs_dir
out_dir = os.path.join(args.base_dir, args.output)
username = args.mrs_username
os.makedirs(out_dir, exist_ok=True)
metadata = mrs.build_from_path(
in_dir, out_dir, username, args.num_workers, tqdm=tqdm)
write_metadata(metadata, out_dir)

def preprocess_amy(args):
in_dir = os.path.join(args.base_dir, 'amy')
out_dir = os.path.join(args.base_dir, args.output)
os.makedirs(out_dir, exist_ok=True)
metadata = amy.build_from_path(in_dir, out_dir, args.num_workers, tqdm=tqdm)
write_metadata(metadata, out_dir)

def preprocess_kusal(args):
in_dir = os.path.join(args.base_dir, 'kusal')
out_dir = os.path.join(args.base_dir, args.output)
os.makedirs(out_dir, exist_ok=True)
metadata = kusal.build_from_path(
in_dir, out_dir, args.num_workers, tqdm=tqdm)
write_metadata(metadata, out_dir)

def preprocess_mailabs(args):
in_dir = os.path.join(args.mailabs_books_dir)
out_dir = os.path.join(args.base_dir, args.output)
os.makedirs(out_dir, exist_ok=True)
books = args.books
metadata = mailabs.build_from_path(
in_dir, out_dir, books, args.num_workers, tqdm)
write_metadata(metadata, out_dir)

def preprocess_commonvoice(args):
in_dir = os.path.join(args.base_dir,'clips')
out_dir = os.path.join(args.base_dir,args.output)
os.makedirs(out_dir,exist_ok=True)
metdata = commonvoice_fr.build_from_path(in_dir,out_dir,
args.num_workers,tqdm=tqdm)
write_metadata(metadata,out_dir)

def write_metadata(metadata, out_dir):
with open(os.path.join(out_dir, 'train.txt'), 'w', encoding='utf-8') as f:
for m in metadata:
f.write('|'.join([str(x) for x in m]) + '\n')
frames = sum([m[2] for m in metadata])
hours = frames * hparams.frame_shift_ms / (3600 * 1000)
print('Wrote %d utterances, %d frames (%.2f hours)' %
(len(metadata), frames, hours))
print('Max input length: %d' % max(len(m[3]) for m in metadata))
print('Max output length: %d' % max(m[2] for m in metadata))
with open("metadata.txt", 'w') as f:
f.write(
'''
Wrote {} utterances, {} frames, {} hours\n
Max input lengh: {} \n
Max output length: {} \n
'''.format(
len(metadata), frames, hours,
max(len(m[3]) for m in metadata), max(m[2] for m in metadata)
)
)

def main():
parser = argparse.ArgumentParser()
parser.add_argument('--base_dir', default=os.path.expanduser('~/tacotron'))
parser.add_argument('--mrs_dir', required=False)
parser.add_argument('--mrs_username', required=False)
parser.add_argument('--output', default='training')
parser.add_argument(
'--dataset', required=True, choices=['amy', 'blizzard', 'ljspeech',
'kusal', 'mailabs','mrs','commonvoice']
)
parser.add_argument('--mailabs_books_dir',
help='absolute directory to the books for the mlailabs')
parser.add_argument(
'--books',
help='comma-seperated and no space name of books i.e hunter_space,pink_fairy_book,etc.',
)
parser.add_argument('--num_workers', type=int, default=cpu_count())
args = parser.parse_args()

if args.dataset == 'mailabs' and args.books is None:
parser.error("--books required if mailabs is chosen for dataset.")

if args.dataset == 'mailabs' and args.mailabs_books_dir is None:
parser.error(
"--mailabs_books_dir required if mailabs is chosen for dataset.")

print(hparams_debug_string())

if args.dataset == 'amy':
preprocess_amy(args)
elif args.dataset == 'blizzard':
preprocess_blizzard(args)
elif args.dataset == 'ljspeech':
preprocess_ljspeech(args)
elif args.dataset == 'kusal':
preprocess_kusal(args)
elif args.dataset == 'mailabs':
preprocess_mailabs(args)
elif args.dataset == 'mrs':
preprocess_mrs(args)
elif args.dataset == 'commonvoice':
preprocess_commonvoice(args)

if name == "main":
main()

But when I preprocces the data by using the commande:
python3 preprocess.py --dataset commonvoice
I got this erros:
Traceback (most recent call last):
File "/usr/lib/python3.5/concurrent/futures/process.py", line 175, in _process_worker
r = call_item.fn(*call_item.args, **call_item.kwargs)
File "/root/mimic2/datasets/commonvoice_fr.py", line 64, in _process_utterance
spectrogram_filename = 'commonvoice_fr-spec.npy' % index
TypeError: not all arguments converted during string formatting
"""

Could you please help me to solve this problem?
Thanks

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Trainning in French #48

Trainning in French #48

ZheQU-somfy commented Apr 15, 2020 •

edited

Loading

Trainning in French #48

Trainning in French #48

Comments

ZheQU-somfy commented Apr 15, 2020 • edited Loading

Load the audio to a numpy array:

Compute the linear-scale spectrogram from the wav:

Compute a mel-scale spectrogram from the wav:

Write the spectrograms to disk:

Return a tuple describing this training example:

ZheQU-somfy commented Apr 15, 2020 •

edited

Loading