forked from dusty-nv/jetson-containers
-
Notifications
You must be signed in to change notification settings - Fork 0
/
test.py
138 lines (108 loc) · 6.13 KB
/
test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#!/usr/bin/env python3
import os
import time
import wave
import onnxruntime
from piper import PiperVoice
from piper.download import ensure_voice_exists, find_voice, get_voices
# Default text to synthesize: used as the default `prompt` argument of main()
# and substituted by the CLI entry point when --prompt is not given.
# NOTE(review): the last four sentences look like phonetic-coverage test
# sentences rather than meaningful prose -- confirm before rewording them.
DEFAULT_PROMPT="""A rainbow is a meteorological phenomenon that is caused by reflection, refraction and dispersion of light in water droplets resulting in a spectrum of light appearing in the sky.
It takes the form of a multi-colored circular arc.
Rainbows caused by sunlight always appear in the section of sky directly opposite the Sun.
With tenure, Suzie’d have all the more leisure for yachting, but her publications are no good.
Shaw, those twelve beige hooks are joined if I patch a young, gooey mouth.
Are those shy Eurasian footwear, cowboy chaps, or jolly earthmoving headgear?
The beige hue on the waters of the loch impressed all, including the French queen, before she heard that symphony again, just as young Arthur wanted.
"""
def main(model='en_US-lessac-high', config=None, cache=os.environ.get('PIPER_CACHE'),
         speaker=0, length_scale=1.0, noise_scale=0.667, noise_w=0.8, sentence_silence=0.2,
         prompt=DEFAULT_PROMPT, output='/dev/null', use_cuda=True, runs=5, dump=False, **kwargs):
    """Load a Piper voice, synthesize `prompt` to a wav file and print timing stats.

    Args:
        model: Voice name to download, or path (absolute or relative to
            `cache`) of an existing model file.
        config: Path to the model's json config; replaced by the config found
            next to the voice when the voice is resolved by name.
        cache: Directory holding the voice list and downloaded voices.
        speaker: Speaker ID to synthesize with (ignored when `dump` is set).
        length_scale: Forwarded to `voice.synthesize`.
        noise_scale: Forwarded to `voice.synthesize`.
        noise_w: Forwarded to `voice.synthesize`.
        sentence_silence: Forwarded to `voice.synthesize`.
        prompt: Text to synthesize.
        output: Wav file to write; when `dump` is set it is instead the
            directory that receives one wav file per speaker.
        use_cuda: Forwarded to `PiperVoice.load`.
        runs: Number of timed synthesis iterations (forced to 1 when `dump`
            is set).
        dump: Synthesize once with every speaker of the voice.
        **kwargs: Accepted and ignored, so the full argparse namespace can be
            passed through.

    Raises:
        ValueError: If `cache` is None (e.g. $PIPER_CACHE is unset and no
            cache directory was passed).
    """
    if cache is None:
        # Fail early with a clear message instead of a TypeError from os.path.join.
        raise ValueError("no Piper cache directory given (pass cache / --cache or set $PIPER_CACHE)")

    # Download voice info
    try:
        voices_info = get_voices(cache, update_voices=True)
    except Exception as error:
        # Best effort: retry without requesting an update of the voice list.
        print(f"Failed to download Piper voice list ({error})")
        voices_info = get_voices(cache)

    # Resolve aliases for backwards compatibility with old voice names
    aliases_info = {}
    for voice_info in voices_info.values():
        for voice_alias in voice_info.get("aliases", []):
            aliases_info[voice_alias] = {"_is_alias": True, **voice_info}
    voices_info.update(aliases_info)

    local_model = os.path.join(cache, model)
    if os.path.isfile(local_model):
        model_name = os.path.splitext(os.path.basename(model))[0]
        # Load the file that was actually found; a bare relative `model`
        # would be resolved against the current directory instead of `cache`.
        model = local_model
    else:
        model_name = model
        ensure_voice_exists(model, cache, cache, voices_info)
        model, config = find_voice(model, [cache])

    # Load model
    print(f"Loading {model}")
    # Honor the caller's choice of device rather than always requesting CUDA.
    voice = PiperVoice.load(model, config_path=config, use_cuda=use_cuda)

    # get the speaker name->ID mapping (a local model may be missing from the voice list)
    speaker_id_map = voices_info.get(model_name, {}).get('speaker_id_map')
    if not speaker_id_map:
        speaker_id_map = {'Default': 0}

    # get the inverse speakerID->name mapping
    speaker_id_inv = {value: key for key, value in speaker_id_map.items()}

    # optional mode to dump all speakers
    if dump:
        speakers = list(speaker_id_map.values())
        runs = 1
        output_dir = output
        os.makedirs(output_dir, exist_ok=True)
    else:
        speakers = [speaker]

    for speaker_id in speakers:
        synthesize_args = {
            "speaker_id": speaker_id,
            "length_scale": length_scale,
            "noise_scale": noise_scale,
            "noise_w": noise_w,
            "sentence_silence": sentence_silence,
        }

        # Run benchmarking iterations
        for _ in range(runs):
            if dump:
                output = os.path.join(output_dir, f"{model_name}_{speaker_id:04d}_{speaker_id_inv[speaker_id]}.wav")

            with wave.open(output, "wb") as wav_file:
                wav_file.setnchannels(1)
                start = time.perf_counter()
                voice.synthesize(prompt, wav_file, **synthesize_args)
                end = time.perf_counter()

                inference_duration = end - start

                # Read the stream parameters while the writer is still open.
                frames = wav_file.getnframes()
                rate = wav_file.getframerate()
                audio_duration = frames / float(rate)

            # Guard the ratios so an empty result reports inf instead of crashing.
            rtf = inference_duration / audio_duration if audio_duration > 0 else float('inf')
            rtfx = audio_duration / inference_duration if inference_duration > 0 else float('inf')

            print(f"Piper TTS model: {model_name}")
            print(f"Output saved to: {output}")
            print(f"Inference duration: {inference_duration:.3f} sec")
            print(f"Audio duration: {audio_duration:.3f} sec")
            print(f"Realtime factor: {rtf:.3f}")
            print(f"Inverse RTF (RTFX): {rtfx:.3f}\n")
if __name__ == "__main__":
    # Command-line entry point: parse the options, fill in the defaults that
    # depend on other options, then hand everything to main().
    import argparse

    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    parser.add_argument('--model', type=str, default='en_US-lessac-high', help="model path or name to download")
    parser.add_argument('--config', type=str, default=None, help="path to the model's json config (if unspecified, will be inferred from --model)")
    parser.add_argument('--cache', type=str, default=os.environ.get('PIPER_CACHE'), help="the location to save downloaded models")

    parser.add_argument('--speaker', type=int, default=0, help="the speaker ID from the voice to use")
    parser.add_argument('--length-scale', type=float, default=1.0, help="speaking speed")
    parser.add_argument('--noise-scale', type=float, default=0.667, help="noise added to the generator")
    parser.add_argument('--noise-w', type=float, default=0.8, help="phoneme width variation")
    parser.add_argument('--sentence-silence', type=float, default=0.2, help="seconds of silence after each sentence")

    parser.add_argument('--prompt', type=str, default=None, help="the test prompt to generate (will be set to a default prompt if left none)")
    # The help text names the same default path that is built below.
    parser.add_argument('--output', type=str, default=None, help="path to output audio wav file to save (will be /data/audio/tts/piper-$MODEL.wav by default)")
    parser.add_argument('--runs', type=int, default=5, help="the number of benchmarking iterations to run")
    parser.add_argument('--dump', action='store_true', help="dump all speaker voices to the output directory")
    parser.add_argument('--disable-cuda', action='store_false', dest='use_cuda', help="disable CUDA and use CPU for inference instead")
    parser.add_argument('--verbose', action='store_true', help="enable onnxruntime debug logging")

    args = parser.parse_args()

    if args.verbose:
        onnxruntime.set_default_logger_severity(0)

    if not args.prompt:
        args.prompt = DEFAULT_PROMPT

    if not args.output:
        # Derive the output file name from the model name (extension and directories dropped).
        args.output = f"/data/audio/tts/piper-{os.path.splitext(os.path.basename(args.model))[0]}.wav"

    print(args)

    # Every parsed option is forwarded by keyword; main() ignores the ones it does not use.
    main(**vars(args))