From 4f15f77ef1713a1f0d6b185bb6109a953c9a7cd4 Mon Sep 17 00:00:00 2001
From: Sami <58188482+RAYTRAC3R@users.noreply.github.com>
Date: Fri, 12 Mar 2021 15:02:46 -0600
Subject: [PATCH 1/5] Add voice hybridization
I'm not too good at coding, but I managed to add a multispeaker option that SHOULD mix voices together when multiple speakers are selected while it's activated.
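A rough sketch of what the new "hybrid_voices" branch does: the selected speakers are tiled over the text batch with a ceiling division, and the existing truncation step then trims the list back to the number of texts. The speaker names and counts below are made up for illustration only:

    speaker_names = ["Speaker A", "Speaker B"]   # hypothetical selection
    simultaneous_texts = 5

    # -(-a // b) is ceiling division, so the selection is repeated enough
    # times to cover every text in the batch...
    batch_speaker_names = speaker_names * -(-simultaneous_texts // len(speaker_names))
    # ...and the later truncation step trims it back down to the batch size.
    batch_speaker_names = batch_speaker_names[:simultaneous_texts]
    print(batch_speaker_names)
    # ['Speaker A', 'Speaker B', 'Speaker A', 'Speaker B', 'Speaker A']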
---
CookieTTS/_5_infer/VDVAETTS_server/text2speech.py | 11 ++++++++---
1 file changed, 8 insertions(+), 3 deletions(-)
diff --git a/CookieTTS/_5_infer/VDVAETTS_server/text2speech.py b/CookieTTS/_5_infer/VDVAETTS_server/text2speech.py
index b8ed22f..e28cfd7 100644
--- a/CookieTTS/_5_infer/VDVAETTS_server/text2speech.py
+++ b/CookieTTS/_5_infer/VDVAETTS_server/text2speech.py
@@ -436,20 +436,25 @@ def shuffle_and_return():
speaker_names.append(speaker_names.pop(0))
return first_speaker
batch_speaker_names = [shuffle_and_return() for i in range(simultaneous_texts)]
+ elif multispeaker_mode == "hybrid_voices":
+ batch_speaker_names = speaker_names * -(-simultaneous_texts//len(speaker_names))
else:
raise NotImplementedError
if 0:# (optional) use different speaker list for text inside quotes
speaker_ids = [random.choice(speakers).split("|")[2] if ('"' in text) else random.choice(narrators).split("|")[2] for text in text_batch] # pick speaker if quotemark in text, else narrator
- text_batch = [text.replace('"',"") for text in text_batch] # remove quotes from text
+ text_batch = [text.replace('"',"") for text in text_batch] # remove quotes from text
if len(batch_speaker_names) > len(text_batch):
batch_speaker_names = batch_speaker_names[:len(text_batch)]
- simultaneous_texts = len(text_batch)
+ simultaneous_texts = len(text_batch)
# get speaker_ids (VDVAETTS)
VDVAETTS_speaker_ids = [self.ttm_sp_name_lookup[speaker] for speaker in batch_speaker_names]
VDVAETTS_speaker_ids = torch.LongTensor(VDVAETTS_speaker_ids).cuda().repeat_interleave(batch_size_per_text)
+ #VDVAETTS_speaker_mix = [44]
+ #print(VDVAETTS_speaker_mix)
+ #VDVAETTS_speaker_mix = torch.LongTensor(VDVAETTS_speaker_mix).cuda().repeat_interleave(batch_size_per_text)
# get style input
try:
@@ -503,7 +508,7 @@ def shuffle_and_return():
while np.amin(best_score) < target_score:
# run VDVAETTS
if status_updates: print("..", end='')
- outputs = self.VDVAETTS.inference(sequence, text_lengths.repeat_interleave(batch_size_per_text, dim=0), VDVAETTS_speaker_ids, style_input, char_sigma=char_sigma, frame_sigma=frame_sigma)
+ outputs = self.VDVAETTS.inference(sequence, text_lengths.repeat_interleave(batch_size_per_text, dim=0), VDVAETTS_speaker_ids, style_input, multispeaker_mode, char_sigma=char_sigma, frame_sigma=frame_sigma)
batch_pred_mel = outputs['hifigan_inputs'] if self.MTW_conf.uses_latent_input else outputs['pred_mel']
# metric for html side
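Note on how the speaker ids line up with the batch: VDVAETTS_speaker_ids holds one id per text and is expanded with repeat_interleave so every candidate sample of a text shares that text's speaker. A tiny standalone sketch (the id values and batch_size_per_text are illustrative, and .cuda() is dropped so it runs anywhere):

    import torch

    VDVAETTS_speaker_ids = torch.LongTensor([3, 7])  # one id per text
    batch_size_per_text = 2                           # candidate samples per text

    # Each text's id is repeated once per candidate sample, so the flattened
    # batch layout is [text0, text0, text1, text1].
    expanded = VDVAETTS_speaker_ids.repeat_interleave(batch_size_per_text)
    print(expanded.tolist())  # [3, 3, 7, 7]

The embedding mix in patch 3 then simply averages the first half of this batch dimension with the second half.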
From 101bdb13fcb6f3f15a181ecfe9dfbc57d41afd9d Mon Sep 17 00:00:00 2001
From: Sami <58188482+RAYTRAC3R@users.noreply.github.com>
Date: Fri, 12 Mar 2021 15:04:03 -0600
Subject: [PATCH 2/5] Update main.html
---
CookieTTS/_5_infer/VDVAETTS_server/templates/main.html | 1 +
1 file changed, 1 insertion(+)
diff --git a/CookieTTS/_5_infer/VDVAETTS_server/templates/main.html b/CookieTTS/_5_infer/VDVAETTS_server/templates/main.html
index a767b71..f7f6c32 100644
--- a/CookieTTS/_5_infer/VDVAETTS_server/templates/main.html
+++ b/CookieTTS/_5_infer/VDVAETTS_server/templates/main.html
@@ -127,6 +127,7 @@
Text To Speech
+
From e4cb487a5476b09f5c7abfa6fa199a0a9f2c7236 Mon Sep 17 00:00:00 2001
From: Sami <58188482+RAYTRAC3R@users.noreply.github.com>
Date: Fri, 12 Mar 2021 15:04:55 -0600
Subject: [PATCH 3/5] Update model.py
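This is where the mixing actually happens: when multispeaker_mode is "hybrid_voices" and the batch holds more than one entry, the speaker embeddings are split into two halves along the batch dimension, averaged element-wise, and the average is repeated back to the original batch size. A standalone sketch of that arithmetic (random embeddings and illustrative sizes; like the two-way unpack in the diff, it assumes the batch size is even):

    import torch

    speaker_embed = torch.randn(4, 8)             # [B, embed], illustrative sizes
    assert speaker_embed.shape[0] % 2 == 0        # two equal halves are assumed

    splits = speaker_embed.shape[0] // 2
    mix_1, mix_2 = torch.split(speaker_embed, splits)  # two halves of the batch
    mixed = (mix_1 + mix_2) / 2                        # element-wise average
    speaker_embed = mixed.repeat(2, 1)                 # back to [B, embed]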
---
CookieTTS/_2_ttm/VDVAETTS/model.py | 17 ++++++++++++++---
1 file changed, 14 insertions(+), 3 deletions(-)
diff --git a/CookieTTS/_2_ttm/VDVAETTS/model.py b/CookieTTS/_2_ttm/VDVAETTS/model.py
index dd60d35..f0681de 100644
--- a/CookieTTS/_2_ttm/VDVAETTS/model.py
+++ b/CookieTTS/_2_ttm/VDVAETTS/model.py
@@ -1443,10 +1443,10 @@ def update_device(self, **inputs):
outputs[key] = input
return outputs
- def inference(self, text_seq, text_lengths, speaker_id, torchmoji_hdn,
+ def inference(self, text_seq, text_lengths, speaker_id, torchmoji_hdn, multispeaker_mode,
char_sigma=1.0, frame_sigma=1.0,
bn_logdur=None, char_dur=None, gt_mel=None, alignment=None,
- mel_lengths=None,):# [B, enc_T], [B], [B], [B], [B, tm_dim]
+ mel_lengths=None):# [B, enc_T], [B], [B], [B], [B, tm_dim]
outputs = {}
memory = []
@@ -1458,7 +1458,18 @@ def inference(self, text_seq, text_lengths, speaker_id, torchmoji_hdn,
# (Speaker) speaker_id -> speaker_embed
if hasattr(self, "speaker_embedding"):
speaker_embed = self.speaker_embedding(speaker_id)# [B, embed]
- outputs["speaker_embed"] = speaker_embed# [B, embed]
+ if multispeaker_mode == "hybrid_voices" and speaker_embed.shape[0] > 1:
+ splits = int(speaker_embed.shape[0] / 2)
+ mix_1, mix_2 = torch.split(speaker_embed, splits)
+ speaker_embed = torch.add(mix_1, mix_2)
+ speaker_embed = torch.div(speaker_embed, 2)
+ speaker_embed = speaker_embed.repeat(2, 1)
+ #outputs["speaker_embed"] = speaker_embed# [B, embed]
+ #speaker_embed_mix = self.speaker_embedding(speaker_mix)# [B, embed]
+ #outputs["speaker_embed_mix"] = speaker_embed_mix# [B, embed]
+ #print(speaker_embed_mix)
+ #speaker_embed = torch.div(torch.add(speaker_embed, speaker_embed_mix), 2)
+ outputs["speaker_embed"] = speaker_embed
# (TorchMoji)
if hasattr(self, 'tm_bn'):
From 509856b066f34bc8f43d4eb3e0309ada4580ba13 Mon Sep 17 00:00:00 2001
From: Sami <58188482+RAYTRAC3R@users.noreply.github.com>
Date: Sat, 13 Mar 2021 12:39:00 -0600
Subject: [PATCH 4/5] Update train.py
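The training loop now runs the forward/backward pass only when the mel spectrogram and the frame-level log-f0 agree on the number of frames, so mismatched batches are skipped instead of crashing the run. The guard reduces to something like this (toy tensors with illustrative shapes; the print wording is illustrative too):

    import torch

    y = {
        'gt_mel':         torch.randn(2, 80, 100),  # [B, n_mel, mel_T]
        'gt_frame_logf0': torch.randn(2, 100),      # [B, mel_T]
    }

    if y['gt_mel'].shape[2] == y['gt_frame_logf0'].shape[1]:
        pass  # lengths agree: run the usual forward/backward pass here
    else:
        # lengths disagree: skip this batch rather than failing inside the model
        print("skipping batch with mismatched mel/f0 lengths")

(Patch 5 later reverts this check.)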
---
CookieTTS/_2_ttm/VDVAETTS/train.py | 310 +++++++++++++++--------------
1 file changed, 156 insertions(+), 154 deletions(-)
diff --git a/CookieTTS/_2_ttm/VDVAETTS/train.py b/CookieTTS/_2_ttm/VDVAETTS/train.py
index 42aece9..ae8abda 100644
--- a/CookieTTS/_2_ttm/VDVAETTS/train.py
+++ b/CookieTTS/_2_ttm/VDVAETTS/train.py
@@ -758,163 +758,165 @@ def train(args, rank, group_name, hparams):
y = model.parse_batch(batch)# move batch to GPU (async)
y['gt_mel'].requires_grad_()
y['use_pred_z'] = False
- y_pred = force(model, valid_kwargs=model_args, **y)
-
- loss_scalars = {
- "decoder_MAE_weight": decoder_MAE_weight,
- "decoder_MSE_weight": decoder_MSE_weight,
- "decoder_KLD_weight": decoder_KLD_weight,
- "varpred_MAE_weight": varpred_MAE_weight,
- "varpred_MSE_weight": varpred_MSE_weight,
- "varpred_KLD_weight": varpred_KLD_weight,
- "postnet_f0_MAE_weight": postnet_f0_MAE_weight,
- "postnet_f0_MSE_weight": postnet_f0_MSE_weight,
- "postnet_voiced_MAE_weight": postnet_voiced_MAE_weight,
- "postnet_voiced_BCE_weight": postnet_voiced_BCE_weight,
- "postnet_KLD_weight": postnet_KLD_weight,
- "postnet_MAE_weight": postnet_MAE_weight,
- "postnet_MSE_weight": postnet_MSE_weight,
- "mdn_loss_weight": mdn_loss_weight,
- "dur_loss_weight": dur_loss_weight,
- "sylps_MAE_weight": sylps_MAE_weight,
- "sylps_MSE_weight": sylps_MSE_weight,
- "diag_att_weight": diag_att_weight,
- "HiFiGAN_g_all_class_weight": HiFiGAN_g_all_class_weight,
- "HiFiGAN_g_all_featuremap_weight": HiFiGAN_g_all_featuremap_weight,
- "HiFiGAN_g_all_mel_mae_weight": HiFiGAN_g_all_mel_mae_weight,
- "HiFiGAN_d_all_class_weight": HiFiGAN_d_all_class_weight,
- }
- loss_dict, file_losses_batch = criterion(iteration, model, y_pred, y, loss_scalars,
- hifiGAN if hparams.HiFiGAN_enable else None,)
-
- file_losses = update_smoothed_dict(file_losses, file_losses_batch, file_losses_smoothness)
-
- if hparams.distributed_run:
- reduced_loss_dict = {k: reduce_tensor(v.data, args.n_gpus).item() if v is not None else 0. for k, v in loss_dict.items()}
- else:
- reduced_loss_dict = {k: v.item() if v is not None else 0. for k, v in loss_dict.items()}
-
- reduced_loss = reduced_loss_dict['loss']
-
- loss = loss_dict['loss']
- if hparams.fp16_run:
- with amp.scale_loss(loss, optimizer) as scaled_loss:
- scaled_loss.backward()
- else:
- loss.backward()
-
- if rank==0 and show_gradients:# debug/extreme verbose
- try:
- _=avg_grads
- except:
- avg_grads = {}
- for param_name, params in model.named_parameters():
- if params.requires_grad and params.grad is not None:
- norm_grad = 1.0
- grad = params.grad.abs().sum().item()
- if param_name not in avg_grads:
- avg_grads[param_name] = grad
- elif grad*5. < avg_grads[param_name]:
- avg_grads[param_name] = (avg_grads[param_name]*0.9)+(grad*0.1)
- norm_grad = grad/avg_grads[param_name]
- if grad > 30.0 or norm_grad > 2.0:
- print(f'{norm_grad:03.1f} | {grad:020.6f} | {params.grad.abs().mean().item():06.6f}| {params.grad.abs().max().item():010.6f} | {param_name}')
-
- if hparams.HiFiGAN_enable and y_pred['hifigan_enabled']:# HiFiGAN isn't supposed to use gradient clipping so the optimizer.
- hifiGAN.g_optimizer_step_and_clear()# should be ran before gradient clipping occurs.
-
- if grad_clip_thresh:# apply gradient clipping to params
+ print(y['gt_mel'].shape[2], y['gt_frame_logf0'].shape[1])
+ if y['gt_mel'].shape[2] == y['gt_frame_logf0'].shape[1]:
+ y_pred = force(model, valid_kwargs=model_args, **y)
+
+ loss_scalars = {
+ "decoder_MAE_weight": decoder_MAE_weight,
+ "decoder_MSE_weight": decoder_MSE_weight,
+ "decoder_KLD_weight": decoder_KLD_weight,
+ "varpred_MAE_weight": varpred_MAE_weight,
+ "varpred_MSE_weight": varpred_MSE_weight,
+ "varpred_KLD_weight": varpred_KLD_weight,
+ "postnet_f0_MAE_weight": postnet_f0_MAE_weight,
+ "postnet_f0_MSE_weight": postnet_f0_MSE_weight,
+ "postnet_voiced_MAE_weight": postnet_voiced_MAE_weight,
+ "postnet_voiced_BCE_weight": postnet_voiced_BCE_weight,
+ "postnet_KLD_weight": postnet_KLD_weight,
+ "postnet_MAE_weight": postnet_MAE_weight,
+ "postnet_MSE_weight": postnet_MSE_weight,
+ "mdn_loss_weight": mdn_loss_weight,
+ "dur_loss_weight": dur_loss_weight,
+ "sylps_MAE_weight": sylps_MAE_weight,
+ "sylps_MSE_weight": sylps_MSE_weight,
+ "diag_att_weight": diag_att_weight,
+ "HiFiGAN_g_all_class_weight": HiFiGAN_g_all_class_weight,
+ "HiFiGAN_g_all_featuremap_weight": HiFiGAN_g_all_featuremap_weight,
+ "HiFiGAN_g_all_mel_mae_weight": HiFiGAN_g_all_mel_mae_weight,
+ "HiFiGAN_d_all_class_weight": HiFiGAN_d_all_class_weight,
+ }
+ loss_dict, file_losses_batch = criterion(iteration, model, y_pred, y, loss_scalars,
+ hifiGAN if hparams.HiFiGAN_enable else None,)
+
+ file_losses = update_smoothed_dict(file_losses, file_losses_batch, file_losses_smoothness)
+
+ if hparams.distributed_run:
+ reduced_loss_dict = {k: reduce_tensor(v.data, args.n_gpus).item() if v is not None else 0. for k, v in loss_dict.items()}
+ else:
+ reduced_loss_dict = {k: v.item() if v is not None else 0. for k, v in loss_dict.items()}
+
+ reduced_loss = reduced_loss_dict['loss']
+
+ loss = loss_dict['loss']
if hparams.fp16_run:
- grad_norm = torch.nn.utils.clip_grad_norm_(
- amp.master_params(optimizer), grad_clip_thresh)
- is_overflow = math.isinf(grad_norm) or math.isnan(grad_norm)
+ with amp.scale_loss(loss, optimizer) as scaled_loss:
+ scaled_loss.backward()
else:
- grad_norm = torch.nn.utils.clip_grad_norm_(
- model.parameters(), grad_clip_thresh)
- else:
- grad_norm = 0.0
-
- if math.isfinite(grad_norm):
- optimizer.step()
-
- # calcuate the effective learning rate after gradient clipping is applied, and use the effective learning rate on the GAN modules.
- effective_lr = 0.0 if is_overflow else (learning_rate*min((grad_clip_thresh/grad_norm+1e-6), 1.0) if grad_clip_thresh else learning_rate)
-
- # (Optional) Discriminator Forward+Backward Pass
- if hparams.HiFiGAN_enable and y_pred['hifigan_enabled']:
- hifiGAN.train(model.training)
- with torch.random.fork_rng(devices=[0,]):
- hifiGAN(y_pred, y, reduced_loss_dict, loss_dict, loss_scalars)
-
- # get current Loss Scale of first optimizer
- loss_scale = amp._amp_state.loss_scalers[0]._loss_scale if hparams.fp16_run else 32768.
-
- # restart if training/model has collapsed
- if (iteration > 1e3 and (reduced_loss > LossExplosionThreshold)) or (math.isnan(reduced_loss)):
- raise LossExplosion(f"\nLOSS EXPLOSION EXCEPTION ON RANK {rank}: Loss reached {reduced_loss} during iteration {iteration}.\n\n\n")
- if (loss_scale < 1/4):
- raise LossExplosion(f"\nLOSS EXCEPTION ON RANK {rank}: Loss Scaler reached {loss_scale} during iteration {iteration}.\n\n\n")
-
- if expavg_loss_dict is None:
- expavg_loss_dict = reduced_loss_dict
- else:
- expavg_loss_dict.update({k:v for k, v in reduced_loss_dict.items() if k not in expavg_loss_dict.keys()})# if new loss term appears in reduced_loss_dict, add it to the expavg_loss_dict.
- expavg_loss_dict = {k: (reduced_loss_dict[k]*(1-loss_dict_smoothness))+(expavg_loss_dict[k]*loss_dict_smoothness) for k in expavg_loss_dict.keys() if k in reduced_loss_dict}
- expavg_loss_dict_iters += 1
-
- if expavg_loss_dict_iters > 100:# calc smoothed loss dict
- if best_loss_dict is None:
- best_loss_dict = expavg_loss_dict
+ loss.backward()
+
+ if rank==0 and show_gradients:# debug/extreme verbose
+ try:
+ _=avg_grads
+ except:
+ avg_grads = {}
+ for param_name, params in model.named_parameters():
+ if params.requires_grad and params.grad is not None:
+ norm_grad = 1.0
+ grad = params.grad.abs().sum().item()
+ if param_name not in avg_grads:
+ avg_grads[param_name] = grad
+ elif grad*5. < avg_grads[param_name]:
+ avg_grads[param_name] = (avg_grads[param_name]*0.9)+(grad*0.1)
+ norm_grad = grad/avg_grads[param_name]
+ if grad > 30.0 or norm_grad > 2.0:
+ print(f'{norm_grad:03.1f} | {grad:020.6f} | {params.grad.abs().mean().item():06.6f}| {params.grad.abs().max().item():010.6f} | {param_name}')
+
+ if hparams.HiFiGAN_enable and y_pred['hifigan_enabled']:# HiFiGAN isn't supposed to use gradient clipping so the optimizer.
+ hifiGAN.g_optimizer_step_and_clear()# should be ran before gradient clipping occurs.
+
+ if grad_clip_thresh:# apply gradient clipping to params
+ if hparams.fp16_run:
+ grad_norm = torch.nn.utils.clip_grad_norm_(
+ amp.master_params(optimizer), grad_clip_thresh)
+ is_overflow = math.isinf(grad_norm) or math.isnan(grad_norm)
+ else:
+ grad_norm = torch.nn.utils.clip_grad_norm_(
+ model.parameters(), grad_clip_thresh)
else:
- best_loss_dict = {k: min(best_loss_dict[k], expavg_loss_dict[k]) for k in best_loss_dict.keys() if k in expavg_loss_dict}
-
- if rank == 0:# print + log metrics
- duration = time.time() - start_time
- if not is_overflow:
- average_loss = rolling_loss.process(reduced_loss)
- dbGANAccStr = expavg_loss_dict.get('dbGAN_accuracy', None) or reduced_loss_dict.get('dbGAN_accuracy', 0.5)
- InfGANAccStr = expavg_loss_dict.get('InfGAN_accuracy', None) or reduced_loss_dict.get('InfGAN_accuracy', 0.5)
- WScoreStr = expavg_loss_dict.get('weighted_score' , None) or reduced_loss_dict.get('weighted_score' , 0.0)
- logger.log_training(model, reduced_loss_dict, expavg_loss_dict, best_loss_dict, grad_norm, learning_rate, duration, iteration)
- tqdm.write(
- f"{iteration} [TrainLoss:{reduced_loss:.3f} Avg:{average_loss:.3f}] "
- f"[{grad_norm:03.1f}GradNorm] [{duration:.2f}s/it] "
- f"[{(duration/(hparams.batch_size*args.n_gpus)):.3f}s/file] "
- f"[{learning_rate:.1e}LR] [{loss_scale:.0f}LS] "
- f"[{WScoreStr:.1%}AttSc] [{dbGANAccStr:.1%}dbGANAcc] [{InfGANAccStr:.1%}InfGANAcc]")
- if is_overflow:
- tqdm.write("Gradient Overflow, Skipping Step\n")
- start_time = time.time()
-
- if iteration%checkpoint_interval==0 or os.path.exists(save_file_check_path):# save model checkpoint every X iters
- if rank == 0:
- checkpoint_path = os.path.join(args.output_directory, f"checkpoint_{iteration}")
- save_checkpoint(model, optimizer, hifiGAN, learning_rate, iteration, hparams, best_validation_loss, average_loss, best_val_loss_dict, best_loss_dict, speaker_lookup, speakerlist, checkpoint_path)
-
- if iteration%dump_filelosses_interval==0:# syncronise file_losses between graphics cards
- print("Updating File_losses dict!")
- file_losses = write_dict_to_file(file_losses, os.path.join(args.output_directory, 'file_losses.csv'),
- os.path.join(args.output_directory, 'speaker_losses.csv'), speakerlist, args.n_gpus, rank)
-
- if (iteration % int(validation_interval) == 0) or (os.path.exists(save_file_check_path)):# validate models and save 'best_val_model' checkpoints
- if rank == 0 and os.path.exists(save_file_check_path):
- os.remove(save_file_check_path)
- # perform validation and save "best_val_model" depending on validation loss
- val_loss, best_val_loss_dict, file_losses = validate(hparams, args, file_losses, model, criterion, hifiGAN, valset, loss_scalars, best_val_loss_dict, iteration, collate_fn, logger)# validate/teacher_force
- file_losses = write_dict_to_file(file_losses, os.path.join(args.output_directory, 'file_losses.csv'),
- os.path.join(args.output_directory, 'speaker_losses.csv'), speakerlist, args.n_gpus, rank)
- if (val_loss < best_validation_loss):
- best_validation_loss = val_loss
- if rank == 0 and hparams.save_best_val_model:
- checkpoint_path = os.path.join(args.output_directory, "best_val_model")
- save_checkpoint(
- model, optimizer, hifiGAN, learning_rate, iteration, hparams, best_validation_loss,
- average_loss, best_val_loss_dict, best_loss_dict, speaker_lookup, speakerlist, checkpoint_path)
- just_did_val = True
-
- del y_pred, y, batch, loss_dict, reduced_loss_dict
- iteration += 1
- # end of iteration loop
+ grad_norm = 0.0
+
+ if math.isfinite(grad_norm):
+ optimizer.step()
+
+ # calcuate the effective learning rate after gradient clipping is applied, and use the effective learning rate on the GAN modules.
+ effective_lr = 0.0 if is_overflow else (learning_rate*min((grad_clip_thresh/grad_norm+1e-6), 1.0) if grad_clip_thresh else learning_rate)
+
+ # (Optional) Discriminator Forward+Backward Pass
+ if hparams.HiFiGAN_enable and y_pred['hifigan_enabled']:
+ hifiGAN.train(model.training)
+ with torch.random.fork_rng(devices=[0,]):
+ hifiGAN(y_pred, y, reduced_loss_dict, loss_dict, loss_scalars)
+
+ # get current Loss Scale of first optimizer
+ loss_scale = amp._amp_state.loss_scalers[0]._loss_scale if hparams.fp16_run else 32768.
+
+ # restart if training/model has collapsed
+ if (iteration > 1e3 and (reduced_loss > LossExplosionThreshold)) or (math.isnan(reduced_loss)):
+ raise LossExplosion(f"\nLOSS EXPLOSION EXCEPTION ON RANK {rank}: Loss reached {reduced_loss} during iteration {iteration}.\n\n\n")
+ if (loss_scale < 1/4):
+ raise LossExplosion(f"\nLOSS EXCEPTION ON RANK {rank}: Loss Scaler reached {loss_scale} during iteration {iteration}.\n\n\n")
+
+ if expavg_loss_dict is None:
+ expavg_loss_dict = reduced_loss_dict
+ else:
+ expavg_loss_dict.update({k:v for k, v in reduced_loss_dict.items() if k not in expavg_loss_dict.keys()})# if new loss term appears in reduced_loss_dict, add it to the expavg_loss_dict.
+ expavg_loss_dict = {k: (reduced_loss_dict[k]*(1-loss_dict_smoothness))+(expavg_loss_dict[k]*loss_dict_smoothness) for k in expavg_loss_dict.keys() if k in reduced_loss_dict}
+ expavg_loss_dict_iters += 1
+
+ if expavg_loss_dict_iters > 100:# calc smoothed loss dict
+ if best_loss_dict is None:
+ best_loss_dict = expavg_loss_dict
+ else:
+ best_loss_dict = {k: min(best_loss_dict[k], expavg_loss_dict[k]) for k in best_loss_dict.keys() if k in expavg_loss_dict}
+
+ if rank == 0:# print + log metrics
+ duration = time.time() - start_time
+ if not is_overflow:
+ average_loss = rolling_loss.process(reduced_loss)
+ dbGANAccStr = expavg_loss_dict.get('dbGAN_accuracy', None) or reduced_loss_dict.get('dbGAN_accuracy', 0.5)
+ InfGANAccStr = expavg_loss_dict.get('InfGAN_accuracy', None) or reduced_loss_dict.get('InfGAN_accuracy', 0.5)
+ WScoreStr = expavg_loss_dict.get('weighted_score' , None) or reduced_loss_dict.get('weighted_score' , 0.0)
+ logger.log_training(model, reduced_loss_dict, expavg_loss_dict, best_loss_dict, grad_norm, learning_rate, duration, iteration)
+ tqdm.write(
+ f"{iteration} [TrainLoss:{reduced_loss:.3f} Avg:{average_loss:.3f}] "
+ f"[{grad_norm:03.1f}GradNorm] [{duration:.2f}s/it] "
+ f"[{(duration/(hparams.batch_size*args.n_gpus)):.3f}s/file] "
+ f"[{learning_rate:.1e}LR] [{loss_scale:.0f}LS] "
+ f"[{WScoreStr:.1%}AttSc] [{dbGANAccStr:.1%}dbGANAcc] [{InfGANAccStr:.1%}InfGANAcc]")
+ if is_overflow:
+ tqdm.write("Gradient Overflow, Skipping Step\n")
+ start_time = time.time()
+
+ if iteration%checkpoint_interval==0 or os.path.exists(save_file_check_path):# save model checkpoint every X iters
+ if rank == 0:
+ checkpoint_path = os.path.join(args.output_directory, f"checkpoint_{iteration}")
+ save_checkpoint(model, optimizer, hifiGAN, learning_rate, iteration, hparams, best_validation_loss, average_loss, best_val_loss_dict, best_loss_dict, speaker_lookup, speakerlist, checkpoint_path)
+
+ if iteration%dump_filelosses_interval==0:# syncronise file_losses between graphics cards
+ print("Updating File_losses dict!")
+ file_losses = write_dict_to_file(file_losses, os.path.join(args.output_directory, 'file_losses.csv'),
+ os.path.join(args.output_directory, 'speaker_losses.csv'), speakerlist, args.n_gpus, rank)
+
+ if (iteration % int(validation_interval) == 0) or (os.path.exists(save_file_check_path)):# validate models and save 'best_val_model' checkpoints
+ if rank == 0 and os.path.exists(save_file_check_path):
+ os.remove(save_file_check_path)
+ # perform validation and save "best_val_model" depending on validation loss
+ val_loss, best_val_loss_dict, file_losses = validate(hparams, args, file_losses, model, criterion, hifiGAN, valset, loss_scalars, best_val_loss_dict, iteration, collate_fn, logger)# validate/teacher_force
+ file_losses = write_dict_to_file(file_losses, os.path.join(args.output_directory, 'file_losses.csv'),
+ os.path.join(args.output_directory, 'speaker_losses.csv'), speakerlist, args.n_gpus, rank)
+ if (val_loss < best_validation_loss):
+ best_validation_loss = val_loss
+ if rank == 0 and hparams.save_best_val_model:
+ checkpoint_path = os.path.join(args.output_directory, "best_val_model")
+ save_checkpoint(
+ model, optimizer, hifiGAN, learning_rate, iteration, hparams, best_validation_loss,
+ average_loss, best_val_loss_dict, best_loss_dict, speaker_lookup, speakerlist, checkpoint_path)
+ just_did_val = True
+
+ del y_pred, y, batch, loss_dict, reduced_loss_dict
+ iteration += 1
+ # end of iteration loop
# update filelist of training dataloader
print("Updating File_losses dict!")
From f6397729fc1c857b21df1470f7a8ddfc065823fd Mon Sep 17 00:00:00 2001
From: Sami <58188482+RAYTRAC3R@users.noreply.github.com>
Date: Sat, 13 Mar 2021 14:37:44 -0600
Subject: [PATCH 5/5] Update train.py
---
CookieTTS/_2_ttm/VDVAETTS/train.py | 311 ++++++++++++++---------------
1 file changed, 154 insertions(+), 157 deletions(-)
diff --git a/CookieTTS/_2_ttm/VDVAETTS/train.py b/CookieTTS/_2_ttm/VDVAETTS/train.py
index ae8abda..49981bd 100644
--- a/CookieTTS/_2_ttm/VDVAETTS/train.py
+++ b/CookieTTS/_2_ttm/VDVAETTS/train.py
@@ -758,165 +758,163 @@ def train(args, rank, group_name, hparams):
y = model.parse_batch(batch)# move batch to GPU (async)
y['gt_mel'].requires_grad_()
y['use_pred_z'] = False
- print(y['gt_mel'].shape[2], y['gt_frame_logf0'].shape[1])
- if y['gt_mel'].shape[2] == y['gt_frame_logf0'].shape[1]:
- y_pred = force(model, valid_kwargs=model_args, **y)
-
- loss_scalars = {
- "decoder_MAE_weight": decoder_MAE_weight,
- "decoder_MSE_weight": decoder_MSE_weight,
- "decoder_KLD_weight": decoder_KLD_weight,
- "varpred_MAE_weight": varpred_MAE_weight,
- "varpred_MSE_weight": varpred_MSE_weight,
- "varpred_KLD_weight": varpred_KLD_weight,
- "postnet_f0_MAE_weight": postnet_f0_MAE_weight,
- "postnet_f0_MSE_weight": postnet_f0_MSE_weight,
- "postnet_voiced_MAE_weight": postnet_voiced_MAE_weight,
- "postnet_voiced_BCE_weight": postnet_voiced_BCE_weight,
- "postnet_KLD_weight": postnet_KLD_weight,
- "postnet_MAE_weight": postnet_MAE_weight,
- "postnet_MSE_weight": postnet_MSE_weight,
- "mdn_loss_weight": mdn_loss_weight,
- "dur_loss_weight": dur_loss_weight,
- "sylps_MAE_weight": sylps_MAE_weight,
- "sylps_MSE_weight": sylps_MSE_weight,
- "diag_att_weight": diag_att_weight,
- "HiFiGAN_g_all_class_weight": HiFiGAN_g_all_class_weight,
- "HiFiGAN_g_all_featuremap_weight": HiFiGAN_g_all_featuremap_weight,
- "HiFiGAN_g_all_mel_mae_weight": HiFiGAN_g_all_mel_mae_weight,
- "HiFiGAN_d_all_class_weight": HiFiGAN_d_all_class_weight,
- }
- loss_dict, file_losses_batch = criterion(iteration, model, y_pred, y, loss_scalars,
- hifiGAN if hparams.HiFiGAN_enable else None,)
-
- file_losses = update_smoothed_dict(file_losses, file_losses_batch, file_losses_smoothness)
-
- if hparams.distributed_run:
- reduced_loss_dict = {k: reduce_tensor(v.data, args.n_gpus).item() if v is not None else 0. for k, v in loss_dict.items()}
- else:
- reduced_loss_dict = {k: v.item() if v is not None else 0. for k, v in loss_dict.items()}
-
- reduced_loss = reduced_loss_dict['loss']
-
- loss = loss_dict['loss']
+ y_pred = force(model, valid_kwargs=model_args, **y)
+
+ loss_scalars = {
+ "decoder_MAE_weight": decoder_MAE_weight,
+ "decoder_MSE_weight": decoder_MSE_weight,
+ "decoder_KLD_weight": decoder_KLD_weight,
+ "varpred_MAE_weight": varpred_MAE_weight,
+ "varpred_MSE_weight": varpred_MSE_weight,
+ "varpred_KLD_weight": varpred_KLD_weight,
+ "postnet_f0_MAE_weight": postnet_f0_MAE_weight,
+ "postnet_f0_MSE_weight": postnet_f0_MSE_weight,
+ "postnet_voiced_MAE_weight": postnet_voiced_MAE_weight,
+ "postnet_voiced_BCE_weight": postnet_voiced_BCE_weight,
+ "postnet_KLD_weight": postnet_KLD_weight,
+ "postnet_MAE_weight": postnet_MAE_weight,
+ "postnet_MSE_weight": postnet_MSE_weight,
+ "mdn_loss_weight": mdn_loss_weight,
+ "dur_loss_weight": dur_loss_weight,
+ "sylps_MAE_weight": sylps_MAE_weight,
+ "sylps_MSE_weight": sylps_MSE_weight,
+ "diag_att_weight": diag_att_weight,
+ "HiFiGAN_g_all_class_weight": HiFiGAN_g_all_class_weight,
+ "HiFiGAN_g_all_featuremap_weight": HiFiGAN_g_all_featuremap_weight,
+ "HiFiGAN_g_all_mel_mae_weight": HiFiGAN_g_all_mel_mae_weight,
+ "HiFiGAN_d_all_class_weight": HiFiGAN_d_all_class_weight,
+ }
+ loss_dict, file_losses_batch = criterion(iteration, model, y_pred, y, loss_scalars,
+ hifiGAN if hparams.HiFiGAN_enable else None,)
+
+ file_losses = update_smoothed_dict(file_losses, file_losses_batch, file_losses_smoothness)
+
+ if hparams.distributed_run:
+ reduced_loss_dict = {k: reduce_tensor(v.data, args.n_gpus).item() if v is not None else 0. for k, v in loss_dict.items()}
+ else:
+ reduced_loss_dict = {k: v.item() if v is not None else 0. for k, v in loss_dict.items()}
+
+ reduced_loss = reduced_loss_dict['loss']
+
+ loss = loss_dict['loss']
+ if hparams.fp16_run:
+ with amp.scale_loss(loss, optimizer) as scaled_loss:
+ scaled_loss.backward()
+ else:
+ loss.backward()
+
+ if rank==0 and show_gradients:# debug/extreme verbose
+ try:
+ _=avg_grads
+ except:
+ avg_grads = {}
+ for param_name, params in model.named_parameters():
+ if params.requires_grad and params.grad is not None:
+ norm_grad = 1.0
+ grad = params.grad.abs().sum().item()
+ if param_name not in avg_grads:
+ avg_grads[param_name] = grad
+ elif grad*5. < avg_grads[param_name]:
+ avg_grads[param_name] = (avg_grads[param_name]*0.9)+(grad*0.1)
+ norm_grad = grad/avg_grads[param_name]
+ if grad > 30.0 or norm_grad > 2.0:
+ print(f'{norm_grad:03.1f} | {grad:020.6f} | {params.grad.abs().mean().item():06.6f}| {params.grad.abs().max().item():010.6f} | {param_name}')
+
+ if hparams.HiFiGAN_enable and y_pred['hifigan_enabled']:# HiFiGAN isn't supposed to use gradient clipping so the optimizer.
+ hifiGAN.g_optimizer_step_and_clear()# should be ran before gradient clipping occurs.
+
+ if grad_clip_thresh:# apply gradient clipping to params
if hparams.fp16_run:
- with amp.scale_loss(loss, optimizer) as scaled_loss:
- scaled_loss.backward()
+ grad_norm = torch.nn.utils.clip_grad_norm_(
+ amp.master_params(optimizer), grad_clip_thresh)
+ is_overflow = math.isinf(grad_norm) or math.isnan(grad_norm)
else:
- loss.backward()
-
- if rank==0 and show_gradients:# debug/extreme verbose
- try:
- _=avg_grads
- except:
- avg_grads = {}
- for param_name, params in model.named_parameters():
- if params.requires_grad and params.grad is not None:
- norm_grad = 1.0
- grad = params.grad.abs().sum().item()
- if param_name not in avg_grads:
- avg_grads[param_name] = grad
- elif grad*5. < avg_grads[param_name]:
- avg_grads[param_name] = (avg_grads[param_name]*0.9)+(grad*0.1)
- norm_grad = grad/avg_grads[param_name]
- if grad > 30.0 or norm_grad > 2.0:
- print(f'{norm_grad:03.1f} | {grad:020.6f} | {params.grad.abs().mean().item():06.6f}| {params.grad.abs().max().item():010.6f} | {param_name}')
-
- if hparams.HiFiGAN_enable and y_pred['hifigan_enabled']:# HiFiGAN isn't supposed to use gradient clipping so the optimizer.
- hifiGAN.g_optimizer_step_and_clear()# should be ran before gradient clipping occurs.
-
- if grad_clip_thresh:# apply gradient clipping to params
- if hparams.fp16_run:
- grad_norm = torch.nn.utils.clip_grad_norm_(
- amp.master_params(optimizer), grad_clip_thresh)
- is_overflow = math.isinf(grad_norm) or math.isnan(grad_norm)
- else:
- grad_norm = torch.nn.utils.clip_grad_norm_(
- model.parameters(), grad_clip_thresh)
- else:
- grad_norm = 0.0
-
- if math.isfinite(grad_norm):
- optimizer.step()
-
- # calcuate the effective learning rate after gradient clipping is applied, and use the effective learning rate on the GAN modules.
- effective_lr = 0.0 if is_overflow else (learning_rate*min((grad_clip_thresh/grad_norm+1e-6), 1.0) if grad_clip_thresh else learning_rate)
-
- # (Optional) Discriminator Forward+Backward Pass
- if hparams.HiFiGAN_enable and y_pred['hifigan_enabled']:
- hifiGAN.train(model.training)
- with torch.random.fork_rng(devices=[0,]):
- hifiGAN(y_pred, y, reduced_loss_dict, loss_dict, loss_scalars)
-
- # get current Loss Scale of first optimizer
- loss_scale = amp._amp_state.loss_scalers[0]._loss_scale if hparams.fp16_run else 32768.
-
- # restart if training/model has collapsed
- if (iteration > 1e3 and (reduced_loss > LossExplosionThreshold)) or (math.isnan(reduced_loss)):
- raise LossExplosion(f"\nLOSS EXPLOSION EXCEPTION ON RANK {rank}: Loss reached {reduced_loss} during iteration {iteration}.\n\n\n")
- if (loss_scale < 1/4):
- raise LossExplosion(f"\nLOSS EXCEPTION ON RANK {rank}: Loss Scaler reached {loss_scale} during iteration {iteration}.\n\n\n")
-
- if expavg_loss_dict is None:
- expavg_loss_dict = reduced_loss_dict
+ grad_norm = torch.nn.utils.clip_grad_norm_(
+ model.parameters(), grad_clip_thresh)
+ else:
+ grad_norm = 0.0
+
+ if math.isfinite(grad_norm):
+ optimizer.step()
+
+ # calcuate the effective learning rate after gradient clipping is applied, and use the effective learning rate on the GAN modules.
+ effective_lr = 0.0 if is_overflow else (learning_rate*min((grad_clip_thresh/grad_norm+1e-6), 1.0) if grad_clip_thresh else learning_rate)
+
+ # (Optional) Discriminator Forward+Backward Pass
+ if hparams.HiFiGAN_enable and y_pred['hifigan_enabled']:
+ hifiGAN.train(model.training)
+ with torch.random.fork_rng(devices=[0,]):
+ hifiGAN(y_pred, y, reduced_loss_dict, loss_dict, loss_scalars)
+
+ # get current Loss Scale of first optimizer
+ loss_scale = amp._amp_state.loss_scalers[0]._loss_scale if hparams.fp16_run else 32768.
+
+ # restart if training/model has collapsed
+ if (iteration > 1e3 and (reduced_loss > LossExplosionThreshold)) or (math.isnan(reduced_loss)):
+ raise LossExplosion(f"\nLOSS EXPLOSION EXCEPTION ON RANK {rank}: Loss reached {reduced_loss} during iteration {iteration}.\n\n\n")
+ if (loss_scale < 1/4):
+ raise LossExplosion(f"\nLOSS EXCEPTION ON RANK {rank}: Loss Scaler reached {loss_scale} during iteration {iteration}.\n\n\n")
+
+ if expavg_loss_dict is None:
+ expavg_loss_dict = reduced_loss_dict
+ else:
+ expavg_loss_dict.update({k:v for k, v in reduced_loss_dict.items() if k not in expavg_loss_dict.keys()})# if new loss term appears in reduced_loss_dict, add it to the expavg_loss_dict.
+ expavg_loss_dict = {k: (reduced_loss_dict[k]*(1-loss_dict_smoothness))+(expavg_loss_dict[k]*loss_dict_smoothness) for k in expavg_loss_dict.keys() if k in reduced_loss_dict}
+ expavg_loss_dict_iters += 1
+
+ if expavg_loss_dict_iters > 100:# calc smoothed loss dict
+ if best_loss_dict is None:
+ best_loss_dict = expavg_loss_dict
else:
- expavg_loss_dict.update({k:v for k, v in reduced_loss_dict.items() if k not in expavg_loss_dict.keys()})# if new loss term appears in reduced_loss_dict, add it to the expavg_loss_dict.
- expavg_loss_dict = {k: (reduced_loss_dict[k]*(1-loss_dict_smoothness))+(expavg_loss_dict[k]*loss_dict_smoothness) for k in expavg_loss_dict.keys() if k in reduced_loss_dict}
- expavg_loss_dict_iters += 1
-
- if expavg_loss_dict_iters > 100:# calc smoothed loss dict
- if best_loss_dict is None:
- best_loss_dict = expavg_loss_dict
- else:
- best_loss_dict = {k: min(best_loss_dict[k], expavg_loss_dict[k]) for k in best_loss_dict.keys() if k in expavg_loss_dict}
-
- if rank == 0:# print + log metrics
- duration = time.time() - start_time
- if not is_overflow:
- average_loss = rolling_loss.process(reduced_loss)
- dbGANAccStr = expavg_loss_dict.get('dbGAN_accuracy', None) or reduced_loss_dict.get('dbGAN_accuracy', 0.5)
- InfGANAccStr = expavg_loss_dict.get('InfGAN_accuracy', None) or reduced_loss_dict.get('InfGAN_accuracy', 0.5)
- WScoreStr = expavg_loss_dict.get('weighted_score' , None) or reduced_loss_dict.get('weighted_score' , 0.0)
- logger.log_training(model, reduced_loss_dict, expavg_loss_dict, best_loss_dict, grad_norm, learning_rate, duration, iteration)
- tqdm.write(
- f"{iteration} [TrainLoss:{reduced_loss:.3f} Avg:{average_loss:.3f}] "
- f"[{grad_norm:03.1f}GradNorm] [{duration:.2f}s/it] "
- f"[{(duration/(hparams.batch_size*args.n_gpus)):.3f}s/file] "
- f"[{learning_rate:.1e}LR] [{loss_scale:.0f}LS] "
- f"[{WScoreStr:.1%}AttSc] [{dbGANAccStr:.1%}dbGANAcc] [{InfGANAccStr:.1%}InfGANAcc]")
- if is_overflow:
- tqdm.write("Gradient Overflow, Skipping Step\n")
- start_time = time.time()
-
- if iteration%checkpoint_interval==0 or os.path.exists(save_file_check_path):# save model checkpoint every X iters
- if rank == 0:
- checkpoint_path = os.path.join(args.output_directory, f"checkpoint_{iteration}")
- save_checkpoint(model, optimizer, hifiGAN, learning_rate, iteration, hparams, best_validation_loss, average_loss, best_val_loss_dict, best_loss_dict, speaker_lookup, speakerlist, checkpoint_path)
-
- if iteration%dump_filelosses_interval==0:# syncronise file_losses between graphics cards
- print("Updating File_losses dict!")
- file_losses = write_dict_to_file(file_losses, os.path.join(args.output_directory, 'file_losses.csv'),
- os.path.join(args.output_directory, 'speaker_losses.csv'), speakerlist, args.n_gpus, rank)
-
- if (iteration % int(validation_interval) == 0) or (os.path.exists(save_file_check_path)):# validate models and save 'best_val_model' checkpoints
- if rank == 0 and os.path.exists(save_file_check_path):
- os.remove(save_file_check_path)
- # perform validation and save "best_val_model" depending on validation loss
- val_loss, best_val_loss_dict, file_losses = validate(hparams, args, file_losses, model, criterion, hifiGAN, valset, loss_scalars, best_val_loss_dict, iteration, collate_fn, logger)# validate/teacher_force
- file_losses = write_dict_to_file(file_losses, os.path.join(args.output_directory, 'file_losses.csv'),
- os.path.join(args.output_directory, 'speaker_losses.csv'), speakerlist, args.n_gpus, rank)
- if (val_loss < best_validation_loss):
- best_validation_loss = val_loss
- if rank == 0 and hparams.save_best_val_model:
- checkpoint_path = os.path.join(args.output_directory, "best_val_model")
- save_checkpoint(
- model, optimizer, hifiGAN, learning_rate, iteration, hparams, best_validation_loss,
- average_loss, best_val_loss_dict, best_loss_dict, speaker_lookup, speakerlist, checkpoint_path)
- just_did_val = True
-
- del y_pred, y, batch, loss_dict, reduced_loss_dict
- iteration += 1
- # end of iteration loop
+ best_loss_dict = {k: min(best_loss_dict[k], expavg_loss_dict[k]) for k in best_loss_dict.keys() if k in expavg_loss_dict}
+
+ if rank == 0:# print + log metrics
+ duration = time.time() - start_time
+ if not is_overflow:
+ average_loss = rolling_loss.process(reduced_loss)
+ dbGANAccStr = expavg_loss_dict.get('dbGAN_accuracy', None) or reduced_loss_dict.get('dbGAN_accuracy', 0.5)
+ InfGANAccStr = expavg_loss_dict.get('InfGAN_accuracy', None) or reduced_loss_dict.get('InfGAN_accuracy', 0.5)
+ WScoreStr = expavg_loss_dict.get('weighted_score' , None) or reduced_loss_dict.get('weighted_score' , 0.0)
+ logger.log_training(model, reduced_loss_dict, expavg_loss_dict, best_loss_dict, grad_norm, learning_rate, duration, iteration)
+ tqdm.write(
+ f"{iteration} [TrainLoss:{reduced_loss:.3f} Avg:{average_loss:.3f}] "
+ f"[{grad_norm:03.1f}GradNorm] [{duration:.2f}s/it] "
+ f"[{(duration/(hparams.batch_size*args.n_gpus)):.3f}s/file] "
+ f"[{learning_rate:.1e}LR] [{loss_scale:.0f}LS] "
+ f"[{WScoreStr:.1%}AttSc] [{dbGANAccStr:.1%}dbGANAcc] [{InfGANAccStr:.1%}InfGANAcc]")
+ if is_overflow:
+ tqdm.write("Gradient Overflow, Skipping Step\n")
+ start_time = time.time()
+
+ if iteration%checkpoint_interval==0 or os.path.exists(save_file_check_path):# save model checkpoint every X iters
+ if rank == 0:
+ checkpoint_path = os.path.join(args.output_directory, f"checkpoint_{iteration}")
+ save_checkpoint(model, optimizer, hifiGAN, learning_rate, iteration, hparams, best_validation_loss, average_loss, best_val_loss_dict, best_loss_dict, speaker_lookup, speakerlist, checkpoint_path)
+
+ if iteration%dump_filelosses_interval==0:# syncronise file_losses between graphics cards
+ print("Updating File_losses dict!")
+ file_losses = write_dict_to_file(file_losses, os.path.join(args.output_directory, 'file_losses.csv'),
+ os.path.join(args.output_directory, 'speaker_losses.csv'), speakerlist, args.n_gpus, rank)
+
+ if (iteration % int(validation_interval) == 0) or (os.path.exists(save_file_check_path)):# validate models and save 'best_val_model' checkpoints
+ if rank == 0 and os.path.exists(save_file_check_path):
+ os.remove(save_file_check_path)
+ # perform validation and save "best_val_model" depending on validation loss
+ val_loss, best_val_loss_dict, file_losses = validate(hparams, args, file_losses, model, criterion, hifiGAN, valset, loss_scalars, best_val_loss_dict, iteration, collate_fn, logger)# validate/teacher_force
+ file_losses = write_dict_to_file(file_losses, os.path.join(args.output_directory, 'file_losses.csv'),
+ os.path.join(args.output_directory, 'speaker_losses.csv'), speakerlist, args.n_gpus, rank)
+ if (val_loss < best_validation_loss):
+ best_validation_loss = val_loss
+ if rank == 0 and hparams.save_best_val_model:
+ checkpoint_path = os.path.join(args.output_directory, "best_val_model")
+ save_checkpoint(
+ model, optimizer, hifiGAN, learning_rate, iteration, hparams, best_validation_loss,
+ average_loss, best_val_loss_dict, best_loss_dict, speaker_lookup, speakerlist, checkpoint_path)
+ just_did_val = True
+
+ del y_pred, y, batch, loss_dict, reduced_loss_dict
+ iteration += 1
+ # end of iteration loop
# update filelist of training dataloader
print("Updating File_losses dict!")
@@ -1007,4 +1005,3 @@ def train(args, rank, group_name, hparams):
pass
train(args, args.rank, args.group_name, hparams)
-