From 4f15f77ef1713a1f0d6b185bb6109a953c9a7cd4 Mon Sep 17 00:00:00 2001 From: Sami <58188482+RAYTRAC3R@users.noreply.github.com> Date: Fri, 12 Mar 2021 15:02:46 -0600 Subject: [PATCH 1/5] Add voice hybridization I'm not too good at coding, but I managed to add a multispeaker option that should mix the selected voices together when more than one speaker is chosen while the mode is enabled. --- CookieTTS/_5_infer/VDVAETTS_server/text2speech.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/CookieTTS/_5_infer/VDVAETTS_server/text2speech.py b/CookieTTS/_5_infer/VDVAETTS_server/text2speech.py index b8ed22f..e28cfd7 100644 --- a/CookieTTS/_5_infer/VDVAETTS_server/text2speech.py +++ b/CookieTTS/_5_infer/VDVAETTS_server/text2speech.py @@ -436,20 +436,25 @@ def shuffle_and_return(): speaker_names.append(speaker_names.pop(0)) return first_speaker batch_speaker_names = [shuffle_and_return() for i in range(simultaneous_texts)] + elif multispeaker_mode == "hybrid_voices": + batch_speaker_names = speaker_names * -(-simultaneous_texts//len(speaker_names)) else: raise NotImplementedError if 0:# (optional) use different speaker list for text inside quotes speaker_ids = [random.choice(speakers).split("|")[2] if ('"' in text) else random.choice(narrators).split("|")[2] for text in text_batch] # pick speaker if quotemark in text, else narrator - text_batch = [text.replace('"',"") for text in text_batch] # remove quotes from text + text_batch = [text.replace('"',"") for text in text_batch] # remove quotes from text if len(batch_speaker_names) > len(text_batch): batch_speaker_names = batch_speaker_names[:len(text_batch)] - simultaneous_texts = len(text_batch) + simultaneous_texts = len(text_batch) # get speaker_ids (VDVAETTS) VDVAETTS_speaker_ids = [self.ttm_sp_name_lookup[speaker] for speaker in batch_speaker_names] VDVAETTS_speaker_ids = torch.LongTensor(VDVAETTS_speaker_ids).cuda().repeat_interleave(batch_size_per_text) + #VDVAETTS_speaker_mix = [44] + #print(VDVAETTS_speaker_mix) + #VDVAETTS_speaker_mix = torch.LongTensor(VDVAETTS_speaker_mix).cuda().repeat_interleave(batch_size_per_text) # get style input try: @@ -503,7 +508,7 @@ def shuffle_and_return(): while np.amin(best_score) < target_score: # run VDVAETTS if status_updates: print("..", end='') - outputs = self.VDVAETTS.inference(sequence, text_lengths.repeat_interleave(batch_size_per_text, dim=0), VDVAETTS_speaker_ids, style_input, char_sigma=char_sigma, frame_sigma=frame_sigma) + outputs = self.VDVAETTS.inference(sequence, text_lengths.repeat_interleave(batch_size_per_text, dim=0), VDVAETTS_speaker_ids, style_input, multispeaker_mode, char_sigma=char_sigma, frame_sigma=frame_sigma) batch_pred_mel = outputs['hifigan_inputs'] if self.MTW_conf.uses_latent_input else outputs['pred_mel'] # metric for html side From 101bdb13fcb6f3f15a181ecfe9dfbc57d41afd9d Mon Sep 17 00:00:00 2001 From: Sami <58188482+RAYTRAC3R@users.noreply.github.com> Date: Fri, 12 Mar 2021 15:04:03 -0600 Subject: [PATCH 2/5] Update main.html --- CookieTTS/_5_infer/VDVAETTS_server/templates/main.html | 1 + 1 file changed, 1 insertion(+) diff --git a/CookieTTS/_5_infer/VDVAETTS_server/templates/main.html b/CookieTTS/_5_infer/VDVAETTS_server/templates/main.html index a767b71..f7f6c32 100644 --- a/CookieTTS/_5_infer/VDVAETTS_server/templates/main.html +++ b/CookieTTS/_5_infer/VDVAETTS_server/templates/main.html @@ -127,6 +127,7 @@

[main.html hunk: the HTML markup was stripped during extraction; the surviving context shows the "Text To Speech" page, and the single added line is presumably the selector entry for the new "hybrid_voices" multispeaker mode]
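
Note on the speaker-list fill added in [PATCH 1/5]: the expression -(-simultaneous_texts//len(speaker_names)) is integer ceil-division, so the selected speakers are tiled until they cover every simultaneous text slot and are then trimmed to the batch length a few lines further down. A minimal standalone sketch of that behaviour (the speaker names and counts below are invented for illustration, not taken from the patch):

    speaker_names = ["Speaker A", "Speaker B", "Speaker C"]   # hypothetical selection
    simultaneous_texts = 7

    # -(-a // b) equals ceil(a / b) for positive ints, so the list is repeated
    # just enough times to cover every batch slot before being trimmed.
    batch_speaker_names = speaker_names * -(-simultaneous_texts // len(speaker_names))
    print(batch_speaker_names[:simultaneous_texts])
    # ['Speaker A', 'Speaker B', 'Speaker C', 'Speaker A', 'Speaker B', 'Speaker C', 'Speaker A']

In hybrid mode each batch slot therefore carries a different speaker name; the actual blending of those voices happens later, in the model change of [PATCH 3/5].
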
From e4cb487a5476b09f5c7abfa6fa199a0a9f2c7236 Mon Sep 17 00:00:00 2001 From: Sami <58188482+RAYTRAC3R@users.noreply.github.com> Date: Fri, 12 Mar 2021 15:04:55 -0600 Subject: [PATCH 3/5] Update model.py --- CookieTTS/_2_ttm/VDVAETTS/model.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/CookieTTS/_2_ttm/VDVAETTS/model.py b/CookieTTS/_2_ttm/VDVAETTS/model.py index dd60d35..f0681de 100644 --- a/CookieTTS/_2_ttm/VDVAETTS/model.py +++ b/CookieTTS/_2_ttm/VDVAETTS/model.py @@ -1443,10 +1443,10 @@ def update_device(self, **inputs): outputs[key] = input return outputs - def inference(self, text_seq, text_lengths, speaker_id, torchmoji_hdn, + def inference(self, text_seq, text_lengths, speaker_id, torchmoji_hdn, multispeaker_mode, char_sigma=1.0, frame_sigma=1.0, bn_logdur=None, char_dur=None, gt_mel=None, alignment=None, - mel_lengths=None,):# [B, enc_T], [B], [B], [B], [B, tm_dim] + mel_lengths=None):# [B, enc_T], [B], [B], [B], [B, tm_dim] outputs = {} memory = [] @@ -1458,7 +1458,18 @@ def inference(self, text_seq, text_lengths, speaker_id, torchmoji_hdn, # (Speaker) speaker_id -> speaker_embed if hasattr(self, "speaker_embedding"): speaker_embed = self.speaker_embedding(speaker_id)# [B, embed] - outputs["speaker_embed"] = speaker_embed# [B, embed] + if multispeaker_mode == "hybrid_voices" and speaker_embed.shape[0] > 1: + splits = int(speaker_embed.shape[0] / 2) + mix_1, mix_2 = torch.split(speaker_embed, splits) + speaker_embed = torch.add(mix_1, mix_2) + speaker_embed = torch.div(speaker_embed, 2) + speaker_embed = speaker_embed.repeat(2, 1) + #outputs["speaker_embed"] = speaker_embed# [B, embed] + #speaker_embed_mix = self.speaker_embedding(speaker_mix)# [B, embed] + #outputs["speaker_embed_mix"] = speaker_embed_mix# [B, embed] + #print(speaker_embed_mix) + #speaker_embed = torch.div(torch.add(speaker_embed, speaker_embed_mix), 2) + outputs["speaker_embed"] = speaker_embed # (TorchMoji) if hasattr(self, 'tm_bn'): From 509856b066f34bc8f43d4eb3e0309ada4580ba13 Mon Sep 17 00:00:00 2001 From: Sami <58188482+RAYTRAC3R@users.noreply.github.com> Date: Sat, 13 Mar 2021 12:39:00 -0600 Subject: [PATCH 4/5] Update train.py --- CookieTTS/_2_ttm/VDVAETTS/train.py | 310 +++++++++++++++-------------- 1 file changed, 156 insertions(+), 154 deletions(-) diff --git a/CookieTTS/_2_ttm/VDVAETTS/train.py b/CookieTTS/_2_ttm/VDVAETTS/train.py index 42aece9..ae8abda 100644 --- a/CookieTTS/_2_ttm/VDVAETTS/train.py +++ b/CookieTTS/_2_ttm/VDVAETTS/train.py @@ -758,163 +758,165 @@ def train(args, rank, group_name, hparams): y = model.parse_batch(batch)# move batch to GPU (async) y['gt_mel'].requires_grad_() y['use_pred_z'] = False - y_pred = force(model, valid_kwargs=model_args, **y) - - loss_scalars = { - "decoder_MAE_weight": decoder_MAE_weight, - "decoder_MSE_weight": decoder_MSE_weight, - "decoder_KLD_weight": decoder_KLD_weight, - "varpred_MAE_weight": varpred_MAE_weight, - "varpred_MSE_weight": varpred_MSE_weight, - "varpred_KLD_weight": varpred_KLD_weight, - "postnet_f0_MAE_weight": postnet_f0_MAE_weight, - "postnet_f0_MSE_weight": postnet_f0_MSE_weight, - "postnet_voiced_MAE_weight": postnet_voiced_MAE_weight, - "postnet_voiced_BCE_weight": postnet_voiced_BCE_weight, - "postnet_KLD_weight": postnet_KLD_weight, - "postnet_MAE_weight": postnet_MAE_weight, - "postnet_MSE_weight": postnet_MSE_weight, - "mdn_loss_weight": mdn_loss_weight, - "dur_loss_weight": dur_loss_weight, - "sylps_MAE_weight": sylps_MAE_weight, - "sylps_MSE_weight": sylps_MSE_weight, - "diag_att_weight": 
diag_att_weight, - "HiFiGAN_g_all_class_weight": HiFiGAN_g_all_class_weight, - "HiFiGAN_g_all_featuremap_weight": HiFiGAN_g_all_featuremap_weight, - "HiFiGAN_g_all_mel_mae_weight": HiFiGAN_g_all_mel_mae_weight, - "HiFiGAN_d_all_class_weight": HiFiGAN_d_all_class_weight, - } - loss_dict, file_losses_batch = criterion(iteration, model, y_pred, y, loss_scalars, - hifiGAN if hparams.HiFiGAN_enable else None,) - - file_losses = update_smoothed_dict(file_losses, file_losses_batch, file_losses_smoothness) - - if hparams.distributed_run: - reduced_loss_dict = {k: reduce_tensor(v.data, args.n_gpus).item() if v is not None else 0. for k, v in loss_dict.items()} - else: - reduced_loss_dict = {k: v.item() if v is not None else 0. for k, v in loss_dict.items()} - - reduced_loss = reduced_loss_dict['loss'] - - loss = loss_dict['loss'] - if hparams.fp16_run: - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() - else: - loss.backward() - - if rank==0 and show_gradients:# debug/extreme verbose - try: - _=avg_grads - except: - avg_grads = {} - for param_name, params in model.named_parameters(): - if params.requires_grad and params.grad is not None: - norm_grad = 1.0 - grad = params.grad.abs().sum().item() - if param_name not in avg_grads: - avg_grads[param_name] = grad - elif grad*5. < avg_grads[param_name]: - avg_grads[param_name] = (avg_grads[param_name]*0.9)+(grad*0.1) - norm_grad = grad/avg_grads[param_name] - if grad > 30.0 or norm_grad > 2.0: - print(f'{norm_grad:03.1f} | {grad:020.6f} | {params.grad.abs().mean().item():06.6f}| {params.grad.abs().max().item():010.6f} | {param_name}') - - if hparams.HiFiGAN_enable and y_pred['hifigan_enabled']:# HiFiGAN isn't supposed to use gradient clipping so the optimizer. - hifiGAN.g_optimizer_step_and_clear()# should be ran before gradient clipping occurs. - - if grad_clip_thresh:# apply gradient clipping to params + print(y['gt_mel'].shape[2], y['gt_frame_logf0'].shape[1]) + if y['gt_mel'].shape[2] == y['gt_frame_logf0'].shape[1]: + y_pred = force(model, valid_kwargs=model_args, **y) + + loss_scalars = { + "decoder_MAE_weight": decoder_MAE_weight, + "decoder_MSE_weight": decoder_MSE_weight, + "decoder_KLD_weight": decoder_KLD_weight, + "varpred_MAE_weight": varpred_MAE_weight, + "varpred_MSE_weight": varpred_MSE_weight, + "varpred_KLD_weight": varpred_KLD_weight, + "postnet_f0_MAE_weight": postnet_f0_MAE_weight, + "postnet_f0_MSE_weight": postnet_f0_MSE_weight, + "postnet_voiced_MAE_weight": postnet_voiced_MAE_weight, + "postnet_voiced_BCE_weight": postnet_voiced_BCE_weight, + "postnet_KLD_weight": postnet_KLD_weight, + "postnet_MAE_weight": postnet_MAE_weight, + "postnet_MSE_weight": postnet_MSE_weight, + "mdn_loss_weight": mdn_loss_weight, + "dur_loss_weight": dur_loss_weight, + "sylps_MAE_weight": sylps_MAE_weight, + "sylps_MSE_weight": sylps_MSE_weight, + "diag_att_weight": diag_att_weight, + "HiFiGAN_g_all_class_weight": HiFiGAN_g_all_class_weight, + "HiFiGAN_g_all_featuremap_weight": HiFiGAN_g_all_featuremap_weight, + "HiFiGAN_g_all_mel_mae_weight": HiFiGAN_g_all_mel_mae_weight, + "HiFiGAN_d_all_class_weight": HiFiGAN_d_all_class_weight, + } + loss_dict, file_losses_batch = criterion(iteration, model, y_pred, y, loss_scalars, + hifiGAN if hparams.HiFiGAN_enable else None,) + + file_losses = update_smoothed_dict(file_losses, file_losses_batch, file_losses_smoothness) + + if hparams.distributed_run: + reduced_loss_dict = {k: reduce_tensor(v.data, args.n_gpus).item() if v is not None else 0. 
for k, v in loss_dict.items()} + else: + reduced_loss_dict = {k: v.item() if v is not None else 0. for k, v in loss_dict.items()} + + reduced_loss = reduced_loss_dict['loss'] + + loss = loss_dict['loss'] if hparams.fp16_run: - grad_norm = torch.nn.utils.clip_grad_norm_( - amp.master_params(optimizer), grad_clip_thresh) - is_overflow = math.isinf(grad_norm) or math.isnan(grad_norm) + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() else: - grad_norm = torch.nn.utils.clip_grad_norm_( - model.parameters(), grad_clip_thresh) - else: - grad_norm = 0.0 - - if math.isfinite(grad_norm): - optimizer.step() - - # calcuate the effective learning rate after gradient clipping is applied, and use the effective learning rate on the GAN modules. - effective_lr = 0.0 if is_overflow else (learning_rate*min((grad_clip_thresh/grad_norm+1e-6), 1.0) if grad_clip_thresh else learning_rate) - - # (Optional) Discriminator Forward+Backward Pass - if hparams.HiFiGAN_enable and y_pred['hifigan_enabled']: - hifiGAN.train(model.training) - with torch.random.fork_rng(devices=[0,]): - hifiGAN(y_pred, y, reduced_loss_dict, loss_dict, loss_scalars) - - # get current Loss Scale of first optimizer - loss_scale = amp._amp_state.loss_scalers[0]._loss_scale if hparams.fp16_run else 32768. - - # restart if training/model has collapsed - if (iteration > 1e3 and (reduced_loss > LossExplosionThreshold)) or (math.isnan(reduced_loss)): - raise LossExplosion(f"\nLOSS EXPLOSION EXCEPTION ON RANK {rank}: Loss reached {reduced_loss} during iteration {iteration}.\n\n\n") - if (loss_scale < 1/4): - raise LossExplosion(f"\nLOSS EXCEPTION ON RANK {rank}: Loss Scaler reached {loss_scale} during iteration {iteration}.\n\n\n") - - if expavg_loss_dict is None: - expavg_loss_dict = reduced_loss_dict - else: - expavg_loss_dict.update({k:v for k, v in reduced_loss_dict.items() if k not in expavg_loss_dict.keys()})# if new loss term appears in reduced_loss_dict, add it to the expavg_loss_dict. - expavg_loss_dict = {k: (reduced_loss_dict[k]*(1-loss_dict_smoothness))+(expavg_loss_dict[k]*loss_dict_smoothness) for k in expavg_loss_dict.keys() if k in reduced_loss_dict} - expavg_loss_dict_iters += 1 - - if expavg_loss_dict_iters > 100:# calc smoothed loss dict - if best_loss_dict is None: - best_loss_dict = expavg_loss_dict + loss.backward() + + if rank==0 and show_gradients:# debug/extreme verbose + try: + _=avg_grads + except: + avg_grads = {} + for param_name, params in model.named_parameters(): + if params.requires_grad and params.grad is not None: + norm_grad = 1.0 + grad = params.grad.abs().sum().item() + if param_name not in avg_grads: + avg_grads[param_name] = grad + elif grad*5. < avg_grads[param_name]: + avg_grads[param_name] = (avg_grads[param_name]*0.9)+(grad*0.1) + norm_grad = grad/avg_grads[param_name] + if grad > 30.0 or norm_grad > 2.0: + print(f'{norm_grad:03.1f} | {grad:020.6f} | {params.grad.abs().mean().item():06.6f}| {params.grad.abs().max().item():010.6f} | {param_name}') + + if hparams.HiFiGAN_enable and y_pred['hifigan_enabled']:# HiFiGAN isn't supposed to use gradient clipping so the optimizer. + hifiGAN.g_optimizer_step_and_clear()# should be ran before gradient clipping occurs. 
+ + if grad_clip_thresh:# apply gradient clipping to params + if hparams.fp16_run: + grad_norm = torch.nn.utils.clip_grad_norm_( + amp.master_params(optimizer), grad_clip_thresh) + is_overflow = math.isinf(grad_norm) or math.isnan(grad_norm) + else: + grad_norm = torch.nn.utils.clip_grad_norm_( + model.parameters(), grad_clip_thresh) else: - best_loss_dict = {k: min(best_loss_dict[k], expavg_loss_dict[k]) for k in best_loss_dict.keys() if k in expavg_loss_dict} - - if rank == 0:# print + log metrics - duration = time.time() - start_time - if not is_overflow: - average_loss = rolling_loss.process(reduced_loss) - dbGANAccStr = expavg_loss_dict.get('dbGAN_accuracy', None) or reduced_loss_dict.get('dbGAN_accuracy', 0.5) - InfGANAccStr = expavg_loss_dict.get('InfGAN_accuracy', None) or reduced_loss_dict.get('InfGAN_accuracy', 0.5) - WScoreStr = expavg_loss_dict.get('weighted_score' , None) or reduced_loss_dict.get('weighted_score' , 0.0) - logger.log_training(model, reduced_loss_dict, expavg_loss_dict, best_loss_dict, grad_norm, learning_rate, duration, iteration) - tqdm.write( - f"{iteration} [TrainLoss:{reduced_loss:.3f} Avg:{average_loss:.3f}] " - f"[{grad_norm:03.1f}GradNorm] [{duration:.2f}s/it] " - f"[{(duration/(hparams.batch_size*args.n_gpus)):.3f}s/file] " - f"[{learning_rate:.1e}LR] [{loss_scale:.0f}LS] " - f"[{WScoreStr:.1%}AttSc] [{dbGANAccStr:.1%}dbGANAcc] [{InfGANAccStr:.1%}InfGANAcc]") - if is_overflow: - tqdm.write("Gradient Overflow, Skipping Step\n") - start_time = time.time() - - if iteration%checkpoint_interval==0 or os.path.exists(save_file_check_path):# save model checkpoint every X iters - if rank == 0: - checkpoint_path = os.path.join(args.output_directory, f"checkpoint_{iteration}") - save_checkpoint(model, optimizer, hifiGAN, learning_rate, iteration, hparams, best_validation_loss, average_loss, best_val_loss_dict, best_loss_dict, speaker_lookup, speakerlist, checkpoint_path) - - if iteration%dump_filelosses_interval==0:# syncronise file_losses between graphics cards - print("Updating File_losses dict!") - file_losses = write_dict_to_file(file_losses, os.path.join(args.output_directory, 'file_losses.csv'), - os.path.join(args.output_directory, 'speaker_losses.csv'), speakerlist, args.n_gpus, rank) - - if (iteration % int(validation_interval) == 0) or (os.path.exists(save_file_check_path)):# validate models and save 'best_val_model' checkpoints - if rank == 0 and os.path.exists(save_file_check_path): - os.remove(save_file_check_path) - # perform validation and save "best_val_model" depending on validation loss - val_loss, best_val_loss_dict, file_losses = validate(hparams, args, file_losses, model, criterion, hifiGAN, valset, loss_scalars, best_val_loss_dict, iteration, collate_fn, logger)# validate/teacher_force - file_losses = write_dict_to_file(file_losses, os.path.join(args.output_directory, 'file_losses.csv'), - os.path.join(args.output_directory, 'speaker_losses.csv'), speakerlist, args.n_gpus, rank) - if (val_loss < best_validation_loss): - best_validation_loss = val_loss - if rank == 0 and hparams.save_best_val_model: - checkpoint_path = os.path.join(args.output_directory, "best_val_model") - save_checkpoint( - model, optimizer, hifiGAN, learning_rate, iteration, hparams, best_validation_loss, - average_loss, best_val_loss_dict, best_loss_dict, speaker_lookup, speakerlist, checkpoint_path) - just_did_val = True - - del y_pred, y, batch, loss_dict, reduced_loss_dict - iteration += 1 - # end of iteration loop + grad_norm = 0.0 + + if math.isfinite(grad_norm): + 
optimizer.step() + + # calcuate the effective learning rate after gradient clipping is applied, and use the effective learning rate on the GAN modules. + effective_lr = 0.0 if is_overflow else (learning_rate*min((grad_clip_thresh/grad_norm+1e-6), 1.0) if grad_clip_thresh else learning_rate) + + # (Optional) Discriminator Forward+Backward Pass + if hparams.HiFiGAN_enable and y_pred['hifigan_enabled']: + hifiGAN.train(model.training) + with torch.random.fork_rng(devices=[0,]): + hifiGAN(y_pred, y, reduced_loss_dict, loss_dict, loss_scalars) + + # get current Loss Scale of first optimizer + loss_scale = amp._amp_state.loss_scalers[0]._loss_scale if hparams.fp16_run else 32768. + + # restart if training/model has collapsed + if (iteration > 1e3 and (reduced_loss > LossExplosionThreshold)) or (math.isnan(reduced_loss)): + raise LossExplosion(f"\nLOSS EXPLOSION EXCEPTION ON RANK {rank}: Loss reached {reduced_loss} during iteration {iteration}.\n\n\n") + if (loss_scale < 1/4): + raise LossExplosion(f"\nLOSS EXCEPTION ON RANK {rank}: Loss Scaler reached {loss_scale} during iteration {iteration}.\n\n\n") + + if expavg_loss_dict is None: + expavg_loss_dict = reduced_loss_dict + else: + expavg_loss_dict.update({k:v for k, v in reduced_loss_dict.items() if k not in expavg_loss_dict.keys()})# if new loss term appears in reduced_loss_dict, add it to the expavg_loss_dict. + expavg_loss_dict = {k: (reduced_loss_dict[k]*(1-loss_dict_smoothness))+(expavg_loss_dict[k]*loss_dict_smoothness) for k in expavg_loss_dict.keys() if k in reduced_loss_dict} + expavg_loss_dict_iters += 1 + + if expavg_loss_dict_iters > 100:# calc smoothed loss dict + if best_loss_dict is None: + best_loss_dict = expavg_loss_dict + else: + best_loss_dict = {k: min(best_loss_dict[k], expavg_loss_dict[k]) for k in best_loss_dict.keys() if k in expavg_loss_dict} + + if rank == 0:# print + log metrics + duration = time.time() - start_time + if not is_overflow: + average_loss = rolling_loss.process(reduced_loss) + dbGANAccStr = expavg_loss_dict.get('dbGAN_accuracy', None) or reduced_loss_dict.get('dbGAN_accuracy', 0.5) + InfGANAccStr = expavg_loss_dict.get('InfGAN_accuracy', None) or reduced_loss_dict.get('InfGAN_accuracy', 0.5) + WScoreStr = expavg_loss_dict.get('weighted_score' , None) or reduced_loss_dict.get('weighted_score' , 0.0) + logger.log_training(model, reduced_loss_dict, expavg_loss_dict, best_loss_dict, grad_norm, learning_rate, duration, iteration) + tqdm.write( + f"{iteration} [TrainLoss:{reduced_loss:.3f} Avg:{average_loss:.3f}] " + f"[{grad_norm:03.1f}GradNorm] [{duration:.2f}s/it] " + f"[{(duration/(hparams.batch_size*args.n_gpus)):.3f}s/file] " + f"[{learning_rate:.1e}LR] [{loss_scale:.0f}LS] " + f"[{WScoreStr:.1%}AttSc] [{dbGANAccStr:.1%}dbGANAcc] [{InfGANAccStr:.1%}InfGANAcc]") + if is_overflow: + tqdm.write("Gradient Overflow, Skipping Step\n") + start_time = time.time() + + if iteration%checkpoint_interval==0 or os.path.exists(save_file_check_path):# save model checkpoint every X iters + if rank == 0: + checkpoint_path = os.path.join(args.output_directory, f"checkpoint_{iteration}") + save_checkpoint(model, optimizer, hifiGAN, learning_rate, iteration, hparams, best_validation_loss, average_loss, best_val_loss_dict, best_loss_dict, speaker_lookup, speakerlist, checkpoint_path) + + if iteration%dump_filelosses_interval==0:# syncronise file_losses between graphics cards + print("Updating File_losses dict!") + file_losses = write_dict_to_file(file_losses, os.path.join(args.output_directory, 'file_losses.csv'), + 
os.path.join(args.output_directory, 'speaker_losses.csv'), speakerlist, args.n_gpus, rank) + + if (iteration % int(validation_interval) == 0) or (os.path.exists(save_file_check_path)):# validate models and save 'best_val_model' checkpoints + if rank == 0 and os.path.exists(save_file_check_path): + os.remove(save_file_check_path) + # perform validation and save "best_val_model" depending on validation loss + val_loss, best_val_loss_dict, file_losses = validate(hparams, args, file_losses, model, criterion, hifiGAN, valset, loss_scalars, best_val_loss_dict, iteration, collate_fn, logger)# validate/teacher_force + file_losses = write_dict_to_file(file_losses, os.path.join(args.output_directory, 'file_losses.csv'), + os.path.join(args.output_directory, 'speaker_losses.csv'), speakerlist, args.n_gpus, rank) + if (val_loss < best_validation_loss): + best_validation_loss = val_loss + if rank == 0 and hparams.save_best_val_model: + checkpoint_path = os.path.join(args.output_directory, "best_val_model") + save_checkpoint( + model, optimizer, hifiGAN, learning_rate, iteration, hparams, best_validation_loss, + average_loss, best_val_loss_dict, best_loss_dict, speaker_lookup, speakerlist, checkpoint_path) + just_did_val = True + + del y_pred, y, batch, loss_dict, reduced_loss_dict + iteration += 1 + # end of iteration loop # update filelist of training dataloader print("Updating File_losses dict!") From f6397729fc1c857b21df1470f7a8ddfc065823fd Mon Sep 17 00:00:00 2001 From: Sami <58188482+RAYTRAC3R@users.noreply.github.com> Date: Sat, 13 Mar 2021 14:37:44 -0600 Subject: [PATCH 5/5] Update train.py --- CookieTTS/_2_ttm/VDVAETTS/train.py | 311 ++++++++++++++--------------- 1 file changed, 154 insertions(+), 157 deletions(-) diff --git a/CookieTTS/_2_ttm/VDVAETTS/train.py b/CookieTTS/_2_ttm/VDVAETTS/train.py index ae8abda..49981bd 100644 --- a/CookieTTS/_2_ttm/VDVAETTS/train.py +++ b/CookieTTS/_2_ttm/VDVAETTS/train.py @@ -758,165 +758,163 @@ def train(args, rank, group_name, hparams): y = model.parse_batch(batch)# move batch to GPU (async) y['gt_mel'].requires_grad_() y['use_pred_z'] = False - print(y['gt_mel'].shape[2], y['gt_frame_logf0'].shape[1]) - if y['gt_mel'].shape[2] == y['gt_frame_logf0'].shape[1]: - y_pred = force(model, valid_kwargs=model_args, **y) - - loss_scalars = { - "decoder_MAE_weight": decoder_MAE_weight, - "decoder_MSE_weight": decoder_MSE_weight, - "decoder_KLD_weight": decoder_KLD_weight, - "varpred_MAE_weight": varpred_MAE_weight, - "varpred_MSE_weight": varpred_MSE_weight, - "varpred_KLD_weight": varpred_KLD_weight, - "postnet_f0_MAE_weight": postnet_f0_MAE_weight, - "postnet_f0_MSE_weight": postnet_f0_MSE_weight, - "postnet_voiced_MAE_weight": postnet_voiced_MAE_weight, - "postnet_voiced_BCE_weight": postnet_voiced_BCE_weight, - "postnet_KLD_weight": postnet_KLD_weight, - "postnet_MAE_weight": postnet_MAE_weight, - "postnet_MSE_weight": postnet_MSE_weight, - "mdn_loss_weight": mdn_loss_weight, - "dur_loss_weight": dur_loss_weight, - "sylps_MAE_weight": sylps_MAE_weight, - "sylps_MSE_weight": sylps_MSE_weight, - "diag_att_weight": diag_att_weight, - "HiFiGAN_g_all_class_weight": HiFiGAN_g_all_class_weight, - "HiFiGAN_g_all_featuremap_weight": HiFiGAN_g_all_featuremap_weight, - "HiFiGAN_g_all_mel_mae_weight": HiFiGAN_g_all_mel_mae_weight, - "HiFiGAN_d_all_class_weight": HiFiGAN_d_all_class_weight, - } - loss_dict, file_losses_batch = criterion(iteration, model, y_pred, y, loss_scalars, - hifiGAN if hparams.HiFiGAN_enable else None,) - - file_losses = 
update_smoothed_dict(file_losses, file_losses_batch, file_losses_smoothness) - - if hparams.distributed_run: - reduced_loss_dict = {k: reduce_tensor(v.data, args.n_gpus).item() if v is not None else 0. for k, v in loss_dict.items()} - else: - reduced_loss_dict = {k: v.item() if v is not None else 0. for k, v in loss_dict.items()} - - reduced_loss = reduced_loss_dict['loss'] - - loss = loss_dict['loss'] + y_pred = force(model, valid_kwargs=model_args, **y) + + loss_scalars = { + "decoder_MAE_weight": decoder_MAE_weight, + "decoder_MSE_weight": decoder_MSE_weight, + "decoder_KLD_weight": decoder_KLD_weight, + "varpred_MAE_weight": varpred_MAE_weight, + "varpred_MSE_weight": varpred_MSE_weight, + "varpred_KLD_weight": varpred_KLD_weight, + "postnet_f0_MAE_weight": postnet_f0_MAE_weight, + "postnet_f0_MSE_weight": postnet_f0_MSE_weight, + "postnet_voiced_MAE_weight": postnet_voiced_MAE_weight, + "postnet_voiced_BCE_weight": postnet_voiced_BCE_weight, + "postnet_KLD_weight": postnet_KLD_weight, + "postnet_MAE_weight": postnet_MAE_weight, + "postnet_MSE_weight": postnet_MSE_weight, + "mdn_loss_weight": mdn_loss_weight, + "dur_loss_weight": dur_loss_weight, + "sylps_MAE_weight": sylps_MAE_weight, + "sylps_MSE_weight": sylps_MSE_weight, + "diag_att_weight": diag_att_weight, + "HiFiGAN_g_all_class_weight": HiFiGAN_g_all_class_weight, + "HiFiGAN_g_all_featuremap_weight": HiFiGAN_g_all_featuremap_weight, + "HiFiGAN_g_all_mel_mae_weight": HiFiGAN_g_all_mel_mae_weight, + "HiFiGAN_d_all_class_weight": HiFiGAN_d_all_class_weight, + } + loss_dict, file_losses_batch = criterion(iteration, model, y_pred, y, loss_scalars, + hifiGAN if hparams.HiFiGAN_enable else None,) + + file_losses = update_smoothed_dict(file_losses, file_losses_batch, file_losses_smoothness) + + if hparams.distributed_run: + reduced_loss_dict = {k: reduce_tensor(v.data, args.n_gpus).item() if v is not None else 0. for k, v in loss_dict.items()} + else: + reduced_loss_dict = {k: v.item() if v is not None else 0. for k, v in loss_dict.items()} + + reduced_loss = reduced_loss_dict['loss'] + + loss = loss_dict['loss'] + if hparams.fp16_run: + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + else: + loss.backward() + + if rank==0 and show_gradients:# debug/extreme verbose + try: + _=avg_grads + except: + avg_grads = {} + for param_name, params in model.named_parameters(): + if params.requires_grad and params.grad is not None: + norm_grad = 1.0 + grad = params.grad.abs().sum().item() + if param_name not in avg_grads: + avg_grads[param_name] = grad + elif grad*5. < avg_grads[param_name]: + avg_grads[param_name] = (avg_grads[param_name]*0.9)+(grad*0.1) + norm_grad = grad/avg_grads[param_name] + if grad > 30.0 or norm_grad > 2.0: + print(f'{norm_grad:03.1f} | {grad:020.6f} | {params.grad.abs().mean().item():06.6f}| {params.grad.abs().max().item():010.6f} | {param_name}') + + if hparams.HiFiGAN_enable and y_pred['hifigan_enabled']:# HiFiGAN isn't supposed to use gradient clipping so the optimizer. + hifiGAN.g_optimizer_step_and_clear()# should be ran before gradient clipping occurs. 
+ + if grad_clip_thresh:# apply gradient clipping to params if hparams.fp16_run: - with amp.scale_loss(loss, optimizer) as scaled_loss: - scaled_loss.backward() + grad_norm = torch.nn.utils.clip_grad_norm_( + amp.master_params(optimizer), grad_clip_thresh) + is_overflow = math.isinf(grad_norm) or math.isnan(grad_norm) else: - loss.backward() - - if rank==0 and show_gradients:# debug/extreme verbose - try: - _=avg_grads - except: - avg_grads = {} - for param_name, params in model.named_parameters(): - if params.requires_grad and params.grad is not None: - norm_grad = 1.0 - grad = params.grad.abs().sum().item() - if param_name not in avg_grads: - avg_grads[param_name] = grad - elif grad*5. < avg_grads[param_name]: - avg_grads[param_name] = (avg_grads[param_name]*0.9)+(grad*0.1) - norm_grad = grad/avg_grads[param_name] - if grad > 30.0 or norm_grad > 2.0: - print(f'{norm_grad:03.1f} | {grad:020.6f} | {params.grad.abs().mean().item():06.6f}| {params.grad.abs().max().item():010.6f} | {param_name}') - - if hparams.HiFiGAN_enable and y_pred['hifigan_enabled']:# HiFiGAN isn't supposed to use gradient clipping so the optimizer. - hifiGAN.g_optimizer_step_and_clear()# should be ran before gradient clipping occurs. - - if grad_clip_thresh:# apply gradient clipping to params - if hparams.fp16_run: - grad_norm = torch.nn.utils.clip_grad_norm_( - amp.master_params(optimizer), grad_clip_thresh) - is_overflow = math.isinf(grad_norm) or math.isnan(grad_norm) - else: - grad_norm = torch.nn.utils.clip_grad_norm_( - model.parameters(), grad_clip_thresh) - else: - grad_norm = 0.0 - - if math.isfinite(grad_norm): - optimizer.step() - - # calcuate the effective learning rate after gradient clipping is applied, and use the effective learning rate on the GAN modules. - effective_lr = 0.0 if is_overflow else (learning_rate*min((grad_clip_thresh/grad_norm+1e-6), 1.0) if grad_clip_thresh else learning_rate) - - # (Optional) Discriminator Forward+Backward Pass - if hparams.HiFiGAN_enable and y_pred['hifigan_enabled']: - hifiGAN.train(model.training) - with torch.random.fork_rng(devices=[0,]): - hifiGAN(y_pred, y, reduced_loss_dict, loss_dict, loss_scalars) - - # get current Loss Scale of first optimizer - loss_scale = amp._amp_state.loss_scalers[0]._loss_scale if hparams.fp16_run else 32768. - - # restart if training/model has collapsed - if (iteration > 1e3 and (reduced_loss > LossExplosionThreshold)) or (math.isnan(reduced_loss)): - raise LossExplosion(f"\nLOSS EXPLOSION EXCEPTION ON RANK {rank}: Loss reached {reduced_loss} during iteration {iteration}.\n\n\n") - if (loss_scale < 1/4): - raise LossExplosion(f"\nLOSS EXCEPTION ON RANK {rank}: Loss Scaler reached {loss_scale} during iteration {iteration}.\n\n\n") - - if expavg_loss_dict is None: - expavg_loss_dict = reduced_loss_dict + grad_norm = torch.nn.utils.clip_grad_norm_( + model.parameters(), grad_clip_thresh) + else: + grad_norm = 0.0 + + if math.isfinite(grad_norm): + optimizer.step() + + # calcuate the effective learning rate after gradient clipping is applied, and use the effective learning rate on the GAN modules. 
+ effective_lr = 0.0 if is_overflow else (learning_rate*min((grad_clip_thresh/grad_norm+1e-6), 1.0) if grad_clip_thresh else learning_rate) + + # (Optional) Discriminator Forward+Backward Pass + if hparams.HiFiGAN_enable and y_pred['hifigan_enabled']: + hifiGAN.train(model.training) + with torch.random.fork_rng(devices=[0,]): + hifiGAN(y_pred, y, reduced_loss_dict, loss_dict, loss_scalars) + + # get current Loss Scale of first optimizer + loss_scale = amp._amp_state.loss_scalers[0]._loss_scale if hparams.fp16_run else 32768. + + # restart if training/model has collapsed + if (iteration > 1e3 and (reduced_loss > LossExplosionThreshold)) or (math.isnan(reduced_loss)): + raise LossExplosion(f"\nLOSS EXPLOSION EXCEPTION ON RANK {rank}: Loss reached {reduced_loss} during iteration {iteration}.\n\n\n") + if (loss_scale < 1/4): + raise LossExplosion(f"\nLOSS EXCEPTION ON RANK {rank}: Loss Scaler reached {loss_scale} during iteration {iteration}.\n\n\n") + + if expavg_loss_dict is None: + expavg_loss_dict = reduced_loss_dict + else: + expavg_loss_dict.update({k:v for k, v in reduced_loss_dict.items() if k not in expavg_loss_dict.keys()})# if new loss term appears in reduced_loss_dict, add it to the expavg_loss_dict. + expavg_loss_dict = {k: (reduced_loss_dict[k]*(1-loss_dict_smoothness))+(expavg_loss_dict[k]*loss_dict_smoothness) for k in expavg_loss_dict.keys() if k in reduced_loss_dict} + expavg_loss_dict_iters += 1 + + if expavg_loss_dict_iters > 100:# calc smoothed loss dict + if best_loss_dict is None: + best_loss_dict = expavg_loss_dict else: - expavg_loss_dict.update({k:v for k, v in reduced_loss_dict.items() if k not in expavg_loss_dict.keys()})# if new loss term appears in reduced_loss_dict, add it to the expavg_loss_dict. - expavg_loss_dict = {k: (reduced_loss_dict[k]*(1-loss_dict_smoothness))+(expavg_loss_dict[k]*loss_dict_smoothness) for k in expavg_loss_dict.keys() if k in reduced_loss_dict} - expavg_loss_dict_iters += 1 - - if expavg_loss_dict_iters > 100:# calc smoothed loss dict - if best_loss_dict is None: - best_loss_dict = expavg_loss_dict - else: - best_loss_dict = {k: min(best_loss_dict[k], expavg_loss_dict[k]) for k in best_loss_dict.keys() if k in expavg_loss_dict} - - if rank == 0:# print + log metrics - duration = time.time() - start_time - if not is_overflow: - average_loss = rolling_loss.process(reduced_loss) - dbGANAccStr = expavg_loss_dict.get('dbGAN_accuracy', None) or reduced_loss_dict.get('dbGAN_accuracy', 0.5) - InfGANAccStr = expavg_loss_dict.get('InfGAN_accuracy', None) or reduced_loss_dict.get('InfGAN_accuracy', 0.5) - WScoreStr = expavg_loss_dict.get('weighted_score' , None) or reduced_loss_dict.get('weighted_score' , 0.0) - logger.log_training(model, reduced_loss_dict, expavg_loss_dict, best_loss_dict, grad_norm, learning_rate, duration, iteration) - tqdm.write( - f"{iteration} [TrainLoss:{reduced_loss:.3f} Avg:{average_loss:.3f}] " - f"[{grad_norm:03.1f}GradNorm] [{duration:.2f}s/it] " - f"[{(duration/(hparams.batch_size*args.n_gpus)):.3f}s/file] " - f"[{learning_rate:.1e}LR] [{loss_scale:.0f}LS] " - f"[{WScoreStr:.1%}AttSc] [{dbGANAccStr:.1%}dbGANAcc] [{InfGANAccStr:.1%}InfGANAcc]") - if is_overflow: - tqdm.write("Gradient Overflow, Skipping Step\n") - start_time = time.time() - - if iteration%checkpoint_interval==0 or os.path.exists(save_file_check_path):# save model checkpoint every X iters - if rank == 0: - checkpoint_path = os.path.join(args.output_directory, f"checkpoint_{iteration}") - save_checkpoint(model, optimizer, hifiGAN, learning_rate, 
iteration, hparams, best_validation_loss, average_loss, best_val_loss_dict, best_loss_dict, speaker_lookup, speakerlist, checkpoint_path) - - if iteration%dump_filelosses_interval==0:# syncronise file_losses between graphics cards - print("Updating File_losses dict!") - file_losses = write_dict_to_file(file_losses, os.path.join(args.output_directory, 'file_losses.csv'), - os.path.join(args.output_directory, 'speaker_losses.csv'), speakerlist, args.n_gpus, rank) - - if (iteration % int(validation_interval) == 0) or (os.path.exists(save_file_check_path)):# validate models and save 'best_val_model' checkpoints - if rank == 0 and os.path.exists(save_file_check_path): - os.remove(save_file_check_path) - # perform validation and save "best_val_model" depending on validation loss - val_loss, best_val_loss_dict, file_losses = validate(hparams, args, file_losses, model, criterion, hifiGAN, valset, loss_scalars, best_val_loss_dict, iteration, collate_fn, logger)# validate/teacher_force - file_losses = write_dict_to_file(file_losses, os.path.join(args.output_directory, 'file_losses.csv'), - os.path.join(args.output_directory, 'speaker_losses.csv'), speakerlist, args.n_gpus, rank) - if (val_loss < best_validation_loss): - best_validation_loss = val_loss - if rank == 0 and hparams.save_best_val_model: - checkpoint_path = os.path.join(args.output_directory, "best_val_model") - save_checkpoint( - model, optimizer, hifiGAN, learning_rate, iteration, hparams, best_validation_loss, - average_loss, best_val_loss_dict, best_loss_dict, speaker_lookup, speakerlist, checkpoint_path) - just_did_val = True - - del y_pred, y, batch, loss_dict, reduced_loss_dict - iteration += 1 - # end of iteration loop + best_loss_dict = {k: min(best_loss_dict[k], expavg_loss_dict[k]) for k in best_loss_dict.keys() if k in expavg_loss_dict} + + if rank == 0:# print + log metrics + duration = time.time() - start_time + if not is_overflow: + average_loss = rolling_loss.process(reduced_loss) + dbGANAccStr = expavg_loss_dict.get('dbGAN_accuracy', None) or reduced_loss_dict.get('dbGAN_accuracy', 0.5) + InfGANAccStr = expavg_loss_dict.get('InfGAN_accuracy', None) or reduced_loss_dict.get('InfGAN_accuracy', 0.5) + WScoreStr = expavg_loss_dict.get('weighted_score' , None) or reduced_loss_dict.get('weighted_score' , 0.0) + logger.log_training(model, reduced_loss_dict, expavg_loss_dict, best_loss_dict, grad_norm, learning_rate, duration, iteration) + tqdm.write( + f"{iteration} [TrainLoss:{reduced_loss:.3f} Avg:{average_loss:.3f}] " + f"[{grad_norm:03.1f}GradNorm] [{duration:.2f}s/it] " + f"[{(duration/(hparams.batch_size*args.n_gpus)):.3f}s/file] " + f"[{learning_rate:.1e}LR] [{loss_scale:.0f}LS] " + f"[{WScoreStr:.1%}AttSc] [{dbGANAccStr:.1%}dbGANAcc] [{InfGANAccStr:.1%}InfGANAcc]") + if is_overflow: + tqdm.write("Gradient Overflow, Skipping Step\n") + start_time = time.time() + + if iteration%checkpoint_interval==0 or os.path.exists(save_file_check_path):# save model checkpoint every X iters + if rank == 0: + checkpoint_path = os.path.join(args.output_directory, f"checkpoint_{iteration}") + save_checkpoint(model, optimizer, hifiGAN, learning_rate, iteration, hparams, best_validation_loss, average_loss, best_val_loss_dict, best_loss_dict, speaker_lookup, speakerlist, checkpoint_path) + + if iteration%dump_filelosses_interval==0:# syncronise file_losses between graphics cards + print("Updating File_losses dict!") + file_losses = write_dict_to_file(file_losses, os.path.join(args.output_directory, 'file_losses.csv'), + 
os.path.join(args.output_directory, 'speaker_losses.csv'), speakerlist, args.n_gpus, rank) + + if (iteration % int(validation_interval) == 0) or (os.path.exists(save_file_check_path)):# validate models and save 'best_val_model' checkpoints + if rank == 0 and os.path.exists(save_file_check_path): + os.remove(save_file_check_path) + # perform validation and save "best_val_model" depending on validation loss + val_loss, best_val_loss_dict, file_losses = validate(hparams, args, file_losses, model, criterion, hifiGAN, valset, loss_scalars, best_val_loss_dict, iteration, collate_fn, logger)# validate/teacher_force + file_losses = write_dict_to_file(file_losses, os.path.join(args.output_directory, 'file_losses.csv'), + os.path.join(args.output_directory, 'speaker_losses.csv'), speakerlist, args.n_gpus, rank) + if (val_loss < best_validation_loss): + best_validation_loss = val_loss + if rank == 0 and hparams.save_best_val_model: + checkpoint_path = os.path.join(args.output_directory, "best_val_model") + save_checkpoint( + model, optimizer, hifiGAN, learning_rate, iteration, hparams, best_validation_loss, + average_loss, best_val_loss_dict, best_loss_dict, speaker_lookup, speakerlist, checkpoint_path) + just_did_val = True + + del y_pred, y, batch, loss_dict, reduced_loss_dict + iteration += 1 + # end of iteration loop # update filelist of training dataloader print("Updating File_losses dict!") @@ -1007,4 +1005,3 @@ def train(args, rank, group_name, hparams): pass train(args, args.rank, args.group_name, hparams) -
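
Note on the embedding mix in [PATCH 3/5]: when multispeaker_mode == "hybrid_voices", inference() splits the speaker-embedding batch in half, averages the two halves element-wise, and tiles the result back to the original batch size. A standalone sketch of that operation (tensor sizes are illustrative; like the patch, it assumes an even batch size — with an odd batch, torch.split returns three chunks and the two-way unpack raises a ValueError):

    import torch

    batch, embed_dim = 4, 8                           # hypothetical sizes
    speaker_embed = torch.randn(batch, embed_dim)     # [B, embed], as in model.py

    half = speaker_embed.shape[0] // 2
    mix_1, mix_2 = torch.split(speaker_embed, half)   # two [B/2, embed] halves
    speaker_embed = (mix_1 + mix_2) / 2               # element-wise mean of the halves
    speaker_embed = speaker_embed.repeat(2, 1)        # tile back to [B, embed]

A plain average weights both halves equally; a weighted blend (for example torch.lerp(mix_1, mix_2, t)) would let the caller bias the mix toward one speaker, but that is beyond what this patch implements.
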