Skip to content

Commit

Permalink
change dilation back to 2
Browse files Browse the repository at this point in the history
  • Loading branch information
yoyolicoris committed Jun 1, 2019
1 parent e0d8d57 commit 1ede586
Show file tree
Hide file tree
Showing 4 changed files with 28 additions and 21 deletions.
6 changes: 3 additions & 3 deletions LJ_speech.json
Original file line number Diff line number Diff line change
Expand Up @@ -55,11 +55,11 @@
"trainer": {
"steps": 580000,
"save_dir": "/media/ycy/86A4D88BA4D87F5D/DataSet/waveglow-result/",
"save_freq": 20000,
"save_freq": 50,
"verbosity": 2
},
"visualization": {
"tensorboardX": false,
"log_dir": "saved/runs"
"tensorboardX": true,
"log_dir": "/media/ycy/86A4D88BA4D87F5D/DataSet/waveglow-result/runs"
}
}
19 changes: 10 additions & 9 deletions data_loader/data_loaders.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,22 +103,21 @@ def get_nframes(info_str):
else:
assert f_obj.samplerate == self.sr
self.file_lengths = np.array(self.file_lengths)
self.boundaries = np.cumsum(self.file_lengths)
self.boundaries = np.cumsum(self.file_lengths) / (self.file_lengths.sum() - 1)

# normalization value based on each file
# will be updated online
self.max_values = np.ones_like(self.boundaries) * 0.01
# will be updated on the fly
self.max_values = np.zeros_like(self.boundaries)

def __len__(self):
return self.size

def __getitem__(self, index):
index = random.randint(0, self.boundaries[-1] - 1)
index = np.digitize(index, self.boundaries)
index = np.digitize(random.uniform(0, 1), self.boundaries)
f, length = self.files[index], self.file_lengths[index]
pos = random.randint(0, length - self.segment - 1)
pos = random.randint(0, length - 1)
f.seek(pos)
x = f.read(self.segment, dtype='float32', always_2d=True).mean(1)
x = f.read(self.segment, dtype='float32', always_2d=True, fill_value=0.).mean(1)
max_abs = np.abs(x).max()
if max_abs > self.max_values[index]:
self.max_values[index] = max_abs
Expand Down Expand Up @@ -149,7 +148,9 @@ def __init__(self, steps, data_dir, batch_size, num_workers, **kwargs):


if __name__ == '__main__':
loader = RandomWaveFileLoader(100, '/media/ycy/86A4D88BA4D87F5D/DataSet/LJSpeech-1.1/wavs', 64, 0, segment=16000)
loader = RandomWaveFileLoader(100, '/media/ycy/86A4D88BA4D87F5D/DataSet/LJSpeech-1.1/wavs', 1, 0, segment=16000)

import matplotlib.pyplot as plt
for x in loader:
print(x[0, :10])
plt.plot(x[0].numpy())
plt.show()
2 changes: 1 addition & 1 deletion model/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ def __init__(self,
bias=False,
zero_init=True):
super().__init__()
dilations = radix ** torch.arange(depth)
dilations = 2 ** torch.arange(depth)
self.dilations = dilations.tolist()
self.in_chs = in_channels
self.res_chs = residual_channels
Expand Down
22 changes: 14 additions & 8 deletions trainer/trainer.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,17 @@ def train(self):
data = data.to(self.device)

self.optimizer.zero_grad()
*output, mels = self.model(data)
loss = self.loss(*output)
z, logdet, mels = self.model(data)
loss = self.loss(z, logdet)
loss.backward()
self.optimizer.step()

self.writer.set_step(step)
self.writer.add_scalar('loss', loss.item())
self.writer.add_scalar('log_determinant', logdet.mean().item())
self.writer.add_scalar('z_mean', z.mean().item())
self.writer.add_scalar('z_std', z.std().item())
self.writer.add_scalar('max_memory_allocated', torch.cuda.max_memory_allocated() / (1024 ** 2))

if self.verbosity >= 2 and step % self.log_step == 0:
self.logger.info('Train Step: [{}/{} ({:.0f}%)] Loss: {:.6f}'.format(
Expand All @@ -45,12 +49,14 @@ def train(self):
100.0 * step / self.steps,
loss.item()))
#self.writer.add_image('input', make_grid(data.cpu(), nrow=8, normalize=True))
self.writer.add_image('input mel-spectrum', mels[0].cpu(), dataformats='HW')
with torch.no_grad():
z = torch.randn_like(output[0][:1]) * output[0].std()
x, _ = self.model.inverse(z, mels[:1])
torch.clamp(x, -1, 1, out=x)
self.writer.add_audio('reconstruct audio', x.cpu(), sample_rate=self.model.sr)
mel_spec = mels[0].cpu()
mel_spec -= mel_spec.min()
mel_spec /= mel_spec.max()
self.writer.add_image('input_mel-spectrum', mel_spec.flip(0), dataformats='HW')

x = self.model.infer(mels[0], z[0].std().item())
torch.clamp(x, -1, 1, out=x)
self.writer.add_audio('reconstruct_audio', x.cpu()[None, :], sample_rate=self.model.sr)

if self.lr_scheduler is not None:
self.lr_scheduler.step()
Expand Down

0 comments on commit 1ede586

Please sign in to comment.