
Commit 2c47453

turn the model into TorchScript

1 parent f610d33
6 files changed: 110 additions & 162 deletions

mel2wav.py

Lines changed: 3 additions & 5 deletions

@@ -8,8 +8,6 @@
 import torch
 import soundfile as sf

-from models import Vocoder
-

 def parse_args():
     """Parse command-line arguments."""
@@ -25,14 +23,14 @@ def main(ckpt_path, npy_path, output_path):

     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

-    model = Vocoder.load_checkpoint(ckpt_path)
+    model = torch.jit.load(ckpt_path)
     model.to(device)
-    model.eval()

     mel = np.load(npy_path)
     mel = torch.FloatTensor(mel).to(device).transpose(0, 1).unsqueeze(0)

-    wav = model.generate(mel)
+    with torch.no_grad():
+        wav = model.generate(mel).squeeze().detach().cpu().numpy()

     npy_path_name = Path(npy_path).name
     wav_path = npy_path_name + ".wav" if output_path is None else output_path
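
Note: the point of this change is that a TorchScript archive bundles the module code together with its weights, so inference scripts no longer need the Vocoder class on their import path. A minimal sketch of the new loading flow (the checkpoint filename and the 80-bin mel shape are illustrative, not taken from this commit):

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# torch.jit.load restores code and weights together; no model class import needed.
model = torch.jit.load("vocoder-ckpt-100000.pt", map_location=device)

mel = torch.randn(1, 100, 80, device=device)  # (batch, mel_len, mel_dim)
with torch.no_grad():
    wav = model.generate(mel).squeeze().cpu().numpy()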

models/__init__.py

Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-from .vocoder import *
+from .universal_vocoder import *

models/universal_vocoder.py

Lines changed: 87 additions & 0 deletions

@@ -0,0 +1,87 @@
+"""Universal vocoder"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+
+class UniversalVocoder(nn.Module):
+    """Universal vocoding"""
+
+    def __init__(
+        self,
+        sample_rate,
+        frames_per_sample,
+        frames_per_slice,
+        mel_dim,
+        mel_rnn_dim,
+        emb_dim,
+        wav_rnn_dim,
+        affine_dim,
+        bits,
+        hop_length,
+    ):
+        super().__init__()
+
+        self.sample_rate = sample_rate
+        self.frames_per_slice = frames_per_slice
+        self.pad = (frames_per_sample - frames_per_slice) // 2
+        self.wav_rnn_dim = wav_rnn_dim
+        self.quant_dim = 2 ** bits
+        self.hop_len = hop_length
+
+        self.mel_rnn = nn.GRU(
+            mel_dim, mel_rnn_dim, num_layers=2, batch_first=True, bidirectional=True
+        )
+        self.embedding = nn.Embedding(self.quant_dim, emb_dim)
+        self.wav_rnn = nn.GRU(emb_dim + 2 * mel_rnn_dim, wav_rnn_dim, batch_first=True)
+        self.affine = nn.Sequential(
+            nn.Linear(wav_rnn_dim, affine_dim),
+            nn.ReLU(),
+            nn.Linear(affine_dim, self.quant_dim),
+        )
+
+    def forward(self, wavs, mels):
+        """Generate waveform from mel spectrogram with teacher-forcing."""
+        mel_embs, _ = self.mel_rnn(mels)
+        mel_embs = mel_embs.transpose(1, 2)
+        mel_embs = mel_embs[:, :, self.pad : self.pad + self.frames_per_slice]
+
+        conditions = F.interpolate(mel_embs, scale_factor=float(self.hop_len))
+        conditions = conditions.transpose(1, 2)
+
+        wav_embs = self.embedding(wavs)
+        wav_outs, _ = self.wav_rnn(torch.cat((wav_embs, conditions), dim=2))
+
+        return self.affine(wav_outs)
+
+    @torch.jit.export
+    def generate(self, mels):
+        """Generate waveform from mel spectrogram."""
+        mel_embs, _ = self.mel_rnn(mels)
+        mel_embs = mel_embs.transpose(1, 2)
+
+        conditions = F.interpolate(mel_embs, scale_factor=float(self.hop_len))
+        conditions = conditions.transpose(1, 2)
+
+        hid = torch.zeros(1, mels.size(0), self.wav_rnn_dim, device=mels.device)
+        wav = torch.full(
+            (mels.size(0),), self.quant_dim // 2, dtype=torch.long, device=mels.device,
+        )
+        wavs = torch.empty(
+            mels.size(0), mels.size(1) * self.hop_len, device=mels.device
+        )
+
+        for i, condition in enumerate(torch.unbind(conditions, dim=1)):
+            wav_emb = self.embedding(wav)
+            _, hid = self.wav_rnn(
+                torch.cat((wav_emb, condition), dim=1).unsqueeze(1), hid
+            )
+            logit = self.affine(hid.squeeze(0))
+            posterior = F.softmax(logit, dim=1)
+            wav = torch.multinomial(posterior, 1).squeeze(1)
+            wavs[:, i] = 2 * wav / (self.quant_dim - 1.0) - 1.0
+
+        mu = self.quant_dim - 1
+        wavs = torch.sign(wavs) / mu * ((1 + mu) ** torch.abs(wavs) - 1)
+
+        return wavs
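
Note on the new model file: generate upsamples the conditioning sequence by hop_length via F.interpolate, so it emits mel_len * hop_length samples per utterance, and the last two lines undo the mu-law companding, sign(y) / mu * ((1 + mu)^|y| - 1). A shape smoke test, using illustrative hyperparameters rather than this repo's training config:

import torch

from models import UniversalVocoder  # assumes the repo root is on sys.path

# All dimensions below are invented for the test.
model = UniversalVocoder(
    sample_rate=16000,
    frames_per_sample=40,
    frames_per_slice=8,
    mel_dim=80,
    mel_rnn_dim=128,
    emb_dim=256,
    wav_rnn_dim=512,
    affine_dim=512,
    bits=9,
    hop_length=200,
)
scripted = torch.jit.script(model)  # same call train.py now makes

mels = torch.randn(1, 10, 80)  # (batch, mel_len, mel_dim)
with torch.no_grad():
    wav = scripted.generate(mels)
print(wav.shape)  # torch.Size([1, 2000]) == (batch, mel_len * hop_length)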

models/vocoder.py

Lines changed: 0 additions & 142 deletions
This file was deleted.

reconstruct.py

Lines changed: 5 additions & 5 deletions

@@ -8,7 +8,6 @@
 from jsonargparse import ArgumentParser, ActionConfigFile

 from data import load_wav, log_mel_spectrogram
-from models import Vocoder


 def parse_args():
@@ -46,21 +45,22 @@ def main(

     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

-    model = Vocoder.load_checkpoint(ckpt_path)
+    model = torch.jit.load(ckpt_path)
     model.to(device)
-    model.eval()

     wav = load_wav(audio_path, sample_rate)
     mel = log_mel_spectrogram(
         wav, preemph, sample_rate, n_mels, n_fft, hop_len, win_len, f_min
     ).T

     mel = torch.FloatTensor(mel).to(device).transpose(0, 1).unsqueeze(0)
-    wav = model.generate(mel)
+
+    with torch.no_grad():
+        wav = model.generate(mel).squeeze().detach().cpu().numpy()

     npy_path_name = Path(audio_path).name
     wav_path = npy_path_name + ".rec.wav" if output_path is None else output_path
-    sf.write(wav_path, wav, model.sample_rate)
+    sf.write(wav_path, wav, sample_rate)


 if __name__ == "__main__":
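
Note: the last hunk also changes where the output sample rate comes from; the script now passes the sample_rate it already received as an argument instead of reading model.sample_rate off the loaded module. For reference, sf.write takes the data array and the sample rate explicitly (the tone below is invented purely for illustration):

import numpy as np
import soundfile as sf

sr = 16000  # illustrative rate
t = np.arange(sr) / sr
# Write one second of a 440 Hz sine at the given sample rate.
sf.write("tone.wav", np.sin(2 * np.pi * 440 * t).astype(np.float32), sr)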

train.py

Lines changed: 14 additions & 9 deletions

@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-"""Train reconstruction model."""
+"""Train universal vocoder."""

 from datetime import datetime
 from pathlib import Path
@@ -13,7 +13,8 @@
 from jsonargparse import ArgumentParser, ActionConfigFile

 from data import VocoderDataset
-from models import Vocoder
+
+from models import UniversalVocoder


 def parse_args():
@@ -90,17 +91,20 @@ def main(
         pin_memory=True,
     )

-    model = Vocoder(
+    model = UniversalVocoder(
         sample_rate=dataset.sample_rate,
-        mel_channels=dataset.n_mels,
-        conditioning_channels=conditioning_channels,
-        embedding_dim=embedding_dim,
-        rnn_channels=rnn_channels,
-        fc_channels=fc_channels,
+        frames_per_sample=frames_per_sample,
+        frames_per_slice=frames_per_slice,
+        mel_dim=dataset.n_mels,
+        mel_rnn_dim=conditioning_channels,
+        emb_dim=embedding_dim,
+        wav_rnn_dim=rnn_channels,
+        affine_dim=fc_channels,
         bits=bits,
         hop_length=dataset.hop_len,
     )
     model.to(device)
+    model = torch.jit.script(model)

     optimizer = Adam(model.parameters())

@@ -169,7 +173,8 @@ def main(
         save_dir_path = Path(save_dir)
         save_dir_path.mkdir(parents=True, exist_ok=True)
         checkpoint_path = save_dir_path / f"vocoder-ckpt-{step+1}.pt"
-        model.save_checkpoint(checkpoint_path)
+        torch.jit.save(model.cpu(), str(checkpoint_path))
+        model.to(device)


 if __name__ == "__main__":
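
Note: the training script now scripts the model once after construction, then at each checkpoint moves it to CPU, serializes it with torch.jit.save, and moves it back to the device, presumably so checkpoints load cleanly on CPU-only machines. A self-contained sketch of the round trip, with a toy module standing in for UniversalVocoder:

import torch
import torch.nn as nn

class Toy(nn.Module):
    def forward(self, x):
        return x * 2

model = torch.jit.script(Toy())        # compile the eagerly defined module
torch.jit.save(model.cpu(), "toy.pt")  # the archive bundles code and weights
restored = torch.jit.load("toy.pt")    # no Toy class needed in this process
print(restored(torch.ones(3)))         # tensor([2., 2., 2.])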
