import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from tqdm import tqdm


class Vocoder(nn.Module):
    """Universal vocoder: an autoregressive WaveRNN-style model that
    predicts mu-law audio samples conditioned on mel spectrograms."""

    def __init__(
        self,
        sample_rate,
        mel_channels,
        conditioning_channels,
        embedding_dim,
        rnn_channels,
        fc_channels,
        bits,
        hop_length,
    ):
        super().__init__()

        self.init_params = {
            "sample_rate": sample_rate,
            "mel_channels": mel_channels,
            "conditioning_channels": conditioning_channels,
            "embedding_dim": embedding_dim,
            "rnn_channels": rnn_channels,
            "fc_channels": fc_channels,
            "bits": bits,
            "hop_length": hop_length,
        }

        self.rnn_channels = rnn_channels
        self.quantization_channels = 2 ** bits
        self.hop_length = hop_length
        self.sample_rate = sample_rate

        # Bidirectional conditioning network over mel frames.
        self.rnn1 = nn.GRU(
            mel_channels,
            conditioning_channels,
            num_layers=2,
            batch_first=True,
            bidirectional=True,
        )
        # Embedding of the previous (mu-law quantized) sample.
        self.embedding = nn.Embedding(self.quantization_channels, embedding_dim)
        # Autoregressive sample-rate network.
        self.rnn2 = nn.GRU(
            embedding_dim + 2 * conditioning_channels, rnn_channels, batch_first=True
        )
        self.fc1 = nn.Linear(rnn_channels, fc_channels)
        self.fc2 = nn.Linear(fc_channels, self.quantization_channels)

    def forward(self, x, mels):
        # x: (batch, samples) of mu-law class indices;
        # mels: (batch, frames, mel_channels).
        sample_frames = mels.size(1)
        audio_slice_frames = x.size(1) // self.hop_length
        pad = (sample_frames - audio_slice_frames) // 2

        # Condition on the full mel window, then keep the center slice
        # aligned with the audio samples.
        mels, _ = self.rnn1(mels)
        mels = mels[:, pad : pad + audio_slice_frames, :]

        # Upsample the conditioning to one vector per audio sample.
        mels = F.interpolate(mels.transpose(1, 2), scale_factor=float(self.hop_length))
        mels = mels.transpose(1, 2)

        x = self.embedding(x)

        x, _ = self.rnn2(torch.cat((x, mels), dim=2))

        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

    @classmethod
    def load_checkpoint(cls, checkpoint_path):
        ckpt = torch.load(checkpoint_path, map_location="cpu")
        model = cls(**ckpt["init_params"])
        model.load_state_dict(ckpt["model"])
        return model

    def save_checkpoint(self, checkpoint_path):
        torch.save(
            {"init_params": self.init_params, "model": self.state_dict()},
            checkpoint_path,
        )

    def generate(self, mel):
        """Generate a waveform from a mel spectrogram, sampling one
        mu-law class per step and expanding the result to audio."""
        output = []
        cell = get_gru_cell(self.rnn2)

        with torch.no_grad():
            mel, _ = self.rnn1(mel)

            mel = F.interpolate(
                mel.transpose(1, 2), scale_factor=float(self.hop_length)
            )
            mel = mel.transpose(1, 2)

            batch_size, _, _ = mel.size()

            # Initial state: zero hidden state, mid-scale class (mu-law zero).
            h = torch.zeros(batch_size, self.rnn_channels, device=mel.device)
            x = (
                torch.zeros(batch_size, device=mel.device)
                .fill_(self.quantization_channels // 2)
                .long()
            )

            for m in tqdm(torch.unbind(mel, dim=1), leave=False):
                x = self.embedding(x)
                h = cell(torch.cat((x, m), dim=1), h)

                x = F.relu(self.fc1(h))
                logits = self.fc2(x)

                posterior = F.softmax(logits, dim=1)
                dist = torch.distributions.Categorical(posterior)

                x = dist.sample()
                # Note: .item() assumes batch_size == 1.
                output.append(
                    2 * x.float().item() / (self.quantization_channels - 1.0) - 1.0
                )

        output = np.asarray(output, dtype=np.float64)
        output = mulaw_decode(output, self.quantization_channels)

        return output


def get_gru_cell(gru):
    """Build a GRUCell sharing the (first-layer) weights of `gru`,
    for stepwise autoregressive inference."""
    gru_cell = nn.GRUCell(gru.input_size, gru.hidden_size)
    gru_cell.weight_hh.data = gru.weight_hh_l0.data
    gru_cell.weight_ih.data = gru.weight_ih_l0.data
    gru_cell.bias_hh.data = gru.bias_hh_l0.data
    gru_cell.bias_ih.data = gru.bias_ih_l0.data
    return gru_cell

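# A quick sanity sketch (illustrative, not part of the model): the extracted
# cell should reproduce a single step of a 1-layer, unidirectional,
# batch_first GRU such as `rnn2` above:
#
#     gru = nn.GRU(8, 16, batch_first=True)
#     cell = get_gru_cell(gru)
#     x = torch.randn(4, 1, 8)                # (batch, time=1, features)
#     out, _ = gru(x)
#     h = cell(x[:, 0], torch.zeros(4, 16))   # one manual step
#     assert torch.allclose(out[:, 0], h, atol=1e-6)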

def mulaw_decode(x_mu: np.ndarray, n_channels: int) -> np.ndarray:
    """Expand a mu-law companded signal in [-1, 1] back to a linear waveform."""
    mu = n_channels - 1
    x = np.sign(x_mu) / mu * ((1 + mu) ** np.abs(x_mu) - 1)
    return x
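
def mulaw_encode(x: np.ndarray, n_channels: int) -> np.ndarray:
    """Compress a linear signal in [-1, 1] with mu-law companding.

    A minimal sketch of the inverse companding step, included for reference;
    it is an assumption about the preprocessing, not code from this file.
    Training targets would be obtained by companding and then quantizing
    to `n_channels` levels.
    """
    mu = n_channels - 1
    return np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)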
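
if __name__ == "__main__":
    # Minimal usage sketch. The hyperparameters below are illustrative
    # assumptions, not values prescribed by this file.
    vocoder = Vocoder(
        sample_rate=16000,
        mel_channels=80,
        conditioning_channels=128,
        embedding_dim=256,
        rnn_channels=896,
        fc_channels=1024,
        bits=10,
        hop_length=200,
    )

    # Training-style forward pass: logits over the next mu-law class for
    # each sample in the center slice of the mel window.
    mel = torch.randn(1, 40, 80)               # (batch=1, frames, mel_channels)
    x = torch.randint(0, 1024, (1, 4 * 200))   # (batch, samples), divisible by hop_length
    logits = vocoder(x, mel)
    print(logits.shape)                        # torch.Size([1, 800, 1024])

    # Autoregressive generation: one sample per upsampled conditioning frame.
    wav = vocoder.generate(mel)                # waveform with 40 * hop_length samples
    print(wav.shape)                           # (8000,)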