Denoising Diffusion Implicit Models (DDIM) Sampling

This implements DDIM sampling from the paper Denoising Diffusion Implicit Models

16from typing import Optional, List 17 18import numpy as np 19import torch 20 21from labml import monit 22from labml_nn.diffusion.stable_diffusion.latent_diffusion import LatentDiffusion 23from labml_nn.diffusion.stable_diffusion.sampler import DiffusionSampler

DDIM Sampler

This extends the DiffusionSampler base class.

DDIM samples images by repeatedly removing noise by sampling step by step using,

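$$x_{\tau_{i-1}} = \sqrt{\alpha_{\tau_{i-1}}} \Bigg( \frac{x_{\tau_i} - \sqrt{1 - \alpha_{\tau_i}} \, \epsilon_\theta(x_{\tau_i})}{\sqrt{\alpha_{\tau_i}}} \Bigg) + \sqrt{1 - \alpha_{\tau_{i-1}} - \sigma_{\tau_i}^2} \cdot \epsilon_\theta(x_{\tau_i}) + \sigma_{\tau_i} \epsilon_{\tau_i}$$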

where $\epsilon_{\tau_i}$ is random noise, $\tau$ is a subsequence of $[1, 2, \dots, T]$ of length $S$, and $\sigma_{\tau_i} = \eta \sqrt{\frac{1 - \alpha_{\tau_{i-1}}}{1 - \alpha_{\tau_i}}} \sqrt{1 - \frac{\alpha_{\tau_i}}{\alpha_{\tau_{i-1}}}}$

Note that $\alpha_t$ in the DDIM paper refers to $\bar{\alpha}_t$ from DDPM.

class DDIMSampler(DiffusionSampler):
    r"""
    ## DDIM Sampler

    This extends the `DiffusionSampler` base class.

    DDIM samples images by repeatedly removing noise, step by step, using

    $$x_{\tau_{i-1}} = \sqrt{\alpha_{\tau_{i-1}}} \Bigg(
        \frac{x_{\tau_i} - \sqrt{1 - \alpha_{\tau_i}} \, \epsilon_\theta(x_{\tau_i})}{\sqrt{\alpha_{\tau_i}}}
      \Bigg)
      + \sqrt{1 - \alpha_{\tau_{i-1}} - \sigma_{\tau_i}^2} \cdot \epsilon_\theta(x_{\tau_i})
      + \sigma_{\tau_i} \epsilon_{\tau_i}$$

    where $\epsilon_{\tau_i}$ is random noise, $\tau$ is a subsequence of $[1, 2, \dots, T]$
    of length $S$, and

    $$\sigma_{\tau_i} = \eta \sqrt{\frac{1 - \alpha_{\tau_{i-1}}}{1 - \alpha_{\tau_i}}}
                             \sqrt{1 - \frac{\alpha_{\tau_i}}{\alpha_{\tau_{i-1}}}}$$

    Note that $\alpha_t$ in the DDIM paper refers to $\bar{\alpha}_t$ from DDPM.
    """

    model: LatentDiffusion

    def __init__(self, model: LatentDiffusion, n_steps: int, ddim_discretize: str = "uniform", ddim_eta: float = 0.):
        r"""
        :param model: is the model to predict noise $\epsilon_\text{cond}(x_t, c)$
        :param n_steps: is the number of DDIM sampling steps, $S$
        :param ddim_discretize: specifies how to extract $\tau$ from $[1, 2, \dots, T]$.
            It can be either `uniform` or `quad`.
        :param ddim_eta: is $\eta$ used to calculate $\sigma_{\tau_i}$. $\eta = 0$ makes the
            sampling process deterministic.

        :raises NotImplementedError: if `ddim_discretize` is neither `uniform` nor `quad`
        :raises ValueError: if `n_steps` is less than 1, or (for `uniform`) greater than $T$
        """
        super().__init__(model)
        # Number of steps, $T$
        self.n_steps = model.n_steps

        # Reject unknown schedules first, so the error type for a bad `ddim_discretize`
        # does not depend on `n_steps`
        if ddim_discretize not in ('uniform', 'quad'):
            raise NotImplementedError(ddim_discretize)
        # Fail early with a clear message instead of a `ZeroDivisionError` or an empty schedule
        if n_steps < 1:
            raise ValueError(f"n_steps must be at least 1, got {n_steps}")

        # Calculate $\tau$ to be uniformly distributed across $[1, 2, \dots, T]$
        if ddim_discretize == 'uniform':
            # The stride below would be zero if more sampling steps than $T$ were requested
            if n_steps > self.n_steps:
                raise ValueError(f"n_steps ({n_steps}) cannot exceed the model's number of steps ({self.n_steps})")
            c = self.n_steps // n_steps
            # NOTE(review): the stride is floored, so this can yield more than `n_steps` entries
            # when $T$ is not a multiple of `n_steps`; and when `c == 1` the largest value is $T$.
            # Confirm `alpha_bar` can be indexed at that position.
            self.time_steps = np.arange(0, self.n_steps, c) + 1
        # Calculate $\tau$ to be quadratically distributed across $[1, 2, \dots, T]$
        else:
            self.time_steps = ((np.linspace(0, np.sqrt(self.n_steps * .8), n_steps)) ** 2).astype(int) + 1

        with torch.no_grad():
            # Get $\bar{\alpha}_t$
            alpha_bar = self.model.alpha_bar

            # $\alpha_{\tau_i}$
            self.ddim_alpha = alpha_bar[self.time_steps].clone().to(torch.float32)
            # $\sqrt{\alpha_{\tau_i}}$
            self.ddim_alpha_sqrt = torch.sqrt(self.ddim_alpha)
            # $\alpha_{\tau_{i-1}}$; the first entry uses `alpha_bar[0]` as the "previous" value.
            # Cast to float32 so it has the same dtype as `ddim_alpha`.
            self.ddim_alpha_prev = torch.cat([alpha_bar[0:1], alpha_bar[self.time_steps[:-1]]]).to(torch.float32)

            # $$\sigma_{\tau_i} = \eta \sqrt{\frac{1 - \alpha_{\tau_{i-1}}}{1 - \alpha_{\tau_i}}}
            #                          \sqrt{1 - \frac{\alpha_{\tau_i}}{\alpha_{\tau_{i-1}}}}$$
            self.ddim_sigma = (ddim_eta *
                               ((1 - self.ddim_alpha_prev) / (1 - self.ddim_alpha) *
                                (1 - self.ddim_alpha / self.ddim_alpha_prev)) ** .5)

            # $\sqrt{1 - \alpha_{\tau_i}}$
            self.ddim_sqrt_one_minus_alpha = (1. - self.ddim_alpha) ** .5

    @torch.no_grad()
    def sample(self,
               shape: List[int],
               cond: torch.Tensor,
               repeat_noise: bool = False,
               temperature: float = 1.,
               x_last: Optional[torch.Tensor] = None,
               uncond_scale: float = 1.,
               uncond_cond: Optional[torch.Tensor] = None,
               skip_steps: int = 0,
               ):
        r"""
        ### Sampling Loop

        :param shape: is the shape of the generated images in the
            form `[batch_size, channels, height, width]`
        :param cond: is the conditional embeddings $c$
        :param repeat_noise: specifies whether the noise should be the same for all samples in the batch
        :param temperature: is the noise temperature (random noise gets multiplied by this)
        :param x_last: is $x_{\tau_S}$. If not provided random noise will be used.
        :param uncond_scale: is the unconditional guidance scale $s$. This is used for
            $\epsilon_\theta(x_t, c) = s\epsilon_\text{cond}(x_t, c) + (s - 1)\epsilon_\text{cond}(x_t, c_u)$
        :param uncond_cond: is the conditional embedding for empty prompt $c_u$
        :param skip_steps: is the number of time steps to skip $i'$. We start sampling from $S - i'$.
            And `x_last` is then $x_{\tau_{S - i'}}$.

        :return: the final sample $x_0$
        """

        # Get device and batch size
        device = self.model.device
        bs = shape[0]

        # Get $x_{\tau_S}$
        x = x_last if x_last is not None else torch.randn(shape, device=device)

        # Time steps to sample at $\tau_{S - i'}, \tau_{S - i' - 1}, \dots, \tau_1$
        time_steps = np.flip(self.time_steps)[skip_steps:]

        for i, step in monit.enum('Sample', time_steps):
            # Index $i$ in the list $[\tau_1, \tau_2, \dots, \tau_S]$
            index = len(time_steps) - i - 1
            # Time step $\tau_i$
            ts = x.new_full((bs,), step, dtype=torch.long)

            # Sample $x_{\tau_{i-1}}$; the predicted $x_0$ and the noise estimate are not needed here
            x, _, _ = self.p_sample(x, cond, ts, step, index=index,
                                    repeat_noise=repeat_noise,
                                    temperature=temperature,
                                    uncond_scale=uncond_scale,
                                    uncond_cond=uncond_cond)

        # Return $x_0$
        return x

    @torch.no_grad()
    def p_sample(self, x: torch.Tensor, c: torch.Tensor, t: torch.Tensor, step: int, index: int, *,
                 repeat_noise: bool = False,
                 temperature: float = 1.,
                 uncond_scale: float = 1.,
                 uncond_cond: Optional[torch.Tensor] = None):
        r"""
        ### Sample $x_{\tau_{i-1}}$

        :param x: is $x_{\tau_i}$ of shape `[batch_size, channels, height, width]`
        :param c: is the conditional embeddings $c$ of shape `[batch_size, emb_size]`
        :param t: is $\tau_i$ of shape `[batch_size]`
        :param step: is the step $\tau_i$ as an integer
        :param index: is index $i$ in the list $[\tau_1, \tau_2, \dots, \tau_S]$
        :param repeat_noise: specifies whether the noise should be the same for all samples in the batch
        :param temperature: is the noise temperature (random noise gets multiplied by this)
        :param uncond_scale: is the unconditional guidance scale $s$. This is used for
            $\epsilon_\theta(x_t, c) = s\epsilon_\text{cond}(x_t, c) + (s - 1)\epsilon_\text{cond}(x_t, c_u)$
        :param uncond_cond: is the conditional embedding for empty prompt $c_u$

        :return: a tuple `(x_prev, pred_x0, e_t)` of $x_{\tau_{i-1}}$, the predicted $x_0$ and
            the noise estimate $\epsilon_\theta(x_{\tau_i})$

        NOTE(review): the guidance formula above is reproduced from the original documentation;
        it is implemented by `get_eps`, which is inherited and not defined in this class.
        Confirm the sign of the unconditional term against that implementation.
        """

        # Get $\epsilon_\theta(x_{\tau_i})$
        e_t = self.get_eps(x, t, c,
                           uncond_scale=uncond_scale,
                           uncond_cond=uncond_cond)

        # Calculate $x_{\tau_{i-1}}$ and predicted $x_0$
        x_prev, pred_x0 = self.get_x_prev_and_pred_x0(e_t, index, x,
                                                      temperature=temperature,
                                                      repeat_noise=repeat_noise)

        return x_prev, pred_x0, e_t

    def get_x_prev_and_pred_x0(self, e_t: torch.Tensor, index: int, x: torch.Tensor, *,
                               temperature: float,
                               repeat_noise: bool):
        r"""
        ### Sample $x_{\tau_{i-1}}$ given $\epsilon_\theta(x_{\tau_i})$

        :param e_t: is the noise estimate $\epsilon_\theta(x_{\tau_i})$
        :param index: is index $i$ in the list $[\tau_1, \tau_2, \dots, \tau_S]$
        :param x: is $x_{\tau_i}$
        :param temperature: is the noise temperature (random noise gets multiplied by this)
        :param repeat_noise: specifies whether the noise should be the same for all samples in the batch

        :return: a tuple `(x_prev, pred_x0)` of $x_{\tau_{i-1}}$ and the predicted $x_0$
        """

        # $\alpha_{\tau_i}$
        alpha = self.ddim_alpha[index]
        # $\alpha_{\tau_{i-1}}$
        alpha_prev = self.ddim_alpha_prev[index]
        # $\sigma_{\tau_i}$
        sigma = self.ddim_sigma[index]
        # $\sqrt{1 - \alpha_{\tau_i}}$
        sqrt_one_minus_alpha = self.ddim_sqrt_one_minus_alpha[index]

        # Current prediction for $x_0$,
        # $$\frac{x_{\tau_i} - \sqrt{1 - \alpha_{\tau_i}} \, \epsilon_\theta(x_{\tau_i})}{\sqrt{\alpha_{\tau_i}}}$$
        pred_x0 = (x - sqrt_one_minus_alpha * e_t) / (alpha ** 0.5)
        # Direction pointing to $x_t$,
        # $$\sqrt{1 - \alpha_{\tau_{i-1}} - \sigma_{\tau_i}^2} \cdot \epsilon_\theta(x_{\tau_i})$$
        dir_xt = (1. - alpha_prev - sigma ** 2).sqrt() * e_t

        # No noise is added, when $\eta = 0$
        if sigma == 0.:
            noise = 0.
        # If same noise is used for all samples in the batch;
        # the leading dimension of 1 broadcasts across the batch
        elif repeat_noise:
            noise = torch.randn((1, *x.shape[1:]), device=x.device)
        # Different noise for each sample
        else:
            noise = torch.randn(x.shape, device=x.device)

        # Multiply noise by the temperature
        noise = noise * temperature

        # $$x_{\tau_{i-1}} = \sqrt{\alpha_{\tau_{i-1}}} \Bigg(
        #     \frac{x_{\tau_i} - \sqrt{1 - \alpha_{\tau_i}} \, \epsilon_\theta(x_{\tau_i})}{\sqrt{\alpha_{\tau_i}}}
        #   \Bigg)
        #   + \sqrt{1 - \alpha_{\tau_{i-1}} - \sigma_{\tau_i}^2} \cdot \epsilon_\theta(x_{\tau_i})
        #   + \sigma_{\tau_i} \epsilon_{\tau_i}$$
        x_prev = (alpha_prev ** 0.5) * pred_x0 + dir_xt + sigma * noise

        return x_prev, pred_x0

    @torch.no_grad()
    def q_sample(self, x0: torch.Tensor, index: int, noise: Optional[torch.Tensor] = None):
        r"""
        ### Sample from $q_{\sigma,\tau}(x_{\tau_i} \mid x_0)$

        $$q_{\sigma,\tau}(x_t \mid x_0) =
            \mathcal{N}\big(x_t; \sqrt{\alpha_{\tau_i}} x_0, (1 - \alpha_{\tau_i}) \mathbf{I}\big)$$

        :param x0: is $x_0$ of shape `[batch_size, channels, height, width]`
        :param index: is the time step $\tau_i$ index $i$
        :param noise: is the noise, $\epsilon$

        :return: the noised sample $x_{\tau_i}$
        """

        # Random noise, if noise is not specified
        if noise is None:
            noise = torch.randn_like(x0)

        # Sample from
        # $$q_{\sigma,\tau}(x_t \mid x_0) =
        #     \mathcal{N}\big(x_t; \sqrt{\alpha_{\tau_i}} x_0, (1 - \alpha_{\tau_i}) \mathbf{I}\big)$$
        return self.ddim_alpha_sqrt[index] * x0 + self.ddim_sqrt_one_minus_alpha[index] * noise

    @torch.no_grad()
    def paint(self, x: torch.Tensor, cond: torch.Tensor, t_start: int, *,
              orig: Optional[torch.Tensor] = None,
              mask: Optional[torch.Tensor] = None, orig_noise: Optional[torch.Tensor] = None,
              uncond_scale: float = 1.,
              uncond_cond: Optional[torch.Tensor] = None,
              ):
        r"""
        ### Painting Loop

        :param x: is $x_{S'}$ of shape `[batch_size, channels, height, width]`
        :param cond: is the conditional embeddings $c$
        :param t_start: is the sampling step to start from, $S'$
        :param orig: is the original image in latent space which we are in-painting.
            If this is not provided, it'll be an image to image transformation.
        :param mask: is the mask to keep the original image. Required when `orig` is given.
        :param orig_noise: is fixed noise to be added to the original image.
        :param uncond_scale: is the unconditional guidance scale $s$. This is used for
            $\epsilon_\theta(x_t, c) = s\epsilon_\text{cond}(x_t, c) + (s - 1)\epsilon_\text{cond}(x_t, c_u)$
        :param uncond_cond: is the conditional embedding for empty prompt $c_u$

        :return: the final sample
        :raises ValueError: if `orig` is provided without `mask`
        """
        # Fail before any model evaluation rather than with a `TypeError`
        # when blending after the first step
        if orig is not None and mask is None:
            raise ValueError("`mask` is required when `orig` is provided")

        # Get batch size
        bs = x.shape[0]

        # Time steps to sample at $\tau_{S'}, \tau_{S' - 1}, \dots, \tau_1$
        time_steps = np.flip(self.time_steps[:t_start])

        for i, step in monit.enum('Paint', time_steps):
            # Index $i$ in the list $[\tau_1, \tau_2, \dots, \tau_S]$
            index = len(time_steps) - i - 1
            # Time step $\tau_i$
            ts = x.new_full((bs,), step, dtype=torch.long)

            # Sample $x_{\tau_{i-1}}$
            x, _, _ = self.p_sample(x, cond, ts, step, index=index,
                                    uncond_scale=uncond_scale,
                                    uncond_cond=uncond_cond)

            # Replace the masked area with original image
            if orig is not None:
                # Get the $q_{\sigma,\tau}(x_{\tau_i} \mid x_0)$ for original image in latent space
                orig_t = self.q_sample(orig, index, noise=orig_noise)
                # Replace the masked area: where `mask` is 1 the (noised) original is kept,
                # where it is 0 the newly sampled value is kept
                x = orig_t * mask + x * (1 - mask)

        return x

Denoising Diffusion Implicit Models (DDIM) Sampling

DDIM Sampler

Sampling Loop

Sample xτi−1​​

Sample xτi−1​​ given ϵθ​(xτi​​)

Sample from qσ,τ​(xτi​​∣x0​)

Painting Loop

Sample $x_{τ_{i - 1}}$

Sample $x_{τ_{i - 1}}$ given $ϵ_{θ} (x_{τ_{i}})$

Sample from $q_{σ, τ} (x_{τ_{i}} ∣ x_{0})$