DDPM代码详细解读(2):Unet结构、正向和逆向过程、IS和FID测试、EMA优化

2023-11-13

字数统计: 3.6k字 | 阅读时长≈ 20分

DDPM代码详细解读(2):Unet结构、正向和逆向过程、IS和FID测试、EMA优化

EMA优化

使用指数移动平均对模型参数进行优化，提高测试指标增加模型鲁棒性。

EMA的公式是： \[ v_t=\beta \cdot v_{t-1}+\left( 1-\beta \right) \cdot \theta _t \] 其中 \(v_t\) 是EMA后的参数，\(\theta _t\) 是第 t 步的模型参数，β 是衰减率。它与常见写法 EMA[t] = α * x[t] + (1 - α) * EMA[t-1] 等价，只需取 α = 1 - β。这里 β 的设置为0.999，代码如下：

def ema(source, target, decay):
    """Fold ``source``'s state into ``target`` as an exponential moving average.

    Each entry of ``target.state_dict()`` is overwritten in place with
    ``decay * target + (1 - decay) * source``.

    Args:
        source: module supplying the fresh values; only read.
        target: module holding the running average; updated in place.
        decay: weight given to the previous average.
    """
    fresh_state = source.state_dict()
    running_state = target.state_dict()
    for name, fresh in fresh_state.items():
        running = running_state[name].data
        # Build the blended tensor first, then write it back into the
        # storage that the target module owns.
        blended = running * decay + fresh.data * (1 - decay)
        running.copy_(blended)

在训练的过程中，每一个 step 对 net_model 和 ema_model (即sample model)做ema：

ema(net_model, ema_model, FLAGS.ema_decay)

训练目标和采样目标

正向过程

正向过程即q过程（扩散、加噪过程），逆向过程即p过程（去噪、采样过程）。

正向过程不涉及参数分布的计算和预测，可以理解为一个单纯add noise的过程。

训练（Algorithm 1）和采样（Algorithm 2）的流程如下：

上一篇博客详细解释了\(x_t\)和\(\epsilon _{\theta}\)是怎么计算的，正向过程的code就非常容易理解了：

class GaussianDiffusionTrainer(nn.Module):
    """Training wrapper: noise ``x_0`` to a random step and score the model.

    Args:
        model: callable invoked as ``model(x_t, t)``; its output is compared
            against the sampled noise.
        beta_1: first value of the linear beta schedule.
        beta_T: last value of the linear beta schedule.
        T: number of diffusion steps.
    """

    def __init__(self, model, beta_1, beta_T, T):
        super().__init__()
        self.model = model
        self.T = T

        # Linear beta schedule, stored in float64.
        schedule = torch.linspace(beta_1, beta_T, T).double()
        self.register_buffer('betas', schedule)

        # alpha-bar_t: running product of (1 - beta) up to each step.
        alphas_bar = torch.cumprod(1. - self.betas, dim=0)

        # Coefficients of q(x_t | x_0): signal scale and noise scale.
        signal_scale = torch.sqrt(alphas_bar)
        noise_scale = torch.sqrt(1. - alphas_bar)
        self.register_buffer('sqrt_alphas_bar', signal_scale)
        self.register_buffer('sqrt_one_minus_alphas_bar', noise_scale)

    def forward(self, x_0):
        """Algorithm 1: return the unreduced MSE between predicted and true noise.

        A timestep is drawn uniformly from ``[0, T)`` for every batch element,
        ``x_0`` is noised to that step, and the model's output is compared to
        the noise that was added. The loss keeps the shape of ``x_0``
        (``reduction='none'``).
        """
        batch = x_0.shape[0]
        t = torch.randint(self.T, size=(batch, ), device=x_0.device)
        noise = torch.randn_like(x_0)

        # `extract` is defined elsewhere in the project; presumably it picks
        # the coefficient at index t per sample and shapes it to broadcast
        # against x_0 -- TODO confirm.
        keep = extract(self.sqrt_alphas_bar, t, x_0.shape)
        spread = extract(self.sqrt_one_minus_alphas_bar, t, x_0.shape)
        x_t = keep * x_0 + spread * noise

        predicted = self.model(x_t, t)
        return F.mse_loss(predicted, noise, reduction='none')

逆向过程

\(x_t\)的分布符合高斯分布，这是通过均值和方差进行计算的： \[ q\left( x_t|x_0 \right) =N\left( x_t;\sqrt{\bar{\alpha}_t}x_0,\left( 1-\bar{\alpha}_t \right) I \right) \] 计算\(\sigma _tZ\)使用：

torch.exp(0.5 * log_var) * noise

而其他的参数都已经计算过了，所以重点是计算第一项的均值：

输入\(x_t\)，得到\(x_{t-1}\)，最终的代码如下：

class GaussianDiffusionSampler(nn.Module):
    """Reverse (sampling) process: iteratively denoise ``x_T`` down to ``x_0``.

    Args:
        model: callable invoked as ``model(x_t, t)``.
        beta_1: first value of the linear beta schedule.
        beta_T: last value of the linear beta schedule.
        T: number of diffusion steps.
        img_size: stored on the instance; not read by the code in this class.
        mean_type: what the model predicts -- ``'xprev'`` (x_{t-1}),
            ``'xstart'`` (x_0) or ``'epsilon'`` (the noise). ``'eps'`` is
            accepted as shorthand for ``'epsilon'``.
        var_type: ``'fixedlarge'`` or ``'fixedsmall'``; selects the fixed
            log-variance table used when sampling.

    Raises:
        AssertionError: if ``mean_type`` or ``var_type`` is not one of the
            values listed above.
    """

    def __init__(self, model, beta_1, beta_T, T, img_size=32,
                 mean_type='eps', var_type='fixedlarge'):
        # The signature default is 'eps'; map it to the canonical name so the
        # default is valid and p_mean_variance only deals with three spellings.
        if mean_type == 'eps':
            mean_type = 'epsilon'
        # Each option must be its own list element (adjacent string literals
        # without a comma are concatenated by Python).
        assert mean_type in ['xprev', 'xstart', 'epsilon']
        assert var_type in ['fixedlarge', 'fixedsmall']
        super().__init__()

        self.model = model
        self.T = T
        self.img_size = img_size
        self.mean_type = mean_type
        self.var_type = var_type

        self.register_buffer(
            'betas', torch.linspace(beta_1, beta_T, T).double())
        alphas = 1. - self.betas
        alphas_bar = torch.cumprod(alphas, dim=0)
        # alpha-bar shifted right by one step, with 1 prepended for t = 0.
        alphas_bar_prev = F.pad(alphas_bar, [1, 0], value=1)[:T]

        # calculations for diffusion q(x_t | x_{t-1}) and others
        self.register_buffer(
            'sqrt_recip_alphas_bar', torch.sqrt(1. / alphas_bar))
        self.register_buffer(
            'sqrt_recipm1_alphas_bar', torch.sqrt(1. / alphas_bar - 1))

        # calculations for posterior q(x_{t-1} | x_t, x_0)
        self.register_buffer(
            'posterior_var',
            self.betas * (1. - alphas_bar_prev) / (1. - alphas_bar))
        # below: log calculation clipped because the posterior variance is 0 at
        # the beginning of the diffusion chain
        self.register_buffer(
            'posterior_log_var_clipped',
            torch.log(
                torch.cat([self.posterior_var[1:2], self.posterior_var[1:]])))
        self.register_buffer(
            'posterior_mean_coef1',
            torch.sqrt(alphas_bar_prev) * self.betas / (1. - alphas_bar))
        self.register_buffer(
            'posterior_mean_coef2',
            torch.sqrt(alphas) * (1. - alphas_bar_prev) / (1. - alphas_bar))

    def q_mean_variance(self, x_0, x_t, t):
        """
        Compute the mean and variance of the diffusion posterior
        q(x_{t-1} | x_t, x_0)

        Returns the posterior mean and the clipped posterior log-variance.
        """
        assert x_0.shape == x_t.shape
        posterior_mean = (
            extract(self.posterior_mean_coef1, t, x_t.shape) * x_0 +
            extract(self.posterior_mean_coef2, t, x_t.shape) * x_t
        )
        posterior_log_var_clipped = extract(
            self.posterior_log_var_clipped, t, x_t.shape)
        return posterior_mean, posterior_log_var_clipped

    def predict_xstart_from_eps(self, x_t, t, eps):
        """Recover x_0 from x_t and a noise estimate ``eps``."""
        assert x_t.shape == eps.shape
        return (
            extract(self.sqrt_recip_alphas_bar, t, x_t.shape) * x_t -
            extract(self.sqrt_recipm1_alphas_bar, t, x_t.shape) * eps
        )

    def predict_xstart_from_xprev(self, x_t, t, xprev):
        """Recover x_0 from x_t and an estimate of x_{t-1}."""
        assert x_t.shape == xprev.shape
        return (  # (xprev - coef2*x_t) / coef1
            extract(
                1. / self.posterior_mean_coef1, t, x_t.shape) * xprev -
            extract(
                self.posterior_mean_coef2 / self.posterior_mean_coef1, t,
                x_t.shape) * x_t
        )

    def p_mean_variance(self, x_t, t):
        """Return the model mean and log-variance of p(x_{t-1} | x_t)."""
        # below: only log_variance is used in the KL computations
        model_log_var = {
            # for fixedlarge, we set the initial (log-)variance like so to
            # get a better decoder log likelihood
            'fixedlarge': torch.log(torch.cat([self.posterior_var[1:2],
                                               self.betas[1:]])),
            'fixedsmall': self.posterior_log_var_clipped,
        }[self.var_type]
        model_log_var = extract(model_log_var, t, x_t.shape)

        # Mean parameterization
        if self.mean_type == 'xprev':       # the model predicts x_{t-1}
            x_prev = self.model(x_t, t)
            x_0 = self.predict_xstart_from_xprev(x_t, t, xprev=x_prev)
            model_mean = x_prev
        elif self.mean_type == 'xstart':    # the model predicts x_0
            x_0 = self.model(x_t, t)
            model_mean, _ = self.q_mean_variance(x_0, x_t, t)
        elif self.mean_type == 'epsilon':   # the model predicts epsilon
            eps = self.model(x_t, t)
            x_0 = self.predict_xstart_from_eps(x_t, t, eps=eps)
            model_mean, _ = self.q_mean_variance(x_0, x_t, t)
        else:
            raise NotImplementedError(self.mean_type)
        # NOTE(review): the clipped x_0 is never used -- model_mean was
        # already computed from the unclipped value. Left as is so sampling
        # results do not change; confirm whether the clip was meant to be
        # applied before computing the mean.
        x_0 = torch.clip(x_0, -1., 1.)

        return model_mean, model_log_var

    def forward(self, x_T):
        """
        Algorithm 2.

        Walks t from T-1 down to 0, sampling x_{t-1} from the model mean and
        fixed variance, and returns the final sample clipped to [-1, 1].
        """
        x_t = x_T
        for time_step in reversed(range(self.T)):
            # One identical timestep index per batch element.
            t = x_t.new_ones([x_T.shape[0], ], dtype=torch.long) * time_step
            mean, log_var = self.p_mean_variance(x_t=x_t, t=t)
            # no noise when t == 0
            if time_step > 0:
                noise = torch.randn_like(x_t)
            else:
                noise = 0
            # exp(0.5 * log_var) is the standard deviation.
            x_t = mean + torch.exp(0.5 * log_var) * noise
        x_0 = x_t
        return torch.clip(x_0, -1, 1)

因为我们是从预测的概率分布中采样，采样结果可能超出图像的取值范围，所以最终用 torch.clip 将所有的值截断到[-1,1]这个区间中。

IS和FID测试

IS简介

IS基于Google的预训练网络Inception Net-V3，Inception Net-V3是精心设计的卷积网络模型，输入为图片张量，输出为1000维向量。输出向量的每个维度的值对应图片属于某类的概率，因此整个向量可以看做一个概率分布。

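p(y|x) 表示将生成图像 x 输入 Inception 后得到的类别分布，p(y_i|x) 表示 Inception 预测 x 为第 i 类的概率，\(P_G\) 表示生成器 G 生成图像 x 的分布，p(y) 表示对所有生成图像取平均得到的边缘类别分布，即 \(p\left( y \right) =E_{x\sim P_G}\,p\left( y|x \right)\)。IS 衡量 p(y|x) 与 p(y) 两者之间的 KL散度： \[ IS=\exp \left( E_{x\sim P_G}KL\left( p\left( y|x \right) ||p\left( y \right) \right) \right) \] IS越大，生成图片的质量越高。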

FID简介

FID衡量真实图像分布和生成器生成图像分布之间的差异，因此FID越小，代表生成图像与真实图像的分布越接近，生成质量也就越高。

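计算公式： \[ FID=\left\| m-m_w \right\| _{2}^{2}+Tr\left( C+C_w-2\left( CC_w \right) ^{1/2} \right) \] （整个公式表示两个点（m, C）和（m_w, C_w）之间的距离的度量，其中 m、C 和 m_w、C_w 分别是两组特征向量的均值和协方差矩阵。这个距离包括它们在特征空间的欧几里得距离以及它们在协方差矩阵空间的距离。）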

使用方法：

使用 Inception V3 预训练模型提取真实图像和假图像（由生成器生成）的特征向量，分别计算两组特征向量的均值 \(m,m_w\)。
分别计算两组特征向量的协方差矩阵\(C,C_w\)
计算矩阵的迹 \[ Tr\left( C+C_w-2\left( CC_w \right) ^{1/2} \right) \] 计算矩阵的迹可以参考博客：https://blog.csdn.net/lyxleft/article/details/84865805
计算两个均值向量之差的平方和 \(\left\| m-m_w \right\| _{2}^{2}\)，并与上面的迹相加得到FID

IS代码

import torch
from torch import nn
from torch.autograd import Variable
from torch.nn import functional as F
import torch.utils.data
from PIL import Image
 
from torchvision.models.inception import inception_v3
import os
import sys
import numpy as np
from scipy.stats import entropy

def inception_score(imgs, cuda=True, batch_size=32, resize=False, splits=1):
    """Computes the inception score of the generated images imgs

    imgs -- Torch dataset of (3xHxW) numpy images normalized in the range [-1, 1]
    cuda -- whether or not to run on GPU
    batch_size -- batch size for feeding into Inception v3
    resize -- if True, bilinearly resize every batch to 299x299 first
    splits -- number of splits

    Returns the mean and standard deviation of the per-split scores.
    Raises AssertionError unless 0 < batch_size < len(imgs).
    """
    N = len(imgs)

    assert batch_size > 0
    assert N > batch_size

    # Set up dtype
    if cuda:
        dtype = torch.cuda.FloatTensor
    else:
        if torch.cuda.is_available():
            print("WARNING: You have a CUDA device, so you should probably set cuda=True")
        dtype = torch.FloatTensor

    # Set up dataloader
    dataloader = torch.utils.data.DataLoader(imgs, batch_size=batch_size)

    # Load inception model
    inception_model = inception_v3(pretrained=True, transform_input=False).type(dtype)
    inception_model.eval()
    up = nn.Upsample(size=(299, 299), mode='bilinear').type(dtype)

    def get_pred(x):
        # Inference only: no_grad avoids building an autograd graph for
        # every batch.
        with torch.no_grad():
            if resize:
                x = up(x)
            logits = inception_model(x)
            # Normalise over the class axis explicitly; calling softmax
            # without `dim` is deprecated.
            return F.softmax(logits, dim=1).cpu().numpy()

    # Get predictions
    preds = np.zeros((N, 1000))

    for i, batch in enumerate(dataloader, 0):
        batch = batch.type(dtype)
        # The last batch may be shorter than batch_size.
        batch_size_i = batch.size()[0]

        preds[i*batch_size:i*batch_size + batch_size_i] = get_pred(batch)

    # Now compute the mean kl-div
    split_scores = []

    for k in range(splits):
        # Rows beyond splits * (N // splits) are left out of every split.
        part = preds[k * (N // splits): (k+1) * (N // splits), :]
        py = np.mean(part, axis=0)
        # entropy(pyx, py) is the KL divergence KL(p(y|x) || p(y)).
        scores = [entropy(pyx, py) for pyx in part]
        split_scores.append(np.exp(np.mean(scores)))

    return np.mean(split_scores), np.std(split_scores)

if __name__ == '__main__':
    import torchvision.transforms as transforms

    class IgnoreLabelDataset(torch.utils.data.Dataset):
        """Unlabelled dataset over the .jpg/.png files of a single folder.

        Every item is resized to 256x256, converted to a tensor and
        normalised with mean 0.5 / std 0.5 per channel.
        """

        def __init__(self, folder):
            self.folder = folder
            files = os.listdir(folder)
            self.imgs = [it for it in files if it.endswith(('.jpg', '.png'))]

            self.transform = transforms.Compose([transforms.Resize((256, 256)),
                                                    transforms.ToTensor(),
                                                    transforms.Normalize((0.5, 0.5, 0.5),
                                                                        (0.5, 0.5, 0.5))])

        def __getitem__(self, index):
            img_path = self.imgs[index]
            # Force three channels: the Normalize step above carries exactly
            # three means/stds, so RGBA or grayscale files would not match.
            img = Image.open(os.path.join(self.folder, img_path)).convert('RGB')
            img = self.transform(img)
            return img

        def __len__(self):
            return len(self.imgs)

    # The image folder is taken from the first command-line argument.
    data_ = IgnoreLabelDataset(sys.argv[1])
    print('datalen is {}'.format(len(data_)))

    print ("Calculating Inception Score...")
    print (inception_score(data_, cuda=True, batch_size=32, resize=True, splits=10))

FID代码

def calculate_fid(image1, image2):
    """Return the Frechet Inception Distance between two image batches.

    Both batches are passed through Keras' InceptionV3 (ImageNet weights, no
    classification head, average pooling). The distance is the squared
    difference of the feature means plus
    ``trace(sigma1 + sigma2 - 2 * sqrt(sigma1 @ sigma2))``.

    Args:
        image1: batch whose embeddings are treated as the real images.
        image2: batch whose embeddings are treated as the generated images.
            Both are handed to ``preprocess_input`` first; presumably raw
            pixel arrays are expected -- TODO confirm against the caller.

    Returns:
        The FID value as a scalar.
    """
    inception = tf.keras.applications.inception_v3
    model = inception.InceptionV3(
        include_top=False,
        weights='imagenet',
        pooling='avg',
    )

    # Preprocess both batches before running either prediction.
    image1 = inception.preprocess_input(image1)
    image2 = inception.preprocess_input(image2)
    real_feats = model.predict(image1)
    fake_feats = model.predict(image2)

    # First and second moments of each feature set (rows are samples).
    mu1 = real_feats.mean(axis=0)
    sigma1 = np.cov(real_feats, rowvar=False)
    mu2 = fake_feats.mean(axis=0)
    sigma2 = np.cov(fake_feats, rowvar=False)

    mean_gap = mu1 - mu2
    ssdiff = np.sum(mean_gap ** 2.0)

    # The matrix square root can come back complex; only its real part is
    # kept.
    covmean = sqrtm(sigma1.dot(sigma2))
    if np.iscomplexobj(covmean):
        covmean = covmean.real

    return ssdiff + np.trace(sigma1 + sigma2 - 2.0 * covmean)

完整实例

import matplotlib.pyplot as plt
import utils
import nn
import os
import numpy as np
import tensorflow as tf
from scipy.linalg import sqrtm
import math
 
def generate_sketch(epochs=0, num=96, weight_path='/models_DDPM/model.h5'):
    """Load a DiffusionWriter checkpoint and run one batch of inference.

    Args:
        epochs: not used by this function.
        num: batch size of the dummy inputs and of the inference run.
        weight_path: checkpoint passed to ``model.load_weights``.

    Returns:
        Whatever ``utils.run_batch_inference`` returns; the original note
        gives its shape as (num, 104, 3).
    """
    base_steps = 6 * 16
    timesteps = base_steps - (base_steps % 8) + 8     # 104
    beta_set = utils.get_beta_set()

    c1 = 128
    c2 = c1 * 3 // 2
    c3 = c1 * 2
    model = nn.DiffusionWriter(num_layers=2, c1=c1, c2=c2, c3=c3)

    # Call the model once on random inputs before loading the weights
    # (presumably so its variables exist -- TODO confirm).
    dummy_stroke = tf.random.normal([num, 104, 2])
    dummy_text = tf.random.uniform([num, 40], dtype=tf.int32, maxval=50)
    dummy_noise = tf.random.uniform([num, 1, 1])
    model(dummy_stroke, dummy_noise)
    model.load_weights(weight_path)

    # Only the batch dimension of the dummy text is used from here on.
    batch = dummy_text.shape[0]
    return utils.run_batch_inference(
        model, beta_set, batch, time_steps=timesteps,
        diffusion_mode='new', pen_break=0.010)
 
 
def save_image(result, sketch_list, epochs, save_name='bus_test'):
    """Save each sketch as ``./training_inference/<save_name>/<epochs>_<i>.png``.

    Args:
        result: only ``result.shape[0]`` is read; it sets how many images
            are written.
        sketch_list: indexable collection of images accepted by
            ``plt.imshow``.
        epochs: prefix of every output file name.
        save_name: sub-directory under ``./training_inference``.
    """
    # Hide both tick sets and the axes themselves.
    plt.xticks([])
    plt.yticks([])
    plt.axis('off')

    save_path = f"./training_inference/{save_name}"
    for idx in range(result.shape[0]):
        plt.imshow(sketch_list[idx])
        # The directory is created on first use.
        if not os.path.exists(save_path):
            os.makedirs(save_path)
            print(f"{save_path} not exits! make new!")
        out_file = f"{save_path}/{epochs}_{idx}.png"
        plt.savefig(out_file)
        print(f"{out_file} is saved!")
 
 
def calculate_fid(image1, image2):
    """Return the Frechet Inception Distance between two image batches.

    Both batches are passed through Keras' InceptionV3 (ImageNet weights, no
    classification head, average pooling). The distance is the squared
    difference of the feature means plus
    ``trace(sigma1 + sigma2 - 2 * sqrt(sigma1 @ sigma2))``.

    Args:
        image1: batch whose embeddings are treated as the real images.
        image2: batch whose embeddings are treated as the generated images.
            Both are handed to ``preprocess_input`` first; presumably raw
            pixel arrays are expected -- TODO confirm against the caller.

    Returns:
        The FID value as a scalar.
    """
    inception = tf.keras.applications.inception_v3
    model = inception.InceptionV3(
        include_top=False,
        weights='imagenet',
        pooling='avg',
    )

    # Preprocess both batches before running either prediction.
    image1 = inception.preprocess_input(image1)
    image2 = inception.preprocess_input(image2)
    real_feats = model.predict(image1)
    fake_feats = model.predict(image2)

    # First and second moments of each feature set (rows are samples).
    mu1 = real_feats.mean(axis=0)
    sigma1 = np.cov(real_feats, rowvar=False)
    mu2 = fake_feats.mean(axis=0)
    sigma2 = np.cov(fake_feats, rowvar=False)

    mean_gap = mu1 - mu2
    ssdiff = np.sum(mean_gap ** 2.0)

    # The matrix square root can come back complex; only its real part is
    # kept.
    covmean = sqrtm(sigma1.dot(sigma2))
    if np.iscomplexobj(covmean):
        covmean = covmean.real

    return ssdiff + np.trace(sigma1 + sigma2 - 2.0 * covmean)
 
 
def calculate_inception_score(images, n_split=10, eps=1E-16):
    """Return the mean and std of the Inception Score over ``n_split`` splits.

    Per the original note, images are assumed to have shape 299x299x3 with
    pixels in [0, 255].

    Args:
        images: array of images; cast to float32 and preprocessed for
            InceptionV3 before prediction.
        n_split: number of consecutive, equally sized groups to score.
        eps: small constant added inside both logarithms.

    Returns:
        ``(mean, std)`` of the per-split scores.
    """
    inception = tf.keras.applications.inception_v3
    model = inception.InceptionV3()

    # uint8 -> float32, then the model's own input preprocessing.
    as_float = images.astype('float32')
    yhat = model.predict(inception.preprocess_input(as_float))

    n_part = math.floor(images.shape[0] / n_split)
    scores = []
    for split in range(n_split):
        first = split * n_part
        p_yx = yhat[first:first + n_part]                   # p(y|x)
        p_y = np.expand_dims(p_yx.mean(axis=0), 0)          # p(y)
        # KL(p(y|x) || p(y)): summed over classes, averaged over images.
        kl_terms = p_yx * (np.log(p_yx + eps) - np.log(p_y + eps))
        mean_kl = np.mean(kl_terms.sum(axis=1))
        # Undo the log to get this split's score.
        scores.append(np.exp(mean_kl))

    return np.mean(scores), np.std(scores)
 
 
if __name__ == "__main__":
    # Script entry point: compute and print the FID between array 'arr_0' of
    # the first archive and array 'arr_1' of the second.
    # NOTE(review): allow_pickle=True can execute arbitrary code while
    # loading; only use it on archives you trust.
    first_archive = np.load("./FID/RNN/train_rnn_100.npz", allow_pickle=True)
    second_archive = np.load("./FID/RNN/train_rnn_100_1.npz", allow_pickle=True)

    origin_fid = first_archive['arr_0']
    generate_fid = second_archive['arr_1']

    real_batch = np.array(origin_fid)
    generated_batch = np.array(generate_fid)
    fid = calculate_fid(real_batch, generated_batch)
    print(f"fid: {fid}")
    exit()

U-net网络结构

前面的博客已经详细说明了为什么要用U-net，以及U-net的结构

import math
import torch
from torch import nn
from torch.nn import init
from torch.nn import functional as F
 
 
class Swish(nn.Module):
    """Swish activation: elementwise ``x * sigmoid(x)``."""

    def forward(self, x):
        gate = torch.sigmoid(x)
        return x * gate
 
 
class TimeEmbedding(nn.Module):
    """Map integer timesteps to ``dim``-sized embedding vectors.

    A sinusoidal table with one row per timestep (sin and cos interleaved
    along the feature axis) is loaded through ``nn.Embedding.from_pretrained``
    and followed by Linear -> Swish -> Linear.

    Args:
        T: number of timesteps, i.e. rows of the table.
        d_model: width of the sinusoidal table; must be even.
        dim: width of the hidden and output layers.
    """

    def __init__(self, T, d_model, dim):
        assert d_model % 2 == 0
        super().__init__()
        half = d_model // 2

        # Frequencies exp(-log(10000) * 2i / d_model) for i in [0, half).
        exponents = torch.arange(0, d_model, step=2) / d_model * math.log(10000)
        freqs = torch.exp(-exponents)
        steps = torch.arange(T).float()
        angles = steps[:, None] * freqs[None, :]
        assert list(angles.shape) == [T, half]

        # Stacking on a new last axis and flattening interleaves the pairs:
        # [sin_0, cos_0, sin_1, cos_1, ...].
        table = torch.stack([torch.sin(angles), torch.cos(angles)], dim=-1)
        assert list(table.shape) == [T, half, 2]
        table = table.view(T, d_model)

        self.timembedding = nn.Sequential(
            nn.Embedding.from_pretrained(table),
            nn.Linear(d_model, dim),
            Swish(),
            nn.Linear(dim, dim),
        )
        self.initialize()

    def initialize(self):
        """Xavier-initialise every Linear weight and zero every Linear bias."""
        for layer in self.modules():
            if not isinstance(layer, nn.Linear):
                continue
            init.xavier_uniform_(layer.weight)
            init.zeros_(layer.bias)

    def forward(self, t):
        """Return the embedding for the timestep indices ``t``."""
        return self.timembedding(t)
 
 
class DownSample(nn.Module):
    """Downsample with a stride-2 3x3 convolution; channel count is kept."""

    def __init__(self, in_ch):
        super().__init__()
        self.main = nn.Conv2d(in_ch, in_ch, kernel_size=3, stride=2, padding=1)
        self.initialize()

    def initialize(self):
        """Xavier-initialise the convolution weight and zero its bias."""
        conv = self.main
        init.xavier_uniform_(conv.weight)
        init.zeros_(conv.bias)

    def forward(self, x, temb):
        """Apply the convolution to ``x``; ``temb`` is accepted but unused."""
        return self.main(x)
 
 
class UpSample(nn.Module):
    """Double the spatial size (nearest neighbour), then apply a 3x3 conv."""

    def __init__(self, in_ch):
        super().__init__()
        self.main = nn.Conv2d(in_ch, in_ch, kernel_size=3, stride=1, padding=1)
        self.initialize()

    def initialize(self):
        """Xavier-initialise the convolution weight and zero its bias."""
        conv = self.main
        init.xavier_uniform_(conv.weight)
        init.zeros_(conv.bias)

    def forward(self, x, temb):
        """Upsample ``x`` by 2 and convolve; ``temb`` is accepted but unused."""
        # The values are unused, but the unpacking fails unless x has
        # exactly four dimensions.
        _n, _c, _h, _w = x.shape
        enlarged = F.interpolate(x, scale_factor=2, mode='nearest')
        return self.main(enlarged)
 
 
class AttnBlock(nn.Module):
    """Single-head self-attention over spatial positions, with a residual add.

    Args:
        in_ch: channel count of input and output; GroupNorm uses 32 groups.
    """

    def __init__(self, in_ch):
        super().__init__()
        self.group_norm = nn.GroupNorm(32, in_ch)
        # 1x1 convolutions for query, key, value and the output projection.
        self.proj_q = nn.Conv2d(in_ch, in_ch, kernel_size=1, stride=1, padding=0)
        self.proj_k = nn.Conv2d(in_ch, in_ch, kernel_size=1, stride=1, padding=0)
        self.proj_v = nn.Conv2d(in_ch, in_ch, kernel_size=1, stride=1, padding=0)
        self.proj = nn.Conv2d(in_ch, in_ch, kernel_size=1, stride=1, padding=0)
        self.initialize()

    def initialize(self):
        """Xavier-initialise all four convolutions; the output one gets a tiny gain."""
        for conv in (self.proj_q, self.proj_k, self.proj_v, self.proj):
            init.xavier_uniform_(conv.weight)
            init.zeros_(conv.bias)
        # Re-draw the output projection with gain 1e-5 so the block starts
        # out close to the identity.
        init.xavier_uniform_(self.proj.weight, gain=1e-5)

    def forward(self, x):
        """Return ``x`` plus the projected attention output (same shape as ``x``)."""
        B, C, H, W = x.shape
        positions = H * W
        normed = self.group_norm(x)

        # Queries and values as (B, H*W, C); keys as (B, C, H*W).
        queries = self.proj_q(normed).permute(0, 2, 3, 1).view(B, positions, C)
        keys = self.proj_k(normed).view(B, C, positions)
        values = self.proj_v(normed).permute(0, 2, 3, 1).view(B, positions, C)

        # Scaled dot-product scores between every pair of positions.
        scores = torch.bmm(queries, keys) * (int(C) ** (-0.5))
        assert list(scores.shape) == [B, positions, positions]
        weights = F.softmax(scores, dim=-1)

        mixed = torch.bmm(weights, values)
        assert list(mixed.shape) == [B, positions, C]
        # Back to (B, C, H, W) for the output projection.
        mixed = mixed.view(B, H, W, C).permute(0, 3, 1, 2)

        return x + self.proj(mixed)
 
 
class ResBlock(nn.Module):
    """Residual block conditioned on a time embedding, with optional attention.

    Args:
        in_ch: input channels.
        out_ch: output channels.
        tdim: width of the time embedding passed to ``forward``.
        dropout: dropout probability used inside the second conv stage.
        attn: if True, an AttnBlock is applied to the block's output.
    """

    def __init__(self, in_ch, out_ch, tdim, dropout, attn=False):
        super().__init__()
        # Stage 1: norm -> activation -> conv (in_ch -> out_ch).
        self.block1 = nn.Sequential(
            nn.GroupNorm(32, in_ch),
            Swish(),
            nn.Conv2d(in_ch, out_ch, kernel_size=3, stride=1, padding=1),
        )
        # Projects the time embedding to one value per output channel.
        self.temb_proj = nn.Sequential(
            Swish(),
            nn.Linear(tdim, out_ch),
        )
        # Stage 2: norm -> activation -> dropout -> conv (out_ch -> out_ch).
        self.block2 = nn.Sequential(
            nn.GroupNorm(32, out_ch),
            Swish(),
            nn.Dropout(dropout),
            nn.Conv2d(out_ch, out_ch, kernel_size=3, stride=1, padding=1),
        )
        # A 1x1 conv aligns the skip path only when the channel count changes.
        self.shortcut = (
            nn.Conv2d(in_ch, out_ch, kernel_size=1, stride=1, padding=0)
            if in_ch != out_ch else nn.Identity())
        self.attn = AttnBlock(out_ch) if attn else nn.Identity()
        self.initialize()

    def initialize(self):
        """Xavier-initialise every Conv2d/Linear, then shrink the last conv."""
        # NOTE(review): self.modules() also walks into self.attn, so an
        # AttnBlock's convolutions are re-initialised here with the default
        # gain -- confirm this is intended.
        for layer in self.modules():
            if not isinstance(layer, (nn.Conv2d, nn.Linear)):
                continue
            init.xavier_uniform_(layer.weight)
            init.zeros_(layer.bias)
        final_conv = self.block2[-1]
        init.xavier_uniform_(final_conv.weight, gain=1e-5)

    def forward(self, x, temb):
        """Return ``attn(block2(block1(x) + temb_proj(temb)) + shortcut(x))``."""
        h = self.block1(x)
        # Broadcast the per-channel time projection over height and width.
        h += self.temb_proj(temb)[:, :, None, None]
        h = self.block2(h)

        residual = h + self.shortcut(x)
        return self.attn(residual)
 
 
class UNet(nn.Module):
    """U-Net built from ResBlocks with a time embedding and skip connections.

    Args:
        T: number of timesteps handed to TimeEmbedding.
        ch: base channel count; the time-embedding width is ``ch * 4``.
        ch_mult: per-level channel multipliers; level ``i`` uses
            ``ch * ch_mult[i]`` channels.
        attn: indices of the levels whose ResBlocks get attention.
        num_res_blocks: ResBlocks per level on the way down (one more per
            level on the way up).
        dropout: dropout probability passed to every ResBlock.
    """

    def __init__(self, T, ch, ch_mult, attn, num_res_blocks, dropout):
        super().__init__()
        assert all([i < len(ch_mult) for i in attn]), 'attn index out of bound'
        tdim = ch * 4
        self.time_embedding = TimeEmbedding(T, ch, tdim)

        # Input has 3 channels.
        self.head = nn.Conv2d(3, ch, kernel_size=3, stride=1, padding=1)
        self.downblocks = nn.ModuleList()
        # Stack of the channel counts produced on the way down; popped on the
        # way up to size the concatenated skip inputs.
        chs = [ch]  # record output channel when downsample for upsample
        now_ch = ch
        for i, mult in enumerate(ch_mult):
            out_ch = ch * mult
            for _ in range(num_res_blocks):
                self.downblocks.append(ResBlock(
                    in_ch=now_ch, out_ch=out_ch, tdim=tdim,
                    dropout=dropout, attn=(i in attn)))
                now_ch = out_ch
                chs.append(now_ch)
            # Every level except the last ends with a DownSample.
            if i != len(ch_mult) - 1:
                self.downblocks.append(DownSample(now_ch))
                chs.append(now_ch)

        self.middleblocks = nn.ModuleList([
            ResBlock(now_ch, now_ch, tdim, dropout, attn=True),
            ResBlock(now_ch, now_ch, tdim, dropout, attn=False),
        ])

        self.upblocks = nn.ModuleList()
        for i, mult in reversed(list(enumerate(ch_mult))):
            out_ch = ch * mult
            # One extra block per level so every recorded entry in chs is
            # consumed (checked by the assert below).
            for _ in range(num_res_blocks + 1):
                self.upblocks.append(ResBlock(
                    in_ch=chs.pop() + now_ch, out_ch=out_ch, tdim=tdim,
                    dropout=dropout, attn=(i in attn)))
                now_ch = out_ch
            # Every level except the first (i == 0) ends with an UpSample.
            if i != 0:
                self.upblocks.append(UpSample(now_ch))
        assert len(chs) == 0

        # Output has 3 channels, matching the input.
        self.tail = nn.Sequential(
            nn.GroupNorm(32, now_ch),
            Swish(),
            nn.Conv2d(now_ch, 3, 3, stride=1, padding=1)
        )
        self.initialize()

    def initialize(self):
        """Xavier-initialise the head and tail convolutions and zero their biases."""
        init.xavier_uniform_(self.head.weight)
        init.zeros_(self.head.bias)
        # The final convolution uses a tiny gain (1e-5).
        init.xavier_uniform_(self.tail[-1].weight, gain=1e-5)
        init.zeros_(self.tail[-1].bias)

    def forward(self, x, t):
        """Run the network on ``x`` conditioned on timestep indices ``t``."""
        # Timestep embedding
        temb = self.time_embedding(t)
        # Downsampling
        h = self.head(x)
        # Every intermediate activation is kept for the skip connections.
        hs = [h]
        for layer in self.downblocks:
            h = layer(h, temb)
            hs.append(h)
        # Middle
        for layer in self.middleblocks:
            h = layer(h, temb)
        # Upsampling
        for layer in self.upblocks:
            # Only ResBlocks consume a skip activation; UpSample layers do not.
            if isinstance(layer, ResBlock):
                h = torch.cat([h, hs.pop()], dim=1)
            h = layer(h, temb)
        h = self.tail(h)

        assert len(hs) == 0
        return h
 
 
if __name__ == '__main__':
    # Smoke test: build a UNet and push one random batch of 32x32 RGB inputs
    # with random timesteps through it.
    batch_size = 8
    unet_config = dict(
        T=1000,
        ch=128,
        ch_mult=[1, 2, 2, 2],
        attn=[1],
        num_res_blocks=2,
        dropout=0.1,
    )
    model = UNet(**unet_config)
    x = torch.randn(batch_size, 3, 32, 32)
    t = torch.randint(1000, (batch_size, ))
    y = model(x, t)

本文作者： 李宝璐
本文链接： https://libaolu312.github.io/2023/11/13/DDPM代码详细解读-2-Unet结构、正向和逆向过程、IS和FID测试、EMA优化/
版权声明： 本博客所有文章除特别声明外，均采用 MIT 许可协议。转载请注明出处！