ActionDiffusion_WACV2025/model/dit.py

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# --------------------------------------------------------
# References:
# GLIDE: https://github.com/openai/glide-text2im
# MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py
# --------------------------------------------------------

import torch
import torch.nn as nn
import numpy as np
import math
from timm.models.vision_transformer import Attention, Mlp # PatchEmbed
from torch.nn import TransformerEncoder, TransformerEncoderLayer
from .helpers import SinusoidalPosEmb

class PatchEmbed(nn.Module):
    """Patch embedding that tiles only along the height axis.

    ``img_size`` and ``patch_size`` are each paired with a fixed width of 1,
    so the projection is a convolution whose kernel and stride are both
    ``(patch_size, 1)``. The input size is not validated against
    ``img_size``.

    Args:
        img_size: Expected input height; stored as ``(img_size, 1)``.
        patch_size: Patch height; stored as ``(patch_size, 1)``.
        in_chans: Number of input channels.
        embed_dim: Number of output channels of the projection.
        norm_layer: Optional callable ``norm_layer(embed_dim)`` producing a
            normalisation module; ``nn.Identity`` is used when falsy.
        flatten: If true, return ``(B, N, C)`` tokens instead of ``BCHW``.
        bias: Whether the projection convolution has a bias term.
    """

    def __init__(
            self,
            img_size=224,
            patch_size=16,
            in_chans=3,
            embed_dim=768,
            norm_layer=None,
            flatten=True,
            bias=True,
    ):
        super().__init__()
        self.img_size = (img_size, 1)
        self.patch_size = (patch_size, 1)
        rows = self.img_size[0] // self.patch_size[0]
        cols = self.img_size[1] // self.patch_size[1]
        self.grid_size = (rows, cols)
        self.num_patches = rows * cols
        self.flatten = flatten

        self.proj = nn.Conv2d(
            in_chans,
            embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=bias,
        )
        if norm_layer:
            self.norm = norm_layer(embed_dim)
        else:
            self.norm = nn.Identity()

    def forward(self, x):
        """Project ``x`` to patch embeddings and apply the norm layer."""
        # The unpack doubles as a rank check: a non 4-D input fails here.
        _batch, _chans, _height, _width = x.shape
        patches = self.proj(x)
        if self.flatten:
            patches = patches.flatten(2).transpose(1, 2)  # BCHW -> BNC
        return self.norm(patches)

def modulate(x, shift, scale):
    """Return ``x * (1 + scale) + shift`` with ``shift``/``scale`` broadcast over axis 1.

    Both ``shift`` and ``scale`` get a new axis inserted at position 1 before
    being combined with ``x``.
    """
    gain = 1 + scale.unsqueeze(1)
    offset = shift.unsqueeze(1)
    return x * gain + offset


#################################################################################
#               Embedding Layers for Timesteps and Class Labels                 #
#################################################################################

class TimestepEmbedder(nn.Module):
    """
    Maps scalar timesteps to vectors: sinusoidal features followed by a
    two-layer MLP (Linear -> SiLU -> Linear).
    """
    def __init__(self, hidden_size, frequency_embedding_size=256):
        super().__init__()
        self.frequency_embedding_size = frequency_embedding_size
        project_in = nn.Linear(frequency_embedding_size, hidden_size, bias=True)
        project_out = nn.Linear(hidden_size, hidden_size, bias=True)
        self.mlp = nn.Sequential(project_in, nn.SiLU(), project_out)

    @staticmethod
    def timestep_embedding(t, dim, max_period=10000):
        """
        Build sinusoidal timestep features.
        :param t: a 1-D Tensor of N indices, one per batch element.
                          These may be fractional.
        :param dim: the dimension of the output.
        :param max_period: controls the minimum frequency of the embeddings.
        :return: an (N, D) Tensor laid out as [cos | sin], zero-padded by one
                 column when ``dim`` is odd.
        """
        # https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py
        n_freqs = dim // 2
        exponents = -math.log(max_period) * torch.arange(start=0, end=n_freqs, dtype=torch.float32) / n_freqs
        freqs = torch.exp(exponents).to(device=t.device)
        phase = t[:, None].float() * freqs[None]
        pieces = [torch.cos(phase), torch.sin(phase)]
        if dim % 2:
            # Odd dim: one all-zero column brings the width up to ``dim``.
            pieces.append(torch.zeros_like(phase[:, :1]))
        return torch.cat(pieces, dim=-1)

    def forward(self, t):
        """Embed timesteps ``t`` of shape (N,) into an (N, hidden_size) tensor."""
        features = self.timestep_embedding(t, self.frequency_embedding_size)
        return self.mlp(features)


class LabelEmbedder(nn.Module):
    """
    Looks up an embedding per class label and can replace labels with a
    "null" index (``num_classes``) for classifier-free guidance.
    """
    def __init__(self, num_classes, hidden_size, dropout_prob):
        super().__init__()
        self.num_classes = num_classes
        self.dropout_prob = dropout_prob
        # One extra row holds the null-class embedding when dropout is enabled.
        table_rows = num_classes + 1 if dropout_prob > 0 else num_classes
        self.embedding_table = nn.Embedding(table_rows, hidden_size)

    def token_drop(self, labels, force_drop_ids=None):
        """
        Replace selected labels with the null index ``num_classes``.

        Entries are selected where ``force_drop_ids == 1`` when it is given,
        otherwise at random with probability ``dropout_prob``.
        """
        if force_drop_ids is not None:
            drop_mask = force_drop_ids == 1
        else:
            drop_mask = torch.rand(labels.shape[0], device=labels.device) < self.dropout_prob
        return torch.where(drop_mask, self.num_classes, labels)

    def forward(self, labels, train, force_drop_ids=None):
        """Embed ``labels``, dropping some first when training or when forced."""
        random_drop = train and self.dropout_prob > 0
        if random_drop or force_drop_ids is not None:
            labels = self.token_drop(labels, force_drop_ids)
        return self.embedding_table(labels)


#################################################################################
#                                 Core DiT Model                                #
#################################################################################

class DiTBlock(nn.Module):
    """
    Transformer block conditioned through adaptive layer norm (adaLN-Zero).

    The conditioning vector is mapped to six chunks of width ``hidden_size``:
    shift, scale and gate for the attention branch, then the same three for
    the MLP branch.
    """
    def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, **block_kwargs):
        super().__init__()

        def _tanh_gelu():
            return nn.GELU(approximate="tanh")

        self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.attn = Attention(hidden_size, num_heads=num_heads, qkv_bias=True, **block_kwargs)
        self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.mlp = Mlp(
            in_features=hidden_size,
            hidden_features=int(hidden_size * mlp_ratio),
            act_layer=_tanh_gelu,
            drop=0,
        )
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            nn.Linear(hidden_size, 6 * hidden_size, bias=True),
        )

    def forward(self, x, c):
        """Apply gated, modulated attention and MLP residual updates to ``x``."""
        chunks = self.adaLN_modulation(c).chunk(6, dim=1)
        shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = chunks

        attn_out = self.attn(modulate(self.norm1(x), shift_msa, scale_msa))
        x = x + gate_msa.unsqueeze(1) * attn_out

        mlp_out = self.mlp(modulate(self.norm2(x), shift_mlp, scale_mlp))
        return x + gate_mlp.unsqueeze(1) * mlp_out


class FinalLayer(nn.Module):
    """
    Output head of DiT: modulated layer norm followed by a linear projection
    to ``patch_size * out_channels`` values per token.
    """
    def __init__(self, hidden_size, patch_size, out_channels):
        super().__init__()
        # Patches are patch_size x 1, hence the explicit factor of 1.
        values_per_token = patch_size * 1 * out_channels
        self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
        self.linear = nn.Linear(hidden_size, values_per_token, bias=True)
        self.adaLN_modulation = nn.Sequential(
            nn.SiLU(),
            nn.Linear(hidden_size, 2 * hidden_size, bias=True),
        )

    def forward(self, x, c):
        """Modulate the normalised tokens with ``c`` and project them."""
        shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)
        return self.linear(modulate(self.norm_final(x), shift, scale))


class DiT(nn.Module):
    """
    Diffusion model with a Transformer backbone.

    This variant patchifies along the height axis only (patches are
    ``patch_size x 1``) and is conditioned on the diffusion timestep alone.
    The label embedder is disabled, so ``class_dropout_prob`` and
    ``num_classes`` are accepted for signature compatibility but never used.
    """
    def __init__(
        self,
        input_size=32,
        patch_size=2,
        in_channels=4,
        hidden_size=384,
        depth=12,
        num_heads=6,
        mlp_ratio=4.0,
        class_dropout_prob=0.1,
        num_classes=1000,
        learn_sigma=False,
    ):
        super().__init__()
        self.learn_sigma = learn_sigma
        self.in_channels = in_channels
        # The output has twice as many channels when learn_sigma is set.
        self.out_channels = in_channels * 2 if learn_sigma else in_channels
        self.patch_size = patch_size
        self.num_heads = num_heads

        self.x_embedder = PatchEmbed(input_size, patch_size, in_channels, hidden_size, bias=True)
        self.t_embedder = TimestepEmbedder(hidden_size)
        num_patches = self.x_embedder.num_patches
        # Fixed (non-trainable) sin-cos table, filled in by initialize_weights().
        self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, hidden_size), requires_grad=False)

        self.blocks = nn.ModuleList([
            DiTBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio) for _ in range(depth)
        ])
        self.final_layer = FinalLayer(hidden_size, patch_size, self.out_channels)
        self.initialize_weights()

    def initialize_weights(self):
        """Initialise all parameters; later steps override the generic Linear init."""
        # Initialize transformer layers:
        def _basic_init(module):
            if isinstance(module, nn.Linear):
                torch.nn.init.xavier_uniform_(module.weight)
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
        self.apply(_basic_init)

        # Initialize (and freeze) pos_embed by sin-cos embedding. The grid is
        # num_patches x 1, so the patch count is passed as the grid size.
        pos_embed = get_2d_sincos_pos_embed(self.pos_embed.shape[-1], int(self.x_embedder.num_patches))
        self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0))

        # Initialize patch_embed like nn.Linear (instead of nn.Conv2d):
        w = self.x_embedder.proj.weight.data
        nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
        nn.init.constant_(self.x_embedder.proj.bias, 0)

        # Initialize timestep embedding MLP:
        nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)
        nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)

        # Zero-out adaLN modulation layers in DiT blocks:
        for block in self.blocks:
            nn.init.constant_(block.adaLN_modulation[-1].weight, 0)
            nn.init.constant_(block.adaLN_modulation[-1].bias, 0)

        # Zero-out output layers:
        nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0)
        nn.init.constant_(self.final_layer.adaLN_modulation[-1].bias, 0)
        nn.init.constant_(self.final_layer.linear.weight, 0)
        nn.init.constant_(self.final_layer.linear.bias, 0)

    def unpatchify(self, x):
        """
        Fold per-token outputs back into a channel-first map.

        x: (N, T, patch_size * C)
        imgs: (N, C, T * patch_size, 1)
        """
        c = self.out_channels
        p = self.x_embedder.patch_size[0]
        # One token per patch row and a single patch column.
        h = x.shape[1]
        w = 1

        x = x.reshape(shape=(x.shape[0], h, w, p, 1, c))
        x = torch.einsum('nhwpqc->nchpwq', x)
        imgs = x.reshape(shape=(x.shape[0], c, h * p, w * 1))
        return imgs

    def forward(self, x, t):
        """
        Forward pass of DiT.
        x: (N, C, H, W) tensor of spatial inputs; the patch grid is built for W == 1
        t: (N,) tensor of diffusion timesteps
        returns: (N, out_channels, H, 1) tensor
        """
        x = self.x_embedder(x) + self.pos_embed  # (N, T, D), T = H / patch_size for W == 1
        c = self.t_embedder(t)                   # (N, D); timestep is the only conditioning
        for block in self.blocks:
            x = block(x, c)                      # (N, T, D)
        x = self.final_layer(x, c)               # (N, T, patch_size * out_channels)
        x = self.unpatchify(x)                   # (N, out_channels, H, 1)
        return x

    def forward_with_cfg(self, x, t, y, cfg_scale):
        """
        Forward pass of DiT that duplicates the first half of the batch and
        blends the two halves of the prediction classifier-free-guidance style.

        x: (N, C, H, W) inputs; only the first ``N // 2`` rows are used
        t: timesteps, passed unchanged to ``forward`` with the duplicated batch
        y: ignored. ``forward`` takes no labels in this variant; the parameter
           is kept so existing call sites continue to work.
        cfg_scale: guidance scale applied to ``cond - uncond``

        NOTE(review): with label conditioning disabled both halves see the
        same inputs apart from ``t``, so guidance only has an effect if the two
        halves of ``t`` differ - confirm this is the intended use.
        """
        # https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb
        half = x[: len(x) // 2]
        combined = torch.cat([half, half], dim=0)
        # forward() accepts only (x, t); also passing y raised a TypeError.
        model_out = self.forward(combined, t)
        # For exact reproducibility reasons, we apply classifier-free guidance on only
        # three channels by default. The standard approach to cfg applies it to all channels.
        # This can be done by uncommenting the following line and commenting-out the line following that.
        # eps, rest = model_out[:, :self.in_channels], model_out[:, self.in_channels:]
        eps, rest = model_out[:, :3], model_out[:, 3:]
        cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)
        half_eps = uncond_eps + cfg_scale * (cond_eps - uncond_eps)
        eps = torch.cat([half_eps, half_eps], dim=0)
        return torch.cat([eps, rest], dim=1)


#################################################################################
#                   Sine/Cosine Positional Embedding Functions                  #
#################################################################################
# https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py

def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0):
    """
    Build a sin-cos position table for a ``grid_size x 1`` grid.

    grid_size: int, number of positions along the height axis (width is 1)
    return:
    pos_embed: [grid_size, embed_dim], or [extra_tokens+grid_size, embed_dim]
               with leading zero rows when cls_token is set and extra_tokens > 0
    """
    rows = np.arange(grid_size, dtype=np.float32)
    single_col = np.arange(1, dtype=np.float32)
    # meshgrid yields the column coordinates first, then the row coordinates.
    coords = np.stack(np.meshgrid(single_col, rows), axis=0)
    coords = coords.reshape([2, 1, grid_size, 1])

    table = get_2d_sincos_pos_embed_from_grid(embed_dim, coords)
    if not (cls_token and extra_tokens > 0):
        return table
    padding = np.zeros([extra_tokens, embed_dim])
    return np.concatenate([padding, table], axis=0)


def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
    """
    Encode ``grid[0]`` and ``grid[1]`` with ``embed_dim // 2`` dimensions each
    and concatenate the two encodings along the feature axis.

    embed_dim: total output dimension; must be even
    grid: array whose first axis indexes the two coordinate planes
    out: (M, embed_dim), M being the number of grid positions
    """
    assert embed_dim % 2 == 0

    per_axis_dim = embed_dim // 2
    halves = [
        get_1d_sincos_pos_embed_from_grid(per_axis_dim, grid[axis])  # (M, D/2)
        for axis in (0, 1)
    ]
    return np.concatenate(halves, axis=1)  # (M, D)


def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
    """
    Sin-cos encoding of a set of scalar positions.

    embed_dim: output dimension for each position; must be even
    pos: array of positions, flattened to size (M,)
    out: (M, D) laid out as [sin | cos]
    """
    assert embed_dim % 2 == 0
    half_dim = embed_dim // 2
    exponents = np.arange(half_dim, dtype=np.float64) / (embed_dim / 2.)
    inv_wavelength = 1. / 10000**exponents  # (D/2,)

    flat_pos = pos.reshape(-1)  # (M,)
    # Outer product via broadcasting: (M, 1) * (1, D/2) -> (M, D/2)
    angles = flat_pos[:, None] * inv_wavelength[None, :]

    return np.concatenate([np.sin(angles), np.cos(angles)], axis=1)  # (M, D)

def get_emb(sin_inp):
    """
    Interleave sin and cos of ``sin_inp`` along the last axis, giving
    (..., sin_0, cos_0, sin_1, cos_1, ...) with twice the last dimension.
    """
    paired = torch.stack((sin_inp.sin(), sin_inp.cos()), dim=-1)
    return paired.flatten(-2, -1)


class PositionalEncoding1D(nn.Module):
    """Sinusoidal positional encoding for channel-first 3-D tensors.

    Positions run along the last axis of the input and encoding channels
    along axis 1. The most recent result is cached and reused while the
    input's shape, output dtype and device stay the same.
    """

    def __init__(self, channels, dtype_override=None):
        """
        :param channels: Number of encoding channels; rounded up to an even number internally.
        :param dtype_override: If set, overrides the dtype of the output embedding.
        """
        super(PositionalEncoding1D, self).__init__()
        self.org_channels = channels
        channels = int(np.ceil(channels / 2) * 2)
        inv_freq = 1.0 / (10000 ** (torch.arange(0, channels, 2).float() / channels))
        self.register_buffer("inv_freq", inv_freq)
        # Non-persistent: the cache is derived data and stays out of state_dict.
        self.register_buffer("cached_penc", None, persistent=False)
        self.channels = channels
        self.dtype_override = dtype_override

    def forward(self, tensor):
        """
        :param tensor: A 3d tensor of size (batch_size, ch, x)
        :return: Positional Encoding Matrix of size (batch_size, ch, x); at most
            ``self.channels`` rows are available along the channel axis
        :raises RuntimeError: if ``tensor`` is not 3-dimensional
        """
        if len(tensor.shape) != 3:
            raise RuntimeError("The input tensor has to be 3d!")

        out_dtype = self.dtype_override if self.dtype_override is not None else tensor.dtype

        # Shape alone is not enough: a same-shape input with another dtype or
        # device must not be served the stale cached encoding.
        cached = self.cached_penc
        if (
            cached is not None
            and cached.shape == tensor.shape
            and cached.dtype == out_dtype
            and cached.device == tensor.device
        ):
            return cached

        self.cached_penc = None
        batch_size, orig_ch, x = tensor.shape
        pos_x = torch.arange(x, device=tensor.device, dtype=self.inv_freq.dtype)
        sin_inp_x = torch.einsum("i,j->ij", pos_x, self.inv_freq)
        # Interleave sin and cos along the last axis (same layout as get_emb).
        emb_x = torch.stack((sin_inp_x.sin(), sin_inp_x.cos()), dim=-1).flatten(-2, -1)
        emb = torch.zeros((self.channels, x), device=tensor.device, dtype=out_dtype)
        # emb_x is (x, channels); transpose so channels come first.
        emb[:self.channels, :] = emb_x.permute(1, 0)

        self.cached_penc = emb[None, :orig_ch, :].repeat(batch_size, 1, 1)
        return self.cached_penc

class TransformerModel(nn.Module):
    """Transformer encoder applied to ``src`` plus a sinusoidal position term.

    ``ntoken`` is accepted but unused. ``time_mlp`` is constructed but not
    called by ``forward``, which ignores its ``t`` argument.
    """

    def __init__(self, ntoken, ninp, nhead, nhid, nlayers=6, dropout=0.0):
        super(TransformerModel, self).__init__()

        self.model_type = 'Transformer'
        self.ninp = ninp
        self.pos_encoder = PositionalEncoding1D(ninp)
        layer = TransformerEncoderLayer(ninp, nhead, nhid, dropout, batch_first=True)
        self.transformer_encoder = TransformerEncoder(layer, nlayers)
        # should be removed for Noise and Deterministic Baselines
        self.time_mlp = nn.Sequential(
            SinusoidalPosEmb(ninp),
            nn.Linear(ninp - 1, ninp * 4),
            nn.Mish(),
            nn.Linear(ninp * 4, ninp),
        )

    def generate_square_subsequent_mask(self, sz):
        """Return an (sz, sz) float mask: 0 on and below the diagonal, -inf above."""
        blocked = torch.full((sz, sz), float('-inf'))
        return torch.triu(blocked, diagonal=1)

    def forward(self, src, t):
        """Encode ``src + pos_encoder(src)``; ``t`` is currently unused.

        NOTE(review): PositionalEncoding1D documents its input as
        (batch_size, ch, x) while the encoder layers are batch_first -
        confirm the layout callers pass for ``src``.
        """
        positions = self.pos_encoder(src)
        return self.transformer_encoder(src + positions)


#################################################################################
#                                   DiT Configs                                  #
#################################################################################

def DiT_XL_2(**kwargs):
    """Build the 'DiT-XL/2' model: 28 blocks, hidden size 1152, 16 heads, patch size 2."""
    preset = dict(depth=28, hidden_size=1152, patch_size=2, num_heads=16)
    return DiT(**preset, **kwargs)

def DiT_XL_4(**kwargs):
    """Build the 'DiT-XL/4' model: 28 blocks, hidden size 1152, 16 heads, patch size 4."""
    preset = dict(depth=28, hidden_size=1152, patch_size=4, num_heads=16)
    return DiT(**preset, **kwargs)

def DiT_XL_8(**kwargs):
    """Build the 'DiT-XL/8' model: 28 blocks, hidden size 1152, 16 heads, patch size 8."""
    preset = dict(depth=28, hidden_size=1152, patch_size=8, num_heads=16)
    return DiT(**preset, **kwargs)

def DiT_L_2(**kwargs):
    """Build the 'DiT-L/2' model: 24 blocks, hidden size 1024, 16 heads, patch size 2."""
    preset = dict(depth=24, hidden_size=1024, patch_size=2, num_heads=16)
    return DiT(**preset, **kwargs)

def DiT_L_4(**kwargs):
    """Build the 'DiT-L/4' model: 24 blocks, hidden size 1024, 16 heads, patch size 4."""
    preset = dict(depth=24, hidden_size=1024, patch_size=4, num_heads=16)
    return DiT(**preset, **kwargs)

def DiT_L_8(**kwargs):
    """Build the 'DiT-L/8' model: 24 blocks, hidden size 1024, 16 heads, patch size 8."""
    preset = dict(depth=24, hidden_size=1024, patch_size=8, num_heads=16)
    return DiT(**preset, **kwargs)

def DiT_B_2(**kwargs):
    """Build the 'DiT-B/2' model: 12 blocks, hidden size 768, 12 heads, patch size 2."""
    preset = dict(depth=12, hidden_size=768, patch_size=2, num_heads=12)
    return DiT(**preset, **kwargs)

def DiT_B_4(**kwargs):
    """Build the 'DiT-B/4' model: 12 blocks, hidden size 768, 12 heads, patch size 4."""
    preset = dict(depth=12, hidden_size=768, patch_size=4, num_heads=12)
    return DiT(**preset, **kwargs)

def DiT_B_8(**kwargs):
    """Build the 'DiT-B/8' model: 12 blocks, hidden size 768, 12 heads, patch size 8."""
    preset = dict(depth=12, hidden_size=768, patch_size=8, num_heads=12)
    return DiT(**preset, **kwargs)

def DiT_S_2(**kwargs):
    """Build the 'DiT-S/2' model: 12 blocks, hidden size 384, 6 heads, patch size 2."""
    preset = dict(depth=12, hidden_size=384, patch_size=2, num_heads=6)
    return DiT(**preset, **kwargs)

def DiT_S_4(**kwargs):
    """Build the 'DiT-S/4' model: 12 blocks, hidden size 384, 6 heads, patch size 4."""
    preset = dict(depth=12, hidden_size=384, patch_size=4, num_heads=6)
    return DiT(**preset, **kwargs)

def DiT_S_8(**kwargs):
    """Build the 'DiT-S/8' model: 12 blocks, hidden size 384, 6 heads, patch size 8."""
    preset = dict(depth=12, hidden_size=384, patch_size=8, num_heads=6)
    return DiT(**preset, **kwargs)


# Registry mapping "DiT-<size>/<patch_size>" names to their factory functions.
DiT_models = {
    'DiT-XL/2': DiT_XL_2,
    'DiT-XL/4': DiT_XL_4,
    'DiT-XL/8': DiT_XL_8,
    'DiT-L/2': DiT_L_2,
    'DiT-L/4': DiT_L_4,
    'DiT-L/8': DiT_L_8,
    'DiT-B/2': DiT_B_2,
    'DiT-B/4': DiT_B_4,
    'DiT-B/8': DiT_B_8,
    'DiT-S/2': DiT_S_2,
    'DiT-S/4': DiT_S_4,
    'DiT-S/8': DiT_S_8,
}
first commit 2024-12-02 15:42:58 +01:00			`# Copyright (c) Meta Platforms, Inc. and affiliates.`
			`# All rights reserved.`

			`# This source code is licensed under the license found in the`
			`# LICENSE file in the root directory of this source tree.`
			`# --------------------------------------------------------`
			`# References:`
			`# GLIDE: https://github.com/openai/glide-text2im`
			`# MAE: https://github.com/facebookresearch/mae/blob/main/models_mae.py`
			`# --------------------------------------------------------`

			`import torch`
			`import torch.nn as nn`
			`import numpy as np`
			`import math`
			`from timm.models.vision_transformer import Attention, Mlp # PatchEmbed`
			`from torch.nn import TransformerEncoder, TransformerEncoderLayer`
			`from .helpers import SinusoidalPosEmb`

			`class PatchEmbed(nn.Module):`
			`""" 2D Image to Patch Embedding`
			`"""`
			`def __init__(`
			`self,`
			`img_size=224,`
			`patch_size=16,`
			`in_chans=3,`
			`embed_dim=768,`
			`norm_layer=None,`
			`flatten=True,`
			`bias=True,`
			`):`
			`super().__init__()`
			`img_size = (img_size, 1)`
			`patch_size = (patch_size, 1)`
			`self.img_size = img_size`
			`self.patch_size = patch_size`
			`self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])`
			`self.num_patches = self.grid_size[0] * self.grid_size[1]`
			`self.flatten = flatten`

			`self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size, bias=bias)`
			`self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()`

			`def forward(self, x):`
			`B, C, H, W = x.shape`
			`#_assert(H == self.img_size[0], f"Input image height ({H}) doesn't match model ({self.img_size[0]}).")`
			`#_assert(W == self.img_size[1], f"Input image width ({W}) doesn't match model ({self.img_size[1]}).")`
			`x = self.proj(x)`
			`if self.flatten:`
			`x = x.flatten(2).transpose(1, 2) # BCHW -> BNC`
			`x = self.norm(x)`
			`return x`

			`def modulate(x, shift, scale):`
			`return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1)`


			`#################################################################################`
			`# Embedding Layers for Timesteps and Class Labels #`
			`#################################################################################`

			`class TimestepEmbedder(nn.Module):`
			`"""`
			`Embeds scalar timesteps into vector representations.`
			`"""`
			`def __init__(self, hidden_size, frequency_embedding_size=256):`
			`super().__init__()`
			`self.mlp = nn.Sequential(`
			`nn.Linear(frequency_embedding_size, hidden_size, bias=True),`
			`nn.SiLU(),`
			`nn.Linear(hidden_size, hidden_size, bias=True),`
			`)`
			`self.frequency_embedding_size = frequency_embedding_size`

			`@staticmethod`
			`def timestep_embedding(t, dim, max_period=10000):`
			`"""`
			`Create sinusoidal timestep embeddings.`
			`:param t: a 1-D Tensor of N indices, one per batch element.`
			`These may be fractional.`
			`:param dim: the dimension of the output.`
			`:param max_period: controls the minimum frequency of the embeddings.`
			`:return: an (N, D) Tensor of positional embeddings.`
			`"""`
			`# https://github.com/openai/glide-text2im/blob/main/glide_text2im/nn.py`
			`half = dim // 2`
			`freqs = torch.exp(`
			`-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half`
			`).to(device=t.device)`
			`args = t[:, None].float() * freqs[None]`
			`embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)`
			`if dim % 2:`
			`embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)`
			`return embedding`

			`def forward(self, t):`
			`t_freq = self.timestep_embedding(t, self.frequency_embedding_size)`
			`t_emb = self.mlp(t_freq)`
			`return t_emb`


			`class LabelEmbedder(nn.Module):`
			`"""`
			`Embeds class labels into vector representations. Also handles label dropout for classifier-free guidance.`
			`"""`
			`def __init__(self, num_classes, hidden_size, dropout_prob):`
			`super().__init__()`
			`use_cfg_embedding = dropout_prob > 0`
			`self.embedding_table = nn.Embedding(num_classes + use_cfg_embedding, hidden_size)`
			`self.num_classes = num_classes`
			`self.dropout_prob = dropout_prob`

			`def token_drop(self, labels, force_drop_ids=None):`
			`"""`
			`Drops labels to enable classifier-free guidance.`
			`"""`
			`if force_drop_ids is None:`
			`drop_ids = torch.rand(labels.shape[0], device=labels.device) < self.dropout_prob`
			`else:`
			`drop_ids = force_drop_ids == 1`
			`labels = torch.where(drop_ids, self.num_classes, labels)`
			`return labels`

			`def forward(self, labels, train, force_drop_ids=None):`
			`use_dropout = self.dropout_prob > 0`
			`if (train and use_dropout) or (force_drop_ids is not None):`
			`labels = self.token_drop(labels, force_drop_ids)`
			`embeddings = self.embedding_table(labels)`
			`return embeddings`


			`#################################################################################`
			`# Core DiT Model #`
			`#################################################################################`

			`class DiTBlock(nn.Module):`
			`"""`
			`A DiT block with adaptive layer norm zero (adaLN-Zero) conditioning.`
			`"""`
			`def __init__(self, hidden_size, num_heads, mlp_ratio=4.0, **block_kwargs):`
			`super().__init__()`
			`self.norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)`
			`self.attn = Attention(hidden_size, num_heads=num_heads, qkv_bias=True, **block_kwargs)`
			`self.norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)`
			`mlp_hidden_dim = int(hidden_size * mlp_ratio)`
			`approx_gelu = lambda: nn.GELU(approximate="tanh")`
			`self.mlp = Mlp(in_features=hidden_size, hidden_features=mlp_hidden_dim, act_layer=approx_gelu, drop=0)`
			`self.adaLN_modulation = nn.Sequential(`
			`nn.SiLU(),`
			`nn.Linear(hidden_size, 6 * hidden_size, bias=True)`
			`)`

			`def forward(self, x, c):`
			`shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.adaLN_modulation(c).chunk(6, dim=1)`
			`x = x + gate_msa.unsqueeze(1) * self.attn(modulate(self.norm1(x), shift_msa, scale_msa))`
			`x = x + gate_mlp.unsqueeze(1) * self.mlp(modulate(self.norm2(x), shift_mlp, scale_mlp))`
			`return x`


			`class FinalLayer(nn.Module):`
			`"""`
			`The final layer of DiT.`
			`"""`
			`def __init__(self, hidden_size, patch_size, out_channels):`
			`super().__init__()`
			`self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)`
			`self.linear = nn.Linear(hidden_size, patch_size * 1 * out_channels, bias=True)`
			`self.adaLN_modulation = nn.Sequential(`
			`nn.SiLU(),`
			`nn.Linear(hidden_size, 2 * hidden_size, bias=True)`
			`)`

			`def forward(self, x, c):`
			`shift, scale = self.adaLN_modulation(c).chunk(2, dim=1)`
			`x = modulate(self.norm_final(x), shift, scale)`
			`x = self.linear(x)`
			`return x`


			`class DiT(nn.Module):`
			`"""`
			`Diffusion model with a Transformer backbone.`
			`"""`
			`def __init__(`
			`self,`
			`input_size=32,`
			`patch_size=2,`
			`in_channels=4,`
			`hidden_size=384,`
			`depth=12,`
			`num_heads=6,`
			`mlp_ratio=4.0,`
			`class_dropout_prob=0.1,`
			`num_classes=1000,`
			`learn_sigma=False,`
			`):`
			`super().__init__()`
			`self.learn_sigma = learn_sigma`
			`self.in_channels = in_channels`
			`self.out_channels = in_channels * 2 if learn_sigma else in_channels`
			`self.patch_size = patch_size`
			`self.num_heads = num_heads`

			`self.x_embedder = PatchEmbed(input_size, patch_size, in_channels, hidden_size, bias=True)`
			`self.t_embedder = TimestepEmbedder(hidden_size)`
			`#self.y_embedder = LabelEmbedder(num_classes, hidden_size, class_dropout_prob)`
			`num_patches = self.x_embedder.num_patches`
			`# Will use fixed sin-cos embedding:`
			`self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, hidden_size), requires_grad=False)`

			`self.blocks = nn.ModuleList([`
			`DiTBlock(hidden_size, num_heads, mlp_ratio=mlp_ratio) for _ in range(depth)`
			`])`
			`self.final_layer = FinalLayer(hidden_size, patch_size, self.out_channels)`
			`self.initialize_weights()`

			`def initialize_weights(self):`
			`# Initialize transformer layers:`
			`def _basic_init(module):`
			`if isinstance(module, nn.Linear):`
			`torch.nn.init.xavier_uniform_(module.weight)`
			`if module.bias is not None:`
			`nn.init.constant_(module.bias, 0)`
			`self.apply(_basic_init)`

			`# Initialize (and freeze) pos_embed by sin-cos embedding:`
			`pos_embed = get_2d_sincos_pos_embed(self.pos_embed.shape[-1], int(self.x_embedder.num_patches )) #** 0.5`
			`#print('pos_embed', pos_embed.shape, self.x_embedder.num_patches)`
			`self.pos_embed.data.copy_(torch.from_numpy(pos_embed).float().unsqueeze(0))`

			`# Initialize patch_embed like nn.Linear (instead of nn.Conv2d):`
			`w = self.x_embedder.proj.weight.data`
			`nn.init.xavier_uniform_(w.view([w.shape[0], -1]))`
			`nn.init.constant_(self.x_embedder.proj.bias, 0)`

			`# Initialize label embedding table:`
			`#nn.init.normal_(self.y_embedder.embedding_table.weight, std=0.02)`

			`# Initialize timestep embedding MLP:`
			`nn.init.normal_(self.t_embedder.mlp[0].weight, std=0.02)`
			`nn.init.normal_(self.t_embedder.mlp[2].weight, std=0.02)`

			`# Zero-out adaLN modulation layers in DiT blocks:`
			`for block in self.blocks:`
			`nn.init.constant_(block.adaLN_modulation[-1].weight, 0)`
			`nn.init.constant_(block.adaLN_modulation[-1].bias, 0)`

			`# Zero-out output layers:`
			`nn.init.constant_(self.final_layer.adaLN_modulation[-1].weight, 0)`
			`nn.init.constant_(self.final_layer.adaLN_modulation[-1].bias, 0)`
			`nn.init.constant_(self.final_layer.linear.weight, 0)`
			`nn.init.constant_(self.final_layer.linear.bias, 0)`

			`def unpatchify(self, x):`
			`"""`
			`x: (N, T, patch_size*2 C)`
			`imgs: (N, H, W, C)`
			`"""`
			`c = self.out_channels`
			`p = self.x_embedder.patch_size[0]`
			`'''h = w = int(x.shape[1] ** 0.5)`
			`assert h * w == x.shape[1]'''`
			`h = x.shape[1]`
			`w = 1`

			`#print(x.shape)`

			`x = x.reshape(shape=(x.shape[0], h, w, p, 1, c))`
			`x = torch.einsum('nhwpqc->nchpwq', x)`
			`imgs = x.reshape(shape=(x.shape[0], c, h * p, w * 1))`
			`return imgs`

			`def forward(self, x, t):`
			`"""`
			`Forward pass of DiT.`
			`x: (N, C, H, W) tensor of spatial inputs (images or latent representations of images)`
			`t: (N,) tensor of diffusion timesteps`
			`y: (N,) tensor of class labels`
			`"""`
			`#print('x', x.shape, 'x_embedder', self.x_embedder(x).shape, 'pos_embed', self.pos_embed.shape)`
			`x = self.x_embedder(x) + self.pos_embed # (N, T, D), where T = H * W / patch_size ** 2`
			`t = self.t_embedder(t) # (N, D)`
			`#y = self.y_embedder(y, self.training) # (N, D)`
			`#c = t + y # (N, D)`
			`c = t`
			`for block in self.blocks:`
			`x = block(x, c) # (N, T, D)`
			`x = self.final_layer(x, c) # (N, T, patch_size ** 2 * out_channels)`
			`x = self.unpatchify(x) # (N, out_channels, H, W)`
			`return x`

			`def forward_with_cfg(self, x, t, y, cfg_scale):`
			`"""`
			`Forward pass of DiT, but also batches the unconditional forward pass for classifier-free guidance.`
			`"""`
			`# https://github.com/openai/glide-text2im/blob/main/notebooks/text2im.ipynb`
			`half = x[: len(x) // 2]`
			`combined = torch.cat([half, half], dim=0)`
			`model_out = self.forward(combined, t, y)`
			`# For exact reproducibility reasons, we apply classifier-free guidance on only`
			`# three channels by default. The standard approach to cfg applies it to all channels.`
			`# This can be done by uncommenting the following line and commenting-out the line following that.`
			`# eps, rest = model_out[:, :self.in_channels], model_out[:, self.in_channels:]`
			`eps, rest = model_out[:, :3], model_out[:, 3:]`
			`cond_eps, uncond_eps = torch.split(eps, len(eps) // 2, dim=0)`
			`half_eps = uncond_eps + cfg_scale * (cond_eps - uncond_eps)`
			`eps = torch.cat([half_eps, half_eps], dim=0)`
			`return torch.cat([eps, rest], dim=1)`


			`#################################################################################`
			`# Sine/Cosine Positional Embedding Functions #`
			`#################################################################################`
			`# https://github.com/facebookresearch/mae/blob/main/util/pos_embed.py`

			def get_2d_sincos_pos_embed(embed_dim, grid_size, cls_token=False, extra_tokens=0):
			    """
			    Sine/cosine position table for a grid of grid_size rows and a single column.

			    grid_size: int, number of positions along the height axis (the width axis is fixed to 1)
			    return:
			    pos_embed: [grid_size, embed_dim], or [extra_tokens + grid_size, embed_dim]
			               when cls_token is set and extra_tokens > 0 (zero rows are prepended)
			    """
			    rows = np.arange(grid_size, dtype=np.float32)
			    cols = np.arange(1, dtype=np.float32)
			    # meshgrid takes the width axis first, so index 0 of the stack holds the column coordinates
			    mesh = np.stack(np.meshgrid(cols, rows), axis=0).reshape([2, 1, grid_size, 1])

			    table = get_2d_sincos_pos_embed_from_grid(embed_dim, mesh)
			    if not (cls_token and extra_tokens > 0):
			        return table
			    padding = np.zeros([extra_tokens, embed_dim])
			    return np.concatenate([padding, table], axis=0)


			def get_2d_sincos_pos_embed_from_grid(embed_dim, grid):
			    """
			    Encode a two-plane coordinate grid, spending half of embed_dim on each plane.

			    embed_dim: total output dimension (must be even)
			    grid: array whose index 0 and index 1 hold the two coordinate planes
			    return: (H*W, D) array; columns [0, D/2) encode grid[0], columns [D/2, D) encode grid[1]
			    """
			    assert embed_dim % 2 == 0

			    per_axis_dim = embed_dim // 2
			    halves = [
			        get_1d_sincos_pos_embed_from_grid(per_axis_dim, plane)  # (H*W, D/2)
			        for plane in (grid[0], grid[1])
			    ]
			    return np.concatenate(halves, axis=1)  # (H*W, D)


			def get_1d_sincos_pos_embed_from_grid(embed_dim, pos):
			    """
			    Build sine/cosine embeddings for a set of scalar positions.

			    embed_dim: output dimension for each position (must be even)
			    pos: array of positions to be encoded; it is flattened to size (M,)
			    out: (M, D) -- sine features in the first D/2 columns, cosine features in the last D/2
			    """
			    assert embed_dim % 2 == 0
			    half_dim = embed_dim // 2

			    # Frequencies 1 / 10000^(i / (D/2)) for i = 0 .. D/2 - 1.
			    exponents = np.arange(half_dim, dtype=np.float64) / (embed_dim / 2.)
			    inv_freq = 1. / 10000**exponents  # (D/2,)

			    flat_pos = pos.reshape(-1)  # (M,)
			    angles = flat_pos[:, None] * inv_freq[None, :]  # (M, D/2), outer product

			    return np.concatenate([np.sin(angles), np.cos(angles)], axis=1)  # (M, D)

			def get_emb(sin_inp):
			    """
			    Interleave sin and cos of the input along the last dimension.

			    For every entry v of the last axis the output holds sin(v) immediately
			    followed by cos(v), so the last axis doubles in length.
			    """
			    pairs = torch.stack((torch.sin(sin_inp), torch.cos(sin_inp)), dim=-1)
			    return pairs.flatten(-2, -1)


			class PositionalEncoding1D(nn.Module):
			    """
			    Sinusoidal positional encoding for 3d tensors laid out as (batch_size, ch, x).

			    The most recent encoding is kept in a non-persistent buffer and reused
			    while the incoming tensor has the same shape, dtype and device.
			    """

			    def __init__(self, channels, dtype_override=None):
			        """
			        :param channels: The last dimension of the tensor you want to apply pos emb to.
			        :param dtype_override: If set, overrides the dtype of the output embedding.
			        """
			        super().__init__()
			        self.org_channels = channels
			        # Round up to an even count: every frequency yields one sin and one cos row.
			        channels = int(np.ceil(channels / 2) * 2)
			        inv_freq = 1.0 / (10000 ** (torch.arange(0, channels, 2).float() / channels))
			        self.register_buffer("inv_freq", inv_freq)
			        # Non-persistent so the cache never ends up in a state_dict.
			        self.register_buffer("cached_penc", None, persistent=False)
			        self.channels = channels
			        self.dtype_override = dtype_override

			    def forward(self, tensor):
			        """
			        :param tensor: A 3d tensor of size (batch_size, ch, x)
			        :return: Positional Encoding Matrix of size (batch_size, ch, x)
			        :raises RuntimeError: if tensor is not 3d
			        """
			        if len(tensor.shape) != 3:
			            raise RuntimeError("The input tensor has to be 3d!")

			        out_dtype = (
			            self.dtype_override if self.dtype_override is not None else tensor.dtype
			        )

			        # A shape match alone is not enough: an input with the same shape but a
			        # different dtype/device must not receive the stale cached encoding.
			        cached = self.cached_penc
			        if (
			            cached is not None
			            and cached.shape == tensor.shape
			            and cached.dtype == out_dtype
			            and cached.device == tensor.device
			        ):
			            return cached

			        self.cached_penc = None
			        batch_size, orig_ch, x = tensor.shape
			        pos_x = torch.arange(x, device=tensor.device, dtype=self.inv_freq.dtype)
			        sin_inp_x = torch.einsum("i,j->ij", pos_x, self.inv_freq)
			        emb_x = get_emb(sin_inp_x)  # (x, channels), sin/cos interleaved
			        emb = torch.zeros(
			            (self.channels, x),
			            device=tensor.device,
			            dtype=out_dtype,
			        )
			        emb[:self.channels, :] = emb_x.permute(1, 0)

			        # Keep only as many rows as the input has channels, then tile over the batch.
			        self.cached_penc = emb[None, :orig_ch, :].repeat(batch_size, 1, 1)
			        return self.cached_penc

			class TransformerModel(nn.Module):
			    """Adds a PositionalEncoding1D signal to the input and runs a batch-first TransformerEncoder."""

			    def __init__(self, ntoken, ninp, nhead, nhid, nlayers=6, dropout=0.0):
			        """
			        ntoken: accepted for interface compatibility; not used by this constructor
			        ninp: model width passed to the encoder layers and the positional encoding
			        nhead: number of attention heads per encoder layer
			        nhid: feed-forward width of each encoder layer
			        nlayers: number of stacked encoder layers
			        dropout: dropout probability inside each encoder layer
			        """
			        super().__init__()

			        self.model_type = 'Transformer'
			        self.ninp = ninp
			        self.pos_encoder = PositionalEncoding1D(ninp)
			        layer = TransformerEncoderLayer(
			            d_model=ninp,
			            nhead=nhead,
			            dim_feedforward=nhid,
			            dropout=dropout,
			            batch_first=True,
			        )
			        self.transformer_encoder = TransformerEncoder(layer, num_layers=nlayers)
			        # Timestep MLP; forward() does not call it at the moment.
			        # should be removed for Noise and Deterministic Baselines
			        # NOTE(review): the first Linear takes ninp - 1 inputs, which presumes
			        # SinusoidalPosEmb(ninp) emits ninp - 1 features -- confirm against helpers.
			        self.time_mlp = nn.Sequential(
			            SinusoidalPosEmb(ninp),
			            nn.Linear(ninp - 1, ninp * 4),
			            nn.Mish(),
			            nn.Linear(ninp * 4, ninp),
			        )
			        # No custom weight initialisation is applied; the submodules keep their defaults.

			    def generate_square_subsequent_mask(self, sz):
			        """
			        Return a (sz, sz) float mask with 0.0 on and below the diagonal and
			        -inf strictly above it.
			        """
			        blocked = torch.full((sz, sz), float('-inf'), dtype=torch.float32)
			        # triu(diagonal=1) keeps -inf above the diagonal and zeroes everything else
			        return torch.triu(blocked, diagonal=1)

			    def forward(self, src, t):
			        """
			        src: 3d input tensor; the positional encoding is added to it before encoding
			        t: accepted but currently unused (the time embedding path is disabled)
			        return: output of the transformer encoder

			        NOTE(review): the encoder is batch_first, so src is presumably
			        (batch, seq, ninp), whereas PositionalEncoding1D documents its input as
			        (batch_size, ch, x) -- confirm which axis is meant to carry positions.
			        """
			        return self.transformer_encoder(src + self.pos_encoder(src))


			`#################################################################################`
			`# DiT Configs #`
			`#################################################################################`

			def DiT_XL_2(**kwargs):
			    """Build the 'DiT-XL/2' preset (depth=28, hidden_size=1152, num_heads=16, patch_size=2); extra kwargs go to DiT."""
			    preset = dict(depth=28, hidden_size=1152, patch_size=2, num_heads=16)
			    return DiT(**preset, **kwargs)

			def DiT_XL_4(**kwargs):
			    """Build the 'DiT-XL/4' preset (depth=28, hidden_size=1152, num_heads=16, patch_size=4); extra kwargs go to DiT."""
			    preset = dict(depth=28, hidden_size=1152, patch_size=4, num_heads=16)
			    return DiT(**preset, **kwargs)

			def DiT_XL_8(**kwargs):
			    """Build the 'DiT-XL/8' preset (depth=28, hidden_size=1152, num_heads=16, patch_size=8); extra kwargs go to DiT."""
			    preset = dict(depth=28, hidden_size=1152, patch_size=8, num_heads=16)
			    return DiT(**preset, **kwargs)

			def DiT_L_2(**kwargs):
			    """Build the 'DiT-L/2' preset (depth=24, hidden_size=1024, num_heads=16, patch_size=2); extra kwargs go to DiT."""
			    preset = dict(depth=24, hidden_size=1024, patch_size=2, num_heads=16)
			    return DiT(**preset, **kwargs)

			def DiT_L_4(**kwargs):
			    """Build the 'DiT-L/4' preset (depth=24, hidden_size=1024, num_heads=16, patch_size=4); extra kwargs go to DiT."""
			    preset = dict(depth=24, hidden_size=1024, patch_size=4, num_heads=16)
			    return DiT(**preset, **kwargs)

			def DiT_L_8(**kwargs):
			    """Build the 'DiT-L/8' preset (depth=24, hidden_size=1024, num_heads=16, patch_size=8); extra kwargs go to DiT."""
			    preset = dict(depth=24, hidden_size=1024, patch_size=8, num_heads=16)
			    return DiT(**preset, **kwargs)

			def DiT_B_2(**kwargs):
			    """Build the 'DiT-B/2' preset (depth=12, hidden_size=768, num_heads=12, patch_size=2); extra kwargs go to DiT."""
			    preset = dict(depth=12, hidden_size=768, patch_size=2, num_heads=12)
			    return DiT(**preset, **kwargs)

			def DiT_B_4(**kwargs):
			    """Build the 'DiT-B/4' preset (depth=12, hidden_size=768, num_heads=12, patch_size=4); extra kwargs go to DiT."""
			    preset = dict(depth=12, hidden_size=768, patch_size=4, num_heads=12)
			    return DiT(**preset, **kwargs)

			def DiT_B_8(**kwargs):
			    """Build the 'DiT-B/8' preset (depth=12, hidden_size=768, num_heads=12, patch_size=8); extra kwargs go to DiT."""
			    preset = dict(depth=12, hidden_size=768, patch_size=8, num_heads=12)
			    return DiT(**preset, **kwargs)

			def DiT_S_2(**kwargs):
			    """Build the 'DiT-S/2' preset (depth=12, hidden_size=384, num_heads=6, patch_size=2); extra kwargs go to DiT."""
			    preset = dict(depth=12, hidden_size=384, patch_size=2, num_heads=6)
			    return DiT(**preset, **kwargs)

			def DiT_S_4(**kwargs):
			    """Build the 'DiT-S/4' preset (depth=12, hidden_size=384, num_heads=6, patch_size=4); extra kwargs go to DiT."""
			    preset = dict(depth=12, hidden_size=384, patch_size=4, num_heads=6)
			    return DiT(**preset, **kwargs)

			def DiT_S_8(**kwargs):
			    """Build the 'DiT-S/8' preset (depth=12, hidden_size=384, num_heads=6, patch_size=8); extra kwargs go to DiT."""
			    preset = dict(depth=12, hidden_size=384, patch_size=8, num_heads=6)
			    return DiT(**preset, **kwargs)


			# Registry mapping each public 'DiT-<size>/<patch_size>' name to its constructor function.
			DiT_models = {
			    'DiT-XL/2': DiT_XL_2,
			    'DiT-XL/4': DiT_XL_4,
			    'DiT-XL/8': DiT_XL_8,
			    'DiT-L/2': DiT_L_2,
			    'DiT-L/4': DiT_L_4,
			    'DiT-L/8': DiT_L_8,
			    'DiT-B/2': DiT_B_2,
			    'DiT-B/4': DiT_B_4,
			    'DiT-B/8': DiT_B_8,
			    'DiT-S/2': DiT_S_2,
			    'DiT-S/4': DiT_S_4,
			    'DiT-S/8': DiT_S_8,
			}