import pytorch_lightning as pl
import torch
import torchmetrics as metrics
from torch import nn
from torch.optim import AdamW
from transformers import AutoModel, AutoTokenizer, get_cosine_schedule_with_warmup

from src.combiner.option_a import CombinerOptionA
from src.object_description_encoder.object_description_encoder import ObjectDescriptionEncoder
from src.utils.positional_encoding import PositionalEncoding


class TransformerModel(pl.LightningModule):
    """Transformer encoder that fuses video object features with an embedded text query
    and classifies the answer over 40 classes."""

    def __init__(self, config, output_path=None):
        super().__init__()
        self.output_path = output_path
        self.config = config['model']
        self.train_config = config['training']

        self.train_acc = metrics.Accuracy('multiclass', num_classes=40)
        self.val_acc = metrics.Accuracy('multiclass', num_classes=40)
        self.test_acc = metrics.Accuracy('multiclass', num_classes=40)

        self.best_val_acc = 0
        self.loss_for_best_val_acc = 0
        self.best_train_acc = 0

        self.combiner = CombinerOptionA()
        self.initialize_text_encoder_and_feature_mapping()

        self.positional_encoder = PositionalEncoding(
            d_model=self.model_input_dim, dropout=self.config['dropout_p'], max_len=self.config['dim_feedforward']
        )

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=self.model_input_dim,
            batch_first=True,
            dropout=self.config['dropout_p'],
            dim_feedforward=self.config['dim_feedforward'],
            nhead=self.config['n_heads'],
        )
        self.encoder = nn.TransformerEncoder(
            encoder_layer=encoder_layer,
            num_layers=self.config['n_encoder_layers'],
        )

        self.loss = nn.CrossEntropyLoss()

        if self.config['feature_type'] == 'object_text_features':
            self.object_description_encoder = ObjectDescriptionEncoder(
                d_model=self.config['v_emb_dim'],
                config=self.config,
            )
            # maps the output of the pretrained LM to a smaller size used for encoding
            # the object descriptions (keeps the transformer small)
            self.linear_projection_object_description = nn.Linear(
                self.pretrained_lm.config.hidden_size, self.config['v_emb_dim']
            )

        # tokenizer for translating token ids back to text
        self.tokenizer = AutoTokenizer.from_pretrained(self.config['pretrained_lm_name'])

    def initialize_text_encoder_and_feature_mapping(self):
        if self.config['use_pretrained_lm']:
            self.pretrained_lm = AutoModel.from_pretrained(
                self.config['pretrained_lm_name'],
                add_pooling_layer=False
            )
            self.pretrained_lm.eval()
            # don't train the parameters of the pretrained LM
            for param in self.pretrained_lm.parameters():
                param.requires_grad = False

            # initialize the projection layers that map the embeddings to the model input dim:
            # either use the emb_dim as done in Aloe (v_emb_dim * n_heads) or the emb_dim specified in the config
            if self.config['projection_as_in_aloe']:
                self.model_input_dim = self.config['n_heads'] * self.config['v_emb_dim']
                self.linear_projection_video = nn.Linear(self.config['v_emb_dim'], self.model_input_dim - 2)
                self.linear_projection_text = nn.Linear(self.pretrained_lm.config.hidden_size, self.model_input_dim - 2)
            else:
                # take the embedding size from the config and map the video features from their size to the chosen emb size
                self.linear_projection_video = nn.Linear(self.config['v_emb_dim'], self.config['emb_dim'] - 2)
                self.linear_projection_text = nn.Linear(self.pretrained_lm.config.hidden_size, self.config['emb_dim'] - 2)
                self.model_input_dim = self.config['emb_dim']
        else:
            # either use the emb_dim as done in Aloe (v_emb_dim * n_heads) or the emb_dim from the config
            # (2 is subtracted from the projection size because two modality-id values are appended to each embedding)
            if self.config['projection_as_in_aloe']:
                self.model_input_dim = self.config['n_heads'] * self.config['v_emb_dim']
            else:
                self.model_input_dim = self.config['emb_dim']
            self.linear_projection_video = nn.Linear(self.config['v_emb_dim'], self.model_input_dim - 2)
            self.embed = nn.Embedding(num_embeddings=self.config['vocab_size'], embedding_dim=self.model_input_dim - 2)

    def append_ids(self, tensor, id_vector, axis):
        """Appends a modality id (e.g. [1, 0] for text, [0, 1] for video) to every
        embedding along the given axis."""
        id_vector = torch.tensor(id_vector, device=self.device, dtype=tensor.dtype)
        for a in range(len(tensor.shape)):
            if a != axis:
                id_vector = torch.unsqueeze(id_vector, dim=a)
        tiling_vector = [s if i != axis else 1 for i, s in enumerate(tensor.shape)]
        id_tensor = torch.tile(id_vector, tiling_vector)
        return torch.cat([tensor, id_tensor], dim=axis)

    def downsample_video_emb(self, video_emb):
        # only keep every sample_rate_video-th frame
        return video_emb[:, ::self.config['sample_rate_video'], :, :]

    def unroll_video_emb(self, video_emb):
        # [batch, frames, v_emb_dim, objects] -> [batch, frames * objects, v_emb_dim]
        video_emb = video_emb.permute(0, 1, 3, 2)
        return torch.reshape(video_emb, (video_emb.shape[0], -1, video_emb.shape[3]))

    def apply_pretrained_lm(self, query, query_mask):
        # returns the token-level hidden states of the last layer of the pretrained LM
        output = self.pretrained_lm(
            input_ids=query,
            attention_mask=query_mask
        )
        return output['last_hidden_state']

    def prepare_lang_emb(self, query, query_mask):
        # truncate the query to a maximum length TODO: make this a config parameter
        if query.shape[1] > 100:
            query = query[:, :100]
            query_mask = query_mask[:, :100]

        # embed the query with the pretrained language model if specified, otherwise use the learned embedding layer
        if self.config['use_pretrained_lm']:
            lang_emb = self.apply_pretrained_lm(query, query_mask)
        else:
            lang_emb = self.embed(query)

        # project the LM embeddings to the model input dim
        # (Aloe uses an emb_dim of v_emb_dim * n_heads, otherwise the emb_dim specified in the config is used)
        if self.config['use_pretrained_lm']:
            lang_emb = self.linear_projection_text(lang_emb)

        lang_emb = self.append_ids(lang_emb, [1, 0], 2)
        lang_emb = self.positional_encoder(lang_emb)
        return lang_emb

    def prepare_video_emb(self, video_emb):
        # input shape: [batch, frames, v_emb_dim, objects]
        video_emb = self.downsample_video_emb(video_emb)

        # unroll the time dimension into the object dimension (only every sample_rate_video-th frame is kept)
        # shape: [batch, objects x frames, v_emb_dim]
        video_emb = self.unroll_video_emb(video_emb)

        # the video embeddings need to be projected to either the size of the language embeddings
        # or to v_emb_dim * n_heads (as done in the Aloe paper)
        # if self.config['use_pretrained_lm'] or self.config['projection_as_in_aloe']:
        video_emb = self.linear_projection_video(video_emb)

        video_emb = self.append_ids(video_emb, [0, 1], 2)
        video_emb = self.positional_encoder(video_emb)
        return video_emb

    def forward(self, batch):
        output = self.answer_query(batch.query, batch.query_mask, batch.vft)
        return output

    def configure_optimizers(self):
        opt = AdamW(self.parameters(), lr=self.train_config['lr'])
        sched = get_cosine_schedule_with_warmup(
            opt,
            num_warmup_steps=self.train_config['warmup_steps'],
            num_training_steps=self.train_config['total_steps'],
        )
        return {
            'optimizer': opt,
            'lr_scheduler': {
                'scheduler': sched,
                'interval': 'step'
            }
        }
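

# Minimal usage sketch: builds the module from an example config dict that covers the keys
# read in __init__, initialize_text_encoder_and_feature_mapping and configure_optimizers.
# All values below are illustrative placeholders, and 'bert-base-uncased' is only an assumed
# example for 'pretrained_lm_name'; adjust them to the actual experiment configuration.
if __name__ == '__main__':
    example_config = {
        'model': {
            'use_pretrained_lm': True,
            'pretrained_lm_name': 'bert-base-uncased',  # assumed example checkpoint
            'projection_as_in_aloe': False,
            'feature_type': 'object_features',  # anything but 'object_text_features' skips the description encoder
            'v_emb_dim': 128,
            'emb_dim': 256,
            'n_heads': 8,
            'dim_feedforward': 1024,
            'n_encoder_layers': 4,
            'dropout_p': 0.1,
            'vocab_size': 30522,       # only used when use_pretrained_lm is False
            'sample_rate_video': 2,
        },
        'training': {
            'lr': 1e-4,
            'warmup_steps': 1000,
            'total_steps': 100000,
        },
    }
    model = TransformerModel(example_config)
    # with projection_as_in_aloe=False the model input dim equals emb_dim (here 256)
    print(model.model_input_dim)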